Example no. 1
def build_vocab(json, threshold):
    """Build a simple vocabulary wrapper."""
    # The snippet assumes a COCO-style annotation reader; these two lines are a minimal guess.
    coco = COCO(json)
    ids = coco.anns.keys()
    counter = Counter()
    for i, id in enumerate(ids):
        caption = str(coco.anns[id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

        if (i + 1) % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(i + 1, len(ids)))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
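The snippets in this listing all rely on a Vocabulary wrapper whose definition is never shown. As a point of reference, a minimal sketch that would satisfy the add_word / lookup usage in Example no. 1 is given below; the attribute names (word2idx, idx2word) and the __call__/__len__ helpers are assumptions for illustration, not code taken from any of the examples.

class Vocabulary(object):
    """Minimal word<->index wrapper (illustrative sketch, not the original class)."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        # Register the word only once; later calls are no-ops.
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        # Unknown words fall back to the index of the <unk> token.
        if word not in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)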
Example no. 2
    def __init__(self, corpus = ""):
        self.voc = Vocabulary()
        self.tm = scipy.sparse.dok_matrix((1000,1000), dtype=np.float32)

        self.add_from_text(corpus)
        self.start = UnigramLM(self.voc)
        self.valid = False
        self.sorted_tokens = []
Example no. 3
def readVocabularyFrom(fileName):
    file = open(fileName, 'r')
    vocabulary = Vocabulary(fileName[:-4])
    for line in file:
        if line[0] != '(': continue
        vocabulary.addSentence(Sentence(line))
    file.close()
    print('Vocabulary has been read from', fileName, '\n')
    return vocabulary
Example no. 4
class Encoder_Decoder(nn.Module):
    def __init__(self,
                 dim_emb,
                 dim_hid,
                 vocab_file='./data/preprocessed/vocab_file.vocab'):
        super(Encoder_Decoder, self).__init__()
        self.vocab = Vocabulary()
        self.vocab.load(vocab_file=vocab_file)
        self.dim_hid = dim_hid
        self.word_embeddings = nn.Embedding(len(self.vocab), dim_emb)
        # self.gru = nn.GRU(dim_emb, dim_hid, batch_first=True)
        self.en_lstm = nn.LSTM(dim_emb, dim_hid, batch_first=True)

        self.de_lstm = nn.LSTM(dim_emb, dim_hid, batch_first=True)
        # Fully connected layer mapping the LSTM hidden state (dim_hid) to vocabulary-size logits
        self.hidden2linear = nn.Linear(dim_hid, len(self.vocab))

    def forward(self, sequence, state=None):
        embedding = self.word_embeddings(sequence)
        hs, (h, c) = self.en_lstm(embedding, state)

        output, (h, c) = self.de_lstm(embedding, (h, c))

        # Compute attention
        # t_output = torch.transpose(output, 1, 2)
        # s = torch.bmm(hs, t_output)
        # attention_weight = self.softmax(s)

        output = self.hidden2linear(output)
        return output, (h, c)

    def generate(self, start=None, max_len=17):

        if start is None:
            start = random.choice(self.vocab.index2word)

        idx = self.word_embeddings.weight.new_full(
            (1, 1), self.vocab.get_index(start), dtype=torch.long)
        decoded = [start]
        state = None
        unk = self.vocab.get_index('<unk>')
        while decoded[-1] != '<eos>' and len(decoded) < max_len:
            x, state = self.forward(idx, state)
            x[:, :, unk] = -float('inf')

            # prob = list(map(self.to_int, x.squeeze().tolist()))

            # idx = torch.tensor(random.choices(
            #     list(range(len(prob))), weights=prob, k=1)).view(1, -1)

            idx = torch.argmax(x, dim=-1)

            word = self.vocab.get_word(idx.item())
            decoded.append(word)
        return ' '.join(decoded)
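For orientation, a hypothetical way the model above might be instantiated and sampled is sketched below; the embedding and hidden sizes are assumptions, and the default vocab_file path is expected to already exist on disk.

# Hypothetical usage sketch: dim_emb/dim_hid values are assumptions.
model = Encoder_Decoder(dim_emb=128, dim_hid=128)
model.eval()
with torch.no_grad():
    # With start=None, generate() picks a random word from the vocabulary.
    print(model.generate(start=None, max_len=17))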
Example no. 5
    def create_batches(self, contexts):
        batch_data = []
        label_data = []
        for target, t_context in contexts:
            target_index = Vocabulary.getIndex(target)
            context_index = Vocabulary.getIndex(t_context)
            if target_index is not None and context_index is not None:
                batch_data.append(target_index)
                label_data.append(context_index)
        return batch_data, label_data
Example no. 6
    def from_serializable(cls, contents):
        """
        Instantiate a ReviewVectorizer from a serializable dictionary
        :param contents: (dict) the serializable dictionary
        :return: an instance of the ReviewVectorizer class
        """
        review_vocab = Vocabulary.from_serializable(contents['review_vocab'])
        rating_vocab = Vocabulary.from_serializable(contents['rating_vocab'])

        return cls(review_vocab, rating_vocab)
Example no. 7
class TransitionMatrix:
    def __init__(self, corpus = ""):
        self.voc = Vocabulary()
        self.tm = scipy.sparse.dok_matrix((1000,1000), dtype=np.float32)

        self.add_from_text(corpus)
        self.start = UnigramLM(self.voc)
        self.valid = False
        self.sorted_tokens = []

    def add_from_text(self, text):
        self.valid = False
        tss = tokenize_corpus(text)

        for ts in tss:
            if len(ts) > 0:
                self.start.add_token(ts[0])

            self.voc.expand(ts, from_tokens=True)
            wids = self.voc.get_word_id(ts)
            
            maxwid = max(wids)

            if maxwid >= self.tm.shape[0]:
                self.tm.resize((maxwid + 1, maxwid + 1))

            grams = getNGrams(wids)

            for g in grams:
                self.tm[g] += 1

    def validate(self):
        if not self.valid:
            self.p = self.tm.tocsr()
            s = self.p.sum(axis=1)
            self.p /= s
            self.valid = True
            self.sorted_tokens = self.voc.sorted_tokens()


    def sample_start(self):
        t, p = self.start.get_dist()
        # print(t, p)
        return np.random.choice(t, p = p)

    def sample(self, t):
        self.validate()
        wid = self.voc.get_word_id([t])[0]
        p = np.squeeze(np.asarray(self.p[wid, :]).reshape(-1,1))
        return np.random.choice(self.sorted_tokens, p = p)


    def __str__(self):
        return repr(self.tm)
Example no. 8
    def __init__(self, collection, doStats=False, postingsFile=False):
        self.collection = collection
        self.lexAnalyser = False
        self.calculateStats = doStats
        self.vocabulary = Vocabulary()
        self.postings = DictionaryPostings({})
        self.documents = Documents()
        self.maxFreqInDocs = {}
        #self.positions = DictionaryPostings({})
        if self.calculateStats:
            self.stats = self.getInitStats()
Example no. 9
def generate_vocabulary(counter, threshold):
    """Generate vocabulary."""
    vocab = Vocabulary()

    # Keep words that occur at least 'threshold' times
    words = sorted([word for word, cnt in counter.items() if cnt >= threshold])

    # Add words to dictionary
    for i, word in enumerate(words):
        vocab.addWord(word)

    return vocab
Example no. 10
    def from_dataframe(cls, review_df, cutoff=25):
        """Instantiate the vectorizer from the dataset dataframe
        
        Args:
            review_df (pandas.DataFrame): the review dataset
            cutoff (int): the parameter for frequency-based filtering
        Returns:
            an instance of the ReviewVectorizer
        """
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)

        # Add ratings
        for rating in sorted(set(review_df.rating)):
            rating_vocab.add_token(rating)

        # Add top words if count > provided count
        word_counts = Counter()
        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1

        for word, count in word_counts.items():
            if count > cutoff:
                review_vocab.add_token(word)

        return cls(review_vocab, rating_vocab)
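Examples no. 6, 10, 19, 23 and 25 use a different Vocabulary flavour, built around add_token, token/index lookups, an optional unknown token and to_serializable/from_serializable hooks. A minimal sketch consistent with that usage follows; the private attribute names and the '<UNK>' literal are assumptions for illustration, not code from the examples.

class Vocabulary(object):
    """Token<->index mapping with optional <UNK> support (illustrative sketch)."""

    def __init__(self, token_to_idx=None, add_unk=True, unk_token='<UNK>'):
        self._token_to_idx = dict(token_to_idx or {})
        self._idx_to_token = {idx: tok for tok, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        self.unk_index = self.add_token(unk_token) if add_unk else -1

    def add_token(self, token):
        # Return the existing index or assign the next free one.
        if token in self._token_to_idx:
            return self._token_to_idx[token]
        index = len(self._token_to_idx)
        self._token_to_idx[token] = index
        self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        # Fall back to <UNK> only when it was enabled at construction time.
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        return self._token_to_idx[token]

    def lookup_index(self, index):
        return self._idx_to_token[index]

    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def __len__(self):
        return len(self._token_to_idx)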
Example no. 11
    def __init__(self,
                 dim_emb,
                 dim_hid,
                 vocab_file='./data/preprocessed/vocab_file.vocab'):
        super().__init__()

        self.vocab = Vocabulary()
        self.vocab.load(vocab_file=vocab_file)
        self.embed = torch.nn.Embedding(len(self.vocab), dim_emb)
        self.rnn1 = torch.nn.LSTM(dim_emb, dim_hid, batch_first=True)
        self.rnn2 = torch.nn.LSTM(dim_hid, dim_hid, batch_first=True)
        #         self.rnn3 = torch.nn.LSTM(dim_hid, dim_hid, batch_first=True)
        #         self.rnn4 = torch.nn.LSTM(dim_hid, dim_hid, batch_first=True)
        self.out = torch.nn.Linear(dim_hid, len(self.vocab))
Example no. 12
    def __init__(self, mode, preprocessed, srcVocaThreshold, tgtVocaThreshold,
                 deprelLabelThreshold, printEvery, trainSize, testSize,
                 devSize):
        if preprocessed:
            tgtTrain = './data/processed/train.en'
            actTrain = './data/processed/train.oracle.en'
            tgtDev = './data/processed/dev.en'
            actDev = './data/processed/dev.oracle.en'
            tgtTest = './data/processed/test.en'
            actTest = './data/processed/test.oracle.en'
            srcTrain = './data/processed/train.kr'
            deprelTrain = './data/processed/train.deprel.kr'
            srcDev = './data/processed/dev.kr'
            deprelDev = './data/processed/dev.deprel.kr'
            srcTest = './data/processed/test.kr'
            deprelTest = './data/processed/test.deprel.kr'
        else:
            train_permutation = list(range(0, 99999))
            random.shuffle(train_permutation)
            dev_permutation = list(range(0, 10000))
            random.shuffle(dev_permutation)
            print('Parsing target file into plain sentences & actions...')
            tgtTrain, actTrain = self.conll_to_action('./data/tagged_train.en',
                                                      trainSize,
                                                      train_permutation)
            tgtDev, actDev = self.conll_to_action('./data/tagged_dev.en',
                                                  devSize, dev_permutation)
            print(
                'Parsing source file into plain sentences & dependency relations...'
            )
            srcTrain, deprelTrain = self.conll_to_deprels(
                './data/tagged_train.kr', trainSize, train_permutation)
            srcDev, deprelDev = self.conll_to_deprels('./data/tagged_dev.kr',
                                                      devSize, dev_permutation)

        print('Loading processed data...')
        self.sourceVoc = Vocabulary(srcTrain, srcVocaThreshold, 'lang')
        self.targetVoc = Vocabulary(tgtTrain, tgtVocaThreshold, 'lang')
        self.actionVoc = Vocabulary(actTrain, None, 'action')
        self.deprelVoc = Vocabulary(deprelTrain, deprelLabelThreshold,
                                    'deprel')
        self.trainData = []
        self.devData = []
        self.trainData = self.loadCorpus(srcTrain, tgtTrain, actTrain,
                                         deprelTrain, self.trainData)
        self.devData = self.loadCorpus(srcDev, tgtDev, actDev, deprelDev,
                                       self.devData)
        self.printEvery = printEvery
        print('Loaded...')
Example no. 13
    def __init__(self,
                 dim_emb,
                 dim_hid,
                 vocab_file='./data/preprocessed/vocab_file.vocab'):
        super(Encoder_Decoder, self).__init__()
        self.vocab = Vocabulary()
        self.vocab.load(vocab_file=vocab_file)
        self.dim_hid = dim_hid
        self.word_embeddings = nn.Embedding(len(self.vocab), dim_emb)
        # self.gru = nn.GRU(dim_emb, dim_hid, batch_first=True)
        self.en_lstm = nn.LSTM(dim_emb, dim_hid, batch_first=True)

        self.de_lstm = nn.LSTM(dim_emb, dim_hid, batch_first=True)
        # Fully connected layer mapping the LSTM hidden state (dim_hid) to vocabulary-size logits
        self.hidden2linear = nn.Linear(dim_hid, len(self.vocab))
Example no. 14
def prepare_training_data(sent_pairs):
    voc = Vocabulary()
    sent_pairs_normalized = []
    for sent_p in sent_pairs:
        # normalize incorrect and correct sentence in pair, and append them to normalized sentence pairs
        incorrect_sent_normalized = normalize_string(sent_p[0])
        correct_sent_normalized = normalize_string(sent_p[1])

        normalized_sents_pair = (
            incorrect_sent_normalized, correct_sent_normalized)
        sent_pairs_normalized.append(normalized_sents_pair)
        # add normalized sentence pair to vocabulary
        voc.add_sentence_pair(normalized_sents_pair)

    return voc, sent_pairs_normalized
Example no. 15
def get_model(name='saved_models/model',
              path=extracted_data_train_dir,
              size=TRAIN_SIZE):
    train_vocab = Vocabulary(train_captions, max_vocab_size)
    wat = [
        torch.tensor(x[1:], dtype=torch.int16)
        for x in train_vocab.encoded_captions
    ]
    padded = pad_sequence(wat).permute(1, 0)

    dataset = MyDataset(enc_captions=padded[:size],
                        image_paths=train_image_paths[:size],
                        data_dir=path + 'vecs/')

    dataloader = DataLoader(dataset=dataset, batch_size=256, num_workers=0)

    criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
    model = End2End(ENC_INPUT, ENC_OUTPUT, DEC_HID_DIM, DEC_OUTPUT, EMB_DIM,
                    ATTN_DIM, train_vocab, criterion, device)

    model.load_state_dict(torch.load(name))

    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    # optimizer.lr = 0.001

    return model, dataset, dataloader, optimizer
Example no. 16
  def __init__(self, naiveBayesMatrix,beta = -1):
    naiveBayesMatrix = naiveBayesMatrix.todense()

    v = 0 #total Vocabulary words
    for x in range(0,naiveBayesMatrix.shape[0]):
        v = v + naiveBayesMatrix[x,:].sum()

    #B = 1/v
    B = beta
    if(beta == -1): B = 1/v
    alphaMinusOne = B #(a-1)

    #(length of vocab list)
    vocab = Vocabulary()
    vocabListLength = vocab.length

    # (a-1)*(length of vocab list)
    denominatorStatic = alphaMinusOne * vocabListLength

    #(count of Xi in Yk) + (a-1)
    numerator = naiveBayesMatrix + alphaMinusOne

    #P(Xi|Yk)
    for x in range(numerator.shape[0]):
        denominatorDynamic = naiveBayesMatrix[x,:].sum()
        numerator[x,:] *= (1/(denominatorDynamic + denominatorStatic))

    #log2(P(Xi|Yk))
    self.mapmatrix = np.log2(numerator)
Example no. 17
def getVocabByID(ID):
    from Vocabulary import Vocabulary
    try:
        #theData = ""
        # this returns a Vocabulary item based on the record ID in the database
        conn = sqlite3.connect('FinnVocab.db')

        num2use = str(ID)
        # debugging: print(num2use)
        # need to pass a tuple; if you pass a plain string, the query breaks on anything >=10
        # see http://stackoverflow.com/questions/4409539/pythonsqlite-the-like-query-with-wildcards#4409584
        # for where I found the answer (while working on team project & looking for something totally unrelated, of course! :D )
        cursor = conn.execute(
            "SELECT ID, FINNISH, ENGLISH FROM VOCABULARY WHERE ID = ?;",
            (num2use, ))

        # there should be exactly one result returned
        record = cursor.fetchone()
        theID = record[0]
        theFinn = record[1].strip()
        theEngl = record[2].strip()
        theData = Vocabulary(theID, theFinn, theEngl)
    except sqlite3.Error as e:
        print("Unable to retrieve record number " + str(ID) + ".")
        traceback.print_exc()
        # set theData to None so the program can fail gracefully.
        theData = None
    finally:
        # close the connection
        conn.close()
        return theData
Example no. 18
def main(word, language, part_of_speech, number_of_clusters):
    # here we need to get sentences either from txt or db file
    sentences = get_corpus_from_txt_file(language)
    # sentences = get_corpus_from_db (language)
    print("Got sentences for language " + language)

    vocabulary = Vocabulary(language)
    print("Created Vocabulary")
    cluster = Cluster(language, number_of_clusters)

    words = vocabulary.make_array_of_words_from_sentences(sentences)
    print("Made arrays for every sentence")
    throne2vec = vocabulary.build_vocabulary(words) # get the trained model (vocabulary)
    print("Trained the model")
    all_word_vectors_matrix_2d = cluster.make_vectors_2D(throne2vec)
    print("Made matrix with vectors")
Example no. 19
    def from_serializable(cls, contents):
        """
        Instantiate a TwitterVectorizer from a serializable dictionary

        Args:
            contents (dict): the serializable dictionary
        Returns:
            an instance of the TwitterVectorizer
        """
        # load the Text Vocabulary
        text_vocabulary = Vocabulary.from_serializable(contents["text_vocabulary"])

        # load the Target Vocabulary
        target_vocabulary = Vocabulary.from_serializable(contents["target_vocabulary"])

        return cls(text_vocabulary=text_vocabulary, target_vocabulary=target_vocabulary)
Example no. 20
    def from_dataframe(cls, news_df):
        """Instantiate the vectorizer from the dataset dataframe
        
        Args:
            news_df (pandas.DataFrame): the target dataset
        Returns:
            an instance of the NREVectorizer
        """
        relation_vocab = Vocabulary()
        for relation in set(news_df.relation):
            relation_vocab.add_token(relation)

        seq_vocab = SequenceVocabulary()
        for sequence in news_df.sequence:
            word_list = list(jieba.cut(sequence, cut_all=False))
            seq_vocab.add_many(word_list)
        return cls(seq_vocab, relation_vocab)
Example no. 21
    def loadVocabs(self):
        """
        Reads the suffixVocab files from disk and stores them in dictionaries and arrays
        """
        self.vocabs = {}

        for path in self.vocabPaths:
            print(path)
            self.vocabs[path] = Vocabulary(path)
Example no. 22
    def run(bladeDir, lang):

        indexVocabulary = Vocabulary.loadIndexFile(
            lang, vocabularyFileName='vocabulary')

        for filename in Path(bladeDir).glob('**/*.blade.php'):
            # bladeHtml       = ''
            bladeHtml = Parser.getFileContent(filename)
            items = Parser.getFromHtml(bladeHtml)
            filterItems = list(filter(Parser.filterValuesLaravel, items))
            for item in filterItems:
                index = Vocabulary.checkIndex(indexVocabulary, item)
                if (index):
                    bladeHtml = bladeHtml.replace(
                        item, "@lang('" + str(index) + "')")
            with open(filename, 'w') as file_handler:
                file_handler.write(bladeHtml)
            sys.exit(0)
Example no. 23
    def from_dataframe(cls, dataset_df, cutoff=c_frequencyCutoff):
        """
        Instantiate the Vectorizer from the dataset dataframe

        Args:
            dataset_df (pandas.DataFrame): the tweets dataset
            cutoff (int): the parameter for frequency-based filtering
        Returns:
            an instance of the TwitterVectorizer
        """
        # instantiate the Vocabulary for text column
        text_vocabulary = cls._get_text_vocabulary()

        # instantiate the Vocabulary for target column
        target_vocabulary = Vocabulary(add_unknown_token=False)

        # add elements to Target Vocabulary
        for target in sorted(set(dataset_df.target)):
            target_vocabulary.add_token(target)

        # Tweet Tokenizer to split text into tokens
        tokenizer = TweetTokenizer()

        # add word to the Text Vocabulary, if its frequency > cutoff
        word_counts = Counter()

        # iterate through the dataset
        for text in dataset_df.text:
            # split text into words
            words = tokenizer.tokenize(text)

            # update word_counts for all words in the text
            for word in words:
                word_counts[word] += 1

        # for all extracted words
        for word, count in word_counts.items():
            # if the word is not punctuation and it appears more than @cutoff times, add it to the Vocabulary
            if (word not in string.punctuation) and (count > cutoff):
                # add token to the Vocabulary
                text_vocabulary.add_token(word)

        return cls(text_vocabulary, target_vocabulary)
Example no. 24
    def __init__(self, image_ids, image_folder_path, mode = 'train', vocab_file = "", vocab_threshold = 5, batch_size = 10):
        assert mode in ['train', 'val', 'test']
        
        self.mode = mode
        self.image_folder_path = image_folder_path
        self.batch_size = batch_size
        
        # Get pre-processed objects
        all_captions_dict = load_obj('captions_dict')
        captions_dict = { image_id: all_captions_dict[image_id] for image_id in image_ids } # only include selected subset of captions

        # Obtain sample of training images
        #self.training_image_ids, captions_dict = get_training_indices(sample_size = sample_size, mode = "balanced_clean")
        
        # self.training_image_ids, self.images_path, self.image_id_dict, captions_dict \
        # = get_data(image_folder_path, annotations_path, sample_size, data_type)

        # Set up vocabulary or load from training set
        if self.mode == 'train':
            self.vocab = Vocabulary(captions_dict)
            print('Vocabulary successfully created')
        elif vocab_file != "":
            self.vocab = vocab_file
            self.word2idx = self.vocab.word2idx
            self.idx2word = self.vocab.idx2word
            #print('Vocabulary successfully loaded')
        else:
            self.vocab = load_obj("vocab")
            self.word2idx = self.vocab.word2idx
            self.idx2word = self.vocab.idx2word
            print('Vocabulary successfully loaded')

        # Batch size is set to 1 in test mode
        if self.mode == 'test':
            self.batch_size = 1
        
        # Set up dataset
        self.im_ids = [] # with duplicates for indexing, i.e. if caption 1-5 all correspond to image 8, the im_ids will be [8,8,8,8,8]
        self.captions = []
        self.images = []
        self.captions_len = []
        for im_id, captions_list in captions_dict.items():
            for item in captions_list:
                self.im_ids.append(im_id)
                self.captions.append(item)
                self.captions_len.append(len(nltk.tokenize.word_tokenize(item)))
        
        # Set up parameters for image feature extraction
        self.transform = transforms.Compose([
            transforms.Resize(256),
            transforms.RandomCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),(0.229, 0.224, 0.225)),
        ])
Example no. 25
    def from_serializable(cls, contents, classifier_class):  # GLOVE_MODEL
        """Instantiate a ReviewVectorizer from a serializable dictionary

        Args:
            contents (dict): the serializable dictionary
        Returns:
            an instance of the ReviewVectorizer class
        """
        if classifier_class == 'GloVe':  # GLOVE_MODEL
            predictor_vocab = SequenceVocabulary.from_serializable(
                contents['predictor_vocab'])  # GLOVE_MODEL
        else:
            predictor_vocab = Vocabulary.from_serializable(
                contents['predictor_vocab'])

        target_vocab = Vocabulary.from_serializable(contents['target_vocab'])

        return cls(predictor_vocab=predictor_vocab,
                   target_vocab=target_vocab,
                   max_predictor_length=contents['max_predictor_length'])
Example no. 26
def simple_run():
    """train without k-fold"""
    # set the logger
    utils.set_logger(config.log_dir)
    # set the GPU to the id passed on the command line
    if config.gpu != '':
        device = torch.device(f"cuda:{config.gpu}")
    else:
        device = torch.device("cpu")
    logging.info("device: {}".format(device))
    # process the data: split text and labels
    processor = Processor(config)
    processor.data_process()
    # build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    # split off the dev set
    word_train, word_dev, label_train, label_dev = dev_split(config.train_dir)
    # simple run without k-fold
    run(word_train, label_train, word_dev, label_dev, vocab, device)
Example no. 27
    def __init__(self, ngram_file, insert_word_posn, index_insert_words, min_count):
        
        print "\nNGramIndex: Initializing a new index" 

        self.index_insert_words = index_insert_words
        self.counts_total    = 0
        self.num_uniq_ngrams = 0 
        self.vocab = Vocabulary()

        print "Now examining ngram counts in", ngram_file
        max_lines = self.ngram_file_mincount_line(ngram_file, min_count)
        print "For a min ngram count of ", min_count,
        print "need to read", max_lines, "ngrams from", ngram_file

        self.ngram_hash   = numpy.zeros(max_lines, dtype=numpy.int64)
        self.ngram_count  = numpy.zeros(max_lines, dtype=numpy.int32)
        if index_insert_words:
            self.ngram_gapword = numpy.zeros(max_lines, dtype=numpy.int32)

        self.build_index(ngram_file, insert_word_posn, index_insert_words, 
                         max_lines, min_count)
Example no. 28
def prep_dataset():
    wiki_path = WIKI_PATH

    if CONTEXT_CAPACITY % 2 != 0:
        raise Exception("Context length should be even")

    context_window = CONTEXT_CAPACITY + 1

    print("Loading...", end="")
    wiki = WikiDataLoader(wiki_path)
    voc = Vocabulary()
    tok = Tokenizer()
    print("done")

    wiki_doc = wiki.next_doc()
    wikiprep = open("WikiPrepData.txt", "w")

    i = 0
    while wiki_doc:
        doc = tok(wiki_doc)
        voc.add(doc)

        sample = np.array(voc.text2ids(doc))
        indexer = np.arange(context_window)[None, :] + np.arange(
            len(sample) - context_window)[:, None]

        smpl = sample[indexer]

        for row in smpl:
            for val in row:
                wikiprep.write("%d " % val)
            wikiprep.write("\n")

        i += 1
        if i == 2000:
            break
        wiki_doc = wiki.next_doc()

    pickle.dump(voc, open("WikiPrepVoc.pkl", "wb"))
    print("Vocabulary ready")
Example no. 29
def main_freq():

    logging.info("Loading dataset")
    dataset = load_dataset("ag_news")
    dataset_text = [r['text'] for r in dataset['train']]
    dataset_labels = [r['label'] for r in dataset['train']]

    logging.info("Building vocabulary")
    vocab = Vocabulary(dataset_text)
    vocab.make_vocab_charts()
    plt.close()
    plt.pause(0.01)

    logging.info("Computing PPMI matrix")
    PPMI = compute_ppmi_matrix([doc['text'] for doc in dataset['train']],
                               vocab)

    logging.info("Performing Truncated SVD to reduce dimensionality")
    word_vectors = dim_reduce(PPMI)

    logging.info("Preparing T-SNE plot")
    plot_word_vectors_tsne(word_vectors, vocab)
Example no. 30
def k_fold_run():
    """train with k-fold"""
    # set the logger
    utils.set_logger(config.log_dir)
    # set the GPU to the id passed on the command line
    if config.gpu != '':
        device = torch.device(f"cuda:{config.gpu}")
    else:
        device = torch.device("cpu")
    logging.info("device: {}".format(device))
    # process the data: split text and labels
    processor = Processor(config)
    processor.data_process()
    # build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    # split off the dev set
    data = np.load(config.train_dir, allow_pickle=True)
    words = data["words"]
    labels = data["labels"]
    kf = KFold(n_splits=config.n_split)
    kf_data = kf.split(words, labels)
    kf_index = 0
    total_test_loss = 0
    total_f1 = 0
    for train_index, dev_index in kf_data:
        kf_index += 1
        word_train = words[train_index]
        label_train = labels[train_index]
        word_dev = words[dev_index]
        label_dev = labels[dev_index]
        test_loss, f1 = run(word_train, label_train, word_dev, label_dev,
                            vocab, device, kf_index)
        total_test_loss += test_loss
        total_f1 += f1
    average_test_loss = float(total_test_loss) / config.n_split
    average_f1 = float(total_f1) / config.n_split
    logging.info("Average test loss: {} , average f1 score: {}".format(
        average_test_loss, average_f1))
Example no. 31
    def get_result(self, event=None):
        inp = self.entry.get()
        if self.mode == 'chat':
            if inp != '':
                self.entry.set('')
                self.output, self.input, attention_plot = evaluate(
                    inp, self.v, self.enc, self.dec, self.hparams['MAX_LEN'])
                self.attention_weights = attention_plot[:len(
                    self.output.split(' ')), :len(self.input.split(' '))]

                res = Vocabulary.restore_text(self.output)
                self.update_label(inp, res)
                self.display_text.config(state=tk.NORMAL)
                self.display_text.insert(
                    tk.END, self.text_history[-2] + self.text_history[-1])
                self.display_text.config(state=tk.DISABLED)
                self.display_text.see(tk.END)
        else:
            self.entry.set('')
            if self.auto_inp is None and inp != '':
                self.auto_inp = inp
                self.main_button.config(text='Next')

            self.output, self.input, attention_plot = evaluate(
                self.auto_inp, self.v, self.enc, self.dec,
                self.hparams['MAX_LEN'])
            self.attention_weights = attention_plot[:len(self.output.split(
                ' ')), :len(self.input.split(' '))]

            res = Vocabulary.restore_text(self.output)
            self.update_label(self.auto_inp, res)
            self.display_text.config(state=tk.NORMAL)
            self.display_text.insert(
                tk.END, self.text_history[-2] + self.text_history[-1])
            self.display_text.config(state=tk.DISABLED)
            self.display_text.see(tk.END)

            self.auto_inp = res
Example no. 32
    def similar_test(self):
        similar_to_paint = ['faint', 'saint', 'taint', 'point', 'print', 'pains']
        similar_to_coder = ['cider', 'ceder', 'comer', 'coper', 'corer', 'cower', 'cover', 'coyer', 'codex', 'codes']

        vocab = Vocabulary()
        vocab.fetch("dictionary.txt")

        similar_words = [x for x in vocab.similar("paint")]     # get words one character different from 'paint'
        for item in similar_to_paint:                           # check that the test words are contained in result list
            self.assertIn(item, similar_words)

        similar_words = [x for x in vocab.similar("coder")]     # get words one character different from 'coder'
        for item in similar_to_coder:                           # check that the test words are contained in result list
            self.assertIn(item, similar_words)

        similar_words = [x for x in vocab.similar("zzzzz")]     # test non-sense word returns no results
        self.assertEqual(similar_words, [])
Example no. 33
def load_vocab(file):
  v = Vocabulary()
  v.load(file)
  return v
Example no. 34
    def word_ladder_test(self):
        slant_to_grain = ['slant', 'plant', 'plank', 'blank', 'bland', 'brand', 'braid', 'brain', 'grain']
        smart_to_brain = ['smart', 'start', 'stars', 'sears', 'bears', 'beans', 'brans', 'brand', 'braid', 'brain']

        vocab = Vocabulary()
        vocab.fetch("dictionary.txt")

        self.assertEqual(vocab.word_ladder('slant', 'grain'), slant_to_grain)
        self.assertEqual(vocab.word_ladder('smart', 'brain'), smart_to_brain)