def build_vocab(json, threshold):
    """Build a simple vocabulary wrapper."""
    # NOTE: `coco` and `ids` were undefined in the original snippet; the two lines
    # below restore the usual setup, assuming `json` is a COCO annotation file
    # and pycocotools' COCO class is available.
    coco = COCO(json)
    ids = coco.anns.keys()
    counter = Counter()
    for i, id in enumerate(ids):
        caption = str(coco.anns[id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

        if (i + 1) % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(i + 1, len(ids)))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab

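# The snippets in this collection assume a small `Vocabulary` wrapper exposing
# `add_word`, `word2idx`, `idx2word`, and `len()`. A minimal self-contained
# sketch consistent with that usage (not any specific project's implementation):
class Vocabulary:
    """Simple word <-> index mapping."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        # Assign the next free index to unseen words; ignore duplicates.
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __len__(self):
        return len(self.word2idx)
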
def __init__(self, corpus = ""): self.voc = Vocabulary() self.tm = scipy.sparse.dok_matrix((1000,1000), dtype=np.float32) self.add_from_text(corpus) self.start = UnigramLM(self.voc) self.valid = False self.sorted_tokens = []
def readVocabularyFrom(fileName):
    file = open(fileName, 'rb')
    vocabulary = Vocabulary(fileName[:-4])
    for line in file:
        if line[0] != '(':
            continue
        vocabulary.addSentence(Sentence(line))
    file.close()
    print 'Vocabulary has been read from', fileName, '\n'
    return vocabulary

class Encoder_Decoder(nn.Module):
    def __init__(self, dim_emb, dim_hid, vocab_file='./data/preprocessed/vocab_file.vocab'):
        super(Encoder_Decoder, self).__init__()
        self.vocab = Vocabulary()
        self.vocab.load(vocab_file=vocab_file)
        self.dim_hid = dim_hid
        self.word_embeddings = nn.Embedding(len(self.vocab), dim_emb)
        # self.gru = nn.GRU(dim_emb, dim_hid, batch_first=True)
        self.en_lstm = nn.LSTM(dim_emb, dim_hid, batch_first=True)
        self.de_lstm = nn.LSTM(dim_emb, dim_hid, batch_first=True)
        # Fully connected layer that converts the LSTM's 128-dim hidden layer to 13 dims
        self.hidden2linear = nn.Linear(dim_hid, len(self.vocab))

    def forward(self, sequence, state=None):
        embedding = self.word_embeddings(sequence)
        hs, (h, c) = self.en_lstm(embedding, state)
        output, (h, c) = self.de_lstm(embedding, (h, c))
        # Compute attention
        # t_output = torch.transpose(output, 1, 2)
        # s = torch.bmm(hs, t_output)
        # attention_weight = self.softmax(s)
        output = self.hidden2linear(output)
        return output, (h, c)

    def generate(self, start=None, max_len=17):
        if start is None:
            start = random.choice(self.vocab.index2word)
        # NOTE: the original snippet used `self.embed`, which this class does not
        # define; `self.word_embeddings` appears to be the intended attribute.
        idx = self.word_embeddings.weight.new_full((1, 1), self.vocab.get_index(start), dtype=torch.long)
        decoded = [start]
        state = None
        unk = self.vocab.get_index('<unk>')
        while decoded[-1] != '<eos>' and len(decoded) < max_len:
            x, state = self.forward(idx, state)
            x[:, :, unk] = -float('inf')
            # prob = list(map(self.to_int, x.squeeze().tolist()))
            # idx = torch.tensor(random.choices(
            #     list(range(len(prob))), weights=prob, k=1)).view(1, -1)
            idx = torch.argmax(x, dim=-1)
            word = self.vocab.get_word(idx.item())
            decoded.append(word)
        return ' '.join(decoded)

def create_batches(self, contexts):
    batch_data = []
    label_data = []
    for target, t_context in contexts:
        target_index = Vocabulary.getIndex(target)
        context_index = Vocabulary.getIndex(t_context)
        if target_index is not None and context_index is not None:
            batch_data.append(target_index)
            label_data.append(context_index)
    return batch_data, label_data

def from_serializable(cls, contents):
    """
    Instantiate a ReviewVectorizer from a serializable dictionary

    :param contents: (dict) the serializable dictionary
    :return: an instance of the ReviewVectorizer class
    """
    review_vocab = Vocabulary.from_serializable(contents['review_vocab'])
    rating_vocab = Vocabulary.from_serializable(contents['rating_vocab'])
    return cls(review_vocab, rating_vocab)

class TransitionMatrix:
    def __init__(self, corpus=""):
        self.voc = Vocabulary()
        self.tm = scipy.sparse.dok_matrix((1000, 1000), dtype=np.float32)
        self.start = UnigramLM(self.voc)
        self.valid = False
        self.sorted_tokens = []
        # add_from_text uses self.start, so it must run after the attributes above
        # are initialised (the original called it earlier, which fails on a
        # non-empty corpus).
        self.add_from_text(corpus)

    def add_from_text(self, text):
        self.valid = False
        tss = tokenize_corpus(text)
        for ts in tss:
            if len(ts) > 0:
                self.start.add_token(ts[0])
                self.voc.expand(ts, from_tokens=True)
                wids = self.voc.get_word_id(ts)
                maxwid = max(wids)
                if maxwid >= self.tm.shape[0]:
                    self.tm.resize((maxwid + 1, maxwid + 1))
                grams = getNGrams(wids)
                for g in grams:
                    self.tm[g] += 1

    def validate(self):
        if not self.valid:
            self.p = self.tm.tocsr()
            s = self.p.sum(axis=1)
            self.p /= s
            self.valid = True
            self.sorted_tokens = self.voc.sorted_tokens()

    def sample_start(self):
        t, p = self.start.get_dist()
        # print(t, p)
        return np.random.choice(t, p=p)

    def sample(self, t):
        self.validate()
        wid = self.voc.get_word_id([t])[0]
        p = np.squeeze(np.asarray(self.p[wid, :]).reshape(-1, 1))
        return np.random.choice(self.sorted_tokens, p=p)

    def __str__(self):
        return repr(self.tm)

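# For reference, the sampling idea behind TransitionMatrix in isolation: a
# self-contained sketch (independent of the project's Vocabulary / UnigramLM /
# tokenize_corpus helpers) that builds a bigram transition matrix over a toy
# corpus and samples a short chain from it.
import numpy as np

def toy_markov_sample(corpus, length=5, seed=0):
    rng = np.random.default_rng(seed)
    tokens = corpus.split()
    vocab = sorted(set(tokens))
    index = {t: i for i, t in enumerate(vocab)}
    # Count bigram transitions.
    counts = np.zeros((len(vocab), len(vocab)))
    for a, b in zip(tokens, tokens[1:]):
        counts[index[a], index[b]] += 1
    # Row-normalise counts into transition probabilities (clip avoids 0-division).
    probs = counts / counts.sum(axis=1, keepdims=True).clip(min=1.0)
    word = rng.choice(vocab)
    out = [word]
    for _ in range(length - 1):
        row = probs[index[word]]
        if row.sum() == 0:      # no outgoing transitions observed
            break
        word = rng.choice(vocab, p=row)
        out.append(word)
    return ' '.join(out)

print(toy_markov_sample("the cat sat on the mat the cat ran"))
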
def __init__(self, collection, doStats=False, postingsFile=False):
    self.collection = collection
    self.lexAnalyser = False
    self.calculateStats = doStats
    self.vocabulary = Vocabulary()
    self.postings = DictionaryPostings({})
    self.documents = Documents()
    self.maxFreqInDocs = {}
    #self.positions = DictionaryPostings({})
    if self.calculateStats:
        self.stats = self.getInitStats()

def generate_vocabulary(counter, threshold):
    """Generate vocabulary."""
    vocab = Vocabulary()

    # Keep words that occur at least `threshold` times
    words = sorted([word for word, cnt in counter.items() if cnt >= threshold])

    # Add words to dictionary
    for i, word in enumerate(words):
        vocab.addWord(word)

    return vocab

def from_dataframe(cls, review_df, cutoff=25):
    """Instantiate the vectorizer from the dataset dataframe

    Args:
        review_df (pandas.DataFrame): the review dataset
        cutoff (int): the parameter for frequency-based filtering
    Returns:
        an instance of the ReviewVectorizer
    """
    review_vocab = Vocabulary(add_unk=True)
    rating_vocab = Vocabulary(add_unk=False)

    # Add ratings
    for rating in sorted(set(review_df.rating)):
        rating_vocab.add_token(rating)

    # Add top words if count > provided count
    word_counts = Counter()
    for review in review_df.review:
        for word in review.split(" "):
            if word not in string.punctuation:
                word_counts[word] += 1

    for word, count in word_counts.items():
        if count > cutoff:
            review_vocab.add_token(word)

    return cls(review_vocab, rating_vocab)

def __init__(self, dim_emb, dim_hid, vocab_file='./data/preprocessed/vocab_file.vocab'):
    super().__init__()
    self.vocab = Vocabulary()
    self.vocab.load(vocab_file=vocab_file)
    self.embed = torch.nn.Embedding(len(self.vocab), dim_emb)
    self.rnn1 = torch.nn.LSTM(dim_emb, dim_hid, batch_first=True)
    self.rnn2 = torch.nn.LSTM(dim_hid, dim_hid, batch_first=True)
    # self.rnn3 = torch.nn.LSTM(dim_hid, dim_hid, batch_first=True)
    # self.rnn4 = torch.nn.LSTM(dim_hid, dim_hid, batch_first=True)
    self.out = torch.nn.Linear(dim_hid, len(self.vocab))

def __init__(self, mode, prepocessed, srcVocaThreshold, tgtVocaThreshold,
             deprelLabelThreshold, printEvery, trainSize, testSize, devSize):
    if prepocessed:
        tgtTrain = './data/processed/train.en'
        actTrain = './data/processed/train.oracle.en'
        tgtDev = './data/processed/dev.en'
        actDev = './data/processed/dev.oracle.en'
        tgtTest = './data/processed/test.en'
        actTest = './data/processed/test.oracle.en'
        srcTrain = './data/processed/train.kr'
        deprelTrain = './data/processed/train.deprel.kr'
        srcDev = './data/processed/dev.kr'
        deprelDev = './data/processed/dev.deprel.kr'
        srcTest = './data/processed/test.kr'
        deprelTest = './data/processed/test.deprel.kr'
    else:
        train_permutation = list(range(0, 99999))
        random.shuffle(train_permutation)
        dev_permutation = list(range(0, 10000))
        random.shuffle(dev_permutation)
        print('Parsing target file into plain sentences & actions...')
        tgtTrain, actTrain = self.conll_to_action('./data/tagged_train.en', trainSize, train_permutation)
        tgtDev, actDev = self.conll_to_action('./data/tagged_dev.en', devSize, dev_permutation)
        print('Parsing source file into plain sentences & dependency relations...')
        srcTrain, deprelTrain = self.conll_to_deprels('./data/tagged_train.kr', trainSize, train_permutation)
        srcDev, deprelDev = self.conll_to_deprels('./data/tagged_dev.kr', devSize, dev_permutation)

    print('Loading processed data...')
    self.sourceVoc = Vocabulary(srcTrain, srcVocaThreshold, 'lang')
    self.targetVoc = Vocabulary(tgtTrain, tgtVocaThreshold, 'lang')
    self.actionVoc = Vocabulary(actTrain, None, 'action')
    self.deprelVoc = Vocabulary(deprelTrain, deprelLabelThreshold, 'deprel')
    self.trainData = []
    self.devData = []
    self.trainData = self.loadCorpus(srcTrain, tgtTrain, actTrain, deprelTrain, self.trainData)
    self.devData = self.loadCorpus(srcDev, tgtDev, actDev, deprelDev, self.devData)
    self.printEvery = printEvery
    print('Loaded...')

def prepare_training_data(sent_pairs):
    voc = Vocabulary()
    sent_pairs_normalized = []

    for sent_p in sent_pairs:
        # normalize the incorrect and correct sentence in the pair, and append them to the normalized sentence pairs
        incorrect_sent_normalized = normalize_string(sent_p[0])
        correct_sent_normalized = normalize_string(sent_p[1])
        normalized_sents_pair = (incorrect_sent_normalized, correct_sent_normalized)
        sent_pairs_normalized.append(normalized_sents_pair)

        # add the normalized sentence pair to the vocabulary
        voc.add_sentence_pair(normalized_sents_pair)

    return voc, sent_pairs_normalized

def get_model(name='saved_models/model', path=extracted_data_train_dir, size=TRAIN_SIZE):
    train_vocab = Vocabulary(train_captions, max_vocab_size)
    wat = [
        torch.tensor(x[1:], dtype=torch.int16)
        for x in train_vocab.encoded_captions
    ]
    padded = pad_sequence(wat).permute(1, 0)
    dataset = MyDataset(enc_captions=padded[:size],
                        image_paths=train_image_paths[:size],
                        data_dir=path + 'vecs/')
    dataloader = DataLoader(dataset=dataset, batch_size=256, num_workers=0)
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
    model = End2End(ENC_INPUT, ENC_OUTPUT, DEC_HID_DIM, DEC_OUTPUT, EMB_DIM,
                    ATTN_DIM, train_vocab, criterion, device)
    model.load_state_dict(torch.load(name))
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    # optimizer.lr = 0.001
    return model, dataset, dataloader, optimizer

def __init__(self, naiveBayesMatrix, beta=-1):
    naiveBayesMatrix = naiveBayesMatrix.todense()

    v = 0  # total Vocabulary words
    for x in range(0, naiveBayesMatrix.shape[0]):
        v = v + naiveBayesMatrix[x, :].sum()

    # B = 1/v
    B = beta
    if beta == -1:
        B = 1 / v

    alphaMinusOne = B  # (a-1)

    # (length of vocab list)
    vocab = Vocabulary()
    vocabListLength = vocab.length

    # (a-1)*(length of vocab list)
    denominatorStatic = alphaMinusOne * vocabListLength

    # (count of Xi in Yk) + (a-1)
    numerator = naiveBayesMatrix + alphaMinusOne

    # P(Xi|Yk)
    for x in range(numerator.shape[0]):
        denominatorDynamic = naiveBayesMatrix[x, :].sum()
        numerator[x, :] *= (1 / (denominatorDynamic + denominatorStatic))

    # log2(P(Xi|Yk))
    self.mapmatrix = np.log2(numerator)

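# The update above is Dirichlet/Laplace (MAP) smoothing:
#   P(X_i | Y_k) = (count(X_i, Y_k) + (a - 1)) / (count(Y_k) + (a - 1) * |V|)
# A tiny self-contained check of that formula on a toy count matrix,
# using the matrix width as the vocabulary size for simplicity
# (independent of the Vocabulary class used above):
import numpy as np

toy_counts = np.array([[3.0, 0.0, 1.0],   # class 0 word counts
                       [0.0, 2.0, 2.0]])  # class 1 word counts
alpha_minus_one = 1.0 / toy_counts.sum()  # beta defaulting to 1/v, as above
vocab_size = toy_counts.shape[1]
row_totals = toy_counts.sum(axis=1, keepdims=True)
probs = (toy_counts + alpha_minus_one) / (row_totals + alpha_minus_one * vocab_size)
assert np.allclose(probs.sum(axis=1), 1.0)  # each class distribution sums to 1
log_probs = np.log2(probs)
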
def getVocabByID(ID):
    from Vocabulary import Vocabulary
    try:
        #theData = ""
        # this returns a Vocabulary item based on the record ID in the database
        conn = sqlite3.connect('FinnVocab.db')
        num2use = str(ID)
        # debugging: print(num2use)
        # need to pass a tuple; if you pass a plain string, the query breaks on anything >= 10
        # see http://stackoverflow.com/questions/4409539/pythonsqlite-the-like-query-with-wildcards#4409584
        # for where I found the answer (while working on a team project & looking for something totally unrelated, of course! :D)
        cursor = conn.execute(
            "SELECT ID, FINNISH, ENGLISH FROM VOCABULARY WHERE ID = ?;",
            (num2use, ))
        # there should be exactly one result returned
        record = cursor.fetchone()
        theID = record[0]
        theFinn = record[1].strip()
        theEngl = record[2].strip()
        theData = Vocabulary(theID, theFinn, theEngl)
    except sqlite3.Error as e:
        print("Unable to retrieve record number " + str(ID) + ".")
        traceback.print_exc()
        # set theData to None so the program can fail gracefully.
        theData = None
    finally:
        # close the connection
        conn.close()
    return theData

def main(word, language, part_of_speech, number_of_clusters):
    # here we need to get sentences either from a txt or a db file
    sentences = get_corpus_from_txt_file(language)
    # sentences = get_corpus_from_db(language)
    print("Got sentences for language " + language)

    vocabulary = Vocabulary(language)
    print("Created Vocabulary")

    cluster = Cluster(language, number_of_clusters)

    words = vocabulary.make_array_of_words_from_sentences(sentences)
    print("Made arrays for every sentence")

    throne2vec = vocabulary.build_vocabulary(words)  # get the trained model (vocabulary)
    print("Trained the model")

    all_word_vectors_matrix_2d = cluster.make_vectors_2D(throne2vec)
    print("Made matrix with vectors")

def from_serializable(cls, contents):
    """
    Instantiate a TwitterVectorizer from a serializable dictionary

    Args:
        contents (dict): the serializable dictionary
    Returns:
        an instance of the TwitterVectorizer
    """
    # load the Text Vocabulary
    text_vocabulary = Vocabulary.from_serializable(contents["text_vocabulary"])
    # load the Target Vocabulary
    target_vocabulary = Vocabulary.from_serializable(contents["target_vocabulary"])

    return cls(text_vocabulary=text_vocabulary, target_vocabulary=target_vocabulary)

def from_dataframe(cls, news_df):
    """Instantiate the vectorizer from the dataset dataframe

    Args:
        news_df (pandas.DataFrame): the target dataset
    Returns:
        an instance of the NREVectorizer
    """
    relation_vocab = Vocabulary()
    for relation in set(news_df.relation):
        relation_vocab.add_token(relation)

    seq_vocab = SequenceVocabulary()
    for sequence in news_df.sequence:
        word_list = list(jieba.cut(sequence, cut_all=False))
        seq_vocab.add_many(word_list)

    return cls(seq_vocab, relation_vocab)

def loadVocabs(self):
    """
    Reads the suffixVocab files from disk and stores them in dictionaries and arrays
    """
    self.vocabs = {}
    for path in self.vocabPaths:
        print path
        self.vocabs[path] = Vocabulary(path)

def run(bladeDir, lang):
    indexVocabulary = Vocabulary.loadIndexFile(lang, vocabularyFileName='vocabulary')
    for filename in Path(bladeDir).glob('**/*.blade.php'):
        # bladeHtml = ''
        bladeHtml = Parser.getFileContent(filename)
        items = Parser.getFromHtml(bladeHtml)
        filterItems = list(filter(Parser.filterValuesLaravel, items))
        for item in filterItems:
            index = Vocabulary.checkIndex(indexVocabulary, item)
            if index:
                bladeHtml = bladeHtml.replace(item, "@lang('" + str(index) + "')")
        with open(filename, 'w') as file_handler:
            file_handler.write(bladeHtml)
    sys.exit(0)

def from_dataframe(cls, dataset_df, cutoff=c_frequencyCutoff):
    """
    Instantiate the Vectorizer from the dataset dataframe

    Args:
        dataset_df (pandas.DataFrame): the tweets dataset
        cutoff (int): the parameter for frequency-based filtering
    Returns:
        an instance of the TwitterVectorizer
    """
    # instantiate the Vocabulary for the text column
    text_vocabulary = cls._get_text_vocabulary()

    # instantiate the Vocabulary for the target column
    target_vocabulary = Vocabulary(add_unknown_token=False)

    # add elements to the Target Vocabulary
    for target in sorted(set(dataset_df.target)):
        target_vocabulary.add_token(target)

    # Tweet Tokenizer to split text into tokens
    tokenizer = TweetTokenizer()

    # add a word to the Text Vocabulary only if its frequency > cutoff
    word_counts = Counter()

    # iterate through the dataset
    for text in dataset_df.text:
        # split text into words
        words = tokenizer.tokenize(text)
        # update word_counts for all words in the text
        for word in words:
            word_counts[word] += 1

    # for all extracted words
    for word, count in word_counts.items():
        # if the word is not punctuation and it appears more than @cutoff times, add it to the Vocabulary
        if (word not in string.punctuation) and (count > cutoff):
            # add token to the Vocabulary
            text_vocabulary.add_token(word)

    return cls(text_vocabulary, target_vocabulary)

def __init__(self, image_ids, image_folder_path, mode='train', vocab_file="",
             vocab_threshold=5, batch_size=10):
    assert mode in ['train', 'val', 'test']
    self.mode = mode
    self.image_folder_path = image_folder_path
    self.batch_size = batch_size

    # Get pre-processed objects
    all_captions_dict = load_obj('captions_dict')
    captions_dict = {
        image_id: all_captions_dict[image_id]
        for image_id in image_ids
    }  # only include the selected subset of captions

    # Obtain sample of training images
    #self.training_image_ids, captions_dict = get_training_indices(sample_size = sample_size, mode = "balanced_clean")
    # self.training_image_ids, self.images_path, self.image_id_dict, captions_dict \
    #     = get_data(image_folder_path, annotations_path, sample_size, data_type)

    # Set up vocabulary or load from training set
    if self.mode == 'train':
        self.vocab = Vocabulary(captions_dict)
        print('Vocabulary successfully created')
    elif vocab_file != "":
        self.vocab = vocab_file
        self.word2idx = self.vocab.word2idx
        self.idx2word = self.vocab.idx2word
        #print('Vocabulary successfully loaded')
    else:
        self.vocab = load_obj("vocab")
        self.word2idx = self.vocab.word2idx
        self.idx2word = self.vocab.idx2word
        print('Vocabulary successfully loaded')

    # Batch_size set to 1 in test mode
    if self.mode == 'test':
        self.batch_size = 1

    # Set up dataset
    self.im_ids = []  # with duplicates for indexing, i.e. if captions 1-5 all correspond to image 8, im_ids will be [8,8,8,8,8]
    self.captions = []
    self.images = []
    self.captions_len = []
    for im_id, captions_list in captions_dict.items():
        for item in captions_list:
            self.im_ids.append(im_id)
            self.captions.append(item)
            self.captions_len.append(len(nltk.tokenize.word_tokenize(item)))

    # Set up parameters for image feature extraction
    self.transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

def from_serializable(cls, contents, classifier_class):  # GLOVE_MODEL
    """Instantiate a ReviewVectorizer from a serializable dictionary

    Args:
        contents (dict): the serializable dictionary
    Returns:
        an instance of the ReviewVectorizer class
    """
    if classifier_class == 'GloVe':  # GLOVE_MODEL
        predictor_vocab = SequenceVocabulary.from_serializable(
            contents['predictor_vocab'])  # GLOVE_MODEL
    else:
        predictor_vocab = Vocabulary.from_serializable(
            contents['predictor_vocab'])
    target_vocab = Vocabulary.from_serializable(contents['target_vocab'])

    return cls(predictor_vocab=predictor_vocab,
               target_vocab=target_vocab,
               max_predictor_length=contents['max_predictor_length'])

def simple_run():
    """train without k-fold"""
    # set the logger
    utils.set_logger(config.log_dir)
    # set the GPU to the id given on the command line
    if config.gpu != '':
        device = torch.device(f"cuda:{config.gpu}")
    else:
        device = torch.device("cpu")
    logging.info("device: {}".format(device))
    # process the data, separating text and labels
    processor = Processor(config)
    processor.data_process()
    # build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    # split off a validation set
    word_train, word_dev, label_train, label_dev = dev_split(config.train_dir)
    # simple run without k-fold
    run(word_train, label_train, word_dev, label_dev, vocab, device)

def __init__(self, ngram_file, insert_word_posn, index_insert_words, min_count):
    print "\nNGramIndex: Initializing a new index"
    self.index_insert_words = index_insert_words
    self.counts_total = 0
    self.num_uniq_ngrams = 0
    self.vocab = Vocabulary()

    print "Now examining ngram counts in", ngram_file
    max_lines = self.ngram_file_mincount_line(ngram_file, min_count)
    print "For a min ngram count of", min_count,
    print "need to read", max_lines, "ngrams from", ngram_file

    self.ngram_hash = numpy.zeros(max_lines, dtype=numpy.int64)
    self.ngram_count = numpy.zeros(max_lines, dtype=numpy.int32)
    if index_insert_words:
        self.ngram_gapword = numpy.zeros(max_lines, dtype=numpy.int32)

    self.build_index(ngram_file, insert_word_posn, index_insert_words, max_lines, min_count)

def prep_dataset():
    wiki_path = WIKI_PATH

    if CONTEXT_CAPACITY % 2 != 0:
        raise Exception("Context length should be even")

    context_window = CONTEXT_CAPACITY + 1

    print("Loading...", end="")
    wiki = WikiDataLoader(wiki_path)
    voc = Vocabulary()
    tok = Tokenizer()
    print("done")

    wiki_doc = wiki.next_doc()
    wikiprep = open("WikiPrepData.txt", "w")

    i = 0
    while wiki_doc:
        doc = tok(wiki_doc)
        voc.add(doc)
        sample = np.array(voc.text2ids(doc))
        indexer = np.arange(context_window)[None, :] + np.arange(len(sample) - context_window)[:, None]
        smpl = sample[indexer]
        for row in smpl:
            for val in row:
                wikiprep.write("%d " % val)
            wikiprep.write("\n")
        i += 1
        if i == 2000:
            break
        wiki_doc = wiki.next_doc()

    pickle.dump(voc, open("WikiPrepVoc.pkl", "wb"))
    print("Vocabulary ready")

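# The `indexer` broadcast above is a sliding-window trick: adding a row vector
# of offsets to a column vector of start positions yields every window of
# length `context_window` at once. A self-contained illustration:
import numpy as np

sample = np.array([10, 11, 12, 13, 14, 15])
context_window = 3
indexer = np.arange(context_window)[None, :] + np.arange(len(sample) - context_window)[:, None]
print(sample[indexer])
# [[10 11 12]
#  [11 12 13]
#  [12 13 14]]
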
def main_freq():
    logging.info("Loading dataset")
    dataset = load_dataset("ag_news")
    dataset_text = [r['text'] for r in dataset['train']]
    dataset_labels = [r['label'] for r in dataset['train']]

    logging.info("Building vocabulary")
    vocab = Vocabulary(dataset_text)
    vocab.make_vocab_charts()
    plt.close()
    plt.pause(0.01)

    logging.info("Computing PPMI matrix")
    PPMI = compute_ppmi_matrix([doc['text'] for doc in dataset['train']], vocab)

    logging.info("Performing Truncated SVD to reduce dimensionality")
    word_vectors = dim_reduce(PPMI)

    logging.info("Preparing T-SNE plot")
    plot_word_vectors_tsne(word_vectors, vocab)

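# `compute_ppmi_matrix` above is a project helper; for reference, a
# self-contained sketch of the generic PPMI computation on a toy word-word
# co-occurrence matrix (not the project's implementation):
import numpy as np

def ppmi(cooc):
    total = cooc.sum()
    p_xy = cooc / total                        # joint probabilities
    p_x = p_xy.sum(axis=1, keepdims=True)      # marginal over rows
    p_y = p_xy.sum(axis=0, keepdims=True)      # marginal over columns
    with np.errstate(divide='ignore', invalid='ignore'):
        pmi = np.log2(p_xy / (p_x * p_y))
    pmi[~np.isfinite(pmi)] = 0.0               # log(0) and 0/0 -> 0
    return np.maximum(pmi, 0.0)                # keep only positive PMI

toy = np.array([[0.0, 2.0, 1.0],
                [2.0, 0.0, 3.0],
                [1.0, 3.0, 0.0]])
print(ppmi(toy))
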
def k_fold_run():
    """train with k-fold"""
    # set the logger
    utils.set_logger(config.log_dir)
    # set the GPU to the id given on the command line
    if config.gpu != '':
        device = torch.device(f"cuda:{config.gpu}")
    else:
        device = torch.device("cpu")
    logging.info("device: {}".format(device))
    # process the data, separating text and labels
    processor = Processor(config)
    processor.data_process()
    # build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    # split off validation sets for k-fold
    data = np.load(config.train_dir, allow_pickle=True)
    words = data["words"]
    labels = data["labels"]
    kf = KFold(n_splits=config.n_split)
    kf_data = kf.split(words, labels)
    kf_index = 0
    total_test_loss = 0
    total_f1 = 0
    for train_index, dev_index in kf_data:
        kf_index += 1
        word_train = words[train_index]
        label_train = labels[train_index]
        word_dev = words[dev_index]
        label_dev = labels[dev_index]
        test_loss, f1 = run(word_train, label_train, word_dev, label_dev,
                            vocab, device, kf_index)
        total_test_loss += test_loss
        total_f1 += f1
    average_test_loss = float(total_test_loss) / config.n_split
    average_f1 = float(total_f1) / config.n_split
    logging.info("Average test loss: {} , average f1 score: {}".format(
        average_test_loss, average_f1))

def get_result(self, event=None):
    inp = self.entry.get()
    if self.mode == 'chat':
        if inp != '':
            self.entry.set('')
            self.output, self.input, attention_plot = evaluate(
                inp, self.v, self.enc, self.dec, self.hparams['MAX_LEN'])
            self.attention_weights = attention_plot[:len(self.output.split(' ')),
                                                    :len(self.input.split(' '))]
            res = Vocabulary.restore_text(self.output)
            self.update_label(inp, res)
            self.display_text.config(state=tk.NORMAL)
            self.display_text.insert(
                tk.END, self.text_history[-2] + self.text_history[-1])
            self.display_text.config(state=tk.DISABLED)
            self.display_text.see(tk.END)
    else:
        self.entry.set('')
        if self.auto_inp == None and inp != '':
            self.auto_inp = inp
            self.main_button.config(text='Next')
        self.output, self.input, attention_plot = evaluate(
            self.auto_inp, self.v, self.enc, self.dec, self.hparams['MAX_LEN'])
        self.attention_weights = attention_plot[:len(self.output.split(' ')),
                                                :len(self.input.split(' '))]
        res = Vocabulary.restore_text(self.output)
        self.update_label(self.auto_inp, res)
        self.display_text.config(state=tk.NORMAL)
        self.display_text.insert(
            tk.END, self.text_history[-2] + self.text_history[-1])
        self.display_text.config(state=tk.DISABLED)
        self.display_text.see(tk.END)
        self.auto_inp = res

def similar_test(self):
    similar_to_paint = ['faint', 'saint', 'taint', 'point', 'print', 'pains']
    similar_to_coder = ['cider', 'ceder', 'comer', 'coper', 'corer', 'cower',
                        'cover', 'coyer', 'codex', 'codes']

    vocab = Vocabulary()
    vocab.fetch("dictionary.txt")

    # get words one character different from 'paint'
    similar_words = [x for x in vocab.similar("paint")]
    for item in similar_to_paint:
        # check that the test words are contained in the result list
        self.assertIn(item, similar_words)

    # get words one character different from 'coder'
    similar_words = [x for x in vocab.similar("coder")]
    for item in similar_to_coder:
        # check that the test words are contained in the result list
        self.assertIn(item, similar_words)

    # test that a nonsense word returns no results
    similar_words = [x for x in vocab.similar("zzzzz")]
    self.assertEqual(similar_words, [])

def load_vocab(file):
    v = Vocabulary()
    v.load(file)
    return v

def word_ladder_test(self):
    slant_to_grain = ['slant', 'plant', 'plank', 'blank', 'bland', 'brand',
                      'braid', 'brain', 'grain']
    smart_to_brain = ['smart', 'start', 'stars', 'sears', 'bears', 'beans',
                      'brans', 'brand', 'braid', 'brain']

    vocab = Vocabulary()
    vocab.fetch("dictionary.txt")

    self.assertEqual(vocab.word_ladder('slant', 'grain'), slant_to_grain)
    self.assertEqual(vocab.word_ladder('smart', 'brain'), smart_to_brain)

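# The tests above exercise `similar` (words one letter apart) and `word_ladder`.
# A minimal self-contained sketch of both, using breadth-first search over an
# in-memory word set (not the Vocabulary implementation under test):
from collections import deque
import string

def similar(word, words):
    """Yield dictionary words that differ from `word` in exactly one letter."""
    for i in range(len(word)):
        for c in string.ascii_lowercase:
            candidate = word[:i] + c + word[i + 1:]
            if candidate != word and candidate in words:
                yield candidate

def word_ladder(start, goal, words):
    """Shortest chain of one-letter changes from `start` to `goal`, or []."""
    queue = deque([[start]])
    seen = {start}
    while queue:
        path = queue.popleft()
        if path[-1] == goal:
            return path
        for nxt in similar(path[-1], words):
            if nxt not in seen:
                seen.add(nxt)
                queue.append(path + [nxt])
    return []

print(word_ladder('cold', 'warm', {'cold', 'cord', 'card', 'ward', 'warm', 'word'}))
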