def _insert(iterable):
    """Normalize the tokens of ``iterable`` and add the accepted ones to the tally.

    ``valid_words`` and ``word_count`` are not defined in this function; they
    are resolved from the surrounding scope (not visible in this excerpt).
    When ``valid_words`` is truthy, only normalized tokens contained in it
    are counted; when it is falsy (e.g. ``None`` or empty), every token is
    counted.

    Args:
        iterable: tokens to normalize via ``Vocabulary.normalize``.
    """
    normalized = (Vocabulary.normalize(token) for token in iterable)
    # Keep a token when no whitelist is active, or when it is whitelisted.
    accepted = [
        token
        for token in normalized
        if not valid_words or token in valid_words
    ]
    # One bulk update after the whole iterable has been processed.
    word_count.update(accepted)
def top_summary_words(args, examples, word_dict):
    """Return the most frequent summary words that appear in ``word_dict``.

    Every token in ``ex['summary'].tokens`` of every example is normalized
    with ``Vocabulary.normalize``; only normalized tokens for which
    ``token in word_dict`` holds are counted.

    Args:
        args: object exposing ``tune_partial``, passed to
            ``Counter.most_common`` as the number of entries to return.
        examples: iterable of mappings with a ``'summary'`` entry whose
            value has a ``tokens`` attribute.
        word_dict: container supporting ``in`` membership tests.

    Returns:
        List of ``(word, count)`` pairs as produced by
        ``Counter.most_common``.
    """
    normalized_tokens = (
        Vocabulary.normalize(token)
        for example in examples
        for token in example['summary'].tokens
    )
    tally = Counter(tok for tok in normalized_tokens if tok in word_dict)
    return tally.most_common(args.tune_partial)
def index_embedding_words(embedding_file):
    """Collect the normalized words listed in ``embedding_file`` into a set.

    The first space-separated field of each line is taken as the word and
    normalized with ``Vocabulary.normalize``. The special tokens
    ``BOS_WORD``, ``EOS_WORD``, ``PAD_WORD`` and ``UNK_WORD`` are always
    added to the result.

    Args:
        embedding_file: path to a text file with one entry per line, the
            word being the first space-separated field. The file is read
            as UTF-8 (assumption: embedding files are UTF-8 encoded).

    Returns:
        A ``set`` of normalized words plus the four special tokens.
    """
    words = set()
    # Decode explicitly as UTF-8 instead of the platform's locale encoding,
    # so non-ASCII vocabulary entries are read the same way on every system.
    with open(embedding_file, encoding='utf-8') as f:
        # The line count is passed to tqdm as the progress-bar total.
        for line in tqdm(f, total=count_file_lines(embedding_file)):
            # Only the leading field is needed, so split at most once
            # rather than splitting the whole line.
            token = line.rstrip().split(' ', 1)[0]
            words.add(Vocabulary.normalize(token))
    words.update([BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD])
    return words
def _insert(iterable):
    """Normalize every token of ``iterable`` and add them all to the tally.

    ``word_count`` is not defined in this function; it is resolved from the
    surrounding scope (not visible in this excerpt). No filtering is applied.

    Args:
        iterable: tokens to normalize via ``Vocabulary.normalize``.
    """
    # Build the full list first so the tally is updated in one step, only
    # after every token has been normalized.
    word_count.update([Vocabulary.normalize(token) for token in iterable])