def _insert(iterable):
    """Normalize the given words and add the accepted ones to ``word_count``.

    Each item is passed through ``Vocabulary.normalize``. When
    ``valid_words`` is non-empty, normalized words missing from it are
    dropped; when it is empty/falsy, every word is kept.

    ``valid_words`` and ``word_count`` are not defined here; they are
    resolved from the surrounding scope (``word_count`` is presumably a
    ``Counter`` -- confirm against the enclosing function).
    """
    # Lazy generator: each word is normalized right before it is filtered.
    normalized = (Vocabulary.normalize(token) for token in iterable)
    accepted = [
        token for token in normalized
        if not valid_words or token in valid_words
    ]
    # Single update at the end, after the whole iterable has been processed.
    word_count.update(accepted)
def top_summary_words(args, examples, word_dict):
    """Return the most frequent summary tokens that are present in ``word_dict``.

    Every token in ``ex['summary'].tokens`` of each example is normalized
    with ``Vocabulary.normalize``; normalized tokens found in ``word_dict``
    are tallied. The result is ``Counter.most_common(args.tune_partial)``,
    i.e. a list of ``(word, count)`` pairs ordered from most to least common.
    """
    normalized = (
        Vocabulary.normalize(token)
        for example in examples
        for token in example['summary'].tokens
    )
    tally = Counter(word for word in normalized if word in word_dict)
    return tally.most_common(args.tune_partial)
def index_embedding_words(embedding_file):
    """Collect the normalized words listed in ``embedding_file`` into a set.

    The word on each line is taken to be the text before the first space
    (after stripping trailing whitespace) and is normalized with
    ``Vocabulary.normalize``. The special tokens ``BOS_WORD``, ``EOS_WORD``,
    ``PAD_WORD`` and ``UNK_WORD`` are always included in the returned set.
    """
    with open(embedding_file) as handle:
        # The line count is only used to give the progress bar a total.
        n_lines = count_file_lines(embedding_file)
        vocab = {
            Vocabulary.normalize(row.rstrip().split(' ')[0])
            for row in tqdm(handle, total=n_lines)
        }

    vocab |= {BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD}
    return vocab
Beispiel #4
0
 def _insert(iterable):
     """Normalize every word in ``iterable`` and add them all to ``word_count``.

     ``word_count`` is not defined here; it is resolved from the surrounding
     scope (presumably a ``Counter`` -- confirm against the enclosing
     function).
     """
     # Build the full list first so word_count is updated in one step only
     # after every word has been normalized.
     normalized = [Vocabulary.normalize(token) for token in iterable]
     word_count.update(normalized)