Esempio n. 1
0
def build_word_dict(args, examples, dict_size=None, only_queries=False):
    """Build a Vocabulary out of the words found in the given examples.

    All four arguments are forwarded unchanged to ``load_words``; every
    word it yields is added to a freshly created ``Vocabulary``.

    Returns:
        The populated ``Vocabulary`` instance.
    """
    vocabulary = Vocabulary()
    collected = load_words(args, examples, dict_size, only_queries)
    for word in collected:
        vocabulary.add(word)
    return vocabulary
Esempio n. 2
0
 def _insert(iterable):
     """Normalize the items of ``iterable`` and feed them to ``word_count``.

     ``valid_words`` and ``word_count`` are taken from the enclosing scope
     (not visible here). When ``valid_words`` is truthy, only normalized
     words contained in it are kept; otherwise every word is kept.
     """
     # Lazy, so each word is normalized right before it is filtered.
     normalized = (Vocabulary.normalize(token) for token in iterable)
     kept = [
         token for token in normalized
         if not valid_words or token in valid_words
     ]
     word_count.update(kept)
Esempio n. 3
0
def index_embedding_words(embedding_file):
    """Put all the words in embedding_file into a set.

    The first space-separated field of every line is treated as the word
    and passed through ``Vocabulary.normalize`` before being stored. The
    special tokens BOS_WORD, EOS_WORD, PAD_WORD and UNK_WORD are always
    added to the result.

    Args:
        embedding_file: Path to a text embedding file with one entry per
            line (presumably UTF-8 encoded, as is conventional for
            pretrained embedding files -- confirm for custom files).

    Returns:
        A set with the normalized words plus the four special tokens.
    """
    words = set()
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (non-ASCII tokens would otherwise fail to
    # decode or be mis-decoded where the default is not UTF-8).
    with open(embedding_file, encoding='utf-8') as f:
        # The total is only used to size the progress bar.
        for line in tqdm(f, total=count_file_lines(embedding_file)):
            w = Vocabulary.normalize(line.rstrip().split(' ')[0])
            words.add(w)

    words.update([BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD])
    return words