def build_word_dict(args, examples, dict_size=None, only_queries=False):
    """Return a vocabulary of the question and document words in `examples`.

    Every word produced by ``load_words`` is added to a freshly created
    ``Vocabulary``.

    Args:
        args: Forwarded unchanged to ``load_words``.
        examples: Forwarded unchanged to ``load_words``.
        dict_size: Forwarded unchanged to ``load_words`` (default ``None``).
        only_queries: Forwarded unchanged to ``load_words``
            (default ``False``).

    Returns:
        The populated ``Vocabulary`` instance.
    """
    vocab = Vocabulary()
    selected_words = load_words(args, examples, dict_size, only_queries)
    for token in selected_words:
        vocab.add(token)
    return vocab
def _insert(iterable):
    """Normalize the words in `iterable` and tally the accepted ones.

    ``valid_words`` and ``word_count`` are read from the surrounding scope.
    When ``valid_words`` is truthy, only normalized words contained in it
    are kept; otherwise every normalized word is kept. The kept words are
    handed to ``word_count.update`` in a single call.
    """
    # NOTE(review): word_count is presumably a collections.Counter -- confirm
    # against the enclosing function.
    normalized = (Vocabulary.normalize(raw) for raw in iterable)
    accepted = [
        token for token in normalized
        if not valid_words or token in valid_words
    ]
    word_count.update(accepted)
def index_embedding_words(embedding_file):
    """Put all the words in `embedding_file` into a set.

    The first space-separated field of each line is taken as the word and
    passed through ``Vocabulary.normalize`` before being stored. The special
    tokens ``BOS_WORD``, ``EOS_WORD``, ``PAD_WORD`` and ``UNK_WORD`` are
    always added to the result.

    Args:
        embedding_file: Path to a text file with one entry per line, where
            the word is the first space-delimited field.

    Returns:
        set: The normalized words from the file plus the four special tokens.
    """
    words = set()
    # Decode explicitly as UTF-8: without an encoding argument, open() uses
    # the platform locale, so non-ASCII tokens could fail to decode (or be
    # decoded differently) depending on the machine.
    with open(embedding_file, encoding='utf-8') as f:
        # The line count is only used to size the progress bar.
        for line in tqdm(f, total=count_file_lines(embedding_file)):
            w = Vocabulary.normalize(line.rstrip().split(' ')[0])
            words.add(w)
    words.update([BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD])
    return words