from gluonnlp import Vocab, data, embedding


def get_vocab(datasets):
    # Each item is assumed to be a (token_list, label) pair; item[0] holds the tokens.
    all_words = [word for dataset in datasets
                 for item in dataset
                 for word in item[0]]
    vocab = Vocab(data.count_tokens(all_words))
    # `args` is a module-level argparse namespace defined elsewhere in the
    # script; glove.6B ships in 50/100/200/300-dimensional variants.
    glove = embedding.create('glove',
                             source='glove.6B.' + str(args.embedding_dim) + 'd')
    vocab.set_embedding(glove)
    return vocab
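
# A minimal usage sketch for get_vocab. The (token_list, label) item layout
# and the argparse setup below are assumptions for illustration, not part of
# the original code.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--embedding_dim', type=int, default=100)
args = parser.parse_args([])

train = [(['i', 'love', 'this', 'movie'], 1), (['terrible', 'acting'], 0)]
dev = [(['a', 'great', 'vacation'], 1)]
vocab = get_vocab([train, dev])
print(len(vocab), vocab.embedding['love'].shape)  # vocab size and (100,)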
import gluonnlp
from gluonnlp import Vocab, data


def test_join_embedding():
    # Build two vocabularies over the same mixed Chinese/English counter,
    # then attach a different fastText embedding to each.
    counter = data.Counter(['love', '走秀', 'vacation'])
    vocab1 = Vocab(counter)
    vocab2 = Vocab(counter)
    chinese_embedding = gluonnlp.embedding.create('fasttext', source='wiki.zh')
    eng_embedding = gluonnlp.embedding.create('fasttext', source='wiki.simple')
    vocab1.set_embedding(chinese_embedding)
    vocab2.set_embedding(eng_embedding)
    # Element-wise sum of the two lookups for the same token.
    print(vocab1.embedding['vacation'] + vocab2.embedding['vacation'])
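
# Follow-up sketch: GluonNLP returns an all-zero vector for tokens a
# pre-trained embedding does not cover, so the element-wise sum above falls
# back to whichever embedding actually knows the token. The helper below
# (hypothetical name, not from the original) concatenates the two lookups
# instead, keeping both embedding spaces separate:
import mxnet as mx


def join_embeddings(vocab_a, vocab_b, token):
    # Result has dim_a + dim_b entries; a token unknown to either embedding
    # contributes a zero block for that embedding.
    return mx.nd.concat(vocab_a.embedding[token],
                        vocab_b.embedding[token], dim=0)

# e.g. join_embeddings(vocab1, vocab2, 'vacation').shape -> (600,) for two
# 300-dimensional fastText embeddings.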
import itertools
import time

import tqdm

import gluonnlp as nlp
from gluonnlp import Vocab


# Static helper of SQuADDataPipeline; SQuADAsyncVocabMapper,
# SQuADAsyncVocabReducer and SQuADDataPipeline._partition are defined in the
# same module.
def _get_vocabs(train_examples, dev_examples, emb_file_name,
                is_cased_embedding, shrink_word_vocab, pool):
    """Create both word-level and character-level vocabularies.

    Vocabularies are built using data from both train and dev datasets.

    Parameters
    ----------
    train_examples : List[dict]
        Tokenized training examples
    dev_examples : List[dict]
        Tokenized dev examples
    emb_file_name : str
        GloVe embedding file name
    is_cased_embedding : bool
        When True, the provided embedding file is cased, uncased otherwise
    shrink_word_vocab : bool
        When True, only tokens that have embeddings in the embedding file
        remain in the word_vocab; otherwise tokens with no embedding also stay
    pool : Pool
        Multiprocessing pool to use

    Returns
    -------
    word_vocab : Vocab
        Word-level vocabulary
    char_vocab : Vocab
        Char-level vocabulary
    """
    # Count word occurrences over train + dev with async map/reduce workers.
    tic = time.time()
    print('Word counters receiving started.')

    word_mapper = SQuADAsyncVocabMapper()
    word_reducer = SQuADAsyncVocabReducer()
    word_mapped = list(
        tqdm.tqdm(word_mapper.run_async(
            itertools.chain(train_examples, dev_examples), pool),
                  total=len(train_examples) + len(dev_examples)))
    word_partitioned = tqdm.tqdm(
        SQuADDataPipeline._partition(itertools.chain(*word_mapped)),
        total=len(word_mapped))
    word_counts = list(
        tqdm.tqdm(word_reducer.run_async(word_partitioned, pool),
                  total=len(word_partitioned)))
    print('Word counters received in {:.3f} sec'.format(time.time() - tic))

    # Same map/reduce pass at the character level.
    tic = time.time()
    print('Char counters receiving started.')

    char_mapper = SQuADAsyncVocabMapper(iterate_over_example=True)
    char_reducer = SQuADAsyncVocabReducer()
    char_mapped = list(
        tqdm.tqdm(char_mapper.run_async(
            itertools.chain(train_examples, dev_examples), pool),
                  total=len(train_examples) + len(dev_examples)))
    char_partitioned = SQuADDataPipeline._partition(
        itertools.chain(*char_mapped))
    char_counts = list(
        tqdm.tqdm(char_reducer.run_async(char_partitioned, pool),
                  total=len(char_partitioned)))
    print('Char counters received in {:.3f} sec'.format(time.time() - tic))

    embedding = nlp.embedding.create('glove', source=emb_file_name)

    # For cased embeddings, fan each token out into its casing variants so any
    # surface form present in the embedding file can be matched; for uncased
    # embeddings, simply lowercase everything.
    if is_cased_embedding:
        word_counts = itertools.chain(
            *[[(item[0], item[1]), (item[0].lower(), item[1]),
               (item[0].capitalize(), item[1]), (item[0].upper(), item[1])]
              for item in word_counts])
    else:
        word_counts = [(item[0].lower(), item[1]) for item in word_counts]

    # Optionally keep only tokens that actually have pre-trained vectors.
    word_vocab = Vocab(
        {item[0]: item[1]
         for item in word_counts
         if not shrink_word_vocab or item[0] in embedding.token_to_idx},
        bos_token=None, eos_token=None)
    word_vocab.set_embedding(embedding)

    char_vocab = Vocab({item[0]: item[1] for item in char_counts},
                       bos_token=None, eos_token=None)

    return word_vocab, char_vocab
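
# The casing expansion above is the subtle step: with a cased embedding file,
# each counted token is fanned out into its lower/capitalize/upper variants so
# that whichever surface form exists in GloVe can still be matched. A
# standalone illustration with made-up counts (duplicates later collapse when
# the list is folded into the dict fed to Vocab):
import itertools

word_counts = [('Paris', 7), ('the', 120)]
expanded = list(itertools.chain(
    *[[(tok, n), (tok.lower(), n), (tok.capitalize(), n), (tok.upper(), n)]
      for tok, n in word_counts]))
# [('Paris', 7), ('paris', 7), ('Paris', 7), ('PARIS', 7),
#  ('the', 120), ('the', 120), ('The', 120), ('THE', 120)]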