def _get_vocabs(self):
    word_list = []
    char_list = []
    for ds in self._datasets:
        for item in ds:
            words = self._get_word_tokens(item[1])
            word_list.extend(words)
            for word in words:
                char_list.extend(iter(word))

    word_counter = data.count_tokens(word_list)
    char_counter = data.count_tokens(char_list)
    word_vocab = Vocab(word_counter)
    char_vocab = Vocab(char_counter)

    # embedding_zh = gluonnlp.embedding.create('fasttext', source='cc.zh.300')
    # embedding_eng = gluonnlp.embedding.create('fasttext', source='cc.en.300')
    # embedding_ko = gluonnlp.embedding.create('fasttext', source='cc.ko.300')
    # word_vocab.set_embedding(embedding_eng, embedding_zh, embedding_ko)
    #
    # count = 0
    # for token, times in word_counter.items():
    #     if (word_vocab.embedding[token].sum() != 0).asscalar():
    #         count += 1
    #     else:
    #         print(token)
    #
    # print("{}/{} words have embeddings".format(count, len(word_counter)))

    return word_vocab, char_vocab
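# A minimal, self-contained sketch of the same word/char vocabulary construction,
# assuming whitespace-tokenized sentences in place of the class's self._datasets and
# self._get_word_tokens helpers (both are internal to the class above, so the inputs
# here are hypothetical stand-ins).
from gluonnlp import Vocab, data

sentences = ['hello world', 'hello gluonnlp']
word_list, char_list = [], []
for sentence in sentences:
    words = sentence.split()          # stand-in for self._get_word_tokens(item[1])
    word_list.extend(words)
    for word in words:
        char_list.extend(iter(word))  # every character of every word

word_vocab = Vocab(data.count_tokens(word_list))
char_vocab = Vocab(data.count_tokens(char_list))
print(word_vocab['hello'], char_vocab['h'])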
def test_join_embedding():
    counter = data.Counter(["love", "走秀", "vacation"])
    vocab1 = Vocab(counter)
    vocab2 = Vocab(counter)
    chinese_embedding = gluonnlp.embedding.create('fasttext', source='wiki.zh')
    eng_embedding = gluonnlp.embedding.create('fasttext', source='wiki.simple')
    vocab1.set_embedding(chinese_embedding)
    vocab2.set_embedding(eng_embedding)
    print(vocab1.embedding['vacation'] + vocab2.embedding['vacation'])
def test_gluon_nlp(self):
    # get corpus statistics
    counter = count_tokens(['alpha', 'beta', 'gamma', 'beta'])
    # create Vocab
    vocab = Vocab(counter)
    # find index based on token
    self.assertEqual(4, vocab['beta'])
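# Why vocab['beta'] == 4: with default settings, Vocab reserves the first indices for
# its special tokens ('<unk>', '<pad>', '<bos>', '<eos>' at 0-3) and assigns the
# remaining indices by decreasing frequency, so 'beta' (count 2) comes right after them.
# A small sketch of that layout:
from gluonnlp import Vocab
from gluonnlp.data import count_tokens

vocab = Vocab(count_tokens(['alpha', 'beta', 'gamma', 'beta']))
print(vocab.idx_to_token[:5])    # ['<unk>', '<pad>', '<bos>', '<eos>', 'beta']
print(vocab[['alpha', 'beta']])  # list lookup returns a list of indices, e.g. [5, 4]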
def _create_squad_vocab(tokenization_fn, dataset):
    all_tokens = []
    for data_item in dataset:
        all_tokens.extend(tokenization_fn(data_item[1]))
        all_tokens.extend(tokenization_fn(data_item[2]))
    counter = data.count_tokens(all_tokens)
    vocab = Vocab(counter)
    return vocab
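# A hedged usage sketch: _create_squad_vocab only needs items whose positions 1 and 2
# hold the two text fields to be tokenized (e.g. question and context), so a toy
# in-memory dataset and a whitespace tokenizer are enough to exercise it. The toy
# records below are made up for illustration.
toy_dataset = [
    (0, 'What is GluonNLP ?', 'GluonNLP is a toolkit for natural language processing .'),
    (1, 'What does it build on ?', 'It builds on Apache MXNet .'),
]
vocab = _create_squad_vocab(str.split, toy_dataset)
print(len(vocab), vocab['GluonNLP'])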
def get_vocab(datasets):
    all_words = [
        word
        for dataset in datasets
        for item in dataset
        for word in item[0]
    ]
    vocab = Vocab(data.count_tokens(all_words))
    glove = embedding.create('glove', source='glove.6B.' + str(args.embedding_dim) + 'd')
    vocab.set_embedding(glove)
    return vocab
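# A hedged follow-up sketch: once set_embedding has attached GloVe vectors, the matrix
# vocab.embedding.idx_to_vec (shape (len(vocab), dim)) can seed a gluon Embedding layer.
# The 50-dimensional GloVe source and the toy counter are assumptions, not values taken
# from get_vocab above.
import mxnet as mx
from mxnet import gluon
from gluonnlp import Vocab, data, embedding

vocab = Vocab(data.count_tokens(['movie', 'vacation', 'movie']))
vocab.set_embedding(embedding.create('glove', source='glove.6B.50d'))

layer = gluon.nn.Embedding(len(vocab), 50)
layer.initialize()
layer.weight.set_data(vocab.embedding.idx_to_vec)  # copy the pretrained vectors
print(layer(mx.nd.array([vocab['movie']])).shape)  # (1, 50)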
def _get_vocabs(train_examples, dev_examples, emb_file_name, is_cased_embedding,
                shrink_word_vocab, pool):
    """Create both word-level and character-level vocabularies.

    Vocabularies are built using data from both train and dev datasets.

    Parameters
    ----------
    train_examples : List[dict]
        Tokenized training examples
    dev_examples : List[dict]
        Tokenized dev examples
    emb_file_name : str
        Glove embedding file name
    is_cased_embedding : bool
        When True, the provided embedding file is cased, uncased otherwise
    shrink_word_vocab : bool
        When True, only tokens that have embeddings in the embedding file are
        retained in the word_vocab; otherwise tokens without embeddings are kept as well
    pool : Pool
        Multiprocessing pool to use

    Returns
    -------
    word_vocab : Vocab
        Word-level vocabulary
    char_vocab : Vocab
        Char-level vocabulary
    """
    tic = time.time()
    print('Word counters receiving started.')

    word_mapper = SQuADAsyncVocabMapper()
    word_reducer = SQuADAsyncVocabReducer()
    word_mapped = list(
        tqdm.tqdm(word_mapper.run_async(
            itertools.chain(train_examples, dev_examples), pool),
                  total=len(train_examples) + len(dev_examples)))
    word_partitioned = tqdm.tqdm(SQuADDataPipeline._partition(
        itertools.chain(*word_mapped)), total=len(word_mapped))
    word_counts = list(
        tqdm.tqdm(word_reducer.run_async(word_partitioned, pool),
                  total=len(word_partitioned)))
    print('Word counters received in {:.3f} sec'.format(time.time() - tic))

    tic = time.time()
    print('Char counters receiving started.')

    char_mapper = SQuADAsyncVocabMapper(iterate_over_example=True)
    char_reducer = SQuADAsyncVocabReducer()
    char_mapped = list(
        tqdm.tqdm(char_mapper.run_async(
            itertools.chain(train_examples, dev_examples), pool),
                  total=len(train_examples) + len(dev_examples)))
    char_partitioned = SQuADDataPipeline._partition(
        itertools.chain(*char_mapped))
    char_counts = list(
        tqdm.tqdm(char_reducer.run_async(char_partitioned, pool),
                  total=len(char_partitioned)))
    print('Char counters received in {:.3f} sec'.format(time.time() - tic))

    embedding = nlp.embedding.create('glove', source=emb_file_name)

    if is_cased_embedding:
        word_counts = itertools.chain(
            *[[(item[0], item[1]),
               (item[0].lower(), item[1]),
               (item[0].capitalize(), item[1]),
               (item[0].upper(), item[1])] for item in word_counts])
    else:
        word_counts = [(item[0].lower(), item[1]) for item in word_counts]

    word_vocab = Vocab(
        {item[0]: item[1]
         for item in word_counts
         if not shrink_word_vocab or item[0] in embedding.token_to_idx},
        bos_token=None, eos_token=None)
    word_vocab.set_embedding(embedding)

    char_vocab = Vocab({item[0]: item[1] for item in char_counts},
                       bos_token=None, eos_token=None)

    return word_vocab, char_vocab
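# A hedged standalone sketch of the vocabulary-shrinking step above: keep a counted
# token only if the pretrained embedding has a vector for it. The toy word_counts and
# the 'glove.6B.50d' source are assumptions, not the pipeline's real inputs.
import gluonnlp as nlp
from gluonnlp import Vocab

embedding = nlp.embedding.create('glove', source='glove.6B.50d')
word_counts = [('the', 120), ('qanet', 3), ('answer', 17)]  # toy (token, count) pairs

shrink_word_vocab = True
word_vocab = Vocab(
    {token: count
     for token, count in word_counts
     if not shrink_word_vocab or token in embedding.token_to_idx},
    bos_token=None, eos_token=None)
word_vocab.set_embedding(embedding)
print(word_vocab.idx_to_token)  # 'qanet' is dropped if GloVe has no vector for it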
def read_data(word_path, label_path, nature_path, max_seq_len, PAD, NOT, PAD_NATURE, UNK):
    '''Read, for every sentence, its words, their part-of-speech tags and their entity labels.

    Each sentence is truncated or padded to exactly max_seq_len words: words are padded
    with PAD, part-of-speech tags with PAD_NATURE and entity labels with NOT. Vocabularies
    are then built for the words, the part-of-speech tags and the labels, each reserving
    the unknown symbol UNK.

    Args:
        word_path: path to the file containing the words of every sentence
        label_path: path to the file containing the entity labels of the words
        nature_path: path to the file containing the part-of-speech tags of the words
        max_seq_len: maximum sentence length, counted in words
        PAD: padding symbol for words
        NOT: padding symbol for labels
        PAD_NATURE: padding symbol for part-of-speech tags
        UNK: unknown symbol

    Returns:
        word_vocab: vocabulary of the words
        label_vocab: vocabulary of the entity labels of the words
        nature_vocab: vocabulary of the part-of-speech tags of the words
        input_seqs: list of word sequences, one per sentence
            [[word1, word2, ...], [word1, word2, ...], ...]
        output_seqs: list of label sequences, one per sentence
            [[label1, label2, ...], [label1, label2, ...], ...]
        nature_seqs: list of part-of-speech sequences, one per sentence
            [[nature1, nature2, ...], [nature1, nature2, ...], ...]
    '''
    input_tokens, output_tokens, nature_tokens = [], [], []
    input_seqs, output_seqs, nature_seqs = [], [], []

    with open(word_path, 'r', encoding='utf-8') as fx, \
            open(label_path, 'r', encoding='utf-8') as fy, \
            open(nature_path, 'r', encoding='utf-8') as fn:
        word_lines = fx.readlines()
        label_lines = fy.readlines()
        word_natures = fn.readlines()
        assert len(word_lines) == len(word_natures)
        assert len(word_natures) == len(label_lines)

        for word_line, label_line, word_nature in zip(word_lines, label_lines, word_natures):
            input_seq = word_line.strip()
            output_seq = label_line.strip()
            nature_seq = word_nature.strip()

            cur_input_tokens = input_seq.split(' ')
            cur_output_tokens = output_seq.split(' ')
            cur_nature_tokens = nature_seq.split(' ')
            assert len(cur_input_tokens) == len(cur_output_tokens)
            assert len(cur_output_tokens) == len(cur_nature_tokens)

            # skip sentences with malformed entity labels
            if '' in cur_output_tokens:
                continue

            # if-else: pad short sequences, truncate long ones
            if len(cur_input_tokens) < max_seq_len or len(cur_output_tokens) < max_seq_len \
                    or len(cur_nature_tokens) < max_seq_len:
                # append padding symbols until every sequence has length max_seq_len
                while len(cur_input_tokens) < max_seq_len:
                    cur_input_tokens.append(PAD)
                    cur_output_tokens.append(NOT)
                    cur_nature_tokens.append(PAD_NATURE)
            else:
                cur_input_tokens = cur_input_tokens[0:max_seq_len]
                cur_output_tokens = cur_output_tokens[0:max_seq_len]
                cur_nature_tokens = cur_nature_tokens[0:max_seq_len]

            input_tokens.extend(cur_input_tokens)
            output_tokens.extend(cur_output_tokens)
            nature_tokens.extend(cur_nature_tokens)

            # record the padded/truncated sequences
            input_seqs.append(cur_input_tokens)
            output_seqs.append(cur_output_tokens)
            nature_seqs.append(cur_nature_tokens)

    # build the vocabularies
    word_vocab = Vocab(count_tokens(input_tokens), unknown_token=UNK, padding_token=PAD)
    label_vocab = Vocab(count_tokens(output_tokens), unknown_token=UNK, padding_token=NOT)
    nature_vocab = Vocab(count_tokens(nature_tokens), unknown_token=UNK, padding_token=PAD_NATURE)

    return word_vocab, label_vocab, nature_vocab, input_seqs, output_seqs, nature_seqs
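# A hedged follow-up sketch: the vocabularies returned by read_data can turn the padded
# token sequences into integer id sequences for training. The file paths and the padding
# symbols below are hypothetical placeholders, not values used elsewhere in this code.
import mxnet as mx

word_vocab, label_vocab, nature_vocab, input_seqs, output_seqs, nature_seqs = read_data(
    word_path='data/words.txt', label_path='data/labels.txt', nature_path='data/natures.txt',
    max_seq_len=50, PAD='<pad>', NOT='O', PAD_NATURE='<pad_nature>', UNK='<unk>')

# Indexing a Vocab with a list of tokens returns the corresponding list of indices.
X = mx.nd.array([word_vocab[seq] for seq in input_seqs])
Y = mx.nd.array([label_vocab[seq] for seq in output_seqs])
print(X.shape, Y.shape)  # (num_sentences, max_seq_len) each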