def _load_processed_data(self):
    """Read the processed examples and both vocabularies back from disk.

    Every file is looked up inside ``self._data_root_path``.

    Returns
    -------
    train_examples : List[Tuple]
        Processed train examples. Each record consists of question_id,
        record_index, context_tokens_indices, question_tokens_indices,
        context_chars_indices, question_char_indices,
        start_token_index_of_the_answer, end_token_index_of_the_answer,
        context, context_tokens_spans
    dev_examples : List[Tuple]
        Processed dev examples, with the same record layout as
        ``train_examples``
    word_vocab : Vocab
        Word-level vocabulary
    char_vocab : Vocab
        Char-level vocabulary
    """
    def _read_json(file_name):
        # All four artefacts are JSON documents stored next to each other.
        full_path = os.path.join(self._data_root_path, file_name)
        with open(full_path, 'r') as handle:
            return json.load(handle)

    train_examples = _read_json(self._processed_train_data_file_name)
    dev_examples = _read_json(self._processed_dev_data_file_name)
    # The decoded JSON value of a vocabulary file is handed straight to
    # Vocab.from_json.
    word_vocab = Vocab.from_json(_read_json(self._word_vocab_file_name))
    char_vocab = Vocab.from_json(_read_json(self._char_vocab_file_name))

    return train_examples, dev_examples, word_vocab, char_vocab
def _get_vocabs(self):
    """Build word-level and char-level vocabularies from ``self._datasets``.

    The text at index 1 of every dataset item is tokenized with
    ``self._get_word_tokens``; the resulting words feed the word
    vocabulary and their individual characters feed the char vocabulary.

    Returns
    -------
    word_vocab : Vocab
        Vocabulary built from the word counts
    char_vocab : Vocab
        Vocabulary built from the character counts
    """
    all_words = []
    all_chars = []

    for dataset in self._datasets:
        for record in dataset:
            tokens = self._get_word_tokens(record[1])
            all_words.extend(tokens)
            for token in tokens:
                # Iterating a token yields its characters one by one.
                all_chars.extend(token)

    word_vocab = Vocab(data.count_tokens(all_words))
    char_vocab = Vocab(data.count_tokens(all_chars))

    # NOTE: a disabled experiment used to follow here. It created fastText
    # embeddings ('cc.zh.300', 'cc.en.300', 'cc.ko.300'), attached them with
    # word_vocab.set_embedding(...), and printed how many counted words had
    # a non-zero embedding vector. No embedding is attached at present.

    return word_vocab, char_vocab
def get_vocab(datasets):
    """Build a vocabulary over all datasets and attach GloVe vectors to it.

    Args:
        datasets: iterable of datasets; element 0 of every item is an
            iterable of words

    Returns:
        Vocab: vocabulary with the 'glove.6B.<dim>d' embedding set, where
        <dim> is read from the module-level ``args.embedding_dim``
    """
    token_stream = []
    for dataset in datasets:
        for item in dataset:
            token_stream.extend(item[0])

    vocab = Vocab(data.count_tokens(token_stream))

    glove_source = 'glove.6B.{}d'.format(args.embedding_dim)
    vocab.set_embedding(embedding.create('glove', source=glove_source))
    return vocab
def __init__(self, num_classes: int, embedding_dim: int, k_max: int, vocab: Vocab) -> None:
    """Instantiating VDCNN class

    Args:
        num_classes (int): the number of classes
        embedding_dim (int): the dimension of embedding vector for token
        k_max (int): output size of the adaptive max pooling that follows
            the last convolution block
        vocab (gluonnlp.Vocab): the instance of gluonnlp.Vocab
    """
    super(VDCNN, self).__init__()

    padding_idx = vocab.to_indices(vocab.padding_token)
    layers = [
        nn.Embedding(len(vocab), embedding_dim, padding_idx=padding_idx),
        Permute(),
        nn.Conv1d(embedding_dim, 64, kernel_size=3, stride=1, padding=1),
    ]

    # Four stages of two ConvBlocks each. Every stage except the last is
    # followed by a stride-2 max pooling; the last one is followed by the
    # adaptive pooling added below.
    stage_channels = [(64, 64), (64, 128), (128, 256), (256, 512)]
    last_stage = len(stage_channels) - 1
    for stage, (in_channels, out_channels) in enumerate(stage_channels):
        layers.append(ConvBlock(in_channels, out_channels))
        layers.append(ConvBlock(out_channels, out_channels))
        if stage != last_stage:
            layers.append(nn.MaxPool1d(2, 2))

    layers.append(nn.AdaptiveMaxPool1d(k_max))
    layers.append(Flatten())
    self._extractor = nn.Sequential(*layers)

    # 512 channels times k_max pooled positions is the classifier input size.
    self._classifier = nn.Sequential(
        nn.Linear(512 * k_max, 2048),
        nn.ReLU(),
        nn.Linear(2048, 2048),
        nn.ReLU(),
        nn.Linear(2048, num_classes),
    )
def __init__(self, num_classes: int, embedding_dim: int, vocab: Vocab) -> None:
    """Instantiating CharCNN class

    Args:
        num_classes (int): the number of classes
        embedding_dim (int): the dimension of embedding vector for token
        vocab (gluonnlp.Vocab): the instance of gluonnlp.Vocab
    """
    super(CharCNN, self).__init__()

    padding_idx = vocab.to_indices(vocab.padding_token)
    layers = [
        nn.Embedding(len(vocab), embedding_dim, padding_idx=padding_idx),
        Permute(),
    ]

    # Two wide (kernel 7) convolutions, each followed by max pooling.
    layers += [nn.Conv1d(embedding_dim, 256, kernel_size=7), nn.ReLU(), nn.MaxPool1d(3, 3)]
    layers += [nn.Conv1d(256, 256, kernel_size=7), nn.ReLU(), nn.MaxPool1d(3, 3)]

    # Four narrow (kernel 3) convolutions with a single pooling at the end.
    for _ in range(4):
        layers += [nn.Conv1d(256, 256, kernel_size=3), nn.ReLU()]
    layers += [nn.MaxPool1d(3, 3), Flatten()]

    self._extractor = nn.Sequential(*layers)

    # NOTE(review): 1792 is hard-coded as the flattened feature size, which
    # presumably ties the model to one fixed input length - confirm.
    self._classifier = nn.Sequential(
        nn.Linear(1792, 512),
        nn.ReLU(),
        nn.Dropout(),
        nn.Linear(512, 512),
        nn.ReLU(),
        nn.Dropout(),
        nn.Linear(512, num_classes),
    )

    self.apply(self._init_weights)
def test_gluon_nlp(self):
    """Check that a Vocab built from token counts resolves 'beta' to index 4."""
    corpus = ['alpha', 'beta', 'gamma', 'beta']

    # Corpus statistics -> vocabulary -> token lookup.
    token_counts = count_tokens(corpus)
    vocabulary = Vocab(token_counts)
    beta_index = vocabulary['beta']

    self.assertEqual(4, beta_index)
def __init__(self, label_vocab: Vocab, token_vocab: Vocab, lstm_hidden_dim: int) -> None:
    """Instantiating BilstmCRF class

    Args:
        label_vocab: (gluonnlp.Vocab): the instance of gluonnlp.Vocab that has label information
        token_vocab: (gluonnlp.Vocab): the instance of gluonnlp.Vocab that has token information
        lstm_hidden_dim (int): the number of hidden dimension of lstm
    """
    super(BilstmCRF, self).__init__()

    token_padding_idx = token_vocab.to_indices(token_vocab.padding_token)
    self._embedding = Embedding(
        token_vocab,
        padding_idx=token_padding_idx,
        freeze=False,
        permuting=False,
        tracking=True,
    )
    self._pipe = Linker(permuting=False)

    # The LSTM input size is read from the embedding layer that was just built.
    embedding_dim = self._embedding._ops.embedding_dim
    self._bilstm = BiLSTM(embedding_dim, lstm_hidden_dim, using_sequence=True)

    # The projection takes 2 * lstm_hidden_dim features and emits one score
    # per label.
    num_labels = len(label_vocab)
    self._fc = nn.Linear(2 * lstm_hidden_dim, num_labels)
    self._crf = CRF(
        num_labels,
        bos_tag_id=label_vocab.to_indices(label_vocab.bos_token),
        eos_tag_id=label_vocab.to_indices(label_vocab.eos_token),
        pad_tag_id=label_vocab.to_indices(label_vocab.padding_token),
    )
def __init__(self, vocab: Vocab, word_dropout_ratio: float = .2) -> None:
    """Instantiating MultiChannelEmbedding class

    Both channels start from the embedding attached to ``vocab``; the
    static channel is frozen, the non-static one is left trainable.

    Args:
        vocab (gluonnlp.Vocab): the instance of gluonnlp.Vocab
        word_dropout_ratio (float): ratio of replacing token with "<unk>" in the sequence
    """
    super(MultiChannelEmbedding, self).__init__()

    padding_idx = vocab.to_indices(vocab.padding_token)

    def _pretrained_weights():
        # torch.from_numpy shares memory with the array it is given, so each
        # channel converts its own array instead of reusing one tensor.
        return torch.from_numpy(vocab.embedding.idx_to_vec.asnumpy())

    self._static = nn.Embedding.from_pretrained(
        _pretrained_weights(), freeze=True, padding_idx=padding_idx)
    self._non_static = nn.Embedding.from_pretrained(
        _pretrained_weights(), freeze=False, padding_idx=padding_idx)
    self._word_dropout_ratio = word_dropout_ratio
def test_join_embedding():
    """Attach two fastText embeddings to twin vocabularies and print the sum
    of their vectors for the token 'vacation'."""
    token_counter = data.Counter(["love", "走秀", "vacation"])
    zh_vocab = Vocab(token_counter)
    en_vocab = Vocab(token_counter)

    zh_vectors = gluonnlp.embedding.create('fasttext', source='wiki.zh')
    en_vectors = gluonnlp.embedding.create('fasttext', source='wiki.simple')
    zh_vocab.set_embedding(zh_vectors)
    en_vocab.set_embedding(en_vectors)

    combined = zh_vocab.embedding['vacation'] + en_vocab.embedding['vacation']
    print(combined)
def _create_squad_vocab(tokenization_fn, dataset):
    """Build a vocabulary from fields 1 and 2 of every dataset record.

    Args:
        tokenization_fn: callable turning one text field into an iterable
            of tokens
        dataset: iterable of records; elements 1 and 2 of each record are
            tokenized, in that order

    Returns:
        Vocab: vocabulary built from the counted tokens
    """
    all_tokens = [
        token
        for record in dataset
        for field_index in (1, 2)
        for token in tokenization_fn(record[field_index])
    ]
    return Vocab(data.count_tokens(all_tokens))
def __init__(self, lstm_hidden_dim: int, da: int, r: int, vocab: Vocab) -> None:
    """Instantiating SentenceEncoder class

    Args:
        lstm_hidden_dim (int): the number of features in the hidden states in bi-directional lstm
        da (int): the number of features in hidden layer from self-attention
        r (int): the number of aspects of self-attention
        vocab (gluonnlp.Vocab): the instance of gluonnlp.Vocab
    """
    super(SentenceEncoder, self).__init__()

    padding_idx = vocab.to_indices(vocab.padding_token)
    self._embedding = Embedding(
        vocab,
        padding_idx=padding_idx,
        freeze=False,
        permuting=False,
        tracking=True,
    )
    self._pipe = Linker(permuting=False)

    # The LSTM input size is read from the embedding layer that was just built.
    embedding_dim = self._embedding._ops.embedding_dim
    self._bilstm = BiLSTM(embedding_dim, lstm_hidden_dim, using_sequence=True)

    # The attention layer is sized to 2 * lstm_hidden_dim input features.
    self._attention = SelfAttention(2 * lstm_hidden_dim, da, r)
def __init__(self, num_classes: int, embedding_dim: int, hidden_dim: int, vocab: Vocab) -> None:
    """Instantiating ConvRec class

    Args:
        num_classes (int): the number of classes
        embedding_dim (int): the dimension of embedding vector for token
        hidden_dim (int): size passed to both convolution layers and to
            the BiLSTM; the final linear layer takes 2 * hidden_dim features
        vocab (gluonnlp.Vocab): the instance of gluonnlp.Vocab
    """
    super(ConvRec, self).__init__()

    padding_idx = vocab.to_indices(vocab.padding_token)
    pipeline = [
        Embedding(len(vocab), embedding_dim, padding_idx, permuting=True, tracking=True),
        Conv1d(embedding_dim, hidden_dim, 5, 1, 1, F.relu, tracking=True),
        MaxPool1d(2, 2, tracking=True),
        Conv1d(hidden_dim, hidden_dim, 3, 1, 1, F.relu, tracking=True),
        MaxPool1d(2, 2, tracking=True),
        Linker(permuting=True),
        BiLSTM(hidden_dim, hidden_dim, using_sequence=False),
        nn.Dropout(),
        nn.Linear(in_features=2 * hidden_dim, out_features=num_classes),
    ]
    self._ops = nn.Sequential(*pipeline)

    self.apply(self._init_weights)
def vocab(self):
    """Load the vocabulary serialized as ``vocab.json`` under ``self._path``.

    Returns:
        Vocab: vocabulary rebuilt from the file's UTF-8 text content
    """
    vocab_file = os.path.join(self._path, 'vocab.json')
    with io.open(vocab_file, 'r', encoding='utf-8') as handle:
        serialized = handle.read()
    return Vocab.from_json(serialized)
def _get_vocabs(train_examples, dev_examples, emb_file_name, is_cased_embedding,
                shrink_word_vocab, pool):
    """Create both word-level and character-level vocabularies.

    Counts are collected over the train and dev examples together, using
    the asynchronous mapper/reducer helpers on the given pool.

    Parameters
    ----------
    train_examples : List[dict]
        Tokenized training examples
    dev_examples : List[dict]
        Tokenized dev examples
    emb_file_name : str
        Glove embedding file name, passed as ``source`` when the embedding
        is created
    is_cased_embedding : bool
        When True, the embedding file is treated as cased and every counted
        token is added in its original, lower, capitalized and upper forms.
        Otherwise every token is lower-cased
    shrink_word_vocab : bool
        When True, only tokens that have embeddings in the embedding file
        remain in the word_vocab. Otherwise tokens with no embedding
        also stay
    pool : Pool
        Multiprocessing pool to use

    Returns
    -------
    word_vocab : Vocab
        Word-level vocabulary with the Glove embedding attached
    char_vocab : Vocab
        Char-level vocabulary
    """
    example_count = len(train_examples) + len(dev_examples)

    # ---- word counts ----
    started = time.time()
    print('Word counters receiving started.')

    word_mapper = SQuADAsyncVocabMapper()
    word_reducer = SQuADAsyncVocabReducer()

    word_map_jobs = word_mapper.run_async(
        itertools.chain(train_examples, dev_examples), pool)
    word_mapped = list(tqdm.tqdm(word_map_jobs, total=example_count))

    word_partitioned = tqdm.tqdm(
        SQuADDataPipeline._partition(itertools.chain(*word_mapped)),
        total=len(word_mapped))

    word_reduce_jobs = word_reducer.run_async(word_partitioned, pool)
    word_counts = list(tqdm.tqdm(word_reduce_jobs, total=len(word_partitioned)))

    print('Word counters received in {:.3f} sec'.format(time.time() - started))

    # ---- char counts ----
    started = time.time()
    print('Char counters receiving started.')

    char_mapper = SQuADAsyncVocabMapper(iterate_over_example=True)
    char_reducer = SQuADAsyncVocabReducer()

    char_map_jobs = char_mapper.run_async(
        itertools.chain(train_examples, dev_examples), pool)
    char_mapped = list(tqdm.tqdm(char_map_jobs, total=example_count))

    char_partitioned = SQuADDataPipeline._partition(itertools.chain(*char_mapped))

    char_reduce_jobs = char_reducer.run_async(char_partitioned, pool)
    char_counts = list(tqdm.tqdm(char_reduce_jobs, total=len(char_partitioned)))

    print('Char counters received in {:.3f} sec'.format(time.time() - started))

    embedding = nlp.embedding.create('glove', source=emb_file_name)

    if is_cased_embedding:
        # Register each token under four casings with the same count.
        expanded_counts = []
        for item in word_counts:
            expanded_counts.append((item[0], item[1]))
            expanded_counts.append((item[0].lower(), item[1]))
            expanded_counts.append((item[0].capitalize(), item[1]))
            expanded_counts.append((item[0].upper(), item[1]))
        word_counts = expanded_counts
    else:
        word_counts = [(item[0].lower(), item[1]) for item in word_counts]

    # NOTE(review): when several entries collapse to the same key, the dict
    # keeps the last count seen rather than their sum - confirm this is
    # intended.
    word_frequencies = {}
    for item in word_counts:
        if not shrink_word_vocab or item[0] in embedding.token_to_idx:
            word_frequencies[item[0]] = item[1]

    word_vocab = Vocab(word_frequencies, bos_token=None, eos_token=None)
    word_vocab.set_embedding(embedding)

    char_frequencies = {item[0]: item[1] for item in char_counts}
    char_vocab = Vocab(char_frequencies, bos_token=None, eos_token=None)

    return word_vocab, char_vocab
def read_data(word_path, label_path, nature_path, max_seq_len, PAD, NOT, PAD_NATURE, UNK):
    '''
    Read the words, part-of-speech tags and entity labels of every sentence.

    Each sentence is truncated or padded to exactly max_seq_len tokens:
    words are padded with PAD, part-of-speech tags with PAD_NATURE and
    labels with NOT. Vocabularies for words, part-of-speech tags and labels
    are built afterwards, each with UNK as its unknown token.

    Args:
        word_path: path of the file holding the words of each sentence
        label_path: path of the file holding the labels of each sentence's words
        nature_path: path of the file holding the part-of-speech tags of each sentence's words
        max_seq_len: maximum sentence length, counted in words
        PAD: padding symbol for words
        NOT: padding symbol for labels
        PAD_NATURE: padding symbol for part-of-speech tags
        UNK: unknown symbol

    Returns:
        word_vocab: vocabulary of words
        label_vocab: vocabulary of entity labels
        nature_vocab: vocabulary of part-of-speech tags
        input_seqs: words of all sentences [[word1, word2, ...], [word1, word2, ...], ...]
        output_seqs: labels of all sentences [[label1, label2, ...], [label1, label2, ...], ...]
        nature_seqs: part-of-speech tags of all sentences [[nature1, nature2, ...], [nature1, nature2, ...], ...]
    '''
    input_seqs, output_seqs, nature_seqs = [], [], []

    with open(word_path, 'r', encoding='utf-8') as fx, \
            open(label_path, 'r', encoding='utf-8') as fy, \
            open(nature_path, 'r', encoding='utf-8') as fn:
        word_lines = fx.readlines()
        label_lines = fy.readlines()
        word_natures = fn.readlines()

    # The three files must describe the same sentences, line by line.
    assert len(word_lines) == len(word_natures)
    assert len(word_natures) == len(label_lines)

    for word_line, label_line, word_nature in zip(word_lines, label_lines, word_natures):
        words = word_line.strip().split(' ')
        labels = label_line.strip().split(' ')
        natures = word_nature.strip().split(' ')

        assert len(words) == len(labels)
        assert len(labels) == len(natures)

        # Skip sentences whose label annotation contains an empty entry.
        if '' in labels:
            continue

        if min(len(words), len(labels), len(natures)) < max_seq_len:
            # Too short: pad every sequence up to max_seq_len.
            while len(words) < max_seq_len:
                words.append(PAD)
                labels.append(NOT)
                natures.append(PAD_NATURE)
        else:
            # Long enough: keep only the first max_seq_len entries.
            words = words[0:max_seq_len]
            labels = labels[0:max_seq_len]
            natures = natures[0:max_seq_len]

        # Record the sequences.
        input_seqs.append(words)
        output_seqs.append(labels)
        nature_seqs.append(natures)

    # Build the vocabularies from every token of every recorded sequence.
    input_tokens = [token for seq in input_seqs for token in seq]
    output_tokens = [token for seq in output_seqs for token in seq]
    nature_tokens = [token for seq in nature_seqs for token in seq]

    word_vocab = Vocab(count_tokens(input_tokens), unknown_token=UNK, padding_token=PAD)
    label_vocab = Vocab(count_tokens(output_tokens), unknown_token=UNK, padding_token=NOT)
    nature_vocab = Vocab(count_tokens(nature_tokens), unknown_token=UNK, padding_token=PAD_NATURE)

    return word_vocab, label_vocab, nature_vocab, input_seqs, output_seqs, nature_seqs