def extract(args):
    from tokenizer import load_vocab

    logger.info('load vocab from {}'.format(args.vocab_path))
    vocab = load_vocab(vocab_path=args.vocab_path)
    logger.info('vocab size: {}'.format(vocab.size()))
    load_pretrain_embedding(vocab, embed_size=args.embed_size,
                            embedding_path=args.file_path)
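# For context: `load_pretrain_embedding` and `logger` are defined elsewhere in
# this repo and are not shown in this excerpt. The sketch below is a
# hypothetical, GloVe-style loader, NOT the repo's actual implementation; the
# vocab methods `has_token`/`token_to_id` are placeholders for whatever lookup
# the real vocab class exposes (only `size()` appears in the code above).
import numpy as np

def load_pretrain_embedding_sketch(vocab, embed_size, embedding_path):
    # Random init so tokens missing from the pretrained file stay usable.
    matrix = np.random.normal(scale=0.1, size=(vocab.size(), embed_size))
    with open(embedding_path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            token, values = parts[0], parts[1:]
            if len(values) == embed_size and vocab.has_token(token):
                matrix[vocab.token_to_id(token)] = np.asarray(values, dtype=np.float32)
    return matrix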
import json

import gluonnlp


def convert_vocab(vocab_file):
    """GluonNLP specific code to convert the original vocabulary to
    nlp.vocab.BERTVocab."""
    original_vocab = load_vocab(vocab_file)
    token_to_idx = dict(original_vocab)
    num_tokens = len(token_to_idx)
    idx_to_token = [None] * len(original_vocab)
    for word in original_vocab:
        idx = int(original_vocab[word])
        idx_to_token[idx] = word

    def swap(token, target_idx, token_to_idx, idx_to_token, swap_idx):
        original_idx = token_to_idx[token]
        original_token = idx_to_token[target_idx]
        token_to_idx[token] = target_idx
        token_to_idx[original_token] = original_idx
        idx_to_token[target_idx] = token
        idx_to_token[original_idx] = original_token
        swap_idx.append((original_idx, target_idx))

    reserved_tokens = [
        gluonnlp.vocab.BERTVocab.PADDING_TOKEN,
        gluonnlp.vocab.BERTVocab.CLS_TOKEN,
        gluonnlp.vocab.BERTVocab.SEP_TOKEN,
        gluonnlp.vocab.BERTVocab.MASK_TOKEN,
    ]
    unknown_token = gluonnlp.vocab.BERTVocab.UNKNOWN_TOKEN
    padding_token = gluonnlp.vocab.BERTVocab.PADDING_TOKEN
    swap_idx = []
    assert unknown_token in token_to_idx
    assert padding_token in token_to_idx
    # Move the unknown token to index 0, then the reserved tokens to 1..4.
    swap(unknown_token, 0, token_to_idx, idx_to_token, swap_idx)
    for i, token in enumerate(reserved_tokens):
        swap(token, i + 1, token_to_idx, idx_to_token, swap_idx)

    # sanity checks
    assert len(token_to_idx) == num_tokens
    assert len(idx_to_token) == num_tokens
    assert None not in idx_to_token
    assert len(set(idx_to_token)) == num_tokens

    bert_vocab_dict = {}
    bert_vocab_dict['idx_to_token'] = idx_to_token
    bert_vocab_dict['token_to_idx'] = token_to_idx
    bert_vocab_dict['reserved_tokens'] = reserved_tokens
    bert_vocab_dict['unknown_token'] = unknown_token
    bert_vocab_dict['padding_token'] = padding_token
    bert_vocab_dict['bos_token'] = None
    bert_vocab_dict['eos_token'] = None
    bert_vocab_dict['mask_token'] = gluonnlp.vocab.BERTVocab.MASK_TOKEN
    bert_vocab_dict['sep_token'] = gluonnlp.vocab.BERTVocab.SEP_TOKEN
    bert_vocab_dict['cls_token'] = gluonnlp.vocab.BERTVocab.CLS_TOKEN
    json_str = json.dumps(bert_vocab_dict)
    converted_vocab = gluonnlp.vocab.BERTVocab.from_json(json_str)
    return converted_vocab, swap_idx
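# The returned swap_idx records each (original_idx, target_idx) pair swapped
# while moving [UNK] to index 0 and the reserved tokens to indices 1..4. A
# caller converting pretrained parameters can replay the same swaps on the
# embedding rows so they stay aligned with the reordered vocabulary. A minimal
# sketch, assuming the embedding is a NumPy array with one row per token:
import numpy as np

def reorder_embedding(embedding, swap_idx):
    for original_idx, target_idx in swap_idx:
        # Swap the two rows, mirroring the vocabulary index swap.
        embedding[[original_idx, target_idx]] = embedding[[target_idx, original_idx]]
    return embedding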
import json

import gluonnlp


def convert_vocab(vocab_file):
    """GluonNLP specific code to convert the original vocabulary to
    nlp.vocab.Vocab."""
    original_vocab = load_vocab(vocab_file)
    token_to_idx = dict(original_vocab)
    num_tokens = len(token_to_idx)
    idx_to_token = [None] * len(original_vocab)
    for word in original_vocab:
        idx = int(original_vocab[word])
        idx_to_token[idx] = word

    def swap(token, target_idx, token_to_idx, idx_to_token, swap_idx):
        original_idx = token_to_idx[token]
        original_token = idx_to_token[target_idx]
        token_to_idx[token] = target_idx
        token_to_idx[original_token] = original_idx
        idx_to_token[target_idx] = token
        idx_to_token[original_idx] = original_token
        swap_idx.append((original_idx, target_idx))

    reserved_tokens = ['[PAD]', '[CLS]', '[SEP]', '[MASK]']
    unknown_token = '[UNK]'
    padding_token = '[PAD]'
    swap_idx = []
    assert unknown_token in token_to_idx
    assert padding_token in token_to_idx
    # Move the unknown token to index 0, then the reserved tokens to 1..4.
    swap(unknown_token, 0, token_to_idx, idx_to_token, swap_idx)
    for i, token in enumerate(reserved_tokens):
        swap(token, i + 1, token_to_idx, idx_to_token, swap_idx)

    # sanity checks
    assert len(token_to_idx) == num_tokens
    assert len(idx_to_token) == num_tokens
    assert None not in idx_to_token
    assert len(set(idx_to_token)) == num_tokens

    vocab_dict = {}
    vocab_dict['idx_to_token'] = idx_to_token
    vocab_dict['token_to_idx'] = token_to_idx
    vocab_dict['reserved_tokens'] = reserved_tokens
    vocab_dict['unknown_token'] = unknown_token
    vocab_dict['padding_token'] = padding_token
    vocab_dict['bos_token'] = None
    vocab_dict['eos_token'] = None
    json_str = json.dumps(vocab_dict)
    converted_vocab = gluonnlp.Vocab.from_json(json_str)
    return converted_vocab, swap_idx
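# A hypothetical smoke test for convert_vocab, assuming load_vocab reads one
# token per line and maps each token to its line number (the file name and
# token list below are illustrative only):
if __name__ == "__main__":
    tokens = ['the', '[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]', 'a']
    with open('vocab_sample.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(tokens))
    converted_vocab, swap_idx = convert_vocab('vocab_sample.txt')
    print(converted_vocab.idx_to_token[:5])  # special tokens now at the front
    print(swap_idx)                          # swaps to replay on embedding rows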
parser.add_argument('--word_vocab', required=True)
parser.add_argument('--train_file', required=True)
parser.add_argument('--test_file', required=True)
parser.add_argument('--model_config')
parser.add_argument('--batch_size', default=32, type=int)
parser.add_argument('--learning_rate', default=1e-4, type=float)
parser.add_argument('--num_epoch', default=10, type=int)
parser.add_argument('--device', default='cpu')
parser.add_argument('--log_dir', default='logs')
parser.add_argument('--weight_dir', default='weight')

if __name__ == "__main__":
    args = parser.parse_args()
    print("Load vocab")
    tokenizer = load_vocab(args.char_vocab, args.word_vocab)
    print("Prepare data")
    train_ds = BERTDataset(args.train_file, tokenizer)
    test_ds = BERTDataset(args.test_file, tokenizer)
    train_dl = DataLoader(train_ds, shuffle=True, batch_size=args.batch_size)
    test_dl = DataLoader(test_ds, shuffle=False, batch_size=args.batch_size)
    print("Init model")
    char_vocab_len = len(tokenizer.char_stoi)
    word_vocab_len = len(tokenizer.word_stoi)
    if args.model_config:
        with open(args.model_config) as f:
            config = json.load(f)
    else:
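# The default-config branch above is truncated in this excerpt. For
# illustration only, a --model_config JSON for a setup like this might carry
# transformer hyperparameters along the lines of the following dict
# (hypothetical keys, not the repo's actual schema):
example_config = {
    'hidden_size': 256,
    'num_hidden_layers': 4,
    'num_attention_heads': 4,
    'max_position_embeddings': 512,
    'dropout': 0.1,
}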
        ]
        src = self.tokenizer.tokenize_char(src)
        if src.shape[0] < self.tgt_pad_len:
            src = np.concatenate([src, src_pad])
        label = self.tgt[index]
        label = np.array(label, dtype=np.int64)
        mask = np.array(mask, dtype=np.int64)
        pos = np.array(pos, dtype=np.int64)
        src = np.array(src, dtype=np.int64)
        # Truncate every field to the same fixed length so batches can be stacked.
        if src.shape[0] > self.tgt_pad_len:
            src = src[:self.tgt_pad_len]
        if mask.shape[0] > self.tgt_pad_len:
            mask = mask[:self.tgt_pad_len]
        if label.shape[0] > self.tgt_pad_len:
            label = label[:self.tgt_pad_len]
        if pos.shape[0] > self.tgt_pad_len:
            pos = pos[:self.tgt_pad_len]
        return src, mask, label, pos


if __name__ == "__main__":
    from tokenizer import load_vocab

    tokenizer = load_vocab('vocab/char_vocab.txt', 'vocab/word_vocab.txt')
    ds = SingleDataset('data/test.txt', tokenizer)
    src, mask, label, pos = ds[0]
    print(src.dtype)
    print(mask.dtype)
    print(label.dtype)
    print(pos.dtype)
    label = tokenizer._id_to_token(label[:10], tokenizer.word_itos)
    print(src[:10])
    print(mask[:10])
    print(label[:10])
    print(pos[:10])
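    # Continuing the demo: assuming the DataLoader used in this repo is
    # torch.utils.data.DataLoader, its default collate function stacks the
    # fixed-length int64 arrays from __getitem__ into LongTensor batches,
    # which is why everything is padded/truncated to tgt_pad_len above.
    from torch.utils.data import DataLoader

    dl = DataLoader(ds, batch_size=4, shuffle=False)
    src_b, mask_b, label_b, pos_b = next(iter(dl))
    print(src_b.shape, src_b.dtype)  # (4, tgt_pad_len), torch.int64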