# Shared imports assumed by the snippets below (GluonNLP's numpy-based API;
# exact module paths may vary across GluonNLP versions). Helpers such as
# process_corpus, ParallelCorpusApplyer, learn_subword, apply_subword and
# _CURR_DIR are defined in the surrounding scripts and test files.
import os
import time
import logging
import tempfile
from multiprocessing import Pool

import numpy as np
import mxnet as mx
from mxnet import gluon
import sacrebleu
from tqdm import tqdm

from gluonnlp.data import Vocab, tokenizers
from gluonnlp.data.tokenizers import huggingface
from gluonnlp.data.filtering import MosesNormalizer
from gluonnlp.data.batchify import Tuple, Pad, Stack
from gluonnlp.models.transformer import TransformerModel, TransformerNMTInference
from gluonnlp.sequence_sampler import BeamSearchSampler, BeamSearchScorer


def create_tokenizer(tokenizer_type, model_path, vocab_path):
    """Instantiate a subword tokenizer from saved model/vocabulary files."""
    if tokenizer_type == 'whitespace':
        return tokenizers.create(tokenizer_type, vocab=Vocab.load(vocab_path))
    elif tokenizer_type == 'spm':
        return tokenizers.create(tokenizer_type, model_path=model_path,
                                 vocab=vocab_path)
    elif tokenizer_type == 'subword_nmt':
        return tokenizers.create(tokenizer_type, model_path=model_path,
                                 vocab=vocab_path)
    elif tokenizer_type == 'yttm':
        return tokenizers.create(tokenizer_type, model_path=model_path)
    elif tokenizer_type in ['hf_bytebpe', 'hf_wordpiece', 'hf_bpe']:
        # New-style HuggingFace model files go through the unified
        # 'hf_tokenizer' wrapper; otherwise fall back to the legacy classes.
        if huggingface.is_new_version_model_file(model_path):
            return tokenizers.create('hf_tokenizer', model_path=model_path,
                                     vocab=vocab_path)
        elif tokenizer_type == 'hf_bytebpe':
            return tokenizers.create(tokenizer_type, merges_file=model_path,
                                     vocab_file=vocab_path)
        elif tokenizer_type == 'hf_wordpiece':
            return tokenizers.create(tokenizer_type, vocab_file=vocab_path)
        elif tokenizer_type == 'hf_bpe':
            return tokenizers.create(tokenizer_type, merges_file=model_path,
                                     vocab_file=vocab_path)
    else:
        raise NotImplementedError
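
# Example usage (a minimal sketch: 'spm.model' and 'spm.vocab' are hypothetical
# artifacts from a prior subword-learning run, and the encode(text, output_type)
# call assumes GluonNLP's tokenizer API):
#
#   spm_tokenizer = create_tokenizer('spm', 'spm.model', 'spm.vocab')
#   subwords = spm_tokenizer.encode('Hello world!')        # list of subword strings
#   token_ids = spm_tokenizer.encode('Hello world!', int)  # list of token ids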


def create_tokenizer(tokenizer_type, model_path, vocab_path):
    # A second variant of create_tokenizer: it skips the new-style HuggingFace
    # model-file check and builds the subword-nmt tokenizer with
    # codec_path/vocab_path arguments instead of model_path/vocab.
    if tokenizer_type == 'whitespace':
        return tokenizers.create(tokenizer_type, vocab=Vocab.load(vocab_path))
    elif tokenizer_type == 'spm':
        return tokenizers.create(tokenizer_type, model_path=model_path,
                                 vocab=vocab_path)
    elif tokenizer_type == 'subword_nmt':
        return tokenizers.create(tokenizer_type, codec_path=model_path,
                                 vocab_path=vocab_path)
    elif tokenizer_type == 'yttm':
        return tokenizers.create(tokenizer_type, model_path=model_path)
    elif tokenizer_type == 'hf_bytebpe':
        return tokenizers.create(tokenizer_type, merges_file=model_path,
                                 vocab_file=vocab_path)
    elif tokenizer_type == 'hf_wordpiece':
        return tokenizers.create(tokenizer_type, vocab_file=vocab_path)
    elif tokenizer_type == 'hf_bpe':
        return tokenizers.create(tokenizer_type, merges_file=model_path,
                                 vocab_file=vocab_path)
    else:
        raise NotImplementedError


def get_base_tokenizer(method, lang):
    """Create the base tokenizer applied before subword segmentation.

    Parameters
    ----------
    method : str
        The base tokenization method, one of 'moses', 'whitespace', or 'no'.
    lang : str
        The language code, e.g. 'en' or 'de', used by the Moses tokenizer.

    Returns
    -------
    base_tokenizer
        The base tokenizer, or None if ``method`` is 'no'.
    """
    if method == 'moses':
        return tokenizers.create('moses', lang)
    elif method == 'whitespace':
        return tokenizers.create('whitespace')
    elif method == 'no':
        return None
    else:
        raise NotImplementedError
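
# Example usage (hypothetical; the Moses tokenizer is language-aware, so the
# language code matters):
#
#   base_tokenizer = get_base_tokenizer('moses', 'en')
#   tokens = base_tokenizer.encode('Hello, world!')  # punctuation is split off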


def test_subword_algorithms_ende(model):
    dir_path = os.path.join(_CURR_DIR, 'learn_apply_subword_ende_results')
    os.makedirs(dir_path, exist_ok=True)
    dir_path = os.path.realpath(dir_path)
    parser = learn_subword.get_parser()
    apply_parser = apply_subword.get_parser()
    corpus_path_pair = [os.path.join(_CURR_DIR, 'data', 'wmt19-test-de-en.de'),
                        os.path.join(_CURR_DIR, 'data', 'wmt19-test-de-en.en')]
    args = parser.parse_args(['--corpus'] + corpus_path_pair +
                             ['--model', model,
                              '--vocab-size', '5000',
                              '--save-dir', dir_path])
    # Train the tokenizer
    learn_subword.main(args)
    if model in ['yttm', 'spm', 'subword_nmt']:
        model_key = model
    else:
        model_key = 'hf_tokenizer'
    tokenizer = tokenizers.create(
        model_key,
        model_path=os.path.join(dir_path, '{}.model'.format(model)),
        vocab=os.path.join(dir_path, '{}.vocab'.format(model)))
    # Apply the trained tokenizer to both sides of the corpus
    args = apply_parser.parse_args(
        ['--corpus'] + [corpus_path_pair[0]] +
        ['--model', model,
         '--model-path', os.path.join(dir_path, '{}.model'.format(model)),
         '--vocab-path', os.path.join(dir_path, '{}.vocab'.format(model)),
         '--save-path', os.path.join(dir_path, 'wmt19-test-de-en.de.{}'.format(model))])
    apply_subword.main(args)
    args = apply_parser.parse_args(
        ['--corpus'] + [corpus_path_pair[1]] +
        ['--model', model,
         '--model-path', os.path.join(dir_path, '{}.model'.format(model)),
         '--vocab-path', os.path.join(dir_path, '{}.vocab'.format(model)),
         '--save-path', os.path.join(dir_path, 'wmt19-test-de-en.en.{}'.format(model))])
    apply_subword.main(args)
    # Decode back with the trained tokenizer
    for prefix_fname in ['wmt19-test-de-en.de.{}'.format(model),
                         'wmt19-test-de-en.en.{}'.format(model)]:
        with open(os.path.join(dir_path, '{}.decode'.format(prefix_fname)),
                  'w', encoding='utf-8') as out_f:
            with open(os.path.join(dir_path, prefix_fname),
                      'r', encoding='utf-8') as in_f:
                for line in in_f:
                    out_f.write(tokenizer.decode(line.split()) + '\n')
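
# This test is normally driven by a pytest parametrize decorator over the
# supported model names, e.g. (assumed; the decorator is not shown above):
#
#   @pytest.mark.parametrize('model', ['yttm', 'spm', 'subword_nmt',
#                                      'hf_bytebpe', 'hf_wordpiece', 'hf_bpe'])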


def main(args):
    start = time.time()
    if args.model == 'spm':
        tokenizer_model = tokenizers.create('spm',
                                            model_path=args.model_path,
                                            vocab=args.vocab_path)
    elif args.model == 'subword_nmt':
        tokenizer_model = tokenizers.create('subword_nmt',
                                            codec_path=args.model_path,
                                            vocab_path=args.vocab_path,
                                            bpe_dropout=args.bpe_dropout)
    elif args.model == 'yttm':
        args.bpe_dropout = 0.0 if not args.bpe_dropout else args.bpe_dropout
        tokenizer_model = tokenizers.create('yttm',
                                            model_path=args.model_path,
                                            bpe_dropout=args.bpe_dropout,
                                            n_threads=1)
    elif args.model == 'hf_bytebpe':
        tokenizer_model = tokenizers.create('hf_bytebpe',
                                            merges_file=args.model_path,
                                            vocab_file=args.vocab_path,
                                            dropout=args.bpe_dropout,
                                            lowercase=args.lowercase)
    elif args.model == 'hf_wordpiece':
        tokenizer_model = tokenizers.create('hf_wordpiece',
                                            vocab_file=args.vocab_path,
                                            lowercase=args.lowercase,
                                            strip_accents=args.strip_accents)
    elif args.model == 'hf_bpe':
        tokenizer_model = tokenizers.create('hf_bpe',
                                            merges_file=args.model_path,
                                            vocab_file=args.vocab_path,
                                            dropout=args.bpe_dropout,
                                            lowercase=args.lowercase)
    else:
        raise NotImplementedError
    print('Applying {} to {}'.format(tokenizer_model.__class__.__name__,
                                     ', '.join(args.corpus)))
    output_type = {'subword': str, 'id': int}[args.output_type]
    applier = ParallelCorpusApplyer(args.corpus, tokenizer_model, output_type)
    with open(args.save_path, 'w', encoding='utf-8', newline='\n') as fo:
        with Pool(args.num_process) as pool:
            sentence_count = token_count = unk_count = 0
            # Tokenize the corpus chunk by chunk across the worker pool
            for i, (tokenized_sentences, sentence_num, token_num, unk_num) in \
                    enumerate(pool.imap(applier.process_chunk, applier.chunk_iter())):
                fo.write('\n'.join(tokenized_sentences))
                fo.write('\n')
                sentence_count += sentence_num
                token_count += token_num
                unk_count += unk_num
                if (i + 1) % 100 == 0:
                    print('Chunk {}, #Lines processed: {}'.format(i + 1, sentence_count))
    end = time.time()
    print('Done, #Lines processed: {}, Avg tokens per sentence: {:.1f}, '
          'Unknown rate: {:.1f}%, Time spent: {}'
          .format(sentence_count, token_count / sentence_count,
                  unk_count * 100 / token_count, end - start))
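
# A typical invocation of this script (sketch: --corpus/--model/--model-path/
# --vocab-path/--save-path match the flags used in the tests above, while
# --output-type and --num-process are assumed from the args attributes in main()):
#
#   python apply_subword.py --corpus corpus.de \
#       --model spm --model-path spm.model --vocab-path spm.vocab \
#       --save-path corpus.de.spm --output-type subword --num-process 4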


def test_subword_custom_token(model):
    parser = learn_subword.get_parser()
    corpus_path = os.path.join(_CURR_DIR, 'data', 'wmt19-test-zh-en.zh.jieba')
    with tempfile.TemporaryDirectory() as tempdir:
        dir_path = tempdir
        arguments = ['--corpus'] + [corpus_path] + \
                    ['--model', model,
                     '--vocab-size', '5000',
                     '--save-dir', dir_path,
                     '--disable-bos',
                     '--disable-eos',
                     '--custom-special-tokens',
                     'cls_token=<cls>', 'sep_token=<sep>']
        args = parser.parse_args(arguments)
        # Train the tokenizer
        learn_subword.main(args)
        if model in ['yttm', 'spm', 'subword_nmt']:
            model_key = model
        else:
            model_key = 'hf_tokenizer'
        tokenizer = tokenizers.create(
            model_key,
            model_path=os.path.join(dir_path, '{}.model'.format(model)),
            vocab=os.path.join(dir_path, '{}.vocab'.format(model)))
        assert tokenizer.vocab.sep_token == '<sep>'
        assert tokenizer.vocab.cls_token == '<cls>'


def evaluate(args):
    ctx_l = [mx.cpu()] if args.gpus is None or args.gpus == '' \
        else [mx.gpu(int(x)) for x in args.gpus.split(',')]
    src_normalizer = MosesNormalizer(args.src_lang)
    tgt_normalizer = MosesNormalizer(args.tgt_lang)
    base_src_tokenizer = tokenizers.create('moses', args.src_lang)
    base_tgt_tokenizer = tokenizers.create('moses', args.tgt_lang)
    src_tokenizer = create_tokenizer(args.src_tokenizer,
                                     args.src_subword_model_path,
                                     args.src_vocab_path)
    tgt_tokenizer = create_tokenizer(args.tgt_tokenizer,
                                     args.tgt_subword_model_path,
                                     args.tgt_vocab_path)
    src_vocab = src_tokenizer.vocab
    tgt_vocab = tgt_tokenizer.vocab
    if args.cfg.endswith('.yml'):
        cfg = TransformerModel.get_cfg().clone_merge(args.cfg)
    else:
        cfg = TransformerModel.get_cfg(args.cfg)
    cfg.defrost()
    cfg.MODEL.src_vocab_size = len(src_vocab)
    cfg.MODEL.tgt_vocab_size = len(tgt_vocab)
    if args.fp16:
        cfg.MODEL.dtype = 'float16'
    cfg.freeze()
    model = TransformerModel.from_cfg(cfg)
    model.hybridize()
    model.load_parameters(args.param_path, ctx=ctx_l)
    inference_model = TransformerNMTInference(model=model)
    inference_model.hybridize()
    # Construct the BeamSearchSampler
    if args.stochastic:
        scorer = BeamSearchScorer(alpha=0.0, K=0.0, temperature=1.0,
                                  from_logits=False)
    else:
        scorer = BeamSearchScorer(alpha=args.lp_alpha, K=args.lp_k,
                                  from_logits=False)
    beam_search_sampler = BeamSearchSampler(beam_size=args.beam_size,
                                            decoder=inference_model,
                                            vocab_size=len(tgt_vocab),
                                            eos_id=tgt_vocab.eos_id,
                                            scorer=scorer,
                                            stochastic=args.stochastic,
                                            max_length_a=args.max_length_a,
                                            max_length_b=args.max_length_b)
    logging.info(beam_search_sampler)
    all_src_token_ids, all_src_lines = process_corpus(
        args.src_corpus,
        sentence_normalizer=src_normalizer,
        base_tokenizer=base_src_tokenizer,
        bpe_tokenizer=src_tokenizer,
        add_bos=False,
        add_eos=True)
    if args.tgt_corpus is not None:
        all_tgt_token_ids, all_tgt_lines = process_corpus(
            args.tgt_corpus,
            sentence_normalizer=tgt_normalizer,
            base_tokenizer=base_tgt_tokenizer,
            bpe_tokenizer=tgt_tokenizer,
            add_bos=True,
            add_eos=True)
    else:
        # When running pure inference, populate placeholder target tokens
        all_tgt_token_ids = all_tgt_lines = [[] for _ in range(len(all_src_token_ids))]
    test_dataloader = gluon.data.DataLoader(
        list(zip(all_src_token_ids,
                 [len(ele) for ele in all_src_token_ids],
                 all_tgt_token_ids,
                 [len(ele) for ele in all_tgt_token_ids])),
        batch_size=32,
        batchify_fn=Tuple(Pad(), Stack(), Pad(), Stack()),
        shuffle=False)
    ctx = ctx_l[0]
    pred_sentences = []
    start_eval_time = time.time()
    # Evaluate against the reference corpus
    if not args.inference:
        avg_nll_loss = 0
        ntokens = 0
        for i, (src_token_ids, src_valid_length, tgt_token_ids, tgt_valid_length) \
                in enumerate(test_dataloader):
            src_token_ids = mx.np.array(src_token_ids, ctx=ctx, dtype=np.int32)
            src_valid_length = mx.np.array(src_valid_length, ctx=ctx, dtype=np.int32)
            tgt_token_ids = mx.np.array(tgt_token_ids, ctx=ctx, dtype=np.int32)
            tgt_valid_length = mx.np.array(tgt_valid_length, ctx=ctx, dtype=np.int32)
            # Teacher-forced forward pass to accumulate the NLL
            tgt_pred = model(src_token_ids, src_valid_length,
                             tgt_token_ids[:, :-1], tgt_valid_length - 1)
            pred_logits = mx.npx.log_softmax(tgt_pred, axis=-1)
            nll = -mx.npx.pick(pred_logits, tgt_token_ids[:, 1:])
            avg_nll_loss += mx.npx.sequence_mask(
                nll,
                sequence_length=tgt_valid_length - 1,
                use_sequence_length=True,
                axis=1).sum().asnumpy()
            ntokens += int((tgt_valid_length - 1).sum().asnumpy())
            init_input = mx.np.array(
                [tgt_vocab.bos_id for _ in range(src_token_ids.shape[0])],
                ctx=ctx)
            states = inference_model.init_states(src_token_ids, src_valid_length)
            samples, scores, valid_length = beam_search_sampler(init_input, states,
                                                                src_valid_length)
            for j in range(samples.shape[0]):
                pred_tok_ids = \
                    samples[j, 0, :valid_length[j, 0].asnumpy()].asnumpy().tolist()
                # Strip BOS/EOS, then undo subword and base tokenization
                bpe_decode_line = tgt_tokenizer.decode(pred_tok_ids[1:-1])
                pred_sentence = base_tgt_tokenizer.decode(bpe_decode_line.split(' '))
                pred_sentences.append(pred_sentence)
                print(pred_sentence)
            print('Processed {}/{}'.format(len(pred_sentences), len(all_tgt_lines)))
        end_eval_time = time.time()
        avg_nll_loss = avg_nll_loss / ntokens
        with open(os.path.join(args.save_dir, 'gt_sentences.txt'),
                  'w', encoding='utf-8') as of:
            of.write('\n'.join(all_tgt_lines))
            of.write('\n')
        with open(os.path.join(args.save_dir, 'pred_sentences.txt'),
                  'w', encoding='utf-8') as of:
            of.write('\n'.join(pred_sentences))
            of.write('\n')
        sacrebleu_out = sacrebleu.corpus_bleu(sys_stream=pred_sentences,
                                              ref_streams=[all_tgt_lines])
        logging.info('Time Spent: {}, #Sent={}, SacreBLEU={} '
                     '({:2.1f} {:2.1f} {:2.1f} {:2.1f}) '
                     '(BP={:.3f}, ratio={:.3f}, syslen={}, reflen={}), '
                     'Avg NLL={}, Perplexity={}'
                     .format(end_eval_time - start_eval_time,
                             len(all_tgt_lines),
                             sacrebleu_out.score,
                             *sacrebleu_out.precisions,
                             sacrebleu_out.bp,
                             sacrebleu_out.sys_len / sacrebleu_out.ref_len,
                             sacrebleu_out.sys_len,
                             sacrebleu_out.ref_len,
                             avg_nll_loss,
                             np.exp(avg_nll_loss)))
    # Inference only
    else:
        with open(os.path.join(args.save_dir, 'pred_sentences.txt'),
                  'w', encoding='utf-8') as of:
            processed_sentences = 0
            for src_token_ids, src_valid_length, _, _ in tqdm(test_dataloader):
                src_token_ids = mx.np.array(src_token_ids, ctx=ctx, dtype=np.int32)
                src_valid_length = mx.np.array(src_valid_length, ctx=ctx,
                                               dtype=np.int32)
                init_input = mx.np.array(
                    [tgt_vocab.bos_id for _ in range(src_token_ids.shape[0])],
                    ctx=ctx)
                states = inference_model.init_states(src_token_ids, src_valid_length)
                samples, scores, valid_length = beam_search_sampler(init_input, states,
                                                                    src_valid_length)
                for j in range(samples.shape[0]):
                    pred_tok_ids = \
                        samples[j, 0, :valid_length[j, 0].asnumpy()].asnumpy().tolist()
                    bpe_decode_line = tgt_tokenizer.decode(pred_tok_ids[1:-1])
                    pred_sentence = base_tgt_tokenizer.decode(bpe_decode_line.split(' '))
                    pred_sentences.append(pred_sentence)
                # Flush this batch's predictions and reset the buffer
                of.write('\n'.join(pred_sentences))
                of.write('\n')
                processed_sentences += len(pred_sentences)
                pred_sentences = []
        end_eval_time = time.time()
        logging.info('Time Spent: {}, Inferred sentences: {}'
                     .format(end_eval_time - start_eval_time, processed_sentences))
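
# A minimal sketch of driving evaluate() programmatically. Every attribute name
# below is taken from the args usages in evaluate(); the paths and the
# 'transformer_base' config name are hypothetical. Real runs would go through
# the script's own argument parser instead.
#
#   import argparse
#   eval_args = argparse.Namespace(
#       gpus='0', src_lang='de', tgt_lang='en',
#       src_tokenizer='spm', tgt_tokenizer='spm',
#       src_subword_model_path='spm.model', src_vocab_path='spm.vocab',
#       tgt_subword_model_path='spm.model', tgt_vocab_path='spm.vocab',
#       cfg='transformer_base', param_path='average.params', fp16=False,
#       stochastic=False, lp_alpha=0.6, lp_k=5, beam_size=4,
#       max_length_a=1, max_length_b=50,
#       src_corpus='test.de', tgt_corpus='test.en',
#       inference=False, save_dir='eval_out')
#   evaluate(eval_args)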