def __init__(self, task, models, args, src_bpe=None, bpe_symbol='@@ '):
    self.task = task
    self.models = models
    self.src_dict = task.source_dictionary
    self.tgt_dict = task.target_dictionary
    self.src_bpe = src_bpe
    self.use_cuda = torch.cuda.is_available() and not args.cpu
    self.args = args

    # optimize model for generation
    for model in self.models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if self.use_cuda:
            model.cuda()

    self.generator = self.task.build_generator(args)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    self.align_dict = utils.load_align_dict(args.replace_unk)

    self.max_positions = utils.resolve_max_positions(
        self.task.max_positions(),
        *[model.max_positions() for model in models]
    )

    # Input/output string transforms, applied in registration order
    self.in_transforms = []
    self.out_transforms = []

    if getattr(args, 'moses', False):
        tokenizer = MosesTokenizer(lang=args.source_lang or 'en')
        detokenizer = MosesDetokenizer(lang=args.target_lang or 'en')
        self.in_transforms.append(lambda s: tokenizer.tokenize(s, return_str=True))
        self.out_transforms.append(lambda s: detokenizer.detokenize(s.split()))
    elif getattr(args, 'nltk', False):
        from nltk.tokenize import word_tokenize
        self.in_transforms.append(lambda s: ' '.join(word_tokenize(s)))

    if getattr(args, 'gpt2_bpe', False):
        from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
        encoder_json = os.path.join(os.path.dirname(src_bpe), 'encoder.json')
        vocab_bpe = src_bpe
        encoder = get_encoder(encoder_json, vocab_bpe)
        self.in_transforms.append(lambda s: ' '.join(map(str, encoder.encode(s))))
        self.out_transforms.append(lambda s: ' '.join(t for t in s.split() if t != '<unk>'))
        self.out_transforms.append(lambda s: encoder.decode(map(int, s.strip().split())))
    elif getattr(args, 'sentencepiece', False):
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        sp.Load(src_bpe)
        self.in_transforms.append(lambda s: ' '.join(sp.EncodeAsPieces(s)))
        self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, 'sentencepiece'))
    elif src_bpe is not None:
        bpe_parser = apply_bpe.create_parser()
        bpe_args = bpe_parser.parse_args(['--codes', self.src_bpe])
        bpe = apply_bpe.BPE(bpe_args.codes, bpe_args.merges, bpe_args.separator, None, bpe_args.glossaries)
        self.in_transforms.append(lambda s: bpe.process_line(s))
        self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, bpe_symbol))
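
# A minimal usage sketch (illustrative, not part of the original file):
# the in/out transform lists above form an ordered string-to-string
# pipeline. `wrapper` stands for an instance of the class this __init__
# belongs to (the class name is not shown in this snippet).
def apply_transforms(transforms, line):
    # Apply each transform in registration order.
    for transform in transforms:
        line = transform(line)
    return line

# e.g. tokenized = apply_transforms(wrapper.in_transforms, 'Hello world!')
#      detok     = apply_transforms(wrapper.out_transforms, hypo_str)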
def __init__(self, data_path, checkpoint_path="checkpoint_best.pt"):
    self.parser = options.get_generation_parser(interactive=True)
    self.parser.set_defaults(
        path=checkpoint_path,
        remove_bpe="sentencepiece",
        dataset_impl="lazy",
        num_workers=5,
    )
    self.args = options.parse_args_and_arch(self.parser, input_args=[data_path])
    utils.import_user_module(self.args)

    if self.args.buffer_size < 1:
        self.args.buffer_size = 1
    if self.args.max_tokens is None and self.args.max_sentences is None:
        self.args.max_sentences = 1

    assert not self.args.sampling or self.args.nbest == self.args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not self.args.max_sentences or self.args.max_sentences <= self.args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    self.use_cuda = torch.cuda.is_available() and not self.args.cpu

    # Setup task and load the model ensemble
    self.task = tasks.setup_task(self.args)
    self.models, self._model_args = checkpoint_utils.load_model_ensemble(
        self.args.path.split(':'),
        arg_overrides=eval(self.args.model_overrides),
        task=self.task,
    )

    # Set dictionaries
    self.src_dict = self.task.source_dictionary
    self.tgt_dict = self.task.target_dictionary

    # Optimize ensemble for generation
    for model in self.models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
            need_attn=self.args.print_alignment,
        )
        if self.args.fp16:
            model.half()
        if self.use_cuda:
            model.cuda()

    self.generator = self.task.build_generator(self.args)

    # Hack to support GPT-2 BPE
    if self.args.remove_bpe == 'gpt2':
        from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
        self.decoder = get_encoder(
            'fairseq/gpt2_bpe/encoder.json',
            'fairseq/gpt2_bpe/vocab.bpe',
        )
        self.encode_fn = lambda x: ' '.join(map(str, self.decoder.encode(x)))
    else:
        self.decoder = None
        self.encode_fn = lambda x: x

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    self.align_dict = utils.load_align_dict(self.args.replace_unk)

    self.max_positions = utils.resolve_max_positions(
        self.task.max_positions(),
        *[model.max_positions() for model in self.models]
    )
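
# A hypothetical usage sketch (the wrapper's class name is not shown in
# this snippet; `Translator` is assumed here purely for illustration):
#
#   translator = Translator('data-bin/my-corpus', checkpoint_path='checkpoint_best.pt')
#   tokens = translator.encode_fn('some raw input line')
#
# With the defaults above, remove_bpe='sentencepiece', so encode_fn is the
# identity and subword handling happens at detokenization time instead.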
def main(args):
    utils.import_user_module(args)

    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble (a single model is expected here)
    print('| loading model(s) from {}'.format(args.path))
    [model], _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Optimize model for generation
    model.make_generation_fast_(
        beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
        need_attn=args.print_alignment,
    )
    if args.fp16:
        model.half()
    if use_cuda:
        model.cuda()

    # Hack to support GPT-2 BPE
    if args.remove_bpe == 'gpt2':
        from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
        decoder = get_encoder(
            'fairseq/gpt2_bpe/encoder.json',
            'fairseq/gpt2_bpe/vocab.bpe',
        )

        def enc_fn(x):
            return ' '.join(map(str, decoder.encode(x)))
    else:
        decoder = None

        def enc_fn(x):
            return x

    # Max positions for batching
    max_positions = utils.resolve_max_positions(task.max_positions(), model.max_positions())

    # Prompt
    if args.buffer_size > 1:
        print('| Sentence buffer size:', args.buffer_size)
    print('| Type the input sentence and press return:')

    start_idx = 0
    # This tracks all encodings in the order that they are given as input
    all_encodings = []

    # Read chunks of the input stream one at a time
    for inputs in buffered_read(args.input, args.buffer_size):
        results = []
        # Make batches on the fly
        for batch in make_batches(inputs, args, task, max_positions, enc_fn):
            # Retrieve inputs
            src_tokens = batch.src_tokens
            src_lengths = batch.src_lengths
            if use_cuda:
                src_tokens = src_tokens.cuda()
                src_lengths = src_lengths.cuda()
            # Encode
            encodings = encode(model, src_tokens, src_lengths)
            # Save encodings in the correct order
            # (the batches are out of order to optimize padding)
            for i, (idx, h) in enumerate(zip(batch.ids.tolist(), encodings)):
                results.append((start_idx + idx, h))
        # Save the encodings in order
        for _, h in sorted(results, key=lambda x: x[0]):
            all_encodings.append(h.cpu().numpy())
        # Update running id counter
        start_idx += len(inputs)

    # Save all encodings to npy
    np.save(args.output_file, np.stack(all_encodings))
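
# A minimal sketch of consuming the saved encodings afterwards, assuming
# the script was run with --output-file encodings.npy (path is illustrative):
import numpy as np

encodings = np.load('encodings.npy')  # shape: (num_sentences, hidden_dim)
print(encodings.shape)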
def main(args):
    utils.import_user_module(args)

    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Initialize generator
    generator = task.build_generator(args)

    # Hack to support GPT-2 BPE
    if args.remove_bpe == 'gpt2':
        from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
        decoder = get_encoder(
            'fairseq/gpt2_bpe/encoder.json',
            'fairseq/gpt2_bpe/vocab.bpe',
        )
        encode_fn = lambda x: ' '.join(map(str, decoder.encode(x)))
    else:
        decoder = None
        encode_fn = lambda x: x

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        *[model.max_positions() for model in models]
    )

    if args.buffer_size > 1:
        print('| Sentence buffer size:', args.buffer_size)
    print('| Type the input sentence and press return:')

    start_id = 0
    for inputs in buffered_read(args.input, args.buffer_size):
        results = []
        for batch in make_batches(inputs, args, task, max_positions, encode_fn):
            src_tokens = batch.src_tokens
            src_lengths = batch.src_lengths
            if use_cuda:
                src_tokens = src_tokens.cuda()
                src_lengths = src_lengths.cuda()

            sample = {
                'net_input': {
                    'src_tokens': src_tokens,
                    'src_lengths': src_lengths,
                },
            }
            translations = task.inference_step(generator, models, sample)
            for i, (id, hypos) in enumerate(zip(batch.ids.tolist(), translations)):
                src_tokens_i = utils.strip_pad(src_tokens[i], tgt_dict.pad())
                results.append((start_id + id, src_tokens_i, hypos))

        # sort output to match input order
        for id, src_tokens, hypos in sorted(results, key=lambda x: x[0]):
            if src_dict is not None:
                src_str = src_dict.string(src_tokens, args.remove_bpe)
                print('S-{}\t{}'.format(id, src_str))

            # Process top predictions
            for hypo in hypos[:min(len(hypos), args.nbest)]:
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None,
                    align_dict=align_dict,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe,
                )
                if decoder is not None:
                    hypo_str = decoder.decode(map(int, hypo_str.strip().split()))
                print('H-{}\t{}\t{}'.format(id, hypo['score'], hypo_str))
                print('P-{}\t{}'.format(
                    id,
                    ' '.join(map(lambda x: '{:.4f}'.format(x), hypo['positional_scores'].tolist()))
                ))
                if args.print_alignment:
                    print('A-{}\t{}'.format(
                        id,
                        ' '.join(map(lambda x: str(utils.item(x)), alignment))
                    ))

        # update running id counter
        start_id += len(inputs)
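
# A minimal sketch for consuming the S-/H-/P- lines printed above, assuming
# stdout was redirected to a file named 'gen.out' (name is illustrative):
best_hypos = {}
with open('gen.out') as f:
    for line in f:
        if line.startswith('H-'):
            tag, score, text = line.rstrip('\n').split('\t')
            sent_id = int(tag[2:])
            # keep the highest-scoring hypothesis per sentence id
            if sent_id not in best_hypos or float(score) > best_hypos[sent_id][0]:
                best_hypos[sent_id] = (float(score), text)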
def __init__(self, data_path="./data/processed", \ checkpoint_path="./checkpoints/zhen_mass_pre-training.pt",\ task='xmasked_seq2seq',\ user_dir='mass',\ s='zh', t='en',\ langs='en,zh',\ mt_steps='zh-en',\ source_langs='zh',\ target_langs='en',\ beam=5,\ use_cuda=1): self.parser = options.get_generation_parser(interactive=True) self.parser.set_defaults(path=checkpoint_path, task=task, user_dir=user_dir, s=s, t=t,\ source_langs=source_langs, target_langs=target_langs,\ langs=langs, mt_steps=mt_steps, beam=beam) self.use_cuda = use_cuda self.args = options.parse_args_and_arch(self.parser,\ input_args=[data_path]) self.args.user_dir = user_dir self.args.s = s self.args.t = t self.args.langs = langs self.args.mt_steps = mt_steps self.args.source_langs = source_langs self.args.target_langs = target_langs self.args.remove_bpe = '@@ ' #self.args, _ = self.parser.parse_known_args([data_path]) utils.import_user_module(self.args) if self.args.buffer_size < 1: self.args.buffer_size = 1 if self.args.max_tokens is None and self.args.max_sentences is None: self.args.max_sentences = 1 assert not self.args.sampling or self.args.nbest == self.args.beam, \ '--sampling requires --nbest to be equal to --beam' assert not self.args.max_sentences or self.args.max_sentences <= self.args.buffer_size, \ '--max-sentences/--batch-size cannot be larger than --buffer-size' print(self.args) #self.use_cuda = torch.cuda.is_available() and not self.args.cpu # Setup task, e.g., translation self.task = tasks.setup_task(self.args) # Load ensemble print('| loading model(s) from {}'.format(self.args.path)) self.models, self._model_args = checkpoint_utils.load_model_ensemble( self.args.path.split(':'), arg_overrides=eval(self.args.model_overrides), task=self.task, ) # Set dictionaries self.src_dict = self.task.source_dictionary self.tgt_dict = self.task.target_dictionary # Optimize ensemble for generation for model in self.models: model.make_generation_fast_( beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam, need_attn=self.args.print_alignment, ) if self.args.fp16: model.half() if self.use_cuda: model.cuda() # Initialize generator self.generator = self.task.build_generator(self.args) # Hack to support GPT-2 BPE if self.args.remove_bpe == 'gpt2': from fairseq.gpt2_bpe.gpt2_encoding import get_encoder self.decoder = get_encoder( 'fairseq/gpt2_bpe/encoder.json', 'fairseq/gpt2_bpe/vocab.bpe', ) self.encode_fn = lambda x: ' '.join( map(str, self.decoder.encode(x))) else: self.decoder = None self.encode_fn = lambda x: x # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) self.align_dict = utils.load_align_dict(self.args.replace_unk) self.max_positions = utils.resolve_max_positions( self.task.max_positions(), *[model.max_positions() for model in self.models]) if self.args.buffer_size > 1: print('| Sentence buffer size:', self.args.buffer_size)