def apply_bpe_function(codes_file, train_file, apply_out, vocabulary=None):
    """Apply learned BPE codes to every line of ``train_file``.

    Args:
        codes_file: path to the learned BPE merge operations.
        train_file: path of the input text to segment.
        apply_out: path the BPE-segmented output is written to.
        vocabulary: optional path to a vocabulary file; when given,
            merges producing out-of-vocabulary subwords are reverted
            (see ``apply_bpe.read_vocabulary``).
    """
    parser = apply_bpe.create_parser()
    args = parser.parse_args([
        "--codes", codes_file,
        "--input", train_file,
        "--output", apply_out,
    ])
    # argparse (FileType) opened codes/input/output with the platform
    # default encoding; close those handles and reopen explicitly as
    # UTF-8 below.  The originals were previously leaked.
    args.codes.close()
    args.input.close()
    args.output.close()
    if vocabulary:
        with codecs.open(vocabulary, encoding='utf-8') as vocab_file:
            vocab = apply_bpe.read_vocabulary(vocab_file, args.vocabulary_threshold)
    else:
        vocab = None
    # Context managers guarantee the handles are closed (and the output
    # flushed) even if BPE application raises mid-file.
    with codecs.open(codes_file, encoding='utf-8') as codes, \
            codecs.open(train_file, encoding='utf-8') as bpe_input, \
            codecs.open(apply_out, 'w', encoding='utf-8') as bpe_output:
        bpe = apply_bpe.BPE(codes, args.merges, args.separator, vocab, args.glossaries)
        for line in bpe_input:
            bpe_output.write(bpe.process_line(line, args.dropout))
def __init__(self, cfg):
    """Build a subword-nmt BPE segmenter from learned codes.

    Args:
        cfg: config with ``bpe_codes`` (path/URL to the learned codes
            file) and ``bpe_separator`` (subword continuation marker,
            e.g. ``'@@'``).

    Raises:
        ValueError: if ``cfg.bpe_codes`` is not set.
        ImportError: if the ``subword_nmt`` package is not installed.
    """
    if cfg.bpe_codes is None:
        raise ValueError("--bpe-codes is required for --bpe=subword_nmt")
    codes = file_utils.cached_path(cfg.bpe_codes)
    try:
        import codecs
        from subword_nmt import apply_bpe
        bpe_parser = apply_bpe.create_parser()
        bpe_args = bpe_parser.parse_args([
            "--codes", codes,
            "--separator", cfg.bpe_separator,
        ])
        # argparse opened the codes file with the platform default
        # encoding; BPE codes files are UTF-8, so reopen explicitly.
        # This matches the other BPE constructors in this file.
        bpe_args.codes = codecs.open(codes, encoding="utf-8")
        self.bpe = apply_bpe.BPE(
            bpe_args.codes,
            bpe_args.merges,
            bpe_args.separator,
            None,
            bpe_args.glossaries,
        )
        # Suffix used to strip/join BPE pieces downstream.
        self.bpe_symbol = bpe_args.separator + " "
    except ImportError:
        raise ImportError(
            "Please install subword_nmt with: pip install subword-nmt")
def __init__(self, args):
    """Load pre-learned subword-nmt BPE codes and build a segmenter.

    Args:
        args: namespace carrying ``bpe_codes`` (path/URL of the codes
            file) and ``bpe_separator`` (continuation marker).

    Raises:
        ValueError: when ``args.bpe_codes`` is missing.
        ImportError: when subword_nmt cannot be imported.
    """
    if args.bpe_codes is None:
        raise ValueError('--bpe-codes is required for --bpe=subword_nmt')
    codes_path = file_utils.cached_path(args.bpe_codes)
    try:
        from subword_nmt import apply_bpe
        parsed = apply_bpe.create_parser().parse_args(
            ['--codes', codes_path, '--separator', args.bpe_separator]
        )
        import codecs
        # Reopen the codes file as UTF-8 (argparse used the default
        # platform encoding when it opened it).
        parsed.codes = codecs.open(codes_path, encoding='utf-8')
        self.bpe = apply_bpe.BPE(
            parsed.codes,
            parsed.merges,
            parsed.separator,
            None,
            parsed.glossaries,
        )
        self.bpe_symbol = parsed.separator + ' '
    except ImportError:
        raise ImportError(
            'Please install subword_nmt with: pip install subword-nmt')
def __init__(self, task, models, args, src_bpe=None, bpe_symbol='@@ '):
    """Prepare *models* for generation on *task*, with optional BPE.

    Args:
        task: provides source/target dictionaries and the generator.
        models: ensemble of trained models used for generation.
        args: command-line namespace (cpu/fp16/beam flags, etc.).
        src_bpe: optional path to BPE codes for source-side encoding.
        bpe_symbol: BPE continuation marker stripped from output.
    """
    self.task = task
    self.models = models
    self.src_dict = task.source_dictionary
    self.tgt_dict = task.target_dictionary
    self.src_bpe = src_bpe
    self.use_cuda = torch.cuda.is_available() and not args.cpu
    self.args = args
    # Mutates the shared args namespace so downstream post-processing
    # strips this BPE marker from generated text.
    self.args.remove_bpe = bpe_symbol
    # optimize model for generation
    for model in self.models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
            need_attn=args.print_alignment,
        )
        # NOTE: half() is applied before cuda() — preserve this order.
        if args.fp16:
            model.half()
        if self.use_cuda:
            model.cuda()
    self.generator = self.task.build_generator(args)
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    self.align_dict = utils.load_align_dict(args.replace_unk)
    # Cap generation length by the tightest of task/model limits.
    self.max_positions = utils.resolve_max_positions(
        self.task.max_positions(), *[model.max_positions() for model in models]
    )
    # Use the task's source language for Moses tokenization when known;
    # otherwise fall back to the tokenizer's default language.
    if hasattr(args, 'source_lang'):
        self.tokenizer = MosesTokenizer(lang=args.source_lang)
    else:
        self.tokenizer = MosesTokenizer()
    if src_bpe is not None:
        # Build a subword-nmt BPE segmenter from the provided codes.
        # NOTE(review): argparse opens the codes file with the platform
        # default encoding here — other loaders in this file reopen it
        # as UTF-8; confirm whether that matters for this caller.
        bpe_parser = apply_bpe.create_parser()
        bpe_args = bpe_parser.parse_args(['--codes', self.src_bpe])
        self.bpe = apply_bpe.BPE(bpe_args.codes, bpe_args.merges,
                                 bpe_args.separator, None, bpe_args.glossaries)
    else:
        self.bpe = None
def __init__(self, args):
    """Build a subword-nmt BPE segmenter from learned codes.

    Args:
        args: namespace with ``bpe_codes`` (path/URL to the codes file)
            and ``bpe_separator`` (subword continuation marker).

    Raises:
        ValueError: if ``args.bpe_codes`` is not set.
        ImportError: if the ``subword_nmt`` package is not installed.
    """
    # Guard missing codes explicitly (consistent with the sibling
    # constructors) instead of letting cached_path(None) fail opaquely.
    if args.bpe_codes is None:
        raise ValueError('--bpe-codes is required for --bpe=subword_nmt')
    codes = file_utils.cached_path(args.bpe_codes)
    try:
        import codecs
        from subword_nmt import apply_bpe
        bpe_parser = apply_bpe.create_parser()
        bpe_args = bpe_parser.parse_args([
            '--codes', codes,
            '--separator', args.bpe_separator,
        ])
        # argparse opened the codes file with the platform default
        # encoding; BPE codes files are UTF-8, so reopen explicitly.
        bpe_args.codes = codecs.open(codes, encoding='utf-8')
        self.bpe = apply_bpe.BPE(
            bpe_args.codes,
            bpe_args.merges,
            bpe_args.separator,
            None,
            bpe_args.glossaries,
        )
        self.bpe_symbol = bpe_args.separator + ' '
    except ImportError:
        raise ImportError(
            'Please install subword_nmt with: pip install subword-nmt')
def __init__(self, task, models, args, src_bpe=None, bpe_symbol='@@ '):
    """Prepare *models* for generation and wire up text transforms.

    Builds ``self.in_transforms`` (applied to raw input text before
    generation) and ``self.out_transforms`` (applied to generated text)
    according to the tokenizer/BPE scheme selected in *args*.

    Args:
        task: provides source/target dictionaries and the generator.
        models: ensemble of trained models used for generation.
        args: command-line namespace (cpu/fp16/beam and scheme flags:
            moses / nltk / gpt2_bpe / sentencepiece).
        src_bpe: optional path to BPE codes / sentencepiece model /
            GPT-2 vocab file, depending on the selected scheme.
        bpe_symbol: BPE continuation marker stripped from output.
    """
    self.task = task
    self.models = models
    self.src_dict = task.source_dictionary
    self.tgt_dict = task.target_dictionary
    self.src_bpe = src_bpe
    self.use_cuda = torch.cuda.is_available() and not args.cpu
    self.args = args
    # optimize model for generation
    for model in self.models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
            need_attn=args.print_alignment,
        )
        # NOTE: half() is applied before cuda() — preserve this order.
        if args.fp16:
            model.half()
        if self.use_cuda:
            model.cuda()
    self.generator = self.task.build_generator(args)
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    self.align_dict = utils.load_align_dict(args.replace_unk)
    # Cap generation length by the tightest of task/model limits.
    self.max_positions = utils.resolve_max_positions(
        self.task.max_positions(), *[model.max_positions() for model in models]
    )
    # Input/output text transform pipelines; each entry is str -> str.
    self.in_transforms = []
    self.out_transforms = []
    # Word-level tokenization: Moses (with detokenization) or NLTK
    # (input side only).  Falls back to English when langs are unset.
    if getattr(args, 'moses', False):
        tokenizer = MosesTokenizer(lang=args.source_lang or 'en')
        detokenizer = MosesDetokenizer(lang=args.target_lang or 'en')
        self.in_transforms.append(lambda s: tokenizer.tokenize(s, return_str=True))
        self.out_transforms.append(lambda s: detokenizer.detokenize(s.split()))
    elif getattr(args, 'nltk', False):
        from nltk.tokenize import word_tokenize
        self.in_transforms.append(lambda s: ' '.join(word_tokenize(s)))
    # Subword scheme: GPT-2 byte-level BPE, sentencepiece, or
    # subword-nmt codes (in that priority order).
    if getattr(args, 'gpt2_bpe', False):
        from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
        # assumes encoder.json sits next to the vocab file given as
        # src_bpe — TODO confirm against callers.
        encoder_json = os.path.join(os.path.dirname(src_bpe), 'encoder.json')
        vocab_bpe = src_bpe
        encoder = get_encoder(encoder_json, vocab_bpe)
        # Input: text -> space-joined GPT-2 token ids.
        self.in_transforms.append(lambda s: ' '.join(map(str, encoder.encode(s))))
        # Output: drop '<unk>' tokens, then decode ids back to text.
        # NOTE(review): correctness depends on the order in which
        # out_transforms are applied by the caller — verify.
        self.out_transforms.append(lambda s: ' '.join(t for t in s.split() if t != '<unk>'))
        self.out_transforms.append(lambda s: encoder.decode(map(int, s.strip().split())))
    elif getattr(args, 'sentencepiece', False):
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        sp.Load(src_bpe)
        self.in_transforms.append(lambda s: ' '.join(sp.EncodeAsPieces(s)))
        self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, 'sentencepiece'))
    elif src_bpe is not None:
        # subword-nmt BPE from a codes file.
        # NOTE(review): argparse opens the codes file with the platform
        # default encoding here — other loaders in this file reopen it
        # as UTF-8; confirm whether that matters for this caller.
        bpe_parser = apply_bpe.create_parser()
        bpe_args = bpe_parser.parse_args(['--codes', self.src_bpe])
        bpe = apply_bpe.BPE(bpe_args.codes, bpe_args.merges,
                            bpe_args.separator, None, bpe_args.glossaries)
        self.in_transforms.append(lambda s: bpe.process_line(s))
        self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, bpe_symbol))
def create_subword_bpe(codes):
    """Build an ``apply_bpe.BPE`` segmenter from a codes file.

    Args:
        codes: path (str or path-like) to learned BPE merge operations.

    Returns:
        A configured ``apply_bpe.BPE`` instance.
    """
    import codecs
    bpe_parser = apply_bpe.create_parser()
    bpe_args = bpe_parser.parse_args(['--codes', str(codes)])
    # argparse opened the codes file with the platform default encoding;
    # BPE codes files are UTF-8, so reopen explicitly.  Pass the parsed
    # merges/separator/glossaries too, for consistency with the other
    # BPE constructors in this file (values equal BPE's own defaults).
    bpe_args.codes = codecs.open(bpe_args.codes.name, encoding='utf-8')
    return apply_bpe.BPE(bpe_args.codes, bpe_args.merges,
                         bpe_args.separator, None, bpe_args.glossaries)