def bpe_process(inp, outp, codes):
    """Apply BPE segmentation to every line of a text file.

    Args:
        inp: path to the UTF-8 input text file.
        outp: path to the output file (overwritten).
        codes: path to the BPE merge-codes file.
    """
    # Context managers close all three handles even on error; the
    # original opened them and never closed any of them.
    with codecs.open(codes, encoding='utf-8') as codes_file:
        # apply_bpe.BPE reads the codes fully in its constructor, so the
        # handle can be closed immediately afterwards.
        bpe = apply_bpe.BPE(codes_file)
    with codecs.open(inp, encoding='utf-8') as in_file, \
            codecs.open(outp, "w", encoding='utf-8') as out_file:
        for line in in_file:
            out_file.write(bpe.process_line(line))
def __init__(self, cfg):
    """Configure a subword-nmt BPE encoder from ``cfg.bpe_codes`` /
    ``cfg.bpe_separator``.

    Raises:
        ValueError: if no codes file was configured.
        ImportError: if the ``subword_nmt`` package is not installed.
    """
    if cfg.bpe_codes is None:
        raise ValueError("--bpe-codes is required for --bpe=subword_nmt")
    local_codes = file_utils.cached_path(cfg.bpe_codes)
    try:
        from subword_nmt import apply_bpe

        # Reuse subword-nmt's own CLI parser so every default
        # (merges, glossaries, ...) matches the reference tool.
        cli = apply_bpe.create_parser()
        parsed = cli.parse_args(
            ["--codes", local_codes, "--separator", cfg.bpe_separator]
        )
        self.bpe = apply_bpe.BPE(
            parsed.codes,
            parsed.merges,
            parsed.separator,
            None,
            parsed.glossaries,
        )
        self.bpe_symbol = parsed.separator + " "
    except ImportError:
        raise ImportError(
            "Please install subword_nmt with: pip install subword-nmt")
def apply_bpe_function(codes_file, train_file, apply_out, vocabulary=None):
    """Apply BPE codes to ``train_file`` and write the result to ``apply_out``.

    Args:
        codes_file: path to the BPE merge-codes file.
        train_file: path to the input text file.
        apply_out: path of the segmented output file (overwritten).
        vocabulary: optional path to a vocabulary file; when given, BPE
            merges producing out-of-vocabulary subwords are reverted.
    """
    parser = apply_bpe.create_parser()
    # parse_args opens --codes/--input/--output through argparse FileType;
    # we only need the paths, so close those handles and reopen each file
    # below with an explicit UTF-8 codec (the original leaked them all).
    args = parser.parse_args([
        "--codes", codes_file,
        "--input", train_file,
        "--output", apply_out,
    ])
    args.codes.close()
    args.input.close()
    args.output.close()
    # Merge the two identical `if vocabulary:` branches of the original.
    if vocabulary:
        with codecs.open(vocabulary, encoding='utf-8') as vocab_fh:
            vocab = apply_bpe.read_vocabulary(vocab_fh, args.vocabulary_threshold)
    else:
        vocab = None
    with codecs.open(args.codes.name, encoding='utf-8') as codes_fh:
        # BPE reads the codes fully in its constructor.
        bpe = apply_bpe.BPE(codes_fh, args.merges, args.separator, vocab, args.glossaries)
    with codecs.open(args.input.name, encoding='utf-8') as in_fh, \
            codecs.open(args.output.name, 'w', encoding='utf-8') as out_fh:
        for line in in_fh:
            out_fh.write(bpe.process_line(line, args.dropout))
def __init__(self, model_path=None, glossary: Optional[List[str]] = None):
    """Optionally load a subword-nmt BPE model.

    Args:
        model_path: path to a BPE merge-codes file, or None for a no-op
            instance (``self.model`` stays None).
        glossary: optional list of words BPE must never split.
    """
    from subword_nmt import apply_bpe
    # Matches the '@@ ' / trailing '@@' markers BPE inserts, for unsegmenting.
    self.unsegment_re = re.compile(r'@@( |$)')
    self.model = None
    if model_path is not None:
        # Explicit UTF-8 and a context manager: the original used the
        # locale default encoding and never closed the handle.
        with open(model_path, encoding='utf-8') as codes_file:
            self.model = apply_bpe.BPE(codes_file, glossaries=glossary)
def __call__(self, parser, args, values, option_string=None):
    """argparse Action hook: build a BPE word segmenter from a codes file.

    Stores the segmenter under ``self.dest`` on the parsed-args
    namespace, or None when no codes file was given.
    """
    if values is None:
        setattr(args, self.dest, None)
        return
    with open(values, encoding='utf-8') as codes_fh:
        merge_codes = apply_bpe.BPE(codes_fh)
    setattr(args, self.dest, create_bpe_word_segmenter(merge_codes))
def __init__(self, args):
    """Set up subword-nmt BPE using ``args.bpe_codes`` and
    ``args.bpe_separator``.

    Raises:
        ValueError: if no codes file was configured.
        ImportError: if the ``subword_nmt`` package is not installed.
    """
    if args.bpe_codes is None:
        raise ValueError('--bpe-codes is required for --bpe=subword_nmt')
    codes_path = file_utils.cached_path(args.bpe_codes)
    try:
        from subword_nmt import apply_bpe
        import codecs

        cli_parser = apply_bpe.create_parser()
        parsed = cli_parser.parse_args(
            ['--codes', codes_path, '--separator', args.bpe_separator]
        )
        # Reopen the codes file with an explicit UTF-8 codec rather than
        # relying on the handle argparse opened.
        parsed.codes = codecs.open(codes_path, encoding='utf-8')
        self.bpe = apply_bpe.BPE(
            parsed.codes,
            parsed.merges,
            parsed.separator,
            None,
            parsed.glossaries,
        )
        self.bpe_symbol = parsed.separator + ' '
    except ImportError:
        raise ImportError(
            'Please install subword_nmt with: pip install subword-nmt')
def _load_from_codecs(self):
    """Initialize ``self.bpe`` from the merge-codes file at ``self.codecs``."""
    codes_handle = PathManager.open(self.codecs, 'r', encoding='utf-8')
    try:
        self.bpe = apply_bpe.BPE(codes_handle)
    finally:
        codes_handle.close()
def __init__(self, host, port, model_name, bpe_codes, max_toks=512):
    """Connect to a TF-Serving gRPC endpoint and load the BPE codes.

    Args:
        host: serving host name or address.
        port: serving port number.
        model_name: name of the served model to query.
        bpe_codes: path to the BPE merge-codes file.
        max_toks: maximum number of tokens per request.
    """
    channel = grpc.insecure_channel("%s:%d" % (host, port))
    self.stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    self.model_name = model_name
    self.max_toks = max_toks
    # Explicit UTF-8: the original used the locale default, which can
    # mis-decode BPE codes files on some platforms.
    with open(bpe_codes, encoding='utf-8') as f:
        self.bpe = apply_bpe.BPE(f)
def generate_bpe_file(input, output, code):
    """Segment every line of ``input`` with the BPE codes in ``code`` and
    write the result to ``output``.

    Args:
        input: path to the UTF-8 input text file.
        output: path of the output file (overwritten).
        code: path to the BPE merge-codes file.
    """
    # Context managers close every handle; the original closed only the
    # output file and leaked the input and codes handles.
    with codecs.open(code, encoding='utf-8') as codes:
        bpe = apply_bpe.BPE(codes)
    with codecs.open(input, encoding='utf-8') as inputs, \
            codecs.open(output, mode='w', encoding='UTF-8') as outputs:
        for line in inputs:
            outputs.write(bpe.process_line(line))
def build_vocab(imgs, params):
    """Learn a BPE vocabulary from image captions.

    Writes all captions to a temp file, learns ``params['symbol_count']``
    BPE merges from it, segments every caption (stored back on each img
    as 'final_captions'), and derives the vocabulary from the segmented
    text.

    :param imgs: list of dicts; each has 'sentences', a list of dicts
        with a 'tokens' list of words (schema inferred from usage —
        TODO confirm against caller).
    :param params: dict with at least 'symbol_count' (number of BPE merges).
    :return: (vocab, bpe) — sorted token list (most frequent first, plus
        'UNK'), and the raw text of the learned BPE codes file.
    """
    # count up the number of words
    captions = []
    for img in imgs:
        for sent in img['sentences']:
            captions.append(' '.join(sent['tokens']))
    captions = '\n'.join(captions)
    # Temp file holding all captions, one per line (input to learn_bpe).
    # delete=False so it can be reopened by name; removed explicitly below.
    all_captions = tempfile.NamedTemporaryFile(delete=False)
    all_captions.close()
    with open(all_captions.name, 'w') as txt_file:
        txt_file.write(captions)
    #
    # Temp file receiving the learned BPE merge codes.
    codecs_output = tempfile.NamedTemporaryFile(delete=False)
    codecs_output.close()
    with codecs.open(codecs_output.name, 'w', encoding='UTF-8') as output:
        # NOTE(review): the captions handle passed here is never closed.
        learn_bpe.learn_bpe(codecs.open(all_captions.name, encoding='UTF-8'),
                            output, params['symbol_count'])
    with codecs.open(codecs_output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes)
        # Temp file of segmented captions (input to get_vocabulary below).
        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.close()
        tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')
        for _, img in enumerate(imgs):
            img['final_captions'] = []
            for sent in img['sentences']:
                txt = ' '.join(sent['tokens'])
                txt = bpe.segment(txt).strip()
                img['final_captions'].append(txt.split(' '))
                tmpout.write(txt)
                tmpout.write('\n')
                # Echo the first 20 images' segmented captions for inspection.
                if _ < 20:
                    print(txt)
    tmpout.close()
    # NOTE(review): tmpin is never closed.
    tmpin = codecs.open(tmp.name, encoding='UTF-8')
    vocab = learn_bpe.get_vocabulary(tmpin)
    # Sort tokens by descending frequency.
    vocab = sorted(vocab.keys(), key=lambda x: vocab[x], reverse=True)
    # Always insert UNK
    print('inserting the special UNK token')
    vocab.append('UNK')
    print('Vocab size:', len(vocab))
    os.remove(all_captions.name)
    # Return the codes file's raw text so the caller can persist it.
    with open(codecs_output.name, 'r') as codes:
        bpe = codes.read()
    os.remove(codecs_output.name)
    os.remove(tmp.name)
    return vocab, bpe
def __init__(self, host, port, model_name, preprocessor, postprocessor,
             bpe_codes):
    """Connect to a TF-Serving gRPC endpoint with pre/post-processing hooks.

    Args:
        host: serving host name or address.
        port: serving port number.
        model_name: name of the served model to query.
        preprocessor: callable applied to inputs before BPE/translation.
        postprocessor: callable applied to model outputs.
        bpe_codes: path to the BPE merge-codes file.
    """
    channel = grpc.insecure_channel("%s:%d" % (host, port))
    self.stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    self.model_name = model_name
    self.preprocessor = preprocessor
    self.postprocessor = postprocessor
    # Explicit UTF-8: the original used the locale default, which can
    # mis-decode BPE codes files on some platforms.
    with open(bpe_codes, encoding='utf-8') as f:
        self.bpe = apply_bpe.BPE(f)
def process_bpe_dropout(code, vocab, in_name, out_name, dropout=0.0):
    """Apply BPE (optionally with BPE-dropout) to a file.

    Args:
        code: path to the BPE merge-codes file.
        vocab: path to the vocabulary file (threshold 1 is applied).
        in_name: path to the input text file.
        out_name: path of the segmented output file (overwritten).
        dropout: BPE-dropout probability forwarded to subword-nmt.
    """
    # Context managers close every handle; the original leaked all three.
    with open(vocab, encoding='utf-8') as vocab_file:
        vocabulary = apply_bpe.read_vocabulary(vocab_file, 1)
    num_workers = apply_bpe.cpu_count()
    with open(code, encoding='utf-8') as codes:
        # BPE reads the codes fully in its constructor.
        bpe = apply_bpe.BPE(codes=codes, vocab=vocabulary)
    with open(out_name, 'w', encoding='utf-8') as output_file:
        # process_lines takes the *input path* and parallelizes internally.
        bpe.process_lines(in_name, output_file,
                          dropout=dropout, num_workers=num_workers)
def __init__(self, task, models, args, src_bpe=None, bpe_symbol='@@ '):
    """Wrap a fairseq task + ensemble of models for interactive generation.

    :param task: fairseq task providing source/target dictionaries.
    :param models: list of trained models forming the ensemble.
    :param args: parsed fairseq arguments (cpu, beam, fp16, ...).
    :param src_bpe: optional path to subword-nmt BPE codes for the source side.
    :param bpe_symbol: BPE continuation marker stored as args.remove_bpe.
    """
    self.task = task
    self.models = models
    self.src_dict = task.source_dictionary
    self.tgt_dict = task.target_dictionary
    self.src_bpe = src_bpe
    self.use_cuda = torch.cuda.is_available() and not args.cpu
    self.args = args
    # Tell the generator how to strip BPE from its output.
    self.args.remove_bpe = bpe_symbol
    # optimize model for generation
    for model in self.models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if self.use_cuda:
            model.cuda()
    self.generator = self.task.build_generator(args)
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    self.align_dict = utils.load_align_dict(args.replace_unk)
    # Effective max positions = min over the task and every model.
    self.max_positions = utils.resolve_max_positions(
        self.task.max_positions(),
        *[model.max_positions() for model in models]
    )
    # Moses tokenizer for the source language (default language if unknown).
    if hasattr(args, 'source_lang'):
        self.tokenizer = MosesTokenizer(lang=args.source_lang)
    else:
        self.tokenizer = MosesTokenizer()
    # Build the source-side BPE segmenter via subword-nmt's own CLI
    # parser so defaults match the reference tool.
    if src_bpe is not None:
        bpe_parser = apply_bpe.create_parser()
        bpe_args = bpe_parser.parse_args(['--codes', self.src_bpe])
        self.bpe = apply_bpe.BPE(bpe_args.codes, bpe_args.merges,
                                 bpe_args.separator, None, bpe_args.glossaries)
    else:
        self.bpe = None
def subword(self, cleaned_filepaths, overwrite):
    """Learn (training corpora only) and apply BPE to the cleaned corpora.

    For training corpora, concatenates all language files and learns
    ``self.merge_ops`` BPE merges (skipped if the codes file exists and
    ``overwrite`` is False). Then segments each language file and, for
    training corpora, also extracts a per-language vocabulary.

    :param cleaned_filepaths: per-language input paths, aligned with self.langs.
    :param overwrite: if False, existing outputs are kept and skipped.
    """
    bpe_filepath = get_bpe_path(self.experiment_name, self.merge_ops)
    if self.corpora_type == 'training':
        # Concatenated file necessary for BPE learning
        concatenated_filepath = get_concat_path(self.file_prefix)
        concatenate_files(cleaned_filepaths, concatenated_filepath,
                          overwrite=overwrite)
        if os.path.exists(bpe_filepath) and overwrite == False:
            print(bpe_filepath, 'already exists')
        else:
            print('Learning BPE encoding. This may take a while.')
            with open(concatenated_filepath, 'r', encoding='utf-8') as infile, open(
                    bpe_filepath, 'w', encoding='utf-8') as outfile:
                # Get codecs, write codecs to outfile
                learn_bpe.learn_bpe(infile, outfile, num_symbols=self.merge_ops)
    print('Applying')
    with open(bpe_filepath, 'r', encoding='utf-8') as codec:
        bpe = apply_bpe.BPE(codec)
    print('Writing bpe')
    for i, lang in enumerate(self.langs):
        lang_filepath = cleaned_filepaths[i]
        processed_filepath = get_processed_data_path(
            self.experiment_name, self.corpora_type, lang)
        # Skip languages whose segmented output already exists.
        if overwrite == False and os.path.exists(processed_filepath):
            continue
        with open(lang_filepath, 'r', encoding='utf-8') as f1, open(
                processed_filepath, 'w', encoding='utf-8') as f2:
            for line in f1:
                f2.write(bpe.process_line(line))
        # Training corpora additionally get a vocabulary file.
        if self.corpora_type == 'training':
            vocab_filepath = get_vocab_path(self.experiment_name, lang)
            with open(processed_filepath, 'r', encoding='utf-8') as train_file, open(
                    vocab_filepath, 'w', encoding='utf-8') as vocab_file:
                get_vocab.get_vocab(train_file, vocab_file)
def __init__(self, args):
    """Set up subword-nmt BPE from ``args.bpe_codes`` / ``args.bpe_separator``.

    Raises:
        ValueError: if no codes file was configured.
        ImportError: if the ``subword_nmt`` package is not installed.
    """
    # Guard added for consistency with the sibling subword-nmt wrappers;
    # cached_path(None) would otherwise fail with a less helpful error.
    if args.bpe_codes is None:
        raise ValueError('--bpe-codes is required for --bpe=subword_nmt')
    codes = file_utils.cached_path(args.bpe_codes)
    try:
        from subword_nmt import apply_bpe
        # Reuse subword-nmt's own CLI parser so defaults match the tool.
        bpe_parser = apply_bpe.create_parser()
        bpe_args = bpe_parser.parse_args([
            '--codes', codes,
            '--separator', args.bpe_separator,
        ])
        self.bpe = apply_bpe.BPE(
            bpe_args.codes,
            bpe_args.merges,
            bpe_args.separator,
            None,
            bpe_args.glossaries,
        )
        self.bpe_symbol = bpe_args.separator + ' '
    except ImportError:
        raise ImportError(
            'Please install subword_nmt with: pip install subword-nmt')
ArgumentParser.validate_translate_opts(opt) engines[key] = {"translatorbest": build_translator(opt, report_score=True)} #translatorbest builds the best complete translation of the sentence opt.n_best = 5 opt.max_length = 2 opt.global_attention_function = 'sparsemax' ArgumentParser.validate_translate_opts(opt) engines[key]["translatorbigram"] = build_translator(opt, report_score=True) #translatorbiagram builds best translations of length two if value['src_bpe']: print("BPE in SRC side") bpe_src_code = os.path.join(dir_path, 'model', value['src_bpe']) merge_file = open(bpe_src_code, "r") bpe = apply_bpe.BPE(codes=merge_file) engines[key]["src_segmenter"] = lambda x: bpe.process_line(x.strip()) else: engines[key]["src_segmenter"] = None def preprocess_src(s, preprocess): s = s.lower() s = re.sub(r"([\“\”])", r'"', s) s = re.sub(r"([\‘\’])", r"'", s) s = re.sub(r"([\ः])", r":", s) s = re.sub(r"([-!$%^&*()_+|~=`{}\[\]:\";<>?,.\/#@।]+)", r" \1 ", s) # s = re.sub(r'"', r'"', s) # s = re.sub(r"'", r"'", s) s = re.sub(r"(\s+)", r" ", s)
def get_bpe_segmenter(bpe_codes_path):
    """Return a segmenter callable for the given BPE codes file.

    Args:
        bpe_codes_path: path to the BPE merge-codes file.

    Returns:
        Callable mapping a string to a list of subword tokens
        (string IN -> list OUT).
    """
    # Explicit UTF-8 and a context manager: the original used the locale
    # default encoding and never closed the handle.
    with open(bpe_codes_path, 'r', encoding='utf-8') as codes_fh:
        bpe = apply_bpe.BPE(codes=codes_fh)

    def segmenter(line):
        return bpe.process_line(line.strip()).split()

    return segmenter
def run_bot(model_dir, bpe_src_code=None, tokenize=None):
    """
    Start the bot. This means loading the model according to the config file.

    :param model_dir: Model directory of trained Joey NMT model.
    :param bpe_src_code: BPE codes for source side processing (optional).
    :param tokenize: If True, tokenize inputs with Moses tokenizer.
    :return:
    """
    cfg_file = model_dir + "/config.yaml"
    logger = logging.getLogger(__name__)
    # load the Joey configuration
    cfg = load_config(cfg_file)
    # load the checkpoint
    if "load_model" in cfg['training'].keys():
        ckpt = cfg['training']["load_model"]
    else:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))
    # prediction parameters from config
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    max_output_length = cfg["training"].get("max_output_length", None)
    lowercase = cfg["data"].get("lowercase", False)
    # load the vocabularies
    src_vocab_file = cfg["training"]["model_dir"] + "/src_vocab.txt"
    trg_vocab_file = cfg["training"]["model_dir"] + "/trg_vocab.txt"
    src_vocab = build_vocab(field="src", vocab_file=src_vocab_file,
                            dataset=None, max_size=-1, min_freq=0)
    trg_vocab = build_vocab(field="trg", vocab_file=trg_vocab_file,
                            dataset=None, max_size=-1, min_freq=0)
    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 0)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 1
        beam_alpha = -1
    # pre-processing
    if tokenize is not None:
        src_tokenizer = MosesTokenizer(lang=cfg["data"]["src"])
        trg_tokenizer = MosesDetokenizer(lang=cfg["data"]["trg"])
        # tokenize input
        tokenizer = lambda x: src_tokenizer.tokenize(x, return_str=True)
        detokenizer = lambda x: trg_tokenizer.detokenize(x.split(),
                                                         return_str=True)
    else:
        tokenizer = lambda x: x
        detokenizer = lambda x: x
    if bpe_src_code is not None and level == "bpe":
        # load bpe merge file
        # NOTE(review): merge_file is never closed and uses the locale
        # default encoding.
        merge_file = open(bpe_src_code, "r")
        bpe = apply_bpe.BPE(codes=merge_file)
        segmenter = lambda x: bpe.process_line(x.strip())
    elif level == "char":
        # split to chars
        segmenter = lambda x: list(x.strip())
    else:
        segmenter = lambda x: x.strip()
    # build model and load parameters into it
    model_checkpoint = load_checkpoint(ckpt, use_cuda)
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])
    if use_cuda:
        model.cuda()
    print("Joey NMT model loaded successfully.")
    web_client = slack.WebClient(TOKEN, timeout=30)
    # get bot id
    bot_id = (web_client.api_call("auth.test")["user_id"].upper())
    # find bot channel id
    # NOTE(review): bot_channel_id stays unbound if no channel matches
    # BOT_CHANNEL, which would raise NameError inside handle_message.
    all_channels = web_client.api_call("conversations.list")["channels"]
    for c in all_channels:
        if c["name"] == BOT_CHANNEL:
            bot_channel_id = c["id"]
    slack_events_adapter = SlackEventAdapter(BOT_SIGNIN,
                                             endpoint="/slack/events")

    @slack_events_adapter.on("message")
    def handle_message(event_data):
        # Translate user messages posted in the bot channel or mentioning the bot.
        message = event_data["event"]
        if message.get("subtype") is None:
            channel = message["channel"]
            user = message["user"]
            text = message["text"].strip()
            if user != bot_id and message.get("subtype") is None:
                # translates all messages in its channel and mentions
                if channel == bot_channel_id or bot_id in text:
                    mention = "<@{}>".format(bot_id)
                    # TODO remove all possible mentions with regex
                    if mention in text:
                        parts = text.split(mention)
                        text = parts[0].strip() + parts[1].strip()
                    message = translate(text, beam_size=beam_size,
                                        beam_alpha=beam_alpha, level=level,
                                        lowercase=lowercase,
                                        max_output_length=max_output_length,
                                        model=model,
                                        postprocess=[detokenizer],
                                        preprocess=[tokenizer, segmenter],
                                        src_vocab=src_vocab,
                                        trg_vocab=trg_vocab,
                                        use_cuda=use_cuda, logger=logger)
                    web_client.chat_postMessage(text=message, token=TOKEN,
                                                channel=channel)

    # Error events
    @slack_events_adapter.on("error")
    def error_handler(err):
        print("ERROR: " + str(err))

    slack_events_adapter.start(port=3000)
def load_bpe(self, bpe_path):
    """Load and return a subword-nmt BPE model from a codes file.

    Args:
        bpe_path: path to the BPE merge-codes file.

    Returns:
        An ``apply_bpe.BPE`` instance.
    """
    # Explicit UTF-8: BPE code files are UTF-8, and the previous
    # locale-dependent default could mis-decode them on some platforms.
    with open(bpe_path, 'r', encoding='utf-8') as ofile:
        bpe_model = apply_bpe.BPE(codes=ofile)
    return bpe_model
def toNumbers(self, vocabs, prevRules=True):
    """Numericalize every example in ``self.examples`` using ``vocabs``.

    Attaches *_nums fields (id sequences) to each example: code, seq2seq
    (BPE-segmented when codes are available), per-sentence local-vocab
    ids, camel-case-split name fields, and grammar-rule sequences.

    :param vocabs: dict of Vocab objects keyed by field name
        ('code', 'seq2seq', 'names_combined', 'next_rules', ...).
    :param prevRules: when True, also numericalize the previous/parent
        rule histories (not needed during prediction).
    """
    # Lazily attach BPE segmenters to the vocabs that carry merge codes.
    if 'names_combined' in vocabs and vocabs[
            'names_combined'].codes != None:
        vocabs['names_combined'].bpe = apply_bpe.BPE(
            vocabs['names_combined'].codes)
        vocabs['bpe'] = apply_bpe.BPE(vocabs['names_combined'].codes)
    if vocabs['seq2seq'].codes != None:
        vocabs['seq2seq'].bpe = apply_bpe.BPE(vocabs['seq2seq'].codes)
    for e in self.examples:
        e['code_nums'] = vocabs['code'].to_num(e['code'])
        # BPE-segment the NL sequence only when codes are configured.
        seq2seq_tokens = vocabs['seq2seq'].bpe.segment_tokens(
            e['seq2seq']
        ) if vocabs['seq2seq'].codes is not None else e['seq2seq']
        e['seq2seq_nums'] = vocabs['seq2seq'].to_num(seq2seq_tokens)
        e['seq2seq_vocab'] = Vocab(
            seq2seq_tokens, 0, 100000000, start=False,
            stop=False)  # A vocab just for this sentence
        e['seq2seq_in_src_nums'] = e['seq2seq_vocab'].to_num(
            vocabs['seq2seq'].addStartOrEnd(
                seq2seq_tokens))  # use the local vocab for this sentence
        e['code_in_src_nums'] = e['seq2seq_vocab'].to_num(
            vocabs['code'].addStartOrEnd(
                e['code']))  # use the local vocab for this sentence
        if self.opt.dataset == "concode":
            # For concode decoder------- -------
            # We have to do this because we concat them in the decoder
            # and there is padding between the nl, vars and methods in the same example because of batching
            # This isnt used, commenting it out
            # e['src_in_src_nums'] = e['concode_vocab'].to_num(e['src'])  # use the local vocab for this sentence
            e['var_in_src_nums'] = e['concode_vocab'].to_num(
                e['concode_var'])  # use the local vocab for this sentence
            e['method_in_src_nums'] = e['concode_vocab'].to_num(
                e['concode_method']
            )  # use the local vocab for this sentence
            #-------------------------------------------------------
            e['concode_next_rules_in_src_nums'] = e[
                'concode_vocab'].to_num(vocabs['next_rules'].addStartOrEnd(
                    [
                        rhs(x)
                        if lhs(x) in CDDataset.pre_terminal_symbols else
                        '<unk>' for x in e['next_rules']
                    ]))  # use the local vocab for this sentence
        #------------------------
        # --- Our Model -----------
        e['src_nums'] = vocabs['names_combined'].to_num(
            [y for w in e['src'] for y in split_camel_case(w, vocabs)])
        e['varTypes_nums'] = vocabs['names_combined'].to_num(
            [(split_camel_case(w, vocabs)) for w in e['varTypes']], char=1)
        e['methodReturns_nums'] = vocabs['names_combined'].to_num(
            [(split_camel_case(w, vocabs)) for w in e['methodReturns']],
            char=1)
        e['varNames_nums'] = vocabs['names_combined'].to_num(
            [(split_camel_case(w, vocabs)) for w in e['varNames']], char=1)
        e['methodNames_nums'] = vocabs['names_combined'].to_num(
            [(split_camel_case(w, vocabs)) for w in e['methodNames']],
            char=1)
        #-----------------------------------
        e['next_rules_in_src_nums'] = e['seq2seq_vocab'].to_num(
            vocabs['next_rules'].addStartOrEnd([
                rhs(x) if lhs(x) in CDDataset.pre_terminal_symbols else
                '<unk>' for x in e['next_rules']
            ]))  # use the local vocab for this sentence
        # ------- Rule decoder
        # There is no unk in the vocab, so this will throw an error
        # if the rule isnt there in the vocab
        if prevRules:
            # We don't need to do this during prediction?
            e['prev_rules_nums'] = vocabs['prev_rules'].to_num(
                e['prev_rules'][:-1])
            e['prev_rules_split_nums'] = vocabs['nt'].to_num(
                [['<s>']] +
                [[w] if '-->' not in w else
                 [lhs(w)] + ['<sep>'] + rhs(w).split('___')
                 for w in e['prev_rules'][:-1]],
                char=1)
            e['parent_rules_nums'] = vocabs['prev_rules'].to_num(
                e['parent_rules'])
            e['parent_rules_split_nums'] = vocabs['nt'].to_num(
                [['<s>']] +
                [[w] if '-->' not in w else
                 [lhs(w)] + ['<sep>'] + rhs(w).split('___')
                 for w in e['parent_rules']],
                char=1)
        # We need to ensure that only certain rules can be unked, not all. This
        # is taken care of when building the vocab
        e['nt_nums'] = vocabs['nt'].to_num(e['nt'])
        e['next_rules_nums'] = vocabs['next_rules'].to_num(
            e['next_rules'])
def create_subword_bpe(codes):
    """Build a subword-nmt BPE object from the codes file at ``codes``."""
    # Route through subword-nmt's CLI parser so the codes file is opened
    # exactly as the reference tool would open it.
    parsed = apply_bpe.create_parser().parse_args(['--codes', str(codes)])
    return apply_bpe.BPE(parsed.codes)
type=str, help='Comma separated port numbers') parser.add_argument('-njobs', type=int, default=50, help='Specify number of Parallel jobs') args = parser.parse_args() codefile = open(args.codefile) if args.vocabfile != '': with open(args.vocabfile, 'r') as f: voc = f.read().split('\n') if voc[-1].strip() == '': voc = voc[:-1] vocab = apply_bpe.read_vocabulary(voc, 0) else: vocab = None bpe_encoder = apply_bpe.BPE(codefile, vocab=vocab) if args.word2bpefile != '': with open(args.word2bpefile, 'rb') as pk: word2bpe = pickle.load(pk) else: word2bpe = {} main(args)
def set_bpe(self, codes_file):
    """Initialize ``self.bpe`` from ``codes_file`` using ``self.separator``.

    Args:
        codes_file: path to the BPE merge-codes file.
    """
    # Bug fix: the original ignored the ``codes_file`` argument and
    # always opened ``self.codes_file`` instead.
    with codecs.open(codes_file, encoding='UTF-8') as codes:
        self.bpe = apply_bpe.BPE(codes, separator=self.separator)
def load_model(self, src_language, trg_language, domain, bpe_src_code=None,
               tokenize=None):
    """
    Load model for given trg language.

    Rewrites the model's config for local paths, loads vocabularies,
    checkpoint and pre/post-processing pipelines, and returns them in a
    single ``conf`` dict.

    Bug fix: the BPE branch below tested the undefined name ``level``
    (NameError whenever ``bpe_src_code`` was given); it now tests
    ``conf["level"]`` like the sibling branches.
    """
    # model_dir = "{}-{}".format(self._model_dir_prefix, trg_language)
    model_dir = f"{self._model_dir_prefix}{src_language}-{trg_language}-{domain}"
    # Load the checkpoint.
    ckpt_path = os.path.join(model_dir, 'model.ckpt')
    # Load the vocabularies.
    src_vocab_path = os.path.join(model_dir, 'src_vocab.txt')
    trg_vocab_path = os.path.join(model_dir, 'trg_vocab.txt')
    # Load the config.
    config_path = os.path.join(model_dir, 'config_orig.yaml')
    # Adjust config.
    config = load_config(config_path)
    new_config_file = os.path.join(model_dir, 'config.yaml')
    config = self._update_config(config, src_vocab_path, trg_vocab_path,
                                 model_dir, ckpt_path)
    with open(new_config_file, 'w') as cfile:
        yaml.dump(config, cfile)
    # print('Loaded model for {}-{}.'.format(self._src_language, trg_language))
    print('Loaded model for {}-{}.'.format(src_language, trg_language))
    conf = {}
    logger = logging.getLogger(__name__)
    conf["logger"] = logger
    # load the Joey configuration
    cfg = load_config(new_config_file)
    # load the checkpoint
    if "load_model" in cfg['training'].keys():
        ckpt = cfg['training']["load_model"]
    else:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError("No checkpoint found in directory {}."
                                    .format(model_dir))
    # prediction parameters from config
    conf["use_cuda"] = cfg["training"].get(
        "use_cuda", False) if torch.cuda.is_available() else False
    conf["level"] = cfg["data"]["level"]
    conf["max_output_length"] = cfg["training"].get("max_output_length", None)
    conf["lowercase"] = cfg["data"].get("lowercase", False)
    # load the vocabularies
    src_vocab_file = cfg["training"]["model_dir"] + "/src_vocab.txt"
    trg_vocab_file = cfg["training"]["model_dir"] + "/trg_vocab.txt"
    conf["src_vocab"] = build_vocab(field="src", vocab_file=src_vocab_file,
                                    dataset=None, max_size=-1, min_freq=0)
    conf["trg_vocab"] = build_vocab(field="trg", vocab_file=trg_vocab_file,
                                    dataset=None, max_size=-1, min_freq=0)
    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        conf["beam_size"] = cfg["testing"].get("beam_size", 0)
        conf["beam_alpha"] = cfg["testing"].get("alpha", -1)
    else:
        conf["beam_size"] = 1
        conf["beam_alpha"] = -1
    # pre-processing
    if tokenize is not None:
        src_tokenizer = MosesTokenizer(lang=cfg["data"]["src"])
        trg_tokenizer = MosesDetokenizer(lang=cfg["data"]["trg"])
        # tokenize input
        tokenizer = lambda x: src_tokenizer.tokenize(x, return_str=True)
        detokenizer = lambda x: trg_tokenizer.detokenize(
            x.split(), return_str=True)
    else:
        tokenizer = lambda x: x
        detokenizer = lambda x: x
    # Fixed: was `level == "bpe"` with `level` undefined in this scope.
    if bpe_src_code is not None and conf["level"] == "bpe":
        # load bpe merge file
        merge_file = open(bpe_src_code, "r")
        bpe = apply_bpe.BPE(codes=merge_file)
        segmenter = lambda x: bpe.process_line(x.strip())
    elif conf["level"] == "char":
        # split to chars
        segmenter = lambda x: list(x.strip())
    else:
        segmenter = lambda x: x.strip()
    conf["preprocess"] = [tokenizer, segmenter]
    conf["postprocess"] = [detokenizer]
    # build model and load parameters into it
    model_checkpoint = load_checkpoint(ckpt, conf["use_cuda"])
    model = build_model(cfg["model"], src_vocab=conf["src_vocab"],
                        trg_vocab=conf["trg_vocab"])
    model.load_state_dict(model_checkpoint["model_state"])
    # ipdb.set_trace()
    if conf["use_cuda"]:
        model.cuda()
    conf["model"] = model
    print("Joey NMT model loaded successfully.")
    return conf
def __init__(self, elements, prune, max_num, start=True, stop=True,
             pad=True, unk=True, rule=False, bpe=-1):
    """Build a vocabulary (itos/stoi) from an iterable of tokens.

    :param elements: iterable of tokens (optionally BPE-segmented first).
    :param prune: minimum frequency; non-rule tokens with freq <= prune map to <unk>.
    :param max_num: maximum vocabulary size (terminal rules respect it too).
    :param start/stop/pad/unk: whether to reserve the special symbols.
    :param rule: grammar-rule vocabulary mode (different pruning and
        per-preterminal UNK rules).
    :param bpe: number of BPE merges to learn from ``elements``; negative
        disables BPE.
    """
    self.start = start
    self.stop = stop
    self.codes = None
    vocab = Counter()
    self.max_num = max_num
    self.itos = []
    self.stoi = {}
    if pad:
        self.addSymbol('<blank>')
    if unk:
        self.addSymbol('<unk>')
    if start:
        self.addSymbol('<s>')
    if stop:
        self.addSymbol('</s>')
    self.rule = rule
    if rule:
        # Adding these for both ATIS and CONCODE. Extra things in the vocab are ok.
        for pre_terminal in CDDataset.pre_terminal_symbols:
            self.addSymbol(CDDataset._unk_rule_from_Nt(pre_terminal))
    if bpe >= 0:
        self.codes = learn_bpe.learn_bpe(elements, bpe, 0)  # last is min freq
        b = apply_bpe.BPE(self.codes)
        elements = b.segment_tokens(elements)
    # Count token frequencies over the (possibly segmented) elements.
    for w in elements:
        vocab[w] += 1
    if bpe >= 0:
        print('Vocab size {}'.format(len(vocab)))
    # prune low frequency words
    max_vocab = self.max_num if not rule else 100000000000
    for (w, f) in vocab.most_common(max_vocab):
        # Keep: frequent non-rule tokens; all non-terminal rules; terminal
        # rules while capacity remains; and concode non-terminal markers.
        if ((rule == False and f > prune)
                or (rule == True and not CDDataset._is_terminal_rule(w))
                or (rule == True and CDDataset._is_terminal_rule(w)
                    and len(self.itos) < self.max_num)
                or w.endswith("_concodeNT")):
            word = w.replace('concodeclass_', '').replace('concodefunc_', '')
            self.itos.append(word)
            self.stoi[word] = len(self.itos) - 1
        else:  #map everything else to unk
            if rule:
                # We need the right kind of UNK rule here
                mapped_to_known_unk = False
                for pre_terminal in CDDataset.pre_terminal_symbols:
                    if pre_terminal in w:
                        self.stoi[w] = self.stoi[
                            CDDataset._unk_rule_from_Nt(pre_terminal)]
                        mapped_to_known_unk = True
                        break
                if not mapped_to_known_unk:
                    # An unk type we dont know about. Investigate.
                    import ipdb
                    ipdb.set_trace()
                    # For next_rules, we cannot have any other type of unk
                    self.stoi[w] = self.stoi['<unk>']
            else:
                self.stoi[w] = self.stoi['<unk>']
def __init__(self, task, models, args, src_bpe=None, bpe_symbol='@@ '):
    """Wrap a fairseq task + model ensemble with input/output text transforms.

    Builds ``self.in_transforms`` (applied to raw input before encoding)
    and ``self.out_transforms`` (applied to decoded output) according to
    the tokenizer/BPE scheme selected in ``args``.

    :param task: fairseq task providing source/target dictionaries.
    :param models: list of trained models forming the ensemble.
    :param args: parsed fairseq arguments (cpu, beam, fp16, moses/nltk/
        gpt2_bpe/sentencepiece flags, ...).
    :param src_bpe: optional path to BPE codes / sentencepiece model /
        GPT-2 vocab, depending on the selected scheme.
    :param bpe_symbol: BPE continuation marker stripped from the output.
    """
    self.task = task
    self.models = models
    self.src_dict = task.source_dictionary
    self.tgt_dict = task.target_dictionary
    self.src_bpe = src_bpe
    self.use_cuda = torch.cuda.is_available() and not args.cpu
    self.args = args
    # optimize model for generation
    for model in self.models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if self.use_cuda:
            model.cuda()
    self.generator = self.task.build_generator(args)
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    self.align_dict = utils.load_align_dict(args.replace_unk)
    # Effective max positions = min over the task and every model.
    self.max_positions = utils.resolve_max_positions(
        self.task.max_positions(),
        *[model.max_positions() for model in models]
    )
    self.in_transforms = []
    self.out_transforms = []
    # Word-level tokenization: Moses (with detokenizer) or NLTK (input only).
    if getattr(args, 'moses', False):
        tokenizer = MosesTokenizer(lang=args.source_lang or 'en')
        detokenizer = MosesDetokenizer(lang=args.target_lang or 'en')
        self.in_transforms.append(lambda s: tokenizer.tokenize(s, return_str=True))
        self.out_transforms.append(lambda s: detokenizer.detokenize(s.split()))
    elif getattr(args, 'nltk', False):
        from nltk.tokenize import word_tokenize
        self.in_transforms.append(lambda s: ' '.join(word_tokenize(s)))
    # Subword scheme: GPT-2 byte-level BPE, sentencepiece, or subword-nmt.
    if getattr(args, 'gpt2_bpe', False):
        from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
        # encoder.json is expected to sit next to the vocab file.
        encoder_json = os.path.join(os.path.dirname(src_bpe), 'encoder.json')
        vocab_bpe = src_bpe
        encoder = get_encoder(encoder_json, vocab_bpe)
        self.in_transforms.append(lambda s: ' '.join(map(str, encoder.encode(s))))
        self.out_transforms.append(lambda s: ' '.join(t for t in s.split() if t != '<unk>'))
        self.out_transforms.append(lambda s: encoder.decode(map(int, s.strip().split())))
    elif getattr(args, 'sentencepiece', False):
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        sp.Load(src_bpe)
        self.in_transforms.append(lambda s: ' '.join(sp.EncodeAsPieces(s)))
        self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, 'sentencepiece'))
    elif src_bpe is not None:
        # subword-nmt BPE via its own CLI parser so defaults match the tool.
        bpe_parser = apply_bpe.create_parser()
        bpe_args = bpe_parser.parse_args(['--codes', self.src_bpe])
        bpe = apply_bpe.BPE(bpe_args.codes, bpe_args.merges, bpe_args.separator, None, bpe_args.glossaries)
        self.in_transforms.append(lambda s: bpe.process_line(s))
        self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, bpe_symbol))
def load_model(model_dir, bpe_src_code=None, tokenize=None):
    """
    Start the bot. This means loading the model according to the config file.

    :param model_dir: Model directory of trained Joey NMT model.
    :param bpe_src_code: BPE codes for source side processing (optional).
    :param tokenize: If True, tokenize inputs with Moses tokenizer.
    :return: conf dict with model, vocabs, decoding and pre/post-processing
        settings.

    Bug fix: the BPE branch below tested the undefined name ``level``
    (NameError whenever ``bpe_src_code`` was given); it now tests
    ``conf["level"]`` like the sibling branches.
    """
    conf = {}
    cfg_file = model_dir+"/config.yaml"
    logger = logging.getLogger(__name__)
    conf["logger"] = logger
    # load the Joey configuration
    cfg = load_config(cfg_file)
    # load the checkpoint
    if "load_model" in cfg['training'].keys():
        ckpt = cfg['training']["load_model"]
    else:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError("No checkpoint found in directory {}."
                                    .format(model_dir))
    # prediction parameters from config
    conf["use_cuda"] = cfg["training"].get("use_cuda", False)
    conf["level"] = cfg["data"]["level"]
    conf["max_output_length"] = cfg["training"].get("max_output_length", None)
    conf["lowercase"] = cfg["data"].get("lowercase", False)
    # load the vocabularies
    src_vocab_file = cfg["training"]["model_dir"] + "/src_vocab.txt"
    trg_vocab_file = cfg["training"]["model_dir"] + "/trg_vocab.txt"
    conf["src_vocab"] = build_vocab(field="src", vocab_file=src_vocab_file,
                                    dataset=None, max_size=-1, min_freq=0)
    conf["trg_vocab"] = build_vocab(field="trg", vocab_file=trg_vocab_file,
                                    dataset=None, max_size=-1, min_freq=0)
    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        conf["beam_size"] = cfg["testing"].get("beam_size", 0)
        conf["beam_alpha"] = cfg["testing"].get("alpha", -1)
    else:
        conf["beam_size"] = 1
        conf["beam_alpha"] = -1
    # pre-processing
    if tokenize is not None:
        src_tokenizer = MosesTokenizer(lang=cfg["data"]["src"])
        trg_tokenizer = MosesDetokenizer(lang=cfg["data"]["trg"])
        # tokenize input
        tokenizer = lambda x: src_tokenizer.tokenize(x, return_str=True)
        detokenizer = lambda x: trg_tokenizer.detokenize(
            x.split(), return_str=True)
    else:
        tokenizer = lambda x: x
        detokenizer = lambda x: x
    # Fixed: was `level == "bpe"` with `level` undefined in this scope.
    if bpe_src_code is not None and conf["level"] == "bpe":
        # load bpe merge file
        merge_file = open(bpe_src_code, "r")
        bpe = apply_bpe.BPE(codes=merge_file)
        segmenter = lambda x: bpe.process_line(x.strip())
    elif conf["level"] == "char":
        # split to chars
        segmenter = lambda x: list(x.strip())
    else:
        segmenter = lambda x: x.strip()
    conf["preprocess"] = [tokenizer, segmenter]
    conf["postprocess"] = [detokenizer]
    # build model and load parameters into it
    model_checkpoint = load_checkpoint(ckpt, conf["use_cuda"])
    model = build_model(cfg["model"], src_vocab=conf["src_vocab"],
                        trg_vocab=conf["trg_vocab"])
    model.load_state_dict(model_checkpoint["model_state"])
    if conf["use_cuda"]:
        model.cuda()
    conf["model"] = model
    print("Joey NMT model loaded successfully.")
    return conf
def load_codes(codes, joiner="@@"):
    """Load a BPE model from the codes file at ``codes``.

    Args:
        codes: path to the BPE merge-codes file.
        joiner: separator appended to non-final subwords.

    Returns:
        An ``apply_bpe.BPE`` instance.
    """
    # Close the codes file once the merge table is read; the original
    # opened it inline and leaked the handle.
    with open(codes, encoding="utf-8") as codes_file:
        bpeobject = apply_bpe.BPE(codes_file, separator=joiner)
    return bpeobject
def _load_from_codecs(self):
    """Load the BPE merge codes from ``self.codecs`` into ``self.bpe``."""
    # Explicit UTF-8 for consistency with the sibling implementation:
    # the locale default encoding can mis-decode BPE codes files.
    with open(self.codecs, 'r', encoding='utf-8') as codecs_file:
        self.bpe = apply_bpe.BPE(codecs_file)
def get_bpe_object(codes_file_path):
    """Read the BPE merge codes at ``codes_file_path`` and return a BPE object."""
    # BPE reads the codes fully during construction, so the handle is
    # released as soon as the with-block exits.
    with codecs.open(codes_file_path, encoding='utf-8') as codes_fh:
        return apply_bpe.BPE(codes_fh)