def warm_up(self, vocabs=None):
    """Load the src/tgt subword (BPE) models into ``self.load_models``.

    Optional frequency-filtered vocabularies are loaded first; when
    ``share_vocab`` is set and both vocabularies are equal, the source
    model object is reused for the target side.
    """
    super().warm_up(None)
    from subword_nmt.apply_bpe import BPE, read_vocabulary

    # A vocabulary is only applied when a path is given AND the
    # threshold is positive.
    src_vocabulary, tgt_vocabulary = None, None
    if self.src_subword_vocab != "" and self.src_vocab_threshold > 0:
        # read_vocabulary consumes the file eagerly, so the handle can be
        # closed immediately (the original leaked it).
        with open(self.src_subword_vocab, encoding='utf-8') as sv:
            src_vocabulary = read_vocabulary(sv, self.src_vocab_threshold)
    if self.tgt_subword_vocab != "" and self.tgt_vocab_threshold > 0:
        with open(self.tgt_subword_vocab, encoding='utf-8') as tv:
            tgt_vocabulary = read_vocabulary(tv, self.tgt_vocab_threshold)

    # BPE reads the codes file in its constructor; close right after.
    with open(self.src_subword_model, encoding='utf-8') as src_codes:
        load_src_model = BPE(codes=src_codes, vocab=src_vocabulary)
    if self.share_vocab and (src_vocabulary == tgt_vocabulary):
        self.load_models = {'src': load_src_model, 'tgt': load_src_model}
    else:
        with open(self.tgt_subword_model, encoding='utf-8') as tgt_codes:
            load_tgt_model = BPE(codes=tgt_codes, vocab=tgt_vocabulary)
        self.load_models = {'src': load_src_model, 'tgt': load_tgt_model}
def apply_bpe_function(codes_file, train_file, apply_out, vocabulary=None):
    """Apply a trained BPE model to ``train_file``, writing ``apply_out``.

    Args:
        codes_file: path to the learned BPE codes.
        train_file: path of the text to segment.
        apply_out: path of the segmented output file.
        vocabulary: optional path to a subword vocabulary; when given it is
            loaded with the parser's default ``vocabulary_threshold``.
    """
    parser = apply_bpe.create_parser()
    args = parser.parse_args([
        "--codes", codes_file,
        "--input", train_file,
        "--output", apply_out,
    ])
    # Merged the two separate `if vocabulary:` checks of the original and
    # close the handle once read_vocabulary has consumed it.
    if vocabulary:
        with codecs.open(vocabulary, encoding='utf-8') as vocab_fh:
            vocab = apply_bpe.read_vocabulary(vocab_fh,
                                              args.vocabulary_threshold)
    else:
        vocab = None
    # argparse opened the files itself; reopen by name with explicit UTF-8
    # (as the original did), but release every handle deterministically.
    with codecs.open(args.codes.name, encoding='utf-8') as codes_fh:
        bpe = apply_bpe.BPE(codes_fh, args.merges, args.separator,
                            vocab, args.glossaries)
    with codecs.open(args.input.name, encoding='utf-8') as input_fh, \
            codecs.open(args.output.name, 'w', encoding='utf-8') as output_fh:
        for line in input_fh:
            output_fh.write(bpe.process_line(line, args.dropout))
def train_subword_model(src_text, trg_text, nb_symbols=10000):
    """Learn a joint BPE model over source+target text and tokenize both sides.

    Equivalent subword-nmt pipeline:
        1.1 cat {train}.L1 {train}.L2 | subword-nmt learn-bpe -s N -o codes
        1.2 subword-nmt apply-bpe -c codes < {train}.L1 | subword-nmt get-vocab > vocab.L1
        1.3 subword-nmt apply-bpe -c codes < {train}.L2 | subword-nmt get-vocab > vocab.L2
        2.  re-apply BPE restricted to each side's vocabulary

    Returns:
        (bpe_model, src_vocab, trg_vocab, src_text_tok, trg_text_tok)
    """
    # BPE codes are learned over the concatenation of both sides.
    joint_corpus = []
    joint_corpus.extend(src_text)
    joint_corpus.extend(trg_text)

    codes_buffer = io.StringIO()
    # learn_bpe(infile, outfile, num_symbols, min_frequency, verbose,
    #           is_dict, total_symbols)
    learn_bpe(joint_corpus, codes_buffer, nb_symbols, 0, False, False, False)

    def _side_vocabulary(text):
        # First pass without a vocabulary, then collect token frequencies.
        tokenized = apply_bpe(codes_buffer, text, merges=nb_symbols)
        vocab_buffer = io.StringIO()
        get_vocab(tokenized, vocab_buffer)
        vocab_buffer.seek(0)
        side_vocab = read_vocabulary(vocab_buffer, 0)
        vocab_buffer.close()
        return side_vocab

    src_vocab = _side_vocabulary(src_text)
    trg_vocab = _side_vocabulary(trg_text)

    # Second pass: re-apply BPE with the per-side vocabularies so rare
    # merges outside each side's vocabulary are undone.
    src_text_tok = apply_bpe(codes_buffer, src_text, vocab=src_vocab)
    trg_text_tok = apply_bpe(codes_buffer, trg_text, vocab=trg_vocab)

    bpe_model = codes_buffer.getvalue()
    codes_buffer.close()
    return bpe_model, src_vocab, trg_vocab, src_text_tok, trg_text_tok
def process_bpe_dropout(code, vocab, in_name, out_name, dropout=0.0):
    """Apply BPE (optionally with BPE-dropout) to a file.

    Args:
        code: path to the BPE codes file.
        vocab: path to the vocabulary file (threshold 1 is applied).
        in_name: path of the input text file.
        out_name: path of the segmented output file.
        dropout: BPE-dropout probability (0.0 disables dropout).
    """
    num_workers = apply_bpe.cpu_count()
    # read_vocabulary and the BPE constructor consume their files eagerly,
    # so the handles can be closed right away (the original leaked all
    # three handles).
    with open(code, encoding='utf-8') as codes_fh, \
            open(vocab, encoding='utf-8') as vocab_fh:
        vocabulary = apply_bpe.read_vocabulary(vocab_fh, 1)
        bpe = apply_bpe.BPE(codes=codes_fh, vocab=vocabulary)
    with open(out_name, 'w', encoding='utf-8') as output_fh:
        # process_lines takes the input *path* and parallelizes internally.
        bpe.process_lines(in_name, output_fh,
                          dropout=dropout, num_workers=num_workers)
def warm_up(self, vocabs=None):
    """Load subword models."""
    super().warm_up(None)
    from subword_nmt.apply_bpe import BPE, read_vocabulary

    def _load_vocab(path, threshold):
        # A vocabulary is only used when a path and a positive threshold
        # are both provided.
        if path == "" or threshold <= 0:
            return None
        with open(path, encoding='utf-8') as fh:
            return read_vocabulary(fh, threshold)

    src_vocabulary = _load_vocab(self.src_subword_vocab,
                                 self.src_vocab_threshold)
    tgt_vocabulary = _load_vocab(self.tgt_subword_vocab,
                                 self.tgt_vocab_threshold)

    # The BPE constructor reads the codes file eagerly.
    with open(self.src_subword_model, encoding='utf-8') as fh:
        src_model = BPE(codes=fh, vocab=src_vocabulary)

    if self.share_vocab and src_vocabulary == tgt_vocabulary:
        # Shared vocabularies: reuse the same model object for both sides.
        tgt_model = src_model
    else:
        with open(self.tgt_subword_model, encoding='utf-8') as fh:
            tgt_model = BPE(codes=fh, vocab=tgt_vocabulary)

    self.load_models = {'src': src_model, 'tgt': tgt_model}
def __init__(self, options, code_files, merges=1, separator="@@",
             vocabularies=None, glossaries=None):
    """BPE splitter plugin; takes the following args:

    merges: use this many merge operations
    code_files: path to code file in dict form {"pair/locale_code":code}
    separator: which seq to split the BPE tokens
    vocabularies: vocab files built by subword_nmt/get_vocab to exclude
        special words, in dict form {pair: {locale: {"path", "threshold"}}}
    glossaries: all words that match that pattern will not be affected by
        transformation
    """
    debug(f"Creating instance of [{self.__class__.__name__}]")
    self.runtime_config = options
    # Mutable default arguments ({}) are shared across calls; normalize
    # None to fresh dicts instead (backward-compatible for all callers).
    vocabularies = {} if vocabularies is None else vocabularies
    glossaries = {} if glossaries is None else glossaries
    self.bpes = {}
    for pair in code_files:
        self.bpes.setdefault(pair, {})
        for locale in code_files.get(pair, {}):
            debug(f" -Loading up BytePairEncoder for [{pair}/{locale}]")
            vocabulary = None
            try:
                vocabulary_info = vocabularies.get(pair, {}).get(locale, {})
                with open(vocabulary_info["path"]) as vocab_file:
                    vocabulary = read_vocabulary(
                        vocab_file, vocabulary_info.get("threshold"))
            except Exception as err:
                # A missing vocabulary is non-fatal: BPE works without one.
                # (Fixed the original f-string, which printed the literal
                # text "err" instead of the exception.)
                debug(
                    f" -Adding vocabulary caused an error: [{err.__class__.__name__}:{err}]... Ignoring it!"
                )
            try:
                glossary = glossaries.get(pair, {}).get(locale)
                # Close the codes file once BPE has consumed it (the
                # original leaked the handle).
                with open(code_files[pair][locale]) as codes_file:
                    bpe = BPE(codes=codes_file,
                              merges=merges,
                              separator=separator,
                              vocab=vocabulary,
                              glossaries=glossary)
            except Exception as err:
                warning(err)
                warning("Could not create BPE for locale! Skipping it...")
            else:
                self.bpes[pair][locale] = bpe
def __init__(self, expdir):
    """Set up tokenizers, the BPE segmenter, and the translator.

    Args:
        expdir: experiment directory containing ``vocab/``, ``final_bin``
            and ``model/checkpoint_best.pt``.
    """
    self.expdir = expdir
    self.en_tok = MosesTokenizer(lang="en")
    self.en_normalizer = MosesPunctNormalizer()
    self.en_detok = MosesDetokenizer(lang="en")
    self.xliterator = unicode_transliterate.UnicodeIndicTransliterator()
    print("Initializing vocab and bpe")
    # read_vocabulary and BPE consume their file arguments eagerly, so the
    # handles can be closed right after construction (the original never
    # closed them).
    with codecs.open(f"{expdir}/vocab/vocab.SRC", encoding="utf-8") as vocab_fh:
        # Keep only subwords with frequency >= 5.
        self.vocabulary = read_vocabulary(vocab_fh, 5)
    with codecs.open(f"{expdir}/vocab/bpe_codes.32k.SRC",
                     encoding="utf-8") as codes_fh:
        # BPE(codes, merges=-1 (all), separator="@@", vocab, glossaries=None)
        self.bpe = BPE(codes_fh, -1, "@@", self.vocabulary, None)
    print("Initializing model for translation")
    # initialize the model
    self.translator = Translator(
        f"{expdir}/final_bin",
        f"{expdir}/model/checkpoint_best.pt",
        batch_size=100,
    )
def __init__(self, bpe_codes: Union[str, TextIO], bpe_vocab: Union[str, TextIO]):
    """Build the subword-nmt BPE object.

    Args:
        bpe_codes: BPE codes, given as a path or an open text handle.
        bpe_vocab: subword vocabulary, given as a path or an open handle.
    """
    codes_fh = None
    vocab_fh = None
    try:
        # Open only what was given as a path; handles are used as-is.
        if isinstance(bpe_codes, str):
            codes_fh = open(bpe_codes, 'r', encoding='utf-8')
        if isinstance(bpe_vocab, str):
            vocab_fh = open(bpe_vocab, 'r', encoding='utf-8')
        codes_source = codes_fh if codes_fh is not None else bpe_codes
        vocab_source = vocab_fh if vocab_fh is not None else bpe_vocab
        self.bpe = subword_nmt_bpe(
            codes=BPECodesAdapter(codes_source),
            vocab=read_vocabulary(vocab_source, threshold=None))
        self.bpe.version = (0, 2)
    finally:
        # Close only the handles this constructor itself opened.
        if codes_fh:
            codes_fh.close()
        if vocab_fh:
            vocab_fh.close()
type=str, help='Comma separated port numbers')
parser.add_argument('-njobs', type=int, default=50,
                    help='Specify number of Parallel jobs')
args = parser.parse_args()
# BPE merge codes learned by subword-nmt learn-bpe.
# NOTE(review): this handle is handed to BPE below and never closed;
# tolerable in a short-lived script, but worth tightening.
codefile = open(args.codefile)
if args.vocabfile != '':
    with open(args.vocabfile, 'r') as f:
        voc = f.read().split('\n')
    # Drop the trailing empty entry left by a final newline.
    if voc[-1].strip() == '':
        voc = voc[:-1]
    # Threshold 0 keeps every entry; read_vocabulary accepts any line
    # iterable, so the list works in place of a file handle.
    vocab = apply_bpe.read_vocabulary(voc, 0)
else:
    vocab = None
bpe_encoder = apply_bpe.BPE(codefile, vocab=vocab)
if args.word2bpefile != '':
    # Pre-computed word -> BPE-pieces cache (pickled dict).
    # NOTE(review): pickle.load on an external file is unsafe for
    # untrusted input — confirm this file is always locally produced.
    with open(args.word2bpefile, 'rb') as pk:
        word2bpe = pickle.load(pk)
else:
    word2bpe = {}
main(args)
"over", "under", "again", "further", "then", "once", "here", "there",
    "when", "where", "why", "how", "all", "any", "both", "each", "few",
    "more", "most", "other", "some", "such", "no", "nor", "not", "only",
    "own", "same", "so", "than", "too", "very", "s", "t", "can", "will",
    "just", "don", "should", "now"]
# Convert to a set for O(1) membership tests.
stopwords = set(stopwords)

# Early-Modern English pronouns mapped to their contemporary forms.
OLD_ENGLISH = {"thy": "your", "thou": "you", "Thy": "Your", "Thou": "You"}

# moses tokenizer
from sacremoses import MosesTruecaser, MosesTokenizer, MosesDetokenizer, MosesDetruecaser
mtok = MosesTokenizer(lang='en')
mtr = MosesTruecaser("vocab/truecase-model.en")
md = MosesDetokenizer(lang="en")
mdtr = MosesDetruecaser()

# bpe tokenizer
from subword_nmt.apply_bpe import BPE, read_vocabulary
# Keep only subwords with frequency >= 10.
# NOTE(review): these two codecs.open handles are never closed; both
# read_vocabulary and BPE consume them eagerly, so they could be.
vocabulary = read_vocabulary(codecs.open("vocab/vocab.bpe35000.chr", encoding='utf-8'), 10)
bpe = BPE(codes=codecs.open("vocab/codes_file_chr_35000", encoding='utf-8'),
          merges=35000, vocab=vocabulary)

# load nmt models
import onmt.opts
from translator_for_demo import build_translator
from onmt.utils.parse import ArgumentParser


def _parse_opt(opt):
    # Stash and truncate sys.argv so onmt's ArgumentParser does not pick
    # up this process's own command-line arguments.
    prec_argv = sys.argv
    sys.argv = sys.argv[:1]
    parser = ArgumentParser()
    onmt.opts.translate_opts(parser)
    opt['src'] = "dummy_src"