Example 1
 def warm_up(self, vocabs=None):
     """Load subword models."""
     super().warm_up(None)
     from subword_nmt.apply_bpe import BPE, read_vocabulary
     import codecs
     src_codes = codecs.open(self.src_subword_model, encoding='utf-8')
     src_vocabulary, tgt_vocabulary = None, None
     if self.src_subword_vocab != "" and self.src_vocab_threshold > 0:
         src_vocabulary = read_vocabulary(
             codecs.open(self.src_subword_vocab, encoding='utf-8'),
             self.src_vocab_threshold)
     if self.tgt_subword_vocab != "" and self.tgt_vocab_threshold > 0:
         tgt_vocabulary = read_vocabulary(
             codecs.open(self.tgt_subword_vocab, encoding='utf-8'),
             self.tgt_vocab_threshold)
     load_src_model = BPE(codes=src_codes, vocab=src_vocabulary)
     if self.share_vocab and (src_vocabulary == tgt_vocabulary):
         self.load_models = {
             'src': load_src_model,
             'tgt': load_src_model
         }
     else:
         tgt_codes = codecs.open(self.tgt_subword_model, encoding='utf-8')
         load_tgt_model = BPE(codes=tgt_codes, vocab=tgt_vocabulary)
         self.load_models = {
             'src': load_src_model,
             'tgt': load_tgt_model
         }
Example 2
import codecs

from subword_nmt import apply_bpe


def apply_bpe_function(codes_file, train_file, apply_out, vocabulary=None):
    parser = apply_bpe.create_parser()
    args = parser.parse_args([
        "--codes",
        codes_file,
        "--input",
        train_file,
        "--output",
        apply_out,
        # "--vocabulary", vocabulary
    ])

    if vocabulary:
        args.vocabulary = codecs.open(vocabulary, encoding='utf-8')
        vocabulary = apply_bpe.read_vocabulary(args.vocabulary,
                                               args.vocabulary_threshold)
    else:
        vocabulary = None

    args.codes = codecs.open(args.codes.name, encoding='utf-8')
    bpe = apply_bpe.BPE(args.codes, args.merges, args.separator, vocabulary,
                        args.glossaries)
    args.input = codecs.open(args.input.name, encoding='utf-8')
    args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
    for line in args.input:
        args.output.write(bpe.process_line(line, args.dropout))
    # Close the handles so the output is flushed to disk.
    args.input.close()
    args.output.close()
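A minimal call of the helper above might look like this; the file paths are hypothetical placeholders, not files from the original project:

# Hypothetical files: BPE codes learned with subword-nmt, tokenized input,
# and an optional vocabulary produced by subword-nmt get-vocab.
apply_bpe_function("codes.bpe", "train.tok.en", "train.bpe.en",
                   vocabulary="vocab.en")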
Example 3
import io

from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import read_vocabulary
from subword_nmt.get_vocab import get_vocab

# NOTE: `apply_bpe` below refers to a project-local helper that wraps
# subword_nmt.apply_bpe.BPE, not the subword_nmt module of the same name.


def train_subword_model(src_text, trg_text, nb_symbols=10000):

    # create text content with source and target text
    content = []
    content.extend(src_text)
    content.extend(trg_text)

    bpe_model_io = io.StringIO()
    src_vocab_io = io.StringIO()
    trg_vocab_io = io.StringIO()

    # 1. Learn BPE model on both source and target text
    # 1.1 cat {train_file}.L1 {train_file}.L2 | subword-nmt learn-bpe -s {num_operations} -o {codes_file}
    # 1.2 subword-nmt apply-bpe -c {codes_file} < {train_file}.L1 | subword-nmt get-vocab > {vocab_file}.L1
    # 1.3 subword-nmt apply-bpe -c {codes_file} < {train_file}.L2 | subword-nmt get-vocab > {vocab_file}.L2

    # 1.1 learn_bpe(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input, total_symbols=args.total_symbols))
    learn_bpe(content, bpe_model_io, nb_symbols, 0, False, False, False)

    # 1.2
    src_text_tok = apply_bpe(bpe_model_io, src_text, merges=nb_symbols)
    get_vocab(src_text_tok, src_vocab_io)
    src_vocab_io.seek(0)
    src_vocab = read_vocabulary(src_vocab_io, 0)
    # 1.3
    trg_text_tok = apply_bpe(bpe_model_io, trg_text, merges=nb_symbols)
    get_vocab(trg_text_tok, trg_vocab_io)
    trg_vocab_io.seek(0)
    trg_vocab = read_vocabulary(trg_vocab_io, 0)

    # 2. Re-apply BPE with the obtained vocabularies
    # subword-nmt apply-bpe -c {codes_file} --vocabulary {vocab_file}.L1 --vocabulary-threshold 50 < {train_file}.L1 > {train_file}.BPE.L1
    src_text_tok = apply_bpe(bpe_model_io, src_text, vocab=src_vocab)
    trg_text_tok = apply_bpe(bpe_model_io, trg_text, vocab=trg_vocab)

    bpe_model = bpe_model_io.getvalue()

    bpe_model_io.close()
    src_vocab_io.close()
    trg_vocab_io.close()

    return bpe_model, src_vocab, trg_vocab, src_text_tok, trg_text_tok
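A usage sketch for train_subword_model; the toy sentence lists below are invented for illustration:

# Any iterables of whitespace-tokenized sentences will do here.
src_sentences = ["ein kleines haus", "das haus ist alt"]
trg_sentences = ["a small house", "the house is old"]
bpe_model, src_vocab, trg_vocab, src_tok, trg_tok = train_subword_model(
    src_sentences, trg_sentences, nb_symbols=1000)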
Example 4
from subword_nmt import apply_bpe


def process_bpe_dropout(code, vocab, in_name, out_name, dropout=0.0):
    """
    To apply BPE on desired data and output processed files.
    """
    codes = open(code, encoding='utf-8')
    vocab_file = open(vocab, encoding='utf-8')
    vocabulary = apply_bpe.read_vocabulary(vocab_file, 1)
    num_workers = apply_bpe.cpu_count()
    output_file = open(out_name, 'w', encoding='utf-8')
    bpe = apply_bpe.BPE(codes=codes, vocab=vocabulary)
    bpe.process_lines(in_name, output_file,
                      dropout=dropout, num_workers=num_workers)
    # Close the handles so the output is flushed to disk.
    codes.close()
    vocab_file.close()
    output_file.close()
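Called with dropout > 0, the helper above produces stochastic segmentations (BPE-dropout); the file names below are placeholders:

# Hypothetical paths; dropout=0.1 yields a different segmentation on each
# run, which is the usual training-time setting for BPE-dropout.
process_bpe_dropout("codes.bpe", "vocab.en", "train.tok.en",
                    "train.bpe.en", dropout=0.1)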
Example 5
 def warm_up(self, vocabs=None):
     """Load subword models."""
     super().warm_up(None)
     from subword_nmt.apply_bpe import BPE, read_vocabulary
     # Load vocabulary file if provided and set threshold
     src_vocabulary, tgt_vocabulary = None, None
     if self.src_subword_vocab != "" and self.src_vocab_threshold > 0:
         with open(self.src_subword_vocab, encoding='utf-8') as _sv:
             src_vocabulary = read_vocabulary(_sv, self.src_vocab_threshold)
     if self.tgt_subword_vocab != "" and self.tgt_vocab_threshold > 0:
         with open(self.tgt_subword_vocab, encoding='utf-8') as _tv:
             tgt_vocabulary = read_vocabulary(_tv, self.tgt_vocab_threshold)
     # Load Subword Model
     with open(self.src_subword_model, encoding='utf-8') as src_codes:
         load_src_model = BPE(codes=src_codes, vocab=src_vocabulary)
     if self.share_vocab and (src_vocabulary == tgt_vocabulary):
         self.load_models = {'src': load_src_model, 'tgt': load_src_model}
     else:
         with open(self.tgt_subword_model, encoding='utf-8') as tgt_codes:
             load_tgt_model = BPE(codes=tgt_codes, vocab=tgt_vocabulary)
         self.load_models = {'src': load_src_model, 'tgt': load_tgt_model}
Example 6
    def __init__(self,
                 options,
                 code_files,
                 merges=1,
                 separator="@@",
                 vocabularies=None,
                 glossaries=None):
        """BPE splitter plugin; takes the following args:
        merges: use this many merge operations
        code_files: paths to codes files, as a nested dict {"pair": {"locale": path}}
        separator: the string used to mark BPE splits inside a word
        vocabularies: vocab files built by subword_nmt/get_vocab, used to exclude rare words
        glossaries: words matching these patterns are left unaffected by the transformation
        """
        # Avoid mutable default arguments.
        vocabularies = vocabularies or {}
        glossaries = glossaries or {}
        debug(f"Creating instance of [{self.__class__.__name__}]")
        self.runtime_config = options
        self.bpes = {}
        for pair in code_files:
            if pair not in self.bpes:
                self.bpes[pair] = {}
            for locale in code_files.get(pair, {}):
                debug(f"  -Loading up BytePairEncoder for [{pair}/{locale}]")
                vocabulary = None
                try:
                    vocabulary_info = vocabularies.get(pair,
                                                       {}).get(locale, {})
                    with open(vocabulary_info["path"],
                              encoding='utf-8') as vocab_file:
                        vocabulary = read_vocabulary(
                            vocab_file, vocabulary_info.get("threshold"))
                except Exception as err:
                    debug(
                        f"  -Adding vocabulary caused an error: [{err.__class__.__name__}: {err}]... Ignoring it!"
                    )

                try:
                    glossary = glossaries.get(pair, {}).get(locale)
                    # BPE reads the codes file in its constructor, so it can
                    # be closed as soon as the object is built.
                    with open(code_files[pair][locale],
                              encoding='utf-8') as codes_file:
                        bpe = BPE(codes=codes_file,
                                  merges=merges,
                                  separator=separator,
                                  vocab=vocabulary,
                                  glossaries=glossary)
                except Exception as err:
                    warning(err)
                    warning("Could not create BPE for locale! Skipping it...")
                else:
                    self.bpes[pair][locale] = bpe
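The nested dict layout the constructor expects is easiest to see in a sketch; the class name, pair, locale, and paths below are all invented for illustration:

# Hypothetical configuration for a single en-de pair with a German-side model.
code_files = {"en-de": {"de": "codes/en-de.de.bpe"}}
vocabularies = {"en-de": {"de": {"path": "vocab/en-de.de.vocab",
                                 "threshold": 50}}}
splitter = BPESplitterPlugin(options={}, code_files=code_files,
                             vocabularies=vocabularies)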
Example 7
    def __init__(self, expdir):
        self.expdir = expdir
        self.en_tok = MosesTokenizer(lang="en")
        self.en_normalizer = MosesPunctNormalizer()
        self.en_detok = MosesDetokenizer(lang="en")
        self.xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        print("Initializing vocab and bpe")
        self.vocabulary = read_vocabulary(
            codecs.open(f"{expdir}/vocab/vocab.SRC", encoding="utf-8"), 5
        )
        self.bpe = BPE(
            codecs.open(f"{expdir}/vocab/bpe_codes.32k.SRC", encoding="utf-8"),
            -1,
            "@@",
            self.vocabulary,
            None,
        )

        print("Initializing model for translation")
        # initialize the model
        self.translator = Translator(
            f"{expdir}/final_bin", f"{expdir}/model/checkpoint_best.pt", batch_size=100
        )
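Once constructed, the tokenizer and BPE objects are typically chained like this; the name `engine` and the call order are assumptions, not code from the original repository:

# Hypothetical follow-up, assuming the instance above is bound to `engine`.
tokens = engine.en_tok.tokenize("How are you?", return_str=True)
bpe_line = engine.bpe.process_line(tokens)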
Example 8
    def __init__(self, bpe_codes: Union[str, TextIO],
                 bpe_vocab: Union[str, TextIO]):

        f_bpe_codes = None
        f_bpe_vocab = None

        try:
            if isinstance(bpe_codes, str):
                f_bpe_codes = open(bpe_codes, 'r', encoding='utf-8')
            if isinstance(bpe_vocab, str):
                f_bpe_vocab = open(bpe_vocab, 'r', encoding='utf-8')

            self.bpe = subword_nmt_bpe(codes=BPECodesAdapter(f_bpe_codes
                                                             or bpe_codes),
                                       vocab=read_vocabulary(f_bpe_vocab
                                                             or bpe_vocab,
                                                             threshold=None))
            self.bpe.version = (0, 2)

        finally:
            if f_bpe_codes:
                f_bpe_codes.close()
            if f_bpe_vocab:
                f_bpe_vocab.close()
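The Union[str, TextIO] signature means both call styles below are accepted; the class name and file paths are placeholders:

# By path (the constructor opens and closes the files itself):
encoder = BPESentenceEncoder("bpe.codes", "bpe.vocab")
# Or with file objects the caller manages:
with open("bpe.codes", encoding="utf-8") as c, \
        open("bpe.vocab", encoding="utf-8") as v:
    encoder = BPESentenceEncoder(c, v)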
Example 9
                        type=str,
                        help='Comma separated port numbers')
    parser.add_argument('-njobs',
                        type=int,
                        default=50,
                        help='Specify number of Parallel jobs')

    args = parser.parse_args()

    codefile = open(args.codefile, encoding='utf-8')

    if args.vocabfile != '':
        with open(args.vocabfile, 'r') as f:
            voc = f.read().split('\n')
            if voc[-1].strip() == '':
                voc = voc[:-1]
            vocab = apply_bpe.read_vocabulary(voc, 0)
    else:
        vocab = None

    bpe_encoder = apply_bpe.BPE(codefile, vocab=vocab)

    if args.word2bpefile != '':
        with open(args.word2bpefile, 'rb') as pk:
            word2bpe = pickle.load(pk)

    else:
        word2bpe = {}

    main(args)
             "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
             "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only",
             "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
stopwords = set(stopwords)
OLD_ENGLISH = {"thy": "your", "thou": "you", "Thy": "Your", "Thou": "You"}

# moses tokenizer
from sacremoses import MosesTruecaser, MosesTokenizer, MosesDetokenizer, MosesDetruecaser
mtok = MosesTokenizer(lang='en')
mtr = MosesTruecaser("vocab/truecase-model.en")
md = MosesDetokenizer(lang="en")
mdtr = MosesDetruecaser()

# bpe tokenizer
import codecs
from subword_nmt.apply_bpe import BPE, read_vocabulary
vocabulary = read_vocabulary(codecs.open("vocab/vocab.bpe35000.chr", encoding='utf-8'), 10)
bpe = BPE(codes=codecs.open("vocab/codes_file_chr_35000", encoding='utf-8'), merges=35000, vocab=vocabulary)
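# Illustrative usage (an addition, not part of the original script): the
# objects loaded above are typically chained like this at inference time;
# the truecase-then-BPE order is an assumption.
demo_line = mtok.tokenize("This is a test sentence.", return_str=True)
demo_line = mtr.truecase(demo_line, return_str=True)
demo_bpe = bpe.process_line(demo_line)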

# load nmt models
import sys

import onmt.opts
from translator_for_demo import build_translator
from onmt.utils.parse import ArgumentParser


def _parse_opt(opt):
    prec_argv = sys.argv
    sys.argv = sys.argv[:1]
    parser = ArgumentParser()
    onmt.opts.translate_opts(parser)

    opt['src'] = "dummy_src"