Example 1
 def warm_up(self, vocabs=None):
     """Load subword models."""
     super().warm_up(None)
     from subword_nmt.apply_bpe import BPE, read_vocabulary
     import codecs
     src_codes = codecs.open(self.src_subword_model, encoding='utf-8')
     src_vocabulary, tgt_vocabulary = None, None
     if self.src_subword_vocab != "" and self.src_vocab_threshold > 0:
         src_vocabulary = read_vocabulary(
             codecs.open(self.src_subword_vocab, encoding='utf-8'),
             self.src_vocab_threshold)
     if self.tgt_subword_vocab != "" and self.tgt_vocab_threshold > 0:
         tgt_vocabulary = read_vocabulary(
             codecs.open(self.tgt_subword_vocab, encoding='utf-8'),
             self.tgt_vocab_threshold)
     load_src_model = BPE(codes=src_codes, vocab=src_vocabulary)
     if self.share_vocab and (src_vocabulary == tgt_vocabulary):
         self.load_models = {
             'src': load_src_model,
             'tgt': load_src_model
         }
     else:
         tgt_codes = codecs.open(self.tgt_subword_model, encoding='utf-8')
         load_tgt_model = BPE(codes=tgt_codes, vocab=tgt_vocabulary)
         self.load_models = {
             'src': load_src_model,
             'tgt': load_tgt_model
         }
Example 2
import codecs

from subword_nmt import apply_bpe


def apply_bpe_function(codes_file, train_file, apply_out, vocabulary=None):
    parser = apply_bpe.create_parser()
    args = parser.parse_args([
        "--codes",
        codes_file,
        "--input",
        train_file,
        "--output",
        apply_out,
        # "--vocabulary", vocabulary
    ])

    if vocabulary:
        args.vocabulary = codecs.open(vocabulary, encoding='utf-8')
        vocabulary = apply_bpe.read_vocabulary(args.vocabulary,
                                               args.vocabulary_threshold)
    else:
        vocabulary = None

    args.codes = codecs.open(args.codes.name, encoding='utf-8')
    bpe = apply_bpe.BPE(args.codes, args.merges, args.separator, vocabulary,
                        args.glossaries)
    args.input = codecs.open(args.input.name, encoding='utf-8')
    args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
    for line in args.input:
        args.output.write(bpe.process_line(line, args.dropout))
    # Close the handles so the output is flushed to disk.
    args.input.close()
    args.output.close()
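A minimal call of the helper above might look like this; the file paths are hypothetical placeholders, not files from the original project:

# Hypothetical files: BPE codes learned with subword-nmt, tokenized input,
# and an optional vocabulary produced by subword-nmt get-vocab.
apply_bpe_function("codes.bpe", "train.tok.en", "train.bpe.en",
                   vocabulary="vocab.en")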
Example 3
import io

from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import read_vocabulary
from subword_nmt.get_vocab import get_vocab

# NOTE: `apply_bpe` below refers to a project-local helper that wraps
# subword_nmt.apply_bpe.BPE, not the subword_nmt module of the same name.


def train_subword_model(src_text, trg_text, nb_symbols=10000):

    # create text content with source and target text
    content = []
    content.extend(src_text)
    content.extend(trg_text)

    bpe_model_io = io.StringIO()
    src_vocab_io = io.StringIO()
    trg_vocab_io = io.StringIO()

    # 1. Learn BPE model on both source and target text
    # 1.1 cat {train_file}.L1 {train_file}.L2 | subword-nmt learn-bpe -s {num_operations} -o {codes_file}
    # 1.2 subword-nmt apply-bpe -c {codes_file} < {train_file}.L1 | subword-nmt get-vocab > {vocab_file}.L1
    # 1.3 subword-nmt apply-bpe -c {codes_file} < {train_file}.L2 | subword-nmt get-vocab > {vocab_file}.L2

    # 1.1 learn_bpe(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input, total_symbols=args.total_symbols))
    learn_bpe(content, bpe_model_io, nb_symbols, 0, False, False, False)

    # 1.2
    src_text_tok = apply_bpe(bpe_model_io, src_text, merges=nb_symbols)
    get_vocab(src_text_tok, src_vocab_io)
    src_vocab_io.seek(0)
    src_vocab = read_vocabulary(src_vocab_io, 0)
    # 1.3
    trg_text_tok = apply_bpe(bpe_model_io, trg_text, merges=nb_symbols)
    get_vocab(trg_text_tok, trg_vocab_io)
    trg_vocab_io.seek(0)
    trg_vocab = read_vocabulary(trg_vocab_io, 0)

    # 2. Re-apply BPE with the obtained vocabularies
    # subword-nmt apply-bpe -c {codes_file} --vocabulary {vocab_file}.L1 --vocabulary-threshold 50 < {train_file}.L1 > {train_file}.BPE.L1
    src_text_tok = apply_bpe(bpe_model_io, src_text, vocab=src_vocab)
    trg_text_tok = apply_bpe(bpe_model_io, trg_text, vocab=trg_vocab)

    bpe_model = bpe_model_io.getvalue()

    bpe_model_io.close()
    src_vocab_io.close()
    trg_vocab_io.close()

    return bpe_model, src_vocab, trg_vocab, src_text_tok, trg_text_tok
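A usage sketch for train_subword_model; the toy sentence lists below are invented for illustration:

# Any iterables of whitespace-tokenized sentences will do here.
src_sentences = ["ein kleines haus", "das haus ist alt"]
trg_sentences = ["a small house", "the house is old"]
bpe_model, src_vocab, trg_vocab, src_tok, trg_tok = train_subword_model(
    src_sentences, trg_sentences, nb_symbols=1000)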
Example 4
from subword_nmt import apply_bpe


def process_bpe_dropout(code, vocab, in_name, out_name, dropout=0.0):
    """
    To apply BPE on desired data and output processed files.
    """
    codes = open(code, encoding='utf-8')
    vocab_file = open(vocab, encoding='utf-8')
    vocabulary = apply_bpe.read_vocabulary(vocab_file, 1)
    num_workers = apply_bpe.cpu_count()
    output_file = open(out_name, 'w', encoding='utf-8')
    bpe = apply_bpe.BPE(codes=codes, vocab=vocabulary)
    bpe.process_lines(in_name, output_file,
                      dropout=dropout, num_workers=num_workers)
    # Close the handles so the output is flushed to disk.
    codes.close()
    vocab_file.close()
    output_file.close()
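Called with dropout > 0, the helper above produces stochastic segmentations (BPE-dropout); the file names below are placeholders:

# Hypothetical paths; dropout=0.1 yields a different segmentation on each
# run, which is the usual training-time setting for BPE-dropout.
process_bpe_dropout("codes.bpe", "vocab.en", "train.tok.en",
                    "train.bpe.en", dropout=0.1)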
Example 5
 def warm_up(self, vocabs=None):
     """Load subword models."""
     super().warm_up(None)
     from subword_nmt.apply_bpe import BPE, read_vocabulary
     # Load vocabulary file if provided and set threshold
     src_vocabulary, tgt_vocabulary = None, None
     if self.src_subword_vocab != "" and self.src_vocab_threshold > 0:
         with open(self.src_subword_vocab, encoding='utf-8') as _sv:
             src_vocabulary = read_vocabulary(_sv, self.src_vocab_threshold)
     if self.tgt_subword_vocab != "" and self.tgt_vocab_threshold > 0:
         with open(self.tgt_subword_vocab, encoding='utf-8') as _tv:
             tgt_vocabulary = read_vocabulary(_tv, self.tgt_vocab_threshold)
     # Load Subword Model
     with open(self.src_subword_model, encoding='utf-8') as src_codes:
         load_src_model = BPE(codes=src_codes, vocab=src_vocabulary)
     if self.share_vocab and (src_vocabulary == tgt_vocabulary):
         self.load_models = {'src': load_src_model, 'tgt': load_src_model}
     else:
         with open(self.tgt_subword_model, encoding='utf-8') as tgt_codes:
             load_tgt_model = BPE(codes=tgt_codes, vocab=tgt_vocabulary)
         self.load_models = {'src': load_src_model, 'tgt': load_tgt_model}
Example 6
    def __init__(self,
                 options,
                 code_files,
                 merges=1,
                 separator="@@",
                 vocabularies=None,
                 glossaries=None):
        """BPE splitter plugin; takes the following args:
        merges: use this many merge operations
        code_files: paths to codes files, as a nested dict {"pair": {"locale": path}}
        separator: the string used to mark BPE splits inside a word
        vocabularies: vocab files built by subword_nmt/get_vocab, used to exclude rare words
        glossaries: words matching these patterns are left unaffected by the transformation
        """
        # Avoid mutable default arguments.
        vocabularies = vocabularies or {}
        glossaries = glossaries or {}
        debug(f"Creating instance of [{self.__class__.__name__}]")
        self.runtime_config = options
        self.bpes = {}
        for pair in code_files:
            if pair not in self.bpes:
                self.bpes[pair] = {}
            for locale in code_files.get(pair, {}):
                debug(f"  -Loading up BytePairEncoder for [{pair}/{locale}]")
                vocabulary = None
                try:
                    vocabulary_info = vocabularies.get(pair,
                                                       {}).get(locale, {})
                    with open(vocabulary_info["path"],
                              encoding='utf-8') as vocab_file:
                        vocabulary = read_vocabulary(
                            vocab_file, vocabulary_info.get("threshold"))
                except Exception as err:
                    debug(
                        f"  -Adding vocabulary caused an error: [{err.__class__.__name__}: {err}]... Ignoring it!"
                    )

                try:
                    glossary = glossaries.get(pair, {}).get(locale)
                    # BPE reads the codes file in its constructor, so it can
                    # be closed as soon as the object is built.
                    with open(code_files[pair][locale],
                              encoding='utf-8') as codes_file:
                        bpe = BPE(codes=codes_file,
                                  merges=merges,
                                  separator=separator,
                                  vocab=vocabulary,
                                  glossaries=glossary)
                except Exception as err:
                    warning(err)
                    warning("Could not create BPE for locale! Skipping it...")
                else:
                    self.bpes[pair][locale] = bpe
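The nested dict layout the constructor expects is easiest to see in a sketch; the class name, pair, locale, and paths below are all invented for illustration:

# Hypothetical configuration for a single en-de pair with a German-side model.
code_files = {"en-de": {"de": "codes/en-de.de.bpe"}}
vocabularies = {"en-de": {"de": {"path": "vocab/en-de.de.vocab",
                                 "threshold": 50}}}
splitter = BPESplitterPlugin(options={}, code_files=code_files,
                             vocabularies=vocabularies)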
Example 7
    def __init__(self, expdir):
        self.expdir = expdir
        self.en_tok = MosesTokenizer(lang="en")
        self.en_normalizer = MosesPunctNormalizer()
        self.en_detok = MosesDetokenizer(lang="en")
        self.xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        print("Initializing vocab and bpe")
        self.vocabulary = read_vocabulary(
            codecs.open(f"{expdir}/vocab/vocab.SRC", encoding="utf-8"), 5
        )
        self.bpe = BPE(
            codecs.open(f"{expdir}/vocab/bpe_codes.32k.SRC", encoding="utf-8"),
            -1,
            "@@",
            self.vocabulary,
            None,
        )

        print("Initializing model for translation")
        # initialize the model
        self.translator = Translator(
            f"{expdir}/final_bin", f"{expdir}/model/checkpoint_best.pt", batch_size=100
        )
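Once constructed, the tokenizer and BPE objects are typically chained like this; the name `engine` and the call order are assumptions, not code from the original repository:

# Hypothetical follow-up, assuming the instance above is bound to `engine`.
tokens = engine.en_tok.tokenize("How are you?", return_str=True)
bpe_line = engine.bpe.process_line(tokens)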
Example 8
    def __init__(self, bpe_codes: Union[str, TextIO],
                 bpe_vocab: Union[str, TextIO]):

        f_bpe_codes = None
        f_bpe_vocab = None

        try:
            if isinstance(bpe_codes, str):
                f_bpe_codes = open(bpe_codes, 'r', encoding='utf-8')
            if isinstance(bpe_vocab, str):
                f_bpe_vocab = open(bpe_vocab, 'r', encoding='utf-8')

            self.bpe = subword_nmt_bpe(codes=BPECodesAdapter(f_bpe_codes
                                                             or bpe_codes),
                                       vocab=read_vocabulary(f_bpe_vocab
                                                             or bpe_vocab,
                                                             threshold=None))
            self.bpe.version = (0, 2)

        finally:
            if f_bpe_codes:
                f_bpe_codes.close()
            if f_bpe_vocab:
                f_bpe_vocab.close()
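The Union[str, TextIO] signature means both call styles below are accepted; the class name and file paths are placeholders:

# By path (the constructor opens and closes the files itself):
encoder = BPESentenceEncoder("bpe.codes", "bpe.vocab")
# Or with file objects the caller manages:
with open("bpe.codes", encoding="utf-8") as c, \
        open("bpe.vocab", encoding="utf-8") as v:
    encoder = BPESentenceEncoder(c, v)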
Example 9
                        type=str,
                        help='Comma separated port numbers')
    parser.add_argument('-njobs',
                        type=int,
                        default=50,
                        help='Specify number of Parallel jobs')

    args = parser.parse_args()

    codefile = open(args.codefile, encoding='utf-8')

    if args.vocabfile != '':
        with open(args.vocabfile, 'r') as f:
            voc = f.read().split('\n')
            if voc[-1].strip() == '':
                voc = voc[:-1]
            vocab = apply_bpe.read_vocabulary(voc, 0)
    else:
        vocab = None

    bpe_encoder = apply_bpe.BPE(codefile, vocab=vocab)

    if args.word2bpefile != '':
        with open(args.word2bpefile, 'rb') as pk:
            word2bpe = pickle.load(pk)

    else:
        word2bpe = {}

    main(args)
             "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
             "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only",
             "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
stopwords = set(stopwords)
OLD_ENGLISH = {"thy": "your", "thou": "you", "Thy": "Your", "Thou": "You"}

# moses tokenizer
from sacremoses import MosesTruecaser, MosesTokenizer, MosesDetokenizer, MosesDetruecaser
mtok = MosesTokenizer(lang='en')
mtr = MosesTruecaser("vocab/truecase-model.en")
md = MosesDetokenizer(lang="en")
mdtr = MosesDetruecaser()

# bpe tokenizer
import codecs
from subword_nmt.apply_bpe import BPE, read_vocabulary
vocabulary = read_vocabulary(codecs.open("vocab/vocab.bpe35000.chr", encoding='utf-8'), 10)
bpe = BPE(codes=codecs.open("vocab/codes_file_chr_35000", encoding='utf-8'), merges=35000, vocab=vocabulary)
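# Illustrative usage (an addition, not part of the original script): the
# objects loaded above are typically chained like this at inference time;
# the truecase-then-BPE order is an assumption.
demo_line = mtok.tokenize("This is a test sentence.", return_str=True)
demo_line = mtr.truecase(demo_line, return_str=True)
demo_bpe = bpe.process_line(demo_line)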

# load nmt models
import sys

import onmt.opts
from translator_for_demo import build_translator
from onmt.utils.parse import ArgumentParser


def _parse_opt(opt):
    prec_argv = sys.argv
    sys.argv = sys.argv[:1]
    parser = ArgumentParser()
    onmt.opts.translate_opts(parser)

    opt['src'] = "dummy_src"