Example #1
import codecs
from subword_nmt import apply_bpe


def bpe_process(inp, outp, codes):
    # BPE-encode each input line using the given merge codes.
    codes = codecs.open(codes, encoding='utf-8')
    inp = codecs.open(inp, encoding='utf-8')
    outp = codecs.open(outp, "w", encoding='utf-8')
    bpe = apply_bpe.BPE(codes)
    for line in inp:
        outp.write(bpe.process_line(line))
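A minimal usage sketch; the file paths are hypothetical:

    bpe_process('train.tok.txt', 'train.bpe.txt', 'bpe.codes')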
Example #2
    def __init__(self, cfg):
        if cfg.bpe_codes is None:
            raise ValueError("--bpe-codes is required for --bpe=subword_nmt")
        codes = file_utils.cached_path(cfg.bpe_codes)
        try:
            from subword_nmt import apply_bpe

            bpe_parser = apply_bpe.create_parser()
            bpe_args = bpe_parser.parse_args([
                "--codes",
                codes,
                "--separator",
                cfg.bpe_separator,
            ])
            self.bpe = apply_bpe.BPE(
                bpe_args.codes,
                bpe_args.merges,
                bpe_args.separator,
                None,
                bpe_args.glossaries,
            )
            self.bpe_symbol = bpe_args.separator + " "
        except ImportError:
            raise ImportError(
                "Please install subword_nmt with: pip install subword-nmt")
Example #3
import codecs
from subword_nmt import apply_bpe


def apply_bpe_function(codes_file, train_file, apply_out, vocabulary=None):
    parser = apply_bpe.create_parser()
    args = parser.parse_args([
        "--codes",
        codes_file,
        "--input",
        train_file,
        "--output",
        apply_out,
        # "--vocabulary", vocabulary
    ])

    # Read the (optional) vocabulary used to filter BPE merges.
    if vocabulary:
        args.vocabulary = codecs.open(vocabulary, encoding='utf-8')
        vocabulary = apply_bpe.read_vocabulary(args.vocabulary,
                                               args.vocabulary_threshold)
    else:
        vocabulary = None

    # Reopen the argparse file handles with an explicit UTF-8 codec.
    args.codes = codecs.open(args.codes.name, encoding='utf-8')
    bpe = apply_bpe.BPE(args.codes, args.merges, args.separator, vocabulary,
                        args.glossaries)
    args.input = codecs.open(args.input.name, encoding='utf-8')
    args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
    for line in args.input:
        args.output.write(bpe.process_line(line, args.dropout))
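A hypothetical invocation; the file names are illustrative and the vocabulary filter is optional:

    apply_bpe_function('bpe.codes', 'train.tok.txt', 'train.bpe.txt',
                       vocabulary='vocab.txt')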
Example #4
    def __init__(self, model_path=None, glossary: Optional[List[str]] = None):
        from subword_nmt import apply_bpe
        self.unsegment_re = re.compile(r'@@( |$)')

        self.model = None
        if model_path is not None:
            self.model = apply_bpe.BPE(open(model_path), glossaries=glossary)
Example #5
 def __call__(self, parser, args, values, option_string=None):
     bpe_segmenter = None
     if values is not None:
         with open(values, encoding='utf-8') as bpe_codes_file:
             bpe_codes = apply_bpe.BPE(bpe_codes_file)
             bpe_segmenter = create_bpe_word_segmenter(bpe_codes)
     setattr(args, self.dest, bpe_segmenter)
Example #6
 def __init__(self, args):
     if args.bpe_codes is None:
         raise ValueError('--bpe-codes is required for --bpe=subword_nmt')
     codes = file_utils.cached_path(args.bpe_codes)
     try:
         from subword_nmt import apply_bpe
         bpe_parser = apply_bpe.create_parser()
         bpe_args = bpe_parser.parse_args([
             '--codes',
             codes,
             '--separator',
             args.bpe_separator,
         ])
         import codecs
         bpe_args.codes = codecs.open(codes, encoding='utf-8')
         self.bpe = apply_bpe.BPE(
             bpe_args.codes,
             bpe_args.merges,
             bpe_args.separator,
             None,
             bpe_args.glossaries,
         )
         self.bpe_symbol = bpe_args.separator + ' '
     except ImportError:
         raise ImportError(
             'Please install subword_nmt with: pip install subword-nmt')
Example #7
 def _load_from_codecs(self):
     """
     Load BPE from codecs file.
     """
     with PathManager.open(self.codecs, 'r',
                           encoding='utf-8') as codecs_file:
         self.bpe = apply_bpe.BPE(codecs_file)
Example #8
    def __init__(self, host, port, model_name, bpe_codes, max_toks=512):
        channel = grpc.insecure_channel("%s:%d" % (host, port))
        self.stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
        self.model_name = model_name
        self.max_toks = max_toks

        with open(bpe_codes) as f:
            self.bpe = apply_bpe.BPE(f)
Example #9
import codecs
from subword_nmt import apply_bpe


def generate_bpe_file(input, output, code):
    inputs = codecs.open(input, encoding='utf-8')
    outputs = codecs.open(output, mode='w', encoding='utf-8')
    codes = codecs.open(code, encoding='utf-8')
    bpe = apply_bpe.BPE(codes)
    for line in inputs:
        outputs.write(bpe.process_line(line))
    outputs.close()
Example #10
import codecs
import os
import tempfile

from subword_nmt import apply_bpe, learn_bpe


def build_vocab(imgs, params):
    # count up the number of words
    captions = []
    for img in imgs:
        for sent in img['sentences']:
            captions.append(' '.join(sent['tokens']))
    captions = '\n'.join(captions)
    all_captions = tempfile.NamedTemporaryFile(delete=False)
    all_captions.close()
    with open(all_captions.name, 'w') as txt_file:
        txt_file.write(captions)

    # Learn BPE codes on the concatenated captions.
    codecs_output = tempfile.NamedTemporaryFile(delete=False)
    codecs_output.close()
    with codecs.open(codecs_output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(codecs.open(all_captions.name, encoding='UTF-8'),
                            output, params['symbol_count'])

    with codecs.open(codecs_output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes)

    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.close()

    tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')

    for i, img in enumerate(imgs):
        img['final_captions'] = []
        for sent in img['sentences']:
            txt = ' '.join(sent['tokens'])
            txt = bpe.segment(txt).strip()
            img['final_captions'].append(txt.split(' '))
            tmpout.write(txt)
            tmpout.write('\n')
            if i < 20:
                print(txt)

    tmpout.close()
    tmpin = codecs.open(tmp.name, encoding='UTF-8')

    vocab = learn_bpe.get_vocabulary(tmpin)
    vocab = sorted(vocab.keys(), key=lambda x: vocab[x], reverse=True)

    # Always insert UNK
    print('inserting the special UNK token')
    vocab.append('UNK')

    print('Vocab size:', len(vocab))

    os.remove(all_captions.name)
    with open(codecs_output.name, 'r') as codes:
        bpe = codes.read()
    os.remove(codecs_output.name)
    os.remove(tmp.name)

    return vocab, bpe
Example #11
    def __init__(self, host, port, model_name, preprocessor, postprocessor,
                 bpe_codes):
        channel = grpc.insecure_channel("%s:%d" % (host, port))
        self.stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
        self.model_name = model_name

        self.preprocessor = preprocessor
        self.postprocessor = postprocessor
        with open(bpe_codes) as f:
            self.bpe = apply_bpe.BPE(f)
Example #12
from subword_nmt import apply_bpe


def process_bpe_dropout(code, vocab, in_name, out_name, dropout=0.0):
    """
    Apply BPE (optionally with BPE-dropout) to the given data and write the
    processed output file.
    """
    codes = open(code, encoding='utf-8')
    vocab_file = open(vocab, encoding='utf-8')
    vocabulary = apply_bpe.read_vocabulary(vocab_file, 1)
    # apply_bpe re-exports multiprocessing.cpu_count
    num_workers = apply_bpe.cpu_count()
    output_file = open(out_name, 'w', encoding='utf-8')
    bpe = apply_bpe.BPE(codes=codes, vocab=vocabulary)
    bpe.process_lines(in_name, output_file, dropout=dropout,
                      num_workers=num_workers)
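A hypothetical call that applies BPE-dropout with probability 0.1, a common setting for training-data regularization:

    process_bpe_dropout('bpe.codes', 'vocab.txt', 'train.tok.txt',
                        'train.bpe.txt', dropout=0.1)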
Example #13
    def __init__(self, task, models, args, src_bpe=None, bpe_symbol='@@ '):
        self.task = task
        self.models = models
        self.src_dict = task.source_dictionary
        self.tgt_dict = task.target_dictionary
        self.src_bpe = src_bpe
        self.use_cuda = torch.cuda.is_available() and not args.cpu
        self.args = args

        self.args.remove_bpe = bpe_symbol

        # optimize model for generation
        for model in self.models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
                need_attn=args.print_alignment,
            )
            if args.fp16:
                model.half()
            if self.use_cuda:
                model.cuda()

        self.generator = self.task.build_generator(args)

        # Load alignment dictionary for unknown word replacement
        # (None if no unknown word replacement, empty if no path to align dictionary)
        self.align_dict = utils.load_align_dict(args.replace_unk)

        self.max_positions = utils.resolve_max_positions(
            self.task.max_positions(),
            *[model.max_positions() for model in models]
        )

        if hasattr(args, 'source_lang'):
            self.tokenizer = MosesTokenizer(lang=args.source_lang)
        else:
            self.tokenizer = MosesTokenizer()

        if src_bpe is not None:
            bpe_parser = apply_bpe.create_parser()
            bpe_args = bpe_parser.parse_args(['--codes', self.src_bpe])
            self.bpe = apply_bpe.BPE(bpe_args.codes, bpe_args.merges, bpe_args.separator, None, bpe_args.glossaries)
        else:
            self.bpe = None
Example #14
 def subword(self, cleaned_filepaths, overwrite):
     bpe_filepath = get_bpe_path(self.experiment_name, self.merge_ops)
     if self.corpora_type == 'training':
         # Concatenated file necessary for BPE learning
         concatenated_filepath = get_concat_path(self.file_prefix)
         concatenate_files(cleaned_filepaths,
                           concatenated_filepath,
                           overwrite=overwrite)
         if os.path.exists(bpe_filepath) and not overwrite:
             print(bpe_filepath, 'already exists')
         else:
             print('Learning BPE encoding. This may take a while.')
             with open(concatenated_filepath, 'r',
                       encoding='utf-8') as infile, open(
                           bpe_filepath, 'w', encoding='utf-8') as outfile:
                 learn_bpe.learn_bpe(
                     infile, outfile, num_symbols=self.merge_ops
                 )  # Get codecs, write codecs to outfile
     print('Applying')
     with open(bpe_filepath, 'r', encoding='utf-8') as codec:
         bpe = apply_bpe.BPE(codec)
     print('Writing bpe')
     for i, lang in enumerate(self.langs):
         lang_filepath = cleaned_filepaths[i]
         processed_filepath = get_processed_data_path(
             self.experiment_name, self.corpora_type, lang)
         if not overwrite and os.path.exists(processed_filepath):
             continue
         with open(lang_filepath, 'r',
                   encoding='utf-8') as f1, open(processed_filepath,
                                                 'w',
                                                 encoding='utf-8') as f2:
             for line in f1:
                 f2.write(bpe.process_line(line))
         if self.corpora_type == 'training':
             vocab_filepath = get_vocab_path(self.experiment_name, lang)
             with open(processed_filepath, 'r',
                       encoding='utf-8') as train_file, open(
                           vocab_filepath, 'w',
                           encoding='utf-8') as vocab_file:
                 get_vocab.get_vocab(train_file, vocab_file)
Example #15
 def __init__(self, args):
     codes = file_utils.cached_path(args.bpe_codes)
     try:
         from subword_nmt import apply_bpe
         bpe_parser = apply_bpe.create_parser()
         bpe_args = bpe_parser.parse_args([
             '--codes',
             codes,
             '--separator',
             args.bpe_separator,
         ])
         self.bpe = apply_bpe.BPE(
             bpe_args.codes,
             bpe_args.merges,
             bpe_args.separator,
             None,
             bpe_args.glossaries,
         )
         self.bpe_symbol = bpe_args.separator + ' '
     except ImportError:
         raise ImportError(
             'Please install subword_nmt with: pip install subword-nmt')
Example #16
    ArgumentParser.validate_translate_opts(opt)
    engines[key] = {"translatorbest": build_translator(opt, report_score=True)}
    #translatorbest builds the best complete translation of the sentence

    opt.n_best = 5
    opt.max_length = 2
    opt.global_attention_function = 'sparsemax'
    ArgumentParser.validate_translate_opts(opt)
    engines[key]["translatorbigram"] = build_translator(opt, report_score=True)
    #translatorbigram builds the best translations of length two

    if value['src_bpe']:
        print("BPE in SRC side")
        bpe_src_code = os.path.join(dir_path, 'model', value['src_bpe'])
        merge_file = open(bpe_src_code, "r")
        bpe = apply_bpe.BPE(codes=merge_file)
        engines[key]["src_segmenter"] = lambda x: bpe.process_line(x.strip())
    else:
        engines[key]["src_segmenter"] = None


def preprocess_src(s, preprocess):
    s = s.lower()
    s = re.sub(r"([\“\”])", r'"', s)
    s = re.sub(r"([\‘\’])", r"'", s)
    s = re.sub(r"([\ः])", r":", s)
    s = re.sub(r"([-!$%^&*()_+|~=`{}\[\]:\";<>?,.\/#@।]+)", r" \1 ", s)
    # s = re.sub(r'"', r'&quot;', s)
    # s = re.sub(r"'", r"&apos;", s)
    s = re.sub(r"(\s+)", r" ", s)
    return s
Example #17
def get_bpe_segmenter(bpe_codes_path):
    bpe = apply_bpe.BPE(codes=open(bpe_codes_path, 'r', encoding='utf-8'))
    # string in -> list of subword tokens out
    segmenter = lambda x: bpe.process_line(x.strip()).split()
    return segmenter
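A hypothetical use of the returned segmenter; the exact subword splits depend on the learned merge codes:

    segment = get_bpe_segmenter('bpe.codes')
    tokens = segment('the quick brown fox')  # list of subword tokens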
Example #18
def run_bot(model_dir, bpe_src_code=None, tokenize=None):
    """
    Start the bot. This means loading the model according to the config file.

    :param model_dir: Model directory of trained Joey NMT model.
    :param bpe_src_code: BPE codes for source side processing (optional).
    :param tokenize: If True, tokenize inputs with Moses tokenizer.
    :return:
    """

    cfg_file = model_dir + "/config.yaml"

    logger = logging.getLogger(__name__)

    # load the Joey configuration
    cfg = load_config(cfg_file)

    # load the checkpoint
    if "load_model" in cfg['training'].keys():
        ckpt = cfg['training']["load_model"]
    else:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))

    # prediction parameters from config
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    max_output_length = cfg["training"].get("max_output_length", None)
    lowercase = cfg["data"].get("lowercase", False)

    # load the vocabularies
    src_vocab_file = cfg["training"]["model_dir"] + "/src_vocab.txt"
    trg_vocab_file = cfg["training"]["model_dir"] + "/trg_vocab.txt"
    src_vocab = build_vocab(field="src",
                            vocab_file=src_vocab_file,
                            dataset=None,
                            max_size=-1,
                            min_freq=0)
    trg_vocab = build_vocab(field="trg",
                            vocab_file=trg_vocab_file,
                            dataset=None,
                            max_size=-1,
                            min_freq=0)

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 0)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 1
        beam_alpha = -1

    # pre-processing
    if tokenize is not None:
        src_tokenizer = MosesTokenizer(lang=cfg["data"]["src"])
        trg_tokenizer = MosesDetokenizer(lang=cfg["data"]["trg"])
        # tokenize input
        tokenizer = lambda x: src_tokenizer.tokenize(x, return_str=True)
        detokenizer = lambda x: trg_tokenizer.detokenize(x.split(),
                                                         return_str=True)
    else:
        tokenizer = lambda x: x
        detokenizer = lambda x: x

    if bpe_src_code is not None and level == "bpe":
        # load bpe merge file
        merge_file = open(bpe_src_code, "r")
        bpe = apply_bpe.BPE(codes=merge_file)
        segmenter = lambda x: bpe.process_line(x.strip())
    elif level == "char":
        # split to chars
        segmenter = lambda x: list(x.strip())
    else:
        segmenter = lambda x: x.strip()

    # build model and load parameters into it
    model_checkpoint = load_checkpoint(ckpt, use_cuda)
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    print("Joey NMT model loaded successfully.")

    web_client = slack.WebClient(TOKEN, timeout=30)

    # get bot id
    bot_id = (web_client.api_call("auth.test")["user_id"].upper())

    # find bot channel id
    all_channels = web_client.api_call("conversations.list")["channels"]
    for c in all_channels:
        if c["name"] == BOT_CHANNEL:
            bot_channel_id = c["id"]

    slack_events_adapter = SlackEventAdapter(BOT_SIGNIN,
                                             endpoint="/slack/events")

    @slack_events_adapter.on("message")
    def handle_message(event_data):
        message = event_data["event"]
        if message.get("subtype") is None:
            channel = message["channel"]
            user = message["user"]
            text = message["text"].strip()
            if user != bot_id and message.get("subtype") is None:
                # translates all messages in its channel and mentions
                if channel == bot_channel_id or bot_id in text:
                    mention = "<@{}>".format(bot_id)
                    # TODO remove all possible mentions with regex
                    if mention in text:
                        parts = text.split(mention)
                        text = parts[0].strip() + parts[1].strip()
                    message = translate(text,
                                        beam_size=beam_size,
                                        beam_alpha=beam_alpha,
                                        level=level,
                                        lowercase=lowercase,
                                        max_output_length=max_output_length,
                                        model=model,
                                        postprocess=[detokenizer],
                                        preprocess=[tokenizer, segmenter],
                                        src_vocab=src_vocab,
                                        trg_vocab=trg_vocab,
                                        use_cuda=use_cuda,
                                        logger=logger)
                    web_client.chat_postMessage(text=message,
                                                token=TOKEN,
                                                channel=channel)

    # Error events
    @slack_events_adapter.on("error")
    def error_handler(err):
        print("ERROR: " + str(err))

    slack_events_adapter.start(port=3000)
Example #19
 def load_bpe(self, bpe_path):
   with open(bpe_path, 'r', encoding='utf-8') as codes_file:
     bpe_model = apply_bpe.BPE(codes=codes_file)
   return bpe_model
Example #20
    def toNumbers(self, vocabs, prevRules=True):
        if 'names_combined' in vocabs and vocabs[
                'names_combined'].codes is not None:
            vocabs['names_combined'].bpe = apply_bpe.BPE(
                vocabs['names_combined'].codes)
            vocabs['bpe'] = apply_bpe.BPE(vocabs['names_combined'].codes)
        if vocabs['seq2seq'].codes is not None:
            vocabs['seq2seq'].bpe = apply_bpe.BPE(vocabs['seq2seq'].codes)

        for e in self.examples:

            e['code_nums'] = vocabs['code'].to_num(e['code'])
            seq2seq_tokens = vocabs['seq2seq'].bpe.segment_tokens(
                e['seq2seq']
            ) if vocabs['seq2seq'].codes is not None else e['seq2seq']
            e['seq2seq_nums'] = vocabs['seq2seq'].to_num(seq2seq_tokens)
            e['seq2seq_vocab'] = Vocab(
                seq2seq_tokens, 0, 100000000, start=False,
                stop=False)  # A vocab just for this sentence
            e['seq2seq_in_src_nums'] = e['seq2seq_vocab'].to_num(
                vocabs['seq2seq'].addStartOrEnd(
                    seq2seq_tokens))  # use the local vocab for this sentence
            e['code_in_src_nums'] = e['seq2seq_vocab'].to_num(
                vocabs['code'].addStartOrEnd(
                    e['code']))  # use the local vocab for this sentence

            if self.opt.dataset == "concode":
                # ----- For the concode decoder -----
                # We have to do this because we concat them in the decoder
                # and there is padding between the nl, vars and methods in the
                # same example because of batching.
                # This isn't used, so it is commented out:
                #         e['src_in_src_nums'] = e['concode_vocab'].to_num(e['src']) # use the local vocab for this sentence
                e['var_in_src_nums'] = e['concode_vocab'].to_num(
                    e['concode_var'])  # use the local vocab for this sentence
                e['method_in_src_nums'] = e['concode_vocab'].to_num(
                    e['concode_method']
                )  # use the local vocab for this sentence
                #-------------------------------------------------------
                e['concode_next_rules_in_src_nums'] = e[
                    'concode_vocab'].to_num(vocabs['next_rules'].addStartOrEnd(
                        [
                            rhs(x) if lhs(x) in CDDataset.pre_terminal_symbols
                            else '<unk>' for x in e['next_rules']
                        ]))  # use the local vocab for this sentence
                #------------------------

                # --- Our Model -----------
                e['src_nums'] = vocabs['names_combined'].to_num(
                    [y for w in e['src'] for y in split_camel_case(w, vocabs)])
                e['varTypes_nums'] = vocabs['names_combined'].to_num(
                    [(split_camel_case(w, vocabs)) for w in e['varTypes']],
                    char=1)
                e['methodReturns_nums'] = vocabs['names_combined'].to_num(
                    [(split_camel_case(w, vocabs))
                     for w in e['methodReturns']],
                    char=1)
                e['varNames_nums'] = vocabs['names_combined'].to_num(
                    [(split_camel_case(w, vocabs)) for w in e['varNames']],
                    char=1)
                e['methodNames_nums'] = vocabs['names_combined'].to_num(
                    [(split_camel_case(w, vocabs)) for w in e['methodNames']],
                    char=1)

                #-----------------------------------
            e['next_rules_in_src_nums'] = e['seq2seq_vocab'].to_num(
                vocabs['next_rules'].addStartOrEnd([
                    rhs(x)
                    if lhs(x) in CDDataset.pre_terminal_symbols else '<unk>'
                    for x in e['next_rules']
                ]))  # use the local vocab for this sentence

            # ------- Rule decoder
            # There is no unk in the vocab, so this will throw an error
            # if the rule isn't there in the vocab
            if prevRules:
                # We don't need to do this during prediction?
                e['prev_rules_nums'] = vocabs['prev_rules'].to_num(
                    e['prev_rules'][:-1])
                e['prev_rules_split_nums'] = vocabs['nt'].to_num(
                    [['<s>']] + [[w] if '-->' not in w else [lhs(w)] +
                                 ['<sep>'] + rhs(w).split('___')
                                 for w in e['prev_rules'][:-1]],
                    char=1)
                e['parent_rules_nums'] = vocabs['prev_rules'].to_num(
                    e['parent_rules'])
                e['parent_rules_split_nums'] = vocabs['nt'].to_num(
                    [['<s>']] + [[w] if '-->' not in w else [lhs(w)] +
                                 ['<sep>'] + rhs(w).split('___')
                                 for w in e['parent_rules']],
                    char=1)

                # We need to ensure that only certain rules can be unked, not all. This
                # is taken care of when building the vocab
                e['nt_nums'] = vocabs['nt'].to_num(e['nt'])
                e['next_rules_nums'] = vocabs['next_rules'].to_num(
                    e['next_rules'])
Example #21
from subword_nmt import apply_bpe


def create_subword_bpe(codes):
    bpe_parser = apply_bpe.create_parser()
    bpe_args = bpe_parser.parse_args(['--codes', str(codes)])
    bpe = apply_bpe.BPE(bpe_args.codes)
    return bpe
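A hypothetical call, assuming 'bpe.codes' was produced by learn_bpe:

    bpe = create_subword_bpe('bpe.codes')
    print(bpe.process_line('hello world'))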
Example #22
                        type=str,
                        help='Comma separated port numbers')
    parser.add_argument('-njobs',
                        type=int,
                        default=50,
                        help='Specify number of Parallel jobs')

    args = parser.parse_args()

    codefile = open(args.codefile)

    if args.vocabfile != '':
        with open(args.vocabfile, 'r') as f:
            voc = f.read().split('\n')
            if voc[-1].strip() == '':
                voc = voc[:-1]
            vocab = apply_bpe.read_vocabulary(voc, 0)
    else:
        vocab = None

    bpe_encoder = apply_bpe.BPE(codefile, vocab=vocab)

    if args.word2bpefile != '':
        with open(args.word2bpefile, 'rb') as pk:
            word2bpe = pickle.load(pk)

    else:
        word2bpe = {}

    main(args)
Example #23
 def set_bpe(self, codes_file):
     with codecs.open(codes_file, encoding='UTF-8') as codes:
         self.bpe = apply_bpe.BPE(codes, separator=self.separator)
Example #24
  def load_model(self, src_language, trg_language, domain, bpe_src_code=None, tokenize=None):
    """ Load model for given trg language. """
    # model_dir = "{}-{}".format(self._model_dir_prefix, trg_language)
    model_dir = f"{self._model_dir_prefix}{src_language}-{trg_language}-{domain}"

    # Load the checkpoint.
    ckpt_path = os.path.join(model_dir, 'model.ckpt')
        
    # Load the vocabularies.
    src_vocab_path = os.path.join(model_dir, 'src_vocab.txt')

    trg_vocab_path = os.path.join(model_dir, 'trg_vocab.txt')
    
    # Load the config.
    config_path = os.path.join(model_dir, 'config_orig.yaml')

    # Adjust config.
    config = load_config(config_path)
    new_config_file = os.path.join(model_dir, 'config.yaml')
    config = self._update_config(config, src_vocab_path, trg_vocab_path,
                                 model_dir, ckpt_path)
    with open(new_config_file, 'w') as cfile:
      yaml.dump(config, cfile)

    # print('Loaded model for {}-{}.'.format(self._src_language, trg_language))
    print('Loaded model for {}-{}.'.format(src_language, trg_language))

    conf = {}

    logger = logging.getLogger(__name__)
    conf["logger"] = logger

    # load the Joey configuration
    cfg = load_config(new_config_file)

    # load the checkpoint
    if "load_model" in cfg['training'].keys():
        ckpt = cfg['training']["load_model"]
    else:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError("No checkpoint found in directory {}."
                                    .format(model_dir))

    # prediction parameters from config
    conf["use_cuda"] = cfg["training"].get("use_cuda", False) if torch.cuda.is_available() else False

    conf["level"] = cfg["data"]["level"]
    conf["max_output_length"] = cfg["training"].get("max_output_length", None)
    conf["lowercase"] = cfg["data"].get("lowercase", False)

    # load the vocabularies
    src_vocab_file = cfg["training"]["model_dir"] + "/src_vocab.txt"
    trg_vocab_file = cfg["training"]["model_dir"] + "/trg_vocab.txt"
    
    conf["src_vocab"] = build_vocab(field="src", vocab_file=src_vocab_file,
                            dataset=None, max_size=-1, min_freq=0)
    conf["trg_vocab"] = build_vocab(field="trg", vocab_file=trg_vocab_file,
                            dataset=None, max_size=-1, min_freq=0)

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        conf["beam_size"] = cfg["testing"].get("beam_size", 0)
        conf["beam_alpha"] = cfg["testing"].get("alpha", -1)
    else:
        conf["beam_size"] = 1
        conf["beam_alpha"] = -1

    # pre-processing
    if tokenize is not None:
        src_tokenizer = MosesTokenizer(lang=cfg["data"]["src"])
        trg_tokenizer = MosesDetokenizer(lang=cfg["data"]["trg"])
        # tokenize input
        tokenizer = lambda x: src_tokenizer.tokenize(x, return_str=True)
        detokenizer = lambda x: trg_tokenizer.detokenize(
            x.split(), return_str=True)
    else:
        tokenizer = lambda x: x
        detokenizer = lambda x: x

    if bpe_src_code is not None and conf["level"] == "bpe":
        # load bpe merge file
        merge_file = open(bpe_src_code, "r")
        bpe = apply_bpe.BPE(codes=merge_file)
        segmenter = lambda x: bpe.process_line(x.strip())
    elif conf["level"] == "char":
        # split to chars
        segmenter = lambda x: list(x.strip())
    else:
        segmenter = lambda x: x.strip()

    conf["preprocess"] = [tokenizer, segmenter]
    conf["postprocess"] = [detokenizer]
    # build model and load parameters into it
    model_checkpoint = load_checkpoint(ckpt, conf["use_cuda"])
    model = build_model(cfg["model"], src_vocab=conf["src_vocab"], trg_vocab=conf["trg_vocab"])
    model.load_state_dict(model_checkpoint["model_state"])
    # ipdb.set_trace()
    if conf["use_cuda"]:
        model.cuda()
    conf["model"] = model
    print("Joey NMT model loaded successfully.")
    return conf
Example #25
    def __init__(self,
                 elements,
                 prune,
                 max_num,
                 start=True,
                 stop=True,
                 pad=True,
                 unk=True,
                 rule=False,
                 bpe=-1):
        self.start = start
        self.stop = stop
        self.codes = None
        vocab = Counter()
        self.max_num = max_num
        self.itos = []
        self.stoi = {}
        if pad:
            self.addSymbol('<blank>')
        if unk:
            self.addSymbol('<unk>')
        if start:
            self.addSymbol('<s>')
        if stop:
            self.addSymbol('</s>')
        self.rule = rule
        if rule:  # Adding these for both ATIS and CONCODE. Extra things in the vocab are ok.
            for pre_terminal in CDDataset.pre_terminal_symbols:
                self.addSymbol(CDDataset._unk_rule_from_Nt(pre_terminal))

        if bpe >= 0:
            self.codes = learn_bpe.learn_bpe(elements, bpe,
                                             0)  #  last is min freq
            b = apply_bpe.BPE(self.codes)
            elements = b.segment_tokens(elements)

        for w in elements:
            vocab[w] += 1
        if bpe >= 0:
            print('Vocab size {}'.format(len(vocab)))

        # prune low frequency words
        max_vocab = self.max_num if not rule else 100000000000
        for (w, f) in vocab.most_common(max_vocab):
            if ((not rule and f > prune)
                    or (rule and not CDDataset._is_terminal_rule(w))
                    or (rule and CDDataset._is_terminal_rule(w)
                        and len(self.itos) < self.max_num)
                    or w.endswith("_concodeNT")):
                word = w.replace('concodeclass_',
                                 '').replace('concodefunc_', '')
                self.itos.append(word)
                self.stoi[word] = len(self.itos) - 1
            else:  #map everything else to unk
                if rule:
                    # We need the right kind of UNK rule here
                    mapped_to_known_unk = False
                    for pre_terminal in CDDataset.pre_terminal_symbols:
                        if pre_terminal in w:
                            self.stoi[w] = self.stoi[
                                CDDataset._unk_rule_from_Nt(pre_terminal)]
                            mapped_to_known_unk = True
                            break

                    if not mapped_to_known_unk:
                        # An unk type we don't know about. Investigate.
                        import ipdb
                        ipdb.set_trace()
                        # For next_rules, we cannot have any other type of unk
                        self.stoi[w] = self.stoi['<unk>']
                else:
                    self.stoi[w] = self.stoi['<unk>']
Example #26
    def __init__(self, task, models, args, src_bpe=None, bpe_symbol='@@ '):
        self.task = task
        self.models = models
        self.src_dict = task.source_dictionary
        self.tgt_dict = task.target_dictionary
        self.src_bpe = src_bpe
        self.use_cuda = torch.cuda.is_available() and not args.cpu
        self.args = args

        # optimize model for generation
        for model in self.models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
                need_attn=args.print_alignment,
            )
            if args.fp16:
                model.half()
            if self.use_cuda:
                model.cuda()

        self.generator = self.task.build_generator(args)

        # Load alignment dictionary for unknown word replacement
        # (None if no unknown word replacement, empty if no path to align dictionary)
        self.align_dict = utils.load_align_dict(args.replace_unk)

        self.max_positions = utils.resolve_max_positions(
            self.task.max_positions(),
            *[model.max_positions() for model in models]
        )

        self.in_transforms = []
        self.out_transforms = []

        if getattr(args, 'moses', False):
            tokenizer = MosesTokenizer(lang=args.source_lang or 'en')
            detokenizer = MosesDetokenizer(lang=args.target_lang or 'en')
            self.in_transforms.append(lambda s: tokenizer.tokenize(s, return_str=True))
            self.out_transforms.append(lambda s: detokenizer.detokenize(s.split()))
        elif getattr(args, 'nltk', False):
            from nltk.tokenize import word_tokenize
            self.in_transforms.append(lambda s: ' '.join(word_tokenize(s)))

        if getattr(args, 'gpt2_bpe', False):
            from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
            encoder_json = os.path.join(os.path.dirname(src_bpe), 'encoder.json')
            vocab_bpe = src_bpe
            encoder = get_encoder(encoder_json, vocab_bpe)
            self.in_transforms.append(lambda s: ' '.join(map(str, encoder.encode(s))))
            self.out_transforms.append(lambda s: ' '.join(t for t in s.split() if t != '<unk>'))
            self.out_transforms.append(lambda s: encoder.decode(map(int, s.strip().split())))
        elif getattr(args, 'sentencepiece', False):
            import sentencepiece as spm
            sp = spm.SentencePieceProcessor()
            sp.Load(src_bpe)
            self.in_transforms.append(lambda s: ' '.join(sp.EncodeAsPieces(s)))
            self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, 'sentencepiece'))
        elif src_bpe is not None:
            bpe_parser = apply_bpe.create_parser()
            bpe_args = bpe_parser.parse_args(['--codes', self.src_bpe])
            bpe = apply_bpe.BPE(bpe_args.codes, bpe_args.merges, bpe_args.separator, None, bpe_args.glossaries)
            self.in_transforms.append(lambda s: bpe.process_line(s))
            self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, bpe_symbol))
Example #27
def load_model(model_dir, bpe_src_code=None, tokenize=None):
    """
    Start the bot. This means loading the model according to the config file.

    :param model_dir: Model directory of trained Joey NMT model.
    :param bpe_src_code: BPE codes for source side processing (optional).
    :param tokenize: If True, tokenize inputs with Moses tokenizer.
    :return:
    """
    conf = {}
    cfg_file = model_dir+"/config.yaml"

    logger = logging.getLogger(__name__)
    conf["logger"] = logger
    # load the Joey configuration
    cfg = load_config(cfg_file)

    # load the checkpoint
    if "load_model" in cfg['training'].keys():
        ckpt = cfg['training']["load_model"]
    else:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError("No checkpoint found in directory {}."
                                    .format(model_dir))

    # prediction parameters from config
    conf["use_cuda"] = cfg["training"].get("use_cuda", False)
    conf["level"] = cfg["data"]["level"]
    conf["max_output_length"] = cfg["training"].get("max_output_length", None)
    conf["lowercase"] = cfg["data"].get("lowercase", False)

    # load the vocabularies
    src_vocab_file = cfg["training"]["model_dir"] + "/src_vocab.txt"
    trg_vocab_file = cfg["training"]["model_dir"] + "/trg_vocab.txt"

    conf["src_vocab"] = build_vocab(field="src", vocab_file=src_vocab_file,
                            dataset=None, max_size=-1, min_freq=0)
    conf["trg_vocab"] = build_vocab(field="trg", vocab_file=trg_vocab_file,
                            dataset=None, max_size=-1, min_freq=0)

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        conf["beam_size"] = cfg["testing"].get("beam_size", 0)
        conf["beam_alpha"] = cfg["testing"].get("alpha", -1)
    else:
        conf["beam_size"] = 1
        conf["beam_alpha"] = -1

    # pre-processing
    if tokenize is not None:
        src_tokenizer = MosesTokenizer(lang=cfg["data"]["src"])
        trg_tokenizer = MosesDetokenizer(lang=cfg["data"]["trg"])
        # tokenize input
        tokenizer = lambda x: src_tokenizer.tokenize(x, return_str=True)
        detokenizer = lambda x: trg_tokenizer.detokenize(
            x.split(), return_str=True)
    else:
        tokenizer = lambda x: x
        detokenizer = lambda x: x

    if bpe_src_code is not None and conf["level"] == "bpe":
        # load bpe merge file
        merge_file = open(bpe_src_code, "r")
        bpe = apply_bpe.BPE(codes=merge_file)
        segmenter = lambda x: bpe.process_line(x.strip())
    elif conf["level"] == "char":
        # split to chars
        segmenter = lambda x: list(x.strip())
    else:
        segmenter = lambda x: x.strip()

    conf["preprocess"] = [tokenizer, segmenter]
    conf["postprocess"] = [detokenizer]
    # build model and load parameters into it
    model_checkpoint = load_checkpoint(ckpt, conf["use_cuda"])
    model = build_model(cfg["model"], src_vocab=conf["src_vocab"], trg_vocab=conf["trg_vocab"])
    model.load_state_dict(model_checkpoint["model_state"])

    if conf["use_cuda"]:
        model.cuda()
    conf["model"] = model
    print("Joey NMT model loaded successfully.")
    return conf
Example #28
def load_codes(codes, joiner="@@"):
    bpe_object = apply_bpe.BPE(open(codes, encoding="utf-8"), separator=joiner)
    return bpe_object
Example #29
 def _load_from_codecs(self):
     with open(self.codecs, 'r') as codecs_file:
         self.bpe = apply_bpe.BPE(codecs_file)
Example #30
import codecs
from subword_nmt import apply_bpe


def get_bpe_object(codes_file_path):
    codes = codecs.open(codes_file_path, encoding='utf-8')
    bpe = apply_bpe.BPE(codes)
    codes.close()  # safe: BPE reads all codes in its constructor
    return bpe
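A hypothetical call; closing the codes file right away is safe because BPE parses all merges eagerly:

    bpe = get_bpe_object('bpe.codes')
    print(bpe.segment('a test sentence'))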