예제 #1
0
파일: rescore.py 프로젝트: whr94621/nematus
def main(source_file, nbest_file, output_file, rescorer_settings):
    """Load the options of every model and rescore the n-best list.

    One config is read per entry of ``rescorer_settings.models`` and passed
    through ``fill_options``; the resulting list is handed to
    ``rescore_model`` together with the untouched arguments.
    """
    model_configs = []
    for model_path in rescorer_settings.models:
        config = load_config(model_path)
        model_configs.append(config)
        fill_options(config)
    rescore_model(source_file, nbest_file, output_file, rescorer_settings,
                  model_configs)
예제 #2
0
def main(source_file, nbest_file, output_file, rescorer_settings):
    """Build one option namespace per model, then run ``rescore``.

    Each model's config is loaded, passed through ``fill_options``, tagged
    with the model path under the ``'reload'`` key and finally converted to
    an ``argparse.Namespace``.
    """

    def _namespace_for(model_path):
        # Load, back-fill and wrap the configuration of a single model.
        config = load_config(model_path)
        fill_options(config)
        config['reload'] = model_path
        return argparse.Namespace(**config)

    options = [_namespace_for(path) for path in rescorer_settings.models]

    rescore(source_file, nbest_file, output_file, rescorer_settings, options)
예제 #3
0
def main(models, source_file, nbest_file, saveto, b=80,
         normalize=False, verbose=False, alignweights=False):
    """Read the options of each model and forward everything to
    ``rescore_model``.

    All arguments are passed through unchanged; the only work done here is
    loading one config per entry of ``models`` and running ``fill_options``
    on it.
    """
    options = []
    for model_path in models:
        model_options = load_config(model_path)
        options.append(model_options)
        fill_options(model_options)

    rescore_model(source_file, nbest_file, saveto, models, options,
                  b, normalize, verbose, alignweights)
예제 #4
0
    def _load_model_options(self):
        """
        Read the config of every model in ``self._models`` and store the
        resulting list in ``self._options``.
        """
        loaded = []
        for model_path in self._models:
            config = load_config(model_path)
            loaded.append(config)
            # fill_options is applied for backward compatibility
            fill_options(config)

        self._options = loaded
예제 #5
0
파일: rescore.py 프로젝트: sohuren/DL4MT
def main(models, source_file, nbest_file, saveto, b=80,
         normalize=False, verbose=False, alignweights=False):
    """Collect per-model options and delegate the work to ``rescore_model``."""

    def _read_options(path):
        # Load one model's config and run it through fill_options.
        loaded = load_config(path)
        fill_options(loaded)
        return loaded

    options = [_read_options(path) for path in models]

    rescore_model(source_file, nbest_file, saveto, models, options,
                  b, normalize, verbose, alignweights)
예제 #6
0
def main(source_file, target_file, output_file, scorer_settings):
    """Score ``target_file`` against ``source_file`` and write the scores.

    A config is loaded for each entry of ``scorer_settings.models``, passed
    through ``fill_options``, given a ``'reload'`` entry pointing at the
    model and wrapped in an ``argparse.Namespace``. The namespaces are given
    to ``score_model``; its result is passed on to ``write_scores``.
    """
    namespaces = []
    for model_path in scorer_settings.models:
        config = load_config(model_path)
        fill_options(config)
        config['reload'] = model_path
        namespaces.append(argparse.Namespace(**config))

    computed_scores = score_model(source_file, target_file, scorer_settings,
                                  namespaces)
    write_scores(source_file, target_file, computed_scores, output_file,
                 scorer_settings)
예제 #7
0
    def _load_model_options(self):
        """
        Read the config of every model in ``self._models``, post-process it
        and store the list in ``self._options``.
        """
        loaded = []
        for model_path in self._models:
            config = load_config(model_path)
            loaded.append(config)
            # backward compatibility first ...
            fill_options(config)
            # ... then the dummy features that let a single-source model run
            # through the multi-source code
            dummy_options(config)

        self._options = loaded
예제 #8
0
파일: translate.py 프로젝트: nd1511/nematus
    def _load_model_options(self):
        """
        Build one ``argparse.Namespace`` of options per model and store them
        in ``self._options``; also keep the fourth value returned by
        ``load_dictionaries`` for the first model in ``self._num_to_target``.
        """

        self._options = []
        for model_path in self._models:
            settings = load_config(model_path)
            # fill_options is applied for backward compatibility
            fill_options(settings)
            settings['reload'] = model_path
            namespace = argparse.Namespace(**settings)
            self._options.append(namespace)

        first_model_options = self._options[0]
        _, _, _, self._num_to_target = load_dictionaries(first_model_options)
예제 #9
0
    def _load_model_options(self):
        """
        Read the config of every model in ``self._models`` and store the
        list in ``self._options``.

        Configs without a ``'concatenate_lm_decoder'`` entry get it set to
        ``False`` before ``fill_options`` runs.
        """
        loaded = []
        for model_path in self._models:
            config = load_config(model_path)
            if 'concatenate_lm_decoder' not in config:
                config['concatenate_lm_decoder'] = False
            loaded.append(config)
            # fill_options is applied for backward compatibility
            fill_options(config)

        self._options = loaded
예제 #10
0
def main(settings):
    """
    Translate ``settings.input`` into ``settings.output`` with an ensemble
    of the models listed in ``settings.models``.

    Sets up logging, creates a TensorFlow session (CPU-only when
    ``settings.cpu`` is set), builds one ``RNNModel`` per model config under
    its own variable scope, restores the variables and finally calls
    ``inference.translate_file``.
    """
    # Logging: DEBUG when verbose, INFO otherwise.
    log_level = logging.INFO
    if settings.verbose:
        log_level = logging.DEBUG
    logging.basicConfig(level=log_level, format='%(levelname)s: %(message)s')

    # TensorFlow session.
    if not settings.cpu:
        # NOTE(review): the visible GPU is hard-coded to device "2".
        os.environ["CUDA_VISIBLE_DEVICES"] = "2"
        tf_config = tf.ConfigProto()
    else:
        logging.info("using cpu now...")
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        tf_config = tf.ConfigProto(device_count={'GPU': 0})
    tf_config.allow_soft_placement = True
    session = tf.Session(config=tf_config)

    # One option namespace per model.
    configs = []
    for model_path in settings.models:
        raw_config = util.load_config(model_path)
        compat.fill_options(raw_config)
        raw_config['reload'] = model_path
        configs.append(argparse.Namespace(**raw_config))

    # Build each model graph in its own scope and restore its variables.
    logging.debug("Loading models")
    models = []
    for index, model_config in enumerate(configs):
        with tf.variable_scope("model%d" % index) as scope:
            rnn = rnn_model.RNNModel(model_config)
            _unused_saver = model_loader.init_or_restore_variables(
                model_config, session, ensemble_scope=scope)
            models.append(rnn)

    logging.debug("Models load done.")

    # Translate the source file.
    translate_kwargs = dict(
        input_file=settings.input,
        output_file=settings.output,
        session=session,
        models=models,
        configs=configs,
        beam_size=settings.beam_size,
        nbest=settings.n_best,
        minibatch_size=settings.minibatch_size,
        maxibatch_size=settings.maxibatch_size,
        normalization_alpha=settings.normalization_alpha,
    )
    inference.translate_file(**translate_kwargs)
예제 #11
0
파일: predict_punc.py 프로젝트: isofun/NLP
    def _load_model_options(self, options_load):
        """
        Loads config options and stores them in ``self._options``.

        When ``options_load`` is ``None`` one config is loaded per entry of
        ``self._models``; otherwise a single config is loaded from
        ``options_load``. Every loaded config is passed through
        ``fill_options`` (backward compatibility).
        """
        # Identity test: ``== None`` can be hijacked by a custom __eq__.
        if options_load is None:
            config_sources = self._models
        else:
            config_sources = [options_load]

        options = []
        for source in config_sources:
            options.append(load_config(source))
            # backward compatibility
            fill_options(options[-1])

        self._options = options
예제 #12
0
def main(models,
         source_file,
         target_file,
         saveto,
         b=80,
         normalization_alpha=0.0,
         verbose=False,
         alignweights=False,
         extra_sources=None,
         per_word=False):
    """Load the model options and run ``multi_rescore_model``.

    Args:
        models: iterable of model paths; one config is loaded per entry.
        source_file, target_file: passed through to ``multi_rescore_model``.
        saveto: open output file; its ``name`` attribute is used to open one
            additional handle per extra source.
        b, normalization_alpha, verbose, alignweights, per_word: passed
            through unchanged.
        extra_sources: optional list of additional sources. ``None`` (the
            default) or an empty list selects the single-source call, in
            which ``extra_sources`` is not forwarded at all.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if extra_sources is None:
        extra_sources = []

    # load model model_options
    options = []
    for model in models:
        options.append(load_config(model))
        fill_options(options[-1])

    # One output handle per source. With no extra sources this is just
    # [saveto].
    # NOTE(review): every extra handle reopens saveto.name in write mode,
    # i.e. the same path as ``saveto`` - confirm this is intended.
    savetos = [saveto] + [open(saveto.name, 'w') for _ in extra_sources]

    # extra_sources is only forwarded in the multi-source case, exactly as
    # the two original call sites did.
    optional_kwargs = {'per_word': per_word}
    if extra_sources:
        optional_kwargs['extra_sources'] = extra_sources

    multi_rescore_model(source_file,
                        target_file,
                        savetos,
                        models,
                        options,
                        b,
                        normalization_alpha,
                        verbose,
                        alignweights,
                        **optional_kwargs)
예제 #13
0
def main(models,
         saveto,
         bpe_file,
         save_alignment=None,
         k=5,
         normalize=False,
         n_process=5,
         chr_level=False,
         verbose=False,
         nbest=False,
         suppress_unk=False,
         a_json=False,
         print_word_probabilities=False,
         return_hyp_graph=False):
    """
    Load the models once, then serve translation requests over a socket.

    The function builds the vocabularies and the theano sampling functions
    for every model, binds a socket to (local hostname, port 12345) and then
    accepts connections until interrupted with Ctrl+C. Each accepted
    connection is handled by `_listen` in its own thread. `_listen` hands the
    connection to `_parallelized_main`, which repeatedly receives text,
    splits it into sentences, tokenizes and BPE-segments them, translates
    them with `n_process` worker processes running `translate_model`, and
    sends the detokenized result back. A message whose first sentence is
    "EOT" ends the conversation.

    `saveto` and `save_alignment` are open file-like objects (only `write`
    is called on them here); `bpe_file[0]` is opened as the BPE codes file.
    """
    # load model model_options
    options = []
    for model in models:
        options.append(load_config(model))

        fill_options(options[-1])

    # Only the first model's dictionaries are used; the last entry is the
    # target dictionary, all preceding ones are source dictionaries.
    dictionaries = options[0]['dictionaries']

    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            # Drop entries whose index is outside the configured vocabulary
            # size. Deleting while looping is safe here only because
            # items() returns a list under Python 2.
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        # indices 0 and 1 are overwritten with the special tokens
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # The helpers below are the per-request machinery used by the server.
    # Queues and worker processes are created per request inside
    # _parallelized_main and passed to the helpers explicitly.

    # utility function
    def _seqs2words(cc):
        """Map target indices to words, stopping at the first 0 (<eos>)."""
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f, processes, queue):
        """
        Convert every line of `f` to index sequences and put them on `queue`.

        Returns (number of lines, list of the split source sentences).
        NOTE(review): `idx` is never bound when `f` is empty, so the final
        `idx + 1` raises in that case.
        """
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()

            x = []
            for w in words:
                # factors of a word are separated by '|'; unknown factors
                # map to index 1 (UNK). Note the comprehension variable `f`
                # reuses the name of the input argument.
                w = [
                    word_dicts[i][f] if f in word_dicts[i] else 1
                    for (i, f) in enumerate(w.split('|'))
                ]
                if len(w) != options[0]['factors']:
                    sys.stderr.write(
                        'Error: expected {0} factors, but input word has {1}\n'
                        .format(options[0]['factors'], len(w)))
                    # stop all workers before aborting
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                x.append(w)

            # terminate the sentence with an all-zero (<eos>) factor vector
            x += [[0] * options[0]['factors']]
            queue.put((idx, x))
            source_sentences.append(words)
        return idx + 1, source_sentences

    def _finish_processes(queue):
        """Put one None per worker on `queue` (presumably the workers' stop
        signal - confirm against translate_model)."""
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples, processes, queue, rqueue):
        """
        Yield the `n_samples` results from `rqueue` in input order.

        Results arrive as (index, payload) and are buffered in `trans` until
        all earlier indices have been yielded.
        """
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = None
            while resp is None:
                try:
                    resp = rqueue.get(True, 5)
                # if queue is empty after 5s, check if processes are still alive
                except Empty:
                    for midx in xrange(n_process):
                        if not processes[midx].is_alive():
                            # kill all other processes and raise exception if one dies
                            queue.cancel_join_thread()
                            rqueue.cancel_join_thread()
                            # NOTE(review): this loop reuses the outer loop
                            # variable `idx`; harmless only because the
                            # function exits right after.
                            for idx in xrange(n_process):
                                processes[idx].terminate()
                            sys.stderr.write(
                                "Error: translate worker process {0} crashed with exitcode {1}"
                                .format(processes[midx].pid,
                                        processes[midx].exitcode))
                            sys.exit(1)
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx + 1),
                                                                  n_samples))
            # flush every result that is now contiguous with the output
            while out_idx < n_samples and trans[out_idx] != None:
                yield trans[out_idx]
                out_idx += 1

    def _parallelized_main(fs_init, fs_next, c, bpe, tokenizer, detokenizer):
        """
        Serve one client connection `c` until it sends "EOT".

        Every received message (up to 4096 bytes) is split into sentences,
        translated by freshly started worker processes and answered with the
        detokenized translation.
        """
        source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
        while source_file_t[0] != "EOT":
            for i in range(len(source_file_t)):
                # (a disabled experiment that piped each sentence through an
                # external truecase.perl script used to live here)
                source_file_t[i] = bpe.segment(
                    tokenizer.tokenize(source_file_t[i],
                                       return_str=True)).strip()
            print source_file_t
            detokenized = ''
            # new queues and workers are created for every received message
            queue = Queue()
            rqueue = Queue()
            processes = [None] * n_process
            for midx in xrange(n_process):
                processes[midx] = Process(
                    target=translate_model,
                    args=(queue, rqueue, midx, models, options, k, normalize,
                          verbose, nbest, save_alignment is not None,
                          suppress_unk, return_hyp_graph, fs_init, fs_next))
                processes[midx].start()

            n_samples, source_sentences = _send_jobs(source_file_t, processes,
                                                     queue)
            _finish_processes(queue)
            # results are pulled lazily from the generator below
            for i, trans in enumerate(
                    _retrieve_jobs(n_samples, processes, queue, rqueue)):
                print "NEXT SENTENCE:"
                if nbest:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph,
                                          detailed=True,
                                          highlight_best=True)
                    # hypotheses are written in ascending score order
                    order = numpy.argsort(scores)
                    for j in order:
                        if print_word_probabilities:
                            probs = " ||| " + " ".join(
                                "{0}".format(prob) for prob in word_probs[j])
                        else:
                            probs = ""
                        saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(
                            i, _seqs2words(samples[j]), scores[j], probs))
                        # print alignment matrix for each hypothesis
                        # header: sentence id ||| translation ||| score ||| source ||| source_token_count+eos
                        # translation_token_count+eos
                        if save_alignment is not None:
                            if a_json:
                                print_matrix_json(
                                    alignment[j], source_sentences[i],
                                    _seqs2words(samples[j]).split(), i, i + j,
                                    save_alignment)
                            else:
                                save_alignment.write(
                                    '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'
                                    .format(i, _seqs2words(samples[j]),
                                            scores[j],
                                            ' '.join(source_sentences[i]),
                                            len(source_sentences[i]) + 1,
                                            len(samples[j])))
                                print_matrix(alignment[j], save_alignment)
                else:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph,
                                          detailed=True,
                                          highlight_best=True)
                    # The 1-best translation is not written to `saveto` in
                    # this branch; it is accumulated in `detokenized` and
                    # sent back to the client after the loop.
                    x = _seqs2words(samples)
                    detokenized += detokenizer.detokenize(
                        (x.decode('utf-8') + " ").split(), return_str=True)
                    # upper-case the first character of the accumulated reply
                    # NOTE(review): raises IndexError if `detokenized` is
                    # still empty at this point.
                    detokenized = detokenized[0].upper() + detokenized[1:]
                    if print_word_probabilities:
                        for prob in word_probs:
                            saveto.write("{} ".format(prob))
                        saveto.write('\n')
                    if save_alignment is not None:
                        if a_json:
                            print_matrix_json(alignment, source_sentences[i],
                                              _seqs2words(trans[0]).split(), i,
                                              i, save_alignment)
                        else:
                            save_alignment.write(
                                '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.
                                format(i, _seqs2words(trans[0]), 0,
                                       ' '.join(source_sentences[i]),
                                       len(source_sentences[i]) + 1,
                                       len(trans[0])))
                            print_matrix(alignment, save_alignment)
            # strip the BPE continuation markers before replying, then wait
            # for the next message from the client
            c.send(detokenized.replace('@@ ', '').encode('utf-8').strip())
            source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
        c.close()
        sys.stderr.write('Done\n')

    def _listen(c, addr, fs_init, fs_next, tokenizer, detokenizer, bpe):
        """
        Per-connection handler: read an initial message, acknowledge it with
        "okay" and run `_parallelized_main` for the connection.

        The loop ends on a socket error or on Ctrl+C.
        """
        while True:
            try:  # Establish connection with client.
                try:
                    print 'Got connection from', addr
                    print "Receiving..."
                    fname = c.recv(4096)
                except socket.error:
                    c.close()
                    print "connection closed"
                    break
                print fname
                c.send("okay")
                # (a disabled 'exit' command check used to live here)
                try:
                    # The thread is joined immediately, so requests on one
                    # connection are handled one at a time.
                    t = threading.Thread(target=_parallelized_main,
                                         args=(fs_init, fs_next, c, bpe,
                                               tokenizer, detokenizer))
                    t.start()
                    t.join()
                except socket.error:
                    c.close()
                    break
            except KeyboardInterrupt as e:
                LOG.debug('Crtrl+C issued ...')
                LOG.info('Terminating server ...')
                try:
                    c.shutdown(socket.SHUT_RDWR)
                    c.close()
                # NOTE(review): bare except silently ignores any failure
                # while shutting the connection down.
                except:
                    pass
                break

    s = socket.socket()  # Create a socket object
    host = socket.gethostname()  # Get local machine name
    port = 12345  # Hard-coded service port.
    s.bind((host, port))  # Bind now; listen() is called in the accept loop below.

    # Beginning model loading
    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        # skip parameters whose name starts with 'adam_'
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(tparams,
                                       option,
                                       use_noise,
                                       trng,
                                       return_alignment=save_alignment
                                       is not None)

        fs_init.append(f_init)
        fs_next.append(f_next)
    # end of model loading
    tokenizer = moses.MosesTokenizer()
    detokenizer = moses.MosesDetokenizer()
    # start listening to connections once models are loaded
    # NOTE(review): `args` is not defined anywhere in this function;
    # presumably a module-level object - confirm, otherwise this line raises
    # NameError.
    args.codes = codecs.open(bpe_file[0], encoding='utf-8')
    bpe = BPE(args.codes, '@@')
    # accept loop: one handler thread per client, until Ctrl+C
    while True:
        try:
            s.listen(5)
            print("Waiting for connections and stuff...")
            c, addr = s.accept()
            t = threading.Thread(target=_listen,
                                 args=(c, addr, fs_init, fs_next, tokenizer,
                                       detokenizer, bpe))
            t.start()
        except KeyboardInterrupt:
            break
    s.close()
예제 #14
0
def main(models,
         source_file,
         saveto,
         save_alignment=None,
         k=5,
         normalize=False,
         n_process=5,
         chr_level=False,
         verbose=False,
         nbest=False,
         suppress_unk=False,
         a_json=False,
         print_word_probabilities=False):
    """Translate ``source_file`` with ``n_process`` worker processes.

    Args:
        models: model paths; one config is loaded per entry, and the
            dictionaries / vocabulary size / factor count are taken from
            the first one.
        source_file: iterable of input lines with a ``name`` attribute.
        saveto: writable file-like object receiving the translations.
        save_alignment: optional writable file-like object; when given,
            alignments are written to it (JSON form if ``a_json``).
        k, normalize, verbose, nbest, suppress_unk: forwarded to the
            ``translate_model`` workers.
        n_process: number of worker processes to start.
        chr_level: split input lines into characters instead of words.
        print_word_probabilities: also write the word probabilities.

    Exits the process with status 1 when an input word does not have the
    number of factors the first model expects.
    """
    # load model model_options
    options = []
    for model in models:
        options.append(load_config(model))
        fill_options(options[-1])

    dictionaries = options[0]['dictionaries']

    # the last dictionary is the target one, all others are source factors
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            # Iterate over a snapshot so entries can be deleted while
            # looping, independent of whether items() is a list or a view.
            for key, idx in list(word_dict.items()):
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(target=translate_model,
                                  args=(queue, rqueue, midx, models, options,
                                        k, normalize, verbose, nbest,
                                        save_alignment is not None,
                                        suppress_unk))
        processes[midx].start()

    # utility function
    def _seqs2words(cc):
        """Map target indices to words, stopping at the first 0 (<eos>)."""
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f):
        """Queue every line of ``f`` as an index sequence.

        Returns (number of queued sentences, list of split sentences).
        """
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()

            x = []
            for w in words:
                # factors are separated by '|'; unknown factors map to 1
                # (UNK). The comprehension variable no longer shadows ``f``.
                w = [
                    word_dicts[i][factor] if factor in word_dicts[i] else 1
                    for (i, factor) in enumerate(w.split('|'))
                ]
                if len(w) != options[0]['factors']:
                    sys.stderr.write(
                        'Error: expected {0} factors, but input word has {1}\n'
                        .format(options[0]['factors'], len(w)))
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                x.append(w)

            # terminate the sentence with an all-zero (<eos>) factor vector
            x += [[0] * options[0]['factors']]
            queue.put((idx, x))
            source_sentences.append(words)
        # len() equals idx + 1 for non-empty input and, unlike idx + 1, is
        # also defined (0) when the input has no lines at all.
        return len(source_sentences), source_sentences

    def _finish_processes():
        """Put one None per worker on the job queue."""
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples):
        """Yield the ``n_samples`` results in input order."""
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = rqueue.get()
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx + 1),
                                                                  n_samples))
            # flush every result that is now contiguous with the output
            while out_idx < n_samples and trans[out_idx] is not None:
                yield trans[out_idx]
                out_idx += 1

    sys.stderr.write('Translating {0} ...\n'.format(source_file.name))
    n_samples, source_sentences = _send_jobs(source_file)
    _finish_processes()

    for i, trans in enumerate(_retrieve_jobs(n_samples)):
        samples, scores, word_probs, alignment = trans
        if nbest:
            # hypotheses are written in ascending score order
            order = numpy.argsort(scores)
            for j in order:
                if print_word_probabilities:
                    probs = " ||| " + " ".join("{0}".format(prob)
                                               for prob in word_probs[j])
                else:
                    probs = ""
                saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(
                    i, _seqs2words(samples[j]), scores[j], probs))
                # print alignment matrix for each hypothesis
                # header: sentence id ||| translation ||| score ||| source ||| source_token_count+eos translation_token_count+eos
                if save_alignment is not None:
                    if a_json:
                        print_matrix_json(alignment[j], source_sentences[i],
                                          _seqs2words(samples[j]).split(), i,
                                          i + j, save_alignment)
                    else:
                        save_alignment.write(
                            '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                i, _seqs2words(samples[j]), scores[j],
                                ' '.join(source_sentences[i]),
                                len(source_sentences[i]) + 1, len(samples[j])))
                        print_matrix(alignment[j], save_alignment)
        else:
            saveto.write(_seqs2words(samples) + "\n")
            if print_word_probabilities:
                for prob in word_probs:
                    saveto.write("{} ".format(prob))
                saveto.write('\n')
            if save_alignment is not None:
                if a_json:
                    # Pass the alignment matrix (4th element of the result).
                    # The previous code passed trans[1], i.e. the scores,
                    # while the plain-text branch below used trans[3].
                    print_matrix_json(alignment, source_sentences[i],
                                      _seqs2words(samples).split(), i, i,
                                      save_alignment)
                else:
                    save_alignment.write(
                        '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                            i, _seqs2words(samples), 0,
                            ' '.join(source_sentences[i]),
                            len(source_sentences[i]) + 1, len(samples)))
                    print_matrix(alignment, save_alignment)

    sys.stderr.write('Done\n')
예제 #15
0
def main(models,
         source_file,
         saveto,
         save_alignment=None,
         k=5,
         normalize=False,
         n_process=5,
         chr_level=False,
         verbose=False,
         nbest=False,
         suppress_unk=False,
         print_word_probabilities=False,
         return_hyp_graph=False):
    """Translate ``source_file`` with worker processes and print the raw
    results.

    Args:
        models: model paths; one config is loaded per entry, and the
            dictionaries / vocabulary size / factor count come from the
            first one.
        source_file: iterable of input lines with a ``name`` attribute.
        saveto, print_word_probabilities: accepted for interface
            compatibility; not used by this function.
        save_alignment: only tested for ``None`` and forwarded as a flag.
        k, normalize, verbose, nbest, suppress_unk, return_hyp_graph:
            forwarded to the ``translate_model`` workers.
        n_process: number of worker processes to start.
        chr_level: split input lines into characters instead of words.

    Exits the process with status 1 if a worker is found dead while results
    are still outstanding.
    """
    options = []
    for model in models:  # actually, there is only one model
        options.append(load_config(model))
        fill_options(options[-1])

    dictionaries = options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]  # 0 - n-1 are source dictionaries
    dictionary_target = dictionaries[-1]

    # load source dictionaries and invert
    word_dicts = []  # list of word-id mapping
    word_idicts = []  # list of id-word mapping
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            # Iterate over a snapshot so entries can be deleted while
            # looping, independent of whether items() is a list or a view.
            for kk, vv in list(word_dict.items()):
                if vv >= options[0]['n_words_src']:
                    del word_dict[kk]

        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionaries and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues; the Queues carry jobs and results
    # between this process and the workers
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for pidx in xrange(n_process):
        processes[pidx] = Process(target=translate_model,
                                  args=(queue, rqueue, pidx, models, options,
                                        k, normalize, verbose, nbest,
                                        save_alignment is not None,
                                        suppress_unk, return_hyp_graph))
        processes[pidx].start()

    # put data into queue
    def _send_jobs(f):
        """Queue every line of ``f`` as an index sequence.

        Returns (number of queued sentences, list of split sentences).
        """
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:  # into single characters
                words = list(line.decode('utf-8').strip())
            else:  # into words (separated by spaces)
                words = line.strip().split()

            x = []
            for w in words:
                # factors are separated by '|'; unknown factors map to 1
                # (UNK). The comprehension variable no longer shadows ``f``.
                w = [
                    word_dicts[i][factor] if factor in word_dicts[i] else 1
                    for (i, factor) in enumerate(w.split('|'))
                ]
                x.append(w)
            x += [[0] * options[0]['factors']]  # end with "EOS"
            queue.put((idx, x))
            source_sentences.append(words)

        # len() equals idx + 1 for non-empty input and, unlike idx + 1, is
        # also defined (0) when the input has no lines at all.
        return len(source_sentences), source_sentences

    def _finish_processes():
        """Put one None per worker on the job queue."""
        for midx in xrange(n_process):
            queue.put(None)

    # this inner function is used to get translation results
    def _retrieve_jobs(n_samples):
        """Yield the ``n_samples`` results in input order."""
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = None
            while resp is None:
                try:
                    resp = rqueue.get(True, 5)
                # nothing arrived within 5s: check the workers are alive
                except Empty:
                    for midx in xrange(n_process):
                        if not processes[midx].is_alive():
                            # One worker died: stop ALL workers, then exit.
                            # The exit used to sit inside the terminate
                            # loop, so only the first worker was killed.
                            queue.cancel_join_thread()
                            rqueue.cancel_join_thread()
                            for proc in processes:
                                proc.terminate()
                            sys.stderr.write(
                                "Error: translate worker process {0} crashed with exitcode {1}\n"
                                .format(processes[midx].pid,
                                        processes[midx].exitcode))
                            sys.exit(1)
            trans[resp[0]] = resp[1]
            # flush every result that is now contiguous with the output
            while out_idx < n_samples and trans[out_idx] is not None:
                yield trans[out_idx]
                out_idx += 1

    sys.stderr.write('Translating...{0}\n'.format(source_file.name))
    n_samples, source_sentences = _send_jobs(source_file)
    _finish_processes()

    for i, trans in enumerate(_retrieve_jobs(n_samples)):
        # parenthesised form prints the same under Python 2 and also parses
        # under Python 3
        print(trans)
def theano_to_tensorflow_config(model_path):
    """Load the config stored for ``model_path`` as an ``argparse.Namespace``.

    The config is read with ``util.load_config`` and passed through
    ``compat.fill_options`` before the ``reload`` and ``prior_model``
    entries are reset to ``None``.

    :param model_path: model location handed to ``util.load_config``
    :return: ``argparse.Namespace`` with one attribute per config entry
    """
    settings = util.load_config(model_path)
    compat.fill_options(settings)
    # Neither entry is carried over from the stored config.
    for cleared_key in ('reload', 'prior_model'):
        settings[cleared_key] = None
    namespace = argparse.Namespace(**settings)
    return namespace
예제 #17
0
def main(models, source_file, saveto, save_alignment=None, k=5,
         normalize=False, n_process=5, chr_level=False, verbose=False,
         nbest=False, suppress_unk=False, a_json=False,
         print_word_probabilities=False, return_hyp_graph=False):
    """Translate ``source_file`` with a pool of worker processes.

    Every input line is converted to factor ids, queued for the workers
    (``translate_model``), and the results are written to ``saveto`` in the
    original sentence order.

    :param models: model paths; each is read with ``load_config`` and the
        whole list is forwarded to ``translate_model``
    :param source_file: open input file; iterated line by line, and its
        ``name`` attribute is used in the progress message
    :param saveto: writable object that receives the translations
    :param save_alignment: writable object for alignment output, or ``None``
        to skip alignment output
    :param k: forwarded to ``translate_model`` (presumably the beam size --
        confirm against ``translate_model``)
    :param normalize: forwarded to ``translate_model``
    :param n_process: number of worker processes to start
    :param chr_level: split each line into characters (after UTF-8 decoding)
        instead of whitespace-separated tokens
    :param verbose: forwarded to ``translate_model``; also enables a progress
        message on stderr every 10 retrieved samples
    :param nbest: write every hypothesis as ``id ||| text ||| score`` rather
        than a single translation per line
    :param suppress_unk: forwarded to ``translate_model``
    :param a_json: write alignments with ``print_matrix_json`` instead of
        ``print_matrix``
    :param print_word_probabilities: also write the per-word probabilities
    :param return_hyp_graph: forwarded to ``translate_model``; when truthy the
        value is also passed to ``HypGraphRenderer.save_png`` (presumably an
        output path -- confirm against the renderer)

    Calls ``sys.exit(1)`` after terminating the workers when an input word
    does not have the configured number of factors.
    """
    # load model model_options
    options = []
    for model in models:
        options.append(load_config(model))
        fill_options(options[-1])

    dictionaries = options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]
    n_factors = options[0]['factors']

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            # Iterate over a snapshot so entries can be deleted safely.
            for key, idx in list(word_dict.items()):
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(
            target=translate_model,
            args=(queue, rqueue, midx, models, options, k, normalize, verbose,
                  nbest, save_alignment is not None, suppress_unk,
                  return_hyp_graph))
        processes[midx].start()

    def _seqs2words(cc):
        """Map target ids to a space-joined string, stopping at id 0 (EOS)."""
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f):
        """Queue one job per input line; return (job count, tokenised lines)."""
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()

            x = []
            for w in words:
                factors = w.split('|')
                # Validate the factor count before the dictionary lookup so
                # that a word with too many factors produces this message
                # rather than an IndexError on word_dicts.
                if len(factors) != n_factors:
                    sys.stderr.write('Error: expected {0} factors, but input word has {1}\n'.format(n_factors, len(factors)))
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                # Unknown factors map to id 1 (UNK).
                x.append([word_dicts[i][factor] if factor in word_dicts[i] else 1
                          for (i, factor) in enumerate(factors)])

            x += [[0] * n_factors]  # end with EOS
            queue.put((idx, x))
            source_sentences.append(words)
        # len() instead of idx+1: also correct (0) for an empty input file,
        # where the loop variable would never be bound.
        return len(source_sentences), source_sentences

    def _finish_processes():
        """Send one ``None`` sentinel per worker."""
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples):
        """Yield results in input order as soon as they become contiguous."""
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = rqueue.get()
            trans[resp[0]] = resp[1]
            if verbose and idx % 10 == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx+1), n_samples))
            while out_idx < n_samples and trans[out_idx] is not None:
                yield trans[out_idx]
                out_idx += 1

    sys.stderr.write('Translating {0} ...\n'.format(source_file.name))
    n_samples, source_sentences = _send_jobs(source_file)
    _finish_processes()

    for i, trans in enumerate(_retrieve_jobs(n_samples)):
        # Both output modes receive the same 5-tuple from the workers.
        samples, scores, word_probs, alignment, hyp_graph = trans
        if return_hyp_graph:
            renderer = HypGraphRenderer(hyp_graph)
            renderer.wordify(word_idict_trg)
            renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)

        if nbest:
            order = numpy.argsort(scores)
            for j in order:
                hypothesis = _seqs2words(samples[j])
                if print_word_probabilities:
                    probs = " ||| " + " ".join("{0}".format(prob) for prob in word_probs[j])
                else:
                    probs = ""
                saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(i, hypothesis, scores[j], probs))
                # print alignment matrix for each hypothesis
                # header: sentence id ||| translation ||| score ||| source ||| source_token_count+eos translation_token_count+eos
                if save_alignment is not None:
                    if a_json:
                        print_matrix_json(alignment[j], source_sentences[i], hypothesis.split(), i, i+j, save_alignment)
                    else:
                        save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                            i, hypothesis, scores[j], ' '.join(source_sentences[i]), len(source_sentences[i])+1, len(samples[j])))
                        print_matrix(alignment[j], save_alignment)
        else:
            translation = _seqs2words(samples)
            saveto.write(translation + "\n")
            if print_word_probabilities:
                for prob in word_probs:
                    saveto.write("{} ".format(prob))
                saveto.write('\n')
            if save_alignment is not None:
                if a_json:
                    print_matrix_json(alignment, source_sentences[i], translation.split(), i, i, save_alignment)
                else:
                    # The score field is written as a literal 0 in this mode.
                    save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                        i, translation, 0, ' '.join(source_sentences[i]), len(source_sentences[i])+1, len(samples)))
                    print_matrix(alignment, save_alignment)

    sys.stderr.write('Done\n')
예제 #18
0
def main(models, source_file, saveto, save_alignment=None, k=5,
         normalization_alpha=0.0, n_process=5, chr_level=False, verbose=False, nbest=False, suppress_unk=False, a_json=False, print_word_probabilities=False, return_hyp_graph=False, device_list=[]):
    # load model model_options
    options = []
    for model in models:
        options.append(load_config(model))
        fill_options(options[-1])

    dictionaries = options[0]['dictionaries']

    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    print 'input dict - 100 most common'
    for i in xrange(100):
        print i, " ", word_idict[i]

    print 'output dict - 100 most common'
    for i in xrange(100):
        print i, " ", word_idict_trg[i]

    # create input and output queues for processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        deviceid = ''
        if device_list is not None and len(device_list) != 0:
            deviceid = device_list[midx % len(device_list)].strip()
        processes[midx] = Process(
            target=translate_model,
            args=(queue, rqueue, midx, models, options, k, normalization_alpha, verbose, nbest, save_alignment is not None, suppress_unk, return_hyp_graph, deviceid))
        processes[midx].start()

    # utility function
    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f):
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()

            x = []
            for w in words:
                w = [word_dicts[i][f] if f in word_dicts[i] else 1 for (i,f) in enumerate(w.split('|'))]
                if len(w) != options[0]['factors']:
                    sys.stderr.write('Error: expected {0} factors, but input word has {1}\n'.format(options[0]['factors'], len(w)))
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                x.append(w)

            x += [[0]*options[0]['factors']]
            queue.put((idx, x))
            source_sentences.append(words)
        return idx+1, source_sentences

    def _finish_processes():
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples):
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = None
            while resp is None:
                try:
                    resp = rqueue.get(True, 5)
                # if queue is empty after 5s, check if processes are still alive
                except Empty:
                    for midx in xrange(n_process):
                        if not processes[midx].is_alive() and processes[midx].exitcode != 0:
                            # kill all other processes and raise exception if one dies
                            queue.cancel_join_thread()
                            rqueue.cancel_join_thread()
                            for idx in xrange(n_process):
                                processes[idx].terminate()
                            sys.stderr.write("Error: translate worker process {0} crashed with exitcode {1}".format(processes[midx].pid, processes[midx].exitcode))
                            sys.exit(1)
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx+1), n_samples))
            while out_idx < n_samples and trans[out_idx] != None:
                yield trans[out_idx]
                out_idx += 1

    sys.stderr.write('Translating {0} ...\n'.format(source_file.name))
    n_samples, source_sentences = _send_jobs(source_file)
    _finish_processes()

    for i, trans in enumerate(_retrieve_jobs(n_samples)):
        if nbest:
            samples, scores, word_probs, alignment, hyp_graph = trans
            if return_hyp_graph:
                renderer = HypGraphRenderer(hyp_graph)
		renderer.wordify(word_idict_trg)
                renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)
            order = numpy.argsort(scores)
            for j in order:
                if print_word_probabilities:
                    probs = " ||| " + " ".join("{0}".format(prob) for prob in word_probs[j])
                else:
                    probs = ""
                saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(i, _seqs2words(samples[j]), scores[j], probs))
                # print alignment matrix for each hypothesis
                # header: sentence id ||| translation ||| score ||| source ||| source_token_count+eos translation_token_count+eos
                if save_alignment is not None:
                    if a_json:
                        print_matrix_json(alignment[j], source_sentences[i], _seqs2words(samples[j]).split(), i, i+j,save_alignment)
                    else:
                        save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                             i, _seqs2words(samples[j]), scores[j], ' '.join(source_sentences[i]) , len(source_sentences[i])+1, len(samples[j])))
                        print_matrix(alignment[j], save_alignment)
        else:
            samples, scores, word_probs, alignment, hyp_graph = trans
            if return_hyp_graph:
                renderer = HypGraphRenderer(hyp_graph)
		renderer.wordify(word_idict_trg)
                renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)
            saveto.write(_seqs2words(samples) + "\n")
            if i%1==0:
                print 'input:'
                print ' '.join(source_sentences[i])
                print 'output:'
                print _seqs2words(samples) + "\n"
            if print_word_probabilities:
                for prob in word_probs:
                    saveto.write("{} ".format(prob))
                saveto.write('\n')
            if save_alignment is not None:
                if a_json:
                    print_matrix_json(alignment, source_sentences[i], _seqs2words(trans[0]).split(), i, i,save_alignment)
                else:
                    save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                         i, _seqs2words(trans[0]), 0, ' '.join(source_sentences[i]) , len(source_sentences[i])+1, len(trans[0])))
                    print_matrix(alignment, save_alignment)

    sys.stderr.write('Done\n')