def encode_model(queue, rqueue, pid, model, options):
    """Worker loop that encodes queued source sentences with one model.

    The sampler functions for ``model`` are built once. Afterwards every
    ``(idx, sentence)`` request taken from ``queue`` is passed through
    ``f_init`` and ``(idx, code)`` is put on ``rqueue``. A ``None`` request
    ends the loop.

    :param queue: object whose ``get()`` yields requests or ``None``.
    :param rqueue: object whose ``put()`` receives ``(idx, code)`` tuples.
    :param pid: worker identifier, only used in the progress output.
    :param model: model reference handed to ``load_params``.
    :param options: model options handed to ``init_params`` and
        ``build_sampler``.
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    rng_streams = RandomStreams(1234)

    # Allocate parameters, overwrite them with the stored ones and convert
    # them for use by the sampler.
    fresh_params = init_params(options)
    loaded_params = load_params(model, fresh_params)
    shared_params = init_tparams(loaded_params)

    # f_next is returned alongside f_init but is not needed for encoding.
    f_init, f_next = build_sampler(shared_params, options, rng_streams)

    def _encode(sentence):
        # The sentence is fed as a (len(sentence), 1) array; the second
        # output of f_init is the code that gets returned.
        column = numpy.array(sentence).reshape([len(sentence), 1])
        return f_init(column)[1]

    request = queue.get()
    while request is not None:
        idx, x = request[0], request[1]
        print(pid, '-', idx)
        rqueue.put((idx, _encode(x)))
        request = queue.get()
Esempio n. 2
0
def translate_model(queue, rqueue, pid, models, options, k, normalize, verbose,
                    nbest, return_alignment, suppress_unk):
    """Worker loop translating queued inputs with one or more models.

    One ``(f_init, f_next)`` pair is built per ``(model, option)`` pair and
    all of them are handed to ``gen_sample`` together. Requests are
    ``(idx, x)`` tuples read from ``queue``; ``(idx, result)`` is put on
    ``rqueue``. A ``None`` request ends the loop.

    :param pid: worker identifier, only used in the verbose log line.
    :param models: model references, paired positionally with ``options``.
    :param k: forwarded to ``gen_sample`` as ``k``.
    :param normalize: if true, divide each score by its sample length.
    :param verbose: if true, write ``"pid - idx"`` to stderr per request.
    :param nbest: if true, return all candidates; otherwise only the one
        with the lowest score.
    :param return_alignment: forwarded to ``build_sampler`` and
        ``gen_sample``.
    :param suppress_unk: forwarded to ``gen_sample``.
    """
    from nmt import (build_sampler, gen_sample, load_params,
                     init_params, init_tparams)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []
    for model_ref, model_options in zip(models, options):
        # Allocate, load and convert the parameters of this model.
        allocated = init_params(model_options)
        restored = load_params(model_ref, allocated)
        tparams = init_tparams(restored)

        init_fn, next_fn = build_sampler(tparams, model_options, use_noise,
                                         trng,
                                         return_alignment=return_alignment)
        fs_init.append(init_fn)
        fs_next.append(next_fn)

    def _translate(seq):
        # Each element s of seq is transposed and reshaped to
        # (len(s[0]), len(s), 1) before sampling.
        batches = []
        for s in seq:
            batches.append(
                numpy.array(s).T.reshape([len(s[0]), len(s), 1]))

        sample, score, word_probs, alignment = gen_sample(
            fs_init, fs_next, batches,
            trng=trng, k=k, maxlen=200, stochastic=False, argmax=False,
            return_alignment=return_alignment, suppress_unk=suppress_unk)

        # normalize scores according to sequence lengths
        if normalize:
            score = score / numpy.array([len(s) for s in sample])

        if not nbest:
            best = numpy.argmin(score)
            return (sample[best], score[best],
                    word_probs[best], alignment[best])
        return sample, score, word_probs, alignment

    request = queue.get()
    while request is not None:
        idx, x = request[0], request[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid,idx))
        rqueue.put((idx, _translate(x)))
        request = queue.get()
Esempio n. 3
0
def encode_model(queue, rqueue, pid, model, options):
    """Serve encoding requests from ``queue`` until ``None`` is received.

    Each request is an ``(idx, sentence)`` pair. The sentence is passed to
    the ``f_init`` function built for ``model`` and ``(idx, code)`` is put
    on ``rqueue``.

    :param queue: object whose ``get()`` yields requests or ``None``.
    :param rqueue: object whose ``put()`` receives ``(idx, code)`` tuples.
    :param pid: worker identifier, only used in the progress output.
    :param model: model reference handed to ``load_params``.
    :param options: model options handed to ``init_params`` and
        ``build_sampler``.
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # f_next is built together with f_init but is not used for encoding
    f_init, f_next = build_sampler(tparams, options, trng)

    def _encode(seq):
        # feed the sentence as a (len(seq), 1) array and keep the second
        # output of f_init
        return f_init(numpy.array(seq).reshape([len(seq), 1]))[1]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        # A single pre-formatted argument prints "pid - idx" both as a
        # Python 2 print statement and as a Python 3 print() call; the
        # former statement form was a SyntaxError on Python 3.
        print('%s - %s' % (pid, idx))
        rqueue.put((idx, _encode(x)))
def encode_model(queue, rqueue, pid, model, options):
    """Encode sentences taken from ``queue`` and publish them on ``rqueue``.

    Requests are ``(idx, sentence)`` tuples; a ``None`` request terminates
    the worker. For every request ``(idx, code)`` is put on ``rqueue``,
    where ``code`` is the second output of the model's ``f_init`` function.

    :param queue: object whose ``get()`` yields requests or ``None``.
    :param rqueue: object whose ``put()`` receives ``(idx, code)`` tuples.
    :param pid: worker identifier, only used in the progress output.
    :param model: model reference handed to ``load_params``.
    :param options: model options handed to ``init_params`` and
        ``build_sampler``.
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    # Allocate the parameters, then overwrite them with the stored values
    # and convert them for the sampler.
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # Only f_init is used below.
    f_init, f_next = build_sampler(tparams, options, trng)

    def _encode(seq):
        # The sentence is reshaped to (len(seq), 1) before being encoded.
        code = f_init(numpy.array(seq).reshape([len(seq), 1]))[1]
        return code

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        # Formatting up front keeps the output "pid - idx" identical under
        # Python 2 and Python 3 (the print statement is Python 2 only).
        print('%s - %s' % (pid, idx))
        cod = _encode(x)

        rqueue.put((idx, cod))
Esempio n. 5
0
def translate_model(queue, rqueue, pid, models, options, k, normalize, verbose, nbest, return_alignment, suppress_unk):
    """Worker loop translating queued sentences with an ensemble of models.

    A sampler pair is built for every ``(model, option)`` pair; all pairs
    are passed to ``gen_sample`` at once. Requests are ``(idx, x)`` tuples
    from ``queue``; ``(idx, result)`` is put on ``rqueue`` and a ``None``
    request stops the worker.

    :param pid: worker identifier, only used in the verbose log line.
    :param k: forwarded to ``gen_sample`` as ``k``.
    :param normalize: if true, divide each score by its sample length.
    :param verbose: if true, write ``"pid - idx"`` to stderr per request.
    :param nbest: if true, return every candidate; otherwise only the one
        with the lowest score.
    :param return_alignment: forwarded to ``build_sampler`` and
        ``gen_sample``.
    :param suppress_unk: forwarded to ``gen_sample``.
    """
    from nmt import (build_sampler, gen_sample, load_params,
                     init_params, init_theano_params)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []
    for model_ref, model_options in zip(models, options):
        # Allocate, load and convert the parameters of this model.
        allocated = init_params(model_options)
        restored = load_params(model_ref, allocated)
        tparams = init_theano_params(restored)

        init_fn, next_fn = build_sampler(
            tparams, model_options, use_noise, trng,
            return_alignment=return_alignment)
        fs_init.append(init_fn)
        fs_next.append(next_fn)

    def _translate(seq):
        # The input is transposed and reshaped to
        # (len(seq[0]), len(seq), 1) before sampling.
        source = numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1])
        sample, score, word_probs, alignment = gen_sample(
            fs_init, fs_next, source,
            trng=trng, k=k, maxlen=200,
            stochastic=False, argmax=False,
            return_alignment=return_alignment, suppress_unk=suppress_unk)

        # normalize scores according to sequence lengths
        if normalize:
            score = score / numpy.array([len(s) for s in sample])

        if not nbest:
            best = numpy.argmin(score)
            return (sample[best], score[best],
                    word_probs[best], alignment[best])
        return sample, score, word_probs, alignment

    request = queue.get()
    while request is not None:
        idx, x = request[0], request[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid,idx))
        rqueue.put((idx, _translate(x)))
        request = queue.get()
Esempio n. 6
0
def rescore_model(source_file, nbest_file, saveto, models, options, b, normalize, verbose):
    """Append one score per model to every line of an n-best list.

    Each n-best line is split on ``' ||| '``: the first field is the integer
    index of the source sentence, the second field is the candidate. Source
    and candidate are written pairwise to two temporary files, scored with
    ``pred_probs`` for every model, and the scores are appended to the
    stripped original line in ``saveto``.

    :param source_file: open file with one source sentence per line.
    :param nbest_file: open file with the n-best list.
    :param saveto: open file receiving the rescored lines.
    :param models: model references, paired positionally with ``options``.
    :param options: per-model options; ``options[0]`` supplies the
        dictionaries and vocabulary sizes for the iterator.
    :param b: batch size handed to ``TextIterator``.
    :param normalize: forwarded to ``pred_probs``.
    :param verbose: not used in this function.
    """
    trng = RandomStreams(1234)

    fs_log_probs = []
    for model, option in zip(models, options):
        # Allocate, load and convert the parameters of this model.
        params = load_params(model, init_params(option))
        tparams = init_tparams(params)

        (trng, use_noise,
         x, x_mask, y, y_mask,
         opt_ret,
         cost) = build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        fs_log_probs.append(theano.function(inps, cost))

    def _score(pairs):
        # One result per model, in the order the models were given.
        return [pred_probs(scorer, prepare_data, options[i], pairs,
                           normalize=normalize)
                for i, scorer in enumerate(fs_log_probs)]

    lines = source_file.readlines()
    nbest_lines = nbest_file.readlines()

    with tempfile.NamedTemporaryFile(prefix='rescore-tmpin') as tmp_in, \
            tempfile.NamedTemporaryFile(prefix='rescore-tmpout') as tmp_out:
        for entry in nbest_lines:
            fields = entry.split(' ||| ')
            source_idx = int(fields[0])
            tmp_in.write(lines[source_idx])
            tmp_out.write(fields[1] + '\n')
        tmp_in.seek(0)
        tmp_out.seek(0)

        # TODO: sorting by length could be more efficient, but we'd have to
        # synchronize scores with n-best list after
        pairs = TextIterator(
            tmp_in.name, tmp_out.name,
            options[0]['dictionaries'][0], options[0]['dictionaries'][1],
            n_words_source=options[0]['n_words_src'],
            n_words_target=options[0]['n_words'],
            batch_size=b,
            maxlen=float('inf'),
            sort_by_length=False)

        scores = _score(pairs)
        for i, entry in enumerate(nbest_lines):
            score_str = ' '.join(str(model_scores[i])
                                 for model_scores in scores)
            saveto.write('{0} {1}\n'.format(entry.strip(), score_str))
Esempio n. 7
0
def translate_model(queue, rqueue, mask_left, mask_right, write_mask, eots,
                    model, options, k, normalize):
    """Translate every input in ``queue`` and append results to ``rqueue``.

    Unlike the queue-based workers, ``queue`` and the mask sequences are
    iterated in lockstep with ``zip`` and ``rqueue`` is extended with
    ``append``. Iteration stops early at the first ``None`` input.

    :param queue: iterable of source sequences.
    :param rqueue: list-like object receiving the best sample per input.
    :param mask_left: per-input array, passed on with a trailing axis added.
    :param mask_right: per-input array, passed on with a trailing axis added.
    :param write_mask: per-input value passed on unchanged.
    :param eots: per-input array, passed on with a trailing axis added.
    :param model: model reference handed to ``load_params``.
    :param options: model options for ``build_sampler`` / ``gen_sample``.
    :param k: forwarded to ``gen_sample`` as ``k``.
    :param normalize: if true, divide each score by its sample length.
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # Parameters are loaded directly from the model; this variant does not
    # allocate them with init_params first.
    params = load_params(model)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq, left, right, write, eot):
        # sample given an input sequence and obtain scores
        # Pre-formatted so the output is the same space-separated line
        # under Python 2 and Python 3.
        print('%s %s %s %s' % (left.shape, right.shape, write.shape,
                               len(seq)))
        sample, score = gen_sample(tparams,
                                   f_init,
                                   f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   left[:, :, None],
                                   right[:, :, None],
                                   write,
                                   eot[:, None],
                                   options,
                                   trng=trng,
                                   k=k,
                                   maxlen=200,
                                   stochastic=False,
                                   argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    inputs = zip(queue, mask_left, mask_right, write_mask, eots)
    for idx, (x, left, right, write, eot) in enumerate(inputs):
        if x is None:
            break

        print(idx)
        rqueue.append(_translate(x, left, right, write, eot))
Esempio n. 8
0
def translate_model(queue, rqueue, pid, model, options, k, normalize, n_best):
    """Worker loop translating queued sentences and returning the best ones.

    Requests are ``(idx, sentence)`` tuples read from ``queue``; a ``None``
    request stops the worker. For each request ``(idx, samples, scores)``
    is put on ``rqueue``.

    :param pid: worker identifier, only used in the progress output.
    :param model: model reference handed to ``load_params``.
    :param options: model options for the sampler and ``gen_sample``.
    :param k: forwarded to ``gen_sample`` as ``k``.
    :param normalize: if true, divide each score by its sample length.
    :param n_best: when greater than 1, the ``n_best`` lowest-scoring
        candidates are returned in ascending score order; otherwise only
        the single lowest-scoring candidate.
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    trng = RandomStreams(1234)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score = gen_sample(
            tparams,
            f_init,
            f_next,
            numpy.array(seq).reshape([len(seq), 1]),
            options,
            trng=trng,
            k=k,
            maxlen=200,
            stochastic=False,
            argmax=False,
        )

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        if n_best > 1:
            sidx = numpy.argsort(score)[:n_best]
        else:
            sidx = numpy.argmin(score)
        return numpy.array(sample)[sidx], numpy.array(score)[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        # Pre-formatted so the line reads "pid - idx" under both Python 2
        # and Python 3; the print statement is a SyntaxError on Python 3.
        print("%s - %s" % (pid, idx))
        seq, scores = _translate(x)

        rqueue.put((idx, seq, scores))
Esempio n. 9
0
def translate_model(queue, rqueue, pid, model, options, k, normalize, n_best):
    """Worker loop translating queued sentences and returning the best ones.

    Requests are ``(idx, sentence)`` tuples read from ``queue``; a ``None``
    request stops the worker. For each request ``(idx, samples, scores)``
    is put on ``rqueue``.

    :param pid: worker identifier, only used in the progress output.
    :param model: model reference handed to ``load_params``.
    :param options: model options for the sampler and ``gen_sample``.
    :param k: forwarded to ``gen_sample`` as ``k``.
    :param normalize: if true, divide each score by its sample length.
    :param n_best: when greater than 1, the ``n_best`` lowest-scoring
        candidates are returned in ascending score order; otherwise only
        the single lowest-scoring candidate.
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    # Shared float32 flag initialised to 0 and handed to build_sampler.
    use_noise = shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score = gen_sample(tparams,
                                   f_init,
                                   f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   options,
                                   trng=trng,
                                   k=k,
                                   maxlen=200,
                                   stochastic=False,
                                   argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        if n_best > 1:
            sidx = numpy.argsort(score)[:n_best]
        else:
            sidx = numpy.argmin(score)
        return numpy.array(sample)[sidx], numpy.array(score)[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        # Pre-formatted so the line reads "pid - idx" under both Python 2
        # and Python 3; the print statement is a SyntaxError on Python 3.
        print('%s - %s' % (pid, idx))
        seq, scores = _translate(x)

        rqueue.put((idx, seq, scores))
Esempio n. 10
0
    def __init__(self, trained_model):
        """Load options, dictionaries and parameters, then build the sampler.

        Reads ``<trained_model>.pkl`` for the options, loads the source
        dictionary (and the target dictionary when a second one is listed
        in ``options['dictionaries']``), builds index-to-word mappings and
        compiles ``f_init`` / ``f_next`` with ``build_sampler``.

        :param trained_model: model path prefix; ``.pkl`` is appended for
            the options file and the value itself goes to ``load_params``.
        """
        # load model model_options
        # NOTE(review): pickle files must come from a trusted source.
        with open('%s.pkl' % trained_model, 'rb') as f:
            self.options = pkl.load(f)

        logging.info(self.options)
        dictionaries = self.options['dictionaries']
        src_dict = os.path.join(self.options['baseDir'], dictionaries[0])
        if len(dictionaries) == 1:
            target_dict = None
        else:
            target_dict = os.path.join(self.options['baseDir'],
                                       dictionaries[1])

        # load source dictionary and invert
        # .items() replaces the Python 2 only .iteritems(); it yields the
        # same pairs on both major versions.
        with open(src_dict, 'rb') as f:
            self.word_dict = pkl.load(f)
        self.word_idict = {vv: kk for kk, vv in self.word_dict.items()}
        self.word_idict[0] = 'EOS'
        self.word_idict[1] = 'UNK'

        # load target dictionary and invert
        if target_dict is None:
            # A single dictionary is shared by source and target side.
            self.word_dict_trg = self.word_dict
            self.word_idict_trg = self.word_idict
        else:
            with open(target_dict, 'rb') as f:
                self.word_dict_trg = pkl.load(f)
            self.word_idict_trg = {
                vv: kk for kk, vv in self.word_dict_trg.items()}
            self.word_idict_trg[0] = 'EOS'
            self.word_idict_trg[1] = 'UNK'

        from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
        self.trng = RandomStreams(1234)

        # allocate model parameters
        params = init_params(self.options)

        # load model parameters and set theano shared variables
        self.params = load_params(trained_model, params)
        self.tparams = init_tparams(params)

        # word index
        # This line used to be indented with spaces followed by a tab,
        # which Python 3 rejects with TabError.
        use_noise = theano.shared(numpy.float32(0.))
        self.f_init, self.f_next = build_sampler(self.tparams, self.options,
                                                 self.trng, use_noise)
Esempio n. 11
0
def translate_model(queue, rqueue, pid, model, options, k, normalize):
    """Worker loop translating queued sentences with a single model.

    Requests are ``(idx, sentence)`` tuples read from ``queue``; a ``None``
    request stops the worker. The lowest-scoring sample for each request is
    put on ``rqueue`` as ``(idx, sample)``.

    Note that ``options['maxlen']`` is overwritten with 150 on the dict
    passed in by the caller.

    :param pid: worker identifier, only used in the progress output.
    :param model: model reference handed to ``load_params``.
    :param options: model options; mutated as described above.
    :param k: forwarded to ``gen_sample`` as ``k``.
    :param normalize: if true, divide each score by its sample length.
    """
    import theano
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    trng = RandomStreams(1234)
    # NOTE(review): use_noise is created but never passed on in this
    # variant - confirm whether build_sampler expects it.
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    maxlen = 150
    options['maxlen'] = maxlen
    f_init, f_next = build_sampler(tparams, options, trng)

    def _translate(seq):
        sample, score = gen_sample(tparams,
                                   f_init,
                                   f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   options,
                                   trng=trng,
                                   k=k,
                                   maxlen=maxlen,
                                   stochastic=False)
        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    while True:
        req = queue.get()
        # Identity test: "== None" dispatches to the request's __eq__ and
        # is not a reliable sentinel check.
        if req is None:
            break

        idx, x = req[0], req[1]
        # Pre-formatted so the line reads "pid - idx" under both Python 2
        # and Python 3.
        print('%s - %s' % (pid, idx))
        seq = _translate(x)

        rqueue.put((idx, seq))
Esempio n. 12
0
def translate_model(queue, rqueue, pid, model, options, k, normalize, annotations_only):
    """Worker loop that translates sentences or returns their annotations.

    Requests are ``(idx, sentence)`` tuples read from ``queue``; a ``None``
    request stops the worker and ``(idx, result)`` is put on ``rqueue``.
    With ``annotations_only`` the result is the second output of
    ``f_init``; otherwise it is the lowest-scoring sample from
    ``gen_sample``.

    :param pid: worker identifier; not used in this function.
    :param model: model reference handed to ``load_params``.
    :param options: model options for the sampler and ``gen_sample``.
    :param k: forwarded to ``gen_sample`` as ``k``.
    :param normalize: if true, divide each score by its sample length.
    :param annotations_only: selects the result type and is also forwarded
        to ``build_sampler``.
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    # Allocate the parameters, load the stored values and convert them.
    tparams = init_tparams(load_params(model, init_params(options)))

    # f_init outs are [init_state (to decoder), ctx (from encoder)]
    # f_next outs are [next_probs, next_sample, next_state] (decoder)
    f_init, f_next = build_sampler(tparams, options, trng, annotations_only)

    def _annotate(seq):
        next_state, ctx = f_init(numpy.array(seq).reshape([len(seq), 1]))
        return ctx

    def _decode(seq):
        # sample given an input sequence and obtain scores
        sample, score = gen_sample(tparams, f_init, f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   options, trng=trng, k=k, maxlen=200,
                                   stochastic=False, argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            score = score / numpy.array([len(s) for s in sample])
        return sample[numpy.argmin(score)]

    def _translate(seq):
        if annotations_only:
            return _annotate(seq)
        return _decode(seq)

    request = queue.get()
    while request is not None:
        idx, x = request[0], request[1]
        rqueue.put((idx, _translate(x)))
        request = queue.get()
Esempio n. 13
0
def get_error(model, test_src, test_target):
    """Log the mean of ``pred_probs`` over a test set for a stored model.

    The options are read from ``<model>.pkl``, the model graph is rebuilt
    with the stored parameters, and the mean of the values returned by
    ``pred_probs`` is logged. Nothing is returned.

    :param model: model path prefix; ``.pkl`` is appended for the options
        file and the value itself goes to ``load_params``.
    :param test_src: source-side test file handed to ``TextIterator``.
    :param test_target: target-side test file handed to ``TextIterator``.
    """
    # reload options
    # The context manager closes the handle, which was previously left
    # open for the rest of the process.
    # NOTE(review): pickle files must come from a trusted source.
    with open('%s.pkl' % model, 'rb') as f:
        model_options = pkl.load(f)
    logging.info(model_options)

    logging.info('Building model')
    params = init_params(model_options)

    # reload parameters
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    # With a single dictionary entry no separate target dictionary is used.
    dictionaries = model_options['dictionaries']
    dict_src = os.path.join(model_options['baseDir'], dictionaries[0])
    if len(dictionaries) == 1:
        dict_target = None
    else:
        dict_target = os.path.join(model_options['baseDir'], dictionaries[1])

    valid = TextIterator(test_src,
                         test_target,
                         dict_src,
                         dict_target,
                         n_words_source=model_options['n_words_src'],
                         n_words_target=model_options['n_words'],
                         batch_size=model_options['valid_batch_size'],
                         maxlen=model_options['maxlen'])

    logging.info('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=False)
    valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid)
    valid_err = valid_errs.mean()
    logging.info('Valid Error:%s' % (str(valid_err)))
Esempio n. 14
0
def get_error(model, test_src, test_target):
    """Evaluate a stored model on a test set and log the mean error.

    Options come from ``<model>.pkl``; parameters are restored with
    ``load_params``; the cost function is compiled and applied to the test
    data via ``pred_probs``. The mean of the returned values is logged and
    the function returns ``None``.

    :param model: model path prefix; ``.pkl`` is appended for the options
        file and the value itself goes to ``load_params``.
    :param test_src: source-side test file handed to ``TextIterator``.
    :param test_target: target-side test file handed to ``TextIterator``.
    """
    profile = False

    # reload options
    # Opened in a with-block so the file is closed once the options are
    # read; the handle used to leak.
    # NOTE(review): pickle files must come from a trusted source.
    with open('%s.pkl' % model, 'rb') as f:
        model_options = pkl.load(f)
    logging.info(model_options)

    logging.info('Building model')
    params = init_params(model_options)

    # reload parameters
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    # A second dictionary entry, when present, is the target dictionary.
    dict_src = os.path.join(model_options['baseDir'],
                            model_options['dictionaries'][0])
    if len(model_options['dictionaries']) == 1:
        dict_target = None
    else:
        dict_target = os.path.join(model_options['baseDir'],
                                   model_options['dictionaries'][1])

    valid = TextIterator(test_src, test_target,
                         dict_src,
                         dict_target,
                         n_words_source=model_options['n_words_src'],
                         n_words_target=model_options['n_words'],
                         batch_size=model_options['valid_batch_size'],
                         maxlen=model_options['maxlen'])

    logging.info('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=profile)
    valid_errs = pred_probs(f_log_probs, prepare_data,
                            model_options, valid)
    valid_err = valid_errs.mean()
    logging.info('Valid Error:%s'% (str(valid_err)))
Esempio n. 15
0
def translate_model(queue, rqueue, pid, model, options, k, normalize):
    """Translate queued sentences with one model until ``None`` arrives.

    Requests are ``(idx, sentence)`` tuples read from ``queue``. For each
    one the lowest-scoring sample is put on ``rqueue`` as
    ``(idx, sample)``.

    Side effect: ``options['maxlen']`` is set to 150 on the caller's dict.

    :param pid: worker identifier, only used in the progress output.
    :param model: model reference handed to ``load_params``.
    :param options: model options; mutated as described above.
    :param k: forwarded to ``gen_sample`` as ``k``.
    :param normalize: if true, divide each score by its sample length.
    """
    import theano
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    trng = RandomStreams(1234)
    # NOTE(review): use_noise is created but never passed on in this
    # variant - confirm whether build_sampler expects it.
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    maxlen = 150
    options['maxlen'] = maxlen
    f_init, f_next = build_sampler(tparams, options, trng)

    def _translate(seq):
        sample, score = gen_sample(
            tparams, f_init, f_next,
            numpy.array(seq).reshape([len(seq), 1]), options,
            trng=trng, k=k, maxlen=maxlen, stochastic=False)
        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    while True:
        req = queue.get()
        # "is None" is the correct sentinel test; "== None" goes through
        # the request object's __eq__.
        if req is None:
            break

        idx, x = req[0], req[1]
        # Pre-formatted so the line reads "pid - idx" under both Python 2
        # and Python 3.
        print('%s - %s' % (pid, idx))
        seq = _translate(x)

        rqueue.put((idx, seq))
Esempio n. 16
0
def build_alignment_cg(model, options):
    """Compile a function returning ``opt_ret['dec_alphas']`` for a model.

    The parameters are allocated, overwritten with the stored ones and
    converted; the model graph is then built and a Theano function over
    ``[x, x_mask, y, y_mask]`` is returned.

    :param model: model reference handed to ``load_params``.
    :param options: model options for ``init_params`` and ``build_model``.
    :returns: the compiled ``theano.function``.
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    # Allocate, load and convert the parameters.
    allocated = init_params(options)
    restored = load_params(model, allocated)
    tparams = init_tparams(restored)

    # build_model returns its own random stream, which replaces the one
    # created above.
    (trng, use_noise,
     x, x_mask, y, y_mask,
     opt_ret,
     cost) = build_model(tparams, options)

    graph_inputs = [x, x_mask, y, y_mask]
    alignment_fn = theano.function(graph_inputs, opt_ret['dec_alphas'])
    return alignment_fn
Esempio n. 17
0
def sample(model, dictionary, dictionary_target,
           source_file, ref_file, saveto,
           k=10, normalize=False,
           bleu_script='./data/mteval-v11b.pl', res_to_sgm='./data/plain2sgm'):
    """Translate ``source_file`` with a stored model and print the result.

    Options are read from ``<model>.pkl``, the target dictionary is loaded
    and inverted, the sampler is built and ``gen_trans`` is called. The
    value it returns is printed as the test BLEU score together with a
    timestamp.

    :param model: model path prefix; ``.pkl`` is appended for the options.
    :param dictionary: source dictionary, forwarded to ``gen_trans``.
    :param dictionary_target: path of the pickled target dictionary.
    :param source_file: forwarded to ``gen_trans`` as ``test_src``.
    :param ref_file: forwarded to ``gen_trans`` as ``test_ref``.
    :param saveto: forwarded to ``gen_trans`` as ``out_file``.
    :param k: forwarded to ``gen_trans`` as ``k``.
    :param normalize: not used in this function.
    :param bleu_script: not used in this function.
    :param res_to_sgm: not used in this function.
    """
    # load model model_options
    # NOTE(review): pickle files must come from a trusted source.
    with open(model + '.pkl', 'rb') as f:
        options = pkl.load(f)

    # load target dictionary and invert
    # .items() replaces the Python 2 only .iteritems(); this function
    # already uses print() calls.
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = {vv: kk for kk, vv in word_dict_trg.items()}

    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)
    # word index
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    # The caller's k is forwarded; it used to be ignored in favour of a
    # hard-coded 10 (which is still the default).
    bleu_score = gen_trans(test_src=source_file, test_ref=ref_file,
                           out_file=saveto,
                           dict_src=dictionary, idict_trg=word_idict_trg,
                           tparams=tparams, f_init=f_init, f_next=f_next,
                           model_options=options,
                           trng=trng, k=k, stochastic=False)

    print(model + ' / ' + source_file + ' / ' + 'test bleu %.4f' % bleu_score)
    print('timestamp {} {}'.format(
        'done', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    sys.stdout.flush()
Esempio n. 18
0
def translate_model(queue, model, options, k, normalize, d_maxlen=200):
    """Translate every request in ``queue`` and return the samples.

    ``queue`` is iterated directly; each request is an ``(idx, sentence)``
    pair. The lowest-scoring sample per request is collected in a list, in
    request order.

    :param queue: iterable of ``(idx, sentence)`` requests.
    :param model: model reference handed to ``load_params``.
    :param options: model options for the sampler and ``gen_sample``.
    :param k: forwarded to ``gen_sample`` as ``k``.
    :param normalize: if true, divide each score by its sample length.
    :param d_maxlen: forwarded to ``gen_sample`` as ``maxlen``.
    :returns: list with one sample per request.
    """
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    # NOTE(review): trng is not defined in this function, so it has to
    # exist at module level - confirm against the rest of the file.
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score = gen_sample(tparams, f_init, f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   options, trng=trng, k=k, maxlen=d_maxlen,
                                   stochastic=False, argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    rqueue = []
    for req in queue:
        idx, x = req[0], req[1]
        # Pre-formatted so the line reads "translate- idx" under both
        # Python 2 and Python 3; the print statement is Python 2 only.
        print('translate- %s' % (idx,))
        rqueue.append(_translate(x))

    return rqueue
Esempio n. 19
0
def main(model,
         pklmodel,
         dictionary,
         dictionary_target,
         dictionary_chunk,
         source_file,
         target_file,
         saveto,
         ck=5,
         wk=5,
         k=20,
         normalize=False,
         n_process=5,
         chr_level=False,
         jointProb=False,
         show_boundary=False):
    """Print predicted chunk tags per target word and the overall precision.

    For every example yielded by ``TrainingTextIterator`` the alignment
    function predicts a chunk tag and a chunk boundary per target word.
    Each word is printed as ``"<word> B-<tag>"`` or ``"<word> I-<tag>"``,
    and at the end the fraction of matching boundaries and of matching
    tags (counted only where the boundary also matches) is printed.

    :param model: model reference handed to ``load_params``.
    :param pklmodel: path of the pickled model options.
    :param dictionary: path of the pickled source dictionary.
    :param dictionary_target: path of the pickled target dictionary.
    :param dictionary_chunk: path of the pickled chunk-label dictionary.
    :param source_file: source text handed to the iterator.
    :param target_file: target text handed to the iterator.

    ``saveto``, ``ck``, ``wk``, ``k``, ``normalize``, ``n_process``,
    ``chr_level``, ``jointProb`` and ``show_boundary`` are accepted but not
    used in this function.
    """
    # Every message is a single pre-formatted string so the output is the
    # same under Python 2 and Python 3.
    # NOTE(review): pickle files must come from a trusted source.
    print('load model model_options')
    with open('%s' % pklmodel, 'rb') as f:
        options = pkl.load(f)

    # word_idict is built here but not referenced again below.
    print('load source dictionary and invert')
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = {vv: kk for kk, vv in word_dict.items()}
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print('load target dictionary and invert')
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = {vv: kk for kk, vv in word_dict_trg.items()}
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # dict for chunk label
    with open(dictionary_chunk, 'rb') as f:
        worddict_chunk = pkl.load(f)
    worddict_r_chunk = {vv: kk for kk, vv in worddict_chunk.items()}

    # output in the chunk format:
    # w1, POS, chunk_boundary-chunk_tag
    def _seqs2wordsByChunkFormat(caps, boundary, chunk, dictionary, chunk_dic):
        capsw = []
        for cc, bb, ch in zip(caps, boundary, chunk):
            # skip index 0 ('<eos>' in the target dictionary) and negative
            # indices
            if cc <= 0:
                continue
            # boundary 0 is written as "I-", anything else as "B-"
            prefix = 'I-' if bb == 0 else 'B-'
            capsw.append(dictionary[cc] + ' ' + prefix + chunk_dic[ch])
        return capsw

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    f_align = build_alignment(tparams, options)

    # begin to read by iterators
    train = TrainingTextIterator(source_file,
                                 target_file,
                                 dictionary,
                                 dictionary_target,
                                 dictionary_chunk,
                                 n_words_source=30000,
                                 n_words_target=30000,
                                 batch_size=1,
                                 max_chunk_len=50,
                                 max_word_len=10000)

    boundary_right = 0.0
    tag_right = 0.0

    boundary_total = 0.0
    tag_total = 0.0

    for x, y_chunk, y_cw in train:

        x, x_mask, y_c, y_cw, chunk_indicator, y_mask = \
            prepare_training_data(x,
                                  y_chunk,
                                  y_cw,
                                  maxlen_chunk=100000,
                                  maxlen_cw=100000,
                                  n_words_src=30000,
                                  n_words=30000)

        align, chunk_tag, chunk_boundary = f_align(x, x_mask, y_c, y_cw,
                                                   y_mask, chunk_indicator)

        # Flatten each array to its first axis (the iterator uses
        # batch_size=1).
        x = x.reshape((x.shape[0], ))
        y_cw = y_cw.reshape((y_cw.shape[0], ))
        y_c = y_c.reshape((y_c.shape[0], ))
        chunk_indicator = chunk_indicator.reshape((chunk_indicator.shape[0], ))

        # Convert once and reuse for both the printout and the counting.
        predicted_boundary = numpy.ndarray.tolist(chunk_boundary)
        predicted_tag = numpy.ndarray.tolist(chunk_tag)

        print('\n'.join(
            _seqs2wordsByChunkFormat(numpy.ndarray.tolist(y_cw),
                                     predicted_boundary,
                                     predicted_tag,
                                     word_idict_trg, worddict_r_chunk)))

        for gold_boundary, gold_chunk_tag, predict_boundary, predict_chunk_tag in zip(
                numpy.ndarray.tolist(chunk_indicator),
                numpy.ndarray.tolist(y_c),
                predicted_boundary,
                predicted_tag):
            boundary_total += 1
            tag_total += 1

            if gold_boundary == predict_boundary:
                boundary_right += 1

                # a tag only counts as right when its boundary is right too
                if gold_chunk_tag == predict_chunk_tag:
                    tag_right += 1

    # If the iterator yielded nothing the totals are still 0.0; report nan
    # instead of raising ZeroDivisionError.
    if boundary_total:
        boundary_prec = boundary_right / boundary_total
    else:
        boundary_prec = float('nan')
    if tag_total:
        tag_prec = tag_right / tag_total
    else:
        tag_prec = float('nan')

    # Two spaces after the colon reproduce the output of the former
    # comma-separated print statements.
    print('boundary prec:  %s' % boundary_prec)
    print('tag prec:  %s' % tag_prec)
    print('Done')
Esempio n. 20
0
def main(model, bn_model, dictionary_target, fea, latex, saveto, output, k=5):

    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    valid, valid_uid_list = dataIterator(fea,
                                         latex,
                                         worddicts,
                                         batch_size=1,
                                         batch_Imagesize=500000,
                                         maxlen=500,
                                         maxImagesize=500000)

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)
    bn_params = init_bn_params(options)
    # load model parameters and set theano shared variables
    params = load_params(model, params)
    bn_params = load_params(bn_model, bn_params)
    tparams = init_tparams(params)
    bn_tparams = init_tparams(bn_params)
    f_init, f_next = build_sampler(tparams, bn_tparams, options, trng,
                                   use_noise)

    use_noise.set_value(0.)

    fpp_sample = open(saveto, 'w')
    valid_count_idx = 0
    # FIXME: random selection?
    print 'Decoding ... '
    for x, y in valid:
        for xx in x:
            print '%d : %s' % (valid_count_idx + 1,
                               valid_uid_list[valid_count_idx])
            xx_pad = numpy.zeros(
                (xx.shape[0], xx.shape[1], xx.shape[2]),
                dtype='float32')  # input_channels * height * width
            xx_pad[:, :, :] = xx / 255.
            stochastic = False
            sample, score = gen_sample(f_init,
                                       f_next,
                                       xx_pad[None, :, :, :],
                                       options,
                                       trng=trng,
                                       k=10,
                                       maxlen=1000,
                                       stochastic=stochastic,
                                       argmax=False)

            if stochastic:
                ss = sample
            else:
                score = score / numpy.array([len(s) for s in sample])
                ss = sample[score.argmin()]

            fpp_sample.write(valid_uid_list[valid_count_idx])
            valid_count_idx = valid_count_idx + 1
            for vv in ss:
                if vv == 0:  # <eol>
                    break
                fpp_sample.write(' ' + worddicts_r[vv])
            fpp_sample.write('\n')
    fpp_sample.close()
    print 'test set decode done'

    os.system('python compute-wer.py ' + saveto + ' ' + latex + ' ' + output)
    fpp = open(output)  # %WER 31.63
    stuff = fpp.readlines()
    fpp.close()
    m = re.search('WER (.*)\n', stuff[0])
    valid_per = 100. * float(m.group(1))
    m = re.search('ExpRate (.*)\n', stuff[1])
    valid_sacc = 100. * float(m.group(1))

    print 'Valid WER: %.2f%%, ExpRate: %.2f%%' % (valid_per, valid_sacc)
Esempio n. 21
0
def main(model,
         dictionary_target,
         source_fea,
         source_latex,
         saveto,
         wer_file,
         k=5):

    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    valid, valid_uid_list = dataIterator_valid(source_fea,
                                               source_latex,
                                               worddicts,
                                               batch_size=1,
                                               maxlen=2000)

    trng = RandomStreams(1234)

    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)
    f_init, f_next = build_sampler(tparams, options, trng)

    fpp_sample = open(saveto, 'w')
    valid_count_idx = 0

    print 'Decoding...'
    ud_epoch = 0
    ud_epoch_start = time.time()
    for x, y in valid:
        for xx in x:
            print '%d : %s' % (valid_count_idx + 1,
                               valid_uid_list[valid_count_idx])
            xx_pad = numpy.zeros((xx.shape[0] + 1, xx.shape[1]),
                                 dtype='float32')
            xx_pad[:xx.shape[0], :] = xx
            stochastic = False
            sample, score = gen_sample(f_init,
                                       f_next,
                                       xx_pad[:, None, :],
                                       options,
                                       trng=trng,
                                       k=k,
                                       maxlen=1000,
                                       stochastic=stochastic,
                                       argmax=False)

            if stochastic:
                ss = sample
            else:
                score = score / numpy.array([len(s) for s in sample])
                ss = sample[score.argmin()]

            fpp_sample.write(valid_uid_list[valid_count_idx])
            valid_count_idx = valid_count_idx + 1
            for vv in ss:
                if vv == 0:  # <eol>
                    break
                fpp_sample.write(' ' + worddicts_r[vv])
            fpp_sample.write('\n')
    fpp_sample.close()
    ud_epoch = (time.time() - ud_epoch_start) / 60.
    print 'test set decode done, cost time ...', ud_epoch
    os.system('python compute-wer.py ' + saveto + ' ' + source_latex + ' ' +
              wer_file)
    fpp = open(wer_file)
    stuff = fpp.readlines()
    fpp.close()
    m = re.search('WER (.*)\n', stuff[0])
    valid_per = 100. * float(m.group(1))
    m = re.search('ExpRate (.*)\n', stuff[1])
    valid_sacc = 100. * float(m.group(1))

    print 'Valid WER: %.2f%%, ExpRate: %.2f%%' % (valid_per, valid_sacc)
Esempio n. 22
0
def rescore_model(source_file, nbest_file, saveto, models, options, b,
                  normalize, verbose, alignweights):

    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_tparams(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            print "\t*** Save weight mode ON, alignment matrix will be saved."
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            print "\t*** Save weight mode OFF, alignment matrix will not be saved."
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score_this_batch = pred_probs(f_log_probs,
                                          prepare_data,
                                          options[i],
                                          pairs,
                                          normalize=normalize,
                                          alignweights=alignweights)
            scores.append(score_this_batch)

        return scores

    lines = source_file.readlines()
    nbest_lines = nbest_file.readlines()

    if alignweights:  ### opening the temporary file.
        temp_name = saveto.name + ".json"
        align_OUT = tempfile.NamedTemporaryFile(prefix=temp_name)

    with tempfile.NamedTemporaryFile(
            prefix='rescore-tmpin') as tmp_in, tempfile.NamedTemporaryFile(
                prefix='rescore-tmpout') as tmp_out:
        for line in nbest_lines:
            linesplit = line.split(' ||| ')
            idx = int(
                linesplit[0])  ##index from the source file. Starting from 0.
            tmp_in.write(lines[idx])
            tmp_out.write(linesplit[1] + '\n')

        tmp_in.seek(0)
        tmp_out.seek(0)
        pairs = TextIterator(
            tmp_in.name,
            tmp_out.name,
            options[0]['dictionaries'][0],
            options[0]['dictionaries'][1],
            n_words_source=options[0]['n_words_src'],
            n_words_target=options[0]['n_words'],
            batch_size=b,
            maxlen=float('inf'),
            sort_by_length=False
        )  #TODO: sorting by length could be more efficient, but we'd have to synchronize scores with n-best list after

        scores, alignments = _score(pairs, alignweights)

        for i, line in enumerate(nbest_lines):
            score_str = ' '.join(map(str, [s[i] for s in scores]))
            saveto.write('{0} {1}\n'.format(line.strip(), score_str))

        ### optional save weights mode.
        if alignweights:
            for line in alignments:
                align_OUT.write(line + "\n")
    if alignweights:
        combine_source_target_text(source_file, nbest_file, saveto.name,
                                   align_OUT)
        align_OUT.close()
def main(model,
         dictionary,
         dictionary_target,
         source_file,
         reference_file,
         chr_level=False):

    print 'load model model_options'

    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    print 'load source dictionary and invert'
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print 'load target dictionary and invert'
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # utility function
    def _seqs2sen(seqs, _dict):
        sen = []
        for w in seqs:
            if w == 0:
                continue
            elif w < 0:
                continue
            sen.append(_dict[w])
        return ' '.join(sen)

    def _send_jobs(fname, _dict,
                   _n_words):  # translate source sentence into source indices
        sourceIndices = []
        source = []
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                if chr_level:
                    words = list(line.decode('utf-8').strip())
                else:
                    words = line.strip().split()
                x = map(lambda w: _dict[w] if w in _dict else 1, words)
                x = map(lambda ii: ii if ii < _n_words else 1, x)
                x += [0]
                sourceIndices.append(x)
                source.append(line)
        return sourceIndices, source

    print 'Force Translating ', source_file, '...'
    print 'Prepare data...',
    ret = _send_jobs(source_file, word_dict, options['n_words_src'])
    sourceIndices = ret[0]
    source = ret[1]

    ret_ref = _send_jobs(reference_file, word_dict_trg, options['n_words'])
    targetIndices = ret_ref[0]
    target = ret_ref[1]

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    force_record = build_force_sampler(tparams, options, use_noise)

    def _translate(seq, trg_seq):
        # sample given an input sequence and obtain translated result
        sampleData = force_record(
            numpy.array(seq).reshape([len(seq), 1]),
            numpy.array(trg_seq).reshape([len(trg_seq), 1]))

        alpha_buffer_record = sampleData[0]
        attention_record = sampleData[1]

        if alpha_buffer_record is None:
            buffer_weight = None
        else:
            buffer_weight = alpha_buffer_record.reshape(
                [len(trg_seq), options['buffer_size']])

        if attention_record is None:
            attention = None
        else:
            attention = attention_record.reshape([len(trg_seq), len(seq)])

        return buffer_weight, attention

    idx = 0
    print 'Done, translating...'
    for x, sSen, y in zip(sourceIndices, source, targetIndices):
        transData = _translate(x, y)

        buffer_weight = transData[0]
        attention = transData[1]

        print 'Sen ', idx, ':', sSen  # source sentence
        tSen = _seqs2sen(y, word_idict_trg)
        print 'translation:', tSen  # target sentence
        idx += 1
        print 'buffer_weight:'
        print_matrix(buffer_weight)
        print 'attention:'
        print_matrix(attention)

    print 'Done'
Esempio n. 24
0
def translate_model(queue, rqueue, mask_left, mask_right, write_mask, pid,
                    model, options, k, normalize):

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq, left, right, write):
        # sample given an input sequence and obtain scores
        print left.shape, right.shape, write.shape, len(seq)
        sample, score = gen_sample(tparams,
                                   f_init,
                                   f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   left[:, :, None],
                                   right[:, :, None],
                                   write,
                                   options,
                                   trng=trng,
                                   k=k,
                                   maxlen=200,
                                   stochastic=False,
                                   argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        rem_l = mask_left.get()
        rem_r = mask_right.get()
        rem_w = write_mask.get()

        idx, x = req[0], req[1]
        l = rem_l[1]
        r = rem_r[1]
        w = rem_w[1]

        print pid, '-', idx
        seq = _translate(x, l, r, w)

        rqueue.put((idx, seq))

    return
Esempio n. 25
0
def rescore_model(source_file, target_file, saveto, models, options, b,
                  normalize, verbose, alignweights):
    """Score a parallel corpus with one or more models.

    Every line of ``target_file`` is written to ``saveto`` followed by one
    score per model.  When ``alignweights`` is set, the alignment lines
    returned by ``pred_probs`` are written to a temporary file, which is then
    passed to ``combine_source_target_text_1to1``.

    Args:
        source_file: open file with the source sentences.
        target_file: open file with the target sentences.
        saveto: open, writable file; its ``name`` is also used to derive the
            alignment output names.
        models: model paths, parallel to ``options``.
        options: one option dict per model.
        b: batch size handed to ``TextIterator``.
        normalize: forwarded to ``pred_probs``.
        verbose: unused in this function.
        alignweights: when true, alignment weights are computed and saved.
    """

    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_tparams(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            sys.stderr.write(
                "\t*** Save weight mode ON, alignment matrix will be saved.\n")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        """Return (scores, alignments), each holding one entry per model."""
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(f_log_probs,
                                          prepare_data,
                                          options[i],
                                          pairs,
                                          normalize=normalize,
                                          alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    pairs = TextIterator(
        source_file.name,
        target_file.name,
        options[0]['dictionaries'][:-1],
        options[0]['dictionaries'][1],
        n_words_source=options[0]['n_words_src'],
        n_words_target=options[0]['n_words'],
        batch_size=b,
        maxlen=float('inf'),
        sort_by_length=False
    )  #TODO: sorting by length could be more efficient, but we'd want to resort after

    scores, alignments = _score(pairs, alignweights)

    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        saveto.write('{0} {1}\n'.format(line.strip(), score_str))

    ### optional save weights mode.
    if alignweights:
        ### writing out the alignments.
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            # `alignments` holds one entry per model (see _score); the old
            # code iterated an undefined name here and raised NameError.
            # NOTE(review): assumes each per-model entry is an iterable of
            # text lines; with several models their lines are written one
            # model after the other -- confirm that this is what
            # combine_source_target_text_1to1 expects.
            for model_alignments in alignments:
                for line in model_alignments:
                    align_OUT.write(line + "\n")
            ### combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file,
                                            saveto.name, align_OUT)
Esempio n. 26
0
def main(model,
         src_dict,
         trg_dict,
         src,
         trg,
         multibleu,
         batch_size=60,
         pred_dir='',
         model_list=False):
    if pred_dir is not '' and not os.path.exists(pred_dir):
        os.makedirs(pred_dir)
    if model_list:
        model_list_file = model
        with open(model_list_file) as f:
            model = f.readline().strip()

    # load dictionaries and invert them
    worddicts = [None] * 2
    worddicts_r = [None] * 2
    for ii, dd in enumerate([src_dict, trg_dict]):
        with open(dd, 'rb') as f:
            worddicts[ii] = pkl.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # load model options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    trng = RandomStreams(options['trng'])
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    f_init_2, f_next_2 = build_sampler_2(tparams, options, trng, use_noise)

    iterator = TextIterator(src,
                            trg,
                            src_dict,
                            trg_dict,
                            n_words_source=options['n_words_src'],
                            n_words_target=options['n_words'],
                            batch_size=batch_size,
                            maxlen=2000,
                            shuffle=False,
                            replace=False)

    if not model_list:
        try:
            valid_out, valid_bleu = greedy_decoding(
                options,
                trg,
                iterator,
                worddicts_r,
                tparams,
                prepare_data,
                gen_sample_2,
                f_init_2,
                f_next_2,
                trng,
                multibleu,
                fname=os.path.join(pred_dir,
                                   os.path.basename(model)[:-3] + 'out'),
                maxlen=100,
                verbose=False)
        except:
            valid_out = ''
            valid_bleu = 0.0
        print valid_out, valid_bleu
    else:
        best_score = 0.
        best_model = ''
        with open(model_list_file) as f:
            for line in f:
                start = time.time()
                model = line.strip()
                if model == '':
                    continue
                params = load_params(model, params)
                for kk, pp in params.iteritems():
                    tparams[kk].set_value(params[kk])
                print model,
                try:
                    valid_out, valid_bleu = greedy_decoding(
                        options,
                        trg,
                        iterator,
                        worddicts_r,
                        tparams,
                        prepare_data,
                        gen_sample_2,
                        f_init_2,
                        f_next_2,
                        trng,
                        multibleu,
                        fname=os.path.join(
                            pred_dir,
                            os.path.basename(model)[:-3] + 'out'),
                        maxlen=100,
                        verbose=False)
                except:
                    valid_out = ''
                    valid_bleu = 0.0
                print valid_out, valid_bleu,
                if valid_bleu > best_score:
                    best_score = valid_bleu
                    best_model = model
                end = time.time()
                print "Time: ", end - start
        print 'Best model: ', best_model
        print 'Best BLEU: ', best_score
Esempio n. 27
0
def main(model,
         dictionary,
         dictionary_target,
         source_file,
         saveto,
         k=5,
         pkl_file=None,
         normalize=False,
         output_attention=False):

    # load model model_options
    if pkl_file is None:
        pkl_file = model + '.pkl'
    with open(pkl_file, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)  # word2id
    word_idict = dict()  # id2word
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes

    # utility function
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _send_jobs(fname):
        retval = []
        retval_ori = []
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                words = line.strip().split()
                retval_ori.append(line.strip())
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x)
                x += [0]
                retval.append(x)
        return retval, retval_ori

    print 'Translating ', source_file, '...'
    sys.stdout.flush()

    n_samples, n_samples_src = _send_jobs(source_file)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    # params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score, att = gen_sample(tparams,
                                        f_init,
                                        f_next,
                                        numpy.array(seq).reshape([len(seq),
                                                                  1]),
                                        options,
                                        trng=trng,
                                        k=k,
                                        maxlen=200,
                                        stochastic=False,
                                        argmax=False)
        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        # return sample[sidx], att[sidx]
        return sample[sidx], numpy.array(att[sidx])

    def _output_attention(sent_idx, att):
        dirname = saveto + '.attention'
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        with open(dirname + '/' + str(sent_idx), 'w') as fp:
            fp.write("%d %d\n" % (att.shape[0], att.shape[1]))
            for row in att:
                fp.write(
                    str(row.argmax()) + " " + ' '.join([str(x)
                                                        for x in row]) + '\n')

    # translation
    ys = []
    atts = []
    idx = 0

    for x in n_samples:
        y, att = _translate(x)
        ys.append(y)
        atts.append(att)
        print idx
        idx += 1
    trans = _seqs2words(ys)

    # save
    with open(saveto, 'w') as f:
        print >> f, '\n'.join(trans)
    if output_attention:
        with open(saveto + '.att', 'w') as f:
            for idx, (x, y, att) in enumerate(zip(n_samples_src, trans, atts)):
                print >> f, ('%d ||| %s ||| 0 ||| %s ||| %d %d' %
                             (idx, y, x, att.shape[1], att.shape[0]))
                for hehe in att:
                    print >> f, ' '.join([str(x) for x in hehe])
                print >> f

    print 'Done'
Esempio n. 28
0
def main(model,
         pklmodel,
         dictionary,
         dictionary_target,
         source_file,
         saveto,
         ck=5,
         wk=5,
         k=20,
         normalize=False,
         n_process=5,
         chr_level=False,
         jointProb=False,
         show_boundary=False):
    print 'load model model_options'
    with open('%s' % pklmodel, 'rb') as f:
        options = pkl.load(f)

    print 'load source dictionary and invert'
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print 'load target dictionary and invert'
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # utility function
    def _seqs2words(caps, boundary, chunk):
        capsw = []
        for cc, bb, ch in zip(caps, boundary, chunk):
            ww = []
            for w, b, c in zip(cc, bb, ch):
                if w == 0:
                    continue
                # if w == -10000:
                #     ww.append('| NOTEND')
                #     continue
                elif w < 0:
                    # ww.append('|' +  str(w))
                    continue

                if show_boundary:
                    if b == 1.0:
                        ww.append('|')
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _seqs2wordsByChunk(caps, boundary, chunk):
        capsw = []
        for cc, bb, ch in zip(caps, boundary, chunk):
            ww = []
            for w, b, c in zip(cc, bb, ch):
                if w == 0:
                    continue
                # if w == -10000:
                #     ww.append('| NOTEND')
                #     continue
                elif w < 0:
                    # ww.append('|' +  str(w))
                    continue

                if b == 1.0:
                    ww.append('| ' + str(c))
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _send_jobs(fname):
        retval = []
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                if chr_level:
                    words = list(line.decode('utf-8').strip())
                else:
                    words = line.strip().split()
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x)
                x += [0]
                retval.append(x)
        return retval

    print 'Translating ', source_file, '...'

    print 'look up table'
    n_samples = _send_jobs(source_file)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next_chunk, f_next_word = build_sampler(tparams, options, trng,
                                                      use_noise)

    def _translate(seq):

        be_stochastic = False
        # sample given an input sequence and obtain scores
        sample, boundary, chunk, score = gen_sample(tparams,
                                                    f_init,
                                                    f_next_chunk,
                                                    f_next_word,
                                                    numpy.array(seq).reshape(
                                                        [len(seq), 1]),
                                                    options,
                                                    trng=trng,
                                                    maxlen=200,
                                                    k_chunk=ck,
                                                    k_word=wk,
                                                    k=k,
                                                    stochastic=be_stochastic,
                                                    argmax=True,
                                                    jointProb=False)

        if be_stochastic:
            return sample

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths

        # print 'score', score
        # print 'candidates', sample

        sidx = numpy.argmin(score)
        return sample[sidx], boundary[sidx], chunk[sidx]

    ys = []
    yb = []
    yc = []
    idx = 0
    for x in n_samples:
        y, y_boundary, y_chunk = _translate(x)
        ys.append(y)
        yb.append(y_boundary)
        yc.append(y_chunk)
        print idx
        idx += 1

    # print ys
    # print yb
    trans = _seqs2words(ys, yb, yc)
    trans_chunk = _seqs2wordsByChunk(ys, yb, yc)

    with open(saveto, 'w') as f:
        print >> f, '\n'.join(trans)
    with open(saveto + 'chunk', 'w') as f:
        print >> f, '\n'.join(trans_chunk)
    print 'Done'
Esempio n. 29
0
def main(model_files, dictionary_target, grammar_target, data_path, saveto, wer_file, k=5):
    """Beam-decode the validation set with a model ensemble and score each rank.

    For every validation sample the hypotheses are ranked by length-normalised
    score (ascending) and the hypothesis of rank ``b`` is written to
    ``'<saveto>.<b>'`` as one line: the sample uid followed by the decoded
    symbols.  Each rank file is then scored by the external ``compute-wer.py``
    script against ``<data_path>/caption.txt`` and WER / ExpRate are printed.

    Parameters
    ----------
    model_files : iterable of str
        Model path prefixes.  ``'<prefix>.pkl'`` is unpickled to obtain the
        model options and the prefix itself is passed to ``load_params``.
    dictionary_target :
        Passed to ``load_dict``; the result is used as a symbol -> index
        mapping whose indices must lie in ``range(len(mapping))``.
    grammar_target :
        Passed to ``loadGrammar`` together with the dictionary.
    data_path : str
        Passed to ``dataIterator_valid``; also the directory that is expected
        to contain ``caption.txt``.
    saveto : str
        Prefix of the per-rank hypothesis files.
    wer_file : str
        Passed to ``compute-wer.py`` and read back afterwards; line 0 must
        match ``'WER <float>'`` and line 1 ``'ExpRate <float>'``.
    k : int
        Beam size; also the number of rank files written.
    """
    # Local import: only this function needs it and the file-level import
    # block is not visible from here.
    import subprocess

    # load the target dictionary and invert it (index -> symbol)
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.items():
        worddicts_r[vv] = kk
    grammar = compileGrammar(loadGrammar(grammar_target, worddicts))

    trng = RandomStreams(1234)

    # Each ensemble member is a (f_init, f_next, options, weight) tuple.
    models = []
    for model_file in model_files:
        print('Loading model: %s' % model_file)
        with open('%s.pkl' % model_file, 'rb') as f:
            options = pkl.load(f)
        print(options)
        params = init_params(options)
        params = load_params(model_file, params)
        tparams = init_tparams(params)
        f_init, f_next = build_sampler(tparams, options, trng)
        models.append((f_init, f_next, options, 0.8))

    # Placeholder kept from the original: no language models are configured,
    # so this loop body never runs.
    for lm_file in []:
        print('Loading language model: %s' % lm_file)
        f_init, f_next, options = load_language_model(lm_file)
        models.append((f_init, f_next, options, 0.2))

    valid, valid_uid_list = dataIterator_valid(data_path,
                                               worddicts, batch_size=1, maxlen=250)

    # one output file per beam rank
    fpp_sample = [open('%s.%d' % (saveto, beam), 'w') for beam in range(k)]
    valid_count_idx = 0

    print('Decoding...')
    ud_epoch_start = time.time()

    try:
        for x, _y in valid:
            for xx in x:
                uid = valid_uid_list[valid_count_idx]
                print('%d : %s' % (valid_count_idx + 1, uid))

                # append one all-zero row after the last row of the sample
                xx_pad = numpy.zeros((xx.shape[0] + 1, xx.shape[1]), dtype='float32')
                xx_pad[:xx.shape[0], :] = xx

                sample, score = gen_sample(models,
                                           xx_pad[:, None, :],
                                           grammar,
                                           trng=trng, k=k,
                                           maxlen=250,
                                           dictlen=len(worddicts),
                                           stochastic=False,
                                           argmax=False)
                # length-normalise, then rank ascending
                score = score / numpy.array([len(s) for s in sample])
                sample_rank = numpy.argsort(score)

                for beam in range(k):
                    out = fpp_sample[beam]
                    out.write(uid)
                    if len(sample) > beam:
                        ss = sample[sample_rank[beam]]
                    else:
                        # fewer hypotheses than ranks: emit an empty line
                        ss = [0]
                    for vv in ss:
                        if vv == 0:  # <eol>
                            break
                        out.write(' ' + worddicts_r[vv])
                    out.write('\n')
                valid_count_idx += 1
        ud_epoch = time.time() - ud_epoch_start
    finally:
        # Close (and thereby flush) every rank file even if decoding failed,
        # and before the external scorer reads them.
        for out in fpp_sample:
            out.close()

    print('test set decode done, cost time ... %s' % ud_epoch)

    for beam in range(k):
        # Argument list instead of a shell string so that paths containing
        # spaces or shell metacharacters are passed through unchanged.
        subprocess.call(['python', 'compute-wer.py',
                         '%s.%d' % (saveto, beam),
                         os.path.join(data_path, "caption.txt"),
                         wer_file])
        with open(wer_file) as fpp:
            stuff = fpp.readlines()
        m = re.search('WER (.*)\n', stuff[0])
        valid_per = 100. * float(m.group(1))
        m = re.search('ExpRate (.*)\n', stuff[1])
        valid_sacc = 100. * float(m.group(1))

        print('%d Valid WER: %.2f%%, ExpRate: %.2f%%' % (beam, valid_per, valid_sacc))
Esempio n. 30
0
def main(model,
         pklmodel,
         valid_datasets=['../data/dev/newstest2011.en.tok',
                          '../data/dev/newstest2011.fr.tok'],
         dictionaries=[
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'],
         dictionary_chunk='/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
         result_file='./cost.result'):





    # load the dictionaries of both source and target
    # load dictionaries and invert them
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        with open(dd, 'rb') as f:
            worddicts[ii] = pkl.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # dict for chunk label
    worddict_chunk = [None]
    worddict_r_chunk = [None]
    with open(dictionary_chunk, 'rb') as f:
        worddict_chunk = pkl.load(f)
    worddict_r_chunk = dict()
    for kk, vv in worddict_chunk.iteritems():
        worddict_r_chunk[vv] = kk
    print worddict_chunk

    print 'load model model_options'
    with open('%s' % pklmodel, 'rb') as f:
        options = pkl.load(f)


    # build valid set
    valid = TrainingTextIterator(valid_datasets[0], valid_datasets[1],
                                 dictionaries[0], dictionaries[1], dictionary_chunk,
                                 n_words_source=options['n_words_src'], n_words_target=options['n_words'],
                                 batch_size=options['batch_size'],
                                 max_chunk_len=options['maxlen_chunk'], max_word_len=options['maxlen_chunk_words'])


    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
    x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator, \
    opt_ret, \
    cost, cost_cw= \
        build_model(tparams, options)


    inps = [x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator]



    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=False)
    f_log_probs_cw = theano.function(inps, cost_cw, profile=False)
    print 'Done'

    valid_errs, valid_errs_cw = pred_probs(f_log_probs, f_log_probs_cw, prepare_training_data,
                                            options, valid)

    valid_err = valid_errs.mean()
    valid_err_cw = valid_errs_cw.mean()

    with open(result_file, 'w') as result_file:
        print >> result_file, valid_err, valid_err_cw
Esempio n. 31
0
def go(model,
       dictionary,
       dictionary_target,
       source_file_x1,
       source_file_x2,
       source_file_y2,
       reference_file_y1,
       saveto,
       k=5,
       normalize=False,
       d_maxlen=200,
       steps=None,
       max_steps=None,
       start_steps=0,
       sleep=1000,
       monitor=None):

    # inter-step
    step_test = 0

    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # utility function
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _send_jobs(fname_x1, fname_x2, fname_y2):
        queue_x1 = []
        queue_x2 = []
        queue_y2 = []
        queue = []
        with open(fname_x1, 'r') as f:
            for idx, line in enumerate(f):

                words = line.strip().split()
                x1 = map(lambda w: word_dict[w]
                         if w in word_dict else 1, words)
                x1 += [0]
                queue_x1.append((idx, x1))

        with open(fname_x2, 'r') as f:
            for idx, line in enumerate(f):

                words = line.strip().split()
                x2 = map(lambda w: word_dict[w]
                         if w in word_dict else 1, words)
                x2 += [0]
                queue_x2.append((idx, x2))

        with open(fname_y2, 'r') as f:
            for idx, line in enumerate(f):

                words = line.strip().split()
                y2 = map(
                    lambda w: word_dict_trg[w]
                    if w in word_dict_trg else 1, words)
                y2 += [0]
                queue_y2.append((idx, y2))

        for i, (x1, x2, y2) in enumerate(zip(queue_x1, queue_x2, queue_y2)):
            queue.append((i, x1[1], x2[1], y2[1]))

        return queue

    print '[test] build the model'
    funcs, tparams = build_networks(options, model, train=False)

    if steps is None:
        if os.path.exists(saveto):
            print 'we found translated files...skip'
        else:
            print '[test] start translating ', source_file_x1, '...to...', saveto
            queue = _send_jobs(source_file_x1, source_file_x2, source_file_y2)

            if max_steps is not None:
                checkpoint = '{}.iter{}.npz'.format(
                    os.path.splitext(model)[0], max_steps)
                print '[test] Load check-point: {}'.format(checkpoint),
                zipp(load_params(checkpoint, unzip(tparams)), tparams)
                print 'done.'

            rets = translate_model(queue, funcs, tparams, options, k,
                                   normalize, 0, d_maxlen)
            sseqs, ss, acts, gs = zip(*rets)

            trans = _seqs2words(sseqs)
            with open(saveto, 'w') as f:
                print >> f, '\n'.join(trans)
            print 'Done'

            pkl.dump(rets, open(saveto + '.pkl', 'w'))
            print 'All Done'

        # compute BLEU score.
        ref = reference_file_y1
        print '[test] compute BLEU score for {} <-> {}'.format(saveto, ref)

        os.system("sed -i 's/@@ //g' {}".format(saveto))
        out = os.popen(
            'perl ./data/multi-bleu.perl {0} < {1} | tee {1}.score'.format(
                ref, saveto))
        bleu = float(out.read().split()[2][:-1])

        print 'Done at BLEU={}'.format(bleu)

    else:
        if monitor is not None:
            import datetime
            timestamp = datetime.datetime.now().strftime("%m-%d_%H:%M")
            monitor.start_experiment('test.{}.{}'.format(timestamp, model))

        step_test = start_steps
        if step_test == 0:
            step_test += steps

        while step_test < max_steps:

            # check if the check-point is saved
            checkpoint = '{}.iter{}.npz'.format(
                os.path.splitext(model)[0], step_test)
            if not os.path.exists(checkpoint):
                if sleep > 0:
                    print '[test] Did not find checkpoint: {}. I want sleep {}s.'.format(
                        checkpoint, sleep)
                    time.sleep(sleep)
                else:
                    print '[test] Didi not find checkpoint {}, go for next one...'.format(
                        checkpoint)
                    step_test += steps

            else:

                transto = saveto + '.iter={}'.format(step_test)
                print '[test] start translating ', source_file_x1, '...to...', transto

                if os.path.exists(transto):
                    print 'we found translated files...skip'
                else:
                    print '[test] Load check-point: {}'.format(checkpoint),
                    zipp(load_params(checkpoint, unzip(tparams)), tparams)
                    print 'done.'

                    queue = _send_jobs(source_file_x1, source_file_x2,
                                       source_file_y2)
                    rets = translate_model(queue, funcs, tparams, options, k,
                                           normalize, 0, d_maxlen)
                    sseqs, ss, acts, gs = zip(*rets)

                    trans = _seqs2words(sseqs)
                    with open(transto, 'w') as f:
                        print >> f, '\n'.join(trans)
                    print 'Done'

                    pkl.dump(rets, open(transto + '.pkl', 'w'))

                # compute BLEU score.
                ref = reference_file_y1

                print '[test] compute BLEU score for {} <-> {}'.format(
                    transto, ref)

                os.system("sed -i 's/@@ //g' {}".format(transto))
                out = os.popen(
                    'perl ./data/multi-bleu.perl {0} < {1} | tee {1}.score'.
                    format(ref, transto))
                bleu = float(out.read().split()[2][:-1])
                if monitor is not None:
                    monitor.push({'BLEU': bleu}, step=step_test)

                print 'Done at iter={}, BLEU={}'.format(step_test, bleu)
                step_test += steps

            pass
Esempio n. 32
0
def main(model,
         dictionary,
         dictionary_target,
         source_file,
         saveto,
         k=5,
         normalize=False,
         n_process=5,
         chr_level=False):

    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score = gen_sample(tparams,
                                   f_init,
                                   f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   options,
                                   trng=trng,
                                   k=k,
                                   maxlen=200,
                                   stochastic=False,
                                   argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    # utility function
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    translations = []
    print "start Translating..."
    with open(source_file, 'r') as f:
        for idx, line in enumerate(f):
            if idx % 20 == 0:
                print "%s lines done!" % idx
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()
            x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
            x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
            x += [0]
            translation = _translate(x)
            translations.append(" ".join(_seqs2words([translation])))
    with open(saveto, 'w') as f:
        print >> f, '\n'.join(translations)
    print "Finish Translating!"
Esempio n. 33
0
def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         normalize=False, n_process=5, chr_level=False,messageOff=False):

    if not messageOff:
        print 'load model model_options'
    if os.path.exists('%s.pkl' % model):
        with open('%s.pkl' % model, 'rb') as f:
            options = pkl.load(f)
    else:
        pklName = model[:model.index('.iter')]+model[model.index('.npz'):]
        with open('%s.pkl' % pklName, 'rb') as f:
            options = pkl.load(f)
            
    if not messageOff:   
        print 'load source dictionary and invert'
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'
    
    if not messageOff:
        print 'load target dictionary and invert'
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # utility function
    def _index2sens(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    continue
                elif w < 0:
                    continue
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _seqs2sen(seqs):
        sen = []
        for w in seqs:
            if w == 0:
                continue
            elif w < 0:
                continue
            sen.append(word_idict_trg[w])
        return ' '.join(sen) 

    def _send_jobs(fname):# translate source sentence into indices
        sourceIndices = []
        source = []
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                if chr_level:
                    words = list(line.decode('utf-8').strip())
                else:
                    words = line.strip().split()
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x)
                x += [0]
                sourceIndices.append(x)
                source.append(line)
        return sourceIndices , source
    
    if not messageOff:
        print 'Translating ', source_file, '...'
        print 'Prepare data...',
    ret = _send_jobs(source_file)
    sourceIndices = ret[0]
    source = ret[1]

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq):
        # sample given an input sequence and obtain translated result
        sampleData = gen_sample(tparams, f_init, f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   options, trng=trng, k=k, 
                                   maxlen=200,
                                   return_attention=True,
                                   stochastic = False, 
                                   argmax = False,
                                   normalize = normalize)
        sample=sampleData[0]
        score=sampleData[1]
        attention_record=sampleData[2]
        
        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
            
        if attention_record is None:
            attention=None
        else:
            attention=attention_record[sidx]
        return sample[sidx], attention

    trans = []
    idx = 0
    if not messageOff:
        print 'Done, translating...'
    for x , sSen in zip(sourceIndices , source):
        transData = _translate(x)
        y=transData[0]
        attention=transData[1]
        if not messageOff:
            print 'Sen ',idx, ':',sSen # source sentence
        y = _seqs2sen(y)
        trans.append(y)
        if not messageOff:
            print 'translation:', y  # translation result
            print 'attention:'
    #         if attention is not None:
            print_matrix(attention)
        idx += 1

    with open(saveto, 'w') as f:
        print >>f, '\n'.join(trans)
    print 'Done'
def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         normalize=False, n_process=5, chr_level=False):

    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score = gen_sample(tparams, f_init, f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   options, trng=trng, k=k, maxlen=200,
                                   stochastic=False, argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    # utility function
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    translations = [] 
    print "start Translating..."
    with open(source_file, 'r') as f:
        for idx, line in enumerate(f):
            if idx % 20 == 0:
                print "%s lines done!" % idx
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()
            x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
            x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
            x += [0]
            translation = _translate(x)
            translations.append(" ".join(_seqs2words([translation])))
    with open(saveto, 'w') as f:
        print >> f, '\n'.join(translations)
    print "Finish Translating!"
Esempio n. 35
0
def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         normalize=False, n_process=5, chr_level=False):

    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # utility function
    def _seq2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    #init model
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    params = init_params(options)

    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng)

    trans = []
    att = []
    print 'Translating ', source_file, '...'
    fo = open(saveto,'w')
    fa = open(saveto+'.att','w')
    with open(source_file, 'r') as f:
        n = 0
        for line in f:
            n += 1
            if n%10 == 0:
                print n
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()
            x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
            x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
            x += [0]
            y,a = _translate(x, tparams, f_init, f_next, options, trng, k, normalize)
            trans.append(y)
            att.append(a)
            print >>fo,_seq2words(y)
            print _seq2words(y)
            for i,e in enumerate(a):
                for j,p in enumerate(e):
                    print >>fa,'{}-{}-{}'.format(i,j,p),
            print >>fa

    print 'Done'
Esempio n. 36
0
def main(model, dictionary, dictionary_target, source, target, outfile,
         wordbyword):
    """Score a parallel corpus with a trained model and store the costs.

    Parameters
    ----------
    model : str
        Parameter archive; options are unpickled from ``'<model>.pkl'``.
    dictionary, dictionary_target :
        Handed to ``TextIterator`` unchanged (they are not loaded here).
    source, target :
        The two sides of the corpus, handed to ``TextIterator``.
    outfile : str
        Destination of the result.
    wordbyword : bool
        When true, the last value returned by ``build_model`` (``cost_``) is
        compiled, ``pred_probs`` is called with ``as_list=True`` and its
        result is pickled to ``outfile``.  Otherwise ``cost`` is compiled and
        the result is stored with ``numpy.save``.

    The theano ``profile`` flag is taken from the module-level name
    ``profile``.
    """
    # model options are pickled next to the model
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # unshuffled iterator so results stay in corpus order
    valid_noshuf = TextIterator(source,
                                target,
                                dictionary,
                                dictionary_target,
                                n_words_source=options['n_words_src'],
                                n_words_target=options['n_words'],
                                batch_size=options['valid_batch_size'],
                                maxlen=2000,
                                shuffle=False)

    # allocate parameters, overwrite them with the stored values and wrap
    # them as theano shared variables
    tparams = init_tparams(load_params(model, init_params(options)))

    (trng, use_noise,
     x, x_mask, y, y_mask,
     opt_ret,
     cost, cost_) = build_model(tparams, options)

    inps = [x, x_mask, y, y_mask]

    # the two modes differ only in the compiled expression, one extra
    # keyword for pred_probs, and how the result is stored
    if wordbyword:
        target_expr = cost_
        extra_kwargs = {'as_list': True}
    else:
        target_expr = cost
        extra_kwargs = {}

    f_log_probs = theano.function(inps, target_expr, profile=profile)
    valid_errs = pred_probs(f_log_probs,
                            prepare_data,
                            options,
                            valid_noshuf,
                            verbose=True,
                            **extra_kwargs)

    if wordbyword:
        with open(outfile, 'wb') as f:
            pkl.dump(valid_errs, f, pkl.HIGHEST_PROTOCOL)
    else:
        numpy.save(outfile, valid_errs)
Esempio n. 37
0
def rescore_model(source_file, nbest_file, saveto, models, options, b, normalize, verbose, alignweights):
    """Append one score per model to every line of an n-best list.

    Each n-best line must look like ``'<source index> ||| <hypothesis> ...'``
    (fields separated by ``' ||| '``).  The referenced source line and the
    hypothesis are paired up through two temporary files, scored by every
    model with ``pred_probs``, and the original line followed by the
    space-separated scores is written to ``saveto``.

    Parameters
    ----------
    source_file, nbest_file : file objects
        Open for reading; both are read completely.
    saveto : file object
        Open for writing; its ``name`` attribute is also used when
        ``alignweights`` is true.
    models, options : sequences
        Parallel sequences of parameter archives and option dicts.
    b : int
        Batch size for the text iterator.
    normalize :
        Forwarded to ``pred_probs``.
    verbose :
        Not used by this function.
    alignweights : bool
        Also compile ``opt_ret['dec_alphas']`` as an output, write the
        alignments returned by ``pred_probs`` to a temporary file and hand it
        to ``combine_source_target_text``.
    """
    # NOTE(review): this stream is replaced by the one build_model returns.
    trng = RandomStreams(1234)

    # one compiled scoring function per model
    fs_log_probs = []
    for model, option in zip(models, options):
        # allocate parameters, overwrite them with the stored values and
        # wrap them as theano shared variables
        tparams = init_tparams(load_params(model, init_params(option)))

        (trng, use_noise,
         x, x_mask, y, y_mask,
         opt_ret,
         cost) = build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            sys.stderr.write("\t*** Save weight mode ON, alignment matrix will be saved.\n")
            outputs = [cost, opt_ret['dec_alphas']]
        else:
            outputs = cost
        fs_log_probs.append(theano.function(inps, outputs))

    def _score(pairs, alignweights=False):
        # run every model over the sentence pairs; returns per-model lists
        scores = []
        alignments = []
        for f_log_probs, option in zip(fs_log_probs, options):
            score, alignment = pred_probs(f_log_probs, prepare_data, option, pairs,
                                          normalize=normalize, alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)
        return scores, alignments

    lines = source_file.readlines()
    nbest_lines = nbest_file.readlines()

    if alignweights:
        # temporary file that collects the alignment output
        temp_name = saveto.name + ".json"
        align_OUT = tempfile.NamedTemporaryFile(prefix=temp_name)

    tmp_in_cm = tempfile.NamedTemporaryFile(prefix='rescore-tmpin')
    tmp_out_cm = tempfile.NamedTemporaryFile(prefix='rescore-tmpout')
    with tmp_in_cm as tmp_in, tmp_out_cm as tmp_out:
        # write one (source, hypothesis) pair per n-best entry
        for nbest_line in nbest_lines:
            fields = nbest_line.split(' ||| ')
            src_idx = int(fields[0])  # 0-based index into the source file
            tmp_in.write(lines[src_idx])
            tmp_out.write(fields[1] + '\n')

        tmp_in.seek(0)
        tmp_out.seek(0)

        first = options[0]
        # NOTE(review): the source side receives the slice [:-1] while the
        # target side receives the single entry [1] -- confirm intended.
        # Sorting by length is off so scores stay aligned with the n-best list.
        pairs = TextIterator(tmp_in.name, tmp_out.name,
                             first['dictionaries'][:-1], first['dictionaries'][1],
                             n_words_source=first['n_words_src'],
                             n_words_target=first['n_words'],
                             batch_size=b,
                             maxlen=float('inf'),
                             sort_by_length=False)

        scores, alignments = _score(pairs, alignweights)

        for i, nbest_line in enumerate(nbest_lines):
            score_str = ' '.join([str(model_scores[i]) for model_scores in scores])
            saveto.write('{0} {1}\n'.format(nbest_line.strip(), score_str))

        # optional save weights mode
        if alignweights:
            for alignment_line in alignments:
                align_OUT.write(alignment_line + "\n")

    if alignweights:
        combine_source_target_text(source_file, nbest_file, saveto.name, align_OUT)
        align_OUT.close()
Esempio n. 38
0
def rescore_model(source_file, target_file, saveto, models, options, b, normalize, verbose, alignweights):
    """Score source/target line pairs with every model and write the scores.

    One scoring function is compiled per (model, option) pair.  Each line of
    ``target_file`` is then written to ``saveto`` (stripped), followed by the
    matching entry of every model's ``pred_probs`` result, space separated.

    Args:
        source_file: open file object with the source sentences; its
            ``name`` is handed to ``TextIterator``.
        target_file: open file object with the target sentences; its
            ``name`` is handed to ``TextIterator`` and its lines are echoed
            to ``saveto``.
        saveto: writable file object receiving the scored lines; its
            ``name`` is reused for the alignment output.
        models: values passed to ``load_params``, iterated in step with
            ``options``.
        options: one option dict per model.  ``options[0]`` supplies the
            dictionaries and vocabulary sizes used to read the data.
        b: batch size handed to ``TextIterator``.
        normalize: forwarded unchanged to ``pred_probs``.
        verbose: accepted as part of the signature; not read in this function.
        alignweights: when truthy, the compiled functions additionally
            output ``opt_ret['dec_alphas']`` and the alignment export at the
            end of the function is executed.
    """
    # NOTE: build_model() below hands back its own trng, which rebinds this name.
    trng = RandomStreams(1234)

    fs_log_probs = []
    for model, option in zip(models, options):
        # allocate the parameters, overwrite them with the stored values and
        # wrap the result as theano shared variables
        tparams = init_tparams(load_params(model, init_params(option)))

        (trng, use_noise,
         x, x_mask, y, y_mask,
         opt_ret, cost) = build_model(tparams, option)
        use_noise.set_value(0.)

        # in save-weight mode the compiled function also returns 'dec_alphas'
        if alignweights:
            print("\t*** Save weight mode ON, alignment matrix will be saved.")
            outputs = [cost, opt_ret['dec_alphas']]
        else:
            print("\t*** Save weight mode OFF, alignment matrix will not be saved.")
            outputs = cost

        fs_log_probs.append(theano.function([x, x_mask, y, y_mask], outputs))

    def _score(pairs, alignweights=False):
        # run every compiled scoring function over the same data iterator,
        # pairing each function with the option dict at the same position
        return [pred_probs(scorer, prepare_data, options[model_idx], pairs,
                           normalize=normalize, alignweights=alignweights)
                for model_idx, scorer in enumerate(fs_log_probs)]

    # the first model's options decide how the text files are read
    base_option = options[0]
    pairs = TextIterator(source_file.name, target_file.name,
                         base_option['dictionaries'][0], base_option['dictionaries'][1],
                         n_words_source=base_option['n_words_src'],
                         n_words_target=base_option['n_words'],
                         batch_size=b,
                         maxlen=float('inf'),
                         sort_by_length=False)  # TODO: sorting by length could be more efficient, but we'd have to synchronize scores with n-best list after

    scores = _score(pairs, alignweights)

    source_file.seek(0)
    target_file.seek(0)
    # the source lines themselves are not used below, but the read still
    # moves the file position of source_file, so the call is kept
    source_file.readlines()
    target_lines = target_file.readlines()

    for line_no, line in enumerate(target_lines):
        score_str = ' '.join(str(model_scores[line_no]) for model_scores in scores)
        saveto.write('{0} {1}\n'.format(line.strip(), score_str))

    # everything below belongs to the optional save-weights mode
    if not alignweights:
        return

    # write the alignments to a temporary file, then merge them with the
    # actual source and target words
    temp_name = saveto.name + ".json"
    with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
        # NOTE(review): all_alignments is not assigned anywhere in this
        # function; presumably it is expected at module scope -- confirm,
        # otherwise this line raises NameError whenever alignweights is set.
        for line in all_alignments:
            align_OUT.write(line + "\n")
        combine_source_target_text_1to1(source_file, target_file, saveto.name, align_OUT)