Ejemplo n.º 1
0
    def _score(pairs):
        """Score *pairs* with every model in the ensemble.

        Returns one score array per entry in ``fs_log_probs`` (closure
        variables: ``fs_log_probs``, ``options``, ``normalize``).
        """
        return [
            pred_probs(log_probs_fn, prepare_data, options[model_idx],
                       pairs, normalize=normalize)
            for model_idx, log_probs_fn in enumerate(fs_log_probs)
        ]
Ejemplo n.º 2
0
    def _score(pairs, alignweights=False):
        """Score *pairs* with each ensemble model.

        When *alignweights* is set, the per-model results also carry
        alignment weights (as returned by ``pred_probs``).
        """
        return [
            pred_probs(log_probs_fn, prepare_data, options[model_idx],
                       pairs, normalize=normalize, alignweights=alignweights)
            for model_idx, log_probs_fn in enumerate(fs_log_probs)
        ]
Ejemplo n.º 3
0
    def _score(pairs, alignweights=False):
        """Score *pairs* with each ensemble model.

        Returns two parallel lists: per-model scores and per-model
        alignment matrices.
        """
        per_model = [
            pred_probs(log_probs_fn, prepare_data, options[model_idx],
                       pairs, normalize=normalize, alignweights=alignweights)
            for model_idx, log_probs_fn in enumerate(fs_log_probs)
        ]
        model_scores = [result[0] for result in per_model]
        model_alignments = [result[1] for result in per_model]
        return model_scores, model_alignments
Ejemplo n.º 4
0
    def _score(pairs, alignweights=False):
        """Load each rescoring model in turn and score *pairs* with it.

        Returns parallel lists of scores and alignments, one entry per
        model listed in ``rescorer_settings.models``.
        """
        model_scores, model_alignments = [], []
        for model_idx, model_path in enumerate(rescorer_settings.models):
            scorer_fn = load_scorer(model_path, options[model_idx],
                                    alignweights=alignweights)
            batch_score, batch_alignment = pred_probs(
                scorer_fn, prepare_data, options[model_idx], pairs,
                normalization_alpha=rescorer_settings.normalization_alpha,
                alignweights=alignweights)
            model_scores.append(batch_score)
            model_alignments.append(batch_alignment)
        return model_scores, model_alignments
Ejemplo n.º 5
0
def get_error(model, test_src, test_target):
    """Score the (test_src, test_target) corpus with a trained model and
    log the mean per-sentence cost.

    Parameters
    ----------
    model : str
        Path to the trained parameters; ``<model>.pkl`` must hold the
        pickled training options.
    test_src, test_target : str
        Paths to the source / target side of the evaluation corpus.

    Returns
    -------
    The mean validation error (also logged).
    """
    profile = False

    # reload options -- fix: context manager closes the pickle file
    # (the original opened it and never closed the handle)
    with open('%s.pkl' % model, 'rb') as f:
        model_options = pkl.load(f)
    logging.info(model_options)

    logging.info('Building model')
    params = init_params(model_options)

    # reload parameters and wrap them as theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    dict_src = os.path.join(model_options['baseDir'],
                            model_options['dictionaries'][0])
    # with a single dictionary the target side gets None
    if len(model_options['dictionaries']) == 1:
        dict_target = None
    else:
        dict_target = os.path.join(model_options['baseDir'],
                                   model_options['dictionaries'][1])

    valid = TextIterator(test_src,
                         test_target,
                         dict_src,
                         dict_target,
                         n_words_source=model_options['n_words_src'],
                         n_words_target=model_options['n_words'],
                         batch_size=model_options['valid_batch_size'],
                         maxlen=model_options['maxlen'])

    logging.info('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=profile)
    valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid)
    valid_err = valid_errs.mean()
    logging.info('Valid Error:%s' % (str(valid_err)))
    # return the score so callers can use it programmatically
    return valid_err
Ejemplo n.º 6
0
def get_error(model, test_src, test_target):
    """Score the (test_src, test_target) corpus with a trained model and
    log the mean per-sentence cost.

    ``<model>.pkl`` must hold the pickled training options; *model*
    itself holds the parameters.

    Returns the mean validation error (also logged).
    """
    profile = False

    # reload options -- fix: context manager closes the pickle file
    # (the original opened it and never closed the handle)
    with open('%s.pkl' % model, 'rb') as f:
        model_options = pkl.load(f)
    logging.info(model_options)

    logging.info('Building model')
    params = init_params(model_options)

    # reload parameters and wrap them as theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    dict_src = os.path.join(model_options['baseDir'], model_options['dictionaries'][0])
    # with a single dictionary the target side gets None
    if len(model_options['dictionaries']) == 1:
        dict_target = None
    else:
        dict_target = os.path.join(model_options['baseDir'], model_options['dictionaries'][1])

    valid = TextIterator(test_src, test_target,
                         dict_src,
                         dict_target,
                         n_words_source=model_options['n_words_src'],
                         n_words_target=model_options['n_words'],
                         batch_size=model_options['valid_batch_size'],
                         maxlen=model_options['maxlen'])

    logging.info('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=profile)
    valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
    valid_err = valid_errs.mean()
    logging.info('Valid Error:%s'% (str(valid_err)))
    # return the score so callers can use it programmatically
    return valid_err
Ejemplo n.º 7
0
    def _score(pairs, alignweights=True):
        """Score *pairs* with every model, also collecting alignment
        weights and per-word costs.

        Returns three parallel lists (scores, alignments, word costs),
        one entry per model in ``models``.
        """
        model_scores = []
        model_alignments = []
        model_word_costs = []
        for model_idx, model_path in enumerate(models):
            scorer_fn = load_scorer(model_path,
                                    options[model_idx],
                                    alignweights=alignweights)

            # TODO: make multi ?
            batch_score, batch_alignments, batch_word_cost = pred_probs(
                scorer_fn,
                prepare_data,
                options[model_idx],
                pairs,
                normalization_alpha=normalization_alpha,
                alignweights=alignweights)

            model_scores.append(batch_score)
            model_alignments.append(batch_alignments)
            model_word_costs.append(batch_word_cost)

        return model_scores, model_alignments, model_word_costs
Ejemplo n.º 8
0
def main(model,
         src_dict,
         target_dict,
         source_file,
         target_file,
         saveto,
         source_word_level=1,
         target_word_level=0,
         valid_batch_size=128,
         n_words_src=302,
         n_words=302):
    from char_base import (init_params, build_model, build_sampler)
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from nmt import (pred_probs, prepare_data)

    # load model model_options
    pkl_file = model.split('.')[0] + '.pkl'
    with open(pkl_file, 'rb') as f:
        options = pkl.load(f)

    trng = RandomStreams(1234)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)

    # create shared variables for parameters
    tparams = init_tparams(params)

    trng, use_noise, \
    x, x_mask, y, y_mask, \
    opt_ret, \
    cost = \
        build_model(tparams, options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler...\n',
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)
    print 'Done'

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost)
    print 'Done'

    print('Preparing dataset...')
    dataset = TextIterator(source=source_file,
                           target=target_file,
                           source_dict=src_dict,
                           target_dict=target_dict,
                           n_words_source=n_words_src,
                           n_words_target=n_words,
                           source_word_level=source_word_level,
                           target_word_level=target_word_level,
                           batch_size=valid_batch_size,
                           sort_size=sort_size)

    print('Predicting probs...')
    log_probs = pred_probs(f_log_probs,
                           prepare_data,
                           options,
                           dataset,
                           verboseFreq=10000)
    print('Done...')
    output_file = open(saveto, 'w')
    pwd_cnt = 0
    for line in open(target_file):
        output_file.writelines(line.rstrip() + '\t' +
                               str(1.0 / (math.e**log_probs[pwd_cnt])) + '\n')
        pwd_cnt += 1
    """
    for prob in log_probs:
        output_file.writelines(str(prob) + '\n')
    """
    output_file.flush()
    output_file.close()
    print('Evaluation finished...')
Ejemplo n.º 9
0
def main(model_dir, model_pkl, model_grads, dict_src, dict_trg, hyp_filename,
         saveto, n_words_src, n_words, workdir):

    print 'Loading model.'

    model_file = os.path.join(model_dir, model_pkl)
    with open(model_file, 'rb') as f:
        model_options = pkl.load(f)

    param_file = os.path.join(model_dir, model_grads)
    params = init_params(model_options)
    params = load_params(param_file, params)
    tparams = init_tparams(params)

    # load dictionary and invert
    with open(dict_src, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    with open(dict_trg, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk

    temp_dir = workdir
    print 'Using temp directory', temp_dir
    hyp_src_fname = os.path.join(
        temp_dir,
        '%s.src.%d' % (os.path.basename(hyp_filename), int(time.time())))
    hyp_trg_fname = os.path.join(
        temp_dir,
        '%s.trg.%d' % (os.path.basename(hyp_filename), int(time.time())))
    print 'hyp temp:', hyp_src_fname
    print 'hyp temp:', hyp_trg_fname

    hyp_src = open(hyp_src_fname, 'w')
    hyp_trg = open(hyp_trg_fname, 'w')
    with open(hyp_filename, 'r') as f:
        for line in f:
            toks = line.strip().split('\t')
            hyp_src.write('%s\n' % toks[0].strip())
            hyp_trg.write('%s\n' % toks[1].strip())
    hyp_src.close()
    hyp_trg.close()

    test = TextIterator(source=hyp_src_fname,
                        target=hyp_trg_fname,
                        source_dict=dict_src,
                        target_dict=dict_trg,
                        n_words_source=n_words_src,
                        n_words_target=n_words,
                        source_word_level=0,
                        target_word_level=0,
                        batch_size=1,
                        sort_size=1)  #?? dunno what this param does

    print 'Building model...\n',
    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]
    '''
  # TODO maybe don't need this
  f_init, f_next = build_sampler(tparams, model_options, trng, use_noise)
  '''

    print 'Building f_log_probs...'
    f_log_probs = theano.function(inps, cost, profile=profile)
    use_noise.set_value(0.)

    test_scores = pred_probs(f_log_probs, prepare_data, model_options, test, 5)
    print test_scores.mean()

    os.remove(hyp_src_fname)
    os.remove(hyp_trg_fname)

    test_scores = [str(f) for f in test_scores]

    with open(saveto, 'w') as f:
        f.write(u'\n'.join(test_scores).encode('utf-8'))
        f.write(u'\n')

    print "Done", saveto
Ejemplo n.º 10
0
def main(model,
         pklmodel,
         valid_datasets=['../data/dev/newstest2011.en.tok',
                          '../data/dev/newstest2011.fr.tok'],
         dictionaries=[
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'],
         dictionary_chunk='/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
         result_file='./cost.result'):
    # Evaluate a chunk-based NMT model on a validation set and write the
    # mean word cost and mean chunk-word cost to result_file.
    # NOTE(review): the list defaults above are mutable default arguments
    # (shared across calls) -- harmless here only if never mutated.




    # load the dictionaries of both source and target
    # load dictionaries and invert them
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        with open(dd, 'rb') as f:
            worddicts[ii] = pkl.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # dict for chunk label (id -> label via the inverted copy)
    worddict_chunk = [None]
    worddict_r_chunk = [None]
    with open(dictionary_chunk, 'rb') as f:
        worddict_chunk = pkl.load(f)
    worddict_r_chunk = dict()
    for kk, vv in worddict_chunk.iteritems():
        worddict_r_chunk[vv] = kk
    print worddict_chunk

    print 'load model model_options'
    with open('%s' % pklmodel, 'rb') as f:
        options = pkl.load(f)


    # build valid set
    valid = TrainingTextIterator(valid_datasets[0], valid_datasets[1],
                                 dictionaries[0], dictionaries[1], dictionary_chunk,
                                 n_words_source=options['n_words_src'], n_words_target=options['n_words'],
                                 batch_size=options['batch_size'],
                                 max_chunk_len=options['maxlen_chunk'], max_word_len=options['maxlen_chunk_words'])


    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build_model returns inputs plus two costs: per-chunk and per-word
    trng, use_noise, \
    x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator, \
    opt_ret, \
    cost, cost_cw= \
        build_model(tparams, options)


    inps = [x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator]



    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=False)
    f_log_probs_cw = theano.function(inps, cost_cw, profile=False)
    print 'Done'

    valid_errs, valid_errs_cw = pred_probs(f_log_probs, f_log_probs_cw, prepare_training_data,
                                            options, valid)

    valid_err = valid_errs.mean()
    valid_err_cw = valid_errs_cw.mean()

    # NOTE(review): 'result_file' (the path parameter) is shadowed by the
    # open file object inside this with-block -- works, but confusing.
    with open(result_file, 'w') as result_file:
        print >> result_file, valid_err, valid_err_cw
Ejemplo n.º 11
0
def main(model, dictionary, dictionary_target, source, target, outfile,
         wordbyword):
    """Score a parallel corpus (source, target) with a trained model.

    With *wordbyword* set, per-word cost lists are pickled to *outfile*;
    otherwise per-sentence costs are saved with numpy.
    """

    # load model options pickled at training time
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # (fix: removed a large dictionary-inversion block that was disabled
    # by wrapping it in a string literal -- dead code)
    valid_noshuf = TextIterator(source,
                                target,
                                dictionary,
                                dictionary_target,
                                n_words_source=options['n_words_src'],
                                n_words_target=options['n_words'],
                                batch_size=options['valid_batch_size'],
                                maxlen=2000,
                                shuffle=False)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build_model yields two costs: sentence-level and word-level (cost_)
    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost, cost_ = \
        build_model(tparams, options)

    inps = [x, x_mask, y, y_mask]

    # NOTE(review): 'profile' is a free variable -- presumably defined at
    # module level; confirm before running this standalone.
    if wordbyword:
        f_log_probs = theano.function(inps, cost_, profile=profile)
        valid_errs = pred_probs(f_log_probs,
                                prepare_data,
                                options,
                                valid_noshuf,
                                verbose=True,
                                as_list=True)
        with open(outfile, 'wb') as f:
            pkl.dump(valid_errs, f, pkl.HIGHEST_PROTOCOL)
    else:
        f_log_probs = theano.function(inps, cost, profile=profile)
        valid_errs = pred_probs(f_log_probs,
                                prepare_data,
                                options,
                                valid_noshuf,
                                verbose=True)
        numpy.save(outfile, valid_errs)