# Example no. 1
# 0
def main(model,
         dictionary_target,
         source_fea,
         source_latex,
         saveto,
         wer_file,
         k=5):
    """Decode an evaluation set with a trained model and report WER/ExpRate.

    Parameters
    ----------
    model : str
        Path prefix of the trained model; '<model>.pkl' holds its options.
    dictionary_target : str
        Path to the target-symbol dictionary.
    source_fea, source_latex : str
        Feature file and ground-truth LaTeX file of the evaluation set.
    saveto : str
        Path where the decoded hypotheses are written (one line per sample).
    wer_file : str
        Path where the external compute-wer.py script writes its report.
    k : int
        Beam width for beam-search decoding.
    """
    # load model options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load target dictionary and build the inverse index -> symbol table
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.items():
        worddicts_r[vv] = kk

    valid, valid_uid_list = dataIterator_valid(source_fea,
                                               source_latex,
                                               worddicts,
                                               batch_size=1,
                                               maxlen=2000)

    trng = RandomStreams(1234)

    # allocate parameters, load the trained weights, compile the samplers
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)
    f_init, f_next = build_sampler(tparams, options, trng)

    valid_count_idx = 0

    print('Decoding...')
    ud_epoch_start = time.time()
    # context manager so the hypothesis file is closed even on error
    with open(saveto, 'w') as fpp_sample:
        for x, y in valid:
            for xx in x:
                print('%d : %s' % (valid_count_idx + 1,
                                   valid_uid_list[valid_count_idx]))
                # pad one extra all-zero frame at the end of the sequence
                xx_pad = numpy.zeros((xx.shape[0] + 1, xx.shape[1]),
                                     dtype='float32')
                xx_pad[:xx.shape[0], :] = xx
                stochastic = False
                sample, score = gen_sample(f_init,
                                           f_next,
                                           xx_pad[:, None, :],
                                           options,
                                           trng=trng,
                                           k=k,
                                           maxlen=1000,
                                           stochastic=stochastic,
                                           argmax=False)

                if stochastic:
                    ss = sample
                else:
                    # length-normalize scores; keep the lowest-cost beam
                    score = score / numpy.array([len(s) for s in sample])
                    ss = sample[score.argmin()]

                fpp_sample.write(valid_uid_list[valid_count_idx])
                valid_count_idx += 1
                for vv in ss:
                    if vv == 0:  # <eol>
                        break
                    fpp_sample.write(' ' + worddicts_r[vv])
                fpp_sample.write('\n')
    ud_epoch = (time.time() - ud_epoch_start) / 60.
    print('test set decode done, cost time ... %s' % ud_epoch)

    # score the hypotheses with the external WER script
    os.system('python compute-wer.py ' + saveto + ' ' + source_latex + ' ' +
              wer_file)
    with open(wer_file) as fpp:
        stuff = fpp.readlines()
    m = re.search('WER (.*)\n', stuff[0])
    valid_per = 100. * float(m.group(1))
    m = re.search('ExpRate (.*)\n', stuff[1])
    valid_sacc = 100. * float(m.group(1))

    print('Valid WER: %.2f%%, ExpRate: %.2f%%' % (valid_per, valid_sacc))
# Example no. 2
# 0
def main(model, bn_model, dictionary_target, fea, latex, saveto, output, k=5):
    """Decode an image test set with a trained model (plus batch-norm
    parameters) and report WER/ExpRate.

    Parameters
    ----------
    model : str
        Path prefix of the trained model; '<model>.pkl' holds its options.
    bn_model : str
        Path to the saved batch-normalization parameters.
    dictionary_target : str
        Path to the target-symbol dictionary.
    fea, latex : str
        Image-feature file and ground-truth LaTeX file of the test set.
    saveto : str
        Path where the decoded hypotheses are written (one line per sample).
    output : str
        Path where the external compute-wer.py script writes its report.
    k : int
        Beam width for beam-search decoding.
    """
    # load model options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load target dictionary and build the inverse index -> symbol table
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.items():
        worddicts_r[vv] = kk

    valid, valid_uid_list = dataIterator(fea,
                                         latex,
                                         worddicts,
                                         batch_size=1,
                                         batch_Imagesize=500000,
                                         maxlen=500,
                                         maxImagesize=500000)

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)
    bn_params = init_bn_params(options)
    # load model parameters and set theano shared variables
    params = load_params(model, params)
    bn_params = load_params(bn_model, bn_params)
    tparams = init_tparams(params)
    bn_tparams = init_tparams(bn_params)
    f_init, f_next = build_sampler(tparams, bn_tparams, options, trng,
                                   use_noise)

    use_noise.set_value(0.)  # disable noise/dropout at decode time

    valid_count_idx = 0
    # FIXME: random selection?
    print('Decoding ... ')
    # context manager so the hypothesis file is closed even on error
    with open(saveto, 'w') as fpp_sample:
        for x, y in valid:
            for xx in x:
                print('%d : %s' % (valid_count_idx + 1,
                                   valid_uid_list[valid_count_idx]))
                # input_channels * height * width, pixel values scaled to [0, 1]
                xx_pad = numpy.zeros((xx.shape[0], xx.shape[1], xx.shape[2]),
                                     dtype='float32')
                xx_pad[:, :, :] = xx / 255.
                stochastic = False
                # BUG FIX: the beam width was hard-coded to 10 here, silently
                # ignoring the k argument; it now honours k (default 5).
                sample, score = gen_sample(f_init,
                                           f_next,
                                           xx_pad[None, :, :, :],
                                           options,
                                           trng=trng,
                                           k=k,
                                           maxlen=1000,
                                           stochastic=stochastic,
                                           argmax=False)

                if stochastic:
                    ss = sample
                else:
                    # length-normalize scores; keep the lowest-cost beam
                    score = score / numpy.array([len(s) for s in sample])
                    ss = sample[score.argmin()]

                fpp_sample.write(valid_uid_list[valid_count_idx])
                valid_count_idx += 1
                for vv in ss:
                    if vv == 0:  # <eol>
                        break
                    fpp_sample.write(' ' + worddicts_r[vv])
                fpp_sample.write('\n')
    print('test set decode done')

    # score the hypotheses with the external WER script
    os.system('python compute-wer.py ' + saveto + ' ' + latex + ' ' + output)
    with open(output) as fpp:  # report lines look like '%WER 31.63'
        stuff = fpp.readlines()
    m = re.search('WER (.*)\n', stuff[0])
    valid_per = 100. * float(m.group(1))
    m = re.search('ExpRate (.*)\n', stuff[1])
    valid_sacc = 100. * float(m.group(1))

    print('Valid WER: %.2f%%, ExpRate: %.2f%%' % (valid_per, valid_sacc))
# Example no. 3
# 0
def main(model_files, dictionary_target, grammar_target, data_path, saveto,
         wer_file, k=5):
    """Decode a test set with an ensemble of models under a grammar constraint.

    Writes the k best hypotheses per sample to '<saveto>.<beam>' (one file per
    beam rank) and reports WER/ExpRate for every rank via compute-wer.py.

    Parameters
    ----------
    model_files : list of str
        Path prefixes of the ensemble members; '<prefix>.pkl' holds options.
    dictionary_target : str
        Path to the target-symbol dictionary.
    grammar_target : str
        Path to the grammar file, compiled against the dictionary.
    data_path : str
        Directory with the evaluation data; must contain 'caption.txt'.
    saveto : str
        Output path prefix for the per-rank hypothesis files.
    wer_file : str
        Path where compute-wer.py writes its report (overwritten per beam).
    k : int
        Beam width; also the number of per-rank output files.
    """
    # load target dictionary and build the inverse index -> symbol table
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.items():
        worddicts_r[vv] = kk
    grammar = compileGrammar(loadGrammar(grammar_target, worddicts))

    trng = RandomStreams(1234)

    # build each ensemble member: (f_init, f_next, options, ensemble weight)
    # NOTE(review): a never-executed language-model loading loop (iterating
    # over []) was removed here; restore it by appending additional
    # (f_init, f_next, options, weight) entries to `models`.
    models = []
    for model_file in model_files:
        print('Loading model: %s' % model_file)
        with open('%s.pkl' % model_file, 'rb') as f:
            options = pkl.load(f)
        print(options)
        params = init_params(options)
        params = load_params(model_file, params)
        tparams = init_tparams(params)
        f_init, f_next = build_sampler(tparams, options, trng)
        models.append((f_init, f_next, options, 0.8))

    valid, valid_uid_list = dataIterator_valid(data_path,
                                               worddicts,
                                               batch_size=1,
                                               maxlen=250)

    # one hypothesis file per beam rank; closed in the finally block below
    fpp_sample = [open('%s.%d' % (saveto, beam), 'w') for beam in range(k)]

    valid_count_idx = 0

    print('Decoding...')
    ud_epoch_start = time.time()

    try:
        for x, y in valid:
            for xx in x:
                print('%d : %s' % (valid_count_idx + 1,
                                   valid_uid_list[valid_count_idx]))
                # pad one extra all-zero frame at the end of the sequence
                xx_pad = numpy.zeros((xx.shape[0] + 1, xx.shape[1]),
                                     dtype='float32')
                xx_pad[:xx.shape[0], :] = xx
                sample, score = gen_sample(models,
                                           xx_pad[:, None, :],
                                           grammar,
                                           trng=trng, k=k,
                                           maxlen=250,
                                           dictlen=len(worddicts),
                                           stochastic=False,
                                           argmax=False)
                # length-normalize scores and rank beams from best to worst
                score = score / numpy.array([len(s) for s in sample])
                sample_rank = numpy.argsort(score)
                for beam in range(k):
                    fpp_sample[beam].write(valid_uid_list[valid_count_idx])
                    if len(sample) > beam:
                        ss = sample[sample_rank[beam]]
                    else:
                        # fewer finished beams than k: emit an empty hypothesis
                        ss = [0]

                    for vv in ss:
                        if vv == 0:  # <eol>
                            break
                        fpp_sample[beam].write(' ' + worddicts_r[vv])
                    fpp_sample[beam].write('\n')
                valid_count_idx += 1
    finally:
        # close every per-rank file even if decoding aborts
        for fh in fpp_sample:
            fh.close()

    ud_epoch = (time.time() - ud_epoch_start)
    print('test set decode done, cost time ... %s' % ud_epoch)
    # score every beam rank with the external WER script
    for beam in range(k):
        os.system('python compute-wer.py %s.%d %s %s' %
                  (saveto, beam, os.path.join(data_path, "caption.txt"),
                   wer_file))
        with open(wer_file) as fpp:
            stuff = fpp.readlines()
        m = re.search('WER (.*)\n', stuff[0])
        valid_per = 100. * float(m.group(1))
        m = re.search('ExpRate (.*)\n', stuff[1])
        valid_sacc = 100. * float(m.group(1))

        print('%d Valid WER: %.2f%%, ExpRate: %.2f%%' %
              (beam, valid_per, valid_sacc))