Ejemplo n.º 1
0
def pred(_log, _config):
    """Score every saved per-epoch weight file of a trained model on the test year.

    For each usable ``.h5`` file in the model's weight directory the model is
    rebuilt from that file, the test documents are scored, the run is
    evaluated with ``eval_run`` and written with ``print_run``. Epochs that
    already have a ``.run`` file in the output directory are not recomputed;
    their metrics are parsed back out of the file name. Finally the collected
    per-epoch metrics are handed to ``plot_curve``.

    Args:
        _log: logger object; ``info`` and ``error`` are called on it.
        _config: mapping of experiment parameters. Keys read here:
            ``modelfn``, ``seed``, ``parentdir``, ``train_years``,
            ``expname``, ``test_year``, ``winlen``, ``qproximity``,
            ``maxqlen``, ``xfilters``, ``distill`` and ``epochs``. The whole
            mapping is also forwarded to ``load_test_data`` and ``plot_curve``.

    Raises:
        AssertionError: if the weight or detail directory does not exist.
        SoftFailure: if the weight directory is empty, if it holds no usable
            weight file, or if the newest epoch found is more than three
            epochs short of ``_config['epochs']``.
        OSError: if an output directory is missing and cannot be created.
    """
    p = _config

    modelname = file2name[p['modelfn']]
    mod_model = importlib.import_module('models.%s' % p['modelfn'])
    model_cls = getattr(mod_model, modelname)
    model_params = {k: v for k, v in p.items() if k in model_cls.params or k == 'modelfn'}
    model = model_cls(model_params, rnd_seed=p['seed'])
    expid = model.params_to_string(model_params)

    outdir_plot = trunc_dir('%s/train_%s/%s/predict_per_epoch/test_%s' % (
        p['parentdir'], p['train_years'], p['expname'], p['test_year']))
    outdir_run = trunc_dir('%s/%s' % (outdir_plot, expid))
    tmp_dir = trunc_dir(os.path.join(outdir_run, 'tmp'))
    weight_dir = trunc_dir('%s/train_%s/%s/model_weight/%s' % (
        p['parentdir'], p['train_years'], p['expname'], expid))
    detail_outdir = trunc_dir('%s/train_%s/%s/model_detail/' % (
        p['parentdir'], p['train_years'], p['expname']))

    assert os.path.isdir(weight_dir), "weight_dir " + weight_dir + " does not exist. Make sure you trained the model."
    # The original check referenced an undefined name (detail_dir) and raised
    # NameError; the directory computed above is detail_outdir.
    assert os.path.isdir(detail_outdir), "detail_dir " + detail_outdir + " does not exist. Make sure you trained the model."

    if len(os.listdir(weight_dir)) < 1:
        raise SoftFailure('weight dir empty')

    # Create each output directory on its own, so that a re-run with an
    # existing outdir_run but a missing tmp dir still gets its tmp dir.
    for needed_dir in (outdir_run, tmp_dir):
        if os.path.isdir(needed_dir):
            continue
        try:
            os.makedirs(needed_dir)
        except OSError:
            # Losing a creation race against another process is fine; a
            # directory that is still missing afterwards is a real failure.
            if not os.path.isdir(needed_dir):
                raise
    _log.info('Processing {0}'.format(outdir_run))
    ###################
    topk4eval = 20
    NGRAM_NFILTER, N_GRAMS = get_ngram_nfilter(p['winlen'], p['qproximity'], p['maxqlen'], p['xfilters'])

    _log.info('process {0} and output to {1}'.format(weight_dir, outdir_run))
    _log.info('{0} {1} {2} {3} {4}'.format(p['distill'], 'NGRAM_NFILTER', NGRAM_NFILTER, 'N_GRAMS', N_GRAMS))

    # prepare test data: query ids and relevance labels of the test year
    qids = get_train_qids(p['test_year'])
    qrelf = get_qrelf(qrelfdir, p['test_year'])
    qid_cwid_label = read_qrel(qrelf, qids, include_spam=False)
    test_qids = [qid for qid in qids if qid in qid_cwid_label]
    _log.info('%s test_num %d '%(p['test_year'], len(test_qids)))

    f_ndcg = dict()
    f_epochs = set()
    # sort weights by time and only use the first weights for each epoch
    # (in case there are duplicate weights from a failed/re-run train)
    for f in sorted(os.listdir(weight_dir),
                    key=lambda x: os.path.getctime(os.path.join(weight_dir, x))):
        if f.split('.')[-1] != 'h5':
            continue
        # weight files are named <epoch>_<loss>_<n_batch>_<n_samples>.h5
        cols = f.split('.')[0].split('_')
        if len(cols) == 4:
            nb_epoch, loss, n_batch, n_samples = int(cols[0]), int(cols[1]), int(cols[2]), int(cols[3])
            if nb_epoch <= p['epochs'] and nb_epoch not in f_epochs:
                f_epochs.add(nb_epoch)
                f_ndcg[f] = (nb_epoch, loss, n_batch, n_samples)

    # Without this guard the final max(f_epochs) raised ValueError when the
    # directory held files but none of them was a usable weight file.
    if not f_epochs:
        raise SoftFailure('no usable weight files found in %s' % weight_dir)

    finished_epochs = {}
    for fn in sorted(os.listdir(outdir_run),
                     key=lambda x: os.path.getctime(os.path.join(outdir_run, x))):
        if fn.endswith(".run"):
            # run files are named <epoch>_<ndcg>_<map>_<err>_<loss>.run
            fields = fn[:-4].split("_") # trim .run
            assert len(fields) == 5

            epoch, loss = int(fields[0]), int(fields[4])
            ndcg, mapv, err = float(fields[1]), float(fields[2]), float(fields[3])

            if epoch in finished_epochs:
                _log.error("TODO two weights exist for same epoch")
            # same tuple layout as the entries appended in the loop below
            finished_epochs[epoch] = (epoch, err, ndcg, mapv, loss)

    _log.info('skipping finished epochs: {0}'.format(finished_epochs))

    def model_pred(NGRAM_NFILTER, weight_file, test_data, test_docids, test_qids):
        """Rebuild the model from ``weight_file`` and score the test data."""
        dump_modelplot(model.build(), detail_outdir + 'predplot_' + expid)
        model_predict = model.build_from_dump(weight_file)
        qid_cwid_pred = pred_label(model_predict, test_data, test_docids, test_qids)
        return qid_cwid_pred

    test_doc_vec, test_docids, test_qids = load_test_data(qids, rawdoc_mat_dir, qid_cwid_label, N_GRAMS, p)
    epoch_err_ndcg_loss = list()
    _log.info('start {0} {1} {2}'.format(expid, p['train_years'], p['test_year']))
    # walk the weight files in ascending epoch order
    for f in sorted(f_ndcg, key=lambda x: f_ndcg[x][0]):
        nb_epoch, loss, n_batch, n_samples = f_ndcg[f]
        if nb_epoch in finished_epochs:
            # reuse the metrics parsed from the existing .run file
            epoch_err_ndcg_loss.append(finished_epochs[nb_epoch])
            continue
        weight_file = os.path.join(weight_dir, f)
        qid_cwid_pred = model_pred(NGRAM_NFILTER, weight_file, test_doc_vec, test_docids, test_qids)
        ndcg20, err20, mapv = eval_run(_log, qid_cwid_pred, expid, perlf, treceval, tmp_dir, topk4eval, qrelf)
        loss = int(loss)
        out_name = '%d_%0.4f_%0.4f_%0.4f_%d.run' % (nb_epoch, ndcg20, mapv, err20, loss)
        epoch_err_ndcg_loss.append((nb_epoch, err20, ndcg20, mapv, loss))
        print_run(qid_cwid_pred, outdir_run, out_name, expid)
        _log.info('finished {0}'.format(f))
    _log.info('finish {0} {1} {2}'.format(expid, p['train_years'], p['test_year']))

    plot_curve(epoch_err_ndcg_loss, outdir_plot, expid, p)

    # NOTE(review): a shortfall of up to three epochs is tolerated; the reason
    # for that margin is not visible in this function.
    last_epoch = max(f_epochs)
    if last_epoch < p['epochs'] - 3:
        raise SoftFailure("prediction finished, but not all epochs are available yet. last epoch found: %s" % last_epoch)
Ejemplo n.º 2
0
def main(_log, _config):
    """Train the configured model and dump its weights, loss plot and history.

    The model class is looked up from ``_config['modelfn']``, built, and
    trained with ``fit_generator`` on a generator returned by
    ``load_train_data_generator``. A ``DumpWeight`` callback is attached for
    the weight directory. After training, the per-epoch training loss is
    plotted to a PDF and the Keras history dict is pickled, both under the
    model-detail directory.

    Args:
        _log: logger object; ``info`` is called on it.
        _config: mapping of experiment parameters. Keys read here:
            ``modelfn``, ``seed``, ``parentdir``, ``train_years``,
            ``expname``, ``winlen``, ``qproximity``, ``maxqlen``,
            ``xfilters``, ``distill``, ``epochs``, ``batch`` and
            ``nsamples``. The whole mapping is also forwarded to
            ``load_train_data_generator``.

    Returns:
        None. Returns early, without training, when the weight directory
        already contains exactly ``_config['epochs']`` entries.
    """
    p = _config

    modelname = file2name[p['modelfn']]
    mod_model = importlib.import_module('models.%s' % p['modelfn'])
    # load the model to be employed, say from models/pacrr.py
    model_cls = getattr(mod_model, modelname)
    model_params = {k: v for k, v in p.items() if k in model_cls.params or k == 'modelfn'}
    model = model_cls(model_params, rnd_seed=p['seed'])
    # create a expid based on the configured parameters
    expid = model.params_to_string(model_params)

    # the model files
    outdir = '%s/train_%s/%s/model_weight/%s' % (p['parentdir'], p['train_years'], p['expname'], expid)
    # the plots for the model, the training loss etc..
    detail_outdir = '%s/train_%s/%s/model_detail/' % (p['parentdir'], p['train_years'], p['expname'])

    if not os.path.isdir(detail_outdir + 'outs'):
        print(detail_outdir + 'outs')
        os.makedirs(detail_outdir + 'outs')

    _log.info('Input parameters: {0}'.format(p))
    label2tlabel = {4: 2, 3: 2, 2: 2, 1: 1, 0: 0, -2: 0}
    sample_label_prob = dict()
    _log.info('{0} {1} {2}'.format(p['expname'], p['train_years'], sample_label_prob))

    NGRAM_NFILTER, N_GRAMS = get_ngram_nfilter(p['winlen'], p['qproximity'], p['maxqlen'], p['xfilters'])

    _log.info('process and output to %s'%outdir)
    _log.info('{0} {1} {2} {3} {4}'.format(p['distill'], 'NGRAM_NFILTER', NGRAM_NFILTER, 'N_GRAMS', N_GRAMS))
    # one directory entry per epoch is taken to mean training already finished
    if os.path.exists(outdir) and len(os.listdir(outdir)) == p['epochs']:
        _log.info("outdir already seems to be full... exiting early")
        return

    # prepare train data
    qids = get_train_qids(p['train_years'])
    qrelf = get_qrelf(qrelfdir, p['train_years'])
    qid_cwid_label = read_qrel(qrelf, qids, include_spam=False)
    train_qids = [qid for qid in qids if qid in qid_cwid_label]
    _log.info('%s train_num %d '%(p['train_years'], len(train_qids)))

    def plot_curve_loss(epoch_train_loss, outdir, name, plot_id, series):
        """Save a PDF of the training loss per epoch.

        The title shows the index and value of the lowest loss. ``series`` is
        accepted but not used.
        """
        epochs, losses = zip(*list(enumerate(epoch_train_loss)))
        argmin_loss_epoch = np.argmin(epoch_train_loss)
        fig = plt.figure()
        plt.plot(epochs, losses, 'k:')
        plt.ylabel('Training Loss')
        plt.tick_params('y')
        plt.xlabel('epoches')
        plt.title('loss:%d %.3f'%(argmin_loss_epoch, epoch_train_loss[argmin_loss_epoch]))
        fig.savefig(trunc_dir(outdir) + trunc_file(name + '_' + plot_id + '.pdf'), format='pdf')
        plt.close()

    # dump model plot
    built_model = model.build()
    model.build_predict()  # run build_predict to verify it's working
    dump_modelplot(built_model, detail_outdir + 'model_' + expid)

    # callback function, dump the model and compute ndcg/map
    dump_weight = DumpWeight(outdir, batch_size=p['batch'], nb_sample=p['nsamples'])

    # keras 2 steps per epoch is number of batches per epoch, not number of samples per epoch
    # np.int was only an alias of the builtin int and was removed in NumPy 1.24,
    # so the builtin is used directly.
    steps_per_epoch = int(p['nsamples'] / p['batch'])

    # the generator for training data
    train_data_generator = load_train_data_generator(
        qids, rawdoc_mat_dir, qid_cwid_label, N_GRAMS, p,
        label2tlabel=label2tlabel, sample_label_prob=sample_label_prob)

    history = built_model.fit_generator(train_data_generator, steps_per_epoch=steps_per_epoch, epochs=p['epochs'],
                                        verbose=0, callbacks=[dump_weight], max_q_size=15, workers=1, pickle_safe=False)

    epoch_train_loss = history.history['loss']

    # plot the training loss for debugging
    plot_curve_loss(epoch_train_loss, detail_outdir, 'train_', expid, ['loss'])
    # persist the raw history dict next to the plot
    with open(detail_outdir + 'hist_' + expid + '.p', 'wb') as handle:
        pickle.dump(history.history, handle, protocol=pickle.HIGHEST_PROTOCOL)