Example #1
File: layers.py Project: afcarl/nmt
def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, ortho=True):
    if nin is None:
        nin = options['dim_proj']
    if nout is None:
        nout = options['dim_proj']
    params[prfx(prefix,'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho)
    params[prfx(prefix,'b')] = numpy.zeros((nout,)).astype('float32')

    return params
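
A minimal usage sketch (an assumption-laden illustration: it presumes layers.py is importable and that norm_weight and prfx behave as the call sites above suggest):

from collections import OrderedDict

# hypothetical options dict; 'dim_proj' is only consulted when nin/nout are omitted
options = {'dim_proj': 128}
params = param_init_fflayer(options, OrderedDict(), prefix='ff_logit', nin=64, nout=32)
# params[prfx('ff_logit', 'W')].shape == (64, 32)
# params[prfx('ff_logit', 'b')].shape == (32,)
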
Example #2
File: layers.py Project: afcarl/nmt
def gru_layer(tparams, state_below, options, prefix='gru', mask=None, **kwargs):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    dim = tparams[prfx(prefix,'Ux')].shape[1]

    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    state_below_ = tensor.dot(state_below, tparams[prfx(prefix, 'W')]) + tparams[prfx(prefix, 'b')]
    state_belowx = tensor.dot(state_below, tparams[prfx(prefix, 'Wx')]) + tparams[prfx(prefix, 'bx')]
    U = tparams[prfx(prefix, 'U')]
    Ux = tparams[prfx(prefix, 'Ux')]

    def _step_slice(m_, x_, xx_, h_, U, Ux):
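        # m_: mask row; x_, xx_: precomputed W/Wx input projections;
        # h_: previous hidden state; U, Ux: recurrent weight matrices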
        preact = tensor.dot(h_, U)
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        u = tensor.nnet.sigmoid(_slice(preact, 1, dim))

        preactx = tensor.dot(h_, Ux)
        preactx = preactx * r
        preactx = preactx + xx_

        h = tensor.tanh(preactx)

        h = u * h_ + (1. - u) * h
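        # where the mask is 0 (padded timesteps), keep the previous hidden state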
        h = m_[:,None] * h + (1. - m_)[:,None] * h_

        return h

    seqs = [mask, state_below_, state_belowx]
    _step = _step_slice

    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info = [tensor.alloc(0., n_samples, dim)],
                                non_sequences = [tparams[prfx(prefix, 'U')],
                                                 tparams[prfx(prefix, 'Ux')]],
                                name=prfx(prefix, '_layers'),
                                n_steps=nsteps,
                                profile=profile,
                                strict=True)
    rval = [rval]
    return rval
Example #3
File: layers.py Project: afcarl/nmt
def param_init_gru_cond(options, params, prefix='gru_cond',
                        nin=None, dim=None, dimctx=None):
    if nin is None:
        nin = options['dim']
    if dim is None:
        dim = options['dim']
    if dimctx is None:
        dimctx = options['dim']

    params = param_init_gru(options, params, prefix, nin=nin, dim=dim)

    # context to LSTM
    Wc = norm_weight(dimctx,dim*2)
    params[prfx(prefix,'Wc')] = Wc

    Wcx = norm_weight(dimctx,dim)
    params[prfx(prefix,'Wcx')] = Wcx

    # attention: prev -> hidden
    Wi_att = norm_weight(nin,dimctx)
    params[prfx(prefix,'Wi_att')] = Wi_att

    # attention: context -> hidden
    Wc_att = norm_weight(dimctx)
    params[prfx(prefix,'Wc_att')] = Wc_att

    # attention: LSTM -> hidden
    Wd_att = norm_weight(dim,dimctx)
    params[prfx(prefix,'Wd_att')] = Wd_att

    # attention: hidden bias
    b_att = numpy.zeros((dimctx,)).astype('float32')
    params[prfx(prefix,'b_att')] = b_att

    # attention:
    U_att = norm_weight(dimctx,1)
    params[prfx(prefix,'U_att')] = U_att
    c_att = numpy.zeros((1,)).astype('float32')
    params[prfx(prefix, 'c_tt')] = c_att

    return params
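
A rough sketch of the shapes this initializer adds on top of the plain GRU parameters from param_init_gru (assuming norm_weight(a, b) returns an (a, b) matrix and norm_weight(a) an (a, a) matrix, as the calls above suggest):

from collections import OrderedDict

params = param_init_gru_cond({}, OrderedDict(), prefix='dec', nin=3, dim=4, dimctx=6)
# context projections:    Wc: (6, 8)    Wcx: (6, 4)
# attention projections:  Wi_att: (3, 6)    Wc_att: (6, 6)    Wd_att: (4, 6)
# attention readout:      b_att: (6,)    U_att: (6, 1)    c_tt: (1,)
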
Example #4
File: layers.py Project: afcarl/nmt
def param_init_gru(options, params, prefix='gru', nin=None, dim=None, hiero=False):
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']
    if not hiero:
        W = numpy.concatenate([norm_weight(nin,dim),
                               norm_weight(nin,dim)], axis=1)
        params[prfx(prefix,'W')] = W
        params[prfx(prefix,'b')] = numpy.zeros((2 * dim,)).astype('float32')
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[prfx(prefix,'U')] = U

    Wx = norm_weight(nin, dim)
    params[prfx(prefix,'Wx')] = Wx
    Ux = ortho_weight(dim)
    params[prfx(prefix,'Ux')] = Ux
    params[prfx(prefix,'bx')] = numpy.zeros((dim,)).astype('float32')

    return params
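
A sketch of how this initializer and gru_layer from Example #2 fit together (it assumes init_tparams wraps the numpy arrays in Theano shared variables and that the module-level profile flag used by theano.scan exists, as elsewhere in layers.py):

from collections import OrderedDict
import theano
import theano.tensor as tensor

options = {'dim_proj': 8}
params = param_init_gru(options, OrderedDict(), prefix='encoder', nin=5, dim=8)
tparams = init_tparams(params)                  # numpy arrays -> shared variables

x = tensor.tensor3('x')                         # n_steps x n_samples x nin
h = gru_layer(tparams, x, options, prefix='encoder')[0]
f_enc = theano.function([x], h)                 # h: n_steps x n_samples x dim
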
def train(
        dim_word_desc=400,  # word vector dimensionality
        dim_word_q=400,
        dim_word_ans=600,
        dim_proj=300,
        dim=400,  # the number of LSTM units
        encoder_desc='lstm',
        encoder_desc_word='lstm',
        encoder_desc_sent='lstm',
        use_dq_sims=False,
        eyem=None,
        learn_h0=False,
        use_desc_skip_c_g=False,
        debug=False,
        encoder_q='lstm',
        patience=10,
        max_epochs=5000,
        dispFreq=100,
        decay_c=0.,
        alpha_c=0.,
        clip_c=-1.,
        lrate=0.01,
        n_words_q=49145,
        n_words_desc=115425,
        n_words_ans=409,
        pkl_train_files=None,
        pkl_valid_files=None,
        maxlen=2000,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=2,
        vocab=None,
        valid_batch_size=16,
        use_elu_g=False,
        saveto='model.npz',
        model_dir=None,
        ms_nlayers=3,
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        datasets=[None],
        truncate=400,
        momentum=0.9,
        use_bidir=False,
        cost_mask=None,
        valid_datasets=[
            '/u/yyu/stor/caglar/rc-data/cnn/cnn_test_data.h5',
            '/u/yyu/stor/caglar/rc-data/cnn/cnn_valid_data.h5'
        ],
        dropout_rate=0.5,
        use_dropout=True,
        reload_=True,
        **opt_ds):

    ensure_dir_exists(model_dir)
    mpath = os.path.join(model_dir, saveto)
    mpath_best = os.path.join(model_dir, prfx("best", saveto))
    mpath_last = os.path.join(model_dir, prfx("last", saveto))
    mpath_stats = os.path.join(model_dir, prfx("stats", saveto))

    # Model options
    model_options = locals().copy()
    model_options['use_sent_reps'] = opt_ds['use_sent_reps']
    stats = defaultdict(list)

    del model_options['eyem']
    del model_options['cost_mask']

    if cost_mask is not None:
        cost_mask = sharedX(cost_mask)

    # reload options and parameters
    if reload_:
        print "Reloading the model."
        if os.path.exists(mpath_best):
            print "Reloading the best model from %s." % mpath_best
            with open(os.path.join(mpath_best, '%s.pkl' % mpath_best),
                      'rb') as f:
                models_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath_best, params)
        elif os.path.exists(mpath):
            print "Reloading the model from %s." % mpath
            with open(os.path.join(mpath, '%s.pkl' % mpath), 'rb') as f:
                models_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath, params)
        else:
            raise IOError("Couldn't open the file.")
    else:
        print "Couldn't reload the model; initializing from scratch."
        params = init_params(model_options)

    if datasets[0]:
        print "Short dataset", datasets[0]

    print 'Loading data'
    print 'Building model'
    if pkl_train_files is None or pkl_valid_files is None:
        train, valid, test = load_data(path=datasets[0],
                                       valid_path=valid_datasets[0],
                                       test_path=valid_datasets[1],
                                       batch_size=batch_size,
                                       **opt_ds)
    else:
        train, valid, test = load_pkl_data(train_file_paths=pkl_train_files,
                                           valid_file_paths=pkl_valid_files,
                                           batch_size=batch_size,
                                           vocab=vocab,
                                           eyem=eyem,
                                           **opt_ds)

    tparams = init_tparams(params)
    trng, use_noise, inps_d, \
                     opt_ret, \
                     cost, errors, ent_errors, ent_derrors, probs = \
                        build_model(tparams,
                                    model_options,
                                    prepare_data if not opt_ds['use_sent_reps'] \
                                            else prepare_data_sents,
                                    valid,
                                    cost_mask=cost_mask)

    alphas = opt_ret['dec_alphas']

    if opt_ds['use_sent_reps']:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'],
                inps_d['slen'], inps_d['qlen'],\
                inps_d['ent_mask']
                ]
    else:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'], \
                inps_d['qlen'], \
                inps_d['ent_mask']]

    outs = [cost, errors, probs, alphas]
    if ent_errors:
        outs += [ent_errors]

    if ent_derrors:
        outs += [ent_derrors]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, outs, profile=profile)
    print 'Done'

    # Apply weight decay on the feed-forward connections
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.

        for kk, vv in tparams.iteritems():
            if "logit" in kk or "ff" in kk:
                weight_decay += (vv**2).sum()

        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Computing gradient...',
    grads = safe_grad(cost, itemlist(tparams))
    print 'Done'

    # Gradient clipping:
    if clip_c > 0.:
        g2 = get_norms(grads)
        for p, g in grads.iteritems():
            grads[p] = tensor.switch(g2 > (clip_c**2),
                                     (g / tensor.sqrt(g2 + 1e-8)) * clip_c, g)
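
    # drop the last model input (ent_mask) so it is not passed to the
    # training functions built below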
    inps.pop()
    if optimizer.lower() == "adasecant":
        learning_rule = Adasecant(delta_clip=25.0,
                                  use_adagrad=True,
                                  grad_clip=0.25,
                                  gamma_clip=0.)
    elif optimizer.lower() == "rmsprop":
        learning_rule = RMSPropMomentum(init_momentum=momentum)
    elif optimizer.lower() == "adam":
        learning_rule = Adam()
    elif optimizer.lower() == "adadelta":
        learning_rule = AdaDelta()

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    learning_rule = None
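    # NOTE: this discards the learning_rule selected above, so the
    # eval(optimizer) fallback below is what actually builds f_grad_shared/f_update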

    if learning_rule:
        f_grad_shared, f_update = learning_rule.get_funcs(learning_rate=lr,
                                                          grads=grads,
                                                          inp=inps,
                                                          cost=cost,
                                                          errors=errors)
    else:
        f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps,
                                                  cost, errors)

    print 'Done'
    print 'Optimization'
    history_errs = []
    # reload history
    if reload_ and os.path.exists(mpath):
        history_errs = list(numpy.load(mpath)['history_errs'])

    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size

    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    best_found = False
    uidx = 0
    estop = False

    train_cost_ave, train_err_ave, \
            train_gnorm_ave = reset_train_vals()

    for eidx in xrange(max_epochs):
        n_samples = 0

        if train.done:
            train.reset()

        for d_, q_, a, em in train:
            n_samples += len(a)
            uidx += 1
            use_noise.set_value(1.)

            if opt_ds['use_sent_reps']:
                # To mask the description and the question.
                d, d_mask, q, q_mask, dlen, slen, qlen = prepare_data_sents(
                    d_, q_)

                if d is None:
                    print 'Minibatch with zero samples under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(
                    d, d_mask, q, q_mask, a, dlen, slen, qlen)
            else:
                d, d_mask, q, q_mask, dlen, qlen = prepare_data(d_, q_)

                if d is None:
                    print 'Minibatch with zero samples under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(
                    d, d_mask, q, q_mask, a, dlen, qlen)

            upnorm = f_update(lrate)
            ud = time.time() - ud_start

            # Collect the running ave train stats.
            train_cost_ave = running_ave(train_cost_ave, cost)
            train_err_ave = running_ave(train_err_ave, errors)
            train_gnorm_ave = running_ave(train_gnorm_ave, gnorm)

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                import ipdb
                ipdb.set_trace()

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, ' Update ', uidx, \
                        ' Cost ', cost, ' UD ', ud, \
                        ' UpNorm ', upnorm[0].tolist(), \
                        ' GNorm ', gnorm, \
                        ' Pnorm ', pnorm, 'Terrors ', errors

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None and best_found:
                    numpy.savez(mpath_best,
                                history_errs=history_errs,
                                **best_p)
                    pkl.dump(model_options, open('%s.pkl' % mpath_best, 'wb'))
                else:
                    params = unzip(tparams)

                numpy.savez(mpath, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % mpath, 'wb'))
                pkl.dump(stats, open("%s.pkl" % mpath_stats, 'wb'))

                print 'Done'
                print_param_norms(tparams)

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                if valid.done:
                    valid.reset()

                valid_costs, valid_errs, valid_probs, \
                        valid_alphas, error_ent, error_dent = eval_model(f_log_probs,
                                                  prepare_data if not opt_ds['use_sent_reps'] \
                                                    else prepare_data_sents,
                                                  model_options,
                                                  valid,
                                                  use_sent_rep=opt_ds['use_sent_reps'])

                valid_alphas_ = numpy.concatenate(
                    [va.argmax(0) for va in valid_alphas.tolist()], axis=0)
                valid_err = valid_errs.mean()
                valid_cost = valid_costs.mean()
                valid_alpha_ent = -negentropy(valid_alphas)

                mean_valid_alphas = valid_alphas_.mean()
                std_valid_alphas = valid_alphas_.std()

                mean_valid_probs = valid_probs.argmax(1).mean()
                std_valid_probs = valid_probs.argmax(1).std()

                history_errs.append([valid_cost, valid_err])

                stats['train_err_ave'].append(train_err_ave)
                stats['train_cost_ave'].append(train_cost_ave)
                stats['train_gnorm_ave'].append(train_gnorm_ave)

                stats['valid_errs'].append(valid_err)
                stats['valid_costs'].append(valid_cost)
                stats['valid_err_ent'].append(error_ent)
                stats['valid_err_desc_ent'].append(error_dent)

                stats['valid_alphas_mean'].append(mean_valid_alphas)
                stats['valid_alphas_std'].append(std_valid_alphas)
                stats['valid_alphas_ent'].append(valid_alpha_ent)

                stats['valid_probs_mean'].append(mean_valid_probs)
                stats['valid_probs_std'].append(std_valid_probs)

                if uidx == 0 or valid_err <= numpy.array(
                        history_errs)[:, 1].min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                    best_found = True
                else:
                    best_found = False

                if numpy.isnan(valid_err):
                    import ipdb
                    ipdb.set_trace()

                print "============================"
                print '\t>>>Valid error: ', valid_err, \
                        ' Valid cost: ', valid_cost
                print '\t>>>Valid pred mean: ', mean_valid_probs, \
                        ' Valid pred std: ', std_valid_probs
                print '\t>>>Valid alphas mean: ', mean_valid_alphas, \
                        ' Valid alphas std: ', std_valid_alphas, \
                        ' Valid alpha negent: ', valid_alpha_ent, \
                        ' Valid error ent: ', error_ent, \
                        ' Valid error desc ent: ', error_dent

                print "============================"
                print "Running average train stats "
                print '\t>>>Train error: ', train_err_ave, \
                        ' Train cost: ', train_cost_ave, \
                        ' Train grad norm: ', train_gnorm_ave
                print "============================"


                train_cost_ave, train_err_ave, \
                    train_gnorm_ave = reset_train_vals()

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid.reset()
    valid_cost, valid_error, valid_probs, \
            valid_alphas, error_ent, error_dent = eval_model(f_log_probs,
                                      prepare_data if not opt_ds['use_sent_reps'] \
                                           else prepare_data_sents,
                                      model_options, valid,
                                      use_sent_rep=opt_ds['use_sent_reps'])

    print " Final eval results: "
    print 'Valid error: ', valid_error.mean()
    print 'Valid cost: ', valid_cost.mean()
    print '\t>>>Valid pred mean: ', valid_probs.mean(), \
            ' Valid pred std: ', valid_probs.std(), \
            ' Valid error ent: ', error_ent

    params = copy.copy(best_p)

    numpy.savez(mpath_last,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err, valid_cost
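
A minimal way to invoke this entry point (a sketch only: the dataset paths are placeholders, reload_=False skips the checkpoint lookup that would otherwise raise IOError, and use_sent_reps must be supplied because the function reads it out of **opt_ds):

train(datasets=['/path/to/train_data.h5'],              # placeholder
      valid_datasets=['/path/to/test_data.h5',
                      '/path/to/valid_data.h5'],        # placeholders
      model_dir='models',
      saveto='attentive_model.npz',
      optimizer='adam',
      batch_size=32,
      reload_=False,
      use_sent_reps=False)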
Example #7
File: layers.py Project: afcarl/nmt
def fflayer(tparams, state_below, options, prefix='rconv', activ='lambda x: tensor.tanh(x)', **kwargs):
    return eval(activ)(tensor.dot(state_below, tparams[prfx(prefix,'W')]) + tparams[prfx(prefix,'b')])
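
A sketch pairing this with the initializer from Example #1 (assuming init_tparams from this codebase; the default activation is the tanh shown in the signature):

from collections import OrderedDict
import theano
import theano.tensor as tensor

options = {'dim_proj': 8}
params = param_init_fflayer(options, OrderedDict(), prefix='ff_hid', nin=5, nout=8)
tparams = init_tparams(params)

x = tensor.matrix('x')                                   # n_samples x nin
y = fflayer(tparams, x, options, prefix='ff_hid')        # tanh(x.W + b)
f = theano.function([x], y)
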
Example #8
File: layers.py Project: afcarl/nmt
def gru_cond_layer(tparams,
                   state_below,
                   options,
                   prefix='gru',
                   mask=None,
                   context=None,
                   one_step=False,
                   init_memory=None,
                   init_state=None,
                   context_mask=None,
                   **kwargs):

    assert context, 'Context must be provided'

    if one_step:
        assert init_state, 'previous state must be provided'

    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    # mask
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    dim = tparams[prfx(prefix, 'Wcx')].shape[1]

    # initial/previous state
    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)

    # projected context
    assert context.ndim == 3, 'Context must be 3-d: #annotation x #sample x dim'
    pctx_ = tensor.dot(context, tparams[prfx(prefix,'Wc_att')]) + tparams[prfx(prefix,'b_att')]

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    # projected x
    state_belowx = tensor.dot(state_below, tparams[prfx(prefix, 'Wx')]) + tparams[prfx(prefix, 'bx')]
    state_below_ = tensor.dot(state_below, tparams[prfx(prefix, 'W')]) + tparams[prfx(prefix, 'b')]
    state_belowc = tensor.dot(state_below, tparams[prfx(prefix, 'Wi_att')])

    def _step_slice(m_, x_, xx_, xc_, h_, ctx_, alpha_, pctx_, cc_,
                    U, Wc, Wd_att, U_att, c_tt, Ux, Wcx):
        # attention
        pstate_ = tensor.dot(h_, Wd_att)
        pctx__ = pctx_ + pstate_[None,:,:]
        pctx__ += xc_
        pctx__ = tensor.tanh(pctx__)
        alpha = tensor.dot(pctx__, U_att)+c_tt
        alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]])
        alpha = tensor.exp(alpha)
        if context_mask:
            alpha = alpha * context_mask
        alpha = alpha / alpha.sum(0, keepdims=True)
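        # alpha now holds the normalized attention weights
        # (exponentiate-and-renormalize, i.e. a softmax over the annotation axis)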
        ctx_ = (cc_ * alpha[:,:,None]).sum(0) # current context

        preact = tensor.dot(h_, U)
        preact += x_
        preact += tensor.dot(ctx_, Wc)
        preact = tensor.nnet.sigmoid(preact)

        r = _slice(preact, 0, dim)
        u = _slice(preact, 1, dim)

        preactx = tensor.dot(h_, Ux)
        preactx *= r
        preactx += xx_
        preactx += tensor.dot(ctx_, Wcx)

        h = tensor.tanh(preactx)

        h = u * h_ + (1. - u) * h
        h = m_[:,None] * h + (1. - m_)[:,None] * h_

        return h, ctx_, alpha.T

    seqs = [mask, state_below_, state_belowx, state_belowc]
    _step = _step_slice

    shared_vars = [tparams[prfx(prefix, 'U')],
                   tparams[prfx(prefix, 'Wc')],
                   tparams[prfx(prefix,'Wd_att')],
                   tparams[prfx(prefix,'U_att')],
                   tparams[prfx(prefix, 'c_tt')],
                   tparams[prfx(prefix, 'Ux')],
                   tparams[prfx(prefix, 'Wcx')]]

    if one_step:
        rval = _step(*(seqs+[init_state, None, None, pctx_, context]+shared_vars))
    else:
        rval, updates = theano.scan(_step,
                                    sequences=seqs,
                                    outputs_info = [init_state,
                                                    tensor.alloc(0., n_samples, context.shape[2]),
                                                    tensor.alloc(0., n_samples, context.shape[0])],
                                    non_sequences=[pctx_,
                                                   context]+shared_vars,
                                    name=prfx(prefix, '_layers'),
                                    n_steps=nsteps,
                                    profile=profile,
                                    strict=True)
    return rval
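
A compile-time smoke test for the scan branch of this layer (a sketch; it assumes param_init_gru_cond and init_tparams from this file and the module-level profile flag):

from collections import OrderedDict
import theano
import theano.tensor as tensor

options = {'dim': 4}
params = param_init_gru_cond(options, OrderedDict(), prefix='dec', nin=3, dim=4, dimctx=6)
tparams = init_tparams(params)

y = tensor.tensor3('y')      # target embeddings: n_steps x n_samples x nin
ctx = tensor.tensor3('ctx')  # source annotations: n_annot x n_samples x dimctx

h, ctxs, alphas = gru_cond_layer(tparams, y, options, prefix='dec', context=ctx)
f_dec = theano.function([y, ctx], [h, ctxs, alphas])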