def train_model(train, valid, test, img_feats, W, n_words=7414, n_x=300, n_h=512,
                max_epochs=20, lrate=0.001, batch_size=64, valid_batch_size=64,
                dropout_val=0.5, dispFreq=10, validFreq=500, saveFreq=1000,
                saveto='flickr30k_result_psgld_dropout.npz'):
    """ n_words : vocabulary size
        n_x : word embedding dimension
        n_h : LSTM/GRU number of hidden units
        max_epochs : the maximum number of epochs to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : the batch size used for the validation/test sets
        dropout_val : the dropout probability
        dispFreq : display the training progress to stdout every N updates
        validFreq : compute the validation error after this number of updates
        saveFreq : save results after this number of updates
        saveto : where to save the results
    """
    options = {}
    options['n_words'] = n_words
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq
    options['saveFreq'] = saveFreq
    options['n_z'] = img_feats.shape[0]

    logger.info('Model options {}'.format(options))

    logger.info('{} train examples'.format(len(train[0])))
    logger.info('{} valid examples'.format(len(valid[0])))
    logger.info('{} test examples'.format(len(test[0])))

    logger.info('Building model...')

    params = init_params(options, W)
    tparams = init_tparams(params)
    (use_noise, x, mask, z, f_pred_prob, cost) = build_model(tparams, options)

    f_cost = theano.function([x, mask, z], cost, name='f_cost')

    lr_theano = tensor.scalar(name='lr')
    ntrain_theano = tensor.scalar(name='ntrain')
    f_grad_shared, f_update = pSGLD(tparams, cost, [x, mask, z],
                                    ntrain_theano, lr_theano)

    logger.info('Training model...')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    estop = False  # early stop
    history_negll = []
    best_p = None
    best_valid_negll, best_test_negll = 0., 0.
    bad_counter = 0
    uidx = 0  # the number of updates done
    start_time = time.time()

    # statistics of the data
    train_num_words, valid_num_words, test_num_words = 0, 0, 0
    for sent in train[0]:
        train_num_words = train_num_words + len(sent)
    for sent in valid[0]:
        valid_num_words = valid_num_words + len(sent)
    for sent in test[0]:
        test_num_words = test_num_words + len(sent)

    n_average = 0
    valid_probs = np.zeros((valid_num_words,))
    test_probs = np.zeros((test_num_words,))

    try:
        for eidx in xrange(max_epochs):
            n_samples = 0
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(dropout_val)

                x = [train[0][t] for t in train_index]
                z = np.array([img_feats[:, train[1][t]] for t in train_index])
                x, mask = prepare_data(x)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, z)
                f_update(lrate, len(train[0]))

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, cost))

                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_negll=history_negll, **params)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    if eidx < 3:
                        valid_negll = calu_negll(f_cost, prepare_data, valid,
                                                 img_feats, kf_valid)
                        test_negll = calu_negll(f_cost, prepare_data, test,
                                                img_feats, kf_test)
                        history_negll.append([valid_negll, test_negll])
                    else:
                        valid_probs_curr = calu_pred_prob(f_pred_prob, prepare_data,
                                                          valid, img_feats, kf_valid)
                        test_probs_curr = calu_pred_prob(f_pred_prob, prepare_data,
                                                         test, img_feats, kf_test)

                        valid_probs = (n_average * valid_probs + valid_probs_curr) / (n_average + 1)
                        test_probs = (n_average * test_probs + test_probs_curr) / (n_average + 1)
                        n_average += 1

                        valid_negll = -np.log(valid_probs + 1e-6).sum() / valid_num_words
                        test_negll = -np.log(test_probs + 1e-6).sum() / test_num_words
                        history_negll.append([valid_negll, test_negll])

                        logger.info('Saving {}th Sample...'.format(n_average))
                        params = unzip(tparams)
                        np.savez('flickr30k_result_psgld_{}.npz'.format(n_average),
                                 valid_probs_curr=valid_probs_curr,
                                 test_probs_curr=test_probs_curr, **params)
                        logger.info('Done ...')

                    if (uidx == 0 or
                            valid_negll <= np.array(history_negll)[:, 0].min()):
                        best_p = unzip(tparams)
                        best_valid_negll = valid_negll
                        best_test_negll = test_negll
                        bad_counter = 0

                    logger.info('Perp: Valid {} Test {}'.format(np.exp(valid_negll),
                                                                np.exp(test_negll)))

                    if (len(history_negll) > 10 and
                            valid_negll >= np.array(history_negll)[:-10, 0].min()):
                        bad_counter += 1
                        if bad_counter > 10:
                            logger.info('Early Stop!')
                            estop = True
                            break

            logger.info('Seen {} samples'.format(n_samples))

            if estop:
                break

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    logger.info('Perp: Valid {} Test {}'.format(np.exp(best_valid_negll),
                                                np.exp(best_test_negll)))
    np.savez(saveto, history_negll=history_negll, **best_p)

    logger.info('The code ran for {} epochs, at {} sec/epoch'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return best_valid_negll, best_test_negll
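# After a burn-in of three epochs, train_model keeps a running average of the
# per-word predictive probabilities across successive pSGLD posterior samples
# and reports perplexity as exp of the averaged negative log-likelihood per
# word. The sketch below isolates that arithmetic with NumPy; the helper names
# (update_running_average, perplexity) and the toy probabilities are
# illustrative only, not part of this repository.
import numpy as np


def update_running_average(avg_probs, new_probs, n_average):
    """Fold one more posterior sample's per-word probabilities into the
    running average; returns the new average and the new sample count."""
    avg_probs = (n_average * avg_probs + new_probs) / (n_average + 1)
    return avg_probs, n_average + 1


def perplexity(avg_probs, num_words):
    """exp of the average negative log-likelihood per word (as above)."""
    negll = -np.log(avg_probs + 1e-6).sum() / num_words
    return np.exp(negll)


# toy check: three posterior samples over a 4-word validation set
samples = [np.array([0.20, 0.10, 0.30, 0.25]),
           np.array([0.22, 0.12, 0.28, 0.30]),
           np.array([0.18, 0.11, 0.33, 0.27])]
avg, n = np.zeros(4), 0
for s in samples:
    avg, n = update_running_average(avg, s, n)
print(perplexity(avg, num_words=4))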
def train_classifier(train, valid, test, W, n_words=10000, n_x=300, n_h=200,
                     dropout_val=0.5, patience=10, max_epochs=20, lrate=0.0002,
                     batch_size=50, valid_batch_size=50, dispFreq=10,
                     validFreq=100, saveFreq=200, saveto='trec_gru_result.npz'):
    """ train, valid, test : datasets
        W : the word embedding initialization
        n_words : vocabulary size
        n_x : word embedding dimension
        n_h : LSTM/GRU number of hidden units
        dropout_val : dropout probability
        patience : number of epochs to wait before early stopping if no progress
        max_epochs : the maximum number of epochs to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : the batch size used for the validation/test sets
        dispFreq : display the training progress to stdout every N updates
        validFreq : compute the validation error after this number of updates
        saveFreq : save the result after this number of updates
        saveto : where to save the result
    """
    options = {}
    options['n_words'] = n_words
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['patience'] = patience
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq

    logger.info('Model options {}'.format(options))

    logger.info('{} train examples'.format(len(train[0])))
    logger.info('{} valid examples'.format(len(valid[0])))
    logger.info('{} test examples'.format(len(test[0])))

    logger.info('Building model...')

    n_y = np.max(train[1]) + 1
    options['n_y'] = n_y

    params = init_params(options, W)
    tparams = init_tparams(params)

    (use_noise, x, mask, y, f_pred_prob, f_pred, cost) = build_model(tparams, options)

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, [x, mask, y], lr)

    logger.info('Training model...')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    estop = False  # early stop
    history_errs = []
    best_p = None
    bad_counter = 0
    uidx = 0  # the number of updates done
    start_time = time.time()

    try:
        for eidx in xrange(max_epochs):
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(dropout_val)

                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]
                x, mask, y = prepare_data(x, y)

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, cost))

                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    train_err = pred_error(f_pred, prepare_data, train, kf)
                    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
                    test_err = pred_error(f_pred, prepare_data, test, kf_test)
                    history_errs.append([valid_err, test_err, train_err])

                    if (uidx == 0 or
                            valid_err <= np.array(history_errs)[:, 0].min()):
                        best_p = unzip(tparams)
                        bad_counter = 0

                    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))

                    if (len(history_errs) > patience and
                            valid_err >= np.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            logger.info('Early Stop!')
                            estop = True
                            break

            if estop:
                break

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, prepare_data, test, kf_test)

    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))

    np.savez(saveto, train_err=train_err, valid_err=valid_err, test_err=test_err,
             history_errs=history_errs, **best_p)

    logger.info('The code ran for {} epochs, at {} sec/epoch'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return train_err, valid_err, test_err
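# Both classifiers in this file use the same patience-based early-stopping
# rule: after each validation check the counter is reset on a new best error;
# otherwise, once the history is longer than `patience`, the counter grows
# whenever the current error is no better than the best error recorded at
# least `patience` checks ago, and training stops when the counter exceeds
# `patience`. The standalone replay below mirrors that logic; the function
# name and the toy error sequence are illustrative, not part of this
# repository.
import numpy as np


def stopping_check(history, patience=10):
    """Replay the patience rule over a list of validation errors.
    Returns the index of the check at which training would stop, or None."""
    bad_counter = 0
    errs = []
    for i, err in enumerate(history):
        errs.append(err)
        arr = np.array(errs)
        if err <= arr.min():                     # new best: reset the counter
            bad_counter = 0
        if len(errs) > patience and err >= arr[:-patience].min():
            bad_counter += 1
            if bad_counter > patience:
                return i
    return None


# toy check: the error stops improving after the fourth validation check
errs = [0.50, 0.40, 0.35, 0.33] + [0.34] * 20
print(stopping_check(errs, patience=3))          # stops a few checks later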
def train_classifier(train, valid, test, W, n_words=10000, img_w=300, max_len=40,
                     feature_maps=100, filter_hs=[3, 4, 5], dropout_val=0.5,
                     patience=10, max_epochs=20, lrate=0.0002, batch_size=50,
                     valid_batch_size=50, dispFreq=10, validFreq=100, saveFreq=200,
                     saveto='trec_cnn_result.npz'):
    """ train, valid, test : datasets
        W : the word embedding initialization
        n_words : vocabulary size
        img_w : word embedding dimension, must be 300
        max_len : the maximum length of a sentence
        feature_maps : the number of feature maps we use
        filter_hs : the filter window sizes we use
        dropout_val : dropout probability
        patience : number of epochs to wait before early stopping if no progress
        max_epochs : the maximum number of epochs to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : the batch size used for the validation/test sets
        dispFreq : display the training progress to stdout every N updates
        validFreq : compute the validation error after this number of updates
        saveFreq : save the result after this number of updates
        saveto : where to save the result
    """
    img_h = max_len + 2 * (filter_hs[-1] - 1)

    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs
    options['patience'] = patience
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq

    logger.info('Model options {}'.format(options))

    logger.info('{} train examples'.format(len(train[0])))
    logger.info('{} valid examples'.format(len(valid[0])))
    logger.info('{} test examples'.format(len(test[0])))

    logger.info('Building model...')

    n_y = np.max(train[1]) + 1
    options['n_y'] = n_y

    # Train a simple conv net:
    #   img_h = sentence length (padded where necessary)
    #   img_w = word vector length (300 for word2vec)
    #   filter_hs = filter window sizes
    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes

    params = init_params(options, W)
    tparams = init_tparams(params)

    (use_noise, x, y, f_pred_prob, f_pred, cost) = build_model(tparams, options)

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, [x, y], lr)

    logger.info('Training model...')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    estop = False  # early stop
    history_errs = []
    best_p = None
    bad_counter = 0
    uidx = 0  # the number of updates done
    start_time = time.time()

    zero_vec_tensor = tensor.vector()
    zero_vec = np.zeros(img_w).astype(theano.config.floatX)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[(tparams['Wemb'],
                                         tensor.set_subtensor(tparams['Wemb'][n_words - 1, :],
                                                              zero_vec_tensor))])

    try:
        for eidx in xrange(max_epochs):
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(dropout_val)

                y = np.array([train[1][t] for t in train_index]).astype('int32')
                x = [train[0][t] for t in train_index]
                x = prepare_data(x, max_len, n_words, filter_hs[-1])

                cost = f_grad_shared(x, y)
                f_update(lrate)
                # the embedding of the special padding token is kept at zero
                set_zero(zero_vec)

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN detected')
                    return 1., 1., 1.
                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, cost))

                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    train_err = pred_error(f_pred, prepare_data, train, kf,
                                           max_len, n_words, filter_hs[-1])
                    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid,
                                           max_len, n_words, filter_hs[-1])
                    test_err = pred_error(f_pred, prepare_data, test, kf_test,
                                          max_len, n_words, filter_hs[-1])
                    history_errs.append([valid_err, test_err, train_err])

                    if (uidx == 0 or
                            valid_err <= np.array(history_errs)[:, 0].min()):
                        best_p = unzip(tparams)
                        bad_counter = 0

                    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))

                    if (len(history_errs) > patience and
                            valid_err >= np.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            logger.info('Early Stop!')
                            estop = True
                            break

            if estop:
                break

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted,
                           max_len, n_words, filter_hs[-1])
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid,
                           max_len, n_words, filter_hs[-1])
    test_err = pred_error(f_pred, prepare_data, test, kf_test,
                          max_len, n_words, filter_hs[-1])

    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))

    np.savez(saveto, train_err=train_err, valid_err=valid_err, test_err=test_err,
             history_errs=history_errs, **best_p)

    logger.info('The code ran for {} epochs, at {} sec/epoch'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return train_err, valid_err, test_err
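# The CNN classifier pads every sentence with filter_hs[-1] - 1 extra token
# positions on each side, so the padded length is
# img_h = max_len + 2 * (filter_hs[-1] - 1). Each filter spans the full
# embedding width, so convolution leaves one column and (img_h - filter_h + 1)
# rows, which max-pooling collapses to a single value per feature map. The
# small standalone function below (name `filter_geometry` is illustrative,
# not part of this repository) replays that geometry for the default
# hyper-parameters.
def filter_geometry(max_len=40, filter_hs=(3, 4, 5), img_w=300, feature_maps=100):
    """Replay the padding / filter / pool geometry used by train_classifier."""
    img_h = max_len + 2 * (filter_hs[-1] - 1)    # 40 + 2 * 4 = 48
    filter_shapes, pool_sizes = [], []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, img_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - img_w + 1))
    return img_h, filter_shapes, pool_sizes


print(filter_geometry())
# (48, [(100, 1, 3, 300), (100, 1, 4, 300), (100, 1, 5, 300)],
#  [(46, 1), (45, 1), (44, 1)])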
def trainer(train, valid, test, n_chars=33, img_w=128, max_len=27, feature_maps=100,
            filter_hs=[2, 3, 4], max_epochs=20, gamma=10, ncon=50, lrate=0.0002,
            batch_size=100, dispFreq=10, validFreq=100, saveto='example.npz'):
    """ train, valid, test : datasets
        n_chars : vocabulary size
        img_w : character embedding dimension
        max_len : the maximum length of a sentence
        feature_maps : the number of feature maps we use
        filter_hs : the filter window sizes we use
        max_epochs : the maximum number of epochs to run
        gamma : hyper-parameter used in the ranking loss
        ncon : the number of negative samples used for each positive sample
        lrate : learning rate
        batch_size : batch size during training
        dispFreq : display the training progress to stdout every N updates
        validFreq : compute the validation rank scores after this number of updates
        saveto : where to save the result
    """
    img_h = max_len + 2 * (filter_hs[-1] - 1)

    model_options = {}
    model_options['n_chars'] = n_chars
    model_options['img_w'] = img_w
    model_options['img_h'] = img_h
    model_options['feature_maps'] = feature_maps
    model_options['filter_hs'] = filter_hs
    model_options['max_epochs'] = max_epochs
    model_options['gamma'] = gamma
    model_options['ncon'] = ncon
    model_options['lrate'] = lrate
    model_options['batch_size'] = batch_size
    model_options['dispFreq'] = dispFreq
    model_options['validFreq'] = validFreq
    model_options['saveto'] = saveto

    logger.info('Model options {}'.format(model_options))

    logger.info('Building model...')

    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
    model_options['filter_shapes'] = filter_shapes
    model_options['pool_sizes'] = pool_sizes

    params = init_params(model_options)
    tparams = init_tparams(params)

    use_noise, inps, cost = build_model(tparams, model_options)

    logger.info('Building encoder...')
    inps_e, feat_x, feat_y = build_encoder(tparams, model_options)

    logger.info('Building functions...')
    f_emb = theano.function(inps_e, [feat_x, feat_y], name='f_emb')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, inps, lr)

    logger.info('Training model...')

    uidx = 0
    seed = 1234
    curr = 0
    history_errs = []

    valid_x = prepare_data(valid[0], max_len, n_chars, filter_hs[-1])
    valid_y = prepare_data(valid[1], max_len, n_chars, filter_hs[-1])
    test_x = prepare_data(test[0], max_len, n_chars, filter_hs[-1])
    test_y = prepare_data(test[1], max_len, n_chars, filter_hs[-1])

    zero_vec_tensor = tensor.vector()
    zero_vec = np.zeros(img_w).astype(theano.config.floatX)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[(tparams['Wemb'],
                                         tensor.set_subtensor(tparams['Wemb'][n_chars - 1, :],
                                                              zero_vec_tensor))])

    # Main loop
    for eidx in range(max_epochs):
        prng = RandomState(seed - eidx - 1)

        trainA = train[0]
        trainB = train[1]
        num_samples = len(trainA)

        inds = np.arange(num_samples)
        prng.shuffle(inds)
        numbatches = len(inds) // batch_size

        for minibatch in range(numbatches):
            use_noise.set_value(0.)
            uidx += 1
            conprng = RandomState(seed + uidx + 1)

            x = [trainA[seq] for seq in inds[minibatch::numbatches]]
            y = [trainB[seq] for seq in inds[minibatch::numbatches]]

            cinds = conprng.random_integers(low=0, high=num_samples - 1,
                                            size=ncon * len(x))
            cy = [trainB[seq] for seq in cinds]

            x = prepare_data(x, max_len, n_chars, filter_hs[-1])
            y = prepare_data(y, max_len, n_chars, filter_hs[-1])
            cy = prepare_data(cy, max_len, n_chars, filter_hs[-1])

            cost = f_grad_shared(x, y, cy)
            f_update(lrate)
            # the embedding of the special padding token is kept at zero
            set_zero(zero_vec)

            if np.mod(uidx, dispFreq) == 0:
                logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, cost))

            if np.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                logger.info('Computing ranks...')

                feats_x, feats_y = f_emb(valid_x, valid_y)
                (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y)
                history_errs.append([r1, r3, r10, medr, meanr, h_meanr])

                logger.info('Valid Rank: {}, {}, {}, {}, {}, {}'.format(
                    r1, r3, r10, medr, meanr, h_meanr))

                currscore = r1 + r3 + r10
                if currscore > curr:
                    curr = currscore

                    logger.info('Saving...')
                    params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    logger.info('Done...')

    use_noise.set_value(0.)
    zipp(params, tparams)

    logger.info('Final results...')

    feats_x, feats_y = f_emb(valid_x, valid_y)
    (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y)
    logger.info('Valid Rank: {}, {}, {}, {}, {}, {}'.format(
        r1, r3, r10, medr, meanr, h_meanr))

    feats_x, feats_y = f_emb(test_x, test_y)
    (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y)
    logger.info('Test Rank: {}, {}, {}, {}, {}, {}'.format(
        r1, r3, r10, medr, meanr, h_meanr))

    # np.savez("./cnn_feats.npz", feats_x=feats_x, feats_y=feats_y)

    return (r1, r3, r10, medr, meanr, h_meanr)
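# The trainer above draws `ncon` random contrastive sentences for every
# positive (x, y) pair and feeds them to the cost built in build_model with
# margin `gamma`. Since build_model is not shown here, the snippet below is
# only a hedged NumPy sketch of a common max-margin ranking objective of that
# shape (the names `cosine` and `ranking_loss` are illustrative), not
# necessarily the exact loss this repository implements.
import numpy as np


def cosine(a, b):
    return (a * b).sum(-1) / (np.linalg.norm(a, axis=-1) * np.linalg.norm(b, axis=-1))


def ranking_loss(feat_x, feat_y, feat_neg, gamma=10.0):
    """feat_x, feat_y : (batch, dim) matched pairs
       feat_neg       : (batch, ncon, dim) negative candidates for each x"""
    pos = cosine(feat_x, feat_y)                  # (batch,)
    neg = cosine(feat_x[:, None, :], feat_neg)    # (batch, ncon)
    # hinge: penalise negatives that score within `gamma` of the positive
    return np.maximum(0.0, gamma + neg - pos[:, None]).sum(axis=1).mean()


# toy check with random features and ncon = 50 negatives per positive pair
rng = np.random.RandomState(0)
fx, fy = rng.randn(4, 8), rng.randn(4, 8)
fneg = rng.randn(4, 50, 8)
print(ranking_loss(fx, fy, fneg, gamma=10.0))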
# Tail of a classifier training loop (the enclosing function header and the
# start of the try-block are not part of this fragment).
                        bad_counter += 1
                        if bad_counter > patience:
                            print('Early Stop!')
                            estop = True
                            break

            if estop:
                break

    except KeyboardInterrupt:
        print('Training interrupted')

    end_time = time.time()

    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, prepare_data, test, kf_test)

    print('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))

    np.savez(saveto, train_err=train_err, valid_err=valid_err, test_err=test_err,
             history_errs=history_errs, **best_p)
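# All of the training loops above snapshot the best parameters with
# unzip(tparams) and restore them with zipp(best_p, tparams) before the final
# evaluation. Those helpers are not defined in this file; the sketch below
# assumes their usual semantics (copy shared-variable values out to a plain
# dict, then write them back) and demonstrates the pattern with ordinary
# NumPy arrays instead of Theano shared variables. The names unzip_like and
# zipp_like are illustrative only.
import numpy as np
from collections import OrderedDict


def unzip_like(tparams):
    """Snapshot current parameter values into a plain dict (assumed behaviour)."""
    return OrderedDict((k, np.array(v)) for k, v in tparams.items())


def zipp_like(params, tparams):
    """Write snapshotted values back into the live parameters (assumed behaviour)."""
    for k, v in params.items():
        tparams[k][...] = v


toy_params = OrderedDict(Wemb=np.zeros((3, 2)))
best_snapshot = unzip_like(toy_params)   # snapshot taken at the best validation error
toy_params['Wemb'] += 1.0                # later (worse) updates keep training
zipp_like(best_snapshot, toy_params)     # roll back to the best parameters at the end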