def train_classifier(train, valid, test, W, n_words=10000, n_x=300, n_h=200,
                     dropout_val=0.5, patience=10, max_epochs=20, lrate=0.0002,
                     batch_size=50, valid_batch_size=50, dispFreq=10,
                     validFreq=100, saveFreq=200, saveto='trec_gru_result.npz'):
    """ train, valid, test : datasets
        W : the word embedding initialization
        n_words : vocabulary size
        n_x : word embedding dimension
        n_h : LSTM/GRU number of hidden units
        dropout_val : dropout probability
        patience : number of epochs to wait before early stopping if no progress
        max_epochs : the maximum number of epochs to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : the batch size used for the validation/test sets
        dispFreq : display the training progress to stdout every N updates
        validFreq : compute the validation error after this number of updates
        saveFreq : save the result after this number of updates
        saveto : where to save the result
    """

    options = {}
    options['n_words'] = n_words
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['patience'] = patience
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq

    logger.info('Model options {}'.format(options))
    logger.info('{} train examples'.format(len(train[0])))
    logger.info('{} valid examples'.format(len(valid[0])))
    logger.info('{} test examples'.format(len(test[0])))

    logger.info('Building model...')

    n_y = np.max(train[1]) + 1
    options['n_y'] = n_y

    params = init_params(options, W)
    tparams = init_tparams(params)

    (use_noise, x, mask, y, f_pred_prob, f_pred, cost) = build_model(tparams, options)

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, [x, mask, y], lr)

    logger.info('Training model...')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    estop = False  # early stop
    history_errs = []
    best_p = None
    bad_counter = 0
    uidx = 0  # the number of updates done
    start_time = time.time()

    try:
        for eidx in xrange(max_epochs):
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(dropout_val)

                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]
                x, mask, y = prepare_data(x, y)

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, cost))

                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    train_err = pred_error(f_pred, prepare_data, train, kf)
                    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
                    test_err = pred_error(f_pred, prepare_data, test, kf_test)
                    history_errs.append([valid_err, test_err, train_err])

                    if (uidx == 0 or
                            valid_err <= np.array(history_errs)[:, 0].min()):
                        best_p = unzip(tparams)
                        bad_counter = 0

                    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))

                    if (len(history_errs) > patience and
                            valid_err >= np.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            logger.info('Early Stop!')
                            estop = True
                            break

            if estop:
                break

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, prepare_data, test, kf_test)

    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))

    np.savez(saveto, train_err=train_err, valid_err=valid_err, test_err=test_err,
             history_errs=history_errs, **best_p)

    logger.info('The code ran for {} epochs, with {} sec/epoch'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return train_err, valid_err, test_err

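
# The minibatch helper get_minibatches_idx used throughout this file is defined
# elsewhere in the repository.  The sketch below is a minimal, hypothetical
# re-implementation meant only to document the interface the training loops assume:
# a list of (minibatch_index, array_of_sample_indices) pairs, optionally shuffled,
# so the loops can write "for _, train_index in kf".
def _get_minibatches_idx_sketch(n, minibatch_size, shuffle=False):
    idx_list = np.arange(n, dtype="int64")
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = []
    start = 0
    while start < n:
        # the last chunk may be smaller than minibatch_size
        minibatches.append(idx_list[start:start + minibatch_size])
        start += minibatch_size
    return zip(range(len(minibatches)), minibatches)
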
def train_model(train, valid, test, img_feats, W, n_words=7414, n_x=300, n_h=512,
                max_epochs=20, lrate=0.001, batch_size=64, valid_batch_size=64,
                dropout_val=0.5, dispFreq=10, validFreq=500, saveFreq=1000,
                saveto='flickr30k_result_psgld_dropout.npz'):
    """ n_words : vocabulary size
        n_x : word embedding dimension
        n_h : LSTM/GRU number of hidden units
        max_epochs : the maximum number of epochs to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : the batch size used for the validation/test sets
        dropout_val : the probability of dropout
        dispFreq : display the training progress to stdout every N updates
        validFreq : compute the validation error after this number of updates
        saveFreq : save results after this number of updates
        saveto : where to save
    """

    options = {}
    options['n_words'] = n_words
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq
    options['saveFreq'] = saveFreq
    options['n_z'] = img_feats.shape[0]

    logger.info('Model options {}'.format(options))
    logger.info('{} train examples'.format(len(train[0])))
    logger.info('{} valid examples'.format(len(valid[0])))
    logger.info('{} test examples'.format(len(test[0])))

    logger.info('Building model...')

    params = init_params(options, W)
    tparams = init_tparams(params)

    (use_noise, x, mask, z, f_pred_prob, cost) = build_model(tparams, options)

    f_cost = theano.function([x, mask, z], cost, name='f_cost')

    lr_theano = tensor.scalar(name='lr')
    ntrain_theano = tensor.scalar(name='ntrain')
    f_grad_shared, f_update = pSGLD(tparams, cost, [x, mask, z], ntrain_theano, lr_theano)

    logger.info('Training model...')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    estop = False  # early stop
    history_negll = []
    best_p = None
    best_valid_negll, best_test_negll = 0., 0.
    bad_counter = 0
    uidx = 0  # the number of updates done
    start_time = time.time()

    # statistics of the data
    train_num_words, valid_num_words, test_num_words = 0, 0, 0
    for sent in train[0]:
        train_num_words = train_num_words + len(sent)
    for sent in valid[0]:
        valid_num_words = valid_num_words + len(sent)
    for sent in test[0]:
        test_num_words = test_num_words + len(sent)

    n_average = 0
    valid_probs = np.zeros((valid_num_words,))
    test_probs = np.zeros((test_num_words,))

    try:
        for eidx in xrange(max_epochs):
            n_samples = 0
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(dropout_val)

                x = [train[0][t] for t in train_index]
                z = np.array([img_feats[:, train[1][t]] for t in train_index])
                x, mask = prepare_data(x)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, z)
                f_update(lrate, len(train[0]))

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, cost))

                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_negll=history_negll, **params)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)

                    if eidx < 3:
                        valid_negll = calu_negll(f_cost, prepare_data, valid, img_feats, kf_valid)
                        test_negll = calu_negll(f_cost, prepare_data, test, img_feats, kf_test)
                        history_negll.append([valid_negll, test_negll])
                    else:
                        valid_probs_curr = calu_pred_prob(f_pred_prob, prepare_data, valid, img_feats, kf_valid)
                        test_probs_curr = calu_pred_prob(f_pred_prob, prepare_data, test, img_feats, kf_test)

                        valid_probs = (n_average * valid_probs + valid_probs_curr) / (n_average + 1)
                        test_probs = (n_average * test_probs + test_probs_curr) / (n_average + 1)
                        n_average += 1

                        valid_negll = -np.log(valid_probs + 1e-6).sum() / valid_num_words
                        test_negll = -np.log(test_probs + 1e-6).sum() / test_num_words
                        history_negll.append([valid_negll, test_negll])

                        logger.info('Saving {}th Sample...'.format(n_average))
                        params = unzip(tparams)
                        np.savez('flickr30k_result_psgld_{}.npz'.format(n_average),
                                 valid_probs_curr=valid_probs_curr,
                                 test_probs_curr=test_probs_curr, **params)
                        logger.info('Done ...')

                    if (uidx == 0 or
                            valid_negll <= np.array(history_negll)[:, 0].min()):
                        best_p = unzip(tparams)
                        best_valid_negll = valid_negll
                        best_test_negll = test_negll
                        bad_counter = 0

                    logger.info('Perp: Valid {} Test {}'.format(np.exp(valid_negll), np.exp(test_negll)))

                    if (len(history_negll) > 10 and
                            valid_negll >= np.array(history_negll)[:-10, 0].min()):
                        bad_counter += 1
                        if bad_counter > 10:
                            logger.info('Early Stop!')
                            estop = True
                            break

            logger.info('Seen {} samples'.format(n_samples))

            if estop:
                break

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    logger.info('Perp: Valid {} Test {}'.format(np.exp(best_valid_negll), np.exp(best_test_negll)))
    np.savez(saveto, history_negll=history_negll, **best_p)

    logger.info('The code ran for {} epochs, with {} sec/epoch'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return best_valid_negll, best_test_negll

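
# The pSGLD loop above averages per-word predictive probabilities over successive
# posterior samples and reports perplexity as exp(average negative log-likelihood).
# Below is a minimal numpy sketch of that running average, using a made-up list of
# probability vectors in place of calu_pred_prob outputs:
def _average_probs_sketch(prob_draws, eps=1e-6):
    """prob_draws: list of 1-D arrays of per-word probabilities, one per sample."""
    avg = np.zeros_like(prob_draws[0])
    for n_average, probs_curr in enumerate(prob_draws):
        # same recursion as valid_probs = (n*avg + curr) / (n+1) in the loop above
        avg = (n_average * avg + probs_curr) / (n_average + 1)
    negll = -np.log(avg + eps).sum() / avg.shape[0]
    return negll, np.exp(negll)  # per-word negative log-likelihood, perplexity
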
def trainer(train, valid, test, n_chars=33, img_w=128, max_len=27, feature_maps=100,
            filter_hs=[2, 3, 4], max_epochs=20, gamma=10, ncon=50, lrate=0.0002,
            batch_size=100, dispFreq=10, validFreq=100, saveto='example.npz'):
    """ train, valid, test : datasets
        n_chars : vocabulary size
        img_w : character embedding dimension
        max_len : the maximum length of a sentence
        feature_maps : the number of feature maps we used
        filter_hs : the filter window sizes we used
        max_epochs : the maximum number of epochs to run
        gamma : hyper-parameter used in the ranking loss
        ncon : the number of negative samples used for each positive sample
        lrate : learning rate
        batch_size : batch size during training
        dispFreq : display the training progress to stdout every N updates
        validFreq : compute the validation rank scores after this number of updates
        saveto : where to save the result
    """

    img_h = max_len + 2 * (filter_hs[-1] - 1)

    model_options = {}
    model_options['n_chars'] = n_chars
    model_options['img_w'] = img_w
    model_options['img_h'] = img_h
    model_options['feature_maps'] = feature_maps
    model_options['filter_hs'] = filter_hs
    model_options['max_epochs'] = max_epochs
    model_options['gamma'] = gamma
    model_options['ncon'] = ncon
    model_options['lrate'] = lrate
    model_options['batch_size'] = batch_size
    model_options['dispFreq'] = dispFreq
    model_options['validFreq'] = validFreq
    model_options['saveto'] = saveto

    logger.info('Model options {}'.format(model_options))

    logger.info('Building model...')

    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))

    model_options['filter_shapes'] = filter_shapes
    model_options['pool_sizes'] = pool_sizes

    params = init_params(model_options)
    tparams = init_tparams(params)

    use_noise, inps, cost = build_model(tparams, model_options)

    logger.info('Building encoder...')
    inps_e, feat_x, feat_y = build_encoder(tparams, model_options)

    logger.info('Building functions...')
    f_emb = theano.function(inps_e, [feat_x, feat_y], name='f_emb')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, inps, lr)

    logger.info('Training model...')

    uidx = 0
    seed = 1234
    curr = 0
    history_errs = []

    valid_x = prepare_data(valid[0], max_len, n_chars, filter_hs[-1])
    valid_y = prepare_data(valid[1], max_len, n_chars, filter_hs[-1])
    test_x = prepare_data(test[0], max_len, n_chars, filter_hs[-1])
    test_y = prepare_data(test[1], max_len, n_chars, filter_hs[-1])

    zero_vec_tensor = tensor.vector()
    zero_vec = np.zeros(img_w).astype(theano.config.floatX)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[(tparams['Wemb'],
                                         tensor.set_subtensor(tparams['Wemb'][n_chars - 1, :],
                                                              zero_vec_tensor))])

    # Main loop
    for eidx in range(max_epochs):
        prng = RandomState(seed - eidx - 1)

        trainA = train[0]
        trainB = train[1]

        num_samples = len(trainA)
        inds = np.arange(num_samples)
        prng.shuffle(inds)
        numbatches = len(inds) / batch_size

        for minibatch in range(numbatches):
            use_noise.set_value(0.)
            uidx += 1
            conprng = RandomState(seed + uidx + 1)

            x = [trainA[seq] for seq in inds[minibatch::numbatches]]
            y = [trainB[seq] for seq in inds[minibatch::numbatches]]

            cinds = conprng.random_integers(low=0, high=num_samples - 1, size=ncon * len(x))
            cy = [trainB[seq] for seq in cinds]

            x = prepare_data(x, max_len, n_chars, filter_hs[-1])
            y = prepare_data(y, max_len, n_chars, filter_hs[-1])
            cy = prepare_data(cy, max_len, n_chars, filter_hs[-1])

            cost = f_grad_shared(x, y, cy)
            f_update(lrate)
            # the special padding token does not need to be updated
            set_zero(zero_vec)

            if np.mod(uidx, dispFreq) == 0:
                logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, cost))

            if np.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                logger.info('Computing ranks...')

                feats_x, feats_y = f_emb(valid_x, valid_y)
                (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y)
                history_errs.append([r1, r3, r10, medr, meanr, h_meanr])
                logger.info('Valid Rank: {}, {}, {}, {}, {}, {}'.format(
                    r1, r3, r10, medr, meanr, h_meanr))

                currscore = r1 + r3 + r10
                if currscore > curr:
                    curr = currscore
                    logger.info('Saving...')
                    params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    logger.info('Done...')

    use_noise.set_value(0.)
    zipp(params, tparams)

    logger.info('Final results...')
    feats_x, feats_y = f_emb(valid_x, valid_y)
    (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y)
    logger.info('Valid Rank: {}, {}, {}, {}, {}, {}'.format(
        r1, r3, r10, medr, meanr, h_meanr))

    feats_x, feats_y = f_emb(test_x, test_y)
    (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y)
    logger.info('Test Rank: {}, {}, {}, {}, {}, {}'.format(
        r1, r3, r10, medr, meanr, h_meanr))

    # np.savez("./cnn_feats.npz", feats_x=feats_x, feats_y=feats_y)

    return (r1, r3, r10, medr, meanr, h_meanr)

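
# rank() above is defined elsewhere in the repository.  The hypothetical sketch below
# shows one common way such retrieval metrics (R@1, R@3, R@10, median/mean/harmonic-mean
# rank) can be computed from paired embeddings, assuming row i of feats_x should
# retrieve row i of feats_y; the real helper may differ in details.
def _rank_sketch(feats_x, feats_y):
    # cosine similarity between every query and every candidate
    fx = feats_x / (np.linalg.norm(feats_x, axis=1, keepdims=True) + 1e-8)
    fy = feats_y / (np.linalg.norm(feats_y, axis=1, keepdims=True) + 1e-8)
    sims = fx.dot(fy.T)
    order = np.argsort(-sims, axis=1)                    # best match first
    ranks = np.array([np.where(order[i] == i)[0][0] + 1  # 1-based rank of the true pair
                      for i in range(sims.shape[0])])
    r1 = 100.0 * np.mean(ranks <= 1)
    r3 = 100.0 * np.mean(ranks <= 3)
    r10 = 100.0 * np.mean(ranks <= 10)
    return r1, r3, r10, np.median(ranks), ranks.mean(), 1.0 / np.mean(1.0 / ranks)
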
def train_model(train, val, test, n_words=21103, img_w=300, max_len=40, feature_maps=200,
                filter_hs=[3, 4, 5], n_x=300, n_h=600, max_epochs=8, lrate=0.0002,
                batch_size=64, valid_batch_size=64, dispFreq=10, validFreq=500,
                saveFreq=1000, saveto='bookcorpus_result.npz'):
    """ train, valid, test : datasets
        n_words : vocabulary size
        img_w : word embedding dimension, must be 300
        max_len : the maximum length of a sentence
        feature_maps : the number of feature maps we used
        filter_hs : the filter window sizes we used
        n_x : word embedding dimension
        n_h : the number of hidden units in the LSTM
        max_epochs : the maximum number of epochs to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : the batch size used for the validation/test sets
        dispFreq : display the training progress to stdout every N updates
        validFreq : compute the validation error after this number of updates
        saveFreq : save the result after this number of updates
        saveto : where to save the result
    """

    img_h = max_len + 2 * (filter_hs[-1] - 1)

    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq
    options['saveFreq'] = saveFreq

    logger.info('Model options {}'.format(options))

    logger.info('Building model...')

    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))

    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes

    params = init_params(options)
    tparams = init_tparams(params)

    use_noise, x, y, y_mask, cost = build_model(tparams, options)

    f_cost = theano.function([x, y, y_mask], cost, name='f_cost')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, [x, y, y_mask], lr)

    logger.info('Training model...')

    history_cost = []
    uidx = 0  # the number of updates done
    start_time = time.time()

    kf_valid = get_minibatches_idx(len(val), valid_batch_size)

    zero_vec_tensor = tensor.vector()
    zero_vec = np.zeros(img_w).astype(theano.config.floatX)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[(tparams['Wemb'],
                                         tensor.set_subtensor(tparams['Wemb'][21102, :],
                                                              zero_vec_tensor))])

    try:
        for eidx in xrange(max_epochs):
            n_samples = 0
            kf = get_minibatches_idx(len(train), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(0.)

                sents = [train[t] for t in train_index]

                x = prepare_data_for_cnn(sents)
                y, y_mask = prepare_data_for_rnn(sents)
                n_samples += y.shape[1]

                cost = f_grad_shared(x, y, y_mask)
                f_update(lrate)
                # the special <pad_zero> token does not need to be updated
                set_zero(zero_vec)

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, np.exp(cost)))

                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    params = unzip(tparams)
                    np.savez(saveto, history_cost=history_cost, **params)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    valid_cost = calu_cost(f_cost, prepare_data_for_cnn, prepare_data_for_rnn, val, kf_valid)
                    history_cost.append([valid_cost])
                    logger.info('Valid {}'.format(np.exp(valid_cost)))

            logger.info('Seen {} samples'.format(n_samples))

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    # if best_p is not None:
    #     zipp(best_p, tparams)
    # else:
    #     best_p = unzip(tparams)

    use_noise.set_value(0.)
    valid_cost = calu_cost(f_cost, prepare_data_for_cnn, prepare_data_for_rnn, val, kf_valid)
    logger.info('Valid {}'.format(np.exp(valid_cost)))

    params = unzip(tparams)
    np.savez(saveto, history_cost=history_cost, **params)

    logger.info('The code ran for {} epochs, with {} sec/epoch'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return valid_cost

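
# The training loops above reset the embedding row of the padding token to zero after
# every gradient step (the set_zero Theano function).  A plain-numpy sketch of the same
# idea, with a hypothetical embedding matrix standing in for tparams['Wemb']:
def _reset_pad_row_sketch(Wemb, pad_idx):
    """Zero out the embedding of the padding token so optimizer updates never move it."""
    Wemb[pad_idx, :] = 0.
    return Wemb

# usage: after each SGD/Adam step on a (n_words, img_w) embedding matrix
# Wemb = _reset_pad_row_sketch(Wemb, pad_idx=n_words - 1)
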
def train_classifier(train, valid, test, W, n_words=10000, img_w=300, max_len=40,
                     feature_maps=100, filter_hs=[3, 4, 5], dropout_val=0.5,
                     patience=10, max_epochs=20, lrate=0.0002, batch_size=50,
                     valid_batch_size=50, dispFreq=10, validFreq=100, saveFreq=200,
                     saveto='trec_cnn_result.npz'):
    """ train, valid, test : datasets
        W : the word embedding initialization
        n_words : vocabulary size
        img_w : word embedding dimension, must be 300
        max_len : the maximum length of a sentence
        feature_maps : the number of feature maps we used
        filter_hs : the filter window sizes we used
        dropout_val : dropout probability
        patience : number of epochs to wait before early stopping if no progress
        max_epochs : the maximum number of epochs to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : the batch size used for the validation/test sets
        dispFreq : display the training progress to stdout every N updates
        validFreq : compute the validation error after this number of updates
        saveFreq : save the result after this number of updates
        saveto : where to save the result
    """

    img_h = max_len + 2 * (filter_hs[-1] - 1)

    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs
    options['patience'] = patience
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq

    logger.info('Model options {}'.format(options))
    logger.info('{} train examples'.format(len(train[0])))
    logger.info('{} valid examples'.format(len(valid[0])))
    logger.info('{} test examples'.format(len(test[0])))

    logger.info('Building model...')

    n_y = np.max(train[1]) + 1
    options['n_y'] = n_y

    """ Train a simple conv net
        img_h = sentence length (padded where necessary)
        img_w = word vector length (300 for word2vec)
        filter_hs = filter window sizes
    """
    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))

    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes

    params = init_params(options, W)
    tparams = init_tparams(params)

    (use_noise, x, y, f_pred_prob, f_pred, cost) = build_model(tparams, options)

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, [x, y], lr)

    logger.info('Training model...')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    estop = False  # early stop
    history_errs = []
    best_p = None
    bad_counter = 0
    uidx = 0  # the number of updates done
    start_time = time.time()

    zero_vec_tensor = tensor.vector()
    zero_vec = np.zeros(img_w).astype(theano.config.floatX)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[(tparams['Wemb'],
                                         tensor.set_subtensor(tparams['Wemb'][n_words - 1, :],
                                                              zero_vec_tensor))])

    try:
        for eidx in xrange(max_epochs):
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(dropout_val)

                y = np.array([train[1][t] for t in train_index]).astype('int32')
                x = [train[0][t] for t in train_index]
                x = prepare_data(x, max_len, n_words, filter_hs[-1])

                cost = f_grad_shared(x, y)
                f_update(lrate)
                # the special padding token does not need to be updated
                set_zero(zero_vec)

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, cost))

                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    train_err = pred_error(f_pred, prepare_data, train, kf,
                                           max_len, n_words, filter_hs[-1])
                    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid,
                                           max_len, n_words, filter_hs[-1])
                    test_err = pred_error(f_pred, prepare_data, test, kf_test,
                                          max_len, n_words, filter_hs[-1])
                    history_errs.append([valid_err, test_err, train_err])

                    if (uidx == 0 or
                            valid_err <= np.array(history_errs)[:, 0].min()):
                        best_p = unzip(tparams)
                        bad_counter = 0

                    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))

                    if (len(history_errs) > patience and
                            valid_err >= np.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            logger.info('Early Stop!')
                            estop = True
                            break

            if estop:
                break

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted,
                           max_len, n_words, filter_hs[-1])
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid,
                           max_len, n_words, filter_hs[-1])
    test_err = pred_error(f_pred, prepare_data, test, kf_test,
                          max_len, n_words, filter_hs[-1])

    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))

    np.savez(saveto, train_err=train_err, valid_err=valid_err, test_err=test_err,
             history_errs=history_errs, **best_p)

    logger.info('The code ran for {} epochs, with {} sec/epoch'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return train_err, valid_err, test_err

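
# The classifiers above stop early once the validation error has not improved on its
# best value for more than `patience` validation checks.  A standalone sketch of that
# bookkeeping, driven by a made-up list of validation errors instead of pred_error:
def _early_stop_sketch(valid_errs, patience=10):
    history = []
    bad_counter = 0
    for step, err in enumerate(valid_errs):
        history.append(err)
        if err <= min(history):          # new best: reset the counter
            bad_counter = 0
        if len(history) > patience and err >= min(history[:-patience]):
            bad_counter += 1
            if bad_counter > patience:
                return step              # validation check at which training stops
    return None                          # patience never exhausted
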
def train_model(train, val, test, train_lab, val_lab, test_lab, ixtoword,
                n_words=22153, period=887, img_w=300, img_h=148, feature_maps=300,
                filter_hs=[3, 4, 5], n_x=300, n_h=500, n_h2_d=200, n_h2=900,
                p_lambda_q=0, p_lambda_fm=0.001, p_lambda_recon=0.001, n_codes=2,
                max_epochs=16, lr_d=0.0001, lr_g=0.00005, kde_sigma=1.,
                batch_size=256, valid_batch_size=256, dim_mmd=32, dispFreq=10,
                dg_ratio=1, Large=1e3, validFreq=500, saveFreq=500,
                saveto='disent_result'):
    """ n_words : word vocabulary size
        feature_maps : CNN embedding dimension for each filter width
        filter_hs : CNN filter widths
        n_h : LSTM/GRU number of hidden units
        n_h2 : discriminative network number of hidden units
        n_gan : number of hidden units in the GAN (computed below)
        n_codes : number of latent codes
        max_epochs : the maximum number of epochs to run
        lr_d, lr_g : learning rates for the discriminator and generator
        batch_size : batch size during training
        valid_batch_size : the batch size used for the validation/test sets
        dispFreq : display the training progress to stdout every N updates
        validFreq : compute the validation error after this number of updates
    """

    n_gan = len(filter_hs) * feature_maps  # 900

    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs  # filter widths
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['n_h2'] = n_h2
    options['n_h2_d'] = n_h2_d
    options['n_codes'] = n_codes
    options['lambda_q'] = p_lambda_q
    options['lambda_fm'] = p_lambda_fm  # weight for feature matching
    options['lambda_recon'] = p_lambda_recon
    options['L'] = Large
    options['max_epochs'] = max_epochs
    options['lr_d'] = lr_d
    options['lr_g'] = lr_g
    options['kde_sigma'] = kde_sigma
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq
    options['saveFreq'] = saveFreq
    options['dg_ratio'] = dg_ratio
    options['n_gan'] = n_gan
    options['debug'] = False
    options['feature_match'] = 'mmd'  # one of 'mmd', 'mmd_h', 'mmd_ld', 'JSD_acc', 'moment', None
    options['shareLSTM'] = True
    options['delta'] = 0.00
    options['sharedEmb'] = False
    options['cnn_activation'] = 'tanh'
    options['sigma_range'] = [20]  # range of sigma for mmd
    options['diag'] = 0.1  # diagonal term added to the covariance for JSD_acc
    options['label_smoothing'] = 0.01
    options['dim_mmd'] = dim_mmd
    options['force_cut'] = 'None'
    options['batch_norm'] = False
    options['wgan'] = False
    options['cutoff'] = 0.01
    options['max_step'] = 60
    options['period'] = period

    logger.info('Model options {}'.format(options))

    logger.info('Building model...')

    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))

    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes

    # generative model for GAN
    ## modified
    n_label = len(set(train_lab))
    options['label_sizes'] = n_label

    n_feature = len(options['filter_hs']) * options['feature_maps']
    options['input_shape'] = (n_h2_d, n_feature)
    options['pred_shape'] = (1, n_h2_d)
    if options['feature_match'] == 'mmd_ld':
        options['mmd_shape'] = (dim_mmd, n_h2_d)
    options['propose_shape'] = (n_codes, n_h2_d)

    # if options['reverse']:
    options['input_recon_shape'] = (n_h2, n_feature)
    options['recon_shape'] = (n_gan, n_h2) if options['shareLSTM'] else (n_gan + 1, n_h2)
    ##

    d_params_s, g_params_s, s_params_s = init_params(options)
    d_params, g_params, s_params = init_tparams(d_params_s, g_params_s, s_params_s, options)

    lr_d_t = tensor.scalar(name='lr_d')
    lr_g_t = tensor.scalar(name='lr_g')

    (use_noise, use_noise2, x, z, d_cost, g_cost, r_cost, fake_recon,
     acc_fake_xx, acc_real_xx, acc_fake_mean, acc_real_mean,
     wtf1, wtf2, wtf3, wtf4, wtf5, wtf6, KDE, KDE_input) = build_model(
        d_params, g_params, s_params, options)

    # change
    f_cost = theano.function([x, z], [d_cost, g_cost, KDE, KDE_input], name='f_cost')
    # f_print = theano.function([x, z], [wtf1, wtf2, wtf3, wtf4, wtf5, wtf6, KDE, KDE_input],
    #                           name='f_print', on_unused_input='ignore')
    f_print = theano.function([x, z], [wtf1, wtf2, wtf3, wtf4, wtf5, wtf6], name='f_print')
    f_recon = theano.function([x, z], [r_cost, fake_recon, d_cost], name='f_recon',
                              on_unused_input='ignore')

    if options['feature_match']:
        ss_updates = [(s_params['acc_fake_xx'], acc_fake_xx),
                      (s_params['acc_real_xx'], acc_real_xx),
                      (s_params['acc_fake_mean'], acc_fake_mean),
                      (s_params['acc_real_mean'], acc_real_mean),
                      (s_params['seen_size'], s_params['seen_size'] + options['batch_size'])]
        f_update_ss = theano.function([x, z], s_params, updates=ss_updates)

    f_cost_d, _train_d = Adam(d_params, d_cost, [x, z], lr_d_t)
    if options['feature_match']:
        f_cost_g, _train_g = Adam(g_params, g_cost, [x, z], lr_g_t)
    else:
        f_cost_g, _train_g = Adam(g_params, g_cost, [z], lr_g_t)
    ##

    logger.info('Training model...')

    history_cost = []
    uidx = 0  # the number of updates done
    kdes = np.zeros(10)
    kde_std = 0.  # standard deviation of every 10 kde_input
    kde_mean = 0.
    start_time = time.time()

    kf_valid = get_minibatches_idx(len(val), valid_batch_size)

    y_min = min(train_lab)
    train_lab = [t - y_min for t in train_lab]
    val_lab = [t - y_min for t in val_lab]
    test_lab = [t - y_min for t in test_lab]

    testset = [prepare_for_bleu(s) for s in test[:1000]]

    try:
        for eidx in xrange(max_epochs):
            n_samples = 0
            kf = get_minibatches_idx(len(train), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(0.0)
                use_noise2.set_value(0.0)

                sents = [train[t] for t in train_index]
                x = prepare_data_for_cnn(sents)
                n_samples += x.shape[0]

                if options['shareLSTM']:
                    z = np.random.uniform(-1, 1, (batch_size, n_gan)).astype('float32')
                else:
                    z = np.random.uniform(-1, 1, (batch_size, n_gan + 1)).astype('float32')
                    z[:, 0] = np.random.randint(n_codes, size=batch_size).astype('float32')

                # update gradient
                if options['feature_match']:
                    cost_g = f_cost_g(x, z)
                else:
                    cost_g = f_cost_g(z)

                if np.isnan(cost_g):
                    logger.info('NaN detected')
                    temp_out = f_print(x, z)
                    print 'real' + str(temp_out[0]) + ' fake' + str(temp_out[1])
                    return 1., 1., 1.
                if np.isinf(cost_g):
                    temp_out = f_print(x, z)
                    print 'real' + str(temp_out[0]) + ' fake' + str(temp_out[1])
                    logger.info('Inf detected')
                    return 1., 1., 1.

                # update G
                _train_g(lr_g)

                if np.mod(uidx, dispFreq) == 0:
                    temp_out = f_print(x, z)
                    _, _, cost_d = f_recon(x, z)
                    np.set_printoptions(precision=3)
                    np.set_printoptions(threshold=np.inf)
                    print 'real ' + str(round(temp_out[0], 2)) + \
                        ' fake ' + str(round(temp_out[1], 2)) + \
                        ' Covariance loss ' + str(round(temp_out[3], 2)) + \
                        ' mean loss ' + str(round(temp_out[5], 2))
                    print 'cost_g ' + str(cost_g) + ' cost_d ' + str(cost_d)
                    print("Generated:" + " ".join(
                        [ixtoword[x] for x in temp_out[2][0] if x != 0]))
                    logger.info(
                        'Epoch {} Update {} Cost G {} Real {} Fake {} loss_cov {} meanMSE {}'
                        .format(eidx, uidx, cost_g, round(temp_out[0], 2),
                                round(temp_out[1], 2), temp_out[3], temp_out[5]))
                    logger.info('Generated: {}'.format(" ".join(
                        [ixtoword[x] for x in temp_out[2][0] if x != 0])))

                if np.mod(uidx, dg_ratio) == 0:
                    x = prepare_data_for_cnn(sents)
                    cost_d = f_cost_d(x, z)
                    # update D
                    _train_d(lr_d)

                    if np.mod(uidx, dispFreq) == 0:
                        logger.info('Cost D {}'.format(cost_d))

                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    d_params_s = unzip(d_params)
                    g_params_s = unzip(g_params)
                    params_d = OrderedDict()
                    params_g = OrderedDict()
                    for kk, pp in d_params_s.iteritems():
                        params_d[kk] = np.asarray(d_params_s[kk])
                    for kk, pp in g_params_s.iteritems():
                        params_g[kk] = np.asarray(g_params_s[kk])
                    np.savez(saveto + '_d.npz', history_cost=history_cost,
                             options=options, **params_d)
                    np.savez(saveto + '_g.npz', history_cost=history_cost,
                             options=options, **params_g)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    use_noise2.set_value(0.)

                    if options['shareLSTM']:
                        val_z = np.random.uniform(-1, 1, (batch_size, n_gan)).astype('float32')
                    else:
                        val_z = np.random.uniform(-1, 1, (batch_size, n_gan + 1)).astype('float32')

                    temp_out = f_print(x, val_z)
                    predset = temp_out[2]
                    [bleu2s, bleu3s, bleu4s] = cal_BLEU(
                        [prepare_for_bleu(s) for s in predset], {0: testset})
                    logger.info('Valid BLEU2 = {}, BLEU3 = {}, BLEU4 = {}'.format(
                        bleu2s, bleu3s, bleu4s))
                    print 'Valid BLEU (2,3,4): ' + ' '.join(
                        [str(round(it, 3)) for it in (bleu2s, bleu3s, bleu4s)])

                if options['feature_match']:
                    f_update_ss(x, z)

            logger.info('Seen {} samples'.format(n_samples))

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    logger.info('The code ran for {} epochs, with {} sec/epoch'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return history_cost

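
# The GAN loop above feeds the generator uniform noise and, when the LSTM is not
# shared, prepends one extra dimension that carries a discrete latent code.  A minimal
# sketch of that sampling step, assuming the same layout used in the loop above:
def _sample_z_sketch(batch_size, n_gan, n_codes, share_lstm=True):
    if share_lstm:
        # pure noise vector of size n_gan
        z = np.random.uniform(-1, 1, (batch_size, n_gan)).astype('float32')
    else:
        # extra leading dimension holds the discrete code in {0, ..., n_codes-1}
        z = np.random.uniform(-1, 1, (batch_size, n_gan + 1)).astype('float32')
        z[:, 0] = np.random.randint(n_codes, size=batch_size).astype('float32')
    return z
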
if np.isnan(cost) or np.isinf(cost):
    print('NaN detected')
    [train_err, valid_err, test_err] = [1., 1., 1.]

if np.mod(uidx, dispFreq) == 0:
    print('Epoch {} Update {} Cost {}'.format(eidx, uidx, cost))

if np.mod(uidx, saveFreq) == 0:
    print('Saving ...')
    if best_p is not None:
        params = best_p
    else:
        params = unzip(tparams)
    np.savez(saveto, history_errs=history_errs, **params)
    with open('data/sst2_model.pickle', 'wb') as f:
        model = params
        cPickle.dump(model, f)
    print('Done ...')

if np.mod(uidx, validFreq) == 0:
    use_noise.set_value(0.)
    train_err = pred_error(f_pred, prepare_data, train, kf)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, prepare_data, test, kf_test)
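
# The snippet above saves the current parameters both with np.savez (to `saveto`) and as
# a pickle at data/sst2_model.pickle.  The sketch below shows how either artifact could
# be reloaded; the default npz file name is hypothetical, only the pickle path is taken
# from the snippet above.
def _load_saved_params_sketch(saveto='sst2_result.npz'):
    archive = np.load(saveto)
    history_errs = archive['history_errs']
    params = {k: archive[k] for k in archive.files if k != 'history_errs'}
    with open('data/sst2_model.pickle', 'rb') as f:
        pickled_params = cPickle.load(f)
    return params, history_errs, pickled_params
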