def train_nn(
        # Hyper-parameters
        dim_token=100,       # word embedding dimension
        lstm_layer_0_n=50,
        lstm_layer_1_n=50,
        ydim0=5,
        ydim1=6,
        #win_size = 3,
        #n_cueTypes = 4,
        n_vocb_words=15489,  # vocabulary size
        #n_locDiffs = 111,   # location-difference size
        patience=10,         # number of epochs to wait before early stopping if no progress
        max_epochs=100,      # the maximum number of epochs to run
        #dispFreq=10,        # display training progress to stdout every N updates
        #decay_c=0.,         # weight decay for the classifier, applied to the U weights
        lrate=0.01,          # learning rate for sgd (not used by adadelta and rmsprop)
        dropout_p=1.0,
        adv_epsilon=0.001,   # scale of the adversarial perturbation on the embeddings
        optimizer=momentum,  # sgd, adadelta and rmsprop are available; plain sgd is very
                             # hard to use and not recommended (it probably needs momentum
                             # and a decaying learning rate)
        #maxlen=1000,        # sequences longer than this are ignored
        batch_size=10,       # the batch size during training
        #inter_cost_margin = 0.001,
        # Parameters for extra options
        #noise_std=0.,
        #use_dropout=True,   # if False, slightly faster but worse test error;
                             # dropout frequently needs a bigger model
        #reload_model=None,  # path to a saved model we want to start from
        #test_size=-1
):
    # Model options
    model_options = locals().copy()
    print('-------------------------------------------------------------')
    print("model options", model_options)
    print('-------------------------------------------------------------')

    #load_data, prepare_data = get_dataset(dataset)
    print('Loading data ... ... ...')
    train, valid, test = data.load_data(path='../mydata.pkl.gz',
                                        n_words=n_vocb_words)

    print('Building model ... ... ...')
    params = init_params(model_options,
                         Wemb_value=data.read_gz_file("../../matrix.pkl.gz"))
    tparams = init_tparams(params)

    (x, masks, y,
     f_pred_prob, f_pred,
     f_adv_pred_prob, f_adv_pred,
     cost, adv_cost) = build_model(tparams, model_options)

    #f_cost = theano.function([x[0], x[1], masks[0], masks[1], y], cost, name='f_cost')
    grads = tensor.grad(cost, wrt=list(tparams.values()))
    #f_grad = theano.function([x[0], x[1], masks[0], masks[1], y], grads, name='f_grad')

    # Gradient of the clean cost w.r.t. the word embeddings; it drives the
    # fast-gradient-style adversarial perturbation computed in the loop below.
    adv_grads = tensor.grad(cost, wrt=tparams['Wemb'])
    f_adv_grad = theano.function([x[0], x[1], masks[0], masks[1], y],
                                 adv_grads, name='f_adv_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, masks, y, cost)

    print('training ... ... ...')
    kf_valid = my_get_minibatches_idx(len(valid[0]))
    kf_test = my_get_minibatches_idx(len(test[0]))

    print("%d train examples" % len(train[0]))
    print("%d valid examples" % len(valid[0]))
    print("%d test examples" % len(test[0]))

    #history_errs = []
    best_p = None
    bad_counter = 0
    stop_counter = 0

    #if validFreq == -1:
    #    validFreq = len(train[0]) // batch_size
    #if saveFreq == -1:
    #    saveFreq = len(train[0]) // batch_size

    last_ave_of_train_costs = numpy.inf
    costs_list = []

    uidx = 0       # the number of updates done
    estop = False  # early stop
    #start_time = time.time()

    try:
        for eidx in range(max_epochs):
            n_samples = 0

            # Get new shuffled indices for the training set.
            kf = my_get_minibatches_idx(len(train[0]), shuffle=True)
            #training_sum_costs = 0
            #ave_of_g_costs_sum = 0
            #ave_of_d_costs_sum = 0

            for train_batch_idx, train_index in kf:
                #uidx += 1
                #use_noise.set_value(1.)
                # Select the random examples for this minibatch.
                x_0 = train[0][train_index]
                x_1 = train[1][train_index]
                y_0 = train[2][train_index]

                x_0, mask_0, _ = data.prepare_data(x_0)
                x_1, mask_1, _ = data.prepare_data(x_1)
                y_0 = numpy.asarray(y_0, dtype='int32')
                #print(y_0)
                #print(type(y_0))
                #print(y_0.ndim)

                cost = f_grad_shared(x_0, x_1, mask_0, mask_1, y_0)
                costs_list.append(cost)
                f_update(lrate)

                # Adversarial perturbation of the embeddings:
                # p = adv_epsilon * g / sqrt(g^2 + 1e-4), i.e. a smoothed,
                # elementwise sign(g) scaled to adv_epsilon.  f_adv_grad
                # returns a numpy array, so the normalization must use numpy
                # (the original called tensor.sqrt on it), and the result is
                # written into the shared variable p_Wemb with set_value()
                # rather than overwriting the dict entry, which would leave
                # the compiled graph pointing at the stale shared variable.
                cur_adv_grad = f_adv_grad(x_0, x_1, mask_0, mask_1, y_0)
                tparams['p_Wemb'].set_value(
                    adv_epsilon * cur_adv_grad /
                    numpy.sqrt(cur_adv_grad**2 + 1e-4))

                if train_batch_idx % 100 == 0 or train_batch_idx == len(kf) - 1:
                    print("---Now %d/%d training batches @ epoch = %d"
                          % (train_batch_idx, len(kf), eidx))

            cur_ave_of_train_costs = sum(costs_list) / len(costs_list)
            print("cur_ave_of_train_costs = ", cur_ave_of_train_costs,
                  "@ epoch = ", eidx)
            if numpy.isnan(cur_ave_of_train_costs) or \
                    numpy.isinf(cur_ave_of_train_costs):
                print('bad cost detected: ', cur_ave_of_train_costs)
                print('End of Program')
                break

            print('outputting predicted labels of test set ... ... ...')
            output_pred_labels(model_options, f_pred, f_pred_prob,
                               data.prepare_data, test, kf_test,
                               verbose=False, path="test_pred_labels.txt")

            # Early stopping: count epochs whose average training cost fails
            # to drop below 90% of the previous epoch's average.
            if cur_ave_of_train_costs >= last_ave_of_train_costs * 0.9:
                stop_counter += 1
            last_ave_of_train_costs = cur_ave_of_train_costs
            print('counter for early stopping : %d/%d' % (stop_counter, patience))
            print('learning rate in this epoch = ', lrate)
            print('--------------------------------------------------')
            del costs_list[:]

            if stop_counter >= patience:
                print('Early Stop!')
                estop = True
                break

            if estop:
                break
    except KeyboardInterrupt:
        print("Training interrupted")
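# A minimal, self-contained sketch of the perturbation rule used above,
# assuming (as in fast-gradient-style adversarial training on embeddings)
# that p = epsilon * g / sqrt(g^2 + delta) is meant as a smoothed,
# elementwise sign(g) scaled to epsilon.  The function and variable names
# here are illustrative only and not part of this codebase.
def _adv_perturbation_demo():
    import numpy
    epsilon, delta = 0.001, 1e-4
    g = numpy.array([[0.5, -2.0], [0.001, 0.0]])  # a fake embedding gradient
    p = epsilon * g / numpy.sqrt(g**2 + delta)
    # Large-magnitude gradients saturate to +/- epsilon; tiny ones shrink
    # toward zero instead of jumping to +/- epsilon as a hard sign() would.
    print(p)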
def train_cnn(
        # Hyper-parameters
        dim_token=100,     # word embedding dimension
        dim_locDiff=10,    # location-difference dimension
        dim_cueType=10,
        cnn_n1=50,
        n2=10 + 10 + 100,
        ydim0=3,
        ydim1=3,
        #win_size = 3,
        #maxTokens1 = 60,  # maximum tokens in sentence 1
        n_cueTypes=5,
        n_words=4000,      # vocabulary size
        n_locDiffs=108,    # location-difference size
        patience=10,       # number of epochs to wait before early stopping if no progress
        max_epochs=300,    # the maximum number of epochs to run
        #dispFreq=10,      # display training progress to stdout every N updates
        #decay_c=0.,       # weight decay for the classifier, applied to the U weights
        lrate=0.01,        # learning rate for sgd (not used by adadelta and rmsprop)
        optimizer=momentum,  # sgd, adadelta and rmsprop are available; plain sgd is very
                             # hard to use and not recommended (it probably needs momentum
                             # and a decaying learning rate)
        #maxlen=1000,      # sequences longer than this are ignored
        batch_size=16,     # the batch size during training
        # Parameters for extra options
        noise_std=0.,
        use_dropout=True,  # if False, slightly faster but worse test error;
                           # dropout frequently needs a bigger model
        #reload_model=None,  # path to a saved model we want to start from
        test_size=-1):

    # Model options
    model_options = locals().copy()
    print('----------------------------------------------')
    print("model options", model_options)
    print('----------------------------------------------')

    #load_data, prepare_data = get_dataset(dataset)
    print('Loading data ... ... ...')
    train, valid, test = data.load_data(path='../mydata.pkl.gz',
                                        n_words=n_words, valid_portion=0.)

    '''if test_size > 0:
        # The test set is sorted by size, but we want to keep randomly
        # sized examples, so we must select a random subset.
        idx = numpy.arange(len(test[0]))
        numpy.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])'''

    print('Building model ... ... ...')
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options,
                         Wemb_value=data.read_gz_file("../matrix.pkl.gz"))

    '''if reload_model:
        load_params('cnn_model.npz', params)'''

    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano tensor shared variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, masks, y,
     f_pred_prob, f_pred, cost,
     f_pred_prob_test, f_pred_test) = build_model(tparams, model_options)

    '''if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay'''

    f_cost = theano.function([x[0], x[1], x[2], x[3],
                              masks[0], masks[1], masks[2], masks[3],
                              y[0], y[1]],
                             cost, name='f_cost')

    grads = tensor.grad(cost, wrt=list(tparams.values()))
    f_grad = theano.function([x[0], x[1], x[2], x[3],
                              masks[0], masks[1], masks[2], masks[3],
                              y[0], y[1]],
                             grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, masks, y, cost)

    #print('Optimization')
    print('training ... ... ...')
    kf_valid = get_minibatches_idx(len(valid[0]), batch_size)
    kf_test = get_minibatches_idx(len(test[0]), batch_size)

    print("%d train examples" % len(train[0]))
    print("%d valid examples" % len(valid[0]))
    print("%d test examples" % len(test[0]))

    #history_errs = []
    best_p = None
    bad_counter = 0

    '''if validFreq == -1:
        validFreq = len(train[0]) // batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) // batch_size'''

    last_training_sum_costs = numpy.inf

    uidx = 0       # the number of updates done
    estop = False  # early stop
    #start_time = time.time()

    try:
        for eidx in range(max_epochs):
            n_samples = 0

            # Get new shuffled indices for the training set.
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
            training_sum_costs = 0

            for train_batch_idx, train_index in kf:
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples for this minibatch.
                x_0 = [train[0][t] for t in train_index]
                x_1 = [train[1][t] for t in train_index]
                x_2 = [train[2][t] for t in train_index]
                x_3 = [train[3][t] for t in train_index]
                y_0 = [train[4][t] for t in train_index]
                y_1 = [train[5][t] for t in train_index]

                # Get the data in numpy.ndarray format.
                # Returns arrays of shape (minibatch maxlen, n samples).
                x_0, mask_0 = data.prepare_data(x_0)
                x_1, mask_1 = data.prepare_data(x_1)
                x_2, mask_2 = data.prepare_data(x_2)
                x_3, mask_3 = data.prepare_data(x_3)
                y_0 = numpy.asarray(y_0, dtype='int32')
                y_1 = numpy.asarray(y_1, dtype='int32')
                n_samples += x_0.shape[1]

                if train_batch_idx % 100 == 0 or train_batch_idx == len(kf) - 1:
                    print("%d/%d training batches @ epoch = %d"
                          % (train_batch_idx, len(kf), eidx))

                cost = f_grad_shared(x_0, x_1, x_2, x_3,
                                     mask_0, mask_1, mask_2, mask_3,
                                     y_0, y_1)
                f_update(lrate)
                training_sum_costs += cost

            print("sum of costs of all the training samples = ",
                  training_sum_costs, "@ epoch = ", eidx)
            if numpy.isnan(training_sum_costs) or numpy.isinf(training_sum_costs):
                print('bad cost detected: ', training_sum_costs)
                print('End of Program')
                break

            print('outputting predicted labels of test set ... ... ...')
            output_pred_labels(f_pred_test, f_pred_prob_test,
                               data.prepare_data, test, kf_test,
                               verbose=False, path="test_pred_labels.txt")

            # Early stopping: count epochs whose summed training cost fails
            # to drop below 99% of the previous epoch's sum; halfway through
            # the patience budget, cut the learning rate.
            if training_sum_costs >= last_training_sum_costs * 0.99:
                bad_counter += 1
                if bad_counter == patience // 2:
                    lrate /= 4.
            last_training_sum_costs = training_sum_costs
            print('bad counter for early stopping : %d/%d' % (bad_counter, patience))
            print('learning rate = ', lrate)
            print('--------------------------------------------------')

            if bad_counter >= patience:
                print('Early Stop!')
                estop = True
                break

            if estop:
                break
    except KeyboardInterrupt:
        print("Training interrupted")
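# For reference: the training loops above rely on get_minibatches_idx, which
# is not defined in this file.  The sketch below matches the helper of the
# same name in the Theano LSTM tutorial that this code otherwise follows,
# under the assumption that the local version (and my_get_minibatches_idx,
# which appears to supply a default batch size) behaves the same way.  It is
# named *_sketch to avoid shadowing the real definition.
def get_minibatches_idx_sketch(n, minibatch_size, shuffle=False):
    # Optionally shuffle the sample indices, then slice them into
    # consecutive minibatches covering the whole dataset.
    idx_list = numpy.arange(n, dtype="int32")
    if shuffle:
        numpy.random.shuffle(idx_list)

    minibatches = []
    minibatch_start = 0
    for _ in range(n // minibatch_size):
        minibatches.append(idx_list[minibatch_start:
                                    minibatch_start + minibatch_size])
        minibatch_start += minibatch_size
    if minibatch_start != n:
        # Make a (smaller) minibatch out of what is left over.
        minibatches.append(idx_list[minibatch_start:])

    # Yields (batch_index, index_array) pairs, as consumed by the loops above.
    return list(zip(range(len(minibatches)), minibatches))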
def train_nn(
        # Hyper-parameters
        dim_token=100,      # word embedding dimension
        dim_locDiff=10,     # location-difference dimension
        dim_cueType=10,
        dim_ESP_label=10,
        dim_latent=100,
        lstm_layer_n=50,
        lstm_decoder_layer_n=50,
        n2=50 + 10 + 10,
        ydim0=3,
        ydim1=3,
        # win_size = 2,
        # maxTokens1 = 60,  # maximum tokens in sentence 1
        # n_ESP_labels = 3,
        n_cueTypes=4,
        n_vocb_words=4396,  # vocabulary size
        n_locDiffs=111,     # location-difference size
        end_idx=3194,
        patience=10,        # number of epochs to wait before early stopping if no progress
        max_epochs=100,     # the maximum number of epochs to run
        # dispFreq=10,      # display training progress to stdout every N updates
        # decay_c=0.,       # weight decay for the classifier, applied to the U weights
        lrate=0.01,         # learning rate for sgd (not used by adadelta and rmsprop)
        dropout_p=1.0,
        optimizer=momentum,  # sgd, adadelta and rmsprop are available; plain sgd is very
                             # hard to use and not recommended (it probably needs momentum
                             # and a decaying learning rate)
        # maxlen=1000,      # sequences longer than this are ignored
        batch_size=10,      # the batch size during training
        inter_cost_margin=0.001,
        # Parameters for extra options
        # noise_std=0.,
        # use_dropout=True,  # if False, slightly faster but worse test error;
                             # dropout frequently needs a bigger model
        # reload_model=None,  # path to a saved model we want to start from
        # test_size=-1
):
    # Model options
    model_options = locals().copy()
    print('-------------------------------------------------------------')
    print("model options", model_options)
    print('-------------------------------------------------------------')

    # load_data, prepare_data = get_dataset(dataset)
    print('Loading data ... ... ...')
    train, valid, test = data.load_data(path='mydata.pkl',
                                        n_words=n_vocb_words)

    print('Building model ... ... ...')
    params_all = init_params(model_options,
                             Wemb_value=data.read_gz_file("word_emb.pkl"))

    # Separate parameter sets for the discriminator (d) and the generator (g),
    # plus a combined dict (c) holding both.
    # tparams = init_tparams(params)
    tparams_d = init_tparams(params_all[0])
    tparams_g = init_tparams(params_all[1])
    tparams_c = OrderedDict()
    for kk, pp in tparams_d.items():
        tparams_c[kk] = tparams_d[kk]
    for kk, pp in tparams_g.items():
        tparams_c[kk] = tparams_g[kk]

    (x, masks, x_d_y_fake, y, x_noises, x_maxlens,
     f_D_pred_prob, f_D_pred, f_G_produce,
     dropouts, d_cost, g_cost) = Build_Model([tparams_d, tparams_g],
                                             model_options)

    d_grads = tensor.grad(d_cost, wrt=list(tparams_d.values()))
    # The generator cost is differentiated w.r.t. the combined parameters
    # while the discriminator's parameters are held constant.
    # print(tparams_c)
    g_grads = tensor.grad(g_cost, wrt=list(tparams_c.values()),
                          consider_constant=list(tparams_d.values()),
                          disconnected_inputs='ignore')

    lr = tensor.scalar(name='lr')
    # f_grad_shared, f_update = optimizer(lr, tparams, grads, x, masks, y, cost)
    f_D_grad_shared, f_D_update = optimizer(lr, tparams_d, d_grads,
                                            x + dropouts, masks,
                                            x_d_y_fake + y, d_cost)
    # f_G_grad_shared, f_G_update = optimizer(lr, tparams_c, g_grads,
    #     x_noise + x_maxlen + x_d_ps + dropouts_g, [], x_g_y_fake + yg, g_cost)
    f_G_grad_shared, f_G_update = optimizer(lr, tparams_c, g_grads,
                                            x + x_noises + x_maxlens,
                                            masks, y, g_cost)

    print('training ... ... ...')
    kf_valid = get_minibatches_idx(len(valid[0]), batch_size)
    kf_test = get_minibatches_idx(len(test[0]), batch_size)

    print("%d train examples" % len(train[0]))
    print("%d valid examples" % len(valid[0]))
    print("%d test examples" % len(test[0]))

    # history_errs = []
    best_p = None
    bad_counter = 0
    stop_counter = 0

    # if validFreq == -1:
    #     validFreq = len(train[0]) // batch_size
    # if saveFreq == -1:
    #     saveFreq = len(train[0]) // batch_size

    # last_training_sum_costs = numpy.inf
    last_ave_of_g_costs = numpy.inf
    last_ave_of_d_costs = numpy.inf
    g_costs_list = []
    d_costs_list = []

    uidx = 0       # the number of updates done
    estop = False  # early stop
    # start_time = time.time()

    # Create the noise RNG once.  The original re-created it with a fixed
    # seed inside the batch loop, which fed the generator identical noise
    # vectors on every batch.
    rng = numpy.random.RandomState(9998)

    try:
        for eidx in range(max_epochs):
            n_samples = 0

            # Get new shuffled indices for the training set.
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
            # kf = get_minibatches_idx(99, batch_size, shuffle=True)
            # training_sum_costs = 0
            # ave_of_g_costs_sum = 0
            # ave_of_d_costs_sum = 0

            for train_batch_idx, train_index in kf:
                # uidx += 1
                # use_noise.set_value(1.)
                cur_batch_size = len(train_index)

                # Select the random examples for this minibatch.
                x_0 = [train[0][t] for t in train_index]
                x_1 = [train[1][t] for t in train_index]
                x_3 = [train[2][t] for t in train_index]
                y_0 = [train[3][t] for t in train_index]
                y_1 = [train[4][t] for t in train_index]
                y_one_out = [train[5][t] for t in train_index]

                x_0, mask_0, maxlen_0 = data.prepare_data(x_0)
                x_1, mask_1, maxlen_1 = data.prepare_data(x_1)
                x_3, mask_3, maxlen_3 = data.prepare_data(x_3, addIdxNum=2)
                y_0 = numpy.asarray(y_0, dtype='int32')
                y_1 = numpy.asarray(y_1, dtype='int32')
                y_one_out = numpy.asarray(y_one_out, dtype='int32')

                # Latent noise for the generator, one vector per example.
                x0_noise_0 = rng.normal(scale=0.01,
                                        size=(cur_batch_size, dim_latent)
                                        ).astype(config.floatX)
                x1_noise_1 = rng.normal(scale=0.01,
                                        size=(cur_batch_size, dim_latent)
                                        ).astype(config.floatX)
                x3_noise_3 = rng.normal(scale=0.01,
                                        size=(cur_batch_size, dim_latent)
                                        ).astype(config.floatX)

                # Let the generator produce fake sequences and masks.
                generated_xs = f_G_produce(x0_noise_0, x1_noise_1, x3_noise_3,
                                           maxlen_0, maxlen_1, maxlen_3,
                                           y_0, y_1)
                # numpy.asarray([3] * cur_batch_size, dtype='int32')
                generated_x_0 = generated_xs[0]
                generated_x_1 = generated_xs[1]
                # The first two rows of x_3 (the extra index rows added by
                # prepare_data with addIdxNum=2) are copied from the real batch.
                generated_x_3 = numpy.concatenate(
                    (
                        # numpy.random.randint(0, n_cueTypes, (cur_batch_size,)).astype('int32')[None, :],
                        # numpy.random.randint(0, n_locDiffs, (cur_batch_size,)).astype('int32')[None, :],
                        x_3[0:2, :],
                        generated_xs[2]),
                    axis=0)
                generated_m_0 = generated_xs[3]
                generated_m_1 = generated_xs[4]
                generated_m_3 = generated_xs[5]
                generated_y_0 = numpy.random.randint(
                    0, ydim0 - 1, (cur_batch_size,)).astype('int32')
                generated_y_1 = numpy.random.randint(
                    0, ydim1, (cur_batch_size,)).astype('int32')

                # Build the discriminator batch: real examples followed by
                # generated ones, labelled 1 = real and 0 = fake.
                x_d_0 = numpy.concatenate((x_0, generated_x_0), axis=1)
                x_d_1 = numpy.concatenate((x_1, generated_x_1), axis=1)
                x_d_3 = numpy.concatenate((x_3, generated_x_3), axis=1)
                y_d_0_fake = numpy.asarray([1] * cur_batch_size + [0] * cur_batch_size, dtype='int32')
                y_d_1_fake = numpy.asarray([1] * cur_batch_size + [0] * cur_batch_size, dtype='int32')
                y_d_3_fake = numpy.asarray([1] * cur_batch_size + [0] * cur_batch_size, dtype='int32')
                # mask_ones_0 = numpy.ones_like(mask_0)
                # mask_ones_1 = numpy.ones_like(mask_1)
                # mask_ones_3 = numpy.ones_like(mask_3)
                mask_d_0 = numpy.concatenate((mask_0, generated_m_0), axis=1)
                mask_d_1 = numpy.concatenate((mask_1, generated_m_1), axis=1)
                mask_d_3 = numpy.concatenate((mask_3, generated_m_3), axis=1)
                y_d_0 = numpy.concatenate((y_0, generated_y_0), axis=0)
                y_d_1 = numpy.concatenate((y_1, generated_y_1), axis=0)
                # One update each for the discriminator and the generator.
                d_cost = f_D_grad_shared(x_d_0, x_d_1, x_d_3,
                                         dropout_p, 1.0,
                                         mask_d_0, mask_d_1, mask_d_3,
                                         y_d_0_fake, y_d_1_fake, y_d_3_fake,
                                         y_d_0, y_d_1)
                # The constants (16, 16, 12) are fixed generation lengths
                # passed in place of the per-batch max lengths.
                g_cost = f_G_grad_shared(x_0, x_1, x_3,
                                         x0_noise_0, x1_noise_1, x3_noise_3,
                                         16, 16, 12,
                                         mask_0, mask_1, mask_3,
                                         generated_y_0, generated_y_1)
                # print(y_g_0.shape)
                print('\rd_cost = %f    g_cost = %f @ %d'
                      % (d_cost, g_cost, train_batch_idx), end='')
                # print(cur_batch_size)

                # ave_of_g_costs_sum += g_cost
                # ave_of_d_costs_sum += d_cost
                g_costs_list.append(g_cost)
                d_costs_list.append(d_cost)

                # Keep D and G roughly balanced: if one side's cost drops
                # below 80% of the other's, give the weaker side up to 10
                # extra updates until the ratio is back inside [0.8, 1/0.8];
                # otherwise update both once.
                if d_cost < g_cost * 0.8:
                    for i in range(10):
                        f_G_update(0.01)
                        g_cost = f_G_grad_shared(x_0, x_1, x_3,
                                                 x0_noise_0, x1_noise_1,
                                                 x3_noise_3,
                                                 16, 16, 12,
                                                 mask_0, mask_1, mask_3,
                                                 generated_y_0, generated_y_1)
                        if 0.8 <= d_cost / g_cost <= 1.0 / 0.8:
                            break
                elif g_cost < d_cost * 0.8:
                    for i in range(10):
                        f_D_update(0.01)
                        d_cost = f_D_grad_shared(x_d_0, x_d_1, x_d_3,
                                                 dropout_p, 1.0,
                                                 mask_d_0, mask_d_1, mask_d_3,
                                                 y_d_0_fake, y_d_1_fake,
                                                 y_d_3_fake,
                                                 y_d_0, y_d_1)
                        if 0.8 <= g_cost / d_cost <= 1.0 / 0.8:
                            break
                else:
                    f_D_update(0.01)
                    f_G_update(0.01)

                if train_batch_idx % 100 == 0 or train_batch_idx == len(kf) - 1:
                    print("---Now %d/%d training batches @ epoch = %d"
                          % (train_batch_idx, len(kf), eidx))

                if train_batch_idx > 0 and \
                        (train_batch_idx % 500 == 0 or
                         train_batch_idx == len(kf) - 1):
                    cur_ave_of_d_costs = sum(d_costs_list) / len(d_costs_list)
                    cur_ave_of_g_costs = sum(g_costs_list) / len(g_costs_list)
                    print('ave_of_d_costs = %f\tave_of_g_costs = %f'
                          % (cur_ave_of_d_costs, cur_ave_of_g_costs))

                    # print('outputting predicted labels of test set ... ... ...')
                    output_pred_labels(model_options, f_D_pred, f_D_pred_prob,
                                       data.prepare_data, test, kf_test,
                                       verbose=False,
                                       path="test_pred_labels.txt")

                    # Early stopping: count evaluation points where neither
                    # average cost drops below 99% of its previous value.
                    if cur_ave_of_d_costs >= last_ave_of_d_costs * 0.99 and \
                            cur_ave_of_g_costs >= last_ave_of_g_costs * 0.99:
                        stop_counter += 1
                    last_ave_of_d_costs = cur_ave_of_d_costs
                    last_ave_of_g_costs = cur_ave_of_g_costs
                    print('counter for early stopping : %d/%d'
                          % (stop_counter, patience))
                    del d_costs_list[:]
                    del g_costs_list[:]

                    if stop_counter >= patience:
                        print('Early Stop!')
                        estop = True
                        break
            # end for

            if estop:
                break
    except KeyboardInterrupt:
        print("Training interrupted")
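# For reference: all three training functions above take optimizer=momentum,
# whose definition is not shown in this file.  Below is a minimal sketch in
# the style of the Theano LSTM tutorial's sgd optimizer, under the assumption
# that x, masks and y arrive as lists of symbolic inputs (as in the GAN
# trainer above) and that classical momentum is intended.  It is named
# *_sketch to avoid shadowing the real definition; illustrative only.
def momentum_sketch(lr, tparams, grads, x, masks, y, cost, mu=0.9):
    # Shared variables that carry the gradients from f_grad_shared to f_update.
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.items()]
    # Velocity buffers, one per parameter.
    vshared = [theano.shared(p.get_value() * 0., name='%s_vel' % k)
               for k, p in tparams.items()]

    # Computes the cost for a minibatch and stores the gradients.
    f_grad_shared = theano.function(list(x) + list(masks) + list(y), cost,
                                    updates=[(gs, g) for gs, g in
                                             zip(gshared, grads)],
                                    name='momentum_f_grad_shared')

    # v <- mu * v - lr * g ;  p <- p + v.  Theano applies updates
    # simultaneously from the old values, so the parameter step reuses the
    # new-velocity expression explicitly.
    v_new = [mu * v - lr * g for v, g in zip(vshared, gshared)]
    updates = ([(v, vn) for v, vn in zip(vshared, v_new)] +
               [(p, p + vn) for p, vn in zip(tparams.values(), v_new)])
    f_update = theano.function([lr], [], updates=updates,
                               name='momentum_f_update')
    return f_grad_shared, f_update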