def test_apply_penalty(self):
    from lasagne.regularization import apply_penalty, l2
    A = T.vector()
    B = T.matrix()
    # An empty parameter list gives a zero penalty.
    assert apply_penalty([], l2) == 0
    # A single tensor is penalized directly.
    assert equal_computations([apply_penalty(A, l2)], [l2(A)])
    # A list of tensors gives the sum of the individual penalties.
    assert equal_computations([apply_penalty([A, B], l2)],
                              [sum([l2(A), l2(B)])])
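# A minimal standalone sketch (not part of the test suite above): apply_penalty
# returns an ordinary Theano expression, so it can be compiled and evaluated
# numerically. The variable names here are illustrative only.
import numpy as np
import theano
import theano.tensor as T
from lasagne.regularization import apply_penalty, l2

W = T.matrix('W')
penalty = apply_penalty([W], l2)            # symbolic sum of squared entries of W
penalty_fn = theano.function([W], penalty)

print(penalty_fn(np.ones((2, 3), dtype=theano.config.floatX)))  # prints 6.0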
def regularization(network, optimization):
    all_params = layers.get_all_params(network, regularizable=True)

    # weight-decay regularization
    loss = 0
    if "l1" in optimization:
        l1_penalty = apply_penalty(all_params, l1) * optimization["l1"]
        loss += l1_penalty
    if "l2" in optimization:
        l2_penalty = apply_penalty(all_params, l2) * optimization["l2"]
        loss += l2_penalty
    return loss
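# A hedged usage sketch of the regularization() helper above. The network,
# targets, and the "l1"/"l2" coefficient keys in `optimization` are assumptions
# made for illustration, and the helper is assumed to be defined in the same
# module (alongside `from lasagne import layers` and the l1/l2 imports).
import lasagne
import theano
import theano.tensor as T
from lasagne import layers

input_var = T.matrix('X')
target_var = T.ivector('y')
network = layers.InputLayer((None, 50), input_var)
network = layers.DenseLayer(network, num_units=10,
                            nonlinearity=lasagne.nonlinearities.softmax)

optimization = {"l2": 1e-4}   # weight-decay coefficients keyed by penalty name

prediction = layers.get_output(network)
data_loss = lasagne.objectives.categorical_crossentropy(prediction,
                                                        target_var).mean()
total_loss = data_loss + regularization(network, optimization)

params = layers.get_all_params(network, trainable=True)
updates = lasagne.updates.adam(total_loss, params)
train_fn = theano.function([input_var, target_var], total_loss, updates=updates)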
def test_regularize_layer_params_weighted(self, layers):
    from lasagne.regularization import regularize_layer_params_weighted
    from lasagne.regularization import apply_penalty, l2
    l_1, l_2, l_3 = layers

    layers = OrderedDict()
    layers[l_2] = 0.1
    layers[l_3] = 0.5

    loss = regularize_layer_params_weighted(layers,
                                            lasagne.regularization.l2)
    assert equal_computations([loss],
                              [sum([0.1 * apply_penalty([l_2.W], l2),
                                    0.5 * apply_penalty([l_3.W], l2)])])
def prepare():
    X = T.tensor4('X')
    y = T.ivector('y')
    output_layer = lenet_skinny()
    all_params = lasagne.layers.get_all_params(output_layer)
    loss_fn = x_ent
    prediction = lasagne.layers.get_output(output_layer, X)
    # Cross-entropy plus an L2 weight-decay term on the regularizable parameters.
    loss = loss_fn(prediction, y).mean() + \
        args["lambda"] * apply_penalty(
            lasagne.layers.get_all_params(output_layer, regularizable=True),
            l2
        )
    label_vector = lasagne.layers.get_output(output_layer, X)
    pred = T.argmax(label_vector, axis=1)
    accuracy = T.mean(T.eq(pred, y))
    return Container({
        "X": X,
        "y": y,
        "output_layer": output_layer,
        "all_params": all_params,
        "loss": loss,
        "label_vector": label_vector,
        "pred": pred,
        "accuracy": accuracy
    })
def build_trainer(input_data, input_mask, target_data, target_mask,
                  network_params, network_reg_params, output_layer,
                  weight_decay, updater, learning_rate,
                  max_grad_norm=0.0, load_updater_params=None):
    output_score = get_output(output_layer, deterministic=False)
    frame_prd_idx = T.argmax(output_score, axis=-1)

    one_hot_target = T.extra_ops.to_one_hot(y=T.flatten(target_data, 1),
                                            nb_class=output_dim,
                                            dtype=floatX)

    # Numerically stable log-softmax over the class dimension.
    output_score = T.reshape(x=output_score, newshape=(-1, output_dim), ndim=2)
    output_score = output_score - T.max(output_score, axis=-1, keepdims=True)
    output_score = output_score - T.log(T.sum(T.exp(output_score), axis=-1, keepdims=True))

    # Masked frame-level cross-entropy, loss and accuracy.
    train_ce = -T.sum(T.mul(one_hot_target, output_score), axis=-1) * T.flatten(target_mask, 1)
    train_loss = T.sum(train_ce) / target_mask.shape[0]
    frame_loss = T.sum(train_ce) / T.sum(target_mask)
    frame_accr = T.sum(T.eq(frame_prd_idx, target_data) * target_mask) / T.sum(target_mask)

    # Optional L2 weight decay, scaled as 10**(-weight_decay).
    train_total_loss = train_loss
    if weight_decay > 0:
        train_total_loss += apply_penalty(network_reg_params, l2) * 10 ** (-weight_decay)

    network_grads = theano.grad(cost=train_total_loss, wrt=network_params)

    if max_grad_norm > 0.:
        network_grads, network_grads_norm = total_norm_constraint(
            tensor_vars=network_grads,
            max_norm=max_grad_norm,
            return_norm=True)
    else:
        network_grads_norm = T.sqrt(sum(T.sum(grad ** 2) for grad in network_grads))

    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    train_updates, updater_params = updater(loss_or_grads=network_grads,
                                            params=network_params,
                                            learning_rate=train_lr,
                                            load_params_dict=load_updater_params)

    training_fn = theano.function(inputs=[input_data, input_mask,
                                          target_data, target_mask],
                                  outputs=[frame_loss, frame_accr,
                                           network_grads_norm],
                                  updates=train_updates)

    return training_fn, train_lr, updater_params
def execute(dataset, n_hidden_t_enc, n_hidden_s, num_epochs=500, learning_rate=.001, learning_rate_annealing=1.0, gamma=1, lmd=0., disc_nonlinearity="sigmoid", keep_labels=1.0, prec_recall_cutoff=True, missing_labels_val=-1.0, which_fold=1, early_stop_criterion='loss', embedding_input='raw', save_path='/Tmp/romerosa/feature_selection/', save_copy='/Tmp/romerosa/feature_selection/', dataset_path='/Tmp/carriepl/datasets/', resume=False, exp_name=None): # Load the dataset print("Loading data") x_train, y_train, x_valid, y_valid, x_test, y_test, \ x_unsup, training_labels = mlh.load_data( dataset, dataset_path, None, which_fold=which_fold, keep_labels=keep_labels, missing_labels_val=missing_labels_val, embedding_input=embedding_input) # Extract required information from data n_samples, n_feats = x_train.shape print("Number of features : ", n_feats) print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1])) n_targets = y_train.shape[1] # Set some variables batch_size = 1 # Preparing folder to save stuff print("Experiment: " + exp_name) save_path = os.path.join(save_path, dataset, exp_name) save_copy = os.path.join(save_copy, dataset, exp_name) if not os.path.exists(save_path): os.makedirs(save_path) # Prepare Theano variables for inputs and targets input_var_sup = T.matrix('input_sup') target_var_sup = T.matrix('target_sup') lr = theano.shared(np.float32(learning_rate), 'learning_rate') # Build model print("Building model") discrim_net = InputLayer((None, n_feats), input_var_sup) discrim_net = DenseLayer(discrim_net, num_units=n_hidden_t_enc[-1], nonlinearity=rectify) # Reconstruct the input using dec_feat_emb if gamma > 0: reconst_net = DenseLayer(discrim_net, num_units=n_feats, nonlinearity=linear) nets = [reconst_net] else: nets = [None] # Add supervised hidden layers for hid in n_hidden_s: discrim_net = DropoutLayer(discrim_net) discrim_net = DenseLayer(discrim_net, num_units=hid) assert disc_nonlinearity in ["sigmoid", "linear", "rectify", "softmax"] discrim_net = DropoutLayer(discrim_net) discrim_net = DenseLayer(discrim_net, num_units=n_targets, nonlinearity=eval(disc_nonlinearity)) print("Building and compiling training functions") # Build and compile training functions predictions, predictions_det = mh.define_predictions(nets, start=0) prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net]) prediction_sup = prediction_sup[0] prediction_sup_det = prediction_sup_det[0] # Define losses # reconstruction losses reconst_losses, reconst_losses_det = mh.define_reconst_losses( predictions, predictions_det, [input_var_sup]) # supervised loss sup_loss, sup_loss_det = mh.define_sup_loss(disc_nonlinearity, prediction_sup, prediction_sup_det, keep_labels, target_var_sup, missing_labels_val) inputs = [input_var_sup, target_var_sup] params = lasagne.layers.get_all_params([discrim_net] + nets, trainable=True) print('Number of params: ' + str(len(params))) # Combine losses loss = sup_loss + gamma * reconst_losses[0] loss_det = sup_loss_det + gamma * reconst_losses_det[0] l2_penalty = apply_penalty(params, l2) loss = loss + lmd * l2_penalty loss_det = loss_det + lmd * l2_penalty # Compute network updates updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr) # updates = lasagne.updates.sgd(loss, # params, # learning_rate=lr) # updates = lasagne.updates.momentum(loss, params, # learning_rate=lr, momentum=0.0) # Apply norm constraints on the weights for k in updates.keys(): if updates[k].ndim == 2: updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0) # Compile 
training function train_fn = theano.function(inputs, loss, updates=updates, on_unused_input='ignore') # Monitoring Labels monitor_labels = ["reconst. loss"] monitor_labels = [ i for i, j in zip(monitor_labels, reconst_losses) if j != 0 ] monitor_labels += ["loss. sup.", "total loss"] # Build and compile test function val_outputs = reconst_losses_det val_outputs = [i for i, j in zip(val_outputs, reconst_losses) if j != 0] val_outputs += [sup_loss_det, loss_det] # Compute accuracy and add it to monitoring list test_acc, test_pred = mh.define_test_functions(disc_nonlinearity, prediction_sup, prediction_sup_det, target_var_sup) monitor_labels.append("accuracy") val_outputs.append(test_acc) # Compile prediction function predict = theano.function([input_var_sup], test_pred) # Compile validation function val_fn = theano.function(inputs, [prediction_sup_det] + val_outputs, on_unused_input='ignore') # Finally, launch the training loop. print("Starting testing...") if not os.path.exists(save_copy + '/model_feat_sel_best.npz'): print("No saved model to be tested and/or generate" " the embedding !") else: with np.load(save_copy + '/model_feat_sel_best.npz', ) as f: param_values = [f['arr_%d' % i] for i in range(len(f.files))] lasagne.layers.set_all_param_values( filter(None, nets) + [discrim_net], param_values) test_minibatches = mlh.iterate_minibatches(x_test, y_test, batch_size, shuffle=False) test_err, pred, targets = mlh.monitoring(test_minibatches, "test", val_fn, monitor_labels, prec_recall_cutoff, return_pred=True) lab = targets.argmax(1) pred_argmax = pred.argmax(1) continent_cat = mh.create_1000_genomes_continent_labels() lab_cont = np.zeros(lab.shape) pred_cont = np.zeros(pred_argmax.shape) for i, c in enumerate(continent_cat): for el in c: lab_cont[lab == el] = i pred_cont[pred_argmax == el] = i cm_e = np.zeros((26, 26)) cm_c = np.zeros((5, 5)) for i in range(26): for j in range(26): cm_e[i, j] = ((pred_argmax == i) * (lab == j)).sum() for i in range(5): for j in range(5): cm_c[i, j] = ((pred_cont == i) * (lab_cont == j)).sum() np.savez(os.path.join(save_copy, 'cm' + str(which_fold) + '.npz'), cm_e=cm_e, cm_c=cm_c) print(os.path.join(save_copy, 'cm' + str(which_fold) + '.npz'))
def __init__(self, W=None, W_path=None, K=300, num_hidden=256, batch_size=None, grad_clip=100., max_sent_len_basic=200, num_classes=2, **kwargs): W = W V = len(W) K = int(K) num_hidden = int(num_hidden) batch_size = int(batch_size) grad_clip = int(grad_clip) max_seq_len = int(max_sent_len_basic) num_classes = int(num_classes) dropout = float(kwargs["dropout"]) lambda_w = float(kwargs["lambda_w"]) index = T.lscalar() X = T.imatrix('X') M = T.imatrix('M') y = T.ivector('y') # Input Layer l_in = lasagne.layers.InputLayer((batch_size, max_seq_len), input_var=X) print(" l_in shape: {}\n".format(get_output_shape(l_in))) l_mask = lasagne.layers.InputLayer((batch_size, max_seq_len), input_var=M) #l_mask2 = lasagne.layers.InputLayer((batch_size, max_seq_len), input_var=M) #l_mask_concat = lasagne.layers.ConcatLayer([l_mask, l_mask2]) print(" l_mask shape: {}\n".format(get_output_shape(l_mask))) #print(" l_mask shape: {}\n".format(get_output_shape(l_mask_concat))) # Embedding layer l_emb = lasagne.layers.EmbeddingLayer(l_in, input_size=V, output_size=K, W=W) # keep the embeddings static l_emb.params[l_emb.W].remove('trainable') print(" l_emb shape: {}\n".format(get_output_shape(l_emb))) # add droput #l_emb = lasagne.layers.DropoutLayer(l_emb, p=.2) # Use orthogonal Initialization for LSTM gates gate_params = lasagne.layers.recurrent.Gate( W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), b=lasagne.init.Constant(0.) ) cell_params = lasagne.layers.recurrent.Gate( W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), W_cell=None, b=lasagne.init.Constant(0.), nonlinearity=lasagne.nonlinearities.tanh ) l_fwd = lasagne.layers.LSTMLayer( l_emb, num_units=num_hidden, grad_clipping=grad_clip, nonlinearity=lasagne.nonlinearities.tanh, mask_input=l_mask, ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, learn_init=True ) l_fwd = lasagne.layers.DropoutLayer(l_fwd,p=dropout) print(" forward shape: {}\n".format(get_output_shape(l_fwd))) if kwargs["lstm"] == "bi": gate_params_bwd = lasagne.layers.recurrent.Gate( W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), b=lasagne.init.Constant(0.) 
) cell_params_bwd = lasagne.layers.recurrent.Gate( W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), W_cell=None, b=lasagne.init.Constant(0.), nonlinearity=lasagne.nonlinearities.tanh ) l_bwd = lasagne.layers.LSTMLayer( l_emb, num_units=num_hidden, grad_clipping=grad_clip, nonlinearity=lasagne.nonlinearities.tanh, mask_input=l_mask, ingate=gate_params_bwd, forgetgate=gate_params_bwd, cell=cell_params_bwd, outgate=gate_params_bwd, learn_init=True, backwards=True ) l_bwd = lasagne.layers.DropoutLayer(l_bwd,p=dropout) print(" backward shape: {}\n".format(get_output_shape(l_bwd))) # concat and dropout l_concat = lasagne.layers.ConcatLayer([l_fwd, l_bwd]) #l_concat = lasagne.layers.ElemwiseSumLayer([l_fwd, l_bwd]) l_concat_dropout = lasagne.layers.DropoutLayer(l_concat,p=dropout) print(" concat shape: {}\n".format(get_output_shape(l_concat))) else: l_concat_dropout = l_fwd network = lasagne.layers.DenseLayer( l_concat_dropout, num_units=num_classes, nonlinearity=lasagne.nonlinearities.softmax ) #print(" network shape: {}\n".format(get_output_shape(network))) self.network = network output = lasagne.layers.get_output(network) # Define objective function (cost) to minimize, mean crossentropy error cost = lasagne.objectives.categorical_crossentropy(output, y).mean() # Compute gradient updates params = lasagne.layers.get_all_params(network) cost += lambda_w*apply_penalty(params, l2) # grad_updates = lasagne.updates.nesterov_momentum(cost, params,learn_rate) grad_updates = lasagne.updates.adam(cost, params) #learn_rate = .01 #grad_updates = lasagne.updates.adadelta(cost, params, learn_rate) test_output = lasagne.layers.get_output(network, deterministic=True) val_cost_fn = lasagne.objectives.categorical_crossentropy( test_output, y).mean() preds = T.argmax(test_output, axis=1) val_acc_fn = T.mean(T.eq(preds, y), dtype=theano.config.floatX) self.val_fn = theano.function([X, M, y], [val_cost_fn, val_acc_fn, preds], allow_input_downcast=True) if kwargs["lstm"] == "bi": concat_output = lasagne.layers.get_output(l_concat) fwd_output = lasagne.layers.get_output(l_fwd) bwd_output = lasagne.layers.get_output(l_bwd) mask_output = lasagne.layers.get_output(l_mask) #mask_concat_output = lasagne.layers.get_output(l_mask_concat) self.get_concat = theano.function([X,M], [concat_output, fwd_output, bwd_output, mask_output]) #, mask_concat_output]) #print(y_train) # Compile train objective print "Compiling training functions" self.train = theano.function(inputs = [X,M,y], outputs = cost, updates = grad_updates, allow_input_downcast=True) self.test = theano.function(inputs = [X,M,y], outputs = val_acc_fn) self.pred = theano.function(inputs = [X,M],outputs = preds)
def execute(dataset, learning_rate=0.00001, learning_rate_annealing=1.0, lmd=0., noise=0.0, encoder_units=[1024, 512, 256], num_epochs=500, which_fold=1, save_path=None, save_copy=None, dataset_path=None, num_fully_connected=0, exp_name='', init_args=None): # Reading dataset print("Loading data") if dataset == "1000_genomes" and which_fold == 1 and False: x_unsup = mlh.load_data(dataset, dataset_path, None, which_fold=which_fold, keep_labels=1.0, missing_labels_val=-1.0, embedding_input='raw', transpose=False) import pdb; pdb.set_trace() x_train = np.zeros((x_unsup[0].shape[0], x_unsup[0].shape[1]*2), dtype="int8") x_train[:,::2] = (x_unsup[0] == 2) x_train[:,1::2] = (x_unsup[0] >= 1) x_valid = np.zeros((x_unsup[2].shape[0], x_unsup[2].shape[1]*2), dtype="int8") x_valid[:,::2] = (x_unsup[2] == 2) x_valid[:,1::2] = (x_unsup[2] >= 1) else: x_unsup = mlh.load_data(dataset, dataset_path, None, which_fold=which_fold, keep_labels=1.0, missing_labels_val=-1.0, embedding_input='bin', transpose=True) x_train = x_unsup[0][0] x_valid = x_unsup[1][0] print(x_train.shape, x_valid.shape) n_features = x_train.shape[1] exp_name += "learn_snp2vec_dae_h" for e in encoder_units: exp_name += ('-' + str(e)) # exp_name += '_g-' + str(gamma) exp_name += '_l-' + str(lmd) exp_name += '_lr-' + str(learning_rate) exp_name += '_fold-' + str(which_fold) save_path = os.path.join(save_path, exp_name) save_copy = os.path.join(save_copy, exp_name) if not os.path.exists(save_path): os.makedirs(save_path) if not os.path.exists(save_copy): os.makedirs(save_copy) # Prepare Theano variables for inputs and targets input_var = T.matrix('input') target_reconst = T.matrix('target') lr = theano.shared(np.float32(learning_rate), 'learning_rate') batch_size = 128 # building network encoder = InputLayer((batch_size, n_features), input_var) # building the encoder and decoder #import pdb; pdb.set_trace() for i in range(len(encoder_units)): encoder = DenseLayer( encoder, num_units=encoder_units[i], W=Uniform(0.00001), nonlinearity=leaky_rectify) # if i < len(encoder_units)-1 else linear) embedding = lasagne.layers.get_output(encoder) get_embedding_fn = theano.function([input_var], embedding) params = lasagne.layers.get_all_params(encoder, trainable=True) monitor_labels = ["embedding min", "embedding mean", "embedding max"] val_outputs = [embedding.min(), embedding.mean(), embedding.max()] nets = [encoder] decoder_units = encoder_units[::-1][1:] print(decoder_units) decoder = encoder for i in range(len(decoder_units)): decoder = DenseLayer(decoder, num_units=decoder_units[i], W=Uniform(0.0001), nonlinearity=leaky_rectify) decoder = DenseLayer(decoder, num_units=n_features, W=convert_initialization( init_args["decoder_init"], nonlinearity="sigmoid"), nonlinearity=sigmoid) prediction_reconst = lasagne.layers.get_output(decoder) # Reconstruction error loss_reconst = lasagne.objectives.binary_crossentropy(prediction_reconst, target_reconst).mean() # loss_reconst = mh.define_sampled_mean_bincrossentropy( # prediction_reconst, target_reconst, gamma=gamma) #loss_reconst = mh.dice_coef_loss( # target_reconst, prediction_reconst).mean() accuracy = T.eq(T.gt(prediction_reconst, 0.5), target_reconst).mean() params += lasagne.layers.get_all_params(decoder, trainable=True) monitor_labels += ["reconst. loss", "reconst. 
accuracy"] val_outputs += [loss_reconst, accuracy] nets += [decoder] # sparsity_reconst = gamma * l1(prediction_reconst) # roh = input_var.mean(0) # sparsity_reconst = ((roh * T.log(roh / (prediction_reconst.mean(0)+1e-8))) +\ # ((1 - roh) * T.log((1 - roh) / (1 - prediction_reconst + 1e-8)))).sum() # Combine losses loss = loss_reconst # + sparsity_reconst # applying weight decay l2_penalty = apply_penalty(params, l2) loss = loss + lmd * l2_penalty val_outputs += [loss] monitor_labels += ['loss'] # Some variables max_patience = 100 patience = 0 train_monitored = [] valid_monitored = [] train_loss = [] updates = lasagne.updates.adam(loss, params, learning_rate=lr) for k in updates.keys(): if updates[k].ndim == 2: updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0) inputs = [input_var, target_reconst] # Compile training function print "Compiling training function" train_fn = theano.function(inputs, loss, updates=updates, on_unused_input='ignore') val_fn = theano.function(inputs, [val_outputs[0]] + val_outputs, on_unused_input='ignore') start_training = time.time() print "Starting training" for epoch in range(num_epochs): start_time = time.time() print("Epoch {} of {}".format(epoch+1, num_epochs)) nb_minibatches = 0 loss_epoch = 0 for x, target_reconst_val in data_generator(x_train, batch_size, shuffle=True, noise=noise): loss_epoch += train_fn(x, target_reconst_val) nb_minibatches += 1 loss_epoch /= nb_minibatches train_loss += [loss_epoch] # Monitoring on the training set train_minibatches = data_generator(x_train, batch_size, noise=noise) train_err = mlh.monitoring(train_minibatches, "train", val_fn, monitor_labels, 0) train_monitored += [train_err] # Monitoring on the validation set valid_minibatches = data_generator(x_valid, batch_size, noise=noise) valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn, monitor_labels, 0) valid_monitored += [valid_err] early_stop_criterion = 'loss' early_stop_val = valid_err[monitor_labels.index(early_stop_criterion)] # Early stopping if epoch == 0: best_valid = early_stop_val elif early_stop_val < best_valid and early_stop_criterion == 'loss': best_valid = early_stop_val patience = 0 # Save stuff np.savez(save_path+'/model_snp2vec_best.npz', *lasagne.layers.get_all_param_values(nets)) np.savez(save_path + "/errors_snp2vec_best.npz", zip(*train_monitored), zip(*valid_monitored)) else: patience += 1 np.savez(os.path.join(save_path, 'model_snp2vec_last.npz'), *lasagne.layers.get_all_param_values(nets)) np.savez(save_path + "/errors_snp2vec_last.npz", zip(*train_monitored), zip(*valid_monitored)) # End training if (patience == max_patience) or (epoch == num_epochs-1): print("Ending training") # Load best model if not os.path.exists(save_path + '/model_snp2vec_best.npz'): print("No saved model to be tested and/or generate" " the embedding !") else: with np.load(save_path + '/model_snp2vec_best.npz') as f: param_values = [f['arr_%d' % i] for i in range(len(f.files))] lasagne.layers.set_all_param_values(nets, param_values) # Use the saved model to generate the feature embedding # Here the feature embedding is the different in the hidden # representation between having that feature on and having it off print("Generating embedding") embedding_size = encoder_units[-1] null_input = np.zeros((1, n_features), dtype="float32") null_embedding = get_embedding_fn(null_input)[0] all_embeddings = np.zeros((n_features, embedding_size), dtype="float32") """ single_feat_input = null_input.copy() for i in range(n_features): if i % 10000 == 0: print(i, 
n_features) single_feat_input[:,i] = 1 all_embeddings[i] = (get_embedding_fn(single_feat_input)[0] - null_embedding) single_feat_input[:,i] = 0 result1 = all_embeddings[:1000].copy() """ block_size = 10 single_feat_batch = np.zeros((block_size, n_features), dtype="float32") for i in range(0, n_features, block_size): if i % 10000 == 0: print(i, n_features) for j in range(block_size): single_feat_batch[j, i+j] = 1 all_embeddings[i:i+10] = (get_embedding_fn(single_feat_batch) - null_embedding) for j in range(block_size): single_feat_batch[j, i+j] = 0 np.save("/Tmp/carriepl/feature_selection/all_embeddings_fold%i_noise%f.npy" % (which_fold, noise), all_embeddings) # Training set results train_minibatches = data_generator(x_train, batch_size, noise=noise) train_err = mlh.monitoring(train_minibatches, "train", val_fn, monitor_labels, 0) # Validation set results valid_minibatches = data_generator(x_valid, batch_size, noise=noise) valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn, monitor_labels, 0) # Stop print(" epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time)) break print(" epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time)) # Anneal the learning rate lr.set_value(float(lr.get_value() * learning_rate_annealing)) # Copy files to loadpath if save_path != save_copy: print('Copying model and other training files to {}'.format(save_copy)) copy_tree(save_path, save_copy)
def execute(dataset, n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, n_hidden_s, learning_rate, learning_rate_annealing=1., embedding_source=None, alpha=1, beta=1, gamma=1, lmd=0, encoder_net_init=0.001, decoder_net_init=0.001, disc_nonlinearity='softmax', keep_labels=1.0, prec_recall_cutoff=True, missing_labels_val=-1.0, which_fold=0, early_stop_criterion='accuracy', save_path='/Tmp/romerosa/DietNetworks/', dataset_path='/Tmp/' + os.environ["USER"] + '/datasets/', resume=False, exp_name=''): # Prepare embedding information if embedding_source is None: embedding_input = 'raw' else: embedding_input = embedding_source embedding_source = os.path.join( dataset_path, embedding_input + '_fold' + str(which_fold) + '.npy') # Load the dataset print("Loading data") x_train, y_train, x_valid, y_valid, x_test, y_test, \ x_unsup, training_labels = mlh.load_data( dataset, dataset_path, embedding_source, which_fold=which_fold, keep_labels=keep_labels, missing_labels_val=missing_labels_val, embedding_input=embedding_input) if x_unsup is not None: n_samples_unsup = x_unsup.shape[1] else: n_samples_unsup = 0 # Extract required information from data n_samples, n_feats = x_train.shape print("Number of features : ", n_feats) print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1])) n_targets = y_train.shape[1] # Set some variables batch_size = 138 beta = gamma if (gamma == 0) else beta # Preparing folder to save stuff if embedding_source is None: embedding_name = embedding_input else: embedding_name = embedding_source.replace("_", "").split(".")[0] exp_name += embedding_name.rsplit('/', 1)[::-1][0] + '_' exp_name += mlh.define_exp_name(keep_labels, alpha, beta, gamma, lmd, n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, n_hidden_s, which_fold, learning_rate, decoder_net_init, encoder_net_init, early_stop_criterion, learning_rate_annealing) print("Experiment: " + exp_name) save_path = os.path.join(save_path, dataset, exp_name) print(save_path) if not os.path.exists(save_path): os.makedirs(save_path) # Prepare Theano variables for inputs and targets input_var_sup = T.matrix('input_sup') input_var_unsup = theano.shared(x_unsup, 'input_unsup') # x_unsup TBD target_var_sup = T.matrix('target_sup') # Build model print("Building model") # Some checkings # assert len(n_hidden_u) > 0 assert len(n_hidden_t_enc) > 0 assert len(n_hidden_t_dec) > 0 assert n_hidden_t_dec[-1] == n_hidden_t_enc[-1] # Build feature embedding networks (encoding and decoding if gamma > 0) nets, embeddings, pred_feat_emb = mh.build_feat_emb_nets( embedding_source, n_feats, n_samples_unsup, input_var_unsup, n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, gamma, encoder_net_init, encoder_net_init, save_path) # Build feature embedding reconstruction networks (if alpha > 0, beta > 0) nets += mh.build_feat_emb_reconst_nets( [alpha, beta], n_samples_unsup, n_hidden_u, [n_hidden_t_enc, n_hidden_t_dec], nets, [encoder_net_init, encoder_net_init]) # Supervised network discrim_net, hidden_rep = mh.build_discrim_net( batch_size, n_feats, input_var_sup, n_hidden_t_enc, n_hidden_s, embeddings[0], disc_nonlinearity, n_targets) # Reconstruct network nets += [ mh.build_reconst_net(hidden_rep, embeddings[1] if len(embeddings) > 1 else None, n_feats, gamma) ] # Load best model with np.load(os.path.join(save_path, 'dietnets_best.npz')) as f: param_values = [f['arr_%d' % i] for i in range(len(f.files))] lasagne.layers.set_all_param_values( filter(None, nets) + [discrim_net], param_values) print("Building and compiling training functions") # Build functions predictions, 
predictions_det = mh.define_predictions(nets, start=2) prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net]) prediction_sup = prediction_sup[0] prediction_sup_det = prediction_sup_det[0] # Define losses # reconstruction losses _, reconst_losses_det = mh.define_reconst_losses( predictions, predictions_det, [input_var_unsup, input_var_unsup, input_var_sup]) # supervised loss _, sup_loss_det = mh.define_sup_loss(disc_nonlinearity, prediction_sup, prediction_sup_det, keep_labels, target_var_sup, missing_labels_val) # Define inputs inputs = [input_var_sup, target_var_sup] # Combine losses loss_det = sup_loss_det + alpha*reconst_losses_det[0] + \ beta*reconst_losses_det[1] + gamma*reconst_losses_det[2] # Define parameters params = lasagne.layers.get_all_params([discrim_net] + filter(None, nets), trainable=True) l2_penalty = apply_penalty(params, l2) loss_det = loss_det + lmd * l2_penalty # Monitoring Labels monitor_labels = [ "reconst. feat. W_enc", "reconst. feat. W_dec", "reconst. loss" ] monitor_labels = [ i for i, j in zip(monitor_labels, reconst_losses_det) if j != 0 ] monitor_labels += ["feat. W_enc. mean", "feat. W_enc var"] monitor_labels += ["feat. W_dec. mean", "feat. W_dec var"] if \ (embeddings[1] is not None) else [] monitor_labels += ["loss. sup.", "total loss"] # test function val_outputs = reconst_losses_det val_outputs = [ i for i, j in zip(val_outputs, reconst_losses_det) if j != 0 ] val_outputs += [embeddings[0].mean(), embeddings[0].var()] val_outputs += [embeddings[1].mean(), embeddings[1].var()] if \ (embeddings[1] is not None) else [] val_outputs += [sup_loss_det, loss_det] # Compute accuracy and add it to monitoring list test_acc, test_pred = mh.define_test_functions(disc_nonlinearity, prediction_sup, prediction_sup_det, target_var_sup) monitor_labels.append("accuracy") val_outputs.append(test_acc) # Compile prediction function predict = theano.function([input_var_sup], test_pred) # Compile validation function val_fn = theano.function(inputs, [prediction_sup_det] + val_outputs, on_unused_input='ignore') # Finally, launch the training loop. print("Starting testing...") test_minibatches = mlh.iterate_minibatches(x_test, y_test, batch_size, shuffle=False) test_err, pred, targets = mlh.monitoring(test_minibatches, "test", val_fn, monitor_labels, prec_recall_cutoff, return_pred=True) lab = targets.argmax(1) pred_argmax = pred.argmax(1) continent_cat = mh.create_1000_genomes_continent_labels() lab_cont = np.zeros(lab.shape) pred_cont = np.zeros(pred_argmax.shape) for i, c in enumerate(continent_cat): for el in c: lab_cont[lab == el] = i pred_cont[pred_argmax == el] = i cm_e = np.zeros((26, 26)) cm_c = np.zeros((5, 5)) for i in range(26): for j in range(26): cm_e[i, j] = ((pred_argmax == i) * (lab == j)).sum() for i in range(5): for j in range(5): cm_c[i, j] = ((pred_cont == i) * (lab_cont == j)).sum() np.savez(os.path.join(save_path, 'cm' + str(which_fold) + '.npz'), cm_e=cm_e, cm_c=cm_c) print(os.path.join(save_path, 'cm' + str(which_fold) + '.npz'))
def __init__(self, V, d, max_post_length, max_sentence_length, embeddings=None, GRAD_CLIP=100, num_layers=1, learning_rate=0.01, add_biases=False, rd=100, op=False, word_attn=True, sent_attn=True, highway=False, hops=3, words=True, frames=False, discourse=False): self._hyper_params = dict(V=V, d=d, max_post_length=max_post_length, max_sentence_length=max_sentence_length, GRAD_CLIP=GRAD_CLIP, num_layers=num_layers, learning_rate=learning_rate, add_biases=add_biases, rd=rd, op=op, word_attn=word_attn, sent_attn=sent_attn, highway=highway, hops=hops) print(V, d, max_post_length, max_sentence_length) #S x N matrix of sentences (aka list of word indices) #B x S x N tensor of batches of posts idxs_rr = T.itensor3('idxs_rr') idxs_op = T.itensor3('idxs_op') #B x S x N matrix mask_rr_w = T.itensor3('mask_rr_w') mask_op_w = T.itensor3('mask_rr_w') #B x S matrix mask_rr_s = T.imatrix('mask_rr_s') mask_op_s = T.imatrix('mask_rr_s') #B-long vector gold = T.ivector('gold') lambda_w = T.scalar('lambda_w') p_dropout = T.scalar('p_dropout') biases = T.matrix('biases') weights = T.ivector('weights') #now use this as an input to an LSTM l_idxs_rr = lasagne.layers.InputLayer(shape=(None, max_post_length, max_sentence_length), input_var=idxs_rr) l_mask_rr_w = lasagne.layers.InputLayer(shape=(None, max_post_length, max_sentence_length), input_var=mask_rr_w) l_mask_rr_s = lasagne.layers.InputLayer(shape=(None, max_post_length), input_var=mask_rr_s) l_idxs_op = lasagne.layers.InputLayer(shape=(None, max_post_length, max_sentence_length), input_var=idxs_op) l_mask_op_w = lasagne.layers.InputLayer(shape=(None, max_post_length, max_sentence_length), input_var=mask_op_w) l_mask_op_s = lasagne.layers.InputLayer(shape=(None, max_post_length), input_var=mask_op_s) if add_biases: l_biases = lasagne.layers.InputLayer(shape=(None, 1), input_var=biases) #now B x S x N x D if embeddings is not None: l_emb_rr_w = lasagne.layers.EmbeddingLayer( l_idxs_rr, V, d, W=lasagne.utils.floatX(embeddings)) else: l_emb_rr_w = lasagne.layers.EmbeddingLayer(l_idxs_rr, V, d) #now B x S x D if words: if word_attn: l_attn_rr_w = AttentionWordLayer([l_emb_rr_w, l_mask_rr_w], d) l_avg_rr_s = WeightedAverageWordLayer( [l_emb_rr_w, l_attn_rr_w]) else: l_avg_rr_s = AverageWordLayer([l_emb_rr_w, l_mask_rr_w]) concats = [l_avg_rr_s] inputs = [idxs_rr, mask_rr_w, mask_rr_s] else: concats = [] inputs = [mask_rr_w, mask_rr_s] if frames: idxs_frames_rr = T.itensor3('idxs_frames_rr') inputs.append(idxs_frames_rr) l_idxs_frames_rr = lasagne.layers.InputLayer( shape=(None, max_post_length, max_sentence_length), input_var=idxs_frames_rr) l_emb_frames_rr_w = lasagne.layers.EmbeddingLayer(l_idxs_frames_rr, V, d, W=l_emb_rr_w.W) if word_attn: l_attn_rr_frames = AttentionWordLayer( [l_emb_frames_rr_w, l_mask_rr_w], d) l_avg_rr_s_frames = WeightedAverageWordLayer( [l_emb_frames_rr_w, l_attn_rr_frames]) else: l_avg_rr_s_frames = AverageWordLayer( [l_emb_frames_rr_w, l_mask_rr_w]) concats.append(l_avg_rr_s_frames) if discourse: idxs_disc_rr = T.imatrix('idxs_disc_rr') inputs.append(idxs_disc_rr) l_emb_disc_rr = lasagne.layers.EmbeddingLayer(l_idxs_disc_rr, V, d, W=l_emb_rr_w.W) concats.append(l_emb_disc_rr) l_avg_rr_s = lasagne.layers.ConcatLayer(concats, axis=-1) if highway: l_avg_rr_s = HighwayLayer( l_avg_rr_s, num_units=l_avg_rr_s.output_shape[-1], nonlinearity=lasagne.nonlinearities.rectify, num_leading_axes=2) #separate embeddings for OP if embeddings is not None: l_emb_op_w = lasagne.layers.EmbeddingLayer( l_idxs_op, V, d, 
W=lasagne.utils.floatX(embeddings)) else: l_emb_op_w = lasagne.layers.EmbeddingLayer(l_idxs_op, V, d) if op: if words: l_attn_op_w = AttentionWordLayer([l_emb_op_w, l_mask_op_w], d) l_avg_op_s = WeightedAverageWordLayer( [l_emb_op_w, l_attn_op_w]) concats = [l_avg_op_s] inputs.extend([idxs_op, mask_op_w, mask_op_s]) else: concats = [] inputs.extend([mask_op_w, mask_op_s]) if frames: idxs_frames_op = T.itensor3('idxs_frames_op') inputs.append(idxs_frames_op) l_idxs_frames_op = lasagne.layers.InputLayer( shape=(None, max_post_length, max_sentence_length), input_var=idxs_frames_op) l_emb_frames_op_w = lasagne.layers.EmbeddingLayer( l_idxs_frames_op, V, d, W=l_emb_op_w.W) l_attn_op_frames = AttentionWordLayer( [l_emb_frames_op_w, l_mask_op_w], d) l_avg_op_s_frames = WeightedAverageWordLayer( [l_emb_frames_op_w, l_attn_op_frames]) concats.append(l_avg_op_s_frames) if discourse: idxs_disc_op = T.imatrix('idxs_disc_op') inputs.append(idxs_disc_op) l_emb_disc_op = lasagne.layers.EmbeddingLayer(l_idxs_disc_op, V, d, W=l_emb_op_w.W) concats.append(l_emb_disc_op) l_avg_op_s = lasagne.layers.ConcatLayer(concats, axis=-1) #bidirectional LSTM l_lstm_op_s_fwd = lasagne.layers.LSTMLayer( l_avg_op_s, rd, nonlinearity=lasagne.nonlinearities.tanh, grad_clipping=GRAD_CLIP, mask_input=l_mask_op_s) l_lstm_op_s_rev = lasagne.layers.LSTMLayer( l_avg_op_s, rd, nonlinearity=lasagne.nonlinearities.tanh, grad_clipping=GRAD_CLIP, mask_input=l_mask_op_s, backwards=True) l_avg_op_s = lasagne.layers.ConcatLayer( [l_lstm_op_s_fwd, l_lstm_op_s_rev], axis=-1) l_attn_op_s = AttentionSentenceLayer([l_avg_op_s, l_mask_op_s], d) l_op_avg = WeightedAverageSentenceLayer([l_avg_op_s, l_attn_op_s]) #bidirectional LSTM l_lstm_rr_s_fwd = lasagne.layers.LSTMLayer( l_avg_rr_s, rd, nonlinearity=lasagne.nonlinearities.tanh, grad_clipping=GRAD_CLIP, mask_input=l_mask_rr_s) l_lstm_rr_s_rev = lasagne.layers.LSTMLayer( l_avg_rr_s, rd, nonlinearity=lasagne.nonlinearities.tanh, grad_clipping=GRAD_CLIP, mask_input=l_mask_rr_s, backwards=True) #for attention or avergae l_lstm_rr_s = lasagne.layers.ConcatLayer( [l_lstm_rr_s_fwd, l_lstm_rr_s_rev], axis=-1) #now memory network init_memory_response = AverageSentenceLayer([l_lstm_rr_s, l_mask_rr_s]) if op: init_memory_response = lasagne.layers.ConcatLayer( [init_memory_response, l_op_avg]) l_memory = MyConcatLayer([l_lstm_rr_s, init_memory_response]) if sent_attn: l_attn_rr_s = AttentionSentenceLayer([l_lstm_rr_s, l_mask_rr_s], d) l_rr_avg = WeightedAverageSentenceLayer([l_lstm_rr_s, l_attn_rr_s]) else: l_rr_avg = AverageSentenceLayer([l_lstm_rr_s, l_mask_rr_s]) for i in range(hops): l_attn_rr_s = AttentionSentenceLayer([l_memory, l_mask_rr_s], d) l_rr_avg = WeightedAverageSentenceLayer([l_memory, l_attn_rr_s]) if op: l_rr_avg = lasagne.layers.ConcatLayer([l_rr_avg, l_op_avg]) l_memory = MyConcatLayer([l_lstm_rr_s, l_rr_avg]) l_hid = l_rr_avg for num_layer in range(num_layers): l_hid = lasagne.layers.DenseLayer( l_hid, num_units=d, nonlinearity=lasagne.nonlinearities.rectify) #now B x 1 l_hid = lasagne.layers.DropoutLayer(l_hid, p_dropout) if add_biases: l_hid = lasagne.layers.ConcatLayer([l_hid, l_biases], axis=-1) inputs.append(biases) self.network = lasagne.layers.DenseLayer(l_hid, num_units=1, nonlinearity=T.nnet.sigmoid) predictions = lasagne.layers.get_output(self.network).ravel() xent = lasagne.objectives.binary_crossentropy(predictions, gold) loss = lasagne.objectives.aggregate(xent, weights, mode='normalized_sum') params = lasagne.layers.get_all_params(self.network, trainable=True) #add 
regularization loss += lambda_w * apply_penalty(params, l2) updates = lasagne.updates.nesterov_momentum( loss, params, learning_rate=learning_rate, momentum=0.9) print('compiling...') train_outputs = loss self.train = theano.function(inputs + [gold, lambda_w, p_dropout, weights], train_outputs, updates=updates, allow_input_downcast=True, on_unused_input='warn') print('...') test_predictions = lasagne.layers.get_output( self.network, deterministic=True).ravel() self.predict = theano.function(inputs, test_predictions, allow_input_downcast=True, on_unused_input='warn') test_acc = T.mean(T.eq(test_predictions > .5, gold), dtype=theano.config.floatX) print('...') test_loss = lasagne.objectives.binary_crossentropy( test_predictions, gold).mean() self.validate = theano.function( inputs + [gold, lambda_w, p_dropout, weights], [loss, test_acc], on_unused_input='warn') print('...') #attention for words, B x S x N print('attention...') word_attention = lasagne.layers.get_output( AttentionWordLayer([l_emb_rr_w, l_mask_rr_w], d, W_w=l_attn_rr_w.W_w, u_w=l_attn_rr_w.u_w, b_w=l_attn_rr_w.b_w, normalized=False)) self.word_attention = theano.function([idxs_rr, mask_rr_w], word_attention, allow_input_downcast=True, on_unused_input='warn') #attention for sentences, B x S print('...') sentence_attention = lasagne.layers.get_output(l_attn_rr_s) if add_biases: inputs = inputs[:-1] self.sentence_attention = theano.function(inputs, sentence_attention, allow_input_downcast=True, on_unused_input='warn') print('finished compiling...')
def __init__(self, W=None, W_path=None, K=300, num_hidden=256, batch_size=None, grad_clip=100., max_sent_len=200, num_classes=2, **kwargs): W = W V = len(W) K = int(K) print("this is the value of K: {}\n".format(K)) num_hidden = int(num_hidden) batch_size = int(batch_size) grad_clip = int(grad_clip) max_seq_len = int(max_sent_len) max_post_len = int(kwargs["max_post_len"]) num_classes = int(num_classes) dropout = float(kwargs["dropout"]) lambda_w = float(kwargs["lambda_w"]) separate_attention_context = str_to_bool(kwargs["separate_attention_context"]) separate_attention_response = str_to_bool(kwargs["separate_attention_response"]) interaction = str_to_bool(kwargs["interaction"]) separate_attention_context_words = str_to_bool(kwargs["separate_attention_context_words"]) separate_attention_response_words = str_to_bool(kwargs["separate_attention_response_words"]) print("this is the separate_attention_context: {}\n".format(separate_attention_context)) print("this is the separate_attention_response: {}\n".format(separate_attention_response)) print("this is the separate_attention_context_words: {}\n".format(separate_attention_context_words)) print("this is the separate_attention_response_words: {}\n".format(separate_attention_response_words)) print("this is the interaction: {}\n".format(interaction)) #S x N matrix of sentences (aka list of word indices) #B x S x N tensor of batches of responses idxs_context = T.itensor3('idxs_context') #imatrix #B x S x N matrix mask_context_words = T.itensor3('mask_context_words') #B x S matrix mask_context_sents = T.imatrix('mask_context_sents') #B x S x N tensor of batches of responses idxs_response = T.itensor3('idxs_response') #imatrix #B x S x N matrix mask_response_words = T.itensor3('mask_response_words') #B x S matrix mask_response_sents = T.imatrix('mask_response_sents') #B-long vector y = T.ivector('y') # TODO # Add biases, other params? 
#lambda_w = T.scalar('lambda_w') #p_dropout = T.scalar('p_dropout') #biases = T.matrix('biases') #weights = T.ivector('weights') inputs = [idxs_response, mask_response_words, mask_response_sents] # TODO # change inputs, function calls #idxs_context, mask_context_words, mask_context_sents #now use this as an input to an LSTM l_idxs_context = lasagne.layers.InputLayer(shape=(None, max_post_len, max_sent_len), input_var=idxs_context) l_mask_context_words = lasagne.layers.InputLayer(shape=(None, max_post_len, max_sent_len),input_var=mask_context_words) l_mask_context_sents = lasagne.layers.InputLayer(shape=(None, max_post_len), input_var=mask_context_sents) #if add_biases: # l_biases = lasagne.layers.InputLayer(shape=(None,1), # input_var=biases) #now B x S x N x D #l_emb = lasagne.layers.EmbeddingLayer(l_in, input_size=V, output_size=K, W=W) l_emb_rr_w_context = lasagne.layers.EmbeddingLayer(l_idxs_context, input_size=V, output_size=K, W=W) l_emb_rr_w_context.params[l_emb_rr_w_context.W].remove('trainable') # l_hid_context = l_emb_rr_w #CBOW w/attn #now B x S x D if separate_attention_context_words: l_attention_words_context = AttentionWordLayer([l_emb_rr_w_context, l_mask_context_words], K) #print(" attention word layer shape: {}\n".format(get_output_shape(l_attention_words_context))) l_avg_rr_s_words_context = WeightedAverageWordLayer([l_emb_rr_w_context,l_attention_words_context]) else: l_avg_rr_s_words_context = WeightedAverageWordLayer([l_emb_rr_w_context, l_mask_context_words]) ##concats = l_avg_rr_s_words_context ##concats = [l_avg_rr_s_words_context] l_avg_rr_s_context = l_avg_rr_s_words_context # concats not relevant here, was just frames, sentiment etc for other task. #l_avg_rr_s_context = lasagne.layers.ConcatLayer(concats, axis=-1) # TODO # add highway ? #add MLP #if highway: # l_avg_rr_s_context = HighwayLayer(l_avg_rr_s_context, num_units=l_avg_rr_s_context.output_shape[-1], # nonlinearity=lasagne.nonlinearities.rectify, # num_leading_axes=2) # l_lstm_rr_s_context = lasagne.layers.LSTMLayer(l_avg_rr_s_context, num_hidden, nonlinearity=lasagne.nonlinearities.tanh, grad_clipping=grad_clip, mask_input=l_mask_context_sents) l_lstm_rr_s_context = lasagne.layers.DropoutLayer(l_lstm_rr_s_context,p=dropout) if interaction: #l_hid_context = l_lstm_rr_s_context if separate_attention_context: print("separate attention context\n") l_attn_rr_s_context = AttentionSentenceLayer([l_lstm_rr_s_context, l_mask_context_sents], num_hidden) l_lstm_rr_avg_context = WeightedAverageSentenceLayer([l_lstm_rr_s_context, l_attn_rr_s_context]) print(" attention weighted average sentence layer shape: {}\n".format(get_output_shape(l_lstm_rr_avg_context))) else: print("just averaged context without attention\n") l_lstm_rr_avg_context = WeightedAverageSentenceLayer([l_lstm_rr_s_context, l_mask_context_sents]) print(" attention weighted average sentence layer shape: {}\n".format(get_output_shape(l_lstm_rr_avg_context))) l_hid_context = l_lstm_rr_avg_context print("interaction\n") else: print("no interaction!!! 
\n") #LSTM w/ attn #now B x D if separate_attention_context: print("separate attention context\n") l_attn_rr_s_context = AttentionSentenceLayer([l_lstm_rr_s_context, l_mask_context_sents], num_hidden) l_lstm_rr_avg_context = WeightedAverageSentenceLayer([l_lstm_rr_s_context, l_attn_rr_s_context]) print(" attention weighted average sentence layer shape: {}\n".format(get_output_shape(l_lstm_rr_avg_context))) else: print("just averaged context without attention\n") l_lstm_rr_avg_context = WeightedAverageSentenceLayer([l_lstm_rr_s_context, l_mask_context_sents]) print(" attention weighted average sentence layer shape: {}\n".format(get_output_shape(l_lstm_rr_avg_context))) l_hid_context = l_lstm_rr_avg_context # TODO # change inputs, function calls #idxs_context, mask_context_words, mask_context_sents #now use this as an input to an LSTM l_idxs_response = lasagne.layers.InputLayer(shape=(None, max_post_len, max_sent_len), input_var=idxs_response) l_mask_response_words = lasagne.layers.InputLayer(shape=(None, max_post_len, max_sent_len),input_var=mask_response_words) l_mask_response_sents = lasagne.layers.InputLayer(shape=(None, max_post_len), input_var=mask_response_sents) #if add_biases: # l_biases = lasagne.layers.InputLayer(shape=(None,1), # input_var=biases) #now B x S x N x D #l_emb = lasagne.layers.EmbeddingLayer(l_in, input_size=V, output_size=K, W=W) l_emb_rr_w_response = lasagne.layers.EmbeddingLayer(l_idxs_response, input_size=V, output_size=K, W=W) l_emb_rr_w_response.params[l_emb_rr_w_response.W].remove('trainable') # l_hid_response = l_emb_rr_w #CBOW w/attn #now B x S x D if separate_attention_response_words: l_attention_words_response = AttentionWordLayer([l_emb_rr_w_response, l_mask_response_words], K) #print(" attention word layer shape: {}\n".format(get_output_shape(l_attention_words_response))) l_avg_rr_s_words_response = WeightedAverageWordLayer([l_emb_rr_w_response,l_attention_words_response]) else: l_avg_rr_s_words_response = WeightedAverageWordLayer([l_emb_rr_w_response, l_mask_response_words]) #l_attention_words_response = AttentionWordLayer([l_emb_rr_w_response, l_mask_response_words], K) #print(" attention word layer shape: {}\n".format(get_output_shape(l_attention_words_response))) #l_avg_rr_s_words_response = WeightedAverageWordLayer([l_emb_rr_w_response, l_mask_response_words]) ##concats = l_avg_rr_s_words_response ##concats = [l_avg_rr_s_words_response] l_avg_rr_s_response = l_avg_rr_s_words_response # concats not relevant here, was just frames, sentiment etc for other task. #l_avg_rr_s_response = lasagne.layers.ConcatLayer(concats, axis=-1) # TODO # add highway ? 
#add MLP #if highway: # l_avg_rr_s_response = HighwayLayer(l_avg_rr_s_response, num_units=l_avg_rr_s_response.output_shape[-1], # nonlinearity=lasagne.nonlinearities.rectify, # num_leading_axes=2) # if interaction: print("interaction\n") # add some cell init l_lstm_rr_s_response = lasagne.layers.LSTMLayer(l_avg_rr_s_response, num_hidden, nonlinearity=lasagne.nonlinearities.tanh, grad_clipping=grad_clip,cell_init=l_hid_context, mask_input=l_mask_response_sents) else: l_lstm_rr_s_response = lasagne.layers.LSTMLayer(l_avg_rr_s_response, num_hidden, nonlinearity=lasagne.nonlinearities.tanh, grad_clipping=grad_clip, mask_input=l_mask_response_sents) l_lstm_rr_s_response = lasagne.layers.DropoutLayer(l_lstm_rr_s_response,p=dropout) #LSTM w/ attn #now B x D if separate_attention_response: print("separate attention on the response\n") l_attn_rr_s_response = AttentionSentenceLayer([l_lstm_rr_s_response, l_mask_response_sents], num_hidden) l_lstm_rr_avg_response = WeightedAverageSentenceLayer([l_lstm_rr_s_response, l_attn_rr_s_response]) print(" attention weighted average sentence layer shape: {}\n".format(get_output_shape(l_lstm_rr_avg_response))) else: print("just average response without attention\n") l_lstm_rr_avg_response = WeightedAverageSentenceLayer([l_lstm_rr_s_response, l_mask_response_sents]) print(" attention weighted average sentence layer shape: {}\n".format(get_output_shape(l_lstm_rr_avg_response))) l_hid_response = l_lstm_rr_avg_response # TODO # add more layers? biases? #for num_layer in range(num_layers): # l_hid_response = lasagne.layers.DenseLayer(l_hid_response, num_units=rd, # nonlinearity=lasagne.nonlinearities.rectify) # #now B x 1 # l_hid_response = lasagne.layers.DropoutLayer(l_hid_response, p_dropout) # #if add_biases: # l_hid_response = lasagne.layers.ConcatLayer([l_hid_response, l_biases], axis=-1) # inputs.append(biases) # #self.network = lasagne.layers.DenseLayer(l_hid_response, num_units=2, # nonlinearity=T.nnet.sigmoid) # #predictions = lasagne.layers.get_output(self.network).ravel() # #xent = lasagne.objectives.binary_crossentropy(predictions, gold) #loss = lasagne.objectives.aggregate(xent, weights, mode='normalized_sum') # #params = lasagne.layers.get_all_params(self.network, trainable=True) # # TODO ##add regularization? different gradient technique? 
#loss += lambda_w*apply_penalty(params, l2) #updates = lasagne.updates.nesterov_momentum(loss, params, # learning_rate=learning_rate, momentum=0.9) #print('compiling...') #train_outputs = loss #self.train = theano.function(inputs + [gold, lambda_w, p_dropout, weights], # train_outputs, # updates=updates, # allow_input_downcast=True, # on_unused_input='warn') #print('...') #test_predictions = lasagne.layers.get_output(self.network, deterministic=True).ravel() # #self.predict = theano.function(inputs, # test_predictions, # allow_input_downcast=True, # on_unused_input='warn') #test_acc = T.mean(T.eq(test_predictions > .5, gold), # dtype=theano.config.floatX) #print('...') #test_loss = lasagne.objectives.binary_crossentropy(test_predictions, # gold).mean() #self.validate = theano.function(inputs + [gold, lambda_w, p_dropout, weights], # [loss, test_acc], # on_unused_input='warn') print('...') #attention for words, B x S x N ##attention for sentences, B x S print('finished compiling...') if interaction: l_concat = l_hid_response else: l_concat = lasagne.layers.ConcatLayer([l_hid_context,l_hid_response]) network = lasagne.layers.DenseLayer( l_concat, num_units=num_classes, nonlinearity=lasagne.nonlinearities.softmax ) self.network = network output = lasagne.layers.get_output(network) # Define objective function (cost) to minimize, mean crossentropy error cost = lasagne.objectives.categorical_crossentropy(output, y).mean() # Compute gradient updates params = lasagne.layers.get_all_params(network) cost += lambda_w*apply_penalty(params, l2) # grad_updates = lasagne.updates.nesterov_momentum(cost, params,learn_rate) grad_updates = lasagne.updates.adam(cost, params) #learn_rate = .01 #grad_updates = lasagne.updates.adadelta(cost, params, learn_rate) test_output = lasagne.layers.get_output(network, deterministic=True) val_cost_fn = lasagne.objectives.categorical_crossentropy( test_output, y).mean() preds = T.argmax(test_output, axis=1) val_acc_fn = T.mean(T.eq(preds, y), dtype=theano.config.floatX) self.val_fn = theano.function([idxs_context, mask_context_words, mask_context_sents, idxs_response, mask_response_words, mask_response_sents, y], [val_cost_fn, val_acc_fn, preds], allow_input_downcast=True,on_unused_input='warn') # Compile train objective print "Compiling training, testing, prediction functions" self.train = theano.function(inputs = [idxs_context, mask_context_words, mask_context_sents,idxs_response, mask_response_words, mask_response_sents, y], outputs = cost, updates = grad_updates, allow_input_downcast=True,on_unused_input='warn') self.test = theano.function(inputs = [idxs_context, mask_context_words, mask_context_sents,idxs_response, mask_response_words, mask_response_sents, y], outputs = val_acc_fn,allow_input_downcast=True,on_unused_input='warn') self.pred = theano.function(inputs = [idxs_context, mask_context_words, mask_context_sents, idxs_response, mask_response_words, mask_response_sents],outputs = preds,allow_input_downcast=True,on_unused_input='warn') if separate_attention_response: sentence_attention = lasagne.layers.get_output(l_attn_rr_s_response, deterministic=True) #if add_biases: # inputs = inputs[:-1] self.sentence_attention_response = theano.function([idxs_context, mask_context_words, mask_context_sents,idxs_response, mask_response_words, mask_response_sents], [sentence_attention, preds], allow_input_downcast=True, on_unused_input='warn') if separate_attention_context: sentence_attention_context = lasagne.layers.get_output(l_attn_rr_s_context, deterministic=True) #if 
add_biases: # inputs = inputs[:-1] self.sentence_attention_context = theano.function([idxs_context, mask_context_words, mask_context_sents,idxs_response, mask_response_words, mask_response_sents], [sentence_attention_context,preds], allow_input_downcast=True, on_unused_input='warn') if separate_attention_response_words: word_attention = lasagne.layers.get_output(l_attention_words_response, deterministic=True) self.sentence_attention_response_words = theano.function([idxs_context, mask_context_words, mask_context_sents,idxs_response, mask_response_words, mask_response_sents],[word_attention,preds], allow_input_downcast=True, on_unused_input='warn') if separate_attention_context_words: word_attention_context = lasagne.layers.get_output(l_attention_words_context, deterministic = True) self.sentence_attention_context_words = theano.function([idxs_context, mask_context_words, mask_context_sents,idxs_response, mask_response_words, mask_response_sents],[word_attention_context,preds], allow_input_downcast=True, on_unused_input='warn')
def execute(dataset, n_hidden_t_enc, n_hidden_s, num_epochs=500, learning_rate=.001, learning_rate_annealing=1.0, gamma=1, lmd=0., disc_nonlinearity="sigmoid", keep_labels=1.0, prec_recall_cutoff=True, missing_labels_val=-1.0, which_fold=1, early_stop_criterion='loss', save_path='/Tmp/romerosa/DietNetworks/', save_copy='/Tmp/romerosa/DietNetworks/', dataset_path='/Tmp/carriepl/datasets/', resume=False): # Load the dataset print("Loading data") x_train, y_train, x_valid, y_valid, x_test, y_test, \ x_unsup, training_labels = mlh.load_data( dataset, dataset_path, None, which_fold=which_fold, keep_labels=keep_labels, missing_labels_val=missing_labels_val, embedding_input='raw') # Extract required information from data n_samples, n_feats = x_train.shape print("Number of features : ", n_feats) print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1])) n_targets = y_train.shape[1] # Set some variables batch_size = 128 # Preparing folder to save stuff exp_name = 'basic_' + mlh.define_exp_name( keep_labels, 0, 0, gamma, lmd, [], n_hidden_t_enc, [], n_hidden_s, which_fold, learning_rate, 0, 0, early_stop_criterion, learning_rate_annealing) print("Experiment: " + exp_name) save_path = os.path.join(save_path, dataset, exp_name) save_copy = os.path.join(save_copy, dataset, exp_name) if not os.path.exists(save_path): os.makedirs(save_path) # Prepare Theano variables for inputs and targets input_var_sup = T.matrix('input_sup') target_var_sup = T.matrix('target_sup') lr = theano.shared(np.float32(learning_rate), 'learning_rate') # Build model print("Building model") discrim_net = InputLayer((None, n_feats), input_var_sup) discrim_net = DenseLayer(discrim_net, num_units=n_hidden_t_enc[-1], nonlinearity=rectify) # Reconstruct the input using dec_feat_emb if gamma > 0: reconst_net = DenseLayer(discrim_net, num_units=n_feats, nonlinearity=linear) nets = [reconst_net] else: nets = [None] # Add supervised hidden layers for hid in n_hidden_s: discrim_net = DropoutLayer(discrim_net) discrim_net = DenseLayer(discrim_net, num_units=hid) assert disc_nonlinearity in ["sigmoid", "linear", "rectify", "softmax"] discrim_net = DropoutLayer(discrim_net) discrim_net = DenseLayer(discrim_net, num_units=n_targets, nonlinearity=eval(disc_nonlinearity)) print("Building and compiling training functions") # Build and compile training functions predictions, predictions_det = mh.define_predictions(nets, start=0) prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net]) prediction_sup = prediction_sup[0] prediction_sup_det = prediction_sup_det[0] # Define losses # reconstruction losses reconst_losses, reconst_losses_det = mh.define_reconst_losses( predictions, predictions_det, [input_var_sup]) # supervised loss sup_loss, sup_loss_det = mh.define_sup_loss(disc_nonlinearity, prediction_sup, prediction_sup_det, keep_labels, target_var_sup, missing_labels_val) inputs = [input_var_sup, target_var_sup] params = lasagne.layers.get_all_params([discrim_net] + nets, trainable=True) print('Number of params: ' + str(len(params))) # Combine losses loss = sup_loss + gamma * reconst_losses[0] loss_det = sup_loss_det + gamma * reconst_losses_det[0] l2_penalty = apply_penalty(params, l2) loss = loss + lmd * l2_penalty loss_det = loss_det + lmd * l2_penalty # Compute network updates updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr) # updates = lasagne.updates.sgd(loss, # params, # learning_rate=lr) # updates = lasagne.updates.momentum(loss, params, # learning_rate=lr, momentum=0.0) # Apply norm constraints on the 
weights for k in updates.keys(): if updates[k].ndim == 2: updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0) # Compile training function train_fn = theano.function(inputs, loss, updates=updates, on_unused_input='ignore') # Monitoring Labels monitor_labels = ["reconst. loss"] monitor_labels = [ i for i, j in zip(monitor_labels, reconst_losses) if j != 0 ] monitor_labels += ["loss. sup.", "total loss"] # Build and compile test function val_outputs = reconst_losses_det val_outputs = [i for i, j in zip(val_outputs, reconst_losses) if j != 0] val_outputs += [sup_loss_det, loss_det] # Compute accuracy and add it to monitoring list test_acc, test_pred = mh.define_test_functions(disc_nonlinearity, prediction_sup, prediction_sup_det, target_var_sup) monitor_labels.append("accuracy") val_outputs.append(test_acc) # Compile prediction function predict = theano.function([input_var_sup], test_pred) # Compile validation function val_fn = theano.function(inputs, [prediction_sup_det] + val_outputs, on_unused_input='ignore') # Finally, launch the training loop. print("Starting training...") # Some variables max_patience = 100 patience = 0 train_monitored = [] valid_monitored = [] train_loss = [] # Pre-training monitoring print("Epoch 0 of {}".format(num_epochs)) train_minibatches = mlh.iterate_minibatches(x_train, y_train, batch_size, shuffle=False) train_err = mlh.monitoring(train_minibatches, "train", val_fn, monitor_labels, prec_recall_cutoff) valid_minibatches = mlh.iterate_minibatches(x_valid, y_valid, batch_size, shuffle=False) valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn, monitor_labels, prec_recall_cutoff) # Training loop start_training = time.time() for epoch in range(num_epochs): start_time = time.time() print("Epoch {} of {}".format(epoch + 1, num_epochs)) nb_minibatches = 0 loss_epoch = 0 # Train pass for batch in mlh.iterate_minibatches(x_train, training_labels, batch_size, shuffle=True): loss_epoch += train_fn(*batch) nb_minibatches += 1 loss_epoch /= nb_minibatches train_loss += [loss_epoch] # Monitoring on the training set train_minibatches = mlh.iterate_minibatches(x_train, y_train, batch_size, shuffle=False) train_err = mlh.monitoring(train_minibatches, "train", val_fn, monitor_labels, prec_recall_cutoff) train_monitored += [train_err] # Monitoring on the validation set valid_minibatches = mlh.iterate_minibatches(x_valid, y_valid, batch_size, shuffle=False) valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn, monitor_labels, prec_recall_cutoff) valid_monitored += [valid_err] try: early_stop_val = valid_err[monitor_labels.index( early_stop_criterion)] except: raise ValueError("There is no monitored value by the name of %s" % early_stop_criterion) # Early stopping if epoch == 0: best_valid = early_stop_val elif (early_stop_val > best_valid and early_stop_criterion == 'accuracy') or \ (early_stop_val < best_valid and early_stop_criterion == 'loss. 
sup.'): best_valid = early_stop_val patience = 0 # Save stuff np.savez( os.path.join(save_path, 'model_best.npz'), *lasagne.layers.get_all_param_values( filter(None, nets) + [discrim_net])) np.savez(save_path + "/errors_supervised_best.npz", zip(*train_monitored), zip(*valid_monitored)) else: patience += 1 np.savez( os.path.join(save_path, 'model_last.npz'), *lasagne.layers.get_all_param_values( filter(None, nets) + [discrim_net])) np.savez(save_path + "/errors_supervised_last.npz", zip(*train_monitored), zip(*valid_monitored)) # End training if patience == max_patience or epoch == num_epochs - 1: print("Ending training") # Load best model if not os.path.exists(save_path + '/model_best.npz'): print("No saved model to be tested and/or generate" " the embedding !") else: with np.load(save_path + '/model_best.npz', ) as f: param_values = [ f['arr_%d' % i] for i in range(len(f.files)) ] lasagne.layers.set_all_param_values( filter(None, nets) + [discrim_net], param_values) # Training set results train_minibatches = mlh.iterate_minibatches(x_train, y_train, batch_size, shuffle=False) train_err = mlh.monitoring(train_minibatches, "train", val_fn, monitor_labels, prec_recall_cutoff) # Validation set results valid_minibatches = mlh.iterate_minibatches(x_valid, y_valid, batch_size, shuffle=False) valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn, monitor_labels, prec_recall_cutoff) # Test set results if y_test is not None: test_minibatches = mlh.iterate_minibatches(x_test, y_test, batch_size, shuffle=False) test_err = mlh.monitoring(test_minibatches, "test", val_fn, monitor_labels, prec_recall_cutoff) else: for minibatch in mlh.iterate_testbatches(x_test, batch_size, shuffle=False): test_predictions = [] test_predictions += [predict(minibatch)] np.savez(os.path.join(save_path, 'test_predictions.npz'), test_predictions) # Stop print(" epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time)) break print(" epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time)) # Anneal the learning rate lr.set_value(float(lr.get_value() * learning_rate_annealing)) # Print all final errors for train, validation and test print("Training time:\t\t\t{:.3f}s".format(time.time() - start_training)) # Copy files to loadpath if save_path != save_copy: print('Copying model and other training files to {}'.format(save_copy)) copy_tree(save_path, save_copy)
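# The pattern above -- add `lmd * apply_penalty(params, l2)` to the task loss and
# then clamp every 2-D weight update with `norm_constraint` -- can be reproduced in
# isolation. The following is a minimal, hedged sketch with made-up sizes
# (`n_feats`, `n_targets`) and a plain softmax classifier; it is not the
# DietNetworks model itself, only the regularization/constraint recipe.
import numpy as np
import theano
import theano.tensor as T
import lasagne
from lasagne.regularization import apply_penalty, l2

n_feats, n_targets, lmd = 100, 10, 1e-4  # hypothetical sizes and decay strength

input_var = T.matrix('input')
target_var = T.ivector('target')

net = lasagne.layers.InputLayer((None, n_feats), input_var)
net = lasagne.layers.DenseLayer(net, num_units=n_targets,
                                nonlinearity=lasagne.nonlinearities.softmax)

prediction = lasagne.layers.get_output(net)
params = lasagne.layers.get_all_params(net, trainable=True)

# Task loss plus L2 weight decay over every trainable parameter.
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var).mean()
loss = loss + lmd * apply_penalty(params, l2)

# RMSProp updates; each 2-D weight update is renormalised to a max norm of 1.0.
lr = theano.shared(np.float32(0.001), 'learning_rate')
updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr)
for k in updates.keys():
    if updates[k].ndim == 2:
        updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

train_fn = theano.function([input_var, target_var], loss, updates=updates)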
def execute(dataset, learning_rate=0.00001, alpha=0., beta=1., lmd=0., encoder_units=[1024, 512, 256], num_epochs=500, which_fold=1, save_path=None, save_copy=None, dataset_path=None): # Reading dataset print("Loading data") x_unsup = mlh.load_data(dataset, dataset_path, None, which_fold=which_fold, keep_labels=1.0, missing_labels_val=-1.0, embedding_input='bin', transpose=True) x_train = x_unsup[0][0] x_valid = x_unsup[1][0] n_features = x_train.shape[1] exp_name = "learn_gene_vector_h" for e in encoder_units: exp_name += ('-' + str(e)) exp_name += '_a-' + str(alpha) exp_name += '_b-' + str(beta) exp_name += '_l-' + str(lmd) exp_name += '_lr-' + str(learning_rate) save_path = os.path.join(save_path, exp_name) save_copy = os.path.join(save_copy, exp_name) if not os.path.exists(save_path): os.makedirs(save_path) if not os.path.exists(save_copy): os.makedirs(save_copy) # Prepare Theano variables for inputs and targets input_var = T.matrix('input') target_var = T.matrix('target') target_reconst = T.matrix('target') lr = theano.shared(np.float32(learning_rate), 'learning_rate') lmd = 0.0001 # weight decay coeff num_epochs = 200 # there arent really any epochs as we are using a generator with random # sampling from dataset. This is for compat. batches_per_epoch = 1000 batch_size = 128 # building network encoder = InputLayer((batch_size, n_features), input_var) # building the encoder and decoder for i in range(len(encoder_units)): encoder = DenseLayer(encoder, num_units=encoder_units[i], nonlinearity=rectify) params = lasagne.layers.get_all_params(encoder, trainable=True) monitor_labels = [] val_outputs = [] nets = [encoder] if alpha > 0: decoder_units = encoder_units[::-1][1:] decoder = encoder for i in range(len(decoder_units)): decoder = DenseLayer(decoder, num_units=decoder_units[i], nonlinearity=rectify) decoder = DenseLayer(decoder, num_units=n_features, nonlinearity=sigmoid) prediction_reconst = lasagne.layers.get_output(decoder) # Reconstruction error loss_reconst = lasagne.objectives.binary_crossentropy( prediction_reconst, target_reconst).mean() params += lasagne.layers.get_all_params(decoder, trainable=True) monitor_labels += ["reconst."] val_outputs += [loss_reconst] nets += [decoder] else: loss_reconst = 0 if beta > 0: predictor_laysize = [encoder_units[-1]] * 4 predictor = encoder for i in range(len(predictor_laysize)): predictor = DenseLayer(predictor, num_units=predictor_laysize[i], nonlinearity=rectify) predictor = DenseLayer(predictor, num_units=2, nonlinearity=sigmoid) prediction_var = lasagne.layers.get_output(predictor) # w2v error loss_pred = lasagne.objectives.binary_crossentropy( prediction_var, target_var).mean() params += lasagne.layers.get_all_params(predictor, trainable=True) monitor_labels += ["pred."] val_outputs += [loss_pred] nets += [predictor] else: loss_pred = 0 # Combine losses loss = alpha * loss_reconst + beta * loss_pred # applying weight decay l2_penalty = apply_penalty(params, l2) loss = loss + lmd * l2_penalty # loss = loss + lmd*l2_penalty val_outputs += [loss] monitor_labels += ['loss'] # Some variables max_patience = 100 patience = 0 train_monitored = [] valid_monitored = [] train_loss = [] updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr) inputs = [input_var, target_var, target_reconst] # Compile training function print "Compiling training function" train_fn = theano.function(inputs, loss, updates=updates, on_unused_input='ignore') val_fn = theano.function(inputs, [val_outputs[0]] + val_outputs, on_unused_input='ignore') 
start_training = time.time() print "training start time: {}".format(start_training) # data_gen = data_generator(x_train, batch_size) print "Starting training" for epoch in range(num_epochs): start_time = time.time() print("Epoch {} of {}".format(epoch + 1, num_epochs)) nb_minibatches = 0 loss_epoch = 0 for x, y, target_reconst_val in data_generator(x_train, batch_size): loss_epoch += train_fn(x, y, target_reconst_val) nb_minibatches += 1 loss_epoch /= nb_minibatches train_loss += [loss_epoch] # Monitoring on the training set train_minibatches = data_generator(x_train, batch_size) train_err = mlh.monitoring(train_minibatches, "train", val_fn, monitor_labels, 0) train_monitored += [train_err] # Monitoring on the validation set valid_minibatches = data_generator(x_valid, batch_size) valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn, monitor_labels, 0) valid_monitored += [valid_err] early_stop_criterion = 'loss' early_stop_val = valid_err[monitor_labels.index(early_stop_criterion)] # Early stopping if epoch == 0: best_valid = early_stop_val elif early_stop_val < best_valid and early_stop_criterion == 'loss': best_valid = early_stop_val patience = 0 # Save stuff np.savez(save_path + '/model_snp2vec_best.npz', *lasagne.layers.get_all_param_values(nets)) np.savez(save_path + "/errors_snp2vec_best.npz", zip(*train_monitored), zip(*valid_monitored)) else: patience += 1 np.savez(os.path.join(save_path, 'model_snp2vec_last.npz'), *lasagne.layers.get_all_param_values(nets)) np.savez(save_path + "/errors_snp2vec_last.npz", zip(*train_monitored), zip(*valid_monitored)) # End training if (patience == max_patience) or (epoch == num_epochs - 1): print("Ending training") # Load best model if not os.path.exists(save_path + '/model_snp2vec_best.npz'): print( "No saved model to be tested and/or generate" " the embedding !") else: with np.load(save_path + '/model_snp2vec_best.npz') as f: param_values = [ f['arr_%d' % i] for i in range(len(f.files)) ] lasagne.layers.set_all_param_values(nets, param_values) # Training set results train_minibatches = data_generator(x_train, batch_size) train_err = mlh.monitoring(train_minibatches, "train", val_fn, monitor_labels, 0) # Validation set results valid_minibatches = data_generator(x_valid, batch_size) valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn, monitor_labels, 0) # Stop print(" epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time)) break print(" epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time)) # Copy files to loadpath if save_path != save_copy: print('Copying model and other training files to {}'.format(save_copy)) copy_tree(save_path, save_copy)
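# The early-stopping bookkeeping above (patience counter, best/last checkpoints
# saved with np.savez, best model reloaded at the end) recurs in every script in
# this file. A hedged, stand-alone sketch of just that mechanism follows;
# `train_one_epoch` and `compute_valid_loss` are hypothetical stand-ins for the
# project's minibatch helpers.
import os
import numpy as np
import lasagne

def fit(network, train_one_epoch, compute_valid_loss,
        save_path, num_epochs=500, max_patience=100):
    best_valid = np.inf
    patience = 0
    for epoch in range(num_epochs):
        train_one_epoch()
        valid_loss = compute_valid_loss()

        if valid_loss < best_valid:
            best_valid = valid_loss
            patience = 0
            # Checkpoint the best parameters as a flat list of arrays.
            np.savez(os.path.join(save_path, 'model_best.npz'),
                     *lasagne.layers.get_all_param_values(network))
        else:
            patience += 1

        if patience == max_patience:
            break

    # Restore the best parameters before any final evaluation.
    with np.load(os.path.join(save_path, 'model_best.npz')) as f:
        values = [f['arr_%d' % i] for i in range(len(f.files))]
    lasagne.layers.set_all_param_values(network, values)
    return best_valid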
def execute( dataset, n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, n_hidden_s, embedding_source=None, num_epochs=500, learning_rate=.001, learning_rate_annealing=1.0, alpha=1, beta=1, gamma=1, lmd=.0001, disc_nonlinearity="sigmoid", encoder_net_init=0.2, decoder_net_init=0.2, keep_labels=1.0, prec_recall_cutoff=True, missing_labels_val=-1.0, which_fold=0, early_stop_criterion='loss_sup_det', embedding_input='raw', save_path='/Tmp/' + os.environ["USER"] + '/savepath/', # a default value was needed? save_copy='/Tmp/' + os.environ["USER"] + '/savecopy/', dataset_path='/Tmp/' + os.environ["USER"] + '/datasets/', resume=False, exp_name='', random_proj=0): # Load the dataset print("Loading data") x_train, y_train, x_valid, y_valid, x_test, y_test, \ x_unsup, training_labels = mlh.load_data( dataset, dataset_path, embedding_source, which_fold=which_fold, keep_labels=keep_labels, missing_labels_val=missing_labels_val, embedding_input=embedding_input) if x_unsup is not None: n_samples_unsup = x_unsup.shape[1] else: n_samples_unsup = 0 # Extract required information from data n_samples, n_feats = x_train.shape print("Number of features : ", n_feats) print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1])) n_targets = y_train.shape[1] # Set some variables batch_size = 128 beta = gamma if (gamma == 0) else beta # Preparing folder to save stuff if embedding_source is None: embedding_name = embedding_input else: embedding_name = embedding_source.replace("_", "").split(".")[0] exp_name += embedding_name.rsplit('/', 1)[::-1][0] + '_' exp_name += 'final_' exp_name += mlh.define_exp_name(keep_labels, alpha, beta, gamma, lmd, n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, n_hidden_s, which_fold, embedding_input, learning_rate, decoder_net_init, encoder_net_init, early_stop_criterion, learning_rate_annealing) print("Experiment: " + exp_name) save_path = os.path.join(save_path, dataset, exp_name) save_copy = os.path.join(save_copy, dataset, exp_name) if not os.path.exists(save_path): os.makedirs(save_path) if not os.path.exists(save_copy): os.makedirs(save_copy) # Prepare Theano variables for inputs and targets input_var_sup = T.matrix('input_sup') input_var_unsup = theano.shared(x_unsup, 'input_unsup') # x_unsup TBD target_var_sup = T.matrix('target_sup') lr = theano.shared(np.float32(learning_rate), 'learning_rate') # Build model print("Building model") # Some checkings # assert len(n_hidden_u) > 0 assert len(n_hidden_t_enc) > 0 assert len(n_hidden_t_dec) > 0 assert n_hidden_t_dec[-1] == n_hidden_t_enc[-1] # Build feature embedding networks (encoding and decoding if gamma > 0) nets, embeddings, pred_feat_emb = mh.build_feat_emb_nets( embedding_source, n_feats, n_samples_unsup, input_var_unsup, n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, gamma, encoder_net_init, decoder_net_init, save_path, random_proj) # Build feature embedding reconstruction networks (if alpha > 0, beta > 0) nets += mh.build_feat_emb_reconst_nets( [alpha, beta], n_samples_unsup, n_hidden_u, [n_hidden_t_enc, n_hidden_t_dec], nets, [encoder_net_init, decoder_net_init]) # Supervised network discrim_net, hidden_rep = mh.build_discrim_net( batch_size, n_feats, input_var_sup, n_hidden_t_enc, n_hidden_s, embeddings[0], disc_nonlinearity, n_targets) # Reconstruct network nets += [ mh.build_reconst_net(hidden_rep, embeddings[1] if len(embeddings) > 1 else None, n_feats, gamma) ] # Load weights if we are resuming job if resume: # Load best model with np.load(os.path.join(save_path, 'model_feat_sel_last.npz')) as f: param_values = [f['arr_%d' % i] 
for i in range(len(f.files))] nlayers = len( lasagne.layers.get_all_params(filter(None, nets) + [discrim_net])) lasagne.layers.set_all_param_values( filter(None, nets) + [discrim_net], param_values[:nlayers]) print("Building and compiling training functions") # Build and compile training functions predictions, predictions_det = mh.define_predictions(nets, start=2) prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net]) prediction_sup = prediction_sup[0] prediction_sup_det = prediction_sup_det[0] # Define losses # reconstruction losses reconst_losses, reconst_losses_det = mh.define_reconst_losses( predictions, predictions_det, [input_var_unsup, input_var_unsup, input_var_sup]) # supervised loss sup_loss, sup_loss_det = mh.define_sup_loss(disc_nonlinearity, prediction_sup, prediction_sup_det, keep_labels, target_var_sup, missing_labels_val) # Define inputs inputs = [input_var_sup, target_var_sup] # Define parameters params = lasagne.layers.get_all_params([discrim_net] + filter(None, nets), trainable=True) params_to_freeze= \ lasagne.layers.get_all_params(filter(None, nets), trainable=False) print('Number of params discrim: ' + str(len(params))) print('Number of params to freeze: ' + str(len(params_to_freeze))) for p in params_to_freeze: new_params = [el for el in params if el != p] params = new_params print('Number of params to update: ' + str(len(params))) # Combine losses loss = sup_loss + alpha*reconst_losses[0] + beta*reconst_losses[1] + \ gamma*reconst_losses[2] loss_det = sup_loss_det + alpha*reconst_losses_det[0] + \ beta*reconst_losses_det[1] + gamma*reconst_losses_det[2] l2_penalty = apply_penalty(params, l2) loss = loss + lmd * l2_penalty loss_det = loss_det + lmd * l2_penalty # Compute network updates updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr) # updates = lasagne.updates.sgd(loss, # params, # learning_rate=lr) # updates = lasagne.updates.momentum(loss, params, # learning_rate=lr, momentum=0.0) # Apply norm constraints on the weights for k in updates.keys(): if updates[k].ndim == 2: updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0) # Compile training function train_fn = theano.function(inputs, loss, updates=updates, on_unused_input='ignore') # Monitoring Labels monitor_labels = [ "reconst. feat. W_enc", "reconst. feat. W_dec", "reconst. loss" ] monitor_labels = [ i for i, j in zip(monitor_labels, reconst_losses) if j != 0 ] monitor_labels += ["feat. W_enc. mean", "feat. W_enc var"] monitor_labels += ["feat. W_dec. mean", "feat. W_dec var"] if \ (embeddings[1] is not None) else [] monitor_labels += ["loss. sup.", "total loss"] # Build and compile test function val_outputs = reconst_losses_det val_outputs = [i for i, j in zip(val_outputs, reconst_losses) if j != 0] val_outputs += [embeddings[0].mean(), embeddings[0].var()] val_outputs += [embeddings[1].mean(), embeddings[1].var()] if \ (embeddings[1] is not None) else [] val_outputs += [sup_loss_det, loss_det] # Compute accuracy and add it to monitoring list test_acc, test_pred = mh.define_test_functions(disc_nonlinearity, prediction_sup, prediction_sup_det, target_var_sup) monitor_labels.append("accuracy") val_outputs.append(test_acc) # Compile prediction function predict = theano.function([input_var_sup], test_pred) # Compile validation function val_fn = theano.function(inputs, [prediction_sup_det] + val_outputs, on_unused_input='ignore') # Finally, launch the training loop. 
print("Starting training...") # Some variables max_patience = 100 patience = 0 train_monitored = [] valid_monitored = [] train_loss = [] # Pre-training monitoring print("Epoch 0 of {}".format(num_epochs)) train_minibatches = mlh.iterate_minibatches(x_train, y_train, batch_size, shuffle=False) train_err = mlh.monitoring(train_minibatches, "train", val_fn, monitor_labels, prec_recall_cutoff) valid_minibatches = mlh.iterate_minibatches(x_valid, y_valid, batch_size, shuffle=False) valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn, monitor_labels, prec_recall_cutoff) # Training loop start_training = time.time() for epoch in range(num_epochs): start_time = time.time() print("Epoch {} of {}".format(epoch + 1, num_epochs)) nb_minibatches = 0 loss_epoch = 0 # Train pass for batch in mlh.iterate_minibatches(x_train, training_labels, batch_size, shuffle=True): loss_epoch += train_fn(*batch) nb_minibatches += 1 loss_epoch /= nb_minibatches train_loss += [loss_epoch] # Monitoring on the training set train_minibatches = mlh.iterate_minibatches(x_train, y_train, batch_size, shuffle=False) train_err = mlh.monitoring(train_minibatches, "train", val_fn, monitor_labels, prec_recall_cutoff) train_monitored += [train_err] # Monitoring on the validation set valid_minibatches = mlh.iterate_minibatches(x_valid, y_valid, batch_size, shuffle=False) valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn, monitor_labels, prec_recall_cutoff) valid_monitored += [valid_err] try: early_stop_val = valid_err[monitor_labels.index( early_stop_criterion)] except: raise ValueError("There is no monitored value by the name of %s" % early_stop_criterion) # Early stopping if epoch == 0: best_valid = early_stop_val elif (early_stop_val > best_valid and early_stop_criterion == 'accuracy') or \ (early_stop_val < best_valid and early_stop_criterion == 'loss. 
sup.'): best_valid = early_stop_val patience = 0 # Save stuff np.savez( os.path.join(save_path, 'model_feat_sel_best.npz'), *lasagne.layers.get_all_param_values( filter(None, nets) + [discrim_net])) np.savez(save_path + "/errors_supervised_best.npz", zip(*train_monitored), zip(*valid_monitored)) # Monitor on the test set now because sometimes the saving doesn't # go well and there isn't a model to load at the end of training if y_test is not None: test_minibatches = mlh.iterate_minibatches(x_test, y_test, 138, shuffle=False) test_err = mlh.monitoring(test_minibatches, "test", val_fn, monitor_labels, prec_recall_cutoff) else: patience += 1 # Save stuff np.savez( os.path.join(save_path, 'model_feat_sel_last.npz'), *lasagne.layers.get_all_param_values( filter(None, nets) + [discrim_net])) np.savez(save_path + "/errors_supervised_last.npz", zip(*train_monitored), zip(*valid_monitored)) # End training if patience == max_patience or epoch == num_epochs - 1: print("Ending training") # Load best model with np.load(os.path.join(save_path, 'model_feat_sel_best.npz')) as f: param_values = [f['arr_%d' % i] for i in range(len(f.files))] nlayers = len( lasagne.layers.get_all_params( filter(None, nets) + [discrim_net])) lasagne.layers.set_all_param_values( filter(None, nets) + [discrim_net], param_values[:nlayers]) if embedding_source is None: # Save embedding pred = pred_feat_emb() np.savez(os.path.join(save_path, 'feature_embedding.npz'), pred) # Training set results train_minibatches = mlh.iterate_minibatches(x_train, y_train, batch_size, shuffle=False) train_err = mlh.monitoring(train_minibatches, "train", val_fn, monitor_labels, prec_recall_cutoff) # Validation set results valid_minibatches = mlh.iterate_minibatches(x_valid, y_valid, batch_size, shuffle=False) valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn, monitor_labels, prec_recall_cutoff) # Test set results if y_test is not None: test_minibatches = mlh.iterate_minibatches(x_test, y_test, 138, shuffle=False) test_err = mlh.monitoring(test_minibatches, "test", val_fn, monitor_labels, prec_recall_cutoff) np.savez(os.path.join(save_path, 'final_errors.npz'), test_err) else: for minibatch in mlh.iterate_testbatches(x_test, 138, shuffle=False): test_predictions = [] test_predictions += [predict(minibatch)] np.savez(os.path.join(save_path, 'test_predictions.npz'), test_predictions) # Stop print(" epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time)) break print(" epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time)) # Anneal the learning rate lr.set_value(float(lr.get_value() * learning_rate_annealing)) # Print and save all final errors for train, validation and test print("Training time:\t\t\t{:.3f}s".format(time.time() - start_training)) print("test_err:", test_err) # Copy files to loadpath if save_path != save_copy: print('Copying model and other training files to {}'.format(save_copy)) copy_tree(save_path, save_copy)
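# The `params_to_freeze` bookkeeping above (collect the embedding networks'
# non-trainable parameters and drop them from the list handed to the optimizer)
# can be expressed as a small helper. This is only a sketch of the list
# filtering; `task_net` and `frozen_nets` are hypothetical, already-built
# Lasagne networks, not the DietNet graphs themselves.
import lasagne

def updatable_params(task_net, frozen_nets):
    """Trainable parameters of `task_net`, minus any parameter that also
    belongs to one of the networks in `frozen_nets`."""
    params = lasagne.layers.get_all_params(task_net, trainable=True)
    frozen = set()
    for net in frozen_nets:
        frozen.update(lasagne.layers.get_all_params(net))
    return [p for p in params if p not in frozen]

# Usage sketch:
#   params = updatable_params(discrim_net, feature_embedding_nets)
#   updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr)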
def execute(dataset, learning_rate=0.00001, learning_rate_annealing=1.0, alpha=0., beta=1., lmd=0., encoder_units=[1024, 512, 256], num_epochs=500, which_fold=1, save_path=None, save_copy=None, dataset_path=None, num_fully_connected=0, exp_name='', init_args=None): # Reading dataset print("Loading data") x_unsup = mlh.load_data(dataset, dataset_path, None, which_fold=which_fold, keep_labels=1.0, missing_labels_val=-1.0, embedding_input='bin', transpose=True) x_train = x_unsup[0][0] x_valid = x_unsup[1][0] n_features = x_train.shape[1] exp_name += "learn_gene_vector_h" for e in encoder_units: exp_name += ('-' + str(e)) exp_name += '_a-' + str(alpha) exp_name += '_b-' + str(beta) # exp_name += '_g-' + str(gamma) exp_name += '_l-' + str(lmd) exp_name += '_lr-' + str(learning_rate) save_path = os.path.join(save_path, exp_name) save_copy = os.path.join(save_copy, exp_name) if not os.path.exists(save_path): os.makedirs(save_path) if not os.path.exists(save_copy): os.makedirs(save_copy) # Prepare Theano variables for inputs and targets input_var = T.matrix('input') target_var = T.matrix('target') target_reconst = T.matrix('target') lr = theano.shared(np.float32(learning_rate), 'learning_rate') batch_size = 128 # building network encoder = InputLayer((batch_size, n_features), input_var) # building the encoder and decoder #import pdb; pdb.set_trace() for i in range(len(encoder_units)): encoder = DenseLayer( encoder, num_units=encoder_units[i], W=HeNormal('relu'), nonlinearity=rectify) # if i < len(encoder_units)-1 else linear) embedding = lasagne.layers.get_output(encoder) params = lasagne.layers.get_all_params(encoder, trainable=True) monitor_labels = ["embedding min", "embedding max"] val_outputs = [embedding.min(), embedding.max()] nets = [encoder] if alpha > 0: decoder_units = encoder_units[::-1][1:] print(decoder_units) decoder = encoder for i in range(len(decoder_units)): decoder = DenseLayer(decoder, num_units=decoder_units[i], W=HeNormal('relu'), nonlinearity=rectify) decoder = DenseLayer(decoder, num_units=n_features, W=convert_initialization( init_args["decoder_init"], nonlinearity="sigmoid"), nonlinearity=sigmoid) prediction_reconst = lasagne.layers.get_output(decoder) # Reconstruction error loss_reconst = lasagne.objectives.binary_crossentropy( prediction_reconst, target_reconst).mean() # loss_reconst = mh.define_sampled_mean_bincrossentropy( # prediction_reconst, target_reconst, gamma=gamma) #loss_reconst = mh.dice_coef_loss( # target_reconst, prediction_reconst).mean() accuracy = T.eq(T.gt(prediction_reconst, 0.5), target_reconst).mean() params += lasagne.layers.get_all_params(decoder, trainable=True) monitor_labels += ["reconst. loss", "reconst. 
accuracy"] val_outputs += [loss_reconst, accuracy] nets += [decoder] # sparsity_reconst = gamma * l1(prediction_reconst) # roh = input_var.mean(0) # sparsity_reconst = ((roh * T.log(roh / (prediction_reconst.mean(0)+1e-8))) +\ # ((1 - roh) * T.log((1 - roh) / (1 - prediction_reconst + 1e-8)))).sum() else: loss_reconst = 0 # sparsity_reconst = 0 if beta > 0: predictor_laysize = [encoder_units[-1]] * num_fully_connected predictor = encoder for i in range(len(predictor_laysize)): predictor = DenseLayer(predictor, num_units=predictor_laysize[i], nonlinearity=rectify, W=convert_initialization( init_args["predictor_init"], nonlinearity="relu")) predictor = DenseLayer(predictor, num_units=2, nonlinearity=sigmoid, W=convert_initialization( init_args["predictor_init"], nonlinearity="sigmoid")) prediction_var = lasagne.layers.get_output(predictor) # w2v error # loss_pred = lasagne.objectives.binary_crossentropy( # prediction_var, target_var # ).mean() loss_pred = mh.dice_coef_loss(target_var, prediction_var).mean() accuracy = T.eq(T.gt(prediction_var, 0.5), target_var).mean() params += lasagne.layers.get_all_params(predictor, trainable=True) monitor_labels += ["pred. loss", "pred. accuracy"] val_outputs += [loss_pred, accuracy] nets += [predictor] # sparsity_pred = gamma * l1(prediction_var) # roh = 0.05 # sparsity_pred = ((roh * T.log(roh / prediction_pred.mean(0))) +\ # ((1 - roh) * T.log((1 - roh) / (1 - prediction_pred)))).sum() else: loss_pred = 0 # sparsity_pred = 0 # Combine losses loss = alpha * loss_reconst + beta * loss_pred # sparsity_pred # + sparsity_reconst # applying weight decay l2_penalty = apply_penalty(params, l2) loss = loss + lmd * l2_penalty # loss = loss + lmd*l2_penalty val_outputs += [loss] monitor_labels += ['loss'] # Some variables max_patience = 100 patience = 0 train_monitored = [] valid_monitored = [] train_loss = [] updates = lasagne.updates.adam(loss, params, learning_rate=lr) for k in updates.keys(): if updates[k].ndim == 2: updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0) inputs = [input_var, target_var, target_reconst] # Compile training function print "Compiling training function" train_fn = theano.function(inputs, loss, updates=updates, on_unused_input='ignore') val_fn = theano.function(inputs, [val_outputs[0]] + val_outputs, on_unused_input='ignore') if alpha > 0: pred_fn = theano.function([input_var], prediction_reconst) start_training = time.time() # data_gen = data_generator(x_train, batch_size) print "Starting training" for epoch in range(num_epochs): start_time = time.time() print("Epoch {} of {}".format(epoch + 1, num_epochs)) nb_minibatches = 0 loss_epoch = 0 for x, y, target_reconst_val in data_generator(x_train, batch_size, shuffle=True): loss_epoch += train_fn(x, y, target_reconst_val) nb_minibatches += 1 if alpha > 0: pr = pred_fn(x) print('min pr:' + str(pr.min())) print('max pr:' + str(pr.max())) print('mean pr:' + str(pr.mean())) loss_epoch /= nb_minibatches train_loss += [loss_epoch] # Monitoring on the training set train_minibatches = data_generator(x_train, batch_size) train_err = mlh.monitoring(train_minibatches, "train", val_fn, monitor_labels, 0) train_monitored += [train_err] # Monitoring on the validation set valid_minibatches = data_generator(x_valid, batch_size) valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn, monitor_labels, 0) valid_monitored += [valid_err] early_stop_criterion = 'loss' early_stop_val = valid_err[monitor_labels.index(early_stop_criterion)] # Early stopping if epoch == 0: best_valid = 
early_stop_val elif early_stop_val < best_valid and early_stop_criterion == 'loss': best_valid = early_stop_val patience = 0 # Save stuff np.savez(save_path + '/model_snp2vec_best.npz', *lasagne.layers.get_all_param_values(nets)) np.savez(save_path + "/errors_snp2vec_best.npz", zip(*train_monitored), zip(*valid_monitored)) else: patience += 1 np.savez(os.path.join(save_path, 'model_snp2vec_last.npz'), *lasagne.layers.get_all_param_values(nets)) np.savez(save_path + "/errors_snp2vec_last.npz", zip(*train_monitored), zip(*valid_monitored)) # End training if (patience == max_patience) or (epoch == num_epochs - 1): print("Ending training") # Load best model if not os.path.exists(save_path + '/model_snp2vec_best.npz'): print( "No saved model to be tested and/or generate" " the embedding !") else: with np.load(save_path + '/model_snp2vec_best.npz') as f: param_values = [ f['arr_%d' % i] for i in range(len(f.files)) ] lasagne.layers.set_all_param_values(nets, param_values) # Training set results train_minibatches = data_generator(x_train, batch_size) train_err = mlh.monitoring(train_minibatches, "train", val_fn, monitor_labels, 0) # Validation set results valid_minibatches = data_generator(x_valid, batch_size) valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn, monitor_labels, 0) # Stop print(" epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time)) break print(" epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time)) # Anneal the learning rate lr.set_value(float(lr.get_value() * learning_rate_annealing)) # Copy files to loadpath if save_path != save_copy: print('Copying model and other training files to {}'.format(save_copy)) copy_tree(save_path, save_copy)
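# `mh.dice_coef_loss` above is a project-specific helper. A common formulation
# of the Dice-coefficient loss it presumably implements is sketched below; the
# smoothing constant and the per-example reduction over the last axis are
# assumptions, not taken from the project code.
import theano.tensor as T

def dice_coef_loss(y_true, y_pred, smooth=1.0):
    # Dice coefficient per example over the last axis, returned as a loss
    # (1 - coefficient) so that lower is better.
    intersection = T.sum(y_true * y_pred, axis=-1)
    denom = T.sum(y_true, axis=-1) + T.sum(y_pred, axis=-1)
    dice = (2.0 * intersection + smooth) / (denom + smooth)
    return 1.0 - dice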
def reset():
    if any(np.isnan(scale.get_value()) for scale in scales):
        for scale in scales:
            scale.set_value(1.)
        for l in l_hiddens:
            l.b.set_value(Constant()(l.b.get_value().shape))
            l.W.set_value(Orthogonal()(l.W.get_value().shape))
        l_out.b.set_value(Constant()(l_out.b.get_value().shape))
        l_out.W.set_value(Orthogonal()(l_out.W.get_value().shape))
        for p in (p for u in (updates_ada, updates_other, updates_scal)
                  for p in u if p not in get_all_params(l_out)):
            p.set_value(Constant()(p.get_value().shape))

chunky_l2 = apply_penalty(get_all_params(l_out, regularizable=True), l2) - l2(
    l_hiddens[0].W) + l2(l_hiddens[0].W / T.reshape(vscale, (206279, 1)))
chunky_l1 = apply_penalty(get_all_params(l_out, regularizable=True), l1) - l1(
    l_hiddens[0].W) + l1(l_hiddens[0].W / T.reshape(vscale, (206279, 1)))
simple_l2 = apply_penalty(get_all_params(l_out, regularizable=True), l2)

# l_out2 = DenseLayer(dropout(l_hiddens2[-1]), num_units=y.shape[1])
# l_out = lasagne.layers.NonlinearityLayer(lasagne.layers.ElemwiseSumLayer((l_out1, l_out2), .5), softmax)
# categorical_crossentropy(get_output(l_out)[train_indices])

target = T.fmatrix(name="target")
# f = theano.function([l_in.input_var], get_output(l_out), allow_input_downcast=True)
# f(X[0, :].toarray())

loss = categorical_crossentropy(get_output(l_out), target).mean()
# train_loss_smoo = categorical_crossentropy(get_output(l_out, deterministic=True)[train_indices, ], target[train_indices, ]).mean()
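# Because `apply_penalty(params, l2)` is simply the sum of `l2` over each tensor
# in `params`, subtracting `l2(W)` and adding `l2(W / scale)` (as in `chunky_l2`
# above) swaps in a rescaled penalty for that single weight matrix while leaving
# every other parameter's penalty untouched. A minimal illustration with a
# made-up two-layer network and per-input scale vector (the sizes are
# hypothetical, not the 206279 features used above):
import numpy as np
import theano
import theano.tensor as T
import lasagne
from lasagne.regularization import apply_penalty, l2

n_in, n_hid, n_out = 20, 8, 3
l_in = lasagne.layers.InputLayer((None, n_in))
l_hid = lasagne.layers.DenseLayer(l_in, num_units=n_hid)
l_out_demo = lasagne.layers.DenseLayer(l_hid, num_units=n_out)

# Per-input scale, analogous to `vscale` above (assumed shape (n_in, 1)).
vscale_demo = theano.shared(np.ones((n_in, 1), dtype=theano.config.floatX))

reg_params = lasagne.layers.get_all_params(l_out_demo, regularizable=True)
plain_l2 = apply_penalty(reg_params, l2)
# Replace the penalty on the first weight matrix with a penalty on its
# rescaled version; all remaining parameters keep the plain L2 term.
chunky_l2_demo = plain_l2 - l2(l_hid.W) + l2(l_hid.W / vscale_demo)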
def __init__(self, W=None, W_path=None, K=300, num_hidden=256, batch_size=None, grad_clip=100., max_sent_len=200, num_classes=2, **kwargs): W = W V = len(W) K = int(K) num_hidden = int(num_hidden) batch_size = int(batch_size) grad_clip = int(grad_clip) max_seq_len = int(max_sent_len) max_post_len = int(kwargs["max_post_len"]) max_len = max(max_seq_len, max_post_len) max_seq_len = max_len max_post_len = max_len num_classes = int(num_classes) ''' Boolean on and/or sentence and word level attention''' ''' to use context or not''' separate_attention_context_sents = str_to_bool( kwargs["separate_attention_context"]) separate_attention_response_sents = str_to_bool( kwargs["separate_attention_response"]) separate_attention_context_words = str_to_bool( kwargs["separate_attention_context_words"]) separate_attention_response_words = str_to_bool( kwargs["separate_attention_response_words"]) print("separate_attention_context_sentence is : {}\n".format( separate_attention_context_sents)) print("separate_attention_response_sentence is : {}\n".format( separate_attention_response_sents)) print("separate_attention_context_words is : {}\n".format( separate_attention_context_words)) print("separate_attention_response_words is : {}\n".format( separate_attention_response_words)) #B x S x N tensor of batches of context idxs_context = T.itensor3('idxs_context') #imatrix, i = int #B x S x N matrix mask_context_words = T.itensor3('mask_context_words') #B x S matrix mask_context_sents = T.imatrix('mask_context_sents') #B x S x N tensor of batches of responses idxs_response = T.itensor3('idxs_response') #imatrix, i = int #B x S X N matrix for words mask_response_words = T.itensor3('mask_response_words') #B x S matrix for sentences mask_response_sents = T.imatrix('mask_response_sents') #B-long vector gold = T.ivector('y') # dropout dropout_val = T.scalar('p_dropout') #lambda, cost lambda_cost = T.scalar('lambda_w') #biases biases_cost = T.matrix('biases') #weights weights = T.ivector('weights') ''' check biases''' biases_present = False if biases_present: lstm_biases = lasagne.layers.InputLayer(shape=(None, 1), input_var=biases_cost) ''' building the context layer via function''' if separate_attention_context_sents: lstm_hidden_context,lstm_attn_words_context,lstm_attn_sents_context = self.buildThePostLayer(idxs_context,mask_context_words,\ mask_context_sents,\ separate_attention_context_words,\ separate_attention_context_sents,num_hidden,grad_clip,V,K,W,max_post_len,max_sent_len) ''' do the same for response layer''' if separate_attention_response_sents: lstm_hidden_response,lstm_attn_words_response,lstm_attn_sents_response = self.buildThePostLayer(idxs_response,mask_response_words,\ mask_context_sents,\ separate_attention_context_words,\ separate_attention_context_sents,num_hidden,grad_clip,V,K,W,max_post_len,max_sent_len) print('...') print('finished compiling...') ''' prepare the final network of connections now''' if separate_attention_response_sents and separate_attention_context_sents: output, network = self.buildNetwork(lstm_hidden_context, lstm_hidden_response, num_classes) elif separate_attention_context_sents: output, network = self.buildNetworkOnlyContext( lstm_hidden_context, num_classes) '''Define objective function (cost) to minimize mean cross-entropy error''' params = lasagne.layers.get_all_params(network) cost = lasagne.objectives.categorical_crossentropy(output, gold).mean() lambda_w = .000001 cost += lambda_w * apply_penalty(params, l2) grad_updates = lasagne.updates.adam(cost, params) test_output 
= lasagne.layers.get_output(network, deterministic=True) val_cost_fn = lasagne.objectives.categorical_crossentropy( test_output, gold).mean() preds = T.argmax(test_output, axis=1) val_acc_fn = T.mean(T.eq(preds, gold), dtype=theano.config.floatX) if separate_attention_context_sents and separate_attention_response_sents: self.val_fn = theano.function([idxs_context, mask_context_words, mask_context_sents, idxs_response, \ mask_response_words, mask_response_sents, gold], [val_cost_fn, val_acc_fn, preds], allow_input_downcast=True,on_unused_input='warn') # Compile train objective print "Compiling training, testing, prediction functions" self.train = theano.function(inputs = [idxs_context, mask_context_words, mask_context_sents,\ idxs_response, mask_response_words, mask_response_sents, gold],\ outputs = cost, updates = grad_updates, allow_input_downcast=True,on_unused_input='warn') self.test = theano.function(inputs = [idxs_context, mask_context_words, mask_context_sents,idxs_response,\ mask_response_words, mask_response_sents, gold],\ outputs = val_acc_fn,allow_input_downcast=True,on_unused_input='warn') self.pred = theano.function(inputs = [idxs_context, mask_context_words, mask_context_sents, \ idxs_response, mask_response_words, mask_response_sents],\ outputs = preds,allow_input_downcast=True,on_unused_input='warn') elif separate_attention_context_sents: self.val_fn = theano.function([idxs_context, mask_context_words, mask_context_sents, \ gold], [val_cost_fn, val_acc_fn, preds], allow_input_downcast=True,on_unused_input='warn') print "Compiling training, testing, prediction functions" self.train = theano.function(inputs = [idxs_context, mask_context_words, mask_context_sents,\ gold],\ outputs = cost, updates = grad_updates, allow_input_downcast=True,on_unused_input='warn') self.test = theano.function(inputs = [idxs_context, mask_context_words, mask_context_sents,\ gold],\ outputs = val_acc_fn,allow_input_downcast=True,on_unused_input='warn') self.pred = theano.function(inputs = [idxs_context, mask_context_words, mask_context_sents \ ],\ outputs = preds,allow_input_downcast=True,on_unused_input='warn') if separate_attention_response_sents: sentence_attention = lasagne.layers.get_output( lstm_attn_sents_response) #if add_biases: # inputs = inputs[:-1] self.sentence_attention_response = theano.function([idxs_context, mask_context_words,\ mask_context_sents,idxs_response, mask_response_words, mask_response_sents], sentence_attention, allow_input_downcast=True, on_unused_input='warn') if separate_attention_context_sents: sentence_attention_context = lasagne.layers.get_output( lstm_attn_sents_context) #if add_biases: # inputs = inputs[:-1] self.sentence_attention_context = theano.function([idxs_context, mask_context_words,\ mask_context_sents,idxs_response, mask_response_words, mask_response_sents], [sentence_attention_context, preds], allow_input_downcast=True, on_unused_input='warn') if separate_attention_response_words: sentence_attention_words = lasagne.layers.get_output( lstm_attn_words_response) #if add_biases: # inputs = inputs[:-1] self.sentence_attention_response_words = theano.function( [ idxs_context, mask_context_words, mask_context_sents, idxs_response, mask_response_words, mask_response_sents ], sentence_attention_words, allow_input_downcast=True, on_unused_input='warn') if separate_attention_context_words: sentence_attention_context_words = lasagne.layers.get_output( lstm_attn_words_context) #if add_biases: # inputs = inputs[:-1] self.sentence_attention_context_words = 
theano.function(
                [idxs_context, mask_context_words, mask_context_sents,
                 idxs_response, mask_response_words, mask_response_sents],
                sentence_attention_context_words,
                allow_input_downcast=True, on_unused_input='warn')

        '''compare the results with regular code and then add the bias etc.'''
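# In the attention model above, the penalty is applied to every parameter
# returned by `get_all_params(network)`, which includes biases. Lasagne's
# `regularizable` tag allows weight decay to be restricted to weight matrices
# only. A hedged sketch of that variant (the tiny network here is a stand-in,
# not the LSTM model above):
import theano.tensor as T
import lasagne
from lasagne.regularization import apply_penalty, l2

input_var = T.matrix('input')
gold = T.ivector('y')
lambda_w = .000001

network = lasagne.layers.InputLayer((None, 50), input_var)
network = lasagne.layers.DenseLayer(network, num_units=2,
                                    nonlinearity=lasagne.nonlinearities.softmax)

output = lasagne.layers.get_output(network)
cost = lasagne.objectives.categorical_crossentropy(output, gold).mean()

# Penalize only parameters tagged `regularizable` (weights, not biases).
reg_params = lasagne.layers.get_all_params(network, regularizable=True)
cost = cost + lambda_w * apply_penalty(reg_params, l2)

# Updates are still computed with respect to all trainable parameters.
params = lasagne.layers.get_all_params(network, trainable=True)
grad_updates = lasagne.updates.adam(cost, params)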
# Get regularizable params
regularization_params = layers.get_all_params(unsupervised_graph, regularizable=True) + \
    layers.get_all_params(supervised_graph, regularizable=True)
regularization_params = utils.unique(regularization_params)

# Creating loss functions
# The train loss has to take into account whether the image is labeled or not
if run_parameters.unsupervised_cost_fun == 'squared_error':
    loss1 = objectives.squared_error(reconstruction, input_var)
elif run_parameters.unsupervised_cost_fun == 'categorical_crossentropy':
    loss1 = objectives.categorical_crossentropy(reconstruction, input_var)
if supervised_cost_fun == 'squared_error':
    loss2 = objectives.squared_error(prediction, target_var) * repeat_col(labeled_var, 10)
elif supervised_cost_fun == 'categorical_crossentropy':
    loss2 = objectives.categorical_crossentropy(prediction, target_var) * labeled_var.T

l2_penalties = regularization.apply_penalty(regularization_params, regularization.l2)

sparse_layers = get_all_sparse_layers(unsupervised_graph)
sparse_layers_output = layers.get_output(sparse_layers, deterministic=True)
if run_parameters.sparse_regularizer_type == 0:
    sparse_regularizer = reduce(
        lambda x, y: x + T.clip((T.mean(abs(y)) - run_parameters.sparse_regularize_factor) * y.size,
                                0, float('inf')),
        sparse_layers_output, 0)
elif run_parameters.sparse_regularizer_type == 1:
    sparse_regularizer = reduce(
        lambda x, y: x + T.clip(T.mean(abs(y), axis=1) - run_parameters.sparse_regularize_factor,
                                0, float('inf')).sum() * y.shape[1],
        sparse_layers_output, 0)

loss = losses_ratio[0] * loss1.mean() + \
    losses_ratio[1] * loss2.mean() + \
    losses_ratio[2] * l2_penalties.mean() + \
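# `utils.unique` above is a project helper. De-duplication matters because the
# two graphs may share layers and `apply_penalty` simply sums `l2` over the
# list, so a shared weight matrix would otherwise be penalized twice. A hedged
# sketch of the same idea using a plain order-preserving unique:
from lasagne import layers, regularization

def unique_params(seq):
    # Order-preserving de-duplication; Theano shared variables are hashable.
    seen = set()
    return [p for p in seq if not (p in seen or seen.add(p))]

def l2_penalty_over_graphs(graphs):
    params = []
    for graph in graphs:
        params += layers.get_all_params(graph, regularizable=True)
    return regularization.apply_penalty(unique_params(params),
                                        regularization.l2)

# Usage sketch, with the two output layers built earlier:
#   l2_penalties = l2_penalty_over_graphs([unsupervised_graph, supervised_graph])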
def execute(dataset, n_hidden_u, num_epochs=500, learning_rate=.001, learning_rate_annealing=1.0, lmd=.0001, embedding_input='raw', which_fold=0, save_path='/Tmp/$USER/feature_selection/newmodel/', save_copy='/Tmp/$USER/feature_selection/newmodel/', dataset_path='/Tmp/$USER/feature_selection/newmodel/'): # Load the dataset print("Loading data") x_unsup = mlh.load_data(dataset, dataset_path, None, which_fold=which_fold, keep_labels=1.0, missing_labels_val=-1.0, embedding_input=embedding_input, transpose=True) x_train = x_unsup[0][0] x_valid = x_unsup[1][0] # Extract required information from data n_row, n_col = x_train.shape print('Data size ' + str(n_row) + 'x' + str(n_col)) # Set some variables batch_size = 256 # Define experiment name exp_name = 'pretrain_' + mlh.define_exp_name( 1., 0, 0, 0, lmd, n_hidden_u, [], [], [], which_fold, embedding_input, learning_rate, 0, 0, 'reconst_loss', learning_rate_annealing) print('Experiment: ' + exp_name) # Preparing folder to save stuff save_path = os.path.join(save_path, dataset, exp_name) save_copy = os.path.join(save_copy, dataset, exp_name) if not os.path.exists(save_path): os.makedirs(save_path) # Prepare Theano variables for inputs and targets input_var = T.matrix('input_unsup') lr = theano.shared(np.float32(learning_rate), 'learning_rate') # Build model print("Building model") # Some checkings assert len(n_hidden_u) > 0 # Build unsupervised network encoder_net = InputLayer((None, n_col), input_var) for out in n_hidden_u: encoder_net = DenseLayer(encoder_net, num_units=out, nonlinearity=tanh) encoder_net = DropoutLayer(encoder_net) decoder_net = encoder_net for i in range(len(n_hidden_u) - 2, -1, -1): decoder_net = DenseLayer(decoder_net, num_units=n_hidden_u[i], nonlinearity=linear) decoder_net = DropoutLayer(decoder_net) decoder_net = DenseLayer(decoder_net, num_units=n_col, nonlinearity=linear) if embedding_input == 'raw' or embedding_input == 'w2v': final_nonlin = linear elif embedding_input == 'bin': final_nonlin = sigmoid elif 'histo' in embedding_input: final_nonlin = softmax if embedding_input == 'histo3x26': laySize = lasagne.layers.get_output(decoder_net).shape decoder_net = ReshapeLayer(decoder_net, (laySize[0] * 26, 3)) decoder_net = NonlinearityLayer(decoder_net, nonlinearity=final_nonlin) if embedding_input == 'histo3x26': decoder_net = ReshapeLayer(decoder_net, (laySize[0], laySize[1])) print("Building and compiling training functions") # Build and compile training functions predictions, predictions_det = mh.define_predictions( [encoder_net, decoder_net], start=0) prediction_sup, prediction_sup_det = mh.define_predictions( [encoder_net, decoder_net], start=0) # Define losses # reconstruction losses loss, loss_det = mh.define_loss(predictions[1], predictions_det[1], input_var, embedding_input) # Define parameters params = lasagne.layers.get_all_params(decoder_net, trainable=True) l2_penalty = apply_penalty(params, l2) loss = loss + lmd * l2_penalty loss_det = loss_det + lmd * l2_penalty # Compute network updates updates = lasagne.updates.adam(loss, params, learning_rate=lr) # updates = lasagne.updates.sgd(loss, # params, # learning_rate=lr) # updates = lasagne.updates.momentum(loss, params, # learning_rate=lr, momentum=0.0) # Apply norm constraints on the weights for k in updates.keys(): if updates[k].ndim == 2: updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0) # Compile training function train_fn = theano.function([input_var], loss, updates=updates, on_unused_input='ignore') # Expressions required for test 
monitor_labels = ['loss'] val_outputs = [loss_det] # Add some monitoring on the learned feature embedding val_outputs += [ predictions[0].min(), predictions[0].mean(), predictions[0].max(), predictions[0].var() ] monitor_labels += [ "feat. emb. min", "feat. emb. mean", "feat. emb. max", "feat. emb. var" ] # Compile validation function val_fn = theano.function([input_var], val_outputs) pred_feat_emb = theano.function([input_var], predictions_det[0]) # Finally, launch the training loop. print("Starting training...") # Some variables max_patience = 100 patience = 0 train_monitored = [] valid_monitored = [] train_loss = [] nb_minibatches = n_row / batch_size print("Nb of minibatches: " + str(nb_minibatches)) start_training = time.time() for epoch in range(num_epochs): start_time = time.time() print("Epoch {} of {}".format(epoch + 1, num_epochs)) loss_epoch = 0 # Train pass for batch in mlh.iterate_minibatches_unsup(x_train, batch_size, shuffle=True): loss_epoch += train_fn(batch) loss_epoch /= nb_minibatches train_loss += [loss_epoch] train_minibatches = mlh.iterate_minibatches_unsup(x_train, batch_size, shuffle=True) train_err = mlh.monitoring(train_minibatches, "train", val_fn, monitor_labels, start=0) train_monitored += [train_err] # Validation pass valid_minibatches = mlh.iterate_minibatches_unsup(x_valid, batch_size, shuffle=True) valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn, monitor_labels, start=0) valid_monitored += [valid_err] try: early_stop_val = valid_err[monitor_labels.index('loss')] except: raise ValueError("There is no monitored value by the name of %s" % early_stop_criterion) # Eearly stopping if epoch == 0: best_valid = early_stop_val elif early_stop_val < best_valid: best_valid = early_stop_val patience = 0 # Save stuff np.savez( os.path.join(save_path, 'model_enc_unsupervised_best.npz'), *lasagne.layers.get_all_param_values(encoder_net)) np.savez(os.path.join(save_path, 'model_ae_unsupervised_best.npz'), *lasagne.layers.get_all_param_values(encoder_net)) np.savez(os.path.join(save_path, "errors_unsupervised_best.npz"), zip(*train_monitored), zip(*valid_monitored)) else: patience += 1 # Save stuff np.savez( os.path.join(save_path, 'model_enc_unsupervised_last.npz'), *lasagne.layers.get_all_param_values(encoder_net)) np.savez(os.path.join(save_path, 'model_ae_unsupervised_last.npz'), *lasagne.layers.get_all_param_values(encoder_net)) np.savez(os.path.join(save_path, "errors_unsupervised_last.npz"), zip(*train_monitored), zip(*valid_monitored)) # End training if patience == max_patience or epoch == num_epochs - 1: print(" Ending training") # Load unsupervised best model if not os.path.exists(save_path + '/model_enc_unsupervised_best.npz'): print("No saved model to be tested and/or generate" " the embedding !") else: with np.load(save_path + '/model_enc_unsupervised_best.npz', ) as f: param_values = [ f['arr_%d' % i] for i in range(len(f.files)) ] lasagne.layers.set_all_param_values( encoder_net, param_values) # Save embedding preds = [] for batch in mlh.iterate_minibatches_unsup(x_train, 1, shuffle=False): preds.append(pred_feat_emb(batch)) for batch in mlh.iterate_minibatches_unsup(x_valid, 1, shuffle=False): preds.append(pred_feat_emb(batch)) preds = np.vstack(preds) np.savez(os.path.join(save_path, 'feature_embedding.npz'), preds) # Stop print(" epoch time:\t\t\t{:.3f}s".format(time.time() - start_time)) break print(" epoch time:\t\t\t{:.3f}s".format(time.time() - start_time)) # Anneal the learning rate lr.set_value(float(lr.get_value() * 
learning_rate_annealing))

    # Print all final errors for train, validation and test
    print("Training time:\t\t\t{:.3f}s".format(time.time() - start_training))

    # Copy files to loadpath
    if save_path != save_copy:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)
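# The learning-rate annealing used throughout these scripts relies on passing a
# Theano shared variable as `learning_rate`, so it can be shrunk between epochs
# without recompiling the training function. A minimal sketch of just that
# mechanism (the tiny network and the annealing factor are placeholders):
import numpy as np
import theano
import theano.tensor as T
import lasagne

x = T.matrix('x')
y = T.ivector('y')
net = lasagne.layers.InputLayer((None, 30), x)
net = lasagne.layers.DenseLayer(net, num_units=5,
                                nonlinearity=lasagne.nonlinearities.softmax)
loss = lasagne.objectives.categorical_crossentropy(
    lasagne.layers.get_output(net), y).mean()
params = lasagne.layers.get_all_params(net, trainable=True)

# The compiled function reads the shared learning rate's current value at every
# call, so annealing does not require recompilation.
lr = theano.shared(np.float32(0.001), 'learning_rate')
updates = lasagne.updates.adam(loss, params, learning_rate=lr)
train_fn = theano.function([x, y], loss, updates=updates)

learning_rate_annealing = 0.99
for epoch in range(10):
    # ... run the epoch's minibatches through train_fn here ...
    lr.set_value(np.float32(lr.get_value() * learning_rate_annealing))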
def execute(dataset, n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, n_hidden_s, embedding_source=histo_GenotypicFrequency_perclass, additional_unsup_input=None, num_epochs=500, learning_rate=.001, learning_rate_annealing=1.0, alpha=1, beta=1, delta=1, gamma=1, lmd=.0001, disc_nonlinearity="sigmoid", encoder_net_init=0.2, decoder_net_init=0.2, optimizer="rmsprop", max_patience=100, batchnorm=0, input_dropout=1.0, embedding_noise=0.0, keep_labels=1.0, prec_recall_cutoff=True, missing_labels_val=-1.0, which_fold=0, early_stop_criterion='loss_sup_det', input_decoder_mode="regression", save_path='/Users/Marie-Elyse/Downloads/embedding2', save_copy='/Users/Marie-Elyse/Downloads/embedding2', dataset_path='/Users/Marie-Elyse/Downloads/embedding2', resume=False, exp_name='', random_proj=0, bootstrap_snp_embeddings=0, bootstrap_cutoff=0.9): # Prepare embedding information : # - If no embedding is specified, use the transposed input matrix # - If a file is specified, use it's content as feature embeddings # - Else (a embedding category like 'histo3x26' is provided), load a # pregenerated embedding of the specified category if embedding_source is None or embedding_source == "raw": embedding_source = None embedding_input = 'raw' elif os.path.exists(embedding_source): embedding_input = embedding_source else: embedding_input = embedding_source embedding_source = os.path.join( dataset_path, embedding_input + '_fold' + str(which_fold) + '.npy') # Load the dataset print("Loading data") (x_train, y_train, exmpl_ids_train, x_valid, y_valid, exmpl_ids_valid, x_test, y_test, exmpl_ids_test, x_unsup, training_labels, feature_names, label_names) = mlh.load_data(dataset, dataset_path, embedding_source, which_fold=which_fold, keep_labels=keep_labels, missing_labels_val=missing_labels_val, embedding_input=embedding_input, norm=False) # Load the additional unsupervised data, if some is specified if additional_unsup_input is not None: print("Adding additional data to the model's unsupervised inputs") paths = additional_unsup_input.split(";") additional_unsup_data = [np.load(p) for p in paths] print(x_unsup.shape) x_unsup = np.hstack(additional_unsup_data + [x_unsup]) print(x_unsup.shape) if x_unsup is not None: n_samples_unsup = x_unsup.shape[1] else: n_samples_unsup = 0 original_x_train = x_train.copy() original_x_valid = x_valid.copy() original_x_test = x_test.copy() # Change how the missing data values are encoded. Right now they are # encoded as being the mean of the corresponding feature so that, after # feature normalization, they will be 0s. However, this prevents us from # transfering the minibatch data as int8 so we replace those values with -1s. for i in range(x_train.shape[1]): feature_mean = x_train[:, i].mean() x_train[:, i] = mh.replace_arr_value(x_train[:, i], feature_mean, -1) x_valid[:, i] = mh.replace_arr_value(x_valid[:, i], feature_mean, -1) x_test[:, i] = mh.replace_arr_value(x_test[:, i], feature_mean, -1) x_train = x_train.astype("int8") x_valid = x_valid.astype("int8") x_test = x_test.astype("int8") # Normalize the input data. The mlh.load_data() function already offers # this feature but we need to do it here so that we will have access to # both the normalized and unnormalized input data. norm_mus = original_x_train.mean(axis=0) norm_sigmas = original_x_train.std(axis=0) + 1e-6 #x_train = (x_train - norm_mus[None, :]) / norm_sigmas[None, :] #x_valid = (x_valid - norm_mus[None, :]) / norm_sigmas[None, :] #x_test = (x_test - norm_mus[None, :]) / norm_sigmas[None, :] #x_train *= (315345. 
/ 553107) #x_valid *= (315345. / 553107) #x_test *= (315345. / 553107) # Setup variables to build the right type of decoder bases on the value of # `input_decoder_mode` assert input_decoder_mode in ["regression", "classification"] if input_decoder_mode == "regression": # The size of the input reconstruction will be the same as the number # of inputs decoder_encoder_unit_ratio = 1 elif input_decoder_mode == "classification": # # The size of the input reconstruction will be the N times larger as # the number of inputs where N is the number of distinct discrete # values that each input can take. For SNP input data with an additive # coding scheme, N=3 because the 3 possible values are : {0, 1, 2}. nb_discrete_vals_by_input = int(original_x_train.max() + 1) decoder_encoder_unit_ratio = nb_discrete_vals_by_input # Print baseline accuracy for the imputation of genes print("Distribution of input values in valid: %f %f %f" % ((original_x_train == 0).mean(), (original_x_train == 1).mean(), (original_x_train == 2).mean())) print("Distribution of input values in test: %f %f %f" % ((original_x_test == 0).mean(), (original_x_test == 1).mean(), (original_x_test == 2).mean())) # Extract required information from data n_samples, n_feats = x_train.shape print("Number of features : ", n_feats) print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1])) n_targets = y_train.shape[1] if y_train.ndim == 2 else y_train.max() + 1 # Set some variables batch_size = 138 beta = gamma if (gamma == 0) else beta # Generate an name for the experiment based on the hyperparameters used if embedding_source is None: embedding_name = embedding_input else: embedding_name = embedding_source.replace("_", "").split(".")[0] exp_name += embedding_name.rsplit('/', 1)[::-1][0] + '_' exp_name += mlh.define_exp_name( keep_labels, alpha, beta, gamma, lmd, n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, n_hidden_s, which_fold, learning_rate, decoder_net_init, encoder_net_init, batchnorm, input_dropout, embedding_noise, early_stop_criterion, learning_rate_annealing, input_decoder_mode) print("Experiment: " + exp_name) # Ensure that the folders where the results of the experiment will be # saved do exist. Create them if they don't. 
save_path = os.path.join(save_path, dataset, exp_name) save_copy = os.path.join(save_copy, dataset, exp_name) if not os.path.exists(save_path): os.makedirs(save_path) if not os.path.exists(save_copy): os.makedirs(save_copy) # Prepare Theano variables for inputs and targets input_var_sup = T.bmatrix('input_sup') input_var_unsup = theano.shared(x_unsup, 'input_unsup') # x_unsup TBD target_var_sup = T.matrix('target_sup') lr = theano.shared(np.float32(learning_rate), 'learning_rate') # Use the provided mus and sigmas to process the missing values and # normalize the inputs b_input_var_sup = input_var_sup.astype("float32") normed_input_sup = (T.eq(b_input_var_sup, -1) * norm_mus + T.neq(b_input_var_sup, -1) * b_input_var_sup) normed_input_sup = (normed_input_sup - norm_mus) / norm_sigmas reconst_target_sup = T.cast(input_var_sup, "int32") # Build model print("Building model") # Some checkings # assert len(n_hidden_u) > 0 assert len(n_hidden_t_enc) > 0 assert len(n_hidden_t_dec) > 0 assert n_hidden_t_dec[-1] == n_hidden_t_enc[-1] # Build feature embedding networks (encoding and decoding if gamma > 0) nets, embeddings, pred_feat_emb = mh.build_feat_emb_nets( embedding_source, n_feats, n_samples_unsup, input_var_unsup, n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, gamma, encoder_net_init, decoder_net_init, save_path, random_proj, decoder_encoder_unit_ratio, embedding_noise) # Build feature embedding reconstruction networks (if alpha > 0, beta > 0) nets += mh.build_feat_emb_reconst_nets( [alpha, beta], n_samples_unsup, n_hidden_u, [n_hidden_t_enc, n_hidden_t_dec], nets, [encoder_net_init, decoder_net_init]) # Supervised network discrim_net, hidden_rep = mh.build_discrim_net( batch_size, n_feats, normed_input_sup, n_hidden_t_enc, n_hidden_s, embeddings[0], disc_nonlinearity, n_targets, batchnorm, input_dropout) # Reconstruct network nets += [ mh.build_reconst_net(hidden_rep, embeddings[1] if len(embeddings) > 1 else None, n_feats * decoder_encoder_unit_ratio, gamma, decoder_encoder_unit_ratio) ] # Load weights if we are resuming job if resume: # Load best model with np.load(os.path.join(save_copy, 'dietnet_best.npz')) as f: param_values = [f['arr_%d' % i] for i in range(len(f.files))] nlayers = len( lasagne.layers.get_all_params(filter(None, nets) + [discrim_net])) #lasagne.layers.set_all_param_values(filter(None, nets) + # [discrim_net], # param_values[:nlayers]) params = lasagne.layers.get_all_params( filter(None, nets) + [discrim_net]) for p, v in zip(params, param_values[:nlayers]): # Do not overwrite embedding value with old embedding. 
Removing # the following condition will prevent a trained model from being # tested on a different dataset if p.name != "feat_emb": p.set_value(v) print("Building and compiling training functions") # Build and compile training functions predictions, predictions_det = mh.define_predictions(nets, start=2) prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net]) prediction_sup = prediction_sup[0] prediction_sup_det = prediction_sup_det[0] # Define losses # reconstruction losses if input_decoder_mode == "regression": reconst_losses, reconst_losses_det = mh.define_reconst_losses( predictions, predictions_det, [input_var_unsup, input_var_unsup, normed_input_sup]) elif input_decoder_mode == "classification": # Obtain regular reconstruction losses for every reconstruction # but the reconstruction of the supervised input data reconst_losses1, reconst_losses_det1 = mh.define_reconst_losses( predictions[:-1], predictions_det[:-1], [input_var_unsup, input_var_unsup]) # Obtain a "classification" reconstruction loss for the reconstruction # of the supervised input data. This classification loss will be # performed on the input data without normalization reconst_losses2, reconst_losses_det2 = mh.define_classif_reconst_losses( predictions[-1:], predictions_det[-1:], [reconst_target_sup], [decoder_encoder_unit_ratio]) reconst_losses = reconst_losses1 + reconst_losses2 reconst_losses_det = reconst_losses_det1 + reconst_losses_det2 # supervised loss sup_loss, sup_loss_det = mh.define_sup_loss(disc_nonlinearity, prediction_sup, prediction_sup_det, keep_labels, target_var_sup, missing_labels_val) # Define inputs inputs = [input_var_sup, target_var_sup] # Define parameters params = lasagne.layers.get_all_params([discrim_net] + filter(None, nets), trainable=True, unwrap_shared=False) params_to_freeze= \ lasagne.layers.get_all_params(filter(None, nets), trainable=False, unwrap_shared=False) # Remove unshared variables from params and params_to_freeze params = [ p for p in params if isinstance(p, theano.compile.sharedvalue.SharedVariable) ] params_to_freeze = [ p for p in params_to_freeze if isinstance(p, theano.compile.sharedvalue.SharedVariable) ] print("Params : ", params) feat_emb_var = next(p for p in lasagne.layers.get_all_params([discrim_net]) if p.name == 'input_unsup' or p.name == 'feat_emb') # feat_emb_var = lasagne.layers.get_all_params([discrim_net])[0] print(feat_emb_var) feat_emb_val = feat_emb_var.get_value() feat_emb_norms = (feat_emb_val**2).sum(0)**0.5 feat_emb_var.set_value(feat_emb_val / feat_emb_norms) print('Number of params discrim: ' + str(len(params))) print('Number of params to freeze: ' + str(len(params_to_freeze))) for p in params_to_freeze: new_params = [el for el in params if el != p] params = new_params print('Number of params to update: ' + str(len(params))) # Combine losses loss = delta*sup_loss + alpha*reconst_losses[0] + beta*reconst_losses[1] + \ gamma*reconst_losses[2] loss_det = delta*sup_loss_det + alpha*reconst_losses_det[0] + \ beta*reconst_losses_det[1] + gamma*reconst_losses_det[2] l2_penalty = apply_penalty(params, l2) loss = loss + lmd * l2_penalty loss_det = loss_det + lmd * l2_penalty # Compute network updates assert optimizer in ["rmsprop", "adam", "amsgrad"] if optimizer == "rmsprop": updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr) elif optimizer == "adam": updates = lasagne.updates.adam(loss, params, learning_rate=lr) elif optimizer == "amsgrad": updates = lasagne.updates.amsgrad(loss, params, learning_rate=lr) #updates = 
#                               params,
#                               learning_rate=lr)
# updates = lasagne.updates.momentum(loss, params,
#                                    learning_rate=lr, momentum=0.0)

# Apply norm constraints on the weights
for k in updates.keys():
    if updates[k].ndim == 2:
        updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

# Compile training function
train_fn = theano.function(inputs, loss, updates=updates,
                           on_unused_input='ignore')

# Monitoring Labels
monitor_labels = ["reconst. feat. W_enc", "reconst. feat. W_dec",
                  "reconst. loss"]
monitor_labels = [i for i, j in zip(monitor_labels, reconst_losses)
                  if j != 0]
monitor_labels += ["feat. W_enc. mean", "feat. W_enc var"]
monitor_labels += ["feat. W_dec. mean", "feat. W_dec var"] if \
    (embeddings[1] is not None) else []
monitor_labels += ["loss. sup.", "total loss"]

# Build and compile test function
val_outputs = reconst_losses_det
val_outputs = [i for i, j in zip(val_outputs, reconst_losses) if j != 0]
val_outputs += [embeddings[0].mean(), embeddings[0].var()]
val_outputs += [embeddings[1].mean(), embeddings[1].var()] if \
    (embeddings[1] is not None) else []
val_outputs += [sup_loss_det, loss_det]

# Compute supervised accuracy and add it to monitoring list
test_acc, test_pred = mh.define_test_functions(
    disc_nonlinearity, prediction_sup, prediction_sup_det, target_var_sup)
monitor_labels.append("accuracy")
val_outputs.append(test_acc)

# If appropriate, compute the input reconstruction accuracy and add it to
# the monitoring list
if input_decoder_mode == "classification":
    input_reconst_acc = mh.define_classif_reconst_acc(
        predictions_det[-1], reconst_target_sup,
        decoder_encoder_unit_ratio)
    # import pdb; pdb.set_trace()
    monitor_labels.append("input_reconst_acc")
    val_outputs.append(input_reconst_acc)

# Compile prediction function
predict = theano.function([input_var_sup], test_pred)
predict_from_normed_inps = theano.function([normed_input_sup], test_pred)

predict_scores = theano.function([input_var_sup], prediction_sup_det)
predict_scores_from_normed_inps = theano.function([input_var_sup],
                                                  prediction_sup_det)

# Compile validation function
val_fn = theano.function(inputs,
                         [prediction_sup_det] + val_outputs,
                         on_unused_input='ignore')

# Finally, launch the training loop.
print("Starting training...")

# Some variables
patience = 0
train_monitored = []
valid_monitored = []
train_loss = []

# Pre-training monitoring
print("Epoch 0 of {}".format(num_epochs))

train_minibatches = mlh.iterate_minibatches(x_train, y_train, batch_size,
                                            shuffle=False)
train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                           monitor_labels, prec_recall_cutoff)

valid_minibatches = mlh.iterate_minibatches(x_valid, y_valid, batch_size,
                                            shuffle=False)
valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                           monitor_labels, prec_recall_cutoff)

# Before starting training, save a copy of the model in case
np.savez(os.path.join(save_path, 'dietnet_best.npz'),
         *lasagne.layers.get_all_param_values(
             filter(None, nets) + [discrim_net]))

# Training loop
start_training = time.time()
for epoch in range(num_epochs):
    start_time = time.time()
    print("Epoch {} of {}".format(epoch + 1, num_epochs))
    nb_minibatches = 0
    loss_epoch = 0

    # Train pass
    for batch in mlh.iterate_minibatches(x_train, training_labels,
                                         batch_size, shuffle=True):
        loss_epoch += train_fn(*batch)
        nb_minibatches += 1

    loss_epoch /= nb_minibatches
    train_loss += [loss_epoch]

    # Monitoring on the training set
    train_minibatches = mlh.iterate_minibatches(x_train, y_train,
                                                batch_size, shuffle=False)
    train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                               monitor_labels, prec_recall_cutoff)
    train_monitored += [train_err]

    # Monitoring on the validation set
    valid_minibatches = mlh.iterate_minibatches(x_valid, y_valid,
                                                batch_size, shuffle=False)
    valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                               monitor_labels, prec_recall_cutoff)
    valid_monitored += [valid_err]

    try:
        early_stop_val = valid_err[
            monitor_labels.index(early_stop_criterion)]
    except ValueError:
        raise ValueError("There is no monitored value by the name of %s" %
                         early_stop_criterion)

    valid_loss_sup_hist = [v[monitor_labels.index("loss. sup.")]
                           for v in valid_monitored]
    valid_loss_sup = valid_loss_sup_hist[-1]

    # Early stopping
    if epoch == 0:
        best_valid = early_stop_val
    elif ((early_stop_val > best_valid and
           early_stop_criterion == 'input_reconst_acc') or
          (early_stop_val > best_valid and
           early_stop_criterion == 'accuracy') or
          (early_stop_val >= best_valid and
           early_stop_criterion == 'accuracy' and
           valid_loss_sup == min(valid_loss_sup_hist)) or
          (early_stop_val < best_valid and
           early_stop_criterion == 'loss. sup.')):
        best_valid = early_stop_val
        patience = 0

        # Save stuff
        np.savez(os.path.join(save_path, 'dietnet_best.npz'),
                 *lasagne.layers.get_all_param_values(
                     filter(None, nets) + [discrim_net]))
        np.savez(save_path + "/errors_supervised_best.npz",
                 zip(*train_monitored), zip(*valid_monitored))

        # Monitor on the test set now because sometimes the saving doesn't
        # go well and there isn't a model to load at the end of training
        if y_test is not None:
            test_minibatches = mlh.iterate_minibatches(x_test, y_test, 138,
                                                       shuffle=False)
            test_err = mlh.monitoring(test_minibatches, "test", val_fn,
                                      monitor_labels, prec_recall_cutoff)
    else:
        patience += 1

        # Save stuff
        np.savez(os.path.join(save_path, 'dietnet_last.npz'),
                 *lasagne.layers.get_all_param_values(
                     filter(None, nets) + [discrim_net]))
        np.savez(save_path + "/errors_supervised_last.npz",
                 zip(*train_monitored), zip(*valid_monitored))

    print(" epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time))

    # End training if needed
    if patience == max_patience or epoch == num_epochs - 1:
        break

    # Anneal the learning rate
    lr.set_value(np.array(lr.get_value() * learning_rate_annealing,
                          dtype="float32"))

# End training with a final monitoring step on the best model
print("Ending training")

# Load best model
with np.load(os.path.join(save_path, 'dietnet_best.npz')) as f:
    param_values = [f['arr_%d' % i] for i in range(len(f.files))]
nlayers = len(
    lasagne.layers.get_all_params(filter(None, nets) + [discrim_net]))
# lasagne.layers.set_all_param_values(filter(None, nets) +
#                                     [discrim_net],
#                                     param_values[:nlayers])
params = lasagne.layers.get_all_params(
    filter(None, nets) + [discrim_net])
for p, v in zip(params, param_values[:nlayers]):
    # Do not overwrite embedding value with old embedding. Removing
    # the following condition will prevent a trained model from being
    # tested on a different dataset
    if p.name != "feat_emb":
        p.set_value(v)

if embedding_source is None:
    # Save embedding
    pred = pred_feat_emb()
    np.savez(os.path.join(save_path, 'feature_embedding.npz'), pred)

# Training set results
train_minibatches = mlh.iterate_minibatches(x_train, y_train, batch_size,
                                            shuffle=False)
train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                           monitor_labels, prec_recall_cutoff)

# Validation set results
valid_minibatches = mlh.iterate_minibatches(x_valid, y_valid, batch_size,
                                            shuffle=False)
valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                           monitor_labels, prec_recall_cutoff)

# Test set results
if y_test is not None:
    test_minibatches = mlh.iterate_minibatches(x_test, y_test, 138,
                                               shuffle=False)
    test_err = mlh.monitoring(test_minibatches, "test", val_fn,
                              monitor_labels, prec_recall_cutoff)

# Test the model's accuracy with varying levels of provided SNPs
test_minibatches = mlh.iterate_minibatches(x_test, y_test, 138,
                                           shuffle=False)
mlh.eval_prediction(test_minibatches, "test (rescaled)",
                    predict_from_normed_inps, norm_mus, norm_sigmas,
                    nb_evals=1, rescale_inputs=True)

# Save the model's test predictions to file
print(x_test.shape)
test_predictions = []
for minibatch in mlh.iterate_testbatches(x_test, 1, shuffle=False):
    test_predictions += [predict(minibatch)]
print(len(test_predictions))
print(sum([t.shape[0] for t in test_predictions]))
np.savez(os.path.join(save_path, 'test_predictions.npz'),
         test_predictions)

# Get the scores assigned by the model to each class for each test sample
test_scores = []
for minibatch in mlh.iterate_testbatches(x_test, 1, shuffle=False):
    test_scores += [predict_scores(minibatch)]
np.savez(os.path.join(save_path, 'test_scores.npz'), test_scores)

# Generate new SNP embeddings using test examples labeled according
# to the model's predictions
if bootstrap_snp_embeddings:
    if bootstrap_cutoff == "soft":
        bootstrap_snp_data = np.hstack(
            (x_train.transpose(), x_valid.transpose(), x_test.transpose()))
        bootstrap_labels = np.vstack(
            (y_train, y_valid, np.array(test_scores)[:, 0, :]))

        filename_genotypic = 'bootstrap_gen_snp_embeddings_softlabels.npy'
        filename_allelic = 'bootstrap_all_snp_embeddings_softlabels.npy'
    else:  # Hard cutoff
        sure_test_idxs = np.argwhere(
            (np.array(test_scores)[:, 0, :] >
             bootstrap_cutoff).sum(1)).flatten()
        sure_test_inputs = x_test[sure_test_idxs]
        sure_test_preds = np.array(test_scores)[sure_test_idxs, 0].argmax(1)

        bootstrap_snp_data = np.hstack(
            (x_train.transpose(), x_valid.transpose(),
             sure_test_inputs.transpose()))
        bootstrap_labels = np.hstack(
            (y_train.argmax(1), y_valid.argmax(1), sure_test_preds))

        filename_genotypic = ('bootstrap_gen_snp_embeddings_cutoff%f.npy' %
                              bootstrap_cutoff)
        filename_allelic = ('bootstrap_all_snp_embeddings_cutoff%f.npy' %
                            bootstrap_cutoff)

    utils_helpers.generate_snp_hist(
        bootstrap_snp_data, bootstrap_labels, label_names=label_names,
        perclass=True, sum_to_one=True,
        filename_genotypic=os.path.join(save_path, filename_genotypic),
        filename_allelic=os.path.join(save_path, filename_allelic))

# Print all final errors for train, validation and test
print("Training time:\t\t\t{:.3f}s".format(time.time() - start_training))

# Analyse the model gradients to determine the influence of each SNP on
# each of the model's predictions
print(label_names)
class_idx = T.iscalar("class index")
grad_fn = theano.function(
    [input_var_sup, class_idx],
    T.grad(prediction_sup_det[:, class_idx].mean(), input_var_sup).mean(0))
grads_wrt_inputs = mlh.get_grads_wrt_inputs(x_test, grad_fn, feature_names,
                                            label_names)

# Obtain a function that takes normed inputs and returns the gradient of a
# class score wrt the normed inputs themselves (this is required because
# computing the integrated gradients requires being able to interpolate
# between an example where all features are missing and an example where
# any number of features are provided)
grad_from_normed_fn = theano.function(
    [normed_input_sup, class_idx],
    T.grad(prediction_sup_det[:, class_idx].sum(), normed_input_sup).mean(0))

# Collect integrated gradients over the whole test set. Obtain, for each
# SNP and each possible value (0, 1 or 2), the average contribution of that
# value of that SNP to the score of each class.
avg_int_grads = np.zeros((x_test.shape[1], 3, len(label_names)),
                         dtype="float32")
counts_int_grads = np.zeros((x_test.shape[1], 3), dtype="int32")
for test_idx in range(x_test.shape[0]):
    int_grads = mlh.get_integrated_gradients(
        x_test[test_idx], grad_from_normed_fn, feature_names, label_names,
        norm_mus, norm_sigmas, m=100)

    snp_value_mask = np.arange(3) == x_test[test_idx][:, None]
    avg_int_grads += (snp_value_mask[:, :, None] *
                      int_grads.transpose()[:, None, :])
    counts_int_grads += snp_value_mask
avg_int_grads = avg_int_grads / counts_int_grads[:, :, None]

# Save all the additional information required for model analysis:
# - Test predictions
# - SNP IDs
# - Subject IDs
# - Normalization parameters for the input minibatches
np.savez(os.path.join(save_path, 'additional_data.npz'),
         test_labels=y_test,
         test_scores=np.array(test_scores)[:, 0],
         test_predictions=np.array(test_predictions)[:, 0],
         norm_mus=norm_mus,
         norm_sigmas=norm_sigmas,
         grads_wrt_inputs=grads_wrt_inputs,
         exmpl_ids_train=exmpl_ids_train,
         exmpl_ids_valid=exmpl_ids_valid,
         exmpl_ids_test=exmpl_ids_test,
         feature_names=feature_names,
         label_names=label_names,
         avg_int_grads=avg_int_grads)

# Copy files to loadpath (only if some training has been done so there
# is a local saved version)
if save_path != save_copy and num_epochs > 0:
    print('Copying model and other training files to {}'.format(save_copy))
    copy_tree(save_path, save_copy)
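The script above buries the apply_penalty usage inside a long training routine. For quick reference, here is a minimal, self-contained sketch of the same pattern (L2 weight decay via apply_penalty plus a max-norm constraint on the weight-matrix updates) on a toy dense classifier; the layer sizes, the penalty weight lmd and the learning rate are placeholders, not values taken from the script above.

import theano
import theano.tensor as T
import lasagne
from lasagne.regularization import apply_penalty, l2

input_var = T.matrix('X')
target_var = T.ivector('y')

# Toy network; sizes are made up for illustration.
l_in = lasagne.layers.InputLayer((None, 20), input_var=input_var)
l_hid = lasagne.layers.DenseLayer(l_in, num_units=50)
l_out = lasagne.layers.DenseLayer(
    l_hid, num_units=3, nonlinearity=lasagne.nonlinearities.softmax)

prediction = lasagne.layers.get_output(l_out)
loss = lasagne.objectives.categorical_crossentropy(prediction,
                                                   target_var).mean()

# Weight decay: apply_penalty sums l2 over every regularizable parameter.
lmd = 1e-4  # placeholder penalty weight
loss = loss + lmd * apply_penalty(
    lasagne.layers.get_all_params(l_out, regularizable=True), l2)

params = lasagne.layers.get_all_params(l_out, trainable=True)
updates = lasagne.updates.rmsprop(loss, params, learning_rate=1e-3)

# Optional max-norm constraint on the 2-D (weight matrix) updates,
# mirroring the loop over updates.keys() in the script above.
for p in updates.keys():
    if updates[p].ndim == 2:
        updates[p] = lasagne.updates.norm_constraint(updates[p], 1.0)

train_fn = theano.function([input_var, target_var], loss, updates=updates)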
for i in xrange(0):
    l_hiddens.append(DenseLayer(dropout(l_hiddens[-1]), num_units=100,
                                nonlinearity=rectify))
l_out = DenseLayer(dropout(l_hiddens[-1]), num_units=y.shape[1],
                   nonlinearity=softmax, W=Orthogonal())


def reset():
    if any(np.isnan(scale.get_value()) for scale in scales):
        for scale in scales:
            scale.set_value(1.)
    for l in l_hiddens:
        l.b.set_value(Constant()(l.b.get_value().shape))
        l.W.set_value(Orthogonal()(l.W.get_value().shape))
    l_out.b.set_value(Constant()(l_out.b.get_value().shape))
    l_out.W.set_value(Orthogonal()(l_out.W.get_value().shape))
    for p in (p for u in (updates_ada, updates_other, updates_scal)
              for p in u if p not in get_all_params(l_out)):
        p.set_value(Constant()(p.get_value().shape))


chunky_l2 = (apply_penalty(get_all_params(l_out, regularizable=True), l2) -
             l2(l_hiddens[0].W) +
             l2(l_hiddens[0].W / T.reshape(vscale, (206279, 1))))
chunky_l1 = (apply_penalty(get_all_params(l_out, regularizable=True), l1) -
             l1(l_hiddens[0].W) +
             l1(l_hiddens[0].W / T.reshape(vscale, (206279, 1))))
simple_l2 = apply_penalty(get_all_params(l_out, regularizable=True), l2)

# l_out2 = DenseLayer(dropout(l_hiddens2[-1]), num_units=y.shape[1])
# l_out = lasagne.layers.NonlinearityLayer(
#     lasagne.layers.ElemwiseSumLayer((l_out1, l_out2), .5), softmax)
# categorical_crossentropy(get_output(l_out)[train_indice])

target = T.fmatrix(name="target")
# f = theano.function([l_in.input_var], get_output(l_out),
#                     allow_input_downcast=True)
# f(X[0, :].toarray())
loss = categorical_crossentropy(get_output(l_out), target).mean()
# train_loss_smoo = categorical_crossentropy(
#     get_output(l_out, deterministic=True)[train_indices, ],
#     target[train_indices, ]).mean()
# valid_loss = categorical_crossentropy(
#     get_output(l_out)[valid_indices, ],
#     target[valid_indices, ]).mean()
# valid_loss_smoo = categorical_crossentropy(
#     get_output(l_out, deterministic=True)[valid_indices, ],
#     target[valid_indices, ]).mean()
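The chunky_l2 / chunky_l1 expressions above take the usual apply_penalty sum over all regularizable parameters, subtract the plain penalty on the first hidden layer's weights, and add back a penalty on those same weights rescaled per input feature. A stripped-down sketch of that construction, with made-up sizes and a placeholder vscale shared variable standing in for the per-feature scale used above:

import numpy as np
import theano
import theano.tensor as T
from lasagne.layers import InputLayer, DenseLayer, get_all_params
from lasagne.nonlinearities import softmax
from lasagne.regularization import apply_penalty, l2

n_features = 100  # placeholder; the snippet above uses 206279 features

l_in = InputLayer((None, n_features))
l_hid = DenseLayer(l_in, num_units=32)
l_out = DenseLayer(l_hid, num_units=9, nonlinearity=softmax)

# Per-input-feature scale, analogous to vscale above.
vscale = theano.shared(np.ones(n_features, dtype=theano.config.floatX))

# Penalize every regularizable parameter, but replace the first layer's
# plain l2 term with l2 on the weights divided row-wise by the scale.
chunky_l2 = (apply_penalty(get_all_params(l_out, regularizable=True), l2)
             - l2(l_hid.W)
             + l2(l_hid.W / T.reshape(vscale, (n_features, 1))))
simple_l2 = apply_penalty(get_all_params(l_out, regularizable=True), l2)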