def test_apply_penalty(self):
        from lasagne.regularization import apply_penalty, l2
        A = T.vector()
        B = T.matrix()

        assert apply_penalty([], l2) == 0

        assert equal_computations([apply_penalty(A, l2)], [l2(A)])

        assert equal_computations([apply_penalty([A, B], l2)],
                                  [sum([l2(A), l2(B)])])
Example #2
def regularization(network, optimization):
	all_params = layers.get_all_params(network, regularizable=True)    

	# weight-decay regularization
	loss = 0
	if "l1" in optimization:
		l1_penalty = apply_penalty(all_params, l1) * optimization["l1"]
		loss += l1_penalty
	if "l2" in optimization:
		l2_penalty = apply_penalty(all_params, l2)* optimization["l2"]        
		loss += l2_penalty 
	return loss
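
The value returned by regularization() is an ordinary Theano scalar, so it plugs straight into a training loss. A minimal sketch with a toy network (everything here except the call to regularization() is assumed; the snippet above is taken to have imported apply_penalty, l1 and l2 from lasagne.regularization and layers from lasagne):

import theano
import theano.tensor as T
import lasagne

# Minimal usage sketch; the toy network and variables below are assumptions.
input_var = T.matrix('input')
target_var = T.ivector('target')
network = lasagne.layers.InputLayer((None, 20), input_var)
network = lasagne.layers.DenseLayer(network, num_units=3,
                                    nonlinearity=lasagne.nonlinearities.softmax)

data_loss = lasagne.objectives.categorical_crossentropy(
    lasagne.layers.get_output(network), target_var).mean()
# Add the weight-decay term computed by regularization() above.
total_loss = data_loss + regularization(network, {"l2": 1e-4})

params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.adam(total_loss, params)
train_fn = theano.function([input_var, target_var], total_loss, updates=updates)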
    def test_regularize_layer_params_weighted(self, layers):
        from lasagne.regularization import regularize_layer_params_weighted
        from lasagne.regularization import apply_penalty, l2
        l_1, l_2, l_3 = layers

        layers = OrderedDict()
        layers[l_2] = 0.1
        layers[l_3] = 0.5

        loss = regularize_layer_params_weighted(layers,
                                                lasagne.regularization.l2)
        assert equal_computations([loss],
                                  [sum([0.1 * apply_penalty([l_2.W], l2),
                                        0.5 * apply_penalty([l_3.W], l2)])])
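
Outside of the test, regularize_layer_params_weighted gives each layer its own decay coefficient. A small sketch under the same imports, with two hypothetical dense layers standing in for the fixture used by the test:

from collections import OrderedDict

import theano.tensor as T
import lasagne
from lasagne.regularization import regularize_layer_params_weighted, l2

x = T.matrix('x')
l_in = lasagne.layers.InputLayer((None, 10), input_var=x)
l_hid = lasagne.layers.DenseLayer(l_in, num_units=5)
l_out = lasagne.layers.DenseLayer(l_hid, num_units=2)

# Heavier decay on the output layer than on the hidden layer.
layer_weights = OrderedDict([(l_hid, 1e-4), (l_out, 1e-3)])
penalty = regularize_layer_params_weighted(layer_weights, l2)
# penalty is a Theano scalar: 1e-4 * l2(l_hid.W) + 1e-3 * l2(l_out.W)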
Example #5
def prepare():

    X = T.tensor4('X')
    y = T.ivector('y')

    output_layer = lenet_skinny()

    all_params = lasagne.layers.get_all_params(output_layer)

    loss_fn = x_ent

    prediction = lasagne.layers.get_output(output_layer, X)
    loss = loss_fn(prediction, y).mean() + \
        args["lambda"] * apply_penalty(
            lasagne.layers.get_all_params(output_layer, regularizable=True), l2)

    label_vector = lasagne.layers.get_output(output_layer, X)
    pred = T.argmax(label_vector, axis=1)
    accuracy = T.mean(T.eq(pred, y))

    return Container({
        "X": X,
        "y": y,
        "output_layer": output_layer,
        "all_params": all_params,
        "loss": loss,
        "label_vector": label_vector,
        "pred": pred,
        "accuracy": accuracy
    })
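
prepare() only returns symbolic expressions inside the Container; compiling them is left to the caller. A hedged sketch of that caller-side step, assuming Container supports dict-style access and that args, lenet_skinny and x_ent are defined as in the snippet:

import theano
import lasagne

# Caller-side sketch (assumed, not part of the original repository).
c = prepare()
updates = lasagne.updates.nesterov_momentum(c["loss"], c["all_params"],
                                            learning_rate=0.01, momentum=0.9)
train_fn = theano.function([c["X"], c["y"]], c["loss"], updates=updates)
eval_fn = theano.function([c["X"], c["y"]], [c["loss"], c["accuracy"]])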
Example #6
def build_trainer(input_data,
                  input_mask,
                  target_data,
                  target_mask,
                  network_params,
                  network_reg_params,
                  output_layer,
                  weight_decay,
                  updater,
                  learning_rate,
                  max_grad_norm=0.0,
                  load_updater_params=None):
    output_score = get_output(output_layer, deterministic=False)
    frame_prd_idx = T.argmax(output_score, axis=-1)

    one_hot_target = T.extra_ops.to_one_hot(y=T.flatten(target_data, 1),
                                            nb_class=output_dim,
                                            dtype=floatX)

    output_score = T.reshape(x=output_score,
                             newshape=(-1, output_dim),
                             ndim=2)
    output_score = output_score - T.max(output_score, axis=-1, keepdims=True)
    output_score = output_score - T.log(T.sum(T.exp(output_score), axis=-1, keepdims=True))

    train_ce = -T.sum(T.mul(one_hot_target, output_score), axis=-1)*T.flatten(target_mask, 1)

    train_loss = T.sum(train_ce)/target_mask.shape[0]
    frame_loss = T.sum(train_ce)/T.sum(target_mask)

    frame_accr = T.sum(T.eq(frame_prd_idx, target_data)*target_mask)/T.sum(target_mask)

    train_total_loss = train_loss
    if weight_decay > 0:
        train_total_loss += apply_penalty(network_reg_params, l2)*10**(-weight_decay)

    network_grads = theano.grad(cost=train_total_loss, wrt=network_params)

    if max_grad_norm > 0.:
        network_grads, network_grads_norm = total_norm_constraint(tensor_vars=network_grads,
                                                                  max_norm=max_grad_norm,
                                                                  return_norm=True)
    else:
        network_grads_norm = T.sqrt(sum(T.sum(grad ** 2) for grad in network_grads))

    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    train_updates, updater_params = updater(loss_or_grads=network_grads,
                                            params=network_params,
                                            learning_rate=train_lr,
                                            load_params_dict=load_updater_params)

    training_fn = theano.function(inputs=[input_data,
                                          input_mask,
                                          target_data,
                                          target_mask],
                                  outputs=[frame_loss,
                                           frame_accr,
                                           network_grads_norm],
                                  updates=train_updates)
    return training_fn, train_lr, updater_params
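
The two subtractions applied to output_score above are a numerically stable log-softmax (shift by the row maximum, then subtract the log-sum-exp), so train_ce is a masked cross-entropy. The identity, sketched in NumPy purely for illustration:

import numpy as np

scores = np.array([[2.0, 1.0, 0.1]])
shifted = scores - scores.max(axis=-1, keepdims=True)
log_softmax = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))

# Same values as log(softmax(scores)), but safe against overflow for large scores.
softmax = np.exp(scores) / np.exp(scores).sum(axis=-1, keepdims=True)
assert np.allclose(log_softmax, np.log(softmax))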
Example #8
def execute(dataset,
            n_hidden_t_enc,
            n_hidden_s,
            num_epochs=500,
            learning_rate=.001,
            learning_rate_annealing=1.0,
            gamma=1,
            lmd=0.,
            disc_nonlinearity="sigmoid",
            keep_labels=1.0,
            prec_recall_cutoff=True,
            missing_labels_val=-1.0,
            which_fold=1,
            early_stop_criterion='loss',
            embedding_input='raw',
            save_path='/Tmp/romerosa/feature_selection/',
            save_copy='/Tmp/romerosa/feature_selection/',
            dataset_path='/Tmp/carriepl/datasets/',
            resume=False,
            exp_name=None):

    # Load the dataset
    print("Loading data")
    x_train, y_train, x_valid, y_valid, x_test, y_test, \
        x_unsup, training_labels = mlh.load_data(
            dataset, dataset_path, None,
            which_fold=which_fold, keep_labels=keep_labels,
            missing_labels_val=missing_labels_val,
            embedding_input=embedding_input)

    # Extract required information from data
    n_samples, n_feats = x_train.shape
    print("Number of features : ", n_feats)
    print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1]))
    n_targets = y_train.shape[1]

    # Set some variables
    batch_size = 1

    # Preparing folder to save stuff
    print("Experiment: " + exp_name)
    save_path = os.path.join(save_path, dataset, exp_name)
    save_copy = os.path.join(save_copy, dataset, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # Prepare Theano variables for inputs and targets
    input_var_sup = T.matrix('input_sup')
    target_var_sup = T.matrix('target_sup')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')

    # Build model
    print("Building model")
    discrim_net = InputLayer((None, n_feats), input_var_sup)
    discrim_net = DenseLayer(discrim_net,
                             num_units=n_hidden_t_enc[-1],
                             nonlinearity=rectify)

    # Reconstruct the input using dec_feat_emb
    if gamma > 0:
        reconst_net = DenseLayer(discrim_net,
                                 num_units=n_feats,
                                 nonlinearity=linear)
        nets = [reconst_net]
    else:
        nets = [None]

    # Add supervised hidden layers
    for hid in n_hidden_s:
        discrim_net = DropoutLayer(discrim_net)
        discrim_net = DenseLayer(discrim_net, num_units=hid)

    assert disc_nonlinearity in ["sigmoid", "linear", "rectify", "softmax"]
    discrim_net = DropoutLayer(discrim_net)
    discrim_net = DenseLayer(discrim_net,
                             num_units=n_targets,
                             nonlinearity=eval(disc_nonlinearity))

    print("Building and compiling training functions")

    # Build and compile training functions
    predictions, predictions_det = mh.define_predictions(nets, start=0)
    prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net])
    prediction_sup = prediction_sup[0]
    prediction_sup_det = prediction_sup_det[0]

    # Define losses
    # reconstruction losses
    reconst_losses, reconst_losses_det = mh.define_reconst_losses(
        predictions, predictions_det, [input_var_sup])
    # supervised loss
    sup_loss, sup_loss_det = mh.define_sup_loss(disc_nonlinearity,
                                                prediction_sup,
                                                prediction_sup_det,
                                                keep_labels, target_var_sup,
                                                missing_labels_val)

    inputs = [input_var_sup, target_var_sup]
    params = lasagne.layers.get_all_params([discrim_net] + nets,
                                           trainable=True)

    print('Number of params: ' + str(len(params)))

    # Combine losses
    loss = sup_loss + gamma * reconst_losses[0]
    loss_det = sup_loss_det + gamma * reconst_losses_det[0]

    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty
    loss_det = loss_det + lmd * l2_penalty

    # Compute network updates
    updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr)
    # updates = lasagne.updates.sgd(loss,
    #                               params,
    #                               learning_rate=lr)
    # updates = lasagne.updates.momentum(loss, params,
    #                                    learning_rate=lr, momentum=0.0)

    # Apply norm constraints on the weights
    for k in updates.keys():
        if updates[k].ndim == 2:
            updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

    # Compile training function
    train_fn = theano.function(inputs,
                               loss,
                               updates=updates,
                               on_unused_input='ignore')

    # Monitoring Labels
    monitor_labels = ["reconst. loss"]
    monitor_labels = [
        i for i, j in zip(monitor_labels, reconst_losses) if j != 0
    ]
    monitor_labels += ["loss. sup.", "total loss"]

    # Build and compile test function
    val_outputs = reconst_losses_det
    val_outputs = [i for i, j in zip(val_outputs, reconst_losses) if j != 0]
    val_outputs += [sup_loss_det, loss_det]

    # Compute accuracy and add it to monitoring list
    test_acc, test_pred = mh.define_test_functions(disc_nonlinearity,
                                                   prediction_sup,
                                                   prediction_sup_det,
                                                   target_var_sup)
    monitor_labels.append("accuracy")
    val_outputs.append(test_acc)

    # Compile prediction function
    predict = theano.function([input_var_sup], test_pred)

    # Compile validation function
    val_fn = theano.function(inputs, [prediction_sup_det] + val_outputs,
                             on_unused_input='ignore')

    # Finally, launch the training loop.
    print("Starting testing...")

    if not os.path.exists(save_copy + '/model_feat_sel_best.npz'):
        print("No saved model to be tested and/or generate" " the embedding !")
    else:
        with np.load(save_copy + '/model_feat_sel_best.npz') as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
            lasagne.layers.set_all_param_values(
                list(filter(None, nets)) + [discrim_net], param_values)

            test_minibatches = mlh.iterate_minibatches(x_test,
                                                       y_test,
                                                       batch_size,
                                                       shuffle=False)

            test_err, pred, targets = mlh.monitoring(test_minibatches,
                                                     "test",
                                                     val_fn,
                                                     monitor_labels,
                                                     prec_recall_cutoff,
                                                     return_pred=True)

        lab = targets.argmax(1)
        pred_argmax = pred.argmax(1)

        continent_cat = mh.create_1000_genomes_continent_labels()

        lab_cont = np.zeros(lab.shape)
        pred_cont = np.zeros(pred_argmax.shape)

        for i, c in enumerate(continent_cat):
            for el in c:
                lab_cont[lab == el] = i
                pred_cont[pred_argmax == el] = i

        cm_e = np.zeros((26, 26))
        cm_c = np.zeros((5, 5))

        for i in range(26):
            for j in range(26):
                cm_e[i, j] = ((pred_argmax == i) * (lab == j)).sum()

        for i in range(5):
            for j in range(5):
                cm_c[i, j] = ((pred_cont == i) * (lab_cont == j)).sum()

        np.savez(os.path.join(save_copy, 'cm' + str(which_fold) + '.npz'),
                 cm_e=cm_e,
                 cm_c=cm_c)

        print(os.path.join(save_copy, 'cm' + str(which_fold) + '.npz'))
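
The nested loops that fill cm_e and cm_c are quadratic in the number of classes; the same confusion matrices can be accumulated in a single pass. A NumPy alternative (equivalent to the loops above, rows indexed by prediction and columns by label):

import numpy as np

def confusion_matrix(pred, lab, n_classes):
    cm = np.zeros((n_classes, n_classes))
    # Increment cm[pred[k], lab[k]] for every sample k.
    np.add.at(cm, (pred.astype(int), lab.astype(int)), 1)
    return cm

# cm_e = confusion_matrix(pred_argmax, lab, 26)
# cm_c = confusion_matrix(pred_cont, lab_cont, 5)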
Example #9
    def __init__(self, 
                W=None, 
                W_path=None, 
                K=300, 
                num_hidden=256,
                batch_size=None,
                grad_clip=100., 
                max_sent_len_basic=200, 
                num_classes=2, 
                **kwargs):

        W = W
        V = len(W)
        K = int(K)
        num_hidden = int(num_hidden)
        batch_size = int(batch_size)
        grad_clip = int(grad_clip)
        max_seq_len = int(max_sent_len_basic)
        num_classes = int(num_classes)    
        dropout = float(kwargs["dropout"])
        lambda_w = float(kwargs["lambda_w"])


        index = T.lscalar() 
        X = T.imatrix('X')
        M = T.imatrix('M')
        y = T.ivector('y')
        # Input Layer
        l_in = lasagne.layers.InputLayer((batch_size, max_seq_len), input_var=X)
        print(" l_in shape: {}\n".format(get_output_shape(l_in)))
        l_mask = lasagne.layers.InputLayer((batch_size, max_seq_len), input_var=M)
        #l_mask2 = lasagne.layers.InputLayer((batch_size, max_seq_len), input_var=M)
        #l_mask_concat = lasagne.layers.ConcatLayer([l_mask, l_mask2])

        print(" l_mask shape: {}\n".format(get_output_shape(l_mask)))
        #print(" l_mask shape: {}\n".format(get_output_shape(l_mask_concat)))

    
    
        # Embedding layer
        l_emb = lasagne.layers.EmbeddingLayer(l_in, input_size=V, output_size=K, W=W)
        # keep the embeddings static
        l_emb.params[l_emb.W].remove('trainable')
        print(" l_emb shape: {}\n".format(get_output_shape(l_emb)))
    
        # add droput
        #l_emb = lasagne.layers.DropoutLayer(l_emb, p=.2)
    
        # Use orthogonal Initialization for LSTM gates
        gate_params = lasagne.layers.recurrent.Gate(
            W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(),
            b=lasagne.init.Constant(0.)
        )
        cell_params = lasagne.layers.recurrent.Gate(
            W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(),
            W_cell=None, b=lasagne.init.Constant(0.),
            nonlinearity=lasagne.nonlinearities.tanh
        )
    
        l_fwd = lasagne.layers.LSTMLayer(
            l_emb, num_units=num_hidden, grad_clipping=grad_clip,
            nonlinearity=lasagne.nonlinearities.tanh, mask_input=l_mask,
            ingate=gate_params, forgetgate=gate_params, cell=cell_params,
            outgate=gate_params, learn_init=True
        )
        l_fwd = lasagne.layers.DropoutLayer(l_fwd,p=dropout)
        print(" forward shape: {}\n".format(get_output_shape(l_fwd)))
        if kwargs["lstm"] == "bi":
            gate_params_bwd = lasagne.layers.recurrent.Gate(
                W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(),
                b=lasagne.init.Constant(0.)
            )
            cell_params_bwd = lasagne.layers.recurrent.Gate(
                W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(),
                W_cell=None, b=lasagne.init.Constant(0.),
                nonlinearity=lasagne.nonlinearities.tanh
            )
            l_bwd = lasagne.layers.LSTMLayer(
                     l_emb, num_units=num_hidden, grad_clipping=grad_clip,
                     nonlinearity=lasagne.nonlinearities.tanh, mask_input=l_mask,
                     ingate=gate_params_bwd, forgetgate=gate_params_bwd, cell=cell_params_bwd,
                     outgate=gate_params_bwd, learn_init=True,
                     backwards=True
            )
            l_bwd = lasagne.layers.DropoutLayer(l_bwd,p=dropout)
            print(" backward shape: {}\n".format(get_output_shape(l_bwd)))

            # concat and dropout
            l_concat = lasagne.layers.ConcatLayer([l_fwd, l_bwd])
            #l_concat = lasagne.layers.ElemwiseSumLayer([l_fwd, l_bwd])
            l_concat_dropout = lasagne.layers.DropoutLayer(l_concat,p=dropout)
            print(" concat shape: {}\n".format(get_output_shape(l_concat)))
        else:
            l_concat_dropout = l_fwd
    
    
        network = lasagne.layers.DenseLayer(
            l_concat_dropout,
            num_units=num_classes,
            nonlinearity=lasagne.nonlinearities.softmax
        )
        #print(" network shape: {}\n".format(get_output_shape(network)))

        self.network = network
        output = lasagne.layers.get_output(network)

        # Define objective function (cost) to minimize, mean crossentropy error
        cost = lasagne.objectives.categorical_crossentropy(output, y).mean()

        # Compute gradient updates
        # Only trainable parameters, so the frozen embedding matrix is not updated.
        params = lasagne.layers.get_all_params(network, trainable=True)

        cost += lambda_w*apply_penalty(params, l2)
        # grad_updates = lasagne.updates.nesterov_momentum(cost, params,learn_rate)
        grad_updates = lasagne.updates.adam(cost, params)
        #learn_rate = .01
        #grad_updates = lasagne.updates.adadelta(cost, params, learn_rate)
        test_output = lasagne.layers.get_output(network, deterministic=True)
        val_cost_fn = lasagne.objectives.categorical_crossentropy(
            test_output, y).mean()
        preds = T.argmax(test_output, axis=1)

        val_acc_fn = T.mean(T.eq(preds, y),
                            dtype=theano.config.floatX)
        self.val_fn = theano.function([X, M, y], [val_cost_fn, val_acc_fn, preds],
                                 allow_input_downcast=True)
        if kwargs["lstm"] == "bi":
            concat_output = lasagne.layers.get_output(l_concat) 
            fwd_output = lasagne.layers.get_output(l_fwd) 
            bwd_output = lasagne.layers.get_output(l_bwd) 
            mask_output  = lasagne.layers.get_output(l_mask)
            #mask_concat_output  = lasagne.layers.get_output(l_mask_concat)

            self.get_concat = theano.function([X,M], [concat_output, fwd_output, bwd_output, mask_output]) #, mask_concat_output])
        #print(y_train)
        # Compile train objective
        print "Compiling training functions"
        self.train = theano.function(inputs = [X,M,y], outputs = cost, updates = grad_updates, allow_input_downcast=True)
        self.test = theano.function(inputs = [X,M,y], outputs = val_acc_fn)
        self.pred = theano.function(inputs = [X,M],outputs = preds)
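
Removing the 'trainable' tag is what keeps the pretrained embeddings static: get_all_params(..., trainable=True) then skips the embedding matrix, so the optimizer never touches it. A small self-contained sketch of that pattern with toy sizes (all names and sizes here are assumptions):

import numpy as np
import theano.tensor as T
import lasagne

X = T.imatrix('X')
l_in = lasagne.layers.InputLayer((None, 5), input_var=X)
l_emb = lasagne.layers.EmbeddingLayer(l_in, input_size=10, output_size=4,
                                      W=np.random.randn(10, 4).astype('float32'))
l_emb.params[l_emb.W].remove('trainable')   # freeze the pretrained vectors
l_out = lasagne.layers.DenseLayer(l_emb, num_units=2)

all_params = lasagne.layers.get_all_params(l_out)
trainable_params = lasagne.layers.get_all_params(l_out, trainable=True)
assert l_emb.W in all_params and l_emb.W not in trainable_params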
Example #10
def execute(dataset, learning_rate=0.00001, learning_rate_annealing=1.0,
            lmd=0., noise=0.0, encoder_units=[1024, 512, 256],
            num_epochs=500, which_fold=1,
            save_path=None, save_copy=None, dataset_path=None,
            num_fully_connected=0, exp_name='', init_args=None):

    # Reading dataset
    print("Loading data")
    if dataset == "1000_genomes" and which_fold == 1 and False:
        x_unsup = mlh.load_data(dataset, dataset_path, None,
                                which_fold=which_fold, keep_labels=1.0,
                                missing_labels_val=-1.0,
                                embedding_input='raw', transpose=False)
        import pdb; pdb.set_trace()
        
        x_train = np.zeros((x_unsup[0].shape[0], x_unsup[0].shape[1]*2), dtype="int8")
        x_train[:,::2] = (x_unsup[0] == 2)
        x_train[:,1::2] = (x_unsup[0] >= 1)
        
        x_valid = np.zeros((x_unsup[2].shape[0], x_unsup[2].shape[1]*2), dtype="int8")
        x_valid[:,::2] = (x_unsup[2] == 2)
        x_valid[:,1::2] = (x_unsup[2] >= 1)
    else:
        x_unsup = mlh.load_data(dataset, dataset_path, None,
                                which_fold=which_fold, keep_labels=1.0,
                                missing_labels_val=-1.0,
                                embedding_input='bin', transpose=True)
        x_train = x_unsup[0][0]
        x_valid = x_unsup[1][0]
    
    print(x_train.shape, x_valid.shape)

    n_features = x_train.shape[1]

    exp_name += "learn_snp2vec_dae_h"
    for e in encoder_units:
        exp_name += ('-' + str(e))
    # exp_name += '_g-' + str(gamma)
    exp_name += '_l-' + str(lmd)
    exp_name += '_lr-' + str(learning_rate)
    exp_name += '_fold-' + str(which_fold)

    save_path = os.path.join(save_path, exp_name)
    save_copy = os.path.join(save_copy, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(save_copy):
        os.makedirs(save_copy)

    # Prepare Theano variables for inputs and targets
    input_var = T.matrix('input')
    target_reconst = T.matrix('target')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')
    batch_size = 128

    # building network
    encoder = InputLayer((batch_size, n_features), input_var)

    # building the encoder and decoder
    #import pdb; pdb.set_trace()
    for i in range(len(encoder_units)):
        encoder = DenseLayer(
                encoder,
                num_units=encoder_units[i],
                W=Uniform(0.00001),
                nonlinearity=leaky_rectify)  # if i < len(encoder_units)-1 else linear)
        
    embedding = lasagne.layers.get_output(encoder)
    get_embedding_fn = theano.function([input_var], embedding)
    
    params = lasagne.layers.get_all_params(encoder, trainable=True)
    monitor_labels = ["embedding min", "embedding mean", "embedding max"]
    val_outputs = [embedding.min(), embedding.mean(), embedding.max()]
    nets = [encoder]

    decoder_units = encoder_units[::-1][1:]
    print(decoder_units)
    decoder = encoder
    for i in range(len(decoder_units)):
        decoder = DenseLayer(decoder,
                             num_units=decoder_units[i],
                             W=Uniform(0.0001),
                             nonlinearity=leaky_rectify)
    decoder = DenseLayer(decoder,
                         num_units=n_features,
                         W=convert_initialization(
                                init_args["decoder_init"],
                                nonlinearity="sigmoid"),
                         nonlinearity=sigmoid)
    prediction_reconst = lasagne.layers.get_output(decoder)

    # Reconstruction error
    loss_reconst = lasagne.objectives.binary_crossentropy(prediction_reconst,
                                                          target_reconst).mean()

    # loss_reconst = mh.define_sampled_mean_bincrossentropy(
    #    prediction_reconst, target_reconst, gamma=gamma)

    #loss_reconst = mh.dice_coef_loss(
    #    target_reconst, prediction_reconst).mean()
        
    accuracy = T.eq(T.gt(prediction_reconst, 0.5), target_reconst).mean()

    params += lasagne.layers.get_all_params(decoder, trainable=True)
    monitor_labels += ["reconst. loss", "reconst. accuracy"]
    val_outputs += [loss_reconst, accuracy]
    nets += [decoder]
    # sparsity_reconst = gamma * l1(prediction_reconst)
    # roh = input_var.mean(0)
    # sparsity_reconst = ((roh * T.log(roh / (prediction_reconst.mean(0)+1e-8))) +\
    #     ((1 - roh) * T.log((1 - roh) / (1 - prediction_reconst + 1e-8)))).sum()

    # Combine losses
    loss = loss_reconst # + sparsity_reconst

    # applying weight decay
    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty

    val_outputs += [loss]
    monitor_labels += ['loss']

    # Some variables
    max_patience = 100
    patience = 0

    train_monitored = []
    valid_monitored = []
    train_loss = []

    updates = lasagne.updates.adam(loss,
                                   params,
                                   learning_rate=lr)

    for k in updates.keys():
        if updates[k].ndim == 2:
            updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

    inputs = [input_var, target_reconst]

    # Compile training function
    print "Compiling training function"
    train_fn = theano.function(inputs, loss, updates=updates,
                               on_unused_input='ignore')
    val_fn = theano.function(inputs,
                             [val_outputs[0]] + val_outputs,
                             on_unused_input='ignore')

    start_training = time.time()

    print "Starting training"
    for epoch in range(num_epochs):
        start_time = time.time()
        print("Epoch {} of {}".format(epoch+1, num_epochs))
        nb_minibatches = 0
        loss_epoch = 0

        for x, target_reconst_val in data_generator(x_train, batch_size,
                                                    shuffle=True, noise=noise):
            loss_epoch += train_fn(x, target_reconst_val)
            nb_minibatches += 1

        loss_epoch /= nb_minibatches
        train_loss += [loss_epoch]

        # Monitoring on the training set
        train_minibatches = data_generator(x_train, batch_size, noise=noise)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, 0)
        train_monitored += [train_err]

        # Monitoring on the validation set
        valid_minibatches = data_generator(x_valid, batch_size, noise=noise)

        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, 0)
        valid_monitored += [valid_err]

        early_stop_criterion = 'loss'
        early_stop_val = valid_err[monitor_labels.index(early_stop_criterion)]

        # Early stopping
        if epoch == 0:
            best_valid = early_stop_val
        elif early_stop_val < best_valid and early_stop_criterion == 'loss':
            best_valid = early_stop_val
            patience = 0

            # Save stuff
            np.savez(save_path+'/model_snp2vec_best.npz',
                     *lasagne.layers.get_all_param_values(nets))
            np.savez(save_path + "/errors_snp2vec_best.npz",
                     zip(*train_monitored), zip(*valid_monitored))
        else:
            patience += 1
            np.savez(os.path.join(save_path, 'model_snp2vec_last.npz'),
                     *lasagne.layers.get_all_param_values(nets))
            np.savez(save_path + "/errors_snp2vec_last.npz",
                     zip(*train_monitored), zip(*valid_monitored))

        # End training
        if (patience == max_patience) or (epoch == num_epochs-1):
            print("Ending training")
            # Load best model
            if not os.path.exists(save_path + '/model_snp2vec_best.npz'):
                print("No saved model to be tested and/or generate"
                      " the embedding !")
            else:
                with np.load(save_path + '/model_snp2vec_best.npz') as f:
                    param_values = [f['arr_%d' % i]
                                    for i in range(len(f.files))]
                    lasagne.layers.set_all_param_values(nets, param_values)
                    
            # Use the saved model to generate the feature embedding
            # Here the feature embedding is the different in the hidden
            # representation between having that feature on and having it off
            print("Generating embedding")
            embedding_size = encoder_units[-1]
            null_input = np.zeros((1, n_features), dtype="float32")
            null_embedding = get_embedding_fn(null_input)[0]
            
            all_embeddings = np.zeros((n_features,
                                       embedding_size), dtype="float32")
            
            """
            single_feat_input = null_input.copy()
            for i in range(n_features):
                if i % 10000 == 0:
                    print(i, n_features)

                single_feat_input[:,i] = 1
                all_embeddings[i] = (get_embedding_fn(single_feat_input)[0] -
                                     null_embedding)
                single_feat_input[:,i] = 0
                
            result1 = all_embeddings[:1000].copy()
            """
            
            block_size = 10
            single_feat_batch = np.zeros((block_size, n_features), dtype="float32")
            for i in range(0, n_features, block_size):
                if i % 10000 == 0:
                    print(i, n_features)
                
                for j in range(block_size):
                    single_feat_batch[j, i+j] = 1
                    
                all_embeddings[i:i+block_size] = (get_embedding_fn(single_feat_batch) -
                                                  null_embedding)
                    
                for j in range(block_size):
                    single_feat_batch[j, i+j] = 0
                
            np.save("/Tmp/carriepl/feature_selection/all_embeddings_fold%i_noise%f.npy" % (which_fold, noise),
                    all_embeddings)

            # Training set results
            train_minibatches = data_generator(x_train, batch_size, noise=noise)
            train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                       monitor_labels, 0)

            # Validation set results
            valid_minibatches = data_generator(x_valid, batch_size, noise=noise)
            valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                       monitor_labels, 0)

            # Stop
            print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() -
                                                         start_time))
            break

        print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time))
        # Anneal the learning rate
        lr.set_value(float(lr.get_value() * learning_rate_annealing))


    # Copy files to loadpath
    if save_path != save_copy:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)
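
data_generator is not included in this snippet; from the call sites it yields (noisy_input, clean_target) minibatches. A hedged reconstruction of a compatible generator, purely as an assumption about its interface:

import numpy as np

def data_generator(x, batch_size, shuffle=False, noise=0.0):
    # Assumed interface: yields (corrupted batch, clean batch) pairs,
    # matching how train_fn/val_fn consume it above.
    idx = np.arange(len(x))
    if shuffle:
        np.random.shuffle(idx)
    for start in range(0, len(x) - batch_size + 1, batch_size):
        clean = x[idx[start:start + batch_size]].astype('float32')
        noisy = clean.copy()
        if noise > 0:
            # Denoising-autoencoder style corruption: zero out a random fraction.
            noisy[np.random.rand(*noisy.shape) < noise] = 0
        yield noisy, clean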
Example #11
def execute(dataset,
            n_hidden_u,
            n_hidden_t_enc,
            n_hidden_t_dec,
            n_hidden_s,
            learning_rate,
            learning_rate_annealing=1.,
            embedding_source=None,
            alpha=1,
            beta=1,
            gamma=1,
            lmd=0,
            encoder_net_init=0.001,
            decoder_net_init=0.001,
            disc_nonlinearity='softmax',
            keep_labels=1.0,
            prec_recall_cutoff=True,
            missing_labels_val=-1.0,
            which_fold=0,
            early_stop_criterion='accuracy',
            save_path='/Tmp/romerosa/DietNetworks/',
            dataset_path='/Tmp/' + os.environ["USER"] + '/datasets/',
            resume=False,
            exp_name=''):

    # Prepare embedding information
    if embedding_source is None:
        embedding_input = 'raw'
    else:
        embedding_input = embedding_source
        embedding_source = os.path.join(
            dataset_path, embedding_input + '_fold' + str(which_fold) + '.npy')

    # Load the dataset
    print("Loading data")
    x_train, y_train, x_valid, y_valid, x_test, y_test, \
        x_unsup, training_labels = mlh.load_data(
            dataset, dataset_path, embedding_source,
            which_fold=which_fold, keep_labels=keep_labels,
            missing_labels_val=missing_labels_val,
            embedding_input=embedding_input)

    if x_unsup is not None:
        n_samples_unsup = x_unsup.shape[1]
    else:
        n_samples_unsup = 0

    # Extract required information from data
    n_samples, n_feats = x_train.shape
    print("Number of features : ", n_feats)
    print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1]))
    n_targets = y_train.shape[1]

    # Set some variables
    batch_size = 138
    beta = gamma if (gamma == 0) else beta

    # Preparing folder to save stuff
    if embedding_source is None:
        embedding_name = embedding_input
    else:
        embedding_name = embedding_source.replace("_", "").split(".")[0]
        exp_name += embedding_name.rsplit('/', 1)[::-1][0] + '_'

    exp_name += mlh.define_exp_name(keep_labels, alpha, beta, gamma, lmd,
                                    n_hidden_u, n_hidden_t_enc, n_hidden_t_dec,
                                    n_hidden_s, which_fold, learning_rate,
                                    decoder_net_init, encoder_net_init,
                                    early_stop_criterion,
                                    learning_rate_annealing)

    print("Experiment: " + exp_name)
    save_path = os.path.join(save_path, dataset, exp_name)
    print(save_path)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # Prepare Theano variables for inputs and targets
    input_var_sup = T.matrix('input_sup')
    input_var_unsup = theano.shared(x_unsup, 'input_unsup')  # x_unsup TBD
    target_var_sup = T.matrix('target_sup')

    # Build model
    print("Building model")

    # Some checkings
    # assert len(n_hidden_u) > 0
    assert len(n_hidden_t_enc) > 0
    assert len(n_hidden_t_dec) > 0
    assert n_hidden_t_dec[-1] == n_hidden_t_enc[-1]

    # Build feature embedding networks (encoding and decoding if gamma > 0)
    nets, embeddings, pred_feat_emb = mh.build_feat_emb_nets(
        embedding_source, n_feats, n_samples_unsup, input_var_unsup,
        n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, gamma, encoder_net_init,
        encoder_net_init, save_path)

    # Build feature embedding reconstruction networks (if alpha > 0, beta > 0)
    nets += mh.build_feat_emb_reconst_nets(
        [alpha, beta], n_samples_unsup, n_hidden_u,
        [n_hidden_t_enc, n_hidden_t_dec], nets,
        [encoder_net_init, encoder_net_init])

    # Supervised network
    discrim_net, hidden_rep = mh.build_discrim_net(
        batch_size, n_feats, input_var_sup, n_hidden_t_enc, n_hidden_s,
        embeddings[0], disc_nonlinearity, n_targets)

    # Reconstruct network
    nets += [
        mh.build_reconst_net(hidden_rep,
                             embeddings[1] if len(embeddings) > 1 else None,
                             n_feats, gamma)
    ]

    # Load best model
    with np.load(os.path.join(save_path, 'dietnets_best.npz')) as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    lasagne.layers.set_all_param_values(
        list(filter(None, nets)) + [discrim_net], param_values)

    print("Building and compiling training functions")

    # Build functions
    predictions, predictions_det = mh.define_predictions(nets, start=2)
    prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net])
    prediction_sup = prediction_sup[0]
    prediction_sup_det = prediction_sup_det[0]

    # Define losses
    # reconstruction losses
    _, reconst_losses_det = mh.define_reconst_losses(
        predictions, predictions_det,
        [input_var_unsup, input_var_unsup, input_var_sup])
    # supervised loss
    _, sup_loss_det = mh.define_sup_loss(disc_nonlinearity, prediction_sup,
                                         prediction_sup_det, keep_labels,
                                         target_var_sup, missing_labels_val)

    # Define inputs
    inputs = [input_var_sup, target_var_sup]

    # Combine losses
    loss_det = sup_loss_det + alpha*reconst_losses_det[0] + \
        beta*reconst_losses_det[1] + gamma*reconst_losses_det[2]

    # Define parameters
    params = lasagne.layers.get_all_params(
        [discrim_net] + list(filter(None, nets)), trainable=True)

    l2_penalty = apply_penalty(params, l2)
    loss_det = loss_det + lmd * l2_penalty

    # Monitoring Labels
    monitor_labels = [
        "reconst. feat. W_enc", "reconst. feat. W_dec", "reconst. loss"
    ]
    monitor_labels = [
        i for i, j in zip(monitor_labels, reconst_losses_det) if j != 0
    ]
    monitor_labels += ["feat. W_enc. mean", "feat. W_enc var"]
    monitor_labels += ["feat. W_dec. mean", "feat. W_dec var"] if \
        (embeddings[1] is not None) else []
    monitor_labels += ["loss. sup.", "total loss"]

    # test function
    val_outputs = reconst_losses_det
    val_outputs = [
        i for i, j in zip(val_outputs, reconst_losses_det) if j != 0
    ]
    val_outputs += [embeddings[0].mean(), embeddings[0].var()]
    val_outputs += [embeddings[1].mean(), embeddings[1].var()] if \
        (embeddings[1] is not None) else []
    val_outputs += [sup_loss_det, loss_det]

    # Compute accuracy and add it to monitoring list
    test_acc, test_pred = mh.define_test_functions(disc_nonlinearity,
                                                   prediction_sup,
                                                   prediction_sup_det,
                                                   target_var_sup)
    monitor_labels.append("accuracy")
    val_outputs.append(test_acc)

    # Compile prediction function
    predict = theano.function([input_var_sup], test_pred)

    # Compile validation function
    val_fn = theano.function(inputs, [prediction_sup_det] + val_outputs,
                             on_unused_input='ignore')

    # Finally, launch the training loop.
    print("Starting testing...")
    test_minibatches = mlh.iterate_minibatches(x_test,
                                               y_test,
                                               batch_size,
                                               shuffle=False)
    test_err, pred, targets = mlh.monitoring(test_minibatches,
                                             "test",
                                             val_fn,
                                             monitor_labels,
                                             prec_recall_cutoff,
                                             return_pred=True)

    lab = targets.argmax(1)
    pred_argmax = pred.argmax(1)

    continent_cat = mh.create_1000_genomes_continent_labels()

    lab_cont = np.zeros(lab.shape)
    pred_cont = np.zeros(pred_argmax.shape)

    for i, c in enumerate(continent_cat):
        for el in c:
            lab_cont[lab == el] = i
            pred_cont[pred_argmax == el] = i

    cm_e = np.zeros((26, 26))
    cm_c = np.zeros((5, 5))

    for i in range(26):
        for j in range(26):
            cm_e[i, j] = ((pred_argmax == i) * (lab == j)).sum()

    for i in range(5):
        for j in range(5):
            cm_c[i, j] = ((pred_cont == i) * (lab_cont == j)).sum()

    np.savez(os.path.join(save_path, 'cm' + str(which_fold) + '.npz'),
             cm_e=cm_e,
             cm_c=cm_c)

    print(os.path.join(save_path, 'cm' + str(which_fold) + '.npz'))
Example #12
    def __init__(self,
                 V,
                 d,
                 max_post_length,
                 max_sentence_length,
                 embeddings=None,
                 GRAD_CLIP=100,
                 num_layers=1,
                 learning_rate=0.01,
                 add_biases=False,
                 rd=100,
                 op=False,
                 word_attn=True,
                 sent_attn=True,
                 highway=False,
                 hops=3,
                 words=True,
                 frames=False,
                 discourse=False):

        self._hyper_params = dict(V=V,
                                  d=d,
                                  max_post_length=max_post_length,
                                  max_sentence_length=max_sentence_length,
                                  GRAD_CLIP=GRAD_CLIP,
                                  num_layers=num_layers,
                                  learning_rate=learning_rate,
                                  add_biases=add_biases,
                                  rd=rd,
                                  op=op,
                                  word_attn=word_attn,
                                  sent_attn=sent_attn,
                                  highway=highway,
                                  hops=hops)

        print(V, d, max_post_length, max_sentence_length)

        #S x N matrix of sentences (aka list of word indices)
        #B x S x N tensor of batches of posts
        idxs_rr = T.itensor3('idxs_rr')
        idxs_op = T.itensor3('idxs_op')
        #B x S x N matrix
        mask_rr_w = T.itensor3('mask_rr_w')
        mask_op_w = T.itensor3('mask_op_w')
        #B x S matrix
        mask_rr_s = T.imatrix('mask_rr_s')
        mask_op_s = T.imatrix('mask_op_s')
        #B-long vector
        gold = T.ivector('gold')
        lambda_w = T.scalar('lambda_w')
        p_dropout = T.scalar('p_dropout')

        biases = T.matrix('biases')
        weights = T.ivector('weights')

        #now use this as an input to an LSTM
        l_idxs_rr = lasagne.layers.InputLayer(shape=(None, max_post_length,
                                                     max_sentence_length),
                                              input_var=idxs_rr)
        l_mask_rr_w = lasagne.layers.InputLayer(shape=(None, max_post_length,
                                                       max_sentence_length),
                                                input_var=mask_rr_w)
        l_mask_rr_s = lasagne.layers.InputLayer(shape=(None, max_post_length),
                                                input_var=mask_rr_s)
        l_idxs_op = lasagne.layers.InputLayer(shape=(None, max_post_length,
                                                     max_sentence_length),
                                              input_var=idxs_op)
        l_mask_op_w = lasagne.layers.InputLayer(shape=(None, max_post_length,
                                                       max_sentence_length),
                                                input_var=mask_op_w)
        l_mask_op_s = lasagne.layers.InputLayer(shape=(None, max_post_length),
                                                input_var=mask_op_s)

        if add_biases:
            l_biases = lasagne.layers.InputLayer(shape=(None, 1),
                                                 input_var=biases)
        #now B x S x N x D
        if embeddings is not None:
            l_emb_rr_w = lasagne.layers.EmbeddingLayer(
                l_idxs_rr, V, d, W=lasagne.utils.floatX(embeddings))
        else:
            l_emb_rr_w = lasagne.layers.EmbeddingLayer(l_idxs_rr, V, d)

        #now B x S x D
        if words:
            if word_attn:
                l_attn_rr_w = AttentionWordLayer([l_emb_rr_w, l_mask_rr_w], d)
                l_avg_rr_s = WeightedAverageWordLayer(
                    [l_emb_rr_w, l_attn_rr_w])
            else:
                l_avg_rr_s = AverageWordLayer([l_emb_rr_w, l_mask_rr_w])
            concats = [l_avg_rr_s]
            inputs = [idxs_rr, mask_rr_w, mask_rr_s]
        else:
            concats = []
            inputs = [mask_rr_w, mask_rr_s]

        if frames:
            idxs_frames_rr = T.itensor3('idxs_frames_rr')
            inputs.append(idxs_frames_rr)
            l_idxs_frames_rr = lasagne.layers.InputLayer(
                shape=(None, max_post_length, max_sentence_length),
                input_var=idxs_frames_rr)
            l_emb_frames_rr_w = lasagne.layers.EmbeddingLayer(l_idxs_frames_rr,
                                                              V,
                                                              d,
                                                              W=l_emb_rr_w.W)
            if word_attn:
                l_attn_rr_frames = AttentionWordLayer(
                    [l_emb_frames_rr_w, l_mask_rr_w], d)
                l_avg_rr_s_frames = WeightedAverageWordLayer(
                    [l_emb_frames_rr_w, l_attn_rr_frames])
            else:
                l_avg_rr_s_frames = AverageWordLayer(
                    [l_emb_frames_rr_w, l_mask_rr_w])
            concats.append(l_avg_rr_s_frames)

        if discourse:
            idxs_disc_rr = T.imatrix('idxs_disc_rr')
            inputs.append(idxs_disc_rr)
            # l_idxs_disc_rr was missing in the original snippet; the shape is an
            # assumption matching the (batch, max_post_length) imatrix above.
            l_idxs_disc_rr = lasagne.layers.InputLayer(
                shape=(None, max_post_length), input_var=idxs_disc_rr)
            l_emb_disc_rr = lasagne.layers.EmbeddingLayer(l_idxs_disc_rr,
                                                          V,
                                                          d,
                                                          W=l_emb_rr_w.W)
            concats.append(l_emb_disc_rr)

        l_avg_rr_s = lasagne.layers.ConcatLayer(concats, axis=-1)

        if highway:
            l_avg_rr_s = HighwayLayer(
                l_avg_rr_s,
                num_units=l_avg_rr_s.output_shape[-1],
                nonlinearity=lasagne.nonlinearities.rectify,
                num_leading_axes=2)

        #separate embeddings for OP
        if embeddings is not None:
            l_emb_op_w = lasagne.layers.EmbeddingLayer(
                l_idxs_op, V, d, W=lasagne.utils.floatX(embeddings))
        else:
            l_emb_op_w = lasagne.layers.EmbeddingLayer(l_idxs_op, V, d)

        if op:
            if words:
                l_attn_op_w = AttentionWordLayer([l_emb_op_w, l_mask_op_w], d)
                l_avg_op_s = WeightedAverageWordLayer(
                    [l_emb_op_w, l_attn_op_w])
                concats = [l_avg_op_s]
                inputs.extend([idxs_op, mask_op_w, mask_op_s])
            else:
                concats = []
                inputs.extend([mask_op_w, mask_op_s])

            if frames:
                idxs_frames_op = T.itensor3('idxs_frames_op')
                inputs.append(idxs_frames_op)
                l_idxs_frames_op = lasagne.layers.InputLayer(
                    shape=(None, max_post_length, max_sentence_length),
                    input_var=idxs_frames_op)
                l_emb_frames_op_w = lasagne.layers.EmbeddingLayer(
                    l_idxs_frames_op, V, d, W=l_emb_op_w.W)
                l_attn_op_frames = AttentionWordLayer(
                    [l_emb_frames_op_w, l_mask_op_w], d)
                l_avg_op_s_frames = WeightedAverageWordLayer(
                    [l_emb_frames_op_w, l_attn_op_frames])
                concats.append(l_avg_op_s_frames)

            if discourse:
                idxs_disc_op = T.imatrix('idxs_disc_op')
                inputs.append(idxs_disc_op)
                # l_idxs_disc_op was missing in the original snippet; the shape is an
                # assumption matching the (batch, max_post_length) imatrix above.
                l_idxs_disc_op = lasagne.layers.InputLayer(
                    shape=(None, max_post_length), input_var=idxs_disc_op)
                l_emb_disc_op = lasagne.layers.EmbeddingLayer(l_idxs_disc_op,
                                                              V,
                                                              d,
                                                              W=l_emb_op_w.W)
                concats.append(l_emb_disc_op)

            l_avg_op_s = lasagne.layers.ConcatLayer(concats, axis=-1)

            #bidirectional LSTM
            l_lstm_op_s_fwd = lasagne.layers.LSTMLayer(
                l_avg_op_s,
                rd,
                nonlinearity=lasagne.nonlinearities.tanh,
                grad_clipping=GRAD_CLIP,
                mask_input=l_mask_op_s)
            l_lstm_op_s_rev = lasagne.layers.LSTMLayer(
                l_avg_op_s,
                rd,
                nonlinearity=lasagne.nonlinearities.tanh,
                grad_clipping=GRAD_CLIP,
                mask_input=l_mask_op_s,
                backwards=True)
            l_avg_op_s = lasagne.layers.ConcatLayer(
                [l_lstm_op_s_fwd, l_lstm_op_s_rev], axis=-1)
            l_attn_op_s = AttentionSentenceLayer([l_avg_op_s, l_mask_op_s], d)
            l_op_avg = WeightedAverageSentenceLayer([l_avg_op_s, l_attn_op_s])

        #bidirectional LSTM
        l_lstm_rr_s_fwd = lasagne.layers.LSTMLayer(
            l_avg_rr_s,
            rd,
            nonlinearity=lasagne.nonlinearities.tanh,
            grad_clipping=GRAD_CLIP,
            mask_input=l_mask_rr_s)
        l_lstm_rr_s_rev = lasagne.layers.LSTMLayer(
            l_avg_rr_s,
            rd,
            nonlinearity=lasagne.nonlinearities.tanh,
            grad_clipping=GRAD_CLIP,
            mask_input=l_mask_rr_s,
            backwards=True)

        #for attention or average
        l_lstm_rr_s = lasagne.layers.ConcatLayer(
            [l_lstm_rr_s_fwd, l_lstm_rr_s_rev], axis=-1)

        #now memory network
        init_memory_response = AverageSentenceLayer([l_lstm_rr_s, l_mask_rr_s])
        if op:
            init_memory_response = lasagne.layers.ConcatLayer(
                [init_memory_response, l_op_avg])
        l_memory = MyConcatLayer([l_lstm_rr_s, init_memory_response])

        if sent_attn:
            l_attn_rr_s = AttentionSentenceLayer([l_lstm_rr_s, l_mask_rr_s], d)
            l_rr_avg = WeightedAverageSentenceLayer([l_lstm_rr_s, l_attn_rr_s])
        else:
            l_rr_avg = AverageSentenceLayer([l_lstm_rr_s, l_mask_rr_s])

        for i in range(hops):
            l_attn_rr_s = AttentionSentenceLayer([l_memory, l_mask_rr_s], d)
            l_rr_avg = WeightedAverageSentenceLayer([l_memory, l_attn_rr_s])
            if op:
                l_rr_avg = lasagne.layers.ConcatLayer([l_rr_avg, l_op_avg])
            l_memory = MyConcatLayer([l_lstm_rr_s, l_rr_avg])

        l_hid = l_rr_avg

        for num_layer in range(num_layers):
            l_hid = lasagne.layers.DenseLayer(
                l_hid,
                num_units=d,
                nonlinearity=lasagne.nonlinearities.rectify)

            #now B x 1
            l_hid = lasagne.layers.DropoutLayer(l_hid, p_dropout)

        if add_biases:
            l_hid = lasagne.layers.ConcatLayer([l_hid, l_biases], axis=-1)
            inputs.append(biases)

        self.network = lasagne.layers.DenseLayer(l_hid,
                                                 num_units=1,
                                                 nonlinearity=T.nnet.sigmoid)

        predictions = lasagne.layers.get_output(self.network).ravel()

        xent = lasagne.objectives.binary_crossentropy(predictions, gold)
        loss = lasagne.objectives.aggregate(xent,
                                            weights,
                                            mode='normalized_sum')

        params = lasagne.layers.get_all_params(self.network, trainable=True)

        #add regularization
        loss += lambda_w * apply_penalty(params, l2)

        updates = lasagne.updates.nesterov_momentum(
            loss, params, learning_rate=learning_rate, momentum=0.9)

        print('compiling...')
        train_outputs = loss
        self.train = theano.function(inputs +
                                     [gold, lambda_w, p_dropout, weights],
                                     train_outputs,
                                     updates=updates,
                                     allow_input_downcast=True,
                                     on_unused_input='warn')
        print('...')
        test_predictions = lasagne.layers.get_output(
            self.network, deterministic=True).ravel()

        self.predict = theano.function(inputs,
                                       test_predictions,
                                       allow_input_downcast=True,
                                       on_unused_input='warn')

        test_acc = T.mean(T.eq(test_predictions > .5, gold),
                          dtype=theano.config.floatX)
        print('...')
        test_loss = lasagne.objectives.binary_crossentropy(
            test_predictions, gold).mean()
        self.validate = theano.function(
            inputs + [gold, lambda_w, p_dropout, weights], [loss, test_acc],
            on_unused_input='warn')

        print('...')

        #attention for words, B x S x N
        print('attention...')
        word_attention = lasagne.layers.get_output(
            AttentionWordLayer([l_emb_rr_w, l_mask_rr_w],
                               d,
                               W_w=l_attn_rr_w.W_w,
                               u_w=l_attn_rr_w.u_w,
                               b_w=l_attn_rr_w.b_w,
                               normalized=False))
        self.word_attention = theano.function([idxs_rr, mask_rr_w],
                                              word_attention,
                                              allow_input_downcast=True,
                                              on_unused_input='warn')

        #attention for sentences, B x S
        print('...')
        sentence_attention = lasagne.layers.get_output(l_attn_rr_s)
        if add_biases:
            inputs = inputs[:-1]
        self.sentence_attention = theano.function(inputs,
                                                  sentence_attention,
                                                  allow_input_downcast=True,
                                                  on_unused_input='warn')
        print('finished compiling...')
Example #13
    def __init__(self, 
                W=None, 
                W_path=None, 
                K=300, 
                num_hidden=256,
                batch_size=None,
                grad_clip=100., 
                max_sent_len=200, 
                num_classes=2, 
                **kwargs):

        W = W
        V = len(W)
        K = int(K)
        print("this is the value of K: {}\n".format(K))
        num_hidden = int(num_hidden)
        batch_size = int(batch_size)
        grad_clip = int(grad_clip)
        max_seq_len = int(max_sent_len)
        max_post_len = int(kwargs["max_post_len"])
        num_classes = int(num_classes)    
        dropout = float(kwargs["dropout"])
        lambda_w = float(kwargs["lambda_w"])
        separate_attention_context = str_to_bool(kwargs["separate_attention_context"])
        separate_attention_response = str_to_bool(kwargs["separate_attention_response"])
        interaction = str_to_bool(kwargs["interaction"])
        separate_attention_context_words = str_to_bool(kwargs["separate_attention_context_words"])
        separate_attention_response_words = str_to_bool(kwargs["separate_attention_response_words"])

        print("this is the separate_attention_context: {}\n".format(separate_attention_context))

        print("this is the separate_attention_response: {}\n".format(separate_attention_response))
        print("this is the separate_attention_context_words: {}\n".format(separate_attention_context_words))

        print("this is the separate_attention_response_words: {}\n".format(separate_attention_response_words))
        print("this is the interaction: {}\n".format(interaction))


        #S x N matrix of sentences (aka list of word indices)
        #B x S x N tensor of batches of responses
        idxs_context = T.itensor3('idxs_context') #imatrix
        #B x S x N matrix
        mask_context_words = T.itensor3('mask_context_words')
        #B x S matrix
        mask_context_sents = T.imatrix('mask_context_sents')
        #B x S x N tensor of batches of responses
        idxs_response = T.itensor3('idxs_response') #imatrix
        #B x S x N matrix
        mask_response_words = T.itensor3('mask_response_words')
        #B x S matrix
        mask_response_sents = T.imatrix('mask_response_sents')
        #B-long vector
        y = T.ivector('y')
        # TODO
        # Add biases, other params? 
        #lambda_w = T.scalar('lambda_w')
        #p_dropout = T.scalar('p_dropout')

        #biases = T.matrix('biases')
        #weights = T.ivector('weights')
        
        inputs = [idxs_response, mask_response_words, mask_response_sents]
        # TODO 
        # change inputs, function calls
        #idxs_context, mask_context_words, mask_context_sents
                
        #now use this as an input to an LSTM
        l_idxs_context = lasagne.layers.InputLayer(shape=(None, max_post_len, max_sent_len),
                                            input_var=idxs_context)
        l_mask_context_words = lasagne.layers.InputLayer(shape=(None, max_post_len, max_sent_len),input_var=mask_context_words)
        l_mask_context_sents = lasagne.layers.InputLayer(shape=(None, max_post_len),
                                                input_var=mask_context_sents)

        #if add_biases:
        #    l_biases = lasagne.layers.InputLayer(shape=(None,1),
                                                 # input_var=biases)
        #now B x S x N x D
        #l_emb = lasagne.layers.EmbeddingLayer(l_in, input_size=V, output_size=K, W=W)
        l_emb_rr_w_context = lasagne.layers.EmbeddingLayer(l_idxs_context, input_size=V, output_size=K,
                                                   W=W)
        l_emb_rr_w_context.params[l_emb_rr_w_context.W].remove('trainable')
#        l_hid_context = l_emb_rr_w
        #CBOW w/attn
        #now B x S x D
        if separate_attention_context_words:
            l_attention_words_context = AttentionWordLayer([l_emb_rr_w_context, l_mask_context_words], K)
            #print(" attention word layer shape: {}\n".format(get_output_shape(l_attention_words_context)))
            l_avg_rr_s_words_context = WeightedAverageWordLayer([l_emb_rr_w_context,l_attention_words_context])
        else:
            l_avg_rr_s_words_context = WeightedAverageWordLayer([l_emb_rr_w_context, l_mask_context_words])
        ##concats = l_avg_rr_s_words_context
        ##concats = [l_avg_rr_s_words_context]
        l_avg_rr_s_context = l_avg_rr_s_words_context

        # concats not relevant here, was just frames, sentiment etc for other task.
            
            
        #l_avg_rr_s_context = lasagne.layers.ConcatLayer(concats, axis=-1)

        # TODO
        # add highway ?
        #add MLP
        #if highway:
        #    l_avg_rr_s_context = HighwayLayer(l_avg_rr_s_context, num_units=l_avg_rr_s_context.output_shape[-1],
        #                              nonlinearity=lasagne.nonlinearities.rectify,
        #                              num_leading_axes=2)
        #    
        l_lstm_rr_s_context = lasagne.layers.LSTMLayer(l_avg_rr_s_context, num_hidden,
                                               nonlinearity=lasagne.nonlinearities.tanh,
                                               grad_clipping=grad_clip,
                                               mask_input=l_mask_context_sents)
        
        l_lstm_rr_s_context = lasagne.layers.DropoutLayer(l_lstm_rr_s_context,p=dropout)
        if interaction:
            #l_hid_context = l_lstm_rr_s_context
            if separate_attention_context:
                print("separate attention context\n")
                l_attn_rr_s_context = AttentionSentenceLayer([l_lstm_rr_s_context, l_mask_context_sents], num_hidden)        
                l_lstm_rr_avg_context = WeightedAverageSentenceLayer([l_lstm_rr_s_context, l_attn_rr_s_context])
                print(" attention weighted average sentence layer shape: {}\n".format(get_output_shape(l_lstm_rr_avg_context)))
            else:
                print("just averaged context without attention\n")
                l_lstm_rr_avg_context = WeightedAverageSentenceLayer([l_lstm_rr_s_context, l_mask_context_sents])
                print(" attention weighted average sentence layer shape: {}\n".format(get_output_shape(l_lstm_rr_avg_context)))

            l_hid_context = l_lstm_rr_avg_context
            print("interaction\n")
        else:
            print("no interaction!!! \n")
            #LSTM w/ attn
            #now B x D
            if separate_attention_context:
                print("separate attention context\n")
                l_attn_rr_s_context = AttentionSentenceLayer([l_lstm_rr_s_context, l_mask_context_sents], num_hidden)        
                l_lstm_rr_avg_context = WeightedAverageSentenceLayer([l_lstm_rr_s_context, l_attn_rr_s_context])
                print(" attention weighted average sentence layer shape: {}\n".format(get_output_shape(l_lstm_rr_avg_context)))
            else:
                print("just averaged context without attention\n")
                l_lstm_rr_avg_context = WeightedAverageSentenceLayer([l_lstm_rr_s_context, l_mask_context_sents])
                print(" attention weighted average sentence layer shape: {}\n".format(get_output_shape(l_lstm_rr_avg_context)))

            l_hid_context = l_lstm_rr_avg_context

        # TODO 
        # change inputs, function calls
        #idxs_context, mask_context_words, mask_context_sents
                
        #now use this as an input to an LSTM
        l_idxs_response = lasagne.layers.InputLayer(shape=(None, max_post_len, max_sent_len),
                                            input_var=idxs_response)
        l_mask_response_words = lasagne.layers.InputLayer(shape=(None, max_post_len, max_sent_len),input_var=mask_response_words)
        l_mask_response_sents = lasagne.layers.InputLayer(shape=(None, max_post_len),
                                                input_var=mask_response_sents)

        #if add_biases:
        #    l_biases = lasagne.layers.InputLayer(shape=(None,1),
                                                 # input_var=biases)
        #now B x S x N x D
        #l_emb = lasagne.layers.EmbeddingLayer(l_in, input_size=V, output_size=K, W=W)
        l_emb_rr_w_response = lasagne.layers.EmbeddingLayer(l_idxs_response, input_size=V, output_size=K,
                                                   W=W)
        l_emb_rr_w_response.params[l_emb_rr_w_response.W].remove('trainable')
#        l_hid_response = l_emb_rr_w
        #CBOW w/attn
        #now B x S x D
        if separate_attention_response_words:
            l_attention_words_response = AttentionWordLayer([l_emb_rr_w_response, l_mask_response_words], K)
            #print(" attention word layer shape: {}\n".format(get_output_shape(l_attention_words_response)))
            l_avg_rr_s_words_response = WeightedAverageWordLayer([l_emb_rr_w_response,l_attention_words_response])
        else:
            l_avg_rr_s_words_response = WeightedAverageWordLayer([l_emb_rr_w_response, l_mask_response_words])
        #l_attention_words_response = AttentionWordLayer([l_emb_rr_w_response, l_mask_response_words], K)
        #print(" attention word layer shape: {}\n".format(get_output_shape(l_attention_words_response)))
        #l_avg_rr_s_words_response = WeightedAverageWordLayer([l_emb_rr_w_response, l_mask_response_words])
        ##concats = l_avg_rr_s_words_response
        ##concats = [l_avg_rr_s_words_response]
        l_avg_rr_s_response = l_avg_rr_s_words_response

        # concats not relevant here, was just frames, sentiment etc for other task.
            
            
        #l_avg_rr_s_response = lasagne.layers.ConcatLayer(concats, axis=-1)

        # TODO
        # add highway ?
        #add MLP
        #if highway:
        #    l_avg_rr_s_response = HighwayLayer(l_avg_rr_s_response, num_units=l_avg_rr_s_response.output_shape[-1],
        #                              nonlinearity=lasagne.nonlinearities.rectify,
        #                              num_leading_axes=2)
        #    
        if interaction:
            print("interaction\n")
            # Use the context document vector to initialize the response LSTM's cell
            # state, so the response encoding is conditioned on the context.
            l_lstm_rr_s_response = lasagne.layers.LSTMLayer(l_avg_rr_s_response, num_hidden,
                                                   nonlinearity=lasagne.nonlinearities.tanh,
                                                   grad_clipping=grad_clip,cell_init=l_hid_context,
                                                   mask_input=l_mask_response_sents)
        else:
            l_lstm_rr_s_response = lasagne.layers.LSTMLayer(l_avg_rr_s_response, num_hidden,
                                                   nonlinearity=lasagne.nonlinearities.tanh,
                                                   grad_clipping=grad_clip,
                                                   mask_input=l_mask_response_sents)
            
        l_lstm_rr_s_response = lasagne.layers.DropoutLayer(l_lstm_rr_s_response,p=dropout)
        #LSTM w/ attn
        #now B x D
        if separate_attention_response:
            print("separate attention on the response\n")
            l_attn_rr_s_response = AttentionSentenceLayer([l_lstm_rr_s_response, l_mask_response_sents], num_hidden)        
            l_lstm_rr_avg_response = WeightedAverageSentenceLayer([l_lstm_rr_s_response, l_attn_rr_s_response])
            print(" attention weighted average sentence layer shape: {}\n".format(get_output_shape(l_lstm_rr_avg_response)))
        else:
            print("just average response without attention\n")
            l_lstm_rr_avg_response = WeightedAverageSentenceLayer([l_lstm_rr_s_response, l_mask_response_sents])
            print(" attention weighted average sentence layer shape: {}\n".format(get_output_shape(l_lstm_rr_avg_response)))

        l_hid_response = l_lstm_rr_avg_response

        # TODO
        # add more layers? biases? 
        #for num_layer in range(num_layers):
        #    l_hid_response = lasagne.layers.DenseLayer(l_hid_response, num_units=rd,
        #                                  nonlinearity=lasagne.nonlinearities.rectify)

        #    #now B x 1
        #    l_hid_response = lasagne.layers.DropoutLayer(l_hid_response, p_dropout)
        #    
        #if add_biases:
        #    l_hid_response = lasagne.layers.ConcatLayer([l_hid_response, l_biases], axis=-1)
        #    inputs.append(biases)
        #    
        #self.network = lasagne.layers.DenseLayer(l_hid_response, num_units=2,
        #                                         nonlinearity=T.nnet.sigmoid)
        #
        #predictions = lasagne.layers.get_output(self.network).ravel()
        #
        #xent = lasagne.objectives.binary_crossentropy(predictions, gold)
        #loss = lasagne.objectives.aggregate(xent, weights, mode='normalized_sum')
        #
        #params = lasagne.layers.get_all_params(self.network, trainable=True)
        #
        # TODO
        ##add regularization? different gradient technique?
        #loss += lambda_w*apply_penalty(params, l2)

        #updates = lasagne.updates.nesterov_momentum(loss, params,
        #                                            learning_rate=learning_rate, momentum=0.9)

        #print('compiling...')
        #train_outputs = loss
        #self.train = theano.function(inputs + [gold, lambda_w, p_dropout, weights],
        #                             train_outputs,
        #                              updates=updates,
        #                              allow_input_downcast=True,
        #                              on_unused_input='warn')
        #print('...')
        #test_predictions = lasagne.layers.get_output(self.network, deterministic=True).ravel()
        #
        #self.predict = theano.function(inputs,
        #                               test_predictions,
        #                               allow_input_downcast=True,
        #                              on_unused_input='warn')

        #test_acc = T.mean(T.eq(test_predictions > .5, gold),
        #                                    dtype=theano.config.floatX)
        #print('...')
        #test_loss = lasagne.objectives.binary_crossentropy(test_predictions,
        #                                                    gold).mean()        
        #self.validate = theano.function(inputs + [gold, lambda_w, p_dropout, weights],
        #                                [loss, test_acc],
        #                              on_unused_input='warn')

        # Build the classifier on top of the document representations: when
        # interaction is enabled the context has already been folded into the
        # response LSTM (via cell_init), so only l_hid_response is used; otherwise
        # the context and response vectors are concatenated.
    
    
        if interaction:
            l_concat = l_hid_response
        else:
            l_concat = lasagne.layers.ConcatLayer([l_hid_context,l_hid_response])
        network = lasagne.layers.DenseLayer(
            l_concat,
            num_units=num_classes,
            nonlinearity=lasagne.nonlinearities.softmax
        )

        self.network = network
        output = lasagne.layers.get_output(network)

        # Define objective function (cost) to minimize, mean crossentropy error
        cost = lasagne.objectives.categorical_crossentropy(output, y).mean()

        # Compute gradient updates: update only the trainable parameters, so the
        # frozen embedding matrices stay fixed, and apply the L2 penalty to the
        # regularizable parameters (the weights).
        params = lasagne.layers.get_all_params(network, trainable=True)
        reg_params = lasagne.layers.get_all_params(network, regularizable=True)
        cost += lambda_w*apply_penalty(reg_params, l2)
        # grad_updates = lasagne.updates.nesterov_momentum(cost, params,learn_rate)
        grad_updates = lasagne.updates.adam(cost, params)  # Adam with default hyper-parameters
        #learn_rate = .01
        #grad_updates = lasagne.updates.adadelta(cost, params, learn_rate)
        test_output = lasagne.layers.get_output(network, deterministic=True)
        val_cost_fn = lasagne.objectives.categorical_crossentropy(
            test_output, y).mean()
        preds = T.argmax(test_output, axis=1)

        val_acc_fn = T.mean(T.eq(preds, y),
                            dtype=theano.config.floatX)
        fn_inputs = [idxs_context, mask_context_words, mask_context_sents,
                     idxs_response, mask_response_words, mask_response_sents]
        self.val_fn = theano.function(fn_inputs + [y], [val_cost_fn, val_acc_fn, preds],
                                      allow_input_downcast=True, on_unused_input='warn')
        # Compile train, test and prediction functions
        print("Compiling training, testing, prediction functions")
        self.train = theano.function(fn_inputs + [y], cost, updates=grad_updates,
                                     allow_input_downcast=True, on_unused_input='warn')
        self.test = theano.function(fn_inputs + [y], val_acc_fn,
                                    allow_input_downcast=True, on_unused_input='warn')
        self.pred = theano.function(fn_inputs, preds,
                                    allow_input_downcast=True, on_unused_input='warn')
        if separate_attention_response:
            sentence_attention = lasagne.layers.get_output(l_attn_rr_s_response, deterministic=True)
            #if add_biases:
            #    inputs = inputs[:-1]
            self.sentence_attention_response = theano.function([idxs_context, mask_context_words, mask_context_sents,idxs_response, mask_response_words, mask_response_sents],
                                                      [sentence_attention, preds],
                                                      allow_input_downcast=True,
                                                      on_unused_input='warn')
        if separate_attention_context:
            sentence_attention_context = lasagne.layers.get_output(l_attn_rr_s_context, deterministic=True)
            #if add_biases:
            #    inputs = inputs[:-1]
            self.sentence_attention_context = theano.function([idxs_context, mask_context_words, mask_context_sents,idxs_response, mask_response_words, mask_response_sents],
                                                      [sentence_attention_context,preds],
                                                      allow_input_downcast=True,
                                                      on_unused_input='warn')
        if separate_attention_response_words:
            word_attention = lasagne.layers.get_output(l_attention_words_response, deterministic=True) 
            self.sentence_attention_response_words = theano.function(
                [idxs_context, mask_context_words, mask_context_sents,
                 idxs_response, mask_response_words, mask_response_sents],
                [word_attention, preds],
                allow_input_downcast=True,
                on_unused_input='warn')
        if separate_attention_context_words:
            word_attention_context = lasagne.layers.get_output(l_attention_words_context, deterministic = True) 
            self.sentence_attention_context_words = theano.function(
                [idxs_context, mask_context_words, mask_context_sents,
                 idxs_response, mask_response_words, mask_response_sents],
                [word_attention_context, preds],
                allow_input_downcast=True,
                on_unused_input='warn')
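
# --- Usage sketch (not part of the original example) ----------------------------
# The constructor above compiles self.train / self.test / self.pred, all of which
# take padded B x S x N index tensors plus word- and sentence-level masks for both
# the context and the response.  Everything below is illustrative: the class name
# `AttentionModel`, the random data and the string flag values (which str_to_bool
# is assumed to parse) are assumptions, not part of the original code.
import numpy as np

B, S, N, V, K = 4, 10, 20, 5000, 300                # batch, sents, words, vocab, emb dim
W_emb = np.random.randn(V, K).astype('float32')     # stand-in for pretrained embeddings

model = AttentionModel(W=W_emb, K=K, num_hidden=256, batch_size=B,
                       max_sent_len=N, max_post_len=S, num_classes=2,
                       dropout=0.25, lambda_w=1e-4, interaction='False',
                       separate_attention_context='True',
                       separate_attention_response='True',
                       separate_attention_context_words='False',
                       separate_attention_response_words='False')

idxs = np.random.randint(0, V, size=(B, S, N)).astype('int32')
mask_w = np.ones((B, S, N), dtype='int32')          # word-level mask
mask_s = np.ones((B, S), dtype='int32')             # sentence-level mask
y = np.random.randint(0, 2, size=B).astype('int32')

cost = model.train(idxs, mask_w, mask_s, idxs, mask_w, mask_s, y)
preds = model.pred(idxs, mask_w, mask_s, idxs, mask_w, mask_s)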
def execute(dataset,
            n_hidden_t_enc,
            n_hidden_s,
            num_epochs=500,
            learning_rate=.001,
            learning_rate_annealing=1.0,
            gamma=1,
            lmd=0.,
            disc_nonlinearity="sigmoid",
            keep_labels=1.0,
            prec_recall_cutoff=True,
            missing_labels_val=-1.0,
            which_fold=1,
            early_stop_criterion='loss',
            save_path='/Tmp/romerosa/DietNetworks/',
            save_copy='/Tmp/romerosa/DietNetworks/',
            dataset_path='/Tmp/carriepl/datasets/',
            resume=False):

    # Load the dataset
    print("Loading data")
    x_train, y_train, x_valid, y_valid, x_test, y_test, \
        x_unsup, training_labels = mlh.load_data(
            dataset, dataset_path, None,
            which_fold=which_fold, keep_labels=keep_labels,
            missing_labels_val=missing_labels_val,
            embedding_input='raw')

    # Extract required information from data
    n_samples, n_feats = x_train.shape
    print("Number of features : ", n_feats)
    print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1]))
    n_targets = y_train.shape[1]

    # Set some variables
    batch_size = 128

    # Preparing folder to save stuff
    exp_name = 'basic_' + mlh.define_exp_name(
        keep_labels, 0, 0, gamma, lmd, [], n_hidden_t_enc, [], n_hidden_s,
        which_fold, learning_rate, 0, 0, early_stop_criterion,
        learning_rate_annealing)
    print("Experiment: " + exp_name)
    save_path = os.path.join(save_path, dataset, exp_name)
    save_copy = os.path.join(save_copy, dataset, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # Prepare Theano variables for inputs and targets
    input_var_sup = T.matrix('input_sup')
    target_var_sup = T.matrix('target_sup')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')

    # Build model
    print("Building model")
    discrim_net = InputLayer((None, n_feats), input_var_sup)
    discrim_net = DenseLayer(discrim_net,
                             num_units=n_hidden_t_enc[-1],
                             nonlinearity=rectify)

    # Reconstruct the input using dec_feat_emb
    if gamma > 0:
        reconst_net = DenseLayer(discrim_net,
                                 num_units=n_feats,
                                 nonlinearity=linear)
        nets = [reconst_net]
    else:
        nets = [None]

    # Add supervised hidden layers
    for hid in n_hidden_s:
        discrim_net = DropoutLayer(discrim_net)
        discrim_net = DenseLayer(discrim_net, num_units=hid)

    assert disc_nonlinearity in ["sigmoid", "linear", "rectify", "softmax"]
    discrim_net = DropoutLayer(discrim_net)
    discrim_net = DenseLayer(discrim_net,
                             num_units=n_targets,
                             nonlinearity=getattr(lasagne.nonlinearities,
                                                  disc_nonlinearity))

    print("Building and compiling training functions")

    # Build and compile training functions
    predictions, predictions_det = mh.define_predictions(nets, start=0)
    prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net])
    prediction_sup = prediction_sup[0]
    prediction_sup_det = prediction_sup_det[0]

    # Define losses
    # reconstruction losses
    reconst_losses, reconst_losses_det = mh.define_reconst_losses(
        predictions, predictions_det, [input_var_sup])
    # supervised loss
    sup_loss, sup_loss_det = mh.define_sup_loss(disc_nonlinearity,
                                                prediction_sup,
                                                prediction_sup_det,
                                                keep_labels, target_var_sup,
                                                missing_labels_val)

    inputs = [input_var_sup, target_var_sup]
    params = lasagne.layers.get_all_params([discrim_net] + nets,
                                           trainable=True)

    print('Number of params: ' + str(len(params)))

    # Combine losses
    loss = sup_loss + gamma * reconst_losses[0]
    loss_det = sup_loss_det + gamma * reconst_losses_det[0]
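
    # Weight decay: an L2 penalty over the trainable parameters, scaled by lmd, is
    # added to both the stochastic and deterministic losses below.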

    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty
    loss_det = loss_det + lmd * l2_penalty

    # Compute network updates
    updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr)
    # updates = lasagne.updates.sgd(loss,
    #                               params,
    #                               learning_rate=lr)
    # updates = lasagne.updates.momentum(loss, params,
    #                                    learning_rate=lr, momentum=0.0)

    # Apply norm constraints on the weights
    for k in updates.keys():
        if updates[k].ndim == 2:
            updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

    # Compile training function
    train_fn = theano.function(inputs,
                               loss,
                               updates=updates,
                               on_unused_input='ignore')

    # Monitoring Labels
    monitor_labels = ["reconst. loss"]
    monitor_labels = [
        i for i, j in zip(monitor_labels, reconst_losses) if j != 0
    ]
    monitor_labels += ["loss. sup.", "total loss"]

    # Build and compile test function
    val_outputs = reconst_losses_det
    val_outputs = [i for i, j in zip(val_outputs, reconst_losses) if j != 0]
    val_outputs += [sup_loss_det, loss_det]

    # Compute accuracy and add it to monitoring list
    test_acc, test_pred = mh.define_test_functions(disc_nonlinearity,
                                                   prediction_sup,
                                                   prediction_sup_det,
                                                   target_var_sup)
    monitor_labels.append("accuracy")
    val_outputs.append(test_acc)

    # Compile prediction function
    predict = theano.function([input_var_sup], test_pred)

    # Compile validation function
    val_fn = theano.function(inputs, [prediction_sup_det] + val_outputs,
                             on_unused_input='ignore')

    # Finally, launch the training loop.
    print("Starting training...")

    # Some variables
    max_patience = 100
    patience = 0
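    # "patience" counts consecutive epochs without improvement on the early-stopping
    # criterion; training ends once it reaches max_patience (or at the last epoch).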

    train_monitored = []
    valid_monitored = []
    train_loss = []

    # Pre-training monitoring
    print("Epoch 0 of {}".format(num_epochs))

    train_minibatches = mlh.iterate_minibatches(x_train,
                                                y_train,
                                                batch_size,
                                                shuffle=False)
    train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                               monitor_labels, prec_recall_cutoff)

    valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                y_valid,
                                                batch_size,
                                                shuffle=False)
    valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                               monitor_labels, prec_recall_cutoff)

    # Training loop
    start_training = time.time()
    for epoch in range(num_epochs):
        start_time = time.time()
        print("Epoch {} of {}".format(epoch + 1, num_epochs))
        nb_minibatches = 0
        loss_epoch = 0

        # Train pass
        for batch in mlh.iterate_minibatches(x_train,
                                             training_labels,
                                             batch_size,
                                             shuffle=True):
            loss_epoch += train_fn(*batch)
            nb_minibatches += 1

        loss_epoch /= nb_minibatches
        train_loss += [loss_epoch]

        # Monitoring on the training set
        train_minibatches = mlh.iterate_minibatches(x_train,
                                                    y_train,
                                                    batch_size,
                                                    shuffle=False)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, prec_recall_cutoff)
        train_monitored += [train_err]

        # Monitoring on the validation set
        valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                    y_valid,
                                                    batch_size,
                                                    shuffle=False)

        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, prec_recall_cutoff)
        valid_monitored += [valid_err]

        try:
            early_stop_val = valid_err[monitor_labels.index(
                early_stop_criterion)]
        except ValueError:
            raise ValueError("There is no monitored value by the name of %s" %
                             early_stop_criterion)

        # Early stopping
        if epoch == 0:
            best_valid = early_stop_val
        elif (early_stop_val > best_valid and early_stop_criterion == 'accuracy') or \
             (early_stop_val < best_valid and early_stop_criterion ==
              'loss. sup.'):
            best_valid = early_stop_val
            patience = 0

            # Save stuff
            np.savez(
                os.path.join(save_path, 'model_best.npz'),
                *lasagne.layers.get_all_param_values(
                    filter(None, nets) + [discrim_net]))
            np.savez(save_path + "/errors_supervised_best.npz",
                     zip(*train_monitored), zip(*valid_monitored))
        else:
            patience += 1
            np.savez(
                os.path.join(save_path, 'model_last.npz'),
                *lasagne.layers.get_all_param_values(
                    filter(None, nets) + [discrim_net]))
            np.savez(save_path + "/errors_supervised_last.npz",
                     zip(*train_monitored), zip(*valid_monitored))

        # End training
        if patience == max_patience or epoch == num_epochs - 1:
            print("Ending training")
            # Load best model
            if not os.path.exists(save_path + '/model_best.npz'):
                print("No saved model to be tested and/or generate"
                      " the embedding !")
            else:
                with np.load(save_path + '/model_best.npz', ) as f:
                    param_values = [
                        f['arr_%d' % i] for i in range(len(f.files))
                    ]
                    lasagne.layers.set_all_param_values(
                        filter(None, nets) + [discrim_net], param_values)

            # Training set results
            train_minibatches = mlh.iterate_minibatches(x_train,
                                                        y_train,
                                                        batch_size,
                                                        shuffle=False)
            train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                       monitor_labels, prec_recall_cutoff)

            # Validation set results
            valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                        y_valid,
                                                        batch_size,
                                                        shuffle=False)
            valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                       monitor_labels, prec_recall_cutoff)

            # Test set results
            if y_test is not None:
                test_minibatches = mlh.iterate_minibatches(x_test,
                                                           y_test,
                                                           batch_size,
                                                           shuffle=False)

                test_err = mlh.monitoring(test_minibatches, "test", val_fn,
                                          monitor_labels, prec_recall_cutoff)
            else:
                test_predictions = []
                for minibatch in mlh.iterate_testbatches(x_test,
                                                         batch_size,
                                                         shuffle=False):
                    test_predictions += [predict(minibatch)]
                np.savez(os.path.join(save_path, 'test_predictions.npz'),
                         test_predictions)

            # Stop
            print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() -
                                                         start_time))
            break

        print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time))

        # Anneal the learning rate
        lr.set_value(float(lr.get_value() * learning_rate_annealing))

    # Print all final errors for train, validation and test
    print("Training time:\t\t\t{:.3f}s".format(time.time() - start_training))

    # Copy files to loadpath
    if save_path != save_copy:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)
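
# --- Side note (not part of the original example) -------------------------------
# The training function above combines rmsprop updates with a per-matrix max-norm
# constraint.  A minimal, self-contained sketch of that update-then-constrain
# pattern on a toy dense layer (all names below are illustrative):
import theano
import theano.tensor as T
import lasagne

x = T.matrix('x')
t = T.matrix('t')
net = lasagne.layers.DenseLayer(lasagne.layers.InputLayer((None, 10), x), num_units=5)
loss = lasagne.objectives.squared_error(lasagne.layers.get_output(net), t).mean()
params = lasagne.layers.get_all_params(net, trainable=True)
updates = lasagne.updates.rmsprop(loss, params, learning_rate=1e-3)
for p in updates:
    if updates[p].ndim == 2:   # constrain weight matrices only, not biases
        updates[p] = lasagne.updates.norm_constraint(updates[p], 1.0)
train = theano.function([x, t], loss, updates=updates)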
def execute(dataset,
            learning_rate=0.00001,
            alpha=0.,
            beta=1.,
            lmd=0.,
            encoder_units=[1024, 512, 256],
            num_epochs=500,
            which_fold=1,
            save_path=None,
            save_copy=None,
            dataset_path=None):

    # Reading dataset
    print("Loading data")
    x_unsup = mlh.load_data(dataset,
                            dataset_path,
                            None,
                            which_fold=which_fold,
                            keep_labels=1.0,
                            missing_labels_val=-1.0,
                            embedding_input='bin',
                            transpose=True)

    x_train = x_unsup[0][0]
    x_valid = x_unsup[1][0]

    n_features = x_train.shape[1]

    exp_name = "learn_gene_vector_h"
    for e in encoder_units:
        exp_name += ('-' + str(e))
    exp_name += '_a-' + str(alpha)
    exp_name += '_b-' + str(beta)
    exp_name += '_l-' + str(lmd)
    exp_name += '_lr-' + str(learning_rate)

    save_path = os.path.join(save_path, exp_name)
    save_copy = os.path.join(save_copy, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(save_copy):
        os.makedirs(save_copy)

    # Prepare Theano variables for inputs and targets
    input_var = T.matrix('input')
    target_var = T.matrix('target')
    target_reconst = T.matrix('target_reconst')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')
    lmd = 0.0001      # weight decay coefficient (note: this overrides the lmd argument)
    num_epochs = 200  # likewise overrides the num_epochs argument
    # There aren't really any epochs, since batches are drawn from a generator with
    # random sampling from the dataset; the epoch loop is kept for compatibility.
    batches_per_epoch = 1000
    batch_size = 128

    # building network
    encoder = InputLayer((batch_size, n_features), input_var)

    # building the encoder and decoder
    for i in range(len(encoder_units)):
        encoder = DenseLayer(encoder,
                             num_units=encoder_units[i],
                             nonlinearity=rectify)

    params = lasagne.layers.get_all_params(encoder, trainable=True)
    monitor_labels = []
    val_outputs = []
    nets = [encoder]

    if alpha > 0:
        decoder_units = encoder_units[::-1][1:]
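        # The decoder mirrors the encoder's hidden sizes in reverse (dropping the
        # bottleneck layer itself) and ends in a sigmoid layer over all features.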
        decoder = encoder
        for i in range(len(decoder_units)):
            decoder = DenseLayer(decoder,
                                 num_units=decoder_units[i],
                                 nonlinearity=rectify)
        decoder = DenseLayer(decoder,
                             num_units=n_features,
                             nonlinearity=sigmoid)
        prediction_reconst = lasagne.layers.get_output(decoder)

        # Reconstruction error
        loss_reconst = lasagne.objectives.binary_crossentropy(
            prediction_reconst, target_reconst).mean()

        params += lasagne.layers.get_all_params(decoder, trainable=True)
        monitor_labels += ["reconst."]
        val_outputs += [loss_reconst]
        nets += [decoder]

    else:
        loss_reconst = 0

    if beta > 0:
        predictor_laysize = [encoder_units[-1]] * 4
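        # The prediction head stacks four fully-connected layers of bottleneck width
        # before a 2-unit sigmoid output trained against target_var.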
        predictor = encoder
        for i in range(len(predictor_laysize)):
            predictor = DenseLayer(predictor,
                                   num_units=predictor_laysize[i],
                                   nonlinearity=rectify)

        predictor = DenseLayer(predictor, num_units=2, nonlinearity=sigmoid)

        prediction_var = lasagne.layers.get_output(predictor)

        # w2v error
        loss_pred = lasagne.objectives.binary_crossentropy(
            prediction_var, target_var).mean()

        params += lasagne.layers.get_all_params(predictor, trainable=True)
        monitor_labels += ["pred."]
        val_outputs += [loss_pred]
        nets += [predictor]
    else:
        loss_pred = 0

    # Combine losses
    loss = alpha * loss_reconst + beta * loss_pred

    # applying weight decay
    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty

    val_outputs += [loss]
    monitor_labels += ['loss']

    # Some variables
    max_patience = 100
    patience = 0

    train_monitored = []
    valid_monitored = []
    train_loss = []

    updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr)

    inputs = [input_var, target_var, target_reconst]

    # Compile training function
    print "Compiling training function"
    train_fn = theano.function(inputs,
                               loss,
                               updates=updates,
                               on_unused_input='ignore')
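    # val_fn returns the monitored values; the duplicated first output stands in for
    # the "predictions" slot that mlh.monitoring presumably expects ahead of them
    # (an assumption about that helper's interface).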
    val_fn = theano.function(inputs, [val_outputs[0]] + val_outputs,
                             on_unused_input='ignore')
    start_training = time.time()
    print "training start time: {}".format(start_training)

    # data_gen = data_generator(x_train, batch_size)
    print "Starting training"
    for epoch in range(num_epochs):
        start_time = time.time()
        print("Epoch {} of {}".format(epoch + 1, num_epochs))
        nb_minibatches = 0
        loss_epoch = 0

        for x, y, target_reconst_val in data_generator(x_train, batch_size):
            loss_epoch += train_fn(x, y, target_reconst_val)
            nb_minibatches += 1

        loss_epoch /= nb_minibatches
        train_loss += [loss_epoch]

        # Monitoring on the training set
        train_minibatches = data_generator(x_train, batch_size)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, 0)
        train_monitored += [train_err]

        # Monitoring on the validation set
        valid_minibatches = data_generator(x_valid, batch_size)

        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, 0)
        valid_monitored += [valid_err]

        early_stop_criterion = 'loss'
        early_stop_val = valid_err[monitor_labels.index(early_stop_criterion)]

        # Early stopping
        if epoch == 0:
            best_valid = early_stop_val
        elif early_stop_val < best_valid and early_stop_criterion == 'loss':
            best_valid = early_stop_val
            patience = 0

            # Save stuff
            np.savez(save_path + '/model_snp2vec_best.npz',
                     *lasagne.layers.get_all_param_values(nets))
            np.savez(save_path + "/errors_snp2vec_best.npz",
                     zip(*train_monitored), zip(*valid_monitored))
        else:
            patience += 1
            np.savez(os.path.join(save_path, 'model_snp2vec_last.npz'),
                     *lasagne.layers.get_all_param_values(nets))
            np.savez(save_path + "/errors_snp2vec_last.npz",
                     zip(*train_monitored), zip(*valid_monitored))

        # End training
        if (patience == max_patience) or (epoch == num_epochs - 1):
            print("Ending training")
            # Load best model
            if not os.path.exists(save_path + '/model_snp2vec_best.npz'):
                print(
                    "No saved model to be tested and/or generate"
                    " the embedding !")
            else:
                with np.load(save_path + '/model_snp2vec_best.npz') as f:
                    param_values = [
                        f['arr_%d' % i] for i in range(len(f.files))
                    ]
                    lasagne.layers.set_all_param_values(nets, param_values)

            # Training set results
            train_minibatches = data_generator(x_train, batch_size)
            train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                       monitor_labels, 0)

            # Validation set results
            valid_minibatches = data_generator(x_valid, batch_size)
            valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                       monitor_labels, 0)

            # Stop
            print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() -
                                                         start_time))
            break

        print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time))

    # Copy files to loadpath
    if save_path != save_copy:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)
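
# --- Side note (not part of the original example) -------------------------------
# data_generator() is an external helper that is not shown in this listing.  From
# the way it is consumed above it must yield (input, w2v_target, reconst_target)
# triples of float32 arrays.  A purely illustrative stand-in (the 2-column dummy
# w2v target is an assumption) could look like this:
import numpy as np

def data_generator(x, batch_size, shuffle=True):
    idx = np.arange(x.shape[0])
    if shuffle:
        np.random.shuffle(idx)
    for start in range(0, x.shape[0] - batch_size + 1, batch_size):
        batch = x[idx[start:start + batch_size]].astype('float32')
        dummy_w2v_target = np.zeros((batch_size, 2), dtype='float32')
        # the autoencoder branch reconstructs its own input
        yield batch, dummy_w2v_target, batch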
Beispiel #16
0
def execute(
        dataset,
        n_hidden_u,
        n_hidden_t_enc,
        n_hidden_t_dec,
        n_hidden_s,
        embedding_source=None,
        num_epochs=500,
        learning_rate=.001,
        learning_rate_annealing=1.0,
        alpha=1,
        beta=1,
        gamma=1,
        lmd=.0001,
        disc_nonlinearity="sigmoid",
        encoder_net_init=0.2,
        decoder_net_init=0.2,
        keep_labels=1.0,
        prec_recall_cutoff=True,
        missing_labels_val=-1.0,
        which_fold=0,
        early_stop_criterion='loss_sup_det',
        embedding_input='raw',
        save_path='/Tmp/' + os.environ["USER"] + '/savepath/',  # a default value was needed?
        save_copy='/Tmp/' + os.environ["USER"] + '/savecopy/',
        dataset_path='/Tmp/' + os.environ["USER"] + '/datasets/',
        resume=False,
        exp_name='',
        random_proj=0):

    # Load the dataset
    print("Loading data")
    x_train, y_train, x_valid, y_valid, x_test, y_test, \
        x_unsup, training_labels = mlh.load_data(
            dataset, dataset_path, embedding_source,
            which_fold=which_fold, keep_labels=keep_labels,
            missing_labels_val=missing_labels_val,
            embedding_input=embedding_input)

    if x_unsup is not None:
        n_samples_unsup = x_unsup.shape[1]
    else:
        n_samples_unsup = 0

    # Extract required information from data
    n_samples, n_feats = x_train.shape
    print("Number of features : ", n_feats)
    print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1]))
    n_targets = y_train.shape[1]

    # Set some variables
    batch_size = 128
    beta = gamma if (gamma == 0) else beta
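    # When gamma is 0 there is no decoder path, so the W_dec reconstruction weight
    # beta is forced to 0 as well.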

    # Preparing folder to save stuff
    if embedding_source is None:
        embedding_name = embedding_input
    else:
        embedding_name = embedding_source.replace("_", "").split(".")[0]
        exp_name += embedding_name.rsplit('/', 1)[::-1][0] + '_'

    exp_name += 'final_'

    exp_name += mlh.define_exp_name(keep_labels, alpha, beta, gamma, lmd,
                                    n_hidden_u, n_hidden_t_enc, n_hidden_t_dec,
                                    n_hidden_s, which_fold, embedding_input,
                                    learning_rate, decoder_net_init,
                                    encoder_net_init, early_stop_criterion,
                                    learning_rate_annealing)

    print("Experiment: " + exp_name)
    save_path = os.path.join(save_path, dataset, exp_name)
    save_copy = os.path.join(save_copy, dataset, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(save_copy):
        os.makedirs(save_copy)

    # Prepare Theano variables for inputs and targets
    input_var_sup = T.matrix('input_sup')
    input_var_unsup = theano.shared(x_unsup, 'input_unsup')  # x_unsup TBD
    target_var_sup = T.matrix('target_sup')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')

    # Build model
    print("Building model")

    # Some checkings
    # assert len(n_hidden_u) > 0
    assert len(n_hidden_t_enc) > 0
    assert len(n_hidden_t_dec) > 0
    assert n_hidden_t_dec[-1] == n_hidden_t_enc[-1]

    # Build feature embedding networks (encoding and decoding if gamma > 0)
    nets, embeddings, pred_feat_emb = mh.build_feat_emb_nets(
        embedding_source, n_feats, n_samples_unsup, input_var_unsup,
        n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, gamma, encoder_net_init,
        decoder_net_init, save_path, random_proj)

    # Build feature embedding reconstruction networks (if alpha > 0, beta > 0)
    nets += mh.build_feat_emb_reconst_nets(
        [alpha, beta], n_samples_unsup, n_hidden_u,
        [n_hidden_t_enc, n_hidden_t_dec], nets,
        [encoder_net_init, decoder_net_init])

    # Supervised network
    discrim_net, hidden_rep = mh.build_discrim_net(
        batch_size, n_feats, input_var_sup, n_hidden_t_enc, n_hidden_s,
        embeddings[0], disc_nonlinearity, n_targets)

    # Reconstruct network
    nets += [
        mh.build_reconst_net(hidden_rep,
                             embeddings[1] if len(embeddings) > 1 else None,
                             n_feats, gamma)
    ]

    # Load weights if we are resuming job
    if resume:
        # Load best model
        with np.load(os.path.join(save_path, 'model_feat_sel_last.npz')) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        nlayers = len(
            lasagne.layers.get_all_params(filter(None, nets) + [discrim_net]))
        lasagne.layers.set_all_param_values(
            filter(None, nets) + [discrim_net], param_values[:nlayers])

    print("Building and compiling training functions")

    # Build and compile training functions
    predictions, predictions_det = mh.define_predictions(nets, start=2)
    prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net])
    prediction_sup = prediction_sup[0]
    prediction_sup_det = prediction_sup_det[0]

    # Define losses
    # reconstruction losses
    reconst_losses, reconst_losses_det = mh.define_reconst_losses(
        predictions, predictions_det,
        [input_var_unsup, input_var_unsup, input_var_sup])
    # supervised loss
    sup_loss, sup_loss_det = mh.define_sup_loss(disc_nonlinearity,
                                                prediction_sup,
                                                prediction_sup_det,
                                                keep_labels, target_var_sup,
                                                missing_labels_val)

    # Define inputs
    inputs = [input_var_sup, target_var_sup]

    # Define parameters
    params = lasagne.layers.get_all_params([discrim_net] + filter(None, nets),
                                           trainable=True)
    params_to_freeze = \
        lasagne.layers.get_all_params(filter(None, nets), trainable=False)

    print('Number of params discrim: ' + str(len(params)))
    print('Number of params to freeze: ' + str(len(params_to_freeze)))

    # Drop the frozen parameters from the list of parameters to update
    params = [p for p in params if p not in params_to_freeze]

    print('Number of params to update: ' + str(len(params)))

    # Combine losses
    loss = sup_loss + alpha*reconst_losses[0] + beta*reconst_losses[1] + \
        gamma*reconst_losses[2]
    loss_det = sup_loss_det + alpha*reconst_losses_det[0] + \
        beta*reconst_losses_det[1] + gamma*reconst_losses_det[2]

    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty
    loss_det = loss_det + lmd * l2_penalty
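    # In full: loss = sup_loss + alpha*reconst(W_enc) + beta*reconst(W_dec)
    #                 + gamma*reconst(input_sup) + lmd*l2(params)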

    # Compute network updates
    updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr)
    # updates = lasagne.updates.sgd(loss,
    #                               params,
    #                               learning_rate=lr)
    # updates = lasagne.updates.momentum(loss, params,
    #                                    learning_rate=lr, momentum=0.0)

    # Apply norm constraints on the weights
    for k in updates.keys():
        if updates[k].ndim == 2:
            updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

    # Compile training function
    train_fn = theano.function(inputs,
                               loss,
                               updates=updates,
                               on_unused_input='ignore')

    # Monitoring Labels
    monitor_labels = [
        "reconst. feat. W_enc", "reconst. feat. W_dec", "reconst. loss"
    ]
    monitor_labels = [
        i for i, j in zip(monitor_labels, reconst_losses) if j != 0
    ]
    monitor_labels += ["feat. W_enc. mean", "feat. W_enc var"]
    monitor_labels += ["feat. W_dec. mean", "feat. W_dec var"] if \
        (embeddings[1] is not None) else []
    monitor_labels += ["loss. sup.", "total loss"]

    # Build and compile test function
    val_outputs = reconst_losses_det
    val_outputs = [i for i, j in zip(val_outputs, reconst_losses) if j != 0]
    val_outputs += [embeddings[0].mean(), embeddings[0].var()]
    val_outputs += [embeddings[1].mean(), embeddings[1].var()] if \
        (embeddings[1] is not None) else []
    val_outputs += [sup_loss_det, loss_det]

    # Compute accuracy and add it to monitoring list
    test_acc, test_pred = mh.define_test_functions(disc_nonlinearity,
                                                   prediction_sup,
                                                   prediction_sup_det,
                                                   target_var_sup)
    monitor_labels.append("accuracy")
    val_outputs.append(test_acc)

    # Compile prediction function
    predict = theano.function([input_var_sup], test_pred)

    # Compile validation function
    val_fn = theano.function(inputs, [prediction_sup_det] + val_outputs,
                             on_unused_input='ignore')

    # Finally, launch the training loop.
    print("Starting training...")

    # Some variables
    max_patience = 100
    patience = 0

    train_monitored = []
    valid_monitored = []
    train_loss = []

    # Pre-training monitoring
    print("Epoch 0 of {}".format(num_epochs))

    train_minibatches = mlh.iterate_minibatches(x_train,
                                                y_train,
                                                batch_size,
                                                shuffle=False)
    train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                               monitor_labels, prec_recall_cutoff)

    valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                y_valid,
                                                batch_size,
                                                shuffle=False)
    valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                               monitor_labels, prec_recall_cutoff)

    # Training loop
    start_training = time.time()
    for epoch in range(num_epochs):
        start_time = time.time()
        print("Epoch {} of {}".format(epoch + 1, num_epochs))
        nb_minibatches = 0
        loss_epoch = 0

        # Train pass
        for batch in mlh.iterate_minibatches(x_train,
                                             training_labels,
                                             batch_size,
                                             shuffle=True):
            loss_epoch += train_fn(*batch)
            nb_minibatches += 1

        loss_epoch /= nb_minibatches
        train_loss += [loss_epoch]

        # Monitoring on the training set
        train_minibatches = mlh.iterate_minibatches(x_train,
                                                    y_train,
                                                    batch_size,
                                                    shuffle=False)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, prec_recall_cutoff)
        train_monitored += [train_err]

        # Monitoring on the validation set
        valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                    y_valid,
                                                    batch_size,
                                                    shuffle=False)

        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, prec_recall_cutoff)
        valid_monitored += [valid_err]

        try:
            early_stop_val = valid_err[monitor_labels.index(
                early_stop_criterion)]
        except ValueError:
            raise ValueError("There is no monitored value by the name of %s" %
                             early_stop_criterion)

        # Early stopping
        if epoch == 0:
            best_valid = early_stop_val
        elif (early_stop_val > best_valid and early_stop_criterion == 'accuracy') or \
             (early_stop_val < best_valid and early_stop_criterion == 'loss. sup.'):
            best_valid = early_stop_val
            patience = 0

            # Save stuff
            np.savez(
                os.path.join(save_path, 'model_feat_sel_best.npz'),
                *lasagne.layers.get_all_param_values(
                    filter(None, nets) + [discrim_net]))
            np.savez(save_path + "/errors_supervised_best.npz",
                     zip(*train_monitored), zip(*valid_monitored))

            # Monitor on the test set now because sometimes the saving doesn't
            # go well and there isn't a model to load at the end of training
            if y_test is not None:
                test_minibatches = mlh.iterate_minibatches(x_test,
                                                           y_test,
                                                           138,
                                                           shuffle=False)

                test_err = mlh.monitoring(test_minibatches, "test", val_fn,
                                          monitor_labels, prec_recall_cutoff)
        else:
            patience += 1
            # Save stuff
            np.savez(
                os.path.join(save_path, 'model_feat_sel_last.npz'),
                *lasagne.layers.get_all_param_values(
                    filter(None, nets) + [discrim_net]))
            np.savez(save_path + "/errors_supervised_last.npz",
                     zip(*train_monitored), zip(*valid_monitored))

        # End training
        if patience == max_patience or epoch == num_epochs - 1:
            print("Ending training")
            # Load best model
            with np.load(os.path.join(save_path,
                                      'model_feat_sel_best.npz')) as f:
                param_values = [f['arr_%d' % i] for i in range(len(f.files))]
            nlayers = len(
                lasagne.layers.get_all_params(
                    filter(None, nets) + [discrim_net]))
            lasagne.layers.set_all_param_values(
                filter(None, nets) + [discrim_net], param_values[:nlayers])
            if embedding_source is None:
                # Save embedding
                pred = pred_feat_emb()
                np.savez(os.path.join(save_path, 'feature_embedding.npz'),
                         pred)

            # Training set results
            train_minibatches = mlh.iterate_minibatches(x_train,
                                                        y_train,
                                                        batch_size,
                                                        shuffle=False)
            train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                       monitor_labels, prec_recall_cutoff)

            # Validation set results
            valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                        y_valid,
                                                        batch_size,
                                                        shuffle=False)
            valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                       monitor_labels, prec_recall_cutoff)

            # Test set results
            if y_test is not None:
                test_minibatches = mlh.iterate_minibatches(x_test,
                                                           y_test,
                                                           138,
                                                           shuffle=False)

                test_err = mlh.monitoring(test_minibatches, "test", val_fn,
                                          monitor_labels, prec_recall_cutoff)
                np.savez(os.path.join(save_path, 'final_errors.npz'), test_err)
            else:
                test_predictions = []
                for minibatch in mlh.iterate_testbatches(x_test,
                                                         138,
                                                         shuffle=False):
                    test_predictions += [predict(minibatch)]
                np.savez(os.path.join(save_path, 'test_predictions.npz'),
                         test_predictions)

            # Stop
            print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() -
                                                         start_time))
            break

        print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time))

        # Anneal the learning rate
        lr.set_value(float(lr.get_value() * learning_rate_annealing))

    # Print and save all final errors for train, validation and test
    print("Training time:\t\t\t{:.3f}s".format(time.time() - start_training))
    print("test_err:", test_err)

    # Copy files to loadpath
    if save_path != save_copy:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)
Beispiel #17
0
def execute(dataset,
            learning_rate=0.00001,
            learning_rate_annealing=1.0,
            alpha=0.,
            beta=1.,
            lmd=0.,
            encoder_units=[1024, 512, 256],
            num_epochs=500,
            which_fold=1,
            save_path=None,
            save_copy=None,
            dataset_path=None,
            num_fully_connected=0,
            exp_name='',
            init_args=None):

    # Reading dataset
    print("Loading data")
    x_unsup = mlh.load_data(dataset,
                            dataset_path,
                            None,
                            which_fold=which_fold,
                            keep_labels=1.0,
                            missing_labels_val=-1.0,
                            embedding_input='bin',
                            transpose=True)

    x_train = x_unsup[0][0]
    x_valid = x_unsup[1][0]

    n_features = x_train.shape[1]

    exp_name += "learn_gene_vector_h"
    for e in encoder_units:
        exp_name += ('-' + str(e))
    exp_name += '_a-' + str(alpha)
    exp_name += '_b-' + str(beta)
    # exp_name += '_g-' + str(gamma)
    exp_name += '_l-' + str(lmd)
    exp_name += '_lr-' + str(learning_rate)

    save_path = os.path.join(save_path, exp_name)
    save_copy = os.path.join(save_copy, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(save_copy):
        os.makedirs(save_copy)

    # Prepare Theano variables for inputs and targets
    input_var = T.matrix('input')
    target_var = T.matrix('target')
    target_reconst = T.matrix('target_reconst')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')
    batch_size = 128

    # building network
    encoder = InputLayer((batch_size, n_features), input_var)

    # building the encoder and decoder
    #import pdb; pdb.set_trace()
    for i in range(len(encoder_units)):
        encoder = DenseLayer(
            encoder,
            num_units=encoder_units[i],
            W=HeNormal('relu'),
            nonlinearity=rectify)  # if i < len(encoder_units)-1 else linear)

    embedding = lasagne.layers.get_output(encoder)

    params = lasagne.layers.get_all_params(encoder, trainable=True)
    monitor_labels = ["embedding min", "embedding max"]
    val_outputs = [embedding.min(), embedding.max()]
    nets = [encoder]

    if alpha > 0:
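        # Mirror the encoder layer sizes (dropping the bottleneck) to build the
        # decoder, finishing with a sigmoid layer that reconstructs the
        # n_features inputs.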
        decoder_units = encoder_units[::-1][1:]
        print(decoder_units)
        decoder = encoder
        for i in range(len(decoder_units)):
            decoder = DenseLayer(decoder,
                                 num_units=decoder_units[i],
                                 W=HeNormal('relu'),
                                 nonlinearity=rectify)
        decoder = DenseLayer(decoder,
                             num_units=n_features,
                             W=convert_initialization(
                                 init_args["decoder_init"],
                                 nonlinearity="sigmoid"),
                             nonlinearity=sigmoid)
        prediction_reconst = lasagne.layers.get_output(decoder)

        # Reconstruction error
        loss_reconst = lasagne.objectives.binary_crossentropy(
            prediction_reconst, target_reconst).mean()

        # loss_reconst = mh.define_sampled_mean_bincrossentropy(
        #    prediction_reconst, target_reconst, gamma=gamma)

        #loss_reconst = mh.dice_coef_loss(
        #    target_reconst, prediction_reconst).mean()

        accuracy = T.eq(T.gt(prediction_reconst, 0.5), target_reconst).mean()

        params += lasagne.layers.get_all_params(decoder, trainable=True)
        monitor_labels += ["reconst. loss", "reconst. accuracy"]
        val_outputs += [loss_reconst, accuracy]
        nets += [decoder]
        # sparsity_reconst = gamma * l1(prediction_reconst)
        # roh = input_var.mean(0)
        # sparsity_reconst = ((roh * T.log(roh / (prediction_reconst.mean(0)+1e-8))) +\
        #     ((1 - roh) * T.log((1 - roh) / (1 - prediction_reconst + 1e-8)))).sum()

    else:
        loss_reconst = 0
        # sparsity_reconst = 0

    if beta > 0:
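        # Optionally stack num_fully_connected hidden layers of the bottleneck
        # size before the final 2-unit sigmoid prediction layer.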
        predictor_laysize = [encoder_units[-1]] * num_fully_connected
        predictor = encoder
        for i in range(len(predictor_laysize)):
            predictor = DenseLayer(predictor,
                                   num_units=predictor_laysize[i],
                                   nonlinearity=rectify,
                                   W=convert_initialization(
                                       init_args["predictor_init"],
                                       nonlinearity="relu"))

        predictor = DenseLayer(predictor,
                               num_units=2,
                               nonlinearity=sigmoid,
                               W=convert_initialization(
                                   init_args["predictor_init"],
                                   nonlinearity="sigmoid"))

        prediction_var = lasagne.layers.get_output(predictor)

        # w2v error
        # loss_pred = lasagne.objectives.binary_crossentropy(
        #     prediction_var, target_var
        # ).mean()

        loss_pred = mh.dice_coef_loss(target_var, prediction_var).mean()

        accuracy = T.eq(T.gt(prediction_var, 0.5), target_var).mean()

        params += lasagne.layers.get_all_params(predictor, trainable=True)
        monitor_labels += ["pred. loss", "pred. accuracy"]
        val_outputs += [loss_pred, accuracy]
        nets += [predictor]

        # sparsity_pred = gamma * l1(prediction_var)
        # roh = 0.05
        # sparsity_pred = ((roh * T.log(roh / prediction_pred.mean(0))) +\
        #     ((1 - roh) * T.log((1 - roh) / (1 - prediction_pred)))).sum()
    else:
        loss_pred = 0
        # sparsity_pred = 0

    # Combine losses
    loss = alpha * loss_reconst + beta * loss_pred  # sparsity_pred  # + sparsity_reconst

    # applying weight decay
    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty
    # loss = loss + lmd*l2_penalty

    val_outputs += [loss]
    monitor_labels += ['loss']

    # Some variables
    max_patience = 100
    patience = 0

    train_monitored = []
    valid_monitored = []
    train_loss = []

    updates = lasagne.updates.adam(loss, params, learning_rate=lr)
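    # The loop below applies a max-norm constraint: each 2-D parameter update
    # (weight matrix) is rescaled by lasagne.updates.norm_constraint so that
    # its weight norms never exceed 1.0.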

    for k in updates.keys():
        if updates[k].ndim == 2:
            updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

    inputs = [input_var, target_var, target_reconst]

    # Compile training function
    print "Compiling training function"
    train_fn = theano.function(inputs,
                               loss,
                               updates=updates,
                               on_unused_input='ignore')
    val_fn = theano.function(inputs, [val_outputs[0]] + val_outputs,
                             on_unused_input='ignore')

    if alpha > 0:
        pred_fn = theano.function([input_var], prediction_reconst)

    start_training = time.time()

    # data_gen = data_generator(x_train, batch_size)
    print "Starting training"
    for epoch in range(num_epochs):
        start_time = time.time()
        print("Epoch {} of {}".format(epoch + 1, num_epochs))
        nb_minibatches = 0
        loss_epoch = 0

        for x, y, target_reconst_val in data_generator(x_train,
                                                       batch_size,
                                                       shuffle=True):
            loss_epoch += train_fn(x, y, target_reconst_val)
            nb_minibatches += 1

        if alpha > 0:
            pr = pred_fn(x)
            print('min pr:' + str(pr.min()))
            print('max pr:' + str(pr.max()))
            print('mean pr:' + str(pr.mean()))

        loss_epoch /= nb_minibatches
        train_loss += [loss_epoch]

        # Monitoring on the training set
        train_minibatches = data_generator(x_train, batch_size)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, 0)
        train_monitored += [train_err]

        # Monitoring on the validation set
        valid_minibatches = data_generator(x_valid, batch_size)

        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, 0)
        valid_monitored += [valid_err]

        early_stop_criterion = 'loss'
        early_stop_val = valid_err[monitor_labels.index(early_stop_criterion)]

        # Early stopping
        if epoch == 0:
            best_valid = early_stop_val
        elif early_stop_val < best_valid and early_stop_criterion == 'loss':
            best_valid = early_stop_val
            patience = 0

            # Save stuff
            np.savez(save_path + '/model_snp2vec_best.npz',
                     *lasagne.layers.get_all_param_values(nets))
            np.savez(save_path + "/errors_snp2vec_best.npz",
                     zip(*train_monitored), zip(*valid_monitored))
        else:
            patience += 1
            np.savez(os.path.join(save_path, 'model_snp2vec_last.npz'),
                     *lasagne.layers.get_all_param_values(nets))
            np.savez(save_path + "/errors_snp2vec_last.npz",
                     zip(*train_monitored), zip(*valid_monitored))

        # End training
        if (patience == max_patience) or (epoch == num_epochs - 1):
            print("Ending training")
            # Load best model
            if not os.path.exists(save_path + '/model_snp2vec_best.npz'):
                print(
                    "No saved model to be tested and/or generate"
                    " the embedding !")
            else:
                with np.load(save_path + '/model_snp2vec_best.npz') as f:
                    param_values = [
                        f['arr_%d' % i] for i in range(len(f.files))
                    ]
                    lasagne.layers.set_all_param_values(nets, param_values)

            # Training set results
            train_minibatches = data_generator(x_train, batch_size)
            train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                       monitor_labels, 0)

            # Validation set results
            valid_minibatches = data_generator(x_valid, batch_size)
            valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                       monitor_labels, 0)

            # Stop
            print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() -
                                                         start_time))
            break

        print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time))
        # Anneal the learning rate
        lr.set_value(float(lr.get_value() * learning_rate_annealing))

    # Copy files to loadpath
    if save_path != save_copy:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)
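For reference, a minimal self-contained sketch of the weight-decay pattern used in the example above; the layer sizes, penalty weight and learning rate are arbitrary placeholders, and only theano and lasagne are assumed to be installed:

import theano
import theano.tensor as T
import lasagne
from lasagne.regularization import apply_penalty, l2

input_var = T.matrix('input')
target_var = T.matrix('target')

# A tiny two-layer network (100 inputs -> 50 hidden -> 10 sigmoid outputs).
net = lasagne.layers.InputLayer((None, 100), input_var)
net = lasagne.layers.DenseLayer(net, num_units=50,
                                nonlinearity=lasagne.nonlinearities.rectify)
net = lasagne.layers.DenseLayer(net, num_units=10,
                                nonlinearity=lasagne.nonlinearities.sigmoid)

prediction = lasagne.layers.get_output(net)
loss = lasagne.objectives.binary_crossentropy(prediction, target_var).mean()

# Weight decay: apply_penalty sums the l2 penalty over every regularizable
# parameter (the weight matrices); the result is added to the loss, scaled
# by a small coefficient.
reg_params = lasagne.layers.get_all_params(net, regularizable=True)
loss = loss + 1e-4 * apply_penalty(reg_params, l2)

params = lasagne.layers.get_all_params(net, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)
train_fn = theano.function([input_var, target_var], loss, updates=updates)

The same pattern appears in each execute() above, with the penalty coefficient exposed as the lmd argument.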
Beispiel #18
0
def reset():
    if any(np.isnan(scale.get_value()) for scale in scales):
        for scale in scales:
            scale.set_value(1.)
    for l in l_hiddens:
        l.b.set_value(Constant()(l.b.get_value().shape))
        l.W.set_value(Orthogonal()(l.W.get_value().shape))
    l_out.b.set_value(Constant()(l_out.b.get_value().shape))
    l_out.W.set_value(Orthogonal()(l_out.W.get_value().shape))
    for p in (p for u in (updates_ada, updates_other, updates_scal) for p in u
              if p not in get_all_params(l_out)):
        p.set_value(Constant()(p.get_value().shape))


chunky_l2 = apply_penalty(get_all_params(l_out, regularizable=True), l2) - l2(
    l_hiddens[0].W) + l2(l_hiddens[0].W / T.reshape(vscale, (206279, 1)))
chunky_l1 = apply_penalty(get_all_params(l_out, regularizable=True), l1) - l1(
    l_hiddens[0].W) + l1(l_hiddens[0].W / T.reshape(vscale, (206279, 1)))
simple_l2 = apply_penalty(get_all_params(l_out, regularizable=True), l2)
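# Note: chunky_l2 / chunky_l1 above swap the penalty on the first hidden layer's
# weights for a penalty on those weights divided row-wise by vscale (one scale
# per input feature); simple_l2 is the plain L2 penalty over all regularizable
# parameters.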
#l_out2 = DenseLayer(dropout(l_hiddens2[-1]), num_units=y.shape[1])
#l_out = lasagne.layers.NonlinearityLayer(lasagne.layers.ElemwiseSumLayer((l_out1,l_out2),.5), softmax)

#categorical_crossentropy(get_output(l_out)[train_indice])

target = T.fmatrix(name="target")
#f=theano.function([l_in.input_var],get_output(l_out),allow_input_downcast=True)
#f(X[0,:].toarray())

loss = categorical_crossentropy(get_output(l_out), target).mean()
# train_loss_smoo=categorical_crossentropy(get_output(l_out,deterministic=True)[train_indices,],target[train_indices,]).mean()
    def __init__(self,
                 W=None,
                 W_path=None,
                 K=300,
                 num_hidden=256,
                 batch_size=None,
                 grad_clip=100.,
                 max_sent_len=200,
                 num_classes=2,
                 **kwargs):

        W = W
        V = len(W)
        K = int(K)
        num_hidden = int(num_hidden)
        batch_size = int(batch_size)
        grad_clip = int(grad_clip)
        max_seq_len = int(max_sent_len)
        max_post_len = int(kwargs["max_post_len"])
        max_len = max(max_seq_len, max_post_len)

        max_seq_len = max_len
        max_post_len = max_len

        num_classes = int(num_classes)
        '''Boolean flags for sentence- and word-level attention'''
        '''and for whether to use context or not'''

        separate_attention_context_sents = str_to_bool(
            kwargs["separate_attention_context"])
        separate_attention_response_sents = str_to_bool(
            kwargs["separate_attention_response"])

        separate_attention_context_words = str_to_bool(
            kwargs["separate_attention_context_words"])
        separate_attention_response_words = str_to_bool(
            kwargs["separate_attention_response_words"])

        print("separate_attention_context_sentence is : {}\n".format(
            separate_attention_context_sents))
        print("separate_attention_response_sentence is : {}\n".format(
            separate_attention_response_sents))
        print("separate_attention_context_words is : {}\n".format(
            separate_attention_context_words))
        print("separate_attention_response_words is : {}\n".format(
            separate_attention_response_words))

        #B x S x N tensor of batches of context
        idxs_context = T.itensor3('idxs_context')  #imatrix, i = int
        #B x S x N matrix
        mask_context_words = T.itensor3('mask_context_words')
        #B x S matrix
        mask_context_sents = T.imatrix('mask_context_sents')

        #B x S x N tensor of batches of responses
        idxs_response = T.itensor3('idxs_response')  #imatrix, i = int

        #B x S X N matrix for words
        mask_response_words = T.itensor3('mask_response_words')
        #B x S matrix for sentences
        mask_response_sents = T.imatrix('mask_response_sents')

        #B-long vector
        gold = T.ivector('y')

        # dropout
        dropout_val = T.scalar('p_dropout')

        #lambda, cost
        lambda_cost = T.scalar('lambda_w')

        #biases
        biases_cost = T.matrix('biases')

        #weights
        weights = T.ivector('weights')
        ''' check biases'''
        biases_present = False
        if biases_present:
            lstm_biases = lasagne.layers.InputLayer(shape=(None, 1),
                                                    input_var=biases_cost)
        ''' building the context  layer via function'''
        if separate_attention_context_sents:
            lstm_hidden_context, lstm_attn_words_context, lstm_attn_sents_context = \
                self.buildThePostLayer(idxs_context, mask_context_words,
                                       mask_context_sents,
                                       separate_attention_context_words,
                                       separate_attention_context_sents,
                                       num_hidden, grad_clip, V, K, W,
                                       max_post_len, max_sent_len)
        ''' do the same for response layer'''
        if separate_attention_response_sents:
            lstm_hidden_response, lstm_attn_words_response, lstm_attn_sents_response = \
                self.buildThePostLayer(idxs_response, mask_response_words,
                                       mask_response_sents,
                                       separate_attention_response_words,
                                       separate_attention_response_sents,
                                       num_hidden, grad_clip, V, K, W,
                                       max_post_len, max_sent_len)

        print('...')
        print('finished compiling...')
        ''' prepare the final network of connections now'''
        if separate_attention_response_sents and separate_attention_context_sents:
            output, network = self.buildNetwork(lstm_hidden_context,
                                                lstm_hidden_response,
                                                num_classes)

        elif separate_attention_context_sents:
            output, network = self.buildNetworkOnlyContext(
                lstm_hidden_context, num_classes)
        '''Define objective function (cost) to minimize mean cross-entropy error'''
        params = lasagne.layers.get_all_params(network)
        cost = lasagne.objectives.categorical_crossentropy(output, gold).mean()
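        # Add L2 weight decay over all network parameters; note that lambda_w is
        # hard-coded here rather than taken from the lambda_cost variable
        # declared above.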
        lambda_w = .000001
        cost += lambda_w * apply_penalty(params, l2)
        grad_updates = lasagne.updates.adam(cost, params)

        test_output = lasagne.layers.get_output(network, deterministic=True)
        val_cost_fn = lasagne.objectives.categorical_crossentropy(
            test_output, gold).mean()
        preds = T.argmax(test_output, axis=1)

        val_acc_fn = T.mean(T.eq(preds, gold), dtype=theano.config.floatX)

        if separate_attention_context_sents and separate_attention_response_sents:

            self.val_fn = theano.function([idxs_context, mask_context_words, mask_context_sents, idxs_response, \
                                           mask_response_words, mask_response_sents, gold], [val_cost_fn, val_acc_fn, preds],
                                          allow_input_downcast=True,on_unused_input='warn')
            # Compile train objective
            print "Compiling training, testing, prediction functions"
            self.train = theano.function(inputs = [idxs_context, mask_context_words, mask_context_sents,\
                                                   idxs_response, mask_response_words, mask_response_sents, gold],\
                                          outputs = cost, updates = grad_updates, allow_input_downcast=True,on_unused_input='warn')

            self.test = theano.function(inputs = [idxs_context, mask_context_words, mask_context_sents,idxs_response,\
                                                  mask_response_words, mask_response_sents, gold],\
                                                   outputs = val_acc_fn,allow_input_downcast=True,on_unused_input='warn')

            self.pred = theano.function(inputs = [idxs_context, mask_context_words, mask_context_sents, \
                                                  idxs_response, mask_response_words, mask_response_sents],\
                                        outputs = preds,allow_input_downcast=True,on_unused_input='warn')

        elif separate_attention_context_sents:

            self.val_fn = theano.function([idxs_context, mask_context_words, mask_context_sents,  \
                                           gold], [val_cost_fn, val_acc_fn, preds],
                                         allow_input_downcast=True,on_unused_input='warn')

            print "Compiling training, testing, prediction functions"
            self.train = theano.function(inputs = [idxs_context, mask_context_words, mask_context_sents,\
                                                   gold],\
                                         outputs = cost, updates = grad_updates, allow_input_downcast=True,on_unused_input='warn')

            self.test = theano.function(inputs = [idxs_context, mask_context_words, mask_context_sents,\
                                                  gold],\
                                                  outputs = val_acc_fn,allow_input_downcast=True,on_unused_input='warn')

            self.pred = theano.function(inputs = [idxs_context, mask_context_words, mask_context_sents \
                                                 ],\
                                       outputs = preds,allow_input_downcast=True,on_unused_input='warn')

        if separate_attention_response_sents:
            sentence_attention = lasagne.layers.get_output(
                lstm_attn_sents_response)
            #if add_biases:
            #    inputs = inputs[:-1]
            self.sentence_attention_response = theano.function([idxs_context, mask_context_words,\
                                                mask_context_sents,idxs_response, mask_response_words, mask_response_sents],
                                                      sentence_attention,
                                                      allow_input_downcast=True,
                                                      on_unused_input='warn')
        if separate_attention_context_sents:
            sentence_attention_context = lasagne.layers.get_output(
                lstm_attn_sents_context)
            #if add_biases:
            #    inputs = inputs[:-1]
            self.sentence_attention_context = theano.function([idxs_context, mask_context_words,\
                                             mask_context_sents,idxs_response, mask_response_words, mask_response_sents],
                                                      [sentence_attention_context, preds],
                                                      allow_input_downcast=True,
                                                      on_unused_input='warn')

        if separate_attention_response_words:
            sentence_attention_words = lasagne.layers.get_output(
                lstm_attn_words_response)
            #if add_biases:
            #    inputs = inputs[:-1]
            self.sentence_attention_response_words = theano.function(
                [
                    idxs_context, mask_context_words, mask_context_sents,
                    idxs_response, mask_response_words, mask_response_sents
                ],
                sentence_attention_words,
                allow_input_downcast=True,
                on_unused_input='warn')
        if separate_attention_context_words:
            sentence_attention_context_words = lasagne.layers.get_output(
                lstm_attn_words_context)
            #if add_biases:
            #    inputs = inputs[:-1]
            self.sentence_attention_context_words = theano.function(
                [
                    idxs_context, mask_context_words, mask_context_sents,
                    idxs_response, mask_response_words, mask_response_sents
                ],
                sentence_attention_context_words,
                allow_input_downcast=True,
                on_unused_input='warn')
        '''compare the results with regular code and then add the bias etc. '''
# Get regularizable params
regularization_params = layers.get_all_params(unsupervised_graph, regularizable=True) + \
                        layers.get_all_params(supervised_graph, regularizable=True)
regularization_params = utils.unique(regularization_params)

# Creating loss functions
# Train loss has to take into account of labeled image or not
if run_parameters.unsupervised_cost_fun == 'squared_error':
    loss1 = objectives.squared_error(reconstruction, input_var)
elif run_parameters.unsupervised_cost_fun == 'categorical_crossentropy':
    loss1 = objectives.categorical_crossentropy(reconstruction, input_var)
if supervised_cost_fun == 'squared_error':
    loss2 = objectives.squared_error(prediction, target_var) * repeat_col(labeled_var, 10)
elif supervised_cost_fun == 'categorical_crossentropy':
    loss2 = objectives.categorical_crossentropy(prediction, target_var) * labeled_var.T
l2_penalties = regularization.apply_penalty(regularization_params, regularization.l2)
sparse_layers = get_all_sparse_layers(unsupervised_graph)
sparse_layers_output = layers.get_output(sparse_layers, deterministic=True)
if run_parameters.sparse_regularizer_type == 0:
    sparse_regularizer = reduce(lambda x, y: x + T.clip((T.mean(abs(y)) - run_parameters.sparse_regularize_factor) *
                                                        y.size, 0, float('inf')),
                                sparse_layers_output, 0)
elif run_parameters.sparse_regularizer_type == 1:
    sparse_regularizer = reduce(
        lambda x, y: x + T.clip(T.mean(abs(y), axis=1) - run_parameters.sparse_regularize_factor,
                                0, float('inf')).sum() * y.shape[1],
        sparse_layers_output, 0)

loss = losses_ratio[0] * loss1.mean() + \
       losses_ratio[1] * loss2.mean() + \
       losses_ratio[2] * l2_penalties.mean() + \
Beispiel #21
0
def execute(dataset,
            n_hidden_u,
            num_epochs=500,
            learning_rate=.001,
            learning_rate_annealing=1.0,
            lmd=.0001,
            embedding_input='raw',
            which_fold=0,
            save_path='/Tmp/$USER/feature_selection/newmodel/',
            save_copy='/Tmp/$USER/feature_selection/newmodel/',
            dataset_path='/Tmp/$USER/feature_selection/newmodel/'):

    # Load the dataset
    print("Loading data")
    x_unsup = mlh.load_data(dataset,
                            dataset_path,
                            None,
                            which_fold=which_fold,
                            keep_labels=1.0,
                            missing_labels_val=-1.0,
                            embedding_input=embedding_input,
                            transpose=True)

    x_train = x_unsup[0][0]
    x_valid = x_unsup[1][0]

    # Extract required information from data
    n_row, n_col = x_train.shape
    print('Data size ' + str(n_row) + 'x' + str(n_col))

    # Set some variables
    batch_size = 256

    # Define experiment name
    exp_name = 'pretrain_' + mlh.define_exp_name(
        1., 0, 0, 0, lmd, n_hidden_u, [], [], [], which_fold, embedding_input,
        learning_rate, 0, 0, 'reconst_loss', learning_rate_annealing)
    print('Experiment: ' + exp_name)

    # Preparing folder to save stuff
    save_path = os.path.join(save_path, dataset, exp_name)
    save_copy = os.path.join(save_copy, dataset, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # Prepare Theano variables for inputs and targets
    input_var = T.matrix('input_unsup')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')

    # Build model
    print("Building model")

    # Some checkings
    assert len(n_hidden_u) > 0

    # Build unsupervised network
    encoder_net = InputLayer((None, n_col), input_var)

    for out in n_hidden_u:
        encoder_net = DenseLayer(encoder_net, num_units=out, nonlinearity=tanh)
        encoder_net = DropoutLayer(encoder_net)

    decoder_net = encoder_net
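    # Build the decoder by mirroring the encoder layer sizes in reverse, ending
    # with a linear layer that maps back to the n_col input features.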
    for i in range(len(n_hidden_u) - 2, -1, -1):
        decoder_net = DenseLayer(decoder_net,
                                 num_units=n_hidden_u[i],
                                 nonlinearity=linear)
        decoder_net = DropoutLayer(decoder_net)

    decoder_net = DenseLayer(decoder_net, num_units=n_col, nonlinearity=linear)
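    # Pick the output nonlinearity to match the input encoding: linear for
    # 'raw'/'w2v', sigmoid for 'bin', and softmax for histogram embeddings
    # (for 'histo3x26' the output is reshaped so the softmax is applied over
    # groups of 3 values).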

    if embedding_input == 'raw' or embedding_input == 'w2v':
        final_nonlin = linear
    elif embedding_input == 'bin':
        final_nonlin = sigmoid
    elif 'histo' in embedding_input:
        final_nonlin = softmax

    if embedding_input == 'histo3x26':
        laySize = lasagne.layers.get_output(decoder_net).shape
        decoder_net = ReshapeLayer(decoder_net, (laySize[0] * 26, 3))

    decoder_net = NonlinearityLayer(decoder_net, nonlinearity=final_nonlin)

    if embedding_input == 'histo3x26':
        decoder_net = ReshapeLayer(decoder_net, (laySize[0], laySize[1]))

    print("Building and compiling training functions")
    # Build and compile training functions
    predictions, predictions_det = mh.define_predictions(
        [encoder_net, decoder_net], start=0)
    prediction_sup, prediction_sup_det = mh.define_predictions(
        [encoder_net, decoder_net], start=0)

    # Define losses
    # reconstruction losses
    loss, loss_det = mh.define_loss(predictions[1], predictions_det[1],
                                    input_var, embedding_input)

    # Define parameters
    params = lasagne.layers.get_all_params(decoder_net, trainable=True)

    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty
    loss_det = loss_det + lmd * l2_penalty

    # Compute network updates
    updates = lasagne.updates.adam(loss, params, learning_rate=lr)
    # updates = lasagne.updates.sgd(loss,
    #                              params,
    #                              learning_rate=lr)
    # updates = lasagne.updates.momentum(loss, params,
    #                                    learning_rate=lr, momentum=0.0)

    # Apply norm constraints on the weights
    for k in updates.keys():
        if updates[k].ndim == 2:
            updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

    # Compile training function
    train_fn = theano.function([input_var],
                               loss,
                               updates=updates,
                               on_unused_input='ignore')

    # Expressions required for test
    monitor_labels = ['loss']
    val_outputs = [loss_det]

    # Add some monitoring on the learned feature embedding
    val_outputs += [
        predictions[0].min(), predictions[0].mean(), predictions[0].max(),
        predictions[0].var()
    ]
    monitor_labels += [
        "feat. emb. min", "feat. emb. mean", "feat. emb. max", "feat. emb. var"
    ]

    # Compile validation function
    val_fn = theano.function([input_var], val_outputs)

    pred_feat_emb = theano.function([input_var], predictions_det[0])

    # Finally, launch the training loop.
    print("Starting training...")

    # Some variables
    max_patience = 100
    patience = 0

    train_monitored = []
    valid_monitored = []
    train_loss = []

    nb_minibatches = n_row // batch_size
    print("Nb of minibatches: " + str(nb_minibatches))
    start_training = time.time()
    for epoch in range(num_epochs):
        start_time = time.time()
        print("Epoch {} of {}".format(epoch + 1, num_epochs))

        loss_epoch = 0

        # Train pass
        for batch in mlh.iterate_minibatches_unsup(x_train,
                                                   batch_size,
                                                   shuffle=True):
            loss_epoch += train_fn(batch)

        loss_epoch /= nb_minibatches
        train_loss += [loss_epoch]

        train_minibatches = mlh.iterate_minibatches_unsup(x_train,
                                                          batch_size,
                                                          shuffle=True)
        train_err = mlh.monitoring(train_minibatches,
                                   "train",
                                   val_fn,
                                   monitor_labels,
                                   start=0)
        train_monitored += [train_err]

        # Validation pass
        valid_minibatches = mlh.iterate_minibatches_unsup(x_valid,
                                                          batch_size,
                                                          shuffle=True)

        valid_err = mlh.monitoring(valid_minibatches,
                                   "valid",
                                   val_fn,
                                   monitor_labels,
                                   start=0)

        valid_monitored += [valid_err]

        try:
            early_stop_val = valid_err[monitor_labels.index('loss')]
        except ValueError:
            raise ValueError("There is no monitored value by the name of "
                             "'loss'")

        # Early stopping
        if epoch == 0:
            best_valid = early_stop_val
        elif early_stop_val < best_valid:
            best_valid = early_stop_val
            patience = 0

            # Save stuff
            np.savez(
                os.path.join(save_path, 'model_enc_unsupervised_best.npz'),
                *lasagne.layers.get_all_param_values(encoder_net))
            np.savez(os.path.join(save_path, 'model_ae_unsupervised_best.npz'),
                     *lasagne.layers.get_all_param_values(decoder_net))
            np.savez(os.path.join(save_path, "errors_unsupervised_best.npz"),
                     zip(*train_monitored), zip(*valid_monitored))
        else:
            patience += 1
            # Save stuff
            np.savez(
                os.path.join(save_path, 'model_enc_unsupervised_last.npz'),
                *lasagne.layers.get_all_param_values(encoder_net))
            np.savez(os.path.join(save_path, 'model_ae_unsupervised_last.npz'),
                     *lasagne.layers.get_all_param_values(decoder_net))
            np.savez(os.path.join(save_path, "errors_unsupervised_last.npz"),
                     zip(*train_monitored), zip(*valid_monitored))

        # End training
        if patience == max_patience or epoch == num_epochs - 1:
            print("   Ending training")
            # Load unsupervised best model
            if not os.path.exists(save_path +
                                  '/model_enc_unsupervised_best.npz'):
                print("No saved model to be tested and/or generate"
                      " the embedding !")
            else:
                with np.load(save_path +
                             '/model_enc_unsupervised_best.npz') as f:
                    param_values = [
                        f['arr_%d' % i] for i in range(len(f.files))
                    ]
                    lasagne.layers.set_all_param_values(
                        encoder_net, param_values)

                # Save embedding
                preds = []
                for batch in mlh.iterate_minibatches_unsup(x_train,
                                                           1,
                                                           shuffle=False):
                    preds.append(pred_feat_emb(batch))
                for batch in mlh.iterate_minibatches_unsup(x_valid,
                                                           1,
                                                           shuffle=False):
                    preds.append(pred_feat_emb(batch))
                preds = np.vstack(preds)
                np.savez(os.path.join(save_path, 'feature_embedding.npz'),
                         preds)

            # Stop
            print(" epoch time:\t\t\t{:.3f}s".format(time.time() - start_time))
            break

        print("  epoch time:\t\t\t{:.3f}s".format(time.time() - start_time))
        # Anneal the learning rate
        lr.set_value(float(lr.get_value() * learning_rate_annealing))

    # Print all final errors for train, validation and test
    print("Training time:\t\t\t{:.3f}s".format(time.time() - start_training))

    # Copy files to loadpath
    if save_path != save_copy:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)
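A minimal sketch of the max-norm trick used in the training set-ups above: after the optimizer updates are computed, every 2-D parameter update is passed through lasagne.updates.norm_constraint with a maximum norm of 1.0. The toy network and hyper-parameters are placeholders, not taken from any of the examples:

import theano
import theano.tensor as T
import lasagne

x = T.matrix('x')
y = T.ivector('y')

# A toy classifier (20 inputs -> 5-way softmax output).
net = lasagne.layers.InputLayer((None, 20), x)
net = lasagne.layers.DenseLayer(net, num_units=5,
                                nonlinearity=lasagne.nonlinearities.softmax)

prediction = lasagne.layers.get_output(net)
loss = lasagne.objectives.categorical_crossentropy(prediction, y).mean()
params = lasagne.layers.get_all_params(net, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)

# Rescale every 2-D parameter update (weight matrix) so its norm stays <= 1.0;
# 1-D parameters such as biases are left untouched.
for param in updates:
    if updates[param].ndim == 2:
        updates[param] = lasagne.updates.norm_constraint(updates[param], 1.0)

train_fn = theano.function([x, y], loss, updates=updates)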
def execute(dataset,
            n_hidden_u,
            n_hidden_t_enc,
            n_hidden_t_dec,
            n_hidden_s,
            embedding_source='histo_GenotypicFrequency_perclass',
            additional_unsup_input=None,
            num_epochs=500,
            learning_rate=.001,
            learning_rate_annealing=1.0,
            alpha=1,
            beta=1,
            delta=1,
            gamma=1,
            lmd=.0001,
            disc_nonlinearity="sigmoid",
            encoder_net_init=0.2,
            decoder_net_init=0.2,
            optimizer="rmsprop",
            max_patience=100,
            batchnorm=0,
            input_dropout=1.0,
            embedding_noise=0.0,
            keep_labels=1.0,
            prec_recall_cutoff=True,
            missing_labels_val=-1.0,
            which_fold=0,
            early_stop_criterion='loss_sup_det',
            input_decoder_mode="regression",
            save_path='/Users/Marie-Elyse/Downloads/embedding2',
            save_copy='/Users/Marie-Elyse/Downloads/embedding2',
            dataset_path='/Users/Marie-Elyse/Downloads/embedding2',
            resume=False,
            exp_name='',
            random_proj=0,
            bootstrap_snp_embeddings=0,
            bootstrap_cutoff=0.9):

    # Prepare embedding information :
    # - If no embedding is specified, use the transposed input matrix
    # - If a file is specified, use its content as feature embeddings
    # - Else (an embedding category like 'histo3x26' is provided), load a
    #   pregenerated embedding of the specified category
    if embedding_source is None or embedding_source == "raw":
        embedding_source = None
        embedding_input = 'raw'
    elif os.path.exists(embedding_source):
        embedding_input = embedding_source
    else:
        embedding_input = embedding_source
        embedding_source = os.path.join(
            dataset_path, embedding_input + '_fold' + str(which_fold) + '.npy')

    # Load the dataset
    print("Loading data")
    (x_train, y_train, exmpl_ids_train, x_valid, y_valid, exmpl_ids_valid,
     x_test, y_test, exmpl_ids_test, x_unsup, training_labels, feature_names,
     label_names) = mlh.load_data(dataset,
                                  dataset_path,
                                  embedding_source,
                                  which_fold=which_fold,
                                  keep_labels=keep_labels,
                                  missing_labels_val=missing_labels_val,
                                  embedding_input=embedding_input,
                                  norm=False)

    # Load the additional unsupervised data, if some is specified
    if additional_unsup_input is not None:
        print("Adding additional data to the model's unsupervised inputs")
        paths = additional_unsup_input.split(";")
        additional_unsup_data = [np.load(p) for p in paths]
        print(x_unsup.shape)
        x_unsup = np.hstack(additional_unsup_data + [x_unsup])
        print(x_unsup.shape)

    if x_unsup is not None:
        n_samples_unsup = x_unsup.shape[1]
    else:
        n_samples_unsup = 0

    original_x_train = x_train.copy()
    original_x_valid = x_valid.copy()
    original_x_test = x_test.copy()

    # Change how the missing data values are encoded. Right now they are
    # encoded as being the mean of the corresponding feature so that, after
    # feature normalization, they will be 0s. However, this prevents us from
    # transferring the minibatch data as int8 so we replace those values with -1s.
    for i in range(x_train.shape[1]):
        feature_mean = x_train[:, i].mean()
        x_train[:, i] = mh.replace_arr_value(x_train[:, i], feature_mean, -1)
        x_valid[:, i] = mh.replace_arr_value(x_valid[:, i], feature_mean, -1)
        x_test[:, i] = mh.replace_arr_value(x_test[:, i], feature_mean, -1)
    x_train = x_train.astype("int8")
    x_valid = x_valid.astype("int8")
    x_test = x_test.astype("int8")

    # Normalize the input data. The mlh.load_data() function already offers
    # this feature but we need to do it here so that we will have access to
    # both the normalized and unnormalized input data.
    norm_mus = original_x_train.mean(axis=0)
    norm_sigmas = original_x_train.std(axis=0) + 1e-6

    #x_train = (x_train - norm_mus[None, :]) / norm_sigmas[None, :]
    #x_valid = (x_valid - norm_mus[None, :]) / norm_sigmas[None, :]
    #x_test = (x_test - norm_mus[None, :]) / norm_sigmas[None, :]

    #x_train *= (315345. / 553107)
    #x_valid *= (315345. / 553107)
    #x_test *= (315345. / 553107)

    # Setup variables to build the right type of decoder bases on the value of
    # `input_decoder_mode`
    assert input_decoder_mode in ["regression", "classification"]
    if input_decoder_mode == "regression":
        # The size of the input reconstruction will be the same as the number
        # of inputs
        decoder_encoder_unit_ratio = 1
    elif input_decoder_mode == "classification":
        # The size of the input reconstruction will be N times larger than
        # the number of inputs, where N is the number of distinct discrete
        # values that each input can take. For SNP input data with an additive
        # coding scheme, N=3 because the 3 possible values are {0, 1, 2}.
        nb_discrete_vals_by_input = int(original_x_train.max() + 1)
        decoder_encoder_unit_ratio = nb_discrete_vals_by_input

        # Print baseline accuracy for the imputation of genes
        print("Distribution of input values in valid: %f %f %f" %
              ((original_x_train == 0).mean(), (original_x_train == 1).mean(),
               (original_x_train == 2).mean()))
        print("Distribution of input values in test: %f %f %f" %
              ((original_x_test == 0).mean(), (original_x_test == 1).mean(),
               (original_x_test == 2).mean()))

    # Extract required information from data
    n_samples, n_feats = x_train.shape
    print("Number of features : ", n_feats)
    print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1]))
    n_targets = y_train.shape[1] if y_train.ndim == 2 else y_train.max() + 1

    # Set some variables
    batch_size = 138
    beta = gamma if (gamma == 0) else beta

    # Generate an name for the experiment based on the hyperparameters used
    if embedding_source is None:
        embedding_name = embedding_input
    else:
        embedding_name = embedding_source.replace("_", "").split(".")[0]
        exp_name += embedding_name.rsplit('/', 1)[::-1][0] + '_'

    exp_name += mlh.define_exp_name(
        keep_labels, alpha, beta, gamma, lmd, n_hidden_u, n_hidden_t_enc,
        n_hidden_t_dec, n_hidden_s, which_fold, learning_rate,
        decoder_net_init, encoder_net_init, batchnorm, input_dropout,
        embedding_noise, early_stop_criterion, learning_rate_annealing,
        input_decoder_mode)
    print("Experiment: " + exp_name)

    # Ensure that the folders where the results of the experiment will be
    # saved do exist. Create them if they don't.
    save_path = os.path.join(save_path, dataset, exp_name)
    save_copy = os.path.join(save_copy, dataset, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(save_copy):
        os.makedirs(save_copy)

    # Prepare Theano variables for inputs and targets
    input_var_sup = T.bmatrix('input_sup')
    input_var_unsup = theano.shared(x_unsup, 'input_unsup')  # x_unsup TBD
    target_var_sup = T.matrix('target_sup')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')

    # Use the provided mus and sigmas to process the missing values and
    # normalize the inputs
    b_input_var_sup = input_var_sup.astype("float32")
    normed_input_sup = (T.eq(b_input_var_sup, -1) * norm_mus +
                        T.neq(b_input_var_sup, -1) * b_input_var_sup)
    normed_input_sup = (normed_input_sup - norm_mus) / norm_sigmas

    reconst_target_sup = T.cast(input_var_sup, "int32")

    # Build model
    print("Building model")

    # Some checkings
    # assert len(n_hidden_u) > 0
    assert len(n_hidden_t_enc) > 0
    assert len(n_hidden_t_dec) > 0
    assert n_hidden_t_dec[-1] == n_hidden_t_enc[-1]

    # Build feature embedding networks (encoding and decoding if gamma > 0)
    nets, embeddings, pred_feat_emb = mh.build_feat_emb_nets(
        embedding_source, n_feats, n_samples_unsup, input_var_unsup,
        n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, gamma, encoder_net_init,
        decoder_net_init, save_path, random_proj, decoder_encoder_unit_ratio,
        embedding_noise)

    # Build feature embedding reconstruction networks (if alpha > 0, beta > 0)
    nets += mh.build_feat_emb_reconst_nets(
        [alpha, beta], n_samples_unsup, n_hidden_u,
        [n_hidden_t_enc, n_hidden_t_dec], nets,
        [encoder_net_init, decoder_net_init])

    # Supervised network
    discrim_net, hidden_rep = mh.build_discrim_net(
        batch_size, n_feats, normed_input_sup, n_hidden_t_enc, n_hidden_s,
        embeddings[0], disc_nonlinearity, n_targets, batchnorm, input_dropout)

    # Reconstruct network
    nets += [
        mh.build_reconst_net(hidden_rep,
                             embeddings[1] if len(embeddings) > 1 else None,
                             n_feats * decoder_encoder_unit_ratio, gamma,
                             decoder_encoder_unit_ratio)
    ]

    # Load weights if we are resuming job
    if resume:
        # Load best model
        with np.load(os.path.join(save_copy, 'dietnet_best.npz')) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        nlayers = len(
            lasagne.layers.get_all_params(filter(None, nets) + [discrim_net]))
        #lasagne.layers.set_all_param_values(filter(None, nets) +
        #                                    [discrim_net],
        #                                    param_values[:nlayers])

        params = lasagne.layers.get_all_params(
            filter(None, nets) + [discrim_net])
        for p, v in zip(params, param_values[:nlayers]):
            # Do not overwrite embedding value with old embedding. Removing
            # the following condition will prevent a trained model from being
            # tested on a different dataset
            if p.name != "feat_emb":
                p.set_value(v)

    print("Building and compiling training functions")

    # Build and compile training functions
    predictions, predictions_det = mh.define_predictions(nets, start=2)
    prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net])
    prediction_sup = prediction_sup[0]
    prediction_sup_det = prediction_sup_det[0]

    # Define losses
    # reconstruction losses
    if input_decoder_mode == "regression":
        reconst_losses, reconst_losses_det = mh.define_reconst_losses(
            predictions, predictions_det,
            [input_var_unsup, input_var_unsup, normed_input_sup])
    elif input_decoder_mode == "classification":
        # Obtain regular reconstruction losses for every reconstruction
        # but the reconstruction of the supervised input data
        reconst_losses1, reconst_losses_det1 = mh.define_reconst_losses(
            predictions[:-1], predictions_det[:-1],
            [input_var_unsup, input_var_unsup])

        # Obtain a "classification" reconstruction loss for the reconstruction
        # of the supervised input data. This classification loss will be
        # performed on the input data without normalization
        reconst_losses2, reconst_losses_det2 = mh.define_classif_reconst_losses(
            predictions[-1:], predictions_det[-1:], [reconst_target_sup],
            [decoder_encoder_unit_ratio])

        reconst_losses = reconst_losses1 + reconst_losses2
        reconst_losses_det = reconst_losses_det1 + reconst_losses_det2

    # supervised loss
    sup_loss, sup_loss_det = mh.define_sup_loss(disc_nonlinearity,
                                                prediction_sup,
                                                prediction_sup_det,
                                                keep_labels, target_var_sup,
                                                missing_labels_val)

    # Define inputs
    inputs = [input_var_sup, target_var_sup]

    # Define parameters
    params = lasagne.layers.get_all_params([discrim_net] + filter(None, nets),
                                           trainable=True,
                                           unwrap_shared=False)
    params_to_freeze = \
        lasagne.layers.get_all_params(filter(None, nets), trainable=False,
                                      unwrap_shared=False)

    # Remove unshared variables from params and params_to_freeze
    params = [
        p for p in params
        if isinstance(p, theano.compile.sharedvalue.SharedVariable)
    ]
    params_to_freeze = [
        p for p in params_to_freeze
        if isinstance(p, theano.compile.sharedvalue.SharedVariable)
    ]
    print("Params : ", params)

    feat_emb_var = next(p for p in lasagne.layers.get_all_params([discrim_net])
                        if p.name == 'input_unsup' or p.name == 'feat_emb')
    # feat_emb_var = lasagne.layers.get_all_params([discrim_net])[0]
    print(feat_emb_var)
    feat_emb_val = feat_emb_var.get_value()
    feat_emb_norms = (feat_emb_val**2).sum(0)**0.5
    feat_emb_var.set_value(feat_emb_val / feat_emb_norms)
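    # The block above rescales each column of the feature-embedding matrix to
    # unit L2 norm before training starts.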

    print('Number of params discrim: ' + str(len(params)))
    print('Number of params to freeze: ' + str(len(params_to_freeze)))

    for p in params_to_freeze:
        new_params = [el for el in params if el != p]
        params = new_params
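    # At this point params contains only the parameters that will actually be
    # updated (the frozen ones have been removed).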

    print('Number of params to update: ' + str(len(params)))

    # Combine losses
    loss = delta*sup_loss + alpha*reconst_losses[0] + beta*reconst_losses[1] + \
        gamma*reconst_losses[2]
    loss_det = delta*sup_loss_det + alpha*reconst_losses_det[0] + \
        beta*reconst_losses_det[1] + gamma*reconst_losses_det[2]

    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty
    loss_det = loss_det + lmd * l2_penalty

    # Compute network updates
    assert optimizer in ["rmsprop", "adam", "amsgrad"]
    if optimizer == "rmsprop":
        updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr)
    elif optimizer == "adam":
        updates = lasagne.updates.adam(loss, params, learning_rate=lr)
    elif optimizer == "amsgrad":
        updates = lasagne.updates.amsgrad(loss, params, learning_rate=lr)
    #updates = lasagne.updates.sgd(loss,
    #                              params,
    #                              learning_rate=lr)
    # updates = lasagne.updates.momentum(loss, params,
    #                                    learning_rate=lr, momentum=0.0)

    # Apply norm constraints on the weights
    for k in updates.keys():
        if updates[k].ndim == 2:
            updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

    # Compile training function
    train_fn = theano.function(inputs,
                               loss,
                               updates=updates,
                               on_unused_input='ignore')

    # Monitoring Labels
    monitor_labels = [
        "reconst. feat. W_enc", "reconst. feat. W_dec", "reconst. loss"
    ]
    monitor_labels = [
        i for i, j in zip(monitor_labels, reconst_losses) if j != 0
    ]
    monitor_labels += ["feat. W_enc. mean", "feat. W_enc var"]
    monitor_labels += ["feat. W_dec. mean", "feat. W_dec var"] if \
        (embeddings[1] is not None) else []
    monitor_labels += ["loss. sup.", "total loss"]

    # Build and compile test function
    val_outputs = reconst_losses_det
    val_outputs = [i for i, j in zip(val_outputs, reconst_losses) if j != 0]
    val_outputs += [embeddings[0].mean(), embeddings[0].var()]
    val_outputs += [embeddings[1].mean(), embeddings[1].var()] if \
        (embeddings[1] is not None) else []
    val_outputs += [sup_loss_det, loss_det]

    # Compute supervised accuracy and add it to monitoring list
    test_acc, test_pred = mh.define_test_functions(disc_nonlinearity,
                                                   prediction_sup,
                                                   prediction_sup_det,
                                                   target_var_sup)
    monitor_labels.append("accuracy")
    val_outputs.append(test_acc)

    # If appropriate, compute the input reconstruction accuracy and add it to
    # the monitoring list
    if input_decoder_mode == "classification":
        input_reconst_acc = mh.define_classif_reconst_acc(
            predictions_det[-1], reconst_target_sup,
            decoder_encoder_unit_ratio)
        #import pdb; pdb.set_trace()
        monitor_labels.append("input_reconst_acc")
        val_outputs.append(input_reconst_acc)

    # Compile prediction function
    predict = theano.function([input_var_sup], test_pred)
    predict_from_normed_inps = theano.function([normed_input_sup], test_pred)

    predict_scores = theano.function([input_var_sup], prediction_sup_det)
    predict_scores_from_normed_inps = theano.function([input_var_sup],
                                                      prediction_sup_det)

    # Compile validation function
    val_fn = theano.function(inputs, [prediction_sup_det] + val_outputs,
                             on_unused_input='ignore')

    # Finally, launch the training loop.
    print("Starting training...")

    # Some variables
    patience = 0

    train_monitored = []
    valid_monitored = []
    train_loss = []

    # Pre-training monitoring
    print("Epoch 0 of {}".format(num_epochs))

    train_minibatches = mlh.iterate_minibatches(x_train,
                                                y_train,
                                                batch_size,
                                                shuffle=False)
    train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                               monitor_labels, prec_recall_cutoff)

    valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                y_valid,
                                                batch_size,
                                                shuffle=False)
    valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                               monitor_labels, prec_recall_cutoff)

    # Before starting training, save a copy of the model so that there is
    # always a 'dietnet_best.npz' file to load at the end of training
    np.savez(
        os.path.join(save_path, 'dietnet_best.npz'),
        *lasagne.layers.get_all_param_values(
            filter(None, nets) + [discrim_net]))

    # Training loop
    start_training = time.time()
    for epoch in range(num_epochs):
        start_time = time.time()
        print("Epoch {} of {}".format(epoch + 1, num_epochs))
        nb_minibatches = 0
        loss_epoch = 0

        # Train pass
        for batch in mlh.iterate_minibatches(x_train,
                                             training_labels,
                                             batch_size,
                                             shuffle=True):
            loss_epoch += train_fn(*batch)
            nb_minibatches += 1

        loss_epoch /= nb_minibatches
        train_loss += [loss_epoch]

        # Monitoring on the training set
        train_minibatches = mlh.iterate_minibatches(x_train,
                                                    y_train,
                                                    batch_size,
                                                    shuffle=False)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, prec_recall_cutoff)
        train_monitored += [train_err]

        # Monitoring on the validation set
        valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                    y_valid,
                                                    batch_size,
                                                    shuffle=False)

        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, prec_recall_cutoff)
        valid_monitored += [valid_err]

        try:
            early_stop_val = valid_err[monitor_labels.index(
                early_stop_criterion)]
        except:
            raise ValueError("There is no monitored value by the name of %s" %
                             early_stop_criterion)

        valid_loss_sup_hist = [
            v[monitor_labels.index("loss. sup.")] for v in valid_monitored
        ]
        valid_loss_sup = valid_loss_sup_hist[-1]

        # Early stopping
        if epoch == 0:
            best_valid = early_stop_val
        elif ((early_stop_val > best_valid
               and early_stop_criterion == 'input_reconst_acc')
              or (early_stop_val > best_valid
                  and early_stop_criterion == 'accuracy')
              or (early_stop_val >= best_valid
                  and early_stop_criterion == 'accuracy'
                  and valid_loss_sup == min(valid_loss_sup_hist))
              or (early_stop_val < best_valid
                  and early_stop_criterion == 'loss. sup.')):
            best_valid = early_stop_val
            patience = 0

            # Save stuff
            np.savez(
                os.path.join(save_path, 'dietnet_best.npz'),
                *lasagne.layers.get_all_param_values(
                    filter(None, nets) + [discrim_net]))
            np.savez(save_path + "/errors_supervised_best.npz",
                     zip(*train_monitored), zip(*valid_monitored))

            # Monitor on the test set now because sometimes the saving doesn't
            # go well and there isn't a model to load at the end of training
            if y_test is not None:
                test_minibatches = mlh.iterate_minibatches(x_test,
                                                           y_test,
                                                           138,
                                                           shuffle=False)

                test_err = mlh.monitoring(test_minibatches, "test", val_fn,
                                          monitor_labels, prec_recall_cutoff)
        else:
            patience += 1
            # Save stuff
            np.savez(
                os.path.join(save_path, 'dietnet_last.npz'),
                *lasagne.layers.get_all_param_values(
                    filter(None, nets) + [discrim_net]))
            np.savez(save_path + "/errors_supervised_last.npz",
                     zip(*train_monitored), zip(*valid_monitored))

        print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time))

        # End training if needed
        if patience == max_patience or epoch == num_epochs - 1:
            break

        # Anneal the learning rate
        lr.set_value(
            np.array(lr.get_value() * learning_rate_annealing,
                     dtype="float32"))

    # End training with a final monitoring step on the best model
    print("Ending training")

    # Load best model
    with np.load(os.path.join(save_path, 'dietnet_best.npz')) as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        nlayers = len(
            lasagne.layers.get_all_params(filter(None, nets) + [discrim_net]))

        #lasagne.layers.set_all_param_values(filter(None, nets) +
        #                                    [discrim_net],
        #                                    param_values[:nlayers])
        params = lasagne.layers.get_all_params(
            filter(None, nets) + [discrim_net])
        for p, v in zip(params, param_values[:nlayers]):
            # Do not overwrite the current embedding with the saved one.
            # Removing this condition would prevent a trained model from being
            # tested on a different dataset.
            if p.name != "feat_emb":
                p.set_value(v)

        if embedding_source is None:
            # Save embedding
            pred = pred_feat_emb()
            np.savez(os.path.join(save_path, 'feature_embedding.npz'), pred)

        # Training set results
        train_minibatches = mlh.iterate_minibatches(x_train,
                                                    y_train,
                                                    batch_size,
                                                    shuffle=False)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, prec_recall_cutoff)

        # Validation set results
        valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                    y_valid,
                                                    batch_size,
                                                    shuffle=False)
        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, prec_recall_cutoff)

        # Test set results
        if y_test is not None:
            test_minibatches = mlh.iterate_minibatches(x_test,
                                                       y_test,
                                                       138,
                                                       shuffle=False)

            test_err = mlh.monitoring(test_minibatches, "test", val_fn,
                                      monitor_labels, prec_recall_cutoff)

            # Test the model's accuracy with varying levels of provided SNPs
            test_minibatches = mlh.iterate_minibatches(x_test,
                                                       y_test,
                                                       138,
                                                       shuffle=False)
            mlh.eval_prediction(test_minibatches,
                                "test (rescaled)",
                                predict_from_normed_inps,
                                norm_mus,
                                norm_sigmas,
                                nb_evals=1,
                                rescale_inputs=True)
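            # The "rescaled" evaluation above relies on the per-feature
            # normalization statistics norm_mus and norm_sigmas; as a rough
            # sketch (an assumption, not necessarily mlh's exact behaviour):
            #     x_normed = (x - norm_mus) / norm_sigmas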

        # Save the model's test predictions to file
        print(x_test.shape)
        test_predictions = []
        for minibatch in mlh.iterate_testbatches(x_test, 1, shuffle=False):
            test_predictions += [predict(minibatch)]
        print(len(test_predictions))
        print(sum([t.shape[0] for t in test_predictions]))
        np.savez(os.path.join(save_path, 'test_predictions.npz'),
                 test_predictions)

        # Get the scores assigned by the model to each class for each test sample
        test_scores = []
        for minibatch in mlh.iterate_testbatches(x_test, 1, shuffle=False):
            test_scores += [predict_scores(minibatch)]
        np.savez(os.path.join(save_path, 'test_scores.npz'), test_scores)

        # Generate new SNP embeddings using test examples labeled according
        # to the model's predictions
        if bootstrap_snp_embeddings:

            if bootstrap_cutoff == "soft":
                bootstrap_snp_data = np.hstack(
                    (x_train.transpose(), x_valid.transpose(),
                     x_test.transpose()))
                bootstrap_labels = np.vstack(
                    (y_train, y_valid, np.array(test_scores)[:, 0, :]))

                filename_genotypic = 'bootstrap_gen_snp_embeddings_softlabels.npy'
                filename_allelic = 'bootstrap_all_snp_embeddings_softlabels.npy'

            else:  # Hard cutoff
                sure_test_idxs = np.argwhere(
                    (np.array(test_scores)[:, 0, :] >
                     bootstrap_cutoff).sum(1)).flatten()
                sure_test_inputs = x_test[sure_test_idxs]
                sure_test_preds = np.array(test_scores)[sure_test_idxs,
                                                        0].argmax(1)

                bootstrap_snp_data = np.hstack(
                    (x_train.transpose(), x_valid.transpose(),
                     sure_test_inputs.transpose()))
                bootstrap_labels = np.hstack(
                    (y_train.argmax(1), y_valid.argmax(1), sure_test_preds))

                filename_genotypic = 'bootstrap_gen_snp_embeddings_cutoff%f.npy' % bootstrap_cutoff
                filename_allelic = 'bootstrap_all_snp_embeddings_cutoff%f.npy' % bootstrap_cutoff

            utils_helpers.generate_snp_hist(
                bootstrap_snp_data,
                bootstrap_labels,
                label_names=label_names,
                perclass=True,
                sum_to_one=True,
                filename_genotypic=os.path.join(save_path, filename_genotypic),
                filename_allelic=os.path.join(save_path, filename_allelic))
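            # A minimal sketch of the kind of per-class genotype histogram that
            # generate_snp_hist is assumed to compute (hypothetical helper, not
            # the utils_helpers implementation; hard, integer labels assumed):
            # for each SNP, count how often each genotype value (0, 1, 2)
            # occurs in each class and normalise the counts to sum to one.
            def per_class_genotype_hist_sketch(snp_data, labels, n_classes):
                # snp_data: (n_snps, n_subjects); labels: (n_subjects,) ints
                hist = np.zeros((snp_data.shape[0], 3 * n_classes),
                                dtype="float32")
                for c in range(n_classes):
                    class_snps = snp_data[:, labels == c]
                    for v in range(3):
                        hist[:, 3 * c + v] = (class_snps == v).sum(1)
                return hist / np.maximum(hist.sum(1, keepdims=True), 1)
            # e.g. (hard-cutoff case):
            # emb = per_class_genotype_hist_sketch(bootstrap_snp_data,
            #                                      bootstrap_labels,
            #                                      len(label_names))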

    # Print all final errors for train, validation and test
    print("Training time:\t\t\t{:.3f}s".format(time.time() - start_training))

    # Analyse the model's gradients to determine the influence of each SNP on
    # each of the model's predictions
    print(label_names)
    class_idx = T.iscalar("class index")
    grad_fn = theano.function([input_var_sup, class_idx],
                              T.grad(prediction_sup_det[:, class_idx].mean(),
                                     input_var_sup).mean(0))
    grads_wrt_inputs = mlh.get_grads_wrt_inputs(x_test, grad_fn, feature_names,
                                                label_names)
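    # Rough sketch of the kind of per-class saliency computed above (an
    # assumption about mlh.get_grads_wrt_inputs, not its exact output format):
    # one input-gradient vector per class, averaged over the test examples.
    def grads_wrt_inputs_sketch(inputs, grad_fn, class_names):
        # grad_fn(inputs, c) returns the gradient of class c's mean score wrt
        # the inputs, already averaged over the minibatch dimension.
        return {name: grad_fn(inputs, c) for c, name in enumerate(class_names)}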

    # Obtain a function that takes normed inputs and returns the gradient of a
    # class score wrt those normed inputs (this is required because computing
    # the integrated gradients involves interpolating between an example where
    # all features are missing and an example where any number of features are
    # provided)
    grad_from_normed_fn = theano.function(
        [normed_input_sup, class_idx],
        T.grad(prediction_sup_det[:, class_idx].sum(),
               normed_input_sup).mean(0))

    # Collect integrated gradients over the whole test set. Obtain, for each
    # SNP and for each possible value (0, 1 or 2), the average contribution of
    # that value, for that SNP, to the score of each class.
    avg_int_grads = np.zeros((x_test.shape[1], 3, len(label_names)),
                             dtype="float32")
    counts_int_grads = np.zeros((x_test.shape[1], 3), dtype="int32")
    for test_idx in range(x_test.shape[0]):
        int_grads = mlh.get_integrated_gradients(x_test[test_idx],
                                                 grad_from_normed_fn,
                                                 feature_names,
                                                 label_names,
                                                 norm_mus,
                                                 norm_sigmas,
                                                 m=100)

        snp_value_mask = np.arange(3) == x_test[test_idx][:, None]
        avg_int_grads += (snp_value_mask[:, :, None] *
                          int_grads.transpose()[:, None, :])
        counts_int_grads += snp_value_mask
    avg_int_grads = avg_int_grads / counts_int_grads[:, :, None]
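    # For reference, a minimal sketch of the integrated-gradients estimate that
    # mlh.get_integrated_gradients is assumed to compute for a single class
    # (hypothetical helper; an all-zero normed baseline stands in for "all
    # features missing"):
    #     IG_i ~ (x_i - b_i) * (1/m) * sum_k dF_c(b + (k/m) * (x - b)) / dx_i
    def integrated_gradients_sketch(x_normed, class_idx, grad_fn, m=100):
        # x_normed: 1-D float32 vector of normed inputs; grad_fn is a compiled
        # function such as grad_from_normed_fn above.
        baseline = np.zeros_like(x_normed)
        summed_grads = np.zeros_like(x_normed)
        for k in range(1, m + 1):
            point = baseline + (float(k) / m) * (x_normed - baseline)
            summed_grads += grad_fn(point[None, :], class_idx)
        return (x_normed - baseline) * summed_grads / m
    # e.g. int_grads_c = integrated_gradients_sketch(normed_x, c,
    #                                                grad_from_normed_fn)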

    # Save all the additional information required for model analysis:
    # - Test predictions and class scores
    # - SNP IDs (feature names) and subject IDs
    # - Normalization parameters for the input minibatches
    # - Gradients and integrated gradients of the class scores wrt the inputs
    np.savez(os.path.join(save_path, 'additional_data.npz'),
             test_labels=y_test,
             test_scores=np.array(test_scores)[:, 0],
             test_predictions=np.array(test_predictions)[:, 0],
             norm_mus=norm_mus,
             norm_sigmas=norm_sigmas,
             grads_wrt_inputs=grads_wrt_inputs,
             exmpl_ids_train=exmpl_ids_train,
             exmpl_ids_valid=exmpl_ids_valid,
             exmpl_ids_test=exmpl_ids_test,
             feature_names=feature_names,
             label_names=label_names,
             avg_int_grads=avg_int_grads)

    # Copy files to save_copy (only if some training has been done, so there
    # is a locally saved version)
    if save_path != save_copy and num_epochs > 0:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)
Beispiel #23
0
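# Note: the snippet below is a fragment of a larger script; l_hiddens, scales,
# vscale, y and the updates_* collections are assumed to be defined earlier.
# The imports here are an assumption that simply makes the Lasagne/Theano
# names used in the fragment explicit.
import numpy as np
import theano.tensor as T
from lasagne.layers import DenseLayer, dropout, get_output, get_all_params
from lasagne.nonlinearities import rectify, softmax
from lasagne.init import Orthogonal, Constant
from lasagne.regularization import apply_penalty, l1, l2
from lasagne.objectives import categorical_crossentropy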
# Optionally stack extra hidden layers (the range is 0 here, so none are added)
for i in xrange(0):
    l_hiddens.append(DenseLayer(dropout(l_hiddens[-1]), num_units=100,
                                nonlinearity=rectify))
l_out = DenseLayer(dropout(l_hiddens[-1]), num_units=y.shape[1],
                   nonlinearity=softmax, W=Orthogonal())

def reset():
    # Re-initialise all weights, biases and optimiser accumulator state so
    # training can be restarted from scratch without rebuilding the graph.
    if any(np.isnan(scale.get_value()) for scale in scales):
        for scale in scales:
            scale.set_value(1.)
    for l in l_hiddens:
        l.b.set_value(Constant()(l.b.get_value().shape))
        l.W.set_value(Orthogonal()(l.W.get_value().shape))
    l_out.b.set_value(Constant()(l_out.b.get_value().shape))
    l_out.W.set_value(Orthogonal()(l_out.W.get_value().shape))
    for p in (p for u in (updates_ada, updates_other, updates_scal)
              for p in u if p not in get_all_params(l_out)):
        p.set_value(Constant()(p.get_value().shape))
# "Chunky" penalties: penalize the first layer's weights after dividing them
# row-wise by the learned per-feature scale vscale; other params as usual.
chunky_l2 = (apply_penalty(get_all_params(l_out, regularizable=True), l2) -
             l2(l_hiddens[0].W) + l2(l_hiddens[0].W / T.reshape(vscale, (206279, 1))))
chunky_l1 = (apply_penalty(get_all_params(l_out, regularizable=True), l1) -
             l1(l_hiddens[0].W) + l1(l_hiddens[0].W / T.reshape(vscale, (206279, 1))))
simple_l2 = apply_penalty(get_all_params(l_out, regularizable=True), l2)
#l_out2 = DenseLayer(dropout(l_hiddens2[-1]), num_units=y.shape[1])
#l_out = lasagne.layers.NonlinearityLayer(lasagne.layers.ElemwiseSumLayer((l_out1,l_out2),.5), softmax)

#categorical_crossentropy(get_output(l_out)[train_indice])

target = T.fmatrix(name="target")
#f=theano.function([l_in.input_var],get_output(l_out),allow_input_downcast=True)
#f(X[0,:].toarray())

loss = categorical_crossentropy(get_output(l_out), target).mean()
# train_loss_smoo=categorical_crossentropy(get_output(l_out,deterministic=True)[train_indices,],target[train_indices,]).mean()
# valid_loss=categorical_crossentropy(get_output(l_out)[valid_indices,],target[valid_indices,]).mean()
# valid_loss_smoo=categorical_crossentropy(get_output(l_out,deterministic=True)[valid_indices,],target[valid_indices,]).mean()
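# A sketch of how the loss and a weighted penalty could be combined into the
# optimiser updates referenced in reset() above; the penalty weight and the
# learning rate here are assumptions, not the author's actual settings.
from lasagne.updates import adagrad

def build_updates_sketch(penalty_weight=1e-4, learning_rate=0.01):
    reg_loss = loss + penalty_weight * chunky_l2
    return adagrad(reg_loss, get_all_params(l_out, trainable=True),
                   learning_rate=learning_rate)
# e.g. updates_sketch = build_updates_sketch()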