Beispiel #1
0
def _tally_class_hits(class_correct, class_total, labels, hits):
    """Add one minibatch's per-class hit and sample counts to running tallies.

    Args:
        class_correct: list indexed by class id; the entry of a sample's true
            class is incremented by 1 when that sample was predicted correctly.
        class_total: list indexed by class id; incremented by 1 per sample.
        labels: tensor of true class indices, one per sample.
        hits: tensor with one truthy/falsy entry per sample
            (prediction == label).
    """
    # Iterating over the flattened tensors (instead of range(batch_size))
    # keeps this correct for a short final minibatch and for 1-sample batches.
    for label, hit in zip(labels.reshape(-1).tolist(),
                          hits.reshape(-1).tolist()):
        class_correct[label] += float(hit)
        class_total[label] += 1


def execute(raw_path, batch_size, fold, hidden_sizes, n_epochs, patience, use_embed_layer, dropout_sizes):
    """Train the discriminative network on one fold and evaluate it on the test set.

    The data is loaded through ``mlh.load_data``; when ``use_embed_layer`` is
    truthy a feature embedding is computed once with ``mh.feat_emb_net`` and
    handed to ``mh.discrim_net``. The network is trained with Adam and an MSE
    loss, early-stopped on the validation loss, then the weights stored in
    ``checkpoint.pt`` are reloaded and the test set is scored. The final model
    object is saved to ``discrim_model.pt``.

    Args:
        raw_path: forwarded to ``mlh.load_data``.
        batch_size: minibatch size handed to ``mlh.iterate_minibatches``.
        fold: forwarded to ``mlh.load_data``.
        hidden_sizes: sequence whose first two entries are used as the two
            hidden-layer sizes.
        n_epochs: maximum number of training epochs.
        patience: forwarded to ``EarlyStopping``.
        use_embed_layer: when truthy, load the ``'embed_4x26_fold'`` embedding
            source and build the embedding network.
        dropout_sizes: forwarded to ``mh.discrim_net``.

    Returns:
        ``[train_losses, valid_losses, train_accs, valid_accs, test_acc,
        epoch_times, train_time]``. Losses are per-sample means for each
        epoch, accuracies are percentages, times are in seconds.
    """
    dataset_path = 'data/'
    embedding_source = 'embed_4x26_fold' if use_embed_layer else []

    learning_rate = 3e-5
    n_hidden_1 = hidden_sizes[0]
    n_hidden_2 = hidden_sizes[1]

    x_train, y_train, x_valid, y_valid, x_test, y_test, \
    x_unsup, training_labels = mlh.load_data(dataset_path, raw_path, embedding_source, fold)
    n_feats = x_train.shape[1]
    n_targets = training_labels.shape[1]

    embedding = []

    print('Build models')
    if use_embed_layer:
        n_emb = x_unsup.shape[1]
        feat_emb = torch.from_numpy(x_unsup)
        # Build embedding model
        emb_model = mh.feat_emb_net(n_emb, n_hidden_1, n_hidden_2)
        embedding = emb_model(feat_emb)
        # Swap the two axes so the embedding fits the weights of the
        # discriminative network.
        embedding = torch.transpose(embedding, 1, 0)

    # Build discrim model
    discrim_model = mh.discrim_net(embedding, n_feats, n_hidden_1, n_hidden_2, n_targets, dropout_sizes)

    loss_fn = nn.MSELoss(reduction='mean')
    optimizer = torch.optim.Adam(discrim_model.parameters(), lr=learning_rate)

    # Finally, launch the training loop.
    print("Start training ...")
    # Materialised once so the same minibatches are reused every epoch.
    train_minibatches = list(mlh.iterate_minibatches(x_train, y_train,
                                                     batch_size))
    valid_minibatches = list(mlh.iterate_minibatches(x_valid, y_valid,
                                                     batch_size))
    test_minibatches = list(mlh.iterate_minibatches(x_test, y_test,
                                                    batch_size))

    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=patience, verbose=True)
    train_step = mlh.make_train_step(discrim_model, loss_fn, optimizer)
    epoch_len = len(str(n_epochs))

    train_losses = []
    valid_losses = []

    valid_accs = []
    train_accs = []
    start_training = time.time()
    epoch_times = []
    for epoch in range(n_epochs):
        print("Epoch {} of {}".format(epoch + 1, n_epochs))
        epoch_start_time = time.time()

        # The validation pass below switches to eval mode; switch back so
        # dropout is active again for every training epoch, not just the first.
        discrim_model.train()

        class_correct = [0.] * n_targets
        class_total = [0.] * n_targets
        train_loss = 0.
        n_train_seen = 0
        for x_batch, y_batch in train_minibatches:
            inputs = torch.from_numpy(x_batch)
            targets = torch.from_numpy(y_batch)
            n_batch = inputs.shape[0]

            loss, pred = train_step(inputs, targets)
            # Weight by the real batch length so a short final batch does not
            # bias the epoch mean.
            train_loss += loss * n_batch
            n_train_seen += n_batch

            # compare predictions to true label (targets hold one row of
            # per-class values per sample; the arg-max is the class index)
            labels = torch.argmax(targets, dim=1)
            _tally_class_hits(class_correct, class_total, labels,
                              pred.eq(labels.view_as(pred)))

        train_acc = 100. * np.sum(class_correct) / np.sum(class_total)
        train_accs.append(train_acc)
        print(f'Train accuracy: {train_acc:.1f}')

        discrim_model.eval()

        # Fresh counters: validation accuracy must not include training hits.
        class_correct = [0.] * n_targets
        class_total = [0.] * n_targets
        valid_loss = 0.
        n_valid_seen = 0
        with torch.no_grad():
            for x_batch, y_batch in valid_minibatches:
                inputs = torch.from_numpy(x_batch)
                targets = torch.from_numpy(y_batch)
                n_batch = inputs.shape[0]

                yhat = discrim_model(inputs)
                loss = loss_fn(yhat, targets)

                valid_loss += loss.item() * n_batch
                n_valid_seen += n_batch

                # compare predictions to true label
                _, pred = torch.max(yhat, 1)
                labels = torch.argmax(targets, dim=1)
                _tally_class_hits(class_correct, class_total, labels,
                                  pred.eq(labels.view_as(pred)))

            valid_acc = 100. * np.sum(class_correct) / np.sum(class_total)
            print(f'Valid accuracy: {valid_acc:.1f}')
            valid_accs.append(valid_acc)

        # finished an epoch: turn the weighted sums into per-sample means
        train_loss = train_loss / max(n_train_seen, 1)
        valid_loss = valid_loss / max(n_valid_seen, 1)

        train_losses.append(train_loss)
        valid_losses.append(valid_loss)

        print_msg = (f'[{epoch+1:>{epoch_len}}/{n_epochs:>{epoch_len}}] ' +
                     f'train_loss: {train_loss:.5f} ' +
                     f'valid_loss: {valid_loss:.5f}')

        print(print_msg)

        epoch_time = time.time() - epoch_start_time
        epoch_times.append(epoch_time)

        # early_stopping is given the validation loss and the model each epoch.
        # NOTE(review): presumably it checkpoints the model to 'checkpoint.pt'
        # when the loss has decreased - confirm against EarlyStopping.
        early_stopping(valid_loss, discrim_model)

        if early_stopping.early_stop:
            print("Early stopping")
            break

    # load the last checkpoint with the best model
    discrim_model.load_state_dict(torch.load('checkpoint.pt'))

    # test
    # initialize accumulators to monitor test loss and accuracy
    test_loss = 0.
    n_test_seen = 0
    class_correct = [0.] * n_targets
    class_total = [0.] * n_targets

    discrim_model.eval()

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for x_batch, y_batch in test_minibatches:
            inputs = torch.from_numpy(x_batch)
            targets = torch.from_numpy(y_batch)
            n_batch = inputs.shape[0]

            yhat = discrim_model(inputs)
            loss = loss_fn(yhat, targets)

            test_loss += loss.item() * n_batch
            n_test_seen += n_batch

            # convert network outputs to predicted class
            _, pred = torch.max(yhat, 1)
            # compare predictions to true label, tallied per class
            labels = torch.argmax(targets, dim=1)
            _tally_class_hits(class_correct, class_total, labels,
                              pred.eq(labels.view_as(pred)))

    # printing results
    test_loss = test_loss / max(n_test_seen, 1)
    print('Test Loss: {:.6f}\t'.format(test_loss))

    # Measured here, so the reported time also covers the test pass above.
    train_time = time.time() - start_training
    test_acc = 100. * np.sum(class_correct) / np.sum(class_total)
    print('Test Accuracy (Overall): %2f%% (%2d/%2d)\t' % (
        test_acc, np.sum(class_correct), np.sum(class_total)))

    torch.save(discrim_model, 'discrim_model.pt')
    print('Saved the model: ', 'discrim_model.pt')
    return [train_losses, valid_losses, train_accs, valid_accs, test_acc, epoch_times, train_time]
def execute(dataset,
            n_hidden_u,
            n_hidden_t_enc,
            n_hidden_t_dec,
            n_hidden_s,
            embedding_source=histo_GenotypicFrequency_perclass,
            additional_unsup_input=None,
            num_epochs=500,
            learning_rate=.001,
            learning_rate_annealing=1.0,
            alpha=1,
            beta=1,
            delta=1,
            gamma=1,
            lmd=.0001,
            disc_nonlinearity="sigmoid",
            encoder_net_init=0.2,
            decoder_net_init=0.2,
            optimizer="rmsprop",
            max_patience=100,
            batchnorm=0,
            input_dropout=1.0,
            embedding_noise=0.0,
            keep_labels=1.0,
            prec_recall_cutoff=True,
            missing_labels_val=-1.0,
            which_fold=0,
            early_stop_criterion='loss_sup_det',
            input_decoder_mode="regression",
            save_path='/Users/Marie-Elyse/Downloads/embedding2',
            save_copy='/Users/Marie-Elyse/Downloads/embedding2',
            dataset_path='/Users/Marie-Elyse/Downloads/embedding2',
            resume=False,
            exp_name='',
            random_proj=0,
            bootstrap_snp_embeddings=0,
            bootstrap_cutoff=0.9):

    # Prepare embedding information :
    # - If no embedding is specified, use the transposed input matrix
    # - If a file is specified, use its content as feature embeddings
    # - Else (an embedding category like 'histo3x26' is provided), load a
    #   pregenerated embedding of the specified category
    if embedding_source is None or embedding_source == "raw":
        embedding_source = None
        embedding_input = 'raw'
    elif os.path.exists(embedding_source):
        embedding_input = embedding_source
    else:
        embedding_input = embedding_source
        embedding_source = os.path.join(
            dataset_path, embedding_input + '_fold' + str(which_fold) + '.npy')

    # Load the dataset
    print("Loading data")
    (x_train, y_train, exmpl_ids_train, x_valid, y_valid, exmpl_ids_valid,
     x_test, y_test, exmpl_ids_test, x_unsup, training_labels, feature_names,
     label_names) = mlh.load_data(dataset,
                                  dataset_path,
                                  embedding_source,
                                  which_fold=which_fold,
                                  keep_labels=keep_labels,
                                  missing_labels_val=missing_labels_val,
                                  embedding_input=embedding_input,
                                  norm=False)

    # Load the additional unsupervised data, if some is specified
    if additional_unsup_input is not None:
        print("Adding additional data to the model's unsupervised inputs")
        paths = additional_unsup_input.split(";")
        additional_unsup_data = [np.load(p) for p in paths]
        print(x_unsup.shape)
        x_unsup = np.hstack(additional_unsup_data + [x_unsup])
        print(x_unsup.shape)

    if x_unsup is not None:
        n_samples_unsup = x_unsup.shape[1]
    else:
        n_samples_unsup = 0

    original_x_train = x_train.copy()
    original_x_valid = x_valid.copy()
    original_x_test = x_test.copy()

    # Change how the missing data values are encoded. Right now they are
    # encoded as being the mean of the corresponding feature so that, after
    # feature normalization, they will be 0s. However, this prevents us from
    # transfering the minibatch data as int8 so we replace those values with -1s.
    for i in range(x_train.shape[1]):
        feature_mean = x_train[:, i].mean()
        x_train[:, i] = mh.replace_arr_value(x_train[:, i], feature_mean, -1)
        x_valid[:, i] = mh.replace_arr_value(x_valid[:, i], feature_mean, -1)
        x_test[:, i] = mh.replace_arr_value(x_test[:, i], feature_mean, -1)
    x_train = x_train.astype("int8")
    x_valid = x_valid.astype("int8")
    x_test = x_test.astype("int8")

    # Normalize the input data. The mlh.load_data() function already offers
    # this feature but we need to do it here so that we will have access to
    # both the normalized and unnormalized input data.
    norm_mus = original_x_train.mean(axis=0)
    norm_sigmas = original_x_train.std(axis=0) + 1e-6

    #x_train = (x_train - norm_mus[None, :]) / norm_sigmas[None, :]
    #x_valid = (x_valid - norm_mus[None, :]) / norm_sigmas[None, :]
    #x_test = (x_test - norm_mus[None, :]) / norm_sigmas[None, :]

    #x_train *= (315345. / 553107)
    #x_valid *= (315345. / 553107)
    #x_test *= (315345. / 553107)

    # Set up variables to build the right type of decoder based on the value
    # of `input_decoder_mode`
    assert input_decoder_mode in ["regression", "classification"]
    if input_decoder_mode == "regression":
        # The size of the input reconstruction will be the same as the number
        # of inputs
        decoder_encoder_unit_ratio = 1
    elif input_decoder_mode == "classification":
        # The size of the input reconstruction will be N times larger than
        # the number of inputs, where N is the number of distinct discrete
        # values that each input can take. For SNP input data with an additive
        # coding scheme, N=3 because the 3 possible values are: {0, 1, 2}.
        nb_discrete_vals_by_input = int(original_x_train.max() + 1)
        decoder_encoder_unit_ratio = nb_discrete_vals_by_input

        # Print baseline accuracy for the imputation of genes
        print("Distribution of input values in valid: %f %f %f" %
              ((original_x_train == 0).mean(), (original_x_train == 1).mean(),
               (original_x_train == 2).mean()))
        print("Distribution of input values in test: %f %f %f" %
              ((original_x_test == 0).mean(), (original_x_test == 1).mean(),
               (original_x_test == 2).mean()))

    # Extract required information from data
    n_samples, n_feats = x_train.shape
    print("Number of features : ", n_feats)
    print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1]))
    n_targets = y_train.shape[1] if y_train.ndim == 2 else y_train.max() + 1

    # Set some variables
    batch_size = 138
    beta = gamma if (gamma == 0) else beta

    # Generate a name for the experiment based on the hyperparameters used
    if embedding_source is None:
        embedding_name = embedding_input
    else:
        embedding_name = embedding_source.replace("_", "").split(".")[0]
        exp_name += embedding_name.rsplit('/', 1)[::-1][0] + '_'

    exp_name += mlh.define_exp_name(
        keep_labels, alpha, beta, gamma, lmd, n_hidden_u, n_hidden_t_enc,
        n_hidden_t_dec, n_hidden_s, which_fold, learning_rate,
        decoder_net_init, encoder_net_init, batchnorm, input_dropout,
        embedding_noise, early_stop_criterion, learning_rate_annealing,
        input_decoder_mode)
    print("Experiment: " + exp_name)

    # Ensure that the folders where the results of the experiment will be
    # saved do exist. Create them if they don't.
    save_path = os.path.join(save_path, dataset, exp_name)
    save_copy = os.path.join(save_copy, dataset, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(save_copy):
        os.makedirs(save_copy)

    # Prepare Theano variables for inputs and targets
    input_var_sup = T.bmatrix('input_sup')
    input_var_unsup = theano.shared(x_unsup, 'input_unsup')  # x_unsup TBD
    target_var_sup = T.matrix('target_sup')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')

    # Use the provided mus and sigmas to process the missing values and
    # normalize the inputs
    b_input_var_sup = input_var_sup.astype("float32")
    normed_input_sup = (T.eq(b_input_var_sup, -1) * norm_mus +
                        T.neq(b_input_var_sup, -1) * b_input_var_sup)
    normed_input_sup = (normed_input_sup - norm_mus) / norm_sigmas

    reconst_target_sup = T.cast(input_var_sup, "int32")

    # Build model
    print("Building model")

    # Some checkings
    # assert len(n_hidden_u) > 0
    assert len(n_hidden_t_enc) > 0
    assert len(n_hidden_t_dec) > 0
    assert n_hidden_t_dec[-1] == n_hidden_t_enc[-1]

    # Build feature embedding networks (encoding and decoding if gamma > 0)
    nets, embeddings, pred_feat_emb = mh.build_feat_emb_nets(
        embedding_source, n_feats, n_samples_unsup, input_var_unsup,
        n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, gamma, encoder_net_init,
        decoder_net_init, save_path, random_proj, decoder_encoder_unit_ratio,
        embedding_noise)

    # Build feature embedding reconstruction networks (if alpha > 0, beta > 0)
    nets += mh.build_feat_emb_reconst_nets(
        [alpha, beta], n_samples_unsup, n_hidden_u,
        [n_hidden_t_enc, n_hidden_t_dec], nets,
        [encoder_net_init, decoder_net_init])

    # Supervised network
    discrim_net, hidden_rep = mh.build_discrim_net(
        batch_size, n_feats, normed_input_sup, n_hidden_t_enc, n_hidden_s,
        embeddings[0], disc_nonlinearity, n_targets, batchnorm, input_dropout)

    # Reconstruct network
    nets += [
        mh.build_reconst_net(hidden_rep,
                             embeddings[1] if len(embeddings) > 1 else None,
                             n_feats * decoder_encoder_unit_ratio, gamma,
                             decoder_encoder_unit_ratio)
    ]

    # Load weights if we are resuming job
    if resume:
        # Load best model
        with np.load(os.path.join(save_copy, 'dietnet_best.npz')) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        nlayers = len(
            lasagne.layers.get_all_params(filter(None, nets) + [discrim_net]))
        #lasagne.layers.set_all_param_values(filter(None, nets) +
        #                                    [discrim_net],
        #                                    param_values[:nlayers])

        params = lasagne.layers.get_all_params(
            filter(None, nets) + [discrim_net])
        for p, v in zip(params, param_values[:nlayers]):
            # Do not overwrite embedding value with old embedding. Removing
            # the following condition will prevent a trained model from being
            # tested on a different dataset
            if p.name != "feat_emb":
                p.set_value(v)

    print("Building and compiling training functions")

    # Build and compile training functions
    predictions, predictions_det = mh.define_predictions(nets, start=2)
    prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net])
    prediction_sup = prediction_sup[0]
    prediction_sup_det = prediction_sup_det[0]

    # Define losses
    # reconstruction losses
    if input_decoder_mode == "regression":
        reconst_losses, reconst_losses_det = mh.define_reconst_losses(
            predictions, predictions_det,
            [input_var_unsup, input_var_unsup, normed_input_sup])
    elif input_decoder_mode == "classification":
        # Obtain regular reconstruction losses for every reconstruction
        # but the reconstruction of the supervised input data
        reconst_losses1, reconst_losses_det1 = mh.define_reconst_losses(
            predictions[:-1], predictions_det[:-1],
            [input_var_unsup, input_var_unsup])

        # Obtain a "classification" reconstruction loss for the reconstruction
        # of the supervised input data. This classification loss will be
        # performed on the input data without normalization
        reconst_losses2, reconst_losses_det2 = mh.define_classif_reconst_losses(
            predictions[-1:], predictions_det[-1:], [reconst_target_sup],
            [decoder_encoder_unit_ratio])

        reconst_losses = reconst_losses1 + reconst_losses2
        reconst_losses_det = reconst_losses_det1 + reconst_losses_det2

    # supervised loss
    sup_loss, sup_loss_det = mh.define_sup_loss(disc_nonlinearity,
                                                prediction_sup,
                                                prediction_sup_det,
                                                keep_labels, target_var_sup,
                                                missing_labels_val)

    # Define inputs
    inputs = [input_var_sup, target_var_sup]

    # Define parameters
    params = lasagne.layers.get_all_params([discrim_net] + filter(None, nets),
                                           trainable=True,
                                           unwrap_shared=False)
    params_to_freeze= \
        lasagne.layers.get_all_params(filter(None, nets), trainable=False,
                                      unwrap_shared=False)

    # Remove unshared variables from params and params_to_freeze
    params = [
        p for p in params
        if isinstance(p, theano.compile.sharedvalue.SharedVariable)
    ]
    params_to_freeze = [
        p for p in params_to_freeze
        if isinstance(p, theano.compile.sharedvalue.SharedVariable)
    ]
    print("Params : ", params)

    feat_emb_var = next(p for p in lasagne.layers.get_all_params([discrim_net])
                        if p.name == 'input_unsup' or p.name == 'feat_emb')
    # feat_emb_var = lasagne.layers.get_all_params([discrim_net])[0]
    print(feat_emb_var)
    feat_emb_val = feat_emb_var.get_value()
    feat_emb_norms = (feat_emb_val**2).sum(0)**0.5
    feat_emb_var.set_value(feat_emb_val / feat_emb_norms)

    print('Number of params discrim: ' + str(len(params)))
    print('Number of params to freeze: ' + str(len(params_to_freeze)))

    for p in params_to_freeze:
        new_params = [el for el in params if el != p]
        params = new_params

    print('Number of params to update: ' + str(len(params)))

    # Combine losses
    loss = delta*sup_loss + alpha*reconst_losses[0] + beta*reconst_losses[1] + \
        gamma*reconst_losses[2]
    loss_det = delta*sup_loss_det + alpha*reconst_losses_det[0] + \
        beta*reconst_losses_det[1] + gamma*reconst_losses_det[2]

    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty
    loss_det = loss_det + lmd * l2_penalty

    # Compute network updates
    assert optimizer in ["rmsprop", "adam", "amsgrad"]
    if optimizer == "rmsprop":
        updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr)
    elif optimizer == "adam":
        updates = lasagne.updates.adam(loss, params, learning_rate=lr)
    elif optimizer == "amsgrad":
        updates = lasagne.updates.amsgrad(loss, params, learning_rate=lr)
    #updates = lasagne.updates.sgd(loss,
    #                              params,
    #                              learning_rate=lr)
    # updates = lasagne.updates.momentum(loss, params,
    #                                    learning_rate=lr, momentum=0.0)

    # Apply norm constraints on the weights
    for k in updates.keys():
        if updates[k].ndim == 2:
            updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

    # Compile training function
    train_fn = theano.function(inputs,
                               loss,
                               updates=updates,
                               on_unused_input='ignore')

    # Monitoring Labels
    monitor_labels = [
        "reconst. feat. W_enc", "reconst. feat. W_dec", "reconst. loss"
    ]
    monitor_labels = [
        i for i, j in zip(monitor_labels, reconst_losses) if j != 0
    ]
    monitor_labels += ["feat. W_enc. mean", "feat. W_enc var"]
    monitor_labels += ["feat. W_dec. mean", "feat. W_dec var"] if \
        (embeddings[1] is not None) else []
    monitor_labels += ["loss. sup.", "total loss"]

    # Build and compile test function
    val_outputs = reconst_losses_det
    val_outputs = [i for i, j in zip(val_outputs, reconst_losses) if j != 0]
    val_outputs += [embeddings[0].mean(), embeddings[0].var()]
    val_outputs += [embeddings[1].mean(), embeddings[1].var()] if \
        (embeddings[1] is not None) else []
    val_outputs += [sup_loss_det, loss_det]

    # Compute supervised accuracy and add it to monitoring list
    test_acc, test_pred = mh.define_test_functions(disc_nonlinearity,
                                                   prediction_sup,
                                                   prediction_sup_det,
                                                   target_var_sup)
    monitor_labels.append("accuracy")
    val_outputs.append(test_acc)

    # If appropriate, compute the input reconstruction accuracy and add it to
    # the monitoring list
    if input_decoder_mode == "classification":
        input_reconst_acc = mh.define_classif_reconst_acc(
            predictions_det[-1], reconst_target_sup,
            decoder_encoder_unit_ratio)
        #import pdb; pdb.set_trace()
        monitor_labels.append("input_reconst_acc")
        val_outputs.append(input_reconst_acc)

    # Compile prediction function
    predict = theano.function([input_var_sup], test_pred)
    predict_from_normed_inps = theano.function([normed_input_sup], test_pred)

    predict_scores = theano.function([input_var_sup], prediction_sup_det)
    predict_scores_from_normed_inps = theano.function([input_var_sup],
                                                      prediction_sup_det)

    # Compile validation function
    val_fn = theano.function(inputs, [prediction_sup_det] + val_outputs,
                             on_unused_input='ignore')

    # Finally, launch the training loop.
    print("Starting training...")

    # Some variables
    patience = 0

    train_monitored = []
    valid_monitored = []
    train_loss = []

    # Pre-training monitoring
    print("Epoch 0 of {}".format(num_epochs))

    train_minibatches = mlh.iterate_minibatches(x_train,
                                                y_train,
                                                batch_size,
                                                shuffle=False)
    train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                               monitor_labels, prec_recall_cutoff)

    valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                y_valid,
                                                batch_size,
                                                shuffle=False)
    valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                               monitor_labels, prec_recall_cutoff)

    # Before starting training, save a copy of the model in case
    np.savez(
        os.path.join(save_path, 'dietnet_best.npz'),
        *lasagne.layers.get_all_param_values(
            filter(None, nets) + [discrim_net]))

    # Training loop
    start_training = time.time()
    for epoch in range(num_epochs):
        start_time = time.time()
        print("Epoch {} of {}".format(epoch + 1, num_epochs))
        nb_minibatches = 0
        loss_epoch = 0

        # Train pass
        for batch in mlh.iterate_minibatches(x_train,
                                             training_labels,
                                             batch_size,
                                             shuffle=True):
            loss_epoch += train_fn(*batch)
            nb_minibatches += 1

        loss_epoch /= nb_minibatches
        train_loss += [loss_epoch]

        # Monitoring on the training set
        train_minibatches = mlh.iterate_minibatches(x_train,
                                                    y_train,
                                                    batch_size,
                                                    shuffle=False)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, prec_recall_cutoff)
        train_monitored += [train_err]

        # Monitoring on the validation set
        valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                    y_valid,
                                                    batch_size,
                                                    shuffle=False)

        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, prec_recall_cutoff)
        valid_monitored += [valid_err]

        try:
            early_stop_val = valid_err[monitor_labels.index(
                early_stop_criterion)]
        except:
            raise ValueError("There is no monitored value by the name of %s" %
                             early_stop_criterion)

        valid_loss_sup_hist = [
            v[monitor_labels.index("loss. sup.")] for v in valid_monitored
        ]
        valid_loss_sup = valid_loss_sup_hist[-1]

        # Early stopping
        if epoch == 0:
            best_valid = early_stop_val
        elif ((early_stop_val > best_valid
               and early_stop_criterion == 'input_reconst_acc')
              or (early_stop_val > best_valid
                  and early_stop_criterion == 'accuracy')
              or (early_stop_val >= best_valid
                  and early_stop_criterion == 'accuracy'
                  and valid_loss_sup == min(valid_loss_sup_hist))
              or (early_stop_val < best_valid
                  and early_stop_criterion == 'loss. sup.')):
            best_valid = early_stop_val
            patience = 0

            # Save stuff
            np.savez(
                os.path.join(save_path, 'dietnet_best.npz'),
                *lasagne.layers.get_all_param_values(
                    filter(None, nets) + [discrim_net]))
            np.savez(save_path + "/errors_supervised_best.npz",
                     zip(*train_monitored), zip(*valid_monitored))

            # Monitor on the test set now because sometimes the saving doesn't
            # go well and there isn't a model to load at the end of training
            if y_test is not None:
                test_minibatches = mlh.iterate_minibatches(x_test,
                                                           y_test,
                                                           138,
                                                           shuffle=False)

                test_err = mlh.monitoring(test_minibatches, "test", val_fn,
                                          monitor_labels, prec_recall_cutoff)
        else:
            patience += 1
            # Save stuff
            np.savez(
                os.path.join(save_path, 'dietnet_last.npz'),
                *lasagne.layers.get_all_param_values(
                    filter(None, nets) + [discrim_net]))
            np.savez(save_path + "/errors_supervised_last.npz",
                     zip(*train_monitored), zip(*valid_monitored))

        print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time))

        # End training if needed
        if patience == max_patience or epoch == num_epochs - 1:
            break

        # Anneal the learning rate
        lr.set_value(
            np.array(lr.get_value() * learning_rate_annealing,
                     dtype="float32"))

    # End training with a final monitoring step on the best model
    print("Ending training")

    # Load best model
    with np.load(os.path.join(save_path, 'dietnet_best.npz')) as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        nlayers = len(
            lasagne.layers.get_all_params(filter(None, nets) + [discrim_net]))

        #lasagne.layers.set_all_param_values(filter(None, nets) +
        #                                    [discrim_net],
        #                                    param_values[:nlayers])
        params = lasagne.layers.get_all_params(
            filter(None, nets) + [discrim_net])
        for p, v in zip(params, param_values[:nlayers]):
            # Do not overwrite embedding value with old embedding. Removing
            # the following condition will prevent a trained model from being
            # tested on a different dataset
            if p.name != "feat_emb":
                p.set_value(v)

        if embedding_source is None:
            # Save embedding
            pred = pred_feat_emb()
            np.savez(os.path.join(save_path, 'feature_embedding.npz'), pred)

        # Training set results
        train_minibatches = mlh.iterate_minibatches(x_train,
                                                    y_train,
                                                    batch_size,
                                                    shuffle=False)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, prec_recall_cutoff)

        # Validation set results
        valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                    y_valid,
                                                    batch_size,
                                                    shuffle=False)
        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, prec_recall_cutoff)

        # Test set results
        if y_test is not None:
            test_minibatches = mlh.iterate_minibatches(x_test,
                                                       y_test,
                                                       138,
                                                       shuffle=False)

            test_err = mlh.monitoring(test_minibatches, "test", val_fn,
                                      monitor_labels, prec_recall_cutoff)

            # Test the model's accuracy with varying levels of provided SNPs
            test_minibatches = mlh.iterate_minibatches(x_test,
                                                       y_test,
                                                       138,
                                                       shuffle=False)
            mlh.eval_prediction(test_minibatches,
                                "test (rescaled)",
                                predict_from_normed_inps,
                                norm_mus,
                                norm_sigmas,
                                nb_evals=1,
                                rescale_inputs=True)

        # Save the model's test predictions to file
        print(x_test.shape)
        test_predictions = []
        for minibatch in mlh.iterate_testbatches(x_test, 1, shuffle=False):
            test_predictions += [predict(minibatch)]
        print(len(test_predictions))
        print(sum([t.shape[0] for t in test_predictions]))
        np.savez(os.path.join(save_path, 'test_predictions.npz'),
                 test_predictions)

        # Get the scores assigned by the model to each class for each test sample
        test_scores = []
        for minibatch in mlh.iterate_testbatches(x_test, 1, shuffle=False):
            test_scores += [predict_scores(minibatch)]
        np.savez(os.path.join(save_path, 'test_scores.npz'), test_scores)

        # Generate new SNP embeddings using test examples labeled according
        # to the model's predictions
        if bootstrap_snp_embeddings:

            if bootstrap_cutoff == "soft":
                bootstrap_snp_data = np.hstack(
                    (x_train.transpose(), x_valid.transpose(),
                     x_test.transpose()))
                bootstrap_labels = np.vstack(
                    (y_train, y_valid, np.array(test_scores)[:, 0, :]))

                filename_genotypic = 'bootstrap_gen_snp_embeddings_softlabels.npy'
                filename_allelic = 'bootstrap_all_snp_embeddings_softlabels.npy'

            else:  # Hard cutoff
                sure_test_idxs = np.argwhere(
                    (np.array(test_scores)[:, 0, :] >
                     bootstrap_cutoff).sum(1)).flatten()
                sure_test_inputs = x_test[sure_test_idxs]
                sure_test_preds = np.array(test_scores)[sure_test_idxs,
                                                        0].argmax(1)

                bootstrap_snp_data = np.hstack(
                    (x_train.transpose(), x_valid.transpose(),
                     sure_test_inputs.transpose()))
                bootstrap_labels = np.hstack(
                    (y_train.argmax(1), y_valid.argmax(1), sure_test_preds))

                filename_genotypic = 'bootstrap_gen_snp_embeddings_cutoff%f.npy' % bootstrap_cutoff
                filename_allelic = 'bootstrap_all_snp_embeddings_cutoff%f.npy' % bootstrap_cutoff

            utils_helpers.generate_snp_hist(
                bootstrap_snp_data,
                bootstrap_labels,
                label_names=label_names,
                perclass=True,
                sum_to_one=True,
                filename_genotypic=os.path.join(save_path, filename_genotypic),
                filename_allelic=os.path.join(save_path, filename_allelic))

    # Print all final errors for train, validation and test
    print("Training time:\t\t\t{:.3f}s".format(time.time() - start_training))

    # Analyse the model gradients to determine the influence of each SNP on
    # each of the model's prediction
    print(label_names)
    class_idx = T.iscalar("class index")
    grad_fn = theano.function([input_var_sup, class_idx],
                              T.grad(prediction_sup_det[:, class_idx].mean(),
                                     input_var_sup).mean(0))
    grads_wrt_inputs = mlh.get_grads_wrt_inputs(x_test, grad_fn, feature_names,
                                                label_names)

    # Obtain function that takes as inputs normed inputs and returns the
    # gradient of a class score wrt the normed inputs themselves (this is
    # requird because computing the integrated gradients requires to be able
    # to interpolate between an example where all features are missing and an
    # example where any number of features are provided)
    grad_from_normed_fn = theano.function(
        [normed_input_sup, class_idx],
        T.grad(prediction_sup_det[:, class_idx].sum(),
               normed_input_sup).mean(0))

    # Collect integrated gradients over the whole test set. Obtain, for each
    # SNP, for each possible value (0, 1 or 2), the average contribution of that
    # value for what SNP to the score of each class.
    avg_int_grads = np.zeros((x_test.shape[1], 3, len(label_names)),
                             dtype="float32")
    counts_int_grads = np.zeros((x_test.shape[1], 3), dtype="int32")
    for test_idx in range(x_test.shape[0]):
        int_grads = mlh.get_integrated_gradients(x_test[test_idx],
                                                 grad_from_normed_fn,
                                                 feature_names,
                                                 label_names,
                                                 norm_mus,
                                                 norm_sigmas,
                                                 m=100)

        snp_value_mask = np.arange(3) == x_test[test_idx][:, None]
        avg_int_grads += snp_value_mask[:, :,
                                        None] * int_grads.transpose()[:,
                                                                      None, :]
        counts_int_grads += snp_value_mask
    avg_int_grads = avg_int_grads / counts_int_grads[:, :, None]

    # Save all the additional information required for model analysis :
    # - Test predictions
    # - SNP IDs
    # - Subject IDs
    # - Normalization parameters for the input minibatches
    np.savez(os.path.join(save_path, 'additional_data.npz'),
             test_labels=y_test,
             test_scores=np.array(test_scores)[:, 0],
             test_predictions=np.array(test_predictions)[:, 0],
             norm_mus=norm_mus,
             norm_sigmas=norm_sigmas,
             grads_wrt_inputs=grads_wrt_inputs,
             exmpl_ids_train=exmpl_ids_train,
             exmpl_ids_valid=exmpl_ids_valid,
             exmpl_ids_test=exmpl_ids_test,
             feature_names=feature_names,
             label_names=label_names,
             avg_int_grads=avg_int_grads)

    # Copy files to loadpath (only if some training has beeen done so there
    # is a local saved version)
    if save_path != save_copy and num_epochs > 0:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)
Beispiel #3
0
def execute(dataset, learning_rate=0.00001, learning_rate_annealing=1.0,
            lmd=0., noise=0.0, encoder_units=[1024, 512, 256],
            num_epochs=500, which_fold=1,
            save_path=None, save_copy=None, dataset_path=None,
            num_fully_connected=0, exp_name='', init_args=None):
    """Train an autoencoder on the 'bin' data and export feature embeddings.

    The encoder is a stack of leaky-rectify dense layers; the decoder mirrors
    it and ends in a sigmoid layer trained with binary cross-entropy plus an
    L2 penalty. Training uses Adam with a norm constraint on 2-D updates and
    early stopping on the monitored validation 'loss'. When training ends,
    the best saved parameters are reloaded (if any) and, for every input
    feature, the embedding is computed as the difference between the encoder
    output with only that feature set to 1 and the encoder output of an
    all-zero input.

    Args:
        dataset: Dataset identifier forwarded to ``mlh.load_data``.
        learning_rate: Initial Adam learning rate.
        learning_rate_annealing: Factor the learning rate is multiplied by
            after every epoch that does not end training.
        lmd: Weight of the L2 penalty added to the reconstruction loss.
        noise: Forwarded to ``data_generator`` for every minibatch stream.
        encoder_units: Number of units of each encoder layer; the decoder
            uses the same sizes reversed, minus the innermost one. The list
            is only read, never mutated.
        num_epochs: Maximum number of training epochs.
        which_fold: Fold forwarded to ``mlh.load_data``; also part of the
            experiment name and of the embedding file name.
        save_path: Base directory for models/errors; the experiment name is
            appended. Must be provided (``os.path.join`` rejects ``None``).
        save_copy: Base directory the results are copied to at the end; the
            experiment name is appended. Must be provided.
        dataset_path: Forwarded to ``mlh.load_data``.
        num_fully_connected: Unused; kept for interface compatibility.
        exp_name: Prefix of the generated experiment name.
        init_args: Mapping whose ``"decoder_init"`` entry is handed to
            ``convert_initialization`` for the last decoder layer.

    Raises:
        TypeError: If ``init_args`` is None.
    """
    # Fail before the (potentially slow) data loading rather than in the
    # middle of network construction.
    if init_args is None:
        raise TypeError("init_args must be a mapping with a 'decoder_init' "
                        "entry, got None")

    # Reading dataset
    print("Loading data")
    x_unsup = mlh.load_data(dataset, dataset_path, None,
                            which_fold=which_fold, keep_labels=1.0,
                            missing_labels_val=-1.0,
                            embedding_input='bin', transpose=True)
    x_train = x_unsup[0][0]
    x_valid = x_unsup[1][0]

    print(x_train.shape, x_valid.shape)

    n_features = x_train.shape[1]

    # Build the experiment name from the hyper-parameters
    exp_name += "learn_snp2vec_dae_h"
    for n_units in encoder_units:
        exp_name += ('-' + str(n_units))
    exp_name += '_l-' + str(lmd)
    exp_name += '_lr-' + str(learning_rate)
    exp_name += '_fold-' + str(which_fold)

    save_path = os.path.join(save_path, exp_name)
    save_copy = os.path.join(save_copy, exp_name)
    for directory in (save_path, save_copy):
        if not os.path.exists(directory):
            os.makedirs(directory)

    best_model_file = save_path + '/model_snp2vec_best.npz'

    # Prepare Theano variables for inputs and targets
    input_var = T.matrix('input')
    target_reconst = T.matrix('target')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')
    batch_size = 128

    # Encoder
    encoder = InputLayer((batch_size, n_features), input_var)
    for n_units in encoder_units:
        encoder = DenseLayer(encoder,
                             num_units=n_units,
                             W=Uniform(0.00001),
                             nonlinearity=leaky_rectify)

    embedding = lasagne.layers.get_output(encoder)
    get_embedding_fn = theano.function([input_var], embedding)

    params = lasagne.layers.get_all_params(encoder, trainable=True)
    monitor_labels = ["embedding min", "embedding mean", "embedding max"]
    val_outputs = [embedding.min(), embedding.mean(), embedding.max()]
    nets = [encoder]

    # Decoder: mirrored hidden layers followed by a sigmoid output layer
    decoder_units = encoder_units[::-1][1:]
    print(decoder_units)
    decoder = encoder
    for n_units in decoder_units:
        decoder = DenseLayer(decoder,
                             num_units=n_units,
                             W=Uniform(0.0001),
                             nonlinearity=leaky_rectify)
    decoder = DenseLayer(decoder,
                         num_units=n_features,
                         W=convert_initialization(
                                init_args["decoder_init"],
                                nonlinearity="sigmoid"),
                         nonlinearity=sigmoid)
    prediction_reconst = lasagne.layers.get_output(decoder)

    # Reconstruction error and thresholded reconstruction accuracy
    loss_reconst = lasagne.objectives.binary_crossentropy(
        prediction_reconst, target_reconst).mean()
    accuracy = T.eq(T.gt(prediction_reconst, 0.5), target_reconst).mean()

    # NOTE(review): the decoder is stacked on the encoder, so this presumably
    # lists the encoder parameters a second time (and the L2 penalty counts
    # them twice). Kept as-is because changing it alters training results.
    params += lasagne.layers.get_all_params(decoder, trainable=True)
    monitor_labels += ["reconst. loss", "reconst. accuracy"]
    val_outputs += [loss_reconst, accuracy]
    nets += [decoder]

    # Total loss: reconstruction + weight decay
    l2_penalty = apply_penalty(params, l2)
    loss = loss_reconst + lmd * l2_penalty

    val_outputs += [loss]
    monitor_labels += ['loss']

    # Early-stopping state
    max_patience = 100
    patience = 0
    best_valid = None
    early_stop_criterion = 'loss'

    train_monitored = []
    valid_monitored = []
    train_loss = []

    updates = lasagne.updates.adam(loss,
                                   params,
                                   learning_rate=lr)

    # Constrain the norm of every 2-D (weight matrix) update
    for k in updates.keys():
        if updates[k].ndim == 2:
            updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

    inputs = [input_var, target_reconst]

    # Compile training function
    print("Compiling training function")
    train_fn = theano.function(inputs, loss, updates=updates,
                               on_unused_input='ignore')
    val_fn = theano.function(inputs,
                             [val_outputs[0]] + val_outputs,
                             on_unused_input='ignore')

    start_training = time.time()

    print("Starting training")
    for epoch in range(num_epochs):
        start_time = time.time()
        print("Epoch {} of {}".format(epoch+1, num_epochs))
        nb_minibatches = 0
        loss_epoch = 0

        for x, target_reconst_val in data_generator(x_train, batch_size,
                                                    shuffle=True, noise=noise):
            loss_epoch += train_fn(x, target_reconst_val)
            nb_minibatches += 1

        # Guard against an empty generator to avoid a division by zero
        loss_epoch /= max(nb_minibatches, 1)
        train_loss += [loss_epoch]

        # Monitoring on the training set
        train_minibatches = data_generator(x_train, batch_size, noise=noise)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, 0)
        train_monitored += [train_err]

        # Monitoring on the validation set
        valid_minibatches = data_generator(x_valid, batch_size, noise=noise)
        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, 0)
        valid_monitored += [valid_err]

        early_stop_val = valid_err[monitor_labels.index(early_stop_criterion)]

        # Early stopping. The first epoch counts as an improvement so that a
        # "best" checkpoint exists even if validation never improves later.
        if epoch == 0 or early_stop_val < best_valid:
            best_valid = early_stop_val
            patience = 0

            np.savez(best_model_file,
                     *lasagne.layers.get_all_param_values(nets))
            # list(...) keeps the saved arrays meaningful on Python 3, where
            # zip returns an iterator
            np.savez(save_path + "/errors_snp2vec_best.npz",
                     list(zip(*train_monitored)),
                     list(zip(*valid_monitored)))
        else:
            patience += 1
            np.savez(os.path.join(save_path, 'model_snp2vec_last.npz'),
                     *lasagne.layers.get_all_param_values(nets))
            np.savez(save_path + "/errors_snp2vec_last.npz",
                     list(zip(*train_monitored)),
                     list(zip(*valid_monitored)))

        # End training
        if (patience == max_patience) or (epoch == num_epochs-1):
            print("Ending training")
            # Load best model
            if not os.path.exists(best_model_file):
                print("No saved model to be tested and/or generate"
                      " the embedding !")
            else:
                with np.load(best_model_file) as f:
                    param_values = [f['arr_%d' % i]
                                    for i in range(len(f.files))]
                    lasagne.layers.set_all_param_values(nets, param_values)

            # Use the current model to generate the feature embedding. The
            # embedding of a feature is the difference in the hidden
            # representation between having that feature on and having
            # every feature off.
            print("Generating embedding")
            embedding_size = encoder_units[-1]
            null_input = np.zeros((1, n_features), dtype="float32")
            null_embedding = get_embedding_fn(null_input)[0]

            all_embeddings = np.zeros((n_features,
                                       embedding_size), dtype="float32")

            # Process features in blocks: row j of the batch has only
            # feature (start + j) switched on.
            block_size = 10
            single_feat_batch = np.zeros((block_size, n_features),
                                         dtype="float32")
            for start in range(0, n_features, block_size):
                if start % 10000 == 0:
                    print(start, n_features)

                # The last block may be shorter than block_size
                stop = min(start + block_size, n_features)
                rows = np.arange(stop - start)
                cols = np.arange(start, stop)

                single_feat_batch[rows, cols] = 1
                block_embeddings = get_embedding_fn(single_feat_batch)
                all_embeddings[start:stop] = (
                    block_embeddings[:stop - start] - null_embedding)
                single_feat_batch[rows, cols] = 0

            # NOTE(review): hard-coded, user-specific output location kept
            # unchanged because other scripts may read from it.
            np.save("/Tmp/carriepl/feature_selection/all_embeddings_fold%i_noise%f.npy" % (which_fold, noise),
                    all_embeddings)

            # Training set results
            train_minibatches = data_generator(x_train, batch_size, noise=noise)
            train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                       monitor_labels, 0)

            # Validation set results
            valid_minibatches = data_generator(x_valid, batch_size, noise=noise)
            valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                       monitor_labels, 0)

            # Stop
            print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() -
                                                         start_time))
            break

        print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time))
        # Anneal the learning rate
        lr.set_value(float(lr.get_value() * learning_rate_annealing))

    # Copy files to loadpath
    if save_path != save_copy:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)
Beispiel #4
0
def execute(dataset,
            n_hidden_u,
            n_hidden_t_enc,
            n_hidden_t_dec,
            n_hidden_s,
            learning_rate,
            learning_rate_annealing=.99,
            embedding_source=None,
            alpha=1,
            beta=1,
            gamma=1,
            lmd=0.0,
            encoder_net_init=0.1,
            decoder_net_init=0.1,
            keep_labels=1.0,
            which_fold=0,
            early_stop_criterion='accuracy',
            exp_name='',
            representation='features',
            which_set='test',
            model_path='/Tmp/romerosa/DietNetworks/newmodel/',
            save_path='/Tmp/romerosa/DietNetworks/',
            dataset_path='/Tmp/' + os.environ.get("USER", "") + '/datasets/'):
    """Rebuild a trained model and dump its intermediate representations.

    The networks are rebuilt with the given hyper-parameters, the parameters
    stored in ``dietnets_best.npz`` (under ``model_path/dataset/<exp name>``)
    are loaded into them, and layer outputs are written to
    ``save_path/representation/<embedding input>/fold<which_fold>``.

    Args:
        dataset: Dataset identifier forwarded to ``mlh.load_data``; also a
            component of the model directory.
        n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, n_hidden_s: Layer sizes
            forwarded to the ``mh`` network builders. ``n_hidden_t_enc`` and
            ``n_hidden_t_dec`` must be non-empty and end with the same size.
        learning_rate, learning_rate_annealing, lmd, decoder_net_init,
        keep_labels, early_stop_criterion: Only used to rebuild the
            experiment name through ``mlh.define_exp_name``.
        embedding_source: Name of a precomputed embedding, or None to use
            the raw data ('raw').
        alpha, beta, gamma: Forwarded to the ``mh`` builders and to the
            experiment name; ``beta`` is forced to ``gamma`` when ``gamma``
            is 0.
        encoder_net_init: Forwarded to the ``mh`` builders and to the
            experiment name.
        which_fold: Fold to load.
        exp_name: Prefix of the experiment name.
        representation: ``'features'`` saves the output of every layer of
            ``nets[0]``; ``'subjects'`` saves the output of every DenseLayer
            of the discriminative network for the selected set. Any other
            value saves nothing.
        which_set: ``'train'``, ``'valid'`` or ``'test'``; only used when
            ``representation == 'subjects'``.
        model_path: Base directory of the trained model.
        save_path: Base directory for the saved representations.
        dataset_path: Directory forwarded to ``mlh.load_data``.

    Raises:
        ValueError: If ``representation == 'subjects'`` and ``which_set`` is
            not one of the three known sets.
    """
    print(save_path)

    # Prepare embedding information
    if embedding_source is None:
        embedding_input = 'raw'
    else:
        embedding_input = embedding_source
        embedding_source = os.path.join(
            dataset_path, embedding_input + '_fold' + str(which_fold) + '.npy')

    # Load the dataset (all labels are kept here, whatever keep_labels is)
    print("Loading data")
    x_train, y_train, x_valid, y_valid, x_test, y_test, \
        x_unsup, training_labels = mlh.load_data(
            dataset, dataset_path, embedding_source,
            which_fold=which_fold, keep_labels=1.0,
            missing_labels_val=-1.0,
            embedding_input=embedding_input)

    splits = {
        'train': (x_train, y_train),
        'valid': (x_valid, y_valid),
        'test': (x_test, y_test),
    }

    n_samples_unsup = x_unsup.shape[1] if x_unsup is not None else 0

    # Extract required information from data
    _, n_feats = x_train.shape
    n_targets = y_train.shape[1]

    # Set some variables
    batch_size = 1
    beta = gamma if (gamma == 0) else beta

    # Rebuild the experiment name used at training time
    if embedding_source is not None:
        embedding_name = embedding_source.replace("_", "").split(".")[0]
        exp_name += embedding_name.rsplit('/', 1)[-1] + '_'

    exp_name += mlh.define_exp_name(keep_labels, alpha, beta, gamma, lmd,
                                    n_hidden_u, n_hidden_t_enc, n_hidden_t_dec,
                                    n_hidden_s, which_fold, learning_rate,
                                    decoder_net_init, encoder_net_init,
                                    early_stop_criterion,
                                    learning_rate_annealing)

    print("Experiment: " + exp_name)
    model_path = os.path.join(model_path, dataset, exp_name)
    print(model_path)
    save_path = os.path.join(save_path, representation, embedding_input,
                             'fold' + str(which_fold))
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # Prepare Theano variables for inputs
    input_var_sup = T.matrix('input_sup')
    input_var_unsup = theano.shared(x_unsup, 'input_unsup')  # x_unsup TBD

    # Build model
    print("Building model")

    # Some checkings
    assert len(n_hidden_t_enc) > 0
    assert len(n_hidden_t_dec) > 0
    assert n_hidden_t_dec[-1] == n_hidden_t_enc[-1]

    # Build feature embedding networks (encoding and decoding if gamma > 0).
    # The initialisation arguments matter little here: all parameters are
    # overwritten by the saved model below.
    nets, embeddings, pred_feat_emb = mh.build_feat_emb_nets(
        embedding_source, n_feats, n_samples_unsup, input_var_unsup,
        n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, gamma, encoder_net_init,
        encoder_net_init, save_path)

    # Build feature embedding reconstruction networks (if alpha > 0, beta > 0)
    nets += mh.build_feat_emb_reconst_nets(
        [alpha, beta], n_samples_unsup, n_hidden_u,
        [n_hidden_t_enc, n_hidden_t_dec], nets,
        [encoder_net_init, encoder_net_init])

    # Supervised network
    discrim_net, hidden_rep = mh.build_discrim_net(batch_size, n_feats,
                                                   input_var_sup,
                                                   n_hidden_t_enc, n_hidden_s,
                                                   embeddings[0], 'softmax',
                                                   n_targets)

    # Reconstruct network
    nets += [
        mh.build_reconst_net(hidden_rep,
                             embeddings[1] if len(embeddings) > 1 else None,
                             n_feats, gamma)
    ]

    # Load best model
    with np.load(os.path.join(model_path, 'dietnets_best.npz')) as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    # list(...) is required on Python 3, where filter returns an iterator
    # that cannot be concatenated with a list.
    all_nets = list(filter(None, nets)) + [discrim_net]
    lasagne.layers.set_all_param_values(all_nets, param_values)

    print("Building and compiling training functions")

    if representation == 'features':
        # Output of every layer of the first feature-embedding network; the
        # compiled function takes no explicit input.
        feat_layers = lasagne.layers.get_all_layers(nets[0])
        predictions = lasagne.layers.get_output(feat_layers)
        predict = theano.function([], predictions)
        all_pred = predict()

        for i, el in enumerate(all_pred):
            file_name = os.path.join(save_path, 'layer' + str(i) + '.npy')
            print(file_name)
            np.save(file_name, el)

    elif representation == 'subjects':
        if which_set not in splits:
            raise ValueError(
                "which_set must be 'train', 'valid' or 'test', got %r" %
                (which_set,))
        x, y = splits[which_set]

        # Output of every dense layer of the discriminative network
        subject_layers = [
            el for el in lasagne.layers.get_all_layers(discrim_net)
            if isinstance(el, DenseLayer)
        ]
        predictions = lasagne.layers.get_output(subject_layers)
        predict = theano.function([input_var_sup], predictions)

        minibatches = mlh.iterate_minibatches(x,
                                              y,
                                              batch_size,
                                              shuffle=False)
        print("Starting testing...")
        per_batch_pred = [predict(batch[0]) for batch in minibatches]

        # Regroup per-batch outputs by layer and stack them along axis 0
        all_pred = [np.vstack(el) for el in zip(*per_batch_pred)]

        for i, el in enumerate(all_pred):
            file_name = os.path.join(
                save_path, 'layer' + str(i) + '_' + which_set + '.npz')
            print(file_name)
            np.savez(file_name, representation=el, label=y.argmax(1))
Beispiel #5
0
def execute(dataset,
            n_hidden_u,
            num_epochs=500,
            learning_rate=.001,
            learning_rate_annealing=1.0,
            lmd=.0001,
            embedding_input='raw',
            which_fold=0,
            save_path='/Tmp/$USER/feature_selection/newmodel/',
            save_copy='/Tmp/$USER/feature_selection/newmodel/',
            dataset_path='/Tmp/$USER/feature_selection/newmodel/'):
    """Pretrain an unsupervised autoencoder and save the feature embedding.

    The encoder stacks tanh dense layers (each followed by dropout) with the
    sizes in ``n_hidden_u``; the decoder mirrors it with linear layers and a
    final non-linearity chosen from ``embedding_input``. Training uses Adam
    with a norm constraint on 2-D updates, an L2 penalty weighted by ``lmd``
    and early stopping on the monitored validation 'loss'. When training
    ends, the best encoder parameters are reloaded and the encoder output
    for every row of the training and validation data is written to
    ``feature_embedding.npz``.

    Args:
        dataset: Dataset identifier forwarded to ``mlh.load_data``; also a
            component of the output directory.
        n_hidden_u: Non-empty sequence of encoder layer sizes.
        num_epochs: Maximum number of training epochs.
        learning_rate: Initial Adam learning rate.
        learning_rate_annealing: Factor the learning rate is multiplied by
            after every epoch that does not end training.
        lmd: Weight of the L2 penalty.
        embedding_input: ``'raw'``/``'w2v'`` (linear output), ``'bin'``
            (sigmoid) or any name containing ``'histo'`` (softmax).
        which_fold: Fold forwarded to ``mlh.load_data``.
        save_path: Base directory for the results; ``dataset`` and the
            experiment name are appended. Environment variables such as
            ``$USER`` are not expanded by this function.
        save_copy: Base directory the results are copied to at the end.
        dataset_path: Directory forwarded to ``mlh.load_data``.

    Raises:
        ValueError: If ``embedding_input`` matches none of the supported
            kinds.
    """
    # Load the dataset
    print("Loading data")
    x_unsup = mlh.load_data(dataset,
                            dataset_path,
                            None,
                            which_fold=which_fold,
                            keep_labels=1.0,
                            missing_labels_val=-1.0,
                            embedding_input=embedding_input,
                            transpose=True)

    x_train = x_unsup[0][0]
    x_valid = x_unsup[1][0]

    # Extract required information from data
    n_row, n_col = x_train.shape
    print('Data size ' + str(n_row) + 'x' + str(n_col))

    # Set some variables
    batch_size = 256

    # Define experiment name
    exp_name = 'pretrain_' + mlh.define_exp_name(
        1., 0, 0, 0, lmd, n_hidden_u, [], [], [], which_fold, embedding_input,
        learning_rate, 0, 0, 'reconst_loss', learning_rate_annealing)
    print('Experiment: ' + exp_name)

    # Preparing folder to save stuff
    save_path = os.path.join(save_path, dataset, exp_name)
    save_copy = os.path.join(save_copy, dataset, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    best_enc_file = os.path.join(save_path, 'model_enc_unsupervised_best.npz')

    # Prepare Theano variables for inputs and targets
    input_var = T.matrix('input_unsup')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')

    # Build model
    print("Building model")

    # Some checkings
    assert len(n_hidden_u) > 0

    # Encoder: tanh dense layers, each followed by dropout
    encoder_net = InputLayer((None, n_col), input_var)
    for n_units in n_hidden_u:
        encoder_net = DenseLayer(encoder_net, num_units=n_units,
                                 nonlinearity=tanh)
        encoder_net = DropoutLayer(encoder_net)

    # Decoder: mirror of the encoder (innermost size excluded), linear units
    decoder_net = encoder_net
    for n_units in reversed(n_hidden_u[:-1]):
        decoder_net = DenseLayer(decoder_net,
                                 num_units=n_units,
                                 nonlinearity=linear)
        decoder_net = DropoutLayer(decoder_net)

    decoder_net = DenseLayer(decoder_net, num_units=n_col, nonlinearity=linear)

    # Output non-linearity depends on the kind of input being reconstructed
    if embedding_input in ('raw', 'w2v'):
        final_nonlin = linear
    elif embedding_input == 'bin':
        final_nonlin = sigmoid
    elif 'histo' in embedding_input:
        final_nonlin = softmax
    else:
        raise ValueError("Unsupported embedding_input: %r" %
                         (embedding_input,))

    # For 'histo3x26' the softmax is applied over groups of 3 values: the
    # output is reshaped to 3 columns, normalised, then reshaped back.
    if embedding_input == 'histo3x26':
        laySize = lasagne.layers.get_output(decoder_net).shape
        decoder_net = ReshapeLayer(decoder_net, (laySize[0] * 26, 3))

    decoder_net = NonlinearityLayer(decoder_net, nonlinearity=final_nonlin)

    if embedding_input == 'histo3x26':
        decoder_net = ReshapeLayer(decoder_net, (laySize[0], laySize[1]))

    print("Building and compiling training functions")
    # Build and compile training functions
    predictions, predictions_det = mh.define_predictions(
        [encoder_net, decoder_net], start=0)

    # Reconstruction losses
    loss, loss_det = mh.define_loss(predictions[1], predictions_det[1],
                                    input_var, embedding_input)

    # Define parameters
    params = lasagne.layers.get_all_params(decoder_net, trainable=True)

    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty
    loss_det = loss_det + lmd * l2_penalty

    # Compute network updates
    updates = lasagne.updates.adam(loss, params, learning_rate=lr)

    # Apply norm constraints on the weights
    for k in updates.keys():
        if updates[k].ndim == 2:
            updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

    # Compile training function
    train_fn = theano.function([input_var],
                               loss,
                               updates=updates,
                               on_unused_input='ignore')

    # Expressions required for test
    monitor_labels = ['loss']
    val_outputs = [loss_det]

    # Add some monitoring on the learned feature embedding
    val_outputs += [
        predictions[0].min(), predictions[0].mean(), predictions[0].max(),
        predictions[0].var()
    ]
    monitor_labels += [
        "feat. emb. min", "feat. emb. mean", "feat. emb. max", "feat. emb. var"
    ]

    # Compile validation function
    val_fn = theano.function([input_var], val_outputs)

    pred_feat_emb = theano.function([input_var], predictions_det[0])

    # Finally, launch the training loop.
    print("Starting training...")

    # Some variables
    max_patience = 100
    patience = 0
    best_valid = None

    train_monitored = []
    valid_monitored = []
    train_loss = []

    print("Nb of minibatches: " + str(n_row // batch_size))
    start_training = time.time()
    for epoch in range(num_epochs):
        start_time = time.time()
        print("Epoch {} of {}".format(epoch + 1, num_epochs))

        loss_epoch = 0
        nb_minibatches = 0

        # Train pass. Batches are counted as they are consumed so the
        # average is right whatever the iterator does with a partial batch.
        for batch in mlh.iterate_minibatches_unsup(x_train,
                                                   batch_size,
                                                   shuffle=True):
            loss_epoch += train_fn(batch)
            nb_minibatches += 1

        loss_epoch /= max(nb_minibatches, 1)
        train_loss += [loss_epoch]

        train_minibatches = mlh.iterate_minibatches_unsup(x_train,
                                                          batch_size,
                                                          shuffle=True)
        train_err = mlh.monitoring(train_minibatches,
                                   "train",
                                   val_fn,
                                   monitor_labels,
                                   start=0)
        train_monitored += [train_err]

        # Validation pass
        valid_minibatches = mlh.iterate_minibatches_unsup(x_valid,
                                                          batch_size,
                                                          shuffle=True)

        valid_err = mlh.monitoring(valid_minibatches,
                                   "valid",
                                   val_fn,
                                   monitor_labels,
                                   start=0)

        valid_monitored += [valid_err]

        # 'loss' is always the first monitored label (see above)
        early_stop_val = valid_err[monitor_labels.index('loss')]

        # Early stopping. The first epoch counts as an improvement so that a
        # "best" checkpoint exists even if validation never improves later.
        if epoch == 0 or early_stop_val < best_valid:
            best_valid = early_stop_val
            patience = 0
            suffix = 'best'
        else:
            patience += 1
            suffix = 'last'

        # Save stuff
        # NOTE(review): both model files store the encoder parameters; the
        # 'ae' file was presumably meant to hold the full autoencoder
        # (decoder_net). Kept unchanged pending confirmation.
        np.savez(
            os.path.join(save_path,
                         'model_enc_unsupervised_' + suffix + '.npz'),
            *lasagne.layers.get_all_param_values(encoder_net))
        np.savez(
            os.path.join(save_path,
                         'model_ae_unsupervised_' + suffix + '.npz'),
            *lasagne.layers.get_all_param_values(encoder_net))
        # list(...) keeps the saved arrays meaningful on Python 3, where zip
        # returns an iterator
        np.savez(
            os.path.join(save_path, 'errors_unsupervised_' + suffix + '.npz'),
            list(zip(*train_monitored)), list(zip(*valid_monitored)))

        # End training
        if patience == max_patience or epoch == num_epochs - 1:
            print("   Ending training")
            # Load unsupervised best model
            if not os.path.exists(best_enc_file):
                print("No saved model to be tested and/or generate"
                      " the embedding !")
            else:
                with np.load(best_enc_file) as f:
                    param_values = [
                        f['arr_%d' % i] for i in range(len(f.files))
                    ]
                    lasagne.layers.set_all_param_values(
                        encoder_net, param_values)

                # Save embedding: encoder output for every training row,
                # followed by every validation row
                preds = []
                for data in (x_train, x_valid):
                    for batch in mlh.iterate_minibatches_unsup(data,
                                                               1,
                                                               shuffle=False):
                        preds.append(pred_feat_emb(batch))
                preds = np.vstack(preds)
                np.savez(os.path.join(save_path, 'feature_embedding.npz'),
                         preds)

            # Stop
            print(" epoch time:\t\t\t{:.3f}s".format(time.time() - start_time))
            break

        print("  epoch time:\t\t\t{:.3f}s".format(time.time() - start_time))
        # Anneal the learning rate
        lr.set_value(float(lr.get_value() * learning_rate_annealing))

    # Print total training time
    print("Training time:\t\t\t{:.3f}s".format(time.time() - start_training))

    # Copy files to loadpath
    if save_path != save_copy:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)
Beispiel #6
0
def execute(dataset,
            n_hidden_u,
            n_hidden_t_enc,
            n_hidden_t_dec,
            n_hidden_s,
            learning_rate,
            learning_rate_annealing=1.,
            embedding_source=None,
            alpha=1,
            beta=1,
            gamma=1,
            lmd=0,
            encoder_net_init=0.001,
            decoder_net_init=0.001,
            disc_nonlinearity='softmax',
            keep_labels=1.0,
            prec_recall_cutoff=True,
            missing_labels_val=-1.0,
            which_fold=0,
            early_stop_criterion='accuracy',
            save_path='/Tmp/romerosa/DietNetworks/',
            dataset_path='/Tmp/' + os.environ.get("USER", "") + '/datasets/',
            resume=False,
            exp_name=''):
    """Evaluate a saved Diet Network on the test set and store confusion matrices.

    The function loads the data, rebuilds the feature-embedding,
    discriminative and reconstruction networks, restores the parameters
    stored in ``dietnets_best.npz`` inside the experiment folder, runs one
    monitoring pass over the test set and finally writes two confusion
    matrices to ``cm<which_fold>.npz`` in that folder:

    * ``cm_e`` (26 x 26): predicted class index vs. true class index;
    * ``cm_c`` (5 x 5): the same after mapping classes to the groups returned
      by ``mh.create_1000_genomes_continent_labels()``.

    Args:
        dataset: Dataset identifier forwarded to ``mlh.load_data``; also used
            as a sub-folder of ``save_path``.
        n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, n_hidden_s: Layer-size
            sequences forwarded to the model builders. ``n_hidden_t_enc`` and
            ``n_hidden_t_dec`` must be non-empty and end with the same value.
        learning_rate, learning_rate_annealing, early_stop_criterion: Only
            used here to rebuild the experiment name.
        embedding_source: ``None`` to use the ``'raw'`` embedding input;
            otherwise an embedding name that is resolved to
            ``<dataset_path>/<name>_fold<which_fold>.npy``.
        alpha, beta, gamma: Weights of the three reconstruction losses.
            ``beta`` is overridden by ``gamma`` when ``gamma == 0``.
        lmd: Weight of the L2 penalty added to the monitored total loss.
        encoder_net_init, decoder_net_init: Initialisation settings forwarded
            to the feature-embedding network builders.
        disc_nonlinearity: Output non-linearity name forwarded to the
            discriminative-network, loss and test-function helpers.
        keep_labels, missing_labels_val: Forwarded to ``mlh.load_data`` and
            ``mh.define_sup_loss``.
        prec_recall_cutoff: Forwarded to ``mlh.monitoring``.
        which_fold: Fold index used for data loading and in file names.
        save_path: Root folder; the experiment folder is
            ``<save_path>/<dataset>/<exp_name>``.
        dataset_path: Folder holding the datasets. The default is built from
            the ``USER`` environment variable (empty string if it is unset).
        resume: Unused in this function.
        exp_name: Prefix of the generated experiment name.

    Raises:
        AssertionError: If the hidden-size sanity checks fail.
        IOError: If ``dietnets_best.npz`` cannot be opened (raised by
            ``np.load``).
    """
    # Prepare embedding information
    if embedding_source is None:
        embedding_input = 'raw'
    else:
        embedding_input = embedding_source
        embedding_source = os.path.join(
            dataset_path, embedding_input + '_fold' + str(which_fold) + '.npy')

    # Load the dataset
    print("Loading data")
    x_train, y_train, x_valid, y_valid, x_test, y_test, \
        x_unsup, training_labels = mlh.load_data(
            dataset, dataset_path, embedding_source,
            which_fold=which_fold, keep_labels=keep_labels,
            missing_labels_val=missing_labels_val,
            embedding_input=embedding_input)

    n_samples_unsup = x_unsup.shape[1] if x_unsup is not None else 0

    # Extract required information from data
    _, n_feats = x_train.shape
    print("Number of features : ", n_feats)
    print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1]))
    n_targets = y_train.shape[1]

    # Set some variables
    batch_size = 138
    if gamma == 0:
        # Without the reconstruction loss the decoder embedding loss is
        # switched off as well.
        beta = gamma

    # Preparing folder to save stuff
    if embedding_source is not None:
        embedding_name = embedding_source.replace("_", "").split(".")[0]
        exp_name += embedding_name.rsplit('/', 1)[-1] + '_'

    exp_name += mlh.define_exp_name(keep_labels, alpha, beta, gamma, lmd,
                                    n_hidden_u, n_hidden_t_enc, n_hidden_t_dec,
                                    n_hidden_s, which_fold, learning_rate,
                                    decoder_net_init, encoder_net_init,
                                    early_stop_criterion,
                                    learning_rate_annealing)

    print("Experiment: " + exp_name)
    save_path = os.path.join(save_path, dataset, exp_name)
    print(save_path)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # Prepare Theano variables for inputs and targets
    input_var_sup = T.matrix('input_sup')
    input_var_unsup = theano.shared(x_unsup, 'input_unsup')  # x_unsup TBD
    target_var_sup = T.matrix('target_sup')

    # Build model
    print("Building model")

    # Some checkings
    # assert len(n_hidden_u) > 0
    assert len(n_hidden_t_enc) > 0
    assert len(n_hidden_t_dec) > 0
    assert n_hidden_t_dec[-1] == n_hidden_t_enc[-1]

    # Build feature embedding networks (encoding and decoding if gamma > 0).
    # The decoder gets its own init setting (the encoder value used to be
    # passed twice); the weights are overwritten by the checkpoint below.
    nets, embeddings, pred_feat_emb = mh.build_feat_emb_nets(
        embedding_source, n_feats, n_samples_unsup, input_var_unsup,
        n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, gamma, encoder_net_init,
        decoder_net_init, save_path)

    # Build feature embedding reconstruction networks (if alpha > 0, beta > 0)
    nets += mh.build_feat_emb_reconst_nets(
        [alpha, beta], n_samples_unsup, n_hidden_u,
        [n_hidden_t_enc, n_hidden_t_dec], nets,
        [encoder_net_init, decoder_net_init])

    # Supervised network
    discrim_net, hidden_rep = mh.build_discrim_net(
        batch_size, n_feats, input_var_sup, n_hidden_t_enc, n_hidden_s,
        embeddings[0], disc_nonlinearity, n_targets)

    # Reconstruct network
    has_dec_emb = len(embeddings) > 1 and embeddings[1] is not None
    nets += [
        mh.build_reconst_net(hidden_rep,
                             embeddings[1] if len(embeddings) > 1 else None,
                             n_feats, gamma)
    ]

    # Networks that were actually built. A list comprehension is used instead
    # of ``filter(None, nets)`` because the latter returns an iterator on
    # Python 3 and cannot be concatenated with a list.
    active_nets = [net for net in nets if net]

    # Load best model
    with np.load(os.path.join(save_path, 'dietnets_best.npz')) as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    lasagne.layers.set_all_param_values(
        active_nets + [discrim_net], param_values)

    print("Building and compiling training functions")

    # Build functions
    predictions, predictions_det = mh.define_predictions(nets, start=2)
    prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net])
    prediction_sup = prediction_sup[0]
    prediction_sup_det = prediction_sup_det[0]

    # Define losses
    # reconstruction losses
    _, reconst_losses_det = mh.define_reconst_losses(
        predictions, predictions_det,
        [input_var_unsup, input_var_unsup, input_var_sup])
    # supervised loss
    _, sup_loss_det = mh.define_sup_loss(disc_nonlinearity, prediction_sup,
                                         prediction_sup_det, keep_labels,
                                         target_var_sup, missing_labels_val)

    # Define inputs
    inputs = [input_var_sup, target_var_sup]

    # Combine losses
    loss_det = sup_loss_det + alpha*reconst_losses_det[0] + \
        beta*reconst_losses_det[1] + gamma*reconst_losses_det[2]

    # Define parameters
    params = lasagne.layers.get_all_params([discrim_net] + active_nets,
                                           trainable=True)

    l2_penalty = apply_penalty(params, l2)
    loss_det = loss_det + lmd * l2_penalty

    # Monitoring Labels (only the reconstruction losses that are not 0)
    monitor_labels = [
        "reconst. feat. W_enc", "reconst. feat. W_dec", "reconst. loss"
    ]
    monitor_labels = [
        i for i, j in zip(monitor_labels, reconst_losses_det) if j != 0
    ]
    monitor_labels += ["feat. W_enc. mean", "feat. W_enc var"]
    if has_dec_emb:
        monitor_labels += ["feat. W_dec. mean", "feat. W_dec var"]
    monitor_labels += ["loss. sup.", "total loss"]

    # test function outputs, in the same order as monitor_labels
    val_outputs = [i for i in reconst_losses_det if i != 0]
    val_outputs += [embeddings[0].mean(), embeddings[0].var()]
    if has_dec_emb:
        val_outputs += [embeddings[1].mean(), embeddings[1].var()]
    val_outputs += [sup_loss_det, loss_det]

    # Compute accuracy and add it to monitoring list
    test_acc, test_pred = mh.define_test_functions(disc_nonlinearity,
                                                   prediction_sup,
                                                   prediction_sup_det,
                                                   target_var_sup)
    monitor_labels.append("accuracy")
    val_outputs.append(test_acc)

    # Compile prediction function
    predict = theano.function([input_var_sup], test_pred)

    # Compile validation function
    val_fn = theano.function(inputs, [prediction_sup_det] + val_outputs,
                             on_unused_input='ignore')

    # Finally, run the evaluation on the test set.
    print("Starting testing...")
    test_minibatches = mlh.iterate_minibatches(x_test,
                                               y_test,
                                               batch_size,
                                               shuffle=False)
    test_err, pred, targets = mlh.monitoring(test_minibatches,
                                             "test",
                                             val_fn,
                                             monitor_labels,
                                             prec_recall_cutoff,
                                             return_pred=True)

    # Class indices of the targets and of the predictions
    lab = targets.argmax(1)
    pred_argmax = pred.argmax(1)

    # Map every class index to the index of the group that contains it
    continent_cat = mh.create_1000_genomes_continent_labels()

    lab_cont = np.zeros(lab.shape)
    pred_cont = np.zeros(pred_argmax.shape)

    for i, c in enumerate(continent_cat):
        for el in c:
            lab_cont[lab == el] = i
            pred_cont[pred_argmax == el] = i

    # Confusion matrices: rows are predictions, columns are true labels
    cm_e = np.zeros((26, 26))
    cm_c = np.zeros((5, 5))

    for i in range(26):
        for j in range(26):
            cm_e[i, j] = ((pred_argmax == i) * (lab == j)).sum()

    for i in range(5):
        for j in range(5):
            cm_c[i, j] = ((pred_cont == i) * (lab_cont == j)).sum()

    cm_file = os.path.join(save_path, 'cm' + str(which_fold) + '.npz')
    np.savez(cm_file, cm_e=cm_e, cm_c=cm_c)

    print(cm_file)
Beispiel #7
0
def execute(dataset,
            n_hidden_t_enc,
            n_hidden_s,
            num_epochs=500,
            learning_rate=.001,
            learning_rate_annealing=1.0,
            gamma=1,
            lmd=0.,
            disc_nonlinearity="sigmoid",
            keep_labels=1.0,
            prec_recall_cutoff=True,
            missing_labels_val=-1.0,
            which_fold=1,
            early_stop_criterion='loss',
            embedding_input='raw',
            save_path='/Tmp/romerosa/feature_selection/',
            save_copy='/Tmp/romerosa/feature_selection/',
            dataset_path='/Tmp/carriepl/datasets/',
            resume=False,
            exp_name=None):
    """Evaluate a saved feature-selection model and store confusion matrices.

    The function loads the data, rebuilds a dense network (one rectified
    hidden layer of ``n_hidden_t_enc[-1]`` units, an optional linear
    reconstruction branch when ``gamma > 0``, then dropout + dense layers for
    every entry of ``n_hidden_s`` and for the output), restores the
    parameters from ``model_feat_sel_best.npz`` in the ``save_copy``
    experiment folder and runs one monitoring pass over the test set. Two
    confusion matrices are written to ``cm<which_fold>.npz`` in that folder:

    * ``cm_e`` (26 x 26): predicted class index vs. true class index;
    * ``cm_c`` (5 x 5): the same after mapping classes to the groups returned
      by ``mh.create_1000_genomes_continent_labels()``.

    If the checkpoint file does not exist only a message is printed.

    Args:
        dataset: Dataset identifier forwarded to ``mlh.load_data``; also used
            as a sub-folder of ``save_path`` / ``save_copy``.
        n_hidden_t_enc: Sequence whose last entry is the size of the first
            hidden layer.
        n_hidden_s: Sizes of the supervised hidden layers.
        num_epochs, learning_rate_annealing, early_stop_criterion, resume:
            Unused in this function.
        learning_rate: Learning rate of the RMSProp updates that are built
            (the compiled training function is not called here).
        gamma: Weight of the reconstruction loss; the reconstruction branch
            is only built when it is positive.
        lmd: Weight of the L2 penalty.
        disc_nonlinearity: One of ``"sigmoid"``, ``"linear"``, ``"rectify"``
            or ``"softmax"``.
        keep_labels, missing_labels_val: Forwarded to ``mlh.load_data`` and
            ``mh.define_sup_loss``.
        prec_recall_cutoff: Forwarded to ``mlh.monitoring``.
        which_fold: Fold index used for data loading and in file names.
        embedding_input: Forwarded to ``mlh.load_data``.
        save_path: Root folder; ``<save_path>/<dataset>/<exp_name>`` is
            created if missing.
        save_copy: Root folder the checkpoint is read from and the confusion
            matrices are written to.
        dataset_path: Folder holding the datasets.
        exp_name: Experiment name; ``None`` is treated as an empty name.

    Raises:
        AssertionError: If ``disc_nonlinearity`` is not one of the supported
            names.
    """
    # The default used to crash on the string concatenation / path join below.
    if exp_name is None:
        exp_name = ''

    # Load the dataset
    print("Loading data")
    x_train, y_train, x_valid, y_valid, x_test, y_test, \
        x_unsup, training_labels = mlh.load_data(
            dataset, dataset_path, None,
            which_fold=which_fold, keep_labels=keep_labels,
            missing_labels_val=missing_labels_val,
            embedding_input=embedding_input)

    # Extract required information from data
    _, n_feats = x_train.shape
    print("Number of features : ", n_feats)
    print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1]))
    n_targets = y_train.shape[1]

    # Set some variables
    batch_size = 1

    # Preparing folder to save stuff
    print("Experiment: " + exp_name)
    save_path = os.path.join(save_path, dataset, exp_name)
    save_copy = os.path.join(save_copy, dataset, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # Prepare Theano variables for inputs and targets
    input_var_sup = T.matrix('input_sup')
    target_var_sup = T.matrix('target_sup')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')

    # Build model
    print("Building model")
    discrim_net = InputLayer((None, n_feats), input_var_sup)
    discrim_net = DenseLayer(discrim_net,
                             num_units=n_hidden_t_enc[-1],
                             nonlinearity=rectify)

    # Reconstruct the input from the first hidden layer
    if gamma > 0:
        reconst_net = DenseLayer(discrim_net,
                                 num_units=n_feats,
                                 nonlinearity=linear)
        nets = [reconst_net]
    else:
        nets = [None]

    # Add supervised hidden layers
    for hid in n_hidden_s:
        discrim_net = DropoutLayer(discrim_net)
        discrim_net = DenseLayer(discrim_net, num_units=hid)

    assert disc_nonlinearity in ["sigmoid", "linear", "rectify", "softmax"]
    discrim_net = DropoutLayer(discrim_net)
    # NOTE(review): eval() resolves the non-linearity by name; it is only
    # protected by the assert above, which is stripped under ``python -O``.
    discrim_net = DenseLayer(discrim_net,
                             num_units=n_targets,
                             nonlinearity=eval(disc_nonlinearity))

    print("Building and compiling training functions")

    # Build and compile training functions
    predictions, predictions_det = mh.define_predictions(nets, start=0)
    prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net])
    prediction_sup = prediction_sup[0]
    prediction_sup_det = prediction_sup_det[0]

    # Define losses
    # reconstruction losses
    reconst_losses, reconst_losses_det = mh.define_reconst_losses(
        predictions, predictions_det, [input_var_sup])
    # supervised loss
    sup_loss, sup_loss_det = mh.define_sup_loss(disc_nonlinearity,
                                                prediction_sup,
                                                prediction_sup_det,
                                                keep_labels, target_var_sup,
                                                missing_labels_val)

    inputs = [input_var_sup, target_var_sup]
    params = lasagne.layers.get_all_params([discrim_net] + nets,
                                           trainable=True)

    print('Number of params: ' + str(len(params)))

    # Combine losses
    loss = sup_loss + gamma * reconst_losses[0]
    loss_det = sup_loss_det + gamma * reconst_losses_det[0]

    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty
    loss_det = loss_det + lmd * l2_penalty

    # Compute network updates
    updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr)

    # Apply norm constraints on the weights (2-D update expressions only)
    for k in list(updates.keys()):
        if updates[k].ndim == 2:
            updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

    # Compile training function (not called in this evaluation-only script)
    train_fn = theano.function(inputs,
                               loss,
                               updates=updates,
                               on_unused_input='ignore')

    # Monitoring Labels (the reconstruction loss only if it is not 0)
    monitor_labels = ["reconst. loss"]
    monitor_labels = [
        i for i, j in zip(monitor_labels, reconst_losses) if j != 0
    ]
    monitor_labels += ["loss. sup.", "total loss"]

    # Build and compile test function
    val_outputs = [
        i for i, j in zip(reconst_losses_det, reconst_losses) if j != 0
    ]
    val_outputs += [sup_loss_det, loss_det]

    # Compute accuracy and add it to monitoring list
    test_acc, test_pred = mh.define_test_functions(disc_nonlinearity,
                                                   prediction_sup,
                                                   prediction_sup_det,
                                                   target_var_sup)
    monitor_labels.append("accuracy")
    val_outputs.append(test_acc)

    # Compile prediction function
    predict = theano.function([input_var_sup], test_pred)

    # Compile validation function
    val_fn = theano.function(inputs, [prediction_sup_det] + val_outputs,
                             on_unused_input='ignore')

    # Finally, run the evaluation on the test set.
    print("Starting testing...")

    if not os.path.exists(save_copy + '/model_feat_sel_best.npz'):
        print("No saved model to be tested and/or generate" " the embedding !")
    else:
        with np.load(save_copy + '/model_feat_sel_best.npz', ) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
            # A list comprehension replaces ``filter(None, nets)``, which is
            # an iterator on Python 3 and cannot be concatenated with a list.
            lasagne.layers.set_all_param_values(
                [net for net in nets if net] + [discrim_net], param_values)

            test_minibatches = mlh.iterate_minibatches(x_test,
                                                       y_test,
                                                       batch_size,
                                                       shuffle=False)

            test_err, pred, targets = mlh.monitoring(test_minibatches,
                                                     "test",
                                                     val_fn,
                                                     monitor_labels,
                                                     prec_recall_cutoff,
                                                     return_pred=True)

        # Class indices of the targets and of the predictions
        lab = targets.argmax(1)
        pred_argmax = pred.argmax(1)

        # Map every class index to the index of the group that contains it
        continent_cat = mh.create_1000_genomes_continent_labels()

        lab_cont = np.zeros(lab.shape)
        pred_cont = np.zeros(pred_argmax.shape)

        for i, c in enumerate(continent_cat):
            for el in c:
                lab_cont[lab == el] = i
                pred_cont[pred_argmax == el] = i

        # Confusion matrices: rows are predictions, columns are true labels
        cm_e = np.zeros((26, 26))
        cm_c = np.zeros((5, 5))

        for i in range(26):
            for j in range(26):
                cm_e[i, j] = ((pred_argmax == i) * (lab == j)).sum()

        for i in range(5):
            for j in range(5):
                cm_c[i, j] = ((pred_cont == i) * (lab_cont == j)).sum()

        cm_file = os.path.join(save_copy, 'cm' + str(which_fold) + '.npz')
        np.savez(cm_file, cm_e=cm_e, cm_c=cm_c)

        print(cm_file)
Beispiel #8
0
def execute(
        dataset,
        n_hidden_u,
        n_hidden_t_enc,
        n_hidden_t_dec,
        n_hidden_s,
        embedding_source=None,
        num_epochs=500,
        learning_rate=.001,
        learning_rate_annealing=1.0,
        alpha=1,
        beta=1,
        gamma=1,
        lmd=.0001,
        disc_nonlinearity="sigmoid",
        encoder_net_init=0.2,
        decoder_net_init=0.2,
        keep_labels=1.0,
        prec_recall_cutoff=True,
        missing_labels_val=-1.0,
        which_fold=0,
        early_stop_criterion='loss_sup_det',
        embedding_input='raw',
        save_path='/Tmp/' + os.environ.get("USER", "") + '/savepath/',
        save_copy='/Tmp/' + os.environ.get("USER", "") + '/savecopy/',
        dataset_path='/Tmp/' + os.environ.get("USER", "") + '/datasets/',
        resume=False,
        exp_name='',
        random_proj=0):
    """Train the feature-embedding + discriminative model with early stopping.

    The function loads the data, builds the feature-embedding, discriminative
    and reconstruction networks, compiles an RMSProp training function and a
    monitoring function, and then trains for at most ``num_epochs`` epochs.
    After every epoch the training and validation sets are monitored; when
    the validation value of ``early_stop_criterion`` improves, the parameters
    are written to ``model_feat_sel_best.npz`` (the first epoch always counts
    as an improvement), otherwise they are written to
    ``model_feat_sel_last.npz`` and the patience counter is increased.
    Training ends after 100 epochs without improvement or after the last
    epoch; the best parameters are then reloaded, final results are computed
    and, if ``save_path`` and ``save_copy`` differ, the experiment folder is
    copied to ``save_copy``.

    Args:
        dataset: Dataset identifier forwarded to ``mlh.load_data``; also used
            as a sub-folder of ``save_path`` / ``save_copy``.
        n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, n_hidden_s: Layer-size
            sequences forwarded to the model builders. ``n_hidden_t_enc`` and
            ``n_hidden_t_dec`` must be non-empty and end with the same value.
        embedding_source: Forwarded to ``mlh.load_data`` and the embedding
            builder. When ``None`` the learned feature embedding is saved to
            ``feature_embedding.npz`` at the end of training.
        num_epochs: Maximum number of training epochs.
        learning_rate: Initial learning rate.
        learning_rate_annealing: Factor the learning rate is multiplied by
            after every epoch.
        alpha, beta, gamma: Weights of the three reconstruction losses.
            ``beta`` is overridden by ``gamma`` when ``gamma == 0``.
        lmd: Weight of the L2 penalty.
        disc_nonlinearity: Output non-linearity name forwarded to the
            discriminative-network, loss and test-function helpers.
        encoder_net_init, decoder_net_init: Initialisation settings forwarded
            to the feature-embedding network builders.
        keep_labels, missing_labels_val: Forwarded to ``mlh.load_data`` and
            ``mh.define_sup_loss``.
        prec_recall_cutoff: Forwarded to ``mlh.monitoring``.
        which_fold: Fold index forwarded to ``mlh.load_data``.
        early_stop_criterion: Monitored label used for early stopping. Only
            ``'accuracy'`` (higher is better) and ``'loss. sup.'`` (lower is
            better) can register an improvement. NOTE: the default
            ``'loss_sup_det'`` is not one of the monitored labels and
            therefore raises ``ValueError`` after the first epoch.
        embedding_input: Forwarded to ``mlh.load_data``.
        save_path: Root folder for checkpoints and error logs.
        save_copy: Root folder the experiment folder is copied to at the end.
        dataset_path: Folder holding the datasets.
        resume: When true, parameters are first loaded from
            ``model_feat_sel_last.npz``.
        exp_name: Prefix of the generated experiment name.
        random_proj: Forwarded to ``mh.build_feat_emb_nets``.

    Raises:
        AssertionError: If the hidden-size sanity checks fail.
        ValueError: If ``early_stop_criterion`` is not a monitored label.
    """
    # Load the dataset
    print("Loading data")
    x_train, y_train, x_valid, y_valid, x_test, y_test, \
        x_unsup, training_labels = mlh.load_data(
            dataset, dataset_path, embedding_source,
            which_fold=which_fold, keep_labels=keep_labels,
            missing_labels_val=missing_labels_val,
            embedding_input=embedding_input)

    n_samples_unsup = x_unsup.shape[1] if x_unsup is not None else 0

    # Extract required information from data
    _, n_feats = x_train.shape
    print("Number of features : ", n_feats)
    print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1]))
    n_targets = y_train.shape[1]

    # Set some variables
    batch_size = 128
    if gamma == 0:
        # Without the reconstruction loss the decoder embedding loss is
        # switched off as well.
        beta = gamma

    # Preparing folder to save stuff
    if embedding_source is not None:
        embedding_name = embedding_source.replace("_", "").split(".")[0]
        exp_name += embedding_name.rsplit('/', 1)[-1] + '_'

    exp_name += 'final_'

    exp_name += mlh.define_exp_name(keep_labels, alpha, beta, gamma, lmd,
                                    n_hidden_u, n_hidden_t_enc, n_hidden_t_dec,
                                    n_hidden_s, which_fold, embedding_input,
                                    learning_rate, decoder_net_init,
                                    encoder_net_init, early_stop_criterion,
                                    learning_rate_annealing)

    print("Experiment: " + exp_name)
    save_path = os.path.join(save_path, dataset, exp_name)
    save_copy = os.path.join(save_copy, dataset, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(save_copy):
        os.makedirs(save_copy)

    # Prepare Theano variables for inputs and targets
    input_var_sup = T.matrix('input_sup')
    input_var_unsup = theano.shared(x_unsup, 'input_unsup')  # x_unsup TBD
    target_var_sup = T.matrix('target_sup')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')

    # Build model
    print("Building model")

    # Some checkings
    # assert len(n_hidden_u) > 0
    assert len(n_hidden_t_enc) > 0
    assert len(n_hidden_t_dec) > 0
    assert n_hidden_t_dec[-1] == n_hidden_t_enc[-1]

    # Build feature embedding networks (encoding and decoding if gamma > 0)
    nets, embeddings, pred_feat_emb = mh.build_feat_emb_nets(
        embedding_source, n_feats, n_samples_unsup, input_var_unsup,
        n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, gamma, encoder_net_init,
        decoder_net_init, save_path, random_proj)

    # Build feature embedding reconstruction networks (if alpha > 0, beta > 0)
    nets += mh.build_feat_emb_reconst_nets(
        [alpha, beta], n_samples_unsup, n_hidden_u,
        [n_hidden_t_enc, n_hidden_t_dec], nets,
        [encoder_net_init, decoder_net_init])

    # Supervised network
    discrim_net, hidden_rep = mh.build_discrim_net(
        batch_size, n_feats, input_var_sup, n_hidden_t_enc, n_hidden_s,
        embeddings[0], disc_nonlinearity, n_targets)

    # Reconstruct network
    has_dec_emb = len(embeddings) > 1 and embeddings[1] is not None
    nets += [
        mh.build_reconst_net(hidden_rep,
                             embeddings[1] if len(embeddings) > 1 else None,
                             n_feats, gamma)
    ]

    # Networks that were actually built. A list comprehension is used instead
    # of ``filter(None, nets)`` because the latter returns an iterator on
    # Python 3 and cannot be concatenated with a list.
    active_nets = [net for net in nets if net]
    # Layer list (and order) used for every checkpoint save / load.
    checkpoint_nets = active_nets + [discrim_net]

    # Load weights if we are resuming job
    if resume:
        # Load the last saved model
        with np.load(os.path.join(save_path, 'model_feat_sel_last.npz')) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        n_params = len(lasagne.layers.get_all_params(checkpoint_nets))
        lasagne.layers.set_all_param_values(
            checkpoint_nets, param_values[:n_params])

    print("Building and compiling training functions")

    # Build and compile training functions
    predictions, predictions_det = mh.define_predictions(nets, start=2)
    prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net])
    prediction_sup = prediction_sup[0]
    prediction_sup_det = prediction_sup_det[0]

    # Define losses
    # reconstruction losses
    reconst_losses, reconst_losses_det = mh.define_reconst_losses(
        predictions, predictions_det,
        [input_var_unsup, input_var_unsup, input_var_sup])
    # supervised loss
    sup_loss, sup_loss_det = mh.define_sup_loss(disc_nonlinearity,
                                                prediction_sup,
                                                prediction_sup_det,
                                                keep_labels, target_var_sup,
                                                missing_labels_val)

    # Define inputs
    inputs = [input_var_sup, target_var_sup]

    # Define parameters
    params = lasagne.layers.get_all_params([discrim_net] + active_nets,
                                           trainable=True)
    params_to_freeze = \
        lasagne.layers.get_all_params(active_nets, trainable=False)

    print('Number of params discrim: ' + str(len(params)))
    print('Number of params to freeze: ' + str(len(params_to_freeze)))

    # Drop every parameter that must stay frozen from the update list
    for p in params_to_freeze:
        params = [el for el in params if el != p]

    print('Number of params to update: ' + str(len(params)))

    # Combine losses
    loss = sup_loss + alpha*reconst_losses[0] + beta*reconst_losses[1] + \
        gamma*reconst_losses[2]
    loss_det = sup_loss_det + alpha*reconst_losses_det[0] + \
        beta*reconst_losses_det[1] + gamma*reconst_losses_det[2]

    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty
    loss_det = loss_det + lmd * l2_penalty

    # Compute network updates
    updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr)

    # Apply norm constraints on the weights (2-D update expressions only)
    for k in list(updates.keys()):
        if updates[k].ndim == 2:
            updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

    # Compile training function
    train_fn = theano.function(inputs,
                               loss,
                               updates=updates,
                               on_unused_input='ignore')

    # Monitoring Labels (only the reconstruction losses that are not 0)
    monitor_labels = [
        "reconst. feat. W_enc", "reconst. feat. W_dec", "reconst. loss"
    ]
    monitor_labels = [
        i for i, j in zip(monitor_labels, reconst_losses) if j != 0
    ]
    monitor_labels += ["feat. W_enc. mean", "feat. W_enc var"]
    if has_dec_emb:
        monitor_labels += ["feat. W_dec. mean", "feat. W_dec var"]
    monitor_labels += ["loss. sup.", "total loss"]

    # Build and compile test function; outputs follow monitor_labels' order
    val_outputs = [
        i for i, j in zip(reconst_losses_det, reconst_losses) if j != 0
    ]
    val_outputs += [embeddings[0].mean(), embeddings[0].var()]
    if has_dec_emb:
        val_outputs += [embeddings[1].mean(), embeddings[1].var()]
    val_outputs += [sup_loss_det, loss_det]

    # Compute accuracy and add it to monitoring list
    test_acc, test_pred = mh.define_test_functions(disc_nonlinearity,
                                                   prediction_sup,
                                                   prediction_sup_det,
                                                   target_var_sup)
    monitor_labels.append("accuracy")
    val_outputs.append(test_acc)

    # Compile prediction function
    predict = theano.function([input_var_sup], test_pred)

    # Compile validation function
    val_fn = theano.function(inputs, [prediction_sup_det] + val_outputs,
                             on_unused_input='ignore')

    # Finally, launch the training loop.
    print("Starting training...")

    # Some variables
    max_patience = 100
    patience = 0

    train_monitored = []
    valid_monitored = []
    train_loss = []
    # Stays None when no labelled test set is available, so that the final
    # report below never hits an unbound name.
    test_err = None

    # Pre-training monitoring
    print("Epoch 0 of {}".format(num_epochs))

    train_minibatches = mlh.iterate_minibatches(x_train,
                                                y_train,
                                                batch_size,
                                                shuffle=False)
    train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                               monitor_labels, prec_recall_cutoff)

    valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                y_valid,
                                                batch_size,
                                                shuffle=False)
    valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                               monitor_labels, prec_recall_cutoff)

    # Training loop
    start_training = time.time()
    for epoch in range(num_epochs):
        start_time = time.time()
        print("Epoch {} of {}".format(epoch + 1, num_epochs))
        nb_minibatches = 0
        loss_epoch = 0

        # Train pass
        for batch in mlh.iterate_minibatches(x_train,
                                             training_labels,
                                             batch_size,
                                             shuffle=True):
            loss_epoch += train_fn(*batch)
            nb_minibatches += 1

        loss_epoch /= nb_minibatches
        train_loss += [loss_epoch]

        # Monitoring on the training set
        train_minibatches = mlh.iterate_minibatches(x_train,
                                                    y_train,
                                                    batch_size,
                                                    shuffle=False)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, prec_recall_cutoff)
        train_monitored += [train_err]

        # Monitoring on the validation set
        valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                    y_valid,
                                                    batch_size,
                                                    shuffle=False)

        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, prec_recall_cutoff)
        valid_monitored += [valid_err]

        # Only an unknown label is reported as such; other errors propagate.
        try:
            criterion_idx = monitor_labels.index(early_stop_criterion)
        except ValueError:
            raise ValueError("There is no monitored value by the name of %s" %
                             early_stop_criterion)
        early_stop_val = valid_err[criterion_idx]

        # Early stopping. The first epoch always counts as the best so far,
        # so a "best" checkpoint of this run exists even if the validation
        # value never improves afterwards.
        if epoch == 0:
            improved = True
        else:
            improved = (
                (early_stop_criterion == 'accuracy' and
                 early_stop_val > best_valid) or
                (early_stop_criterion == 'loss. sup.' and
                 early_stop_val < best_valid))

        if improved:
            best_valid = early_stop_val
            patience = 0

            # Save stuff
            np.savez(
                os.path.join(save_path, 'model_feat_sel_best.npz'),
                *lasagne.layers.get_all_param_values(checkpoint_nets))
            # list() keeps the saved arrays identical on Python 3, where
            # zip() returns an iterator.
            np.savez(save_path + "/errors_supervised_best.npz",
                     list(zip(*train_monitored)),
                     list(zip(*valid_monitored)))

            # Monitor on the test set now because sometimes the saving doesn't
            # go well and there isn't a model to load at the end of training
            if y_test is not None:
                test_minibatches = mlh.iterate_minibatches(x_test,
                                                           y_test,
                                                           138,
                                                           shuffle=False)

                test_err = mlh.monitoring(test_minibatches, "test", val_fn,
                                          monitor_labels, prec_recall_cutoff)
        else:
            patience += 1
            # Save stuff
            np.savez(
                os.path.join(save_path, 'model_feat_sel_last.npz'),
                *lasagne.layers.get_all_param_values(checkpoint_nets))
            np.savez(save_path + "/errors_supervised_last.npz",
                     list(zip(*train_monitored)),
                     list(zip(*valid_monitored)))

        # End training
        if patience == max_patience or epoch == num_epochs - 1:
            print("Ending training")
            # Load best model
            with np.load(os.path.join(save_path,
                                      'model_feat_sel_best.npz')) as f:
                param_values = [f['arr_%d' % i] for i in range(len(f.files))]
            n_params = len(lasagne.layers.get_all_params(checkpoint_nets))
            lasagne.layers.set_all_param_values(
                checkpoint_nets, param_values[:n_params])
            if embedding_source is None:
                # Save embedding
                pred = pred_feat_emb()
                np.savez(os.path.join(save_path, 'feature_embedding.npz'),
                         pred)

            # Training set results
            train_minibatches = mlh.iterate_minibatches(x_train,
                                                        y_train,
                                                        batch_size,
                                                        shuffle=False)
            train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                       monitor_labels, prec_recall_cutoff)

            # Validation set results
            valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                        y_valid,
                                                        batch_size,
                                                        shuffle=False)
            valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                       monitor_labels, prec_recall_cutoff)

            # Test set results
            if y_test is not None:
                test_minibatches = mlh.iterate_minibatches(x_test,
                                                           y_test,
                                                           138,
                                                           shuffle=False)

                test_err = mlh.monitoring(test_minibatches, "test", val_fn,
                                          monitor_labels, prec_recall_cutoff)
                np.savez(os.path.join(save_path, 'final_errors.npz'), test_err)
            else:
                # Collect the predictions of every test batch; the list used
                # to be reset inside the loop, keeping only the last batch.
                test_predictions = []
                for minibatch in mlh.iterate_testbatches(x_test,
                                                         138,
                                                         shuffle=False):
                    test_predictions += [predict(minibatch)]
                # NOTE(review): np.savez converts the list into one array,
                # which presumes equally shaped batches - confirm against
                # mlh.iterate_testbatches.
                np.savez(os.path.join(save_path, 'test_predictions.npz'),
                         test_predictions)

            # Stop
            print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() -
                                                         start_time))
            break

        print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time))

        # Anneal the learning rate
        lr.set_value(float(lr.get_value() * learning_rate_annealing))

    # Print and save all final errors for train, validation and test
    print("Training time:\t\t\t{:.3f}s".format(time.time() - start_training))
    print("test_err:", test_err)

    # Copy files to loadpath
    if save_path != save_copy:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)
Beispiel #9
0
def execute(dataset,
            learning_rate=0.00001,
            learning_rate_annealing=1.0,
            alpha=0.,
            beta=1.,
            lmd=0.,
            encoder_units=[1024, 512, 256],
            num_epochs=500,
            which_fold=1,
            save_path=None,
            save_copy=None,
            dataset_path=None,
            num_fully_connected=0,
            exp_name='',
            init_args=None):
    """Train a dense encoder with optional decoder and predictor heads.

    The encoder is a stack of ReLU ``DenseLayer``s on top of the input. When
    ``alpha > 0`` a decoder is attached and scored with a binary
    cross-entropy reconstruction loss; when ``beta > 0`` a predictor with two
    sigmoid outputs is attached and scored with ``mh.dice_coef_loss``. The
    objective ``alpha * loss_reconst + beta * loss_pred + lmd * l2_penalty``
    is minimised with Adam, and every 2-D update is passed through
    ``lasagne.updates.norm_constraint`` with a maximum norm of 1.0.

    Early stopping watches the validation ``'loss'``: training ends after 100
    epochs without improvement or after ``num_epochs`` epochs. Parameters and
    monitored errors are written to ``save_path/<exp_name>`` and the folder
    is finally copied to ``save_copy/<exp_name>``.

    Args:
        dataset: Forwarded to ``mlh.load_data``.
        learning_rate: Initial learning rate (stored in a shared variable).
        learning_rate_annealing: Factor the learning rate is multiplied by
            after every epoch.
        alpha: Weight of the reconstruction loss; the decoder is only built
            when it is strictly positive.
        beta: Weight of the prediction loss; the predictor is only built
            when it is strictly positive.
        lmd: Weight of the L2 penalty over all trainable parameters.
        encoder_units: Sizes of the encoder layers. The decoder mirrors them
            (reversed, without the bottleneck). The list is never mutated.
        num_epochs: Maximum number of epochs.
        which_fold: Forwarded to ``mlh.load_data``.
        save_path: Root output directory. Must be given: the ``None``
            default makes ``os.path.join`` fail.
        save_copy: Root directory the results are copied to. Must be given
            for the same reason.
        dataset_path: Forwarded to ``mlh.load_data``.
        num_fully_connected: Number of extra hidden layers (each of size
            ``encoder_units[-1]``) placed before the predictor output.
        exp_name: Prefix of the generated experiment name.
        init_args: Mapping read for ``"decoder_init"`` (when ``alpha > 0``)
            and ``"predictor_init"`` (when ``beta > 0``). Must be given in
            those cases: the ``None`` default cannot be indexed.

    Note:
        A "best" snapshot is only written when an epoch after the first one
        improves the validation loss; otherwise a message is printed and the
        current parameters are kept for the final monitoring pass.
    """
    # Reading dataset
    print("Loading data")
    x_unsup = mlh.load_data(dataset,
                            dataset_path,
                            None,
                            which_fold=which_fold,
                            keep_labels=1.0,
                            missing_labels_val=-1.0,
                            embedding_input='bin',
                            transpose=True)

    x_train = x_unsup[0][0]
    x_valid = x_unsup[1][0]

    n_features = x_train.shape[1]

    # Encode the hyper-parameters in the experiment (and folder) name
    exp_name += "learn_gene_vector_h"
    for e in encoder_units:
        exp_name += ('-' + str(e))
    exp_name += '_a-' + str(alpha)
    exp_name += '_b-' + str(beta)
    exp_name += '_l-' + str(lmd)
    exp_name += '_lr-' + str(learning_rate)

    save_path = os.path.join(save_path, exp_name)
    save_copy = os.path.join(save_copy, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(save_copy):
        os.makedirs(save_copy)

    # Prepare Theano variables for inputs and targets
    input_var = T.matrix('input')
    target_var = T.matrix('target')
    target_reconst = T.matrix('target')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')
    batch_size = 128

    # Building the encoder
    encoder = InputLayer((batch_size, n_features), input_var)
    for n_units in encoder_units:
        encoder = DenseLayer(
            encoder,
            num_units=n_units,
            W=HeNormal('relu'),
            nonlinearity=rectify)

    embedding = lasagne.layers.get_output(encoder)

    params = lasagne.layers.get_all_params(encoder, trainable=True)
    monitor_labels = ["embedding min", "embedding max"]
    val_outputs = [embedding.min(), embedding.max()]
    nets = [encoder]

    if alpha > 0:
        # Decoder mirrors the encoder, skipping the bottleneck size
        decoder_units = encoder_units[::-1][1:]
        print(decoder_units)
        decoder = encoder
        for n_units in decoder_units:
            decoder = DenseLayer(decoder,
                                 num_units=n_units,
                                 W=HeNormal('relu'),
                                 nonlinearity=rectify)
        decoder = DenseLayer(decoder,
                             num_units=n_features,
                             W=convert_initialization(
                                 init_args["decoder_init"],
                                 nonlinearity="sigmoid"),
                             nonlinearity=sigmoid)
        prediction_reconst = lasagne.layers.get_output(decoder)

        # Reconstruction error
        loss_reconst = lasagne.objectives.binary_crossentropy(
            prediction_reconst, target_reconst).mean()

        accuracy = T.eq(T.gt(prediction_reconst, 0.5), target_reconst).mean()

        params += lasagne.layers.get_all_params(decoder, trainable=True)
        monitor_labels += ["reconst. loss", "reconst. accuracy"]
        val_outputs += [loss_reconst, accuracy]
        nets += [decoder]
    else:
        loss_reconst = 0

    if beta > 0:
        predictor_laysize = [encoder_units[-1]] * num_fully_connected
        predictor = encoder
        for n_units in predictor_laysize:
            predictor = DenseLayer(predictor,
                                   num_units=n_units,
                                   nonlinearity=rectify,
                                   W=convert_initialization(
                                       init_args["predictor_init"],
                                       nonlinearity="relu"))

        predictor = DenseLayer(predictor,
                               num_units=2,
                               nonlinearity=sigmoid,
                               W=convert_initialization(
                                   init_args["predictor_init"],
                                   nonlinearity="sigmoid"))

        prediction_var = lasagne.layers.get_output(predictor)

        # Prediction error
        loss_pred = mh.dice_coef_loss(target_var, prediction_var).mean()

        accuracy = T.eq(T.gt(prediction_var, 0.5), target_var).mean()

        params += lasagne.layers.get_all_params(predictor, trainable=True)
        monitor_labels += ["pred. loss", "pred. accuracy"]
        val_outputs += [loss_pred, accuracy]
        nets += [predictor]
    else:
        loss_pred = 0

    # Combine losses
    loss = alpha * loss_reconst + beta * loss_pred

    # applying weight decay
    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty

    val_outputs += [loss]
    monitor_labels += ['loss']

    # Some variables
    max_patience = 100
    patience = 0

    train_monitored = []
    valid_monitored = []
    train_loss = []

    updates = lasagne.updates.adam(loss, params, learning_rate=lr)

    # Apply norm constraints on the weight matrices. The keys are snapshotted
    # because the mapping's values are replaced while iterating.
    for k in list(updates.keys()):
        if updates[k].ndim == 2:
            updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

    inputs = [input_var, target_var, target_reconst]

    # Compile training function
    print("Compiling training function")
    train_fn = theano.function(inputs,
                               loss,
                               updates=updates,
                               on_unused_input='ignore')
    # The first monitored output is repeated at the front of the output list
    val_fn = theano.function(inputs, [val_outputs[0]] + val_outputs,
                             on_unused_input='ignore')

    if alpha > 0:
        pred_fn = theano.function([input_var], prediction_reconst)

    start_training = time.time()

    print("Starting training")
    for epoch in range(num_epochs):
        start_time = time.time()
        print("Epoch {} of {}".format(epoch + 1, num_epochs))
        nb_minibatches = 0
        loss_epoch = 0

        for x, y, target_reconst_val in data_generator(x_train,
                                                       batch_size,
                                                       shuffle=True):
            loss_epoch += train_fn(x, y, target_reconst_val)
            nb_minibatches += 1

        if alpha > 0:
            # Reconstruction statistics on the last training minibatch
            pr = pred_fn(x)
            print('min pr:' + str(pr.min()))
            print('max pr:' + str(pr.max()))
            print('mean pr:' + str(pr.mean()))

        loss_epoch /= nb_minibatches
        train_loss += [loss_epoch]

        # Monitoring on the training set
        train_minibatches = data_generator(x_train, batch_size)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, 0)
        train_monitored += [train_err]

        # Monitoring on the validation set
        valid_minibatches = data_generator(x_valid, batch_size)

        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, 0)
        valid_monitored += [valid_err]

        early_stop_criterion = 'loss'
        early_stop_val = valid_err[monitor_labels.index(early_stop_criterion)]

        # Early stopping
        if epoch == 0:
            best_valid = early_stop_val
        elif early_stop_val < best_valid and early_stop_criterion == 'loss':
            best_valid = early_stop_val
            patience = 0

            # Save stuff. zip() is materialised so that a real sequence, not
            # a lazy iterator, is handed to np.savez.
            np.savez(save_path + '/model_snp2vec_best.npz',
                     *lasagne.layers.get_all_param_values(nets))
            np.savez(save_path + "/errors_snp2vec_best.npz",
                     list(zip(*train_monitored)),
                     list(zip(*valid_monitored)))
        else:
            patience += 1
            np.savez(os.path.join(save_path, 'model_snp2vec_last.npz'),
                     *lasagne.layers.get_all_param_values(nets))
            np.savez(save_path + "/errors_snp2vec_last.npz",
                     list(zip(*train_monitored)),
                     list(zip(*valid_monitored)))

        # End training
        if (patience == max_patience) or (epoch == num_epochs - 1):
            print("Ending training")
            # Load best model
            if not os.path.exists(save_path + '/model_snp2vec_best.npz'):
                print(
                    "No saved model to be tested and/or generate"
                    " the embedding !")
            else:
                with np.load(save_path + '/model_snp2vec_best.npz') as f:
                    param_values = [
                        f['arr_%d' % i] for i in range(len(f.files))
                    ]
                    lasagne.layers.set_all_param_values(nets, param_values)

            # Training set results
            train_minibatches = data_generator(x_train, batch_size)
            train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                       monitor_labels, 0)

            # Validation set results
            valid_minibatches = data_generator(x_valid, batch_size)
            valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                       monitor_labels, 0)

            # Stop
            print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() -
                                                         start_time))
            break

        print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time))
        # Anneal the learning rate
        lr.set_value(float(lr.get_value() * learning_rate_annealing))

    # Copy files to loadpath
    if save_path != save_copy:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)
def execute(dataset,
            learning_rate=0.00001,
            alpha=0.,
            beta=1.,
            lmd=0.,
            encoder_units=[1024, 512, 256],
            num_epochs=500,
            which_fold=1,
            save_path=None,
            save_copy=None,
            dataset_path=None):
    """Train a dense encoder with optional decoder and predictor heads.

    The encoder is a stack of ReLU ``DenseLayer``s on top of the input. When
    ``alpha > 0`` a decoder with a sigmoid output is attached; when
    ``beta > 0`` a predictor (four hidden layers of size
    ``encoder_units[-1]`` followed by two sigmoid outputs) is attached. Both
    heads are scored with binary cross-entropy. The objective
    ``alpha * loss_reconst + beta * loss_pred + lmd * l2_penalty`` is
    minimised with RMSprop.

    Early stopping watches the validation ``'loss'``: training ends after 100
    epochs without improvement or after the last epoch. Parameters and
    monitored errors are written to ``save_path/<exp_name>`` and the folder
    is finally copied to ``save_copy/<exp_name>``.

    Args:
        dataset: Forwarded to ``mlh.load_data``.
        learning_rate: Learning rate (stored in a shared variable).
        alpha: Weight of the reconstruction loss; the decoder is only built
            when it is strictly positive.
        beta: Weight of the prediction loss; the predictor is only built
            when it is strictly positive.
        lmd: Only used in the experiment name; see the note below.
        encoder_units: Sizes of the encoder layers. The decoder mirrors them
            (reversed, without the bottleneck). The list is never mutated.
        num_epochs: Currently ignored; see the note below.
        which_fold: Forwarded to ``mlh.load_data``.
        save_path: Root output directory. Must be given: the ``None``
            default makes ``os.path.join`` fail.
        save_copy: Root directory the results are copied to. Must be given
            for the same reason.
        dataset_path: Forwarded to ``mlh.load_data``.

    Note:
        NOTE(review): ``lmd`` and ``num_epochs`` are overwritten with the
        hard-coded values 0.0001 and 200 after the experiment name has been
        built, so the caller's values never reach training and the folder
        name may report a different ``lmd`` than the one actually used.
        Kept as is to preserve existing results; confirm whether the
        arguments should be honoured instead.
    """
    # Reading dataset
    print("Loading data")
    x_unsup = mlh.load_data(dataset,
                            dataset_path,
                            None,
                            which_fold=which_fold,
                            keep_labels=1.0,
                            missing_labels_val=-1.0,
                            embedding_input='bin',
                            transpose=True)

    x_train = x_unsup[0][0]
    x_valid = x_unsup[1][0]

    n_features = x_train.shape[1]

    # Encode the hyper-parameters in the experiment (and folder) name
    exp_name = "learn_gene_vector_h"
    for e in encoder_units:
        exp_name += ('-' + str(e))
    exp_name += '_a-' + str(alpha)
    exp_name += '_b-' + str(beta)
    exp_name += '_l-' + str(lmd)
    exp_name += '_lr-' + str(learning_rate)

    save_path = os.path.join(save_path, exp_name)
    save_copy = os.path.join(save_copy, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(save_copy):
        os.makedirs(save_copy)

    # Prepare Theano variables for inputs and targets
    input_var = T.matrix('input')
    target_var = T.matrix('target')
    target_reconst = T.matrix('target')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')
    # NOTE(review): the two assignments below silently override the
    # function arguments of the same name.
    lmd = 0.0001  # weight decay coeff
    num_epochs = 200
    batch_size = 128

    # Building the encoder
    encoder = InputLayer((batch_size, n_features), input_var)
    for n_units in encoder_units:
        encoder = DenseLayer(encoder,
                             num_units=n_units,
                             nonlinearity=rectify)

    params = lasagne.layers.get_all_params(encoder, trainable=True)
    monitor_labels = []
    val_outputs = []
    nets = [encoder]

    if alpha > 0:
        # Decoder mirrors the encoder, skipping the bottleneck size
        decoder_units = encoder_units[::-1][1:]
        decoder = encoder
        for n_units in decoder_units:
            decoder = DenseLayer(decoder,
                                 num_units=n_units,
                                 nonlinearity=rectify)
        decoder = DenseLayer(decoder,
                             num_units=n_features,
                             nonlinearity=sigmoid)
        prediction_reconst = lasagne.layers.get_output(decoder)

        # Reconstruction error
        loss_reconst = lasagne.objectives.binary_crossentropy(
            prediction_reconst, target_reconst).mean()

        params += lasagne.layers.get_all_params(decoder, trainable=True)
        monitor_labels += ["reconst."]
        val_outputs += [loss_reconst]
        nets += [decoder]
    else:
        loss_reconst = 0

    if beta > 0:
        predictor_laysize = [encoder_units[-1]] * 4
        predictor = encoder
        for n_units in predictor_laysize:
            predictor = DenseLayer(predictor,
                                   num_units=n_units,
                                   nonlinearity=rectify)

        predictor = DenseLayer(predictor, num_units=2, nonlinearity=sigmoid)

        prediction_var = lasagne.layers.get_output(predictor)

        # Prediction error
        loss_pred = lasagne.objectives.binary_crossentropy(
            prediction_var, target_var).mean()

        params += lasagne.layers.get_all_params(predictor, trainable=True)
        monitor_labels += ["pred."]
        val_outputs += [loss_pred]
        nets += [predictor]
    else:
        loss_pred = 0

    # Combine losses
    loss = alpha * loss_reconst + beta * loss_pred

    # applying weight decay
    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty

    val_outputs += [loss]
    monitor_labels += ['loss']

    # Some variables
    max_patience = 100
    patience = 0

    train_monitored = []
    valid_monitored = []
    train_loss = []

    updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr)

    inputs = [input_var, target_var, target_reconst]

    # Compile training function
    print("Compiling training function")
    train_fn = theano.function(inputs,
                               loss,
                               updates=updates,
                               on_unused_input='ignore')
    # The first monitored output is repeated at the front of the output list
    val_fn = theano.function(inputs, [val_outputs[0]] + val_outputs,
                             on_unused_input='ignore')
    start_training = time.time()
    print("training start time: {}".format(start_training))

    print("Starting training")
    for epoch in range(num_epochs):
        start_time = time.time()
        print("Epoch {} of {}".format(epoch + 1, num_epochs))
        nb_minibatches = 0
        loss_epoch = 0

        for x, y, target_reconst_val in data_generator(x_train, batch_size):
            loss_epoch += train_fn(x, y, target_reconst_val)
            nb_minibatches += 1

        loss_epoch /= nb_minibatches
        train_loss += [loss_epoch]

        # Monitoring on the training set
        train_minibatches = data_generator(x_train, batch_size)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, 0)
        train_monitored += [train_err]

        # Monitoring on the validation set
        valid_minibatches = data_generator(x_valid, batch_size)

        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, 0)
        valid_monitored += [valid_err]

        early_stop_criterion = 'loss'
        early_stop_val = valid_err[monitor_labels.index(early_stop_criterion)]

        # Early stopping
        if epoch == 0:
            best_valid = early_stop_val
        elif early_stop_val < best_valid and early_stop_criterion == 'loss':
            best_valid = early_stop_val
            patience = 0

            # Save stuff. zip() is materialised so that a real sequence, not
            # a lazy iterator, is handed to np.savez.
            np.savez(save_path + '/model_snp2vec_best.npz',
                     *lasagne.layers.get_all_param_values(nets))
            np.savez(save_path + "/errors_snp2vec_best.npz",
                     list(zip(*train_monitored)),
                     list(zip(*valid_monitored)))
        else:
            patience += 1
            np.savez(os.path.join(save_path, 'model_snp2vec_last.npz'),
                     *lasagne.layers.get_all_param_values(nets))
            np.savez(save_path + "/errors_snp2vec_last.npz",
                     list(zip(*train_monitored)),
                     list(zip(*valid_monitored)))

        # End training
        if (patience == max_patience) or (epoch == num_epochs - 1):
            print("Ending training")
            # Load best model
            if not os.path.exists(save_path + '/model_snp2vec_best.npz'):
                print(
                    "No saved model to be tested and/or generate"
                    " the embedding !")
            else:
                with np.load(save_path + '/model_snp2vec_best.npz') as f:
                    param_values = [
                        f['arr_%d' % i] for i in range(len(f.files))
                    ]
                    lasagne.layers.set_all_param_values(nets, param_values)

            # Training set results
            train_minibatches = data_generator(x_train, batch_size)
            train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                       monitor_labels, 0)

            # Validation set results
            valid_minibatches = data_generator(x_valid, batch_size)
            valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                       monitor_labels, 0)

            # Stop
            print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() -
                                                         start_time))
            break

        print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time))

    # Copy files to loadpath
    if save_path != save_copy:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)
def execute(dataset,
            n_hidden_t_enc,
            n_hidden_s,
            num_epochs=500,
            learning_rate=.001,
            learning_rate_annealing=1.0,
            gamma=1,
            lmd=0.,
            disc_nonlinearity="sigmoid",
            keep_labels=1.0,
            prec_recall_cutoff=True,
            missing_labels_val=-1.0,
            which_fold=1,
            early_stop_criterion='loss',
            save_path='/Tmp/romerosa/DietNetworks/',
            save_copy='/Tmp/romerosa/DietNetworks/',
            dataset_path='/Tmp/carriepl/datasets/',
            resume=False):
    """Train a supervised dense network with an optional reconstruction head.

    The model is one ReLU ``DenseLayer`` of ``n_hidden_t_enc[-1]`` units,
    followed by one dropout + dense layer per entry of ``n_hidden_s`` and a
    final dropout + dense output of ``n_targets`` units. When ``gamma > 0`` a
    linear layer reconstructing the input is attached to the first hidden
    layer. The objective ``sup_loss + gamma * reconst_loss + lmd * l2`` is
    minimised with RMSprop, and every 2-D update is passed through
    ``lasagne.updates.norm_constraint`` with a maximum norm of 1.0.

    Results are written to ``save_path/<dataset>/<exp_name>`` and finally
    copied to ``save_copy/<dataset>/<exp_name>``.

    Args:
        dataset: Dataset name; forwarded to ``mlh.load_data`` and used as a
            sub-folder of the output paths.
        n_hidden_t_enc: Sequence whose last element is the size of the first
            hidden layer.
        n_hidden_s: Sizes of the supervised hidden layers.
        num_epochs: Maximum number of epochs.
        learning_rate: Initial learning rate (stored in a shared variable).
        learning_rate_annealing: Factor the learning rate is multiplied by
            after every epoch.
        gamma: Weight of the reconstruction loss; the reconstruction layer
            is only built when it is strictly positive.
        lmd: Weight of the L2 penalty over all trainable parameters.
        disc_nonlinearity: Name of the output nonlinearity; one of
            ``"sigmoid"``, ``"linear"``, ``"rectify"``, ``"softmax"``.
        keep_labels: Forwarded to ``mlh.load_data`` and
            ``mh.define_sup_loss``.
        prec_recall_cutoff: Forwarded to ``mlh.monitoring``.
        missing_labels_val: Forwarded to ``mlh.load_data`` and
            ``mh.define_sup_loss``.
        which_fold: Forwarded to ``mlh.load_data``.
        early_stop_criterion: Label of the monitored validation value that
            drives early stopping (see the note below).
        save_path: Root output directory.
        save_copy: Root directory the results are copied to.
        dataset_path: Forwarded to ``mlh.load_data``.
        resume: Unused.

    Raises:
        ValueError: If ``early_stop_criterion`` is not one of the monitored
            labels.

    Note:
        NOTE(review): the monitored labels are ``"reconst. loss"`` (when
        present), ``"loss. sup."``, ``"total loss"`` and ``"accuracy"``. The
        default ``'loss'`` is none of them, so calling with the default
        raises ``ValueError`` after the first training epoch. Only
        ``'accuracy'`` and ``'loss. sup.'`` ever register an improvement and
        trigger a "best" snapshot.
    """
    # Load the dataset
    print("Loading data")
    x_train, y_train, x_valid, y_valid, x_test, y_test, \
        x_unsup, training_labels = mlh.load_data(
            dataset, dataset_path, None,
            which_fold=which_fold, keep_labels=keep_labels,
            missing_labels_val=missing_labels_val,
            embedding_input='raw')

    # Extract required information from data
    n_samples, n_feats = x_train.shape
    print("Number of features : ", n_feats)
    print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1]))
    n_targets = y_train.shape[1]

    # Set some variables
    batch_size = 128

    # Preparing folder to save stuff
    exp_name = 'basic_' + mlh.define_exp_name(
        keep_labels, 0, 0, gamma, lmd, [], n_hidden_t_enc, [], n_hidden_s,
        which_fold, learning_rate, 0, 0, early_stop_criterion,
        learning_rate_annealing)
    print("Experiment: " + exp_name)
    save_path = os.path.join(save_path, dataset, exp_name)
    save_copy = os.path.join(save_copy, dataset, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # Prepare Theano variables for inputs and targets
    input_var_sup = T.matrix('input_sup')
    target_var_sup = T.matrix('target_sup')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')

    # Build model
    print("Building model")
    discrim_net = InputLayer((None, n_feats), input_var_sup)
    discrim_net = DenseLayer(discrim_net,
                             num_units=n_hidden_t_enc[-1],
                             nonlinearity=rectify)

    # Reconstruct the input from the first hidden layer
    if gamma > 0:
        reconst_net = DenseLayer(discrim_net,
                                 num_units=n_feats,
                                 nonlinearity=linear)
        nets = [reconst_net]
    else:
        nets = [None]

    # Add supervised hidden layers
    for hid in n_hidden_s:
        discrim_net = DropoutLayer(discrim_net)
        discrim_net = DenseLayer(discrim_net, num_units=hid)

    # NOTE(review): eval() resolves the nonlinearity by name; the assert is
    # the only whitelist and is stripped when running with -O.
    assert disc_nonlinearity in ["sigmoid", "linear", "rectify", "softmax"]
    discrim_net = DropoutLayer(discrim_net)
    discrim_net = DenseLayer(discrim_net,
                             num_units=n_targets,
                             nonlinearity=eval(disc_nonlinearity))

    # Networks whose parameters are saved and restored: the reconstruction
    # head (when it exists) followed by the discriminative network.
    saved_nets = [net for net in nets if net is not None] + [discrim_net]

    print("Building and compiling training functions")

    # Build and compile training functions
    predictions, predictions_det = mh.define_predictions(nets, start=0)
    prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net])
    prediction_sup = prediction_sup[0]
    prediction_sup_det = prediction_sup_det[0]

    # Define losses
    # reconstruction losses
    reconst_losses, reconst_losses_det = mh.define_reconst_losses(
        predictions, predictions_det, [input_var_sup])
    # supervised loss
    sup_loss, sup_loss_det = mh.define_sup_loss(disc_nonlinearity,
                                                prediction_sup,
                                                prediction_sup_det,
                                                keep_labels, target_var_sup,
                                                missing_labels_val)

    inputs = [input_var_sup, target_var_sup]
    params = lasagne.layers.get_all_params([discrim_net] + nets,
                                           trainable=True)

    print('Number of params: ' + str(len(params)))

    # Combine losses
    loss = sup_loss + gamma * reconst_losses[0]
    loss_det = sup_loss_det + gamma * reconst_losses_det[0]

    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty
    loss_det = loss_det + lmd * l2_penalty

    # Compute network updates
    updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr)

    # Apply norm constraints on the weights. The keys are snapshotted
    # because the mapping's values are replaced while iterating.
    for k in list(updates.keys()):
        if updates[k].ndim == 2:
            updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

    # Compile training function
    train_fn = theano.function(inputs,
                               loss,
                               updates=updates,
                               on_unused_input='ignore')

    # Monitoring Labels (the reconstruction entry is dropped when its loss
    # compares equal to 0)
    monitor_labels = ["reconst. loss"]
    monitor_labels = [
        i for i, j in zip(monitor_labels, reconst_losses) if j != 0
    ]
    monitor_labels += ["loss. sup.", "total loss"]

    # Build and compile test function
    val_outputs = reconst_losses_det
    val_outputs = [i for i, j in zip(val_outputs, reconst_losses) if j != 0]
    val_outputs += [sup_loss_det, loss_det]

    # Compute accuracy and add it to monitoring list
    test_acc, test_pred = mh.define_test_functions(disc_nonlinearity,
                                                   prediction_sup,
                                                   prediction_sup_det,
                                                   target_var_sup)
    monitor_labels.append("accuracy")
    val_outputs.append(test_acc)

    # Compile prediction function
    predict = theano.function([input_var_sup], test_pred)

    # Compile validation function
    val_fn = theano.function(inputs, [prediction_sup_det] + val_outputs,
                             on_unused_input='ignore')

    # Finally, launch the training loop.
    print("Starting training...")

    # Some variables
    max_patience = 100
    patience = 0

    train_monitored = []
    valid_monitored = []
    train_loss = []

    # Pre-training monitoring
    print("Epoch 0 of {}".format(num_epochs))

    train_minibatches = mlh.iterate_minibatches(x_train,
                                                y_train,
                                                batch_size,
                                                shuffle=False)
    train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                               monitor_labels, prec_recall_cutoff)

    valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                y_valid,
                                                batch_size,
                                                shuffle=False)
    valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                               monitor_labels, prec_recall_cutoff)

    # Training loop
    start_training = time.time()
    for epoch in range(num_epochs):
        start_time = time.time()
        print("Epoch {} of {}".format(epoch + 1, num_epochs))
        nb_minibatches = 0
        loss_epoch = 0

        # Train pass
        for batch in mlh.iterate_minibatches(x_train,
                                             training_labels,
                                             batch_size,
                                             shuffle=True):
            loss_epoch += train_fn(*batch)
            nb_minibatches += 1

        loss_epoch /= nb_minibatches
        train_loss += [loss_epoch]

        # Monitoring on the training set
        train_minibatches = mlh.iterate_minibatches(x_train,
                                                    y_train,
                                                    batch_size,
                                                    shuffle=False)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, prec_recall_cutoff)
        train_monitored += [train_err]

        # Monitoring on the validation set
        valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                    y_valid,
                                                    batch_size,
                                                    shuffle=False)

        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, prec_recall_cutoff)
        valid_monitored += [valid_err]

        # list.index raises ValueError for an unknown label; only that case
        # is translated, so unrelated errors are not masked.
        try:
            early_stop_val = valid_err[monitor_labels.index(
                early_stop_criterion)]
        except ValueError:
            raise ValueError("There is no monitored value by the name of %s" %
                             early_stop_criterion)

        # Early stopping
        if epoch == 0:
            best_valid = early_stop_val
        elif (early_stop_val > best_valid and early_stop_criterion == 'accuracy') or \
             (early_stop_val < best_valid and early_stop_criterion ==
              'loss. sup.'):
            best_valid = early_stop_val
            patience = 0

            # Save stuff. zip() is materialised so that a real sequence, not
            # a lazy iterator, is handed to np.savez.
            np.savez(
                os.path.join(save_path, 'model_best.npz'),
                *lasagne.layers.get_all_param_values(saved_nets))
            np.savez(save_path + "/errors_supervised_best.npz",
                     list(zip(*train_monitored)),
                     list(zip(*valid_monitored)))
        else:
            patience += 1
            np.savez(
                os.path.join(save_path, 'model_last.npz'),
                *lasagne.layers.get_all_param_values(saved_nets))
            np.savez(save_path + "/errors_supervised_last.npz",
                     list(zip(*train_monitored)),
                     list(zip(*valid_monitored)))

        # End training
        if patience == max_patience or epoch == num_epochs - 1:
            print("Ending training")
            # Load best model
            if not os.path.exists(save_path + '/model_best.npz'):
                print("No saved model to be tested and/or generate"
                      " the embedding !")
            else:
                with np.load(save_path + '/model_best.npz', ) as f:
                    param_values = [
                        f['arr_%d' % i] for i in range(len(f.files))
                    ]
                    lasagne.layers.set_all_param_values(
                        saved_nets, param_values)

            # Training set results
            train_minibatches = mlh.iterate_minibatches(x_train,
                                                        y_train,
                                                        batch_size,
                                                        shuffle=False)
            train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                       monitor_labels, prec_recall_cutoff)

            # Validation set results
            valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                        y_valid,
                                                        batch_size,
                                                        shuffle=False)
            valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                       monitor_labels, prec_recall_cutoff)

            # Test set results
            if y_test is not None:
                test_minibatches = mlh.iterate_minibatches(x_test,
                                                           y_test,
                                                           batch_size,
                                                           shuffle=False)

                test_err = mlh.monitoring(test_minibatches, "test", val_fn,
                                          monitor_labels, prec_recall_cutoff)
            else:
                # Collect the predictions of every test minibatch. The list
                # is created once, before the loop, so earlier batches are
                # kept and it exists even when there are no batches.
                test_predictions = []
                for minibatch in mlh.iterate_testbatches(x_test,
                                                         batch_size,
                                                         shuffle=False):
                    test_predictions += [predict(minibatch)]
                np.savez(os.path.join(save_path, 'test_predictions.npz'),
                         test_predictions)

            # Stop
            print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() -
                                                         start_time))
            break

        print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time))

        # Anneal the learning rate
        lr.set_value(float(lr.get_value() * learning_rate_annealing))

    # Print all final errors for train, validation and test
    print("Training time:\t\t\t{:.3f}s".format(time.time() - start_training))

    # Copy files to loadpath
    if save_path != save_copy:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)