Example 1
def main(argv=None):

    opt = parse_args(argv)
    # Set the seed for reproducibility.
    seed = opt.seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)

    exp_dir = opt.load_folder
    if exp_dir is None:  # we create a new folder if we don't load.
        exp_dir = monitoring.create_experiment_folder(opt)

    # creating the dataset
    print("Getting the dataset...")
    dataset = datasets.get_dataset(opt, exp_dir)

    # Creating a model
    print("Getting the model...")

    my_model, optimizer, epoch, opt = monitoring.load_checkpoint(
        exp_dir, opt, dataset.dataset.input_size(),
        dataset.dataset.additional_info())

    # Training optimizer and stuff
    criterion = torch.nn.MSELoss()

    if not opt.cpu:
        print("Putting the model on gpu...")
        my_model.cuda(opt.gpu_selection)

    # The training.
    print("Start training.")
    #monitoring and predictions
    predictions = np.zeros(
        (dataset.dataset.nb_patient, dataset.dataset.nb_gene))
    indices_patients = np.arange(dataset.dataset.nb_patient)
    indices_genes = np.arange(dataset.dataset.nb_gene)
    xdata = np.transpose([
        np.tile(indices_genes, len(indices_patients)),
        np.repeat(indices_patients, len(indices_genes))
    ])
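    # xdata enumerates every (gene, patient) index pair: np.tile cycles through
    # all gene indices for each patient while np.repeat holds each patient
    # index fixed, giving the full prediction grid. (Unused in this example;
    # Example 9 pushes the same grid through the model in chunks.)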
    progress_bar_modulo = max(1, len(dataset) // 100)

    monitoring_dic = {}
    monitoring_dic['train_loss'] = []

    for t in range(epoch, opt.epoch):

        start_timer = time.time()

        thisepoch_trainloss = []

        with tqdm(dataset, unit="batch") as tepoch:
            for mini in tepoch:
                tepoch.set_description(f"Epoch {t}")

                inputs, targets = mini[0], mini[1]

                inputs = Variable(inputs, requires_grad=False).float()
                targets = Variable(targets, requires_grad=False).float()

                if not opt.cpu:
                    inputs = inputs.cuda(opt.gpu_selection)
                    targets = targets.cuda(opt.gpu_selection)

                # Forward pass: Compute predicted y by passing x to the model
                y_pred = my_model(inputs).float()
                y_pred = y_pred.squeeze()

                targets = torch.reshape(targets, (targets.shape[0], ))
                # Compute and print loss

                loss = criterion(y_pred, targets)
                to_list = loss.cpu().data.numpy().reshape((1, ))[0]
                thisepoch_trainloss.append(to_list)
                tepoch.set_postfix(loss=loss.item())

                # NOTE: these two files depend only on t, so every batch in an
                # epoch rewrites the same per-epoch embedding snapshots.
                np.save(os.path.join(exp_dir, 'pixel_epoch_{}'.format(t)),
                        my_model.emb_1.weight.cpu().data.numpy())
                np.save(os.path.join(exp_dir, 'digit_epoch_{}'.format(t)),
                        my_model.emb_2.weight.cpu().data.numpy())

                # Zero gradients, perform a backward pass, and update the weights.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        monitoring.save_checkpoint(my_model, optimizer, t, opt, exp_dir)
        monitoring_dic['train_loss'].append(np.mean(thisepoch_trainloss))
        np.save(f'{exp_dir}/train_loss.npy', monitoring_dic['train_loss'])
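Every example on this page opens with the same three torch seeding calls. A minimal helper that consolidates them is sketched below; note that the random/NumPy lines are additions beyond what these examples actually seed:

import random

import numpy as np
import torch


def set_seed(seed):
    """Seed the RNGs the training scripts above rely on."""
    random.seed(seed)      # addition: the examples seed only torch
    np.random.seed(seed)   # addition: the examples seed only torch
    torch.manual_seed(seed)
    # Subsumes the per-device torch.cuda.manual_seed call; harmless without CUDA.
    torch.cuda.manual_seed_all(seed)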
Example 2
def main(argv=None):

    opt = parse_args(argv)
    # Set the seed for reproducibility.
    seed = opt.seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)

    exp_dir = opt.load_folder
    if exp_dir is None:  # we create a new folder if we don't load.
        exp_dir = monitoring.create_experiment_folder(opt)

    # creating the dataset
    print "Getting the dataset..."
    dataset = datasets.get_dataset(opt)

    # Creating a model
    print "Getting the model..."
    my_model, optimizer, epoch, opt = monitoring.load_checkpoint(
        exp_dir, opt, dataset.dataset.input_size())

    # Training optimizer and stuff
    criterion = torch.nn.MSELoss()

    if not opt.cpu:
        print "Putting the model on gpu..."
        my_model.cuda()

    # The training.
    print "Start training."
    progress_bar_modulo = len(dataset) / 100

    for t in range(epoch, opt.epoch):

        start_timer = time.time()

        if opt.save_error:
            outfname_g = '_'.join(['gene_epoch', str(t), 'prediction.npy'])
            outfname_g = os.path.join(exp_dir, outfname_g)
            outfname_t = '_'.join(['tissue_epoch', str(t), 'prediction.npy'])
            outfname_t = os.path.join(exp_dir, outfname_t)
            train_trace = np.zeros(
                (dataset.dataset.nb_gene, dataset.dataset.nb_patient))

        for no_b, mini in enumerate(dataset):

            inputs, targets = mini[0], mini[1]

            inputs = Variable(inputs, requires_grad=False).float()
            targets = Variable(targets, requires_grad=False).float()

            if not opt.cpu:
                inputs = inputs.cuda()
                targets = targets.cuda()

            # Forward pass: Compute predicted y by passing x to the model
            y_pred = my_model(inputs).float()

            if opt.save_error:
                # Log the predicted values per sample and per gene (S.L. validation)
                batch_inputs = mini[0].numpy()
                predicted_values = y_pred.data.cpu().numpy()
                train_trace[batch_inputs[:, 0],
                            batch_inputs[:, 1]] = predicted_values[:, 0]

            # Compute and print loss
            loss = criterion(y_pred, targets)
            # TODO: the logging here.
            if no_b % progress_bar_modulo == 0:
                print "Doing epoch {}, examples {}/{}. Loss: {}".format(
                    t, no_b, len(dataset), loss.data[0])

                # Saving the emb
                monitoring.save_everything(exp_dir, t, my_model,
                                           dataset.dataset)

            # Zero gradients, perform a backward pass, and update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if opt.save_error:
            monitoring.dump_error_by_tissue(train_trace, dataset.dataset.data,
                                            outfname_t, exp_dir,
                                            dataset.dataset.data_type,
                                            dataset.dataset.nb_patient)
            monitoring.dump_error_by_gene(train_trace, dataset.dataset.data,
                                          outfname_g, exp_dir)

        print "Saving the model..."
        monitoring.save_checkpoint(my_model, optimizer, t, opt, exp_dir)

    print "Done!"
Example 3
def main(argv=None):

    opt = parse_args(argv)
    # Set the seed for reproducibility.
    seed = opt.seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)
    if opt.cache == 0:
        opt.cache = random.getrandbits(128)

    exp_dir = opt.load_folder
    if exp_dir is None:  # we create a new folder if we don't load.
        exp_dir = monitoring.create_experiment_folder(opt)

    if opt.model == 'RNN':
        print('This model is deprecated - please use TCRonly from now on')
    # creating the dataset
    print("Getting the dataset...")
    if 'cached_dataset' not in os.listdir('.'):
        os.mkdir('cached_dataset')

    opt.dataset = 'binary_thome'
    tenth = opt.tenth
    #dataset = datasets.get_dataset(opt,exp_dir,tenth=opt.tenth)
    dataset = old_datasets.get_dataset(opt, exp_dir, test=True)

    # Creating a model
    print("Getting the model...")
    my_model, optimizer, epoch, opt = monitoring.load_checkpoint(
        exp_dir,
        opt,
        dataset.dataset.input_size(),
    )

    criterion = torch.nn.MSELoss()
    # Training optimizer and stuff
    if opt.loss == 'NLL' or opt.model == 'allseq_bin':
        # The NLLLoss assignment in the original is immediately overridden;
        # BCELoss is what actually gets used.
        criterion = torch.nn.BCELoss()

    if 'tcr_embs' not in os.listdir(exp_dir):
        if opt.model == 'TCRonly':
            os.mkdir(f'{exp_dir}/tcr_embs/')
        elif opt.model == 'allseq' or opt.model == 'allseq_bin':
            os.mkdir(f'{exp_dir}/tcr_embs/')
            os.mkdir(f'{exp_dir}/hla_embs/')

    if not opt.cpu:
        print("Putting the model on gpu...")
        my_model.cuda(opt.gpu_selection)

    loss_dict = {}
    loss_dict['train_losses'] = []

    def estimate_batch_accuracy(y, yhat):
        return np.sum([i == j for i, j in zip(y, yhat)]) / y.shape[0]

    if opt.model == 'allseq' or opt.model == 'allseq_bin':
        valid_list = np.load(
            '/u/trofimov/Emerson/processed_data/valid_list.npy')
        loss_dict['valid_losses'] = []

    # The training.
    print("Getting the likelihood")
    os.mkdir(f'{exp_dir}/Thome_tenth{tenth}_preds_100/')
    #monitoring and predictions
    for t in range(1):
        loss_dict = monitoring.update_loss_dict(loss_dict, start=True)
        if opt.model == 'allseq_bin':
            good = 0

        for no_b, mini in enumerate(dataset):

            if opt.model == 'TCRonly':

                y_pred, my_model, targets = training.TCRonly_batch(
                    mini, opt, my_model)
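                # NOTE: only f'{exp_dir}/Thome_tenth{tenth}_preds_100/' is
                # created above; this preds_100 directory must already exist.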
                np.save(f'{exp_dir}/preds_100/likelihood_batch{no_b}.npy',
                        y_pred.data.cpu().numpy())

                if no_b % 5 == 0:
                    print(f"Doing epoch{t},examples{no_b}/{len(dataset)}")

                # Saving the emb

            elif opt.model == 'allseq':

                inputs_k, inputs_h1, inputs_h2, inputs_h3, inputs_h4, targets = training.allseq_batch(
                    mini, opt)
                y_pred = my_model(inputs_k, inputs_h1, inputs_h2, inputs_h3,
                                  inputs_h4).float()

                np.save(f'{exp_dir}/preds_100/likelihood_batch{no_b}.npy',
                        y_pred.data.cpu().numpy())
                batch_number = dataset.dataset.data[no_b]
                bn = batch_number[0]
                np.save(f'{exp_dir}/preds_100/likelihood_batch{bn}.npy',
                        y_pred.data.cpu().numpy())

                if no_b % 5 == 0:
                    print(f"Doing epoch {t},examples{no_b}/{len(dataset)}")

            elif opt.model == 'allseq_bin':

                inputs_k, inputs_h1, inputs_h2, inputs_h3, inputs_h4, targets = training.binallseq_batch(
                    mini, opt)
                y_pred = my_model(inputs_k, inputs_h1, inputs_h2, inputs_h3,
                                  inputs_h4).float()
                batch_number = dataset.dataset.data[no_b]
                bn = batch_number[0]
                np.save(
                    f'{exp_dir}/Thome_tenth{tenth}_preds_100/likelihood_batch{bn}.npy',
                    y_pred.data.cpu().numpy())
                if no_b % 5 == 0:
                    print(f"Doing epoch {t},examples{no_b}/{len(dataset)}")
Example 4
def main(argv=None):
    """Main."""
    opt = parse_args(argv)
    seed = opt.seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)

    exp_dir = opt.load_folder
    if exp_dir is None:  # we create a new folder if we don't load.
        exp_dir = monitoring.create_experiment_folder(opt)

    # creating the dataset
    print("Getting the dataset...")
    dataset = datasets.get_dataset(opt, exp_dir)

    # Creating a model
    print("Getting the model...")

    my_model, optimizer, epoch, opt = monitoring.load_checkpoint(exp_dir, opt)

    # Training optimizer and stuff
    # The NLLLoss assignment in the original is immediately overridden;
    # CrossEntropyLoss is what actually gets used.
    criterion = nn.CrossEntropyLoss()

    if not opt.cpu:
        print("Putting the model on gpu...")
        my_model.cuda(opt.gpu_selection)

    # The training.
    print("Start training.")

    monitoring_dic = {}
    monitoring_dic['train_loss'] = []
    monitoring_dic['valid_loss'] = []
    monitoring_dic['train_accuracy'] = []
    monitoring_dic['valid_accuracy'] = []
    max_accuracy = 0
    min_loss = float('inf')  # effectively "no best validation loss yet"
    patience = 5

    for t in range(epoch, opt.epoch):
        if patience == 0:
            break
        thisepoch_trainloss = []
        thisepoch_trainaccuracy = []

        with tqdm(dataset, unit="batch") as tepoch:
            for mini in tepoch:

                tepoch.set_description(f"Epoch {t}")
                inputs, targets = mini[0], mini[1]

                inputs = Variable(inputs, requires_grad=False).long()
                targets = Variable(targets, requires_grad=False).long()

                if not opt.cpu:
                    inputs = inputs.cuda(opt.gpu_selection)
                    targets = targets.cuda(opt.gpu_selection)

                optimizer.zero_grad()
                y_pred = my_model(inputs).float()
                #y_pred = torch.reshape(y_pred, (y_pred.shape[0], ))

                #targets = torch.reshape(targets, (targets.shape[1], 1))

                loss = criterion(y_pred, targets)
                to_list = loss.cpu().data.numpy().reshape((1, ))[0]
                thisepoch_trainloss.append(to_list)

                loss.backward()
                optimizer.step()
                accuracy = np.sum(
                    np.argmax(y_pred.cpu().data.numpy(), axis=1) ==
                    targets.cpu().data.numpy()) / targets.shape[0]
                thisepoch_trainaccuracy.append(accuracy)
                tepoch.set_postfix(loss=loss.item(), accuracy=accuracy)

        inputs = dataset.dataset.valid_inputs
        inputs = torch.FloatTensor(inputs)
        inputs = Variable(inputs, requires_grad=False).long()

        targets = dataset.dataset.valid_targets
        targets = torch.FloatTensor(targets)
        targets = Variable(targets, requires_grad=False).long()

        if not opt.cpu:
            inputs = inputs.cuda(opt.gpu_selection)
            targets = targets.cuda(opt.gpu_selection)

        vlosses = []
        accuracies = []
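        # 'split' is the fraction of positive labels in the validation set
        # (assumes binary 0/1 targets); printed below as a base-rate reference.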
        split = np.sum(targets.cpu().data.numpy()) / targets.shape[0]

        for i in tqdm(range(0, inputs.shape[0], 100)):
            with torch.no_grad():
                y_pred = my_model(inputs[i:i + 100])
                vloss = criterion(y_pred, targets[i:i + 100])

            accuracy = np.sum(
                np.argmax(y_pred.cpu().data.numpy(), axis=1) ==
                targets[i:i + 100].cpu().data.numpy()) / y_pred.shape[0]

            vloss = vloss.cpu().data.numpy().reshape((1, ))[0]
            vlosses.append(vloss)
            accuracies.append(accuracy)

        vloss = np.mean(vlosses)
        accuracy = np.mean(accuracies)

        if vloss < min_loss:
            monitoring.save_checkpoint(my_model, optimizer, t, opt, exp_dir)
            print(
                f'*** new min loss *** Validation loss: epoch {t}, loss: {vloss}, accuracy: {accuracy}, split: {split}'
            )
            min_loss = vloss
            patience = 5
        else:
            print(
                f'Validation loss: epoch {t}, loss: {vloss}, accuracy: {accuracy}, split: {split}'
            )
            patience -= 1

        monitoring_dic['valid_loss'].append(vloss)
        monitoring_dic['valid_accuracy'].append(accuracy)
        monitoring_dic['train_loss'].append(np.mean(thisepoch_trainloss))
        monitoring_dic['train_accuracy'].append(
            np.mean(thisepoch_trainaccuracy))
        trainacc = np.mean(thisepoch_trainaccuracy)
        print(f'Training accuracy: {trainacc}')
        np.save(f'{exp_dir}/train_loss.npy', monitoring_dic['train_loss'])
        np.save(f'{exp_dir}/valid_loss.npy', monitoring_dic['valid_loss'])
        np.save(f'{exp_dir}/train_accuracy.npy',
                monitoring_dic['train_accuracy'])
        np.save(f'{exp_dir}/valid_accuracy.npy',
                monitoring_dic['valid_accuracy'])
    print('Done training!')
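The most reusable piece of Example 4 is its validation pass: slices of 100 under torch.no_grad(), a tracked minimum loss, and a patience counter. A standalone sketch of just the evaluation step (the helper name and signature are ours, not the repository's):

import numpy as np
import torch


def evaluate_in_chunks(model, criterion, inputs, targets, chunk=100):
    """Mean loss and accuracy over a validation set, evaluated chunk by chunk."""
    losses, accuracies = [], []
    with torch.no_grad():
        for i in range(0, inputs.shape[0], chunk):
            y_pred = model(inputs[i:i + chunk])
            losses.append(criterion(y_pred, targets[i:i + chunk]).item())
            correct = (y_pred.argmax(dim=1) == targets[i:i + chunk]).float()
            accuracies.append(correct.mean().item())
    return float(np.mean(losses)), float(np.mean(accuracies))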
Example 5
def main(argv=None):

    opt = parse_args(argv)
    # Set the seed for reproducibility.
    seed = opt.seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)
    if opt.cache == 0:
        opt.cache = random.getrandbits(128)

    exp_dir = opt.load_folder
    if exp_dir is None: # we create a new folder if we don't load.
        exp_dir = monitoring.create_experiment_folder(opt)

    if opt.model == 'RNN':
        print('This model is deprecated - please use TCRonly from now on')
    # creating the dataset
    print ("Getting the dataset...")
    if not 'cached_dataset' in os.listdir('.'):
        os.mkdir('cached_dataset')

    dataset = datasets.get_dataset(opt, exp_dir)

    # Creating a model
    print ("Getting the model...")
    my_model, optimizer, epoch, opt = monitoring.load_checkpoint(exp_dir, opt, dataset.dataset.input_size(), )

    criterion = torch.nn.MSELoss()
    # Training optimizer and stuff
    if opt.loss == 'NLL' or opt.model == 'allseq_bin':
        # The NLLLoss assignment in the original is immediately overridden;
        # BCELoss is what actually gets used.
        criterion = torch.nn.BCELoss()

    if 'tcr_embs' not in os.listdir(exp_dir):
        if opt.model == 'TCRonly':
            os.mkdir(f'{exp_dir}/tcr_embs/')
        elif opt.model == 'allseq' or opt.model == 'allseq_bin':
            os.mkdir(f'{exp_dir}/tcr_embs/')
            os.mkdir(f'{exp_dir}/hla_embs/')
            os.mkdir(f'{exp_dir}/predictions')


    if not opt.cpu:
        print ("Putting the model on gpu...")
        my_model.cuda(opt.gpu_selection)
    loss_dict = {}
    loss_dict['train_losses'] = []

    def estimate_batch_accuracy(y, yhat):
        return np.sum([i == j for i, j in zip(y, yhat)]) / y.shape[0]

    if opt.model == 'allseq' or opt.model == 'allseq_bin':
        valid_list = np.load('/u/trofimov/Emerson/processed_data/valid_list.npy')
        loss_dict['valid_losses'] = []
    # The training.
    print ("Start training.")
    #monitoring and predictions
    for t in range(epoch, opt.epoch):
        loss_dict = monitoring.update_loss_dict(loss_dict, start=True)
        if opt.model == 'allseq_bin':
            good = 0

        for no_b, mini in enumerate(dataset):

            if opt.model == 'TCRonly':

                y_pred, my_model, targets = training.TCRonly_batch(
                    mini, opt, my_model)

                loss = criterion(y_pred, targets)
                loss_save = loss.data.cpu().numpy().reshape(1,)[0]
                loss_dict['train_losses_epoch'].append(loss_save)

                if no_b % 5 == 0:
                    print (f"Doing epoch{t},examples{no_b}/{len(dataset)}.Loss:{loss_save}")

                # Saving the emb
                np.save(os.path.join(exp_dir, 'pixel_epoch_{}'.format(t)),
                        my_model.emb_1.weight.cpu().data.numpy())


                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # NOTE: inputs_k and inputs_s are never defined in this branch
                # (they belong to the kmer model shown in Example 7), and the
                # kmer_embs/ directory is never created in this example, so
                # these two lines fail as written in the original.
                kmerembs = my_model.get_embeddings(inputs_k, inputs_s)[0].squeeze()
                np.save(f'{exp_dir}/kmer_embs/kmer_embs_batch_{no_b}',
                        kmerembs.cpu().data.numpy())

            elif opt.model == 'allseq':

                inputs_k, inputs_h1, inputs_h2, inputs_h3, inputs_h4, targets = training.allseq_batch(
                    mini, opt)
                y_pred = my_model(inputs_k, inputs_h1, inputs_h2, inputs_h3,
                                  inputs_h4).float()

                loss = criterion(y_pred, targets)
                loss_save = loss.data.cpu().numpy().reshape(1,)[0]
                if no_b in valid_list:
                    loss_dict['valid_losses_epoch'].append(loss_save)
                    print(f"Validation error {t}, examples {no_b}/{len(dataset)}. Loss: {loss_save}")

                elif no_b % 5 == 0:
                    # NOTE: in this branch the backward pass runs only on every
                    # fifth batch; training batches in between are skipped.
                    loss_dict['train_losses_epoch'].append(loss_save)
                    print(f"Doing epoch {t}, examples {no_b}/{len(dataset)}. Loss: {loss_save}")
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                batch_number = dataset.dataset.data[no_b]
                kmerembs = my_model.get_embeddings(inputs_k, inputs_h1,
                                                   inputs_h2, inputs_h3,
                                                   inputs_h4)
                kmerembs1 = kmerembs[0].squeeze()
                bn = batch_number[0]
                np.save(f'{exp_dir}/tcr_embs/tcr_embs_batch_{bn}',
                        kmerembs1.cpu().data.numpy())

                for i in range(4):
                    kmerembs1 = kmerembs[i + 1].squeeze()
                    kmerembs1 = kmerembs1[0]
                    np.save(f'{exp_dir}/hla_embs/hla_embs_batch_{bn}_h{i+1}',
                            kmerembs1.cpu().data.numpy())


                kmerembs1 = my_model.hla_representation
                kmerembs1 = kmerembs1[0].squeeze()
                np.save(f'{exp_dir}/hla_embs/ppl_embs_batch_{bn}',
                        kmerembs1.cpu().data.numpy())




            elif opt.model == 'allseq_bin':

                inputs_k, inputs_h1, inputs_h2, inputs_h3, inputs_h4, targets = training.binallseq_batch(
                    mini, opt)
                y_pred = my_model(inputs_k, inputs_h1, inputs_h2, inputs_h3,
                                  inputs_h4).float()

                loss = criterion(y_pred, targets)
                #nb_pos = (np.sum(np.argmax(y_pred.cpu().detach().numpy(),axis=1))/y_pred.shape[0])
                #b_accuracy = (estimate_batch_accuracy(np.argmax(y_pred.cpu().detach().numpy(),axis=1),
                #                               np.argmax(targets.cpu().detach().numpy(),axis=1)))

                #if no_b % 10 == 0:
                #    print (f'predicted proportion: {nb_pos} - accuracy: {b_accuracy}')
                #if b_accuracy>0.75:
                #    good+=1
                loss_save = loss.data.cpu().numpy().reshape(1, )[0]
                if no_b in valid_list:
                    loss_dict['valid_losses_epoch'].append(loss_save)
                    print(f"Validation error {t}, examples {no_b}/{len(dataset)}. Loss: {loss_save}")

                else:
                    loss_dict['train_losses_epoch'].append(loss_save)
                    if no_b % 50 == 0:
                        print(f"Doing epoch {t}, examples {no_b}/{len(dataset)}. Loss: {loss_save}")
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    batch_number = dataset.dataset.data[no_b]
                    bn = batch_number[0]
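                    # Re-optimise on this same mini-batch 100 extra times
                    # before dumping its predictions and checkpointing.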
                    for newpass in range(100):
                        y_pred = my_model(inputs_k,inputs_h1, inputs_h2, inputs_h3,
                                          inputs_h4).float()

                        loss = criterion(y_pred, targets)
                        loss_newpass = loss.data.cpu().numpy().reshape(1, )[0]
                        print(f'batch {bn} pass {newpass} loss: {loss_newpass}')
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                    preds_targets = np.hstack((y_pred.cpu().data.numpy(),
                                               targets.cpu().data.numpy()))
                    np.save(f'{exp_dir}/predictions/batch_{bn}_{no_b}.npy',
                            preds_targets)
                    monitoring.save_checkpoint(my_model, optimizer, t, opt, exp_dir)


                batch_number = dataset.dataset.data[no_b]
                kmerembs = my_model.get_embeddings(inputs_k, inputs_h1,
                                                   inputs_h2, inputs_h3,
                                                   inputs_h4)
                kmerembs1 = kmerembs[0].squeeze()
                bn = batch_number[0]
                true_size = kmerembs1.shape[0] // 2
                np.save(f'{exp_dir}/tcr_embs/tcr_embs_batch_{bn}',
                        kmerembs1.cpu().data.numpy()[:true_size])

                for i in range(4):
                    kmerembs1 = kmerembs[i + 1].squeeze()
                    kmerembs1 = kmerembs1[0]
                    np.save(f'{exp_dir}/hla_embs/hla_embs_batch_{bn}_h{i+1}',
                            kmerembs1.cpu().data.numpy()[:true_size])


                kmerembs1 = my_model.hla_representation
                kmerembs1 = kmerembs1[0].squeeze()
                np.save(f'{exp_dir}/hla_embs/ppl_embs_batch_{bn}',
                        kmerembs1.cpu().data.numpy()[:true_size])


        print ("Saving the model...")
        if opt.model=='allseq_bin' or opt.model=='allseq':
            validation_scores = loss_dict['valid_losses_epoch']
        else:
            validation_scores = None

        #print (f'number correct examples: {good}')
        if opt.model == 'allseq' or opt.model == 'allseq_bin':
            toprint = loss_dict['valid_losses_epoch']
            print(f'validation loss matrix: {toprint}')
        monitoring.save_checkpoint(my_model, optimizer, t, opt, exp_dir)
        monitoring.update_loss_dict(loss_dict, start=False)
        monitoring.save_loss(loss_dict, exp_dir)
        if t % opt.plot_frequency == 0:
            monitoring.plot_training_curve(exp_dir, loss_dict)



    print('Finished training! Starting evaluations')
    tcr_rep_dir = f'{exp_dir}/tcr_embs'
    patient_to_index = 'data/hla_for_model_eval/pt_names.csv'
    original_data_dir = '/u/trofimov/Emerson/original'
    validation_scores = np.load(f'{exp_dir}/validation_loss.npy')
    nb_patients = 15

    output = evaluations.evaluate_model(opt, my_model, exp_dir, tcr_rep_dir,
                                        patient_to_index, original_data_dir,
                                        validation_scores, nb_patients,
                                        train_on_index=0)
    with open(f'{exp_dir}/evaluation_results.json', 'w') as json_file:
        json.dump(output, json_file)
Example 6
def main(argv=None):

    opt = parse_args(argv)
    # Set the seed for reproducibility.
    seed = opt.seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)

    exp_dir = opt.load_folder
    if exp_dir is None:  # we create a new folder if we don't load.
        exp_dir = monitoring.create_experiment_folder(opt)

    # creating the dataset
    print("Getting the dataset...")
    dataset = datasets.get_dataset(opt, exp_dir)

    # Creating a model
    print("Getting the model...")

    my_model, optimizer, epoch, opt = monitoring.load_checkpoint(
        exp_dir, opt, dataset.dataset.input_size(),
        dataset.dataset.additional_info())

    # Training optimizer and stuff
    criterion = torch.nn.MSELoss()

    if not opt.cpu:
        print("Putting the model on gpu...")
        my_model.cuda(opt.gpu_selection)

    # The training.
    print("Start training.")
    #monitoring and predictions
    predictions = np.zeros(
        (dataset.dataset.nb_patient, dataset.dataset.nb_gene))
    indices_patients = np.arange(dataset.dataset.nb_patient)
    indices_genes = np.arange(dataset.dataset.nb_gene)
    xdata = np.transpose([
        np.tile(indices_genes, len(indices_patients)),
        np.repeat(indices_patients, len(indices_genes))
    ])
    progress_bar_modulo = max(1, len(dataset) // 100)

    for t in range(epoch, opt.epoch):

        start_timer = time.time()

        if opt.save_error:
            outfname_g = '_'.join(['gene_epoch', str(t), 'prediction.npy'])
            outfname_g = os.path.join(exp_dir, outfname_g)
            outfname_t = '_'.join(['tissue_epoch', str(t), 'prediction.npy'])
            outfname_t = os.path.join(exp_dir, outfname_t)
            train_trace = np.zeros(
                (dataset.dataset.nb_gene, dataset.dataset.nb_patient))
        ### making predictions:
        nb_proteins = my_model.emb_3.weight.cpu().data.numpy().shape[0]
        nb_patients = my_model.emb_2.weight.cpu().data.numpy().shape[0]
        predictions_protein = np.zeros((nb_patients, nb_proteins))
        patient_embs = my_model.emb_2.weight.cpu().data.numpy()

        for patient in np.arange(nb_patients):
            new = my_model.generate_datapoint_protein(patient_embs[patient, :],
                                                      gpu=2)
            new = new.cpu().data.numpy()
            predictions_protein[patient, :] = new[:, 0]
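        # NOTE: generate_datapoint_protein above hard-codes gpu=2 rather than
        # opt.gpu_selection, and the filename below interpolates `epoch` (the
        # starting epoch), not the loop variable t, so the same file is
        # rewritten every epoch.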
        np.save(f'predictions_protein_{epoch}.npy', predictions_protein)

        for no_b, mini in enumerate(dataset):

            inputs, targets, inputs2, targets2 = (mini[0], mini[1], mini[2],
                                                  mini[3])

            inputs = Variable(inputs, requires_grad=False).float()
            targets = Variable(targets, requires_grad=False).float()
            inputs2 = Variable(inputs2, requires_grad=False).float()
            targets2 = Variable(targets2, requires_grad=False).float()

            if not opt.cpu:
                inputs = inputs.cuda(opt.gpu_selection)
                targets = targets.cuda(opt.gpu_selection)
                inputs2 = inputs2.cuda(opt.gpu_selection)
                targets2 = targets2.cuda(opt.gpu_selection)

            # Forward pass: Compute predicted y by passing x to the model
            y_pred = my_model([inputs, inputs2])

            #if opt.save_error:
            # Log the predicted values per sample and per gene (S.L. validation)
            #    batch_inputs = mini[0].numpy()
            #    predicted_values = y_pred.data.cpu().numpy()
            #    train_trace[batch_inputs[:,0],batch_inputs[:,1]] = predicted_values[:,0]
            targets = torch.reshape(targets, (targets.shape[0], 1))
            targets2 = torch.reshape(targets2, (targets2.shape[0], 1))
            # Compute and print loss

            loss1 = criterion(y_pred[0], targets)
            loss2 = criterion(y_pred[1], targets2)
            loss = loss1 + loss2
            if no_b % 5 == 0:
                print(f"Doing epoch {t}, examples {no_b}/{len(dataset)}. "
                      f"Loss: {loss.item()}")

                # Saving the emb
                np.save(os.path.join(exp_dir, 'pixel_epoch_{}'.format(t)),
                        my_model.emb_1.weight.cpu().data.numpy())
                np.save(os.path.join(exp_dir, 'digit_epoch_{}'.format(t)),
                        my_model.emb_2.weight.cpu().data.numpy())

            # Zero gradients, perform a backward pass, and update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #my_model.generate_datapoint([0,0], opt.gpu_selection)
        #monitoring.save_predictions(exp_dir, predictions)

#        for i in range(0,xdata.shape[0],1000):
#            #import pdb; pdb.set_trace()
#            inputs = torch.FloatTensor(xdata[i:i+1000,:])
#            inputs = Variable(inputs, requires_grad=False).float()
#            if not opt.cpu:
#                inputs = inputs.cuda(opt.gpu_selection)
#            y_pred = my_model(inputs).float()
#            predictions[inputs.data.cpu().numpy()[:,1].astype('int32'),inputs.data.cpu().numpy()[:,0].astype('int32')] = y_pred.data.cpu().numpy()[:,0]
#      monitoring.dump_error_by_tissue(train_trace, dataset.dataset.data, outfname_t, exp_dir, dataset.dataset.data_type, dataset.dataset.nb_patient)
#      monitoring.dump_error_by_gene(train_trace, dataset.dataset.data, outfname_g, exp_dir)

#print ("Saving the model...")
        monitoring.save_checkpoint(my_model, optimizer, t, opt, exp_dir)
Example 7
def main(argv=None):

    opt = parse_args(argv)
    # Set the seed for reproducibility.
    seed = opt.seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)

    exp_dir = opt.load_folder
    if exp_dir is None:  # we create a new folder if we don't load.
        exp_dir = monitoring.create_experiment_folder(opt)

    # creating the dataset
    print("Getting the dataset...")
    dataset = datasets.get_dataset(opt, exp_dir)

    # Creating a model
    print("Getting the model...")
    my_model, optimizer, epoch, opt = monitoring.load_checkpoint(
        exp_dir,
        opt,
        dataset.dataset.input_size(),
    )

    criterion = torch.nn.MSELoss()
    # Training optimizer and stuff
    if opt.loss == 'NLL':
        criterion = torch.nn.NLLLoss()

    os.mkdir(f'{exp_dir}/kmer_embs/')
    if not opt.cpu:
        print("Putting the model on gpu...")
        my_model.cuda(opt.gpu_selection)

    # The training.
    print("Start training.")
    #monitoring and predictions
    for t in range(epoch, opt.epoch):

        start_timer = time.time()
        for no_b, mini in enumerate(dataset):

            inputs_s, inputs_k, targets = mini[0], mini[1], mini[2]

            inputs_s = Variable(inputs_s, requires_grad=False).float()
            inputs_k = Variable(inputs_k, requires_grad=False).float()
            targets = Variable(targets, requires_grad=False).float()

            if not opt.cpu:
                inputs_s = inputs_s.cuda(opt.gpu_selection)
                inputs_k = inputs_k.cuda(opt.gpu_selection)
                targets = targets.cuda(opt.gpu_selection)
            # Forward pass: Compute predicted y by passing x to the model
            inputs_k = inputs_k.view(-1, 31, 4)
            y_pred = my_model(inputs_k, inputs_s).float()
            #import pdb; pdb.set_trace()

            #targets = torch.reshape(targets,(targets.shape[0],1))
            # Compute and print loss

            loss = criterion(y_pred, targets)
            if no_b % 5 == 0:
                print(f"Doing epoch {t}, examples {no_b}/{len(dataset)}. "
                      f"Loss: {loss.item()}")

                # Saving the emb
                np.save(os.path.join(exp_dir, 'pixel_epoch_{}'.format(t)),
                        my_model.emb_1.weight.cpu().data.numpy())

            # Zero gradients, perform a backward pass, and update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            kmerembs_batch = my_model.get_embeddings(inputs_k, inputs_s)[0]
            kmerembs = kmerembs_batch[:kmerembs_batch.shape[0] // opt.nb_patient]
            np.save(f'{exp_dir}/kmer_embs/kmer_embs_batch_{no_b}',
                    kmerembs.cpu().data.numpy())

        #print ("Saving the model...")
        monitoring.save_checkpoint(my_model, optimizer, t, opt, exp_dir)
Example 8
def main(argv=None):

    opt = parse_args(argv)
    # Set the seed for reproducibility.
    seed = opt.seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)
    if opt.cache == 0:
        opt.cache = random.getrandbits(128)

    exp_dir = opt.load_folder
    if exp_dir is None:  # we create a new folder if we don't load.
        exp_dir = monitoring.create_experiment_folder(opt)

    # creating the dataset
    print("Getting the dataset...")
    if 'cached_dataset' not in os.listdir('.'):
        os.mkdir('cached_dataset')

    dataset = datasets.get_dataset(opt, exp_dir)

    # Creating a model
    print("Getting the model...")
    my_model, optimizer, epoch, opt = monitoring.load_checkpoint(
        exp_dir,
        opt,
        dataset.dataset.input_size(),
    )

    # Training optimizer and stuff
    criterion = torch.nn.BCELoss()

    if not opt.cpu:
        print("Putting the model on gpu...")
        my_model.cuda(opt.gpu_selection)

    loss_dict = {}
    loss_dict['train_losses'] = []

    valid_list = np.load('/u/trofimov/Emerson/processed_data/valid_list.npy')
    loss_dict['valid_losses'] = []

    # The training.
    print("Start training.")
    #monitoring and predictions
    for t in range(epoch, opt.epoch):
        loss_dict = monitoring.update_loss_dict(loss_dict, start=True)

        for no_b, mini in enumerate(dataset):

            inputs_x, targets = mini[0], mini[1]
            inputs_x = Variable(inputs_x, requires_grad=False).float()
            targets = Variable(targets, requires_grad=False).float()
            if not opt.cpu:
                inputs_x = inputs_x.cuda(opt.gpu_selection)
                targets = targets.cuda(opt.gpu_selection)

            y_pred = my_model(inputs_x)
            loss = criterion(y_pred, targets)
            loss_save = loss.data.cpu().numpy().reshape(1, )[0]
            if no_b in valid_list:
                loss_dict['valid_losses_epoch'].append(loss_save)
                print(f"Validation error {t}, examples {no_b}/{len(dataset)}. "
                      f"Loss: {loss_save}")

            else:
                loss_dict['train_losses_epoch'].append(loss_save)
                print(f"Doing epoch {t}, examples {no_b}/{len(dataset)}. "
                      f"Loss: {loss_save}")
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        validation_scores = loss_dict['valid_losses_epoch']

        monitoring.save_checkpoint(my_model, optimizer, t, opt, exp_dir)
        monitoring.update_loss_dict(loss_dict, start=False)
        monitoring.save_loss(loss_dict, exp_dir)
        if t % opt.plot_frequency == 0:
            monitoring.plot_training_curve(exp_dir, loss_dict)
Example 9
def impute(argv=None):

    opt = parse_args(argv)

    seed = opt.seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)

    exp_dir = opt.load_folder
    new_save_dir = monitoring.create_experiment_folder(opt)
    if exp_dir is None:
        print("Experiment doesn't exist!")
    else:
        # creating the dataset
        print ("Getting the dataset...")
        dataset = datasets.get_dataset(opt,exp_dir)
        old_nb_patients = dataset.dataset.nb_patient
        old_data_file = opt.data_file
        new_data_file = opt.new_data_file
        imputation_list = opt.imputation_list
        nb_shuffles = opt.nb_shuffles


        # Creating a model
        print ("Getting the model...")
        my_model, optimizer, epoch, opt = monitoring.load_checkpoint(exp_dir,opt,dataset.dataset.input_size(),impute=True)
        old_model_size = dataset.dataset.input_size()
        ### Making sure updates are only on the patient embedding layer
        #my_model.freeze_all()
        #optimizer = torch.optim.RMSprop(my_model.emb_2.parameters(), lr=opt.lr, weight_decay=opt.weight_decay)

        ### Replacing the first embeddings as the new number of patients to predict
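        # NOTE: on modern PyTorch, slice-assigning into a Parameter that
        # requires grad raises an error unless wrapped in torch.no_grad().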
        my_model.emb_2.weight[:dataset.dataset.nb_patient, :] = Variable(
            torch.FloatTensor(np.zeros((dataset.dataset.nb_patient, 2))),
            requires_grad=False).float()

        # Training optimizer and stuff
        criterion = torch.nn.MSELoss()

        if not opt.cpu:
            print ("Putting the model on gpu...")
            my_model.cuda(opt.gpu_selection)

        # The training.
        print ("Start training.")


        for nb_genes in imputation_list:
            print(f'Imputation with {nb_genes} genes given...')
            for shuffle in range(nb_shuffles):
                opt.data_file = new_data_file
                print ("Re-getting the dataset...")
                dataset = datasets.get_dataset(opt,exp_dir, masked = nb_genes)
                new_embs = np.zeros((dataset.dataset.nb_patient, 2))
                # Monitoring and predictions
                predictions = np.zeros((dataset.dataset.nb_patient,
                                        my_model.emb_1.weight.shape[0]))
                indices_patients = np.arange(predictions.shape[0])
                indices_genes = np.arange(predictions.shape[1])
                xdata = np.transpose([
                    np.tile(indices_genes, len(indices_patients)),
                    np.repeat(indices_patients, len(indices_genes))
                ])


                progress_bar_modulo = max(1, len(dataset) // 100)
                for t in range(25):

                    my_model, optimizer, epoch, opt = monitoring.load_checkpoint(
                        exp_dir, opt, old_model_size, impute=True)
                    temp_embs = my_model.emb_2.weight.cpu().data.numpy()
                    temp_embs[:dataset.dataset.nb_patient, :] = new_embs

                    my_model.emb_2.weight.data = my_model.emb_2.weight.data.copy_(
                        torch.from_numpy(temp_embs))
                    #my_model.emb_2.weight[:dataset.dataset.nb_patient,:] = Variable(torch.FloatTensor(new_embs),requires_grad=False).float() 
                    start_timer = time.time()

                    #if opt.save_error:
                        #outfname_g = f'shuffle_{shuffle}_{nb_genes}_genes_epoch_{t}_prediction_genes.npy'
                        #outfname_g = os.path.join(new_exp_dir,outfname_g)
                        #outfname_t = f'shuffle_{shuffle}_{nb_genes}_genes_epoch_{t}_prediction_tissue.npy'
                        #outfname_t = os.path.join(new_exp_dir,outfname_t)
                        #train_trace = np.zeros((dataset.dataset.nb_gene, dataset.dataset.nb_patient))

                    for no_b, mini in enumerate(dataset):

                        inputs, targets = mini[0], mini[1]

                        inputs = Variable(inputs, requires_grad=False).float()
                        targets = Variable(targets, requires_grad=False).float()

                        if not opt.cpu:
                            inputs = inputs.cuda(opt.gpu_selection)
                            targets = targets.cuda(opt.gpu_selection)

                        # Forward pass: Compute predicted y by passing x to the model
                        y_pred = my_model(inputs).float()

                        #if opt.save_error:
                            ## Log the predicted values per sample and per gene (S.L. validation)
                            #batch_inputs = mini[0].numpy()
                            #predicted_values = y_pred.data.cpu().numpy()
                            #train_trace[batch_inputs[:,0],batch_inputs[:,1]] = predicted_values[:,0]
                        #import pdb; pdb.set_trace()
                        targets = torch.reshape(targets, (targets.shape[0], 1))
                        # Compute and print loss

                        loss = criterion(y_pred, targets)
                        if no_b % 5 == 0:
                            print(f"Doing epoch {t}, examples {no_b}/{len(dataset)}. "
                                  f"Loss: {loss.item()}")

                            # Saving the emb
                            np.save(
                                os.path.join(new_save_dir,
                                             'pixel_epoch_{}'.format(t)),
                                my_model.emb_1.weight.cpu().data.numpy())
                            np.save(
                                os.path.join(new_save_dir,
                                             'digit_epoch_{}'.format(t)),
                                my_model.emb_2.weight.cpu().data.numpy())


                        # Zero gradients, perform a backward pass, and update the weights.
                        optimizer.zero_grad()
                        #import pdb; pdb.set_trace()
                        loss.backward()
                        optimizer.step()

                        new_embs = my_model.emb_2.weight.cpu().data.numpy()[:dataset.dataset.nb_patient,:]
                        #my_model.generate_datapoint([0,0], opt.gpu_selection)

                for i in range(0, xdata.shape[0], 1000):
                    inputs = torch.FloatTensor(xdata[i:i + 1000, :])
                    inputs = Variable(inputs, requires_grad=False).float()
                    if not opt.cpu:
                        inputs = inputs.cuda(opt.gpu_selection)
                    y_pred = my_model(inputs).float()
                    # Scatter this chunk's outputs back into the
                    # (patient, gene) prediction grid.
                    idx = inputs.data.cpu().numpy().astype('int32')
                    predictions[idx[:, 1],
                                idx[:, 0]] = y_pred.data.cpu().numpy()[:, 0]
                outfname_pred = f'shuffle_{shuffle}_{nb_genes}_genes_prediction.npy'
                outfname_pred = os.path.join(new_save_dir, outfname_pred)
                np.save(outfname_pred, predictions)