Ejemplo n.º 1
0
def main():
    """Train a ``ConvNet`` on the training sub-datasets and optionally test it.

    All settings come from ``parser.parse_args()``. For every epoch the
    sub-datasets ``train/train_0`` .. ``train/train_9`` of ``args.data`` are
    loaded one after the other and the model is trained on their batches with
    Adam and a cross-entropy loss. Training metrics are recorded every 50
    batches of a sub-dataset, validation metrics every 1000 batches, and the
    model is checkpointed after the sub-datasets whose index is a multiple of
    five. When ``args.testing`` is set the model is finally evaluated on
    ``test/test_1``.
    """
    import shutil  # local import: only needed for the script snapshot below

    ## get model/training params
    args = parser.parse_args()
    if args.debug:
        print('==== DEBUGGING MODE ====')

    # base name of this script, handed to TrainingMetrics
    script_name = os.path.basename(__file__)

    ## Initialize metrics ##
    TrainingEval = utils.TrainingMetrics(script_name)
    working_dir = TrainingEval.working_dir
    valid = Prepare_Data(args.data, 'valid/valid')
    valid_batches = DataLoader(valid, args.batch_size,
                               drop_last=True, shuffle=True)
    Validation = utils.Metrics(valid_batches, working_dir, 'validation')

    # Snapshot the running script into the working dir. The full path of
    # __file__ is used so the copy also works when the current directory is
    # not the script's directory; a failed copy is reported, not fatal.
    try:
        shutil.copy(os.path.abspath(__file__), working_dir)
    except OSError as err:
        print('Could not copy {} to {}: {}'.format(
            script_name, working_dir, err))

    ## Initialize model (moved to the GPU when one is available)
    model = ConvNet(args.kernel_size, args.stride, args.padding,
                    args.ks_pool, args.str_pool, args.pad_pool)
    if torch.cuda.is_available():
        model = model.cuda()

    ## log model/training params to file
    LogFile = utils.LogFile(args, model, working_dir)

    ## Loss and optimizer
    # no ignore_index is passed, so the padding label (0) contributes to the loss
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    # Training state
    step = -1  # global batch counter across sub-datasets and epochs
    loss_list = []
    acc_list = []
    valid_loss_list = []
    valid_acc_list = []
    acc = None   # last tracked training accuracy
    loss = None  # loss of the last trained batch

    for epoch in range(args.num_epochs):

        for train_ds in range(0, 10):
            name = 'train/train_{}'.format(train_ds)
            train = Prepare_Data(args.data, name)
            train_batches = DataLoader(train, batch_size=args.batch_size,
                                       drop_last=True, shuffle=True)

            for i, batch in enumerate(train_batches):
                step += 1

                # make sure the forward pass itself runs in training mode
                model.train()

                # one hot encode
                batch = utils.to_one_hot(batch)

                # swap dims 1 and 2 (original note: channels = amino acids)
                batch = torch.transpose(batch, 1, 2)

                ## Run the forward pass ##
                # original note: output expected as [10, 25, 502], the 25
                # being per-class scores -- TODO confirm
                out = model(batch)

                # integer labels recovered from the one-hot input; the model
                # is trained to reproduce its own input
                batch_labels = utils.from_one_hot(batch)

                ## loss ##
                loss = criterion(out, batch_labels)
                loss_list.append(loss.item())

                ## clear gradient accumulators, backprop, Adam step ##
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                ## Track the training accuracy every 50 batches ##
                if i % 50 == 0:
                    acc = TrainingEval.get_acc(out, batch_labels)
                    acc_list.append(acc)
                    TrainingEval.save_metrics(acc, loss.item(), step, epoch)
                    print('Epoch [{}/{}], Step: {}, Loss: {:.4f}, Accuracy: {:.4f}%'
                          .format(epoch + 1, args.num_epochs, step,
                                  loss.item(), acc * 100))

                ## Validation every 1000 batches ##
                if i % 1000 == 0:
                    val_loss, val_acc, conf_matrix = \
                        Validation.get_performance(model, criterion,
                                                   confusion_matrix=True)
                    Validation.save(val_acc, val_loss, epoch, step)

                    # keep in memory for fast plotting
                    valid_loss_list.append(val_loss)
                    valid_acc_list.append(val_acc)
                    print('Validation:  Loss: {:.4f}, Accuracy: {:.4f}%\n'
                          .format(val_loss, val_acc * 100))

                    TrainingEval.plot_metrics(acc_list, loss_list,
                                              valid_acc_list, valid_loss_list,
                                              epoch)
                    Validation.plot_confusion_matrix(conf_matrix)
                    Validation.plot_per_class(conf_matrix)

            # Save the model after every fifth sub-dataset (train_ds 0 and 5).
            # NOTE(review): the state after the last sub-datasets of the final
            # epoch is never saved -- confirm whether that is intended.
            if train_ds % 5 == 0:
                utils.save_checkpoint(model, optimizer, epoch, train_ds,
                                      loss_list, acc_list, working_dir)
                utils.save_final_model(model, working_dir)
                LogFile.log_saved_model(step)

    # acc/loss only exist when at least one batch was trained
    if loss is not None:
        LogFile.log_performance(acc, loss.item(), ds_type='Training')

    if args.testing:
        test = Prepare_Data(args.data, 'test/test_1')
        test_batches = DataLoader(test, batch_size=args.batch_size,
                                  drop_last=True, shuffle=True)

        Test = utils.Metrics(test_batches, working_dir, 'test')
        test_loss, test_acc, conf_matrix = Test.get_performance(
            model, criterion, confusion_matrix=True)
        Test.save(test_acc, test_loss, epoch=-1, step=-1)
        Test.plot_confusion_matrix(conf_matrix)
        Test.save_conf_matrix(conf_matrix)
        Test.plot_per_class(conf_matrix)
        LogFile.log_performance(test_acc, test_loss, ds_type='Test')
Ejemplo n.º 2
0
    def get_performance(self,
                        model,
                        criterion,
                        confusion_matrix=False,
                        pos_acc=False,
                        debug=False):
        '''Evaluate ``model`` on every sub-dataset stored under ``self.ds_type``.

        The sub-datasets of the HDF5 file ``self.data`` are visited in random
        order. Accuracy is computed separately for non-padding positions
        (label != 0) and padding positions (label == 0).

        Args:
            model: network called as ``model(batch)``. It is switched to eval
                mode for the evaluation and to train mode before returning.
            criterion: loss callable invoked as ``criterion(out, labels)``.
            confusion_matrix: if True, accumulate and return a 25x25 confusion
                matrix over the non-padding positions.
            pos_acc: if True, also return position-specific accuracies.
            debug: forwarded to ``Prepare_Data``.

        Returns:
            ``(mean_loss, mean_acc, mean_acc_pad)``, followed by
            ``conf_matrix`` when ``confusion_matrix`` is set, followed by
            ``N_term, C_term, N_pad`` when ``pos_acc`` is set. The means are
            averages of per-batch values; an accuracy is ``nan`` when no batch
            contained a position of that kind.

        Raises:
            ZeroDivisionError: if no batch was evaluated at all.
        '''

        # original author's note: in eval mode the validation metrics
        # fluctuate strongly compared to train mode
        model.eval()

        ## per-batch metrics ##
        acc_list = []      # accuracy on non-padding positions
        loss_list = []
        acc_list_pad = []  # accuracy on padding positions
        conf_matrix = np.zeros((25, 25))

        ## position specific accumulators ##
        # torch.long replaces the np.int alias that NumPy 1.24 removed
        N_term_pos = torch.zeros(500, dtype=torch.long)  # correct, non-padding
        N_term_pad = torch.zeros(500, dtype=torch.long)  # correct, padding
        C_term_pos = torch.zeros(20, dtype=torch.long)
        # per-position counts used to normalise the sums above
        len_seq = torch.zeros(500, dtype=torch.long)
        len_pad = torch.zeros(500, dtype=torch.long)
        len_seq_C_term = 0.0  # number of sequences that entered C_term_pos

        # read the sub-dataset names and close the file handle again
        with h5py.File(self.data, 'r') as h5_file:
            random_ds = np.array(list(h5_file[self.ds_type].keys()))
        np.random.shuffle(random_ds)

        # loop over all sub-datasets of that data type (test/valid)
        for ds in random_ds:
            ds = str(self.ds_type) + '/' + ds
            prep_data = Prepare_Data(path=self.data, name=ds, debug=debug)
            batches = DataLoader(prep_data,
                                 self.batch_size,
                                 drop_last=True,
                                 shuffle=True)

            with torch.no_grad():
                for batch in batches:
                    batch = to_one_hot(batch)

                    # swap dims 1 and 2 (original note: channels = amino acids)
                    batch = torch.transpose(batch, 1, 2)

                    ## Run the forward pass ##
                    # original note: output expected as [10, 25, 502], the 25
                    # being per-class scores -- TODO confirm
                    out = model(batch)

                    # integer labels recovered from the one-hot input
                    batch_labels = from_one_hot(batch)

                    loss = criterion(out, batch_labels)
                    loss_list.append(loss.item())

                    # predicted class = index of the maximum along dim 1
                    _, predicted = torch.max(out.data, dim=1)

                    ## PREDICTIONS/ACCURACY: non-padding positions ##
                    msk = batch_labels != 0
                    target_msk = batch_labels[msk]
                    pred_msk = predicted[msk]
                    total = target_msk.shape[0]
                    if total:  # an all-padding batch has no accuracy
                        n_correct = (pred_msk == target_msk).sum().item()
                        acc_list.append(n_correct / total)

                    ## PADDING PREDICTIONS: padding positions only ##
                    msk_pad = batch_labels == 0
                    target_msk_pad = batch_labels[msk_pad]
                    pred_msk_pad = predicted[msk_pad]
                    total_pad = target_msk_pad.shape[0]
                    if total_pad:  # a batch without padding has no pad accuracy
                        n_correct_pad = \
                            (pred_msk_pad == target_msk_pad).sum().item()
                        acc_list_pad.append(n_correct_pad / total_pad)

                    # confusion matrix over the non-padding positions
                    if confusion_matrix and total:
                        conf_matrix += cm(target_msk.view(-1).cpu(),
                                          pred_msk.view(-1).cpu(),
                                          labels=np.arange(25))

                    # position specific accuracy
                    if pos_acc:
                        correct = (batch_labels == predicted).cpu()
                        msk_cpu = msk.cpu()
                        msk_pad_cpu = msk_pad.cpu()

                        # N-term: correct predictions with padding masked out,
                        # summed over the batch dimension
                        correct_msk = np.logical_and(correct, msk_cpu)
                        N_term_pos += torch.sum(correct_msk, dim=0)
                        len_seq += torch.sum(msk_cpu, dim=0)

                        # same for the padding positions
                        correct_msk_pad = np.logical_and(correct, msk_pad_cpu)
                        N_term_pad += torch.sum(correct_msk_pad, dim=0)
                        len_pad += torch.sum(msk_pad_cpu, dim=0)

                        # C-term: per sequence, number of non-padding
                        # positions minus 21, passed to bck_seq as an index
                        bckwrds_indx = torch.sum(msk, dim=1) - 21

                        # Best effort: the original author expects this to
                        # fail for proteins shorter than 21, in which case
                        # the batch is skipped. Exception (not a bare except)
                        # keeps Ctrl-C and SystemExit working.
                        try:
                            bckwrds = self.bck_seq(correct_msk,
                                                   bckwrds_indx.cpu(),
                                                   num_elem=20)
                            C_term_pos += torch.sum(bckwrds, dim=0)
                            len_seq_C_term += self.batch_size
                        except Exception:
                            pass

        # averages over all evaluated batches
        mean_loss = sum(loss_list) / len(loss_list)
        mean_acc = sum(acc_list) / len(acc_list) if acc_list else float('nan')
        mean_acc_pad = (sum(acc_list_pad) / len(acc_list_pad)
                        if acc_list_pad else float('nan'))
        model.train()

        if pos_acc:
            # position specific accuracy; positions that were never counted
            # divide by zero and come out as nan/inf
            N_term = np.divide(N_term_pos[:], len_seq[:])
            C_term = C_term_pos / float(len_seq_C_term)
            N_pad = np.divide(N_term_pad[:], len_pad[:])

        # the shape of the returned tuple depends on the requested extras
        if confusion_matrix and pos_acc:
            return mean_loss, mean_acc, mean_acc_pad, conf_matrix, N_term, C_term, N_pad

        elif confusion_matrix and not pos_acc:
            return mean_loss, mean_acc, mean_acc_pad, conf_matrix

        elif not confusion_matrix and pos_acc:
            return mean_loss, mean_acc, mean_acc_pad, N_term, C_term, N_pad

        else:
            return mean_loss, mean_acc, mean_acc_pad
Ejemplo n.º 3
0
    def get_performance(self,
                        model,
                        criterion,
                        confusion_matrix=False,
                        pos_acc=False):
        '''Evaluate ``model`` on the batches in ``self.batches``.

        Accuracy is computed on the non-padding positions (label != 0) only.

        Args:
            model: network called as ``model(batch)``. It is switched to eval
                mode for the evaluation and to train mode before returning.
            criterion: loss callable invoked as ``criterion(out, labels)``.
            confusion_matrix: if True, accumulate and return a 25x25 confusion
                matrix over the non-padding positions.
            pos_acc: if True, also return position-specific accuracies.

        Returns:
            ``(mean_loss, mean_acc)``, followed by ``conf_matrix`` when
            ``confusion_matrix`` is set, followed by ``N_term, C_term`` when
            ``pos_acc`` is set. The means are averages of per-batch values;
            ``mean_acc`` is ``nan`` when every batch was padding only.

        Raises:
            ZeroDivisionError: if ``self.batches`` yields no batch.
        '''

        # original author's note: in eval mode the validation metrics
        # fluctuate strongly compared to train mode
        model.eval()

        acc_list = []
        loss_list = []
        conf_matrix = np.zeros((25, 25))

        ## position specific accumulators ##
        # torch.long replaces the np.int alias that NumPy 1.24 removed
        N_term_pos = torch.zeros(500, dtype=torch.long)
        C_term_pos = torch.zeros(20, dtype=torch.long)
        len_seq = torch.zeros(500, dtype=torch.long)  # for normalisation
        len_seq_C_term = 0.0  # number of sequences that entered C_term_pos

        with torch.no_grad():
            for batch in self.batches:
                batch = to_one_hot(batch)

                # swap dims 1 and 2 (original note: channels = amino acids)
                batch = torch.transpose(batch, 1, 2)

                ## Run the forward pass ##
                # original note: output expected as [10, 25, 502], the 25
                # being per-class scores -- TODO confirm
                out = model(batch)

                # integer labels recovered from the one-hot input
                batch_labels = from_one_hot(batch)

                loss = criterion(out, batch_labels)
                loss_list.append(loss.item())

                # predicted class = index of the maximum along dim 1
                _, predicted = torch.max(out.data, dim=1)

                # filter out padding (0)
                msk = batch_labels != 0
                target_msk = batch_labels[msk]
                pred_msk = predicted[msk]

                total = target_msk.shape[0]
                if total:  # an all-padding batch has no accuracy
                    n_correct = (pred_msk == target_msk).sum().item()
                    acc_list.append(n_correct / total)

                # confusion matrix over the non-padding positions
                if confusion_matrix and total:
                    conf_matrix += cm(target_msk.view(-1).cpu(),
                                      pred_msk.view(-1).cpu(),
                                      labels=np.arange(25))

                # position specific accuracy
                if pos_acc:
                    msk_cpu = msk.cpu()
                    correct = (batch_labels == predicted).cpu()

                    # correct predictions with padding masked out, summed
                    # over the batch dimension
                    correct_msk = np.logical_and(correct, msk_cpu)
                    N_term_pos += torch.sum(correct_msk, dim=0)
                    len_seq += torch.sum(msk_cpu, dim=0)

                    # per sequence, number of non-padding positions minus 21,
                    # passed to bck_seq as an index
                    bckwrds_indx = torch.sum(msk, dim=1) - 21

                    # Best effort: the original author expects this to fail
                    # for proteins shorter than 21, in which case the batch
                    # is skipped. Exception (not a bare except) keeps Ctrl-C
                    # and SystemExit working.
                    try:
                        bckwrds = self.bck_seq(correct_msk,
                                               bckwrds_indx.cpu(),
                                               num_elem=20)
                        C_term_pos += torch.sum(bckwrds, dim=0)
                        # The former `batch_size` is not defined in this
                        # method; the resulting NameError was swallowed and
                        # left the normaliser at 0. Count the sequences of
                        # the batch instead.
                        len_seq_C_term += batch_labels.shape[0]
                    except Exception:
                        pass

        # averages over all evaluated batches
        mean_loss = sum(loss_list) / len(loss_list)
        mean_acc = sum(acc_list) / len(acc_list) if acc_list else float('nan')
        model.train()

        if pos_acc:
            # position specific accuracy; positions that were never counted
            # divide by zero and come out as nan/inf
            N_term = np.divide(N_term_pos[:], len_seq[:])
            C_term = C_term_pos / float(len_seq_C_term)

        # the shape of the returned tuple depends on the requested extras
        if confusion_matrix and pos_acc:
            return mean_loss, mean_acc, conf_matrix, N_term, C_term
        elif confusion_matrix and not pos_acc:
            return mean_loss, mean_acc, conf_matrix
        elif not confusion_matrix and pos_acc:
            return mean_loss, mean_acc, N_term, C_term
        else:
            return mean_loss, mean_acc
Ejemplo n.º 4
0
def main():
    """Train the network defined in ``nn_models/<args.nn_model>`` and evaluate it.

    All settings come from ``parser.parse_args()``. The model class
    ``ConvNet`` is loaded from the given script, optionally restored from the
    checkpoint ``args.restart``, and trained with Adam and a cross-entropy
    loss on the sub-datasets found under ``train`` in the HDF5 file
    ``args.data`` (visited in one random order that is reused every epoch).
    Training metrics are recorded after every sub-dataset and validation
    metrics after every fifth one. Every second epoch, and after the last
    epoch, a checkpoint is written and, when ``args.testing`` is set, the
    model is evaluated on the test data.
    """
    import shutil  # local import: used only to snapshot the model script

    ## get model/training params ##
    args = parser.parse_args()

    ## specify name of output dir ##
    if args.debug:
        top_working_dir = 'debugging'
    elif args.out_dir is not None:
        top_working_dir = args.out_dir
    else:
        top_working_dir = str(args.nn_model.split(".py")[0])

    ## Initialize training metrics ##
    # per the original author's note this also creates the working dir
    TrainingEval = utils.TrainingMetrics(top_working_dir, args.restart)
    working_dir = TrainingEval.working_dir

    ## Initialize validation / test metrics ##
    Validation = utils.PerformMetrics(args.data, working_dir, args.batch_size,
                                      'validation')
    if args.testing:
        Test = utils.PerformMetrics(args.data, working_dir, args.batch_size,
                                    'test')

    ## Logging of scripts, models and params ##
    path = './nn_models/' + args.nn_model
    # snapshot the model script into the working dir; a failed copy is
    # reported but does not abort the run
    try:
        shutil.copy(path, working_dir)
    except OSError as err:
        print('Could not copy {} to {}: {}'.format(path, working_dir, err),
              flush=True)

    ## Load nn model architecture from the script file ##
    spec = importlib.util.spec_from_file_location('nn_module', path)
    nn_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(nn_module)
    model = nn_module.ConvNet(args.kernel_size, args.stride, args.padding,
                              args.ks_pool, args.str_pool, args.pad_pool)

    # CUDA
    if torch.cuda.is_available():
        model = model.cuda()

    # initialise optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    # restore state from a checkpoint, or start from scratch
    if args.restart is not None:
        model, optimizer, epoch_start, train_idx, loss_list, acc_list = \
            utils.load_checkpoint(model, optimizer, filename=args.restart)
        print('loaded checkpoint model', flush=True)
    else:
        loss_list = []
        acc_list = []
        epoch_start = 0
        train_idx = 0

    # log model/training params to file
    LogFile = utils.LogFile(args, model, working_dir)

    ## Loss
    # no ignore_index is passed, so the padding label (0) contributes to the loss
    criterion = nn.CrossEntropyLoss()

    # Training state
    nr_of_batches = -1  # count batches for logging
    valid_loss_list = []
    valid_acc_list = []
    acc = None   # last tracked training accuracy
    loss = None  # loss of the last trained batch

    # random order of the training sub-datasets; the file handle is closed
    # again as soon as the names are read
    with h5py.File(args.data, 'r') as h5_file:
        random_ds = np.array(list(h5_file['train'].keys()))
    np.random.shuffle(random_ds)

    # loop over entire training set multiple times
    for epoch in range(epoch_start, args.num_epochs):

        # loop over sub training sets (original note: for memory reasons)
        for train_idx, sub_name in enumerate(random_ds):
            name = 'train/{}'.format((sub_name))
            train = utils.Prepare_Data(args.data, name, debug=args.debug)

            train_batches = DataLoader(train,
                                       batch_size=args.batch_size,
                                       drop_last=True,
                                       shuffle=True)

            out = None  # stays None when the sub-dataset yields no batch
            for batch in train_batches:
                nr_of_batches += 1

                # make sure the forward pass itself runs in training mode
                model.train()

                # one hot encode
                batch = utils.to_one_hot(batch)

                # swap dims 1 and 2 (original note: channels = amino acids)
                batch = torch.transpose(batch, 1, 2)

                ## Run the forward pass ##
                # original note: output expected as [10, 25, 502], the 25
                # being per-class scores -- TODO confirm
                out = model(batch)

                # integer labels recovered from the one-hot input
                batch_labels = utils.from_one_hot(batch)

                ## loss ##
                loss = criterion(out, batch_labels)
                loss_list.append(loss.item())

                ## clear gradient accumulators, backprop, Adam step ##
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            ## Track the training accuracy on the last batch of the subset ##
            if out is not None:
                acc = TrainingEval.get_acc(out, batch_labels)
                acc_list.append(acc)
                TrainingEval.save_metrics(acc, loss.item(), nr_of_batches,
                                          epoch)
                print(
                    'Epoch [{}/{}], sub training set: {} , nr_batches: {}, Loss: {:.4f}, Accuracy: {:.4f}%'
                    .format(epoch, args.num_epochs, train_idx, nr_of_batches,
                            loss.item(), acc * 100),
                    flush=True)

            ## Validation after every fifth sub-dataset ##
            if train_idx % 5 == 0:
                val_loss, val_acc, val_acc_pad, N_term, C_term, N_pad = \
                    Validation.get_performance(model, criterion,
                                               pos_acc=True, debug=args.debug)

                # save validation metrics to file
                Validation.save(val_acc, val_loss, val_acc_pad, epoch,
                                nr_of_batches)

                # keep in memory for fast plotting
                valid_loss_list.append(val_loss)
                valid_acc_list.append(val_acc)
                print('Validation:  Loss: {:.4f}, Accuracy: {:.4f}%\n'.format(
                    val_loss, val_acc * 100),
                      flush=True)
                TrainingEval.plot_metrics(acc_list, loss_list, valid_acc_list,
                                          valid_loss_list, epoch)

        # Save (and test) the model every 2 epochs and after the last epoch.
        # The former condition `train_idx % 5 == 0` only looked at the index
        # of the last sub-dataset, so it was the same in every epoch: the
        # model was saved either always or never.
        save_now = epoch % 2 == 0 or epoch == args.num_epochs - 1
        if save_now:
            # checkpoint to restart from
            utils.save_checkpoint(model, optimizer,
                                  epoch, train_idx,
                                  loss_list, acc_list,
                                  working_dir)

            # log current training status to log file
            LogFile.log_saved_model(steps=nr_of_batches)
            if acc is not None:  # only when at least one batch was trained
                LogFile.log_performance(acc, loss.item(), ds_type='Training')

            # test nn model on test data set
            if args.testing:
                test_loss, test_acc, test_acc_pad, conf_matrix, N_term, C_term, N_pad = \
                    Test.get_performance(model, criterion,
                                         confusion_matrix=True,
                                         pos_acc=True,
                                         debug=args.debug)

                # save test set metrics of nn model
                Test.save(test_acc,
                          test_loss,
                          test_acc_pad,
                          epoch=epoch,
                          step=nr_of_batches)

                # confusion matrix: plot and store
                Test.plot_confusion_matrix(conf_matrix)
                Test.save_conf_matrix(conf_matrix)

                # per-class performance
                Test.plot_per_class(conf_matrix)

                # positional accuracy from the N-term and C-term
                Test.plot_pos_acc(N_term, C_term, N_pad)

                # log test metrics in log file
                LogFile.log_performance(test_acc, test_loss, ds_type='Test')