def main():
    """Train ``ConvNet`` on the ten ``train/train_<i>`` sub-datasets of ``args.data``.

    Reads all settings from the module-level ``parser``.  During training,
    batch accuracy is recorded every 50 batches and the validation set is
    evaluated every 1000 batches.  A checkpoint is written after every fifth
    sub-dataset.  When ``args.testing`` is set, the model is evaluated once on
    ``test/test_1`` after training.

    NOTE(review): the indentation of this function was lost in the source this
    was reconstructed from.  The nesting of the checkpoint/logging block and of
    the test block is inferred -- confirm against the intended behaviour.
    """
    import shutil  # local import: only needed for the script copy below

    ## get model/training params
    args = parser.parse_args()
    if args.debug:
        print('==== DEBUGGING MODE ====')

    # name of this script; passed to TrainingMetrics and copied into working_dir
    script_name = os.path.basename(__file__)

    ## Initialize metrics ##
    TrainingEval = utils.TrainingMetrics(script_name)
    working_dir = TrainingEval.working_dir

    valid = Prepare_Data(args.data, 'valid/valid')
    valid_batches = DataLoader(valid, args.batch_size, drop_last=True, shuffle=True)
    Validation = utils.Metrics(valid_batches, working_dir, 'validation')

    # Copy the running script next to the results.  shutil.copy replaces the
    # former `os.system('cp ...')` shell string, which broke on paths containing
    # spaces or shell metacharacters.  A failed copy stays non-fatal, as before.
    try:
        shutil.copy(script_name, working_dir)
    except OSError as err:
        print('WARNING: could not copy {} to {}: {}'.format(script_name, working_dir, err))

    ## Initialize model (built once, moved to the GPU only when one is available)
    model = ConvNet(args.kernel_size, args.stride, args.padding,
                    args.ks_pool, args.str_pool, args.pad_pool)
    if torch.cuda.is_available():
        model = model.cuda()

    ## log model/training params to file
    LogFile = utils.LogFile(args, model, working_dir)

    ## Loss and optimizer
    # padding (label 0) is NOT ignored by the loss; pass ignore_index=0 to exclude it
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    # Train the model
    step = -1  # running count of processed batches (0-based)
    loss_list = []
    acc_list = []
    valid_loss_list = []
    valid_acc_list = []

    for epoch in range(args.num_epochs):
        for train_ds in range(0, 10):
            f = args.data
            name = 'train/train_{}'.format(train_ds)
            train = Prepare_Data(f, name)
            train_batches = DataLoader(train, batch_size=args.batch_size,
                                       drop_last=True, shuffle=True)

            for i, batch in enumerate(train_batches):
                step += 1

                # Make sure the model is in training mode *before* the forward
                # pass (the original only did this after computing the loss).
                model.train()

                # one hot encode
                batch = utils.to_one_hot(batch)
                # swap dims 1 and 2 so that the one-hot (amino acid) axis is the channel axis
                batch = torch.transpose(batch, 1, 2)

                ## Run the forward pass ##
                # original note: output should be [10, 25, 502], the 25 being class probabilities
                out = model(batch)

                # The input is its own target: convert the one-hot batch back to integer labels.
                batch_labels = utils.from_one_hot(batch)

                ## loss ##
                loss = criterion(out, batch_labels)
                loss_list.append(loss.item())

                ## clear gradient accumulators, backprop and perform Adam optimisation ##
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                ## Track the accuracy ##
                if i % 50 == 0:
                    acc = TrainingEval.get_acc(out, batch_labels)
                    acc_list.append(acc)
                    TrainingEval.save_metrics(acc, loss.item(), step, epoch)
                    print('Epoch [{}/{}], Step: {}, Loss: {:.4f}, Accuracy: {:.4f}%'
                          .format(epoch + 1, args.num_epochs, step, loss.item(), acc*100))

                ## Validation ##
                if i % 1000 == 0:
                    val_loss, val_acc, conf_matrix = Validation.get_performance(
                        model, criterion, confusion_matrix=True)
                    Validation.save(val_acc, val_loss, epoch, step)

                    # add to list for fast plotting
                    valid_loss_list.append(val_loss)
                    valid_acc_list.append(val_acc)
                    print('Validation: Loss: {:.4f}, Accuracy: {:.4f}%\n'
                          .format(val_loss, val_acc*100))

                    # plot
                    TrainingEval.plot_metrics(acc_list, loss_list,
                                              valid_acc_list, valid_loss_list, epoch)
                    Validation.plot_confusion_matrix(conf_matrix)
                    Validation.plot_per_class(conf_matrix)

            # Save the model after every fifth training sub-dataset.
            # NOTE(review): whether the three calls after save_checkpoint belong
            # inside this `if` or after the training loops could not be
            # determined from the source -- confirm.
            if train_ds % 5 == 0:
                utils.save_checkpoint(model, optimizer, epoch, train_ds,
                                      loss_list, acc_list, working_dir)
                utils.save_final_model(model, working_dir)
                LogFile.log_saved_model(step)
                LogFile.log_performance(acc, loss.item(), ds_type='Training')

    if args.testing:
        f = args.data
        name = 'test/test_1'
        test = Prepare_Data(f, name)
        test_batches = DataLoader(test, batch_size=args.batch_size,
                                  drop_last=True, shuffle=True)
        Test = utils.Metrics(test_batches, working_dir, 'test')

        test_loss, test_acc, conf_matrix = Test.get_performance(
            model, criterion, confusion_matrix=True)
        # epoch/step of -1 mark this as the post-training test run
        Test.save(test_acc, test_loss, epoch=-1, step=-1)
        Test.plot_confusion_matrix(conf_matrix)
        Test.save_conf_matrix(conf_matrix)
        Test.plot_per_class(conf_matrix)
        LogFile.log_performance(test_acc, test_loss, ds_type='Test')
def get_performance(self, model, criterion, confusion_matrix=False, pos_acc=False, debug=False):
    """Evaluate ``model`` on every sub-dataset stored under ``self.ds_type`` in ``self.data``.

    The sub-datasets of the HDF5 file ``self.data`` are visited in random
    order.  Each batch is one-hot encoded and fed through the model, and the
    batch itself (converted back to integer labels) is used as the target.
    The model is put in eval mode for the evaluation and switched back to
    train mode before returning.

    Args:
        model: network to evaluate; called as ``model(batch)``.
        criterion: loss function, called as ``criterion(out, batch_labels)``.
        confusion_matrix: if true, accumulate and return a 25x25 confusion
            matrix over the non-padding positions.
        pos_acc: if true, also return position-specific accuracies.
        debug: forwarded to ``Prepare_Data``.

    Returns:
        A tuple whose length depends on the flags:

        * neither flag:            ``(mean_loss, mean_acc, mean_acc_pad)``
        * ``confusion_matrix``:    ``(mean_loss, mean_acc, mean_acc_pad, conf_matrix)``
        * ``pos_acc``:             ``(mean_loss, mean_acc, mean_acc_pad, N_term, C_term, N_pad)``
        * both: ``(mean_loss, mean_acc, mean_acc_pad, conf_matrix, N_term, C_term, N_pad)``

        ``mean_acc`` is the mean per-batch accuracy over non-padding
        positions (label != 0) and ``mean_acc_pad`` the same over padding
        positions (label == 0).  A mean is NaN when no batch contributed.
    """

    def _mean(values):
        # Mean of a list; NaN instead of ZeroDivisionError when nothing was collected.
        return sum(values) / len(values) if values else float('nan')

    # Original author's note: in eval mode the validation metrics fluctuate
    # strongly, attributed to parameters being updated faster than in train mode.
    model.eval()

    ## init performance stuff ##
    acc_list = []      # per-batch accuracy on non-padding positions
    loss_list = []
    acc_list_pad = []  # per-batch accuracy on padding positions
    conf_matrix = np.zeros((25, 25))

    ## position specific accuracy ##
    # torch.zeros(..., dtype=torch.int64) replaces
    # torch.from_numpy(np.zeros(..., dtype=np.int)): the `np.int` alias was
    # removed in NumPy 1.24 and raised AttributeError.
    N_term_pos = torch.zeros(500, dtype=torch.int64)  # correct non-padding predictions per position
    N_term_pad = torch.zeros(500, dtype=torch.int64)  # correct padding predictions per position
    C_term_pos = torch.zeros(20, dtype=torch.int64)   # correct predictions near the sequence end
    # per-position counts used to normalise the position accuracies
    len_seq = torch.zeros(500, dtype=torch.int64)
    len_pad = torch.zeros(500, dtype=torch.int64)
    len_seq_C_term = 0.0  # number of sequences that contributed to C_term_pos

    # Read the sub-dataset names and close the file again right away (the
    # handle used to be left open), then shuffle the visiting order.
    with h5py.File(self.data, 'r') as h5_file:
        random_ds = np.array(list(h5_file[self.ds_type].keys()))
    np.random.shuffle(random_ds)

    # loop over all datasets of that datatype (test/valid)
    for ds in random_ds:
        ds = str(self.ds_type) + '/' + ds
        prep_data = Prepare_Data(path=self.data, name=ds, debug=debug)
        batches = DataLoader(prep_data, self.batch_size, drop_last=True, shuffle=True)

        with torch.no_grad():
            for batch in batches:
                batch = to_one_hot(batch)
                # swap dims 1 and 2 so that the one-hot (amino acid) axis is the channel axis
                batch = torch.transpose(batch, 1, 2)

                ## Run the forward pass ##
                # original note: output should be [10, 25, 502], the 25 being class probabilities
                out = model(batch)

                # the input is its own target: convert one-hot back to integer labels
                batch_labels = from_one_hot(batch)

                # loss
                loss = criterion(out, batch_labels)
                loss_list.append(loss.item())

                # predicted class = arg-max over dim 1
                _, predicted = torch.max(out.data, dim=1)

                ## PREDICTIONS/ACCURACY ##
                # filter out padding (0)
                msk = batch_labels != 0
                target_msk = batch_labels[msk]
                pred_msk = predicted[msk]

                total = target_msk.shape[0]
                if total:  # skip batches made up of padding only (was a ZeroDivisionError)
                    n_correct = (pred_msk == target_msk).sum().item()
                    acc_list.append(n_correct / total)

                ## PADDINGS PREDICTIONS ##
                # filter out all but padding (=0)
                msk_pad = batch_labels == 0
                target_msk_pad = batch_labels[msk_pad]
                pred_msk_pad = predicted[msk_pad]

                total_pad = target_msk_pad.shape[0]
                if total_pad:  # skip batches without any padding (was a ZeroDivisionError)
                    n_correct_pad = (pred_msk_pad == target_msk_pad).sum().item()
                    acc_list_pad.append(n_correct_pad / total_pad)

                # confusion matrix (non-padding positions only)
                if confusion_matrix:
                    conf_matrix += cm(target_msk.view(-1).cpu(),
                                      pred_msk.view(-1).cpu(),
                                      labels=np.arange(25))

                # position specific accuracy
                if pos_acc:
                    # N-term: correct predictions per position, padding masked out
                    correct = (batch_labels == predicted)
                    correct_msk = np.logical_and(correct.cpu(), msk.cpu())
                    N_term_pos += torch.sum(correct_msk, dim=0)  # sum over the batch
                    len_seq += torch.sum(msk.cpu(), dim=0)

                    # padding: correct predictions per position, everything but padding masked out
                    correct_msk_pad = np.logical_and(correct.cpu(), msk_pad.cpu())
                    N_term_pad += torch.sum(correct_msk_pad, dim=0)
                    len_pad += torch.sum(msk_pad.cpu(), dim=0)  # how often each position is padding

                    # C-term: number of non-padding positions per sequence, shifted back by 21
                    # NOTE(review): presumably the start index of the last 20
                    # residues handed to self.bck_seq -- confirm against bck_seq.
                    bckwrds_indx = torch.sum(msk, dim=1)
                    bckwrds_indx += -21
                    try:
                        bckwrds = self.bck_seq(correct_msk, bckwrds_indx.cpu(), num_elem=20)
                        C_term_pos += torch.sum(bckwrds, dim=0)
                        len_seq_C_term += self.batch_size
                    except Exception:
                        # Deliberate best-effort (original note: "in case
                        # protein smaller than 21"): such batches are skipped.
                        # Narrowed from a bare `except` so KeyboardInterrupt
                        # and SystemExit are no longer swallowed.
                        pass

    # average over all evaluated batches
    mean_acc = _mean(acc_list)
    mean_acc_pad = _mean(acc_list_pad)
    mean_loss = _mean(loss_list)

    model.train()

    if pos_acc:
        # position specific accuracy, normalised by how often each position was seen
        N_term = np.divide(N_term_pos[:], len_seq[:])
        C_term = C_term_pos / float(len_seq_C_term)
        N_pad = np.divide(N_term_pad[:], len_pad[:])

    # the shape of the returned tuple depends on the requested extras
    if confusion_matrix and pos_acc:
        return mean_loss, mean_acc, mean_acc_pad, conf_matrix, N_term, C_term, N_pad
    elif confusion_matrix and not pos_acc:
        return mean_loss, mean_acc, mean_acc_pad, conf_matrix
    elif not confusion_matrix and pos_acc:
        return mean_loss, mean_acc, mean_acc_pad, N_term, C_term, N_pad
    else:
        return mean_loss, mean_acc, mean_acc_pad
def get_performance(self, model, criterion, confusion_matrix=False, pos_acc=False):
    """Evaluate ``model`` on the batches in ``self.batches``.

    Each batch is one-hot encoded and fed through the model, and the batch
    itself (converted back to integer labels) is used as the target.  The
    model is put in eval mode for the evaluation and switched back to train
    mode before returning.

    Args:
        model: network to evaluate; called as ``model(batch)``.
        criterion: loss function, called as ``criterion(out, batch_labels)``.
        confusion_matrix: if true, accumulate and return a 25x25 confusion
            matrix over the non-padding positions.
        pos_acc: if true, also return position-specific accuracies.

    Returns:
        A tuple whose length depends on the flags:

        * neither flag:          ``(mean_loss, mean_acc)``
        * ``confusion_matrix``:  ``(mean_loss, mean_acc, conf_matrix)``
        * ``pos_acc``:           ``(mean_loss, mean_acc, N_term, C_term)``
        * both:                  ``(mean_loss, mean_acc, conf_matrix, N_term, C_term)``

        ``mean_acc`` is the mean per-batch accuracy over non-padding
        positions (label != 0).  A mean is NaN when no batch contributed.
    """

    def _mean(values):
        # Mean of a list; NaN instead of ZeroDivisionError when nothing was collected.
        return sum(values) / len(values) if values else float('nan')

    # Original author's note: in eval mode the validation metrics fluctuate
    # strongly, attributed to parameters being updated faster than in train mode.
    model.eval()

    acc_list = []   # per-batch accuracy on non-padding positions
    loss_list = []
    conf_matrix = np.zeros((25, 25))

    ## pos specific accuracy ##
    # torch.zeros(..., dtype=torch.int64) replaces
    # torch.from_numpy(np.zeros(..., dtype=np.int)): the `np.int` alias was
    # removed in NumPy 1.24 and raised AttributeError.
    N_term_pos = torch.zeros(500, dtype=torch.int64)  # correct predictions per position
    C_term_pos = torch.zeros(20, dtype=torch.int64)   # correct predictions near the sequence end
    len_seq = torch.zeros(500, dtype=torch.int64)     # per-position count, for normalisation
    len_seq_C_term = 0.0  # number of sequences that contributed to C_term_pos

    with torch.no_grad():
        for batch in self.batches:
            batch = to_one_hot(batch)
            # swap dims 1 and 2 so that the one-hot (amino acid) axis is the channel axis
            batch = torch.transpose(batch, 1, 2)

            ## Run the forward pass ##
            # original note: output should be [10, 25, 502], the 25 being class probabilities
            out = model(batch)

            # the input is its own target: convert one-hot back to integer labels
            batch_labels = from_one_hot(batch)

            # loss
            loss = criterion(out, batch_labels)
            loss_list.append(loss.item())

            # predicted class = arg-max over dim 1
            _, predicted = torch.max(out.data, dim=1)

            # filter out padding (0)
            msk = batch_labels != 0
            target_msk = batch_labels[msk]
            pred_msk = predicted[msk]

            total = target_msk.shape[0]
            if total:  # skip batches made up of padding only (was a ZeroDivisionError)
                n_correct = (pred_msk == target_msk).sum().item()
                acc_list.append(n_correct / total)

            # confusion matrix (non-padding positions only)
            if confusion_matrix:
                conf_matrix += cm(target_msk.view(-1).cpu(),
                                  pred_msk.view(-1).cpu(),
                                  labels=np.arange(25))

            # position specific accuracy
            if pos_acc:
                correct = (batch_labels == predicted)
                correct_msk = np.logical_and(correct.cpu(), msk.cpu())  # mask out padding
                N_term_pos += torch.sum(correct_msk, dim=0)  # sum over the batch
                len_seq += torch.sum(msk.cpu(), dim=0)

                # number of non-padding positions per sequence, shifted back by 21
                # NOTE(review): presumably the start index of the last 20
                # residues handed to self.bck_seq -- confirm against bck_seq.
                bckwrds_indx = torch.sum(msk, dim=1)
                bckwrds_indx += -21
                try:
                    bckwrds = self.bck_seq(correct_msk, bckwrds_indx.cpu(), num_elem=20)
                    C_term_pos += torch.sum(bckwrds, dim=0)
                    # The original added a bare `batch_size`, a name not
                    # defined in this function; the NameError was swallowed
                    # by the bare `except`, leaving the normaliser at 0.
                    # The number of sequences in the batch (size of dim 0)
                    # is used instead.
                    # NOTE(review): equals the loader's batch size only if
                    # the loader drops incomplete batches -- confirm.
                    len_seq_C_term += batch_labels.shape[0]
                except Exception:
                    # Deliberate best-effort (original note: "in case protein
                    # smaller than 21"): such batches are skipped.  Narrowed
                    # from a bare `except` so KeyboardInterrupt and SystemExit
                    # are no longer swallowed.
                    pass

    # average over all evaluated batches
    mean_acc = _mean(acc_list)
    mean_loss = _mean(loss_list)

    model.train()

    if pos_acc:
        # position specific accuracy, normalised by how often each position was seen
        N_term = np.divide(N_term_pos[:], len_seq[:])
        C_term = C_term_pos / float(len_seq_C_term)

    # the shape of the returned tuple depends on the requested extras
    if confusion_matrix and pos_acc:
        return mean_loss, mean_acc, conf_matrix, N_term, C_term
    elif confusion_matrix and not pos_acc:
        return mean_loss, mean_acc, conf_matrix
    elif not confusion_matrix and pos_acc:
        return mean_loss, mean_acc, N_term, C_term
    else:
        return mean_loss, mean_acc
def main():
    """Train the ``ConvNet`` defined in ``nn_models/<args.nn_model>``.

    Reads all settings from the module-level ``parser``.  The network
    architecture is loaded dynamically from the given model file, optionally
    restored from the checkpoint ``args.restart``, and trained on the
    sub-datasets under ``train`` in the HDF5 file ``args.data``.  The
    sub-dataset order is shuffled once and reused for every epoch.  When
    ``args.testing`` is set, the model is evaluated on the test data.

    NOTE(review): the indentation of this function was lost in the source this
    was reconstructed from.  Accuracy tracking, validation and checkpointing
    are placed once per training sub-dataset (after its batches), and the test
    run after all training -- confirm against the intended behaviour.
    """
    import shutil  # local import: only needed for the model-file copy below

    ## get model/training params ##
    args = parser.parse_args()

    ## specify name of output dir ##
    # the dir itself is created when TrainingMetrics is initialised
    if args.debug:
        top_working_dir = 'debugging'
    elif args.out_dir is not None:
        top_working_dir = args.out_dir
    else:
        top_working_dir = str(args.nn_model.split(".py")[0])

    ## Initialize training metrics (simultaneously creates working_dir) ##
    TrainingEval = utils.TrainingMetrics(top_working_dir, args.restart)
    working_dir = TrainingEval.working_dir

    ## Initialize validation metrics ##
    Validation = utils.PerformMetrics(args.data, working_dir,
                                      args.batch_size, 'validation')

    ## Initialise test metrics ##
    if args.testing:
        Test = utils.PerformMetrics(args.data, working_dir,
                                    args.batch_size, 'test')

    ## Logging of scripts, models and params ##
    # Copy the model definition next to the results.  shutil.copy replaces the
    # former `os.system('cp ...')` shell string, which broke on paths containing
    # spaces or shell metacharacters.  A failed copy stays non-fatal, as before.
    model_src = os.path.join('nn_models', args.nn_model)
    try:
        shutil.copy(model_src, working_dir)
    except OSError as err:
        print('WARNING: could not copy {} to {}: {}'.format(model_src, working_dir, err),
              flush=True)

    ## Load nn model architecture from file ##
    path = './nn_models/' + args.nn_model
    spec = importlib.util.spec_from_file_location('nn_module', path)
    nn_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(nn_module)
    model = nn_module.ConvNet(args.kernel_size, args.stride, args.padding,
                              args.ks_pool, args.str_pool, args.pad_pool)

    # CUDA
    if torch.cuda.is_available():
        model = model.cuda()

    # initialise optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    # load from restart file (original note: params are converted to cuda while loading)
    if args.restart is not None:
        # NOTE(review): the restored `train_idx` is overwritten by the
        # enumerate() below, so a restart always resumes at the first
        # sub-dataset of epoch `epoch_start`.
        model, optimizer, epoch_start, train_idx, loss_list, acc_list = \
            utils.load_checkpoint(model, optimizer, filename=args.restart)
        print('loaded checkpoint model', flush=True)
    else:
        loss_list = []
        acc_list = []
        epoch_start = 0

    # log model/training params to file
    LogFile = utils.LogFile(args, model, working_dir)

    ## Loss ##
    # padding (label 0) is NOT ignored by the loss; pass ignore_index=0 to exclude it
    criterion = nn.CrossEntropyLoss()

    # Train the model
    nr_of_batches = -1  # running count of processed batches (0-based), for logging
    valid_loss_list = []
    valid_acc_list = []

    # Shuffle the order of the training sub-datasets once.  The file is only
    # needed for the names and is closed again right away (the handle used to
    # be left open).
    with h5py.File(args.data, 'r') as h5_file:
        random_ds = np.array(list(h5_file['train'].keys()))
    np.random.shuffle(random_ds)

    # loop over entire training set multiple times
    for epoch in range(epoch_start, args.num_epochs):
        # loop over sub training sets (for memory reasons)
        for train_idx, sub_name in enumerate(random_ds):
            # load data
            f = args.data
            name = 'train/{}'.format((sub_name))
            train = utils.Prepare_Data(f, name, debug=args.debug)

            # make batches of the data
            train_batches = DataLoader(train, batch_size=args.batch_size,
                                       drop_last=True, shuffle=True)

            for i, batch in enumerate(train_batches):
                nr_of_batches += 1

                # one hot encode
                batch = utils.to_one_hot(batch)
                # swap dims 1 and 2 so that the one-hot (amino acid) axis is the channel axis
                batch = torch.transpose(batch, 1, 2)

                ## Run the forward pass ##
                # original note: output should be [10, 25, 502], the 25 being class probabilities
                out = model(batch)

                # the input is its own target: convert one-hot back to integer labels
                batch_labels = utils.from_one_hot(batch)

                ## loss ##
                loss = criterion(out, batch_labels)
                loss_list.append(loss.item())

                ## switch model to training mode, clear gradient accumulators ##
                model.train()
                optimizer.zero_grad()

                ## Backprop and perform Adam optimisation ##
                loss.backward()
                optimizer.step()

            ## Track the training accuracy ##
            # Uses the last batch of the sub-dataset.  The original condition
            # `train_idx % 1 == 0` was always true and has been dropped.
            acc = TrainingEval.get_acc(out, batch_labels)
            acc_list.append(acc)
            TrainingEval.save_metrics(acc, loss.item(), nr_of_batches, epoch)
            print(
                'Epoch [{}/{}], sub training set: {} , nr_batches: {}, Loss: {:.4f}, Accuracy: {:.4f}%'
                .format(epoch, args.num_epochs, train_idx, nr_of_batches,
                        loss.item(), acc * 100),
                flush=True)

            ## Validation (every fifth sub-dataset) ##
            if train_idx % 5 == 0:
                # get nn model performance on valid set
                val_loss, val_acc, val_acc_pad, N_term, C_term, N_pad = \
                    Validation.get_performance(model, criterion,
                                               pos_acc=True, debug=args.debug)

                # save validation metrics to file
                Validation.save(val_acc, val_loss, val_acc_pad, epoch, nr_of_batches)

                # add to list for fast plotting
                valid_loss_list.append(val_loss)
                valid_acc_list.append(val_acc)
                print('Validation: Loss: {:.4f}, Accuracy: {:.4f}%\n'.format(
                    val_loss, val_acc * 100), flush=True)

                # plot
                TrainingEval.plot_metrics(acc_list, loss_list,
                                          valid_acc_list, valid_loss_list, epoch)

            ## Checkpoint (every fifth sub-dataset) ##
            if train_idx % 5 == 0:
                # save nn model as checkpoint to restart from
                utils.save_checkpoint(model, optimizer, epoch, train_idx,
                                      loss_list, acc_list, working_dir)

                # log current training status to log file
                LogFile.log_saved_model(steps=nr_of_batches)
                LogFile.log_performance(acc, loss.item(), ds_type='Training')

    # test nn model on test data set
    if args.testing:
        # get performance of current nn model on test data
        test_loss, test_acc, test_acc_pad, conf_matrix, N_term, C_term, N_pad = \
            Test.get_performance(model, criterion,
                                 confusion_matrix=True,
                                 pos_acc=True,
                                 debug=args.debug)

        # save test set metrics of nn model
        Test.save(test_acc, test_loss, test_acc_pad,
                  epoch=epoch, step=nr_of_batches)

        # plots of different model analyses
        Test.plot_confusion_matrix(conf_matrix)
        Test.save_conf_matrix(conf_matrix)
        # performance per amino acid type
        Test.plot_per_class(conf_matrix)
        # positional accuracy (computed from N_term, C_term and N_pad)
        Test.plot_pos_acc(N_term, C_term, N_pad)

        # log test metrics in log file
        LogFile.log_performance(test_acc, test_loss, ds_type='Test')