def main() -> None:
    """Train ``ConvNet`` on the ``train/train_<k>`` sub datasets, then optionally test it.

    Command-line options are read through the module-level ``parser``.
    For every epoch the ten sub datasets ``train/train_0`` .. ``train/train_9``
    are loaded one at a time from ``args.data``.  Each batch is one-hot
    encoded, transposed (dims 1 and 2) and fed to the model; the labels are
    recovered from the same one-hot batch, so the model is trained to
    reproduce its own input.

    Side effects visible in this function: a copy of the running script is
    placed in the working directory reported by ``utils.TrainingMetrics``,
    training/validation metrics are saved and plotted through the ``utils``
    helpers, and a checkpoint plus a "final" model are written after every
    fifth sub dataset.  When ``args.testing`` is set, ``test/test_1`` is
    evaluated once after training.
    """
    import shutil  # function-scope import: only needed for the script snapshot

    ## get model/training params
    args = parser.parse_args()
    if args.debug:
        print('==== DEBUGGING MODE ====')

    # Name of this script; used to name the output of the run.
    script_path = os.path.abspath(__file__)
    script_name = os.path.basename(script_path)

    ## Initialize metrics ##
    TrainingEval = utils.TrainingMetrics(script_name)
    working_dir = TrainingEval.working_dir

    valid = Prepare_Data(args.data, 'valid/valid')
    valid_batches = DataLoader(valid, args.batch_size, drop_last=True,
                               shuffle=True)
    Validation = utils.Metrics(valid_batches, working_dir, 'validation')

    # Copy the running script to the working dir.  The absolute path is used
    # so the copy also works when the script is launched from another
    # directory, and no shell is involved.  Still best-effort: a failed copy
    # must not abort training.
    try:
        shutil.copy(script_path, working_dir)
    except OSError as err:
        print('WARNING: could not copy {} to {}: {}'.format(
            script_path, working_dir, err))

    ## Initialize model
    model = ConvNet(args.kernel_size, args.stride, args.padding,
                    args.ks_pool, args.str_pool, args.pad_pool)
    if torch.cuda.is_available():
        model = model.cuda()

    ## log model/training params to file
    LogFile = utils.LogFile(args, model, working_dir)

    ## Loss and optimizer
    # The loss does not ignore padding (no ignore_index=0 is passed).
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    # Train the model
    step = -1  # number of batches processed so far, 0-based
    loss_list = []
    acc_list = []
    valid_loss_list = []
    valid_acc_list = []

    # Stay None until the first batch has been processed, so the logging
    # below cannot hit an unbound name if a sub dataset yields no batch.
    acc = None
    loss = None

    for epoch in range(args.num_epochs):
        for train_ds in range(0, 10):
            train = Prepare_Data(args.data, 'train/train_{}'.format(train_ds))
            train_batches = DataLoader(train, batch_size=args.batch_size,
                                       drop_last=True, shuffle=True)

            for i, batch in enumerate(train_batches):
                step += 1

                # Switch to training mode BEFORE the forward pass.
                # NOTE(review): Validation.get_performance presumably puts
                # the model in eval mode; the original only switched back
                # after the forward pass of the next batch.
                model.train()

                # one hot encode
                batch = utils.to_one_hot(batch)
                # transpose dims 1 and 2 => channels = amino acids
                batch = torch.transpose(batch, 1, 2)

                ## Run the forward pass ##
                # Original author's note: output holds probabilities and
                # should be [10, 25, 502], the 25 being the probabilities.
                out = model(batch)

                # Convert back from one hot to integer amino-acid labels
                # (original note: labels known with 100% certainty).
                batch_labels = utils.from_one_hot(batch)

                ## loss ##
                loss = criterion(out, batch_labels)
                loss_list.append(loss.item())

                ## Clear gradients, backprop and perform Adam optimisation ##
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                ## Track the accuracy ##
                if i % 50 == 0:
                    acc = TrainingEval.get_acc(out, batch_labels)
                    acc_list.append(acc)
                    TrainingEval.save_metrics(acc, loss.item(), step, epoch)
                    print('Epoch [{}/{}], Step: {}, Loss: {:.4f}, Accuracy: {:.4f}%'
                          .format(epoch + 1, args.num_epochs, step,
                                  loss.item(), acc * 100))

                ## Validation ##
                if i % 1000 == 0:
                    val_loss, val_acc, conf_matrix = Validation.get_performance(
                        model, criterion, confusion_matrix=True)
                    Validation.save(val_acc, val_loss, epoch, step)

                    # add to list for fast plotting
                    valid_loss_list.append(val_loss)
                    valid_acc_list.append(val_acc)
                    print('Validation: Loss: {:.4f}, Accuracy: {:.4f}%\n'
                          .format(val_loss, val_acc * 100))

                    # plot
                    TrainingEval.plot_metrics(acc_list, loss_list,
                                              valid_acc_list, valid_loss_list,
                                              epoch)
                    Validation.plot_confusion_matrix(conf_matrix)
                    Validation.plot_per_class(conf_matrix)

            # Save the model after every fifth sub dataset (train_ds 0 and 5).
            # NOTE(review): nesting reconstructed from whitespace-mangled
            # source; confirm this block runs once per sub dataset.
            if train_ds % 5 == 0:
                utils.save_checkpoint(model, optimizer, epoch, train_ds,
                                      loss_list, acc_list, working_dir)
                utils.save_final_model(model, working_dir)
                LogFile.log_saved_model(step)
                if loss is not None:
                    LogFile.log_performance(acc, loss.item(),
                                            ds_type='Training')

    if args.testing:
        test = Prepare_Data(args.data, 'test/test_1')
        test_batches = DataLoader(test, batch_size=args.batch_size,
                                  drop_last=True, shuffle=True)
        Test = utils.Metrics(test_batches, working_dir, 'test')

        test_loss, test_acc, conf_matrix = Test.get_performance(
            model, criterion, confusion_matrix=True)
        # epoch/step of -1 mark the entry as the post-training test run.
        Test.save(test_acc, test_loss, epoch=-1, step=-1)
        Test.plot_confusion_matrix(conf_matrix)
        Test.save_conf_matrix(conf_matrix)
        Test.plot_per_class(conf_matrix)
        LogFile.log_performance(test_acc, test_loss, ds_type='Test')
def main() -> None:
    """Train a dynamically loaded ``ConvNet``, with optional restart and test run.

    Command-line options are read through the module-level ``parser``.
    The model class is loaded from ``./nn_models/<args.nn_model>`` and a copy
    of that file is placed in the working directory reported by
    ``utils.TrainingMetrics``.  When ``args.restart`` is given, model,
    optimizer, start epoch and the loss/accuracy histories are restored via
    ``utils.load_checkpoint``.

    The names of the sub datasets under the ``train`` group of the HDF5 file
    ``args.data`` are shuffled once; every epoch then visits them in that
    order.  Each batch is one-hot encoded, transposed (dims 1 and 2) and fed
    to the model; the labels are recovered from the same one-hot batch.
    Validation and checkpointing run after every fifth sub dataset.  When
    ``args.testing`` is set, the test set is evaluated once after training.
    """
    import shutil  # function-scope import: only needed for the model snapshot

    ## get model/training params ##
    args = parser.parse_args()

    ## specify name of output dir ##
    # The dir itself is created when TrainingMetrics is initialised.
    if args.debug:
        top_working_dir = 'debugging'
    elif args.out_dir is not None:
        top_working_dir = args.out_dir
    else:
        # Strip only a trailing ".py"; splitting on ".py" would truncate
        # names that contain ".py" elsewhere (e.g. "net.pyramid.py").
        model_file = str(args.nn_model)
        if model_file.endswith('.py'):
            top_working_dir = model_file[:-len('.py')]
        else:
            top_working_dir = model_file

    ## Initialize training metrics ##
    # simultaneously creates working_dir
    TrainingEval = utils.TrainingMetrics(top_working_dir, args.restart)
    working_dir = TrainingEval.working_dir

    ## Initialize validation metrics ##
    Validation = utils.PerformMetrics(args.data, working_dir, args.batch_size,
                                      'validation')

    ## Initialise test metrics ##
    if args.testing:
        Test = utils.PerformMetrics(args.data, working_dir, args.batch_size,
                                    'test')

    ## Logging of scripts, models and params ##
    path = './nn_models/' + args.nn_model
    # Copy the model script to the working dir without going through a
    # shell (args.nn_model comes from the command line).  Best-effort: a
    # failed copy only warns.
    try:
        shutil.copy(path, working_dir)
    except OSError as err:
        print('WARNING: could not copy {} to {}: {}'.format(
            path, working_dir, err), flush=True)

    ## Load nn model architecture ##
    spec = importlib.util.spec_from_file_location('nn_module', path)
    nn_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(nn_module)
    model = nn_module.ConvNet(args.kernel_size, args.stride, args.padding,
                              args.ks_pool, args.str_pool, args.pad_pool)

    # CUDA
    if torch.cuda.is_available():
        model = model.cuda()

    # initialise optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    # Load from restart file (original note: params are converted to cuda
    # during loading).
    if args.restart is not None:
        # The stored sub-dataset index is not used: the loop below always
        # starts again at the first sub dataset of ``epoch_start``.
        (model, optimizer, epoch_start, _saved_train_idx,
         loss_list, acc_list) = utils.load_checkpoint(
             model, optimizer, filename=args.restart)
        print('loaded checkpoint model', flush=True)
    else:
        loss_list = []
        acc_list = []
        epoch_start = 0

    # log model/training params to file
    LogFile = utils.LogFile(args, model, working_dir)

    ## Loss
    # The loss does not ignore padding (no ignore_index=0 is passed).
    criterion = nn.CrossEntropyLoss()

    # Train the model
    nr_of_batches = -1  # count batches for logging, 0-based
    valid_loss_list = []
    valid_acc_list = []

    # Read the sub-dataset names and close the HDF5 file again right away;
    # the original left the handle open for the whole run.
    with h5py.File(args.data, 'r') as h5_file:
        random_ds = np.array(list(h5_file['train'].keys()))
    np.random.shuffle(random_ds)  # shuffled once, same order every epoch

    # Fallback so the test section can still report an epoch when the range
    # below is empty (e.g. restarting from the last epoch).
    epoch = epoch_start

    # loop over entire training set multiple times
    for epoch in range(epoch_start, args.num_epochs):
        # loop over sub training sets (for memory reasons)
        for train_idx, sub_name in enumerate(random_ds):
            # load data
            train = utils.Prepare_Data(args.data,
                                       'train/{}'.format(sub_name),
                                       debug=args.debug)
            # make batches of the data
            train_batches = DataLoader(train, batch_size=args.batch_size,
                                       drop_last=True, shuffle=True)

            loss = None  # stays None if this sub dataset yields no batch
            for batch in train_batches:
                nr_of_batches += 1

                # Switch to training mode BEFORE the forward pass.
                # NOTE(review): Validation.get_performance presumably puts
                # the model in eval mode; the original only switched back
                # after the forward pass of the next batch.
                model.train()

                # one hot encode
                batch = utils.to_one_hot(batch)
                # transpose dims 1 and 2 => channels = amino acids
                batch = torch.transpose(batch, 1, 2)

                ## Run the forward pass ##
                # Original author's note: output holds probabilities and
                # should be [10, 25, 502], the 25 being the probabilities.
                out = model(batch)

                # Convert back from one hot to integer amino-acid labels
                # (original note: labels known with 100% certainty).
                batch_labels = utils.from_one_hot(batch)

                ## loss ##
                loss = criterion(out, batch_labels)
                loss_list.append(loss.item())

                ## Clear gradients, backprop and perform Adam optimisation ##
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # NOTE(review): nesting reconstructed from whitespace-mangled
            # source.  Everything below is taken to run once per sub
            # dataset, using the last batch of that sub dataset; confirm
            # against the original layout.
            if loss is None:
                # drop_last=True can leave a small sub dataset without any
                # batch; there is nothing to report for it.
                print('WARNING: no batches in sub training set {}'.format(
                    sub_name), flush=True)
                continue

            ## Track the training accuracy ##
            # (the original guard ``train_idx % 1 == 0`` was always true)
            acc = TrainingEval.get_acc(out, batch_labels)
            acc_list.append(acc)
            TrainingEval.save_metrics(acc, loss.item(), nr_of_batches, epoch)
            print(
                'Epoch [{}/{}], sub training set: {} , nr_batches: {}, Loss: {:.4f}, Accuracy: {:.4f}%'
                .format(epoch, args.num_epochs, train_idx, nr_of_batches,
                        loss.item(), acc * 100),
                flush=True)

            ## Validation ##
            if train_idx % 5 == 0:
                # get nn model performance on valid set
                (val_loss, val_acc, val_acc_pad,
                 N_term, C_term, N_pad) = Validation.get_performance(
                     model, criterion, pos_acc=True, debug=args.debug)

                # save validation metrics to file
                Validation.save(val_acc, val_loss, val_acc_pad, epoch,
                                nr_of_batches)

                # add to list for fast plotting
                valid_loss_list.append(val_loss)
                valid_acc_list.append(val_acc)
                print('Validation: Loss: {:.4f}, Accuracy: {:.4f}%\n'.format(
                    val_loss, val_acc * 100), flush=True)

                # plot
                TrainingEval.plot_metrics(acc_list, loss_list, valid_acc_list,
                                          valid_loss_list, epoch)

            # Save the model after every fifth sub dataset.
            if train_idx % 5 == 0:
                # save nn model as checkpoint to restart from
                utils.save_checkpoint(model, optimizer, epoch, train_idx,
                                      loss_list, acc_list, working_dir)

                # log current training status to log file
                LogFile.log_saved_model(steps=nr_of_batches)
                LogFile.log_performance(acc, loss.item(), ds_type='Training')

    # test nn model on test data set
    if args.testing:
        # get performance of current nn model on test data
        (test_loss, test_acc, test_acc_pad, conf_matrix,
         N_term, C_term, N_pad) = Test.get_performance(
             model, criterion, confusion_matrix=True, pos_acc=True,
             debug=args.debug)

        # save test set metrics of nn model
        Test.save(test_acc, test_loss, test_acc_pad, epoch=epoch,
                  step=nr_of_batches)

        # plots different model analyses
        Test.plot_confusion_matrix(conf_matrix)
        Test.save_conf_matrix(conf_matrix)

        # plot performance prediction on each aa type
        Test.plot_per_class(conf_matrix)

        # plot positional accuracy, i.e. how well it predicts from the
        # N-terminus and the C-terminus
        Test.plot_pos_acc(N_term, C_term, N_pad)

        # log test metrics in log file
        LogFile.log_performance(test_acc, test_loss, ds_type='Test')