def cross_val(args):
    """Score the 5-fold model ensemble on each allele's held-out test split.

    For every peptide length (10, then 9) and every allele, the same 10%
    hold-out split used at training time (``random_state=42``) is rebuilt,
    the five saved fold models under ``<savedir>/best_model/<allele>`` are
    averaged, and Pearson / AUC / SRCC are appended to
    ``<savedir>/<testFile>``.  Raw (true, predicted) IC50 pairs go to
    ``<savedir>/<predict>``.
    """
    torch.set_default_tensor_type('torch.DoubleTensor')
    # Alleles with enough 9-mer / 10-mer binding data to evaluate.
    allele_list_9 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*02:03',
        'HLA-B*15:01', 'HLA-A*31:01', 'HLA-A*01:01', 'HLA-B*07:02',
        'HLA-A*26:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-B*08:01',
        'HLA-B*58:01', 'HLA-B*40:01', 'HLA-B*27:05', 'HLA-A*30:01',
        'HLA-A*69:01', 'HLA-B*57:01', 'HLA-B*35:01', 'HLA-A*02:02',
        'HLA-A*24:02', 'HLA-B*18:01', 'HLA-B*51:01', 'HLA-A*29:02',
        'HLA-A*68:01', 'HLA-A*33:01', 'HLA-A*23:01'
    ]
    allele_list_10 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*68:01',
        'HLA-A*31:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-A*02:03',
        'HLA-A*33:01', 'HLA-A*02:02'
    ]
    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)
    logFileLoc = args.savedir + os.sep + args.testFile
    # Append if the log already exists; either way (re)write the header.
    # (The original duplicated this open/write/flush in both branches.)
    logger = open(logFileLoc, 'a' if os.path.isfile(logFileLoc) else 'w')
    logger.write("%s\t%s\t\t\t%s\t\t\t%s\t\t\t%s\n" %
                 ('Length', 'Allele', 'Pearson', 'AUC', 'SRCC'))
    logger.flush()
    for length in [10, 9]:
        if length == 9:
            allele_list = allele_list_9
        elif length == 10:
            allele_list = allele_list_10
        else:  # unreachable with the literal list above; kept as a guard
            print("Invalid Length")
            exit(0)
        for allele in allele_list:
            model_dir = args.savedir + os.sep + 'best_model' + os.sep + allele
            if not os.path.isdir(model_dir):
                os.makedirs(model_dir)
            # Pickled dict with keys: 'allele', 'sequ_length',
            # 'channel_encode' (model inputs) and 'label' (IC50, nM).
            pickle_path = (args.data_dir + os.sep + 'pickle_' + str(length) +
                           os.sep +
                           allele.replace('*', '.').replace(':', '_') + '.p')
            with open(pickle_path, 'rb') as f:  # original leaked this handle
                data_dict = pickle.load(f)
            print('test on allele: ' + data_dict['allele'])
            if not length == data_dict['sequ_length']:
                print('length error')
                exit()
            encode_channel = data_dict['channel_encode']
            meas = data_dict['label']
            # IC50 (nM) -> regression target -log10(IC50).
            bind = [(-1) * math.log10(i) for i in meas]
            sequ, label = encode_channel, bind
            if len(sequ) > 5:  # need a few samples for a meaningful split
                # Rebuild the training-time 10% hold-out; only the held-out
                # part is evaluated here.
                train_sequ_ori, test_sequ_ori, train_label_ori, test_label_ori = \
                    train_test_split(sequ, label, test_size=0.1,
                                     random_state=42, shuffle=True)
                sequ_ori, label_ori = test_sequ_ori, test_label_ori
                output_list = []
                label_list = []
                # BUGFIX: the original used shuffle=True here.  Each val()
                # pass then visited the test set in a different random order,
                # so the element-wise sum below averaged predictions of
                # DIFFERENT peptides and the result no longer lined up with
                # `label`.  Evaluation must use a fixed order.
                test_data_load = torch.utils.data.DataLoader(
                    myDataLoader.MyDataset(sequ_ori, label_ori),
                    batch_size=args.batch_size,
                    shuffle=False,
                    num_workers=args.num_workers,
                    pin_memory=True)
                model = net.ResNetC1()
                if args.onGPU == True:
                    model = model.cuda()
                criteria = MSELoss()
                if args.onGPU == True:
                    criteria = criteria.cuda()
                # Ensemble: average the predictions of the five fold models.
                output_sum, label = [], []
                for fold_num in range(1, 6):
                    best_model_dict = torch.load(
                        model_dir + os.sep + allele + '_' + str(length) +
                        '_' + str(fold_num) + '.pth')
                    model.load_state_dict(best_model_dict)
                    _, _, output, label = val(args, test_data_load, model,
                                              criteria)
                    if not output_sum:
                        output_sum.extend(output)
                    else:
                        output_sum = [output_sum[i] + output[i]
                                      for i in range(len(output_sum))]
                final_out = [s / 5 for s in output_sum]
                output_list.extend(final_out)
                label_list.extend(label)
                # Back-transform to IC50 (nM); 500 nM is the usual binder
                # threshold.
                IC_output_list = [math.pow(10, (-1) * v) for v in output_list]
                IC_label_list = [math.pow(10, (-1) * v) for v in label_list]
                bi_output_list = [1 if ic < 500 else 0 for ic in IC_output_list]
                bi_label_list = [1 if ic < 500 else 0 for ic in IC_label_list]
                pearson = pearsonr(IC_output_list, IC_label_list)
                # NOTE(review): AUC is computed from binarized predictions
                # rather than raw scores, which discards ranking information.
                auc = roc_auc_score(bi_label_list, bi_output_list)
                srcc = spearmanr(IC_output_list, IC_label_list)
                logger.write("%s\t%s\t\t%.4f\t\t\t%.4f\t\t\t%.4f\n" %
                             (length, allele, pearson[0], auc, srcc[0]))
                logger.flush()
                # Dump (true, predicted) IC50 pairs for this allele.
                prediction = args.savedir + os.sep + args.predict
                append_write = 'a' if os.path.exists(prediction) else 'w'
                with open(prediction, append_write) as true_value:
                    true_value.write("%s\n" % (allele))
                    for i in range(len(output_list)):
                        true_value.write("%.4f\t%.4f\n" %
                                         (IC_label_list[i], IC_output_list[i]))
                    true_value.flush()
    logger.close()
def cross_val(args):
    """Cross-validated evaluation: score each saved fold model on its fold.

    Rebuilds the same 5-fold split used during training (``KFold`` seeded
    with 42), loads the best checkpoint of each fold, evaluates it on that
    fold's test set, and logs the best cumulative Pearson per allele to
    ``<savedir>/<testFile>``.
    """
    torch.set_default_tensor_type('torch.DoubleTensor')
    # Alleles with enough 9-mer / 10-mer binding data to evaluate.
    allele_list_9 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*02:03',
        'HLA-B*15:01', 'HLA-A*31:01', 'HLA-A*01:01', 'HLA-B*07:02',
        'HLA-A*26:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-B*08:01',
        'HLA-B*58:01', 'HLA-B*40:01', 'HLA-B*27:05', 'HLA-A*30:01',
        'HLA-A*69:01', 'HLA-B*57:01', 'HLA-B*35:01', 'HLA-A*02:02',
        'HLA-A*24:02', 'HLA-B*18:01', 'HLA-B*51:01', 'HLA-A*29:02',
        'HLA-A*68:01', 'HLA-A*33:01', 'HLA-A*23:01'
    ]
    allele_list_10 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*68:01',
        'HLA-A*31:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-A*02:03',
        'HLA-A*33:01', 'HLA-A*02:02'
    ]
    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)
    logFileLoc = args.savedir + os.sep + args.testFile
    # Append if the log already exists; either way (re)write the header.
    # (The original duplicated this open/write/flush in both branches.)
    logger = open(logFileLoc, 'a' if os.path.isfile(logFileLoc) else 'w')
    logger.write("%s\t%s\t\t\t\t%s\n" % ('Length', 'Allele', 'Pearson'))
    logger.flush()
    for length in [10, 9]:
        if length == 9:
            allele_list = allele_list_9
        elif length == 10:
            allele_list = allele_list_10
        else:  # unreachable with the literal list above; kept as a guard
            print("Invalid Length")
            exit(0)
        for allele in allele_list:
            model_dir = args.savedir + os.sep + 'best_model' + os.sep + allele
            if not os.path.isdir(model_dir):
                os.makedirs(model_dir)
            # Pickled dict with keys: 'allele', 'sequ_length',
            # 'channel_encode' (model inputs) and 'label' (IC50, nM).
            pickle_path = (args.data_dir + os.sep + 'pickle_' + str(length) +
                           os.sep +
                           allele.replace('*', '.').replace(':', '_') + '.p')
            with open(pickle_path, 'rb') as f:  # original leaked this handle
                data_dict = pickle.load(f)
            print('val on allele: ' + data_dict['allele'])
            if not length == data_dict['sequ_length']:
                print('length error')
                exit()
            sequ = data_dict['channel_encode']
            # IC50 (nM) -> regression target -log10(IC50).
            label = [(-1) * math.log10(i) for i in data_dict['label']]
            if len(sequ) > 0:
                # One model/criterion reused across folds (weights are
                # overwritten by load_state_dict each fold anyway; the
                # original rebuilt both inside the loop).
                model = net.ResNetC1()
                if args.onGPU == True:
                    model = model.cuda()
                criteria = MSELoss()
                if args.onGPU == True:
                    criteria = criteria.cuda()
                output_list = []
                label_list = []
                fold_num = 0
                kf = KFold(n_splits=5, shuffle=True, random_state=42)
                pearson_list = []
                for train_set, test_set in kf.split(sequ, label):
                    fold_num += 1
                    # Only the test fold is needed; the original also rebuilt
                    # the train/val split and an unused val DataLoader.
                    test_sequ = [sequ[i] for i in test_set]
                    test_label = [label[i] for i in test_set]
                    test_data_load = torch.utils.data.DataLoader(
                        myDataLoader.MyDataset(test_sequ, test_label),
                        batch_size=args.batch_size,
                        shuffle=True,
                        num_workers=args.num_workers,
                        pin_memory=True)
                    best_model_dict = torch.load(
                        model_dir + os.sep + allele + '_' + str(length) +
                        '_' + str(fold_num) + '.pth')
                    model.load_state_dict(best_model_dict)
                    _, _, output, label_out = val(args, test_data_load,
                                                  model, criteria)
                    output_list.extend(output)
                    label_list.extend(label_out)
                    # Pearson over all predictions accumulated SO FAR
                    # (cumulative, not per-fold) — kept as in the original;
                    # the log reports the best of these running values.
                    pearson = pearsonr(output_list, label_list)
                    pearson_list.append(pearson[0])
                logger.write("%s\t%s\t\t\t%.4f\n" %
                             (length, allele, max(pearson_list)))
                logger.flush()
    logger.close()
def cross_val(args):
    """Pre-train one ``net.pre_train`` model per allele (10-mers only).

    For each allele, splits the data 90/10 into train/val, trains with Adam
    + StepLR and early stopping (10 epochs without val improvement), saves
    the best checkpoint to ``<savedir>/best_model/<allele>``, and logs
    per-epoch losses to ``<savedir>/<allele>.txt`` and per-allele mean
    losses to ``<savedir>/<crossValFile>``.
    """
    torch.set_default_tensor_type('torch.DoubleTensor')
    # Alleles with enough 9-mer / 10-mer binding data.
    allele_list_9 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*02:03',
        'HLA-B*15:01', 'HLA-A*31:01', 'HLA-A*01:01', 'HLA-B*07:02',
        'HLA-A*26:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-B*08:01',
        'HLA-B*58:01', 'HLA-B*40:01', 'HLA-B*27:05', 'HLA-A*30:01',
        'HLA-A*69:01', 'HLA-B*57:01', 'HLA-B*35:01', 'HLA-A*02:02',
        'HLA-A*24:02', 'HLA-B*18:01', 'HLA-B*51:01', 'HLA-A*29:02',
        'HLA-A*68:01', 'HLA-A*33:01', 'HLA-A*23:01'
    ]
    allele_list_10 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*68:01',
        'HLA-A*31:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-A*02:03',
        'HLA-A*33:01', 'HLA-A*02:02'
    ]
    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)
    if args.visualizeNet == True:
        # Dummy input matching the network's input shape; the actual graph
        # rendering (viz.make_dot) is disabled — only the parameter count
        # is reported.
        x = Variable(torch.randn(1, 5, 174, 18))
        if args.onGPU == True:
            x = x.cuda()
        model = net.pre_train()
        total_paramters = 0
        for parameter in model.parameters():
            # numel() == product of all dims (the original multiplied the
            # size entries by hand).
            total_paramters += parameter.numel()
        print('Parameters: ' + str(total_paramters))
    logFileLoc = args.savedir + os.sep + args.crossValFile
    # Append if the log already exists; either way (re)write the header.
    # (The original duplicated this open/write/flush in both branches.)
    logger = open(logFileLoc, 'a' if os.path.isfile(logFileLoc) else 'w')
    logger.write("%s\t%s\t\t\t\t%s\t\t\t\t%s\n" %
                 ('Length', 'Allele', 'train_loss', 'val_loss'))
    logger.flush()
    # NOTE(review): only 10-mers are pre-trained here; 9-mers are skipped —
    # confirm this is intentional.
    for length in [10]:
        if length == 9:
            allele_list = allele_list_9
        elif length == 10:
            allele_list = allele_list_10
        else:
            print("Invalid Length")
            exit(0)
        for allele in allele_list:
            model_dir = args.savedir + os.sep + 'best_model' + os.sep + allele
            if not os.path.isdir(model_dir):
                os.makedirs(model_dir)
            pickle_path = (args.data_dir + os.sep + 'pickle_' + str(length) +
                           os.sep +
                           allele.replace('*', '.').replace(':', '_') + '.p')
            with open(pickle_path, 'rb') as f:  # original leaked this handle
                data_dict = pickle.load(f)
            print('train on allele: ' + data_dict['allele'])
            if not length == data_dict['sequ_length']:
                print('length error')
                exit()
            encode_channel = data_dict['channel_encode']
            # NOTE(review): the training target is the channel encoding
            # itself, i.e. the model reconstructs its input
            # (autoencoder-style pre-training).  If real binding labels were
            # intended — as in the sibling cross_val variants — this should
            # read data_dict['label'] with a -log10 transform.  Confirm
            # against net.pre_train's output shape before changing.
            bind = data_dict['channel_encode']
            sequ, label = encode_channel, bind
            if len(sequ) > 0:
                # Per-allele epoch log; only a new file gets the Allele
                # header line.
                alleleLoc = args.savedir + os.sep + allele + '.txt'
                new_file = not os.path.isfile(alleleLoc)
                log = open(alleleLoc, 'w' if new_file else 'a')
                if new_file:
                    log.write("%s\t\t\t%s\n" % ('Allele', allele))
                log.write("\n")
                log.write("%s\t\t\t%s\n" % ('Length: ', length))
                log.write("%s\t\t\t\t%s\t\t\t\t%s\n" %
                          ('Epoch', 'tr_loss', 'val_loss'))
                log.flush()
                # 90/10 train/validation split (fixed seed for repeatability).
                train_sequ, val_sequ, train_label, val_label = train_test_split(
                    sequ, label, test_size=0.1, random_state=42, shuffle=True)
                train_data_load = torch.utils.data.DataLoader(
                    myDataLoader.MyDataset(train_sequ, train_label),
                    batch_size=args.batch_size,
                    shuffle=True,
                    num_workers=args.num_workers,
                    pin_memory=True)
                val_data_load = torch.utils.data.DataLoader(
                    myDataLoader.MyDataset(val_sequ, val_label),
                    batch_size=args.batch_size,
                    shuffle=True,
                    num_workers=args.num_workers,
                    pin_memory=True)
                model = net.pre_train()
                if args.onGPU == True:
                    model = torch.nn.DataParallel(
                        model, device_ids=[0, 1, 2]).cuda()
                criteria = MSELoss()
                if args.onGPU == True:
                    criteria = criteria.cuda()
                optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                             weight_decay=5e-4)
                if args.onGPU == True:
                    cudnn.benchmark = True
                # Decay LR by 10x every `step_loss` epochs.
                scheduler = torch.optim.lr_scheduler.StepLR(
                    optimizer, step_size=args.step_loss, gamma=0.1)
                start_epoch = 0
                min_val_loss = 100  # sentinel: any real MSE should beat this
                loss_not_decay = 0
                train_loss_list = []
                val_loss_list = []
                for epoch in range(start_epoch, args.max_epochs):
                    tr_epoch_loss = train(args, train_data_load, model,
                                          criteria, optimizer)
                    val_epoch_loss = val(args, val_data_load, model, criteria)
                    train_loss_list.append(tr_epoch_loss)
                    val_loss_list.append(val_epoch_loss)
                    log.write("%s\t\t\t\t%.4f\t\t\t\t%.4f\n" %
                              (epoch, tr_epoch_loss, val_epoch_loss))
                    if val_epoch_loss < min_val_loss:
                        if args.save_model == True:
                            model_file_name = (model_dir + os.sep + allele +
                                               '_' + str(length) + '.pth')
                            print('==> Saving the best model')
                            torch.save(model.state_dict(), model_file_name)
                        min_val_loss = val_epoch_loss
                        loss_not_decay = 0
                    else:
                        loss_not_decay += 1
                    # Early stop after 10 epochs without val improvement.
                    if loss_not_decay >= 10:
                        break
                    scheduler.step(epoch)
                log.close()  # original never closed the per-allele log
                allele_train_loss = sum(train_loss_list) / len(train_loss_list)
                allele_val_loss = sum(val_loss_list) / len(val_loss_list)
                logger.write("%s\t%s\t\t\t\t%.4f\t\t\t\t%.4f\n" %
                             (length, allele, allele_train_loss,
                              allele_val_loss))
                logger.flush()
    logger.close()
def cross_val(args):
    """Evaluate saved GRU fold models via 5-fold CV over a binding CSV.

    Loads the CSV at ``<data_dir>/<file_path>``, keeps human 9/10-mer rows,
    rebuilds the training-time 5-fold split per allele, evaluates each
    fold's saved ``net.GRU_net`` checkpoint on its test fold, and logs
    Pearson / AUC / SRCC to ``<savedir>/<testFile>`` plus a sample of
    (true, predicted) pairs to ``<savedir>/<predict>``.
    """
    torch.set_default_tensor_type('torch.DoubleTensor')
    csv_path = os.path.join(args.data_dir, args.file_path)
    data_ori = pd.read_csv(csv_path)
    # Keep human 9-mer and 10-mer measurements only.
    data_ori = data_ori.loc[data_ori['species'] == 'human']
    data_ori = data_ori.loc[(data_ori['peptide_length'] == 9) |
                            (data_ori['peptide_length'] == 10)]
    # Alleles with enough 9-mer / 10-mer binding data to evaluate.
    allele_list_9 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*02:03',
        'HLA-B*15:01', 'HLA-A*31:01', 'HLA-A*01:01', 'HLA-B*07:02',
        'HLA-A*26:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-B*08:01',
        'HLA-B*58:01', 'HLA-B*40:01', 'HLA-B*27:05', 'HLA-A*30:01',
        'HLA-A*69:01', 'HLA-B*57:01', 'HLA-B*35:01', 'HLA-A*02:02',
        'HLA-A*24:02', 'HLA-B*18:01', 'HLA-B*51:01', 'HLA-A*29:02',
        'HLA-A*68:01', 'HLA-A*33:01', 'HLA-A*23:01'
    ]
    allele_list_10 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*68:01',
        'HLA-A*31:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-A*02:03',
        'HLA-A*33:01', 'HLA-A*02:02'
    ]
    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)
    logFileLoc = args.savedir + os.sep + args.testFile
    # Append if the log already exists; either way (re)write the header.
    # (The original duplicated this open/write/flush in both branches.)
    logger = open(logFileLoc, 'a' if os.path.isfile(logFileLoc) else 'w')
    logger.write("%s\t%s\t\t\t%s\t\t\t%s\t\t\t%s\n" %
                 ('Length', 'Allele', 'Pearson', 'AUC', 'SRCC'))
    logger.flush()
    for length in [10, 9]:
        if length == 9:
            allele_list = allele_list_9
        elif length == 10:
            allele_list = allele_list_10
        else:  # unreachable with the literal list above; kept as a guard
            print("Invalid Length")
            exit(0)
        for allele in allele_list:
            model_dir = args.savedir + os.sep + 'best_model' + os.sep + allele
            if not os.path.isdir(model_dir):
                os.makedirs(model_dir)
            data = data_ori.loc[data_ori['peptide_length'] == length]
            # Filter with the subset's OWN mask.  The original indexed
            # `data` with a mask built on `data_ori`, which only works via
            # index alignment and breaks on reindexed frames.
            data = data.loc[data['mhc'] == allele]
            sequ = data['sequence'].values.tolist()
            matrix_list = [make_matrix(x) for x in sequ]
            meas = data['meas'].values.tolist()
            # IC50 (nM) -> regression target -log10(IC50).
            # (The original also built an unused `positive` list here.)
            bind = [(-1) * math.log10(i) for i in meas]
            sequ, label = matrix_list, bind
            if len(sequ) > 0:
                model = net.GRU_net()  # reused across folds; weights are
                if args.onGPU == True:  # overwritten by load_state_dict
                    model = model.cuda()
                criteria = MSELoss()
                if args.onGPU == True:
                    criteria = criteria.cuda()
                output_list = []
                label_list = []
                fold_num = 0
                kf = KFold(n_splits=5, shuffle=True, random_state=42)
                for train_set, test_set in kf.split(sequ, label):
                    fold_num += 1
                    # Only the test fold is used for evaluation.
                    test_sequ = [sequ[i] for i in test_set]
                    test_label = [label[i] for i in test_set]
                    test_data_load = torch.utils.data.DataLoader(
                        myDataLoader.MyDataset(test_sequ, test_label),
                        batch_size=args.batch_size,
                        shuffle=True,
                        num_workers=args.num_workers,
                        pin_memory=True)
                    best_model_dict = torch.load(
                        model_dir + os.sep + allele + '_' + str(length) +
                        '_' + str(fold_num) + '.pth')
                    model.load_state_dict(best_model_dict)
                    _, _, output, label_out = val(args, test_data_load,
                                                  model, criteria)
                    output_list.extend(output)
                    label_list.extend(label_out)
                # Metrics are computed directly in -log10(IC50) space.
                IC_output_list = output_list
                IC_label_list = label_list
                # Binder iff IC50 < 500 nM, i.e. -log10(IC50) > -log10(500).
                threshold = (-1) * math.log10(500)
                bi_output_list = [1 if v > threshold else 0
                                  for v in IC_output_list]
                bi_label_list = [1 if v > threshold else 0
                                 for v in IC_label_list]
                pearson = pearsonr(IC_output_list, IC_label_list)
                # NOTE(review): AUC is computed from binarized predictions
                # rather than raw scores, which discards ranking information.
                auc = roc_auc_score(bi_label_list, bi_output_list)
                srcc = spearmanr(IC_label_list, IC_output_list)
                logger.write("%s\t%s\t\t%.4f\t\t\t%.4f\t\t\t%.4f\n" %
                             (length, allele, pearson[0], auc, srcc[0]))
                logger.flush()
                prediction = args.savedir + os.sep + args.predict
                append_write = 'a' if os.path.exists(prediction) else 'w'
                with open(prediction, append_write) as true_value:
                    true_value.write("%s\n" % (allele))
                    # NOTE(review): only the first tenth of the pairs is
                    # dumped (len/10) — presumably a sampling choice; confirm.
                    for i in range(int(len(output_list) / 10)):
                        true_value.write("%.4f\t%.4f\n" %
                                         (IC_label_list[i],
                                          IC_output_list[i]))
                    true_value.flush()
    logger.close()
def cross_val(args):
    """Train and evaluate ResNetC1 with nested cross-validation per allele.

    For every peptide length (9, 10) and allele: holds out 10% of the data,
    runs 5-fold CV on the remainder; within each fold, a further 10% of the
    training part becomes a validation set used for checkpoint selection and
    early stopping.  The best checkpoint of each fold is reloaded and scored
    on that fold's test split; the pooled Pearson per allele is written to
    ``<savedir>/<crossValFile>``, and per-epoch losses to
    ``<savedir>/<allele>.txt``.
    """
    torch.set_default_tensor_type('torch.DoubleTensor')
    # Alleles with enough 9-mer / 10-mer binding data to train on.
    allele_list_9 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*02:03',
        'HLA-B*15:01', 'HLA-A*31:01', 'HLA-A*01:01', 'HLA-B*07:02',
        'HLA-A*26:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-B*08:01',
        'HLA-B*58:01', 'HLA-B*40:01', 'HLA-B*27:05', 'HLA-A*30:01',
        'HLA-A*69:01', 'HLA-B*57:01', 'HLA-B*35:01', 'HLA-A*02:02',
        'HLA-A*24:02', 'HLA-B*18:01', 'HLA-B*51:01', 'HLA-A*29:02',
        'HLA-A*68:01', 'HLA-A*33:01', 'HLA-A*23:01'
    ]
    allele_list_10 = [
        'HLA-A*02:01', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*68:01',
        'HLA-A*31:01', 'HLA-A*02:06', 'HLA-A*68:02', 'HLA-A*02:03',
        'HLA-A*33:01', 'HLA-A*02:02'
    ]
    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)
    if args.visualizeNet == True:
        # Dummy forward pass with an input of the network's expected shape;
        # graph rendering (viz) is commented out, so this only reports the
        # parameter count.
        x = Variable(torch.randn(1, 5, 174, 18))
        model = net.ResNetC1()
        if args.onGPU == True:
            x = x.cuda()
            model = model.cuda()
        y = model.forward(x)
        #g = viz.make_dot(y)
        #g.render(args.savedir + '/model.png', view=False)
        total_paramters = 0
        for parameter in model.parameters():
            # Product of the parameter tensor's dimensions.
            i = len(parameter.size())
            p = 1
            for j in range(i):
                p *= parameter.size(j)
            total_paramters += p
        print('Parameters: ' + str(total_paramters))
    logFileLoc = args.savedir + os.sep + args.crossValFile
    # Append to an existing summary log, otherwise create it; both branches
    # write the same header.
    if os.path.isfile(logFileLoc):
        logger = open(logFileLoc, 'a')
        logger.write("%s\t%s\t\t\t\t\t%s\n" % ('Length', 'Allele', 'Pearson'))
        logger.flush()
    else:
        logger = open(logFileLoc, 'w')
        logger.write("%s\t%s\t\t\t\t\t%s\n" % ('Length', 'Allele', 'Pearson'))
        logger.flush()
    for length in [9, 10]:
        if length == 9:
            allele_list = allele_list_9
        elif length == 10:
            allele_list = allele_list_10
        else:  # unreachable with the literal list above; kept as a guard
            print("Invalid Length")
            exit(0)
        for allele in allele_list:  #[9,10]
            model_dir = args.savedir + os.sep + 'best_model' + os.sep + allele
            if not os.path.isdir(model_dir):
                os.makedirs(model_dir)
            # Pickled dict with keys: 'allele', 'sequ_length',
            # 'channel_encode' (model inputs) and 'label' (IC50, nM).
            data_dict = pickle.load(
                open(
                    args.data_dir + os.sep + 'pickle_' + str(length) + os.sep +
                    allele.replace('*', '.').replace(':', '_') + '.p', 'rb'))
            print('train on allele: ' + data_dict['allele'])
            if not length == data_dict['sequ_length']:
                print('length error')
                exit()
            encode_channel = data_dict['channel_encode']
            meas = data_dict['label']
            # IC50 (nM) -> regression target -log10(IC50).
            bind = []
            for i in meas:
                i = (-1) * math.log10(i)
                bind.append(i)
            sequ, label = encode_channel, bind
            if (len(sequ) > 0):
                sequ_ori, label_ori = sequ, label
                # Hold out 10% before cross-validation.  NOTE(review):
                # test_sequ_ori / test_label_ori are never used in this
                # function — presumably reserved for a separate test script
                # that rebuilds the same seeded split; confirm.
                sequ_ori, test_sequ_ori, label_ori, test_label_ori = train_test_split(
                    sequ_ori, label_ori, test_size=0.1, random_state=42,
                    shuffle=True)
                output_list = []
                label_list = []
                fold_num = 0
                # Same seeded 5-fold split as the evaluation scripts.
                kf = KFold(n_splits=5, shuffle=True, random_state=42)
                for train_set, test_set in kf.split(sequ_ori, label_ori):
                    fold_num += 1
                    #
                    # Per-allele epoch log; a fresh file also gets the
                    # Allele header line.
                    alleleLoc = args.savedir + os.sep + allele + '.txt'
                    if os.path.isfile(alleleLoc):
                        log = open(alleleLoc, 'a')
                        log.write("\n")
                        log.write("%s\t\t\t%s\n" % ('Length: ', length))
                        log.write(
                            "%s\t\t\t\t%s\t\t\t%s\t\t\t%s\n" %
                            ('Epoch', 'tr_loss', 'val_loss', 'val_Pearson'))
                        log.flush()
                    else:
                        log = open(alleleLoc, 'w')
                        log.write("%s\t\t\t%s\n" % ('Allele', allele))
                        log.write("\n")
                        log.write("%s\t\t\t%s\n" % ('Length: ', length))
                        log.write(
                            "%s\t\t\t\t%s\t\t\t%s\t\t\t%s\n" %
                            ('Epoch', 'tr_loss', 'val_loss', 'val_Pearson'))
                        log.flush()
                    # Materialize this fold's train/test lists by index.
                    train_sequ, test_sequ, train_label, test_label = [sequ_ori[i] for i in train_set], [sequ_ori[i] for i in test_set],\
                        [label_ori[i] for i in train_set], [label_ori[i] for i in test_set]
                    # Inner split: 10% of the fold's training data becomes
                    # the validation set for early stopping / model choice.
                    train_sequ, val_sequ, train_label, val_label = train_test_split(
                        train_sequ, train_label, test_size=0.1,
                        random_state=42, shuffle=True)
                    train_data_load = torch.utils.data.DataLoader(
                        myDataLoader.MyDataset(train_sequ, train_label),
                        batch_size=args.batch_size,
                        shuffle=True,
                        num_workers=args.num_workers,
                        pin_memory=True)
                    val_data_load = torch.utils.data.DataLoader(
                        myDataLoader.MyDataset(val_sequ, val_label),
                        batch_size=args.batch_size,
                        shuffle=True,
                        num_workers=args.num_workers,
                        pin_memory=True)
                    test_data_load = torch.utils.data.DataLoader(
                        myDataLoader.MyDataset(test_sequ, test_label),
                        batch_size=args.batch_size,
                        shuffle=True,
                        num_workers=args.num_workers,
                        pin_memory=True)
                    # Fresh model per fold.
                    model = net.ResNetC1()
                    if args.onGPU == True:
                        #model = torch.nn.DataParallel(model, device_ids=[0,1,2,3]).cuda()
                        model = model.cuda()
                    # Optional warm start from the pre-training checkpoints
                    # (currently disabled):
                    # pretrain = torch.load('pretrain/best_model/' + allele + '/' + allele + '_' + str(length) + '.pth')
                    # model_dict = model.state_dict()
                    # pretrained_dict = {k: v for k, v in pretrain.items() if k in model_dict}
                    # model_dict.update(pretrained_dict)
                    # model.load_state_dict(model_dict)
                    criteria = MSELoss()
                    if args.onGPU == True:
                        criteria = criteria.cuda()
                    # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
                    optimizer = torch.optim.Adam(
                        model.parameters(), args.lr,
                        weight_decay=args.weight_decay)
                    if args.onGPU == True:
                        cudnn.benchmark = True
                    # Decay LR by 10x every `step_loss` epochs.
                    scheduler = torch.optim.lr_scheduler.StepLR(
                        optimizer, step_size=args.step_loss, gamma=0.1)
                    start_epoch = 0
                    min_val_loss = 100  # sentinel: any real MSE beats this
                    loss_not_decay = 0
                    for epoch in range(start_epoch, args.max_epochs):
                        tr_epoch_loss, tr_mean_squared_error = train(
                            args, train_data_load, model, criteria, optimizer)
                        val_epoch_loss, val_mean_squared_error, val_output, val_label = val(
                            args, val_data_load, model, criteria)
                        val_Pearson = pearsonr(val_output, val_label)
                        # Reopened in append mode every epoch (never closed
                        # explicitly; handles are dropped to the GC).
                        log = open(alleleLoc, 'a')
                        log.write("%s\t\t\t\t%.4f\t\t\t%.4f\t\t\t\t%.4f\n" %
                                  (epoch, tr_epoch_loss, val_epoch_loss,
                                   val_Pearson[0]))
                        if val_epoch_loss < min_val_loss:
                            # New best validation loss: checkpoint this fold.
                            if args.save_model == True:
                                model_file_name = model_dir + os.sep + allele + '_' + str(
                                    length) + '_' + str(fold_num) + '.pth'
                                print('==> Saving the best model')
                                torch.save(model.state_dict(), model_file_name)
                            min_val_loss = val_epoch_loss
                            loss_not_decay = 0
                        else:
                            loss_not_decay += 1
                        # Early stop after 40 epochs with no improvement.
                        if loss_not_decay >= 40:
                            break
                        scheduler.step(epoch)
                    # Reload the fold's best checkpoint and score it on the
                    # fold's test split; predictions are pooled over folds.
                    best_model_dict = torch.load(
                        model_dir + os.sep + allele + '_' + str(length) +
                        '_' + str(fold_num) + '.pth')
                    model.load_state_dict(best_model_dict)
                    _, _, output, label = val(args, test_data_load, model,
                                              criteria)
                    output_list.extend(output)
                    label_list.extend(label)
                # Pearson over all pooled fold predictions.  NOTE(review):
                # r2 is computed but never logged.
                pearson = pearsonr(output_list, label_list)
                r2 = r2_score(label_list, output_list)
                logger.write("%s\t%s\t\t\t\t%.4f\n" %
                             (length, allele, pearson[0]))
                logger.flush()
    logger.close()