def main():
    """Cross-validation evaluation driver for zinc-binding-site CapsNet models.

    Parses CLI options, then for each of 5 folds loads that fold's model
    parameters and snapshot weights, predicts on the fold's fasta file and
    records AUC / PR / accuracy / sensitivity / specificity / F1 / MCC in the
    per-fold ``*_average_last_predict`` arrays.  Mean metrics over the folds
    are appended to the summary text file.

    NOTE(review): input fasta, checkpoint and model paths are hard-coded per
    fold inside the loop; the -model-prefix argument is effectively ignored.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-output', dest='outputfile', type=str,
                        help='prefix of the prediction results.', required=True)
    parser.add_argument('-window', dest='window', type=int,
                        help='specify the window size', required=True)
    parser.add_argument('-model-prefix', dest='modelprefix', type=str,
                        help='prefix of custom model used for prediciton. If you do not have one, please run train_models.py to train a model.',
                        required=False, default=None)
    parser.add_argument('-residue-types', dest='residues', type=str,
                        help='Residue types that to be predicted. For multiple residues, seperate each with \',\'',
                        required=False, default="C,H,E,D")
    parser.add_argument('-codingMode', dest='codingMode', type=int,
                        help='Set the input sequence encoding mode.',
                        required=False, default=0)
    args = parser.parse_args()

    outputfile = args.outputfile
    residues = args.residues.split(",")
    modelprefix = args.modelprefix
    window = args.window
    codemode = args.codingMode
    print(outputfile, residues, modelprefix, window)

    fp = open(outputfile + "eval_by_AUC_precision_scores_10fold_constantweight1_0.5_25.txt", 'w')
    # Build the network architecture once; predict=True means weights are
    # loaded later, fold by fold.
    model_arch = Capsnet_main(np.zeros([3, 2 * window + 1, 6]), [], nb_epoch=1,
                              compiletimes=0, lr=0.001, batch_size=500,
                              lam_recon=0, routings=3,
                              modeltype='nogradientstop', nb_classes=2,
                              predict=True)

    # Per-fold metric accumulators (5 folds).
    roc_average_weight = np.zeros(5)
    roc_average_predict = np.zeros(5)
    roc_average_last_predict = np.zeros(5)
    accuracy_average_last_predict = np.zeros(5)
    sensitivity_average_last_predict = np.zeros(5)
    specificity_average_last_predict = np.zeros(5)
    f1_score_average_last_predict = np.zeros(5)
    mcc_average_last_predict = np.zeros(5)
    pr_average_weight = np.zeros(5)
    pr_average_predict = np.zeros(5)
    pr_average_last_predict = np.zeros(5)

    for time in range(5):

        def record_fold_metrics(metrics):
            # Store one (auc, pr, acc, sens, spec, f1, mcc) tuple for this
            # fold.  Extracted because the original repeated these seven
            # assignments in every eval branch.
            (roc_average_last_predict[time],
             pr_average_last_predict[time],
             accuracy_average_last_predict[time],
             sensitivity_average_last_predict[time],
             specificity_average_last_predict[time],
             f1_score_average_last_predict[time],
             mcc_average_last_predict[time]) = metrics

        fp.write("############################" + str(time) + "\n")
        inputfile = '/scratch/ucexbw/ZinCaps25/ActiveSitePrediction/lib/K-Fold/annotated_sequence.fasta_training_annotated_' + str(time) + '.fasta'
        checkpointweights = '/scratch/ucexbw/ZinCaps25/ActiveSitePrediction/data/weights/Zinc_' + str(time) + '_weights'
        modelprefix = '/scratch/ucexbw/ZinCaps25/ActiveSitePrediction/data/models/Zinc_' + str(time) + '_model'
        # One of: all / average_weight / average_predict / average_last_predict
        eval_type = 'average_last_predict'
        if modelprefix is None:
            # Fix: close the summary file before bailing out (original called
            # bare exit() with fp still open).
            fp.close()
            return

        # custom prediction
        model = modelprefix + str("_HDF5model")
        parameter = modelprefix + str("_parameters")
        # Fix: original opened the parameter file twice (once in try, again in
        # else) and fell through with `parameters` undefined on IOError.
        try:
            with open(parameter, 'r') as f:
                parameters = f.read()
        except IOError:
            print('cannot open ' + parameter + " ! check if the model exists. "
                  "please run train_general.py or train_kinase.py to get the custom model first!\n")
            continue

        # Parameter file layout: nclass \t window \t residues \t ? \t codemode
        # \t modeltype \t nb_classes  (field 3 unused here).
        fields = parameters.split("\t")
        nclass = int(fields[0])
        window = int(fields[1])
        residues = fields[2].split(",")
        codemode = int(fields[4])
        modeltype = str(fields[5])
        nb_classes = int(fields[6])

        testfrag, ids, poses, focuses = extractFragforPredict(inputfile, window, '-', focus=residues)
        testX, testY = convertRawToXY(testfrag.as_matrix(), codingMode=codemode)
        if len(testX.shape) > 3:
            # Drop the singleton channel axis in place.
            testX.shape = (testX.shape[0], testX.shape[2], testX.shape[3])

        predict_average_weight = np.zeros((testX.shape[0], 2))
        predict_average_predict = np.zeros((testX.shape[0], 2))
        predict_average_last_predict = np.zeros((testX.shape[0], 2))

        for bt in range(nclass):
            # Load every snapshot weight set belonging to bootstrap class bt.
            inputweights = checkpointweights + "_nclass" + str(bt) + "_iteration"
            model_members = load_model_weights(inputweights, model_arch)
            if eval_type == "all" or eval_type == "average_weight":
                predict_temp = predict_by_avg_members(model_members, model_arch, testX)
                predict_average_weight += predict_temp
                # NOTE(review): this branch (and the ones below) store into the
                # *_last_predict arrays — copy/paste from the last branch.
                # Harmless while eval_type is hard-coded; confirm before
                # enabling other modes.
                record_fold_metrics(evaluate(predict_temp, testY))
            if eval_type == "all" or eval_type == "average_predict":
                predict_temp = predict_by_snapshot(model_members, model_arch, testX)
                predict_average_predict += predict_temp
                record_fold_metrics(evaluate(predict_temp, testY))
                print("average_predict results:")
            del model_members

        if eval_type == "all" or eval_type == "average_weight1":
            predict_average_weight = predict_average_weight / float(nclass)
            record_fold_metrics(evaluate(predict_average_weight, testY))
            print("average_weight1")
        if eval_type == "all" or eval_type == "average_predict":
            predict_average_predict = predict_average_predict / float(nclass)
            record_fold_metrics(evaluate(predict_average_predict, testY))
        if eval_type == "all" or eval_type == "average_last_predict":
            nclass_ini = 1
            for bt in range(nclass):
                # Use only the final saved weights of each bootstrap class.
                model_arch[0].load_weights(model + "_class" + str(bt))
                predict_temp = model_arch[1].predict(testX)[0]
                predict_average_last_predict += predict_temp
            predict_average_last_predict = predict_average_last_predict / (nclass * nclass_ini)
            record_fold_metrics(evaluate(predict_average_last_predict, testY))

    print("Successfully predicted from custom models !\n")
    fp.write("!!!!!!!!!!!!!!!!!!!!!!!!!\n")
    print(roc_average_last_predict)
    fp.write("average_last_predict_results: \t" + "\t"
             + str(np.mean(roc_average_last_predict)) + "," + "\t"
             + str(np.mean(pr_average_last_predict)) + ","
             + str(np.mean(accuracy_average_last_predict)) + "," + "\t"
             + str(np.mean(sensitivity_average_last_predict)) + "," + "\t"
             + str(np.mean(specificity_average_last_predict)) + "," + "\t"
             + str(np.mean(f1_score_average_last_predict)) + "," + "\t"
             + str(np.mean(mcc_average_last_predict)) + "\n")
    fp.close()
def main(srate=1, nb_epoch1=1, nb_epoch2=30, earlystop=20, maxneg=None,
         codingMode=0, transferlayer=1, inputweights=None, outputweights=None,
         forkinas=False):
    """Train the three-branch MultiCNN (one-of-key / physical / PSSM) over
    bootstrap rounds and two class-weight settings, evaluating on the fixed
    test split after every round and appending metrics to result files.

    NOTE(review): if this chunk shares a module with the other `main`
    definitions seen nearby, this one shadows them — presumably these were
    separate scripts; confirm before merging into one file.
    """
    ########## Load Training Data ##########
    oneofkey_pos, oneofkey_neg, pssm_pos, pssm_neg, physical_pos, physical_neg = get_data(
        r'data/Ubisite_train3.txt', r'data/pssmpickle2/', label=True)
    ########## Load Testing Data ##########
    test_oneofkey_pos, test_oneofkey_neg, test_pssm_pos, test_pssm_neg, test_physical_pos, test_physical_neg = get_data(
        r'data/Ubisite_test3.txt', r'data/pssmpickle2/', label=False)
    ########## Oneofkey Testing ##########
    test_oneofkey_pos = pd.DataFrame(test_oneofkey_pos)
    test_oneofkey_neg = pd.DataFrame(test_oneofkey_neg)
    test_oneofkey_all = pd.concat([test_oneofkey_pos, test_oneofkey_neg])
    test_oneofkeyX, test_oneofkeyY = convertRawToXY(
        test_oneofkey_all.as_matrix(), codingMode=0)
    ########## Physical Testing ##########
    test_physical_pos = pd.DataFrame(test_physical_pos)
    test_physical_neg = pd.DataFrame(test_physical_neg)
    test_physical_all = pd.concat([test_physical_pos, test_physical_neg])
    test_physicalX, test_physicalY = convertRawToXY(
        test_physical_all.as_matrix(), codingMode=6)
    ########## Pssm Testing ##########
    test_pssm_all = test_pssm_pos + test_pssm_neg
    test_pssmX = convertRawToXY(test_pssm_all, codingMode=7)
    test_pssmY = test_oneofkeyY
    # The *_t names are aliases, not copies: assigning .shape below reshapes
    # the shared array in place, so test_oneofkeyX / test_physicalX /
    # test_pssmX themselves end up 3-D as well (the predict call relies on
    # that).
    test_oneofkeyX_t = test_oneofkeyX
    test_oneofkeyX_t.shape = (test_oneofkeyX.shape[0], test_oneofkeyX.shape[2],
                              test_oneofkeyX.shape[3])
    test_physicalX_t = test_physicalX
    test_physicalX_t.shape = (test_physicalX.shape[0], test_physicalX.shape[2],
                              test_physicalX.shape[3])
    testPssmX_t = test_pssmX
    testPssmX_t.shape = (test_pssmX.shape[0], test_pssmX.shape[2],
                         test_pssmX.shape[3])
    ########## Del Testall ##########
    del test_oneofkey_all, test_physical_all, test_pssm_all
    ########## Set Training Times ##########
    nclass = 20
    for cw in range(1, 3, 1):
        # Negative-class weight 0.1 / 0.2; positive class fixed at 1.
        c_weight = {0: cw * 0.1, 1: 1}
        for t in range(0, nclass):
            # Reshuffle all training channels in lock-step.
            pssm_pos, pssm_neg, oneofkey_pos, oneofkey_neg, physical_pos, physical_neg = shufflewrr(
                pssm_pos, pssm_neg, oneofkey_pos, oneofkey_neg, physical_pos,
                physical_neg)
            # a = number of samples drawn from each side this round.
            a = int(len(oneofkey_pos) * 0.8)
            train_oneofkey_pos = oneofkey_pos[0:a]
            train_oneofkey_neg = oneofkey_neg[0:a]
            train_physical_pos = physical_pos[0:a]
            train_physical_neg = physical_neg[0:a]
            train_pssm_pos = pssm_pos[0:a]
            train_pssm_neg = pssm_neg[0:a]
            print('total train', len(train_oneofkey_pos),
                  len(train_oneofkey_neg), 'blblblblbl',
                  len(train_physical_pos), len(train_physical_neg),
                  len(train_pssm_pos), len(train_pssm_neg))
            ########## Pos Concat Neg ##########
            train_oneofkey_all = pd.concat(
                [train_oneofkey_pos, train_oneofkey_neg])
            train_physical_all = pd.concat(
                [train_physical_pos, train_physical_neg])
            train_pssm_all = train_pssm_pos + train_pssm_neg
            ########## Shuffle Again ##########
            train_pssm_all, train_oneofkey_all, train_physical_all = shufflePosNeg(
                train_pssm_all, train_oneofkey_all, train_physical_all)
            ########## Dprocess For Codes ##########
            train_oneofkey_all = pd.DataFrame(train_oneofkey_all)
            train_oneofkeyX, train_oneofkeyY = convertRawToXY(
                train_oneofkey_all.as_matrix(), codingMode=0)
            train_physical_all = pd.DataFrame(train_physical_all)
            train_physicalX, train_physicalY = convertRawToXY(
                train_physical_all.as_matrix(), codingMode=6)
            train_pssmX = convertRawToXY(train_pssm_all, codingMode=7)
            train_pssmY = train_oneofkeyY
            del train_oneofkey_all, train_physical_all, train_pssm_all
            ########## MultiCNN ##########
            # First round builds/compiles the models; later rounds continue
            # training the same compiled models.
            if (t == 0):
                models = MultiCNN(
                    train_oneofkeyX, train_oneofkeyY, train_physicalX,
                    train_pssmX,
                    pre_train_seq_path='bestmodel/best - oneofk - model.h5',
                    pre_train_physical_path='bestmodel/best - physical - model.h5',
                    pre_train_pssm_path='bestmodel/best - pssm - model.h5',
                    nb_epoch=nb_epoch2, earlystop=earlystop,
                    transferlayer=transferlayer, weights=inputweights,
                    class_weights=c_weight, forkinas=forkinas, compiletimes=t)
            else:
                models = MultiCNN(
                    train_oneofkeyX, train_oneofkeyY, train_physicalX,
                    train_pssmX,
                    pre_train_seq_path='bestmodel/best - oneofk - model.h5',
                    pre_train_physical_path='bestmodel/best - physical - model.h5',
                    pre_train_pssm_path='bestmodel/best - pssm - model.h5',
                    nb_epoch=nb_epoch2, earlystop=earlystop,
                    transferlayer=transferlayer, weights=inputweights,
                    class_weights=c_weight, forkinas=forkinas, compiletimes=t,
                    compilemodels=models)
            # predict testing set
            pred_proba = models.predict(
                [test_oneofkeyX, test_physicalX, test_pssmX], batch_size=2048)
            predict_classes = kutils.probas_to_classes(pred_proba)
            # Fix: removed the redundant resFile.close() inside the `with`
            # block (the context manager already closes the file).
            with open('result/evaluation.txt', mode='a') as resFile:
                resFile.write(
                    str(cw) + ' ' + str(t) + ' ' + calculate_performance(
                        len(test_physicalY), test_physicalY[:, 1],
                        predict_classes, pred_proba[:, 1]) + '\r\n')
            true_label = test_oneofkeyY
            result = np.column_stack((true_label[:, 1], pred_proba[:, 1]))
            result = pd.DataFrame(result)
            result.to_csv(path_or_buf='result/result' + '-' + str(t) + '-' +
                          str(cw) + '-' + '.txt', index=False, header=None,
                          sep='\t', quoting=csv.QUOTE_NONNUMERIC)
    ########## Del Test Data ##########
    del test_pssm_pos, test_pssm_neg, test_oneofkey_pos, test_oneofkey_neg, test_physical_pos, test_physical_neg
def bootStrapping_allneg_continue_keras2(trainfile, valfile=None, srate=0.8,
                                         nb_epoch1=3, nb_epoch2=30,
                                         earlystop=None, maxneg=None, model=0,
                                         codingMode=0, lam_recon=0,
                                         inputweights=None, outputweights=None,
                                         nb_classes=2):
    """Bootstrap CapsNet training over all negatives.

    Splits `trainfile` (label in column 0; 0 = negative) into positive and
    negative pools, then trains `nclass` rounds, each on the sub-sampled
    positives plus a distinct slice of negatives, continuing the same compiled
    models after round 0.  Returns the (models, eval_model, manipulate_model,
    weight_c_model, fitHistory) tuple from the last Capsnet_main call.
    """
    trainX = trainfile
    train_pos = trainX[np.where(trainX[:, 0] != 0)]
    train_neg = trainX[np.where(trainX[:, 0] == 0)]
    train_pos = pd.DataFrame(train_pos)
    train_neg = pd.DataFrame(train_neg)
    train_pos_s = train_pos.sample(train_pos.shape[0])  # shuffle train pos
    train_neg_s = train_neg.sample(train_neg.shape[0])  # shuffle train neg
    slength = int(train_pos.shape[0] * srate)
    nclass = int(train_neg.shape[0] / slength)
    if (valfile is not None):
        # use all data in valfile as val
        valX = valfile
        val_pos = valX[np.where(valX[:, 0] != 0)]
        val_neg = valX[np.where(valX[:, 0] == 0)]
        val_pos = pd.DataFrame(val_pos)
        val_neg = pd.DataFrame(val_neg)
        val_all = pd.concat([val_pos, val_neg])
        valX1, valY1 = convertRawToXY(val_all.as_matrix(),
                                      codingMode=codingMode)
    else:
        # select 0.1 of training data as val
        a = int(train_pos.shape[0] * 0.9)
        b = train_neg.shape[0] - int(train_pos.shape[0] * 0.1)
        # Fix: Python-3 print() calls instead of Python-2 print statements.
        print("train pos=" + str(train_pos.shape[0]) + '\n')
        print("train neg=" + str(train_neg.shape[0]) + '\n')
        print(" a=" + str(a) + " b=" + str(b) + '\n')
        train_pos_s = train_pos[0:a]
        train_neg_s = train_neg[0:b]
        print("train pos s=" + str(train_pos_s.shape[0]) + '\n')
        print("train neg s=" + str(train_neg_s.shape[0]) + '\n')
        # Fix: off-by-one — the original sliced [(a + 1):] / [b + 1:], so row
        # a and row b landed in neither the training nor the validation set.
        val_pos = train_pos[a:]
        print("val_pos=" + str(val_pos.shape[0]) + '\n')
        val_neg = train_neg[b:]
        print("val_neg=" + str(val_neg.shape[0]) + '\n')
        val_all = pd.concat([val_pos, val_neg])
        valX1, valY1 = convertRawToXY(val_all.as_matrix(),
                                      codingMode=codingMode)
        slength = int(train_pos_s.shape[0] * srate)  # update slength
        nclass = int(train_neg_s.shape[0] / slength)
    if (maxneg is not None):
        nclass = min(maxneg, nclass)  # cannot do more than maxneg times
    for I in range(nb_epoch1):
        train_neg_s = train_neg_s.sample(
            train_neg_s.shape[0])  # shuffle neg sample
        train_pos_ss = train_pos_s.sample(slength)
        for t in range(nclass):
            train_neg_ss = train_neg_s[(slength * t):(slength * t + slength)]
            train_all = pd.concat([train_pos_ss, train_neg_ss])
            trainX1, trainY1 = convertRawToXY(train_all.as_matrix(),
                                              codingMode=codingMode)
            if t == 0:
                models, eval_model, manipulate_model, weight_c_model, fitHistory = Capsnet_main(
                    trainX=trainX1, trainY=trainY1, valX=valX1, valY=valY1,
                    nb_classes=nb_classes, nb_epoch=nb_epoch2,
                    earlystop=earlystop, weights=inputweights, compiletimes=t,
                    lr=0.001, batch_size=500, lam_recon=lam_recon, routings=3,
                    class_weight=None, modeltype=model)
            else:
                # Continue training the models compiled in round 0.
                models, eval_model, manipulate_model, weight_c_model, fitHistory = Capsnet_main(
                    trainX=trainX1, trainY=trainY1, valX=valX1, valY=valY1,
                    nb_classes=nb_classes, nb_epoch=nb_epoch2,
                    earlystop=earlystop, weights=inputweights, compiletimes=t,
                    compilemodels=(models, eval_model, manipulate_model,
                                   weight_c_model),
                    lr=0.001, batch_size=500, lam_recon=lam_recon, routings=3,
                    class_weight=None, modeltype=model)
            print("modelweights assigned for " + str(I) + " and " + str(t) +
                  "\n")
            if (outputweights is not None):
                models.save_weights(outputweights, overwrite=True)
    return models, eval_model, manipulate_model, weight_c_model, fitHistory
def main():
    """Predict PTM sites with a previously trained CapsNet custom model.

    Reads a fasta input, loads [-model-prefix]_parameters plus the per-class
    [-model-prefix]_HDF5model_class<i> weights, averages the per-class
    probabilities over all bootstrap classes and writes a tab-separated
    result table <output>.txt with 1-based positions.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-input', dest='inputfile', type=str,
        help='Protein sequences to be predicted in fasta format.',
        required=True)
    parser.add_argument('-output', dest='outputfile', type=str,
                        help='prefix of the prediction results.',
                        required=True)
    parser.add_argument(
        '-model-prefix', dest='modelprefix', type=str,
        help='prefix of custom model used for prediciton. If donnot have one, please run train_general.py to train a custom general PTM model or run train_kinase.py to train a custom kinase-specific PTM model.',
        required=False, default=None)
    parser.add_argument(
        '-residue-types', dest='residues', type=str,
        help='Residue types that to be predicted, only used when -predict-type is \'general\'. For multiple residues, seperate each with \',\'',
        required=False, default="S,T,Y")
    args = parser.parse_args()
    inputfile = args.inputfile
    outputfile = args.outputfile
    residues = args.residues.split(",")
    modelprefix = args.modelprefix
    if modelprefix is None:
        # Fix: Python-3 print() call instead of a Python-2 print statement.
        print("Please specify the prefix for an existing custom model by -model-prefix!\n"
              "It indicates two files [-model-prefix]_HDF5model and [-model-prefix]_parameters.\n"
              "If you don't have such files, please run train_models.py to get the custom model first!\n")
        exit()
    else:
        # custom prediction
        model = modelprefix + str("_HDF5model")
        parameter = modelprefix + str("_parameters")
        # Fix: the original opened the parameter file twice (in try and in
        # else) and, on IOError, fell through and used `parameters` while it
        # was undefined.  Open once; report and stop on failure.
        try:
            with open(parameter, 'r') as f:
                parameters = f.read()
        except IOError:
            print('cannot open ' + parameter + " ! check if the model exists. "
                  "please run train_general.py or train_kinase.py to get the custom model first!\n")
            return
        from DProcess import convertRawToXY
        from EXtractfragment_sort import extractFragforPredict
        from capsulenet import Capsnet_main
        # Parameter file layout: nclass \t window \t residues \t ? \t
        # codemode \t modeltype \t nb_classes  (field 3 unused here).
        nclass = int(parameters.split("\t")[0])
        window = int(parameters.split("\t")[1])
        residues = parameters.split("\t")[2]
        residues = residues.split(",")
        codemode = int(parameters.split("\t")[4])
        modeltype = str(parameters.split("\t")[5])
        nb_classes = int(parameters.split("\t")[6])
        testfrag, ids, poses, focuses = extractFragforPredict(
            inputfile, window, '-', focus=residues)
        testX, testY = convertRawToXY(testfrag.as_matrix(),
                                      codingMode=codemode)
        predictproba = np.zeros((testX.shape[0], 2))
        # only to get config — weights are loaded per class below
        models = Capsnet_main(testX, testY, nb_epoch=1, compiletimes=0,
                              lr=0.001, batch_size=500, lam_recon=0,
                              routings=3, modeltype=modeltype,
                              nb_classes=nb_classes, predict=True)
        nclass_ini = 1
        for bt in range(nclass):
            models[0].load_weights(model + "_class" + str(bt))
            predictproba += models[1].predict(testX)[0]
        predictproba = predictproba / (nclass * nclass_ini)
        poses = poses + 1  # report 1-based sequence positions
        results = np.column_stack((ids, poses, focuses, predictproba[:, 1]))
        result = pd.DataFrame(results)
        result.to_csv(outputfile + ".txt", index=False, header=None, sep='\t',
                      quoting=csv.QUOTE_NONNUMERIC)
        print("Successfully predicted from custom models !\n")
# NOTE(review): this chunk is the tail of a function defined outside this view
# ("del windows_pos, windows_neg / return windows_all") fused onto the same
# mangled line as top-level script code that loads pretrain/val/test windows
# and encodes the test set under codingModes 0 and 9-13; the in-place .shape
# assignments drop the singleton channel axis.  Left byte-for-byte intact
# because the enclosing definition is not visible here — confirm the original
# line breaks against the source repository before reformatting.
del windows_pos, windows_neg return windows_all all_train_windows_pos, all_train_windows_neg = get_data( r'data/pretrain/1train.txt', r'data/pssmpickle2/', label=True) val_windows_pos, val_windows_neg = get_data(r'data/pretrain/1val.txt', r'data/pssmpickle2/', label=True) test_windows_pos, test_windows_neg = get_data(r'data/pretrain/1test.txt', r'data/pssmpickle2/', label=True) test_windows_all = get_matrix(test_windows_pos, test_windows_neg) test_oneofkeyX, testY = convertRawToXY(test_windows_all, codingMode=0) test_oneofkeyX.shape = (test_oneofkeyX.shape[0], test_oneofkeyX.shape[2], test_oneofkeyX.shape[3]) test_physicalXo, _ = convertRawToXY(test_windows_all, codingMode=9) test_physicalXo.shape = (test_physicalXo.shape[0], test_physicalXo.shape[2], test_physicalXo.shape[3]) test_physicalXp, _ = convertRawToXY(test_windows_all, codingMode=10) test_physicalXp.shape = (test_physicalXp.shape[0], test_physicalXp.shape[2], test_physicalXp.shape[3]) test_physicalXh, _ = convertRawToXY(test_windows_all, codingMode=11) test_physicalXh.shape = (test_physicalXh.shape[0], test_physicalXh.shape[2], test_physicalXh.shape[3]) test_physicalXc, _ = convertRawToXY(test_windows_all, codingMode=12) test_physicalXc.shape = (test_physicalXc.shape[0], test_physicalXc.shape[2], test_physicalXc.shape[3]) test_physicalXb, _ = convertRawToXY(test_windows_all, codingMode=13)
def bootStrapping_allneg_continue_keras2(trainfile, valfile=None, srate=0.8,
                                         nb_epoch1=3, nb_epoch2=30,
                                         earlystop=None, maxneg=None, model=0,
                                         codingMode=0, lam_recon=0,
                                         inputweights=None, outputweights=None,
                                         nb_classes=2, hw_res=None,
                                         hc_res=None, hc_res2=None):
    """Bootstrap CapsNet training for two residue classes at once.

    Column 0 of `trainfile` holds a 4-way label: 0 = S/T positive,
    1 = Y positive, 2 = S/T negative, 3 = Y negative.  Each bootstrap round
    trains on sub-sampled positives of both classes plus a distinct negative
    slice per class.  Returns the tuple from the last Capsnet_main call.
    """
    train_pos = {}      # 0: S/T positives, 1: Y positives
    train_neg = {}      # 0: S/T negatives, 1: Y negatives
    train_pos_s = {}
    train_neg_s = {}
    train_pos_ss = {}
    train_neg_ss = {}
    slength = {}
    nclass = {}
    trainX = trainfile
    for i in range(len(trainX)):
        trainX[i, 0] = int(trainX[i, 0])
    for i in range(2):
        # sp/tp 0, yp 1, sn/tn 2, yn 3
        train_pos[i] = trainX[np.where(trainX[:, 0] == i)]
        train_neg[i] = trainX[np.where(trainX[:, 0] == i + 2)]
        train_pos[i] = pd.DataFrame(train_pos[i])
        train_neg[i] = pd.DataFrame(train_neg[i])
        train_pos_s[i] = train_pos[i].sample(train_pos[i].shape[0])  # shuffle pos
        train_neg_s[i] = train_neg[i].sample(train_neg[i].shape[0])  # shuffle neg
        slength[i] = int(train_pos[i].shape[0] * srate)
        nclass[i] = int(train_neg[i].shape[0] / slength[i])
    if (valfile is not None):
        # use all data in valfile as val
        valX = valfile
        for i in range(len(valX)):
            valX[i, 0] = int(valX[i, 0])
        val_all = pd.DataFrame()
        for i in range(2):
            val_pos = valX[np.where(valX[:, 0] == i)]
            val_neg = valX[np.where(valX[:, 0] == i + 2)]
            val_pos = pd.DataFrame(val_pos)
            val_neg = pd.DataFrame(val_neg)
            val_all = pd.concat([val_all, val_pos, val_neg])
        valX1, valY1 = convertRawToXY(val_all.as_matrix(),
                                      codingMode=codingMode)
    else:
        val_all = pd.DataFrame()
        nclass = {}
        for i in range(2):
            a = int(train_pos[i].shape[0] * 0.9)
            b = train_neg[i].shape[0] - int(train_pos[i].shape[0] * 0.1)
            # Fix: Python-3 print() calls instead of Python-2 print statements.
            print("train pos=" + str(train_pos[i].shape[0]) + '\n')
            print("train neg=" + str(train_neg[i].shape[0]) + '\n')
            print(" a=" + str(a) + " b=" + str(b) + '\n')
            train_pos_s[i] = train_pos[i][0:a]
            train_neg_s[i] = train_neg[i][0:b]
            print("train pos s=" + str(train_pos_s[i].shape[0]) + '\n')
            print("train neg s=" + str(train_neg_s[i].shape[0]) + '\n')
            # Fix: off-by-one — the original sliced [(a + 1):] / [b + 1:],
            # dropping row a / row b from both training and validation.
            val_pos = train_pos[i][a:]
            print("val_pos=" + str(val_pos.shape[0]) + '\n')
            val_neg = train_neg[i][b:]
            print("val_neg=" + str(val_neg.shape[0]) + '\n')
            val_all = pd.concat([val_all, val_pos, val_neg])
            # transfer 0.1 to val so update slength
            slength[i] = int(train_pos_s[i].shape[0] * srate)
            nclass[i] = int(train_neg_s[i].shape[0] / slength[i])
        valX1, valY1 = convertRawToXY(val_all.as_matrix(),
                                      codingMode=codingMode)
    # Fix: nclass_n was only assigned inside the maxneg branch, so the loop
    # below raised NameError whenever maxneg was None.
    nclass_n = max([nclass[0], nclass[1]])
    if (maxneg is not None):
        nclass_n = min(nclass_n, maxneg)
    for I in range(nb_epoch1):
        for i in range(2):
            train_neg_s[i] = train_neg_s[i].sample(
                train_neg_s[i].shape[0])  # shuffle neg sample
            train_pos_ss[i] = train_pos_s[i].sample(slength[i])
        for t in range(nclass_n):
            train_all = pd.DataFrame()
            for i in range(2):
                # NOTE(review): precedence here is (slength[i]*t) % nclass[i];
                # if a wrap-around window index was intended it would be
                # slength[i] * (t % nclass[i]) — confirm before changing.
                train_neg_ss[i] = train_neg_s[i][(slength[i] * t % nclass[i]):(slength[i] * t % nclass[i] + slength[i])]
                train_all = pd.concat([train_all, train_pos_ss[i],
                                       train_neg_ss[i]])
            classweights = None
            if (hc_res2 is not None):
                # negative has weight! hc_res2 is [0,2] for T
                # stp 0, yp 1, stn 2, yn 3
                classweights = {k: 1.0 for k in range(nb_classes)}
                classweights[hc_res2[0]] = float(
                    sum(train_all.as_matrix()[:, 0] <= 1)) / sum(
                        train_all.as_matrix()[:, 0] == hc_res2[0])
                classweights[hc_res2[1]] = float(
                    sum(train_all.as_matrix()[:, 0] <= 1)) / sum(
                        train_all.as_matrix()[:, 0] == hc_res2[1])
            trainX1, trainY1 = convertRawToXY(train_all.as_matrix(),
                                              codingMode=codingMode)
            if t == 0:
                models, eval_model, manipulate_model, weight_c_model, fitHistory = Capsnet_main(
                    trainX=trainX1, trainY=trainY1, valX=valX1, valY=valY1,
                    nb_classes=nb_classes, nb_epoch=nb_epoch2,
                    earlystop=earlystop, weights=inputweights, compiletimes=t,
                    lr=0.001, batch_size=1000, lam_recon=lam_recon,
                    routings=3, class_weight=classweights, modeltype=model)
            else:
                # Fix: the compilemodels tuple referenced undefined plural
                # names (eval_models, manipulate_models, weight_c_models);
                # use the singular variables actually bound above.
                models, eval_model, manipulate_model, weight_c_model, fitHistory = Capsnet_main(
                    trainX=trainX1, trainY=trainY1, valX=valX1, valY=valY1,
                    nb_classes=nb_classes, nb_epoch=nb_epoch2,
                    earlystop=earlystop, weights=inputweights, compiletimes=t,
                    compilemodels=(models, eval_model, manipulate_model,
                                   weight_c_model),
                    lr=0.001, batch_size=1000, lam_recon=lam_recon,
                    routings=3, class_weight=classweights, modeltype=model)
            print("modelweights assigned for " + str(I) + " and " + str(t) +
                  "\n")
            if (outputweights is not None):
                models.save_weights(outputweights + '_iteration' + str(t),
                                    overwrite=True)
    return models, eval_model, manipulate_model, weight_c_model, fitHistory
def bootStrapping_allneg_continue_keras2(trainfile, valfile=None, srate=0.8, nb_epoch1=3, nb_epoch2=30, earlystop=None, maxneg=None, codingMode=0, inputweights=None, outputweights=None, nb_classes=2, transferlayer=0, forkinase=False, predict=False, balance_validation=True, monitor_file_name=None, save_best_only=True, load_average_weight=False):
    """Train a MultiCNN model by bootstrapping over the negative class.

    Positives (label != 0 in column 0) form one pool; negatives (label == 0)
    are consumed in sequential chunks of ``slength`` rows, one MultiCNN fit
    per chunk, for ``nb_epoch1`` outer passes.

    Parameters
    ----------
    trainfile : array of raw fragments; column 0 is the class label.
        NOTE(review): mutated in place below (labels cast to int) -- confirm
        callers do not rely on the original values.
    valfile : optional array used as validation data; when None, 10% of the
        training data is held out instead.
    srate : fraction of the positive pool sampled per bootstrap pass.
    nb_epoch1 : outer bootstrap passes; nb_epoch2 : epochs per MultiCNN fit.
    maxneg : optional cap on the number of negative chunks per pass.
    Remaining keyword arguments are forwarded to MultiCNN unchanged.

    Returns
    -------
    The ``models`` object from the last MultiCNN call.
    """
    trainX = trainfile
    # Force integer labels; this writes back into the caller's array.
    for i in range(len(trainX)):
        trainX[i, 0] = int(trainX[i, 0])
    train_pos = trainX[np.where(trainX[:, 0] != 0)]
    train_neg = trainX[np.where(trainX[:, 0] == 0)]
    train_pos = pd.DataFrame(train_pos)
    train_neg = pd.DataFrame(train_neg)
    train_pos_s = train_pos.sample(train_pos.shape[0])  # shuffle train pos
    train_neg_s = train_neg.sample(train_neg.shape[0])  # shuffle train neg
    slength = int(train_pos.shape[0] * srate)           # negatives per chunk
    nclass = int(train_neg.shape[0] / slength)          # number of chunks
    if (valfile is not None):  # use all data in valfile as val
        valX = valfile
        for i in range(len(valX)):
            valX[i, 0] = int(valX[i, 0])
        val_pos = valX[np.where(valX[:, 0] != 0)]
        val_neg = valX[np.where(valX[:, 0] == 0)]
        val_pos = pd.DataFrame(val_pos)
        val_neg = pd.DataFrame(val_neg)
        if balance_validation:  # extract a balanced (1:1) val_neg sample
            val_neg = val_neg.sample(val_pos.shape[0])
        val_all = pd.concat([val_pos, val_neg])
        val_all_s = val_all.sample(val_all.shape[0])    # shuffle validation rows
        valX1, valY1 = convertRawToXY(val_all_s.as_matrix(), codingMode=codingMode)
    else:  # select 0.1 of the training data as validation
        a = int(train_pos.shape[0] * 0.9)
        b = train_neg.shape[0] - int(train_pos.shape[0] * 0.1)
        print("train pos=" + str(train_pos.shape[0]) + str('\n'))
        print("train neg=" + str(train_neg.shape[0]) + str('\n'))
        print(" a=" + str(a) + " b=" + str(b) + str('\n'))
        # First 90% of positives / all-but-last-chunk of negatives for training.
        train_pos_s = train_pos[0:a]
        train_neg_s = train_neg[0:b]
        print("train pos s=" + str(train_pos_s.shape[0]) + str('\n'))
        print("train neg s=" + str(train_neg_s.shape[0]) + str('\n'))
        # NOTE(review): (a + 1) / (b + 1) skips one row between train and val,
        # so row a (resp. b) lands in neither split -- confirm intended.
        val_pos = train_pos[(a + 1):]
        print("val_pos=" + str(val_pos.shape[0]) + str('\n'))
        val_neg = train_neg[b + 1:]
        print("val_neg=" + str(val_neg.shape[0]) + str('\n'))
        val_all = pd.concat([val_pos, val_neg])
        val_all_s = val_all.sample(val_all.shape[0])
        valX1, valY1 = convertRawToXY(val_all_s.as_matrix(), codingMode=codingMode)
        slength = int(train_pos_s.shape[0] * srate)  # 10% moved to val, so update slength
        nclass = int(train_neg_s.shape[0] / slength)
    if (maxneg is not None):
        nclass = min(maxneg, nclass)  # cannot do more than maxneg times
    # modelweights=None;
    for I in range(nb_epoch1):
        train_neg_s = train_neg_s.sample(train_neg_s.shape[0])  # shuffle neg sample
        train_pos_ss = train_pos_s.sample(slength)              # fresh positive subsample per pass
        for t in range(nclass):
            # t-th negative chunk paired with the current positive subsample.
            train_neg_ss = train_neg_s[(slength * t):(slength * t + slength)]
            train_all = pd.concat([train_pos_ss, train_neg_ss])
            train_all_shuffle = train_all.sample(train_all.shape[0])
            trainX1, trainY1 = convertRawToXY(train_all_shuffle.as_matrix(), codingMode=codingMode)
            if t == 0:
                # First chunk compiles the model(s).
                models = MultiCNN(trainX=trainX1, trainY=trainY1, valX=valX1, valY=valY1, nb_classes=nb_classes, nb_epoch=nb_epoch2, earlystop=earlystop, weights=inputweights, compiletimes=t, batch_size=1000, class_weight=None, transferlayer=transferlayer, forkinase=forkinase, predict=predict, outputweights=outputweights, monitor_file=monitor_file_name, save_best_only=save_best_only, load_average_weight=load_average_weight)
            else:
                # Later chunks reuse the compiled models.
                # NOTE(review): this call omits predict=predict, unlike the
                # t == 0 call above -- confirm whether that is intentional.
                models = MultiCNN(trainX=trainX1, trainY=trainY1, valX=valX1, valY=valY1, nb_classes=nb_classes, nb_epoch=nb_epoch2, earlystop=earlystop, weights=inputweights, compiletimes=t, compilemodels=models, batch_size=1000, class_weight=None, transferlayer=transferlayer, forkinase=forkinase, outputweights=outputweights, monitor_file=monitor_file_name, save_best_only=save_best_only, load_average_weight=load_average_weight)
            # save weights already done in callbacks in MultiCNN
            print("modelweights assigned for " + str(I) + " and " + str(t) + "\n")
            # if(outputweights is not None):
            #    models.save_weights(outputweights+ '_iteration'+str(t),
            #                        overwrite=True)  #already in callbacks
    return models
def bootStrapping_allneg_continue_keras2(
        trainfile, valfile=None, srate=0.8, nb_epoch1=3, nb_epoch2=30,
        earlystop=None, maxneg=None, model=0, codingMode=0, frozenlayer=1,
        inputweights=None, outputweights=None, forkinas=False, nb_classes=2,
        hw_res=None, hc_res=None, hc_res2=None):
    """Bootstrap-train MultiCNN on a two-residue-type fragment dataset.

    Column 0 of the tab-separated input encodes residue type and class:
    label i (i in {0, 1}) is a positive of residue type i and label i + 2
    the matching negative.  For each residue type the negatives are consumed
    chunk by chunk (chunk size ``slength[i]``, wrapping with ``%``) across
    ``nclass_n`` bootstrap iterations per outer pass.

    NOTE(review): this redefines ``bootStrapping_allneg_continue_keras2`` and
    shadows the earlier definition with the same name -- confirm which
    variant callers expect.

    Fixes in this revision:
    * Python 2 ``print`` statements converted to ``print()`` calls so the
      file is valid Python 3 (the sibling functions already use ``print()``).
    * ``nclass_n`` now has a default when ``maxneg`` is None (previously a
      NameError at ``range(nclass_n)``).

    Parameters: trainfile/valfile are tab-separated fragment files;
    hw_res selects a label whose rows get boosted sample weights;
    hc_res / hc_res2 select labels that receive class weights.
    Returns the ``models`` object from the last MultiCNN call.
    """
    train_pos = {}    # positives per residue type: 0 S, 1 T
    train_neg = {}    # negatives per residue type
    train_pos_s = {}
    train_neg_s = {}
    train_pos_ss = {}
    train_neg_ss = {}
    slength = {}      # negative chunk size per residue type
    nclass = {}       # number of negative chunks per residue type
    trainX = pd.read_table(trainfile, sep='\t', header=None).values
    for i in range(2):
        # Label layout: sp 0, tp 1, yp 2, sn 3, tn 4, yn 5 -> here positives
        # are labels <= 1 and the matching negative is label + 2.
        train_pos[i] = trainX[np.where(trainX[:, 0] == i)]
        train_neg[i] = trainX[np.where(trainX[:, 0] == i + 2)]
        train_pos[i] = pd.DataFrame(train_pos[i])
        train_neg[i] = pd.DataFrame(train_neg[i])
        train_pos_s[i] = train_pos[i].sample(train_pos[i].shape[0])  # shuffle train pos
        train_neg_s[i] = train_neg[i].sample(train_neg[i].shape[0])  # shuffle train neg
        slength[i] = int(train_pos[i].shape[0] * srate)
        nclass[i] = int(train_neg[i].shape[0] / slength[i])
    if valfile is not None:
        # Use the entire valfile as validation data.
        valX = pd.read_table(valfile, sep='\t', header=None).values
        val_all = pd.DataFrame()
        for i in range(2):
            val_pos = valX[np.where(valX[:, 0] == i)]
            val_neg = valX[np.where(valX[:, 0] == i + 2)]
            val_pos = pd.DataFrame(val_pos)
            val_neg = pd.DataFrame(val_neg)
            val_all = pd.concat([val_all, val_pos, val_neg])
        valX1, valY1 = convertRawToXY(val_all.as_matrix(), codingMode=codingMode)
    else:
        # Hold out 10% of the training data per residue type as validation.
        val_all = pd.DataFrame()
        nclass = {}
        for i in range(2):
            a = int(train_pos[i].shape[0] * 0.9)
            b = train_neg[i].shape[0] - int(train_pos[i].shape[0] * 0.1)
            print("train pos=" + str(train_pos[i].shape[0]) + str('\n'))
            print("train neg=" + str(train_neg[i].shape[0]) + str('\n'))
            print(" a=" + str(a) + " b=" + str(b) + str('\n'))
            train_pos_s[i] = train_pos[i][0:a]
            train_neg_s[i] = train_neg[i][0:b]
            print("train pos s=" + str(train_pos_s[i].shape[0]) + str('\n'))
            print("train neg s=" + str(train_neg_s[i].shape[0]) + str('\n'))
            val_pos = train_pos[i][(a + 1):]
            print("val_pos=" + str(val_pos.shape[0]) + str('\n'))
            val_neg = train_neg[i][b + 1:]
            print("val_neg=" + str(val_neg.shape[0]) + str('\n'))
            val_all = pd.concat([val_all, val_pos, val_neg])
            slength[i] = int(train_pos_s[i].shape[0] * srate)  # 10% moved to val, so update slength
            nclass[i] = int(train_neg_s[i].shape[0] / slength[i])
        valX1, valY1 = convertRawToXY(val_all.as_matrix(), codingMode=codingMode)
    # Default covers every negative chunk; previously nclass_n was unbound
    # when maxneg was None.
    nclass_n = max(nclass.values())
    if maxneg is not None:
        nclass_n = min(nclass_n, maxneg)  # at most maxneg bootstrap rounds
    for I in range(nb_epoch1):
        for i in range(2):
            train_neg_s[i] = train_neg_s[i].sample(train_neg_s[i].shape[0])  # reshuffle negatives
            train_pos_ss[i] = train_pos_s[i].sample(slength[i])
        for t in range(nclass_n):
            train_all = pd.DataFrame()
            for i in range(2):
                # Wrap with % so t can exceed one residue type's chunk count.
                train_neg_ss[i] = train_neg_s[i][(slength[i] * t % nclass[i]):(slength[i] * t % nclass[i] + slength[i])]
                train_all = pd.concat([train_all, train_pos_ss[i], train_neg_ss[i]])
            sampleweights = None
            if hw_res is not None:
                # Boost rows labeled hw_res by (#non-negative rows / #hw_res rows).
                sampleweights = np.ones(len(train_all))
                sampleweights[np.where(train_all.as_matrix()[:, 0] == hw_res)] *= sum(sampleweights[np.where(train_all.as_matrix()[:, 0] != 0)]) / sum(sampleweights[np.where(train_all.as_matrix()[:, 0] == hw_res)])
            classweights = None
            if hc_res is not None:
                classweights = {0: 1, 1: 1, 2: 1, 3: 1}  # 0 negative, 1 S, 2 T, 3 Y
                # NOTE(review): true division under Python 3; under Python 2
                # this was integer division -- confirm the float weight is intended
                # (the hc_res2 branch below wraps in float() explicitly).
                classweights[hc_res] = sum(train_all.as_matrix()[:, 0] != 0) / sum(train_all.as_matrix()[:, 0] == hc_res)
            if hc_res2 is not None:  # negatives get a weight too
                classweights = {k: 1.0 for k in range(nb_classes)}
                classweights[hc_res2[0]] = float(sum(train_all.as_matrix()[:, 0] < 2)) / sum(train_all.as_matrix()[:, 0] == hc_res2[0])
                classweights[hc_res2[1]] = float(sum(train_all.as_matrix()[:, 0] < 2)) / sum(train_all.as_matrix()[:, 0] == hc_res2[1])
            print(train_all.as_matrix())
            trainX1, trainY1 = convertRawToXY(train_all.as_matrix(), codingMode=codingMode)
            print("#" * 30)
            print(trainX1.shape)
            print("#" * 30)
            if t == 0:
                # First chunk compiles the model(s).
                models = MultiCNN(trainX1, trainY1, valX1, valY1, nb_epoch=nb_epoch2, earlystop=earlystop, model=model, frozenlayer=frozenlayer, weights=inputweights, sample_weight=sampleweights, nb_classes=nb_classes, class_weight=classweights, forkinas=forkinas, compiletimes=t)
            else:
                # Later chunks continue training on the compiled models.
                models = MultiCNN(trainX1, trainY1, valX1, valY1, nb_epoch=nb_epoch2, earlystop=earlystop, model=model, frozenlayer=frozenlayer, weights=inputweights, sample_weight=sampleweights, nb_classes=nb_classes, class_weight=classweights, forkinas=forkinas, compiletimes=t, compilemodels=models)
            print("modelweights assigned for " + str(I) + " and " + str(t) + "\n")
            if outputweights is not None:
                models.save_weights(outputweights + '_iteration' + str(t), overwrite=True)
    return models
def main():
    """Predict modification sites with the trained CNN and write a TSV report.

    Reads the prediction input named by the CLI args, encodes the fragments
    with four feature schemes (ENAC, ANF/NCP/EIIP/one-hot, CKSNAP-, and
    PSTNPss-based), rebuilds the network, loads the trained weights, and
    writes ``<outresult>_custom2.txt`` with columns:
    id, position (1-based), focus residue, positive-class probability, species.

    Fixes in this revision:
    * Unknown ``args.inputType`` now fails fast with ValueError (previously
      ``predict_data`` was left unbound, crashing later with NameError).
    * Unknown species codes fall back to the raw code instead of crashing
      with NameError on ``speciess2``.
    """
    args = getArgs()
    inputfile = args.inputfile
    species = args.speciesarg
    out_result = args.outresult

    # Fixed hyper-parameters; these must match the configuration used to
    # train the weights loaded below.
    earlystop = 20
    batch_size = 512
    input_row_One_Hot = 41
    input_col_One_Hot = 5
    input_row_ANF_NCP = 41
    input_col_ANF_NCP = 9
    input_row_CKSNAP_NCP = 150
    input_col_CKSNAP_NCP = 17
    input_row_PSTNPss_NCP = 39
    input_col_PSTNPss_NCP = 25
    beta_1 = 0.9
    beta_2 = 0.999
    epsilon = 1e-08
    attentionhidden_x = 10
    attentionhidden_xr = 8
    attention_reg_x = 0.151948
    attention_reg_xr = 2
    learning_rate = 0.002
    weight_decay = 0.00005
    conv1_filter = 32
    dropoutMerge1 = 0.5
    conv2_filter = 136
    dropoutMerge2 = 0.1
    conv3_filter = 48
    dropoutMerge3 = 0.1
    dropoutMerge4 = 0.5
    dense1 = 64
    dropout_dense1 = 0.5
    dense3 = 8

    weightsModel = args.weightsModelarg
    best_model = weightsModel
    mycsvTrain = species + '_train_cnn.txt'
    train_All = pd.read_csv(mycsvTrain)
    window = 20
    # Parse the prediction input; fail fast on an unrecognized inputType.
    if args.inputType == 'file':
        predict_data, ids, poses, focuses = analyseFASTAPredict(inputfile, window, '-', focus='C')
    elif args.inputType == 'fixed':
        predict_data, ids, poses, focuses = analyseFixedPredict(inputfile)
    else:
        raise ValueError("unknown inputType: " + str(args.inputType))
    testLabel = predict_data.iloc[:, 0]

    # Encode the same fragments with each feature scheme the model expects.
    testX_One_Hot, testY_One_Hot = convertRawToXY(train_All.as_matrix(), predict_data.as_matrix(), codingMode='ENAC')
    testX_ANF_NCP, testY_ANF_NCP = convertRawToXY(train_All.as_matrix(), predict_data.as_matrix(), codingMode='ANF_NCP_EIIP_Onehot')
    testX_CKSNAP_NCP, testY_CKSNAP_NCP = convertRawToXY(train_All.as_matrix(), predict_data.as_matrix(), codingMode='CKSNAP_NCP_EIIP_Onehot')
    testX_PSTNPss_NCP, testY_PSTNPss_NCP = convertRawToXY(train_All.as_matrix(), predict_data.as_matrix(), codingMode='PSTNPss_NCP_EIIP_Onehot')

    # Rebuild the architecture in predict mode, then load the trained weights.
    predict_model = train_cnn(
        learning_rate=learning_rate, weight_decay=weight_decay,
        dropoutMerge1=dropoutMerge1, dropoutMerge2=dropoutMerge2,
        dropoutMerge3=dropoutMerge3, dropoutMerge4=dropoutMerge4,
        dropout_dense1=dropout_dense1, dense1=dense1, dense3=dense3,
        conv1_filter=conv1_filter, conv2_filter=conv2_filter,
        conv3_filter=conv3_filter, earlystop=earlystop, batch_size=batch_size,
        input_row_One_Hot=input_row_One_Hot, input_col_One_Hot=input_col_One_Hot,
        input_row_ANF_NCP=input_row_ANF_NCP, input_col_ANF_NCP=input_col_ANF_NCP,
        input_row_CKSNAP_NCP=input_row_CKSNAP_NCP, input_col_CKSNAP_NCP=input_col_CKSNAP_NCP,
        input_row_PSTNPss_NCP=input_row_PSTNPss_NCP, input_col_PSTNPss_NCP=input_col_PSTNPss_NCP,
        beta_1=beta_1, beta_2=beta_2, epsilon=epsilon,
        attentionhidden_x=attentionhidden_x, attentionhidden_xr=attentionhidden_xr,
        attention_reg_x=attention_reg_x, attention_reg_xr=attention_reg_xr,
        best_model=best_model, weightsModel=weightsModel, predict=True,
        train=False, transfer=False)
    predict_model.load_weights(weightsModel)
    predictproba = predict_model.predict([testX_One_Hot, testX_ANF_NCP, testX_CKSNAP_NCP, testX_PSTNPss_NCP])

    poses = poses + 1  # report 1-based positions
    # Map the species code (last path component) to its display name;
    # unknown codes fall back to the raw code instead of raising NameError.
    speciess = species.split('/')[-1]
    species_names = {
        'A': 'A.thaliana',
        'C': 'C.elegan',
        'D': 'D.melanogaster',
        'E': 'E.coli',
        'Gsub': 'G.subterraneus',
        'Gpick': 'G.pickeringii',
    }
    speciess2 = species_names.get(speciess, speciess)
    typess = np.full((testLabel.shape[0], 1), speciess2)
    results = np.column_stack((ids, poses, focuses, predictproba[:, 1], typess))
    result = pd.DataFrame(results)
    result.to_csv(out_result + "_custom2.txt", index=False, header=None, sep='\t', quoting=csv.QUOTE_NONNUMERIC)
def bootStrapping_allneg_continue_val(
        trainfile, valfile=None, srate=0.8, nb_epoch1=3, nb_epoch2=30,
        earlystop=None, maxneg=None, codingMode=0, transferlayer=1,
        inputweights=None, outputweights=None, forkinas=False):
    """Bootstrap-train MultiCNN on a binary dataset (label 1 = positive).

    Negatives (label != 1) are consumed in sequential chunks of ``slength``
    rows, one MultiCNN fit per chunk, for ``nb_epoch1`` outer passes.

    Fix in this revision: Python 2 ``print`` statements converted to
    ``print()`` calls so the file is valid Python 3, matching the sibling
    functions in this module.

    Parameters
    ----------
    trainfile : array of raw fragments (n x 34); column 0 is the label.
    valfile : optional DataFrame used entirely as validation data; when
        None, 10% of the training data is held out instead.
    srate : fraction of positives sampled per bootstrap pass.
    maxneg : optional cap on the number of negative chunks per pass.
    outputweights : when given, weights are saved after every chunk.

    Returns the ``models`` object from the last MultiCNN call.
    """
    trainX = trainfile
    train_pos = trainX[np.where(trainX[:, 0] == 1)]
    train_neg = trainX[np.where(trainX[:, 0] != 1)]
    train_pos = pd.DataFrame(train_pos)
    train_neg = pd.DataFrame(train_neg)
    # Fail early with a clear message when one class is missing entirely.
    if train_pos.shape[0] == 0:
        print('ERROR: size of positive sites is 0. Please check positive sites in training data!\n')
        exit()
    if train_neg.shape[0] == 0:
        print('ERROR: size of negative sites is 0. Please check negative sites in training data!\n')
        exit()
    train_pos_s = train_pos.sample(train_pos.shape[0])  # shuffle train pos
    train_neg_s = train_neg.sample(train_neg.shape[0])  # shuffle train neg
    slength = int(train_pos.shape[0] * srate)           # negatives per chunk
    nclass = int(train_neg.shape[0] / slength)          # number of chunks
    if valfile is not None:
        # Use all of valfile as validation data.
        valX = valfile.as_matrix()
        val_pos = valX[np.where(valX[:, 0] == 1)]
        val_neg = valX[np.where(valX[:, 0] != 1)]
        val_pos = pd.DataFrame(val_pos)
        val_neg = pd.DataFrame(val_neg)
        val_all = pd.concat([val_pos, val_neg])
        valX1, valY1 = convertRawToXY(val_all.as_matrix(), codingMode=codingMode)
    else:
        # Hold out 10% of the training data as validation.
        a = int(train_pos.shape[0] * 0.9)
        b = train_neg.shape[0] - int(train_pos.shape[0] * 0.1)
        train_pos_s = train_pos[0:a]
        train_neg_s = train_neg[0:b]
        val_pos = train_pos[(a + 1):]
        val_neg = train_neg[b + 1:]
        val_all = pd.concat([val_pos, val_neg])
        valX1, valY1 = convertRawToXY(val_all.as_matrix(), codingMode=codingMode)
        slength = int(train_pos_s.shape[0] * srate)  # 10% moved to val, so update
        nclass = int(train_neg_s.shape[0] / slength)
    if maxneg is not None:
        nclass = min(maxneg, nclass)  # cannot do more than maxneg times
    for I in range(nb_epoch1):
        train_neg_s = train_neg_s.sample(train_neg_s.shape[0])  # reshuffle negatives
        train_pos_ss = train_pos_s.sample(slength)              # fresh positive subsample
        for t in range(nclass):
            # t-th negative chunk paired with the current positive subsample.
            train_neg_ss = train_neg_s[(slength * t):(slength * t + slength)]
            train_all = pd.concat([train_pos_ss, train_neg_ss])
            trainX1, trainY1 = convertRawToXY(train_all.as_matrix(), codingMode=codingMode)
            if t == 0:
                # First chunk compiles the model(s).
                models = MultiCNN(trainX1, trainY1, valX1, valY1, nb_epoch=nb_epoch2, earlystop=earlystop, transferlayer=transferlayer, weights=inputweights, forkinas=forkinas, compiletimes=t)
            else:
                # Later chunks continue training on the compiled models.
                models = MultiCNN(trainX1, trainY1, valX1, valY1, nb_epoch=nb_epoch2, earlystop=earlystop, transferlayer=transferlayer, weights=inputweights, forkinas=forkinas, compiletimes=t, compilemodels=models)
            print("modelweights assigned for " + str(t) + " bootstrap.\n")
            if outputweights is not None:
                models.save_weights(outputweights, overwrite=True)
    return models