def main():
    """10-fold cross-validation evaluation driver for the zinc-binding-site
    capsule-network predictor.

    Command-line flags choose the output prefix, window size, model prefix,
    residue types and coding mode; note that the per-fold input / weight /
    model paths are hard-coded below and override several of the parsed
    values inside the fold loop.  Mean AUC / PR / accuracy / sensitivity /
    specificity / F1 / MCC over the 5 folds are appended to a text file
    derived from -output.
    """
    parser = argparse.ArgumentParser()
    # -input is disabled: the fold loop below hard-codes the input fasta path.
    # parser.add_argument('-input', dest='inputfile', type=str, help='Protein sequences to be predicted in fasta format.', required=True)
    parser.add_argument('-output', dest='outputfile', type=str, help='prefix of the prediction results.', required=True)
    parser.add_argument('-window', dest='window', type=int, help='specify the window size', required=True)
    parser.add_argument('-model-prefix', dest='modelprefix', type=str, help='prefix of custom model used for prediciton. If you do not have one, please run train_models.py to train a model.', required=False, default=None)
    parser.add_argument('-residue-types', dest='residues', type=str, help='Residue types that to be predicted. For multiple residues, seperate each with \',\'', required=False, default="C,H,E,D")
    parser.add_argument('-codingMode', dest='codingMode', type=int, help='Set the input sequence encoding mode.', required=False, default=0)
    args = parser.parse_args()
    # inputfile=args.inputfile  (disabled together with -input above)
    outputfile = args.outputfile
    residues = args.residues.split(",")
    modelprefix = args.modelprefix
    window = args.window
    codemode = args.codingMode
    print(outputfile, residues, modelprefix, window)
    # Earlier score-file variants kept for reference:
    # outputfile = r'/home/ucexbw/ZinCaps/ActiveSitePrediction/data/output/'
    # fp = open(outputfile+"eval_by_AUC_precision_scores_polynomial_decay_increase_decrease_1_0.5_1",'w')
    # fp = open(outputfile+"eval_by_AUC_precision_scores_polynomial_decay_1_0.5_1",'w')
    # fp = open(outputfile+"eval_by_AUC_precision_scores_10fold",'w')
    fp = open(outputfile + "eval_by_AUC_precision_scores_10fold_constantweight1_0.5_25.txt", 'w')
    # Build the network architecture once with dummy data (shape depends on
    # the window size); trained weights are loaded per fold / per bootstrap
    # class below.
    model_arch = Capsnet_main(np.zeros([3, 2 * window + 1, 6]), [], nb_epoch=1, compiletimes=0, lr=0.001, batch_size=500, lam_recon=0, routings=3, modeltype='nogradientstop', nb_classes=2, predict=True)
    # model_arch=Capsnet_main(np.zeros([3,2*16+1,21]),[],nb_epoch=1,compiletimes=0,lr=0.001,batch_size=500,lam_recon=0,routings=3,modeltype='nogradientstop',nb_classes=2,predict=True)
    # Per-fold metric accumulators (5 folds).
    roc_average_weight = np.zeros(5)
    roc_average_predict = np.zeros(5)
    roc_average_last_predict = np.zeros(5)
    accuracy_average_last_predict = np.zeros(5)
    sensitivity_average_last_predict = np.zeros(5)
    specificity_average_last_predict = np.zeros(5)
    f1_score_average_last_predict = np.zeros(5)
    mcc_average_last_predict = np.zeros(5)
    pr_average_weight = np.zeros(5)
    pr_average_predict = np.zeros(5)
    pr_average_last_predict = np.zeros(5)
    for time in range(5):
        fp.write("############################" + str(time) + "\n")
        # Hard-coded per-fold paths; these override the parsed command-line
        # values for inputfile and modelprefix.
        inputfile = '/scratch/ucexbw/ZinCaps25/ActiveSitePrediction/lib/K-Fold/annotated_sequence.fasta_training_annotated_' + str(
            time) + '.fasta'
        # if os.path.exists(outputfile+"eval_by_AUC_precision_scores"):
        #     os.rm(outputfile+"eval_by_AUC_precision_scores")
        checkpointweights = '/scratch/ucexbw/ZinCaps25/ActiveSitePrediction/data/weights/Zinc_' + str(time) + '_weights'
        modelprefix = '/scratch/ucexbw/ZinCaps25/ActiveSitePrediction/data/models/Zinc_' + str(time) + '_model'
        # eval_type selects which evaluation strategy runs:
        #   all / average_weight / average_predict / average_last_predict
        eval_type = 'average_last_predict'
        if modelprefix is None:
            # NOTE(review): unreachable here -- modelprefix was just assigned
            # a non-None path a few lines above.
            exit()
        else:  # custom prediction
            model = modelprefix + str("_HDF5model")
            parameter = modelprefix + str("_parameters")
            try:
                f = open(parameter, 'r')
            except IOError:
                print('cannot open ' + parameter + " ! check if the model exists. "
                      "please run train_general.py or train_kinase.py to get the custom model first!\n")
            else:
                # NOTE(review): the handle opened in the try block above is
                # never closed; this second open re-reads the same file.
                f = open(parameter, 'r')
                parameters = f.read()
                f.close()
                # Parameter file is a single tab-separated record written by
                # training: nclass, window, residues, <unused>, codemode,
                # modeltype, nb_classes.
                nclass = int(parameters.split("\t")[0])
                window = int(parameters.split("\t")[1])
                residues = parameters.split("\t")[2]
                residues = residues.split(",")
                codemode = int(parameters.split("\t")[4])
                modeltype = str(parameters.split("\t")[5])
                nb_classes = int(parameters.split("\t")[6])
                testfrag, ids, poses, focuses = extractFragforPredict(inputfile, window, '-', focus=residues)
                testX, testY = convertRawToXY(testfrag.as_matrix(), codingMode=codemode)
                if len(testX.shape) > 3:
                    # Drop the singleton channel axis: (N, 1, W, C) -> (N, W, C).
                    testX.shape = (testX.shape[0], testX.shape[2], testX.shape[3])
                # Probability accumulators for the three ensembling strategies.
                predict_average_weight = np.zeros((testX.shape[0], 2))
                predict_average_predict = np.zeros((testX.shape[0], 2))
                predict_average_last_predict = np.zeros((testX.shape[0], 2))
                for bt in range(nclass):  # 0 648 bt=2 len(tf.trainable_variables())=1530
                    # load all involving mode weights
                    # sess = tf.Session()
                    inputweights = checkpointweights + "_nclass" + str(bt) + "_iteration"
                    model_members = load_model_weights(inputweights, model_arch)
                    if eval_type == "all" or eval_type == "average_weight":
                        predict_temp = predict_by_avg_members(model_members, model_arch, testX)
                        predict_average_weight += predict_temp
                        auc_score, pr_score, accuracy, sensitivity, specificity, f1_score, mcc = evaluate(predict_temp, testY)
                        # NOTE(review): every branch in this function stores
                        # into the *_average_last_predict arrays, so with
                        # eval_type == "all" later branches overwrite earlier
                        # ones within a fold -- confirm this is intended.
                        roc_average_last_predict[time] = auc_score
                        pr_average_last_predict[time] = pr_score
                        accuracy_average_last_predict[time] = accuracy
                        sensitivity_average_last_predict[time] = sensitivity
                        specificity_average_last_predict[time] = specificity
                        f1_score_average_last_predict[time] = f1_score
                        mcc_average_last_predict[time] = mcc
                        # (per-bt "average_weight_results" fp.write logging disabled)
                    if eval_type == "all" or eval_type == "average_predict":
                        predict_temp = predict_by_snapshot(model_members, model_arch, testX)
                        predict_average_predict += predict_temp
                        auc_score, pr_score, accuracy, sensitivity, specificity, f1_score, mcc = evaluate(predict_temp, testY)
                        roc_average_last_predict[time] = auc_score
                        pr_average_last_predict[time] = pr_score
                        accuracy_average_last_predict[time] = accuracy
                        sensitivity_average_last_predict[time] = sensitivity
                        specificity_average_last_predict[time] = specificity
                        f1_score_average_last_predict[time] = f1_score
                        mcc_average_last_predict[time] = mcc
                        print("average_predict results:")
                        # (per-bt "average_predict_results" fp.write logging disabled)
                    # Free the snapshot members before loading the next class.
                    del model_members
                    # sess.close()
                if eval_type == "all" or eval_type == "average_weight1":
                    # Mean of the per-class averaged-weight probabilities.
                    predict_average_weight = predict_average_weight / float(nclass)
                    auc_score, pr_score, accuracy, sensitivity, specificity, f1_score, mcc = evaluate(predict_average_weight, testY)
                    print("average_weight1")
                    roc_average_last_predict[time] = auc_score
                    pr_average_last_predict[time] = pr_score
                    accuracy_average_last_predict[time] = accuracy
                    sensitivity_average_last_predict[time] = sensitivity
                    specificity_average_last_predict[time] = specificity
                    f1_score_average_last_predict[time] = f1_score
                    mcc_average_last_predict[time] = mcc
                    # ("average_weight_results" fp.write logging and
                    #  roc/pr_average_weight bookkeeping disabled)
                    # write_output(outputfile + "average_weight_results_fold"+str(time)+".txt",predict_average_weight,ids,poses,focuses)
                if eval_type == "all" or eval_type == "average_predict":
                    # Mean of the per-class snapshot-averaged probabilities.
                    predict_average_predict = predict_average_predict / float(nclass)
                    auc_score, pr_score, accuracy, sensitivity, specificity, f1_score, mcc = evaluate(predict_average_predict, testY)
                    roc_average_last_predict[time] = auc_score
                    pr_average_last_predict[time] = pr_score
                    accuracy_average_last_predict[time] = accuracy
                    sensitivity_average_last_predict[time] = sensitivity
                    specificity_average_last_predict[time] = specificity
                    f1_score_average_last_predict[time] = f1_score
                    mcc_average_last_predict[time] = mcc
                    # ("average_predict_results" fp.write logging and
                    #  roc/pr_average_predict bookkeeping disabled)
                    # write_output(outputfile + "average_predict_results_fold"+str(time)+".txt",predict_average_predict,ids,poses,focuses)
                if eval_type == "all" or eval_type == "average_last_predict":
                    # Average the final (last-iteration) model of each class.
                    nclass_ini = 1
                    for bt in range(nclass):
                        model_arch[0].load_weights(model + "_class" + str(bt))
                        predict_temp = model_arch[1].predict(testX)[0]
                        predict_average_last_predict += predict_temp
                        auc_score, pr_score, accuracy, sensitivity, specificity, f1_score, mcc = evaluate(predict_temp, testY)
                        # (per-bt "average_last_predict_results" fp.write logging disabled)
                    predict_average_last_predict = predict_average_last_predict / (nclass * nclass_ini)
                    auc_score, pr_score, accuracy, sensitivity, specificity, f1_score, mcc = evaluate(
                        predict_average_last_predict, testY)
                    # ("average_last_predict_results" per-fold fp.write logging disabled)
                    roc_average_last_predict[time] = auc_score
                    pr_average_last_predict[time] = pr_score
                    accuracy_average_last_predict[time] = accuracy
                    sensitivity_average_last_predict[time] = sensitivity
                    specificity_average_last_predict[time] = specificity
                    f1_score_average_last_predict[time] = f1_score
                    mcc_average_last_predict[time] = mcc
                    # write_output(outputfile + "average_last_predict_results_fold"+str(time)+".txt",predict_average_last_predict,ids,poses,focuses)
                print("Successfully predicted from custom models !\n")
    fp.write("!!!!!!!!!!!!!!!!!!!!!!!!!\n")
    # (per-strategy summary fp.write logging with per-fold score lists,
    #  means and standard deviations disabled)
    # print("roc: \n")
    print(roc_average_last_predict)
    # Summary line: mean of each metric over the 5 folds.
    fp.write("average_last_predict_results: \t" + "\t" + str(np.mean(roc_average_last_predict)) + "," +
             "\t" + str(np.mean(pr_average_last_predict)) + "," + str(np.mean(accuracy_average_last_predict)) + "," +
             "\t" + str(np.mean(sensitivity_average_last_predict)) + "," +
             "\t" + str(np.mean(specificity_average_last_predict)) + "," +
             "\t" + str(np.mean(f1_score_average_last_predict)) + "," +
             "\t" + str(np.mean(mcc_average_last_predict)) + "\n")
    fp.close()
def main():
    """Predict PTM sites with a previously trained custom capsule-network model.

    Reads fasta sequences from -input, loads the model files
    [-model-prefix]_HDF5model and [-model-prefix]_parameters, averages the
    per-bootstrap-class prediction probabilities and writes a tab-separated
    result table <output>.txt with columns: id, position, residue, score.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-input', dest='inputfile', type=str,
        help='Protein sequences to be predicted in fasta format.',
        required=True)
    parser.add_argument('-output', dest='outputfile', type=str,
                        help='prefix of the prediction results.',
                        required=True)
    parser.add_argument(
        '-model-prefix', dest='modelprefix', type=str,
        help='prefix of custom model used for prediciton. If donnot have one, please run train_general.py to train a custom general PTM model or run train_kinase.py to train a custom kinase-specific PTM model.',
        required=False, default=None)
    parser.add_argument(
        '-residue-types', dest='residues', type=str,
        help='Residue types that to be predicted, only used when -predict-type is \'general\'. For multiple residues, seperate each with \',\'',
        required=False, default="S,T,Y")
    args = parser.parse_args()

    inputfile = args.inputfile
    outputfile = args.outputfile
    residues = args.residues.split(",")
    modelprefix = args.modelprefix

    if modelprefix is None:
        # Converted from a Python-2 print statement so the file parses under
        # one interpreter (other functions here already use print()).
        print("Please specify the prefix for an existing custom model by -model-prefix!\n"
              "It indicates two files [-model-prefix]_HDF5model and [-model-prefix]_parameters.\n"
              "If you don't have such files, please run train_models.py to get the custom model first!\n")
        exit()
    else:  # custom prediction
        model = modelprefix + str("_HDF5model")
        parameter = modelprefix + str("_parameters")
        try:
            f = open(parameter, 'r')
        except IOError:
            print('cannot open ' + parameter + " ! check if the model exists. "
                  "please run train_general.py or train_kinase.py to get the custom model first!\n")
        else:
            # Reuse the handle opened in the try block; the original reopened
            # the file here and leaked the first handle.
            parameters = f.read()
            f.close()
            from DProcess import convertRawToXY
            from EXtractfragment_sort import extractFragforPredict
            from capsulenet import Capsnet_main
            # Parameter file is a single tab-separated record written by
            # training: nclass, window, residues, <unused>, codemode,
            # modeltype, nb_classes.
            nclass = int(parameters.split("\t")[0])
            window = int(parameters.split("\t")[1])
            residues = parameters.split("\t")[2]
            residues = residues.split(",")
            codemode = int(parameters.split("\t")[4])
            modeltype = str(parameters.split("\t")[5])
            nb_classes = int(parameters.split("\t")[6])
            # print "nclass="+str(nclass)+"codemode="+str(codemode)+"\n"
            testfrag, ids, poses, focuses = extractFragforPredict(inputfile, window, '-', focus=residues)
            testX, testY = convertRawToXY(testfrag.as_matrix(), codingMode=codemode)
            predictproba = np.zeros((testX.shape[0], 2))
            # Built only to obtain a compiled architecture; trained weights
            # are loaded per class in the loop below.
            models = Capsnet_main(testX, testY, nb_epoch=1, compiletimes=0,
                                  lr=0.001, batch_size=500, lam_recon=0,
                                  routings=3, modeltype=modeltype,
                                  nb_classes=nb_classes,
                                  predict=True)  # only to get config
            nclass_ini = 1
            for bt in range(nclass):
                models[0].load_weights(model + "_class" + str(bt))
                predictproba += models[1].predict(testX)[0]
            # Average the accumulated probabilities over all loaded models.
            predictproba = predictproba / (nclass * nclass_ini)
            poses = poses + 1  # convert to 1-based positions for the report
            results = np.column_stack((ids, poses, focuses, predictproba[:, 1]))
            result = pd.DataFrame(results)
            result.to_csv(outputfile + ".txt", index=False, header=None,
                          sep='\t', quoting=csv.QUOTE_NONNUMERIC)
            print("Successfully predicted from custom models !\n")
def bootStrapping_allneg_continue_keras2(trainfile, valfile=None, srate=0.8,
                                         nb_epoch1=3, nb_epoch2=30,
                                         earlystop=None, maxneg=None, model=0,
                                         codingMode=0, lam_recon=0,
                                         inputweights=None, outputweights=None,
                                         nb_classes=2):
    """Bootstrap-train a capsule network against the full negative set.

    trainfile is an array of fragments whose first column is the label
    (0 = negative, nonzero = positive).  Positives are reused in every
    bootstrap round while the (much larger) negative set is consumed in
    successive same-sized slices, giving nclass rounds per outer epoch.
    If valfile is None, 10% of the training data is held out as validation.

    Returns the tuple (models, eval_model, manipulate_model, weight_c_model,
    fitHistory) from the last Capsnet_main call.
    """
    trainX = trainfile
    train_pos = trainX[np.where(trainX[:, 0] != 0)]
    train_neg = trainX[np.where(trainX[:, 0] == 0)]
    train_pos = pd.DataFrame(train_pos)
    train_neg = pd.DataFrame(train_neg)
    train_pos_s = train_pos.sample(train_pos.shape[0])  # shuffle train pos
    train_neg_s = train_neg.sample(train_neg.shape[0])  # shuffle train neg
    slength = int(train_pos.shape[0] * srate)
    nclass = int(train_neg.shape[0] / slength)
    if valfile is not None:
        # Use all data in valfile as the validation set.
        valX = valfile
        val_pos = valX[np.where(valX[:, 0] != 0)]
        val_neg = valX[np.where(valX[:, 0] == 0)]
        val_pos = pd.DataFrame(val_pos)
        val_neg = pd.DataFrame(val_neg)
        val_all = pd.concat([val_pos, val_neg])
        valX1, valY1 = convertRawToXY(val_all.as_matrix(), codingMode=codingMode)
    else:
        # Hold out 10% of the training data as validation.
        a = int(train_pos.shape[0] * 0.9)
        b = train_neg.shape[0] - int(train_pos.shape[0] * 0.1)
        # print statements converted to print() for Python-3 compatibility.
        print("train pos=" + str(train_pos.shape[0]) + str('\n'))
        print("train neg=" + str(train_neg.shape[0]) + str('\n'))
        print(" a=" + str(a) + " b=" + str(b) + str('\n'))
        train_pos_s = train_pos[0:a]
        train_neg_s = train_neg[0:b]
        print("train pos s=" + str(train_pos_s.shape[0]) + str('\n'))
        print("train neg s=" + str(train_neg_s.shape[0]) + str('\n'))
        # NOTE(review): rows a and b are skipped entirely by the (a + 1) /
        # (b + 1) starts -- looks like an off-by-one in the split; preserved
        # to keep existing training splits reproducible. TODO confirm.
        val_pos = train_pos[(a + 1):]
        print("val_pos=" + str(val_pos.shape[0]) + str('\n'))
        val_neg = train_neg[b + 1:]
        print("val_neg=" + str(val_neg.shape[0]) + str('\n'))
        val_all = pd.concat([val_pos, val_neg])
        valX1, valY1 = convertRawToXY(val_all.as_matrix(), codingMode=codingMode)
        slength = int(train_pos_s.shape[0] * srate)  # update slength after holdout
        nclass = int(train_neg_s.shape[0] / slength)
    if maxneg is not None:
        nclass = min(maxneg, nclass)  # cannot do more than maxneg rounds
    for I in range(nb_epoch1):
        train_neg_s = train_neg_s.sample(train_neg_s.shape[0])  # reshuffle negatives
        train_pos_ss = train_pos_s.sample(slength)
        for t in range(nclass):
            # Next negative slice, paired with the sampled positives.
            train_neg_ss = train_neg_s[(slength * t):(slength * t + slength)]
            train_all = pd.concat([train_pos_ss, train_neg_ss])
            trainX1, trainY1 = convertRawToXY(train_all.as_matrix(), codingMode=codingMode)
            if t == 0:
                # First round compiles the models.
                models, eval_model, manipulate_model, weight_c_model, fitHistory = Capsnet_main(
                    trainX=trainX1, trainY=trainY1, valX=valX1, valY=valY1,
                    nb_classes=nb_classes, nb_epoch=nb_epoch2,
                    earlystop=earlystop, weights=inputweights, compiletimes=t,
                    lr=0.001, batch_size=500, lam_recon=lam_recon, routings=3,
                    class_weight=None, modeltype=model)
            else:
                # Later rounds continue training the already-compiled models.
                models, eval_model, manipulate_model, weight_c_model, fitHistory = Capsnet_main(
                    trainX=trainX1, trainY=trainY1, valX=valX1, valY=valY1,
                    nb_classes=nb_classes, nb_epoch=nb_epoch2,
                    earlystop=earlystop, weights=inputweights, compiletimes=t,
                    compilemodels=(models, eval_model, manipulate_model, weight_c_model),
                    lr=0.001, batch_size=500, lam_recon=lam_recon, routings=3,
                    class_weight=None, modeltype=model)
            print("modelweights assigned for " + str(I) + " and " + str(t) + "\n")
            if outputweights is not None:
                # Checkpoint after every round (overwrites the same file).
                models.save_weights(outputweights, overwrite=True)
    return models, eval_model, manipulate_model, weight_c_model, fitHistory
def bootStrapping_allneg_continue_keras2(trainfile, valfile=None, srate=0.8,
                                         nb_epoch1=3, nb_epoch2=30,
                                         earlystop=None, maxneg=None, model=0,
                                         codingMode=0, lam_recon=0,
                                         inputweights=None, outputweights=None,
                                         nb_classes=2, hw_res=None,
                                         hc_res=None, hc_res2=None):
    """Bootstrap-train a capsule network on 4-label (S/T vs Y, pos vs neg) data.

    trainfile holds fragments (n*34) whose first column is the label:
    0 = S/T positive, 1 = Y positive, 2 = S/T negative, 3 = Y negative.
    srate is the selection rate for positive data; the negative sets are
    consumed slice-by-slice over nclass_n bootstrap rounds per outer epoch.
    hc_res2 (e.g. [0, 2] for T) selects two classes whose class weight is
    boosted to the positives/negatives ratio of the current round.

    Returns (models, eval_model, manipulate_model, weight_c_model,
    fitHistory) from the last Capsnet_main call.
    """
    train_pos = {}   # 0: S/T positives, 1: Y positives
    train_neg = {}   # 0: S/T negatives, 1: Y negatives
    train_pos_s = {}
    train_neg_s = {}
    train_pos_ss = {}
    train_neg_ss = {}
    slength = {}
    nclass = {}
    trainX = trainfile
    # Labels may arrive as floats/strings; normalize to int for comparison.
    for i in range(len(trainX)):
        trainX[i, 0] = int(trainX[i, 0])
    for i in range(2):
        # sp/tp 0, yp 1, sn/tn 2, yn 3
        train_pos[i] = trainX[np.where(trainX[:, 0] == i)]
        train_neg[i] = trainX[np.where(trainX[:, 0] == i + 2)]
        train_pos[i] = pd.DataFrame(train_pos[i])
        train_neg[i] = pd.DataFrame(train_neg[i])
        train_pos_s[i] = train_pos[i].sample(train_pos[i].shape[0])  # shuffle positives
        train_neg_s[i] = train_neg[i].sample(train_neg[i].shape[0])  # shuffle negatives
        slength[i] = int(train_pos[i].shape[0] * srate)
        nclass[i] = int(train_neg[i].shape[0] / slength[i])
    if valfile is not None:
        # Use all data in valfile as the validation set.
        valX = valfile
        for i in range(len(valX)):
            valX[i, 0] = int(valX[i, 0])
        val_all = pd.DataFrame()
        for i in range(2):
            val_pos = valX[np.where(valX[:, 0] == i)]
            val_neg = valX[np.where(valX[:, 0] == i + 2)]
            val_pos = pd.DataFrame(val_pos)
            val_neg = pd.DataFrame(val_neg)
            val_all = pd.concat([val_all, val_pos, val_neg])
        valX1, valY1 = convertRawToXY(val_all.as_matrix(), codingMode=codingMode)
    else:
        # Hold out 10% of the training data per residue group as validation.
        val_all = pd.DataFrame()
        nclass = {}
        for i in range(2):
            a = int(train_pos[i].shape[0] * 0.9)
            b = train_neg[i].shape[0] - int(train_pos[i].shape[0] * 0.1)
            # print statements converted to print() for Python-3 compatibility.
            print("train pos=" + str(train_pos[i].shape[0]) + str('\n'))
            print("train neg=" + str(train_neg[i].shape[0]) + str('\n'))
            print(" a=" + str(a) + " b=" + str(b) + str('\n'))
            train_pos_s[i] = train_pos[i][0:a]
            train_neg_s[i] = train_neg[i][0:b]
            print("train pos s=" + str(train_pos_s[i].shape[0]) + str('\n'))
            print("train neg s=" + str(train_neg_s[i].shape[0]) + str('\n'))
            # NOTE(review): rows a and b are skipped by the (a + 1)/(b + 1)
            # starts -- apparent off-by-one, preserved for reproducibility.
            val_pos = train_pos[i][(a + 1):]
            print("val_pos=" + str(val_pos.shape[0]) + str('\n'))
            val_neg = train_neg[i][b + 1:]
            print("val_neg=" + str(val_neg.shape[0]) + str('\n'))
            val_all = pd.concat([val_all, val_pos, val_neg])
            slength[i] = int(train_pos_s[i].shape[0] * srate)  # 10% moved to val, so update
            nclass[i] = int(train_neg_s[i].shape[0] / slength[i])
        valX1, valY1 = convertRawToXY(val_all.as_matrix(), codingMode=codingMode)
    # BUG FIX: the original defined nclass_n only when maxneg was given,
    # raising NameError for maxneg=None; default to the larger group count.
    nclass_n = max([nclass[0], nclass[1]])
    if maxneg is not None:
        nclass_n = min(nclass_n, maxneg)  # cannot do more than maxneg rounds
    for I in range(nb_epoch1):
        for i in range(2):
            train_neg_s[i] = train_neg_s[i].sample(train_neg_s[i].shape[0])  # reshuffle negatives
            train_pos_ss[i] = train_pos_s[i].sample(slength[i])
        for t in range(nclass_n):
            train_all = pd.DataFrame()
            for i in range(2):
                # Wrap the slice start with % nclass[i] so the smaller group
                # cycles while the larger one is still being consumed.
                train_neg_ss[i] = train_neg_s[i][(slength[i] * t % nclass[i]):(slength[i] * t % nclass[i] + slength[i])]
                train_all = pd.concat([train_all, train_pos_ss[i], train_neg_ss[i]])
            classweights = None
            if hc_res2 is not None:
                # Negative classes get a weight! hc_res2 is [0, 2] for T.
                classweights = {k: 1.0 for k in range(nb_classes)}  # stp 0 yp 1 stn 2 yn 3
                classweights[hc_res2[0]] = float(sum(train_all.as_matrix()[:, 0] <= 1)) / sum(train_all.as_matrix()[:, 0] == hc_res2[0])
                classweights[hc_res2[1]] = float(sum(train_all.as_matrix()[:, 0] <= 1)) / sum(train_all.as_matrix()[:, 0] == hc_res2[1])
            trainX1, trainY1 = convertRawToXY(train_all.as_matrix(), codingMode=codingMode)
            if t == 0:
                # First round compiles the models.
                models, eval_model, manipulate_model, weight_c_model, fitHistory = Capsnet_main(
                    trainX=trainX1, trainY=trainY1, valX=valX1, valY=valY1,
                    nb_classes=nb_classes, nb_epoch=nb_epoch2,
                    earlystop=earlystop, weights=inputweights, compiletimes=t,
                    lr=0.001, batch_size=1000, lam_recon=lam_recon,
                    routings=3, class_weight=classweights, modeltype=model)
            else:
                # BUG FIX: the original passed undefined plural names
                # (eval_models, manipulate_models, weight_c_models) in
                # compilemodels, raising NameError on the second round.
                models, eval_model, manipulate_model, weight_c_model, fitHistory = Capsnet_main(
                    trainX=trainX1, trainY=trainY1, valX=valX1, valY=valY1,
                    nb_classes=nb_classes, nb_epoch=nb_epoch2,
                    earlystop=earlystop, weights=inputweights, compiletimes=t,
                    compilemodels=(models, eval_model, manipulate_model, weight_c_model),
                    lr=0.001, batch_size=1000, lam_recon=lam_recon,
                    routings=3, class_weight=classweights, modeltype=model)
            print("modelweights assigned for " + str(I) + " and " + str(t) + "\n")
            if outputweights is not None:
                # Per-round checkpoint, suffixed with the round index.
                models.save_weights(outputweights + '_iteration' + str(t), overwrite=True)
    return models, eval_model, manipulate_model, weight_c_model, fitHistory