def main(): dbConn = db.connect_to_server('pfacts003_test', 'webuser') dbCur = db.get_cursor(dbConn) dbConn.set_isolation_level(0) try: dbCur.execute("DROP TABLE snp_rost_intrepid_header;") print "\nPrevious version of 'snp_rost_intrepid_header' deleted." except: print "\nTable 'snp_rost_intrepid_header' does not exist and will be created." try: dbCur.execute("DROP TABLE snp_rost_intrepid_detail;") print "Previous version of 'snp_rost_intrepid_detail' deleted." except: print "Table 'snp_rost_intrepid_detail' does not exist and will be created." dbCur.execute("CREATE TABLE snp_rost_intrepid_header \ (id serial PRIMARY KEY, protaccid varchar, familyid integer, \ familytype char(1), msaok boolean, treeok boolean, idmapok boolean, scoresok boolean, familyscore integer, \ numresscored integer, starttime varchar);") print "Created new or replacement 'snp_rost_intrepid_header' table in database." dbCur.execute("CREATE TABLE snp_rost_intrepid_detail \ (id serial PRIMARY KEY, protaccid varchar, familyid integer, position integer,resseq integer,\ chain varchar,residue char(1),icode char(1),scoreimpmap float,scorejsmap float,scoreremap float,\ scoreglobalmap float,scoreglobalksmap float,scoreglobalremap float);") print "Created new or replacement 'snp_rost_intrepid_detail' table in database.\n" dbConn.commit() dbConn.close()
def main(): askForConfirm() dbConn=db.connect_to_server('pfacts003_test','webuser') dbCur=db.get_cursor(dbConn) dbConn.set_isolation_level(0) try: dbCur.execute("DROP VIEW snp_results_detail_avg;") print "\nPrevious version of 'snp_results_detail_avg' deleted." except: print "\nView 'snp_results_detail_avg' does not exist and will be created." try: dbCur.execute("DROP TABLE snp_results_detail;") print "Previous version of 'snp_results_detail' deleted." except: print "Table 'snp_results_detail' does not exist and will be created." try: dbCur.execute("DROP TABLE snp_results_header;") print "Previous version of 'snp_results_header' deleted." except: print "Table 'snp_results_header' does not exist and will be created." dbCur.execute("CREATE TABLE snp_results_header ( \ id serial PRIMARY KEY, \ time_stamp timestamp UNIQUE, \ dataset_used varchar \ );") print "Created new or replacement 'snp_results_header' table in database." dbCur.execute("CREATE TABLE snp_results_detail ( \ id serial PRIMARY KEY, \ time_stamp timestamp REFERENCES snp_results_header(time_stamp), \ model_used varchar, \ fold_no integer, \ seq_no integer, \ val_recall decimal(4,3), \ val_precision decimal(4,3) \ );") print "Created new or replacement 'snp_results_detail' table in database." dbCur.execute("CREATE VIEW snp_results_detail_avg AS \ SELECT time_stamp, model_used, seq_no, AVG(val_recall) as recall, AVG(val_precision) as precision \ FROM snp_results_detail \ GROUP BY time_stamp, model_used, seq_no \ ORDER BY time_stamp ASC, model_used ASC, seq_no ASC;") print "Created new or replacement 'snp_results_detail_avg' view in database.\n" dbConn.commit() dbConn.close()
def main(): askForConfirm() dbConn=db.connect_to_server('pfacts003','webuser') dbCur=db.get_cursor(dbConn) dbConn.set_isolation_level(0) try: dbCur.execute("DROP TABLE snp_SIFT_pred_header;") print "\nPrevious version of 'snp_SIFT_pred_header' deleted." except: print "\nTable 'snp_SIFT_pred_header' does not exist and will be created." try: dbCur.execute("DROP TABLE snp_SIFT_pred_detail;") print "Previous version of 'snp_SIFT_pred_detail' deleted." except: print "Table 'snp_SIFT_pred_detail' does not exist and will be created." dbCur.execute("CREATE TABLE snp_SIFT_pred_header ( \ id serial PRIMARY KEY, \ time_stamp timestamp UNIQUE, \ dataset_used varchar \ );") print "Created new or replacement 'snp_SIFT_pred_header' table in database." dbCur.execute("CREATE TABLE snp_SIFT_pred_detail ( \ id serial PRIMARY KEY, \ time_stamp timestamp REFERENCES snp_SIFT_pred_header(time_stamp), \ accession varchar, \ aa_ref varchar, \ aa_pos integer, \ aa_mut varchar, \ prediction integer, \ score1 float, \ score2 float, \ score3 float, \ score4 float \ );") print "Created new or replacement 'snp_SIFT_pred_detail' table in database." dbConn.commit() dbConn.close()
def main(): if len(sys.argv) != 4: print "Usage: %s <PBS_JOBID> <NUMTHREADS> <NUMPROTSPROCESSED>" % sys.argv[ 0] sys.exit(1) jobID = int(sys.argv[1]) numThreads = int(sys.argv[2]) numProtsProcessed = int(sys.argv[3]) jobidalpha = "00" + str(jobID) # sys.stdout=NullDevice() #Turn off print messages. startTime = datetime.now() pgmName = 'compute_intrepid_snp_scores' fout = open( pgmName + ".out." + startTime.strftime("%Y%m%d.%Hh%M") + "." + jobidalpha[-3:], "w") fout.write("\nStarting program...\n") fout.write("Turned off deprecation warnings.\n") dbConn = db.connect_to_server('pfacts003_test', 'webuser') dbCur = db.get_cursor(dbConn) dbConn.set_isolation_level(0) proteinsList = db.get_query_list( dbCur, "SELECT DISTINCT u.accession FROM uniprot as u INNER JOIN snp_rost_master as s \ ON s.accession=u.accession WHERE s.status='M' ORDER BY u.accession ASC LIMIT %s;", [numProtsProcessed]) totCounter = 0 totCounterThread = 0 totResCounter = 0 l1 = 65 l2 = 15 for prot in proteinsList: if totCounter % numThreads != jobID: #Check if this thread should be computing this protein in list. totCounter += 1 continue #Skip this protein. protAccID = prot[0] totCounterThread += 1 famsList = db.get_query_list( dbCur, "SELECT DISTINCT f.id, f.family_type_id, f.score \ FROM family_sequence_taxa_2012_02_17 as fs \ LEFT JOIN family as f ON fs.family_id=f.id WHERE fs.uniprot_accession=%s \ ORDER BY f.family_type_id DESC, f.score DESC;", [protAccID]) if len(famsList) == 0: # No families found for protein. famid = 0 famtype = 'N' famscore = 0 msaok = False treeok = False idmapok = False scoresok = False dbCur.execute("INSERT INTO intrepid_header \ (accession,fam_id,fam_type,fam_score,msaok,treeok,idmapok,scoresok,qty_scored,start_pos,start_time) \ VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);" ,\ [protAccID,famid,famtype,famscore,msaok,treeok,idmapok,scoresok,0,0,startTime]) else: flgGHGFound = False # Initialize to false. for fam in famsList: # Loop over all families for protein. 
famid = fam[0] famtype = fam[1] famscore = fam[2] if famtype == 'C' or (famtype == 'G' and flgGHGFound == False): if famtype == 'G': flgGHGFound = True results = i.run_intrepid( protAccID, famid) # Run INTREPID and get results. if results[ 0] == False: # INTREPID run was not OK, results[1] will indicate why. dbCur.execute("INSERT INTO intrepid_header \ (accession,fam_id,fam_type,fam_score,msaok,treeok,idmapok,scoresok,qty_scored,\ start_pos,start_time) \ VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);" ,\ [protAccID,famid,famtype,famscore,results[1][0],results[1][1],results[1][2],\ results[1][3],None,None,startTime]) else: # INTREPID run was OK, results[1] will be list of scores and results[2] will be start pos in seq. for line in results[1]: dbCur.execute("INSERT INTO intrepid_detail \ (accession, fam_id, position, resseq, chain, residue, icode, scoreimpmap, \ scorejsmap, scoreremap, scoreglobalmap, scoreglobalksmap, scoreglobalremap) \ VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);" , \ [protAccID,famid,line[0],line[1],line[2],line[3],\ line[4],line[5],line[6],line[7],line[8],line[9],line[10]]) startpos = i.get_start_pos(dbCur, protAccID, results[2]) dbCur.execute("INSERT INTO intrepid_header \ (accession,fam_id,fam_type,fam_score,msaok,treeok,idmapok,scoresok,qty_scored,\ start_pos,start_time) \ VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);" , \ [protAccID,famid,famtype,famscore,True,True,True,True,len(results[1]),startpos,startTime]) totResCounter += len( results[1] ) #Increment counter of total number of detail lines added. totCounter += 1 fout.write("\nResults\n") fout.write("-" * (l1 + l2) + "\n") fout.write("Number of proteins scored:".ljust(l1) + str(totCounterThread).rjust(l2) + "\n") fout.write("Number of protein positions scored:".ljust(l1) + str(totResCounter).rjust(l2) + "\n") endTime = datetime.now() diff = endTime - startTime fout.write("Elapsed time (HH:MM:SS):".ljust(l1) + str(diff).rjust(l2) + "\n\n") fout.close() dbConn.commit() dbConn.close()
def main(): if len(sys.argv) != 3: print "Usage: %s <DATASET> <NUMFOLDS>\nProgram terminated!" % sys.argv[0] sys.exit(1) dataset = sys.argv[1] if dataset != 'rost': print "Dataset '%s' not supported.\nProgram terminated!" % dataset sys.exit(1) numFolds = int(sys.argv[2]) askWhichModels('log reg') askWhichModels('SVM') startTime = datetime.now() formattedStartTime = startTime.strftime("%Y%m%dd%Hh%M") formattedDBTimeStamp = startTime.strftime("%Y-%m-%d %H:%M:%S") pgmName = 'run_model' fout = open(pgmName+".out."+formattedStartTime,"w") fout.write("\nStarting program...\n") dbConn = db.connect_to_server('pfacts003_test','webuser') dbCur = db.get_cursor(dbConn) dbConn.set_isolation_level(0) #Get entire dataset with all required features. completeSNPList = db.get_query_list(dbCur,"SELECT id, label FROM snp_rost_detail_valid ORDER BY id ASC;") numSNPsPre = len(completeSNPList) idsWithFeatures = n.array(completeSNPList)[:,0]; y = n.array(completeSNPList)[:,1] idsWithFeatures.shape = (numSNPsPre,1); y.shape = (numSNPsPre,1) # Force as col vectors. X = n.ones((numSNPsPre,1)) #Prepend a col of ones for the intercept. featuresUsed.append('Intercept') validIndices,newFeatures = f.extract_intrepid_features(dbCur,idsWithFeatures) idsWithFeatures = idsWithFeatures[validIndices] y = y[validIndices]; X = n.hstack([X[validIndices,:],newFeatures]) featuresUsed.append('INTREPID scores') numSNPsPost = len(y) validIndices,newFeatures = f.extract_blossum62_scores(dbCur,idsWithFeatures) idsWithFeatures = idsWithFeatures[validIndices] y = y[validIndices]; X = n.hstack([X[validIndices,:],newFeatures]) featuresUsed.append('BLOSSUM62 scores') numSNPsPost = len(y) #Determine partition of dataset into folds. foldAssignments = m.get_shuffled_discrete_uniform_indices(0,numFolds-1,numSNPsPost) #Create result header record to document run. 
dbCur.execute("INSERT INTO snp_results_header (time_stamp, dataset_used) VALUES (timestamp %s, %s);",[formattedDBTimeStamp,dataset]) #Peform k-fold cross-validation and produce results (per fold and overall). num_cutoff_points = 101 results_all_folds = n.zeros((2*numFolds,num_cutoff_points)) average_performance_results = n.zeros((2,num_cutoff_points)) #This will hold the overall results, first as a total and then as an average. average_performance_num_contributors = n.zeros((1,num_cutoff_points)) #This keeps track of how many rounds contributed to the total for a cutoff point. for round in range(0,numFolds): trainX = X[n.nonzero(foldAssignments != round)[0],:] trainY = y[n.nonzero(foldAssignments != round)[0],:] testX = X[n.nonzero(foldAssignments == round)[0],:] testY = y[n.nonzero(foldAssignments == round)[0],:] if 'log reg' in modelsUsed: models.run_logreg_model(dbCur,trainY,trainX,testY,testX,formattedDBTimeStamp,round,num_cutoff_points) if 'SVM' in modelsUsed: models.run_svm_model(dbCur,trainY,trainX,testY,testX,formattedDBTimeStamp,round,num_cutoff_points) l1=65 l2=15 fout.write("\nResults\n") fout.write("-"*(l1+l2)+"\n") fout.write("Dataset used:".ljust(l1)+str(dataset).rjust(l2)+"\n") fout.write("Number of folds in cross validation:".ljust(l1)+str(numFolds).rjust(l2)+"\n") fout.write("Number of SNPs included in dataset before feature extraction:".ljust(l1)+str(numSNPsPre).rjust(l2)+"\n") fout.write("Number of SNPs included in dataset post feature extraction:".ljust(l1)+str(numSNPsPost).rjust(l2)+"\n") fout.write("Models used:"+"\n") for modelDesc in modelsUsed: fout.write("\t"+modelDesc+"\n") fout.write("Features used:"+"\n") for featureDesc in featuresUsed: fout.write("\t"+featureDesc+"\n") fout.write("See database for storage of results.\n") endTime=datetime.now() diff=endTime-startTime fout.write("Elapsed time (HH:MM:SS):".ljust(l1)+str(diff).rjust(l2)+"\n\n") fout.close() dbConn.commit() dbConn.close()
def main(): #Check input. if len(sys.argv) != 2: print "\nUsage: %s <filename w/o pdf extension>\nProgram terminated!\n" % os.path.basename(sys.argv[0]) mlab.close() sys.exit(1) else: filename = sys.argv[1] print "\nProgram will create a plot/figure file named "+filename+".pdf\n" #Set up empty request list. plotsRequested = [] #Prompt for type of report. options = ['None/Quit','PR Curve(s)'] chosenReport = getChoice('Choose the number of the desired report',options,True) if chosenReport == 0: mlab.close(); sys.exit('Program terminated at user request!') dbConn = db.connect_to_server('pfacts003_test','webuser') dbCur = db.get_cursor(dbConn) dbConn.set_isolation_level(0) if chosenReport == 1: runsAvailable = db.get_query_list(dbCur,"SELECT time_stamp, dataset_used FROM snp_results_header ORDER BY time_stamp DESC;") if len(runsAvailable) == 0: print "No results available for chosen report type" else: idxFirst = 1; runsAvailable.insert(0,('None/Quit','NA')) print "Available runs include the following:" for time_stamp, dataset in runsAvailable[1:]: print "\t"+str(idxFirst)+"\t"+str(time_stamp)+"\t"+dataset idxFirst += 1 chosenRun = getChoice("Choose the number of the desired run",range(len(runsAvailable)),False) if chosenRun == 0: mlab.close(); sys.exit('Program terminated at user request!') modelsAvailable = db.get_query_list(dbCur,"SELECT DISTINCT model_used FROM snp_results_detail WHERE time_stamp = %s ORDER BY model_used ASC;", \ [runsAvailable[chosenRun][0]]) modelsAvailable = [i[0] for i in modelsAvailable] # Flatten list of lists. modelsAvailable.insert(0,'None/Quit') # Add this option at start of list for user. 
print "For the run chosen, the following models have results:" idxSec = 1 for each in modelsAvailable[1:]: print "\t"+str(idxSec)+"\t"+str(each) idxSec += 1 while True: chosenModel = getChoice("Choose the number of a model to add/include or 'None' to continue",modelsAvailable,True) if chosenModel == 0: break else: chosenType = getChoice("Choose the number corresponding to level of detail or 'None' to continue",\ ['None','Individual Folds','Average over Folds'],True) if chosenType == 0: continue else: plotsRequested.append((runsAvailable[chosenRun][0],modelsAvailable[chosenModel],chosenType)) # Collect data requested. dataToPlot = n.zeros((2,0)) # Create an empty 2-row, 0-col matrix that will used to horizontally append data. startPoints = n.array([]) # Initialize empty array that stores where each series starts. counter = 1 # Need to use base 1 because that's what Matlab uses. legend_string = '' for time_stamp,model,type in plotsRequested: if type == 1: currFold = -1 data = db.get_query_list(dbCur,"SELECT fold_no, seq_no, val_recall, val_precision FROM snp_results_detail \ WHERE time_stamp = %s AND model_used = %s ORDER BY fold_no ASC, seq_no ASC;", [time_stamp,model]) for fold,seq,valrecall,valprec in data: if fold != currFold: startPoints = n.hstack([startPoints,n.array([counter])]) currFold = fold legend_string = legend_string + 'Fold ' + str(fold) + ',' if valrecall != None and valprec != None: dataToPlot = n.hstack([dataToPlot,n.array([[float(valrecall)],[float(valprec)]])]) counter += 1 if type == 2: legend_string = legend_string + model + ',' data = db.get_query_list(dbCur,"SELECT seq_no, recall, precision FROM snp_results_detail_avg \ WHERE time_stamp = %s AND model_used = %s ORDER BY seq_no ASC;", [time_stamp,model]) startPoints = n.hstack([startPoints,n.array([counter])]) for seq,valrecall,valprec in data: if valrecall != None and valprec != None: dataToPlot = n.hstack([dataToPlot,n.array([[float(valrecall)],[float(valprec)]])]) counter += 1 mlab.close() 
if startPoints.shape[0] <= 1: # Means that only one series is present. legend_string = 'NA' else: legend_string = legend_string[0:-1] # Strip off trailing comma. plot_pr_curve(dataToPlot,startPoints,filename,legend_string)
def run_query(query):
    """Execute *query* against pfacts003_test as 'webuser' and return the
    result rows (as produced by db.get_query_list).

    Opens a fresh autocommit connection per call and always closes it,
    even when the query raises.
    """
    # NOTE(review): *query* is interpolated as-is; only call this with
    # trusted, programmer-supplied SQL.
    dbConn = db.connect_to_server('pfacts003_test', 'webuser')
    try:
        dbCur = db.get_cursor(dbConn)
        dbConn.set_isolation_level(0)  # autocommit
        return db.get_query_list(dbCur, query)
    finally:
        # Always release the connection (the original leaked it every call).
        dbConn.close()