def pfamHitLength_File(fileInfo):
    """Compute the PFAM hit lengths (end - start) for one PFAM hit file.

    fileInfo: tuple where [1] is the input folder and [2] the input filename.
    Returns the list of hit lengths and also pickles it (as
    <name>.cPickle) into conf.pfamGenFolder for later reuse.
    """
    inputFolder = fileInfo[1]
    inputfile = fileInfo[2]
    lengthArray = []
    with open(os.path.join(inputFolder, inputfile), "r") as f:
        # File format: the first line is a header
        # 0       1               2             3       4
        # PDB_ID  PdbResNumStart  PdbResNumEnd  eValue  PFAM_ACC
        for lineIndex, line in enumerate(f):
            if lineIndex == 0:
                continue  # skip the header line
            arr = line.split("\t")
            start = int(arr[1].strip())
            end = int(arr[2].strip())
            lengthArray.append(end - start)
    # Persist the length array (consumed e.g. by generateHistograms)
    util.generateDirectories(conf.pfamGenFolder)
    outfile = os.path.join(conf.pfamGenFolder,
                           inputfile.replace(conf.pfamExt, ".cPickle"))
    with open(outfile, "wb") as f:
        dump(lengthArray, f)
    return lengthArray
def reDownloadSeq(): util.generateDirectories(conf.outputFolder) #input folder PFAMFolder = conf.PFAMFolder for infile in os.listdir(PFAMFolder): #the directory of the output infile is: outputDir = os.path.join(conf.outputFolder, infile.replace(conf.PFamExt, conf.outputExt)) errorDir = os.path.join(conf.outputFolder, infile.replace(conf.PFamExt, "_error.txt")) inputDir = os.path.join(conf.PFAMFolder, infile) progDir = os.path.join(conf.outputFolder, infile.replace(conf.PFamExt, "_progress.txt")) progress = 0 if os.path.isfile(progDir): with open(progDir, "r") as f: progress = int(f.read().strip) #create the output file #open(outputDir,"w") open(errorDir, "w") records = [] for record in SeqIO.parse(open(inputDir, "rU"), "fasta"): records.append(record) for i in range(progress, len(records)): record = records[i] seq = record.seq sid = record.id desc = record.description protID = desc.split(":")[0] seqmerge = str(seq).replace("\n", "").strip() if seqmerge == len(seqmerge) * "X": #if sequence is all X #print seq print "downloading", i, "/", len(records), int( i * 100 / float(len(records))), "%" retmax = 10 strOut = "Retmax" while (strOut == "Retmax" and retmax < 1000): strOut = DownloadNewSeq(protID, retmax) time.sleep(.3) retmax = retmax * 2 if strOut == "Error" or retmax >= 1000: print "Cannot find Seq for:", protID, "in", retmax, "downloads" with open(errorDir, "a") as f: f.write(protID + "," + str(retmax) + "\n") else: with open(outputDir, "a") as f: f.write(strOut) else: with open(outputDir, "a") as f: f.write(">" + str(desc) + "\n" + str(seq) + "\n\n")
def downloadInOneGo(): #create the output folder util.generateDirectories(conf.outputFolder) #input folder PFAMFolder = conf.PFAMFolder for infile in os.listdir(PFAMFolder): #the directory of the output infile is: outputDir = os.path.join(conf.outputFolder, infile.replace(conf.PFamExt, conf.outputExt)) #create the output file open(outputDir, "w") #identfy the proteins we need to download with open(os.path.join(PFAMFolder, infile), "r") as f: proteins = identifyProteinSequences(f.read()) proteinDir = os.path.join( conf.outputFolder, infile.replace(conf.PFamExt, "_proteins.cPickle")) with open(proteinDir, "wb") as f: dump(proteins, f) print "Number of proteins", len(proteins) # #convert the proteins into a query # proteinQueries=[] # maxLen=250 # for i, protID in enumerate(proteins): # addToProtQueries(proteinQueries, protID, maxLen) # # if i>0 and i<len(proteins)-1: # # proteinQuery+=" OR " for i, proteinQuery in enumerate(proteins): print "(", i + 1, "/", len(proteins), ")", int( i * 100 / float(len(proteins))), "%" results = fetchFASTASeqFromPDB(proteinQuery) with open(outputDir, "ab") as f: f.write(results) f.write("\n\n") #print "waiting..." time.sleep(.5)
def generateHistograms(fileInfo):
    """Plot overlaid normalized histograms of PFAM vs BLAST hit lengths.

    fileInfo: tuple where [0] is an index, [1] a folder and [2] the PFAM
    filename; the matching BLAST file is derived by extension swap. The
    resulting PNG is written to conf.histogramFolder.
    """
    util.generateDirectories(conf.histogramFolder)
    # gather the two length distributions
    pfamLengths = pfamHitLength_File(fileInfo)
    blastName = fileInfo[2].replace(conf.pfamExt, conf.blastExt)
    blastLengths = BLASTHitLength_File((fileInfo[0], conf.blastFolder,
                                        blastName))
    # shared bin edges spanning both distributions
    binCount = 100
    upper = max(numpy.amax(blastLengths), numpy.amax(pfamLengths))
    edges = numpy.linspace(0, upper, binCount)
    plt.hist(pfamLengths, edges, normed=1, facecolor="red", alpha=.75,
             label="pfam")
    plt.hist(blastLengths, edges, normed=1, facecolor="blue", alpha=.25,
             label="blast")
    plt.legend()
    # save next to the other histograms, named after the input file
    outPath = os.path.join(conf.histogramFolder,
                           fileInfo[2].replace(conf.pfamExt, ".png"))
    plt.savefig(outPath)
    plt.close()
def build_graph(blastInfoFilename, blastdir, hspIntGraphdir, cutoffRatio,
                evalueCutoff):
    """Build the HSP/Interval graph from a BLAST info file and pickle it.

    Nodes are (protein, start, end) triples; HSP edges connect query/target
    pairs passing the e-value cutoff, then interval edges connect overlapping
    intervals on the same protein (per findOverlapIntervals/cutoffRatio).
    Returns the output filename written into hspIntGraphdir.
    """
    #Generate the output folder
    util.generateDirectories(hspIntGraphdir)
    g = nx.Graph()
    # read the file (context manager so the handle is always closed)
    with open(os.path.join(blastdir, blastInfoFilename), "r") as f:
        content = f.read()
    #a dictionary that stores node names by the protein names
    nodeNames = {}
    # PERF FIX: the original called content.split("\n") inside the loop on
    # every iteration (accidental O(n^2)); split once and reuse.
    lines = content.split("\n")
    # progress tick every ~10%; max(1, ...) guards against a
    # ZeroDivisionError for files with fewer than 10 lines
    tick = max(1, len(lines) / 10)
    #add the HSP edges
    for i, line in enumerate(lines):
        if i % tick == 0:
            sys.stdout.write("*")
            sys.stdout.flush()
        if len(line) > 0:
            hsp = read_HSP(line)
            goodeval = hsp["EValue"] < evalueCutoff
            notsameprotein = (hsp["query_id"] != hsp["target_id"])
            if goodeval and notsameprotein:
                #Add the nodes (p_1,s_1,e_1) and (p_2,s_2,e_2) and create
                #an edge between them
                qname = nodeName(hsp, "query")
                tname = nodeName(hsp, "target")
                g.add_node(qname)
                g.add_node(tname)
                g.add_edge(qname, tname, eValue=hsp["EValue"])
                #record both node names per protein (addToDict removes
                #duplicates)
                addToDict(nodeNames, qname[0], qname)
                addToDict(nodeNames, tname[0], tname)
    sys.stdout.write("\n")
    sys.stdout.flush()
    #add the Interval edges: connect overlapping intervals of the same protein
    proteins = nodeNames.keys()
    for protein in proteins:
        subNodeNames = nodeNames[protein]
        for i in xrange(len(subNodeNames) - 1):
            for j in xrange(i + 1, len(subNodeNames)):
                name1 = subNodeNames[i]
                name2 = subNodeNames[j]
                overlapPairs = findOverlapIntervals(name1, name2, cutoffRatio)
                for overlapPair in overlapPairs:
                    g.add_edge(overlapPair[0], overlapPair[1])
    #save the HSPIntGraph (output name = input name minus its extension)
    splitFilename = blastInfoFilename.split(".")
    fileExt = "." + splitFilename[-1]
    outputFile = blastInfoFilename.replace(fileExt, "") + '_HSPIntGraph.gpickle'
    outputPath = os.path.join(hspIntGraphdir, outputFile)
    with open(outputPath, 'wb') as fout:
        dump(g, fout, HIGHEST_PROTOCOL)
    return outputFile
def defineBordersFromGraph(graphFile, hspIntGraphdir, borderInfodir,
                           borderResultdir):
    """Derive per-protein module borders from a pickled HSP/Interval graph.

    Each connected component of the graph is a module; every node
    (protein, start, end) contributes a [moduleID, start, end] border to its
    protein. Borders are collapsed when they overlap. Writes a pickled
    border-information dataset to borderInfodir and a human-readable
    *_ModuleInfo.txt to borderResultdir. Returns 0.
    """
    #generate the directories
    util.generateDirectories(borderInfodir)
    util.generateDirectories(borderResultdir)
    # BUG FIX: the graph was pickled with HIGHEST_PROTOCOL (binary), so it
    # must be read back in binary mode; text mode corrupts it on Windows.
    with open(os.path.join(hspIntGraphdir, graphFile), "rb") as fin:
        g = load(fin)
    #find the connected components of the graph, smallest first
    CCgraphs = list(nx.connected_component_subgraphs(g))
    CCgraphs.sort(key=lambda tup: len(tup.nodes()))
    #key = protein name, value = list of [moduleID, start, end] borders
    modulefamilyinfo = {}
    for moduleID, CCgraph in enumerate(CCgraphs):
        for node in CCgraph.nodes():
            proteinName = node[0]
            start = int(node[1])
            end = int(node[2])
            if (proteinName not in modulefamilyinfo):
                modulefamilyinfo[proteinName] = [[moduleID, start, end]]
            else:
                #if we already have information on this protein, merge
                #overlapping borders as we go
                modulefamilyinfo[proteinName].append([moduleID, start, end])
                collapsOverlappingBorders(modulefamilyinfo[proteinName])
    #cleanup borders that ended up developing to overlap borders
    for protein in modulefamilyinfo.keys():
        collapsOverlappingBorders(modulefamilyinfo[protein])
    #save the border information dataset
    with open(
            os.path.join(
                borderInfodir,
                graphFile.replace('_HSPIntGraph.gpickle', "") +
                '_BorderInformation.gpickle'), 'wb') as fout:
        moduleNumInfo = ("Number of modules detected", len(CCgraphs))
        familyInfoWrap = (
            "ModuleFamilyInfo, Format: Dict\{proteinName, list[borders[moduleId,start,end]] \}",
            modulefamilyinfo)
        dump((moduleNumInfo, familyInfoWrap), fout, HIGHEST_PROTOCOL)
    #write the information down in a text file; `with` guarantees the file
    #is closed even if a write fails (original used a bare open/close pair)
    resultPath = os.path.join(
        borderResultdir,
        graphFile.replace('_HSPIntGraph.gpickle', "") + '_ModuleInfo.txt')
    with open(resultPath, "w") as resultFile:
        resultFile.write("Number of modules detected: " + str(len(CCgraphs)) +
                         "\n\n")
        resultFile.write("proteinName\t borders\n")
        proteins = modulefamilyinfo.keys()
        proteins.sort()
        for protein in proteins:
            resultFile.write(protein)
            borders = modulefamilyinfo[protein]
            for border in borders:
                moduleID = border[0]
                start = border[1]
                end = border[2]
                resultFile.write("\tM_" + str(moduleID) + "(" + str(start) +
                                 "," + str(end) + ")")
            resultFile.write("\n")
    return 0