def main():
    """Write one summary row per pickled border file to ``Results.txt``.

    Every file in ``conf.inputFolder`` is unpickled and handed to
    ``borderAnalysis.analyizeBorder`` twice: once with only the border
    dictionary and once with ``True`` as second positional argument (the
    ``*_f`` columns).  Each call returns a (ratio, success count, total)
    triple, which is written in that order under the header columns
    ``s%``, ``s``, ``len`` and ``s%_f``, ``s_f``, ``len_f``.

    The simulation parameters in the first three columns are taken from the
    file name, which is expected to look like::

        0 1    2    3    4    5 6        7 8    9   10   11
        M_mjtt_SeqL_1000_NFam_2_NFusions_2_TEvo_1.5_NGen_5_ BorderInformation
    """
    filenames = os.listdir(conf.inputFolder)
    util.generateDirectories(conf.resultFolder)
    with open(os.path.join(conf.resultFolder, "Results.txt"), "w") as wf:
        wf.write("TEvo\tNFam\tNFusions\ts%\ts\tlen\ts%_f\ts_f\tlen_f\n")
        for filename in filenames:
            parsed = filename.split("_")
            # parsed[1] = model, parsed[3] = seqLen, parsed[11] = NGen are
            # available as well but not reported.
            NFam = parsed[5]
            NFusions = parsed[7]
            TEvo = parsed[9]

            # Close the pickle file deterministically instead of relying on
            # garbage collection.
            with open(os.path.join(conf.inputFolder, filename), "rb") as bf:
                borderDict = load(bf)

            successRatio, success, totalBorders = \
                borderAnalysis.analyizeBorder(borderDict)
            successRatio2, success2, totalBorders2 = \
                borderAnalysis.analyizeBorder(borderDict, True)

            # The "s" / "s_f" columns carry the success counts; the ratio
            # belongs only in the "s%" / "s%_f" columns.
            row = [TEvo, NFam, NFusions,
                   successRatio, success, totalBorders,
                   successRatio2, success2, totalBorders2]
            wf.write("\t".join(str(value) for value in row) + "\n")
def main():
    """Convert every ADDA result file into a pickled ``{pid: borders}`` dict.

    For each file in ``conf.addaResultFolder`` the pickle of the same name in
    ``conf.nidToPidFolder`` is loaded and used to map the numeric id in the
    first column to a pid.  Each tab-separated input line contributes one
    ``[1, start, end]`` entry (columns two and three) to the list stored
    under its pid.  The resulting dictionary is dumped to
    ``conf.resultFolder`` as ``<input name>.cpickle``.

    Raises:
        KeyError: if an id in the ADDA file is missing from the id mapping.
        ValueError / IndexError: if a line does not hold three integer
            columns.
    """
    borderFiles = os.listdir(conf.addaResultFolder)
    for borderFile in borderFiles:
        nidToPidPath = os.path.join(conf.nidToPidFolder, borderFile)
        with open(nidToPidPath, "rb") as pf:
            nidToPidDict = load(pf)

        resultDict = {}
        addaDir = os.path.join(conf.addaResultFolder, borderFile)
        with open(addaDir, "r") as f:
            for line in f:
                arr = line.split("\t")
                # int() ignores surrounding whitespace, including the
                # trailing newline on the last column.
                nid = int(arr[0])
                s = int(arr[1])
                e = int(arr[2])
                pid = nidToPidDict[nid]
                # adda did not provide domain family info, so the family
                # slot is always filled with 1.
                resultDict.setdefault(pid, []).append([1, s, e])

        util.generateDirectories(conf.resultFolder)
        outPath = os.path.join(conf.resultFolder, borderFile + ".cpickle")
        with open(outPath, "wb") as f:
            dump(resultDict, f)
def main():
    """Run ``runAlg`` on every file found in ``conf.inputFolder``.

    Before the first run the runtime file ``conf.runTimeFile`` is created
    (or emptied if it already exists).  A progress message with the
    zero-based position of the file is printed through ``util.printL``
    before each run.
    """
    seqFiles = os.listdir(conf.inputFolder)

    # generate (or truncate) the runtime output file; the handle is closed
    # right away rather than left to the garbage collector.
    util.generateDirectories(conf.runTimefolder)
    with open(conf.runTimeFile, "w"):
        pass

    for i, seqFile in enumerate(seqFiles):
        util.printL("\nAnalyzing " + seqFile + "(" + str(i) + "/" +
                    str(len(seqFiles)) + ")\n")
        runAlg(seqFile)
def main():
    """Process each input file with ``runAlg`` after resetting the runtime log.

    The runtime file ``conf.runTimeFile`` is recreated containing only its
    tab-separated header line; afterwards every entry of
    ``conf.inputFolder`` is announced via ``util.printL`` (with its
    zero-based position) and passed to ``runAlg``.
    """
    inputNames = os.listdir(conf.inputFolder)

    # start the runtime output file from scratch with just the header
    util.generateDirectories(conf.runTimefolder)
    runtimeLog = open(conf.runTimeFile, "w")
    try:
        runtimeLog.write("numBLAST_Align\tgraphSize\ttime\tfilename\n")
    finally:
        runtimeLog.close()

    total = str(len(inputNames))
    for position, name in enumerate(inputNames):
        banner = "".join(
            ["\nAnalyzing ", name, "(", str(position), "/", total, ")\n"])
        util.printL(banner)
        runAlg(name)
def filterInput():
    """Copy the input files to ``conf.filteredDir`` without self-pairs.

    Each file of ``conf.inputFolder`` is rewritten under the same name in
    ``conf.filteredDir``; a line is dropped when its first two tab-separated
    fields are equal and copied unchanged otherwise.

    Raises:
        IndexError: if a line contains no tab character.
    """
    util.generateDirectories(conf.filteredDir)
    for name in os.listdir(conf.inputFolder):
        target = os.path.join(conf.filteredDir, name)
        source = os.path.join(conf.inputFolder, name)
        # The output file is opened before the input file.
        with open(target, "w") as out:
            with open(source, "r") as inp:
                for record in inp:
                    cols = record.split("\t")
                    if cols[0] == cols[1]:
                        continue
                    out.write(record)
def main():
    """Filter the input and run MosaicFinder on every filtered file.

    ``filterInput`` is executed first.  Then one process of the executable
    ``conf.mosaicFinderDir`` is started per file in ``conf.filteredDir``
    with the fixed options ``-e 1e-05 -p 30``, reading the filtered file
    (``-i``) and writing to a file of the same name in ``conf.resultFolder``
    (``-o``).

    All processes are launched without waiting in between, as before, but
    the function now waits for every one of them before returning, so the
    result files are complete when ``main`` returns and no child is left
    unreaped.  A non-zero exit status is reported on stderr.

    Returns:
        int: always 0.
    """
    import sys

    filterInput()
    filenames = os.listdir(conf.filteredDir)
    util.generateDirectories(conf.resultFolder)

    running = []
    for filename in filenames:
        fileDir = os.path.join(conf.filteredDir, filename)
        cmd = [conf.mosaicFinderDir, "-e", "1e-05", "-p", "30",
               "-i", fileDir,
               "-o", os.path.join(conf.resultFolder, filename)]
        running.append((filename, subprocess.Popen(cmd)))

    # No pipes are attached to the children, so wait() cannot deadlock.
    for filename, proc in running:
        status = proc.wait()
        if status != 0:
            sys.stderr.write("MosaicFinder exited with status %s for %s\n"
                             % (status, filename))
    return 0
def compareSplitPoints():
    """Write the split-point comparison of every border file to one table.

    Each file in ``conf.bordersFolder`` is unpickled and passed to
    ``func.compareSplitPoints``, which returns the three values written
    under the header columns ``avgDist``, ``wrongNumBordersOnSplitProt`` and
    ``multBorderOnFamilyProtein``.  The ``testcase`` column is the file name
    with ``conf.bordersAppend`` removed.  The table is written to
    ``conf.splitResultFile``.
    """
    borderFiles = os.listdir(conf.bordersFolder)
    util.generateDirectories(conf.resultFolder)
    with open(conf.splitResultFile, "w") as resultFile:
        resultFile.write(
            "testcase\tavgDist\twrongNumBordersOnSplitProt\tmultBorderOnFamilyProtein\n"
        )
        for borderFile in borderFiles:
            borderDir = os.path.join(conf.bordersFolder, borderFile)
            # Close the pickle file as soon as it has been read.
            with open(borderDir, "rb") as bf:
                borderDict = load(bf)

            # The fusion-info file that belongs to this border file is
            # currently not read; only the border dictionary is compared.
            avgDist, wrongNumBordersOnSplitProt, multBorderOnFamilyProtein = \
                func.compareSplitPoints(borderDict)

            testcase = borderFile.replace(conf.bordersAppend, "")
            resultFile.write(testcase + "\t" + str(avgDist) + "\t" +
                             str(wrongNumBordersOnSplitProt) + "\t" +
                             str(multBorderOnFamilyProtein) + "\n")
def runAlg(filename):
    """Run the module-border pipeline on one sequence file and save results.

    Steps:

    1. Unless ``conf.skipBLAST`` is set: pickle the protein-length
       dictionary to ``conf.proteinLenFolder``, build the BLAST database
       and run the all-to-all BLASTp into ``conf.alltoallFolder``.
    2. Timed section: build the graph from the all-to-all output, generate
       putative modules, remove super modules and renumber the modules.
       ``moduleFamilyInfo`` is modified in place by the last two steps, so
       a text rendering is captured after each stage.
    3. Write ``<filename>_Modules.txt`` and
       ``<filename>_detailedResults.txt`` to ``conf.textResultsFolder``,
       append one row (BLAST line count, elapsed seconds, file name) to
       ``conf.runTimeFile`` and pickle ``moduleFamilyInfo`` to
       ``conf.pickledResultsFolder`` as ``<filename>_myBorders.cpickle``.
    4. If ``conf.deleteFolders`` is set, remove the protein-length, BLAST
       database and all-to-all folders to save disk space.

    Args:
        filename: name of the sequence file inside ``conf.fastaFolder``.
    """
    if not conf.skipBLAST:
        # generate protein lengths
        plenFolder = conf.proteinLenFolder
        util.generateDirectories(plenFolder)
        plenDict = blast.generateProtLenDict(conf.fastaFolder, filename)
        with open(os.path.join(plenFolder, filename), "wb") as f:
            dump(plenDict, f)

        # create blast databases
        print(" Conducting BLASTp all-to-all...")
        blast.makeblastdb(conf.fastaFolder, filename)

        # conduct all to all BLASTp
        alltoallFolder = conf.alltoallFolder
        util.generateDirectories(alltoallFolder)
        blast.alltoallBlastP(conf.fastaFolder, filename,
                             os.path.join(alltoallFolder, filename))

    # This is where the algorithm proper starts, and therefore where the
    # timing starts as well.
    print(" Conducting my algorithm...")
    startTime = time.time()

    # build HSPIntGraph
    seqSimGraph, numBlastLines, numIntEdge = buildGraph.build_graph(
        filename, conf.alltoallFolder)

    # identify protein module borders: putative domains first
    numModules, moduleFamilyInfo = findBorders.generatePutativeModules(
        seqSimGraph)
    putativeResult = vis.visualizeModuleFamilyInfo(moduleFamilyInfo)

    # remove super modules
    findBorders.removeSuperModules(moduleFamilyInfo)
    moduleResult = vis.visualizeModuleFamilyInfo(moduleFamilyInfo)

    # rename modules to have lower numbers
    numModulesAfterFilter = findBorders.renameModules(moduleFamilyInfo)
    moduleResultRenamed = vis.visualizeModuleFamilyInfo(moduleFamilyInfo)

    endTime = time.time()
    timediff = endTime - startTime

    # output the results
    util.generateDirectories(conf.textResultsFolder)
    consizePath = os.path.join(conf.textResultsFolder,
                               filename + "_Modules.txt")
    with open(consizePath, "w") as f:
        f.write(moduleResultRenamed)

    detailedPath = os.path.join(conf.textResultsFolder,
                                filename + "_detailedResults.txt")
    with open(detailedPath, "w") as f:
        f.write("number of Blast Edges: " + str(numBlastLines) + "\n")
        # Terminate the line so the next entry does not run into it.
        f.write("time elapsed: " + str(timediff) + "\n")
        f.write("number of IntervalEdges added: " + str(numIntEdge) + "\n")
        f.write("Putative Modules: " + str(numModules) + "\n" +
                putativeResult + "\n")
        f.write("RemoveSuperModules: \n" + moduleResult + "\n")
        f.write("Final Module Definition: " + str(numModulesAfterFilter) +
                "\n" + moduleResultRenamed + "\n")
    util.printL("Completed in " + str(timediff) + " seconds\n")

    # Write down the timing results
    with open(conf.runTimeFile, "a") as f:
        f.write(str(numBlastLines) + "\t" + str(timediff) + "\t" +
                filename + "\n")

    # The side-by-side comparison with the PFam definitions is disabled;
    # only the computed borders are dumped for future comparison.
    util.generateDirectories(conf.pickledResultsFolder)
    myBordersPath = os.path.join(conf.pickledResultsFolder,
                                 filename + "_myBorders.cpickle")
    with open(myBordersPath, "wb") as f:
        dump(moduleFamilyInfo, f)

    # remove extra folders to save disk space
    if conf.deleteFolders:
        shutil.rmtree(conf.proteinLenFolder)
        shutil.rmtree(conf.blastdbFolder)
        shutil.rmtree(conf.alltoallFolder)
def convertResultToCPickleDict(): util.generateDirectories(conf.cPickleDir) filenames = os.listdir(conf.resultFolder) for filename in filenames: with open(os.path.join(conf.resultFolder, filename), "r") as f: for line in f:
def _runMake(cmd):
    """Run one ``make`` command with captured output; warn if it fails."""
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    proc.communicate()
    if proc.returncode != 0:
        # Previously a failed build went unnoticed; keep going, but say so.
        print("WARNING: '" + " ".join(cmd) + "' exited with status " +
              str(proc.returncode))


def main():
    """Run the BLAST -> ADDA pipeline on every file in ``conf.seqFolder``.

    If ``conf.recompileADDA`` is set, ADDA is rebuilt first with
    ``make clean`` followed by ``make`` in ``conf.addaFolder``.  The timing
    file (``conf.timingFile`` inside ``conf.timingFolder``) is recreated
    with a header line and the log file (``conf.logFile`` inside
    ``conf.logFolder``) is emptied.

    The input files are processed in sorted order.  For each one the four
    stages ``BlastAllToAll``, ``ConvertToADDA``, ``rcm_module_tree`` and
    ``RunADDA`` are executed; only the last two stages are timed, and the
    elapsed seconds are appended to the timing file together with the file
    name.
    """
    if conf.recompileADDA:
        print("make clean and making ADDA")
        # make clean the software
        _runMake(["make", "clean", "-C", conf.addaFolder])
        # make the software
        _runMake(["make", "-C", conf.addaFolder])

    # import all of the filenames
    inputfilenames = os.listdir(conf.seqFolder)
    inputfilenames.sort()

    # create the file for timing information
    util.generateDirectories(conf.timingFolder)
    timingFileDir = os.path.join(conf.timingFolder, conf.timingFile)
    with open(timingFileDir, "w") as f:
        f.write("Time\tSampleName\n")

    # reset the log file; close the handle immediately instead of leaking it
    with open(os.path.join(conf.logFolder, conf.logFile), "w"):
        pass

    for i, inputfilename in enumerate(inputfilenames):
        util.printL("\nProcessing " + inputfilename + " (" + str(i) + "/" +
                    str(len(inputfilenames)) + ")\n")
        util.printL("0_BlastAllToAll:\n")
        BlastAllToAll.main(inputfilename)
        util.printL("1_ConvertToADDA:\n")
        ConvertToADDA.main(inputfilename)

        # start timer: only the rcm and ADDA stages are measured
        startTime = time.time()
        util.printL("\n2_rcm_module_tree:\n")
        rcm_module_tree.main(inputfilename)
        util.printL("3_RunADDA:\n")
        RunADDA.main(inputfilename)
        # end timer
        endTime = time.time()

        timeDiff = endTime - startTime
        util.printL("rcm and adda completed in " + str(timeDiff) +
                    " seconds.\n")
        with open(timingFileDir, "a") as f:
            f.write(str(timeDiff) + "\t" + inputfilename + "\n")
def _fusionConfidence(mf1s, mf2s, f1, f2):
    """Score how well two assignment lists recover the families f1 and f2.

    The two lists carry no fixed order, so the list that contains ``f1``
    strictly more often is taken as the ``f1`` side and the other one as
    the ``f2`` side (on a tie the lists are swapped).  The score is 1 when
    ``f1`` occurs on its side and ``f2`` on the other, 0 when neither does,
    and .5 when only one of them does.
    """
    # Orientation pass.  These counts are kept separate from the scoring
    # counts below; carrying them over would let f1 hits be counted as
    # evidence for f2.
    f1InFirst = sum(1 for f in mf1s if f == f1)
    f1InSecond = sum(1 for f in mf2s if f == f1)
    if f1InFirst > f1InSecond:
        mf1, mf2 = mf1s, mf2s
    else:
        mf1, mf2 = mf2s, mf1s

    # Scoring pass.
    success1 = sum(1 for f in mf1 if f == f1)
    success2 = sum(1 for f in mf2 if f == f2)

    if success1 > 0 and success2 > 0:
        return 1
    if success1 == 0 and success2 == 0:
        return 0
    return .5


def main():
    """Write the average fusion-assignment confidence of every result file.

    Each file in ``conf.inputFolder`` is read with ``readResultFile``, which
    yields ``fusedDict`` (pid -> (fusion event, f1, f2)) and ``familyDict``
    (fusion event -> sequence whose items 1 and 2 are the two assignment
    lists).  Every fused protein gets a confidence of 1, .5 or 0 and the
    mean is written to ``Results.txt`` in ``conf.resultFolder`` together
    with the number of fused proteins.  The mean is ``nan`` when a file
    contains no fused protein.

    The simulation parameters in the first three columns are taken from the
    file name, which is expected to look like::

        0 1    2    3    4    5 6        7 8    9   10   11
        M_mjtt_SeqL_1000_NFam_2_NFusions_2_TEvo_1.5_NGen_5_ BorderInformation
    """
    filenames = os.listdir(conf.inputFolder)
    util.generateDirectories(conf.resultFolder)
    with open(os.path.join(conf.resultFolder, "Results.txt"), "w") as wf:
        wf.write("TEvo\tNFam\tNFusions\tavgConf\tnumProt\n")
        for filename in filenames:
            parsed = filename.split("_")
            # parsed[1] = model, parsed[3] = seqLen, parsed[11] = NGen are
            # available as well but not reported.
            NFam = parsed[5]
            NFusions = parsed[7]
            TEvo = parsed[9]

            fusedDict, familyDict = readResultFile(conf.inputFolder, filename)
            confidenceArr = []
            for pid in fusedDict:
                fEvent, f1, f2 = fusedDict[pid]
                mf1s = familyDict[fEvent][1]
                mf2s = familyDict[fEvent][2]
                # NOTE(review): success is never incremented, so the debug
                # line below always reports 0 for it.
                success = 0
                totalAssigns = len(mf1s) + len(mf2s)
                print(pid)
                print(mf1s)
                print(mf2s)

                confi = _fusionConfidence(mf1s, mf2s, f1, f2)
                confidenceArr.append(confi)
                print("%s %s %s" % (confi, success, totalAssigns))

            if confidenceArr:
                avgConf = sum(confidenceArr) / float(len(confidenceArr))
            else:
                # No fused proteins in this file: the mean is undefined.
                avgConf = float("nan")
            wf.write(
                str(TEvo) + "\t" + str(NFam) + "\t" + str(NFusions) + "\t")
            wf.write(str(avgConf) + "\t" + str(len(fusedDict)) + "\n")