def moveBLASTResults(self):
    """Archive the BLAST result directory and relocate the joint BLAST files.

    Calls ``NGS_Util.zipDirectory`` and ``NGS_Util.moveDirectoryFiles`` on
    ``self.orgBlastResDir`` (destination ``self.moveToDir_orgBlastResDir``).
    Then, for every organism listed in ``self.orgListFile``, passes
    ``<organismName>.joint.blast`` under ``self.jointBlastDir`` and the
    same file name under ``self.moveToDir_jointBlastDir`` to
    ``self.moveFile_createLink``.

    Each list line holds ``<organismNameID> <organismName>``; blank lines and
    lines starting with ``#`` are skipped. Any exception is reported with a
    traceback and swallowed. Returns None.
    """
    try:
        print("moveBLASTResults")
        # The list is opened first, so an unreadable list aborts the method
        # before any result file is touched.
        with open(self.orgListFile) as orgListFile_fh:
            NGS_Util.zipDirectory(self.orgBlastResDir)
            NGS_Util.moveDirectoryFiles(self.orgBlastResDir, self.moveToDir_orgBlastResDir)
            for line in orgListFile_fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                organismNameID, organismName = line.split()
                orgRectifyBlast = NGS_Util.createFilePath(self.jointBlastDir, organismName + ".joint.blast")
                moveto_orgRectifyBlast = NGS_Util.createFilePath(self.moveToDir_jointBlastDir, organismName + ".joint.blast")
                self.moveFile_createLink(orgRectifyBlast, moveto_orgRectifyBlast)
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
def copySequenceFiles(self, srcDataDir):
    """Copy each listed organism's ``.faa`` file into ``self.orgFastaDir``.

    For every organism in ``self.orgListFile`` whose
    ``<organismName>.faa`` is not yet present under ``self.orgFastaDir``,
    the file of the same name under ``srcDataDir`` is copied there with
    ``NGS_Util.copyFile``. Organisms whose file already exists are left alone.

    Each list line holds ``<organismNameID> <organismName>``; blank lines and
    lines starting with ``#`` are skipped. Any exception is reported with a
    traceback and swallowed. Returns None.
    """
    try:
        print("Copy Fasta Files from %s to %s" % (srcDataDir, self.orgFastaDir))
        with open(self.orgListFile) as orgListFile_fh:
            for line in orgListFile_fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                organismNameID, organismName = line.split()
                if not os.path.exists(NGS_Util.createFilePath(self.orgFastaDir, organismName + ".faa")):
                    orgFasta = NGS_Util.createFilePath(srcDataDir, organismName + ".faa")
                    NGS_Util.copyFile(orgFasta, self.orgFastaDir)
                    print("Copied fasta file for %s" % (organismName))
                else:
                    print("\tDoing nothing (files already copied) for %s" % (organismName))
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
def moveIPRScanResults(self):
    """Archive the InterProScan directory and relocate the final result files.

    Calls ``NGS_Util.zipDirectory`` and ``NGS_Util.moveDirectoryFiles`` on
    ``self.orgIPRScanDir`` (destination ``self.moveToDir_orgIPRScanDir``).
    Then, for every organism in ``self.orgListFile``, passes
    ``<organismName>.faa.raw.txt`` and ``<organismName>.faa.IPR.final.txt``
    under ``self.InterProScan_EC_RAW_results``, together with the same
    names under ``self.moveToDir_InterProScan_EC_RAW_results``, to
    ``self.moveFile_createLink``.

    Blank lines and lines starting with ``#`` in the list are skipped. Any
    exception is reported with a traceback and swallowed.

    Returns:
        Always the empty string.
    """
    try:
        print("moveIPRScanResults")
        # The list is opened first, so an unreadable list aborts the method
        # before any result file is touched.
        with open(self.orgListFile) as orgListFile_fh:
            NGS_Util.zipDirectory(self.orgIPRScanDir)
            NGS_Util.moveDirectoryFiles(self.orgIPRScanDir, self.moveToDir_orgIPRScanDir)
            for line in orgListFile_fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                organismNameID, organismName = line.split()
                for suffix in (".faa.raw.txt", ".faa.IPR.final.txt"):
                    source = NGS_Util.createFilePath(self.InterProScan_EC_RAW_results, organismName + suffix)
                    target = NGS_Util.createFilePath(self.moveToDir_InterProScan_EC_RAW_results, organismName + suffix)
                    self.moveFile_createLink(source, target)
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def moveGTGResults(self):
    """Archive the GTG working directories and relocate the final kNN files.

    Calls ``NGS_Util.zipDirectory`` followed by
    ``NGS_Util.moveDirectoryFiles`` on ``self.orgGTGBlastResDir`` and then
    on ``self.GTGBestHitsDir`` (destinations
    ``self.moveToDir_orgGTGBlastResDir`` / ``self.moveToDir_GTGBestHitsDir``).
    Then, for every organism in ``self.orgListFile``, passes
    ``<organismNameID>.gtg.knn`` under ``self.GTGKNNDir`` and the same
    name under ``self.moveToDir_GTGKNNDir`` to ``self.moveFile_createLink``.
    Note that these files are keyed by the organism ID, not the name.

    Blank lines and lines starting with ``#`` in the list are skipped. Any
    exception is reported with a traceback and swallowed. Returns None.
    """
    try:
        print("moveGTGResults")
        # The list is opened first, so an unreadable list aborts the method
        # before any result file is touched.
        with open(self.orgListFile) as orgListFile_fh:
            NGS_Util.zipDirectory(self.orgGTGBlastResDir)
            NGS_Util.moveDirectoryFiles(self.orgGTGBlastResDir, self.moveToDir_orgGTGBlastResDir)
            NGS_Util.zipDirectory(self.GTGBestHitsDir)
            NGS_Util.moveDirectoryFiles(self.GTGBestHitsDir, self.moveToDir_GTGBestHitsDir)
            for line in orgListFile_fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                organismNameID, organismName = line.split()
                org_gtg_knn_final = NGS_Util.createFilePath(self.GTGKNNDir, organismNameID + ".gtg.knn")
                moveto_org_gtg_knn_final = NGS_Util.createFilePath(self.moveToDir_GTGKNNDir, organismNameID + ".gtg.knn")
                self.moveFile_createLink(org_gtg_knn_final, moveto_org_gtg_knn_final)
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
def blast_org_vs_nr40_blast_formatted_11(self, organismName):
    """Submit a one-task ``qsub`` job that BLASTs an organism against nrdb40.

    The command line is ``qsub -t 1-1 <ScriptsDir.ClusterGTGBlast>``
    followed by the ``blastp`` path under ``ScriptsDir.BlastDir``,
    ``self.nrdb40_blast_db``, the organism's ``.faa`` file, the literal
    ``11``, the result path and ``self.blastEValue``. It is run through
    ``NGS_Util.executeCall``.

    Args:
        organismName: Base name of the organism's FASTA file in
            ``self.orgFastaDir``.

    Returns:
        The path ``<organismName>.nrdb40_v2.txt`` under
        ``self.orgGTGBlastResDir`` that was handed to the job (its existence
        is not checked here), or "" if an exception occurred.
    """
    try:
        org_fasta = NGS_Util.createFilePath(self.orgFastaDir, organismName + ".faa")
        blastP = NGS_Util.createFilePath(ScriptsDir.BlastDir, "blastp")
        org_vs_nr40BlastDB_f11 = NGS_Util.createFilePath(self.orgGTGBlastResDir, organismName + ".nrdb40_v2.txt")
        # NOTE(review): 11 is presumably the BLAST output format number -
        # confirm against the cluster script.
        call = " ".join([
            "qsub -t 1-1 " + ScriptsDir.ClusterGTGBlast,
            blastP,
            self.nrdb40_blast_db,
            org_fasta,
            str(11),
            org_vs_nr40BlastDB_f11,
            str(self.blastEValue),
        ])
        NGS_Util.executeCall(call)
        return org_vs_nr40BlastDB_f11
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def concatenate_Org_vs_Uniprot_ClusterBlast_results(self, organismName):
    """Join the per-fragment organism-vs-UniProt BLAST outputs into one file.

    The fragments ``<organismName>-vs-up_<i>.blast`` for
    ``i = 1 .. self.numberOfFragments`` must all exist under
    ``self.orgBlastResDir``. If they do, a shell ``cat`` of
    ``<organismName>-vs-up_*`` is redirected into
    ``<organismName>-vs-up.blast``; otherwise a message is printed.

    Args:
        organismName: Organism whose fragments are joined.

    Returns:
        Path of the joined file, or "" when a fragment is missing or an
        exception occurred.
    """
    try:
        # all() stops at the first missing fragment, like the original break.
        clusterProcessing = all(
            os.path.exists(
                NGS_Util.createFilePath(self.orgBlastResDir, organismName + "-vs-up_" + str(fragment + 1) + ".blast")
            )
            for fragment in range(self.numberOfFragments)
        )
        if clusterProcessing:
            org_vs_UniprotBlastDB = NGS_Util.createFilePath(self.orgBlastResDir, organismName + "-vs-up.blast")
            # The glob is expanded by the shell; the joined file has no "_"
            # after "-vs-up", so it never matches its own input pattern.
            call = "cat " + NGS_Util.createFilePath(self.orgBlastResDir, organismName + "-vs-up_*") + " > " + org_vs_UniprotBlastDB
            NGS_Util.executeCall(call)
            return org_vs_UniprotBlastDB
        print(organismName + "-vs-Uniprot BLAST incomplete")
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def generatePics(currentRunDir):
    """Run the clustering-analysis R script and convert one plot to PNG.

    Pipes ``r_analyseClustering.R`` (located by plain concatenation with
    the module-level ``projectBinDir``) into ``R --slave --args
    <currentRunDir>``. Afterwards runs ``convert`` to turn
    ``SenSpeVsInfAll.eps`` into ``SenSpeVsInfAll.png`` inside the
    ``PicsTables`` directory path built from ``currentRunDir``.

    Args:
        currentRunDir: Run directory passed to the R script.

    Returns:
        Always the empty string; exceptions are reported and swallowed.
    """
    try:
        print("generatePics")
        # projectBinDir is joined without a separator, so it is expected to
        # end with one already.
        call = "cat " + projectBinDir + "r_analyseClustering.R | R --slave --args " + currentRunDir
        NGS_Util.executeCall(call)

        picsDir = NGS_Util.createDirectoryPath(currentRunDir, "PicsTables")
        epsFile = NGS_Util.createFilePath(picsDir, "SenSpeVsInfAll.eps")
        pngFile = NGS_Util.createFilePath(picsDir, "SenSpeVsInfAll.png")
        call = "convert " + epsFile + " " + pngFile
        NGS_Util.executeCall(call)
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def rawIPRScan(self, organismName, org_ipr_split_dir):
    """Run InterProScan on each FASTA fragment and join the raw outputs.

    For ``i = 1 .. self.numberOfFragments`` the fragment
    ``<organismName>_<i>`` under ``self.splitFasta.organismSplitDataDir``
    is handed to ``self.raw_split_IPRScan``. Afterwards a shell ``cat`` of
    ``<organismName>_split_*`` under ``self.orgIPRScanDir`` is redirected
    into ``<organismName>.ipr.raw``.

    Args:
        organismName: Organism to scan.
        org_ipr_split_dir: Not used by this implementation; kept for
            interface compatibility.

    Returns:
        Path of the joined raw file, or "" if an exception occurred.
    """
    try:
        print("rawIPRScan: " + organismName)
        for fragment in range(self.numberOfFragments):
            splitIndex = str(fragment + 1)
            org_ipr_split_file = NGS_Util.createFilePath(self.splitFasta.organismSplitDataDir, organismName + "_" + splitIndex)
            self.raw_split_IPRScan(organismName, org_ipr_split_file, splitIndex)

        ipr_raw_file_split = NGS_Util.createFilePath(self.orgIPRScanDir, organismName + "_split_*")
        ipr_raw_file = NGS_Util.createFilePath(self.orgIPRScanDir, organismName + ".ipr.raw")
        call = "cat " + ipr_raw_file_split + " > " + ipr_raw_file
        NGS_Util.executeCall(call)
        return ipr_raw_file
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def makeBlastDB(self, organismName):
    """Build the organism's protein BLAST database unless it already exists.

    Nothing is done when ``<organismName>.faa`` is missing from
    ``self.orgFastaDir``. Otherwise
    ``self.ngsBlast.makeProteinBlastDBFromDustFile`` is called with the
    FASTA file, ``<organismName>_dust.asnb`` under ``self.orgBlastDustDir``
    and the database base path under ``self.orgBlastDBDir``. The call is
    skipped if a ``.phd`` or ``.psq`` file for that base path already exists.

    Args:
        organismName: Organism whose database is built.

    Returns:
        The database base path when the FASTA file exists, otherwise "".
        "" is also returned if an exception occurred.
    """
    try:
        print("Make Blast Database: " + organismName)
        org_fasta = NGS_Util.createFilePath(self.orgFastaDir, organismName + ".faa")
        org_dust = NGS_Util.createFilePath(self.orgBlastDustDir, organismName + "_dust.asnb")
        org_blast_db = NGS_Util.createFilePath(self.orgBlastDBDir, organismName)

        if os.path.exists(org_fasta):
            # NOTE(review): ".phd" may be a typo for ".phr" - confirm which
            # files makeProteinBlastDBFromDustFile actually produces.
            already_built = os.path.exists(org_blast_db + ".phd") or os.path.exists(org_blast_db + ".psq")
            if not already_built:
                self.ngsBlast.makeProteinBlastDBFromDustFile(org_fasta, org_dust, org_blast_db)
            return org_blast_db
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def makeBlastDB(self, organismName):
    """Create the protein BLAST database for an organism if it is missing.

    Requires ``<organismName>.faa`` in ``self.orgFastaDir``; without it
    nothing happens. When neither ``<db>.phd`` nor ``<db>.psq`` exists for
    the base path under ``self.orgBlastDBDir``, the database is built with
    ``self.ngsBlast.makeProteinBlastDBFromDustFile`` from the FASTA file and
    ``<organismName>_dust.asnb`` under ``self.orgBlastDustDir``.

    Args:
        organismName: Organism whose database is built.

    Returns:
        The database base path when the FASTA file exists, otherwise "".
        "" is also returned if an exception occurred.
    """
    try:
        print("Make Blast Database: " + organismName)
        org_fasta = NGS_Util.createFilePath(self.orgFastaDir, organismName + ".faa")
        org_dust = NGS_Util.createFilePath(self.orgBlastDustDir, organismName + "_dust.asnb")
        org_blast_db = NGS_Util.createFilePath(self.orgBlastDBDir, organismName)

        if os.path.exists(org_fasta):
            # NOTE(review): ".phd" may be a typo for ".phr" - confirm which
            # files makeProteinBlastDBFromDustFile actually produces.
            if not os.path.exists(org_blast_db + ".phd") and not os.path.exists(org_blast_db + ".psq"):
                self.ngsBlast.makeProteinBlastDBFromDustFile(org_fasta, org_dust, org_blast_db)
            return org_blast_db
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def runClusterIPRScan(self, organismName):
    """Split the organism's FASTA file and submit an InterProScan array job.

    A fresh ``MetabolicReconstructionPipeline_SplitFasta`` instance splits
    ``<organismName>.faa`` into ``self.numberOfFragments`` parts. Then
    ``qsub -t 1-<numberOfFragments>:1 <ScriptsDir.ClusterIprscan>`` is run
    via ``NGS_Util.executeCall`` with three arguments: the
    ``interproscan.sh`` path, the split-file prefix and the output prefix
    ``<organismName>_split`` under ``self.orgIPRScanDir``.

    Args:
        organismName: Organism to scan.

    Returns:
        Always the empty string; exceptions are reported and swallowed.
    """
    try:
        splitFasta = MetabolicReconstructionPipeline_SplitFasta.MetabolicReconstructionPipeline_SplitFasta()

        org_fasta = NGS_Util.createFilePath(self.orgFastaDir, organismName + ".faa")
        splitFasta.splitOrganismDataFile(organismName, org_fasta, self.numberOfFragments)

        clusterArrayCall = "qsub -t 1-" + str(self.numberOfFragments) + ":1 " + ScriptsDir.ClusterIprscan
        # The trailing blank in the script name is kept from the original
        # command line; it only adds an extra space between arguments.
        iprscan = NGS_Util.createFilePath(ScriptsDir.IprscanDir, "interproscan.sh ")
        # The split directory is joined by plain concatenation, so it is
        # expected to end with a path separator.
        splitFile = splitFasta.organismSplitDataDir + organismName
        ipr_raw_file_split = NGS_Util.createFilePath(self.orgIPRScanDir, organismName + "_split")

        call = clusterArrayCall + " " + iprscan + " " + splitFile + " " + ipr_raw_file_split
        NGS_Util.executeCall(call)
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def run_Org_vs_Uniprot_ClusterBlast(self, organismName):
    """Split the organism's FASTA file and submit a BLAST-vs-UniProt array job.

    A fresh ``MetabolicReconstructionPipeline_SplitFasta`` instance splits
    ``<organismName>.faa`` into ``self.numberOfFragments`` parts. Then
    ``qsub -t 1-<numberOfFragments>:1 <ScriptsDir.ClusterBlast>`` is run
    via ``NGS_Util.executeCall`` with these arguments: the ``blastp``
    path, ``self.uniprot_blast_db``, the split-file prefix, the literal
    ``6``, the output prefix ``<organismName>-vs-up`` under
    ``self.orgBlastResDir`` and ``self.blastEValue``.

    Args:
        organismName: Organism to BLAST.

    Exceptions are reported with a traceback and swallowed. Returns None.
    """
    try:
        splitFasta = MetabolicReconstructionPipeline_SplitFasta.MetabolicReconstructionPipeline_SplitFasta()

        org_fasta = NGS_Util.createFilePath(self.orgFastaDir, organismName + ".faa")
        splitFasta.splitOrganismDataFile(organismName, org_fasta, self.numberOfFragments)

        clusterArrayCall = "qsub -t 1-" + str(self.numberOfFragments) + ":1 " + ScriptsDir.ClusterBlast
        blastP = NGS_Util.createFilePath(ScriptsDir.BlastDir, "blastp")
        # NOTE(review): 6 is presumably the BLAST output format number -
        # confirm against the cluster script.
        outfmt = str(6)
        # The split directory is joined by plain concatenation, so it is
        # expected to end with a path separator.
        splitFile = splitFasta.organismSplitDataDir + organismName
        org_vs_UniprotBlastDB = NGS_Util.createFilePath(self.orgBlastResDir, organismName + "-vs-up")

        call = " ".join([
            clusterArrayCall,
            blastP,
            self.uniprot_blast_db,
            splitFile,
            outfmt,
            org_vs_UniprotBlastDB,
            str(self.blastEValue),
        ])
        NGS_Util.executeCall(call)
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
def getBlastScore(self, mode):
    """Drive the cluster BLAST workflow for every organism in the list.

    An organism is skipped when its rectified file
    ``<organismName>.joint.blast`` already exists in ``self.jointBlastDir``.
    If only the joint file in ``self.orgBlastResDir`` exists it is rectified.
    Otherwise the action depends on ``mode``:

    * ``1`` - build the BLAST database and submit both cluster BLAST jobs,
      sleeping after each submission.
    * ``2`` - concatenate the two sets of cluster results and, if both
      succeed, combine and rectify them.

    Blank lines and lines starting with ``#`` in ``self.orgListFile`` are
    skipped. Exceptions are reported with a traceback and swallowed.

    Args:
        mode: ``1`` to submit jobs, ``2`` to collect results; any other
            value does nothing for organisms that still need work.

    Returns:
        Always the empty string.
    """
    try:
        print("getBlastScores")
        with open(self.orgListFile) as orgListFile_fh:
            for line in orgListFile_fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                organismNameID, organismName = line.split()

                orgJointBlast = NGS_Util.createFilePath(self.orgBlastResDir, organismName + ".joint.blast")
                orgRectifyBlast = NGS_Util.createFilePath(self.jointBlastDir, organismName + ".joint.blast")
                print("getBlastScore:" + organismName)

                if os.path.exists(orgRectifyBlast):
                    continue

                if os.path.exists(orgJointBlast):
                    self.rectifyBlast(organismName, orgJointBlast)
                elif mode == 1:
                    self.makeBlastDB(organismName)
                    self.run_Org_vs_Uniprot_ClusterBlast(organismName)
                    time.sleep(1800)  # wait 30 minutes for the submitted job
                    self.run_Uniprot_vs_Org_ClusterBlast(organismName)
                    time.sleep(2400)  # wait 40 minutes for the submitted job
                elif mode == 2:
                    org_vs_UniprotBlastDB = self.concatenate_Org_vs_Uniprot_ClusterBlast_results(organismName)
                    Uniprot_vs_orgBlastDB = self.concatenate_Uniprot_vs_Org_ClusterBlast_results(organismName)
                    # The helpers signal failure with "".
                    if org_vs_UniprotBlastDB != "" and Uniprot_vs_orgBlastDB != "":
                        orgJointBlast = self.combineBlast(organismName, org_vs_UniprotBlastDB, Uniprot_vs_orgBlastDB)
                        if orgJointBlast != "":
                            self.rectifyBlast(organismName, orgJointBlast)
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def getIPRScanScore(self, mode):
    """Drive the cluster InterProScan workflow for every listed organism.

    An organism is skipped when ``<organismName>.faa.IPR.final.txt``
    already exists in ``self.fungi_InterProScan_result``. Otherwise:

    * ``mode == 1`` - submit the cluster job and sleep six hours.
    * ``mode == 2`` - merge the cluster XML output, convert it to raw
      format, derive ipr2go / ipr2ec / seqid2ec files and, when both the raw
      file and the seqid2ec file exist, copy them to
      ``<organismName>.faa.raw.txt`` and
      ``<organismName>.faa.IPR.final.txt``.

    Blank lines and lines starting with ``#`` in ``self.orgListFile`` are
    skipped. Exceptions are reported with a traceback and swallowed.

    Args:
        mode: ``1`` to submit jobs, ``2`` to collect results.

    Returns:
        Always the empty string.
    """
    try:
        print("getIPRScanScore")
        with open(self.orgListFile) as orgListFile_fh:
            for line in orgListFile_fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                organismNameID, organismName = line.split()

                organism_IPR_final = NGS_Util.createFilePath(self.fungi_InterProScan_result, organismName + ".faa.IPR.final.txt")
                if os.path.exists(organism_IPR_final):
                    continue

                print("getIPRScanScore : " + organismName)
                if mode == 1:
                    self.runClusterIPRScan(organismName)
                    time.sleep(21600)  # sleep for 6 hrs
                elif mode == 2:
                    ipr_xml_file = self.concatenate_ClusterIPRScan_results(organismName)
                    if ipr_xml_file == "":
                        # Cluster output incomplete: running the converters
                        # on an empty path cannot produce a result.
                        continue
                    ipr_raw_file = self.xmlIPRScanToRAWOutput(organismName, ipr_xml_file)
                    organism_ipr2go = self.extract_ipr2go_based_on_xml(organismName, ipr_xml_file)
                    organism_ipr2ec = self.map_ipr_to_specific_ecs(organismName, organism_ipr2go)
                    organism_seqid2ec = self.combine_iprscan_raw_result_with_ipr2ec(organismName, organism_ipr2ec, ipr_raw_file)

                    if os.path.exists(ipr_raw_file) and os.path.exists(organism_seqid2ec):
                        organism_raw_final = NGS_Util.createFilePath(self.fungi_InterProScan_result, organismName + ".faa.raw.txt")
                        NGS_Util.copyFile(ipr_raw_file, organism_raw_final)
                        NGS_Util.copyFile(organism_seqid2ec, organism_IPR_final)
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def combineBlast(self, organismName, org_vs_UniprotBlastDB, Uniprot_vs_orgBlastDB):
    """Merge the two directional BLAST results into one joint file.

    Opens both BLAST result files and ``self.ec_files`` for reading, and
    ``<organismName>.joint.blast`` under ``self.orgBlastResDir`` for
    writing. The four handles are passed to the module-level
    ``combineBlasts`` function.

    Args:
        organismName: Organism the results belong to.
        org_vs_UniprotBlastDB: Path of the organism-vs-UniProt BLAST output.
        Uniprot_vs_orgBlastDB: Path of the UniProt-vs-organism BLAST output.

    Returns:
        Path of the joint file, or "" if an exception occurred.
    """
    try:
        print("Combine Blast: " + organismName)
        orgJointBlast = NGS_Util.createFilePath(self.orgBlastResDir, organismName + ".joint.blast")

        # Inputs are opened before the output so that a missing input does
        # not truncate an existing joint file; "with" closes every handle
        # even when combineBlasts raises.
        with open(org_vs_UniprotBlastDB) as org_vs_UniprotBlastDB_fh:
            with open(Uniprot_vs_orgBlastDB) as Uniprot_vs_orgBlastDB_fh:
                with open(self.ec_files) as ec_files_fh:
                    with open(orgJointBlast, "w") as orgJointBlast_fh:
                        combineBlasts(ec_files_fh, org_vs_UniprotBlastDB_fh, Uniprot_vs_orgBlastDB_fh, orgJointBlast_fh)
        return orgJointBlast
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def gettNormalizedBlastBitScoreMatrix(self, pfamID, bitScoreMatrix, blastResultFile):
    """Normalise a bit-score matrix and write it to the statistics directory.

    A fresh matrix is obtained from
    ``self.createBlastBitScoreMatrix(blastResultFile)``. For every pair
    ``(a, b)`` in ``bitScoreMatrix`` the entry becomes::

        score(a, b) / (score(a, a) + score(b, b) - score(a, b))

    The result is written with ``self.writeMatrixToFile`` to
    ``<pfamID>_NormalizedBitScores.txt`` under
    ``self.pfamBlastStaticticsDir``.

    Args:
        pfamID: Identifier used in the output file name.
        bitScoreMatrix: Nested mapping ``matrix[a][b] -> bit score``.
        blastResultFile: Passed to ``createBlastBitScoreMatrix`` to create
            the matrix that receives the normalised values.

    Returns:
        The normalised matrix, or "" if an exception occurred.
    """
    try:
        print("gettNormalizedBlastBitScoreMatrix")
        normalizedBitScoreMatrix = self.createBlastBitScoreMatrix(blastResultFile)

        for sequenceId in bitScoreMatrix:
            for matchingSequenceID in bitScoreMatrix[sequenceId]:
                pairScore = bitScoreMatrix[sequenceId][matchingSequenceID]
                denominator = (bitScoreMatrix[sequenceId][sequenceId]
                               + bitScoreMatrix[matchingSequenceID][matchingSequenceID]
                               - pairScore)
                if denominator == 0:
                    # No usable scores for this pair: keep the freshly
                    # created entry instead of aborting the whole matrix
                    # with a ZeroDivisionError.
                    continue
                # float() guards against integer division on Python 2 when
                # the matrix holds ints.
                normalizedBitScoreMatrix[sequenceId][matchingSequenceID] = pairScore / float(denominator)

        normalizedBitScoreMatrixFile = NGS_Util.createFilePath(self.pfamBlastStaticticsDir, pfamID + "_NormalizedBitScores.txt")
        self.writeMatrixToFile(normalizedBitScoreMatrix, normalizedBitScoreMatrixFile)
        return normalizedBitScoreMatrix
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def getBlastScore(self):
    """Produce the rectified joint BLAST file for every listed organism.

    Organisms whose ``<organismName>.joint.blast`` already exists in
    ``self.jointBlastDir`` are skipped. For the others the BLAST database is
    built and both BLAST directions are run. If both return a path the
    results are combined, and a successful combination is rectified.

    Blank lines and lines starting with ``#`` in ``self.orgListFile`` are
    skipped. Exceptions are reported with a traceback and swallowed.

    Returns:
        Always the empty string.
    """
    try:
        with open(self.orgListFile) as orgListFile_fh:
            for line in orgListFile_fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                organismNameID, organismName = line.split()

                orgRectifyBlast = NGS_Util.createFilePath(self.jointBlastDir, organismName + ".joint.blast")
                if os.path.exists(orgRectifyBlast):
                    continue

                print("getBlastScore:" + organismName)
                self.makeBlastDB(organismName)
                org_vs_UniprotBlastDB = self.blast_org_vs_uniprot(organismName)
                Uniprot_vs_orgBlastDB = self.blast_uniprot_vs_org(organismName)

                # The helpers signal failure with "".
                if org_vs_UniprotBlastDB != "" and Uniprot_vs_orgBlastDB != "":
                    orgJointBlast = self.combineBlast(organismName, org_vs_UniprotBlastDB, Uniprot_vs_orgBlastDB)
                    if orgJointBlast != "":
                        self.rectifyBlast(organismName, orgJointBlast)
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def doMCLClustering(self, pfamID, abcFile, mclClusteringDir):
    """Run ``mcl`` nine times with inflation values 1.2, 1.6, ... 4.4.

    The ``.mci`` / ``.tab`` inputs come from ``self.makeClusterInputFiles``.
    Each run writes to ``<pfamID>.mci.<I>`` in ``mclClusteringDir``,
    where ``<I>`` is the inflation value without its decimal point (``12``,
    ``16``, ... ``44``).

    Args:
        pfamID: Identifier used in the file names.
        abcFile: Input passed on to ``makeClusterInputFiles``.
        mclClusteringDir: Directory holding inputs and outputs.

    Exceptions are reported with a traceback and swallowed. Returns None.
    """
    try:
        mciFile, tabFile = self.makeClusterInputFiles(pfamID, abcFile, mclClusteringDir)

        for step in range(9):
            # Derive each value from integer tenths. Repeatedly adding 0.4
            # drifts (the sixth value becomes 3.1999999999999997), which
            # leaks into the file name wherever str() shows full precision.
            inflation = (12 + 4 * step) / 10.0
            output = NGS_Util.createFilePath(mclClusteringDir, pfamID + ".mci." + str(inflation).replace(".", ""))
            call = "mcl " + mciFile + " -I " + str(inflation) + " -use-tab " + tabFile + " -o " + output
            print(call)
            NGS_Util.executeCall(call)
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
def concatenate_ClusterIPRScan_results(self, organismName):
    """Merge the per-fragment InterProScan XML files once all are present.

    The fragments ``<organismName>_split_<i>.xml`` for
    ``i = 1 .. self.numberOfFragments`` must all exist under
    ``self.orgIPRScanDir``. If they do, ``self.mergeXML`` is called;
    otherwise a message is printed.

    Args:
        organismName: Organism whose fragments are merged.

    Returns:
        Whatever ``self.mergeXML`` returns when all fragments exist.
        Returns "" when a fragment is missing or an exception occurred.
    """
    try:
        # all() stops at the first missing fragment, like the original break.
        clusterProcessing = all(
            os.path.exists(
                NGS_Util.createFilePath(self.orgIPRScanDir, organismName + "_split_" + str(fragment + 1) + ".xml")
            )
            for fragment in range(self.numberOfFragments)
        )
        if clusterProcessing:
            return self.mergeXML(organismName)
        print("Interpro incomplete for: " + organismName)
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def makeClusterInputFiles(self, pfamID, abcFile, mclClusteringDir):
    """Convert an ABC file into the ``.mci`` / ``.tab`` pair used by ``mcl``.

    Runs ``mcxload`` through ``NGS_Util.executeCall`` with the options
    ``--stream-mirror --stream-neg-log10 -stream-tf 'ceil(200)'``. It writes
    ``<pfamID>.mci`` and ``<pfamID>.tab`` into ``mclClusteringDir``.

    Args:
        pfamID: Identifier used for the output file names.
        abcFile: Input file passed to ``mcxload -abc``.
        mclClusteringDir: Directory receiving the two output files.

    Returns:
        ``(mciFile, tabFile)``. On an exception nothing is returned
        (``None``), so callers that unpack the result fail in turn.
    """
    try:
        mciFile = NGS_Util.createFilePath(mclClusteringDir, pfamID + ".mci")
        tabFile = NGS_Util.createFilePath(mclClusteringDir, pfamID + ".tab")
        call = "mcxload -abc " + abcFile + " --stream-mirror --stream-neg-log10 -stream-tf 'ceil(200)' -o " + mciFile + " -write-tab " + tabFile
        NGS_Util.executeCall(call)
        return mciFile, tabFile
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
def getIPRScanScore(self):
    """Run the local InterProScan pipeline for every listed organism.

    Organisms whose ``<organismName>.faa.IPR.final.txt`` already exists in
    ``self.fungi_InterProScan_result`` are skipped. For the others the steps
    run in order: split FASTA, raw scan, raw-to-XML conversion, ipr2go
    extraction, ipr2ec mapping, and combination into a seqid2ec file. When
    both the raw file and the seqid2ec file exist they are copied to
    ``<organismName>.faa.raw.txt`` and
    ``<organismName>.faa.IPR.final.txt``.

    Blank lines and lines starting with ``#`` in ``self.orgListFile`` are
    skipped. Exceptions are reported with a traceback and swallowed.

    Returns:
        Always the empty string.
    """
    try:
        with open(self.orgListFile) as orgListFile_fh:
            for line in orgListFile_fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                organismNameID, organismName = line.split()

                organism_IPR_final = NGS_Util.createFilePath(self.fungi_InterProScan_result, organismName + ".faa.IPR.final.txt")
                if os.path.exists(organism_IPR_final):
                    continue

                print("getIPRScanScore : " + organismName)
                org_ipr_split_dir = self.splitFiles(organismName)
                ipr_raw_file = self.rawIPRScan(organismName, org_ipr_split_dir)
                ipr_xml_file = self.rawIPRScanToXMlOutput(organismName, ipr_raw_file)
                organism_ipr2go = self.extract_ipr2go_based_on_xml(organismName, ipr_xml_file)
                organism_ipr2ec = self.map_ipr_to_specific_ecs(organismName, organism_ipr2go)
                organism_seqid2ec = self.combine_iprscan_raw_result_with_ipr2ec(organismName, organism_ipr2ec, ipr_raw_file)

                if os.path.exists(ipr_raw_file) and os.path.exists(organism_seqid2ec):
                    organism_raw_final = NGS_Util.createFilePath(self.fungi_InterProScan_result, organismName + ".faa.raw.txt")
                    NGS_Util.copyFile(ipr_raw_file, organism_raw_final)
                    NGS_Util.copyFile(organism_seqid2ec, organism_IPR_final)
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def blast_uniprot_vs_org(self, organismName):
    """BLAST the UniProt FASTA file against the organism's BLAST database.

    Calls ``self.ngsBlast.blastP`` with the organism database under
    ``self.orgBlastDBDir``, ``self.uniprot_fasta`` as query and
    ``up-vs-<organismName>.blast`` under ``self.orgBlastResDir`` as output.

    Args:
        organismName: Organism whose database is searched.

    Returns:
        Path of the BLAST output file, or "" if an exception occurred.
    """
    try:
        print("uniprot_vs_org_blast Blast: " + organismName)
        org_blast_db = NGS_Util.createFilePath(self.orgBlastDBDir, organismName)
        Uniprot_vs_orgBlastDB = NGS_Util.createFilePath(self.orgBlastResDir, "up-vs-" + organismName + ".blast")
        # NOTE(review): 6 and 10 are presumably the output format and the
        # e-value cut-off - confirm against ngsBlast.blastP.
        self.ngsBlast.blastP(org_blast_db, self.uniprot_fasta, 6, Uniprot_vs_orgBlastDB, 10)
        return Uniprot_vs_orgBlastDB
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def blast_org_vs_nr40_blast_formatted_11(self, organismName):
    """BLAST the organism's proteins against the nrdb40 database.

    Calls ``self.ngsBlast.blastP`` with ``self.nrdb40_blast_db``, the
    organism's ``.faa`` file from ``self.orgFastaDir`` and the output
    ``<organismName>.nrdb40_v2.txt`` under ``self.orgGTGBlastResDir``.

    Args:
        organismName: Organism to BLAST.

    Returns:
        Path of the BLAST output file, or "" if an exception occurred.
    """
    try:
        print("blast_org_vs_nr40_blast_formatted_11: " + organismName)
        org_fasta = NGS_Util.createFilePath(self.orgFastaDir, organismName + ".faa")
        org_vs_nr40BlastDB_f11 = NGS_Util.createFilePath(self.orgGTGBlastResDir, organismName + ".nrdb40_v2.txt")
        # NOTE(review): 11 and 10 are presumably the output format and the
        # e-value cut-off - confirm against ngsBlast.blastP.
        self.ngsBlast.blastP(self.nrdb40_blast_db, org_fasta, 11, org_vs_nr40BlastDB_f11, 10)
        return org_vs_nr40BlastDB_f11
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def blast_org_vs_uniprot(self, organismName):
    """BLAST the organism's proteins against the UniProt BLAST database.

    Calls ``self.ngsBlast.blastP`` with ``self.uniprot_blast_db``, the
    organism's ``.faa`` file from ``self.orgFastaDir`` and the output
    ``<organismName>-vs-up.blast`` under ``self.orgBlastResDir``.

    Args:
        organismName: Organism to BLAST.

    Returns:
        Path of the BLAST output file, or "" if an exception occurred.
    """
    try:
        print("org_vs_uniprot_blast: " + organismName)
        org_fasta = NGS_Util.createFilePath(self.orgFastaDir, organismName + ".faa")
        org_vs_UniprotBlastDB = NGS_Util.createFilePath(self.orgBlastResDir, organismName + "-vs-up.blast")
        # NOTE(review): 6 and 10 are presumably the output format and the
        # e-value cut-off - confirm against ngsBlast.blastP.
        self.ngsBlast.blastP(self.uniprot_blast_db, org_fasta, 6, org_vs_UniprotBlastDB, 10)
        return org_vs_UniprotBlastDB
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def mergeXML(self, organismName):
    """Merge the organism's split InterProScan XML files into one document.

    Reads every ``<organismName>_split_*.xml`` under
    ``self.orgIPRScanDir`` in sorted order and writes
    ``<organismName>.xml`` in the same directory. The ``<?xml version=``
    and ``<protein-matches`` header lines are copied only until the first
    body line has been written. ``</protein-matches>`` lines are dropped
    from every fragment and a single closing tag is appended at the end.

    Args:
        organismName: Organism whose fragments are merged.

    Returns:
        Path of the merged XML file, or "" if an exception occurred.
    """
    try:
        ipr_xml_file = NGS_Util.createFilePath(self.orgIPRScanDir, organismName + ".xml")
        # Only the ".xml" fragments belong in the merged document; other
        # files sharing the "_split_" prefix would corrupt it.
        split_pattern = NGS_Util.createFilePath(self.orgIPRScanDir, organismName + "_split_*.xml")

        isFirst = True
        with open(ipr_xml_file, "w") as ipr_xml_file_fh:
            # sorted() makes the merge order reproducible; glob() alone
            # returns files in arbitrary order.
            for srcFile in sorted(glob.glob(split_pattern)):
                with open(srcFile) as ipr_XML_split_fh:
                    for line in ipr_XML_split_fh:
                        if line.startswith("<?xml version=") or line.startswith("<protein-matches"):
                            if isFirst:
                                ipr_xml_file_fh.write(line)
                        elif not line.startswith("</protein-matches>"):
                            # First body line seen: headers of all later
                            # fragments are suppressed from now on.
                            isFirst = False
                            ipr_xml_file_fh.write(line)
            ipr_xml_file_fh.write("</protein-matches>")
        return ipr_xml_file
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def initialize(self, seq_org_list, jointBlastDir, GTGFungiKNNDir, fungi_InterProScan_result, phylogeneticTreeFile, modelTrainingDir):
    """Store the input locations and create the model-training directories.

    Below ``modelTrainingDir`` the sub-directories ``IPR_EC``,
    ``BlastPValues``, ``ECScores``, ``ProbabilityDensityScore``,
    ``Tree`` and ``Model`` are created. If ``phylogeneticTreeFile``
    exists it is copied to ``Tree/tree`` and ``self.phylogeneticTreeFile``
    is pointed at the copy. ``self.treeCPDS`` is set to ``Tree/tree.cpds``.

    Args:
        seq_org_list: Stored as ``self.seq_org_list``.
        jointBlastDir: Stored as ``self.jointBlastDir``.
        GTGFungiKNNDir: Stored as ``self.GTGFungiKNNDir``.
        fungi_InterProScan_result: Stored as
            ``self.fungi_InterProScan_result``.
        phylogeneticTreeFile: Tree file copied into the training directory.
        modelTrainingDir: Root directory for all training output.

    Exceptions are reported with a traceback and swallowed. Returns None.
    """
    try:
        self.seq_org_list = seq_org_list
        self.jointBlastDir = jointBlastDir
        self.GTGFungiKNNDir = GTGFungiKNNDir
        self.fungi_InterProScan_result = fungi_InterProScan_result
        self.phylogeneticTreeFile = phylogeneticTreeFile
        self.modelTrainingDir = modelTrainingDir

        self.modelTraining_IPR_EC_Dir = NGS_Util.createDirectoryPath(self.modelTrainingDir, "IPR_EC")
        self.modelTrainingBlastPVDir = NGS_Util.createDirectoryPath(self.modelTrainingDir, "BlastPValues")
        self.modelTraining_EC_Scores_Dir = NGS_Util.createDirectoryPath(self.modelTrainingDir, "ECScores")
        self.modelTrainingProbabilityDensityScoreDir = NGS_Util.createDirectoryPath(self.modelTrainingDir, "ProbabilityDensityScore")
        self.modelTrainingTreeDir = NGS_Util.createDirectoryPath(self.modelTrainingDir, "Tree")
        self.modelTrainingModelDir = NGS_Util.createDirectoryPath(self.modelTrainingDir, "Model")

        # The root comes first so the sub-directories can be created in it.
        for directory in (
            self.modelTrainingDir,
            self.modelTraining_IPR_EC_Dir,
            self.modelTrainingBlastPVDir,
            self.modelTraining_EC_Scores_Dir,
            self.modelTrainingProbabilityDensityScoreDir,
            self.modelTrainingTreeDir,
            self.modelTrainingModelDir,
        ):
            NGS_Util.createDirectory(directory)

        if os.path.exists(self.phylogeneticTreeFile):
            treeCopy = NGS_Util.createFilePath(self.modelTrainingTreeDir, "tree")
            NGS_Util.copyFile(self.phylogeneticTreeFile, treeCopy)
            # NOTE(review): the attribute is repointed only when the copy
            # was made - confirm this matches the original nesting.
            self.phylogeneticTreeFile = treeCopy

        self.treeCPDS = NGS_Util.createFilePath(self.modelTrainingTreeDir, "tree.cpds")
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
def splitFiles(self, organismName):
    """Split the organism's FASTA file into ``self.numberOfFragments`` parts.

    Delegates to ``self.splitFasta.splitOrganismDataFile`` with
    ``<organismName>.faa`` from ``self.orgFastaDir``.

    Args:
        organismName: Organism whose FASTA file is split.

    Returns:
        ``self.splitFasta.organismSplitDataDir``. This is returned even when
        the split raised an exception, which is only reported.
    """
    try:
        org_fasta = NGS_Util.createFilePath(self.orgFastaDir, organismName + ".faa")
        self.splitFasta.splitOrganismDataFile(organismName, org_fasta, self.numberOfFragments)
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return self.splitFasta.organismSplitDataDir
def computeTreeProbabilityDensityScore(self):
    """Run the mutation-probability estimation script on the training tree.

    Executes ``python
    <ScriptsDir.ModelTrainingScripts_estimate_mutation_probability>`` with
    ``self.modelTraining_IPR_EC_Dir``, ``self.phylogeneticTreeFile`` and
    ``self.modelTrainingTreeDir`` as arguments. It then sets
    ``self.treeCPDS`` to ``tree.cpds`` inside the tree directory.

    Exceptions are reported with a traceback and swallowed. Returns None.
    """
    try:
        print("computeTreeProbabilityDensityScore")
        call = " ".join([
            "python",
            ScriptsDir.ModelTrainingScripts_estimate_mutation_probability,
            self.modelTraining_IPR_EC_Dir,
            self.phylogeneticTreeFile,
            self.modelTrainingTreeDir,
        ])
        NGS_Util.executeCall(call)
        self.treeCPDS = NGS_Util.createFilePath(self.modelTrainingTreeDir, "tree.cpds")
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
def rawIPRScan(self, organismName, org_ipr_split_dir):
    """Run InterProScan over the split files and join the raw outputs.

    Calls ``self.raw_SingleRun_IPRScan(organismName, org_ipr_split_dir)``.
    A shell ``cat`` of ``<organismName>_split_*`` under
    ``self.orgIPRScanDir`` is then redirected into
    ``<organismName>.ipr.raw`` in the same directory.

    Args:
        organismName: Organism to scan.
        org_ipr_split_dir: Directory with the split FASTA files.

    Returns:
        Path of the joined raw file, or "" if an exception occurred.
    """
    try:
        print("rawIPRScan: " + organismName)
        self.raw_SingleRun_IPRScan(organismName, org_ipr_split_dir)

        ipr_raw_file_split = NGS_Util.createFilePath(self.orgIPRScanDir, organismName + "_split_*")
        ipr_raw_file = NGS_Util.createFilePath(self.orgIPRScanDir, organismName + ".ipr.raw")
        call = "cat " + ipr_raw_file_split + " > " + ipr_raw_file
        NGS_Util.executeCall(call)
        return ipr_raw_file
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def rawIPRScanToXMlOutput(self, organismName, ipr_raw_file):
    """Convert a raw InterProScan file to XML.

    Delegates to ``self.ngsIPRScan.convert_raw_xml``. The output is
    ``<organismName>.xml`` under ``self.orgIPRScanDir``.

    Args:
        organismName: Organism the raw file belongs to.
        ipr_raw_file: Path of the raw InterProScan output.

    Returns:
        Path of the XML file, or "" if an exception occurred.
    """
    try:
        print("rawIPRScanToXMlOutput: " + organismName)
        ipr_xml_file = NGS_Util.createFilePath(self.orgIPRScanDir, organismName + ".xml")
        self.ngsIPRScan.convert_raw_xml(ipr_raw_file, ipr_xml_file)
        return ipr_xml_file
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def xmlIPRScanToRAWOutput(self, organismName, ipr_xml_file):
    """Convert an InterProScan XML file to raw format.

    Delegates to ``self.ngsIPRScan.convert_iprscan5_xml_raw``. The output
    is ``<organismName>.ipr.raw`` under ``self.orgIPRScanDir``.

    Args:
        organismName: Organism the XML file belongs to.
        ipr_xml_file: Path of the InterProScan XML output.

    Returns:
        Path of the raw file, or "" if an exception occurred.
    """
    try:
        print("xmlIPRScanToRAWOutput: " + organismName)
        ipr_raw_file = NGS_Util.createFilePath(self.orgIPRScanDir, organismName + ".ipr.raw")
        self.ngsIPRScan.convert_iprscan5_xml_raw(ipr_xml_file, ipr_raw_file)
        return ipr_raw_file
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def blast_org_vs_nr40_blast_formatted_6(self, organismName, org_vs_nr40BlastDB_f11):
    """Re-format the nrdb40 BLAST result with ``self.ngsBlast.blastFormatter``.

    The formatter is called with the given result file, the literal ``6``
    and the output ``<organismName>.nrdb40_v2_6.txt`` under
    ``self.orgGTGBlastResDir``.

    Args:
        organismName: Organism the BLAST result belongs to.
        org_vs_nr40BlastDB_f11: Path of the BLAST result to re-format.

    Returns:
        Path of the re-formatted file, or "" if an exception occurred.
    """
    try:
        print("blast_org_vs_nr40_blast_formatted_6: " + organismName)
        org_vs_nr40BlastDB_f6 = NGS_Util.createFilePath(self.orgGTGBlastResDir, organismName + ".nrdb40_v2_6.txt")
        # NOTE(review): 6 is presumably the target output format number -
        # confirm against ngsBlast.blastFormatter.
        self.ngsBlast.blastFormatter(org_vs_nr40BlastDB_f11, 6, org_vs_nr40BlastDB_f6)
        return org_vs_nr40BlastDB_f6
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def create_new_seq_org_list(self, organismName, organismID):
    """Append the organism's sequence ids to ``self.seq_org_list`` if absent.

    The list is scanned for any line containing ``organismID`` (a plain
    substring test). If none is found, every header line (starting with
    ``>``) of ``<organismName>.faa`` in ``self.orgFastaDir`` contributes
    one ``<id>\\t<organismID>`` line to the list. The id is taken from the
    first whitespace-delimited token of the header. If that token contains
    ``|`` the id is its second ``|``-separated field; otherwise the id is
    the whole token.

    Args:
        organismName: Base name of the organism's FASTA file.
        organismID: Identifier written next to each sequence id.

    Exceptions are reported with a traceback and swallowed. Returns None.
    """
    try:
        print("create_new_seq_org_list: " + organismName)

        # "with" actually closes the handles; the original referenced
        # ".close" without calling it.
        with open(self.seq_org_list) as orgListFile_fh:
            found = any(organismID in line for line in orgListFile_fh)

        if not found:
            org_fasta = NGS_Util.createFilePath(self.orgFastaDir, organismName + ".faa")
            with open(org_fasta) as org_fasta_fh:
                with open(self.seq_org_list, "a") as orgListFile_fh:
                    for line in org_fasta_fh:
                        if not line.startswith(">"):
                            continue
                        # Splitting on any whitespace keeps the trailing
                        # newline out of ids whose header has no description.
                        header = line.split()[0]
                        if "|" in header:
                            seq_id = header.split("|")[1]
                        else:
                            # NOTE(review): the leading ">" stays part of
                            # the id here, as before - confirm downstream
                            # scripts expect that.
                            seq_id = header
                        orgListFile_fh.write(seq_id + "\t" + organismID + "\n")
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
def reform_knn(self, organismName, org_gtg_knn):
    """Step 9 of the GTG pipeline: add organism and EC information.

    Executes ``python <ScriptsDir.GTGScripts_reform_knn>`` with
    ``self.seq_org_list``, ``self.ec_files``, the given kNN file and the
    output ``<organismName>.gtg.knn`` under ``self.GTGKNNDir``.

    Args:
        organismName: Name used for the output file.
        org_gtg_knn: Path of the kNN file to reform.

    Returns:
        Path of the output file, or "" if an exception occurred.
    """
    try:
        print("reform_knn: " + organismName)
        org_gtg_knn_final = NGS_Util.createFilePath(self.GTGKNNDir, organismName + ".gtg.knn")
        call = " ".join([
            "python",
            ScriptsDir.GTGScripts_reform_knn,
            self.seq_org_list,
            self.ec_files,
            org_gtg_knn,
            org_gtg_knn_final,
        ])
        NGS_Util.executeCall(call)
        return org_gtg_knn_final
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def rectifyBlast(self, organismName, orgJointBlast):
    """Run the BLAST rectification script on a joint BLAST file.

    Executes ``python <ScriptsDir.BlastScripts_rectify_blastresult>`` with
    the joint file as input. The output is
    ``<organismName>.joint.blast`` under ``self.jointBlastDir``.

    Args:
        organismName: Organism the joint file belongs to.
        orgJointBlast: Path of the joint BLAST file.

    Returns:
        Path of the rectified file, or "" if an exception occurred.
    """
    try:
        print("Rectify Blast: " + organismName)
        orgRectifyBlast = NGS_Util.createFilePath(self.jointBlastDir, organismName + ".joint.blast")
        call = " ".join([
            "python",
            ScriptsDir.BlastScripts_rectify_blastresult,
            orgJointBlast,
            orgRectifyBlast,
        ])
        NGS_Util.executeCall(call)
        return orgRectifyBlast
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def extract_ipr2go_based_on_xml(self, organismName, ipr_xml_file):
    """Run the ipr2go extraction script on an InterProScan XML file.

    Executes ``python <ScriptsDir.IPRScanScripts_ipr2go>`` with the XML
    file as input. The output is ``<organismName>_ipr2go.txt`` under
    ``self.orgIPRScanDir``.

    Args:
        organismName: Organism the XML file belongs to.
        ipr_xml_file: Path of the InterProScan XML file.

    Returns:
        Path of the ipr2go file, or "" if an exception occurred.
    """
    try:
        print("extract_ipr2go_based_on_xml: " + organismName)
        organism_ipr2go = NGS_Util.createFilePath(self.orgIPRScanDir, organismName + "_ipr2go.txt")
        call = " ".join([
            "python",
            ScriptsDir.IPRScanScripts_ipr2go,
            ipr_xml_file,
            organism_ipr2go,
        ])
        NGS_Util.executeCall(call)
        return organism_ipr2go
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def map_ipr_to_specific_ecs(self, organismName, organism_ipr2go):
    """Run the script that maps an ipr2go file to EC numbers.

    Executes ``python <ScriptsDir.IPRScanScripts_get_interpro_ecs>`` with
    ``self.ec2go`` and the ipr2go file as inputs. The output is
    ``<organismName>_ipr2ec.txt`` under ``self.orgIPRScanDir``.

    Args:
        organismName: Organism the ipr2go file belongs to.
        organism_ipr2go: Path of the ipr2go file.

    Returns:
        Path of the ipr2ec file, or "" if an exception occurred.
    """
    try:
        print("map_ipr_to_specific_ecs: " + organismName)
        organism_ipr2ec = NGS_Util.createFilePath(self.orgIPRScanDir, organismName + "_ipr2ec.txt")
        call = " ".join([
            "python",
            ScriptsDir.IPRScanScripts_get_interpro_ecs,
            self.ec2go,
            organism_ipr2go,
            organism_ipr2ec,
        ])
        NGS_Util.executeCall(call)
        return organism_ipr2ec
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def raw_split_IPRScan(self, organismName, organismSplitFile, splitNameIndex):
    """Run InterProScan on one FASTA fragment unless its output exists.

    The output is ``<organismName>_split_<splitNameIndex>.ipr.raw`` under
    ``self.orgIPRScanDir``. When that file is missing it is produced with
    ``self.ngsIPRScan.protein_iprscan_to_raw_output``.

    Args:
        organismName: Organism the fragment belongs to.
        organismSplitFile: Path of the FASTA fragment.
        splitNameIndex: Fragment number used in the output name.

    Returns:
        Path of the raw output file, or "" if an exception occurred.
    """
    try:
        print("raw_split_IPRScan: " + organismName + " " + organismSplitFile)
        ipr_raw_file = NGS_Util.createFilePath(self.orgIPRScanDir, organismName + "_split_" + str(splitNameIndex) + ".ipr.raw")
        if not os.path.exists(ipr_raw_file):
            self.ngsIPRScan.protein_iprscan_to_raw_output(organismSplitFile, ipr_raw_file)
        return ipr_raw_file
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
        print("error raw_split_IPRScan: " + organismName + " " + organismSplitFile)
    return ""
def extract_start_len_fmt11(self, organismName, org_vs_nr40BlastDB_f11):
    """Step 3 of the GTG pipeline: extract start, length and subject name.

    Executes ``python <ScriptsDir.GTGScripts_extract_start_len_fmt11>``
    on the format-11 BLAST result. The output is
    ``<organismName>.nrdb40_v2.part2`` under ``self.orgGTGBlastResDir``.

    Args:
        organismName: Organism the BLAST result belongs to.
        org_vs_nr40BlastDB_f11: Path of the format-11 BLAST result.

    Returns:
        Path of the ``.part2`` file, or "" if an exception occurred.
    """
    try:
        print("extract_start_len_fmt11: " + organismName)
        org_vs_nr40BlastDB_f11_part2 = NGS_Util.createFilePath(self.orgGTGBlastResDir, organismName + ".nrdb40_v2.part2")
        call = " ".join([
            "python",
            ScriptsDir.GTGScripts_extract_start_len_fmt11,
            org_vs_nr40BlastDB_f11,
            org_vs_nr40BlastDB_f11_part2,
        ])
        NGS_Util.executeCall(call)
        return org_vs_nr40BlastDB_f11_part2
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def extract_combine_seq_start_len_fmt11(self, organismName, org_vs_nr40BlastDB_f11_part1, org_vs_nr40BlastDB_f11_part2):
    """Step 4 of the GTG pipeline: combine the ``.part1`` and ``.part2`` files.

    Executes ``python
    <ScriptsDir.GTGScripts_extract_combine_seq_start_len_fmt11>`` with both
    part files as inputs. The output is
    ``<organismName>.nrdb40_v2.part1.part2.result`` under
    ``self.orgGTGBlastResDir``.

    Args:
        organismName: Organism the part files belong to.
        org_vs_nr40BlastDB_f11_part1: Path of the ``.part1`` file.
        org_vs_nr40BlastDB_f11_part2: Path of the ``.part2`` file.

    Returns:
        Path of the combined file, or "" if an exception occurred.
    """
    try:
        print("extract_combine_seq_start_len_fmt11: " + organismName)
        org_vs_nr40BlastDB_f11_part1_part2_result = NGS_Util.createFilePath(self.orgGTGBlastResDir, organismName + ".nrdb40_v2.part1.part2.result")
        call = " ".join([
            "python",
            ScriptsDir.GTGScripts_extract_combine_seq_start_len_fmt11,
            org_vs_nr40BlastDB_f11_part1,
            org_vs_nr40BlastDB_f11_part2,
            org_vs_nr40BlastDB_f11_part1_part2_result,
        ])
        NGS_Util.executeCall(call)
        return org_vs_nr40BlastDB_f11_part1_part2_result
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def getGTGScore(self):
    """Run the GTG pipeline for every organism that has no kNN file yet.

    Organisms whose ``<organismNameID>.gtg.knn`` already exists in
    ``self.GTGKNNDir`` are skipped. For the others the steps run in order:
    BLAST against nrdb40, re-format, extract ``.part1`` / ``.part2``,
    combine them, reform, extract best hits, extract GTG, kNN with
    ``self.numberNearestHits``, and the final reform. The BLAST and
    extraction steps are keyed by the organism name. The steps from
    best-hit extraction onwards are keyed by the organism ID.

    Blank lines and lines starting with ``#`` in ``self.orgListFile`` are
    skipped. Exceptions are reported with a traceback and swallowed.

    Returns:
        Always the empty string.
    """
    try:
        with open(self.orgListFile) as orgListFile_fh:
            for line in orgListFile_fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                organismNameID, organismName = line.split()

                org_gtg_knn_final = NGS_Util.createFilePath(self.GTGKNNDir, organismNameID + ".gtg.knn")
                if os.path.exists(org_gtg_knn_final):
                    continue

                print("getGTGScore : " + organismName)
                org_vs_nr40BlastDB_f11 = self.blast_org_vs_nr40_blast_formatted_11(organismName)
                org_vs_nr40BlastDB_f6 = self.blast_org_vs_nr40_blast_formatted_6(organismName, org_vs_nr40BlastDB_f11)
                org_vs_nr40BlastDB_f11_part1 = self.extract_seq_fmt11(organismName, org_vs_nr40BlastDB_f11)
                org_vs_nr40BlastDB_f11_part2 = self.extract_start_len_fmt11(organismName, org_vs_nr40BlastDB_f11)
                org_vs_nr40BlastDB_f11_part1_part2_result = self.extract_combine_seq_start_len_fmt11(
                    organismName, org_vs_nr40BlastDB_f11_part1, org_vs_nr40BlastDB_f11_part2)
                org_vs_nr40BlastDB_result_final = self.reform(
                    organismName, org_vs_nr40BlastDB_f6, org_vs_nr40BlastDB_f11_part1_part2_result)
                org_vs_nr40BlastDB_best_hit = self.extract_best_hit(organismNameID, org_vs_nr40BlastDB_result_final)
                org_gtg = self.extract_gtg(organismNameID, org_vs_nr40BlastDB_best_hit)
                org_gtg_knn = self.gtgknn(organismNameID, org_gtg, self.numberNearestHits)
                self.reform_knn(organismNameID, org_gtg_knn)
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def getBlastBitScoreMatrix(self, pfamID, blastResultFile):
    """Fill a bit-score matrix from a tabular BLAST result and write it out.

    The matrix comes from ``self.createBlastBitScoreMatrix``. If
    ``blastResultFile`` exists, each tab-separated line with at least
    twelve columns updates ``matrix[col1][col2]`` with the bit score in
    column 12 whenever that score is larger than the stored value. The
    matrix is then written with ``self.writeMatrixToFile`` to
    ``<pfamID>_AllvsALL_BlastBitScores.txt`` under
    ``self.pfamBlastStaticticsDir``.

    Args:
        pfamID: Identifier used in the output file name.
        blastResultFile: Path of the tabular BLAST result.

    Returns:
        The bit-score matrix, or "" if an exception occurred.
    """
    try:
        print("getBlastBitScoreMatrix")
        bitScoreMatrix = self.createBlastBitScoreMatrix(blastResultFile)

        if os.path.exists(blastResultFile):
            with open(blastResultFile) as blastResultFile_fh:
                for line in blastResultFile_fh:
                    blastOutput = line.strip().split("\t")
                    # Column 12 is read below, so shorter lines (blank or
                    # truncated) are skipped instead of aborting the run.
                    if len(blastOutput) < 12:
                        continue
                    sequenceID = blastOutput[0]
                    matchingSequenceID = blastOutput[1]
                    bitScore = float(blastOutput[11])
                    # Keep the best score when a pair occurs more than once.
                    if bitScoreMatrix[sequenceID][matchingSequenceID] < bitScore:
                        bitScoreMatrix[sequenceID][matchingSequenceID] = bitScore

        bitScoreMatrixFile = NGS_Util.createFilePath(self.pfamBlastStaticticsDir, pfamID + "_AllvsALL_BlastBitScores.txt")
        self.writeMatrixToFile(bitScoreMatrix, bitScoreMatrixFile)
        return bitScoreMatrix
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def combine_iprscan_raw_result_with_ipr2ec(self, organismName, organism_ipr2ec, ipr_raw_file):
    """Run the script that combines the raw InterProScan result with ipr2ec.

    Executes ``python <ScriptsDir.IPRScanScripts_combineIPRwithECs>`` with
    the ipr2ec file, the raw file and ``self.seq_org_list`` as inputs.
    The output is ``<organismName>_seqid2ec.txt`` under
    ``self.orgIPRScanDir``.

    Args:
        organismName: Organism the inputs belong to.
        organism_ipr2ec: Path of the ipr2ec file.
        ipr_raw_file: Path of the raw InterProScan file.

    Returns:
        Path of the seqid2ec file, or "" if an exception occurred.
    """
    # TODO (carried over from the original note): switch from
    # new_seq_org_list to seq_org_list.
    try:
        print("combine_iprscan_raw_result_with_ipr2ec: " + organismName)
        organism_seqid2ec = NGS_Util.createFilePath(self.orgIPRScanDir, organismName + "_seqid2ec.txt")
        call = " ".join([
            "python",
            ScriptsDir.IPRScanScripts_combineIPRwithECs,
            organism_ipr2ec,
            ipr_raw_file,
            self.seq_org_list,
            organism_seqid2ec,
        ])
        NGS_Util.executeCall(call)
        return organism_seqid2ec
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def extract_seq_fmt11(self, organismName, org_vs_nr40BlastDB_f11):
    """Step 2 of the GTG pipeline: extract query information from format 11.

    Executes ``python <ScriptsDir.GTGScripts_extract_seq_fmt11>`` on the
    format-11 BLAST result. The output is
    ``<organismName>.nrdb40_v2.part1`` under ``self.orgGTGBlastResDir``.

    Args:
        organismName: Organism the BLAST result belongs to.
        org_vs_nr40BlastDB_f11: Path of the format-11 BLAST result.

    Returns:
        Path of the ``.part1`` file, or "" if an exception occurred.
    """
    try:
        print("extract_seq_fmt11: " + organismName)
        org_vs_nr40BlastDB_f11_part1 = NGS_Util.createFilePath(self.orgGTGBlastResDir, organismName + ".nrdb40_v2.part1")
        call = " ".join([
            "python",
            ScriptsDir.GTGScripts_extract_seq_fmt11,
            org_vs_nr40BlastDB_f11,
            org_vs_nr40BlastDB_f11_part1,
        ])
        NGS_Util.executeCall(call)
        return org_vs_nr40BlastDB_f11_part1
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def extract_best_hit(self, organismName, org_vs_nr40BlastDB_result_final):
    """Step 6 of the GTG pipeline: extract the best hit per query sequence.

    Executes ``python <ScriptsDir.GTGScripts_extract_best_hit>`` on the
    final BLAST result. The output is
    ``<organismName>.nrdb40.best_hit`` under ``self.GTGBestHitsDir``.

    Args:
        organismName: Name used for the output file.
        org_vs_nr40BlastDB_result_final: Path of the reformed BLAST result.

    Returns:
        Path of the best-hit file, or "" if an exception occurred.
    """
    try:
        print("extract_best_hit: " + organismName)
        org_vs_nr40BlastDB_best_hit = NGS_Util.createFilePath(self.GTGBestHitsDir, organismName + ".nrdb40.best_hit")
        call = " ".join([
            "python",
            ScriptsDir.GTGScripts_extract_best_hit,
            org_vs_nr40BlastDB_result_final,
            org_vs_nr40BlastDB_best_hit,
        ])
        NGS_Util.executeCall(call)
        return org_vs_nr40BlastDB_best_hit
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def reform(self, organismName, org_vs_nr40BlastDB_f6, org_vs_nr40BlastDB_f11_part1_part2_result):
    """Step 5 of the GTG pipeline: reform the BLAST results into one file.

    Executes ``python <ScriptsDir.GTGScripts_reform>`` with the format-6
    result and the combined part1/part2 result as inputs. The output is
    ``<organismName>.nrdb40.result.final`` under
    ``self.orgGTGBlastResDir``.

    Args:
        organismName: Organism the inputs belong to.
        org_vs_nr40BlastDB_f6: Path of the format-6 BLAST result.
        org_vs_nr40BlastDB_f11_part1_part2_result: Path of the combined
            part1/part2 file.

    Returns:
        Path of the final result file, or "" if an exception occurred.
    """
    try:
        print("reform: " + organismName)
        org_vs_nr40BlastDB_result_final = NGS_Util.createFilePath(self.orgGTGBlastResDir, organismName + ".nrdb40.result.final")
        call = " ".join([
            "python",
            ScriptsDir.GTGScripts_reform,
            org_vs_nr40BlastDB_f6,
            org_vs_nr40BlastDB_f11_part1_part2_result,
            org_vs_nr40BlastDB_result_final,
        ])
        NGS_Util.executeCall(call)
        return org_vs_nr40BlastDB_result_final
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""
def splitFiles(self, organismName):
    """Split the organism's FASTA file with the ``fsplit`` shell script.

    Executes ``sh <ScriptsDir.IPRScanScripts_fsplit>`` with
    ``<organismName>.faa`` from ``self.orgFastaDir`` and the directory
    path ``<organismName>`` built under ``self.orgIPRScanDir``.

    Args:
        organismName: Organism whose FASTA file is split.

    Returns:
        The split directory path, or "" if an exception occurred.
    """
    try:
        print("splitFiles: " + organismName)
        org_fasta = NGS_Util.createFilePath(self.orgFastaDir, organismName + ".faa")
        org_ipr_split_dir = NGS_Util.createDirectoryPath(self.orgIPRScanDir, organismName)
        call = "sh " + ScriptsDir.IPRScanScripts_fsplit + " " + org_fasta + " " + org_ipr_split_dir
        NGS_Util.executeCall(call)
        return org_ipr_split_dir
    except Exception:
        # print_exc() writes the traceback itself and returns None; printing
        # its result only added a stray "None" line.
        traceback.print_exc()
    return ""