def ppOut2PPSout(inFile='/Users/ivan/Documents/work/binning/data/HumanGut/PP/TS29_scaff.file.0.5.txt',
                 outFile='/Users/ivan/Documents/work/binning/data/HumanGut/PP/TS29_scaff.file.0.5.PPS.txt',
                 dbFile='/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'):
    """
        Feed every line of inFile to a PP2PPSoutParser that writes to outFile.

        The defaults reproduce the paths that used to be hard-coded in this function, so calling it
        without arguments behaves as before.

        NOTE(review): judging by the names, this converts a "PP" prediction file to the "PPS" format;
        the actual conversion is implemented in PP2PPSoutParser, which is not visible here.

        @param inFile: path of the input file that is read line by line
        @param outFile: path of the output file that is written via csv.OutFileBuffer
        @param dbFile: path of the NCBI taxonomy sqlite database passed to taxonomy_ncbi.TaxonomyNcbi
    """
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(dbFile)
    out = csv.OutFileBuffer(outFile)
    try:
        csv.forEachLine(inFile, PP2PPSoutParser(taxonomy, out))
    finally:
        # close the output buffer even if the parsing fails
        out.close()
def setCandidatePlacement(self, sequences, taxonomy, fastaFileDNA):
    """
        Set candidate placements from the combined marker gene prediction file.

        Reads the file "<basename of fastaFileDNA>_all.mP" located in self.markerGeneWorkingDir, passes
        each of its lines to a _SetCandidatePlacement(sequences, taxonomy) parser and returns the value
        of the parser's getAssignedSeqCount().

        @param sequences: passed to _SetCandidatePlacement
        @param taxonomy: passed to _SetCandidatePlacement
        @param fastaFileDNA: path of the DNA fasta file; only its base name is used
        @return: result of getAssignedSeqCount() (presumably the number of sequences that were assigned)
    """
    allPredBaseName = str(os.path.basename(fastaFileDNA) + '_all.mP')
    allPredFilePath = os.path.join(self.markerGeneWorkingDir, allPredBaseName)
    placementParser = _SetCandidatePlacement(sequences, taxonomy)
    parsed = forEachLine(allPredFilePath, placementParser)
    return parsed.getAssignedSeqCount()
def runMarkerGeneAnalysis(self, fastaFileDNA, outLog=None):
    """
        Run hmmer HMM and mothur classify (bayesian), same param as for the 16S analysis.

        For each gene name returned by the marker gene list file:
          1. run "hmmsearch" with the primary protein HMM profile against the translated sequences,
          2. cut the matching regions out of the DNA sequences (considering the reading frame and the
             reverse complement encoded in the protein sequence name) and write them as fasta records
             to "<gene>_dna.gff" in the working directory,
          3. run mothur "classify.seqs" on these regions,
          4. parse the mothur output into "<fasta basename>_<gene>.mP" and append the parsed text to
             "<fasta basename>_all.mP".

        The process exits with status -1 (sys.exit) if one of the external commands is reported as failed.

        @param fastaFileDNA: path of the DNA fasta file to be analysed
        @param outLog: path of a log file that is opened for writing and passed as "stdout" to the
            mothur task; if None, subprocess.STDOUT is passed instead
    """
    # read list of marker genes
    mgFiles = forEachLine(self.markerGeneListFile, _MgFiles(self.markerGeneListFileDir))

    # translate DNA to protein sequences
    fastaBaseName = os.path.basename(fastaFileDNA)
    fastaFileProt = os.path.join(self.markerGeneWorkingDir, str(fastaBaseName + '.PROT'))
    dnaToProt(fastaFileDNA, fastaFileProt)

    # read DNA fasta file
    try:
        handle = open(fastaFileDNA, "rU")
        try:
            dnaSeqDict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
        finally:
            # do not leak the handle if the parsing fails
            handle.close()
    except Exception:
        sys.stderr.write(str('Cannot read file: ' + str(fastaFileDNA)))
        raise

    # to output all predictions in one file
    outPredAllFileName = os.path.join(self.markerGeneWorkingDir, str(fastaBaseName + '_all.mP'))
    outAllBuffer = OutFileBuffer(outPredAllFileName)

    logFile = None
    try:
        # run HMM search
        mgList = mgFiles.getGeneNameList()

        if outLog is not None:
            logFile = open(outLog, 'w')
            stdoutLog = logFile
        else:
            # NOTE(review): subprocess.STDOUT is documented only as a value for "stderr"; whether
            # parallel.TaskCmd accepts it as "stdout" cannot be verified from here - confirm.
            stdoutLog = subprocess.STDOUT

        # for each gene perform the analysis separately
        for geneName in mgList:

            # only the primary HMM profile is used
            domFile = os.path.join(self.markerGeneWorkingDir, str(geneName + '_1.dom'))
            hmmOutFile = os.path.join(self.markerGeneWorkingDir, str(geneName + '_1.out'))
            hmmFile = mgFiles.getFilePath(geneName, 'hmmPROTPrim')

            # define cmd
            if hmmFile is not None:
                hmmCmd = str(os.path.join(self.hmmerBinDir, 'hmmsearch') + ' --domtblout ' + domFile + ' -E 0.01'
                             + self.processorsHmm + ' -o ' + hmmOutFile + ' ' + hmmFile + ' ' + fastaFileProt)
            else:
                hmmCmd = None

            # run cmd
            if hmmCmd is not None and os.name == 'posix':
                cwd = self.hmmInstallDir
                if parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(hmmCmd, cwd)])) is not None:
                    sys.exit(-1)
            else:
                print('Marker genes analysis, doesn`t run (no posix):  %s' % (hmmCmd,))

            # get regions that match to the HMM profile
            if hmmCmd is not None:
                entryDict1 = forEachLine(domFile, _MgRegions()).getEntryDict()
            else:
                entryDict1 = None
            if entryDict1 is None:
                # no HMM profile for this gene: there are no regions to extract
                entryDict1 = {}

            # extract regions found in the protein sequences that were found by the HMM and generate
            # corresponding DNA sequences
            regionDnaFasta = os.path.join(self.markerGeneWorkingDir, str(geneName + '_dna.gff'))
            outFileBuffer = OutFileBuffer(regionDnaFasta)

            for seqName in entryDict1:
                for entry in entryDict1[seqName]:
                    from1 = entry[0]
                    to1 = entry[1]
                    assert (from1 is not None) and (to1 is not None)

                    # extract regions from the DNA sequences (consider 3 ORF and reverse complements)

                    # name of the whole sequence
                    dnaSeqName = re.sub(r'([0-9]+_[0-9]+)_[pr]+[012]', r'\1', seqName)
                    # whole DNA sequence
                    dnaSeq = dnaSeqDict[dnaSeqName].seq

                    # reverse complement (contains "pr")
                    tagRev = 'p'
                    if re.match(r'[0-9]+_[0-9]+_pr[012]', seqName):
                        dnaSeq = dnaSeq.reverse_complement()
                        tagRev = 'pr'

                    # the digit after "p"/"pr" is the reading frame shift (0, 1 or 2)
                    frameMatch = re.match(r'[0-9]+_[0-9]+_[pr]+([012])', seqName)
                    if frameMatch is None:
                        # a region cannot be computed for such a name, skip it instead of failing later
                        sys.stderr.write('Wrong seq name: ' + seqName + ' \n')
                        continue
                    shift = int(frameMatch.group(1))

                    # protein coordinates -> DNA coordinates
                    tagFrom = ((from1 - 1) * 3) + shift
                    tagTo = (to1 * 3) + shift
                    tagRev += str(shift)
                    dnaSeq = dnaSeq[tagFrom:tagTo]

                    tag = str(str(tagFrom) + '_' + str(tagTo) + '_' + tagRev)
                    outFileBuffer.writeText(str('>' + dnaSeqName + '_' + tag + '\n' + str(dnaSeq) + '\n'))

            outFileBuffer.close()

            # if no marker gene found
            if outFileBuffer.isEmpty():
                continue

            # run mothur classify (bayesian? the same as for the 16S analysis)
            templateFile = mgFiles.getFilePath(geneName, 'templateDNA')
            taxonomyFile = mgFiles.getFilePath(geneName, 'taxonomyDNA')
            assert (templateFile is not None) and (taxonomyFile is not None)
            mothurCmd = str('' + self.mothur + ' "#classify.seqs(fasta=' + regionDnaFasta + ', template='
                            + templateFile + ', taxonomy=' + taxonomyFile + ', ' + self.mothurParam + ')"')

            if os.name == 'posix':
                print('Mothur processing: %s' % os.path.basename(templateFile).split('_', 1)[0])
                cwd = self.markerGeneWorkingDir
                mothurTask = parallel.TaskCmd(mothurCmd, cwd, stdout=stdoutLog)
                if parallel.reportFailedCmd(parallel.runCmdSerial([mothurTask])) is not None:
                    sys.exit(-1)
            else:
                print('Cannot run mothur since your system is not "posix" but "%s" \n%s' % (os.name, mothurCmd))

            # transform the mothur output to a simple output (name, ncbid, weight)
            mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile)
            if not os.path.isfile(mothurPredFileName):
                # try the alternative name of the mothur output file
                mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile,
                                                                    suffix='.bayesian.taxonomy')
                if not os.path.isfile(mothurPredFileName):
                    print("Can't open file: %s" % mothurPredFileName)

            outPredFileName = os.path.join(self.markerGeneWorkingDir,
                                           str(fastaBaseName + '_' + geneName + '.mP'))
            outBuffer = OutFileBuffer(outPredFileName, bufferText=True)
            forEachLine(mothurPredFileName, _MothurOutFileParser(outBuffer, geneName))

            # predictions of individual genes are separated by a new line in the combined file
            if not outAllBuffer.isEmpty():
                outAllBuffer.writeText('\n')
            outAllBuffer.writeText(outBuffer.getTextBuffer())
    finally:
        # release the log file and the combined output also on errors and on sys.exit
        if logFile is not None:
            logFile.close()
        outAllBuffer.close()
def _init(self, align=True, dm=True, cluster=True):
    """
        Init data, compute: alignment, distance matrix, clusters.

        Does nothing if the initialization was already started (self._initDone is set before any work).

        Steps:
          1. collect the marker gene region files ("*.gff" in self._mgWorkingDir) and the 5S/16S/23S
             rRNA files derived from self._s16Prefix,
          2. for each marker gene write "<mg>.filter.fna" to self._clustDir, containing at most one
             region per sequence,
          3. optionally run the aligner, mothur "dist.seqs" and mothur "cluster" for each marker gene,
          4. read the sequence predictions, the similarity thresholds, the distance matrices and the
             clusters into attributes of this object.

        @param align: if True, run the configured aligner on each filtered fasta file
        @param dm: if True, run mothur "dist.seqs" on each alignment
        @param cluster: if True, run mothur "cluster" on each distance file
    """
    if self._initDone:
        return
    self._initDone = True

    self._mgList = []  # list of names of marker genes
    mgToFastaPath = {}  # marker gene name -> fasta file path

    # collect regions from Amphora mg (fasta files containing regions of particular marker genes)
    for path in glob.glob(os.path.join(os.path.normpath(self._mgWorkingDir), '*.gff')):
        name = re.sub(r'([^\.]+)\..*$', r'\1', os.path.basename(path))
        mg = re.sub(r'([^_]+)_dna', r'\1', name)
        self._mgList.append(mg)
        mgToFastaPath[mg] = path

    # add 16S
    for mg in ['5S_rRNA', '16S_rRNA', '23S_rRNA']:
        mgToFastaPath[mg] = str(self._s16Prefix + '.' + mg + '.fna')
        self._mgList.append(mg)

    # For each marker gene create filtered fasta file that contains for each mg and sequence
    # at most one region.
    mgToFilteredFastaPath = {}
    mgToSeqNameToTaxPathDict = {}  # mg -> seqName (~region name) -> pred
    for mg in self._mgList:
        mgToSeqNameToTaxPathDict[mg] = {}

    for seq in self._sequences.sequences:
        namePrefix = str(str(seq.scaffold.id) + '_' + str(seq.id))
        for mg, tag, pred in zip(seq.getCandidateTaxPathSourceList(),
                                 seq.getCandidateTaxPathTagList(),
                                 seq.getCandidateTaxPathDictList()):
            mgToSeqNameToTaxPathDict[mg][str(namePrefix + '_' + tag)] = pred

    # for each marker gene: choose only one sequence region for each mg and sequence
    for mg in self._mgList:
        seqNameToPred = mgToSeqNameToTaxPathDict[mg]  # sequence region predictions for this mg
        seqNameToSeq = fastaFileToDict(mgToFastaPath[mg])  # read the fasta file
        outPath = os.path.normpath(os.path.join(self._clustDir, str(mg + '.filter.fna')))
        mgToFilteredFastaPath[mg] = outPath
        out = OutFileBuffer(outPath)
        try:
            # sequence base (scaffId_seqId) -> names of its regions
            seqBaseToSeqName = {}
            for seqName in seqNameToSeq:
                seqBase = re.sub(r'^([0-9]+_[0-9]+)[^0-9].*', r'\1', seqName)
                seqBaseToSeqName.setdefault(seqBase, []).append(seqName)

            for seqBase in seqBaseToSeqName:
                seqId = int(re.sub(r'^[0-9]+_([0-9]+)', r'\1', seqBase))
                seqBaseTaxPathDict = self._sequences.getSequence(seqId).getTaxonomyPath()

                # regions for which a prediction is available
                candidateSeq = [s for s in seqBaseToSeqName[seqBase] if seqNameToPred.get(s) is not None]
                if not candidateSeq:
                    continue

                # regions predicted at least at the same rank as the whole sequence
                candidateSeq2 = [s for s in candidateSeq
                                 if (seqBaseTaxPathDict is None)
                                 or (len(seqNameToPred[s]) >= len(seqBaseTaxPathDict))]

                if candidateSeq2:
                    pool = candidateSeq2
                else:
                    # all sequence regions are predicted higher than the sequence:
                    # get all sequence regions with the most specific prediction
                    maxDepth = max([len(seqNameToPred[s]) for s in candidateSeq])
                    pool = [s for s in candidateSeq if len(seqNameToPred[s]) == maxDepth]

                # take the longest sequence (max returns the first one in case of a tie)
                sMax = max(pool, key=lambda s: len(seqNameToSeq[s]))

                out.writeText(str('>' + str(sMax) + '\n' + str(seqNameToSeq[sMax]) + '\n'))
        finally:
            # close the filtered fasta file even if the selection fails
            out.close()

    mgToAlignPath = {}
    for mg in self._mgList:
        mgToAlignPath[mg] = os.path.normpath(os.path.join(self._clustDir, str(mg + '.align.fna')))

    # build alignment
    if align:
        for mg in self._mgList:
            alignCmd = str(self._config.get('aligner') + ' -in ' + mgToFilteredFastaPath[mg]
                           + ' -out ' + mgToAlignPath[mg] + ' -quiet')
            assert os.name == 'posix'
            predictProc = subprocess.Popen(alignCmd, cwd=self._mgWorkingDir, shell=True, bufsize=-1)
            predictProc.wait()
            print('Muscle return code for %s : %s' % (mg, predictProc.returncode))
            if predictProc.returncode != 0:
                # a failed alignment is only reported, the processing continues
                sys.stderr.write(str(alignCmd + ' \n'))

    # compute DM
    if dm:
        mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
        for mg in self._mgList:
            mothurCmd = str('time ' + mothur + ' "#dist.seqs(fasta=' + mgToAlignPath[mg]
                            + ', processors=2, countends=F, calc=nogaps, cutoff=0.3, output=lt)"')
            assert os.name == 'posix'
            mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
            mothurProc.wait()
            print('Mothur return code dist: %s %s' % (mg, mothurProc.returncode))

    # cluster
    if cluster:
        mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
        for mg in self._mgList:
            distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
            mothurCmd = str('time ' + mothur + ' "#cluster(phylip=' + distFilePath
                            + ', method=furthest, hard=t, precision=1000)"')
            assert os.name == 'posix'
            mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
            mothurProc.wait()
            print('Mothur return code cluster: %s %s' % (mg, mothurProc.returncode))

    # read DM and clusters

    # sequence predictions
    self._seqIdToTaxPathDict = {}
    self._seqIdToWeight = {}
    for seq in self._sequences.sequences:
        seqId = int(seq.id)
        self._seqIdToTaxPathDict[seqId] = seq.getTaxonomyPath()
        self._seqIdToWeight[seqId] = seq.getTaxonomyPathWeight()

    # similarity thresholds (first column: marker gene name, second column: threshold)
    thresholds = self._configMG.get('mgSimilarityThresholds')
    self._mgToMaxThreshold = {}
    tmpDict = getMapping(thresholds, 0, 1, sep='\t', comment='#')
    for k in tmpDict:
        self._mgToMaxThreshold[k] = float(tmpDict[k][0])

    self._mgToDM = {}
    self._mgToCluster = {}
    for mg in self._mgList:
        alignDir = os.path.dirname(mgToAlignPath[mg])
        distFilePath = os.path.join(alignDir, str(mg + '.align.phylip.dist'))
        self._mgToDM[mg] = forEachLine(distFilePath, DM())
        clusterFilePath = os.path.join(alignDir, str(mg + '.align.phylip.fn.list'))
        self._mgToCluster[mg] = forEachLine(clusterFilePath,
                                            MCluster(self._seqIdToTaxPathDict, self._mgToMaxThreshold[mg]))
def _init(self, align=True, dm=True, cluster=True):
    """
        Init data, compute: alignment, distance matrix, clusters.

        Does nothing if the initialization was already started (self._initDone is set before any work).

        Steps:
          1. collect the marker gene region files ("*.gff" in self._mgWorkingDir) and the 5S/16S/23S
             rRNA files derived from self._s16Prefix,
          2. for each marker gene write "<mg>.filter.fna" to self._clustDir, containing at most one
             region per sequence,
          3. optionally run the aligner, mothur "dist.seqs" and mothur "cluster" for each marker gene,
          4. read the sequence predictions, the similarity thresholds, the distance matrices and the
             clusters into attributes of this object.

        @param align: if True, run the configured aligner on each filtered fasta file
        @param dm: if True, run mothur "dist.seqs" on each alignment
        @param cluster: if True, run mothur "cluster" on each distance file
    """
    if self._initDone:
        return
    self._initDone = True

    self._mgList = []  # list of names of marker genes
    mgToFastaPath = {}  # marker gene name -> fasta file path

    # collect regions from Amphora mg (fasta files containing regions of particular marker genes)
    for path in glob.glob(os.path.join(os.path.normpath(self._mgWorkingDir), '*.gff')):
        name = re.sub(r'([^\.]+)\..*$', r'\1', os.path.basename(path))
        mg = re.sub(r'([^_]+)_dna', r'\1', name)
        self._mgList.append(mg)
        mgToFastaPath[mg] = path

    # add 16S
    for mg in ['5S_rRNA', '16S_rRNA', '23S_rRNA']:
        mgToFastaPath[mg] = str(self._s16Prefix + '.' + mg + '.fna')
        self._mgList.append(mg)

    # For each marker gene create filtered fasta file that contains for each mg and sequence
    # at most one region.
    mgToFilteredFastaPath = {}
    mgToSeqNameToTaxPathDict = {}  # mg -> seqName (~region name) -> pred
    for mg in self._mgList:
        mgToSeqNameToTaxPathDict[mg] = {}

    for seq in self._sequences.sequences:
        namePrefix = str(str(seq.scaffold.id) + '_' + str(seq.id))
        for mg, tag, pred in zip(seq.getCandidateTaxPathSourceList(),
                                 seq.getCandidateTaxPathTagList(),
                                 seq.getCandidateTaxPathDictList()):
            mgToSeqNameToTaxPathDict[mg][str(namePrefix + '_' + tag)] = pred

    # for each marker gene: choose only one sequence region for each mg and sequence
    for mg in self._mgList:
        seqNameToPred = mgToSeqNameToTaxPathDict[mg]  # sequence region predictions for this mg
        seqNameToSeq = fastaFileToDict(mgToFastaPath[mg])  # read the fasta file
        outPath = os.path.normpath(os.path.join(self._clustDir, str(mg + '.filter.fna')))
        mgToFilteredFastaPath[mg] = outPath
        out = OutFileBuffer(outPath)
        try:
            # sequence base (scaffId_seqId) -> names of its regions
            seqBaseToSeqName = {}
            for seqName in seqNameToSeq:
                seqBase = re.sub(r'^([0-9]+_[0-9]+)[^0-9].*', r'\1', seqName)
                seqBaseToSeqName.setdefault(seqBase, []).append(seqName)

            for seqBase in seqBaseToSeqName:
                seqId = int(re.sub(r'^[0-9]+_([0-9]+)', r'\1', seqBase))
                seqBaseTaxPathDict = self._sequences.getSequence(seqId).getTaxonomyPath()

                # regions for which a prediction is available
                candidateSeq = [s for s in seqBaseToSeqName[seqBase] if seqNameToPred.get(s) is not None]
                if not candidateSeq:
                    continue

                # regions predicted at least at the same rank as the whole sequence
                candidateSeq2 = [s for s in candidateSeq
                                 if (seqBaseTaxPathDict is None)
                                 or (len(seqNameToPred[s]) >= len(seqBaseTaxPathDict))]

                if candidateSeq2:
                    pool = candidateSeq2
                else:
                    # all sequence regions are predicted higher than the sequence:
                    # get all sequence regions with the most specific prediction
                    maxDepth = max([len(seqNameToPred[s]) for s in candidateSeq])
                    pool = [s for s in candidateSeq if len(seqNameToPred[s]) == maxDepth]

                # take the longest sequence (max returns the first one in case of a tie)
                sMax = max(pool, key=lambda s: len(seqNameToSeq[s]))

                out.writeText(str('>' + str(sMax) + '\n' + str(seqNameToSeq[sMax]) + '\n'))
        finally:
            # close the filtered fasta file even if the selection fails
            out.close()

    mgToAlignPath = {}
    for mg in self._mgList:
        mgToAlignPath[mg] = os.path.normpath(os.path.join(self._clustDir, str(mg + '.align.fna')))

    # build alignment
    if align:
        for mg in self._mgList:
            alignCmd = str(self._config.get('aligner') + ' -in ' + mgToFilteredFastaPath[mg]
                           + ' -out ' + mgToAlignPath[mg] + ' -quiet')
            assert os.name == 'posix'
            predictProc = subprocess.Popen(alignCmd, cwd=self._mgWorkingDir, shell=True, bufsize=-1)
            predictProc.wait()
            print('Muscle return code for %s : %s' % (mg, predictProc.returncode))
            if predictProc.returncode != 0:
                # a failed alignment is only reported, the processing continues
                sys.stderr.write(str(alignCmd + ' \n'))

    # compute DM
    if dm:
        mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
        for mg in self._mgList:
            mothurCmd = str('time ' + mothur + ' "#dist.seqs(fasta=' + mgToAlignPath[mg]
                            + ', processors=2, countends=F, calc=nogaps, cutoff=0.3, output=lt)"')
            assert os.name == 'posix'
            mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
            mothurProc.wait()
            print('Mothur return code dist: %s %s' % (mg, mothurProc.returncode))

    # cluster
    if cluster:
        mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
        for mg in self._mgList:
            distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
            mothurCmd = str('time ' + mothur + ' "#cluster(phylip=' + distFilePath
                            + ', method=furthest, hard=t, precision=1000)"')
            assert os.name == 'posix'
            mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
            mothurProc.wait()
            print('Mothur return code cluster: %s %s' % (mg, mothurProc.returncode))

    # read DM and clusters

    # sequence predictions
    self._seqIdToTaxPathDict = {}
    self._seqIdToWeight = {}
    for seq in self._sequences.sequences:
        seqId = int(seq.id)
        self._seqIdToTaxPathDict[seqId] = seq.getTaxonomyPath()
        self._seqIdToWeight[seqId] = seq.getTaxonomyPathWeight()

    # similarity thresholds (first column: marker gene name, second column: threshold)
    thresholds = self._configMG.get('mgSimilarityThresholds')
    self._mgToMaxThreshold = {}
    tmpDict = getMapping(thresholds, 0, 1, sep='\t', comment='#')
    for k in tmpDict:
        self._mgToMaxThreshold[k] = float(tmpDict[k][0])

    self._mgToDM = {}
    self._mgToCluster = {}
    for mg in self._mgList:
        alignDir = os.path.dirname(mgToAlignPath[mg])
        distFilePath = os.path.join(alignDir, str(mg + '.align.phylip.dist'))
        self._mgToDM[mg] = forEachLine(distFilePath, DM())
        clusterFilePath = os.path.join(alignDir, str(mg + '.align.phylip.fn.list'))
        self._mgToCluster[mg] = forEachLine(clusterFilePath,
                                            MCluster(self._seqIdToTaxPathDict, self._mgToMaxThreshold[mg]))
def runMarkerGeneAnalysis(self, fastaFileDNA, outLog=None):
    """
        Run hmmer HMM and mothur classify (bayesian), same param as for the 16S analysis.

        For each gene name returned by the marker gene list file:
          1. run "hmmsearch" with the primary and the secondary protein HMM profile (if available)
             against the translated sequences,
          2. cut the regions matched by the primary profile out of the DNA sequences (considering the
             reading frame and the reverse complement encoded in the protein sequence name) and write
             them as fasta records to "<gene>_dna.gff" in the working directory,
          3. run mothur "classify.seqs" on these regions,
          4. parse the mothur output into "<fasta basename>_<gene>.mP" and append the parsed text to
             "<fasta basename>_all.mP".

        @param fastaFileDNA: path of the DNA fasta file to be analysed
        @param outLog: path of a log file that receives the standard output of the external commands;
            if None, the commands inherit the standard output of this process
        @raise Exception: if hmmsearch or mothur returns a non-zero status
    """
    # read list of marker genes
    mgFiles = forEachLine(self.markerGeneListFile, _MgFiles(self.markerGeneListFileDir))

    # translate DNA to protein sequences
    fastaBaseName = os.path.basename(fastaFileDNA)
    fastaFileProt = os.path.join(self.markerGeneWorkingDir, str(fastaBaseName + '.PROT'))
    dnaToProt(fastaFileDNA, fastaFileProt)

    # read DNA fasta file
    try:
        handle = open(fastaFileDNA, "rU")
        try:
            dnaSeqDict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
        finally:
            # do not leak the handle if the parsing fails
            handle.close()
    except Exception:
        sys.stderr.write(str('Cannot read file: ' + str(fastaFileDNA)))
        raise

    # to output all predictions in one file
    outPredAllFileName = os.path.join(self.markerGeneWorkingDir, str(fastaBaseName + '_all.mP'))
    outAllBuffer = OutFileBuffer(outPredAllFileName)

    # subprocess.STDOUT is only valid as a value of "stderr"; None lets the child processes
    # inherit the standard output of this process when no log file is requested
    stdoutLog = None
    try:
        # run HMM search
        mgList = mgFiles.getGeneNameList()

        if outLog is not None:
            stdoutLog = open(outLog, 'w')

        # for each gene perform the analysis separately
        for geneName in mgList:

            # index 0: primary HMM profile, index 1: secondary HMM profile
            domFileArray = [os.path.join(self.markerGeneWorkingDir, str(geneName + '_1.dom')),
                            os.path.join(self.markerGeneWorkingDir, str(geneName + '_2.dom'))]
            outFileArray = [os.path.join(self.markerGeneWorkingDir, str(geneName + '_1.out')),
                            os.path.join(self.markerGeneWorkingDir, str(geneName + '_2.out'))]
            hmmFileArray = [mgFiles.getFilePath(geneName, 'hmmPROTPrim'),
                            mgFiles.getFilePath(geneName, 'hmmPROTSec')]

            # define cmd
            cmdArray = []
            for domFile, hmmOutFile, hmmFile in zip(domFileArray, outFileArray, hmmFileArray):
                if hmmFile is not None:
                    cmdArray.append(str(os.path.join(self.hmmerBinDir, 'hmmsearch') + ' --domtblout ' + domFile
                                        + ' -E 0.01' + ' -o ' + hmmOutFile + ' ' + hmmFile + ' ' + fastaFileProt))
                else:
                    cmdArray.append(None)

            # run cmd
            for cmd in cmdArray:
                if cmd is not None and os.name == 'posix':
                    hmmProc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=self.hmmInstallDir,
                                               stdout=stdoutLog)
                    print('run cmd: %s' % (cmd,))
                    hmmProc.wait()
                    print('HMM return code: %s' % (hmmProc.returncode,))
                    if hmmProc.returncode != 0:
                        raise Exception("Command returned with non-zero %s status: %s" % (hmmProc.returncode, cmd))
                else:
                    print('Marker genes analysis, doesn`t run (no posix):  %s' % (cmd,))

            # get regions that match to the HMM profiles
            entryDictList = []
            for cmd, domFile in zip(cmdArray, domFileArray):
                if cmd is not None:
                    entryDictList.append(forEachLine(domFile, _MgRegions()).getEntryDict())
                else:
                    entryDictList.append(None)

            # only the regions found by the primary profile are extracted
            entryDict1 = entryDictList[0]
            if entryDict1 is None:
                # no primary HMM profile for this gene: there are no regions to extract
                entryDict1 = {}

            # extract regions found in the protein sequences that were found by the HMM and generate
            # corresponding DNA sequences
            regionDnaFasta = os.path.join(self.markerGeneWorkingDir, str(geneName + '_dna.gff'))
            outFileBuffer = OutFileBuffer(regionDnaFasta)

            for seqName in entryDict1:
                for entry in entryDict1[seqName]:
                    from1 = entry[0]
                    to1 = entry[1]
                    assert (from1 is not None) and (to1 is not None)

                    # extract regions from the DNA sequences (consider 3 ORF and reverse complements)

                    # name of the whole sequence
                    dnaSeqName = re.sub(r'([0-9]+_[0-9]+)_[pr]+[012]', r'\1', seqName)
                    # whole DNA sequence
                    dnaSeq = dnaSeqDict[dnaSeqName].seq

                    # reverse complement (contains "pr")
                    tagRev = 'p'
                    if re.match(r'[0-9]+_[0-9]+_pr[012]', seqName):
                        dnaSeq = dnaSeq.reverse_complement()
                        tagRev = 'pr'

                    # the digit after "p"/"pr" is the reading frame shift (0, 1 or 2)
                    frameMatch = re.match(r'[0-9]+_[0-9]+_[pr]+([012])', seqName)
                    if frameMatch is None:
                        # a region cannot be computed for such a name, skip it instead of failing later
                        sys.stderr.write('Wrong seq name: ' + seqName + ' \n')
                        continue
                    shift = int(frameMatch.group(1))

                    # protein coordinates -> DNA coordinates
                    tagFrom = ((from1 - 1) * 3) + shift
                    tagTo = (to1 * 3) + shift
                    tagRev += str(shift)
                    dnaSeq = dnaSeq[tagFrom:tagTo]

                    tag = str(str(tagFrom) + '_' + str(tagTo) + '_' + tagRev)
                    outFileBuffer.writeText(str('>' + dnaSeqName + '_' + tag + '\n' + str(dnaSeq) + '\n'))

            outFileBuffer.close()

            # if no marker gene found
            if outFileBuffer.isEmpty():
                continue

            # run mothur classify (bayesian? the same as for the 16S analysis)
            templateFile = mgFiles.getFilePath(geneName, 'templateDNA')
            taxonomyFile = mgFiles.getFilePath(geneName, 'taxonomyDNA')
            assert (templateFile is not None) and (taxonomyFile is not None)
            mothurCmd = str('time ' + self.mothur + ' "#classify.seqs(fasta=' + regionDnaFasta + ', template='
                            + templateFile + ', taxonomy=' + taxonomyFile + ', ' + self.mothurParam + ')"')

            if os.name == 'posix':
                mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self.markerGeneWorkingDir,
                                              stdout=stdoutLog)
                print('run cmd: %s' % (mothurCmd,))
                mothurProc.wait()
                print('mothur return code: %s' % (mothurProc.returncode,))
                if mothurProc.returncode != 0:
                    raise Exception("Command returned with non-zero %s status: %s"
                                    % (mothurProc.returncode, mothurCmd))
            else:
                print('Cannot run mothur since your system is not "posix" but "%s" \n%s' % (os.name, mothurCmd))

            # transform the mothur output to a simple output (name, ncbid, weight)
            mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile)
            if not os.path.isfile(mothurPredFileName):
                # try the alternative name of the mothur output file
                mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile,
                                                                    suffix='.bayesian.taxonomy')
                if not os.path.isfile(mothurPredFileName):
                    print("Can't open file: %s" % mothurPredFileName)

            outPredFileName = os.path.join(self.markerGeneWorkingDir,
                                           str(fastaBaseName + '_' + geneName + '.mP'))
            outBuffer = OutFileBuffer(outPredFileName, bufferText=True)
            forEachLine(mothurPredFileName, _MothurOutFileParser(outBuffer, geneName))

            # predictions of individual genes are separated by a new line in the combined file
            if not outAllBuffer.isEmpty():
                outAllBuffer.writeText('\n')
            outAllBuffer.writeText(outBuffer.getTextBuffer())
    finally:
        # release the log file and the combined output also when a command fails
        if stdoutLog is not None:
            stdoutLog.close()
        outAllBuffer.close()