Ejemplo n.º 1
0
def ppOut2PPSout(inFile=None, outFile=None, dbFile=None):
    """
        Convert a PP output file to the PPS output format.

        Each line of the input file is passed to a PP2PPSoutParser that is given the NCBI taxonomy
        and the output buffer. Called without arguments, the function uses the original
        hard-coded paths (the HumanGut TS29 data set).

        @param inFile: path to the PP output file (default: the original hard-coded path)
        @param outFile: path of the file to be written (default: the original hard-coded path)
        @param dbFile: path to the NCBI taxonomy sqlite database (default: the original hard-coded path)
    """
    if inFile is None:
        inFile = '/Users/ivan/Documents/work/binning/data/HumanGut/PP/TS29_scaff.file.0.5.txt'
    if outFile is None:
        outFile = '/Users/ivan/Documents/work/binning/data/HumanGut/PP/TS29_scaff.file.0.5.PPS.txt'
    if dbFile is None:
        dbFile = '/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'  #DB

    taxonomy = taxonomy_ncbi.TaxonomyNcbi(dbFile)
    out = csv.OutFileBuffer(outFile)
    try:
        csv.forEachLine(inFile, PP2PPSoutParser(taxonomy, out))
    finally:
        #close the output buffer even if the parsing fails
        out.close()
Ejemplo n.º 2
0
def ppOut2PPSout():
    """
        Convert the PP output of the HumanGut TS29 data set to the PPS output format.

        The input file is processed line by line by a PP2PPSoutParser that is given the NCBI
        taxonomy and the output buffer. All paths are hard-coded.
    """
    inFile = '/Users/ivan/Documents/work/binning/data/HumanGut/PP/TS29_scaff.file.0.5.txt'
    outFile = '/Users/ivan/Documents/work/binning/data/HumanGut/PP/TS29_scaff.file.0.5.PPS.txt'
    dbFile = '/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'  #DB
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(dbFile)

    out = csv.OutFileBuffer(outFile)
    try:
        csv.forEachLine(inFile, PP2PPSoutParser(taxonomy, out))
    finally:
        #do not leave the output buffer open if the parsing raises an exception
        out.close()
Ejemplo n.º 3
0
 def setCandidatePlacement(self, sequences, taxonomy, fastaFileDNA):
     """
         Set the candidate placements according to the marker gene analysis.

         Processes the file "<basename of fastaFileDNA>_all.mP" located in self.markerGeneWorkingDir
         line by line using a _SetCandidatePlacement parser.

         @param sequences: passed to the _SetCandidatePlacement parser
         @param taxonomy: passed to the _SetCandidatePlacement parser
         @param fastaFileDNA: path of the DNA fasta file, only its base name is used
         @return: the value of getAssignedSeqCount() of the object returned by forEachLine
     """
     workingDir = self.markerGeneWorkingDir
     predAllBaseName = str(os.path.basename(fastaFileDNA) + '_all.mP')
     predAllPath = os.path.join(workingDir, predAllBaseName)
     placement = forEachLine(predAllPath, _SetCandidatePlacement(sequences, taxonomy))
     return placement.getAssignedSeqCount()
Ejemplo n.º 4
0
 def setCandidatePlacement(self, sequences, taxonomy, fastaFileDNA):
     """
         Use the results of the marker gene analysis to set the candidate placements.

         The predictions are read from "<basename of fastaFileDNA>_all.mP" in self.markerGeneWorkingDir,
         each line is handled by a _SetCandidatePlacement(sequences, taxonomy) parser.

         @return: the assigned sequence count reported by the parser (getAssignedSeqCount())
     """
     parser = _SetCandidatePlacement(sequences, taxonomy)
     return forEachLine(
         os.path.join(self.markerGeneWorkingDir, str(os.path.basename(fastaFileDNA) + '_all.mP')),
         parser).getAssignedSeqCount()
Ejemplo n.º 5
0
    def runMarkerGeneAnalysis(self, fastaFileDNA, outLog=None):
        """
            Run hmmer HMM and mothur classify (bayesian), same param as for the 16S analysis.

            Steps:
              1. read the list of marker genes and translate the DNA sequences to protein sequences
              2. for each marker gene run hmmsearch with its primary HMM profile ('hmmPROTPrim')
              3. cut the matched regions out of the DNA sequences and write them (fasta formatted)
                 to "<gene>_dna.gff" in the working directory
              4. classify the regions using mothur (classify.seqs)
              5. write the predictions of each gene to "<fasta>_<gene>.mP" and append them
                 to "<fasta>_all.mP"

            A region whose sequence name does not encode a reading frame is reported to stderr and skipped.
            A gene without a primary HMM profile yields no regions and is skipped.
            The process is terminated via sys.exit(-1) if an external command is reported as failed.
            An exception raised while reading the DNA fasta file is re-raised.

            @param fastaFileDNA: path to a fasta file containing DNA sequences
            @param outLog: path of a file that is passed as stdout to the mothur task (optional)
        """
        #read list of marker genes
        mgFiles = forEachLine(self.markerGeneListFile, _MgFiles(self.markerGeneListFileDir))

        #translate DNA to protein sequences
        fastaFileProt = os.path.join(self.markerGeneWorkingDir, str(os.path.basename(fastaFileDNA) + '.PROT'))
        dnaToProt(fastaFileDNA, fastaFileProt)

        #read DNA fasta file (the handle is closed also if the parsing fails)
        try:
            with open(fastaFileDNA, "rU") as handle:
                dnaSeqDict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
        except Exception:
            sys.stderr.write(str('Cannot read file: ' + str(fastaFileDNA)))
            raise

        #to output all predictions in one file
        outPredAllFileName = os.path.join(self.markerGeneWorkingDir,
                                          str(os.path.basename(fastaFileDNA) + '_all.mP'))
        outAllBuffer = OutFileBuffer(outPredAllFileName)

        logFile = None
        try:
            mgList = mgFiles.getGeneNameList()

            if outLog is not None:
                logFile = open(outLog, 'w')
                stdoutLog = logFile
            else:
                # NOTE(review): subprocess.STDOUT is documented only as a value for "stderr";
                # confirm how parallel.TaskCmd handles it as "stdout" (None would mean "inherit").
                stdoutLog = subprocess.STDOUT

            #for each gene perform the analysis separately
            for geneName in mgList:

                #only the primary HMM profile is used (the lists allow to add further profiles)
                domFileArray = [os.path.join(self.markerGeneWorkingDir, str(geneName + '_1.dom'))]
                outFileArray = [os.path.join(self.markerGeneWorkingDir, str(geneName + '_1.out'))]
                hmmFileArray = [mgFiles.getFilePath(geneName, 'hmmPROTPrim')]

                #define cmd (None if there is no HMM profile)
                cmdArray = []
                for domFile, outFile, hmmFile in zip(domFileArray, outFileArray, hmmFileArray):
                    if hmmFile is not None:
                        cmdArray.append(str(os.path.join(self.hmmerBinDir, 'hmmsearch') + ' --domtblout ' + domFile
                                            + ' -E 0.01' + self.processorsHmm
                                            + ' -o ' + outFile + ' ' + hmmFile + ' ' + fastaFileProt))
                    else:
                        cmdArray.append(None)

                #run cmd
                for cmd in cmdArray:
                    if cmd is not None and os.name == 'posix':
                        cwd = self.hmmInstallDir
                        if parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd)])) is not None:
                            sys.exit(-1)
                    else:
                        print 'Marker genes analysis, doesn`t run (no posix): ', cmd

                #get regions that match to the HMM profile
                entryDictList = []
                for cmd, domFile in zip(cmdArray, domFileArray):
                    if cmd is not None:
                        entryDictList.append(forEachLine(domFile, _MgRegions()).getEntryDict())
                    else:
                        entryDictList.append(None)

                #no HMM profile -> no regions (iterating over None would raise a TypeError)
                entryDict1 = entryDictList[0]
                if entryDict1 is None:
                    entryDict1 = {}

                #extract regions found in the protein sequences that were found by the HMM
                #and generate corresponding DNA sequences
                regionDnaFasta = os.path.join(self.markerGeneWorkingDir, str(geneName + '_dna.gff'))
                outFileBuffer = OutFileBuffer(regionDnaFasta)

                for seqName in entryDict1:
                    for region in entryDict1[seqName]:
                        from1 = region[0]
                        to1 = region[1]
                        assert (from1 is not None) and (to1 is not None)

                        #the reading frame (0, 1, 2) is the digit that follows the "p"/"pr" tag
                        frameMatch = re.match(r'[0-9]+_[0-9]+_[pr]+([012])', seqName)
                        if frameMatch is None:
                            #there is nothing to cut out, skip the region instead of writing None
                            sys.stderr.write('Wrong seq name: ' + seqName + ' \n')
                            continue
                        frame = int(frameMatch.group(1))

                        #name of the whole sequence
                        dnaSeqName = re.sub(r'([0-9]+_[0-9]+)_[pr]+[012]', r'\1', seqName)
                        #whole DNA sequence
                        dnaSeq = dnaSeqDict[dnaSeqName].seq

                        #reverse complement (contains "pr")
                        tagRev = 'p'
                        if re.match(r'[0-9]+_[0-9]+_pr[012]', seqName):
                            dnaSeq = dnaSeq.reverse_complement()
                            tagRev = 'pr'

                        #protein positions -> DNA positions shifted by the reading frame
                        #(presumably from1 and to1 are 1-based protein coordinates - TODO confirm)
                        tagFrom = ((from1 - 1)*3) + frame
                        tagTo = (to1*3) + frame
                        tagRev += str(frame)
                        dnaSeq = dnaSeq[tagFrom:tagTo]

                        tag = str(str(tagFrom) + '_' + str(tagTo) + '_' + tagRev)
                        outFileBuffer.writeText(str('>' + dnaSeqName + '_' + tag + '\n' + dnaSeq + '\n'))

                outFileBuffer.close()

                #if no marker gene found
                if outFileBuffer.isEmpty():
                    continue

                #run mothur classify (bayesian? the same as for the 16S analysis)
                templateFile = mgFiles.getFilePath(geneName, 'templateDNA')
                taxonomyFile = mgFiles.getFilePath(geneName, 'taxonomyDNA')
                assert ((templateFile is not None) and (taxonomyFile is not None))
                cmd = str('' + self.mothur + ' "#classify.seqs(fasta=' + regionDnaFasta + ', template=' + templateFile
                          + ', taxonomy=' + taxonomyFile + ', ' + self.mothurParam + ')"')
                if os.name == 'posix':

                    print('Mothur processing: %s' % os.path.basename(templateFile).split('_', 1)[0])

                    cwd = self.markerGeneWorkingDir

                    if parallel.reportFailedCmd(
                            parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd, stdout=stdoutLog)])) is not None:
                        sys.exit(-1)
                else:
                    print 'Cannot run mothur since your system is not "posix" but', str('"' + os.name + '"'), '\n', cmd

                #transform the mothur output to a simple output (name, ncbid, weight)
                #try the default output file name first, then the one with the '.bayesian.taxonomy' suffix
                mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile)
                if not os.path.isfile(mothurPredFileName):
                    mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile,
                                                                        suffix='.bayesian.taxonomy')
                    if not os.path.isfile(mothurPredFileName):
                        print("Can't open file: %s" % mothurPredFileName)

                outPredFileName = os.path.join(self.markerGeneWorkingDir,
                                               str(os.path.basename(fastaFileDNA) + '_' + geneName + '.mP'))
                outBuffer = OutFileBuffer(outPredFileName, bufferText=True)
                forEachLine(mothurPredFileName, _MothurOutFileParser(outBuffer, geneName))

                #predictions of different genes are separated by a newline
                if not outAllBuffer.isEmpty():
                    outAllBuffer.writeText('\n')
                outAllBuffer.writeText(outBuffer.getTextBuffer())
        finally:
            #release the log file and the output buffer also on an error or sys.exit
            if logFile is not None:
                logFile.close()
            outAllBuffer.close()
Ejemplo n.º 6
0
    def _init(self, align=True, dm=True, cluster=True):
        """
            Init data, compute: alignment, distance matrix, clusters.

            The method does nothing if it has already been called (self._initDone).

            For each marker gene (the "*.gff" region files found in self._mgWorkingDir and the
            5S, 16S, 23S rRNA genes) a filtered fasta file "<mg>.filter.fna" is written to
            self._clustDir that contains at most one region per sequence. Then, optionally, the
            external tools are run and finally the distance matrices and clusters are read.

            Sets: self._mgList, self._seqIdToTaxPathDict, self._seqIdToWeight, self._mgToMaxThreshold,
            self._mgToDM, self._mgToCluster

            @param align: run the aligner (config entry 'aligner') on the filtered fasta files
            @param dm: compute the distance matrices using mothur (dist.seqs)
            @param cluster: compute the clusters using mothur (cluster)
        """
        if self._initDone:
            return
        self._initDone = True

        self._mgList = []  # list of names of marker genes
        mgToFastaPath = {}  # marker gene name -> fasta file path

        #collect regions from Amphora mg
        for path in glob.glob(os.path.join(os.path.normpath(self._mgWorkingDir), '*.gff')):
            name = re.sub(r'([^\.]+)\..*$', r'\1', os.path.basename(path))  # strip the file extension
            mg = re.sub(r'([^_]+)_dna', r'\1', name)  # strip the "_dna" suffix
            self._mgList.append(mg)
            mgToFastaPath[mg] = path

        #add 16S
        for mg in ['5S_rRNA', '16S_rRNA', '23S_rRNA']:
            mgToFastaPath[mg] = str(self._s16Prefix + '.' + mg + '.fna')
            self._mgList.append(mg)

        #For each marker gene create filtered fasta file that contains for each mg and sequence at most one region.
        mgToFilteredFastaPath = {}
        mgToSeqNameToTaxPathDict = {}  # mg -> seqName (~region name) -> pred
        for mg in self._mgList:
            mgToSeqNameToTaxPathDict[mg] = {}

        for seq in self._sequences.sequences:
            seqBaseName = str(str(seq.scaffold.id) + '_' + str(seq.id))
            for mg, tag, pred in zip(seq.getCandidateTaxPathSourceList(), seq.getCandidateTaxPathTagList(),
                                     seq.getCandidateTaxPathDictList()):
                mgToSeqNameToTaxPathDict[mg][str(seqBaseName + '_' + tag)] = pred

        #for each marker gene: choose only one sequence region for each mg and sequence
        #all sequences are predicted at least at superkingdom
        for mg in self._mgList:
            seqNameToPred = mgToSeqNameToTaxPathDict[mg]  # sequence region predictions for this mg
            seqNameToSeq = fastaFileToDict(mgToFastaPath[mg])  # read the fasta file
            outPath = os.path.normpath(os.path.join(self._clustDir, str(mg + '.filter.fna')))
            mgToFilteredFastaPath[mg] = outPath
            out = OutFileBuffer(outPath)

            seqBaseToSeqName = {}  # sequence base (scaffId_seqId) -> list of region names
            for seqName in seqNameToSeq:
                seqBase = re.sub(r'^([0-9]+_[0-9]+)[^0-9].*', r'\1', seqName)
                seqBaseToSeqName.setdefault(seqBase, []).append(seqName)

            for seqBase in seqBaseToSeqName:
                seqId = int(re.sub(r'^[0-9]+_([0-9]+)', r'\1', seqBase))
                seqBaseTaxPathDict = self._sequences.getSequence(seqId).getTaxonomyPath()

                #regions that have a prediction (predicted at least at rank superkingdom)
                candidateSeq = [s for s in seqBaseToSeqName[seqBase] if seqNameToPred.get(s) is not None]
                if not candidateSeq:
                    continue

                #regions predicted at least at the same rank as the whole sequence
                candidateSeq2 = [s for s in candidateSeq
                                 if (seqBaseTaxPathDict is None)
                                 or (len(seqNameToPred[s]) >= len(seqBaseTaxPathDict))]
                if candidateSeq2:
                    pool = candidateSeq2
                else:
                    #all regions are predicted higher than the sequence: take the most specific predictions
                    maxPathLen = max(len(seqNameToPred[s]) for s in candidateSeq)
                    pool = [s for s in candidateSeq if len(seqNameToPred[s]) == maxPathLen]

                #take the longest region (max returns the first one if there are several of the same length)
                sMax = max(pool, key=lambda regionName: len(seqNameToSeq[regionName]))

                out.writeText(str('>' + str(sMax) + '\n' + str(seqNameToSeq[sMax]) + '\n'))

            out.close()

        mgToAlignPath = {}
        for mg in self._mgList:
            mgToAlignPath[mg] = os.path.normpath(os.path.join(self._clustDir, str(mg + '.align.fna')))

        #build alignment
        if align:
            for mg in self._mgList:
                alignCmd = str(self._config.get('aligner') + ' -in ' + mgToFilteredFastaPath[mg]
                               + ' -out ' + mgToAlignPath[mg] + ' -quiet')
                assert os.name == 'posix'
                predictProc = subprocess.Popen(alignCmd, cwd=self._mgWorkingDir, shell=True, bufsize=-1)
                predictProc.wait()
                print 'Muscle return code for', mg, ':', predictProc.returncode
                if predictProc.returncode != 0:
                    #a failed alignment is only reported, the processing continues
                    sys.stderr.write(str(alignCmd + ' \n'))

        #compute DM
        if dm:
            mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
            for mg in self._mgList:
                mothurCmd = str('time ' + mothur + ' "#dist.seqs(fasta=' + mgToAlignPath[mg]
                                + ', processors=2, countends=F, calc=nogaps, cutoff=0.3, output=lt)"')
                assert os.name == 'posix'
                mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
                mothurProc.wait()
                print 'Mothur return code dist:', mg, mothurProc.returncode

        #cluster
        if cluster:
            mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
            for mg in self._mgList:
                distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
                mothurCmd = str('time ' + mothur + ' "#cluster(phylip=' + distFilePath
                                + ', method=furthest, hard=t, precision=1000)"')
                assert os.name == 'posix'
                mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
                mothurProc.wait()
                print 'Mothur return code cluster:', mg, mothurProc.returncode

        #read DM and clusters

        #sequence predictions
        self._seqIdToTaxPathDict = {}
        self._seqIdToWeight = {}
        for seq in self._sequences.sequences:
            seqId = int(seq.id)
            self._seqIdToTaxPathDict[seqId] = seq.getTaxonomyPath()
            self._seqIdToWeight[seqId] = seq.getTaxonomyPathWeight()

        #similarity thresholds
        self._mgToMaxThreshold = {}
        tmpDict = getMapping(self._configMG.get('mgSimilarityThresholds'), 0, 1, sep='\t', comment='#')
        for k in tmpDict:
            self._mgToMaxThreshold[k] = float(tmpDict[k][0])

        self._mgToDM = {}
        self._mgToCluster = {}
        for mg in self._mgList:
            alignDir = os.path.dirname(mgToAlignPath[mg])
            distFilePath = os.path.join(alignDir, str(mg + '.align.phylip.dist'))
            self._mgToDM[mg] = forEachLine(distFilePath, DM())
            clusterFilePath = os.path.join(alignDir, str(mg + '.align.phylip.fn.list'))
            self._mgToCluster[mg] = forEachLine(clusterFilePath,
                                                MCluster(self._seqIdToTaxPathDict, self._mgToMaxThreshold[mg]))
Ejemplo n.º 7
0
    def _init(self, align=True, dm=True, cluster=True):
        """
            Init data, compute: alignment, distance matrix, clusters.

            The method returns immediately if it has already been called (self._initDone).

            For each marker gene (the "*.gff" region files found in self._mgWorkingDir and the
            5S, 16S, 23S rRNA genes) a filtered fasta file "<mg>.filter.fna" is written to
            self._clustDir that contains at most one region per sequence. Then, optionally, the
            external tools are run and finally the distance matrices and clusters are read.

            Sets: self._mgList, self._seqIdToTaxPathDict, self._seqIdToWeight, self._mgToMaxThreshold,
            self._mgToDM, self._mgToCluster

            @param align: run the aligner (config entry 'aligner') on the filtered fasta files
            @param dm: compute the distance matrices using mothur (dist.seqs)
            @param cluster: compute the clusters using mothur (cluster)
        """
        if self._initDone:
            return
        # the flag is set before the work is done, i.e. it stays set also if the initialization fails
        self._initDone = True

        fastaPathList = [
        ]  # fasta files containing regions that correspond to particular marker genes
        self._mgList = []  # list of names of marker genes
        mgToFastaPath = dict([])  # marker gene name -> fasta file path

        #collect regions from Amphora mg
        for fastaFile in glob.glob(
                os.path.join(os.path.normpath(self._mgWorkingDir), '*.gff')):
            fastaPathList.append(fastaFile)
        for path in fastaPathList:
            # file name without the extension, then without the "_dna" suffix -> marker gene name
            name = re.sub('([^\.]+)\..*$', r'\1', os.path.basename(path))
            mg = re.sub(r'([^_]+)_dna', r'\1', name)
            # NOTE(review): "dir" is never read in this method and shadows the builtin
            dir = os.path.dirname(path)
            self._mgList.append(mg)
            mgToFastaPath[mg] = path

        #add 16S
        s16List = ['5S_rRNA', '16S_rRNA', '23S_rRNA']
        for mg in s16List:
            mgToFastaPath[mg] = str(self._s16Prefix + '.' + mg + '.fna')
            self._mgList.append(mg)

        #For each marker gene create filtered fasta file that contains for each mg and sequence at most one region.
        mgToFilteredFastaPath = dict([])
        mgToSeqNameToTaxPathDict = dict(
            [])  #mg -> seqName (~region name) -> pred
        for mg in self._mgList:
            mgToSeqNameToTaxPathDict[mg] = dict([])

        # region name = "<scaffold id>_<sequence id>_<tag>"
        for seq in self._sequences.sequences:
            id = str(str(seq.scaffold.id) + '_' + str(seq.id))
            for mg, tag, pred in zip(seq.getCandidateTaxPathSourceList(),
                                     seq.getCandidateTaxPathTagList(),
                                     seq.getCandidateTaxPathDictList()):
                mgToSeqNameToTaxPathDict[mg][str(id + '_' + tag)] = pred

        #for each marker gene: choose only one sequence region for each mg and sequence
        #all sequences are predicted at least at superkingdom
        for mg in self._mgList:
            seqNameToPred = mgToSeqNameToTaxPathDict[
                mg]  #sequence region predictions for this mg
            seqNameToSeq = fastaFileToDict(
                mgToFastaPath[mg])  #read the fasta file
            outPath = os.path.normpath(
                os.path.join(self._clustDir, str(mg + '.filter.fna')))
            mgToFilteredFastaPath[mg] = outPath
            out = OutFileBuffer(outPath)
            seqBaseToSeqName = dict(
                [])  # sequence base (scaffId_seqId) -> region name
            # group the region names by the sequence they belong to
            for seqName in seqNameToSeq:
                seqBase = re.sub(r'^([0-9]+_[0-9]+)[^0-9].*', r'\1', seqName)
                if seqBase not in seqBaseToSeqName:
                    seqBaseToSeqName[seqBase] = []
                seqBaseToSeqName[seqBase].append(seqName)
            for seqBase in seqBaseToSeqName:
                # the sequence id is the second number of the sequence base
                seqId = int(re.sub(r'^[0-9]+_([0-9]+)', r'\1', seqBase))
                seqBaseTaxPathDict = self._sequences.getSequence(
                    seqId).getTaxonomyPath()
                # NOTE(review): "list" shadows the builtin within this method
                list = seqBaseToSeqName[seqBase]
                candidateSeq = [
                ]  # sequence region is predicted at least at rank superkingdom
                for seqName in list:
                    if seqName not in seqNameToPred:
                        taxPathDict = None
                    else:
                        taxPathDict = seqNameToPred[seqName]
                    if taxPathDict != None:
                        candidateSeq.append(seqName)
                # no region of this sequence has a prediction -> nothing is written for it
                if len(candidateSeq) == 0:
                    continue
                candidateSeq2 = [
                ]  # sequence regions predicted at least at the same rank as the whole sequence
                for seqName in candidateSeq:
                    taxPathDict = seqNameToPred[seqName]
                    if ((seqBaseTaxPathDict == None)
                            or (len(taxPathDict) >= len(seqBaseTaxPathDict))
                        ):  #predict at least at the same level
                        candidateSeq2.append(seqName)
                if len(candidateSeq2) > 0:  #take the longest sequence
                    # the strict comparison keeps the first region among regions of the same length
                    sMax = candidateSeq2[0]
                    for s in candidateSeq2[1:]:
                        if len(seqNameToSeq[s]) > len(seqNameToSeq[sMax]):
                            sMax = s
                else:  #all sequence regions are predicted higher than the sequence
                    sMax = candidateSeq[
                        0]  #sequence region with the most specific prediction
                    # NOTE(review): candidateSeq holds only regions whose prediction is not None,
                    # the None checks in this branch therefore never apply
                    for s in candidateSeq[1:]:
                        taxPathDictMax = seqNameToPred[sMax]
                        taxPathDictS = seqNameToPred[s]
                        if taxPathDictS == None:
                            continue
                        if taxPathDictMax == None:
                            sMax = s
                            continue
                        if len(taxPathDictMax) < len(taxPathDictS):
                            sMax = s

                    candidateSeq3 = [
                    ]  #get all sequence regions with the most specific prediction
                    taxPathDictMax = seqNameToPred[sMax]
                    for s in candidateSeq:
                        taxPathDictS = seqNameToPred[s]
                        if taxPathDictMax == None:
                            candidateSeq3.append(s)
                        elif len(taxPathDictS) == len(taxPathDictMax):
                            candidateSeq3.append(s)
                    sMax = candidateSeq3[0]
                    for s in candidateSeq3[1:]:  #take the longest sequence
                        if len(seqNameToSeq[sMax]) < len(seqNameToSeq[s]):
                            sMax = s

                # write the chosen region as a fasta entry
                out.writeText(
                    str('>' + str(sMax) + '\n' + str(seqNameToSeq[sMax]) +
                        '\n'))

            out.close()

        # marker gene name -> path of the alignment file
        mgToAlignPath = dict([])
        for mg in self._mgList:
            mgToAlignPath[mg] = os.path.normpath(
                os.path.join(self._clustDir, str(mg + '.align.fna')))

        #build alignment
        if align:
            for mg in self._mgList:
                alignCmd = str(
                    self._config.get('aligner') + ' -in ' +
                    mgToFilteredFastaPath[mg] + ' -out ' + mgToAlignPath[mg] +
                    ' -quiet')
                assert os.name == 'posix'
                predictProc = subprocess.Popen(
                    alignCmd, cwd=self._mgWorkingDir, shell=True, bufsize=-1
                )  #stdout=subprocess.STDOUT, stderr=subprocess.STDOUT)
                predictProc.wait()
                print 'Muscle return code for', mg, ':', predictProc.returncode
                # a failed alignment is only reported (the command is written to stderr), the processing continues
                if predictProc.returncode != 0:
                    sys.stderr.write(str(alignCmd + ' \n'))

        #compute DM
        if dm:
            for mg in self._mgList:
                mothur = os.path.join(
                    os.path.normpath(
                        self._configRRNA16S.get('mothurInstallDir')), 'mothur')
                mothurCmd = str(
                    'time ' + mothur + ' "#dist.seqs(fasta=' +
                    mgToAlignPath[mg] +
                    ', processors=2, countends=F, calc=nogaps, cutoff=0.3, output=lt)"'
                )
                assert os.name == 'posix'
                mothurProc = subprocess.Popen(mothurCmd,
                                              shell=True,
                                              bufsize=-1,
                                              cwd=self._mgWorkingDir)
                mothurProc.wait()
                # the return code is only printed, it is not checked
                print 'Mothur return code dist:', mg, mothurProc.returncode
                #distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
                #self._mgToDM[mg] = forEachLine(distFilePath, DM())
                #self._mgToDM[mg].printDM()

        #cluster
        if cluster:
            for mg in self._mgList:
                # the distance file is expected next to the alignment file
                distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]),
                                            str(mg + '.align.phylip.dist'))
                mothur = os.path.join(
                    os.path.normpath(
                        self._configRRNA16S.get('mothurInstallDir')), 'mothur')
                mothurCmd = str('time ' + mothur + ' "#cluster(phylip=' +
                                distFilePath +
                                ', method=furthest, hard=t, precision=1000)"')
                assert os.name == 'posix'
                mothurProc = subprocess.Popen(mothurCmd,
                                              shell=True,
                                              bufsize=-1,
                                              cwd=self._mgWorkingDir)
                mothurProc.wait()
                # the return code is only printed, it is not checked
                print 'Mothur return code cluster:', mg, mothurProc.returncode

        #read DM and clusters

        #sequence predictions
        self._seqIdToTaxPathDict = dict([])
        self._seqIdToWeight = dict([])
        for seq in self._sequences.sequences:
            id = int(seq.id)
            self._seqIdToTaxPathDict[id] = seq.getTaxonomyPath()
            self._seqIdToWeight[id] = seq.getTaxonomyPathWeight()

        #similarity thresholds
        # NOTE(review): "thresholds" is never read in this method, the same config entry is looked up again below
        thresholds = self._configMG.get('mgSimilarityThresholds')
        self._mgToMaxThreshold = dict([])
        tmpDict = getMapping(self._configMG.get('mgSimilarityThresholds'),
                             0,
                             1,
                             sep='\t',
                             comment='#')
        # only the first value mapped to each key is used as the threshold
        for k in tmpDict:
            self._mgToMaxThreshold[k] = float(tmpDict[k][0])

        # read the distance matrix and the cluster list file of each marker gene
        self._mgToDM = dict([])
        self._mgToCluster = dict([])
        for mg in self._mgList:
            file = os.path.join(os.path.dirname(mgToAlignPath[mg]),
                                str(mg + '.align.phylip.dist'))
            self._mgToDM[mg] = forEachLine(file, DM())
            file = os.path.join(os.path.dirname(mgToAlignPath[mg]),
                                str(mg + '.align.phylip.fn.list'))
            self._mgToCluster[mg] = forEachLine(
                file,
                MCluster(self._seqIdToTaxPathDict, self._mgToMaxThreshold[mg]))
Ejemplo n.º 8
0
    def runMarkerGeneAnalysis(self, fastaFileDNA, outLog=None):
        """
            Run hmmer HMM and mothur classify (bayesian), same param as for the 16S analysis.

            The DNA fasta file is translated to protein sequences (dnaToProt). Then, for each
            marker gene returned by the marker gene list file:
              1. hmmsearch is run with the primary and, if available, the secondary HMM profile,
              2. the regions reported for the primary profile are cut out of the corresponding
                 DNA sequences and written to '<gene>_dna.gff' (fasta formatted entries),
              3. mothur classify.seqs is run on these regions,
              4. the mothur output is transformed by _MothurOutFileParser and the buffered text
                 is appended to '<basename of fastaFileDNA>_all.mP'.
            All files are created in self.markerGeneWorkingDir. Genes for which no region was
            written are skipped.

            fastaFileDNA: path of the fasta file containing the DNA sequences
            outLog: path of a file that receives the stdout of the external commands;
                if None, the commands inherit the stdout of this process

            Raises Exception if hmmsearch or mothur finishes with a non-zero return code; errors
            raised while reading fastaFileDNA are reported on stderr and re-raised.
        """
        #read list of marker genes
        mgFiles = forEachLine(self.markerGeneListFile, _MgFiles(self.markerGeneListFileDir))

        #translate DNA to protein sequences
        fastaFileProt = os.path.join(self.markerGeneWorkingDir, str(os.path.basename(fastaFileDNA) + '.PROT'))
        dnaToProt(fastaFileDNA, fastaFileProt)

        #read DNA fasta file, the "with" statement closes the handle also if the parsing fails
        try:
            with open(fastaFileDNA, "rU") as handle:
                dnaSeqDict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
        except Exception:
            sys.stderr.write(str('Cannot read file: ' + str(fastaFileDNA)))
            raise

        #to output all predictions in one file
        outPredAllFileName = os.path.join(self.markerGeneWorkingDir,
                                          str(os.path.basename(fastaFileDNA) + '_all.mP'))
        outAllBuffer = OutFileBuffer(outPredAllFileName)

        #None lets the child processes inherit stdout (subprocess.STDOUT is only valid as a value for "stderr")
        stdoutLog = None
        try:
            mgList = mgFiles.getGeneNameList()

            if outLog is not None:
                stdoutLog = open(outLog, 'w')

            #for each gene perform the analysis separately
            for geneName in mgList:

                domFileArray = [os.path.join(self.markerGeneWorkingDir, geneName + '_1.dom'),
                                os.path.join(self.markerGeneWorkingDir, geneName + '_2.dom')]
                outFileArray = [os.path.join(self.markerGeneWorkingDir, geneName + '_1.out'),
                                os.path.join(self.markerGeneWorkingDir, geneName + '_2.out')]
                hmmFileArray = [mgFiles.getFilePath(geneName, 'hmmPROTPrim'),
                                mgFiles.getFilePath(geneName, 'hmmPROTSec')]

                #define cmd (None for a missing HMM profile)
                #NOTE(review): the paths are inserted unquoted into a shell command line
                cmdArray = []
                for domFile, outFile, hmmFile in zip(domFileArray, outFileArray, hmmFileArray):
                    if hmmFile is None:
                        cmdArray.append(None)
                    else:
                        cmdArray.append(os.path.join(self.hmmerBinDir, 'hmmsearch') + ' --domtblout ' + domFile
                                        + ' -E 0.01' + ' -o ' + outFile + ' ' + hmmFile + ' ' + fastaFileProt)

                #run cmd
                for cmd in cmdArray:
                    if cmd is None:
                        #no HMM profile for this slot, there is nothing to run
                        continue
                    if os.name != 'posix':
                        print('Marker genes analysis, doesn`t run (no posix):  %s' % (cmd,))
                        continue
                    hmmProc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=self.hmmInstallDir,
                                               stdout=stdoutLog)
                    print('run cmd: %s' % (cmd,))
                    hmmProc.wait()
                    print('HMM  return code: %s' % (hmmProc.returncode,))
                    if hmmProc.returncode != 0:
                        raise Exception("Command returned with non-zero %s status: %s" % (hmmProc.returncode, cmd))

                #get regions that match to the HMM profile
                entryDictList = []
                for domFile, cmd in zip(domFileArray, cmdArray):
                    if cmd is not None:
                        entryDictList.append(forEachLine(domFile, _MgRegions()).getEntryDict())
                    else:
                        entryDictList.append(None)

                #only the regions of the primary profile are used, the domain table of the secondary
                #profile is parsed as well but its entries are not evaluated
                #NOTE(review): entryDict1 is None if there is no primary HMM profile for the gene, the
                #loop below then raises a TypeError - confirm that a primary profile always exists
                entryDict1 = entryDictList[0]

                #extract regions found in the protein sequences that were found by the HMM and generate
                #corresponding DNA sequences
                regionDnaFasta = os.path.join(self.markerGeneWorkingDir, str(geneName + '_dna.gff'))
                outFileBuffer = OutFileBuffer(regionDnaFasta)

                for seqName in entryDict1:
                    for region in entryDict1[seqName]:
                        from1 = region[0]
                        to1 = region[1]
                        assert (from1 is not None) and (to1 is not None)

                        #extract regions from the DNA sequences (consider 3 ORF and reverse complements)

                        #name of the whole sequence
                        dnaSeqName = re.sub(r'([0-9]+_[0-9]+)_[pr]+[012]', r'\1', seqName)
                        #whole DNA sequence
                        dnaSeq = dnaSeqDict[dnaSeqName].seq

                        #the digit after the "p"/"pr" tag is the reading frame shift (0, 1 or 2)
                        frameMatch = re.match(r'[0-9]+_[0-9]+_[pr]+([012])', seqName)
                        if frameMatch is None:
                            #report the entry and skip it, no region can be computed for such a name
                            sys.stderr.write('Wrong seq name: ' + seqName + ' \n')
                            continue

                        #reverse complement (contains "pr")
                        tagRev = 'p'
                        if re.match(r'[0-9]+_[0-9]+_pr[012]', seqName):
                            dnaSeq = dnaSeq.reverse_complement()
                            tagRev = 'pr'

                        #protein positions (1-based) to DNA positions of the respective reading frame
                        shift = int(frameMatch.group(1))
                        tagFrom = ((from1 - 1) * 3) + shift
                        tagTo = (to1 * 3) + shift
                        tagRev += frameMatch.group(1)
                        dnaSeq = dnaSeq[tagFrom:tagTo]

                        outFileBuffer.writeText('>%s_%s_%s_%s\n%s\n' % (dnaSeqName, tagFrom, tagTo, tagRev, dnaSeq))

                outFileBuffer.close()

                #if no marker gene found
                if outFileBuffer.isEmpty():
                    continue

                #run mothur classify (bayesian? the same as for the 16S analysis)
                templateFile = mgFiles.getFilePath(geneName, 'templateDNA')
                taxonomyFile = mgFiles.getFilePath(geneName, 'taxonomyDNA')
                assert (templateFile is not None) and (taxonomyFile is not None)
                cmd = str('time ' + self.mothur + ' "#classify.seqs(fasta=' + regionDnaFasta + ', template='
                          + templateFile + ', taxonomy=' + taxonomyFile + ', ' + self.mothurParam + ')"')
                if os.name == 'posix':
                    mothurProc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=self.markerGeneWorkingDir,
                                                  stdout=stdoutLog)
                    print('run cmd: %s' % (cmd,))
                    mothurProc.wait()
                    print('mothur return code: %s' % (mothurProc.returncode,))
                    if mothurProc.returncode != 0:
                        raise Exception("Command returned with non-zero %s status: %s" % (mothurProc.returncode, cmd))
                else:
                    print('Cannot run mothur since your system is not "posix" but "%s" \n%s' % (os.name, cmd))

                #transform the mothur output to a simple output (name, ncbid, weight)
                #the name of the mothur output file differs, try the default name first and then
                #the name with the '.bayesian.taxonomy' suffix
                mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile)
                if not os.path.isfile(mothurPredFileName):
                    mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile,
                                                                        suffix='.bayesian.taxonomy')
                    if not os.path.isfile(mothurPredFileName):
                        print("Can't open file: %s" % mothurPredFileName)

                outPredFileName = os.path.join(self.markerGeneWorkingDir,
                                               str(os.path.basename(fastaFileDNA) + '_' + geneName + '.mP'))
                outBuffer = OutFileBuffer(outPredFileName, bufferText=True)
                forEachLine(mothurPredFileName, _MothurOutFileParser(outBuffer, geneName))

                #predictions of different genes are separated by a new line
                if not outAllBuffer.isEmpty():
                    outAllBuffer.writeText('\n')
                outAllBuffer.writeText(outBuffer.getTextBuffer())
        finally:
            #release the log file and the summary file also if the analysis of a gene failed
            if stdoutLog is not None:
                stdoutLog.close()
            outAllBuffer.close()