Example #1
0
def runPredict(ppsInstallDir, ppsConfigFilePathPy, fastaFile):
    """
        Run the PPS 'predict.py' script for a FASTA file and remove its temporary files.

        Before the run, files in the directory of the FASTA file that end with '.out' or '.sl' and whose
        name contains the base name of the FASTA file are deleted. After the run, the matching '.sl' files
        are deleted again. The process exits with status -1 if the command is reported as failed.

        @param ppsInstallDir: directory that contains 'python_scripts/predict.py'
        @param ppsConfigFilePathPy: configuration file handed to the script via '-c'
        @param fastaFile: FASTA file handed to the script via '-fasta'
    """
    cwd = os.path.join(ppsInstallDir, 'python_scripts')
    scriptPath = os.path.join(cwd, 'predict.py')
    cmd = 'python %s -c %s -fasta %s ' % (scriptPath, ppsConfigFilePathPy, fastaFile)

    baseName = os.path.basename(fastaFile)
    # dirname() returns '' for a bare file name and os.listdir('') raises,
    # therefore fall back to the current directory
    fastaDir = os.path.dirname(fastaFile) or os.curdir

    def removeMatchingFiles(suffixes):
        # delete the files of this FASTA file that end with one of the given suffixes
        for f in os.listdir(fastaDir):
            if f.endswith(suffixes) and baseName in f:
                os.remove(os.path.join(fastaDir, f))

    # remove the leftovers of a previous run
    removeMatchingFiles(('.out', '.sl'))

    if parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd)])) is not None:
        sys.exit(-1)

    # remove the '.sl' files left behind by this run
    removeMatchingFiles(('.sl',))
Example #2
0
def runTrain(ppsInstallDir, ppsConfigFilePathPy):
    """
        Run the PPS 'train.py' script with the given configuration file.

        The script is started from the 'python_scripts' directory of the PPS installation with the
        options '-c <config> -y'. The process exits with status -1 if the command is reported as failed.

        @param ppsInstallDir: directory that contains 'python_scripts/train.py'
        @param ppsConfigFilePathPy: configuration file handed to the script via '-c'
    """
    scriptsDir = os.path.join(ppsInstallDir, 'python_scripts')
    trainCmd = 'python %s -c %s -y' % (os.path.join(scriptsDir, 'train.py'), ppsConfigFilePathPy)

    task = parallel.TaskCmd(trainCmd, scriptsDir)
    failedCmd = parallel.reportFailedCmd(parallel.runCmdSerial([task]))
    if failedCmd is None:
        return
    sys.exit(-1)
Example #3
0
    def runMarkerGeneAnalysis(self, fastaFileDNA, outLog=None):
        """
            Run hmmer HMM and mothur classify (bayesian), same param as for the 16S analysis.

            The DNA sequences are translated to protein sequences (dnaToProt). For each marker gene of the
            marker gene list, hmmsearch is run with the primary protein HMM profile of the gene, the matched
            protein regions are mapped back to DNA coordinates (reading frame 0-2, optionally on the reverse
            complement) and written to '<geneName>_dna.gff' in the marker gene working directory. Genes with
            at least one region are classified with mothur 'classify.seqs' and the parsed predictions are
            appended to '<basename of fastaFileDNA>_all.mP'.

            Genes without a primary HMM profile and regions with a malformed sequence name are skipped with
            a message on stderr. The process exits with status -1 if a command is reported as failed.

            @param fastaFileDNA: FASTA file containing the DNA sequences to be analysed
            @param outLog: if not None, this file is opened for writing and handed to the mothur task as
                its 'stdout' argument (the hmmsearch task does not get it)
        """
        #read list of marker genes
        mgFiles = forEachLine(self.markerGeneListFile, _MgFiles(self.markerGeneListFileDir))

        #translate DNA to protein sequences
        fastaFileProt = os.path.join(self.markerGeneWorkingDir, str(os.path.basename(fastaFileDNA) + '.PROT'))
        dnaToProt(fastaFileDNA, fastaFileProt)

        #read DNA fasta file (the handle is closed also if the parsing fails)
        try:
            with open(fastaFileDNA, "rU") as handle:
                dnaSeqDict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
        except Exception:
            sys.stderr.write(str('Cannot read file: ' + str(fastaFileDNA)))
            raise

        #to output all predictions in one file
        outPredAllFileName = os.path.join(self.markerGeneWorkingDir,
                                          str(os.path.basename(fastaFileDNA) + '_all.mP'))
        outAllBuffer = OutFileBuffer(outPredAllFileName)

        #run HMM search
        mgList = mgFiles.getGeneNameList()

        if outLog is not None:
            stdoutLog = open(outLog, 'w')
        else:
            stdoutLog = subprocess.STDOUT

        try:
            #for each gene perform the analysis separately
            for geneName in mgList:

                #only the primary HMM profile is used (the secondary profile 'hmmPROTSec' is disabled)
                domFile = os.path.join(self.markerGeneWorkingDir, str(geneName + '_1.dom'))
                outFile = os.path.join(self.markerGeneWorkingDir, str(geneName + '_1.out'))
                hmmFile = mgFiles.getFilePath(geneName, 'hmmPROTPrim')

                #without a profile there is nothing to search for (this used to end in a TypeError below)
                if hmmFile is None:
                    sys.stderr.write('No primary HMM profile for marker gene: ' + str(geneName) + ', skipped\n')
                    continue

                #define cmd
                cmd = str(os.path.join(self.hmmerBinDir, 'hmmsearch') + ' --domtblout ' + domFile + ' -E 0.01'
                          + self.processorsHmm + ' -o ' + outFile + ' ' + hmmFile + ' ' + fastaFileProt)

                #run cmd
                if os.name == 'posix':
                    cwd = self.hmmInstallDir
                    if parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd)])) is not None:
                        sys.exit(-1)
                else:
                    print('Marker genes analysis, doesn`t run (no posix):  %s' % cmd)

                #get regions that match to the HMM profile
                entryDict = forEachLine(domFile, _MgRegions()).getEntryDict()

                #extract regions found in the protein sequences that were found by the HMM
                #and generate corresponding DNA sequences
                regionDnaFasta = os.path.join(self.markerGeneWorkingDir, str(geneName + '_dna.gff'))
                outFileBuffer = OutFileBuffer(regionDnaFasta)

                for seqName in entryDict:
                    for entry in entryDict[seqName]:
                        from1 = entry[0]
                        to1 = entry[1]
                        assert (from1 is not None) and (to1 is not None)

                        #the name ends with the strand tag ("p" or "pr") and the reading frame shift (0, 1 or 2)
                        frameMatch = re.match(r'[0-9]+_[0-9]+_[pr]+([012])', seqName)
                        if frameMatch is None:
                            #no coordinates can be derived for such an entry, skip it
                            sys.stderr.write('Wrong seq name: ' + seqName + ' \n')
                            continue
                        shift = int(frameMatch.group(1))

                        #name of the whole sequence
                        dnaSeqName = re.sub(r'([0-9]+_[0-9]+)_[pr]+[012]', r'\1', seqName)
                        #whole DNA sequence
                        dnaSeq = dnaSeqDict[dnaSeqName].seq

                        #reverse complement (contains "pr")
                        tagRev = 'p'
                        if re.match(r'[0-9]+_[0-9]+_pr[012]', seqName):
                            dnaSeq = dnaSeq.reverse_complement()
                            tagRev = 'pr'
                        tagRev += str(shift)

                        #protein positions to DNA positions (3 bases per amino acid, moved by the frame shift)
                        tagFrom = ((from1 - 1) * 3) + shift
                        tagTo = (to1 * 3) + shift
                        dnaSeq = dnaSeq[tagFrom:tagTo]

                        tag = str(tagFrom) + '_' + str(tagTo) + '_' + tagRev
                        outFileBuffer.writeText('>' + dnaSeqName + '_' + tag + '\n' + str(dnaSeq) + '\n')

                outFileBuffer.close()

                #if no marker gene found
                if outFileBuffer.isEmpty():
                    continue

                #run mothur classify (bayesian? the same as for the 16S analysis)
                templateFile = mgFiles.getFilePath(geneName, 'templateDNA')
                taxonomyFile = mgFiles.getFilePath(geneName, 'taxonomyDNA')
                assert (templateFile is not None) and (taxonomyFile is not None)
                cmd = str('' + self.mothur + ' "#classify.seqs(fasta=' + regionDnaFasta + ', template=' + templateFile
                          + ', taxonomy=' + taxonomyFile + ', ' + self.mothurParam + ')"')
                if os.name == 'posix':

                    print('Mothur processing: %s' % os.path.basename(templateFile).split('_', 1)[0])

                    cwd = self.markerGeneWorkingDir

                    if parallel.reportFailedCmd(
                            parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd, stdout=stdoutLog)])) is not None:
                        sys.exit(-1)
                else:
                    print('Cannot run mothur since your system is not "posix" but "%s" \n%s' % (os.name, cmd))

                #transform the mothur output to a simple output (name, ncbid, weight)
                #the name of the mothur output file depends on the suffix mothur has used, try both variants
                mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile)
                if not os.path.isfile(mothurPredFileName):
                    mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile,
                                                                        suffix='.bayesian.taxonomy')
                    if not os.path.isfile(mothurPredFileName):
                        print("Can't open file: %s" % mothurPredFileName)

                outPredFileName = os.path.join(self.markerGeneWorkingDir,
                                               str(os.path.basename(fastaFileDNA) + '_' + geneName + '.mP'))
                outBuffer = OutFileBuffer(outPredFileName, bufferText=True)
                forEachLine(mothurPredFileName, _MothurOutFileParser(outBuffer, geneName))

                #predictions of different genes are separated by a new line
                if not outAllBuffer.isEmpty():
                    outAllBuffer.writeText('\n')
                outAllBuffer.writeText(outBuffer.getTextBuffer())
        finally:
            #close the log file also if a command has failed (sys.exit) or an exception was raised
            if outLog is not None:
                stdoutLog.close()
        outAllBuffer.close()
Example #4
0
def mainSnowball(
    fq1Path,
    fq2Path,
    profileHmmFile,
    insertSize,
    readLen=None,
    outFile=None,
    outFormat="fna",
    workingDir=None,
    hmmsearchPath=None,
    pfamMinScore=None,
    pOverlap=None,
    overlapLen=None,
    outAnnot=False,
    cleanUp=False,
    processors=mp.cpu_count(),
):
    """
        Main function, the interface of the Snowball gene assembler.

        The steps are: join the paired-end reads into consensus reads, translate them to protein sequences,
        run hmmsearch, assign the consensus reads to gene domains, run the assembly for each gene domain and
        write the resulting contigs to the (gzip named) output file.

        The input parameters are validated via assertions, i.e. an AssertionError is raised for an invalid
        input or when the hmmsearch command is reported as failed.

        @param fq1Path: FASTQ 1 file path (containing first ends of Illumina paired-end reads)
        @param fq2Path: FASTQ 2 file path (containing second ends)
        @param profileHmmFile: profile HMMs file, containing models generated by the HMMER 3 software
        @param insertSize: mean insert size used for the library preparation (i.e. read generation)
        @param readLen: read length (if None, it will be derived from the FASTQ file)
        @param outFile: output file (if None, it will be derived from the FASTQ 1 file)
        @param outFormat: output file format 'fna' or 'fq'
        @param workingDir: temporary files will be stored here, it must be an empty directory
            (if None, a temporary directory will be created/deleted)
        @param hmmsearchPath: path to the HMMER hmmsearch command (if None, it will take version that is in the PATH)
        @param pfamMinScore: minimum score for the hmmsearch (if None, use default)
        @param pOverlap: minimum overlap probability for the Snowball algorithm
        @param overlapLen: minimum overlap length for the Snowball algorithm
        @param outAnnot: if true, additional annotation will be stored along with the resulting contigs
        @param cleanUp: if true, delete temporary files at the end
        @param processors: Number of processors (default: use all processors available)

        @type fq1Path: str
        @type fq2Path: str
        @type profileHmmFile: str
        @type insertSize: int
        @type readLen: int
        @type outFile: str
        @type outFormat: str
        @type workingDir: str
        @type hmmsearchPath: str
        @type pfamMinScore: int
        @type pOverlap: float
        @type overlapLen: float
        @type outAnnot: bool
        @type cleanUp: bool
        @type processors: int
    """
    assert os.name == "posix", 'Snowball runs only on "posix" systems, your system is: %s' % os.name

    # checking input parameters
    assert os.path.isfile(fq1Path), "File does not exist: %s" % fq1Path
    assert os.path.isfile(fq2Path), "File does not exist: %s" % fq2Path

    # derive the read length from the first read of the FASTQ 1 file
    if readLen is None:
        for name, dna, p, qs in fq.ReadFqGen(fq1Path):
            readLen = len(dna)
            assert readLen == len(qs), "File corrupted %s" % fq1Path
            break
    assert readLen is not None, "Cannot derive read length from %s" % fq1Path

    assert readLen <= insertSize < 2 * readLen, "Invalid read length (%s) and insert size (%s) combination" % (
        readLen,
        insertSize,
    )

    assert os.path.isfile(profileHmmFile), "File does not exist: %s" % profileHmmFile

    outFormat = outFormat.strip()
    assert outFormat in ("fna", "fq"), "Invalid output format: %s" % outFormat

    # checking the output file
    if outFile is None:
        # take the first "<fq1Path>_<counter>.<format>.gz" that does not exist yet
        c = 0
        while True:
            outFile = fq1Path + "_%s.%s.gz" % (c, outFormat)
            if not os.path.isfile(outFile):
                break
            c += 1
    else:
        # strip the surrounding whitespace first, so that the checks see the path that will be used
        outFile = outFile.strip()
        outFileDir = os.path.dirname(outFile)
        assert os.path.basename(outFile) != "", "Output file name is empty"
        assert outFileDir == "" or os.path.isdir(outFileDir), "Invalid output directory: %s" % outFileDir
        if not outFile.endswith(".gz"):
            outFile += ".gz"
            print("The name of the output file was modified to:\n\t%s" % outFile)

    # Looking for the hmmsearch binaries
    if hmmsearchPath is None:
        hmmsearchPath = os.popen("which hmmsearch").read().strip()
        if hmmsearchPath != "":
            print("This hmmsearch binary will be used:\n\t%s" % hmmsearchPath)

    assert os.path.isfile(hmmsearchPath), "Path for (hmmsearch) is invalid: %s" % hmmsearchPath

    # creates a temporary working directory
    if workingDir is None:
        workingDir = tempfile.mkdtemp(prefix="snowball_")
        assert os.path.isdir(workingDir), "Cannot create temporary working directory (%s)" % workingDir
        cleanUpTmpWorkingDir = True
        print("Using temporary directory:\n\t%s" % workingDir)
    else:
        cleanUpTmpWorkingDir = False

    assert os.path.isdir(workingDir), "Working directory does not exist:\n\t%s" % workingDir
    assert not os.listdir(workingDir), "Working directory must be empty:\n\t%s" % workingDir

    # set the number of processor cores to be used
    comh.MAX_PROC = max(1, min(processors, mp.cpu_count()))

    # set assembly parameters or use defaults
    if pfamMinScore is not None:
        comh.SAMPLES_PFAM_EVAN_MIN_SCORE = pfamMinScore

    if pOverlap is not None:
        comh.ASSEMBLY_POVERLAP = (pOverlap,)

    if overlapLen is not None:
        comh.ASSEMBLY_OVERLAP_LEN = (overlapLen,)

    # creates a temporary directory for the sample strains
    strainsDir = os.path.join(workingDir, "strains")
    if not os.path.isdir(strainsDir):
        os.mkdir(strainsDir)
    assert os.path.isdir(strainsDir), "Cannot create temporary directory:\n\t%s" % strainsDir

    # a relative link target is resolved relative to the directory of the link (not to the current
    # directory), the links therefore have to point to absolute paths
    os.symlink(os.path.abspath(fq1Path), os.path.join(strainsDir, "0_pair1.fq.gz"))
    os.symlink(os.path.abspath(fq2Path), os.path.join(strainsDir, "0_pair2.fq.gz"))

    # Start of the algorithm
    # platform.dist() is not available in recent Python versions (removed in 3.8)
    if hasattr(platform, "dist"):
        osInfo = platform.dist()
    else:
        osInfo = (platform.system(), platform.release())
    print("Running on: %s (%s)" % (" ".join(osInfo), sys.platform))
    print("Using %s processors" % comh.MAX_PROC)
    print(
        "Settings:\n\tRead length: %s\n\tInsert size: %s\n\tMin. overlap probability: %s\n\tMin. overlap length: %s"
        "\n\tMin. HMM score: %s"
        % (
            readLen,
            insertSize,
            comh.ASSEMBLY_POVERLAP[0],
            comh.ASSEMBLY_OVERLAP_LEN[0],
            comh.SAMPLES_PFAM_EVAN_MIN_SCORE,
        )
    )

    # file with joined consensus reads
    fqJoinPath = os.path.join(strainsDir, "0_join.fq.gz")

    # join paired-end reads
    if True:  # to skip this step, set to False (e.g. resume processing after OS/HW failure)
        print("Joining paired-end reads into consensus reads, loading reads from:\n\t%s\n\t%s" % (fq1Path, fq2Path))

        r = fq.joinPairEnd(
            [(fq1Path, fq2Path, fqJoinPath, readLen, insertSize, None, 60)],
            minOverlap=comh.SAMPLES_PAIRED_END_JOIN_MIN_OVERLAP,
            minOverlapIdentity=comh.SAMPLES_PAIRED_END_JOIN_MIN_OVERLAP_IDENTITY,
            maxCpu=comh.MAX_PROC,
        )
        print("Filtered out: %s %% reads" % r)

    # Translate consensus reads into protein sequences, run hmmsearch
    if True:  # to skip this step, set to False (e.g. resume processing after OS/HW failure)
        print("Translating reads to protein sequences")
        # file with protein consensus read sequences
        joinFastaProtGzip = os.path.join(strainsDir, "0_join_prot.fna.gz")
        fq.readsToProt(fqJoinPath, joinFastaProtGzip, comh.TRANSLATION_TABLE)

        print("Running HMMER (hmmsearch)")
        domOut = os.path.join(strainsDir, "0_join_prot.domtblout")
        joinFastaProt = joinFastaProtGzip[:-3]  # the same path without the ".gz" suffix

        # decompress the protein sequences, run hmmsearch, remove the decompressed file, compress the result
        cmd = "zcat %s > %s;%s -o /dev/null --noali --domtblout %s -E 0.01 " "--cpu %s %s %s;rm %s;gzip -f %s" % (
            joinFastaProtGzip,
            joinFastaProt,
            hmmsearchPath,
            domOut,
            comh.MAX_PROC,
            profileHmmFile,
            joinFastaProt,
            joinFastaProt,
            domOut,
        )

        # the command must not be started from within the assert statement, since assert statements
        # (including their side effects) are dropped when Python runs with the -O option
        failedCmd = parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(cmd, strainsDir)]))
        assert failedCmd is None, "The hmmsearch command has failed"

    # Assign consensus reads to individual gene domains
    if True:  # to skip this step, set to False (e.g. resume processing after OS/HW failure)
        print("Assigning consensus reads to gene domains")
        hio.partitionReads(
            workingDir,
            comh.SAMPLES_PFAM_EVAN_MIN_SCORE,
            comh.SAMPLES_PFAM_EVAN_MIN_ACCURACY,
            comh.SAMPLES_SHUFFLE_RAND_SEED,
            comh.SAMPLES_PFAM_PARTITIONED_DIR,
            True,
            False,
        )

    partitionedDir = os.path.join(workingDir, comh.SAMPLES_PFAM_PARTITIONED_DIR)

    # Run Assembly
    if True:  # to skip this step, set to False (e.g. resume processing after OS/HW failure)
        print("Running Snowball assembly")

        # collect tasks for each gene domain
        taskList = []
        assert os.path.isdir(partitionedDir), "Temporary directory does not exist:\n\t%s" % partitionedDir
        for f in os.listdir(partitionedDir):
            fPath = os.path.join(partitionedDir, f)
            if f.endswith("join.fq.gz") and os.path.isfile(fPath):
                base = fPath[:-6]  # the path without the ".fq.gz" suffix
                inFq = fPath
                inDomtblout = "%s_prot.domtblout.gz" % base
                inProtFna = "%s_prot.fna.gz" % base
                outPath = "%s_read_rec.pkl.gz" % base
                taskList.append(
                    parallel.TaskThread(
                        hmain.buildSuperReads,
                        (
                            inFq,
                            inDomtblout,
                            inProtFna,
                            outPath,
                            comh.ASSEMBLY_CONSIDER_PROT_COMP,
                            comh.ASSEMBLY_ONLY_POVERLAP,
                            comh.ASSEMBLY_POVERLAP,
                            comh.ASSEMBLY_OVERLAP_LEN,
                            comh.ASSEMBLY_OVERLAP_ANNOT_LEN,
                            comh.ASSEMBLY_STOP_OVERLAP_MISMATCH,
                            comh.ASSEMBLY_MAX_LOOPS,
                            comh.TRANSLATION_TABLE,
                        ),
                    )
                )
        # run tasks in parallel
        parallel.runThreadParallel(taskList, comh.MAX_PROC, keepRetValues=False)

    # Creates the output file
    if True:  # to skip this step, set to False (e.g. resume processing after OS/HW failure)
        print("Creating output file:\n\t%s" % outFile)
        counter = 0
        out = fq.WriteFq(outFile)
        try:
            for f in os.listdir(partitionedDir):
                fPath = os.path.join(partitionedDir, f)
                if f.endswith(".pkl.gz") and os.path.isfile(fPath):
                    # NOTE(review): presumably cuts a 2 character prefix and the "_join_read_rec.pkl.gz"
                    # like ending off the file name to get the domain name, confirm with hio.partitionReads
                    domName = f[2:-23]
                    for rec in hio.loadReadRec(fPath):
                        counter += 1
                        contigName = "contig_%s_%s" % (counter, domName)
                        dnaSeq = rec.dnaSeq

                        # get the quality score string
                        if outAnnot or outFormat == "fq":
                            qs = rec.qsArray.getQSStr(dnaSeq)
                        else:
                            qs = None

                        # get the contig annotations
                        if outAnnot:
                            assert qs is not None
                            codingStart = rec.annotStart
                            codingLen = rec.annotLen
                            posCov = ",".join(str(int(x)) for x in rec.getPosCovArray())
                            annotStr = "domName:%s|codingStart:%s|codingLen:%s|qs:%s|posCov:%s" % (
                                domName,
                                codingStart,
                                codingLen,
                                qs,
                                posCov,
                            )
                        else:
                            annotStr = ""

                        # write an entry to the output file
                        if outFormat == "fq":
                            out.writeFqEntry("@" + contigName, dnaSeq, qs, annotStr)
                        else:
                            assert outFormat == "fna"
                            if outAnnot:
                                annotStr = "|" + annotStr
                            out.write(">%s%s\n%s\n" % (contigName, annotStr, dnaSeq))
        finally:
            # close output file (also if an entry could not be loaded or written)
            out.close()

    # Clean up the working directory
    if cleanUpTmpWorkingDir:
        # clean up the temporary directory
        print("Cleaning up temporary directory")
        assert os.path.isdir(workingDir), "Directory to be cleaned does not exist:\n%s" % workingDir
        shutil.rmtree(workingDir)
    elif cleanUp:
        # clean up the user defined working directory
        if os.path.isdir(workingDir):
            print("Cleaning up working directory:\n\t%s" % workingDir)
            shutil.rmtree(os.path.join(workingDir, comh.SAMPLES_PFAM_PARTITIONED_DIR))
            shutil.rmtree(strainsDir)

    print("Done")
Example #5
0
    def runHMM(self, inputFastaFile, outLog=None):
        """
            Run the hidden markov model to get regions in the input sequences where the 16S and 23S genes are located.

            The 'rna_hmm3.py' script is run for the input file and writes the regions to
            '<working dir>/<basename of inputFastaFile>.gff'. Each region listed in this file is then cut out
            of the corresponding input sequence (reverse complemented for the '-' strand) and appended to
            '<working dir>/<basename of inputFastaFile>.<gene>.fna'.

            The process exits with status -1 if the command is reported as failed and with status 1 if
            the regions file contains an invalid strand symbol.

            @param inputFastaFile: FASTA file containing the sequences to be searched
            @param outLog: not used by this method
        """

        processors = self._config.get('processors')
        if processors is not None:
            processors = ' -p %s ' % processors
        else:
            processors = ''

        hmmInstallDir = os.path.normpath(self._config.get('rnaHmmInstallDir'))
        hmmerBinDir = os.path.normpath(self._config.get('hmmerBinDir'))
        regionsFile = os.path.join(self._workingDir, str(os.path.basename(inputFastaFile) + '.gff'))
        cmd = str('export PATH=' + hmmerBinDir + ':$PATH;' + os.path.join(hmmInstallDir, 'rna_hmm3.py') +
                  ' -i ' + inputFastaFile + ' -o ' + regionsFile + processors)
        if os.name == 'posix':

            cwd = self._config.get('rnaHmmInstallDir')

            if parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd)])) is not None:
                sys.exit(-1)
        else:
            print('Cannot run HMM since your system is not "posix" but "%s" \n%s' % (os.name, cmd))

        with open(inputFastaFile, "rU") as handle:
            record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))

        trunkoutputfilename = os.path.join(self._workingDir, os.path.basename(inputFastaFile))

        # parse results file line by line
        with open(regionsFile, "rU") as regions:
            for line in regions:
                # skip comment lines
                if line.startswith("#"):
                    continue
                fields = line.split()
                # skip blank lines (they have no columns to read)
                if not fields:
                    continue
                ident = fields[0]
                start = int(fields[3])
                stop = int(fields[4])
                strand = fields[6]
                gene = fields[8]
                seq = record_dict[ident].seq
                # 'start' is 1-based and 'stop' is inclusive
                if strand == "+":
                    subseq = seq[start - 1:stop]
                elif strand == "-":
                    subseq = seq[start - 1:stop].reverse_complement()
                else:
                    sys.stderr.write(" analysis16s: invalid strand symbol")
                    sys.exit(1)

                # the regions of one gene are collected in one file
                with open(trunkoutputfilename + "." + gene + ".fna", "a") as outfile:
                    outfile.write(">%s_%i_%i_%s\n%s\n" % (ident, start, stop, strand, subseq))
Example #6
0
    def _classify(self, mode, inputFastaFile, outLog=None):
        """
            Classify the extracted rRNA gene regions of one type with mothur 'classify.seqs' and transform
            the mothur predictions to a tab separated prediction file.

            The regions are read from '<working dir>/<basename of inputFastaFile>.<gene>.fna', the reference
            template and taxonomy files are taken from self._refDict (relative to self._refDir) and the
            predictions are stored in '<working dir>/<basename of inputFastaFile>.<mode>P'.
            The process exits with status -1 if the mothur command is reported as failed.

            @param mode: 16, 23 or 5, i.e. classify the regions of the 16S, 23S or 5S rRNA gene
            @param inputFastaFile: FASTA file from which the regions were extracted
            @param outLog: if not None, this file is opened for writing and handed to the mothur task as
                its 'stdout' argument
            @raise Exception: if the mode is not supported
        """
        mothur = os.path.join(os.path.normpath(self._config.get('mothurInstallDir')), 'mothur')

        # gene name and extension of the prediction file for each supported mode
        modeToGene = {16: ('16S_rRNA', '.16P'),
                      23: ('23S_rRNA', '.23P'),
                      5: ('5S_rRNA', '.5P')}
        if mode not in modeToGene:
            raise Exception('Wrong branch')
        geneName, predFileExt = modeToGene[mode]

        baseName = os.path.basename(inputFastaFile)
        extractedRegionsFasta = os.path.join(self._workingDir, str(baseName + '.' + geneName + '.fna'))
        taxonomyFile = os.path.join(self._refDir, os.path.normpath(self._refDict[(geneName, 'taxonomyDNA')][0]))
        templateFile = os.path.join(self._refDir, os.path.normpath(self._refDict[(geneName, 'templateDNA')][0]))
        predFileName = os.path.join(self._workingDir, str(baseName + predFileExt))

        param = self._config.get('mothurClassifyParamOther')

        cmd = str(mothur + ' "#classify.seqs(fasta=' + extractedRegionsFasta + ', template=' + templateFile
                  + ', taxonomy=' + taxonomyFile + ', ' + param + ')"')

        if os.name == 'posix':

            print('Mothur processing: %s' % os.path.basename(templateFile).split('_', 1)[0])

            cwd = self._workingDir

            if outLog is not None:
                stdoutLog = open(outLog, 'w')
            else:
                stdoutLog = subprocess.STDOUT

            try:
                if parallel.reportFailedCmd(
                        parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd, stdout=stdoutLog)])) is not None:
                    sys.exit(-1)
            finally:
                # close the log file also if the command has failed (sys.exit)
                if outLog is not None:
                    stdoutLog.close()
        else:
            print('Cannot run mothur since your system is not "posix" but "%s" \n%s' % (os.name, cmd))

        # the name of the mothur output file depends on the suffix mothur has used; which of the two
        # variants exists can be found out only after mothur has been run
        mothurPredFileName = common.getMothurOutputFilePath(extractedRegionsFasta, taxonomyFile)
        if not os.path.isfile(mothurPredFileName):
            mothurPredFileName = common.getMothurOutputFilePath(extractedRegionsFasta, taxonomyFile,
                                                                suffix='.bayesian.taxonomy')

        #transform mothur prediction files to the tab separated files
        self.mothurPredToTabSepPred(mothurPredFileName, predFileName)