Python runCmdSerial Examples

Programming Language: Python

Namespace/Package Name: algbioi.com.parallel

Method/Function: runCmdSerial

Examples at hotexamples.com: 6

Python runCmdSerial - 6 examples found. These are the top rated real world Python examples of algbioi.com.parallel.runCmdSerial extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: pps_wrap.py Project: algbioi/docker_ppsp

def runPredict(ppsInstallDir, ppsConfigFilePathPy, fastaFile):
    """
        Run the 'predict.py' script located in the 'python_scripts' directory of the PPS installation.

        Files in the directory of the FASTA file whose names contain the FASTA file's base name are
        removed: those ending with '.out' or '.sl' before the run, those ending with '.sl' after it.
        The process is terminated via sys.exit(-1) if the command is reported as failed.

        @param ppsInstallDir: PPS installation directory (contains 'python_scripts')
        @param ppsConfigFilePathPy: configuration file passed to the script via '-c'
        @param fastaFile: FASTA file passed to the script via '-fasta'
    """
    scriptsDir = os.path.join(ppsInstallDir, 'python_scripts')
    predictCmd = 'python %s -c %s -fasta %s ' % (
        os.path.join(scriptsDir, 'predict.py'), ppsConfigFilePathPy, fastaFile)

    fastaDir = os.path.dirname(fastaFile)
    fastaName = os.path.basename(fastaFile)

    def removeMatching(suffixes):
        # delete the files of the FASTA directory that end with one of the suffixes
        # and carry the FASTA base name somewhere in their name
        for entry in os.listdir(fastaDir):
            if entry.endswith(suffixes) and fastaName in entry:
                os.remove(os.path.join(fastaDir, entry))

    # remove files left over from a previous run
    removeMatching(('.out', '.sl'))

    task = parallel.TaskCmd(predictCmd, scriptsDir)
    failed = parallel.reportFailedCmd(parallel.runCmdSerial([task]))
    if failed is not None:
        sys.exit(-1)

    # the '.sl' files are not kept after the run
    removeMatching(('.sl',))

Example #2

Show file

File: pps_wrap.py Project: algbioi/docker_ppsp

def runTrain(ppsInstallDir, ppsConfigFilePathPy):
    """
        Run the 'train.py' script located in the 'python_scripts' directory of the PPS installation.

        The script is started with '-c <config file> -y' from within the scripts directory;
        the process is terminated via sys.exit(-1) if the command is reported as failed.

        @param ppsInstallDir: PPS installation directory (contains 'python_scripts')
        @param ppsConfigFilePathPy: configuration file passed to the script via '-c'
    """
    scriptsDir = os.path.join(ppsInstallDir, 'python_scripts')
    trainCmd = 'python %s -c %s -y' % (os.path.join(scriptsDir, 'train.py'), ppsConfigFilePathPy)

    task = parallel.TaskCmd(trainCmd, scriptsDir)
    failed = parallel.reportFailedCmd(parallel.runCmdSerial([task]))
    if failed is None:
        return
    sys.exit(-1)

Example #3

Show file

File: analysis_mg.py Project: algbioi/docker_ppsp

    def runMarkerGeneAnalysis(self, fastaFileDNA, outLog=None):
        """
            Run hmmer HMM and mothur classify (bayesian), same param as for the 16S analysis.

            For every marker gene listed in self.markerGeneListFile, hmmsearch is run on the protein
            translation of the input sequences, the DNA regions corresponding to the reported hits are
            written (as '>name' / sequence records) to '<geneName>_dna.gff' in self.markerGeneWorkingDir,
            mothur classify.seqs is run on that file and the parsed predictions are stored in
            '<input basename>_<geneName>.mP'. The predictions of all genes are additionally collected
            in '<input basename>_all.mP'.

            The process is terminated via sys.exit(-1) if a command is reported as failed.

            @param fastaFileDNA: path to the input DNA FASTA file
            @param outLog: path to a file that is opened for writing and passed as stdout to the mothur
                task (the hmmsearch task does not receive it); if None, subprocess.STDOUT is passed instead
        """
        #read list of marker genes
        mgFiles = forEachLine(self.markerGeneListFile, _MgFiles(self.markerGeneListFileDir))

        #translate DNA to protein sequences
        fastaFileProt = os.path.join(self.markerGeneWorkingDir, str(os.path.basename(fastaFileDNA) + '.PROT'))
        dnaToProt(fastaFileDNA, fastaFileProt)

        #read DNA fasta file
        # NOTE(review): the handle is not closed if SeqIO raises while parsing
        try:
            handle = open(fastaFileDNA, "rU")
            dnaSeqDict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
            handle.close()
        except Exception:
            sys.stderr.write(str('Cannot read file: ' + str(fastaFileDNA)))
            raise

        #to output all predictions in one file
        outPredAllFileName = os.path.join(self.markerGeneWorkingDir,
                                           str(os.path.basename(fastaFileDNA) + '_all.mP'))
        outAllBuffer = OutFileBuffer(outPredAllFileName)

        #run HMM search
        mgList = mgFiles.getGeneNameList()

        # NOTE(review): subprocess.STDOUT is documented as a value for the stderr argument of Popen;
        # whether TaskCmd accepts it as stdout cannot be told from here - confirm
        if outLog is not None:
            stdoutLog = open(outLog, 'w')
        else:
            stdoutLog = subprocess.STDOUT

        #for each gene perform the analysis separately
        for geneName in mgList:

            # only the primary HMM profile is used, the secondary one is commented out;
            # the arrays therefore hold exactly one entry each
            domFileArray = [os.path.join(self.markerGeneWorkingDir, str(geneName + '_1.dom'))]  #,
                            # os.path.join(self.markerGeneWorkingDir, str(geneName + '_2.dom'))]
            outFileArray = [os.path.join(self.markerGeneWorkingDir, str(geneName + '_1.out'))]  #,
                            # os.path.join(self.markerGeneWorkingDir, str(geneName + '_2.out'))]
            hmmFileArray = [mgFiles.getFilePath(geneName, 'hmmPROTPrim')]  #,
                            # mgFiles.getFilePath(geneName, 'hmmPROTSec')]
            cmdArray = list([])

            #define cmd
            # NOTE(review): self.processorsHmm is appended right after '-E 0.01' without a separator,
            # presumably it starts with a space - confirm
            for i in range(1):
                if hmmFileArray[i] is not None:
                    cmdArray.append(str(os.path.join(self.hmmerBinDir, 'hmmsearch') + ' --domtblout ' + domFileArray[i] + ' -E 0.01' + self.processorsHmm
                               + ' -o ' + outFileArray[i] + ' ' + hmmFileArray[i] + ' ' + fastaFileProt))
                else:
                    cmdArray.append(None)

            #run cmd
            for cmd in cmdArray:
                if cmd is not None and os.name == 'posix':

                    cwd = self.hmmInstallDir

                    if parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd)])) is not None:
                        sys.exit(-1)

                    # hmmProc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=self.hmmInstallDir, stdout=stdoutLog)
                    # print 'run cmd:', cmd
                    # hmmProc.wait()
                    # print 'HMM  return code:', hmmProc.returncode
                    # if hmmProc.returncode != 0:
                    #     raise Exception("Command returned with non-zero %s status: %s" % (hmmProc.returncode, cmd))


                else:
                    # this branch is also taken when cmd is None (no HMM profile for the gene)
                    print 'Marker genes analysis, doesn`t run (no posix): ', cmd


            #get regions that match to the HMM profile ()
            # NOTE(review): the dom file is parsed whenever a command was defined,
            # even if it was not executed because the system is not posix
            entryDictList = []
            for i in range(1):
                if cmdArray[i] is not None:
                    entryDictList.append(forEachLine(domFileArray[i], _MgRegions()).getEntryDict())
                else:
                    entryDictList.append(None)

            # NOTE(review): entryDict1 is None if there is no HMM profile for the gene,
            # iterating over it below would then raise - confirm this cannot happen
            entryDict1 = entryDictList[0]
            # entryDict2 = entryDictList[1]

            #extract regions found in the protein sequences that were found by the HMM and generate corresponding DNA sequences
            regionDnaFasta = os.path.join(self.markerGeneWorkingDir, str(geneName + '_dna.gff'))
            outFileBuffer = OutFileBuffer(regionDnaFasta)

            for seqName in entryDict1:
                # i is the index of the current entry e (e itself is not used)
                i = -1
                for e in entryDict1[seqName]:
                    i += 1
                    from1 = entryDict1[seqName][i][0]
                    to1 = entryDict1[seqName][i][1]
                    assert ((from1 != None) and (to1 != None))
                    #compare the results found by the primary and secondary HMM profiles
                    # if (entryDict2 != None) and (seqName in entryDict2):
                    #     if len(entryDict2[seqName]) >= (i+1):
                    #         from2 = entryDict2[seqName][i][0]
                    #         to2 = entryDict2[seqName][i][1]
                            #if from1 != from2 or to1 != to2:
                            #    print str('Different positions in' + seqName + ' from1:' + str(from1) + ' from2:' + str(from2)
                            #                + ' to1:' + str(to1) + ' to2:' + str(to2))

                    #extract regions from the DNA sequences (consider 3 ORF and reverse complements)

                    #name of the whole sequence
                    # (the trailing '_p<frame>' / '_pr<frame>' tag of the protein sequence name is removed)
                    dnaSeqName = re.sub(r'([0-9]+_[0-9]+)_[pr]+[012]', r'\1', seqName)
                    #whole DNA sequence
                    dnaSeq = dnaSeqDict[dnaSeqName].seq

                    #reverse complement (contains "pr")
                    tagRev = 'p'
                    if re.match(r'[0-9]+_[0-9]+_pr[012]', seqName):
                        dnaSeq = dnaSeq.reverse_complement()
                        tagRev = 'pr'

                    # the protein positions (from1, to1) are converted to DNA slice positions:
                    # times 3 and moved by the frame shift (0, 1 or 2) encoded in the sequence name

                    #shift "0"
                    if re.match(r'[0-9]+_[0-9]+_[pr]+0', seqName):
                        tagFrom = ((from1 - 1)*3)
                        tagTo = (to1*3)
                        tagRev += '0'
                        dnaSeq = dnaSeq[tagFrom:tagTo]

                    #shift "1"
                    elif re.match(r'[0-9]+_[0-9]+_[pr]+1', seqName):
                        tagFrom = (((from1 - 1)*3) + 1)
                        tagTo = ((to1*3) + 1)
                        tagRev += '1'
                        dnaSeq = dnaSeq[tagFrom:tagTo]

                    #shift "2"
                    elif re.match(r'[0-9]+_[0-9]+_[pr]+2', seqName):
                        tagFrom = (((from1 - 1)*3) + 2)
                        tagTo = ((to1*3) + 2)
                        tagRev += '2'
                        dnaSeq = dnaSeq[tagFrom:tagTo]

                    #error
                    # NOTE(review): this branch does not skip the entry; execution continues below with
                    # dnaSeq = None and with tagFrom/tagTo either unset or left over from a previous entry
                    else:
                        sys.stderr.write('Wrong seq name: ' + seqName + ' \n')
                        dnaSeq = None

                    tag = str(str(tagFrom) + '_' + str(tagTo) + '_' + tagRev)
                    outFileBuffer.writeText(str('>' + dnaSeqName + '_' + tag + '\n' + dnaSeq + '\n'))

            outFileBuffer.close()

            #if no marker gene found
            if outFileBuffer.isEmpty():
                continue

            #run mothur classify (bayesian? the same as for the 16S analysis)
            templateFile = mgFiles.getFilePath(geneName, 'templateDNA')
            taxonomyFile = mgFiles.getFilePath(geneName, 'taxonomyDNA')
            assert ((templateFile is not None) and (taxonomyFile is not None))
            cmd = str('' + self.mothur + ' "#classify.seqs(fasta=' + regionDnaFasta + ', template=' + templateFile
                + ', taxonomy=' +  taxonomyFile + ', ' + self.mothurParam + ')"')
            if os.name == 'posix':

                print('Mothur processing: %s' % os.path.basename(templateFile).split('_', 1)[0])

                cwd = self.markerGeneWorkingDir

                # NOTE(review): stdoutLog stays open if sys.exit(-1) is reached here
                if parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd, stdout=stdoutLog)])) is not None:
                    sys.exit(-1)

                # mothurProc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=self.markerGeneWorkingDir, stdout=stdoutLog)
                # print 'run cmd:', cmd
                # mothurProc.wait()
                # print 'mothur return code:', mothurProc.returncode
                # if mothurProc.returncode != 0:
                #     raise Exception("Command returned with non-zero %s status: %s" % (mothurProc.returncode, cmd))

            else:
                print 'Cannot run mothur since your system is not "posix" but', str('"' + os.name + '"'), '\n', cmd

            #transform the mothur output to a simple output (name, ncbid, weight)

            #mothurPredFileName = os.path.join(self.markerGeneWorkingDir,
            #                                  str(geneName + '_dna.' + os.path.basename(taxonomyFile) + 'onomy'))  # taxonomy
            #!!!!!!!!!!!!!
            # the output file is looked up under the default name first, then with the
            # '.bayesian.taxonomy' suffix
            # NOTE(review): if neither file exists, only a message is printed and the
            # missing file is still passed to forEachLine below
            mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile)
            if not os.path.isfile(mothurPredFileName):
                mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile, suffix='.bayesian.taxonomy')
                if not os.path.isfile(mothurPredFileName):
                    print("Can't open file: %s" % mothurPredFileName)

            outPredFileName = os.path.join(self.markerGeneWorkingDir,
                                           str(os.path.basename(fastaFileDNA) + '_' + geneName + '.mP'))
            outBuffer = OutFileBuffer(outPredFileName, bufferText=True)
            forEachLine(mothurPredFileName, _MothurOutFileParser(outBuffer, geneName))

            # predictions of the individual genes are separated by a newline in the collective file
            if not outAllBuffer.isEmpty():
                outAllBuffer.writeText('\n')
            outAllBuffer.writeText(outBuffer.getTextBuffer())

        if outLog is not None:
            stdoutLog.close()
        outAllBuffer.close()

Example #4

Show file

File: run.py Project: algbioi/snowball

def mainSnowball(
    fq1Path,
    fq2Path,
    profileHmmFile,
    insertSize,
    readLen=None,
    outFile=None,
    outFormat="fna",
    workingDir=None,
    hmmsearchPath=None,
    pfamMinScore=None,
    pOverlap=None,
    overlapLen=None,
    outAnnot=False,
    cleanUp=False,
    processors=mp.cpu_count(),
):
    """
        Main function, the interface of the Snowball gene assembler.

        The function runs these steps in turn: joining of the paired-end reads into consensus reads,
        translation of the consensus reads into protein sequences, hmmsearch, assignment of the reads
        to gene domains, assembly of each gene domain (in parallel) and writing of the output file.

        All input checks are implemented as assert statements, i.e. an AssertionError is raised
        if a check fails.

        @param fq1Path: FASTQ 1 file path (containing first ends of Illumina paired-end reads)
        @param fq2Path: FASTQ 2 file path (containing second ends)
        @param profileHmmFile: profile HMMs file, containing models generated by the HMMER 3 software
        @param insertSize: mean insert size used for the library preparation (i.e. read generation)
        @param readLen: read length (if None, it will be derived from the FASTQ file)
        @param outFile: output file (if None, it will be derived from the FASTQ 1 file)
        @param outFormat: output file format 'fna' or 'fq'
        @param workingDir: temporary files will be stored here (if None, a temporary directory will be created/deleted)
        @param hmmsearchPath: path to the HMMER hmmsearch command (if None, it will take version that is in the PATH)
        @param pfamMinScore: minimum score for the hmmsearch (if None, use default)
        @param pOverlap: minimum overlap probability for the Snowball algorithm
        @param overlapLen: minimum overlap length for the Snowball algorithm
        @param outAnnot: if true, additional annotation will be stored along with the resulting contigs
        @param cleanUp: if true, delete temporary files at the end
        @param processors: Number of processors (default: use all processors available)

        @type fq1Path: str
        @type fq2Path: str
        @type profileHmmFile: str
        @type insertSize: int
        @type readLen: int
        @type outFile: str
        @type outFormat: str
        @type workingDir: str
        @type hmmsearchPath: str
        @type pfamMinScore: int
        @type pOverlap: float
        @type overlapLen: float
        @type outAnnot: bool
        @type cleanUp: bool
        @type processors: int
    """
    # NOTE(review): the checks below are assert statements, they are skipped when Python runs with -O
    assert os.name == "posix", 'Snowball runs only on "posix" systems, your system is: %s' % os.name

    # checking input parameters
    assert os.path.isfile(fq1Path), "File does not exist: %s" % fq1Path
    assert os.path.isfile(fq2Path), "File does not exist: %s" % fq2Path

    # derive the read length
    # (only the first record of the FASTQ 1 file is inspected)
    if readLen is None:
        for name, dna, p, qs in fq.ReadFqGen(fq1Path):
            readLen = len(dna)
            assert readLen == len(qs), "File corrupted %s" % fq1Path
            break
    assert readLen is not None, "Cannot derive read length from %s" % fq1Path

    # the insert size must be at least one read length and less than two read lengths
    assert readLen <= insertSize < 2 * readLen, "Invalid read length (%s) and insert size (%s) combination" % (
        readLen,
        insertSize,
    )

    assert os.path.isfile(profileHmmFile), "File does not exist: %s" % profileHmmFile

    outFormat = outFormat.strip()
    assert outFormat == "fna" or outFormat == "fq", "Invalid output format: %s" % outFormat

    # checking the output file
    if outFile is None:
        # take the first '<fq1Path>_<counter>.<outFormat>.gz' name that does not exist yet
        c = 0
        while True:
            outFile = fq1Path + "_%s.%s.gz" % (c, outFormat)
            if not os.path.isfile(outFile):
                break
            c += 1
    else:
        # NOTE(review): the name and directory checks are done before outFile is stripped
        outFileDir = os.path.dirname(outFile)
        assert os.path.basename(outFile) != "", "Output file name is empty"
        assert outFileDir == "" or os.path.isdir(outFileDir), "Invalid output directory: %s" % outFileDir
        outFile = outFile.strip()
        if not outFile.endswith(".gz"):
            outFile += ".gz"
            print("The name of the output file was modified to:\n\t%s" % outFile)

    # Looking for the hmmsearch binaries
    if hmmsearchPath is None:
        hmmsearchPath = os.popen("which hmmsearch").read().strip()
        if hmmsearchPath != "":
            print("This hmmsearch binary will be used:\n\t%s" % hmmsearchPath)

    assert os.path.isfile(hmmsearchPath), "Path for (hmmsearch) is invalid: %s" % hmmsearchPath

    # creates a temporary working directory
    # (cleenUpTmpWorkingDir remembers that the directory was created here and is removed at the end)
    if workingDir is None:
        workingDir = tempfile.mkdtemp(prefix="snowball_")
        assert os.path.isdir(workingDir), "Cannot create temporary working directory (%s)" % workingDir
        cleenUpTmpWorkingDir = True
        print("Using temporary directory:\n\t%s" % workingDir)
    else:
        cleenUpTmpWorkingDir = False

    # NOTE(review): the working directory has to be empty, which looks incompatible with resuming
    # a run by switching off the individual steps below - confirm
    assert os.path.isdir(workingDir), "Working directory does not exist:\n\t%s" % workingDir
    assert not os.listdir(workingDir), "Working directory must be empty:\n\t%s" % workingDir

    # set the number of processor cores to be used
    # (at least 1, at most the number of available cores; stored as an attribute of the comh module)
    comh.MAX_PROC = max(1, min(processors, mp.cpu_count()))

    # set assembly parameters or use defaults
    # (the values given override the corresponding attributes of the comh module)
    if pfamMinScore is not None:
        comh.SAMPLES_PFAM_EVAN_MIN_SCORE = pfamMinScore

    if pOverlap is not None:
        comh.ASSEMBLY_POVERLAP = (pOverlap,)

    if overlapLen is not None:
        comh.ASSEMBLY_OVERLAP_LEN = (overlapLen,)

    # creates a temporary directory for the sample strains
    strainsDir = os.path.join(workingDir, "strains")
    if not os.path.isdir(strainsDir):
        os.mkdir(strainsDir)
    assert os.path.isdir(strainsDir), "Cannot create temporary directory:\n\t%s" % strainsDir

    # the input files are made available in the strains directory under fixed names
    os.symlink(fq1Path, os.path.join(strainsDir, "0_pair1.fq.gz"))
    os.symlink(fq2Path, os.path.join(strainsDir, "0_pair2.fq.gz"))

    # Start of the algorithm
    # NOTE(review): platform.dist() was removed in Python 3.8
    print("Running on: %s (%s)" % (" ".join(platform.dist()), sys.platform))
    print("Using %s processors" % comh.MAX_PROC)
    print(
        "Settings:\n\tRead length: %s\n\tInsert size: %s\n\tMin. overlap probability: %s\n\tMin. overlap length: %s"
        "\n\tMin. HMM score: %s"
        % (
            readLen,
            insertSize,
            comh.ASSEMBLY_POVERLAP[0],
            comh.ASSEMBLY_OVERLAP_LEN[0],
            comh.SAMPLES_PFAM_EVAN_MIN_SCORE,
        )
    )

    # file with joined consensus reads
    fqJoinPath = os.path.join(strainsDir, "0_join.fq.gz")

    # join paired-end reads
    if True:  # to skip this step, set to False (e.g. resume processing after OS/HW failure)
        print("Joining paired-end reads into consensus reads, loading reads from:\n\t%s\n\t%s" % (fq1Path, fq2Path))

        r = fq.joinPairEnd(
            [(fq1Path, fq2Path, fqJoinPath, readLen, insertSize, None, 60)],
            minOverlap=comh.SAMPLES_PAIRED_END_JOIN_MIN_OVERLAP,
            minOverlapIdentity=comh.SAMPLES_PAIRED_END_JOIN_MIN_OVERLAP_IDENTITY,
            maxCpu=comh.MAX_PROC,
        )
        print("Filtered out: %s %% reads" % r)

    # Translate consensus reads into protein sequences, run hmmsearch
    if True:  # to skip this step, set to False (e.g. resume processing after OS/HW failure)
        print("Translating reads to protein sequences")
        # file with protein consensus read sequences
        joinFastaProtGzip = os.path.join(strainsDir, "0_join_prot.fna.gz")
        fq.readsToProt(fqJoinPath, joinFastaProtGzip, comh.TRANSLATION_TABLE)

        print("Running HMMER (hmmsearch)")
        domOut = os.path.join(strainsDir, "0_join_prot.domtblout")
        # name of the decompressed protein file (the '.gz' extension is cut off)
        joinFastaProt = joinFastaProtGzip[:-3]

        # shell command: decompress the protein file, run hmmsearch on it, remove the decompressed
        # file and gzip the domain table; the parts are chained with ';'
        cmd = "zcat %s > %s;%s -o /dev/null --noali --domtblout %s -E 0.01 " "--cpu %s %s %s;rm %s;gzip -f %s" % (
            joinFastaProtGzip,
            joinFastaProt,
            hmmsearchPath,
            domOut,
            comh.MAX_PROC,
            profileHmmFile,
            joinFastaProt,
            joinFastaProt,
            domOut,
        )

        # NOTE(review): the command is executed inside of an assert statement, i.e. it would not
        # be run at all with -O
        assert parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(cmd, strainsDir)])) is None

    # Assign consensus reads to individual gene domains
    if True:  # to skip this step, set to False (e.g. resume processing after OS/HW failure)
        print("Assigning consensus reads to gene domains")
        hio.partitionReads(
            workingDir,
            comh.SAMPLES_PFAM_EVAN_MIN_SCORE,
            comh.SAMPLES_PFAM_EVAN_MIN_ACCURACY,
            comh.SAMPLES_SHUFFLE_RAND_SEED,
            comh.SAMPLES_PFAM_PARTITIONED_DIR,
            True,
            False,
        )

    partitionedDir = os.path.join(workingDir, comh.SAMPLES_PFAM_PARTITIONED_DIR)

    # Run Assembly
    if True:  # to skip this step, set to False (e.g. resume processing after OS/HW failure)
        print("Running Snowball assembly")

        # collect tasks for each gene domain
        taskList = []
        assert os.path.isdir(partitionedDir), "Temporary directory does not exist:\n\t%s" % partitionedDir
        for f in os.listdir(partitionedDir):
            fPath = os.path.join(partitionedDir, f)
            if f.endswith("join.fq.gz") and os.path.isfile(fPath):
                # the names of the other files of the domain are derived from the path
                # without its '.fq.gz' extension
                base = fPath[:-6]
                inFq = fPath
                inDomtblout = "%s_prot.domtblout.gz" % base
                inProtFna = "%s_prot.fna.gz" % base
                outPath = "%s_read_rec.pkl.gz" % base
                taskList.append(
                    parallel.TaskThread(
                        hmain.buildSuperReads,
                        (
                            inFq,
                            inDomtblout,
                            inProtFna,
                            outPath,
                            comh.ASSEMBLY_CONSIDER_PROT_COMP,
                            comh.ASSEMBLY_ONLY_POVERLAP,
                            comh.ASSEMBLY_POVERLAP,
                            comh.ASSEMBLY_OVERLAP_LEN,
                            comh.ASSEMBLY_OVERLAP_ANNOT_LEN,
                            comh.ASSEMBLY_STOP_OVERLAP_MISMATCH,
                            comh.ASSEMBLY_MAX_LOOPS,
                            comh.TRANSLATION_TABLE,
                        ),
                    )
                )
        # run tasks in parallel
        parallel.runThreadParallel(taskList, comh.MAX_PROC, keepRetValues=False)

    # Creates the output file
    if True:  # to skip this step, set to False (e.g. resume processing after OS/HW failure)
        print("Creating output file:\n\t%s" % outFile)
        # running number of the contigs over all gene domains
        counter = 0
        # NOTE(review): out is not closed if an exception is raised while the records are written
        out = fq.WriteFq(outFile)
        for f in os.listdir(partitionedDir):
            fPath = os.path.join(partitionedDir, f)
            if f.endswith(".pkl.gz") and os.path.isfile(fPath):
                # presumably removes a 2 character prefix and a 23 character suffix of the file name
                # to get the domain name - verify against the file naming of hio.partitionReads
                domName = f[2:-23]
                for rec in hio.loadReadRec(fPath):
                    counter += 1
                    contigName = "contig_%s_%s" % (counter, domName)
                    dnaSeq = rec.dnaSeq

                    # get the quality score string
                    if outAnnot or outFormat == "fq":
                        qs = rec.qsArray.getQSStr(dnaSeq)
                    else:
                        qs = None

                    # get the contig annotations
                    if outAnnot:
                        assert qs is not None
                        codingStart = rec.annotStart
                        codingLen = rec.annotLen
                        posCov = ",".join(map(lambda x: str(int(x)), rec.getPosCovArray()))
                        annotStr = "domName:%s|codingStart:%s|codingLen:%s|qs:%s|posCov:%s" % (
                            domName,
                            codingStart,
                            codingLen,
                            qs,
                            posCov,
                        )
                    else:
                        annotStr = ""

                    # write an entry to the output file
                    if outFormat == "fq":
                        out.writeFqEntry("@" + contigName, dnaSeq, qs, annotStr)
                    else:
                        assert outFormat == "fna"
                        # in the FASTA header, the annotation is separated from the contig name by '|'
                        if outAnnot:
                            annotStr = "|" + annotStr
                        out.write(">%s%s\n%s\n" % (contigName, annotStr, dnaSeq))

        # close output file
        out.close()

    # Clean up the working directory
    if cleenUpTmpWorkingDir:
        # clean up the temporary directory
        print("Cleaning up temporary directory")
        assert os.path.isdir(workingDir), "Directory to be cleaned does not exist:\n%s" % workingDir
        shutil.rmtree(workingDir)
    elif cleanUp:
        # clean up the user defined working directory
        # (only the two subdirectories created by this function are removed, not the directory itself)
        if os.path.isdir(workingDir):
            print("Cleaning up working directory:\n\t%s" % workingDir)
            shutil.rmtree(os.path.join(workingDir, comh.SAMPLES_PFAM_PARTITIONED_DIR))
            shutil.rmtree(strainsDir)

    print("Done")

Example #5

Show file

File: analysis16s.py Project: algbioi/docker_ppsp

    def runHMM(self, inputFastaFile, outLog=None):
        """
            Run the hidden markov model to get regions in the input sequences where the 16S and 23S genes are located.

            The 'rna_hmm3.py' script is run on the input file (posix systems only) and writes the regions
            to '<working dir>/<input basename>.gff'. For each region listed there, the corresponding
            subsequence of the input sequence (reverse complemented for the '-' strand) is appended to
            '<working dir>/<input basename>.<gene>.fna', where <gene> is the 9th column of the regions file.

            The process is terminated via sys.exit if the command is reported as failed (exit code -1)
            or if a region has a strand symbol other than '+' or '-' (exit code 1).

            @param inputFastaFile: path to the input FASTA file
            @param outLog: currently not used (it was only referenced by a former subprocess based
                implementation)
        """
        processors = self._config.get('processors')
        if processors is not None:
            processors = ' -p %s ' % processors
        else:
            processors = ''

        hmmInstallDir = os.path.normpath(self._config.get('rnaHmmInstallDir'))
        hmmerBinDir = os.path.normpath(self._config.get('hmmerBinDir'))
        regionsFile = os.path.join(self._workingDir, str(os.path.basename(inputFastaFile) + '.gff'))
        # the hmmer binaries directory is put on the PATH for the script
        cmd = str('export PATH=' + hmmerBinDir + ':$PATH;' + os.path.join(hmmInstallDir, 'rna_hmm3.py') +
                  ' -i ' + inputFastaFile + ' -o ' + regionsFile + processors)
        if os.name == 'posix':
            cwd = self._config.get('rnaHmmInstallDir')
            if parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd)])) is not None:
                sys.exit(-1)
        else:
            # single-argument print() call: valid for Python 2 as well as for Python 3
            print('Cannot run HMM since your system is not "posix" but "%s" \n%s' % (os.name, cmd))

        # the handle is closed even if the parsing fails
        # NOTE(review): the "rU" mode is Python 2 style (it is rejected by Python 3.11+)
        with open(inputFastaFile, "rU") as handle:
            record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))

        trunkoutputfilename = os.path.join(self._workingDir, os.path.basename(inputFastaFile))

        # parse results file line by line
        with open(regionsFile, "rU") as regions:
            for line in regions:
                # skip comment lines and (to avoid an IndexError on the columns) blank lines
                if line[0] == "#" or not line.strip():
                    continue

                fields = line.split()
                ident = fields[0]
                start = int(fields[3])
                stop = int(fields[4])
                strand = fields[6]
                gene = fields[8]
                seq = record_dict[ident].seq

                # start is 1-based and stop inclusive, hence the slice [start-1:stop]
                if strand == "+":
                    subseq = seq[start - 1:stop]
                elif strand == "-":
                    subseq = seq[start - 1:stop].reverse_complement()
                else:
                    sys.stderr.write(" analysis16s: invalid strand symbol")
                    sys.exit(1)

                # regions of the same gene are collected in one file (append mode)
                with open(trunkoutputfilename + "." + gene + ".fna", "a") as outfile:
                    outfile.write(">%s_%i_%i_%s\n" % (ident, start, stop, strand))
                    outfile.write("%s\n" % subseq)

Example #6

Show file

File: analysis16s.py Project: algbioi/docker_ppsp

    def _classify(self, mode, inputFastaFile, outLog=None):
        """
            Run mothur classify.seqs on the extracted rRNA regions of one gene and transform the
            mothur prediction file to a tab separated prediction file.

            The extracted regions are read from '<working dir>/<input basename>.<gene>.fna' and the
            result is written to '<working dir>/<input basename>.<16P|23P|5P>'. Mothur is run only on
            posix systems; the process is terminated via sys.exit(-1) if the command is reported as
            failed.

            @param mode: gene to be classified: 16 (16S_rRNA), 23 (23S_rRNA) or 5 (5S_rRNA)
            @param inputFastaFile: path to the input FASTA file (only its base name is used)
            @param outLog: path to a file that is opened for writing and passed as stdout to the mothur
                task; if None, subprocess.STDOUT is passed instead

            @raise Exception: if the mode is not one of 16, 23, 5
        """
        mothur = os.path.join(os.path.normpath(self._config.get('mothurInstallDir')), 'mothur')

        # the three modes differ only in the gene name and in the suffix of the prediction file
        if mode == 16:
            geneName, predSuffix = '16S_rRNA', '.16P'
        elif mode == 23:
            geneName, predSuffix = '23S_rRNA', '.23P'
        elif mode == 5:
            geneName, predSuffix = '5S_rRNA', '.5P'
        else:
            raise Exception('Wrong branch')

        inputBaseName = os.path.basename(inputFastaFile)
        extractedRegionsFasta = os.path.join(self._workingDir, str(inputBaseName + '.' + geneName + '.fna'))
        taxonomyFile = os.path.join(self._refDir, os.path.normpath(self._refDict[(geneName, 'taxonomyDNA')][0]))
        templateFile = os.path.join(self._refDir, os.path.normpath(self._refDict[(geneName, 'templateDNA')][0]))
        predFileName = os.path.join(self._workingDir, str(inputBaseName + predSuffix))

        param = self._config.get('mothurClassifyParamOther')

        cmd = str(mothur + ' "#classify.seqs(fasta=' + extractedRegionsFasta + ', template=' + templateFile
                + ', taxonomy=' + taxonomyFile + ', ' + param + ')"')

        if os.name == 'posix':

            print('Mothur processing: %s' % os.path.basename(templateFile).split('_', 1)[0])

            cwd = self._workingDir

            # NOTE(review): subprocess.STDOUT is documented as a value for the stderr argument of Popen;
            # whether TaskCmd accepts it as stdout cannot be told from here - confirm
            if outLog is not None:
                stdoutLog = open(outLog, 'w')
            else:
                stdoutLog = subprocess.STDOUT

            try:
                if parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd, stdout=stdoutLog)])) is not None:
                    sys.exit(-1)
            finally:
                # close the log file also if the process is about to exit
                if outLog is not None:
                    stdoutLog.close()

        else:
            # single-argument print() call: valid for Python 2 as well as for Python 3
            print('Cannot run mothur since your system is not "posix" but "%s" \n%s' % (os.name, cmd))

        # The name of the mothur output file is resolved only now, i.e. after mothur had the chance
        # to create it (as it is done in the marker gene analysis): the default name is taken if such
        # a file exists, the name with the '.bayesian.taxonomy' suffix otherwise.
        mothurPredFileName = common.getMothurOutputFilePath(extractedRegionsFasta, taxonomyFile)
        if not os.path.isfile(mothurPredFileName):
            mothurPredFileName = common.getMothurOutputFilePath(extractedRegionsFasta, taxonomyFile, suffix='.bayesian.taxonomy')

        #transform mothur prediction files to the tab separated files
        self.mothurPredToTabSepPred(mothurPredFileName, predFileName)