def runPredict(ppsInstallDir, ppsConfigFilePathPy, fastaFile):
    """
        Run the PPS prediction script ('python_scripts/predict.py') for one fasta file.

        Before the run, files in the directory of the fasta file whose name contains the fasta file name and
        ends with '.out' or '.sl' are deleted; after a successful run the '.sl' files are deleted again.
        The process exits with status -1 if the command is reported as failed.

        @param ppsInstallDir: PPS installation directory (contains the 'python_scripts' directory)
        @param ppsConfigFilePathPy: configuration file handed to the script via '-c'
        @param fastaFile: fasta file handed to the script via '-fasta'
    """
    cwd = os.path.join(ppsInstallDir, 'python_scripts')
    scriptPath = os.path.join(cwd, 'predict.py')
    cmd = 'python %s -c %s -fasta %s ' % (scriptPath, ppsConfigFilePathPy, fastaFile)

    # a bare file name has an empty dirname, which os.listdir() rejects -> fall back to the current directory
    fastaDir = os.path.dirname(fastaFile) or os.curdir
    baseName = os.path.basename(fastaFile)

    def removeMatching(suffixes):
        # delete the files that belong to this fasta file and end with one of the suffixes
        for f in os.listdir(fastaDir):
            if f.endswith(suffixes) and baseName in f:
                os.remove(os.path.join(fastaDir, f))

    # remove the results of a previous run
    removeMatching(('.out', '.sl'))

    if parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd)])) is not None:
        sys.exit(-1)

    # remove the '.sl' files left behind by this run
    removeMatching(('.sl',))
def runTrain(ppsInstallDir, ppsConfigFilePathPy):
    """
        Run the PPS training script ('python_scripts/train.py') with the given configuration file.

        The process exits with status -1 if the command is reported as failed.

        @param ppsInstallDir: PPS installation directory (contains the 'python_scripts' directory)
        @param ppsConfigFilePathPy: configuration file handed to the script via '-c'
    """
    scriptsDir = os.path.join(ppsInstallDir, 'python_scripts')
    trainScript = os.path.join(scriptsDir, 'train.py')
    command = 'python %s -c %s -y' % (trainScript, ppsConfigFilePathPy)

    # the scripts directory is handed over as the second TaskCmd argument
    task = parallel.TaskCmd(command, scriptsDir)
    failed = parallel.reportFailedCmd(parallel.runCmdSerial([task]))
    if failed is None:
        return
    sys.exit(-1)
def runMarkerGeneAnalysis(self, fastaFileDNA, outLog=None):
    """
        Run hmmer HMM and mothur classify (bayesian), same param as for the 16S analysis.

        For each marker gene listed in self.markerGeneListFile:
         - hmmsearch is run on the translated input sequences (primary HMM profile of the gene),
         - the DNA regions that correspond to the reported protein regions are written to '<gene>_dna.gff',
         - mothur classify.seqs is run on these regions,
         - the mothur output is parsed into '<fasta name>_<gene>.mP' and the buffered text is appended to
           '<fasta name>_all.mP'.
        All files are stored in self.markerGeneWorkingDir. External commands are run only on "posix" systems;
        the process exits with status -1 if a command is reported as failed.

        @param fastaFileDNA: DNA fasta file to be analysed
        @param outLog: file to which the stdout of mothur is redirected (if None, subprocess.STDOUT is passed)
    """
    # names of the translated sequences: <int>_<int>_<'p' or 'pr'><frame 0|1|2>, "pr" marks the reverse complement
    # NOTE(review): these names presumably come from dnaToProt - confirm
    frameRe = re.compile(r'[0-9]+_[0-9]+_([pr]+)([012])')

    #read list of marker genes
    mgFiles = forEachLine(self.markerGeneListFile, _MgFiles(self.markerGeneListFileDir))

    #translate DNA to protein sequences
    fastaFileProt = os.path.join(self.markerGeneWorkingDir, str(os.path.basename(fastaFileDNA) + '.PROT'))
    dnaToProt(fastaFileDNA, fastaFileProt)

    #read DNA fasta file (the handle is closed also when the parsing fails)
    try:
        with open(fastaFileDNA, "rU") as handle:
            dnaSeqDict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
    except Exception:
        sys.stderr.write(str('Cannot read file: ' + str(fastaFileDNA)))
        raise

    #to output all predictions in one file
    outPredAllFileName = os.path.join(self.markerGeneWorkingDir,
                                      str(os.path.basename(fastaFileDNA) + '_all.mP'))
    outAllBuffer = OutFileBuffer(outPredAllFileName)

    #run HMM search
    mgList = mgFiles.getGeneNameList()

    if outLog is not None:
        stdoutLog = open(outLog, 'w')
    else:
        stdoutLog = subprocess.STDOUT

    try:
        #for each gene perform the analysis separately
        for geneName in mgList:

            # only the primary HMM profile is used (the secondary one was disabled)
            domFile = os.path.join(self.markerGeneWorkingDir, str(geneName + '_1.dom'))
            outFile = os.path.join(self.markerGeneWorkingDir, str(geneName + '_1.out'))
            hmmFile = mgFiles.getFilePath(geneName, 'hmmPROTPrim')

            #define cmd
            if hmmFile is not None:
                cmd = str(os.path.join(self.hmmerBinDir, 'hmmsearch') + ' --domtblout ' + domFile + ' -E 0.01'
                          + self.processorsHmm + ' -o ' + outFile + ' ' + hmmFile + ' ' + fastaFileProt)
            else:
                cmd = None

            #run cmd
            if cmd is not None and os.name == 'posix':
                cwd = self.hmmInstallDir
                if parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd)])) is not None:
                    sys.exit(-1)
            else:
                # single-argument call: prints the same text under Python 2 and Python 3
                print('Marker genes analysis, doesn`t run (no posix):  %s' % (cmd,))

            if cmd is None:
                # no primary HMM profile for this gene -> there are no regions to extract (reported above)
                continue

            #get regions that match to the HMM profile
            entryDict = forEachLine(domFile, _MgRegions()).getEntryDict()

            #extract regions found in the protein sequences that were found by the HMM and generate
            #corresponding DNA sequences
            regionDnaFasta = os.path.join(self.markerGeneWorkingDir, str(geneName + '_dna.gff'))
            outFileBuffer = OutFileBuffer(regionDnaFasta)

            for seqName in entryDict:
                entries = entryDict[seqName]
                if not entries:
                    continue

                frameMatch = frameRe.match(seqName)
                if frameMatch is None:
                    # the strand/frame cannot be derived -> report and skip the regions of this sequence
                    sys.stderr.write('Wrong seq name: ' + seqName + ' \n')
                    continue
                frame = int(frameMatch.group(2))

                #name of the whole sequence
                dnaSeqName = re.sub(r'([0-9]+_[0-9]+)_[pr]+[012]', r'\1', seqName)

                #whole DNA sequence
                dnaSeq = dnaSeqDict[dnaSeqName].seq

                #reverse complement (contains "pr")
                if frameMatch.group(1) == 'pr':
                    dnaSeq = dnaSeq.reverse_complement()
                    tagRev = 'pr' + str(frame)
                else:
                    tagRev = 'p' + str(frame)

                for entry in entries:
                    from1 = entry[0]
                    to1 = entry[1]
                    assert ((from1 is not None) and (to1 is not None))

                    #protein coordinates (1-based) -> DNA coordinates, shifted by the reading frame
                    tagFrom = ((from1 - 1) * 3) + frame
                    tagTo = (to1 * 3) + frame

                    tag = '%s_%s_%s' % (tagFrom, tagTo, tagRev)
                    outFileBuffer.writeText('>%s_%s\n%s\n' % (dnaSeqName, tag, dnaSeq[tagFrom:tagTo]))

            outFileBuffer.close()

            #if no marker gene found
            if outFileBuffer.isEmpty():
                continue

            #run mothur classify (bayesian? the same as for the 16S analysis)
            templateFile = mgFiles.getFilePath(geneName, 'templateDNA')
            taxonomyFile = mgFiles.getFilePath(geneName, 'taxonomyDNA')
            assert ((templateFile is not None) and (taxonomyFile is not None))
            cmd = str('' + self.mothur + ' "#classify.seqs(fasta=' + regionDnaFasta + ', template=' + templateFile
                      + ', taxonomy=' + taxonomyFile + ', ' + self.mothurParam + ')"')

            if os.name == 'posix':
                print('Mothur processing: %s' % os.path.basename(templateFile).split('_', 1)[0])
                cwd = self.markerGeneWorkingDir
                if parallel.reportFailedCmd(
                        parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd, stdout=stdoutLog)])) is not None:
                    sys.exit(-1)
            else:
                print('Cannot run mothur since your system is not "posix" but "%s" \n%s' % (os.name, cmd))

            #transform the mothur output to a simple output (name, ncbid, weight)
            #the name of the mothur output file may have one of two forms, take the one that exists
            mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile)
            if not os.path.isfile(mothurPredFileName):
                mothurPredFileName = common.getMothurOutputFilePath(regionDnaFasta, taxonomyFile,
                                                                    suffix='.bayesian.taxonomy')
                if not os.path.isfile(mothurPredFileName):
                    print("Can't open file: %s" % mothurPredFileName)

            outPredFileName = os.path.join(self.markerGeneWorkingDir,
                                           str(os.path.basename(fastaFileDNA) + '_' + geneName + '.mP'))
            outBuffer = OutFileBuffer(outPredFileName, bufferText=True)
            forEachLine(mothurPredFileName, _MothurOutFileParser(outBuffer, geneName))

            #predictions of the individual genes are separated by a new line
            if not outAllBuffer.isEmpty():
                outAllBuffer.writeText('\n')
            outAllBuffer.writeText(outBuffer.getTextBuffer())
    finally:
        #close the log also if the analysis is aborted
        if outLog is not None:
            stdoutLog.close()

    outAllBuffer.close()
def mainSnowball(
    fq1Path,
    fq2Path,
    profileHmmFile,
    insertSize,
    readLen=None,
    outFile=None,
    outFormat="fna",
    workingDir=None,
    hmmsearchPath=None,
    pfamMinScore=None,
    pOverlap=None,
    overlapLen=None,
    outAnnot=False,
    cleanUp=False,
    processors=mp.cpu_count(),
):
    """
    Main function, the interface of the Snowball gene assembler.

    Steps: validate the parameters, join the paired-end reads into consensus reads, translate them to protein
    sequences and run hmmsearch, assign the consensus reads to gene domains, run the assembly for each gene
    domain, write the resulting contigs to the output file and clean up the working directory.

    Invalid parameters and a failed hmmsearch command are reported via AssertionError.
    Note that the assembly parameters are stored in module level variables of `comh`.

    @param fq1Path: FASTQ 1 file path (containing first ends of Illumina paired-end reads)
    @param fq2Path: FASTQ 2 file path (containing second ends)
    @param profileHmmFile: profile HMMs file, containing models generated by the HMMER 3 software
    @param insertSize: mean insert size used for the library preparation (i.e. read generation)
    @param readLen: read length (if None, it will be derived from the FASTQ file)
    @param outFile: output file (if None, it will be derived from the FASTQ 1 file)
    @param outFormat: output file format 'fna' or 'fq'
    @param workingDir: temporary files will be stored here (if None, a temporary directory will be created/deleted)
    @param hmmsearchPath: path to the HMMER hmmsearch command (if None, it will take version that is in the PATH)
    @param pfamMinScore: minimum score for the hmmsearch (if None, use default)
    @param pOverlap: minimum overlap probability for the Snowball algorithm
    @param overlapLen: minimum overlap length for the Snowball algorithm
    @param outAnnot: if true, additional annotation will be stored along with the resulting contigs
    @param cleanUp: if true, delete temporary files at the end
    @param processors: Number of processors (default: use all processors available)

    @type fq1Path: str
    @type fq2Path: str
    @type profileHmmFile: str
    @type insertSize: int
    @type readLen: int
    @type outFile: str
    @type outFormat: str
    @type workingDir: str
    @type hmmsearchPath: str
    @type pfamMinScore: int
    @type pOverlap: float
    @type overlapLen: float
    @type outAnnot: bool
    @type cleanUp: bool
    @type processors: int
    """
    assert os.name == "posix", 'Snowball runs only on "posix" systems, your system is: %s' % os.name

    # checking input parameters
    assert os.path.isfile(fq1Path), "File does not exist: %s" % fq1Path
    assert os.path.isfile(fq2Path), "File does not exist: %s" % fq2Path

    # derive the read length (from the first entry of the FASTQ 1 file)
    if readLen is None:
        for name, dna, p, qs in fq.ReadFqGen(fq1Path):
            readLen = len(dna)
            assert readLen == len(qs), "File corrupted %s" % fq1Path
            break
    assert readLen is not None, "Cannot derive read length from %s" % fq1Path

    assert readLen <= insertSize < 2 * readLen, "Invalid read length (%s) and insert size (%s) combination" % (
        readLen,
        insertSize,
    )

    assert os.path.isfile(profileHmmFile), "File does not exist: %s" % profileHmmFile

    outFormat = outFormat.strip()
    assert outFormat == "fna" or outFormat == "fq", "Invalid output format: %s" % outFormat

    # checking the output file
    if outFile is None:
        # take the first "<fq1Path>_<counter>.<format>.gz" that does not exist yet
        c = 0
        while True:
            outFile = fq1Path + "_%s.%s.gz" % (c, outFormat)
            if not os.path.isfile(outFile):
                break
            c += 1
    else:
        # strip first, so that the path that is validated is the path that is used
        outFile = outFile.strip()
        outFileDir = os.path.dirname(outFile)
        assert os.path.basename(outFile) != "", "Output file name is empty"
        assert outFileDir == "" or os.path.isdir(outFileDir), "Invalid output directory: %s" % outFileDir
        if not outFile.endswith(".gz"):
            outFile += ".gz"
            print("The name of the output file was modified to:\n\t%s" % outFile)

    # Looking for the hmmsearch binaries
    if hmmsearchPath is None:
        whichPipe = os.popen("which hmmsearch")
        try:
            hmmsearchPath = whichPipe.read().strip()
        finally:
            whichPipe.close()
        if hmmsearchPath != "":
            print("This hmmsearch binary will be used:\n\t%s" % hmmsearchPath)
    assert os.path.isfile(hmmsearchPath), "Path for (hmmsearch) is invalid: %s" % hmmsearchPath

    # creates a temporary working directory
    if workingDir is None:
        workingDir = tempfile.mkdtemp(prefix="snowball_")
        assert os.path.isdir(workingDir), "Cannot create temporary working directory (%s)" % workingDir
        cleanUpTmpWorkingDir = True
        print("Using temporary directory:\n\t%s" % workingDir)
    else:
        cleanUpTmpWorkingDir = False
        assert os.path.isdir(workingDir), "Working directory does not exist:\n\t%s" % workingDir
        assert not os.listdir(workingDir), "Working directory must be empty:\n\t%s" % workingDir

    # set the number of processor cores to be used
    comh.MAX_PROC = max(1, min(processors, mp.cpu_count()))

    # set assembly parameters or use defaults
    if pfamMinScore is not None:
        comh.SAMPLES_PFAM_EVAN_MIN_SCORE = pfamMinScore

    if pOverlap is not None:
        comh.ASSEMBLY_POVERLAP = (pOverlap,)

    if overlapLen is not None:
        comh.ASSEMBLY_OVERLAP_LEN = (overlapLen,)

    # creates a temporary directory for the sample strains
    strainsDir = os.path.join(workingDir, "strains")
    if not os.path.isdir(strainsDir):
        os.mkdir(strainsDir)
        assert os.path.isdir(strainsDir), "Cannot create temporary directory:\n\t%s" % strainsDir

    # a relative link target is resolved relative to the directory of the link, not to the current working
    # directory, therefore the links must point to absolute paths
    os.symlink(os.path.abspath(fq1Path), os.path.join(strainsDir, "0_pair1.fq.gz"))
    os.symlink(os.path.abspath(fq2Path), os.path.join(strainsDir, "0_pair2.fq.gz"))

    # Start of the algorithm
    # platform.dist() was removed in Python 3.8, use a generic description if it is not available
    distFunc = getattr(platform, "dist", None)
    if distFunc is not None:
        osDescription = " ".join(distFunc())
    else:
        osDescription = platform.platform()
    print("Running on: %s (%s)" % (osDescription, sys.platform))
    print("Using %s processors" % comh.MAX_PROC)
    print(
        "Settings:\n\tRead length: %s\n\tInsert size: %s\n\tMin. overlap probability: %s\n\tMin. overlap length: %s"
        "\n\tMin. HMM score: %s"
        % (
            readLen,
            insertSize,
            comh.ASSEMBLY_POVERLAP[0],
            comh.ASSEMBLY_OVERLAP_LEN[0],
            comh.SAMPLES_PFAM_EVAN_MIN_SCORE,
        )
    )

    # file with joined consensus reads
    fqJoinPath = os.path.join(strainsDir, "0_join.fq.gz")

    # join paired-end reads
    if True:  # to skip this step, set to False (e.g. resume processing after OS/HW failure)
        print("Joining paired-end reads into consensus reads, loading reads from:\n\t%s\n\t%s" % (fq1Path, fq2Path))
        r = fq.joinPairEnd(
            [(fq1Path, fq2Path, fqJoinPath, readLen, insertSize, None, 60)],
            minOverlap=comh.SAMPLES_PAIRED_END_JOIN_MIN_OVERLAP,
            minOverlapIdentity=comh.SAMPLES_PAIRED_END_JOIN_MIN_OVERLAP_IDENTITY,
            maxCpu=comh.MAX_PROC,
        )
        print("Filtered out: %s %% reads" % r)

    # Translate consensus reads into protein sequences, run hmmsearch
    if True:  # to skip this step, set to False (e.g. resume processing after OS/HW failure)
        print("Translating reads to protein sequences")

        # file with protein consensus read sequences
        joinFastaProtGzip = os.path.join(strainsDir, "0_join_prot.fna.gz")
        fq.readsToProt(fqJoinPath, joinFastaProtGzip, comh.TRANSLATION_TABLE)

        print("Running HMMER (hmmsearch)")
        domOut = os.path.join(strainsDir, "0_join_prot.domtblout")
        joinFastaProt = joinFastaProtGzip[:-3]  # without the ".gz" extension

        # decompress the protein sequences, run hmmsearch, delete the decompressed file, compress the result
        # NOTE(review): the second TaskCmd argument is presumably the working directory of the command
        # (confirm); absolute paths keep the command valid for relative input paths in either case
        cmd = "zcat %s > %s;%s -o /dev/null --noali --domtblout %s -E 0.01 " "--cpu %s %s %s;rm %s;gzip -f %s" % (
            joinFastaProtGzip,
            joinFastaProt,
            os.path.abspath(hmmsearchPath),
            domOut,
            comh.MAX_PROC,
            os.path.abspath(profileHmmFile),
            joinFastaProt,
            joinFastaProt,
            domOut,
        )
        assert parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(cmd, strainsDir)])) is None

    # Assign consensus reads to individual gene domains
    if True:  # to skip this step, set to False (e.g. resume processing after OS/HW failure)
        print("Assigning consensus reads to gene domains")
        hio.partitionReads(
            workingDir,
            comh.SAMPLES_PFAM_EVAN_MIN_SCORE,
            comh.SAMPLES_PFAM_EVAN_MIN_ACCURACY,
            comh.SAMPLES_SHUFFLE_RAND_SEED,
            comh.SAMPLES_PFAM_PARTITIONED_DIR,
            True,
            False,
        )

    partitionedDir = os.path.join(workingDir, comh.SAMPLES_PFAM_PARTITIONED_DIR)

    # Run Assembly
    if True:  # to skip this step, set to False (e.g. resume processing after OS/HW failure)
        print("Running Snowball assembly")

        # collect tasks for each gene domain
        taskList = []
        assert os.path.isdir(partitionedDir), "Temporary directory does not exist:\n\t%s" % partitionedDir
        for f in os.listdir(partitionedDir):
            fPath = os.path.join(partitionedDir, f)
            if f.endswith("join.fq.gz") and os.path.isfile(fPath):
                base = fPath[:-6]  # without the ".fq.gz" extension
                inFq = fPath
                inDomtblout = "%s_prot.domtblout.gz" % base
                inProtFna = "%s_prot.fna.gz" % base
                outPath = "%s_read_rec.pkl.gz" % base

                taskList.append(
                    parallel.TaskThread(
                        hmain.buildSuperReads,
                        (
                            inFq,
                            inDomtblout,
                            inProtFna,
                            outPath,
                            comh.ASSEMBLY_CONSIDER_PROT_COMP,
                            comh.ASSEMBLY_ONLY_POVERLAP,
                            comh.ASSEMBLY_POVERLAP,
                            comh.ASSEMBLY_OVERLAP_LEN,
                            comh.ASSEMBLY_OVERLAP_ANNOT_LEN,
                            comh.ASSEMBLY_STOP_OVERLAP_MISMATCH,
                            comh.ASSEMBLY_MAX_LOOPS,
                            comh.TRANSLATION_TABLE,
                        ),
                    )
                )

        # run tasks in parallel
        parallel.runThreadParallel(taskList, comh.MAX_PROC, keepRetValues=False)

    # Creates the output file
    if True:  # to skip this step, set to False (e.g. resume processing after OS/HW failure)
        print("Creating output file:\n\t%s" % outFile)
        counter = 0
        out = fq.WriteFq(outFile)
        try:
            for f in os.listdir(partitionedDir):
                fPath = os.path.join(partitionedDir, f)
                if f.endswith(".pkl.gz") and os.path.isfile(fPath):
                    # the domain name is the file name without its first 2 and last 23 characters
                    domName = f[2:-23]
                    for rec in hio.loadReadRec(fPath):
                        counter += 1
                        contigName = "contig_%s_%s" % (counter, domName)
                        dnaSeq = rec.dnaSeq

                        # get the quality score string
                        if outAnnot or outFormat == "fq":
                            qs = rec.qsArray.getQSStr(dnaSeq)
                        else:
                            qs = None

                        # get the contig annotations
                        if outAnnot:
                            assert qs is not None
                            codingStart = rec.annotStart
                            codingLen = rec.annotLen
                            posCov = ",".join(str(int(x)) for x in rec.getPosCovArray())
                            annotStr = "domName:%s|codingStart:%s|codingLen:%s|qs:%s|posCov:%s" % (
                                domName,
                                codingStart,
                                codingLen,
                                qs,
                                posCov,
                            )
                        else:
                            annotStr = ""

                        # write an entry to the output file
                        if outFormat == "fq":
                            out.writeFqEntry("@" + contigName, dnaSeq, qs, annotStr)
                        else:
                            assert outFormat == "fna"
                            if outAnnot:
                                annotStr = "|" + annotStr
                            out.write(">%s%s\n%s\n" % (contigName, annotStr, dnaSeq))
        finally:
            # close output file (also if an entry cannot be written)
            out.close()

    # Clean up the working directory
    if cleanUpTmpWorkingDir:
        # clean up the temporary directory
        print("Cleaning up temporary directory")
        assert os.path.isdir(workingDir), "Directory to be cleaned does not exist:\n%s" % workingDir
        shutil.rmtree(workingDir)
    elif cleanUp:
        # clean up the user defined working directory
        if os.path.isdir(workingDir):
            print("Cleaning up working directory:\n\t%s" % workingDir)
            shutil.rmtree(os.path.join(workingDir, comh.SAMPLES_PFAM_PARTITIONED_DIR))
            shutil.rmtree(strainsDir)

    print("Done")
def runHMM(self, inputFastaFile, outLog=None):
    """
        Run the hidden markov model to get regions in the input sequences where the 16S and 23S genes are located.

        The script 'rna_hmm3.py' is run (only on "posix" systems) and writes the regions to
        '<working dir>/<fasta name>.gff'. Each region listed there is then cut out of the input sequences
        (reverse complemented for the "-" strand) and appended to '<working dir>/<fasta name>.<gene>.fna'.
        The process exits with status -1 if the command is reported as failed and with status 1 if a region
        has an invalid strand symbol.

        @param inputFastaFile: fasta file with the DNA sequences to be searched
        @param outLog: not used by the current implementation
    """
    processors = self._config.get('processors')
    if processors is not None:
        processors = ' -p %s ' % processors
    else:
        processors = ''

    hmmInstallDir = os.path.normpath(self._config.get('rnaHmmInstallDir'))
    hmmerBinDir = os.path.normpath(self._config.get('hmmerBinDir'))
    regionsFile = os.path.join(self._workingDir, str(os.path.basename(inputFastaFile) + '.gff'))

    # the hmmer binaries are put in front of the PATH for the script
    cmd = str('export PATH=' + hmmerBinDir + ':$PATH;' + os.path.join(hmmInstallDir, 'rna_hmm3.py')
              + ' -i ' + inputFastaFile + ' -o ' + regionsFile + processors)

    if os.name == 'posix':
        cwd = self._config.get('rnaHmmInstallDir')
        if parallel.reportFailedCmd(parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd)])) is not None:
            sys.exit(-1)
    else:
        # single-argument call: prints the same text under Python 2 and Python 3
        print('Cannot run HMM since your system is not "posix" but "%s" \n%s' % (os.name, cmd))

    with open(inputFastaFile, "rU") as handle:
        record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))

    trunkoutputfilename = os.path.join(self._workingDir, os.path.basename(inputFastaFile))

    # parse results file line by line
    with open(regionsFile, "rU") as regions:
        for line in regions:
            if line.startswith("#"):
                continue
            fields = line.split()
            if not fields:
                # a blank line carries no region
                continue

            ident = fields[0]
            start = int(fields[3])
            stop = int(fields[4])
            strand = fields[6]
            gene = fields[8]
            seq = record_dict[ident].seq

            # "start" is 1-based and "stop" is inclusive
            if strand == "+":
                subseq = seq[start - 1:stop]
            elif strand == "-":
                subseq = seq[start - 1:stop].reverse_complement()
            else:
                sys.stderr.write(" analysis16s: invalid strand symbol")
                sys.exit(1)

            # regions of the same gene are collected in one file
            with open(trunkoutputfilename + "." + gene + ".fna", "a") as outfile:
                outfile.write(">%s_%i_%i_%s\n" % (ident, start, stop, strand))
                outfile.write("%s\n" % subseq)
def _classify(self, mode, inputFastaFile, outLog=None):
    """
        Run mothur classify.seqs on the extracted rRNA regions of one gene and transform the mothur
        prediction file to a tab separated prediction file.

        The regions are read from '<working dir>/<fasta name>.<gene>.fna', the tab separated predictions are
        written to '<working dir>/<fasta name>.<16P|23P|5P>'. mothur is run only on "posix" systems; the
        process exits with status -1 if the command is reported as failed.

        @param mode: gene to be classified: 16 (16S rRNA), 23 (23S rRNA) or 5 (5S rRNA)
        @param inputFastaFile: fasta file from which the regions were extracted
        @param outLog: file to which the stdout of mothur is redirected (if None, subprocess.STDOUT is passed)

        @raise Exception: if the mode is not 16, 23 or 5
    """
    # mode -> (gene name used in the file names and in the reference dictionary, extension of the result file)
    modeToGene = {16: ('16S_rRNA', '.16P'), 23: ('23S_rRNA', '.23P'), 5: ('5S_rRNA', '.5P')}

    mothur = os.path.join(os.path.normpath(self._config.get('mothurInstallDir')), 'mothur')

    try:
        geneName, predExt = modeToGene[mode]
    except (KeyError, TypeError):
        raise Exception('Wrong branch')

    baseName = os.path.basename(inputFastaFile)
    extractedRegionsFasta = os.path.join(self._workingDir, str(baseName + '.' + geneName + '.fna'))
    taxonomyFile = os.path.join(self._refDir, os.path.normpath(self._refDict[(geneName, 'taxonomyDNA')][0]))
    templateFile = os.path.join(self._refDir, os.path.normpath(self._refDict[(geneName, 'templateDNA')][0]))
    predFileName = os.path.join(self._workingDir, str(baseName + predExt))

    param = self._config.get('mothurClassifyParamOther')

    cmd = str(mothur + ' "#classify.seqs(fasta=' + extractedRegionsFasta + ', template=' + templateFile
              + ', taxonomy=' + taxonomyFile + ', ' + param + ')"')

    if os.name == 'posix':
        print('Mothur processing: %s' % os.path.basename(templateFile).split('_', 1)[0])
        cwd = self._workingDir
        if outLog is not None:
            stdoutLog = open(outLog, 'w')
        else:
            stdoutLog = subprocess.STDOUT
        try:
            if parallel.reportFailedCmd(
                    parallel.runCmdSerial([parallel.TaskCmd(cmd, cwd, stdout=stdoutLog)])) is not None:
                sys.exit(-1)
        finally:
            # close the log also if the run is aborted
            if outLog is not None:
                stdoutLog.close()
    else:
        # single-argument call: prints the same text under Python 2 and Python 3
        print('Cannot run mothur since your system is not "posix" but "%s" \n%s' % (os.name, cmd))

    # the name of the mothur output file may have one of two forms; it is looked up only after the run,
    # since the file does not exist before mothur has been run for the first time
    mothurPredFileName = common.getMothurOutputFilePath(extractedRegionsFasta, taxonomyFile)
    if not os.path.isfile(mothurPredFileName):
        mothurPredFileName = common.getMothurOutputFilePath(extractedRegionsFasta, taxonomyFile,
                                                            suffix='.bayesian.taxonomy')

    #transform mothur prediction files to the tab separated files
    self.mothurPredToTabSepPred(mothurPredFileName, predFileName)