def run(self, configFile, otu, threads): rc = ReadConfig() projectParams, sampleParams = rc.readConfig(configFile, outputDirExists = False) ggDB = self.ggDB.replace('##', str(otu), 1) print 'Mapping reads to the GreenGenes DB at: ' + ggDB + '\n' if not os.path.exists(ggDB + '.amb'): print 'Indexing GreenGenes DB:' os.system('bwa index -a is ' + ggDB) print '' else: print 'GreenGenes DB is already indexed.\n' for sample in sampleParams: print 'Mapping reads in sample: ' + sample pairs = sampleParams[sample]['pairs'] singles = sampleParams[sample]['singles'] # align and map each pair for i in xrange(0, len(pairs), 2): pair1 = pairs[i] pair2 = pairs[i+1] bamPrefix = projectParams['output_dir'] + ntpath.basename(pair1) mapPair(ggDB, pair1, pair2, bamPrefix, threads) # align and map each single-ended read file for i in xrange(0, len(singles)): bamPrefix = projectParams['output_dir'] + ntpath.basename(singles[i]) mapSingle(ggDB, singles[i], bamPrefix, threads)
def run(self, configFile, contigFile, assemblies16S, binDir, threads): rc = ReadConfig() projectParams, sampleParams = rc.readConfig(configFile, outputDirExists=True) # check if links directory already exists linkFile = os.path.join(projectParams['output_dir'], 'linksToBin') if not os.path.exists(linkFile): os.makedirs(linkFile) else: rtn = raw_input('Remove previously identified links (Y or N)? ') if rtn.lower() == 'y' or rtn.lower() == 'yes': files = os.listdir(linkFile) for f in files: os.remove(os.path.join(linkFile, f)) else: sys.exit() outputDir = os.path.join(projectParams['output_dir'], 'linksToBin') # create combined file with reference sequences and assembled 16S sequences print 'Combining unbinned reference sequences with de novo assembled 16S sequences.' combinedFile = os.path.join(outputDir, 'scaffolds.combined.fasta') os.system('cat ' + contigFile + ' ' + assemblies16S + ' > ' + combinedFile) # create combined 16S read files print 'Combining 16S/18S reads from all samples.' reads1 = '' reads2 = '' for sample in sampleParams: extractedPrefix = os.path.join(projectParams['output_dir'], 'extracted', sample) pairs = sampleParams[sample]['pairs'] for i in xrange(0, len(pairs), 2): pair1Base = ntpath.basename(pairs[i]) pair2Base = ntpath.basename(pairs[i + 1]) classificationFile1 = extractedPrefix + '.' + pair1Base[ 0:pair1Base.rfind('.')] + '.union.SSU.fasta' classificationFile2 = extractedPrefix + '.' + pair2Base[ 0:pair2Base.rfind('.')] + '.union.SSU.fasta' reads1 += classificationFile1 + ' ' reads2 += classificationFile2 + ' ' os.system('cat ' + reads1 + ' > ' + os.path.join(outputDir, 'ssu.1.fasta')) os.system('cat ' + reads2 + ' > ' + os.path.join(outputDir, 'ssu.2.fasta')) # identify 16S sequences in paired-end reads self.link16S(combinedFile, os.path.join(outputDir, 'ssu.1.fasta'), os.path.join(outputDir, 'ssu.2.fasta'), binDir, threads, outputDir)
def run(self, configFile, threads, evalue, bQuiet):
    """Run processPairs and processSingles for every sample in the config.

    Creates '<output_dir>extracted_lsu' first; os.makedirs raises OSError
    if that directory already exists. `bQuiet` is stored on the instance
    before any sample is processed.

    threads, evalue -- forwarded unchanged to processPairs / processSingles
    """
    projectParams, sampleParams = ReadConfig().readConfig(configFile, outputDirExists = True)

    os.makedirs(projectParams['output_dir'] + 'extracted_lsu')

    self.bQuiet = bQuiet

    for sample in sampleParams:
        readFiles = sampleParams[sample]
        pairedFiles = readFiles['pairs']
        singleFiles = readFiles['singles']

        # paired-end files first, then the single-end files of the same sample
        self.processPairs(pairedFiles, threads, evalue, projectParams['output_dir'], sample)
        self.processSingles(singleFiles, threads, evalue, projectParams['output_dir'], sample)
def run(self, configFile, contigFile, assemblies16S, binDir, threads): rc = ReadConfig() projectParams, sampleParams = rc.readConfig(configFile, outputDirExists = True) # check if links directory already exists linkFile = os.path.join(projectParams['output_dir'], 'linksToBin') if not os.path.exists(linkFile): os.makedirs(linkFile) else: rtn = raw_input('Remove previously identified links (Y or N)? ') if rtn.lower() == 'y' or rtn.lower() == 'yes': files = os.listdir(linkFile) for f in files: os.remove(os.path.join(linkFile, f)) else: sys.exit() outputDir = os.path.join(projectParams['output_dir'], 'linksToBin') # create combined file with reference sequences and assembled 16S sequences print 'Combining unbinned reference sequences with de novo assembled 16S sequences.' combinedFile = os.path.join(outputDir, 'scaffolds.combined.fasta') os.system('cat ' + contigFile + ' ' + assemblies16S + ' > ' + combinedFile) # create combined 16S read files print 'Combining 16S/18S reads from all samples.' reads1 = '' reads2 = '' for sample in sampleParams: extractedPrefix = os.path.join(projectParams['output_dir'], 'extracted', sample) pairs = sampleParams[sample]['pairs'] for i in xrange(0, len(pairs), 2): pair1Base = ntpath.basename(pairs[i]) pair2Base = ntpath.basename(pairs[i+1]) classificationFile1 = extractedPrefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.union.SSU.fasta' classificationFile2 = extractedPrefix + '.' + pair2Base[0:pair2Base.rfind('.')] + '.union.SSU.fasta' reads1 += classificationFile1 + ' ' reads2 += classificationFile2 + ' ' os.system('cat ' + reads1 + ' > ' + os.path.join(outputDir, 'ssu.1.fasta')) os.system('cat ' + reads2 + ' > ' + os.path.join(outputDir, 'ssu.2.fasta')) # identify 16S sequences in paired-end reads self.link16S(combinedFile, os.path.join(outputDir, 'ssu.1.fasta'), os.path.join(outputDir, 'ssu.2.fasta'), binDir, threads, outputDir)
def run(self, configFile, mappingQual, minLength):
    """Process the paired and single-end files of every sample in the config.

    Stores `mappingQual` and `minLength` on the instance (as
    mappingQualityThreshold and minLength) before the config is read, then
    calls processPairs and processSingles per sample with the project output
    directory and the prefix '<output_dir><sample>'.
    """
    self.mappingQualityThreshold = mappingQual
    self.minLength = minLength

    projectParams, sampleParams = ReadConfig().readConfig(configFile, outputDirExists = True)

    for sample in sampleParams:
        outputDir = projectParams['output_dir']
        samplePrefix = outputDir + sample

        readFiles = sampleParams[sample]
        pairedFiles = readFiles['pairs']
        singleFiles = readFiles['singles']

        # paired-end files first, then the single-end files of the same sample
        self.processPairs(pairedFiles, outputDir, samplePrefix)
        self.processSingles(singleFiles, outputDir, samplePrefix)
def run(self, configFile, threads, evalue, bQuiet):
    """Process the read files of each configured sample.

    The directory '<output_dir>extracted_lsu' is created up front
    (os.makedirs raises OSError when it already exists), `bQuiet` is
    remembered on the instance, and every sample's paired and single-end
    files are then handed to processPairs / processSingles together with
    `threads`, `evalue`, the output directory and the sample name.
    """
    configReader = ReadConfig()
    projectParams, sampleParams = configReader.readConfig(configFile, outputDirExists=True)

    lsuDir = projectParams['output_dir'] + 'extracted_lsu'
    os.makedirs(lsuDir)

    self.bQuiet = bQuiet

    for sample in sampleParams:
        pairedFiles = sampleParams[sample]['pairs']
        singleFiles = sampleParams[sample]['singles']

        # same arguments for both kinds of input; only the file list differs
        self.processPairs(pairedFiles, threads, evalue, projectParams['output_dir'], sample)
        self.processSingles(singleFiles, threads, evalue, projectParams['output_dir'], sample)
def run(self, configFile, db, threads, bQuiet): rc = ReadConfig() projectParams, sampleParams = rc.readConfig(configFile, outputDirExists=True) # check if classification directory already exists if not os.path.exists( os.path.join(projectParams['output_dir'], 'classified')): os.makedirs(os.path.join(projectParams['output_dir'], 'classified')) else: rtn = raw_input('Remove previously classified reads (Y or N)? ') if rtn.lower() == 'y' or rtn.lower() == 'yes': files = os.listdir(projectParams['output_dir'] + 'classified') for f in files: os.remove(projectParams['output_dir'] + 'classified/' + f) else: sys.exit() dbFile = self.dbFiles[db] taxonomyFile = self.taxonomyFiles[db] if not bQuiet: print 'Classifying reads with: ' + dbFile print 'Assigning taxonomy with: ' + taxonomyFile print 'Threads: ' + str(threads) print '' # create list of all sequence to classify mothurSeqFileList = '' for sample in sampleParams: prefix = os.path.join(projectParams['output_dir'], 'extracted', sample) pairs = sampleParams[sample]['pairs'] singles = sampleParams[sample]['singles'] for i in xrange(0, len(pairs), 2): pair1Base = ntpath.basename(pairs[i]) pair1File = prefix + '.' + pair1Base[ 0:pair1Base.rfind('.')] + '.intersect.SSU.fasta' pair2Base = ntpath.basename(pairs[i + 1]) pair2File = prefix + '.' + pair2Base[ 0:pair2Base.rfind('.')] + '.intersect.SSU.fasta' diffFile = prefix + '.' + pair1Base[ 0:pair1Base.rfind('.')] + '.difference.SSU.fasta' mothurSeqFileList += pair1File + '-' + pair2File + '-' + diffFile + '-' for single in singles: singleBase = ntpath.basename(single) singleFile = prefix + '.' 
+ singleBase[ 0:singleBase.rfind('.')] + '.SSU.fasta' mothurSeqFileList += singleFile + '-' # classify with mothur mothurSeqFileList = mothurSeqFileList[0:-1] # remove trailing dash self.classify(mothurSeqFileList, dbFile, taxonomyFile, threads, bQuiet) # rename classification file for consistency with down-stream processing print 'Final classifications written to: ' for filename in mothurSeqFileList.split('-'): if 'GG' in db: inputName = filename[0:filename. rfind('.')] + '.full.wang.taxonomy' else: inputName = filename[0:filename.rfind( '.')] + '.SSURef_111_NR_taxonomy.wang.taxonomy' outputName = inputName.replace('/extracted/', '/classified/') outputName = outputName.replace('SSU.full.wang.taxonomy', '16S.tsv') os.system('mv ' + inputName + ' ' + outputName) print ' ' + outputName
print '' if __name__ == '__main__': parser = argparse.ArgumentParser( description= "Classify 16S fragments by mapping them to the GreenGenes DB with BWA.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('config_file', help='project config file.') parser.add_argument( 'ref_db', help= 'Reference DB to use for classification (choices: GG94, GG97, GG99, SILVA98)', choices=['GG94', 'GG97', 'GG99', 'SILVA98']) parser.add_argument('-t', '--threads', help='number of threads', type=int, default=1) args = parser.parse_args() classifyBWA = ClassifyBWA() rc = ReadConfig() projectParams, sampleParams = rc.readConfig(args.config_file, outputDirExists=True) classifyBWA.run(projectParams, sampleParams, args.ref_db, args.threads)
def run(self, configFile, otu, seqIdentityThreshold, minSeqCutoff, bPairsAsSingles, bSingleEnded, bQuiet): self.bQuiet = bQuiet rc = ReadConfig() projectParams, sampleParams = rc.readConfig(configFile, outputDirExists = True) ggRefDistFile = self.ggRefDist.replace('##', str(otu)) neighbours = self.getNeighbours(ggRefDistFile, seqIdentityThreshold) # create directory to store putative 16S genes dirPutative16S = projectParams['output_dir'] + 'putativeSSU/' if not os.path.exists(dirPutative16S): os.makedirs(dirPutative16S) else: rtn = raw_input('Remove previously recovered 16S reads (Y or N)? ') if rtn.lower() == 'y' or rtn.lower() == 'yes': files = os.listdir(dirPutative16S) for f in files: if f.endswith('fasta'): os.remove(dirPutative16S + '/' + f) else: sys.exit() referenceSeqHits = {} for sample in sampleParams: if not self.bQuiet: print '' print sample + ':' extractedPrefix = projectParams['output_dir'] + 'extracted/' + sample classifiedPrefix = projectParams['output_dir'] + 'classified/' + sample pairs = sampleParams[sample]['pairs'] singles = sampleParams[sample]['singles'] for i in xrange(0, len(pairs), 2): pair1Base = ntpath.basename(pairs[i]) pair2Base = ntpath.basename(pairs[i+1]) classificationFile1 = classifiedPrefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.intersect.16S.tsv' classificationFile2 = classifiedPrefix + '.' + pair2Base[0:pair2Base.rfind('.')] + '.intersect.16S.tsv' if not self.bQuiet: print ' Processing files: ' print ' ' + classificationFile1 print ' ' + classificationFile2 pairFile1 = extractedPrefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.intersect.SSU.fasta' pairFile2 = extractedPrefix + '.' + pair2Base[0:pair2Base.rfind('.')] + '.intersect.SSU.fasta' self.identifyConsistentPairs(referenceSeqHits, pairFile1, pairFile2, classificationFile1, classificationFile2, neighbours, bPairsAsSingles, bSingleEnded) if bSingleEnded: classificationFile = classifiedPrefix + '.' 
+ pair1Base[0:pair1Base.rfind('.')] + '.difference.16S.tsv' if not self.bQuiet: print ' Processing file: ' + classificationFile singleFile = extractedPrefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.difference.SSU.fasta' self.addSingletons(referenceSeqHits, singleFile, classificationFile) if bSingleEnded: for single in singles: singleBase = ntpath.basename(single) classificationFile = classifiedPrefix + '.' + singleBase[0:singleBase.rfind('.')] + '.16S.tsv' if not self.bQuiet: print ' Processing file: ' + classificationFile singleFile = extractedPrefix + '.' + singleBase[0:singleBase.rfind('.')] + '.SSU.fasta' self.addSingletons(referenceSeqHits, singleFile, classificationFile) self.extractRecoverable16S(referenceSeqHits, neighbours, minSeqCutoff, dirPutative16S)
def run(self, configFile, threads, kmerLen, minContigLen): rc = ReadConfig() projectParams, _ = rc.readConfig(configFile, outputDirExists=True) # create directory to store putative 16S genes dirPutative16S = projectParams['output_dir'] + 'putativeSSU/' if not os.path.exists(dirPutative16S): print '[Error] Putative 16S gene reads expected in: ' + dirPutative16S sys.exit() # extract GreenGene Ids of putative 16S genes ggIds = set() files = os.listdir(dirPutative16S) for f in files: if f.endswith('fasta'): ggIds.add(int(f.split('.')[0])) print 'Putative 16S genes to assemble: ' + str(len(ggIds)) contigInfo = {} for ggId in ggIds: print 'Assembling ' + str(ggId) + ': ' print '' pair1 = dirPutative16S + str(ggId) + '.1.fasta' pair2 = dirPutative16S + str(ggId) + '.2.fasta' single = dirPutative16S + str(ggId) + '.singletons.fasta' outputDir = dirPutative16S + str(ggId) + '_assembly' if os.path.exists(outputDir): shutil.rmtree(outputDir) cmd = 'mpiexec -n ' + str(threads) + ' Ray -k ' + str( kmerLen) + ' -minimum-contig-length ' + str( minContigLen) + ' -o ' + outputDir if os.stat(single ).st_size > 0: # check if file contains any sequences cmd += ' -s ' + single if os.stat(pair1).st_size > 0: cmd += ' -p ' + pair1 + ' ' + pair2 os.system(cmd) contigInfo[ggId] = self.parseContigInfo(outputDir) print '\n*********************************' allContigsFile = projectParams[ 'output_dir'] + 'assembled_contigs.16S.fasta' fout = open(allContigsFile, 'w') print 'Assembly results: ' for ggId in contigInfo: print ' Assembly of ' + str(ggId) + ' produce ' + str( len(contigInfo[ggId])) + ' contig(s): ' + ' '.join( contigInfo[ggId]) index = 0 for line in open(dirPutative16S + str(ggId) + '_assembly/Contigs.fasta'): if line[0] == '>': lineSplit = line.split() seqLen = lineSplit[1] fout.write('>16S_' + str(ggId) + '-' + str(index) + ' ' + seqLen + '\n') index += 1 else: fout.write(line) fout.close() print '' print ' All assembled 16S contigs written to: ' + allContigsFile
# Tail of a definition that begins above the visible part of the file; the
# statements are kept unchanged: start the writer and the workers, wait for
# the workers, then send the writer its None sentinel and wait for it.
writeProc.start()

for p in calcProc:
    p.start()

for p in calcProc:
    p.join()

writerQueue.put(None)
writeProc.join()

# Command-line entry point: parse the arguments, read the project config and
# run the extraction.
if __name__ == '__main__':
    argParser = argparse.ArgumentParser(
        description="Extract 16S/18S sequences from metagenomic data using HMMs.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    argParser.add_argument('config_file', help='project config file.')
    argParser.add_argument('-t', '--threads', help='number of threads', type=int, default = 1)
    argParser.add_argument('-e', '--evalue', help='e-value threshold for identifying hits', default = '1e-5')
    argParser.add_argument('-a', '--align_len', type=float,
                           help='fraction of read that must align for identifying hits', default = '0.5')
    argParser.add_argument('-q', '--quiet', help='suppress all output', action='store_true')

    args = argParser.parse_args()

    # Read config file
    projectParams, sampleParams = ReadConfig().readConfig(args.config_file, outputDirExists = False)

    extractor = Extract16S()
    extractor.run(projectParams, sampleParams, args.threads, args.evalue, args.align_len, args.quiet)
def run(self, configFile, db, threads, bQuiet): rc = ReadConfig() projectParams, sampleParams = rc.readConfig(configFile, outputDirExists = True) # check if classification directory already exists if not os.path.exists(os.path.join(projectParams['output_dir'], 'classified')): os.makedirs(os.path.join(projectParams['output_dir'], 'classified')) else: rtn = raw_input('Remove previously classified reads (Y or N)? ') if rtn.lower() == 'y' or rtn.lower() == 'yes': files = os.listdir(projectParams['output_dir'] + 'classified') for f in files: os.remove(projectParams['output_dir'] + 'classified/' + f) else: sys.exit() dbFile = self.dbFiles[db] taxonomyFile = self.taxonomyFiles[db] if not bQuiet: print 'Classifying reads with: ' + dbFile print 'Assigning taxonomy with: ' + taxonomyFile print 'Threads: ' + str(threads) print '' # create list of all sequence to classify mothurSeqFileList = '' for sample in sampleParams: prefix = os.path.join(projectParams['output_dir'], 'extracted', sample) pairs = sampleParams[sample]['pairs'] singles = sampleParams[sample]['singles'] for i in xrange(0, len(pairs), 2): pair1Base = ntpath.basename(pairs[i]) pair1File = prefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.intersect.SSU.fasta' pair2Base = ntpath.basename(pairs[i+1]) pair2File = prefix + '.' + pair2Base[0:pair2Base.rfind('.')] + '.intersect.SSU.fasta' diffFile = prefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.difference.SSU.fasta' mothurSeqFileList += pair1File + '-' + pair2File + '-' + diffFile + '-' for single in singles: singleBase = ntpath.basename(single) singleFile = prefix + '.' 
+ singleBase[0:singleBase.rfind('.')] + '.SSU.fasta' mothurSeqFileList += singleFile + '-' # classify with mothur mothurSeqFileList = mothurSeqFileList[0:-1] # remove trailing dash self.classify(mothurSeqFileList, dbFile, taxonomyFile, threads, bQuiet) # rename classification file for consistency with down-stream processing print 'Final classifications written to: ' for filename in mothurSeqFileList.split('-'): if 'GG' in db: inputName = filename[0:filename.rfind('.')] + '.full.wang.taxonomy' else: inputName = filename[0:filename.rfind('.')] + '.SSURef_111_NR_taxonomy.wang.taxonomy' outputName = inputName.replace('/extracted/','/classified/') outputName = outputName.replace('SSU.full.wang.taxonomy','16S.tsv') os.system('mv ' + inputName + ' ' + outputName) print ' ' + outputName
def run(self, configFile, threads): rc = ReadConfig() projectParams, _ = rc.readConfig(configFile, outputDirExists = True) # create directory to store putative 16S genes dirPutative16S = projectParams['output_dir'] + 'putativeSSU/' if not os.path.exists(dirPutative16S): print '[Error] Putative 16S gene reads expected in: ' + dirPutative16S sys.exit() # extract GreenGene Ids of putative 16S genes ggIds = set() files = os.listdir(dirPutative16S) for f in files: if f.endswith('fasta'): ggIds.add(int(f.split('.')[0])) print 'Putative 16S genes to assemble: ' + str(len(ggIds)) scaffoldInfo = {} for ggId in ggIds: print 'Assembling ' + str(ggId) + ': ' print '' pair1 = dirPutative16S + str(ggId) + '.1.fasta' pair2 = dirPutative16S + str(ggId) + '.2.fasta' single = dirPutative16S + str(ggId) + '.singletons.fasta' outputDir = dirPutative16S + str(ggId) + '_assembly_spades' if os.path.exists(outputDir): shutil.rmtree(outputDir) cmd = 'spades.py --only-assembler -o ' + outputDir + ' -t ' + str(threads) if os.stat(single).st_size > 0: # check if file contains any sequences cmd += ' -s ' + single if os.stat(pair1).st_size > 0: cmd += ' -1 ' + pair1 + ' -2 ' + pair2 os.system(cmd) scaffoldInfo[ggId] = self.parseScaffoldInfo(outputDir) print '\n*********************************' allScaffoldsFile = projectParams['output_dir'] + 'assembled_scaffolds.16S.fasta' fout = open(allScaffoldsFile, 'w') print 'Assembly results: ' for ggId in scaffoldInfo: print ' Assembly of ' + str(ggId) + ' produce ' + str(len(scaffoldInfo[ggId])) + ' scaffold(s): ' + ' '.join(scaffoldInfo[ggId]) if not os.path.isfile(dirPutative16S + str(ggId) + '_assembly_spades/scaffolds.fasta'): print ' Failed to build scaffolds for ' + str(ggId) continue index = 0 for line in open(dirPutative16S + str(ggId) + '_assembly_spades/scaffolds.fasta'): if line[0] == '>': lineSplit = line.split('_') seqLen = lineSplit[3] fout.write('>16S_' + str(ggId) + '-' + str(index) + ' ' + seqLen + '\n') index += 1 else: 
fout.write(line) fout.close() print '' print ' All assembled 16S contigs written to: ' + allScaffoldsFile
def run(self, configFile, threads): rc = ReadConfig() projectParams, _ = rc.readConfig(configFile, outputDirExists=True) # create directory to store putative 16S genes dirPutative16S = projectParams['output_dir'] + 'putativeSSU/' if not os.path.exists(dirPutative16S): print '[Error] Putative 16S gene reads expected in: ' + dirPutative16S sys.exit() # extract GreenGene Ids of putative 16S genes ggIds = set() files = os.listdir(dirPutative16S) for f in files: if f.endswith('fasta'): ggIds.add(int(f.split('.')[0])) print 'Putative 16S genes to assemble: ' + str(len(ggIds)) scaffoldInfo = {} for ggId in ggIds: print 'Assembling ' + str(ggId) + ': ' print '' pair1 = dirPutative16S + str(ggId) + '.1.fasta' pair2 = dirPutative16S + str(ggId) + '.2.fasta' single = dirPutative16S + str(ggId) + '.singletons.fasta' outputDir = dirPutative16S + str(ggId) + '_assembly_spades' if os.path.exists(outputDir): shutil.rmtree(outputDir) cmd = 'spades.py --only-assembler -o ' + outputDir + ' -t ' + str( threads) if os.stat(single ).st_size > 0: # check if file contains any sequences cmd += ' -s ' + single if os.stat(pair1).st_size > 0: cmd += ' -1 ' + pair1 + ' -2 ' + pair2 os.system(cmd) scaffoldInfo[ggId] = self.parseScaffoldInfo(outputDir) print '\n*********************************' allScaffoldsFile = projectParams[ 'output_dir'] + 'assembled_scaffolds.16S.fasta' fout = open(allScaffoldsFile, 'w') print 'Assembly results: ' for ggId in scaffoldInfo: print ' Assembly of ' + str(ggId) + ' produce ' + str( len(scaffoldInfo[ggId])) + ' scaffold(s): ' + ' '.join( scaffoldInfo[ggId]) if not os.path.isfile(dirPutative16S + str(ggId) + '_assembly_spades/scaffolds.fasta'): print ' Failed to build scaffolds for ' + str(ggId) continue index = 0 for line in open(dirPutative16S + str(ggId) + '_assembly_spades/scaffolds.fasta'): if line[0] == '>': lineSplit = line.split('_') seqLen = lineSplit[3] fout.write('>16S_' + str(ggId) + '-' + str(index) + ' ' + seqLen + '\n') index += 1 else: 
fout.write(line) fout.close() print '' print ' All assembled 16S contigs written to: ' + allScaffoldsFile
def run(self, configFile, otu, seqIdentityThreshold, minSeqCutoff, bPairsAsSingles, bSingleEnded, bQuiet): self.bQuiet = bQuiet rc = ReadConfig() projectParams, sampleParams = rc.readConfig(configFile, outputDirExists = True) ggRefDistFile = self.ggRefDist.replace('##', str(otu)) neighbours = self.getNeighbours(ggRefDistFile, seqIdentityThreshold) # create directory to store putative 16S genes dirPutative16S = projectParams['output_dir'] + 'putativeSSU/' if not os.path.exists(dirPutative16S): os.makedirs(dirPutative16S) else: rtn = raw_input('Remove previously recovered 16S reads (Y or N)? ') if rtn.lower() == 'y' or rtn.lower() == 'yes': files = os.listdir(dirPutative16S) for f in files: if f.endswith('fasta'): os.remove(dirPutative16S + '/' + f) else: sys.exit() referenceSeqHits = {} for sample in sampleParams: if not self.bQuiet: print '' print sample + ':' extractedPrefix = os.path.join(projectParams['output_dir'], 'extracted', sample) classifiedPrefix = os.path.join(projectParams['output_dir'], 'classified' + sample) pairs = sampleParams[sample]['pairs'] singles = sampleParams[sample]['singles'] for i in xrange(0, len(pairs), 2): pair1Base = ntpath.basename(pairs[i]) pair2Base = ntpath.basename(pairs[i+1]) classificationFile1 = classifiedPrefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.intersect.16S.tsv' classificationFile2 = classifiedPrefix + '.' + pair2Base[0:pair2Base.rfind('.')] + '.intersect.16S.tsv' if not self.bQuiet: print ' Processing files: ' print ' ' + classificationFile1 print ' ' + classificationFile2 pairFile1 = extractedPrefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.intersect.SSU.fasta' pairFile2 = extractedPrefix + '.' + pair2Base[0:pair2Base.rfind('.')] + '.intersect.SSU.fasta' self.identifyConsistentPairs(referenceSeqHits, pairFile1, pairFile2, classificationFile1, classificationFile2, neighbours, bPairsAsSingles, bSingleEnded) if bSingleEnded: classificationFile = classifiedPrefix + '.' 
+ pair1Base[0:pair1Base.rfind('.')] + '.difference.16S.tsv' if not self.bQuiet: print ' Processing file: ' + classificationFile singleFile = extractedPrefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.difference.SSU.fasta' self.addSingletons(referenceSeqHits, singleFile, classificationFile) if bSingleEnded: for single in singles: singleBase = ntpath.basename(single) classificationFile = classifiedPrefix + '.' + singleBase[0:singleBase.rfind('.')] + '.16S.tsv' if not self.bQuiet: print ' Processing file: ' + classificationFile singleFile = extractedPrefix + '.' + singleBase[0:singleBase.rfind('.')] + '.SSU.fasta' self.addSingletons(referenceSeqHits, singleFile, classificationFile) self.extractRecoverable16S(referenceSeqHits, neighbours, minSeqCutoff, dirPutative16S)
def run(self, configFile, threads, kmerLen, minContigLen): rc = ReadConfig() projectParams, _ = rc.readConfig(configFile, outputDirExists = True) # create directory to store putative 16S genes dirPutative16S = projectParams['output_dir'] + 'putativeSSU/' if not os.path.exists(dirPutative16S): print '[Error] Putative 16S gene reads expected in: ' + dirPutative16S sys.exit() # extract GreenGene Ids of putative 16S genes ggIds = set() files = os.listdir(dirPutative16S) for f in files: if f.endswith('fasta'): ggIds.add(int(f.split('.')[0])) print 'Putative 16S genes to assemble: ' + str(len(ggIds)) contigInfo = {} for ggId in ggIds: print 'Assembling ' + str(ggId) + ': ' print '' pair1 = dirPutative16S + str(ggId) + '.1.fasta' pair2 = dirPutative16S + str(ggId) + '.2.fasta' single = dirPutative16S + str(ggId) + '.singletons.fasta' outputDir = dirPutative16S + str(ggId) + '_assembly' if os.path.exists(outputDir): shutil.rmtree(outputDir) cmd = 'mpiexec -n ' + str(threads) + ' Ray -k ' + str(kmerLen) + ' -minimum-contig-length ' + str(minContigLen) + ' -o ' + outputDir if os.stat(single).st_size > 0: # check if file contains any sequences cmd += ' -s ' + single if os.stat(pair1).st_size > 0: cmd += ' -p ' + pair1 + ' ' + pair2 os.system(cmd) contigInfo[ggId] = self.parseContigInfo(outputDir) print '\n*********************************' allContigsFile = projectParams['output_dir'] + 'assembled_contigs.16S.fasta' fout = open(allContigsFile, 'w') print 'Assembly results: ' for ggId in contigInfo: print ' Assembly of ' + str(ggId) + ' produce ' + str(len(contigInfo[ggId])) + ' contig(s): ' + ' '.join(contigInfo[ggId]) index = 0 for line in open(dirPutative16S + str(ggId) + '_assembly/Contigs.fasta'): if line[0] == '>': lineSplit = line.split() seqLen = lineSplit[1] fout.write('>16S_' + str(ggId) + '-' + str(index) + ' ' + seqLen + '\n') index += 1 else: fout.write(line) fout.close() print '' print ' All assembled 16S contigs written to: ' + allContigsFile