def __readBins(self, binFiles):
    """Map each bin id to the set of sequence ids found in its bin file.

    The bin id is derived from the file name via binIdFromFilename; if two
    files yield the same bin id, the later file wins.
    """
    # Tuple elements are evaluated left to right, so the bin id is resolved
    # before the file's sequence ids are read.
    return dict(
        (binIdFromFilename(path), set(readFastaSeqIds(path)))
        for path in binFiles
    )
def unique(self, binFiles):
    """Check if sequences are assigned to multiple bins.

    For every pair of bins that share at least one sequence id, prints the
    two bin ids followed by the shared ids (in sorted order) and a blank
    line. If no pair shares a sequence, prints a single notice instead.

    Args:
        binFiles: Iterable of bin file paths; each is passed to
            binIdFromFilename and readFastaSeqIds.
    """
    # read seq ids from all bins; each set is built once here instead of
    # being rebuilt for every pair compared below
    binSeqs = {}
    for f in binFiles:
        binSeqs[binIdFromFilename(f)] = set(readFastaSeqIds(f))

    # check for sequences assigned to multiple bins
    bDuplicates = False
    binIds = list(binSeqs)  # a plain list: dict views cannot be indexed on Python 3
    for i, binIdA in enumerate(binIds):
        for binIdB in binIds[i + 1:]:
            seqInter = binSeqs[binIdA].intersection(binSeqs[binIdB])
            if not seqInter:
                continue

            bDuplicates = True
            # single-argument print(...) behaves the same on Python 2 and 3
            print(' Sequences shared between %s and %s: ' % (binIdA, binIdB))
            for seqId in sorted(seqInter):
                print(' ' + seqId)
            print('')

    if not bDuplicates:
        print(' No sequences assigned to multiple bins.')
def run(self, contigFile, binFiles, outputDir, evalueThreshold, concatenateThreshold):
    """Identify putative SSU rRNA genes on sequences and report them per bin.

    Runs the HMM search on contigFile, reads the per-domain hit tables,
    reduces them to one best hit per sequence, and writes two files into
    outputDir: 'ssu_summary.tsv' (one row per hit) and 'ssu.fna' (the
    sliced hit sequences).

    Args:
        contigFile: FASTA file of contigs/scaffolds to search.
        binFiles: Bin files used to map sequence ids to bin ids; sequences
            not found in any bin are reported under DefaultValues.UNBINNED.
        outputDir: Output directory; created if it does not exist.
        evalueThreshold: Passed to the HMM search and to the hit reader.
        concatenateThreshold: Passed to __addHit for each hit.
    """
    # make sure output directory exists
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    # get bin id of binned contigs
    self.logger.info(' Determining bin assignment of sequences.')
    seqIdToBinId = {}
    for f in binFiles:
        binId = binIdFromFilename(f)
        for seqId in readFastaSeqIds(f):
            seqIdToBinId[seqId] = binId

    # identify 16S reads from contigs/scaffolds
    self.logger.info(' Identifying SSU rRNAs on sequences.')
    self.__hmmSearch(contigFile, evalueThreshold, os.path.join(outputDir, 'ssu'))

    # read HMM hits
    hitsPerDomain = {}
    for domain in ['archaea', 'bacteria', 'euk']:
        hits = {}
        hitFile = os.path.join(outputDir, 'ssu' + '.' + domain + '.txt')
        seqInfo = self.__readHits(hitFile, domain, evalueThreshold)
        # items() works on both Python 2 and 3; an empty table is a no-op
        for seqId, seqHits in seqInfo.items():
            for hit in seqHits:
                self.__addHit(hits, seqId, hit, concatenateThreshold)
        hitsPerDomain[domain] = hits

    # find best domain hit for each sequence
    bestHits = {}
    for hits in hitsPerDomain.values():
        for seqId, info in hits.items():
            # drop everything from the last '-#' onwards, if present
            self.__addDomainHit(bestHits, seqId.rsplit('-#', 1)[0], info)

    # group hits by bin
    seqs = readFasta(contigFile)
    hitsToBins = {}
    for origSeqId in bestHits:
        seqId = origSeqId.rsplit('-#', 1)[0]
        binId = seqIdToBinId.get(seqId, DefaultValues.UNBINNED)
        hitsToBins.setdefault(binId, []).append([origSeqId] + bestHits[origSeqId])

    # write summary file and putative SSU rRNAs to file; the with-block
    # closes both files even if a lookup or write fails part-way through
    summaryFile = os.path.join(outputDir, 'ssu_summary.tsv')
    seqFile = os.path.join(outputDir, 'ssu.fna')
    with open(summaryFile, 'w') as summaryOut, open(seqFile, 'w') as seqOut:
        summaryOut.write(
            'Bin Id\tSeq. Id\tHMM\ti-Evalue\tStart hit\tEnd hit\t'
            '16S/18S gene length\tRev. Complement\tSequence length\n')

        for binId in sorted(hitsToBins):
            for seqInfo in hitsToBins[binId]:
                seq = seqs[seqInfo[0].rsplit('-#', 1)[0]]

                summaryOut.write(binId + '\t' + '\t'.join(seqInfo) + '\t' + str(len(seq)) + '\n')
                seqOut.write('>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqInfo[0] + '\n')
                # seqInfo[3] / seqInfo[4] are the 'Start hit' / 'End hit' columns.
                # NOTE(review): confirm the coordinate convention produced by
                # __readHits matches this slice (no offset applied here).
                seqOut.write(seq[int(seqInfo[3]):int(seqInfo[4])] + '\n')

    self.logger.info('')
    self.logger.info(' Identified ' + str(len(bestHits)) + ' putative SSU genes:')
    self.logger.info(' Summary of identified hits written to: ' + summaryFile)
    self.logger.info(' SSU sequences written to: ' + seqFile)
def run(self, contigFile, binFiles, outputDir, evalueThreshold, concatenateThreshold):
    """Identify putative SSU rRNA genes on sequences and report them per bin.

    Runs the HMM search on contigFile, reads the per-domain hit tables,
    reduces them to one best hit per sequence, and writes two files into
    outputDir: 'ssu_summary.tsv' (one row per hit) and 'ssu.fna' (the
    sliced hit sequences).

    Args:
        contigFile: FASTA file of contigs/scaffolds to search.
        binFiles: Bin files used to map sequence ids to bin ids; sequences
            not found in any bin are reported under DefaultValues.UNBINNED.
        outputDir: Output directory; created if it does not exist.
        evalueThreshold: Passed to the HMM search and to the hit reader.
        concatenateThreshold: Passed to __addHit for each hit.
    """
    # make sure output directory exists
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    # get bin id of binned contigs
    self.logger.info('Determining bin assignment of sequences.')
    seqIdToBinId = {}
    for f in binFiles:
        binId = binIdFromFilename(f)
        for seqId in readFastaSeqIds(f):
            seqIdToBinId[seqId] = binId

    # identify 16S reads from contigs/scaffolds
    self.logger.info('Identifying SSU rRNAs on sequences.')
    self.__hmmSearch(contigFile, evalueThreshold, os.path.join(outputDir, 'ssu'))

    # read HMM hits
    hitsPerDomain = {}
    for domain in ['archaea', 'bacteria', 'euk']:
        hits = {}
        hitFile = os.path.join(outputDir, 'ssu' + '.' + domain + '.txt')
        seqInfo = self.__readHits(hitFile, domain, evalueThreshold)
        # iterating an empty hit table is a no-op, so no length guard is needed
        for seqId, seqHits in seqInfo.items():
            for hit in seqHits:
                self.__addHit(hits, seqId, hit, concatenateThreshold)
        hitsPerDomain[domain] = hits

    # find best domain hit for each sequence
    bestHits = {}
    for hits in hitsPerDomain.values():
        for seqId, info in hits.items():
            # drop everything from the last '-#' onwards, if present
            self.__addDomainHit(bestHits, seqId.rsplit('-#', 1)[0], info)

    # group hits by bin
    seqs = readFasta(contigFile)
    hitsToBins = {}
    for origSeqId in bestHits:
        seqId = origSeqId.rsplit('-#', 1)[0]
        binId = seqIdToBinId.get(seqId, DefaultValues.UNBINNED)
        hitsToBins.setdefault(binId, []).append([origSeqId] + bestHits[origSeqId])

    # write summary file and putative SSU rRNAs to file; the with-block
    # closes both files even if a lookup or write fails part-way through
    summaryFile = os.path.join(outputDir, 'ssu_summary.tsv')
    seqFile = os.path.join(outputDir, 'ssu.fna')
    with open(summaryFile, 'w') as summaryOut, open(seqFile, 'w') as seqOut:
        summaryOut.write(
            'Bin Id\tSeq. Id\tHMM\ti-Evalue\tStart hit\tEnd hit\t'
            '16S/18S gene length\tRev. Complement\tSequence length\n')

        for binId in sorted(hitsToBins):
            for seqInfo in hitsToBins[binId]:
                seq = seqs[seqInfo[0].rsplit('-#', 1)[0]]

                summaryOut.write(binId + '\t' + '\t'.join(seqInfo) + '\t' + str(len(seq)) + '\n')
                seqOut.write('>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqInfo[0] + '\n')
                # seqInfo[3] / seqInfo[4] are the 'Start hit' / 'End hit' columns.
                # NOTE(review): both bounds are shifted by +1 here; confirm this
                # matches the coordinate convention produced by __readHits.
                seqOut.write(seq[int(seqInfo[3]) + 1:int(seqInfo[4]) + 1] + '\n')

    self.logger.info('Identified ' + str(len(bestHits)) + ' putative SSU genes.')
    self.logger.info('Summary of identified hits written to: ' + summaryFile)
    self.logger.info('SSU sequences written to: ' + seqFile)