def removeOutliers(self, binFile, outlierFile, outputFile):
    """Remove sequences specified as outliers in the provided file.

    Parameters:
      binFile: FASTA file of the bin to modify.
      outlierFile: tab-separated file with a single header line; column 0
        holds a bin id and column 1 the id of an outlying sequence.
      outputFile: path at which to write the modified bin.
    """
    binSeqs = readFasta(binFile)
    binIdToModify = binIdFromFilename(binFile)

    # identify sequences flagged as outliers for this bin
    checkFileExists(outlierFile)
    seqsToRemove = []
    # 'with' guarantees the handle is closed (the original leaked it)
    with open(outlierFile) as f:
        next(f, None)  # skip header line; tolerate an empty file
        for line in f:
            # strip the newline so the sequence id is clean even when it
            # is the last column on the line
            lineSplit = line.rstrip('\n').split('\t')
            if lineSplit[0] == binIdToModify:
                seqsToRemove.append(lineSplit[1])

    # remove sequences from bin
    if seqsToRemove:
        self.__removeSeqs(binSeqs, seqsToRemove)

    # save modified bin
    writeFasta(binSeqs, outputFile)
def run(self, outputDir):
    """Prune the genome tree to a reduced set of taxa and rebuild its reference package.

    Steps: collapse nearly identical genomes to one random representative,
    write the reduced alignment, prune the input tree to the retained taxa,
    and create a pplacer reference package with the external 'taxit' tool.
    """
    # make sure output directory exists
    if not os.path.exists(outputDir):
        os.mkdir(outputDir)

    # remove similar taxa
    print 'Filtering out highly similar taxa in order to reduce size of tree:'
    seqs = readFasta(self.derepConcatenatedAlignFile)
    nearlyIdentical = self.__nearlyIdenticalGenomes(seqs, outputDir)

    # keep one randomly chosen representative from each cluster of
    # nearly identical genomes
    reducedSeqs = {}
    for s in nearlyIdentical:
        rndGenome = random.choice(tuple(s))
        reducedSeqs[rndGenome] = seqs[rndGenome]

    # write out reduced alignment
    reducedAlignmentFile = os.path.join(outputDir, "genome_tree.fasta")
    writeFasta(reducedSeqs, reducedAlignmentFile)

    # prune tree to retained taxa
    print ''
    print 'Pruning tree:'
    tree = dendropy.Tree.get_from_path(self.tree, schema='newick', as_rooted=False, preserve_underscores=True)

    # sanity check: warn about retained sequences with no matching taxon
    # in the tree (the pruning below silently ignores missing labels)
    for seqId in reducedSeqs:
        node = tree.find_node_with_taxon_label(seqId)
        if not node:
            print 'Missing taxa: %s' % seqId

    tree.retain_taxa_with_labels(reducedSeqs.keys())

    outputTree = os.path.join(outputDir, 'genome_tree.tre')
    tree.write_to_path(outputTree, schema='newick', suppress_rooting=True, unquoted_underscores=True)

    # strip internal node labels before writing the second tree; the
    # label-free tree is intended for re-estimating model parameters
    for t in tree.internal_nodes():
        t.label = None

    # sanity check: warn about leaves with no sequence in the reduced alignment
    for t in tree.leaf_nodes():
        if t.taxon.label not in reducedSeqs:
            print 'missing in sequence file: %s' % t.taxon.label

    outputTreeWithoutLabels = os.path.join(outputDir, 'genome_tree.small.no_internal_labels.tre')
    tree.write_to_path(outputTreeWithoutLabels, schema='newick', suppress_rooting=True, unquoted_underscores=True)

    print '  Pruned tree written to: %s' % outputTree

    # calculate model parameters for pruned tree
    # NOTE(review): the FastTree invocation below is commented out, so
    # outputTreeLog is never produced here and fastTreeOutput is unused;
    # taxit is handed a log file that must come from a previous run
    print ''
    print 'Determining model parameters for new tree.'
    outputTreeLog = os.path.join(outputDir, 'genome_tree.log')
    fastTreeOutput = os.path.join(outputDir, 'genome_tree.no_internal_labels.fasttree.tre')
    # os.system('FastTreeMP -nome -mllen -intree %s -log %s < %s > %s' % (outputTreeWithoutLabels, outputTreeLog, reducedAlignmentFile, fastTreeOutput))

    # calculate reference package for pruned tree
    print ''
    print 'Creating reference package.'
    os.system('taxit create -l %s -P %s --aln-fasta %s --tree-stats %s --tree-file %s' % ('genome_tree_reduced', os.path.join(outputDir, 'genome_tree_reduced.refpkg'), reducedAlignmentFile, outputTreeLog, outputTree))
def __createConcatenatedAlignment(self, binFiles, resultsParser, alignOutputDir):
    """Create a concatenated alignment of marker genes for each bin.

    Markers missing from a bin are padded with gap characters so every
    concatenated sequence has the same length.

    Returns the path to the concatenated alignment file.
    """
    # read alignment files
    self.logger.info('  Reading marker alignment files.')
    alignments = defaultdict(dict)
    binIds = set()
    for f in os.listdir(alignOutputDir):
        if f.endswith('.masked.faa'):
            markerId = f[0:f.find('.masked.faa')]
            seqs = readFasta(os.path.join(alignOutputDir, f))
            for seqId, seq in seqs.items():
                # sequence ids are '<binId><SEQ_CONCAT_CHAR><seqId>'
                binId = seqId[0:seqId.find(DefaultValues.SEQ_CONCAT_CHAR)]
                alignments[markerId][binId] = seq
                binIds.add(binId)

    # get all markers and their lengths; every entry of resultsParser.models
    # holds the same marker set, so index the first entry once instead of
    # re-indexing the models dict for each marker
    firstModels = resultsParser.models[next(iter(resultsParser.models))]
    markerIds = list(firstModels.keys())
    markerIdLens = {markerId: firstModels[markerId].leng for markerId in markerIds}

    # create concatenated alignment; collect fragments per bin and join once
    # to avoid quadratic repeated string concatenation
    self.logger.info('  Concatenating alignments.')
    seqFragments = {binId: [] for binId in binIds}
    for markerId in sorted(markerIds):
        seqs = alignments[markerId]
        for binId in binIds:
            # use the aligned gene when present, otherwise pad with gaps
            seqFragments[binId].append(seqs.get(binId, '-' * markerIdLens[markerId]))

    concatenatedSeqs = {binId: ''.join(fragments) for binId, fragments in seqFragments.items()}

    # save concatenated alignment
    concatenatedAlignFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_CONCAT_SEQ_OUT)
    writeFasta(concatenatedSeqs, concatenatedAlignFile)

    return concatenatedAlignFile
def modify(self, binFile, seqFile, seqsToAdd, seqsToRemove, outputFile):
    """Add and remove sequences from a file.

    Parameters:
      binFile: FASTA file of the bin to modify.
      seqFile: FASTA file providing the sequences referenced by seqsToAdd.
      seqsToAdd: ids of sequences to add, or None to add nothing.
      seqsToRemove: ids of sequences to remove, or None to remove nothing.
      outputFile: path at which to write the modified bin.
    """
    binSeqs = readFasta(binFile)

    # add sequences to bin; 'is not None' is the correct identity test
    # (the original used '!= None')
    if seqsToAdd is not None:
        refSeqs = readFasta(seqFile)
        self.__addSeqs(binSeqs, refSeqs, seqsToAdd)

    # remove sequences from bin
    if seqsToRemove is not None:
        self.__removeSeqs(binSeqs, seqsToRemove)

    # save modified bin
    writeFasta(binSeqs, outputFile)
def __createConcatenatedAlignment(self, binFiles, resultsParser, alignOutputDir):
    """Create a concatenated alignment of marker genes for each bin.

    Markers missing from a bin are padded with gap characters so every
    concatenated sequence has the same length.

    Returns the path to the concatenated alignment file.
    """
    # read alignment files
    self.logger.info('  Reading marker alignment files.')
    alignments = defaultdict(dict)
    binIds = set()
    for f in os.listdir(alignOutputDir):
        if f.endswith('.masked.faa'):
            markerId = f[0:f.find('.masked.faa')]
            seqs = readFasta(os.path.join(alignOutputDir, f))
            for seqId, seq in seqs.iteritems():
                # sequence ids are '<binId><SEQ_CONCAT_CHAR><seqId>'
                binId = seqId[0:seqId.find(DefaultValues.SEQ_CONCAT_CHAR)]
                alignments[markerId][binId] = seq
                binIds.add(binId)

    # get all markers and their lengths; every entry of resultsParser.models
    # holds the same marker set, so index the first entry once instead of
    # re-indexing the models dict for each marker
    firstModels = resultsParser.models[next(iter(resultsParser.models))]
    markerIds = firstModels.keys()
    markerIdLens = dict((markerId, firstModels[markerId].leng) for markerId in markerIds)

    # create concatenated alignment; collect fragments per bin and join once
    # to avoid quadratic repeated string concatenation
    self.logger.info('  Concatenating alignments.')
    seqFragments = dict((binId, []) for binId in binIds)
    for markerId in sorted(markerIds):
        seqs = alignments[markerId]
        for binId in binIds:
            # use the aligned gene when present, otherwise pad with gaps
            seqFragments[binId].append(seqs.get(binId, '-' * markerIdLens[markerId]))

    concatenatedSeqs = dict((binId, ''.join(fragments)) for binId, fragments in seqFragments.iteritems())

    # save concatenated alignment
    concatenatedAlignFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_CONCAT_SEQ_OUT)
    writeFasta(concatenatedSeqs, concatenatedAlignFile)

    return concatenatedAlignFile
def run(self, query, bNucORFs=True):
    """Call genes with Prodigal using translation tables 4 and 11 and keep
    the results from the table yielding the highest coding density.

    Parameters:
      query: FASTA file of sequences, optionally gzip-compressed.
      bNucORFs: also produce nucleotide gene sequences when True.

    Returns:
      The selected translation table (4 or 11).
    """
    prodigal_input = query

    # decompress gzip input files since Prodigal expects plain FASTA
    tmp_dir = None
    if prodigal_input.endswith('.gz'):
        tmp_dir = tempfile.mkdtemp()
        # bug fix: the original referenced 'seqs' here before it was ever
        # assigned (NameError for any gzip input); read the compressed
        # file first, then write it out uncompressed
        seqs = readFasta(prodigal_input)
        prodigal_input = os.path.join(tmp_dir, os.path.basename(prodigal_input[0:-3]) + '.fna')
        writeFasta(seqs, prodigal_input)

    # gather statistics about query file
    seqs = readFasta(prodigal_input)
    totalBases = 0
    for seqId, seq in seqs.items():
        totalBases += len(seq)

    # call ORFs with different translation tables and select the one with
    # the highest coding density
    tableCodingDensity = {}
    for translationTable in [4, 11]:
        aaGeneFile = self.aaGeneFile + '.' + str(translationTable)
        ntGeneFile = self.ntGeneFile + '.' + str(translationTable)
        gffFile = self.gffFile + '.' + str(translationTable)

        # check if there is sufficient bases to calculate prodigal parameters
        if totalBases < 100000:
            procedureStr = 'meta'  # use best precalculated parameters
        else:
            procedureStr = 'single'  # estimate parameters from data

        if bNucORFs:
            cmd = ('prodigal -p %s -q -m -f gff -g %d -a %s -d %s -i %s > %s 2> /dev/null'
                   % (procedureStr, translationTable, aaGeneFile, ntGeneFile, prodigal_input, gffFile))
        else:
            cmd = ('prodigal -p %s -q -m -f gff -g %d -a %s -i %s > %s 2> /dev/null'
                   % (procedureStr, translationTable, aaGeneFile, prodigal_input, gffFile))

        os.system(cmd)

        if not self.__areORFsCalled(aaGeneFile) and procedureStr == 'single':
            # prodigal will fail to learn a model if the input genome has a
            # large number of N's, so try gene prediction with 'meta'
            cmd = cmd.replace('-p single', '-p meta')
            os.system(cmd)

        # determine coding density
        prodigalParser = ProdigalGeneFeatureParser(gffFile)

        codingBases = 0
        for seqId, seq in seqs.items():
            codingBases += prodigalParser.codingBases(seqId)

        # guard against an empty query file
        if totalBases != 0:
            codingDensity = float(codingBases) / totalBases
        else:
            codingDensity = 0
        tableCodingDensity[translationTable] = codingDensity

    # determine best translation table: prefer table 4 only when it is
    # clearly denser than table 11 and plausibly genic overall
    bestTranslationTable = 11
    if (tableCodingDensity[4] - tableCodingDensity[11] > 0.05) and tableCodingDensity[4] > 0.7:
        bestTranslationTable = 4

    shutil.copyfile(self.aaGeneFile + '.' + str(bestTranslationTable), self.aaGeneFile)
    shutil.copyfile(self.gffFile + '.' + str(bestTranslationTable), self.gffFile)
    if bNucORFs:
        shutil.copyfile(self.ntGeneFile + '.' + str(bestTranslationTable), self.ntGeneFile)

    # clean up redundant prodigal results
    for translationTable in [4, 11]:
        os.remove(self.aaGeneFile + '.' + str(translationTable))
        os.remove(self.gffFile + '.' + str(translationTable))
        if bNucORFs:
            os.remove(self.ntGeneFile + '.' + str(translationTable))

    # bug fix: prodigal_input was reassigned to a '.fna' path above, so the
    # original "prodigal_input.endswith('.gz')" check never fired and the
    # temporary directory leaked; track it explicitly instead
    if tmp_dir is not None:
        shutil.rmtree(tmp_dir)

    return bestTranslationTable
def run(self, geneTreeDir, alignmentDir, extension, outputAlignFile, outputTree, outputTaxonomy, bSupportValues=False):
    """Infer a genome tree from a concatenated alignment of marker genes.

    Reads gene trees to select markers, filters paralogs from the per-gene
    alignments of trusted IMG genomes, concatenates the alignments (padding
    missing genes with gaps), and infers the tree with FastTreeMP.

    Parameters:
      geneTreeDir: directory of per-gene '.tre' files; markers are taken
        from these file name prefixes.
      alignmentDir: directory of per-gene alignment files.
      extension: file extension identifying alignment files to use.
      outputAlignFile: path for the concatenated alignment.
      outputTree: path for the inferred newick tree.
      outputTaxonomy: path for the genome taxonomy file.
      bSupportValues: keep support values in the tree when True.
    """
    # read gene trees
    print 'Reading gene trees.'
    geneIds = set()
    files = os.listdir(geneTreeDir)
    for f in files:
        if f.endswith('.tre'):
            geneId = f[0:f.find('.')]
            geneIds.add(geneId)

    # write out genome tree taxonomy
    # NOTE(review): IMG database paths are hard-coded to a specific server
    print 'Reading trusted genomes.'
    img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
    genomeIds = img.genomeMetadata().keys()
    self.__taxonomy(img, genomeIds, outputTaxonomy)
    print '  There are %d trusted genomes.' % (len(genomeIds))

    # get genes in genomes
    print 'Reading all PFAM and TIGRFAM hits in trusted genomes.'
    genesInGenomes = self.__genesInGenomes(genomeIds)

    # read alignment files
    print 'Reading alignment files.'
    alignments = {}
    # genomeIds is deliberately rebound here: from this point on it holds
    # only the genomes actually present in the alignments
    genomeIds = set()
    files = os.listdir(alignmentDir)
    for f in files:
        geneId = f[0:f.find('.')]
        if f.endswith(extension) and geneId in geneIds:
            seqs = readFasta(os.path.join(alignmentDir, f))

            # 'PF' marker ids use IMG's 'pfam' naming convention
            imgGeneId = geneId
            if imgGeneId.startswith('PF'):
                imgGeneId = imgGeneId.replace('PF', 'pfam')
            seqs = self.__filterParalogs(seqs, imgGeneId, genesInGenomes)

            genomeIds.update(set(seqs.keys()))
            alignments[geneId] = seqs

    # create concatenated alignment
    print 'Concatenating alignments:'
    concatenatedSeqs = {}
    totalAlignLen = 0
    for geneId in sorted(alignments.keys()):
        seqs = alignments[geneId]
        # Python 2: dict.keys() returns a list, so [0] picks an arbitrary sequence
        alignLen = len(seqs[seqs.keys()[0]])
        print '  ' + str(geneId) + ',' + str(alignLen)
        totalAlignLen += alignLen

        for genomeId in genomeIds:
            if genomeId in seqs:
                # append alignment
                concatenatedSeqs['IMG_' + genomeId] = concatenatedSeqs.get('IMG_' + genomeId, '') + seqs[genomeId]
            else:
                # missing gene; pad with gaps to keep columns aligned
                concatenatedSeqs['IMG_' + genomeId] = concatenatedSeqs.get('IMG_' + genomeId, '') + '-' * alignLen

    print '  Total alignment length: ' + str(totalAlignLen)

    # save concatenated alignment
    writeFasta(concatenatedSeqs, outputAlignFile)

    # infer genome tree
    print 'Inferring genome tree.'
    outputLog = outputTree[0:outputTree.rfind('.')] + '.log'

    supportStr = ' '
    if not bSupportValues:
        supportStr = ' -nosupport '  # suppress support values in the output tree

    cmd = 'FastTreeMP' + supportStr + '-wag -gamma -log ' + outputLog + ' ' + outputAlignFile + ' > ' + outputTree
    os.system(cmd)
def run(
    self, geneTreeDir, alignmentDir, extension, outputAlignFile, outputTree, outputTaxonomy, bSupportValues=False
):
    """Infer a genome tree from a concatenated alignment of marker genes.

    Reads gene trees to select markers, filters paralogs from the per-gene
    alignments of trusted IMG genomes, concatenates the alignments (padding
    missing genes with gaps), and infers the tree with FastTreeMP.

    Parameters:
      geneTreeDir: directory of per-gene '.tre' files; markers are taken
        from these file name prefixes.
      alignmentDir: directory of per-gene alignment files.
      extension: file extension identifying alignment files to use.
      outputAlignFile: path for the concatenated alignment.
      outputTree: path for the inferred newick tree.
      outputTaxonomy: path for the genome taxonomy file.
      bSupportValues: keep support values in the tree when True.
    """
    # read gene trees
    print "Reading gene trees."
    geneIds = set()
    files = os.listdir(geneTreeDir)
    for f in files:
        if f.endswith(".tre"):
            geneId = f[0 : f.find(".")]
            geneIds.add(geneId)

    # write out genome tree taxonomy
    # NOTE(review): IMG database paths are hard-coded to a specific server
    print "Reading trusted genomes."
    img = IMG("/srv/whitlam/bio/db/checkm/img/img_metadata.tsv", "/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv")
    genomeIds = img.genomeMetadata().keys()
    self.__taxonomy(img, genomeIds, outputTaxonomy)
    print "  There are %d trusted genomes." % (len(genomeIds))

    # get genes in genomes
    print "Reading all PFAM and TIGRFAM hits in trusted genomes."
    genesInGenomes = self.__genesInGenomes(genomeIds)

    # read alignment files
    print "Reading alignment files."
    alignments = {}
    # genomeIds is deliberately rebound here: from this point on it holds
    # only the genomes actually present in the alignments
    genomeIds = set()
    files = os.listdir(alignmentDir)
    for f in files:
        geneId = f[0 : f.find(".")]
        if f.endswith(extension) and geneId in geneIds:
            seqs = readFasta(os.path.join(alignmentDir, f))

            # 'PF' marker ids use IMG's 'pfam' naming convention
            imgGeneId = geneId
            if imgGeneId.startswith("PF"):
                imgGeneId = imgGeneId.replace("PF", "pfam")
            seqs = self.__filterParalogs(seqs, imgGeneId, genesInGenomes)

            genomeIds.update(set(seqs.keys()))
            alignments[geneId] = seqs

    # create concatenated alignment
    print "Concatenating alignments:"
    concatenatedSeqs = {}
    totalAlignLen = 0
    for geneId in sorted(alignments.keys()):
        seqs = alignments[geneId]
        # Python 2: dict.keys() returns a list, so [0] picks an arbitrary sequence
        alignLen = len(seqs[seqs.keys()[0]])
        print "  " + str(geneId) + "," + str(alignLen)
        totalAlignLen += alignLen

        for genomeId in genomeIds:
            if genomeId in seqs:
                # append alignment
                concatenatedSeqs["IMG_" + genomeId] = concatenatedSeqs.get("IMG_" + genomeId, "") + seqs[genomeId]
            else:
                # missing gene; pad with gaps to keep columns aligned
                concatenatedSeqs["IMG_" + genomeId] = concatenatedSeqs.get("IMG_" + genomeId, "") + "-" * alignLen

    print "  Total alignment length: " + str(totalAlignLen)

    # save concatenated alignment
    writeFasta(concatenatedSeqs, outputAlignFile)

    # infer genome tree
    print "Inferring genome tree."
    outputLog = outputTree[0 : outputTree.rfind(".")] + ".log"

    supportStr = " "
    if not bSupportValues:
        supportStr = " -nosupport "  # suppress support values in the output tree

    cmd = "FastTreeMP" + supportStr + "-wag -gamma -log " + outputLog + " " + outputAlignFile + " > " + outputTree
    os.system(cmd)