Exemple #1
0
    def __init__(self):
        """Create the helper objects and the parameter grids used by this class."""
        # locations of the IMG metadata table and the TIGRFAM-to-Pfam mapping
        imgMetadataFile = '/srv/whitlam/bio/db/checkm/img/img_metadata.tsv'
        tigrfam2PfamFile = '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv'

        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG(imgMetadataFile, tigrfam2PfamFile)

        # NOTE(review): presumably contig lengths (bp) and completeness /
        # contamination fractions to sweep over -- inferred from the names
        # only, confirm against the methods that read them.
        self.contigLens = [1000, 2000, 5000, 10000, 20000, 50000]
        self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]
        self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2]
Exemple #2
0
    def __workerThread(self, ubiquityThreshold, singleCopyThreshold,
                       minGenomes, colocatedDistThreshold,
                       colocatedGenomeThreshold, metadata, queueIn, queueOut):
        """Build a marker set for each lineage taken from queueIn.

        Lineages are pulled from queueIn until a None sentinel is received.
        For every lineage a tuple (lineage, colocatedSets, number of genomes)
        is put on queueOut, where colocatedSets is None when the lineage has
        fewer than minGenomes genomes.

        Args:
            ubiquityThreshold: forwarded to MarkerSetBuilder.buildMarkerSet.
            singleCopyThreshold: forwarded to MarkerSetBuilder.buildMarkerSet.
            minGenomes: minimum number of genomes a lineage must contain for
                a marker set to be built.
            colocatedDistThreshold: forwarded to
                MarkerSetBuilder.buildMarkerSet.
            colocatedGenomeThreshold: accepted for interface compatibility;
                not used by this method.
            metadata: genome metadata passed to IMG.genomeIdsByTaxonomy.
            queueIn: queue of lineage names, terminated by None.
            queueOut: queue receiving one result tuple per lineage.
        """

        img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv',
                  '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        markerSetBuilder = MarkerSetBuilder()

        while True:
            lineage = queueIn.get(block=True, timeout=None)
            if lineage is None:
                # sentinel: no more work for this worker
                break

            # the 'Universal' lineage is looked up as all prokaryotes
            taxon = 'prokaryotes' if lineage == 'Universal' else lineage
            genomeIds = img.genomeIdsByTaxonomy(taxon, metadata)

            colocatedSets = None
            if len(genomeIds) >= minGenomes:
                markerSet = markerSetBuilder.buildMarkerSet(
                    genomeIds, ubiquityThreshold, singleCopyThreshold,
                    colocatedDistThreshold)
                colocatedSets = markerSet.markerSet

            # allow results to be processed or written to file
            queueOut.put((lineage, colocatedSets, len(genomeIds)))
Exemple #3
0
    def __init__(self, outputDir):
        """Create the output directory tree and record all output paths.

        Args:
            outputDir: directory to create; it must not exist yet.

        Raises:
            SystemExit: with a non-zero status if outputDir already exists.
        """
        self.img = IMG()
        self.markerSetBuilder = MarkerSetBuilder()

        if os.path.exists(outputDir):
            # single-argument call form prints identically on Python 2 and 3
            print('[Error] Output directory already exists: ' + outputDir)
            # a non-zero status lets callers detect the failure
            sys.exit(1)
        os.makedirs(outputDir)

        self.__checkForHMMER()
        self.__checkForFastTree()

        # working sub-directories
        self.hmmDir = os.path.join(outputDir, 'phylo_hmms')
        self.alignmentDir = os.path.join(outputDir, 'gene_alignments')
        self.geneTreeDir = os.path.join(outputDir, 'gene_trees')
        self.conspecificGeneTreeDir = os.path.join(outputDir,
                                                   'gene_trees_conspecific')
        self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final')

        # alignment and tree output files
        self.consistencyOut = os.path.join(outputDir,
                                           'genome_tree.consistency.tsv')
        self.concatenatedAlignFile = os.path.join(
            outputDir, 'genome_tree.concatenated.faa')
        self.derepConcatenatedAlignFile = os.path.join(
            outputDir, 'genome_tree.concatenated.derep.fasta')
        self.treeOut = os.path.join(outputDir, 'genome_tree.tre')
        self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre')

        self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre')
        self.treeTaxonomyOut = os.path.join(outputDir,
                                            'genome_tree.taxonomy.tre')
        self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre')
        self.treeDerepRootedOut = os.path.join(outputDir,
                                               'genome_tree.derep.rooted.tre')
        self.treeDerepBootstrapOut = os.path.join(outputDir,
                                                  'genome_tree.derep.bs.tre')
        self.treeDerepFinalOut = os.path.join(outputDir,
                                              'genome_tree.final.tre')
        self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv')
        self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv')
        self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm')
        self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt')

        # thresholds
        self.phyloUbiquity = 0.90
        self.phyloSingleCopy = 0.90
        self.paralogAcceptPer = 0.01
        # 0.906 is for trees at the phylum-level; 0.95 was the value used for
        # trees at the class-level
        self.consistencyAcceptPer = 0.906
        self.consistencyMinTaxa = 20

        # create output directories
        for subDir in (self.hmmDir, self.alignmentDir, self.geneTreeDir,
                       self.conspecificGeneTreeDir, self.finalGeneTreeDir):
            os.makedirs(subDir)
Exemple #4
0
    def __init__(self):
        """Record the simulation summary file and create the helper objects."""
        # NOTE(review): looRank looks like a taxonomic rank index used for
        # leave-one-out testing -- inferred from the name, confirm at use site.
        self.looRank = 5
        self.simFile = './experiments/simulation.tuning.genus.summary.tsv'

        markerSetBuilder = MarkerSetBuilder()
        self.markerSetBuilder = markerSetBuilder

        img = IMG()
        self.img = img
 def __init__(self):
     """Create the marker set builder and the IMG accessor."""
     # locations of the IMG metadata table and the TIGRFAM-to-Pfam mapping
     imgMetadataFile = '/srv/whitlam/bio/db/checkm/img/img_metadata.tsv'
     tigrfam2PfamFile = '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv'

     self.markerSetBuilder = MarkerSetBuilder()
     self.img = IMG(imgMetadataFile, tigrfam2PfamFile)
 def __init__(self):
     """Create the helper objects and set the simulated contig length."""
     # NOTE(review): presumably a contig length in base pairs -- confirm
     # against the code that reads simContigLen.
     self.simContigLen = 10000

     markerSetBuilder = MarkerSetBuilder()
     self.markerSetBuilder = markerSetBuilder

     img = IMG()
     self.img = img
Exemple #7
0
    def run(self, inputMetadataFile, outputMetadataFile, outputDir,
            ubiquityThreshold, singleCopyThreshold, trustedCompleteness,
            trustedContamination):
        """Split genomes into trusted and filtered sets and write reports.

        Genomes are grouped by the second tab-separated column of
        inputMetadataFile. For each group a marker set is built from the
        genomes whose third column is 'Finished', and every genome of the
        group is checked against that marker set. A genome is trusted when
        its completeness is >= trustedCompleteness and its contamination is
        <= trustedContamination; otherwise it is filtered.

        Files written to outputDir: genomes_all.tsv, genomes_trusted.tsv,
        genomes_filtered.tsv, trusted_marker_sets_<group>.txt (one per group)
        and lineage_stats.tsv. outputMetadataFile receives the input header
        followed by the metadata lines of the trusted genomes. Progress is
        printed to standard output.

        Args:
            inputMetadataFile: tab-separated metadata file with a header line;
                columns 1-3 are read as genome id, group and status.
            outputMetadataFile: path of the metadata file to write.
            outputDir: existing directory for the report files.
            ubiquityThreshold: forwarded to MarkerSetBuilder.buildMarkerSet.
            singleCopyThreshold: forwarded to MarkerSetBuilder.buildMarkerSet.
            trustedCompleteness: minimum completeness of a trusted genome.
            trustedContamination: maximum contamination of a trusted genome.
        """
        img = IMG()
        markerSetBuilder = MarkerSetBuilder()

        # the three genome tables share one column header
        tableHeader = (
            'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\t'
            'Gene count\tCoding base count\tN50\tBiotic Relationship\t'
            'Status\tCompleteness\tContamination\tMissing markers\t'
            'Duplicate markers\n')

        # context managers guarantee the files are closed even if a genome
        # check or marker set computation raises
        with open(os.path.join(outputDir, 'genomes_all.tsv'), 'w') as allOut, \
                open(os.path.join(outputDir, 'genomes_trusted.tsv'),
                     'w') as trustedOut, \
                open(os.path.join(outputDir, 'genomes_filtered.tsv'),
                     'w') as filteredOut, \
                open(outputMetadataFile, 'w') as metadataOut:
            for table in (allOut, trustedOut, filteredOut):
                table.write(tableHeader)

            # read input metadata file
            metadata = img.genomeMetadataFromFile(inputMetadataFile)

            finishedGenomes = defaultdict(set)
            allGenomes = defaultdict(set)

            metadataLine = {}

            with open(inputMetadataFile) as metadataIn:
                for lineNum, line in enumerate(metadataIn):
                    if lineNum == 0:
                        # copy the header through unchanged
                        metadataOut.write(line)
                        continue

                    lineSplit = line.split('\t')
                    genomeId = lineSplit[0]
                    domain = lineSplit[1]
                    status = lineSplit[2]

                    if status == 'Finished':
                        finishedGenomes[domain].add(genomeId)

                    allGenomes[domain].add(genomeId)
                    metadataLine[genomeId] = line

            allTrustedGenomeIds = set()
            for lineage, allLineageGenomeIds in allGenomes.items():
                print('[' + lineage + ']')
                print('  Number of genomes: %d' % len(allLineageGenomeIds))

                # tabulate genomes from each phylum
                allPhylumCounts = {}
                for genomeId in allLineageGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

                # identify marker genes for finished genomes
                print(
                    '\nDetermining initial marker gene sets for genome filtering.'
                )
                markerSet = markerSetBuilder.buildMarkerSet(
                    finishedGenomes[lineage], ubiquityThreshold,
                    singleCopyThreshold)

                print(
                    '  Marker set consists of %s marker genes organized into %d sets.'
                    % (markerSet.numMarkers(), markerSet.numSets()))
                markerSetFile = os.path.join(
                    outputDir, 'trusted_marker_sets_' + lineage + '.txt')
                with open(markerSetFile, 'w') as fout:
                    fout.write(str(markerSet.markerSet))

                # identifying trusted genomes (highly complete, low
                # contamination genomes)
                print(
                    '\nIdentifying highly complete, low contamination genomes.'
                )
                trustedGenomeIds = set()
                filteredGenomes = set()
                retainedStatus = {}
                filteredStatus = {}
                geneCountTable = img.geneCountTable(allLineageGenomeIds)
                for genomeId in allLineageGenomeIds:
                    (completeness, contamination, missingMarkers,
                     duplicateMarkers) = markerSetBuilder.genomeCheck(
                         markerSet.markerSet, genomeId, geneCountTable)

                    genomeStr = self.__genomeString(genomeId, metadata,
                                                    completeness,
                                                    contamination,
                                                    missingMarkers,
                                                    duplicateMarkers)
                    genomeStatus = metadata[genomeId]['status']

                    if (completeness >= trustedCompleteness
                            and contamination <= trustedContamination):
                        trustedGenomeIds.add(genomeId)
                        allTrustedGenomeIds.add(genomeId)
                        retainedStatus[genomeStatus] = retainedStatus.get(
                            genomeStatus, 0) + 1

                        trustedOut.write(genomeStr)
                        metadataOut.write(metadataLine[genomeId])
                    else:
                        filteredGenomes.add(genomeId)
                        filteredStatus[genomeStatus] = filteredStatus.get(
                            genomeStatus, 0) + 1

                        filteredOut.write(genomeStr)

                    # every genome is listed in the combined table
                    allOut.write(genomeStr)

                print('  Filtered genomes: %d (%.2f%%)' %
                      (len(filteredGenomes), len(filteredGenomes) * 100.0 /
                       len(allLineageGenomeIds)))
                print('  ' + str(filteredStatus))
                print('  \nTrusted genomes: %d (%.2f%%)' %
                      (len(trustedGenomeIds), len(trustedGenomeIds) * 100.0 /
                       len(allLineageGenomeIds)))
                print('  ' + str(retainedStatus))

                # determine status of retained genomes
                print('\nTrusted genomes by phylum:')
                trustedPhylumCounts = {}
                for genomeId in trustedGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    trustedPhylumCounts[taxon] = trustedPhylumCounts.get(
                        taxon, 0) + 1

                for phylum, count in allPhylumCounts.items():
                    print('  ' + phylum + ': %d of %d' %
                          (trustedPhylumCounts.get(phylum, 0), count))
                print('')

        # write out lineage statistics for genome distribution
        allStats = {}
        trustedStats = {}

        for r in range(0, 6):  # Domain to Genus
            for genomeId, data in metadata.items():
                taxaStr = ';'.join(data['taxonomy'][0:r + 1])
                allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
                if genomeId in allTrustedGenomeIds:
                    trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

        sortedLineages = img.lineagesSorted(metadata)

        with open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w') as fout:
            fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
            for lineage in sortedLineages:
                fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) +
                           '\t' + str(trustedStats.get(lineage, 0)) + '\n')
Exemple #8
0
 def __init__(self):
     """Create the marker set builder and the IMG accessor."""
     markerSetBuilder = MarkerSetBuilder()
     self.markerSetBuilder = markerSetBuilder

     img = IMG()
     self.img = img