Example #1
0
    def pull(self, options):
        """Create taxonomy file from a decorated tree.

        When ``options.no_validation`` is set, the tree is read directly and
        each leaf is assigned the taxa found in the labels of its ancestors,
        ordered from the root towards the leaf. Otherwise the taxonomy is
        obtained through Taxonomy().read_from_tree.

        Parameters
        ----------
        options : argparse-like namespace
            Reads ``input_tree``, ``no_validation`` and ``output_taxonomy``.
        """

        check_file_exists(options.input_tree)

        if options.no_validation:
            tree = dendropy.Tree.get_from_path(options.input_tree,
                                               schema='newick',
                                               rooting="force-rooted",
                                               preserve_underscores=True)

            taxonomy = {}
            for leaf in tree.leaf_node_iter():
                # climb from the leaf to the root, gathering taxa on the way
                taxa = []
                node = leaf.parent_node
                while node:
                    _support, taxon, _aux_info = parse_label(node.label)
                    if taxon:
                        # a single label may carry several ';'-separated taxa;
                        # they are added in reverse so that the final reversal
                        # below restores their original left-to-right order
                        taxa.extend(reversed([t.strip() for t in taxon.split(';')]))
                    node = node.parent_node

                taxonomy[leaf.taxon.label] = taxa[::-1]
        else:
            taxonomy = Taxonomy().read_from_tree(options.input_tree)

        Taxonomy().write(taxonomy, options.output_taxonomy)

        # the output is a taxonomy file, not a stripped tree
        self.logger.info('Taxonomy written to: %s' % options.output_taxonomy)
Example #2
0
    def lsu_tree(self, options):
        """Infer 23S tree spanning GTDB genomes.

        Checks the required external programs and input files, makes sure
        the output directory is available, and then runs the 'lsu' RNA
        workflow with the filtering settings taken from ``options``.
        """

        required_programs = ['esl-sfetch', 'cmsearch', 'cmalign',
                             'esl-alimask', 'FastTreeMP', 'blastn']
        check_dependencies(required_programs)

        for input_file in (options.gtdb_metadata_file, options.gtdb_lsu_file):
            check_file_exists(input_file)
        make_sure_path_exists(options.output_dir)

        workflow = RNA_Workflow(options.cpus)

        # NOTE: options.reps_only and options.user_genomes are not passed on;
        # the workflow receives the negation of the disable_tax_filter flag.
        workflow_args = (
            'lsu',
            options.gtdb_metadata_file,
            options.gtdb_lsu_file,
            options.min_lsu_length,
            options.min_scaffold_length,
            options.min_quality,
            options.max_contigs,
            options.min_N50,
            not options.disable_tax_filter,
            options.genome_list,
            options.output_dir,
        )
        workflow.run(*workflow_args)

        self.logger.info('Results written to: %s' % options.output_dir)
Example #3
0
    def strip(self, options):
        """Remove taxonomic labels from tree.

        Internal node labels containing a ':' are reduced to the text in
        front of the first ':' (treated as the support value); any other
        non-empty internal label is cleared. The result is written to
        ``options.output_tree``.

        Parameters
        ----------
        options : argparse-like namespace
            Reads ``input_tree`` and ``output_tree``.
        """

        check_file_exists(options.input_tree)

        tree = dendropy.Tree.get_from_path(options.input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        for node in tree.internal_nodes():
            if not node.label:
                continue

            if ':' in node.label:
                # partition() splits on the first ':' only, so labels whose
                # taxonomic part itself contains ':' no longer raise
                support, _sep, _taxa = node.label.partition(':')
                node.label = support
            else:
                node.label = None

        tree.write_to_path(options.output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        self.logger.info('Stripped tree written to: %s' % options.output_tree)
Example #4
0
    def cluster(self, options):
        """Cluster command.

        Loads scaffold statistics from ``options.scaffold_stats_file`` and
        passes them, together with the clustering settings in ``options``,
        to Cluster.run. A time stamp is printed when finished.
        """
        log = self.logger.info
        rule = '*******************************************************************************'

        log('')
        log(rule)
        log(' [RefineM - cluster] Partitioning bin into clusters.')
        log(rule)

        for input_file in (options.scaffold_stats_file, options.genome_file):
            check_file_exists(input_file)
        make_sure_path_exists(options.output_dir)

        log('')
        log('  Reading scaffold statistics.')
        stats = ScaffoldStats()
        stats.read(options.scaffold_stats_file)

        clusterer = Cluster(options.cpus)
        cluster_args = (
            stats,
            options.num_clusters,
            options.num_components,
            options.K,
            options.no_coverage,
            options.no_pca,
            options.iterations,
            options.genome_file,
            options.output_dir,
        )
        clusterer.run(*cluster_args)

        log('')
        log('  Partitioned sequences written to: ' + options.output_dir)

        self.time_keeper.print_time_stamp()
Example #5
0
    def gene(self, options):
        """Calculate gene properties of a genome and write them to file.

        Two tab-separated files are written to ``options.output_dir``:
        'metadata.genome_gene.tsv' with one ``field<TAB>value`` row per
        field, and 'metadata.genome_gene.desc.tsv' with the field name, its
        description and the upper-cased type name of its value.

        Parameters
        ----------
        options : argparse-like namespace
            Reads ``genome_file``, ``gff_file`` and ``output_dir``.
        """
        self.logger.info('Calculating gene properties of genome.')

        check_file_exists(options.genome_file)
        check_file_exists(options.gff_file)
        make_sure_path_exists(options.output_dir)

        meta_genes = MetadataGenes()
        metadata_values, metadata_desc = meta_genes.generate(options.genome_file,
                                                             options.gff_file)

        # write statistics to file; 'with' closes the handle even on error
        output_file = os.path.join(options.output_dir, 'metadata.genome_gene.tsv')
        with open(output_file, 'w') as fout:
            for field in sorted(metadata_values.keys()):
                fout.write('%s\t%s\n' % (field, str(metadata_values[field])))

        # write description to file
        output_file = os.path.join(options.output_dir, 'metadata.genome_gene.desc.tsv')
        with open(output_file, 'w') as fout:
            for field in sorted(metadata_desc.keys()):
                fout.write('%s\t%s\t%s\n' % (field,
                                             metadata_desc[field],
                                             type(metadata_values[field]).__name__.upper()))
Example #6
0
    def reference(self, options):
        """Reference command.

        Collects the reference protein files, stops with a warning when
        self._check_protein_seqs rejects them, and otherwise runs
        Reference.run and reports where its results were written.
        """
        log = self.logger.info
        rule = '*******************************************************************************'

        log('')
        log(rule)
        log('[RefineM - reference] Identifying scaffolds similar to specific genome(s).')
        log(rule)

        for input_file in (options.scaffold_prot_file, options.scaffold_stats_file):
            check_file_exists(input_file)
        make_sure_path_exists(options.output_dir)

        ref_gene_files = self._genome_files(options.ref_genome_prot_dir,
                                            options.protein_ext)
        proteins_ok = self._check_protein_seqs(ref_gene_files)
        if not proteins_ok:
            self.logger.warning('[Warning] All files must contain amino acid sequences.')
            sys.exit()

        ref = Reference(options.cpus, options.output_dir)
        run_args = (
            options.scaffold_prot_file,
            options.scaffold_stats_file,
            ref_gene_files,
            options.db_file,
            options.evalue,
            options.per_identity,
        )
        reference_out = ref.run(*run_args)

        log('')
        log('  Results written to: ' + reference_out)

        self.time_keeper.print_time_stamp()
Example #7
0
    def phylogenetic_diversity_clade(self, options):
        """Calculate phylogenetic diversity of named groups.

        Checks the decorated tree file and delegates the calculation to
        PhylogeneticDiversity.pd_clade.
        """

        tree_file = options.decorated_tree
        check_file_exists(tree_file)

        diversity = PhylogeneticDiversity()
        diversity.pd_clade(tree_file,
                           options.output_file,
                           options.taxa_list,
                           options.rep_list)
Example #8
0
    def append(self, options):
        """Append command.

        Appends the taxonomy of each leaf to its label in the form
        ``<label>|<taxon>; <taxon>; ...`` and writes the resulting tree to
        ``options.output_tree``. The program exits with status -1 if a leaf
        has no entry in the taxonomy file.

        Parameters
        ----------
        options : argparse-like namespace
            Reads ``input_tree``, ``input_taxonomy`` and ``output_tree``.
        """

        check_file_exists(options.input_tree)
        check_file_exists(options.input_taxonomy)

        taxonomy = Taxonomy().read(options.input_taxonomy)

        tree = dendropy.Tree.get_from_path(options.input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        for leaf in tree.leaf_node_iter():
            leaf_id = leaf.taxon.label
            taxa = taxonomy.get(leaf_id)
            if taxa is None:
                # report the taxon label used for the lookup (the node's own
                # label attribute is not the identifier that was searched)
                self.logger.error('Taxonomy file does not contain an entry for %s.' % leaf_id)
                sys.exit(-1)
            leaf.taxon.label = leaf_id + '|' + '; '.join(taxa)

        tree.write_to_path(options.output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        self.logger.info('Decorated tree written to: %s' % options.output_tree)
Example #9
0
    def cluster_stats(self, options):
        """Calculate statistics for species cluster.

        Checks the cluster and genome path files, then runs ClusterStats on
        them.
        """

        cluster_file = options.cluster_file
        genome_path_file = options.genome_path_file
        check_file_exists(cluster_file)
        check_file_exists(genome_path_file)

        stats = ClusterStats(options.ani_cache_file,
                             options.cpus,
                             options.output_dir)
        stats.run(cluster_file, genome_path_file)
Example #10
0
    def scaffold_stats(self, options):
        """Scaffold statistics command.

        Produces (or reuses) a coverage file and a tetranucleotide signature
        file and then writes 'scaffold_stats.tsv' to ``options.output_dir``.

        Coverage is taken from ``options.coverage_file`` when given; otherwise
        it is computed from ``options.bam_files``. If neither is available a
        warning is logged and ``None`` is passed on as the coverage file.
        Tetranucleotide signatures are taken from ``options.tetra_file`` when
        given; otherwise they are computed from ``options.scaffold_file``.

        The program exits when self._check_nuclotide_seqs rejects the
        scaffold file or the genome files.
        """
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [RefineM - scaffold_stats] Calculating statistics for scaffolds.')
        self.logger.info('*******************************************************************************')

        check_file_exists(options.scaffold_file)

        if not self._check_nuclotide_seqs([options.scaffold_file]):
            self.logger.warning('[Warning] Scaffold file must contain nucleotide sequences.')
            sys.exit()

        genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        make_sure_path_exists(options.output_dir)

        # get coverage information
        coverage_file = options.coverage_file
        if not coverage_file:
            if options.bam_files:
                coverage = Coverage(options.cpus)
                coverage_file = os.path.join(options.output_dir, 'coverage.tsv')
                coverage.run(options.bam_files,
                             coverage_file,
                             options.cov_all_reads,
                             options.cov_min_align,
                             options.cov_max_edit_dist)
                self.logger.info('')
                self.logger.info('  Coverage profiles written to: %s' % coverage_file)
            else:
                self.logger.warning('\n  [Warning] One or more BAM files must be specified in order to calculate coverage profiles.')
                coverage_file = None

        # get tetranucleotide signatures
        tetra_file = options.tetra_file
        if not tetra_file:
            self.logger.info('')
            tetra = Tetranucleotide(options.cpus)
            tetra_file = os.path.join(options.output_dir, 'tetra.tsv')
            signatures = tetra.run(options.scaffold_file)
            tetra.write(signatures, tetra_file)
            self.logger.info('  Tetranucleotide signatures written to: %s' % tetra_file)

        # write out scaffold statistics
        stats_output = os.path.join(options.output_dir, 'scaffold_stats.tsv')
        stats = ScaffoldStats(options.cpus)
        stats.run(options.scaffold_file, genome_files, tetra_file, coverage_file, stats_output)

        self.logger.info('  Scaffold statistic written to: %s' % stats_output)

        self.time_keeper.print_time_stamp()
Example #11
0
    def arb_records(self, options):
        """Create an ARB records file from GTDB metadata.

        Only the metadata file is checked here; the remaining paths are
        handed to Arb.create_records unchanged.
        """

        metadata_file = options.metadata_file
        check_file_exists(metadata_file)

        record_writer = Arb()
        record_args = (
            metadata_file,
            options.msa_file,
            options.taxonomy_file,
            options.genome_list,
            options.output_file,
        )
        record_writer.create_records(*record_args)
Example #12
0
    def pull(self, options):
        """Pull command.

        Reads the taxonomy from ``options.input_tree``, passes each entry
        through Taxonomy().fill_missing_ranks unless
        ``options.no_rank_fill`` is set, and writes the result to
        ``options.output_file``.
        """
        check_file_exists(options.input_tree)

        taxonomy = Taxonomy().read_from_tree(options.input_tree)

        fill_ranks = not options.no_rank_fill
        if fill_ranks:
            # existing keys are reassigned in place; no keys are added
            for genome_id, taxa in taxonomy.iteritems():
                taxonomy[genome_id] = Taxonomy().fill_missing_ranks(taxa)

        Taxonomy().write(taxonomy, options.output_file)

        self.logger.info('Taxonomy strings written to: %s' % options.output_file)
Example #13
0
    def rd_ranks(self, options):
        """Calculate number of taxa for specified rd thresholds.

        Checks the input tree, makes sure the output directory is
        available, and delegates to RdRanks.run.
        """

        tree_file = options.input_tree
        out_dir = options.output_dir
        check_file_exists(tree_file)
        make_sure_path_exists(out_dir)

        rank_counter = RdRanks()
        rank_counter.run(tree_file, options.thresholds, out_dir)

        self.logger.info('Done.')
Example #14
0
    def bl_table(self, options):
        """Produce table with number of lineage for increasing mean branch lengths.

        Checks the input tree and taxon category file, then delegates to
        BranchLengthDistribution.table.
        """

        for input_file in (options.input_tree, options.taxon_category):
            check_file_exists(input_file)

        bl_dist = BranchLengthDistribution()
        table_args = (
            options.input_tree,
            options.taxon_category,
            options.step_size,
            options.output_table,
        )
        bl_dist.table(*table_args)

        self.logger.info('Done.')
Example #15
0
    def run(self, genome_files, scaffold_file, min_seq_len):
        """Identify scaffolds that are not present in any genome file.

        A scaffold is reported as unbinned when its identifier does not
        occur in any of the genome files and its length is at least
        `min_seq_len`.

        Parameters
        ----------
        genome_files : list of str
            Fasta files of genomes to process.
        scaffold_file : str
            Scaffolds binned to generate putative genomes.
        min_seq_len : int
            Ignore scaffolds shorter than the specified length.

        Returns
        -------
        dict : d[seq_id] -> seq
            Dictionary of unbinned sequences.
        """

        check_file_exists(scaffold_file)

        # get list of sequences in bins
        self.logger.info('')
        self.logger.info('  Reading binned scaffolds.')

        binned_seq_ids = set()
        total_binned_bases = 0
        for genome_file in genome_files:
            for seq_id, seq in seq_io.read_seq(genome_file):
                binned_seq_ids.add(seq_id)
                total_binned_bases += len(seq)

        self.logger.info('    Read %d (%.2f Mbp) binned scaffolds.' % (len(binned_seq_ids), float(total_binned_bases) / 1e6))

        # collect all unbinned sequences
        self.logger.info('')
        self.logger.info('  Identifying unbinned scaffolds >= %d bp.' % min_seq_len)

        unbinned_bases = 0
        unbinned_seqs = {}
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id not in binned_seq_ids and len(seq) >= min_seq_len:
                unbinned_seqs[seq_id] = seq
                unbinned_bases += len(seq)

        self.logger.info('    Identified %d (%.2f Mbp) unbinned scaffolds.' % (len(unbinned_seqs), float(unbinned_bases) / 1e6))

        # report 0% rather than dividing by zero when nothing was read
        total_seqs = len(unbinned_seqs) + len(binned_seq_ids)
        total_bases = unbinned_bases + total_binned_bases
        perc_unbinned_seqs = len(unbinned_seqs) * 100.0 / total_seqs if total_seqs else 0.0
        perc_unbinned_bases = unbinned_bases * 100.0 / total_bases if total_bases else 0.0

        self.logger.info('')
        self.logger.info('  Percentage of unbinned scaffolds: %.2f%%' % perc_unbinned_seqs)
        self.logger.info('  Percentage of unbinned bases: %.2f%%' % perc_unbinned_bases)

        return unbinned_seqs
Example #16
0
    def outgroup(self, options):
        """Reroot tree with outgroup.

        Every genome whose taxonomy contains ``options.outgroup_taxon`` is
        placed in the outgroup, which is then handed to
        RerootTree.root_with_outgroup together with the input and output
        tree paths.

        Parameters
        ----------
        options : argparse-like namespace
            Reads ``input_tree``, ``taxonomy_file``, ``outgroup_taxon`` and
            ``output_tree``.
        """

        # check the tree up front as well, matching the other tree commands,
        # so a missing tree is reported before the taxonomy is parsed
        check_file_exists(options.input_tree)
        check_file_exists(options.taxonomy_file)

        self.logger.info('Identifying genomes from the specified outgroup.')
        taxonomy = Taxonomy().read(options.taxonomy_file)
        outgroup = set()
        for genome_id, taxa in taxonomy.items():
            if options.outgroup_taxon in taxa:
                outgroup.add(genome_id)
        self.logger.info('Identifying %d genomes in the outgroup.' % len(outgroup))

        reroot = RerootTree()
        reroot.root_with_outgroup(options.input_tree, options.output_tree, outgroup)
Example #17
0
    def bl_dist(self, options):
        """Calculate distribution of branch lengths at each taxonomic rank.

        Checks the input tree, makes sure the output directory is
        available, and delegates to BranchLengthDistribution.run.
        """

        check_file_exists(options.input_tree)
        make_sure_path_exists(options.output_dir)

        bl_dist = BranchLengthDistribution()
        run_args = (
            options.input_tree,
            options.trusted_taxa_file,
            options.min_children,
            options.taxonomy_file,
            options.output_dir,
        )
        bl_dist.run(*run_args)

        self.logger.info('Done.')
Example #18
0
 def tree_tax_diff(self, options):
     """Taxonomy difference command.

     Checks both input trees, creates the output directory when it does
     not exist yet, and delegates to TaxDiff.tree_tax_diff.
     """

     for tree_file in (options.input_tree1, options.input_tree2):
         check_file_exists(tree_file)

     out_dir = options.output_dir
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)

     differ = TaxDiff()
     differ.tree_tax_diff(options.input_tree1, options.input_tree2, out_dir)

     self.logger.info('Done.')
Example #19
0
    def decorate(self, options):
        """Place internal taxonomic labels on tree.

        Checks the input tree and taxonomy file, then delegates to
        Decorate.run with the thresholds given in ``options``.
        """

        for input_file in (options.input_tree, options.taxonomy_file):
            check_file_exists(input_file)

        decorator = Decorate()
        run_args = (
            options.input_tree,
            options.taxonomy_file,
            options.trusted_taxa_file,
            options.min_children,
            options.min_support,
            options.output_tree,
        )
        decorator.run(*run_args)

        self.logger.info('Finished decorating tree.')
Example #20
0
 def tax_diff(self, options):
     """Taxonomy difference command.

     Checks both taxonomy files, creates the output directory when it
     does not exist yet, and delegates to TaxDiff.tax_diff.
     """

     for taxonomy_file in (options.tax1_file, options.tax2_file):
         check_file_exists(taxonomy_file)

     out_dir = options.output_dir
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)

     differ = TaxDiff()
     differ.tax_diff(options.tax1_file,
                     options.tax2_file,
                     options.include_user_taxa,
                     out_dir)

     self.logger.info('Done.')
Example #21
0
 def bl_decorate(self, options):
     """Decorate tree based using a mean branch length criterion.

     Checks the input tree and delegates to
     BranchLengthDistribution.decorate with the settings in ``options``.
     """

     check_file_exists(options.input_tree)

     bl_dist = BranchLengthDistribution()
     decorate_args = (
         options.input_tree,
         options.taxonomy_file,
         options.threshold,
         options.rank,
         options.retain_named_lineages,
         options.keep_labels,
         options.prune,
         options.output_tree,
     )
     bl_dist.decorate(*decorate_args)

     self.logger.info('Done.')
Example #22
0
    def jk_taxa(self, options):
        """Jackknife taxa.

        Checks the input tree and alignment, makes sure the output
        directory is available, runs JackknifeTaxa and logs the path it
        returns.
        """

        for input_file in (options.input_tree, options.msa_file):
            check_file_exists(input_file)
        make_sure_path_exists(options.output_dir)

        jackknife = JackknifeTaxa(options.cpus)
        run_args = (
            options.input_tree,
            options.msa_file,
            options.outgroup_ids,
            options.perc_taxa,
            options.num_replicates,
            options.model,
            options.output_dir,
        )
        output_tree = jackknife.run(*run_args)

        self.logger.info('Jackknifed taxa tree written to: %s' % output_tree)
Example #23
0
    def diss(self, options):
        """Calculate dissimilarity between usage profiles.

        The profile file is tab-separated with one header line; each
        following row holds a genome identifier and its numeric profile.
        Depending on ``options.full_matrix`` the result is written either as
        a pairwise table (one row per genome pair) or as a full square
        matrix.

        Parameters
        ----------
        options : argparse-like namespace
            Reads ``profile_file``, ``metric``, ``full_matrix`` and
            ``output_file``.
        """

        check_file_exists(options.profile_file)

        genome_ids = []
        profiles = []
        with open(options.profile_file) as f:
            f.readline()  # burn header

            for line in f:
                line_split = line.rstrip().split('\t')
                genome_ids.append(line_split[0])
                profiles.append([float(v) for v in line_split[1:]])

        # calculate dissimilarity between genomes (condensed form)
        d = scipy_pdist(profiles, metric=options.metric)
        num_genomes = len(genome_ids)

        # 'with' guarantees the output is flushed and closed even on error
        with open(options.output_file, 'w') as fout:
            if not options.full_matrix:
                # write out lower triangle from condensed dissimilarity
                # matrix, in pairwise fashion
                fout.write('Genome A\tGenome B\tDissimilarity\n')
                for i in range(1, num_genomes):
                    for j in range(i):
                        # position of pair (j, i), j < i, in the condensed
                        # vector; floor division keeps the index an integer
                        # regardless of the division semantics in effect
                        idx = num_genomes * j - j * (j + 1) // 2 + i - 1 - j
                        fout.write('%s\t%s\t%f\n' % (genome_ids[i], genome_ids[j], d[idx]))
            else:
                # write out full dissimilarity matrix
                ds = scipy_squareform(d)
                for genome_id in genome_ids:
                    fout.write('\t' + genome_id)
                fout.write('\n')

                for i, genome_id in enumerate(genome_ids):
                    fout.write(genome_id)
                    for j in range(num_genomes):
                        fout.write('\t%f' % ds[i, j])
                    fout.write('\n')

        self.logger.info('Dissimilarity values written to: %s' % options.output_file)
Example #24
0
    def mark_tree(self, options):
        """Mark tree command.

        Checks the input tree and delegates to MarkTree.run. The three
        ``no_*`` flags in ``options`` are negated before being passed on.
        """

        check_file_exists(options.input_tree)

        marker = MarkTree()
        run_args = (
            options.input_tree,
            options.output_tree,
            options.min_support,
            options.only_named_clades,
            options.min_length,
            not options.no_percentile,
            not options.no_relative_divergence,
            not options.no_prediction,
            options.thresholds,
        )
        marker.run(*run_args)

        self.logger.info('Marked tree written to: %s' % options.output_tree)
Example #25
0
 def tree_diff(self, options):
     """Tree diff command.

     Checks both input trees, creates the output directory when it does
     not exist yet, and delegates to TreeDiff.run.
     """

     for tree_file in (options.input_tree1, options.input_tree2):
         check_file_exists(tree_file)

     out_dir = options.output_dir
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)

     differ = TreeDiff()
     run_args = (
         options.input_tree1,
         options.input_tree2,
         out_dir,
         options.min_support,
         options.min_taxa,
         options.named_only,
     )
     differ.run(*run_args)

     self.logger.info('Done.')
Example #26
0
 def classify(self, options):
     """Classify genomes based on AAI values.

     Checks the sorted hit table, makes sure the output directory is
     available, runs Classify and logs the path it returns.
     """
     check_file_exists(options.sorted_hit_table)
     make_sure_path_exists(options.output_dir)

     classifier = Classify(options.cpus)
     run_args = (
         options.query_gene_file,
         options.target_gene_file,
         options.sorted_hit_table,
         options.evalue,
         options.per_identity,
         options.per_aln_len,
         options.num_top_targets,
         options.taxonomy_file,
         options.keep_rbhs,
         options.output_dir,
     )
     results_file = classifier.run(*run_args)

     self.logger.info('Classification results written to: %s' % results_file)
Example #27
0
    def aai(self, options):
        """AAI command.

        Runs AAICalculator on the sorted hit table and logs the output
        paths it returns. The reciprocal best hit file is only reported
        when the calculator returns one.
        """
        check_file_exists(options.sorted_hit_table)
        make_sure_path_exists(options.output_dir)

        calculator = AAICalculator(options.cpus)
        run_args = (
            options.query_gene_file,
            None,  # second positional argument is deliberately left unset
            options.sorted_hit_table,
            options.evalue,
            options.per_identity,
            options.per_aln_len,
            options.keep_rbhs,
            options.output_dir,
        )
        aai_output_file, rbh_output_file = calculator.run(*run_args)

        if rbh_output_file:
            self.logger.info('Identified reciprocal best hits written to: %s' % rbh_output_file)

        self.logger.info('AAI between genomes written to: %s' % aai_output_file)
0
 def rna_dump(self, options):
     """Dump all 5S, 16S, and 23S sequences to files.

     Checks the genomic file, makes sure the output directory is
     available, and calls RNA_Workflow.dump on a workflow created with
     the argument 1.
     """

     check_file_exists(options.genomic_file)
     make_sure_path_exists(options.output_dir)

     workflow = RNA_Workflow(1)
     dump_args = (
         options.genomic_file,
         options.gtdb_taxonomy,
         options.min_5S_len,
         options.min_16S_ar_len,
         options.min_16S_bac_len,
         options.min_23S_len,
         options.min_contig_len,
         options.include_user,
         options.genome_list,
         options.output_dir,
     )
     workflow.dump(*dump_args)

     self.logger.info('Results written to: %s' % options.output_dir)
Example #29
0
    def jk_markers(self, options):
        """Jackknife marker genes.

        The alignment file is only checked when ``options.msa_file`` is
        not the literal string 'NONE'. Runs JackknifeMarkers and logs the
        path it returns.
        """

        check_file_exists(options.input_tree)
        msa_provided = options.msa_file != 'NONE'
        if msa_provided:
            check_file_exists(options.msa_file)
        make_sure_path_exists(options.output_dir)

        jackknife = JackknifeMarkers(options.cpus)
        run_args = (
            options.input_tree,
            options.msa_file,
            options.marker_info_file,
            options.mask_file,
            options.perc_markers,
            options.num_replicates,
            options.model,
            options.jk_dir,
            options.output_dir,
        )
        output_tree = jackknife.run(*run_args)

        self.logger.info('Jackknifed marker tree written to: %s' % output_tree)
Example #30
0
    def bootstrap(self, options):
        """Bootstrap multiple sequence alignment.

        The alignment file is only checked when ``options.msa_file`` is
        not the literal string 'NONE'. Runs Bootstrap and logs the path it
        returns.
        """

        check_file_exists(options.input_tree)
        msa_provided = options.msa_file != 'NONE'
        if msa_provided:
            check_file_exists(options.msa_file)
        make_sure_path_exists(options.output_dir)

        bootstrapper = Bootstrap(options.cpus)
        run_args = (
            options.input_tree,
            options.msa_file,
            options.num_replicates,
            options.model,
            options.gamma,
            options.base_type,
            options.fraction,
            options.boot_dir,
            options.output_dir,
        )
        output_tree = bootstrapper.run(*run_args)

        self.logger.info('Bootstrapped tree written to: %s' % output_tree)
Example #31
0
    def taxonomy_files(self, options):
        """Generate taxonomy files for GTDB website.

        Checks the three input files, makes sure the output directory is
        available, and delegates to WebsiteData.taxonomy_files.
        """

        input_files = (options.metadata_file,
                       options.gtdb_sp_clusters_file,
                       options.user_gid_table)
        for input_file in input_files:
            check_file_exists(input_file)
        make_sure_path_exists(options.output_dir)

        website = WebsiteData(options.release_number, options.output_dir)
        website.taxonomy_files(*input_files)

        self.logger.info('Done.')
Example #32
0
    def sp_cluster_file(self, options):
        """Generate file indicating GTDB species clusters.

        Checks the three input files, makes sure the output directory is
        available, and delegates to WebsiteData.sp_cluster_file.
        """

        input_files = (options.metadata_file,
                       options.gtdb_sp_clusters_file,
                       options.user_gid_table)
        for input_file in input_files:
            check_file_exists(input_file)
        make_sure_path_exists(options.output_dir)

        website = WebsiteData(options.release_number, options.output_dir)
        website.sp_cluster_file(*input_files)

        self.logger.info('Done.')
Example #33
0
    def reduce(self, options):
        """Infer tree for reduced set of genes.

        Checks the homolog, gene id and taxonomy files, makes sure the
        output directory is available, and delegates to Reduce.run.
        """

        input_files = (options.homolog_file,
                       options.gene_ids,
                       options.taxonomy_file)
        for input_file in input_files:
            check_file_exists(input_file)

        make_sure_path_exists(options.output_dir)

        reducer = Reduce(options.cpus)
        run_args = input_files + (
            options.min_per_taxa,
            options.consensus,
            options.min_per_bp,
            options.use_trimAl,
            options.msa_program,
            options.tree_program,
            options.prot_model,
            options.output_dir,
        )
        reducer.run(*run_args)
Example #34
0
    def outliers(self, options):
        """Create information for identifying taxonomic outliers.

        The optional plot/trusted taxa files are only checked when given.
        The output directory is created when missing. The method logs an
        error and returns without running when '--highlight_polyphyly' is
        set but no F-measure table is supplied.
        """

        for required_file in (options.input_tree, options.taxonomy_file):
            check_file_exists(required_file)

        for optional_file in (options.plot_taxa_file, options.trusted_taxa_file):
            if optional_file:
                check_file_exists(optional_file)

        out_dir = options.output_dir
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        missing_fmeasure = options.highlight_polyphyly and not options.fmeasure_table
        if missing_fmeasure:
            self.logger.error("The '--highlight_polyphyly' flag must be used with the '--fmeasure_table' flag.")
            return

        outlier_finder = Outliers(options.dpi)
        run_args = (
            options.input_tree,
            options.taxonomy_file,
            out_dir,
            options.plot_taxa_file,
            options.plot_dist_taxa_only,
            options.plot_domain,
            options.highlight_polyphyly,
            options.highlight_taxa_file,
            options.trusted_taxa_file,
            options.fixed_root,
            options.min_children,
            options.min_support,
            options.mblet,
            options.fmeasure_table,
            options.min_fmeasure,
            options.fmeasure_mono,
            options.verbose_table,
        )
        outlier_finder.run(*run_args)

        self.logger.info('Done.')
Example #35
0
    def tree_gids(self, options):
        """Determine genome IDs for test/validation tree.

        Checks the three input files and runs TreeGIDs. A
        GenomeTreeTkError has its message printed before the program
        exits via SystemExit.
        """

        input_files = (options.qc_file,
                       options.gtdb_metadata_file,
                       options.gtdb_final_clusters)
        for input_file in input_files:
            check_file_exists(input_file)

        try:
            gid_selector = TreeGIDs()
            gid_selector.run(options.qc_file,
                             options.gtdb_metadata_file,
                             options.gtdb_final_clusters,
                             options.output_dir)
        except GenomeTreeTkError as err:
            print(err.message)
            raise SystemExit

        self.logger.info('Results written to: %s' % options.output_dir)
Example #36
0
    def dist_plot(self, options):
        """Distribution plot command.

        Checks the input tree, checks the optional plot/trusted taxa files
        only when given, and delegates to DistributionPlot.run.
        """

        check_file_exists(options.input_tree)

        for optional_file in (options.plot_taxa_file, options.trusted_taxa_file):
            if optional_file:
                check_file_exists(optional_file)

        plotter = DistributionPlot()
        run_args = (
            options.input_tree,
            options.output_prefix,
            options.plot_taxa_file,
            options.trusted_taxa_file,
            options.min_children,
            options.min_support,
        )
        plotter.run(*run_args)

        self.logger.info('Done.')
Example #37
0
 def derep_tree(self, options):
     """Dereplicate tree.

     Checks the input tree, GTDB metadata and alignment files, makes sure
     the output directory is available, and delegates to
     DereplicateTree.run.
     """

     input_files = (options.input_tree,
                    options.gtdb_metadata,
                    options.msa_file)
     for input_file in input_files:
         check_file_exists(input_file)
     make_sure_path_exists(options.output_dir)

     dereplicator = DereplicateTree()
     run_args = (
         options.input_tree,
         options.lineage_of_interest,
         options.outgroup,
         options.gtdb_metadata,
         options.taxa_to_retain,
         options.msa_file,
         options.keep_unclassified,
         options.output_dir,
     )
     dereplicator.run(*run_args)
Example #38
0
    def cluster_user(self, args):
        """Cluster User genomes to GTDB species clusters.

        Input files are passed to check_file_exists and the output directory
        to make_sure_path_exists before ClusterUser is run. A GTDB_Error is
        printed and converted into SystemExit.
        """

        for attr in ('gtdb_metadata_file',
                     'genome_path_file',
                     'final_cluster_file'):
            check_file_exists(getattr(args, attr))
        make_sure_path_exists(args.output_dir)

        try:
            clusterer = ClusterUser(args.ani_cache_file,
                                    args.cpus,
                                    args.output_dir)
            clusterer.run(args.gtdb_metadata_file,
                          args.genome_path_file,
                          args.final_cluster_file)
        except GTDB_Error as err:
            print(err.message)
            raise SystemExit

        self.logger.info('Clustering results written to: %s' % args.output_dir)
Example #39
0
    def cluster(self, options):
        """Cluster remaining genomes based on Mash distances.

        The representative genome, metadata, and Mash pairwise files are
        passed to check_file_exists before Representatives.cluster is run.
        A GenomeTreeTkError is printed and converted into SystemExit.
        """

        for attr in ('rep_genome_file', 'metadata_file', 'mash_pairwise_file'):
            check_file_exists(getattr(options, attr))

        try:
            representatives = Representatives()
            representatives.cluster(options.rep_genome_file,
                                    options.metadata_file,
                                    options.mash_pairwise_file,
                                    options.cluster_file)

            # reported inside the try block, i.e. only after a clean run
            message = ('Clustering information written to: %s' %
                       options.cluster_file)
            self.logger.info(message)

        except GenomeTreeTkError as err:
            print(err.message)
            raise SystemExit
Example #40
0
    def outliers(self, options):
        """Create information for identifying taxonomic outliers.

        The input tree is always passed to check_file_exists; the plot-taxa
        and trusted-taxa files only when supplied. The output directory is
        created if it does not already exist, then Outliers is run.
        """

        check_file_exists(options.input_tree)

        # optional inputs: only checked when the option holds a value
        for attr in ('plot_taxa_file', 'trusted_taxa_file'):
            optional_path = getattr(options, attr)
            if optional_path:
                check_file_exists(optional_path)

        out_dir_present = os.path.exists(options.output_dir)
        if not out_dir_present:
            os.makedirs(options.output_dir)

        outlier_finder = Outliers(options.dpi)
        outlier_finder.run(
            options.input_tree,
            options.taxonomy_file,
            options.output_dir,
            options.plot_taxa_file,
            options.plot_dist_taxa_only,
            options.plot_domain,
            options.trusted_taxa_file,
            options.fixed_root,
            options.min_children,
            options.min_support,
            options.verbose_table,
        )

        self.logger.info('Done.')
Example #41
0
    def blast(self, options):
        """Infer gene tree using BLAST.

        The query protein, database, and taxonomy files are passed to
        check_file_exists. The 'AUTO' protein model is rejected (error logged,
        exit status -1) unless the tree program is 'raxml'. BlastWorkflow is
        then run with the remaining options.
        """

        for attr in ('query_proteins', 'db_file', 'taxonomy_file'):
            check_file_exists(getattr(options, attr))

        # sanity check arguments
        wants_auto_model = options.prot_model == 'AUTO'
        if wants_auto_model and options.tree_program != 'raxml':
            self.logger.error(
                "The 'AUTO' protein model can only be used with RAxML.")
            sys.exit(-1)

        workflow = BlastWorkflow(options.cpus)
        workflow.run(
            options.query_proteins,
            options.db_file,
            options.custom_db_file,
            options.taxonomy_file,
            options.custom_taxonomy_file,
            options.evalue,
            options.per_identity,
            options.per_aln_len,
            options.max_matches,
            options.homology_search,
            options.min_per_taxa,
            options.consensus,
            options.min_per_bp,
            options.use_trimAl,
            options.restrict_taxon,
            options.msa_program,
            options.tree_program,
            options.prot_model,
            options.skip_rooting,
            options.output_dir,
        )
Example #42
0
    def dereplicate(self, options):
        """Select representative genomes for named species.

        The metadata, previous representative, and trusted user files are
        passed to check_file_exists before Representatives.dereplicate is
        run. A GenomeTreeTkError is printed and converted into SystemExit.
        """

        for attr in ('metadata_file', 'prev_rep_file', 'trusted_user_file'):
            check_file_exists(getattr(options, attr))

        try:
            representatives = Representatives()
            representatives.dereplicate(
                options.metadata_file,
                options.prev_rep_file,
                options.exceptions_file,
                options.trusted_user_file,
                options.max_species,
                options.min_rep_comp,
                options.max_rep_cont,
                options.min_quality,
                options.max_contigs,
                options.min_N50,
                options.max_ambiguous,
                options.max_gap_length,
                options.strict_filtering,
                options.species_derep_file,
            )
        except GenomeTreeTkError as err:
            print(err.message)
            raise SystemExit

        message = ('RefSeq representative genomes written to: %s' %
                   options.species_derep_file)
        self.logger.info(message)
Example #43
0
    def taxon_profile(self, options):
        """Run TaxonProfile over a directory of genome protein files.

        The output directory is passed to make_sure_path_exists and the
        scaffold statistics, taxonomy, and database files to
        check_file_exists. Gene files are gathered with self._genome_files;
        if self._check_protein_seqs rejects them a warning is logged and the
        program exits.
        """

        make_sure_path_exists(options.output_dir)
        for attr in ('scaffold_stats_file', 'taxonomy_file', 'db_file'):
            check_file_exists(getattr(options, attr))

        gene_files = self._genome_files(options.genome_prot_dir,
                                        options.protein_ext)
        all_amino_acid = self._check_protein_seqs(gene_files)
        if not all_amino_acid:
            self.logger.warning('All files must contain amino acid sequences.')
            sys.exit()

        # build gene profile
        profiler = TaxonProfile(options.cpus, options.output_dir)
        profiler.run(
            gene_files,
            options.scaffold_stats_file,
            options.db_file,
            options.taxonomy_file,
            options.per_to_classify,
            options.evalue,
            options.per_identity,
            options.per_aln_len,
            options.tmpdir,
        )

        self.logger.info('Results written to: %s' % options.output_dir)
Example #44
0
    def rank_res(self, options):
        """Calculate taxonomic resolution at each rank.

        Labelled internal nodes of the input tree are tallied by every rank
        prefix found in the node's taxon string and by the rank prefix of the
        last taxon in that string. Genomes in the taxonomy file whose taxon at
        a rank is just the bare rank prefix (e.g. 'p__') are tallied as having
        species-level resolution at that rank. The table of tallies is written
        to options.output_file; when options.taxa_file is set, every tallied
        taxon is also written to that file.
        """

        check_file_exists(options.input_tree)
        check_file_exists(options.taxonomy_file)

        # optional per-taxon report; closed in the finally clause so the
        # handle is released even when parsing the tree or taxonomy fails
        taxa_out = open(options.taxa_file, 'w') if options.taxa_file else None
        try:
            if taxa_out:
                taxa_out.write('Rank\tLowest Rank\tTaxon\n')

            # determine taxonomic resolution of named groups
            tree = dendropy.Tree.get_from_path(options.input_tree,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)

            rank_res = defaultdict(lambda: defaultdict(int))
            for node in tree.preorder_node_iter(lambda n: n != tree.seed_node):
                if not node.label or node.is_leaf():
                    continue

                _support, taxon_name, _auxiliary_info = parse_label(node.label)
                if not taxon_name:
                    continue

                # rank prefix (first three characters) of the last taxon
                lowest_rank = [x.strip()
                               for x in taxon_name.split(';')][-1][0:3]
                for rank_prefix in Taxonomy.rank_prefixes:
                    if rank_prefix not in taxon_name:
                        continue

                    rank_res[rank_prefix][lowest_rank] += 1
                    if taxa_out:
                        rank_prefix_name = Taxonomy.rank_labels[
                            Taxonomy.rank_index[rank_prefix]]
                        lowest_rank_name = Taxonomy.rank_labels[
                            Taxonomy.rank_index[lowest_rank]]
                        taxa_out.write('%s\t%s\t%s\n' %
                                       (rank_prefix_name, lowest_rank_name,
                                        taxon_name))

            # identify any singleton taxa which are treated as having species
            # level resolution
            with open(options.taxonomy_file) as taxonomy_in:
                for line in taxonomy_in:
                    if not line.strip():
                        continue  # tolerate blank lines

                    # drop the line terminator and surrounding whitespace of
                    # each taxon; otherwise the final rank reads 's__\n' and
                    # never compares equal to its bare prefix
                    line_split = line.rstrip('\r\n').split('\t')
                    genome_id = line_split[0]
                    taxonomy = [t.strip() for t in line_split[1].split(';')]

                    for i, rank_prefix in enumerate(Taxonomy.rank_prefixes):
                        if taxonomy[i] != rank_prefix:
                            continue

                        # this taxa is undefined at the specified rank so
                        # must be the sole representative; e.g., a p__
                        # indicates a taxon that represents a novel phyla
                        rank_res[rank_prefix]['s__'] += 1
                        if taxa_out:
                            rank_prefix_name = Taxonomy.rank_labels[
                                Taxonomy.rank_index[rank_prefix]]
                            taxa_out.write('%s\t%s\t%s (%s)\n' %
                                           (rank_prefix_name, 'species',
                                            taxonomy[i], genome_id))
        finally:
            if taxa_out:
                taxa_out.close()

        # write out results; the first rank prefix/label is excluded from
        # both the rows and the columns of the table
        with open(options.output_file, 'w') as fout:
            fout.write('Category')
            for rank in Taxonomy.rank_labels[1:]:
                fout.write('\t' + rank)
            fout.write('\n')

            for i, rank_prefix in enumerate(Taxonomy.rank_prefixes[1:]):
                fout.write(Taxonomy.rank_labels[i + 1])

                for j, r in enumerate(Taxonomy.rank_prefixes[1:]):
                    if i >= j:
                        fout.write('\t' + str(rank_res[r].get(rank_prefix, 0)))
                    else:
                        fout.write('\t-')
                fout.write('\n')

        self.logger.info('Done.')
Example #45
0
 def check_unique_strains(self, options):
     """Run Tools.parse_ncbi_names_and_nodes on the supplied files.

     The node, name, and metadata files are passed to check_file_exists
     (in that order) before the parser is invoked.
     """
     for attr in ('node', 'name', 'metadata_file'):
         check_file_exists(getattr(options, attr))

     tools = Tools()
     tools.parse_ncbi_names_and_nodes(options.name,
                                      options.node,
                                      options.metadata_file,
                                      options.output_file)
Example #46
0
    def annoted_features(self, options):
        """Make annotated feature abundance matrices.

        Reads a two-column, tab-separated feature-to-annotation map and a
        two-column, tab-separated list of annotation ids. For each file named
        in DefaultValues.FEATURES_ABUNDANCE_FILES (looked up in
        options.features_dir), the per-sample values found from the fourth
        column onward are summed per annotation and written to the matching
        file name in DefaultValues.ANNOTATE_ABUNDANCE_FILES. A
        'hypothetical protein' row is always appended to the annotation list.
        Annotations whose values sum to zero are omitted when options.removed
        is set.
        """

        # annotation ids already warned about (set: O(1) membership test)
        missing = set()

        features2annotation = {}
        with open(options.features_annotation) as f:
            for line in f:
                features_id, annotation = line.rstrip().split('\t')
                features2annotation[features_id] = annotation

        annotation_id_list = []
        with open(options.annotation_description) as f:
            for line in f:
                # the description column is required by the format but is
                # not used when building the matrices
                annotation_id, _description = line.rstrip().split('\t')
                annotation_id_list.append(annotation_id)

        annotation_id_list.append('hypothetical protein')

        check_dir_exists(options.features_dir)
        input_matrices = DefaultValues.FEATURES_ABUNDANCE_FILES
        output_matrices = DefaultValues.ANNOTATE_ABUNDANCE_FILES

        for index, matrix_name in enumerate(input_matrices):
            input_matrix = os.path.join(options.features_dir, matrix_name)
            # take the labels from the file name only, so underscores in
            # features_dir cannot shift or break them
            count_type, abundance_type = os.path.basename(
                input_matrix).split('_')[1:3]
            check_file_exists(input_matrix)

            # counts are rebuilt for every matrix so that samples from a
            # previously processed matrix cannot leak into this one
            samples = []
            counts = dict((annotation_id, {})
                          for annotation_id in annotation_id_list)
            header_seen = False

            with open(input_matrix) as f:
                for line in f:
                    line_list = line.rstrip().split('\t')
                    if not header_seen:
                        # first line is the header; sample names start at
                        # the fourth column
                        header_seen = True
                        samples = line_list[3:]
                        for annotation_id in annotation_id_list:
                            counts[annotation_id] = dict.fromkeys(samples, 0)
                        continue

                    annotation_id = features2annotation[line_list[0]]
                    if annotation_id not in counts:
                        if annotation_id not in missing:
                            self.logger.warning(
                                "'%s' not present in %s" %
                                (annotation_id,
                                 options.annotation_description))
                            missing.add(annotation_id)
                        continue

                    annotation_counts = counts[annotation_id]
                    for column, sample in enumerate(samples, 3):
                        annotation_counts[sample] += float(line_list[column])

            output_matrix = os.path.join(options.features_dir,
                                         output_matrices[index])
            self.logger.info('Print %s %s abundance matrix in "%s"' %
                             (count_type, abundance_type, output_matrix))

            # the with-block guarantees the matrix is flushed and closed
            with open(output_matrix, "w") as output_handle:
                output_handle.write('\t'.join(['Features'] + samples) + '\n')
                for annotation in annotation_id_list:
                    # emit values in header order so that every column lines
                    # up with its sample name
                    values = [counts[annotation][s] for s in samples]
                    if options.removed and sum(values) == 0:
                        continue
                    output_handle.write(
                        '\t'.join([annotation] + [str(v) for v in values]) +
                        '\n')

        self.logger.info('Printing matrices done')
Example #47
0
    def compare_red(self, options):
        """Compare RED values of taxa calculated over different trees.

        Reads two tab-separated RED tables (taxon in the first column, lineage
        in the second, RED value in the third; the first line is skipped as a
        header) and a dictionary of per-rank median RED values taken from the
        first line of options.red_dict2. One row per taxon is written to
        options.output_table with both RED values, their difference, and
        whether the second RED value lies closer to the median of another
        rank. Lineages are taken from the first table only. The last column
        is 'NA' for taxa missing from the second table and for domain
        ('d__') taxa.
        """

        check_file_exists(options.red_table1)
        check_file_exists(options.red_table2)
        check_file_exists(options.red_dict2)

        # NOTE(review): eval() executes whatever expression is on the first
        # line of red_dict2, so this file must be trusted. If it always holds
        # a plain dict literal, ast.literal_eval would be a safer choice --
        # confirm against the code that writes the file.
        with open(options.red_dict2) as f:
            median_reds = eval(f.readline())

        red1 = {}
        red2 = {}
        lineage = {}
        for red_values, red_file in [(red1, options.red_table1),
                                     (red2, options.red_table2)]:
            with open(red_file) as f:
                f.readline()  # skip header line

                for line in f:
                    line_split = line.strip().split('\t')
                    taxon = line_split[0]
                    red_values[taxon] = float(line_split[2])

                    # identity test: an equality test would also fire while
                    # reading the second table whenever its contents so far
                    # happen to equal the first table
                    if red_values is red1:
                        lineage[taxon] = line_split[1]

        red1_label = os.path.splitext(os.path.basename(options.red_table1))[0]
        red2_label = os.path.splitext(os.path.basename(options.red_table2))[0]

        with open(options.output_table, 'w') as fout:
            fout.write(
                'Taxon\tLineage\t%s\t%s\tDifference\tAbs. Difference\tChanged rank\n'
                % (red1_label, red2_label))

            all_taxa = set(red1).union(red2)
            if options.viral:
                sorted_taxa = sort_viral_taxa(all_taxa)
            else:
                sorted_taxa = Taxonomy().sort_taxa(all_taxa)

            for taxon in sorted_taxa:
                r1 = red1.get(taxon)
                r2 = red2.get(taxon)

                if r2 is None:
                    # taxon only in the first table: nothing to compare
                    fout.write('%s\t%s\t%.3f\t%s\t%s\t%s\t%s\n' %
                               (taxon, lineage[taxon], r1, 'NA', 'NA', 'NA',
                                'NA'))
                    continue

                if r1 is None:
                    fout.write('%s\t%s\t%s\t%.3f\t%s\t%s' %
                               (taxon, 'NA', 'NA', r2, 'NA', 'NA'))
                else:
                    fout.write(
                        '%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f' %
                        (taxon, lineage[taxon], r1, r2, r1 - r2,
                         abs(r1 - r2)))

                rank_prefix = taxon[0:3]
                if rank_prefix == 'd__':
                    # domains are not evaluated for a rank change, but the
                    # row must still be completed and terminated
                    fout.write('\tNA\n')
                    continue

                if options.viral:
                    rank_label = VIRAL_RANK_LABELS[VIRAL_RANK_PREFIXES.index(
                        rank_prefix)]
                else:
                    rank_label = Taxonomy.rank_labels[
                        Taxonomy.rank_prefixes.index(rank_prefix)]
                rank_median = median_reds[rank_label]

                # only look for a closer rank when the RED value is more than
                # 0.1 away from the median of the taxon's own rank
                closest_rank = rank_label
                closest_dist = 1e6
                if r2 < rank_median - 0.1 or r2 > rank_median + 0.1:
                    for rank, median_red in median_reds.items():
                        dist = abs(r2 - median_red)
                        if dist < closest_dist:
                            closest_dist = dist
                            closest_rank = rank

                if rank_label != closest_rank:
                    fout.write('\tTrue (%s: %.3f)' %
                               (closest_rank, closest_dist))
                else:
                    fout.write('\tFalse')
                fout.write('\n')
Example #48
0
    def u_synonyms(self, args):
        """Determine synonyms for validly or effectively published species.

        Each input file is passed to check_file_exists and the output
        directory to make_sure_path_exists before UpdateSynonyms is run.
        """

        # input files, in the order they are checked and handed to run()
        input_attrs = (
            'gtdb_clusters_file',
            'cur_gtdb_metadata_file',
            'uba_genome_paths',
            'qc_passed_file',
            'ncbi_misclassified_file',
            'ncbi_genbank_assembly_file',
            'untrustworthy_type_file',
            'ani_af_rep_vs_nonrep',
            'gtdb_type_strains_ledger',
            'sp_priority_ledger',
            'genus_priority_ledger',
            'dsmz_bacnames_file',
        )
        for attr in input_attrs:
            check_file_exists(getattr(args, attr))
        make_sure_path_exists(args.output_dir)

        updater = UpdateSynonyms(args.output_dir)
        updater.run(*[getattr(args, attr) for attr in input_attrs])

        self.logger.info('Done.')
Example #49
0
    def u_ncbi_erroneous(self, args):
        """Identify genomes with erroneous NCBI species assignments.

        Each input file is passed to check_file_exists and the output
        directory to make_sure_path_exists before UpdateErroneousNCBI is run.
        """

        # input files, in the order they are checked and handed to run()
        input_attrs = (
            'gtdb_clusters_file',
            'cur_gtdb_metadata_file',
            'cur_genomic_path_file',
            'uba_genome_paths',
            'qc_passed_file',
            'ncbi_genbank_assembly_file',
            'untrustworthy_type_file',
            'gtdb_type_strains_ledger',
            'sp_priority_ledger',
            'genus_priority_ledger',
            'dsmz_bacnames_file',
        )
        for attr in input_attrs:
            check_file_exists(getattr(args, attr))
        make_sure_path_exists(args.output_dir)

        updater = UpdateErroneousNCBI(args.ani_ncbi_erroneous,
                                      args.ani_cache_file,
                                      args.cpus,
                                      args.output_dir)
        updater.run(*[getattr(args, attr) for attr in input_attrs])

        self.logger.info('Done.')
Example #50
0
    def u_genus_names(self, args):
        """Update genus names as a precursor for establishing binomial species names.

        Each input file is passed to check_file_exists and the output
        directory to make_sure_path_exists before UpdateGenusNames is run.
        """

        # input files, in the order they are checked and handed to run()
        input_attrs = (
            'gtdb_clusters_file',
            'prev_gtdb_metadata_file',
            'cur_gtdb_metadata_file',
            'uba_genome_paths',
            'qc_passed_file',
            'gtdbtk_classify_file',
            'ncbi_genbank_assembly_file',
            'untrustworthy_type_file',
            'gtdb_type_strains_ledger',
            'sp_priority_ledger',
            'gtdb_taxa_updates_ledger',
            'dsmz_bacnames_file',
        )
        for attr in input_attrs:
            check_file_exists(getattr(args, attr))
        make_sure_path_exists(args.output_dir)

        updater = UpdateGenusNames(args.output_dir)
        updater.run(*[getattr(args, attr) for attr in input_attrs])

        self.logger.info('Done.')
Example #51
0
    def select_type_genomes(self, args):
        """Select representative genomes for named species.

        The listed input files are passed to check_file_exists and the output
        directory to make_sure_path_exists before SelectTypeGenomes is run
        (args.ltp_blast_file is handed to the run without being checked
        here). A GTDB_Error is printed and converted into SystemExit.
        """

        checked_attrs = (
            'qc_file',
            'gtdb_metadata_file',
            'genome_path_file',
            'prev_rep_file',
            'ncbi_refseq_assembly_file',
            'ncbi_genbank_assembly_file',
            'gtdb_domain_report',
            'species_exception_file',
            'gtdb_type_genome_file',
        )
        for attr in checked_attrs:
            check_file_exists(getattr(args, attr))
        make_sure_path_exists(args.output_dir)

        try:
            selector = SelectTypeGenomes(args.ani_cache_file,
                                         args.cpus,
                                         args.output_dir)
            selector.run(
                args.qc_file,
                args.gtdb_metadata_file,
                args.ltp_blast_file,
                args.genome_path_file,
                args.prev_rep_file,
                args.ncbi_refseq_assembly_file,
                args.ncbi_genbank_assembly_file,
                args.gtdb_domain_report,
                args.species_exception_file,
                args.gtdb_type_genome_file,
            )
        except GTDB_Error as err:
            print(err.message)
            raise SystemExit

        self.logger.info('GTDB type genomes written to: %s' % args.output_dir)
Example #52
0
    def pmc_check_type_strains(self, args):
        """Check for agreement between GTDB species and genomes assembled from type strain of species.

        Each input file is passed to check_file_exists and the output
        directory to make_sure_path_exists before PMC_CheckTypeStrains is run.
        """

        # input files, in the order they are checked and handed to run()
        input_attrs = (
            'manual_taxonomy',
            'cur_gtdb_metadata_file',
            'uba_genome_paths',
            'qc_passed_file',
            'ncbi_genbank_assembly_file',
            'untrustworthy_type_file',
            'synonym_file',
            'gtdb_type_strains_ledger',
            'sp_priority_ledger',
            'genus_priority_ledger',
            'dsmz_bacnames_file',
        )
        for attr in input_attrs:
            check_file_exists(getattr(args, attr))
        make_sure_path_exists(args.output_dir)

        checker = PMC_CheckTypeStrains(args.output_dir)
        checker.run(*[getattr(args, attr) for attr in input_attrs])

        self.logger.info('Done.')
Example #53
0
    def u_species_init(self, args):
        """Produce initial best guess at GTDB species clusters.

        Each input file is passed to check_file_exists and the output
        directory to make_sure_path_exists before UpdateSpeciesInit is run.
        """

        # input files, in the order they are checked and handed to run()
        input_attrs = (
            'gtdb_clusters_file',
            'prev_gtdb_metadata_file',
            'prev_genomic_path_file',
            'cur_gtdb_metadata_file',
            'cur_genomic_path_file',
            'uba_genome_paths',
            'qc_passed_file',
            'gtdbtk_classify_file',
            'ncbi_genbank_assembly_file',
            'untrustworthy_type_file',
            'synonym_file',
            'gtdb_type_strains_ledger',
            'sp_priority_ledger',
            'genus_priority_ledger',
            'gtdb_taxa_updates_ledger',
            'dsmz_bacnames_file',
        )
        for attr in input_attrs:
            check_file_exists(getattr(args, attr))
        make_sure_path_exists(args.output_dir)

        updater = UpdateSpeciesInit(args.ani_cache_file,
                                    args.cpus,
                                    args.output_dir)
        updater.run(*[getattr(args, attr) for attr in input_attrs])

        self.logger.info('Done.')
Example #54
0
    def pmc_validate(self, args):
        """Validate final species names.

        Each input file is passed to check_file_exists and the output
        directory to make_sure_path_exists before PMC_Validation is run with
        those files followed by args.skip_genus_checks.
        """

        # input files, in the order they are checked and handed to run()
        input_attrs = (
            'final_taxonomy',
            'final_scaled_tree',
            'manual_sp_names',
            'pmc_custom_species',
            'gtdb_clusters_file',
            'prev_gtdb_metadata_file',
            'cur_gtdb_metadata_file',
            'uba_genome_paths',
            'qc_passed_file',
            'ncbi_misclassified_file',
            'ncbi_genbank_assembly_file',
            'untrustworthy_type_file',
            'synonym_file',
            'updated_species_reps',
            'gtdb_type_strains_ledger',
            'species_classification_ledger',
            'sp_priority_ledger',
            'genus_priority_ledger',
            'specific_epithet_ledger',
            'dsmz_bacnames_file',
            'ground_truth_test_cases',
        )
        for attr in input_attrs:
            check_file_exists(getattr(args, attr))
        make_sure_path_exists(args.output_dir)

        validator = PMC_Validation(args.output_dir)
        run_args = [getattr(args, attr) for attr in input_attrs]
        run_args.append(args.skip_genus_checks)
        validator.run(*run_args)

        self.logger.info('Done.')
Example #55
0
    def u_summary_stats(self, args):
        """Summary statistics indicating changes to GTDB species clusters.

        Each input file is passed to check_file_exists and the output
        directory to make_sure_path_exists before UpdateSummaryStats is run.
        """

        # input files, in the order they are checked and handed to run()
        input_attrs = (
            'updated_sp_rep_file',
            'gtdb_clusters_file',
            'prev_gtdb_metadata_file',
            'cur_gtdb_metadata_file',
            'uba_genome_paths',
            'qc_passed_file',
            'gtdbtk_classify_file',
            'ncbi_genbank_assembly_file',
            'untrustworthy_type_file',
            'synonym_file',
            'gtdb_type_strains_ledger',
        )
        for attr in input_attrs:
            check_file_exists(getattr(args, attr))
        make_sure_path_exists(args.output_dir)

        summarizer = UpdateSummaryStats(args.output_dir)
        summarizer.run(*[getattr(args, attr) for attr in input_attrs])

        self.logger.info('Done.')
Example #56
0
    def rep_compare(self, args):
        """Compare current and previous representatives.

        Both metadata files are read with csv.reader; the first row is the
        header and must contain 'gtdb_representative' and 'gtdb_taxonomy'
        columns, and the first column of each data row is the genome ID.
        A genome counts as a representative when its 'gtdb_representative'
        value is 't'. Rows without a GTDB taxonomy contribute their genome ID
        to the set of current genomes but are otherwise ignored. Species and
        genus are read from the seventh and sixth ';'-separated taxa; bare
        's__' / 'g__' values are not counted. A summary of the differences is
        printed to standard output.
        """

        check_file_exists(args.cur_metadata_file)
        check_file_exists(args.prev_metadata_file)

        # get representatives in current taxonomy
        cur_gids = set()
        cur_species = set()
        cur_genera = set()
        cur_reps_taxa = {}
        cur_rep_species = set()
        cur_rep_genera = set()
        with open(args.cur_metadata_file) as f:
            header = True
            for row in csv.reader(f):
                if header:
                    header = False
                    gtdb_rep_index = row.index('gtdb_representative')
                    gtdb_taxonomy_index = row.index('gtdb_taxonomy')
                    continue

                gid = row[0]
                cur_gids.add(gid)

                gtdb_taxonomy = row[gtdb_taxonomy_index]
                if not gtdb_taxonomy:
                    # without a taxonomy there are no taxa to record; do not
                    # fall through and reuse the taxa of an earlier row
                    continue

                gtdb_taxa = [t.strip() for t in gtdb_taxonomy.split(';')]
                species = gtdb_taxa[6]
                genus = gtdb_taxa[5]
                is_rep = row[gtdb_rep_index] == 't'

                if is_rep:
                    cur_reps_taxa[gid] = gtdb_taxa

                if species != 's__':
                    cur_species.add(species)
                    if is_rep:
                        cur_rep_species.add(species)

                if genus != 'g__':
                    cur_genera.add(genus)
                    if is_rep:
                        cur_rep_genera.add(genus)

        # get representatives in previous taxonomy
        prev_reps_taxa = {}
        prev_rep_species = set()
        prev_rep_genera = set()
        with open(args.prev_metadata_file) as f:
            header = True
            for row in csv.reader(f):
                if header:
                    header = False
                    gtdb_rep_index = row.index('gtdb_representative')
                    gtdb_taxonomy_index = row.index('gtdb_taxonomy')
                    continue

                if row[gtdb_rep_index] != 't':
                    continue

                gtdb_taxonomy = row[gtdb_taxonomy_index]
                if not gtdb_taxonomy:
                    continue

                gid = row[0]
                gtdb_taxa = [t.strip() for t in gtdb_taxonomy.split(';')]
                prev_reps_taxa[gid] = gtdb_taxa

                if gtdb_taxa[6] != 's__':
                    prev_rep_species.add(gtdb_taxa[6])

                if gtdb_taxa[5] != 'g__':
                    prev_rep_genera.add(gtdb_taxa[5])

        # summarize differences
        print('No. current representatives: %d' % len(cur_reps_taxa))
        print('No. previous representatives: %d' % len(prev_reps_taxa))

        print('')
        print('No. current species with representatives: %d' %
              len(cur_rep_species))
        print('No. previous species with representatives: %d' %
              len(prev_rep_species))

        print('')
        print('No. new representatives: %d' %
              len(set(cur_reps_taxa) - set(prev_reps_taxa)))
        print('No. retired representatives: %d' %
              len(set(prev_reps_taxa) - set(cur_reps_taxa)))

        print('')
        print('No. new species with representative: %d' %
              len(cur_rep_species - prev_rep_species))
        print('No. new genera with representative: %d' %
              len(cur_rep_genera - prev_rep_genera))

        # species still present in the current taxonomy that had a
        # representative previously but have none now
        print('')
        missing_sp_reps = prev_rep_species.intersection(
            cur_species) - cur_rep_species
        print('No. species that no longer have a representative: %d' %
              len(missing_sp_reps))
        for sp in missing_sp_reps:
            print('  ' + sp)

        print('')
        missing_genera_reps = prev_rep_genera.intersection(
            cur_genera) - cur_rep_genera
        print('No. genera that no longer have a representative: %d' %
              len(missing_genera_reps))
        for g in missing_genera_reps:
            print('  ' + g)

        # previous representatives that are still in the current genome set
        # but are no longer representatives
        print('')
        deprecated_reps = set(prev_reps_taxa).intersection(cur_gids) - set(
            cur_reps_taxa)
        print('No. deprecated previous representatives: %d' %
              len(deprecated_reps))
Example #57
0
    def u_cluster_named_reps(self, args):
        """Cluster genomes to the selected GTDB representatives.

        Checks that every required input file exists, makes sure the output
        directory is present, and then delegates the work to
        UpdateClusterNamedReps.
        """

        # listed in the positional order expected by UpdateClusterNamedReps.run()
        input_attrs = (
            'named_rep_file',
            'cur_gtdb_metadata_file',
            'cur_genomic_path_file',
            'uba_genome_paths',
            'qc_passed_file',
            'ncbi_genbank_assembly_file',
            'untrustworthy_type_file',
            'rep_mash_sketch_file',
            'rep_ani_file',
            'gtdb_type_strains_ledger',
        )

        # validate each input as soon as it is read from the arguments
        input_files = []
        for attr in input_attrs:
            path = getattr(args, attr)
            check_file_exists(path)
            input_files.append(path)

        make_sure_path_exists(args.output_dir)

        clusterer = UpdateClusterNamedReps(args.ani_sp,
                                           args.af_sp,
                                           args.ani_cache_file,
                                           args.cpus,
                                           args.output_dir)
        clusterer.run(*input_files)

        self.logger.info('Done.')
Example #58
0
    def u_cluster_de_novo(self, args):
        """Infer de novo species clusters and representatives for remaining genomes.

        Checks that every required input file exists, makes sure the output
        directory is present, and then delegates the work to
        UpdateClusterDeNovo.
        """

        # listed in the positional order expected by UpdateClusterDeNovo.run()
        input_attrs = (
            'named_cluster_file',
            'cur_gtdb_metadata_file',
            'cur_genomic_path_file',
            'uba_genome_paths',
            'qc_passed_file',
            'gtdbtk_classify_file',
            'ncbi_genbank_assembly_file',
            'untrustworthy_type_file',
            'ani_af_rep_vs_nonrep',
            'gtdb_type_strains_ledger',
        )

        # validate each input as soon as it is read from the arguments
        input_files = []
        for attr in input_attrs:
            path = getattr(args, attr)
            check_file_exists(path)
            input_files.append(path)

        make_sure_path_exists(args.output_dir)

        clusterer = UpdateClusterDeNovo(args.ani_sp,
                                        args.af_sp,
                                        args.ani_cache_file,
                                        args.cpus,
                                        args.output_dir)
        clusterer.run(*input_files)

        self.logger.info('Done.')
Example #59
0
    def pmc_species_names(self, args):
        """Establish final species names based on manual curation.

        Checks that every required input file exists, makes sure the output
        directory is present, and then delegates the work to PMC_SpeciesNames.
        """

        # listed in the positional order expected by PMC_SpeciesNames.run()
        input_attrs = (
            'manual_taxonomy',
            'manual_sp_names',
            'pmc_custom_species',
            'gtdb_clusters_file',
            'prev_gtdb_metadata_file',
            'cur_gtdb_metadata_file',
            'uba_genome_paths',
            'qc_passed_file',
            'ncbi_misclassified_file',
            'ncbi_genbank_assembly_file',
            'untrustworthy_type_file',
            'synonym_file',
            'updated_species_reps',
            'gtdb_type_strains_ledger',
            'species_classification_ledger',
            'sp_priority_ledger',
            'genus_priority_ledger',
            'specific_epithet_ledger',
            'dsmz_bacnames_file',
        )

        # validate each input as soon as it is read from the arguments
        input_files = []
        for attr in input_attrs:
            path = getattr(args, attr)
            check_file_exists(path)
            input_files.append(path)

        make_sure_path_exists(args.output_dir)

        namer = PMC_SpeciesNames(args.output_dir)
        namer.run(*input_files)

        self.logger.info('Done.')
Example #60
0
    def propagate(self, options):
        """Propagate labels to all genomes in a cluster.

        Reads a taxonomy that is defined for representative genomes only and
        writes a taxonomy file in which every genome clustered with a
        representative carries that representative's taxonomy string.

        Attributes read from `options`:
            input_taxonomy: taxonomy file covering the representatives.
            metadata_file: GTDB metadata file providing the
                'gtdb_representative' and 'gtdb_clustered_genomes' fields.
            uba_mapping_file: optional two-column, tab-separated file mapping
                genome IDs to replacement IDs; lines that do not have exactly
                two columns are ignored.
            output_taxonomy: path of the tab-separated output file
                (genome ID, taxonomy string).

        Terminates via sys.exit(-1) if a representative in the metadata is
        missing from the input taxonomy, or if the input taxonomy contains a
        genome that is not a representative.
        """

        check_file_exists(options.input_taxonomy)
        check_file_exists(options.metadata_file)

        user_to_uba = {}
        if options.uba_mapping_file:
            self.logger.info('Parsing genome ID mapping file.')
            with open(options.uba_mapping_file) as f:
                for line in f:
                    tokens = line.strip().split('\t')
                    if len(tokens) == 2:
                        user_to_uba[tokens[0]] = tokens[1]
            self.logger.info(' - found mappings for {:,} genomes.'.format(
                len(user_to_uba)))

        # get representative genome information
        rep_metadata = read_gtdb_metadata(
            options.metadata_file,
            ['gtdb_representative', 'gtdb_clustered_genomes'])

        # key the metadata by canonical genome ID ...
        rep_metadata = {
            canonical_gid(gid): values
            for gid, values in rep_metadata.items()
        }

        # ... and then apply the optional ID mapping on top of that
        rep_metadata = {
            user_to_uba.get(gid, gid): values
            for gid, values in rep_metadata.items()
        }

        explict_tax = Taxonomy().read(options.input_taxonomy)

        self.logger.info(f' - identified {len(rep_metadata):,} genomes')

        # sanity check all representatives have a taxonomy string
        rep_count = 0
        for gid, values in rep_metadata.items():
            is_rep_genome, _clustered_genomes = values
            if is_rep_genome:
                rep_count += 1
                if gid not in explict_tax:
                    self.logger.error(
                        'Expected to find {} in input taxonomy as it is a GTDB representative.'
                        .format(gid))
                    sys.exit(-1)

        self.logger.info(
            'Identified {:,} representatives in metadata file and {:,} genomes in input taxonomy file.'
            .format(rep_count, len(explict_tax)))

        # propagate taxonomy to genomes clustered with each representative;
        # the context manager guarantees the output file is flushed and closed
        # on normal completion as well as on the sys.exit() path below
        with open(options.output_taxonomy, 'w') as fout:
            for rid, taxon_list in explict_tax.items():
                taxonomy_str = ';'.join(taxon_list)

                # normalize the taxonomy ID the same way the metadata keys were
                rid = canonical_gid(rid)
                rid = user_to_uba.get(rid, rid)

                # NOTE: raises KeyError if the taxonomy contains a genome that
                # is absent from the metadata file
                is_rep_genome, clustered_genomes = rep_metadata[rid]
                if is_rep_genome:
                    # assign taxonomy to representative and all genomes in the cluster
                    fout.write('{}\t{}\n'.format(rid, taxonomy_str))
                    for cid in [
                            gid.strip() for gid in clustered_genomes.split(';')
                    ]:
                        cid = canonical_gid(cid)
                        cid = user_to_uba.get(cid, cid)
                        if cid != rid:
                            if cid in rep_metadata:
                                fout.write('{}\t{}\n'.format(
                                    cid, taxonomy_str))
                            else:
                                self.logger.warning(
                                    'Skipping {} as it is not in GTDB metadata file.'
                                    .format(cid))
                else:
                    self.logger.error(
                        'Did not expected to find {} in input taxonomy as it is not a GTDB representative.'
                        .format(rid))
                    sys.exit(-1)

        self.logger.info('Taxonomy written to: {}'.format(
            options.output_taxonomy))