Ejemplo n.º 1
0
    def taxon_profile(self, options):
        """Run the taxon profile command.

        Checks the inputs named in ``options``, collects the gene files found
        in ``options.genome_prot_dir`` and runs ``TaxonProfile`` over them.

        Parameters
        ----------
        options : object
            Parsed command-line options. The attributes read here are
            ``output_dir``, ``scaffold_stats_file``, ``taxonomy_file``,
            ``db_file``, ``genome_prot_dir``, ``protein_ext``, ``cpus``,
            ``per_to_classify``, ``evalue``, ``per_identity``,
            ``per_aln_len`` and ``tmpdir``.

        Raises
        ------
        SystemExit
            With exit status 1 if the gene files fail the amino acid
            sequence check.
        """

        make_sure_path_exists(options.output_dir)
        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.taxonomy_file)
        check_file_exists(options.db_file)

        gene_files = self._genome_files(options.genome_prot_dir,
                                        options.protein_ext)
        if not self._check_protein_seqs(gene_files):
            self.logger.warning('All files must contain amino acid sequences.')
            # Exit with a non-zero status so calling shells and pipelines can
            # tell that the command failed; a bare sys.exit() reports success.
            sys.exit(1)

        # build taxon profile
        taxon_profile = TaxonProfile(options.cpus, options.output_dir)
        taxon_profile.run(gene_files, options.scaffold_stats_file,
                          options.db_file, options.taxonomy_file,
                          options.per_to_classify, options.evalue,
                          options.per_identity, options.per_aln_len,
                          options.tmpdir)

        self.logger.info('Results written to: %s' % options.output_dir)
Ejemplo n.º 2
0
    def taxon_filter(self, options):
        """Run the taxon filter command.

        Creates a ``TaxonProfile`` for ``options.taxon_profile_dir`` and calls
        its ``filter`` method with the filtering criteria held in ``options``.

        Parameters
        ----------
        options : object
            Parsed command-line options. The attributes read here are
            ``taxon_profile_dir``, ``cpus``, ``consensus_taxon``,
            ``trusted_scaffold``, ``common_taxa``, ``congruent_scaffold``,
            ``min_classified_per``, ``min_classified``,
            ``consensus_scaffold`` and ``output_file``.
        """

        make_sure_path_exists(options.taxon_profile_dir)

        # operate on the existing taxon profile results in this directory
        # NOTE(review): presumably the directory was populated by a previous
        # taxon profile run -- confirm against TaxonProfile.
        taxon_profile = TaxonProfile(options.cpus, options.taxon_profile_dir)

        # An unreachable `if False:` branch that called filter() with a
        # different argument list used to sit here; this is the only call
        # that was ever executed.
        taxon_profile.filter(
            options.consensus_taxon, options.trusted_scaffold,
            options.common_taxa, options.congruent_scaffold,
            options.min_classified_per, options.min_classified,
            options.consensus_scaffold, options.output_file)

        self.logger.info('Results written to: %s' % options.output_file)
Ejemplo n.º 3
0
Archivo: ssu.py Proyecto: wwood/RefineM
    def erroneous(self, ssu_hits, ssu_classifications, taxon_profile_dir,
                  common_taxon_threshold, ssu_min_length, ssu_domain,
                  ssu_phylum, ssu_class, ssu_order, ssu_family, ssu_genus,
                  output_dir):
        """Identify scaffolds with SSU genes that have divergent taxonomic classification.

        For every SSU hit that is long enough and has a classification, the
        SSU taxonomy is compared rank by rank (domain to genus) against the
        common taxa of the genome. The first rank at which the SSU taxon is
        not among the common taxa marks the scaffold as erroneous, and a row
        for it is written to ``ssu_erroneous.tsv`` in ``output_dir``.

        Parameters
        ----------
        ssu_hits : d[genome_id][scaffold_id] -> hit information
            Hits to SSU genes in each genome. Each hit is a 7-field record
            of which the SSU length (5th field) and the scaffold length
            (7th field) are used here.
        ssu_classifications : d[genome_id][scaffold_id] -> classification information
            Taxonomic classifications for SSU genes. Each record has 5
            fields; the first is a ';'-separated taxonomy string and the
            last is the percent identity.
        taxon_profile_dir : str
            Directory passed to TaxonProfile to obtain the common taxa and
            the taxonomy of each genome.
        common_taxon_threshold : float
            Threshold passed to TaxonProfile.common_taxa.
        ssu_min_length : int
            SSU hits shorter than this are ignored.
        ssu_domain : float
        ssu_phylum : float
        ssu_class : float
        ssu_order : float
        ssu_family : float
        ssu_genus : float
            Minimum percent identity of the SSU classification required to
            evaluate the corresponding rank; evaluation stops at the first
            rank whose threshold is not met.
        output_dir : str
            Directory for output files.
        """

        header = 'Scaffold id\tGenome id\tGenome classification\tIncongruent common taxa set'
        header += '\tNo. 16S in Genome'
        header += '\t16S Classification\t16S length\t16S e-value\t16S alignment length\t16S percent identity'
        header += '\tScaffold length (bp)\n'

        # percent identity thresholds, indexed by rank (0 = domain ... 5 = genus)
        rank_thresholds = [ssu_domain, ssu_phylum, ssu_class,
                           ssu_order, ssu_family, ssu_genus]

        # the context manager guarantees the output file is closed even if a
        # lookup or conversion below raises
        with open(os.path.join(output_dir, 'ssu_erroneous.tsv'), 'w') as fout:
            fout.write(header)

            taxon_profile = TaxonProfile(1, taxon_profile_dir)
            # NOTE(review): meaning of the fixed 25.0 argument is not visible
            # here -- confirm against TaxonProfile.common_taxa.
            common_taxa = taxon_profile.common_taxa(common_taxon_threshold, 25.0)
            genome_taxonomy = taxon_profile.read_genome_taxonomy()

            # items() rather than iteritems() so this runs on Python 2 and 3
            for genome_id, scaffold_ids in ssu_hits.items():
                # **** HACK for SRA processing: taxon profile results are
                # looked up by the genome id without the '.filtered' suffix
                gid = genome_id.replace('.filtered', '')

                for scaffold_id in scaffold_ids:
                    (_hmm_model, _evalue, _start, _stop, ssu_length,
                     _rev_comp, scaffold_len) = ssu_hits[genome_id][scaffold_id]

                    ssu_length = int(ssu_length)
                    scaffold_len = int(scaffold_len)

                    if ssu_length < ssu_min_length:
                        continue

                    if scaffold_id not in ssu_classifications[genome_id]:
                        continue

                    classification = ssu_classifications[genome_id][scaffold_id]
                    ssu_taxonomy, _, _, _, per_ident = classification
                    per_ident = float(per_ident)

                    # check if taxa indicated by 16S rRNA gene are congruent
                    # with the list of common taxa in genome
                    ssu_taxonomy = ssu_taxonomy.split(';')
                    genome_common_taxa = common_taxa[gid]
                    incongruent_common_taxa = None
                    for r, min_per_ident in enumerate(rank_thresholds):
                        if r not in genome_common_taxa:
                            # insufficient classified genes to determine
                            # common taxa at rank
                            break

                        common_taxa_at_rank = genome_common_taxa[r]
                        if len(common_taxa_at_rank) == 0:
                            # no consistent taxonomic signal at rank so do
                            # not filter
                            break

                        if ssu_taxonomy[r] == Taxonomy.rank_prefixes[r]:
                            # SSU taxon is only the bare rank prefix, so
                            # there is nothing to compare at this rank
                            break

                        if per_ident < min_per_ident:
                            # percent identity below the threshold required
                            # to evaluate this rank
                            break

                        if ssu_taxonomy[r] not in common_taxa_at_rank:
                            incongruent_common_taxa = common_taxa_at_rank
                            break

                    # report outliers
                    if incongruent_common_taxa is not None:
                        row = '%s\t%s\t%s' % (scaffold_id, genome_id,
                                              genome_taxonomy[gid])
                        row += '\t%s\t%d' % (
                            ';'.join(sorted(incongruent_common_taxa)),
                            len(scaffold_ids))
                        row += '\t%s\t%s\t%s\t%s\t%s' % tuple(classification)
                        row += '\t%d\n' % scaffold_len

                        fout.write(row)