Esempio n. 1
0
    def _producer_blast(self, genome_pair):
        """Run BLASTP in both directions between a pair of genomes.

        Each gene file is searched against the database of the other genome.
        Databases are looked up as '<genome id>.db' in the output directory
        and hits are written to '<query id>-<subject id>.blastp.tsv' in the
        same directory.

        Parameters
        ----------
        genome_pair : list
            Two amino acid gene files, one per genome. The genome identifier
            is the file name as returned by remove_extension().

        Returns
        -------
        bool
            Always True.
        """

        searcher = Blast(cpus=self.producer_cpus)

        gene_file_a, gene_file_b = genome_pair
        id_a = remove_extension(gene_file_a)
        id_b = remove_extension(gene_file_b)

        # A against B first, then B against A
        directions = ((gene_file_a, id_a, id_b),
                      (gene_file_b, id_b, id_a))
        for query_file, query_id, subject_id in directions:
            subject_db = os.path.join(self.output_dir, subject_id + '.db')
            hits_file = os.path.join(self.output_dir,
                                     query_id + '-' + subject_id + '.blastp.tsv')
            searcher.blastp(query_file, subject_db, hits_file, self.evalue)

        return True
Esempio n. 2
0
    def classify(self, seq_file, db, taxonomy_file, evalue_threshold, output_dir):
        """Classify rRNA genes.

        Sequences are searched against the database with BLASTN and the
        taxonomy of the first reported hit of each query is written to
        '<rna_name>.taxonomy.tsv' in the output directory.

        Parameters
        ----------
        seq_file : str
            Name of fasta file containing rRNA sequences.
        db : str
            BLAST database of rRNA genes.
        taxonomy_file : str
            Taxonomy file for genes in the rRNA database.
        evalue_threshold : float
            E-value threshold for defining valid hits.
        output_dir : str
            Output directory.
        """

        # blast sequences against rRNA database
        blast = Blast(self.cpus)
        blast_file = os.path.join(output_dir, '%s.blastn.tsv' % self.rna_name)
        blast.blastn(seq_file, db, blast_file, evalue=evalue_threshold,
                     max_matches=5, output_fmt='custom')

        # read taxonomy file
        taxonomy = Taxonomy().read(taxonomy_file)

        # write out classification file; context managers guarantee both
        # files are closed even if a malformed row raises
        classification_file = os.path.join(
            output_dir, '%s.taxonomy.tsv' % self.rna_name)
        with open(classification_file, 'w') as fout:
            fout.write(
                'query_id\ttaxonomy\tlength\tblast_subject_id\tblast_evalue\tblast_bitscore\tblast_align_len\tblast_perc_identity\n')

            processed_query_ids = set()
            with open(blast_file) as fin:
                for line in fin:
                    line_split = [x.strip() for x in line.split('\t')]
                    query_id = line_split[0]

                    if query_id in processed_query_ids:
                        # A query may have multiple hits to different genes or
                        # sections of a gene. Blast results are organized by
                        # bitscore so only the first hit is considered.
                        continue

                    processed_query_ids.add(query_id)

                    # column layout of the 'custom' blast table as used here
                    query_len = int(line_split[1])
                    subject_id = line_split[2]
                    align_len = line_split[5]
                    perc_identity = line_split[6]
                    evalue = line_split[7]
                    bitscore = line_split[8]

                    taxonomy_str = ';'.join(taxonomy[subject_id])

                    fout.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                        query_id, taxonomy_str, query_len, subject_id,
                        evalue, bitscore, align_len, perc_identity))
Esempio n. 3
0
    def _run_self_blastp(self, query_gene_file,
                         evalue,
                         per_identity,
                         per_aln_len,
                         max_hits,
                         tmp_dir,
                         output_dir):
        """Perform similarity search of query genes against themselves.

        A BLASTP database is built from the query gene file, the same file is
        searched against it, and the resulting hit table is passed to
        _sort_hit_table() to produce 'hits_sorted.tsv' in the output
        directory.

        Parameters
        ----------
        query_gene_file : str
            File with all query sequences.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
            Accepted but not applied by this method.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
            Accepted but not applied by this method.
        max_hits : int
            Maximum number of hits to report per query sequences.
        tmp_dir : str
            Directory to store temporary files; the default temporary
            directory is used when this is empty or None.
        output_dir : str
            Directory to store blast results.
        """

        # create a BLASTP database from the query genes
        self.logger.info('Creating BLASTP database (be patient!).')

        blast = Blast(self.cpus, silent=True)
        blast.create_blastp_db(query_gene_file)

        # reserve a uniquely named file for the unsorted hits; it is closed
        # immediately since only its name is needed (dir=None selects the
        # default temporary directory)
        tmp_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_',
                                                     dir=tmp_dir or None,
                                                     delete=False)
        tmp_hits_table.close()

        # blast all genes against the database
        self.logger.info('Performing sequence similarity search between all query genomes (be patient!).')
        blast.blastp(query_gene_file, query_gene_file, tmp_hits_table.name,
                     evalue, max_hits, task='blastp-fast')

        # sort hit table
        hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
        self._sort_hit_table(tmp_hits_table.name, hits_table_file)
Esempio n. 4
0
    def _tax_filter(self, ssu_output_file, taxonomy, output_dir):
        """Identify sequence to filter based on taxonomy of best BLAST hit.

        The sequences are searched against themselves with BLASTN. A query is
        flagged when a non-self hit with >= 82% identity over more than 800
        aligned positions comes from a genome assigned to a different order.
        Flagged sequences are also reported in 'tax_filter/filtered_seqs.tsv'
        under the output directory.

        Parameters
        ----------
        ssu_output_file : str
            Fasta file of sequences; used both to build the BLASTN database
            and as the query.
        taxonomy : d[genome_id] -> list of taxa
            Taxonomy of each genome, indexed by rank. The genome identifier
            is the part of a sequence identifier before the first '~'.
        output_dir : str
            Directory in which the 'tax_filter' directory is created.

        Returns
        -------
        set
            Identifiers of query sequences to filter.
        """

        tax_filter_dir = os.path.join(output_dir, 'tax_filter')
        if not os.path.exists(tax_filter_dir):
            os.makedirs(tax_filter_dir)

        blast = Blast(self.cpus)
        self.logger.info('Creating BLASTN database.')
        blast.create_blastn_db(ssu_output_file)

        self.logger.info(
            'Performing reciprocal BLAST to identify sequences with incongruent taxonomies.'
        )

        blast_table = os.path.join(tax_filter_dir, 'blastn.tsv')
        blast.blastn(ssu_output_file,
                     ssu_output_file,
                     blast_table,
                     evalue=1e-10,
                     max_matches=2,
                     output_fmt='custom',
                     task='megablast')

        seqs_to_filter = set()
        order_index = Taxonomy.rank_labels.index('order')
        with open(os.path.join(tax_filter_dir, 'filtered_seqs.tsv'), 'w') as fout:
            fout.write(
                'Seq Id\tQuery Taxonomy\tSubject Taxonomy\tPerc. Identity\tAlign. Length\n'
            )
            for hit in blast.read_hit(blast_table, table_fmt='custom'):
                if hit.query_id == hit.subject_id:
                    # ignore self hits
                    continue

                # require a (very lenient) percent identity of 82%
                # (threshold from Yarza et al., 2014)
                if hit.perc_identity >= 82 and hit.alignment_len > 800:
                    # there is a close hit in the database so verify it has
                    # the expected taxonomic order
                    query_genome_id = hit.query_id.split('~', 1)[0]
                    subject_genome_id = hit.subject_id.split('~', 1)[0]

                    # drop the 3-character rank prefix from each order label
                    order_of_query = taxonomy[query_genome_id][order_index][3:].strip()
                    order_of_subject = taxonomy[subject_genome_id][order_index][3:].strip()

                    # sequences lacking an order assignment are never flagged
                    if order_of_query and order_of_subject and order_of_query != order_of_subject:
                        seqs_to_filter.add(hit.query_id)
                        # every field is tab separated so that rows line up
                        # with the five header columns
                        fout.write(
                            '%s\t%s\t%s\t%.2f\t%d\n' %
                            (hit.query_id, ';'.join(taxonomy[query_genome_id]),
                             ';'.join(taxonomy[subject_genome_id]),
                             hit.perc_identity, hit.alignment_len))

        return seqs_to_filter
Esempio n. 5
0
    def _run_self_blastp(self, query_gene_file, evalue, per_identity,
                         per_aln_len, max_hits, tmp_dir, output_dir):
        """Perform similarity search of query genes against themselves.

        A BLASTP database named 'query_genes' is built in the output
        directory, the query genes are searched against it, and the resulting
        hit table is passed to _sort_hit_table() to produce 'hits_sorted.tsv'
        in the output directory.

        Parameters
        ----------
        query_gene_file : str
            File with all query sequences.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
            Accepted but not applied by this method.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
            Accepted but not applied by this method.
        max_hits : int
            Maximum number of hits to report per query sequences.
        tmp_dir : str
            Directory to store temporary files; the default temporary
            directory is used when this is empty or None.
        output_dir : str
            Directory to store blast results.
        """

        # create a BLASTP database from the query genes
        self.logger.info('Creating BLASTP database (be patient!).')

        blastp_db = os.path.join(output_dir, 'query_genes')
        blast = Blast(self.cpus, silent=True)
        blast.create_blastp_db(query_gene_file, blastp_db)

        # reserve a uniquely named file for the unsorted hits; it is closed
        # immediately since only its name is needed (dir=None selects the
        # default temporary directory)
        tmp_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_',
                                                     dir=tmp_dir or None,
                                                     delete=False)
        tmp_hits_table.close()

        # blast all genes against the database
        self.logger.info(
            'Performing sequence similarity search between all query genomes (be patient!).'
        )
        blast.blastp(query_gene_file,
                     blastp_db,
                     tmp_hits_table.name,
                     evalue,
                     max_hits,
                     task='blastp-fast')

        # sort hit table
        hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
        self._sort_hit_table(tmp_hits_table.name, hits_table_file)
Esempio n. 6
0
    def run(self, query_proteins, db_file, custom_db_file, taxonomy_file,
            custom_taxonomy_file, evalue, per_identity, per_aln_len,
            max_matches, homology_search, min_per_taxa, consensus, min_per_bp,
            use_trimAl, restrict_taxon, msa_program, tree_program, prot_model,
            skip_rooting, output_dir):
        """Infer a gene tree for homologs genes identified by blast.

        Workflow for inferring a gene tree from sequences identified as being
        homologs to a set of query proteins. Homologs are identified using
        BLASTP (or DIAMOND when homology_search is 'diamond') and a set of
        user-defined parameters.

        The process exits with a non-zero status if no homologs remain in the
        reference database after the optional taxonomic restriction.

        Parameters
        ----------
        query_proteins : str
            Fasta file containing query proteins.
        db_file : str
            BLAST database of reference proteins.
        custom_db_file : str
            Custom database of proteins.
        taxonomy_file : str
            Taxonomic assignment of each reference genomes.
        custom_taxonomy_file : str
            Taxonomic assignment of genomes in custom database.
        evalue : float
            E-value threshold used to define homolog.
        per_identity : float
            Percent identity threshold used to define a homolog.
        per_aln_len : float
            Alignment length threshold used to define a homolog.
        max_matches : int
            Maximum matches per query protein.
        homology_search : str
            Type of homology search to perform.
        min_per_taxa : float
            Minimum percentage of taxa required to retain a column.
        consensus : float
            Minimum percentage of the same amino acid required to retain column.
        min_per_bp : float
            Minimum percentage of base pairs required to keep trimmed sequence.
        use_trimAl : boolean
            Filter columns using trimAl.
        restrict_taxon : str
            Restrict alignment to specific taxonomic group (e.g., k__Archaea).
        msa_program : str
            Program to use for multiple sequence alignment ['mafft', 'muscle'].
        tree_program : str
            Program to use for tree inference ['fasttree', 'raxml'].
        prot_model : str
            Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
        skip_rooting : boolean
            Skip midpoint rooting if True.
        output_dir : str
            Directory to store results.
        """

        # validate query sequence names for use with GeneTreeTk
        validate_seq_ids(query_proteins)

        # read taxonomy file
        self.logger.info('Reading taxonomy file.')
        taxonomy = Taxonomy().read(taxonomy_file)

        if custom_taxonomy_file:
            custom_taxonomy = Taxonomy().read(custom_taxonomy_file)
            taxonomy.update(custom_taxonomy)

        # report distribution of query genes
        mean_len, max_len, min_len, p10, p50, p90 = self._gene_distribution(
            query_proteins)
        self.logger.info(
            'Query gene lengths: min, mean, max = %d, %.1f, %d | p10, p50, p90 = %.1f, %.1f, %.1f'
            % (min_len, mean_len, max_len, p10, p50, p90))

        # identify homologs in the reference database
        self.logger.info('Identifying homologs using %s.' % homology_search)
        blast = Blast(self.cpus)
        blast_output = os.path.join(output_dir, 'reference_hits.tsv')
        homologs = self._search_homologs(blast, query_proteins, db_file,
                                         blast_output, evalue, per_identity,
                                         per_aln_len, max_matches,
                                         homology_search)
        self.logger.info('Identified %d homologs in reference database.' %
                         len(homologs))

        # identify homologs in the optional custom database
        custom_homologs = None
        if custom_db_file:
            custom_blast_output = os.path.join(output_dir, 'custom_hits.tsv')
            custom_homologs = self._search_homologs(blast, query_proteins,
                                                    custom_db_file,
                                                    custom_blast_output,
                                                    evalue, per_identity,
                                                    per_aln_len, max_matches,
                                                    homology_search)
            self.logger.info('Identified %d homologs in custom database.' %
                             len(custom_homologs))

        # restrict homologs to specific taxonomic group; only hits from the
        # reference database are restricted
        if restrict_taxon:
            self.logger.info('Restricting homologs to %s.' % restrict_taxon)
            restricted_homologs = {}
            for query_id, hit in homologs.items():
                genome_id = hit.subject_id.split('~')[0]
                if restrict_taxon in taxonomy[genome_id]:
                    restricted_homologs[query_id] = hit

            self.logger.info(
                '%d of %d homologs in reference database are from the specified group.'
                % (len(restricted_homologs), len(homologs)))
            homologs = restricted_homologs

        if not homologs:
            self.logger.error(
                'Too few homologs were identified. Gene tree cannot be inferred.'
            )
            # report the failure through the exit status as well as the log
            sys.exit(1)

        # extract homologs
        self.logger.info(
            'Extracting homologs and determining local gene context.')
        db_homologs_tmp = os.path.join(output_dir, 'homologs_db.tmp')
        gene_precontext, gene_postcontext = self.extract_homologs_and_context(
            homologs.keys(), db_file, db_homologs_tmp)

        # report gene length distribution of homologs
        mean_len, max_len, min_len, p10, p50, p90 = self._gene_distribution(
            db_homologs_tmp)
        self.logger.info(
            'Homolog gene lengths: min, mean, max = %d, %.1f, %d | p10, p50, p90 = %.1f, %.1f, %.1f'
            % (min_len, mean_len, max_len, p10, p50, p90))

        # concatenate homologs with initial query genes
        homolog_ouput_tmp = os.path.join(output_dir, 'homologs.faa.tmp')
        if custom_homologs:
            custom_db_homologs_tmp = os.path.join(output_dir,
                                                  'custom_homologs_db.tmp')
            custom_gene_precontext, custom_gene_postcontext = self.extract_homologs_and_context(
                custom_homologs.keys(), custom_db_file, custom_db_homologs_tmp)
            gene_precontext.update(custom_gene_precontext)
            gene_postcontext.update(custom_gene_postcontext)
            homologs.update(custom_homologs)
            concatenate_files(
                [query_proteins, db_homologs_tmp, custom_db_homologs_tmp],
                homolog_ouput_tmp)
            os.remove(custom_db_homologs_tmp)
        else:
            concatenate_files([query_proteins, db_homologs_tmp],
                              homolog_ouput_tmp)

        os.remove(db_homologs_tmp)

        # remove stop codons
        homolog_ouput = os.path.join(output_dir, 'homologs.faa')
        self._remove_stop_codons(homolog_ouput_tmp, homolog_ouput)
        os.remove(homolog_ouput_tmp)

        # infer multiple sequence alignment
        msa = MsaWorkflow(self.cpus)
        trimmed_msa_output = msa.run(homolog_ouput, min_per_taxa, consensus,
                                     min_per_bp, use_trimAl, msa_program,
                                     output_dir)

        # infer tree
        tw = TreeWorkflow(self.cpus)
        tree_output = tw.run(trimmed_msa_output, tree_program, prot_model,
                             skip_rooting, output_dir)

        # create tax2tree consensus map and decorate tree
        self.logger.info('Decorating internal tree nodes with tax2tree.')
        output_taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        with open(output_taxonomy_file, 'w') as fout:
            for homolog_id in homologs:
                genome_id = homolog_id.split('~')[0]
                t = taxonomy.get(genome_id, None)
                # homologs from genomes without a taxonomy are left out
                if t:
                    fout.write(homolog_id + '\t' + ';'.join(t) + '\n')

        t2t_tree = os.path.join(output_dir, 'homologs.tax2tree.tree')
        cmd = 't2t decorate -m %s -t %s -o %s' % (output_taxonomy_file,
                                                  tree_output, t2t_tree)
        os.system(cmd)

        # create tree with leaf nodes given as genome accessions
        tree = dendropy.Tree.get_from_path(t2t_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        for leaf in tree.leaf_node_iter():
            leaf.taxon.label = leaf.taxon.label.split('~')[0]

        genome_tree = os.path.join(output_dir,
                                   'homologs.tax2tree.genome_accessions.tree')
        tree.write_to_path(genome_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        # setup metadata for ARB file; the version is read from the VERSION
        # file located next to this source file
        src_dir = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(src_dir, 'VERSION')) as version_file:
            version = version_file.read().strip()

        metadata = {}
        metadata['genetreetk_version'] = version
        metadata['genetreetk_query_proteins'] = query_proteins
        metadata['genetreetk_db_file'] = db_file
        metadata['genetreetk_taxonomy_file'] = taxonomy_file
        metadata['genetreetk_blast_evalue'] = str(evalue)
        metadata['genetreetk_blast_per_identity'] = str(per_identity)
        metadata['genetreetk_blast_per_aln_len'] = str(per_aln_len)
        metadata['genetreetk_blast_max_matches'] = str(max_matches)
        metadata['genetreetk_homology_search'] = homology_search

        metadata['genetreetk_msa_min_per_taxa'] = str(min_per_taxa)
        metadata['genetreetk_msa_consensus'] = str(consensus)
        metadata['genetreetk_msa_min_per_bp'] = str(min_per_bp)
        metadata['genetreetk_msa_program'] = msa_program

        metadata['genetreetk_tree_program'] = tree_program
        metadata['genetreetk_tree_prot_model'] = prot_model

        # create ARB metadata file
        self.logger.info('Creating ARB metadata file.')
        arb_metadata_file = os.path.join(output_dir, 'arb.metadata.txt')
        self.create_arb_metadata(homologs, trimmed_msa_output, taxonomy,
                                 metadata, gene_precontext, gene_postcontext,
                                 arb_metadata_file)

    def _search_homologs(self, blast, query_proteins, db_file, hits_file,
                         evalue, per_identity, per_aln_len, max_matches,
                         homology_search):
        """Search a database for homologs of the query proteins.

        Parameters
        ----------
        blast : Blast
            Object used to run BLASTP and to identify homologs in the hits.
        query_proteins : str
            Fasta file containing query proteins.
        db_file : str
            Database of proteins to search.
        hits_file : str
            File to which the table of hits is written.
        evalue : float
            E-value threshold used to define homolog.
        per_identity : float
            Percent identity threshold used to define a homolog.
        per_aln_len : float
            Alignment length threshold used to define a homolog.
        max_matches : int
            Maximum matches per query protein.
        homology_search : str
            'diamond' to search with DIAMOND; any other value is passed to
            BLASTP as the task.

        Returns
        -------
        Homologs as returned by Blast.identify_homologs().
        """

        if homology_search == 'diamond':
            diamond = Diamond(self.cpus)
            diamond.blastp(query_proteins,
                           db_file,
                           evalue,
                           per_identity,
                           per_aln_len,
                           max_matches,
                           hits_file,
                           output_fmt='custom')
        else:
            blast.blastp(query_proteins,
                         db_file,
                         hits_file,
                         evalue,
                         max_matches,
                         output_fmt='custom',
                         task=homology_search)

        return blast.identify_homologs(hits_file, evalue, per_identity,
                                       per_aln_len)
Esempio n. 7
0
    def _tax_filter(self, ssu_output_file, taxonomy, output_dir):
        """Identify sequence to filter based on taxonomy of best BLAST hit.

        The sequences are searched against themselves with BLASTN. A query is
        flagged when a non-self hit with >= 82% identity over more than 800
        aligned positions comes from a genome assigned to a different order.
        Flagged sequences are also reported in 'tax_filter/filtered_seqs.tsv'
        under the output directory, at most once per query.

        Parameters
        ----------
        ssu_output_file : str
            Fasta file of sequences; used both to build the BLASTN database
            and as the query.
        taxonomy : d[genome_id] -> list of taxa
            Taxonomy of each genome, indexed by rank. The genome identifier
            is the part of a sequence identifier before the first '~'.
        output_dir : str
            Directory in which the 'tax_filter' directory is created.

        Returns
        -------
        set
            Identifiers of query sequences to filter.
        """

        extant_taxa = Taxonomy().extant_taxa(taxonomy)

        tax_filter_dir = os.path.join(output_dir, 'tax_filter')
        if not os.path.exists(tax_filter_dir):
            os.makedirs(tax_filter_dir)

        blast = Blast(self.cpus)
        self.logger.info('Creating BLASTN database.')
        blast.create_blastn_db(ssu_output_file)

        self.logger.info(
            'Performing reciprocal BLAST to identify sequences with incongruent taxonomies.'
        )

        blast_table = os.path.join(tax_filter_dir, 'blastn.tsv')
        blast.blastn(ssu_output_file,
                     ssu_output_file,
                     blast_table,
                     evalue=1e-10,
                     max_matches=2,
                     output_fmt='custom',
                     task='blastn')

        # Only the order-level check is applied. Identity thresholds that
        # were previously present for other ranks but disabled:
        # phylum 75, class 78.5, family 86.5, genus 94.5, species 99.
        rank_index = Taxonomy.rank_labels.index('order')

        seqs_to_filter = set()
        with open(os.path.join(tax_filter_dir, 'filtered_seqs.tsv'), 'w') as fout:
            fout.write(
                'Query ID\tQuery Taxonomy\tSubject ID\tSubject Taxonomy\tPerc. Identity\tAlign. Length\tMismatch Rank\tNo. Query Genomes\tNo. Subject Genomes\n'
            )
            for hit in blast.read_hit(blast_table, table_fmt='custom'):
                if hit.query_id == hit.subject_id:
                    # ignore self hits
                    continue

                # require a long alignment and a (very lenient) percent
                # identity threshold from Yarza et al., 2014
                if not (hit.alignment_len > 800 and hit.perc_identity >= 82):
                    continue

                if hit.query_id in seqs_to_filter:
                    # each query is reported at most once
                    continue

                query_genome_id = hit.query_id.split('~', 1)[0]
                subject_genome_id = hit.subject_id.split('~', 1)[0]

                # drop the 3-character rank prefix from each order label
                query_taxa = taxonomy[query_genome_id][rank_index][3:].strip()
                subject_taxa = taxonomy[subject_genome_id][rank_index][3:].strip()

                # sequences lacking an order assignment are never flagged
                if query_taxa and subject_taxa and query_taxa != subject_taxa:
                    seqs_to_filter.add(hit.query_id)
                    fout.write(
                        '%s\t%s\t%s\t%s\t%.2f\t%d\t%s\t%d\t%d\n' %
                        (hit.query_id, ';'.join(taxonomy[query_genome_id]),
                         hit.subject_id, ';'.join(taxonomy[subject_genome_id]),
                         hit.perc_identity, hit.alignment_len, 'Order',
                         len(extant_taxa['o__' + query_taxa]),
                         len(extant_taxa['o__' + subject_taxa])))

        return seqs_to_filter
Esempio n. 8
0
File: ssu.py Progetto: wwood/RefineM
    def classify(self, seq_files, ssu_db, ssu_taxonomy_file, evalue_threshold,
                 output_dir):
        """Classify 16S rRNA genes.

        For each genome, sequences are searched against the database with
        BLASTN and the taxonomy of the first reported hit of each query is
        written to 'ssu.taxonomy.tsv' in the genome's directory under the
        output directory.

        Parameters
        ----------
        seq_files : d[genome_id] -> fasta file
            Fasta file containing 16S rRNA sequences for each genome.
        ssu_db : str
            BLAST database of 16S rRNA genes.
        ssu_taxonomy_file : str
            Taxonomy file for genes in the 16S rRNA database.
        evalue_threshold : float
            E-value threshold for defining valid hits.
        output_dir : str
            Output directory. Results for a genome are written to
            <output_dir>/<genome_id>, which is not created here.

        Returns
        -------
        d[genome_id][query_id] -> list
            Classification of each SSU sequence as
            [taxonomy string, query length, e-value, alignment length,
            percent identity].
        """

        blast = Blast(self.cpus)

        self.logger.info('Classifying SSU rRNA genes.')

        # the taxonomy is the same for every genome so it is read only once
        taxonomy = Taxonomy().read(ssu_taxonomy_file)

        classifications = defaultdict(dict)
        for genome_id, seq_file in seq_files.items():
            genome_dir = os.path.join(output_dir, genome_id)

            # blast sequences against 16S database
            blast_file = os.path.join(genome_dir, 'ssu.blastn.tsv')
            blast.blastn(seq_file,
                         ssu_db,
                         blast_file,
                         evalue=evalue_threshold,
                         max_matches=1,
                         output_fmt='custom')

            # write out classification file; context managers guarantee both
            # files are closed even if a malformed row raises
            classification_file = os.path.join(genome_dir, 'ssu.taxonomy.tsv')
            with open(classification_file, 'w') as fout:
                fout.write(
                    'query_id\tssu_taxonomy\tssu_length\tssu_blast_subject_id\tssu_blast_evalue\tssu_blast_bitscore\tssu_blast_align_len\tssu_blast_perc_identity\n'
                )

                processed_query_ids = set()
                with open(blast_file) as fin:
                    for line in fin:
                        line_split = [x.strip() for x in line.split('\t')]
                        query_id = line_split[0]

                        if query_id in processed_query_ids:
                            # A query may have multiple hits to different
                            # sections of a gene. Blast results are organized
                            # by e-value so only the first hit is considered.
                            # The subject gene is the same in all cases so
                            # the taxonomy string will be identical.
                            continue

                        processed_query_ids.add(query_id)

                        # column layout of the 'custom' blast table as used
                        # here
                        query_len = int(line_split[1])
                        subject_id = line_split[2]
                        align_len = line_split[5]
                        perc_identity = line_split[6]
                        evalue = line_split[7]
                        bitscore = line_split[8]

                        taxonomy_str = ';'.join(taxonomy[subject_id])

                        classifications[genome_id][query_id] = [
                            taxonomy_str, query_len, evalue, align_len,
                            perc_identity
                        ]
                        fout.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                                   (query_id, taxonomy_str, query_len,
                                    subject_id, evalue, bitscore, align_len,
                                    perc_identity))

        return classifications