Example #1
0
def cazy_parser(tsv, namemap, output, summary_method='best', min_identity=60, min_bitscore=0, min_length=60, max_evalue=0.000001, top_fraction=1, max_hits=10, table_name="cazy"):
    """Parse BLAST hits from CAZy reference database.

    The BLAST hits are assumed to be sorted by query with decreasing bitscores (best alignment first):

        \b
        sort -k1,1 -k12,12rn tsv > sorted_tsv

    Expected columns in the CAZy database:

        \b
        cazy_gene
        cazy_family
        cazy_class
        cazy_ec

    For a given gene match, all possible ECs are returned in a single line separated by '|'.

    One tab-separated row is written to ``output`` per query (ORF). When no hit
    passes the filters, or the selected hit is absent from the database table,
    the CAZy columns are written as 'NA'.

    Args:
        tsv (str): blast hits file path in default tabular format
        namemap (str): sqlite database file path
        output (str): :py:class:`click.File` in write mode
        summary_method (str): either 'majority' or 'best'; summary method for annotating ORFs; when majority and there is no majority, best is used
        min_identity (int): minimum allowable percent ID of BLAST hit
        min_bitscore (int): minimum allowable bitscore of BLAST hit; 0 disables
        min_length (int): minimum allowable BLAST alignment length
        max_evalue (float): maximum allowable e-value of BLAST hit
        top_fraction (float): filters ORF BLAST hits before finding majority by only keep hits within this fraction, e.g. 0.98, of the highest bitscore
        max_hits (int): maximum number of BLAST hits to consider when summarizing ORFs as a majority
        table_name (str): table name within namemap database; expected columns are listed above

    """
    logging.info("Parsing %s", tsv)

    # a fraction of 1 is treated as "filter disabled"
    if top_fraction == 1:
        top_fraction = None

    # Table names cannot be bound as SQL parameters, so only the table name is
    # interpolated; the hit ID is always passed as a bound value.
    select_hit = ("SELECT cazy_gene, cazy_family, cazy_class, cazy_ec "
                  "FROM %s WHERE cazy_gene = ?" % table_name)

    print("contig", "orf", "cazy_gene", "cazy_family", "cazy_class", "cazy_ec",
          "%s_evalue" % table_name, "%s_bitscore" % table_name, sep="\t", file=output)
    with contextlib.closing(sqlite3.connect(namemap)) as conn, gzopen(tsv) as blast_tab_fh:
        cursor = conn.cursor()
        # group consecutive lines by their first column (the query/ORF name)
        for query, qgroup in groupby(blast_tab_fh, key=lambda x: x.partition("\t")[0]):

            # query names look like <contig>_<orf index>; split on the last underscore
            contig_name, _, orf_idx = query.rpartition("_")
            hit_id, evalue, bitscore = get_hit_from_blast_group(qgroup, max_hits, top_fraction, min_length, min_identity, max_evalue, min_bitscore, summary_method)
            cazy_gene = "NA"
            cazy_family = "NA"
            cazy_class = "NA"
            cazy_ec = "NA"

            # everything could have been filtered out due to user constraints
            if hit_id:
                cursor.execute(select_hit, (hit_id,))
                row = cursor.fetchone()
                if row is None:
                    # hit has no metadata row; keep the NA placeholders
                    logging.warning("'%s' not present in database", hit_id)
                else:
                    cazy_gene, cazy_family, cazy_class, cazy_ec = row
            print(contig_name, "%s_%s" % (contig_name, orf_idx), cazy_gene, cazy_family,
                  cazy_class, cazy_ec, evalue, bitscore, sep="\t", file=output)
    logging.info("Complete")
Example #2
0
def parse_blast_results_with_tree(blast_tab,
                                  name_map,
                                  summary_method,
                                  tree,
                                  min_identity=70,
                                  min_bitscore=0,
                                  min_length=60,
                                  max_evalue=0.000001,
                                  max_hits_per_orf=10,
                                  top_fraction_of_hits=None,
                                  table_name="refseq",
                                  lca_threshold=1):
    """Parse BLAST results (-outfmt 6), filter, and aggregate ORF taxonomies.

    Each input line is expected to carry the contig name in an extra first
    column, followed by the standard tabular BLAST columns; lines are grouped
    by the second column (the ORF ID). Within an ORF, hits are assumed to be
    sorted by decreasing bitscore.

    Hits whose subject ID has no row in ``table_name`` are skipped with a
    warning.

    Args:
        blast_tab (str): file path to blast TSV file
        name_map (str): sqlite database file path; ``table_name`` must provide
            the columns ``name``, ``function`` and ``taxonomy``
        summary_method (str): method of ORF annotation selection; one of
            'lca', 'best', 'majority'
        tree: object providing ``lca(names, threshold=...)``; only used when
            ``summary_method`` is 'lca'
        min_identity (float): minimum allowable percent ID of BLAST hit
        min_bitscore (float): minimum allowable bitscore of BLAST hit; 0 disables
        min_length (int): minimum allowable BLAST alignment length
        max_evalue (float): maximum allowable e-value of BLAST hit
        max_hits_per_orf (int): passed to ``BlastHits`` as ``max_hits``
        top_fraction_of_hits (float): passed to ``BlastHits`` as
            ``top_fraction``; 1 or None disables
        table_name (str): table name within the name_map database
        lca_threshold (float): the first parent above this fraction of representation (its count is
            greater than the total * lca_threshold)

    Returns:
        dict: dict of dicts where first key is contig name, inner key is ORF ID; values are tuple of
              protein function, taxonomy ID, bitscore, evalue

    Raises:
        AssertionError when ORF summary method is not supported (['lca', 'best', 'majority'])
    """

    # allowing 1 and 0 to disable
    if top_fraction_of_hits == 1:
        top_fraction_of_hits = None

    assert summary_method in ["lca", "best", "majority"]
    contigs = defaultdict(dict)

    # Table names cannot be bound as SQL parameters, so only the table name is
    # interpolated; the subject ID is always passed as a bound value.
    select_hit = "SELECT function, taxonomy FROM %s WHERE name = ?" % table_name

    with contextlib.closing(sqlite3.connect(name_map)) as conn, gzopen(
            blast_tab) as blast_tab_fh:
        cursor = conn.cursor()
        # group hits by ORF (column 2)
        for orf_id, qgroup in groupby(blast_tab_fh,
                                      key=lambda x: x.split("\t")[1]):

            # defaults reported when no hit survives filtering
            protein_function = "hypothetical protein"
            taxonomy_id = "1"
            bitscore = "NA"
            evalue = "NA"
            orf_hits = BlastHits(max_hits=max_hits_per_orf,
                                 top_fraction=top_fraction_of_hits)
            lines = []

            # iterate over blast hits per ORF
            for hsp in qgroup:
                # HSPs will now have contig in column 1
                toks = hsp.strip().split("\t")
                # remove extra column from toks
                contig_name = toks.pop(0)
                # convert toks to dictionary
                toks = dict(zip(BLAST6, toks))

                if (int(toks["length"]) < min_length
                        or float(toks["pident"]) < min_identity
                        or float(toks["evalue"]) > max_evalue):
                    continue
                if min_bitscore and float(toks["bitscore"]) < min_bitscore:
                    # input is sorted by decreasing bitscore
                    break

                cursor.execute(select_hit, (toks["sseqid"],))
                row = cursor.fetchone()
                if row is None:
                    # subject has no metadata row; it cannot contribute a
                    # function or taxonomy, so move on to the next hit
                    logging.warning("'%s' not present in database",
                                    toks["sseqid"])
                    continue
                current_function, current_taxonomy = row

                # update taxonomy based on pident; would be similar to 16S taxonomy assignments
                # current_taxonomy = tree.climb_tree(current_taxonomy, float(toks["pident"]))

                if summary_method == "best":
                    # first passing hit wins; no need to look further
                    taxonomy_id = current_taxonomy
                    protein_function = current_function
                    bitscore = toks["bitscore"]
                    evalue = toks["evalue"]
                    break

                # TODO implement bitscore ratio as a measure of alignment quality as a function of input sequence
                orf_hits.add(current_taxonomy, toks["bitscore"])
                toks["current_function"] = current_function
                toks["current_taxonomy"] = current_taxonomy
                lines.append(toks)

            # summary method is 'majority' or 'lca' and we have passing HSPs
            if summary_method != "best" and lines:
                if summary_method == "majority":
                    taxonomy_id = orf_hits.majority()
                    # report the first (highest ranked) hit carrying the
                    # winning taxonomy
                    for toks in lines:
                        if toks["current_taxonomy"] == taxonomy_id:
                            bitscore = toks["bitscore"]
                            evalue = toks["evalue"]
                            protein_function = toks["current_function"]
                            break
                # summary method is 'lca'
                else:
                    orf_hits.names.reverse()
                    taxonomy_id = tree.lca(orf_hits.names,
                                           threshold=lca_threshold)
                    # grabbing best hit's bitscore and evalue
                    bitscore = lines[0]["bitscore"]
                    evalue = lines[0]["evalue"]
                    protein_function = lines[0]["current_function"]

                if bitscore == "NA":
                    logging.critical(
                        "The summarized ID (%s) was not assigned metadata",
                        taxonomy_id)

            # contig_name comes from the last line read for this ORF
            contigs[contig_name][orf_id] = (protein_function, taxonomy_id,
                                            bitscore, evalue)

    return contigs
Example #3
0
def eggnog_parser(tsv, namemap, output, summary_method, min_identity, min_bitscore, min_length, max_evalue, top_fraction, max_hits, table_name):
    """Parse BLAST hits from EGGNOG.

    The BLAST hits are assumed to be sorted by query with decreasing bitscores (best alignment first):

        \b
        sort -k1,1 -k12,12rn tsv > sorted_tsv

    Expected columns in the EggNOG database:

        \b
        uniprot_ac
        eggnog_ssid_b
        eggnog_species_id
        uniprot_id
        ko_id
        ko_level1_name
        ko_level2_name
        ko_level3_id
        ko_level3_name
        ko_gene_symbol
        ko_product
        ko_ec

    One tab-separated row is written to ``output`` per query (ORF). When no hit
    passes the filters, or the selected hit is absent from the database table,
    the EggNOG columns are written as 'NA'.

    Args:
        tsv (str): blast hits file path in default tabular format
        namemap (str): sqlite database file path
        output (str): :py:class:`click.File` in write mode
        summary_method (str): either 'majority' or 'best'; summary method for annotating ORFs; when majority and there is no majority, best is used
        min_identity (int): minimum allowable percent ID of BLAST hit
        min_bitscore (int): minimum allowable bitscore of BLAST hit; 0 disables
        min_length (int): minimum allowable BLAST alignment length
        max_evalue (float): maximum allowable e-value of BLAST hit
        top_fraction (float): filters ORF BLAST hits before finding majority by only keep hits within this fraction, e.g. 0.98, of the highest bitscore
        max_hits (int): maximum number of BLAST hits to consider when summarizing ORFs as a majority
        table_name (str): table name within namemap database; expected columns are listed above

    """
    logging.info("Parsing %s", tsv)

    # a fraction of 1 is treated as "filter disabled"
    if top_fraction == 1:
        top_fraction = None

    # Table names cannot be bound as SQL parameters, so only the table name is
    # interpolated; the hit ID is always passed as a bound value.
    select_hit = ("SELECT uniprot_ac, eggnog_ssid_b, eggnog_species_id, uniprot_id, "
                  "ko_id, ko_level1_name, ko_level2_name, "
                  "ko_level3_id, ko_level3_name, ko_gene_symbol, ko_product, ko_ec "
                  "FROM %s WHERE eggnog_ssid_b = ?" % table_name)

    print("contig", "orf", "uniprot_ac", "eggnog_ssid_b", "eggnog_species_id", "uniprot_id",
          "ko_id", "ko_level1_name", "ko_level2_name", "ko_level3_id", "ko_level3_name",
          "ko_gene_symbol", "ko_product", "ko_ec", "%s_evalue" % table_name,
          "%s_bitscore" % table_name, sep="\t", file=output)

    with contextlib.closing(sqlite3.connect(namemap)) as conn, gzopen(tsv) as blast_tab_fh:
        cursor = conn.cursor()
        # group consecutive lines by their first column (the query/ORF name)
        for query, qgroup in groupby(blast_tab_fh, key=lambda x: x.partition("\t")[0]):

            # query names look like <contig>_<orf index>; split on the last underscore
            contig_name, _, orf_idx = query.rpartition("_")
            hit_id, evalue, bitscore = get_hit_from_blast_group(qgroup, max_hits, top_fraction, min_length, min_identity, max_evalue, min_bitscore, summary_method)
            uniprot_ac = "NA"
            eggnog_ssid_b = "NA"
            eggnog_species_id = "NA"
            uniprot_id = "NA"
            ko_id = "NA"
            ko_level1_name = "NA"
            ko_level2_name = "NA"
            ko_level3_id = "NA"
            ko_level3_name = "NA"
            ko_gene_symbol = "NA"
            ko_product = "NA"
            ko_ec = "NA"

            # everything could have been filtered out due to user constraints
            if hit_id:
                cursor.execute(select_hit, (hit_id,))
                row = cursor.fetchone()
                if row is None:
                    # legacy before database was pruned; can have hits not in metadata
                    logging.warning("'%s' not present in database", hit_id)
                else:
                    uniprot_ac, eggnog_ssid_b, eggnog_species_id, uniprot_id, ko_id, \
                        ko_level1_name, ko_level2_name, ko_level3_id, ko_level3_name, \
                        ko_gene_symbol, ko_product, ko_ec = row

            # print for this query
            print(contig_name, "%s_%s" % (contig_name, orf_idx), uniprot_ac, eggnog_ssid_b,
                  eggnog_species_id, uniprot_id, ko_id, ko_level1_name, ko_level2_name,
                  ko_level3_id, ko_level3_name, ko_gene_symbol, ko_product, ko_ec, evalue,
                  bitscore, sep="\t", file=output)
    logging.info("Complete")