Python gzopen Examples

Programming Language: Python

Namespace/Package Name: atlas.utils

Method/Function: gzopen

Examples at hotexamples.com: 3

Python gzopen - 3 examples found. These are the top-rated real-world Python examples of atlas.utils.gzopen extracted from open source projects. You can rate examples to help us improve their quality.

Example #1

Show file

def cazy_parser(tsv, namemap, output, summary_method='best', min_identity=60, min_bitscore=0, min_length=60, max_evalue=0.000001, top_fraction=1, max_hits=10, table_name="cazy"):
    """Parse BLAST hits from CAZy reference database.

    The BLAST hits are assumed to be sorted by query with decreasing bitscores (best alignment first):

        \b
        sort -k1,1 -k12,12rn tsv > sorted_tsv

    Expected columns in the CAZy database:

        \b
        cazy_gene
        cazy_family
        cazy_class
        cazy_ec

    For a given gene match, all possible ECs are returned in a single line separated by '|'.

    One tab-separated line is written to ``output`` per query (ORF). When no hit survives
    filtering, or the selected hit has no row in the database table, the CAZy columns are
    written as 'NA'.

    Args:
        tsv (str): blast hits file path in default tabular format
        namemap (str): sqlite database file path
        output (file): writable text handle, e.g. :py:class:`click.File` in write mode
        summary_method (str): either 'majority' or 'best'; summary method for annotating ORFs; when majority and there is no majority, best is used
        min_identity (int): minimum allowable percent ID of BLAST hit
        min_bitscore (int): minimum allowable bitscore of BLAST hit; 0 disables
        min_length (int): minimum allowable BLAST alignment length
        max_evalue (float): maximum allowable e-value of BLAST hit
        top_fraction (float): filters ORF BLAST hits before finding majority by only keep hits within this fraction, e.g. 0.98, of the highest bitscore
        max_hits (int): maximum number of BLAST hits to consider when summarizing ORFs as a majority
        table_name (str): table name within namemap database; expected columns are listed above

    """
    logging.info("Parsing %s", tsv)

    # a fraction of 1 is passed downstream as None
    if top_fraction == 1:
        top_fraction = None

    print("contig", "orf", "cazy_gene", "cazy_family", "cazy_class", "cazy_ec",
          "%s_evalue" % table_name, "%s_bitscore" % table_name, sep="\t", file=output)

    # Identifiers cannot be bound as SQL parameters, so the table name is still
    # interpolated; the looked-up value is bound with '?' so that ids containing
    # quotes (or matching a column name) cannot change the meaning of the query.
    select_sql = ("SELECT cazy_gene, cazy_family, cazy_class, cazy_ec "
                  "FROM %s WHERE cazy_gene=?" % table_name)

    with contextlib.closing(sqlite3.connect(namemap)) as conn, gzopen(tsv) as blast_tab_fh:
        cursor = conn.cursor()
        # consecutive lines sharing the first column (query ID) form one group
        for query, qgroup in groupby(blast_tab_fh, key=lambda x: x.partition("\t")[0]):

            # the query ID is split on its last underscore into contig name and ORF index
            contig_name, _, orf_idx = query.rpartition("_")
            hit_id, evalue, bitscore = get_hit_from_blast_group(qgroup, max_hits, top_fraction, min_length, min_identity, max_evalue, min_bitscore, summary_method)
            cazy_gene = "NA"
            cazy_family = "NA"
            cazy_class = "NA"
            cazy_ec = "NA"

            # everything could have been filtered out due to user constraints
            if hit_id:
                cursor.execute(select_sql, (hit_id,))
                row = cursor.fetchone()
                if row is None:
                    # the hit has no metadata row; keep the 'NA' placeholders
                    logging.warning("'%s' not present in database", hit_id)
                else:
                    cazy_gene, cazy_family, cazy_class, cazy_ec = row

            print(contig_name, "%s_%s" % (contig_name, orf_idx), cazy_gene, cazy_family,
                  cazy_class, cazy_ec, evalue, bitscore, sep="\t", file=output)
    logging.info("Complete")

Example #2

Show file

def parse_blast_results_with_tree(blast_tab,
                                  name_map,
                                  summary_method,
                                  tree,
                                  min_identity=70,
                                  min_bitscore=0,
                                  min_length=60,
                                  max_evalue=0.000001,
                                  max_hits_per_orf=10,
                                  top_fraction_of_hits=None,
                                  table_name="refseq",
                                  lca_threshold=1):
    """Parse BLAST results (-outfmt 6), filter, and aggregate ORF taxonomies.

    The input is expected to carry the contig name as an extra first column,
    followed by the standard tabular BLAST columns, and to be grouped by ORF
    (column 2) with hits in decreasing bitscore order.

    Args:
        blast_tab (str): file path to blast TSV file
        name_map (str): sqlite database file path; ``table_name`` must provide
            the columns ``name``, ``function`` and ``taxonomy``
        summary_method (str): method of ORF annotation selection; one of
            'lca', 'best', 'majority'
        tree: object providing ``lca(names, threshold=...)``; only used when
            ``summary_method`` is 'lca'
        min_identity (int): minimum allowable percent ID of BLAST hit
        min_bitscore (int): minimum allowable bitscore of BLAST hit; 0 disables
        min_length (int): minimum allowable BLAST alignment length
        max_evalue (float): maximum allowable e-value of BLAST hit
        max_hits_per_orf (int): passed to ``BlastHits`` as ``max_hits``
        top_fraction_of_hits (float): passed to ``BlastHits`` as
            ``top_fraction``; a value of 1 is converted to None
        table_name (str): table name within the ``name_map`` database
        lca_threshold (float): the first parent above this fraction of representation (its count is
            greater than the total * lca_threshold)

    Returns:
        dict: dict of dicts where first key is contig name, inner key is ORF ID; values are tuple of
              protein function, taxonomy ID, bitscore, evalue

    Raises:
        AssertionError when ORF summary method is not supported (['lca', 'best', 'majority'])
    """

    # allowing 1 and 0 to disable (1 is mapped to None here; 0 is passed through
    # unchanged -- presumably BlastHits treats a falsy fraction as disabled)
    if top_fraction_of_hits == 1:
        top_fraction_of_hits = None

    assert summary_method in ["lca", "best", "majority"]
    contigs = defaultdict(dict)

    # Identifiers cannot be bound as SQL parameters, so the table name is still
    # interpolated; the subject ID is bound with '?' so that ids containing
    # quotes (or matching a column name) cannot change the meaning of the query.
    lookup_sql = "SELECT function, taxonomy FROM %s WHERE name=?" % table_name

    with contextlib.closing(sqlite3.connect(name_map)) as conn, gzopen(
            blast_tab) as blast_tab_fh:
        cursor = conn.cursor()
        # group hits by ORF (column 2)
        for orf_id, qgroup in groupby(blast_tab_fh,
                                      key=lambda x: x.split("\t")[1]):

            # defaults reported when no hit passes the filters
            protein_function = "hypothetical protein"
            taxonomy_id = "1"
            bitscore = "NA"
            evalue = "NA"
            orf_hits = BlastHits(max_hits=max_hits_per_orf,
                                 top_fraction=top_fraction_of_hits)
            lines = []

            # iterate over blast hits per ORF
            for hsp in qgroup:
                # HSPs will now have contig in column 1
                toks = hsp.strip().split("\t")
                # remove extra column from toks
                contig_name = toks.pop(0)
                # convert toks to dictionary
                toks = dict(zip(BLAST6, toks))

                if (int(toks["length"]) < min_length
                        or float(toks["pident"]) < min_identity
                        or float(toks["evalue"]) > max_evalue):
                    continue
                if min_bitscore and float(toks["bitscore"]) < min_bitscore:
                    # input is sorted by decreasing bitscore
                    break

                cursor.execute(lookup_sql, (toks["sseqid"],))
                row = cursor.fetchone()
                if row is None:
                    # a hit without a metadata row cannot be annotated; skip it
                    # rather than failing on the tuple unpacking below
                    logging.warning("'%s' not present in database",
                                    toks["sseqid"])
                    continue
                current_function, current_taxonomy = row

                # update taxonomy based on pident; would be similar to 16S taxonomy assignments
                # current_taxonomy = tree.climb_tree(current_taxonomy, float(toks["pident"]))

                if summary_method == "best":
                    # first passing hit wins; no aggregation needed
                    taxonomy_id = current_taxonomy
                    protein_function = current_function
                    bitscore = toks["bitscore"]
                    evalue = toks["evalue"]
                    break

                # TODO implement bitscore ratio as a measure of alignment quality as a function of input sequence
                orf_hits.add(current_taxonomy, toks["bitscore"])
                toks["current_function"] = current_function
                toks["current_taxonomy"] = current_taxonomy
                lines.append(toks)

            # summary method is 'majority' or 'lca' and we have passing HSPs
            if summary_method != "best" and lines:
                if summary_method == "majority":
                    taxonomy_id = orf_hits.majority()
                    # report metadata of the first passing hit carrying the
                    # chosen taxonomy
                    for toks in lines:
                        if toks["current_taxonomy"] == taxonomy_id:
                            bitscore = toks["bitscore"]
                            evalue = toks["evalue"]
                            protein_function = toks["current_function"]
                            break
                # summary method is 'lca'
                else:
                    orf_hits.names.reverse()
                    taxonomy_id = tree.lca(orf_hits.names,
                                           threshold=lca_threshold)
                    # grabbing best hit's bitscore and evalue
                    bitscore = lines[0]["bitscore"]
                    evalue = lines[0]["evalue"]
                    protein_function = lines[0]["current_function"]

                if bitscore == "NA":
                    logging.critical(
                        "The summarized ID (%s) was not assigned metadata" %
                        taxonomy_id)

            # contig_name is always bound here: every group yielded by groupby
            # contains at least one line
            contigs[contig_name][orf_id] = (protein_function, taxonomy_id,
                                            bitscore, evalue)

    return contigs

Example #3

Show file

def eggnog_parser(tsv, namemap, output, summary_method, min_identity, min_bitscore, min_length, max_evalue, top_fraction, max_hits, table_name):
    """Parse BLAST hits from EGGNOG.

    The BLAST hits are assumed to be sorted by query with decreasing bitscores (best alignment first):

        \b
        sort -k1,1 -k12,12rn tsv > sorted_tsv

    Expected columns in the EggNOG database:

        \b
        uniprot_ac
        eggnog_ssid_b
        eggnog_species_id
        uniprot_id
        ko_id
        ko_level1_name
        ko_level2_name
        ko_level3_id
        ko_level3_name
        ko_gene_symbol
        ko_product
        ko_ec

    One tab-separated line is written to ``output`` per query (ORF). When no hit survives
    filtering, or the selected hit has no row in the database table, the EggNOG columns are
    written as 'NA'.

    Args:
        tsv (str): blast hits file path in default tabular format
        namemap (str): sqlite database file path
        output (file): writable text handle, e.g. :py:class:`click.File` in write mode
        summary_method (str): either 'majority' or 'best'; summary method for annotating ORFs; when majority and there is no majority, best is used
        min_identity (int): minimum allowable percent ID of BLAST hit
        min_bitscore (int): minimum allowable bitscore of BLAST hit; 0 disables
        min_length (int): minimum allowable BLAST alignment length
        max_evalue (float): maximum allowable e-value of BLAST hit
        top_fraction (float): filters ORF BLAST hits before finding majority by only keep hits within this fraction, e.g. 0.98, of the highest bitscore
        max_hits (int): maximum number of BLAST hits to consider when summarizing ORFs as a majority
        table_name (str): table name within namemap database; expected columns are listed above

    """
    logging.info("Parsing %s", tsv)

    # a fraction of 1 is passed downstream as None
    if top_fraction == 1:
        top_fraction = None

    print("contig", "orf", "uniprot_ac", "eggnog_ssid_b", "eggnog_species_id", "uniprot_id",
          "ko_id", "ko_level1_name", "ko_level2_name", "ko_level3_id", "ko_level3_name",
          "ko_gene_symbol", "ko_product", "ko_ec", "%s_evalue" % table_name,
          "%s_bitscore" % table_name, sep="\t", file=output)

    # Identifiers cannot be bound as SQL parameters, so the table name is still
    # interpolated; the looked-up value is bound with '?' so that ids containing
    # quotes (or matching a column name) cannot change the meaning of the query.
    select_sql = ("SELECT uniprot_ac, eggnog_ssid_b, eggnog_species_id, uniprot_id, "
                  "ko_id, ko_level1_name, ko_level2_name, "
                  "ko_level3_id, ko_level3_name, ko_gene_symbol, ko_product, ko_ec "
                  "FROM %s WHERE eggnog_ssid_b=?" % table_name)

    with contextlib.closing(sqlite3.connect(namemap)) as conn, gzopen(tsv) as blast_tab_fh:
        cursor = conn.cursor()
        # consecutive lines sharing the first column (query ID) form one group
        for query, qgroup in groupby(blast_tab_fh, key=lambda x: x.partition("\t")[0]):

            # the query ID is split on its last underscore into contig name and ORF index
            contig_name, _, orf_idx = query.rpartition("_")
            hit_id, evalue, bitscore = get_hit_from_blast_group(qgroup, max_hits, top_fraction, min_length, min_identity, max_evalue, min_bitscore, summary_method)
            uniprot_ac = "NA"
            eggnog_ssid_b = "NA"
            eggnog_species_id = "NA"
            uniprot_id = "NA"
            ko_id = "NA"
            ko_level1_name = "NA"
            ko_level2_name = "NA"
            ko_level3_id = "NA"
            ko_level3_name = "NA"
            ko_gene_symbol = "NA"
            ko_product = "NA"
            ko_ec = "NA"

            # everything could have been filtered out due to user constraints
            if hit_id:
                cursor.execute(select_sql, (hit_id,))
                row = cursor.fetchone()
                if row is None:
                    # legacy before database was pruned; can have hits not in metadata
                    logging.warning("'%s' not present in database", hit_id)
                else:
                    uniprot_ac, eggnog_ssid_b, eggnog_species_id, uniprot_id, ko_id, \
                        ko_level1_name, ko_level2_name, ko_level3_id, ko_level3_name, \
                        ko_gene_symbol, ko_product, ko_ec = row

            # print for this query
            print(contig_name, "%s_%s" % (contig_name, orf_idx), uniprot_ac, eggnog_ssid_b,
                  eggnog_species_id, uniprot_id, ko_id, ko_level1_name, ko_level2_name,
                  ko_level3_id, ko_level3_name, ko_gene_symbol, ko_product, ko_ec, evalue,
                  bitscore, sep="\t", file=output)
    logging.info("Complete")