Example #1
def cache_enzyme(enzyme_id):
    """ Lookup and/or cache enzyme """
    result = DataStore.get_entry("pathways").exec_query("enzyme-lookup", [enzyme_id]).fetchone()

    if result:
        # Cache hit
        return parse_kegg(result[0])

    # Cache miss
    enzyme_raw = retrieve_kegg(enzyme_id)
    DataStore.get_entry("pathways").insert_rows("enzyme", [(enzyme_id, enzyme_raw)])
    return parse_kegg(enzyme_raw)
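
A minimal usage sketch of the cache-aside pattern above, assuming pathways_init() (Example #18) has already set up the "pathways" DataStore and that parse_kegg/retrieve_kegg are the project's KEGG helpers:

# Illustrative only; the EC number argument is hypothetical.
pathways_init()
enzyme = cache_enzyme("1.1.1.1")  # cache miss: fetched from KEGG, then stored
enzyme = cache_enzyme("1.1.1.1")  # cache hit: served from the "enzyme" table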
Example #2
def cache_pathway(path_id):
    """ Lookup and/or cache pathway """
    result = DataStore.get_entry("pathways").exec_query("pathway-lookup", [path_id]).fetchone()

    if result:
        # Cache hit
        return parse_kegg(result[0])

    # Cache miss
    pathway_raw = retrieve_kegg(path_id)
    DataStore.get_entry("pathways").insert_rows("pathway", [(path_id, pathway_raw)])
    return parse_kegg(pathway_raw)
Example #3
def build_tree(entries):
    """ Create tree structure out of taxon lineage data """
    ret_tree = {"name": "all", "children": dict(), "enzymes": Counter()}

    for entry in entries:
        result = DataStore.get_entry("taxonomy").exec_query("lineage-lookup", [entry[0]]).fetchone()
        if not result:
            continue

        raw_lineage = result[0]

        # If this is a species-level UniProt mnemonic (not a generic 9CCCC-style ID), append the species to the lineage
        if not entry[0].startswith("9"):
            raw_lineage += ";{0}".format(entry[1])

        ranks = raw_lineage.split(";")
        parent = ret_tree["children"]

        for rank in ranks:
            rank = "Unknown" if rank == "" else rank.strip()

            # Initialize dictionary for this rank, add child values
            parent.setdefault(rank, {"name": rank, "children": dict(), "enzymes": Counter()})
            parent[rank]["enzymes"] += Counter(entries[entry])
            parent = parent[rank]["children"]

        # Update root level
        ret_tree["enzymes"] += Counter(entries[entry])

    return ret_tree
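
For reference, a hedged sketch of the shapes build_tree() appears to expect and produce, inferred from entry[0], entry[1], and Counter(entries[entry]) above; it assumes taxonomy_init() (Example #5) has populated the lineage table:

from collections import Counter  # build_tree relies on collections.Counter

# Hypothetical input: (UniProt mnemonic, species name) -> {EC number: hits}
entries = {("ECOLI", "Escherichia coli"): {"2.7.1.1": 3}}

tree = build_tree(entries)
# tree["enzymes"] now aggregates all counts at the root, and each nested
# tree["children"][rank] repeats the same {name, children, enzymes} layout.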
Example #4
def treeify_lineage(taxa_entries, taxa_count):
    """ Create tree structure out of taxa/lineage data """
    ret_tree = (dict(), taxa_count)

    for taxon in taxa_entries:
        result = DataStore.get_entry("taxonomy").exec_query(
            "lineage-lookup", [taxon[0]]).fetchone()
        if not result:
            continue

        raw_lineage = result[0]
        ranks = raw_lineage.split(";")
        parent = ret_tree[0]

        for rank in ranks:
            rank = rank.strip()

            # Start new dictionary, initialize size cache
            if rank not in parent:
                parent[rank] = (dict(), 0)

            parent[rank] = (parent[rank][0],
                            parent[rank][1] + taxa_entries[taxon])
            parent = parent[rank][0]

    return ret_tree
Example #5
def taxonomy_init():
    # Setup FileStore
    FileStore("taxonomy-db", "taxonomy-db", "taxonomy.db", None,
              FileStore.FTYPE_CACHE, FileStore.FOPT_NORMAL)
    FileStore("taxonomy-lineage", "taxonomy-lineage", "taxonomy-lineage.dat",
              "http://www.uniprot.org/taxonomy/?query=&sort=score&format=tab",
              FileStore.FTYPE_TEMP, FileStore.FOPT_NORMAL)

    # Setup DataStore
    DataStore("taxonomy", FileStore.get_entry("taxonomy-db").path)
    DataStore.get_entry("taxonomy").create_table(
        "lineage", [("mnemonic", "text", "PRIMARY KEY"),
                    ("lineage", "text", "")])
    DataStore.get_entry("taxonomy").define_query(
        "lineage-lookup", "SELECT lineage FROM lineage WHERE mnemonic = ?")

    # Populate database
    populate_database()
Example #6
def render_sequences(cluster_ids):
    """ Retrieve all members of requested UniRef90 clusters and render fasta data """
    # Prepare all sequence files for reading
    for entry in FileStore.get_group("decluster-seqs"):
        entry.get_handle("rt")

    for cluster_id in cluster_ids:
        # fetchall() rather than an unsized fetchmany(), so every cluster
        # member is retrieved, as the docstring promises
        for result in DataStore.get_entry("crossref").exec_query(
                "uniprot_cross_acc", ("UniRef90", cluster_id)).fetchall():
            core.main.send_output(get_sequence(result[0]), "stdout", "")
Example #7
def decluster_init():
    # Setup FileStore
    FileStore("decluster-db", "decluster-db", "decluster.db", None,
              FileStore.FTYPE_CACHE, FileStore.FOPT_NORMAL)
    FileStore(
        "decluster-seqs", "decluster-swissprot",
        "decluster_uniprot_sprot.fasta.gz",
        "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz",
        FileStore.FTYPE_CACHE, FileStore.FOPT_GZIP_DECOMPRESS)
    FileStore(
        "decluster-seqs", "decluster-trembl",
        "decluster_uniprot_trembl.fasta.gz",
        "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz",
        FileStore.FTYPE_CACHE, FileStore.FOPT_GZIP_DECOMPRESS)

    # Setup DataStore
    DataStore("decluster", FileStore.get_entry("decluster-db").path)
    DataStore.get_entry("decluster").create_table(
        "indices", [("id", "text", "PRIMARY KEY"), ("file", "text", ""),
                    ("pos", "integer", "")])
    DataStore.get_entry("decluster").define_query(
        "index-lookup", "SELECT file, pos FROM indices WHERE id = ?")

    populate_database()
Example #8
def filter_taxa(taxa, pattern):
    """ Filter taxa having a specific taxonomic rank (as regex) """
    ret_taxa = dict()
    ret_count = 0

    for taxon in taxa:
        result = DataStore.get_entry("taxonomy").exec_query(
            "lineage-lookup", [taxon[0]]).fetchone()
        lineage = result[0] if result else "Unknown"

        if re.search(pattern, lineage):
            ret_taxa[taxon] = taxa[taxon]
            ret_count += taxa[taxon]

    return ret_taxa, ret_count
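
A usage sketch for the regex filter, assuming the taxonomy DataStore is initialized and that taxa maps (mnemonic, species) keys to counts as in the other examples; the input dict is hypothetical:

import re  # filter_taxa relies on re.search at module level

taxa = {("ECOLI", "Escherichia coli"): 42}
# Keep only taxa whose lineage mentions Proteobacteria.
bacteria, bacteria_count = filter_taxa(taxa, r"Proteobacteria")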
Example #9
def group_sam(data, sam_file, species):
    """ Group SAM reads per taxonomic rank """
    ret_data = dict()

    # Simplify species lookup keys
    species_lookup = {item[0]: item[1] for item in species}

    # Read SAM entries
    entries = SamEntry.get_entries(sam_file, 0)

    for entry in entries:
        reference = entries[entry].reference

        # Currently only handles UniProt
        if "_" not in reference:
            continue

        mnemonic = reference.split("_")[1]

        # Skip entries that are too ambiguous (e.g. 9ZZZZ) or recently moved by UniProt
        if mnemonic not in species_lookup:
            continue

        result = DataStore.get_entry("taxonomy").exec_query(
            "lineage-lookup", [mnemonic]).fetchone()
        if not result:
            continue

        # Combine lineage and species for lookup into taxonomy results
        lineage = [item.strip() for item in result[0].split(";")]
        lineage.append(species_lookup[mnemonic])

        for rank in lineage:
            if rank in data:
                query = entries[entry].query
                ret_data.setdefault(query, list()).append(rank)

    return ret_data
Example #10
def get_sequence(acc):
    """ Lookup sequence for given acc """
    result = DataStore.get_entry("decluster").exec_query(
        "index-lookup", [acc]).fetchone()
    if not result:
        core.main.send_output(
            "Sequence not found for UniProt accession '{0}'".format(acc))
        sys.exit(1)

    # Seek the index position in the appropriate file
    handle = FileStore.get_entry(result[0], "decluster-seqs").get_handle()
    handle.seek(result[1])

    # Append sequence data until next header
    ret_seq = ""
    for line in handle:
        if line.startswith(">") and ret_seq:
            break

        ret_seq += line

    return ret_seq
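
A usage sketch, assuming decluster_init() (Example #7) has built the byte-offset index over the UniProt FASTA files; the accession is hypothetical:

decluster_init()                # builds or refreshes the "indices" table
fasta = get_sequence("P12345")  # returns the header line plus sequence lines
core.main.send_output(fasta, "stdout", "")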
Example #11
def combined_compare(taxonomy_entries, sam_entries, uniprot_entries,
                     pass_total):
    """ Combined compare breaks down taxonomy report on per read basis, then aggregates """
    ret_entries = dict()

    # Calculate unmapped difference and add to taxonomy set
    unmapped_diff = 0
    for sam_entry in sam_entries:
        for set_idx in range(2):
            if sam_entry[set_idx] == "*":
                if set_idx == 0:
                    pass_total += 1

                # +1 for an unmapped read in set 0, -1 for set 1 (net delta)
                unmapped_diff += [1, -1][set_idx]

    # Add unmapped entry to taxonomy list
    taxonomy_entries["Unmapped"] = unmapped_diff

    # Iterate through each taxonomy grouping
    for taxon_entry in taxonomy_entries:
        # Skip unchanged entries
        if taxonomy_entries[taxon_entry] == 0:
            continue

        # Prepare entry if new
        if taxon_entry not in ret_entries:
            ret_entries[taxon_entry] = (dict(), taxonomy_entries[taxon_entry] /
                                        pass_total)

        # Scan SAM entries for matching lineage
        for sam_entry in sam_entries:
            # Lookup species and lineage
            sam_records = list()
            lineage = list()

            for set_idx in range(2):
                if sam_entry[set_idx] == "*":
                    # Ad-hoc placeholder record for an unmapped read
                    miss_record = type("miss_record", (object, ), {})()
                    miss_record.species_id = "Unmapped"
                    miss_record.species_full = "Unmapped"
                    sam_records.append(miss_record)
                    # Store as a one-rank list so the positional lineage
                    # comparison below works the same as for real lineages
                    lineage.append(["Unmapped"])
                else:
                    sam_records.append(uniprot_entries[set_idx][
                        sam_entry[set_idx].split("|")[-1]])
                    species_id = sam_records[set_idx].species_id
                    result = DataStore.get_entry("taxonomy").exec_query(
                        "lineage-lookup", [species_id]).fetchone()

                    if result:
                        lineage.append(
                            [x.strip() for x in result[0].split(";")])
                    else:
                        lineage.append(["Unknown"])

            for set_idx in range(2):
                # Attempt to match on species
                if sam_records[set_idx].species_full == taxon_entry:
                    sam_species = sam_records[1 - set_idx].species_full
                    sam_amount = [1, -1][set_idx] * sam_entries[sam_entry]

                    if sam_species not in ret_entries[taxon_entry][0]:
                        ret_entries[taxon_entry][0][sam_species] = (0, 0)

                    sam_parts = ret_entries[taxon_entry][0][sam_species]

                    if sam_amount > 0:
                        ret_entries[taxon_entry][0][sam_species] = (
                            sam_parts[0] + sam_amount, sam_parts[1])
                    else:
                        ret_entries[taxon_entry][0][sam_species] = (
                            sam_parts[0], sam_parts[1] + sam_amount)

                    break

                # Attempt to match on lineage
                for rank_idx in range(len(lineage[set_idx])):
                    if lineage[set_idx][rank_idx] == taxon_entry:
                        compare_idx = rank_idx

                        if compare_idx >= len(lineage[1 - set_idx]):
                            compare_idx = len(lineage[1 - set_idx]) - 1

                        sam_lineage = lineage[1 - set_idx][compare_idx]
                        sam_amount = [1, -1][set_idx] * sam_entries[sam_entry]

                        if sam_lineage not in ret_entries[taxon_entry][0]:
                            ret_entries[taxon_entry][0][sam_lineage] = (0, 0)

                        sam_parts = ret_entries[taxon_entry][0][sam_lineage]

                        if sam_amount > 0:
                            ret_entries[taxon_entry][0][sam_lineage] = (
                                sam_parts[0] + sam_amount, sam_parts[1])
                        else:
                            ret_entries[taxon_entry][0][sam_lineage] = (
                                sam_parts[0], sam_parts[1] + sam_amount)

                        break

    # Calculate percentages
    for taxonomy_entry in ret_entries:
        for sam_entry in ret_entries[taxonomy_entry][0]:
            sam_parts = ret_entries[taxonomy_entry][0][sam_entry]
            ret_entries[taxonomy_entry][0][sam_entry] = (sam_parts[0] /
                                                         pass_total,
                                                         sam_parts[1] /
                                                         pass_total)

    return ret_entries
Example #12
def populate_database():
    """ Populate sequence header indices """
    if not DataStore.get_entry("decluster").get_expired("indices", 30):
        return

    core.main.send_output("Populating UniProt sequences...", "stderr")

    # Start transaction and empty any existing data
    DataStore.get_entry("decluster").process_trans()
    DataStore.get_entry("decluster").delete_rows("indices")

    # Download each sequence file
    for entry in FileStore.get_group("decluster-seqs"):
        entry.prepare()

        with entry.get_handle("rt") as handle:
            acc = ""

            while True:
                line = handle.readline()
                if not line:
                    break

                if line.startswith(">"):
                    fields = line.rstrip().split()
                    acc = fields[0].split("|")[1]
                    DataStore.get_entry("decluster").insert_rows(
                        "indices",
                        [(acc, entry.fid, handle.tell() - len(line))])

    # Finalize transaction and current table age
    DataStore.get_entry("decluster").process_trans()
    DataStore.get_entry("decluster").update_age("indices")
Example #13
def crossref_init():
    # Setup FileStore
    FileStore("crossref-db", "crossref-db", "crossref.db", None,
              FileStore.FTYPE_CACHE, FileStore.FOPT_NORMAL)
    FileStore(
        "crossref-uniprot", "crossref-uniprot", "idmapping.dat.gz",
        "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz",
        FileStore.FTYPE_TEMP, FileStore.FOPT_GZIP)

    # Setup DataStore
    DataStore("crossref", FileStore.get_entry("crossref-db").get_path())
    DataStore.get_entry("crossref").create_table("uniprot",
                                                 [("acc", "text", ""),
                                                  ("db", "text", ""),
                                                  ("cross", "text", "")])
    DataStore.get_entry("crossref").define_query(
        "uniprot_acc_cross",
        "SELECT cross FROM uniprot WHERE acc = ? AND db = ?")
    DataStore.get_entry("crossref").define_query(
        "uniprot_acc_all", "SELECT db, cross FROM uniprot WHERE acc = ?")
    DataStore.get_entry("crossref").define_query(
        "uniprot_cross_acc",
        "SELECT acc FROM uniprot WHERE db = ? AND cross = ?")
    DataStore.get_entry("crossref").define_query(
        "uniprot_cross_cross",
        "SELECT t2.cross FROM uniprot AS t1 JOIN uniprot AS t2 ON acc WHERE t1.db = ? AND t1.cross = ? AND t2.db = ?"
    )
    DataStore.get_entry("crossref").define_index("uniprot_acc", "uniprot",
                                                 ["acc"], False)
    DataStore.get_entry("crossref").define_index("uniprot_acc_db", "uniprot",
                                                 ["acc", "db"], False)
    DataStore.get_entry("crossref").define_index("uniprot_db_cross", "uniprot",
                                                 ["db", "cross"], False)

    # Populate database
    populate_database()
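
A hedged sketch of querying the cross-reference table once crossref_init() has run; the accession is hypothetical and the parameter order follows the "uniprot_acc_cross" definition above (acc first, then db):

crossref_init()
cur = DataStore.get_entry("crossref").exec_query(
    "uniprot_acc_cross", ("P12345", "UniRef90"))
print(cur.fetchone())  # e.g. the UniRef90 cluster id for that accession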
Example #14
def populate_database():
    """ Generate (if necessary) and get lineage lookup """
    if not DataStore.get_entry("taxonomy").get_expired("lineage", 30):
        return

    core.main.send_output("Populating taxonomic lineage data...", "stderr")

    # Download tab delimited data
    entry = FileStore.get_entry("taxonomy-lineage")
    entry.prepare()

    # Start transaction and empty any existing data
    DataStore.get_entry("taxonomy").process_trans()
    DataStore.get_entry("taxonomy").delete_rows("lineage")

    # Iterate through downloaded table and add rows
    with entry.get_handle("r") as handle:
        for line in handle:
            fields = line.rstrip().split("\t")
            if len(fields) < 9 or fields[1] == "":
                continue

            # Add to database
            DataStore.get_entry("taxonomy").insert_rows(
                "lineage", [(fields[1], fields[8])])

    # Finalize transaction and current table age
    DataStore.get_entry("taxonomy").process_trans()
    DataStore.get_entry("taxonomy").update_age("lineage")
Example #15
def populate_database():
    """ Generate cross-reference database """
    if not DataStore.get_entry("crossref").get_expired("uniprot", 30):
        return

    core.main.send_output("Populating UniProt database cross-references...",
                          "stderr")

    # Download tab delimited data
    entry = FileStore.get_entry("crossref-uniprot")
    entry.prepare()

    # Start transaction and empty any existing data
    DataStore.get_entry("crossref").drop_index("uniprot_acc")
    DataStore.get_entry("crossref").drop_index("uniprot_acc_db")
    DataStore.get_entry("crossref").drop_index("uniprot_db_cross")
    DataStore.get_entry("crossref").process_trans()
    DataStore.get_entry("crossref").delete_rows("uniprot")

    # Iterate through downloaded table and add rows
    with entry.get_handle("rt") as handle:
        for line in handle:
            fields = line.rstrip().split("\t")
            if len(fields) < 3:
                continue

            # Add to database
            DataStore.get_entry("crossref").insert_rows("uniprot", [fields])

    # Finalize transaction and current table age
    DataStore.get_entry("crossref").process_trans()
    DataStore.get_entry("crossref").create_index("uniprot_acc")
    DataStore.get_entry("crossref").create_index("uniprot_acc_db")
    DataStore.get_entry("crossref").create_index("uniprot_db_cross")
    DataStore.get_entry("crossref").update_age("uniprot")
Example #16
                    with open(fpath) as f:
                        if is_binary_string(f.read(128)):
                            continue
                        else:
                            f.seek(0)
                        alerts.extend(
                            check_alerts_in_file(code_checker, f, fname))
    else:
        with open(path) as f:
            alerts.extend(check_alerts_in_file(code_checker, f, path))

    data_store = None
    if args.store:
        (host, port) = args.store.split(":")
        data_store = DataStore(host=host,
                               port=port,
                               default_doctype="repoguard",
                               default_index="repoguard")

    for alert in alerts:
        print 'file:\t%s:%s\nrule:\t%s\nline:\t%s\ndescr:\t%s\n' % (
            alert.filename,
            alert.line_number,
            alert.rule.name,
            alert.line[0:200].strip().replace("\t", " ").decode(
                'utf-8', 'replace'),
            alert.rule.description,
        )
        if args.store:
            try:
                body = {
                    "check_id":
Example #17
                fpath = os.path.join(root, fname)
                if not os.path.islink(fpath):
                    with open(fpath) as f:
                        if is_binary_string(f.read(128)):
                            continue
                        else:
                            f.seek(0)
                        alerts.extend(check_alerts_in_file(code_checker, f, fname))
    else:
        with open(path) as f:
            alerts.extend(check_alerts_in_file(code_checker, f, path))

    data_store = None
    if args.store:
        (host, port) = args.store.split(":")
        data_store = DataStore(host=host, port=port, default_doctype="repoguard", default_index="repoguard")

    for alert in alerts:
        print 'file:\t%s\nrule:\t%s\nline:\t%s\ndescr:\t%s\n' % (
            alert.filename, alert.rule.name,
            alert.line[0:200].strip().replace("\t", " ").decode('utf-8', 'replace'), alert.rule.description,
        )
        if args.store:
            try:
                body = {
                    "check_id": alert.rule.name,
                    "description": alert.rule.description,
                    "filename": alert.filename,
                    "commit_id": alert.commit,
                    "matching_line": alert.line[0:200].replace("\t", " ").decode('utf-8', 'replace'),
                    "repo_name": alert.repo,
Example #18
def pathways_init():
    # Setup FileStore
    FileStore("pathways-db", "pathways-db", "pathways.db", None, FileStore.FTYPE_CACHE, FileStore.FOPT_NORMAL)

    # Setup DataStore
    DataStore("pathways", FileStore.get_entry("pathways-db").path)
    DataStore.get_entry("pathways").create_table("enzyme", [("ec", "text", "PRIMARY KEY"), ("pathway", "text", "")])
    DataStore.get_entry("pathways").create_table("pathway", [("pathway", "text", "PRIMARY KEY"), ("info", "text", "")])
    DataStore.get_entry("pathways").define_query("enzyme-lookup", "SELECT pathway FROM enzyme WHERE ec = ?")
    DataStore.get_entry("pathways").define_query("pathway-lookup", "SELECT info FROM pathway WHERE pathway = ?")

    # Check for expired database
    if DataStore.get_entry("pathways").get_expired("enzyme", 30):
        DataStore.get_entry("pathways").delete_rows("enzyme")
        DataStore.get_entry("pathways").delete_rows("pathway")
        DataStore.get_entry("pathways").update_age("enzyme")
Example #19
def get_data_store():
    datastore = DataStore(host=app.config["ELASTIC_HOST"],
                          port=app.config["ELASTIC_PORT"],
                          default_index=app.config["INDEX"],
                          default_doctype=app.config["DOC_TYPE"])
    return datastore
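
A hedged sketch of using this factory, assuming app is a Flask application whose config supplies the four keys referenced above; the values are illustrative:

from flask import Flask

app = Flask(__name__)
app.config.update(ELASTIC_HOST="localhost", ELASTIC_PORT=9200,
                  INDEX="repoguard", DOC_TYPE="repoguard")

store = get_data_store()  # Elasticsearch-backed DataStore built from app.config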