Example 1
def insert_structural_models(pro_url: str, stg_url: str, p_entry2xrefs: str):
    logger.info("finding entries with structures")
    has_structures = set()
    with DumpFile(p_entry2xrefs) as df:
        for accession, xrefs in df:
            if xrefs["structures"]:
                has_structures.add(accession)

    my_con = MySQLdb.connect(**url2dict(stg_url))
    my_cur = my_con.cursor()
    my_cur.execute("DROP TABLE IF EXISTS webfront_structuralmodel")
    my_cur.execute("""
        CREATE TABLE webfront_structuralmodel
        (
            model_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            accession VARCHAR(25) NOT NULL,
            contacts LONGBLOB NOT NULL,
            lddt LONGBLOB NOT NULL,
            structure LONGBLOB NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)

    # Load accessions of signatures with structural models
    logger.info("finding entries with trRosetta structural models")
    ora_con = cx_Oracle.connect(pro_url)
    ora_cur = ora_con.cursor()
    ora_cur.outputtypehandler = blob_as_str
    ora_cur.execute("SELECT METHOD_AC FROM INTERPRO.PFAM_TRROSETTA")
    to_import = {acc for acc, in ora_cur if acc not in has_structures}

    logger.info(f"{len(to_import)} entries with structural models to import")
    for acc in to_import:
        ora_cur.execute(
            """
            SELECT PROB_CONTACTS, PRED_LDDT, PRED_STRUCTURE
            FROM INTERPRO.PFAM_TRROSETTA
            WHERE METHOD_AC = :1
            """, (acc, ))

        for cmap_gz, lddt_gz, pdb_gz in ora_cur:
            my_cur.execute(
                """
                    INSERT INTO webfront_structuralmodel (
                      accession, contacts, lddt, structure
                    )
                    VALUES (%s, %s, %s, %s)
                """, (acc, cmap_gz, lddt_gz, pdb_gz))

    ora_cur.close()
    ora_con.close()

    my_con.commit()
    my_cur.execute("""
        CREATE INDEX i_structuralmodel
        ON webfront_structuralmodel (accession)
        """)
    my_cur.close()
    my_con.close()

    logger.info("complete")
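
A minimal, self-contained sketch of the same load pattern (drop and recreate the
target table, stream rows out of the source with bound parameters, create the
index only after the bulk load), using the standard library's sqlite3; the table
and column names here are illustrative, not part of the pipeline.

import sqlite3

def copy_models(src: sqlite3.Connection, dst: sqlite3.Connection):
    dst.execute("DROP TABLE IF EXISTS structuralmodel")
    dst.execute("""
        CREATE TABLE structuralmodel
        (
            model_id INTEGER PRIMARY KEY AUTOINCREMENT,
            accession TEXT NOT NULL,
            structure BLOB NOT NULL
        )
        """)

    # Stream rows from the source and insert them with bound parameters
    for accession, structure in src.execute("SELECT accession, structure "
                                            "FROM models"):
        dst.execute(
            "INSERT INTO structuralmodel (accession, structure) VALUES (?, ?)",
            (accession, structure))

    dst.commit()

    # Creating the index after the load is cheaper than maintaining it
    # while inserting
    dst.execute("CREATE INDEX i_structuralmodel ON structuralmodel (accession)")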
Example 2
def _export_alns(pfam_url: str, dt: DirectoryTree, buffer_size: int = 1000):
    logger.info("processing Pfam alignments")
    df = DumpFile(dt.mktemp(), compress=True)
    cnt = 0

    iterator = pfam.get_alignments(pfam_url)
    for entry_acc, aln_type, aln_bytes, count in iterator:
        df.dump((entry_acc, f"alignment:{aln_type}", aln_bytes,
                 "application/gzip", count))

        cnt += 1
        if cnt == buffer_size:
            df.close()
            yield df.path
            df = DumpFile(dt.mktemp(), compress=True)
            cnt = 0

    df.close()
    yield df.path
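
_export_alns buffers records into temporary dump files and yields a file path
every buffer_size records (the last file may hold fewer). Assuming DumpFile
roughly means "serialize tuples to a temporary, possibly compressed, file", the
pattern can be sketched with the standard library alone; tempfile stands in for
DirectoryTree.mktemp().

import pickle
import tempfile
from typing import Iterable, Iterator

def dump_in_chunks(records: Iterable, buffer_size: int = 1000) -> Iterator[str]:
    # Pickle records into temporary files, yielding each file path once it
    # holds `buffer_size` records; the final, possibly shorter, file is
    # yielded after the loop
    fh = tempfile.NamedTemporaryFile("wb", delete=False)
    cnt = 0
    for record in records:
        pickle.dump(record, fh)
        cnt += 1
        if cnt == buffer_size:
            fh.close()
            yield fh.name
            fh = tempfile.NamedTemporaryFile("wb", delete=False)
            cnt = 0

    fh.close()
    yield fh.name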
Example 3
def dump_xrefs(xrefs: dict, taxonomy: dict, output: str):
    # Init all taxa
    final_xrefs = {}
    for taxon_id in taxonomy:
        final_xrefs[taxon_id] = init_xrefs()

    while xrefs:
        taxon_id, taxon_xrefs = xrefs.popitem()

        for node_id in taxonomy[taxon_id]["lineage"]:
            deepupdate(taxon_xrefs, final_xrefs[node_id], replace=False)

    with DumpFile(output, compress=True) as f:
        for taxon_id in sorted(final_xrefs):
            f.dump((taxon_id, final_xrefs[taxon_id]))
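
dump_xrefs relies on deepupdate (not shown here) to propagate each taxon's
cross-references to every node of its lineage. A stand-alone sketch of that
propagation, under the assumption that merging means "union sets, sum numbers,
recurse into nested dicts"; the taxon IDs and keys are illustrative.

def deep_merge(src: dict, dst: dict):
    for key, value in src.items():
        if isinstance(value, dict):
            deep_merge(value, dst.setdefault(key, {}))
        elif isinstance(value, set):
            dst[key] = dst.get(key, set()) | value
        else:
            dst[key] = dst.get(key, 0) + value

taxonomy = {
    "1": {"lineage": ["1"]},
    "2": {"lineage": ["1", "2"]},  # "2" is a child of "1"
}
xrefs = {"2": {"proteins": 5, "structures": {"1abc"}}}
final_xrefs = {taxon_id: {} for taxon_id in taxonomy}

for taxon_id, taxon_xrefs in xrefs.items():
    for node_id in taxonomy[taxon_id]["lineage"]:
        deep_merge(taxon_xrefs, final_xrefs[node_id])

# final_xrefs["1"] == final_xrefs["2"] == {"proteins": 5, "structures": {"1abc"}}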
Example 4
def _insert(url: str, queue: Queue):
    for path in iter(queue.get, None):
        with DumpFile(path) as df:
            con = MySQLdb.connect(**url2dict(url))
            cur = con.cursor()

            for acc, anntype, value, mime, count in df:
                cur.execute(
                    """
                        INSERT INTO webfront_entryannotation (
                          accession, type, value, mime_type, num_sequences
                        )
                        VALUES (%s, %s, %s, %s, %s)
                    """, (acc, anntype, value, mime, count))

            con.commit()
            cur.close()
            con.close()
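
The worker above consumes file paths with iter(queue.get, None), which keeps
calling queue.get() until it returns the None sentinel pushed by the producer.
A minimal, runnable illustration of that shutdown pattern (the paths are made
up).

from multiprocessing import Process, Queue

def consumer(queue: Queue):
    # Blocks on queue.get() and stops as soon as the None sentinel arrives
    for path in iter(queue.get, None):
        print(f"processing {path}")

if __name__ == "__main__":
    queue = Queue()
    worker = Process(target=consumer, args=(queue,))
    worker.start()

    for path in ["/tmp/a.dump", "/tmp/b.dump"]:
        queue.put(path)

    queue.put(None)  # one sentinel per consumer
    worker.join()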
Example 5
def export_interpro(url: str,
                    p_entries: str,
                    p_entry2xrefs: str,
                    p_interpro2taxonomy: str,
                    outdir: str,
                    tmpdir: Optional[str] = None):
    shutil.copy(os.path.join(os.path.dirname(__file__), "interpro.dtd"),
                outdir)

    logger.info("loading entries")
    entries = loadobj(p_entries)
    interpro_entries = []
    deleted_entries = []
    for e in entries.values():
        if e.database != "interpro":
            continue
        elif e.is_deleted:
            deleted_entries.append(e.accession)
        else:
            interpro_entries.append(e.accession)

    logger.info("creating entry-taxon database")
    fd, taxdb = mkstemp(dir=tmpdir)
    os.close(fd)
    os.remove(taxdb)
    with DumpFile(p_interpro2taxonomy) as interpro2taxonomy:
        with KVdb(taxdb, writeback=True) as kvdb:
            i = 0
            for entry_acc, taxon_id, counts in interpro2taxonomy:
                kvdb[f"{entry_acc}-{taxon_id}"] = str(counts)

                i += 1
                if not i % 1000000:
                    kvdb.sync()

    logger.info("loading protein counts")
    con = MySQLdb.connect(**url2dict(url), charset="utf8mb4")
    cur = MySQLdb.cursors.SSCursor(con)
    cur.execute("""
        SELECT accession, counts
        FROM webfront_entry
        """)
    num_proteins = {}
    for entry_acc, counts in cur:
        num_proteins[entry_acc] = str(json.loads(counts)["proteins"])

    output = os.path.join(outdir, "interpro.xml.gz")
    with gzip.open(output, "wt", encoding="utf-8") as fh:
        fh.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        fh.write('<!DOCTYPE interprodb SYSTEM "interpro.dtd">\n')
        fh.write("<interprodb>\n")

        doc = getDOMImplementation().createDocument(None, None, None)

        # writing <release> section (do not log progress, < 1 sec)
        elem = doc.createElement("release")
        databases = {}
        cur.execute("""
            SELECT name, name_alt, type, num_entries, version, release_date
            FROM webfront_database
            ORDER BY name_long
            """)

        for name, name_alt, db_type, entry_count, version, date in cur:
            databases[name] = name_alt
            if db_type in ("entry", "protein"):
                dbinfo = doc.createElement("dbinfo")
                dbinfo.setAttribute("version", version)
                dbinfo.setAttribute("dbname", name_alt)
                dbinfo.setAttribute("entry_count", str(entry_count))
                dbinfo.setAttribute("file_date",
                                    date.strftime("%d-%b-%y").upper())
                elem.appendChild(dbinfo)

        elem.writexml(fh, addindent="  ", newl="\n")

        logger.info("loading taxonomic data")
        key_species = {
            "3702",  # Arabidopsis thaliana
            "6239",  # Caenorhabditis elegans
            "7955",  # Danio rerio
            "7227",  # Drosophila melanogaster
            "9606",  # H**o sapiens
            "10090",  # Mus musculus
            "367110",  # Neurospora crassa
            "10116",  # Rattus norvegicus
            "559292",  # Saccharomyces cerevisiae
            "284812",  # Schizosaccharomyces pombe
            "4577",  # Zea mays
        }
        superkingdoms = {
            "Archaea": None,
            "Bacteria": None,
            "Eukaryota": None,
            "Viruses": None
        }
        cur.execute("""
            SELECT accession, scientific_name, full_name, lineage
            FROM webfront_taxonomy
            """)
        taxa = {}
        for tax_id, sci_name, full_name, lineage in cur:
            """
            lineage stored as a string with heading/leading whitespaces,
            and a whitespace between taxa
            """
            taxa[tax_id] = (full_name, lineage.strip().split())

            if sci_name in superkingdoms:
                superkingdoms[sci_name] = tax_id

        cur.close()
        con.close()

        # Raise if a superkingdom is not in the table
        for sci_name, tax_id in superkingdoms.items():
            if tax_id is None:
                raise ValueError(f"{sci_name}: missing taxon ID")

        superkingdoms = set(superkingdoms.values())

        logger.info("writing entries")
        with DumpFile(p_entry2xrefs) as entry2xrefs, KVdb(taxdb) as kvdb:
            for entry_acc, xrefs in entry2xrefs:
                entry = entries[entry_acc]
                if entry.database != "interpro" or entry.is_deleted:
                    continue

                elem = doc.createElement("interpro")
                elem.setAttribute("id", entry.accession)
                elem.setAttribute("protein_count", num_proteins[entry_acc])
                elem.setAttribute("short_name", entry.short_name)
                elem.setAttribute("type", entry.type)

                name = doc.createElement("name")
                name.appendChild(doc.createTextNode(entry.name))
                elem.appendChild(name)

                text = _restore_abstract('\n'.join(entry.description))
                try:
                    _doc = parseString(f"<abstract>{text}</abstract>")
                except ExpatError as exc:
                    # TODO: use CDATA section for all entries
                    logger.warning(f"{entry_acc}: {exc}")
                    # abstract = doc.createElement("abstract")
                    # abstract.appendChild(doc.createCDATASection(text))
                else:
                    abstract = _doc.documentElement
                    elem.appendChild(abstract)

                if entry.go_terms:
                    go_list = doc.createElement("class_list")

                    for term in entry.go_terms:
                        go_elem = doc.createElement("classification")
                        go_elem.setAttribute("id", term["identifier"])
                        go_elem.setAttribute("class_type", "GO")

                        _elem = doc.createElement("category")
                        _elem.appendChild(
                            doc.createTextNode(term["category"]["name"]))
                        go_elem.appendChild(_elem)

                        _elem = doc.createElement("description")
                        _elem.appendChild(doc.createTextNode(term["name"]))
                        go_elem.appendChild(_elem)

                        go_list.appendChild(go_elem)

                    elem.appendChild(go_list)

                if entry.literature:
                    pub_list = doc.createElement("pub_list")
                    for pub_id in sorted(entry.literature):
                        pub = entry.literature[pub_id]

                        pub_elem = doc.createElement("publication")
                        pub_elem.setAttribute("id", pub_id)

                        _elem = doc.createElement("author_list")
                        if pub["authors"]:
                            _elem.appendChild(
                                doc.createTextNode(", ".join(pub['authors'])))
                        else:
                            _elem.appendChild(doc.createTextNode("Unknown"))
                        pub_elem.appendChild(_elem)

                        if pub["title"]:
                            _elem = doc.createElement("title")
                            _elem.appendChild(doc.createTextNode(pub["title"]))
                            pub_elem.appendChild(_elem)

                        if pub["URL"]:
                            _elem = doc.createElement("url")
                            _elem.appendChild(doc.createTextNode(pub["URL"]))
                            pub_elem.appendChild(_elem)

                        _elem = doc.createElement("db_xref")
                        if pub["PMID"]:
                            _elem.setAttribute("db", "PUBMED")
                            _elem.setAttribute("dbkey", str(pub["PMID"]))
                        else:
                            _elem.setAttribute("db", "MEDLINE")
                            _elem.setAttribute("dbkey", "MEDLINE")
                        pub_elem.appendChild(_elem)

                        if pub["ISO_journal"]:
                            _elem = doc.createElement("journal")
                            _elem.appendChild(
                                doc.createTextNode(pub["ISO_journal"]))
                            pub_elem.appendChild(_elem)

                        if pub["ISBN"]:
                            _elem = doc.createElement("book_title")
                            isbn = f"ISBN:{pub['ISBN']}"
                            _elem.appendChild(doc.createTextNode(isbn))
                            pub_elem.appendChild(_elem)

                        if pub["volume"] or pub["issue"] or pub["raw_pages"]:
                            _elem = doc.createElement("location")
                            if pub["volume"]:
                                _elem.setAttribute("volume", pub["volume"])

                            if pub["issue"]:
                                _elem.setAttribute("issue", pub["issue"])

                            if pub["raw_pages"]:
                                _elem.setAttribute("pages", pub["raw_pages"])

                            pub_elem.appendChild(_elem)

                        if pub["year"]:
                            _elem = doc.createElement("year")
                            _elem.appendChild(
                                doc.createTextNode(str(pub["year"])))
                            pub_elem.appendChild(_elem)

                        pub_list.appendChild(pub_elem)

                    elem.appendChild(pub_list)

                parent, children = entry.relations
                if parent:
                    par_elem = doc.createElement("parent_list")
                    _elem = doc.createElement("rel_ref")
                    _elem.setAttribute("ipr_ref", parent)
                    par_elem.appendChild(_elem)
                    elem.appendChild(par_elem)

                if children:
                    child_list = doc.createElement("child_list")
                    for child in children:
                        _elem = doc.createElement("rel_ref")
                        _elem.setAttribute("ipr_ref", child)
                        child_list.appendChild(_elem)

                    elem.appendChild(child_list)

                members = []
                for database, signatures in entry.integrates.items():
                    for signature_acc in signatures:
                        members.append((
                            signature_acc,
                            entries[signature_acc].short_name,
                            database,
                            num_proteins[signature_acc],
                        ))

                mem_list = doc.createElement("member_list")
                for member in sorted(members):
                    _elem = doc.createElement("db_xref")
                    _elem.setAttribute("protein_count", member[3])
                    _elem.setAttribute("db", databases[member[2]])
                    _elem.setAttribute("dbkey", member[0])
                    _elem.setAttribute("name", member[1])
                    mem_list.appendChild(_elem)
                elem.appendChild(mem_list)

                # Merge cross-references and pathways
                cross_refs = {}
                for key, values in entry.cross_references.items():
                    cross_refs[databases[key]] = values

                for key, values in entry.pathways.items():
                    cross_refs[databases[key]] = [val["id"] for val in values]

                if cross_refs:
                    xref_list = doc.createElement("external_doc_list")
                    for ref_db in sorted(cross_refs):
                        for ref_id in sorted(cross_refs[ref_db]):
                            _elem = doc.createElement("db_xref")
                            _elem.setAttribute("db", ref_db)
                            _elem.setAttribute("dbkey", ref_id)
                            xref_list.appendChild(_elem)
                    elem.appendChild(xref_list)

                if xrefs["structures"]:
                    xref_list = doc.createElement("structure_db_links")
                    for pdb_id in sorted(xrefs["structures"]):
                        _elem = doc.createElement("db_xref")
                        _elem.setAttribute("db", "PDB")
                        _elem.setAttribute("dbkey", pdb_id)
                        xref_list.appendChild(_elem)
                    elem.appendChild(xref_list)

                # Find key species and taxonomic distribution
                entry_key_species = []
                entry_superkingdoms = {}
                for tax_id in xrefs["taxa"]:
                    full_name, lineage = taxa[tax_id]

                    if tax_id in key_species:
                        entry_key_species.append((full_name, tax_id))

                    # Find the superkingdom containing this taxon
                    for superkingdom_id in superkingdoms:
                        if superkingdom_id in lineage:
                            break
                    else:
                        continue

                    try:
                        other_lineage = entry_superkingdoms[superkingdom_id]
                    except KeyError:
                        entry_superkingdoms[superkingdom_id] = lineage
                    else:
                        # Compare lineages and find lowest common ancestor
                        i = 0
                        while i < len(lineage) and i < len(other_lineage):
                            if lineage[i] != other_lineage[i]:
                                break
                            i += 1

                        # Path to the lowest common ancestor
                        entry_superkingdoms[superkingdom_id] = lineage[:i]

                # Get lowest common ancestor for each represented superkingdom
                lowest_common_ancestors = []
                for lineage in entry_superkingdoms.values():
                    # Lowest common ancestor
                    tax_id = lineage[-1]
                    full_name, _ = taxa[tax_id]
                    lowest_common_ancestors.append((full_name, tax_id))

                # Write taxonomic distribution
                tax_dist = doc.createElement("taxonomy_distribution")
                for full_name, tax_id in sorted(lowest_common_ancestors):
                    _elem = doc.createElement("taxon_data")
                    _elem.setAttribute("name", full_name)
                    key = f"{entry_acc}-{tax_id}"
                    _elem.setAttribute("proteins_count", kvdb[key])
                    tax_dist.appendChild(_elem)
                elem.appendChild(tax_dist)

                if entry_key_species:
                    # Write key species
                    key_spec = doc.createElement("key_species")
                    for full_name, tax_id in sorted(entry_key_species):
                        _elem = doc.createElement("taxon_data")
                        _elem.setAttribute("name", full_name)
                        key = f"{entry_acc}-{tax_id}"
                        _elem.setAttribute("proteins_count", kvdb[key])
                        key_spec.appendChild(_elem)
                    elem.appendChild(key_spec)

                elem.writexml(fh, addindent="  ", newl="\n")

        if deleted_entries:
            block = doc.createElement("deleted_entries")
            for entry_acc in sorted(deleted_entries):
                elem = doc.createElement("del_ref")
                elem.setAttribute("id", entry_acc)
                block.appendChild(elem)

            block.writexml(fh, addindent="  ", newl="\n")

        fh.write("</interprodb>\n")

    logger.info(f"temporary file: {os.path.getsize(taxdb)/1024/1024:,.0f} MB")
    os.remove(taxdb)
    logger.info("complete")
Example 6
def insert_entries(pfam_url: str, stg_url: str, p_entries: str,
                   p_entry2xrefs: str):
    logger.info("fetching Wikipedia data for Pfam entries")
    wiki = pfam.get_wiki(pfam_url)

    logger.info("loading Pfam curation/family details")
    pfam_details = pfam.get_details(pfam_url)

    logger.info("populating webfront_entry")
    entries = loadobj(p_entries)
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_entry")
    cur.execute("""
        CREATE TABLE webfront_entry
        (
            entry_id VARCHAR(10) DEFAULT NULL,
            accession VARCHAR(25) PRIMARY KEY NOT NULL,
            type VARCHAR(50) NOT NULL,
            name LONGTEXT,
            short_name VARCHAR(100),
            source_database VARCHAR(10) NOT NULL,
            member_databases LONGTEXT,
            integrated_id VARCHAR(25),
            go_terms LONGTEXT,
            description LONGTEXT,
            wikipedia LONGTEXT,
            details LONGTEXT,
            literature LONGTEXT,
            hierarchy LONGTEXT,
            cross_references LONGTEXT,
            interactions LONGTEXT,
            pathways LONGTEXT,
            overlaps_with LONGTEXT,
            is_featured TINYINT NOT NULL,
            is_alive TINYINT NOT NULL,
            history LONGTEXT,
            entry_date DATETIME NOT NULL,
            deletion_date DATETIME,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)

    # Count number of structural models per entry
    cur.execute("""
        SELECT accession, COUNT(*)
        FROM webfront_structuralmodel
        GROUP BY accession
        """)
    num_struct_models = dict(cur.fetchall())
    cur.close()

    sql = """
        INSERT INTO webfront_entry
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
          %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """

    with Table(con, sql) as table:
        with DumpFile(p_entry2xrefs) as df:
            for accession, xrefs in df:
                entry = entries[accession]
                counts = reduce(xrefs)
                counts.update({
                    "interactions": len(entry.ppi),
                    "pathways": sum(len(v) for v in entry.pathways.values()),
                    "sets": 1 if entry.clan else 0,
                    "structural_models": num_struct_models.get(accession, 0)
                })

                table.insert(
                    (None, accession, entry.type.lower(),
                     entry.name, entry.short_name, entry.database,
                     jsonify(entry.integrates), entry.integrated_in,
                     jsonify(entry.go_terms), jsonify(entry.description),
                     jsonify(wiki.get(accession)),
                     jsonify(pfam_details.get(accession)),
                     jsonify(entry.literature), jsonify(entry.hierarchy),
                     jsonify(entry.cross_references), jsonify(entry.ppi),
                     jsonify(entry.pathways), jsonify(entry.overlaps_with), 0,
                     0 if entry.is_deleted else 1, jsonify(entry.history),
                     entry.creation_date, entry.deletion_date,
                     jsonify(counts)))

    con.commit()
    con.close()
    logger.info("complete")
Example 7
def export_residues(url: str, dt: DirectoryTree) -> List[str]:
    files = []

    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("""
        SELECT S.PROTEIN_AC, S.METHOD_AC, M.NAME, LOWER(D.DBSHORT),
               S.DESCRIPTION, S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END
        FROM INTERPRO.SITE_MATCH S
        INNER JOIN INTERPRO.CV_DATABASE D ON S.DBCODE = D.DBCODE
        LEFT OUTER JOIN INTERPRO.METHOD M ON S.METHOD_AC = M.METHOD_AC  
        """)

    i = 0
    proteins = {}
    for row in cur:
        protein_acc = row[0]
        signature_acc = row[1]
        signature_name = row[2]
        database = row[3]
        description = row[4]
        residue = row[5]
        pos_start = row[6]
        pos_end = row[7]

        try:
            entries = proteins[protein_acc]
        except KeyError:
            entries = proteins[protein_acc] = {}

        try:
            entry = entries[signature_acc]
        except KeyError:
            entry = entries[signature_acc] = {
                "name": signature_name,
                "database": database,
                "descriptions": {}
            }

        try:
            fragments = entry["descriptions"][description]
        except KeyError:
            fragments = entry["descriptions"][description] = []

        fragments.append((residue, pos_start, pos_end))
        i += 1
        if not i % 1000000:
            files.append(dt.mktemp())
            with DumpFile(files[-1], compress=True) as df:
                for protein_acc in sorted(proteins):
                    df.dump((protein_acc, proteins[protein_acc]))

            proteins = {}

            if not i % 100000000:
                logger.info(f"{i:>15,}")

    logger.info(f"{i:>15,}")
    cur.close()
    con.close()

    files.append(dt.mktemp())
    with DumpFile(files[-1], compress=True) as df:
        for protein_acc in sorted(proteins):
            df.dump((protein_acc, proteins[protein_acc]))

    return files
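
The chained try/except KeyError blocks above build the nested
proteins -> signatures -> descriptions structure; try/except is presumably
preferred because it avoids constructing the default value on every row when
the key already exists. For comparison, the same grouping written with
dict.setdefault (the function and argument names are illustrative).

def add_residue(proteins: dict, protein_acc: str, signature_acc: str,
                name: str, database: str, description: str, fragment: tuple):
    # setdefault() creates the nested containers on first sight and
    # reuses them afterwards
    entry = proteins.setdefault(protein_acc, {}).setdefault(signature_acc, {
        "name": name,
        "database": database,
        "descriptions": {}
    })
    entry["descriptions"].setdefault(description, []).append(fragment)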
Example 8
def _export_hmms(p_uniprot2matches: str,
                 pro_url: str,
                 dt: DirectoryTree,
                 buffer_size: int = 1000):
    logger.info("counting hits per model")
    signatures = {}
    with Store(p_uniprot2matches) as u2matches:
        cnt = 0
        for entries in u2matches.values():
            for entry_acc, locations in entries.items():
                for loc in locations:
                    if loc["model"] is None:
                        continue  # InterPro entries

                    try:
                        models = signatures[entry_acc]
                    except KeyError:
                        models = signatures[entry_acc] = {}

                    try:
                        models[loc["model"]] += 1
                    except KeyError:
                        models[loc["model"]] = 1

            cnt += 1
            if not cnt % 10000000:
                logger.info(f"{cnt:>12,}")

        logger.info(f"{cnt:>12,}")

    for entry_acc, models in signatures.items():
        # Select the model with the most hits
        model_acc = sorted(models, key=lambda k: (-models[k], k))[0]
        signatures[entry_acc] = model_acc

    logger.info("processing models")
    df = DumpFile(dt.mktemp(), compress=True)
    cnt = 0
    ignored = 0

    iterator = ippro.get_hmms(pro_url, multi_models=True)
    for entry_acc, model_acc, hmm_bytes in iterator:
        try:
            representative_model = signatures[entry_acc]
        except KeyError:
            # Signature without matches, i.e. without representative model
            ignored += 1
            continue

        if model_acc and model_acc != representative_model:
            continue

        hmm_str = gzip.decompress(hmm_bytes).decode("utf-8")
        df.dump((entry_acc, "hmm", hmm_bytes, "application/gzip", None))

        with StringIO(hmm_str) as stream:
            hmm = hmmer.HMMFile(stream)

        df.dump((entry_acc, "logo",
                 json.dumps(hmm.logo("info_content_all",
                                     "hmm")), "application/json", None))

        cnt += 2
        if cnt >= buffer_size:
            df.close()
            yield df.path
            df = DumpFile(dt.mktemp(), compress=True)
            cnt = 0

    df.close()
    yield df.path

    logger.info(f"  {ignored} models ignored")
Example 9
def insert_taxonomy(p_entries: str,
                    p_proteins: str,
                    p_structures: str,
                    p_taxonomy: str,
                    p_uniprot2matches: str,
                    p_uniprot2proteome: str,
                    stg_url: str,
                    p_interpro2taxonomy: str,
                    tmpdir: Optional[str] = None):
    logger.info("preparing data")
    dt = DirectoryTree(tmpdir)
    entries = loadobj(p_entries)
    taxonomy = loadobj(p_taxonomy)
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    logger.info("starting")
    i = 0
    xrefs = {}
    files = []
    for uniprot_acc, info in proteins.items():
        taxon_id = info["taxid"]

        try:
            taxon = xrefs[taxon_id]
        except KeyError:
            taxon = xrefs[taxon_id] = init_xrefs()

        try:
            proteome_id = u2proteome[uniprot_acc]
        except KeyError:
            pass
        else:
            taxon["proteomes"].add(proteome_id)

        taxon["proteins"]["all"] += 1

        protein_structures = uniprot2pdbe.get(uniprot_acc, {})

        # Add structures to taxon, regardless of entry matches
        taxon["structures"]["all"] |= set(protein_structures.keys())

        databases = set()
        for entry_acc, locations in u2matches.get(uniprot_acc, {}).items():
            entry = entries[entry_acc]
            database = entry.database

            try:
                taxon["entries"][database].add(entry_acc)
            except KeyError:
                taxon["entries"][database] = {entry_acc}

            if database not in databases:
                # Counting the protein *once* per database
                databases.add(database)
                try:
                    taxon["proteins"]["databases"][database] += 1
                except KeyError:
                    taxon["proteins"]["databases"][database] = 1

            try:
                taxon["proteins"]["entries"][entry_acc] += 1
            except KeyError:
                taxon["proteins"]["entries"][entry_acc] = 1

            for pdb_id, chains in protein_structures.items():
                for chain_id, segments in chains.items():
                    if overlaps_pdb_chain(locations, segments):
                        try:
                            taxon["structures"]["entries"][entry_acc].add(
                                pdb_id)
                        except KeyError:
                            taxon["structures"]["entries"][entry_acc] = {
                                pdb_id
                            }

                        break  # Skip other chains

        i += 1
        if not i % 1000000:
            output = dt.mktemp()
            dump_xrefs(xrefs, taxonomy, output)
            files.append(output)
            xrefs = {}

            if not i % 10000000:
                logger.info(f"{i:>12,}")

    if xrefs:
        output = dt.mktemp()
        dump_xrefs(xrefs, taxonomy, output)
        files.append(output)
        xrefs = {}

    logger.info(f"{i:>12,}")
    logger.info(f"temporary files: "
                f"{sum(map(os.path.getsize, files))/1024/1024:.0f} MB")

    proteins.close()
    u2matches.close()
    u2proteome.close()

    logger.info("populating taxonomy tables")
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomy")
    cur.execute("""
        CREATE TABLE webfront_taxonomy
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            scientific_name VARCHAR(255) NOT NULL,
            full_name VARCHAR(512) NOT NULL,
            lineage LONGTEXT NOT NULL,
            parent_id VARCHAR(20),
            rank VARCHAR(20) NOT NULL,
            children LONGTEXT,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentry")
    cur.execute("""
        CREATE TABLE webfront_taxonomyperentry
        (
          id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
          tax_id VARCHAR(20) NOT NULL,
          entry_acc VARCHAR(25) NOT NULL,
          counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentrydb")
    cur.execute("""
        CREATE TABLE webfront_taxonomyperentrydb
        (
          id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
          tax_id VARCHAR(20) NOT NULL,
          source_database VARCHAR(10) NOT NULL,
          counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    table = Table(con, query="""
        INSERT INTO webfront_taxonomy
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """)
    per_entry = Table(con, query="""
        INSERT INTO webfront_taxonomyperentry (tax_id, entry_acc, counts)
        VALUES (%s, %s, %s)
    """)
    per_database = Table(con, query="""
        INSERT INTO webfront_taxonomyperentrydb (tax_id, source_database, counts)
        VALUES (%s, %s, %s)
    """)

    with DumpFile(p_interpro2taxonomy, compress=True) as interpro2taxonomy:
        interpro_entries = {
            entry.accession
            for entry in entries.values()
            if entry.database == "interpro" and not entry.is_deleted
        }

        i = 0
        for taxon_id, taxon_xrefs in merge_dumps(files):
            taxon = taxonomy[taxon_id]

            protein_counts = taxon_xrefs.pop("proteins")
            structure_counts = taxon_xrefs.pop("structures")
            counts = reduce(taxon_xrefs)

            # Add total protein count (not grouped by database/entry)
            counts["proteins"] = protein_counts["all"]

            # Add total structure count
            counts["structures"] = len(structure_counts["all"])

            # Add total entry count (not grouped by database)
            counts["entries"]["total"] = sum(counts["entries"].values())

            table.insert(
                (taxon_id, taxon["sci_name"], taxon["full_name"],
                 f" {' '.join(taxon['lineage'])} ", taxon["parent"],
                 taxon["rank"], jsonify(taxon["children"]), jsonify(counts)))

            # Remove the 'entries' property
            # (not needed for webfront_taxonomyperentry)
            entry_counts = counts.pop("entries")

            database_structures = {}
            for entry_acc, count in protein_counts["entries"].items():
                if entry_acc in interpro_entries:
                    interpro2taxonomy.dump((entry_acc, taxon_id, count))

                counts["proteins"] = count

                try:
                    entry_structures = structure_counts["entries"][entry_acc]
                except KeyError:
                    counts["structures"] = 0
                else:
                    counts["structures"] = len(entry_structures)

                    database = entries[entry_acc].database
                    try:
                        database_structures[database] |= entry_structures
                    except KeyError:
                        database_structures[database] = entry_structures.copy()
                finally:
                    per_entry.insert((taxon_id, entry_acc, jsonify(counts)))

            for database, count in protein_counts["databases"].items():
                counts.update({
                    "entries": entry_counts[database],
                    "proteins": count,
                    "structures": len(database_structures.get(database, []))
                })
                per_database.insert((taxon_id, database, jsonify(counts)))

            i += 1
            if not i % 100000:
                logger.info(f"{i:>12,}")

        logger.info(f"{i:>12,}")

    table.close()
    per_entry.close()
    per_database.close()
    con.commit()

    dt.remove()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentry_tax
        ON webfront_taxonomyperentry (tax_id)
        """)
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentry_entry
        ON webfront_taxonomyperentry (entry_acc)
        """)
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentrydb_tax
        ON webfront_taxonomyperentrydb (tax_id)
        """)
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentrydb_database
        ON webfront_taxonomyperentrydb (source_database)
        """)
    cur.close()
    con.close()
    logger.info("complete")
Example 10
def insert_clans(stg_url: str, p_alignments: str, p_clans: str, p_entries: str,
                 p_entry2xrefs: str, **kwargs):
    max_xrefs = kwargs.get("max_xrefs", 1000000)
    tmpdir = kwargs.get("tmpdir")

    logger.info("aggregating clan cross-references")
    dt = DirectoryTree(tmpdir)
    entry2clan = {}
    for entry_acc, entry in loadobj(p_entries).items():
        if entry.clan:
            entry2clan[entry_acc] = entry.clan["accession"]

    clans = {}
    files = []
    num_xrefs = 0
    with DumpFile(p_entry2xrefs) as df:
        for entry_acc, entry_xrefs in df:
            try:
                clan_acc = entry2clan[entry_acc]
            except KeyError:
                continue

            try:
                clan_xrefs = clans[clan_acc]
            except KeyError:
                clan_xrefs = clans[clan_acc] = {}

            # We do not need the number of matches
            del entry_xrefs["matches"]

            cnt_before = sum(map(len, clan_xrefs.values()))
            deepupdate(entry_xrefs, clan_xrefs)
            cnt_after = sum(map(len, clan_xrefs.values()))
            num_xrefs += cnt_after - cnt_before

            if num_xrefs >= max_xrefs:
                file = dt.mktemp()
                with DumpFile(file, compress=True) as df2:
                    for clan_acc in sorted(clans):
                        df2.dump((clan_acc, clans[clan_acc]))

                files.append(file)
                clans = {}
                num_xrefs = 0

    file = dt.mktemp()
    with DumpFile(file, compress=True) as df2:
        for clan_acc in sorted(clans):
            df2.dump((clan_acc, clans[clan_acc]))

    files.append(file)

    logger.info("inserting clans")
    clans = loadobj(p_clans)
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_set")
    cur.execute("""
        CREATE TABLE webfront_set
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            name VARCHAR(400),
            description TEXT,
            source_database VARCHAR(10) NOT NULL,
            relationships LONGTEXT NOT NULL,
            authors TEXT,
            literature TEXT,
            counts LONGTEXT DEFAULT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_set
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """
    with Table(con, sql) as table:
        for clan_acc, xrefs in merge_dumps(files):
            clan = clans[clan_acc]
            counts = reduce(xrefs)
            counts["entries"] = {
                clan["database"]: len(clan["members"]),
                "total": len(clan["members"])
            }

            table.insert((
                clan_acc, clan["name"], clan["description"], clan["database"],
                jsonify(clan["relationships"], nullable=False),
                jsonify(clan.get("authors")), jsonify(clan.get("literature")),
                jsonify(counts)
            ))

    logger.info(f"temporary files: {dt.size / 1024 / 1024:.0f} MB")
    dt.remove()

    logger.info("inserting alignments")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_alignment")
    cur.execute("""
        CREATE TABLE webfront_alignment
        (
            id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            set_acc VARCHAR(20) NOT NULL,
            entry_acc VARCHAR(25) NOT NULL,
            target_acc VARCHAR(25) NOT NULL,
            target_set_acc VARCHAR(20),
            score DOUBLE NOT NULL,
            seq_length MEDIUMINT NOT NULL,
            domains TEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_alignment (
            set_acc, entry_acc, target_acc, target_set_acc, score, 
            seq_length, domains
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
    with DumpFile(p_alignments) as df, Table(con, sql) as table:
        for alignments in df:
            for aln in alignments:
                table.insert(aln)

    con.commit()
    con.close()

    logger.info("complete")
Example 11
def export_clans(ipr_url: str, pfam_url: str, p_clans: str, p_alignments: str,
                 **kwargs):
    buffer_size = kwargs.get("buffer_size", 1000000)
    threshold = kwargs.get("threshold", 1e-2)

    logger.info("loading clans")
    con = cx_Oracle.connect(ipr_url)
    cur = con.cursor()
    clans = get_clans(cur)

    clan_links = {}
    entry2clan = {}
    for accession, clan in clans.items():
        clan_links[accession] = {}
        for member_acc, score, seq_length in clan["members"]:
            entry2clan[member_acc] = (accession, seq_length)

    logger.info("exporting alignments")
    with DumpFile(p_alignments, compress=True) as df:
        i = 0
        alignments = []
        for query_acc, target_acc, evalue, domains in iter_alignments(cur):
            i += 1
            if not i % 10000000:
                logger.info(f"{i:>12,}")

            try:
                query_clan_acc, seq_length = entry2clan[query_acc]
            except KeyError:
                continue

            if evalue > threshold:
                continue

            try:
                target_clan_acc, _ = entry2clan[target_acc]
            except KeyError:
                target_clan_acc = None

            alignments.append((
                query_clan_acc,
                query_acc,
                target_acc,
                target_clan_acc,
                evalue,
                seq_length,
                json.dumps(domains)
            ))

            if len(alignments) == buffer_size:
                df.dump(alignments)
                alignments = []

            if query_clan_acc == target_clan_acc:
                # Query and target from the same clan: update the clan's links
                links = clan_links[query_clan_acc]

                if query_acc > target_acc:
                    query_acc, target_acc = target_acc, query_acc

                try:
                    targets = links[query_acc]
                except KeyError:
                    links[query_acc] = {target_acc: evalue}
                else:
                    if target_acc not in targets or evalue < targets[target_acc]:
                        targets[target_acc] = evalue

        df.dump(alignments)
        alignments = []
        logger.info(f"{i:>12,}")

    cur.close()
    con.close()

    logger.info("loading additional details for Pfam clans")
    pfam_clans = pfam.get_clans(pfam_url)

    logger.info("finalizing")
    for clan_acc, clan in clans.items():
        nodes = []
        for accession, score, seq_length in clan["members"]:
            nodes.append({
                "accession": accession,
                "type": "entry",
                "score": score
            })

        links = []
        for query_acc, targets in clan_links[clan_acc].items():
            for target_acc, score in targets.items():
                links.append({
                    "source": query_acc,
                    "target": target_acc,
                    "score": score
                })

        clan["relationships"] = {
            "nodes": nodes,
            "links": links
        }

        if clan_acc in pfam_clans:
            # Replace `description`, add `authors` and `literature`
            clan.update(pfam_clans[clan_acc])

    dumpobj(p_clans, clans)
    logger.info("complete")
Example 12
def export(url: str, p_entries: str, p_entry2xrefs: str, p_taxonomy: str,
           outdir: str, max_xrefs: int = 100000):
    logger.info("loading database versions")
    con = MySQLdb.connect(**url2dict(url))
    cur = con.cursor()
    cur.execute(
        """
        SELECT name, name_long, version, release_date
        FROM webfront_database
        WHERE type = 'entry'
        """
    )
    databases = {}
    release_version = release_date = None
    for name, full_name, version, date in cur:
        databases[name] = full_name

        if name == "interpro":
            release_version = version
            release_date = date.strftime("%Y-%m-%d")

    cur.close()
    con.close()

    if release_version is None:
        raise RuntimeError("missing release version/date for InterPro")

    logger.info("loading taxonomic info")
    sci_names = {}
    for taxon_id, taxon in loadobj(p_taxonomy).items():
        sci_names[taxon_id] = taxon["sci_name"]

    try:
        shutil.rmtree(outdir)
    except FileNotFoundError:
        pass
    finally:
        os.makedirs(outdir, mode=0o775)

    entries = loadobj(p_entries)

    logger.info("starting")
    i = 0
    types = {}
    num_xrefs = {}
    with DumpFile(p_entry2xrefs) as df:
        for accession, entry_xrefs in df:
            entry = entries[accession]
            if entry.is_deleted:
                continue

            fields, xrefs = _init_fields(entry)

            fields.append({
                "name": "source_database",
                "value": databases[entry.database]
            })

            for uniprot_acc, uniprot_id in entry_xrefs["proteins"]:
                xrefs.append({
                    "dbname": "UNIPROT",
                    "dbkey": uniprot_acc
                })

                xrefs.append({
                    "dbname": "UNIPROT",
                    "dbkey": uniprot_id
                })

            for tax_id in entry_xrefs["taxa"]:
                xrefs.append({
                    "dbname": "TAXONOMY",
                    "dbkey": tax_id
                })

                xrefs.append({
                    "dbname": "TAXONOMY",
                    "dbkey": sci_names[tax_id]
                })

            for upid in entry_xrefs["proteomes"]:
                xrefs.append({
                    "dbname": "PROTEOMES",
                    "dbkey": upid
                })

            for pdbe_id in entry_xrefs["structures"]:
                xrefs.append({
                    "dbname": "PDB",
                    "dbkey": pdbe_id
                })

            entry_type = entry.type.lower()
            try:
                dt, items = types[entry_type]
            except KeyError:
                dt = DirectoryTree(outdir, entry_type)
                items = []
                types[entry_type] = (dt, items)
                num_xrefs[entry_type] = 0

            items.append({
                "fields": fields,
                "cross_references": xrefs
            })
            num_xrefs[entry_type] += len(xrefs)

            if num_xrefs[entry_type] >= max_xrefs:
                path = dt.mktemp(suffix=".json")
                with open(path, "wt") as fh:
                    json.dump({
                        "name": "InterPro",
                        "release": release_version,
                        "release_date": release_date,
                        "entry_count": len(items),
                        "entries": items
                    }, fh, indent=4)

                items.clear()
                num_xrefs[entry_type] = 0

            i += 1
            if not i % 10000:
                logger.info(f"{i:>12,}")

    logger.info(f"{i:>12,}")

    for entry_type, (dt, items) in types.items():
        if num_xrefs[entry_type]:
            path = dt.mktemp(suffix=".json")
            with open(path, "wt") as fh:
                json.dump({
                    "name": "InterPro",
                    "release": release_version,
                    "release_date": release_date,
                    "entry_count": len(items),
                    "entries": items
                }, fh, indent=4)

    logger.info("complete")
Example 13
def export_entries(url: str, p_metacyc: str, p_clans: str,
                   p_proteins: str, p_structures: str,
                   p_uniprot2matches: str, p_uniprot2proteome: str,
                   p_uniprot2ida: str, p_entry2xrefs: str, p_entries: str,
                   **kwargs):
    min_overlap = kwargs.get("overlap", 0.2)
    processes = kwargs.get("processes", 1)
    min_similarity = kwargs.get("similarity", 0.75)
    tmpdir = kwargs.get("tmpdir")

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    entries = {}
    logger.info("loading active InterPro entries")
    for entry in _get_interpro_entries(cur):
        entries[entry.accession] = entry

    logger.info("enriching entries with IntAct data")
    for accession, interactions in intact.get_interactions(cur).items():
        try:
            entry = entries[accession]
        except KeyError:
            continue
        else:
            entry.ppi = interactions

    logger.info("loading deleted InterPro entries")
    for entry in _get_retired_interpro_entries(cur):
        if entry.accession in entries:
            cur.close()
            con.close()
            raise RuntimeError(f"entry cannot be active "
                               f"and deleted {entry.accession}")

        entries[entry.accession] = entry

    logger.info("loading member database signatures")
    for entry in _get_signatures(cur):
        if entry.integrated_in and entry.integrated_in not in entries:
            cur.close()
            con.close()
            raise RuntimeError(f"{entry.accession} integrated "
                               f"in missing entry ({entry.integrated_in})")

        entries[entry.accession] = entry

    logger.info("loading past entry names")
    past_names = _get_name_history(cur)

    logger.info("loading past signature integrations")
    past_integrations = _get_integration_history(cur)

    logger.info("loading ENZYME")
    u2enzyme = uniprot.get_swissprot2enzyme(cur)

    logger.info("loading Reactome pathways")
    u2reactome = uniprot.get_swissprot2reactome(cur)
    cur.close()
    con.close()

    logger.info("loading MetaCyc pathways")
    ec2metacyc = metacyc.get_ec2pathways(p_metacyc)

    # Updating entry history
    for entry in entries.values():
        try:
            names = past_names[entry.accession]
        except KeyError:
            pass
        else:
            entry.history["names"] = names

        try:
            signatures = past_integrations[entry.accession]
        except KeyError:
            pass
        else:
            entry.history["signatures"] = signatures

    # Updating entry clan info
    for clan in loadobj(p_clans).values():
        for entry_acc, score, seq_length in clan["members"]:
            try:
                entry = entries[entry_acc]
            except KeyError:
                continue
            else:
                entry.clan = {
                    "accession": clan["accession"],
                    "name": clan["name"]
                }

    inqueue = Queue(maxsize=processes)
    outqueue = Queue()
    workers = []
    for _ in range(max(1, processes - 1)):
        dt = DirectoryTree(tmpdir)
        p = Process(target=_process_proteins,
                    args=(inqueue, entries, min_overlap, dt, outqueue))
        p.start()
        workers.append((p, dt))

    logger.info("processing")
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)
    i = 0
    for uniprot_acc, matches in u2matches.items():
        inqueue.put((
            uniprot_acc,
            proteins[uniprot_acc],
            matches,
            u2proteome.get(uniprot_acc),
            uniprot2pdbe.get(uniprot_acc, {}),
            set(u2enzyme.get(uniprot_acc, [])),
            set(u2reactome.get(uniprot_acc, []))
        ))

        i += 1
        if not i % 10000000:
            logger.info(f"{i:>15,}")

    proteins.close()
    u2matches.close()
    u2proteome.close()
    logger.info(f"{i:>15,}")

    # Send sentinel
    for _ in workers:
        inqueue.put(None)

    # Merge results from workers
    logger.info("exporting domain architectures")
    entries_with_xrefs = set()
    xref_files = []
    entry_counts = {}
    entry_intersections = {}
    interpro2enzyme = {}
    interpro2reactome = {}
    with Store(p_uniprot2ida, u2matches.get_keys(), tmpdir) as u2ida:
        for _ in workers:
            obj = outqueue.get()
            xref_files.append(obj[0])                               # str
            entries_with_xrefs |= obj[1]                            # set
            ida_file = obj[2]                                       # str
            deepupdate(obj[3], entry_counts, replace=False)         # dict
            deepupdate(obj[4], entry_intersections, replace=False)  # dict
            deepupdate(obj[5], interpro2enzyme)                     # dict
            deepupdate(obj[6], interpro2reactome)                   # dict

            with DumpFile(ida_file) as df:
                i = 0
                for uniprot_acc, dom_members, dom_str, dom_id in df:
                    u2ida[uniprot_acc] = (
                        dom_members,
                        dom_str,
                        dom_id
                    )
                    i += 1

                    if not i % 1000:
                        u2ida.sync()

            u2ida.sync()

        size = u2ida.merge(processes=processes)

    # Adding empty EntryXrefs objects for entries without xrefs
    xref_files.append(workers[0][1].mktemp())
    with DumpFile(xref_files[-1], compress=True) as df:
        for entry_acc in sorted(set(entries.keys()) - entries_with_xrefs):
            df.dump((entry_acc, EntryXrefs().asdict()))

    logger.info("exporting cross-references")
    with DumpFile(p_entry2xrefs, compress=True) as df:
        for entry_acc, xrefs in merge_dumps(xref_files):
            df.dump((entry_acc, xrefs))

            entry = entries[entry_acc]

            # Reactome pathways
            if entry_acc in interpro2reactome:
                pathways = interpro2reactome[entry_acc]
                entry.pathways["reactome"] = [
                    dict(zip(("id", "name"), pthw))
                    for pthw in sorted(pathways)
                ]

            # EC numbers
            if entry_acc in interpro2enzyme:
                ecnos = sorted(interpro2enzyme[entry_acc])
                entry.cross_references["ec"] = ecnos

                # MetaCyc pathways
                pathways = set()
                for ecno in ecnos:
                    pathways |= set(ec2metacyc.get(ecno, []))

                if pathways:
                    entry.pathways["metacyc"] = [
                        dict(zip(("id", "name"), pthw))
                        for pthw in sorted(pathways)
                    ]

    for p, dt in workers:
        size += dt.size
        dt.remove()

    logger.info(f"temporary files: {size / 1024 / 1024:.0f} MB")

    logger.info("calculating overlapping relationships")
    supfam = "homologous_superfamily"
    types = (supfam, "domain", "family", "repeat")
    for entry_acc, overlaps in entry_intersections.items():
        entry1 = entries[entry_acc]
        entry_cnt = entry_counts[entry_acc]
        type1 = entry1.type.lower()

        for other_acc, overlap_counts in overlaps.items():
            o1 = overlap_counts["1"]
            o2 = overlap_counts["2"]
            other_cnt = entry_counts[other_acc]

            # Independent coefficients
            coef1 = o1 / (entry_cnt + other_cnt - o1)
            coef2 = o2 / (entry_cnt + other_cnt - o2)

            # Final coefficient: average of independent coefficients
            coef = (coef1 + coef2) * 0.5

            # Containment indices
            c1 = o1 / entry_cnt
            c2 = o2 / other_cnt

            if all(item < min_similarity for item in (coef, c1, c2)):
                continue

            # Entries are similar enough
            entry2 = entries[other_acc]
            type2 = entry2.type.lower()
            if ((type1 == supfam and type2 in types)
                    or (type1 in types and type2 == supfam)):
                # e1 -> e2 relationship
                entry1.overlaps_with.append({
                    "accession": other_acc,
                    "name": entry2.name,
                    "type": type2
                })

                # e2 -> e1 relationship
                entry2.overlaps_with.append({
                    "accession": entry_acc,
                    "name": entry1.name,
                    "type": type1
                })

    dumpobj(p_entries, entries)

    logger.info("populating ENTRY2PATHWAY")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("TRUNCATE TABLE INTERPRO.ENTRY2PATHWAY")
    cur.close()
    sql = "INSERT INTO INTERPRO.ENTRY2PATHWAY VALUES (:1, :2, :3, :4)"
    with Table(con, sql) as table:
        for e in entries.values():
            for database, pathways in e.pathways.items():
                code = PATHWAY_DATABASE[database]
                for pthw in pathways:
                    table.insert((
                        e.accession,
                        code,
                        pthw["id"],
                        pthw["name"]
                    ))

    con.commit()
    con.close()
    logger.info("complete")
Ejemplo n.º 14
0
def _process_proteins(inqueue: Queue, entries: Mapping[str, Entry],
                      min_overlap: int, dt: DirectoryTree, outqueue: Queue):
    xrefs = {}                  # temporary dict accession -> EntryXrefs
    xref_files = []             # files of dumped cross-references
    entries_with_xrefs = set()  # accessions of entries having xrefs
    entry_counts = {}           # number of matched proteins per entry
    entry_intersections = {}    # overlapping-protein counts per entry pair
    interpro2enzyme = {}        # InterPro -> ENZYME (EC numbers) mapping
    interpro2reactome = {}      # InterPro -> Reactome pathways mapping

    ida_file = dt.mktemp()
    with DumpFile(ida_file, compress=True) as ida_df:
        i = 0
        for obj in iter(inqueue.get, None):
            uniprot_acc = obj[0]     # str
            protein_info = obj[1]    # dict
            matches = obj[2]         # dict
            proteome_id = obj[3]     # str or None
            pdb_entries = obj[4]     # dict
            enzymes = obj[5]         # set
            pathways = obj[6]        # set

            supermatches = []
            all_locations = []
            for entry_acc, locations in matches.items():
                entry = entries[entry_acc]
                if entry.database == "interpro":
                    # Adding EC / Reactome mapping

                    if enzymes:
                        try:
                            interpro2enzyme[entry_acc] |= enzymes
                        except KeyError:
                            interpro2enzyme[entry_acc] = enzymes.copy()

                    if pathways:
                        try:
                            interpro2reactome[entry_acc] |= pathways
                        except KeyError:
                            interpro2reactome[entry_acc] = pathways.copy()
                elif entry.database == "pfam":
                    # Storing matches for IDA
                    for loc in locations:
                        all_locations.append({
                            "pfam": entry_acc,
                            "interpro": entry.integrated_in,
                            # Fragmented locations are flattened into one
                            # span: first fragment's start, max fragment end
                            "start": loc["fragments"][0]["start"],
                            "end": max(f["end"] for f in loc["fragments"])
                        })

                # Adding cross-references (except IDA, still being calculated)
                try:
                    entry_xrefs = xrefs[entry_acc]
                except KeyError:
                    entry_xrefs = xrefs[entry_acc] = EntryXrefs()
                    entries_with_xrefs.add(entry_acc)

                entry_xrefs.matches += len(locations)
                entry_xrefs.proteins.add((
                    uniprot_acc,
                    protein_info["identifier"]
                ))

                if proteome_id:
                    entry_xrefs.proteomes.add(proteome_id)

                for pdb_id, chains in pdb_entries.items():
                    for chain_id, segments in chains.items():
                        if overlaps_pdb_chain(locations, segments):
                            entry_xrefs.structures.add(pdb_id)
                            break  # Skip other chains

                entry_xrefs.taxa.add(protein_info["taxid"])

                # Create a Supermatch for each integrated signature match
                if entry.integrated_in:
                    # Integrated member database signature
                    interpro_acc = entry.integrated_in
                    root = entries[interpro_acc].hierarchy["accession"]
                    for loc in locations:
                        sm = Supermatch(interpro_acc, loc["fragments"], root)
                        supermatches.append(sm)

            # Finishing IDA
            domains = []
            dom_members = set()
            for loc in sorted(all_locations, key=repr_fragment):
                if loc["interpro"]:
                    domains.append(f"{loc['pfam']}:{loc['interpro']}")
                    dom_members.add(loc["interpro"])
                else:
                    domains.append(loc["pfam"])

                dom_members.add(loc["pfam"])

            if domains:
                # Flush IDA
                dom_str = '-'.join(domains)
                dom_id = hashlib.sha1(dom_str.encode("utf-8")).hexdigest()
                ida_df.dump((uniprot_acc, dom_members, dom_str, dom_id))

                # Adding cross-references now
                for key in dom_members:
                    xrefs[key].ida.add(dom_id)

            # Merging overlapping supermatches
            merged = []
            for sm_to_merge in sorted(supermatches):
                for sm_merged in merged:
                    if sm_merged.overlaps(sm_to_merge, min_overlap):
                        """
                        Supermatches overlap
                            (sm_to_merge has been merged into sm_merged)
                        """
                        break
                else:
                    # sm_to_merge does not overlap with any other supermatches
                    merged.append(sm_to_merge)

            # Group by entry
            merged_grouped = {}
            for sm in merged:
                for interpro_acc in sm.entries:
                    try:
                        merged_grouped[interpro_acc] += sm.fragments
                    except KeyError:
                        merged_grouped[interpro_acc] = list(sm.fragments)

            # Evaluate how entries overlap
            for interpro_acc, fragments1 in merged_grouped.items():
                try:
                    entry_counts[interpro_acc] += 1
                except KeyError:
                    entry_counts[interpro_acc] = 1

                for other_acc, fragments2 in merged_grouped.items():
                    if other_acc >= interpro_acc:
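                        # Count each unordered pair only once (and skip self)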
                        continue

                    try:
                        obj = entry_intersections[interpro_acc]
                    except KeyError:
                        obj = entry_intersections[interpro_acc] = {}

                    try:
                        overlaps = obj[other_acc]
                    except KeyError:
                        """
                        Use a dict rather than a list (or tuple)
                        because deepupdate() would concatenate the lists
                        created by different workers
                        """
                        overlaps = obj[other_acc] = {
                            "1": 0,
                            "2": 0,
                        }

                    flag = 0
                    for f1 in fragments1:
                        start1 = f1["start"]
                        end1 = f1["end"]
                        length1 = end1 - start1 + 1

                        for f2 in fragments2:
                            start2 = f2["start"]
                            end2 = f2["end"]
                            length2 = end2 - start2 + 1
                            overlap = min(end1, end2) - max(start1, start2) + 1
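                            # overlap <= 0 when f1 and f2 do not intersect,
                            # so neither 50% check below can pass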

                            if not flag & 1 and overlap >= length1 * 0.5:
                                # 1st time fragments overlap >= 50% of f1
                                flag |= 1
                                overlaps["1"] += 1

                            if not flag & 2 and overlap >= length2 * 0.5:
                                # 1st time fragments overlap >= 50% of f2
                                flag |= 2
                                overlaps["2"] += 1

                        if flag == 3:
                            """
                            Both cases already happened
                              -> no need to keep iterating
                            """
                            break

            i += 1
            if not i % 100000:
                # Flush Xrefs
                file = dt.mktemp()
                with DumpFile(file, compress=True) as xref_df:
                    for entry_acc in sorted(xrefs):
                        xref_df.dump((entry_acc, xrefs[entry_acc].asdict()))

                xrefs = {}
                xref_files.append(file)

    # Remaining xrefs
    file = dt.mktemp()
    with DumpFile(file, compress=True) as df:
        for entry_acc in sorted(xrefs):
            df.dump((entry_acc, xrefs[entry_acc].asdict()))

    xref_files.append(file)

    # Merge files (each worker will produce one merged file)
    xref_file = dt.mktemp()
    with DumpFile(xref_file, compress=True) as df:
        for entry_acc, xrefs in merge_dumps(xref_files):
            df.dump((entry_acc, xrefs))

    outqueue.put((
        xref_file,
        entries_with_xrefs,
        ida_file,
        entry_counts,
        entry_intersections,
        interpro2enzyme,
        interpro2reactome
    ))
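
For reference, here is a self-contained sketch of the IDA (domain architecture) construction performed in the worker above: Pfam matches are sorted by position (sorting by start/end here stands in for repr_fragment), tagged with the InterPro entry they are integrated into when there is one, joined into a string, and hashed with SHA-1 to obtain a stable architecture identifier. The accessions in the usage example are illustrative only.

import hashlib

def make_ida(locations):
    # locations: dicts with keys "pfam", "interpro" (or None), "start", "end"
    domains = []
    for loc in sorted(locations, key=lambda loc: (loc["start"], loc["end"])):
        if loc["interpro"]:
            domains.append(f"{loc['pfam']}:{loc['interpro']}")
        else:
            domains.append(loc["pfam"])

    dom_str = "-".join(domains)
    dom_id = hashlib.sha1(dom_str.encode("utf-8")).hexdigest()
    return dom_str, dom_id

# Illustrative accessions only
locations = [
    {"pfam": "PF00001", "interpro": "IPR000001", "start": 10, "end": 120},
    {"pfam": "PF00002", "interpro": None, "start": 150, "end": 300},
]
dom_str, dom_id = make_ida(locations)
print(dom_str)  # PF00001:IPR000001-PF00002
print(dom_id)   # 40-character SHA-1 hex digest identifying the architecture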