Example 1
def insert_extra_features(stg_url: str, p_uniprot2features: str):
    logger.info("starting")

    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_proteinfeature")
    cur.execute("""
        CREATE TABLE webfront_proteinfeature
        (
            feature_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            protein_acc VARCHAR(15) NOT NULL,
            entry_acc VARCHAR(25) NOT NULL,
            source_database VARCHAR(10) NOT NULL,
            location_start INT NOT NULL,
            location_end INT NOT NULL,
            sequence_feature VARCHAR(35)
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_proteinfeature (
          protein_acc, entry_acc, source_database, location_start,
          location_end, sequence_feature
        )
        VALUES (%s, %s, %s, %s, %s, %s)
    """
    with Store(p_uniprot2features) as proteins, Table(con, sql) as table:
        i = 0
        for uniprot_acc, entries in proteins.items():
            for entry_acc, info in entries.items():
                for pos_start, pos_end, seq_feature in info["locations"]:
                    table.insert((uniprot_acc, entry_acc, info["database"],
                                  pos_start, pos_end, seq_feature))

            i += 1
            if not i % 10000000:
                logger.info(f"{i:>12,}")

        logger.info(f"{i:>12,}")
    con.commit()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute("""
        CREATE INDEX i_proteinfeature
        ON webfront_proteinfeature (protein_acc)
        """)
    cur.close()
    con.close()
    logger.info("complete")
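The `Table(con, sql)` helper used above (and in several later examples) is project-internal and not shown. Purely as an illustration, it is assumed to be a context manager that buffers rows passed to `insert()` and flushes them with `executemany()`; a minimal sketch under that assumption (the real class may differ):

class TableSketch:
    """Hypothetical stand-in for the project's Table helper (assumed API)."""

    def __init__(self, con, query: str, buffer_size: int = 100000):
        self.con = con                  # open MySQLdb connection
        self.query = query              # parameterized INSERT statement
        self.buffer_size = buffer_size
        self.rows = []

    def insert(self, row):
        # Buffer the row; flush once the buffer is full
        self.rows.append(row)
        if len(self.rows) >= self.buffer_size:
            self.flush()

    def flush(self):
        if self.rows:
            cur = self.con.cursor()
            cur.executemany(self.query, self.rows)
            cur.close()
            self.rows = []

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Flush any remaining rows when leaving the `with` block
        self.flush()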
Example 2
def insert_structural_models(pro_url: str, stg_url: str, p_entry2xrefs: str):
    logger.info("finding entries with structures")
    has_structures = set()
    with DumpFile(p_entry2xrefs) as df:
        for accession, xrefs in df:
            if xrefs["structures"]:
                has_structures.add(accession)

    my_con = MySQLdb.connect(**url2dict(stg_url))
    my_cur = my_con.cursor()
    my_cur.execute("DROP TABLE IF EXISTS webfront_structuralmodel")
    my_cur.execute("""
        CREATE TABLE webfront_structuralmodel
        (
            model_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            accession VARCHAR(25) NOT NULL,
            contacts LONGBLOB NOT NULL,
            lddt LONGBLOB NOT NULL,
            structure LONGBLOB NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)

    # Load accessions of signatures with structural models
    logger.info("finding entries with trRosetta structural models")
    ora_con = cx_Oracle.connect(pro_url)
    ora_cur = ora_con.cursor()
    ora_cur.outputtypehandler = blob_as_str
    ora_cur.execute("SELECT METHOD_AC FROM INTERPRO.PFAM_TRROSETTA")
    to_import = {acc for acc, in ora_cur if acc not in has_structures}

    logger.info(f"{len(to_import)} entries with structural models to import")
    for acc in to_import:
        ora_cur.execute(
            """
            SELECT PROB_CONTACTS, PRED_LDDT, PRED_STRUCTURE
            FROM INTERPRO.PFAM_TRROSETTA
            WHERE METHOD_AC = :1
            """, (acc, ))

        for cmap_gz, lddt_gz, pdb_gz in ora_cur:
            my_cur.execute(
                """
                    INSERT INTO webfront_structuralmodel (
                      accession, contacts, lddt, structure
                    )
                    VALUES (%s, %s, %s, %s)
                """, (acc, cmap_gz, lddt_gz, pdb_gz))

    ora_cur.close()
    ora_con.close()

    my_con.commit()
    my_cur.execute("""
        CREATE INDEX i_structuralmodel
        ON webfront_structuralmodel (accession)
        """)
    my_cur.close()
    my_con.close()

    logger.info("complete")
Example 3
def export_sequences(url: str,
                     keyfile: str,
                     output: str,
                     processes: int = 1,
                     tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute("""
            SELECT /*+ PARALLEL */ UX.AC, UP.SEQ_SHORT, UP.SEQ_LONG
            FROM UNIPARC.XREF UX
            INNER JOIN UNIPARC.PROTEIN UP ON UX.UPI = UP.UPI
            WHERE UX.DBID IN (2, 3)
            AND UX.DELETED = 'N'
            """)

        i = 0
        for row in cur:
            store[row[0]] = row[2].read() if row[2] is not None else row[1]

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
Example 4
def export_go(url: str,
              keyfile: str,
              output: str,
              processes: int = 1,
              tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute("""
            SELECT CODE, SORT_ORDER, TERM_NAME
            FROM GO.CV_CATEGORIES@GOAPRO
            """)
        categories = {row[0]: row[1:] for row in cur}

        cur.execute("""
            SELECT E.ACCESSION, D.PRIMARY_ID, D.SECONDARY_ID, D.NOTE
            FROM SPTR.DBENTRY@SWPREAD E
            INNER JOIN SPTR.DBENTRY_2_DATABASE@SWPREAD D 
              ON E.DBENTRY_ID = D.DBENTRY_ID
            WHERE E.ENTRY_TYPE IN (0, 1)            -- Swiss-Prot and TrEMBL
              AND E.MERGE_STATUS != 'R'             -- not 'Redundant'
              AND E.DELETED = 'N'                   -- not deleted
              AND E.FIRST_PUBLIC IS NOT NULL        -- published
              AND D.DATABASE_ID = 'GO'              -- GO annotation
            """)

        i = 0
        for accession, go_id, sec_id, note in cur:
            """
            sec_id -> cat_code:term_name, e.g.:
                C:integral component of membrane

            note -> go_evidence:source, e.g.:
                IEA:InterPro
            """
            cat_code, term_name = sec_id.split(':', 1)
            cat_order, cat_name = categories[cat_code]
            store.append(accession,
                         (cat_order, go_id, term_name, cat_code, cat_name))

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(fn=_post_go, processes=processes)
        logger.info(f"temporary files: {size / 1024 / 1024:.0f} MB")
Example 5
def _export_alns(pfam_url: str, dt: DirectoryTree, buffer_size: int = 1000):
    logger.info("processing Pfam alignments")
    df = DumpFile(dt.mktemp(), compress=True)
    cnt = 0

    iterator = pfam.get_alignments(pfam_url)
    for entry_acc, aln_type, aln_bytes, count in iterator:
        df.dump((entry_acc, f"alignment:{aln_type}", aln_bytes,
                 "application/gzip", count))

        cnt += 1
        if cnt == buffer_size:
            df.close()
            yield df.path
            df = DumpFile(dt.mktemp(), compress=True)
            cnt = 0

    df.close()
    yield df.path
Example 6
def export_name(url: str,
                keyfile: str,
                output: str,
                processes: int = 1,
                tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute("""
            SELECT ACCESSION, DESCR
            FROM (
                SELECT
                  E.ACCESSION, 
                  D.DESCR, 
                  ROW_NUMBER() OVER (
                    PARTITION BY E.ACCESSION 
                    ORDER BY CV.DESC_ID,    -- 1=RecName, 2=AltName, 3=SubName
                             CV.ORDER_IN,   -- Swiss-Prot manual order
                             D.DESCR        -- TrEMBL alphabetic order
                  ) RN
                FROM SPTR.DBENTRY@SWPREAD E
                INNER JOIN SPTR.DBENTRY_2_DESC@SWPREAD D
                  ON E.DBENTRY_ID = D.DBENTRY_ID
                  AND D.DESC_ID IN (1,4,11,13,16,23,25,28,35)  --Full description section
                INNER JOIN SPTR.CV_DESC@SWPREAD CV
                  ON D.DESC_ID = CV.DESC_ID
                WHERE E.ENTRY_TYPE IN (0, 1)
                  AND E.MERGE_STATUS != 'R'
                  AND E.DELETED = 'N'
                  AND E.FIRST_PUBLIC IS NOT NULL
            )
            WHERE RN = 1
            """)

        i = 0
        for accession, description in cur:
            store[accession] = description

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
Example 7
def export_proteome(url: str,
                    keyfile: str,
                    output: str,
                    processes: int = 1,
                    tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        """
        Without the DISTINCT, there would be duplicated rows, e.g.
        A0A059MHQ6  UP000024941
        A0A059MHQ6  UP000024941
        
        Even with duplicated rows, a given UniProt accession is associated
        with one unique UPID.
        
        It's just easier to remove the duplicates at the database level.
        """
        cur.execute("""
            SELECT DISTINCT E.ACCESSION, P.UPID
            FROM SPTR.DBENTRY@SWPREAD E
            INNER JOIN SPTR.PROTEOME2UNIPROT@SWPREAD P2U
              ON E.ACCESSION = P2U.ACCESSION AND E.TAX_ID = P2U.TAX_ID
            INNER JOIN SPTR.PROTEOME@SWPREAD P
              ON P2U.PROTEOME_ID = P.PROTEOME_ID
              AND P.IS_REFERENCE = 1
            WHERE E.ENTRY_TYPE IN (0, 1)
            AND E.MERGE_STATUS != 'R'
            AND E.DELETED = 'N'
            AND E.FIRST_PUBLIC IS NOT NULL
            """)

        i = 0
        for accession, upid in cur:
            store[accession] = upid

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
Example 8
def export_evidence(url: str,
                    keyfile: str,
                    output: str,
                    processes: int = 1,
                    tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute("""
            SELECT ACCESSION, PROTEIN_EXISTENCE_ID, NAME
            FROM (
              SELECT
                E.ACCESSION,
                E.PROTEIN_EXISTENCE_ID,
                GN.NAME,
                ROW_NUMBER() OVER (
                  PARTITION BY E.ACCESSION
                  ORDER BY GN.GENE_NAME_TYPE_ID
                ) RN
              FROM SPTR.DBENTRY@SWPREAD E
              LEFT OUTER JOIN SPTR.GENE@SWPREAD G
                ON E.DBENTRY_ID = G.DBENTRY_ID
              LEFT OUTER JOIN SPTR.GENE_NAME@SWPREAD GN
                ON G.GENE_ID = GN.GENE_ID
              WHERE E.ENTRY_TYPE IN (0, 1)
              AND E.MERGE_STATUS != 'R'
              AND E.DELETED = 'N'
              AND E.FIRST_PUBLIC IS NOT NULL
            )
            WHERE RN = 1
            """)

        i = 0
        for accession, evidence, gene in cur:
            store[accession] = (evidence, gene)

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
Example 9
def export_features(url: str,
                    keyfile: str,
                    output: str,
                    processes: int = 1,
                    tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute("""
            SELECT FM.PROTEIN_AC, FM.METHOD_AC, LOWER(DB.DBSHORT),
                   FM.POS_FROM, FM.POS_TO, FM.SEQ_FEATURE
            FROM INTERPRO.FEATURE_MATCH FM
            INNER JOIN INTERPRO.CV_DATABASE DB ON FM.DBCODE = DB.DBCODE
            """)

        i = 0
        for row in cur:
            protein_acc = row[0]
            signature_acc = row[1]
            database = row[2]
            pos_start = row[3]
            pos_end = row[4]
            seq_feature = row[5]

            if database == "mobidblt" and seq_feature is None:
                seq_feature = "Consensus Disorder Prediction"

            store.update(protein_acc, {
                signature_acc: {
                    "database": database,
                    "locations": [(pos_start, pos_end, seq_feature)]
                }
            }, replace=True)

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 100000000:
                    logger.info(f"{i:>13,}")

        cur.close()
        con.close()

        logger.info(f"{i:>13,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
Example 10
def export_comments(url: str,
                    keyfile: str,
                    output: str,
                    processes: int = 1,
                    tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        """
        Note on the TEXT structure: 
        Some comments have a title (e.g. Q01299) which is not retrieved 
        when joining on CC_STRUCTURE_TYPE_ID = 1
        """
        cur.execute("""
            SELECT E.ACCESSION, B.ORDER_IN, NVL(B.TEXT, SS.TEXT)
            FROM SPTR.DBENTRY@SWPREAD E
            INNER JOIN SPTR.COMMENT_BLOCK@SWPREAD B
              ON E.DBENTRY_ID = B.DBENTRY_ID
              AND B.COMMENT_TOPICS_ID = 2        -- FUNCTION comments
            LEFT OUTER JOIN SPTR.COMMENT_STRUCTURE@SWPREAD S
              ON B.COMMENT_BLOCK_ID = S.COMMENT_BLOCK_ID
              AND S.CC_STRUCTURE_TYPE_ID = 1      -- TEXT structure
            LEFT OUTER JOIN SPTR.COMMENT_SUBSTRUCTURE@SWPREAD SS
              ON S.COMMENT_STRUCTURE_ID = SS.COMMENT_STRUCTURE_ID
            WHERE E.ENTRY_TYPE IN (0, 1)          -- Swiss-Prot and TrEMBL
              AND E.MERGE_STATUS != 'R'           -- not 'Redundant'
              AND E.DELETED = 'N'                 -- not deleted
              AND E.FIRST_PUBLIC IS NOT NULL      -- published
            """)

        i = 0
        for accession, block_number, text in cur:
            store.append(accession, (block_number, text))

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(fn=_post_comments, processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
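`store.merge(fn=_post_comments, ...)` passes a post-processing callback that is not part of this example. Purely as an illustration, assuming the callback receives the list of `(block_number, text)` tuples appended per accession, it might order the FUNCTION comment blocks and keep only their texts; the real `_post_comments` may behave differently:

def _post_comments_sketch(values):
    # Illustration only: sort comment blocks by ORDER_IN and drop the key
    return [text for _, text in sorted(values, key=lambda block: block[0])]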
Example 11
def insert_annotations(pro_url: str, p_uniprot2matches: str, pfam_url: str,
                       stg_url: str, **kwargs):
    tmpdir = kwargs.get("tmpdir")

    con = MySQLdb.connect(**url2dict(stg_url))
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_entryannotation")
    cur.execute("""
        CREATE TABLE webfront_entryannotation
        (
            annotation_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            accession VARCHAR(25) NOT NULL,
            type VARCHAR(20) NOT NULL,
            value LONGBLOB NOT NULL,
            mime_type VARCHAR(32) NOT NULL,
            num_sequences INT
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()
    con.close()

    queue = Queue()
    consumer = Process(target=_insert, args=(stg_url, queue))
    consumer.start()

    dt = DirectoryTree(root=tmpdir)

    # Get HMMs from InterPro Oracle database
    for path in _export_hmms(p_uniprot2matches, pro_url, dt):
        queue.put(path)

    # Get alignments from Pfam MySQL database
    for path in _export_alns(pfam_url, dt):
        queue.put(path)

    queue.put(None)
    consumer.join()

    logger.info(f"temporary files: {dt.size / 1024 ** 2:.0f} MB")
    dt.remove()

    logger.info("indexing")
    con = MySQLdb.connect(**url2dict(stg_url))
    cur = con.cursor()
    cur.execute("CREATE INDEX i_entryannotation "
                "ON webfront_entryannotation (accession)")
    cur.close()
    con.close()

    logger.info("complete")
Example 12
def export_proteins(url: str,
                    keyfile: str,
                    output: str,
                    processes: int = 1,
                    tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute("""
            SELECT 
              PROTEIN_AC, NAME, DBCODE, LEN, FRAGMENT, 
              TO_CHAR(TAX_ID), CRC64
            FROM INTERPRO.PROTEIN
            """)

        i = 0
        for row in cur:
            store[row[0]] = {
                "identifier": row[1],
                "reviewed": row[2] == 'S',
                "length": row[3],
                "fragment": row[4] == 'Y',
                "taxid": row[5],
                "crc64": row[6]
            }

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
Example 13
def export(pro_url: str, stg_url: str, outdir: str):
    os.makedirs(outdir, exist_ok=True)

    con = cx_Oracle.connect(pro_url)
    cur = con.cursor()

    logger.info("exporting PDB-InterPro-GO-UniProt mapping")
    filepath = os.path.join(outdir, "pdb2interpro2go.tsv")
    _export_pdb2interpro2go2uniprot(cur, filepath)

    logger.info("exporting InterPro-GO-UniProt mapping")
    filepath = os.path.join(outdir, "interpro2go2uniprot.tsv")
    _export_interpro2go2uniprot(cur, filepath)
    cur.close()
    con.close()

    logger.info("exporting release info")
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute(
        """
        SELECT version, release_date 
        FROM webfront_database 
        WHERE name='interpro'
        """
    )
    version, date = cur.fetchone()
    cur.close()
    con.close()

    filepath = os.path.join(outdir, "release.txt")
    with open(filepath, "wt") as fh:
        fh.write(f"InterPro version:    {version}\n")
        fh.write(f"Release date:        {date:%A, %d %B %Y}\n")
        fh.write(f"Generated on:        {datetime.now():%Y-%m-%d %H:%M}\n")

    os.chmod(filepath, 0o775)
    logger.info("complete")
Example 14
def chunk_proteins(url: str, keyfile: str, chunk_size: int = 50000):
    logger.info("loading")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("""
        SELECT PROTEIN_AC
        FROM INTERPRO.PROTEIN
        """)

    accessions = [acc for acc, in cur]
    cur.close()
    con.close()

    logger.info("splitting into chunks")
    Store.dump_keys(Store.chunk(accessions, chunk_size), keyfile)
    logger.info("complete")
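The key file written by `chunk_proteins` is the one that the `export_*` functions above load with `Store.load_keys()`, so their stores are partitioned consistently. A hypothetical call order, where the Oracle connection string and file names are placeholders:

# Hypothetical driver; the connection string and paths are placeholders.
ora_url = "user/password@host:1521/service"
chunk_proteins(ora_url, "proteins.keys")
export_proteins(ora_url, "proteins.keys", "proteins.store", processes=4)
export_sequences(ora_url, "proteins.keys", "sequences.store", processes=4)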
Example 15
def export_matches(pro_url: str,
                   stg_url: str,
                   p_proteins: str,
                   p_uniprot2matches: str,
                   outdir: str,
                   processes: int = 8):
    shutil.copy(os.path.join(os.path.dirname(__file__), "match_complete.dtd"),
                outdir)

    logger.info("loading isoforms")
    u2variants = {}
    for accession, variant in ippro.get_isoforms(pro_url).items():
        protein_acc = variant["protein_acc"]
        try:
            variants = u2variants[protein_acc]
        except KeyError:
            variants = u2variants[protein_acc] = []

        variants.append((accession, variant["length"], variant["crc64"],
                         variant["matches"]))

    logger.info("loading signatures")
    con = cx_Oracle.connect(pro_url)
    cur = con.cursor()
    signatures = ippro.get_signatures(cur)
    cur.close()
    con.close()

    logger.info("spawning processes")
    processes = max(1, processes - 1)
    ctx = mp.get_context(method="spawn")
    workers = []
    with Store(p_proteins) as proteins:
        proteins_per_file = math.ceil(len(proteins) / processes)
        start_acc = None
        for i, uniprot_acc in enumerate(proteins):
            if not i % proteins_per_file:
                if start_acc:
                    filename = f"match_{len(workers)+1}.xml"
                    filepath = os.path.join(outdir, filename)
                    p = ctx.Process(target=_write_match_tmp,
                                    args=(signatures, u2variants, p_proteins,
                                          p_uniprot2matches, start_acc,
                                          uniprot_acc, filepath))
                    p.start()
                    workers.append((p, filepath))

                start_acc = uniprot_acc

        filename = f"match_{len(workers) + 1}.xml"
        filepath = os.path.join(outdir, filename)
        p = ctx.Process(target=_write_match_tmp,
                        args=(signatures, u2variants, p_proteins,
                              p_uniprot2matches, start_acc, None, filepath))
        p.start()
        workers.append((p, filepath))

    logger.info("concatenating XML files")
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("""
        SELECT name, name_alt, type, num_entries, version, release_date
        FROM webfront_database
        ORDER BY name_long
        """)

    doc = getDOMImplementation().createDocument(None, None, None)
    elem = doc.createElement("release")
    for name, name_alt, db_type, entry_count, version, date in cur:
        if db_type == "entry":
            dbinfo = doc.createElement("dbinfo")
            dbinfo.setAttribute("dbname", name_alt)
            if version:
                dbinfo.setAttribute("version", version)
            if entry_count:
                dbinfo.setAttribute("entry_count", str(entry_count))
            if date:
                dbinfo.setAttribute("file_date",
                                    date.strftime("%d-%b-%y").upper())
            elem.appendChild(dbinfo)
    cur.close()
    con.close()

    output = os.path.join(outdir, "match_complete.xml.gz")
    with gzip.open(output, "wt", encoding="utf-8") as fh:
        fh.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        fh.write('<!DOCTYPE interpromatch SYSTEM "match_complete.dtd">\n')
        fh.write('<interpromatch>\n')
        elem.writexml(fh, addindent="  ", newl="\n")

        for i, (p, filepath) in enumerate(workers):
            p.join()
            with open(filepath, "rt", encoding="utf-8") as tfh:
                for line in tfh:
                    fh.write(line)

            os.remove(filepath)
            logger.info(f"\t{i+1} / {len(workers)}")

        fh.write('</interpromatch>\n')

    logger.info("complete")
Example 16
def insert_entries(pfam_url: str, stg_url: str, p_entries: str,
                   p_entry2xrefs: str):
    logger.info("fetching Wikipedia data for Pfam entries")
    wiki = pfam.get_wiki(pfam_url)

    logger.info("loading Pfam curation/family details")
    pfam_details = pfam.get_details(pfam_url)

    logger.info("populating webfront_entry")
    entries = loadobj(p_entries)
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_entry")
    cur.execute("""
        CREATE TABLE webfront_entry
        (
            entry_id VARCHAR(10) DEFAULT NULL,
            accession VARCHAR(25) PRIMARY KEY NOT NULL,
            type VARCHAR(50) NOT NULL,
            name LONGTEXT,
            short_name VARCHAR(100),
            source_database VARCHAR(10) NOT NULL,
            member_databases LONGTEXT,
            integrated_id VARCHAR(25),
            go_terms LONGTEXT,
            description LONGTEXT,
            wikipedia LONGTEXT,
            details LONGTEXT,
            literature LONGTEXT,
            hierarchy LONGTEXT,
            cross_references LONGTEXT,
            interactions LONGTEXT,
            pathways LONGTEXT,
            overlaps_with LONGTEXT,
            is_featured TINYINT NOT NULL,
            is_alive TINYINT NOT NULL,
            history LONGTEXT,
            entry_date DATETIME NOT NULL,
            deletion_date DATETIME,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)

    # Count number of structural models per entry
    cur.execute("""
        SELECT accession, COUNT(*)
        FROM webfront_structuralmodel
        GROUP BY accession
        """)
    num_struct_models = dict(cur.fetchall())
    cur.close()

    sql = """
        INSERT INTO webfront_entry
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
          %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """

    with Table(con, sql) as table:
        with DumpFile(p_entry2xrefs) as df:
            for accession, xrefs in df:
                entry = entries[accession]
                counts = reduce(xrefs)
                counts.update({
                    "interactions": len(entry.ppi),
                    "pathways": sum(len(v) for v in entry.pathways.values()),
                    "sets": 1 if entry.clan else 0,
                    "structural_models": num_struct_models.get(accession, 0)
                })

                table.insert(
                    (None, accession, entry.type.lower(),
                     entry.name, entry.short_name, entry.database,
                     jsonify(entry.integrates), entry.integrated_in,
                     jsonify(entry.go_terms), jsonify(entry.description),
                     jsonify(wiki.get(accession)),
                     jsonify(pfam_details.get(accession)),
                     jsonify(entry.literature), jsonify(entry.hierarchy),
                     jsonify(entry.cross_references), jsonify(entry.ppi),
                     jsonify(entry.pathways), jsonify(entry.overlaps_with), 0,
                     0 if entry.is_deleted else 1, jsonify(entry.history),
                     entry.creation_date, entry.deletion_date,
                     jsonify(counts)))

    con.commit()
    con.close()
    logger.info("complete")
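`reduce(xrefs)` above is a project helper (not the built-in `functools.reduce`) that converts cross-reference collections into counts. Its apparent behavior, inferred from how its result is used here and in later examples, is sketched below; this is only an approximation of the real function.

def reduce_sketch(xrefs: dict) -> dict:
    # Inferred behavior: collections become their sizes, nested dicts are
    # reduced recursively, and plain numbers pass through unchanged.
    counts = {}
    for key, value in xrefs.items():
        if isinstance(value, dict):
            counts[key] = reduce_sketch(value)
        elif isinstance(value, (set, frozenset, list, tuple)):
            counts[key] = len(value)
        else:
            counts[key] = value
    return counts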
Example 17
def export_matches(url: str,
                   keyfile: str,
                   output: str,
                   processes: int = 1,
                   tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute("""
            SELECT M.PROTEIN_AC, M.METHOD_AC, M.MODEL_AC, M.POS_FROM, 
                   M.POS_TO, M.FRAGMENTS, M.SCORE, E.ENTRY_AC
            FROM INTERPRO.MATCH M
            LEFT OUTER JOIN (
              SELECT E.ENTRY_AC, EM.METHOD_AC
              FROM INTERPRO.ENTRY E
              INNER JOIN INTERPRO.ENTRY2METHOD EM
                ON E.ENTRY_AC = EM.ENTRY_AC
              WHERE E.CHECKED = 'Y'
            ) E ON M.METHOD_AC = E.METHOD_AC
            """)

        i = 0
        for row in cur:
            if row[5]:
                fragments = []
                for frag in row[5].split(','):
                    # Format: START-END-STATUS
                    s, e, t = frag.split('-')
                    fragments.append({
                        "start": int(s),
                        "end": int(e),
                        "dc-status": DC_STATUSES[t]
                    })
            else:
                fragments = [{
                    "start": row[3],
                    "end": row[4],
                    "dc-status": DC_STATUSES['S']  # Continuous
                }]

            store.append(
                row[0],
                (
                    row[1],  # signature
                    row[2],  # model
                    row[6],  # score
                    fragments,
                    row[7]  # InterPro entry
                ))

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 100000000:
                    logger.info(f"{i:>13,}")

        cur.close()
        con.close()

        logger.info(f"{i:>13,}")
        size = store.merge(fn=_post_matches, processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
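For illustration, the FRAGMENTS parsing above turns a raw value such as "10-50-S,60-120-S" into a list of fragment dictionaries. A self-contained version of that logic is shown below; the DC_STATUSES mapping here is a placeholder, not the project's real table.

# Placeholder mapping; the real DC_STATUSES is defined elsewhere in the project.
DC_STATUSES_SKETCH = {"S": "CONTINUOUS"}

def parse_fragments(value: str) -> list:
    fragments = []
    for frag in value.split(','):
        # Each fragment is encoded as START-END-STATUS
        s, e, t = frag.split('-')
        fragments.append({
            "start": int(s),
            "end": int(e),
            "dc-status": DC_STATUSES_SKETCH[t]
        })
    return fragments

# parse_fragments("10-50-S,60-120-S") ->
#   [{'start': 10, 'end': 50, 'dc-status': 'CONTINUOUS'},
#    {'start': 60, 'end': 120, 'dc-status': 'CONTINUOUS'}]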
Example 18
def insert_clans(stg_url: str, p_alignments: str, p_clans: str, p_entries: str,
                 p_entry2xrefs: str, **kwargs):
    max_xrefs = kwargs.get("max_xrefs", 1000000)
    tmpdir = kwargs.get("tmpdir")

    logger.info("aggregating clan cross-references")
    dt = DirectoryTree(tmpdir)
    entry2clan = {}
    for entry_acc, entry in loadobj(p_entries).items():
        if entry.clan:
            entry2clan[entry_acc] = entry.clan["accession"]

    clans = {}
    files = []
    num_xrefs = 0
    with DumpFile(p_entry2xrefs) as df:
        for entry_acc, entry_xrefs in df:
            try:
                clan_acc = entry2clan[entry_acc]
            except KeyError:
                continue

            try:
                clan_xrefs = clans[clan_acc]
            except KeyError:
                clan_xrefs = clans[clan_acc] = {}

            # We do not need the number of matches
            del entry_xrefs["matches"]

            cnt_before = sum(map(len, clan_xrefs.values()))
            deepupdate(entry_xrefs, clan_xrefs)
            cnt_after = sum(map(len, clan_xrefs.values()))
            num_xrefs += cnt_after - cnt_before

            if num_xrefs >= max_xrefs:
                file = dt.mktemp()
                with DumpFile(file, compress=True) as df2:
                    for clan_acc in sorted(clans):
                        df2.dump((clan_acc, clans[clan_acc]))

                files.append(file)
                clans = {}
                num_xrefs = 0

    file = dt.mktemp()
    with DumpFile(file, compress=True) as df2:
        for clan_acc in sorted(clans):
            df2.dump((clan_acc, clans[clan_acc]))

    files.append(file)

    logger.info("inserting clans")
    clans = loadobj(p_clans)
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_set")
    cur.execute("""
        CREATE TABLE webfront_set
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            name VARCHAR(400),
            description TEXT,
            source_database VARCHAR(10) NOT NULL,
            relationships LONGTEXT NOT NULL,
            authors TEXT,
            literature TEXT,
            counts LONGTEXT DEFAULT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_set
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """
    with Table(con, sql) as table:
        for clan_acc, xrefs in merge_dumps(files):
            clan = clans[clan_acc]
            counts = reduce(xrefs)
            counts["entries"] = {
                clan["database"]: len(clan["members"]),
                "total": len(clan["members"])
            }

            table.insert(
                (clan_acc, clan["name"], clan["description"], clan["database"],
                 jsonify(clan["relationships"], nullable=False),
                 jsonify(clan.get("authors")), jsonify(clan.get("literature")),
                 jsonify(counts)))

    logger.info(f"temporary files: {dt.size / 1024 / 1024:.0f} MB")
    dt.remove()

    logger.info("inserting alignments")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_alignment")
    cur.execute("""
        CREATE TABLE webfront_alignment
        (
            id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            set_acc VARCHAR(20) NOT NULL,
            entry_acc VARCHAR(25) NOT NULL,
            target_acc VARCHAR(25) NOT NULL,
            target_set_acc VARCHAR(20),
            score DOUBLE NOT NULL,
            seq_length MEDIUMINT NOT NULL,
            domains TEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_alignment (
            set_acc, entry_acc, target_acc, target_set_acc, score, 
            seq_length, domains
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
    with DumpFile(p_alignments) as df, Table(con, sql) as table:
        for alignments in df:
            for aln in alignments:
                table.insert(aln)

    con.commit()
    con.close()

    logger.info("complete")
Example 19
def insert_taxonomy(p_entries: str,
                    p_proteins: str,
                    p_structures: str,
                    p_taxonomy: str,
                    p_uniprot2matches: str,
                    p_uniprot2proteome: str,
                    stg_url: str,
                    p_interpro2taxonomy: str,
                    tmpdir: Optional[str] = None):
    logger.info("preparing data")
    dt = DirectoryTree(tmpdir)
    entries = loadobj(p_entries)
    taxonomy = loadobj(p_taxonomy)
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    logger.info("starting")
    i = 0
    xrefs = {}
    files = []
    for uniprot_acc, info in proteins.items():
        taxon_id = info["taxid"]

        try:
            taxon = xrefs[taxon_id]
        except KeyError:
            taxon = xrefs[taxon_id] = init_xrefs()

        try:
            proteome_id = u2proteome[uniprot_acc]
        except KeyError:
            pass
        else:
            taxon["proteomes"].add(proteome_id)

        taxon["proteins"]["all"] += 1

        protein_structures = uniprot2pdbe.get(uniprot_acc, {})

        # Add structures to taxon, regardless of entry matches
        taxon["structures"]["all"] |= set(protein_structures.keys())

        databases = set()
        for entry_acc, locations in u2matches.get(uniprot_acc, {}).items():
            entry = entries[entry_acc]
            database = entry.database

            try:
                taxon["entries"][database].add(entry_acc)
            except KeyError:
                taxon["entries"][database] = {entry_acc}

            if database not in databases:
                # Counting the protein *once* per database
                databases.add(database)
                try:
                    taxon["proteins"]["databases"][database] += 1
                except KeyError:
                    taxon["proteins"]["databases"][database] = 1

            try:
                taxon["proteins"]["entries"][entry_acc] += 1
            except KeyError:
                taxon["proteins"]["entries"][entry_acc] = 1

            for pdb_id, chains in protein_structures.items():
                for chain_id, segments in chains.items():
                    if overlaps_pdb_chain(locations, segments):
                        try:
                            taxon["structures"]["entries"][entry_acc].add(
                                pdb_id)
                        except KeyError:
                            taxon["structures"]["entries"][entry_acc] = {
                                pdb_id
                            }

                        break  # Skip other chains

        i += 1
        if not i % 1000000:
            output = dt.mktemp()
            dump_xrefs(xrefs, taxonomy, output)
            files.append(output)
            xrefs = {}

            if not i % 10000000:
                logger.info(f"{i:>12,}")

    if xrefs:
        output = dt.mktemp()
        dump_xrefs(xrefs, taxonomy, output)
        files.append(output)
        xrefs = {}

    logger.info(f"{i:>12,}")
    logger.info(f"temporary files: "
                f"{sum(map(os.path.getsize, files))/1024/1024:.0f} MB")

    proteins.close()
    u2matches.close()
    u2proteome.close()

    logger.info("populating taxonomy tables")
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomy")
    cur.execute("""
        CREATE TABLE webfront_taxonomy
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            scientific_name VARCHAR(255) NOT NULL,
            full_name VARCHAR(512) NOT NULL,
            lineage LONGTEXT NOT NULL,
            parent_id VARCHAR(20),
            rank VARCHAR(20) NOT NULL,
            children LONGTEXT,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentry")
    cur.execute("""
        CREATE TABLE webfront_taxonomyperentry
        (
          id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
          tax_id VARCHAR(20) NOT NULL,
          entry_acc VARCHAR(25) NOT NULL,
          counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentrydb")
    cur.execute("""
        CREATE TABLE webfront_taxonomyperentrydb
        (
          id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
          tax_id VARCHAR(20) NOT NULL,
          source_database VARCHAR(10) NOT NULL,
          counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    table = Table(con,
                  query="""
        INSERT INTO webfront_taxonomy VALUES (%s, %s, %s, %s, %s, %s, %s, %s) 
    """)
    per_entry = Table(con,
                      query="""
        INSERT INTO webfront_taxonomyperentry (tax_id,entry_acc,counts)
        VALUES (%s, %s, %s) 
    """)
    per_database = Table(con,
                         query="""
        INSERT INTO webfront_taxonomyperentrydb (tax_id,source_database,counts)
        VALUES (%s, %s, %s) 
    """)

    with DumpFile(p_interpro2taxonomy, compress=True) as interpro2taxonomy:
        interpro_entries = {
            entry.accession
            for entry in entries.values()
            if entry.database == "interpro" and not entry.is_deleted
        }

        i = 0
        for taxon_id, taxon_xrefs in merge_dumps(files):
            taxon = taxonomy[taxon_id]

            protein_counts = taxon_xrefs.pop("proteins")
            structure_counts = taxon_xrefs.pop("structures")
            counts = reduce(taxon_xrefs)

            # Add total protein count (not grouped by database/entry)
            counts["proteins"] = protein_counts["all"]

            # Add total structure count
            counts["structures"] = len(structure_counts["all"])

            # Add total entry count (not grouped by database)
            counts["entries"]["total"] = sum(counts["entries"].values())

            table.insert(
                (taxon_id, taxon["sci_name"], taxon["full_name"],
                 f" {' '.join(taxon['lineage'])} ", taxon["parent"],
                 taxon["rank"], jsonify(taxon["children"]), jsonify(counts)))

            # Remove the 'entries' property
            # (not needed for webfront_taxonomyperentry)
            entry_counts = counts.pop("entries")

            database_structures = {}
            for entry_acc, count in protein_counts["entries"].items():
                if entry_acc in interpro_entries:
                    interpro2taxonomy.dump((entry_acc, taxon_id, count))

                counts["proteins"] = count

                try:
                    entry_structures = structure_counts["entries"][entry_acc]
                except KeyError:
                    counts["structures"] = 0
                else:
                    counts["structures"] = len(entry_structures)

                    database = entries[entry_acc].database
                    try:
                        database_structures[database] |= entry_structures
                    except KeyError:
                        database_structures[database] = entry_structures.copy()
                finally:
                    per_entry.insert((taxon_id, entry_acc, jsonify(counts)))

            for database, count in protein_counts["databases"].items():
                counts.update({
                    "entries": entry_counts[database],
                    "proteins": count,
                    "structures": len(database_structures.get(database, []))
                })
                per_database.insert((taxon_id, database, jsonify(counts)))

            i += 1
            if not i % 100000:
                logger.info(f"{i:>12,}")

        logger.info(f"{i:>12,}")

    table.close()
    per_entry.close()
    per_database.close()
    con.commit()

    dt.remove()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentry_tax
        ON webfront_taxonomyperentry (tax_id)
        """)
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentry_entry
        ON webfront_taxonomyperentry (entry_acc)
        """)
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentrydb_tax
        ON webfront_taxonomyperentrydb (tax_id)
        """)
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentrydb_database
        ON webfront_taxonomyperentrydb (source_database)
        """)
    cur.close()
    con.close()
    logger.info("complete")
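`init_xrefs()` is not shown, but the shape of the per-taxon dictionary follows from how it is populated above. A sketch consistent with that usage (the real function may differ):

def init_xrefs_sketch() -> dict:
    # Shape inferred from the taxon updates in insert_taxonomy() above
    return {
        "entries": {},              # database -> set of entry accessions
        "proteomes": set(),
        "proteins": {
            "all": 0,
            "databases": {},        # database -> protein count
            "entries": {}           # entry accession -> protein count
        },
        "structures": {
            "all": set(),
            "entries": {}           # entry accession -> set of PDB IDs
        }
    }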
Example 20
def export_residues(url: str, dt: DirectoryTree) -> List[str]:
    files = []

    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("""
        SELECT S.PROTEIN_AC, S.METHOD_AC, M.NAME, LOWER(D.DBSHORT),
               S.DESCRIPTION, S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END
        FROM INTERPRO.SITE_MATCH S
        INNER JOIN INTERPRO.CV_DATABASE D ON S.DBCODE = D.DBCODE
        LEFT OUTER JOIN INTERPRO.METHOD M ON S.METHOD_AC = M.METHOD_AC  
        """)

    i = 0
    proteins = {}
    for row in cur:
        protein_acc = row[0]
        signature_acc = row[1]
        signature_name = row[2]
        database = row[3]
        description = row[4]
        residue = row[5]
        pos_start = row[6]
        pos_end = row[7]

        try:
            entries = proteins[protein_acc]
        except KeyError:
            entries = proteins[protein_acc] = {}

        try:
            entry = entries[signature_acc]
        except KeyError:
            entry = entries[signature_acc] = {
                "name": signature_name,
                "database": database,
                "descriptions": {}
            }

        try:
            fragments = entry["descriptions"][description]
        except KeyError:
            fragments = entry["descriptions"][description] = []

        fragments.append((residue, pos_start, pos_end))
        i += 1
        if not i % 1000000:
            files.append(dt.mktemp())
            with DumpFile(files[-1], compress=True) as df:
                for protein_acc in sorted(proteins):
                    df.dump((protein_acc, proteins[protein_acc]))

            proteins = {}

            if not i % 100000000:
                logger.info(f"{i:>15,}")

    logger.info(f"{i:>15,}")
    cur.close()
    con.close()

    files.append(dt.mktemp())
    with DumpFile(files[-1], compress=True) as df:
        for protein_acc in sorted(proteins):
            df.dump((protein_acc, proteins[protein_acc]))

    return files
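Each temporary file returned by `export_residues` holds `(protein_acc, entries)` tuples sorted by accession and can be read back with `DumpFile`, as other examples do. An illustrative consumer, where the temporary directory and connection string are placeholders:

# Illustration only; the directory and connection string are placeholders.
dt = DirectoryTree("/tmp/residues")
for path in export_residues("user/password@host:1521/service", dt):
    with DumpFile(path) as df:
        for protein_acc, entries in df:
            for signature_acc, info in entries.items():
                # info["descriptions"] maps a description to a list of
                # (residue, start, end) fragments for this signature
                pass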
Example 21
def insert_structures(p_entries: str, p_proteins: str, p_structures: str,
                      p_uniprot2ida: str, p_uniprot2matches: str,
                      p_uniprot2proteome: str, stg_url: str):
    logger.info("preparing data")
    entries = {}
    for entry in loadobj(p_entries).values():
        entries[entry.accession] = (entry.database, entry.clan)

    uniprot2pdbe = {}
    xrefs = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

        xrefs[pdb_id] = {
            "domain_architectures": set(),
            "entries": {},
            "proteomes": set(),
            "proteins": 0,
            "sets": set(),
            "taxa": set()
        }

    proteins = Store(p_proteins)
    u2ida = Store(p_uniprot2ida)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    logger.info("starting")
    i = 0
    for uniprot_acc in sorted(uniprot2pdbe):
        info = proteins[uniprot_acc]

        try:
            dom_members, dom_arch, dom_arch_id = u2ida[uniprot_acc]
        except KeyError:
            dom_arch_id = None

        proteome_id = u2proteome.get(uniprot_acc)
        matches = u2matches.get(uniprot_acc, {})

        for pdb_id, chains in uniprot2pdbe[uniprot_acc].items():
            _xrefs = xrefs[pdb_id]

            if dom_arch_id:
                _xrefs["domain_architectures"].add(dom_arch_id)

            if proteome_id:
                _xrefs["proteomes"].add(proteome_id)

            _xrefs["proteins"] += 1
            _xrefs["taxa"].add(info["taxid"])

            for entry_acc, locations in matches.items():
                database, clan = entries[entry_acc]

                for chain_id, segments in chains.items():
                    if overlaps_pdb_chain(locations, segments):
                        try:
                            _xrefs["entries"][database].add(entry_acc)
                        except KeyError:
                            _xrefs["entries"][database] = {entry_acc}

                        if clan:
                            _xrefs["sets"].add(clan["accession"])

                        break  # Skip other chains

        i += 1
        if not i % 10000:
            logger.info(f"{i:>12,}")

    logger.info(f"{i:>12,}")

    proteins.close()
    u2ida.close()
    u2matches.close()
    u2proteome.close()

    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_structure")
    cur.execute("""
        CREATE TABLE webfront_structure
        (
            accession VARCHAR(4) PRIMARY KEY NOT NULL,
            name VARCHAR(512) NOT NULL,
            source_database VARCHAR(10) NOT NULL,
            experiment_type VARCHAR(16) NOT NULL,
            release_date DATETIME NOT NULL,
            resolution FLOAT,
            literature LONGTEXT,
            chains LONGTEXT NOT NULL,
            proteins LONGTEXT NOT NULL,
            secondary_structures LONGTEXT,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_structure 
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 
    """
    with Table(con, sql) as table:
        for pdb_id, info in loadobj(p_structures).items():
            counts = reduce(xrefs[pdb_id])
            counts["entries"]["total"] = sum(counts["entries"].values())
            table.insert((
                pdb_id,
                info["name"],
                "pdb",
                info["evidence"],
                info["date"],
                info["resolution"],
                jsonify(info["citations"]),
                # Sorted list of unique chains (e.g. 'A', 'B', ...)
                jsonify(sorted({chain_id
                                for chains in info["proteins"].values()
                                for chain_id in chains}), nullable=False),
                jsonify(info["proteins"], nullable=False),
                jsonify(info["secondary_structures"]),
                jsonify(counts)))

    con.commit()
    con.close()

    logger.info("complete")
Example 22
def insert_proteomes(p_entries: str, p_proteins: str, p_proteomes: str,
                     p_structures: str, p_uniprot2ida: str,
                     p_uniprot2matches: str, p_uniprot2proteome: str,
                     stg_url: str):
    logger.info("preparing data")
    proteomes = loadobj(p_proteomes)
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc in entry["proteins"]:
            try:
                uniprot2pdbe[uniprot_acc].add(pdb_id)
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id}

    # Init all proteomes
    xrefs = {}
    for proteome_id in proteomes:
        xrefs[proteome_id] = {
            "domain_architectures": set(),
            "entries": {},
            "proteins": 0,
            "sets": set(),
            "structures": set(),
            "taxa": set()
        }

    entries = loadobj(p_entries)
    proteins = Store(p_proteins)
    u2ida = Store(p_uniprot2ida)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    logger.info("starting")
    i = 0
    for uniprot_acc, proteome_id in u2proteome.items():
        proteome = xrefs[proteome_id]
        proteome["proteins"] += 1

        info = proteins[uniprot_acc]
        proteome["taxa"].add(info["taxid"])

        try:
            dom_members, dom_arch, dom_arch_id = u2ida[uniprot_acc]
        except KeyError:
            pass
        else:
            proteome["domain_architectures"].add(dom_arch_id)

        for entry_acc in u2matches.get(uniprot_acc, []):
            entry = entries[entry_acc]
            try:
                proteome["entries"][entry.database].add(entry_acc)
            except KeyError:
                proteome["entries"][entry.database] = {entry_acc}

            if entry.clan:
                proteome["sets"].add(entry.clan["accession"])

        try:
            pdb_ids = uniprot2pdbe[uniprot_acc]
        except KeyError:
            pass
        else:
            proteome["structures"] |= pdb_ids

        i += 1
        if not i % 10000000:
            logger.info(f"{i:>12,}")

    logger.info(f"{i:>12,}")

    proteins.close()
    u2ida.close()
    u2matches.close()
    u2proteome.close()

    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_proteome")
    cur.execute("""
        CREATE TABLE webfront_proteome
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            name VARCHAR(215) NOT NULL,
            is_reference TINYINT NOT NULL,
            strain VARCHAR(512),
            assembly VARCHAR(512),
            taxonomy_id VARCHAR(20) NOT NULL,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_proteome VALUES (%s, %s, %s, %s, %s, %s, %s) 
    """
    with Table(con, sql) as table:
        for proteome_id, info in proteomes.items():
            counts = reduce(xrefs[proteome_id])
            counts["entries"]["total"] = sum(counts["entries"].values())
            table.insert(
                (proteome_id, info["name"], 1 if info["is_reference"] else 0,
                 info["strain"], info["assembly"], info["taxon_id"],
                 jsonify(counts)))

    con.commit()
    con.close()

    logger.info("complete")
Example 23
def export(p_entries: str, p_uniprot2matches: str, outdir: str):
    logger.info("loading entries")
    entries = []
    integrated = {}
    for e in loadobj(p_entries).values():
        if e.database == "interpro" and not e.is_deleted:
            entries.append(e)

            for signatures in e.integrates.values():
                for signature_acc in signatures:
                    integrated[signature_acc] = (e.accession, e.name)

    logger.info("writing entry.list")
    with open(os.path.join(outdir, "entry.list"), "wt") as fh:
        fh.write("ENTRY_AC\tENTRY_TYPE\tENTRY_NAME\n")

        for e in sorted(entries, key=lambda e: (e.type, e.accession)):
            fh.write(f"{e.accession}\t{e.type}\t{e.name}\n")

    logger.info("writing names.dat")
    with open(os.path.join(outdir, "names.dat"), "wt") as fh:
        for e in sorted(entries, key=lambda e: e.accession):
            fh.write(f"{e.accession}\t{e.name}\n")

    logger.info("writing short_names.dat")
    with open(os.path.join(outdir, "short_names.dat"), "wt") as fh:
        for e in sorted(entries, key=lambda e: e.accession):
            fh.write(f"{e.accession}\t{e.short_name}\n")

    logger.info("writing interpro2go")
    with open(os.path.join(outdir, "interpro2go"), "wt") as fh:
        fh.write(f"!date: {datetime.now():%Y/%m/%d %H:%M:%S}\n")
        fh.write("!Mapping of InterPro entries to GO\n")
        fh.write("!\n")

        for e in sorted(entries, key=lambda e: e.accession):
            for term in e.go_terms:
                fh.write(f"InterPro:{e.accession} {e.name} > "
                         f"GO:{term['name']} ; {term['identifier']}\n")

    logger.info("writing ParentChildTreeFile.txt")
    with open(os.path.join(outdir, "ParentChildTreeFile.txt"), "wt") as fh:
        for e in sorted(entries, key=lambda e: e.accession):
            root = e.hierarchy["accession"]
            if root == e.accession and e.hierarchy["children"]:
                _write_node(e.hierarchy, fh, level=0)

    logger.info("writing protein2ipr.dat.gz")
    filepath = os.path.join(outdir, "protein2ipr.dat.gz")
    with gzip.open(filepath, "wt") as fh, Store(p_uniprot2matches) as sh:
        i = 0
        for uniprot_acc, protein_entries in sh.items():
            matches = []
            for signature_acc in sorted(protein_entries):
                try:
                    interpro_acc, name = integrated[signature_acc]
                except KeyError:
                    # Signature not integrated, or key is an InterPro entry
                    continue

                locations = protein_entries[signature_acc]

                for loc in locations:
                    matches.append((
                        uniprot_acc,
                        interpro_acc,
                        name,
                        signature_acc,
                        # We do not consider fragmented locations
                        loc["fragments"][0]["start"],
                        max(f["end"] for f in loc["fragments"])
                    ))

            for m in sorted(matches):
                fh.write('\t'.join(map(str, m)) + '\n')

            i += 1
            if not i % 10000000:
                logger.debug(f"{i:>12,}")

        logger.info(f"{i:>12,}")

    logger.info("complete")
Example 24
def export_structure_matches(url: str, p_proteins: str, p_structures: str,
                             outdir: str):
    shutil.copy(os.path.join(os.path.dirname(__file__), "feature.dtd"), outdir)

    logger.info("loading structures")
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

    logger.info("loading CATH/SCOP domains")
    uniprot2cath = pdbe.get_cath_domains(url)
    uniprot2scop = pdbe.get_scop_domains(url)

    logger.info("writing file")
    output = os.path.join(outdir, "feature.xml.gz")
    with gzip.open(output, "wt", encoding="utf-8") as fh:
        fh.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        fh.write('<!DOCTYPE interprofeature SYSTEM "feature.dtd">\n')
        fh.write('<interprofeature>\n')

        with Store(p_proteins) as proteins:
            doc = getDOMImplementation().createDocument(None, None, None)

            for uniprot_acc, protein in proteins.items():
                pdb_entries = uniprot2pdbe.get(uniprot_acc, {})
                cath_entries = uniprot2cath.get(uniprot_acc, {})
                scop_entries = uniprot2scop.get(uniprot_acc, {})

                if pdb_entries or cath_entries or scop_entries:
                    elem = doc.createElement("protein")
                    elem.setAttribute("id", uniprot_acc)
                    elem.setAttribute("name", protein["identifier"])
                    elem.setAttribute("length", str(protein["length"]))
                    elem.setAttribute("crc64", protein["crc64"])

                    for pdb_id in sorted(pdb_entries):
                        chains = pdb_entries[pdb_id]
                        for chain_id in sorted(chains):
                            domain = doc.createElement("domain")
                            domain.setAttribute("id", f"{pdb_id}{chain_id}")
                            domain.setAttribute("dbname", "PDB")

                            for loc in chains[chain_id]:
                                start = loc["protein_start"]
                                end = loc["protein_end"]

                                coord = doc.createElement("coord")
                                coord.setAttribute("pdb", pdb_id)
                                coord.setAttribute("chain", chain_id)
                                coord.setAttribute("start", str(start))
                                coord.setAttribute("end", str(end))
                                domain.appendChild(coord)

                            elem.appendChild(domain)

                    for domain_id in sorted(cath_entries):
                        entry = cath_entries[domain_id]

                        domain = doc.createElement("domain")
                        domain.setAttribute("id", domain_id)
                        domain.setAttribute("cfn", entry["superfamily"]["id"])
                        domain.setAttribute("dbname", "CATH")

                        for loc in entry["locations"]:
                            coord = doc.createElement("coord")
                            coord.setAttribute("pdb", entry["pdb_id"])
                            coord.setAttribute("chain", entry["chain"])
                            coord.setAttribute("start", str(loc["start"]))
                            coord.setAttribute("end", str(loc["end"]))
                            domain.appendChild(coord)

                        elem.appendChild(domain)

                    for domain_id in sorted(scop_entries):
                        entry = scop_entries[domain_id]

                        domain = doc.createElement("domain")
                        domain.setAttribute("id", domain_id)
                        domain.setAttribute("cfn", entry["superfamily"]["id"])
                        domain.setAttribute("dbname", "SCOP")

                        for loc in entry["locations"]:
                            coord = doc.createElement("coord")
                            coord.setAttribute("pdb", entry["pdb_id"])
                            coord.setAttribute("chain", entry["chain"])
                            coord.setAttribute("start", str(loc["start"]))
                            coord.setAttribute("end", str(loc["end"]))
                            domain.appendChild(coord)

                        elem.appendChild(domain)

                    elem.writexml(fh, addindent="  ", newl="\n")

        fh.write('</interprofeature>\n')

    logger.info("complete")
Example No. 25
def insert_residues(pro_url: str, stg_url: str, tmpdir: Optional[str] = None):
    dt = DirectoryTree(root=tmpdir)

    logger.info("exporting residues")
    files = ippro.export_residues(pro_url, dt)

    logger.info("inserting residues")
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_proteinresidue")
    cur.execute("""
        CREATE TABLE webfront_proteinresidue
        (
            residue_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            protein_acc VARCHAR(15) NOT NULL,
            entry_acc VARCHAR(25) NOT NULL,
            entry_name VARCHAR(100),
            source_database VARCHAR(10) NOT NULL,
            description VARCHAR(255),
            fragments LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_proteinresidue (
          protein_acc, entry_acc, entry_name, source_database, description,
          fragments
        )
        VALUES (%s, %s, %s, %s, %s, %s)
    """
    with Table(con, sql) as table:
        i = 0
        for protein_acc, entries in merge_dumps(files, replace=True):
            for entry_acc, entry in entries.items():
                for descr, locations in entry["descriptions"].items():
                    locations.sort(key=lambda x: (x[1], x[2]))
                    table.insert((protein_acc, entry_acc, entry["name"],
                                  entry["database"], descr,
                                  jsonify(locations, nullable=False)))

            i += 1
            if not i % 10000000:
                logger.info(f"{i:>15,}")

        logger.info(f"{i:>15,}")
    con.commit()

    logger.info(f"temporary files: {dt.size / 1024 ** 2:.0f} MB")
    dt.remove()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute("""
        CREATE INDEX i_proteinresidue
        ON webfront_proteinresidue (protein_acc)
        """)
    cur.close()
    con.close()
    logger.info("complete")
Example No. 26
def export_interpro(url: str,
                    p_entries: str,
                    p_entry2xrefs: str,
                    p_interpro2taxonomy: str,
                    outdir: str,
                    tmpdir: Optional[str] = None):
    shutil.copy(os.path.join(os.path.dirname(__file__), "interpro.dtd"),
                outdir)

    logger.info("loading entries")
    entries = loadobj(p_entries)
    interpro_entries = []
    deleted_entries = []
    for e in entries.values():
        if e.database != "interpro":
            continue
        elif e.is_deleted:
            deleted_entries.append(e.accession)
        else:
            interpro_entries.append(e.accession)

    logger.info("creating entry-taxon database")
    fd, taxdb = mkstemp(dir=tmpdir)
    os.close(fd)
    os.remove(taxdb)
    with DumpFile(p_interpro2taxonomy) as interpro2taxonomy:
        with KVdb(taxdb, writeback=True) as kvdb:
            i = 0
            for entry_acc, taxon_id, counts in interpro2taxonomy:
                kvdb[f"{entry_acc}-{taxon_id}"] = str(counts)

                i += 1
                if not i % 1000000:
                    kvdb.sync()

    logger.info("loading protein counts")
    con = MySQLdb.connect(**url2dict(url), charset="utf8mb4")
    cur = MySQLdb.cursors.SSCursor(con)
    cur.execute("""
        SELECT accession, counts
        FROM webfront_entry
        """)
    num_proteins = {}
    for entry_acc, counts in cur:
        num_proteins[entry_acc] = str(json.loads(counts)["proteins"])

    output = os.path.join(outdir, "interpro.xml.gz")
    with gzip.open(output, "wt", encoding="utf-8") as fh:
        fh.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        fh.write('<!DOCTYPE interprodb SYSTEM "interpro.dtd">\n')
        fh.write("<interprodb>\n")

        doc = getDOMImplementation().createDocument(None, None, None)

        # writing <release> section (do not log progress, < 1 sec)
        elem = doc.createElement("release")
        databases = {}
        cur.execute("""
            SELECT name, name_alt, type, num_entries, version, release_date
            FROM webfront_database
            ORDER BY name_long
            """)

        for name, name_alt, db_type, entry_count, version, date in cur:
            databases[name] = name_alt
            if db_type in ("entry", "protein"):
                dbinfo = doc.createElement("dbinfo")
                dbinfo.setAttribute("version", version)
                dbinfo.setAttribute("dbname", name_alt)
                dbinfo.setAttribute("entry_count", str(entry_count))
                dbinfo.setAttribute("file_date",
                                    date.strftime("%d-%b-%y").upper())
                elem.appendChild(dbinfo)

        elem.writexml(fh, addindent="  ", newl="\n")

        logger.info("loading taxonomic data")
        key_species = {
            "3702",  # Arabidopsis thaliana
            "6239",  # Caenorhabditis elegans
            "7955",  # Danio rerio
            "7227",  # Drosophila melanogaster
            "9606",  # H**o sapiens
            "10090",  # Mus musculus
            "367110",  # Neurospora crassa
            "10116",  # Rattus norvegicus
            "559292",  # Saccharomyces cerevisiae
            "284812",  # Schizosaccharomyces pombe
            "4577",  # Zea mays
        }
        superkingdoms = {
            "Archaea": None,
            "Bacteria": None,
            "Eukaryota": None,
            "Viruses": None
        }
        cur.execute("""
            SELECT accession, scientific_name, full_name, lineage
            FROM webfront_taxonomy
            """)
        taxa = {}
        for tax_id, sci_name, full_name, lineage in cur:
            """
            lineage stored as a string with heading/leading whitespaces,
            and a whitespace between taxa
            """
            taxa[tax_id] = (full_name, lineage.strip().split())

            if sci_name in superkingdoms:
                superkingdoms[sci_name] = tax_id

        cur.close()
        con.close()

        # Raise if a superkingdom is not in the table
        for sci_name, tax_id in superkingdoms.items():
            if tax_id is None:
                raise ValueError(f"{sci_name}: missing taxon ID")

        superkingdoms = set(superkingdoms.values())

        logger.info("writing entries")
        with DumpFile(p_entry2xrefs) as entry2xrefs, KVdb(taxdb) as kvdb:
            for entry_acc, xrefs in entry2xrefs:
                entry = entries[entry_acc]
                if entry.database != "interpro" or entry.is_deleted:
                    continue

                elem = doc.createElement("interpro")
                elem.setAttribute("id", entry.accession)
                elem.setAttribute("protein_count", num_proteins[entry_acc])
                elem.setAttribute("short_name", entry.short_name)
                elem.setAttribute("type", entry.type)

                name = doc.createElement("name")
                name.appendChild(doc.createTextNode(entry.name))
                elem.appendChild(name)

                text = _restore_abstract('\n'.join(entry.description))
                try:
                    _doc = parseString(f"<abstract>{text}</abstract>")
                except ExpatError as exc:
                    # TODO: use CDATA section for all entries
                    logger.warning(f"{entry_acc}: {exc}")
                    # abstract = doc.createElement("abstract")
                    # abstract.appendChild(doc.createCDATASection(text))
                else:
                    abstract = _doc.documentElement
                    elem.appendChild(abstract)

                if entry.go_terms:
                    go_list = doc.createElement("class_list")

                    for term in entry.go_terms:
                        go_elem = doc.createElement("classification")
                        go_elem.setAttribute("id", term["identifier"])
                        go_elem.setAttribute("class_type", "GO")

                        _elem = doc.createElement("category")
                        _elem.appendChild(
                            doc.createTextNode(term["category"]["name"]))
                        go_elem.appendChild(_elem)

                        _elem = doc.createElement("description")
                        _elem.appendChild(doc.createTextNode(term["name"]))
                        go_elem.appendChild(_elem)

                        go_list.appendChild(go_elem)

                    elem.appendChild(go_list)

                if entry.literature:
                    pub_list = doc.createElement("pub_list")
                    for pub_id in sorted(entry.literature):
                        pub = entry.literature[pub_id]

                        pub_elem = doc.createElement("publication")
                        pub_elem.setAttribute("id", pub_id)

                        _elem = doc.createElement("author_list")
                        if pub["authors"]:
                            _elem.appendChild(
                                doc.createTextNode(", ".join(pub['authors'])))
                        else:
                            _elem.appendChild(doc.createTextNode("Unknown"))
                        pub_elem.appendChild(_elem)

                        if pub["title"]:
                            _elem = doc.createElement("title")
                            _elem.appendChild(doc.createTextNode(pub["title"]))
                            pub_elem.appendChild(_elem)

                        if pub["URL"]:
                            _elem = doc.createElement("url")
                            _elem.appendChild(doc.createTextNode(pub["URL"]))
                            pub_elem.appendChild(_elem)

                        _elem = doc.createElement("db_xref")
                        if pub["PMID"]:
                            _elem.setAttribute("db", "PUBMED")
                            _elem.setAttribute("dbkey", str(pub["PMID"]))
                        else:
                            _elem.setAttribute("db", "MEDLINE")
                            _elem.setAttribute("dbkey", "MEDLINE")
                        pub_elem.appendChild(_elem)

                        if pub["ISO_journal"]:
                            _elem = doc.createElement("journal")
                            _elem.appendChild(
                                doc.createTextNode(pub["ISO_journal"]))
                            pub_elem.appendChild(_elem)

                        if pub["ISBN"]:
                            _elem = doc.createElement("book_title")
                            isbn = f"ISBN:{pub['ISBN']}"
                            _elem.appendChild(doc.createTextNode(isbn))
                            pub_elem.appendChild(_elem)

                        if pub["volume"] or pub["issue"] or pub["raw_pages"]:
                            _elem = doc.createElement("location")
                            if pub["volume"]:
                                _elem.setAttribute("volume", pub["volume"])

                            if pub["issue"]:
                                _elem.setAttribute("issue", pub["issue"])

                            if pub["raw_pages"]:
                                _elem.setAttribute("pages", pub["raw_pages"])

                            pub_elem.appendChild(_elem)

                        if pub["year"]:
                            _elem = doc.createElement("year")
                            _elem.appendChild(
                                doc.createTextNode(str(pub["year"])))
                            pub_elem.appendChild(_elem)

                        pub_list.appendChild(pub_elem)

                    elem.appendChild(pub_list)

                parent, children = entry.relations
                if parent:
                    par_elem = doc.createElement("parent_list")
                    _elem = doc.createElement("rel_ref")
                    _elem.setAttribute("ipr_ref", parent)
                    par_elem.appendChild(_elem)
                    elem.appendChild(par_elem)

                if children:
                    child_list = doc.createElement("child_list")
                    for child in children:
                        _elem = doc.createElement("rel_ref")
                        _elem.setAttribute("ipr_ref", child)
                        child_list.appendChild(_elem)

                    elem.appendChild(child_list)

                members = []
                for database, signatures in entry.integrates.items():
                    for signature_acc in signatures:
                        members.append((
                            signature_acc,
                            entries[signature_acc].short_name,
                            database,
                            num_proteins[signature_acc],
                        ))

                mem_list = doc.createElement("member_list")
                for member in sorted(members):
                    _elem = doc.createElement("db_xref")
                    _elem.setAttribute("protein_count", member[3])
                    _elem.setAttribute("db", databases[member[2]])
                    _elem.setAttribute("dbkey", member[0])
                    _elem.setAttribute("name", member[1])
                    mem_list.appendChild(_elem)
                elem.appendChild(mem_list)

                # Merge cross-references and pathways
                cross_refs = {}
                for key, values in entry.cross_references.items():
                    cross_refs[databases[key]] = values

                for key, values in entry.pathways.items():
                    cross_refs[databases[key]] = [val["id"] for val in values]

                if cross_refs:
                    xref_list = doc.createElement("external_doc_list")
                    for ref_db in sorted(cross_refs):
                        for ref_id in sorted(cross_refs[ref_db]):
                            _elem = doc.createElement("db_xref")
                            _elem.setAttribute("db", ref_db)
                            _elem.setAttribute("dbkey", ref_id)
                            xref_list.appendChild(_elem)
                    elem.appendChild(xref_list)

                if xrefs["structures"]:
                    xref_list = doc.createElement("structure_db_links")
                    for pdb_id in sorted(xrefs["structures"]):
                        _elem = doc.createElement("db_xref")
                        _elem.setAttribute("db", "PDB")
                        _elem.setAttribute("dbkey", pdb_id)
                        xref_list.appendChild(_elem)
                    elem.appendChild(xref_list)

                # Find key species and taxonomic distribution
                entry_key_species = []
                entry_superkingdoms = {}
                for tax_id in xrefs["taxa"]:
                    full_name, lineage = taxa[tax_id]

                    if tax_id in key_species:
                        entry_key_species.append((full_name, tax_id))

                    # Find the superkingdom containing this taxon
                    for superkingdom_id in superkingdoms:
                        if superkingdom_id in lineage:
                            break
                    else:
                        continue

                    try:
                        other_lineage = entry_superkingdoms[superkingdom_id]
                    except KeyError:
                        entry_superkingdoms[superkingdom_id] = lineage
                    else:
                        # Compare lineages and find lowest common ancestor
                        i = 0
                        while i < len(lineage) and i < len(other_lineage):
                            if lineage[i] != other_lineage[i]:
                                break
                            i += 1

                        # Path to the lowest common ancestor
                        entry_superkingdoms[superkingdom_id] = lineage[:i]

                # Get lowest common ancestor for each represented superkingdom
                lowest_common_ancestors = []
                for lineage in entry_superkingdoms.values():
                    # Lowest common ancestor
                    tax_id = lineage[-1]
                    full_name, _ = taxa[tax_id]
                    lowest_common_ancestors.append((full_name, tax_id))

                # Write taxonomic distribution
                tax_dist = doc.createElement("taxonomy_distribution")
                for full_name, tax_id in sorted(lowest_common_ancestors):
                    _elem = doc.createElement("taxon_data")
                    _elem.setAttribute("name", full_name)
                    key = f"{entry_acc}-{tax_id}"
                    _elem.setAttribute("proteins_count", kvdb[key])
                    tax_dist.appendChild(_elem)
                elem.appendChild(tax_dist)

                if entry_key_species:
                    # Write key species
                    key_spec = doc.createElement("key_species")
                    for full_name, tax_id in sorted(entry_key_species):
                        _elem = doc.createElement("taxon_data")
                        _elem.setAttribute("name", full_name)
                        key = f"{entry_acc}-{tax_id}"
                        _elem.setAttribute("proteins_count", kvdb[key])
                        key_spec.appendChild(_elem)
                    elem.appendChild(key_spec)

                elem.writexml(fh, addindent="  ", newl="\n")

        if deleted_entries:
            block = doc.createElement("deleted_entries")
            for entry_acc in sorted(deleted_entries):
                elem = doc.createElement("del_ref")
                elem.setAttribute("id", entry_acc)
                block.appendChild(elem)

            block.writexml(fh, addindent="  ", newl="\n")

        fh.write("</interprodb>\n")

    logger.info(f"temporary file: {os.path.getsize(taxdb)/1024/1024:,.0f} MB")
    os.remove(taxdb)
    logger.info("complete")
Example No. 27
def export_features_matches(url: str,
                            p_proteins: str,
                            p_uniprot2features: str,
                            outdir: str,
                            processes: int = 8):
    shutil.copy(os.path.join(os.path.dirname(__file__), "extra.dtd"), outdir)

    logger.info("loading features")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    features = ippro.get_features(cur)
    cur.close()
    con.close()

    logger.info("spawning processes")
    processes = max(1, processes - 1)
    ctx = mp.get_context(method="spawn")
    workers = []
    with Store(p_uniprot2features) as proteins:
        proteins_per_file = math.ceil(len(proteins) / processes)
        start_acc = None
        for i, uniprot_acc in enumerate(proteins):
            if not i % proteins_per_file:
                if start_acc:
                    filename = f"extra_{len(workers) + 1}.xml"
                    filepath = os.path.join(outdir, filename)
                    p = ctx.Process(target=_write_feature_tmp,
                                    args=(features, p_proteins,
                                          p_uniprot2features, start_acc,
                                          uniprot_acc, filepath))
                    p.start()
                    workers.append((p, filepath))

                start_acc = uniprot_acc

        filename = f"extra_{len(workers) + 1}.xml"
        filepath = os.path.join(outdir, filename)
        p = ctx.Process(target=_write_feature_tmp,
                        args=(features, p_proteins, p_uniprot2features,
                              start_acc, None, filepath))
        p.start()
        workers.append((p, filepath))

    logger.info("concatenating XML files")
    output = os.path.join(outdir, "extra.xml.gz")
    with gzip.open(output, "wt", encoding="utf-8") as fh:
        fh.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        fh.write('<!DOCTYPE interproextra SYSTEM "extra.dtd">\n')
        fh.write('<interproextra>\n')

        doc = getDOMImplementation().createDocument(None, None, None)
        elem = doc.createElement("release")
        databases = {(f["database"], f["version"]) for f in features.values()}
        for name, version in sorted(databases):
            dbinfo = doc.createElement("dbinfo")
            dbinfo.setAttribute("dbname", name)

            if version:
                dbinfo.setAttribute("version", version)

            elem.appendChild(dbinfo)

        elem.writexml(fh, addindent="  ", newl="\n")

        for i, (p, filepath) in enumerate(workers):
            p.join()
            with open(filepath, "rt", encoding="utf-8") as tfh:
                for line in tfh:
                    fh.write(line)

            os.remove(filepath)
            logger.info(f"\t{i+1} / {len(workers)}")

        fh.write('</interproextra>\n')

    logger.info("complete")
Example No. 28
def insert_proteins(p_entries: str, p_proteins: str, p_structures: str,
                    p_taxonomy: str, p_uniprot2comments: str,
                    p_uniprot2name: str, p_uniprot2evidences: str,
                    p_uniprot2ida: str, p_uniprot2matches: str,
                    p_uniprot2proteome: str, p_uniprot2sequence: str,
                    pro_url: str, stg_url: str):
    logger.info("loading CATH/SCOP domains")
    uniprot2cath = pdbe.get_cath_domains(pro_url)
    uniprot2scop = pdbe.get_scop_domains(pro_url)

    logger.info("preparing data")
    proteins = Store(p_proteins)
    u2comments = Store(p_uniprot2comments)
    u2descriptions = Store(p_uniprot2name)
    u2evidences = Store(p_uniprot2evidences)
    u2ida = Store(p_uniprot2ida)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)
    u2sequence = Store(p_uniprot2sequence)

    taxonomy = {}
    for taxid, info in loadobj(p_taxonomy).items():
        taxonomy[taxid] = jsonify({
            "taxId": taxid,
            "scientificName": info["sci_name"],
            "fullName": info["full_name"]
        })

    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc in entry["proteins"]:
            try:
                uniprot2pdbe[uniprot_acc].append(pdb_id)
            except KeyError:
                uniprot2pdbe[uniprot_acc] = [pdb_id]

    logger.info("counting proteins/IDA")
    ida_count = {}
    for dom_members, dom_arch, dom_arch_id in u2ida.values():
        try:
            ida_count[dom_arch_id] += 1
        except KeyError:
            ida_count[dom_arch_id] = 1

    logger.info("inserting proteins")
    entries = loadobj(p_entries)
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("""
        SELECT protein_acc, COUNT(*)
        FROM webfront_varsplic
        GROUP BY protein_acc
        """)
    isoforms = dict(cur.fetchall())

    cur.execute("DROP TABLE IF EXISTS webfront_protein")
    cur.execute("""
        CREATE TABLE webfront_protein
        (
            accession VARCHAR(15) PRIMARY KEY NOT NULL,
            identifier VARCHAR(16) NOT NULL,
            organism LONGTEXT NOT NULL,
            name VARCHAR(255) NOT NULL,
            description LONGTEXT,
            sequence LONGBLOB NOT NULL,
            length INT(11) NOT NULL,
            proteome VARCHAR(20),
            gene VARCHAR(70),
            go_terms LONGTEXT,
            evidence_code INT(11) NOT NULL,
            source_database VARCHAR(10) NOT NULL,
            is_fragment TINYINT NOT NULL,
            structure LONGTEXT,
            tax_id VARCHAR(20) NOT NULL,
            ida_id VARCHAR(40),
            ida TEXT,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    i = 0
    sql = """
        INSERT into webfront_protein
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    with Table(con, sql) as table:
        for uniprot_acc, protein_info in proteins.items():
            taxid = protein_info["taxid"]

            try:
                taxon = taxonomy[taxid]
            except KeyError:
                table.close()
                con.close()
                raise RuntimeError(f"{uniprot_acc}: invalid taxon {taxid}")

            try:
                name = u2descriptions[uniprot_acc]
            except KeyError:
                table.close()
                con.close()
                raise RuntimeError(f"{uniprot_acc}: missing name")

            try:
                evidence, gene = u2evidences[uniprot_acc]
            except KeyError:
                table.close()
                con.close()
                raise RuntimeError(f"{uniprot_acc}: missing evidence")

            try:
                sequence = u2sequence[uniprot_acc]
            except KeyError:
                table.close()
                con.close()
                raise RuntimeError(f"{uniprot_acc}: missing sequence")

            proteome_id = u2proteome.get(uniprot_acc)

            clans = []
            databases = {}
            go_terms = {}
            for entry_acc in u2matches.get(uniprot_acc, []):
                entry = entries[entry_acc]

                try:
                    databases[entry.database] += 1
                except KeyError:
                    databases[entry.database] = 1

                if entry.clan:
                    clans.append(entry.clan["accession"])

                for term in entry.go_terms:
                    go_terms[term["identifier"]] = term

            protein_structures = {}
            domains = uniprot2cath.get(uniprot_acc)
            if domains:
                protein_structures["cath"] = {}

                for dom in domains.values():
                    dom_id = dom["id"]

                    protein_structures["cath"][dom_id] = {
                        "domain_id": dom["superfamily"]["id"],
                        "coordinates": dom["locations"]
                    }

            domains = uniprot2scop.get(uniprot_acc)
            if domains:
                protein_structures["scop"] = {}

                for dom in domains.values():
                    dom_id = dom["id"]

                    protein_structures["scop"][dom_id] = {
                        "domain_id": dom["superfamily"]["id"],
                        "coordinates": dom["locations"]
                    }

            try:
                dom_members, dom_arch, dom_arch_id = u2ida[uniprot_acc]
            except KeyError:
                dom_arch = dom_arch_id = None
                dom_count = 0
            else:
                dom_count = ida_count[dom_arch_id]

            table.insert(
                (uniprot_acc, protein_info["identifier"], taxon, name,
                 jsonify(u2comments.get(uniprot_acc)),
                 gzip.compress(sequence.encode("utf-8")),
                 protein_info["length"], proteome_id, gene,
                 jsonify(list(go_terms.values())), evidence,
                 "reviewed" if protein_info["reviewed"] else "unreviewed",
                 1 if protein_info["fragment"] else 0,
                 jsonify(protein_structures), protein_info["taxid"],
                 dom_arch_id, dom_arch,
                 jsonify({
                     "domain_architectures": dom_count,
                     "entries": databases,
                     "isoforms": isoforms.get(uniprot_acc, 0),
                     "proteomes": 1 if proteome_id else 0,
                     "sets": len(set(clans)),
                     "structures": len(uniprot2pdbe.get(uniprot_acc, [])),
                     "taxa": 1
                 })))

            i += 1
            if not i % 10000000:
                logger.info(f"{i:>12,}")

        logger.info(f"{i:>12,}")

    con.commit()

    proteins.close()
    u2comments.close()
    u2descriptions.close()
    u2evidences.close()
    u2ida.close()
    u2matches.close()
    u2proteome.close()
    u2sequence.close()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute("""
        CREATE UNIQUE INDEX ui_protein_identifier
        ON webfront_protein (identifier)
        """)
    cur.execute("""
        CREATE INDEX i_protein_proteome
        ON webfront_protein (proteome)
        """)
    cur.execute("""
        CREATE INDEX i_protein_database
        ON webfront_protein (source_database)
        """)
    cur.execute("""
        CREATE INDEX i_protein_taxon
        ON webfront_protein (tax_id)
        """)
    cur.execute("""
        CREATE INDEX i_protein_ida
        ON webfront_protein (ida_id)
        """)
    cur.execute("""
        CREATE INDEX i_protein_fragment
        ON webfront_protein (is_fragment)
        """)
    cur.close()
    con.close()

    logger.info("complete")
Example No. 29
def insert_release_notes(p_entries: str, p_proteins: str, p_proteomes: str,
                         p_structures: str, p_taxonomy: str,
                         p_uniprot2matches: str, p_uniprot2proteome: str,
                         rel_url: str, stg_url: str, relfile: str):
    logger.info("preparing data")
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc in entry["proteins"]:
            try:
                uniprot2pdbe[uniprot_acc].add(pdb_id)
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id}

    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("""
        SELECT name_long, version
        FROM webfront_database
        WHERE name_long IN ('UniProtKB', 'UniProtKB/Swiss-Prot', 'UniProtKB/TrEMBL')
        """)
    uniprot = {}
    for name, version in cur:
        uniprot[name] = {
            "version": version,
            "count": 0,
            "signatures": 0,
            "integrated_signatures": 0
        }
    cur.close()
    con.close()

    entries = loadobj(p_entries)
    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    # Entities found in InterPro
    integrated_proteomes = set()
    integrated_structures = set()
    integrated_taxonomy = set()

    # Number of proteins with GO terms from InterPro
    uniprot2go = 0

    logger.info("starting")
    i = 0
    for uniprot_acc, info in proteins.items():
        i += 1
        if not i % 10000000:
            logger.info(f"{i:>12,}")

        if info["reviewed"]:
            database = uniprot["UniProtKB/Swiss-Prot"]
        else:
            database = uniprot["UniProtKB/TrEMBL"]

        database["count"] += 1

        try:
            matches = u2matches[uniprot_acc]
        except KeyError:
            # No matches
            continue

        # Protein matched by at least one signature
        database["signatures"] += 1

        is_integrated = False
        for entry_acc in matches:
            entry = entries[entry_acc]
            if entry.database == "interpro":
                """
                Protein matched by at least one InterPro entry,
                i.e. at least one integrated signature
                """
                is_integrated = True

                if entry.go_terms:
                    uniprot2go += 1
                    break

        if is_integrated:
            database["integrated_signatures"] += 1

            try:
                proteome_id = u2proteome[uniprot_acc]
            except KeyError:
                pass
            else:
                integrated_proteomes.add(proteome_id)

            try:
                pdb_ids = uniprot2pdbe[uniprot_acc]
            except KeyError:
                pass
            else:
                integrated_structures |= pdb_ids

            integrated_taxonomy.add(info["taxid"])

    proteins.close()
    u2matches.close()
    u2proteome.close()

    logger.info(f"{i:>12,}")

    # Sum Swiss-Prot and TrEMBL counts
    for key in ["count", "signatures", "integrated_signatures"]:
        value_sp = uniprot["UniProtKB/Swiss-Prot"][key]
        value_tr = uniprot["UniProtKB/TrEMBL"][key]
        uniprot["UniProtKB"][key] = value_sp + value_tr

    logger.info("tracking changes since last releases")
    con = MySQLdb.connect(**url2dict(rel_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("""
        SELECT accession, source_database, integrated_id
        FROM webfront_entry 
        WHERE is_alive = 1
        """)
    public_entries = set()
    public_integrated = set()
    for entry_acc, database, integrated_in in cur:
        if database == "interpro":
            public_entries.add(entry_acc)
        elif integrated_in:
            # Signature already integrated in the previous release
            public_integrated.add(entry_acc)

    cur.execute("""
        SELECT name, version 
        FROM webfront_database 
        WHERE type = 'entry'
        """)
    public_databases = dict(cur.fetchall())
    cur.execute("SELECT * FROM webfront_release_note")
    prev_releases = cur.fetchall()
    cur.close()
    con.close()

    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_release_note")
    cur.execute("""
        CREATE TABLE webfront_release_note
        (
            version VARCHAR(20) PRIMARY KEY NOT NULL,
            release_date DATETIME NOT NULL,
            content LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.executemany(
        """
        INSERT INTO webfront_release_note
        VALUES (%s, %s, %s)
        """, prev_releases)
    con.commit()
    prev_releases = None

    cur.execute("""
        SELECT name, name_long, version, release_date
        FROM webfront_database 
        WHERE type = 'entry'
        """)
    staging_databases = {row[0]: (row[1], row[2], row[3]) for row in cur}

    interpro_new = []
    interpro_types = {}
    member_databases = {}
    pubmed_citations = set()
    interpro2go = 0
    latest_entry = None

    for entry in sorted(loadobj(p_entries).values(),
                        key=lambda e: e.creation_date):
        if entry.is_deleted:
            continue

        if entry.database == "interpro":
            for pub in entry.literature.values():
                if pub["PMID"] is not None:
                    pubmed_citations.add(pub["PMID"])

            try:
                interpro_types[entry.type.lower()] += 1
            except KeyError:
                interpro_types[entry.type.lower()] = 1

            if entry.accession not in public_entries:
                interpro_new.append(entry.accession)

            interpro2go += len(entry.go_terms)
            latest_entry = entry.accession
        else:
            try:
                obj = member_databases[entry.database]
            except KeyError:
                database, version, _ = staging_databases[entry.database]

                is_new = is_updated = False
                if entry.database not in public_databases:
                    is_new = True
                elif version != public_databases[entry.database]:
                    is_updated = True

                obj = member_databases[entry.database] = {
                    "name": database,
                    "version": version,
                    "signatures": 0,
                    "integrated_signatures": 0,
                    "recently_integrated": [],
                    "is_new": is_new,
                    "is_updated": is_updated,
                    "sets": set()
                }

            obj["signatures"] += 1
            if entry.integrated_in:
                obj["integrated_signatures"] += 1

                if entry.accession not in public_integrated:
                    # Recent integration
                    obj["recently_integrated"].append(entry.accession)

            if entry.clan:
                obj["sets"].add(entry.clan["accession"])

    # Transform sets of clans to counts:
    for obj in member_databases.values():
        obj["sets"] = len(obj["sets"])

    structures = list(loadobj(p_structures).values())

    proteomes = set(loadobj(p_proteomes).keys())
    errors = integrated_proteomes - proteomes
    if errors:
        raise RuntimeError(f"{len(errors)} invalid proteomes")

    taxa = set(loadobj(p_taxonomy).keys())
    errors = integrated_taxonomy - taxa
    if errors:
        raise RuntimeError(f"{len(errors)} invalid taxa")

    content = {
        "notes": [],  # TODO implement way to pass custom notes
        "interpro": {
            "entries": sum(interpro_types.values()),
            "new_entries": interpro_new,
            "latest_entry": latest_entry,
            "types": interpro_types,
            "go_terms": interpro2go
        },
        "member_databases": member_databases,
        "proteins": uniprot,
        "structures": {
            "total":
            len(structures),
            "integrated":
            len(integrated_structures),
            "version":
            max(entry["date"] for entry in structures).strftime("%Y-%m-%d")
        },
        "proteomes": {
            "total": len(proteomes),
            "integrated": len(integrated_proteomes),
            "version": uniprot["UniProtKB"]["version"]
        },
        "taxonomy": {
            "total": len(taxa),
            "integrated": len(integrated_taxonomy),
            "version": uniprot["UniProtKB"]["version"]
        },
        "citations": len(pubmed_citations)
    }

    _, version, date = staging_databases["interpro"]
    cur.execute(
        """
        SELECT COUNT(*)
        FROM webfront_release_note
        WHERE version = %s
        """, (version, ))
    n_rows, = cur.fetchone()

    if n_rows:
        cur.execute(
            """
            UPDATE webfront_release_note
            SET content = %s
            WHERE version = %s
            """, (json.dumps(content), version))
    else:
        cur.execute(
            """
            INSERT INTO webfront_release_note
            VALUES (%s, %s, %s)
            """, (version, date, json.dumps(content)))

    con.commit()
    cur.close()
    con.close()

    with open(relfile, "wt") as fh:
        new_integrated = 0
        dbs_integrated = []
        for db in sorted(member_databases.values(), key=lambda x: x["name"]):
            cnt = len(db["recently_integrated"])

            if cnt:
                new_integrated += cnt
                dbs_integrated.append(f"{db['name']} ({cnt})")

        if new_integrated:
            integr_str = (f" integrates {new_integrated} new methods from "
                          f"the {', '.join(dbs_integrated)} databases, and")
        else:
            integr_str = ""

        u_ver = uniprot["UniProtKB"]["version"]
        u_integ = uniprot["UniProtKB"]["integrated_signatures"]
        u_total = uniprot["UniProtKB"]["count"]
        u_cov = round(u_integ / u_total * 100, 1)

        fh.write(f"""\
Title
-----
New releases: InterPro {version} and InterProScan 5.??-{version}

Image: alternate text
---------------------
InterPro: protein sequence analysis & classification

Image: title
------------
InterPro: protein sequence analysis & classification

Summary
-------
InterPro version {version} and InterProScan 5.??-{version} are now available! \
InterPro now features hundreds of new methods integrated \
from partner databases, and InterProScan draws on over \
{sum(interpro_types.values())//1000*1000} entries.

Body
----
<h3>
    <a href="http://www.ebi.ac.uk/interpro/">InterPro version {version}</a>
</h3>

<p>
    <a href="http://www.ebi.ac.uk/interpro/">InterPro {version}</a>\
{integr_str} covers {u_cov}% of UniProt Knowledgebase release {u_ver}. \
It predicts <a href="http://www.geneontology.org/">Gene Ontology</a> \
(GO) terms for over {uniprot2go/1e6:.0f} million UniProt proteins \
via the InterPro2GO pipeline.
</p>

<p>
    The new release includes an update to UniParc (uniparc_match.tar.gz) \
matches to InterPro methods. You can find this on our ftp site: \
<a href="ftp://ftp.ebi.ac.uk/pub/databases/interpro">ftp://ftp.ebi.ac.uk/pub/databases/interpro</a>.
</p>

<p>
    For full details, see <a href="//www.ebi.ac.uk/interpro/release_notes/">the latest InterPro Release Notes</a>.
</p>

<h3>
    <a href="https://github.com/ebi-pf-team/interproscan">InterProScan 5.??-{version}</a>
</h3>

<p>
    InterProScan 5.??-{version} uses data from the newly released InterPro {version}, \
which contains {sum(interpro_types.values()):,} entries. \
You can find the <a href="https://interproscan-docs.readthedocs.io/en/latest/ReleaseNotes.html">full release notes here</a>.
</p>

<p>
    If you need help with InterPro or InterProScan, please contact us using \
<a href="http://www.ebi.ac.uk/support/interpro">our support form</a> - \
your message will reach everyone on the team.
</p>

Meta fields: description
------------------------
We are pleased to announce the release of InterPro {version} \
and InterProScan 5.??-{version}!

Meta fields: tags
-----------------
Protein families, InterProScan, InterPro, Protein, \
protein family, protein motif

URL alias
---------
about/news/service-news/InterPro-{version}
""")

    logger.info("complete")
Example No. 30
def _export_hmms(p_uniprot2matches: str,
                 pro_url: str,
                 dt: DirectoryTree,
                 buffer_size: int = 1000):
    logger.info("counting hits per model")
    signatures = {}
    with Store(p_uniprot2matches) as u2matches:
        cnt = 0
        for entries in u2matches.values():
            for entry_acc, locations in entries.items():
                for loc in locations:
                    if loc["model"] is None:
                        continue  # InterPro entries

                    try:
                        models = signatures[entry_acc]
                    except KeyError:
                        models = signatures[entry_acc] = {}

                    try:
                        models[loc["model"]] += 1
                    except KeyError:
                        models[loc["model"]] = 1

            cnt += 1
            if not cnt % 10000000:
                logger.info(f"{cnt:>12,}")

        logger.info(f"{cnt:>12,}")

    for entry_acc, models in signatures.items():
        # Select the model with the most hits
        model_acc = sorted(models, key=lambda k: (-models[k], k))[0]
        signatures[entry_acc] = model_acc

    logger.info("processing models")
    df = DumpFile(dt.mktemp(), compress=True)
    cnt = 0
    ignored = 0

    iterator = ippro.get_hmms(pro_url, multi_models=True)
    for entry_acc, model_acc, hmm_bytes in iterator:
        try:
            representative_model = signatures[entry_acc]
        except KeyError:
            # Signature without matches, i.e. without representative model
            ignored += 1
            continue

        if model_acc and model_acc != representative_model:
            continue

        hmm_str = gzip.decompress(hmm_bytes).decode("utf-8")
        df.dump((entry_acc, "hmm", hmm_bytes, "application/gzip", None))

        with StringIO(hmm_str) as stream:
            hmm = hmmer.HMMFile(stream)

        df.dump((entry_acc, "logo",
                 json.dumps(hmm.logo("info_content_all",
                                     "hmm")), "application/json", None))

        cnt += 2
        if cnt >= buffer_size:
            df.close()
            yield df.path
            df = DumpFile(dt.mktemp(), compress=True)
            cnt = 0

    df.close()
    yield df.path

    logger.info(f"  {ignored} models ignored")