Example 1
def insert_residues(pro_url: str, stg_url: str, tmpdir: Optional[str] = None):
    dt = DirectoryTree(root=tmpdir)

    logger.info("exporting residues")
    files = ippro.export_residues(pro_url, dt)

    logger.info("inserting residues")
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_proteinresidue")
    cur.execute("""
        CREATE TABLE webfront_proteinresidue
        (
            residue_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            protein_acc VARCHAR(15) NOT NULL,
            entry_acc VARCHAR(25) NOT NULL,
            entry_name VARCHAR(100),
            source_database VARCHAR(10) NOT NULL,
            description VARCHAR(255),
            fragments LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_proteinresidue (
          protein_acc, entry_acc, entry_name, source_database, description,
          fragments
        )
        VALUES (%s, %s, %s, %s, %s, %s)
    """
    with Table(con, sql) as table:
        i = 0
        for protein_acc, entries in merge_dumps(files, replace=True):
            for entry_acc, entry in entries.items():
                for descr, locations in entry["descriptions"].items():
                    locations.sort(key=lambda x: (x[1], x[2]))
                    table.insert((protein_acc, entry_acc, entry["name"],
                                  entry["database"], descr,
                                  jsonify(locations, nullable=False)))

            i += 1
            if not i % 10000000:
                logger.info(f"{i:>15,}")

        logger.info(f"{i:>15,}")
    con.commit()

    logger.info(f"temporary files: {dt.size / 1024 ** 2:.0f} MB")
    dt.remove()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute("""
        CREATE INDEX i_proteinresidue
        ON webfront_proteinresidue (protein_acc)
        """)
    cur.close()
    con.close()
    logger.info("complete")
Example 2
def insert_extra_features(stg_url: str, p_uniprot2features: str):
    logger.info("starting")

    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_proteinfeature")
    cur.execute("""
        CREATE TABLE webfront_proteinfeature
        (
            feature_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            protein_acc VARCHAR(15) NOT NULL,
            entry_acc VARCHAR(25) NOT NULL,
            source_database VARCHAR(10) NOT NULL,
            location_start INT NOT NULL,
            location_end INT NOT NULL,
            sequence_feature VARCHAR(35)
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_proteinfeature (
          protein_acc, entry_acc, source_database, location_start,
          location_end, sequence_feature
        )
        VALUES (%s, %s, %s, %s, %s, %s)
    """
    with Store(p_uniprot2features) as proteins, Table(con, sql) as table:
        i = 0
        for uniprot_acc, entries in proteins.items():
            for entry_acc, info in entries.items():
                for pos_start, pos_end, seq_feature in info["locations"]:
                    table.insert((uniprot_acc, entry_acc, info["database"],
                                  pos_start, pos_end, seq_feature))

            i += 1
            if not i % 10000000:
                logger.info(f"{i:>12,}")

        logger.info(f"{i:>12,}")
    con.commit()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute("""
        CREATE INDEX i_proteinfeature
        ON webfront_proteinfeature (protein_acc)
        """)
    cur.close()
    con.close()
    logger.info("complete")
Example 3
def insert_isoforms(src_entries: str, pro_url: str, stg_url: str):
    entries = loadobj(src_entries)

    con = MySQLdb.connect(**url2dict(stg_url))
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_varsplic")
    cur.execute("""
        CREATE TABLE webfront_varsplic
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            protein_acc VARCHAR(15) NOT NULL,
            length INT(11) NOT NULL,
            sequence LONGTEXT NOT NULL,
            features LONGTEXT
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_varsplic VALUES (%s, %s, %s, %s, %s)
    """
    with Table(con, sql) as table:
        for accession, variant in ippro.get_isoforms(pro_url).items():
            features = {}
            for entry_acc, locations in variant["matches"].items():
                entry = entries[entry_acc]

                features[entry_acc] = {
                    "accession": entry_acc,
                    "integrated": entry.integrated_in,
                    "name": entry.name,
                    "type": entry.type.lower(),
                    "source_database": entry.database,
                    "locations": locations
                }

            table.insert((accession, variant["protein_acc"], variant["length"],
                          variant["sequence"], jsonify(features)))

    con.commit()

    cur = con.cursor()
    cur.execute("""
        CREATE INDEX i_varsplic
        ON webfront_varsplic (protein_acc)
        """)
    cur.close()
    con.close()
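
Many columns store JSON produced by jsonify, whose implementation is not included. A minimal sketch, assuming that empty values are returned as None (stored as SQL NULL) unless nullable=False forces a value for NOT NULL columns:

import json
from typing import Any, Optional


def jsonify(obj: Any, nullable: bool = True) -> Optional[str]:
    # Empty values become NULL unless the column requires a value
    if obj or not nullable:
        return json.dumps(obj)
    return None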
Example 4
def insert_databases(pro_url: str,
                     stg_url: str,
                     version: str,
                     date: str,
                     update_prod: bool = False):
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_database")
    cur.execute("""
        CREATE TABLE webfront_database
        (
            name VARCHAR(10) NOT NULL PRIMARY KEY,
            name_alt VARCHAR(10) NOT NULL,
            name_long VARCHAR(25) NOT NULL,
            description LONGTEXT,
            type ENUM('protein', 'entry', 'feature', 'other') NOT NULL,
            num_entries INTEGER,
            version VARCHAR(20),
            release_date DATETIME,
            prev_version VARCHAR(20),
            prev_release_date DATETIME
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_database 
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 
    """
    with Table(con, sql) as table:
        rows = ippro.get_databases(pro_url, version, date, update=update_prod)
        for row in rows:
            table.insert(row)

    con.commit()
    con.close()
Example 5
def insert_entries(pfam_url: str, stg_url: str, p_entries: str,
                   p_entry2xrefs: str):
    logger.info("fetching Wikipedia data for Pfam entries")
    wiki = pfam.get_wiki(pfam_url)

    logger.info("loading Pfam curation/family details")
    pfam_details = pfam.get_details(pfam_url)

    logger.info("populating webfront_entry")
    entries = loadobj(p_entries)
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_entry")
    cur.execute("""
        CREATE TABLE webfront_entry
        (
            entry_id VARCHAR(10) DEFAULT NULL,
            accession VARCHAR(25) PRIMARY KEY NOT NULL,
            type VARCHAR(50) NOT NULL,
            name LONGTEXT,
            short_name VARCHAR(100),
            source_database VARCHAR(10) NOT NULL,
            member_databases LONGTEXT,
            integrated_id VARCHAR(25),
            go_terms LONGTEXT,
            description LONGTEXT,
            wikipedia LONGTEXT,
            details LONGTEXT,
            literature LONGTEXT,
            hierarchy LONGTEXT,
            cross_references LONGTEXT,
            interactions LONGTEXT,
            pathways LONGTEXT,
            overlaps_with LONGTEXT,
            is_featured TINYINT NOT NULL,
            is_alive TINYINT NOT NULL,
            history LONGTEXT,
            entry_date DATETIME NOT NULL,
            deletion_date DATETIME,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)

    # Count number of structural models per entry
    cur.execute("""
        SELECT accession, COUNT(*)
        FROM webfront_structuralmodel
        GROUP BY accession
        """)
    num_struct_models = dict(cur.fetchall())
    cur.close()

    sql = """
        INSERT INTO webfront_entry
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
          %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """

    with Table(con, sql) as table:
        with DumpFile(p_entry2xrefs) as df:
            for accession, xrefs in df:
                entry = entries[accession]
                counts = reduce(xrefs)
                counts.update({
                    "interactions": len(entry.ppi),
                    "pathways": sum(len(v) for v in entry.pathways.values()),
                    "sets": 1 if entry.clan else 0,
                    "structural_models": num_struct_models.get(accession, 0)
                })

                table.insert(
                    (None, accession, entry.type.lower(),
                     entry.name, entry.short_name, entry.database,
                     jsonify(entry.integrates), entry.integrated_in,
                     jsonify(entry.go_terms), jsonify(entry.description),
                     jsonify(wiki.get(accession)),
                     jsonify(pfam_details.get(accession)),
                     jsonify(entry.literature), jsonify(entry.hierarchy),
                     jsonify(entry.cross_references), jsonify(entry.ppi),
                     jsonify(entry.pathways), jsonify(entry.overlaps_with), 0,
                     0 if entry.is_deleted else 1, jsonify(entry.history),
                     entry.creation_date, entry.deletion_date,
                     jsonify(counts)))

    con.commit()
    con.close()
    logger.info("complete")
Example 6
def insert_proteomes(p_entries: str, p_proteins: str, p_proteomes: str,
                     p_structures: str, p_uniprot2ida: str,
                     p_uniprot2matches: str, p_uniprot2proteome: str,
                     stg_url: str):
    logger.info("preparing data")
    proteomes = loadobj(p_proteomes)
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc in entry["proteins"]:
            try:
                uniprot2pdbe[uniprot_acc].add(pdb_id)
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id}

    # Init all proteomes
    xrefs = {}
    for proteome_id in proteomes:
        xrefs[proteome_id] = {
            "domain_architectures": set(),
            "entries": {},
            "proteins": 0,
            "sets": set(),
            "structures": set(),
            "taxa": set()
        }

    entries = loadobj(p_entries)
    proteins = Store(p_proteins)
    u2ida = Store(p_uniprot2ida)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    logger.info("starting")
    i = 0
    for uniprot_acc, proteome_id in u2proteome.items():
        proteome = xrefs[proteome_id]
        proteome["proteins"] += 1

        info = proteins[uniprot_acc]
        proteome["taxa"].add(info["taxid"])

        try:
            dom_members, dom_arch, dom_arch_id = u2ida[uniprot_acc]
        except KeyError:
            pass
        else:
            proteome["domain_architectures"].add(dom_arch_id)

        for entry_acc in u2matches.get(uniprot_acc, []):
            entry = entries[entry_acc]
            try:
                proteome["entries"][entry.database].add(entry_acc)
            except KeyError:
                proteome["entries"][entry.database] = {entry_acc}

            if entry.clan:
                proteome["sets"].add(entry.clan["accession"])

        try:
            pdb_ids = uniprot2pdbe[uniprot_acc]
        except KeyError:
            pass
        else:
            proteome["structures"] |= pdb_ids

        i += 1
        if not i % 10000000:
            logger.info(f"{i:>12,}")

    logger.info(f"{i:>12,}")

    proteins.close()
    u2ida.close()
    u2matches.close()
    u2proteome.close()

    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_proteome")
    cur.execute("""
        CREATE TABLE webfront_proteome
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            name VARCHAR(215) NOT NULL,
            is_reference TINYINT NOT NULL,
            strain VARCHAR(512),
            assembly VARCHAR(512),
            taxonomy_id VARCHAR(20) NOT NULL,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_proteome VALUES (%s, %s, %s, %s, %s, %s, %s) 
    """
    with Table(con, sql) as table:
        for proteome_id, info in proteomes.items():
            counts = reduce(xrefs[proteome_id])
            counts["entries"]["total"] = sum(counts["entries"].values())
            table.insert(
                (proteome_id, info["name"], 1 if info["is_reference"] else 0,
                 info["strain"], info["assembly"], info["taxon_id"],
                 jsonify(counts)))

    con.commit()
    con.close()

    logger.info("complete")
Example 7
def insert_structures(p_entries: str, p_proteins: str, p_structures: str,
                      p_uniprot2ida: str, p_uniprot2matches: str,
                      p_uniprot2proteome: str, stg_url: str):
    logger.info("preparing data")
    entries = {}
    for entry in loadobj(p_entries).values():
        entries[entry.accession] = (entry.database, entry.clan)

    uniprot2pdbe = {}
    xrefs = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

        xrefs[pdb_id] = {
            "domain_architectures": set(),
            "entries": {},
            "proteomes": set(),
            "proteins": 0,
            "sets": set(),
            "taxa": set()
        }

    proteins = Store(p_proteins)
    u2ida = Store(p_uniprot2ida)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    logger.info("starting")
    i = 0
    for uniprot_acc in sorted(uniprot2pdbe):
        info = proteins[uniprot_acc]

        try:
            dom_members, dom_arch, dom_arch_id = u2ida[uniprot_acc]
        except KeyError:
            dom_arch_id = None

        proteome_id = u2proteome.get(uniprot_acc)
        matches = u2matches.get(uniprot_acc, {})

        for pdb_id, chains in uniprot2pdbe[uniprot_acc].items():
            _xrefs = xrefs[pdb_id]

            if dom_arch_id:
                _xrefs["domain_architectures"].add(dom_arch_id)

            if proteome_id:
                _xrefs["proteomes"].add(proteome_id)

            _xrefs["proteins"] += 1
            _xrefs["taxa"].add(info["taxid"])

            for entry_acc, locations in matches.items():
                database, clan = entries[entry_acc]

                for chain_id, segments in chains.items():
                    if overlaps_pdb_chain(locations, segments):
                        try:
                            _xrefs["entries"][database].add(entry_acc)
                        except KeyError:
                            _xrefs["entries"][database] = {entry_acc}

                        if clan:
                            _xrefs["sets"].add(clan["accession"])

                        break  # Skip other chains

        i += 1
        if not i % 10000:
            logger.info(f"{i:>12,}")

    logger.info(f"{i:>12,}")

    proteins.close()
    u2ida.close()
    u2matches.close()
    u2proteome.close()

    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_structure")
    cur.execute("""
        CREATE TABLE webfront_structure
        (
            accession VARCHAR(4) PRIMARY KEY NOT NULL,
            name VARCHAR(512) NOT NULL,
            source_database VARCHAR(10) NOT NULL,
            experiment_type VARCHAR(16) NOT NULL,
            release_date DATETIME NOT NULL,
            resolution FLOAT,
            literature LONGTEXT,
            chains LONGTEXT NOT NULL,
            proteins LONGTEXT NOT NULL,
            secondary_structures LONGTEXT,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_structure 
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 
    """
    with Table(con, sql) as table:
        for pdb_id, info in loadobj(p_structures).items():
            counts = reduce(xrefs[pdb_id])
            counts["entries"]["total"] = sum(counts["entries"].values())
            table.insert((
                pdb_id,
                info["name"],
                "pdb",
                info["evidence"],
                info["date"],
                info["resolution"],
                jsonify(info["citations"]),
                # Sorted list of unique chain IDs (e.g. 'A', 'B', ...)
                jsonify(sorted({
                    chain_id
                    for chains in info["proteins"].values()
                    for chain_id in chains
                }),
                        nullable=False),
                jsonify(info["proteins"], nullable=False),
                jsonify(info["secondary_structures"]),
                jsonify(counts)))

    con.commit()
    con.close()

    logger.info("complete")
Example 8
def insert_proteins(p_entries: str, p_proteins: str, p_structures: str,
                    p_taxonomy: str, p_uniprot2comments: str,
                    p_uniprot2name: str, p_uniprot2evidences: str,
                    p_uniprot2ida: str, p_uniprot2matches: str,
                    p_uniprot2proteome: str, p_uniprot2sequence: str,
                    pro_url: str, stg_url: str):
    logger.info("loading CATH/SCOP domains")
    uniprot2cath = pdbe.get_cath_domains(pro_url)
    uniprot2scop = pdbe.get_scop_domains(pro_url)

    logger.info("preparing data")
    proteins = Store(p_proteins)
    u2comments = Store(p_uniprot2comments)
    u2descriptions = Store(p_uniprot2name)
    u2evidences = Store(p_uniprot2evidences)
    u2ida = Store(p_uniprot2ida)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)
    u2sequence = Store(p_uniprot2sequence)

    taxonomy = {}
    for taxid, info in loadobj(p_taxonomy).items():
        taxonomy[taxid] = jsonify({
            "taxId": taxid,
            "scientificName": info["sci_name"],
            "fullName": info["full_name"]
        })

    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc in entry["proteins"]:
            try:
                uniprot2pdbe[uniprot_acc].append(pdb_id)
            except KeyError:
                uniprot2pdbe[uniprot_acc] = [pdb_id]

    logger.info("counting proteins/IDA")
    ida_count = {}
    for dom_members, dom_arch, dom_arch_id in u2ida.values():
        try:
            ida_count[dom_arch_id] += 1
        except KeyError:
            ida_count[dom_arch_id] = 1

    logger.info("inserting proteins")
    entries = loadobj(p_entries)
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("""
        SELECT protein_acc, COUNT(*)
        FROM webfront_varsplic
        GROUP BY protein_acc
        """)
    isoforms = dict(cur.fetchall())

    cur.execute("DROP TABLE IF EXISTS webfront_protein")
    cur.execute("""
        CREATE TABLE webfront_protein
        (
            accession VARCHAR(15) PRIMARY KEY NOT NULL,
            identifier VARCHAR(16) NOT NULL,
            organism LONGTEXT NOT NULL,
            name VARCHAR(255) NOT NULL,
            description LONGTEXT,
            sequence LONGBLOB NOT NULL,
            length INT(11) NOT NULL,
            proteome VARCHAR(20),
            gene VARCHAR(70),
            go_terms LONGTEXT,
            evidence_code INT(11) NOT NULL,
            source_database VARCHAR(10) NOT NULL,
            is_fragment TINYINT NOT NULL,
            structure LONGTEXT,
            tax_id VARCHAR(20) NOT NULL,
            ida_id VARCHAR(40),
            ida TEXT,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    i = 0
    sql = """
        INSERT into webfront_protein
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    with Table(con, sql) as table:
        for uniprot_acc, protein_info in proteins.items():
            taxid = protein_info["taxid"]

            try:
                taxon = taxonomy[taxid]
            except KeyError:
                table.close()
                con.close()
                raise RuntimeError(f"{uniprot_acc}: invalid taxon {taxid}")

            try:
                name = u2descriptions[uniprot_acc]
            except KeyError:
                table.close()
                con.close()
                raise RuntimeError(f"{uniprot_acc}: missing name")

            try:
                evidence, gene = u2evidences[uniprot_acc]
            except KeyError:
                table.close()
                con.close()
                raise RuntimeError(f"{uniprot_acc}: missing evidence")

            try:
                sequence = u2sequence[uniprot_acc]
            except KeyError:
                table.close()
                con.close()
                raise RuntimeError(f"{uniprot_acc}: missing sequence")

            proteome_id = u2proteome.get(uniprot_acc)

            clans = []
            databases = {}
            go_terms = {}
            for entry_acc in u2matches.get(uniprot_acc, []):
                entry = entries[entry_acc]

                try:
                    databases[entry.database] += 1
                except KeyError:
                    databases[entry.database] = 1

                if entry.clan:
                    clans.append(entry.clan["accession"])

                for term in entry.go_terms:
                    go_terms[term["identifier"]] = term

            protein_structures = {}
            domains = uniprot2cath.get(uniprot_acc)
            if domains:
                protein_structures["cath"] = {}

                for dom in domains.values():
                    dom_id = dom["id"]

                    protein_structures["cath"][dom_id] = {
                        "domain_id": dom["superfamily"]["id"],
                        "coordinates": dom["locations"]
                    }

            domains = uniprot2scop.get(uniprot_acc)
            if domains:
                protein_structures["scop"] = {}

                for dom in domains.values():
                    dom_id = dom["id"]

                    protein_structures["scop"][dom_id] = {
                        "domain_id": dom["superfamily"]["id"],
                        "coordinates": dom["locations"]
                    }

            try:
                dom_members, dom_arch, dom_arch_id = u2ida[uniprot_acc]
            except KeyError:
                dom_arch = dom_arch_id = None
                dom_count = 0
            else:
                dom_count = ida_count[dom_arch_id]

            table.insert(
                (uniprot_acc, protein_info["identifier"], taxon, name,
                 jsonify(u2comments.get(uniprot_acc)),
                 gzip.compress(sequence.encode("utf-8")),
                 protein_info["length"], proteome_id, gene,
                 jsonify(list(go_terms.values())), evidence,
                 "reviewed" if protein_info["reviewed"] else "unreviewed",
                 1 if protein_info["fragment"] else 0,
                 jsonify(protein_structures), protein_info["taxid"],
                 dom_arch_id, dom_arch,
                 jsonify({
                     "domain_architectures": dom_count,
                     "entries": databases,
                     "isoforms": isoforms.get(uniprot_acc, 0),
                     "proteomes": 1 if proteome_id else 0,
                     "sets": len(set(clans)),
                     "structures": len(uniprot2pdbe.get(uniprot_acc, [])),
                     "taxa": 1
                 })))

            i += 1
            if not i % 10000000:
                logger.info(f"{i:>12,}")

        logger.info(f"{i:>12,}")

    con.commit()

    proteins.close()
    u2comments.close()
    u2descriptions.close()
    u2evidences.close()
    u2ida.close()
    u2matches.close()
    u2proteome.close()
    u2sequence.close()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute("""
        CREATE UNIQUE INDEX ui_protein_identifier
        ON webfront_protein (identifier)
        """)
    cur.execute("""
        CREATE INDEX i_protein_proteome
        ON webfront_protein (proteome)
        """)
    cur.execute("""
        CREATE INDEX i_protein_database
        ON webfront_protein (source_database)
        """)
    cur.execute("""
        CREATE INDEX i_protein_taxon
        ON webfront_protein (tax_id)
        """)
    cur.execute("""
        CREATE INDEX i_protein_ida
        ON webfront_protein (ida_id)
        """)
    cur.execute("""
        CREATE INDEX i_protein_fragment
        ON webfront_protein (is_fragment)
        """)
    cur.close()
    con.close()

    logger.info("complete")
Example 9
def insert_taxonomy(p_entries: str,
                    p_proteins: str,
                    p_structures: str,
                    p_taxonomy: str,
                    p_uniprot2matches: str,
                    p_uniprot2proteome: str,
                    stg_url: str,
                    p_interpro2taxonomy: str,
                    tmpdir: Optional[str] = None):
    logger.info("preparing data")
    dt = DirectoryTree(tmpdir)
    entries = loadobj(p_entries)
    taxonomy = loadobj(p_taxonomy)
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    logger.info("starting")
    i = 0
    xrefs = {}
    files = []
    for uniprot_acc, info in proteins.items():
        taxon_id = info["taxid"]

        try:
            taxon = xrefs[taxon_id]
        except KeyError:
            taxon = xrefs[taxon_id] = init_xrefs()

        try:
            proteome_id = u2proteome[uniprot_acc]
        except KeyError:
            pass
        else:
            taxon["proteomes"].add(proteome_id)

        taxon["proteins"]["all"] += 1

        protein_structures = uniprot2pdbe.get(uniprot_acc, {})

        # Add structures to taxon, regardless of entry matches
        taxon["structures"]["all"] |= set(protein_structures.keys())

        databases = set()
        for entry_acc, locations in u2matches.get(uniprot_acc, {}).items():
            entry = entries[entry_acc]
            database = entry.database

            try:
                taxon["entries"][database].add(entry_acc)
            except KeyError:
                taxon["entries"][database] = {entry_acc}

            if database not in databases:
                # Counting the protein *once* per database
                databases.add(database)
                try:
                    taxon["proteins"]["databases"][database] += 1
                except KeyError:
                    taxon["proteins"]["databases"][database] = 1

            try:
                taxon["proteins"]["entries"][entry_acc] += 1
            except KeyError:
                taxon["proteins"]["entries"][entry_acc] = 1

            for pdb_id, chains in protein_structures.items():
                for chain_id, segments in chains.items():
                    if overlaps_pdb_chain(locations, segments):
                        try:
                            taxon["structures"]["entries"][entry_acc].add(
                                pdb_id)
                        except KeyError:
                            taxon["structures"]["entries"][entry_acc] = {
                                pdb_id
                            }

                        break  # Skip other chains

        i += 1
        if not i % 1000000:
            output = dt.mktemp()
            dump_xrefs(xrefs, taxonomy, output)
            files.append(output)
            xrefs = {}

            if not i % 10000000:
                logger.info(f"{i:>12,}")

    if xrefs:
        output = dt.mktemp()
        dump_xrefs(xrefs, taxonomy, output)
        files.append(output)
        xrefs = {}

    logger.info(f"{i:>12,}")
    logger.info(f"temporary files: "
                f"{sum(map(os.path.getsize, files))/1024/1024:.0f} MB")

    proteins.close()
    u2matches.close()
    u2proteome.close()

    logger.info("populating taxonomy tables")
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomy")
    cur.execute("""
        CREATE TABLE webfront_taxonomy
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            scientific_name VARCHAR(255) NOT NULL,
            full_name VARCHAR(512) NOT NULL,
            lineage LONGTEXT NOT NULL,
            parent_id VARCHAR(20),
            rank VARCHAR(20) NOT NULL,
            children LONGTEXT,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentry")
    cur.execute("""
        CREATE TABLE webfront_taxonomyperentry
        (
          id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
          tax_id VARCHAR(20) NOT NULL,
          entry_acc VARCHAR(25) NOT NULL,
          counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentrydb")
    cur.execute("""
        CREATE TABLE webfront_taxonomyperentrydb
        (
          id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
          tax_id VARCHAR(20) NOT NULL,
          source_database VARCHAR(10) NOT NULL,
          counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    table = Table(con,
                  query="""
        INSERT INTO webfront_taxonomy VALUES (%s, %s, %s, %s, %s, %s, %s, %s) 
    """)
    per_entry = Table(con,
                      query="""
        INSERT INTO webfront_taxonomyperentry (tax_id,entry_acc,counts)
        VALUES (%s, %s, %s) 
    """)
    per_database = Table(con,
                         query="""
        INSERT INTO webfront_taxonomyperentrydb (tax_id,source_database,counts)
        VALUES (%s, %s, %s) 
    """)

    with DumpFile(p_interpro2taxonomy, compress=True) as interpro2taxonomy:
        interpro_entries = {
            entry.accession
            for entry in entries.values()
            if entry.database == "interpro" and not entry.is_deleted
        }

        i = 0
        for taxon_id, taxon_xrefs in merge_dumps(files):
            taxon = taxonomy[taxon_id]

            protein_counts = taxon_xrefs.pop("proteins")
            structure_counts = taxon_xrefs.pop("structures")
            counts = reduce(taxon_xrefs)

            # Add total protein count (not grouped by database/entry)
            counts["proteins"] = protein_counts["all"]

            # Add total structure count
            counts["structures"] = len(structure_counts["all"])

            # Add total entry count (not grouped by database)
            counts["entries"]["total"] = sum(counts["entries"].values())

            table.insert(
                (taxon_id, taxon["sci_name"], taxon["full_name"],
                 f" {' '.join(taxon['lineage'])} ", taxon["parent"],
                 taxon["rank"], jsonify(taxon["children"]), jsonify(counts)))

            # Remove the 'entries' property
            # (not needed for webfront_taxonomyperentry)
            entry_counts = counts.pop("entries")

            database_structures = {}
            for entry_acc, count in protein_counts["entries"].items():
                if entry_acc in interpro_entries:
                    interpro2taxonomy.dump((entry_acc, taxon_id, count))

                counts["proteins"] = count

                try:
                    entry_structures = structure_counts["entries"][entry_acc]
                except KeyError:
                    counts["structures"] = 0
                else:
                    counts["structures"] = len(entry_structures)

                    database = entries[entry_acc].database
                    try:
                        database_structures[database] |= entry_structures
                    except KeyError:
                        database_structures[database] = entry_structures.copy()
                finally:
                    per_entry.insert((taxon_id, entry_acc, jsonify(counts)))

            for database, count in protein_counts["databases"].items():
                counts.update({
                    "entries": entry_counts[database],
                    "proteins": count,
                    "structures": len(database_structures.get(database, []))
                })
                per_database.insert((taxon_id, database, jsonify(counts)))

            i += 1
            if not i % 100000:
                logger.info(f"{i:>12,}")

        logger.info(f"{i:>12,}")

    table.close()
    per_entry.close()
    per_database.close()
    con.commit()

    dt.remove()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentry_tax
        ON webfront_taxonomyperentry (tax_id)
        """)
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentry_entry
        ON webfront_taxonomyperentry (entry_acc)
        """)
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentrydb_tax
        ON webfront_taxonomyperentrydb (tax_id)
        """)
    cur.execute("""
        CREATE INDEX i_webfront_taxonomyperentrydb_database
        ON webfront_taxonomyperentrydb (source_database)
        """)
    cur.close()
    con.close()
    logger.info("complete")
Example 10
def insert_clans(stg_url: str, p_alignments: str, p_clans: str, p_entries: str,
                 p_entry2xrefs: str, **kwargs):
    max_xrefs = kwargs.get("max_xrefs", 1000000)
    tmpdir = kwargs.get("tmpdir")

    logger.info("aggregating clan cross-references")
    dt = DirectoryTree(tmpdir)
    entry2clan = {}
    for entry_acc, entry in loadobj(p_entries).items():
        if entry.clan:
            entry2clan[entry_acc] = entry.clan["accession"]

    clans = {}
    files = []
    num_xrefs = 0
    with DumpFile(p_entry2xrefs) as df:
        for entry_acc, entry_xrefs in df:
            try:
                clan_acc = entry2clan[entry_acc]
            except KeyError:
                continue

            try:
                clan_xrefs = clans[clan_acc]
            except KeyError:
                clan_xrefs = clans[clan_acc] = {}

            # We do not need the number of matches
            del entry_xrefs["matches"]

            cnt_before = sum(map(len, clan_xrefs.values()))
            deepupdate(entry_xrefs, clan_xrefs)
            cnt_after = sum(map(len, clan_xrefs.values()))
            num_xrefs += cnt_after - cnt_before

            if num_xrefs >= max_xrefs:
                file = dt.mktemp()
                with DumpFile(file, compress=True) as df2:
                    for clan_acc in sorted(clans):
                        df2.dump((clan_acc, clans[clan_acc]))

                files.append(file)
                clans = {}
                num_xrefs = 0

    file = dt.mktemp()
    with DumpFile(file, compress=True) as df2:
        for clan_acc in sorted(clans):
            df2.dump((clan_acc, clans[clan_acc]))

    files.append(file)

    logger.info("inserting clans")
    clans = loadobj(p_clans)
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_set")
    cur.execute("""
        CREATE TABLE webfront_set
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            name VARCHAR(400),
            description TEXT,
            source_database VARCHAR(10) NOT NULL,
            relationships LONGTEXT NOT NULL,
            authors TEXT,
            literature TEXT,
            counts LONGTEXT DEFAULT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_set
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """
    with Table(con, sql) as table:
        for clan_acc, xrefs in merge_dumps(files):
            clan = clans[clan_acc]
            counts = reduce(xrefs)
            counts["entries"] = {
                clan["database"]: len(clan["members"]),
                "total": len(clan["members"])
            }

            table.insert(
                (clan_acc, clan["name"], clan["description"], clan["database"],
                 jsonify(clan["relationships"],
                         nullable=False), jsonify(clan.get("authors")),
                 jsonify(clan.get("literature")), jsonify(counts)))

    logger.info(f"temporary files: {dt.size / 1024 / 1024:.0f} MB")
    dt.remove()

    logger.info("inserting alignments")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_alignment")
    cur.execute("""
        CREATE TABLE webfront_alignment
        (
            id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            set_acc VARCHAR(20) NOT NULL,
            entry_acc VARCHAR(25) NOT NULL,
            target_acc VARCHAR(25) NOT NULL,
            target_set_acc VARCHAR(20),
            score DOUBLE NOT NULL,
            seq_length MEDIUMINT NOT NULL,
            domains TEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """)
    cur.close()

    sql = """
        INSERT INTO webfront_alignment (
            set_acc, entry_acc, target_acc, target_set_acc, score, 
            seq_length, domains
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
    with DumpFile(p_alignments) as df, Table(con, sql) as table:
        for alignments in df:
            for aln in alignments:
                table.insert(aln)

    con.commit()
    con.close()

    logger.info("complete")
Example 11
def export_entries(url: str, p_metacyc: str, p_clans: str,
                   p_proteins: str, p_structures: str,
                   p_uniprot2matches: str, p_uniprot2proteome: str,
                   p_uniprot2ida: str, p_entry2xrefs: str, p_entries: str,
                   **kwargs):
    min_overlap = kwargs.get("overlap", 0.2)
    processes = kwargs.get("processes", 1)
    min_similarity = kwargs.get("similarity", 0.75)
    tmpdir = kwargs.get("tmpdir")

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    entries = {}
    logger.info("loading active InterPro entries")
    for entry in _get_interpro_entries(cur):
        entries[entry.accession] = entry

    logger.info("enriching entries with IntAct data")
    for accession, interactions in intact.get_interactions(cur).items():
        try:
            entry = entries[accession]
        except KeyError:
            continue
        else:
            entry.ppi = interactions

    logger.info("loading deleted InterPro entries")
    for entry in _get_retired_interpro_entries(cur):
        if entry.accession in entries:
            cur.close()
            con.close()
            raise RuntimeError(f"entry cannot be active "
                               f"and deleted {entry.accession}")

        entries[entry.accession] = entry

    logger.info("loading member database signatures")
    for entry in _get_signatures(cur):
        if entry.integrated_in and entry.integrated_in not in entries:
            cur.close()
            con.close()
            raise RuntimeError(f"{entry.accession} integrated "
                               f"in missing entry ({entry.integrated_in})")

        entries[entry.accession] = entry

    logger.info("loading past entry names")
    past_names = _get_name_history(cur)

    logger.info("loading past signature integrations")
    past_integrations = _get_integration_history(cur)

    logger.info("loading ENZYME")
    u2enzyme = uniprot.get_swissprot2enzyme(cur)

    logger.info("loading Reactome pathways")
    u2reactome = uniprot.get_swissprot2reactome(cur)
    cur.close()
    con.close()

    logger.info("loading MetaCyc pathways")
    ec2metacyc = metacyc.get_ec2pathways(p_metacyc)

    # Updating entry history
    for entry in entries.values():
        try:
            names = past_names[entry.accession]
        except KeyError:
            pass
        else:
            entry.history["names"] = names

        try:
            signatures = past_integrations[entry.accession]
        except KeyError:
            pass
        else:
            entry.history["signatures"] = signatures

    # Updating entry clan info
    for clan in loadobj(p_clans).values():
        for entry_acc, score, seq_length in clan["members"]:
            try:
                entry = entries[entry_acc]
            except KeyError:
                continue
            else:
                entry.clan = {
                    "accession": clan["accession"],
                    "name": clan["name"]
                }

    inqueue = Queue(maxsize=processes)
    outqueue = Queue()
    workers = []
    for _ in range(max(1, processes - 1)):
        dt = DirectoryTree(tmpdir)
        p = Process(target=_process_proteins,
                    args=(inqueue, entries, min_overlap, dt, outqueue))
        p.start()
        workers.append((p, dt))

    logger.info("processing")
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)
    i = 0
    for uniprot_acc, matches in u2matches.items():
        inqueue.put((
            uniprot_acc,
            proteins[uniprot_acc],
            matches,
            u2proteome.get(uniprot_acc),
            uniprot2pdbe.get(uniprot_acc, {}),
            set(u2enzyme.get(uniprot_acc, [])),
            set(u2reactome.get(uniprot_acc, []))
        ))

        i += 1
        if not i % 10000000:
            logger.info(f"{i:>15,}")

    proteins.close()
    u2matches.close()
    u2proteome.close()
    logger.info(f"{i:>15,}")

    # Send sentinel
    for _ in workers:
        inqueue.put(None)

    # Merge results from workers
    logger.info("exporting domain architectures")
    entries_with_xrefs = set()
    xref_files = []
    entry_counts = {}
    entry_intersections = {}
    interpro2enzyme = {}
    interpro2reactome = {}
    with Store(p_uniprot2ida, u2matches.get_keys(), tmpdir) as u2ida:
        for _ in workers:
            obj = outqueue.get()
            xref_files.append(obj[0])                               # str
            entries_with_xrefs |= obj[1]                            # set
            ida_file = obj[2]                                       # str
            deepupdate(obj[3], entry_counts, replace=False)         # dict
            deepupdate(obj[4], entry_intersections, replace=False)  # dict
            deepupdate(obj[5], interpro2enzyme)                     # dict
            deepupdate(obj[6], interpro2reactome)                   # dict

            with DumpFile(ida_file) as df:
                i = 0
                for uniprot_acc, dom_members, dom_str, dom_id in df:
                    u2ida[uniprot_acc] = (
                        dom_members,
                        dom_str,
                        dom_id
                    )
                    i += 1

                    if not i % 1000:
                        u2ida.sync()

            u2ida.sync()

        size = u2ida.merge(processes=processes)

    # Adding empty EntryXrefs objects for entries without xrefs
    xref_files.append(workers[0][1].mktemp())
    with DumpFile(xref_files[-1], compress=True) as df:
        for entry_acc in sorted(set(entries.keys()) - entries_with_xrefs):
            df.dump((entry_acc, EntryXrefs().asdict()))

    logger.info("exporting cross-references")
    with DumpFile(p_entry2xrefs, compress=True) as df:
        for entry_acc, xrefs in merge_dumps(xref_files):
            df.dump((entry_acc, xrefs))

            entry = entries[entry_acc]

            # Reactome pathways
            if entry_acc in interpro2reactome:
                pathways = interpro2reactome[entry_acc]
                entry.pathways["reactome"] = [
                    dict(zip(("id", "name"), pthw))
                    for pthw in sorted(pathways)
                ]

            # EC numbers
            if entry_acc in interpro2enzyme:
                ecnos = sorted(interpro2enzyme[entry_acc])
                entry.cross_references["ec"] = ecnos

                # MetaCyc pathways
                pathways = set()
                for ecno in ecnos:
                    pathways |= set(ec2metacyc.get(ecno, []))

                if pathways:
                    entry.pathways["metacyc"] = [
                        dict(zip(("id", "name"), pthw))
                        for pthw in sorted(pathways)
                    ]

    for p, dt in workers:
        size += dt.size
        dt.remove()

    logger.info(f"temporary files: {size / 1024 / 1024:.0f} MB")

    logger.info("calculating overlapping relationships")
    supfam = "homologous_superfamily"
    types = (supfam, "domain", "family", "repeat")
    for entry_acc, overlaps in entry_intersections.items():
        entry1 = entries[entry_acc]
        entry_cnt = entry_counts[entry_acc]
        type1 = entry1.type.lower()

        for other_acc, overlap_counts in overlaps.items():
            o1 = overlap_counts["1"]
            o2 = overlap_counts["2"]
            other_cnt = entry_counts[other_acc]

            # Independent coefficients
            coef1 = o1 / (entry_cnt + other_cnt - o1)
            coef2 = o2 / (entry_cnt + other_cnt - o2)

            # Final coefficient: average of independent coefficients
            coef = (coef1 + coef2) * 0.5

            # Containment indices
            c1 = o1 / entry_cnt
            c2 = o2 / other_cnt

            if all([item < min_similarity for item in (coef, c1, c2)]):
                continue

            # Entries are similar enough
            entry2 = entries[other_acc]
            type2 = entry2.type.lower()
            if ((type1 == supfam and type2 in types)
                    or (type1 in types and type2 == supfam)):
                # e1 -> e2 relationship
                entry1.overlaps_with.append({
                    "accession": other_acc,
                    "name": entry2.name,
                    "type": type2
                })

                # e2 -> e1 relationship
                entry2.overlaps_with.append({
                    "accession": entry_acc,
                    "name": entry1.name,
                    "type": type1
                })

    dumpobj(p_entries, entries)

    logger.info("populating ENTRY2PATHWAY")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("TRUNCATE TABLE INTERPRO.ENTRY2PATHWAY")
    cur.close()
    sql = "INSERT INTO INTERPRO.ENTRY2PATHWAY VALUES (:1, :2, :3, :4)"
    with Table(con, sql) as table:
        for e in entries.values():
            for database, pathways in e.pathways.items():
                code = PATHWAY_DATABASE[database]
                for pthw in pathways:
                    table.insert((
                        e.accession,
                        code,
                        pthw["id"],
                        pthw["name"]
                    ))

    con.commit()
    con.close()
    logger.info("complete")