def export_proteomes(url: str, output: str):
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute(
        """
        SELECT
          P.UPID, P.PROTEOME_NAME, P.IS_REFERENCE, P.GC_SET_ACC,
          TO_CHAR(P.PROTEOME_TAXID), SN.NAME
        FROM SPTR.PROTEOME@SWPREAD P
        LEFT OUTER JOIN TAXONOMY.SPTR_STRAIN@SWPREAD S
          ON P.PROTEOME_TAXID = S.TAX_ID
        LEFT OUTER JOIN TAXONOMY.SPTR_STRAIN_NAME@SWPREAD SN
          ON S.STRAIN_ID = SN.STRAIN_ID
        WHERE P.IS_REFERENCE = 1
        """
    )

    proteomes = {}
    for row in cur:
        upid = row[0]

        if upid in proteomes:
            continue

        proteomes[upid] = {
            "id": upid,
            "name": row[1],
            "is_reference": row[2] != 0,
            "assembly": row[3],
            "taxon_id": row[4],
            "strain": row[5]
        }

    cur.close()
    con.close()

    dumpobj(output, proteomes)
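# The export functions in this module rely on `dumpobj`/`loadobj` helpers that
# are defined elsewhere in the project and not shown in this excerpt. Below is
# a minimal sketch of the assumed behaviour (pickle an object to disk, with
# compression, and read it back); the real helpers may differ in signature and
# on-disk format, so these `_sketch` names are purely illustrative.
import pickle
import zlib


def dumpobj_sketch(path: str, obj):
    # Serialize `obj` with pickle and compress it before writing to `path`
    with open(path, "wb") as fh:
        fh.write(zlib.compress(pickle.dumps(obj)))


def loadobj_sketch(path: str):
    # Read, decompress, and unpickle an object written by `dumpobj_sketch`
    with open(path, "rb") as fh:
        return pickle.loads(zlib.decompress(fh.read()))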
def export_taxonomy(url: str, output: str):
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute(
        """
        SELECT TO_CHAR(TAX_ID), TO_CHAR(PARENT_ID), SCIENTIFIC_NAME,
               FULL_NAME, RANK
        FROM INTERPRO.ETAXI
        """
    )

    taxonomy = {}
    for row in cur:
        taxon_id = row[0]

        taxonomy[taxon_id] = {
            "id": taxon_id,
            "parent": row[1],
            "sci_name": row[2],
            "full_name": row[3],
            "rank": row[4],
            "children": set(),
            "lineage": [taxon_id]
        }

    cur.close()
    con.close()

    for taxon_id, taxon in taxonomy.items():
        node_id = taxon_id
        parent_id = taxon["parent"]

        # Traverse the lineage from child to parent
        while parent_id is not None:
            taxon["lineage"].append(parent_id)
            taxonomy[parent_id]["children"].add(node_id)

            # Move to the parent node
            node_id = parent_id
            parent_id = taxonomy[parent_id]["parent"]

    for taxon_id, info in taxonomy.items():
        info["children"] = list(info["children"])
        info["lineage"] = list(map(str, reversed(info["lineage"])))

    dumpobj(output, taxonomy)
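# Illustration of the lineage/children traversal performed in export_taxonomy,
# on a tiny hand-made taxonomy (hypothetical IDs, not real NCBI taxa). Each
# taxon walks up to the root, collecting ancestors in its lineage and
# registering itself (or the intermediate node) as a child of each ancestor,
# so `children` ends up holding direct children only.
def _build_lineage_example():
    taxonomy = {
        "1": {"parent": None, "children": set(), "lineage": ["1"]},
        "2": {"parent": "1", "children": set(), "lineage": ["2"]},
        "3": {"parent": "2", "children": set(), "lineage": ["3"]},
    }

    for taxon_id, taxon in taxonomy.items():
        node_id = taxon_id
        parent_id = taxon["parent"]
        while parent_id is not None:
            taxon["lineage"].append(parent_id)
            taxonomy[parent_id]["children"].add(node_id)
            node_id = parent_id
            parent_id = taxonomy[parent_id]["parent"]

    # Lineages are stored child-to-root, then reversed for the final output
    assert list(reversed(taxonomy["3"]["lineage"])) == ["1", "2", "3"]
    assert taxonomy["1"]["children"] == {"2"}
    assert taxonomy["2"]["children"] == {"3"}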
def export_structures(url: str, output: str):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    # Retrieve citations
    cur.execute(
        """
        SELECT
          E.ID, C.ID, C.TITLE, C.JOURNAL_ABBREV, C.JOURNAL_VOLUME,
          C.PAGE_FIRST, C.PAGE_LAST, C.YEAR, C.DATABASE_ID_PUBMED,
          C.DATABASE_ID_DOI, C.CITATION_TYPE, A.NAME
        FROM ENTRY@PDBE_LIVE E
        INNER JOIN CITATION@PDBE_LIVE C
          ON E.ID = C.ENTRY_ID
        INNER JOIN CITATION_AUTHOR@PDBE_LIVE A
          ON C.ENTRY_ID = A.ENTRY_ID AND C.ID = A.CITATION_ID
        ORDER BY E.ID, C.ID, A.ORDINAL
        """
    )

    entry_citations = {}
    for row in cur:
        pdb_id = row[0]

        try:
            entry = entry_citations[pdb_id]
        except KeyError:
            entry = entry_citations[pdb_id] = {}

        pub_id = row[1]
        try:
            pub = entry[pub_id]
        except KeyError:
            if row[5] and row[6]:
                pages = f"{row[5]}-{row[6]}"
            elif row[5]:
                pages = str(row[5])
            elif row[6]:
                pages = str(row[6])
            else:
                pages = None

            pub = entry[pub_id] = {
                "PMID": int(row[8]) if row[8] is not None else None,
                "volume": row[4],
                "year": row[7],
                "title": row[2],
                "raw_pages": pages,
                "ISO_journal": row[3],
                "authors": [],
                "DOI_URL": row[9],
                "type": row[10]
            }

        pub["authors"].append(row[11])

    # Retrieve secondary structures
    cur.execute(
        """
        SELECT SS.ENTRY_ID, SS.STRUCT_ASYM_ID, SS.ELEMENT_TYPE,
          R1.UNP_SEQ_ID AS POS_FROM, R1.CHEM_COMP_ID AS RES_FROM,
          R2.UNP_SEQ_ID AS POS_TO, R2.CHEM_COMP_ID AS RES_TO
        FROM (
          SELECT ENTRY_ID, STRUCT_ASYM_ID, ELEMENT_TYPE,
            RESIDUE_BEG_ID, RESIDUE_END_ID
          FROM PDBE.SS_HELIX@PDBE_LIVE
          UNION ALL
          SELECT ENTRY_ID, STRUCT_ASYM_ID, ELEMENT_TYPE,
            RESIDUE_BEG_ID, RESIDUE_END_ID
          FROM PDBE.SS_STRAND@PDBE_LIVE
        ) SS
        INNER JOIN SIFTS_ADMIN.SIFTS_XREF_RESIDUE@PDBE_LIVE R1
          ON (SS.ENTRY_ID=R1.ENTRY_ID
              AND SS.STRUCT_ASYM_ID=R1.STRUCT_ASYM_ID
              AND SS.RESIDUE_BEG_ID=R1.ID
              AND R1.CANONICAL_ACC=1
              AND R1.OBSERVED='Y'
              AND R1.UNP_SEQ_ID IS NOT NULL)
        INNER JOIN SIFTS_ADMIN.SIFTS_XREF_RESIDUE@PDBE_LIVE R2
          ON (SS.ENTRY_ID=R2.ENTRY_ID
              AND SS.STRUCT_ASYM_ID=R2.STRUCT_ASYM_ID
              AND SS.RESIDUE_END_ID=R2.ID
              AND R2.CANONICAL_ACC=1
              AND R2.OBSERVED='Y'
              AND R2.UNP_SEQ_ID IS NOT NULL)
        """
    )

    entry_sec_structures = {}
    for row in cur:
        pdb_id = row[0]
        try:
            chains = entry_sec_structures[pdb_id]
        except KeyError:
            chains = entry_sec_structures[pdb_id] = {}

        chain_id = row[1]
        try:
            chain = chains[chain_id]
        except KeyError:
            chain = chains[chain_id] = {}

        elem_type = row[2]
        try:
            fragments = chain[elem_type]
        except KeyError:
            fragments = chain[elem_type] = []

        fragments.append({
            # Add the type of secondary structure to the fragment
            "shape": elem_type,
            "start": row[3],
            "end": row[5],
            # "res_start": row[4],
            # "res_end": row[6],
        })

    # Sort chains by fragment
    for pdb_id, dict_chains in entry_sec_structures.items():
        list_chains = []

        for chain_id in sorted(dict_chains):
            locations = []

            for elem_type, fragments in dict_chains[chain_id].items():
                fragments.sort(key=repr_fragment)
                locations.append({
                    "fragments": fragments
                })

            list_chains.append({
                "accession": chain_id,
                "locations": locations
            })

        entry_sec_structures[pdb_id] = list_chains

    """
    Retrieve PDBe entries with the proteins they are associated to
    (CRC64 not stored in hexadecimal, so need to convert)
    """
    cur.execute(
        """
        SELECT DISTINCT
          E.ID, E.TITLE, E.METHOD_CLASS, E.RESOLUTION, E.FIRST_REV_DATE,
          U.ACCESSION, U.AUTH_ASYM_ID, U.UNP_START, U.UNP_END,
          U.PDB_START, U.PDB_END, U.AUTH_START, U.AUTH_END
        FROM PDBE.ENTRY@PDBE_LIVE E
        INNER JOIN SIFTS_ADMIN.SIFTS_XREF_SEGMENT@PDBE_LIVE U ON (
          E.ID = U.ENTRY_ID AND
          E.METHOD_CLASS IN ('nmr', 'x-ray', 'em') AND
          U.UNP_START IS NOT NULL AND
          U.UNP_END IS NOT NULL AND
          U.PDB_START IS NOT NULL AND
          U.PDB_END IS NOT NULL
        )
        INNER JOIN SIFTS_ADMIN.SPTR_DBENTRY@PDBE_LIVE DB
          ON U.ACCESSION = DB.ACCESSION
        INNER JOIN SIFTS_ADMIN.SPTR_SEQUENCE@PDBE_LIVE S
          ON DB.DBENTRY_ID = S.DBENTRY_ID
        INNER JOIN INTERPRO.PROTEIN P ON (
          U.ACCESSION = P.PROTEIN_AC AND
          P.CRC64 = LPAD(TRIM(TO_CHAR(S.CHECKSUM, 'XXXXXXXXXXXXXXXX')),16,'0')
        )
        """
    )

    entries = {}
    for row in cur:
        pdb_id = row[0]
        try:
            entry = entries[pdb_id]
        except KeyError:
            entry = entries[pdb_id] = {
                "id": pdb_id,
                "date": row[4],
                "name": row[1],
                "resolution": row[3],
                "evidence": row[2],
                "proteins": {},
                "citations": entry_citations.get(pdb_id),
                "secondary_structures": entry_sec_structures.get(pdb_id)
            }

        protein_ac = row[5]
        try:
            chains = entry["proteins"][protein_ac]
        except KeyError:
            chains = entry["proteins"][protein_ac] = {}

        chain_id = row[6]
        try:
            chain = chains[chain_id]
        except KeyError:
            chain = chains[chain_id] = []

        unp_start = row[7]
        unp_end = row[8]
        if unp_start > unp_end:
            unp_start, unp_end = unp_end, unp_start

        chain.append({
            "protein_start": unp_start,
            "protein_end": unp_end,
            "structure_start": row[9],
            "structure_end": row[10],
            "author_structure_start": row[11],
            "author_structure_end": row[12]
        })

    cur.close()
    con.close()

    # Sort chains by fragment
    for entry in entries.values():
        for chains in entry["proteins"].values():
            for fragments in chains.values():
                fragments.sort(key=_repr_protein)

    dumpobj(output, entries)
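# `repr_fragment` and `_repr_protein` are sort keys defined elsewhere in the
# project. Plausible sketches, assuming fragments/segments are ordered by their
# start then end coordinates; the exact keys used upstream may differ.
def repr_fragment_sketch(fragment: dict) -> tuple:
    # Order secondary-structure fragments by position on the protein
    return fragment["start"], fragment["end"]


def _repr_protein_sketch(segment: dict) -> tuple:
    # Order chain segments by their mapped UniProt coordinates
    return segment["protein_start"], segment["protein_end"]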
def export_clans(ipr_url: str, pfam_url: str, p_clans: str,
                 p_alignments: str, **kwargs):
    buffer_size = kwargs.get("buffer_size", 1000000)
    threshold = kwargs.get("threshold", 1e-2)

    logger.info("loading clans")
    con = cx_Oracle.connect(ipr_url)
    cur = con.cursor()

    clans = get_clans(cur)
    clan_links = {}
    entry2clan = {}
    for accession, clan in clans.items():
        clan_links[accession] = {}
        for member_acc, score, seq_length in clan["members"]:
            entry2clan[member_acc] = (accession, seq_length)

    logger.info("exporting alignments")
    with DumpFile(p_alignments, compress=True) as df:
        i = 0
        alignments = []
        for query_acc, target_acc, evalue, domains in iter_alignments(cur):
            i += 1
            if not i % 10000000:
                logger.info(f"{i:>12,}")

            try:
                query_clan_acc, seq_length = entry2clan[query_acc]
            except KeyError:
                continue

            if evalue > threshold:
                continue

            try:
                target_clan_acc, _ = entry2clan[target_acc]
            except KeyError:
                target_clan_acc = None

            alignments.append((
                query_clan_acc,
                query_acc,
                target_acc,
                target_clan_acc,
                evalue,
                seq_length,
                json.dumps(domains)
            ))

            if len(alignments) == buffer_size:
                df.dump(alignments)
                alignments = []

            if query_clan_acc == target_clan_acc:
                # Query and target from the same clan: update the clan's links
                links = clan_links[query_clan_acc]

                if query_acc > target_acc:
                    query_acc, target_acc = target_acc, query_acc

                try:
                    targets = links[query_acc]
                except KeyError:
                    links[query_acc] = {target_acc: evalue}
                else:
                    if target_acc not in targets or evalue < targets[target_acc]:
                        targets[target_acc] = evalue

        df.dump(alignments)
        alignments = []
        logger.info(f"{i:>12,}")

    cur.close()
    con.close()

    logger.info("loading additional details for Pfam clans")
    pfam_clans = pfam.get_clans(pfam_url)

    logger.info("finalizing")
    for clan_acc, clan in clans.items():
        nodes = []
        for accession, score, seq_length in clan["members"]:
            nodes.append({
                "accession": accession,
                "type": "entry",
                "score": score
            })

        links = []
        for query_acc, targets in clan_links[clan_acc].items():
            for target_acc, score in targets.items():
                links.append({
                    "source": query_acc,
                    "target": target_acc,
                    "score": score
                })

        clan["relationships"] = {
            "nodes": nodes,
            "links": links
        }

        if clan_acc in pfam_clans:
            # Replace `description`, add `authors` and `literature`
            clan.update(pfam_clans[clan_acc])

    dumpobj(p_clans, clans)
    logger.info("complete")
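# `DumpFile` (used above and in export_entries below) is a project helper for
# streaming buffered chunks of records to disk. The sketch below shows the
# assumed write-side behaviour only: each dumped chunk is pickled, optionally
# compressed, and written with a size prefix so it can be read back
# sequentially. The real class also supports iteration and merging, and its
# on-disk format is an assumption here.
import pickle
import zlib


class DumpFileSketch:
    def __init__(self, path: str, compress: bool = False):
        self.path = path
        self.compress = compress
        self.fh = None

    def __enter__(self):
        self.fh = open(self.path, "wb")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.fh.close()
        self.fh = None

    def dump(self, obj):
        data = pickle.dumps(obj)
        if self.compress:
            data = zlib.compress(data)

        # Prefix each chunk with its length so a reader can walk the file
        self.fh.write(len(data).to_bytes(8, "big"))
        self.fh.write(data)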
def index_documents(hosts: Sequence[str], indir: str, version: str,
                    threads: int = 4, step: int = 100_000_000):
    kwargs = {
        "thread_count": threads,
        "queue_size": threads,
        "raise_on_exception": False,
        "raise_on_error": False
    }

    es = connect(hosts, timeout=30, verbose=False)

    num_documents = 0
    num_indexed = 0
    first_pass = True
    while True:
        for filepath in iter_files(indir, version):
            docs = loadobj(filepath)
            if first_pass:
                # Count the number of documents to index only once
                num_documents += len(docs)

            actions = []
            for idx, doc_id, doc in docs:
                actions.append({
                    "_op_type": "index",
                    "_index": idx,
                    "_id": doc_id,
                    "_source": doc
                })

            failed = []
            for i, (ok, info) in enumerate(pbulk(es, actions, **kwargs)):
                if ok:
                    num_indexed += 1
                    if not num_indexed % step:
                        logger.info(f"{num_indexed:>14,} / {num_documents:,}")
                else:
                    failed.append(docs[i])

                    # try:
                    #     is_429 = info["index"]["status"] == 429
                    # except (KeyError, IndexError):
                    #     is_429 = False
                    #
                    # try:
                    #     exc = info["index"]["exception"]
                    # except (KeyError, TypeError):
                    #     exc = None
                    #
                    # if is_429 or isinstance(exc, exceptions.ConnectionTimeout):
                    #     pause = True
                    # else:
                    #     logger.debug(info)

            if failed:
                # Overwrite the file with the documents that failed
                dumpobj(filepath, failed)
            else:
                # Remove the file as all its documents were indexed
                os.remove(filepath)

        logger.info(f"{num_indexed:>14,} / {num_documents:,}")
        first_pass = False

        if num_indexed == num_documents:
            break

    # Update index settings
    for base_alias in (IDA_BASE_ALIAS, REL_BASE_ALIAS):
        alias = base_alias + STAGING_ALIAS_SUFFIX

        # This assumes there are indices with the 'staging' alias
        for index in es.indices.get_alias(name=alias):
            es.indices.put_settings(
                body={
                    "number_of_replicas": 1,
                    "refresh_interval": None  # default (1s)
                },
                index=index
            )
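# `pbulk` above is assumed to be elasticsearch.helpers.parallel_bulk imported
# under an alias: it streams bulk actions to the cluster and yields an
# (ok, info) tuple per action, which is what the retry loop relies on. A
# minimal usage sketch with the same options as `kwargs` above; `es` and
# `actions` are placeholders.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk


def _bulk_index_sketch(es: Elasticsearch, actions: list) -> list:
    # Collect the error info of every action that was not indexed
    failed = []
    for ok, info in parallel_bulk(es, actions,
                                  thread_count=4, queue_size=4,
                                  raise_on_exception=False,
                                  raise_on_error=False):
        if not ok:
            failed.append(info)

    return failed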
def export_documents(src_proteins: str, src_entries: str, src_proteomes: str,
                     src_structures: str, src_taxonomy: str,
                     src_uniprot2ida: str, src_uniprot2matches: str,
                     src_uniprot2proteomes: str, outdirs: Sequence[str],
                     version: str, cache_size: int = 100000):
    logger.info("preparing data")
    os.umask(0o002)
    organizers = []
    for path in outdirs:
        try:
            shutil.rmtree(path)
        except FileNotFoundError:
            pass

        os.makedirs(path, mode=0o775)
        organizers.append(DirectoryTree(path))
        open(os.path.join(path, f"{version}{LOAD_SUFFIX}"), "w").close()

    logger.info("loading domain architectures")
    domains = {}
    with Store(src_uniprot2ida) as u2ida:
        for dom_members, dom_arch, dom_arch_id in u2ida.values():
            try:
                dom = domains[dom_arch_id]
            except KeyError:
                domains[dom_arch_id] = {
                    "ida_id": dom_arch_id,
                    "ida": dom_arch,
                    "counts": 1
                }
            else:
                dom["counts"] += 1

    logger.info("writing IDA documents")
    num_documents = 0
    domains = list(domains.values())
    for i in range(0, len(domains), cache_size):
        documents = []
        for dom in domains[i:i + cache_size]:
            documents.append((
                IDA_INDEX + version,
                dom["ida_id"],
                dom
            ))

        num_documents += len(documents)
        for org in organizers:
            filepath = org.mktemp()
            dumpobj(filepath, documents)
            os.rename(filepath, f"{filepath}{EXTENSION}")

    domains = None

    proteins = Store(src_proteins)
    uniprot2ida = Store(src_uniprot2ida)
    uniprot2matches = Store(src_uniprot2matches)
    uniprot2proteomes = Store(src_uniprot2proteomes)

    entries = loadobj(src_entries)        # mem: ~1.5 GB
    proteomes = loadobj(src_proteomes)    # mem: <1 GB
    structures = loadobj(src_structures)  # mem: ~4 GB
    taxonomy = loadobj(src_taxonomy)      # mem: ~2.5 GB

    uniprot2pdbe = {}  # mem: <1 GB
    for pdb_id, entry in structures.items():
        for uniprot_acc in entry["proteins"]:
            try:
                uniprot2pdbe[uniprot_acc].append(pdb_id)
            except KeyError:
                uniprot2pdbe[uniprot_acc] = [pdb_id]

    logger.info("writing relationship documents")
    i = 0
    documents = []
    used_entries = set()
    used_taxa = set()
    for uniprot_acc, info in proteins.items():
        taxid = info["taxid"]
        taxon = taxonomy[taxid]
        used_taxa.add(taxid)  # remember that this taxon has been used

        try:
            dom_members, dom_arch, dom_arch_id = uniprot2ida[uniprot_acc]
        except KeyError:
            dom_members = []
            dom_arch = dom_arch_id = None

        # Create an empty document (all properties set to None)
        doc = init_rel_doc()
        doc.update({
            "protein_acc": uniprot_acc.lower(),
            "protein_length": info["length"],
            "protein_is_fragment": info["fragment"],
            "protein_db": "reviewed" if info["reviewed"] else "unreviewed",
            "text_protein": join(uniprot_acc, info["identifier"]),

            # Taxonomy
            "tax_id": taxid,
            "tax_name": taxon["sci_name"],
            "tax_lineage": taxon["lineage"],
            "tax_rank": taxon["rank"],
            "text_taxonomy": join(taxid, taxon["full_name"], taxon["rank"])
        })

        proteome_id = uniprot2proteomes.get(uniprot_acc)
        if proteome_id:
            proteome = proteomes[proteome_id]
            doc.update({
                "proteome_acc": proteome_id.lower(),
                "proteome_name": proteome["name"],
                "proteome_is_reference": proteome["is_reference"],
                "text_proteome": join(proteome_id,
                                      proteome["name"],
                                      proteome["assembly"],
                                      proteome["taxon_id"],
                                      proteome["strain"]),
            })

        # Adding PDBe structures/chains
        pdb_chains = {}     # mapping PDB-chain ID -> chain segments
        pdb_documents = {}  # mapping PDB-chain ID -> ES document
        for pdb_id in uniprot2pdbe.get(uniprot_acc, []):
            pdb_entry = structures[pdb_id]
            chains = pdb_entry["proteins"][uniprot_acc]

            pdb_doc = doc.copy()
            pdb_doc.update({
                "structure_acc": pdb_id.lower(),
                "structure_resolution": pdb_entry["resolution"],
                "structure_date": pdb_entry["date"],
                "structure_evidence": pdb_entry["evidence"],
                "protein_structure": chains,
                "text_structure": join(pdb_id,
                                       pdb_entry["evidence"],
                                       pdb_entry["name"])
            })

            for chain_id, segments in chains.items():
                pdb_chain_id = f"{pdb_id}-{chain_id}"

                locations = []
                for segment in segments:
                    locations.append({
                        "fragments": [{
                            "start": segment["protein_start"],
                            "end": segment["protein_end"],
                        }]
                    })

                chain_doc = pdb_doc.copy()
                chain_doc.update({
                    "structure_chain_acc": chain_id,
                    "structure_protein_locations": locations,
                    "structure_chain": pdb_chain_id
                })

                pdb_documents[pdb_chain_id] = chain_doc
                pdb_chains[pdb_chain_id] = segments

        # Adding entries
        overlapping_chains = set()  # chains associated to at least one entry
        matches = uniprot2matches.get(uniprot_acc, {})
        num_protein_docs = 0
        for entry_acc, locations in matches.items():
            used_entries.add(entry_acc)  # this entry has been used
            entry = entries[entry_acc]
            if entry.integrated_in:
                interpro_acc = entry.integrated_in.lower()
            else:
                interpro_acc = None

            entry_obj = {
                "entry_acc": entry_acc.lower(),
                "entry_db": entry.database,
                "entry_type": entry.type.lower(),
                "entry_date": entry.creation_date.strftime("%Y-%m-%d"),
                "entry_protein_locations": locations,
                "entry_go_terms": [t["identifier"] for t in entry.go_terms],
                "entry_integrated": interpro_acc,
                "text_entry": join(entry_acc, entry.short_name, entry.name,
                                   entry.type.lower(), interpro_acc),
            }

            if entry.clan:
                entry_obj.update({
                    "set_acc": entry.clan["accession"].lower(),
                    "set_db": entry.database,
                    "text_set": join(entry.clan["accession"],
                                     entry.clan["name"]),
                })

            if entry_acc in dom_members:
                entry_obj.update({
                    "ida_id": dom_arch_id,
                    "ida": dom_arch,
                })

            # Test if the entry overlaps PDB chains
            entry_chains = set()
            for pdb_chain_id, segments in pdb_chains.items():
                if overlaps_pdb_chain(locations, segments):
                    # Entry overlaps chain: associate entry to struct/chain
                    chain_doc = pdb_documents[pdb_chain_id]
                    entry_doc = chain_doc.copy()
                    entry_doc.update(entry_obj)
                    documents.append((
                        entry.database + version,
                        get_rel_doc_id(entry_doc),
                        entry_doc
                    ))

                    entry_chains.add(pdb_chain_id)
                    num_protein_docs += 1

            if entry_chains:
                # Entry overlaps at least one chain
                overlapping_chains |= entry_chains
            else:
                # Associate entry to protein directly
                entry_doc = doc.copy()
                entry_doc.update(entry_obj)
                documents.append((
                    entry.database + version,
                    get_rel_doc_id(entry_doc),
                    entry_doc
                ))
                num_protein_docs += 1

        # Add chains not overlapping any entry
        for chain_id, chain_doc in pdb_documents.items():
            if chain_id in overlapping_chains:
                continue

            chain_doc.update({
                "ida_id": dom_arch_id,
                "ida": dom_arch,
            })

            documents.append((
                # Not overlapping any entry -> not associated to a member DB
                REL_INDEX + version,
                get_rel_doc_id(chain_doc),
                chain_doc
            ))
            num_protein_docs += 1

        if not num_protein_docs:
            # No relationships for this protein: fall back to a protein doc
            documents.append((
                REL_INDEX + version,
                get_rel_doc_id(doc),
                doc
            ))

        while len(documents) >= cache_size:
            for org in organizers:
                filepath = org.mktemp()
                dumpobj(filepath, documents[:cache_size])
                os.rename(filepath, f"{filepath}{EXTENSION}")

            del documents[:cache_size]
            num_documents += cache_size

        i += 1
        if not i % 10000000:
            logger.info(f"{i:>12,}")

    logger.info(f"{i:>12,}")

    logger.info("writing remaining documents")
    # Add unused entries
    for entry in entries.values():
        if entry.accession in used_entries or entry.is_deleted:
            continue

        if entry.integrated_in:
            interpro_acc = entry.integrated_in.lower()
        else:
            interpro_acc = None

        doc = init_rel_doc()
        doc.update({
            "entry_acc": entry.accession.lower(),
            "entry_db": entry.database,
            "entry_type": entry.type.lower(),
            "entry_date": entry.creation_date.strftime("%Y-%m-%d"),
            "entry_protein_locations": [],
            "entry_go_terms": [t["identifier"] for t in entry.go_terms],
            "entry_integrated": interpro_acc,
            "text_entry": join(entry.accession, entry.short_name, entry.name,
                               entry.type.lower(), interpro_acc),
        })

        if entry.clan:
            doc.update({
                "set_acc": entry.clan["accession"].lower(),
                "set_db": entry.database,
                "text_set": join(entry.clan["accession"],
                                 entry.clan["name"]),
            })

        documents.append((
            entry.database + version,
            get_rel_doc_id(doc),
            doc
        ))

    # Add unused taxa
    for taxon in taxonomy.values():
        if taxon["id"] in used_taxa:
            continue

        doc = init_rel_doc()
        doc.update({
            "tax_id": taxon["id"],
            "tax_name": taxon["full_name"],
            "tax_lineage": taxon["lineage"],
            "tax_rank": taxon["rank"],
            "text_taxonomy": join(taxon["id"], taxon["full_name"],
                                  taxon["rank"])
        })

        documents.append((
            REL_INDEX + version,
            get_rel_doc_id(doc),
            doc
        ))

    num_documents += len(documents)
    while documents:
        for org in organizers:
            filepath = org.mktemp()
            dumpobj(filepath, documents[:cache_size])
            os.rename(filepath, f"{filepath}{EXTENSION}")

        del documents[:cache_size]

    proteins.close()
    uniprot2ida.close()
    uniprot2matches.close()
    uniprot2proteomes.close()

    for path in outdirs:
        open(os.path.join(path, f"{version}{DONE_SUFFIX}"), "w").close()

    logger.info(f"complete ({num_documents:,} documents)")
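# `init_rel_doc`, `join`, and `get_rel_doc_id` are helpers defined elsewhere in
# the module. The sketches below only illustrate the assumed behaviour:
# `join` concatenates the non-empty values into one lowercase free-text field,
# and `get_rel_doc_id` derives a deterministic document ID from the
# relationship's identifying fields. Field choice and hashing scheme are
# assumptions; the real implementations may differ.
import hashlib


def join_sketch(*args, separator: str = " ") -> str:
    # Keep only non-empty values and lowercase the result for free-text search
    return separator.join(str(a) for a in args if a not in (None, "")).lower()


def get_rel_doc_id_sketch(doc: dict) -> str:
    # Hash the identifying fields of a relationship document
    key = "|".join(str(doc.get(k)) for k in ("protein_acc", "entry_acc",
                                             "structure_chain", "tax_id",
                                             "proteome_acc", "ida_id"))
    return hashlib.sha1(key.encode("utf-8")).hexdigest()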
def export_entries(url: str, p_metacyc: str, p_clans: str,
                   p_proteins: str, p_structures: str,
                   p_uniprot2matches: str, p_uniprot2proteome: str,
                   p_uniprot2ida: str, p_entry2xrefs: str, p_entries: str,
                   **kwargs):
    min_overlap = kwargs.get("overlap", 0.2)
    processes = kwargs.get("processes", 1)
    min_similarity = kwargs.get("similarity", 0.75)
    tmpdir = kwargs.get("tmpdir")

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    entries = {}
    logger.info("loading active InterPro entries")
    for entry in _get_interpro_entries(cur):
        entries[entry.accession] = entry

    logger.info("enriching entries with IntAct data")
    for accession, interactions in intact.get_interactions(cur).items():
        try:
            entry = entries[accession]
        except KeyError:
            continue
        else:
            entry.ppi = interactions

    logger.info("loading deleted InterPro entries")
    for entry in _get_retired_interpro_entries(cur):
        if entry.accession in entries:
            cur.close()
            con.close()
            raise RuntimeError(f"entry cannot be active "
                               f"and deleted {entry.accession}")

        entries[entry.accession] = entry

    logger.info("loading member database signatures")
    for entry in _get_signatures(cur):
        if entry.integrated_in and entry.integrated_in not in entries:
            cur.close()
            con.close()
            raise RuntimeError(f"{entry.accession} integrated "
                               f"in missing entry ({entry.integrated_in})")

        entries[entry.accession] = entry

    logger.info("loading past entry names")
    past_names = _get_name_history(cur)

    logger.info("loading past signature integrations")
    past_integrations = _get_integration_history(cur)

    logger.info("loading ENZYME")
    u2enzyme = uniprot.get_swissprot2enzyme(cur)

    logger.info("loading Reactome pathways")
    u2reactome = uniprot.get_swissprot2reactome(cur)
    cur.close()
    con.close()

    logger.info("loading MetaCyc pathways")
    ec2metacyc = metacyc.get_ec2pathways(p_metacyc)

    # Updating entry history
    for entry in entries.values():
        try:
            names = past_names[entry.accession]
        except KeyError:
            pass
        else:
            entry.history["names"] = names

        try:
            signatures = past_integrations[entry.accession]
        except KeyError:
            pass
        else:
            entry.history["signatures"] = signatures

    # Updating entry clan info
    for clan in loadobj(p_clans).values():
        for entry_acc, score, seq_length in clan["members"]:
            try:
                entry = entries[entry_acc]
            except KeyError:
                continue
            else:
                entry.clan = {
                    "accession": clan["accession"],
                    "name": clan["name"]
                }

    inqueue = Queue(maxsize=processes)
    outqueue = Queue()
    workers = []
    for _ in range(max(1, processes - 1)):
        dt = DirectoryTree(tmpdir)
        p = Process(target=_process_proteins,
                    args=(inqueue, entries, min_overlap, dt, outqueue))
        p.start()
        workers.append((p, dt))

    logger.info("processing")
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    i = 0
    for uniprot_acc, matches in u2matches.items():
        inqueue.put((
            uniprot_acc,
            proteins[uniprot_acc],
            matches,
            u2proteome.get(uniprot_acc),
            uniprot2pdbe.get(uniprot_acc, {}),
            set(u2enzyme.get(uniprot_acc, [])),
            set(u2reactome.get(uniprot_acc, []))
        ))

        i += 1
        if not i % 10000000:
            logger.info(f"{i:>15,}")

    proteins.close()
    u2matches.close()
    u2proteome.close()
    logger.info(f"{i:>15,}")

    # Send sentinel
    for _ in workers:
        inqueue.put(None)

    # Merge results from workers
    logger.info("exporting domain architectures")
    entries_with_xrefs = set()
    xref_files = []
    entry_counts = {}
    entry_intersections = {}
    interpro2enzyme = {}
    interpro2reactome = {}
    with Store(p_uniprot2ida, u2matches.get_keys(), tmpdir) as u2ida:
        for _ in workers:
            obj = outqueue.get()
            xref_files.append(obj[0])                               # str
            entries_with_xrefs |= obj[1]                            # set
            ida_file = obj[2]                                       # str
            deepupdate(obj[3], entry_counts, replace=False)         # dict
            deepupdate(obj[4], entry_intersections, replace=False)  # dict
            deepupdate(obj[5], interpro2enzyme)                     # dict
            deepupdate(obj[6], interpro2reactome)                   # dict

            with DumpFile(ida_file) as df:
                i = 0
                for uniprot_acc, dom_members, dom_str, dom_id in df:
                    u2ida[uniprot_acc] = (
                        dom_members,
                        dom_str,
                        dom_id
                    )
                    i += 1

                    if not i % 1000:
                        u2ida.sync()

            u2ida.sync()

        size = u2ida.merge(processes=processes)

    # Adding empty EntryXrefs objects for entries without xrefs
    xref_files.append(workers[0][1].mktemp())
    with DumpFile(xref_files[-1], compress=True) as df:
        for entry_acc in sorted(set(entries.keys()) - entries_with_xrefs):
            df.dump((entry_acc, EntryXrefs().asdict()))

    logger.info("exporting cross-references")
    with DumpFile(p_entry2xrefs, compress=True) as df:
        for entry_acc, xrefs in merge_dumps(xref_files):
            df.dump((entry_acc, xrefs))

            entry = entries[entry_acc]

            # Reactome pathways
            if entry_acc in interpro2reactome:
                pathways = interpro2reactome[entry_acc]
                entry.pathways["reactome"] = [
                    dict(zip(("id", "name"), pthw))
                    for pthw in sorted(pathways)
                ]

            # EC numbers
            if entry_acc in interpro2enzyme:
                ecnos = sorted(interpro2enzyme[entry_acc])
                entry.cross_references["ec"] = ecnos

                # MetaCyc pathways
                pathways = set()
                for ecno in ecnos:
                    pathways |= set(ec2metacyc.get(ecno, []))

                if pathways:
                    entry.pathways["metacyc"] = [
                        dict(zip(("id", "name"), pthw))
                        for pthw in sorted(pathways)
                    ]

    for p, dt in workers:
        size += dt.size
        dt.remove()

    logger.info(f"temporary files: {size / 1024 / 1024:.0f} MB")

    logger.info("calculating overlapping relationships")
    supfam = "homologous_superfamily"
    types = (supfam, "domain", "family", "repeat")
    for entry_acc, overlaps in entry_intersections.items():
        entry1 = entries[entry_acc]
        entry_cnt = entry_counts[entry_acc]
        type1 = entry1.type.lower()

        for other_acc, overlap_counts in overlaps.items():
            o1 = overlap_counts["1"]
            o2 = overlap_counts["2"]
            other_cnt = entry_counts[other_acc]

            # Independent coefficients
            coef1 = o1 / (entry_cnt + other_cnt - o1)
            coef2 = o2 / (entry_cnt + other_cnt - o2)

            # Final coefficient: average of independent coefficients
            coef = (coef1 + coef2) * 0.5

            # Containment indices
            c1 = o1 / entry_cnt
            c2 = o2 / other_cnt

            if all([item < min_similarity for item in (coef, c1, c2)]):
                continue

            # Entries are similar enough
            entry2 = entries[other_acc]
            type2 = entry2.type.lower()
            if ((type1 == supfam and type2 in types)
                    or (type1 in types and type2 == supfam)):
                # e1 -> e2 relationship
                entry1.overlaps_with.append({
                    "accession": other_acc,
                    "name": entry2.name,
                    "type": type2
                })

                # e2 -> e1 relationship
                entry2.overlaps_with.append({
                    "accession": entry_acc,
                    "name": entry1.name,
                    "type": type1
                })

    dumpobj(p_entries, entries)

    logger.info("populating ENTRY2PATHWAY")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("TRUNCATE TABLE INTERPRO.ENTRY2PATHWAY")
    cur.close()

    sql = "INSERT INTO INTERPRO.ENTRY2PATHWAY VALUES (:1, :2, :3, :4)"
    with Table(con, sql) as table:
        for e in entries.values():
            for database, pathways in e.pathways.items():
                code = PATHWAY_DATABASE[database]
                for pthw in pathways:
                    table.insert((
                        e.accession,
                        code,
                        pthw["id"],
                        pthw["name"]
                    ))

    con.commit()
    con.close()

    logger.info("complete")
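# Worked illustration of the similarity test used above when calculating
# overlapping relationships, with hypothetical protein counts: two entries
# matching 100 and 80 proteins respectively, with overlap counts o1=75 and
# o2=70. The counts are made up purely to show the arithmetic.
def _similarity_example(entry_cnt: int = 100, other_cnt: int = 80,
                        o1: int = 75, o2: int = 70,
                        min_similarity: float = 0.75) -> bool:
    # Jaccard-like coefficients computed independently for each overlap count
    coef1 = o1 / (entry_cnt + other_cnt - o1)   # 75 / 105 ≈ 0.714
    coef2 = o2 / (entry_cnt + other_cnt - o2)   # 70 / 110 ≈ 0.636
    coef = (coef1 + coef2) * 0.5                # ≈ 0.675

    # Containment indices: fraction of each entry's proteins in the overlap
    c1 = o1 / entry_cnt                         # 0.75
    c2 = o2 / other_cnt                         # 0.875

    # Entries are considered similar if any measure reaches the threshold
    return any(item >= min_similarity for item in (coef, c1, c2))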