def insert_residues(pro_url: str, stg_url: str, tmpdir: Optional[str] = None):
    dt = DirectoryTree(root=tmpdir)

    logger.info("exporting residues")
    files = ippro.export_residues(pro_url, dt)

    logger.info("inserting residues")
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_proteinresidue")
    cur.execute(
        """
        CREATE TABLE webfront_proteinresidue
        (
            residue_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            protein_acc VARCHAR(15) NOT NULL,
            entry_acc VARCHAR(25) NOT NULL,
            entry_name VARCHAR(100),
            source_database VARCHAR(10) NOT NULL,
            description VARCHAR(255),
            fragments LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )
    cur.close()

    sql = """
        INSERT INTO webfront_proteinresidue (
            protein_acc, entry_acc, entry_name, source_database, description,
            fragments
        )
        VALUES (%s, %s, %s, %s, %s, %s)
    """
    with Table(con, sql) as table:
        i = 0
        for protein_acc, entries in merge_dumps(files, replace=True):
            for entry_acc, entry in entries.items():
                for descr, locations in entry["descriptions"].items():
                    locations.sort(key=lambda x: (x[1], x[2]))
                    table.insert((protein_acc, entry_acc, entry["name"],
                                  entry["database"], descr,
                                  jsonify(locations, nullable=False)))

            i += 1
            if not i % 10000000:
                logger.info(f"{i:>15,}")

        logger.info(f"{i:>15,}")

    con.commit()

    logger.info(f"temporary files: {dt.size / 1024 ** 2:.0f} MB")
    dt.remove()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute(
        """
        CREATE INDEX i_proteinresidue
        ON webfront_proteinresidue (protein_acc)
        """
    )
    cur.close()
    con.close()
    logger.info("complete")
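
# Illustrative row for webfront_proteinresidue (all values invented): the
# sorted (residue, start, end) tuples are JSON-encoded into the `fragments`
# column, e.g.
#   protein_acc="P12345", entry_acc="PIRSF000001", source_database="pirsf",
#   description="Active site", fragments='[["H", 54, 54], ["D", 121, 121]]'
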
def insert_annotations(pro_url: str, p_uniprot2matches: str, pfam_url: str,
                       stg_url: str, **kwargs):
    tmpdir = kwargs.get("tmpdir")

    con = MySQLdb.connect(**url2dict(stg_url))
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_entryannotation")
    cur.execute(
        """
        CREATE TABLE webfront_entryannotation
        (
            annotation_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            accession VARCHAR(25) NOT NULL,
            type VARCHAR(20) NOT NULL,
            value LONGBLOB NOT NULL,
            mime_type VARCHAR(32) NOT NULL,
            num_sequences INT
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )
    cur.close()
    con.close()

    queue = Queue()
    consumer = Process(target=_insert, args=(stg_url, queue))
    consumer.start()

    dt = DirectoryTree(root=tmpdir)

    # Get HMMs from InterPro Oracle database
    for path in _export_hmms(p_uniprot2matches, pro_url, dt):
        queue.put(path)

    # Get alignments from Pfam MySQL database
    for path in _export_alns(pfam_url, dt):
        queue.put(path)

    queue.put(None)
    consumer.join()

    logger.info(f"temporary files: {dt.size / 1024 ** 2:.0f} MB")
    dt.remove()

    logger.info("indexing")
    con = MySQLdb.connect(**url2dict(stg_url))
    cur = con.cursor()
    cur.execute("CREATE INDEX i_entryannotation "
                "ON webfront_entryannotation (accession)")
    cur.close()
    con.close()
    logger.info("complete")
def _export_alns(pfam_url: str, dt: DirectoryTree, buffer_size: int = 1000):
    logger.info("processing Pfam alignments")
    df = DumpFile(dt.mktemp(), compress=True)
    cnt = 0

    iterator = pfam.get_alignments(pfam_url)
    for entry_acc, aln_type, aln_bytes, count in iterator:
        df.dump((entry_acc, f"alignment:{aln_type}", aln_bytes,
                 "application/gzip", count))

        cnt += 1
        if cnt == buffer_size:
            df.close()
            yield df.path
            df = DumpFile(dt.mktemp(), compress=True)
            cnt = 0

    df.close()
    yield df.path
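
# Each record dumped by _export_alns() (and by _export_hmms() below) is a
# 5-tuple mirroring the webfront_entryannotation columns filled by the
# consumer process, e.g. (accession, alignment type and count are invented):
#   ("PF00069", "alignment:full", <gzipped alignment bytes>,
#    "application/gzip", 54)
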
def export_residues(url: str, dt: DirectoryTree) -> List[str]:
    files = []

    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute(
        """
        SELECT S.PROTEIN_AC, S.METHOD_AC, M.NAME, LOWER(D.DBSHORT),
               S.DESCRIPTION, S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END
        FROM INTERPRO.SITE_MATCH S
        INNER JOIN INTERPRO.CV_DATABASE D ON S.DBCODE = D.DBCODE
        LEFT OUTER JOIN INTERPRO.METHOD M ON S.METHOD_AC = M.METHOD_AC
        """
    )

    i = 0
    proteins = {}
    for row in cur:
        protein_acc = row[0]
        signature_acc = row[1]
        signature_name = row[2]
        database = row[3]
        description = row[4]
        residue = row[5]
        pos_start = row[6]
        pos_end = row[7]

        try:
            entries = proteins[protein_acc]
        except KeyError:
            entries = proteins[protein_acc] = {}

        try:
            entry = entries[signature_acc]
        except KeyError:
            entry = entries[signature_acc] = {
                "name": signature_name,
                "database": database,
                "descriptions": {}
            }

        try:
            fragments = entry["descriptions"][description]
        except KeyError:
            fragments = entry["descriptions"][description] = []

        fragments.append((residue, pos_start, pos_end))

        i += 1
        if not i % 1000000:
            files.append(dt.mktemp())
            with DumpFile(files[-1], compress=True) as df:
                for protein_acc in sorted(proteins):
                    df.dump((protein_acc, proteins[protein_acc]))

            proteins = {}

            if not i % 100000000:
                logger.info(f"{i:>15,}")

    logger.info(f"{i:>15,}")
    cur.close()
    con.close()

    files.append(dt.mktemp())
    with DumpFile(files[-1], compress=True) as df:
        for protein_acc in sorted(proteins):
            df.dump((protein_acc, proteins[protein_acc]))

    return files
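
# Shape of one record dumped per protein above (values invented):
#   ("P12345", {
#       "PIRSF000001": {
#           "name": "Example_signature",
#           "database": "pirsf",
#           "descriptions": {
#               "Active site": [("H", 54, 54), ("D", 121, 121)]
#           }
#       }
#   })
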
def _export_hmms(p_uniprot2matches: str, pro_url: str, dt: DirectoryTree,
                 buffer_size: int = 1000):
    logger.info("counting hits per model")
    signatures = {}
    with Store(p_uniprot2matches) as u2matches:
        cnt = 0
        for entries in u2matches.values():
            for entry_acc, locations in entries.items():
                for loc in locations:
                    if loc["model"] is None:
                        continue  # InterPro entries

                    try:
                        models = signatures[entry_acc]
                    except KeyError:
                        models = signatures[entry_acc] = {}

                    try:
                        models[loc["model"]] += 1
                    except KeyError:
                        models[loc["model"]] = 1

            cnt += 1
            if not cnt % 10e6:
                logger.info(f"{cnt:>12,}")

        logger.info(f"{cnt:>12,}")

    for entry_acc, models in signatures.items():
        # Select the model with the most hits
        model_acc = sorted(models, key=lambda k: (-models[k], k))[0]
        signatures[entry_acc] = model_acc

    logger.info("processing models")
    df = DumpFile(dt.mktemp(), compress=True)
    cnt = 0
    ignored = 0
    iterator = ippro.get_hmms(pro_url, multi_models=True)
    for entry_acc, model_acc, hmm_bytes in iterator:
        try:
            representative_model = signatures[entry_acc]
        except KeyError:
            # Signature without matches, i.e. without representative model
            ignored += 1
            continue

        if model_acc and model_acc != representative_model:
            continue

        hmm_str = gzip.decompress(hmm_bytes).decode("utf-8")
        df.dump((entry_acc, "hmm", hmm_bytes, "application/gzip", None))

        with StringIO(hmm_str) as stream:
            hmm = hmmer.HMMFile(stream)

            df.dump((entry_acc, "logo",
                     json.dumps(hmm.logo("info_content_all", "hmm")),
                     "application/json", None))

        cnt += 2
        if cnt >= buffer_size:
            df.close()
            yield df.path
            df = DumpFile(dt.mktemp(), compress=True)
            cnt = 0

    df.close()
    yield df.path

    logger.info(f"  {ignored} models ignored")
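
# For each signature kept above, two annotation records are dumped, matching
# the 5-tuple layout expected by the consumer (accession is an example only):
#   ("PF00069", "hmm",  <gzipped HMM text>,      "application/gzip", None)
#   ("PF00069", "logo", <JSON-serialised logo>,  "application/json", None)
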
def insert_taxonomy(p_entries: str, p_proteins: str, p_structures: str,
                    p_taxonomy: str, p_uniprot2matches: str,
                    p_uniprot2proteome: str, stg_url: str,
                    p_interpro2taxonomy: str, tmpdir: Optional[str] = None):
    logger.info("preparing data")
    dt = DirectoryTree(tmpdir)
    entries = loadobj(p_entries)
    taxonomy = loadobj(p_taxonomy)
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    logger.info("starting")
    i = 0
    xrefs = {}
    files = []
    for uniprot_acc, info in proteins.items():
        taxon_id = info["taxid"]

        try:
            taxon = xrefs[taxon_id]
        except KeyError:
            taxon = xrefs[taxon_id] = init_xrefs()

        try:
            proteome_id = u2proteome[uniprot_acc]
        except KeyError:
            pass
        else:
            taxon["proteomes"].add(proteome_id)

        taxon["proteins"]["all"] += 1

        protein_structures = uniprot2pdbe.get(uniprot_acc, {})

        # Add structures to taxon, regardless of entry matches
        taxon["structures"]["all"] |= set(protein_structures.keys())

        databases = set()
        for entry_acc, locations in u2matches.get(uniprot_acc, {}).items():
            entry = entries[entry_acc]
            database = entry.database

            try:
                taxon["entries"][database].add(entry_acc)
            except KeyError:
                taxon["entries"][database] = {entry_acc}

            if database not in databases:
                # Counting the protein *once* per database
                databases.add(database)
                try:
                    taxon["proteins"]["databases"][database] += 1
                except KeyError:
                    taxon["proteins"]["databases"][database] = 1

            try:
                taxon["proteins"]["entries"][entry_acc] += 1
            except KeyError:
                taxon["proteins"]["entries"][entry_acc] = 1

            for pdb_id, chains in protein_structures.items():
                for chain_id, segments in chains.items():
                    if overlaps_pdb_chain(locations, segments):
                        try:
                            taxon["structures"]["entries"][entry_acc].add(pdb_id)
                        except KeyError:
                            taxon["structures"]["entries"][entry_acc] = {pdb_id}

                        break  # Skip other chains

        i += 1
        if not i % 1000000:
            output = dt.mktemp()
            dump_xrefs(xrefs, taxonomy, output)
            files.append(output)
            xrefs = {}

            if not i % 10000000:
                logger.info(f"{i:>12,}")

    if xrefs:
        output = dt.mktemp()
        dump_xrefs(xrefs, taxonomy, output)
        files.append(output)
        xrefs = {}

    logger.info(f"{i:>12,}")
    logger.info(f"temporary files: "
                f"{sum(map(os.path.getsize, files)) / 1024 / 1024:.0f} MB")

    proteins.close()
    u2matches.close()
    u2proteome.close()

    logger.info("populating taxonomy tables")
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomy")
    cur.execute(
        """
        CREATE TABLE webfront_taxonomy
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            scientific_name VARCHAR(255) NOT NULL,
            full_name VARCHAR(512) NOT NULL,
            lineage LONGTEXT NOT NULL,
            parent_id VARCHAR(20),
            rank VARCHAR(20) NOT NULL,
            children LONGTEXT,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentry")
    cur.execute(
        """
        CREATE TABLE webfront_taxonomyperentry
        (
            id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            tax_id VARCHAR(20) NOT NULL,
            entry_acc VARCHAR(25) NOT NULL,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )
    cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentrydb")
    cur.execute(
        """
        CREATE TABLE webfront_taxonomyperentrydb
        (
            id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            tax_id VARCHAR(20) NOT NULL,
            source_database VARCHAR(10) NOT NULL,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )
    cur.close()

    table = Table(con, query="""
        INSERT INTO webfront_taxonomy
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """)
    per_entry = Table(con, query="""
        INSERT INTO webfront_taxonomyperentry (tax_id, entry_acc, counts)
        VALUES (%s, %s, %s)
    """)
    per_database = Table(con, query="""
        INSERT INTO webfront_taxonomyperentrydb (tax_id, source_database, counts)
        VALUES (%s, %s, %s)
    """)

    with DumpFile(p_interpro2taxonomy, compress=True) as interpro2taxonomy:
        interpro_entries = {entry.accession
                            for entry in entries.values()
                            if entry.database == "interpro"
                            and not entry.is_deleted}

        i = 0
        for taxon_id, taxon_xrefs in merge_dumps(files):
            taxon = taxonomy[taxon_id]

            protein_counts = taxon_xrefs.pop("proteins")
            structure_counts = taxon_xrefs.pop("structures")
            counts = reduce(taxon_xrefs)

            # Add total protein count (not grouped by database/entry)
            counts["proteins"] = protein_counts["all"]

            # Add total structure count
            counts["structures"] = len(structure_counts["all"])

            # Add total entry count (not grouped by database)
            counts["entries"]["total"] = sum(counts["entries"].values())

            table.insert((taxon_id, taxon["sci_name"], taxon["full_name"],
                          f" {' '.join(taxon['lineage'])} ",
                          taxon["parent"], taxon["rank"],
                          jsonify(taxon["children"]), jsonify(counts)))

            # Remove the 'entries' property
            # (not needed for webfront_taxonomyperentry)
            entry_counts = counts.pop("entries")

            database_structures = {}
            for entry_acc, count in protein_counts["entries"].items():
                if entry_acc in interpro_entries:
                    interpro2taxonomy.dump((entry_acc, taxon_id, count))

                counts["proteins"] = count

                try:
                    entry_structures = structure_counts["entries"][entry_acc]
                except KeyError:
                    counts["structures"] = 0
                else:
                    counts["structures"] = len(entry_structures)

                    database = entries[entry_acc].database
                    try:
                        database_structures[database] |= entry_structures
                    except KeyError:
                        database_structures[database] = entry_structures.copy()
                finally:
                    per_entry.insert((taxon_id, entry_acc, jsonify(counts)))

            for database, count in protein_counts["databases"].items():
                counts.update({
                    "entries": entry_counts[database],
                    "proteins": count,
                    "structures": len(database_structures.get(database, []))
                })
                per_database.insert((taxon_id, database, jsonify(counts)))

            i += 1
            if not i % 100000:
                logger.info(f"{i:>12,}")

        logger.info(f"{i:>12,}")

    table.close()
    per_entry.close()
    per_database.close()
    con.commit()
    dt.remove()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute(
        """
        CREATE INDEX i_webfront_taxonomyperentry_tax
        ON webfront_taxonomyperentry (tax_id)
        """
    )
    cur.execute(
        """
        CREATE INDEX i_webfront_taxonomyperentry_entry
        ON webfront_taxonomyperentry (entry_acc)
        """
    )
    cur.execute(
        """
        CREATE INDEX i_webfront_taxonomyperentrydb_tax
        ON webfront_taxonomyperentrydb (tax_id)
        """
    )
    cur.execute(
        """
        CREATE INDEX i_webfront_taxonomyperentrydb_database
        ON webfront_taxonomyperentrydb (source_database)
        """
    )
    cur.close()
    con.close()
    logger.info("complete")
def insert_clans(stg_url: str, p_alignments: str, p_clans: str,
                 p_entries: str, p_entry2xrefs: str, **kwargs):
    max_xrefs = kwargs.get("max_xrefs", 1000000)
    tmpdir = kwargs.get("tmpdir")

    logger.info("aggregating clan cross-references")
    dt = DirectoryTree(tmpdir)
    entry2clan = {}
    for entry_acc, entry in loadobj(p_entries).items():
        if entry.clan:
            entry2clan[entry_acc] = entry.clan["accession"]

    clans = {}
    files = []
    num_xrefs = 0
    with DumpFile(p_entry2xrefs) as df:
        for entry_acc, entry_xrefs in df:
            try:
                clan_acc = entry2clan[entry_acc]
            except KeyError:
                continue

            try:
                clan_xrefs = clans[clan_acc]
            except KeyError:
                clan_xrefs = clans[clan_acc] = {}

            # We do not need the number of matches
            del entry_xrefs["matches"]

            cnt_before = sum(map(len, clan_xrefs.values()))
            deepupdate(entry_xrefs, clan_xrefs)
            cnt_after = sum(map(len, clan_xrefs.values()))
            num_xrefs += cnt_after - cnt_before

            if num_xrefs >= max_xrefs:
                file = dt.mktemp()
                with DumpFile(file, compress=True) as df2:
                    for clan_acc in sorted(clans):
                        df2.dump((clan_acc, clans[clan_acc]))

                files.append(file)
                clans = {}
                num_xrefs = 0

    file = dt.mktemp()
    with DumpFile(file, compress=True) as df2:
        for clan_acc in sorted(clans):
            df2.dump((clan_acc, clans[clan_acc]))

    files.append(file)

    logger.info("inserting clans")
    clans = loadobj(p_clans)
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_set")
    cur.execute(
        """
        CREATE TABLE webfront_set
        (
            accession VARCHAR(20) PRIMARY KEY NOT NULL,
            name VARCHAR(400),
            description TEXT,
            source_database VARCHAR(10) NOT NULL,
            relationships LONGTEXT NOT NULL,
            authors TEXT,
            literature TEXT,
            counts LONGTEXT DEFAULT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )
    cur.close()

    sql = """
        INSERT INTO webfront_set
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """
    with Table(con, sql) as table:
        for clan_acc, xrefs in merge_dumps(files):
            clan = clans[clan_acc]
            counts = reduce(xrefs)
            counts["entries"] = {
                clan["database"]: len(clan["members"]),
                "total": len(clan["members"])
            }

            table.insert((
                clan_acc,
                clan["name"],
                clan["description"],
                clan["database"],
                jsonify(clan["relationships"], nullable=False),
                jsonify(clan.get("authors")),
                jsonify(clan.get("literature")),
                jsonify(counts)
            ))

    logger.info(f"temporary files: {dt.size / 1024 / 1024:.0f} MB")
    dt.remove()

    logger.info("inserting alignments")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_alignment")
    cur.execute(
        """
        CREATE TABLE webfront_alignment
        (
            id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            set_acc VARCHAR(20) NOT NULL,
            entry_acc VARCHAR(25) NOT NULL,
            target_acc VARCHAR(25) NOT NULL,
            target_set_acc VARCHAR(20),
            score DOUBLE NOT NULL,
            seq_length MEDIUMINT NOT NULL,
            domains TEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )
    cur.close()

    sql = """
        INSERT INTO webfront_alignment (
            set_acc, entry_acc, target_acc, target_set_acc, score,
            seq_length, domains
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
    with DumpFile(p_alignments) as df, Table(con, sql) as table:
        for alignments in df:
            for aln in alignments:
                table.insert(aln)

    con.commit()
    con.close()
    logger.info("complete")
def export(url: str, p_entries: str, p_entry2xrefs: str, p_taxonomy: str,
           outdir: str, max_xrefs: int = 100000):
    logger.info("loading database versions")
    con = MySQLdb.connect(**url2dict(url))
    cur = con.cursor()
    cur.execute(
        """
        SELECT name, name_long, version, release_date
        FROM webfront_database
        WHERE type = 'entry'
        """
    )
    databases = {}
    release_version = release_date = None
    for name, full_name, version, date in cur:
        databases[name] = full_name

        if name == "interpro":
            release_version = version
            release_date = date.strftime("%Y-%m-%d")

    cur.close()
    con.close()

    if release_version is None:
        raise RuntimeError("missing release version/date for InterPro")

    logger.info("loading taxonomic info")
    sci_names = {}
    for taxon_id, taxon in loadobj(p_taxonomy).items():
        sci_names[taxon_id] = taxon["sci_name"]

    try:
        shutil.rmtree(outdir)
    except FileNotFoundError:
        pass
    finally:
        os.makedirs(outdir, mode=0o775)

    entries = loadobj(p_entries)

    logger.info("starting")
    i = 0
    types = {}
    num_xrefs = {}
    with DumpFile(p_entry2xrefs) as df:
        for accession, entry_xrefs in df:
            entry = entries[accession]
            if entry.is_deleted:
                continue

            fields, xrefs = _init_fields(entry)

            fields.append({
                "name": "source_database",
                "value": databases[entry.database]
            })

            for uniprot_acc, uniprot_id in entry_xrefs["proteins"]:
                xrefs.append({
                    "dbname": "UNIPROT",
                    "dbkey": uniprot_acc
                })

                xrefs.append({
                    "dbname": "UNIPROT",
                    "dbkey": uniprot_id
                })

            for tax_id in entry_xrefs["taxa"]:
                xrefs.append({
                    "dbname": "TAXONOMY",
                    "dbkey": tax_id
                })

                xrefs.append({
                    "dbname": "TAXONOMY",
                    "dbkey": sci_names[tax_id]
                })

            for upid in entry_xrefs["proteomes"]:
                xrefs.append({
                    "dbname": "PROTEOMES",
                    "dbkey": upid
                })

            for pdbe_id in entry_xrefs["structures"]:
                xrefs.append({
                    "dbname": "PDB",
                    "dbkey": pdbe_id
                })

            entry_type = entry.type.lower()
            try:
                dt, items = types[entry_type]
            except KeyError:
                dt = DirectoryTree(outdir, entry_type)
                items = []
                types[entry_type] = (dt, items)
                num_xrefs[entry_type] = 0

            items.append({
                "fields": fields,
                "cross_references": xrefs
            })
            num_xrefs[entry_type] += len(xrefs)

            if num_xrefs[entry_type] >= max_xrefs:
                path = dt.mktemp(suffix=".json")
                with open(path, "wt") as fh:
                    json.dump({
                        "name": "InterPro",
                        "release": release_version,
                        "release_date": release_date,
                        "entry_count": len(items),
                        "entries": items
                    }, fh, indent=4)

                items.clear()
                num_xrefs[entry_type] = 0

            i += 1
            if not i % 10000:
                logger.info(f"{i:>12,}")

        logger.info(f"{i:>12,}")

    for entry_type, (dt, items) in types.items():
        if num_xrefs[entry_type]:
            path = dt.mktemp(suffix=".json")
            with open(path, "wt") as fh:
                json.dump({
                    "name": "InterPro",
                    "release": release_version,
                    "release_date": release_date,
                    "entry_count": len(items),
                    "entries": items
                }, fh, indent=4)

    logger.info("complete")
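
# Layout of each JSON file written by export() above (presumably for the
# EBI Search ingestion pipeline); "fields" also contains whatever
# _init_fields() adds for the entry, and the values below are illustrative:
#   {
#       "name": "InterPro",
#       "release": "84.0",
#       "release_date": "2021-01-01",
#       "entry_count": 1,
#       "entries": [
#           {
#               "fields": [{"name": "source_database", "value": "InterPro"}],
#               "cross_references": [{"dbname": "UNIPROT", "dbkey": "P12345"}]
#           }
#       ]
#   }
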
def export_documents(src_proteins: str, src_entries: str, src_proteomes: str,
                     src_structures: str, src_taxonomy: str,
                     src_uniprot2ida: str, src_uniprot2matches: str,
                     src_uniprot2proteomes: str, outdirs: Sequence[str],
                     version: str, cache_size: int = 100000):
    logger.info("preparing data")
    os.umask(0o002)
    organizers = []
    for path in outdirs:
        try:
            shutil.rmtree(path)
        except FileNotFoundError:
            pass

        os.makedirs(path, mode=0o775)
        organizers.append(DirectoryTree(path))
        open(os.path.join(path, f"{version}{LOAD_SUFFIX}"), "w").close()

    logger.info("loading domain architectures")
    domains = {}
    with Store(src_uniprot2ida) as u2ida:
        for dom_members, dom_arch, dom_arch_id in u2ida.values():
            try:
                dom = domains[dom_arch_id]
            except KeyError:
                domains[dom_arch_id] = {
                    "ida_id": dom_arch_id,
                    "ida": dom_arch,
                    "counts": 1
                }
            else:
                dom["counts"] += 1

    logger.info("writing IDA documents")
    num_documents = 0
    domains = list(domains.values())
    for i in range(0, len(domains), cache_size):
        documents = []
        for dom in domains[i:i + cache_size]:
            documents.append((
                IDA_INDEX + version,
                dom["ida_id"],
                dom
            ))

        num_documents += len(documents)
        for org in organizers:
            filepath = org.mktemp()
            dumpobj(filepath, documents)
            os.rename(filepath, f"{filepath}{EXTENSION}")

    domains = None

    proteins = Store(src_proteins)
    uniprot2ida = Store(src_uniprot2ida)
    uniprot2matches = Store(src_uniprot2matches)
    uniprot2proteomes = Store(src_uniprot2proteomes)

    entries = loadobj(src_entries)        # mem: ~1.5 GB
    proteomes = loadobj(src_proteomes)    # mem: <1 GB
    structures = loadobj(src_structures)  # mem: ~4 GB
    taxonomy = loadobj(src_taxonomy)      # mem: ~2.5 GB

    uniprot2pdbe = {}  # mem: <1 GB
    for pdb_id, entry in structures.items():
        for uniprot_acc in entry["proteins"]:
            try:
                uniprot2pdbe[uniprot_acc].append(pdb_id)
            except KeyError:
                uniprot2pdbe[uniprot_acc] = [pdb_id]

    logger.info("writing relationship documents")
    i = 0
    documents = []
    used_entries = set()
    used_taxa = set()
    for uniprot_acc, info in proteins.items():
        taxid = info["taxid"]

        taxon = taxonomy[taxid]
        used_taxa.add(taxid)  # remember that this taxon has been used

        try:
            dom_members, dom_arch, dom_arch_id = uniprot2ida[uniprot_acc]
        except KeyError:
            dom_members = []
            dom_arch = dom_arch_id = None

        # Create an empty document (all properties set to None)
        doc = init_rel_doc()
        doc.update({
            "protein_acc": uniprot_acc.lower(),
            "protein_length": info["length"],
            "protein_is_fragment": info["fragment"],
            "protein_db": "reviewed" if info["reviewed"] else "unreviewed",
            "text_protein": join(uniprot_acc, info["identifier"]),

            # Taxonomy
            "tax_id": taxid,
            "tax_name": taxon["sci_name"],
            "tax_lineage": taxon["lineage"],
            "tax_rank": taxon["rank"],
            "text_taxonomy": join(taxid, taxon["full_name"], taxon["rank"])
        })

        proteome_id = uniprot2proteomes.get(uniprot_acc)
        if proteome_id:
            proteome = proteomes[proteome_id]
            doc.update({
                "proteome_acc": proteome_id.lower(),
                "proteome_name": proteome["name"],
                "proteome_is_reference": proteome["is_reference"],
                "text_proteome": join(proteome_id,
                                      proteome["name"],
                                      proteome["assembly"],
                                      proteome["taxon_id"],
                                      proteome["strain"]),
            })

        # Adding PDBe structures/chains
        pdb_chains = {}     # mapping PDB-chain ID -> chain segments
        pdb_documents = {}  # mapping PDB-chain ID -> ES document
        for pdb_id in uniprot2pdbe.get(uniprot_acc, []):
            pdb_entry = structures[pdb_id]
            chains = pdb_entry["proteins"][uniprot_acc]

            pdb_doc = doc.copy()
            pdb_doc.update({
                "structure_acc": pdb_id.lower(),
                "structure_resolution": pdb_entry["resolution"],
                "structure_date": pdb_entry["date"],
                "structure_evidence": pdb_entry["evidence"],
                "protein_structure": chains,
                "text_structure": join(pdb_id,
                                       pdb_entry["evidence"],
                                       pdb_entry["name"])
            })

            for chain_id, segments in chains.items():
                pdb_chain_id = f"{pdb_id}-{chain_id}"

                locations = []
                for segment in segments:
                    locations.append({
                        "fragments": [{
                            "start": segment["protein_start"],
                            "end": segment["protein_end"],
                        }]
                    })

                chain_doc = pdb_doc.copy()
                chain_doc.update({
                    "structure_chain_acc": chain_id,
                    "structure_protein_locations": locations,
                    "structure_chain": pdb_chain_id
                })

                pdb_documents[pdb_chain_id] = chain_doc
                pdb_chains[pdb_chain_id] = segments

        # Adding entries
        overlapping_chains = set()  # chains associated to at least one entry
        matches = uniprot2matches.get(uniprot_acc, {})
        num_protein_docs = 0
        for entry_acc, locations in matches.items():
            used_entries.add(entry_acc)  # this entry has been used
            entry = entries[entry_acc]
            if entry.integrated_in:
                interpro_acc = entry.integrated_in.lower()
            else:
                interpro_acc = None

            entry_obj = {
                "entry_acc": entry_acc.lower(),
                "entry_db": entry.database,
                "entry_type": entry.type.lower(),
                "entry_date": entry.creation_date.strftime("%Y-%m-%d"),
                "entry_protein_locations": locations,
                "entry_go_terms": [t["identifier"] for t in entry.go_terms],
                "entry_integrated": interpro_acc,
                "text_entry": join(entry_acc, entry.short_name, entry.name,
                                   entry.type.lower(), interpro_acc),
            }

            if entry.clan:
                entry_obj.update({
                    "set_acc": entry.clan["accession"].lower(),
                    "set_db": entry.database,
                    "text_set": join(entry.clan["accession"],
                                     entry.clan["name"]),
                })

            if entry_acc in dom_members:
                entry_obj.update({
                    "ida_id": dom_arch_id,
                    "ida": dom_arch,
                })

            # Test if the entry overlaps PDB chains
            entry_chains = set()
            for pdb_chain_id, segments in pdb_chains.items():
                if overlaps_pdb_chain(locations, segments):
                    # Entry overlaps chain: associate entry to struct/chain
                    chain_doc = pdb_documents[pdb_chain_id]
                    entry_doc = chain_doc.copy()
                    entry_doc.update(entry_obj)
                    documents.append((
                        entry.database + version,
                        get_rel_doc_id(entry_doc),
                        entry_doc
                    ))

                    entry_chains.add(pdb_chain_id)
                    num_protein_docs += 1

            if entry_chains:
                # Entry overlaps at least one chain
                overlapping_chains |= entry_chains
            else:
                # Associate entry to protein directly
                entry_doc = doc.copy()
                entry_doc.update(entry_obj)
                documents.append((
                    entry.database + version,
                    get_rel_doc_id(entry_doc),
                    entry_doc
                ))
                num_protein_docs += 1

        # Add chains not overlapping any entry
        for chain_id, chain_doc in pdb_documents.items():
            if chain_id in overlapping_chains:
                continue

            chain_doc.update({
                "ida_id": dom_arch_id,
                "ida": dom_arch,
            })

            documents.append((
                # Not overlapping any entry -> not associated to a member DB
                REL_INDEX + version,
                get_rel_doc_id(chain_doc),
                chain_doc
            ))
            num_protein_docs += 1

        if not num_protein_docs:
            # No relationships for this protein: fallback to protein doc
            documents.append((
                REL_INDEX + version,
                get_rel_doc_id(doc),
                doc
            ))

        while len(documents) >= cache_size:
            for org in organizers:
                filepath = org.mktemp()
                dumpobj(filepath, documents[:cache_size])
                os.rename(filepath, f"{filepath}{EXTENSION}")

            del documents[:cache_size]
            num_documents += cache_size

        i += 1
        if not i % 10000000:
            logger.info(f"{i:>12,}")

    logger.info(f"{i:>12,}")

    logger.info("writing remaining documents")

    # Add unused entries
    for entry in entries.values():
        if entry.accession in used_entries or entry.is_deleted:
            continue

        if entry.integrated_in:
            interpro_acc = entry.integrated_in.lower()
        else:
            interpro_acc = None

        doc = init_rel_doc()
        doc.update({
            "entry_acc": entry.accession.lower(),
            "entry_db": entry.database,
            "entry_type": entry.type.lower(),
            "entry_date": entry.creation_date.strftime("%Y-%m-%d"),
            "entry_protein_locations": [],
            "entry_go_terms": [t["identifier"] for t in entry.go_terms],
            "entry_integrated": interpro_acc,
            "text_entry": join(entry.accession, entry.short_name, entry.name,
                               entry.type.lower(), interpro_acc),
        })

        if entry.clan:
            doc.update({
                "set_acc": entry.clan["accession"].lower(),
                "set_db": entry.database,
                "text_set": join(entry.clan["accession"],
                                 entry.clan["name"]),
            })

        documents.append((
            entry.database + version,
            get_rel_doc_id(doc),
            doc
        ))

    # Add unused taxa
    for taxon in taxonomy.values():
        if taxon["id"] in used_taxa:
            continue

        doc = init_rel_doc()
        doc.update({
            "tax_id": taxon["id"],
            "tax_name": taxon["full_name"],
            "tax_lineage": taxon["lineage"],
            "tax_rank": taxon["rank"],
            "text_taxonomy": join(taxon["id"], taxon["full_name"],
                                  taxon["rank"])
        })

        documents.append((
            REL_INDEX + version,
            get_rel_doc_id(doc),
            doc
        ))

    num_documents += len(documents)
    while documents:
        for org in organizers:
            filepath = org.mktemp()
            dumpobj(filepath, documents[:cache_size])
            os.rename(filepath, f"{filepath}{EXTENSION}")

        del documents[:cache_size]

    proteins.close()
    uniprot2ida.close()
    uniprot2matches.close()
    uniprot2proteomes.close()

    for path in outdirs:
        open(os.path.join(path, f"{version}{DONE_SUFFIX}"), "w").close()

    logger.info(f"complete ({num_documents:,} documents)")
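
# Every item buffered in `documents` above is a 3-tuple (index name, document
# ID, document body), written to disk in batches of `cache_size` and, judging
# by the IDA_INDEX/REL_INDEX constants, later loaded into Elasticsearch:
#   (entry.database + version, get_rel_doc_id(doc), {...relationship doc...})
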
def export_entries(url: str, p_metacyc: str, p_clans: str,
                   p_proteins: str, p_structures: str,
                   p_uniprot2matches: str, p_uniprot2proteome: str,
                   p_uniprot2ida: str, p_entry2xrefs: str, p_entries: str,
                   **kwargs):
    min_overlap = kwargs.get("overlap", 0.2)
    processes = kwargs.get("processes", 1)
    min_similarity = kwargs.get("similarity", 0.75)
    tmpdir = kwargs.get("tmpdir")

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    entries = {}
    logger.info("loading active InterPro entries")
    for entry in _get_interpro_entries(cur):
        entries[entry.accession] = entry

    logger.info("enriching entries with IntAct data")
    for accession, interactions in intact.get_interactions(cur).items():
        try:
            entry = entries[accession]
        except KeyError:
            continue
        else:
            entry.ppi = interactions

    logger.info("loading deleted InterPro entries")
    for entry in _get_retired_interpro_entries(cur):
        if entry.accession in entries:
            cur.close()
            con.close()
            raise RuntimeError(f"entry {entry.accession} cannot be "
                               f"both active and deleted")

        entries[entry.accession] = entry

    logger.info("loading member database signatures")
    for entry in _get_signatures(cur):
        if entry.integrated_in and entry.integrated_in not in entries:
            cur.close()
            con.close()
            raise RuntimeError(f"{entry.accession} integrated "
                               f"in missing entry ({entry.integrated_in})")

        entries[entry.accession] = entry

    logger.info("loading past entry names")
    past_names = _get_name_history(cur)

    logger.info("loading past signature integrations")
    past_integrations = _get_integration_history(cur)

    logger.info("loading ENZYME")
    u2enzyme = uniprot.get_swissprot2enzyme(cur)

    logger.info("loading Reactome pathways")
    u2reactome = uniprot.get_swissprot2reactome(cur)
    cur.close()
    con.close()

    logger.info("loading MetaCyc pathways")
    ec2metacyc = metacyc.get_ec2pathways(p_metacyc)

    # Updating entry history
    for entry in entries.values():
        try:
            names = past_names[entry.accession]
        except KeyError:
            pass
        else:
            entry.history["names"] = names

        try:
            signatures = past_integrations[entry.accession]
        except KeyError:
            pass
        else:
            entry.history["signatures"] = signatures

    # Updating entry clan info
    for clan in loadobj(p_clans).values():
        for entry_acc, score, seq_length in clan["members"]:
            try:
                entry = entries[entry_acc]
            except KeyError:
                continue
            else:
                entry.clan = {
                    "accession": clan["accession"],
                    "name": clan["name"]
                }

    inqueue = Queue(maxsize=processes)
    outqueue = Queue()
    workers = []
    for _ in range(max(1, processes - 1)):
        dt = DirectoryTree(tmpdir)
        p = Process(target=_process_proteins,
                    args=(inqueue, entries, min_overlap, dt, outqueue))
        p.start()
        workers.append((p, dt))

    logger.info("processing")
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    i = 0
    for uniprot_acc, matches in u2matches.items():
        inqueue.put((
            uniprot_acc,
            proteins[uniprot_acc],
            matches,
            u2proteome.get(uniprot_acc),
            uniprot2pdbe.get(uniprot_acc, {}),
            set(u2enzyme.get(uniprot_acc, [])),
            set(u2reactome.get(uniprot_acc, []))
        ))

        i += 1
        if not i % 10000000:
            logger.info(f"{i:>15,}")

    proteins.close()
    u2matches.close()
    u2proteome.close()
    logger.info(f"{i:>15,}")

    # Send sentinel
    for _ in workers:
        inqueue.put(None)

    # Merge results from workers
    logger.info("exporting domain architectures")
    entries_with_xrefs = set()
    xref_files = []
    entry_counts = {}
    entry_intersections = {}
    interpro2enzyme = {}
    interpro2reactome = {}
    with Store(p_uniprot2ida, u2matches.get_keys(), tmpdir) as u2ida:
        for _ in workers:
            obj = outqueue.get()
            xref_files.append(obj[0])                               # str
            entries_with_xrefs |= obj[1]                            # set
            ida_file = obj[2]                                       # str
            deepupdate(obj[3], entry_counts, replace=False)         # dict
            deepupdate(obj[4], entry_intersections, replace=False)  # dict
            deepupdate(obj[5], interpro2enzyme)                     # dict
            deepupdate(obj[6], interpro2reactome)                   # dict

            with DumpFile(ida_file) as df:
                i = 0
                for uniprot_acc, dom_members, dom_str, dom_id in df:
                    u2ida[uniprot_acc] = (
                        dom_members,
                        dom_str,
                        dom_id
                    )
                    i += 1

                    if not i % 1000:
                        u2ida.sync()

                u2ida.sync()

        size = u2ida.merge(processes=processes)

    # Adding empty EntryXrefs objects for entries without xrefs
    xref_files.append(workers[0][1].mktemp())
    with DumpFile(xref_files[-1], compress=True) as df:
        for entry_acc in sorted(set(entries.keys()) - entries_with_xrefs):
            df.dump((entry_acc, EntryXrefs().asdict()))

    logger.info("exporting cross-references")
    with DumpFile(p_entry2xrefs, compress=True) as df:
        for entry_acc, xrefs in merge_dumps(xref_files):
            df.dump((entry_acc, xrefs))

            entry = entries[entry_acc]

            # Reactome pathways
            if entry_acc in interpro2reactome:
                pathways = interpro2reactome[entry_acc]
                entry.pathways["reactome"] = [
                    dict(zip(("id", "name"), pthw))
                    for pthw in sorted(pathways)
                ]

            # EC numbers
            if entry_acc in interpro2enzyme:
                ecnos = sorted(interpro2enzyme[entry_acc])
                entry.cross_references["ec"] = ecnos

                # MetaCyc pathways
                pathways = set()
                for ecno in ecnos:
                    pathways |= set(ec2metacyc.get(ecno, []))

                if pathways:
                    entry.pathways["metacyc"] = [
                        dict(zip(("id", "name"), pthw))
                        for pthw in sorted(pathways)
                    ]

    for p, dt in workers:
        size += dt.size
        dt.remove()

    logger.info(f"temporary files: {size / 1024 / 1024:.0f} MB")

    logger.info("calculating overlapping relationships")
    supfam = "homologous_superfamily"
    types = (supfam, "domain", "family", "repeat")
    for entry_acc, overlaps in entry_intersections.items():
        entry1 = entries[entry_acc]
        entry_cnt = entry_counts[entry_acc]
        type1 = entry1.type.lower()

        for other_acc, overlap_counts in overlaps.items():
            o1 = overlap_counts["1"]
            o2 = overlap_counts["2"]
            other_cnt = entry_counts[other_acc]

            # Independent coefficients
            coef1 = o1 / (entry_cnt + other_cnt - o1)
            coef2 = o2 / (entry_cnt + other_cnt - o2)

            # Final coefficient: average of independent coefficients
            coef = (coef1 + coef2) * 0.5

            # Containment indices
            c1 = o1 / entry_cnt
            c2 = o2 / other_cnt

            if all([item < min_similarity for item in (coef, c1, c2)]):
                continue

            # Entries are similar enough
            entry2 = entries[other_acc]
            type2 = entry2.type.lower()
            if ((type1 == supfam and type2 in types)
                    or (type1 in types and type2 == supfam)):
                # e1 -> e2 relationship
                entry1.overlaps_with.append({
                    "accession": other_acc,
                    "name": entry2.name,
                    "type": type2
                })

                # e2 -> e1 relationship
                entry2.overlaps_with.append({
                    "accession": entry_acc,
                    "name": entry1.name,
                    "type": type1
                })

    dumpobj(p_entries, entries)

    logger.info("populating ENTRY2PATHWAY")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("TRUNCATE TABLE INTERPRO.ENTRY2PATHWAY")
    cur.close()
    sql = "INSERT INTO INTERPRO.ENTRY2PATHWAY VALUES (:1, :2, :3, :4)"
    with Table(con, sql) as table:
        for e in entries.values():
            for database, pathways in e.pathways.items():
                code = PATHWAY_DATABASE[database]
                for pthw in pathways:
                    table.insert((
                        e.accession,
                        code,
                        pthw["id"],
                        pthw["name"]
                    ))

    con.commit()
    con.close()

    logger.info("complete")
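
# Worked example for the similarity test in export_entries() above (numbers
# invented): with entry_cnt=100, other_cnt=80, o1=70, o2=60:
#   coef1 = 70 / (100 + 80 - 70) ~= 0.64
#   coef2 = 60 / (100 + 80 - 60)  = 0.50
#   coef  = (coef1 + coef2) / 2  ~= 0.57
#   c1    = 70 / 100 = 0.70, c2 = 60 / 80 = 0.75
# With the default min_similarity of 0.75, c2 is not below the threshold, so
# the pair is reported as overlapping (subject to the entry-type check).
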
def _process_proteins(inqueue: Queue, entries: Mapping[str, Entry],
                      min_overlap: float, dt: DirectoryTree,
                      outqueue: Queue):
    xrefs = {}                  # temporary dict accession -> xrefs
    xref_files = []             # files containing xrefs
    entries_with_xrefs = set()  # accessions of entries having xrefs
    entry_counts = {}           # number of matches
    entry_intersections = {}    # number of overlapping matches
    interpro2enzyme = {}        # InterPro-ENZYME mapping
    interpro2reactome = {}      # InterPro-Reactome mapping

    ida_file = dt.mktemp()
    with DumpFile(ida_file, compress=True) as ida_df:
        i = 0
        for obj in iter(inqueue.get, None):
            uniprot_acc = obj[0]   # str
            protein_info = obj[1]  # dict
            matches = obj[2]       # dict
            proteome_id = obj[3]   # str or None
            pdb_entries = obj[4]   # dict
            enzymes = obj[5]       # set
            pathways = obj[6]      # set

            supermatches = []
            all_locations = []
            for entry_acc, locations in matches.items():
                entry = entries[entry_acc]
                if entry.database == "interpro":
                    # Adding EC / Reactome mapping
                    if enzymes:
                        try:
                            interpro2enzyme[entry_acc] |= enzymes
                        except KeyError:
                            interpro2enzyme[entry_acc] = enzymes.copy()

                    if pathways:
                        try:
                            interpro2reactome[entry_acc] |= pathways
                        except KeyError:
                            interpro2reactome[entry_acc] = pathways.copy()
                elif entry.database == "pfam":
                    # Storing matches for IDA
                    for loc in locations:
                        all_locations.append({
                            "pfam": entry_acc,
                            "interpro": entry.integrated_in,
                            # We do not consider fragmented locations
                            "start": loc["fragments"][0]["start"],
                            "end": max(f["end"] for f in loc["fragments"])
                        })

                # Adding cross-references (except IDA, still being calculated)
                try:
                    entry_xrefs = xrefs[entry_acc]
                except KeyError:
                    entry_xrefs = xrefs[entry_acc] = EntryXrefs()
                    entries_with_xrefs.add(entry_acc)

                entry_xrefs.matches += len(locations)
                entry_xrefs.proteins.add((
                    uniprot_acc,
                    protein_info["identifier"]
                ))

                if proteome_id:
                    entry_xrefs.proteomes.add(proteome_id)

                for pdb_id, chains in pdb_entries.items():
                    for chain_id, segments in chains.items():
                        if overlaps_pdb_chain(locations, segments):
                            entry_xrefs.structures.add(pdb_id)
                            break  # Skip other chains

                entry_xrefs.taxa.add(protein_info["taxid"])

                # Create a Supermatch for each integrated signature match
                if entry.integrated_in:
                    # Integrated member database signature
                    interpro_acc = entry.integrated_in
                    root = entries[interpro_acc].hierarchy["accession"]
                    for loc in locations:
                        sm = Supermatch(interpro_acc, loc["fragments"], root)
                        supermatches.append(sm)

            # Finishing IDA
            domains = []
            dom_members = set()
            for loc in sorted(all_locations, key=repr_fragment):
                if loc["interpro"]:
                    domains.append(f"{loc['pfam']}:{loc['interpro']}")
                    dom_members.add(loc["interpro"])
                else:
                    domains.append(loc["pfam"])

                dom_members.add(loc["pfam"])

            if domains:
                # Flush IDA
                dom_str = '-'.join(domains)
                dom_id = hashlib.sha1(dom_str.encode("utf-8")).hexdigest()
                ida_df.dump((uniprot_acc, dom_members, dom_str, dom_id))

                # Adding cross-references now
                for key in dom_members:
                    xrefs[key].ida.add(dom_id)

            # Merging overlapping supermatches
            merged = []
            for sm_to_merge in sorted(supermatches):
                for sm_merged in merged:
                    if sm_merged.overlaps(sm_to_merge, min_overlap):
                        """
                        Supermatches overlap
                            (sm_to_merge has been merged into sm_merged)
                        """
                        break
                else:
                    # sm_to_merge does not overlap with any other supermatches
                    merged.append(sm_to_merge)

            # Group by entry
            merged_grouped = {}
            for sm in merged:
                for interpro_acc in sm.entries:
                    try:
                        merged_grouped[interpro_acc] += sm.fragments
                    except KeyError:
                        merged_grouped[interpro_acc] = list(sm.fragments)

            # Evaluate how entries overlap
            for interpro_acc, fragments1 in merged_grouped.items():
                try:
                    entry_counts[interpro_acc] += 1
                except KeyError:
                    entry_counts[interpro_acc] = 1

                for other_acc, fragments2 in merged_grouped.items():
                    if other_acc >= interpro_acc:
                        continue

                    try:
                        obj = entry_intersections[interpro_acc]
                    except KeyError:
                        obj = entry_intersections[interpro_acc] = {}

                    try:
                        overlaps = obj[other_acc]
                    except KeyError:
                        """
                        Use a dict rather than a list (or tuple)
                        because deepupdate() would concatenate the lists
                        created by different workers
                        """
                        overlaps = obj[other_acc] = {
                            "1": 0,
                            "2": 0,
                        }

                    flag = 0
                    for f1 in fragments1:
                        start1 = f1["start"]
                        end1 = f1["end"]
                        length1 = end1 - start1 + 1

                        for f2 in fragments2:
                            start2 = f2["start"]
                            end2 = f2["end"]
                            length2 = end2 - start2 + 1
                            overlap = min(end1, end2) - max(start1, start2) + 1

                            if not flag & 1 and overlap >= length1 * 0.5:
                                # 1st time fragments overlap >= 50% of f1
                                flag |= 1
                                overlaps["1"] += 1

                            if not flag & 2 and overlap >= length2 * 0.5:
                                # 1st time fragments overlap >= 50% of f2
                                flag |= 2
                                overlaps["2"] += 1

                        if flag == 3:
                            """
                            Both cases already happened
                                -> no need to keep iterating
                            """
                            break

            i += 1
            if not i % 100000:
                # Flush xrefs
                file = dt.mktemp()
                with DumpFile(file, compress=True) as xref_df:
                    for entry_acc in sorted(xrefs):
                        xref_df.dump((entry_acc, xrefs[entry_acc].asdict()))

                xrefs = {}
                xref_files.append(file)

    # Remaining xrefs
    file = dt.mktemp()
    with DumpFile(file, compress=True) as df:
        for entry_acc in sorted(xrefs):
            df.dump((entry_acc, xrefs[entry_acc].asdict()))

    xref_files.append(file)

    # Merge files (each worker will produce one merged file)
    xref_file = dt.mktemp()
    with DumpFile(xref_file, compress=True) as df:
        for entry_acc, xrefs in merge_dumps(xref_files):
            df.dump((entry_acc, xrefs))

    outqueue.put((
        xref_file,
        entries_with_xrefs,
        ida_file,
        entry_counts,
        entry_intersections,
        interpro2enzyme,
        interpro2reactome
    ))