def export_sequences(url: str, keyfile: str, output: str, processes: int = 1,
                     tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute(
            """
            SELECT /*+ PARALLEL */ UX.AC, UP.SEQ_SHORT, UP.SEQ_LONG
            FROM UNIPARC.XREF UX
            INNER JOIN UNIPARC.PROTEIN UP ON UX.UPI = UP.UPI
            WHERE UX.DBID IN (2, 3)
            AND UX.DELETED = 'N'
            """
        )

        i = 0
        for row in cur:
            store[row[0]] = row[2].read() if row[2] is not None else row[1]

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()
        logger.info(f"{i:>12,}")

        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
def export_go(url: str, keyfile: str, output: str, processes: int = 1,
              tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute(
            """
            SELECT CODE, SORT_ORDER, TERM_NAME
            FROM GO.CV_CATEGORIES@GOAPRO
            """
        )
        categories = {row[0]: row[1:] for row in cur}

        cur.execute(
            """
            SELECT E.ACCESSION, D.PRIMARY_ID, D.SECONDARY_ID, D.NOTE
            FROM SPTR.DBENTRY@SWPREAD E
            INNER JOIN SPTR.DBENTRY_2_DATABASE@SWPREAD D
              ON E.DBENTRY_ID = D.DBENTRY_ID
            WHERE E.ENTRY_TYPE IN (0, 1)      -- Swiss-Prot and TrEMBL
              AND E.MERGE_STATUS != 'R'       -- not 'Redundant'
              AND E.DELETED = 'N'             -- not deleted
              AND E.FIRST_PUBLIC IS NOT NULL  -- published
              AND D.DATABASE_ID = 'GO'        -- GO annotation
            """
        )

        i = 0
        for accession, go_id, sec_id, note in cur:
            """
            sec_id -> cat_code:term_name, e.g. C:integral component of membrane
            note   -> go_evidence:source, e.g. IEA:InterPro
            """
            cat_code, term_name = sec_id.split(':', 1)
            cat_order, cat_name = categories[cat_code]
            store.append(accession,
                         (cat_order, go_id, term_name, cat_code, cat_name))

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()
        logger.info(f"{i:>12,}")

        size = store.merge(fn=_post_go, processes=processes)
        logger.info(f"temporary files: {size / 1024 / 1024:.0f} MB")
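# The call above hands `store.merge()` a post-processing callback (`_post_go`),
# which is defined elsewhere in this project. Purely as an illustration, and
# assuming the callback receives the list of tuples appended for one protein
# and returns the cleaned-up value to keep, it could look roughly like this
# (`_post_go_sketch` is a hypothetical name, not the project's implementation):
def _post_go_sketch(values: list) -> list:
    # values: [(cat_order, go_id, term_name, cat_code, cat_name), ...]
    terms = {}
    for cat_order, go_id, term_name, cat_code, cat_name in values:
        # keep each GO term once, ordered by category then identifier
        terms[(cat_order, go_id)] = {
            "identifier": go_id,
            "name": term_name,
            "category": {"code": cat_code, "name": cat_name}
        }
    return [terms[key] for key in sorted(terms)]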
def export_name(url: str, keyfile: str, output: str, processes: int = 1, tmpdir: Optional[str] = None): logger.info("starting") with Store(output, Store.load_keys(keyfile), tmpdir) as store: con = cx_Oracle.connect(url) cur = con.cursor() cur.execute(""" SELECT ACCESSION, DESCR FROM ( SELECT E.ACCESSION, D.DESCR, ROW_NUMBER() OVER ( PARTITION BY E.ACCESSION ORDER BY CV.DESC_ID, -- 1=RecName, 2=AltName, 3=SubName CV.ORDER_IN, -- Swiss-Prot manual order D.DESCR -- TrEMBL alphabetic order ) RN FROM SPTR.DBENTRY@SWPREAD E INNER JOIN SPTR.DBENTRY_2_DESC@SWPREAD D ON E.DBENTRY_ID = D.DBENTRY_ID AND D.DESC_ID IN (1,4,11,13,16,23,25,28,35) --Full description section INNER JOIN SPTR.CV_DESC@SWPREAD CV ON D.DESC_ID = CV.DESC_ID WHERE E.ENTRY_TYPE IN (0, 1) AND E.MERGE_STATUS != 'R' AND E.DELETED = 'N' AND E.FIRST_PUBLIC IS NOT NULL ) WHERE RN = 1 """) i = 0 for accession, description in cur: store[accession] = description i += 1 if not i % 1000000: store.sync() if not i % 10000000: logger.info(f"{i:>12,}") cur.close() con.close() logger.info(f"{i:>12,}") size = store.merge(processes=processes) logger.info(f"temporary files: {size/1024/1024:.0f} MB")
def _write_match_tmp(signatures: dict, u2variants: dict, p_proteins: str, p_uniprot2matches: str, start: str, stop: Optional[str], output: str): proteins = Store(p_proteins) u2matches = Store(p_uniprot2matches) with open(output, "wt", encoding="utf-8") as fh: doc = getDOMImplementation().createDocument(None, None, None) for uniprot_acc, protein in proteins.range(start, stop): elem = doc.createElement("protein") elem.setAttribute("id", uniprot_acc) elem.setAttribute("name", protein["identifier"]) elem.setAttribute("length", str(protein["length"])) elem.setAttribute("crc64", protein["crc64"]) try: protein_entries = u2matches[uniprot_acc] except KeyError: pass else: for signature_acc in sorted(protein_entries): try: signature = signatures[signature_acc] except KeyError: # InterPro entry continue elem.appendChild( _create_match(doc, signature, protein_entries[signature_acc])) finally: elem.writexml(fh, addindent=" ", newl="\n") protein_variants = u2variants.get(uniprot_acc, []) for variant, length, crc64, matches in protein_variants: elem = doc.createElement("protein") elem.setAttribute("id", variant) elem.setAttribute("name", variant) elem.setAttribute("length", str(length)) elem.setAttribute("crc64", crc64) for signature_acc in sorted(matches): try: signature = signatures[signature_acc] except KeyError: # InterPro entry continue elem.appendChild( _create_match(doc, signature, matches[signature_acc])) elem.writexml(fh, addindent=" ", newl="\n")
def export_proteome(url: str, keyfile: str, output: str, processes: int = 1, tmpdir: Optional[str] = None): logger.info("starting") with Store(output, Store.load_keys(keyfile), tmpdir) as store: con = cx_Oracle.connect(url) cur = con.cursor() """ Without the DISTINCT, there would be duplicated rows, e.g. A0A059MHQ6 UP000024941 A0A059MHQ6 UP000024941 Even for duplicated rows, a given UniProt accession is associated to one unique UPID. It's just easier to remove the duplicates at the database level. """ cur.execute(""" SELECT DISTINCT E.ACCESSION, P.UPID FROM SPTR.DBENTRY@SWPREAD E INNER JOIN SPTR.PROTEOME2UNIPROT@SWPREAD P2U ON E.ACCESSION = P2U.ACCESSION AND E.TAX_ID = P2U.TAX_ID INNER JOIN SPTR.PROTEOME@SWPREAD P ON P2U.PROTEOME_ID = P.PROTEOME_ID AND P.IS_REFERENCE = 1 WHERE E.ENTRY_TYPE IN (0, 1) AND E.MERGE_STATUS != 'R' AND E.DELETED = 'N' AND E.FIRST_PUBLIC IS NOT NULL """) i = 0 for accession, upid in cur: store[accession] = upid i += 1 if not i % 1000000: store.sync() if not i % 10000000: logger.info(f"{i:>12,}") cur.close() con.close() logger.info(f"{i:>12,}") size = store.merge(processes=processes) logger.info(f"temporary files: {size/1024/1024:.0f} MB")
def export_evidence(url: str, keyfile: str, output: str, processes: int = 1, tmpdir: Optional[str] = None): logger.info("starting") with Store(output, Store.load_keys(keyfile), tmpdir) as store: con = cx_Oracle.connect(url) cur = con.cursor() cur.execute(""" SELECT ACCESSION, PROTEIN_EXISTENCE_ID, NAME FROM ( SELECT E.ACCESSION, E.PROTEIN_EXISTENCE_ID, GN.NAME, ROW_NUMBER() OVER ( PARTITION BY E.ACCESSION ORDER BY GN.GENE_NAME_TYPE_ID ) RN FROM SPTR.DBENTRY@SWPREAD E LEFT OUTER JOIN SPTR.GENE@SWPREAD G ON E.DBENTRY_ID = G.DBENTRY_ID LEFT OUTER JOIN SPTR.GENE_NAME@SWPREAD GN ON G.GENE_ID = GN.GENE_ID WHERE E.ENTRY_TYPE IN (0, 1) AND E.MERGE_STATUS != 'R' AND E.DELETED = 'N' AND E.FIRST_PUBLIC IS NOT NULL ) WHERE RN = 1 """) i = 0 for accession, evidence, gene in cur: store[accession] = (evidence, gene) i += 1 if not i % 1000000: store.sync() if not i % 10000000: logger.info(f"{i:>12,}") cur.close() con.close() logger.info(f"{i:>12,}") size = store.merge(processes=processes) logger.info(f"temporary files: {size/1024/1024:.0f} MB")
def export_features(url: str, keyfile: str, output: str, processes: int = 1, tmpdir: Optional[str] = None): logger.info("starting") with Store(output, Store.load_keys(keyfile), tmpdir) as store: con = cx_Oracle.connect(url) cur = con.cursor() cur.execute(""" SELECT FM.PROTEIN_AC, FM.METHOD_AC, LOWER(DB.DBSHORT), FM.POS_FROM, FM.POS_TO, FM.SEQ_FEATURE FROM INTERPRO.FEATURE_MATCH FM INNER JOIN INTERPRO.CV_DATABASE DB ON FM.DBCODE = DB.DBCODE """) i = 0 for row in cur: protein_acc = row[0] signature_acc = row[1] database = row[2] pos_start = row[3] pos_end = row[4] seq_feature = row[5] if database == "mobidblt" and seq_feature is None: seq_feature = "Consensus Disorder Prediction" store.update(protein_acc, { signature_acc: { "database": database, "locations": [(pos_start, pos_end, seq_feature)] } }, replace=True) i += 1 if not i % 1000000: store.sync() if not i % 100000000: logger.info(f"{i:>13,}") cur.close() con.close() logger.info(f"{i:>13,}") size = store.merge(processes=processes) logger.info(f"temporary files: {size/1024/1024:.0f} MB")
def export_comments(url: str, keyfile: str, output: str, processes: int = 1, tmpdir: Optional[str] = None): logger.info("starting") with Store(output, Store.load_keys(keyfile), tmpdir) as store: con = cx_Oracle.connect(url) cur = con.cursor() """ Note on the TEXT structure: Some comments have a title (e.g. Q01299) which is not retrieved when joining on CC_STRUCTURE_TYPE_ID = 1 """ cur.execute(""" SELECT E.ACCESSION, B.ORDER_IN, NVL(B.TEXT, SS.TEXT) FROM SPTR.DBENTRY@SWPREAD E INNER JOIN SPTR.COMMENT_BLOCK@SWPREAD B ON E.DBENTRY_ID = B.DBENTRY_ID AND B.COMMENT_TOPICS_ID = 2 -- FUNCTION comments LEFT OUTER JOIN SPTR.COMMENT_STRUCTURE@SWPREAD S ON B.COMMENT_BLOCK_ID = S.COMMENT_BLOCK_ID AND S.CC_STRUCTURE_TYPE_ID = 1 -- TEXT structure LEFT OUTER JOIN SPTR.COMMENT_SUBSTRUCTURE@SWPREAD SS ON S.COMMENT_STRUCTURE_ID = SS.COMMENT_STRUCTURE_ID WHERE E.ENTRY_TYPE IN (0, 1) -- Swiss-Prot and TrEMBL AND E.MERGE_STATUS != 'R' -- not 'Redundant' AND E.DELETED = 'N' -- not deleted AND E.FIRST_PUBLIC IS NOT NULL -- published """) i = 0 for accession, block_number, text in cur: store.append(accession, (block_number, text)) i += 1 if not i % 1000000: store.sync() if not i % 10000000: logger.info(f"{i:>12,}") cur.close() con.close() logger.info(f"{i:>12,}") size = store.merge(fn=_post_comments, processes=processes) logger.info(f"temporary files: {size/1024/1024:.0f} MB")
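# `_post_comments` (the merge callback used just above) is defined elsewhere.
# A minimal sketch, assuming it receives the (block_number, text) tuples
# appended for one protein and returns the texts in block order
# (`_post_comments_sketch` is a hypothetical name):
def _post_comments_sketch(values: list) -> list:
    return [text for _, text in sorted(values, key=lambda x: x[0])]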
def chunk_proteins(url: str, keyfile: str, chunk_size: int = 50000):
    logger.info("loading")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute(
        """
        SELECT PROTEIN_AC
        FROM INTERPRO.PROTEIN
        """
    )
    accessions = [acc for acc, in cur]
    cur.close()
    con.close()

    logger.info("splitting into chunks")
    Store.dump_keys(Store.chunk(accessions, chunk_size), keyfile)
    logger.info("complete")
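# Illustrative sketch of what chunking keys means here: the key file written by
# `chunk_proteins()` is reloaded by the export_* functions via
# `Store.load_keys(keyfile)` so proteins are bucketed into slices of
# `chunk_size` accessions. The real logic lives in `Store.chunk()`;
# `chunk_keys_sketch` below is a hypothetical helper showing the idea only.
def chunk_keys_sketch(accessions: list, chunk_size: int) -> list:
    # keep the first accession of every slice of the sorted accession list
    accessions = sorted(accessions)
    return [accessions[i] for i in range(0, len(accessions), chunk_size)]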
def _write_feature_tmp(features: dict, p_proteins: str, p_uniprot2features: str,
                       start: str, stop: Optional[str], output: str):
    proteins = Store(p_proteins)
    u2features = Store(p_uniprot2features)

    with open(output, "wt", encoding="utf-8") as fh:
        doc = getDOMImplementation().createDocument(None, None, None)

        for uniprot_acc, protein_features in u2features.range(start, stop):
            protein = proteins[uniprot_acc]
            elem = doc.createElement("protein")
            elem.setAttribute("id", uniprot_acc)
            elem.setAttribute("name", protein["identifier"])
            elem.setAttribute("length", str(protein["length"]))
            elem.setAttribute("crc64", protein["crc64"])

            for feature_acc in sorted(protein_features):
                feature = features[feature_acc]
                feature_match = protein_features[feature_acc]

                match = doc.createElement("match")
                match.setAttribute("id", feature_acc)
                match.setAttribute("name", feature["name"])
                match.setAttribute("dbname", feature["database"])
                match.setAttribute("status", 'T')
                match.setAttribute("model", feature_acc)
                match.setAttribute("evd", feature["evidence"])

                for loc in sorted(feature_match["locations"]):
                    # there is only one fragment per location
                    pos_start, pos_end, seq_feature = loc
                    lcn = doc.createElement("lcn")
                    lcn.setAttribute("start", str(pos_start))
                    lcn.setAttribute("end", str(pos_end))
                    if seq_feature:
                        lcn.setAttribute("sequence-feature", seq_feature)
                    match.appendChild(lcn)

                elem.appendChild(match)

            elem.writexml(fh, addindent=" ", newl="\n")
def export_proteins(url: str, keyfile: str, output: str, processes: int = 1, tmpdir: Optional[str] = None): logger.info("starting") with Store(output, Store.load_keys(keyfile), tmpdir) as store: con = cx_Oracle.connect(url) cur = con.cursor() cur.execute(""" SELECT PROTEIN_AC, NAME, DBCODE, LEN, FRAGMENT, TO_CHAR(TAX_ID), CRC64 FROM INTERPRO.PROTEIN """) i = 0 for row in cur: store[row[0]] = { "identifier": row[1], "reviewed": row[2] == 'S', "length": row[3], "fragment": row[4] == 'Y', "taxid": row[5], "crc64": row[6] } i += 1 if not i % 1000000: store.sync() if not i % 10000000: logger.info(f"{i:>12,}") cur.close() con.close() logger.info(f"{i:>12,}") size = store.merge(processes=processes) logger.info(f"temporary files: {size/1024/1024:.0f} MB")
def insert_extra_features(stg_url: str, p_uniprot2features: str): logger.info("starting") con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4") cur = con.cursor() cur.execute("DROP TABLE IF EXISTS webfront_proteinfeature") cur.execute(""" CREATE TABLE webfront_proteinfeature ( feature_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, protein_acc VARCHAR(15) NOT NULL, entry_acc VARCHAR(25) NOT NULL, source_database VARCHAR(10) NOT NULL, location_start INT NOT NULL, location_end INT NOT NULL, sequence_feature VARCHAR(35) ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci """) cur.close() sql = """ INSERT INTO webfront_proteinfeature ( protein_acc, entry_acc, source_database, location_start, location_end, sequence_feature ) VALUES (%s, %s, %s, %s, %s, %s) """ with Store(p_uniprot2features) as proteins, Table(con, sql) as table: i = 0 for uniprot_acc, entries in proteins.items(): for entry_acc, info in entries.items(): for pos_start, pos_end, seq_feature in info["locations"]: table.insert((uniprot_acc, entry_acc, info["database"], pos_start, pos_end, seq_feature)) i += 1 if not i % 10000000: logger.info(f"{i:>12,}") logger.info(f"{i:>12,}") con.commit() logger.info("indexing") cur = con.cursor() cur.execute(""" CREATE INDEX i_proteinfeature ON webfront_proteinfeature (protein_acc) """) cur.close() con.close() logger.info("complete")
def export_entries(url: str, p_metacyc: str, p_clans: str, p_proteins: str, p_structures: str, p_uniprot2matches: str, p_uniprot2proteome: str, p_uniprot2ida: str, p_entry2xrefs: str, p_entries: str, **kwargs): min_overlap = kwargs.get("overlap", 0.2) processes = kwargs.get("processes", 1) min_similarity = kwargs.get("similarity", 0.75) tmpdir = kwargs.get("tmpdir") con = cx_Oracle.connect(url) cur = con.cursor() entries = {} logger.info("loading active InterPro entries") for entry in _get_interpro_entries(cur): entries[entry.accession] = entry logger.info("enriching entries with IntAct data") for accession, interactions in intact.get_interactions(cur).items(): try: entry = entries[accession] except KeyError: continue else: entry.ppi = interactions logger.info("loading deleted InterPro entries") for entry in _get_retired_interpro_entries(cur): if entry.accession in entries: cur.close() con.close() raise RuntimeError(f"entry cannot be active " f"and deleted {entry.accession}") entries[entry.accession] = entry logger.info("loading member database signatures") for entry in _get_signatures(cur): if entry.integrated_in and entry.integrated_in not in entries: cur.close() con.close() raise RuntimeError(f"{entry.accession} integrated " f"in missing entry ({entry.integrated_in})") entries[entry.accession] = entry logger.info("loading past entry names") past_names = _get_name_history(cur) logger.info("loading past signature integrations") past_integrations = _get_integration_history(cur) logger.info("loading ENZYME") u2enzyme = uniprot.get_swissprot2enzyme(cur) logger.info("loading Reactome pathways") u2reactome = uniprot.get_swissprot2reactome(cur) cur.close() con.close() logger.info("loading MetaCyc pathways") ec2metacyc = metacyc.get_ec2pathways(p_metacyc) # Updating entry history for entry in entries.values(): try: names = past_names[entry.accession] except KeyError: pass else: entry.history["names"] = names try: signatures = past_integrations[entry.accession] except KeyError: pass else: entry.history["signatures"] = signatures # Updating entry clan info for clan in loadobj(p_clans).values(): for entry_acc, score, seq_length in clan["members"]: try: entry = entries[entry_acc] except: continue else: entry.clan = { "accession": clan["accession"], "name": clan["name"] } inqueue = Queue(maxsize=processes) outqueue = Queue() workers = [] for _ in range(max(1, processes - 1)): dt = DirectoryTree(tmpdir) p = Process(target=_process_proteins, args=(inqueue, entries, min_overlap, dt, outqueue)) p.start() workers.append((p, dt)) logger.info("processing") uniprot2pdbe = {} for pdb_id, entry in loadobj(p_structures).items(): for uniprot_acc, chains in entry["proteins"].items(): try: uniprot2pdbe[uniprot_acc][pdb_id] = chains except KeyError: uniprot2pdbe[uniprot_acc] = {pdb_id: chains} proteins = Store(p_proteins) u2matches = Store(p_uniprot2matches) u2proteome = Store(p_uniprot2proteome) i = 0 for uniprot_acc, matches in u2matches.items(): inqueue.put(( uniprot_acc, proteins[uniprot_acc], matches, u2proteome.get(uniprot_acc), uniprot2pdbe.get(uniprot_acc, {}), set(u2enzyme.get(uniprot_acc, [])), set(u2reactome.get(uniprot_acc, [])) )) i += 1 if not i % 10000000: logger.info(f"{i:>15,}") proteins.close() u2matches.close() u2proteome.close() logger.info(f"{i:>15,}") # Send sentinel for _ in workers: inqueue.put(None) # Merge results from workers logger.info("exporting domain architectures") entries_with_xrefs = set() xref_files = [] entry_counts = {} entry_intersections = {} interpro2enzyme = {} 
interpro2reactome = {} with Store(p_uniprot2ida, u2matches.get_keys(), tmpdir) as u2ida: for _ in workers: obj = outqueue.get() xref_files.append(obj[0]) # str entries_with_xrefs |= obj[1] # set ida_file = obj[2] # str deepupdate(obj[3], entry_counts, replace=False) # dict deepupdate(obj[4], entry_intersections, replace=False) # dict deepupdate(obj[5], interpro2enzyme) # dict deepupdate(obj[6], interpro2reactome) # dict with DumpFile(ida_file) as df: i = 0 for uniprot_acc, dom_members, dom_str, dom_id in df: u2ida[uniprot_acc] = ( dom_members, dom_str, dom_id ) i += 1 if not i % 1000: u2ida.sync() u2ida.sync() size = u2ida.merge(processes=processes) # Adding empty EntryXrefs objects for entries without xrefs xref_files.append(workers[0][1].mktemp()) with DumpFile(xref_files[-1], compress=True) as df: for entry_acc in sorted(set(entries.keys()) - entries_with_xrefs): df.dump((entry_acc, EntryXrefs().asdict())) logger.info("exporting cross-references") with DumpFile(p_entry2xrefs, compress=True) as df: for entry_acc, xrefs in merge_dumps(xref_files): df.dump((entry_acc, xrefs)) entry = entries[entry_acc] # Reactome pathways if entry_acc in interpro2reactome: pathways = interpro2reactome[entry_acc] entry.pathways["reactome"] = [ dict(zip(("id", "name"), pthw)) for pthw in sorted(pathways) ] # EC numbers if entry_acc in interpro2enzyme: ecnos = sorted(interpro2enzyme[entry_acc]) entry.cross_references["ec"] = ecnos # MetaCyc pathways pathways = set() for ecno in ecnos: pathways |= set(ec2metacyc.get(ecno, [])) if pathways: entry.pathways["metacyc"] = [ dict(zip(("id", "name"), pthw)) for pthw in sorted(pathways) ] for p, dt in workers: size += dt.size dt.remove() logger.info(f"temporary files: {size / 1024 / 1024:.0f} MB") logger.info("calculating overlapping relationships") supfam = "homologous_superfamily" types = (supfam, "domain", "family", "repeat") for entry_acc, overlaps in entry_intersections.items(): entry1 = entries[entry_acc] entry_cnt = entry_counts[entry_acc] type1 = entry1.type.lower() for other_acc, overlap_counts in overlaps.items(): o1 = overlap_counts["1"] o2 = overlap_counts["2"] other_cnt = entry_counts[other_acc] # Independent coefficients coef1 = o1 / (entry_cnt + other_cnt - o1) coef2 = o2 / (entry_cnt + other_cnt - o2) # Final coefficient: average of independent coefficients coef = (coef1 + coef2) * 0.5 # Containment indices c1 = o1 / entry_cnt c2 = o2 / other_cnt if all([item < min_similarity for item in (coef, c1, c2)]): continue # Entries are similar enough entry2 = entries[other_acc] type2 = entry2.type.lower() if ((type1 == supfam and type2 in types) or (type1 in types and type2 == supfam)): # e1 -> e2 relationship entry1.overlaps_with.append({ "accession": other_acc, "name": entry2.name, "type": type2 }) # e2 -> e1 relationship entry2.overlaps_with.append({ "accession": entry_acc, "name": entry1.name, "type": type1 }) dumpobj(p_entries, entries) logger.info("populating ENTRY2PATHWAY") con = cx_Oracle.connect(url) cur = con.cursor() cur.execute("TRUNCATE TABLE INTERPRO.ENTRY2PATHWAY") cur.close() sql = "INSERT INTO INTERPRO.ENTRY2PATHWAY VALUES (:1, :2, :3, :4)" with Table(con, sql) as table: for e in entries.values(): for database, pathways in e.pathways.items(): code = PATHWAY_DATABASE[database] for pthw in pathways: table.insert(( e.accession, code, pthw["id"], pthw["name"] )) con.commit() con.close() logger.info("complete")
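# Worked sketch of the similarity test used in export_entries() above,
# written as a standalone function for clarity: `o1`/`o2` are the overlap
# counts recorded for each direction of the entry pair, and `cnt1`/`cnt2` the
# protein counts of the two entries. The pair is considered similar when any
# union-based coefficient or containment index reaches `min_similarity`
# (0.75 by default). `is_similar_sketch` is an illustrative name only.
def is_similar_sketch(o1: int, o2: int, cnt1: int, cnt2: int,
                      min_similarity: float = 0.75) -> bool:
    coef1 = o1 / (cnt1 + cnt2 - o1)   # overlap vs. union (direction 1)
    coef2 = o2 / (cnt1 + cnt2 - o2)   # overlap vs. union (direction 2)
    coef = (coef1 + coef2) * 0.5      # average of the two coefficients
    c1 = o1 / cnt1                    # containment of entry 1
    c2 = o2 / cnt2                    # containment of entry 2
    return any(value >= min_similarity for value in (coef, c1, c2))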
def export_matches(pro_url: str, stg_url: str, p_proteins: str, p_uniprot2matches: str, outdir: str, processes: int = 8): shutil.copy(os.path.join(os.path.dirname(__file__), "match_complete.dtd"), outdir) logger.info("loading isoforms") u2variants = {} for accession, variant in ippro.get_isoforms(pro_url).items(): protein_acc = variant["protein_acc"] try: variants = u2variants[protein_acc] except KeyError: variants = u2variants[protein_acc] = [] finally: variants.append((accession, variant["length"], variant["crc64"], variant["matches"])) logger.info("loading signatures") con = cx_Oracle.connect(pro_url) cur = con.cursor() signatures = ippro.get_signatures(cur) cur.close() con.close() logger.info("spawning processes") processes = max(1, processes - 1) ctx = mp.get_context(method="spawn") workers = [] with Store(p_proteins) as proteins: proteins_per_file = math.ceil(len(proteins) / processes) start_acc = None for i, uniprot_acc in enumerate(proteins): if not i % proteins_per_file: if start_acc: filename = f"match_{len(workers)+1}.xml" filepath = os.path.join(outdir, filename) p = ctx.Process(target=_write_match_tmp, args=(signatures, u2variants, p_proteins, p_uniprot2matches, start_acc, uniprot_acc, filepath)) p.start() workers.append((p, filepath)) start_acc = uniprot_acc filename = f"match_{len(workers) + 1}.xml" filepath = os.path.join(outdir, filename) p = ctx.Process(target=_write_match_tmp, args=(signatures, u2variants, p_proteins, p_uniprot2matches, start_acc, None, filepath)) p.start() workers.append((p, filepath)) logger.info("concatenating XML files") con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4") cur = con.cursor() cur.execute(""" SELECT name, name_alt, type, num_entries, version, release_date FROM webfront_database ORDER BY name_long """) doc = getDOMImplementation().createDocument(None, None, None) elem = doc.createElement("release") for name, name_alt, db_type, entry_count, version, date in cur: if db_type == "entry": dbinfo = doc.createElement("dbinfo") dbinfo.setAttribute("dbname", name_alt) if version: dbinfo.setAttribute("version", version) if entry_count: dbinfo.setAttribute("entry_count", str(entry_count)) if date: dbinfo.setAttribute("file_date", date.strftime("%d-%b-%y").upper()) elem.appendChild(dbinfo) cur.close() con.close() output = os.path.join(outdir, "match_complete.xml.gz") with gzip.open(output, "wt", encoding="utf-8") as fh: fh.write('<?xml version="1.0" encoding="UTF-8"?>\n') fh.write('<!DOCTYPE interpromatch SYSTEM "match_complete.dtd">\n') fh.write('<interpromatch>\n') elem.writexml(fh, addindent=" ", newl="\n") for i, (p, filepath) in enumerate(workers): p.join() with open(filepath, "rt", encoding="utf-8") as tfh: for line in tfh: fh.write(line) os.remove(filepath) logger.info(f"\t{i+1} / {len(workers)}") fh.write('</interpromatch>\n') logger.info("complete")
def export_documents(src_proteins: str, src_entries: str, src_proteomes: str, src_structures: str, src_taxonomy: str, src_uniprot2ida: str, src_uniprot2matches: str, src_uniprot2proteomes: str, outdirs: Sequence[str], version: str, cache_size: int = 100000): logger.info("preparing data") os.umask(0o002) organizers = [] for path in outdirs: try: shutil.rmtree(path) except FileNotFoundError: pass os.makedirs(path, mode=0o775) organizers.append(DirectoryTree(path)) open(os.path.join(path, f"{version}{LOAD_SUFFIX}"), "w").close() logger.info("loading domain architectures") domains = {} with Store(src_uniprot2ida) as u2ida: for dom_members, dom_arch, dom_arch_id in u2ida.values(): try: dom = domains[dom_arch_id] except KeyError: domains[dom_arch_id] = { "ida_id": dom_arch_id, "ida": dom_arch, "counts": 1 } else: dom["counts"] += 1 logger.info("writing IDA documents") num_documents = 0 domains = list(domains.values()) for i in range(0, len(domains), cache_size): documents = [] for dom in domains[i:i + cache_size]: documents.append(( IDA_INDEX + version, dom["ida_id"], dom )) num_documents += len(documents) for org in organizers: filepath = org.mktemp() dumpobj(filepath, documents) os.rename(filepath, f"{filepath}{EXTENSION}") domains = None proteins = Store(src_proteins) uniprot2ida = Store(src_uniprot2ida) uniprot2matches = Store(src_uniprot2matches) uniprot2proteomes = Store(src_uniprot2proteomes) entries = loadobj(src_entries) # mem: ~1.5 GB proteomes = loadobj(src_proteomes) # mem: <1 GB structures = loadobj(src_structures) # mem: ~ 4GB taxonomy = loadobj(src_taxonomy) # mem: ~ 2.5GB uniprot2pdbe = {} # mem: <1 GB for pdb_id, entry in structures.items(): for uniprot_acc in entry["proteins"]: try: uniprot2pdbe[uniprot_acc].append(pdb_id) except KeyError: uniprot2pdbe[uniprot_acc] = [pdb_id] logger.info("writing relationship documents") i = 0 documents = [] used_entries = set() used_taxa = set() for uniprot_acc, info in proteins.items(): taxid = info["taxid"] taxon = taxonomy[taxid] used_taxa.add(taxid) # remember that this taxon has been used try: dom_members, dom_arch, dom_arch_id = uniprot2ida[uniprot_acc] except KeyError: dom_members = [] dom_arch = dom_arch_id = None # Create an empty document (all properties set to None) doc = init_rel_doc() doc.update({ "protein_acc": uniprot_acc.lower(), "protein_length": info["length"], "protein_is_fragment": info["fragment"], "protein_db": "reviewed" if info["reviewed"] else "unreviewed", "text_protein": join(uniprot_acc, info["identifier"]), # Taxonomy "tax_id": taxid, "tax_name": taxon["sci_name"], "tax_lineage": taxon["lineage"], "tax_rank": taxon["rank"], "text_taxonomy": join(taxid, taxon["full_name"], taxon["rank"]) }) proteome_id = uniprot2proteomes.get(uniprot_acc) if proteome_id: proteome = proteomes[proteome_id] doc.update({ "proteome_acc": proteome_id.lower(), "proteome_name": proteome["name"], "proteome_is_reference": proteome["is_reference"], "text_proteome": join(proteome_id, proteome["name"], proteome["assembly"], proteome["taxon_id"], proteome["strain"]), }) # Adding PDBe structures/chains pdb_chains = {} # mapping PDB-chain ID -> chain segments pdb_documents = {} # mapping PDB-chain ID -> ES document for pdb_id in uniprot2pdbe.get(uniprot_acc, []): pdb_entry = structures[pdb_id] chains = pdb_entry["proteins"][uniprot_acc] pdb_doc = doc.copy() pdb_doc.update({ "structure_acc": pdb_id.lower(), "structure_resolution": pdb_entry["resolution"], "structure_date": pdb_entry["date"], "structure_evidence": pdb_entry["evidence"], 
"protein_structure": chains, "text_structure": join(pdb_id, pdb_entry["evidence"], pdb_entry["name"]) }) for chain_id, segments in chains.items(): pdb_chain_id = f"{pdb_id}-{chain_id}" locations = [] for segment in segments: locations.append({ "fragments": [{ "start": segment["protein_start"], "end": segment["protein_end"], }] }) chain_doc = pdb_doc.copy() chain_doc.update({ "structure_chain_acc": chain_id, "structure_protein_locations": locations, "structure_chain": pdb_chain_id }) pdb_documents[pdb_chain_id] = chain_doc pdb_chains[pdb_chain_id] = segments # Adding entries overlapping_chains = set() # chains associated to at least one entry matches = uniprot2matches.get(uniprot_acc, {}) num_protein_docs = 0 for entry_acc, locations in matches.items(): used_entries.add(entry_acc) # this entry has been used entry = entries[entry_acc] if entry.integrated_in: interpro_acc = entry.integrated_in.lower() else: interpro_acc = None entry_obj = { "entry_acc": entry_acc.lower(), "entry_db": entry.database, "entry_type": entry.type.lower(), "entry_date": entry.creation_date.strftime("%Y-%m-%d"), "entry_protein_locations": locations, "entry_go_terms": [t["identifier"] for t in entry.go_terms], "entry_integrated": interpro_acc, "text_entry": join(entry_acc, entry.short_name, entry.name, entry.type.lower(), interpro_acc), } if entry.clan: entry_obj.update({ "set_acc": entry.clan["accession"].lower(), "set_db": entry.database, "text_set": join(entry.clan["accession"], entry.clan["name"]), }) if entry_acc in dom_members: entry_obj.update({ "ida_id": dom_arch_id, "ida": dom_arch, }) # Test if the entry overlaps PDB chains entry_chains = set() for pdb_chain_id, segments in pdb_chains.items(): if overlaps_pdb_chain(locations, segments): # Entry overlaps chain: associate entry to struct/chain chain_doc = pdb_documents[pdb_chain_id] entry_doc = chain_doc.copy() entry_doc.update(entry_obj) documents.append(( entry.database + version, get_rel_doc_id(entry_doc), entry_doc )) entry_chains.add(pdb_chain_id) num_protein_docs += 1 if entry_chains: # Entry overlaps at least one chain overlapping_chains |= entry_chains else: # Associate entry to protein directly entry_doc = doc.copy() entry_doc.update(entry_obj) documents.append(( entry.database + version, get_rel_doc_id(entry_doc), entry_doc )) num_protein_docs += 1 # Add chains not overlapping any entry for chain_id, chain_doc in pdb_documents.items(): if chain_id in overlapping_chains: continue chain_doc.update({ "ida_id": dom_arch_id, "ida": dom_arch, }) documents.append(( # Not overlapping any entry -> not associated to a member DB REL_INDEX + version, get_rel_doc_id(chain_doc), chain_doc )) num_protein_docs += 1 if not num_protein_docs: # No relationships for this protein: fallback to protein doc documents.append(( REL_INDEX + version, get_rel_doc_id(doc), doc )) while len(documents) >= cache_size: for org in organizers: filepath = org.mktemp() dumpobj(filepath, documents[:cache_size]) os.rename(filepath, f"{filepath}{EXTENSION}") del documents[:cache_size] num_documents += cache_size i += 1 if not i % 10000000: logger.info(f"{i:>12,}") logger.info(f"{i:>12,}") logger.info("writing remaining documents") # Add unused entries for entry in entries.values(): if entry.accession in used_entries or entry.is_deleted: continue if entry.integrated_in: interpro_acc = entry.integrated_in.lower() else: interpro_acc = None doc = init_rel_doc() doc.update({ "entry_acc": entry.accession.lower(), "entry_db": entry.database, "entry_type": entry.type.lower(), "entry_date": 
entry.creation_date.strftime("%Y-%m-%d"), "entry_protein_locations": [], "entry_go_terms": [t["identifier"] for t in entry.go_terms], "entry_integrated": interpro_acc, "text_entry": join(entry.accession, entry.short_name, entry.name, entry.type.lower(), interpro_acc), }) if entry.clan: doc.update({ "set_acc": entry.clan["accession"].lower(), "set_db": entry.database, "text_set": join(entry.clan["accession"], entry.clan["name"]), }) documents.append(( entry.database + version, get_rel_doc_id(doc), doc )) # Add unused taxa for taxon in taxonomy.values(): if taxon["id"] in used_taxa: continue doc = init_rel_doc() doc.update({ "tax_id": taxon["id"], "tax_name": taxon["full_name"], "tax_lineage": taxon["lineage"], "tax_rank": taxon["rank"], "text_taxonomy": join(taxon["id"], taxon["full_name"], taxon["rank"]) }) documents.append(( REL_INDEX + version, get_rel_doc_id(doc), doc )) num_documents += len(documents) while documents: for org in organizers: filepath = org.mktemp() dumpobj(filepath, documents[:cache_size]) os.rename(filepath, f"{filepath}{EXTENSION}") del documents[:cache_size] proteins.close() uniprot2ida.close() uniprot2matches.close() uniprot2proteomes.close() for path in outdirs: open(os.path.join(path, f"{version}{DONE_SUFFIX}"), "w").close() logger.info(f"complete ({num_documents:,} documents)")
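# `overlaps_pdb_chain()` (used in export_documents above and insert_taxonomy
# below) comes from the project's utilities. A minimal sketch, assuming it
# only reports whether any match fragment shares at least one protein position
# with any of the chain's segments (`overlaps_pdb_chain_sketch` is a
# hypothetical stand-in; the real helper may also apply a minimum overlap):
def overlaps_pdb_chain_sketch(locations: list, segments: list) -> bool:
    for loc in locations:
        for frag in loc["fragments"]:
            for segment in segments:
                if (frag["start"] <= segment["protein_end"]
                        and frag["end"] >= segment["protein_start"]):
                    return True
    return False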
def export_matches(url: str, keyfile: str, output: str, processes: int = 1, tmpdir: Optional[str] = None): logger.info("starting") with Store(output, Store.load_keys(keyfile), tmpdir) as store: con = cx_Oracle.connect(url) cur = con.cursor() cur.execute(""" SELECT M.PROTEIN_AC, M.METHOD_AC, M.MODEL_AC, M.POS_FROM, M.POS_TO, M.FRAGMENTS, M.SCORE, E.ENTRY_AC FROM INTERPRO.MATCH M LEFT OUTER JOIN ( SELECT E.ENTRY_AC, EM.METHOD_AC FROM INTERPRO.ENTRY E INNER JOIN INTERPRO.ENTRY2METHOD EM ON E.ENTRY_AC = EM.ENTRY_AC WHERE E.CHECKED = 'Y' ) E ON M.METHOD_AC = E.METHOD_AC """) i = 0 for row in cur: if row[5]: fragments = [] for frag in row[5].split(','): # Format: START-END-STATUS s, e, t = frag.split('-') fragments.append({ "start": int(s), "end": int(e), "dc-status": DC_STATUSES[t] }) else: fragments = [{ "start": row[3], "end": row[4], "dc-status": DC_STATUSES['S'] # Continuous }] store.append( row[0], ( row[1], # signature row[2], # model row[6], # score fragments, row[7] # InterPro entry )) i += 1 if not i % 1000000: store.sync() if not i % 100000000: logger.info(f"{i:>13,}") cur.close() con.close() logger.info(f"{i:>13,}") size = store.merge(fn=_post_matches, processes=processes) logger.info(f"temporary files: {size/1024/1024:.0f} MB")
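# `_post_matches` (the merge callback above) is defined elsewhere in the
# project. Downstream code treats each merged value as a mapping of signature
# accession to a list of locations carrying "fragments" and "model" keys, so a
# hedged sketch of the grouping step could look like this
# (`_post_matches_sketch` is a hypothetical name; the real callback also deals
# with InterPro entries and with sorting/condensing locations):
def _post_matches_sketch(values: list) -> dict:
    matches = {}
    for signature_acc, model_acc, score, fragments, entry_acc in values:
        matches.setdefault(signature_acc, []).append({
            "fragments": sorted(fragments,
                                key=lambda f: (f["start"], f["end"])),
            "model": model_acc,
            "score": score
        })
    return matches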
def export_matches(url: str, outdir: str, tmpdir: Optional[str] = None, processes: int = 8, proteins_per_file: int = 1000000): fd, proteins_file = mkstemp(dir=tmpdir) os.close(fd) os.remove(proteins_file) logger.info("exporting UniParc proteins") con = cx_Oracle.connect(url) cur = con.cursor() keys = [] with KVdb(proteins_file, writeback=True) as kvdb: cur.execute(""" SELECT UPI, LEN, CRC64 FROM UNIPARC.PROTEIN ORDER BY UPI """) for i, (upi, length, crc64) in enumerate(cur): kvdb[upi] = (length, crc64) if not i % 1e6: kvdb.sync() if not i % 1e4: keys.append(upi) kvdb.sync() logger.info("exporting UniParc matches") fd, matches_file = mkstemp(dir=outdir) os.close(fd) with Store(matches_file, keys, tmpdir) as store: cur.execute(""" SELECT MA.UPI, MA.METHOD_AC, MA.MODEL_AC, MA.SEQ_START, MA.SEQ_END, MA.SCORE, MA.SEQ_FEATURE, MA.FRAGMENTS FROM IPRSCAN.MV_IPRSCAN MA INNER JOIN INTERPRO.METHOD ME ON MA.METHOD_AC = ME.METHOD_AC """) i = 0 for row in cur: store.append(row[0], row[1:]) i += 1 if not i % 1e6: store.sync() if not i % 1e9: logger.info(f"{i:>15,}") logger.info(f"{i:>15,}") size = store.merge(fn=merge_matches, processes=processes) logger.info("loading signatures") signatures = ippro.get_signatures(cur) cur.close() con.close() logger.info("spawning processes") ctx = mp.get_context(method="spawn") inqueue = ctx.Queue() outqueue = ctx.Queue() workers = [] for _ in range(max(1, processes - 1)): p = ctx.Process(target=dump_proteins, args=(proteins_file, matches_file, signatures, inqueue, outqueue)) p.start() workers.append(p) with Store(matches_file) as store: num_files = 0 i = 0 from_upi = None for upi in store: i += 1 if not i % 1e8: logger.info(f"{i:>15,}") if i % proteins_per_file == 1: if from_upi: num_files += 1 filename = f"uniparc_match_{num_files}.dump" filepath = os.path.join(outdir, filename) inqueue.put((from_upi, upi, filepath)) from_upi = upi num_files += 1 filename = f"uniparc_match_{num_files}.dump" filepath = os.path.join(outdir, filename) inqueue.put((from_upi, None, filepath)) logger.info(f"{i:>15,}") for _ in workers: inqueue.put(None) logger.info("creating XML archive") output = os.path.join(outdir, "uniparc_match.tar.gz") with tarfile.open(output, "w:gz") as fh: for i in range(num_files): filepath = outqueue.get() fh.add(filepath, arcname=os.path.basename(filepath)) os.remove(filepath) logger.info(f"{i+1:>6}/{num_files}") for p in workers: p.join() size += os.path.getsize(proteins_file) os.remove(proteins_file) os.remove(matches_file) logger.info(f"temporary files: {size/1024**2:.0f} MB") logger.info("complete")
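# `merge_matches` (passed to `store.merge()` above) lives elsewhere in the
# project. `dump_proteins()` below consumes each merged value as a list of
# (signature_acc, model_acc, locations) tuples, with every location unpacked
# as (start, end, score, alignment, fragments). A hedged sketch of that
# grouping, assuming the appended rows are (method, model, start, end, score,
# seq_feature, fragments) as selected above and that SEQ_FEATURE is what ends
# up as the alignment string (`merge_matches_sketch` is a hypothetical name):
def merge_matches_sketch(values: list) -> list:
    signatures = {}
    for method_acc, model_acc, start, end, score, seq_feature, frags in values:
        signatures.setdefault((method_acc, model_acc), []).append(
            (start, end, score, seq_feature, frags)
        )
    return [
        (method_acc, model_acc, sorted(locations, key=lambda loc: loc[:2]))
        for (method_acc, model_acc), locations in signatures.items()
    ]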
def dump_proteins(proteins_file: str, matches_file: str, signatures: dict, inqueue: mp.Queue, outqueue: mp.Queue): doc = getDOMImplementation().createDocument(None, None, None) with KVdb(proteins_file) as kvdb, Store(matches_file) as store: for from_upi, to_upi, filepath in iter(inqueue.get, None): with open(filepath, "wt") as fh: fh.write('<?xml version="1.0" encoding="UTF-8"?>\n') for upi, matches in store.range(from_upi, to_upi): try: length, crc64 = kvdb[upi] except KeyError: """ This may happen because UNIPARC.PROTEIN is refreshed using IPREAD while match data come from ISPRO, which uses UAPRO (more up-to-date than UAREAD) """ continue protein = doc.createElement("protein") protein.setAttribute("id", upi) protein.setAttribute("length", str(length)) protein.setAttribute("crc64", crc64) for signature_acc, model, locations in matches: signature = signatures[signature_acc] match = doc.createElement("match") match.setAttribute("id", signature_acc) match.setAttribute("name", signature["name"]) match.setAttribute("dbname", signature["database"]) match.setAttribute("status", 'T') match.setAttribute("evd", signature["evidence"]) match.setAttribute("model", model) if signature["interpro"]: ipr = doc.createElement("ipr") for attname, value in signature["interpro"]: if value: ipr.setAttribute(attname, value) match.appendChild(ipr) for start, end, score, aln, frags in locations: lcn = doc.createElement("lcn") lcn.setAttribute("start", str(start)) lcn.setAttribute("end", str(end)) if frags: lcn.setAttribute("fragments", frags) if aln: lcn.setAttribute("alignment", aln) lcn.setAttribute("score", str(score)) match.appendChild(lcn) protein.appendChild(match) protein.writexml(fh, addindent=" ", newl="\n") outqueue.put(filepath)
def insert_taxonomy(p_entries: str, p_proteins: str, p_structures: str, p_taxonomy: str, p_uniprot2matches: str, p_uniprot2proteome: str, stg_url: str, p_interpro2taxonomy: str, tmpdir: Optional[str] = None): logger.info("preparing data") dt = DirectoryTree(tmpdir) entries = loadobj(p_entries) taxonomy = loadobj(p_taxonomy) uniprot2pdbe = {} for pdb_id, entry in loadobj(p_structures).items(): for uniprot_acc, chains in entry["proteins"].items(): try: uniprot2pdbe[uniprot_acc][pdb_id] = chains except KeyError: uniprot2pdbe[uniprot_acc] = {pdb_id: chains} proteins = Store(p_proteins) u2matches = Store(p_uniprot2matches) u2proteome = Store(p_uniprot2proteome) logger.info("starting") i = 0 xrefs = {} files = [] for uniprot_acc, info in proteins.items(): taxon_id = info["taxid"] try: taxon = xrefs[taxon_id] except KeyError: taxon = xrefs[taxon_id] = init_xrefs() try: proteome_id = u2proteome[uniprot_acc] except KeyError: pass else: taxon["proteomes"].add(proteome_id) taxon["proteins"]["all"] += 1 protein_structures = uniprot2pdbe.get(uniprot_acc, {}) # Add structures to taxon, regardless of entry matches taxon["structures"]["all"] |= set(protein_structures.keys()) databases = set() for entry_acc, locations in u2matches.get(uniprot_acc, {}).items(): entry = entries[entry_acc] database = entry.database try: taxon["entries"][database].add(entry_acc) except KeyError: taxon["entries"][database] = {entry_acc} if database not in databases: # Counting the protein *once* per database databases.add(database) try: taxon["proteins"]["databases"][database] += 1 except KeyError: taxon["proteins"]["databases"][database] = 1 try: taxon["proteins"]["entries"][entry_acc] += 1 except KeyError: taxon["proteins"]["entries"][entry_acc] = 1 for pdb_id, chains in protein_structures.items(): for chain_id, segments in chains.items(): if overlaps_pdb_chain(locations, segments): try: taxon["structures"]["entries"][entry_acc].add( pdb_id) except KeyError: taxon["structures"]["entries"][entry_acc] = { pdb_id } break # Skip other chains i += 1 if not i % 1000000: output = dt.mktemp() dump_xrefs(xrefs, taxonomy, output) files.append(output) xrefs = {} if not i % 10000000: logger.info(f"{i:>12,}") if xrefs: output = dt.mktemp() dump_xrefs(xrefs, taxonomy, output) files.append(output) xrefs = {} logger.info(f"{i:>12,}") logger.info(f"temporary files: " f"{sum(map(os.path.getsize, files))/1024/1024:.0f} MB") proteins.close() u2matches.close() u2proteome.close() logger.info("populating taxonomy tables") con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4") cur = con.cursor() cur.execute("DROP TABLE IF EXISTS webfront_taxonomy") cur.execute(""" CREATE TABLE webfront_taxonomy ( accession VARCHAR(20) PRIMARY KEY NOT NULL, scientific_name VARCHAR(255) NOT NULL, full_name VARCHAR(512) NOT NULL, lineage LONGTEXT NOT NULL, parent_id VARCHAR(20), rank VARCHAR(20) NOT NULL, children LONGTEXT, counts LONGTEXT NOT NULL ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci """) cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentry") cur.execute(""" CREATE TABLE webfront_taxonomyperentry ( id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, tax_id VARCHAR(20) NOT NULL, entry_acc VARCHAR(25) NOT NULL, counts LONGTEXT NULL NULL ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci """) cur.execute("DROP TABLE IF EXISTS webfront_taxonomyperentrydb") cur.execute(""" CREATE TABLE webfront_taxonomyperentrydb ( id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, tax_id VARCHAR(20) NOT NULL, source_database VARCHAR(10) NOT NULL, counts LONGTEXT NOT NULL 
) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci """) cur.close() table = Table(con, query=""" INSERT INTO webfront_taxonomy VALUES (%s, %s, %s, %s, %s, %s, %s, %s) """) per_entry = Table(con, query=""" INSERT INTO webfront_taxonomyperentry (tax_id,entry_acc,counts) VALUES (%s, %s, %s) """) per_database = Table(con, query=""" INSERT INTO webfront_taxonomyperentrydb (tax_id,source_database,counts) VALUES (%s, %s, %s) """) with DumpFile(p_interpro2taxonomy, compress=True) as interpro2taxonomy: interpro_entries = { entry.accession for entry in entries.values() if entry.database == "interpro" and not entry.is_deleted } i = 0 for taxon_id, taxon_xrefs in merge_dumps(files): taxon = taxonomy[taxon_id] protein_counts = taxon_xrefs.pop("proteins") structure_counts = taxon_xrefs.pop("structures") counts = reduce(taxon_xrefs) # Add total protein count (not grouped by database/entry) counts["proteins"] = protein_counts["all"] # Add total structure count counts["structures"] = len(structure_counts["all"]) # Add total entry count (not grouped by database) counts["entries"]["total"] = sum(counts["entries"].values()) table.insert( (taxon_id, taxon["sci_name"], taxon["full_name"], f" {' '.join(taxon['lineage'])} ", taxon["parent"], taxon["rank"], jsonify(taxon["children"]), jsonify(counts))) # Remove the 'entry' property # (no needed for webfront_taxonomyperentry) entry_counts = counts.pop("entries") database_structures = {} for entry_acc, count in protein_counts["entries"].items(): if entry_acc in interpro_entries: interpro2taxonomy.dump((entry_acc, taxon_id, count)) counts["proteins"] = count try: entry_structures = structure_counts["entries"][entry_acc] except KeyError: counts["structures"] = 0 else: counts["structures"] = len(entry_structures) database = entries[entry_acc].database try: database_structures[database] |= entry_structures except KeyError: database_structures[database] = entry_structures.copy() finally: per_entry.insert((taxon_id, entry_acc, jsonify(counts))) for database, count in protein_counts["databases"].items(): counts.update({ "entries": entry_counts[database], "proteins": count, "structures": len(database_structures.get(database, [])) }) per_database.insert((taxon_id, database, jsonify(counts))) i += 1 if not i % 100000: logger.info(f"{i:>12,}") logger.info(f"{i:>12,}") table.close() per_entry.close() per_database.close() con.commit() dt.remove() logger.info("indexing") cur = con.cursor() cur.execute(""" CREATE INDEX i_webfront_taxonomyperentry_tax ON webfront_taxonomyperentry (tax_id) """) cur.execute(""" CREATE INDEX i_webfront_taxonomyperentry_entry ON webfront_taxonomyperentry (entry_acc) """) cur.execute(""" CREATE INDEX i_webfront_taxonomyperentrydb_tax ON webfront_taxonomyperentrydb (tax_id) """) cur.execute(""" CREATE INDEX i_webfront_taxonomyperentrydb_database ON webfront_taxonomyperentrydb (source_database) """) cur.close() con.close() logger.info("complete")
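# `reduce()` above (a project utility, not functools.reduce) turns the
# accumulated cross-reference sets into counts before they are serialised with
# `jsonify()`. A minimal sketch of that idea, assuming sets are replaced by
# their sizes and nested dicts are processed recursively (`reduce_sketch` is a
# hypothetical stand-in):
def reduce_sketch(src: dict) -> dict:
    dst = {}
    for key, value in src.items():
        if isinstance(value, (set, list, tuple)):
            dst[key] = len(value)
        elif isinstance(value, dict):
            dst[key] = reduce_sketch(value)
        else:
            dst[key] = value
    return dst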
def insert_proteomes(p_entries: str, p_proteins: str, p_proteomes: str, p_structures: str, p_uniprot2ida: str, p_uniprot2matches: str, p_uniprot2proteome: str, stg_url: str): logger.info("preparing data") proteomes = loadobj(p_proteomes) uniprot2pdbe = {} for pdb_id, entry in loadobj(p_structures).items(): for uniprot_acc in entry["proteins"]: try: uniprot2pdbe[uniprot_acc].add(pdb_id) except KeyError: uniprot2pdbe[uniprot_acc] = {pdb_id} # Init all proteomes xrefs = {} for proteome_id in proteomes: xrefs[proteome_id] = { "domain_architectures": set(), "entries": {}, "proteins": 0, "sets": set(), "structures": set(), "taxa": set() } entries = loadobj(p_entries) proteins = Store(p_proteins) u2ida = Store(p_uniprot2ida) u2matches = Store(p_uniprot2matches) u2proteome = Store(p_uniprot2proteome) logger.info("starting") i = 0 for uniprot_acc, proteome_id in u2proteome.items(): proteome = xrefs[proteome_id] proteome["proteins"] += 1 info = proteins[uniprot_acc] proteome["taxa"].add(info["taxid"]) try: dom_members, dom_arch, dom_arch_id = u2ida[uniprot_acc] except KeyError: pass else: proteome["domain_architectures"].add(dom_arch_id) for entry_acc in u2matches.get(uniprot_acc, []): entry = entries[entry_acc] try: proteome["entries"][entry.database].add(entry_acc) except KeyError: proteome["entries"][entry.database] = {entry_acc} if entry.clan: proteome["sets"].add(entry.clan["accession"]) try: pdb_ids = uniprot2pdbe[uniprot_acc] except KeyError: pass else: proteome["structures"] |= pdb_ids i += 1 if not i % 10000000: logger.info(f"{i:>12,}") logger.info(f"{i:>12,}") proteins.close() u2ida.close() u2matches.close() u2proteome.close() con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4") cur = con.cursor() cur.execute("DROP TABLE IF EXISTS webfront_proteome") cur.execute(""" CREATE TABLE webfront_proteome ( accession VARCHAR(20) PRIMARY KEY NOT NULL, name VARCHAR(215) NOT NULL, is_reference TINYINT NOT NULL, strain VARCHAR(512), assembly VARCHAR(512), taxonomy_id VARCHAR(20) NOT NULL, counts LONGTEXT NOT NULL ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci """) cur.close() sql = """ INSERT INTO webfront_proteome VALUES (%s, %s, %s, %s, %s, %s, %s) """ with Table(con, sql) as table: for proteome_id, info in proteomes.items(): counts = reduce(xrefs[proteome_id]) counts["entries"]["total"] = sum(counts["entries"].values()) table.insert( (proteome_id, info["name"], 1 if info["is_reference"] else 0, info["strain"], info["assembly"], info["taxon_id"], jsonify(counts))) con.commit() con.close() logger.info("complete")
def _export_hmms(p_uniprot2matches: str, pro_url: str, dt: DirectoryTree, buffer_size: int = 1000): logger.info("counting hits per model") signatures = {} with Store(p_uniprot2matches) as u2matches: cnt = 0 for entries in u2matches.values(): for entry_acc, locations in entries.items(): for loc in locations: if loc["model"] is None: continue # InterPro entries try: models = signatures[entry_acc] except KeyError: models = signatures[entry_acc] = {} try: models[loc["model"]] += 1 except KeyError: models[loc["model"]] = 1 cnt += 1 if not cnt % 10e6: logger.info(f"{cnt:>12,}") logger.info(f"{cnt:>12,}") for entry_acc, models in signatures.items(): # Select the model with the most hits model_acc = sorted(models, key=lambda k: (-models[k], k))[0] signatures[entry_acc] = model_acc logger.info("processing models") df = DumpFile(dt.mktemp(), compress=True) cnt = 0 ignored = 0 iterator = ippro.get_hmms(pro_url, multi_models=True) for entry_acc, model_acc, hmm_bytes in iterator: try: representative_model = signatures[entry_acc] except KeyError: # Signature without matches, i.e. without representative model ignored += 1 continue if model_acc and model_acc != representative_model: continue hmm_str = gzip.decompress(hmm_bytes).decode("utf-8") df.dump((entry_acc, "hmm", hmm_bytes, "application/gzip", None)) with StringIO(hmm_str) as stream: hmm = hmmer.HMMFile(stream) df.dump((entry_acc, "logo", json.dumps(hmm.logo("info_content_all", "hmm")), "application/json", None)) cnt += 2 if cnt >= buffer_size: df.close() yield df.path df = DumpFile(dt.mktemp(), compress=True) cnt = 0 df.close() yield df.path logger.info(f" {ignored} models ignored")
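# The representative-model selection in _export_hmms() keeps, for each
# signature, the model with the most hits and breaks ties on the model
# accession. A tiny worked example (hypothetical accessions and counts,
# for illustration only):
def _pick_representative_model_example():
    models = {"model_b": 7, "model_a": 7, "model_c": 2}
    # "model_a" wins: same hit count as "model_b", but sorts first
    assert sorted(models, key=lambda k: (-models[k], k))[0] == "model_a"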
def export(p_entries: str, p_uniprot2matches: str, outdir: str): logger.info("loading entries") entries = [] integrated = {} for e in loadobj(p_entries).values(): if e.database == "interpro" and not e.is_deleted: entries.append(e) for signatures in e.integrates.values(): for signature_acc in signatures: integrated[signature_acc] = (e.accession, e.name) logger.info("writing entry.list") with open(os.path.join(outdir, "entry.list"), "wt") as fh: fh.write("ENTRY_AC\tENTRY_TYPE\tENTRY_NAME\n") for e in sorted(entries, key=lambda e: (e.type, e.accession)): fh.write(f"{e.accession}\t{e.type}\t{e.name}\n") logger.info("writing names.dat") with open(os.path.join(outdir, "names.dat"), "wt") as fh: for e in sorted(entries, key=lambda e: e.accession): fh.write(f"{e.accession}\t{e.name}\n") logger.info("writing short_names.dat") with open(os.path.join(outdir, "short_names.dat"), "wt") as fh: for e in sorted(entries, key=lambda e: e.accession): fh.write(f"{e.accession}\t{e.short_name}\n") logger.info("writing interpro2go") with open(os.path.join(outdir, "interpro2go"), "wt") as fh: fh.write(f"!date: {datetime.now():%Y/%m/%d %H:%M:%S}\n") fh.write("!Mapping of InterPro entries to GO\n") fh.write("!\n") for e in sorted(entries, key=lambda e: e.accession): for term in e.go_terms: fh.write(f"InterPro:{e.accession} {e.name} > " f"GO:{term['name']} ; {term['identifier']}\n") logger.info("writing ParentChildTreeFile.txt") with open(os.path.join(outdir, "ParentChildTreeFile.txt"), "wt") as fh: for e in sorted(entries, key=lambda e: e.accession): root = e.hierarchy["accession"] if root == e.accession and e.hierarchy["children"]: _write_node(e.hierarchy, fh, level=0) logger.info("writing protein2ipr.dat.gz") filepath = os.path.join(outdir, "protein2ipr.dat.gz") with gzip.open(filepath, "wt") as fh, Store(p_uniprot2matches) as sh: i = 0 for uniprot_acc, protein_entries in sh.items(): matches = [] for signature_acc in sorted(protein_entries): try: interpro_acc, name = integrated[signature_acc] except KeyError: # Not integrated signature or InterPro entry continue locations = protein_entries[signature_acc] for loc in locations: matches.append(( uniprot_acc, interpro_acc, name, signature_acc, # We do not consider fragmented locations loc["fragments"][0]["start"], max(f["end"] for f in loc["fragments"]) )) for m in sorted(matches): fh.write('\t'.join(map(str, m)) + '\n') i += 1 if not i % 10000000: logger.debug(f"{i:>12,}") logger.info(f"{i:>12,}") logger.info("complete")
def export_structure_matches(url: str, p_proteins: str, p_structures: str, outdir: str): shutil.copy(os.path.join(os.path.dirname(__file__), "feature.dtd"), outdir) logger.info("loading structures") uniprot2pdbe = {} for pdb_id, entry in loadobj(p_structures).items(): for uniprot_acc, chains in entry["proteins"].items(): try: uniprot2pdbe[uniprot_acc][pdb_id] = chains except KeyError: uniprot2pdbe[uniprot_acc] = {pdb_id: chains} logger.info("loading CATH/SCOP domains") uni2prot2cath = pdbe.get_cath_domains(url) uni2prot2scop = pdbe.get_scop_domains(url) logger.info("writing file") output = os.path.join(outdir, "feature.xml.gz") with gzip.open(output, "wt", encoding="utf-8") as fh: fh.write('<?xml version="1.0" encoding="UTF-8"?>\n') fh.write('<!DOCTYPE interprofeature SYSTEM "feature.dtd">\n') fh.write('<interprofeature>\n') with Store(p_proteins) as proteins: doc = getDOMImplementation().createDocument(None, None, None) for uniprot_acc, protein in proteins.items(): pdb_entries = uniprot2pdbe.get(uniprot_acc, {}) cath_entries = uni2prot2cath.get(uniprot_acc, {}) scop_entries = uni2prot2scop.get(uniprot_acc, {}) if pdb_entries or cath_entries or scop_entries: elem = doc.createElement("protein") elem.setAttribute("id", uniprot_acc) elem.setAttribute("name", protein["identifier"]) elem.setAttribute("length", str(protein["length"])) elem.setAttribute("crc64", protein["crc64"]) for pdb_id in sorted(pdb_entries): chains = pdb_entries[pdb_id] for chain_id in sorted(chains): domain = doc.createElement("domain") domain.setAttribute("id", f"{pdb_id}{chain_id}") domain.setAttribute("dbname", "PDB") for loc in chains[chain_id]: start = loc["protein_start"] end = loc["protein_end"] coord = doc.createElement("coord") coord.setAttribute("pdb", pdb_id) coord.setAttribute("chain", chain_id) coord.setAttribute("start", str(start)) coord.setAttribute("end", str(end)) domain.appendChild(coord) elem.appendChild(domain) for domain_id in sorted(cath_entries): entry = cath_entries[domain_id] domain = doc.createElement("domain") domain.setAttribute("id", domain_id) domain.setAttribute("cfn", entry["superfamily"]["id"]) domain.setAttribute("dbname", "CATH") for loc in entry["locations"]: coord = doc.createElement("coord") coord.setAttribute("pdb", entry["pdb_id"]) coord.setAttribute("chain", entry["chain"]) coord.setAttribute("start", str(loc["start"])) coord.setAttribute("end", str(loc["end"])) domain.appendChild(coord) elem.appendChild(domain) for domain_id in sorted(scop_entries): entry = scop_entries[domain_id] domain = doc.createElement("domain") domain.setAttribute("id", domain_id) domain.setAttribute("cfn", entry["superfamily"]["id"]) domain.setAttribute("dbname", "SCOP") for loc in entry["locations"]: coord = doc.createElement("coord") coord.setAttribute("pdb", entry["pdb_id"]) coord.setAttribute("chain", entry["chain"]) coord.setAttribute("start", str(loc["start"])) coord.setAttribute("end", str(loc["end"])) domain.appendChild(coord) elem.appendChild(domain) elem.writexml(fh, addindent=" ", newl="\n") fh.write('</interprofeature>\n') logger.info("complete")
def export_features_matches(url: str, p_proteins: str, p_uniprot2features: str, outdir: str, processes: int = 8): shutil.copy(os.path.join(os.path.dirname(__file__), "extra.dtd"), outdir) logger.info("loading features") con = cx_Oracle.connect(url) cur = con.cursor() features = ippro.get_features(cur) cur.close() con.close() logger.info("spawning processes") processes = max(1, processes - 1) ctx = mp.get_context(method="spawn") workers = [] with Store(p_uniprot2features) as proteins: proteins_per_file = math.ceil(len(proteins) / processes) start_acc = None for i, uniprot_acc in enumerate(proteins): if not i % proteins_per_file: if start_acc: filename = f"extra_{len(workers) + 1}.xml" filepath = os.path.join(outdir, filename) p = ctx.Process(target=_write_feature_tmp, args=(features, p_proteins, p_uniprot2features, start_acc, uniprot_acc, filepath)) p.start() workers.append((p, filepath)) start_acc = uniprot_acc filename = f"extra_{len(workers) + 1}.xml" filepath = os.path.join(outdir, filename) p = ctx.Process(target=_write_feature_tmp, args=(features, p_proteins, p_uniprot2features, start_acc, None, filepath)) p.start() workers.append((p, filepath)) logger.info("concatenating XML files") output = os.path.join(outdir, "extra.xml.gz") with gzip.open(output, "wt", encoding="utf-8") as fh: fh.write('<?xml version="1.0" encoding="UTF-8"?>\n') fh.write('<!DOCTYPE interproextra SYSTEM "extra.dtd">\n') fh.write('<interproextra>\n') doc = getDOMImplementation().createDocument(None, None, None) elem = doc.createElement("release") databases = {(f["database"], f["version"]) for f in features.values()} for name, version in sorted(databases): dbinfo = doc.createElement("dbinfo") dbinfo.setAttribute("dbname", name) if version: dbinfo.setAttribute("version", version) elem.appendChild(dbinfo) elem.writexml(fh, addindent=" ", newl="\n") for i, (p, filepath) in enumerate(workers): p.join() with open(filepath, "rt", encoding="utf-8") as tfh: for line in tfh: fh.write(line) os.remove(filepath) logger.info(f"\t{i+1} / {len(workers)}") fh.write('</interproextra>\n') logger.info("complete")
def insert_proteins(p_entries: str, p_proteins: str, p_structures: str,
                    p_taxonomy: str, p_uniprot2comments: str,
                    p_uniprot2name: str, p_uniprot2evidences: str,
                    p_uniprot2ida: str, p_uniprot2matches: str,
                    p_uniprot2proteome: str, p_uniprot2sequence: str,
                    pro_url: str, stg_url: str):
    logger.info("loading CATH/SCOP domains")
    uniprot2cath = pdbe.get_cath_domains(pro_url)
    uniprot2scop = pdbe.get_scop_domains(pro_url)

    logger.info("preparing data")
    proteins = Store(p_proteins)
    u2comments = Store(p_uniprot2comments)
    u2descriptions = Store(p_uniprot2name)
    u2evidences = Store(p_uniprot2evidences)
    u2ida = Store(p_uniprot2ida)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)
    u2sequence = Store(p_uniprot2sequence)

    taxonomy = {}
    for taxid, info in loadobj(p_taxonomy).items():
        taxonomy[taxid] = jsonify({
            "taxId": taxid,
            "scientificName": info["sci_name"],
            "fullName": info["full_name"]
        })

    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc in entry["proteins"]:
            try:
                uniprot2pdbe[uniprot_acc].append(pdb_id)
            except KeyError:
                uniprot2pdbe[uniprot_acc] = [pdb_id]

    logger.info("counting proteins/IDA")
    ida_count = {}
    for dom_members, dom_arch, dom_arch_id in u2ida.values():
        try:
            ida_count[dom_arch_id] += 1
        except KeyError:
            ida_count[dom_arch_id] = 1

    logger.info("inserting proteins")
    entries = loadobj(p_entries)
    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute(
        """
        SELECT protein_acc, COUNT(*)
        FROM webfront_varsplic
        GROUP BY protein_acc
        """
    )
    isoforms = dict(cur.fetchall())

    cur.execute("DROP TABLE IF EXISTS webfront_protein")
    cur.execute(
        """
        CREATE TABLE webfront_protein
        (
            accession VARCHAR(15) PRIMARY KEY NOT NULL,
            identifier VARCHAR(16) NOT NULL,
            organism LONGTEXT NOT NULL,
            name VARCHAR(255) NOT NULL,
            description LONGTEXT,
            sequence LONGBLOB NOT NULL,
            length INT(11) NOT NULL,
            proteome VARCHAR(20),
            gene VARCHAR(70),
            go_terms LONGTEXT,
            evidence_code INT(11) NOT NULL,
            source_database VARCHAR(10) NOT NULL,
            is_fragment TINYINT NOT NULL,
            structure LONGTEXT,
            tax_id VARCHAR(20) NOT NULL,
            ida_id VARCHAR(40),
            ida TEXT,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )
    cur.close()

    i = 0
    sql = """
        INSERT into webfront_protein
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    with Table(con, sql) as table:
        for uniprot_acc, protein_info in proteins.items():
            taxid = protein_info["taxid"]

            try:
                taxon = taxonomy[taxid]
            except KeyError:
                table.close()
                con.close()
                raise RuntimeError(f"{uniprot_acc}: invalid taxon {taxid}")

            try:
                name = u2descriptions[uniprot_acc]
            except KeyError:
                table.close()
                con.close()
                raise RuntimeError(f"{uniprot_acc}: missing name")

            try:
                evidence, gene = u2evidences[uniprot_acc]
            except KeyError:
                table.close()
                con.close()
                raise RuntimeError(f"{uniprot_acc}: missing evidence")

            try:
                sequence = u2sequence[uniprot_acc]
            except KeyError:
                table.close()
                con.close()
                raise RuntimeError(f"{uniprot_acc}: missing sequence")

            proteome_id = u2proteome.get(uniprot_acc)

            clans = []
            databases = {}
            go_terms = {}
            for entry_acc in u2matches.get(uniprot_acc, []):
                entry = entries[entry_acc]

                try:
                    databases[entry.database] += 1
                except KeyError:
                    databases[entry.database] = 1

                if entry.clan:
                    clans.append(entry.clan["accession"])

                for term in entry.go_terms:
                    go_terms[term["identifier"]] = term

            protein_structures = {}
            domains = uniprot2cath.get(uniprot_acc)
            if domains:
                protein_structures["cath"] = {}

                for dom in domains.values():
                    dom_id = dom["id"]

                    protein_structures["cath"][dom_id] = {
                        "domain_id": dom["superfamily"]["id"],
                        "coordinates": dom["locations"]
                    }

            domains = uniprot2scop.get(uniprot_acc)
            if domains:
                protein_structures["scop"] = {}

                for dom in domains.values():
                    dom_id = dom["id"]

                    protein_structures["scop"][dom_id] = {
                        "domain_id": dom["superfamily"]["id"],
                        "coordinates": dom["locations"]
                    }

            try:
                dom_members, dom_arch, dom_arch_id = u2ida[uniprot_acc]
            except KeyError:
                dom_arch = dom_arch_id = None
                dom_count = 0
            else:
                dom_count = ida_count[dom_arch_id]

            table.insert((
                uniprot_acc,
                protein_info["identifier"],
                taxon,
                name,
                jsonify(u2comments.get(uniprot_acc)),
                gzip.compress(sequence.encode("utf-8")),
                protein_info["length"],
                proteome_id,
                gene,
                jsonify(list(go_terms.values())),
                evidence,
                "reviewed" if protein_info["reviewed"] else "unreviewed",
                1 if protein_info["fragment"] else 0,
                jsonify(protein_structures),
                protein_info["taxid"],
                dom_arch_id,
                dom_arch,
                jsonify({
                    "domain_architectures": dom_count,
                    "entries": databases,
                    "isoforms": isoforms.get(uniprot_acc, 0),
                    "proteomes": 1 if proteome_id else 0,
                    "sets": len(set(clans)),
                    "structures": len(uniprot2pdbe.get(uniprot_acc, [])),
                    "taxa": 1
                })
            ))

            i += 1
            if not i % 10000000:
                logger.info(f"{i:>12,}")

    logger.info(f"{i:>12,}")
    con.commit()

    proteins.close()
    u2comments.close()
    u2descriptions.close()
    u2evidences.close()
    u2ida.close()
    u2matches.close()
    u2proteome.close()
    u2sequence.close()

    logger.info("indexing")
    cur = con.cursor()
    cur.execute(
        """
        CREATE UNIQUE INDEX ui_protein_identifier
        ON webfront_protein (identifier)
        """
    )
    cur.execute(
        """
        CREATE INDEX i_protein_proteome
        ON webfront_protein (proteome)
        """
    )
    cur.execute(
        """
        CREATE INDEX i_protein_database
        ON webfront_protein (source_database)
        """
    )
    cur.execute(
        """
        CREATE INDEX i_protein_taxon
        ON webfront_protein (tax_id)
        """
    )
    cur.execute(
        """
        CREATE INDEX i_protein_ida
        ON webfront_protein (ida_id)
        """
    )
    cur.execute(
        """
        CREATE INDEX i_protein_fragment
        ON webfront_protein (is_fragment)
        """
    )
    cur.close()
    con.close()
    logger.info("complete")
def insert_release_notes(p_entries: str, p_proteins: str, p_proteomes: str,
                         p_structures: str, p_taxonomy: str,
                         p_uniprot2matches: str, p_uniprot2proteome: str,
                         rel_url: str, stg_url: str, relfile: str):
    logger.info("preparing data")
    uniprot2pdbe = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc in entry["proteins"]:
            try:
                uniprot2pdbe[uniprot_acc].add(pdb_id)
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id}

    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute(
        """
        SELECT name_long, version
        FROM webfront_database
        WHERE name_long IN ('UniProtKB', 'UniProtKB/Swiss-Prot',
                            'UniProtKB/TrEMBL')
        """
    )
    uniprot = {}
    for name, version in cur:
        uniprot[name] = {
            "version": version,
            "count": 0,
            "signatures": 0,
            "integrated_signatures": 0
        }

    cur.close()
    con.close()

    entries = loadobj(p_entries)
    proteins = Store(p_proteins)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    # Entities found in InterPro
    integrated_proteomes = set()
    integrated_structures = set()
    integrated_taxonomy = set()

    # Number of proteins with GO terms from InterPro
    uniprot2go = 0

    logger.info("starting")
    i = 0
    for uniprot_acc, info in proteins.items():
        i += 1
        if not i % 10000000:
            logger.info(f"{i:>12,}")

        if info["reviewed"]:
            database = uniprot["UniProtKB/Swiss-Prot"]
        else:
            database = uniprot["UniProtKB/TrEMBL"]

        database["count"] += 1

        try:
            matches = u2matches[uniprot_acc]
        except KeyError:
            # No matches
            continue

        # Protein matched by at least one signature
        database["signatures"] += 1

        is_integrated = False
        for entry_acc in matches:
            entry = entries[entry_acc]
            if entry.database == "interpro":
                """
                Protein matched by at least one InterPro entry,
                i.e. at least one integrated signature
                """
                is_integrated = True

                if entry.go_terms:
                    uniprot2go += 1
                    break

        if is_integrated:
            database["integrated_signatures"] += 1

            try:
                proteome_id = u2proteome[uniprot_acc]
            except KeyError:
                pass
            else:
                integrated_proteomes.add(proteome_id)

            try:
                pdb_ids = uniprot2pdbe[uniprot_acc]
            except KeyError:
                pass
            else:
                integrated_structures |= pdb_ids

            integrated_taxonomy.add(info["taxid"])

    proteins.close()
    u2matches.close()
    u2proteome.close()
    logger.info(f"{i:>12,}")

    # Sum Swiss-Prot and TrEMBL counts
    for key in ["count", "signatures", "integrated_signatures"]:
        value_sp = uniprot["UniProtKB/Swiss-Prot"][key]
        value_tr = uniprot["UniProtKB/TrEMBL"][key]
        uniprot["UniProtKB"][key] = value_sp + value_tr

    logger.info("tracking changes since last releases")
    con = MySQLdb.connect(**url2dict(rel_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute(
        """
        SELECT accession, source_database, integrated_id
        FROM webfront_entry
        WHERE is_alive = 1
        """
    )
    public_entries = set()
    public_integrated = set()
    for entry_acc, database, integrated_in in cur:
        if database == "interpro":
            public_entries.add(entry_acc)
        elif integrated_in:
            # Signature already integrated in the previous release
            public_integrated.add(entry_acc)

    cur.execute(
        """
        SELECT name, version
        FROM webfront_database
        WHERE type = 'entry'
        """
    )
    public_databases = dict(cur.fetchall())

    cur.execute("SELECT * FROM webfront_release_note")
    prev_releases = cur.fetchall()
    cur.close()
    con.close()

    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_release_note")
    cur.execute(
        """
        CREATE TABLE webfront_release_note
        (
            version VARCHAR(20) PRIMARY KEY NOT NULL,
            release_date DATETIME NOT NULL,
            content LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )
    cur.executemany(
        """
        INSERT INTO webfront_release_note
        VALUES (%s, %s, %s)
        """, prev_releases
    )
    con.commit()
    prev_releases = None

    cur.execute(
        """
        SELECT name, name_long, version, release_date
        FROM webfront_database
        WHERE type = 'entry'
        """
    )
    staging_databases = {row[0]: (row[1], row[2], row[3]) for row in cur}

    interpro_new = []
    interpro_types = {}
    member_databases = {}
    pubmed_citations = set()
    interpro2go = 0
    latest_entry = None

    for entry in sorted(loadobj(p_entries).values(),
                        key=lambda e: e.creation_date):
        if entry.is_deleted:
            continue

        if entry.database == "interpro":
            for pub in entry.literature.values():
                if pub["PMID"] is not None:
                    pubmed_citations.add(pub["PMID"])

            try:
                interpro_types[entry.type.lower()] += 1
            except KeyError:
                interpro_types[entry.type.lower()] = 1

            if entry.accession not in public_entries:
                interpro_new.append(entry.accession)

            interpro2go += len(entry.go_terms)
            latest_entry = entry.accession
        else:
            try:
                obj = member_databases[entry.database]
            except KeyError:
                database, version, _ = staging_databases[entry.database]

                is_new = is_updated = False
                if entry.database not in public_databases:
                    is_new = True
                elif version != public_databases[entry.database]:
                    is_updated = True

                obj = member_databases[entry.database] = {
                    "name": database,
                    "version": version,
                    "signatures": 0,
                    "integrated_signatures": 0,
                    "recently_integrated": [],
                    "is_new": is_new,
                    "is_updated": is_updated,
                    "sets": set()
                }

            obj["signatures"] += 1
            if entry.integrated_in:
                obj["integrated_signatures"] += 1

                if entry.accession not in public_integrated:
                    # Recent integration
                    obj["recently_integrated"].append(entry.accession)

            if entry.clan:
                obj["sets"].add(entry.clan["accession"])

    # Transform sets of clans to counts:
    for obj in member_databases.values():
        obj["sets"] = len(obj["sets"])

    structures = list(loadobj(p_structures).values())

    proteomes = set(loadobj(p_proteomes).keys())
    errors = integrated_proteomes - proteomes
    if errors:
        raise RuntimeError(f"{len(errors)} invalid proteomes")

    taxa = set(loadobj(p_taxonomy).keys())
    errors = integrated_taxonomy - taxa
    if errors:
        raise RuntimeError(f"{len(errors)} invalid taxa")

    content = {
        "notes": [],  # TODO implement way to pass custom notes
        "interpro": {
            "entries": sum(interpro_types.values()),
            "new_entries": interpro_new,
            "latest_entry": latest_entry,
            "types": interpro_types,
            "go_terms": interpro2go
        },
        "member_databases": member_databases,
        "proteins": uniprot,
        "structures": {
            "total": len(structures),
            "integrated": len(integrated_structures),
            "version": max(entry["date"]
                           for entry in structures).strftime("%Y-%m-%d")
        },
        "proteomes": {
            "total": len(proteomes),
            "integrated": len(integrated_proteomes),
            "version": uniprot["UniProtKB"]["version"]
        },
        "taxonomy": {
            "total": len(taxa),
            "integrated": len(integrated_taxonomy),
            "version": uniprot["UniProtKB"]["version"]
        },
        "citations": len(pubmed_citations)
    }

    _, version, date = staging_databases["interpro"]

    cur.execute(
        """
        SELECT COUNT(*)
        FROM webfront_release_note
        WHERE version = %s
        """, (version,)
    )
    n_rows, = cur.fetchone()

    if n_rows:
        cur.execute(
            """
            UPDATE webfront_release_note
            SET content = %s
            WHERE version = %s
            """, (json.dumps(content), version)
        )
    else:
        cur.execute(
            """
            INSERT INTO webfront_release_note
            VALUES (%s, %s, %s)
            """, (version, date, json.dumps(content))
        )

    con.commit()
    cur.close()
    con.close()

    with open(relfile, "wt") as fh:
        new_integrated = 0
        dbs_integrated = []
        for db in sorted(member_databases.values(), key=lambda x: x["name"]):
            cnt = len(db["recently_integrated"])
            if cnt:
                new_integrated += cnt
                dbs_integrated.append(f"{db['name']} ({cnt})")

        if new_integrated:
            integr_str = (f" integrates {new_integrated} new methods from "
                          f"the {', '.join(dbs_integrated)} databases, and")
        else:
            integr_str = ""

        u_ver = uniprot["UniProtKB"]["version"]
        u_integ = uniprot["UniProtKB"]["integrated_signatures"]
        u_total = uniprot["UniProtKB"]["count"]
        u_cov = round(u_integ / u_total * 100, 1)

        fh.write(f"""\
Title
-----
New releases: InterPro {version} and InterProScan 5.??-{version}

Image: alternate text
---------------------
InterPro: protein sequence analysis & classification

Image: title
------------
InterPro: protein sequence analysis & classification

Summary
-------
InterPro version {version} and InterProScan 5.??-{version} are now available! \
InterPro now features hundreds of new methods integrated \
from partner databases, and InterProScan draws on over \
{sum(interpro_types.values())//1000*1000} entries.

Body
----
<h3>
<a href="http://www.ebi.ac.uk/interpro/">InterPro version {version}</a>
</h3>
<p>
<a href="http://www.ebi.ac.uk/interpro/">InterPro {version}</a>\
{integr_str} covers {u_cov}% of UniProt Knowledgebase release {u_ver}. \
It predicts <a href="http://www.geneontology.org/">Gene Ontology</a> \
(GO) terms for over {uniprot2go/1e6:.0f} million UniProt proteins \
via the InterPro2GO pipeline.
</p>
<p>
The new release includes an update to UniParc (uniparc_match.tar.gz) \
matches to InterPro methods. You can find this on our ftp site: \
<a href="ftp://ftp.ebi.ac.uk/pub/databases/interpro">ftp://ftp.ebi.ac.uk/pub/databases/interpro</a>.
</p>
<p>
For full details, see <a href="//www.ebi.ac.uk/interpro/release_notes/">the latest InterPro Release Notes</a>.
</p>
<h3>
<a href="https://github.com/ebi-pf-team/interproscan">InterProScan 5.??-{version}</a>
</h3>
<p>
InterProScan 5.??-{version} uses data from the newly released InterPro {version}, \
which contains {sum(interpro_types.values()):,} entries. \
You can find the <a href="https://interproscan-docs.readthedocs.io/en/latest/ReleaseNotes.html">full release notes here</a>.
</p>
<p>
If you need help with InterPro or InterProScan, please contact us using \
<a href="http://www.ebi.ac.uk/support/interpro">our support form</a> - \
your message will reach everyone on the team.
</p>

Meta fields: description
------------------------
We are pleased to announce the release of InterPro {version} \
and InterProScan 5.??-{version}!

Meta fields: tags
-----------------
Protein families, InterProScan, InterPro, Protein, \
protein family, protein motif

URL alias
---------
about/news/service-news/InterPro-{version}
""")

    logger.info("complete")
def insert_structures(p_entries: str, p_proteins: str, p_structures: str,
                      p_uniprot2ida: str, p_uniprot2matches: str,
                      p_uniprot2proteome: str, stg_url: str):
    logger.info("preparing data")
    entries = {}
    for entry in loadobj(p_entries).values():
        entries[entry.accession] = (entry.database, entry.clan)

    uniprot2pdbe = {}
    xrefs = {}
    for pdb_id, entry in loadobj(p_structures).items():
        for uniprot_acc, chains in entry["proteins"].items():
            try:
                uniprot2pdbe[uniprot_acc][pdb_id] = chains
            except KeyError:
                uniprot2pdbe[uniprot_acc] = {pdb_id: chains}

        xrefs[pdb_id] = {
            "domain_architectures": set(),
            "entries": {},
            "proteomes": set(),
            "proteins": 0,
            "sets": set(),
            "taxa": set()
        }

    proteins = Store(p_proteins)
    u2ida = Store(p_uniprot2ida)
    u2matches = Store(p_uniprot2matches)
    u2proteome = Store(p_uniprot2proteome)

    logger.info("starting")
    i = 0
    for uniprot_acc in sorted(uniprot2pdbe):
        info = proteins[uniprot_acc]

        try:
            dom_members, dom_arch, dom_arch_id = u2ida[uniprot_acc]
        except KeyError:
            dom_arch_id = None

        proteome_id = u2proteome.get(uniprot_acc)
        matches = u2matches.get(uniprot_acc, {})

        for pdb_id, chains in uniprot2pdbe[uniprot_acc].items():
            _xrefs = xrefs[pdb_id]

            if dom_arch_id:
                _xrefs["domain_architectures"].add(dom_arch_id)

            if proteome_id:
                _xrefs["proteomes"].add(proteome_id)

            _xrefs["proteins"] += 1
            _xrefs["taxa"].add(info["taxid"])

            for entry_acc, locations in matches.items():
                database, clan = entries[entry_acc]

                for chain_id, segments in chains.items():
                    if overlaps_pdb_chain(locations, segments):
                        try:
                            _xrefs["entries"][database].add(entry_acc)
                        except KeyError:
                            _xrefs["entries"][database] = {entry_acc}

                        if clan:
                            _xrefs["sets"].add(clan["accession"])

                        break  # Skip other chains

        i += 1
        if not i % 10000:
            logger.info(f"{i:>12,}")

    logger.info(f"{i:>12,}")

    proteins.close()
    u2ida.close()
    u2matches.close()
    u2proteome.close()

    con = MySQLdb.connect(**url2dict(stg_url), charset="utf8mb4")
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS webfront_structure")
    cur.execute(
        """
        CREATE TABLE webfront_structure
        (
            accession VARCHAR(4) PRIMARY KEY NOT NULL,
            name VARCHAR(512) NOT NULL,
            source_database VARCHAR(10) NOT NULL,
            experiment_type VARCHAR(16) NOT NULL,
            release_date DATETIME NOT NULL,
            resolution FLOAT,
            literature LONGTEXT,
            chains LONGTEXT NOT NULL,
            proteins LONGTEXT NOT NULL,
            secondary_structures LONGTEXT,
            counts LONGTEXT NOT NULL
        ) CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci
        """
    )
    cur.close()

    sql = """
        INSERT INTO webfront_structure
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    with Table(con, sql) as table:
        for pdb_id, info in loadobj(p_structures).items():
            counts = reduce(xrefs[pdb_id])
            counts["entries"]["total"] = sum(counts["entries"].values())

            table.insert((
                pdb_id,
                info["name"],
                "pdb",
                info["evidence"],
                info["date"],
                info["resolution"],
                jsonify(info["citations"]),
                # Sorted list of unique chains (e.g. 'A', 'B', ...)
                jsonify(sorted({chain_id
                                for chains in info["proteins"].values()
                                for chain_id in chains}), nullable=False),
                jsonify(info["proteins"], nullable=False),
                jsonify(info["secondary_structures"]),
                jsonify(counts)
            ))

    con.commit()
    con.close()
    logger.info("complete")
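
# The `chains` column filled above stores the sorted, de-duplicated chain
# identifiers across all proteins of a structure, kept separate from the full
# `proteins` mapping for cheap access. Below is a toy illustration of that set
# comprehension; the data and helper name are invented for the example and are
# not used by the pipeline.
def _unique_chains_demo() -> list:
    proteins = {
        "P12345": {"A": [], "B": []},
        "Q67890": {"B": [], "C": []},
    }
    return sorted({chain_id
                   for chains in proteins.values()
                   for chain_id in chains})  # -> ["A", "B", "C"]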