def update_databases(cur: cx_Oracle.Cursor):
    logger.info("creating table CV_DATABASE")
    oracle.drop_table(cur, "UNIPARC.CV_DATABASE", purge=True)
    cur.execute("""
        CREATE TABLE UNIPARC.CV_DATABASE
        TABLESPACE UNIPARC_TAB
        NOLOGGING
        AS
        SELECT *
        FROM UNIPARC.CV_DATABASE@UAREAD
    """)
    cur.execute("GRANT SELECT ON UNIPARC.CV_DATABASE TO PUBLIC")
    cur.execute("""
        CREATE UNIQUE INDEX PK_CV_DATABASE
        ON UNIPARC.CV_DATABASE (ID)
        TABLESPACE UNIPARC_IND
        NOLOGGING
    """)
    cur.execute("""
        CREATE UNIQUE INDEX UQ_CV_DATABASE$DESCR
        ON UNIPARC.CV_DATABASE (DESCR)
        TABLESPACE UNIPARC_IND
        NOLOGGING
    """)
    cur.execute("""
        ALTER TABLE UNIPARC.CV_DATABASE
        ADD (
            CONSTRAINT PK_CV_DATABASE PRIMARY KEY (ID)
                USING INDEX PK_CV_DATABASE,
            CONSTRAINT UQ_CV_DATABASE$DESCR UNIQUE (DESCR)
                USING INDEX UQ_CV_DATABASE$DESCR
        )
    """)
    oracle.gather_stats(cur, "UNIPARC", "CV_DATABASE")

def update(url: str):
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    update_databases(cur)
    update_proteins(cur)
    update_xrefs(cur)
    cur.close()
    con.close()
    logger.info("complete")

def import_databases(ora_url: str, pg_url: str):
    logger.info("populating")
    pg_con = psycopg2.connect(**url2dict(pg_url))
    with pg_con.cursor() as pg_cur:
        pg_cur.execute("DROP TABLE IF EXISTS database")
        pg_cur.execute(
            """
            CREATE TABLE database (
                id SERIAL NOT NULL
                    CONSTRAINT database_pkey PRIMARY KEY,
                name VARCHAR(50) NOT NULL,
                name_long VARCHAR(50) NOT NULL,
                version VARCHAR(50) NOT NULL,
                updated DATE NOT NULL
            )
            """
        )

        ora_con = cx_Oracle.connect(ora_url)
        ora_cur = ora_con.cursor()
        ora_cur.execute(
            """
            SELECT D.DBCODE, LOWER(D.DBSHORT), D.DBNAME, V.VERSION,
                   V.FILE_DATE
            FROM INTERPRO.CV_DATABASE D
            LEFT OUTER JOIN INTERPRO.DB_VERSION V ON D.DBCODE = V.DBCODE
            """
        )

        for row in ora_cur:
            if row[0] in DATABASES:
                pg_cur.execute(
                    """
                    INSERT INTO database (name, name_long, version, updated)
                    VALUES (%s, %s, %s, %s)
                    """,
                    row[1:]
                )

        ora_cur.close()
        ora_con.close()

        pg_cur.execute(
            """
            CREATE UNIQUE INDEX database_name_idx
            ON database (name)
            """
        )

    pg_con.commit()
    pg_con.close()
    logger.info("complete")

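# `url2dict` is imported from elsewhere in the project and converts a
# PostgreSQL connection string into psycopg2.connect() keyword arguments.
# Below is a minimal, hypothetical sketch of that contract, assuming URLs of
# the form "user:password@host:port/dbname"; the project's actual helper may
# parse more (or different) components.
def _example_url2dict(url: str) -> dict:
    import re

    m = re.match(r"(?P<user>[^:]+):(?P<password>[^@]+)@"
                 r"(?P<host>[^:]+):(?P<port>\d+)/(?P<dbname>.+)", url)
    params = m.groupdict()
    params["port"] = int(params["port"])  # psycopg2 accepts int or str
    return params
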
def export_sib(url: str, emails: dict):
    logger.info("exporting data")
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    """
    The SIB_EXPORT package belongs to INTERPRODP and exports the tables:
      * INTERPRO.ENTRY
      * INTERPRO.ENTRY2ENTRY
      * INTERPRO.METHOD
      * INTERPRO.ENTRY2METHOD
      * INTERPRO.PROTEIN
      * INTERPRO.MATCH
      * INTERPRO.DB_VERSION
    """
    cur.callproc("SIB_EXPORT.EXP_SIB_DATA")
    logger.info("data successfully exported")

    cur.execute("SELECT VERSION FROM INTERPRO.DB_VERSION WHERE DBCODE = 'u'")
    version, = cur.fetchone()
    cur.close()
    con.close()

    email.send(
        emails,
        subject=f"Protein update {version}: data for SIB ready",
        content=f"""\
The data required by SIB was successfully exported.

Please archive the dump on the FTP, and inform SIB that the archive is ready.

Recipients
----------
To: {emails['sib']}

Subject
-------
InterPro data for UniProt private release available

Body
----
Dear Swiss-Prot team,

The interpro.tar.gz archive for UniProt private release {version} \
is available at ftp://ftp-private-beta.ebi.ac.uk/interpro/

Kind regards,
The InterPro Production Team
"""
    )

def _check_matches(con: cx_Oracle.Connection):
    """
    Check that there are no errors in imported matches

    :param con: Oracle connection object
    """
    cur = con.cursor()

    # Matches outside of the protein
    logger.info("checking out-of-bound matches")
    cur.execute("""
        SELECT M.PROTEIN_AC, M.METHOD_AC, M.POS_TO, P.LEN
        FROM INTERPRO.MATCH_NEW M
        INNER JOIN INTERPRO.PROTEIN P
            ON M.PROTEIN_AC = P.PROTEIN_AC
        WHERE M.POS_TO > P.LEN
    """)
    cnt = 0
    for row in cur:
        logger.critical("out-of-bound: {}\t{}\t{}\t{}".format(*row))
        cnt += 1

    if cnt:
        cur.close()
        con.close()
        raise RuntimeError(f"{cnt} out-of-bound matches")

    # Matches with invalid start/end positions
    logger.info("checking invalid matches")
    cur.execute("""
        SELECT PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO
        FROM INTERPRO.MATCH_NEW
        WHERE POS_FROM < 1 OR POS_FROM > POS_TO
    """)
    cnt = 0
    for row in cur:
        logger.critical("invalid: {}\t{}\t{}\t{}".format(*row))
        cnt += 1

    if cnt:
        cur.close()
        con.close()
        raise RuntimeError(f"{cnt} invalid matches")

    cur.close()

def update_variant_matches(url: str):
    """
    Recreate the splice-variant tables with the most recent data
    from SWISSPROT_VARSPLIC.
    TREMBL_VARSPLIC (DBID=25) is obsolete and only contains deleted
    cross-references.

    :param url: Oracle connection string
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    logger.info("updating VARSPLIC_MASTER")
    oracle.truncate_table(cur, "INTERPRO.VARSPLIC_MASTER", reuse_storage=True)
    cur.execute("""
        INSERT INTO INTERPRO.VARSPLIC_MASTER
        SELECT SUBSTR(X.AC, 1, INSTR(X.AC, '-') - 1),
               SUBSTR(X.AC, INSTR(X.AC, '-') + 1),
               P.CRC64,
               P.LEN
        FROM UNIPARC.XREF X
        INNER JOIN UNIPARC.PROTEIN P ON X.UPI = P.UPI
        WHERE X.DBID = 24 AND X.DELETED = 'N'
    """)
    logger.info(f"{cur.rowcount} rows inserted")

    logger.info("updating VARSPLIC_MATCH")
    oracle.truncate_table(cur, "INTERPRO.VARSPLIC_MATCH", reuse_storage=True)
    cur.execute("""
        INSERT INTO INTERPRO.VARSPLIC_MATCH
        SELECT X.AC, MV.METHOD_AC, MV.SEQ_START, MV.SEQ_END, 'T' AS STATUS,
               I2D.DBCODE, I2D.EVIDENCE, SYSDATE, SYSDATE, SYSDATE,
               'INTERPRO', MV.EVALUE, MV.MODEL_AC, MV.FRAGMENTS
        FROM UNIPARC.XREF X
        INNER JOIN IPRSCAN.MV_IPRSCAN MV
            ON X.UPI = MV.UPI
        INNER JOIN INTERPRO.IPRSCAN2DBCODE I2D
            ON MV.ANALYSIS_ID = I2D.IPRSCAN_SIG_LIB_REL_ID
        INNER JOIN INTERPRO.METHOD M
            ON MV.METHOD_AC = M.METHOD_AC
        WHERE X.DBID = 24
        AND X.DELETED = 'N'
        -- Exclude MobiDB-Lite, Phobius, SignalP (Euk, Gram+, Gram-),
        -- TMHMM, COILS
        AND I2D.DBCODE NOT IN ('g', 'j', 'n', 'q', 's', 'v', 'x')
        AND M.SKIP_FLAG = 'N'
    """)
    logger.info(f"{cur.rowcount} rows inserted")

    con.commit()
    cur.close()
    con.close()

def check_proteins_to_scan(url: str):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    logger.info("checking for errors")
    num_errors = check_proteins(cur)
    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} errors found")

    logger.info("no errors found")

    ora.truncate_table(cur, "INTERPRO.PROTEIN_TO_SCAN")
    cur.execute("""
        INSERT INTO INTERPRO.PROTEIN_TO_SCAN (PROTEIN_AC, UPI)
        SELECT P.PROTEIN_AC, X.UPI
        FROM INTERPRO.PROTEIN_CHANGES P
        INNER JOIN UNIPARC.XREF X
            ON P.PROTEIN_AC = X.AC
            AND X.DBID IN (2, 3)    -- Swiss-Prot/TrEMBL
            AND X.DELETED = 'N'     -- Not deleted
    """)
    con.commit()

    ora.gather_stats(cur, "INTERPRO", "PROTEIN_TO_SCAN")
    cur.close()
    con.close()
    logger.info("complete")

def _insert_matches(con: cx_Oracle.Connection):
    """
    Update the MATCH table with data from the staging table

    :param con: Oracle connection object
    """
    cur = con.cursor()

    logger.info("updating MATCH")
    cur.execute("""
        DELETE FROM INTERPRO.MATCH
        WHERE PROTEIN_AC IN (
            SELECT PROTEIN_AC
            FROM INTERPRO.PROTEIN_TO_SCAN
        )
    """)
    logger.info(f"{cur.rowcount} rows deleted")

    cur.execute("""
        INSERT INTO INTERPRO.MATCH
        SELECT * FROM INTERPRO.MATCH_NEW
    """)
    logger.info(f"{cur.rowcount} rows inserted")
    con.commit()

    oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)
    cur.close()

def update_proteins(cur: cx_Oracle.Cursor):
    logger.info("creating table PROTEIN")
    oracle.drop_mview(cur, "UNIPARC.PROTEIN")
    oracle.drop_table(cur, "UNIPARC.PROTEIN", purge=True)
    cur.execute("""
        CREATE TABLE UNIPARC.PROTEIN
        TABLESPACE UNIPARC_TAB
        NOLOGGING
        AS
        SELECT UPI, TIMESTAMP, USERSTAMP, CRC64, LEN, SEQ_SHORT, SEQ_LONG, MD5
        FROM UNIPARC.PROTEIN@UAREAD
    """)
    cur.execute("GRANT SELECT ON UNIPARC.PROTEIN TO PUBLIC")

    logger.info("creating index PK_PROTEIN")
    cur.execute("""
        CREATE UNIQUE INDEX PK_PROTEIN
        ON UNIPARC.PROTEIN (UPI)
        TABLESPACE UNIPARC_IND
    """)

    logger.info("gathering statistics on table PROTEIN")
    oracle.gather_stats(cur, "UNIPARC", "PROTEIN")

def update_feature_matches(url: str):
    """
    Add protein feature matches for recently added/modified sequences

    :param url: Oracle connection string
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    logger.info("updating FEATURE_MATCH")
    cur.execute("""
        DELETE FROM INTERPRO.FEATURE_MATCH
        WHERE PROTEIN_AC IN (
            SELECT PROTEIN_AC
            FROM INTERPRO.PROTEIN_TO_SCAN
        )
    """)
    logger.info(f"{cur.rowcount} rows deleted")

    cur.execute("""
        INSERT INTO INTERPRO.FEATURE_MATCH
        SELECT P.PROTEIN_AC, M.METHOD_AC, M.SEQ_FEATURE, M.SEQ_START,
               M.SEQ_END, D.DBCODE
        FROM INTERPRO.PROTEIN_TO_SCAN P
        INNER JOIN IPRSCAN.MV_IPRSCAN M
            ON P.UPI = M.UPI
        INNER JOIN INTERPRO.IPRSCAN2DBCODE D
            ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
        -- Include MobiDB-Lite, Phobius, SignalP (Euk, Gram+, Gram-),
        -- TMHMM, COILS
        WHERE D.DBCODE IN ('g', 'j', 'n', 's', 'v', 'q', 'x')
    """)
    logger.info(f"{cur.rowcount} rows inserted")

    con.commit()
    cur.close()
    con.close()

def update_xrefs(cur: cx_Oracle.Cursor):
    logger.info("creating table XREF")
    oracle.drop_table(cur, "UNIPARC.XREF", purge=True)
    oracle.drop_table(cur, "UNIPARC.XREF_OLD", purge=True)
    cur.execute("""
        CREATE TABLE UNIPARC.XREF
        TABLESPACE UNIPARC_TAB
        NOLOGGING
        AS
        SELECT UPI, AC, DBID, DELETED, VERSION
        FROM UNIPARC.XREF@UAREAD
    """)
    cur.execute("GRANT SELECT ON UNIPARC.XREF TO PUBLIC")

    for col in ["UPI", "AC", "DBID"]:
        logger.info(f"creating index I_XREF${col}")
        cur.execute(f"""
            CREATE INDEX I_XREF${col}
            ON UNIPARC.XREF ({col})
            TABLESPACE UNIPARC_IND
        """)

    logger.info("gathering statistics on table XREF")
    oracle.gather_stats(cur, "UNIPARC", "XREF")

def track_signature_changes(ora_url: str, pg_url: str,
                            databases: Sequence[Database], data_dir: str):
    # First, get the SwissProt descriptions (before the update)
    all_sig2descs = get_swissprot_descriptions(pg_url)

    con = cx_Oracle.connect(ora_url)
    cur = con.cursor()
    results = {}
    for db in databases:
        cur.execute(
            """
            SELECT M.METHOD_AC, M.NAME, M.DESCRIPTION, M.SIG_TYPE, EM.ENTRY_AC
            FROM INTERPRO.METHOD M
            LEFT OUTER JOIN INTERPRO.ENTRY2METHOD EM
                ON M.METHOD_AC = EM.METHOD_AC
            WHERE DBCODE = :1
            """,
            (db.identifier,)
        )
        old_signatures = {row[0]: row[1:] for row in cur}

        cur.execute(
            """
            SELECT METHOD_AC, NAME, DESCRIPTION, SIG_TYPE
            FROM INTERPRO.METHOD_STG
            WHERE DBCODE = :1
            """,
            (db.identifier,)
        )
        new_signatures = {row[0]: row[1:] for row in cur}

        deleted = []
        name_changes = []
        descr_changes = []
        type_changes = []
        for acc in sorted(old_signatures):
            old_name, old_descr, old_type, entry_acc = old_signatures[acc]

            try:
                new_name, new_descr, new_type = new_signatures.pop(acc)
            except KeyError:
                deleted.append((acc, old_name, old_descr, entry_acc))
                continue

            if old_name != new_name:
                name_changes.append((acc, old_name, new_name))

            if old_descr != new_descr:
                descr_changes.append((acc, old_descr, new_descr))

            if old_type != new_type:
                type_changes.append((acc, old_type, new_type))

        results[db.identifier] = {
            "new": [
                (acc, *new_signatures[acc]) for acc in sorted(new_signatures)
            ],
            "deleted": deleted,
            "changes": {
                "names": name_changes,
                "descriptions": descr_changes,
                "types": type_changes
            },
            "proteins": get_sig_protein_counts(cur, db.identifier),
            "descriptions": {
                acc: all_sig2descs[acc]
                for acc in all_sig2descs if acc in old_signatures
            }
        }
        logger.info(db.name)
        logger.info(f"  new:     {len(results[db.identifier]['new']):>6}")
        logger.info(f"  deleted: {len(results[db.identifier]['deleted']):>6}")

    cur.close()
    con.close()

    with open(os.path.join(data_dir, FILE_DB_SIG), "wb") as fh:
        pickle.dump(results, fh)

def import_sites(url: str, **kwargs):
    databases = kwargs.get("databases", [])
    force_import = kwargs.get("force_import", False)
    threads = kwargs.get("threads", 1)

    if databases:
        # Expects a sequence of Database objects
        databases = {db.analysis_id for db in databases}

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    pending = {}
    for analysis in get_analyses(cur, type="sites"):
        if databases and analysis.id not in databases:
            continue

        try:
            partition = SITE_PARTITIONS[analysis.type]
        except KeyError:
            logger.warning(f"ignoring analysis {analysis.name}")
            continue

        try:
            pending[analysis.table].append((analysis, partition, SITE_SELECT))
        except KeyError:
            pending[analysis.table] = [(analysis, partition, SITE_SELECT)]

    cur.execute("SELECT MAX(UPI) FROM UNIPARC.PROTEIN")
    max_upi, = cur.fetchone()
    cur.close()
    con.close()

    if not pending:
        logger.info("No databases to import")
        return
    elif threads < 1:
        threads = len(pending)

    with ThreadPoolExecutor(max_workers=threads) as executor:
        running = []
        failed = 0
        while True:
            con = cx_Oracle.connect(url)
            cur = con.cursor()

            tmp = []
            for f, table, names in running:
                if not f.done():
                    tmp.append((f, table, names))
                    continue

                try:
                    f.result()
                except Exception as exc:
                    for name in names:
                        logger.info(f"{name:<35} import failed ({exc})")
                    failed += 1
                else:
                    for name in names:
                        logger.info(f"{name:<35} import done")

            running = tmp

            tmp = {}
            for table, analyses in pending.items():
                ready = []
                for analysis, partition, columns in analyses:
                    if analysis.is_ready(cur, max_upi):
                        ready.append((analysis.id, partition, columns))

                if len(ready) < len(analyses):
                    # Not ready
                    tmp[table] = analyses
                    continue

                names = [e[0].name for e in analyses]
                for name in names:
                    logger.info(f"{name:<35} ready for import")

                args = (url, table, "SITE", ready, force_import)
                f = executor.submit(update_analyses, *args)
                running.append((f, table, names))

            pending = tmp
            cur.close()
            con.close()

            if pending or running:
                time.sleep(60)
            else:
                break

    if failed:
        raise RuntimeError(f"{failed} errors")

    con = cx_Oracle.connect(url)
    cur = con.cursor()
    for index in oracle.get_indexes(cur, "IPRSCAN", "SITE"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.catch_temp_error(fn=oracle.rebuild_index,
                                    args=(cur, index["name"]))

    logger.info("gathering statistics")
    oracle.gather_stats(cur, "IPRSCAN", "SITE")
    cur.close()
    con.close()
    logger.info("complete")

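# `get_analyses` and the `Analysis` objects it returns are defined elsewhere.
# The polling loop above only relies on `Analysis.is_ready(cur, max_upi)`
# returning True once an analysis has persisted results up to the most recent
# UniParc protein. A hypothetical sketch of that contract (the project's
# actual implementation may rely on job-tracking tables instead):
class _ExampleAnalysis:
    def __init__(self, analysis_id: int, name: str, table: str):
        self.id = analysis_id
        self.name = name
        self.table = table

    def is_ready(self, cur: cx_Oracle.Cursor, max_upi: str) -> bool:
        # Compare the highest UPI persisted for this analysis with the
        # highest UPI in UNIPARC.PROTEIN (assumed table/column layout)
        cur.execute(
            f"SELECT MAX(UPI) FROM IPRSCAN.{self.table} "
            f"WHERE ANALYSIS_ID = :1",
            (self.id,)
        )
        upi, = cur.fetchone()
        return upi is not None and upi >= max_upi
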
def import_signatures(ora_url: str, pg_url: str, allseqs: str, compseqs: str):
    logger.info("populating")

    with open(allseqs, "rb") as fh:
        allseqs = pickle.load(fh)

    with open(compseqs, "rb") as fh:
        compseqs = pickle.load(fh)

    pg_con = psycopg2.connect(**url2dict(pg_url))
    with pg_con.cursor() as pg_cur:
        pg_cur.execute("SELECT name, id FROM database")
        databases = dict(pg_cur.fetchall())

    ora_con = cx_Oracle.connect(ora_url)
    ora_cur = ora_con.cursor()
    ora_cur.execute(
        """
        SELECT M.METHOD_AC, LOWER(D.DBSHORT), M.NAME, M.DESCRIPTION,
               T.ABBREV, M.ABSTRACT, M.ABSTRACT_LONG
        FROM INTERPRO.METHOD M
        INNER JOIN INTERPRO.CV_DATABASE D ON M.DBCODE = D.DBCODE
        INNER JOIN INTERPRO.CV_ENTRY_TYPE T ON M.SIG_TYPE = T.CODE
        """
    )

    values = []
    for row in ora_cur:
        try:
            db_id = databases[row[1]]
        except KeyError:
            continue

        acc = row[0]
        try:
            n_rev_seqs, n_rev_matches, n_unrev_seqs = allseqs[acc]
        except KeyError:
            n_rev_seqs = n_rev_matches = n_unrev_seqs = 0

        try:
            num_complete_sequences, num_residues = compseqs[acc]
        except KeyError:
            num_complete_sequences = num_residues = 0

        values.append((
            acc,                                              # accession
            db_id,                                            # database_id
            row[2],                                           # name
            row[3],                                           # description
            row[4],                                           # type
            row[6].read() if row[6] is not None else row[5],  # abstract
            n_rev_seqs + n_unrev_seqs,                        # num_sequences
            n_rev_seqs,               # num_reviewed_sequences
            n_rev_matches,            # num_reviewed_matches
            num_complete_sequences,   # num_complete_sequences
            num_residues              # num_residues
        ))

    ora_cur.close()
    ora_con.close()

    with pg_con.cursor() as pg_cur:
        pg_cur.execute("DROP TABLE IF EXISTS signature")
        pg_cur.execute(
            """
            CREATE TABLE signature (
                accession VARCHAR(25) NOT NULL
                    CONSTRAINT signature_pkey PRIMARY KEY,
                database_id INTEGER NOT NULL,
                name VARCHAR(100) NOT NULL,
                description VARCHAR(400),
                type VARCHAR(25) NOT NULL,
                abstract TEXT,
                num_sequences INTEGER NOT NULL,
                num_reviewed_sequences INTEGER NOT NULL,
                num_reviewed_matches INTEGER NOT NULL,
                num_complete_sequences INTEGER NOT NULL,
                num_residues BIGINT NOT NULL
            )
            """
        )
        execute_values(pg_cur, "INSERT INTO signature VALUES %s", values,
                       page_size=1000)
        pg_cur.execute(
            """
            CREATE INDEX signature_database_idx
            ON signature (database_id)
            """
        )

    pg_con.commit()
    pg_con.close()
    logger.info("complete")

def export_xrefs(url: str, outdir: str, emails: dict):
    """
    Format for UniProt dat files:
      <protein> DR <database>; <signature/entry>; <name>; <count>.

    Exceptions:
      - Gene3D: do not include the prefix before the accession (G3DSA:);
        replace the signature's name by a hyphen (-)
      - PRINTS: do not include the match count
      - InterPro: do not include the match count
    """
    logger.info("exporting dat files")
    os.makedirs(outdir, 0o775, exist_ok=True)

    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("SELECT VERSION FROM INTERPRO.DB_VERSION WHERE DBCODE = 'u'")
    release = cur.fetchone()[0]

    cur.execute(
        """
        SELECT PROTEIN_AC, METHOD_AC, MIN(METHOD_NAME), MIN(DBCODE),
               MIN(ENTRY_AC), MIN(SHORT_NAME), COUNT(*)
        FROM INTERPRO.XREF_SUMMARY
        GROUP BY PROTEIN_AC, METHOD_AC
        ORDER BY PROTEIN_AC
        """
    )

    dbcodes = {
        'B': ("SFLD", "SFLD"),
        # 'D': ("ProDom", "PD"),  # ProDom removed from InterPro
        'F': ("PRINTS", "PP"),
        'H': ("Pfam", "PF"),
        'J': ("CDD", "CDD"),
        'M': ("PROSITE", "PR"),
        'N': ("TIGRFAMs", "TF"),
        'P': ("PROSITE", "PR"),
        'Q': ("HAMAP", "HP"),
        'R': ("SMART", "SM"),
        'U': ("PIRSF", "PI"),
        'V': ("PANTHER", "PTHR"),
        'X': ("Gene3D", "G3D"),
        'Y': ("SUPFAM", "SF"),
    }
    files = []
    handlers = {}
    for dbcode in dbcodes:
        if dbcode != 'P':
            # P (PROSITE patterns) -> same file as M (PROSITE profiles)
            dbname, dbkey = dbcodes[dbcode]
            filepath = os.path.join(outdir, dbname + ".dat")
            files.append(filepath)
            handlers[dbcode] = open(filepath, "wt")

    handlers['P'] = handlers['M']

    filepath = os.path.join(outdir, "InterPro.dat")
    files.append(filepath)
    ifh = open(filepath, "wt")

    entries = {}
    prev_acc = None
    for row in cur:
        protein_acc = row[0]
        signature_acc = row[1]
        signature_name = row[2]
        dbcode = row[3]
        entry_acc = row[4]
        entry_name = row[5]
        num_matches = int(row[6])

        dbname, dbkey = dbcodes[dbcode]
        fh = handlers[dbcode]

        if dbcode == 'X':
            """
            Gene3D
              - accession: G3DSA:3.90.1580.10 -> 3.90.1580.10
              - do not print the signature's name
            """
            fh.write(f"{protein_acc} DR {dbname}; {signature_acc[6:]}; "
                     f"-; {num_matches}.\n")
        elif dbcode == 'F':
            # PRINTS: do not print the match count
            fh.write(f"{protein_acc} DR {dbname}; {signature_acc}; "
                     f"{signature_name}.\n")
        else:
            fh.write(f"{protein_acc} DR {dbname}; {signature_acc}; "
                     f"{signature_name}; {num_matches}.\n")

        if protein_acc != prev_acc:
            for entry in sorted(entries):
                name = entries[entry]
                ifh.write(f"{prev_acc} DR InterPro; {entry}; {name}.\n")

            entries = {}
            prev_acc = protein_acc

        entries[entry_acc] = entry_name

    # Last protein
    if prev_acc:
        for entry in sorted(entries):
            name = entries[entry]
            ifh.write(f"{prev_acc} DR InterPro; {entry}; {name}.\n")

    ifh.close()
    for fh in handlers.values():
        fh.close()

    for path in files:
        os.chmod(path, 0o775)

    email.send(
        info=emails,
        to=["uniprot_db"],
        cc=["uniprot_prod"],
        bcc=["sender"],
        subject="InterPro XREF files are ready",
        content=f"""\
Dear UniProt team,

The InterPro cross-references files for {release} are available \
in the following directory:
  {outdir}

Kind regards,
The InterPro Production Team
"""
    )
    logger.info("complete")

def refresh_taxonomy(url: str):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    logger.info("creating TAXONOMY_LOAD")
    oracle.drop_table(cur, "INTERPRO.TAXONOMY_LOAD", purge=True)
    cur.execute("""
        CREATE TABLE INTERPRO.TAXONOMY_LOAD NOLOGGING
        AS
        SELECT TAX_ID, PARENT_ID, SPTR_SCIENTIFIC AS SCIENTIFIC_NAME,
               RANK, NVL(SPTR_COMMON, NCBI_COMMON) AS COMMON_NAME
        FROM TAXONOMY.V_PUBLIC_NODE@SWPREAD
    """)
    oracle.gather_stats(cur, "INTERPRO", "TAXONOMY_LOAD")

    logger.info("populating ETAXI")
    oracle.truncate_table(cur, "INTERPRO.ETAXI", reuse_storage=True)
    cur.execute("""
        INSERT INTO INTERPRO.ETAXI
        SELECT N.TAX_ID, N.PARENT_ID, N.SCIENTIFIC_NAME,
               'X' COMPLETE_GENOME_FLAG, N.RANK, 0 HIDDEN,
               LR.TREE_LEFT LEFT_NUMBER, LR.TREE_RIGHT RIGHT_NUMBER,
               'X' ANNOTATION_SOURCE,
               N.SCIENTIFIC_NAME ||
                   CASE WHEN N.COMMON_NAME IS NULL THEN ''
                        ELSE ' (' || N.COMMON_NAME || ')'
                   END FULL_NAME
        FROM INTERPRO.TAXONOMY_LOAD N
        INNER JOIN (
            SELECT TAX_ID,
                   MIN(TREE_NUMBER) TREE_LEFT,
                   MAX(TREE_NUMBER) TREE_RIGHT
            FROM (
                SELECT PARENT_ID AS TAX_ID, ROWNUM AS TREE_NUMBER
                FROM (
                    SELECT TAX_ID, PARENT_ID
                    FROM (
                        SELECT TAX_ID, PARENT_ID
                        FROM INTERPRO.TAXONOMY_LOAD
                        UNION ALL
                        SELECT 9999999 AS TAX_ID, TAX_ID AS PARENT_ID
                        FROM INTERPRO.TAXONOMY_LOAD
                        UNION ALL
                        SELECT 0 AS TAX_ID, TAX_ID AS PARENT_ID
                        FROM INTERPRO.TAXONOMY_LOAD
                    )
                    START WITH TAX_ID = 1
                    CONNECT BY PRIOR TAX_ID = PARENT_ID
                    ORDER SIBLINGS BY TAX_ID
                )
                WHERE TAX_ID IN (9999999, 0)
            )
            GROUP BY TAX_ID
        ) LR ON LR.TAX_ID = N.TAX_ID
    """)
    con.commit()
    oracle.gather_stats(cur, "INTERPRO", "ETAXI")

    # Dropping temporary table
    oracle.drop_table(cur, "INTERPRO.TAXONOMY_LOAD", purge=True)

    # logger.info("populating UNIPROT_TAXONOMY")
    # oracle.truncate_table(cur, "INTERPRO.UNIPROT_TAXONOMY",
    #                       reuse_storage=True)
    # cur.execute(
    #     """
    #     INSERT INTO INTERPRO.UNIPROT_TAXONOMY
    #     SELECT P.PROTEIN_AC, P.TAX_ID, NVL(ET.LEFT_NUMBER, 0) LEFT_NUMBER,
    #            NVL(ET.RIGHT_NUMBER, 0) RIGHT_NUMBER
    #     FROM INTERPRO.PROTEIN P
    #     LEFT OUTER JOIN INTERPRO.ETAXI ET ON P.TAX_ID = ET.TAX_ID
    #     """
    # )
    # con.commit()
    # oracle.gather_stats(cur, "INTERPRO", "UNIPROT_TAXONOMY")

    # logger.info("populating MV_TAX_ENTRY_COUNT")
    # oracle.truncate_table(cur, "INTERPRO.MV_TAX_ENTRY_COUNT",
    #                       reuse_storage=True)
    # cur.execute(
    #     """
    #     INSERT INTO INTERPRO.MV_TAX_ENTRY_COUNT
    #     WITH QUERY1 AS (
    #         SELECT ENTRY_AC, ANC.PARENT AS TAX_ID, COUNT(1) AS COUNT
    #         FROM INTERPRO.UNIPROT_TAXONOMY UT
    #         JOIN INTERPRO.MV_ENTRY2PROTEIN_TRUE MVEP
    #             ON UT.PROTEIN_AC = MVEP.PROTEIN_AC
    #         JOIN (
    #             SELECT NVL(
    #                        SUBSTR(
    #                            SYS_CONNECT_BY_PATH(TAX_ID, '.'),
    #                            2,
    #                            INSTR(SYS_CONNECT_BY_PATH(TAX_ID, '.'), '.', 2) - 2
    #                        ),
    #                        TAX_ID
    #                    ) AS CHILD,
    #                    TAX_ID AS PARENT
    #             FROM INTERPRO.ETAXI ET
    #             CONNECT BY PRIOR PARENT_ID = TAX_ID
    #         ) ANC ON ANC.CHILD = UT.TAX_ID
    #         GROUP BY ENTRY_AC, ANC.PARENT
    #     ),
    #     QUERY2 AS (
    #         SELECT ENTRY_AC, TAX_ID, COUNT(1) AS COUNT
    #         FROM INTERPRO.UNIPROT_TAXONOMY UT
    #         JOIN INTERPRO.MV_ENTRY2PROTEIN_TRUE MVEP
    #             ON UT.PROTEIN_AC = MVEP.PROTEIN_AC
    #         GROUP BY ENTRY_AC, TAX_ID
    #     )
    #     SELECT QUERY1.ENTRY_AC, QUERY1.TAX_ID, QUERY1.COUNT AS COUNT,
    #            QUERY2.COUNT AS COUNT_SPECIFIED_TAX_ID
    #     FROM QUERY1
    #     LEFT OUTER JOIN QUERY2
    #         ON QUERY1.ENTRY_AC = QUERY2.ENTRY_AC
    #         AND QUERY1.TAX_ID = QUERY2.TAX_ID
    #     """
    # )
    # con.commit()
    # oracle.gather_stats(cur, "INTERPRO", "MV_TAX_ENTRY_COUNT")

    cur.close()
    con.close()
    logger.info("complete")

def delete_obsoletes(url: str, truncate: bool = False, threads: int = 8,
                     step: int = 10000):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    # Count the proteins to delete
    cur.execute("SELECT COUNT(*) FROM INTERPRO.PROTEIN_TO_DELETE")
    stop, = cur.fetchone()
    logger.info(f"{stop:,} proteins to delete")

    if not stop:
        # Nothing to do: exit
        cur.close()
        con.close()
        return

    # Get tables with a FOREIGN KEY to INTERPRO.PROTEIN.PROTEIN_AC
    tables = []
    for table, constraint, column in ora.get_child_tables(cur, "INTERPRO",
                                                          "PROTEIN"):
        if truncate and (table.startswith("MV_") or table.endswith("_NEW")):
            logger.info(f"truncating {table}")
            ora.truncate_table(cur, table)
        else:
            tables.append((table, constraint, column))

    # Add INTERPRO.PROTEIN as we also want to delete rows from this table
    tables.append(("PROTEIN", None, "PROTEIN_AC"))

    logger.info("disabling referential constraints")
    num_errors = 0
    for table, constraint, column in tables:
        if not constraint:
            continue

        try:
            ora.toggle_constraint(cur, table, constraint, False)
        except cx_Oracle.DatabaseError as exc:
            logger.error(exc)
            num_errors += 1

    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} constraints could not be disabled")

    # Find partitions, to run a DELETE statement for each partition
    tasks = []
    for table, constraint, column in tables:
        partitions = ora.get_partitions(cur, "INTERPRO", table)
        if partitions:
            for p in partitions:
                tasks.append((table, p["name"], column))
        else:
            tasks.append((table, None, column))

    cur.close()
    con.close()

    logger.info(f"{len(tasks)} tables to update")
    with futures.ThreadPoolExecutor(max_workers=threads) as executor:
        fs = {}

        for table, partition, column in tasks:
            f = executor.submit(iterative_delete, url, table, partition,
                                column, step, stop)
            fs[f] = (table, partition)

        num_errors = 0
        for f in futures.as_completed(fs):
            table, partition = fs[f]
            if partition:
                name = f"{table} ({partition})"
            else:
                name = table

            try:
                num_rows = f.result()
            except Exception as exc:
                logger.info(f"{name}: failed ({exc})")
                num_errors += 1
            else:
                logger.info(f"{name}: {num_rows:,} rows deleted")

        if num_errors:
            raise RuntimeError(f"{num_errors} tables failed")

    logger.info("enabling referential constraints")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    num_errors = 0
    constraints = set()
    for table, constraint, column in tables:
        if not constraint or constraint in constraints:
            """
            Either no constraint,
            or prevent the same constraint from being enabled several times
            """
            continue

        constraints.add(constraint)

        try:
            ora.toggle_constraint(cur, table, constraint, True)
        except cx_Oracle.DatabaseError as exc:
            logger.error(exc)
            num_errors += 1

    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} constraints could not be enabled")

    for table, constraint, column in tables:
        for index in ora.get_indexes(cur, "INTERPRO", table):
            if index["unusable"]:
                logger.info(f"rebuilding index {index['name']}")
                ora.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")

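# `iterative_delete` is defined elsewhere; it removes obsolete proteins from
# one table (or one partition) in batches, so that undo segments and row
# locks stay small. A minimal sketch, assuming INTERPRO.PROTEIN_TO_DELETE
# numbers its rows 1..N in its ID column (as populated in track_changes);
# the project's actual helper may batch or report progress differently.
def _example_iterative_delete(url: str, table: str, partition: Optional[str],
                              column: str, step: int, stop: int) -> int:
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    dst = f"{table} PARTITION ({partition})" if partition else table
    num_rows = 0
    for low in range(1, stop + 1, step):
        cur.execute(
            f"""
            DELETE FROM INTERPRO.{dst}
            WHERE {column} IN (
                SELECT PROTEIN_AC
                FROM INTERPRO.PROTEIN_TO_DELETE
                WHERE ID BETWEEN :1 AND :2
            )
            """,
            (low, low + step - 1)
        )
        num_rows += cur.rowcount
        con.commit()  # Commit each batch to keep transactions short
    cur.close()
    con.close()
    return num_rows
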
def update_database_site_matches(url: str, databases: Sequence[Database]):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for database in databases:
        logger.info(database.name)

        """
        Same partition names in:
          - IPRSCAN.SITE
          - INTERPRO.SITE_MATCH
        """
        site_partition, ck_matches = SITE_PARTITIONS[database.identifier]

        oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)
        cur.execute("""
            CREATE TABLE INTERPRO.SITE_MATCH_NEW NOLOGGING
            AS SELECT * FROM INTERPRO.SITE_MATCH WHERE 1 = 0
        """)

        logger.debug("\tinserting site matches")
        cur.execute(
            f"""
            INSERT /*+ APPEND */ INTO INTERPRO.SITE_MATCH_NEW
            SELECT X.AC, S.METHOD_AC, S.LOC_START, S.LOC_END, S.DESCRIPTION,
                   S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END, S.NUM_SITES,
                   D.DBCODE
            FROM IPRSCAN.SITE PARTITION ({site_partition}) S
            INNER JOIN UNIPARC.XREF X
                ON S.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
                ON S.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE S.ANALYSIS_ID = :1
            AND X.DBID IN (2, 3)   -- Swiss-Prot or TrEMBL
            AND X.DELETED = 'N'
            """,
            (database.analysis_id,)
        )
        con.commit()

        logger.debug("\tindexing")
        cur.execute("""
            CREATE INDEX I_SITE_MATCH_NEW
            ON INTERPRO.SITE_MATCH_NEW (
                PROTEIN_AC, METHOD_AC, LOC_START, LOC_END
            )
            TABLESPACE INTERPRO_IND
            NOLOGGING
        """)

        if ck_matches:
            logger.debug("\tchecking matches")
            match_partition = MATCH_PARTITIONS[database.identifier]
            cur.execute(f"""
                SELECT COUNT(*)
                FROM (
                    SELECT DISTINCT PROTEIN_AC, METHOD_AC, LOC_START, LOC_END
                    FROM INTERPRO.SITE_MATCH_NEW
                    MINUS
                    SELECT DISTINCT PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO
                    FROM INTERPRO.MATCH PARTITION ({match_partition})
                )
            """)
            cnt, = cur.fetchone()
            if cnt:
                cur.close()
                con.close()
                raise RuntimeError(f"{database.name}: {cnt} matches "
                                   f"in SITE_MATCH_NEW that are not in MATCH")

        logger.debug("\tadding constraints")
        cur.execute("""
            ALTER TABLE INTERPRO.SITE_MATCH_NEW
            ADD CONSTRAINT FK_SITE_MATCH_NEW$P
            FOREIGN KEY (PROTEIN_AC)
            REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.SITE_MATCH_NEW
            ADD CONSTRAINT FK_SITE_MATCH_NEW$D
            FOREIGN KEY (DBCODE)
            REFERENCES INTERPRO.CV_DATABASE (DBCODE)
        """)

        logger.debug("\texchanging partition")
        cur.execute(f"""
            ALTER TABLE INTERPRO.SITE_MATCH
            EXCHANGE PARTITION ({site_partition})
            WITH TABLE INTERPRO.SITE_MATCH_NEW
        """)
        oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)

    for index in oracle.get_indexes(cur, "INTERPRO", "SITE_MATCH"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")

def update_database_feature_matches(url: str, databases: Sequence[Database]):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for database in databases:
        logger.info(database.name)

        oracle.drop_table(cur, "INTERPRO.FEATURE_MATCH_NEW", purge=True)
        logger.debug(f"\tpopulating FEATURE_MATCH_NEW "
                     f"(ANALYSIS_ID: {database.analysis_id})")
        cur.execute("""
            CREATE TABLE INTERPRO.FEATURE_MATCH_NEW NOLOGGING
            AS SELECT * FROM INTERPRO.FEATURE_MATCH WHERE 1 = 0
        """)
        cur.execute(
            """
            INSERT /*+ APPEND */ INTO INTERPRO.FEATURE_MATCH_NEW
            SELECT X.AC, M.METHOD_AC, M.SEQ_FEATURE, M.SEQ_START, M.SEQ_END,
                   D.DBCODE
            FROM IPRSCAN.MV_IPRSCAN M
            INNER JOIN UNIPARC.XREF X
                ON M.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
                ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE M.ANALYSIS_ID = :1
            AND X.DBID IN (2, 3)   -- Swiss-Prot or TrEMBL
            AND X.DELETED = 'N'
            """,
            (database.analysis_id,)
        )
        con.commit()

        # Add constraints to be able to exchange partition
        logger.debug("\tcreating constraints")
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT CK_FMATCH_NEW$FROM
            CHECK (POS_FROM >= 1)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT CK_FMATCH_NEW$NEG
            CHECK (POS_TO >= POS_FROM)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT PK_FMATCH_NEW
            PRIMARY KEY (PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO, DBCODE)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT FK_FMATCH_NEW$M$D
            FOREIGN KEY (METHOD_AC, DBCODE)
            REFERENCES INTERPRO.FEATURE_METHOD (METHOD_AC, DBCODE)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.FEATURE_MATCH_NEW
            ADD CONSTRAINT FK_FMATCH_NEW$P
            FOREIGN KEY (PROTEIN_AC)
            REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
        """)

        cur.execute("SELECT COUNT(*) FROM INTERPRO.FEATURE_MATCH_NEW")
        cnt, = cur.fetchone()
        if not cnt:
            raise RuntimeError(f"no rows inserted "
                               f"for analysis ID {database.analysis_id}")

        logger.debug("\texchanging partition")
        partition = FEATURE_MATCH_PARTITIONS[database.identifier]
        cur.execute(f"""
            ALTER TABLE INTERPRO.FEATURE_MATCH
            EXCHANGE PARTITION ({partition})
            WITH TABLE INTERPRO.FEATURE_MATCH_NEW
        """)
        oracle.drop_table(cur, "INTERPRO.FEATURE_MATCH_NEW", purge=True)

        logger.info("\tgathering statistics")
        oracle.gather_stats(cur, "INTERPRO", "FEATURE_MATCH", partition)

    for index in oracle.get_indexes(cur, "INTERPRO", "FEATURE_MATCH"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")

def delete_obsoletes(url: str, databases: Sequence[Database], **kwargs):
    step = kwargs.get("step", 10000)
    threads = kwargs.get("threads", 8)

    con = cx_Oracle.connect(url)
    cur = con.cursor()

    # Track signatures that need to be deleted
    ora.drop_table(cur, "METHOD_TO_DELETE")
    cur.execute(
        """
        CREATE TABLE INTERPRO.METHOD_TO_DELETE (
            ID NUMBER NOT NULL,
            METHOD_AC VARCHAR2(25) NOT NULL
        )
        """
    )

    for db in databases:
        cur.execute(
            """
            INSERT INTO INTERPRO.METHOD_TO_DELETE (ID, METHOD_AC)
            SELECT ROWNUM, METHOD_AC
            FROM (
                SELECT METHOD_AC
                FROM INTERPRO.METHOD
                WHERE DBCODE = :dbcode
                MINUS
                SELECT METHOD_AC
                FROM INTERPRO.METHOD_STG
                WHERE DBCODE = :dbcode
            )
            """,
            dbcode=db.identifier
        )

    con.commit()
    cur.execute(
        """
        CREATE UNIQUE INDEX UI_METHOD_TO_DELETE
        ON INTERPRO.METHOD_TO_DELETE (ID)
        """
    )

    cur.execute("SELECT COUNT(*) FROM INTERPRO.METHOD_TO_DELETE")
    stop, = cur.fetchone()
    logger.info(f"{stop:,} signatures to delete")

    if not stop:
        # Nothing to delete
        cur.close()
        con.close()
        return

    # Get tables with a FOREIGN KEY to INTERPRO.METHOD
    tables = []
    child_tables = ora.get_child_tables(cur, "INTERPRO", "METHOD")
    for table, constraint, column in child_tables:
        tables.append((table, constraint, column))

    # Add INTERPRO.METHOD as we also want to delete rows from this table
    tables.append(("METHOD", None, "METHOD_AC"))

    logger.info("disabling referential constraints")
    num_errors = 0
    for table, constraint, column in tables:
        if not constraint:
            continue

        try:
            ora.toggle_constraint(cur, table, constraint, False)
        except cx_Oracle.DatabaseError as exc:
            logger.error(exc)
            num_errors += 1

    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} constraints could not be disabled")

    tasks = []
    for table, constraint, column in tables:
        if table == "MATCH":
            for db in databases:
                partition = MATCH_PARTITIONS[db.identifier]
                logger.info(f"truncating {table} ({partition})")
                ora.truncate_partition(cur, table, partition)
        elif table == "SITE_MATCH":
            for db in databases:
                partition = SITE_PARTITIONS[db.identifier]
                logger.info(f"truncating {table} ({partition})")
                ora.truncate_partition(cur, table, partition)
        else:
            tasks.append((table, None, column))

    cur.close()
    con.close()

    with ThreadPoolExecutor(max_workers=threads) as executor:
        fs = {}

        for table, partition, column in tasks:
            args = (url, table, partition, column, step, stop)
            f = executor.submit(delete_from_table, *args)
            fs[f] = (table, partition)

        num_errors = 0
        for f in as_completed(fs):
            table, partition = fs[f]
            if partition:
                name = f"{table} ({partition})"
            else:
                name = table

            try:
                num_rows = f.result()
            except Exception as exc:
                logger.info(f"{name}: failed ({exc})")
                num_errors += 1
            else:
                logger.info(f"{name}: {num_rows:,} rows deleted")

        if num_errors:
            raise RuntimeError(f"{num_errors} tables failed")

    logger.info("enabling referential constraints")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    num_errors = 0
    constraints = set()
    for table, constraint, column in tables:
        if not constraint or constraint in constraints:
            """
            Either no constraint,
            or prevent the same constraint from being enabled several times
            """
            continue

        logger.debug(f"enabling: {table}.{constraint}")
        constraints.add(constraint)

        try:
            ora.toggle_constraint(cur, table, constraint, True)
        except cx_Oracle.DatabaseError as exc:
            logger.error(exc)
            num_errors += 1

    if num_errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{num_errors} constraints could not be enabled")

    for table, constraint, column in tables:
        for index in ora.get_indexes(cur, "INTERPRO", table):
            if index["unusable"]:
                logger.info(f"rebuilding index {index['name']}")
                ora.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")

def import_annotations(ora_url: str, pg_url: str):
    pg_con = psycopg2.connect(**url2dict(pg_url))
    with pg_con.cursor() as pg_cur:
        for name in ("protein2go", "publication", "term"):
            pg_cur.execute(f"DROP TABLE IF EXISTS {name}")

        pg_cur.execute("""
            CREATE TABLE protein2go (
                protein_acc VARCHAR(15) NOT NULL,
                term_id VARCHAR(10) NOT NULL,
                ref_db_code VARCHAR(10) NOT NULL,
                ref_db_id VARCHAR(60) NOT NULL
            )
        """)
        pg_cur.execute("""
            CREATE TABLE publication (
                id VARCHAR(25) NOT NULL
                    CONSTRAINT publication_pkey PRIMARY KEY,
                title VARCHAR(1500) NOT NULL,
                published DATE NOT NULL
            )
        """)
        pg_cur.execute("""
            CREATE TABLE term (
                id VARCHAR(10) NOT NULL
                    CONSTRAINT term_pkey PRIMARY KEY,
                name VARCHAR(200) NOT NULL,
                category VARCHAR(25) NOT NULL,
                num_constraints INTEGER NOT NULL,
                is_obsolete BOOLEAN NOT NULL,
                definition VARCHAR NOT NULL,
                replaced_id VARCHAR(10)
            )
        """)

        ora_con = cx_Oracle.connect(ora_url)
        ora_cur = ora_con.cursor()

        logger.info("populating: protein2go")
        """
        Filtering on length:
        Some annotations are not on proteins, but on post-translational
        modifications or processing events,
        e.g. P27958:PRO_0000037566 (protein: P27958; chain: PRO_0000037566).

        Protein accessions are 15 characters long (max), so anything longer
        than 15 characters cannot be an accession.
        A better (but heavier) approach would be to join with our PROTEIN
        table.
        """
        ora_cur.execute("""
            SELECT A.ENTITY_ID, A.GO_ID, A.REF_DB_CODE, A.REF_DB_ID
            FROM GO.ANNOTATIONS@GOAPRO A
            INNER JOIN GO.ECO2EVIDENCE@GOAPRO E ON A.ECO_ID = E.ECO_ID
            INNER JOIN GO.CV_SOURCES@GOAPRO S ON S.CODE = A.SOURCE
            WHERE A.ENTITY_TYPE = 'protein'
            AND LENGTH(A.ENTITY_ID) <= 15
            AND E.GO_EVIDENCE != 'IEA'
            AND S.IS_PUBLIC = 'Y'
        """)
        sql = "INSERT INTO protein2go VALUES %s"
        execute_values(pg_cur, sql, ora_cur, page_size=1000)

        logger.info("populating: publication")
        ora_cur.execute("""
            SELECT ID, TITLE, FIRST_PUBLISH_DATE
            FROM GO.PUBLICATIONS@GOAPRO
            WHERE ID IN (
                SELECT DISTINCT A.REF_DB_ID
                FROM GO.ANNOTATIONS@GOAPRO A
                INNER JOIN GO.ECO2EVIDENCE@GOAPRO E ON A.ECO_ID = E.ECO_ID
                INNER JOIN GO.CV_SOURCES@GOAPRO S ON S.CODE = A.SOURCE
                WHERE A.ENTITY_TYPE = 'protein'
                AND LENGTH(A.ENTITY_ID) <= 15
                AND E.GO_EVIDENCE != 'IEA'
                AND S.IS_PUBLIC = 'Y'
                AND A.REF_DB_CODE = 'PMID'
            )
        """)
        sql = "INSERT INTO publication VALUES %s"
        execute_values(pg_cur, sql, ora_cur, page_size=1000)

        logger.info("populating: term")
        ora_cur.execute("""
            SELECT CHILD_ID, PARENT_ID
            FROM GO.ANCESTORS@GOAPRO
            WHERE CHILD_ID != PARENT_ID
        """)
        ancestors = {}
        for term_id, parent_id in ora_cur:
            try:
                ancestors[term_id].add(parent_id)
            except KeyError:
                ancestors[term_id] = {parent_id}

        ora_cur.execute("""
            SELECT DISTINCT GO_ID, CONSTRAINT_ID
            FROM GO.TERM_TAXON_CONSTRAINTS@GOAPRO
        """)
        constraints = {}
        for term_id, constraint_id in ora_cur:
            try:
                constraints[term_id].add(constraint_id)
            except KeyError:
                constraints[term_id] = {constraint_id}

        ora_cur.execute("""
            SELECT T.GO_ID, T.NAME, C.TERM_NAME, T.IS_OBSOLETE,
                   D.DEFINITION, NULL
            FROM GO.TERMS@GOAPRO T
            INNER JOIN GO.DEFINITIONS@GOAPRO D ON T.GO_ID = D.GO_ID
            INNER JOIN GO.CV_CATEGORIES@GOAPRO C ON T.CATEGORY = C.CODE
            UNION ALL
            SELECT S.SECONDARY_ID, T.NAME, C.TERM_NAME, T.IS_OBSOLETE,
                   D.DEFINITION, T.GO_ID
            FROM GO.SECONDARIES@GOAPRO S
            INNER JOIN GO.TERMS@GOAPRO T ON S.GO_ID = T.GO_ID
            INNER JOIN GO.CV_CATEGORIES@GOAPRO C ON T.CATEGORY = C.CODE
            INNER JOIN GO.DEFINITIONS@GOAPRO D ON T.GO_ID = D.GO_ID
        """)
        sql = "INSERT INTO term VALUES %s"
        execute_values(
            pg_cur, sql,
            ((row[0], row[1], row[2],
              len(_get_constraints(row[0], ancestors, constraints)),
              row[3] == 'Y', row[4], row[5])
             for row in ora_cur),
            page_size=1000
        )

        ora_cur.close()
        ora_con.close()

        pg_cur.execute("""
            CREATE INDEX protein2go_protein_idx
            ON protein2go (protein_acc)
        """)
        pg_cur.execute("""
            CREATE INDEX protein2go_term_idx
            ON protein2go (term_id)
        """)
        pg_cur.execute("ANALYZE publication")
        pg_cur.execute("ANALYZE term")

    pg_con.commit()
    pg_con.close()
    logger.info("complete")

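# `_get_constraints` is used above but defined elsewhere. Given the two
# mappings built from GO.ANCESTORS (assumed to store the transitive closure
# of the term hierarchy) and GO.TERM_TAXON_CONSTRAINTS, it is assumed to
# return the taxon-constraint IDs applying to a term directly or through any
# of its ancestors. A minimal sketch under those assumptions:
def _example_get_constraints(term_id: str, ancestors: dict,
                             constraints: dict) -> set:
    # Constraints directly attached to the term
    result = set(constraints.get(term_id, set()))
    # Constraints inherited from ancestor terms
    for parent_id in ancestors.get(term_id, set()):
        result |= constraints.get(parent_id, set())
    return result
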
def update_hmm_clans(url: str, dbkey: str, hmmdb: str, **kwargs):
    clan_source = kwargs.get("source")
    threads = kwargs.get("threads")
    tmpdir = kwargs.get("tmpdir")

    dbcode = DATABASES[dbkey]

    logger.info("deleting old clans")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("DELETE FROM INTERPRO.CLAN WHERE DBCODE = :1", (dbcode,))
    con.commit()
    cur.close()
    con.close()

    logger.info("loading new clans")
    if dbkey == "panther":
        clans = contrib.panther.get_clans(url)

        def getsubdir(x):
            return x[:7]
    elif dbkey == "pfam":
        clans = contrib.pfam.get_clans(clan_source)

        def getsubdir(x):
            return x[:5]
    elif dbkey == "pirsf":
        clans = contrib.pirsf.get_clans(clan_source)

        def getsubdir(x):
            return x[:8]
    else:
        raise NotImplementedError()

    clans_to_insert = {}
    mem2clan = {}
    for c in clans:
        clans_to_insert[c.accession] = c
        for m in c.members:
            mem2clan[m["accession"]] = (c.accession, m["score"])

    workdir = mkdtemp(dir=tmpdir)
    num_duplicates = 0
    with futures.ThreadPoolExecutor(max_workers=threads) as executor:
        logger.info("emitting consensus sequences")
        fs = {}
        models = set()
        for model_acc, hmm in iter_models(hmmdb):
            if model_acc not in mem2clan:
                # Ignore models not belonging to a clan
                continue
            elif model_acc in models:
                num_duplicates += 1
                continue

            subdir = os.path.join(workdir, getsubdir(model_acc))
            try:
                os.mkdir(subdir)
            except FileExistsError:
                pass

            prefix = os.path.join(subdir, model_acc)
            hmmfile = prefix + HMM_SUFFIX
            with open(hmmfile, "wt") as fh:
                fh.write(hmm)

            seqfile = prefix + SEQ_SUFFIX
            f = executor.submit(hmmemit, hmmfile, seqfile)
            fs[f] = model_acc
            models.add(model_acc)

        done, not_done = futures.wait(fs)
        if not_done:
            shutil.rmtree(workdir)
            raise RuntimeError(f"{len(not_done)} error(s)")
        elif num_duplicates:
            shutil.rmtree(workdir)
            raise RuntimeError(f"HMM database {hmmdb} contains "
                               f"{num_duplicates} duplicated models.")

        logger.info("searching consensus sequences")
        fs = {}
        for model_acc in models:
            prefix = os.path.join(workdir, getsubdir(model_acc), model_acc)
            seqfile = prefix + SEQ_SUFFIX
            outfile = prefix + OUT_SUFFIX
            domfile = prefix + DOM_SUFFIX

            f = executor.submit(hmmscan, hmmdb, seqfile, domfile, outfile)
            fs[f] = model_acc

        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur2 = con.cursor()
        cur2.setinputsizes(25, 25, cx_Oracle.DB_TYPE_BINARY_DOUBLE,
                           cx_Oracle.DB_TYPE_CLOB)

        clan_sql = "INSERT INTO INTERPRO.CLAN VALUES (:1, :2, :3, :4)"
        memb_sql = "INSERT INTO INTERPRO.CLAN_MEMBER VALUES (:1, :2, :3, :4)"
        mtch_sql = "INSERT INTO INTERPRO.CLAN_MATCH VALUES (:1, :2, :3, :4)"

        completed = errors = progress = 0
        for f in futures.as_completed(fs):
            model_acc = fs[f]
            completed += 1

            if not f.result():
                logger.error(f"{model_acc}")
                errors += 1
                continue

            prefix = os.path.join(workdir, getsubdir(model_acc), model_acc)
            outfile = prefix + OUT_SUFFIX
            domfile = prefix + DOM_SUFFIX

            clan_acc, score = mem2clan[model_acc]
            sequence = load_sequence(prefix + SEQ_SUFFIX)

            try:
                clan = clans_to_insert.pop(clan_acc)
            except KeyError:
                # Clan already inserted
                pass
            else:
                cur.execute(clan_sql, (clan.accession, dbcode, clan.name,
                                       clan.description))

            cur.execute(memb_sql, (clan_acc, model_acc, len(sequence), score))

            matches = []
            for target in load_hmmscan_results(outfile, domfile):
                if target["accession"] == model_acc:
                    continue

                domains = []
                for dom in target["domains"]:
                    domains.append((dom["coordinates"]["ali"]["start"],
                                    dom["coordinates"]["ali"]["end"]))

                matches.append((model_acc, target["accession"],
                                target["evalue"], json.dumps(domains)))

            if matches:
                cur2.executemany(mtch_sql, matches)

            pc = completed * 100 // len(fs)
            if pc > progress:
                progress = pc
                logger.debug(f"{progress:>10}%")

        con.commit()
        cur.close()
        cur2.close()
        con.close()

    size = calc_dir_size(workdir)
    logger.info(f"disk usage: {size / 1024 ** 2:,.0f} MB")
    shutil.rmtree(workdir)

    if errors:
        raise RuntimeError(f"{errors} error(s)")

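# `hmmemit` and `hmmscan` (defined elsewhere) wrap the HMMER3 binaries of the
# same names and return a boolean indicating success. Minimal sketches,
# assuming HMMER3 is on PATH; the project's wrappers may add options or
# error handling:
def _example_hmmemit(hmmfile: str, seqfile: str) -> bool:
    import subprocess

    # -c: emit a single majority-rule consensus sequence
    p = subprocess.run(["hmmemit", "-c", "-o", seqfile, hmmfile],
                       stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return p.returncode == 0


def _example_hmmscan(hmmdb: str, seqfile: str, domfile: str,
                     outfile: str) -> bool:
    import subprocess

    # --domtblout: per-domain table, used to recover alignment coordinates
    p = subprocess.run(["hmmscan", "--domtblout", domfile, "-o", outfile,
                        hmmdb, seqfile],
                       stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return p.returncode == 0
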
def _prepare_matches(con: cx_Oracle.Connection):
    """
    Import protein matches in a staging table

    :param con: Oracle connection object
    """
    cur = con.cursor()

    logger.info("populating MATCH_NEW")
    oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)
    cur.execute("""
        CREATE TABLE INTERPRO.MATCH_NEW NOLOGGING
        AS SELECT * FROM INTERPRO.MATCH WHERE 1 = 0
    """)
    cur.execute("""
        INSERT /*+ APPEND */ INTO INTERPRO.MATCH_NEW
        SELECT P.PROTEIN_AC, M.METHOD_AC, M.SEQ_START, M.SEQ_END, 'T',
               D.DBCODE, D.EVIDENCE, SYSDATE, SYSDATE, SYSDATE, 'INTERPRO',
               M.EVALUE, M.MODEL_AC, M.FRAGMENTS
        FROM INTERPRO.PROTEIN_TO_SCAN P
        INNER JOIN IPRSCAN.MV_IPRSCAN M
            ON P.UPI = M.UPI
        INNER JOIN INTERPRO.IPRSCAN2DBCODE D
            ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
        -- Exclude MobiDB-Lite, Phobius, SignalP (Euk, Gram+, Gram-),
        -- TMHMM, COILS
        WHERE D.DBCODE NOT IN ('g', 'j', 'n', 's', 'v', 'q', 'x')
        AND M.SEQ_START != M.SEQ_END
    """)
    con.commit()

    logger.info("indexing")
    for col in ("DBCODE", "PROTEIN_AC"):
        cur.execute(f"""
            CREATE INDEX I_MATCH_NEW${col}
            ON INTERPRO.MATCH_NEW ({col})
            TABLESPACE INTERPRO_IND
            NOLOGGING
        """)

    # logger.info("gathering statistics")
    # oracle.gather_stats(cur, "INTERPRO", "MATCH_NEW")

    logger.info("deleting SUPERFAMILY duplicated matches")
    cur.execute("""
        DELETE FROM INTERPRO.MATCH_NEW M1
        WHERE EXISTS(
            SELECT 1
            FROM INTERPRO.MATCH_NEW M2
            WHERE M2.DBCODE = 'Y'
            AND M1.PROTEIN_AC = M2.PROTEIN_AC
            AND M1.METHOD_AC = M2.METHOD_AC
            AND M1.POS_FROM = M2.POS_FROM
            AND M1.POS_TO = M2.POS_TO
            AND M1.SCORE > M2.SCORE
        )
    """)
    logger.info(f"{cur.rowcount} SUPERFAMILY matches deleted")
    con.commit()
    cur.close()

def update_database_matches(url: str, databases: Sequence[Database]):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    for database in databases:
        logger.info(database.name)
        oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)
        logger.debug(f"\tpopulating MATCH_NEW "
                     f"(ANALYSIS_ID: {database.analysis_id})")
        cur.execute("""
            CREATE TABLE INTERPRO.MATCH_NEW NOLOGGING
            AS SELECT * FROM INTERPRO.MATCH WHERE 1 = 0
        """)
        cur.execute(
            """
            INSERT /*+ APPEND */ INTO INTERPRO.MATCH_NEW
            SELECT X.AC, M.METHOD_AC, M.SEQ_START, M.SEQ_END, 'T', D.DBCODE,
                   D.EVIDENCE, SYSDATE, SYSDATE, SYSDATE, 'INTERPRO',
                   M.EVALUE, M.MODEL_AC, M.FRAGMENTS
            FROM IPRSCAN.MV_IPRSCAN M
            INNER JOIN UNIPARC.XREF X
                ON M.UPI = X.UPI
            INNER JOIN INTERPRO.IPRSCAN2DBCODE D
                ON M.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
            WHERE M.ANALYSIS_ID = :1
            AND M.SEQ_START != M.SEQ_END
            AND X.DBID IN (2, 3)   -- Swiss-Prot or TrEMBL
            AND X.DELETED = 'N'
            """,
            (database.analysis_id,)
        )
        con.commit()

        # Add constraints/indexes to be able to exchange partition
        logger.debug("\tcreating indexes and constraints")
        for col in ("DBCODE", "EVIDENCE", "STATUS", "METHOD_AC"):
            cur.execute(f"""
                CREATE INDEX MATCH_NEW${col[0]}
                ON INTERPRO.MATCH_NEW ({col})
                TABLESPACE INTERPRO_IND
                NOLOGGING
            """)

        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$FROM
            CHECK (POS_FROM >= 1)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$NEG
            CHECK (POS_TO - POS_FROM > 0)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$STATUS
            CHECK (STATUS != 'N' OR (STATUS = 'N' AND DBCODE IN ('P', 'M', 'Q')))
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT PK_MATCH_NEW
            PRIMARY KEY (PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$DBCODE
            FOREIGN KEY (DBCODE)
            REFERENCES INTERPRO.CV_DATABASE (DBCODE)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$EVI
            FOREIGN KEY (EVIDENCE)
            REFERENCES INTERPRO.CV_EVIDENCE (CODE)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$METHOD
            FOREIGN KEY (METHOD_AC)
            REFERENCES INTERPRO.METHOD (METHOD_AC)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$PROTEIN
            FOREIGN KEY (PROTEIN_AC)
            REFERENCES INTERPRO.PROTEIN (PROTEIN_AC)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT FK_MATCH_NEW$STATUS
            FOREIGN KEY (STATUS)
            REFERENCES INTERPRO.CV_STATUS (CODE)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$PROTEIN
            CHECK (PROTEIN_AC IS NOT NULL)
        """)
        cur.execute("""
            ALTER TABLE INTERPRO.MATCH_NEW
            ADD CONSTRAINT CK_MATCH_NEW$METHOD
            CHECK (METHOD_AC IS NOT NULL)
        """)

        cur.execute("SELECT COUNT(*) FROM INTERPRO.MATCH_NEW")
        cnt, = cur.fetchone()
        if not cnt:
            raise RuntimeError(f"no rows inserted "
                               f"for analysis ID {database.analysis_id}")

        logger.debug("\texchanging partition")
        partition = MATCH_PARTITIONS[database.identifier]
        cur.execute(f"""
            ALTER TABLE INTERPRO.MATCH
            EXCHANGE PARTITION ({partition})
            WITH TABLE INTERPRO.MATCH_NEW
        """)
        oracle.drop_table(cur, "INTERPRO.MATCH_NEW", purge=True)

        logger.info("\tgathering statistics")
        oracle.gather_stats(cur, "INTERPRO", "MATCH", partition)

    for index in oracle.get_indexes(cur, "INTERPRO", "MATCH"):
        if index["unusable"]:
            logger.info(f"rebuilding index {index['name']}")
            oracle.rebuild_index(cur, index["name"])

    cur.close()
    con.close()
    logger.info("complete")

def update_site_matches(url: str):
    """
    Add protein site matches for recently added/modified sequences

    :param url: Oracle connection string
    """
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    logger.info("populating SITE_MATCH_NEW")
    oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)
    cur.execute("""
        CREATE TABLE INTERPRO.SITE_MATCH_NEW NOLOGGING
        AS
        SELECT P.PROTEIN_AC, S.METHOD_AC, D.DBCODE, S.LOC_START, S.LOC_END,
               S.DESCRIPTION, S.RESIDUE, S.RESIDUE_START, S.RESIDUE_END,
               S.NUM_SITES
        FROM INTERPRO.PROTEIN_TO_SCAN P
        INNER JOIN IPRSCAN.SITE S
            ON P.UPI = S.UPI
        INNER JOIN INTERPRO.IPRSCAN2DBCODE D
            ON S.ANALYSIS_ID = D.IPRSCAN_SIG_LIB_REL_ID
    """)

    logger.info("indexing")
    cur.execute("""
        CREATE INDEX I_SITE_NEW
        ON INTERPRO.SITE_MATCH_NEW (PROTEIN_AC, METHOD_AC, LOC_START, LOC_END)
        TABLESPACE INTERPRO_IND
        NOLOGGING
    """)

    logger.info("gathering statistics")
    oracle.gather_stats(cur, "INTERPRO", "SITE_MATCH_NEW")

    logger.info("checking")
    params = []
    queries = []
    for identifier, (_, ck_matches) in SITE_PARTITIONS.items():
        if ck_matches:
            partition = MATCH_PARTITIONS[identifier]
            params.append(identifier)
            queries.append(f"""
                SELECT DISTINCT PROTEIN_AC, METHOD_AC, POS_FROM, POS_TO
                FROM INTERPRO.MATCH PARTITION ({partition})
            """)

    if queries:
        in_cond = [f":{i+1}" for i in range(len(params))]
        cur.execute(
            f"""
            SELECT COUNT(*)
            FROM (
                SELECT DISTINCT PROTEIN_AC, METHOD_AC, LOC_START, LOC_END
                FROM INTERPRO.SITE_MATCH_NEW
                WHERE DBCODE IN ({', '.join(in_cond)})
                MINUS (
                    {' UNION ALL '.join(queries)}
                )
            )
            """,
            params
        )

        cnt, = cur.fetchone()
        if cnt:
            cur.close()
            con.close()
            raise RuntimeError(f"{cnt} matches in SITE_MATCH_NEW "
                               f"that are not in MATCH")

    logger.info("updating SITE_MATCH")
    cur.execute("""
        DELETE FROM INTERPRO.SITE_MATCH
        WHERE PROTEIN_AC IN (
            SELECT PROTEIN_AC
            FROM INTERPRO.PROTEIN_TO_SCAN
        )
    """)
    logger.info(f"{cur.rowcount} rows deleted")

    cur.execute("""
        INSERT INTO INTERPRO.SITE_MATCH
        SELECT * FROM INTERPRO.SITE_MATCH_NEW
    """)
    logger.info(f"{cur.rowcount} rows inserted")
    con.commit()

    oracle.drop_table(cur, "INTERPRO.SITE_MATCH_NEW", purge=True)
    cur.close()
    con.close()

def add_staging(url: str, update: Sequence[Tuple[Database, str]]):
    con = cx_Oracle.connect(url)
    cur = con.cursor()

    ora.drop_table(cur, "METHOD_STG", purge=True)
    cur.execute(
        """
        CREATE TABLE INTERPRO.METHOD_STG (
            METHOD_AC VARCHAR2(25) NOT NULL,
            NAME VARCHAR2(100),
            DBCODE CHAR(1) NOT NULL,
            DESCRIPTION VARCHAR2(400),
            SIG_TYPE CHAR(1) NOT NULL,
            ABSTRACT VARCHAR2(4000),
            ABSTRACT_LONG CLOB
        )
        """
    )

    sql = """
        INSERT INTO INTERPRO.METHOD_STG
        VALUES (:1, :2, :3, :4, :5, :6, :7)
    """
    with Table(con, sql) as table:
        errors = 0
        for db, src in update:
            if db.identifier == 'f':
                # FunFams
                signatures = contrib.cath.parse_functional_families(src)
            elif db.identifier == 'H':
                # Pfam
                signatures = contrib.pfam.get_signatures(src)
            elif db.identifier == 'J':
                # CDD
                signatures = contrib.cdd.parse_signatures(src)
            elif db.identifier == 'M':
                # PROSITE profiles
                signatures = contrib.prosite.parse_profiles(src)
            elif db.identifier == 'P':
                # PROSITE patterns
                signatures = contrib.prosite.parse_patterns(src)
            elif db.identifier == 'Q':
                # HAMAP
                signatures = contrib.hamap.parse_signatures(src)
            elif db.identifier == 'V':
                # PANTHER
                signatures = contrib.panther.parse_signatures(src)
            elif db.identifier == 'X':
                # CATH-Gene3D
                signatures = contrib.cath.parse_superfamilies(src)
            else:
                logger.error(f"{db.name}: unsupported member database")
                errors += 1
                continue

            for m in signatures:
                if m.abstract is None:
                    abstract = abstract_long = None
                elif len(m.abstract) <= 4000:
                    abstract = m.abstract
                    abstract_long = None
                else:
                    abstract = None
                    abstract_long = m.abstract

                table.insert((
                    m.accession,
                    m.name or m.accession,
                    db.identifier,
                    m.description,
                    m.sig_type,
                    abstract,
                    abstract_long
                ))

    if errors:
        cur.close()
        con.close()
        raise RuntimeError(f"{errors} errors occurred")

    con.commit()

    code2name = {db.identifier: db.name for db, _ in update}
    cur.execute(
        """
        SELECT DBCODE, COUNT(*)
        FROM INTERPRO.METHOD_STG
        GROUP BY DBCODE
        """
    )
    for dbcode, cnt in cur:
        logger.info(f"{code2name[dbcode]:<30} {cnt:>10,} signatures")

    cur.close()
    con.close()

def track_changes(url: str, swissp: str, trembl: str, version: str, date: str,
                  data_dir: str, tmpdir: Optional[str] = None):
    workdir = mkdtemp(dir=tmpdir)

    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("SELECT VERSION FROM INTERPRO.DB_VERSION WHERE DBCODE = 'u'")
    old_version, = cur.fetchone()

    logger.info(f"exporting protein counts per entry (UniProt {old_version})")
    export_entries_protein_counts(cur, data_dir)
    cur.close()
    con.close()

    logger.info(f"dumping UniProt {old_version} proteins")
    files = export_proteins(url, workdir)

    logger.info(f"loading UniProt {version} proteins")
    fd, database = mkstemp(dir=workdir)
    os.close(fd)
    os.remove(database)
    con = sqlite3.connect(database)
    con.execute("""
        CREATE TABLE protein (
            accession TEXT NOT NULL PRIMARY KEY,
            identifier TEXT NOT NULL,
            is_reviewed INTEGER NOT NULL,
            crc64 TEXT NOT NULL,
            length INTEGER NOT NULL,
            is_fragment INTEGER NOT NULL,
            taxon_id INTEGER NOT NULL
        )
    """)
    con.close()
    sprot.load(swissp, database, "protein")
    sprot.load(trembl, database, "protein")
    size = os.path.getsize(database) + sum(map(os.path.getsize, files))

    logger.info("tracking changes")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    ora.truncate_table(cur, "INTERPRO.PROTEIN_CHANGES")
    ora.truncate_table(cur, "INTERPRO.PROTEIN_TO_DELETE")
    cur.close()

    # New proteins
    sql = """
        INSERT INTO INTERPRO.PROTEIN
        VALUES (:acc, :identifer, :dbcode, :crc64, :length, SYSDATE, USER,
                :fragment, 'N', :taxid)
    """
    new_proteins = Table(con, sql)

    # Annotation/sequence changes
    sql = """
        UPDATE INTERPRO.PROTEIN
        SET NAME = :identifer, DBCODE = :dbcode, CRC64 = :crc64,
            LEN = :length, TIMESTAMP = SYSDATE, USERSTAMP = USER,
            FRAGMENT = :fragment, TAX_ID = :taxid
        WHERE PROTEIN_AC = :acc
    """
    existing_proteins = Table(con, sql)

    # Obsolete proteins
    sql = "INSERT INTO INTERPRO.PROTEIN_TO_DELETE VALUES (:1, :2)"
    obsolete_proteins = Table(con, sql)

    # Proteins to track
    sql = "INSERT INTO INTERPRO.PROTEIN_CHANGES VALUES (:1)"
    track_proteins = Table(con, sql)

    old_reviewed = old_unreviewed = 0
    new_reviewed = new_unreviewed = 0
    min_acc = max_acc = None

    con2 = sqlite3.connect(database)
    cur2 = con2.cursor()
    for file in files:
        with open(file, "rb") as fh:
            old_proteins = pickle.load(fh)

        os.remove(file)

        start = min(old_proteins)
        stop = max(old_proteins)

        if min_acc is None:
            min_acc = start

        max_acc = stop

        cur2.execute(
            """
            SELECT *
            FROM protein
            WHERE accession BETWEEN ? AND ?
            """,
            (start, stop)
        )

        for row in cur2:
            new_seq = Sequence(*row)
            if new_seq.is_reviewed:
                new_reviewed += 1
            else:
                new_unreviewed += 1

            try:
                old_seq = old_proteins.pop(new_seq.accession)
            except KeyError:
                new_proteins.insert(new_seq.asdict())
                track_proteins.insert((new_seq.accession,))
                continue

            if old_seq.is_reviewed:
                old_reviewed += 1
            else:
                old_unreviewed += 1

            if new_seq.crc64 != old_seq.crc64:
                # Sequence update
                existing_proteins.update(new_seq.asdict())
                # Track the protein (sequence change -> match changes)
                track_proteins.insert((new_seq.accession,))
            elif new_seq.annotation != old_seq.annotation:
                # Annotation update
                existing_proteins.update(new_seq.asdict())

        for old_seq in old_proteins.values():
            obsolete_proteins.insert((obsolete_proteins.count + 1,
                                      old_seq.accession))

            if old_seq.is_reviewed:
                old_reviewed += 1
            else:
                old_unreviewed += 1

    """
    If there is a new protein with an accession lower than the lowest
    accession of the old proteins, or with an accession greater than
    the greatest accession of the old proteins,
    it has not been considered until now
    """
    cur2.execute(
        """
        SELECT *
        FROM protein
        WHERE accession < ? OR accession > ?
        """,
        (min_acc, max_acc)
    )
    for row in cur2:
        new_seq = Sequence(*row)
        if new_seq.is_reviewed:
            new_reviewed += 1
        else:
            new_unreviewed += 1

        new_proteins.insert(new_seq.asdict())
        track_proteins.insert((new_seq.accession,))

    cur2.close()
    con2.close()
    os.remove(database)
    os.rmdir(workdir)

    new_proteins.close()
    existing_proteins.close()
    obsolete_proteins.close()
    track_proteins.close()

    cur = con.cursor()
    cur.executemany(
        """
        UPDATE INTERPRO.DB_VERSION
        SET VERSION = :1,
            ENTRY_COUNT = :2,
            FILE_DATE = TO_DATE(:3, 'DD-Mon-YYYY'),
            LOAD_DATE = SYSDATE
        WHERE DBCODE = :4
        """,
        [
            (version, new_reviewed, date, 'S'),
            (version, new_unreviewed, date, 'T'),
            (version, new_reviewed + new_unreviewed, date, 'u')
        ]
    )
    con.commit()

    ora.gather_stats(cur, "INTERPRO", "PROTEIN_CHANGES")
    ora.gather_stats(cur, "INTERPRO", "PROTEIN_TO_DELETE")
    cur.close()
    con.close()

    logger.info(f"Reviewed (before):   {old_reviewed:>12}")
    logger.info(f"Unreviewed (before): {old_unreviewed:>12}")
    logger.info(f"Reviewed (now):      {new_reviewed:>12}")
    logger.info(f"Unreviewed (now):    {new_unreviewed:>12}")
    logger.info(f"New proteins:        {new_proteins.count:>12}")
    logger.info(f"Updated proteins:    {existing_proteins.count:>12}")
    logger.info(f"Obsolete sequences:  {obsolete_proteins.count:>12}")
    logger.info(f"disk usage: {size / 1024 ** 2:.0f} MB")

def update_cdd_clans(url: str, cddmasters: str, cddid: str, fam2supfam: str,
                     **kwargs):
    threads = kwargs.get("threads")
    tmpdir = kwargs.get("tmpdir")

    dbcode = DATABASES["cdd"]

    logger.info("deleting old clans")
    con = cx_Oracle.connect(url)
    cur = con.cursor()
    cur.execute("DELETE FROM INTERPRO.CLAN WHERE DBCODE = :1", (dbcode,))
    con.commit()
    cur.close()
    con.close()

    clans = contrib.cdd.get_clans(cddid, fam2supfam)
    clans_to_insert = {}
    mem2clan = {}
    for c in clans:
        clans_to_insert[c.accession] = c
        for m in c.members:
            mem2clan[m["accession"]] = (c.accession, m["score"])

    logger.info("parsing representative sequences")
    workdir = mkdtemp(dir=tmpdir)
    fd, files_list = mkstemp(dir=workdir)

    id2acc = {}
    seqfiles = {}
    with open(fd, "wt") as fh:
        for model_id, model_acc, sequence in iter_sequences(cddmasters):
            if model_acc not in mem2clan or model_acc in seqfiles:
                continue

            subdir = os.path.join(workdir, model_acc[:5])
            try:
                os.mkdir(subdir)
            except FileExistsError:
                pass

            prefix = os.path.join(subdir, model_acc)
            seqfile = prefix + SEQ_SUFFIX
            with open(seqfile, "wt") as fh2:
                fh2.write(sequence)

            fh.write(f"{seqfile}\n")

            seqfiles[model_acc] = prefix
            id2acc[model_id] = model_acc

    logger.info("building profile database")
    fd, database = mkstemp(dir=workdir)
    os.close(fd)
    os.remove(database)
    sp.run(["mk_compass_db", "-i", files_list, "-o", database],
           stderr=sp.DEVNULL, stdout=sp.DEVNULL, check=True)

    with futures.ThreadPoolExecutor(max_workers=threads) as executor:
        logger.info("querying sequences")
        fs = {}
        for model_acc, prefix in seqfiles.items():
            seqfile = prefix + SEQ_SUFFIX
            outfile = prefix + OUT_SUFFIX
            f = executor.submit(compass_vs_db, seqfile, database, outfile)
            fs[f] = (model_acc, prefix)

        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur2 = con.cursor()
        cur2.setinputsizes(25, 25, cx_Oracle.DB_TYPE_BINARY_DOUBLE,
                           cx_Oracle.DB_TYPE_CLOB)

        clan_sql = "INSERT INTO INTERPRO.CLAN VALUES (:1, :2, :3, :4)"
        memb_sql = "INSERT INTO INTERPRO.CLAN_MEMBER VALUES (:1, :2, :3, :4)"
        mtch_sql = "INSERT INTO INTERPRO.CLAN_MATCH VALUES (:1, :2, :3, :4)"

        completed = errors = progress = 0
        for f in futures.as_completed(fs):
            model_acc, prefix = fs[f]
            completed += 1

            if not f.result():
                logger.error(f"{model_acc}")
                errors += 1
                continue

            clan_acc, score = mem2clan[model_acc]
            sequence = load_sequence(prefix + SEQ_SUFFIX)

            try:
                clan = clans_to_insert.pop(clan_acc)
            except KeyError:
                # Clan already inserted
                pass
            else:
                cur.execute(clan_sql, (clan.accession, dbcode, clan.name,
                                       clan.description))

            cur.execute(memb_sql, (clan_acc, model_acc, len(sequence), score))

            matches = []
            for target in load_compass_results(prefix + OUT_SUFFIX):
                target_acc = id2acc[target["id"]]
                if target_acc == model_acc:
                    continue

                matches.append((model_acc, target_acc, target["evalue"],
                                json.dumps([(target["start"],
                                             target["end"])])))

            if matches:
                cur2.executemany(mtch_sql, matches)

            pc = completed * 100 // len(fs)
            if pc > progress:
                progress = pc
                logger.debug(f"{progress:>10}%")

        con.commit()
        cur.close()
        cur2.close()
        con.close()

    size = calc_dir_size(workdir)
    logger.info(f"disk usage: {size / 1024 ** 2:,.0f} MB")
    shutil.rmtree(workdir)

    if errors:
        raise RuntimeError(f"{errors} error(s)")

def import_taxonomy(ora_url: str, pg_url: str):
    logger.info("loading taxonomy info")
    ora_con = cx_Oracle.connect(ora_url)
    ora_cur = ora_con.cursor()
    ora_cur.execute("""
        SELECT TAX_ID, SCIENTIFIC_NAME, RANK, LEFT_NUMBER, RIGHT_NUMBER,
               PARENT_ID
        FROM INTERPRO.ETAXI
    """)

    taxa = {}
    for tax_id, name, rank, left_num, right_num, parent_id in ora_cur:
        if tax_id in (1, 131567):
            """
            Skip root and meta-superkingdom (131567), which contains:
              * Bacteria (2)
              * Archaea (2157)
              * Eukaryota (2759)
            """
            continue
        elif parent_id in (1, 131567):
            rank = "superkingdom"
            parent_id = None

        taxa[tax_id] = (name, rank, left_num, right_num, parent_id)

    ora_cur.close()
    ora_con.close()

    pg_con = psycopg2.connect(**url2dict(pg_url))
    with pg_con.cursor() as pg_cur:
        pg_cur.execute("DROP TABLE IF EXISTS taxon")
        pg_cur.execute("DROP TABLE IF EXISTS lineage")
        pg_cur.execute("""
            CREATE TABLE taxon (
                id INTEGER NOT NULL
                    CONSTRAINT taxon_id_pkey PRIMARY KEY,
                name VARCHAR(255) NOT NULL,
                rank VARCHAR(50) NOT NULL,
                left_number INTEGER NOT NULL,
                right_number INTEGER NOT NULL,
                parent_id INTEGER,
                lineage TEXT NOT NULL
            )
        """)
        pg_cur.execute("""
            CREATE TABLE lineage (
                child_id INTEGER NOT NULL,
                parent_id INTEGER NOT NULL,
                parent_rank VARCHAR(255) NOT NULL
            )
        """)

        logger.info("populating: taxon")
        execute_values(
            pg_cur, "INSERT INTO taxon VALUES %s",
            ((tax_id, name, rank, left_num, right_num, parent_id,
              json.dumps(get_lineage(taxa, tax_id)))
             for tax_id, (name, rank, left_num, right_num,
                          parent_id) in taxa.items()),
            page_size=1000
        )
        pg_cur.execute("""
            CREATE INDEX taxon_left_number_idx
            ON taxon (left_number)
        """)

        logger.info("populating: lineage")
        execute_values(pg_cur, "INSERT INTO lineage VALUES %s",
                       iter_lineage(taxa), page_size=1000)
        pg_cur.execute("""
            CREATE UNIQUE INDEX lineage_child_parent_uidx
            ON lineage (child_id, parent_id)
        """)
        pg_cur.execute("""
            CREATE INDEX lineage_child_rank_idx
            ON lineage (child_id, parent_rank)
        """)

    pg_con.commit()
    pg_con.close()
    logger.info("complete")

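# `get_lineage` and `iter_lineage` are defined elsewhere. Both can be derived
# from the `taxa` mapping built above, where each value is
# (name, rank, left_number, right_number, parent_id). Minimal sketches,
# assuming lineages are ordered from the superkingdom down to the taxon
# itself:
def _example_get_lineage(taxa: dict, tax_id: int) -> list:
    # Walk up the parent_id chain (parent_id is None for superkingdoms),
    # then reverse to get a root-to-leaf lineage
    lineage = []
    while tax_id is not None:
        lineage.append(tax_id)
        tax_id = taxa[tax_id][4]  # parent_id
    return lineage[::-1]


def _example_iter_lineage(taxa: dict):
    # Yield (child_id, parent_id, parent_rank) for every ancestor of each
    # taxon, matching the columns of the `lineage` table
    for tax_id in taxa:
        for parent_id in _example_get_lineage(taxa, tax_id)[:-1]:
            parent_rank = taxa[parent_id][1]
            yield tax_id, parent_id, parent_rank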