def export_sequences(url: str, keyfile: str, output: str,
                     processes: int = 1, tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute(
            """
            SELECT /*+ PARALLEL */ UX.AC, UP.SEQ_SHORT, UP.SEQ_LONG
            FROM UNIPARC.XREF UX
            INNER JOIN UNIPARC.PROTEIN UP ON UX.UPI = UP.UPI
            WHERE UX.DBID IN (2, 3)
              AND UX.DELETED = 'N'
            """
        )

        i = 0
        for row in cur:
            store[row[0]] = row[2].read() if row[2] is not None else row[1]

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
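# Hypothetical invocation, for reference only: the connection string and paths
# below are placeholders, not values from this project. All export_* functions
# in this module share this signature.
#
#   export_sequences(
#       url="user/password@db-host:1521/service",  # Oracle DSN accepted by cx_Oracle.connect
#       keyfile="/scratch/export/keys.dat",        # keys file consumed by Store.load_keys
#       output="/scratch/export/sequences.st",
#       processes=4,
#       tmpdir="/scratch/tmp",
#   )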
def export_go(url: str, keyfile: str, output: str,
              processes: int = 1, tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute(
            """
            SELECT CODE, SORT_ORDER, TERM_NAME
            FROM GO.CV_CATEGORIES@GOAPRO
            """
        )
        categories = {row[0]: row[1:] for row in cur}

        cur.execute(
            """
            SELECT E.ACCESSION, D.PRIMARY_ID, D.SECONDARY_ID, D.NOTE
            FROM SPTR.DBENTRY@SWPREAD E
            INNER JOIN SPTR.DBENTRY_2_DATABASE@SWPREAD D
              ON E.DBENTRY_ID = D.DBENTRY_ID
            WHERE E.ENTRY_TYPE IN (0, 1)        -- Swiss-Prot and TrEMBL
              AND E.MERGE_STATUS != 'R'         -- not 'Redundant'
              AND E.DELETED = 'N'               -- not deleted
              AND E.FIRST_PUBLIC IS NOT NULL    -- published
              AND D.DATABASE_ID = 'GO'          -- GO annotation
            """
        )

        i = 0
        for accession, go_id, sec_id, note in cur:
            """
            sec_id -> cat_code:term_name, e.g. C:integral component of membrane
            note   -> go_evidence:source, e.g. IEA:InterPro
            """
            cat_code, term_name = sec_id.split(':', 1)
            cat_order, cat_name = categories[cat_code]
            store.append(accession,
                         (cat_order, go_id, term_name, cat_code, cat_name))

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(fn=_post_go, processes=processes)
        logger.info(f"temporary files: {size / 1024 / 1024:.0f} MB")
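# A minimal sketch of the kind of callback passed to Store.merge(fn=...) above.
# This is NOT the real _post_go: it assumes the callback receives the list of
# tuples appended for one accession and that its return value is what gets stored.
def _post_go_sketch(values: list) -> list:
    """Deduplicate GO annotations by GO ID and order them by category, then GO ID."""
    terms = {}
    for cat_order, go_id, term_name, cat_code, cat_name in values:
        # keep one tuple per GO ID (duplicates can come from multiple evidence rows)
        terms[go_id] = (cat_order, go_id, term_name, cat_code, cat_name)

    return [
        {
            "id": go_id,
            "name": term_name,
            "category": {"code": cat_code, "name": cat_name},
        }
        for cat_order, go_id, term_name, cat_code, cat_name in sorted(terms.values())
    ]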
def export_name(url: str, keyfile: str, output: str,
                processes: int = 1, tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute(
            """
            SELECT ACCESSION, DESCR
            FROM (
              SELECT
                E.ACCESSION, D.DESCR,
                ROW_NUMBER() OVER (
                  PARTITION BY E.ACCESSION
                  ORDER BY CV.DESC_ID,    -- 1=RecName, 2=AltName, 3=SubName
                           CV.ORDER_IN,   -- Swiss-Prot manual order
                           D.DESCR        -- TrEMBL alphabetic order
                ) RN
              FROM SPTR.DBENTRY@SWPREAD E
              INNER JOIN SPTR.DBENTRY_2_DESC@SWPREAD D
                ON E.DBENTRY_ID = D.DBENTRY_ID
                AND D.DESC_ID IN (1, 4, 11, 13, 16, 23, 25, 28, 35)  -- 'Full' description section
              INNER JOIN SPTR.CV_DESC@SWPREAD CV
                ON D.DESC_ID = CV.DESC_ID
              WHERE E.ENTRY_TYPE IN (0, 1)
                AND E.MERGE_STATUS != 'R'
                AND E.DELETED = 'N'
                AND E.FIRST_PUBLIC IS NOT NULL
            )
            WHERE RN = 1
            """
        )

        i = 0
        for accession, description in cur:
            store[accession] = description

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
def export_proteome(url: str, keyfile: str, output: str,
                    processes: int = 1, tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()

        """
        Without the DISTINCT, there would be duplicated rows, e.g.

        A0A059MHQ6  UP000024941
        A0A059MHQ6  UP000024941

        Even with duplicated rows, a given UniProt accession is associated
        with one unique UPID. It's just easier to remove the duplicates
        at the database level.
        """
        cur.execute(
            """
            SELECT DISTINCT E.ACCESSION, P.UPID
            FROM SPTR.DBENTRY@SWPREAD E
            INNER JOIN SPTR.PROTEOME2UNIPROT@SWPREAD P2U
              ON E.ACCESSION = P2U.ACCESSION
              AND E.TAX_ID = P2U.TAX_ID
            INNER JOIN SPTR.PROTEOME@SWPREAD P
              ON P2U.PROTEOME_ID = P.PROTEOME_ID
              AND P.IS_REFERENCE = 1
            WHERE E.ENTRY_TYPE IN (0, 1)
              AND E.MERGE_STATUS != 'R'
              AND E.DELETED = 'N'
              AND E.FIRST_PUBLIC IS NOT NULL
            """
        )

        i = 0
        for accession, upid in cur:
            store[accession] = upid

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
def export_evidence(url: str, keyfile: str, output: str,
                    processes: int = 1, tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute(
            """
            SELECT ACCESSION, PROTEIN_EXISTENCE_ID, NAME
            FROM (
              SELECT
                E.ACCESSION,
                E.PROTEIN_EXISTENCE_ID,
                GN.NAME,
                ROW_NUMBER() OVER (
                  PARTITION BY E.ACCESSION
                  ORDER BY GN.GENE_NAME_TYPE_ID
                ) RN
              FROM SPTR.DBENTRY@SWPREAD E
              LEFT OUTER JOIN SPTR.GENE@SWPREAD G
                ON E.DBENTRY_ID = G.DBENTRY_ID
              LEFT OUTER JOIN SPTR.GENE_NAME@SWPREAD GN
                ON G.GENE_ID = GN.GENE_ID
              WHERE E.ENTRY_TYPE IN (0, 1)
                AND E.MERGE_STATUS != 'R'
                AND E.DELETED = 'N'
                AND E.FIRST_PUBLIC IS NOT NULL
            )
            WHERE RN = 1
            """
        )

        i = 0
        for accession, evidence, gene in cur:
            store[accession] = (evidence, gene)

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
def export_features(url: str, keyfile: str, output: str,
                    processes: int = 1, tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute(
            """
            SELECT FM.PROTEIN_AC, FM.METHOD_AC, LOWER(DB.DBSHORT),
                   FM.POS_FROM, FM.POS_TO, FM.SEQ_FEATURE
            FROM INTERPRO.FEATURE_MATCH FM
            INNER JOIN INTERPRO.CV_DATABASE DB ON FM.DBCODE = DB.DBCODE
            """
        )

        i = 0
        for row in cur:
            protein_acc = row[0]
            signature_acc = row[1]
            database = row[2]
            pos_start = row[3]
            pos_end = row[4]
            seq_feature = row[5]

            if database == "mobidblt" and seq_feature is None:
                seq_feature = "Consensus Disorder Prediction"

            store.update(protein_acc, {
                signature_acc: {
                    "database": database,
                    "locations": [(pos_start, pos_end, seq_feature)]
                }
            }, replace=True)

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 100000000:
                    logger.info(f"{i:>13,}")

        cur.close()
        con.close()

        logger.info(f"{i:>13,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
def export_comments(url: str, keyfile: str, output: str,
                    processes: int = 1, tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()

        """
        Note on the TEXT structure:
        Some comments have a title (e.g. Q01299) which is not retrieved
        when joining on CC_STRUCTURE_TYPE_ID = 1
        """
        cur.execute(
            """
            SELECT E.ACCESSION, B.ORDER_IN, NVL(B.TEXT, SS.TEXT)
            FROM SPTR.DBENTRY@SWPREAD E
            INNER JOIN SPTR.COMMENT_BLOCK@SWPREAD B
              ON E.DBENTRY_ID = B.DBENTRY_ID
              AND B.COMMENT_TOPICS_ID = 2         -- FUNCTION comments
            LEFT OUTER JOIN SPTR.COMMENT_STRUCTURE@SWPREAD S
              ON B.COMMENT_BLOCK_ID = S.COMMENT_BLOCK_ID
              AND S.CC_STRUCTURE_TYPE_ID = 1      -- TEXT structure
            LEFT OUTER JOIN SPTR.COMMENT_SUBSTRUCTURE@SWPREAD SS
              ON S.COMMENT_STRUCTURE_ID = SS.COMMENT_STRUCTURE_ID
            WHERE E.ENTRY_TYPE IN (0, 1)          -- Swiss-Prot and TrEMBL
              AND E.MERGE_STATUS != 'R'           -- not 'Redundant'
              AND E.DELETED = 'N'                 -- not deleted
              AND E.FIRST_PUBLIC IS NOT NULL      -- published
            """
        )

        i = 0
        for accession, block_number, text in cur:
            store.append(accession, (block_number, text))

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(fn=_post_comments, processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
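# A minimal sketch of a merge callback like _post_comments (illustrative only,
# not the real implementation): it assumes the callback receives the
# (block_number, text) tuples appended for one accession above.
def _post_comments_sketch(values: list) -> list:
    """Order FUNCTION comment blocks by ORDER_IN and keep non-empty texts only."""
    return [text for _, text in sorted(values, key=lambda v: v[0]) if text]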
def export_proteins(url: str, keyfile: str, output: str,
                    processes: int = 1, tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute(
            """
            SELECT PROTEIN_AC, NAME, DBCODE, LEN, FRAGMENT,
                   TO_CHAR(TAX_ID), CRC64
            FROM INTERPRO.PROTEIN
            """
        )

        i = 0
        for row in cur:
            store[row[0]] = {
                "identifier": row[1],
                "reviewed": row[2] == 'S',
                "length": row[3],
                "fragment": row[4] == 'Y',
                "taxid": row[5],
                "crc64": row[6]
            }

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 10000000:
                    logger.info(f"{i:>12,}")

        cur.close()
        con.close()

        logger.info(f"{i:>12,}")
        size = store.merge(processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
def export_matches(url: str, keyfile: str, output: str,
                   processes: int = 1, tmpdir: Optional[str] = None):
    logger.info("starting")
    with Store(output, Store.load_keys(keyfile), tmpdir) as store:
        con = cx_Oracle.connect(url)
        cur = con.cursor()
        cur.execute(
            """
            SELECT M.PROTEIN_AC, M.METHOD_AC, M.MODEL_AC, M.POS_FROM,
                   M.POS_TO, M.FRAGMENTS, M.SCORE, E.ENTRY_AC
            FROM INTERPRO.MATCH M
            LEFT OUTER JOIN (
              SELECT E.ENTRY_AC, EM.METHOD_AC
              FROM INTERPRO.ENTRY E
              INNER JOIN INTERPRO.ENTRY2METHOD EM
                ON E.ENTRY_AC = EM.ENTRY_AC
              WHERE E.CHECKED = 'Y'
            ) E ON M.METHOD_AC = E.METHOD_AC
            """
        )

        i = 0
        for row in cur:
            if row[5]:
                fragments = []
                for frag in row[5].split(','):
                    # Format: START-END-STATUS
                    s, e, t = frag.split('-')
                    fragments.append({
                        "start": int(s),
                        "end": int(e),
                        "dc-status": DC_STATUSES[t]
                    })
            else:
                fragments = [{
                    "start": row[3],
                    "end": row[4],
                    "dc-status": DC_STATUSES['S']  # Continuous
                }]

            store.append(row[0], (
                row[1],   # signature
                row[2],   # model
                row[6],   # score
                fragments,
                row[7]    # InterPro entry
            ))

            i += 1
            if not i % 1000000:
                store.sync()

                if not i % 100000000:
                    logger.info(f"{i:>13,}")

        cur.close()
        con.close()

        logger.info(f"{i:>13,}")
        size = store.merge(fn=_post_matches, processes=processes)
        logger.info(f"temporary files: {size/1024/1024:.0f} MB")
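# Illustrative only: a fragment-status lookup and a merge callback of the kind
# export_matches relies on. The real DC_STATUSES values and _post_matches logic
# are defined elsewhere in the codebase and may differ.
DC_STATUSES_EXAMPLE = {
    "S": "CONTINUOUS",              # single continuous fragment (assumed meaning)
    "N": "N_TERMINAL_DISC",         # discontinuous at the N terminus
    "C": "C_TERMINAL_DISC",         # discontinuous at the C terminus
    "NC": "N_AND_C_TERMINAL_DISC",  # discontinuous at both termini
}


def _post_matches_sketch(values: list) -> list:
    """Sort one protein's matches by signature accession, then leftmost fragment."""
    return sorted(values, key=lambda m: (m[0], m[3][0]["start"]))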