Example 1
def connect_mysql(url: str) -> MySQLdb.Connection:
    obj = url2dict(url)
    # MySQLdb and psycopg2 use different keyword params for password/database
    obj.update({
        "passwd": obj.pop("password"),
        "db": obj.pop("dbname")
    })
    return MySQLdb.connect(**obj)
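
For reference, a minimal sketch of the key remapping, assuming url2dict() parses a connection URL into psycopg2-style keyword arguments (host, user, password, dbname, port); the placeholder values below are made up:

# Hypothetical input, assuming url2dict() yields psycopg2-style keys.
obj = {"host": "localhost", "user": "app", "password": "secret",
       "dbname": "interpro", "port": 3306}
obj.update({
    "passwd": obj.pop("password"),  # MySQLdb expects "passwd", not "password"
    "db": obj.pop("dbname")         # MySQLdb expects "db", not "dbname"
})
# obj can now be passed as MySQLdb.connect(**obj)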
Example 2
def import_databases(ora_url: str, pg_url: str):
    logger.info("populating")
    pg_con = psycopg2.connect(**url2dict(pg_url))
    with pg_con.cursor() as pg_cur:
        pg_cur.execute("DROP TABLE IF EXISTS database")
        pg_cur.execute(
            """
            CREATE TABLE database (
                id SERIAL NOT NULL 
                    CONSTRAINT database_pkey PRIMARY KEY,
                name VARCHAR(50) NOT NULL,
                name_long VARCHAR(50) NOT NULL,
                version VARCHAR(50) NOT NULL,
                updated DATE NOT NULL
            )
            """
        )

        ora_con = cx_Oracle.connect(ora_url)
        ora_cur = ora_con.cursor()
        ora_cur.execute(
            f"""
            SELECT D.DBCODE, LOWER(D.DBSHORT), D.DBNAME, V.VERSION, V.FILE_DATE
            FROM INTERPRO.CV_DATABASE D
            LEFT OUTER JOIN INTERPRO.DB_VERSION V
              ON D.DBCODE = V.DBCODE
            """
        )

        for row in ora_cur:
            if row[0] in DATABASES:
                pg_cur.execute(
                    """
                    INSERT INTO database (name, name_long, version, updated)
                    VALUES (%s, %s, %s, %s)
                    """, row[1:]
                )

        ora_cur.close()
        ora_con.close()

        pg_cur.execute(
            """
            CREATE UNIQUE INDEX database_name_idx
            ON database (name)
            """
        )
        pg_con.commit()

    pg_con.close()
    logger.info("complete")
Example 3
def get_swissprot_descriptions(pg_url: str) -> dict:
    con = psycopg2.connect(**url2dict(pg_url))
    with con.cursor() as cur:
        cur.execute(
            """
            SELECT DISTINCT s2p.signature_acc, pn.text
            FROM interpro.signature2protein s2p
            INNER JOIN interpro.protein_name pn ON s2p.name_id = pn.name_id
            WHERE s2p.is_reviewed            
            """
        )

        signatures = {}
        for signature_acc, text in cur:
            try:
                signatures[signature_acc].add(text)
            except KeyError:
                signatures[signature_acc] = {text}

    con.close()

    return signatures
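
The try/except KeyError blocks above accumulate a dict of sets one row at a time; a purely illustrative equivalent uses collections.defaultdict:

from collections import defaultdict

# Illustrative only: produces the same mapping as the try/except pattern above.
signatures = defaultdict(set)
for signature_acc, text in cur:
    signatures[signature_acc].add(text)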
Example 4
def import_signatures(ora_url: str, pg_url: str, allseqs: str, compseqs: str):
    logger.info("populating")
    with open(allseqs, "rb") as fh:
        allseqs = pickle.load(fh)

    with open(compseqs, "rb") as fh:
        compseqs = pickle.load(fh)

    pg_con = psycopg2.connect(**url2dict(pg_url))
    with pg_con.cursor() as pg_cur:
        pg_cur.execute("SELECT name, id FROM database")
        databases = dict(pg_cur.fetchall())

    ora_con = cx_Oracle.connect(ora_url)
    ora_cur = ora_con.cursor()
    ora_cur.execute(
        """
        SELECT
            M.METHOD_AC, LOWER(D.DBSHORT), M.NAME, M.DESCRIPTION, T.ABBREV, 
            M.ABSTRACT, M.ABSTRACT_LONG
        FROM INTERPRO.METHOD M
        INNER JOIN INTERPRO.CV_DATABASE D ON M.DBCODE = D.DBCODE
        INNER JOIN INTERPRO.CV_ENTRY_TYPE T ON M.SIG_TYPE = T.CODE
        """
    )
    values = []
    for row in ora_cur:
        try:
            db_id = databases[row[1]]
        except KeyError:
            continue

        acc = row[0]
        try:
            n_rev_seqs, n_rev_matches, n_unrev_seqs = allseqs[acc]
        except KeyError:
            n_rev_seqs = n_rev_matches = n_unrev_seqs = 0

        try:
            num_complete_sequences, num_residues = compseqs[acc]
        except KeyError:
            num_complete_sequences = num_residues = 0

        values.append((
            acc,                        # accession
            db_id,                      # database_id
            row[2],                     # name
            row[3],                     # description
            row[4],                     # type
            row[6].read() if row[6] is not None else row[5],    # abstract
            n_rev_seqs + n_unrev_seqs,  # num_sequences
            n_rev_seqs,                 # num_reviewed_sequences
            n_rev_matches,              # num_reviewed_matches
            num_complete_sequences,     # num_complete_sequences
            num_residues                # num_residues
        ))
    ora_cur.close()
    ora_con.close()

    with pg_con.cursor() as pg_cur:
        pg_cur.execute("DROP TABLE IF EXISTS signature")
        pg_cur.execute(
            """
            CREATE TABLE signature (
                accession VARCHAR(25) NOT NULL 
                    CONSTRAINT signature_pkey PRIMARY KEY,
                database_id INTEGER NOT NULL,
                name VARCHAR(100) NOT NULL,
                description VARCHAR(400),
                type VARCHAR(25) NOT NULL,
                abstract TEXT,
                num_sequences INTEGER NOT NULL,
                num_reviewed_sequences INTEGER NOT NULL,
                num_reviewed_matches INTEGER NOT NULL,
                num_complete_sequences INTEGER NOT NULL,
                num_residues BIGINT NOT NULL
            )
            """
        )
        execute_values(pg_cur, "INSERT INTO signature VALUES %s", values,
                       page_size=1000)

        pg_cur.execute(
            """
            CREATE INDEX signature_database_idx
            ON signature (database_id)
            """
        )
        pg_con.commit()

    pg_con.close()
    logger.info("complete")
Example 5
def import_annotations(ora_url: str, pg_url: str):
    pg_con = psycopg2.connect(**url2dict(pg_url))
    with pg_con.cursor() as pg_cur:
        for name in ("protein2go", "publication", "term"):
            pg_cur.execute(f"DROP TABLE IF EXISTS {name}")

        pg_cur.execute("""
            CREATE TABLE protein2go (
                protein_acc VARCHAR(15) NOT NULL,
                term_id VARCHAR(10) NOT NULL,
                ref_db_code VARCHAR(10) NOT NULL,
                ref_db_id VARCHAR(60) NOT NULL
            )
            """)
        pg_cur.execute("""
            CREATE TABLE publication (
                id VARCHAR(25) NOT NULL 
                    CONSTRAINT publication_pkey PRIMARY KEY,
                title VARCHAR(1500) NOT NULL,
                published DATE NOT NULL
            )
            """)
        pg_cur.execute("""
            CREATE TABLE term (
                id VARCHAR(10) NOT NULL 
                    CONSTRAINT term_pkey PRIMARY KEY,
                name VARCHAR(200) NOT NULL,
                category VARCHAR(25) NOT NULL,
                num_constraints INTEGER NOT NULL,
                is_obsolete BOOLEAN NOT NULL,
                definition VARCHAR NOT NULL,
                replaced_id VARCHAR(10)
            )
            """)

        ora_con = cx_Oracle.connect(ora_url)
        ora_cur = ora_con.cursor()

        logger.info("populating: protein2go")
        """
        Filtering on length:
        Some annotations are not on proteins, but on post-translation modifications or processing events.
          e.g. P27958:PRO_0000037566 (protein: P27958; chain: PRO_0000037573)

        Protein accessions are 15 characters long (max), so anything longer than 15 characters cannot be an accession.
        A better (but heavier) approach would be to join with our PROTEIN table.
        """
        ora_cur.execute("""
            SELECT A.ENTITY_ID, A.GO_ID, A.REF_DB_CODE, A.REF_DB_ID
            FROM GO.ANNOTATIONS@GOAPRO A
            INNER JOIN GO.ECO2EVIDENCE@GOAPRO E ON A.ECO_ID = E.ECO_ID
            INNER JOIN GO.CV_SOURCES@GOAPRO S ON S.CODE = A.SOURCE
            WHERE A.ENTITY_TYPE = 'protein'
            AND LENGTH(A.ENTITY_ID) <= 15
            AND E.GO_EVIDENCE != 'IEA'
            AND S.IS_PUBLIC = 'Y'
            """)

        sql = "INSERT INTO protein2go VALUES %s"
        execute_values(pg_cur, sql, ora_cur, page_size=1000)

        logger.info("populating: publication")
        ora_cur.execute("""
            SELECT ID, TITLE, FIRST_PUBLISH_DATE
            FROM GO.PUBLICATIONS@GOAPRO
            WHERE ID IN (
              SELECT DISTINCT A.REF_DB_ID
              FROM GO.ANNOTATIONS@GOAPRO A
              INNER JOIN GO.ECO2EVIDENCE@GOAPRO E ON A.ECO_ID = E.ECO_ID
              INNER JOIN GO.CV_SOURCES@GOAPRO S ON S.CODE = A.SOURCE
              WHERE A.ENTITY_TYPE = 'protein'
              AND LENGTH(A.ENTITY_ID) <= 15
              AND E.GO_EVIDENCE != 'IEA'
              AND S.IS_PUBLIC = 'Y'
              AND A.REF_DB_CODE = 'PMID'
            )
            """)
        sql = "INSERT INTO publication VALUES %s"
        execute_values(pg_cur, sql, ora_cur, page_size=1000)

        logger.info("populating: term")
        ora_cur.execute("""
            SELECT CHILD_ID, PARENT_ID
            FROM GO.ANCESTORS@GOAPRO
            WHERE CHILD_ID != PARENT_ID
            """)
        ancestors = {}
        for term_id, parent_id in ora_cur:
            try:
                ancestors[term_id].add(parent_id)
            except KeyError:
                ancestors[term_id] = {parent_id}

        ora_cur.execute("""
            SELECT DISTINCT GO_ID, CONSTRAINT_ID
            FROM GO.TERM_TAXON_CONSTRAINTS@GOAPRO
            """)
        constraints = {}
        for term_id, constraint_id in ora_cur:
            try:
                constraints[term_id].add(constraint_id)
            except KeyError:
                constraints[term_id] = {constraint_id}

        ora_cur.execute("""
            SELECT T.GO_ID, T.NAME, C.TERM_NAME, T.IS_OBSOLETE,
                   D.DEFINITION, NULL
            FROM GO.TERMS@GOAPRO T
            INNER JOIN GO.DEFINITIONS@GOAPRO D ON T.GO_ID = D.GO_ID
            INNER JOIN GO.CV_CATEGORIES@GOAPRO C ON T.CATEGORY = C.CODE
            UNION ALL
            SELECT S.SECONDARY_ID, T.NAME, C.TERM_NAME, T.IS_OBSOLETE,
                   D.DEFINITION, T.GO_ID
            FROM GO.SECONDARIES@GOAPRO S
            INNER JOIN GO.TERMS@GOAPRO T ON S.GO_ID = T.GO_ID
            INNER JOIN GO.CV_CATEGORIES@GOAPRO C ON T.CATEGORY = C.CODE
            INNER JOIN GO.DEFINITIONS@GOAPRO D ON T.GO_ID = D.GO_ID
            """)

        sql = "INSERT INTO term VALUES %s"
        execute_values(pg_cur,
                       sql,
                       ((row[0], row[1], row[2],
                         len(_get_constraints(row[0], ancestors, constraints)),
                         row[3] == 'Y', row[4], row[5]) for row in ora_cur),
                       page_size=1000)

        ora_cur.close()
        ora_con.close()

        pg_cur.execute("""
            CREATE INDEX protein2go_protein_idx
            ON protein2go (protein_acc)
            """)
        pg_cur.execute("""
            CREATE INDEX protein2go_term_idx
            ON protein2go (term_id)
            """)
        pg_cur.execute("ANALYZE publication")
        pg_cur.execute("ANALYZE term")
        pg_con.commit()

    pg_con.close()
    logger.info("complete")
Example 6
def import_taxonomy(ora_url: str, pg_url: str):
    logger.info("loading taxonomy info")
    ora_con = cx_Oracle.connect(ora_url)
    ora_cur = ora_con.cursor()
    ora_cur.execute("""
        SELECT TAX_ID, SCIENTIFIC_NAME, RANK, LEFT_NUMBER, RIGHT_NUMBER, 
               PARENT_ID
        FROM INTERPRO.ETAXI
        """)
    taxa = {}
    for tax_id, name, rank, left_num, right_num, parent_id in ora_cur:
        if tax_id in (1, 131567):
            """
            Skip root and meta-superkingdom (131567) which contains:
                * Bacteria (2)
                * Archaea (2157)
                * Eukaryota (2759)
            """
            continue
        elif parent_id in (1, 131567):
            rank = "superkingdom"
            parent_id = None

        taxa[tax_id] = (name, rank, left_num, right_num, parent_id)

    ora_cur.close()
    ora_con.close()

    pg_con = psycopg2.connect(**url2dict(pg_url))
    with pg_con.cursor() as pg_cur:
        pg_cur.execute("DROP TABLE IF EXISTS taxon")
        pg_cur.execute("DROP TABLE IF EXISTS lineage")
        pg_cur.execute("""
            CREATE TABLE taxon (
                id INTEGER NOT NULL 
                    CONSTRAINT taxon_id_pkey PRIMARY KEY,
                name VARCHAR(255) NOT NULL,
                rank VARCHAR(50) NOT NULL,
                left_number INTEGER NOT NULL,
                right_number INTEGER NOT NULL,
                parent_id INTEGER,
                lineage TEXT NOT NULL
            )
            """)
        pg_cur.execute("""
            CREATE TABLE lineage (
                child_id INTEGER NOT NULL,
                parent_id INTEGER NOT NULL,
                parent_rank VARCHAR(255) NOT NULL
            )
            """)

        logger.info("populating: taxon")
        execute_values(pg_cur,
                       "INSERT INTO taxon VALUES %s",
                       ((tax_id, name, rank, left_num, right_num, parent_id,
                         json.dumps(get_lineage(taxa, tax_id)))
                        for tax_id, (name, rank, left_num, right_num,
                                     parent_id) in taxa.items()),
                       page_size=1000)

        pg_cur.execute("""
            CREATE INDEX taxon_left_number_idx
            ON taxon (left_number)
            """)

        logger.info("populating: lineage")
        execute_values(pg_cur,
                       "INSERT INTO lineage VALUES %s",
                       iter_lineage(taxa),
                       page_size=1000)

        pg_cur.execute("""
            CREATE UNIQUE INDEX lineage_child_parent_uidx
            ON lineage (child_id, parent_id)
            """)
        pg_cur.execute("""
            CREATE INDEX lineage_child_rank_idx
            ON lineage (child_id, parent_rank)
            """)

        pg_con.commit()

    pg_con.close()
    logger.info("complete")