Beispiel #1
0
def genomicsqlite_txdb(txdb):
    """
    import bioC TxDb sourced originally from:
    http://bioconductor.org/packages/release/BiocViews.html#___TxDb
    """
    outfile = txdb[:-7] + ".genomicsqlite"
    conn = sqlite3.connect(txdb)
    conn.executescript(genomicsqlite.vacuum_into_sql(conn, outfile))
    conn.close()
    # create GRIs on the three feature tables
    conn = genomicsqlite.connect(outfile)
    conn.executescript(
        genomicsqlite.create_genomic_range_index_sql(conn,
                                                     "transcript",
                                                     "tx_chrom",
                                                     "tx_start",
                                                     "tx_end",
                                                     floor=2))
    conn.executescript(
        genomicsqlite.create_genomic_range_index_sql(conn,
                                                     "cds",
                                                     "cds_chrom",
                                                     "cds_start",
                                                     "cds_end",
                                                     floor=2))
    # intentionally left exon unindexed
    conn.close()
    return outfile
Beispiel #2
0
def test_gri_levels_in_sql(tmp_path):
    dbfile = str(tmp_path / "test.gsql")
    con = genomicsqlite.connect(dbfile, unsafe_load=True)
    _fill_exons(con)
    con.commit()

    results = list(
        con.execute("SELECT * FROM genomic_range_index_levels('exons')"))
    assert results == [(3, 1)]

    with pytest.raises(sqlite3.OperationalError, match="no such table"):
        con.execute("SELECT * FROM genomic_range_index_levels('nonexistent')")

    con.executescript("CREATE TABLE empty(rid TEXT, beg INTEGER, end INTEGER)")
    with pytest.raises(sqlite3.OperationalError,
                       match="missing genomic range index"):
        con.execute(
            "SELECT _gri_ceiling, _gri_floor FROM genomic_range_index_levels('empty')"
        )

    con.executescript(
        genomicsqlite.create_genomic_range_index_sql(con, "empty", "rid",
                                                     "beg", "end"))
    results = list(
        con.execute(
            "SELECT _gri_ceiling, _gri_floor FROM genomic_range_index_levels('empty')"
        ))
    assert results == [(15, 0)]
Beispiel #3
0
def test_attach(tmp_path):
    dbfile = str(tmp_path / "test.gsql")
    con = genomicsqlite.connect(dbfile, unsafe_load=True)
    _fill_exons(con, gri=False)
    con.commit()

    attach_script = genomicsqlite.attach_sql(con,
                                             str(tmp_path /
                                                 "test_attached.gsql"),
                                             "db2",
                                             unsafe_load=True)
    con.executescript(attach_script)
    con.executescript("CREATE TABLE db2.exons2 AS SELECT * FROM exons")
    con.executescript(
        genomicsqlite.create_genomic_range_index_sql(con, "db2.exons2", "rid",
                                                     "beg", "end"))
    ref_script = genomicsqlite.put_reference_assembly_sql(
        con, "GRCh38_no_alt_analysis_set", schema="db2")
    con.executescript(ref_script)

    query = (
        "SELECT exons.id, db2.exons2.id FROM exons LEFT JOIN db2.exons2 ON db2.exons2._rowid_ IN\n"
        + genomicsqlite.genomic_range_rowids_sql(
            con, "db2.exons2", "exons.rid", "exons.beg", "exons.end") +
        " AND exons.id != db2.exons2.id ORDER BY exons.id, db2.exons2.id")
    results = list(con.execute(query))
    assert len(results) == 5191

    refseq_by_name = genomicsqlite.get_reference_sequences_by_name(
        con, schema="db2")
    assert len(refseq_by_name) > 24
Beispiel #4
0
def test_gri_lvl():
    # Test the _gri_lvl generated column which calculates each feature's level number based on its
    # length.
    con = sqlite3.connect(":memory:")
    con.executescript(
        "CREATE TABLE features(rid INTEGER, beg INTEGER, end INTEGER, expected_lvl INTEGER)"
    )
    for lvl in range(16):
        for ofs in (-2, -1, 0, 1):
            featlen = 16 ** lvl + ofs
            tup = (420, 420 + featlen, (0 - lvl if ofs < 1 else 0 - lvl - 1))
            con.execute("INSERT INTO features VALUES(42,?,?,?)", tup)
    con.executescript(
        genomicsqlite.create_genomic_range_index_sql(con, "features", "rid", "beg", "end")
    )
    assert (
        next(
            con.execute("SELECT count(*) FROM features WHERE expected_lvl == ifnull(_gri_lvl,999)")
        )[0]
        == 62
    )
    assert (
        next(
            con.execute("SELECT count(*) FROM features WHERE expected_lvl != ifnull(_gri_lvl,999)")
        )[0]
        == 2
    )
Beispiel #5
0
def _fill_exons(con, floor=None, table="exons", gri=True, len_gri=False):
    con.execute(
        f"CREATE TABLE {table}(rid TEXT NOT NULL, beg INTEGER NOT NULL, end INTEGER NOT NULL, len INTEGER NOT NULL, id TEXT NOT NULL)"
    )
    for line in _EXONS.strip().split("\n"):
        line = line.split("\t")
        con.execute(
            f"INSERT INTO {table}(rid,beg,end,len,id) VALUES(?,?,?,?,?)",
            (line[0], int(line[1]) - 1, int(line[2]), int(line[2]) - int(line[1]) + 1, line[3]),
        )
    if gri:
        con.executescript(
            genomicsqlite.create_genomic_range_index_sql(
                con, table, "rid", "beg", ("beg+len" if len_gri else "end"), floor=floor
            )
        )
Beispiel #6
0
def test_abutment():
    # Test GRI query correctness with various cases of features abutting the query range
    con = sqlite3.connect(":memory:")
    con.executescript("CREATE TABLE features(rid INTEGER, beg INTEGER, end INTEGER)")
    pos0 = 10000000000
    for lvl in range(9):
        for ofs in (-1, 0, 1):
            for tup in ((pos0 - 16 ** lvl, pos0 + ofs), (pos0 + 123 + ofs, pos0 + 123 + 16 ** 9)):
                con.execute("INSERT INTO features VALUES(42,?,?)", tup)
    con.executescript(
        genomicsqlite.create_genomic_range_index_sql(con, "features", "rid", "beg", "end")
    )
    query = genomicsqlite.genomic_range_rowids_sql(
        con, "features", "42", str(pos0), str(pos0 + 123)
    )[1:-1]
    control_query = f"SELECT _rowid_ FROM features NOT INDEXED WHERE rid = 42 AND NOT (end < {pos0} OR beg > {pos0+123}) ORDER BY _rowid_"
    assert list(con.execute(query)) == list(con.execute(control_query))
Beispiel #7
0
def test_gri_levels_in_sql(tmp_path):
    dbfile = str(tmp_path / "test.gsql")
    con = genomicsqlite.connect(dbfile, unsafe_load=True)
    _fill_exons(con)
    con.commit()

    # test caching & invalidation:
    results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')"))
    assert results == [(3, 1)]
    results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')"))
    assert results == [(3, 1)]
    results = list(con.execute("SELECT * FROM genomic_range_index_levels('main.exons')"))
    assert results == [(3, 1)]
    tch1 = con.total_changes
    con.execute("INSERT INTO exons VALUES('ether',0,4097,4097,'ether')")
    tch2 = con.total_changes
    assert tch2 > tch1
    results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')"))
    assert results == [(4, 1)]
    con.commit()
    results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')"))
    assert results == [(4, 1)]
    con.execute("DELETE FROM exons WHERE rid = 'ether'")
    con.commit()
    results = list(con.execute("SELECT * FROM genomic_range_index_levels('main.exons')"))
    assert results == [(3, 1)]
    results = list(con.execute("SELECT * FROM genomic_range_index_levels('exons')"))
    assert results == [(3, 1)]

    with pytest.raises(sqlite3.OperationalError, match="no such table"):
        con.execute("SELECT * FROM genomic_range_index_levels('nonexistent')")

    con.executescript("CREATE TABLE empty(rid TEXT, beg INTEGER, end INTEGER)")
    with pytest.raises(sqlite3.OperationalError, match="missing genomic range index"):
        con.execute("SELECT _gri_ceiling, _gri_floor FROM genomic_range_index_levels('empty')")

    con.executescript(
        genomicsqlite.create_genomic_range_index_sql(con, "empty", "rid", "beg", "end")
    )
    results = list(
        con.execute("SELECT _gri_ceiling, _gri_floor FROM genomic_range_index_levels('empty')")
    )
    assert results == [(15, 0)]
Beispiel #8
0
def main():
    header = f"""\
        -- GenomicSQLite smoke test --
        timestamp: {time.asctime(time.gmtime())}
        platform: {platform.platform()}
        uname: {os.uname()}
        python: {platform.python_implementation()} {platform.python_version()}"""
    print(textwrap.dedent(header))

    print("cpu", end="")
    try:
        if platform.system() == "Linux":
            with open("/proc/cpuinfo") as cpuinfo:
                modelname = (line.strip().replace("\t", "") for line in cpuinfo
                             if line.lower().strip().startswith("model name"))
                print(" " + next(modelname, ": ???"))
        elif platform.system() == "Darwin":
            sysctl = subprocess.run(
                ["sysctl", "-n", "machdep.cpu.brand_string"],
                check=True,
                universal_newlines=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            if sysctl.returncode == 0 and sysctl.stdout.strip():
                print(": " + sysctl.stdout.strip())
            else:
                print(": ???")
        else:
            print(": ???")
    except Exception:
        print(": ???")

    env_keys = [
        k for k in os.environ
        if ("genomicsqlite" in k.lower()) or k in ("PATH", "PYTHONPATH",
                                                   "LD_LIBRARY_PATH",
                                                   "DYLD_LIBRARY_PATH")
    ]
    if env_keys:
        print("environment:")
        for k in env_keys:
            print(f"  {k}={os.environ[k]}")

    print("sqlite3: ", end="")
    import sqlite3

    conn = sqlite3.connect(":memory:")
    print(next(conn.execute("SELECT sqlite_version()"))[0])

    print("genomicsqlite: ", end="")
    try:
        import genomicsqlite
    except ImportError as exn:
        print(f"\n\nUnable to import genomicsqlite: {exn}")
        print(
            "The Python genomicsqlite package may need to be installed via pip3 or conda,"
        )
        print("or its location may need to be added to PYTHONPATH.")
        sys.exit(1)

    print(genomicsqlite.__version__)
    print(f"genomicsqlite library: {genomicsqlite._DLL}")

    dbfile = os.path.join(os.environ.get("TMPDIR", "/tmp"),
                          f"genomicsqlite_smoke_test.{time.monotonic()}")
    print(f"\ntest database: {dbfile}")
    try:
        dbconn = genomicsqlite.connect(dbfile)
        for (table, bed) in (("exons1", _EXONS1), ("exons2", _EXONS2)):
            dbconn.execute(
                f"CREATE TABLE {table}(chrom TEXT, pos INTEGER, end INTEGER, id TEXT PRIMARY KEY)"
            )
            for line in bed.strip().split("\n"):
                line = line.split("\t")
                dbconn.execute(
                    f"INSERT INTO {table}(chrom,pos,end,id) VALUES(?,?,?,?)",
                    (line[0], int(line[1]) - 1, int(line[2]), line[3]),
                )
            dbconn.executescript(
                genomicsqlite.create_genomic_range_index_sql(
                    dbconn, table, "chrom", "pos", "end"))
        dbconn.commit()

        query = (
            "SELECT exons1.id, exons2.id FROM exons1 LEFT JOIN exons2 ON exons2._rowid_ IN\n"
            + textwrap.indent(
                genomicsqlite.genomic_range_rowids_sql(
                    dbconn, "exons2", "exons1.chrom", "exons1.pos",
                    "exons1.end"),
                " ",
            ) + "\nORDER BY exons1.id, exons2.id")
        print("GRI query:\n" + textwrap.indent(query, "  "))
        print("GRI query plan:")
        gri_constraints = 0
        for expl in dbconn.execute("EXPLAIN QUERY PLAN " + query):
            print("  " + expl[3])
            if ("((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))"
                    in expl[3]):
                gri_constraints += 1

        results = list(dbconn.execute(query))
        control = "SELECT exons1.id, exons2.id FROM exons1 LEFT JOIN exons2 NOT INDEXED ON NOT (exons2.end < exons1.pos OR exons2.pos > exons1.end) ORDER BY exons1.id, exons2.id"
        control_results = list(dbconn.execute(control))
        assert len(control_results) == 1139
        assert results == control_results

        if gri_constraints != 3:
            print("GRI query opcodes:")
            for expl in dbconn.execute("EXPLAIN " + query):
                print("  " + str(expl))

            print(
                "\n** WARNING: GRI yielded correct results, but with a possibly suboptimal query plan."
            )
            print(
                "** Please redirect this log to a file and send to maintainers @ https://github.com/mlin/GenomicSQLite\n"
            )

            sys.exit(2)

        dbconn.close()
        print("\nGenomicSQLite smoke test OK =)\n")
    finally:
        os.remove(dbfile)
Beispiel #9
0
def test_query_in_sql(tmp_path):
    dbfile = str(tmp_path / "test.gsql")
    con = genomicsqlite.connect(dbfile, unsafe_load=True)
    _fill_exons(con)
    con.commit()

    query = "SELECT id FROM exons WHERE exons._rowid_ IN genomic_range_rowids('exons',?,?,?)"
    results = list(con.execute(query, ("chr17", 43044294, 43048294)))

    control_query = genomicsqlite.genomic_range_rowids_sql(con, "exons")
    control_query = "SELECT id FROM exons WHERE exons._rowid_ IN\n" + control_query
    control_results = list(
        con.execute(control_query, ("chr17", 43044294, 43048294)))

    assert results == control_results

    for expl in con.execute(
            "EXPLAIN QUERY PLAN SELECT _rowid_ FROM genomic_range_rowids('exons',?,?,?) ORDER BY _rowid_",
        ("chr17", 43044294, 43048294),
    ):
        assert "USE TEMP B-TREE FOR ORDER BY" not in expl[3]

    assert next((expl[3] for expl in con.execute(
        "EXPLAIN QUERY PLAN SELECT _rowid_ FROM genomic_range_rowids('exons',?,?,?) ORDER BY _rowid_ DESC",
        ("chr17", 43044294, 43048294),
    ) if "USE TEMP B-TREE FOR ORDER BY" in expl[3]))

    with pytest.raises(sqlite3.OperationalError, match="domain error"):
        con.execute(
            "SELECT * FROM genomic_range_rowids('exons', 'chr17', 43044294, 43048294, 16, 0)"
        )

    dbfile2 = str(tmp_path / "test2.gsql")
    con2 = genomicsqlite.connect(dbfile2, unsafe_load=True)
    _fill_exons(con2)
    con2.commit()
    con2.close()

    con.executescript(
        genomicsqlite.attach_sql(con, dbfile2, "db2", immutable=True))
    query = """
        SELECT main.exons.id, db2.exons.id
            FROM main.exons LEFT JOIN db2.exons ON
            db2.exons._rowid_ IN genomic_range_rowids('db2.exons', main.exons.rid, main.exons.beg, main.exons.end)
            AND main.exons.id != db2.exons.id ORDER BY main.exons.id, db2.exons.id
        """
    results = list(con.execute(query))
    assert len(results) == 5191

    with pytest.raises(sqlite3.OperationalError, match="no such table"):
        con.execute(
            "SELECT * FROM genomic_range_rowids('nonexistent', 'chr17', 43044294, 43048294)"
        )

    con.executescript("CREATE TABLE empty(rid TEXT, beg INTEGER, end INTEGER)")
    with pytest.raises(sqlite3.OperationalError, match="no such index"):
        con.execute(
            "SELECT * FROM genomic_range_rowids('empty', 'chr17', 43044294, 43048294)"
        )

    con.executescript(
        genomicsqlite.create_genomic_range_index_sql(con, "empty", "rid",
                                                     "beg", "end"))
    results = list(
        con.execute(
            "SELECT * FROM genomic_range_rowids('empty', 'chr17', 43044294, 43048294)"
        ))
    assert results == []
Beispiel #10
0
def test_level_detection():
    # test corner cases for the bit of genomic_range_rowids() which detects the level range of
    # extant features

    con = sqlite3.connect(":memory:")
    con.executescript(
        "CREATE TABLE features(rid INTEGER, beg INTEGER, end INTEGER)")
    con.executescript(
        genomicsqlite.create_genomic_range_index_sql(con, "features", "rid",
                                                     "beg", "end"))

    def fanout(query):
        return sum(
            1 for expl in con.execute("EXPLAIN QUERY PLAN " +
                                      query, (None, None, None)) if
            "((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))"
            in expl[3])

    assert fanout(
        genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]) == 16

    con.executescript(
        "INSERT INTO features VALUES(NULL, NULL, NULL); INSERT INTO features VALUES(NULL, 0, 10000000000)"
    )
    assert fanout(
        genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]) == 16
    assert not list(
        con.execute(
            genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1],
            (None, 123, 456)))

    con.executescript("INSERT INTO features VALUES(42, 1048568, 1048584)")
    query = genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]
    print("\n" + query)
    assert "-1" in query and "-0x10)" in query
    assert not list(
        con.execute(
            genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1],
            (42, None, 1048584)))

    assert fanout(query) == 1

    con.executescript("""
        INSERT INTO features VALUES(44, 1048568, 1048584);
        INSERT INTO features VALUES(44, 0, 64000)
        """)
    query = genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]
    print("\n" + query)
    assert "-4," in query and "-0x10000)" in query
    assert "-3," in query and "-0x1000)" in query
    assert "-2," in query and "-0x100)" in query
    assert "-1," in query and "-0x10)" in query
    assert fanout(query) == 4

    assert fanout(
        genomicsqlite.genomic_range_rowids_sql(con, "features",
                                               ceiling=6)[1:-1]) == 7
    assert (fanout(
        genomicsqlite.genomic_range_rowids_sql(con,
                                               "features",
                                               ceiling=6,
                                               floor=3)[1:-1]) == 4)

    con.executescript("""
        INSERT INTO features VALUES(43, NULL, 10000000000);
        INSERT INTO features VALUES(44, 0, NULL)
        """)
    assert fanout(
        genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]) == 4

    con.executescript("""
        INSERT INTO features VALUES(43, 0, 10000000000);
        INSERT INTO features VALUES(43, 32, 33)
        """)
    query = genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]
    print("\n" + query)
    assert fanout(query) == 10

    con.executescript("""
        INSERT INTO features VALUES(43, 0, 10000000000);
        INSERT INTO features VALUES(43, 32, 32)
        """)
    query = genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]
    assert fanout(query) == 10
    assert len(list(con.execute(query, (43, 32, 33)))) == 4
    assert len(list(con.execute(query, (43, 33, 33)))) == 3