def test_indexing(): con = sqlite3.connect(":memory:") _fill_exons(con) con.commit() query = genomicsqlite.genomic_range_rowids_sql(con, "exons") query = "SELECT id FROM exons WHERE exons._rowid_ IN\n" + query print("\n" + query) # The query should only search the relevant GRI levels indexed = 0 for expl in con.execute("EXPLAIN QUERY PLAN " + query, ("chr17", 43044294, 43048294)): print(expl[3]) if "USING INDEX exons__gri" in expl[3]: indexed += 1 assert indexed == 3 # The query should be covered by the index except for one final fetch of exons.id opcodes = list( con.execute("EXPLAIN " + query, ("chr17", 43044294, 43048294))) # for expl in opcodes: # if expl[1] in {"OpenRead", "OpenPseudo", "Column"}: # print(expl) accessed_cursors = list(opcode[2] for opcode in opcodes if opcode[1] == "Column") table_rootpages = set(row[0] for row in con.execute( "SELECT rootpage FROM sqlite_master WHERE type='table'")) table_cursors = set( opcode[2] for opcode in opcodes if opcode[1] == "OpenRead" and opcode[3] in table_rootpages) assert sum(1 for cursor in accessed_cursors if cursor not in table_cursors) > 1 # get from idx assert sum(1 for cursor in accessed_cursors if cursor in table_cursors) == 1 # get from table # (Also, it should produce correct results) control_query = "SELECT id FROM exons NOT INDEXED WHERE rid = ? AND NOT (end < ? OR beg > ?) ORDER BY _rowid_" random.seed(0xBADF00D) total_results = 0 for _ in range(50000): beg = random.randint(43044294 - 10000, 43044294 + 10000) end = beg + random.randint(1, random.choice([10, 100, 1000, 10000])) ids = list(row[0] for row in con.execute(query, ("chr17", beg, end))) control_ids = list( row[0] for row in con.execute(control_query, ("chr17", beg, end))) assert ids == control_ids total_results += len(control_ids) assert total_results == 189935 with pytest.raises(sqlite3.OperationalError): genomicsqlite.genomic_range_rowids_sql(con, "nonexistent_table")
def test_refseq(): con = sqlite3.connect(":memory:") create_assembly = genomicsqlite.put_reference_assembly_sql( con, "GRCh38_no_alt_analysis_set") lines = create_assembly.strip().split("\n") print("\n".join([line for line in lines if "INSERT INTO" in line][:24])) assert len([line for line in lines if "INSERT INTO" in line]) == 195 print("\n".join([line for line in lines if "INSERT INTO" not in line])) assert len([line for line in lines if "INSERT INTO" not in line]) == 2 con.executescript(create_assembly) _fill_exons(con, floor=2) con.commit() refseq_by_rid = genomicsqlite.get_reference_sequences_by_rid(con) refseq_by_name = genomicsqlite.get_reference_sequences_by_name(con) for refseq in refseq_by_rid.values(): assert refseq_by_rid[refseq.rid] == refseq assert refseq_by_name[refseq.name] == refseq if refseq.name == "chr17": assert refseq.rid == 17 assert refseq.length == 83257441 assert refseq.assembly == "GRCh38_no_alt_analysis_set" assert refseq.refget_id == "f9a0fb01553adb183568e3eb9d8626db" assert len(refseq_by_rid) == 195 query = ( "SELECT _gri_refseq._gri_rid, rid, beg, end, id FROM exons, _gri_refseq WHERE exons.rid = gri_refseq_name AND exons._rowid_ IN " + genomicsqlite.genomic_range_rowids_sql(con, "exons")) print("\n" + query) assert len([line for line in query.split("\n") if "BETWEEN" in line]) == 2 assert len(list(con.execute(query, ("chr17", 43115725, 43125370)))) == 56
def test_attach(tmp_path): dbfile = str(tmp_path / "test.gsql") con = genomicsqlite.connect(dbfile, unsafe_load=True) _fill_exons(con, gri=False) con.commit() attach_script = genomicsqlite.attach_sql(con, str(tmp_path / "test_attached.gsql"), "db2", unsafe_load=True) con.executescript(attach_script) con.executescript("CREATE TABLE db2.exons2 AS SELECT * FROM exons") con.executescript( genomicsqlite.create_genomic_range_index_sql(con, "db2.exons2", "rid", "beg", "end")) ref_script = genomicsqlite.put_reference_assembly_sql( con, "GRCh38_no_alt_analysis_set", schema="db2") con.executescript(ref_script) query = ( "SELECT exons.id, db2.exons2.id FROM exons LEFT JOIN db2.exons2 ON db2.exons2._rowid_ IN\n" + genomicsqlite.genomic_range_rowids_sql( con, "db2.exons2", "exons.rid", "exons.beg", "exons.end") + " AND exons.id != db2.exons2.id ORDER BY exons.id, db2.exons2.id") results = list(con.execute(query)) assert len(results) == 5191 refseq_by_name = genomicsqlite.get_reference_sequences_by_name( con, schema="db2") assert len(refseq_by_name) > 24
def test_join(): for len_gri in (False, True): con = sqlite3.connect(":memory:") con.executescript( genomicsqlite.put_reference_assembly_sql( con, "GRCh38_no_alt_analysis_set")) _fill_exons(con, table="exons") _fill_exons(con, floor=2, table="exons2", len_gri=len_gri) con.commit() query = ( "SELECT exons.id, exons2.id FROM exons LEFT JOIN exons2 ON exons2._rowid_ IN\n" + genomicsqlite.genomic_range_rowids_sql( con, "exons2", "exons.rid", "exons.beg", "exons.end") + " AND exons.id != exons2.id ORDER BY exons.id, exons2.id") print(query) indexed = 0 for expl in con.execute("EXPLAIN QUERY PLAN " + query): print(expl[3]) if ("((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))" in expl[3]): indexed += 1 assert indexed == 2 results = list(con.execute(query)) assert len(results) == 5191 assert len([result for result in results if result[1] is None]) == 5 control = "SELECT exons.id, exons2.id FROM exons LEFT JOIN exons2 NOT INDEXED ON NOT (exons2.end < exons.beg OR exons2.beg > exons.end) AND exons.id != exons2.id ORDER BY exons.id, exons2.id" control = list(con.execute(control)) assert results == control
def test_web(): conn = genomicsqlite.connect( "https://github.com/mlin/sqlite_zstd_vfs/releases/download/web-test-db-v1/TxDb.Hsapiens.UCSC.hg38.knownGene.vacuum.genomicsqlite", read_only=True, ) results = list(t[0] for t in conn.execute( "SELECT tx_name FROM transcript WHERE _rowid_ IN " + genomicsqlite.genomic_range_rowids_sql(conn, "transcript") + " ORDER BY tx_name", ("chr12", 111803912, 111804012), )) print(results) sys.stdout.flush() assert results == sorted([ "ENST00000416293.7", "ENST00000261733.7", "ENST00000548536.1", "ENST00000549106.1" ]) exon_cds_counts = """ SELECT exon._rowid_ AS exon_id, COUNT(cds._rowid_) AS contained_cds FROM genomic_range_index_levels('cds'), exon LEFT JOIN cds ON cds._rowid_ IN genomic_range_rowids('cds', exon_chrom, exon_start, exon_end, _gri_ceiling, _gri_floor) AND (exon_start = cds_start AND exon_end >= cds_start OR exon_start <= cds_start AND exon_end = cds_end) GROUP BY exon._rowid_ """ for expl in conn.execute("EXPLAIN QUERY PLAN " + exon_cds_counts): print(expl[3]) sys.stdout.flush() cds_exon_count_hist = list( conn.execute( f"SELECT contained_cds, count(exon_id) AS exon_count FROM ({exon_cds_counts}) GROUP BY contained_cds ORDER BY contained_cds" )) assert cds_exon_count_hist[:2] == [(0, 270532), (1, 310059)]
def test_abutment(): # Test GRI query correctness with various cases of features abutting the query range con = sqlite3.connect(":memory:") con.executescript("CREATE TABLE features(rid INTEGER, beg INTEGER, end INTEGER)") pos0 = 10000000000 for lvl in range(9): for ofs in (-1, 0, 1): for tup in ((pos0 - 16 ** lvl, pos0 + ofs), (pos0 + 123 + ofs, pos0 + 123 + 16 ** 9)): con.execute("INSERT INTO features VALUES(42,?,?)", tup) con.executescript( genomicsqlite.create_genomic_range_index_sql(con, "features", "rid", "beg", "end") ) query = genomicsqlite.genomic_range_rowids_sql( con, "features", "42", str(pos0), str(pos0 + 123) )[1:-1] control_query = f"SELECT _rowid_ FROM features NOT INDEXED WHERE rid = 42 AND NOT (end < {pos0} OR beg > {pos0+123}) ORDER BY _rowid_" assert list(con.execute(query)) == list(con.execute(control_query))
def test_gnomad_sites_small(tmp_path): dbfile = str(tmp_path / "test.gsql") vcf_into_sqlite( os.path.join(HERE, "data/gnomad.r3.0.sites.ALDH2.vcf.bgz"), str(dbfile), "--table-prefix", "gnomad_", ) con = genomicsqlite.connect(dbfile, read_only=True) query = ( "SELECT variant_rowid, id_jsarray FROM (SELECT _gri_rid AS rid FROM _gri_refseq WHERE gri_refseq_name=?1) AS query, gnomad_variants WHERE variant_rowid IN" + genomicsqlite.genomic_range_rowids_sql(con, "gnomad_variants", "query.rid")) rs671 = ("chr12", 111803912, 111804012) print(query) indexed = 0 for expl in con.execute("EXPLAIN QUERY PLAN " + query, rs671): print(expl[3]) if "USING INDEX gnomad_variants__gri" in expl[3]: indexed += 1 assert indexed == 3 opcodes = list(con.execute("EXPLAIN " + query, rs671)) accessed_cursors = list(opcode[2] for opcode in opcodes if opcode[1] == "Column") table_rootpages = set(row[0] for row in con.execute( "SELECT rootpage FROM sqlite_master WHERE type='table'")) table_cursors = set( opcode[2] for opcode in opcodes if opcode[1] == "OpenRead" and opcode[3] in table_rootpages) assert sum( 1 for cursor in accessed_cursors if cursor not in table_cursors) > 1 assert sum(1 for cursor in accessed_cursors if cursor in table_cursors) == 1 results = list(con.execute(query, rs671)) results_rowids = set(vt[0] for vt in results) assert next(vt for vt in results if vt[1] and "rs671" in vt[1]) control = "SELECT variant_rowid FROM gnomad_variants NATURAL JOIN _gri_refseq WHERE gri_refseq_name = ? AND NOT ((pos+rlen) < ? OR pos > ?)" control_rowids = set(vt[0] for vt in con.execute(control, rs671)) assert len(control_rowids) == 22 assert results_rowids == control_rowids
def test_connect(tmp_path): dbfile = str(tmp_path / "test.gsql") con = genomicsqlite.connect(dbfile, unsafe_load=True) con.executescript( genomicsqlite.put_reference_assembly_sql(con, "GRCh38_no_alt_analysis_set")) _fill_exons(con) con.commit() del con con = genomicsqlite.connect(dbfile, read_only=True) query = ( "WITH exons2 AS (SELECT * from exons) SELECT exons.id, exons2.id FROM exons2 LEFT JOIN exons ON exons._rowid_ IN\n" + genomicsqlite.genomic_range_rowids_sql(con, "exons", "exons2.rid", "exons2.beg", "exons2.end") + " AND exons.id != exons2.id ORDER BY exons.id, exons2.id") results = list(con.execute(query)) assert len(results) == 5191
def test_gnomad_sites_small(tmp_path): dbfile = str(tmp_path / "gnomad_lines.gsql") vcf_lines_into_sqlite( os.path.join(HERE, "data/gnomad.r3.0.sites.ALDH2.vcf.bgz"), str(dbfile), "--table", "gnomad_vcf_lines", ) con = genomicsqlite.connect(dbfile, read_only=True) query = ("SELECT _rowid_, line FROM gnomad_vcf_lines WHERE _rowid_ IN " + genomicsqlite.genomic_range_rowids_sql(con, "gnomad_vcf_lines")) rs671 = ("chr12", 111803912, 111804012) print(query) indexed = 0 for expl in con.execute("EXPLAIN QUERY PLAN " + query, rs671): print(expl[3]) if "USING INDEX gnomad_vcf_lines__gri" in expl[3]: indexed += 1 assert indexed == 3 opcodes = list(con.execute("EXPLAIN " + query, rs671)) accessed_cursors = list(opcode[2] for opcode in opcodes if opcode[1] == "Column") table_rootpages = set(row[0] for row in con.execute( "SELECT rootpage FROM sqlite_master WHERE type='table'")) table_cursors = set( opcode[2] for opcode in opcodes if opcode[1] == "OpenRead" and opcode[3] in table_rootpages) assert sum( 1 for cursor in accessed_cursors if cursor not in table_cursors) > 1 assert sum(1 for cursor in accessed_cursors if cursor in table_cursors) == 1 results = list(con.execute(query, rs671)) results_rowids = set(vt[0] for vt in results) assert next(vt for vt in results if vt[1] and "rs671" in vt[1]) control = "SELECT rowid FROM gnomad_vcf_lines WHERE NOT ((POS+rlen) < ? OR POS > ?)" control_rowids = set(vt[0] for vt in con.execute(control, (rs671[1], rs671[2]))) assert len(control_rowids) == 22 assert results_rowids == control_rowids
def main(): header = f"""\ -- GenomicSQLite smoke test -- timestamp: {time.asctime(time.gmtime())} platform: {platform.platform()} uname: {os.uname()} python: {platform.python_implementation()} {platform.python_version()}""" print(textwrap.dedent(header)) print("cpu", end="") try: if platform.system() == "Linux": with open("/proc/cpuinfo") as cpuinfo: modelname = (line.strip().replace("\t", "") for line in cpuinfo if line.lower().strip().startswith("model name")) print(" " + next(modelname, ": ???")) elif platform.system() == "Darwin": sysctl = subprocess.run( ["sysctl", "-n", "machdep.cpu.brand_string"], check=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) if sysctl.returncode == 0 and sysctl.stdout.strip(): print(": " + sysctl.stdout.strip()) else: print(": ???") else: print(": ???") except Exception: print(": ???") env_keys = [ k for k in os.environ if ("genomicsqlite" in k.lower()) or k in ("PATH", "PYTHONPATH", "LD_LIBRARY_PATH", "DYLD_LIBRARY_PATH") ] if env_keys: print("environment:") for k in env_keys: print(f" {k}={os.environ[k]}") print("sqlite3: ", end="") import sqlite3 conn = sqlite3.connect(":memory:") print(next(conn.execute("SELECT sqlite_version()"))[0]) print("genomicsqlite: ", end="") try: import genomicsqlite except ImportError as exn: print(f"\n\nUnable to import genomicsqlite: {exn}") print( "The Python genomicsqlite package may need to be installed via pip3 or conda," ) print("or its location may need to be added to PYTHONPATH.") sys.exit(1) print(genomicsqlite.__version__) print(f"genomicsqlite library: {genomicsqlite._DLL}") dbfile = os.path.join(os.environ.get("TMPDIR", "/tmp"), f"genomicsqlite_smoke_test.{time.monotonic()}") print(f"\ntest database: {dbfile}") try: dbconn = genomicsqlite.connect(dbfile) for (table, bed) in (("exons1", _EXONS1), ("exons2", _EXONS2)): dbconn.execute( f"CREATE TABLE {table}(chrom TEXT, pos INTEGER, end INTEGER, id TEXT PRIMARY KEY)" ) for line in bed.strip().split("\n"): line = line.split("\t") dbconn.execute( f"INSERT INTO {table}(chrom,pos,end,id) VALUES(?,?,?,?)", (line[0], int(line[1]) - 1, int(line[2]), line[3]), ) dbconn.executescript( genomicsqlite.create_genomic_range_index_sql( dbconn, table, "chrom", "pos", "end")) dbconn.commit() query = ( "SELECT exons1.id, exons2.id FROM exons1 LEFT JOIN exons2 ON exons2._rowid_ IN\n" + textwrap.indent( genomicsqlite.genomic_range_rowids_sql( dbconn, "exons2", "exons1.chrom", "exons1.pos", "exons1.end"), " ", ) + "\nORDER BY exons1.id, exons2.id") print("GRI query:\n" + textwrap.indent(query, " ")) print("GRI query plan:") gri_constraints = 0 for expl in dbconn.execute("EXPLAIN QUERY PLAN " + query): print(" " + expl[3]) if ("((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))" in expl[3]): gri_constraints += 1 results = list(dbconn.execute(query)) control = "SELECT exons1.id, exons2.id FROM exons1 LEFT JOIN exons2 NOT INDEXED ON NOT (exons2.end < exons1.pos OR exons2.pos > exons1.end) ORDER BY exons1.id, exons2.id" control_results = list(dbconn.execute(control)) assert len(control_results) == 1139 assert results == control_results if gri_constraints != 3: print("GRI query opcodes:") for expl in dbconn.execute("EXPLAIN " + query): print(" " + str(expl)) print( "\n** WARNING: GRI yielded correct results, but with a possibly suboptimal query plan." ) print( "** Please redirect this log to a file and send to maintainers @ https://github.com/mlin/GenomicSQLite\n" ) sys.exit(2) dbconn.close() print("\nGenomicSQLite smoke test OK =)\n") finally: os.remove(dbfile)
def test_query_in_sql(tmp_path): dbfile = str(tmp_path / "test.gsql") con = genomicsqlite.connect(dbfile, unsafe_load=True) _fill_exons(con) con.commit() query = "SELECT id FROM exons WHERE exons._rowid_ IN genomic_range_rowids('exons',?,?,?)" results = list(con.execute(query, ("chr17", 43044294, 43048294))) control_query = genomicsqlite.genomic_range_rowids_sql(con, "exons") control_query = "SELECT id FROM exons WHERE exons._rowid_ IN\n" + control_query control_results = list( con.execute(control_query, ("chr17", 43044294, 43048294))) assert results == control_results for expl in con.execute( "EXPLAIN QUERY PLAN SELECT _rowid_ FROM genomic_range_rowids('exons',?,?,?) ORDER BY _rowid_", ("chr17", 43044294, 43048294), ): assert "USE TEMP B-TREE FOR ORDER BY" not in expl[3] assert next((expl[3] for expl in con.execute( "EXPLAIN QUERY PLAN SELECT _rowid_ FROM genomic_range_rowids('exons',?,?,?) ORDER BY _rowid_ DESC", ("chr17", 43044294, 43048294), ) if "USE TEMP B-TREE FOR ORDER BY" in expl[3])) with pytest.raises(sqlite3.OperationalError, match="domain error"): con.execute( "SELECT * FROM genomic_range_rowids('exons', 'chr17', 43044294, 43048294, 16, 0)" ) dbfile2 = str(tmp_path / "test2.gsql") con2 = genomicsqlite.connect(dbfile2, unsafe_load=True) _fill_exons(con2) con2.commit() con2.close() con.executescript( genomicsqlite.attach_sql(con, dbfile2, "db2", immutable=True)) query = """ SELECT main.exons.id, db2.exons.id FROM main.exons LEFT JOIN db2.exons ON db2.exons._rowid_ IN genomic_range_rowids('db2.exons', main.exons.rid, main.exons.beg, main.exons.end) AND main.exons.id != db2.exons.id ORDER BY main.exons.id, db2.exons.id """ results = list(con.execute(query)) assert len(results) == 5191 with pytest.raises(sqlite3.OperationalError, match="no such table"): con.execute( "SELECT * FROM genomic_range_rowids('nonexistent', 'chr17', 43044294, 43048294)" ) con.executescript("CREATE TABLE empty(rid TEXT, beg INTEGER, end INTEGER)") with pytest.raises(sqlite3.OperationalError, match="no such index"): con.execute( "SELECT * FROM genomic_range_rowids('empty', 'chr17', 43044294, 43048294)" ) con.executescript( genomicsqlite.create_genomic_range_index_sql(con, "empty", "rid", "beg", "end")) results = list( con.execute( "SELECT * FROM genomic_range_rowids('empty', 'chr17', 43044294, 43048294)" )) assert results == []
def test_level_detection(): # test corner cases for the bit of genomic_range_rowids() which detects the level range of # extant features con = sqlite3.connect(":memory:") con.executescript( "CREATE TABLE features(rid INTEGER, beg INTEGER, end INTEGER)") con.executescript( genomicsqlite.create_genomic_range_index_sql(con, "features", "rid", "beg", "end")) def fanout(query): return sum( 1 for expl in con.execute("EXPLAIN QUERY PLAN " + query, (None, None, None)) if "((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))" in expl[3]) assert fanout( genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]) == 16 con.executescript( "INSERT INTO features VALUES(NULL, NULL, NULL); INSERT INTO features VALUES(NULL, 0, 10000000000)" ) assert fanout( genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]) == 16 assert not list( con.execute( genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1], (None, 123, 456))) con.executescript("INSERT INTO features VALUES(42, 1048568, 1048584)") query = genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1] print("\n" + query) assert "-1" in query and "-0x10)" in query assert not list( con.execute( genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1], (42, None, 1048584))) assert fanout(query) == 1 con.executescript(""" INSERT INTO features VALUES(44, 1048568, 1048584); INSERT INTO features VALUES(44, 0, 64000) """) query = genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1] print("\n" + query) assert "-4," in query and "-0x10000)" in query assert "-3," in query and "-0x1000)" in query assert "-2," in query and "-0x100)" in query assert "-1," in query and "-0x10)" in query assert fanout(query) == 4 assert fanout( genomicsqlite.genomic_range_rowids_sql(con, "features", ceiling=6)[1:-1]) == 7 assert (fanout( genomicsqlite.genomic_range_rowids_sql(con, "features", ceiling=6, floor=3)[1:-1]) == 4) con.executescript(""" INSERT INTO features VALUES(43, NULL, 10000000000); INSERT INTO features VALUES(44, 0, NULL) """) assert fanout( genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1]) == 4 con.executescript(""" INSERT INTO features VALUES(43, 0, 10000000000); INSERT INTO features VALUES(43, 32, 33) """) query = genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1] print("\n" + query) assert fanout(query) == 10 con.executescript(""" INSERT INTO features VALUES(43, 0, 10000000000); INSERT INTO features VALUES(43, 32, 32) """) query = genomicsqlite.genomic_range_rowids_sql(con, "features")[1:-1] assert fanout(query) == 10 assert len(list(con.execute(query, (43, 32, 33)))) == 4 assert len(list(con.execute(query, (43, 33, 33)))) == 3
def test_txdbquery(genomicsqlite_txdb): conn = genomicsqlite.connect(genomicsqlite_txdb, read_only=True) assert next(conn.execute("PRAGMA page_size"))[0] == 16384 # one query results = list(t[0] for t in conn.execute( "SELECT tx_name FROM transcript WHERE _rowid_ IN " + genomicsqlite.genomic_range_rowids_sql(conn, "transcript") + " ORDER BY tx_name", ("chr12", 111803912, 111804012), )) print(results) assert results == sorted([ "ENST00000416293.7", "ENST00000261733.7", "ENST00000548536.1", "ENST00000549106.1" ]) # random queries chroms = list( conn.execute( "SELECT tx_chrom, length FROM (SELECT tx_chrom, MAX(tx_end) AS length FROM transcript GROUP BY tx_chrom) WHERE length > 1000000" )) random.seed(0xBADF00D) for tbl in ("transcript", "cds"): query = genomicsqlite.genomic_range_rowids_sql(conn, tbl)[1:-1] fanout = 0 for expl in conn.execute("EXPLAIN QUERY PLAN " + query, (None, None, None)): print(expl[3]) if ("((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))" in expl[3]): fanout += 1 assert (tbl, fanout) in (("transcript", 5), ("cds", 3)) pfx = "tx" if tbl == "transcript" else tbl control_query = f"SELECT _rowid_ FROM {tbl} NOT INDEXED WHERE {pfx}_chrom = ? AND NOT ({pfx}_end < ? OR {pfx}_start > ?) ORDER BY _rowid_" total_results = 0 for _ in range(2000): chrom = random.choice(chroms) beg = random.randint(0, chrom[1] - 65536) end = beg + random.randint(1, random.choice([16, 256, 4096, 65536 ])) ids = list(row[0] for row in conn.execute(query, (chrom[0], beg, end))) control_ids = list( row[0] for row in conn.execute(control_query, (chrom[0], beg, end))) assert ids == control_ids total_results += len(control_ids) assert total_results in (7341, 2660) # join exon to cds ("which exons are coding?") exon_cds_counts = ( "SELECT exon._rowid_ AS exon_id, COUNT(cds._rowid_) AS contained_cds FROM exon LEFT JOIN cds ON cds._rowid_ IN " + genomicsqlite.genomic_range_rowids_sql(conn, "cds", "exon_chrom", "exon_start", "exon_end") + " AND (exon_start = cds_start AND exon_end >= cds_start OR exon_start <= cds_start AND exon_end = cds_end) GROUP BY exon._rowid_" ) print(exon_cds_counts) fanout = 0 for expl in conn.execute("EXPLAIN QUERY PLAN " + exon_cds_counts): print(expl[3]) if ("((_gri_rid,_gri_lvl,_gri_beg)>(?,?,?) AND (_gri_rid,_gri_lvl,_gri_beg)<(?,?,?))" in expl[3]): fanout += 1 assert fanout == 3 cds_exon_count_hist = list( conn.execute( f"SELECT contained_cds, count(exon_id) AS exon_count FROM ({exon_cds_counts}) GROUP BY contained_cds ORDER BY contained_cds" )) for elt in cds_exon_count_hist: print(elt) assert cds_exon_count_hist[:2] == [(0, 270532), (1, 310059)]