def test_genomic_unspliced(self):
    """ Monoexonic fragment (chr1:1-990, + strand) that overlaps gene 1.
        Expects assignment to gene TG1, a 'genomic_transcript' novelty
        entry, and an end delta of -10. """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"

    # Temp tables first, then the in-memory lookup structures
    talon.make_temp_novel_gene_table(cursor, genome_build)
    talon.make_temp_monoexonic_transcript_table(cursor, genome_build)
    edges = talon.make_edge_dict(cursor)
    locations = talon.make_location_dict(genome_build, cursor)
    run_info = talon.init_run_info(cursor, genome_build)
    transcripts = talon.make_transcript_dict(cursor, genome_build)
    vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
        cursor, genome_build)

    annotation = talon.identify_monoexon_transcript(
        "chr1", (1, 990), "+", cursor, locations, edges, transcripts,
        vertex_2_gene, gene_starts, gene_ends, run_info)

    expected_gene = fetch_correct_ID("TG1", "gene", cursor)
    # The novelty type is the second-to-last field of each novelty entry
    observed_types = [entry[-2] for entry in annotation['transcript_novelty']]

    assert annotation['gene_ID'] == expected_gene
    assert "genomic_transcript" in observed_types
    assert annotation['end_delta'] == -10

    database.close()
def test_no_match(self):
    """ Example with no FSM match. process_FSM is expected to return None
        for both the gene ID and the transcript ID. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    edge_dict = talon.make_edge_dict(cursor)
    location_dict = talon.make_location_dict(build, cursor)
    run_info = talon.init_run_info(cursor, build)
    transcript_dict = talon.make_transcript_dict(cursor, build)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(cursor, build)

    chrom = "chr1"
    positions = [1, 100, 500, 600]
    strand = "+"
    edge_IDs = [2]
    vertex_IDs = [2, 3, 4, 5]

    # Candidate matches are gathered with the ISM search and then handed
    # to the FSM processing step.
    all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
    gene_ID, transcript_ID, novelty, start_end_info = talon.process_FSM(
        chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
        gene_starts, gene_ends, edge_dict, location_dict, run_info)

    # Identity comparison is the correct check for None; separate asserts
    # make it clear which value was unexpectedly set on failure.
    assert gene_ID is None
    assert transcript_ID is None

    conn.close()
def test_FSM_end_diff(self):
    """ Example where the transcript is an FSM but has a difference on
        the ends large enough to be novel. The gene and transcript should
        still resolve to TG2 / TG2-001, and the end vertex should be a
        newly created one (original vertex count + 1). """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    edge_dict = talon.make_edge_dict(cursor)
    location_dict = talon.make_location_dict(build, cursor)
    run_info = talon.init_run_info(cursor, build)
    transcript_dict = talon.make_transcript_dict(cursor, build)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(cursor, build)

    # Record the vertex counter before processing so that the ID of any
    # newly created vertex can be predicted.
    orig_vertices = run_info['vertex']

    chrom = "chr2"
    positions = [1, 100, 500, 600, 900, 1301]  # Last position is > 300bp away
    strand = "+"
    edge_IDs = [13, 14, 15]
    vertex_IDs = [14, 15, 16, 17]

    all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
    gene_ID, transcript_ID, novelty, start_end_info = talon.process_FSM(
        chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
        gene_starts, gene_ends, edge_dict, location_dict, run_info)

    correct_gene_ID = fetch_correct_ID("TG2", "gene", cursor)
    correct_transcript_ID = fetch_correct_ID("TG2-001", "transcript", cursor)

    assert gene_ID == correct_gene_ID
    assert transcript_ID == correct_transcript_ID
    assert start_end_info["end_vertex"] == orig_vertices + 1

    conn.close()
def test_ISM_suffix(self):
    """ Example where the transcript is an ISM with suffix. The query
        should be assigned to gene TG1, with its path expanded to vertices
        [3, 4, 5, 6] and edges [3, 4, 5], and with non-novel start and
        end. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    edge_dict = talon.make_edge_dict(cursor)
    location_dict = talon.make_location_dict(build, cursor)
    run_info = talon.init_run_info(cursor, build)
    transcript_dict = talon.make_transcript_dict(cursor, build)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
        cursor, build)

    chrom = "chr1"
    strand = "+"
    positions = [500, 600, 900, 1000]
    edge_IDs = [4]
    vertex_IDs = [4, 5]

    all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
    gene_ID, transcript_ID, novelty, start_end_info = talon.process_ISM(
        chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
        transcript_dict, gene_starts, gene_ends, edge_dict,
        location_dict, run_info)

    correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)

    assert gene_ID == correct_gene_ID
    assert start_end_info["vertex_IDs"] == [3, 4, 5, 6]
    assert start_end_info["edge_IDs"] == [3, 4, 5]
    assert start_end_info["start_novelty"] == 0  # because the exon is known
    assert start_end_info["end_novelty"] == 0
    # The completed edge set must be present as a key in the transcript
    # dict (a missing key raises KeyError) and map to a real entry.
    assert transcript_dict[frozenset(start_end_info["edge_IDs"])] is not None

    conn.close()
def test_ISM_prefix(self):
    """ Example where the transcript is a prefix ISM. The query should be
        assigned to gene TG1, with its path expanded to vertices
        [1, 2, 3, 4] and edges [1, 2, 3]. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    edge_dict = talon.make_edge_dict(cursor)
    location_dict = talon.make_location_dict(build, cursor)
    run_info = talon.init_run_info(cursor, build)
    transcript_dict = talon.make_transcript_dict(cursor, build)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
        cursor, build)

    chrom = "chr1"
    strand = "+"
    positions = [1, 100, 500, 600]
    edge_IDs = [2]
    vertex_IDs = [2, 3]

    all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
    gene_ID, transcript_ID, novelty, start_end_info = talon.process_ISM(
        chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
        transcript_dict, gene_starts, gene_ends, edge_dict,
        location_dict, run_info)

    correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)

    assert gene_ID == correct_gene_ID
    assert start_end_info["vertex_IDs"] == [1, 2, 3, 4]
    assert start_end_info["edge_IDs"] == [1, 2, 3]

    conn.close()
def test_spliced_antisense(self):
    """ Spliced read on chr2 (- strand) whose positions match known
        vertices but which is antisense. Expects antisense gene and
        transcript novelty entries and zero start/end deltas. """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"

    talon.make_temp_novel_gene_table(cursor, genome_build)
    edges = talon.make_edge_dict(cursor)
    locations = talon.make_location_dict(genome_build, cursor)
    run_info = talon.init_run_info(cursor, genome_build)
    transcripts = talon.make_transcript_dict(cursor, genome_build)
    vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
        cursor, genome_build)

    query_positions = [1000, 900, 600, 500, 100, 1]
    annotation = talon.identify_transcript(
        "chr2", query_positions, "-", cursor, locations, edges,
        transcripts, vertex_2_gene, gene_starts, gene_ends, run_info)

    # NOTE(review): this ID is looked up but not compared against anything
    # below - confirm whether an assertion on it was intended.
    anti_gene_ID = fetch_correct_ID("TG2", "gene", cursor)

    gene_types = [entry[-2] for entry in annotation['gene_novelty']]
    transcript_types = [entry[-2]
                        for entry in annotation['transcript_novelty']]

    first_gene_entry = annotation['gene_novelty'][0]
    assert first_gene_entry[-1] == "TRUE"
    assert "antisense_gene" in gene_types
    assert "antisense_transcript" in transcript_types
    assert annotation['start_delta'] == annotation['end_delta'] == 0

    database.close()
def test_FSM_end_diff(self):
    """ Example where the transcript is an FSM but has a difference on
        the ends large enough to be novel. The gene should resolve to TG2
        and no end delta should be reported (end_delta is None). """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    talon.make_temp_novel_gene_table(cursor, build)
    edge_dict = talon.make_edge_dict(cursor)
    location_dict = talon.make_location_dict(build, cursor)
    run_info = talon.init_run_info(cursor, build)
    transcript_dict = talon.make_transcript_dict(cursor, build)
    vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
        cursor, build)

    chrom = "chr2"
    strand = "+"
    positions = [1, 100, 500, 600, 900, 1500]

    annotation = talon.identify_transcript(
        chrom, positions, strand, cursor, location_dict, edge_dict,
        transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

    correct_gene_ID = fetch_correct_ID("TG2", "gene", cursor)

    assert annotation['gene_ID'] == correct_gene_ID
    # Identity comparison is the correct check for None
    assert annotation['end_delta'] is None

    conn.close()
def test_NIC_instead_of_ISM(self):
    """ Transcript that looks like an ISM but should be called NIC on
        account of having known starts and ends. Expects gene TG5 and an
        'NIC_transcript' novelty entry. """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"

    talon.make_temp_novel_gene_table(cursor, genome_build)
    edges = talon.make_edge_dict(cursor)
    locations = talon.make_location_dict(genome_build, cursor)
    run_info = talon.init_run_info(cursor, genome_build)
    transcripts = talon.make_transcript_dict(cursor, genome_build)
    vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
        cursor, genome_build)

    query_positions = (800, 1000, 1200, 1400, 1600, 1800, 2000, 2200)
    annotation = talon.identify_transcript(
        "chr3", query_positions, "+", cursor, locations, edges,
        transcripts, vertex_2_gene, gene_starts, gene_ends, run_info)

    expected_gene = fetch_correct_ID("TG5", "gene", cursor)
    observed_types = [entry[-2] for entry in annotation['transcript_novelty']]

    assert annotation['gene_ID'] == expected_gene
    assert "NIC_transcript" in observed_types

    database.close()
def test_antisense(self):
    """ Monoexonic read on chr4 (+ strand, 1300-3900) that is antisense.
        Expects antisense gene and transcript novelty entries. """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"

    talon.make_temp_novel_gene_table(cursor, genome_build)
    talon.make_temp_monoexonic_transcript_table(cursor, genome_build)
    edges = talon.make_edge_dict(cursor)
    locations = talon.make_location_dict(genome_build, cursor)
    run_info = talon.init_run_info(cursor, genome_build)
    transcripts = talon.make_transcript_dict(cursor, genome_build)
    vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
        cursor, genome_build)

    annotation = talon.identify_monoexon_transcript(
        "chr4", (1300, 3900), "+", cursor, locations, edges, transcripts,
        vertex_2_gene, gene_starts, gene_ends, run_info)

    # NOTE(review): this ID is looked up but not compared against anything
    # below - confirm whether an assertion on it was intended.
    anti_gene_ID = fetch_correct_ID("TG6", "gene", cursor)

    gene_types = [entry[-2] for entry in annotation['gene_novelty']]
    transcript_types = [entry[-2]
                        for entry in annotation['transcript_novelty']]

    first_gene_entry = annotation['gene_novelty'][0]
    assert first_gene_entry[-1] == "TRUE"
    assert "antisense_gene" in gene_types
    assert "antisense_transcript" in transcript_types

    database.close()
def test_monoexonic_edge_case(self):
    """ Case observed during testing where the start and end accidentally
        ended up being assigned to the same vertex. With 500 bp cutoffs on
        both ends, the start (550) must match vertex 3 and the end (610)
        must match vertex 4. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    location_dict = talon.make_location_dict(build, cursor)
    run_info = talon.init_run_info(cursor, build)

    chrom = "chr1"
    strand = "+"
    pos = [550, 610]
    run_info.cutoff_5p = 500
    run_info.cutoff_3p = 500

    # For the start search, the opposite end of the exon is passed as the
    # splice position; for the end search the roles are swapped.
    start = pos[0]
    splice_pos = pos[1]
    start_match, start_diff = talon.permissive_vertex_search(
        chrom, start, strand, splice_pos, "start", location_dict, run_info)

    end = pos[1]
    splice_pos = pos[0]
    end_match, end_diff = talon.permissive_vertex_search(
        chrom, end, strand, splice_pos, "end", location_dict, run_info)

    # The searches above take the in-memory location dict rather than the
    # cursor, so the connection can be released before asserting. The
    # original test never closed it.
    conn.close()

    assert start_match == 3
    assert end_match == 4
def test_match(self):
    """ Example where the transcript is a monoexonic match (chr4,
        - strand, 3900-1100). Expects gene TG6 with a start delta of 100
        and an end delta of -100. """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"

    talon.make_temp_novel_gene_table(cursor, genome_build)
    talon.make_temp_monoexonic_transcript_table(cursor, genome_build)
    edges = talon.make_edge_dict(cursor)
    locations = talon.make_location_dict(genome_build, cursor)
    run_info = talon.init_run_info(cursor, genome_build)
    transcripts = talon.make_transcript_dict(cursor, genome_build)
    vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
        cursor, genome_build)

    annotation = talon.identify_monoexon_transcript(
        "chr4", (3900, 1100), "-", cursor, locations, edges, transcripts,
        vertex_2_gene, gene_starts, gene_ends, run_info)

    expected_gene = fetch_correct_ID("TG6", "gene", cursor)
    # NOTE(review): the transcript ID is looked up but not asserted on -
    # confirm whether a transcript_ID check was intended.
    expected_transcript = fetch_correct_ID("TG6-001", "transcript", cursor)

    assert annotation['gene_ID'] == expected_gene
    assert annotation['start_delta'] == 100
    assert annotation['end_delta'] == -100

    database.close()
def test_match_monoexonic(self):
    """ Permissive match strategy on a monoexonic transcript (chr2,
        920-970, + strand) with 500 bp cutoffs. The start should match the
        vertex at 900 (diff 20) and the end the vertex at 1000
        (diff -30). """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"
    locations = talon.make_location_dict(genome_build, cursor)
    run_info = talon.init_run_info(cursor, genome_build)

    chrom = "chr2"
    strand = "+"
    read_start, read_end = 920, 970
    run_info.cutoff_5p = 500
    run_info.cutoff_3p = 500

    # Each search receives the opposite end of the exon as its splice
    # position argument.
    start_match, start_diff = talon.permissive_vertex_search(
        chrom, read_start, strand, read_end, "start", locations, run_info)
    end_match, end_diff = talon.permissive_vertex_search(
        chrom, read_end, strand, read_start, "end", locations, run_info)

    assert start_match == fetch_correct_vertex_ID(chrom, 900, cursor)
    assert start_diff == 20
    assert end_match == fetch_correct_vertex_ID(chrom, 1000, cursor)
    assert end_diff == -30

    database.close()
def test_find_match(self):
    """ Look up the exon edge between the chr1 vertices at positions 600
        and 500 in both directions. The toy edge dict does not contain the
        edge in the first order queried (600 -> 500), but in the opposite
        order (500 -> 600) it should yield edge 3. """
    conn, cursor = get_db_cursor()

    # Create a location dict and an edge dict. Everything after this works
    # on the in-memory dicts, so the connection can be closed right away.
    build = "toy_build"
    location_dict = talon.make_location_dict(build, cursor)
    edge_dict = talon.make_edge_dict(cursor)
    conn.close()

    # Fetch the vertices for the two positions
    chrom = "chr1"
    pos1 = 600
    pos2 = 500
    v1 = talon.search_for_vertex_at_pos(chrom, pos1, location_dict)["location_ID"]
    v2 = talon.search_for_vertex_at_pos(chrom, pos2, location_dict)["location_ID"]
    assert v1 is not None
    assert v2 is not None

    # Now look for the edge between them
    edge_match = talon.search_for_edge(v1, v2, "exon", edge_dict)
    assert edge_match is None

    # Try them in the opposite order
    edge_match = talon.search_for_edge(v2, v1, "exon", edge_dict)
    assert edge_match["edge_ID"] == 3
def test_NIC_match(self):
    """ Example where the transcript is an NIC match to an existing one by
        virtue of skipping an exon. The query should be assigned to gene
        TG1 with its vertex path expanded to [1, 2, 5, 6]. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    edge_dict = talon.make_edge_dict(cursor)
    location_dict = talon.make_location_dict(build, cursor)
    run_info = talon.init_run_info(cursor, build)
    transcript_dict = talon.make_transcript_dict(cursor, build)
    vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
        cursor, build)

    chrom = "chr1"
    strand = "+"
    positions = [1, 100, 900, 1000]
    # The edge ID is one past the current edge counter
    edge_IDs = [run_info.edge + 1]
    vertex_IDs = [2, 5]

    gene_ID, transcript_ID, novelty, start_end_info = talon.process_NIC(
        chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
        gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene,
        run_info)

    correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)

    assert gene_ID == correct_gene_ID
    assert start_end_info["vertex_IDs"] == [1, 2, 5, 6]
    # The completed edge set must be present as a key in the transcript
    # dict (a missing key raises KeyError) and map to a real entry.
    assert transcript_dict[frozenset(start_end_info["edge_IDs"])] is not None

    conn.close()
def test_ISM_internal(self):
    """ Monoexonic read (chr1:500-600, + strand) that matches an internal
        exon. Expects gene TG1, an 'ISM_transcript' novelty entry, and
        zero start/end deltas. """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"

    talon.make_temp_novel_gene_table(cursor, genome_build)
    talon.make_temp_monoexonic_transcript_table(cursor, genome_build)
    edges = talon.make_edge_dict(cursor)
    locations = talon.make_location_dict(genome_build, cursor)
    run_info = talon.init_run_info(cursor, genome_build)
    transcripts = talon.make_transcript_dict(cursor, genome_build)
    vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
        cursor, genome_build)

    annotation = talon.identify_monoexon_transcript(
        "chr1", (500, 600), "+", cursor, locations, edges, transcripts,
        vertex_2_gene, gene_starts, gene_ends, run_info)

    expected_gene = fetch_correct_ID("TG1", "gene", cursor)
    observed_types = [entry[-2] for entry in annotation['transcript_novelty']]

    assert annotation['gene_ID'] == expected_gene
    assert "ISM_transcript" in observed_types
    assert annotation['start_delta'] == annotation['end_delta'] == 0

    database.close()
def test_partial_match_3prime(self):
    """ Example where the transcript is short, so it overlaps the
        annotated transcript but is not an accepted match. The read
        (chr4, - strand, 2000-1100) should be assigned to gene TG6; its
        end is expected to be matched with a delta of -100, while no
        start delta is reported (start_delta is None). """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    talon.make_temp_novel_gene_table(cursor, build)
    talon.make_temp_monoexonic_transcript_table(cursor, build)
    edge_dict = talon.make_edge_dict(cursor)
    location_dict = talon.make_location_dict(build, cursor)
    run_info = talon.init_run_info(cursor, build)
    transcript_dict = talon.make_transcript_dict(cursor, build)
    vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(cursor, build)

    chrom = "chr4"
    strand = "-"
    positions = (2000, 1100)

    annotation = talon.identify_monoexon_transcript(
        chrom, positions, strand, cursor, location_dict, edge_dict,
        transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

    correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor)

    assert annotation['gene_ID'] == correct_gene_ID
    # Identity comparison is the correct check for None
    assert annotation['start_delta'] is None
    assert annotation['end_delta'] == -100

    conn.close()
def test_NNC(self):
    """ Transcript that skips an exon and has a novel splice donor.
        Expects gene TG1, an 'NNC_transcript' novelty entry, and zero
        start/end deltas. """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"

    talon.make_temp_novel_gene_table(cursor, genome_build)
    edges = talon.make_edge_dict(cursor)
    locations = talon.make_location_dict(genome_build, cursor)
    run_info = talon.init_run_info(cursor, genome_build)
    transcripts = talon.make_transcript_dict(cursor, genome_build)
    vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
        cursor, genome_build)

    query_positions = [1, 50, 900, 1000]
    annotation = talon.identify_transcript(
        "chr1", query_positions, "+", cursor, locations, edges,
        transcripts, vertex_2_gene, gene_starts, gene_ends, run_info)

    expected_gene = fetch_correct_ID("TG1", "gene", cursor)
    observed_types = [entry[-2] for entry in annotation['transcript_novelty']]

    assert annotation['gene_ID'] == expected_gene
    assert "NNC_transcript" in observed_types
    assert annotation['start_delta'] == annotation['end_delta'] == 0

    database.close()
def test_FSM_perfect(self):
    """ Transcript that is a perfect full splice match. Expects gene TG1,
        transcript TG1-001, and an empty transcript novelty list. """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"

    talon.make_temp_novel_gene_table(cursor, genome_build)
    edges = talon.make_edge_dict(cursor)
    locations = talon.make_location_dict(genome_build, cursor)
    run_info = talon.init_run_info(cursor, genome_build)
    transcripts = talon.make_transcript_dict(cursor, genome_build)
    vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
        cursor, genome_build)

    query_positions = [1, 100, 500, 600, 900, 1000]
    annotation = talon.identify_transcript(
        "chr1", query_positions, "+", cursor, locations, edges,
        transcripts, vertex_2_gene, gene_starts, gene_ends, run_info)

    expected_gene = fetch_correct_ID("TG1", "gene", cursor)
    expected_transcript = fetch_correct_ID("TG1-001", "transcript", cursor)

    assert annotation['gene_ID'] == expected_gene
    assert annotation['transcript_ID'] == expected_transcript
    assert annotation['transcript_novelty'] == []

    database.close()
def test_antisense(self):
    """ Example where the vertices are known but there is no same-strand
        match. The last field of the final gene novelty entry should be
        the ID of gene TG1, and the vertex path should be expanded to
        [6, 5, 2, 1]. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    edge_dict = talon.make_edge_dict(cursor)
    locations = talon.make_location_dict(build, cursor)
    run_info = talon.init_run_info(cursor, build)
    transcript_dict = talon.make_transcript_dict(cursor, build)
    vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
        cursor, build)

    # Construct temp novel gene db
    talon.make_temp_novel_gene_table(cursor, build)

    chrom = "chr1"
    strand = "-"
    positions = [1000, 900, 100, 1]
    # The edge ID is one past the current edge counter
    edge_IDs = [run_info.edge + 1]
    vertex_IDs = [5, 2]

    # Find antisense match
    gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \
        talon.process_spliced_antisense(
            chrom, positions, strand, edge_IDs, vertex_IDs,
            transcript_dict, gene_starts, gene_ends, edge_dict,
            locations, vertex_2_gene, run_info, cursor)

    correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
    # The antisense gene ID is read from the last field of the final gene
    # novelty entry.
    anti_gene_ID = gene_novelty[-1][-1]

    assert anti_gene_ID == correct_gene_ID
    assert start_end_info["vertex_IDs"] == [6, 5, 2, 1]

    conn.close()
def test_find_no_match(self):
    """ Example where the toy transcript database contains no matches for
        the position (chr1, position 0). """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    location_dict = talon.make_location_dict(build, cursor)

    chrom = "chr1"
    pos = 0
    vertex_match = talon.search_for_vertex_at_pos(chrom, pos, location_dict)
    conn.close()

    # Make sure that no match got returned (identity check against None)
    assert vertex_match is None
def test_single_match(self):
    """ Interval (chr1:0-1500, + strand) that overlaps exactly one gene.
        Expects gene TG1 matched on the query strand. """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"
    talon.make_temp_novel_gene_table(cursor, "toy_build")
    location_dict = talon.make_location_dict(genome_build, cursor)
    run_info = talon.init_run_info(cursor, genome_build)

    query_strand = "+"
    interval_start, interval_end = 0, 1500

    gene_ID, match_strand = talon.search_for_overlap_with_gene(
        "chr1", interval_start, interval_end, query_strand, cursor, run_info)

    expected_gene = fetch_correct_ID("TG1", "gene", cursor)
    assert gene_ID == expected_gene
    assert match_strand == query_strand

    database.close()
def test_find_exactly_one_match(self):
    """ Example where the toy transcript database contains exactly one
        match for the position (chr1, position 1). """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"
    locations = talon.make_location_dict(genome_build, cursor)

    vertex = talon.search_for_vertex_at_pos("chr1", 1, locations)
    database.close()
    print(vertex)

    # Check the match and confirm its attributes are accessible by name
    expected_fields = (("genome_build", "toy_build"),
                       ("chromosome", "chr1"),
                       ("position", 1))
    for field, expected_value in expected_fields:
        assert vertex[field] == expected_value
def test_all_known_locations(self):
    """ Example where the toy transcript database contains matches for
        all vertices. The splice vertices should come back as
        [2, 3, 4, 5] and the vertex counter must be left unchanged. """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"
    locations = talon.make_location_dict(genome_build, cursor)
    run_info = talon.init_run_info(cursor, genome_build)
    vertices_before = run_info['vertex']
    database.close()

    query_positions = [1, 100, 500, 600, 900, 1000]
    vertex_IDs, novelty = talon.match_splice_vertices(
        "chr1", query_positions, "+", locations, run_info)

    assert vertex_IDs == [2, 3, 4, 5]
    assert run_info['vertex'] == vertices_before
def test_overlap_but_no_vertex_match(self):
    """ Example where the transcript is short, so it overlaps the
        annotated transcript but is not an accepted match. The read
        (chr4, - strand, 2500-2000) should be assigned to gene TG6 with
        neither a start nor an end delta reported, and it should be
        recorded in the data structures: two more entries in
        vertex_2_gene and one more row in temp_monoexon. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    talon.make_temp_novel_gene_table(cursor, build)
    talon.make_temp_monoexonic_transcript_table(cursor, build)
    edge_dict = talon.make_edge_dict(cursor)
    location_dict = talon.make_location_dict(build, cursor)
    run_info = talon.init_run_info(cursor, build)
    transcript_dict = talon.make_transcript_dict(cursor, build)
    vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = talon.make_gene_start_and_end_dict(cursor, build)

    # Baseline counts taken before the transcript is identified
    tot_vertices = len(vertex_2_gene)
    query = """ SELECT COUNT(*) FROM temp_monoexon """
    tot_monoexonic = cursor.execute(query).fetchone()[0]

    chrom = "chr4"
    strand = "-"
    positions = (2500, 2000)

    annotation = talon.identify_monoexon_transcript(
        chrom, positions, strand, cursor, location_dict, edge_dict,
        transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

    correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor)
    print(annotation['start_vertex'])
    print(annotation['end_vertex'])

    assert annotation['gene_ID'] == correct_gene_ID
    # Identity comparison is the correct check for None
    assert annotation['start_delta'] is None
    assert annotation['end_delta'] is None

    # Now check if the transcript got added to the right data structures
    assert len(vertex_2_gene) == tot_vertices + 2
    assert cursor.execute(query).fetchone()[0] == tot_monoexonic + 1

    conn.close()
def test_2_genes_same_strand(self):
    """ Query (chr1:800-5050, + strand) that overlaps two genes; the one
        with more overlap must be chosen. Expects gene TG1 on the
        + strand. """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"
    talon.make_temp_novel_gene_table(cursor, "toy_build")
    location_dict = talon.make_location_dict(genome_build, cursor)
    run_info = talon.init_run_info(cursor, genome_build)

    interval_start, interval_end = 800, 5050

    gene_ID, match_strand = talon.search_for_overlap_with_gene(
        "chr1", interval_start, interval_end, "+", cursor, run_info)

    expected_gene = fetch_correct_ID("TG1", "gene", cursor)
    assert gene_ID == expected_gene
    assert match_strand == "+"

    database.close()
def test_same_strand_match_with_two_genes(self):
    """ Interval (chr1, 1500 to 910, - strand) that overlaps two genes,
        one of which is on the same strand. Expects gene TG3 matched on
        the query strand. """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"
    talon.make_temp_novel_gene_table(cursor, "toy_build")
    location_dict = talon.make_location_dict(genome_build, cursor)
    run_info = talon.init_run_info(cursor, genome_build)

    query_strand = "-"
    interval_start, interval_end = 1500, 910

    gene_ID, match_strand = talon.search_for_overlap_with_gene(
        "chr1", interval_start, interval_end, query_strand, cursor, run_info)

    expected_gene = fetch_correct_ID("TG3", "gene", cursor)
    assert gene_ID == expected_gene
    assert match_strand == query_strand

    database.close()
def test_same_strand_match_left_overlap(self):
    """ Same-strand overlap where the query start lies to the left of the
        gene and the query end falls before the end of the gene
        (chr1, 550 to 1700, - strand). Expects gene TG3 matched on the
        query strand. """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"
    talon.make_temp_novel_gene_table(cursor, "toy_build")
    location_dict = talon.make_location_dict(genome_build, cursor)
    run_info = talon.init_run_info(cursor, genome_build)

    query_strand = "-"
    interval_start, interval_end = 550, 1700

    gene_ID, match_strand = talon.search_for_overlap_with_gene(
        "chr1", interval_start, interval_end, query_strand, cursor, run_info)

    expected_gene = fetch_correct_ID("TG3", "gene", cursor)
    assert gene_ID == expected_gene
    assert match_strand == query_strand

    database.close()
def test_antisense_match(self):
    """ Interval (chr1:1400-2100, + strand) that overlaps one gene in the
        antisense direction. Expects gene TG3 with the match reported on
        the - strand. """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"
    talon.make_temp_novel_gene_table(cursor, "toy_build")
    location_dict = talon.make_location_dict(genome_build, cursor)
    run_info = talon.init_run_info(cursor, genome_build)

    interval_start, interval_end = 1400, 2100

    gene_ID, match_strand = talon.search_for_overlap_with_gene(
        "chr1", interval_start, interval_end, "+", cursor, run_info)

    expected_gene = fetch_correct_ID("TG3", "gene", cursor)
    assert gene_ID == expected_gene
    assert match_strand == "-"

    database.close()
def test_edgecase_single_base_exon(self):
    """ Example where the first exon is only one basepair long
        (chr1, positions 1-1). The start should match the vertex at
        position 1 with a difference of 0. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    location_dict = talon.make_location_dict(build, cursor)
    run_info = talon.init_run_info(cursor, build)

    chrom = "chr1"
    strand = "+"
    pos = [1, 1, 500, 600]
    start = pos[0]
    # The splice position passed here is the start of the second exon
    # (pos[2]), not the end of the single-base first exon.
    splice_pos = pos[2]

    vertex_match, diff = talon.permissive_vertex_search(
        chrom, start, strand, splice_pos, "start", location_dict, run_info)

    assert vertex_match == fetch_correct_vertex_ID(chrom, 1, cursor)
    assert diff == 0

    conn.close()
def test_location_update(self):
    """ Create one new vertex (chr4, position 2000), write locations to
        the database in a batch, and check that the location table then
        holds exactly one more row, including the new ID. """
    database, cursor = get_db_cursor()
    genome_build = "toy_build"
    locations = talon.make_location_dict(genome_build, cursor)
    run_info = talon.init_run_info(cursor, genome_build)
    vertices_before = run_info.vertex

    talon.create_vertex("chr4", 2000, run_info, locations)
    talon.batch_add_locations(cursor, locations, 10)

    # Test if the table has the correct number of locations now
    cursor.execute("SELECT * FROM location")
    stored_IDs = []
    for row in cursor.fetchall():
        stored_IDs.append(row['location_ID'])

    expected_new_ID = vertices_before + 1
    assert expected_new_ID in stored_IDs
    assert len(stored_IDs) == expected_new_ID

    database.close()