def test_match(self):
    """ Example where the transcript is a monoexonic match. """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        talon.make_temp_monoexonic_transcript_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr4"
        strand = "-"
        positions = (3900, 1100)

        annotation = talon.identify_monoexon_transcript(
            chrom, positions, strand, cursor, location_dict, edge_dict,
            transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor)
        # NOTE(review): the transcript ID (TG6-001) is not asserted here --
        # confirm whether it should be.
        assert annotation['gene_ID'] == correct_gene_ID
        assert annotation['start_delta'] == 100
        assert annotation['end_delta'] == -100
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_NIC_instead_of_ISM(self):
    """ Test case where the transcript looks like an ISM, but is NIC on
        account of having known starts and ends """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr3"
        strand = "+"
        positions = (800, 1000, 1200, 1400, 1600, 1800, 2000, 2200)

        annotation = talon.identify_transcript(
            chrom, positions, strand, cursor, location_dict, edge_dict,
            transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG5", "gene", cursor)
        # The novelty label is the second-to-last field of each record.
        novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_ID'] == correct_gene_ID
        assert "NIC_transcript" in novelty_types
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_NNC(self):
    """ Example where the transcript skips an exon and has a novel splice
        donor """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        strand = "+"
        positions = [1, 50, 900, 1000]

        annotation = talon.identify_transcript(
            chrom, positions, strand, cursor, location_dict, edge_dict,
            transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        # The novelty label is the second-to-last field of each record.
        novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_ID'] == correct_gene_ID
        assert "NNC_transcript" in novelty_types
        assert annotation['start_delta'] == annotation['end_delta'] == 0
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_spliced_antisense(self):
    """ Example where the transcript matches known vertices but is
        antisense """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr2"
        strand = "-"
        positions = [1000, 900, 600, 500, 100, 1]

        annotation = talon.identify_transcript(
            chrom, positions, strand, cursor, location_dict, edge_dict,
            transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

        # The novelty label is the second-to-last field of each record.
        gene_novelty_types = [x[-2] for x in annotation['gene_novelty']]
        t_novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_novelty'][0][-1] == "TRUE"
        assert "antisense_gene" in gene_novelty_types
        assert "antisense_transcript" in t_novelty_types
        assert annotation['start_delta'] == annotation['end_delta'] == 0
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_FSM_perfect(self):
    """ Example where the transcript is a perfect full splice match. """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        strand = "+"
        positions = [1, 100, 500, 600, 900, 1000]

        annotation = talon.identify_transcript(
            chrom, positions, strand, cursor, location_dict, edge_dict,
            transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        correct_transcript_ID = fetch_correct_ID(
            "TG1-001", "transcript", cursor)
        assert annotation['gene_ID'] == correct_gene_ID
        assert annotation['transcript_ID'] == correct_transcript_ID
        # A perfect match must not report any novelty records.
        assert annotation['transcript_novelty'] == []
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_ISM_internal(self):
    """ Example where the transcript matches an internal exon """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        talon.make_temp_monoexonic_transcript_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        strand = "+"
        positions = (500, 600)

        annotation = talon.identify_monoexon_transcript(
            chrom, positions, strand, cursor, location_dict, edge_dict,
            transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        # The novelty label is the second-to-last field of each record.
        novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_ID'] == correct_gene_ID
        assert "ISM_transcript" in novelty_types
        assert annotation['start_delta'] == annotation['end_delta'] == 0
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_partial_match_3prime(self):
    """ Example where the transcript is short, so it overlaps the
        annotated transcript but is not an accepted match. The end should
        get assigned to the annotated end (end_delta of -100), but the
        start is novel (start_delta is None). """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        talon.make_temp_monoexonic_transcript_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr4"
        strand = "-"
        positions = (2000, 1100)

        annotation = talon.identify_monoexon_transcript(
            chrom, positions, strand, cursor, location_dict, edge_dict,
            transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor)
        assert annotation['gene_ID'] == correct_gene_ID
        assert annotation['start_delta'] is None
        assert annotation['end_delta'] == -100
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_antisense(self):
    """ Example where the transcript is antisense """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        talon.make_temp_monoexonic_transcript_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr4"
        strand = "+"
        positions = (1300, 3900)

        annotation = talon.identify_monoexon_transcript(
            chrom, positions, strand, cursor, location_dict, edge_dict,
            transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

        # The novelty label is the second-to-last field of each record.
        gene_novelty_types = [x[-2] for x in annotation['gene_novelty']]
        t_novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_novelty'][0][-1] == "TRUE"
        assert "antisense_gene" in gene_novelty_types
        assert "antisense_transcript" in t_novelty_types
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_NIC_match(self):
    """ Example where the transcript is an NIC match to an existing one by
        virtue of skipping an exon. """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        positions = [1, 100, 900, 1000]
        # One past the current edge counter, i.e. an edge ID not yet in use.
        edge_IDs = [run_info.edge + 1]
        vertex_IDs = [2, 5]
        strand = "+"

        gene_ID, transcript_ID, novelty, start_end_info = talon.process_NIC(
            chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
            gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene,
            run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        assert gene_ID == correct_gene_ID
        assert start_end_info["vertex_IDs"] == [1, 2, 5, 6]
        # The transcript must be registered under its full edge set.
        edge_key = frozenset(start_end_info["edge_IDs"])
        assert transcript_dict[edge_key] is not None
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_FSM_end_diff(self):
    """ Example where the transcript is an FSM but has a difference on the
        ends large enough to be novel. """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr2"
        strand = "+"
        positions = [1, 100, 500, 600, 900, 1500]

        annotation = talon.identify_transcript(
            chrom, positions, strand, cursor, location_dict, edge_dict,
            transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG2", "gene", cursor)
        assert annotation['gene_ID'] == correct_gene_ID
        assert annotation['end_delta'] is None
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_genomic_unspliced(self):
    """ Monoexonic fragment that overlaps gene 1 """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        talon.make_temp_monoexonic_transcript_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        strand = "+"
        positions = (1, 990)

        annotation = talon.identify_monoexon_transcript(
            chrom, positions, strand, cursor, location_dict, edge_dict,
            transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        # The novelty label is the second-to-last field of each record.
        novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_ID'] == correct_gene_ID
        assert "genomic_transcript" in novelty_types
        assert annotation['end_delta'] == -10
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_antisense(self):
    """ Example where the vertices are known but there is no same-strand
        match """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        edge_dict = talon.make_edge_dict(cursor)
        locations = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        # Construct temp novel gene db
        talon.make_temp_novel_gene_table(cursor, "toy_build")

        chrom = "chr1"
        # One past the current edge counter, i.e. an edge ID not yet in use.
        edge_IDs = [run_info.edge + 1]
        positions = [1000, 900, 100, 1]
        vertex_IDs = [5, 2]
        strand = "-"

        # Find antisense match
        (gene_ID, transcript_ID, gene_novelty, transcript_novelty,
         start_end_info) = talon.process_spliced_antisense(
            chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
            gene_starts, gene_ends, edge_dict, locations, vertex_2_gene,
            run_info, cursor)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        # The last field of the last gene novelty record is compared
        # against the ID of the gene on the opposite strand.
        anti_gene_ID = gene_novelty[-1][-1]
        assert anti_gene_ID == correct_gene_ID
        assert start_end_info["vertex_IDs"] == [6, 5, 2, 1]
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_no_match(self):
    """ Example where no match exists """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        # Result is not used below; the call is kept in case it performs
        # setup that the lookup relies on -- TODO confirm.
        run_info = talon.init_run_info(cursor, build)
        vertex2gene = talon.make_vertex_2_gene_dict(cursor)

        vertex_IDs = (1000, 2000, 3000, 4000)
        strand = "+"

        gene_ID = talon.find_gene_match_on_vertex_basis(
            vertex_IDs, strand, vertex2gene)

        assert gene_ID is None
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_NNC_type_match(self):
    """ Example where some vertices match a gene, while others don't. """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        # Result is not used below; the call is kept in case it performs
        # setup that the lookup relies on -- TODO confirm.
        run_info = talon.init_run_info(cursor, build)
        vertex2gene = talon.make_vertex_2_gene_dict(cursor)

        vertex_IDs = (1, 200, 3, 4, 5, 6)
        strand = "+"

        gene_ID = talon.find_gene_match_on_vertex_basis(
            vertex_IDs, strand, vertex2gene)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        assert gene_ID == correct_gene_ID
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_overlap_but_no_vertex_match(self):
    """ Example where the transcript is short, so it overlaps the
        annotated transcript but is not an accepted match. Neither end is
        expected to match (both deltas are None), and the transcript
        should be recorded: two more entries in vertex_2_gene and one more
        row in temp_monoexon. """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        talon.make_temp_monoexonic_transcript_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        # Baseline sizes, captured before the transcript is processed.
        tot_vertices = len(vertex_2_gene)
        query = """ SELECT COUNT(*) FROM temp_monoexon """
        tot_monoexonic = cursor.execute(query).fetchone()[0]

        chrom = "chr4"
        strand = "-"
        positions = (2500, 2000)

        annotation = talon.identify_monoexon_transcript(
            chrom, positions, strand, cursor, location_dict, edge_dict,
            transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor)
        assert annotation['gene_ID'] == correct_gene_ID
        assert annotation['start_delta'] is None
        assert annotation['end_delta'] is None

        # Now check if the transcript got added to the right data structures
        assert len(vertex_2_gene) == tot_vertices + 2
        assert cursor.execute(query).fetchone()[0] == tot_monoexonic + 1
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_genomic(self):
    """ Example where the transcript overlaps a gene but contains no known
        splice vertices """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        # Construct temp novel gene db
        talon.make_temp_novel_gene_table(cursor, "toy_build")

        chrom = "chr1"
        positions = [1000, 950, 700, 600]
        # IDs just past the current counters, i.e. not yet in use.
        edge_IDs = [run_info.edge + 1, run_info.edge + 2]
        vertex_IDs = [run_info.vertex + 1, run_info.vertex + 2]
        strand = "-"

        (gene_ID, transcript_ID, gene_novelty, transcript_novelty,
         start_end_info) = talon.process_remaining_mult_cases(
            chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
            gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene,
            run_info, cursor)

        correct_gene_ID = fetch_correct_ID("TG3", "gene", cursor)
        assert gene_ID == correct_gene_ID
        # The transcript must be registered under its full edge set.
        edge_key = frozenset(start_end_info["edge_IDs"])
        assert transcript_dict[edge_key] is not None
        assert gene_novelty == []
        assert transcript_novelty[-1][-2] == "genomic_transcript"
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_intergenic(self):
    """ Example where the transcript (on chrX) should be assigned to a
        newly created gene and labelled "intergenic_novel". """
    conn, cursor = get_db_cursor()
    try:
        build = "toy_build"
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)
        # Expected ID is the next gene number; read it before the
        # transcript is processed.
        correct_gene_ID = run_info.genes + 1

        # Construct temp novel gene db
        talon.make_temp_novel_gene_table(cursor, "toy_build")

        chrom = "chrX"
        positions = [1, 100, 900, 1000]
        # IDs just past the current counters, i.e. not yet in use.
        edge_IDs = [run_info.edge + 1, run_info.edge + 2]
        vertex_IDs = [run_info.vertex + 1, run_info.vertex + 2]
        strand = "+"

        (gene_ID, transcript_ID, gene_novelty, transcript_novelty,
         start_end_info) = talon.process_remaining_mult_cases(
            chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
            gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene,
            run_info, cursor)

        assert gene_ID == correct_gene_ID
        # The transcript must be registered under its full edge set.
        edge_key = frozenset(start_end_info["edge_IDs"])
        assert transcript_dict[edge_key] is not None
        assert gene_novelty[0][-2] == "intergenic_novel"
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_NIC_with_all_known_edges(self):
    """ Test case derived from a real mouse Map2k4 read. All of the edges
        are known (except 3'), yet the read is NIC not FSM/ISM """
    conn = sqlite3.connect("scratch/Map2k4.db")
    try:
        # Row factory allows columns to be accessed by name.
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        build = "mm10"
        talon.make_temp_novel_gene_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr11"
        strand = "-"
        positions = [
            65788254, 65788136, 65775765, 65775733, 65756371, 65756269,
            65735366, 65735192, 65719603, 65719484, 65712297, 65712178,
            65709983, 65709932, 65707111, 65706984, 65696365, 65696288,
            65693570, 65693422, 65691773, 65691728, 65690804, 65689322,
        ]

        annotation = talon.identify_transcript(
            chrom, positions, strand, cursor, location_dict, edge_dict,
            transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

        assert annotation['gene_ID'] == 1
        assert annotation['transcript_ID'] == 8
        # The novelty label is the second-to-last field of each record.
        novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert "NIC_transcript" in novelty_types
    finally:
        # Release the connection even when an assertion above fails.
        conn.close()
def test_vertex2gene_update(self):
    """ Update vertex to gene relationships in memory, write them to the
        database in a batch, and query the vertex table to check them. """
    conn, cursor = get_db_cursor()
    try:
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)

        # Pretend that vertex 1 and 2 can now belong to gene 2 as well as 1
        talon.update_vertex_2_gene(2, (1, 2), "-", vertex_2_gene)

        # Add redundant assignments
        talon.update_vertex_2_gene(1, (1, 2, 3, 4, 5, 6), "+", vertex_2_gene)

        batch_size = 100
        talon.batch_add_vertex2gene(cursor, vertex_2_gene, batch_size)

        # Use queries to check if the insert worked as expected. The
        # queries have no ORDER BY, so row order is unspecified; sort the
        # results before comparing.
        query = "SELECT * FROM vertex WHERE vertex_ID = '1'"
        cursor.execute(query)
        gene_IDs = sorted(x['gene_ID'] for x in cursor.fetchall())
        assert gene_IDs == [1, 2]

        query = "SELECT * FROM vertex WHERE gene_ID = '1'"
        cursor.execute(query)
        vertex_IDs = sorted(x['vertex_ID'] for x in cursor.fetchall())
        assert vertex_IDs == [1, 2, 3, 4, 5, 6]
    finally:
        # The original never closed this connection.
        conn.close()