def test_NIC_instead_of_ISM(self):
    """ Test case where the transcript looks like an ISM, but is NIC on
        account of having known starts and ends """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    init_refs.make_temp_novel_gene_table(cursor, build)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
        cursor, build)

    chrom = "chr3"
    strand = "+"
    positions = (800, 1000, 1200, 1400, 1600, 1800, 2000, 2200)

    # NOTE(review): was "tmp_gene", but the temp table created above and used
    # by every sibling test is named "temp_gene" — fixed for consistency.
    annotation = talon.identify_transcript(chrom, positions, strand, cursor,
                                           location_dict, edge_dict,
                                           transcript_dict, vertex_2_gene,
                                           gene_starts, gene_ends, run_info,
                                           "temp_gene")

    correct_gene_ID = fetch_correct_ID("TG5", "gene", cursor)
    novelty_types = [x[-2] for x in annotation['transcript_novelty']]
    assert annotation['gene_ID'] == correct_gene_ID
    assert "NIC_transcript" in novelty_types
    conn.close()
def test_ISM_prefix(self):
    """ Example where the transcript is a prefix ISM with a novel start """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    run_info = talon.init_run_info(database, build)
    talon.get_counters(database)

    # Reference data structures built from the toy annotation
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
        cursor, build)

    # Read covers only the middle exon of TG1 plus a novel start
    chrom = "chr1"
    strand = "+"
    positions = [1, 100, 500, 600]
    edge_IDs = [2]
    vertex_IDs = [2, 3]
    v_novelty = [0, 0]

    all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
    gene_ID, transcript_ID, novelty, start_end_info = talon.process_ISM(
        chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
        transcript_dict, gene_starts, gene_ends, edge_dict, location_dict,
        run_info)

    correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
    assert gene_ID == correct_gene_ID
    assert start_end_info["vertex_IDs"] == [1, 2, 3, 4]
    assert start_end_info["edge_IDs"] == [1, 2, 3]
    conn.close()
def test_antisense(self):
    """ Example where the transcript is antisense """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    init_refs.make_temp_novel_gene_table(cursor, build)
    init_refs.make_temp_monoexonic_transcript_table(cursor, build)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
        cursor, build)

    # Monoexonic read on the strand opposite to TG6
    chrom = "chr4"
    strand = "+"
    positions = (1300, 3900)

    annotation = talon.identify_monoexon_transcript(
        chrom, positions, strand, cursor, location_dict, edge_dict,
        transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info,
        'temp_gene', 'temp_monoexon')

    anti_gene_ID = fetch_correct_ID("TG6", "gene", cursor)
    gene_novelty_types = [x[-2] for x in annotation['gene_novelty']]
    t_novelty_types = [x[-2] for x in annotation['transcript_novelty']]
    assert annotation['gene_novelty'][0][-1] == "TRUE"
    assert "antisense_gene" in gene_novelty_types
    assert "antisense_transcript" in t_novelty_types
    conn.close()
def test_match(self):
    """ Example where the transcript is a monoexonic match. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    init_refs.make_temp_novel_gene_table(cursor, build)
    init_refs.make_temp_monoexonic_transcript_table(cursor, build)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
    gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start")
    gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

    chrom = "chr4"
    strand = "-"
    positions = (3900, 1100)

    annotation = talon.identify_monoexon_transcript(
        chrom, positions, strand, cursor, location_dict, edge_dict,
        transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info,
        'temp_gene', 'temp_monoexon')

    correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor)
    correct_transcript_ID = fetch_correct_ID("TG6-001", "transcript", cursor)
    assert annotation['gene_ID'] == correct_gene_ID
    # BUG FIX: correct_transcript_ID was fetched but never asserted — a
    # monoexonic *match* should also be assigned to the known transcript.
    assert annotation['transcript_ID'] == correct_transcript_ID
    assert annotation['start_delta'] == 100
    assert annotation['end_delta'] == -100
    conn.close()
def test_no_match(self):
    """ Example with no ISM match """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    run_info = talon.init_run_info(database, build)
    talon.get_counters(database)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
        cursor, build)

    chrom = "chr1"
    strand = "+"
    positions = [1, 100, 900, 1000]
    edge_IDs = [200]  # Edge ID that does not exist in any known transcript
    vertex_IDs = [2, 5]
    v_novelty = [0, 0]

    all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
    # Identity check (is None), not equality — PEP 8 E711
    assert all_matches is None
    conn.close()
def test_gene_update(self):
    """ Try to add novel gene entries to database while ignoring duplicates
    """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    run_info = talon.init_run_info(database, build)
    talon.get_counters(database)
    init_refs.make_temp_novel_gene_table(cursor, build)

    talon.create_gene("chr4", 1, 1000, "+", cursor, "temp_gene")

    # Write the temp-table contents to file
    # (os.makedirs replaces the shell-dependent `os.system("mkdir -p ...")`)
    os.makedirs("scratch/db_updates/", exist_ok=True)
    with open("scratch/db_updates/genes.tsv", 'w') as f:
        cursor.execute("SELECT gene_ID, strand FROM temp_gene")
        for entry in cursor.fetchall():
            f.write("\t".join([str(x) for x in entry]) + "\n")

    talon.batch_add_genes(cursor, "scratch/db_updates/genes.tsv", 10)

    # Test if gene with ID 6 is there, but make sure we didn't add
    # duplicates of the other genes
    query = "SELECT * FROM genes"
    gene_IDs = [x['gene_ID'] for x in cursor.execute(query)]
    assert 7 in gene_IDs
    assert len(gene_IDs) == 7
    conn.close()
def test_with_novel_location(self):
    """ Example where the toy transcript database contains a novel position.
    """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)

    # Snapshot counters so we can verify exactly one vertex was added
    orig_vertex_count = talon.vertex_counter.value()
    orig_n_locations = len(location_dict["chr1"])
    conn.close()

    chrom = "chr1"
    strand = "+"
    pos = [1, 150, 500, 600, 900, 1000]  # 150 is not a known position
    vertex_IDs, novelty = talon.match_splice_vertices(chrom, pos, strand,
                                                      location_dict, run_info)

    # The novel position should get a freshly-minted vertex ID
    new_vertex_count = talon.vertex_counter.value()
    assert vertex_IDs == [new_vertex_count, 3, 4, 5]

    # Make sure the data structures got updated
    assert new_vertex_count == orig_vertex_count + 1
    assert len(location_dict["chr1"]) == orig_n_locations + 1
def test_NNC_match(self):
    """ Example where the transcript is an NNC match to an existing one by
        virtue of a new splice donor. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
    gene_starts = init_refs.make_gene_start_or_end_dict(
        cursor, build, "start")
    gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

    chrom = "chr1"
    positions = [1, 110, 900, 1000]
    edge_IDs = [talon.edge_counter.value() + 1]
    vertex_IDs = [talon.vertex_counter.value() + 1, 5]
    strand = "+"
    v_novelty = [0, 0]

    gene_ID, transcript_ID, transcript_novelty, start_end_info = talon.process_NNC(
        chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
        gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene,
        run_info)

    correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
    assert gene_ID == correct_gene_ID
    assert start_end_info["vertex_IDs"] == [1] + vertex_IDs + [6]
    # Identity check (is not None), not != None — PEP 8 E711
    assert transcript_dict[frozenset(start_end_info["edge_IDs"])] is not None
    conn.close()
def test_no_match(self):
    """ Example with no FSM match """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    db = "scratch/toy.db"
    talon.get_counters(db)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(db, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    gene_starts = init_refs.make_gene_start_or_end_dict(
        cursor, build, "start")
    gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

    chrom = "chr1"
    positions = [1, 100, 500, 600]
    strand = "+"
    edge_IDs = [2]
    vertex_IDs = [2, 3, 4, 5]
    v_novelty = [0, 0, 0, 0]

    all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
    gene_ID, transcript_ID, novelty, start_end_info = talon.process_FSM(
        chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
        gene_starts, gene_ends, edge_dict, location_dict, run_info)

    # Identity checks (is None), not chained == None — PEP 8 E711
    assert gene_ID is None and transcript_ID is None
    conn.close()
def test_FSM_perfect(self):
    """ Example where the transcript is a perfect full splice match. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    init_refs.make_temp_novel_gene_table(cursor, build)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
        cursor, build)

    # Read reproduces TG1-001 exactly
    chrom = "chr1"
    strand = "+"
    positions = [1, 100, 500, 600, 900, 1000]

    annotation = talon.identify_transcript(chrom, positions, strand, cursor,
                                           location_dict, edge_dict,
                                           transcript_dict, vertex_2_gene,
                                           gene_starts, gene_ends, run_info,
                                           "temp_gene")

    correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
    correct_transcript_ID = fetch_correct_ID("TG1-001", "transcript", cursor)
    assert annotation['gene_ID'] == correct_gene_ID
    assert annotation['transcript_ID'] == correct_transcript_ID
    assert annotation['transcript_novelty'] == []
    conn.close()
def test_FSM_end_diff(self):
    """ Example where the transcript is an FSM but has a difference on the
        ends large enough to be novel. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    init_refs.make_temp_novel_gene_table(cursor, build)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
        cursor, build)

    chrom = "chr2"
    strand = "+"
    positions = [1, 100, 500, 600, 900, 1500]  # end is 500 bp past annotation

    annotation = talon.identify_transcript(chrom, positions, strand, cursor,
                                           location_dict, edge_dict,
                                           transcript_dict, vertex_2_gene,
                                           gene_starts, gene_ends, run_info,
                                           "temp_gene")

    correct_gene_ID = fetch_correct_ID("TG2", "gene", cursor)
    novelty_types = [x[-2] for x in annotation['transcript_novelty']]
    assert annotation['gene_ID'] == correct_gene_ID
    # Identity check (is None), not == None — PEP 8 E711
    assert annotation['end_delta'] is None
    conn.close()
def test_genomic_unspliced(self):
    """ Monoexonic fragment that overlaps gene 1 """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    init_refs.make_temp_novel_gene_table(cursor, build)
    init_refs.make_temp_monoexonic_transcript_table(cursor, build)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
        cursor, build)

    # Unspliced read inside TG1, ending 10 bp short of the annotated end
    chrom = "chr1"
    strand = "+"
    positions = (1, 990)

    annotation = talon.identify_monoexon_transcript(
        chrom, positions, strand, cursor, location_dict, edge_dict,
        transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info,
        "temp_gene", "temp_monoexon")

    correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
    novelty_types = [x[-2] for x in annotation['transcript_novelty']]
    assert annotation['gene_ID'] == correct_gene_ID
    assert "genomic_transcript" in novelty_types
    assert annotation['end_delta'] == -10
    conn.close()
def test_antisense(self):
    """ Example where all of the vertices are in the database, but the edges
        are not, because they are antisense to the original transcript """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    edge_dict = init_refs.make_edge_dict(cursor)
    run_info = talon.init_run_info(database, build)
    orig_n_edges = len(edge_dict)
    conn.close()

    # Known vertices of TG2, traversed on the opposite strand
    chrom = "chr2"
    vertex_IDs = [13, 12, 11, 10]
    strand = "-"

    edge_IDs, novelty = talon.match_all_splice_edges(
        vertex_IDs, strand, edge_dict, run_info)

    # All three edges should be brand-new IDs, numbered consecutively
    expected_edges = [orig_n_edges + i for i in range(1, 4)]
    assert edge_IDs == expected_edges
    assert novelty == [1, 1, 1]
def test_NNC(self):
    """ Example where the transcript skips an exon and has a novel splice
        donor """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    init_refs.make_temp_novel_gene_table(cursor, build)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
        cursor, build)

    # Donor at 50 is novel; the read also skips the middle exon of TG1
    chrom = "chr1"
    strand = "+"
    positions = [1, 50, 900, 1000]

    annotation = talon.identify_transcript(chrom, positions, strand, cursor,
                                           location_dict, edge_dict,
                                           transcript_dict, vertex_2_gene,
                                           gene_starts, gene_ends, run_info,
                                           "temp_gene")

    correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
    novelty_types = [x[-2] for x in annotation['transcript_novelty']]
    assert annotation['gene_ID'] == correct_gene_ID
    assert "NNC_transcript" in novelty_types
    assert annotation['start_delta'] == annotation['end_delta'] == 0
    conn.close()
def test_antisense(self):
    """ Example where the vertices are known but there is no same-strand
        match """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    edge_dict = init_refs.make_edge_dict(cursor)
    locations = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
    gene_starts = init_refs.make_gene_start_or_end_dict(
        cursor, build, "start")
    gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

    # Construct temp novel gene db
    init_refs.make_temp_novel_gene_table(cursor, "toy_build")

    # Minus-strand read over the plus-strand gene TG1
    chrom = "chr1"
    start = 1000
    end = 1
    edge_IDs = [talon.edge_counter.value() + 1]
    positions = [1000, 900, 100, 1]
    vertex_IDs = [5, 2]
    strand = "-"
    anti_strand = "+"
    v_novelty = (0, 0, 0, 0)

    # Find antisense match
    gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \
        talon.process_spliced_antisense(chrom, positions, strand, edge_IDs,
                                        vertex_IDs, transcript_dict,
                                        gene_starts, gene_ends, edge_dict,
                                        locations, vertex_2_gene, run_info,
                                        cursor, "temp_gene")

    correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
    # The matched sense gene is recorded in the last gene-novelty entry
    anti_gene_ID = gene_novelty[-1][-1]
    assert anti_gene_ID == correct_gene_ID
    assert start_end_info["vertex_IDs"] == [6, 5, 2, 1]
    conn.close()
def test_transcript_update(self):
    """ Try to add novel transcript entries to database while ignoring
        duplicates """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    database = "scratch/toy.db"
    talon.get_counters(database)

    talon.create_transcript("chr1", 1, 1000, 1, (1, ), (1, 2),
                            transcript_dict)

    # Write to file
    # (os.makedirs replaces the shell-dependent `os.system("mkdir -p ...")`)
    os.makedirs("scratch/db_updates/", exist_ok=True)
    with open("scratch/db_updates/transcripts.tsv", 'w') as f:
        for transcript in transcript_dict.values():
            if type(transcript) is dict:
                entry = "\t".join([
                    str(x) for x in (transcript['transcript_ID'],
                                     transcript['gene_ID'],
                                     transcript['start_exon'],
                                     transcript['jn_path'],
                                     transcript['end_exon'],
                                     transcript['start_vertex'],
                                     transcript['end_vertex'],
                                     transcript['n_exons'])
                ])
                f.write(entry + "\n")

    batch_size = 5
    talon.batch_add_transcripts(cursor, "scratch/db_updates/transcripts.tsv",
                                batch_size)

    # Test if transcript with ID 8 is there, but make sure we didn't add
    # duplicates of the others
    query = "SELECT * FROM transcripts"
    cursor.execute(query)
    transcripts = cursor.fetchall()
    transcript_IDs = [x['transcript_ID'] for x in transcripts]
    assert 8 in transcript_IDs
    assert len(transcript_IDs) == 8

    # Test if None value was handled correctly
    for transcript in transcripts:
        if transcript['transcript_ID'] == 8:
            assert transcript['jn_path'] == None
    conn.close()
def test_intergenic(self):
    """ Example where the transcript overlaps no annotated gene at all, so
        a novel intergenic gene must be created.

        NOTE(review): the original docstring described an "NIC match" —
        clearly copy-pasted from another test; the assertions below check
        intergenic novelty. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
    gene_starts = init_refs.make_gene_start_or_end_dict(
        cursor, build, "start")
    gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")
    correct_gene_ID = talon.gene_counter.value() + 1

    # Construct temp novel gene db
    init_refs.make_temp_novel_gene_table(cursor, "toy_build")

    chrom = "chrX"  # No annotated genes on this chromosome
    positions = [1, 100, 900, 1000]
    edge_IDs = [
        talon.edge_counter.value() + 1,
        talon.edge_counter.value() + 2
    ]
    vertex_IDs = [
        talon.vertex_counter.value() + 1,
        talon.vertex_counter.value() + 2
    ]
    strand = "+"

    gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \
        talon.process_remaining_mult_cases(chrom, positions, strand, edge_IDs,
                                           vertex_IDs, transcript_dict,
                                           gene_starts, gene_ends, edge_dict,
                                           location_dict, vertex_2_gene,
                                           run_info, cursor, "temp_gene")

    assert gene_ID == correct_gene_ID
    # Identity check (is not None), not != None — PEP 8 E711
    assert transcript_dict[frozenset(start_end_info["edge_IDs"])] is not None
    assert gene_novelty[0][-2] == "intergenic_novel"
    conn.close()
def test_genomic(self):
    """ Example where the transcript overlaps a gene but contains no known
        splice vertices """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
    gene_starts = init_refs.make_gene_start_or_end_dict(
        cursor, build, "start")
    gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

    # Construct temp novel gene db
    init_refs.make_temp_novel_gene_table(cursor, "toy_build")

    chrom = "chr1"
    positions = [1000, 950, 700, 600]
    edge_IDs = [
        talon.edge_counter.value() + 1,
        talon.edge_counter.value() + 2
    ]
    vertex_IDs = [
        talon.vertex_counter.value() + 1,
        talon.vertex_counter.value() + 2
    ]
    strand = "-"

    gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \
        talon.process_remaining_mult_cases(chrom, positions, strand, edge_IDs,
                                           vertex_IDs, transcript_dict,
                                           gene_starts, gene_ends, edge_dict,
                                           location_dict, vertex_2_gene,
                                           run_info, cursor, "temp_gene")

    correct_gene_ID = fetch_correct_ID("TG3", "gene", cursor)
    assert gene_ID == correct_gene_ID
    # Identity check (is not None), not != None — PEP 8 E711
    assert transcript_dict[frozenset(start_end_info["edge_IDs"])] is not None
    assert gene_novelty == []
    assert transcript_novelty[-1][-2] == "genomic_transcript"
    conn.close()
def test_transcript_assigned_intergenic(self):
    """ This test covers a case reported by a user where a read overlaps the
        ~600bp mono-exonic pseudogene HMGB1P1. The read itself has 2 exons,
        the second of which contains the small pseudogene inside. Earlier
        versions of TALON classified the read as intergenic, when it was
        actually supposed to be genomic """
    # Set up references
    database = "scratch/multiexon_read_overlapping_monoexon_transcript/talon.db"
    conn = sqlite3.connect(database)
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()
    build = "hg38"
    talon.get_counters(database)
    run_info = talon.init_run_info(database, build)
    struct_collection = talon.prepare_data_structures(cursor, run_info)

    # Use pysam to get the read from the SAM file (only the first record)
    sam_file = "input_files/multiexon_read_overlapping_monoexon_transcript/read.sam"
    with pysam.AlignmentFile(sam_file) as sam:
        for entry in sam:
            sam_record = entry
            break

    # Get read attributes
    chrom = sam_record.reference_name
    strand = "-" if sam_record.is_reverse else "+"
    sam_start = sam_record.reference_start
    sam_end = sam_record.reference_end

    # Do we get any overlap with the reference gene?
    best_gene, match_strand = talon.search_for_overlap_with_gene(
        chrom, min(sam_start, sam_end), max(sam_start, sam_end), strand,
        cursor, run_info, struct_collection.tmp_gene)
    assert best_gene == 1
    assert match_strand == "-"

    # The read should be annotated as genomic, not intergenic
    annotation_info = talon.annotate_read(sam_record, cursor, run_info,
                                          struct_collection, mode=0)
    assert annotation_info['gene_ID'] == 1
    assert annotation_info['transcript_ID'] == 2
    assert 'genomic_transcript' in annotation_info['transcript_novelty'][0]
def test_counter_update(self):
    """ Update counters """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)

    # Change the counter values to some arbitrary numbers
    for _ in range(10):
        talon.gene_counter.increment()
    for _ in range(20):
        talon.transcript_counter.increment()
    for _ in range(2):
        talon.edge_counter.increment()
    for _ in range(5):
        talon.vertex_counter.increment()
    for _ in range(30):
        talon.dataset_counter.increment()
    for _ in range(6):
        talon.observed_counter.increment()

    # Now try the update
    talon.update_counter(cursor)

    # Check results with queries; expected = original DB value + increments
    expected = {
        "genes": 16,
        "transcripts": 27,
        "edge": 33,
        "vertex": 39,
        "observed": 6,
        "dataset": 30,
    }
    cursor.execute("""SELECT * FROM counters""")
    for category, value in cursor.fetchall():
        if category == "genome_build":
            continue  # Not a counter we modified
        if category in expected:
            assert value == expected[category]
        else:
            pytest.fail("Unexpected entry in counters table")
    conn.close()
def test_ISM_suffix(self):
    """ Example where the transcript is an ISM with suffix """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    run_info = talon.init_run_info(database, build)
    talon.get_counters(database)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start")
    gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

    # Read covers the last two exons of TG1 only
    chrom = "chr1"
    strand = "+"
    positions = [500, 600, 900, 1000]
    edge_IDs = [4]
    vertex_IDs = [4, 5]
    v_novelty = [0, 0]

    all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
    gene_ID, transcript_ID, novelty, start_end_info = talon.process_ISM(
        chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
        transcript_dict, gene_starts, gene_ends, edge_dict, location_dict,
        run_info)

    correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
    assert gene_ID == correct_gene_ID
    assert start_end_info["vertex_IDs"] == [3, 4, 5, 6]
    assert start_end_info["edge_IDs"] == [3, 4, 5]
    assert start_end_info["start_novelty"] == 0  # because the exon is known
    assert start_end_info["end_novelty"] == 0
    assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None
    conn.close()
def test_all_known_edges(self):
    """ Example where the toy transcript database contains matches for all
        vertices. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    edge_dict = init_refs.make_edge_dict(cursor)
    run_info = talon.init_run_info(database, build)
    conn.close()

    # Splice vertices of TG1-001; every junction between them is annotated
    chrom = "chr1"
    vertex_IDs = [2, 3, 4, 5]
    strand = "+"

    edge_IDs, novelty = talon.match_all_splice_edges(
        vertex_IDs, strand, edge_dict, run_info)

    assert edge_IDs == [2, 3, 4]
    assert novelty == [0, 0, 0]
def test_NIC_with_all_known_edges(self):
    """ Test case derived from a real mouse Map2k4 read. All of edges are
        known (except 3'), yet the read is NIC not FSM/ISM """
    database = "scratch/Map2k4.db"
    talon.get_counters(database)
    conn = sqlite3.connect(database)
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()
    build = "mm10"
    init_refs.make_temp_novel_gene_table(cursor, build)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
        cursor, build)

    # Genomic coordinates of the Map2k4 read (minus strand)
    chrom = "chr11"
    strand = "-"
    positions = [
        65788254, 65788136, 65775765, 65775733, 65756371, 65756269, 65735366,
        65735192, 65719603, 65719484, 65712297, 65712178, 65709983, 65709932,
        65707111, 65706984, 65696365, 65696288, 65693570, 65693422, 65691773,
        65691728, 65690804, 65689322
    ]

    annotation = talon.identify_transcript(chrom, positions, strand, cursor,
                                           location_dict, edge_dict,
                                           transcript_dict, vertex_2_gene,
                                           gene_starts, gene_ends, run_info,
                                           "temp_gene")

    assert annotation['gene_ID'] == 1
    assert annotation['transcript_ID'] == 8
    novelty_types = [x[-2] for x in annotation['transcript_novelty']]
    assert "NIC_transcript" in novelty_types
    conn.close()
def test_partial_match(self):
    """ Example where the transcript overlaps a single-exon transcript, but
        is shorter. In the past, the start would be assigned to the annotated
        start, and the end would be novel. This is no longer the case- at
        this time, the transcript will be assigned to the annotated match.
    """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    init_refs.make_temp_novel_gene_table(cursor, build)
    init_refs.make_temp_monoexonic_transcript_table(cursor, build)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
    gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start")
    gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

    # Minus-strand fragment contained within TG6-001
    chrom = "chr4"
    strand = "-"
    positions = (3900, 2900)

    annotation = talon.identify_monoexon_transcript(
        chrom, positions, strand, cursor, location_dict, edge_dict,
        transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info,
        'temp_gene', 'temp_monoexon')

    correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor)
    correct_transcript_ID = fetch_correct_ID("TG6-001", "transcript", cursor)
    assert annotation['gene_ID'] == correct_gene_ID
    assert annotation['transcript_ID'] == correct_transcript_ID
    assert annotation['start_delta'] == 100
    assert annotation['end_delta'] == -1900
    conn.close()
def test_FSM_start_diff(self):
    """ Example where the transcript is an FSM but has a difference on the
        start large enough to be novel. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    db = "scratch/toy.db"
    talon.get_counters(db)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(db, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    orig_vertices = talon.vertex_counter.value()
    gene_starts = init_refs.make_gene_start_or_end_dict(
        cursor, build, "start")
    gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

    chrom = "chr1"
    positions = [2501, 1500, 1000, 900]  # First postion is > 500bp away
    strand = "-"
    edge_IDs = [7]
    vertex_IDs = [7, 6]
    v_novelty = [0, 0]

    all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
    gene_ID, transcript_ID, novelty, start_end_info = talon.process_FSM(
        chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
        gene_starts, gene_ends, edge_dict, location_dict, run_info)

    correct_gene_ID = fetch_correct_ID("TG3", "gene", cursor)
    correct_transcript_ID = fetch_correct_ID("TG3-001", "transcript", cursor)
    assert gene_ID == correct_gene_ID
    assert transcript_ID == correct_transcript_ID
    # The far-away start should have been assigned a brand-new vertex
    assert start_end_info["start_vertex"] == orig_vertices + 1
    assert start_end_info["end_vertex"] == 5
    conn.close()
def test_FSM_perfect(self):
    """ Example where the transcript is a perfect full splice match. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    db = "scratch/toy.db"
    talon.get_counters(db)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(db, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    gene_starts = init_refs.make_gene_start_or_end_dict(
        cursor, build, "start")
    gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

    # Identical to TG1-001 except for a 10 bp longer 3' end
    chrom = "chr1"
    positions = [1, 100, 500, 600, 900, 1010]
    strand = "+"
    edge_IDs = [2, 3, 4]
    vertex_IDs = [2, 3, 4, 5]
    v_novelty = [0, 0, 0, 0]

    all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
    gene_ID, transcript_ID, novelty, start_end_info = talon.process_FSM(
        chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
        gene_starts, gene_ends, edge_dict, location_dict, run_info)

    correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
    correct_transcript_ID = fetch_correct_ID("TG1-001", "transcript", cursor)
    assert gene_ID == correct_gene_ID
    assert transcript_ID == correct_transcript_ID
    assert novelty == []
    assert start_end_info["start_vertex"] == 1
    assert start_end_info["end_vertex"] == 6
    assert start_end_info["diff_3p"] == 10
    conn.close()
def test_all_known_locations(self):
    """ Example where the toy transcript database contains matches for all
        vertices. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)
    orig_vertex_count = talon.vertex_counter.value()
    strand = "+"
    conn.close()

    # All four splice positions are annotated for TG1
    chrom = "chr1"
    pos = [1, 100, 500, 600, 900, 1000]
    vertex_IDs, novelty = talon.match_splice_vertices(chrom, pos, strand,
                                                      location_dict, run_info)

    assert vertex_IDs == [2, 3, 4, 5]
    # No new vertices should have been created
    assert talon.vertex_counter.value() == orig_vertex_count
def test_edge_update(self):
    """ Try to add novel exons and introns. """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    edge_dict = init_refs.make_edge_dict(cursor)
    run_info = talon.init_run_info(database, build)
    orig_n_edges = talon.edge_counter.value()

    talon.create_edge(2, 1, "exon", "-", edge_dict)

    # Write to file
    # (os.makedirs replaces the shell-dependent `os.system("mkdir -p ...")`)
    os.makedirs("scratch/db_updates/", exist_ok=True)
    with open("scratch/db_updates/edges.tsv", 'w') as f:
        for edge in list(edge_dict.values()):
            if type(edge) is dict:
                entry = "\t".join([
                    str(x) for x in [
                        edge['edge_ID'], edge['v1'], edge['v2'],
                        edge['edge_type'], edge['strand']
                    ]
                ])
                f.write(entry + "\n")

    batch_size = 10
    talon.batch_add_edges(cursor, "scratch/db_updates/edges.tsv", batch_size)

    # Test if the edge table has the correct number of edges now
    query = "SELECT * FROM edge"
    cursor.execute(query)
    edge_IDs = [x['edge_ID'] for x in cursor.fetchall()]
    assert orig_n_edges + 1 in edge_IDs
    assert len(edge_IDs) == orig_n_edges + 1
    conn.close()
def test_spliced_antisense(self):
    """ Example where the transcript matches known vertices but is antisense
    """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    init_refs.make_temp_novel_gene_table(cursor, build)
    edge_dict = init_refs.make_edge_dict(cursor)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)
    transcript_dict = init_refs.make_transcript_dict(cursor, build)
    vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
    gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
        cursor, build)

    # Minus-strand read over the plus-strand gene TG2
    chrom = "chr2"
    strand = "-"
    positions = [1000, 900, 600, 500, 100, 1]

    annotation = talon.identify_transcript(chrom, positions, strand, cursor,
                                           location_dict, edge_dict,
                                           transcript_dict, vertex_2_gene,
                                           gene_starts, gene_ends, run_info,
                                           "temp_gene")

    anti_gene_ID = fetch_correct_ID("TG2", "gene", cursor)
    gene_novelty_types = [x[-2] for x in annotation['gene_novelty']]
    t_novelty_types = [x[-2] for x in annotation['transcript_novelty']]
    assert annotation['gene_novelty'][0][-1] == "TRUE"
    assert "antisense_gene" in gene_novelty_types
    assert "antisense_transcript" in t_novelty_types
    assert annotation['start_delta'] == annotation['end_delta'] == 0
    conn.close()
def test_location_update(self):
    """ Update locations """
    conn, cursor = get_db_cursor()
    build = "toy_build"
    database = "scratch/toy.db"
    talon.get_counters(database)
    location_dict = init_refs.make_location_dict(build, cursor)
    run_info = talon.init_run_info(database, build)
    orig_n_pos = talon.vertex_counter.value()

    talon.create_vertex("chr4", 2000, location_dict, run_info)

    # Write to file
    # (os.makedirs replaces the shell-dependent `os.system("mkdir -p ...")`)
    os.makedirs("scratch/db_updates/", exist_ok=True)
    with open("scratch/db_updates/loc.tsv", 'w') as f:
        for chrom_dict in location_dict.values():
            for loc in list(chrom_dict.values()):
                if type(loc) is dict:
                    entry = ("\t".join([
                        str(x) for x in (loc['location_ID'],
                                         loc['genome_build'],
                                         loc['chromosome'], loc['position'])
                    ]))
                    f.write(entry + "\n")

    batch_size = 10
    talon.batch_add_locations(cursor, "scratch/db_updates/loc.tsv",
                              batch_size)

    # Test if the table has the correct number of locations now
    query = "SELECT * FROM location"
    cursor.execute(query)
    loc_IDs = [x['location_ID'] for x in cursor.fetchall()]
    assert orig_n_pos + 1 in loc_IDs
    assert len(loc_IDs) == orig_n_pos + 1
    conn.close()