Beispiel #1
0
    def test_antisense(self):
        """ Example where the transcript is antisense """

        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        talon.make_temp_monoexonic_transcript_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(cursor, build)

        chrom = "chr4"
        strand = "+"
        positions = ( 1300, 3900 )

        annotation = talon.identify_monoexon_transcript(chrom, positions,
                                               strand, cursor,
                                               location_dict, edge_dict,
                                               transcript_dict, vertex_2_gene,
                                               gene_starts, gene_ends, run_info)

        anti_gene_ID = fetch_correct_ID("TG6", "gene", cursor)
        gene_novelty_types = [ x[-2] for x in annotation['gene_novelty']]
        t_novelty_types = [ x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_novelty'][0][-1] == "TRUE"
        assert "antisense_gene" in gene_novelty_types
        assert "antisense_transcript" in t_novelty_types

        conn.close() 
Beispiel #2
0
    def test_partial_match_3prime(self):
        """ Example where the transcript is short, so it overlaps the
            annotated transcript but is not an accepted match.
            the end should get assigned to the annotated end, but the end is
            novel """

        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        talon.make_temp_monoexonic_transcript_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(cursor, build)

        chrom = "chr4"
        strand = "-"
        positions = ( 2000, 1100 )

        annotation = talon.identify_monoexon_transcript(chrom, positions,
                                               strand, cursor,
                                               location_dict, edge_dict,
                                               transcript_dict, vertex_2_gene,
                                               gene_starts, gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor)
        assert annotation['gene_ID'] == correct_gene_ID
        assert annotation['start_delta'] == None
        assert annotation['end_delta'] == -100

        conn.close()
Beispiel #3
0
    def test_NIC_match(self):
        """ Example where the transcript is an NIC match to an existing one by 
            virtue of skipping an exon.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        positions = [1, 100, 900, 1000]
        edge_IDs = [run_info.edge + 1]
        vertex_IDs = [2, 5]
        strand = "+"
        v_novelty = [0, 0]

        gene_ID, transcript_ID, novelty, start_end_info = talon.process_NIC(
            chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
            gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene,
            run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        assert gene_ID == correct_gene_ID
        assert start_end_info["vertex_IDs"] == [1, 2, 5, 6]
        assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None
        conn.close()
Beispiel #4
0
    def test_ISM_suffix(self):
        """ Example where the transcript is an ISM with suffix
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        strand = "+"
        positions = [500, 600, 900, 1000]
        edge_IDs = [4]
        vertex_IDs = [4, 5]
        v_novelty = [0, 0]

        all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
        gene_ID, transcript_ID, novelty, start_end_info = talon.process_ISM(
            chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
            transcript_dict, gene_starts, gene_ends, edge_dict, location_dict,
            run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)

        assert gene_ID == correct_gene_ID
        assert start_end_info["vertex_IDs"] == [3, 4, 5, 6]
        assert start_end_info["edge_IDs"] == [3, 4, 5]
        assert start_end_info["start_novelty"] == 0  # because the exon is known
        assert start_end_info["end_novelty"] == 0
        assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None
        conn.close()
Beispiel #5
0
    def test_NNC(self):
        """ Example where the transcript skips an exon and has a novel splice
            donor
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        strand = "+"
        positions = [1, 50, 900, 1000]

        annotation = talon.identify_transcript(chrom, positions, strand,
                                               cursor, location_dict,
                                               edge_dict, transcript_dict,
                                               vertex_2_gene, gene_starts,
                                               gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_ID'] == correct_gene_ID
        assert "NNC_transcript" in novelty_types
        assert annotation['start_delta'] == annotation['end_delta'] == 0
        conn.close()
Beispiel #6
0
    def test_FSM_perfect(self):
        """ Example where the transcript is a perfect full splice match.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        strand = "+"
        positions = [1, 100, 500, 600, 900, 1000]

        annotation = talon.identify_transcript(chrom, positions, strand,
                                               cursor, location_dict,
                                               edge_dict, transcript_dict,
                                               vertex_2_gene, gene_starts,
                                               gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        correct_transcript_ID = fetch_correct_ID("TG1-001", "transcript",
                                                 cursor)
        assert annotation['gene_ID'] == correct_gene_ID
        assert annotation['transcript_ID'] == correct_transcript_ID
        assert annotation['transcript_novelty'] == []
        conn.close()
Beispiel #7
0
    def test_match(self):
        """ Example where the transcript is a moniexonic match.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        talon.make_temp_monoexonic_transcript_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(cursor, build)

        chrom = "chr4"
        strand = "-"
        positions = ( 3900, 1100 )

        annotation = talon.identify_monoexon_transcript(chrom, positions, 
                                               strand, cursor,
                                               location_dict, edge_dict,
                                               transcript_dict, vertex_2_gene,
                                               gene_starts, gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor)
        correct_transcript_ID = fetch_correct_ID("TG6-001", "transcript", cursor)
        assert annotation['gene_ID'] == correct_gene_ID
        assert annotation['start_delta'] == 100
        assert annotation['end_delta'] == -100

        conn.close()
Beispiel #8
0
    def test_ISM_internal(self):
        """ Example where the transcript matches an internal exon
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        talon.make_temp_monoexonic_transcript_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        strand = "+"
        positions = (500, 600)

        annotation = talon.identify_monoexon_transcript(
            chrom, positions, strand, cursor, location_dict, edge_dict,
            transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_ID'] == correct_gene_ID
        assert "ISM_transcript" in novelty_types
        assert annotation['start_delta'] == annotation['end_delta'] == 0
        conn.close()
Beispiel #9
0
    def test_no_match(self):
        """ Example with no FSM match """

        conn, cursor = get_db_cursor()
        build = "toy_build"
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(cursor, build)

        chrom = "chr1"
        positions = [1, 100, 500, 600] 
        strand = "+"
        edge_IDs = [2]
        vertex_IDs = [2,3,4,5]
        v_novelty = [0, 0, 0, 0]

        all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)

        gene_ID, transcript_ID, novelty, start_end_info = talon.process_FSM(chrom,
                                                            positions, strand, edge_IDs,
                                                            vertex_IDs, all_matches,
                                                            gene_starts, gene_ends,
                                                            edge_dict,
                                                            location_dict, run_info)

        assert gene_ID == transcript_ID == None 
        conn.close()       
Beispiel #10
0
    def test_FSM_end_diff(self):
        """ Example where the transcript is an FSM but has a difference on
            the ends large enough to be novel.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        orig_vertices = run_info['vertex']
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(cursor, build)

        chrom = "chr2"
        positions = [1, 100, 500, 600, 900, 1301] #Last postion is > 300bp away
        strand = "+"
        edge_IDs = [13, 14, 15]
        vertex_IDs = [14, 15, 16, 17] 
        v_novelty = [0, 0, 0, 0]
  
        all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)

        gene_ID, transcript_ID, novelty, start_end_info = talon.process_FSM(chrom,
                                                            positions, strand, edge_IDs,
                                                            vertex_IDs, all_matches,
                                                            gene_starts, gene_ends,
                                                            edge_dict,
                                                            location_dict, run_info) 

        correct_gene_ID = fetch_correct_ID("TG2", "gene", cursor)
        correct_transcript_ID = fetch_correct_ID("TG2-001", "transcript", cursor)
        assert gene_ID == correct_gene_ID
        assert transcript_ID == correct_transcript_ID
        assert start_end_info["end_vertex"] == orig_vertices + 1
        conn.close()
Beispiel #11
0
    def test_FSM_end_diff(self):
        """ Example where the transcript is an FSM but has a difference on
            the ends large enough to be novel.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr2"
        strand = "+"
        positions = [1, 100, 500, 600, 900, 1500]

        annotation = talon.identify_transcript(chrom, positions, strand,
                                               cursor, location_dict,
                                               edge_dict, transcript_dict,
                                               vertex_2_gene, gene_starts,
                                               gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG2", "gene", cursor)
        novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_ID'] == correct_gene_ID
        assert annotation['end_delta'] == None
        conn.close()
Beispiel #12
0
    def test_genomic_unspliced(self):
        """ Monoexonic fragment that overlaps gene 1 """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        talon.make_temp_monoexonic_transcript_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        strand = "+"
        positions = (1, 990)

        annotation = talon.identify_monoexon_transcript(
            chrom, positions, strand, cursor, location_dict, edge_dict,
            transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_ID'] == correct_gene_ID
        assert "genomic_transcript" in novelty_types
        assert annotation['end_delta'] == -10
        conn.close()
Beispiel #13
0
    def test_ISM_prefix(self):
        """ Example where the transcript is a prefix ISM with a novel start
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)
        orig_exons = run_info["edge"]

        chrom = "chr1"
        strand = "+"
        positions = [1, 100, 500, 600]
        edge_IDs = [2]
        vertex_IDs = [2, 3]
        v_novelty = [0, 0]

        all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
        gene_ID, transcript_ID, novelty, start_end_info = talon.process_ISM(
            chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
            transcript_dict, gene_starts, gene_ends, edge_dict, location_dict,
            run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        assert gene_ID == correct_gene_ID
        assert start_end_info["vertex_IDs"] == [1, 2, 3, 4]
        assert start_end_info["edge_IDs"] == [1, 2, 3]
        conn.close()
Beispiel #14
0
    def test_spliced_antisense(self):
        """ Example where the transcript matches known vertices but is antisense
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr2"
        strand = "-"
        positions = [1000, 900, 600, 500, 100, 1]

        annotation = talon.identify_transcript(chrom, positions, strand,
                                               cursor, location_dict,
                                               edge_dict, transcript_dict,
                                               vertex_2_gene, gene_starts,
                                               gene_ends, run_info)

        anti_gene_ID = fetch_correct_ID("TG2", "gene", cursor)
        gene_novelty_types = [x[-2] for x in annotation['gene_novelty']]
        t_novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_novelty'][0][-1] == "TRUE"
        assert "antisense_gene" in gene_novelty_types
        assert "antisense_transcript" in t_novelty_types
        assert annotation['start_delta'] == annotation['end_delta'] == 0
        conn.close()
Beispiel #15
0
    def test_NIC_instead_of_ISM(self):
        """ Test case where the transcript looks like an ISM, but is NIC on
            account of having known starts and ends """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr3"
        strand = "+"
        positions = (800, 1000, 1200, 1400, 1600, 1800, 2000, 2200)

        annotation = talon.identify_transcript(chrom, positions, strand,
                                               cursor, location_dict,
                                               edge_dict, transcript_dict,
                                               vertex_2_gene, gene_starts,
                                               gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG5", "gene", cursor)
        novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_ID'] == correct_gene_ID
        assert "NIC_transcript" in novelty_types
        conn.close()
    def test_find_no_match(self):
        """ Example where the toy transcript database contains no matches
            for the edge set.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        transcript_dict = talon.make_transcript_dict(cursor, build)
        conn.close()

        edges = (100, 200, 300)
        matches = talon.search_for_ISM(edges, transcript_dict)

        # Make sure that no match got returned
        assert matches == None
    def test_find_monoexon_match(self):
        """ Input is a sinlge exon that matches part of an existing transcript
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        transcript_dict = talon.make_transcript_dict(cursor, build)

        edges = (14, )
        matches = talon.search_for_ISM(edges, transcript_dict)

        # Make sure that correct match got returned
        correct_gene_ID = fetch_correct_ID("TG2", "gene", cursor)

        assert matches[0]["gene_ID"] == correct_gene_ID
        conn.close()
Beispiel #18
0
    def test_antisense(self):
        """ Example where the vertices are known but there is no same-strand 
            match """

        conn, cursor = get_db_cursor()
        build = "toy_build"
        edge_dict = talon.make_edge_dict(cursor)
        locations = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        # Construct temp novel gene db
        talon.make_temp_novel_gene_table(cursor, "toy_build")

        chrom = "chr1"
        start = 1000
        end = 1
        edge_IDs = [run_info.edge + 1]
        positions = [1000, 900, 100, 1]
        vertex_IDs = [5, 2]
        strand = "-"
        anti_strand = "+"
        v_novelty = (0, 0, 0, 0)

        # Find antisense match
        gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \
                                      talon.process_spliced_antisense(chrom, positions,
                                                                  strand, edge_IDs,
                                                                  vertex_IDs,
                                                                  transcript_dict,
                                                                  gene_starts,
                                                                  gene_ends,
                                                                  edge_dict, locations,
                                                                  vertex_2_gene, run_info,
                                                                  cursor)
        #anti_gene_ID = talon.find_gene_match_on_vertex_basis(vertex_IDs,
        #                                                     anti_strand,
        #                                                     vertex_2_gene)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        anti_gene_ID = gene_novelty[-1][-1]
        assert anti_gene_ID == correct_gene_ID
        assert start_end_info["vertex_IDs"] == [6, 5, 2, 1]

        conn.close()
    def test_find_no_match(self):
        """ Example where the toy transcript database contains no matches
            for the edge set.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        transcript_dict = talon.make_transcript_dict(cursor, build)
        conn.close()

        edges = frozenset({1, 3, 4, 5})
        gene_ID, transcript = talon.search_for_transcript(
            edges, transcript_dict)

        # Make sure that no match got returned
        assert gene_ID == None
        assert transcript == None
    def test_find_match(self):
        """ Example where the toy transcript database contains exactly one  
            ISM match for the transcript.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        transcript_dict = talon.make_transcript_dict(cursor, build)

        edges = (2, 3)
        matches = talon.search_for_ISM(edges, transcript_dict)

        # Make sure that correct match got returned
        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)

        assert matches[0]["gene_ID"] == correct_gene_ID
        conn.close()
Beispiel #21
0
    def test_overlap_but_no_vertex_match(self):
        """ Example where the transcript is short, so it overlaps the
            annotated transcript but is not an accepted match.
            the start should get assigned to the annotated end, but the end is
            novel """

        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        talon.make_temp_monoexonic_transcript_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(cursor, build)
        tot_vertices = len(vertex_2_gene)
        query = """ SELECT COUNT(*) FROM temp_monoexon """
        tot_monoexonic = cursor.execute(query).fetchone()[0]

        chrom = "chr4"
        strand = "-"
        positions = ( 2500, 2000 )

        annotation = talon.identify_monoexon_transcript(chrom, positions,
                                               strand, cursor,
                                               location_dict, edge_dict,
                                               transcript_dict, vertex_2_gene,
                                               gene_starts, gene_ends, run_info)

        correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor)
        print(annotation['start_vertex'])
        print(annotation['end_vertex'])
        assert annotation['gene_ID'] == correct_gene_ID
        assert annotation['start_delta'] == None
        assert annotation['end_delta'] == None

        # Now check if the transcript got added to the right data structures
        assert len(vertex_2_gene) == tot_vertices + 2
        assert cursor.execute(query).fetchone()[0] == tot_monoexonic + 1

        conn.close()
    def test_find_match(self):
        """ Example where the toy transcript database contains exactly one match 
            for the transcript.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        transcript_dict = talon.make_transcript_dict(cursor, build)

        edges = frozenset({12, 13, 14, 15, 16})
        gene_ID, transcript = talon.search_for_transcript(
            edges, transcript_dict)

        # Make sure that correct match got returned
        correct_gene_ID = fetch_correct_ID("TG2", "gene", cursor)
        correct_transcript_ID = fetch_correct_ID("TG2-001", "transcript",
                                                 cursor)

        assert gene_ID == correct_gene_ID
        assert transcript[0] == correct_transcript_ID
        conn.close()
Beispiel #23
0
    def test_no_match(self):
        """ Example with no ISM match """

        conn, cursor = get_db_cursor()
        build = "toy_build"
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        strand = "+"
        positions = [1, 100, 900, 1000]
        edge_IDs = [200]
        vertex_IDs = [2, 5]
        v_novelty = [0, 0]

        all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
        assert all_matches == None
        conn.close()
Beispiel #24
0
    def test_genomic(self):
        """ Example where the transcript overlaps a gene but contains no known
            splice vertices
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        # Construct temp novel gene db
        talon.make_temp_novel_gene_table(cursor, "toy_build")

        chrom = "chr1"
        positions = [1000, 950, 700, 600]
        edge_IDs = [run_info.edge + 1, run_info.edge + 2]
        vertex_IDs = [run_info.vertex + 1, run_info.vertex + 2]
        strand = "-"

        gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \
                             talon.process_remaining_mult_cases(chrom, positions,
                                                                strand, edge_IDs,
                                                                vertex_IDs,
                                                                transcript_dict,
                                                                gene_starts, gene_ends,
                                                                edge_dict, location_dict,
                                                                vertex_2_gene, run_info,
                                                                cursor)
        correct_gene_ID = fetch_correct_ID("TG3", "gene", cursor)
        assert gene_ID == correct_gene_ID
        assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None
        assert gene_novelty == []
        assert transcript_novelty[-1][-2] == "genomic_transcript"
        conn.close()
Beispiel #25
0
    def test_intergenic(self):
        """ Example where the transcript is an NIC match to an existing one by
            virtue of a new splice donor.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)
        correct_gene_ID = run_info.genes + 1

        # Construct temp novel gene db
        talon.make_temp_novel_gene_table(cursor, "toy_build")

        chrom = "chrX"
        positions = [1, 100, 900, 1000]
        edge_IDs = [run_info.edge + 1, run_info.edge + 2]
        vertex_IDs = [run_info.vertex + 1, run_info.vertex + 2]
        strand = "+"

        gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \
                             talon.process_remaining_mult_cases(chrom, positions,
                                                                strand, edge_IDs,
                                                                vertex_IDs,
                                                                transcript_dict,
                                                                gene_starts, gene_ends,
                                                                edge_dict, location_dict,
                                                                vertex_2_gene, run_info,
                                                                cursor)

        assert gene_ID == correct_gene_ID
        assert transcript_dict[frozenset(start_end_info["edge_IDs"])] != None
        assert gene_novelty[0][-2] == "intergenic_novel"
        conn.close()
Beispiel #26
0
    def test_NIC_with_all_known_edges(self):
        """ Test case derived from a real mouse Map2k4 read. All of edges are
            known (except 3'), yet the read is NIC not FSM/ISM """

        conn = sqlite3.connect("scratch/Map2k4.db")
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        build = "mm10"
        talon.make_temp_novel_gene_table(cursor, build)
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr11"
        strand = "-"
        positions = [
            65788254, 65788136, 65775765, 65775733, 65756371, 65756269,
            65735366, 65735192, 65719603, 65719484, 65712297, 65712178,
            65709983, 65709932, 65707111, 65706984, 65696365, 65696288,
            65693570, 65693422, 65691773, 65691728, 65690804, 65689322
        ]

        annotation = talon.identify_transcript(chrom, positions, strand,
                                               cursor, location_dict,
                                               edge_dict, transcript_dict,
                                               vertex_2_gene, gene_starts,
                                               gene_ends, run_info)

        assert annotation['gene_ID'] == 1
        assert annotation['transcript_ID'] == 8
        novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert "NIC_transcript" in novelty_types

        conn.close()
Beispiel #27
0
    def test_transcript_update(self):
        """ Try to add novel transcript entries to database while ignoring 
            duplicates
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        edge_dict = talon.make_edge_dict(cursor)
        run_info = talon.init_run_info(cursor, build)
        transcript_dict = talon.make_transcript_dict(cursor, build)
        talon.create_transcript("chr1", 1, 1000, 1, (1, 2, 3), (1, 2, 3, 4),
                                transcript_dict, run_info)

        batch_size = 5
        talon.batch_add_transcripts(cursor, transcript_dict, batch_size)

        # Test if transcript with ID 7 is there, but make sure we didn't add
        # duplicates of the others
        query = "SELECT * FROM transcripts"
        cursor.execute(query)
        transcript_IDs = [x['transcript_ID'] for x in cursor.fetchall()]
        assert 8 in transcript_IDs
        assert len(transcript_IDs) == 8
        conn.close()
Beispiel #28
0
    def test_FSM_perfect(self):
        """ Example where the transcript is a perfect full splice match.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        edge_dict = talon.make_edge_dict(cursor)
        location_dict = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)        
        transcript_dict = talon.make_transcript_dict(cursor, build)
        gene_starts, gene_ends = talon.make_gene_start_and_end_dict(cursor, build)

        chrom = "chr1"
        positions = [1, 100, 500, 600, 900, 1010]
        strand = "+"
        edge_IDs = [2, 3, 4]
        vertex_IDs = [2, 3, 4, 5]
        v_novelty = [0, 0, 0, 0]

        all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)

        gene_ID, transcript_ID, novelty, start_end_info = talon.process_FSM(chrom, 
                                                            positions, strand, edge_IDs,
                                                            vertex_IDs, all_matches,
                                                            gene_starts, gene_ends,
                                                            edge_dict,
                                                            location_dict, run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) 
        correct_transcript_ID = fetch_correct_ID("TG1-001", "transcript", cursor)
        assert gene_ID == correct_gene_ID
        assert transcript_ID == correct_transcript_ID
        assert novelty == []
        assert start_end_info["start_vertex"] == 1
        assert start_end_info["end_vertex"] == 6
        assert start_end_info["diff_3p"] == 10
        conn.close()