Exemple #1
0
    def test_counter_update(self):
        """ Check that counter values stored with update_counter are read back
            unchanged by a fresh init_run_info call.

            The connection is closed in a finally clause so that it is
            released even when one of the assertions fails.
        """
        conn, cursor = get_db_cursor()
        try:
            build = "toy_build"
            run_info = talon.init_run_info(cursor, build)

            # Change the counter values to some arbitrary numbers
            run_info.genes = 10
            run_info.transcripts = 20
            run_info.edge = 2000
            run_info.vertex = 10000
            run_info.dataset = 30
            run_info.observed = 400

            # Now try the update
            talon.update_counter(cursor, run_info)
            # Drop the local object so the checks below can only see values
            # that come back from init_run_info
            run_info = None

            # Check results with queries
            run_info_2 = talon.init_run_info(cursor, build)
            assert run_info_2.genes == 10
            assert run_info_2.transcripts == 20
            assert run_info_2.edge == 2000
            assert run_info_2.vertex == 10000
            assert run_info_2.dataset == 30
            assert run_info_2.observed == 400
        finally:
            conn.close()
    def test_with_novel_location(self):
        """ Splice-vertex matching for a chr1 transcript that contains one
            novel position: a new vertex ID should be issued and the
            location dictionary should grow by one entry.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        vertices_before = talon.vertex_counter.value()
        chr1_locations_before = len(location_dict["chr1"])
        # The database handle is not needed beyond this point
        conn.close()

        chrom, strand = "chr1", "+"
        pos = [1, 150, 500, 600, 900, 1000]
        vertex_IDs, novelty = talon.match_splice_vertices(
            chrom, pos, strand, location_dict, run_info)

        # The first splice vertex should carry the current counter value,
        # followed by vertices 3, 4 and 5
        vertices_after = talon.vertex_counter.value()
        assert vertex_IDs == [vertices_after, 3, 4, 5]

        # Exactly one vertex and one chr1 location should have been added
        assert vertices_after == vertices_before + 1
        assert len(location_dict["chr1"]) == chr1_locations_before + 1
Exemple #3
0
    def test_ISM_suffix(self):
        """ Example where the transcript is an ISM with suffix.

            Feeds edge 4 / vertices 4-5 of a chr1 "+" transcript through
            search_for_ISM and process_ISM, then checks the gene assignment
            and the start/end information that is returned.

            The connection is closed in a finally clause so that it is
            released even when one of the assertions fails.
        """
        conn, cursor = get_db_cursor()
        try:
            build = "toy_build"
            edge_dict = talon.make_edge_dict(cursor)
            location_dict = talon.make_location_dict(build, cursor)
            run_info = talon.init_run_info(cursor, build)
            transcript_dict = talon.make_transcript_dict(cursor, build)
            gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
                cursor, build)

            chrom = "chr1"
            strand = "+"
            positions = [500, 600, 900, 1000]
            edge_IDs = [4]
            vertex_IDs = [4, 5]

            all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
            gene_ID, transcript_ID, novelty, start_end_info = talon.process_ISM(
                chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
                transcript_dict, gene_starts, gene_ends, edge_dict,
                location_dict, run_info)

            correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)

            assert gene_ID == correct_gene_ID
            assert start_end_info["vertex_IDs"] == [3, 4, 5, 6]
            assert start_end_info["edge_IDs"] == [3, 4, 5]
            # Start novelty is 0 because the exon is known
            assert start_end_info["start_novelty"] == 0
            assert start_end_info["end_novelty"] == 0
            # The full edge set must be registered in the transcript dict
            edge_key = frozenset(start_end_info["edge_IDs"])
            assert transcript_dict[edge_key] is not None
        finally:
            conn.close()
Exemple #4
0
    def test_match_monoexonic(self):
        """ Permissive vertex search for both ends of a monoexonic chr2
            transcript, with the 5' and 3' cutoffs both set to 500. """

        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)

        chrom, strand = "chr2", "+"
        left, right = 920, 970
        run_info.cutoff_5p = 500
        run_info.cutoff_3p = 500

        # Start search: the other end of the exon is passed as splice position
        start_match, start_diff = talon.permissive_vertex_search(
            chrom, left, strand, right, "start", location_dict, run_info)
        # End search: same call with the two coordinates swapped
        end_match, end_diff = talon.permissive_vertex_search(
            chrom, right, strand, left, "end", location_dict, run_info)

        assert start_match == fetch_correct_vertex_ID(chrom, 900, cursor)
        assert start_diff == 20
        assert end_match == fetch_correct_vertex_ID(chrom, 1000, cursor)
        assert end_diff == -30
        conn.close()
    def test_FSM_perfect(self):
        """ identify_transcript on a chr1 "+" transcript that should resolve to
            gene TG1 and transcript TG1-001 with an empty novelty list.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        edges = talon.make_edge_dict(cursor)
        locations = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcripts = talon.make_transcript_dict(cursor, build)
        vertex_to_gene = talon.make_vertex_2_gene_dict(cursor)
        starts, ends = talon.make_gene_start_and_end_dict(cursor, build)

        coords = [1, 100, 500, 600, 900, 1000]
        result = talon.identify_transcript(
            "chr1", coords, "+", cursor, locations, edges, transcripts,
            vertex_to_gene, starts, ends, run_info)

        expected_gene = fetch_correct_ID("TG1", "gene", cursor)
        expected_transcript = fetch_correct_ID("TG1-001", "transcript", cursor)
        assert result['gene_ID'] == expected_gene
        assert result['transcript_ID'] == expected_transcript
        assert result['transcript_novelty'] == []
        conn.close()
Exemple #6
0
    def test_monoexonic_edge_case(self):
        """ Case observed during testing where start and end accidentally
            ended up being assigned to the same vertex. The start must
            resolve to vertex 3 and the end to vertex 4.

            The connection is closed in a finally clause so that it is
            released whether or not the assertions pass.
        """

        conn, cursor = get_db_cursor()
        try:
            build = "toy_build"
            database = "scratch/toy.db"
            location_dict = init_refs.make_location_dict(build, cursor)
            run_info = talon.init_run_info(database, build)

            chrom = "chr1"
            pos = [550, 610]
            run_info.cutoff_5p = 500
            run_info.cutoff_3p = 500
            strand = "+"

            # Start search: the other coordinate acts as the splice position
            start = pos[0]
            splice_pos = pos[1]
            start_match, start_diff = talon.permissive_vertex_search(
                chrom, start, strand, splice_pos, "start", location_dict,
                run_info)

            # End search: the two coordinates swap roles
            end = pos[1]
            splice_pos = pos[0]
            end_match, end_diff = talon.permissive_vertex_search(
                chrom, end, strand, splice_pos, "end", location_dict,
                run_info)

            assert start_match == 3
            assert end_match == 4
        finally:
            conn.close()
    def test_FSM_end_diff(self):
        """ Example where the transcript is an FSM but has a difference on
            the ends large enough to be novel.

            Checks that the transcript is assigned to gene TG2 and that no
            end delta is reported (end_delta is None).

            The connection is closed in a finally clause so that it is
            released even when one of the assertions fails.
        """
        conn, cursor = get_db_cursor()
        try:
            build = "toy_build"
            talon.make_temp_novel_gene_table(cursor, build)
            edge_dict = talon.make_edge_dict(cursor)
            location_dict = talon.make_location_dict(build, cursor)
            run_info = talon.init_run_info(cursor, build)
            transcript_dict = talon.make_transcript_dict(cursor, build)
            vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
            gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
                cursor, build)

            chrom = "chr2"
            strand = "+"
            positions = [1, 100, 500, 600, 900, 1500]

            annotation = talon.identify_transcript(chrom, positions, strand,
                                                   cursor, location_dict,
                                                   edge_dict, transcript_dict,
                                                   vertex_2_gene, gene_starts,
                                                   gene_ends, run_info)

            correct_gene_ID = fetch_correct_ID("TG2", "gene", cursor)
            assert annotation['gene_ID'] == correct_gene_ID
            # Identity check: None is a singleton
            assert annotation['end_delta'] is None
        finally:
            conn.close()
    def test_NIC_instead_of_ISM(self):
        """ A chr3 "+" transcript that looks like an ISM but has known starts
            and ends should be reported as NIC and assigned to gene TG5. """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        edges = talon.make_edge_dict(cursor)
        locations = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcripts = talon.make_transcript_dict(cursor, build)
        vertex_to_gene = talon.make_vertex_2_gene_dict(cursor)
        starts, ends = talon.make_gene_start_and_end_dict(cursor, build)

        coords = (800, 1000, 1200, 1400, 1600, 1800, 2000, 2200)
        result = talon.identify_transcript(
            "chr3", coords, "+", cursor, locations, edges, transcripts,
            vertex_to_gene, starts, ends, run_info)

        expected_gene = fetch_correct_ID("TG5", "gene", cursor)
        # The second-to-last field of each novelty entry is matched against
        # the expected label
        labels = [entry[-2] for entry in result['transcript_novelty']]
        assert result['gene_ID'] == expected_gene
        assert "NIC_transcript" in labels
        conn.close()
    def test_genomic_unspliced(self):
        """ A monoexonic chr1 fragment (1-990) overlapping gene 1 should be
            assigned to TG1, labelled genomic, and have an end delta of -10.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        talon.make_temp_monoexonic_transcript_table(cursor, build)
        edges = talon.make_edge_dict(cursor)
        locations = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcripts = talon.make_transcript_dict(cursor, build)
        vertex_to_gene = talon.make_vertex_2_gene_dict(cursor)
        starts, ends = talon.make_gene_start_and_end_dict(cursor, build)

        coords = (1, 990)
        result = talon.identify_monoexon_transcript(
            "chr1", coords, "+", cursor, locations, edges, transcripts,
            vertex_to_gene, starts, ends, run_info)

        expected_gene = fetch_correct_ID("TG1", "gene", cursor)
        labels = [entry[-2] for entry in result['transcript_novelty']]
        assert result['gene_ID'] == expected_gene
        assert "genomic_transcript" in labels
        assert result['end_delta'] == -10
        conn.close()
    def test_spliced_antisense(self):
        """ A chr2 "-" transcript whose vertices are all known should be
            reported as an antisense gene and antisense transcript, with
            start and end deltas of 0.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        edges = talon.make_edge_dict(cursor)
        locations = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcripts = talon.make_transcript_dict(cursor, build)
        vertex_to_gene = talon.make_vertex_2_gene_dict(cursor)
        starts, ends = talon.make_gene_start_and_end_dict(cursor, build)

        coords = [1000, 900, 600, 500, 100, 1]
        result = talon.identify_transcript(
            "chr2", coords, "-", cursor, locations, edges, transcripts,
            vertex_to_gene, starts, ends, run_info)

        # NOTE(review): this ID is fetched but never compared to anything
        anti_gene_ID = fetch_correct_ID("TG2", "gene", cursor)
        gene_labels = [entry[-2] for entry in result['gene_novelty']]
        transcript_labels = [entry[-2]
                             for entry in result['transcript_novelty']]
        first_gene_entry = result['gene_novelty'][0]
        assert first_gene_entry[-1] == "TRUE"
        assert "antisense_gene" in gene_labels
        assert "antisense_transcript" in transcript_labels
        assert result['start_delta'] == result['end_delta'] == 0
        conn.close()
    def test_NNC(self):
        """ A chr1 "+" transcript that skips an exon and uses a novel splice
            donor should be assigned to TG1 and labelled NNC, with start and
            end deltas of 0.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        edges = talon.make_edge_dict(cursor)
        locations = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcripts = talon.make_transcript_dict(cursor, build)
        vertex_to_gene = talon.make_vertex_2_gene_dict(cursor)
        starts, ends = talon.make_gene_start_and_end_dict(cursor, build)

        coords = [1, 50, 900, 1000]
        result = talon.identify_transcript(
            "chr1", coords, "+", cursor, locations, edges, transcripts,
            vertex_to_gene, starts, ends, run_info)

        expected_gene = fetch_correct_ID("TG1", "gene", cursor)
        labels = [entry[-2] for entry in result['transcript_novelty']]
        assert result['gene_ID'] == expected_gene
        assert "NNC_transcript" in labels
        assert result['start_delta'] == result['end_delta'] == 0
        conn.close()
Exemple #12
0
    def test_gene_update(self):
        """ Try to add novel gene entries to database while ignoring duplicates.

            A gene is created in the temp_gene table, that table is dumped to
            a TSV file, and batch_add_genes loads the file. Afterwards the
            genes table must contain exactly 7 rows, one of them with
            gene ID 7.

            The connection is closed in a finally clause so that it is
            released even when one of the assertions fails.
        """
        conn, cursor = get_db_cursor()
        try:
            build = "toy_build"
            database = "scratch/toy.db"
            run_info = talon.init_run_info(database, build)
            talon.get_counters(database)

            init_refs.make_temp_novel_gene_table(cursor, build)
            talon.create_gene("chr4", 1, 1000, "+", cursor, "temp_gene")

            # Write to file. os.makedirs avoids spawning a shell and raises
            # if the directory cannot be created.
            os.makedirs("scratch/db_updates", exist_ok=True)
            with open("scratch/db_updates/genes.tsv", 'w') as f:
                cursor.execute("SELECT gene_ID, strand FROM temp_gene")
                for entry in cursor.fetchall():
                    f.write("\t".join([str(x) for x in entry]) + "\n")

            talon.batch_add_genes(cursor, "scratch/db_updates/genes.tsv", 10)

            # Test if the gene with ID 7 is there, but make sure we didn't
            # add duplicates of the other genes
            query = "SELECT * FROM genes"
            gene_IDs = [x['gene_ID'] for x in cursor.execute(query)]
            assert 7 in gene_IDs
            assert len(gene_IDs) == 7
        finally:
            conn.close()
Exemple #13
0
    def test_NIC_match(self):
        """ Example where the transcript is an NIC match to an existing one by
            virtue of skipping an exon.

            Calls process_NIC with a single edge ID one past the current edge
            counter and checks the gene assignment, the returned vertex IDs,
            and that the resulting edge set is present in the transcript dict.

            The connection is closed in a finally clause so that it is
            released even when one of the assertions fails.
        """
        conn, cursor = get_db_cursor()
        try:
            build = "toy_build"
            database = "scratch/toy.db"
            talon.get_counters(database)
            edge_dict = init_refs.make_edge_dict(cursor)
            location_dict = init_refs.make_location_dict(build, cursor)
            run_info = talon.init_run_info(database, build)
            transcript_dict = init_refs.make_transcript_dict(cursor, build)
            vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
            gene_starts = init_refs.make_gene_start_or_end_dict(
                cursor, build, "start")
            gene_ends = init_refs.make_gene_start_or_end_dict(
                cursor, build, "end")

            chrom = "chr1"
            positions = [1, 100, 900, 1000]
            edge_IDs = [talon.edge_counter.value() + 1]
            vertex_IDs = [2, 5]
            strand = "+"

            gene_ID, transcript_ID, novelty, start_end_info = talon.process_NIC(
                chrom, positions, strand, edge_IDs, vertex_IDs,
                transcript_dict, gene_starts, gene_ends, edge_dict,
                location_dict, vertex_2_gene, run_info)

            correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
            assert gene_ID == correct_gene_ID
            assert start_end_info["vertex_IDs"] == [1, 2, 5, 6]
            edge_key = frozenset(start_end_info["edge_IDs"])
            assert transcript_dict[edge_key] is not None
        finally:
            conn.close()
Exemple #14
0
    def test_NNC_match(self):
        """ Example where the transcript is an NNC match to an existing one by
            virtue of a new splice donor.

            Calls process_NNC with an edge ID and a first vertex ID that are
            one past the current run_info counters, then checks the gene
            assignment, the returned vertex IDs, and that the resulting edge
            set is present in the transcript dict.

            The connection is closed in a finally clause so that it is
            released even when one of the assertions fails.
        """
        conn, cursor = get_db_cursor()
        try:
            build = "toy_build"
            edge_dict = talon.make_edge_dict(cursor)
            location_dict = talon.make_location_dict(build, cursor)
            run_info = talon.init_run_info(cursor, build)
            transcript_dict = talon.make_transcript_dict(cursor, build)
            vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
            gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
                cursor, build)

            chrom = "chr1"
            positions = [1, 110, 900, 1000]
            edge_IDs = [run_info.edge + 1]
            vertex_IDs = [run_info.vertex + 1, 5]
            strand = "+"

            (gene_ID, transcript_ID, transcript_novelty,
             start_end_info) = talon.process_NNC(
                chrom, positions, strand, edge_IDs, vertex_IDs,
                transcript_dict, gene_starts, gene_ends, edge_dict,
                location_dict, vertex_2_gene, run_info)

            correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
            assert gene_ID == correct_gene_ID
            # The supplied vertices should be flanked by vertices 1 and 6
            assert start_end_info["vertex_IDs"] == [1] + vertex_IDs + [6]
            edge_key = frozenset(start_end_info["edge_IDs"])
            assert transcript_dict[edge_key] is not None
        finally:
            conn.close()
Exemple #15
0
    def test_NIC(self):
        """ A chr1 "+" transcript that skips an exon should be assigned to TG1
            and labelled NIC, with start and end deltas of 0.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)

        init_refs.make_temp_novel_gene_table(cursor, build)
        edges = init_refs.make_edge_dict(cursor)
        locations = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        transcripts = init_refs.make_transcript_dict(cursor, build)
        vertex_to_gene = init_refs.make_vertex_2_gene_dict(cursor)
        starts, ends = init_refs.make_gene_start_and_end_dict(cursor, build)

        coords = (1, 100, 900, 1000)
        # The final argument names the temporary gene table created above
        result = talon.identify_transcript(
            "chr1", coords, "+", cursor, locations, edges, transcripts,
            vertex_to_gene, starts, ends, run_info, "temp_gene")

        expected_gene = fetch_correct_ID("TG1", "gene", cursor)
        labels = [entry[-2] for entry in result['transcript_novelty']]
        assert result['gene_ID'] == expected_gene
        assert "NIC_transcript" in labels
        assert result['start_delta'] == result['end_delta'] == 0
        conn.close()
    def test_FSM_start_diff(self):
        """ process_FSM on a chr1 "-" transcript whose start differs enough to
            count as novel: the TG3 / TG3-001 match is kept, the start vertex
            is a newly numbered one, and the end vertex is 5.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        edges = talon.make_edge_dict(cursor)
        locations = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcripts = talon.make_transcript_dict(cursor, build)
        vertex_count_before = run_info['vertex']
        starts, ends = talon.make_gene_start_and_end_dict(cursor, build)

        strand = "-"
        chrom = "chr1"
        coords = [2501, 1500, 1000, 900]  # First position is > 500bp away
        edge_IDs = [7]
        vertex_IDs = [7, 6]
        v_novelty = [0, 0]

        candidates = talon.search_for_ISM(edge_IDs, transcripts)

        gene_ID, transcript_ID, novelty, start_end_info = talon.process_FSM(
            chrom, coords, strand, edge_IDs, vertex_IDs, candidates,
            starts, ends, edges, locations, run_info)

        expected_gene = fetch_correct_ID("TG3", "gene", cursor)
        expected_transcript = fetch_correct_ID("TG3-001", "transcript", cursor)
        assert gene_ID == expected_gene
        assert transcript_ID == expected_transcript
        # The start vertex should be one past the pre-existing vertex count
        assert start_end_info["start_vertex"] == vertex_count_before + 1
        assert start_end_info["end_vertex"] == 5
        conn.close()
    def test_antisense(self):
        """ All six vertices exist in the database but none of the "-" strand
            edges between them do, so five new edge IDs are expected. """

        conn, cursor = get_db_cursor()
        build = "toy_build"
        edge_dict = talon.make_edge_dict(cursor)
        run_info = talon.init_run_info(cursor, build)
        n_known_edges = len(edge_dict)
        # The database handle is not needed beyond this point
        conn.close()

        chrom = "chr2"
        strand = "-"
        vertex_IDs = [14, 13, 12, 11, 10, 9]

        edge_IDs, novelty = talon.match_all_transcript_edges(
            vertex_IDs, strand, edge_dict, run_info)

        # New edges are expected to be numbered consecutively after the
        # edges that were already known
        expected_edges = [n_known_edges + offset for offset in range(1, 6)]

        assert edge_IDs == tuple(expected_edges)
        assert novelty == (1, 1, 1, 1, 1)
    def test_no_match(self):
        """ Example with no FSM match: process_FSM must return None for both
            the gene ID and the transcript ID.

            The connection is closed in a finally clause so that it is
            released even when the assertion fails.
        """

        conn, cursor = get_db_cursor()
        try:
            build = "toy_build"
            edge_dict = talon.make_edge_dict(cursor)
            location_dict = talon.make_location_dict(build, cursor)
            run_info = talon.init_run_info(cursor, build)
            transcript_dict = talon.make_transcript_dict(cursor, build)
            gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
                cursor, build)

            chrom = "chr1"
            positions = [1, 100, 500, 600]
            strand = "+"
            edge_IDs = [2]
            vertex_IDs = [2, 3, 4, 5]

            all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)

            gene_ID, transcript_ID, novelty, start_end_info = talon.process_FSM(
                chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
                gene_starts, gene_ends, edge_dict, location_dict, run_info)

            # Identity checks: None is a singleton
            assert gene_ID is None
            assert transcript_ID is None
        finally:
            conn.close()
    def test_FSM_perfect(self):
        """ process_FSM on a chr1 "+" transcript that should match TG1 /
            TG1-001 with no novelty, start vertex 1, end vertex 6, and a
            3' difference of 10.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        edges = talon.make_edge_dict(cursor)
        locations = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcripts = talon.make_transcript_dict(cursor, build)
        starts, ends = talon.make_gene_start_and_end_dict(cursor, build)

        strand = "+"
        chrom = "chr1"
        coords = [1, 100, 500, 600, 900, 1010]
        edge_IDs = [2, 3, 4]
        vertex_IDs = [2, 3, 4, 5]
        v_novelty = [0, 0, 0, 0]

        candidates = talon.search_for_ISM(edge_IDs, transcripts)

        gene_ID, transcript_ID, novelty, start_end_info = talon.process_FSM(
            chrom, coords, strand, edge_IDs, vertex_IDs, candidates,
            starts, ends, edges, locations, run_info)

        expected_gene = fetch_correct_ID("TG1", "gene", cursor)
        expected_transcript = fetch_correct_ID("TG1-001", "transcript", cursor)
        assert gene_ID == expected_gene
        assert transcript_ID == expected_transcript
        assert novelty == []
        assert start_end_info["start_vertex"] == 1
        assert start_end_info["end_vertex"] == 6
        assert start_end_info["diff_3p"] == 10
        conn.close()
Exemple #20
0
    def test_partial_match_3prime(self):
        """ Example where the transcript is short, so it overlaps the
            annotated transcript but is not an accepted match.

            The assertions require gene TG6, no start delta (start_delta is
            None) and an end delta of -100.
            NOTE(review): presumably this means the end is assigned to the
            annotated end while the start is novel - confirm.

            The connection is closed in a finally clause so that it is
            released even when one of the assertions fails.
        """

        conn, cursor = get_db_cursor()
        try:
            build = "toy_build"
            talon.make_temp_novel_gene_table(cursor, build)
            talon.make_temp_monoexonic_transcript_table(cursor, build)
            edge_dict = talon.make_edge_dict(cursor)
            location_dict = talon.make_location_dict(build, cursor)
            run_info = talon.init_run_info(cursor, build)
            transcript_dict = talon.make_transcript_dict(cursor, build)
            vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
            gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
                cursor, build)

            chrom = "chr4"
            strand = "-"
            positions = (2000, 1100)

            annotation = talon.identify_monoexon_transcript(
                chrom, positions, strand, cursor, location_dict, edge_dict,
                transcript_dict, vertex_2_gene, gene_starts, gene_ends,
                run_info)

            correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor)
            assert annotation['gene_ID'] == correct_gene_ID
            # Identity check: None is a singleton
            assert annotation['start_delta'] is None
            assert annotation['end_delta'] == -100
        finally:
            conn.close()
Exemple #21
0
    def test_antisense(self):
        """ A monoexonic chr4 "+" transcript (1300-3900) should be reported as
            an antisense gene and antisense transcript. """

        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        talon.make_temp_monoexonic_transcript_table(cursor, build)
        edges = talon.make_edge_dict(cursor)
        locations = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcripts = talon.make_transcript_dict(cursor, build)
        vertex_to_gene = talon.make_vertex_2_gene_dict(cursor)
        starts, ends = talon.make_gene_start_and_end_dict(cursor, build)

        coords = (1300, 3900)
        result = talon.identify_monoexon_transcript(
            "chr4", coords, "+", cursor, locations, edges, transcripts,
            vertex_to_gene, starts, ends, run_info)

        # NOTE(review): this ID is fetched but never compared to anything
        anti_gene_ID = fetch_correct_ID("TG6", "gene", cursor)
        gene_labels = [entry[-2] for entry in result['gene_novelty']]
        transcript_labels = [entry[-2]
                             for entry in result['transcript_novelty']]
        first_gene_entry = result['gene_novelty'][0]
        assert first_gene_entry[-1] == "TRUE"
        assert "antisense_gene" in gene_labels
        assert "antisense_transcript" in transcript_labels

        conn.close()
Exemple #22
0
    def test_match(self):
        """ A monoexonic chr4 "-" transcript (3900-1100) should be assigned to
            gene TG6 with a start delta of 100 and an end delta of -100.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        talon.make_temp_novel_gene_table(cursor, build)
        talon.make_temp_monoexonic_transcript_table(cursor, build)
        edges = talon.make_edge_dict(cursor)
        locations = talon.make_location_dict(build, cursor)
        run_info = talon.init_run_info(cursor, build)
        transcripts = talon.make_transcript_dict(cursor, build)
        vertex_to_gene = talon.make_vertex_2_gene_dict(cursor)
        starts, ends = talon.make_gene_start_and_end_dict(cursor, build)

        coords = (3900, 1100)
        result = talon.identify_monoexon_transcript(
            "chr4", coords, "-", cursor, locations, edges, transcripts,
            vertex_to_gene, starts, ends, run_info)

        expected_gene = fetch_correct_ID("TG6", "gene", cursor)
        # NOTE(review): the transcript ID is fetched but never asserted;
        # confirm whether a transcript_ID check was intended
        expected_transcript = fetch_correct_ID("TG6-001", "transcript", cursor)
        assert result['gene_ID'] == expected_gene
        assert result['start_delta'] == 100
        assert result['end_delta'] == -100

        conn.close()
    def test_no_match(self):
        """ Example with no ISM match: search_for_ISM must return None for an
            edge ID (200) that is expected to be absent from the transcript
            dict.

            The connection is closed in a finally clause so that it is
            released even when the assertion fails.
        """

        conn, cursor = get_db_cursor()
        try:
            build = "toy_build"
            database = "scratch/toy.db"
            run_info = talon.init_run_info(database, build)
            talon.get_counters(database)

            edge_dict = init_refs.make_edge_dict(cursor)
            location_dict = init_refs.make_location_dict(build, cursor)
            transcript_dict = init_refs.make_transcript_dict(cursor, build)
            gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
                cursor, build)

            edge_IDs = [200]

            all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
            # Identity check: None is a singleton
            assert all_matches is None
        finally:
            conn.close()
    def test_antisense(self):
        """ All four vertices exist in the database but none of the "-" strand
            edges between them do, so three new edge IDs are expected. """

        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)
        edge_dict = init_refs.make_edge_dict(cursor)
        run_info = talon.init_run_info(database, build)
        n_known_edges = len(edge_dict)
        # The database handle is not needed beyond this point
        conn.close()

        chrom = "chr2"
        strand = "-"
        vertex_IDs = [13, 12, 11, 10]

        edge_IDs, novelty = talon.match_all_splice_edges(
            vertex_IDs, strand, edge_dict, run_info)

        # New edges are expected to be numbered consecutively after the
        # edges that were already known
        expected_edges = [n_known_edges + offset for offset in range(1, 4)]

        assert edge_IDs == expected_edges
        assert novelty == [1, 1, 1]
    def test_ISM_prefix(self):
        """ process_ISM on a prefix ISM with a novel start: the chr1 "+"
            transcript should be assigned to TG1, with vertex IDs 1-4 and
            edge IDs 1-3 in the returned start/end info.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        run_info = talon.init_run_info(database, build)
        talon.get_counters(database)

        edges = init_refs.make_edge_dict(cursor)
        locations = init_refs.make_location_dict(build, cursor)
        transcripts = init_refs.make_transcript_dict(cursor, build)
        starts, ends = init_refs.make_gene_start_and_end_dict(cursor, build)

        chrom, strand = "chr1", "+"
        coords = [1, 100, 500, 600]
        edge_IDs = [2]
        vertex_IDs = [2, 3]
        v_novelty = [0, 0]

        candidates = talon.search_for_ISM(edge_IDs, transcripts)
        gene_ID, transcript_ID, novelty, start_end_info = talon.process_ISM(
            chrom, coords, strand, edge_IDs, vertex_IDs, candidates,
            transcripts, starts, ends, edges, locations, run_info)

        expected_gene = fetch_correct_ID("TG1", "gene", cursor)
        assert gene_ID == expected_gene
        assert start_end_info["vertex_IDs"] == [1, 2, 3, 4]
        assert start_end_info["edge_IDs"] == [1, 2, 3]
        conn.close()
    def test_ISM_internal(self):
        """ Example where the transcript matches an internal exon.

            Calls identify_monoexon_transcript for the interval (500, 600) on
            chr1 (+). The annotation must carry the gene ID that
            fetch_correct_ID reports for "TG1", include an "ISM_transcript"
            novelty entry, and have start and end deltas of 0.
        """
        conn, cursor = get_db_cursor()
        # The finally clause closes the connection even when a call or an
        # assertion below raises, so a failing test does not leak the handle.
        try:
            build = "toy_build"
            talon.make_temp_novel_gene_table(cursor, build)
            talon.make_temp_monoexonic_transcript_table(cursor, build)
            edge_dict = talon.make_edge_dict(cursor)
            location_dict = talon.make_location_dict(build, cursor)
            run_info = talon.init_run_info(cursor, build)
            transcript_dict = talon.make_transcript_dict(cursor, build)
            vertex_2_gene = talon.make_vertex_2_gene_dict(cursor)
            gene_starts, gene_ends = talon.make_gene_start_and_end_dict(
                cursor, build)

            chrom = "chr1"
            strand = "+"
            positions = (500, 600)

            annotation = talon.identify_monoexon_transcript(
                chrom, positions, strand, cursor, location_dict, edge_dict,
                transcript_dict, vertex_2_gene, gene_starts, gene_ends,
                run_info)

            correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
            # The second-to-last field of each transcript novelty record is
            # compared against the novelty type name.
            novelty_types = [x[-2] for x in annotation['transcript_novelty']]
            assert annotation['gene_ID'] == correct_gene_ID
            assert "ISM_transcript" in novelty_types
            assert annotation['start_delta'] == annotation['end_delta'] == 0
        finally:
            conn.close()
Exemple #27
0
    def test_antisense(self):
        """ Example where the vertices are known but there is no same-strand
            match.

            Calls process_spliced_antisense for a "-" strand transcript on
            chr1 with positions [1000, 900, 100, 1] and vertex_IDs [5, 2].
            The last field of the last gene novelty record must equal the ID
            that fetch_correct_ID reports for gene "TG1", and the start/end
            info must list vertices [6, 5, 2, 1].
        """
        conn, cursor = get_db_cursor()
        # The finally clause closes the connection even when a call or an
        # assertion below raises, so a failing test does not leak the handle.
        try:
            build = "toy_build"
            database = "scratch/toy.db"
            talon.get_counters(database)
            edge_dict = init_refs.make_edge_dict(cursor)
            locations = init_refs.make_location_dict(build, cursor)
            run_info = talon.init_run_info(database, build)
            transcript_dict = init_refs.make_transcript_dict(cursor, build)
            vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
            gene_starts = init_refs.make_gene_start_or_end_dict(
                cursor, build, "start")
            gene_ends = init_refs.make_gene_start_or_end_dict(
                cursor, build, "end")

            # Construct temp novel gene db
            init_refs.make_temp_novel_gene_table(cursor, build)

            chrom = "chr1"
            # Uses the ID one past the current edge counter value
            # (presumably an edge ID that is not in the database yet).
            edge_IDs = [talon.edge_counter.value() + 1]
            positions = [1000, 900, 100, 1]
            vertex_IDs = [5, 2]
            strand = "-"

            # Find antisense match. Only the gene novelty records and the
            # start/end info are checked; the other three are ignored.
            _, _, gene_novelty, _, start_end_info = \
                talon.process_spliced_antisense(
                    chrom, positions, strand, edge_IDs, vertex_IDs,
                    transcript_dict, gene_starts, gene_ends, edge_dict,
                    locations, vertex_2_gene, run_info, cursor, "temp_gene")

            correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
            anti_gene_ID = gene_novelty[-1][-1]
            assert anti_gene_ID == correct_gene_ID
            assert start_end_info["vertex_IDs"] == [6, 5, 2, 1]
        finally:
            conn.close()
    def test_no_match(self):
        """ Example where the supplied interval should not match anything.

            Queries search_for_overlap_with_gene for chr1:3000-4000 (+)
            against the "temp_gene" table, once with the coordinates in
            ascending order and once flipped. Both calls must return a gene
            ID of None.
        """
        conn, cursor = get_db_cursor()
        # The finally clause closes the connection even when a call or an
        # assertion below raises, so a failing test does not leak the handle.
        try:
            build = "toy_build"
            database = "scratch/toy.db"
            run_info = talon.init_run_info(database, build,
                                           tmp_dir="scratch/tmp/")
            init_refs.make_temp_novel_gene_table(cursor, build)

            chrom = "chr1"
            pos = [3000, 4000]
            strand = "+"
            gene_ID, _ = talon.search_for_overlap_with_gene(
                chrom, pos[0], pos[1], strand, cursor, run_info, "temp_gene")
            assert gene_ID is None

            # Should get same results for flipped interval
            gene_ID, _ = talon.search_for_overlap_with_gene(
                chrom, pos[1], pos[0], strand, cursor, run_info, "temp_gene")
            assert gene_ID is None
        finally:
            conn.close()
Exemple #29
0
    def test_datasets(self):
        """ Try to add dataset metadata to database.

            Passes one dataset tuple to add_datasets and then checks that
            the dataset table holds exactly one row.
        """
        conn, cursor = get_db_cursor()
        # The finally clause closes the connection even when a call or the
        # assertion below raises, so a failing test does not leak the handle.
        try:
            datasets = [(1, "toy", "toy", "toy")]
            talon.add_datasets(cursor, datasets)

            # Test if items are there
            query = "SELECT * FROM dataset"
            cursor.execute(query)
            assert len(cursor.fetchall()) == 1
        finally:
            conn.close()
    def test_no_match(self):
        """ Example where no match exists.

            Looks up vertex IDs (1000, 2000, 3000, 4000) on the "+" strand
            with find_gene_match_on_vertex_basis and expects None back
            (presumably because the toy database has no such vertices).
        """
        conn, cursor = get_db_cursor()
        # The finally clause closes the connection even when a call or the
        # assertion below raises, so a failing test does not leak the handle.
        try:
            build = "toy_build"
            talon.make_temp_novel_gene_table(cursor, build)
            vertex2gene = talon.make_vertex_2_gene_dict(cursor)

            vertex_IDs = (1000, 2000, 3000, 4000)
            strand = "+"

            gene_ID = talon.find_gene_match_on_vertex_basis(
                vertex_IDs, strand, vertex2gene)

            assert gene_ID is None
        finally:
            conn.close()