Example #1
    def test_NIC_instead_of_ISM(self):
        """ Test case where the transcript looks like an ISM, but is NIC on
            account of having known starts and ends """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)

        init_refs.make_temp_novel_gene_table(cursor, build)
        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr3"
        strand = "+"
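        # The junction chain matches part of an annotated transcript (ISM-like),
        # but the start and end coincide with annotated gene starts/ends, so the
        # expected call is NIC rather than ISM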
        positions = (800, 1000, 1200, 1400, 1600, 1800, 2000, 2200)

        annotation = talon.identify_transcript(chrom, positions, strand,
                                               cursor, location_dict,
                                               edge_dict, transcript_dict,
                                               vertex_2_gene, gene_starts,
                                               gene_ends, run_info, "temp_gene")

        correct_gene_ID = fetch_correct_ID("TG5", "gene", cursor)
        novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_ID'] == correct_gene_ID
        assert "NIC_transcript" in novelty_types
        conn.close()

    def test_ISM_prefix(self):
        """ Example where the transcript is a prefix ISM with a novel start
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        run_info = talon.init_run_info(database, build)
        talon.get_counters(database)

        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        strand = "+"
        positions = [1, 100, 500, 600]
        edge_IDs = [2]
        vertex_IDs = [2, 3]
        v_novelty = [0, 0]
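        # Edge 2 (vertices 2-3) is an annotated TG1 junction; the read covers
        # only the 5' portion of the transcript, so a prefix ISM is expected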

        all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
        gene_ID, transcript_ID, novelty, start_end_info = talon.process_ISM(
            chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
            transcript_dict, gene_starts, gene_ends, edge_dict, location_dict,
            run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        assert gene_ID == correct_gene_ID
        assert start_end_info["vertex_IDs"] == [1, 2, 3, 4]
        assert start_end_info["edge_IDs"] == [1, 2, 3]
        conn.close()
Example #3
    def test_antisense(self):
        """ Example where the transcript is antisense """

        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)
        init_refs.make_temp_novel_gene_table(cursor, build)
        init_refs.make_temp_monoexonic_transcript_table(cursor, build)
        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr4"
        strand = "+"
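        # Single-exon read on the + strand of chr4; the overlapping toy gene
        # TG6 is annotated on the - strand, so an antisense call is expected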
        positions = (1300, 3900)

        annotation = talon.identify_monoexon_transcript(
            chrom, positions, strand, cursor, location_dict, edge_dict,
            transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info,
            'temp_gene', 'temp_monoexon')

        anti_gene_ID = fetch_correct_ID("TG6", "gene", cursor)
        gene_novelty_types = [x[-2] for x in annotation['gene_novelty']]
        t_novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_novelty'][0][-1] == "TRUE"
        assert "antisense_gene" in gene_novelty_types
        assert "antisense_transcript" in t_novelty_types

        conn.close()
Example #4
    def test_match(self):
        """ Example where the transcript is a monoexonic match.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)
        init_refs.make_temp_novel_gene_table(cursor, build)
        init_refs.make_temp_monoexonic_transcript_table(cursor, build)
        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
        gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start")
        gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

        chrom = "chr4"
        strand = "-"
        positions = ( 3900, 1100 )
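        # Single-exon read matching TG6-001, with each end offset by 100 bp
        # from the annotated boundaries (checked via the delta assertions below)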

        annotation = talon.identify_monoexon_transcript(chrom, positions, 
                                               strand, cursor,
                                               location_dict, edge_dict,
                                               transcript_dict, vertex_2_gene,
                                               gene_starts, gene_ends, run_info,
                                               'temp_gene', 'temp_monoexon')

        correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor)
        correct_transcript_ID = fetch_correct_ID("TG6-001", "transcript", cursor)
        assert annotation['gene_ID'] == correct_gene_ID
        assert annotation['start_delta'] == 100
        assert annotation['end_delta'] == -100

        conn.close()

    def test_no_match(self):
        """ Example with no ISM match """

        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        run_info = talon.init_run_info(database, build)
        talon.get_counters(database)

        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        strand = "+"
        positions = [1, 100, 900, 1000]
        edge_IDs = [200]
        vertex_IDs = [2, 5]
        v_novelty = [0, 0]

        all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
        assert all_matches is None
        conn.close()
Example #6
    def test_gene_update(self):
        """ Try to add novel gene entries to database while ignoring duplicates
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        run_info = talon.init_run_info(database, build)
        talon.get_counters(database)

        init_refs.make_temp_novel_gene_table(cursor, build)
        talon.create_gene("chr4", 1, 1000, "+", cursor, "temp_gene")
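        # The toy database already contains six genes; this adds a seventh to
        # the temporary table only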

        # Write to file
        os.system("mkdir -p scratch/db_updates/")
        with open("scratch/db_updates/genes.tsv", 'w') as f:
            cursor.execute("SELECT gene_ID, strand FROM temp_gene")
            for entry in cursor.fetchall():
                f.write("\t".join([str(x) for x in entry]) + "\n")

        talon.batch_add_genes(cursor, "scratch/db_updates/genes.tsv", 10)

        # Test if gene with ID 7 is there, but make sure we didn't add
        # duplicates of the other genes
        query = "SELECT * FROM genes"
        gene_IDs = [x['gene_ID'] for x in cursor.execute(query)]
        assert 7 in gene_IDs
        assert len(gene_IDs) == 7
        conn.close()
    def test_with_novel_location(self):
        """ Example where the toy transcript database contains a novel position.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        orig_vertex_count = talon.vertex_counter.value()
        orig_n_locations = len(location_dict["chr1"])
        conn.close()

        chrom = "chr1"
        strand = "+"
        pos = [1, 150, 500, 600, 900, 1000]
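        # Interior positions 500, 600 and 900 are annotated splice sites in the
        # toy build; 150 is not, so a new vertex should be created for it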
        vertex_IDs, novelty = talon.match_splice_vertices(
            chrom, pos, strand, location_dict, run_info)

        # The novel position should have been assigned a brand-new vertex ID,
        # while the known positions should match existing vertices
        new_vertex_count = talon.vertex_counter.value()
        assert vertex_IDs == [new_vertex_count, 3, 4, 5]

        # Make sure the data structures got updated
        assert new_vertex_count == orig_vertex_count + 1
        assert len(location_dict["chr1"]) == orig_n_locations + 1
Example #8
    def test_NNC_match(self):
        """ Example where the transcript is an NNC match to an existing one by
            virtue of a new splice donor.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)
        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
        gene_starts = init_refs.make_gene_start_or_end_dict(
            cursor, build, "start")
        gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

        chrom = "chr1"
        positions = [1, 110, 900, 1000]
        edge_IDs = [talon.edge_counter.value() + 1]
        vertex_IDs = [talon.vertex_counter.value() + 1, 5]
        strand = "+"
        v_novelty = [0, 0]
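        # Position 110 is a novel splice donor (the annotated donor is at 100),
        # so the read gets a brand-new vertex and edge ID; 900 matches vertex 5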

        gene_ID, transcript_ID, transcript_novelty, start_end_info = talon.process_NNC(
            chrom, positions, strand, edge_IDs, vertex_IDs, transcript_dict,
            gene_starts, gene_ends, edge_dict, location_dict, vertex_2_gene,
            run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        assert gene_ID == correct_gene_ID
        assert start_end_info["vertex_IDs"] == [1] + vertex_IDs + [6]
        assert transcript_dict[frozenset(start_end_info["edge_IDs"])] is not None
        conn.close()
Example #9
    def test_no_match(self):
        """ Example with no FSM match """

        conn, cursor = get_db_cursor()
        build = "toy_build"
        db = "scratch/toy.db"
        talon.get_counters(db)

        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(db, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        gene_starts = init_refs.make_gene_start_or_end_dict(
            cursor, build, "start")
        gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

        chrom = "chr1"
        positions = [1, 100, 500, 600]
        strand = "+"
        edge_IDs = [2]
        vertex_IDs = [2, 3, 4, 5]
        v_novelty = [0, 0, 0, 0]

        all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)

        gene_ID, transcript_ID, novelty, start_end_info = talon.process_FSM(
            chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
            gene_starts, gene_ends, edge_dict, location_dict, run_info)

        assert gene_ID is None and transcript_ID is None
        conn.close()
Example #10
    def test_FSM_perfect(self):
        """ Example where the transcript is a perfect full splice match.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)
        init_refs.make_temp_novel_gene_table(cursor, build)
        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        strand = "+"
        positions = [1, 100, 500, 600, 900, 1000]

        annotation = talon.identify_transcript(chrom, positions, strand,
                                               cursor, location_dict,
                                               edge_dict, transcript_dict,
                                               vertex_2_gene, gene_starts,
                                               gene_ends, run_info,
                                               "temp_gene")

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        correct_transcript_ID = fetch_correct_ID("TG1-001", "transcript",
                                                 cursor)
        assert annotation['gene_ID'] == correct_gene_ID
        assert annotation['transcript_ID'] == correct_transcript_ID
        assert annotation['transcript_novelty'] == []
        conn.close()
Example #11
    def test_FSM_end_diff(self):
        """ Example where the transcript is an FSM but has a difference on
            the ends large enough to be novel.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)

        init_refs.make_temp_novel_gene_table(cursor, build)
        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr2"
        strand = "+"
        positions = [1, 100, 500, 600, 900, 1500]

        annotation = talon.identify_transcript(chrom, positions, strand,
                                               cursor, location_dict,
                                               edge_dict, transcript_dict,
                                               vertex_2_gene, gene_starts,
                                               gene_ends, run_info,
                                               "temp_gene")

        correct_gene_ID = fetch_correct_ID("TG2", "gene", cursor)
        novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_ID'] == correct_gene_ID
        assert annotation['end_delta'] is None
        conn.close()
Example #12
    def test_genomic_unspliced(self):
        """ Monoexonic fragment that overlaps gene 1 """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)

        init_refs.make_temp_novel_gene_table(cursor, build)
        init_refs.make_temp_monoexonic_transcript_table(cursor, build)
        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        strand = "+"
        positions = (1, 990)
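        # Single-exon fragment spanning 1-990 within TG1; with no splice
        # junctions, the expected call is genomic_transcript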

        annotation = talon.identify_monoexon_transcript(
            chrom, positions, strand, cursor, location_dict, edge_dict,
            transcript_dict, vertex_2_gene, gene_starts, gene_ends, run_info,
            "temp_gene", "temp_monoexon")

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_ID'] == correct_gene_ID
        assert "genomic_transcript" in novelty_types
        assert annotation['end_delta'] == -10
        conn.close()
    def test_antisense(self):
        """ Example where all of the vertices are in the database, but the edges
            are not, because they are antisense to the original transcript """

        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)
        edge_dict = init_refs.make_edge_dict(cursor)
        run_info = talon.init_run_info(database, build)
        orig_n_edges = len(edge_dict)
        conn.close()

        chrom = "chr2"
        vertex_IDs = [13, 12, 11, 10]
        strand = "-"
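        # The vertices are annotated, but the connecting edges exist only on the
        # + strand, so three brand-new edge IDs are expected (novelty 1 each)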

        edge_IDs, novelty = talon.match_all_splice_edges(
            vertex_IDs, strand, edge_dict, run_info)
        expected_edges = [orig_n_edges + i for i in range(1, 4)]

        assert edge_IDs == expected_edges
        assert novelty == [1, 1, 1]
Example #14
    def test_NNC(self):
        """ Example where the transcript skips an exon and has a novel splice
            donor
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)

        init_refs.make_temp_novel_gene_table(cursor, build)
        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr1"
        strand = "+"
        positions = [1, 50, 900, 1000]
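        # The donor at position 50 is novel and the junction skips the middle
        # exon (500-600), so the expected call is NNC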

        annotation = talon.identify_transcript(chrom, positions, strand,
                                               cursor, location_dict,
                                               edge_dict, transcript_dict,
                                               vertex_2_gene, gene_starts,
                                               gene_ends, run_info,
                                               "temp_gene")

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_ID'] == correct_gene_ID
        assert "NNC_transcript" in novelty_types
        assert annotation['start_delta'] == annotation['end_delta'] == 0
        conn.close()
Example #15
    def test_antisense(self):
        """ Example where the vertices are known but there is no same-strand 
            match """

        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)
        edge_dict = init_refs.make_edge_dict(cursor)
        locations = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
        gene_starts = init_refs.make_gene_start_or_end_dict(
            cursor, build, "start")
        gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

        # Construct temp novel gene db
        init_refs.make_temp_novel_gene_table(cursor, "toy_build")

        chrom = "chr1"
        start = 1000
        end = 1
        edge_IDs = [talon.edge_counter.value() + 1]
        positions = [1000, 900, 100, 1]
        vertex_IDs = [5, 2]
        strand = "-"
        anti_strand = "+"
        v_novelty = (0, 0, 0, 0)
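        # Vertices 5 and 2 (positions 900 and 100) belong to TG1 on the +
        # strand; there is no same-strand match on the - strand, so the read
        # should come back antisense to TG1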

        # Find antisense match
        gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \
                                      talon.process_spliced_antisense(chrom, positions,
                                                                  strand, edge_IDs,
                                                                  vertex_IDs,
                                                                  transcript_dict,
                                                                  gene_starts,
                                                                  gene_ends,
                                                                  edge_dict, locations,
                                                                  vertex_2_gene, run_info,
                                                                  cursor, "temp_gene")
        #anti_gene_ID = talon.find_gene_match_on_vertex_basis(vertex_IDs,
        #                                                     anti_strand,
        #                                                     vertex_2_gene)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        anti_gene_ID = gene_novelty[-1][-1]
        assert anti_gene_ID == correct_gene_ID
        assert start_end_info["vertex_IDs"] == [6, 5, 2, 1]

        conn.close()
Example #16
    def test_transcript_update(self):
        """ Try to add novel transcript entries to database while ignoring 
            duplicates
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        database = "scratch/toy.db"
        talon.get_counters(database)
        talon.create_transcript("chr1", 1, 1000, 1, (1, ), (1, 2),
                                transcript_dict)

        # Write to file
        os.system("mkdir -p scratch/db_updates/")
        with open("scratch/db_updates/transcripts.tsv", 'w') as f:
            for transcript in transcript_dict.values():
                if type(transcript) is dict:
                    entry = "\t".join([
                        str(x) for x in (transcript['transcript_ID'],
                                         transcript['gene_ID'],
                                         transcript['start_exon'],
                                         transcript['jn_path'],
                                         transcript['end_exon'],
                                         transcript['start_vertex'],
                                         transcript['end_vertex'],
                                         transcript['n_exons'])
                    ])
                    f.write(entry + "\n")

        batch_size = 5
        talon.batch_add_transcripts(cursor,
                                    "scratch/db_updates/transcripts.tsv",
                                    batch_size)

        # Test if transcript with ID 8 is there, but make sure we didn't add
        # duplicates of the others
        query = "SELECT * FROM transcripts"
        cursor.execute(query)
        transcripts = cursor.fetchall()
        transcript_IDs = [x['transcript_ID'] for x in transcripts]
        assert 8 in transcript_IDs
        assert len(transcript_IDs) == 8

        # Test if None value was handled correctly
        for transcript in transcripts:
            if transcript['transcript_ID'] == 8:
                assert transcript['jn_path'] is None

        conn.close()
Example #17
    def test_intergenic(self):
        """ Example where the transcript is an NIC match to an existing one by
            virtue of a new splice donor.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)
        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
        gene_starts = init_refs.make_gene_start_or_end_dict(
            cursor, build, "start")
        gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")
        correct_gene_ID = talon.gene_counter.value() + 1

        # Construct temp novel gene db
        init_refs.make_temp_novel_gene_table(cursor, "toy_build")

        chrom = "chrX"
        positions = [1, 100, 900, 1000]
        edge_IDs = [
            talon.edge_counter.value() + 1,
            talon.edge_counter.value() + 2
        ]
        vertex_IDs = [
            talon.vertex_counter.value() + 1,
            talon.vertex_counter.value() + 2
        ]
        strand = "+"
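        # No annotated toy gene overlaps this chrX region, so both the vertex
        # and edge IDs are brand new and an intergenic novel gene is expected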

        gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \
                             talon.process_remaining_mult_cases(chrom, positions,
                                                                strand, edge_IDs,
                                                                vertex_IDs,
                                                                transcript_dict,
                                                                gene_starts, gene_ends,
                                                                edge_dict, location_dict,
                                                                vertex_2_gene, run_info,
                                                                cursor, "temp_gene")

        assert gene_ID == correct_gene_ID
        assert transcript_dict[frozenset(start_end_info["edge_IDs"])] is not None
        assert gene_novelty[0][-2] == "intergenic_novel"
        conn.close()
Example #18
    def test_genomic(self):
        """ Example where the transcript overlaps a gene but contains no known
            splice vertices
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)
        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
        gene_starts = init_refs.make_gene_start_or_end_dict(
            cursor, build, "start")
        gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

        # Construct temp novel gene db
        init_refs.make_temp_novel_gene_table(cursor, "toy_build")

        chrom = "chr1"
        positions = [1000, 950, 700, 600]
        edge_IDs = [
            talon.edge_counter.value() + 1,
            talon.edge_counter.value() + 2
        ]
        vertex_IDs = [
            talon.vertex_counter.value() + 1,
            talon.vertex_counter.value() + 2
        ]
        strand = "-"
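        # The read overlaps TG3 on the - strand, but its interior positions
        # (950, 700) are not annotated splice sites, so the expected call is a
        # genomic transcript of TG3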

        gene_ID, transcript_ID, gene_novelty, transcript_novelty, start_end_info = \
                             talon.process_remaining_mult_cases(chrom, positions,
                                                                strand, edge_IDs,
                                                                vertex_IDs,
                                                                transcript_dict,
                                                                gene_starts, gene_ends,
                                                                edge_dict, location_dict,
                                                                vertex_2_gene, run_info,
                                                                cursor, "temp_gene")
        correct_gene_ID = fetch_correct_ID("TG3", "gene", cursor)
        assert gene_ID == correct_gene_ID
        assert transcript_dict[frozenset(start_end_info["edge_IDs"])] is not None
        assert gene_novelty == []
        assert transcript_novelty[-1][-2] == "genomic_transcript"
        conn.close()
Example #19
    def test_transcript_assigned_intergenic(self):
        """ This test covers a case reported by a user where a read overlaps
            the ~600bp mono-exonic pseudogene HMGB1P1. The read itself has
            2 exons, the second of which contains the small pseudogene inside.
            Earlier versions of TALON classified the read as intergenic,
            when it should have been classified as genomic. """

        # Set up references
        database = "scratch/multiexon_read_overlapping_monoexon_transcript/talon.db"
        conn = sqlite3.connect(database)
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        build = "hg38"
        talon.get_counters(database)
        run_info = talon.init_run_info(database, build)
        struct_collection = talon.prepare_data_structures(cursor, run_info)

        # Use pysam to get the read from the SAM file
        sam_file = "input_files/multiexon_read_overlapping_monoexon_transcript/read.sam"
        with pysam.AlignmentFile(sam_file) as sam:
            for entry in sam:
                sam_record = entry
                break

        # Get read attributes
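        # (pysam: reference_start is 0-based and reference_end points one past
        # the last aligned base)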
        chrom = sam_record.reference_name
        strand = "-" if sam_record.is_reverse else "+"
        sam_start = sam_record.reference_start
        sam_end = sam_record.reference_end

        # Do we get any overlap with the reference gene?
        best_gene, match_strand = talon.search_for_overlap_with_gene(
            chrom, min(sam_start, sam_end), max(sam_start, sam_end), strand,
            cursor, run_info, struct_collection.tmp_gene)
        assert best_gene == 1
        assert match_strand == "-"

        annotation_info = talon.annotate_read(sam_record,
                                              cursor,
                                              run_info,
                                              struct_collection,
                                              mode=0)

        assert annotation_info['gene_ID'] == 1
        assert annotation_info['transcript_ID'] == 2
        assert 'genomic_transcript' in annotation_info['transcript_novelty'][0]
Example #20
    def test_counter_update(self):
        """ Update counters """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"

        talon.get_counters(database)

        # Change the counter values to some arbitrary numbers
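        # (the assertions below assume the toy database starts at 6 genes,
        # 7 transcripts, 31 edges, 34 vertices, 0 datasets and 0 observed)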
        for i in range(10):
            talon.gene_counter.increment()
        for i in range(20):
            talon.transcript_counter.increment()
        for i in range(2):
            talon.edge_counter.increment()
        for i in range(5):
            talon.vertex_counter.increment()
        for i in range(30):
            talon.dataset_counter.increment()
        for i in range(6):
            talon.observed_counter.increment()

        # Now try the update
        talon.update_counter(cursor)

        # Check results with queries
        cursor.execute("""SELECT * FROM counters""")
        for category, value in cursor.fetchall():
            if category == "genes":
                assert value == 16
            elif category == "transcripts":
                assert value == 27
            elif category == "edge":
                assert value == 33
            elif category == "vertex":
                assert value == 39
            elif category == "observed":
                assert value == 6
            elif category == "dataset":
                assert value == 30
            elif category != "genome_build":
                pytest.fail("Unexpected entry in counters table")

        conn.close()
Example #21
    def test_ISM_suffix(self):
        """ Example where the transcript is an ISM with suffix
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        run_info = talon.init_run_info(database, build)
        talon.get_counters(database)

        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start")
        gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

        chrom = "chr1"
        strand = "+"
        positions = [ 500, 600, 900, 1000 ]
        edge_IDs = [4]
        vertex_IDs = [4, 5]
        v_novelty = [0, 0]
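        # Edge 4 (vertices 4-5) is a known TG1 junction; the read covers only
        # the 3' half of the transcript, so a suffix ISM is expected, extending
        # the path to vertices 3-6 / edges 3-5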

        all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)
        gene_ID, transcript_ID, novelty, start_end_info = talon.process_ISM(
            chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
            transcript_dict, gene_starts, gene_ends, edge_dict, location_dict,
            run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor) 

        assert gene_ID == correct_gene_ID
        assert start_end_info["vertex_IDs"] == [3, 4, 5, 6]
        assert start_end_info["edge_IDs"] == [3, 4, 5]
        assert start_end_info["start_novelty"] == 0 # because the exon is known
        assert start_end_info["end_novelty"] == 0
        assert transcript_dict[frozenset(start_end_info["edge_IDs"])] is not None
        conn.close()

    def test_all_known_edges(self):
        """ Example where the toy transcript database contains matches for all
            vertices.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)
        edge_dict = init_refs.make_edge_dict(cursor)
        run_info = talon.init_run_info(database, build)
        conn.close()

        chrom = "chr1"
        vertex_IDs = [2, 3, 4, 5]
        strand = "+"
        edge_IDs, novelty = talon.match_all_splice_edges(
            vertex_IDs, strand, edge_dict, run_info)

        assert edge_IDs == [2, 3, 4]
        assert novelty == [0, 0, 0]
Example #23
    def test_NIC_with_all_known_edges(self):
        """ Test case derived from a real mouse Map2k4 read. All of edges are
            known (except 3'), yet the read is NIC not FSM/ISM """

        database = "scratch/Map2k4.db"
        talon.get_counters(database)
        conn = sqlite3.connect(database)
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        build = "mm10"
        init_refs.make_temp_novel_gene_table(cursor, build)
        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr11"
        strand = "-"
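        # Splice positions of the mm10 Map2k4 read (12 exons on the - strand
        # of chr11)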
        positions = [
            65788254, 65788136, 65775765, 65775733, 65756371, 65756269,
            65735366, 65735192, 65719603, 65719484, 65712297, 65712178,
            65709983, 65709932, 65707111, 65706984, 65696365, 65696288,
            65693570, 65693422, 65691773, 65691728, 65690804, 65689322
        ]

        annotation = talon.identify_transcript(chrom, positions, strand,
                                               cursor, location_dict,
                                               edge_dict, transcript_dict,
                                               vertex_2_gene, gene_starts,
                                               gene_ends, run_info,
                                               "temp_gene")

        assert annotation['gene_ID'] == 1
        assert annotation['transcript_ID'] == 8
        novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert "NIC_transcript" in novelty_types

        conn.close()
Example #24
    def test_partial_match(self):
        """ Example where the transcript overlaps a single-exon transcript,
            but is shorter. In the past, the start would be assigned to the
            annotated start and the end would be novel. This is no longer the
            case: the transcript is now assigned to the annotated match. """

        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)
        init_refs.make_temp_novel_gene_table(cursor, build)
        init_refs.make_temp_monoexonic_transcript_table(cursor, build)
        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
        gene_starts = init_refs.make_gene_start_or_end_dict(cursor, build, "start")
        gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

        chrom = "chr4"
        strand = "-"
        positions = ( 3900, 2900 )
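        # Same start region as TG6-001 (start_delta of 100) but ending roughly
        # 1,900 bp short of the annotated 3' end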

        annotation = talon.identify_monoexon_transcript(chrom, positions,
                                               strand, cursor,
                                               location_dict, edge_dict,
                                               transcript_dict, vertex_2_gene,
                                               gene_starts, gene_ends, run_info,
                                               'temp_gene', 'temp_monoexon')

        correct_gene_ID = fetch_correct_ID("TG6", "gene", cursor)
        correct_transcript_ID = fetch_correct_ID("TG6-001", "transcript", cursor)
        assert annotation['gene_ID'] == correct_gene_ID
        assert annotation['transcript_ID'] == correct_transcript_ID
        assert annotation['start_delta'] == 100
        assert annotation['end_delta'] == -1900

        conn.close()
Example #25
    def test_FSM_start_diff(self):
        """ Example where the transcript is an FSM but has a difference on
            the start large enough to be novel.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        db = "scratch/toy.db"
        talon.get_counters(db)

        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(db, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        orig_vertices = talon.vertex_counter.value()
        gene_starts = init_refs.make_gene_start_or_end_dict(
            cursor, build, "start")
        gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

        chrom = "chr1"
        positions = [2501, 1500, 1000, 900]  # First position is > 500 bp from the annotated start
        strand = "-"
        edge_IDs = [7]
        vertex_IDs = [7, 6]
        v_novelty = [0, 0]
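        # Vertices 7 and 6 are the read's known interior splice sites; the 5'
        # end at 2501 should trigger creation of a new start vertex
        # (orig_vertices + 1)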

        all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)

        gene_ID, transcript_ID, novelty, start_end_info = talon.process_FSM(
            chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
            gene_starts, gene_ends, edge_dict, location_dict, run_info)

        correct_gene_ID = fetch_correct_ID("TG3", "gene", cursor)
        correct_transcript_ID = fetch_correct_ID("TG3-001", "transcript",
                                                 cursor)
        assert gene_ID == correct_gene_ID
        assert transcript_ID == correct_transcript_ID
        assert start_end_info["start_vertex"] == orig_vertices + 1
        assert start_end_info["end_vertex"] == 5
        conn.close()
Example #26
    def test_FSM_perfect(self):
        """ Example where the transcript is a perfect full splice match.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        db = "scratch/toy.db"
        talon.get_counters(db)
        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(db, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        gene_starts = init_refs.make_gene_start_or_end_dict(
            cursor, build, "start")
        gene_ends = init_refs.make_gene_start_or_end_dict(cursor, build, "end")

        chrom = "chr1"
        positions = [1, 100, 500, 600, 900, 1010]
        strand = "+"
        edge_IDs = [2, 3, 4]
        vertex_IDs = [2, 3, 4, 5]
        v_novelty = [0, 0, 0, 0]
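        # Exact junction match to TG1-001; only the 3' end differs from the
        # annotated 1000 (diff_3p of 10)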

        all_matches = talon.search_for_ISM(edge_IDs, transcript_dict)

        gene_ID, transcript_ID, novelty, start_end_info = talon.process_FSM(
            chrom, positions, strand, edge_IDs, vertex_IDs, all_matches,
            gene_starts, gene_ends, edge_dict, location_dict, run_info)

        correct_gene_ID = fetch_correct_ID("TG1", "gene", cursor)
        correct_transcript_ID = fetch_correct_ID("TG1-001", "transcript",
                                                 cursor)
        assert gene_ID == correct_gene_ID
        assert transcript_ID == correct_transcript_ID
        assert novelty == []
        assert start_end_info["start_vertex"] == 1
        assert start_end_info["end_vertex"] == 6
        assert start_end_info["diff_3p"] == 10
        conn.close()

    def test_all_known_locations(self):
        """ Example where the toy transcript database contains matches for all
            vertices.
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        orig_vertex_count = talon.vertex_counter.value()
        strand = "+"
        conn.close()

        chrom = "chr1"
        pos = [1, 100, 500, 600, 900, 1000]
        vertex_IDs, novelty = talon.match_splice_vertices(
            chrom, pos, strand, location_dict, run_info)

        assert vertex_IDs == [2, 3, 4, 5]
        assert talon.vertex_counter.value() == orig_vertex_count
Example #28
    def test_edge_update(self):
        """ Try to add novel exons and introns. """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)
        edge_dict = init_refs.make_edge_dict(cursor)
        run_info = talon.init_run_info(database, build)
        orig_n_edges = talon.edge_counter.value()

        talon.create_edge(2, 1, "exon", "-", edge_dict)
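        # The new exon should receive edge ID orig_n_edges + 1; it is written
        # out below and inserted with batch_add_edges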

        # Write to file
        os.system("mkdir -p scratch/db_updates/")
        with open("scratch/db_updates/edges.tsv", 'w') as f:
            for edge in list(edge_dict.values()):
                if type(edge) is dict:
                    entry = "\t".join([
                        str(x) for x in [
                            edge['edge_ID'], edge['v1'], edge['v2'],
                            edge['edge_type'], edge['strand']
                        ]
                    ])
                    f.write(entry + "\n")

        batch_size = 10
        talon.batch_add_edges(cursor, "scratch/db_updates/edges.tsv",
                              batch_size)

        # Test if the edge table has the correct number of edges now
        query = "SELECT * FROM edge"
        cursor.execute(query)
        edge_IDs = [x['edge_ID'] for x in cursor.fetchall()]
        assert orig_n_edges + 1 in edge_IDs
        assert len(edge_IDs) == orig_n_edges + 1
        conn.close()
Example #29
    def test_spliced_antisense(self):
        """ Example where the transcript matches known vertices but is antisense
        """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)

        init_refs.make_temp_novel_gene_table(cursor, build)
        edge_dict = init_refs.make_edge_dict(cursor)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        transcript_dict = init_refs.make_transcript_dict(cursor, build)
        vertex_2_gene = init_refs.make_vertex_2_gene_dict(cursor)
        gene_starts, gene_ends = init_refs.make_gene_start_and_end_dict(
            cursor, build)

        chrom = "chr2"
        strand = "-"
        positions = [1000, 900, 600, 500, 100, 1]
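        # The exon coordinates mirror an annotated TG2 transcript (deltas of 0)
        # but are read on the - strand, so antisense gene and transcript
        # novelty entries are expected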

        annotation = talon.identify_transcript(chrom, positions, strand,
                                               cursor, location_dict,
                                               edge_dict, transcript_dict,
                                               vertex_2_gene, gene_starts,
                                               gene_ends, run_info,
                                               "temp_gene")

        anti_gene_ID = fetch_correct_ID("TG2", "gene", cursor)
        gene_novelty_types = [x[-2] for x in annotation['gene_novelty']]
        t_novelty_types = [x[-2] for x in annotation['transcript_novelty']]
        assert annotation['gene_novelty'][0][-1] == "TRUE"
        assert "antisense_gene" in gene_novelty_types
        assert "antisense_transcript" in t_novelty_types
        assert annotation['start_delta'] == annotation['end_delta'] == 0
        conn.close()
Example #30
    def test_location_update(self):
        """ Update locations """
        conn, cursor = get_db_cursor()
        build = "toy_build"
        database = "scratch/toy.db"
        talon.get_counters(database)
        location_dict = init_refs.make_location_dict(build, cursor)
        run_info = talon.init_run_info(database, build)
        orig_n_pos = talon.vertex_counter.value()

        talon.create_vertex("chr4", 2000, location_dict, run_info)

        # Write to file
        os.system("mkdir -p scratch/db_updates/")
        with open("scratch/db_updates/loc.tsv", 'w') as f:
            for chrom_dict in location_dict.values():
                for loc in list(chrom_dict.values()):
                    if type(loc) is dict:
                        entry = ("\t".join([
                            str(x)
                            for x in (loc['location_ID'], loc['genome_build'],
                                      loc['chromosome'], loc['position'])
                        ]))
                        f.write(entry + "\n")

        batch_size = 10
        talon.batch_add_locations(cursor, "scratch/db_updates/loc.tsv",
                                  batch_size)

        # Test if the table has the correct number of locations now
        query = "SELECT * FROM location"
        cursor.execute(query)
        loc_IDs = [x['location_ID'] for x in cursor.fetchall()]
        assert orig_n_pos + 1 in loc_IDs
        assert len(loc_IDs) == orig_n_pos + 1
        conn.close()