Example #1
0
    def test_index(self):
        """Check that ``compare`` writes a loadable index for each annotation file.

        Every bundled annotation is copied into the system temporary
        directory and passed to ``compare`` with ``namespace.index`` set.
        The test then checks that the log and the ``.midx`` file exist, that
        the index is not empty, and that ``load_index`` returns two
        dictionaries with 38 genes in the first one.

        The temporary files are removed in a ``finally`` clause: without it a
        failing assertion would leave the log and the index behind, and the
        existence checks of the following sub-test would pass on stale files.
        """

        # Create the list of files
        files = ["trinity.gtf",
                 "trinity.gff3",
                 "trinity.cDNA_match.gff3",
                 "trinity.match_matchpart.gff3"]

        namespace = Namespace(default=False)
        namespace.distance = 2000
        namespace.index = True
        namespace.prediction = None
        namespace.log = os.path.join(tempfile.gettempdir(), "index.log")
        logger = create_null_logger("null")

        for ref in files:
            with self.subTest(ref=ref):
                temp_ref = os.path.join(tempfile.gettempdir(), ref)
                # Every path listed here is deleted when the sub-test ends,
                # whether or not its assertions passed.
                leftovers = [temp_ref, namespace.log]
                try:
                    with pkg_resources.resource_stream("Mikado.tests", ref) as ref_handle,\
                            open(temp_ref, "wb") as out_handle:
                        out_handle.write(ref_handle.read())
                    namespace.reference = to_gff(temp_ref)
                    index_name = "{}.midx".format(namespace.reference.name)
                    leftovers.extend([namespace.reference.name, index_name])
                    compare(namespace)

                    self.assertTrue(os.path.exists(namespace.log))
                    self.assertTrue(os.path.exists(index_name))
                    self.assertGreater(os.stat(index_name).st_size, 0)
                    genes, positions = load_index(namespace, logger)
                    self.assertIsInstance(genes, dict)
                    self.assertIsInstance(positions, dict)
                    self.assertEqual(len(genes), 38)
                finally:
                    for path in leftovers:
                        # The same path may be listed twice; skip what is already gone.
                        if os.path.exists(path):
                            os.remove(path)
class DrosoTester(unittest.TestCase):
    """Tests on a reference transcript (FBtr0329895) and a StringTie prediction."""

    logger = create_null_logger("droso")

    def setUp(self):
        """Parse the inline GTF records into ``self.ref`` and ``self.pred``."""

        ref_gtf = """2L\tprotein_coding\ttranscript\t523736\t540560\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "1"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:5"; gene_biotype "protein_coding";
2L\tprotein_coding\texon\t523736\t524059\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "1"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:5"; gene_biotype "protein_coding";
2L\tprotein_coding\texon\t525392\t525436\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "2"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:677"; gene_biotype "protein_coding";
2L\tprotein_coding\texon\t536023\t536966\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "3"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:7"; gene_biotype "protein_coding";
2L\tprotein_coding\texon\t537037\t537431\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "4"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:8"; gene_biotype "protein_coding";
2L\tprotein_coding\texon\t537549\t537749\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "5"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:9"; gene_biotype "protein_coding";
2L\tprotein_coding\texon\t537863\t539249\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "6"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:10"; gene_biotype "protein_coding";
2L\tprotein_coding\texon\t539310\t539452\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "7"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:11"; gene_biotype "protein_coding";
2L\tprotein_coding\texon\t539518\t540560\t.\t+\t.\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "8"; gene_name "ush"; transcript_name "ush-RC"; exon_id "FBgn0003963:13"; gene_biotype "protein_coding";
2L\tprotein_coding\tCDS\t524038\t524059\t.\t+\t0\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "1"; gene_name "ush"; transcript_name "ush-RC"; protein_id "FBpp0302929"; gene_biotype "protein_coding";
2L\tprotein_coding\tCDS\t525392\t525436\t.\t+\t2\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "2"; gene_name "ush"; transcript_name "ush-RC"; protein_id "FBpp0302929"; gene_biotype "protein_coding";
2L\tprotein_coding\tCDS\t536023\t536966\t.\t+\t2\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "3"; gene_name "ush"; transcript_name "ush-RC"; protein_id "FBpp0302929"; gene_biotype "protein_coding";
2L\tprotein_coding\tCDS\t537037\t537431\t.\t+\t0\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "4"; gene_name "ush"; transcript_name "ush-RC"; protein_id "FBpp0302929"; gene_biotype "protein_coding";
2L\tprotein_coding\tCDS\t537549\t537749\t.\t+\t1\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "5"; gene_name "ush"; transcript_name "ush-RC"; protein_id "FBpp0302929"; gene_biotype "protein_coding";
2L\tprotein_coding\tCDS\t537863\t539249\t.\t+\t1\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "6"; gene_name "ush"; transcript_name "ush-RC"; protein_id "FBpp0302929"; gene_biotype "protein_coding";
2L\tprotein_coding\tCDS\t539310\t539452\t.\t+\t0\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "7"; gene_name "ush"; transcript_name "ush-RC"; protein_id "FBpp0302929"; gene_biotype "protein_coding";
2L\tprotein_coding\tCDS\t539518\t540016\t.\t+\t1\tgene_id "FBgn0003963"; transcript_id "FBtr0329895"; exon_number "8"; gene_name "ush"; transcript_name "ush-RC"; protein_id "FBpp0302929"; gene_biotype "protein_coding";
"""

        pred_gtf = """2L\tStringTie\ttranscript\t476445\t479670\t1000\t-\t.\tgene_id "Stringtie.63"; transcript_id "Stringtie.63.1"; cov "141.769424"; FPKM "inf";
2L\tStringTie\texon\t476445\t478204\t1000\t-\t.\tgene_id "Stringtie.63"; transcript_id "Stringtie.63.1"; exon_number "1"; cov "149.294586";
2L\tStringTie\texon\t479407\t479670\t1000\t-\t.\tgene_id "Stringtie.63"; transcript_id "Stringtie.63.1"; exon_number "2"; cov "91.601692";"""

        # Reference: the first record is the transcript line, the others are
        # its exon/CDS features. Empty lines are dropped before parsing.
        ref_header, *ref_features = [
            parsers.GTF.GtfLine(row) for row in ref_gtf.split("\n") if row != ''
        ]
        reference = loci.Transcript(ref_header)
        reference.logger = self.logger
        for feature in ref_features:
            reference.add_exon(feature)
        reference.finalize()
        self.ref = reference

        # Prediction: same procedure, without attaching the class logger.
        pred_header, *pred_features = [
            parsers.GTF.GtfLine(row) for row in pred_gtf.split("\n") if row != ''
        ]
        prediction = loci.Transcript(pred_header)
        for feature in pred_features:
            prediction.add_exon(feature)
        prediction.finalize()
        self.pred = prediction

    def test_code(self):
        """The reference must have a CDS and seven CDS introns."""

        reference = self.ref
        reference.finalize()
        self.assertGreater(len(reference.combined_cds), 0)
        selected_count = len(reference.selected_cds_introns)
        self.assertEqual(selected_count, 7)
        combined_count = len(reference.combined_cds_introns)
        self.assertEqual(combined_count, 7)
Example #3
0
    def test_locus(self):
        """Walk a two-transcript Superlocus through subloci, monosubloci and loci."""

        null_logger = create_null_logger("null")
        null_logger.setLevel("WARNING")
        null_logger.info("Started")

        first, second = self.transcript1, self.transcript2
        superlocus = Superlocus(first, json_conf=self.my_json, logger=null_logger)
        superlocus.add_transcript_to_locus(second)

        # The superlocus must span both transcripts and keep the strand of the first.
        self.assertEqual(superlocus.strand, first.strand)
        self.assertEqual(superlocus.start, min(first.start, second.start))
        self.assertEqual(superlocus.end, max(first.end, second.end))
        null_logger.info(superlocus.transcripts)

        superlocus.define_subloci()
        null_logger.info(superlocus.subloci)
        null_logger.info(superlocus.transcripts)
        self.assertEqual(len(superlocus.transcripts), 2)
        self.assertEqual(len(superlocus.subloci), 2)

        superlocus.define_monosubloci()
        self.assertEqual(len(superlocus.monosubloci), 2)

        superlocus.define_loci()
        self.assertEqual(len(superlocus.loci), 1)
        # The only locus left must list "t0" as its first transcript.
        locus_id = list(superlocus.loci.keys())[0]
        first_transcript_id = list(superlocus.loci[locus_id].transcripts.keys())[0]
        self.assertEqual(first_transcript_id, "t0")

        # A mono-exonic transcript on the minus strand yields one locus of its own.
        minus_gff = ("Chr1\tfoo\ttranscript\t101\t200\t.\t-\t.\tID=tminus0\n"
                     "Chr1\tfoo\texon\t101\t200\t.\t-\t.\tID=tminus0:exon1;Parent=tminus0")
        minus_header, *minus_exons = [GFF.GffLine(row) for row in minus_gff.split("\n")]
        minus_transcript = Transcript(minus_header)
        for minus_exon in minus_exons:
            minus_transcript.add_exon(minus_exon)
        minus_transcript.finalize()

        minus_superlocus = Superlocus(minus_transcript, json_conf=self.my_json)
        minus_superlocus.define_loci()
        self.assertEqual(len(minus_superlocus.loci), 1)
        self.assertTrue(minus_transcript.strand != first.strand)
Example #4
0
    def test_index(self):
        """Index each bundled annotation with ``compare`` and load the result back.

        For every file the test copies the packaged resource into the
        temporary directory, runs ``compare`` with ``namespace.index`` set,
        and checks that a log and a non-empty ``.midx`` file were written and
        that ``load_index`` returns two dictionaries, the first with 38
        entries.

        Temporary files are deleted in a ``finally`` clause so that a failed
        assertion cannot leave a stale log or index for the next sub-test,
        whose existence checks would otherwise succeed on leftovers.
        """

        # Create the list of files
        files = [
            "trinity.gtf", "trinity.gff3", "trinity.cDNA_match.gff3",
            "trinity.match_matchpart.gff3"
        ]

        namespace = Namespace(default=False)
        namespace.distance = 2000
        namespace.index = True
        namespace.prediction = None
        namespace.log = os.path.join(tempfile.gettempdir(), "index.log")
        logger = create_null_logger("null")

        for ref in files:
            with self.subTest(ref=ref):
                temp_ref = os.path.join(tempfile.gettempdir(), ref)
                # Paths to delete once the sub-test is over, pass or fail.
                to_delete = [temp_ref, namespace.log]
                try:
                    with pkg_resources.resource_stream("Mikado.tests", ref) as ref_handle,\
                            open(temp_ref, "wb") as out_handle:
                        out_handle.write(ref_handle.read())
                    namespace.reference = to_gff(temp_ref)
                    midx_path = "{}.midx".format(namespace.reference.name)
                    to_delete.extend([namespace.reference.name, midx_path])
                    compare(namespace)

                    self.assertTrue(os.path.exists(namespace.log))
                    self.assertTrue(os.path.exists(midx_path))
                    self.assertGreater(os.stat(midx_path).st_size, 0)
                    genes, positions = load_index(namespace, logger)
                    self.assertIsInstance(genes, dict)
                    self.assertIsInstance(positions, dict)
                    self.assertEqual(len(genes), 38)
                finally:
                    for path in to_delete:
                        # A path can appear twice; only remove what still exists.
                        if os.path.exists(path):
                            os.remove(path)
    def testDoubleOrf(self):
        """Test to verify the introduction of multiple ORFs.

        Three ORFs are offered to the CDS-stripped transcript. The second
        overlaps the first and the third overlaps neither; after loading,
        the transcript is expected to hold two internal ORFs and to split
        into two transcripts.
        """

        self.tr.strip_cds()
        self.tr.finalized = False

        def build_orf(name, thick_start, thick_end):
            """Return a valid single-block transcriptomic ORF spanning the whole cDNA."""
            orf = parsers.bed12.BED12()
            orf.chrom = self.tr.id
            orf.start = 1
            orf.end = self.tr.cdna_length
            orf.name = name
            orf.strand = "+"
            orf.score = 0
            orf.thick_start = thick_start
            orf.thick_end = thick_end
            orf.block_count = 1
            orf.blockSize = self.tr.cdna_length
            orf.block_sizes = [self.tr.cdna_length]
            orf.block_starts = [0]
            orf.rgb = 0
            orf.has_start_codon = True
            orf.has_stop_codon = True
            orf.transcriptomic = True
            self.assertFalse(orf.invalid)
            return orf

        first_orf = build_orf("first", 51, 398)
        # This should not be incorporated
        second_orf = build_orf("second", 201, 410)

        self.assertTrue(
            loci.Transcript.is_overlapping_cds(first_orf, second_orf))

        # This should be added
        third_orf = build_orf("third", 501, 800)

        self.assertFalse(
            loci.Transcript.is_overlapping_cds(first_orf, third_orf))
        self.assertFalse(
            loci.Transcript.is_overlapping_cds(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]

        logger = create_null_logger("null")
        self.tr.logger = logger

        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(
            self.tr.number_internal_orfs, 2,
            (self.tr.cdna_length, self.tr.selected_start_distance_from_tss,
             self.tr.selected_end_distance_from_tes))

        self.assertEqual(self.tr.combined_cds_length, 648)
        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join([str(x) for x in self.tr.internal_orfs]))

        new_transcripts = sorted(self.tr.split_by_cds())

        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].three_utr_length, 0)
        self.assertEqual(new_transcripts[1].five_utr_length, 0)
class TranscriptTesterPositive(unittest.TestCase):
    """Tests on a plus-strand, multi-exon coding transcript (AT2G02380.1) read from GFF lines.

    The statements in the class body run once, when the class is created:
    they turn the whitespace-separated fixture into tab-separated lines and
    parse them, so that ``setUp`` can rebuild the transcript from
    ``tr_gff_lines`` before every test.
    """

    logger = create_null_logger("test_at")

    # Fixture text: columns are separated by runs of spaces here and are
    # converted to tabs by the loop below.
    tr_gff = """Chr2    TAIR10    mRNA    626642    629176    .    +    .    ID=AT2G02380.1;Parent=AT2G02380
Chr2    TAIR10    exon    626642    626780    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    five_prime_UTR    626642    626780    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    exon    626842    626880    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    five_prime_UTR    626842    626877    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    626878    626880    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    626963    627059    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    626963    627059    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    627137    627193    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    627137    627193    .    +    2    Parent=AT2G02380.1
Chr2    TAIR10    exon    627312    627397    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    627312    627397    .    +    2    Parent=AT2G02380.1
Chr2    TAIR10    exon    627488    627559    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    627488    627559    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    627696    627749    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    627696    627749    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    627840    627915    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    627840    627915    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    628044    628105    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    628044    628105    .    +    2    Parent=AT2G02380.1
Chr2    TAIR10    exon    628182    628241    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    628182    628241    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    628465    628676    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    628465    628569    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    three_prime_UTR    628570    628676    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    exon    629070    629176    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    three_prime_UTR    629070    629176    .    +    .    Parent=AT2G02380.1"""

    tr_lines = tr_gff.split("\n")
    # Replace every run of whitespace with one tab, in place, and check that
    # each line ends up with nine columns.
    for pos, line in enumerate(tr_lines):
        tr_lines[pos] = re.sub(r"\s+", r"\t", line)
        assert len(tr_lines[pos].split("\t")) == 9, line.split("\t")

    # Parsed form of the fixture; index 0 is the mRNA line used to create the transcript.
    tr_gff_lines = [parsers.GFF.GffLine(line) for line in tr_lines]

    # None of the parsed lines may be flagged as a header.
    for l in tr_gff_lines:
        assert l.header is False

    def setUp(self):
        """Rebuild the transcript and a matching transcriptomic ORF before each test."""

        header, *features = self.tr_gff_lines
        transcript = loci.Transcript(header)
        for feature in features:
            transcript.add_exon(feature)
        transcript.finalize()
        transcript.logger = self.logger
        self.tr = transcript

        # The ORF covers the whole cDNA; its thick start/end are derived
        # from the distances of the selected CDS from the transcript ends.
        orf = parsers.bed12.BED12()
        orf.chrom = transcript.id
        orf.start = 1
        orf.end = transcript.cdna_length
        orf.name = transcript.id
        orf.strand = "+"
        orf.score = 0
        orf.thick_start = transcript.selected_start_distance_from_tss + 1
        orf.thick_end = transcript.cdna_length - transcript.selected_end_distance_from_tes
        orf.block_count = 1
        orf.blockSize = transcript.cdna_length
        orf.block_starts = 0
        orf.has_start_codon = True
        orf.has_stop_codon = True
        orf.transcriptomic = True
        self.orf = orf

    def test_basics(self):
        """Check chromosome, strand, boundaries and exon list of the transcript."""

        transcript = self.tr
        expected_exons = [
            (626642, 626780), (626842, 626880), (626963, 627059),
            (627137, 627193), (627312, 627397), (627488, 627559),
            (627696, 627749), (627840, 627915), (628044, 628105),
            (628182, 628241), (628465, 628676), (629070, 629176),
        ]

        self.assertEqual(transcript.chrom, "Chr2")
        self.assertEqual(transcript.strand, "+")
        self.assertEqual(transcript.exon_num, 12)
        self.assertEqual(transcript.exon_num, len(transcript.exons))
        self.assertEqual(transcript.start, 626642)
        self.assertEqual(transcript.end, 629176)
        self.assertEqual(transcript.exons, expected_exons, transcript.exons)

    def test_no_exons(self):
        """Empty the exon list, finalize again and expect the full exon structure afterwards."""

        transcript = self.tr
        transcript.finalized = False
        transcript.exons = []
        transcript.finalize()

        expected_exons = [
            (626642, 626780), (626842, 626880), (626963, 627059),
            (627137, 627193), (627312, 627397), (627488, 627559),
            (627696, 627749), (627840, 627915), (628044, 628105),
            (628182, 628241), (628465, 628676), (629070, 629176),
        ]

        self.assertEqual(transcript.chrom, "Chr2")
        self.assertEqual(transcript.strand, "+")
        self.assertEqual(transcript.exon_num, 12)
        self.assertEqual(transcript.exon_num, len(transcript.exons))
        self.assertEqual(transcript.start, 626642)
        self.assertEqual(transcript.end, 629176)
        self.assertEqual(transcript.exons, expected_exons, transcript.exons)

    def test_cds(self):
        """The combined and selected CDS must coincide and match the fixture."""

        transcript = self.tr
        self.assertEqual(transcript.combined_cds, transcript.selected_cds)

        expected_cds = [
            (626878, 626880), (626963, 627059), (627137, 627193),
            (627312, 627397), (627488, 627559), (627696, 627749),
            (627840, 627915), (628044, 628105), (628182, 628241),
            (628465, 628569),
        ]
        self.assertEqual(transcript.combined_cds, expected_cds, transcript.combined_cds)
        self.assertEqual(transcript.selected_cds_start, 626878)
        self.assertEqual(transcript.selected_cds_end, 628569)

    def test_secondary_orf(self):
        """With a single ORF, both "CDS not maximal" metrics must be zero."""

        transcript = self.tr
        self.assertEqual(transcript.cds_not_maximal, 0)
        self.assertEqual(transcript.cds_not_maximal_fraction, 0)

    def test_utr(self):
        """Check the 5' and 3' UTR segments against the fixture coordinates."""

        expected_five_utr = [(626642, 626780), (626842, 626877)]
        self.assertEqual(self.tr.five_utr, expected_five_utr)

        expected_three_utr = [(628570, 628676), (629070, 629176)]
        self.assertEqual(self.tr.three_utr, expected_three_utr)

    def test_introns(self):
        """Check all introns, then the introns of the combined and selected CDS."""

        transcript = self.tr

        all_introns = {
            (626781, 626841), (626881, 626962), (627060, 627136),
            (627194, 627311), (627398, 627487), (627560, 627695),
            (627750, 627839), (627916, 628043), (628106, 628181),
            (628242, 628464), (628677, 629069),
        }
        self.assertEqual(transcript.introns, all_introns, transcript.introns)

        # The first and the last intron are not expected among the CDS introns.
        combined_expected = {
            (626881, 626962), (627060, 627136), (627194, 627311),
            (627398, 627487), (627560, 627695), (627750, 627839),
            (627916, 628043), (628106, 628181), (628242, 628464),
        }
        self.assertEqual(
            transcript.combined_cds_introns,
            combined_expected,
            (sorted(transcript.combined_cds_introns), sorted(combined_expected)),
        )

        selected_expected = {
            (626881, 626962), (627060, 627136), (627194, 627311),
            (627398, 627487), (627560, 627695), (627750, 627839),
            (627916, 628043), (628106, 628181), (628242, 628464),
        }
        self.assertEqual(
            transcript.selected_cds_introns,
            selected_expected,
            transcript.selected_cds_introns,
        )

    def test_utr_metrics(self):
        """Test for UTR exon num, start distance, etc.

        Expected values are written as sums over the fixture coordinates so
        that each term can be traced back to a UTR segment.
        """

        self.assertEqual(self.tr.five_utr_num, 2)
        self.assertEqual(self.tr.three_utr_num, 2)
        self.assertEqual(self.tr.five_utr_num_complete, 1)
        self.assertEqual(self.tr.three_utr_num_complete, 1)

        self.assertEqual(self.tr.five_utr_length,
                         626780 + 1 - 626642 + 626877 + 1 - 626842)
        self.assertEqual(self.tr.three_utr_length,
                         628676 + 1 - 628570 + 629176 + 1 - 629070)

        # On failure, report the value under test (the distance from the TSS),
        # not the unrelated distance from the TES.
        self.assertEqual(self.tr.selected_start_distance_from_tss,
                         626780 + 1 - 626642 + 626878 - 626842,
                         self.tr.selected_start_distance_from_tss)
        self.assertEqual(self.tr.selected_start_distance_from_tss,
                         self.tr.start_distance_from_tss)

        self.assertEqual(self.tr.selected_end_distance_from_tes,
                         628676 - 628569 + 629176 + 1 - 629070,
                         self.tr.selected_end_distance_from_tes)
        self.assertEqual(self.tr.selected_end_distance_from_tes,
                         self.tr.end_distance_from_tes)

        self.assertEqual(self.tr.selected_end_distance_from_junction,
                         628676 - 628569)

    def test_strip_cds(self):
        """Stripping the CDS must be logged at DEBUG level and leave no CDS or UTR."""

        transcript = self.tr
        with self.assertLogs(logger=self.logger, level="DEBUG") as captured:
            transcript.strip_cds()

        expected_record = "DEBUG:{}:Stripping CDS from AT2G02380.1".format(self.logger.name)
        self.assertIn(expected_record, captured.output)

        self.assertEqual(transcript.selected_cds_length, 0)
        self.assertEqual(transcript.three_utr, [])
        self.assertEqual(transcript.five_utr, [])
        self.assertEqual(transcript.selected_cds, [])
        self.assertEqual(transcript.selected_cds_start, None)
        self.assertEqual(transcript.selected_cds_end, None)

    def test_with_no_gff_utr(self):
        """
        Test the creation of the transcript without the UTR lines, verify that everything is still alright
        :return:
        """
        tr_gff = """Chr2    TAIR10    mRNA    626642    629176    .    +    .    ID=AT2G02380.1;Parent=AT2G02380
Chr2    TAIR10    exon    626642    626780    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    exon    626842    626880    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    626878    626880    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    626963    627059    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    626963    627059    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    627137    627193    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    627137    627193    .    +    2    Parent=AT2G02380.1
Chr2    TAIR10    exon    627312    627397    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    627312    627397    .    +    2    Parent=AT2G02380.1
Chr2    TAIR10    exon    627488    627559    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    627488    627559    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    627696    627749    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    627696    627749    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    627840    627915    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    627840    627915    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    628044    628105    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    628044    628105    .    +    2    Parent=AT2G02380.1
Chr2    TAIR10    exon    628182    628241    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    628182    628241    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    628465    628676    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    628465    628569    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    629070    629176    .    +    .    Parent=AT2G02380.1"""

        tr_lines = tr_gff.split("\n")
        logger = create_default_logger("test")
        logger.setLevel("INFO")
        for pos, line in enumerate(tr_lines):
            tr_lines[pos] = re.sub(r"\s+", "\t", line)
            assert len(tr_lines[pos].split("\t")) == 9, line.split("\t")

        tr_gff_lines = [parsers.GFF.GffLine(line) for line in tr_lines]

        transcript = loci.Transcript(tr_gff_lines[0], logger=logger)
        for line in tr_gff_lines[1:]:
            transcript.add_exon(line)

        self.assertEqual(transcript.exons, self.tr.exons)
        self.assertNotEqual([], transcript.combined_cds)
        transcript.finalize()
        self.assertTrue(transcript.is_coding)
        self.assertEqual(transcript.five_utr, self.tr.five_utr)
        self.assertEqual(transcript.three_utr, self.tr.three_utr)

    def test_remove_utr(self):
        """Remove the UTRs and verify that the transcript boundaries now
        coincide with the CDS, that no UTR is left and that the CDS itself
        is unchanged."""

        transcript = self.tr
        transcript.remove_utrs()

        self.assertEqual(transcript.selected_cds_start, transcript.start)
        self.assertEqual(transcript.selected_cds_end, transcript.end)
        self.assertEqual(transcript.three_utr, [])
        self.assertEqual(transcript.five_utr, [])

        expected_cds = [
            (626878, 626880), (626963, 627059), (627137, 627193),
            (627312, 627397), (627488, 627559), (627696, 627749),
            (627840, 627915), (628044, 628105), (628182, 628241),
            (628465, 628569),
        ]
        self.assertEqual(transcript.combined_cds, expected_cds, transcript.combined_cds)
        self.assertEqual(transcript.combined_utr, [], transcript.combined_utr)

    def test_load_orf(self):
        """Strip the CDS, reload it from the ORF built in setUp and compare with the fixture."""

        transcript = self.tr
        with self.assertLogs(logger=self.logger, level="DEBUG") as captured:
            transcript.strip_cds()
            self.assertIn("Stripping CDS", captured.output[0])

        transcript.load_orfs([self.orf])

        expected_cds = [
            (626878, 626880), (626963, 627059), (627137, 627193),
            (627312, 627397), (627488, 627559), (627696, 627749),
            (627840, 627915), (628044, 628105), (628182, 628241),
            (628465, 628569),
        ]
        self.assertEqual(transcript.combined_cds, expected_cds, transcript.combined_cds)
        self.assertEqual(transcript.selected_cds_start, 626878)
        self.assertEqual(transcript.selected_cds_end, 628569)

    def test_negative_orf(self):
        """Load a minus-strand ORF onto the multiexonic transcript.
        The transcript is expected to stay without a selected CDS."""

        transcript, orf = self.tr, self.orf
        orf.strand = "-"
        transcript.strip_cds()
        transcript.load_orfs([orf])
        self.assertEqual(transcript.selected_cds_start, None)

    def test_raises_invalid(self):
        """Finalizing must fail without a strand and with an extra exon at (625878, 625880)."""

        transcript = self.tr
        transcript.finalized = False
        transcript.strand = None

        # The copy is not inspected afterwards; the call is kept as it was.
        snapshot = transcript.deepcopy()
        with self.assertRaises(exceptions.InvalidTranscript):
            transcript.finalize()

        self.assertFalse(transcript.finalized)
        # self.assertTrue(__current is self.tr)

        # With the strand restored the transcript finalizes again.
        transcript.strand = "+"
        transcript.finalize()

        transcript.finalized = False
        transcript.exons += [(625878, 625880)]
        with self.assertRaises(exceptions.InvalidTranscript):
            transcript.finalize()

    def test_complete(self):
        """The transcript must report a stop codon, a start codon and completeness."""

        transcript = self.tr
        self.assertTrue(transcript.has_stop_codon)
        self.assertTrue(transcript.has_start_codon)
        self.assertTrue(transcript.is_complete)

    def test_lengths(self):
        """Check cDNA length, CDS length and the two CDS fractions."""

        transcript = self.tr
        self.assertEqual(transcript.cdna_length, 1061)
        self.assertEqual(transcript.selected_cds_length, 672)

        # Both fractions are compared with CDS length over cDNA length.
        expected_fraction = 672 / 1061
        self.assertAlmostEqual(
            transcript.combined_cds_fraction, expected_fraction, delta=0.01)
        self.assertAlmostEqual(
            transcript.selected_cds_fraction, expected_fraction, delta=0.01)

    def testSegments(self):
        """Check CDS segment counts, the longest intron and the number of ORFs."""

        transcript = self.tr
        self.assertEqual(transcript.combined_cds_num, 10)
        self.assertEqual(transcript.selected_cds_num, 10)
        self.assertEqual(transcript.highest_cds_exon_number, 10)
        self.assertEqual(transcript.max_intron_length, 393)
        self.assertEqual(transcript.number_internal_orfs, 1)

    def testDoubleOrf(self):
        """Test to verify the introduction of multiple ORFs.

        Three ORFs are offered to the CDS-stripped transcript. The second
        overlaps the first and the third overlaps neither; after loading,
        the transcript is expected to hold two internal ORFs and to split
        into two transcripts.
        """

        self.tr.strip_cds()
        self.tr.finalized = False

        def make_orf(name, thick_start, thick_end):
            """Return a valid single-block transcriptomic ORF spanning the whole cDNA."""
            orf = parsers.bed12.BED12()
            orf.chrom = self.tr.id
            orf.start = 1
            orf.end = self.tr.cdna_length
            orf.name = name
            orf.strand = "+"
            orf.score = 0
            orf.thick_start = thick_start
            orf.thick_end = thick_end
            orf.block_count = 1
            orf.blockSize = self.tr.cdna_length
            orf.block_sizes = [self.tr.cdna_length]
            orf.block_starts = [0]
            orf.rgb = 0
            orf.has_start_codon = True
            orf.has_stop_codon = True
            orf.transcriptomic = True
            self.assertFalse(orf.invalid)
            return orf

        first_orf = make_orf("first", 51, 398)
        # This should not be incorporated
        second_orf = make_orf("second", 201, 410)

        self.assertTrue(
            loci.Transcript.is_overlapping_cds(first_orf, second_orf))

        # This should be added
        third_orf = make_orf("third", 501, 800)

        self.assertFalse(
            loci.Transcript.is_overlapping_cds(first_orf, third_orf))
        self.assertFalse(
            loci.Transcript.is_overlapping_cds(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]

        logger = create_null_logger("null")
        self.tr.logger = logger

        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(
            self.tr.number_internal_orfs, 2,
            (self.tr.cdna_length, self.tr.selected_start_distance_from_tss,
             self.tr.selected_end_distance_from_tes))

        self.assertEqual(self.tr.combined_cds_length, 648)
        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join([str(x) for x in self.tr.internal_orfs]))

        new_transcripts = sorted(self.tr.split_by_cds())

        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].three_utr_length, 0)
        self.assertEqual(new_transcripts[1].five_utr_length, 0)
class MonoBaseTester(unittest.TestCase):
    """
    Verify ORF loading, splitting and printing for a three-exon transcript
    that carries multiple ORFs, one of which starts exactly at the terminal
    base of an exon.
    """

    logger = create_null_logger("null")

    def setUp(self):
        """Build the plus-strand transcript and the three transcriptomic ORFs."""
        self.tr = loci.Transcript()
        self.tr.chrom = "Chr5"
        self.tr.start = 22597965
        self.tr.end = 22602701
        self.tr.strand = "+"
        self.tr.score = 1000
        self.tr.parent = "StringTie_DN.70115"
        self.tr.id = "StringTie_DN.70115.4"
        self.tr.source = "StringTie"
        self.tr.feature = "transcript"
        self.tr.add_exons([(22597965, 22601782), (22601862, 22601957),
                           (22602039, 22602701)])

        self.tr.logger = self.logger

        # First ORF: complete (start and stop codon), cDNA positions 434-3736.
        self.bed1 = parsers.bed12.BED12()
        self.bed1.chrom = self.tr.id
        self.bed1.start = 1
        self.bed1.end = 4577
        self.bed1.name = "{0}.1".format(self.tr.id)
        self.bed1.strand = "+"
        self.bed1.score = 0
        self.bed1.thick_start = 434
        self.bed1.thick_end = 3736
        self.bed1.has_start_codon = True
        self.bed1.transcriptomic = True
        self.bed1.has_stop_codon = True
        self.bed1.block_count = 1
        self.bed1.block_sizes = [len(self.bed1)]
        self.bed1.block_starts = [0]

        # Second ORF: copy of the first one, moved to 2-388 and flagged as
        # lacking a start codon.
        self.bed2 = copy.deepcopy(self.bed1)
        self.bed2.name = "{0}.2".format(self.tr.id)
        self.bed2.thick_start = 2
        self.bed2.thick_end = 388
        self.bed2.has_start_codon = False

        # Third ORF: copy of the first one, moved to 3914-4393.
        self.bed3 = copy.deepcopy(self.bed1)
        self.bed3.name = "{0}.3".format(self.tr.id)
        self.bed3.thick_start = 3914
        self.bed3.thick_end = 4393

    def test_finalise(self):
        """Finalising must succeed and expose the expected exon/intron lengths."""
        self.tr.finalize()
        self.assertTrue(self.tr.finalized)

        self.assertEqual(self.tr.max_exon_length, 3818)
        self.assertEqual(self.tr.min_exon_length, 96)
        self.assertEqual(self.tr.max_intron_length, 81, self.tr.introns)
        self.assertEqual(self.tr.min_intron_length, 79, self.tr.introns)

    def test_load_orfs(self):
        """All three ORFs are valid and get loaded; the first one is selected."""
        self.assertFalse(self.bed1.invalid)
        self.assertFalse(self.bed2.invalid)
        self.assertFalse(self.bed3.invalid)
        self.assertEqual(self.bed3.cds_len,
                         self.bed3.thick_end - self.bed3.thick_start + 1)

        self.tr.load_orfs([self.bed1, self.bed2, self.bed3])
        self.assertEqual(self.tr.number_internal_orfs, 3)
        self.assertEqual(self.tr.selected_cds_length, self.bed1.cds_len)

    def test_split(self):
        """Splitting by CDS after loading two ORFs must yield two transcripts."""
        self.tr.load_orfs([self.bed3, self.bed1])
        splitted_transcripts = list(self.tr.split_by_cds())
        self.assertEqual(len(splitted_transcripts), 2)

    def test_print(self):
        """Check the GFF3 and GTF renderings of the transcript without any ORF."""
        self.tr.logger = self.logger
        self.tr.finalize()
        self.maxDiff = None

        real_printed = """Chr5\tStringTie\ttranscript\t22597965\t22602701\t1000\t+\t.\tID=StringTie_DN.70115.4;Parent=StringTie_DN.70115
Chr5\tStringTie\texon\t22597965\t22601782\t.\t+\t.\tID=StringTie_DN.70115.4.exon1;Parent=StringTie_DN.70115.4
Chr5\tStringTie\texon\t22601862\t22601957\t.\t+\t.\tID=StringTie_DN.70115.4.exon2;Parent=StringTie_DN.70115.4
Chr5\tStringTie\texon\t22602039\t22602701\t.\t+\t.\tID=StringTie_DN.70115.4.exon3;Parent=StringTie_DN.70115.4"""

        self.assertEqual(str(self.tr.format("gff3")), real_printed)

        real_printed_gtf = """Chr5\tStringTie\ttranscript\t22597965\t22602701\t1000\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22597965\t22601782\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22601862\t22601957\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22602039\t22602701\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";"""

        self.assertEqual(self.tr.__str__(to_gtf=True), real_printed_gtf)

    def test_print_cds(self):
        """Check the GFF3 and GTF renderings after loading the first ORF only."""
        import itertools

        # ORF loaded here (cDNA coordinates, see setUp): 434-3736.
        self.tr.load_orfs([self.bed1])
        self.maxDiff = None

        real_printed = """Chr5\tStringTie\tmRNA\t22597965\t22602701\t1000\t+\t.\tID=StringTie_DN.70115.4;Parent=StringTie_DN.70115;Name=StringTie_DN.70115.4
Chr5\tStringTie\texon\t22597965\t22601782\t.\t+\t.\tID=StringTie_DN.70115.4.exon1;Parent=StringTie_DN.70115.4
Chr5\tStringTie\tfive_prime_UTR\t22597965\t22598397\t.\t+\t.\tID=StringTie_DN.70115.4.five_prime_UTR1;Parent=StringTie_DN.70115.4
Chr5\tStringTie\tCDS\t22598398\t22601700\t.\t+\t0\tID=StringTie_DN.70115.4.CDS1;Parent=StringTie_DN.70115.4
Chr5\tStringTie\tthree_prime_UTR\t22601701\t22601782\t.\t+\t.\tID=StringTie_DN.70115.4.three_prime_UTR1;Parent=StringTie_DN.70115.4
Chr5\tStringTie\texon\t22601862\t22601957\t.\t+\t.\tID=StringTie_DN.70115.4.exon2;Parent=StringTie_DN.70115.4
Chr5\tStringTie\tthree_prime_UTR\t22601862\t22601957\t.\t+\t.\tID=StringTie_DN.70115.4.three_prime_UTR2;Parent=StringTie_DN.70115.4
Chr5\tStringTie\texon\t22602039\t22602701\t.\t+\t.\tID=StringTie_DN.70115.4.exon3;Parent=StringTie_DN.70115.4
Chr5\tStringTie\tthree_prime_UTR\t22602039\t22602701\t.\t+\t.\tID=StringTie_DN.70115.4.three_prime_UTR3;Parent=StringTie_DN.70115.4"""

        self.assertEqual(str(self.tr), real_printed)

        real_printed_gtf = """Chr5\tStringTie\tmRNA\t22597965\t22602701\t1000\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4"; Name "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22597965\t22601782\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\t5UTR\t22597965\t22598397\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\tCDS\t22598398\t22601700\t.\t+\t0\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\t3UTR\t22601701\t22601782\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22601862\t22601957\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\t3UTR\t22601862\t22601957\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22602039\t22602701\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\t3UTR\t22602039\t22602701\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";"""

        printed_lines = self.tr.__str__(to_gtf=True).split("\n")
        expected_lines = real_printed_gtf.split("\n")
        # Compare line by line so a failure points at the offending record;
        # zip_longest pads the shorter side with None, so a missing or an
        # extra line fails the comparison instead of being silently dropped.
        for printed, expected in itertools.zip_longest(printed_lines,
                                                       expected_lines):
            self.assertEqual(printed, expected)

    def test_print_multiple_orfs(self):
        """Check the GFF rendering of every ORF when two ORFs are loaded."""
        self.maxDiff = None
        # ORFs loaded here (cDNA coordinates, see setUp):
        # bed1 434-3736, bed3 3914-4393.
        self.tr.load_orfs([self.bed1, self.bed3])

        real_printed = """Chr5\tStringTie\tmRNA\t22597965\t22602701\t1000\t+\t.\tID=StringTie_DN.70115.4.orf1;Parent=StringTie_DN.70115;Name=StringTie_DN.70115.4;maximal=True
Chr5\tStringTie\texon\t22597965\t22601782\t.\t+\t.\tID=StringTie_DN.70115.4.orf1.exon1;Parent=StringTie_DN.70115.4.orf1
Chr5\tStringTie\tfive_prime_UTR\t22597965\t22598397\t.\t+\t.\tID=StringTie_DN.70115.4.orf1.five_prime_UTR1;Parent=StringTie_DN.70115.4.orf1
Chr5\tStringTie\tCDS\t22598398\t22601700\t.\t+\t0\tID=StringTie_DN.70115.4.orf1.CDS1;Parent=StringTie_DN.70115.4.orf1
Chr5\tStringTie\tthree_prime_UTR\t22601701\t22601782\t.\t+\t.\tID=StringTie_DN.70115.4.orf1.three_prime_UTR1;Parent=StringTie_DN.70115.4.orf1
Chr5\tStringTie\texon\t22601862\t22601957\t.\t+\t.\tID=StringTie_DN.70115.4.orf1.exon2;Parent=StringTie_DN.70115.4.orf1
Chr5\tStringTie\tthree_prime_UTR\t22601862\t22601957\t.\t+\t.\tID=StringTie_DN.70115.4.orf1.three_prime_UTR2;Parent=StringTie_DN.70115.4.orf1
Chr5\tStringTie\texon\t22602039\t22602701\t.\t+\t.\tID=StringTie_DN.70115.4.orf1.exon3;Parent=StringTie_DN.70115.4.orf1
Chr5\tStringTie\tthree_prime_UTR\t22602039\t22602701\t.\t+\t.\tID=StringTie_DN.70115.4.orf1.three_prime_UTR3;Parent=StringTie_DN.70115.4.orf1
Chr5\tStringTie\tmRNA\t22597965\t22602701\t1000\t+\t.\tID=StringTie_DN.70115.4.orf2;Parent=StringTie_DN.70115;Name=StringTie_DN.70115.4;maximal=False
Chr5\tStringTie\texon\t22597965\t22601782\t.\t+\t.\tID=StringTie_DN.70115.4.orf2.exon1;Parent=StringTie_DN.70115.4.orf2
Chr5\tStringTie\tfive_prime_UTR\t22597965\t22601782\t.\t+\t.\tID=StringTie_DN.70115.4.orf2.five_prime_UTR1;Parent=StringTie_DN.70115.4.orf2
Chr5\tStringTie\texon\t22601862\t22601957\t.\t+\t.\tID=StringTie_DN.70115.4.orf2.exon2;Parent=StringTie_DN.70115.4.orf2
Chr5\tStringTie\tfive_prime_UTR\t22601862\t22601956\t.\t+\t.\tID=StringTie_DN.70115.4.orf2.five_prime_UTR2;Parent=StringTie_DN.70115.4.orf2
Chr5\tStringTie\tCDS\t22601957\t22601957\t.\t+\t0\tID=StringTie_DN.70115.4.orf2.CDS1;Parent=StringTie_DN.70115.4.orf2
Chr5\tStringTie\tCDS\t22602039\t22602517\t.\t+\t2\tID=StringTie_DN.70115.4.orf2.CDS2;Parent=StringTie_DN.70115.4.orf2
Chr5\tStringTie\texon\t22602039\t22602701\t.\t+\t.\tID=StringTie_DN.70115.4.orf2.exon3;Parent=StringTie_DN.70115.4.orf2
Chr5\tStringTie\tthree_prime_UTR\t22602518\t22602701\t.\t+\t.\tID=StringTie_DN.70115.4.orf2.three_prime_UTR1;Parent=StringTie_DN.70115.4.orf2"""

        self.assertEqual(self.tr.format("gff", all_orfs=True), real_printed)

    def test_print_without_cds(self):
        """With with_cds=False only transcript and exon lines must be printed."""
        self.maxDiff = None
        self.tr.load_orfs([self.bed1, self.bed3])
        real_printed = """Chr5\tStringTie\tmRNA\t22597965\t22602701\t1000\t+\t.\tID=StringTie_DN.70115.4;Parent=StringTie_DN.70115
Chr5\tStringTie\texon\t22597965\t22601782\t.\t+\t.\tID=StringTie_DN.70115.4.exon1;Parent=StringTie_DN.70115.4
Chr5\tStringTie\texon\t22601862\t22601957\t.\t+\t.\tID=StringTie_DN.70115.4.exon2;Parent=StringTie_DN.70115.4
Chr5\tStringTie\texon\t22602039\t22602701\t.\t+\t.\tID=StringTie_DN.70115.4.exon3;Parent=StringTie_DN.70115.4"""

        self.assertEqual(self.tr.format("gff3", with_cds=False), real_printed)

        real_printed_gtf = """Chr5\tStringTie\tmRNA\t22597965\t22602701\t1000\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22597965\t22601782\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22601862\t22601957\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";
Chr5\tStringTie\texon\t22602039\t22602701\t.\t+\t.\tgene_id "StringTie_DN.70115"; transcript_id "StringTie_DN.70115.4";"""

        self.assertEqual(self.tr.format("gtf", with_cds=False),
                         real_printed_gtf)
class TranscriptTesterNegative(unittest.TestCase):
    """
    Tests for a ten-exon, minus-strand coding transcript (AT1G01020.1)
    built from the GFF3 lines defined at class level.
    """

    logger = create_null_logger("null")
    logger.setLevel(logging.WARNING)

    tr_gff = """Chr1    TAIR10    mRNA    5928    8737    .    -    .    ID=AT1G01020.1;Parent=AT1G01020
Chr1    TAIR10    five_prime_UTR    8667    8737    .    -    .    Parent=AT1G01020.1
Chr1    TAIR10    CDS    8571    8666    .    -    0    Parent=AT1G01020.1;
Chr1    TAIR10    exon    8571    8737    .    -    .    Parent=AT1G01020.1
Chr1    TAIR10    CDS    8417    8464    .    -    0    Parent=AT1G01020.1;
Chr1    TAIR10    exon    8417    8464    .    -    .    Parent=AT1G01020.1
Chr1    TAIR10    CDS    8236    8325    .    -    0    Parent=AT1G01020.1;
Chr1    TAIR10    exon    8236    8325    .    -    .    Parent=AT1G01020.1
Chr1    TAIR10    CDS    7942    7987    .    -    0    Parent=AT1G01020.1;
Chr1    TAIR10    exon    7942    7987    .    -    .    Parent=AT1G01020.1
Chr1    TAIR10    CDS    7762    7835    .    -    2    Parent=AT1G01020.1;
Chr1    TAIR10    exon    7762    7835    .    -    .    Parent=AT1G01020.1
Chr1    TAIR10    CDS    7564    7649    .    -    0    Parent=AT1G01020.1;
Chr1    TAIR10    exon    7564    7649    .    -    .    Parent=AT1G01020.1
Chr1    TAIR10    CDS    7384    7450    .    -    1    Parent=AT1G01020.1;
Chr1    TAIR10    exon    7384    7450    .    -    .    Parent=AT1G01020.1
Chr1    TAIR10    CDS    7157    7232    .    -    0    Parent=AT1G01020.1;
Chr1    TAIR10    exon    7157    7232    .    -    .    Parent=AT1G01020.1
Chr1    TAIR10    CDS    6915    7069    .    -    2    Parent=AT1G01020.1;
Chr1    TAIR10    three_prime_UTR    6437    6914    .    -    .    Parent=AT1G01020.1
Chr1    TAIR10    exon    6437    7069    .    -    .    Parent=AT1G01020.1
Chr1    TAIR10    three_prime_UTR    5928    6263    .    -    .    Parent=AT1G01020.1
Chr1    TAIR10    exon    5928    6263    .    -    .    Parent=AT1G01020.1"""

    # Turn the space-aligned sample above into real tab-separated GFF3 lines.
    tr_lines = [line for line in tr_gff.split("\n") if line]
    for pos, line in enumerate(tr_lines):
        # Raw string: "\s" inside a plain literal is an invalid escape sequence.
        tr_lines[pos] = re.sub(r"\s+", "\t", line)
        assert len(tr_lines[pos].split("\t")) == 9, line.split("\t")

    tr_gff_lines = [Mikado.parsers.GFF.GffLine(line) for line in tr_lines]

    for l in tr_gff_lines:
        assert l.header is False

    def _make_orf(self, name, thick_start, thick_end, start=1, score=0):
        """
        Build a plus-strand, transcriptomic BED12 ORF on the cDNA of self.tr.

        :param name: value for the "name" field of the ORF.
        :param thick_start: first CDS position, in cDNA coordinates.
        :param thick_end: last CDS position, in cDNA coordinates.
        :param start: value for the "start" field of the BED12 record.
        :param score: value for the "score" field of the BED12 record.
        :return: the BED12 object, flagged as having start and stop codons.
        """

        cdna_length = self.tr.cdna_length
        orf = Mikado.parsers.bed12.BED12()
        orf.chrom = self.tr.id
        orf.start = start
        orf.end = cdna_length
        orf.name = name
        orf.strand = "+"
        orf.score = score
        orf.thick_start = thick_start
        orf.thick_end = thick_end
        orf.block_count = 1
        orf.blockSize = cdna_length
        orf.block_starts = 0
        orf.has_start_codon = True
        orf.has_stop_codon = True
        orf.transcriptomic = True
        return orf

    def setUp(self):
        """Build the transcript from the GFF3 lines and an ORF matching its CDS."""

        self.tr = Mikado.loci.Transcript(self.tr_gff_lines[0], logger=self.logger)
        for line in self.tr_gff_lines[1:]:
            self.tr.add_exon(line)
        self.tr.name = self.tr.id
        self.tr.finalize()
        self.tr.logger = self.logger

        # ORF spanning the annotated CDS, expressed in cDNA coordinates.
        self.orf = self._make_orf(
            self.tr.id,
            self.tr.selected_start_distance_from_tss + 1,
            self.tr.cdna_length - self.tr.selected_end_distance_from_tes)
        self.assertFalse(self.orf.invalid)
        self.assertEqual(len(self.tr), self.tr.end - self.tr.start + 1)

    def test_print(self):
        """Check the default string rendering (GFF3 with CDS and UTR features)."""

        self.maxDiff = None
        real_printed = """Chr1\tTAIR10\tmRNA\t5928\t8737\t.\t-\t.\tID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1
Chr1\tTAIR10\texon\t5928\t6263\t.\t-\t.\tID=AT1G01020.1.exon1;Parent=AT1G01020.1
Chr1\tTAIR10\tthree_prime_UTR\t5928\t6263\t.\t-\t.\tID=AT1G01020.1.three_prime_UTR1;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t6437\t7069\t.\t-\t.\tID=AT1G01020.1.exon2;Parent=AT1G01020.1
Chr1\tTAIR10\tthree_prime_UTR\t6437\t6914\t.\t-\t.\tID=AT1G01020.1.three_prime_UTR2;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t6915\t7069\t.\t-\t2\tID=AT1G01020.1.CDS1;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t7157\t7232\t.\t-\t0\tID=AT1G01020.1.CDS2;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7157\t7232\t.\t-\t.\tID=AT1G01020.1.exon3;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t7384\t7450\t.\t-\t1\tID=AT1G01020.1.CDS3;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7384\t7450\t.\t-\t.\tID=AT1G01020.1.exon4;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t7564\t7649\t.\t-\t0\tID=AT1G01020.1.CDS4;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7564\t7649\t.\t-\t.\tID=AT1G01020.1.exon5;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t7762\t7835\t.\t-\t2\tID=AT1G01020.1.CDS5;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7762\t7835\t.\t-\t.\tID=AT1G01020.1.exon6;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t7942\t7987\t.\t-\t0\tID=AT1G01020.1.CDS6;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7942\t7987\t.\t-\t.\tID=AT1G01020.1.exon7;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t8236\t8325\t.\t-\t0\tID=AT1G01020.1.CDS7;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t8236\t8325\t.\t-\t.\tID=AT1G01020.1.exon8;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t8417\t8464\t.\t-\t0\tID=AT1G01020.1.CDS8;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t8417\t8464\t.\t-\t.\tID=AT1G01020.1.exon9;Parent=AT1G01020.1
Chr1\tTAIR10\tCDS\t8571\t8666\t.\t-\t0\tID=AT1G01020.1.CDS9;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t8571\t8737\t.\t-\t.\tID=AT1G01020.1.exon10;Parent=AT1G01020.1
Chr1\tTAIR10\tfive_prime_UTR\t8667\t8737\t.\t-\t.\tID=AT1G01020.1.five_prime_UTR1;Parent=AT1G01020.1"""

        printed = str(self.tr)
        expected_lines = set(real_printed.split("\n"))
        found_lines = set(printed.split("\n"))

        # Failure message: lines only in the expectation, a separator, then
        # lines only in the actual output.
        diff = "\n====\n".join(["\n".join(sorted(expected_lines - found_lines)),
                               "\n".join(sorted(found_lines - expected_lines))])

        self.assertEqual(real_printed,
                         printed,
                         diff)

    def test_empty(self):

        """
        Empty the exon list and re-finalize the transcript: the ten exons must
        be inferred back (presumably from the CDS and UTR segments).
        :return:
        """

        self.tr.exons = []
        self.tr.finalized = False
        self.tr.finalize()
        self.assertEqual(self.tr.strand, "-")
        self.assertEqual(self.tr.number_internal_orfs, 1)
        self.assertEqual(self.tr.exon_num, 10)
        self.assertEqual(self.tr.exon_num, len(self.tr.exons))
        self.assertEqual(self.tr.start, 5928)
        self.assertEqual(self.tr.end, 8737)
        exons = [(5928, 6263), (6437, 7069), (7157, 7232),
                 (7384, 7450), (7564, 7649), (7762, 7835),
                 (7942, 7987), (8236, 8325), (8417, 8464), (8571, 8737)]
        self.assertEqual(self.tr.exons,
                         exons,
                         self.tr.exons)

    def test_invalid_utr(self):

        """
        Test that a transcript with UTR but no CDS defined will raise an exception.
        :return:
        """

        self.tr.combined_cds = []
        self.tr.finalized = False
        self.assertRaises(Mikado.exceptions.InvalidTranscript, self.tr.finalize)

    def test_basics(self):

        """
        Test basic assertions about the transcript:

        - chromosome (.chrom) should be Chr1
        - strand should be -
        - number of internal orfs should be 1
        - number of exons should be 10
        - the metric "exon_num" should be 10 as well
        - start should be 5928 (1-based offset)
        - end should be 8737
        - the exons should correspond to those in the original strings,
          compared here as a list of (start, end) tuples

        :return:
        """

        self.assertEqual(self.tr.chrom, "Chr1")
        self.assertEqual(self.tr.strand, "-")
        self.assertEqual(self.tr.number_internal_orfs, 1)
        self.assertEqual(self.tr.exon_num, 10)
        self.assertEqual(self.tr.exon_num, len(self.tr.exons))
        self.assertEqual(self.tr.start, 5928)
        self.assertEqual(self.tr.end, 8737)
        exons = [(5928, 6263), (6437, 7069), (7157, 7232),
                 (7384, 7450), (7564, 7649), (7762, 7835),
                 (7942, 7987), (8236, 8325), (8417, 8464), (8571, 8737)]
        self.assertEqual(self.tr.exons,
                         exons,
                         self.tr.exons)

    def test_cds(self):
        """Check the CDS segments and the (strand-aware) CDS start and end."""
        self.assertEqual(sorted(self.tr.combined_cds),
                         sorted(self.tr.selected_cds))
        cds = [(6915, 7069), (7157, 7232), (7384, 7450), (7564, 7649), (7762, 7835), (7942, 7987),
               (8236, 8325), (8417, 8464), (8571, 8666)]

        self.assertEqual(self.tr.combined_cds,
                         cds,
                         self.tr.combined_cds)
        # Minus strand: the CDS starts at the highest genomic coordinate.
        self.assertEqual(self.tr.selected_cds_start, 8666)
        self.assertEqual(self.tr.selected_cds_end, 6915)

    def test_utr(self):
        """Check the 5' and 3' UTR segments."""
        self.assertEqual(self.tr.five_utr, [(8667, 8737)])
        self.assertEqual(self.tr.three_utr, [(5928, 6263),
                                             (6437, 6914)])

    def test_utr_metrics(self):

        """Test for UTR exon num, start distance, etc."""

        self.assertEqual(self.tr.five_utr_num, 1)
        self.assertEqual(self.tr.five_utr_num_complete, 0)
        self.assertEqual(self.tr.three_utr_num, 2)
        self.assertEqual(self.tr.three_utr_num_complete, 1)

        self.assertEqual(self.tr.five_utr_length, 8737 + 1 - 8667)
        self.assertEqual(self.tr.three_utr_length, 6263 + 1 - 5928 + 6914 + 1 - 6437)
        # The failure message reports the metric actually under test.
        self.assertEqual(self.tr.selected_start_distance_from_tss,
                         8738 - 8667,
                         self.tr.selected_start_distance_from_tss)
        self.assertEqual(self.tr.selected_end_distance_from_tes,
                         6263 + 1 - 5928 + 6915 - 6437,
                         self.tr.selected_end_distance_from_tes)
        self.assertEqual(self.tr.selected_end_distance_from_junction,
                         6915 - 6437,
                         self.tr.selected_cds_end)
        self.assertEqual(self.tr.end_distance_from_junction,
                         self.tr.selected_end_distance_from_junction)

    def test_introns(self):
        """Check the introns of the transcript and those inside the CDS."""

        introns = {(8465, 8570), (8326, 8416), (7988, 8235),
                   (7836, 7941), (7650, 7761), (7451, 7563),
                   (7233, 7383), (7070, 7156), (6264, 6436)}

        self.assertEqual(self.tr.introns,
                         introns,
                         self.tr.introns)

        # The (6264, 6436) intron lies in the 3' UTR, so it is not a CDS intron.
        cds_introns = {(8465, 8570), (8326, 8416), (7988, 8235),
                       (7836, 7941), (7650, 7761), (7451, 7563),
                       (7233, 7383), (7070, 7156)}

        self.assertEqual(self.tr.combined_cds_introns,
                         cds_introns,
                         self.tr.combined_cds_introns)

        selected_cds_introns = {(8465, 8570), (8326, 8416), (7988, 8235),
                                (7836, 7941), (7650, 7761), (7451, 7563),
                                (7233, 7383), (7070, 7156)}

        self.assertEqual(self.tr.selected_cds_introns,
                         selected_cds_introns,
                         self.tr.selected_cds_introns)

    def test_strip_cds(self):
        """
        Test the "strip_cds" function which (as the name implies) removes completely the CDS
        from a transcript.
        :return:
        """

        with self.assertLogs("null", level="DEBUG") as log_split:
            self.tr.strip_cds()
        self.assertIn("DEBUG:null:Stripping CDS from AT1G01020.1", log_split.output)

        self.assertEqual(self.tr.selected_cds_length, 0)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.selected_cds, [])
        self.assertEqual(self.tr.selected_cds_start, None)
        self.assertEqual(self.tr.selected_cds_end, None)

    def test_remove_utr(self):
        """Test for UTR removal. We remove the UTRs and verify that start/end
        have moved, no UTR is present, and the CDS is untouched."""

        self.tr.remove_utrs()

        self.assertEqual(self.tr.selected_cds_start, self.tr.end,
                         ((self.tr.selected_cds_start, self.tr.selected_cds_end),
                          (self.tr.start, self.tr.end)))
        self.assertEqual(self.tr.selected_cds_end, self.tr.start)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        combined_cds = [(6915, 7069),
                        (7157, 7232),
                        (7384, 7450),
                        (7564, 7649),
                        (7762, 7835),
                        (7942, 7987),
                        (8236, 8325),
                        (8417, 8464),
                        (8571, 8666)]
        self.assertEqual(self.tr.combined_cds,
                         combined_cds,
                         self.tr.combined_cds)
        self.assertEqual(self.tr.combined_utr, [], self.tr.combined_utr)

    def test_load_orf(self):

        """Test for loading a single ORF. We strip the CDS and reload it."""

        self.tr.strip_cds()
        self.tr.load_orfs([self.orf])

        combined_cds = [(6915, 7069), (7157, 7232), (7384, 7450),
                        (7564, 7649), (7762, 7835), (7942, 7987),
                        (8236, 8325), (8417, 8464), (8571, 8666)]

        self.assertEqual(self.tr.combined_cds,
                         combined_cds,
                         self.tr.combined_cds)
        self.assertEqual(self.tr.selected_cds_start, 8666)
        self.assertEqual(self.tr.selected_cds_end, 6915)

    def test_negative_orf(self):
        """Test loading a negative strand ORF onto a multiexonic transcript.
        This should have no effect.
        """

        self.orf.strand = "-"
        self.tr.strip_cds()
        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.selected_cds_start, None)

    def testSegments(self):
        """Check the CDS segment counts, the longest intron and the ORF count."""

        self.assertEqual(self.tr.combined_cds_num, 9)
        self.assertEqual(self.tr.selected_cds_num, 9)
        self.assertEqual(self.tr.highest_cds_exon_number, 9)
        self.assertEqual(self.tr.max_intron_length, 248)
        self.assertEqual(self.tr.number_internal_orfs, 1)

    def test_lengths(self):
        """Check the cDNA and CDS lengths and the CDS fractions."""

        self.assertEqual(self.tr.cdna_length, 1623)
        self.assertEqual(self.tr.selected_cds_length, 738)
        self.assertAlmostEqual(self.tr.combined_cds_fraction, 738 / 1623, delta=0.01)
        self.assertAlmostEqual(self.tr.selected_cds_fraction, 738 / 1623, delta=0.01)

    def test_print_no_cds(self):
        """Check the GFF and GTF renderings when the CDS is left out."""

        self.maxDiff = None

        real_printed = """Chr1\tTAIR10\tmRNA\t5928\t8737\t.\t-\t.\tID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1
Chr1\tTAIR10\texon\t5928\t6263\t.\t-\t.\tID=AT1G01020.1.exon1;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t6437\t7069\t.\t-\t.\tID=AT1G01020.1.exon2;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7157\t7232\t.\t-\t.\tID=AT1G01020.1.exon3;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7384\t7450\t.\t-\t.\tID=AT1G01020.1.exon4;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7564\t7649\t.\t-\t.\tID=AT1G01020.1.exon5;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7762\t7835\t.\t-\t.\tID=AT1G01020.1.exon6;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t7942\t7987\t.\t-\t.\tID=AT1G01020.1.exon7;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t8236\t8325\t.\t-\t.\tID=AT1G01020.1.exon8;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t8417\t8464\t.\t-\t.\tID=AT1G01020.1.exon9;Parent=AT1G01020.1
Chr1\tTAIR10\texon\t8571\t8737\t.\t-\t.\tID=AT1G01020.1.exon10;Parent=AT1G01020.1"""

        self.assertEqual(real_printed, self.tr.format("gff", with_cds=False))

        real_printed_gtf = """Chr1\tTAIR10\tmRNA\t5928\t8737\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1"; Name "AT1G01020.1";
Chr1\tTAIR10\texon\t5928\t6263\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t6437\t7069\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t7157\t7232\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t7384\t7450\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t7564\t7649\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t7762\t7835\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t7942\t7987\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t8236\t8325\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t8417\t8464\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";
Chr1\tTAIR10\texon\t8571\t8737\t.\t-\t.\tgene_id "AT1G01020"; transcript_id "AT1G01020.1";"""

        self.assertEqual(real_printed_gtf, self.tr.__str__(print_cds=False, to_gtf=True))

    def testDoubleOrf(self):

        """Test to verify the introduction of multiple ORFs."""

        self.tr.strip_cds()
        self.tr.finalized = False

        first_orf = self._make_orf(self.tr.id, 100, 501)
        self.assertFalse(first_orf.invalid, (len(first_orf), first_orf.cds_len))

        # This should not be incorporated
        second_orf = self._make_orf("second", 300, 401, start=0, score=1)
        self.assertFalse(second_orf.invalid, (len(second_orf), second_orf.cds_len))

        self.assertTrue(Mikado.loci.Transcript.is_overlapping_cds(first_orf, second_orf))

        # This should be added
        third_orf = self._make_orf("third", 1000, 1602)
        self.assertFalse(third_orf.invalid, (len(third_orf), third_orf.cds_len))

        self.assertFalse(
            Mikado.loci.Transcript.is_overlapping_cds(
                first_orf, third_orf))
        self.assertFalse(
            Mikado.loci.Transcript.is_overlapping_cds(
                second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]
        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(self.tr.start, 5928)

        self.assertEqual(self.tr.number_internal_orfs, 2, "\n".join(
            [str(x) for x in self.tr.internal_orfs]))

        self.assertEqual(self.tr.combined_cds_length, 1005)
        self.assertEqual(self.tr.selected_cds_length, 603)

        new_transcripts = sorted(self.tr.split_by_cds(), key=operator.attrgetter("start"))

        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].five_utr_length, 0)
        self.assertNotEqual(new_transcripts[0].three_utr_length, 0)
        self.assertEqual(new_transcripts[0].cdna_length, 624, msg="{0}-{1}{2}".format(
            new_transcripts[0].start,
            new_transcripts[0].end,
            new_transcripts[0].strand,
        ))
        self.assertEqual(new_transcripts[0].start, self.tr.start)
        self.assertEqual(new_transcripts[0].end, 6724)

        self.assertEqual(new_transcripts[1].three_utr_length, 0)
        self.assertEqual(new_transcripts[1].end, 8737)
Example #9
0
    def setUp(self):
        """Prepare a null logger and a default configuration for the test.

        The configuration points at the name of the class-level genome file
        handle and has the "keep_redundant" option of "prepare" switched on.
        """
        self.logger = create_null_logger("prepare")

        self.conf = configurator.to_json(None)
        self.conf["prepare"]["keep_redundant"] = True
        self.conf["reference"]["genome"] = self.__genomefile__.name
class TranscriptTester(unittest.TestCase):
    """Tests built around a single-exon transcript (AT1G01020.1) and its ORFs."""

    # The fixture is written with runs of spaces for readability; it is
    # converted into real tab-delimited GFF3 lines right below.
    tr_gff = """Chr1    TAIR10    mRNA    5928    8737    .    .    .    ID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1;Index=1
Chr1    TAIR10    exon    5928    8737    .    .    .    Parent=AT1G01020.1"""

    tr_lines = tr_gff.split("\n")
    for pos, line in enumerate(tr_lines):
        # Raw string: "\s" inside a plain literal is an invalid escape sequence.
        tr_lines[pos] = re.sub(r"\s+", "\t", line)
        # Every GFF3 line must end up with exactly nine tab-separated columns.
        assert len(tr_lines[pos].split("\t")) == 9, line.split("\t")

    tr_gff_lines = [Mikado.parsers.GFF.GffLine(line) for line in tr_lines]

    # None of the fixture lines should be parsed as a header line.
    for l in tr_gff_lines:
        assert l.header is False

    logger = create_null_logger("null")

    def setUp(self):
        """Build the fixture transcript and a matching transcript-level ORF.

        ``self.tr`` is a single-exon, plus-strand transcript spanning
        Chr1:5928-8737 with one CDS segment at 8571-8666. ``self.orf`` is a
        BED12 object describing that same CDS with coordinates expressed
        relative to the transcript start.
        """

        self.tr = Transcript()
        self.tr.logger = self.logger
        self.tr.chrom = "Chr1"
        self.tr.source = "TAIR10"
        self.tr.feature = "mRNA"
        self.tr.start = 5928
        self.tr.end = 8737
        self.tr.strand = "+"
        # One exon covering the whole transcript.
        self.tr.add_exon((5928, 8737))
        self.tr.score = None
        self.tr.id, self.tr.parent, self.tr.name = "AT1G01020.1", "AT1G01020", "AT1G01020.1"
        self.tr.add_exon((8571, 8666), "CDS")
        self.tr.finalize()

        # The ORF uses the transcript ID as its "chromosome" and spans the
        # whole cDNA (1 .. cdna_length).
        self.orf = Mikado.parsers.bed12.BED12()
        self.orf.chrom = self.tr.id
        self.orf.start = 1
        self.orf.end = self.tr.cdna_length
        self.orf.name = self.tr.id
        self.orf.strand = "+"
        self.orf.score = 0
        # Genomic CDS boundaries shifted by the transcript start (5928) and
        # made 1-based.
        self.orf.thick_start = 8571 - 5928 + 1
        self.orf.thick_end = 8666 - 5928 + 1
        self.orf.block_count = 1
        # NOTE(review): the other tests in this class set ``block_sizes`` and
        # ``block_starts`` as lists; here a scalar ``blockSize`` and a scalar
        # ``block_starts`` are used instead - confirm this is intended.
        self.orf.blockSize = self.tr.cdna_length
        self.orf.block_starts = 0
        self.orf.has_start_codon = True
        self.orf.has_stop_codon = True
        self.orf.transcriptomic = True
        self.assertFalse(self.orf.invalid, self.orf.invalid_reason)
        # The CDS length must be a multiple of three.
        self.assertEqual((self.orf.thick_end - self.orf.thick_start + 1) % 3,
                         0)

    def test_invalid_inizialization(self):
        """Building a Transcript from the exon line of the fixture must raise TypeError."""

        exon_line = self.tr_gff_lines[1]
        self.assertRaises(TypeError, Mikado.loci.Transcript, exon_line)

    def test_basics(self):
        """Check chromosome, boundaries and exon structure of the fixture transcript."""

        transcript = self.tr
        only_exon = (5928, 8737)

        self.assertEqual(transcript.chrom, "Chr1")
        self.assertEqual(transcript.exon_num, 1)
        self.assertEqual(transcript.monoexonic, True)
        self.assertEqual(transcript.exon_num, len(transcript.exons))
        self.assertEqual(transcript.start, only_exon[0])
        self.assertEqual(transcript.end, only_exon[1])
        self.assertEqual(transcript.exons, [only_exon], transcript.exons)

    def test_cds(self):
        """Load the fixture ORF and check the CDS reported by the transcript.

        The ORF declares both a start and a stop codon; after loading, the
        transcript is expected to report both as present.
        """

        cds_start, cds_end = 8571, 8666

        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.combined_cds, self.tr.selected_cds)

        self.assertEqual(self.tr.combined_cds, [(cds_start, cds_end)],
                         self.tr.combined_cds)
        self.assertEqual(self.tr.selected_cds_start, cds_start)
        self.assertEqual(self.tr.selected_cds_end, cds_end)
        self.assertEqual(self.tr.has_start_codon, True)
        self.assertEqual(self.tr.has_stop_codon, True)

    def test_equality(self):
        """A deep copy equals the original until its strand or exons are changed."""

        clone = self.tr.deepcopy()
        self.assertTrue(clone == self.tr)

        # Removing the strand alone must break equality.
        clone.strand = None
        self.assertFalse(clone == self.tr)

        # Re-open the copy, restore the strand (required once the copy gains
        # a second exon) and stretch its end to make room for the new exon.
        clone.unfinalize()
        clone.strand = "+"
        clone.end = 9737

        # Recycle the fixture exon line as the template for the extra exon.
        extra_exon = Mikado.parsers.GFF.GffLine(self.tr_lines[-1])
        extra_exon.strand = "+"
        extra_exon.start = 9000
        extra_exon.end = 9737
        clone.add_exon(extra_exon)

        clone.finalize()
        self.assertTrue(clone != self.tr)

    def test_mono_finalising(self):
        """Rebuild the transcript from the parsed GFF lines and check both UTRs are non-empty."""

        mrna_entries = []
        exon_entries = []
        for entry in self.tr_gff_lines:
            if entry.feature == "mRNA":
                mrna_entries.append(entry)

        self.assertEqual(len(mrna_entries), 1,
                         "\n".join(str(entry) for entry in self.tr_gff_lines))

        rebuilt = Mikado.loci.Transcript(mrna_entries[0])
        for entry in self.tr_gff_lines:
            # Keep exon-like lines only, leaving out any UTR feature.
            if entry.is_exon is True and "UTR" not in entry.feature.upper():
                exon_entries.append(entry)
        rebuilt.add_exons(exon_entries)
        rebuilt.add_exon((8571, 8666), "CDS")

        rebuilt.finalize()
        self.assertGreater(rebuilt.three_utr_length, 0)
        self.assertGreater(rebuilt.five_utr_length, 0)

    def test_invalid_transcript(self):
        lines = """Chr1\tTAIR10\tmRNA\t5928\t8737\t.\t.\t.\tID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1;Index=1
Chr1\tTAIR10\tCDS\t8571\t7500\t.\t.\t0\tParent=AT1G01020.1;
Chr1\tTAIR10\tCDS\t7503\t8666\t.\t.\t0\tParent=AT1G01020.1;
Chr1\tTAIR10\texon\t5928\t8737\t.\t.\t.\tParent=AT1G01020.1"""

        gff_lines = [
            Mikado.parsers.GFF.GffLine(line) for line in lines.split("\n")
        ]
        self.assertIsInstance(gff_lines[0], Mikado.parsers.GFF.GffLine)
        checker = False
        if gff_lines[0].feature.endswith(
                "transcript") or "RNA" in gff_lines[0].feature.upper():
            checker = True
        self.assertTrue(checker)
        self.assertTrue(gff_lines[0].is_transcript)
        transcript = Mikado.loci.Transcript(gff_lines[0])

        transcript.logger = self.logger
        transcript.add_exons(gff_lines[1:])

        with self.assertRaises(Mikado.exceptions.InvalidCDS):
            Mikado.loci.transcript_methods.finalizing._check_cdna_vs_utr(
                transcript)

    def test_utr(self):
        """Check the segments of the selected ORF and the UTR lists derived from it."""

        upstream_utr = (5928, 8570)
        downstream_utr = (8667, 8737)

        expected_segments = [("UTR", upstream_utr),
                             ("exon", (5928, 8737)),
                             ("CDS", (8571, 8666), 0),
                             ("UTR", downstream_utr)]
        failure_message = "Right: {0}\nFound{1}".format(
            [("UTR", 5928, 8570), ("CDS", 8571, 8666), ("UTR", 8667, 8737)],
            self.tr.selected_internal_orf)
        self.assertEqual(self.tr.selected_internal_orf, expected_segments,
                         failure_message)

        self.assertEqual(self.tr.combined_utr, [upstream_utr, downstream_utr])
        self.assertEqual(self.tr.five_utr, [upstream_utr], self.tr.five_utr)
        self.assertEqual(self.tr.three_utr, [downstream_utr])

    def test_utr_metrics(self):
        """Check UTR segment counts, UTR lengths and CDS distances from the transcript ends."""

        transcript = self.tr
        tss, tes = 5928, 8737
        cds_start, cds_end = 8571, 8666

        self.assertEqual(transcript.five_utr_num, 1)
        self.assertEqual(transcript.three_utr_num, 1)
        # The 5' UTR runs from the transcript start to the base before the CDS,
        # the 3' UTR from the base after the CDS to the transcript end.
        self.assertEqual(transcript.five_utr_length, cds_start - tss)
        self.assertEqual(transcript.three_utr_length, tes - cds_end)
        self.assertEqual(transcript.selected_start_distance_from_tss,
                         cds_start - tss,
                         transcript.selected_end_distance_from_tes)
        self.assertEqual(
            transcript.selected_end_distance_from_tes, tes - cds_end,
            (transcript.selected_end_distance_from_tes, transcript.strand))

    def test_strip_cds(self):
        """After strip_cds() the transcript must report no CDS and no UTR."""

        self.tr.strip_cds()
        self.assertEqual(self.tr.selected_cds_length, 0)
        for emptied in ("three_utr", "five_utr", "selected_cds"):
            self.assertEqual(getattr(self.tr, emptied), [])
        for unset in ("selected_cds_start", "selected_cds_end"):
            self.assertEqual(getattr(self.tr, unset), None)

    def test_remove_utr(self):
        """Remove the UTRs and check the transcript now starts and ends with its CDS.

        The CDS itself must be unchanged and no UTR segment may remain.
        """

        self.tr.remove_utrs()
        transcript = self.tr

        self.assertEqual(transcript.selected_cds_start, transcript.start)
        self.assertEqual(transcript.selected_cds_end, transcript.end)
        self.assertEqual(transcript.three_utr, [])
        self.assertEqual(transcript.five_utr, [])
        self.assertEqual(transcript.combined_cds, [(8571, 8666)],
                         transcript.combined_cds)
        self.assertEqual(transcript.combined_utr, [], transcript.combined_utr)

    def test_negative_orf(self):
        """Load a minus-strand ORF onto the monoexonic transcript.

        The transcript is expected to flip to the minus strand, with the CDS
        start and end swapped accordingly.
        """

        self.tr.strip_cds(strand_specific=False)

        reversed_orf = self.orf
        reversed_orf.strand = "-"
        reversed_orf.has_stop_codon = False
        self.tr.load_orfs([reversed_orf])

        self.assertEqual(self.tr.strand, "-")
        self.assertEqual(self.tr.selected_cds_start, 8666)
        self.assertEqual(self.tr.selected_cds_end, 8571)

    def test_introns(self):
        """A single-exon transcript must not report introns of any kind."""

        no_introns = set()
        self.assertEqual(self.tr.introns, no_introns, self.tr.introns)
        self.assertEqual(self.tr.combined_cds_introns, no_introns,
                         self.tr.combined_cds_introns)
        self.assertEqual(self.tr.selected_cds_introns, no_introns,
                         self.tr.selected_cds_introns)

    def testDoubleOrf(self):
        """Load three plus-strand ORFs, two of them overlapping, then split by CDS.

        Two internal ORFs are expected to be retained, and splitting the
        transcript by CDS must produce two transcripts.
        """

        def build_orf(name, thick_start, thick_end):
            """Return a plus-strand, transcript-level ORF spanning the whole cDNA."""
            orf = Mikado.parsers.bed12.BED12()
            orf.chrom = self.tr.id
            orf.start = 1
            orf.end = self.tr.cdna_length
            orf.name = name
            orf.strand = "+"
            orf.score = 0
            orf.thick_start = thick_start
            orf.thick_end = thick_end
            orf.block_count = 1
            orf.blockSize = self.tr.cdna_length
            orf.block_sizes = [self.tr.cdna_length]
            orf.block_starts = [0]
            orf.rgb = 0
            orf.has_start_codon = True
            orf.has_stop_codon = True
            orf.transcriptomic = True
            return orf

        overlaps = Mikado.loci.Transcript.is_overlapping_cds

        self.tr.strip_cds()
        self.tr.finalized = False

        first_orf = build_orf(self.tr.id, 51, 398)
        self.assertFalse(first_orf.invalid)

        # Overlaps the first ORF, so it should not be incorporated.
        second_orf = build_orf("second", 201, 410)
        self.assertFalse(second_orf.invalid)
        self.assertTrue(overlaps(first_orf, second_orf))

        # Independent of the other two, so it should be added.
        third_orf = build_orf("third", 501, 800)
        self.assertFalse(third_orf.invalid)

        self.assertFalse(overlaps(first_orf, third_orf))
        self.assertFalse(overlaps(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]

        self.tr.logger = self.logger

        # Load each ORF on its own first, then all of them together.
        for candidate in candidates:
            self.tr.load_orfs([candidate])
        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(
            self.tr.number_internal_orfs, 2,
            (self.tr.cdna_length, self.tr.selected_start_distance_from_tss,
             self.tr.selected_end_distance_from_tes))

        self.assertEqual(self.tr.combined_cds_length, 648)
        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join(str(orf) for orf in self.tr.internal_orfs))

        upstream, downstream = None, None
        split_transcripts = sorted(self.tr.split_by_cds())

        self.assertEqual(len(split_transcripts), 2)
        upstream, downstream = split_transcripts[0], split_transcripts[1]
        self.assertEqual(upstream.three_utr_length, 0)
        self.assertEqual(downstream.five_utr_length, 0)

    def testDoubleOrf_negative(self):
        """Load three minus-strand ORFs onto the transcript, then split by CDS.

        Mirrors the plus-strand test: two internal ORFs are expected to be
        retained and the split must produce two transcripts.
        """

        def build_orf(name, thick_start, thick_end):
            """Return a minus-strand, transcript-level ORF spanning the whole cDNA."""
            orf = Mikado.parsers.bed12.BED12()
            orf.chrom = self.tr.id
            orf.start = 1
            orf.end = self.tr.cdna_length
            orf.name = name
            orf.strand = "-"
            orf.score = 0
            orf.thick_start = thick_start
            orf.thick_end = thick_end
            orf.block_count = 1
            orf.blockSize = self.tr.cdna_length
            orf.block_sizes = [self.tr.cdna_length]
            orf.block_starts = [0]
            orf.rgb = 0
            orf.has_start_codon = True
            orf.has_stop_codon = True
            orf.transcriptomic = True
            return orf

        overlaps = Mikado.loci.Transcript.is_overlapping_cds

        self.tr.strip_cds(strand_specific=False)
        self.tr.finalized = False

        first_orf = build_orf(self.tr.id, 51, 398)
        self.assertFalse(first_orf.invalid)

        # This one should not be incorporated.
        second_orf = build_orf("second", 201, 410)
        self.assertFalse(second_orf.invalid)

        # This one should be added.
        third_orf = build_orf("third", 501, 800)
        self.assertFalse(third_orf.invalid)

        self.assertFalse(overlaps(first_orf, third_orf))
        self.assertFalse(overlaps(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]

        self.tr.logger = self.logger

        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(
            self.tr.number_internal_orfs, 2,
            (self.tr.cdna_length, self.tr.selected_start_distance_from_tss,
             self.tr.selected_end_distance_from_tes))

        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join(str(orf) for orf in self.tr.internal_orfs))

        split_transcripts = sorted(self.tr.split_by_cds())

        self.assertEqual(len(split_transcripts), 2)
        self.assertEqual(split_transcripts[0].five_utr_length, 0)
        self.assertEqual(split_transcripts[1].three_utr_length, 0)

    def test_wrong_orf(self):
        """An ORF ending one base past the transcript cDNA must not make it coding.

        Loading the ORF is also expected to emit at least one record on the
        "null" logger.
        """
        cdna_length = self.tr.cdna_length

        # The ORF end exceeds the cDNA length by one base.
        oversized_orf = Mikado.parsers.bed12.BED12()
        oversized_orf.chrom = self.tr.id
        oversized_orf.start = 1
        oversized_orf.end = cdna_length + 1
        oversized_orf.name = "third"
        oversized_orf.strand = "-"
        oversized_orf.score = 0
        oversized_orf.thick_start = 501
        oversized_orf.thick_end = 800
        oversized_orf.block_count = 1
        oversized_orf.blockSize = cdna_length
        oversized_orf.block_sizes = [cdna_length]
        oversized_orf.block_starts = [0]
        oversized_orf.rgb = 0
        oversized_orf.has_start_codon = True
        oversized_orf.has_stop_codon = True
        oversized_orf.transcriptomic = True
        # Taken on its own, the BED12 object is still considered valid.
        self.assertFalse(oversized_orf.invalid)

        self.tr.logger = self.logger
        self.tr.strip_cds()
        self.tr.strand = "+"
        self.logger.setLevel("WARNING")
        with self.assertLogs("null", level="DEBUG"):
            self.tr.load_orfs([oversized_orf])

        self.assertFalse(self.tr.is_coding)
Example #11
0
def transfer_cds(transcript: Transcript,
                 ref_cdna: str,
                 ref_bed: BED12,
                 target_cdna: str,
                 target_bed: BED12,
                 logger=create_null_logger()):
    """Transfer the CDS of a reference transcript onto a target transcript.

    The reference peptide is obtained by translating the thick region of
    ``ref_bed`` on ``ref_cdna`` and is then aligned onto ``target_cdna`` via
    :func:`transfer_by_alignment`. The transcript gets two attributes:

    * ``aligner_cds``: True when the transferred CDS matches the thick
      coordinates ``target_bed`` had on entry;
    * ``original_cds``: True when the reference CDS boundaries, projected
      through the cDNA-to-cDNA alignment, coincide with the final ones.

    For a non-coding reference the CDS is stripped, ``aligner_cds`` is set to
    False and ``was_coding`` records whether the transcript was coding.

    :param transcript: transcript to update in place; ``None`` is passed through.
    :param ref_cdna: cDNA sequence of the reference transcript.
    :param ref_bed: transcriptomic BED12 of the reference ORF.
    :param target_cdna: cDNA sequence of the target transcript.
    :param target_bed: transcriptomic BED12 of the target, updated in place.
    :param logger: logger used for debug and warning messages.
    :returns: ``(transcript, target_bed, pep_coords)``, with ``pep_coords`` a
        ``(start, end, flag)`` triple. The flag is False for a ``None``
        transcript, True for a non-coding reference, and otherwise tells
        whether the alignment spans the reference peptide from 1 to its length.
    :raises AssertionError: if ``target_bed`` is not transcriptomic, or if the
        two cDNAs are identical and yet the CDS could not be transferred intact.
    """

    if transcript is None:
        return transcript, target_bed, (None, None, False)

    transcript.finalize()
    assert target_bed.transcriptomic is True

    logger.debug("Starting with %s, phases: %s (BED %s)", transcript.id,
                 transcript.phases, target_bed.phase)

    if ref_bed.coding is False:
        # The message has a single placeholder: pass a single argument,
        # otherwise the logging module reports a formatting error.
        logger.debug("%s is non coding, returning immediately.", transcript.id)
        transcript.attributes["aligner_cds"] = False
        transcript.attributes["was_coding"] = transcript.is_coding
        target_bed.coding = False
        transcript.strip_cds()
        return transcript, target_bed, (None, None, True)

    original_start, original_end = target_bed.thick_start, target_bed.thick_end
    ref_pep = str(
        Seq.Seq(str(
            ref_cdna[ref_bed.thick_start -
                     1:ref_bed.thick_end])).translate(to_stop=False))

    first_stop = ref_pep.find("*")
    if first_stop == -1:
        pass
    elif abs(first_stop * 3 - ref_bed.cds_len) in (0, 3):
        # This is the "good" case: the first stop closes the CDS.
        ref_pep = ref_pep[:first_stop]
    else:
        logger.warning(
            "The sequence of %s has in frame stop codons. Adjusting the program to take this into account.",
            ref_bed.name)

    logger.debug("%s now has phases: %s (%s)", transcript.id,
                 transcript.phases, target_bed.phase)
    target_bed, pep_coords = transfer_by_alignment(ref_pep,
                                                   target_cdna,
                                                   target_bed,
                                                   logger=logger)
    logger.debug("%s now has phases: %s; target bed: %s", transcript.id,
                 transcript.phases, target_bed.phase)
    pep_coords = (pep_coords[0], pep_coords[1],
                  (pep_coords[0] == 1 and pep_coords[1] == len(ref_pep)))

    if target_bed.thick_start == original_start and target_bed.thick_end == original_end:
        transcript.attributes["aligner_cds"] = True
        logger.debug("%s now has phases: %s", transcript.id,
                     transcript.phases)
    else:
        # The CDS moved: drop the old one and reload it from the updated BED.
        transcript.attributes["aligner_cds"] = False
        transcript.strip_cds()
        if target_bed.coding is True:
            transcript.load_orfs([target_bed])

    logger.debug("%s now has phases: %s", transcript.id, transcript.phases)
    # Now we have to decide whether the transcript has the "original" CDS or not
    _, cigar = transfer.get_and_prepare_cigar(str(ref_cdna),
                                              str(target_cdna))
    ref_array, target_array = transfer.create_translation_array(cigar)
    # NOTE(review): ``.index`` on a list or tuple raises ValueError, not
    # IndexError, when the position is missing; both are caught so that the
    # fallback is actually reachable. Confirm the type of ``ref_array``.
    try:
        target_start = target_array[ref_array.index(ref_bed.thick_start)]
    except (IndexError, ValueError):
        target_start = target_bed.start
    try:
        target_end = target_array[ref_array.index(ref_bed.thick_end)]
    except (IndexError, ValueError):
        target_end = target_bed.end

    if target_start == target_bed.thick_start and target_end == target_bed.thick_end:
        transcript.attributes["original_cds"] = True
    else:
        transcript.attributes["original_cds"] = False

    if ref_cdna == target_cdna:
        # Identical sequences: the CDS must have been transferred unchanged.
        logger.debug("%s now has phases: %s", transcript.id,
                     transcript.phases)
        if transcript.is_coding is False:
            raise AssertionError("{} not coding".format(transcript.id))
        elif transcript.attributes["original_cds"] is False:
            raise AssertionError("\n".join([
                str(_) for _ in [
                    transcript.id,
                    (target_bed.thick_start, target_start,
                     target_bed.thick_start == target_start),
                    (target_bed.thick_end, target_end,
                     target_bed.thick_end == target_end
                     ), target_bed.thick_start == target_start
                    and target_bed.thick_end == target_end
                ]
            ]))

    return transcript, target_bed, pep_coords
Example #12
0
def transfer_by_alignment(ref_pep,
                          target_cdna,
                          target_bed,
                          logger=create_null_logger()):
    """Place a reference peptide onto a target cDNA by protein alignment.

    The target cDNA is translated in its three forward frames, each
    translation is aligned against ``ref_pep`` and the best-scoring frame is
    used to set the thick (CDS) coordinates of ``target_bed`` in place.

    :param ref_pep: reference peptide sequence.
    :param target_cdna: cDNA sequence of the target transcript.
    :param target_bed: BED12 object of the target, modified in place.
    :param logger: logger used for debug messages.
    :returns: ``(target_bed, coords)``. ``coords`` is ``(None, None, False)``
        when no CDS could be placed (``target_bed.coding`` is then False),
        otherwise the pair ``(pep_start, pep_end)``.
    """
    logger.debug("Phase for %s: %s", target_bed.name, target_bed.phase)

    # Three-frame translation of the target cDNA.
    translations = {
        frame: str(Seq.Seq(str(target_cdna[frame:])).translate(to_stop=False))
        for frame in range(3)
    }

    # Align the reference peptide against every frame.
    frame_res = dict()
    for frame, translation in translations.items():
        alignment, frame_cigar = transfer.get_and_prepare_cigar(
            ref_pep,
            translation,
            open=3,
            extend=1,
            matrix=parasail.blosum85)
        frame_res[frame] = (alignment, frame_cigar)

    logger.debug("Frames for %s (phase %s): %s", target_bed.name,
                 target_bed.phase, frame_res)
    # Highest score wins; on ties the lowest frame is kept.
    best_frame = max(frame_res, key=lambda frame: frame_res[frame][0].score)
    best_cigar = frame_res[best_frame][1]
    logger.debug("Best frame for %s: %s (cigar: %s)", target_bed.name,
                 best_frame, best_cigar)

    # Walk the leading CIGAR operations until the first one flagged in both
    # positions of ``transfer.op_consumes``; along the way, add up the length
    # of every operation whose first flag is unset.
    translation_start = 0
    for length, operation in best_cigar:
        consumed = transfer.op_consumes[operation]
        if not consumed[0]:
            translation_start += length
        elif consumed[1]:
            break

    logger.debug("Translation start for %s: %s; phase: %s", target_bed.name,
                 translation_start, target_bed.phase)
    # Peptide offset to 0-based nucleotide offset, accounting for the frame.
    translation_start = 3 * translation_start + best_frame

    stop_at_first = ref_pep.count("*") <= 1
    translated = str(
        Seq.Seq(str(target_cdna[translation_start:])).translate(
            to_stop=stop_at_first))

    # BED12 thick coordinates are 1-based.
    target_bed.thick_start = translation_start + 1
    end = target_bed.thick_start + len(translated) * 3 - 1

    logger.debug("Phase for %s: %s", target_bed.name, target_bed.phase)
    if not translated:
        target_bed.coding = False
        return target_bed, (None, None, False)
    if translated[0] != ref_pep[0]:
        # A different first residue is only accepted when the CDS begins
        # within the first codon: it is then recorded as a phase.
        if translation_start not in (0, 1, 2):
            target_bed.coding = False
            return target_bed, (None, None, False)
        target_bed.phase = translation_start
        target_bed.thick_start = 1

    # Get the coordinates on the original protein
    _, pep_cigar = transfer.get_and_prepare_cigar(
        ref_pep, translated, open=3, extend=1, matrix=parasail.blosum85)
    pep_ref_array, pep_target_array = transfer.create_translation_array(
        pep_cigar)

    pep_start, pep_end = None, None
    for pos, ref_value in enumerate(pep_ref_array, start=1):
        if ref_value and pep_target_array[pos - 1] and not pep_start:
            pep_start = pos
        # NOTE(review): assigned on every iteration, so pep_end always ends
        # up as the array length rather than the last aligned position -
        # confirm this is intended.
        pep_end = pos

    # Add the stop codon when it fits, otherwise presume the CDS is open.
    end = min(end + 3, len(target_cdna))

    target_bed.thick_end = end
    target_bed.coding = True
    target_bed.transcriptomic = True
    logger.debug("Phase for %s: %s", target_bed.name, target_bed.phase)
    return target_bed, (pep_start, pep_end)
Example #13
0
class LocusTester(unittest.TestCase):
    """Tests for building superloci and loci out of simple transcripts."""

    logger = create_null_logger("locus_tester")

    def setUp(self):
        """Create a monoexonic coding transcript (t0), a three-exon one (t1) and the configuration.

        Along the way, check that finalized transcripts reject new exons and
        that a Superlocus cannot be created without a configuration.
        """

        first_lines = [GFF.GffLine(row) for row in """Chr1\tfoo\ttranscript\t101\t300\t.\t+\t.\tID=t0
Chr1\tfoo\texon\t101\t300\t.\t+\t.\tID=t0:exon1;Parent=t0
Chr1\tfoo\tCDS\t101\t250\t.\t+\t.\tID=t0:exon1;Parent=t0""".split("\n")]
        first_header = first_lines[0]
        self.assertEqual(first_header.chrom, "Chr1", first_header)
        self.transcript1 = Transcript(first_header)
        for feature in first_lines[1:]:
            self.transcript1.add_exon(feature)
        self.transcript1.finalize()
        self.assertTrue(self.transcript1.monoexonic)
        self.assertEqual(self.transcript1.chrom, first_header.chrom)

        second_lines = [GFF.GffLine(row) for row in """Chr1\tfoo\ttranscript\t101\t600\t.\t+\t.\tID=t1
Chr1\tfoo\texon\t101\t200\t.\t+\t.\tID=t1:exon1;Parent=t1
Chr1\tfoo\texon\t301\t400\t.\t+\t.\tID=t1:exon2;Parent=t1
Chr1\tfoo\texon\t501\t600\t.\t+\t.\tID=t1:exon3;Parent=t1""".split("\n")]
        last_exon = second_lines[-1]
        self.transcript2 = Transcript(second_lines[0], logger=self.logger)

        # Leave the last exon out on purpose: the exons then fall short of
        # the declared transcript end.
        for exon in second_lines[1:-1]:
            self.transcript2.add_exon(exon)
        # Finalizing in this state is expected to log a warning ...
        with self.assertLogs("null", level="WARNING"):
            self.transcript2.finalize()
        # ... and the finalized transcript must refuse the missing exon.
        with self.assertRaises(exceptions.ModificationError):
            self.transcript2.add_exon(last_exon)

        # Re-open the transcript, restore its boundaries and complete it.
        self.transcript2.finalized = False
        self.transcript2.start = 101
        self.transcript2.end = 600
        self.transcript2.add_exon(last_exon)
        self.transcript2.finalize()
        self.assertFalse(self.transcript2.monoexonic)
        self.assertEqual(self.transcript2.exon_num, len(second_lines) - 1)
        # Modifying a transcript after it has been finalized must fail.
        with self.assertRaises(exceptions.ModificationError):
            for exon in second_lines[1:]:
                self.transcript2.add_exon(exon)
        # Creating a superlocus without a configuration must fail.
        with self.assertRaises(exceptions.NoJsonConfigError):
            _ = Superlocus(self.transcript1)

        config_path = os.path.join(os.path.dirname(__file__),
                                   "configuration.yaml")
        self.my_json = configurator.to_json(config_path)
        self.assertIn("scoring", self.my_json, self.my_json.keys())

    def test_locus(self):
        """Walk a superlocus through subloci, monosubloci and loci definition."""

        logger = create_null_logger("null")
        logger.setLevel("WARNING")
        logger.info("Started")

        slocus = Superlocus(self.transcript1,
                            json_conf=self.my_json,
                            logger=logger)
        slocus.add_transcript_to_locus(self.transcript2)
        self.assertEqual(slocus.strand, self.transcript1.strand)
        self.assertEqual(slocus.start,
                         min(self.transcript1.start, self.transcript2.start))
        self.assertEqual(slocus.end,
                         max(self.transcript1.end, self.transcript2.end))

        logger.info(slocus.transcripts)
        slocus.define_subloci()
        logger.info(slocus.subloci)
        logger.info(slocus.transcripts)
        self.assertEqual(len(slocus.transcripts), 2)
        self.assertEqual(len(slocus.subloci), 2)

        slocus.define_monosubloci()
        self.assertEqual(len(slocus.monosubloci), 2)

        slocus.define_loci()
        self.assertEqual(len(slocus.loci), 1)
        # The single locus left is expected to list t0 first.
        first_locus_id = list(slocus.loci.keys())[0]
        first_transcript_id = list(
            slocus.loci[first_locus_id].transcripts.keys())[0]
        self.assertEqual(first_transcript_id, "t0")

        # A minus-strand transcript must get a locus of its own.
        minus_lines = [GFF.GffLine(row) for row in """Chr1\tfoo\ttranscript\t101\t200\t.\t-\t.\tID=tminus0
Chr1\tfoo\texon\t101\t200\t.\t-\t.\tID=tminus0:exon1;Parent=tminus0""".split(
            "\n")]
        transcript3 = Transcript(minus_lines[0])
        for exon in minus_lines[1:]:
            transcript3.add_exon(exon)
        transcript3.finalize()

        minusuperlocus = Superlocus(transcript3, json_conf=self.my_json)
        minusuperlocus.define_loci()
        self.assertEqual(len(minusuperlocus.loci), 1)
        self.assertNotEqual(transcript3.strand, self.transcript1.strand)
Example #14
0
class ASeventsTester(unittest.TestCase):
    """Tests for how a Locus judges candidate alternative-splicing isoforms.

    Every test starts from a Locus seeded with one coding transcript
    (``G1.1``, four exons on Chr1, plus strand, score 20) and then checks the
    verdict of ``is_alternative_splicing`` and/or the transcript count after
    ``add_transcript_to_locus`` for a different candidate.
    """

    logger = create_null_logger("ASevents")

    @staticmethod
    def _build_transcript(tid, parent, start, end, exons, cds,
                          score=20, logger=None):
        """Create and finalize a plus-strand Chr1 transcript.

        :param tid: transcript identifier.
        :param parent: identifier of the parent gene.
        :param start: transcript start coordinate.
        :param end: transcript end coordinate.
        :param exons: list of (start, end) tuples added as "exon" features.
        :param cds: list of (start, end) tuples added as "CDS" features.
        :param score: value stored in the transcript ``score`` attribute.
        :param logger: optional logger attached just before finalization.
        :return: the finalized Transcript.
        """

        transcript = Transcript()
        transcript.chrom = "Chr1"
        transcript.strand = "+"
        transcript.score = score
        transcript.id = tid
        transcript.parent = parent
        transcript.start = start
        transcript.end = end
        transcript.add_exons(exons, "exon")
        transcript.add_exons(cds, "CDS")
        if logger is not None:
            transcript.logger = logger
        transcript.finalize()
        return transcript

    def _build_extended_isoform(self, score=20):
        """Return transcript G2.1: the seed's structure with a shortened
        fourth exon and an extra fifth exon, sharing the seed's CDS."""

        return self._build_transcript(
            "G2.1", "G2", 101, 1600,
            exons=[(101, 500), (601, 700), (1001, 1300), (1401, 1460),
                   (1501, 1600)],
            cds=[(401, 500), (601, 700), (1001, 1300), (1401, 1440)],
            score=score)

    def _assert_transcript_count(self, expected):
        """Check how many transcripts the locus currently holds."""

        self.assertEqual(len(self.locus.transcripts), expected,
                         self.locus.transcripts)

    def setUp(self):
        """Prepare the AS configuration, the seed transcript and its Locus."""

        as_settings = {
            "max_utr_length": 10000,
            "max_fiveutr_length": 10000,
            "max_threeutr_length": 10000,
            "valid_ccodes": ["j", "J", "O", "mo"],
            "redundant_ccodes": ["c", "=", "_", "m"],
            "only_confirmed_introns": False,
            "min_score_perc": 0.5,
            "keep_retained_introns": True,
            "min_cdna_overlap": 0.2,
            "min_cds_overlap": 0.2,
            "max_isoforms": 3,
        }
        self.conf = {"pick": {"alternative_splicing": as_settings}}

        self.t1 = self._build_transcript(
            "G1.1", "G1", 101, 1500,
            exons=[(101, 500), (601, 700), (1001, 1300), (1401, 1500)],
            cds=[(401, 500), (601, 700), (1001, 1300), (1401, 1440)])

        self.locus = Locus(self.t1)
        self.locus.logger = self.logger
        self.locus.json_conf = self.conf

    def test_not_intersecting(self):
        """A candidate contained in the seed must be refused with code "c"."""

        # This one is contained and should be rejected
        contained = self._build_transcript(
            "G1.1", "G1", 601, 1420,
            exons=[(601, 700), (1001, 1300), (1401, 1420)],
            cds=[(601, 700), (1001, 1300), (1401, 1420)])

        verdict = self.locus.is_alternative_splicing(contained)[:2]
        self.assertEqual(verdict, (False, "c"))

    def test_valid_as(self):
        """The extended isoform is accepted with code "J" and gets added."""

        candidate = self._build_extended_isoform()

        verdict = self.locus.is_alternative_splicing(candidate)[:2]
        self.assertEqual(verdict, (True, "J"))

        self.locus.add_transcript_to_locus(candidate)
        self._assert_transcript_count(2)

    def test_redundant_as(self):
        """An isoform differing from an accepted one only at its outer ends
        is refused (code "J") and the locus keeps two transcripts."""

        accepted = self._build_extended_isoform()
        self.locus.add_transcript_to_locus(accepted)
        self._assert_transcript_count(2)

        # Same internal exon boundaries as G2.1, different start and end.
        lookalike = self._build_transcript(
            "G3.1", "G3", 201, 1630,
            exons=[(201, 500), (601, 700), (1001, 1300), (1401, 1460),
                   (1501, 1630)],
            cds=[(401, 500), (601, 700), (1001, 1300), (1401, 1440)])

        verdict = self.locus.is_alternative_splicing(lookalike)[:2]
        self.assertEqual(verdict, (False, "J"))
        self.locus.add_transcript_to_locus(lookalike)
        self._assert_transcript_count(2)

    def test_non_redundant_as(self):
        """An isoform with different internal exon boundaries is accepted
        (code "j") and becomes the third transcript of the locus."""

        accepted = self._build_extended_isoform()
        self.locus.add_transcript_to_locus(accepted)
        self._assert_transcript_count(2)

        # Second exon ends at 670 and third starts at 1031, unlike G2.1.
        novel = self._build_transcript(
            "G3.1", "G3", 201, 1630,
            exons=[(201, 500), (601, 670), (1031, 1300), (1401, 1460),
                   (1501, 1630)],
            cds=[(401, 500), (601, 670), (1031, 1300), (1401, 1440)],
            logger=self.logger)

        verdict = self.locus.is_alternative_splicing(novel)[:2]
        self.assertEqual(verdict, (True, "j"))
        self.locus.add_transcript_to_locus(novel)
        self._assert_transcript_count(3)

    def test_lowscore(self):
        """Add the extended isoform with score 1 and expect two transcripts.

        NOTE(review): "min_score_perc" is 0.5 and the seed scores 20, yet the
        assertion expects the low-scoring isoform to be kept - confirm that
        this is the intended behaviour.
        """

        low_scoring = self._build_extended_isoform(score=1)

        self.locus.add_transcript_to_locus(low_scoring)
        self._assert_transcript_count(2)
Example #15
0
    def setUp(self):

        self.conf = configurator.to_json(None)
        self.conf["reference"]["genome"] = self.__genomefile__.name
        self.logger = create_null_logger("prepare")
        self.conf["prepare"]["keep_redundant"] = True