Esempio n. 1
0
class ExternalTester(unittest.TestCase):
    """Check that external scores set on a transcript are kept by ``deepcopy``."""

    def setUp(self):
        """Create a six-exon, plus-strand transcript (not finalized)."""
        exon_coordinates = [
            (47631264, 47631416),
            (47704590, 47704669),
            (47762671, 47762742),
            (47893062, 47893093),
            (47895572, 47895655),
            (48051942, 48051999),
        ]

        transcript = self.transcript = Transcript()
        transcript.chrom = "15"
        transcript.source = "protein_coding"
        transcript.start = 47631264
        transcript.end = 48051999
        transcript.strand = "+"
        transcript.add_exons(exon_coordinates)
        transcript.id = "ENST00000560636"
        transcript.parent = "ENSG00000137872"

    def test_copying(self):
        """Scores must be readable as attributes on the original and on its copy."""
        self.transcript.external_scores.update({"test": 0, "test1": 1})

        expected = (("test", 0), ("test1", 1))
        for score_name, score_value in expected:
            self.assertEqual(
                getattr(self.transcript.external_scores, score_name),
                score_value)

        duplicate = self.transcript.deepcopy()
        for score_name, score_value in expected:
            self.assertEqual(
                getattr(duplicate.external_scores, score_name),
                score_value)
Esempio n. 2
0
class ExternalTester(unittest.TestCase):
    """Verify that ``external_scores`` are carried over when a transcript is deep-copied."""

    def setUp(self):
        """Assemble the transcript fixture: six exons on the plus strand, left unfinalized."""
        fixture = self.transcript = Transcript()
        fixture.chrom = "15"
        fixture.source = "protein_coding"
        fixture.start = 47631264
        fixture.end = 48051999

        exon_spans = [(47631264, 47631416), (47704590, 47704669),
                      (47762671, 47762742), (47893062, 47893093),
                      (47895572, 47895655), (48051942, 48051999)]

        fixture.strand = "+"
        fixture.add_exons(exon_spans)
        fixture.id = "ENST00000560636"
        fixture.parent = "ENSG00000137872"

    def test_copying(self):
        """Set two scores, then read them back from the source and from the deep copy."""
        new_scores = {"test": 0, "test1": 1}
        self.transcript.external_scores.update(new_scores)

        # The scores container exposes its keys as attributes.
        self.assertEqual(self.transcript.external_scores.test, 0)
        self.assertEqual(self.transcript.external_scores.test1, 1)

        cloned = self.transcript.deepcopy()
        self.assertEqual(cloned.external_scores.test, 0)
        self.assertEqual(cloned.external_scores.test1, 1)
Esempio n. 3
0
class TestRetrieval(unittest.TestCase):
    """Tests for the ``retrieval`` helpers, run on a single-exon unstranded transcript."""

    def setUp(self):
        """Build the transcript and attach the configuration read from ``configuration.yaml``."""
        transcript = self.tr = Transcript()
        transcript.chrom = "Chr1"
        transcript.start = 101
        transcript.end = 2000
        transcript.strand = None
        transcript.add_exons([(101, 2000)])
        transcript.id = "test1"
        transcript.parent = "gene1"
        transcript.finalize()

        config_path = os.path.join(os.path.dirname(__file__),
                                   "configuration.yaml")
        conf = to_json(config_path)

        # Sanity-check the chimera-split settings the tests rely on.
        chimera_split = conf["pick"]["chimera_split"]
        self.assertTrue(chimera_split["blast_check"])
        self.assertTrue(chimera_split["execute"])
        self.assertEqual(chimera_split["blast_params"]["leniency"],
                         "LENIENT")

        conf["pick"]["orf_loading"]["minimal_secondary_orf_length"] = 50

        transcript.json_conf = conf

    def test_load_pos_and_neg(self):
        """Offer one ORF per strand; both pass the overlap check, one internal ORF is loaded."""
        plus_orf = BED12(transcriptomic=True)
        plus_orf.chrom = self.tr.id
        plus_orf.start = 0
        plus_orf.end = self.tr.cdna_length - 1
        plus_orf.strand = "+"
        plus_orf.name = "first"
        plus_orf.thick_start = 101
        plus_orf.thick_end = 190
        self.assertFalse(plus_orf.invalid)

        minus_orf = plus_orf.copy()
        minus_orf.strand = "-"
        minus_orf.thick_start = 1
        minus_orf.thick_end = 87
        minus_orf.name = "second"
        self.assertFalse(minus_orf.invalid)

        # The call is expected to log at DEBUG level (or above) on the "null" logger.
        with self.assertLogs("null", "DEBUG"):
            retained = retrieval.find_overlapping_cds(
                self.tr, [plus_orf, minus_orf])

        self.assertEqual(len(retained), 2,
                         self.tr.json_conf["pick"]["orf_loading"])
        self.assertEqual(retained, [plus_orf, minus_orf],
                         [orf.name for orf in retained])

        retrieval.load_orfs(self.tr, [plus_orf, minus_orf])
        self.assertEqual(self.tr.number_internal_orfs, 1)
        self.assertEqual(self.tr.combined_cds_start, 201,
                         self.tr.combined_cds_start)
        self.assertEqual(self.tr.combined_cds_length, 90)

    def test_connect(self):
        """Connect the transcript to its database and build an inspector on the engine."""
        retrieval._connect_to_db(self.tr)
        inspector = reflection.Inspector.from_engine(self.tr.engine)
Esempio n. 4
0
class TestRetrieval(unittest.TestCase):
    """Check ORF overlap filtering, ORF loading and DB connection for one transcript."""

    def setUp(self):
        """Prepare a finalized single-exon transcript carrying the YAML configuration."""
        self.tr = Transcript()
        self.tr.chrom = "Chr1"
        self.tr.start = 101
        self.tr.end = 2000
        self.tr.strand = None
        self.tr.add_exons([(101, 2000)])
        self.tr.id = "test1"
        self.tr.parent = "gene1"
        self.tr.finalize()

        here = os.path.dirname(__file__)
        conf = to_json(os.path.join(here, "configuration.yaml"))

        # The configuration file is expected to enable BLAST-checked chimera splitting.
        split_conf = conf["pick"]["chimera_split"]
        self.assertTrue(split_conf["blast_check"])
        self.assertTrue(split_conf["execute"])
        self.assertEqual(split_conf["blast_params"]["leniency"], "LENIENT")

        conf["pick"]["orf_loading"]["minimal_secondary_orf_length"] = 50
        self.tr.json_conf = conf

    def test_load_pos_and_neg(self):
        """Two ORFs on opposite strands are both kept, but only one internal ORF results."""
        forward = BED12(transcriptomic=True)
        forward.chrom = self.tr.id
        forward.start = 0
        forward.end = self.tr.cdna_length - 1
        forward.strand = "+"
        forward.name = "first"
        forward.thick_start = 101
        forward.thick_end = 190
        self.assertFalse(forward.invalid)

        reverse = forward.copy()
        reverse.strand = "-"
        reverse.thick_start = 1
        reverse.thick_end = 87
        reverse.name = "second"
        self.assertFalse(reverse.invalid)

        with self.assertLogs("null", "DEBUG"):
            survivors = retrieval.find_overlapping_cds(self.tr,
                                                       [forward, reverse])

        orf_loading_conf = self.tr.json_conf["pick"]["orf_loading"]
        self.assertEqual(len(survivors), 2, orf_loading_conf)
        survivor_names = [orf.name for orf in survivors]
        self.assertEqual(survivors, [forward, reverse], survivor_names)

        retrieval.load_orfs(self.tr, [forward, reverse])
        self.assertEqual(self.tr.number_internal_orfs, 1)
        self.assertEqual(self.tr.combined_cds_start,
                         201,
                         self.tr.combined_cds_start)
        self.assertEqual(self.tr.combined_cds_length, 90)

    def test_connect(self):
        """Open the DB connection for the transcript and inspect the resulting engine."""
        retrieval._connect_to_db(self.tr)
        engine_inspector = reflection.Inspector.from_engine(self.tr.engine)
Esempio n. 5
0
    def setUp(self):
        """Create a single-exon coding transcript and a transcriptomic ORF matching its CDS."""
        transcript_start, transcript_end = 5928, 8737
        cds_start, cds_end = 8571, 8666

        transcript = self.tr = Transcript()
        transcript.logger = self.logger
        transcript.chrom = "Chr1"
        transcript.source = "TAIR10"
        transcript.feature = "mRNA"
        transcript.start = transcript_start
        transcript.end = transcript_end
        transcript.strand = "+"
        transcript.add_exon((transcript_start, transcript_end))
        transcript.score = None
        transcript.id = "AT1G01020.1"
        transcript.parent = "AT1G01020"
        transcript.name = "AT1G01020.1"
        transcript.add_exon((cds_start, cds_end), "CDS")
        transcript.finalize()

        orf = self.orf = Mikado.parsers.bed12.BED12()
        orf.chrom = transcript.id
        orf.start = 1
        orf.end = transcript.cdna_length
        orf.name = transcript.id
        orf.strand = "+"
        orf.score = 0
        # CDS boundaries expressed relative to the transcript start (1-based).
        orf.thick_start = cds_start - transcript_start + 1
        orf.thick_end = cds_end - transcript_start + 1
        orf.block_count = 1
        # NOTE(review): "blockSize" is kept as in the original; it may be a
        # typo for "block_sizes" - confirm against the BED12 class.
        orf.blockSize = transcript.cdna_length
        orf.block_starts = 0
        orf.has_start_codon = True
        orf.has_stop_codon = True
        orf.transcriptomic = True
        self.assertFalse(orf.invalid, orf.invalid_reason)

        # The thick (coding) region must span a whole number of codons.
        coding_span = orf.thick_end - orf.thick_start + 1
        self.assertEqual(coding_span % 3, 0)
Esempio n. 6
0
    def setUp(self):
        """Build a finalized two-exon transcript on the plus strand of Chr1."""
        transcript = self.tr = Transcript()
        transcript.start = 101
        transcript.end = 1000
        transcript.chrom = "Chr1"
        transcript.strand = "+"
        transcript.id = "test1"

        exon_spans = [(101, 400), (701, 1000)]
        transcript.add_exons(exon_spans)
        transcript.finalize()
Esempio n. 7
0
    def setUp(self):
        """Build two transcripts from inline GFF3 text and load the JSON configuration.

        ``self.transcript1`` is a single-exon transcript with a CDS;
        ``self.transcript2`` has three exons. Along the way the method checks
        that a finalized transcript rejects new exons and that a ``Superlocus``
        cannot be created from a transcript lacking a configuration.
        ``self.my_json`` ends up holding the configuration parsed from
        ``configuration.yaml`` located next to this file.
        """

        # First transcript: one exon plus one CDS segment.
        gff_transcript1 = """Chr1\tfoo\ttranscript\t101\t300\t.\t+\t.\tID=t0
Chr1\tfoo\texon\t101\t300\t.\t+\t.\tID=t0:exon1;Parent=t0
Chr1\tfoo\tCDS\t101\t250\t.\t+\t.\tID=t0:exon1;Parent=t0""".split("\n")
        gff_transcript1 = [GFF.GffLine(x) for x in gff_transcript1]
        self.assertEqual(gff_transcript1[0].chrom, "Chr1", gff_transcript1[0])
        self.transcript1 = Transcript(gff_transcript1[0])
        for exon in gff_transcript1[1:]:
            self.transcript1.add_exon(exon)
        self.transcript1.finalize()
        self.assertTrue(self.transcript1.monoexonic)
        self.assertEqual(self.transcript1.chrom, gff_transcript1[0].chrom)

        # Second transcript: three exons spanning 101-600.
        gff_transcript2 = """Chr1\tfoo\ttranscript\t101\t600\t.\t+\t.\tID=t1
Chr1\tfoo\texon\t101\t200\t.\t+\t.\tID=t1:exon1;Parent=t1
Chr1\tfoo\texon\t301\t400\t.\t+\t.\tID=t1:exon2;Parent=t1
Chr1\tfoo\texon\t501\t600\t.\t+\t.\tID=t1:exon3;Parent=t1""".split("\n")
        gff_transcript2 = [GFF.GffLine(x) for x in gff_transcript2]
        self.transcript2 = Transcript(gff_transcript2[0], logger=self.logger)

        # Deliberately leave out the last exon, so that the exons do not
        # reach the declared end of the transcript.
        for exon in gff_transcript2[1:-1]:
            self.transcript2.add_exon(exon)
        # Finalizing a transcript whose exons do not define the external
        # boundaries must log at WARNING level (or above) on the "null" logger.
        with self.assertLogs("null", level="WARNING") as _:
            self.transcript2.finalize()
        # Once finalized, the transcript must refuse further exons.
        with self.assertRaises(exceptions.ModificationError):
            self.transcript2.add_exon(gff_transcript2[-1])

        # Re-open the transcript, restore its boundaries and add the missing
        # exon before finalizing it again.
        self.transcript2.finalized = False
        self.transcript2.start = 101
        self.transcript2.end = 600
        self.transcript2.add_exon(gff_transcript2[-1])
        self.transcript2.finalize()
        self.assertFalse(self.transcript2.monoexonic)
        # The first GFF line is the transcript itself, hence the "- 1".
        self.assertEqual(self.transcript2.exon_num, len(gff_transcript2) - 1)
        # Test that trying to modify a transcript after it has been finalized causes errors
        with self.assertRaises(exceptions.ModificationError):
            for exon in gff_transcript2[1:]:
                self.transcript2.add_exon(exon)
        # Test that creating a superlocus without configuration fails
        with self.assertRaises(exceptions.NoJsonConfigError):
            _ = Superlocus(self.transcript1)
        # self.my_json first holds the path, then the parsed configuration.
        self.my_json = os.path.join(os.path.dirname(__file__),
                                    "configuration.yaml")
        self.my_json = configurator.to_json(self.my_json)
        self.assertIn("scoring", self.my_json, self.my_json.keys())
Esempio n. 8
0
    def setUp(self):
        """Create the six-exon, plus-strand transcript fixture (left unfinalized)."""
        exon_spans = [
            (47631264, 47631416),
            (47704590, 47704669),
            (47762671, 47762742),
            (47893062, 47893093),
            (47895572, 47895655),
            (48051942, 48051999),
        ]

        fixture = self.transcript = Transcript()
        fixture.chrom = "15"
        fixture.source = "protein_coding"
        fixture.start = 47631264
        fixture.end = 48051999
        fixture.strand = "+"
        fixture.add_exons(exon_spans)
        fixture.id = "ENST00000560636"
        fixture.parent = "ENSG00000137872"
Esempio n. 9
0
 def test_failed_expansion(self):
     """Run ``pad_transcript`` on a candidate, using one template for both ends.

     The "template" and "candidate" transcripts are read from the packaged
     ``test_pick_pad/fail.bed12`` file, and the sequence comes from
     ``test_pick_pad/failing_seq.fa.gz``. No assertion follows the call in
     this block, so the test only requires that it completes without raising.
     """
     logger = create_default_logger("test_failed_expansion",
                                    level="WARNING")
     bed_path = pkg_resources.resource_filename(
         "Mikado.tests",
         os.path.join("test_pick_pad", "fail.bed12"))
     # Parse inside a context manager so that the BED12 file handle is
     # closed as soon as every line has been consumed.
     with open(bed_path) as bed_handle:
         raw = [
             Transcript(line, logger=logger)
             for line in Bed12Parser(bed_handle)
         ]
     transcripts = dict((_.id, _) for _ in raw)
     # Plain loop: finalize() is called for its side effect only.
     for transcript in transcripts.values():
         transcript.finalize()
     template = transcripts["template"]  # 4535908	4540293
     candidate = transcripts["candidate"]  # 4536444	4540027
     # Untouched copy of the candidate, handed to pad_transcript as backup.
     backup = candidate.copy()
     fai = pysam.FastaFile(
         pkg_resources.resource_filename(
             "Mikado.tests",
             os.path.join("test_pick_pad", "failing_seq.fa.gz")))
     # Raise verbosity only for the padding step itself.
     logger.setLevel("DEBUG")
     candidate.logger.setLevel("DEBUG")
     pad_transcript(candidate,
                    backup,
                    start_transcript=template,
                    end_transcript=template,
                    fai=fai,
                    logger=logger)
Esempio n. 10
0
    def setUp(self):
        """Set up a coding transcript with one exon and an ORF describing its CDS."""
        exon_span = (5928, 8737)
        cds_span = (8571, 8666)

        self.tr = Transcript()
        self.tr.logger = self.logger
        self.tr.chrom = "Chr1"
        self.tr.source = "TAIR10"
        self.tr.feature = "mRNA"
        self.tr.start = exon_span[0]
        self.tr.end = exon_span[1]
        self.tr.strand = "+"
        self.tr.add_exon(exon_span)
        self.tr.score = None
        self.tr.id = "AT1G01020.1"
        self.tr.parent = "AT1G01020"
        self.tr.name = "AT1G01020.1"
        self.tr.add_exon(cds_span, "CDS")
        self.tr.finalize()

        self.orf = Mikado.parsers.bed12.BED12()
        self.orf.chrom = self.tr.id
        self.orf.start = 1
        self.orf.end = self.tr.cdna_length
        self.orf.name = self.tr.id
        self.orf.strand = "+"
        self.orf.score = 0
        # Translate the genomic CDS boundaries into 1-based transcript coordinates.
        self.orf.thick_start = cds_span[0] - exon_span[0] + 1
        self.orf.thick_end = cds_span[1] - exon_span[0] + 1
        self.orf.block_count = 1
        # NOTE(review): attribute name "blockSize" reproduced as-is; possibly
        # meant to be "block_sizes" - verify against the BED12 class.
        self.orf.blockSize = self.tr.cdna_length
        self.orf.block_starts = 0
        self.orf.has_start_codon = True
        self.orf.has_stop_codon = True
        self.orf.transcriptomic = True
        self.assertFalse(self.orf.invalid, self.orf.invalid_reason)

        # The ORF length must be a multiple of three.
        orf_length = self.orf.thick_end - self.orf.thick_start + 1
        self.assertEqual(orf_length % 3, 0)
Esempio n. 11
0
    def test_regression(self):

        sequence = """TC
CTCACAGTTACTATAAGCTCGTCT
ATGGCCAGAGACGGTGGTGTTTCTTGTTTACGAA
GGTCGGAGATGATGAGCGTCGGTGGTATCGGAGGAATTGAATCTGCGCCGTTGGATTTAG
ATGAAGTTCATGTCTTAGCCGTTGATGACAGTCTCGTTGATCGTATTGTCATCGAGAGAT
TGCTTCGTATTACTTCCTGCAAAGTTACGGCGGTAGATAGTGGATGGCGTGCTCTGGAAT
TTCTAGGGTTAGATAATGAGAAAGCTTCTGCTGAATTCGATAGATTGAAAGTTGATTTGA
TCATCACTGATTACTGTATGCCTGGAATGACTGGTTATGAGCTTCTCAAGAAGATTAAGG
AATCGTCCAATTTCAGAGAAGTTCCGGTTGTAATCATGTCGTCGGAGAATGTATTGACCA
GAATCGACAGATGCCTTGAGGAAGGTGCTCAAGATTTCTTATTGAAACCGGTGAAACTCG
CCGACGTGAAACGTCTGAGAAGTCATTTAACTAAAGACGTTAAACTTTCCAACGGAAACA
AACGGAAGCTTCCGGAAGATTCTAGTTCCGTTAACTCTTCGCTTCCTCCACCGTCACCTC
CGTTGACTATCTCGCCTGA"""

        record = SeqRecord.SeqRecord(Seq.Seq(sub("\n", "", sequence)),
                                     id="class_Chr1.1006.0")
        index = {record.id: record}

        line = "\t".join([
            'class_Chr1.1006.0', '0', '619',
            'ID=class_Chr1.1006.0|m.22308;class_Chr1.1006.0|g.22308;ORF_class_Chr1.1006.0|g.22308_class_Chr1.1006.0|m.22308_type:internal_len:206_(+)',
            '0', '+', '2', '617', '0', '1', '619', '0'
        ])

        # Now we are going back to find the start codon
        bed_line = bed12.BED12(line,
                               transcriptomic=True,
                               fasta_index=index,
                               max_regression=0.2)
        self.assertFalse(bed_line.invalid, bed_line.invalid_reason)
        self.assertEqual(bed_line.phase, 0)
        # Start codon in frame found at location 27
        self.assertEqual(bed_line.thick_start, 27)
        self.assertTrue(bed_line.has_start_codon)
        self.assertFalse(bed_line.has_stop_codon)

        lines = """Chr1	CLASS	transcript	3442811	3443785	1000	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0"; exon_number "1"; Abundance "22.601495"; canonical_proportion "1.0";
Chr1	CLASS	exon	3442811	3442999	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443099	3443169	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443252	3443329	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443417	3443493	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443582	3443785	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";"""

        lines = [GTF.GtfLine(_) for _ in lines.split("\n") if _]

        transcript = Transcript(lines[0])
        transcript.add_exons(lines[1:])
        transcript.finalize()
        transcript.load_orfs([bed_line])
        self.assertTrue(transcript.is_coding)
        self.assertTrue(transcript.has_start_codon)
        self.assertFalse(transcript.has_stop_codon)
        self.assertEqual(transcript.selected_cds_end, transcript.start)
        self.assertEqual(transcript.selected_cds_start, transcript.end - 26)
Esempio n. 12
0
    def test_regression(self):

        sequence = """TC
CTCACAGTTACTATAAGCTCGTCT
ATGGCCAGAGACGGTGGTGTTTCTTGTTTACGAA
GGTCGGAGATGATGAGCGTCGGTGGTATCGGAGGAATTGAATCTGCGCCGTTGGATTTAG
ATGAAGTTCATGTCTTAGCCGTTGATGACAGTCTCGTTGATCGTATTGTCATCGAGAGAT
TGCTTCGTATTACTTCCTGCAAAGTTACGGCGGTAGATAGTGGATGGCGTGCTCTGGAAT
TTCTAGGGTTAGATAATGAGAAAGCTTCTGCTGAATTCGATAGATTGAAAGTTGATTTGA
TCATCACTGATTACTGTATGCCTGGAATGACTGGTTATGAGCTTCTCAAGAAGATTAAGG
AATCGTCCAATTTCAGAGAAGTTCCGGTTGTAATCATGTCGTCGGAGAATGTATTGACCA
GAATCGACAGATGCCTTGAGGAAGGTGCTCAAGATTTCTTATTGAAACCGGTGAAACTCG
CCGACGTGAAACGTCTGAGAAGTCATTTAACTAAAGACGTTAAACTTTCCAACGGAAACA
AACGGAAGCTTCCGGAAGATTCTAGTTCCGTTAACTCTTCGCTTCCTCCACCGTCACCTC
CGTTGACTATCTCGCCTGA"""

        record = SeqRecord.SeqRecord(Seq.Seq(sub("\n", "", sequence)), id="class_Chr1.1006.0")
        index = {record.id: record}

        line = "\t".join(
            ['class_Chr1.1006.0',
             '0',
             '619',
             'ID=class_Chr1.1006.0|m.22308;class_Chr1.1006.0|g.22308;ORF_class_Chr1.1006.0|g.22308_class_Chr1.1006.0|m.22308_type:internal_len:206_(+)',
             '0',
             '+',
             '2',
             '617',
             '0',
             '1',
             '619',
             '0'])

        # Now we are going back to find the start codon
        bed_line = bed12.BED12(line, transcriptomic=True, fasta_index=index, max_regression=0.2)
        self.assertFalse(bed_line.invalid, bed_line.invalid_reason)
        self.assertEqual(bed_line.phase, 0)
        # Start codon in frame found at location 27
        self.assertEqual(bed_line.thick_start, 27)
        self.assertTrue(bed_line.has_start_codon)
        self.assertFalse(bed_line.has_stop_codon)

        lines = """Chr1	CLASS	transcript	3442811	3443785	1000	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0"; exon_number "1"; Abundance "22.601495"; canonical_proportion "1.0";
Chr1	CLASS	exon	3442811	3442999	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443099	3443169	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443252	3443329	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443417	3443493	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443582	3443785	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";"""

        lines = [GTF.GtfLine(_) for _ in lines.split("\n") if _]

        transcript = Transcript(lines[0])
        transcript.add_exons(lines[1:])
        transcript.finalize()
        transcript.load_orfs([bed_line])
        self.assertTrue(transcript.is_coding)
        self.assertTrue(transcript.has_start_codon)
        self.assertFalse(transcript.has_stop_codon)
        self.assertEqual(transcript.selected_cds_end, transcript.start)
        self.assertEqual(transcript.selected_cds_start, transcript.end - 26)
Esempio n. 13
0
    def setUp(self):
        """Create an empty configuration and a finalized four-exon coding transcript."""
        self.conf = {}

        exon_spans = [(101, 500), (601, 700), (1001, 1300), (1401, 1500)]
        cds_spans = [(401, 500), (601, 700), (1001, 1300), (1401, 1440)]

        transcript = self.t1 = Transcript()
        transcript.chrom = "Chr1"
        transcript.strand = "+"
        transcript.score = 20
        transcript.id = "G1.1"
        transcript.parent = "G1"
        transcript.start = 101
        transcript.end = 1500

        transcript.add_exons(exon_spans, "exon")
        transcript.add_exons(cds_spans, "CDS")
        transcript.finalize()
Esempio n. 14
0
    def setUp(self):
        """Prepare the alternative-splicing configuration, a coding transcript and its locus."""
        # Keys are listed in the order in which they were originally inserted.
        alternative_splicing = {
            "max_utr_length": 10000,
            "max_fiveutr_length": 10000,
            "max_threeutr_length": 10000,
            "valid_ccodes": ["j", "J", "O", "mo"],
            "redundant_ccodes": ["c", "=", "_", "m"],
            "only_confirmed_introns": False,
            "min_score_perc": 0.5,
            "keep_retained_introns": True,
            "min_cdna_overlap": 0.2,
            "min_cds_overlap": 0.2,
            "max_isoforms": 3,
        }
        self.conf = {"pick": {"alternative_splicing": alternative_splicing}}

        exon_spans = [(101, 500), (601, 700), (1001, 1300), (1401, 1500)]
        cds_spans = [(401, 500), (601, 700), (1001, 1300), (1401, 1440)]

        transcript = self.t1 = Transcript()
        transcript.chrom = "Chr1"
        transcript.strand = "+"
        transcript.score = 20
        transcript.id = "G1.1"
        transcript.parent = "G1"
        transcript.start = 101
        transcript.end = 1500
        transcript.add_exons(exon_spans, "exon")
        transcript.add_exons(cds_spans, "CDS")
        transcript.finalize()

        # The locus is seeded with the transcript, then given logger and configuration.
        locus = self.locus = Locus(transcript)
        locus.logger = self.logger
        locus.json_conf = self.conf
Esempio n. 15
0
    def setUp(self):
        """Create a twelve-exon transcript spanning 101-10000 (left unfinalized)."""
        exon_spans = [
            (101, 300), (501, 800), (1001, 1200), (1301, 2000),
            (3501, 5000), (5501, 6000), (6201, 7000), (7301, 7700),
            (8201, 9000), (9101, 9300), (9501, 9700), (9801, 10000),
        ]

        transcript = self.tr = Transcript()
        transcript.logger = self.logger
        transcript.start = 101
        transcript.end = 10000
        transcript.add_exons(exon_spans)
        transcript.id = "test1"
        transcript.parent = "test1.gene"
Esempio n. 16
0
    def setUp(self):
        """Build a finalized single-exon transcript and attach the YAML configuration to it."""
        transcript = self.tr = Transcript()
        transcript.chrom = "Chr1"
        transcript.start = 101
        transcript.end = 2000
        transcript.strand = None
        transcript.add_exons([(101, 2000)])
        transcript.id = "test1"
        transcript.parent = "gene1"
        transcript.finalize()

        config_file = os.path.join(os.path.dirname(__file__),
                                   "configuration.yaml")
        conf = to_json(config_file)

        # Guard against a configuration file that no longer matches the tests' assumptions.
        chimera_split = conf["pick"]["chimera_split"]
        self.assertTrue(chimera_split["blast_check"])
        self.assertTrue(chimera_split["execute"])
        self.assertEqual(chimera_split["blast_params"]["leniency"],
                         "LENIENT")

        conf["pick"]["orf_loading"]["minimal_secondary_orf_length"] = 50

        transcript.json_conf = conf
Esempio n. 17
0
    def setUp(self):

        lines = """Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	mRNA	40282	46004	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960;Name=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2;aed=0.0;note=TRIAE_CS42_5DL_TGACv1_434051_AA1427960;confidence=High;has_start=True;has_stop=True;original_stop=True;protein_rank=P1;transcript_rank=T2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	40282	40933	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	three_prime_UTR	40282	40720	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.three_prime_UTR1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	40721	40933	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	41018	41111	.	-	1	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	41018	41111	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	41227	41468	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS3;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	41227	41468	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon3;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	41673	41831	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS4;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	41673	41831	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon4;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	41946	42820	.	-	2	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS5;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	41946	42820	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon5;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	42905	42913	.	-	2	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS6;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	42905	42913	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon6;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	45373	45496	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS7;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	45373	45496	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon7;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	45600	45651	.	-	1	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS8;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	45600	45651	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon8;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	45726	45726	.	-	2	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS9;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	45726	45726	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon9;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	45875	45893	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS10;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	45875	46004	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon10;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	five_prime_UTR	45894	46004	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.five_prime_UTR1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2"""

        lines = [GffLine("\t".join(_.split())) for _ in lines.split("\n") if _]
        self.transcript = Transcript(lines[0], logger=self.logger)
        self.transcript.add_exons(lines[1:])
        self.correct_phases = {(40721, 40933): 2,
                               (41018, 41111): 0,
                               (41227, 41468): 2,
                               (41673, 41831): 2,
                               (41946, 42820): 1,
                               (42905, 42913): 1,
                               (45373, 45496): 2,
                               (45600, 45651): 0,
                               (45726, 45726): 2,
                               (45875, 45893): 0}
Esempio n. 18
0
    def test_casePositive(self):
        """Convert a plus-strand coding transcript to BED12 and check every field."""
        exon_spans = [(101, 300), (401, 600), (801, 1200), (2501, 3000)]
        # CDS segment lengths: 180 + 400 + 200 = 780, a multiple of three.
        cds_spans = [(421, 600), (801, 1200), (2501, 2700)]

        transcript = Transcript()
        transcript.chrom = "Chr1"
        transcript.start = 101
        transcript.end = 3000
        transcript.strand = "+"
        transcript.id = "test1"
        transcript.add_exons(exon_spans)
        transcript.add_exons(cds_spans, features="CDS")

        # Finalization is expected to log at DEBUG level (or above) on the "null" logger.
        with self.assertLogs("null", "DEBUG"):
            transcript.finalize()
        self.assertTrue(transcript.is_coding)

        bed = transcript.as_bed12()
        self.assertEqual(bed.thick_start, transcript.combined_cds_start)
        self.assertEqual(bed.thick_end, transcript.combined_cds_end)
        self.assertEqual(len(bed.block_sizes), transcript.exon_num)
        self.assertEqual(bed.block_sizes, [200, 200, 400, 500], bed.block_sizes)
        self.assertEqual(bed.strand, "+")
        self.assertEqual(bed.block_starts, [0, 300, 700, 2400], bed.block_starts)

        # Compare the rendered line with its twelve expected tab-separated columns.
        rendered = str(bed)
        expected_columns = [
            "Chr1", 100, 3000, transcript.id, 0, transcript.strand,
            bed.thick_start - 1, bed.thick_end,
            0, 4,
            ",".join(str(size) for size in [200, 200, 400, 500]),
            ",".join(str(offset) for offset in [0, 300, 700, 2400]),
        ]
        expected_line = "\t".join(str(column) for column in expected_columns)
        self.assertEqual(rendered, expected_line)
Esempio n. 19
0
    def test_creation_from_transcript(self):
        """A gene built from one transcript contains it and mirrors its coordinates."""
        transcript = Transcript()
        transcript.chrom = "Chr1"
        transcript.start = 100
        transcript.end = 200
        transcript.strand = "+"
        transcript.id = "test"
        transcript.parent = "parent"

        gene = Gene(transcript)

        # Membership works both by transcript ID and by transcript object.
        self.assertIn(transcript.id, gene, gene.keys())
        self.assertIn(transcript, gene, gene.keys())

        for attribute in ("chrom", "start", "end", "strand"):
            self.assertEqual(getattr(transcript, attribute),
                             getattr(gene, attribute))

        # Look-up by ID must return the very same object, not a copy.
        self.assertIs(transcript, gene[transcript.id])
Esempio n. 20
0
class MetricEntry:
    """
    Basic class that defines the metrics loaded from the Mikado file.

    One instance wraps one row of the metrics table. Every metric listed in
    ``metrics`` is converted to a float and stored as an attribute with the
    same name as the metric.
    """

    # Metrics exposed as features: everything the Transcript class reports,
    # minus the identifier/score columns listed here.
    metrics = [
        _ for _ in Transcript.get_available_metrics() if _ not in
        ["tid", "parent", "score", "best_bits", "snowy_blast_score"]
    ]

    def __init__(self, row):
        """
        Parse one row of the metrics table.

        :param row: mapping from column name to its string value; it must
            contain "tid", "parent" and every key listed in ``metrics``.
            The mapping is only read, never modified.
        :raises ValueError: if a metric value is neither "true"/"false"
            (case-insensitive) nor parseable as a float.
        """

        self.__tid = row["tid"]
        self.__locus = row["parent"]

        for key in self.metrics:
            raw_value = row[key]
            lowered = raw_value.lower()
            # Booleans are encoded as 1.0 / 0.0 so that every feature is numeric.
            if lowered == "true":
                value = 1.0
            elif lowered == "false":
                value = 0.0
            else:
                try:
                    value = float(raw_value)
                except ValueError as exc:
                    # Chain explicitly so the original parsing error stays
                    # attached as the cause of the more descriptive one.
                    raise ValueError(
                        "Invalid value for key {0}: {1}.\n{2}".format(
                            key, raw_value, exc)) from exc
            setattr(self, key, value)

    @property
    def matrix_row(self):
        """Return the metric values as a list, ordered like ``metrics``."""
        return [getattr(self, key) for key in self.metrics]

    @property
    def features(self):
        """Return the list of metric names (the class-level ``metrics`` list itself)."""
        return self.metrics

    @property
    def tid(self):
        """Return the value of the "tid" column of the row."""
        return self.__tid

    @property
    def locus(self):
        """Return the value of the "parent" column of the row."""
        return self.__locus
Esempio n. 21
0
    def test_creation_from_transcript(self):
        """Build a Gene from one transcript and verify membership, location and identity."""

        founder = Transcript()
        founder.chrom = "Chr1"
        founder.start, founder.end = 100, 200
        founder.strand = "+"
        founder.id = "test"
        founder.parent = "parent"

        gene = Gene(founder)

        # The gene must recognise the transcript by ID as well as by object.
        self.assertIn(founder.id, gene, gene.keys())
        self.assertIn(founder, gene, gene.keys())

        # Location attributes are compared one by one against the founder.
        for name in ("chrom", "start", "end", "strand"):
            expected = getattr(founder, name)
            self.assertEqual(expected, getattr(gene, name))

        # Item access by ID returns the original transcript object.
        self.assertIs(founder, gene[founder.id])
Esempio n. 22
0
    def setUp(self):
        """Create the six-exon, plus-strand transcript used as the test fixture."""

        exon_coordinates = [
            (47631264, 47631416),
            (47704590, 47704669),
            (47762671, 47762742),
            (47893062, 47893093),
            (47895572, 47895655),
            (48051942, 48051999),
        ]

        transcript = self.transcript = Transcript()
        transcript.chrom = "15"
        transcript.source = "protein_coding"
        transcript.start = 47631264
        transcript.end = 48051999

        transcript.strand = "+"
        transcript.add_exons(exon_coordinates)
        transcript.id = "ENST00000560636"
        transcript.parent = "ENSG00000137872"
Esempio n. 23
0
    def setUp(self):
        """Create a finalized single-exon transcript without strand and attach the test configuration."""

        transcript = self.tr = Transcript()
        transcript.chrom = "Chr1"
        transcript.start, transcript.end = 101, 2000
        transcript.strand = None
        transcript.add_exons([(101, 2000)])
        transcript.id, transcript.parent = "test1", "gene1"
        transcript.finalize()

        # The configuration file lives next to this test module.
        config_path = os.path.join(os.path.dirname(__file__),
                                   "configuration.yaml")
        conf = to_json(config_path)

        # Sanity checks: the tests relying on this fixture need chimera
        # splitting enabled, with BLAST checks at the LENIENT level.
        chimera_split = conf["pick"]["chimera_split"]
        self.assertTrue(chimera_split["blast_check"])
        self.assertTrue(chimera_split["execute"])
        self.assertEqual(chimera_split["blast_params"]["leniency"], "LENIENT")

        conf["pick"]["orf_loading"]["minimal_secondary_orf_length"] = 50

        transcript.json_conf = conf
Esempio n. 24
0
    def test_locus(self):
        """Basic testing of the Locus functionality."""

        logger = create_null_logger("null")
        logger.setLevel("WARNING")
        logger.info("Started")
        slocus = Superlocus(self.transcript1,
                            json_conf=self.my_json,
                            logger=logger)
        slocus.add_transcript_to_locus(self.transcript2)
        self.assertEqual(slocus.strand, self.transcript1.strand)
        self.assertEqual(slocus.start,
                         min(self.transcript1.start, self.transcript2.start))
        self.assertEqual(slocus.end,
                         max(self.transcript1.end, self.transcript2.end))
        logger.info(slocus.transcripts)
        slocus.define_subloci()
        logger.info(slocus.subloci)
        logger.info(slocus.transcripts)
        self.assertEqual(len(slocus.transcripts), 2)
        self.assertEqual(len(slocus.subloci), 2)
        slocus.define_monosubloci()
        self.assertEqual(len(slocus.monosubloci), 2)
        slocus.define_loci()
        self.assertEqual(len(slocus.loci), 1)
        self.assertEqual(
            list(slocus.loci[list(
                slocus.loci.keys())[0]].transcripts.keys())[0], "t0")
        gff_transcript3 = """Chr1\tfoo\ttranscript\t101\t200\t.\t-\t.\tID=tminus0
Chr1\tfoo\texon\t101\t200\t.\t-\t.\tID=tminus0:exon1;Parent=tminus0""".split(
            "\n")
        gff_transcript3 = [GFF.GffLine(x) for x in gff_transcript3]
        transcript3 = Transcript(gff_transcript3[0])
        for exon in gff_transcript3[1:]:
            transcript3.add_exon(exon)
        transcript3.finalize()
        minusuperlocus = Superlocus(transcript3, json_conf=self.my_json)
        minusuperlocus.define_loci()
        self.assertEqual(len(minusuperlocus.loci), 1)
        self.assertTrue(transcript3.strand != self.transcript1.strand)
Esempio n. 25
0
    def test_casePositive(self):
        """Check the BED12 representation of a plus-strand, four-exon coding transcript."""

        transcript = Transcript()
        transcript.chrom = "Chr1"
        transcript.start, transcript.end = 101, 3000
        transcript.strand = "+"
        transcript.id = "test1"
        transcript.add_exons([(101, 300), (401, 600), (801, 1200), (2501, 3000)])

        # CDS segments of 180 + 400 + 200 = 780 bp, i.e. a multiple of three.
        coding_segments = [(421, 600), (801, 1200), (2501, 2700)]
        transcript.add_exons(coding_segments, features="CDS")

        with self.assertLogs("null", "DEBUG"):
            transcript.finalize()
        self.assertTrue(transcript.is_coding)

        bed = transcript.as_bed12()
        expected_sizes = [200, 200, 400, 500]
        expected_starts = [0, 300, 700, 2400]

        self.assertEqual(bed.thick_start, transcript.combined_cds_start)
        self.assertEqual(bed.thick_end, transcript.combined_cds_end)
        self.assertEqual(len(bed.block_sizes), transcript.exon_num)
        self.assertEqual(bed.block_sizes, expected_sizes, bed.block_sizes)
        self.assertEqual(bed.strand, "+")
        self.assertEqual(bed.block_starts, expected_starts, bed.block_starts)

        # The textual form is compared field by field with the expected line;
        # the expected start (100) and thick start are one less than the
        # 1-based values set on the transcript.
        rendered = str(bed)
        expected_fields = [
            "Chr1", 100, 3000, transcript.id, 0, transcript.strand,
            bed.thick_start - 1, bed.thick_end,
            0, 4,
            ",".join(map(str, expected_sizes)),
            ",".join(map(str, expected_starts)),
        ]
        self.assertEqual(rendered, "\t".join(map(str, expected_fields)))
Esempio n. 26
0
 def setUp(self):
     """Load the reference transcript for the tests from a single BED12 line."""
     bed_line = "Chr5\t26574999\t26578625\tID=AT5G66600.3;coding=True;phase=0\t0\t-\t26575104\t26578315\t0\t11\t411,126,87,60,100,809,126,72,82,188,107\t0,495,711,885,1035,1261,2163,2378,2856,3239,3519"
     parsed = BED12(bed_line)
     # The model is flagged as a reference transcript coming from TAIR10.
     self.reference = Transcript(parsed, source="TAIR10", is_reference=True)
Esempio n. 27
0
    def test_coding_negative(self):
        tr = Transcript()
        tr.chrom = "Chr1"
        tr.start = 101
        tr.end = 2000
        tr.strand = "-"
        tr.add_exons([(101, 300),
                      (1701, 2000)])
        tr.add_exons([(101, 300),
                      (1701, 2000)], features="CDS")
        tr.id = "test1"
        tr.parent = "gene1"

        # Phase 0, 0 because the first CDS exon is 300bp
        gff = tr.format("gff", with_introns=True)
        self.maxDiff = None
        res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1;Name=test1
Chr1\tMikado\tCDS\t101\t300\t.\t-\t0\tID=test1.CDS1;Parent=test1
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1
Chr1\tMikado\tCDS\t1701\t2000\t.\t-\t0\tID=test1.CDS2;Parent=test1
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1"""
        self.assertEqual(gff,
                         res,
                         "++++\n\n"+"\n+++\n".join([gff, res,
                                                    ",\t".join([str(_) for _ in tr.internal_orfs])
                                                    ]
                                                   )
                         )

        gtf = tr.format("gtf", with_introns=True)
        res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Name "test1";
Chr1\tMikado\tCDS\t101\t300\t.\t-\t0\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tCDS\t1701\t2000\t.\t-\t0\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";"""
        self.assertEqual(gtf, res,
                         "++++\n\n" + "\n+++\n".join([gtf, res]))
Esempio n. 28
0
    def test_non_coding_negative(self):
        tr = Transcript()
        tr.chrom = "Chr1"
        tr.start = 101
        tr.end = 2000
        tr.strand = "-"
        tr.add_exons([(101, 300), (1701, 2000)])
        tr.id = "test1"
        tr.parent = "gene1"
        tr.finalize()

        gff = tr.format("gff", with_introns=True)
        self.maxDiff = None
        res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1"""
        self.assertEqual(gff, res, "++++\n\n" + "\n+++\n".join([gff, res]))

        gtf = tr.format("gtf", with_introns=True)
        res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";"""
        self.assertEqual(gtf, res, "++++\n\n" + "\n+++\n".join([gtf, res]))
Esempio n. 29
0
    def test_coding_negative(self):
        tr = Transcript()
        tr.chrom = "Chr1"
        tr.start = 101
        tr.end = 2000
        tr.strand = "-"
        tr.add_exons([(101, 300), (1701, 2000)])
        tr.add_exons([(101, 300), (1701, 2000)], features="CDS")
        tr.id = "test1"
        tr.parent = "gene1"

        # Phase 0, 0 because the first CDS exon is 300bp
        gff = tr.format("gff", with_introns=True)
        self.maxDiff = None
        res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1;Name=test1
Chr1\tMikado\tCDS\t101\t300\t.\t-\t0\tID=test1.CDS1;Parent=test1
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1
Chr1\tMikado\tCDS\t1701\t2000\t.\t-\t0\tID=test1.CDS2;Parent=test1
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1"""
        self.assertEqual(
            gff, res, "++++\n\n" + "\n+++\n".join(
                [gff, res, ",\t".join([str(_) for _ in tr.internal_orfs])]))

        gtf = tr.format("gtf", with_introns=True)
        res = """Chr1\tMikado\tmRNA\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1"; Name "test1";
Chr1\tMikado\tCDS\t101\t300\t.\t-\t0\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tCDS\t1701\t2000\t.\t-\t0\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";"""
        self.assertEqual(gtf, res, "++++\n\n" + "\n+++\n".join([gtf, res]))
Esempio n. 30
0
    def test_add_two_partials(self):
        """Add two partial models to a reference-only locus and finalize its alternative splicing.

        After finalization neither fragment may remain in the locus, and the
        end of the reference transcript must equal the end of the second fragment.
        """

        logger = create_null_logger("test_add_two_partials")
        logger.setLevel("INFO")
        configuration = load_and_validate_config(None)
        configuration.reference.genome = self.fai
        configuration.pick.alternative_splicing.only_confirmed_introns = False
        configuration.pick.run_options.only_reference_update = True

        # Reference model AT5G66670.2 (minus strand): one exon,
        # 26611258-26612889, with its CDS at 26611474-26612700.
        reference = Transcript(is_reference=True)
        reference.chrom, reference.strand, reference.id = "Chr5", "-", "AT5G66670.2"
        reference.add_exons([(26611258, 26612889)])
        reference.add_exons([(26611474, 26612700)], features=["CDS"])
        reference.finalize()
        self.assertTrue(reference.is_coding)

        def build_fragment(suffix, exons, cds):
            """Create a finalized, non-reference coding model on the reference's chromosome and strand."""
            fragment = Transcript(is_reference=False)
            fragment.chrom, fragment.strand, fragment.id = (
                reference.chrom, reference.strand, reference.id + suffix)
            fragment.add_exons(exons)
            fragment.add_exons(cds, features=["CDS"])
            fragment.finalize()
            self.assertTrue(fragment.is_coding)
            return fragment

        # The first fragment has an extra exon before the reference start, the
        # second one an extra exon after the reference end.
        template1 = build_fragment(
            "_frag1",
            ((26611116, 26611157), (26611258, 26612670)),
            ((26611474, 26612670), ))
        template2 = build_fragment(
            "_frag2",
            ((26611574, 26612889), (26613007, 26613403)),
            ((26611574, 26612700), ))

        logger.setLevel("INFO")
        configuration.pick.alternative_splicing.pad = True
        locus = Locus(reference, configuration=configuration, logger=logger)
        locus.add_transcript_to_locus(template1)
        locus.add_transcript_to_locus(template2)
        # Only the second fragment is checked for membership at this stage.
        self.assertIn(template2.id, locus)

        locus.finalize_alternative_splicing(check_requirements=False)
        self.assertTrue(locus._finalized)
        for fragment in (template1, template2):
            self.assertNotIn(fragment.id, locus, "\n" + str(locus))

        final_reference = locus[reference.id]
        # On failure, report every end and start coordinate involved.
        coordinates = (
            (final_reference.end, reference.end, template2.end, template1.end),
            (locus[reference.id].start, reference.start, template2.start,
             template1.start))
        self.assertEqual(final_reference.end, template2.end, coordinates)
Esempio n. 31
0
class WrongLoadedOrf(unittest.TestCase):
    """Tests on loading ORFs that are invalid or unsuitable for a transcript."""

    def setUp(self):
        """Create the finalized two-exon, plus-strand transcript shared by the tests."""

        transcript = self.tr = Transcript()
        transcript.start = 101
        transcript.end = 1000
        transcript.chrom = "Chr1"
        transcript.strand = "+"
        transcript.id = "test1"
        transcript.add_exons([(101, 400), (701, 1000)])
        transcript.finalize()

    def test_load_invalid_length(self):
        """An ORF ending 10 bases past the cDNA length must log a "Wrong ORF" warning."""

        too_long = BED12(transcriptomic=True)
        too_long.chrom = self.tr.id
        self.assertTrue(too_long.transcriptomic)
        too_long.start = 0
        too_long.strand = "+"
        too_long.end = self.tr.cdna_length + 10
        too_long.thick_start = 101
        too_long.thick_end = 190
        # No name is set on the ORF; its id is expected to match the chromosome.
        self.assertEqual(too_long.chrom, too_long.id, too_long.id)

        with self.assertLogs("null", "WARNING") as captured:
            retrieval.load_orfs(self.tr, [too_long])

        expected_fragment = "Wrong ORF for {}:".format(self.tr.id)
        warned = any(expected_fragment in record for record in captured.output)
        self.assertTrue(warned, captured.output)

    def test_load_invalid_multiple(self):
        """When a valid and an invalid ORF are loaded together, one internal ORF results."""

        good_orf = BED12(transcriptomic=True)
        good_orf.chrom = self.tr.id
        good_orf.name = "valid"
        good_orf.start = 0
        good_orf.end = self.tr.cdna_length - 1
        good_orf.strand = "+"
        good_orf.thick_start = 101
        good_orf.thick_end = 190

        # Same record with different thick coordinates; the assertion below
        # checks that this copy is flagged as invalid.
        bad_orf = good_orf.copy()
        bad_orf.name = "invalid"
        bad_orf.thick_start = 1
        bad_orf.thick_end = 89
        bad_orf.phase = 0

        self.assertTrue(bad_orf.invalid)
        self.assertFalse(good_orf.invalid, good_orf.invalid_reason)

        with self.assertLogs("null", "DEBUG"):
            retrieval.load_orfs(self.tr, [good_orf, bad_orf])

        self.assertEqual(self.tr.number_internal_orfs, 1)

    def test_filter_non_transcriptomic(self):
        """find_overlapping_cds must return only the transcriptomic ORF of the pair."""

        kept = BED12(transcriptomic=True)
        kept.chrom = self.tr.id
        kept.name = "valid"
        kept.start = 0
        kept.end = self.tr.cdna_length - 1
        kept.strand = "+"
        kept.thick_start = 101
        kept.thick_end = 190

        # Identical copy, except that it is flagged as non-transcriptomic.
        discarded = kept.copy()
        discarded.name = "non-transcriptomic"
        discarded.transcriptomic = False

        candidates = [discarded, kept]
        retained = retrieval.find_overlapping_cds(self.tr, candidates)
        self.assertEqual(retained, [kept])
Esempio n. 32
0
class TranscriptTester(unittest.TestCase):
    tr_gff = """Chr1    TAIR10    mRNA    5928    8737    .    .    .    ID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1;Index=1
Chr1    TAIR10    exon    5928    8737    .    .    .    Parent=AT1G01020.1"""

    tr_lines = tr_gff.split("\n")
    for pos, line in enumerate(tr_lines):
        tr_lines[pos] = re.sub("\s+", "\t", line)
        assert len(tr_lines[pos].split("\t")) == 9, line.split("\t")

    tr_gff_lines = [Mikado.parsers.GFF.GffLine(line) for line in tr_lines]

    for l in tr_gff_lines:
        assert l.header is False
    #         print(l)

    logger = create_null_logger("null")

    def setUp(self):
        """Create the single-exon coding transcript and a matching transcriptomic ORF."""

        transcript = self.tr = Transcript()
        transcript.logger = self.logger
        transcript.chrom = "Chr1"
        transcript.source = "TAIR10"
        transcript.feature = "mRNA"
        transcript.start, transcript.end = 5928, 8737
        transcript.strand = "+"
        transcript.add_exon((5928, 8737))
        transcript.score = None
        transcript.id = "AT1G01020.1"
        transcript.parent = "AT1G01020"
        transcript.name = "AT1G01020.1"
        transcript.add_exon((8571, 8666), "CDS")
        transcript.finalize()

        # ORF on transcript coordinates: the thick interval is the genomic CDS
        # (8571-8666) shifted so that the transcript start (5928) maps to 1.
        orf = self.orf = Mikado.parsers.bed12.BED12()
        orf.chrom = transcript.id
        orf.start = 1
        orf.end = transcript.cdna_length
        orf.name = transcript.id
        orf.strand = "+"
        orf.score = 0
        orf.thick_start = 8571 - 5928 + 1
        orf.thick_end = 8666 - 5928 + 1
        orf.block_count = 1
        orf.blockSize = transcript.cdna_length
        orf.block_starts = 0
        orf.has_start_codon = True
        orf.has_stop_codon = True
        orf.transcriptomic = True

        self.assertFalse(orf.invalid, orf.invalid_reason)
        # The coding interval must be a whole number of codons.
        coding_length = orf.thick_end - orf.thick_start + 1
        self.assertEqual(coding_length % 3, 0)

    def test_invalid_inizialization(self):
        """Creating a Transcript from the exon line of the fixture must raise TypeError."""

        exon_line = self.tr_gff_lines[1]
        with self.assertRaises(TypeError):
            Mikado.loci.Transcript(exon_line)

    def test_basics(self):
        """Check chromosome, exon count, coordinates and exon list of the fixture transcript."""

        transcript = self.tr
        self.assertEqual(transcript.chrom, "Chr1")
        self.assertEqual(transcript.exon_num, 1)
        self.assertEqual(transcript.monoexonic, True)
        self.assertEqual(transcript.exon_num, len(transcript.exons))
        self.assertEqual(transcript.start, 5928)
        self.assertEqual(transcript.end, 8737)
        # The only exon spans the whole transcript.
        self.assertEqual(transcript.exons, [(5928, 8737)], transcript.exons)

    def test_cds(self):
        """Load the fixture ORF and check the resulting CDS.

        After loading, combined and selected CDS must coincide, span
        8571-8666, and both the start and the stop codon must be reported
        as present.
        """

        transcript = self.tr
        transcript.load_orfs([self.orf])
        self.assertEqual(transcript.combined_cds, transcript.selected_cds)

        self.assertEqual(transcript.combined_cds,
                         [(8571, 8666)],
                         transcript.combined_cds)
        self.assertEqual(transcript.selected_cds_start, 8571)
        self.assertEqual(transcript.selected_cds_end, 8666)
        self.assertEqual(transcript.has_start_codon, True)
        self.assertEqual(transcript.has_stop_codon, True)

    def test_equality(self):
        """A deep copy compares equal; changing its strand or adding an exon makes it differ."""

        clone = self.tr.deepcopy()
        self.assertTrue(clone == self.tr)

        # Removing the strand is enough to make the two transcripts differ.
        clone.strand = None
        self.assertFalse(clone == self.tr)

        clone.unfinalize()
        # The copy is about to gain a second exon, and a multiexonic
        # transcript must have a strand.
        clone.strand = "+"
        clone.end = 9737

        # The extra exon is derived from the exon line of the fixture.
        extra_exon = Mikado.parsers.GFF.GffLine(self.tr_lines[-1])
        extra_exon.strand = "+"
        extra_exon.start, extra_exon.end = 9000, 9737
        clone.add_exon(extra_exon)

        clone.finalize()
        self.assertTrue(clone != self.tr)

    def test_mono_finalising(self):
        """Rebuild the transcript from the GFF fixture, add a CDS and check both UTRs are non-empty."""

        records = self.tr_gff_lines
        mrna_records = [record for record in records if record.feature == "mRNA"]
        self.assertEqual(len(mrna_records), 1,
                         "\n".join(map(str, records)))

        rebuilt = Mikado.loci.Transcript(mrna_records[0])
        # Exon-like records are added, leaving out any UTR feature.
        exon_records = [
            record for record in records
            if record.is_exon is True and "UTR" not in record.feature.upper()
        ]
        rebuilt.add_exons(exon_records)
        rebuilt.add_exon((8571, 8666), "CDS")

        rebuilt.finalize()
        self.assertGreater(rebuilt.three_utr_length, 0)
        self.assertGreater(rebuilt.five_utr_length, 0)

    def test_invalid_transcript(self):
        lines = """Chr1\tTAIR10\tmRNA\t5928\t8737\t.\t.\t.\tID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1;Index=1
Chr1\tTAIR10\tCDS\t8571\t7500\t.\t.\t0\tParent=AT1G01020.1;
Chr1\tTAIR10\tCDS\t7503\t8666\t.\t.\t0\tParent=AT1G01020.1;
Chr1\tTAIR10\texon\t5928\t8737\t.\t.\t.\tParent=AT1G01020.1"""

        gff_lines = [Mikado.parsers.GFF.GffLine(line) for line in lines.split("\n")]
        self.assertIsInstance(gff_lines[0], Mikado.parsers.GFF.GffLine)
        checker = False
        if gff_lines[0].feature.endswith("transcript") or "RNA" in gff_lines[0].feature.upper():
            checker = True
        self.assertTrue(checker)
        self.assertTrue(gff_lines[0].is_transcript)
        transcript = Mikado.loci.Transcript(gff_lines[0])

        transcript.logger = self.logger
        transcript.add_exons(gff_lines[1:])

        with self.assertRaises(Mikado.exceptions.InvalidCDS):
            Mikado.transcripts.transcript_methods.finalizing._check_cdna_vs_utr(transcript)

    def test_utr(self):
        """Check the segments of the selected ORF and the UTR lists of the fixture transcript."""

        transcript = self.tr
        upstream, downstream = (5928, 8570), (8667, 8737)

        expected_orf = [("UTR", upstream),
                        ("exon", (5928, 8737)),
                        ("CDS", (8571, 8666), 0),
                        ("UTR", downstream)]
        report = "Right: {0}\nFound{1}".format(
            [("UTR", 5928, 8570), ("CDS", 8571, 8666), ("UTR", 8667, 8737)],
            transcript.selected_internal_orf)
        self.assertEqual(transcript.selected_internal_orf, expected_orf, report)

        self.assertEqual(transcript.combined_utr, [upstream, downstream])
        # On the plus strand the 5' UTR is the leftmost segment.
        self.assertEqual(transcript.five_utr, [upstream], transcript.five_utr)
        self.assertEqual(transcript.three_utr, [downstream])

    def test_utr_metrics(self):

        """Test for UTR exon num, start distance, etc.

        Expected lengths and distances are derived from the fixture:
        transcript 5928-8737, CDS 8571-8666, plus strand.
        """

        self.assertEqual(self.tr.five_utr_num, 1)
        self.assertEqual(self.tr.three_utr_num, 1)
        self.assertEqual(self.tr.five_utr_length, 8570 + 1 - 5928)
        self.assertEqual(self.tr.three_utr_length, 8737 + 1 - 8667)
        # The failure message reports the metric actually under test (the
        # original reported the distance from the TES here by mistake).
        self.assertEqual(self.tr.selected_start_distance_from_tss,
                         8571 - 5928,
                         (self.tr.selected_start_distance_from_tss, self.tr.strand))
        self.assertEqual(self.tr.selected_end_distance_from_tes,
                         8737 - 8666,
                         (self.tr.selected_end_distance_from_tes, self.tr.strand))

    def test_strip_cds(self):
        """After strip_cds() the transcript must report no CDS and no UTR at all."""

        transcript = self.tr
        transcript.strip_cds()

        self.assertEqual(transcript.selected_cds_length, 0)
        # Every feature list derived from the CDS must now be empty.
        for attribute in ("three_utr", "five_utr", "selected_cds"):
            self.assertEqual(getattr(transcript, attribute), [])
        # With no CDS there are no CDS boundaries either.
        for attribute in ("selected_cds_start", "selected_cds_end"):
            self.assertEqual(getattr(transcript, attribute), None)

    def test_remove_utr(self):
        """Remove the UTRs and verify the transcript boundaries now coincide with the CDS.

        The CDS itself must remain 8571-8666 and no UTR segment may be left.
        """

        transcript = self.tr
        transcript.remove_utrs()

        self.assertEqual(transcript.selected_cds_start, transcript.start)
        self.assertEqual(transcript.selected_cds_end, transcript.end)
        for attribute in ("three_utr", "five_utr"):
            self.assertEqual(getattr(transcript, attribute), [])
        self.assertEqual(transcript.combined_cds,
                         [(8571, 8666)],
                         transcript.combined_cds)
        self.assertEqual(transcript.combined_utr, [], transcript.combined_utr)

    def test_negative_orf(self):
        """Load a minus-strand ORF onto the monoexonic transcript.

        The transcript is expected to switch to the minus strand, with the
        CDS start and end swapped accordingly (8666 and 8571)."""

        transcript, orf = self.tr, self.orf

        orf.strand = "-"
        transcript.strip_cds(strand_specific=False)
        orf.has_stop_codon = False
        transcript.load_orfs([orf])

        self.assertEqual(transcript.strand, "-")
        self.assertEqual(transcript.selected_cds_start, 8666)
        self.assertEqual(transcript.selected_cds_end, 8571)

    def test_introns(self):
        """The fixture transcript must report empty intron sets of every kind."""

        transcript = self.tr
        # Each set is also passed as the failure message, so that an
        # unexpected intron is printed when the assertion fails.
        self.assertEqual(transcript.introns, set(), transcript.introns)
        self.assertEqual(transcript.combined_cds_introns, set(),
                         transcript.combined_cds_introns)
        self.assertEqual(transcript.selected_cds_introns, set(),
                         transcript.selected_cds_introns)

    def testDoubleOrf(self):

        """Test to verify the introduction of multiple ORFs.

        Three ORFs are offered to the transcript: the first and the second
        overlap each other, the third overlaps neither. The transcript is
        expected to end up with two internal ORFs and to be splittable into
        two transcripts by CDS.
        """

        self.tr.strip_cds()
        self.tr.finalized = False

        def build_orf(name, thick_start, thick_end):
            """Create a plus-strand transcriptomic ORF (start 1, end = cDNA length) and check it is valid."""
            orf = Mikado.parsers.bed12.BED12()
            orf.chrom = self.tr.id
            orf.start = 1
            orf.end = self.tr.cdna_length
            orf.name = name
            orf.strand = "+"
            orf.score = 0
            orf.thick_start = thick_start
            orf.thick_end = thick_end
            orf.block_count = 1
            orf.blockSize = self.tr.cdna_length
            orf.block_sizes = [self.tr.cdna_length]
            orf.block_starts = [0]
            orf.rgb = 0
            orf.has_start_codon = True
            orf.has_stop_codon = True
            orf.transcriptomic = True
            self.assertFalse(orf.invalid)
            return orf

        first_orf = build_orf(self.tr.id, 51, 398)

        # This should not be incorporated
        second_orf = build_orf("second", 201, 410)

        self.assertTrue(Mikado.loci.Transcript.is_overlapping_cds(
            first_orf, second_orf))

        # This should be added
        third_orf = build_orf("third", 501, 800)

        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(first_orf, third_orf))
        self.assertFalse(Mikado.loci.Transcript.is_overlapping_cds(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        self.tr.logger = self.logger

        # The ORFs are loaded one at a time first, then all together.
        self.tr.load_orfs([first_orf])
        self.tr.load_orfs([second_orf])
        self.tr.load_orfs([third_orf])

        self.tr.load_orfs([first_orf, second_orf, third_orf])

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         (self.tr.cdna_length, self.tr.selected_start_distance_from_tss,
                          self.tr.selected_end_distance_from_tes))

        # 648 = 348 (first ORF, 51-398) + 300 (third ORF, 501-800).
        self.assertEqual(self.tr.combined_cds_length, 648)
        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join([str(x) for x in self.tr.internal_orfs]))

        new_transcripts = sorted(self.tr.split_by_cds())

        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].three_utr_length, 0)
        self.assertEqual(new_transcripts[1].five_utr_length, 0)

    def testDoubleOrf_negative(self):
        """Load three minus-strand ORFs and check that two of them are retained.

        Three candidate ORFs spanning the whole cDNA are created on the minus
        strand and loaded together. The transcript is then expected to report
        two internal ORFs and to yield two transcripts from ``split_by_cds``.
        """

        self.tr.strip_cds(strand_specific=False)
        self.tr.finalized = False

        def build_orf(name, thick_start, thick_end):
            """Return a minus-strand, single-block BED12 ORF over the cDNA."""
            bed = Mikado.parsers.bed12.BED12()
            bed.chrom = self.tr.id
            bed.start = 1
            bed.end = self.tr.cdna_length
            bed.name = name
            bed.strand = "-"
            bed.score = 0
            bed.thick_start = thick_start
            bed.thick_end = thick_end
            bed.block_count = 1
            bed.blockSize = self.tr.cdna_length
            bed.block_sizes = [self.tr.cdna_length]
            bed.block_starts = [0]
            bed.rgb = 0
            bed.has_start_codon = True
            bed.has_stop_codon = True
            bed.transcriptomic = True
            return bed

        first_orf = build_orf(self.tr.id, 51, 398)
        self.assertFalse(first_orf.invalid)

        # Per the original author's note, this one should not be incorporated.
        second_orf = build_orf("second", 201, 410)
        self.assertFalse(second_orf.invalid)

        # Per the original author's note, this one should be added.
        third_orf = build_orf("third", 501, 800)
        self.assertFalse(third_orf.invalid)

        # The third ORF must not overlap either of the other two.
        for other_orf in (first_orf, second_orf):
            self.assertFalse(
                Mikado.loci.Transcript.is_overlapping_cds(other_orf, third_orf))

        # All three candidates must compare as distinct objects.
        distinct_pairs = ((third_orf, second_orf),
                          (first_orf, second_orf),
                          (first_orf, third_orf))
        for left, right in distinct_pairs:
            self.assertFalse(left == right)

        candidates = [first_orf, second_orf, third_orf]

        self.tr.logger = self.logger
        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        distances = (self.tr.cdna_length,
                     self.tr.selected_start_distance_from_tss,
                     self.tr.selected_end_distance_from_tes)
        self.assertEqual(self.tr.number_internal_orfs, 2, distances)

        # 348 == 398 - 51 + 1, i.e. the length of the first ORF.
        self.assertEqual(self.tr.selected_cds_length, 348)
        orf_dump = "\n".join([str(x) for x in self.tr.internal_orfs])
        self.assertEqual(self.tr.number_internal_orfs, 2, orf_dump)

        new_transcripts = list(self.tr.split_by_cds())
        new_transcripts.sort()

        self.assertEqual(len(new_transcripts), 2)
        upstream, downstream = new_transcripts[0], new_transcripts[1]
        self.assertEqual(upstream.five_utr_length, 0)
        self.assertEqual(downstream.three_utr_length, 0)

    def test_wrong_orf(self):
        """Check that a mismatched ORF does not make the transcript coding.

        The ORF is declared one base longer than the transcript cDNA and lies
        on the minus strand, while the transcript is forced onto the plus
        strand. Loading it must emit at least one log record on the "null"
        logger and leave the transcript non-coding.
        """
        cdna_length = self.tr.cdna_length

        orf = Mikado.parsers.bed12.BED12()
        orf.chrom = self.tr.id
        orf.start = 1
        # Deliberately one base past the end of the cDNA.
        orf.end = cdna_length + 1
        orf.name = "third"
        orf.strand = "-"
        orf.score = 0
        orf.thick_start = 501
        orf.thick_end = 800
        orf.block_count = 1
        orf.blockSize = cdna_length
        orf.block_sizes = [cdna_length]
        orf.block_starts = [0]
        orf.rgb = 0
        orf.has_start_codon = True
        orf.has_stop_codon = True
        orf.transcriptomic = True
        self.assertFalse(orf.invalid)

        self.tr.logger = self.logger
        self.tr.strip_cds()
        self.tr.strand = "+"
        # The logger may be shared with other tests: put its level back once
        # this test is over instead of leaking the WARNING setting.
        self.addCleanup(self.logger.setLevel, self.logger.level)
        self.logger.setLevel("WARNING")
        with self.assertLogs("null", level="DEBUG"):
            self.tr.load_orfs([orf])

        self.assertFalse(self.tr.is_coding)
Esempio n. 33
0
    def test_wrong_cds(self):
        """Finalize a transcript carrying a 1-bp CDS, then trim it.

        Finalization is expected to log at WARNING level or above on the
        "null" logger; ``trim_coding`` is then applied with ``max_length=50``
        and the resulting boundaries are checked.
        """
        exon_coords = [
            (47631264, 47631416),
            (47704590, 47704669),
            (47762671, 47762742),
            (47893062, 47893093),
            (47895572, 47895655),
            (48051942, 48051999),
        ]

        tr = Transcript()
        tr.chrom = "15"
        tr.source = "protein_coding"
        tr.start = 47631264
        tr.end = 48051999
        tr.strand = "+"
        tr.add_exons(exon_coords)
        tr.id = "ENST00000560636"
        tr.parent = "ENSG00000137872"

        # Single-base CDS segment inside the last exon.
        gff_fields = ["15", "protein_coding", "CDS", "48051996", "48051996",
                      ".", "+", "0",
                      "ID=ENST00000560636.cds1;Parent=ENST00000560636"]
        tr.add_exon(GffLine("\t".join(gff_fields)))

        null_logger = Mikado.utilities.log_utils.create_null_logger()
        tr.logger = null_logger
        with self.assertLogs("null", level="WARNING"):
            tr.finalize()

        trimmed = trim_coding(tr, null_logger, max_length=50)
        self.assertEqual(trimmed.start, 47631366)
        self.assertEqual(trimmed.end, 48051992)
Esempio n. 34
0
class TranscriptTester(unittest.TestCase):
    """Tests on a monoexonic transcript (AT1G01020.1) and on ORF loading.

    The GFF fixture below is parsed once when the class body is executed.
    ``setUp`` then builds, for every test, a finalized ``Transcript`` with a
    single exon and a single CDS segment, plus a BED12 ORF matching that CDS.
    """

    tr_gff = """Chr1    TAIR10    mRNA    5928    8737    .    .    .    ID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1;Index=1
Chr1    TAIR10    exon    5928    8737    .    .    .    Parent=AT1G01020.1"""

    # Convert the whitespace-aligned fixture into real tab-separated GFF.
    tr_lines = tr_gff.split("\n")
    for pos, line in enumerate(tr_lines):
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        tr_lines[pos] = re.sub(r"\s+", "\t", line)
        assert len(tr_lines[pos].split("\t")) == 9, line.split("\t")

    tr_gff_lines = [Mikado.parsers.GFF.GffLine(line) for line in tr_lines]

    for gff_line in tr_gff_lines:
        assert gff_line.header is False

    # Shared by every test of the class.
    logger = create_null_logger("null")

    def setUp(self):
        """Create the transcript under test and a BED12 ORF matching its CDS."""

        self.tr = Transcript()
        self.tr.logger = self.logger
        self.tr.chrom = "Chr1"
        self.tr.source = "TAIR10"
        self.tr.feature = "mRNA"
        self.tr.start = 5928
        self.tr.end = 8737
        self.tr.strand = "+"
        self.tr.add_exon((5928, 8737))
        self.tr.score = None
        self.tr.id, self.tr.parent, self.tr.name = "AT1G01020.1", "AT1G01020", "AT1G01020.1"
        self.tr.add_exon((8571, 8666), "CDS")
        self.tr.finalize()

        self.orf = Mikado.parsers.bed12.BED12()
        self.orf.chrom = self.tr.id
        self.orf.start = 1
        self.orf.end = self.tr.cdna_length
        self.orf.name = self.tr.id
        self.orf.strand = "+"
        self.orf.score = 0
        # CDS boundaries expressed relative to the transcript start (1-based).
        self.orf.thick_start = 8571 - 5928 + 1
        self.orf.thick_end = 8666 - 5928 + 1
        self.orf.block_count = 1
        self.orf.blockSize = self.tr.cdna_length
        self.orf.block_starts = 0
        self.orf.has_start_codon = True
        self.orf.has_stop_codon = True
        self.orf.transcriptomic = True
        self.assertFalse(self.orf.invalid, self.orf.invalid_reason)
        self.assertEqual((self.orf.thick_end - self.orf.thick_start + 1) % 3,
                         0)

    def _make_orf(self, name, strand, thick_start, thick_end, end=None):
        """Build a transcriptomic, single-block BED12 ORF on ``self.tr``.

        :param name: value for the BED12 ``name`` field.
        :param strand: "+" or "-".
        :param thick_start: first coding position, in transcript coordinates.
        :param thick_end: last coding position, in transcript coordinates.
        :param end: value for the BED12 ``end`` field; defaults to the cDNA
            length of ``self.tr``. Block sizes always use the cDNA length.
        :return: the new BED12 object, flagged with start and stop codons.
        """
        cdna_length = self.tr.cdna_length

        orf = Mikado.parsers.bed12.BED12()
        orf.chrom = self.tr.id
        orf.start = 1
        orf.end = cdna_length if end is None else end
        orf.name = name
        orf.strand = strand
        orf.score = 0
        orf.thick_start = thick_start
        orf.thick_end = thick_end
        orf.block_count = 1
        orf.blockSize = cdna_length
        orf.block_sizes = [cdna_length]
        orf.block_starts = [0]
        orf.rgb = 0
        orf.has_start_codon = True
        orf.has_stop_codon = True
        orf.transcriptomic = True
        return orf

    def test_invalid_inizialization(self):
        """An exon line cannot be used to initialise a Transcript."""

        with self.assertRaises(TypeError):
            _ = Mikado.loci.Transcript(self.tr_gff_lines[1])

    def test_basics(self):
        """Check coordinates, exon count and the monoexonic flag."""

        self.assertEqual(self.tr.chrom, "Chr1")
        self.assertEqual(self.tr.exon_num, 1)
        self.assertEqual(self.tr.monoexonic, True)
        self.assertEqual(self.tr.exon_num, len(self.tr.exons))
        self.assertEqual(self.tr.start, 5928)
        self.assertEqual(self.tr.end, 8737)
        self.assertEqual(self.tr.exons, [(5928, 8737)], self.tr.exons)

    def test_cds(self):
        """Load the fixture ORF and check the CDS features.

        The transcript is on the plus strand and the ORF declares both
        codons, so start and stop codon are expected to be present.
        NOTE(review): the original docstring stated that a single-exon
        transcript with no strand has both flags defined as False; that case
        is not exercised here.
        """

        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.combined_cds, self.tr.selected_cds)

        self.assertEqual(self.tr.combined_cds, [(8571, 8666)],
                         self.tr.combined_cds)
        self.assertEqual(self.tr.selected_cds_start, 8571)
        self.assertEqual(self.tr.selected_cds_end, 8666)
        self.assertEqual(self.tr.has_start_codon, True)
        self.assertEqual(self.tr.has_stop_codon, True)

    def test_equality(self):
        """A copy is equal; changing strand or adding an exon breaks equality."""

        new_transcript = self.tr.deepcopy()

        self.assertTrue(new_transcript == self.tr)

        new_transcript.strand = None
        self.assertFalse(
            new_transcript == self.tr)  # They have now a different strand

        new_transcript.unfinalize()
        new_transcript.strand = "+"  # It becomes a multiexonic transcript, so it must have a strand
        new_transcript.end = 9737

        new_exon = Mikado.parsers.GFF.GffLine(self.tr_lines[-1])
        new_exon.strand = "+"
        new_exon.start = 9000
        new_exon.end = 9737
        new_transcript.add_exon(new_exon)

        new_transcript.finalize()
        self.assertTrue(new_transcript != self.tr)

    def test_mono_finalising(self):
        """Build the transcript from the parsed GFF lines and finalize it."""

        transcript_line = [
            line for line in self.tr_gff_lines if line.feature == "mRNA"
        ]
        self.assertEqual(len(transcript_line), 1,
                         "\n".join([str(line) for line in self.tr_gff_lines]))

        tr = Mikado.loci.Transcript(transcript_line[0])
        exon_lines = [
            line for line in self.tr_gff_lines
            if line.is_exon is True and "UTR" not in line.feature.upper()
        ]
        tr.add_exons(exon_lines)
        tr.add_exon((8571, 8666), "CDS")

        tr.finalize()
        self.assertGreater(tr.three_utr_length, 0)
        self.assertGreater(tr.five_utr_length, 0)

    def test_invalid_transcript(self):
        """Inconsistent CDS lines must raise InvalidCDS in the cDNA/UTR check."""

        lines = """Chr1\tTAIR10\tmRNA\t5928\t8737\t.\t.\t.\tID=AT1G01020.1;Parent=AT1G01020;Name=AT1G01020.1;Index=1
Chr1\tTAIR10\tCDS\t8571\t7500\t.\t.\t0\tParent=AT1G01020.1;
Chr1\tTAIR10\tCDS\t7503\t8666\t.\t.\t0\tParent=AT1G01020.1;
Chr1\tTAIR10\texon\t5928\t8737\t.\t.\t.\tParent=AT1G01020.1"""

        gff_lines = [
            Mikado.parsers.GFF.GffLine(line) for line in lines.split("\n")
        ]
        transcript_line = gff_lines[0]
        self.assertIsInstance(transcript_line, Mikado.parsers.GFF.GffLine)
        # Sanity check on the fixture: the first line describes a transcript.
        self.assertTrue(
            transcript_line.feature.endswith("transcript")
            or "RNA" in transcript_line.feature.upper())
        self.assertTrue(transcript_line.is_transcript)
        transcript = Mikado.loci.Transcript(transcript_line)

        transcript.logger = self.logger
        transcript.add_exons(gff_lines[1:])

        with self.assertRaises(Mikado.exceptions.InvalidCDS):
            Mikado.loci.transcript_methods.finalizing._check_cdna_vs_utr(
                transcript)

    def test_utr(self):
        """Check the selected ORF segments and the derived UTR lists."""

        expected_orf = [("UTR", (5928, 8570)),
                        ("exon", (5928, 8737)),
                        ("CDS", (8571, 8666), 0),
                        ("UTR", (8667, 8737))]
        # The failure message reports the very list that is being compared.
        self.assertEqual(
            self.tr.selected_internal_orf, expected_orf,
            "Right: {0}\nFound: {1}".format(expected_orf,
                                            self.tr.selected_internal_orf))
        self.assertEqual(self.tr.combined_utr,
                         [(5928, 8570), (8667, 8737)])
        self.assertEqual(self.tr.five_utr, [(5928, 8570)],
                         self.tr.five_utr)
        self.assertEqual(self.tr.three_utr, [(8667, 8737)])

    def test_utr_metrics(self):
        """Test for UTR exon num, start distance, etc."""

        self.assertEqual(self.tr.five_utr_num, 1)
        self.assertEqual(self.tr.three_utr_num, 1)
        self.assertEqual(self.tr.five_utr_length, 8570 + 1 - 5928)
        self.assertEqual(self.tr.three_utr_length, 8737 + 1 - 8667)
        # Report the metric under test (the start distance) on failure.
        self.assertEqual(self.tr.selected_start_distance_from_tss, 8571 - 5928,
                         self.tr.selected_start_distance_from_tss)
        self.assertEqual(
            self.tr.selected_end_distance_from_tes, 8737 - 8666,
            (self.tr.selected_end_distance_from_tes, self.tr.strand))

    def test_strip_cds(self):
        """After stripping the CDS no coding feature or UTR must remain."""

        self.tr.strip_cds()
        self.assertEqual(self.tr.selected_cds_length, 0)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.selected_cds, [])
        self.assertEqual(self.tr.selected_cds_start, None)
        self.assertEqual(self.tr.selected_cds_end, None)

    def test_remove_utr(self):
        """Remove the UTRs and verify that the transcript now starts and ends
        with its CDS, that no UTR is left and that the CDS is unchanged.
        """

        self.tr.remove_utrs()
        self.assertEqual(self.tr.selected_cds_start, self.tr.start)
        self.assertEqual(self.tr.selected_cds_end, self.tr.end)
        self.assertEqual(self.tr.three_utr, [])
        self.assertEqual(self.tr.five_utr, [])
        self.assertEqual(self.tr.combined_cds, [(8571, 8666)],
                         self.tr.combined_cds)
        self.assertEqual(self.tr.combined_utr, [], self.tr.combined_utr)

    def test_negative_orf(self):
        """Test loading a negative strand ORF onto a monoexonic transcript.
        This should reverse the ORF."""

        self.orf.strand = "-"
        self.tr.strip_cds(strand_specific=False)
        self.orf.has_stop_codon = False
        self.tr.load_orfs([self.orf])
        self.assertEqual(self.tr.strand, "-")
        self.assertEqual(self.tr.selected_cds_start, 8666)
        self.assertEqual(self.tr.selected_cds_end, 8571)

    def test_introns(self):
        """A monoexonic transcript has no introns of any kind."""

        self.assertEqual(self.tr.introns, set(), self.tr.introns)
        self.assertEqual(self.tr.combined_cds_introns, set(),
                         self.tr.combined_cds_introns)
        self.assertEqual(self.tr.selected_cds_introns, set(),
                         self.tr.selected_cds_introns)

    def testDoubleOrf(self):
        """Test to verify the introduction of multiple ORFs."""

        self.tr.strip_cds()
        self.tr.finalized = False

        first_orf = self._make_orf(self.tr.id, "+", 51, 398)
        self.assertFalse(first_orf.invalid)

        # This should not be incorporated
        second_orf = self._make_orf("second", "+", 201, 410)
        self.assertFalse(second_orf.invalid)

        self.assertTrue(
            Mikado.loci.Transcript.is_overlapping_cds(first_orf, second_orf))

        # This should be added
        third_orf = self._make_orf("third", "+", 501, 800)
        self.assertFalse(third_orf.invalid)

        self.assertFalse(
            Mikado.loci.Transcript.is_overlapping_cds(first_orf, third_orf))
        self.assertFalse(
            Mikado.loci.Transcript.is_overlapping_cds(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]

        self.tr.logger = self.logger

        # Load each ORF on its own first, then all of them together.
        self.tr.load_orfs([first_orf])
        self.tr.load_orfs([second_orf])
        self.tr.load_orfs([third_orf])

        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(
            self.tr.number_internal_orfs, 2,
            (self.tr.cdna_length, self.tr.selected_start_distance_from_tss,
             self.tr.selected_end_distance_from_tes))

        # 648 == (398 - 51 + 1) + (800 - 501 + 1)
        self.assertEqual(self.tr.combined_cds_length, 648)
        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join([str(x) for x in self.tr.internal_orfs]))

        new_transcripts = sorted(self.tr.split_by_cds())

        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].three_utr_length, 0)
        self.assertEqual(new_transcripts[1].five_utr_length, 0)

    def testDoubleOrf_negative(self):
        """Test to verify the introduction of multiple ORFs on the minus strand."""

        self.tr.strip_cds(strand_specific=False)
        self.tr.finalized = False

        first_orf = self._make_orf(self.tr.id, "-", 51, 398)
        self.assertFalse(first_orf.invalid)

        # This should not be incorporated
        second_orf = self._make_orf("second", "-", 201, 410)
        self.assertFalse(second_orf.invalid)

        # NOTE: unlike in testDoubleOrf, the first/second overlap and the
        # combined CDS length are not asserted here (those checks were
        # disabled in the original test).

        # This should be added
        third_orf = self._make_orf("third", "-", 501, 800)
        self.assertFalse(third_orf.invalid)

        self.assertFalse(
            Mikado.loci.Transcript.is_overlapping_cds(first_orf, third_orf))
        self.assertFalse(
            Mikado.loci.Transcript.is_overlapping_cds(second_orf, third_orf))

        self.assertFalse(third_orf == second_orf)
        self.assertFalse(first_orf == second_orf)
        self.assertFalse(first_orf == third_orf)

        candidates = [first_orf, second_orf, third_orf]

        self.tr.logger = self.logger

        self.tr.load_orfs(candidates)

        self.assertTrue(self.tr.is_complete)
        self.tr.finalize()
        self.assertEqual(
            self.tr.number_internal_orfs, 2,
            (self.tr.cdna_length, self.tr.selected_start_distance_from_tss,
             self.tr.selected_end_distance_from_tes))

        self.assertEqual(self.tr.selected_cds_length, 348)
        self.assertEqual(self.tr.number_internal_orfs, 2,
                         "\n".join([str(x) for x in self.tr.internal_orfs]))

        new_transcripts = sorted(self.tr.split_by_cds())

        self.assertEqual(len(new_transcripts), 2)
        self.assertEqual(new_transcripts[0].five_utr_length, 0)
        self.assertEqual(new_transcripts[1].three_utr_length, 0)

    def test_wrong_orf(self):
        """Check that a mismatched ORF does not make the transcript coding.

        The ORF is declared one base longer than the transcript cDNA and lies
        on the minus strand, while the transcript is forced onto the plus
        strand. Loading it must emit at least one log record on the "null"
        logger and leave the transcript non-coding.
        """
        orf = self._make_orf("third", "-", 501, 800,
                             end=self.tr.cdna_length + 1)
        self.assertFalse(orf.invalid)

        self.tr.logger = self.logger
        self.tr.strip_cds()
        self.tr.strand = "+"
        # The logger is a class attribute shared by all tests: restore its
        # level afterwards instead of leaking the WARNING setting.
        self.addCleanup(self.logger.setLevel, self.logger.level)
        self.logger.setLevel("WARNING")
        with self.assertLogs("null", level="DEBUG"):
            self.tr.load_orfs([orf])

        self.assertFalse(self.tr.is_coding)
Esempio n. 35
0
class TestMetricsEndDistances(unittest.TestCase):
    """Check ``end_distance_from_junction`` and ``end_distance_from_tes``.

    Every test starts from the same 12-exon transcript and moves the CDS end
    progressively closer to the transcript end, on either strand.
    """

    logger = create_default_logger("End")
    logger.setLevel("ERROR")

    # CDS segments common to every scenario; the trailing comments give the
    # segment length modulo 3.
    _CORE_CDS = [
        (1301, 2000),  # 700 % 3 == 1
        (3501, 5000),  # 1500 % 3 == 0
        (5501, 6000),  # 500 % 3 == 2
        (6201, 7000),  # 800 % 3 == 2
        (7301, 7700),  # 400 % 3 == 1
        (8201, 9000),  # 800 % 3 == 2
    ]

    def setUp(self):
        """Build the unstranded, non-coding 12-exon transcript."""

        exons = [
            (101, 300), (501, 800), (1001, 1200), (1301, 2000),
            (3501, 5000), (5501, 6000), (6201, 7000), (7301, 7700),
            (8201, 9000), (9101, 9300), (9501, 9700), (9801, 10000),
        ]

        self.tr = Transcript()
        self.tr.logger = self.logger
        self.tr.start = 101
        self.tr.end = 10000
        self.tr.add_exons(exons)
        self.tr.id = "test1"
        self.tr.parent = "test1.gene"

    def test_end_positive(self):
        """Plus strand: CDS ending in the 10th, 11th and 12th exon in turn."""

        self.tr.strand = "+"

        # Scenario 1: CDS ends at 9130, inside exon (9101, 9300).
        cds = ([(1161, 1200)]  # 40 % 3 == 1
               + self._CORE_CDS
               + [(9101, 9130)])  # 30 % 3 == 0
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        self.assertEqual(self.tr.selected_cds_end, 9130)
        to_last_junction = (9300 - 9131 + 1) + (9700 - 9501 + 1)
        self.assertEqual(self.tr.end_distance_from_junction, to_last_junction)
        self.assertEqual(self.tr.end_distance_from_tes,
                         to_last_junction + (10000 - 9801 + 1))

        self.tr.strip_cds()
        self.assertEqual(len(self.tr.internal_orfs), 0, self.tr.internal_orfs)
        self.tr.finalized = False

        # Scenario 2: CDS ends at 9690, inside exon (9501, 9700).
        cds = ([(1161, 1200)]  # 40 % 3 == 1
               + self._CORE_CDS
               + [(9101, 9300),   # 200 % 3 == 2
                  (9501, 9690)])  # 190 % 3 == 1
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.combined_cds_end, 9690)
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        to_last_junction = 9700 - 9691 + 1
        self.assertEqual(self.tr.end_distance_from_junction, to_last_junction)
        self.assertEqual(self.tr.end_distance_from_tes,
                         to_last_junction + (10000 - 9801 + 1))

        self.tr.strip_cds()
        self.assertEqual(self.tr.combined_cds_end, self.tr.selected_cds_end,
                         self.tr.combined_cds)
        self.assertEqual(self.tr.combined_cds_end, None,
                         self.tr.combined_cds_end)
        self.tr.finalized = False

        # Scenario 3: CDS ends at 9820, inside the last exon.
        cds = ([(1161, 1200)]  # 40 % 3 == 1
               + self._CORE_CDS
               + [(9101, 9300),   # 200 % 3 == 2
                  (9501, 9700),   # 200 % 3 == 2
                  (9801, 9820)])  # 20 % 3 == 2
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.combined_cds_end, 9820)
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        self.assertEqual(self.tr.end_distance_from_tes, 180)
        self.assertEqual(self.tr.end_distance_from_junction, 0)

    def test_end_negative(self):
        """Minus strand: CDS ending in the 3rd, 2nd and 1st exon in turn."""

        self.tr.strand = "-"

        # The exons are those added in setUp; on this strand the CDS end is
        # the lowest CDS coordinate.

        # Scenario 1: CDS ends at 1161, inside exon (1001, 1200).
        cds = ([(1161, 1200)]  # 40 % 3 == 1
               + self._CORE_CDS
               + [(9101, 9130)])  # 30 % 3 == 0

        self.assertEqual(sum(end - start + 1 for start, end in cds) % 3, 0)

        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertTrue(self.tr.is_coding)
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        self.assertEqual(self.tr.selected_cds_end, 1161)
        to_last_junction = (1161 - 1001) + (800 - 501 + 1)
        self.assertEqual(
            self.tr.end_distance_from_junction, to_last_junction,
            (self.tr.end_distance_from_junction, to_last_junction))
        self.assertEqual(
            self.tr.end_distance_from_tes,
            self.tr.end_distance_from_junction + (300 - 101 + 1),
            (self.tr.end_distance_from_tes,
             self.tr.end_distance_from_junction + (300 - 101 + 1)))

        self.tr.strip_cds()
        self.assertEqual(len(self.tr.internal_orfs), 0, self.tr.internal_orfs)
        self.tr.finalized = False

        # Scenario 2: CDS ends at 721, inside exon (501, 800).
        cds = ([(721, 800),     # 80 % 3 == 2
                (1001, 1200)]   # 200 % 3 == 2
               + self._CORE_CDS
               + [(9101, 9130)])  # 30 % 3 == 0
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.combined_cds_end, 721)
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        to_last_junction = 721 - 501
        self.assertEqual(
            self.tr.end_distance_from_junction, to_last_junction,
            (self.tr.end_distance_from_junction, to_last_junction))
        self.assertEqual(
            self.tr.end_distance_from_tes,
            self.tr.end_distance_from_junction + (300 - 101 + 1),
            (self.tr.end_distance_from_tes,
             self.tr.end_distance_from_junction + (300 - 101 + 1)))

        self.tr.strip_cds()
        self.assertEqual(self.tr.combined_cds_end, self.tr.selected_cds_end,
                         self.tr.combined_cds)
        self.assertEqual(self.tr.combined_cds_end, None,
                         self.tr.combined_cds_end)
        self.tr.finalized = False

        # Scenario 3: CDS ends at 161, inside the first exon (101, 300).
        cds = ([(161, 300),     # 140 % 3 == 2
                (501, 800),     # 300 % 3 == 0
                (1001, 1200)]   # 200 % 3 == 2
               + self._CORE_CDS
               + [(9101, 9130)])  # 30 % 3 == 0

        self.assertEqual(
            sum((end - start + 1) % 3 for start, end in cds) % 3, 0)
        self.tr.logger = self.logger
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.combined_cds_end, 161)
        self.assertEqual(self.tr.selected_cds_end, self.tr.combined_cds_end)
        self.assertEqual(self.tr.end_distance_from_tes, 60)
        self.assertEqual(self.tr.end_distance_from_junction, 0)
Esempio n. 36
0
class TestPadding(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        """Resolve, once per class, the location of the bundled ``chr5.fas.gz``.

        The value stored in ``cls.fai`` is whatever
        ``pkg_resources.resource_filename`` returns for that resource of the
        ``Mikado.tests`` package; tests assign it to ``reference.genome`` in
        their configuration.
        """
        package, resource = "Mikado.tests", "chr5.fas.gz"
        cls.fai = pkg_resources.resource_filename(package, resource)

    def setUp(self):
        """Create a fresh AT5G66600.3 reference transcript for every test.

        The transcript is built from a single BED12 record, attributed to the
        ``TAIR10`` source and flagged as a reference.
        """
        bed_line = "Chr5\t26574999\t26578625\tID=AT5G66600.3;coding=True;phase=0\t0\t-\t26575104\t26578315\t0\t11\t411,126,87,60,100,809,126,72,82,188,107\t0,495,711,885,1035,1261,2163,2378,2856,3239,3519"
        record = BED12(bed_line)
        self.reference = Transcript(record,
                                    source="TAIR10",
                                    is_reference=True)

    def test_basic_padding(self):
        """Pad the reference against an extended template, one side at a time.

        The template is a CDS-stripped copy of the reference whose first and
        last exons are replaced by longer ones, with one extra exon added
        beyond each of them. ``pad_transcript`` is then called with the
        template passed as the fourth argument only, as the third argument
        only, and as both; after each call the added exons and the relevant
        boundaries are checked.
        """

        logger = create_null_logger("test_basic_padding")
        logger.setLevel("INFO")
        template = self.reference.copy()
        template.id = "AT5G66600.3_exp"
        template.strip_cds()
        template.unfinalize()

        template.remove_exon((26575000, 26575410))  # First exon
        template.start = 26574650
        template.add_exon((26574970, 26575410))  # New exon, template at 5'
        template.add_exon((26574650, 26574820))  # New UTR exon

        template.remove_exon((26578519, 26578625))  # Last exon
        template.end = 26579700
        template.add_exon((26578519, 26578725))
        template.add_exon((26579325, 26579700))

        template.finalize()
        fai = pysam.FastaFile(
            pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz"))
        # Close the FASTA handle even if one of the assertions below fails.
        self.addCleanup(fai.close)

        new5 = pad_transcript(self.reference, self.reference.deepcopy(), None,
                              template, fai, logger)
        self.assertIn((26574970, 26575410), new5.exons)
        self.assertIn((26574650, 26574820), new5.exons)
        self.assertEqual(template.start, new5.start)
        self.assertEqual(self.reference.end, new5.end)

        new3 = pad_transcript(self.reference, self.reference.deepcopy(),
                              template, None, fai, logger)
        self.assertIn((26578519, 26578725), new3.exons)
        self.assertIn((26579325, 26579700), new3.exons)
        self.assertEqual(self.reference.start, new3.start)
        # The end boundary is checked on the object returned by this call.
        self.assertEqual(template.end, new3.end)

        new53 = pad_transcript(self.reference, self.reference.deepcopy(),
                               template, template, fai, logger)
        self.assertIn((26574970, 26575410), new53.exons)
        self.assertIn((26574650, 26574820), new53.exons)
        self.assertIn((26578519, 26578725), new53.exons)
        self.assertIn((26579325, 26579700), new53.exons)
        self.assertEqual(template.start, new53.start)
        self.assertEqual(template.end, new53.end)

    def test_locus_padding_equal_or_n(self):
        """Check a reference-update locus with padding switched off and on.

        Two sets of replacement exons are tried. For each, a template derived
        from the reference is added to a locus built around a copy of the
        reference, with ``only_reference_update`` enabled. Once alternative
        splicing has been finalised the template must no longer be in the
        locus; the reference keeps its own boundaries when padding is off and
        takes the template's boundaries when padding is on.
        """

        exon_sets = [
            ((26574970, 26575410), (26578519, 26578725)),
            ((26574970, 26575410), (26574650, 26574820), (26578519, 26578725),
             (26579325, 26579700))
        ]
        pad_choices = (False, True)

        for num, exons_to_add in enumerate(exon_sets):
            for num2, pad_transcripts in enumerate(pad_choices):
                with self.subTest(exons_to_add=exons_to_add,
                                  pad_transcripts=pad_transcripts):
                    logger_name = "test_locus_padding_equal_or_n_{}".format(
                        num + num2 * 2)
                    logger = create_null_logger(logger_name)
                    logger.setLevel("DEBUG")

                    # Template: the reference with its terminal exons swapped
                    # for the replacement set under test.
                    template = self.reference.copy()
                    del template.is_reference
                    template.id = "AT5G66600.3_exp"
                    template.unfinalize()
                    for terminal_exon in ((26575000, 26575410),   # First exon
                                          (26578519, 26578625)):  # Last exon
                        template.remove_exon(terminal_exon)
                    template.start = min(exon[0] for exon in exons_to_add)
                    template.end = max(exon[1] for exon in exons_to_add)
                    template.add_exons(exons_to_add)
                    template.finalize()

                    json_conf = load_and_validate_config(None)
                    json_conf.reference.genome = self.fai
                    json_conf.pick.alternative_splicing.only_confirmed_introns = False
                    json_conf.pick.run_options.only_reference_update = True
                    json_conf.pick.alternative_splicing.pad = pad_transcripts

                    locus = Locus(self.reference.copy(),
                                  logger=logger,
                                  configuration=json_conf)
                    ref_id = self.reference.id
                    self.assertTrue(locus[ref_id].is_reference)
                    self.assertEqual(locus.perform_padding, pad_transcripts)
                    locus.add_transcript_to_locus(template)
                    if pad_transcripts:
                        self.assertIn(template.id, locus)
                    locus.finalize_alternative_splicing()
                    self.assertNotIn(template.id, locus)

                    if pad_transcripts:
                        self.assertTrue(locus.perform_padding)
                        self.assertEqual(locus[ref_id].start,
                                         template.start,
                                         (locus[ref_id].exons[0],
                                          template.exons[0]))
                        self.assertEqual(locus[ref_id].end,
                                         template.end,
                                         (locus[ref_id].end, template.end))
                        self.assertNotIn(template.id, locus)
                    else:
                        self.assertEqual(locus[ref_id].start,
                                         self.reference.start)
                        self.assertEqual(locus[ref_id].end,
                                         self.reference.end)

    def test_removal_after_padding(self):
        """Here we test that, given three transcripts, the first one will be expanded to be identical to the second;
        the second will be removed as redundant; the third will be expanded to compatible with the padded first.
        """

        logger = create_default_logger("test_add_two_partials", "INFO")
        json_conf = load_and_validate_config(None)
        json_conf.reference.genome = self.fai
        json_conf.pick.alternative_splicing.min_cds_overlap = 0.2
        json_conf.pick.alternative_splicing.min_cdna_overlap = 0.2
        json_conf.pick.alternative_splicing.only_confirmed_introns = False
        json_conf.pick.alternative_splicing.keep_retained_introns = True
        json_conf.pick.alternative_splicing.pad = True
        json_conf.scoring.requirements.expression = ["cdna_length"]
        json_conf.scoring.requirements.parameters = {
            "cdna_length": SizeFilter(operator="gt", value=0)
        }
        json_conf.scoring.requirements._expression = json_conf.scoring.requirements._create_expression(
            json_conf.scoring.requirements.expression,
            json_conf.scoring.requirements.parameters)
        json_conf.scoring.as_requirements.expression = ["cdna_length"]
        json_conf.scoring.as_requirements.parameters = {
            "cdna_length": SizeFilter(operator="gt", value=0)
        }
        json_conf.scoring.as_requirements._expression = json_conf.scoring.requirements._create_expression(
            json_conf.scoring.requirements.expression,
            json_conf.scoring.requirements.parameters)

        t1 = Transcript(
            BED12(
                "Chr5\t26584779\t26587869\tID=AT5G66610.1;coding=True;phase=0\t0\t+\t26585222\t26587755\t0\t11\t\
100,54,545,121,78,105,213,63,119,59,443\t0,440,565,1202,1437,1640,1858,2154,2304,2507,2647"
            ))

        t2_1 = Transcript(
            BED12(
                "Chr5\t26584773\t26586510\tID=AT5G66610.2_1;coding=True;phase=0\t0\t+\t26585222\t26586510\t0\t\
6\t177,54,545,121,78,85\t0,446,571,1208,1443,1652"))
        t2_2 = Transcript(
            BED12(
                "Chr5\t26584873\t26587782\tID=AT5G66610.2_2;coding=True;phase=0\t0\t+\t26585222\t\
26587755\t0\t10\t77,54,545,121,78,99,213,63,119,496\t0,346,471,1108,1343,1552,1764,2060,2210,2413"
            ))

        t1.finalize()
        t2_1.finalize()
        t2_2.finalize()
        t1.is_reference = True
        self.assertEqual(t1.start, 26584780)
        locus = Locus(t1, logger=logger, configuration=json_conf)
        locus.add_transcript_to_locus(t2_1, check_in_locus=False)
        locus.add_transcript_to_locus(t2_2, check_in_locus=False)
        self.assertTrue(locus.primary_transcript_id == t1.id)
        locus.logger.setLevel("INFO")
        locus.finalize_alternative_splicing(_scores={
            t1.id: 20,
            t2_1.id: 15,
            t2_2.id: 10
        })
        self.assertIn(t1.id, locus.transcripts)
        if t2_1.id in locus.transcripts:
            for tid1, tid2 in itertools.combinations(locus.transcripts.keys(),
                                                     2):
                res, _ = Assigner.compare(locus[tid1], locus[tid2])
                print(tid1, tid2, res.ccode)
            self.assertNotIn(t2_1.id, locus.transcripts.keys(),
                             [(key, val.start, val.end)
                              for key, val in locus.transcripts.items()])

        self.assertIn(t2_2.id, locus.transcripts,
                      "\n".join(tr.format("bed12") for tr in locus))
        self.assertTrue(locus[t2_2.id].attributes["padded"])
        # self.assertTrue(locus[t1.id].attributes["padded"])
        self.assertGreaterEqual(t1.start, locus[t1.id].start,
                                locus[t1.id].format("bed12"))
        self.assertEqual(
            locus[t2_2.id].start,
            locus[t1.id].start,
            ((locus[t2_2.id].start, t1.start, t2_1.start, t2_2.start),
             (locus[t2_2.id].end, t1.end, t2_1.end, t2_2.end)),
        )
        self.assertEqual(locus[t1.id].end, locus[t2_2.id].end)

    def test_add_two_partials(self):
        """Add two partial models to a reference locus with padding enabled.

        The reference is a mono-exonic coding transcript; each template is
        truncated on one side and carries one extra exon beyond the opposite
        side. After finalising alternative splicing neither template may
        remain in the locus, and the reference end must equal the end of the
        second template.
        """

        def build_coding(tid, chrom, strand, exons, cds, is_reference):
            """Assemble and finalise one transcript, checking that it is coding."""
            model = Transcript(is_reference=is_reference)
            model.chrom, model.strand, model.id = chrom, strand, tid
            model.add_exons(exons)
            model.add_exons(cds, features=["CDS"])
            model.finalize()
            self.assertTrue(model.is_coding)
            return model

        logger = create_null_logger("test_add_two_partials")
        logger.setLevel("INFO")
        json_conf = load_and_validate_config(None)
        json_conf.reference.genome = self.fai
        json_conf.pick.alternative_splicing.only_confirmed_introns = False
        json_conf.pick.run_options.only_reference_update = True

        # Reference model AT5G66670.2: exon 26611258-26612889 with CDS
        # 26611474-26612700, on the minus strand of Chr5.
        ref = build_coding("AT5G66670.2", "Chr5", "-",
                           [(26611258, 26612889)],
                           [(26611474, 26612700)],
                           is_reference=True)

        template1 = build_coding(ref.id + "_frag1", ref.chrom, ref.strand,
                                 ((26611116, 26611157), (26611258, 26612670)),
                                 ((26611474, 26612670), ),
                                 is_reference=False)

        template2 = build_coding(ref.id + "_frag2", ref.chrom, ref.strand,
                                 ((26611574, 26612889), (26613007, 26613403)),
                                 ((26611574, 26612700), ),
                                 is_reference=False)

        logger.setLevel("INFO")
        json_conf.pick.alternative_splicing.pad = True
        locus = Locus(ref, configuration=json_conf, logger=logger)
        for template in (template1, template2):
            locus.add_transcript_to_locus(template)
        self.assertIn(template2.id, locus)

        locus.finalize_alternative_splicing(check_requirements=False)
        self.assertTrue(locus._finalized)
        self.assertNotIn(template1.id, locus, "\n" + str(locus))
        self.assertNotIn(template2.id, locus, "\n" + str(locus))

        ends = (locus[ref.id].end, ref.end, template2.end, template1.end)
        starts = (locus[ref.id].start, ref.start, template2.start,
                  template1.start)
        self.assertEqual(locus[ref.id].end, template2.end, (ends, starts))

    # unittest.skip takes the reason as its argument; used bare, the test
    # function itself is passed in as the "reason".
    @unittest.skip("Disabled in the suite; the reason was not recorded.")
    def test_failed_expansion(self):
        """Pad the ``candidate`` transcript of ``fail.bed12`` with ``template``.

        Both transcripts are read from ``test_pick_pad/fail.bed12`` and padded
        against the sequences in ``test_pick_pad/failing_seq.fa.gz``. The test
        makes no assertion; it only exercises ``pad_transcript``.
        """
        logger = create_default_logger("test_failed_expansion",
                                       level="WARNING")
        bed_path = pkg_resources.resource_filename(
            "Mikado.tests",
            os.path.join("test_pick_pad", "fail.bed12"))
        # Read every record while the file is open, then release the handle.
        with open(bed_path) as bed_handle:
            raw = [
                Transcript(line, logger=logger)
                for line in Bed12Parser(bed_handle)
            ]
        transcripts = dict((_.id, _) for _ in raw)
        for transcript in transcripts.values():
            transcript.finalize()
        template = transcripts["template"]  # 4535908	4540293
        candidate = transcripts["candidate"]  # 4536444	4540027
        backup = candidate.copy()
        fai = pysam.FastaFile(
            pkg_resources.resource_filename(
                "Mikado.tests",
                os.path.join("test_pick_pad", "failing_seq.fa.gz")))
        # Close the FASTA handle whatever the outcome of the call below.
        self.addCleanup(fai.close)
        logger.setLevel("DEBUG")
        candidate.logger.setLevel("DEBUG")
        pad_transcript(candidate,
                       backup,
                       start_transcript=template,
                       end_transcript=template,
                       fai=fai,
                       logger=logger)
Esempio n. 37
0
    def test_noncoding(self):
        """Check the boundaries returned by ``trim_noncoding``.

        The transcript has exons (10000, 11500), (12000, 13000),
        (15000, 18000) and (19000, 20000). The expected start/end after
        trimming are 11450/19050 with ``max_length=50`` and 11300/19200 with
        ``max_length=200``.
        """

        transcript = Transcript()
        transcript.chrom = "Chr1"
        transcript.source = "test"
        transcript.start = 10000
        transcript.end = 20000

        exons = [(10000, 11500), (12000, 13000), (15000, 18000),
                 (19000, 20000)]

        transcript.add_exons(exons)
        transcript.strand = "+"
        transcript.finalize()

        # Each call is given its own deep copy of the finalised transcript.
        trimmed = trim_noncoding(transcript.deepcopy(), max_length=50)
        self.assertEqual(trimmed.start, 11450)
        self.assertEqual(trimmed.end, 19050)

        trimmed = trim_noncoding(transcript.deepcopy(), max_length=200)
        self.assertEqual(trimmed.start, 11300)
        self.assertEqual(trimmed.end, 19200)
Esempio n. 38
0
    def test_correct_cds(self):
        """Check the boundaries returned by ``trim_coding``.

        The CDS begins inside the first exon and stops inside the third one.
        With ``max_length=50`` the expected start is the first CDS position
        (11400) and the expected end is 19050; with ``max_length=200`` the
        expected boundaries are 11300 and 19200.
        """

        exon_coords = [(10000, 11500), (12000, 13000), (15000, 18000),
                       (19000, 20000)]
        # CDS segment lengths: 101 + 1001 + 2999 = 4101, a multiple of three.
        cds_coords = [(11400, 11500), (12000, 13000), (15000, 17998)]

        transcript = Transcript()
        transcript.chrom = "Chr1"
        transcript.source = "test"
        transcript.start = 10000
        transcript.end = 20000
        transcript.add_exons(exon_coords)
        transcript.add_exons(cds_coords, features="CDS")
        transcript.strand = "+"
        transcript.finalize()

        logger = Mikado.utilities.log_utils.create_null_logger("correct_cds")

        short_trim = trim_coding(transcript.deepcopy(), logger, max_length=50)
        self.assertEqual(short_trim.start, 11400)
        self.assertEqual(short_trim.end, 19050)

        # The deep copy must still carry the original start before trimming.
        fresh_copy = transcript.deepcopy()
        self.assertEqual(fresh_copy.start, 10000)
        long_trim = trim_coding(fresh_copy, logger, max_length=200)
        self.assertEqual(long_trim.start, 11300)
        self.assertEqual(long_trim.end, 19200)
Esempio n. 39
0
    def test_non_coding_negative(self):
        tr = Transcript()
        tr.chrom = "Chr1"
        tr.start = 101
        tr.end = 2000
        tr.strand = "-"
        tr.add_exons([(101, 300),
                      (1701, 2000)])
        tr.id = "test1"
        tr.parent = "gene1"
        tr.finalize()

        gff = tr.format("gff", with_introns=True)
        self.maxDiff = None
        res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tID=test1;Parent=gene1
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tID=test1.exon1;Parent=test1
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tID=test1.intron1;Parent=test1
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tID=test1.exon2;Parent=test1"""
        self.assertEqual(gff,
                         res,
                         "++++\n\n"+"\n+++\n".join([gff, res]))

        gtf = tr.format("gtf", with_introns=True)
        res = """Chr1\tMikado\ttranscript\t101\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t101\t300\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\tintron\t301\t1700\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";
Chr1\tMikado\texon\t1701\t2000\t.\t-\t.\tgene_id "gene1"; transcript_id "test1";"""
        self.assertEqual(gtf, res,
                         "++++\n\n" + "\n+++\n".join([gtf, res]))
Esempio n. 40
0
class PhaseChecker(unittest.TestCase):

    logger = create_default_logger("pcheck")
    logger.setLevel("DEBUG")

    def setUp(self):
        """Load one minus-strand mRNA from inline GFF3 text and record the phases expected for it.

        The first GFF line creates the transcript; every following line is
        added to it through ``add_exons``. ``self.correct_phases`` maps each
        CDS segment ``(start, end)`` to the phase the test expects. For several
        segments that value differs from the phase column of the GFF text.
        """

        lines = """Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	mRNA	40282	46004	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960;Name=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2;aed=0.0;note=TRIAE_CS42_5DL_TGACv1_434051_AA1427960;confidence=High;has_start=True;has_stop=True;original_stop=True;protein_rank=P1;transcript_rank=T2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	40282	40933	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	three_prime_UTR	40282	40720	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.three_prime_UTR1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	40721	40933	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	41018	41111	.	-	1	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	41018	41111	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	41227	41468	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS3;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	41227	41468	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon3;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	41673	41831	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS4;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	41673	41831	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon4;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	41946	42820	.	-	2	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS5;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	41946	42820	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon5;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	42905	42913	.	-	2	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS6;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	42905	42913	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon6;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	45373	45496	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS7;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	45373	45496	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon7;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	45600	45651	.	-	1	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS8;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	45600	45651	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon8;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	45726	45726	.	-	2	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS9;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	45726	45726	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon9;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	45875	45893	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS10;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	45875	46004	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon10;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	five_prime_UTR	45894	46004	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.five_prime_UTR1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2"""

        # Splitting on any whitespace and re-joining with tabs makes every row
        # tab-separated, whatever whitespace the literal above contains.
        lines = [GffLine("\t".join(_.split())) for _ in lines.split("\n") if _]
        # First row: the mRNA itself; remaining rows: its exon/CDS/UTR features.
        self.transcript = Transcript(lines[0], logger=self.logger)
        self.transcript.add_exons(lines[1:])
        # Expected phase for each CDS segment, keyed by (start, end).
        self.correct_phases = {(40721, 40933): 2,
                               (41018, 41111): 0,
                               (41227, 41468): 2,
                               (41673, 41831): 2,
                               (41946, 42820): 1,
                               (42905, 42913): 1,
                               (45373, 45496): 2,
                               (45600, 45651): 0,
                               (45726, 45726): 2,
                               (45875, 45893): 0}

    # unittest.skip takes the reason as its argument; used bare, the test
    # function itself is passed in as the "reason".
    @unittest.skip("Disabled in the suite; the reason was not recorded.")
    def test_check_phases(self):
        """Compare the phases of the first internal ORF with ``correct_phases``.

        After finalisation the CDS segments of ``internal_orfs[0]`` must cover
        exactly the expected coordinates and carry the expected phase each.
        """
        self.transcript.finalize()
        # Entries are indexed as (feature, coordinates, phase).
        phases = {segment[1]: segment[2]
                  for segment in self.transcript.internal_orfs[0]
                  if segment[0] == "CDS"}
        self.assertEqual(self.transcript.combined_cds_start, 45893)

        self.assertEqual(phases.keys(),
                         self.correct_phases.keys(),
                         list(zip(sorted(phases.keys()),
                                  sorted(self.correct_phases.keys()))))

        # On a mismatch, fail on the first offending segment (scanning from the
        # highest coordinates down) so that the report names it.
        if self.correct_phases != phases:
            for key in sorted(phases.keys(), reverse=True):
                self.assertEqual(phases[key], self.correct_phases[key],
                                 (key, phases[key], self.correct_phases[key]))

        self.assertEqual(self.correct_phases,
                         phases,
                         (self.correct_phases, phases))
Esempio n. 41
0
class WrongLoadedOrf(unittest.TestCase):

    def setUp(self):
        """Create the two-exon, plus-strand transcript ``test1`` used by every test."""

        transcript = Transcript()
        transcript.start = 101
        transcript.end = 1000
        transcript.chrom = "Chr1"
        transcript.strand = "+"
        transcript.id = "test1"
        transcript.add_exons([(101, 400), (701, 1000)])
        transcript.finalize()
        self.tr = transcript

    def test_load_invalid_length(self):
        """An ORF record longer than the transcript must be reported.

        The BED12 end is set ten bases past the transcript cDNA length;
        ``load_orfs`` is then expected to log a "Wrong ORF for <id>:" message
        on the ``null`` logger at WARNING level or above.
        """

        oversized = BED12(transcriptomic=True)
        oversized.chrom = self.tr.id
        self.assertTrue(oversized.transcriptomic)
        oversized.start = 0
        oversized.strand = "+"
        oversized.end = self.tr.cdna_length + 10
        oversized.thick_start = 101
        oversized.thick_end = 190
        self.assertEqual(oversized.chrom, oversized.id, oversized.id)

        with self.assertLogs("null", "WARNING") as captured:
            retrieval.load_orfs(self.tr, [oversized])

        wanted = "Wrong ORF for {}:".format(self.tr.id)
        self.assertTrue(any(wanted in entry for entry in captured.output),
                        captured.output)

    def test_load_invalid_multiple(self):
        """Only the valid ORF of a valid/invalid pair must be loaded.

        The second record is a copy of the first with its thick interval moved
        to 1-89 and phase 0, which the BED12 object itself reports as invalid.
        After ``load_orfs`` the transcript must hold a single internal ORF.
        """

        good_orf = BED12(transcriptomic=True)
        good_orf.chrom = self.tr.id
        good_orf.name = "valid"
        last_base = self.tr.cdna_length - 1
        good_orf.start = 0
        good_orf.end = last_base
        good_orf.strand = "+"
        good_orf.thick_start = 101
        good_orf.thick_end = 190

        bad_orf = good_orf.copy()
        bad_orf.name = "invalid"
        bad_orf.thick_start = 1
        bad_orf.thick_end = 89
        bad_orf.phase = 0

        self.assertTrue(bad_orf.invalid)
        self.assertFalse(good_orf.invalid, good_orf.invalid_reason)

        # assertLogs also requires at least one record on the "null" logger.
        with self.assertLogs("null", "DEBUG"):
            retrieval.load_orfs(self.tr, [good_orf, bad_orf])

        self.assertEqual(self.tr.number_internal_orfs, 1)

    def test_filter_non_transcriptomic(self):
        """``find_overlapping_cds`` must drop a record that is not transcriptomic.

        Two otherwise identical ORF records are submitted, one of them with
        ``transcriptomic`` switched off; only the other may be returned.
        """

        kept = BED12(transcriptomic=True)
        kept.chrom = self.tr.id
        kept.name = "valid"
        last_base = self.tr.cdna_length - 1
        kept.start = 0
        kept.end = last_base
        kept.strand = "+"
        kept.thick_start = 101
        kept.thick_end = 190

        dropped = kept.copy()
        dropped.name = "non-transcriptomic"
        dropped.transcriptomic = False

        candidates = [dropped, kept]
        retained = retrieval.find_overlapping_cds(self.tr, candidates)
        self.assertEqual(retained, [kept])
Esempio n. 42
0
def main():
    """Command-line entry point.

    Parses ``--max-intron``, the input annotation (``gff``) and the optional
    output file (``out``, standard output by default). It then walks the
    records of the annotation, collecting exon/CDS records into transcripts
    and, for GFF3 input, transcripts into genes, printing them as it goes.

    Raises:
        ValueError: if a negative maximum intron length is given.
    """

    # The first positional parameter of ArgumentParser is ``prog``; the module
    # docstring belongs in ``description``.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-mi",
                        "--max-intron",
                        default=10000,
                        dest="max_intron",
                        type=int,
                        help="Maximum intron length for UTR introns.")
    parser.add_argument("gff", type=parser_factory)
    parser.add_argument("out",
                        default=sys.stdout,
                        type=argparse.FileType("wt"),
                        nargs="?")
    args = parser.parse_args()

    if args.max_intron < 0:
        raise ValueError("Max intron length <0 specified! {0}".format(
            args.max_intron))

    # The output format follows the type of the input parser.
    ref_gff = isinstance(args.gff, GFF3)
    form = "gff3" if ref_gff else "gtf"

    current = None             # gene being assembled (GFF3 input only)
    current_transcript = None  # transcript currently receiving exon records

    last_header = []
    for record in args.gff:
        if record.header is True:
            # A header record flushes the gene assembled so far.
            if current is not None:
                current = remove_introns(current, args)
                print(current.format(form), file=args.out)
                print(*last_header, sep="\n", file=args.out, end='')
                current = None
                current_transcript = None
            # NOTE(review): this print has no file= argument, so the previous
            # header goes to standard output rather than args.out - confirm
            # that this is intended.
            print(*last_header, sep="\n", end="")
            last_header = [record]
            continue
        if record.feature not in ("gene", "mRNA", "CDS", "exon"):
            continue

        if record.is_gene is True and ref_gff:
            print(record, file=sys.stderr)
            last_header = []
            if current is not None:
                # Intron removal is not applied in this branch.
                print(current.format(form), file=args.out)
                # NOTE(review): last_header was emptied just above, so this
                # print writes nothing.
                print(*last_header, sep="\n", file=args.out, end='')
                current = None
                current_transcript = None
        if record.is_transcript:
            if ref_gff is False:
                if current_transcript is not None:
                    # Intron removal is not applied in this branch; the
                    # transcript is printed through str() rather than
                    # format(form).
                    assert current_transcript.combined_cds_length > 0
                    print(current_transcript, file=args.out)
                    print(*last_header, sep="\n", file=args.out)
                    last_header = []
            elif ref_gff is True:
                # The previous transcript is attached to the gene only when
                # the next transcript record arrives.
                if current_transcript is not None:
                    if current is None:
                        current = Gene(current_transcript)
                        current.add(current_transcript)
                    else:
                        # NOTE(review): this asserts that the transcript's
                        # parent differs from the gene it is about to be added
                        # to; the comparison looks inverted - confirm.
                        assert current_transcript.parent[0] != current.id
                        current.add(current_transcript)

            current_transcript = Transcript(record)
        elif record.is_exon:
            if record.feature not in ("CDS", "exon"):
                continue
            current_transcript.add_exon(record)

    # Flush whatever is still pending once the input is exhausted.
    if ref_gff and current is not None:
        print(*last_header, sep="\n", file=args.out)
        last_header = []
        current = remove_introns(current, args)
        print(current.format(form), file=args.out)
    elif not ref_gff and current_transcript is not None:
        current_transcript = remove_introns_from_transcr(
            current_transcript, args)
        print(current_transcript.format(form), file=args.out)
        print(*last_header, sep="\n", file=args.out, end='')
Esempio n. 43
0
def main():
    """Command-line entry point: stream an annotation file and print its models.

    Command-line arguments (read from ``sys.argv``):

    * ``-mi/--max-intron``: integer, default 10000; must not be negative.
    * ``gff``: input annotation, converted by ``to_gff``.
    * ``out``: optional output file opened in text-write mode; defaults to
      standard output.

    The input is treated as GFF3 when ``to_gff`` returns a ``GFF3`` instance
    and as GTF otherwise.

    * GFF3: transcripts are accumulated into a ``Gene``. The pending gene is
      printed when a header line or a new gene record is met, and at the end
      of the input.
    * GTF: each transcript is printed when the next transcript record is met;
      the last one is printed at the end of the input.

    Raises:
        ValueError: if ``--max-intron`` is negative, or if an exon/CDS record
            is met while no transcript is being built.
        AssertionError: from the sanity checks on the pending transcript.
    """

    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("-mi", "--max-intron", default=10000, dest="max_intron",
                        type=int, help="Maximum intron length for UTR introns.")
    parser.add_argument("gff", type=to_gff)
    parser.add_argument("out", default=sys.stdout, type=argparse.FileType("wt"), nargs="?")
    args = parser.parse_args()

    if args.max_intron < 0:
        raise ValueError("Max intron length <0 specified! {0}".format(args.max_intron))

    ref_gff = isinstance(args.gff, GFF3)
    form = "gff3" if ref_gff else "gtf"

    current = None             # Gene being assembled (only used for GFF3 input)
    current_transcript = None  # Transcript currently receiving exon/CDS records
    last_header = []           # most recent header record not yet written out

    for record in args.gff:
        if record.header is True:
            if current is not None:
                # A header closes the pending gene: trim it and write it out.
                current = remove_introns(current, args)
                print(current.format(form), file=args.out)
                print(*last_header, sep="\n", file=args.out, end='')
                current = None
                current_transcript = None
            # NOTE(review): this print has no ``file=`` argument, so it writes to
            # standard output rather than to args.out - confirm this is intended.
            print(*last_header, sep="\n", end="")
            last_header = [record]
            continue

        if record.feature not in ("gene", "mRNA", "CDS", "exon"):
            continue

        if record.is_gene is True and ref_gff:
            print(record, file=sys.stderr)
            # NOTE(review): last_header is emptied *before* the print further
            # down, so that print never emits a header. Also, unlike the header
            # branch and the end-of-input flush, remove_introns is not applied
            # here, and the transcript still pending in current_transcript is
            # discarded rather than added to the gene - confirm the intent.
            last_header = []
            if current is not None:
                print(current.format(form), file=args.out)
                print(*last_header, sep="\n", file=args.out, end='')
                current = None
                current_transcript = None

        if record.is_transcript:
            if current_transcript is not None:
                if not ref_gff:
                    # NOTE(review): the transcript is printed through str() and
                    # without remove_introns_from_transcr, whereas the last
                    # transcript of the file (below) is trimmed and printed via
                    # .format(form) - confirm which behaviour is wanted.
                    assert current_transcript.combined_cds_length > 0
                    print(current_transcript, file=args.out)
                    print(*last_header, sep="\n", file=args.out)
                    last_header = []
                elif current is None:
                    current = Gene(current_transcript)
                    current.add(current_transcript)
                else:
                    # NOTE(review): this requires the transcript's first parent
                    # to DIFFER from the gene id before adding it to that gene;
                    # the comparison looks inverted - verify against Gene.
                    assert current_transcript.parent[0] != current.id
                    current.add(current_transcript)

            current_transcript = Transcript(record)
        elif record.is_exon:
            if record.feature not in ("CDS", "exon"):
                continue
            if current_transcript is None:
                # Fail with an explicit message instead of an AttributeError
                # on None when an exon/CDS line has no transcript to attach to.
                raise ValueError(
                    "Exon/CDS record found before any transcript record: {0}".format(record))
            current_transcript.add_exon(record)

    # End of input: flush whatever is still pending.
    # NOTE(review): in GFF3 mode the transcript held in current_transcript is
    # not added to ``current`` before this final print - confirm.
    if ref_gff and current is not None:
        print(*last_header, sep="\n", file=args.out)
        last_header = []
        current = remove_introns(current, args)
        print(current.format(form), file=args.out)
    elif not ref_gff and current_transcript is not None:
        current_transcript = remove_introns_from_transcr(current_transcript,
                                                         args)
        print(current_transcript.format(form), file=args.out)
        print(*last_header, sep="\n", file=args.out, end='')
Esempio n. 44
0
    def test_removal_after_padding(self):
        """Check which of three transcripts survive padding in a locus.

        Given three transcripts, the first one is expected to be expanded to be
        identical to the second; the second is expected to be removed as
        redundant; the third is expected to be expanded to be compatible with
        the padded first.
        """

        # NOTE(review): the logger name refers to a different test
        # ("test_add_two_partials"); it only affects log labelling.
        logger = create_default_logger("test_add_two_partials", "INFO")
        json_conf = load_and_validate_config(None)
        json_conf.reference.genome = self.fai
        # Alternative-splicing settings used for this scenario; padding is enabled.
        json_conf.pick.alternative_splicing.min_cds_overlap = 0.2
        json_conf.pick.alternative_splicing.min_cdna_overlap = 0.2
        json_conf.pick.alternative_splicing.only_confirmed_introns = False
        json_conf.pick.alternative_splicing.keep_retained_introns = True
        json_conf.pick.alternative_splicing.pad = True
        # The only requirement is a "gt 0" SizeFilter on cdna_length.
        json_conf.scoring.requirements.expression = ["cdna_length"]
        json_conf.scoring.requirements.parameters = {
            "cdna_length": SizeFilter(operator="gt", value=0)
        }
        json_conf.scoring.requirements._expression = json_conf.scoring.requirements._create_expression(
            json_conf.scoring.requirements.expression,
            json_conf.scoring.requirements.parameters)
        json_conf.scoring.as_requirements.expression = ["cdna_length"]
        json_conf.scoring.as_requirements.parameters = {
            "cdna_length": SizeFilter(operator="gt", value=0)
        }
        # NOTE(review): the as_requirements expression is built from the
        # `requirements` fields, not the `as_requirements` ones. Both were set to
        # the same values above, so the result is presumably equivalent - confirm.
        json_conf.scoring.as_requirements._expression = json_conf.scoring.requirements._create_expression(
            json_conf.scoring.requirements.expression,
            json_conf.scoring.requirements.parameters)

        # Reference transcript, 11 blocks (BED12 line split with a backslash
        # continuation inside the string literal).
        t1 = Transcript(
            BED12(
                "Chr5\t26584779\t26587869\tID=AT5G66610.1;coding=True;phase=0\t0\t+\t26585222\t26587755\t0\t11\t\
100,54,545,121,78,105,213,63,119,59,443\t0,440,565,1202,1437,1640,1858,2154,2304,2507,2647"
            ))

        # 6-block transcript starting 6 bp upstream of t1 (BED start 26584773).
        t2_1 = Transcript(
            BED12(
                "Chr5\t26584773\t26586510\tID=AT5G66610.2_1;coding=True;phase=0\t0\t+\t26585222\t26586510\t0\t\
6\t177,54,545,121,78,85\t0,446,571,1208,1443,1652"))
        # 10-block transcript starting downstream of t1 (BED start 26584873).
        t2_2 = Transcript(
            BED12(
                "Chr5\t26584873\t26587782\tID=AT5G66610.2_2;coding=True;phase=0\t0\t+\t26585222\t\
26587755\t0\t10\t77,54,545,121,78,99,213,63,119,496\t0,346,471,1108,1343,1552,1764,2060,2210,2413"
            ))

        t1.finalize()
        t2_1.finalize()
        t2_2.finalize()
        t1.is_reference = True
        # The Transcript start is the BED12 start (26584779) plus one.
        self.assertEqual(t1.start, 26584780)
        locus = Locus(t1, logger=logger, configuration=json_conf)
        locus.add_transcript_to_locus(t2_1, check_in_locus=False)
        locus.add_transcript_to_locus(t2_2, check_in_locus=False)
        self.assertTrue(locus.primary_transcript_id == t1.id)
        locus.logger.setLevel("INFO")
        # Scores are supplied explicitly: t1 > t2_1 > t2_2.
        locus.finalize_alternative_splicing(_scores={
            t1.id: 20,
            t2_1.id: 15,
            t2_2.id: 10
        })
        self.assertIn(t1.id, locus.transcripts)
        if t2_1.id in locus.transcripts:
            # t2_1 was expected to be removed: print the pairwise class codes as
            # a debugging aid before the assertion below fails.
            for tid1, tid2 in itertools.combinations(locus.transcripts.keys(),
                                                     2):
                res, _ = Assigner.compare(locus[tid1], locus[tid2])
                print(tid1, tid2, res.ccode)
            self.assertNotIn(t2_1.id, locus.transcripts.keys(),
                             [(key, val.start, val.end)
                              for key, val in locus.transcripts.items()])

        # t2_2 must still be in the locus and be flagged as padded.
        self.assertIn(t2_2.id, locus.transcripts,
                      "\n".join(tr.format("bed12") for tr in locus))
        self.assertTrue(locus[t2_2.id].attributes["padded"])
        # self.assertTrue(locus[t1.id].attributes["padded"])
        # The t1 stored in the locus must start at or before the original t1 start.
        self.assertGreaterEqual(t1.start, locus[t1.id].start,
                                locus[t1.id].format("bed12"))
        # After padding, t2_2 and t1 in the locus must share both start and end.
        self.assertEqual(
            locus[t2_2.id].start,
            locus[t1.id].start,
            ((locus[t2_2.id].start, t1.start, t2_1.start, t2_2.start),
             (locus[t2_2.id].end, t1.end, t2_1.end, t2_2.end)),
        )
        self.assertEqual(locus[t1.id].end, locus[t2_2.id].end)