Ejemplo n.º 1
0
    def test_exon_switching_pos_noCDS(self):
        """Exon switching must not count as a retained intron ("+" strand, no CDS on t2).

        t1 and t2 share their first three exons and differ only in the
        terminal one. t1 carries a CDS, t2 does not. The test expects zero
        retained introns to be reported for t2.
        """

        common_exons = [(101, 500), (801, 1000), (1201, 1300)]

        coding = Transcript()
        coding.chrom = 1
        coding.strand = "+"
        coding.id = "t1"
        coding.add_exons(common_exons + [(2501, 2800)])
        # Trailing comments give the length of each CDS segment.
        coding_cds = [
            (201, 500),  # 300
            (801, 1000),  # 200
            (1201, 1300),  # 100
            (2501, 2530),  # 30
        ]
        coding.add_exons(coding_cds, features="CDS")
        coding.finalize()

        # t2 deliberately gets exons only: this test covers the CDS-less case.
        switched = Transcript()
        switched.chrom = 1
        switched.strand = "+"
        switched.id = "t2"
        switched.add_exons(common_exons + [(1501, 1800)])
        switched.finalize()

        locus = Superlocus(coding, json_conf=self.my_json)
        locus.add_transcript_to_locus(switched)
        locus.find_retained_introns(switched)

        checked = locus.transcripts["t2"]
        # The retained introns double as the failure message.
        self.assertEqual(checked.retained_intron_num, 0,
                         checked.retained_introns)
Ejemplo n.º 2
0
    def test_not_retained_neg(self):
        """Make sure a spurious retained intron is not reported ("-" strand).

        The first exon of t2 (301-1000) spans the t1 intron lying between
        positions 500 and 801, and the CDS of t2 (471-1000) covers that whole
        stretch. The test expects no retained intron to be reported for t2.
        """

        reference = Transcript()
        reference.chrom = 1
        reference.strand = "-"
        reference.id = "t1"
        reference.add_exons(
            [(101, 500), (801, 1000), (1201, 1300), (1501, 1800)])
        # Trailing comments give the length of each CDS segment.
        reference_cds = [
            (201, 500),  # 300
            (801, 1000),  # 200
            (1201, 1300),  # 100
            (1501, 1530),  # 30
        ]
        reference.add_exons(reference_cds, features="CDS")
        reference.finalize()

        candidate = Transcript()
        candidate.chrom = 1
        candidate.strand = "-"
        candidate.id = "t2"
        candidate.add_exons([(301, 1000), (1201, 1300), (1501, 1800)])
        candidate_cds = [
            (1501, 1530),  # 30
            (1201, 1300),  # 100
            (471, 1000),  # 530
        ]
        candidate.add_exons(candidate_cds, features="CDS")
        candidate.finalize()

        locus = Superlocus(reference, json_conf=self.my_json)
        locus.add_transcript_to_locus(candidate)
        locus.find_retained_introns(candidate)

        self.assertEqual(locus.transcripts["t2"].retained_intron_num, 0)
Ejemplo n.º 3
0
    def test_real_retained_pos_noCDS(self):
        """A genuine retained intron must be reported even when t2 has no CDS.

        The last exon of t2 (1201-1600) runs across the t1 intron lying
        between positions 1300 and 1501. The test expects exactly that exon
        to be listed in the retained introns of t2.
        """

        reference = Transcript()
        reference.chrom = 1
        reference.strand = "+"
        reference.id = "t1"
        reference.add_exons(
            [(101, 500), (801, 1000), (1201, 1300), (1501, 1800)])
        # Trailing comments give the length of each CDS segment.
        reference_cds = [
            (201, 500),  # 300
            (801, 1000),  # 200
            (1201, 1300),  # 100
            (1501, 1530),  # 30
        ]
        reference.add_exons(reference_cds, features="CDS")
        reference.finalize()

        # t2 deliberately gets exons only: this test covers the CDS-less case.
        retaining = Transcript()
        retaining.chrom = 1
        retaining.strand = "+"
        retaining.id = "t2"
        retaining.add_exons([(101, 500), (801, 1000), (1201, 1600)])
        retaining.finalize()

        locus = Superlocus(reference, json_conf=self.my_json)
        locus.add_transcript_to_locus(retaining)
        locus.find_retained_introns(retaining)

        expected = ((1201, 1600), )
        self.assertEqual(locus.transcripts["t2"].retained_introns, expected)
Ejemplo n.º 4
0
    def test_mixed_strands(self):
        """No retained intron may be reported when the two strands differ.

        t1 lies on "+" and t2 on "-"; both are placed in one Superlocus
        created with ``stranded=False``. The test expects zero retained
        introns for t2.
        """

        plus_model = Transcript()
        plus_model.chrom = 1
        plus_model.strand = "+"
        plus_model.id = "t1"
        plus_model.add_exons(
            [(101, 500), (801, 1000), (1201, 1300), (1501, 1800)])
        # Trailing comments give the length of each CDS segment.
        plus_cds = [
            (201, 500),  # 300
            (801, 1000),  # 200
            (1201, 1300),  # 100
            (1501, 1530),  # 30
        ]
        plus_model.add_exons(plus_cds, features="CDS")
        plus_model.finalize()

        minus_model = Transcript()
        minus_model.chrom = 1
        minus_model.strand = "-"
        minus_model.id = "t2"
        minus_model.add_exons([(601, 1000), (1201, 1300), (1501, 1800)])
        minus_cds = [
            (1501, 1530),  # 30
            (1201, 1300),  # 100
            (771, 1000),  # 230
        ]
        minus_model.add_exons(minus_cds, features="CDS")
        minus_model.finalize()

        locus = Superlocus(plus_model, json_conf=self.my_json, stranded=False)
        locus.add_transcript_to_locus(minus_model)
        locus.find_retained_introns(minus_model)

        self.assertEqual(locus.transcripts["t2"].retained_intron_num, 0)
Ejemplo n.º 5
0
    def test_load_orfs(self):
        """Check that ``Superlocus.get_orfs`` returns only the requested query's ORF.

        Two transcripts ("foo" on Chr1 and its copy "foo.2" on Chr2) are
        stored with one ORF each in an in-memory SQLite database. ``get_orfs``
        is then called with the id of the first query only. The test expects
        a single BED12 entry, filed under "foo", equal to the original
        transcriptomic ORF.
        """

        # 12-column, tab-separated description of the transcript "foo".
        bed_line = ('Chr1\t100\t2000\tID=foo;coding=True;phase=0'
                    '\t0\t+\t300\t1850\t0\t4\t400,400,400,200\t0,500,1100,1700')
        transcript = Transcript(bed_line)
        orf = transcript.orfs[0].to_transcriptomic()

        # Same model under another name and chromosome.
        sibling = transcript.copy()
        sibling.unfinalize()
        sibling.chrom = "Chr2"
        sibling.id = "foo.2"
        sibling.finalize()
        sibling_orf = sibling.orfs[0].to_transcriptomic()

        engine = create_engine("sqlite:///:memory:")
        db.metadata.create_all(engine)
        session = sessionmaker(bind=engine)()

        # Queries are committed before their ids are read to build the Orf rows.
        query = Query(transcript.id, transcript.cdna_length)
        sibling_query = Query(sibling.id, sibling.cdna_length)
        session.add_all([query, sibling_query])
        session.commit()

        stored_orf = Orf(orf, query.query_id)
        self.assertEqual(stored_orf.thick_end, orf.thick_end)
        self.assertEqual(stored_orf.cds_len, orf.cds_len)
        stored_sibling_orf = Orf(sibling_orf, sibling_query.query_id)
        session.add_all([stored_orf, stored_sibling_orf])
        session.commit()

        locus = Superlocus(transcript)
        locus.session = session
        loaded = asyncio.run(locus.get_orfs([query.query_id]))

        self.assertEqual(len(loaded), 1)
        self.assertIn(transcript.id, loaded)
        self.assertEqual(len(loaded[transcript.id]), 1)
        retrieved = loaded[transcript.id][0]
        self.assertIsInstance(retrieved, BED12, type(retrieved))
        mismatch_report = "\n" + "\n".join([str(orf), str(retrieved)])
        self.assertTrue(retrieved == orf, mismatch_report)
Ejemplo n.º 6
0
    def setUp(self):
        """Create the fixtures used by the tests and sanity-check them on the way.

        Builds ``self.transcript1`` (monoexonic, "t0") and ``self.transcript2``
        (three exons, "t1") from GFF rows, then loads "configuration.yaml"
        from this file's directory into ``self.my_json``. Several assertions
        about finalisation and modification are made while building them.
        """

        mono_rows = (
            "Chr1\tfoo\ttranscript\t101\t300\t.\t+\t.\tID=t0",
            "Chr1\tfoo\texon\t101\t300\t.\t+\t.\tID=t0:exon1;Parent=t0",
            "Chr1\tfoo\tCDS\t101\t250\t.\t+\t.\tID=t0:exon1;Parent=t0",
        )
        mono_lines = [GFF.GffLine(row) for row in mono_rows]
        mono_header, *mono_features = mono_lines
        self.assertEqual(mono_header.chrom, "Chr1", mono_header)
        self.transcript1 = Transcript(mono_header)
        for feature in mono_features:
            self.transcript1.add_exon(feature)
        self.transcript1.finalize()
        self.assertTrue(self.transcript1.monoexonic)
        self.assertEqual(self.transcript1.chrom, mono_header.chrom)

        multi_rows = (
            "Chr1\tfoo\ttranscript\t101\t600\t.\t+\t.\tID=t1",
            "Chr1\tfoo\texon\t101\t200\t.\t+\t.\tID=t1:exon1;Parent=t1",
            "Chr1\tfoo\texon\t301\t400\t.\t+\t.\tID=t1:exon2;Parent=t1",
            "Chr1\tfoo\texon\t501\t600\t.\t+\t.\tID=t1:exon3;Parent=t1",
        )
        multi_lines = [GFF.GffLine(row) for row in multi_rows]
        multi_header, *multi_exons = multi_lines
        *leading_exons, closing_exon = multi_exons
        self.transcript2 = Transcript(multi_header, logger=self.logger)

        for exon in leading_exons:
            self.transcript2.add_exon(exon)
        # With the last exon missing, the exons stop short of the declared
        # transcript end: finalising must log at WARNING level or above on
        # the "null" logger ...
        with self.assertLogs("null", level="WARNING"):
            self.transcript2.finalize()
        # ... and adding the missing exon afterwards must be refused.
        with self.assertRaises(exceptions.ModificationError):
            self.transcript2.add_exon(closing_exon)

        # Reopen the transcript by hand, restore its boundaries and complete it.
        self.transcript2.finalized = False
        self.transcript2.start = 101
        self.transcript2.end = 600
        self.transcript2.add_exon(closing_exon)
        self.transcript2.finalize()
        self.assertFalse(self.transcript2.monoexonic)
        self.assertEqual(self.transcript2.exon_num, len(multi_lines) - 1)

        # A finalized transcript must reject further exons.
        with self.assertRaises(exceptions.ModificationError):
            for exon in multi_exons:
                self.transcript2.add_exon(exon)

        # A Superlocus cannot be created without a configuration.
        with self.assertRaises(exceptions.NoJsonConfigError):
            Superlocus(self.transcript1)

        config_path = os.path.join(os.path.dirname(__file__),
                                   "configuration.yaml")
        self.my_json = configurator.to_json(config_path)
        self.assertIn("scoring", self.my_json, self.my_json.keys())
Ejemplo n.º 7
0
    def test_get_external(self):
        """Check which external scores ``Superlocus.get_external`` returns.

        Six external sources (int/float/bool, each in a plain and a "raw"
        flavour) and one score per source are stored in an in-memory SQLite
        database. Three situations are verified in turn:

        1. ``report_all_external_metrics`` is True: all six scores come back.
        2. It is False and the configuration mentions no ``external.*``
           metric: nothing comes back.
        3. It is False and five ``external.*`` metrics are referenced across
           the scoring and requirement sections: exactly those five come
           back; "float", referenced nowhere, is absent.

        In the expected values, the second element of each tuple mirrors the
        third argument given to the matching ``ExternalSource`` (0 -> False,
        1 -> True).
        """
        from collections import namedtuple

        checked_conf = load_and_validate_config(None).copy()
        checked_conf.pick.output_format.report_all_external_metrics = True
        transcript = Transcript()
        transcript.chrom = "15"
        transcript.source = "protein_coding"
        transcript.start = 47631264
        transcript.end = 48051999

        exons = [(47631264, 47631416), (47704590, 47704669),
                 (47762671, 47762742), (47893062, 47893093),
                 (47895572, 47895655), (48051942, 48051999)]

        transcript.strand = "+"
        transcript.add_exons(exons)
        transcript.id = "ENST00000560636"
        transcript.parent = "ENSG00000137872"
        transcript2 = transcript.copy()
        transcript2.id = "ENST00000560637"
        checked_conf.scoring.scoring["attributes.tpm"] = MinMaxScore.Schema(
        ).load({
            "rescaling": "max",
            "default": 0,
            "rtype": "float",
            'multiplier': 4,
            'use_raw': True,
            'percentage': True
        })
        transcript.attributes["tpm"] = 10

        int_source = ExternalSource('int', 'int', 0)
        float_source = ExternalSource('float', 'float', 0)
        bool_source = ExternalSource('bool', 'bool', 0)

        raw_int_source = ExternalSource('raw_int', 'int', 1)
        raw_float_source = ExternalSource('raw_float', 'float', 1)
        raw_bool_source = ExternalSource('raw_bool', 'bool', 1)

        # NOTE(review): the positional arguments below look like
        # (query_id, source_id, score), with ids 1..6 matching the order in
        # which the sources are added to the session further down - confirm
        # against the External model before reordering anything.
        int_score = External(1, 1, 10)
        float_score = External(1, 2, 10.0)
        bool_score = External(
            1, 3, int(False)
        )  # We cast as int here following external.py serialize function

        raw_int_score = External(1, 4, 8)
        raw_float_score = External(1, 5, 8.0)
        raw_bool_score = External(
            1, 6, int(True)
        )  # We cast as int here following external.py serialize function

        query = Query(transcript.id, transcript.cdna_length)
        query2 = Query(transcript2.id, transcript2.cdna_length)

        engine = create_engine("sqlite:///:memory:")
        db.metadata.create_all(engine)
        SessionMaker = sessionmaker(bind=engine)
        session = SessionMaker()
        session.add_all([
            int_source, float_source, bool_source, raw_int_source,
            raw_float_source, raw_bool_source
        ])
        session.add_all([query, query2])
        session.add_all([
            int_score, float_score, bool_score, raw_int_score, raw_float_score,
            raw_bool_score
        ])
        session.commit()
        sup = Superlocus(transcript, configuration=checked_conf)
        sup.session = session
        tid = transcript.id
        self.assertIn(tid, sup.transcripts)

        # Stand-in for the query record with id 1. A real namedtuple
        # *instance* is used; the previous code overwrote the field
        # descriptor on the namedtuple class itself.
        QueryStub = namedtuple('t', field_names=('query_name',))
        qobj = {1: QueryStub(query_name='ENST00000560636')}
        external = asyncio.run(sup.get_external(qobj, [1]))

        self.assertEqual(
            external, {
                'ENST00000560636': {
                    'int': (10, False),
                    'float': (10.0, False),
                    'bool': (False, False),
                    'raw_int': (8, True),
                    'raw_float': (8.0, True),
                    'raw_bool': (True, True)
                }
            })

        sup.configuration.pick.output_format.report_all_external_metrics = False
        external = asyncio.run(sup.get_external(qobj, [1]))
        self.assertEqual(len(external), 0)
        # These are meaningless it's just to verify we are loading *only* these metrics.
        # We should *NOT* have 'float' as it is not present in any section.
        sup.configuration.scoring.scoring["external.int"] = MinMaxScore(
            rescaling="max", filter=None)
        sup.configuration.scoring.requirements.parameters[
            "external.raw_float"] = SizeFilter(operator="gt", value=100)
        sup.configuration.scoring.cds_requirements.parameters[
            "external.raw_int"] = SizeFilter(operator="lt", value=1)
        sup.configuration.scoring.as_requirements.parameters[
            "external.raw_bool"] = SizeFilter(operator="lt", value=1)
        sup.configuration.scoring.not_fragmentary.parameters[
            "external.bool"] = SizeFilter(operator="ne", value=False)
        external = asyncio.run(sup.get_external(qobj, [1]))
        self.assertEqual(
            external, {
                'ENST00000560636': {
                    'int': (10, False),
                    'raw_float': (8.0, True),
                    'bool': (False, False),
                    'raw_int': (8, True),
                    'raw_bool': (True, True)
                }
            })
Ejemplo n.º 8
0
    def test_retrieval(self):
        """Check which stored junctions end up in ``locus_verified_introns``.

        Four junctions are saved in an in-memory SQLite database: one on
        another chromosome, one a million bases upstream, one with the right
        coordinates on the "-" strand and one with the right coordinates on
        the "+" strand. For every combination of transcript strand
        ("+", "-", None) and ``stranded`` flag, the introns loaded by the
        Superlocus are compared with the expected set.
        """
        engine = create_engine("sqlite:///:memory:")
        db.metadata.create_all(engine)
        session = sessionmaker(bind=engine)()

        transcript = Transcript(accept_undefined_multi=True)
        transcript.chrom = "15"
        transcript.source = "protein_coding"
        transcript.start = 47631264
        transcript.end = 48051999
        transcript.strand = "+"
        transcript.add_exons([
            (47631264, 47631416),
            (47704590, 47704669),
            (47762671, 47762742),
            (47893062, 47893093),
            (47895572, 47895655),
            (48051942, 48051999),
        ])
        transcript.id = "ENST00000560636"
        transcript.parent = "ENSG00000137872"
        transcript2 = transcript.copy()
        transcript2.id = "ENST00000560637"

        chrom_one = Chrom("1", 10**8)
        chrom_fifteen = Chrom("15", 5 * 10**8)
        session.add_all([chrom_one, chrom_fifteen])
        session.commit()

        # Intron between the second and the third exon of the transcript.
        intron_start = 47704669 + 1
        intron_end = 47762671 - 1
        shift = 10**6

        # Junction arguments:
        # (junction_start, junction_end, name, strand, score, chrom_id)
        # Right coordinates, wrong chromosome.
        junction_chrom_one = Junction(intron_start, intron_end, "chrom_one",
                                      "+", 10, chrom_one.chrom_id)
        # Right chromosome, but a million bases away.
        outside_chrom_15 = Junction(intron_start - shift, intron_end - shift,
                                    "chrom_15_outside", "+", 10,
                                    chrom_fifteen.chrom_id)
        # Right place, opposite strand.
        wrong_strand_chrom_15 = Junction(intron_start, intron_end,
                                         "chrom_15_wrong_strand", "-", 10,
                                         chrom_fifteen.chrom_id)
        # The matching junction.
        chrom_15_junction = Junction(intron_start, intron_end, "chrom_15",
                                     "+", 10, chrom_fifteen.chrom_id)
        session.add_all([
            junction_chrom_one, outside_chrom_15, wrong_strand_chrom_15,
            chrom_15_junction
        ])
        session.commit()

        self.assertEqual(junction_chrom_one.chrom, "1")
        for stored in (outside_chrom_15, wrong_strand_chrom_15,
                       chrom_15_junction):
            self.assertEqual(stored.chrom, "15")

        for strand in ("+", "-", None):
            for stranded in (True, False):
                transcript.unfinalize()
                transcript.strand = strand
                transcript.finalize()
                sup = Superlocus(transcript, stranded=stranded)
                self.assertTrue(
                    (chrom_15_junction.junction_start, chrom_15_junction.end)
                    in sup.introns, (chrom_15_junction, sup.introns))
                sup.session = session
                asyncio.run(sup._load_introns())

                if not stranded:
                    # Unstranded locus: junctions of both strands are kept.
                    both_strands = {
                        (chrom_15_junction.junction_start,
                         chrom_15_junction.junction_end,
                         chrom_15_junction.strand),
                        (wrong_strand_chrom_15.junction_start,
                         wrong_strand_chrom_15.junction_end,
                         wrong_strand_chrom_15.strand),
                    }
                    self.assertEqual(sup.locus_verified_introns, both_strands,
                                     (stranded, strand))
                elif strand is None:
                    # Stranded locus, transcript without a strand: nothing expected.
                    self.assertEqual(sup.locus_verified_introns, set())
                else:
                    # Stranded locus: one junction, tagged with the transcript strand.
                    same_strand = {(chrom_15_junction.junction_start,
                                    chrom_15_junction.junction_end, strand)}
                    self.assertEqual(sup.locus_verified_introns, same_strand,
                                     (stranded, strand))
Ejemplo n.º 9
0
    def test_locus(self):
        """Exercise the basic Superlocus -> subloci -> monosubloci -> loci chain.

        The two fixture transcripts are placed in one Superlocus. The test
        expects two subloci, two monosubloci and a single final locus whose
        first transcript is "t0". A second Superlocus, built from a
        monoexonic transcript on the "-" strand, must also yield one locus.
        """

        log = create_null_logger("null")
        log.setLevel("WARNING")
        log.info("Started")

        superlocus = Superlocus(self.transcript1,
                                json_conf=self.my_json,
                                logger=log)
        superlocus.add_transcript_to_locus(self.transcript2)
        self.assertEqual(superlocus.strand, self.transcript1.strand)
        leftmost = min(self.transcript1.start, self.transcript2.start)
        self.assertEqual(superlocus.start, leftmost)
        rightmost = max(self.transcript1.end, self.transcript2.end)
        self.assertEqual(superlocus.end, rightmost)
        log.info(superlocus.transcripts)

        superlocus.define_subloci()
        log.info(superlocus.subloci)
        log.info(superlocus.transcripts)
        self.assertEqual(len(superlocus.transcripts), 2)
        self.assertEqual(len(superlocus.subloci), 2)

        superlocus.define_monosubloci()
        self.assertEqual(len(superlocus.monosubloci), 2)

        superlocus.define_loci()
        self.assertEqual(len(superlocus.loci), 1)
        first_locus_key = list(superlocus.loci.keys())[0]
        first_locus = superlocus.loci[first_locus_key]
        first_transcript_id = list(first_locus.transcripts.keys())[0]
        self.assertEqual(first_transcript_id, "t0")

        # Monoexonic transcript on the opposite strand, in its own superlocus.
        minus_rows = (
            "Chr1\tfoo\ttranscript\t101\t200\t.\t-\t.\tID=tminus0",
            "Chr1\tfoo\texon\t101\t200\t.\t-\t.\tID=tminus0:exon1;Parent=tminus0",
        )
        minus_header, *minus_exons = [GFF.GffLine(row) for row in minus_rows]
        transcript3 = Transcript(minus_header)
        for exon in minus_exons:
            transcript3.add_exon(exon)
        transcript3.finalize()

        minus_superlocus = Superlocus(transcript3, json_conf=self.my_json)
        minus_superlocus.define_loci()
        self.assertEqual(len(minus_superlocus.loci), 1)
        self.assertTrue(transcript3.strand != self.transcript1.strand)