def test_codon_finder_negative_2(self):
        """Check codon flags on a minus-strand model with a shortened CDS.

        The CDS features stop 3 bp short of the exon boundaries at both
        transcript ends (5766 vs 5769 and 5338 vs 5335). The plain transcript
        is expected to report both codons, while the checker is expected to
        keep the model coding but report neither a start nor a stop codon
        once the ORF has been checked.
        """
        annotation = """Chr5	TAIR10	mRNA	5335	5769	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	CDS	5697	5766	.	-	0	transcript_id "AT5G01015.1"; gene_id "AT5G01015";;
Chr5	TAIR10	exon	5697	5769	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	CDS	5338	5576	.	-	1	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	exon	5335	5576	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";"""

        # First record is the transcript itself, the remaining ones its features.
        records = [GtfLine(row) for row in annotation.split("\n")]
        transcript = Transcript(records[0])
        transcript.add_exons(records[1:])
        transcript.finalize()

        # Genomic slice covering the whole transcript (1-based, inclusive).
        genomic_seq = self.genome[transcript.chrom][transcript.start - 1:transcript.end]
        test_logger = create_default_logger("test_codon_finder_negative_2",
                                            level="WARNING")

        # Before the sequence is inspected, both codons are flagged as present.
        for flag in (transcript.has_start_codon, transcript.has_stop_codon):
            self.assertTrue(flag)

        checker = TranscriptChecker(transcript, genomic_seq, logger=test_logger)
        checker.finalize()
        checker.check_orf()

        self.assertTrue(checker.is_coding)
        for key in ("has_stop_codon", "has_start_codon"):
            self.assertIn(key, checker.attributes)
        self.assertFalse(checker.has_stop_codon)
        self.assertFalse(checker.has_start_codon)
    def test_codon_finder_negative_3(self):

        gtf_lines = """Chr5	TAIR10	mRNA	5335	5769	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	CDS	5697	5769	.	-	0	transcript_id "AT5G01015.1"; gene_id "AT5G01015";;
Chr5	TAIR10	exon	5697	5769	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	CDS	5335	5576	.	-	1	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	exon	5335	5576	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";"""

        gtf_lines = [GtfLine(_) for _ in gtf_lines.split("\n")]
        t = Transcript(gtf_lines[0])
        t.add_exons(gtf_lines[1:])
        t.finalize()

        seq = self.genome[t.chrom][t.start - 1:t.end]
        correct_seq = "".join(
            """ATGGAGTCTAGCTTGCATAGTGTGATTTTCTTAGGTTTGCTTGCGACGATTCTGGTTACG
ACCAATGGCCAAGGAGACGGGACGGGGCTAAATGCAGAAGAAATGTGGCCAGTGGAGGTG
GGGATGGAGTATAGAGTATGGAGGAGAAAGCTGATGACGCCATTGGAGCTGTGCTTGGAG
TGCAAATGCTGCTCCTCCACCACTTGTGCCACCATGCCTTGCTGTTTCGGCATCAATTGC
CAGCTTCCCAACAAGCCATTTGGCGTTTGTGCCTTTGTTCCCAAGTCATGCCATTGTAAT
TCTTGCTCCATTTGA""".split("\n"))
        logger = create_default_logger("test_codon_finder_negative_3",
                                       level="WARNING")
        tc = TranscriptChecker(t, seq, logger=logger)
        tc.finalize()
        correct_length = (5576 - 5335 + 1) + (5769 - 5697 + 1)
        self.assertEqual(correct_length, len(correct_seq),
                         (correct_length, len(correct_seq)))
        self.assertEqual(tc.cdna_length, correct_length,
                         (correct_length, tc.cdna_length))
        self.assertEqual(len(tc.cdna), tc.cdna_length)
        self.assertEqual(correct_seq, tc.cdna)

        tc.check_orf()
        tc_orfs = tc.find_overlapping_cds(tc.get_internal_orf_beds())
        self.assertEqual(1, len(tc_orfs))
        self.assertTrue(tc_orfs[0].has_stop_codon,
                        (tc_orfs[0], tc_orfs[0].stop_codon))
        self.assertTrue(tc_orfs[0].has_start_codon,
                        (tc_orfs[0], tc_orfs[0].start_codon))

        self.assertTrue(tc.is_coding)
        self.assertIn("has_stop_codon", tc.attributes)
        self.assertIn("has_start_codon", tc.attributes)
        self.assertTrue(tc.has_start_codon, tc.cdna)
        self.assertTrue(tc.has_stop_codon, tc.cdna)
    def test_init(self):
        """Exercise the TranscriptChecker constructor with valid and invalid input.

        Covers: rejection of a missing sequence, rejection of malformed
        ``canonical_splices`` values, the basic properties of a correctly
        built checker, the accepted sequence container types, and
        construction from the first line of the GTF/GFF3 rendering of the
        model.
        """
        # A missing genomic sequence must be refused.
        self.assertRaises(ValueError, TranscriptChecker, self.model, None)

        # Neither a plain string, nor None, nor an integer is a valid splice set.
        for bad_splices in ("AGGT", None, 100):
            self.assertRaises(ValueError,
                              TranscriptChecker,
                              self.model,
                              self.model_fasta,
                              canonical_splices=bad_splices)

        checker = TranscriptChecker(self.model, self.model_fasta)
        checker.finalize()
        self.assertEqual(checker.cdna_length, 1718)
        self.assertEqual(
            sorted(checker.exons),
            sorted((exon[0], exon[1]) for exon in self.model.exons))
        self.assertEqual(
            str(checker.fasta_seq.seq),
            self.model_fasta,
            (type(checker.fasta_seq), type(self.model_fasta),
             len(checker.fasta_seq), len(self.model_fasta)))

        # The sequence may be handed over in several container types.
        with self.subTest(initializer=Bio.Seq.Seq):
            TranscriptChecker(self.model, Bio.Seq.Seq(str(self.model_fasta)))

        with self.subTest(initializer=str):
            TranscriptChecker(self.model, str(self.model_fasta))

        with self.subTest(initializer=pyfaidx.Sequence):
            wrapped = pyfaidx.Sequence(seq=str(self.model_fasta),
                                       name=checker.id)
            TranscriptChecker(self.model, wrapped)

        # Now check initializing with a GFF/GTF line
        for out_format in ("gtf", "gff3"):
            with self.subTest(out_format=out_format):
                first_line = self.model.format(out_format).split("\n", 1)[0]
                try:
                    TranscriptChecker(first_line, self.model_fasta)
                except ValueError:
                    # Re-raise with the offending line as the message.
                    raise ValueError(first_line)
Exemple #4
0
def create_transcript(lines,
                      fasta_seq,
                      start,
                      end,
                      lenient=False,
                      is_reference=False,
                      strand_specific=False,
                      canonical_splices=(("GT", "AG"), ("GC", "AG"), ("AT",
                                                                      "AC")),
                      force_keep_cds=False,
                      logger=None):
    """Function to create the checker.

    A bare ``Transcript`` is populated from the ``lines`` datastore, wrapped
    in a ``TranscriptChecker`` and then finalized, strand-checked and
    ORF-checked. Every failure is logged and converted into a ``None``
    return value; only ``KeyboardInterrupt`` is propagated.

    :param lines: all the exon lines for an object. The keys read here are
                  "tid", "chrom", "strand", "attributes", "parent",
                  "features" and, optionally, "source". "features" maps a
                  feature type to a collection of ``(start, end)`` or
                  ``(start, end, phase)`` lists/tuples.
    :type lines: dict

    :param fasta_seq: genomic sequence of the transcript

    :param start: start position for the transcript
    :type start: int
    :param end: end position for the transcript
    :type end: int

    :type lenient: bool
    :type strand_specific: bool

    :param canonical_splices: the splices considered as canonical for the species.
    :type canonical_splices: list[tuple]

    :param force_keep_cds: boolean. If set to true, coding transcripts that would be flipped are instead excluded.
                           The intention is that this flag will mirror strip_cds.
    :type force_keep_cds: bool

    :param logger: optional logger to use during processing.

    :param is_reference: boolean. If set, the transcript's strand will not be checked.

    :return: the validated checker, or None if the datastore lacks a
             transcript ID or the transcript had to be discarded.
    :rtype: (None|TranscriptChecker)
    """

    if logger is None:
        logger = create_null_logger()

    if "tid" not in lines:
        logger.error("Lines datastore lacks the transcript ID. Exiting.")
        return None

    try:
        logger.debug("Starting with %s", lines["tid"])
        transcript_line = Transcript()
        transcript_line.chrom = lines["chrom"]
        if "source" in lines:
            transcript_line.source = lines["source"]
        transcript_line.strand = lines["strand"]
        transcript_line.attributes.update(lines["attributes"])
        transcript_line.feature = "transcript"
        # The coordinates may arrive in either order.
        transcript_line.start, transcript_line.end = sorted([start, end])
        transcript_line.logger = logger
        # Explicit check rather than ``assert`` so that it is not stripped
        # when Python runs with -O; the exception type is kept so that the
        # dedicated handler below still reports it.
        if lines["tid"] is None:
            raise AssertionError(lines)
        transcript_line.id = lines["tid"]
        transcript_line.parent = lines["parent"]

        for feature, segments in lines["features"].items():
            coords, phases = [], []
            for feat in segments:
                # Each segment must be (start, end) or (start, end, phase).
                if not (isinstance(feat, (list, tuple))
                        and 2 <= len(feat) <= 3):
                    raise exceptions.InvalidTranscript("Invalid feature")
                coords.append((feat[0], feat[1]))
                # Any missing or unrecognised phase is stored as None.
                if len(feat) == 3 and feat[2] in (0, 1, 2, None):
                    phases.append(feat[2])
                else:
                    phases.append(None)
            transcript_line.add_exons(coords, features=feature, phases=phases)

        transcript_object = TranscriptChecker(
            transcript_line,
            fasta_seq,
            lenient=lenient,
            strand_specific=strand_specific,
            canonical_splices=canonical_splices,
            force_keep_cds=force_keep_cds,
            is_reference=is_reference,
            logger=logger)
        logger.debug("Finished adding exon lines to %s", lines["tid"])
        transcript_object.finalize()
        transcript_object.check_strand()
        transcript_object.check_orf()
    except exceptions.IncorrectStrandError:
        logger.info(
            "Discarded %s because of incorrect fusions of splice junctions",
            lines["tid"])
        return None
    except exceptions.InvalidTranscript as exc:
        logger.info(
            "Discarded generically invalid transcript %s, exception: %s",
            lines["tid"], exc)
        return None
    except AssertionError as exc:
        logger.info("Validation failed on %s, assertion failure: %s",
                    lines["tid"], exc)
        return None
    except KeyboardInterrupt:
        # Re-raise the original interrupt, traceback included.
        raise
    except Exception as exc:
        logger.exception(exc)
        return None

    return transcript_object
Exemple #5
0
def create_transcript(lines,
                      fasta_seq,
                      start,
                      end,
                      lenient=False,
                      strand_specific=False,
                      canonical_splices=(("GT", "AG"), ("GC", "AG"), ("AT",
                                                                      "AC")),
                      logger=None):
    """Function to create the checker.

    A bare ``Transcript`` is populated from the ``lines`` datastore, wrapped
    in a ``TranscriptChecker`` and then finalized and strand-checked. Every
    failure is logged and converted into a ``None`` return value; only
    ``KeyboardInterrupt`` is propagated.

    :param lines: all the exon lines for an object. The keys read here are
                  "tid", "chrom", "strand", "attributes", "parent",
                  "features" and, optionally, "source".
    :type lines: dict

    :param fasta_seq: genomic sequence of the transcript

    :param start: start position for the transcript
    :type start: int
    :param end: end position for the transcript
    :type end: int

    :type lenient: bool
    :type strand_specific: bool

    :param canonical_splices: the splices considered as canonical for the species.
    :type canonical_splices: list[tuple]

    :param logger: optional logger to use during processing.

    :return: the validated checker, or None if the datastore lacks a
             transcript ID or the transcript had to be discarded.
    :rtype: (None|TranscriptChecker)
    """

    if logger is None:
        logger = create_null_logger("checker")

    # Without an ID nothing below can be reported meaningfully; honour the
    # documented None return instead of leaking a KeyError to the caller.
    if "tid" not in lines:
        logger.error("Lines datastore lacks the transcript ID. Exiting.")
        return None

    logger.debug("Starting with %s", lines["tid"])

    try:
        transcript_line = Transcript()
        transcript_line.chrom = lines["chrom"]
        if "source" in lines:
            transcript_line.source = lines["source"]
        transcript_line.strand = lines["strand"]
        transcript_line.attributes.update(lines["attributes"])
        transcript_line.feature = "transcript"
        # The coordinates may arrive in either order.
        transcript_line.start, transcript_line.end = sorted([start, end])
        transcript_line.logger = logger
        # Explicit check rather than ``assert`` so that it is not stripped
        # when Python runs with -O; the exception type is unchanged.
        if lines["tid"] is None:
            raise AssertionError(lines)
        transcript_line.id = lines["tid"]
        transcript_line.parent = lines["parent"]

        for feature in lines["features"]:
            transcript_line.add_exons(lines["features"][feature],
                                      features=feature)
        transcript_object = TranscriptChecker(
            transcript_line,
            fasta_seq,
            lenient=lenient,
            strand_specific=strand_specific,
            canonical_splices=canonical_splices,
            logger=logger)
        logger.debug("Finished adding exon lines to %s", lines["tid"])
        transcript_object.finalize()
        transcript_object.check_strand()
    except exceptions.IncorrectStrandError:
        logger.info(
            "Discarded %s because of incorrect fusions of splice junctions",
            lines["tid"])
        transcript_object = None
    except exceptions.InvalidTranscript:
        logger.info("Discarded generically invalid transcript %s",
                    lines["tid"])
        transcript_object = None
    except KeyboardInterrupt:
        # Re-raise the original interrupt, traceback included.
        raise
    except Exception as exc:
        logger.exception(exc)
        transcript_object = None

    logger.debug("Finished with %s", lines["tid"])

    return transcript_object
Exemple #6
0
def create_transcript(lines,
                      fasta_seq,
                      start,
                      end,
                      lenient=False,
                      strand_specific=False,
                      canonical_splices=(("GT", "AG"),
                                         ("GC", "AG"),
                                         ("AT", "AC")),
                      logger=None):
    """Function to create the checker.

    The ``lines`` datastore is turned into a bare ``Transcript``, which is
    handed to ``TranscriptChecker`` and then finalized and strand-checked.
    Failures are logged and reported to the caller as ``None``;
    ``KeyboardInterrupt`` is the only exception allowed to escape.

    :param lines: all the exon lines for an object. The keys read here are
                  "tid", "chrom", "strand", "attributes", "parent",
                  "features" and, optionally, "source".
    :type lines: dict

    :param fasta_seq: genomic sequence of the transcript

    :param start: start position for the transcript
    :type start: int
    :param end: end position for the transcript
    :type end: int

    :type lenient: bool
    :type strand_specific: bool

    :param canonical_splices: the splices considered as canonical for the species.
    :type canonical_splices: list[tuple]

    :param logger: optional logger to use during processing.

    :return: the validated checker, or None if the datastore lacks a
             transcript ID or the transcript had to be discarded.
    :rtype: (None|TranscriptChecker)
    """

    if logger is None:
        logger = create_null_logger("checker")

    # A datastore without an ID cannot be processed or even reported on;
    # return None as documented rather than raising KeyError.
    if "tid" not in lines:
        logger.error("Lines datastore lacks the transcript ID. Exiting.")
        return None

    logger.debug("Starting with %s", lines["tid"])

    try:
        transcript_line = Transcript()
        transcript_line.chrom = lines["chrom"]
        if "source" in lines:
            transcript_line.source = lines["source"]
        transcript_line.strand = lines["strand"]
        transcript_line.attributes.update(lines["attributes"])
        transcript_line.feature = "transcript"
        # The coordinates may arrive in either order.
        transcript_line.start, transcript_line.end = sorted([start, end])
        transcript_line.logger = logger
        # Checked explicitly because ``assert`` statements vanish under -O;
        # the exception type is the one ``assert`` would have raised.
        if lines["tid"] is None:
            raise AssertionError(lines)
        transcript_line.id = lines["tid"]
        transcript_line.parent = lines["parent"]

        for feature in lines["features"]:
            transcript_line.add_exons(lines["features"][feature],
                                      features=feature)
        transcript_object = TranscriptChecker(transcript_line,
                                              fasta_seq,
                                              lenient=lenient,
                                              strand_specific=strand_specific,
                                              canonical_splices=canonical_splices,
                                              logger=logger)
        logger.debug("Finished adding exon lines to %s", lines["tid"])
        transcript_object.finalize()
        transcript_object.check_strand()
    except exceptions.IncorrectStrandError:
        logger.info("Discarded %s because of incorrect fusions of splice junctions",
                    lines["tid"])
        transcript_object = None
    except exceptions.InvalidTranscript:
        logger.info("Discarded generically invalid transcript %s",
                    lines["tid"])
        transcript_object = None
    except KeyboardInterrupt:
        # Propagate the interrupt itself instead of constructing a new one.
        raise
    except Exception as exc:
        logger.exception(exc)
        transcript_object = None

    logger.debug("Finished with %s", lines["tid"])

    return transcript_object
    def test_codon_finder_negative_strip_cds(self):
        gtf_lines = """Chr5	TAIR10	mRNA	5335	5769	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	CDS	5697	5769	.	-	0	transcript_id "AT5G01015.1"; gene_id "AT5G01015";;
Chr5	TAIR10	exon	5697	5769	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	CDS	5335	5576	.	-	1	transcript_id "AT5G01015.1"; gene_id "AT5G01015";
Chr5	TAIR10	exon	5335	5576	.	-	.	transcript_id "AT5G01015.1"; gene_id "AT5G01015";"""

        gtf_lines = [GtfLine(_) for _ in gtf_lines.split("\n")]
        t = Transcript(gtf_lines[0])
        t.add_exons(gtf_lines[1:])
        t.finalize()

        seq = str(self.genome[t.chrom][t.start - 1:t.end])
        # Basically insert an internal stop codon. This will make the ORF tests fail, leading to the ORF being stripped
        seq = seq[:72] + Bio.Seq.reverse_complement("TAG") + seq[75:]
        correct_seq = "".join(
            """ATGGAGTCTAGCTTGCATAGTGTGATTTTCTTAGGTTTGCTTGCGACGATTCTGGTTACG
ACCAATGGCCAAGGAGACGGGACGGGGCTAAATGCAGAAGAAATGTGGCCAGTGGAGGTG
GGGATGGAGTATAGAGTATGGAGGAGAAAGCTGATGACGCCATTGGAGCTGTGCTTGGAG
TGCAAATGCTGCTCCTCCACCACTTGTGCCACCATGCCTTGCTGTTTCGGCATCAATTGC
TAGCTTCCCAACAAGCCATTTGGCGTTTGTGCCTTTGTTCCCAAGTCATGCCATTGTAAT
TCTTGCTCCATTTGA""".split("\n"))
        logger = create_default_logger("test_codon_finder_negative_3",
                                       level="WARNING")

        with self.assertRaises(InvalidTranscript):
            for lenient in (False, True):
                tc = TranscriptChecker(t,
                                       seq,
                                       logger=logger,
                                       lenient=lenient,
                                       strip_faulty_cds=False)
                tc.finalize()
                tc.check_orf()

        for lenient in (False, True):
            tc = TranscriptChecker(t,
                                   seq,
                                   logger=logger,
                                   lenient=lenient,
                                   strip_faulty_cds=True)
            tc.finalize()
            correct_length = (5576 - 5335 + 1) + (5769 - 5697 + 1)
            self.assertEqual(correct_length, len(correct_seq),
                             (correct_length, len(correct_seq)))
            self.assertEqual(tc.cdna_length, correct_length,
                             (correct_length, tc.cdna_length))
            self.assertEqual(len(tc.cdna), tc.cdna_length)
            self.assertEqual(correct_seq, tc.cdna)

            tc.check_orf()
            self.assertFalse(tc.is_coding)
            tc_orfs = tc.find_overlapping_cds(tc.get_internal_orf_beds())
            self.assertEqual(1, len(tc_orfs))
            self.assertFalse(tc_orfs[0].has_stop_codon,
                             (tc_orfs[0], tc_orfs[0].stop_codon))
            self.assertFalse(tc_orfs[0].has_start_codon,
                             (tc_orfs[0], tc_orfs[0].start_codon))

            self.assertFalse(tc.is_coding)
            self.assertNotIn("has_stop_codon", tc.attributes)
            self.assertNotIn("has_start_codon", tc.attributes)
            self.assertFalse(tc.has_start_codon, tc.cdna)
            self.assertFalse(tc.has_stop_codon, tc.cdna)