Example #1
0
    def test_merger(self):
        """Check that utilities.merge_partial concatenates partial files in the
        order given by the leading "<num>/" key, strips that key, and removes
        its input files once done."""

        with tempfile.NamedTemporaryFile(suffix=".tmp", mode="wt", delete=False) as first, \
                tempfile.NamedTemporaryFile(suffix=".tmp", mode="wt", delete=False) as second, \
                tempfile.NamedTemporaryFile(suffix=".tmp", mode="wt", delete=False) as third,\
                tempfile.NamedTemporaryFile(suffix=".out", mode="wt", delete=False) as out:
            # Lines are deliberately written out of order and spread across
            # files; merge_partial must reorder them by the numeric prefix.
            print("1/first case", file=first)
            print("3/third case", file=first)
            print("4/fourth case", file=first)
            print("2/second case", file=second)
            print("5/fifth case", file=third)
            print("6/sixth case", file=second)
            first.flush()
            second.flush()
            third.flush()
            logger = create_default_logger("test_merger", level="DEBUG")
            utilities.merge_partial([first.name, second.name, third.name], out, logger=logger)
            with open(out.name) as merged:
                lines = merged.readlines()
            self.assertEqual(len(lines), 6, (out.name, lines))
            expected = ["first", "second", "third", "fourth", "fifth", "sixth"]
            for index, word in enumerate(expected):
                self.assertEqual(lines[index], "{} case\n".format(word), lines)
            # Verify the temporary files have been deleted correctly.
            # BUGFIX: the cleanup used to hide inside the assertion *message*
            # as a side-effectful list comprehension whose value was a useless
            # [None, ...]; remove stragglers explicitly, then assert.
            existing = [handle.name for handle in (first, second, third)
                        if os.path.exists(handle.name)]
            for name in existing:
                os.remove(name)
            self.assertEqual(existing, [], existing)
            self.assertTrue(os.path.exists(out.name))
            os.remove(out.name)
Example #2
0
def main():
    """Sanitise FASTA files: when several inputs are given, prefix every
    record ID with a per-file letter ("a_", "b_", ...), warn on IDs seen
    more than once across inputs, strip descriptions, and write all records
    to a single output stream."""

    logger = create_default_logger("sanitizer")

    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("-o",
                        "--out",
                        default=sys.stdout,
                        type=argparse.FileType("wt"))
    parser.add_argument("fasta", nargs="+", type=argparse.FileType("rt"))
    args = parser.parse_args()

    # Counts how many times each *original* record ID has been seen so far.
    found_ids = Counter()

    # chr(97) == "a"; starter is incremented before use, so prefixes run a_, b_, ...
    starter = 96

    for fasta in args.fasta:
        if len(args.fasta) > 1:
            starter += 1
            prefix = "{}_".format(chr(starter))
        else:
            prefix = ""
        for record in Bio.SeqIO.parse(fasta, "fasta"):
            if record.id in found_ids:
                logger.warning(
                    "ID found other {} time{} in the input files!".format(
                        found_ids[record.id],
                        "s" if found_ids[record.id] > 1 else ""))
            # BUGFIX: the counter was never incremented, so the membership
            # test above was always False and the warning could never fire.
            found_ids.update([record.id])
            record.id = "{}{}".format(prefix, record.id)
            record.description = ""
            Bio.SeqIO.write(record, args.out, "fasta")

    args.out.close()
Example #3
0
def main():
    """Sanitise (optionally gzipped) FASTA files: prefix record IDs with a
    per-file letter when several inputs are given, replace "|" in IDs with
    "_", drop records shorter than --min-length, warn on duplicate IDs, and
    write everything to a single output stream."""

    logger = create_default_logger("sanitizer")

    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("-ml", "--min-length", default=0, type=pos)
    parser.add_argument("-o", "--out", default=sys.stdout, type=argparse.FileType("wt"))
    parser.add_argument("fasta", nargs="+", type=argparse.FileType("rt"))
    args = parser.parse_args()

    # Counts how many times each *original* record ID has been seen so far.
    found_ids = Counter()

    # chr(97) == "a"; starter is incremented before use, so prefixes run a_, b_, ...
    starter = 96

    for fasta in args.fasta:
        # argparse opened the file in text mode; reopen through gzip if needed.
        if fasta.name.endswith(".gz"):
            fasta.close()
            fasta = gzip.open(fasta.name, "rt")

        if len(args.fasta) > 1:
            starter += 1
            prefix = "{}_".format(chr(starter))
        else:
            prefix = ""
        for record in Bio.SeqIO.parse(fasta, "fasta"):
            if record.id in found_ids:
                logger.warning("ID found other {} time{} in the input files!".format(
                    found_ids[record.id], "s" if found_ids[record.id] > 1 else ""))
            # BUGFIX: the counter was never incremented, so the membership
            # test above was always False and the warning could never fire.
            found_ids.update([record.id])
            record.id = "{}{}".format(prefix, re.sub(r"\|", "_", record.id))
            record.description = ""
            if len(record) < args.min_length:
                continue
            Bio.SeqIO.write(record, args.out, "fasta")

    args.out.close()
Example #4
0
def main():
    """Sanitise FASTA files: prefix record IDs with a per-file letter when
    several inputs are given, warn on IDs seen more than once across inputs,
    strip descriptions, and write all records to a single output stream."""

    logger = create_default_logger("sanitizer")

    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("-o", "--out", default=sys.stdout, type=argparse.FileType("wt"))
    parser.add_argument("fasta", nargs="+", type=argparse.FileType("rt"))
    args = parser.parse_args()

    # Counts how many times each *original* record ID has been seen so far.
    found_ids = Counter()

    # chr(97) == "a"; starter is incremented before use, so prefixes run a_, b_, ...
    starter = 96

    for fasta in args.fasta:
        if len(args.fasta) > 1:
            starter += 1
            prefix = "{}_".format(chr(starter))
        else:
            prefix = ""
        for record in Bio.SeqIO.parse(fasta, "fasta"):
            if record.id in found_ids:
                logger.warning("ID found other {} time{} in the input files!".format(
                    found_ids[record.id], "s" if found_ids[record.id] > 1 else ""))
            # BUGFIX: the counter was never incremented, so the membership
            # test above was always False and the warning could never fire.
            found_ids.update([record.id])
            record.id = "{}{}".format(prefix, record.id)
            record.description = ""
            Bio.SeqIO.write(record, args.out, "fasta")

    args.out.close()
Example #5
0
    def __init__(self, out_sq, queue, verbosity="INFO"):
        """Initialise the worker process: wire up queue-based logging and
        create the private SQLite database this worker will write to.

        :param out_sq: path of the SQLite file used to store this worker's results.
        :param queue: multiprocessing queue the worker consumes tasks from.
        :param verbosity: logging level name for this worker's logger.
        """

        super().__init__()
        self.out_sq = out_sq
        # NOTE(review): `logging_queue` is not a parameter — it is presumably a
        # module-level queue shared by all workers; confirm it exists at import time.
        self.logging_queue = logging_queue
        self.logger = create_default_logger("")
        self.log_level = verbosity
        create_queue_logger(self)
        # One engine/session per worker, backed by its own SQLite file,
        # so workers never contend on a shared database connection.
        self.engine = sqlalchemy.create_engine("sqlite:///{}".format(out_sq))
        transfer_base.metadata.create_all(self.engine)
        self.Session = sessionmaker(bind=self.engine)
        self.session = self.Session()
        self.queue = queue
    def test_with_no_gff_utr(self):
        """
        Test the creation of the transcript without the UTR lines, verify that everything is still alright
        :return:
        """
        tr_gff = """Chr2    TAIR10    mRNA    626642    629176    .    +    .    ID=AT2G02380.1;Parent=AT2G02380
Chr2    TAIR10    exon    626642    626780    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    exon    626842    626880    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    626878    626880    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    626963    627059    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    626963    627059    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    627137    627193    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    627137    627193    .    +    2    Parent=AT2G02380.1
Chr2    TAIR10    exon    627312    627397    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    627312    627397    .    +    2    Parent=AT2G02380.1
Chr2    TAIR10    exon    627488    627559    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    627488    627559    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    627696    627749    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    627696    627749    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    627840    627915    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    627840    627915    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    628044    628105    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    628044    628105    .    +    2    Parent=AT2G02380.1
Chr2    TAIR10    exon    628182    628241    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    628182    628241    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    628465    628676    .    +    .    Parent=AT2G02380.1
Chr2    TAIR10    CDS    628465    628569    .    +    0    Parent=AT2G02380.1
Chr2    TAIR10    exon    629070    629176    .    +    .    Parent=AT2G02380.1"""

        tr_lines = tr_gff.split("\n")
        logger = create_default_logger("test")
        logger.setLevel("INFO")
        for pos, line in enumerate(tr_lines):
            tr_lines[pos] = re.sub(r"\s+", "\t", line)
            assert len(tr_lines[pos].split("\t")) == 9, line.split("\t")

        tr_gff_lines = [parsers.GFF.GffLine(line) for line in tr_lines]

        transcript = loci.Transcript(tr_gff_lines[0], logger=logger)
        for line in tr_gff_lines[1:]:
            transcript.add_exon(line)

        self.assertEqual(transcript.exons, self.tr.exons)
        self.assertNotEqual([], transcript.combined_cds)
        transcript.finalize()
        self.assertTrue(transcript.is_coding)
        self.assertEqual(transcript.five_utr, self.tr.five_utr)
        self.assertEqual(transcript.three_utr, self.tr.three_utr)
    def test_split(self):
        """Loading two ORFs and splitting by CDS must yield two transcripts."""

        self.tr.load_orfs([self.bed1, self.bed2])

        # Coordinates of the primary (selected) ORF.
        self.assertEqual(self.tr.selected_cds_start, 15494127)
        self.assertEqual(self.tr.selected_cds_end, 15495994)

        # The combined CDS begins earlier than the selected one
        # (the other CDS starts at 15494127).
        self.assertEqual(self.tr.combined_cds_start, 15490903)

        logger = create_default_logger(self.tr.id)
        logger.setLevel("WARN")
        self.tr.logger = logger

        split_products = list(self.tr.split_by_cds())
        self.assertEqual(len(split_products), 2)
    def test_split(self):
        """Split a minus-strand transcript carrying two ORFs and check the
        boundaries of the resulting transcripts.

        Input ORFs (GFF excerpts):
          orf1: mRNA 2949168-2952410 (-), CDS 2950205-2952208
          orf2: mRNA 2949168-2952410 (-), CDS 2949170-2949868
        Expected split products:
          split1: mRNA 2950205-2952410, CDS 2950205-2952208
          split2: mRNA 2949168-2949868, CDS 2949170-2949868
        """

        self.tr.load_orfs([self.bed1, self.bed2])
        self.assertEqual(self.tr.strand, "-")

        self.assertEqual(self.tr.number_internal_orfs, 2)
        # On the minus strand "start" is the rightmost genomic coordinate.
        self.assertEqual(self.tr.combined_cds_end, 2949170)
        self.assertEqual(self.tr.combined_cds_start, 2952208)
        self.assertEqual(self.tr.selected_cds_start, 2952208)
        self.assertEqual(self.tr.selected_cds_end, 2950205)

        logger = create_default_logger("splitter")
        logger.setLevel("ERROR")
        self.tr.logger = logger

        split_products = sorted(self.tr.split_by_cds())

        self.assertEqual(split_products[0].start, self.tr.start)
        self.assertEqual(split_products[0].end, 2949868,
                         "\n\n".join(str(product) for product in split_products))
    def test_split(self):
        """Split a minus-strand transcript carrying two ORFs and check the
        boundaries of the resulting transcripts.

        Input ORFs (GFF excerpts):
          orf1: mRNA 2949168-2952410 (-), CDS 2950205-2952208
          orf2: mRNA 2949168-2952410 (-), CDS 2949170-2949868
        Expected split products:
          split1: mRNA 2950205-2952410, CDS 2950205-2952208
          split2: mRNA 2949168-2949868, CDS 2949170-2949868
        """

        self.tr.load_orfs([self.bed1, self.bed2])
        self.assertEqual(self.tr.strand, "-")

        self.assertEqual(self.tr.number_internal_orfs, 2)
        # On the minus strand "start" is the rightmost genomic coordinate.
        self.assertEqual(self.tr.combined_cds_end, 2949170)
        self.assertEqual(self.tr.combined_cds_start, 2952208)
        self.assertEqual(self.tr.selected_cds_start, 2952208)
        self.assertEqual(self.tr.selected_cds_end, 2950205)

        logger = create_default_logger("splitter")
        logger.setLevel("ERROR")
        self.tr.logger = logger

        split_products = sorted(self.tr.split_by_cds())

        self.assertEqual(split_products[0].start, self.tr.start)
        self.assertEqual(split_products[0].end, 2949868,
                         "\n\n".join(str(product) for product in split_products))
Example #10
0
    def test_split(self):
        """Splitting a two-ORF transcript must yield two transcripts with the
        expected genomic and CDS boundaries."""
        self.tr.load_orfs([self.bed1, self.bed2])
        self.assertEqual(self.tr.number_internal_orfs, 2)
        logger = create_default_logger("splitter")
        logger.setLevel("ERROR")
        self.tr.logger = logger

        splits = sorted(self.tr.split_by_cds(),
                        key=operator.attrgetter("start", "end"))
        self.assertEqual(len(splits), 2)

        leftmost, rightmost = splits

        self.assertEqual(leftmost.start, 72914)
        self.assertEqual(leftmost.end, 74914)

        self.assertEqual(rightmost.end, 76276, self.tr.internal_orfs)
        self.assertEqual(rightmost.start, 75394)

        self.assertEqual(leftmost.selected_cds_start, 74914)
        self.assertEqual(leftmost.selected_cds_end, 74336)

        self.assertEqual(rightmost.selected_cds_start, 75804)
        self.assertEqual(rightmost.selected_cds_end, 75394)
    def test_split(self):
        """Splitting a two-ORF transcript must yield two transcripts with the
        expected genomic and CDS boundaries."""
        self.tr.load_orfs([self.bed1, self.bed2])
        self.assertEqual(self.tr.number_internal_orfs, 2)
        logger = create_default_logger("splitter")
        logger.setLevel("ERROR")
        self.tr.logger = logger

        splits = sorted(self.tr.split_by_cds(),
                        key=operator.attrgetter("start", "end"))
        self.assertEqual(len(splits), 2)

        leftmost, rightmost = splits

        self.assertEqual(leftmost.start, 72914)
        self.assertEqual(leftmost.end, 74914)

        self.assertEqual(rightmost.end, 76276, self.tr.internal_orfs)
        self.assertEqual(rightmost.start, 75394)

        self.assertEqual(leftmost.selected_cds_start, 74914)
        self.assertEqual(leftmost.selected_cds_end, 74336)

        self.assertEqual(rightmost.selected_cds_start, 75804)
        self.assertEqual(rightmost.selected_cds_end, 75394)
Example #12
0
class MonoHolderTester(unittest.TestCase):
    """Exercise MonosublocusHolder.is_intersecting across overlap scenarios:
    shared CDS, shared introns, intron overlap without CDS, and disjoint loci."""

    logger = create_default_logger("MonoHolderTester")

    def setUp(self):
        """Build the reference transcript t1: four exons on Chr1 (+),
        coding from 401 to 1440."""

        self.conf = dict()

        self.t1 = Transcript()
        self.t1.chrom = "Chr1"
        self.t1.strand = "+"
        self.t1.score = 20
        self.t1.id = "G1.1"
        self.t1.parent = "G1"
        self.t1.start = 101
        self.t1.end = 1500

        self.t1.add_exons(
            [(101, 500), (601, 700), (1001, 1300), (1401, 1500)], "exon")
        self.t1.add_exons(
            [(401, 500), (601, 700), (1001, 1300), (1401, 1440)], "CDS")
        self.t1.finalize()

    def testCdsOverlap(self):
        """A transcript sharing t1's CDS intersects it."""

        other = Transcript()
        other.chrom = "Chr1"
        other.strand = "+"
        other.score = 1
        other.id = "G2.1"
        other.parent = "G2"
        other.start = 101
        other.end = 1600

        other.add_exons(
            [(101, 500), (601, 700), (1001, 1300), (1401, 1460), (1501, 1600)],
            "exon")
        other.add_exons(
            [(401, 500), (601, 700), (1001, 1300), (1401, 1440)], "CDS")
        other.finalize()

        self.assertTrue(MonosublocusHolder.is_intersecting(self.t1, other))

    def test_intronMatch(self):
        """Matching introns make two coding transcripts intersect, with or
        without restricting the comparison to the CDS."""

        other = Transcript()
        other.chrom = "Chr1"
        other.strand = "+"
        other.score = 1
        other.id = "G2.1"
        other.parent = "G2"
        other.start = 101
        other.end = 1600

        other.add_exons(
            [(101, 500), (601, 700), (1001, 1320), (1451, 1460), (1501, 1600)],
            "exon")
        other.add_exons(
            [(401, 500), (601, 700), (1001, 1320), (1451, 1460), (1501, 1510)],
            "CDS")
        other.finalize()

        self.assertTrue(self.t1.is_coding)
        self.assertTrue(other.is_coding)

        self.assertTrue(
            MonosublocusHolder.is_intersecting(self.t1, other,
                                               logger=self.logger))
        self.assertTrue(
            MonosublocusHolder.is_intersecting(self.t1,
                                               other,
                                               cds_only=True,
                                               logger=self.logger))

    def test_intronOverlap(self):
        """Overlapping (non-identical) introns are enough for intersection
        once t1 has been stripped of its CDS."""

        self.t1.strip_cds()
        other = Transcript()
        other.chrom = "Chr1"
        other.strand = "+"
        other.score = 1
        other.id = "G2.1"
        other.parent = "G2"
        other.start = 101
        other.end = 1470
        other.add_exons([(101, 510), (601, 700), (960, 1350), (1420, 1470)])

        other.finalize()
        self.assertTrue(MonosublocusHolder.is_intersecting(self.t1, other))

    def test_noIntronOverlap(self):
        """No shared intron and only marginal exon overlap: no intersection."""

        self.t1.strip_cds()
        other = Transcript()
        other.chrom = "Chr1"
        other.strand = "+"
        other.score = 1
        other.id = "G2.1"
        other.parent = "G2"
        other.start = 1250
        other.end = 2000
        other.add_exons([(1250, 1560), (1800, 2000)])
        other.finalize()
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))

    def test_noCDSOverlap(self):
        """Exonic overlap without CDS overlap: intersecting in general, but
        not when the comparison is restricted to the CDS."""

        self.t1.strip_cds()
        self.assertEqual(self.t1.combined_cds_introns, set())
        self.t1.finalized = False
        # Re-coding t1 over a shorter stretch so the CDSs cannot overlap.
        self.t1.add_exons([(401, 500), (601, 700), (1001, 1100)], "CDS")
        self.t1.finalize()

        other = Transcript()
        other.logger = self.logger
        other.chrom = "Chr1"
        other.strand = "+"
        other.score = 1
        other.id = "G2.1"
        other.parent = "G2"
        other.start = 101
        other.end = 1470
        other.add_exons([(101, 510), (601, 700), (960, 1350), (1421, 1470)])
        other.add_exons([(1201, 1350), (1421, 1450)], "CDS")
        other.finalize()

        self.assertTrue(self.t1.is_coding)
        self.assertTrue(other.is_coding)

        # Sanity check: the two CDS stretches must be disjoint.
        self.assertGreaterEqual(
            0,
            overlap((self.t1.combined_cds_start, self.t1.combined_cds_end),
                    (other.combined_cds_start, other.combined_cds_end)),
            [(self.t1.combined_cds_start, self.t1.combined_cds_end),
             (other.combined_cds_start, other.combined_cds_end)])

        self.assertTrue(
            MonosublocusHolder.is_intersecting(self.t1, other,
                                               logger=self.logger))
        self.assertFalse(
            MonosublocusHolder.is_intersecting(self.t1,
                                               other,
                                               cds_only=True,
                                               logger=self.logger))

    def test_only_CDS_overlap(self):
        """Downstream transcript whose CDS only brushes t1: never intersecting."""

        other = Transcript()
        other.chrom = "Chr1"
        other.strand = "+"
        other.score = 1
        other.id = "G2.1"
        other.parent = "G2"
        other.start = 1250
        other.end = 2000
        other.add_exons([(1250, 1560), (1801, 2000)])
        other.add_exons([(1401, 1560), (1801, 1850)], "CDS")
        other.finalize()
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))

        other.strip_cds()
        other.finalized = False
        other.add_exons([(1461, 1560), (1801, 1850)], "CDS")
        # No CDS overlap this time
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))

    def test_no_overlap(self):
        """Completely disjoint transcripts do not intersect."""

        other = Transcript()
        other.chrom = "Chr1"
        other.strand = "+"
        other.score = 1
        other.id = "G2.1"
        other.parent = "G2"
        other.start = 1600
        other.end = 2000
        other.add_exons([(1600, 1700), (1801, 2000)])
        other.add_exons([(1661, 1700), (1801, 1850)], "CDS")
        other.finalize()
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))

    def test_same_id(self):
        """Two transcripts with the same ID are never considered intersecting."""

        other = Transcript()
        other.chrom = "Chr1"
        other.strand = "+"
        other.score = 1
        other.id = "G1.1"
        other.parent = "G1"
        other.start = 1250
        other.end = 2000
        other.add_exons([(1250, 1560), (1801, 2000)])
        other.add_exons([(1401, 1560), (1801, 1850)], "CDS")
        other.finalize()
        # This fails because they have the same ID
        self.assertFalse(MonosublocusHolder.is_intersecting(self.t1, other))
Example #13
0
def main():
    """Transfer annotation between two sets of cDNAs.

    Loads reference and target cDNA FASTAs plus their transcriptomic BED12s,
    then streams the annotation file (BED12 or GFF3, presumed to come from
    GMAP) and dispatches one task per transcript to a pool of Transferer
    worker processes; finally collates each worker's SQLite output into the
    GFF3/BED12 output streams.
    """

    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("--bed12",
                        nargs=2,
                        required=True,
                        help="Transcriptomic cDNAs BED12s")
    parser.add_argument("--cdnas", nargs=2, required=True)
    parser.add_argument("-gf",
                        help="GFF3/BED12 of the transferred annotation.",
                        required=True)
    parser.add_argument("--out",
                        default=sys.stdout,
                        type=argparse.FileType("wt"))
    parser.add_argument("-ob",
                        "--out-bed",
                        dest="out_bed",
                        required=False,
                        default=None,
                        type=argparse.FileType("wt"))
    log = parser.add_mutually_exclusive_group()
    log.add_argument("-q", "--quiet", default=False, action="store_true")
    log.add_argument("-v", "--verbose", default=False, action="store_true")
    parser.add_argument("-p", "--processes", type=int, default=mp.cpu_count())
    args = parser.parse_args()

    logger = create_default_logger("master")
    verbosity = "INFO"
    if args.verbose is True:
        verbosity = "DEBUG"
    elif args.quiet is True:
        verbosity = "WARNING"

    # NOTE(review): a Logger is passed where QueueListener expects *handlers*;
    # this works because Logger.handle(record) exists, but a Handler would be
    # the conventional argument.  `propagate` is not a QueueListener attribute;
    # the assignment below is inert — confirm the intent.
    listener = logging.handlers.QueueListener(logging_queue, logger)
    listener.propagate = False
    listener.start()
    logger.setLevel(verbosity)

    cdnas = dict()
    beds = dict()
    beds["ref"] = dict()
    beds["target"] = dict()

    # GMAP appends ".mrnaN" to multi-path alignments; strip it to recover the
    # original cDNA name.  BUGFIX: use a raw string — "\." in a plain string
    # literal is an invalid escape sequence.
    gmap_pat = re.compile(r"\.mrna[0-9]*$")

    logger.info("Loading reference cDNAS")
    cdnas["ref"] = pyfaidx.Fasta(args.cdnas[0])
    logger.info("Loading target cDNAS")
    cdnas["target"] = pyfaidx.Fasta(args.cdnas[1])
    logger.info("Loaded cDNAs")
    logger.info("Loading reference BED12")
    for entry in Bed12Parser(args.bed12[0], transcriptomic=True):
        if entry.header:
            continue
        name = entry.chrom
        if name in beds["ref"]:
            raise KeyError("Duplicated ID for the reference: {}".format(name))
        if name not in cdnas["ref"]:
            raise KeyError("Reference {} not found in the cDNAs!".format(name))
        beds["ref"][name] = entry

    logger.info("Loading target BED12")
    # Account for the fact that there *might* be multiple alignments per cDNA:
    # beds["target"][original_name][alignment_name] -> BED12 entry.
    beds["target"] = defaultdict(dict)
    for entry in Bed12Parser(args.bed12[1], transcriptomic=True):
        name = re.sub(gmap_pat, "", entry.chrom)
        if entry.chrom not in cdnas["target"]:
            raise KeyError("Target {} not found in the cDNAs!".format(
                entry.chrom))
        beds["target"][name][entry.chrom] = entry
    logger.info("Loaded BED12s")

    # Now let us start parsing the GFF3, which we presume being a GMAP GFF3
    transcript = None

    logger.info("Launching sub-processes")
    procs = []
    queue = mp.Queue(-1)
    for proc in range(args.processes):
        # Reserve a unique temporary filename for each worker's SQLite output.
        sq = tempfile.NamedTemporaryFile(mode="wb")
        sq.close()
        sq = sq.name
        _proc = Transferer(sq, queue, verbosity=verbosity)
        _proc.start()
        procs.append(_proc)
    logger.info("Launched sub-processes, starting parsing annotation")

    tnum = -1
    if args.gf.endswith(("bed12", "bed")):
        parser = Bed12Parser(args.gf, transcriptomic=False)
        for line in parser:
            if line.header:
                continue
            else:
                transcript = Transcript(line)
                tid = re.sub(gmap_pat, "", transcript.id)
                logger.debug("Found %s", tid)
                ref_cdna = str(cdnas["ref"][tid])
                ref_bed = beds["ref"][tid]
                target_cdna = str(cdnas["target"][transcript.id])
                target_bed = beds["target"][tid][transcript.id]
                tnum += 1
                logger.debug("Submitting %s", tid)
                queue.put((tnum, (transcript, ref_cdna, ref_bed, target_cdna,
                                  target_bed)))
            if tnum >= 10**4 and tnum % 10**4 == 0:
                # BUGFIX: logging uses %-style formatting; the previous
                # "Parsed {} transcripts" printed the braces verbatim.
                logger.info("Parsed %d transcripts", tnum)
        logger.info("Finished parsing input genomic BED file")
    else:
        parser = to_gff(args.gf)

        for pos, line in enumerate(parser):
            if line.header is True:
                if str(line) == "###":
                    continue
                try:
                    print(line, file=args.out)
                except IndexError:
                    raise IndexError(line._line)
                continue
            elif not isinstance(line, BED12) and line.is_gene is True:
                continue
            elif line.is_transcript is True:
                # A new mRNA line: dispatch the transcript collected so far.
                if transcript:
                    if transcript.alias is None:
                        tid = re.sub(gmap_pat, "", transcript.id)
                    else:
                        tid = re.sub(gmap_pat, "", transcript.alias)
                    ref_cdna = str(cdnas["ref"][tid])
                    ref_bed = beds["ref"][tid]
                    target_cdna = str(cdnas["target"][transcript.id])
                    store = beds["target"].get(tid, None)
                    if store is None:
                        raise KeyError((tid, beds["target"].keys()))
                    target_bed = store.get(transcript.id, None)
                    if target_bed is None:
                        raise KeyError((tid, store.keys()))
                    tnum += 1
                    queue.put((tnum, (transcript, ref_cdna, ref_bed,
                                      target_cdna, target_bed)))
                try:
                    transcript = Transcript(line)
                except (ValueError, TypeError):
                    raise ValueError((pos, line))
            elif line.is_exon is True:
                transcript.add_exon(line)
            if tnum >= 10**4 and tnum % 10**4 == 0:
                # BUGFIX: %-style lazy formatting, as above.
                logger.info("Parsed %d transcripts", tnum)

        # Flush the last transcript, which has no following mRNA line.
        if transcript:
            tnum += 1
            tid = re.sub(gmap_pat, "", transcript.id)
            ref_cdna = str(cdnas["ref"][tid])
            ref_bed = beds["ref"][tid]
            target_cdna = str(cdnas["target"][transcript.id])
            target_bed = beds["target"][tid][transcript.id]
            queue.put((tnum, (transcript, ref_cdna, ref_bed, target_cdna,
                              target_bed)))
        logger.info("Finished parsing input genomic GF file")

    # NOTE(review): a single sentinel is queued for len(procs) workers —
    # presumably each Transferer re-queues "EXIT" before exiting; confirm,
    # otherwise the joins below could hang.
    queue.put("EXIT")
    logger.info("Waiting for subprocesses to finish")
    [_proc.join() for _proc in procs]

    logger.info("Subprocesses finished, printing")
    for proc in procs:
        sq = sqlalchemy.create_engine("sqlite:///{}".format(proc.out_sq))
        for res in sq.execute("select * from storer"):
            num, bed12, gff3 = res
            if args.out_bed is not None:
                print(bed12.decode(), file=args.out_bed)
            print(*gff3.decode().split("\n"), file=args.out, sep="\n")
        os.remove(proc.out_sq)

    logger.info("Finished!")
    return
Example #14
0
def main():
    """Serialise comparison statistics into an SQLite database.

    Reads a tab-delimited manifest of comparison runs, validates that the
    referenced ``.stats`` files exist, and loads one Indexer plus one
    CompareFiles row per manifest line.  With --force, all existing tables
    are dropped first.
    """

    logger = create_default_logger("stat_serializer")

    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("--db",
                        required=True,
                        help="SQLite database to connect to.")
    parser.add_argument("--force", action="store_true", default=False)
    parser.add_argument(
        "input_files",
        help=
        """TXT tab-delimited file, specifying the input files in the following way:
                        - species
                        - aligner
                        - assembler
                        - basename for the comparisons against the complete reference
                        - basename for the comparisons against the filtered reference
                        """)
    args = parser.parse_args()

    # Create the database through a sqlite3 connector so the SQLAlchemy engine
    # reuses a single configured connection factory.
    connector = functools.partial(sqlite3.connect,
                                  database=args.db,
                                  check_same_thread=False)
    engine = create_engine("sqlite://", creator=connector)

    if args.force is True:
        # BUGFIX: Logger.warn is a deprecated alias of Logger.warning.
        logger.warning("Removing old data because force option in place")
        meta = sqlalchemy.MetaData(bind=engine)
        meta.reflect(engine)
        # Drop in reverse dependency order so foreign keys do not block drops.
        for tab in reversed(meta.sorted_tables):
            logger.warning("Dropping %s", tab)
            tab.drop()

    inspector = Inspector.from_engine(engine)
    Session = sessionmaker(bind=engine)
    session = Session()

    # Create the schema only if it is not already present.
    if Indexer.__tablename__ not in inspector.get_table_names():
        DBBASE.metadata.create_all(engine)  # @UndefinedVariable

    with open(args.input_files) as input_files:

        for row in input_files:
            species, aligner, assembler, complete, filtered = row.rstrip(
            ).split()
            if not os.path.exists("{}.stats".format(complete)):
                raise ValueError(
                    "Original file not found; line:\n{}".format(row))
            if not os.path.exists("{}.stats".format(filtered)):
                raise ValueError(
                    "Filtered file {} not found; line:\n{}".format(
                        "{}.stats".format(filtered), row))
            current_species = Indexer(species, aligner, assembler)
            session.add(current_species)
            # Commit now so current_species.m_index gets its autoincrement value.
            session.commit()
            # NOTE(review): debug print left on stdout — consider logger.debug.
            print(current_species.m_index)
            complete_load = CompareFiles(current_species.m_index,
                                         complete,
                                         filtered=False)
            session.add(complete_load)
            # NOTE(review): the filtered file is validated above but never
            # loaded — the CompareFiles(filtered=True) branch is commented out.
    session.commit()
Example #15
0
class TestMetricsEndDistances(unittest.TestCase):
    """Checks on the 3'-end metrics of a Transcript model —
    ``combined_cds_end``, ``selected_cds_end``,
    ``end_distance_from_junction`` and ``end_distance_from_tes`` —
    exercised with CDSs that stop at different points along a
    twelve-exon transcript, on both strands."""

    # Logger shared by every transcript built in these tests; raised to
    # ERROR so that expected finalisation warnings stay silent.
    logger = create_default_logger("End")
    logger.setLevel("ERROR")

    def setUp(self):
        """Build a fresh 12-exon transcript spanning 101-10000.

        The strand is left unset on purpose: each test assigns it
        before adding its CDS segments.
        """

        self.tr = Transcript()
        self.tr.logger = self.logger
        self.tr.start = 101
        self.tr.end = 10000
        self.tr.add_exons([(101, 300),
                           (501, 800),
                           (1001, 1200),
                           (1301, 2000),
                           (3501, 5000),
                           (5501, 6000),
                           (6201, 7000),
                           (7301, 7700),
                           (8201, 9000),
                           (9101, 9300),
                           (9501, 9700),
                           (9801, 10000)])
        self.tr.id = "test1"
        self.tr.parent = "test1.gene"

    def test_end_positive(self):
        """Plus strand: the end distances are measured rightwards, from
        the CDS stop towards the last junction and towards the
        transcript end (TES)."""

        self.tr.strand = "+"

        # First ORF: stops at 9130, inside the antepenultimate exon.
        # The per-segment length remainders (mod 3) annotated below sum
        # to 0, i.e. the total CDS length is a multiple of three.
        cds = [(1161, 1200),  # 40 % 3 == 1
               (1301, 2000),  # 700 % 3 == 1
               (3501, 5000),  # 1500 % 3 == 0
               (5501, 6000),  # 500 % 3 == 2
               (6201, 7000),  # 800 % 3 == 2
               (7301, 7700),  # 400 % 3 == 1
               (8201, 9000),  # 800 % 3 == 2
               (9101, 9130)]

        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.selected_cds_end,
                         self.tr.combined_cds_end)
        self.assertEqual(self.tr.selected_cds_end,
                         9130)
        # Junction distance: remainder of the exon holding the stop,
        # plus the whole penultimate exon.
        self.assertEqual(self.tr.end_distance_from_junction,
                         (9300 - 9131 + 1) + (9700 - 9501 + 1)
                         )
        # TES distance additionally includes the terminal exon.
        self.assertEqual(self.tr.end_distance_from_tes,
                         (9300 - 9131 + 1) + (9700 - 9501 + 1) + (10000 - 9801 + 1)
                         )

        # Second ORF: stops at 9690, inside the penultimate exon.
        self.tr.strip_cds()
        self.assertEqual(len(self.tr.internal_orfs), 0, self.tr.internal_orfs)
        self.tr.finalized = False
        cds = [(1161, 1200),  # 40 % 3 == 1
               (1301, 2000),  # 700 % 3 == 1
               (3501, 5000),  # 1500 % 3 == 0
               (5501, 6000),  # 500 % 3 == 2
               (6201, 7000),  # 800 % 3 == 2
               (7301, 7700),  # 400 % 3 == 1
               (8201, 9000),  # 800 % 3 == 2
               (9101, 9300),  # 200 % 3 == 2
               (9501, 9690)  # 190 % 3 == 1
               ]
        self.tr.add_exons(cds, features="CDS")

        self.tr.finalize()
        self.assertEqual(self.tr.combined_cds_end,
                         9690)
        self.assertEqual(self.tr.selected_cds_end,
                         self.tr.combined_cds_end)

        self.assertEqual(self.tr.end_distance_from_junction,
                         (9700 - 9691 + 1)
                         )
        self.assertEqual(self.tr.end_distance_from_tes,
                         (9700 - 9691 + 1) + (10000 - 9801 + 1)
                         )

        # After stripping the CDS both end markers must be reset (None).
        self.tr.strip_cds()
        self.assertEqual(self.tr.combined_cds_end,
                         self.tr.selected_cds_end,
                         self.tr.combined_cds)
        self.assertEqual(self.tr.combined_cds_end,
                         None,
                         self.tr.combined_cds_end)

        # Third ORF: stops at 9820, inside the terminal exon, so the
        # junction distance must be 0.
        self.tr.finalized = False
        cds = [(1161, 1200),  # 40 % 3 == 1
               (1301, 2000),  # 700 % 3 == 1
               (3501, 5000),  # 1500 % 3 == 0
               (5501, 6000),  # 500 % 3 == 2
               (6201, 7000),  # 800 % 3 == 2
               (7301, 7700),  # 400 % 3 == 1
               (8201, 9000),  # 800 % 3 == 2
               (9101, 9300),  # 200 % 3 == 2
               (9501, 9700),  # 200 % 3 == 2
               (9801, 9820),  # 20 % 3 == 2
               ]
        self.tr.add_exons(cds, features="CDS")

        self.tr.finalize()
        self.assertEqual(self.tr.combined_cds_end,
                         9820)
        self.assertEqual(self.tr.selected_cds_end,
                         self.tr.combined_cds_end)
        self.assertEqual(self.tr.end_distance_from_tes,
                         180)
        self.assertEqual(self.tr.end_distance_from_junction,
                         0)

    def test_end_negative(self):
        """Minus strand: the CDS "end" is the leftmost genomic
        coordinate, so the distances are measured leftwards, towards the
        genomic start of the transcript."""

        self.tr.strand = "-"

        # self.tr.add_exons([(101, 300),
        #                    (501, 800),
        #                    (1001, 1200),
        #                    (1301, 2000),
        #                    (3501, 5000),
        #                    (5501, 6000),
        #                    (6201, 7000),
        #                    (7301, 7700),
        #                    (8201, 9000),
        #                    (9101, 9300),
        #                    (9501, 9700),
        #                    (9801, 10000)])

        # First ORF: ends (leftmost coordinate) at 1161, inside the
        # third exon from the left.
        cds = [(1161, 1200),  # 40 % 3 == 1
               (1301, 2000),  # 700 % 3 == 1
               (3501, 5000),  # 1500 % 3 == 0
               (5501, 6000),  # 500 % 3 == 2
               (6201, 7000),  # 800 % 3 == 2
               (7301, 7700),  # 400 % 3 == 1
               (8201, 9000),  # 800 % 3 == 2
               (9101, 9130)]

        # Sanity check on the fixture: total CDS length must be in frame.
        self.assertEqual(sum(x[1] - x[0] + 1 for x in cds) % 3, 0)

        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertTrue(self.tr.is_coding)
        self.assertEqual(self.tr.selected_cds_end,
                         self.tr.combined_cds_end)
        self.assertEqual(self.tr.selected_cds_end,
                         1161)
        self.assertEqual(self.tr.end_distance_from_junction,
                         (1161-1001) + (800-501+1),
                         (self.tr.end_distance_from_junction,
                          (1161-1001) + (800-501+1))
                         )
        self.assertEqual(self.tr.end_distance_from_tes,
                         self.tr.end_distance_from_junction + (300 - 101 + 1),
                         (self.tr.end_distance_from_tes,
                          self.tr.end_distance_from_junction + (300 - 101 + 1))
                         )

        # Second ORF: ends at 721, inside the second exon from the left.
        self.tr.strip_cds()
        self.assertEqual(len(self.tr.internal_orfs), 0, self.tr.internal_orfs)
        self.tr.finalized = False
        cds = [(721, 800),
               (1001, 1200),  # 200 % 3 == 2
               (1301, 2000),  # 700 % 3 == 1
               (3501, 5000),  # 1500 % 3 == 0
               (5501, 6000),  # 500 % 3 == 2
               (6201, 7000),  # 800 % 3 == 2
               (7301, 7700),  # 400 % 3 == 1
               (8201, 9000),  # 800 % 3 == 2
               (9101, 9130),  # 30 % 3 == 0
               ]
        self.tr.add_exons(cds, features="CDS")

        self.tr.finalize()
        self.assertEqual(self.tr.combined_cds_end,
                         721)
        self.assertEqual(self.tr.selected_cds_end,
                         self.tr.combined_cds_end)

        self.assertEqual(self.tr.end_distance_from_junction,
                         (721-501),
                         (self.tr.end_distance_from_junction, (721-501))
                         )
        self.assertEqual(self.tr.end_distance_from_tes,
                         self.tr.end_distance_from_junction + (300 - 101 + 1),
                         (self.tr.end_distance_from_tes,
                         self.tr.end_distance_from_junction + (300 - 101 + 1))
                         )

        # After stripping the CDS both end markers must be reset (None).
        self.tr.strip_cds()
        self.assertEqual(self.tr.combined_cds_end,
                         self.tr.selected_cds_end,
                         self.tr.combined_cds)
        self.assertEqual(self.tr.combined_cds_end,
                         None,
                         self.tr.combined_cds_end)

        # Third ORF: ends at 161, inside the leftmost (terminal) exon,
        # so the junction distance must be 0.
        self.tr.finalized = False
        cds = [(161, 300),    # 140 % 3 == 2
               (501, 800),    # 300 % 3 == 0
               (1001, 1200),  # 200 % 3 == 2
               (1301, 2000),  # 700 % 3 == 1
               (3501, 5000),  # 1500 % 3 == 0
               (5501, 6000),  # 500 % 3 == 2
               (6201, 7000),  # 800 % 3 == 2
               (7301, 7700),  # 400 % 3 == 1
               (8201, 9000),  # 800 % 3 == 2
               (9101, 9130),  # 30 % 3 == 0
               ]

        # Sanity check on the fixture: total CDS length must be in frame.
        self.assertEqual(sum((_[1] - _[0] +1) % 3 for _ in cds ) % 3, 0)
        self.tr.logger = self.logger
        self.tr.add_exons(cds, features="CDS")
        self.tr.finalize()
        self.assertEqual(self.tr.combined_cds_end,
                         161)
        self.assertEqual(self.tr.selected_cds_end,
                         self.tr.combined_cds_end)
        self.assertEqual(self.tr.end_distance_from_tes,
                         60)
        self.assertEqual(self.tr.end_distance_from_junction,
                         0)
class AugustusTester(unittest.TestCase):
    """Tests for finalising Augustus-derived gene models whose CDS is
    truncated at the 5' and/or 3' end, verifying both the warnings
    emitted during finalisation and the resulting coding status."""

    # Logger shared by the tests; the assertLogs blocks below capture
    # records emitted under this "augustus" logger name.
    logger = create_default_logger("augustus")
    logger.setLevel("DEBUG")

    def test_truncated(self):
        """A model whose first CDS segment has phase 1 and no 5' UTR
        (5'-truncated CDS) must finalise as coding, after warning about
        its coordinates."""

        lines = """Triticum_aestivum_CS42_TGACv1_scaffold_000043_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	mRNA	1	2785	.	+	.	ID=TRIAE4565_1AL_Aug_0021880.1;Parent=TRIAE4565_1AL_Aug_0021880;Name=TRIAE4565_1AL_Aug_0021880.1
Triticum_aestivum_CS42_TGACv1_scaffold_000043_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	CDS	1601	2446	.	+	1	ID=TRIAE4565_1AL_Aug_0021880.1.CDS1;Parent=TRIAE4565_1AL_Aug_0021880.1
Triticum_aestivum_CS42_TGACv1_scaffold_000043_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	exon	1601	2446	.	+	.	ID=TRIAE4565_1AL_Aug_0021880.1.exon1;Parent=TRIAE4565_1AL_Aug_0021880.1
Triticum_aestivum_CS42_TGACv1_scaffold_000043_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	CDS	2540	2654	.	+	1	ID=TRIAE4565_1AL_Aug_0021880.1.CDS2;Parent=TRIAE4565_1AL_Aug_0021880.1
Triticum_aestivum_CS42_TGACv1_scaffold_000043_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	exon	2540	2785	.	+	.	ID=TRIAE4565_1AL_Aug_0021880.1.exon2;Parent=TRIAE4565_1AL_Aug_0021880.1
Triticum_aestivum_CS42_TGACv1_scaffold_000043_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	three_prime_UTR	2655	2785	.	+	.	ID=TRIAE4565_1AL_Aug_0021880.1.three_prime_UTR1;Parent=TRIAE4565_1AL_Aug_0021880.1"""

        # Re-normalise each row's whitespace to tabs before parsing.
        lines = [
            parsers.GFF.GffLine("\t".join(_.split()))
            for _ in lines.split("\n")
        ]

        transcript = loci.Transcript(lines[0], logger=self.logger)
        transcript.add_exons(lines[1:])

        # Finalisation must warn about the declared coordinates ...
        with self.assertLogs("augustus", level="WARNING") as cm_out:
            transcript.finalize()
            self.assertTrue(
                any("The transcript TRIAE4565_1AL_Aug_0021880.1 has coordinates 1:2785"
                    in _ for _ in cm_out.output))

        # ... but the CDS must be kept.
        self.assertTrue(transcript.is_coding)

    def test_three_truncated(self):
        """A model whose last CDS segment stops at the exon boundary with
        no 3' UTR (3'-truncated CDS) must finalise as coding, after
        warning about its coordinates."""
        lines = """Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	mRNA	204336	224434	.	+	.	ID=TRIAE4565_1AL_Aug_0024630.1;Parent=TRIAE4565_1AL_Aug_0024630;Name=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	exon	204336	205303	.	+	.	ID=TRIAE4565_1AL_Aug_0024630.1.exon1;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	five_prime_UTR	204336	204546	.	+	.	ID=TRIAE4565_1AL_Aug_0024630.1.five_prime_UTR1;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	CDS	204547	205303	.	+	0	ID=TRIAE4565_1AL_Aug_0024630.1.CDS1;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	CDS	206227	207040	.	+	2	ID=TRIAE4565_1AL_Aug_0024630.1.CDS2;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	exon	206227	207040	.	+	.	ID=TRIAE4565_1AL_Aug_0024630.1.exon2;Parent=TRIAE4565_1AL_Aug_0024630.1"""

        # Re-normalise each row's whitespace to tabs before parsing.
        lines = [
            parsers.GFF.GffLine("\t".join(_.split()))
            for _ in lines.split("\n")
        ]

        transcript = loci.Transcript(lines[0], logger=self.logger)
        transcript.add_exons(lines[1:])

        # Finalisation must warn about the declared coordinates ...
        with self.assertLogs("augustus", level="WARNING") as cm_out:
            transcript.finalize()
            self.assertTrue(
                any("The transcript TRIAE4565_1AL_Aug_0024630.1 has coordinates 204336:224434"
                    in _ for _ in cm_out.output))

        # ... but the CDS must be kept.
        self.assertTrue(transcript.is_coding)

    def test_invalid_three_truncated(self):
        """Same model as above but with an extra trailing exon and a
        last CDS segment whose end does not match its exon: the model is
        invalid, so after the warning the CDS must be dropped
        (``is_coding`` becomes False)."""
        lines = """Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	mRNA	204336	225434	.	+	.	ID=TRIAE4565_1AL_Aug_0024630.1;Parent=TRIAE4565_1AL_Aug_0024630;Name=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	exon	204336	205303	.	+	.	ID=TRIAE4565_1AL_Aug_0024630.1.exon1;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	five_prime_UTR	204336	204546	.	+	.	ID=TRIAE4565_1AL_Aug_0024630.1.five_prime_UTR1;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	CDS	204547	205303	.	+	0	ID=TRIAE4565_1AL_Aug_0024630.1.CDS1;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	CDS	206227	207042	.	+	2	ID=TRIAE4565_1AL_Aug_0024630.1.CDS2;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	exon	206227	207042	.	+	.	ID=TRIAE4565_1AL_Aug_0024630.1.exon2;Parent=TRIAE4565_1AL_Aug_0024630.1
Triticum_aestivum_CS42_TGACv1_scaffold_000112_1AL	Triticum_aestivum_CS42_TGACv1_TRIAE4565_Augustus	exon	208227	210040	.	+	.	ID=TRIAE4565_1AL_Aug_0024630.1.exon2;Parent=TRIAE4565_1AL_Aug_0024630.1"""

        # Re-normalise each row's whitespace to tabs before parsing.
        lines = [
            parsers.GFF.GffLine("\t".join(_.split()))
            for _ in lines.split("\n")
        ]

        transcript = loci.Transcript(lines[0], logger=self.logger)
        transcript.add_exons(lines[1:])

        with self.assertLogs("augustus", level="WARNING") as cm_out:
            transcript.finalize()
            self.assertTrue(
                any("The transcript TRIAE4565_1AL_Aug_0024630.1 has coordinates 204336:225434"
                    in _ for _ in cm_out.output))
            # self.assertTrue(any(
            #     "strip_cds" in _ for
            # _ in cm_out.output))

        self.assertFalse(transcript.is_coding)

    def test_valid_three_truncated(self):
        """
        Case of a transcript with a valid 3'-side CDS layout (CDS plus
        explicit stop codon, followed by non-coding exons).
        Picked from the EnsEMBL Human GTF (v. 70).
        """

        lines = """11\tnonsense_mediated_decay\texon\t134177086\t134177102\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "1"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00002461794";
11\tnonsense_mediated_decay\tCDS\t134177086\t134177102\t.\t+\t2\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "1"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; protein_id "ENSP00000397929";
11\tnonsense_mediated_decay\texon\t134179522\t134179657\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "2"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00002147723";
11\tnonsense_mediated_decay\tCDS\t134179522\t134179657\t.\t+\t0\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "2"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; protein_id "ENSP00000397929";
11\tnonsense_mediated_decay\texon\t134180465\t134180545\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "3"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00001278318";
11\tnonsense_mediated_decay\tCDS\t134180465\t134180545\t.\t+\t2\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "3"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; protein_id "ENSP00000397929";
11\tnonsense_mediated_decay\texon\t134180958\t134181064\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "4"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00001140726";
11\tnonsense_mediated_decay\tCDS\t134180958\t134181064\t.\t+\t2\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "4"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; protein_id "ENSP00000397929";
11\tnonsense_mediated_decay\texon\t134182243\t134182383\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "5"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00003177017";
11\tnonsense_mediated_decay\tCDS\t134182243\t134182383\t.\t+\t0\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "5"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; protein_id "ENSP00000397929";
11\tnonsense_mediated_decay\texon\t134182710\t134182781\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "6"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00003096760";
11\tnonsense_mediated_decay\tCDS\t134182710\t134182781\t.\t+\t0\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "6"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; protein_id "ENSP00000397929";
11\tnonsense_mediated_decay\texon\t134183835\t134183922\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "7"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00003040614";
11\tnonsense_mediated_decay\tCDS\t134183835\t134183837\t.\t+\t0\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "7"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; protein_id "ENSP00000397929";
11\tnonsense_mediated_decay\tstop_codon\t134183838\t134183840\t.\t+\t0\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "7"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006";
11\tnonsense_mediated_decay\texon\t134184224\t134184335\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "8"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00003002659";
11\tnonsense_mediated_decay\texon\t134188525\t134188641\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "9"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00003191545";
11\tnonsense_mediated_decay\texon\t134188771\t134189178\t.\t+\t.\tgene_id "ENSG00000166105"; transcript_id "ENST00000455971"; exon_number "10"; gene_name "GLB1L3"; gene_biotype "protein_coding"; transcript_name "GLB1L3-006"; exon_id "ENSE00001441085";"""

        # Parse each row as GTF; the split/join on tabs is a no-op kept
        # for symmetry with the GFF-based tests above.
        lines = [
            parsers.GTF.GtfLine("\t".join(_.split("\t")))
            for _ in lines.split("\n")
        ]
        assert all([line.header is False for line in lines])

        transcript = loci.Transcript(lines[0], logger=self.logger)

        transcript.add_exons(lines[1:])
        with self.assertLogs("augustus", level="DEBUG") as cm_out:
            transcript.finalize()

        self.assertTrue(transcript.is_coding)
        # 560 bp = 557 bp of CDS rows plus the 3 bp stop codon,
        # presumably merged into the CDS on finalisation (the message
        # argument recomputes the sum from the selected ORF) — confirm.
        self.assertEqual(
            560, transcript.selected_cds_length,
            sum(_[1][1] - _[1][0] + 1 for _ in transcript.selected_internal_orf
                if _[0] == "CDS"))
Example #17
0
class PhaseChecker(unittest.TestCase):
    """Checks on the CDS phases assigned to a minus-strand wheat gene
    model after finalisation. The main test is currently skipped."""

    logger = create_default_logger("pcheck")
    logger.setLevel("DEBUG")

    def setUp(self):
        """Parse a real minus-strand gene model from GFF lines and
        record the phase each CDS segment is expected to carry after
        finalisation."""

        lines = """Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	mRNA	40282	46004	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960;Name=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2;aed=0.0;note=TRIAE_CS42_5DL_TGACv1_434051_AA1427960;confidence=High;has_start=True;has_stop=True;original_stop=True;protein_rank=P1;transcript_rank=T2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	40282	40933	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	three_prime_UTR	40282	40720	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.three_prime_UTR1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	40721	40933	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	41018	41111	.	-	1	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	41018	41111	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon2;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	41227	41468	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS3;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	41227	41468	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon3;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	41673	41831	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS4;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	41673	41831	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon4;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	41946	42820	.	-	2	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS5;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	41946	42820	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon5;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	42905	42913	.	-	2	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS6;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	42905	42913	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon6;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	45373	45496	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS7;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	45373	45496	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon7;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	45600	45651	.	-	1	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS8;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	45600	45651	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon8;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	45726	45726	.	-	2	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS9;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	45726	45726	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon9;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	CDS	45875	45893	.	-	0	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.CDS10;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	exon	45875	46004	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.exon10;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2
Triticum_aestivum_CS42_TGACv1_scaffold_434051_5DL	TGACv1	five_prime_UTR	45894	46004	.	-	.	ID=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2.five_prime_UTR1;Parent=TRIAE_CS42_5DL_TGACv1_434051_AA1427960.2"""

        # Re-normalise whitespace to tabs and drop empty rows.
        lines = [GffLine("\t".join(_.split())) for _ in lines.split("\n") if _]
        self.transcript = Transcript(lines[0], logger=self.logger)
        self.transcript.add_exons(lines[1:])
        # Expected phase per CDS segment, keyed by (start, end). Note
        # that these differ from the column-8 values in the GFF above —
        # they are what finalisation is expected to assign.
        self.correct_phases = {(40721, 40933): 2,
                               (41018, 41111): 0,
                               (41227, 41468): 2,
                               (41673, 41831): 2,
                               (41946, 42820): 1,
                               (42905, 42913): 1,
                               (45373, 45496): 2,
                               (45600, 45651): 0,
                               (45726, 45726): 2,
                               (45875, 45893): 0}

    # NOTE(review): bare @unittest.skip with no reason string is only
    # handled gracefully on recent Python versions; prefer
    # @unittest.skip("<reason>") and record why this test is disabled.
    @unittest.skip
    def test_check_phases(self):
        """Verify that finalisation assigns the expected phase to every
        CDS segment of the minus-strand model."""
        self.transcript.finalize()
        # Map (start, end) -> phase from the first (primary) internal ORF.
        phases = dict((_[1], _[2]) for _ in self.transcript.internal_orfs[0]
                      if _[0] == "CDS")
        self.assertEqual(self.transcript.combined_cds_start, 45893)

        self.assertEqual(phases.keys(),
                         self.correct_phases.keys(),
                         list(zip(sorted(phases.keys()),
                                  sorted(self.correct_phases.keys()))))

        # Compare segment by segment first, for a readable failure
        # message, before asserting on the whole mapping.
        if self.correct_phases != phases:
            for key in sorted(phases.keys(), reverse=True):
                self.assertEqual(phases[key], self.correct_phases[key],
                                 (key, phases[key], self.correct_phases[key]))

        self.assertEqual(self.correct_phases,
                         phases,
                         (self.correct_phases, phases))