def test_monoexonic(self):
        """
        Check how TranscriptChecker.check_strand treats a single-exon model.

        The assertions encode the expected outcome: without strand-specific
        data the strand is reset to None, while with strand-specific data the
        annotated strand ("+" or "-") is kept.
        """

        first_exon = self.gff_lines[1]
        header = self.gff_lines[0]
        # Shrink the transcript record so that it ends with its first exon.
        header.end = first_exon.end
        model = Transcript(header)
        model.add_exon(first_exon)
        model.finalize()
        fasta = self.fasta[model.chrom][model.start - 1:model.end]

        def strand_after_check(template, stranded):
            # Always work on a copy, so the template is never handed
            # to the checker itself.
            checker = TranscriptChecker(template.copy(), fasta,
                                        strand_specific=stranded)
            checker.check_strand()
            return checker.strand

        self.assertIsNone(strand_after_check(model, False))
        self.assertEqual(strand_after_check(model, True), "+")

        # Same model, but annotated on the opposite strand.
        neg = model.copy()
        neg.strand = "-"

        self.assertIsNone(strand_after_check(neg, False))
        self.assertEqual(strand_after_check(neg, True), "-")
    def test_monoexonic(self):
        """
        Verify the strand reported by TranscriptChecker for a monoexonic model.

        Expected by the assertions below: None when the checker is created
        with strand_specific=False, the original strand when it is created
        with strand_specific=True. Both a "+" and a "-" model are tested.
        """

        transcript_line, exon = self.gff_lines[0], self.gff_lines[1]
        # The transcript is truncated to its first exon only.
        transcript_line.end = exon.end
        model = Transcript(transcript_line)
        model.add_exon(exon)
        model.finalize()
        fasta = self.fasta[model.chrom][model.start - 1: model.end]

        def run_checker(source, strand_specific):
            # Build a checker on a fresh copy and return the strand it
            # settles on.
            tcheck = TranscriptChecker(source.copy(), fasta,
                                       strand_specific=strand_specific)
            tcheck.check_strand()
            return tcheck.strand

        # Positive-strand model
        self.assertIsNone(run_checker(model, False))
        self.assertEqual(run_checker(model, True), "+")

        # Negative-strand model
        neg = model.copy()
        neg.strand = "-"
        self.assertIsNone(run_checker(neg, False))
        self.assertEqual(run_checker(neg, True), "-")
Beispiel #3
0
def main():
    """
    Command-line entry point: convert the mapped records of a BAM file into
    transcripts and print them in GTF or BED12 format.

    Every mapped record is turned into a Transcript. Read names that occur
    more than once are disambiguated by appending "_<n>" to the name, where
    <n> is the number of records with that name seen so far. The gene ID of
    each transcript is "<name>.gene".
    """
    # The text is the program description. Passed positionally it would be
    # taken as "prog" and end up as the program name in the usage line.
    parser = argparse.ArgumentParser(
        description="Script to convert from BAM to GTF, for PB alignments")
    parser.add_argument(
        "--strict",
        action="store_true",
        default=False,
        help=
        "Switch. If set, this script will never output multiexonic transcripts \
                        without a defined strand.")
    parser.add_argument("--outfmt", choices=["gtf", "bed12"], default="gtf")
    parser.add_argument("bam", type=to_bam, help="Input BAM file")
    parser.add_argument("out",
                        nargs="?",
                        default=sys.stdout,
                        type=argparse.FileType("wt"),
                        help="Optional output file")
    args = parser.parse_args()

    # Number of records already seen for each read name.
    name_counter = Counter()

    for record in args.bam:
        if record.is_unmapped is True:
            continue
        transcript = Transcript(record,
                                accept_undefined_multi=(not args.strict))

        query_name = record.query_name
        # A Counter returns 0 for unseen keys, without inserting them.
        seen = name_counter[query_name]
        if seen:
            name = "{}_{}".format(query_name, seen)
        else:
            name = query_name

        transcript.id = name
        transcript.parent = transcript.attributes[
            "gene_id"] = "{0}.gene".format(name)
        name_counter.update([query_name])
        transcript.source = "bam2gtf"
        print(transcript.format(args.outfmt), file=args.out)
    def setUp(self):
        """
        Build the fixtures shared by the tests: the parsed GFF lines, the
        finalized transcript model, the sequence of each exon and the
        genomic sequence spanning the whole model.
        """

        # Prepare the model
        self.model_lines = """Chr5	tair10	transcript	26584797	26595528	100	+	.	ID=c58_g1_i3.mrna1.19;Parent=c58_g1_i3.path1.19;Name=c58_g1_i3.mrna1.19;gene_name=c58_g1_i3
    Chr5	tair10	exon	26584797	26584879	.	+	.	ID=c58_g1_i3.mrna1.19.exon1;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26585220	26585273	.	+	.	ID=c58_g1_i3.mrna1.19.exon2;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26585345	26585889	.	+	.	ID=c58_g1_i3.mrna1.19.exon3;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26585982	26586294	.	+	.	ID=c58_g1_i3.mrna1.19.exon4;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26586420	26586524	.	+	.	ID=c58_g1_i3.mrna1.19.exon5;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26586638	26586850	.	+	.	ID=c58_g1_i3.mrna1.19.exon6;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26586934	26586996	.	+	.	ID=c58_g1_i3.mrna1.19.exon7;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26587084	26587202	.	+	.	ID=c58_g1_i3.mrna1.19.exon8;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26587287	26587345	.	+	.	ID=c58_g1_i3.mrna1.19.exon9;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26587427	26587472	.	+	.	ID=c58_g1_i3.mrna1.19.exon10;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26595411	26595528	.	+	.	ID=c58_g1_i3.mrna1.19.exon11;Parent=c58_g1_i3.mrna1.19"""

        # The continuation lines of the literal are indented, hence strip().
        self.gff_lines = [GffLine(raw.strip())
                          for raw in self.model_lines.split("\n")]

        # First record is the transcript, the remaining ones are its exons.
        self.model = Transcript(self.gff_lines[0])
        self.model.add_exons(self.gff_lines[1:])
        self.model.finalize()

        # GFF coordinates are 1-based and inclusive, slices are 0-based.
        self.exons = [self.fasta[record.chrom][record.start - 1:record.end]
                      for record in self.gff_lines[1:]]

        exonic_length = sum(len(sequence) for sequence in self.exons)
        self.assertEqual(exonic_length, 1718, self.exons)
        # We need the whole genomic fragment
        self.model_fasta = self.fasta["Chr5"][self.model.start - 1:self.model.end]

        # Sanity checks on the first exon, both as parsed and as stored.
        first_exon = self.gff_lines[1]
        self.assertEqual(first_exon.start, 26584797)
        self.assertEqual(first_exon.end, 26584879)
        self.assertEqual(self.model.exons[0][0], first_exon.start)
        self.assertEqual(self.model.exons[0][1], first_exon.end)
Beispiel #5
0
    def __prepare_transcript(self, prediction: Transcript):
        """
        Private method that checks that a prediction transcript is OK
        before starting to analyse its concordance with the reference.

        The transcript receives this object's logger and is finalized; its
        UTRs are removed when ``self.args.exclude_utr`` is True. If that
        raises InvalidCDS, the CDS is stripped and the transcript is kept.
        A transcript that raises InvalidTranscript is logged, counted in
        ``self.done``, reported through ``self.print_tmap(None)`` and
        discarded.

        :param prediction: the transcript to prepare
        :return: the prepared transcript, or None if it was discarded
        """

        # Prepare the prediction to be analysed
        prediction.logger = self.logger
        try:
            prediction.finalize()
            # noinspection PyUnresolvedReferences
            if self.args.exclude_utr is True:
                prediction.remove_utrs()
        except InvalidCDS:
            # Only the CDS is broken: retry with the CDS removed.
            try:
                prediction.strip_cds()
            except InvalidTranscript as err:
                # Logger.warn is a deprecated alias of Logger.warning.
                self.logger.warning("Invalid transcript (due to CDS): %s",
                                    prediction.id)
                self.logger.warning("Error message: %s", err)
                self.done += 1
                self.print_tmap(None)
                return None
        except InvalidTranscript as err:
            self.logger.warning("Invalid transcript: %s", prediction.id)
            self.logger.warning("Error message: %s", err)
            self.done += 1
            self.print_tmap(None)
            return None
        return prediction
Beispiel #6
0
    def __prepare_transcript(self, prediction: Transcript):
        """
        Private method that validates a prediction transcript before its
        concordance with the reference is analysed.

        The transcript is given this object's logger, then finalized and,
        when ``self.args.exclude_utr`` is True, deprived of its UTRs. On
        InvalidCDS the CDS is stripped and the transcript is kept; on
        InvalidTranscript the transcript is rejected: two warnings are
        logged, ``self.done`` is incremented and ``self.print_tmap(None)``
        is called.

        :param prediction: the transcript to prepare
        :return: the prepared transcript, or None if it was rejected
        """

        def reject(template, error):
            # Common bookkeeping for a transcript that cannot be used.
            # Logger.warning replaces the deprecated Logger.warn alias.
            self.logger.warning(template, prediction.id)
            self.logger.warning("Error message: %s", error)
            self.done += 1
            self.print_tmap(None)

        # Prepare the prediction to be analysed
        prediction.logger = self.logger
        try:
            prediction.finalize()
            # noinspection PyUnresolvedReferences
            if self.args.exclude_utr is True:
                prediction.remove_utrs()
        except InvalidCDS:
            # The CDS is the only problem: drop it and keep the transcript.
            try:
                prediction.strip_cds()
            except InvalidTranscript as err:
                reject("Invalid transcript (due to CDS): %s", err)
                return None
        except InvalidTranscript as err:
            reject("Invalid transcript: %s", err)
            return None
        return prediction
    def setUp(self):
        """
        Prepare the test fixtures: the GFF records of an 11-exon transcript
        on Chr5, the finalized Transcript built from them, the per-exon
        sequences and the genomic sequence covering the whole transcript.
        """

        # Prepare the model
        self.model_lines = """Chr5	tair10	transcript	26584797	26595528	100	+	.	ID=c58_g1_i3.mrna1.19;Parent=c58_g1_i3.path1.19;Name=c58_g1_i3.mrna1.19;gene_name=c58_g1_i3
    Chr5	tair10	exon	26584797	26584879	.	+	.	ID=c58_g1_i3.mrna1.19.exon1;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26585220	26585273	.	+	.	ID=c58_g1_i3.mrna1.19.exon2;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26585345	26585889	.	+	.	ID=c58_g1_i3.mrna1.19.exon3;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26585982	26586294	.	+	.	ID=c58_g1_i3.mrna1.19.exon4;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26586420	26586524	.	+	.	ID=c58_g1_i3.mrna1.19.exon5;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26586638	26586850	.	+	.	ID=c58_g1_i3.mrna1.19.exon6;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26586934	26586996	.	+	.	ID=c58_g1_i3.mrna1.19.exon7;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26587084	26587202	.	+	.	ID=c58_g1_i3.mrna1.19.exon8;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26587287	26587345	.	+	.	ID=c58_g1_i3.mrna1.19.exon9;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26587427	26587472	.	+	.	ID=c58_g1_i3.mrna1.19.exon10;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26595411	26595528	.	+	.	ID=c58_g1_i3.mrna1.19.exon11;Parent=c58_g1_i3.mrna1.19"""

        # Surrounding whitespace comes from the indentation of the literal.
        stripped = (text.strip() for text in self.model_lines.split("\n"))
        self.gff_lines = [GffLine(text) for text in stripped]

        # The transcript record comes first, followed by its exon records.
        self.model = Transcript(self.gff_lines[0])
        self.model.add_exons(self.gff_lines[1:])
        self.model.finalize()

        # Convert the 1-based inclusive coordinates into Python slices.
        self.exons = []
        for exon_line in self.gff_lines[1:]:
            chromosome = self.fasta[exon_line.chrom]
            self.exons.append(chromosome[exon_line.start - 1:exon_line.end])

        self.assertEqual(sum(len(exon) for exon in self.exons), 1718,
                         self.exons)
        # We need the whole genomic fragment
        model_start, model_end = self.model.start, self.model.end
        self.model_fasta = self.fasta["Chr5"][model_start - 1:model_end]

        # The first exon must be parsed and stored with its coordinates.
        self.assertEqual(self.gff_lines[1].start, 26584797)
        self.assertEqual(self.gff_lines[1].end, 26584879)
        self.assertEqual(self.model.exons[0][0], self.gff_lines[1].start)
        self.assertEqual(self.model.exons[0][1], self.gff_lines[1].end)
Beispiel #8
0
def launch(args):
    """
    Print, in GTF format, the transcripts of a GTF file that lie entirely
    inside a genomic region.

    When ``args.region`` is present and not None it is unpacked into
    ``args.chrom``, ``args.start`` and ``args.end``. Rows on other
    chromosomes are ignored. With ``args.assume_sorted`` the scan stops at
    the first transcript row starting after the end of the region.

    :param args: the argparse Namespace
    :raises ValueError: if the region cannot be unpacked into three values
        or if the start is not smaller than the end
    """

    region = getattr(args, "region", None)
    if region is not None:
        try:
            args.chrom, args.start, args.end = args.region
        except ValueError as exc:
            raise ValueError("{0} {1}".format(exc, args.region))

    if not args.start < args.end:
        message = "Start greater than end: {0}\t{1}".format(args.start,
                                                            args.end)
        raise ValueError(message)

    def inside_region(candidate):
        # True only for transcripts fully contained in the region.
        return candidate.start >= args.start and candidate.end <= args.end

    transcript = None
    with GTF(args.gtf) as gtf:
        for row in gtf:
            if row.chrom != args.chrom:
                continue
            if row.is_transcript is not True:
                # Any other row is attached to the transcript being built.
                transcript.add_exon(row)
                continue
            # A new transcript starts: emit the previous one if it fits.
            if transcript is not None and inside_region(transcript):
                print(transcript.format("gtf"), file=args.out)
                transcript = None
            if args.assume_sorted is True and row.start > args.end:
                break
            transcript = Transcript(row)

    # The last transcript is still pending when the file is exhausted.
    if transcript is not None and inside_region(transcript):
        print(transcript.format("gtf"), file=args.out)
    def test_negative(self):

        gtf_lines = """Chr5	Cufflinks	transcript	26575364	26578163	1000	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";exon_number "1";FPKM "2.9700103727";conf_hi "3.260618";frac "0.732092";cov "81.895309";conf_lo "2.679403";
Chr5	Cufflinks	exon	26575364	26575410	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575495	26575620	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575711	26575797	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575885	26575944	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26576035	26576134	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26576261	26577069	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577163	26577288	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577378	26577449	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577856	26578163	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";"""

        gtf_lines = [GtfLine(line) for line in gtf_lines.split("\n")]

        self.assertEqual(len([_ for _ in gtf_lines if _.header]), 0)

        transcript = Transcript(gtf_lines[0])
        transcript.add_exons(gtf_lines[1:])
        transcript.finalize()
        fasta_seq = self.fasta[transcript.chrom][transcript.start -
                                                 1:transcript.end]

        tr_neg = transcript.copy()
        tchecker = TranscriptChecker(tr_neg, fasta_seq, strand_specific=False)
        self.assertEqual(tchecker.strand, "-")
        self.assertEqual(tchecker.fasta_seq, fasta_seq)
        tchecker.check_strand()
        self.assertEqual(tchecker.strand, "-")

        tr_neg = transcript.copy()
        tr_neg.strand = "+"
        for ss in (False, True):
            with self.subTest(ss=ss):
                tchecker = TranscriptChecker(tr_neg.copy(),
                                             fasta_seq,
                                             strand_specific=ss)
                tchecker.check_strand()
                if ss:
                    self.assertEqual(tchecker.strand, "+")
                    self.assertTrue(tchecker.suspicious_splicing)
                else:
                    self.assertEqual(tchecker.strand, "-")
    def test_positive_strand(self):

        gtf_lines = """chr1A	Self_CESAR/windows_chr1A.gp	transcript	265021906	265026255	.	+	.	gene_id "TraesCS1A01G152900.1"; transcript_id "TraesCS1A01G152900.1";
chr1A	Self_CESAR/windows_chr1A.gp	exon	265021906	265021971	.	+	.	gene_id "TraesCS1A01G152900.1"; transcript_id "TraesCS1A01G152900.1"; exon_number "1"; exon_id "TraesCS1A01G152900.1.1";
chr1A	Self_CESAR/windows_chr1A.gp	CDS	265021906	265021971	.	+	0	gene_id "TraesCS1A01G152900.1"; transcript_id "TraesCS1A01G152900.1"; exon_number "1"; exon_id "TraesCS1A01G152900.1.1";
chr1A	Self_CESAR/windows_chr1A.gp	exon	265022056	265026255	.	+	.	gene_id "TraesCS1A01G152900.1"; transcript_id "TraesCS1A01G152900.1"; exon_number "2"; exon_id "TraesCS1A01G152900.1.2";
chr1A	Self_CESAR/windows_chr1A.gp	CDS	265022056	265026252	.	+	0	gene_id "TraesCS1A01G152900.1"; transcript_id "TraesCS1A01G152900.1"; exon_number "2"; exon_id "TraesCS1A01G152900.1.2";
chr1A	Self_CESAR/windows_chr1A.gp	start_codon	265021906	265021908	.	+	0	gene_id "TraesCS1A01G152900.1"; transcript_id "TraesCS1A01G152900.1"; exon_number "1"; exon_id "TraesCS1A01G152900.1.1";
chr1A	Self_CESAR/windows_chr1A.gp	stop_codon	265026253	265026255	.	+	0	gene_id "TraesCS1A01G152900.1"; transcript_id "TraesCS1A01G152900.1"; exon_number "2"; exon_id "TraesCS1A01G152900.1.2";"""

        gtf_lines = [GtfLine(_) for _ in gtf_lines.split("\n")]
        t = Transcript(gtf_lines[0])
        t.add_exons(gtf_lines[1:])
        t.finalize()
        self.assertEqual(t.start, t.combined_cds_start)
        self.assertEqual(t.end, t.combined_cds_end)
    def test_negative(self):

        gtf_lines = """Chr5	Cufflinks	transcript	26575364	26578163	1000	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";exon_number "1";FPKM "2.9700103727";conf_hi "3.260618";frac "0.732092";cov "81.895309";conf_lo "2.679403";
Chr5	Cufflinks	exon	26575364	26575410	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575495	26575620	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575711	26575797	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575885	26575944	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26576035	26576134	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26576261	26577069	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577163	26577288	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577378	26577449	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577856	26578163	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";"""

        gtf_lines = [GtfLine(line) for line in gtf_lines.split("\n")]

        self.assertEqual(len([_ for _ in gtf_lines if _.header]), 0)

        transcript = Transcript(gtf_lines[0])
        transcript.add_exons(gtf_lines[1:])
        transcript.finalize()
        fasta_seq = self.fasta[transcript.chrom][transcript.start - 1:transcript.end]

        tr_neg = transcript.copy()
        tchecker = TranscriptChecker(tr_neg, fasta_seq, strand_specific=False)
        self.assertEqual(tchecker.strand, "-")
        self.assertEqual(tchecker.fasta_seq, fasta_seq)
        tchecker.check_strand()
        self.assertEqual(tchecker.strand, "-")

        tr_neg = transcript.copy()
        tr_neg.strand = "+"
        for ss in (False, True):
            with self.subTest(ss=ss):
                tchecker = TranscriptChecker(tr_neg.copy(), fasta_seq, strand_specific=ss)
                tchecker.check_strand()
                if ss:
                    self.assertEqual(tchecker.strand, "+")
                    self.assertTrue(tchecker.suspicious_splicing)
                else:
                    self.assertEqual(tchecker.strand, "-")
Beispiel #12
0
def main():
    """
    Command-line entry point: convert the alignments of a BAM file into
    GTF transcripts.

    For each record the CIGAR operations are walked to rebuild the exons
    (an "N" operation closes the current exon). Identity and coverage,
    expressed as percentages of the inferred read length, are stored in the
    transcript attributes together with the CIGAR string and every BAM tag.
    The strand is taken from the XS tag when present, otherwise from the
    orientation of the alignment.

    Records without aligned blocks, or without exons, are skipped. Records
    whose rebuilt end differs from the end of the last aligned block are
    reported on standard error and skipped.
    """
    # The text is the description of the program; passed positionally it
    # would be used as "prog", i.e. as the program name in the usage line.
    parser = argparse.ArgumentParser(
        description="Script to convert from BAM to GTF, for PB alignments")
    parser.add_argument("bam", type=to_bam, help="Input BAM file")
    parser.add_argument("out",
                        nargs="?",
                        default=sys.stdout,
                        type=argparse.FileType("wt"),
                        help="Optional output file")
    args = parser.parse_args()

    # M 0 alignment match (can be a sequence match or mismatch)
    # I 1 insertion to the reference
    # D 2 deletion from the reference
    # N 3 skipped region from the reference
    # S 4 soft clipping (clipped sequences present in SEQ)
    # H 5 hard clipping (clipped sequences NOT present in SEQ)
    # P 6 padding (silent deletion from padded reference)
    # = 7 sequence match
    # X 8 sequence mismatch

    # Separator for the MD tag. It must not match the empty string: since
    # Python 3.7 re.split also splits on empty matches, which would break
    # a deletion such as "^AC" into "^", "A", "C" and count the deleted
    # bases as mismatches.
    md_numbers = re.compile(r"[0-9]+")

    for record in args.bam:
        try:
            start, end = record.reference_start, record.get_blocks()[-1][1]
        except IndexError:
            # No aligned block at all.
            continue
        current = start

        exons = []
        current_exon = None

        r_length = record.inferred_length  # Read length
        matches = 0
        alen = 0

        for cigar, length in record.cigartuples:
            if cigar in (4, 5):  # Clipping (soft or hard)
                continue
            elif cigar == 1:  # Insertion
                alen += length
            elif cigar == 3:  # Intron
                if current_exon is None:
                    continue  # Read positioned at the end/beginning of scaffold
                exons.append(current_exon)
                current_exon = None
                current += length
            elif cigar in (0, 2):  # Match or deletion
                if current_exon is None:
                    current_exon = (current + 1, current + length)
                else:
                    current_exon = (current_exon[0], current_exon[1] + length)
                current += length
                if cigar == 0:
                    matches += length
                    alen += length

        # Mismatches are the non-numeric pieces of the MD tag that are not
        # deletions (which are introduced by "^").
        md_tag = [_[1] for _ in record.tags if _[0] == "MD"][0]
        snps = sum(len(piece) for piece in md_numbers.split(md_tag)
                   if not piece.startswith("^"))

        identity = round(100 * (matches - snps) / r_length, 2)
        coverage = round(100 * alen / r_length, 2)

        # Close the last exon, if one is open. The CIGAR may end without
        # one, e.g. when no match/deletion follows the last intron.
        if current_exon is not None:
            exons.append(current_exon)
        # This is clearly a mistake
        if current != end:
            msg = """{0}, {1}, {2}
Cigar tuples: {3}
Cigar string: {4}
Blocks: {5}
Exons: {6}
            """.format(current, end, end - current, record.cigartuples,
                       record.cigarstring, record.get_blocks(), exons)
            print(AssertionError(msg), file=sys.stderr)
            continue

        transcript = Transcript()
        transcript.id = record.query_name
        transcript.exons = exons
        transcript.parent = transcript.attributes[
            "gene_id"] = "{0}.gene".format(record.query_name)
        transcript.attributes["identity"] = identity
        transcript.attributes["coverage"] = coverage
        transcript.attributes["cigar"] = record.cigarstring
        transcript.chrom = args.bam.getrname(record.rname)
        if len(transcript.exons) == 0:
            continue

        transcript.start = min(x[0] for x in transcript.exons)
        transcript.end = max(x[1] for x in transcript.exons)

        transcript.score = record.mapq
        transcript.source = "bam2gtf"

        transcript.attributes.update(
            {tag: str(value) for tag, value in record.get_tags()})

        if "XS" in transcript.attributes:
            transcript.strand = transcript.attributes["XS"]
        elif record.is_reverse:
            transcript.strand = "-"
        else:
            transcript.strand = "+"

        print(transcript.__str__(to_gtf=True), file=args.out)
Beispiel #13
0
def main():
    """
    Command-line entry point: read a BAM file and print one GTF transcript
    for each usable alignment.

    The exon structure is rebuilt from the CIGAR operations, "N" being
    treated as an intron. The transcript attributes receive the identity
    and the coverage (percentages of the inferred read length), the CIGAR
    string and all the BAM tags. The strand comes from the XS tag if there
    is one, otherwise from the orientation of the alignment.

    An alignment is skipped when it has no aligned blocks or no exons. It
    is also skipped, with a message on standard error, when the end rebuilt
    from the CIGAR differs from the end of its last aligned block.
    """
    # Passed positionally the text would be interpreted as "prog" and shown
    # as the program name in the usage line; it is a description.
    parser = argparse.ArgumentParser(
        description="Script to convert from BAM to GTF, for PB alignments")
    parser.add_argument("bam", type=to_bam, help="Input BAM file")
    parser.add_argument("out", nargs="?", default=sys.stdout, type=argparse.FileType("wt"),
                        help="Optional output file")
    args = parser.parse_args()

    # M 0 alignment match (can be a sequence match or mismatch)
    # I 1 insertion to the reference
    # D 2 deletion from the reference
    # N 3 skipped region from the reference
    # S 4 soft clipping (clipped sequences present in SEQ)
    # H 5 hard clipping (clipped sequences NOT present in SEQ)
    # P 6 padding (silent deletion from padded reference)
    # = 7 sequence match
    # X 8 sequence mismatch

    # The MD tag is split on runs of digits. "+" is required: with "*" the
    # pattern matches the empty string and, since Python 3.7, re.split
    # would cut between every character, so that the bases of a deletion
    # ("^AC") would be counted as mismatches.
    digits = re.compile(r"[0-9]+")

    for record in args.bam:
        try:
            start, end = record.reference_start, record.get_blocks()[-1][1]
        except IndexError:
            # The record has no aligned block.
            continue
        current = start

        exons = []
        current_exon = None

        r_length = record.inferred_length  # Read length
        matches = 0
        alen = 0

        for cigar, length in record.cigartuples:
            if cigar in (4, 5):  # Clipping (soft or hard)
                continue
            elif cigar == 1:  # Insertion
                alen += length
            elif cigar == 3:  # Intron
                if current_exon is None:
                    continue  # Read positioned at the end/beginning of scaffold
                exons.append(current_exon)
                current_exon = None
                current += length
            elif cigar in (0, 2):  # Match or deletion
                if current_exon is None:
                    current_exon = (current + 1, current + length)
                else:
                    current_exon = (current_exon[0], current_exon[1] + length)
                current += length
                if cigar == 0:
                    matches += length
                    alen += length

        # Pieces starting with "^" are deletions, the others are mismatches.
        md_value = [_[1] for _ in record.tags if _[0] == "MD"][0]
        snps = 0
        for piece in digits.split(md_value):
            if not piece.startswith("^"):
                snps += len(piece)

        identity = round(100 * (matches - snps) / r_length, 2)
        coverage = round(100 * alen / r_length, 2)

        # Only a real exon may be stored: there is no open exon when the
        # CIGAR has no match/deletion after its last intron.
        if current_exon is not None:
            exons.append(current_exon)
        # This is clearly a mistake
        if current != end:
            msg = """{0}, {1}, {2}
Cigar tuples: {3}
Cigar string: {4}
Blocks: {5}
Exons: {6}
            """.format(current, end, end-current,
                       record.cigartuples,
                       record.cigarstring,
                       record.get_blocks(),
                       exons)
            print(AssertionError(msg), file=sys.stderr)
            continue

        transcript = Transcript()
        transcript.id = record.query_name
        transcript.exons = exons
        transcript.parent = transcript.attributes["gene_id"] = "{0}.gene".format(record.query_name)
        transcript.attributes["identity"] = identity
        transcript.attributes["coverage"] = coverage
        transcript.attributes["cigar"] = record.cigarstring
        transcript.chrom = args.bam.getrname(record.rname)
        if len(transcript.exons) == 0:
            continue

        transcript.start = min(x[0] for x in transcript.exons)
        transcript.end = max(x[1] for x in transcript.exons)

        transcript.score = record.mapq
        transcript.source = "bam2gtf"

        transcript.attributes.update({tag: str(value) for tag, value in record.get_tags()})

        if "XS" in transcript.attributes:
            transcript.strand = transcript.attributes["XS"]
        elif record.is_reverse:
            transcript.strand = "-"
        else:
            transcript.strand = "+"

        print(transcript.__str__(to_gtf=True), file=args.out)
Beispiel #14
0
def parse_prediction(args, genes, positions, queue_logger):
    """
    This function performs the real comparison between the reference and the prediction.

    The rows of ``args.prediction`` are grouped into Transcript objects;
    every completed transcript is handed to ``Assigner.get_best``. When the
    rows are exhausted the assigner is finished and the prediction file is
    closed.

    It needs the following inputs:
    :param args: the Namespace with the necessary parameters. The function
    reads ``args.prediction`` and, when ``args.self`` is True, replaces it
    with a GFF parser opened on ``args.reference.name``.
    :param genes: Dictionary with the reference genes, of the form
    dict[chrom][(start,end)] = [gene object]
    :param positions: Dictionary with the positions of the reference genes, of the form
    dict[chrom][IntervalTree]
    :param queue_logger: Logger
    :return:
    :raises TypeError: if an exon row of a GFF3 file is found before any
    transcript row
    """

    # start the class which will manage the statistics
    accountant_instance = Accountant(genes, args)
    assigner_instance = Assigner(genes, positions, args, accountant_instance)

    transcript = None
    # "Self" mode: the reference file is also used as the prediction.
    if hasattr(args, "self") and args.self is True:
        args.prediction = to_gff(args.reference.name)
    # True when the prediction is parsed as GFF3; any other parser takes
    # the "ref_gff is False" branch below.
    ref_gff = isinstance(args.prediction, GFF3)
    # Base names (".orf<N>" suffix removed) already sent to the assigner.
    __found_with_orf = set()

    for row in args.prediction:
        if row.header is True:
            continue
        #         queue_logger.debug("Row:\n{0:>20}".format(str(row)))
        if row.is_transcript is True or row.feature == "match":
            # A new transcript starts: evaluate the one built so far.
            queue_logger.debug("Transcript row:\n%s", str(row))
            if transcript is not None:
                if re.search(r"\.orf[0-9]+$", transcript.id):
                    # Only the first ".orf<N>" transcript seen for each
                    # base name is evaluated.
                    # NOTE(review): the other branches of this function
                    # keep the transcript whose ID ends with "orf1"
                    # instead - confirm that the two rules are meant to
                    # differ.
                    __name = re.sub(r"\.orf[0-9]+$", "", transcript.id)
                    if __name not in __found_with_orf:
                        __found_with_orf.add(__name)
                        assigner_instance.get_best(transcript)
                    else:
                        pass
                else:
                    assigner_instance.get_best(transcript)
            transcript = Transcript(row, logger=queue_logger)
        elif row.is_exon is True:
            # Case 1: we are talking about cDNA_match and GFF
            if ref_gff is True and "match" not in row.feature:
                # Ordinary GFF3 exon: it needs a transcript row before it.
                if transcript is None:
                    raise TypeError(
                        "Transcript not defined inside the GFF; line:\n{}".
                        format(row))
                else:
                    queue_logger.debug("Adding exon to transcript %s: %s",
                                       transcript.id, row)
                    transcript.add_exon(row)
            elif ref_gff is True and "match" in row.feature:
                # Match-like GFF3 row: it belongs to the current
                # transcript if it shares its ID or lists it as a parent;
                # otherwise it starts a new transcript.
                if transcript is not None and row.id == transcript.id:
                    transcript.add_exon(row)
                elif transcript is not None and transcript.id in row.parent:
                    transcript.add_exon(row)
                elif transcript is None or (transcript is not None
                                            and row.id != transcript.id):
                    if transcript is not None:
                        # ".orf<N>" transcripts are evaluated only when
                        # their ID ends with "orf1".
                        if re.search(r"\.orf[0-9]+$", transcript.id) and \
                                (not transcript.id.endswith("orf1")):
                            pass
                        else:
                            assigner_instance.get_best(transcript)
                    queue_logger.debug("New transcript: %s", row.transcript)
                    # The row is not passed to add_exon here.
                    # NOTE(review): presumably the constructor stores
                    # the row as first exon - confirm.
                    transcript = Transcript(row, logger=queue_logger)
            elif ref_gff is False:
                # Not GFF3: a change of row.transcript marks the beginning
                # of a new transcript.
                if transcript is None or (transcript is not None
                                          and transcript.id != row.transcript):
                    if transcript is not None:
                        if re.search(r"\.orf[0-9]+$", transcript.id) and \
                                (not transcript.id.endswith("orf1")):
                            pass
                        else:
                            assigner_instance.get_best(transcript)
                    queue_logger.debug("New transcript: %s", row.transcript)
                    transcript = Transcript(row, logger=queue_logger)
                transcript.add_exon(row)
            else:
                # ref_gff is a bool, so the branches above already cover
                # every case.
                raise TypeError("Unmatched exon: {}".format(row))

        elif row.header:
            # Rows whose header is exactly True were skipped at the top of
            # the loop; this catches any other truthy value.
            continue
        else:
            queue_logger.debug("Skipped row: {}".format(row))

    # The last transcript is still pending when the rows are exhausted.
    if transcript is not None:
        if re.search(r"\.orf[0-9]+$",
                     transcript.id) and not transcript.id.endswith("orf1"):
            pass
        else:
            assigner_instance.get_best(transcript)

    # Finish everything, including printing refmap and stats
    assigner_instance.finish()
    args.prediction.close()
Beispiel #15
0
def prepare_reference(args,
                      queue_logger,
                      ref_gff=False) -> (dict, collections.defaultdict(dict)):
    """
    Method to prepare the data structures that hold the reference
    information for the parsing.

    The rows of ``args.reference`` are read in order. Transcript rows (and
    ``match`` rows, when ``ref_gff`` is True) create a Transcript which is
    stored inside the Gene it belongs to; exon rows are attached to the
    transcript(s) they refer to. The collected genes are finally passed
    through ``finalize_reference``.

    :param args: object whose ``reference`` attribute is iterated to obtain
        the annotation rows; it is also forwarded to ``finalize_reference``.
    :param queue_logger: logger used for the debug/warning messages and
        handed over to every Transcript and Gene that is created.
    :param ref_gff: when True the rows are interpreted with the GFF rules
        (``match``/``cDNA_match`` features, exons with a list of parents),
        otherwise with the GTF rules (exons carry gene and transcript).
    :return: genes, positions
    :raises KeyError: if no gene is left after ``finalize_reference``.
    """

    genes = dict()
    positions = collections.defaultdict(dict)
    # Transcript ID -> gene ID; needed to find the gene of a GFF exon,
    # which only knows the ID of its parent transcript(s).
    transcript2gene = dict()

    for row in args.reference:
        # Assume we are going to use GTF for the moment
        if row.is_transcript is True or (ref_gff is True
                                         and row.feature == "match"):
            queue_logger.debug("Transcript\n%s", str(row))
            transcript = Transcript(row, logger=queue_logger)
            # A "match" row is stored under its own ID, i.e. it acts as
            # its own gene.
            gid = row.id if row.feature == "match" else row.gene

            transcript2gene[row.id] = gid
            if gid not in genes:
                genes[gid] = Gene(transcript, gid=gid, logger=queue_logger)
            genes[gid].add(transcript)
            assert transcript.id in genes[gid].transcripts
        elif row.is_exon is True:
            if ref_gff is True:
                if "cDNA_match" in row.feature:
                    # cDNA_match rows are made their own parent; the first
                    # row seen with a given ID also creates the gene and
                    # the transcript that will hold the following ones.
                    row.parent = row.id
                    # row.gene = row.id
                    if row.id not in transcript2gene:
                        genes[row.id] = Gene(None,
                                             gid=row.id,
                                             logger=queue_logger)
                        transcript2gene[row.id] = row.id
                        transcript = Transcript(row, logger=queue_logger)
                        genes[row.id].add(transcript)
                found = False
                for parent_id in row.transcript:
                    if parent_id in transcript2gene:
                        # We have to perform the check because there are some GFFs
                        # e.g. TAIR
                        # where CDSs are defined within a spurious "Protein" feature
                        found = True
                        gid = transcript2gene[parent_id]
                        genes[gid][parent_id].add_exon(row)
                if found is False:
                    # Logger.warn is a deprecated alias of Logger.warning
                    queue_logger.warning(
                        "This feature has no corresponding transcript! %s",
                        str(row))
            else:
                if row.gene in genes and row.transcript in genes[
                        row.gene].transcripts:
                    genes[row.gene][row.transcript].add_exon(row)
                else:
                    # Exon seen before its transcript (or without any
                    # transcript row): create the missing containers.
                    if row.gene not in genes:
                        genes[row.gene] = Gene(None,
                                               gid=row.gene,
                                               logger=queue_logger)
                    if row.transcript not in genes[row.gene]:
                        transcript = Transcript(row, logger=queue_logger)
                        transcript2gene[row.id] = row.gene
                        genes[row.gene].add(transcript)
                    genes[row.gene][row.transcript].add_exon(row)

    genes, positions = finalize_reference(genes, positions, queue_logger, args)

    if len(genes) == 0:
        raise KeyError("No genes remained for the reference!")
    return genes, positions
class TChekerTester(unittest.TestCase):
    """
    Tests for the strand checks of TranscriptChecker and for the
    splicing flags (suspicious_splicing, only_non_canonical_splicing)
    of Transcript.

    The packaged genome ("chr5.fas.gz") is decompressed once per class
    into a temporary FASTA file; the multiexonic GFF model is rebuilt
    before every single test.
    """

    temp_genome = None

    @classmethod
    def setUpClass(cls):
        """Decompress the packaged genome into a temporary file and open it with pyfaidx."""

        # Prepare the genome
        cls.temp_genome = tempfile.NamedTemporaryFile(mode="wb", suffix=".fa")
        with pkg_resources.resource_stream("Mikado.tests", "chr5.fas.gz") as comp:
            cls.temp_genome.write(gzip.decompress(comp.read()))
        # Make sure the whole sequence is on disk before pyfaidx reads it
        cls.temp_genome.flush()
        cls.fasta = pyfaidx.Fasta(cls.temp_genome.name)

    def setUp(self):
        """Build the multiexonic "+" strand model and retrieve its sequences."""

        # Prepare the model
        self.model_lines = """Chr5	tair10	transcript	26584797	26595528	100	+	.	ID=c58_g1_i3.mrna1.19;Parent=c58_g1_i3.path1.19;Name=c58_g1_i3.mrna1.19;gene_name=c58_g1_i3
    Chr5	tair10	exon	26584797	26584879	.	+	.	ID=c58_g1_i3.mrna1.19.exon1;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26585220	26585273	.	+	.	ID=c58_g1_i3.mrna1.19.exon2;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26585345	26585889	.	+	.	ID=c58_g1_i3.mrna1.19.exon3;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26585982	26586294	.	+	.	ID=c58_g1_i3.mrna1.19.exon4;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26586420	26586524	.	+	.	ID=c58_g1_i3.mrna1.19.exon5;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26586638	26586850	.	+	.	ID=c58_g1_i3.mrna1.19.exon6;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26586934	26586996	.	+	.	ID=c58_g1_i3.mrna1.19.exon7;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26587084	26587202	.	+	.	ID=c58_g1_i3.mrna1.19.exon8;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26587287	26587345	.	+	.	ID=c58_g1_i3.mrna1.19.exon9;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26587427	26587472	.	+	.	ID=c58_g1_i3.mrna1.19.exon10;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26595411	26595528	.	+	.	ID=c58_g1_i3.mrna1.19.exon11;Parent=c58_g1_i3.mrna1.19"""

        self.gff_lines = []
        for line in self.model_lines.split("\n"):
            # The continuation lines of the literal carry the indentation
            # of this method: remove it before parsing.
            self.gff_lines.append(GffLine(line.strip()))

        # First line is the transcript, all the others are its exons
        self.model = Transcript(self.gff_lines[0])
        self.model.add_exons(self.gff_lines[1:])
        self.model.finalize()

        # GFF coordinates are 1-based and inclusive, hence the "start - 1"
        self.exons = [self.fasta[line.chrom][line.start - 1:line.end] for line in self.gff_lines[1:]]

        self.assertEqual(sum(len(exon) for exon in self.exons), 1718, self.exons)
        # We need the whole genomic fragment
        self.model_fasta = self.fasta["Chr5"][self.model.start - 1:self.model.end]
        self.assertEqual(self.gff_lines[1].start, 26584797)
        self.assertEqual(self.gff_lines[1].end, 26584879)
        self.assertEqual(self.model.exons[0][0], self.gff_lines[1].start)
        self.assertEqual(self.model.exons[0][1], self.gff_lines[1].end)

    @classmethod
    def tearDownClass(cls):
        """Close the temporary genome and delete the ".fai" file left next to it."""
        # Remove the genome
        if hasattr(cls.temp_genome, "close"):
            cls.temp_genome.close()
            cls.fasta.close()
            os.remove("{}.fai".format(cls.temp_genome.name))

    def test_translation_table(self):
        """The translation table maps the code points of A<->T and C<->G."""

        self.assertEqual(TranscriptChecker.get_translation_table(),
                         {65: 84, 67: 71, 71: 67, 84: 65})

    def test_rev_complement(self):
        """rev_complement reverses the string and complements it, leaving N untouched."""

        string = "AGTCGTGCAGNGTCGAAGTGCAACAGTGC"

        self.assertEqual(TranscriptChecker.rev_complement(string),
                         "GCACTGTTGCACTTCGACNCTGCACGACT")

    def test_init(self):
        """A new checker exposes the cDNA length, exons and sequence of the model."""

        tcheck = TranscriptChecker(self.model, self.model_fasta)
        self.assertEqual(tcheck.cdna_length, 1718)
        self.assertEqual(sorted(tcheck.exons), sorted([(exon.start, exon.end) for exon in self.exons]))
        self.assertEqual(tcheck.fasta_seq, self.model_fasta)

    def test_check_reverse_strand(self):
        """Without strand specificity, the model wrongly set to "-" is expected back on "+"."""

        self.model.strand = "-"
        tcheck = TranscriptChecker(self.model, self.model_fasta, strand_specific=False)
        tcheck.check_strand()
        self.assertEqual(tcheck.strand, "+")

    def test_check_strand_not_reversed(self):
        """With strand specificity the "-" strand is kept, but the model is flagged."""
        self.model.strand = "-"
        tcheck = TranscriptChecker(self.model, self.model_fasta, strand_specific=True)
        tcheck.check_strand()
        self.assertEqual(tcheck.strand, "-")
        self.assertTrue(tcheck.attributes["canonical_on_reverse_strand"])
        self.assertTrue(tcheck.suspicious_splicing)

    def test_monoexonic(self):
        """
        A monoexonic model is expected to lose its strand when the data is
        not strand-specific, and to keep the given one otherwise.
        """

        # Shrink the transcript line to its first exon to get a monoexonic model
        exon = self.gff_lines[1]
        transcript_line = self.gff_lines[0]
        transcript_line.end = exon.end
        model = Transcript(transcript_line)
        model.add_exon(exon)
        model.finalize()
        fasta = self.fasta[model.chrom][model.start - 1: model.end]

        tcheck = TranscriptChecker(model.copy(), fasta, strand_specific=False)
        tcheck.check_strand()
        self.assertIsNone(tcheck.strand)

        tcheck = TranscriptChecker(model.copy(), fasta, strand_specific=True)
        tcheck.check_strand()
        self.assertEqual(tcheck.strand, "+")

        neg = model.copy()
        neg.strand = "-"

        tcheck = TranscriptChecker(neg.copy(), fasta, strand_specific=False)
        tcheck.check_strand()
        self.assertIsNone(tcheck.strand)

        tcheck = TranscriptChecker(neg.copy(), fasta, strand_specific=True)
        tcheck.check_strand()
        self.assertEqual(tcheck.strand, "-")

    def test_negative(self):
        """
        Check a multiexonic "-" strand model, both as it is and after
        having been relabelled as "+".
        """

        gtf_lines = """Chr5	Cufflinks	transcript	26575364	26578163	1000	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";exon_number "1";FPKM "2.9700103727";conf_hi "3.260618";frac "0.732092";cov "81.895309";conf_lo "2.679403";
Chr5	Cufflinks	exon	26575364	26575410	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575495	26575620	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575711	26575797	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575885	26575944	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26576035	26576134	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26576261	26577069	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577163	26577288	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577378	26577449	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577856	26578163	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";"""

        gtf_lines = [GtfLine(line) for line in gtf_lines.split("\n")]

        # None of the lines must have been parsed as a header/comment
        self.assertEqual(len([_ for _ in gtf_lines if _.header]), 0)

        transcript = Transcript(gtf_lines[0])
        transcript.add_exons(gtf_lines[1:])
        transcript.finalize()
        fasta_seq = self.fasta[transcript.chrom][transcript.start - 1:transcript.end]

        # Correctly annotated model: the strand must not change
        tr_neg = transcript.copy()
        tchecker = TranscriptChecker(tr_neg, fasta_seq, strand_specific=False)
        self.assertEqual(tchecker.strand, "-")
        self.assertEqual(tchecker.fasta_seq, fasta_seq)
        tchecker.check_strand()
        self.assertEqual(tchecker.strand, "-")

        # Same model, wrongly relabelled as "+"
        tr_neg = transcript.copy()
        tr_neg.strand = "+"
        for ss in (False, True):
            with self.subTest(ss=ss):
                tchecker = TranscriptChecker(tr_neg.copy(), fasta_seq, strand_specific=ss)
                tchecker.check_strand()
                if ss:
                    self.assertEqual(tchecker.strand, "+")
                    self.assertTrue(tchecker.suspicious_splicing)
                else:
                    self.assertEqual(tchecker.strand, "-")

    def test_suspicious(self):
        """
        Check suspicious_splicing and only_non_canonical_splicing of the
        multiexonic model for several combinations of the "mixed_splices",
        "canonical_number" and "canonical_on_reverse_strand" attributes.
        """

        self.model.attributes["mixed_splices"] = "6positive,1negative"
        self.assertTrue(self.model.suspicious_splicing)
        del self.model.attributes["mixed_splices"]
        self.assertFalse(self.model.suspicious_splicing)

        self.model.attributes["canonical_number"] = 0
        self.assertFalse(self.model.suspicious_splicing)

        del self.model.attributes["canonical_number"]

        self.model.attributes["canonical_on_reverse_strand"] = True
        self.assertTrue(self.model.suspicious_splicing)
        self.model.attributes["canonical_on_reverse_strand"] = False
        self.assertFalse(self.model.suspicious_splicing)
        self.model.attributes["mixed_splices"] = "6positive,1negative"
        self.assertTrue(self.model.suspicious_splicing)

        del self.model.attributes["mixed_splices"]
        del self.model.attributes["canonical_on_reverse_strand"]
        self.model.attributes["canonical_number"] = 0
        self.assertFalse(self.model.suspicious_splicing)
        self.assertTrue(self.model.only_non_canonical_splicing)
        self.model.attributes["canonical_on_reverse_strand"] = True
        self.assertTrue(self.model.suspicious_splicing)
        self.assertTrue(self.model.only_non_canonical_splicing)
        del self.model.attributes["canonical_on_reverse_strand"]
        self.model.attributes["mixed_splices"] = "6positive,1negative"
        self.assertTrue(self.model.suspicious_splicing)
        self.assertTrue(self.model.only_non_canonical_splicing)

    # NOTE: this test used to be defined twice in the class, with the second
    # definition silently replacing the first one; a single copy is kept.
    def test_monoexonic_suspicious(self):
        """A monoexonic transcript should never appear as a suspicious transcript, in terms of splicing."""

        # Shrink the transcript line to its first exon to get a monoexonic model
        exon = self.gff_lines[1]
        transcript_line = self.gff_lines[0]
        transcript_line.end = exon.end
        model = Transcript(transcript_line)
        model.add_exon(exon)
        model.finalize()

        model.attributes["mixed_splices"] = "6positive,1negative"
        self.assertFalse(model.suspicious_splicing)
        del model.attributes["mixed_splices"]
        self.assertFalse(model.suspicious_splicing)

        model.attributes["canonical_number"] = 0
        self.assertFalse(model.suspicious_splicing)

        del model.attributes["canonical_number"]

        model.attributes["canonical_on_reverse_strand"] = True
        self.assertFalse(model.suspicious_splicing)
        model.attributes["canonical_on_reverse_strand"] = False
        self.assertFalse(model.suspicious_splicing)
        model.attributes["mixed_splices"] = "6positive,1negative"
        self.assertFalse(model.suspicious_splicing)

        del model.attributes["mixed_splices"]
        del model.attributes["canonical_on_reverse_strand"]
        model.attributes["canonical_number"] = 0
        self.assertFalse(model.suspicious_splicing)
        self.assertFalse(model.only_non_canonical_splicing)
        model.attributes["canonical_on_reverse_strand"] = True
        self.assertFalse(model.suspicious_splicing)
        self.assertFalse(model.only_non_canonical_splicing)
        del model.attributes["canonical_on_reverse_strand"]
        model.attributes["mixed_splices"] = "6positive,1negative"
        self.assertFalse(model.suspicious_splicing)
        self.assertFalse(model.only_non_canonical_splicing)
class TChekerTester(unittest.TestCase):
    """
    Exercise TranscriptChecker (strand checks) and the splicing flags of
    Transcript on models located on Chr5 of the packaged test genome.
    """

    temp_genome = None

    @classmethod
    def setUpClass(cls):
        """Unpack "chr5.fas.gz" into a temporary FASTA and open it with pyfaidx."""

        genome_handle = tempfile.NamedTemporaryFile(mode="wb", suffix=".fa")
        cls.temp_genome = genome_handle
        with pkg_resources.resource_stream("Mikado.tests",
                                           "chr5.fas.gz") as packed:
            genome_handle.write(gzip.decompress(packed.read()))
        genome_handle.flush()
        cls.fasta = pyfaidx.Fasta(genome_handle.name)

    def setUp(self):
        """Recreate the eleven-exon model, its exon sequences and its genomic sequence."""

        self.model_lines = """Chr5	tair10	transcript	26584797	26595528	100	+	.	ID=c58_g1_i3.mrna1.19;Parent=c58_g1_i3.path1.19;Name=c58_g1_i3.mrna1.19;gene_name=c58_g1_i3
    Chr5	tair10	exon	26584797	26584879	.	+	.	ID=c58_g1_i3.mrna1.19.exon1;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26585220	26585273	.	+	.	ID=c58_g1_i3.mrna1.19.exon2;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26585345	26585889	.	+	.	ID=c58_g1_i3.mrna1.19.exon3;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26585982	26586294	.	+	.	ID=c58_g1_i3.mrna1.19.exon4;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26586420	26586524	.	+	.	ID=c58_g1_i3.mrna1.19.exon5;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26586638	26586850	.	+	.	ID=c58_g1_i3.mrna1.19.exon6;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26586934	26586996	.	+	.	ID=c58_g1_i3.mrna1.19.exon7;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26587084	26587202	.	+	.	ID=c58_g1_i3.mrna1.19.exon8;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26587287	26587345	.	+	.	ID=c58_g1_i3.mrna1.19.exon9;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26587427	26587472	.	+	.	ID=c58_g1_i3.mrna1.19.exon10;Parent=c58_g1_i3.mrna1.19
    Chr5	tair10	exon	26595411	26595528	.	+	.	ID=c58_g1_i3.mrna1.19.exon11;Parent=c58_g1_i3.mrna1.19"""

        # Each raw line is stripped of the indentation inherited from the
        # literal before being parsed.
        self.gff_lines = [
            GffLine(raw.strip()) for raw in self.model_lines.split("\n")
        ]
        exon_rows = self.gff_lines[1:]

        self.model = Transcript(self.gff_lines[0])
        self.model.add_exons(exon_rows)
        self.model.finalize()

        chromosome_lookup = self.fasta
        self.exons = [
            chromosome_lookup[row.chrom][row.start - 1:row.end]
            for row in exon_rows
        ]

        total_exonic_length = sum(len(sequence) for sequence in self.exons)
        self.assertEqual(total_exonic_length, 1718, self.exons)

        # Sequence of the whole locus, introns included
        locus_start = self.model.start - 1
        self.model_fasta = self.fasta["Chr5"][locus_start:self.model.end]

        first_exon = self.gff_lines[1]
        self.assertEqual(first_exon.start, 26584797)
        self.assertEqual(first_exon.end, 26584879)
        self.assertEqual(self.model.exons[0][0], first_exon.start)
        self.assertEqual(self.model.exons[0][1], first_exon.end)

    @classmethod
    def tearDownClass(cls):
        """Close the genome handles and delete the ".fai" file found next to the FASTA."""

        if not hasattr(cls.temp_genome, "close"):
            return
        cls.temp_genome.close()
        cls.fasta.close()
        os.remove("{}.fai".format(cls.temp_genome.name))

    def test_translation_table(self):
        """The table must map the code points of each base to its complement."""

        expected = {
            ord("A"): ord("T"),
            ord("C"): ord("G"),
            ord("G"): ord("C"),
            ord("T"): ord("A"),
        }
        self.assertEqual(TranscriptChecker.get_translation_table(), expected)

    def test_rev_complement(self):
        """The reverse complement keeps the N and swaps every other base."""

        sequence = "AGTCGTGCAGNGTCGAAGTGCAACAGTGC"
        reversed_complement = TranscriptChecker.rev_complement(sequence)
        self.assertEqual(reversed_complement, "GCACTGTTGCACTTCGACNCTGCACGACT")

    def test_init(self):
        """The checker must mirror the length, exons and sequence of the model."""

        checker = TranscriptChecker(self.model, self.model_fasta)
        self.assertEqual(checker.cdna_length, 1718)
        expected_exons = sorted(
            [(sequence.start, sequence.end) for sequence in self.exons])
        self.assertEqual(sorted(checker.exons), expected_exons)
        self.assertEqual(checker.fasta_seq, self.model_fasta)

    def test_check_reverse_strand(self):
        """Not strand-specific: the model mislabelled "-" must come back as "+"."""

        self.model.strand = "-"
        checker = TranscriptChecker(self.model, self.model_fasta,
                                    strand_specific=False)
        checker.check_strand()
        self.assertEqual(checker.strand, "+")

    def test_check_strand_not_reversed(self):
        """Strand-specific: the "-" label stays, and the model gets flagged."""

        self.model.strand = "-"
        checker = TranscriptChecker(self.model, self.model_fasta,
                                    strand_specific=True)
        checker.check_strand()
        self.assertEqual(checker.strand, "-")
        self.assertTrue(checker.attributes["canonical_on_reverse_strand"])
        self.assertTrue(checker.suspicious_splicing)

    def test_monoexonic(self):
        """Strand of a monoexonic model after the check, with and without strand specificity."""

        # Truncate the transcript to its first exon
        only_exon = self.gff_lines[1]
        transcript_row = self.gff_lines[0]
        transcript_row.end = only_exon.end
        model = Transcript(transcript_row)
        model.add_exon(only_exon)
        model.finalize()
        fasta = self.fasta[model.chrom][model.start - 1:model.end]

        def strand_after_check(template, strand_specific):
            """Check a copy of the template and report the resulting strand."""
            checker = TranscriptChecker(template.copy(), fasta,
                                        strand_specific=strand_specific)
            checker.check_strand()
            return checker.strand

        self.assertIsNone(strand_after_check(model, False))
        self.assertEqual(strand_after_check(model, True), "+")

        neg = model.copy()
        neg.strand = "-"

        self.assertIsNone(strand_after_check(neg, False))
        self.assertEqual(strand_after_check(neg, True), "-")

    def test_negative(self):
        """Strand checks on a "-" strand model, as given and relabelled "+"."""

        gtf_text = """Chr5	Cufflinks	transcript	26575364	26578163	1000	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";exon_number "1";FPKM "2.9700103727";conf_hi "3.260618";frac "0.732092";cov "81.895309";conf_lo "2.679403";
Chr5	Cufflinks	exon	26575364	26575410	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575495	26575620	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575711	26575797	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26575885	26575944	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26576035	26576134	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26576261	26577069	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577163	26577288	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577378	26577449	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";
Chr5	Cufflinks	exon	26577856	26578163	.	-	.	gene_id "cufflinks_star_at.23553";transcript_id "cufflinks_star_at.23553.1";"""

        rows = [GtfLine(raw) for raw in gtf_text.split("\n")]

        header_rows = [row for row in rows if row.header]
        self.assertEqual(len(header_rows), 0)

        transcript = Transcript(rows[0])
        transcript.add_exons(rows[1:])
        transcript.finalize()
        chromosome = self.fasta[transcript.chrom]
        fasta_seq = chromosome[transcript.start - 1:transcript.end]

        # The model as annotated
        checker = TranscriptChecker(transcript.copy(), fasta_seq,
                                    strand_specific=False)
        self.assertEqual(checker.strand, "-")
        self.assertEqual(checker.fasta_seq, fasta_seq)
        checker.check_strand()
        self.assertEqual(checker.strand, "-")

        # The model with the opposite label
        relabelled = transcript.copy()
        relabelled.strand = "+"
        for ss in (False, True):
            with self.subTest(ss=ss):
                checker = TranscriptChecker(relabelled.copy(), fasta_seq,
                                            strand_specific=ss)
                checker.check_strand()
                if not ss:
                    self.assertEqual(checker.strand, "-")
                else:
                    self.assertEqual(checker.strand, "+")
                    self.assertTrue(checker.suspicious_splicing)

    def test_suspicious(self):
        """Splicing flags of the multiexonic model under several attribute combinations."""

        model = self.model
        attrs = self.model.attributes
        mixed = "6positive,1negative"

        attrs["mixed_splices"] = mixed
        self.assertTrue(model.suspicious_splicing)
        del attrs["mixed_splices"]
        self.assertFalse(model.suspicious_splicing)

        attrs["canonical_number"] = 0
        self.assertFalse(model.suspicious_splicing)

        del attrs["canonical_number"]

        attrs["canonical_on_reverse_strand"] = True
        self.assertTrue(model.suspicious_splicing)
        attrs["canonical_on_reverse_strand"] = False
        self.assertFalse(model.suspicious_splicing)
        attrs["mixed_splices"] = mixed
        self.assertTrue(model.suspicious_splicing)

        del attrs["mixed_splices"]
        del attrs["canonical_on_reverse_strand"]
        attrs["canonical_number"] = 0
        self.assertFalse(model.suspicious_splicing)
        self.assertTrue(model.only_non_canonical_splicing)
        attrs["canonical_on_reverse_strand"] = True
        self.assertTrue(model.suspicious_splicing)
        self.assertTrue(model.only_non_canonical_splicing)
        del attrs["canonical_on_reverse_strand"]
        attrs["mixed_splices"] = mixed
        self.assertTrue(model.suspicious_splicing)
        self.assertTrue(model.only_non_canonical_splicing)

    def test_monoexonic_suspicious(self):
        """A monoexonic transcript should never appear as a suspicious transcript, in terms of splicing."""

        # Truncate the transcript to its first exon
        only_exon = self.gff_lines[1]
        transcript_row = self.gff_lines[0]
        transcript_row.end = only_exon.end
        model = Transcript(transcript_row)
        model.add_exon(only_exon)
        model.finalize()

        attrs = model.attributes
        mixed = "6positive,1negative"

        attrs["mixed_splices"] = mixed
        self.assertFalse(model.suspicious_splicing)
        del attrs["mixed_splices"]
        self.assertFalse(model.suspicious_splicing)

        attrs["canonical_number"] = 0
        self.assertFalse(model.suspicious_splicing)

        del attrs["canonical_number"]

        attrs["canonical_on_reverse_strand"] = True
        self.assertFalse(model.suspicious_splicing)
        attrs["canonical_on_reverse_strand"] = False
        self.assertFalse(model.suspicious_splicing)
        attrs["mixed_splices"] = mixed
        self.assertFalse(model.suspicious_splicing)

        del attrs["mixed_splices"]
        del attrs["canonical_on_reverse_strand"]
        attrs["canonical_number"] = 0
        self.assertFalse(model.suspicious_splicing)
        self.assertFalse(model.only_non_canonical_splicing)
        attrs["canonical_on_reverse_strand"] = True
        self.assertFalse(model.suspicious_splicing)
        self.assertFalse(model.only_non_canonical_splicing)
        del attrs["canonical_on_reverse_strand"]
        attrs["mixed_splices"] = mixed
        self.assertFalse(model.suspicious_splicing)
        self.assertFalse(model.only_non_canonical_splicing)
# Example #19
# 0
def parse_prediction(args, genes, positions, queue_logger):

    """
    This function performs the real comparison between the reference and the prediction.

    The prediction is consumed row by row; consecutive rows are grouped into
    Transcript objects and every completed transcript is handed to the
    Assigner through ``get_best``. Transcripts whose ID ends in ``.orfN`` are
    only assigned once per group: after a transcript row the first ORF seen
    for a given base name is kept, in every other case only ``orf1`` is kept.

    It needs the following inputs:
    :param args: the Namespace with the necessary parameters. ``args.prediction``
    is iterated here and is always closed before returning, even if parsing fails.
    If ``args.self`` is True, ``args.prediction`` is first replaced with
    ``to_gff(args.reference.name)``.
    :param genes: Dictionary with the reference genes, of the form
    dict[chrom][(start,end)] = [gene object]
    :param positions: Dictionary with the positions of the reference genes, of the form
    dict[chrom][IntervalTree]
    :param queue_logger: Logger
    :return: None
    :raises TypeError: if a GFF3 exon row is found before any transcript row.
    """

    # Compiled once: the same pattern is checked for every flushed transcript.
    orf_suffix = re.compile(r"\.orf[0-9]+$")

    # start the class which will manage the statistics
    accountant_instance = Accountant(genes, args)
    assigner_instance = Assigner(genes, positions, args, accountant_instance)

    if hasattr(args, "self") and args.self is True:
        args.prediction = to_gff(args.reference.name)
    ref_gff = isinstance(args.prediction, GFF3)
    seen_orf_bases = set()

    def assign_once_per_orf_group(model):
        """Assign *model*, unless another ORF with the same base ID was already assigned."""
        if orf_suffix.search(model.id):
            base_name = orf_suffix.sub("", model.id)
            if base_name in seen_orf_bases:
                return
            seen_orf_bases.add(base_name)
        assigner_instance.get_best(model)

    def assign_unless_secondary_orf(model):
        """Assign *model*, unless its ID marks it as an ORF other than ``orf1``."""
        if orf_suffix.search(model.id) and not model.id.endswith("orf1"):
            return
        assigner_instance.get_best(model)

    transcript = None
    try:
        for row in args.prediction:
            if row.header is True:
                continue
            if row.is_transcript is True or row.feature == "match":
                queue_logger.debug("Transcript row:\n%s", str(row))
                if transcript is not None:
                    assign_once_per_orf_group(transcript)
                transcript = Transcript(row, logger=queue_logger)
            elif row.is_exon is True:
                if ref_gff:
                    if "match" not in row.feature:
                        # Regular GFF3 exon: its transcript row must have come first.
                        if transcript is None:
                            raise TypeError(
                                "Transcript not defined inside the GFF; line:\n{}".format(row))
                        queue_logger.debug("Adding exon to transcript %s: %s",
                                           transcript.id, row)
                        transcript.add_exon(row)
                    elif transcript is not None and (
                            row.id == transcript.id or transcript.id in row.parent):
                        # cDNA_match-like row that extends the current transcript.
                        transcript.add_exon(row)
                    else:
                        # cDNA_match-like row that opens a new transcript.
                        if transcript is not None:
                            assign_unless_secondary_orf(transcript)
                        queue_logger.debug("New transcript: %s", row.transcript)
                        transcript = Transcript(row, logger=queue_logger)
                else:
                    # Not GFF3: there are no transcript rows to rely on, so a
                    # change in the transcript ID of the exon opens a new model.
                    if transcript is None or transcript.id != row.transcript:
                        if transcript is not None:
                            assign_unless_secondary_orf(transcript)
                        queue_logger.debug("New transcript: %s", row.transcript)
                        transcript = Transcript(row, logger=queue_logger)
                    transcript.add_exon(row)
            elif row.header:
                # Defensive: header flags that are truthy without being the literal True.
                continue
            else:
                queue_logger.debug("Skipped row: %s", row)

        # Flush the last transcript, which no later row could trigger.
        if transcript is not None:
            assign_unless_secondary_orf(transcript)

        # Finish everything, including printing refmap and stats
        assigner_instance.finish()
    finally:
        # Release the prediction handle even when parsing aborts with an error.
        args.prediction.close()
    def test_monoexonic_suspicious(self):
        """A monoexonic transcript should never appear as a suspicious transcript, in terms of splicing.

        The splicing-related attributes are set and removed in several
        combinations; ``suspicious_splicing`` (and, in the final rounds,
        ``only_non_canonical_splicing``) must remain False throughout.
        """

        # Single-exon model: the transcript record is cut to the exon's end.
        transcript_record = self.gff_lines[0]
        single_exon = self.gff_lines[1]
        transcript_record.end = single_exon.end
        single = Transcript(transcript_record)
        single.add_exon(single_exon)
        single.finalize()

        def check_both_flags():
            # Both splicing properties are checked, in this order.
            self.assertFalse(single.suspicious_splicing)
            self.assertFalse(single.only_non_canonical_splicing)

        # Only "mixed_splices" present, then absent again.
        single.attributes["mixed_splices"] = "6positive,1negative"
        self.assertFalse(single.suspicious_splicing)
        del single.attributes["mixed_splices"]
        self.assertFalse(single.suspicious_splicing)

        # Only "canonical_number" present; removed before moving on.
        single.attributes["canonical_number"] = 0
        self.assertFalse(single.suspicious_splicing)
        del single.attributes["canonical_number"]

        # Reverse-strand flag set to each boolean value in turn, then kept
        # (as False) while "mixed_splices" is added on top.
        for reverse_flag in (True, False):
            single.attributes["canonical_on_reverse_strand"] = reverse_flag
            self.assertFalse(single.suspicious_splicing)
        single.attributes["mixed_splices"] = "6positive,1negative"
        self.assertFalse(single.suspicious_splicing)

        # Reset to a zero canonical count and test both properties for the
        # remaining combinations.
        del single.attributes["mixed_splices"]
        del single.attributes["canonical_on_reverse_strand"]
        single.attributes["canonical_number"] = 0
        check_both_flags()

        single.attributes["canonical_on_reverse_strand"] = True
        check_both_flags()

        del single.attributes["canonical_on_reverse_strand"]
        single.attributes["mixed_splices"] = "6positive,1negative"
        check_both_flags()