def test_monoexonic(self):
    """Check how ``check_strand`` treats a single-exon transcript.

    The model is cut down to its first exon. The assertions expect the
    strand to be reset to ``None`` when the checker is not strand-specific,
    and the annotated strand ("+" or "-") to be kept when it is.
    """
    first_exon = self.gff_lines[1]
    header = self.gff_lines[0]
    # Shrink the transcript so that it ends where its first exon ends.
    header.end = first_exon.end
    model = Transcript(header)
    model.add_exon(first_exon)
    model.finalize()
    fasta = self.fasta[model.chrom][model.start - 1:model.end]

    def strand_after_check(template, strand_specific):
        # Always work on a copy, so the template is reusable across checks.
        checker = TranscriptChecker(template.copy(), fasta,
                                    strand_specific=strand_specific)
        checker.check_strand()
        return checker.strand

    self.assertIsNone(strand_after_check(model, False))
    self.assertEqual(strand_after_check(model, True), "+")

    neg = model.copy()
    neg.strand = "-"
    self.assertIsNone(strand_after_check(neg, False))
    self.assertEqual(strand_after_check(neg, True), "-")
def test_monoexonic(self):
    """Check how ``check_strand`` treats a single-exon transcript.

    The model is cut down to its first exon. The assertions expect the
    strand to be reset to ``None`` when the checker is not strand-specific,
    and the annotated strand ("+" or "-") to be kept when it is.
    """
    first_exon = self.gff_lines[1]
    header = self.gff_lines[0]
    # Shrink the transcript so that it ends where its first exon ends.
    header.end = first_exon.end
    model = Transcript(header)
    model.add_exon(first_exon)
    model.finalize()
    fasta = self.fasta[model.chrom][model.start - 1:model.end]

    def strand_after_check(template, strand_specific):
        # Always work on a copy, so the template is reusable across checks.
        checker = TranscriptChecker(template.copy(), fasta,
                                    strand_specific=strand_specific)
        checker.check_strand()
        return checker.strand

    self.assertIsNone(strand_after_check(model, False))
    self.assertEqual(strand_after_check(model, True), "+")

    neg = model.copy()
    neg.strand = "-"
    self.assertIsNone(strand_after_check(neg, False))
    self.assertEqual(strand_after_check(neg, True), "-")
def main():
    """Convert the mapped records of a BAM file into GTF or BED12 models.

    Command line: ``[--strict] [--outfmt {gtf,bed12}] bam [out]``.

    Every mapped record is turned into a ``Transcript``. The first record
    with a given query name keeps that name; later records with the same
    query name get the suffix ``_<n>``, where ``n`` is the number of records
    with that name seen before. The gene identifier is ``<name>.gene``.
    """
    # The text is passed as ``description``: the first positional parameter
    # of ArgumentParser is ``prog``, which would put the sentence into the
    # usage line instead of the help text.
    parser = argparse.ArgumentParser(
        description="Script to convert from BAM to GTF, for PB alignments")
    parser.add_argument(
        "--strict", action="store_true", default=False,
        help="Switch. If set, this script will never output multiexonic transcripts "
             "without a defined strand.")
    parser.add_argument("--outfmt", choices=["gtf", "bed12"], default="gtf")
    parser.add_argument("bam", type=to_bam, help="Input BAM file")
    parser.add_argument("out", nargs="?", default=sys.stdout,
                        type=argparse.FileType("wt"),
                        help="Optional output file")
    args = parser.parse_args()

    # CIGAR operation codes, for reference:
    # M 0 alignment match (can be a sequence match or mismatch)
    # I 1 insertion to the reference
    # D 2 deletion from the reference
    # N 3 skipped region from the reference
    # S 4 soft clipping (clipped sequences present in SEQ)
    # H 5 hard clipping (clipped sequences NOT present in SEQ)
    # P 6 padding (silent deletion from padded reference)
    # = 7 sequence match
    # X 8 sequence mismatch

    # Number of records already emitted for each query name.
    name_counter = Counter()

    for record in args.bam:
        if record.is_unmapped is True:
            continue

        transcript = Transcript(record, accept_undefined_multi=(not args.strict))
        previous = name_counter.get(record.query_name)
        if previous:
            # Secondary/supplementary occurrence: make the identifier unique.
            name = "{}_{}".format(record.query_name, previous)
        else:
            name = record.query_name

        transcript.id = name
        transcript.parent = transcript.attributes["gene_id"] = "{0}.gene".format(name)
        name_counter.update([record.query_name])
        transcript.source = "bam2gtf"
        print(transcript.format(args.outfmt), file=args.out)
def setUp(self):
    """Build the multi-exon model and the sequences every test starts from.

    Populates ``self.model_lines`` (GFF3 text), ``self.gff_lines`` (parsed
    ``GffLine`` records), ``self.model`` (finalized ``Transcript``),
    ``self.exons`` (one slice of ``self.fasta`` per exon) and
    ``self.model_fasta`` (the genomic slice spanning the whole model), then
    sanity-checks the fixture itself.
    """
    # Prepare the model
    # NOTE(review): columns are joined with tabs and records with newlines,
    # as the GFF3 format prescribes; this span was whitespace-collapsed in
    # the reviewed copy, so confirm the separators against the repository.
    transcript_id = "c58_g1_i3.mrna1.19"
    exon_spans = (
        (26584797, 26584879), (26585220, 26585273), (26585345, 26585889),
        (26585982, 26586294), (26586420, 26586524), (26586638, 26586850),
        (26586934, 26586996), (26587084, 26587202), (26587287, 26587345),
        (26587427, 26587472), (26595411, 26595528),
    )
    records = ["\t".join((
        "Chr5", "tair10", "transcript", "26584797", "26595528", "100", "+", ".",
        "ID={0};Parent=c58_g1_i3.path1.19;Name={0};gene_name=c58_g1_i3".format(
            transcript_id)))]
    for number, (start, end) in enumerate(exon_spans, 1):
        records.append("\t".join((
            "Chr5", "tair10", "exon", str(start), str(end), ".", "+", ".",
            "ID={0}.exon{1};Parent={0}".format(transcript_id, number))))
    self.model_lines = "\n".join(records)

    self.gff_lines = [GffLine(record.strip())
                      for record in self.model_lines.split("\n")]

    self.model = Transcript(self.gff_lines[0])
    self.model.add_exons(self.gff_lines[1:])
    self.model.finalize()

    # GFF coordinates are used as 1-based inclusive: shift the start by one
    # to slice the sequence.
    exon_lines = self.gff_lines[1:]
    self.exons = [self.fasta[line.chrom][line.start - 1:line.end]
                  for line in exon_lines]
    self.assertEqual(sum(len(exon) for exon in self.exons), 1718, self.exons)

    # We need the whole genomic fragment
    self.model_fasta = self.fasta["Chr5"][self.model.start - 1:self.model.end]

    first_exon = self.gff_lines[1]
    self.assertEqual(first_exon.start, 26584797)
    self.assertEqual(first_exon.end, 26584879)
    self.assertEqual(self.model.exons[0][0], first_exon.start)
    self.assertEqual(self.model.exons[0][1], first_exon.end)
def __prepare_transcript(self, prediction: Transcript):
    """
    Private method that checks that a prediction transcript is OK before
    starting to analyse its concordance with the reference.

    The transcript is finalized and, when ``self.args.exclude_utr`` is set,
    its UTRs are removed. If finalization fails with ``InvalidCDS`` the CDS
    is stripped and the transcript is kept. Any ``InvalidTranscript`` is
    logged, counted in ``self.done`` and reported through
    ``self.print_tmap(None)``.

    :param prediction: the prediction transcript to validate. It is modified
        in place (its logger is replaced by ``self.logger``).
    :return: the prepared transcript, or None if it had to be discarded.
    """

    def reject(headline, err):
        """Log the failure and record the transcript as processed."""
        # Logger.warn is a deprecated alias; use Logger.warning.
        self.logger.warning(headline, prediction.id)
        self.logger.warning("Error message: %s", err)
        self.done += 1
        self.print_tmap(None)

    # Prepare the prediction to be analysed
    prediction.logger = self.logger
    try:
        prediction.finalize()
        # noinspection PyUnresolvedReferences
        if self.args.exclude_utr is True:
            prediction.remove_utrs()
    except InvalidCDS:
        # The structure may still be usable without its coding annotation.
        try:
            prediction.strip_cds()
        except InvalidTranscript as err:
            reject("Invalid transcript (due to CDS): %s", err)
            return None
    except InvalidTranscript as err:
        reject("Invalid transcript: %s", err)
        return None

    return prediction
def setUp(self):
    """Build the multi-exon model and the sequences every test starts from.

    Populates ``self.model_lines`` (GFF3 text), ``self.gff_lines`` (parsed
    ``GffLine`` records), ``self.model`` (finalized ``Transcript``),
    ``self.exons`` (one slice of ``self.fasta`` per exon) and
    ``self.model_fasta`` (the genomic slice spanning the whole model), then
    sanity-checks the fixture itself.
    """
    # Prepare the model
    # NOTE(review): columns are joined with tabs and records with newlines,
    # as the GFF3 format prescribes; this span was whitespace-collapsed in
    # the reviewed copy, so confirm the separators against the repository.
    transcript_id = "c58_g1_i3.mrna1.19"
    exon_spans = (
        (26584797, 26584879), (26585220, 26585273), (26585345, 26585889),
        (26585982, 26586294), (26586420, 26586524), (26586638, 26586850),
        (26586934, 26586996), (26587084, 26587202), (26587287, 26587345),
        (26587427, 26587472), (26595411, 26595528),
    )
    records = ["\t".join((
        "Chr5", "tair10", "transcript", "26584797", "26595528", "100", "+", ".",
        "ID={0};Parent=c58_g1_i3.path1.19;Name={0};gene_name=c58_g1_i3".format(
            transcript_id)))]
    for number, (start, end) in enumerate(exon_spans, 1):
        records.append("\t".join((
            "Chr5", "tair10", "exon", str(start), str(end), ".", "+", ".",
            "ID={0}.exon{1};Parent={0}".format(transcript_id, number))))
    self.model_lines = "\n".join(records)

    self.gff_lines = [GffLine(record.strip())
                      for record in self.model_lines.split("\n")]

    self.model = Transcript(self.gff_lines[0])
    self.model.add_exons(self.gff_lines[1:])
    self.model.finalize()

    # GFF coordinates are used as 1-based inclusive: shift the start by one
    # to slice the sequence.
    exon_lines = self.gff_lines[1:]
    self.exons = [self.fasta[line.chrom][line.start - 1:line.end]
                  for line in exon_lines]
    self.assertEqual(sum(len(exon) for exon in self.exons), 1718, self.exons)

    # We need the whole genomic fragment
    self.model_fasta = self.fasta["Chr5"][self.model.start - 1:self.model.end]

    first_exon = self.gff_lines[1]
    self.assertEqual(first_exon.start, 26584797)
    self.assertEqual(first_exon.end, 26584879)
    self.assertEqual(self.model.exons[0][0], first_exon.start)
    self.assertEqual(self.model.exons[0][1], first_exon.end)
def launch(args):
    """
    Print, in GTF format, the transcripts of a GTF file that lie entirely
    inside a genomic region.

    :param args: the argparse Namespace. The function reads ``args.gtf``,
        ``args.out``, ``args.assume_sorted`` and the region, given either as
        ``args.region`` (a ``(chrom, start, end)`` triple) or as
        ``args.chrom``, ``args.start`` and ``args.end``.
    :raises ValueError: if ``args.region`` is not a triple, or if its start
        is not smaller than its end.
    """
    region = getattr(args, "region", None)
    if region is not None:
        try:
            args.chrom, args.start, args.end = region
        except ValueError as exc:
            raise ValueError("{0} {1}".format(exc, args.region))
        # NOTE(review): the check is assumed to belong to the region branch;
        # the indentation was lost in the reviewed copy.
        if args.start >= args.end:
            raise ValueError("Start greater than end: {0}\t{1}".format(
                args.start, args.end))

    def emit_if_inside(model):
        """Print the model when it is fully contained in the region."""
        if model is None:
            return
        if model.start >= args.start and model.end <= args.end:
            print(model.format("gtf"), file=args.out)

    transcript = None
    with GTF(args.gtf) as gtf:
        for row in gtf:
            if row.chrom != args.chrom:
                continue
            if row.is_transcript is not True:
                # Any other row is attached to the transcript being built.
                transcript.add_exon(row)
                continue

            # A new transcript starts: flush the previous one.
            emit_if_inside(transcript)
            transcript = None
            if args.assume_sorted is True and row.start > args.end:
                # Sorted input: nothing further can fall inside the region.
                break
            transcript = Transcript(row)

    emit_if_inside(transcript)
def test_negative(self):
    """Check strand verification for a multi-exon minus-strand model.

    With the annotated "-" strand the checker must leave the strand alone.
    When the same model is relabelled "+", a non-strand-specific checker is
    expected to flip it back to "-", while a strand-specific one keeps "+"
    and flags the splicing as suspicious.
    """
    # NOTE(review): columns are joined with tabs and records with newlines,
    # as the GTF format prescribes; this span was whitespace-collapsed in
    # the reviewed copy, so confirm the separators against the repository.
    ids = 'gene_id "{0}";transcript_id "{0}.1";'.format("cufflinks_star_at.23553")
    transcript_attributes = ids + (
        'exon_number "1";FPKM "2.9700103727";conf_hi "3.260618";'
        'frac "0.732092";cov "81.895309";conf_lo "2.679403";')
    exon_spans = (
        (26575364, 26575410), (26575495, 26575620), (26575711, 26575797),
        (26575885, 26575944), (26576035, 26576134), (26576261, 26577069),
        (26577163, 26577288), (26577378, 26577449), (26577856, 26578163),
    )
    records = ["\t".join((
        "Chr5", "Cufflinks", "transcript", "26575364", "26578163", "1000",
        "-", ".", transcript_attributes))]
    records.extend(
        "\t".join(("Chr5", "Cufflinks", "exon", str(start), str(end),
                   ".", "-", ".", ids))
        for start, end in exon_spans)

    gtf_lines = [GtfLine(record) for record in "\n".join(records).split("\n")]
    self.assertEqual(len([line for line in gtf_lines if line.header]), 0)

    transcript = Transcript(gtf_lines[0])
    transcript.add_exons(gtf_lines[1:])
    transcript.finalize()
    fasta_seq = self.fasta[transcript.chrom][transcript.start - 1:transcript.end]

    # Correctly annotated model: nothing should change.
    tchecker = TranscriptChecker(transcript.copy(), fasta_seq, strand_specific=False)
    self.assertEqual(tchecker.strand, "-")
    self.assertEqual(tchecker.fasta_seq, fasta_seq)
    tchecker.check_strand()
    self.assertEqual(tchecker.strand, "-")

    # Same structure, wrongly annotated on the plus strand.
    flipped = transcript.copy()
    flipped.strand = "+"
    for ss in (False, True):
        with self.subTest(ss=ss):
            tchecker = TranscriptChecker(flipped.copy(), fasta_seq, strand_specific=ss)
            tchecker.check_strand()
            self.assertEqual(tchecker.strand, "+" if ss else "-")
            if ss:
                self.assertTrue(tchecker.suspicious_splicing)
def test_positive_strand(self):
    """Check the CDS boundaries of a plus-strand GTF model with codon rows.

    In the fixture the last CDS row stops three bases before the transcript
    end and a separate ``stop_codon`` row covers those bases; the assertions
    expect the combined CDS to span the whole transcript.
    """
    # NOTE(review): columns are joined with tabs and records with newlines,
    # as the GTF format prescribes; this span was whitespace-collapsed in
    # the reviewed copy, so confirm the separators against the repository.
    tid = "TraesCS1A01G152900.1"
    base_attributes = 'gene_id "{0}"; transcript_id "{0}";'.format(tid)
    # (feature, start, end, phase, exon number or None for the transcript)
    features = (
        ("transcript", 265021906, 265026255, ".", None),
        ("exon", 265021906, 265021971, ".", 1),
        ("CDS", 265021906, 265021971, "0", 1),
        ("exon", 265022056, 265026255, ".", 2),
        ("CDS", 265022056, 265026252, "0", 2),
        ("start_codon", 265021906, 265021908, "0", 1),
        ("stop_codon", 265026253, 265026255, "0", 2),
    )
    records = []
    for feature, start, end, phase, exon_number in features:
        attributes = base_attributes
        if exon_number is not None:
            attributes += ' exon_number "{0}"; exon_id "{1}.{0}";'.format(
                exon_number, tid)
        records.append("\t".join((
            "chr1A", "Self_CESAR/windows_chr1A.gp", feature,
            str(start), str(end), ".", "+", phase, attributes)))

    gtf_lines = [GtfLine(record) for record in "\n".join(records).split("\n")]
    model = Transcript(gtf_lines[0])
    model.add_exons(gtf_lines[1:])
    model.finalize()
    self.assertEqual(model.start, model.combined_cds_start)
    self.assertEqual(model.end, model.combined_cds_end)
def main():
    """Convert spliced BAM alignments into GTF transcript models.

    Command line: ``bam [out]``.

    For every record the exons are rebuilt from the CIGAR operations:
    matches and deletions extend the current exon, skipped regions (N) close
    it. ``identity`` and ``coverage`` are percentages of the inferred read
    length. Records for which the rebuilt end differs from the end of the
    last aligned block are reported on stderr and skipped. The strand comes
    from the ``XS`` tag when present, otherwise from the orientation of the
    alignment.

    Changes with respect to the previous implementation:

    * the MD tag is split on ``[0-9]+`` instead of ``[0-9]*``; the latter
      also splits on empty matches (Python 3.7+), which breaks ``^ACG``
      deletion tokens apart and counts the deleted bases as SNPs;
    * the unused NM lookup, which crashed on records without that tag, and
      the handler that referenced the undefined name ``new_exons`` are gone;
    * a trailing ``None`` exon is no longer appended;
    * the description is passed as ``description`` rather than as ``prog``.
    """
    parser = argparse.ArgumentParser(
        description="Script to convert from BAM to GTF, for PB alignments")
    parser.add_argument("bam", type=to_bam, help="Input BAM file")
    parser.add_argument("out", nargs="?", default=sys.stdout,
                        type=argparse.FileType("wt"),
                        help="Optional output file")
    args = parser.parse_args()

    # CIGAR operation codes, for reference:
    # M 0 alignment match (can be a sequence match or mismatch)
    # I 1 insertion to the reference
    # D 2 deletion from the reference
    # N 3 skipped region from the reference
    # S 4 soft clipping (clipped sequences present in SEQ)
    # H 5 hard clipping (clipped sequences NOT present in SEQ)
    # P 6 padding (silent deletion from padded reference)
    # = 7 sequence match
    # X 8 sequence mismatch

    # Runs of digits separate the mismatch/deletion tokens of an MD tag.
    md_separator = re.compile("[0-9]+")

    for record in args.bam:
        try:
            start, end = record.reference_start, record.get_blocks()[-1][1]
        except IndexError:
            # No aligned block at all: nothing to convert.
            continue

        current = start
        exons = []
        current_exon = None
        r_length = record.inferred_length  # Read length
        matches = 0
        alen = 0  # Aligned read bases, excluding clipping

        for cigar, length in record.cigartuples:
            if cigar in (4, 5):
                # Clipping (soft or hard)
                continue
            elif cigar == 1:
                alen += length
            elif cigar == 3:
                # Intron
                if current_exon is None:
                    # Read positioned at the end/beginning of scaffold
                    continue
                exons.append(current_exon)
                current_exon = None
                current += length
            elif cigar in (0, 2):
                # Match or deletion
                if current_exon is None:
                    current_exon = (current + 1, current + length)
                else:
                    current_exon = (current_exon[0], current_exon[1] + length)
                current += length
                if cigar == 0:
                    matches += length
                    # NOTE(review): assumed to be counted for matches only
                    # (the indentation was lost in the reviewed copy).
                    alen += length

        md_tag = [value for tag, value in record.tags if tag == "MD"][0]
        snps = sum(len(token) for token in md_separator.split(md_tag)
                   if not token.startswith("^"))
        identity = round(100 * (matches - snps) / r_length, 2)
        coverage = round(100 * alen / r_length, 2)
        if current_exon is not None:
            exons.append(current_exon)

        # This is clearly a mistake
        if current != end:
            msg = ("{0}, {1}, {2} Cigar tuples: {3} Cigar string: {4} "
                   "Blocks: {5} Exons: {6} ").format(
                       current, end, end - current, record.cigartuples,
                       record.cigarstring, record.get_blocks(), exons)
            print(AssertionError(msg), file=sys.stderr)
            continue

        transcript = Transcript()
        transcript.id = record.query_name
        transcript.exons = exons
        if len(transcript.exons) == 0:
            continue
        transcript.parent = transcript.attributes["gene_id"] = "{0}.gene".format(
            record.query_name)
        transcript.attributes["identity"] = identity
        transcript.attributes["coverage"] = coverage
        transcript.attributes["cigar"] = record.cigarstring
        transcript.chrom = args.bam.getrname(record.rname)
        transcript.start = min(x[0] for x in transcript.exons)
        transcript.end = max(x[1] for x in transcript.exons)
        transcript.score = record.mapq
        transcript.source = "bam2gtf"
        transcript.attributes.update(
            {tag: str(value) for tag, value in record.get_tags()})

        if "XS" in transcript.attributes:
            transcript.strand = transcript.attributes["XS"]
        elif record.is_reverse:
            transcript.strand = "-"
        else:
            transcript.strand = "+"

        print(transcript.__str__(to_gtf=True), file=args.out)
def main():
    """Convert spliced BAM alignments into GTF transcript models.

    Command line: ``bam [out]``.

    For every record the exons are rebuilt from the CIGAR operations:
    matches and deletions extend the current exon, skipped regions (N) close
    it. ``identity`` and ``coverage`` are percentages of the inferred read
    length. Records for which the rebuilt end differs from the end of the
    last aligned block are reported on stderr and skipped. The strand comes
    from the ``XS`` tag when present, otherwise from the orientation of the
    alignment.

    Changes with respect to the previous implementation:

    * the MD tag is split on ``[0-9]+`` instead of ``[0-9]*``; the latter
      also splits on empty matches (Python 3.7+), which breaks ``^ACG``
      deletion tokens apart and counts the deleted bases as SNPs;
    * the unused NM lookup, which crashed on records without that tag, and
      the handler that referenced the undefined name ``new_exons`` are gone;
    * a trailing ``None`` exon is no longer appended;
    * the description is passed as ``description`` rather than as ``prog``.
    """
    parser = argparse.ArgumentParser(
        description="Script to convert from BAM to GTF, for PB alignments")
    parser.add_argument("bam", type=to_bam, help="Input BAM file")
    parser.add_argument("out", nargs="?", default=sys.stdout,
                        type=argparse.FileType("wt"),
                        help="Optional output file")
    args = parser.parse_args()

    # CIGAR operation codes, for reference:
    # M 0 alignment match (can be a sequence match or mismatch)
    # I 1 insertion to the reference
    # D 2 deletion from the reference
    # N 3 skipped region from the reference
    # S 4 soft clipping (clipped sequences present in SEQ)
    # H 5 hard clipping (clipped sequences NOT present in SEQ)
    # P 6 padding (silent deletion from padded reference)
    # = 7 sequence match
    # X 8 sequence mismatch

    # Runs of digits separate the mismatch/deletion tokens of an MD tag.
    md_separator = re.compile("[0-9]+")

    for record in args.bam:
        try:
            start, end = record.reference_start, record.get_blocks()[-1][1]
        except IndexError:
            # No aligned block at all: nothing to convert.
            continue

        current = start
        exons = []
        current_exon = None
        r_length = record.inferred_length  # Read length
        matches = 0
        alen = 0  # Aligned read bases, excluding clipping

        for cigar, length in record.cigartuples:
            if cigar in (4, 5):
                # Clipping (soft or hard)
                continue
            elif cigar == 1:
                alen += length
            elif cigar == 3:
                # Intron
                if current_exon is None:
                    # Read positioned at the end/beginning of scaffold
                    continue
                exons.append(current_exon)
                current_exon = None
                current += length
            elif cigar in (0, 2):
                # Match or deletion
                if current_exon is None:
                    current_exon = (current + 1, current + length)
                else:
                    current_exon = (current_exon[0], current_exon[1] + length)
                current += length
                if cigar == 0:
                    matches += length
                    # NOTE(review): assumed to be counted for matches only
                    # (the indentation was lost in the reviewed copy).
                    alen += length

        md_tag = [value for tag, value in record.tags if tag == "MD"][0]
        snps = sum(len(token) for token in md_separator.split(md_tag)
                   if not token.startswith("^"))
        identity = round(100 * (matches - snps) / r_length, 2)
        coverage = round(100 * alen / r_length, 2)
        if current_exon is not None:
            exons.append(current_exon)

        # This is clearly a mistake
        if current != end:
            msg = ("{0}, {1}, {2} Cigar tuples: {3} Cigar string: {4} "
                   "Blocks: {5} Exons: {6} ").format(
                       current, end, end - current, record.cigartuples,
                       record.cigarstring, record.get_blocks(), exons)
            print(AssertionError(msg), file=sys.stderr)
            continue

        transcript = Transcript()
        transcript.id = record.query_name
        transcript.exons = exons
        if len(transcript.exons) == 0:
            continue
        transcript.parent = transcript.attributes["gene_id"] = "{0}.gene".format(
            record.query_name)
        transcript.attributes["identity"] = identity
        transcript.attributes["coverage"] = coverage
        transcript.attributes["cigar"] = record.cigarstring
        transcript.chrom = args.bam.getrname(record.rname)
        transcript.start = min(x[0] for x in transcript.exons)
        transcript.end = max(x[1] for x in transcript.exons)
        transcript.score = record.mapq
        transcript.source = "bam2gtf"
        transcript.attributes.update(
            {tag: str(value) for tag, value in record.get_tags()})

        if "XS" in transcript.attributes:
            transcript.strand = transcript.attributes["XS"]
        elif record.is_reverse:
            transcript.strand = "-"
        else:
            transcript.strand = "+"

        print(transcript.__str__(to_gtf=True), file=args.out)
def parse_prediction(args, genes, positions, queue_logger):
    """
    This function performs the real comparison between the reference and
    the prediction. It streams the rows of ``args.prediction``, rebuilds one
    transcript at a time and hands every completed transcript to an
    ``Assigner`` through ``get_best``; ``finish`` is called on the assigner
    at the end and the prediction handle is closed.

    It needs the following inputs:

    :param args: the Namespace with the necessary parameters. When
        ``args.self`` is True, ``args.prediction`` is replaced by a parser
        opened on ``args.reference.name``.
    :param genes: Dictionary with the reference genes, of the form
        dict[chrom][(start,end)] = [gene object]
    :param positions: Dictionary with the positions of the reference genes,
        of the form dict[chrom][IntervalTree]
    :param queue_logger: Logger
    :raises TypeError: for a GFF exon row found before any transcript row,
        or for an exon row that matches none of the supported layouts.
    :return: None
    """
    # start the class which will manage the statistics
    accountant_instance = Accountant(genes, args)
    assigner_instance = Assigner(genes, positions, args, accountant_instance)

    # Transcript currently being assembled; None until the first one starts.
    transcript = None
    if hasattr(args, "self") and args.self is True:
        args.prediction = to_gff(args.reference.name)
    # The layout of the exon rows depends on the input format.
    ref_gff = isinstance(args.prediction, GFF3)
    # Base names (".orfN" suffix removed) that have already been assessed.
    __found_with_orf = set()
    for row in args.prediction:
        if row.header is True:
            continue
        # queue_logger.debug("Row:\n{0:>20}".format(str(row)))
        if row.is_transcript is True or row.feature == "match":
            queue_logger.debug("Transcript row:\n%s", str(row))
            # A new transcript begins: assess the one assembled so far.
            if transcript is not None:
                if re.search(r"\.orf[0-9]+$", transcript.id):
                    # Several ".orfN" entries share a structure: assess only
                    # the first one seen for each base name.
                    __name = re.sub(r"\.orf[0-9]+$", "", transcript.id)
                    if __name not in __found_with_orf:
                        __found_with_orf.add(__name)
                        assigner_instance.get_best(transcript)
                    else:
                        pass
                else:
                    assigner_instance.get_best(transcript)
            transcript = Transcript(row, logger=queue_logger)
        elif row.is_exon is True:
            # Case 1: we are talking about cDNA_match and GFF
            if ref_gff is True and "match" not in row.feature:
                if transcript is None:
                    raise TypeError(
                        "Transcript not defined inside the GFF; line:\n{}".
                        format(row))
                else:
                    queue_logger.debug("Adding exon to transcript %s: %s",
                                       transcript.id, row)
                    transcript.add_exon(row)
            elif ref_gff is True and "match" in row.feature:
                # Match-like rows: rows sharing an ID (or naming the current
                # transcript as parent) belong to the same transcript.
                if transcript is not None and row.id == transcript.id:
                    transcript.add_exon(row)
                elif transcript is not None and transcript.id in row.parent:
                    transcript.add_exon(row)
                elif transcript is None or (transcript is not None and row.id != transcript.id):
                    if transcript is not None:
                        # NOTE(review): here, unlike for transcript rows,
                        # ".orfN" entries are assessed only when the ID ends
                        # with "orf1" - confirm the difference is intended.
                        if re.search(r"\.orf[0-9]+$", transcript.id) and \
                                (not transcript.id.endswith("orf1")):
                            pass
                        else:
                            assigner_instance.get_best(transcript)
                    queue_logger.debug("New transcript: %s", row.transcript)
                    transcript = Transcript(row, logger=queue_logger)
            elif ref_gff is False:
                # No transcript rows are required here: a change of
                # transcript ID on an exon row starts a new transcript.
                if transcript is None or (transcript is not None and transcript.id != row.transcript):
                    if transcript is not None:
                        if re.search(r"\.orf[0-9]+$", transcript.id) and \
                                (not transcript.id.endswith("orf1")):
                            pass
                        else:
                            assigner_instance.get_best(transcript)
                    queue_logger.debug("New transcript: %s", row.transcript)
                    transcript = Transcript(row, logger=queue_logger)
                transcript.add_exon(row)
            else:
                raise TypeError("Unmatched exon: {}".format(row))
        elif row.header:
            continue
        else:
            queue_logger.debug("Skipped row: {}".format(row))

    # The last transcript has no following row to trigger its assessment.
    if transcript is not None:
        if re.search(r"\.orf[0-9]+$", transcript.id) and not transcript.id.endswith("orf1"):
            pass
        else:
            assigner_instance.get_best(transcript)

    # Finish everything, including printing refmap and stats
    assigner_instance.finish()
    args.prediction.close()
def prepare_reference(args, queue_logger, ref_gff=False) -> (dict, collections.defaultdict(dict)):
    """
    Method to prepare the data structures that hold the reference
    information for the parsing.

    The rows of ``args.reference`` are grouped into ``Gene`` objects holding
    ``Transcript`` objects; the result is then passed through
    ``finalize_reference``.

    :param args: the Namespace; ``args.reference`` is iterated for the rows.
    :param queue_logger: logger handed to every gene and transcript.
    :param ref_gff: True when the reference is in GFF format. In that case
        "match" rows are accepted as transcripts, "cDNA_match" rows define
        their own gene/transcript, and an exon row may list several parent
        transcripts.
    :raises KeyError: if no gene is left after finalization.
    :return: genes, positions
    """
    genes = dict()
    positions = collections.defaultdict(dict)
    # Transcript ID -> ID of the gene that contains it.
    transcript2gene = dict()

    for row in args.reference:
        # Assume we are going to use GTF for the moment
        if row.is_transcript is True or (ref_gff is True and row.feature == "match"):
            queue_logger.debug("Transcript\n%s", str(row))
            transcript = Transcript(row, logger=queue_logger)
            if row.feature == "match":
                # Match rows have no gene: they act as their own gene.
                gid = row.id
            else:
                gid = row.gene
            transcript2gene[row.id] = gid
            if gid not in genes:
                genes[gid] = Gene(transcript, gid=gid, logger=queue_logger)
            genes[gid].add(transcript)
            assert transcript.id in genes[gid].transcripts
        elif row.is_exon is True:
            if ref_gff is True:
                if "cDNA_match" in row.feature:
                    row.parent = row.id
                    # row.gene = row.id
                    # NOTE(review): the transcript is assumed to be created
                    # only the first time an ID is seen (the indentation was
                    # lost in the reviewed copy).
                    if row.id not in transcript2gene:
                        genes[row.id] = Gene(None, gid=row.id, logger=queue_logger)
                        transcript2gene[row.id] = row.id
                        transcript = Transcript(row, logger=queue_logger)
                        genes[row.id].add(transcript)
                found = False
                for transcript in row.transcript:
                    if transcript in transcript2gene:
                        # We have to perform the check because there are some GFFs
                        # e.g. TAIR
                        # where CDSs are defined within a spurious "Protein" feature
                        found = True
                        gid = transcript2gene[transcript]
                        genes[gid][transcript].add_exon(row)
                if found is False:
                    # Logger.warn is a deprecated alias; use Logger.warning.
                    queue_logger.warning(
                        "This feature has no corresponding transcript! %s",
                        str(row))
            else:
                if row.gene in genes and row.transcript in genes[row.gene].transcripts:
                    genes[row.gene][row.transcript].add_exon(row)
                else:
                    # Exon row seen before its transcript (or without one):
                    # create the missing gene/transcript on the fly.
                    if row.gene not in genes:
                        genes[row.gene] = Gene(None, gid=row.gene, logger=queue_logger)
                    if row.transcript not in genes[row.gene]:
                        transcript = Transcript(row, logger=queue_logger)
                        transcript2gene[row.id] = row.gene
                        genes[row.gene].add(transcript)
                    genes[row.gene][row.transcript].add_exon(row)

    genes, positions = finalize_reference(genes, positions, queue_logger, args)

    if len(genes) == 0:
        raise KeyError("No genes remained for the reference!")
    return genes, positions
class TChekerTester(unittest.TestCase):
    """Tests for ``TranscriptChecker``, run against the packaged
    ``chr5.fas.gz`` genome, which is unpacked once per class."""

    temp_genome = None

    @classmethod
    def setUpClass(cls):
        """Decompress the packaged genome into a temporary FASTA and index it."""
        # Prepare the genome
        genome = tempfile.NamedTemporaryFile(mode="wb", suffix=".fa")
        cls.temp_genome = genome
        with pkg_resources.resource_stream("Mikado.tests", "chr5.fas.gz") as comp:
            genome.write(gzip.decompress(comp.read()))
        genome.flush()
        cls.fasta = pyfaidx.Fasta(genome.name)

    def setUp(self):
        """Build the multi-exon model and the sequences every test starts from.

        Populates ``self.model_lines``, ``self.gff_lines``, ``self.model``,
        ``self.exons`` and ``self.model_fasta``, then sanity-checks the
        fixture itself.
        """
        # Prepare the model
        # NOTE(review): columns are joined with tabs and records with
        # newlines, as the GFF3 format prescribes; this span was
        # whitespace-collapsed in the reviewed copy, so confirm the
        # separators against the repository.
        transcript_id = "c58_g1_i3.mrna1.19"
        exon_spans = (
            (26584797, 26584879), (26585220, 26585273), (26585345, 26585889),
            (26585982, 26586294), (26586420, 26586524), (26586638, 26586850),
            (26586934, 26586996), (26587084, 26587202), (26587287, 26587345),
            (26587427, 26587472), (26595411, 26595528),
        )
        records = ["\t".join((
            "Chr5", "tair10", "transcript", "26584797", "26595528", "100", "+", ".",
            "ID={0};Parent=c58_g1_i3.path1.19;Name={0};gene_name=c58_g1_i3".format(
                transcript_id)))]
        for number, (start, end) in enumerate(exon_spans, 1):
            records.append("\t".join((
                "Chr5", "tair10", "exon", str(start), str(end), ".", "+", ".",
                "ID={0}.exon{1};Parent={0}".format(transcript_id, number))))
        self.model_lines = "\n".join(records)

        self.gff_lines = [GffLine(record.strip())
                          for record in self.model_lines.split("\n")]

        self.model = Transcript(self.gff_lines[0])
        self.model.add_exons(self.gff_lines[1:])
        self.model.finalize()

        # GFF coordinates are used as 1-based inclusive: shift the start by
        # one to slice the sequence.
        exon_lines = self.gff_lines[1:]
        self.exons = [self.fasta[line.chrom][line.start - 1:line.end]
                      for line in exon_lines]
        self.assertEqual(sum(len(exon) for exon in self.exons), 1718, self.exons)

        # We need the whole genomic fragment
        self.model_fasta = self.fasta["Chr5"][self.model.start - 1:self.model.end]

        first_exon = self.gff_lines[1]
        self.assertEqual(first_exon.start, 26584797)
        self.assertEqual(first_exon.end, 26584879)
        self.assertEqual(self.model.exons[0][0], first_exon.start)
        self.assertEqual(self.model.exons[0][1], first_exon.end)

    @classmethod
    def tearDownClass(cls):
        """Close the temporary genome and delete the index written next to it."""
        # Remove the genome
        # NOTE(review): all three clean-up steps are assumed to be guarded by
        # the hasattr check (the indentation was lost in the reviewed copy).
        if not hasattr(cls.temp_genome, "close"):
            return
        cls.temp_genome.close()
        cls.fasta.close()
        os.remove("{}.fai".format(cls.temp_genome.name))

    def test_translation_table(self):
        """The translation table maps each base to its complement."""
        # {65: 84, 67: 71, 71: 67, 84: 65}, i.e. A<->T and C<->G by code point.
        expected = str.maketrans("ACGT", "TGCA")
        self.assertEqual(TranscriptChecker.get_translation_table(), expected)

    def test_rev_complement(self):
        """Reverse-complementing leaves the ambiguous base N untouched."""
        forward = "AGTCGTGCAGNGTCGAAGTGCAACAGTGC"
        reverse = "GCACTGTTGCACTTCGACNCTGCACGACT"
        self.assertEqual(TranscriptChecker.rev_complement(forward), reverse)

    def test_init(self):
        """A fresh checker exposes the data of the model and the sequence."""
        tcheck = TranscriptChecker(self.model, self.model_fasta)
        expected_exons = sorted((exon.start, exon.end) for exon in self.exons)
        self.assertEqual(tcheck.cdna_length, 1718)
        self.assertEqual(sorted(tcheck.exons), expected_exons)
        self.assertEqual(tcheck.fasta_seq, self.model_fasta)

    def test_check_reverse_strand(self):
        """A wrongly stranded model is flipped when not strand-specific."""
        self.model.strand = "-"
        tcheck = TranscriptChecker(self.model, self.model_fasta,
                                   strand_specific=False)
        tcheck.check_strand()
        self.assertEqual(tcheck.strand, "+")

    def test_check_strand_not_reversed(self):
        """A strand-specific checker keeps the strand but flags the model."""
        self.model.strand = "-"
        tcheck = TranscriptChecker(self.model, self.model_fasta,
                                   strand_specific=True)
        tcheck.check_strand()
        self.assertEqual(tcheck.strand, "-")
        self.assertTrue(tcheck.attributes["canonical_on_reverse_strand"])
        self.assertTrue(tcheck.suspicious_splicing)

    def test_monoexonic(self):
        """Check how ``check_strand`` treats a single-exon transcript.

        The strand is expected to be reset to ``None`` when the checker is
        not strand-specific and to be kept when it is.
        """
        first_exon = self.gff_lines[1]
        header = self.gff_lines[0]
        # Shrink the transcript so that it ends where its first exon ends.
        header.end = first_exon.end
        model = Transcript(header)
        model.add_exon(first_exon)
        model.finalize()
        fasta = self.fasta[model.chrom][model.start - 1:model.end]

        def strand_after_check(template, strand_specific):
            # Always work on a copy, so the template is reusable.
            checker = TranscriptChecker(template.copy(), fasta,
                                        strand_specific=strand_specific)
            checker.check_strand()
            return checker.strand

        self.assertIsNone(strand_after_check(model, False))
        self.assertEqual(strand_after_check(model, True), "+")

        neg = model.copy()
        neg.strand = "-"
        self.assertIsNone(strand_after_check(neg, False))
        self.assertEqual(strand_after_check(neg, True), "-")

    def test_negative(self):
        """Check strand verification for a multi-exon minus-strand model.

        With the annotated "-" strand nothing must change. When relabelled
        "+", a non-strand-specific checker is expected to flip the model
        back to "-", while a strand-specific one keeps "+" and flags the
        splicing as suspicious.
        """
        # NOTE(review): columns are joined with tabs and records with
        # newlines, as the GTF format prescribes; this span was
        # whitespace-collapsed in the reviewed copy, so confirm the
        # separators against the repository.
        ids = 'gene_id "{0}";transcript_id "{0}.1";'.format("cufflinks_star_at.23553")
        transcript_attributes = ids + (
            'exon_number "1";FPKM "2.9700103727";conf_hi "3.260618";'
            'frac "0.732092";cov "81.895309";conf_lo "2.679403";')
        exon_spans = (
            (26575364, 26575410), (26575495, 26575620), (26575711, 26575797),
            (26575885, 26575944), (26576035, 26576134), (26576261, 26577069),
            (26577163, 26577288), (26577378, 26577449), (26577856, 26578163),
        )
        records = ["\t".join((
            "Chr5", "Cufflinks", "transcript", "26575364", "26578163", "1000",
            "-", ".", transcript_attributes))]
        records.extend(
            "\t".join(("Chr5", "Cufflinks", "exon", str(start), str(end),
                       ".", "-", ".", ids))
            for start, end in exon_spans)

        gtf_lines = [GtfLine(record) for record in "\n".join(records).split("\n")]
        self.assertEqual(len([line for line in gtf_lines if line.header]), 0)

        transcript = Transcript(gtf_lines[0])
        transcript.add_exons(gtf_lines[1:])
        transcript.finalize()
        fasta_seq = self.fasta[transcript.chrom][transcript.start - 1:transcript.end]

        # Correctly annotated model: nothing should change.
        tchecker = TranscriptChecker(transcript.copy(), fasta_seq,
                                     strand_specific=False)
        self.assertEqual(tchecker.strand, "-")
        self.assertEqual(tchecker.fasta_seq, fasta_seq)
        tchecker.check_strand()
        self.assertEqual(tchecker.strand, "-")

        # Same structure, wrongly annotated on the plus strand.
        flipped = transcript.copy()
        flipped.strand = "+"
        for ss in (False, True):
            with self.subTest(ss=ss):
                tchecker = TranscriptChecker(flipped.copy(), fasta_seq,
                                             strand_specific=ss)
                tchecker.check_strand()
                self.assertEqual(tchecker.strand, "+" if ss else "-")
                if ss:
                    self.assertTrue(tchecker.suspicious_splicing)

    def test_suspicious(self):
        """Walk the multi-exon model through the splicing-related attributes.

        ``mixed_splices`` or a true ``canonical_on_reverse_strand`` must mark
        the model as suspicious; ``canonical_number`` equal to 0 must only
        set ``only_non_canonical_splicing``.
        """
        attrs = self.model.attributes
        mixed = "6positive,1negative"

        attrs["mixed_splices"] = mixed
        self.assertTrue(self.model.suspicious_splicing)
        del attrs["mixed_splices"]
        self.assertFalse(self.model.suspicious_splicing)

        attrs["canonical_number"] = 0
        self.assertFalse(self.model.suspicious_splicing)
        del attrs["canonical_number"]

        attrs["canonical_on_reverse_strand"] = True
        self.assertTrue(self.model.suspicious_splicing)
        attrs["canonical_on_reverse_strand"] = False
        self.assertFalse(self.model.suspicious_splicing)
        attrs["mixed_splices"] = mixed
        self.assertTrue(self.model.suspicious_splicing)

        # From here on the model has no canonical junction at all.
        del attrs["mixed_splices"]
        del attrs["canonical_on_reverse_strand"]
        attrs["canonical_number"] = 0
        self.assertFalse(self.model.suspicious_splicing)
        self.assertTrue(self.model.only_non_canonical_splicing)

        attrs["canonical_on_reverse_strand"] = True
        self.assertTrue(self.model.suspicious_splicing)
        self.assertTrue(self.model.only_non_canonical_splicing)

        del attrs["canonical_on_reverse_strand"]
        attrs["mixed_splices"] = mixed
        self.assertTrue(self.model.suspicious_splicing)
        self.assertTrue(self.model.only_non_canonical_splicing)

    def test_monoexonic_suspicious(self):
        """A monoexonic transcript should never appear as a suspicious
        transcript, in terms of splicing."""
        first_exon = self.gff_lines[1]
        header = self.gff_lines[0]
        # Shrink the transcript so that it ends where its first exon ends.
        header.end = first_exon.end
        model = Transcript(header)
        model.add_exon(first_exon)
        model.finalize()

        attrs = model.attributes
        mixed = "6positive,1negative"

        def expect_clean(non_canonical_too=False):
            self.assertFalse(model.suspicious_splicing)
            if non_canonical_too:
                self.assertFalse(model.only_non_canonical_splicing)

        attrs["mixed_splices"] = mixed
        expect_clean()
        del attrs["mixed_splices"]
        expect_clean()

        attrs["canonical_number"] = 0
        expect_clean()
        del attrs["canonical_number"]

        attrs["canonical_on_reverse_strand"] = True
        expect_clean()
        attrs["canonical_on_reverse_strand"] = False
        expect_clean()
        attrs["mixed_splices"] = mixed
        expect_clean()

        del attrs["mixed_splices"]
        del attrs["canonical_on_reverse_strand"]
        attrs["canonical_number"] = 0
        expect_clean(non_canonical_too=True)

        attrs["canonical_on_reverse_strand"] = True
        expect_clean(non_canonical_too=True)

        del attrs["canonical_on_reverse_strand"]
        attrs["mixed_splices"] = mixed
        expect_clean(non_canonical_too=True)
def test_monoexonic_suspicious(self):
    """Check that a single-exon transcript is never reported as suspicious.

    The transcript is rebuilt from the first exon only; every combination
    of splicing-related attributes exercised below must leave both
    ``suspicious_splicing`` and ``only_non_canonical_splicing`` false.
    """

    mixed = "6positive,1negative"

    # Shrink the transcript record so that it spans just its first exon.
    first_exon = self.gff_lines[1]
    parent_line = self.gff_lines[0]
    parent_line.end = first_exon.end
    mono = Transcript(parent_line)
    mono.add_exon(first_exon)
    mono.finalize()

    # Phase 1: each attribute on its own, checking suspicious_splicing only.
    mono.attributes["mixed_splices"] = mixed
    self.assertFalse(mono.suspicious_splicing)
    del mono.attributes["mixed_splices"]
    self.assertFalse(mono.suspicious_splicing)

    mono.attributes["canonical_number"] = 0
    self.assertFalse(mono.suspicious_splicing)
    del mono.attributes["canonical_number"]

    mono.attributes["canonical_on_reverse_strand"] = True
    self.assertFalse(mono.suspicious_splicing)
    mono.attributes["canonical_on_reverse_strand"] = False
    self.assertFalse(mono.suspicious_splicing)

    mono.attributes["mixed_splices"] = mixed
    self.assertFalse(mono.suspicious_splicing)
    del mono.attributes["mixed_splices"]
    del mono.attributes["canonical_on_reverse_strand"]

    # Phase 2: canonical_number stays at 0 while the other flags toggle;
    # both properties are checked after every change.
    mono.attributes["canonical_number"] = 0
    self.assertFalse(mono.suspicious_splicing)
    self.assertFalse(mono.only_non_canonical_splicing)

    mono.attributes["canonical_on_reverse_strand"] = True
    self.assertFalse(mono.suspicious_splicing)
    self.assertFalse(mono.only_non_canonical_splicing)
    del mono.attributes["canonical_on_reverse_strand"]

    mono.attributes["mixed_splices"] = mixed
    self.assertFalse(mono.suspicious_splicing)
    self.assertFalse(mono.only_non_canonical_splicing)
class TChekerTester(unittest.TestCase):
    """Tests for ``TranscriptChecker`` run against the bundled Chr5 sequence.

    ``setUpClass`` unpacks the compressed genome shipped with the test
    package into a temporary FASTA file and opens it with pyfaidx; ``setUp``
    builds an 11-exon, plus-strand transcript model on Chr5 that most tests
    start from.
    """

    temp_genome = None
    fasta = None

    # Exon coordinates (1-based, inclusive) of the plus-strand GFF3 model.
    _MODEL_EXONS = (
        (26584797, 26584879),
        (26585220, 26585273),
        (26585345, 26585889),
        (26585982, 26586294),
        (26586420, 26586524),
        (26586638, 26586850),
        (26586934, 26586996),
        (26587084, 26587202),
        (26587287, 26587345),
        (26587427, 26587472),
        (26595411, 26595528),
    )

    # Exon coordinates (1-based, inclusive) of the minus-strand GTF model.
    _NEGATIVE_EXONS = (
        (26575364, 26575410),
        (26575495, 26575620),
        (26575711, 26575797),
        (26575885, 26575944),
        (26576035, 26576134),
        (26576261, 26577069),
        (26577163, 26577288),
        (26577378, 26577449),
        (26577856, 26578163),
    )

    @staticmethod
    def _record(*columns):
        """Join annotation columns with tabs, the GFF3/GTF field separator.

        Building the records explicitly keeps the fixtures valid even if the
        whitespace of this source file gets normalised.
        """
        return "\t".join(str(column) for column in columns)

    @classmethod
    def setUpClass(cls):
        """Decompress the test genome into a temporary file and index it."""
        # Prepare the genome
        cls.temp_genome = tempfile.NamedTemporaryFile(mode="wb", suffix=".fa")
        with pkg_resources.resource_stream("Mikado.tests", "chr5.fas.gz") as comp:
            cls.temp_genome.write(gzip.decompress(comp.read()))
        cls.temp_genome.flush()
        cls.fasta = pyfaidx.Fasta(cls.temp_genome.name)

    def setUp(self):
        """Build the multi-exonic plus-strand model and sanity-check it."""
        # Prepare the model: one transcript record followed by its exons.
        transcript_id = "c58_g1_i3.mrna1.19"
        rows = [
            self._record(
                "Chr5", "tair10", "transcript", "26584797", "26595528",
                "100", "+", ".",
                "ID=c58_g1_i3.mrna1.19;Parent=c58_g1_i3.path1.19;"
                "Name=c58_g1_i3.mrna1.19;gene_name=c58_g1_i3")
        ]
        for number, (start, end) in enumerate(self._MODEL_EXONS, 1):
            rows.append(
                self._record(
                    "Chr5", "tair10", "exon", start, end, ".", "+", ".",
                    "ID={0}.exon{1};Parent={0}".format(transcript_id, number)))
        self.model_lines = "\n".join(rows)

        self.gff_lines = [
            GffLine(line.strip()) for line in self.model_lines.split("\n")
        ]

        self.model = Transcript(self.gff_lines[0])
        self.model.add_exons(self.gff_lines[1:])
        self.model.finalize()

        # Annotation coordinates are 1-based inclusive, hence the "- 1" on
        # the slice start.
        self.exons = [
            self.fasta[line.chrom][line.start - 1:line.end]
            for line in self.gff_lines[1:]
        ]
        self.assertEqual(sum(len(exon) for exon in self.exons), 1718, self.exons)

        # We need the whole genomic fragment
        self.model_fasta = self.fasta["Chr5"][self.model.start - 1:self.model.end]
        self.assertEqual(self.gff_lines[1].start, 26584797)
        self.assertEqual(self.gff_lines[1].end, 26584879)
        self.assertEqual(self.model.exons[0][0], self.gff_lines[1].start)
        self.assertEqual(self.model.exons[0][1], self.gff_lines[1].end)

    @classmethod
    def tearDownClass(cls):
        """Release the FASTA reader, the temporary genome and its index."""
        # Close the reader first: closing the NamedTemporaryFile deletes the
        # file the reader is still attached to.
        if cls.fasta is not None:
            cls.fasta.close()
        # Remove the genome
        if hasattr(cls.temp_genome, "close"):
            cls.temp_genome.close()
            try:
                os.remove("{}.fai".format(cls.temp_genome.name))
            except FileNotFoundError:
                # The index is already gone: nothing left to clean up.
                pass

    def test_translation_table(self):
        """The translation table maps code points A<->T and C<->G."""
        self.assertEqual(TranscriptChecker.get_translation_table(), {
            65: 84,
            67: 71,
            71: 67,
            84: 65
        })

    def test_rev_complement(self):
        """Reverse-complementing keeps ``N`` as is."""
        string = "AGTCGTGCAGNGTCGAAGTGCAACAGTGC"
        self.assertEqual(TranscriptChecker.rev_complement(string),
                         "GCACTGTTGCACTTCGACNCTGCACGACT")

    def test_init(self):
        """A freshly built checker exposes the model's length, exons and sequence."""
        tcheck = TranscriptChecker(self.model, self.model_fasta)
        self.assertEqual(tcheck.cdna_length, 1718)
        self.assertEqual(
            sorted(tcheck.exons),
            sorted((exon.start, exon.end) for exon in self.exons))
        self.assertEqual(tcheck.fasta_seq, self.model_fasta)

    def test_check_reverse_strand(self):
        """Without strand-specificity a wrongly stranded model is flipped to "+"."""
        self.model.strand = "-"
        tcheck = TranscriptChecker(self.model, self.model_fasta, strand_specific=False)
        tcheck.check_strand()
        self.assertEqual(tcheck.strand, "+")

    def test_check_strand_not_reversed(self):
        """With strand-specificity the strand is kept and the model is flagged."""
        self.model.strand = "-"
        tcheck = TranscriptChecker(self.model, self.model_fasta, strand_specific=True)
        tcheck.check_strand()
        self.assertEqual(tcheck.strand, "-")
        self.assertTrue(tcheck.attributes["canonical_on_reverse_strand"])
        self.assertTrue(tcheck.suspicious_splicing)

    def test_monoexonic(self):
        """A monoexonic model loses its strand unless data are strand-specific."""
        exon = self.gff_lines[1]
        transcript_line = self.gff_lines[0]
        # Shrink the transcript so that it spans only its first exon.
        transcript_line.end = exon.end
        model = Transcript(transcript_line)
        model.add_exon(exon)
        model.finalize()
        fasta = self.fasta[model.chrom][model.start - 1:model.end]

        tcheck = TranscriptChecker(model.copy(), fasta, strand_specific=False)
        tcheck.check_strand()
        self.assertIsNone(tcheck.strand)

        tcheck = TranscriptChecker(model.copy(), fasta, strand_specific=True)
        tcheck.check_strand()
        self.assertEqual(tcheck.strand, "+")

        neg = model.copy()
        neg.strand = "-"

        tcheck = TranscriptChecker(neg.copy(), fasta, strand_specific=False)
        tcheck.check_strand()
        self.assertIsNone(tcheck.strand)

        tcheck = TranscriptChecker(neg.copy(), fasta, strand_specific=True)
        tcheck.check_strand()
        self.assertEqual(tcheck.strand, "-")

    def test_negative(self):
        """A minus-strand GTF model keeps "-" and recovers it when mislabelled."""
        exon_attributes = ('gene_id "cufflinks_star_at.23553";'
                           'transcript_id "cufflinks_star_at.23553.1";')
        rows = [
            self._record(
                "Chr5", "Cufflinks", "transcript", "26575364", "26578163",
                "1000", "-", ".",
                'gene_id "cufflinks_star_at.23553";'
                'transcript_id "cufflinks_star_at.23553.1";'
                'exon_number "1";FPKM "2.9700103727";conf_hi "3.260618";'
                'frac "0.732092";cov "81.895309";conf_lo "2.679403";')
        ]
        for start, end in self._NEGATIVE_EXONS:
            rows.append(
                self._record("Chr5", "Cufflinks", "exon", start, end,
                             ".", "-", ".", exon_attributes))

        gtf_lines = [GtfLine(line) for line in "\n".join(rows).split("\n")]

        # None of the records may be parsed as a header/comment line.
        self.assertEqual(sum(1 for line in gtf_lines if line.header), 0)

        transcript = Transcript(gtf_lines[0])
        transcript.add_exons(gtf_lines[1:])
        transcript.finalize()
        fasta_seq = self.fasta[transcript.chrom][transcript.start - 1:transcript.end]

        tr_neg = transcript.copy()
        tchecker = TranscriptChecker(tr_neg, fasta_seq, strand_specific=False)
        self.assertEqual(tchecker.strand, "-")
        self.assertEqual(tchecker.fasta_seq, fasta_seq)
        tchecker.check_strand()
        self.assertEqual(tchecker.strand, "-")

        # Same model, deliberately given the wrong strand.
        tr_neg = transcript.copy()
        tr_neg.strand = "+"
        for ss in (False, True):
            with self.subTest(ss=ss):
                tchecker = TranscriptChecker(tr_neg.copy(), fasta_seq, strand_specific=ss)
                tchecker.check_strand()
                if ss:
                    self.assertEqual(tchecker.strand, "+")
                    self.assertTrue(tchecker.suspicious_splicing)
                else:
                    self.assertEqual(tchecker.strand, "-")

    def test_suspicious(self):
        """Exercise the splicing flags on the multi-exonic model."""
        self.model.attributes["mixed_splices"] = "6positive,1negative"
        self.assertTrue(self.model.suspicious_splicing)
        del self.model.attributes["mixed_splices"]
        self.assertFalse(self.model.suspicious_splicing)

        self.model.attributes["canonical_number"] = 0
        self.assertFalse(self.model.suspicious_splicing)
        del self.model.attributes["canonical_number"]

        self.model.attributes["canonical_on_reverse_strand"] = True
        self.assertTrue(self.model.suspicious_splicing)
        self.model.attributes["canonical_on_reverse_strand"] = False
        self.assertFalse(self.model.suspicious_splicing)

        self.model.attributes["mixed_splices"] = "6positive,1negative"
        self.assertTrue(self.model.suspicious_splicing)
        del self.model.attributes["mixed_splices"]
        del self.model.attributes["canonical_on_reverse_strand"]

        # From here on canonical_number stays at 0.
        self.model.attributes["canonical_number"] = 0
        self.assertFalse(self.model.suspicious_splicing)
        self.assertTrue(self.model.only_non_canonical_splicing)

        self.model.attributes["canonical_on_reverse_strand"] = True
        self.assertTrue(self.model.suspicious_splicing)
        self.assertTrue(self.model.only_non_canonical_splicing)
        del self.model.attributes["canonical_on_reverse_strand"]

        self.model.attributes["mixed_splices"] = "6positive,1negative"
        self.assertTrue(self.model.suspicious_splicing)
        self.assertTrue(self.model.only_non_canonical_splicing)

    def test_monoexonic_suspicious(self):
        """A monoexonic transcript should never appear as a suspicious transcript,
        in terms of splicing."""
        exon = self.gff_lines[1]
        transcript_line = self.gff_lines[0]
        # Shrink the transcript so that it spans only its first exon.
        transcript_line.end = exon.end
        model = Transcript(transcript_line)
        model.add_exon(exon)
        model.finalize()

        model.attributes["mixed_splices"] = "6positive,1negative"
        self.assertFalse(model.suspicious_splicing)
        del model.attributes["mixed_splices"]
        self.assertFalse(model.suspicious_splicing)

        model.attributes["canonical_number"] = 0
        self.assertFalse(model.suspicious_splicing)
        del model.attributes["canonical_number"]

        model.attributes["canonical_on_reverse_strand"] = True
        self.assertFalse(model.suspicious_splicing)
        model.attributes["canonical_on_reverse_strand"] = False
        self.assertFalse(model.suspicious_splicing)

        model.attributes["mixed_splices"] = "6positive,1negative"
        self.assertFalse(model.suspicious_splicing)
        del model.attributes["mixed_splices"]
        del model.attributes["canonical_on_reverse_strand"]

        # From here on canonical_number stays at 0.
        model.attributes["canonical_number"] = 0
        self.assertFalse(model.suspicious_splicing)
        self.assertFalse(model.only_non_canonical_splicing)

        model.attributes["canonical_on_reverse_strand"] = True
        self.assertFalse(model.suspicious_splicing)
        self.assertFalse(model.only_non_canonical_splicing)
        del model.attributes["canonical_on_reverse_strand"]

        model.attributes["mixed_splices"] = "6positive,1negative"
        self.assertFalse(model.suspicious_splicing)
        self.assertFalse(model.only_non_canonical_splicing)
def _assign_unless_secondary_orf(assigner_instance, transcript):
    """Hand *transcript* to the assigner unless it is a secondary ORF.

    Nothing is done when *transcript* is None, or when its ID ends in
    ``.orf<N>`` without ending in ``orf1``.
    """
    if transcript is None:
        return
    if re.search(r"\.orf[0-9]+$", transcript.id) and \
            not transcript.id.endswith("orf1"):
        return
    assigner_instance.get_best(transcript)


def parse_prediction(args, genes, positions, queue_logger):
    """
    This function performs the real comparison between the reference and the prediction.
    It reads the prediction row by row, rebuilds each transcript from its rows
    and hands every completed transcript to the assigner.

    :param args: the Namespace with the necessary parameters. ``args.prediction``
        is iterated and finally closed; when ``args.self`` is True it is
        replaced by a parser opened on ``args.reference.name``.

    :param genes: Dictionary with the reference genes, of the form
        dict[chrom][(start,end)] = [gene object]

    :param positions: Dictionary with the positions of the reference genes, of the form
        dict[chrom][IntervalTree]

    :param queue_logger: Logger

    :raises TypeError: if an exon row of a GFF3 prediction appears before any
        transcript row, or if an exon row matches none of the handled cases.

    :return: None
    """

    # start the class which will manage the statistics
    accountant_instance = Accountant(genes, args)
    assigner_instance = Assigner(genes, positions, args, accountant_instance)

    transcript = None
    if hasattr(args, "self") and args.self is True:
        args.prediction = to_gff(args.reference.name)
    ref_gff = isinstance(args.prediction, GFF3)
    # Base names (ID minus the ".orf<N>" suffix) already sent to the assigner.
    found_with_orf = set()

    try:
        for row in args.prediction:
            if row.header is True:
                continue
            if row.is_transcript is True or row.feature == "match":
                queue_logger.debug("Transcript row:\n%s", str(row))
                # A new transcript starts: flush the one built so far.
                if transcript is not None:
                    if re.search(r"\.orf[0-9]+$", transcript.id):
                        base_name = re.sub(r"\.orf[0-9]+$", "", transcript.id)
                        # Only the first ORF seen for each base name is assigned.
                        if base_name not in found_with_orf:
                            found_with_orf.add(base_name)
                            assigner_instance.get_best(transcript)
                    else:
                        assigner_instance.get_best(transcript)
                transcript = Transcript(row, logger=queue_logger)
            elif row.is_exon is True:
                # Case 1: GFF3 exon-like row which is not a (cDNA_)match
                if ref_gff is True and "match" not in row.feature:
                    if transcript is None:
                        raise TypeError(
                            "Transcript not defined inside the GFF; line:\n{}".format(row))
                    queue_logger.debug("Adding exon to transcript %s: %s",
                                       transcript.id, row)
                    transcript.add_exon(row)
                # Case 2: GFF3 match row, which may extend the current
                # transcript or open a new one
                elif ref_gff is True and "match" in row.feature:
                    if transcript is not None and row.id == transcript.id:
                        transcript.add_exon(row)
                    elif transcript is not None and transcript.id in row.parent:
                        transcript.add_exon(row)
                    elif transcript is None or row.id != transcript.id:
                        _assign_unless_secondary_orf(assigner_instance, transcript)
                        queue_logger.debug("New transcript: %s", row.transcript)
                        transcript = Transcript(row, logger=queue_logger)
                # Case 3: not a GFF3 prediction; a change in the row's
                # transcript ID marks the start of a new transcript
                elif ref_gff is False:
                    if transcript is None or transcript.id != row.transcript:
                        _assign_unless_secondary_orf(assigner_instance, transcript)
                        queue_logger.debug("New transcript: %s", row.transcript)
                        transcript = Transcript(row, logger=queue_logger)
                    transcript.add_exon(row)
                else:
                    raise TypeError("Unmatched exon: {}".format(row))
            elif row.header:
                continue
            else:
                queue_logger.debug("Skipped row: %s", row)

        # Flush the last transcript of the file.
        _assign_unless_secondary_orf(assigner_instance, transcript)

        # Finish everything, including printing refmap and stats
        assigner_instance.finish()
    finally:
        # Close the prediction even when parsing is aborted by an exception.
        args.prediction.close()