def test_monoexonic(self):
    """
    Check the strand reported by TranscriptChecker for a single-exon model.

    For both the original model and a copy whose strand is set to "-", the
    test asserts that ``check_strand`` leaves ``strand`` as ``None`` when
    ``strand_specific=False`` and keeps the annotated strand when
    ``strand_specific=True``.
    """
    exon_row, parent_row = self.gff_lines[1], self.gff_lines[0]
    # Shrink the transcript row so that it ends where its only exon ends.
    parent_row.end = exon_row.end

    transcript = Transcript(parent_row)
    transcript.add_exon(exon_row)
    transcript.finalize()

    sequence = self.fasta[transcript.chrom][transcript.start - 1:transcript.end]

    def strand_after_check(template, stranded):
        """Run check_strand on a fresh copy of *template* and return its strand."""
        checker = TranscriptChecker(template.copy(), sequence, strand_specific=stranded)
        checker.check_strand()
        return checker.strand

    self.assertIsNone(strand_after_check(transcript, False))
    self.assertEqual(strand_after_check(transcript, True), "+")

    minus = transcript.copy()
    minus.strand = "-"
    self.assertIsNone(strand_after_check(minus, False))
    self.assertEqual(strand_after_check(minus, True), "-")
def test_monoexonic(self):
    """
    Verify strand handling of a monoexonic transcript in TranscriptChecker.

    The model is checked as annotated ("+") and then as a copy switched to
    "-". In each case the checker's strand must be ``None`` after
    ``check_strand`` when ``strand_specific`` is False, and equal to the
    annotated strand when ``strand_specific`` is True.
    """
    only_exon = self.gff_lines[1]
    transcript_row = self.gff_lines[0]
    transcript_row.end = only_exon.end

    base_model = Transcript(transcript_row)
    base_model.add_exon(only_exon)
    base_model.finalize()
    genomic_seq = self.fasta[base_model.chrom][base_model.start - 1: base_model.end]

    template = base_model
    for annotated_strand in ("+", "-"):
        if annotated_strand == "-":
            # The negative-strand variant is derived only once the "+" checks are done.
            template = base_model.copy()
            template.strand = "-"
        for stranded in (False, True):
            checker = TranscriptChecker(template.copy(), genomic_seq,
                                        strand_specific=stranded)
            checker.check_strand()
            if stranded:
                self.assertEqual(checker.strand, annotated_strand)
            else:
                self.assertIsNone(checker.strand)
def test_monoexonic_suspicious(self):
    """
    A single-exon transcript must never be flagged for suspicious splicing,
    whichever combination of splice-related attributes is attached to it.
    """
    mixed_value = "6positive,1negative"

    single_exon, parent_row = self.gff_lines[1], self.gff_lines[0]
    parent_row.end = single_exon.end
    transcript = Transcript(parent_row)
    transcript.add_exon(single_exon)
    transcript.finalize()

    def expect_unsuspicious(check_non_canonical=False):
        """Assert the splicing flags are False for the current attributes."""
        self.assertFalse(transcript.suspicious_splicing)
        if check_non_canonical:
            self.assertFalse(transcript.only_non_canonical_splicing)

    # "mixed_splices" alone, then removed.
    transcript.attributes["mixed_splices"] = mixed_value
    expect_unsuspicious()
    del transcript.attributes["mixed_splices"]
    expect_unsuspicious()

    # "canonical_number" alone.
    transcript.attributes["canonical_number"] = 0
    expect_unsuspicious()
    del transcript.attributes["canonical_number"]

    # "canonical_on_reverse_strand" with both boolean values, then with
    # "mixed_splices" added on top.
    transcript.attributes["canonical_on_reverse_strand"] = True
    expect_unsuspicious()
    transcript.attributes["canonical_on_reverse_strand"] = False
    expect_unsuspicious()
    transcript.attributes["mixed_splices"] = mixed_value
    expect_unsuspicious()
    del transcript.attributes["mixed_splices"]
    del transcript.attributes["canonical_on_reverse_strand"]

    # From here on "canonical_number" stays at 0 and the non-canonical flag
    # is checked as well.
    transcript.attributes["canonical_number"] = 0
    expect_unsuspicious(check_non_canonical=True)
    transcript.attributes["canonical_on_reverse_strand"] = True
    expect_unsuspicious(check_non_canonical=True)
    del transcript.attributes["canonical_on_reverse_strand"]
    transcript.attributes["mixed_splices"] = mixed_value
    expect_unsuspicious(check_non_canonical=True)
def launch(args):
    """
    Print, in GTF format, the transcripts fully contained in a genomic region.

    If ``args.region`` is present and not None it is unpacked into
    ``args.chrom``, ``args.start`` and ``args.end``. Rows of ``args.gtf`` on
    other chromosomes are ignored; each transcript row opens a new transcript
    and the following non-transcript rows are added to it as exons. A
    transcript is written to ``args.out`` only when both its start and its end
    lie within ``[args.start, args.end]``.

    :param args: the argparse Namespace
    :raises ValueError: if ``args.region`` cannot be unpacked into three
        values, or if ``args.start`` is not smaller than ``args.end``.
    """
    region = getattr(args, "region", None)
    if region is not None:
        try:
            args.chrom, args.start, args.end = region
        except ValueError as exc:
            raise ValueError("{0} {1}".format(exc, region))

    if args.start >= args.end:
        message = "Start greater than end: {0}\t{1}".format(args.start, args.end)
        raise ValueError(message)

    def inside_region(model):
        """Tell whether *model* exists and lies entirely within the region."""
        return (model is not None
                and model.start >= args.start
                and model.end <= args.end)

    current = None
    with GTF(args.gtf) as reader:
        for record in reader:
            if record.chrom != args.chrom:
                continue
            if record.is_transcript is not True:
                current.add_exon(record)
                continue
            # A new transcript row closes the one collected so far.
            if inside_region(current):
                print(current.format("gtf"), file=args.out)
            current = None
            # With sorted input, nothing after the region end can match.
            if args.assume_sorted is True and record.start > args.end:
                break
            current = Transcript(record)

    # Flush the last transcript read from the file, if any.
    if inside_region(current):
        print(current.format("gtf"), file=args.out)
def parse_prediction(args, genes, positions, queue_logger):
    """
    This function performs the real comparison between the reference and the prediction.

    Rows are read from ``args.prediction`` and grouped into ``Transcript``
    objects; every completed transcript is passed to ``Assigner.get_best``.
    Once all rows are consumed the assigner is finalised and
    ``args.prediction`` is closed.

    Transcripts whose ID ends in ``.orf<N>`` are evaluated only once per
    group:

    - when a transcript/``match`` row closes the previous transcript, the
      first ORF seen for each base name is evaluated;
    - when an exon row closes the previous transcript, or at the final
      flush, only the ``.orf1`` entry is evaluated.

    :param args: the Namespace with the necessary parameters. If it has a
        ``self`` attribute set to True, ``args.prediction`` is replaced by
        ``to_gff(args.reference.name)``.
    :param genes: Dictionary with the reference genes, of the form
        dict[chrom][(start,end)] = [gene object]
    :param positions: Dictionary with the positions of the reference genes,
        of the form dict[chrom][IntervalTree]
    :param queue_logger: Logger
    :return: None
    :raises TypeError: if the prediction is a GFF3 and a non-match exon row
        is found before any transcript row.
    """

    # start the class which will manage the statistics
    accountant_instance = Accountant(genes, args)
    assigner_instance = Assigner(genes, positions, args, accountant_instance)

    transcript = None
    if hasattr(args, "self") and args.self is True:
        args.prediction = to_gff(args.reference.name)
    ref_gff = isinstance(args.prediction, GFF3)

    # Compiled once: the same suffix test is applied to every transcript.
    orf_suffix = re.compile(r"\.orf[0-9]+$")
    found_with_orf = set()

    def evaluate_unless_secondary_orf(model):
        """Send *model* to the assigner unless it is an ORF other than the first."""
        # The dot is part of the test so that ".orf11", ".orf21", ... are not
        # mistaken for the first ORF.
        if orf_suffix.search(model.id) and not model.id.endswith(".orf1"):
            return
        assigner_instance.get_best(model)

    for row in args.prediction:
        if row.header is True:
            continue
        if row.is_transcript is True or row.feature == "match":
            queue_logger.debug("Transcript row:\n%s", str(row))
            if transcript is not None:
                if orf_suffix.search(transcript.id):
                    # Evaluate only the first ORF met for each base transcript.
                    base_name = orf_suffix.sub("", transcript.id)
                    if base_name not in found_with_orf:
                        found_with_orf.add(base_name)
                        assigner_instance.get_best(transcript)
                else:
                    assigner_instance.get_best(transcript)
            transcript = Transcript(row, logger=queue_logger)
        elif row.is_exon is True:
            # Case 1: GFF3 exon that is not a match feature; it needs a
            # transcript row before it.
            if ref_gff is True and "match" not in row.feature:
                if transcript is None:
                    raise TypeError(
                        "Transcript not defined inside the GFF; line:\n{}".format(row))
                else:
                    queue_logger.debug("Adding exon to transcript %s: %s",
                                       transcript.id, row)
                    transcript.add_exon(row)
            # Case 2: GFF3 match-like exon (e.g. cDNA_match); rows are grouped
            # by shared ID or by parent.
            elif ref_gff is True and "match" in row.feature:
                if transcript is not None and row.id == transcript.id:
                    transcript.add_exon(row)
                elif transcript is not None and transcript.id in row.parent:
                    transcript.add_exon(row)
                elif transcript is None or (transcript is not None and
                                            row.id != transcript.id):
                    if transcript is not None:
                        evaluate_unless_secondary_orf(transcript)
                    queue_logger.debug("New transcript: %s", row.transcript)
                    transcript = Transcript(row, logger=queue_logger)
            # Case 3: not a GFF3; a change in the row's transcript ID opens a
            # new transcript.
            elif ref_gff is False:
                if transcript is None or (transcript is not None and
                                          transcript.id != row.transcript):
                    if transcript is not None:
                        evaluate_unless_secondary_orf(transcript)
                    queue_logger.debug("New transcript: %s", row.transcript)
                    transcript = Transcript(row, logger=queue_logger)
                transcript.add_exon(row)
            else:
                raise TypeError("Unmatched exon: {}".format(row))
        elif row.header:
            continue
        else:
            queue_logger.debug("Skipped row: %s", row)

    # Flush the last transcript collected from the file.
    if transcript is not None:
        evaluate_unless_secondary_orf(transcript)

    # Finish everything, including printing refmap and stats
    assigner_instance.finish()
    args.prediction.close()
def parse_prediction(args, genes, positions, queue_logger):
    """
    This function performs the real comparison between the reference and the prediction.

    Rows are read from ``args.prediction`` and grouped into ``Transcript``
    objects; every completed transcript is passed to ``Assigner.get_best``.
    Once all rows are consumed the assigner is finalised and
    ``args.prediction`` is closed.

    Transcripts whose ID ends in ``.orf<N>`` are evaluated only once per
    group:

    - when a transcript/``match`` row closes the previous transcript, the
      first ORF seen for each base name is evaluated;
    - when an exon row closes the previous transcript, or at the final
      flush, only the ``.orf1`` entry is evaluated.

    :param args: the Namespace with the necessary parameters. If it has a
        ``self`` attribute set to True, ``args.prediction`` is replaced by
        ``to_gff(args.reference.name)``.
    :param genes: Dictionary with the reference genes, of the form
        dict[chrom][(start,end)] = [gene object]
    :param positions: Dictionary with the positions of the reference genes,
        of the form dict[chrom][IntervalTree]
    :param queue_logger: Logger
    :return: None
    :raises TypeError: if the prediction is a GFF3 and a non-match exon row
        is found before any transcript row.
    """

    # start the class which will manage the statistics
    accountant_instance = Accountant(genes, args)
    assigner_instance = Assigner(genes, positions, args, accountant_instance)

    transcript = None
    if hasattr(args, "self") and args.self is True:
        args.prediction = to_gff(args.reference.name)
    ref_gff = isinstance(args.prediction, GFF3)

    # Compiled once: the same suffix test is applied to every transcript.
    orf_suffix = re.compile(r"\.orf[0-9]+$")
    found_with_orf = set()

    def evaluate_unless_secondary_orf(model):
        """Send *model* to the assigner unless it is an ORF other than the first."""
        # The dot is part of the test so that ".orf11", ".orf21", ... are not
        # mistaken for the first ORF.
        if orf_suffix.search(model.id) and not model.id.endswith(".orf1"):
            return
        assigner_instance.get_best(model)

    for row in args.prediction:
        if row.header is True:
            continue
        if row.is_transcript is True or row.feature == "match":
            queue_logger.debug("Transcript row:\n%s", str(row))
            if transcript is not None:
                if orf_suffix.search(transcript.id):
                    # Evaluate only the first ORF met for each base transcript.
                    base_name = orf_suffix.sub("", transcript.id)
                    if base_name not in found_with_orf:
                        found_with_orf.add(base_name)
                        assigner_instance.get_best(transcript)
                else:
                    assigner_instance.get_best(transcript)
            transcript = Transcript(row, logger=queue_logger)
        elif row.is_exon is True:
            # Case 1: GFF3 exon that is not a match feature; it needs a
            # transcript row before it.
            if ref_gff is True and "match" not in row.feature:
                if transcript is None:
                    raise TypeError(
                        "Transcript not defined inside the GFF; line:\n{}".format(row))
                else:
                    queue_logger.debug("Adding exon to transcript %s: %s",
                                       transcript.id, row)
                    transcript.add_exon(row)
            # Case 2: GFF3 match-like exon (e.g. cDNA_match); rows are grouped
            # by shared ID or by parent.
            elif ref_gff is True and "match" in row.feature:
                if transcript is not None and row.id == transcript.id:
                    transcript.add_exon(row)
                elif transcript is not None and transcript.id in row.parent:
                    transcript.add_exon(row)
                elif transcript is None or (transcript is not None and
                                            row.id != transcript.id):
                    if transcript is not None:
                        evaluate_unless_secondary_orf(transcript)
                    queue_logger.debug("New transcript: %s", row.transcript)
                    transcript = Transcript(row, logger=queue_logger)
            # Case 3: not a GFF3; a change in the row's transcript ID opens a
            # new transcript.
            elif ref_gff is False:
                if transcript is None or (transcript is not None and
                                          transcript.id != row.transcript):
                    if transcript is not None:
                        evaluate_unless_secondary_orf(transcript)
                    queue_logger.debug("New transcript: %s", row.transcript)
                    transcript = Transcript(row, logger=queue_logger)
                transcript.add_exon(row)
            else:
                raise TypeError("Unmatched exon: {}".format(row))
        elif row.header:
            continue
        else:
            queue_logger.debug("Skipped row: %s", row)

    # Flush the last transcript collected from the file.
    if transcript is not None:
        evaluate_unless_secondary_orf(transcript)

    # Finish everything, including printing refmap and stats
    assigner_instance.finish()
    args.prediction.close()