예제 #1
0
    def test_monoexonic(self):
        """
        Strand checks on a single-exon transcript.

        With ``strand_specific=False`` the checker is expected to report no
        strand (``None``); with ``strand_specific=True`` it is expected to
        report "+" for the model and "-" for a copy whose strand was set
        to "-".
        """

        transcript_line, exon = self.gff_lines[0], self.gff_lines[1]
        # Make the transcript end where the exon used in this test ends.
        transcript_line.end = exon.end
        model = Transcript(transcript_line)
        model.add_exon(exon)
        model.finalize()
        fasta = self.fasta[model.chrom][model.start - 1:model.end]

        def strand_after_check(template, strand_specific):
            """Run check_strand on a fresh copy of ``template``; return the resulting strand."""
            checker = TranscriptChecker(template.copy(), fasta,
                                        strand_specific=strand_specific)
            checker.check_strand()
            return checker.strand

        self.assertIsNone(strand_after_check(model, False))
        self.assertEqual(strand_after_check(model, True), "+")

        neg = model.copy()
        neg.strand = "-"

        self.assertIsNone(strand_after_check(neg, False))
        self.assertEqual(strand_after_check(neg, True), "-")
예제 #2
0
    def test_monoexonic(self):
        """
        Verify TranscriptChecker.check_strand on a monoexonic model.

        For the model and for a copy set to the "-" strand, the checker is
        expected to report ``None`` when not strand-specific, and the
        template's strand ("+" / "-") when strand-specific.
        """

        exon = self.gff_lines[1]
        transcript_line = self.gff_lines[0]
        transcript_line.end = exon.end

        model = Transcript(transcript_line)
        model.add_exon(exon)
        model.finalize()

        sequence = self.fasta[model.chrom][model.start - 1: model.end]

        negative = model.copy()
        negative.strand = "-"

        # Every check runs on its own copy, so the templates are never altered.
        for template, expected in ((model, "+"), (negative, "-")):
            for strand_specific in (False, True):
                tcheck = TranscriptChecker(template.copy(), sequence,
                                           strand_specific=strand_specific)
                tcheck.check_strand()
                if strand_specific:
                    self.assertEqual(tcheck.strand, expected)
                else:
                    self.assertIsNone(tcheck.strand)
예제 #3
0
    def test_monoexonic_suspicious(self):

        """
        A single-exon transcript must never be flagged as suspiciously spliced.

        The splicing-related attributes ("mixed_splices", "canonical_number",
        "canonical_on_reverse_strand") are set and removed in several
        combinations; ``suspicious_splicing`` (and, in the last three
        combinations, ``only_non_canonical_splicing``) must stay false.
        """

        transcript_line, exon = self.gff_lines[0], self.gff_lines[1]
        transcript_line.end = exon.end
        model = Transcript(transcript_line)
        model.add_exon(exon)
        model.finalize()

        def assert_clean(non_canonical_too=False):
            """Assert that the model is not flagged by the splicing properties."""
            self.assertFalse(model.suspicious_splicing)
            if non_canonical_too:
                self.assertFalse(model.only_non_canonical_splicing)

        # Each attribute on its own
        model.attributes["mixed_splices"] = "6positive,1negative"
        assert_clean()
        del model.attributes["mixed_splices"]
        assert_clean()

        model.attributes["canonical_number"] = 0
        assert_clean()
        del model.attributes["canonical_number"]

        model.attributes["canonical_on_reverse_strand"] = True
        assert_clean()
        model.attributes["canonical_on_reverse_strand"] = False
        assert_clean()
        model.attributes["mixed_splices"] = "6positive,1negative"
        assert_clean()

        # Combinations with canonical_number == 0
        del model.attributes["mixed_splices"]
        del model.attributes["canonical_on_reverse_strand"]
        model.attributes["canonical_number"] = 0
        assert_clean(non_canonical_too=True)

        model.attributes["canonical_on_reverse_strand"] = True
        assert_clean(non_canonical_too=True)

        del model.attributes["canonical_on_reverse_strand"]
        model.attributes["mixed_splices"] = "6positive,1negative"
        assert_clean(non_canonical_too=True)
예제 #4
0
    def test_monoexonic_suspicious(self):
        """Check that splicing attributes never mark a monoexonic transcript as suspicious.

        The test toggles "mixed_splices", "canonical_number" and
        "canonical_on_reverse_strand" on the model and asserts after every
        change that ``suspicious_splicing`` is false; in the final part it
        also asserts that ``only_non_canonical_splicing`` is false.
        """

        transcript_line = self.gff_lines[0]
        exon = self.gff_lines[1]
        transcript_line.end = exon.end

        model = Transcript(transcript_line)
        model.add_exon(exon)
        model.finalize()

        attrs = model.attributes
        mixed_value = "6positive,1negative"

        # Mixed splices alone
        attrs["mixed_splices"] = mixed_value
        self.assertFalse(model.suspicious_splicing)
        del attrs["mixed_splices"]
        self.assertFalse(model.suspicious_splicing)

        # No canonical junction alone
        attrs["canonical_number"] = 0
        self.assertFalse(model.suspicious_splicing)
        del attrs["canonical_number"]

        # Reverse-strand flag, then together with mixed splices
        for flag in (True, False):
            attrs["canonical_on_reverse_strand"] = flag
            self.assertFalse(model.suspicious_splicing)
        attrs["mixed_splices"] = mixed_value
        self.assertFalse(model.suspicious_splicing)

        # From here on canonical_number stays at 0 and both properties are checked
        del attrs["mixed_splices"]
        del attrs["canonical_on_reverse_strand"]
        attrs["canonical_number"] = 0
        self.assertFalse(model.suspicious_splicing)
        self.assertFalse(model.only_non_canonical_splicing)

        attrs["canonical_on_reverse_strand"] = True
        self.assertFalse(model.suspicious_splicing)
        self.assertFalse(model.only_non_canonical_splicing)

        del attrs["canonical_on_reverse_strand"]
        attrs["mixed_splices"] = mixed_value
        self.assertFalse(model.suspicious_splicing)
        self.assertFalse(model.only_non_canonical_splicing)
예제 #5
0
파일: awk_gtf.py 프로젝트: shenweima/mikado
def _print_if_contained(transcript, args):
    """Print ``transcript`` as GTF to ``args.out`` if it lies within [args.start, args.end]."""
    if transcript is None:
        return
    if transcript.start >= args.start and transcript.end <= args.end:
        print(transcript.format("gtf"), file=args.out)


def launch(args):
    """
    Print, in GTF format, the transcripts fully contained in a genomic region.

    Rows are read from the GTF file ``args.gtf``; rows whose chromosome is
    not ``args.chrom`` are ignored. A transcript is printed to ``args.out``
    only when ``args.start <= transcript.start`` and
    ``transcript.end <= args.end``.

    :param args: the argparse Namespace. Attributes read here: ``gtf``,
        ``out``, ``assume_sorted`` and either ``region`` (unpacked into
        chrom, start, end) or ``chrom``/``start``/``end`` directly.
    :raises ValueError: if ``args.region`` cannot be unpacked into exactly
        three values, if ``args.start >= args.end``, or if a non-transcript
        row of the requested chromosome precedes any transcript row.
    """

    if getattr(args, "region", None) is not None:
        try:
            args.chrom, args.start, args.end = args.region
        except ValueError as exc:
            # Chain explicitly so that the original unpacking error is kept as the cause.
            raise ValueError("{0} {1}".format(exc, args.region)) from exc

    if args.start >= args.end:
        raise ValueError("Start greater than end: {0}\t{1}".format(
            args.start, args.end))

    transcript = None
    with GTF(args.gtf) as gtf:
        for row in gtf:
            if row.chrom != args.chrom:
                continue
            if row.is_transcript is True:
                # A new transcript row closes the previous transcript.
                _print_if_contained(transcript, args)
                transcript = None
                if args.assume_sorted is True and row.start > args.end:
                    # Sorted input: nothing further can fall inside the region.
                    break
                transcript = Transcript(row)
            elif transcript is None:
                # Previously this dereferenced None and failed with an opaque AttributeError.
                raise ValueError(
                    "Found a non-transcript row before any transcript row: {0}".format(row))
            else:
                transcript.add_exon(row)

    # Last transcript of the file (None if it was already handled before a break).
    _print_if_contained(transcript, args)
예제 #6
0
def parse_prediction(args, genes, positions, queue_logger):
    """
    This function performs the real comparison between the reference and the prediction.

    Rows read from ``args.prediction`` are grouped into Transcript objects and
    every completed transcript is handed to ``Assigner.get_best``. Once all
    rows are consumed the assigner is finalised. The prediction handle is
    closed on exit, also when parsing fails.

    Transcripts whose ID ends in ".orf<N>" are filtered before assignment:
    when closed by a transcript/"match" row, only the first one seen for each
    base name is assigned; when closed by an exon row or at the end of the
    input, only IDs ending in "orf1" are assigned.

    :param args: the Namespace with the necessary parameters
    :param genes: Dictionary with the reference genes, of the form
    dict[chrom][(start,end)] = [gene object]
    :param positions: Dictionary with the positions of the reference genes, of the form
    dict[chrom][IntervalTree]
    :param queue_logger: Logger
    :return: None
    :raises TypeError: if the prediction is a GFF3 and a non-"match" exon row
    is found before any transcript row.
    """

    # start the class which will manage the statistics
    accountant_instance = Accountant(genes, args)
    assigner_instance = Assigner(genes, positions, args, accountant_instance)

    if hasattr(args, "self") and args.self is True:
        args.prediction = to_gff(args.reference.name)
    ref_gff = isinstance(args.prediction, GFF3)

    orf_suffix = re.compile(r"\.orf[0-9]+$")
    found_with_orf = set()

    def assign_first_of_orf_group(model):
        """Assign ``model``; for ".orf<N>" IDs only the first seen per base name."""
        if orf_suffix.search(model.id):
            base_name = orf_suffix.sub("", model.id)
            if base_name in found_with_orf:
                return
            found_with_orf.add(base_name)
        assigner_instance.get_best(model)

    def assign_unless_secondary_orf(model):
        """Assign ``model`` unless its ID has an ".orf<N>" suffix not ending in "orf1"."""
        if orf_suffix.search(model.id) and not model.id.endswith("orf1"):
            return
        assigner_instance.get_best(model)

    transcript = None
    try:
        for row in args.prediction:
            if row.header is True:
                continue
            if row.is_transcript is True or row.feature == "match":
                queue_logger.debug("Transcript row:\n%s", str(row))
                if transcript is not None:
                    assign_first_of_orf_group(transcript)
                transcript = Transcript(row, logger=queue_logger)
            elif row.is_exon is True:
                if ref_gff and "match" not in row.feature:
                    # GFF3 exon: its transcript row must already have been read
                    if transcript is None:
                        raise TypeError(
                            "Transcript not defined inside the GFF; line:\n{}".format(row))
                    queue_logger.debug("Adding exon to transcript %s: %s",
                                       transcript.id, row)
                    transcript.add_exon(row)
                elif ref_gff:
                    # GFF3 "match"-like row: it either extends the current
                    # transcript (same ID, or current ID among its parents)
                    # or it opens a new one.
                    if transcript is not None and (
                            row.id == transcript.id or transcript.id in row.parent):
                        transcript.add_exon(row)
                    else:
                        if transcript is not None:
                            assign_unless_secondary_orf(transcript)
                        queue_logger.debug("New transcript: %s", row.transcript)
                        transcript = Transcript(row, logger=queue_logger)
                else:
                    # Not a GFF3: a change of transcript ID on the exon rows
                    # is what opens a new transcript.
                    if transcript is None or transcript.id != row.transcript:
                        if transcript is not None:
                            assign_unless_secondary_orf(transcript)
                        queue_logger.debug("New transcript: %s", row.transcript)
                        transcript = Transcript(row, logger=queue_logger)
                    transcript.add_exon(row)
            elif row.header:
                # Kept for header flags that are truthy without being exactly True.
                continue
            else:
                queue_logger.debug("Skipped row: %s", row)

        if transcript is not None:
            assign_unless_secondary_orf(transcript)

        # Finish everything, including printing refmap and stats
        assigner_instance.finish()
    finally:
        # Release the prediction handle even if parsing raised.
        args.prediction.close()
예제 #7
0
파일: compare.py 프로젝트: Jamure/Mikado
def parse_prediction(args, genes, positions, queue_logger):
    """
    This function performs the real comparison between the reference and the prediction.

    Rows read from ``args.prediction`` are grouped into Transcript objects and
    every completed transcript is handed to ``Assigner.get_best``. Once all
    rows are consumed the assigner is finalised. The prediction handle is
    closed on exit, also when parsing fails.

    Transcripts whose ID ends in ".orf<N>" are filtered before assignment:
    when closed by a transcript/"match" row, only the first one seen for each
    base name is assigned; when closed by an exon row or at the end of the
    input, only IDs ending in "orf1" are assigned.

    :param args: the Namespace with the necessary parameters
    :param genes: Dictionary with the reference genes, of the form
    dict[chrom][(start,end)] = [gene object]
    :param positions: Dictionary with the positions of the reference genes, of the form
    dict[chrom][IntervalTree]
    :param queue_logger: Logger
    :return: None
    :raises TypeError: if the prediction is a GFF3 and a non-"match" exon row
    is found before any transcript row.
    """

    # start the class which will manage the statistics
    accountant_instance = Accountant(genes, args)
    assigner_instance = Assigner(genes, positions, args, accountant_instance)

    if hasattr(args, "self") and args.self is True:
        args.prediction = to_gff(args.reference.name)
    ref_gff = isinstance(args.prediction, GFF3)

    orf_suffix = re.compile(r"\.orf[0-9]+$")
    found_with_orf = set()

    def assign_first_of_orf_group(model):
        """Assign ``model``; for ".orf<N>" IDs only the first seen per base name."""
        if orf_suffix.search(model.id):
            base_name = orf_suffix.sub("", model.id)
            if base_name in found_with_orf:
                return
            found_with_orf.add(base_name)
        assigner_instance.get_best(model)

    def assign_unless_secondary_orf(model):
        """Assign ``model`` unless its ID has an ".orf<N>" suffix not ending in "orf1"."""
        if orf_suffix.search(model.id) and not model.id.endswith("orf1"):
            return
        assigner_instance.get_best(model)

    transcript = None
    try:
        for row in args.prediction:
            if row.header is True:
                continue
            if row.is_transcript is True or row.feature == "match":
                queue_logger.debug("Transcript row:\n%s", str(row))
                if transcript is not None:
                    assign_first_of_orf_group(transcript)
                transcript = Transcript(row, logger=queue_logger)
            elif row.is_exon is True:
                if ref_gff and "match" not in row.feature:
                    # GFF3 exon: its transcript row must already have been read
                    if transcript is None:
                        raise TypeError(
                            "Transcript not defined inside the GFF; line:\n{}".format(row))
                    queue_logger.debug("Adding exon to transcript %s: %s",
                                       transcript.id, row)
                    transcript.add_exon(row)
                elif ref_gff:
                    # GFF3 "match"-like row: it either extends the current
                    # transcript (same ID, or current ID among its parents)
                    # or it opens a new one.
                    if transcript is not None and (
                            row.id == transcript.id or transcript.id in row.parent):
                        transcript.add_exon(row)
                    else:
                        if transcript is not None:
                            assign_unless_secondary_orf(transcript)
                        queue_logger.debug("New transcript: %s", row.transcript)
                        transcript = Transcript(row, logger=queue_logger)
                else:
                    # Not a GFF3: a change of transcript ID on the exon rows
                    # is what opens a new transcript.
                    if transcript is None or transcript.id != row.transcript:
                        if transcript is not None:
                            assign_unless_secondary_orf(transcript)
                        queue_logger.debug("New transcript: %s", row.transcript)
                        transcript = Transcript(row, logger=queue_logger)
                    transcript.add_exon(row)
            elif row.header:
                # Kept for header flags that are truthy without being exactly True.
                continue
            else:
                queue_logger.debug("Skipped row: %s", row)

        if transcript is not None:
            assign_unless_secondary_orf(transcript)

        # Finish everything, including printing refmap and stats
        assigner_instance.finish()
    finally:
        # Release the prediction handle even if parsing raised.
        args.prediction.close()