Example #1
    def test_bam2gff(self):
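        # Build one Transcript per mapped BAM record, disambiguating repeated
        # read names, and check the number and strands of the resulting models.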
        name_counter = Counter()
        transcripts = []
        bam_file = pkg_resources.resource_filename("Mikado.tests",
                                                   "test.reads.bam")
        for record in AlignmentFile(bam_file, mode="rb"):
            if record.is_unmapped is True:
                continue
            transcript = Transcript(record)
            if name_counter.get(record.query_name):
                name = "{}_{}".format(record.query_name,
                                      name_counter.get(record.query_name))
            else:
                name = record.query_name

            if name != transcript.id:
                transcript.alias = transcript.id
                transcript.id = name

            transcript.parent = transcript.attributes["gene_id"] = "{0}.gene".format(name)
            name_counter.update([record.query_name])
            transcript.source = "bam2gtf"
            transcripts.append(transcript)

        self.assertEqual(len(transcripts), 4)
        self.assertEqual(transcripts[0].strand, '-')
        self.assertEqual(transcripts[1].strand, '+')
        self.assertEqual(transcripts[2].strand, '+')
        self.assertEqual(transcripts[3].strand, '-')
Example #2
    def test_load_orfs(self):
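        # Serialise the ORFs of two transcript copies into an in-memory SQLite
        # database, then check that Superlocus.get_orfs retrieves exactly the
        # ORF of the requested query.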

        transcript_line = 'Chr1\t100\t2000\tID=foo;coding=True;phase=0'\
                          '\t0\t+\t300\t1850\t0\t4\t400,400,400,200\t0,500,1100,1700'
        transcript = Transcript(transcript_line)
        orf = transcript.orfs[0].to_transcriptomic()
        transcript2 = transcript.copy()
        transcript2.unfinalize()
        transcript2.chrom = "Chr2"
        transcript2.id = "foo.2"
        transcript2.finalize()
        other_orf = transcript2.orfs[0].to_transcriptomic()
        engine = create_engine("sqlite:///:memory:")
        db.metadata.create_all(engine)
        SessionMaker = sessionmaker(bind=engine)
        session = SessionMaker()
        query = Query(transcript.id, transcript.cdna_length)
        query2 = Query(transcript2.id, transcript2.cdna_length)
        session.add_all([query, query2])
        session.commit()
        serialized_orf = Orf(orf, query.query_id)
        self.assertEqual(serialized_orf.thick_end, orf.thick_end)
        self.assertEqual(serialized_orf.cds_len, orf.cds_len)
        serialized_other_orf = Orf(other_orf, query2.query_id)
        session.add_all([serialized_orf, serialized_other_orf])
        session.commit()
        sup = Superlocus(transcript)
        sup.session = session
        sup_orfs = asyncio.run(sup.get_orfs([query.query_id]))
        self.assertEqual(len(sup_orfs), 1)
        self.assertIn(transcript.id, sup_orfs)
        self.assertEqual(len(sup_orfs[transcript.id]), 1)
        self.assertIsInstance(sup_orfs[transcript.id][0], BED12,
                              type(sup_orfs[transcript.id][0]))
        self.assertTrue(
            sup_orfs[transcript.id][0] == orf, "\n" +
            "\n".join([str(orf), str(sup_orfs[transcript.id][0])]))
Example #3
def main():
    """
    Main script function.
    """

    parser = argparse.ArgumentParser(
        description="Script to add a transcript feature to e.g. Cufflinks GTFs")
    parser.add_argument("gtf", type=argparse.FileType(), help="Input GTF")
    parser.add_argument("out",
                        default=sys.stdout,
                        nargs="?",
                        type=argparse.FileType("w"),
                        help="Output file. Default: stdout.")
    args = parser.parse_args()

    # argparse has already opened the handle; close it, as GTF() reopens the
    # file by name.
    args.gtf.close()

    transcript_lines = defaultdict(list)

    for row in GTF(args.gtf.name):
        if row.header is False and row.is_exon is True:
            transcript_lines[row.transcript].append(row)
    transcripts = list()

    for tid in transcript_lines:
        transcript = Transcript(transcript_lines[tid][0])
        transcript.add_exons(transcript_lines[tid])
        transcripts.append(transcript)

    for transcript in sorted(transcripts):
        print(transcript.format("gtf"), file=args.out)

    if args.out is not sys.stdout:
        args.out.close()
Example #4
    def test_zero_one_many(self):
        from Mikado.transcripts import Transcript
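        # The introns derived from the BED12 junctions must match the introns
        # of the Transcript objects built from the same lines.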
        junctions = []
        transcripts = []
        with parsers.bed12.Bed12Parser(os.path.join(
                os.path.dirname(__file__), "zom_junctions.bed")) as parser:
            for line in parser:
                serializers.junction.JunctionSerializer.generate_introns(0, junctions, line)
                transcripts.append(Transcript(line))

        bed_introns = []
        for junction in junctions:
            bed_introns.append((junction.junction_start, junction.junction_end))
        transcript_introns = []
        for transcript in transcripts:
            for intron in transcript.introns:
                transcript_introns.append(intron)

        assert set(bed_introns) == set(transcript_introns), (set(bed_introns), set(transcript_introns))
Example #5
def main():
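    # Read a GFF3/GTF, group transcripts by gene, and print each gene with its
    # UTRs stripped.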

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-f",
                        "--format",
                        default=None,
                        choices=["gff3", "gtf"])
    parser.add_argument("gff", type=parser_factory)
    parser.add_argument("out",
                        nargs="?",
                        default=sys.stdout,
                        type=argparse.FileType("wt"))
    args = parser.parse_args()

    is_gff = (args.gff.file_format == "gff3")
    if args.format is None:
        args.format = args.gff.file_format

    tid2gid = dict()
    genes = OrderedDict()

    for row in args.gff:
        if row.header is True:
            continue
        elif row.is_gene is True:
            genes[row.id] = Gene(row)
        elif row.is_transcript is True:
            assert len(row.parent) == 1
            parent = row.parent[0]
            tid2gid[row.id] = parent
            genes[parent].add(Transcript(row))
        elif row.is_exon is True:
            if row.gene is None:
                gene = tid2gid[row.parent[0]]
            else:
                gene = row.gene
            genes[gene].add_exon(row)

    for gene in genes.values():
        print(strip_utr(gene).format(args.format), file=args.out)
Example #6
    def test_get_external(self):
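        # With report_all_external_metrics True, get_external must return every
        # metric in the database; with it False, only the metrics referenced by
        # the scoring, requirements or not_fragmentary sections.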
        checked_conf = load_and_validate_config(None).copy()
        checked_conf.pick.output_format.report_all_external_metrics = True
        transcript = Transcript()
        transcript.chrom = "15"
        transcript.source = "protein_coding"
        transcript.start = 47631264
        transcript.end = 48051999

        exons = [(47631264, 47631416), (47704590, 47704669),
                 (47762671, 47762742), (47893062, 47893093),
                 (47895572, 47895655), (48051942, 48051999)]

        transcript.strand = "+"
        transcript.add_exons(exons)
        transcript.id = "ENST00000560636"
        transcript.parent = "ENSG00000137872"
        transcript2 = transcript.copy()
        transcript2.id = "ENST00000560637"
        checked_conf.scoring.scoring["attributes.tpm"] = MinMaxScore.Schema(
        ).load({
            "rescaling": "max",
            "default": 0,
            "rtype": "float",
            'multiplier': 4,
            'use_raw': True,
            'percentage': True
        })
        transcript.attributes["tpm"] = 10

        int_source = ExternalSource('int', 'int', 0)
        float_source = ExternalSource('float', 'float', 0)
        bool_source = ExternalSource('bool', 'bool', 0)

        raw_int_source = ExternalSource('raw_int', 'int', 1)
        raw_float_source = ExternalSource('raw_float', 'float', 1)
        raw_bool_source = ExternalSource('raw_bool', 'bool', 1)

        int_score = External(1, 1, 10)
        float_score = External(1, 2, 10.0)
        bool_score = External(
            1, 3, int(False)
        )  # We cast as int here following external.py serialize function

        raw_int_score = External(1, 4, 8)
        raw_float_score = External(1, 5, 8.0)
        raw_bool_score = External(
            1, 6, int(True)
        )  # We cast as int here following external.py serialize function

        query = Query(transcript.id, transcript.cdna_length)
        query2 = Query(transcript2.id, transcript2.cdna_length)

        engine = create_engine("sqlite:///:memory:")
        db.metadata.create_all(engine)
        SessionMaker = sessionmaker(bind=engine)
        session = SessionMaker()
        session.add_all([
            int_source, float_source, bool_source, raw_int_source,
            raw_float_source, raw_bool_source
        ])
        session.add_all([query, query2])
        session.add_all([
            int_score, float_score, bool_score, raw_int_score, raw_float_score,
            raw_bool_score
        ])
        session.commit()
        sup = Superlocus(transcript, configuration=checked_conf)
        sup.session = session
        tid = transcript.id
        self.assertIn(tid, sup.transcripts)
        from collections import namedtuple
        # Minimal stand-in for the query ORM object: only query_name is used.
        QueryTuple = namedtuple('t', field_names=('query_name',))
        qobj = {1: QueryTuple(query_name='ENST00000560636')}
        external = asyncio.run(sup.get_external(qobj, [1]))

        self.assertEqual(
            external, {
                'ENST00000560636': {
                    'int': (10, False),
                    'float': (10.0, False),
                    'bool': (False, False),
                    'raw_int': (8, True),
                    'raw_float': (8.0, True),
                    'raw_bool': (True, True)
                }
            })

        sup.configuration.pick.output_format.report_all_external_metrics = False
        external = asyncio.run(sup.get_external(qobj, [1]))
        self.assertEqual(len(external), 0)
        # These settings are meaningless in themselves; they only verify that
        # we load *only* these metrics. 'float' must be absent, as it is not
        # present in any section.
        sup.configuration.scoring.scoring["external.int"] = MinMaxScore(
            rescaling="max", filter=None)
        sup.configuration.scoring.requirements.parameters[
            "external.raw_float"] = SizeFilter(operator="gt", value=100)
        sup.configuration.scoring.cds_requirements.parameters[
            "external.raw_int"] = SizeFilter(operator="lt", value=1)
        sup.configuration.scoring.as_requirements.parameters[
            "external.raw_bool"] = SizeFilter(operator="lt", value=1)
        sup.configuration.scoring.not_fragmentary.parameters[
            "external.bool"] = SizeFilter(operator="ne", value=False)
        external = asyncio.run(sup.get_external(qobj, [1]))
        self.assertEqual(
            external, {
                'ENST00000560636': {
                    'int': (10, False),
                    'raw_float': (8.0, True),
                    'bool': (False, False),
                    'raw_int': (8, True),
                    'raw_bool': (True, True)
                }
            })
Example #7
    def test_retrieval(self):
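        # Store four junctions (wrong chromosome, too far away, wrong strand,
        # correct) and check which ones _load_introns marks as verified for
        # each combination of transcript strand and strandedness.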
        engine = create_engine("sqlite:///:memory:")
        db.metadata.create_all(engine)
        SessionMaker = sessionmaker(bind=engine)
        session = SessionMaker()

        transcript = Transcript(accept_undefined_multi=True)
        transcript.chrom = "15"
        transcript.source = "protein_coding"
        transcript.start = 47631264
        transcript.end = 48051999

        exons = [(47631264, 47631416), (47704590, 47704669),
                 (47762671, 47762742), (47893062, 47893093),
                 (47895572, 47895655), (48051942, 48051999)]

        transcript.strand = "+"
        transcript.add_exons(exons)
        transcript.id = "ENST00000560636"
        transcript.parent = "ENSG00000137872"
        transcript2 = transcript.copy()
        transcript2.id = "ENST00000560637"

        chrom_one = Chrom("1", 10**8)
        chrom_fifteen = Chrom("15", 5 * 10**8)
        session.add_all([chrom_one, chrom_fifteen])
        session.commit()
        # Junction(junction_start, junction_end, name, strand, score, chrom_id)
        # This junction is on a different chrom
        junction_chrom_one = Junction(47704669 + 1, 47762671 - 1, "chrom_one",
                                      "+", 10, chrom_one.chrom_id)
        # This junction is too far away
        outside_chrom_15 = Junction(47704669 - 10**6 + 1, 47762671 - 10**6 - 1,
                                    "chrom_15_outside", "+", 10,
                                    chrom_fifteen.chrom_id)
        # This junction is in the right place but wrong strand
        wrong_strand_chrom_15 = Junction(47704669 + 1, 47762671 - 1,
                                         "chrom_15_wrong_strand", "-", 10,
                                         chrom_fifteen.chrom_id)
        # This one is correct
        chrom_15_junction = Junction(47704669 + 1, 47762671 - 1, "chrom_15",
                                     "+", 10, chrom_fifteen.chrom_id)
        session.add_all([
            junction_chrom_one, outside_chrom_15, wrong_strand_chrom_15,
            chrom_15_junction
        ])
        session.commit()

        self.assertEqual(junction_chrom_one.chrom, "1")
        for junc in [
                outside_chrom_15, wrong_strand_chrom_15, chrom_15_junction
        ]:
            self.assertEqual(junc.chrom, "15")

        for strand, stranded in itertools.product(("+", "-", None),
                                                  (True, False)):
            transcript.unfinalize()
            transcript.strand = strand
            transcript.finalize()
            sup = Superlocus(transcript, stranded=stranded)
            self.assertTrue(
                (chrom_15_junction.junction_start, chrom_15_junction.end)
                in sup.introns, (chrom_15_junction, sup.introns))
            sup.session = session
            asyncio.run(sup._load_introns())
            if stranded is True and strand is not None:
                self.assertEqual(
                    sup.locus_verified_introns,
                    {(chrom_15_junction.junction_start,
                      chrom_15_junction.junction_end, strand)},
                    (stranded, strand))
            elif stranded is False:
                self.assertEqual(
                    sup.locus_verified_introns,
                    {(chrom_15_junction.junction_start,
                      chrom_15_junction.junction_end,
                      chrom_15_junction.strand),
                     (wrong_strand_chrom_15.junction_start,
                      wrong_strand_chrom_15.junction_end,
                      wrong_strand_chrom_15.strand)}, (stranded, strand))
            elif stranded is True and strand is None:
                self.assertEqual(sup.locus_verified_introns, set())
Example #8
    def test_fusion(self):
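        # Two non-overlapping reference transcripts plus a third one spanning
        # both: the Assigner should call it a fusion ("f") for each reference
        # when report_fusions is True, and a plain "j" match otherwise.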

        t = Transcript()
        t.chrom, t.strand, t.start, t.end, t.id, t.parent = "Chr1", "+", 101, 1000, "foo.1", "foo"
        t.add_exons([(101, 500), (601, 800), (901, 1000)])
        t.finalize()
        t2 = Transcript()
        t2.chrom, t2.strand, t2.start, t2.end, t2.id, t2.parent = "Chr1", "+", 2001, 3000, "bar.1", "bar"
        t2.add_exons([(2001, 2500), (2601, 2800), (2901, 3000)])
        t2.finalize()

        t3 = Transcript()
        t3.chrom, t3.strand, t3.start, t3.end, t3.id, t3.parent = "Chr1", "+", 651, 2703, "faz.1", "faz"
        t3.add_exons([(651, 800), (901, 1300), (2230, 2500), (2601, 2703)])
        t3.finalize()

        logger = create_default_logger("test_fusion")
        with tempfile.TemporaryDirectory() as folder:
            with open(os.path.join(folder, "reference.gtf"),
                      "wt") as reference:
                print(t.format("gtf"), file=reference)
                print(t2.format("gtf"), file=reference)
            self.assertTrue(os.path.exists(reference.name))
            # Iterate once to check that the reference file parses cleanly.
            _ = [_ for _ in parser_factory(reference.name)]
            try:
                indexing.create_index(parser_factory(reference.name), logger,
                                      "{}.midx".format(reference.name))
            except InvalidParsingFormat:
                self.fail("\n".join(
                    [line.rstrip() for line in open(reference.name)]))
            namespace = Namespace(default=False)
            namespace.out = os.path.join(folder, "out")
            for report in (False, True):
                with self.subTest(report=report):
                    namespace.report_fusions = report
                    assigner = Assigner("{}.midx".format(reference.name),
                                        args=namespace,
                                        printout_tmap=False)
                    result = assigner.get_best(t3)
                    if report:
                        self.assertEqual(len(result), 2)
                        self.assertEqual(result[0].ccode, ("f", "j"),
                                         str(result[0]))
                        self.assertEqual(result[1].ccode, ("f", "j"),
                                         str(result[1]))
                    else:
                        self.assertEqual(result.ccode, ("j",), str(result))
Example #9
def main():
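    # Transfer CDS annotations from reference transcripts onto their aligned
    # targets, distributing the work across Transferer subprocesses.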

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--bed12",
                        nargs=2,
                        required=True,
                        help="Transcriptomic cDNAs BED12s")
    parser.add_argument("--cdnas", nargs=2, required=True)
    parser.add_argument("-gf",
                        help="GFF3/BED12 of the transferred annotation.",
                        required=True)
    parser.add_argument("--out",
                        default=sys.stdout,
                        type=argparse.FileType("wt"))
    parser.add_argument("-ob",
                        "--out-bed",
                        dest="out_bed",
                        required=False,
                        default=None,
                        type=argparse.FileType("wt"))
    log = parser.add_mutually_exclusive_group()
    log.add_argument("-q", "--quiet", default=False, action="store_true")
    log.add_argument("-v", "--verbose", default=False, action="store_true")
    parser.add_argument("-p", "--processes", type=int, default=mp.cpu_count())
    args = parser.parse_args()

    logger = create_default_logger("master")
    verbosity = "INFO"
    if args.verbose is True:
        verbosity = "DEBUG"
    elif args.quiet is True:
        verbosity = "WARNING"

    listener = logging.handlers.QueueListener(logging_queue, logger)
    listener.propagate = False
    listener.start()
    logger.setLevel(verbosity)

    cdnas = dict()
    beds = dict()
    beds["ref"] = dict()
    beds["target"] = dict()

    gmap_pat = re.compile(r"\.mrna[0-9]*$")

    logger.info("Loading reference cDNAS")
    cdnas["ref"] = pyfaidx.Fasta(args.cdnas[0])
    logger.info("Loading target cDNAS")
    cdnas["target"] = pyfaidx.Fasta(args.cdnas[1])
    logger.info("Loaded cDNAs")
    logger.info("Loading reference BED12")
    for entry in Bed12Parser(args.bed12[0], transcriptomic=True):
        if entry.header:
            continue
        name = entry.chrom
        if name in beds["ref"]:
            raise KeyError("Duplicated ID for the reference: {}".format(name))
        if name not in cdnas["ref"]:
            raise KeyError("Reference {} not found in the cDNAs!".format(name))
        beds["ref"][name] = entry

    logger.info("Loading target BED12")
    beds["target"] = defaultdict(dict)
    for entry in Bed12Parser(args.bed12[1], transcriptomic=True):
        # Now, here we have to account for the fact that there *might* be multiple alignments
        name = re.sub(gmap_pat, "", entry.chrom)
        if entry.chrom not in cdnas["target"]:
            raise KeyError("Target {} not found in the cDNAs!".format(
                entry.chrom))
        beds["target"][name][entry.chrom] = entry
    logger.info("Loaded BED12s")

    # Now let us start parsing the GFF3, which we presume to be a GMAP GFF3
    transcript = None

    logger.info("Launching sub-processes")
    procs = []
    queue = mp.Queue(-1)
    for proc in range(args.processes):
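        # Reserve a unique temporary file name for this worker's SQLite
        # database.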
        sq = tempfile.NamedTemporaryFile(mode="wb")
        sq.close()
        sq = sq.name
        _proc = Transferer(sq, queue, verbosity=verbosity)
        _proc.start()
        procs.append(_proc)
    logger.info("Launched sub-processes, starting parsing annotation")

    # pool = mp.Pool(processes=args.processes)

    tnum = -1
    if args.gf.endswith(("bed12", "bed")):
        parser = Bed12Parser(args.gf, transcriptomic=False)
        for line in parser:
            if line.header:
                continue
            else:
                transcript = Transcript(line)
                tid = re.sub(gmap_pat, "", transcript.id)
                logger.debug("Found %s", tid)
                ref_cdna = str(cdnas["ref"][tid])
                ref_bed = beds["ref"][tid]
                target_cdna = str(cdnas["target"][transcript.id])
                target_bed = beds["target"][tid][transcript.id]
                tnum += 1
                logger.debug("Submitting %s", tid)
                queue.put((tnum, (transcript, ref_cdna, ref_bed, target_cdna,
                                  target_bed)))
            if tnum >= 10**4 and tnum % 10**4 == 0:
                logger.info("Parsed {} transcripts", tnum)
        logger.info("Finished parsing input genomic BED file")
    else:
        parser = to_gff(args.gf)

        for pos, line in enumerate(parser):
            if line.header is True:  # or (not isinstance(line, BED12) and line.is_gene is True):
                if str(line) == "###":
                    continue
                try:
                    print(line, file=args.out)
                except IndexError:
                    raise IndexError(line._line)
                continue
            elif not isinstance(line, BED12) and line.is_gene is True:
                continue
            elif line.is_transcript is True:
                if transcript:
                    if transcript.alias is None:
                        tid = re.sub(gmap_pat, "", transcript.id)
                    else:
                        tid = re.sub(gmap_pat, "", transcript.alias)
                    ref_cdna = str(cdnas["ref"][tid])
                    ref_bed = beds["ref"][tid]
                    target_cdna = str(cdnas["target"][transcript.id])
                    store = beds["target"].get(tid, None)
                    if store is None:
                        raise KeyError((tid, beds["target"].keys()))
                    target_bed = store.get(transcript.id, None)
                    if target_bed is None:
                        raise KeyError((tid, store.keys()))
                    tnum += 1
                    queue.put((tnum, (transcript, ref_cdna, ref_bed,
                                      target_cdna, target_bed)))
                try:
                    transcript = Transcript(line)
                except (ValueError, TypeError):
                    raise ValueError((pos, line))
            elif line.is_exon is True:
                transcript.add_exon(line)
            if tnum >= 10**4 and tnum % 10**4 == 0:
                logger.info("Parsed {} transcripts", tnum)

        if transcript:
            tnum += 1
            tid = re.sub(gmap_pat, "", transcript.id)
            ref_cdna = str(cdnas["ref"][tid])
            ref_bed = beds["ref"][tid]
            target_cdna = str(cdnas["target"][transcript.id])
            target_bed = beds["target"][tid][transcript.id]
            queue.put((tnum, (transcript, ref_cdna, ref_bed, target_cdna,
                              target_bed)))
        logger.info("Finished parsing input genomic GF file")

    queue.put("EXIT")
    logger.info("Waiting for subprocesses to finish")
    for _proc in procs:
        _proc.join()

    # Now the printing ...
    # results = dict()

    logger.info("Subprocesses finished, printing")
    for proc in procs:
        sq = sqlalchemy.create_engine("sqlite:///{}".format(proc.out_sq))
        for res in sq.execute("select * from storer"):
            num, bed12, gff3 = res
            if args.out_bed is not None:
                print(bed12.decode(), file=args.out_bed)
            print(*gff3.decode().split("\n"), file=args.out, sep="\n")
        os.remove(proc.out_sq)

    logger.info("Finished!")
    return
Example #10
def transfer_cds(transcript: Transcript,
                 ref_cdna: str,
                 ref_bed: BED12,
                 target_cdna: str,
                 target_bed: BED12,
                 logger=create_null_logger()):
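    """Transfer the CDS of the reference onto the aligned target transcript,
    recording in its attributes whether the aligner CDS ("aligner_cds") and
    the original CDS ("original_cds") were preserved. Returns the transcript,
    the updated target BED12 and the peptide coordinates."""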

    if transcript is None:
        return transcript, target_bed, (None, None, False)

    transcript.finalize()
    assert target_bed.transcriptomic is True

    logger.debug("Starting with %s, phases: %s (BED %s)", transcript.id,
                 transcript.phases, target_bed.phase)

    if ref_bed.coding is False:
        logger.debug("%s is non coding, returning immediately.", transcript.id)
        transcript.attributes["aligner_cds"] = False
        transcript.attributes["was_coding"] = transcript.is_coding
        target_bed.coding = False
        transcript.strip_cds()
        pep_coords = (None, None, True)
    else:
        original_start, original_end = target_bed.thick_start, target_bed.thick_end
        original_phase, original_phases = target_bed.phase, transcript.phases.copy()
        ref_pep = str(Seq.Seq(
            str(ref_cdna[ref_bed.thick_start - 1:ref_bed.thick_end])
        ).translate(to_stop=False))

        ref_has_multiple_stops = False
        if ref_pep.count("*") == 0:
            pass
        elif abs(ref_pep.index("*") * 3 - ref_bed.cds_len) in (0, 3):
            ref_pep = ref_pep[:ref_pep.index(
                "*")]  # This is the "good" case: the CDS is correct.
        else:
            ref_has_multiple_stops = True
            logger.warning(
                "The sequence of %s has in-frame stop codons. Adjusting the program to take this into account.",
                ref_bed.name)

        logger.debug("%s now has phases: %s (%s)", transcript.id,
                     transcript.phases, target_bed.phase)
        target_bed, pep_coords = transfer_by_alignment(ref_pep,
                                                       target_cdna,
                                                       target_bed,
                                                       logger=logger)
        logger.debug("%s now has phases: %s; target bed: %s", transcript.id,
                     transcript.phases, target_bed.phase)
        pep_coords = (pep_coords[0], pep_coords[1],
                      (pep_coords[0] == 1 and pep_coords[1] == len(ref_pep)))

        if target_bed.thick_start == original_start and target_bed.thick_end == original_end:
            transcript.attributes["aligner_cds"] = True
            logger.debug("%s now has phases: %s", transcript.id,
                         transcript.phases)
        else:
            transcript.attributes["aligner_cds"] = False
            transcript.strip_cds()
            if target_bed.coding is True:
                transcript.load_orfs([target_bed])

        logger.debug("%s now has phases: %s", transcript.id, transcript.phases)
        # Now we have to decide whether the transcript has the "original" CDS or not
        result, cigar = transfer.get_and_prepare_cigar(str(ref_cdna),
                                                       str(target_cdna))
        ref_array, target_array = transfer.create_translation_array(cigar)
        try:
            target_start = target_array[ref_array.index(ref_bed.thick_start)]
        except (IndexError, ValueError):
            # .index() raises ValueError when the position is not found.
            target_start = target_bed.start
        try:
            target_end = target_array[ref_array.index(ref_bed.thick_end)]
        except (IndexError, ValueError):
            target_end = target_bed.end

        transcript.attributes["original_cds"] = (
            target_start == target_bed.thick_start
            and target_end == target_bed.thick_end)

        if ref_cdna == target_cdna:
            logger.debug("%s now has phases: %s", transcript.id,
                         transcript.phases)
            if transcript.is_coding is False:
                raise AssertionError("{} not coding".format(transcript.id))
            elif transcript.attributes["original_cds"] is False:
                raise AssertionError("\n".join([
                    str(_) for _ in [
                        transcript.id,
                        (target_bed.thick_start, target_start,
                         target_bed.thick_start == target_start),
                        (target_bed.thick_end, target_end,
                         target_bed.thick_end == target_end
                         ), target_bed.thick_start == target_start
                        and target_bed.thick_end == target_end
                    ]
                ]))

    return transcript, target_bed, pep_coords
Example #11
def create_transcript(tid: str, parent: str, lines: List[GtfLine],
                      args: argparse.Namespace):
    """"""

    chroms = defaultdict(list)
    for line in lines:
        chroms[line.chrom].append(line)

    if len(chroms) > 1:
        # The lines span several chromosomes: recurse once per chromosome.
        for chrom in chroms:
            newtid = tid + "." + chrom
            newparent = parent + "." + chrom
            for transcript in create_transcript(newtid, newparent,
                                                chroms[chrom], args):
                assert transcript.id == newtid, (newtid, transcript.id)
                assert transcript.parent[0] == newparent
                yield transcript
    else:
        # Now we are sure that we only have one chromosome
        exons = sorted([line for line in lines if line.is_exon],
                       key=operator.attrgetter("chrom", "start", "end"))

        if len(exons) == 1:
            transcript = Transcript(exons[0])
            transcript.id = tid
            transcript.parent = parent
            transcript.finalize()
            yield transcript
        else:
            new_exons = deque()
            identifier = ord("A") - 1
            current = exons[0]

            for exon in exons[1:]:
                if ((overlap((exon.start, exon.end),
                             (current.start, current.end)) > 0)
                        or (exon.start - current.end + 1 <= args.min_intron
                            and args.split is False)):
                    # Merge the two exons
                    current.end = exon.end
                elif ((exon.start - current.end + 1 <= args.min_intron
                       and args.split is True)
                      or exon.start - current.end + 1 > args.max_intron):
                    # TODO: split
                    new_exons.append(current)
                    transcript = Transcript(new_exons.popleft())
                    transcript.add_exons(new_exons)
                    transcript.finalize()
                    identifier += 1
                    transcript.parent = parent + "." + chr(identifier)
                    transcript.id = tid + "." + chr(identifier)
                    yield transcript
                    current = exon
                    new_exons = deque()
                else:
                    new_exons.append(current)
                    current = exon

            new_exons.append(current)
            transcript = Transcript(new_exons.popleft())
            transcript.add_exons(new_exons)

            if identifier == ord("A") - 1:
                transcript.id = tid
                transcript.parent = parent
            else:
                identifier += 1
                transcript.id = tid + "." + chr(identifier)
                transcript.parent = parent + "." + chr(identifier)

            transcript.finalize()
            yield transcript