def scrub_sample_GFFs(
    sample_dirs: Dict[str, str],
    gff_filename: Union[str, Path],
    count_filename: Union[str, Path],
    group_filename: Union[str, Path],
    fastq_filename: Union[str, Path],
    output_prefix: str,
    tree: IntervalTree,
) -> None:
    for _, d in sample_dirs.items():
        with Path(d, f"{output_prefix}.gff.tmp").open("w") as outf:
            for r in GFF.collapseGFFReader(Path(d, gff_filename)):
                n = len(r.ref_exons)
                if n == 1:
                    # single-exon records have no junctions to scrub; write as-is
                    GFF.write_collapseGFF_format(outf, r)
                    continue
                new_ref_exons = scrub_ref_exons(r, tree)
                if new_ref_exons is None:
                    logger.info(f"No changes made due to error: {r.seqid}")
                else:
                    # print "before:", r.ref_exons
                    # print "after :", new_ref_exons
                    r.ref_exons = new_ref_exons
                GFF.write_collapseGFF_format(outf, r)
        cleanup_scrubbed_files_redundancy(
            outf.name,
            Path(d, group_filename),
            Path(d, count_filename),
            Path(d, fastq_filename) if fastq_filename is not None else None,
            Path(d, output_prefix),
        )
def get_gff_from_list(gff_filename, listfile, partial_ok=False):
    with open(listfile) as f:
        seqs = [line.strip() for line in f]
    for r in GFF.collapseGFFReader(gff_filename):
        if (
            r.seqid in seqs
            or r.seqid.split("|")[0] in seqs
            or (partial_ok and any(r.seqid.startswith(x) for x in seqs))
        ):
            GFF.write_collapseGFF_format(sys.stdout, r)
def regroup_gff(
    pooled_gff, demux_count_file, output_prefix, out_group_dict, in_fafq=None
):
    """
    :param pooled_gff: collapsed GFF of the pooled samples
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group it belongs to (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq matching the pooled GFF
    """
    if in_fafq is not None:
        type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(
        lambda: set()
    )  # pbid --> set of tissues it is in (EM, END, R)
    for r in DictReader(open(demux_count_file), delimiter=","):
        for k, v in r.items():
            if k != "id" and int(v) > 0:
                in_tissue[r["id"]].add(k)
    # in_tissue = dict(in_tissue)

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.values():
        handles[g] = open(f"{output_prefix}_{g}_only.gff", "w")
        if in_fafq is not None:
            handles_fafq[g] = open(f"{output_prefix}_{g}_only.{type_fafq}", "w")

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        fafq_dict_keys = list(fafq_dict.keys())
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None:
                fafq_dict[m.group(1)] = fafq_dict[k]

    reader = GFF.collapseGFFReader(pooled_gff)
    for r in reader:
        groups_to_write_in = set()
        pbid = r.seqid
        if pbid not in in_tissue:
            logger.info(
                f"WARNING: {pbid} does not belong to any group indicated by out_group_dict"
            )
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])

        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)

    # flush and close all per-group output handles
    for h in handles.values():
        h.close()
    for h in handles_fafq.values():
        h.close()
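# Illustrative sketch (not part of the pipeline): how a demux count row is turned into the
# in_tissue mapping and then into output groups via out_group_dict, as regroup_gff() does above.
# The CSV rows and barcode names here are hypothetical; the real file has an "id" column plus
# one column per barcode.
def _example_demux_to_groups() -> None:
    from collections import defaultdict

    out_group_dict = {"EM1": "EM", "EM2": "EM", "END1": "END"}
    rows = [
        {"id": "PB.1.1", "EM1": "3", "EM2": "0", "END1": "1"},
        {"id": "PB.2.1", "EM1": "0", "EM2": "0", "END1": "4"},
    ]
    in_tissue = defaultdict(set)  # pbid --> set of barcodes with a non-zero count
    for r in rows:
        for k, v in r.items():
            if k != "id" and int(v) > 0:
                in_tissue[r["id"]].add(k)
    # PB.1.1 would be written to both the EM and END outputs; PB.2.1 only to END
    for pbid, barcodes in in_tissue.items():
        groups = {out_group_dict[b] for b in barcodes}
        print(pbid, sorted(groups))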
def collapse_fuzzy_junctions(
    gff_filename: Union[str, Path],
    group_filename: Union[str, Path],
    allow_extra_5exon: bool,
    internal_fuzzy_max_dist: int,
    max_5_diff: int,
    max_3_diff: int,
) -> defaultdict:
    def can_merge(m, r1, r2):
        if m == "exact":
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is continued only if (a) m is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == "subset":
            r1, r2 = r2, r1  # rotate so r1 is always the longer one
        if m == "super" or m == "subset":
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on the same 3' exon, i.e. the last acceptor site agrees,
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == "+":
                return (
                    abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start)
                    <= internal_fuzzy_max_dist
                    and r1.ref_exons[-n2].start
                    <= r2.ref_exons[0].start
                    < r1.ref_exons[-n2].end
                )
            else:
                return (
                    abs(r1.ref_exons[0].end - r2.ref_exons[0].end)
                    <= internal_fuzzy_max_dist
                    and r1.ref_exons[n2 - 1].start
                    <= r2.ref_exons[-1].end
                    < r1.ref_exons[n2 - 1].end
                )
        return False

    d = {}  # seqid --> record
    recs = defaultdict(
        lambda: {"+": IntervalTree(), "-": IntervalTree()}
    )  # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions(
                r,
                r2,
                internal_fuzzy_max_dist=internal_fuzzy_max_dist,
                max_5_diff=max_5_diff,
                max_3_diff=max_3_diff,
            )
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split("\t")
            group_info[pbid] = members.split(",")

    # pick for each fuzzy group the one that has the most exons
    keys = list(fuzzy_match.keys())
    keys.sort(key=lambda x: int(x.split(".")[1]))

    with open(f"{gff_filename}.fuzzy", "w") as f_gff, open(
        f"{group_filename}.fuzzy", "w"
    ) as f_group:
        for k in keys:
            all_members = []
            best_pbid, best_size, best_num_exons = (
                fuzzy_match[k][0],
                len(group_info[fuzzy_match[k][0]]),
                len(d[fuzzy_match[k][0]].ref_exons),
            )
            all_members += group_info[fuzzy_match[k][0]]
            for pbid in fuzzy_match[k][1:]:
                _num_exons = len(d[pbid].ref_exons)
                _size = len(group_info[pbid])
                all_members += group_info[pbid]
                if _num_exons > best_num_exons or (
                    _num_exons == best_num_exons and _size > best_size
                ):
                    best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
            GFF.write_collapseGFF_format(f_gff, d[best_pbid])
            f_group.write(f'{best_pbid}\t{",".join(all_members)}\n')

    return fuzzy_match
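# Minimal sketch (toy data, not pipeline code) of the representative-picking rule used above when
# writing the .fuzzy files: prefer the isoform with the most exons, breaking ties by the larger
# membership group. The pbids and counts below are hypothetical.
def _example_pick_fuzzy_representative() -> str:
    # pbid -> (num_exons, group_size)
    candidates = {"PB.1.1": (5, 10), "PB.1.2": (6, 2), "PB.1.3": (6, 7)}
    best_pbid, (best_num_exons, best_size) = next(iter(candidates.items()))
    for pbid, (num_exons, size) in candidates.items():
        if num_exons > best_num_exons or (num_exons == best_num_exons and size > best_size):
            best_pbid, best_num_exons, best_size = pbid, num_exons, size
    return best_pbid  # "PB.1.3": six exons, and more members than PB.1.2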
def main(
    input_prefix: str = typer.Argument(
        ..., help="Input prefix (ex: test.collapsed.min_fl_2)"
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    output_prefix = f"{input_prefix}.nomono"

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(
        input_prefix
    )

    good = []
    with open(f"{output_prefix}.gff", "w") as f:
        reader = GFF.collapseGFFReader(gff_filename)
        for r in reader:
            assert r.seqid.startswith("PB.")
            if len(r.ref_exons) > 1:
                good.append(r.seqid)
                GFF.write_collapseGFF_format(f, r)

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    with open(f"{output_prefix}.rep.fq", "w") as f:
        for r in SeqIO.parse(open(rep_filename, "r"), "fastq"):
            if r.name.split("|")[0] in good:
                SeqIO.write(r, f, "fastq")

    # write output to .abundance.txt
    with open(f"{output_prefix}.abundance.txt", "w") as f:
        f.write(count_header)
        writer = DictWriter(
            f,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for k in good:
            r = d[k]
            writer.writerow(r)

    logger.info(f"Output written to: {output_prefix}.gff")
    logger.info(f"Output written to: {output_prefix}.rep.fq")
    logger.info(f"Output written to: {output_prefix}.abundance.txt")
def filter_by_count(
    input_prefix: str,
    output_prefix: str,
    min_count: int,
    dun_use_group_count: bool = False,
) -> None:
    group_filename = f"{input_prefix}.group.txt"
    count_filename = f"{input_prefix}.abundance.txt"
    gff_filename = f"{input_prefix}.gff"
    rep_filenames = [
        (f"{input_prefix}.rep.fq", "fastq"),
        (f"{input_prefix}.rep.fastq", "fastq"),
        (f"{input_prefix}.rep.fa", "fasta"),
        (f"{input_prefix}.rep.fasta", "fasta"),
    ]

    rep_filename = None
    rep_type = None
    for x, feature in rep_filenames:
        if os.path.exists(x):
            rep_filename = x
            rep_type = feature

    if rep_filename is None:
        logger.error(
            f"Expected to find an input fasta or fastq file such as {input_prefix}.rep.fa or {input_prefix}.rep.fq. Not found. Abort!"
        )
        sys.exit(-1)

    if not dun_use_group_count:
        # read group
        group_max_count_fl = {}
        group_max_count_p = {}
        for line in open(group_filename):
            # ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
            pbid, members = line.strip().split("\t")
            group_max_count_fl[pbid] = 0
            group_max_count_p[pbid] = 0
            members = members.split(",")
            for m in members:
                i = m.find("|")
                if i > 0:
                    tmp = m.split("|")[1].split("/")[1]  # ex: tmp = f30p16
                else:
                    tmp = m.split("/")[1]
                fl_count, p_count = tmp.split("p")
                fl_count = int(fl_count[1:])
                p_count = int(p_count)
                group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
                group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)

    # read abundance first
    with open(count_filename) as f:
        count_header = ""
        while True:
            cur_pos = f.tell()
            line = f.readline()
            if not line.startswith("#"):
                f.seek(cur_pos)
                break
            else:
                count_header += line
        d = {r["pbid"]: r for r in DictReader(f, delimiter="\t")}
        for k, v in d.items():
            logger.debug(f"{k}: {v}")

    # group_max_count_p NOT used for now
    good = [
        x
        for x in d
        if int(d[x]["count_fl"]) >= min_count
        and (dun_use_group_count or group_max_count_fl[x] >= min_count)
    ]

    # write output GFF
    with open(f"{output_prefix}.gff", "w") as f:
        for r in GFF.collapseGFFReader(gff_filename):
            if r.seqid in good:
                GFF.write_collapseGFF_format(f, r)

    # write output rep.fq
    rep_output_filename = f"{output_prefix}.rep.{('fq' if rep_type == 'fastq' else 'fa')}"
    with open(rep_output_filename, "w") as f:
        for r in SeqIO.parse(open(rep_filename), rep_type):
            if r.name.split("|")[0] in good:
                SeqIO.write(r, f, rep_type)

    # write output to .abundance.txt
    with open(f"{output_prefix}.abundance.txt", "w") as f:
        f.write(count_header)
        writer = DictWriter(
            f,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for k in good:
            r = d[k]
            writer.writerow(r)

    logger.info(
        f"Output written to: {output_prefix}.gff\n"
        f"Output written to: {rep_output_filename}\n"
        f"Output written to: {output_prefix}.abundance.txt"
    )
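# Hedged, standalone sketch of the cluster member-name parsing done in filter_by_count() above:
# how the FL and partial read counts are recovered from an ID like "i0HQ_54b0ca|c58773/f30p16/700"
# (the example ID comes from the comment in the function; it is illustrative only).
def _example_parse_member_counts(member: str = "i0HQ_54b0ca|c58773/f30p16/700") -> tuple:
    i = member.find("|")
    if i > 0:
        tmp = member.split("|")[1].split("/")[1]  # "f30p16"
    else:
        tmp = member.split("/")[1]
    fl_part, p_part = tmp.split("p")  # "f30", "16"
    return int(fl_part[1:]), int(p_part)  # (30, 16): 30 FL reads, 16 partial reads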
def cleanup_scrubbed_files_redundancy(
    gff_filename: Union[str, Path],
    group_filename: Union[str, Path],
    count_filename: Union[str, Path],
    fastq_filename: Optional[Union[str, Path]],
    output_prefix: str,
):
    junction_seen = defaultdict(
        lambda: defaultdict(lambda: [])
    )  # key (chr, strand) --> dict of (series of junctions) --> records
    for r in GFF.collapseGFFReader(gff_filename):
        n = len(r.ref_exons)
        if n == 1:
            junc_str = f"{str(r.start)},{str(r.end)}"
            junction_seen[r.chr, r.strand][junc_str] = [r]
        else:
            junc_str = ",".join(
                f"{str(r.ref_exons[i].end)},{str(r.ref_exons[i + 1].start)}"
                for i in range(n - 1)
            )
            junction_seen[r.chr, r.strand][junc_str].append(r)

    # write out cleaned GFF
    with open(f"{output_prefix}.gff", "w") as outf, open(
        f"{output_prefix}.merged_ids.txt", "w"
    ) as outf2:
        merged = {}
        keys = list(junction_seen.keys())
        keys.sort()
        for k in keys:
            for bunch in junction_seen[k].values():
                if len(bunch) == 1:
                    # just one record, write it out
                    r = bunch[0]
                    GFF.write_collapseGFF_format(outf, r)
                    merged[r.seqid] = [r.seqid]
                else:
                    # find the representative (the longest-spanning record)
                    r = bunch[0]
                    for r2 in bunch[1:]:
                        if r2.end - r2.start > r.end - r.start:
                            r = r2
                    GFF.write_collapseGFF_format(outf, r)
                    merged[r.seqid] = [x.seqid for x in bunch]
                outf2.write(f"{r.seqid}\t{','.join(merged[r.seqid])}\n")

    count_d, count_header = read_count_file(count_filename)
    # write out count file
    with open(f"{output_prefix}.abundance.txt", "w") as outf:
        outf.write(count_header)
        writer = DictWriter(
            outf,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for pbid, bunch in merged.items():
            # combine the counts
            r = count_d[bunch[0]]
            r["pbid"] = pbid
            for field in fields_to_add:  # count columns to sum across merged records
                r[field] = float(r[field])
            for _id in bunch[1:]:
                for field in fields_to_add:
                    r[field] += float(count_d[_id][field])
            writer.writerow(r)

    group_info = read_group_file(group_filename)
    # write out group file
    with open(f"{output_prefix}.group.txt", "w") as outf:
        for pbid, bunch in merged.items():
            # combine the groups
            g = [group_info[bunch[0]]]
            for _id in bunch[1:]:
                g.append(group_info[_id])
            outf.write(f"{pbid}\t{','.join(g)}\n")

    # write out fastq file if present
    if fastq_filename is not None:
        with open(f"{output_prefix}.rep.fq", "w") as outf:
            for r in SeqIO.parse(open(fastq_filename), "fastq"):
                if r.id.split("|")[0] in merged or r.id in merged:
                    SeqIO.write(r, outf, "fastq")

    logger.info(
        f"scrubbed files written: {output_prefix}.gff, {output_prefix}.group.txt, "
        f"{output_prefix}.abundance.txt, {output_prefix}.merged_ids.txt"
    )
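# Small sketch (toy coordinates, not pipeline code) of the junction-string keys built above:
# multi-exon records are keyed by their ordered donor/acceptor coordinates, so records with
# identical junction chains collapse together regardless of where their terminal exons end.
def _example_junction_key() -> str:
    # exon intervals as (start, end); each junction is (end of exon i, start of exon i+1)
    exons = [(100, 200), (300, 400), (500, 600)]
    n = len(exons)
    return ",".join(f"{exons[i][1]},{exons[i + 1][0]}" for i in range(n - 1))
    # -> "200,300,400,500"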
def write_reclist_to_gff_n_info(
    rec_list: List[Any],
    final_prefix: str,
    ref_name: str,
    addon_name: str,
    use_fq: bool = False,
) -> Dict[str, str]:
    # go through the rec list and figure out in what order we are outputting the total records
    tree = defaultdict(lambda: {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)})
    tree_keys_numeric = set()
    tree_keys_alpha = set()
    for i, match_rec in enumerate(rec_list):
        tree[match_rec.rec.chr][match_rec.rec.strand].insert(
            match_rec.rec.start, match_rec.rec.end, i
        )

    for chrom in tree:
        try:
            int(chrom)
            tree_keys_numeric.add(chrom)
        except ValueError:
            tree_keys_alpha.add(chrom)
    # numeric chromosome names first (in numeric order), then the rest alphabetically
    tree_keys = sorted(tree_keys_numeric, key=int) + sorted(tree_keys_alpha)

    f_info = Path(f"{final_prefix}.mega_info.txt").open("w")
    writer_info = DictWriter(
        f_info,
        fieldnames=["superPBID", ref_name, addon_name],
        delimiter="\t",
    )
    writer_info.writeheader()

    if use_fq:
        f_fq = Path(f"{final_prefix}.rep.fq").open("w")

    with open(f"{final_prefix}.gff", "w") as f_gff, open(
        f"{final_prefix}.group.txt", "w"
    ) as f_group:
        new_group_info = {}
        pb_i = 0
        for _chr in tree_keys:
            for _strand in ("+", "-"):
                for *_, _indices in tree[_chr][_strand].getregions():
                    # further sort these records by (start, end, num_exons)
                    _indices.sort(
                        key=lambda i: (
                            rec_list[i].rec.start,
                            rec_list[i].rec.end,
                            len(rec_list[i].rec.ref_exons),
                        )
                    )
                    pb_i += 1
                    for pb_j, recs_index in enumerate(_indices):
                        pbid = f"PB.{pb_i}.{pb_j + 1}"
                        match_rec = rec_list[recs_index]
                        new_group_info[pbid] = match_rec.members
                        match_rec.rec.seqid = pbid
                        GFF.write_collapseGFF_format(f_gff, match_rec.rec)
                        writer_info.writerow(
                            {
                                "superPBID": pbid,
                                ref_name: match_rec.ref_id,
                                addon_name: match_rec.addon_id,
                            }
                        )
                        f_group.write(f"{pbid}\t{','.join(match_rec.members)}\n")
                        if use_fq:
                            match_rec.seqrec.id = pbid
                            match_rec.seqrec.description = ""
                            SeqIO.write(match_rec.seqrec, f_fq, "fastq")

    f_info.close()
    if use_fq:
        f_fq.close()
    return new_group_info
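# Illustrative sketch (toy data) of the superPBID numbering used in write_reclist_to_gff_n_info():
# pb_i counts cluster regions in chromosome/strand order, and pb_j numbers the isoforms within a
# region after sorting by (start, end, exon count).
def _example_superpbid_numbering() -> list:
    regions = [["t1", "t2"], ["t3"]]  # two cluster regions with their member isoforms
    pbids = []
    pb_i = 0
    for members in regions:
        pb_i += 1
        for pb_j, _ in enumerate(members):
            pbids.append(f"PB.{pb_i}.{pb_j + 1}")
    return pbids  # ["PB.1.1", "PB.1.2", "PB.2.1"]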
def main(
    count_filename: Path = typer.Argument(
        ..., help="Count file (generally ends with '.abundance.txt')"
    ),
    gff_filename: Path = typer.Argument(..., help="Annotation file"),
    rep_filename: Path = typer.Argument(
        ..., help="Sequence file (ends with '.fq', '.fastq', '.fa', or '.fasta')"
    ),
    fuzzy_junction: int = typer.Option(
        5, help="Fuzzy junction max dist (default: 5bp)"
    ),
    sample_directory: Optional[Path] = typer.Option(
        None,
        help="Directory in which the sample data resides. By default uses the directory from which the script was called",
    ),
    output_prefix: Optional[str] = typer.Option(
        None, help="Prefix to use when naming the filtered files"
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    if not sample_directory:
        sample_directory = Path.cwd()

    if output_prefix is None:
        output_prefix = f"{gff_filename.stem}.filtered"

    rep_type = "fastq" if rep_filename.suffix in (".fq", ".fastq") else "fasta"

    sanity_check_collapse_input(
        count_filename,
        gff_filename,
        rep_filename,
        sample_directory,
    )

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith("PB.")
        recs[int(r.seqid.split(".")[1])].append(r)

    good = []
    with open(f"{output_prefix}.gff", "w") as f:
        keys = list(recs.keys())
        keys.sort()
        for k in keys:
            xxx = recs[k]
            filter_out_subsets(xxx, fuzzy_junction)
            for r in xxx:
                GFF.write_collapseGFF_format(f, r)
                good.append(r.seqid)

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    rep_output_filename = f"{output_prefix}.rep.{('fq' if rep_type == 'fastq' else 'fa')}"
    with open(rep_output_filename, "w") as f:
        for r in SeqIO.parse(open(rep_filename), rep_type):
            if r.name.split("|")[0] in good:
                SeqIO.write(r, f, rep_type)

    # write output to .abundance.txt
    with open(f"{output_prefix}.abundance.txt", "w") as f:
        f.write(count_header)
        writer = DictWriter(
            f,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for k in good:
            r = d[k]
            writer.writerow(r)

    logger.info(
        f"Output written to: {output_prefix}.gff\n"
        f"Output written to: {rep_output_filename}\n"
        f"Output written to: {output_prefix}.abundance.txt"
    )
def chain_split_file(
    ref_gff: Path,
    ref_group: Path,
    ref_name: str,
    addon_gff: Path,
    addon_group: Path,
    addon_name: str,
    fuzzy_junction: int,
    allow_5merge: bool,
    max_3_diff: int,
    n_chunks: int,
) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
    """
    Organize entries in both a gff and transcript group file and split both
    such that the original two files are split into chunks where gff.chunk.n
    covers the same entries as group.chunk.n
    """
    # read in the group_file as a dictionary in the form of
    # {
    #     'PB.1.1': ["transcript/1"],
    #     'PB.1.2': ["transcript/2", "transcript/3"]
    # }
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    # with addon_group.open('r') as ag:
    #     addon_group_info = {_.split('\t')[0]: _.split('\t')[1].split(",") for _ in ag.readlines()}

    recs = []
    tree = OrderedDict()
    i = 0
    # for r in HTSeq.GFF_Reader(addon_gff):
    #     if r.iv.chrom not in tree:
    #         tree[r.iv.chrom] = {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)}
    #     tree[r.iv.chrom][r.iv.strand].insert(r.iv.start, r.iv.end, i)
    #     recs.append(r)
    #     i += 1

    # This should build a structure in the form of:
    # {"chrN":
    #     {
    #         "+": bx.intervals.cluster.ClusterTree,
    #         "-": bx.intervals.cluster.ClusterTree,
    #     },
    #  "chrN+1":
    #     {
    #         "+": bx.intervals.cluster.ClusterTree,
    #         "-": bx.intervals.cluster.ClusterTree,
    #     },
    # }
    # ClusterTree objects have the form
    #     [(x, y, [z]), (a, b, [c]), (m, n, [o])]
    # where each tuple is a range and a list of ids that lie within that range
    # e.g. (from the bx-python docs):
    #     tree = ClusterTree(0, 0)
    #     insert (6, 7, 1), (1, 2, 3), (9, 10, 2), (3, 4, 0), (3, 8, 4)
    #     tree.getregions() returns [(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])]

    # NOTE: GFF.collapseGFFReader is a specialized GFF reader that in the attributes
    # field stores a list of bx.intervals.intersection.Interval objects
    # describing the exons
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)

    split_files = []
    i = 0
    counter = 0
    f_gff = open(f"{addon_gff}.split{str(i)}", "w")
    f_group = open(f"{addon_group}.split{str(i)}", "w")
    # this loop is going to reorder everything
    # so that we have a GFF with a transcript followed by all the exons that
    # made up that transcript and a separate file with the matching
    # transcript_id   transcript/read_group#
    # (see the sp.MegaPBTree above)
    for v1 in tree.values():
        for strand in ("+", "-"):
            v2 = v1[strand]
            for *_, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write(
                        f"{recs[cur].seqid}\t{','.join(addon_group_info[recs[cur].seqid])}\n"
                    )
                    counter += 1
                if counter >= (i + 1) * chunk_size:
                    i += 1
                    n = f_gff.tell()
                    f_gff.close()
                    f_group.close()
                    if n == 0:  # didn't write any records, delete these
                        Path(f_gff.name).unlink()
                        Path(f_group.name).unlink()
                    else:
                        split_files.append((f_gff.name, f_group.name))
                    if i >= n_chunks or counter >= len(recs):
                        break
                    f_gff = open(f"{addon_gff}.split{str(i)}", "w")
                    f_group = open(f"{addon_group}.split{str(i)}", "w")

    if not f_gff.closed:
        n = f_gff.tell()
        f_gff.close()
        f_group.close()
        if n == 0:  # didn't write any records, delete these
            Path(f_gff.name).unlink()
            Path(f_group.name).unlink()
        else:
            split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(
            target=chain_helper,
            args=(
                ref_gff,
                ref_group,
                split_gff,
                split_group,
                ref_name,
                f"{addon_name}.{str(i)}",
                fuzzy_junction,
                allow_5merge,
                max_3_diff,
            ),
        )
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, f"{addon_name}.{str(i)}"))
    for p in pools:
        p.join()
    return result_prefixes, split_files
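# Brief sketch of the chunk sizing used in chain_split_file(): integer ceiling division, so that
# n records split across at most n_chunks files. The default arguments here are arbitrary.
def _example_chunk_size(n: int = 10, n_chunks: int = 3) -> int:
    return (n // n_chunks) + (n % n_chunks > 0)  # 10 records over 3 chunks -> 4 per chunk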