def scrub_sample_GFFs(
    sample_dirs: Dict[str, str],
    gff_filename: Union[str, Path],
    count_filename: Union[str, Path],
    group_filename: Union[str, Path],
    fastq_filename: Union[str, Path],
    output_prefix: str,
    tree: IntervalTree,
) -> None:

    for _, d in sample_dirs.items():
        with Path(d, f"{output_prefix}.gff.tmp").open("w") as outf:
            for r in GFF.collapseGFFReader(Path(d, gff_filename)):
                n = len(r.ref_exons)
                if n == 1:
                    # single-exon transcripts have no junctions to scrub; write once and move on
                    GFF.write_collapseGFF_format(outf, r)
                    continue

                new_ref_exons = scrub_ref_exons(r, tree)
                if new_ref_exons is None:
                    logger.info(f"No changes made due to error: {r.seqid}")
                else:
                    r.ref_exons = new_ref_exons
                GFF.write_collapseGFF_format(outf, r)
        cleanup_scrubbed_files_redundancy(
            outf.name,
            Path(d, group_filename),
            Path(d, count_filename),
            Path(d, fastq_filename) if fastq_filename is not None else None,
            Path(d, output_prefix),
        )
def get_gff_from_list(gff_filename, listfile, partial_ok=False):
    seqs = [line.strip() for line in open(listfile)]
    for r in GFF.collapseGFFReader(gff_filename):
        if (
            r.seqid in seqs
            or r.seqid.split("|")[0] in seqs
            or (partial_ok and any(r.seqid.startswith(x) for x in seqs))
        ):
            GFF.write_collapseGFF_format(sys.stdout, r)
def regroup_gff(
    pooled_gff, demux_count_file, output_prefix, out_group_dict, in_fafq=None
):
    """
    :param pooled_sam: SAM file
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group to be long in  (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq that was input to SAM
    """
    if in_fafq is not None:
        type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(set)  # pbid --> set of tissues it is in (EM, END, R)

    for r in DictReader(open(demux_count_file), delimiter=","):
        for k, v in r.items():
            if k != "id" and int(v) > 0:
                in_tissue[r["id"]].add(k)

    # in_tissue = dict(in_tissue)

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.values():
        handles[g] = open(f"{output_prefix}_{g}_only.gff", "w")
        if in_fafq is not None:
            handles_fafq[g] = open(f"{output_prefix}_{g}_only.{type_fafq}", "w")

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        fafq_dict_keys = list(fafq_dict.keys())
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None:
                fafq_dict[m.group(1)] = fafq_dict[k]
    reader = GFF.collapseGFFReader(pooled_gff)
    for r in reader:
        groups_to_write_in = set()
        pbid = r.seqid
        if pbid not in in_tissue:
            logger.warning(
                f"{pbid} does not belong to any group indicated by out_group_dict"
            )
            continue
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])

        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)

    # close (and flush) all per-group output handles
    for h in handles.values():
        h.close()
    for h in handles_fafq.values():
        h.close()
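# A minimal, self-contained sketch of the demux-count logic used in regroup_gff above, run on
# an in-memory CSV. The barcode names, counts, and the out_group_dict mapping
# ({'EM1': 'EM', ...}) are made-up illustrations of the expected format, not real data.
from collections import defaultdict
from csv import DictReader
from io import StringIO

_demux_csv = StringIO(
    "id,EM1,EM2,END1\n"
    "PB.1.1,3,0,0\n"
    "PB.1.2,0,2,5\n"
)
_out_group_dict = {"EM1": "EM", "EM2": "EM", "END1": "END"}

_in_tissue = defaultdict(set)  # pbid --> set of barcodes with non-zero counts
for _row in DictReader(_demux_csv, delimiter=","):
    for _k, _v in _row.items():
        if _k != "id" and int(_v) > 0:
            _in_tissue[_row["id"]].add(_k)

# PB.1.1 ends up only in group 'EM'; PB.1.2 ends up in both 'EM' and 'END'
for _pbid, _barcodes in _in_tissue.items():
    print(_pbid, sorted({_out_group_dict[b] for b in _barcodes}))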
    def __init__(
        self,
        gff_filename: Union[str, Path],
        group_filename: Union[str, Path],
        internal_fuzzy_max_dist: int = 0,
        self_prefix: Optional[str] = None,
        fastq_filename: Optional[Union[str, Path]] = None,
        fusion_max_dist: int = 10,
    ):
        """
        Differences with non-fusion MegaPBTree:

        1. allow_5merge is always FALSE. Not a parameter.
        2. fusion_max_dist --- maximum allowed distance on internal fusion sites to be called as equivalent fusions
        """
        super().__init__(
            gff_filename,
            group_filename,
            internal_fuzzy_max_dist,
            self_prefix,
            False,
            fastq_filename,
        )

        self.fusion_max_dist = fusion_max_dist

        # ex: PBfusion.1 -> [PBfusion.1.1, PBfusion.1.2]
        self.record_d_fusion = {
            fusion_id: records
            for fusion_id, records in GFF.collapseGFFFusionReader(gff_filename)
        }
    def __init__(
        self,
        gff_filename: str,
        group_filename: str,
        internal_fuzzy_max_dist: int = 0,
        self_prefix: Optional[str] = None,
        allow_5merge: bool = False,
        fastq_filename: Optional[str] = None,
        max_3_diff: Optional[int] = None,
    ):
        self.gff_filename = gff_filename
        self.group_filename = group_filename
        self.self_prefix = self_prefix
        self.internal_fuzzy_max_dist = internal_fuzzy_max_dist
        self.max_3_diff = max_3_diff
        self.allow_5merge = allow_5merge
        self.record_d = {
            r.seqid: r
            for r in GFF.collapseGFFReader(gff_filename)
        }
        # sanity_check_seqids(self.record_d.keys()) # sanity check all IDs look like PB.1.2
        self.tree = defaultdict(lambda: {
            "+": IntervalTree(),
            "-": IntervalTree()
        })  # chr --> strand --> tree
        self.fastq_dict = None
        if fastq_filename is not None:
            self.fastq_dict = MegaPBTree.read_fastq_to_dict(fastq_filename)

        # print >> sys.stderr, "self.internal_fuzzy_max_dist is", internal_fuzzy_max_dist
        # raw_input()
        self.read_gff_as_interval_tree()
        self.group_info = MegaPBTree.read_group(
            self.group_filename,
            self.self_prefix)  # ex: PB.1.1 --> [ RatHeart|i3_c123.... ]
    def add_sample(
        self,
        gff_filename: Union[str, Path],
        group_filename: Union[str, Path],
        sample_prefix: str,
        output_prefix: str,
        fastq_filename: Optional[Union[str, Path]] = None,
    ) -> None:
        combined = []  # list of (<matches to r2 or None>, r2)
        unmatched_recs = set(self.record_d.keys())

        for r in GFF.collapseGFFReader(gff_filename):
            # for each collapsed transcript, find records that overlap
            match_rec_list = list(self.match_record_to_tree(r))
            if len(match_rec_list) > 0:  # found match(es)! put longer of r1/r2 in
                # if len(match_rec_list) > 1: pdb.set_trace()  #DEBUG
                combined.append((match_rec_list, r))
                for match_rec in match_rec_list:
                    try:
                        unmatched_recs.remove(match_rec.seqid)
                    except KeyError:
                        pass  # already deleted, OK, this can happen
            else:  # r is not present in current tree
                combined.append((None, r))
        # put whatever is left from the tree in
        for seqid in unmatched_recs:
            combined.append(([self.record_d[seqid]], None))

        # create a ClusterTree to re-calc the loci/transcripts
        final_tree = defaultdict(lambda: {
            "+": ClusterTree(0, 0),
            "-": ClusterTree(0, 0)
        })
        for i, (r1s, r2) in enumerate(combined):
            if r1s is None:
                final_tree[r2.chr][r2.strand].insert(r2.start, r2.end, i)
            else:
                if r2 is not None:
                    rep = find_representative_in_iso_list(r1s + [r2])
                else:
                    rep = find_representative_in_iso_list(r1s)
                final_tree[rep.chr][rep.strand].insert(rep.start, rep.end, i)

        self.write_cluster_tree_as_gff(
            rec_list=combined,
            group_filename2=group_filename,
            sample_prefix2=sample_prefix,
            output_prefix=output_prefix,
            fastq_filename2=fastq_filename,
        )
    def add_sample(
        self,
        gff_filename: Union[str, Path],
        group_filename: Union[str, Path],
        sample_prefix: str,
        output_prefix: str,
        fastq_filename: Optional[Union[str, Path]] = None,
    ) -> None:
        combined = []  # list of (matching records from self or None, records from new sample or None); at least one is not None
        unmatched_recs = list(self.record_d_fusion.keys())

        for _, records in GFF.collapseGFFFusionReader(gff_filename):
            match_seqid = self.match_fusion_record(records)
            if match_seqid is not None:
                combined.append((self.record_d_fusion[match_seqid], records))
                try:
                    unmatched_recs.remove(match_seqid)
                except ValueError:
                    pass  # already deleted, OK, this happens for single-exon transcripts
            else:  # r is not present in current tree
                combined.append((None, records))
        # put whatever is left from the tree in
        for seqid in unmatched_recs:
            combined.append((self.record_d_fusion[seqid], None))

        # create a ClusterTree to re-calc the loci/transcripts
        final_tree = defaultdict(lambda: {
            "+": ClusterTree(0, 0),
            "-": ClusterTree(0, 0)
        })
        for i, (r1s, r2s) in enumerate(combined):
            if r2s is None or (r1s is not None and r1s[0].end - r1s[0].start >
                               r2s[0].end - r2s[0].start):
                final_tree[r1s[0].chr][r1s[0].strand].insert(
                    r1s[0].start, r1s[0].end, i)
            else:
                final_tree[r2s[0].chr][r2s[0].strand].insert(
                    r2s[0].start, r2s[0].end, i)

        self.write_cluster_tree_as_gff(
            final_tree,
            combined,
            group_filename,
            sample_prefix,
            output_prefix,
            fastq_filename2=fastq_filename,
        )
def sanity_check_collapse_input(input_prefix: str) -> Tuple[Path, Path, Path]:
    """
    Check that
    1. the count, gff, rep files exist
    2. the number of records agree among the three
    """
    # group_filename =  f"{input_prefix}.group.txt"
    count_filename = Path(f"{input_prefix}.abundance.txt")
    gff_filename = Path(f"{input_prefix}.gff")
    rep_filename = Path(f"{input_prefix}.rep.fq")
    if not count_filename.exists():
        logger.error(f"File {count_filename} does not exist. Abort!")
        sys.exit(-1)
    if not gff_filename.exists():
        logger.error(f"File {gff_filename} does not exist. Abort!")
        sys.exit(-1)
    if not rep_filename.exists():
        logger.error(f"File {rep_filename} does not exist. Abort!")
        sys.exit(-1)

    pbids1 = {r.id for r in SeqIO.parse(open(rep_filename, "r"), "fastq")}
    pbids2 = {r.seqid for r in GFF.collapseGFFReader(gff_filename)}
    pbids3 = set(read_count_file(count_filename)[0].keys())

    if (
        len(pbids1) != len(pbids2)
        or len(pbids2) != len(pbids3)
        or len(pbids1) != len(pbids3)
    ):
        logger.error(
            "The number of PBID records in the files disagree! Sanity check failed."
        )
        logger.error(f"# of PBIDs in {rep_filename}: {len(pbids1)}")
        logger.error(f"# of PBIDs in {gff_filename}: {len(pbids2)}")
        logger.error(f"# of PBIDs in {count_filename}: {len(pbids3)}")
        sys.exit(-1)

    return count_filename, gff_filename, rep_filename
def sanity_check_collapse_input(count_filename: Path, gff_filename: Path,
                                rep_filename: Path,
                                sample_directory: Path) -> None:
    """
    Check that
    1. the count, gff, rep files exist
    2. the number of records agree among the three
    """
    # group_filename = f"{input_prefix}.group.txt"

    if not rep_filename.exists():
        raise RuntimeError(
            f"Input sequence file {rep_filename.name} not found. Abort!")
    if not count_filename.exists():
        raise RuntimeError(f"File {count_filename.name} not found. Abort!")
    if not gff_filename.exists():
        raise RuntimeError(f"File {gff_filename.name} not found. Abort!")
    if not sample_directory.exists():
        raise RuntimeError(
            f"The directory {sample_directory.name} not found. Abort!")

    rep_type = "fastq" if rep_filename.suffix in (".fq", ".fastq") else "fasta"

    pbids1 = {r.id for r in SeqIO.parse(open(rep_filename), rep_type)}
    pbids2 = {r.seqid for r in GFF.collapseGFFReader(gff_filename)}
    pbids3 = set(read_count_file(count_filename)[0].keys())

    if (len(pbids1) != len(pbids2) or len(pbids2) != len(pbids3)
            or len(pbids1) != len(pbids3)):
        logger.error(
            "The number of PBID records in the files disagree! Sanity check failed."
        )
        logger.error(f"# of PBIDs in {rep_filename}: {len(pbids1)}")
        logger.error(f"# of PBIDs in {gff_filename}: {len(pbids2)}")
        logger.error(f"# of PBIDs in {count_filename}: {len(pbids3)}")
        sys.exit(-1)

    return None
def sample_sanity_check(
    group_filename: Union[str, Path],
    gff_filename: Union[str, Path],
    count_filename: Union[str, Path],
    fastq_filename: Optional[Union[str, Path]] = None,
) -> None:
    """
    Double check that the formats are expected and all PBIDs are concordant across the files
    :return: raise Exception if sanity check failed
    """

    logger.info(
        f"Sanity checking. Retrieving PBIDs from {group_filename},{gff_filename},{count_filename}..."
    )
    ids1 = [line.strip().split()[0] for line in open(group_filename)]
    ids2 = [
        fusion_id
        for fusion_id, rs in GFF.collapseGFFFusionReader(gff_filename)
    ]
    with open(count_filename) as f:
        for _ in range(14):
            f.readline()  # just skip through the header
        ids3 = [r["pbid"] for r in DictReader(f, delimiter="\t")]
        if len(set(ids2).difference(ids1)) > 0 or len(set(ids2).difference(ids3)) > 0:
            raise Exception(
                f"Sanity check failed! Please make sure the PBIDs listed in {gff_filename} are also in {group_filename} and {count_filename}"
            )

    if fastq_filename is not None:
        ids4 = [
            r.id.split("|")[0] for r in SeqIO.parse(fastq_filename, "fastq")
        ]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception(
                f"Sanity check failed! Please make sure the PBIDs listed in {gff_filename} are also in {fastq_filename}"
            )
def sample_sanity_check(group_filename,
                        gff_filename,
                        count_filename,
                        fastq_filename=None) -> None:
    """
    Double check that the formats are expected and all PBIDs are concordant across the files
    :return: raise Exception if sanity check failed
    """
    logger.info(
        f"Sanity checking. Retrieving PBIDs from {group_filename},{gff_filename},{count_filename}..."
    )
    ids1 = [line.strip().split()[0] for line in open(group_filename)]
    ids2 = [r.seqid for r in GFF.collapseGFFReader(gff_filename)]
    f = open(count_filename)
    while True:
        # advance through the headers which start with #
        cur = f.tell()
        if (not f.readline().startswith("#")
                or f.tell() == cur):  # first non-# seen or EOF
            f.seek(cur)
            break
    ids3 = [r["pbid"] for r in DictReader(f, delimiter="\t")]
    if len(set(ids2).difference(ids1)) > 0 or len(
            set(ids2).difference(ids3)) > 0:
        raise Exception(
            f"Sanity check failed! Please make sure the PBIDs listed in {gff_filename} are also in {group_filename} and {count_filename}"
        )

    if fastq_filename is not None:
        ids4 = [
            r.id.split("|")[0]
            for r in SeqIO.parse(open(fastq_filename), "fastq")
        ]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception(
                f"Sanity check failed! Please make sure the PBIDs listed in {gff_filename} are also in {fastq_filename}"
            )
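# The '#'-header skipping above relies on tell()/seek() to peek at a line and rewind when it
# is not a comment. A small self-contained illustration of the same pattern on an in-memory
# file (the contents are made up):
from csv import DictReader
from io import StringIO

_count_file = StringIO(
    "# abundance file\n"
    "# generated by an upstream step\n"
    "pbid\tcount_fl\n"
    "PB.1.1\t12\n"
)
while True:
    _cur = _count_file.tell()
    if not _count_file.readline().startswith("#") or _count_file.tell() == _cur:
        _count_file.seek(_cur)  # rewind to the first non-'#' line (or EOF)
        break
print([_r["pbid"] for _r in DictReader(_count_file, delimiter="\t")])  # ['PB.1.1']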
def summarize_junctions(
    sample_dirs: Dict[str, Path],
    # sample_names: List[str],
    gff_filename: Union[str, Path],
    output_prefix: Union[str, Path],
    genome_d: Optional[dict] = None,
    junction_known: Optional[dict] = None,
) -> defaultdict:
    """
    1. for each sample, read all the GFF, store the junction information (both 0-based)

    """
    junc_by_chr_strand = defaultdict(
        lambda: defaultdict(list)
    )  # (seqname, strand) --> (donor, acceptor) --> samples it shows up in (may appear more than once)

    for sample_name, d in sample_dirs.items():
        for r in GFF.collapseGFFReader(Path(d, gff_filename)):
            n = len(r.ref_exons)
            if n == 1:
                continue  # ignore single exon transcripts
            for i in range(n - 1):
                donor = r.ref_exons[i].end - 1  # make it 0-based
                accep = r.ref_exons[i + 1].start  # start is already 0-based
                junc_by_chr_strand[r.seqname,
                                   r.strand][donor, accep].append(sample_name)

    # write junction report
    with open(f"{output_prefix}.junction.bed",
              "w") as f1, open(f"{output_prefix}.junction_detail.txt",
                               "w") as f:
        f1.write(
            f'track name=junctions description="{output_prefix}" useScore=1\n')

        JUNC_DETAIL_FIELDS = [
            "seqname",
            "left",
            "right",
            "strand",
            "num_transcript",
            "num_sample",
            "genome",
            "annotation",
            "label",
        ]

        writer = DictWriter(f, JUNC_DETAIL_FIELDS, delimiter="\t")
        writer.writeheader()
        keys = list(junc_by_chr_strand)
        keys.sort()
        for _seqname, _strand in keys:
            v = junc_by_chr_strand[_seqname, _strand]
            v_keys = list(v)
            v_keys.sort()
            labels = cluster_junctions(v_keys)
            for i, (_donor, _accep) in enumerate(v_keys):
                rec = {
                    "seqname": _seqname,
                    "left": _donor,
                    "right": _accep,
                    "strand": _strand,
                    "num_transcript": len(v[_donor, _accep]),
                    "num_sample": len(set(v[_donor, _accep])),
                }
                # f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t".format(_chr, _donor, _accep, _strand, len(v[_donor,_accep]), len(set(v[_donor,_accep]))))
                f1.write(
                    f"{_seqname}\t{_donor}\t{_accep + 1}\t{output_prefix}\t{len(v[_donor, _accep])}\t{_strand}\n"
                )
                # if genome is given, write acceptor-donor site
                if genome_d is None or _seqname not in genome_d:
                    rec["genome"] = "NA"
                    # f.write("NA\t")
                else:
                    up, down = (
                        genome_d[_seqname][(_donor + 1):(_donor + 3)],
                        genome_d[_seqname][(_accep - 2):_accep],
                    )
                    if _strand == "+":
                        rec["genome"] = f"{str(up.seq).upper()}-{str(down.seq).upper()}"
                        # f.write("{0}-{1}\t".format(str(up.seq).upper(), str(down.seq).upper()))
                    else:
                        rec["genome"] = f"{str(down.reverse_complement().seq).upper()}-{str(up.reverse_complement().seq).upper()}"
                        # f.write("{0}-{1}\t".format(str(down.reverse_complement().seq).upper(), str(up.reverse_complement().seq).upper()))
                # if annotation is given, check if matches with annotation
                if junction_known is None:
                    rec["annotation"] = "NA"
                    # f.write("NA\n")
                else:
                    if (_seqname, _strand) in junction_known and (
                            _donor,
                            _accep,
                    ) in junction_known[_seqname, _strand]:
                        rec["annotation"] = "Y"
                        # f.write("Y\t")
                    else:
                        rec["annotation"] = "N"
                        # f.write("N\t")
                rec["label"] = f"{_seqname}_{_strand}_{labels[i]}"
                writer.writerow(rec)
            # f.write("{c}_{s}_{lab}\n".format(c=_seqname, s=_strand, lab=labels[i]))

    return junc_by_chr_strand
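# Junction coordinates above follow the convention donor = exon.end - 1 and
# acceptor = next_exon.start, both 0-based (assuming the half-open exon intervals implied by
# the code). A tiny illustration on made-up exon intervals:
_exons = [(100, 200), (300, 400), (500, 600)]  # hypothetical exons of one transcript
_junctions = [(_exons[i][1] - 1, _exons[i + 1][0]) for i in range(len(_exons) - 1)]
print(_junctions)  # [(199, 300), (399, 500)]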
def main(
    input_prefix: str = typer.Argument(
        ..., help="Input prefix (ex: test.collapsed.min_fl_2)"
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:

    output_prefix = f"{input_prefix}.nomono"

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(
        input_prefix
    )

    good = []
    with open(f"{output_prefix}.gff", "w") as f:
        reader = GFF.collapseGFFReader(gff_filename)
        for r in reader:
            assert r.seqid.startswith("PB.")
            if len(r.ref_exons) > 1:
                good.append(r.seqid)
                GFF.write_collapseGFF_format(f, r)

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    with open(f"{output_prefix}.rep.fq", "w") as f:
        for r in SeqIO.parse(open(rep_filename, "r"), "fastq"):
            if r.name.split("|")[0] in good:
                SeqIO.write(r, f, "fastq")

    # write output to .abundance.txt
    with open(f"{output_prefix}.abundance.txt", "w") as f:
        f.write(count_header)
        writer = DictWriter(
            f,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for k in good:
            r = d[k]
            writer.writerow(r)

    logger.info(f"Output written to:{output_prefix}.gff")
    logger.info(f"Output written to:{output_prefix}.rep.fq")
def filter_by_count(
    input_prefix: str,
    output_prefix: str,
    min_count: int,
    dun_use_group_count: bool = False,
) -> None:

    group_filename = f"{input_prefix}.group.txt"
    count_filename = f"{input_prefix}.abundance.txt"
    gff_filename = f"{input_prefix}.gff"
    rep_filenames = [
        (f"{input_prefix}.rep.fq", "fastq"),
        (f"{input_prefix}.rep.fastq", "fastq"),
        (f"{input_prefix}.rep.fa", "fasta"),
        (f"{input_prefix}.rep.fasta", "fasta"),
    ]

    rep_filename = None
    rep_type = None
    for x, feature in rep_filenames:
        if os.path.exists(x):
            rep_filename = x
            rep_type = feature

    if rep_filename is None:
        logger.error(
            f"Expected to find input fasta or fastq files {input_prefix}.rep.fa or {input_prefix}.rep.fq. Not found. Abort!"
        )
        sys.exit(-1)

    if not dun_use_group_count:
        # read group
        group_max_count_fl = {}
        group_max_count_p = {}
        for line in open(group_filename):
            # ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
            pbid, members = line.strip().split("\t")
            group_max_count_fl[pbid] = 0
            group_max_count_p[pbid] = 0
            members = members.split(",")
            for m in members:
                i = m.find("|")
                if i > 0:
                    tmp = m.split("|")[1].split("/")[1]  # ex: tmp = f30p16
                else:
                    tmp = m.split("/")[1]
                fl_count, p_count = tmp.split("p")
                fl_count = int(fl_count[1:])
                p_count = int(p_count)
                group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
                group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)

    # read abundance first
    with open(count_filename) as f:
        count_header = ""
        while True:
            cur_pos = f.tell()
            line = f.readline()
            if not line.startswith("#"):
                f.seek(cur_pos)
                break
            else:
                count_header += line
        d = {r["pbid"]: r for r in DictReader(f, delimiter="\t")}
        for k, v in d.items():
            print(k, v)

    # group_max_count_p NOT used for now
    good = [
        x
        for x in d
        if int(d[x]["count_fl"]) >= min_count
        and (dun_use_group_count or group_max_count_fl[x] >= min_count)
    ]

    # write output GFF
    with open(f"{output_prefix}.gff", "w") as f:
        for r in GFF.collapseGFFReader(gff_filename):
            if r.seqid in good:
                GFF.write_collapseGFF_format(f, r)

    # write output rep.fq
    with open(
        f"{output_prefix}.rep.{('fq' if rep_type == 'fastq' else 'fa')}", "w"
    ) as f:
        for r in SeqIO.parse(open(rep_filename), rep_type):
            if r.name.split("|")[0] in good:
                SeqIO.write(r, f, rep_type)

    # write output to .abundance.txt
    with open(f"{output_prefix}.abundance.txt", "w") as f:
        f.write(count_header)
        writer = DictWriter(
            f,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for k in good:
            r = d[k]
            writer.writerow(r)

    logger.info(
        f"Output written to: {output_prefix}.gff\n"
        f"Output written to: {output_prefix}.rep.{'fq' if rep_type == 'fastq' else 'fa'}\n"
        f"Output written to: {output_prefix}.abundance.txt"
    )
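# The group-count parsing in filter_by_count above extracts FL and partial counts from
# cluster member names such as 'i0HQ_54b0ca|c58773/f30p16/700' (the example given in its
# comment), where the middle field 'f30p16' encodes 30 full-length and 16 partial reads.
# A minimal sketch of that parsing on the example string:
_member = "i0HQ_54b0ca|c58773/f30p16/700"
_tmp = (_member.split("|")[1] if "|" in _member else _member).split("/")[1]  # 'f30p16'
_fl_count, _p_count = _tmp.split("p")  # 'f30', '16'
_fl_count = int(_fl_count[1:])         # strip the leading 'f' -> 30
_p_count = int(_p_count)               # 16
print(_fl_count, _p_count)             # 30 16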
def cleanup_scrubbed_files_redundancy(
    gff_filename: Union[str, Path],
    group_filename: Union[str, Path],
    count_filename: Union[str, Path],
    fastq_filename: Optional[Union[str, Path]],
    output_prefix: str,
) -> None:

    # key (chr, strand) -> dict of junction-chain string -> list of records sharing those junctions
    junction_seen = defaultdict(lambda: defaultdict(list))
    for r in GFF.collapseGFFReader(gff_filename):
        n = len(r.ref_exons)
        if n == 1:
            junc_str = f"{str(r.start)},{str(r.end)}"
            junction_seen[r.chr, r.strand][junc_str] = [r]
        else:
            junc_str = ",".join(
                f"{str(r.ref_exons[i].end)},{str(r.ref_exons[i + 1].start)}"
                for i in range(n - 1))
            junction_seen[r.chr, r.strand][junc_str].append(r)

    # write out cleaned GFF
    with open(f"{output_prefix}.gff",
              "w") as outf, open(f"{output_prefix}.merged_ids.txt",
                                 "w") as outf2:
        merged = {}
        keys = list(junction_seen.keys())
        keys.sort()
        for k in keys:
            for bunch in junction_seen[k].values():
                if len(bunch) == 1:  # just one record, write it out
                    r = bunch[0]
                    GFF.write_collapseGFF_format(outf, r)
                    merged[r.seqid] = [r.seqid]
                else:
                    # find the representative
                    r = bunch[0]
                    for r2 in bunch[1:]:
                        if r2.end - r2.start > r.end - r.start:
                            r = r2
                    GFF.write_collapseGFF_format(outf, r)
                    merged[r.seqid] = [x.seqid for x in bunch]
                outf2.write(f"{r.seqid}\t{','.join(merged[r.seqid])}\n")

    count_d, count_header = read_count_file(count_filename)
    # write out count file
    with open(f"{output_prefix}.abundance.txt", "w") as outf:
        outf.write(count_header)
        writer = DictWriter(
            outf,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for pbid, bunch in merged.items():
            # combine the counts
            r = count_d[bunch[0]]
            r["pbid"] = pbid
            for field in fields_to_add:
                r[field] = float(r[field])
            for _id in bunch[1:]:
                for field in fields_to_add:
                    r[field] += float(count_d[_id][field])
            writer.writerow(r)

    group_info = read_group_file(group_filename)
    # write out group file
    with open(f"{output_prefix}.group.txt", "w") as outf:
        for pbid, bunch in merged.items():
            # combine the groups
            g = [group_info[bunch[0]]]
            for _id in bunch[1:]:
                g.append(group_info[_id])
            outf.write(f"{pbid}\t{','.join(g)}\n")

    # write out fastq file if present
    if fastq_filename is not None:
        with open(f"{output_prefix}.rep.fq", "w") as outf:
            for r in SeqIO.parse(open(fastq_filename), "fastq"):
                if r.id.split("|")[0] in merged or r.id in merged:
                    SeqIO.write(r, outf, "fastq")

    logger.info(
        f"scrubbed files written: {output_prefix}.gff, {output_prefix}.group.txt, {output_prefix}.abundance.txt, {output_prefix}.merged_ids.txt"
    )
def chain_split_file(
    ref_gff: Path,
    ref_group: Path,
    ref_name: str,
    addon_gff: Path,
    addon_group: Path,
    addon_name: str,
    fuzzy_junction: int,
    allow_5merge: bool,
    max_3_diff: int,
    n_chunks: int,
) -> Tuple[List[str], List[str]]:
    """
    Organize entries in both a gff and transcript group file
    and split both such that the original two files are split into chunks
    where gff.chunk.n covers the same entries as group.chunk.n
    """

    # read in the group_file as a dictionary in the form of
    # {
    #   'PB.1.1': ["transcript/1"],
    #   'PB.1.2': ["transcript/2", "transcript/3"]
    # }
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    # with addon_group.open('r') as ag:
    #     addon_group_info = {_.split('\t')[0]: _.split('\t')[1].split(",") for _ in ag.readlines()}
    recs = []
    tree = OrderedDict()
    i = 0
    # for r in HTSeq.GFF_Reader(addon_gff):
    # if r.iv.chrom not in tree2:
    #     tree[r.iv.chrom] = {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)}
    #     tree[r.iv.chrom][r.iv.strand].insert(r.iv.start, r.iv.end, i)
    #     recs.append(r)
    #     i += 1

    # This should build a structure in the form of:
    # {"chrN":
    #   {
    #       "+" : bx.intervals.cluster.clusterTree,
    #       "-" : bx.intervals.cluster.clusterTree,
    #   },
    # "chrN+1":
    #   {
    #       "+" : bx.intervals.cluster.clusterTree,
    #       "-" : bx.intervals.cluster.clusterTree,
    #   },
    # }
    # CusterTree objects have the form
    #   [(x,y,[z]), (a,b,[c]), (m,n,[o])]
    #   where each tuple is a range and a list of ids that lie within that range
    # e.g. (from the bx-python docs):
    # tree = ClusterTree(0, 0) Insert (6, 7, 1), (1, 2, 3), (9, 10, 2), (3, 4, 0), (3, 8, 4)
    # tree.getregions() returns [(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])]

    # NOTE: GFF.collapseGFFReader is a specialized GFF reader that in the attributes
    # field stores a list of bx.intervals.intersection.Interval objects
    # describing the exons
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)

    split_files = []
    i = 0
    counter = 0
    f_gff = open(f"{addon_gff}.split{str(i)}", "w")
    f_group = open(f"{addon_group}.split{str(i)}", "w")
    # this loop is going to reorder everything
    # so that we have a GFF with a transcript followed by all the exons that
    # made up that transcript and a separate file with the matching
    # transcript_id transcript/read_group#
    # (see the sp.MegaPBTree above)
    for v1 in tree.values():
        for strand in ("+", "-"):
            v2 = v1[strand]
            for *_, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write(
                        f"{recs[cur].seqid}\t{','.join(addon_group_info[recs[cur].seqid])}\n"
                    )
                    counter += 1
            if counter >= (i + 1) * chunk_size:
                i += 1
                n = f_gff.tell()
                f_gff.close()
                f_group.close()
                if n == 0:  # didn't write any records, delete these
                    Path(f_gff.name).unlink()
                    Path(f_group.name).unlink()
                else:
                    split_files.append((f_gff.name, f_group.name))
                if i >= n_chunks or counter >= len(recs):
                    break
                f_gff = open(f"{addon_gff}.split{str(i)}", "w")
                f_group = open(f"{addon_group}.split{str(i)}", "w")
    if not f_gff.closed:
        n = f_gff.tell()
        f_gff.close()
        f_group.close()
        if n == 0:  # didn't write any records, delete these
            Path(f_gff.name).unlink()
            Path(f_group.name).unlink()
        else:
            split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(
            target=chain_helper,
            args=(
                ref_gff,
                ref_group,
                split_gff,
                split_group,
                ref_name,
                f"{addon_name}.{str(i)}",
                fuzzy_junction,
                allow_5merge,
                max_3_diff,
            ),
        )
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, f"{addon_name}.{str(i)}"))
    for p in pools:
        p.join()
    return result_prefixes, split_files
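# The bx-python ClusterTree behaviour relied on above (and described in the comments inside
# chain_split_file) in a directly runnable form, assuming bx-python is installed:
from bx.intervals.cluster import ClusterTree

_ct = ClusterTree(0, 0)  # min distance 0, min cluster size 0
for _start, _end, _id in [(6, 7, 1), (1, 2, 3), (9, 10, 2), (3, 4, 0), (3, 8, 4)]:
    _ct.insert(_start, _end, _id)
# Intervals within the allowed distance are merged into regions, each carrying the ids inside it.
print(_ct.getregions())  # [(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])]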
def make_fake_genome(
    genome_filename,
    gff_filename,
    ref_chr,
    ref_start,
    ref_end,
    ref_strand,
    output_prefix,
    output_name=None,
    genome_d=None,
):
    if genome_d is None:
        logger.info(f"Reading genome file {genome_filename}...")
        d = SeqIO.to_dict(SeqIO.parse(open(genome_filename), "fasta"))
    else:
        d = genome_d

    if output_name is None:
        output_name = f"fake_{genome_filename}"

    logger.info(f"Reading GFF file {gff_filename}...")
    good = []
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        if (r.chr == ref_chr and r.strand == ref_strand
                and (ref_start <= r.start < r.end <= ref_end)
                and len(r.ref_exons) > 1):
            logger.info(f"Adding {r.seqid} to fake genome.")
            good.append(r)

    if len(good) == 0:
        raise RuntimeError(
            f"Did not find any transcripts strictly within {ref_chr}:{ref_start}-{ref_end} on strand {ref_strand}. Abort!"
        )

    c = ClusterTree(0, 0)
    for r in good:
        for e in r.ref_exons:
            c.insert(
                e.start - extra_bp_around_junctions,
                e.end + extra_bp_around_junctions,
                1,
            )

    regions = [(a, b) for (a, b, junk) in c.getregions()]
    regions[0] = (regions[0][0] - __padding_before_after__, regions[0][1])
    regions[-1] = (regions[-1][0], regions[-1][1] + __padding_before_after__)

    with open(output_prefix + ".fasta", "w") as f:
        f.write(">" + output_name + "\n")
        for a, b in regions:
            f.write(str(d[r.chr][a:b].seq))
        f.write("\n")

    # for mapping, write <0-based index on fake genome>, <ref chrom>, <0-based index on ref genome>
    with open(output_prefix + ".mapping.txt", "w") as f:
        i = 0
        for a, b in regions:
            for j in range(a, b):
                f.write(f"{i},{ref_chr},{j}\n")
                i += 1

        with open(output_prefix + ".pbids.txt", "w") as f:
            f.write("\n".join(r.seqid for r in good) + "\n")

    logger.info(
        f"Output written to {output_prefix}.fasta, {output_prefix}.mapping.txt, {output_prefix}.pbids.txt."
    )
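# make_fake_genome writes '<output_prefix>.mapping.txt' with lines of the form
# '<0-based fake-genome position>,<ref chrom>,<0-based ref position>'. A hedged sketch of how
# such a file could be read back to translate fake-genome coordinates to reference
# coordinates (the file name below is a placeholder, not part of the original code):
def _load_fake_to_ref_mapping(mapping_filename):
    mapping = {}
    with open(mapping_filename) as f:
        for line in f:
            fake_pos, ref_chrom, ref_pos = line.strip().split(",")
            mapping[int(fake_pos)] = (ref_chrom, int(ref_pos))
    return mapping

# hypothetical usage: _load_fake_to_ref_mapping("fake.mapping.txt")[0] -> ('chr1', 12345)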
    def read_gff_as_interval_tree(self):
        """
        Read a collapsed GFF file into an IntervalTree
        """
        for r in GFF.collapseGFFReader(self.gff_filename):
            self.tree[r.chr][r.strand].insert(r.start, r.end, r)
def main(
    count_filename: Path = typer.Argument(
        ..., help="Count file (generally ends with '.abundance.txt')"),
    gff_filename: Path = typer.Argument(..., help="Annotation file"),
    rep_filename: Path = typer.Argument(
        ...,
        help="Sequence file (ends with '.fq', '.fastq', '.fa', or '.fasta')"),
    fuzzy_junction: int = typer.Option(
        5, help="Fuzzy junction max dist (default: 5bp)"),
    sample_directory: Optional[Path] = typer.Option(
        None,
        help="Directory in which the sample data resides. By default, uses the directory from which the script was called.",
    ),
    output_prefix: Optional[str] = typer.Option(
        None, help="Prefix to use when naming the filtered files"),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:

    if not sample_directory:
        sample_directory = Path.cwd()

    if output_prefix is None:
        output_prefix = f"{gff_filename.stem}.filtered"

    rep_type = "fastq" if rep_filename.suffix in (".fq", ".fastq") else "fasta"

    sanity_check_collapse_input(
        count_filename,
        gff_filename,
        rep_filename,
        sample_directory,
    )

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith("PB.")
        recs[int(r.seqid.split(".")[1])].append(r)

    good = []
    with open(f"{output_prefix}.gff", "w") as f:
        keys = list(recs.keys())
        keys.sort()
        for k in keys:
            xxx = recs[k]
            filter_out_subsets(xxx, fuzzy_junction)
            for r in xxx:
                GFF.write_collapseGFF_format(f, r)
                good.append(r.seqid)

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    with open(f"{output_prefix}.rep.{('fq' if rep_type == 'fastq' else 'fa')}",
              "w") as f:
        for r in SeqIO.parse(open(rep_filename), rep_type):
            if r.name.split("|")[0] in good:
                SeqIO.write(r, f, rep_type)

    # write output to .abundance.txt
    with open(f"{output_prefix}.abundance.txt", "w") as f:
        f.write(count_header)
        writer = DictWriter(
            f,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for k in good:
            r = d[k]
            writer.writerow(r)

    logger.info(f"Output written to: {output_prefix}.gff\n"
                f"Output written to: {rep_filename}\n"
                f"Output written to: {output_prefix}.abundance.txt")
def combine_split_chained_results(
    output_prefixes,
    final_prefix,
    ref_gff,
    ref_group,
    ref_name,
    ref_fq,
    addon_gff,
    addon_group,
    addon_name,
    addon_fq,
):
    """
    Each <output_prefix> will have .gff, .group.txt, .mega_info.txt.
    There should be NO overlap between the split files, so clean merge should be possible!

    1. read the .gff files, record the group and mega (id-map) info
    2. sort the total records so can properly put on a unified superPBID
    3. write out the unified result
    4. delete the split files
    """

    # sanity check files are all there
    split_files = []  # tuple of (gff, group, mega)
    for ref_name, o in output_prefixes:
        gff_file = Path(f"tmp_{o}.gff")
        mega_file = Path(f"tmp_{o}.mega_info.txt")
        group_file = Path(f"tmp_{o}.group.txt")
        if not (gff_file.exists() and mega_file.exists() and group_file.exists()):
            raise RuntimeError(
                f"Expects to see {gff_file},{mega_file},{group_file} but one or more files are missing! Abort!"
            )
        split_files.append((ref_name, o, gff_file, group_file, mega_file))

    use_fq = False
    if ref_fq is not None and addon_fq is not None:
        use_fq = True
        ref_fq_dict = {
            r.id.split("|")[0]: r
            for r in SeqIO.parse(open(ref_fq), "fastq")
        }
        addon_fq_dict = {
            r.id.split("|")[0]: r
            for r in SeqIO.parse(open(addon_fq), "fastq")
        }

    mega_info = {}  # ref id -> list of matching query_id, or empty list
    split_unmatched = set()

    for (ref_name, split_name, gff_file, group_file, mega_file) in split_files:
        for r in DictReader(open(mega_file), delimiter="\t"):
            if r[ref_name] != "NA":
                if r[ref_name] not in mega_info:
                    mega_info[r[ref_name]] = []
                if r[split_name] != "NA":
                    mega_info[r[ref_name]].append(r[split_name])
            else:  # ref is NA, non-ref is not NA
                split_unmatched.add(r[split_name])

    # make a rec list of matches of (ref_id, addon_id, representative record, combined group info) where rec_ref or ref_addon could be None, but not both
    rec_list = []
    d_ref = {r.seqid: r for r in GFF.collapseGFFReader(ref_gff)}
    d_addon = {r.seqid: r for r in GFF.collapseGFFReader(addon_gff)}

    ref_group_info = sp.MegaPBTree.read_group(ref_group, None)
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)

    for ref_id, matches in mega_info.items():
        if len(matches) == 0:
            rec_list.append(
                sp.MatchRecord(
                    ref_id=ref_id,
                    addon_id="NA",
                    rec=d_ref[ref_id],
                    members=ref_group_info[ref_id],
                    seqrec=ref_fq_dict[ref_id] if use_fq else None,
                ))
        else:
            for addon_id in matches:
                r1 = d_ref[ref_id]
                r2 = d_addon[addon_id]
                if (r1.end - r1.start) > (r2.end - r2.start):
                    rec_list.append(
                        sp.MatchRecord(
                            ref_id=ref_id,
                            addon_id=addon_id,
                            rec=r1,
                            members=ref_group_info[ref_id] +
                            addon_group_info[addon_id],
                            seqrec=ref_fq_dict[ref_id] if use_fq else None,
                        ))
                else:
                    rec_list.append(
                        sp.MatchRecord(
                            ref_id=ref_id,
                            addon_id=addon_id,
                            rec=r2,
                            members=ref_group_info[ref_id] +
                            addon_group_info[addon_id],
                            seqrec=addon_fq_dict[addon_id] if use_fq else None,
                        ))
    for addon_id in split_unmatched:
        rec_list.append(
            sp.MatchRecord(
                ref_id="NA",
                addon_id=addon_id,
                rec=d_addon[addon_id],
                members=addon_group_info[addon_id],
                seqrec=addon_fq_dict[addon_id] if use_fq else None,
            ))

    sp.write_reclist_to_gff_n_info(rec_list, final_prefix, ref_name,
                                   addon_name, use_fq)
    for (ref_name, split_name, gff_file, group_file, mega_file) in split_files:
        gff_file.unlink()
        group_file.unlink()
        mega_file.unlink()
def collapse_fuzzy_junctions(
    gff_filename: Union[str, Path],
    group_filename: Union[str, Path],
    allow_extra_5exon: bool,
    internal_fuzzy_max_dist: int,
    max_5_diff: int,
    max_3_diff: int,
) -> defaultdict:
    def can_merge(m, r1, r2):
        if m == "exact":
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is continued only if (a) is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == "subset":
            r1, r2 = r2, r1  # rotate so r1 is always the longer one
        if m == "super" or m == "subset":
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == "+":
                return (
                    abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist
                    and r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
                )
            else:
                return (
                    abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist
                    and r1.ref_exons[n2 - 1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
                )
        return False

    d = {}
    # chr --> strand --> tree
    recs = defaultdict(lambda: {"+": IntervalTree(), "-": IntervalTree()})
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions(
                r,
                r2,
                internal_fuzzy_max_dist=internal_fuzzy_max_dist,
                max_5_diff=max_5_diff,
                max_3_diff=max_3_diff,
            )
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split("\t")
            group_info[pbid] = members.split(",")

    # pick for each fuzzy group the one that has the most exons
    keys = list(fuzzy_match.keys())
    keys.sort(key=lambda x: int(x.split(".")[1]))

    with open(f"{gff_filename}.fuzzy",
              "w") as f_gff, open(f"{group_filename}.fuzzy", "w") as f_group:
        for k in keys:
            all_members = []
            best_pbid, best_size, best_num_exons = (
                fuzzy_match[k][0],
                len(group_info[fuzzy_match[k][0]]),
                len(d[fuzzy_match[k][0]].ref_exons),
            )
            all_members += group_info[fuzzy_match[k][0]]
            for pbid in fuzzy_match[k][1:]:
                _num_exons = len(d[pbid].ref_exons)
                _size = len(group_info[pbid])
                all_members += group_info[pbid]
                if _num_exons > best_num_exons or (_num_exons == best_num_exons
                                                   and _size > best_size):
                    best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
            GFF.write_collapseGFF_format(f_gff, d[best_pbid])
            f_group.write(f'{best_pbid}\t{",".join(all_members)}\n')

    return fuzzy_match
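# The representative picked for each fuzzy group above is the isoform with the most exons,
# with ties broken by the larger group size. A minimal sketch showing the equivalent
# selection with max() over made-up (pbid, num_exons, group_size) tuples:
_candidates = [("PB.1.1", 3, 10), ("PB.1.2", 4, 2), ("PB.1.3", 4, 7)]
_best_pbid = max(_candidates, key=lambda t: (t[1], t[2]))[0]
print(_best_pbid)  # PB.1.3 (4 exons, larger group than PB.1.2)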
def write_reclist_to_gff_n_info(
    rec_list: List[Any],
    final_prefix: str,
    ref_name: str,
    addon_name: str,
    use_fq: bool = False,
) -> Dict[str, List[str]]:
    # now go through the rec list and figure out in what order we are outputting the total records
    tree = defaultdict(lambda: {
        "+": ClusterTree(0, 0),
        "-": ClusterTree(0, 0)
    })
    tree_keys_numeric = set()
    tree_keys_alpha = set()
    for i, match_rec in enumerate(rec_list):
        tree[match_rec.rec.chr][match_rec.rec.strand].insert(
            match_rec.rec.start, match_rec.rec.end, i)

    for chrom in tree:
        try:
            k = int(chrom)
            tree_keys_numeric.add(k)
        except ValueError:
            tree_keys_alpha.add(chrom)
    tree_keys = sorted(tree_keys_numeric) + sorted(tree_keys_alpha)

    f_info = Path(f"{final_prefix}.mega_info.txt").open("w")
    writer_info = DictWriter(
        f_info,
        fieldnames=["superPBID", ref_name, addon_name],
        delimiter="\t",
    )
    writer_info.writeheader()
    # open the rep.fq output once; handing SeqIO.write a bare Path would re-open
    # (and truncate) the file on every call
    f_fq = Path(f"{final_prefix}.rep.fq").open("w") if use_fq else None
    with open(f"{final_prefix}.gff",
              "w") as f_gff, open(f"{final_prefix}.group.txt", "w") as f_group:
        new_group_info = {}

        pb_i = 0
        for _chr in tree_keys:
            for _strand in ("+", "-"):
                for *_, _indices in tree[_chr][_strand].getregions():
                    # further sort these records by (start, end, num_exons)
                    _indices.sort(key=lambda i: (
                        rec_list[i].rec.start,
                        rec_list[i].rec.end,
                        len(rec_list[i].rec.ref_exons),
                    ))
                    pb_i += 1
                    for pb_j, recs_index in enumerate(_indices):
                        pbid = f"PB.{pb_i}.{pb_j + 1}"
                        match_rec = rec_list[recs_index]
                        new_group_info[pbid] = match_rec.members
                        match_rec.rec.seqid = pbid
                        GFF.write_collapseGFF_format(f_gff, match_rec.rec)
                        writer_info.writerow({
                            "superPBID": pbid,
                            ref_name: match_rec.ref_id,
                            addon_name: match_rec.addon_id,
                        })
                        f_group.write(
                            f"{pbid}\t{','.join(match_rec.members)}\n")
                        if use_fq:
                            match_rec.seqrec.id = pbid
                            match_rec.seqrec.description = ""
                            SeqIO.write(match_rec.seqrec, f_fq, "fastq")

    f_info.close()
    if f_fq is not None:
        f_fq.close()

    return new_group_info