def scrub_sample_GFFs(
    sample_dirs: Dict[str, str],
    gff_filename: Union[str, Path],
    count_filename: Union[str, Path],
    group_filename: Union[str, Path],
    fastq_filename: Union[str, Path],
    output_prefix: str,
    tree: IntervalTree,
) -> None:
    for _, d in sample_dirs.items():
        with Path(d, f"{output_prefix}.gff.tmp").open("w") as outf:
            for r in GFF.collapseGFFReader(Path(d, gff_filename)):
                n = len(r.ref_exons)
                if n == 1:
                    # single-exon records have no junctions to scrub; write as-is
                    GFF.write_collapseGFF_format(outf, r)
                    continue
                new_ref_exons = scrub_ref_exons(r, tree)
                if new_ref_exons is None:
                    logger.info(f"No changes made due to error: {r.seqid}")
                else:
                    # print "before:", r.ref_exons
                    # print "after :", new_ref_exons
                    r.ref_exons = new_ref_exons
                GFF.write_collapseGFF_format(outf, r)
        cleanup_scrubbed_files_redundancy(
            outf.name,
            Path(d, group_filename),
            Path(d, count_filename),
            Path(d, fastq_filename) if fastq_filename is not None else None,
            Path(d, output_prefix),
        )
def get_gff_from_list(gff_filename, listfile, partial_ok=False):
    with open(listfile) as f:
        seqs = [line.strip() for line in f]
    for r in GFF.collapseGFFReader(gff_filename):
        if (
            r.seqid in seqs
            or r.seqid.split("|")[0] in seqs
            or (partial_ok and any(r.seqid.startswith(x) for x in seqs))
        ):
            GFF.write_collapseGFF_format(sys.stdout, r)
def regroup_gff(
    pooled_gff, demux_count_file, output_prefix, out_group_dict, in_fafq=None
):
    """
    :param pooled_gff: collapsed GFF of the pooled samples
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group it belongs to (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq matching the pooled GFF
    """
    if in_fafq is not None:
        type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(
        lambda: set()
    )  # pbid --> set of tissues it is in (EM, END, R)
    for r in DictReader(open(demux_count_file), delimiter=","):
        for k, v in r.items():
            if k != "id" and int(v) > 0:
                in_tissue[r["id"]].add(k)
    # in_tissue = dict(in_tissue)

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.values():
        handles[g] = open(f"{output_prefix}_{g}_only.gff", "w")
        if in_fafq is not None:
            handles_fafq[g] = open(f"{output_prefix}_{g}_only.{type_fafq}", "w")

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        fafq_dict_keys = list(fafq_dict.keys())
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None:
                fafq_dict[m.group(1)] = fafq_dict[k]

    reader = GFF.collapseGFFReader(pooled_gff)
    for r in reader:
        groups_to_write_in = set()
        pbid = r.seqid
        if pbid not in in_tissue:
            logger.info(
                f"WARNING: {pbid} does not belong to any group indicated by out_group_dict"
            )
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])

        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)

    # flush and close all per-group output handles
    for h in handles.values():
        h.close()
    for h in handles_fafq.values():
        h.close()
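# Illustrative sketch (not part of the pipeline): how a demux count row is turned into the
# in_tissue mapping and then into output groups via out_group_dict, as regroup_gff() does above.
# The CSV rows and barcode names here are hypothetical; the real file has an "id" column plus
# one column per barcode.
def _example_demux_to_groups() -> None:
    from collections import defaultdict

    out_group_dict = {"EM1": "EM", "EM2": "EM", "END1": "END"}
    rows = [
        {"id": "PB.1.1", "EM1": "3", "EM2": "0", "END1": "1"},
        {"id": "PB.2.1", "EM1": "0", "EM2": "0", "END1": "4"},
    ]
    in_tissue = defaultdict(set)  # pbid --> set of barcodes with a non-zero count
    for r in rows:
        for k, v in r.items():
            if k != "id" and int(v) > 0:
                in_tissue[r["id"]].add(k)
    # PB.1.1 would be written to both the EM and END outputs; PB.2.1 only to END
    for pbid, barcodes in in_tissue.items():
        groups = {out_group_dict[b] for b in barcodes}
        print(pbid, sorted(groups))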
def collapse_fuzzy_junctions(
    gff_filename: Union[str, Path],
    group_filename: Union[str, Path],
    allow_extra_5exon: bool,
    internal_fuzzy_max_dist: int,
    max_5_diff: int,
    max_3_diff: int,
) -> defaultdict:
    def can_merge(m, r1, r2):
        if m == "exact":
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is continued only if (a) m is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == "subset":
            r1, r2 = r2, r1  # rotate so r1 is always the longer one
        if m == "super" or m == "subset":
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on the same 3' exon, i.e. the last acceptor site agrees,
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == "+":
                return (
                    abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start)
                    <= internal_fuzzy_max_dist
                    and r1.ref_exons[-n2].start
                    <= r2.ref_exons[0].start
                    < r1.ref_exons[-n2].end
                )
            else:
                return (
                    abs(r1.ref_exons[0].end - r2.ref_exons[0].end)
                    <= internal_fuzzy_max_dist
                    and r1.ref_exons[n2 - 1].start
                    <= r2.ref_exons[-1].end
                    < r1.ref_exons[n2 - 1].end
                )
        return False

    d = {}  # seqid --> record
    recs = defaultdict(
        lambda: {"+": IntervalTree(), "-": IntervalTree()}
    )  # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions(
                r,
                r2,
                internal_fuzzy_max_dist=internal_fuzzy_max_dist,
                max_5_diff=max_5_diff,
                max_3_diff=max_3_diff,
            )
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split("\t")
            group_info[pbid] = members.split(",")

    # pick for each fuzzy group the one that has the most exons
    keys = list(fuzzy_match.keys())
    keys.sort(key=lambda x: int(x.split(".")[1]))

    with open(f"{gff_filename}.fuzzy", "w") as f_gff, open(
        f"{group_filename}.fuzzy", "w"
    ) as f_group:
        for k in keys:
            all_members = []
            best_pbid, best_size, best_num_exons = (
                fuzzy_match[k][0],
                len(group_info[fuzzy_match[k][0]]),
                len(d[fuzzy_match[k][0]].ref_exons),
            )
            all_members += group_info[fuzzy_match[k][0]]
            for pbid in fuzzy_match[k][1:]:
                _num_exons = len(d[pbid].ref_exons)
                _size = len(group_info[pbid])
                all_members += group_info[pbid]
                if _num_exons > best_num_exons or (
                    _num_exons == best_num_exons and _size > best_size
                ):
                    best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
            GFF.write_collapseGFF_format(f_gff, d[best_pbid])
            f_group.write(f'{best_pbid}\t{",".join(all_members)}\n')

    return fuzzy_match
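# Minimal sketch (toy data, not pipeline code) of the representative-picking rule used above when
# writing the .fuzzy files: prefer the isoform with the most exons, breaking ties by the larger
# membership group. The pbids and counts below are hypothetical.
def _example_pick_fuzzy_representative() -> str:
    # pbid -> (num_exons, group_size)
    candidates = {"PB.1.1": (5, 10), "PB.1.2": (6, 2), "PB.1.3": (6, 7)}
    best_pbid, (best_num_exons, best_size) = next(iter(candidates.items()))
    for pbid, (num_exons, size) in candidates.items():
        if num_exons > best_num_exons or (num_exons == best_num_exons and size > best_size):
            best_pbid, best_num_exons, best_size = pbid, num_exons, size
    return best_pbid  # "PB.1.3": six exons, and more members than PB.1.2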
def main(
    input_prefix: str = typer.Argument(
        ..., help="Input prefix (ex: test.collapsed.min_fl_2)"
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    output_prefix = f"{input_prefix}.nomono"

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(
        input_prefix
    )

    good = []
    with open(f"{output_prefix}.gff", "w") as f:
        reader = GFF.collapseGFFReader(gff_filename)
        for r in reader:
            assert r.seqid.startswith("PB.")
            if len(r.ref_exons) > 1:
                good.append(r.seqid)
                GFF.write_collapseGFF_format(f, r)

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    with open(f"{output_prefix}.rep.fq", "w") as f:
        for r in SeqIO.parse(open(rep_filename, "r"), "fastq"):
            if r.name.split("|")[0] in good:
                SeqIO.write(r, f, "fastq")

    # write output to .abundance.txt
    with open(f"{output_prefix}.abundance.txt", "w") as f:
        f.write(count_header)
        writer = DictWriter(
            f,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for k in good:
            r = d[k]
            writer.writerow(r)

    logger.info(f"Output written to: {output_prefix}.gff")
    logger.info(f"Output written to: {output_prefix}.rep.fq")
    logger.info(f"Output written to: {output_prefix}.abundance.txt")
def filter_by_count(
    input_prefix: str,
    output_prefix: str,
    min_count: int,
    dun_use_group_count: bool = False,
) -> None:
    group_filename = f"{input_prefix}.group.txt"
    count_filename = f"{input_prefix}.abundance.txt"
    gff_filename = f"{input_prefix}.gff"
    rep_filenames = [
        (f"{input_prefix}.rep.fq", "fastq"),
        (f"{input_prefix}.rep.fastq", "fastq"),
        (f"{input_prefix}.rep.fa", "fasta"),
        (f"{input_prefix}.rep.fasta", "fasta"),
    ]

    rep_filename = None
    rep_type = None
    for x, feature in rep_filenames:
        if os.path.exists(x):
            rep_filename = x
            rep_type = feature

    if rep_filename is None:
        logger.error(
            f"Expected to find an input fasta or fastq file such as {input_prefix}.rep.fa or {input_prefix}.rep.fq. Not found. Abort!"
        )
        sys.exit(-1)

    if not dun_use_group_count:
        # read group
        group_max_count_fl = {}
        group_max_count_p = {}
        for line in open(group_filename):
            # ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
            pbid, members = line.strip().split("\t")
            group_max_count_fl[pbid] = 0
            group_max_count_p[pbid] = 0
            members = members.split(",")
            for m in members:
                i = m.find("|")
                if i > 0:
                    tmp = m.split("|")[1].split("/")[1]  # ex: tmp = f30p16
                else:
                    tmp = m.split("/")[1]
                fl_count, p_count = tmp.split("p")
                fl_count = int(fl_count[1:])
                p_count = int(p_count)
                group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
                group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)

    # read abundance first
    with open(count_filename) as f:
        count_header = ""
        while True:
            cur_pos = f.tell()
            line = f.readline()
            if not line.startswith("#"):
                f.seek(cur_pos)
                break
            else:
                count_header += line
        d = {r["pbid"]: r for r in DictReader(f, delimiter="\t")}
        for k, v in d.items():
            logger.debug(f"{k}: {v}")

    # group_max_count_p NOT used for now
    good = [
        x
        for x in d
        if int(d[x]["count_fl"]) >= min_count
        and (dun_use_group_count or group_max_count_fl[x] >= min_count)
    ]

    # write output GFF
    with open(f"{output_prefix}.gff", "w") as f:
        for r in GFF.collapseGFFReader(gff_filename):
            if r.seqid in good:
                GFF.write_collapseGFF_format(f, r)

    # write output rep.fq
    rep_output_filename = f"{output_prefix}.rep.{('fq' if rep_type == 'fastq' else 'fa')}"
    with open(rep_output_filename, "w") as f:
        for r in SeqIO.parse(open(rep_filename), rep_type):
            if r.name.split("|")[0] in good:
                SeqIO.write(r, f, rep_type)

    # write output to .abundance.txt
    with open(f"{output_prefix}.abundance.txt", "w") as f:
        f.write(count_header)
        writer = DictWriter(
            f,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for k in good:
            r = d[k]
            writer.writerow(r)

    logger.info(
        f"Output written to: {output_prefix}.gff\n"
        f"Output written to: {rep_output_filename}\n"
        f"Output written to: {output_prefix}.abundance.txt"
    )
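# Hedged, standalone sketch of the cluster member-name parsing done in filter_by_count() above:
# how the FL and partial read counts are recovered from an ID like "i0HQ_54b0ca|c58773/f30p16/700"
# (the example ID comes from the comment in the function; it is illustrative only).
def _example_parse_member_counts(member: str = "i0HQ_54b0ca|c58773/f30p16/700") -> tuple:
    i = member.find("|")
    if i > 0:
        tmp = member.split("|")[1].split("/")[1]  # "f30p16"
    else:
        tmp = member.split("/")[1]
    fl_part, p_part = tmp.split("p")  # "f30", "16"
    return int(fl_part[1:]), int(p_part)  # (30, 16): 30 FL reads, 16 partial reads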
def cleanup_scrubbed_files_redundancy(
    gff_filename: Union[str, Path],
    group_filename: Union[str, Path],
    count_filename: Union[str, Path],
    fastq_filename: Optional[Union[str, Path]],
    output_prefix: str,
):
    junction_seen = defaultdict(
        lambda: defaultdict(lambda: [])
    )  # key (chr, strand) --> dict of (series of junctions) --> records
    for r in GFF.collapseGFFReader(gff_filename):
        n = len(r.ref_exons)
        if n == 1:
            junc_str = f"{str(r.start)},{str(r.end)}"
            junction_seen[r.chr, r.strand][junc_str] = [r]
        else:
            junc_str = ",".join(
                f"{str(r.ref_exons[i].end)},{str(r.ref_exons[i + 1].start)}"
                for i in range(n - 1)
            )
            junction_seen[r.chr, r.strand][junc_str].append(r)

    # write out cleaned GFF
    with open(f"{output_prefix}.gff", "w") as outf, open(
        f"{output_prefix}.merged_ids.txt", "w"
    ) as outf2:
        merged = {}
        keys = list(junction_seen.keys())
        keys.sort()
        for k in keys:
            for bunch in junction_seen[k].values():
                if len(bunch) == 1:
                    # just one record, write it out
                    r = bunch[0]
                    GFF.write_collapseGFF_format(outf, r)
                    merged[r.seqid] = [r.seqid]
                else:
                    # find the representative (the longest-spanning record)
                    r = bunch[0]
                    for r2 in bunch[1:]:
                        if r2.end - r2.start > r.end - r.start:
                            r = r2
                    GFF.write_collapseGFF_format(outf, r)
                    merged[r.seqid] = [x.seqid for x in bunch]
                outf2.write(f"{r.seqid}\t{','.join(merged[r.seqid])}\n")

    count_d, count_header = read_count_file(count_filename)
    # write out count file
    with open(f"{output_prefix}.abundance.txt", "w") as outf:
        outf.write(count_header)
        writer = DictWriter(
            outf,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for pbid, bunch in merged.items():
            # combine the counts
            r = count_d[bunch[0]]
            r["pbid"] = pbid
            for field in fields_to_add:  # count columns to sum across merged records
                r[field] = float(r[field])
            for _id in bunch[1:]:
                for field in fields_to_add:
                    r[field] += float(count_d[_id][field])
            writer.writerow(r)

    group_info = read_group_file(group_filename)
    # write out group file
    with open(f"{output_prefix}.group.txt", "w") as outf:
        for pbid, bunch in merged.items():
            # combine the groups
            g = [group_info[bunch[0]]]
            for _id in bunch[1:]:
                g.append(group_info[_id])
            outf.write(f"{pbid}\t{','.join(g)}\n")

    # write out fastq file if present
    if fastq_filename is not None:
        with open(f"{output_prefix}.rep.fq", "w") as outf:
            for r in SeqIO.parse(open(fastq_filename), "fastq"):
                if r.id.split("|")[0] in merged or r.id in merged:
                    SeqIO.write(r, outf, "fastq")

    logger.info(
        f"scrubbed files written: {output_prefix}.gff, {output_prefix}.group.txt, "
        f"{output_prefix}.abundance.txt, {output_prefix}.merged_ids.txt"
    )
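# Small sketch (toy coordinates, not pipeline code) of the junction-string keys built above:
# multi-exon records are keyed by their ordered donor/acceptor coordinates, so records with
# identical junction chains collapse together regardless of where their terminal exons end.
def _example_junction_key() -> str:
    # exon intervals as (start, end); each junction is (end of exon i, start of exon i+1)
    exons = [(100, 200), (300, 400), (500, 600)]
    n = len(exons)
    return ",".join(f"{exons[i][1]},{exons[i + 1][0]}" for i in range(n - 1))
    # -> "200,300,400,500"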
def write_reclist_to_gff_n_info(
    rec_list: List[Any],
    final_prefix: str,
    ref_name: str,
    addon_name: str,
    use_fq: bool = False,
) -> Dict[str, str]:
    # go through the rec list and figure out in what order we are outputting the total records
    tree = defaultdict(lambda: {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)})
    tree_keys_numeric = set()
    tree_keys_alpha = set()
    for i, match_rec in enumerate(rec_list):
        tree[match_rec.rec.chr][match_rec.rec.strand].insert(
            match_rec.rec.start, match_rec.rec.end, i
        )

    for chrom in tree:
        try:
            int(chrom)
            tree_keys_numeric.add(chrom)
        except ValueError:
            tree_keys_alpha.add(chrom)
    # numeric chromosome names first (in numeric order), then the rest alphabetically
    tree_keys = sorted(tree_keys_numeric, key=int) + sorted(tree_keys_alpha)

    f_info = Path(f"{final_prefix}.mega_info.txt").open("w")
    writer_info = DictWriter(
        f_info,
        fieldnames=["superPBID", ref_name, addon_name],
        delimiter="\t",
    )
    writer_info.writeheader()

    if use_fq:
        f_fq = Path(f"{final_prefix}.rep.fq").open("w")

    with open(f"{final_prefix}.gff", "w") as f_gff, open(
        f"{final_prefix}.group.txt", "w"
    ) as f_group:
        new_group_info = {}
        pb_i = 0
        for _chr in tree_keys:
            for _strand in ("+", "-"):
                for *_, _indices in tree[_chr][_strand].getregions():
                    # further sort these records by (start, end, num_exons)
                    _indices.sort(
                        key=lambda i: (
                            rec_list[i].rec.start,
                            rec_list[i].rec.end,
                            len(rec_list[i].rec.ref_exons),
                        )
                    )
                    pb_i += 1
                    for pb_j, recs_index in enumerate(_indices):
                        pbid = f"PB.{pb_i}.{pb_j + 1}"
                        match_rec = rec_list[recs_index]
                        new_group_info[pbid] = match_rec.members
                        match_rec.rec.seqid = pbid
                        GFF.write_collapseGFF_format(f_gff, match_rec.rec)
                        writer_info.writerow(
                            {
                                "superPBID": pbid,
                                ref_name: match_rec.ref_id,
                                addon_name: match_rec.addon_id,
                            }
                        )
                        f_group.write(f"{pbid}\t{','.join(match_rec.members)}\n")
                        if use_fq:
                            match_rec.seqrec.id = pbid
                            match_rec.seqrec.description = ""
                            SeqIO.write(match_rec.seqrec, f_fq, "fastq")

    f_info.close()
    if use_fq:
        f_fq.close()
    return new_group_info
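# Illustrative sketch (toy data) of the superPBID numbering used in write_reclist_to_gff_n_info():
# pb_i counts cluster regions in chromosome/strand order, and pb_j numbers the isoforms within a
# region after sorting by (start, end, exon count).
def _example_superpbid_numbering() -> list:
    regions = [["t1", "t2"], ["t3"]]  # two cluster regions with their member isoforms
    pbids = []
    pb_i = 0
    for members in regions:
        pb_i += 1
        for pb_j, _ in enumerate(members):
            pbids.append(f"PB.{pb_i}.{pb_j + 1}")
    return pbids  # ["PB.1.1", "PB.1.2", "PB.2.1"]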
def main(
    count_filename: Path = typer.Argument(
        ..., help="Count file (generally ends with '.abundance.txt')"
    ),
    gff_filename: Path = typer.Argument(..., help="Annotation file"),
    rep_filename: Path = typer.Argument(
        ..., help="Sequence file (ends with '.fq', '.fastq', '.fa', or '.fasta')"
    ),
    fuzzy_junction: int = typer.Option(
        5, help="Fuzzy junction max dist (default: 5bp)"
    ),
    sample_directory: Optional[Path] = typer.Option(
        None,
        help="Directory in which the sample data resides. By default uses the directory from which the script was called",
    ),
    output_prefix: Optional[str] = typer.Option(
        None, help="Prefix to use when naming the filtered files"
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    if not sample_directory:
        sample_directory = Path.cwd()

    if output_prefix is None:
        output_prefix = f"{gff_filename.stem}.filtered"

    rep_type = "fastq" if rep_filename.suffix in (".fq", ".fastq") else "fasta"

    sanity_check_collapse_input(
        count_filename,
        gff_filename,
        rep_filename,
        sample_directory,
    )

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith("PB.")
        recs[int(r.seqid.split(".")[1])].append(r)

    good = []
    with open(f"{output_prefix}.gff", "w") as f:
        keys = list(recs.keys())
        keys.sort()
        for k in keys:
            xxx = recs[k]
            filter_out_subsets(xxx, fuzzy_junction)
            for r in xxx:
                GFF.write_collapseGFF_format(f, r)
                good.append(r.seqid)

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    rep_output_filename = f"{output_prefix}.rep.{('fq' if rep_type == 'fastq' else 'fa')}"
    with open(rep_output_filename, "w") as f:
        for r in SeqIO.parse(open(rep_filename), rep_type):
            if r.name.split("|")[0] in good:
                SeqIO.write(r, f, rep_type)

    # write output to .abundance.txt
    with open(f"{output_prefix}.abundance.txt", "w") as f:
        f.write(count_header)
        writer = DictWriter(
            f,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for k in good:
            r = d[k]
            writer.writerow(r)

    logger.info(
        f"Output written to: {output_prefix}.gff\n"
        f"Output written to: {rep_output_filename}\n"
        f"Output written to: {output_prefix}.abundance.txt"
    )
def chain_split_file(
    ref_gff: Path,
    ref_group: Path,
    ref_name: str,
    addon_gff: Path,
    addon_group: Path,
    addon_name: str,
    fuzzy_junction: int,
    allow_5merge: bool,
    max_3_diff: int,
    n_chunks: int,
) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
    """
    Organize entries in both a gff and transcript group file and split both
    such that the original two files are split into chunks where gff.chunk.n
    covers the same entries as group.chunk.n
    """
    # read in the group_file as a dictionary in the form of
    # {
    #     'PB.1.1': ["transcript/1"],
    #     'PB.1.2': ["transcript/2", "transcript/3"]
    # }
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    # with addon_group.open('r') as ag:
    #     addon_group_info = {_.split('\t')[0]: _.split('\t')[1].split(",") for _ in ag.readlines()}

    recs = []
    tree = OrderedDict()
    i = 0
    # for r in HTSeq.GFF_Reader(addon_gff):
    #     if r.iv.chrom not in tree:
    #         tree[r.iv.chrom] = {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)}
    #     tree[r.iv.chrom][r.iv.strand].insert(r.iv.start, r.iv.end, i)
    #     recs.append(r)
    #     i += 1

    # This should build a structure in the form of:
    # {"chrN":
    #     {
    #         "+": bx.intervals.cluster.ClusterTree,
    #         "-": bx.intervals.cluster.ClusterTree,
    #     },
    #  "chrN+1":
    #     {
    #         "+": bx.intervals.cluster.ClusterTree,
    #         "-": bx.intervals.cluster.ClusterTree,
    #     },
    # }
    # ClusterTree objects have the form
    #     [(x, y, [z]), (a, b, [c]), (m, n, [o])]
    # where each tuple is a range and a list of ids that lie within that range
    # e.g. (from the bx-python docs):
    #     tree = ClusterTree(0, 0)
    #     insert (6, 7, 1), (1, 2, 3), (9, 10, 2), (3, 4, 0), (3, 8, 4)
    #     tree.getregions() returns [(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])]

    # NOTE: GFF.collapseGFFReader is a specialized GFF reader that in the attributes
    # field stores a list of bx.intervals.intersection.Interval objects
    # describing the exons
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)

    split_files = []
    i = 0
    counter = 0
    f_gff = open(f"{addon_gff}.split{str(i)}", "w")
    f_group = open(f"{addon_group}.split{str(i)}", "w")
    # this loop is going to reorder everything
    # so that we have a GFF with a transcript followed by all the exons that
    # made up that transcript and a separate file with the matching
    # transcript_id   transcript/read_group#
    # (see the sp.MegaPBTree above)
    for v1 in tree.values():
        for strand in ("+", "-"):
            v2 = v1[strand]
            for *_, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write(
                        f"{recs[cur].seqid}\t{','.join(addon_group_info[recs[cur].seqid])}\n"
                    )
                    counter += 1
                if counter >= (i + 1) * chunk_size:
                    i += 1
                    n = f_gff.tell()
                    f_gff.close()
                    f_group.close()
                    if n == 0:  # didn't write any records, delete these
                        Path(f_gff.name).unlink()
                        Path(f_group.name).unlink()
                    else:
                        split_files.append((f_gff.name, f_group.name))
                    if i >= n_chunks or counter >= len(recs):
                        break
                    f_gff = open(f"{addon_gff}.split{str(i)}", "w")
                    f_group = open(f"{addon_group}.split{str(i)}", "w")

    if not f_gff.closed:
        n = f_gff.tell()
        f_gff.close()
        f_group.close()
        if n == 0:  # didn't write any records, delete these
            Path(f_gff.name).unlink()
            Path(f_group.name).unlink()
        else:
            split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(
            target=chain_helper,
            args=(
                ref_gff,
                ref_group,
                split_gff,
                split_group,
                ref_name,
                f"{addon_name}.{str(i)}",
                fuzzy_junction,
                allow_5merge,
                max_3_diff,
            ),
        )
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, f"{addon_name}.{str(i)}"))
    for p in pools:
        p.join()
    return result_prefixes, split_files
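# Brief sketch of the chunk sizing used in chain_split_file(): integer ceiling division, so that
# n records split across at most n_chunks files. The default arguments here are arbitrary.
def _example_chunk_size(n: int = 10, n_chunks: int = 3) -> int:
    return (n // n_chunks) + (n % n_chunks > 0)  # 10 records over 3 chunks -> 4 per chunk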