def scrub_sample_GFFs(
    sample_dirs: Dict[str, str],
    gff_filename: Union[str, Path],
    count_filename: Union[str, Path],
    group_filename: Union[str, Path],
    fastq_filename: Union[str, Path],
    output_prefix: str,
    tree: IntervalTree,
) -> None:
    for _, d in sample_dirs.items():
        with Path(d, f"{output_prefix}.gff.tmp").open("w") as outf:
            for r in GFF.collapseGFFReader(Path(d, gff_filename)):
                n = len(r.ref_exons)
                if n == 1:
                    GFF.write_collapseGFF_format(outf, r)
                new_ref_exons = scrub_ref_exons(r, tree)
                if new_ref_exons is None:
                    logger.info(f"No changes made due to error: {r.seqid}")
                else:
                    # print "before:", r.ref_exons
                    # print "after :", new_ref_exons
                    r.ref_exons = new_ref_exons
                    GFF.write_collapseGFF_format(outf, r)
        cleanup_scrubbed_files_redundancy(
            outf.name,
            Path(d, group_filename),
            Path(d, count_filename),
            Path(d, fastq_filename) if fastq_filename is not None else None,
            Path(d, output_prefix),
        )
def get_gff_from_list(gff_filename, listfile, partial_ok=False):
    seqs = [line.strip() for line in open(listfile)]
    for r in GFF.collapseGFFReader(gff_filename):
        if (
            r.seqid in seqs
            or r.seqid.split("|")[0] in seqs
            or (partial_ok and any(r.seqid.startswith(x) for x in seqs))
        ):
            GFF.write_collapseGFF_format(sys.stdout, r)
def regroup_gff(
    pooled_gff, demux_count_file, output_prefix, out_group_dict, in_fafq=None
):
    """
    :param pooled_gff: collapsed GFF file of the pooled samples
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group to belong in (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq that was the input to the SAM
    """
    if in_fafq is not None:
        type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(
        lambda: set()
    )  # pbid --> set of tissues it is in (EM, END, R)

    for r in DictReader(open(demux_count_file), delimiter=","):
        for k, v in r.items():
            if k != "id" and int(v) > 0:
                in_tissue[r["id"]].add(k)

    # in_tissue = dict(in_tissue)

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.values():
        handles[g] = open(f"{output_prefix}_{g}_only.gff", "w")
        if in_fafq is not None:
            handles_fafq[g] = open(f"{output_prefix}_{g}_only.{type_fafq}", "w")

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        fafq_dict_keys = list(fafq_dict.keys())
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None:
                fafq_dict[m.group(1)] = fafq_dict[k]

    reader = GFF.collapseGFFReader(pooled_gff)
    for r in reader:
        groups_to_write_in = set()
        pbid = r.seqid
        if pbid not in in_tissue:
            logger.info(
                f"WARNING: {pbid} does not belong to any group indicated by outgroup_dict"
            )
            continue
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])

        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)
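# A minimal usage sketch for regroup_gff (not part of the original source; the file
# paths below are hypothetical placeholders). Barcodes mapped to the same group name
# share one "<output_prefix>_<group>_only.gff" (plus a matching fasta/fastq when
# in_fafq is given).
def _example_regroup_gff() -> None:
    out_group_dict = {"EM1": "EM", "EM2": "EM", "END1": "END", "R1": "R"}  # barcode -> group
    regroup_gff(
        pooled_gff="pooled.collapsed.gff",              # hypothetical path
        demux_count_file="pooled.mapped_fl_count.txt",  # hypothetical path
        output_prefix="demux_pooled",
        out_group_dict=out_group_dict,
        in_fafq="pooled.rep.fastq",                     # optional, hypothetical
    )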
def __init__(
    self,
    gff_filename: Union[str, Path],
    group_filename: Union[str, Path],
    internal_fuzzy_max_dist: int = 0,
    self_prefix: str = None,
    fastq_filename: Union[str, Path] = None,
    fusion_max_dist: int = 10,
):
    """
    Differences with non-fusion MegaPBTree:

    1. allow_5merge is always FALSE. Not a parameter.
    2. fusion_max_dist --- maximum allowed distance on internal fusion sites
       to be called as equivalent fusions
    """
    super().__init__(
        gff_filename,
        group_filename,
        internal_fuzzy_max_dist,
        self_prefix,
        False,
        fastq_filename,
    )
    self.fusion_max_dist = fusion_max_dist
    # ex: PBfusion.1 -> [PBfusion.1.1, PBfusion.1.2]
    self.record_d_fusion = {
        fusion_id: records
        for fusion_id, records in GFF.collapseGFFFusionReader(gff_filename)
    }
def __init__(
    self,
    gff_filename: str,
    group_filename: str,
    internal_fuzzy_max_dist: int = 0,
    self_prefix: Optional[str] = None,
    allow_5merge: bool = False,
    fastq_filename: Optional[str] = None,
    max_3_diff: Optional[int] = None,
):
    self.gff_filename = gff_filename
    self.group_filename = group_filename
    self.self_prefix = self_prefix
    self.internal_fuzzy_max_dist = internal_fuzzy_max_dist
    self.max_3_diff = max_3_diff
    self.allow_5merge = allow_5merge
    self.record_d = {r.seqid: r for r in GFF.collapseGFFReader(gff_filename)}
    # sanity_check_seqids(self.record_d.keys())  # sanity check all IDs look like PB.1.2
    self.tree = defaultdict(
        lambda: {"+": IntervalTree(), "-": IntervalTree()}
    )  # chr --> strand --> tree
    self.fastq_dict = None
    if fastq_filename is not None:
        self.fastq_dict = MegaPBTree.read_fastq_to_dict(fastq_filename)

    self.read_gff_as_interval_tree()
    self.group_info = MegaPBTree.read_group(
        self.group_filename, self.self_prefix
    )  # ex: PB.1.1 --> [ RatHeart|i3_c123.... ]
def add_sample(
    self,
    gff_filename: Union[str, Path],
    group_filename: Union[str, Path],
    sample_prefix: str,
    output_prefix: str,
    fastq_filename: Union[str, Path] = None,
) -> None:
    combined = []  # list of (<matches to r2 or None>, r2)
    unmatched_recs = set(self.record_d.keys())

    for r in GFF.collapseGFFReader(gff_filename):
        # for each collapsed transcript, find records that overlap
        match_rec_list = list(self.match_record_to_tree(r))
        if len(match_rec_list) > 0:  # found match(es)! put longer of r1/r2 in
            combined.append((match_rec_list, r))
            for match_rec in match_rec_list:
                try:
                    unmatched_recs.remove(match_rec.seqid)
                except KeyError:
                    pass  # already deleted, OK, this can happen
        else:  # r is not present in current tree
            combined.append((None, r))
    # put whatever is left from the tree in
    for seqid in unmatched_recs:
        combined.append(([self.record_d[seqid]], None))

    # create a ClusterTree to re-calc the loci/transcripts
    final_tree = defaultdict(
        lambda: {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)}
    )
    for i, (r1s, r2) in enumerate(combined):
        if r1s is None:
            final_tree[r2.chr][r2.strand].insert(r2.start, r2.end, i)
        else:
            if r2 is not None:
                rep = find_representative_in_iso_list(r1s + [r2])
            else:
                rep = find_representative_in_iso_list(r1s)
            final_tree[rep.chr][rep.strand].insert(rep.start, rep.end, i)

    self.write_cluster_tree_as_gff(
        rec_list=combined,
        group_filename2=group_filename,
        sample_prefix2=sample_prefix,
        output_prefix=output_prefix,
        fastq_filename2=fastq_filename,
    )
def add_sample(
    self,
    gff_filename: Union[str, Path],
    group_filename: Union[str, Path],
    sample_prefix: str,
    output_prefix: str,
    fastq_filename: Optional[Union[str, Path]] = None,
) -> None:
    combined = []  # list of (r1 if r2 is None | r2 if r1 is None | longer of r1 or r2 if both not None)
    unmatched_recs = list(self.record_d_fusion.keys())

    for _, records in GFF.collapseGFFFusionReader(gff_filename):
        match_seqid = self.match_fusion_record(records)
        if match_seqid is not None:
            combined.append((self.record_d_fusion[match_seqid], records))
            try:
                unmatched_recs.remove(match_seqid)
            except ValueError:
                pass  # already deleted, OK, this happens for single-exon transcripts
        else:  # r is not present in current tree
            combined.append((None, records))
    # put whatever is left from the tree in
    for seqid in unmatched_recs:
        combined.append((self.record_d_fusion[seqid], None))

    # create a ClusterTree to re-calc the loci/transcripts
    final_tree = defaultdict(
        lambda: {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)}
    )
    for i, (r1s, r2s) in enumerate(combined):
        if r2s is None or (
            r1s is not None and r1s[0].end - r1s[0].start > r2s[0].end - r2s[0].start
        ):
            final_tree[r1s[0].chr][r1s[0].strand].insert(r1s[0].start, r1s[0].end, i)
        else:
            final_tree[r2s[0].chr][r2s[0].strand].insert(r2s[0].start, r2s[0].end, i)

    self.write_cluster_tree_as_gff(
        final_tree,
        combined,
        group_filename,
        sample_prefix,
        output_prefix,
        fastq_filename2=fastq_filename,
    )
def sanity_check_collapse_input(input_prefix: str) -> Tuple[Path, Path, Path]:
    """
    Check that
    1. the count, gff, rep files exist
    2. the number of records agree among the three
    """
    # group_filename = f"{input_prefix}.group.txt"
    count_filename = Path(f"{input_prefix}.abundance.txt")
    gff_filename = Path(f"{input_prefix}.gff")
    rep_filename = Path(f"{input_prefix}.rep.fq")
    if not count_filename.exists():
        logger.error(f"File {count_filename} does not exist. Abort!")
        sys.exit(-1)
    if not gff_filename.exists():
        logger.error(f"File {gff_filename} does not exist. Abort!")
        sys.exit(-1)
    if not rep_filename.exists():
        logger.error(f"File {rep_filename} does not exist. Abort!")
        sys.exit(-1)

    pbids1 = {r.id for r in SeqIO.parse(open(rep_filename, "r"), "fastq")}
    pbids2 = {r.seqid for r in GFF.collapseGFFReader(gff_filename)}
    pbids3 = set(read_count_file(count_filename)[0].keys())

    if (
        len(pbids1) != len(pbids2)
        or len(pbids2) != len(pbids3)
        or len(pbids1) != len(pbids3)
    ):
        logger.error(
            "The number of PBID records in the files disagree! Sanity check failed."
        )
        logger.error(f"# of PBIDs in {rep_filename}: {len(pbids1)}")
        logger.error(f"# of PBIDs in {gff_filename}: {len(pbids2)}")
        logger.error(f"# of PBIDs in {count_filename}: {len(pbids3)}")
        sys.exit(-1)

    return count_filename, gff_filename, rep_filename
def sanity_check_collapse_input(
    count_filename: Path,
    gff_filename: Path,
    rep_filename: Path,
    sample_directory: Path,
) -> None:
    """
    Check that
    1. the count, gff, rep files exist
    2. the number of records agree among the three
    """
    # group_filename = f"{input_prefix}.group.txt"
    if not rep_filename.exists():
        raise RuntimeError(f"Input sequence file {rep_filename.name} not found. Abort!")
    if not count_filename.exists():
        raise RuntimeError(f"File {count_filename.name} not found. Abort!")
    if not gff_filename.exists():
        raise RuntimeError(f"File {gff_filename.name} not found. Abort!")
    if not sample_directory.exists():
        raise RuntimeError(f"Directory {sample_directory.name} not found. Abort!")

    rep_type = "fastq" if rep_filename.suffix in (".fq", ".fastq") else "fasta"

    pbids1 = {r.id for r in SeqIO.parse(open(rep_filename), rep_type)}
    pbids2 = {r.seqid for r in GFF.collapseGFFReader(gff_filename)}
    pbids3 = set(read_count_file(count_filename)[0].keys())

    if (
        len(pbids1) != len(pbids2)
        or len(pbids2) != len(pbids3)
        or len(pbids1) != len(pbids3)
    ):
        logger.error(
            "The number of PBID records in the files disagree! Sanity check failed."
        )
        logger.error(f"# of PBIDs in {rep_filename}: {len(pbids1)}")
        logger.error(f"# of PBIDs in {gff_filename}: {len(pbids2)}")
        logger.error(f"# of PBIDs in {count_filename}: {len(pbids3)}")
        sys.exit(-1)
def sample_sanity_check(
    group_filename: Union[str, Path],
    gff_filename: Union[str, Path],
    count_filename: Union[str, Path],
    fastq_filename: Optional[Union[str, Path]] = None,
) -> None:
    """
    Double check that the formats are expected and all PBIDs are concordant across the files

    :return: raise Exception if sanity check failed
    """
    logger.info(
        f"Sanity checking. Retrieving PBIDs from {group_filename},{gff_filename},{count_filename}..."
    )
    ids1 = [line.strip().split()[0] for line in open(group_filename)]
    ids2 = [fusion_id for fusion_id, rs in GFF.collapseGFFFusionReader(gff_filename)]
    with open(count_filename) as f:
        for _ in range(14):
            f.readline()  # skip through the header
        ids3 = [r["pbid"] for r in DictReader(f, delimiter="\t")]
    if len(set(ids2).difference(ids1)) > 0 or len(set(ids2).difference(ids3)) > 0:
        raise Exception(
            f"Sanity check failed! Please make sure the PBIDs listed in {gff_filename} are also in {group_filename} and {count_filename}"
        )

    if fastq_filename is not None:
        ids4 = [r.id.split("|")[0] for r in SeqIO.parse(fastq_filename, "fastq")]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception(
                f"Sanity check failed! Please make sure the PBIDs listed in {gff_filename} are also in {fastq_filename}"
            )
def sample_sanity_check(
    group_filename, gff_filename, count_filename, fastq_filename=None
) -> None:
    """
    Double check that the formats are expected and all PBIDs are concordant across the files

    :return: raise Exception if sanity check failed
    """
    logger.info(
        f"Sanity checking. Retrieving PBIDs from {group_filename},{gff_filename},{count_filename}..."
    )
    ids1 = [line.strip().split()[0] for line in open(group_filename)]
    ids2 = [r.seqid for r in GFF.collapseGFFReader(gff_filename)]
    f = open(count_filename)
    while True:  # advance through the headers, which start with "#"
        cur = f.tell()
        if not f.readline().startswith("#") or f.tell() == cur:  # first non-# seen or EOF
            f.seek(cur)
            break
    ids3 = [r["pbid"] for r in DictReader(f, delimiter="\t")]
    if len(set(ids2).difference(ids1)) > 0 or len(set(ids2).difference(ids3)) > 0:
        raise Exception(
            f"Sanity check failed! Please make sure the PBIDs listed in {gff_filename} are also in {group_filename} and {count_filename}"
        )

    if fastq_filename is not None:
        ids4 = [r.id.split("|")[0] for r in SeqIO.parse(open(fastq_filename), "fastq")]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception(
                f"Sanity check failed! Please make sure the PBIDs listed in {gff_filename} are also in {fastq_filename}"
            )
def summarize_junctions(
    sample_dirs: Dict[str, Path],
    # sample_names: List[str],
    gff_filename: Union[str, Path],
    output_prefix: Union[str, Path],
    genome_d: Optional[Union[str, Path]] = None,
    junction_known: Optional[Union[str, Path]] = None,
) -> defaultdict:
    """
    1. for each sample, read all the GFF, store the junction information (both 0-based)
    """
    junc_by_chr_strand = defaultdict(
        lambda: defaultdict(list)
    )  # (seqname,strand) --> (donor,acceptor) --> samples it shows up in (more than once possible)

    for sample_name, d in sample_dirs.items():
        for r in GFF.collapseGFFReader(Path(d, gff_filename)):
            n = len(r.ref_exons)
            if n == 1:
                continue  # ignore single exon transcripts
            for i in range(n - 1):
                donor = r.ref_exons[i].end - 1  # make it 0-based
                accep = r.ref_exons[i + 1].start  # start is already 0-based
                junc_by_chr_strand[r.seqname, r.strand][donor, accep].append(sample_name)

    # write junction report
    with open(f"{output_prefix}.junction.bed", "w") as f1, open(
        f"{output_prefix}.junction_detail.txt", "w"
    ) as f:
        f1.write(f'track name=junctions description="{output_prefix}" useScore=1\n')

        JUNC_DETAIL_FIELDS = [
            "seqname",
            "left",
            "right",
            "strand",
            "num_transcript",
            "num_sample",
            "genome",
            "annotation",
            "label",
        ]

        writer = DictWriter(f, JUNC_DETAIL_FIELDS, delimiter="\t")
        writer.writeheader()

        keys = list(junc_by_chr_strand)
        keys.sort()
        for _seqname, _strand in keys:
            v = junc_by_chr_strand[_seqname, _strand]
            v_keys = list(v)
            v_keys.sort()
            labels = cluster_junctions(v_keys)
            for i, (_donor, _accep) in enumerate(v_keys):
                rec = {
                    "seqname": _seqname,
                    "left": _donor,
                    "right": _accep,
                    "strand": _strand,
                    "num_transcript": len(v[_donor, _accep]),
                    "num_sample": len(set(v[_donor, _accep])),
                }
                f1.write(
                    f"{_seqname}\t{_donor}\t{_accep + 1}\t{output_prefix}\t{len(v[_donor, _accep])}\t{_strand}\n"
                )
                # if genome is given, write acceptor-donor site
                if genome_d is None or _seqname not in genome_d:
                    rec["genome"] = "NA"
                else:
                    up, down = (
                        genome_d[_seqname][(_donor + 1):(_donor + 3)],
                        genome_d[_seqname][(_accep - 2):_accep],
                    )
                    if _strand == "+":
                        rec["genome"] = f"{str(up.seq).upper()}-{str(down.seq).upper()}"
                    else:
                        rec["genome"] = (
                            f"{str(down.reverse_complement().seq).upper()}-"
                            f"{str(up.reverse_complement().seq).upper()}"
                        )
                # if annotation is given, check if matches with annotation
                if junction_known is None:
                    rec["annotation"] = "NA"
                else:
                    if (_seqname, _strand) in junction_known and (
                        _donor,
                        _accep,
                    ) in junction_known[_seqname, _strand]:
                        rec["annotation"] = "Y"
                    else:
                        rec["annotation"] = "N"

                rec["label"] = f"{_seqname}_{_strand}_{labels[i]}"
                writer.writerow(rec)

    return junc_by_chr_strand
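# Worked example of the 0-based junction convention used above (not part of the
# original source; the exon coordinates are made up). Assuming each exon's `end` is
# stored such that `end - 1` is the 0-based last exonic base, the donor is
# `exon.end - 1` and the acceptor is the next exon's `start` (already 0-based).
def _example_junction_coords() -> None:
    from collections import namedtuple

    Exon = namedtuple("Exon", ["start", "end"])
    ref_exons = [Exon(100, 200), Exon(300, 400)]  # hypothetical two-exon transcript
    donor = ref_exons[0].end - 1   # 199, 0-based last base of exon 1
    accep = ref_exons[1].start     # 300, 0-based first base of exon 2
    assert (donor, accep) == (199, 300)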
def main(
    input_prefix: str = typer.Argument(
        ..., help="Input prefix (ex: test.collapsed.min_fl_2)"
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    output_prefix = f"{input_prefix}.nomono"

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(
        input_prefix
    )

    good = []
    with open(f"{output_prefix}.gff", "w") as f:
        reader = GFF.collapseGFFReader(gff_filename)
        for r in reader:
            assert r.seqid.startswith("PB.")
            if len(r.ref_exons) > 1:
                good.append(r.seqid)
                GFF.write_collapseGFF_format(f, r)

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    with open(f"{output_prefix}.rep.fq", "w") as f:
        for r in SeqIO.parse(open(rep_filename, "r"), "fastq"):
            if r.name.split("|")[0] in good:
                SeqIO.write(r, f, "fastq")

    # write output to .abundance.txt
    with open(f"{output_prefix}.abundance.txt", "w") as f:
        f.write(count_header)
        writer = DictWriter(
            f,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for k in good:
            r = d[k]
            writer.writerow(r)

    logger.info(f"Output written to: {output_prefix}.gff")
    logger.info(f"Output written to: {output_prefix}.rep.fq")
def filter_by_count(
    input_prefix: str,
    output_prefix: str,
    min_count: int,
    dun_use_group_count: bool = False,
) -> None:
    group_filename = f"{input_prefix}.group.txt"
    count_filename = f"{input_prefix}.abundance.txt"
    gff_filename = f"{input_prefix}.gff"
    rep_filenames = [
        (f"{input_prefix}.rep.fq", "fastq"),
        (f"{input_prefix}.rep.fastq", "fastq"),
        (f"{input_prefix}.rep.fa", "fasta"),
        (f"{input_prefix}.rep.fasta", "fasta"),
    ]

    rep_filename = None
    rep_type = None
    for x, feature in rep_filenames:
        if os.path.exists(x):
            rep_filename = x
            rep_type = feature

    if rep_filename is None:
        logger.error(
            f"Expected to find input fasta or fastq files {input_prefix}.rep.fa or {input_prefix}.rep.fq. Not found. Abort!"
        )
        sys.exit(-1)

    if not dun_use_group_count:
        # read group
        group_max_count_fl = {}
        group_max_count_p = {}
        for line in open(group_filename):
            # ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
            pbid, members = line.strip().split("\t")
            group_max_count_fl[pbid] = 0
            group_max_count_p[pbid] = 0
            members = members.split(",")
            for m in members:
                i = m.find("|")
                if i > 0:
                    tmp = m.split("|")[1].split("/")[1]  # ex: tmp = f30p16
                else:
                    tmp = m.split("/")[1]
                fl_count, p_count = tmp.split("p")
                fl_count = int(fl_count[1:])
                p_count = int(p_count)
                group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
                group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)

    # read abundance first
    with open(count_filename) as f:
        count_header = ""
        while True:
            cur_pos = f.tell()
            line = f.readline()
            if not line.startswith("#"):
                f.seek(cur_pos)
                break
            else:
                count_header += line
        d = {r["pbid"]: r for r in DictReader(f, delimiter="\t")}

    # group_max_count_p NOT used for now
    good = [
        x
        for x in d
        if int(d[x]["count_fl"]) >= min_count
        and (dun_use_group_count or group_max_count_fl[x] >= min_count)
    ]

    # write output GFF
    with open(f"{output_prefix}.gff", "w") as f:
        for r in GFF.collapseGFFReader(gff_filename):
            if r.seqid in good:
                GFF.write_collapseGFF_format(f, r)

    # write output rep.fq
    with open(
        f"{output_prefix}.rep.{('fq' if rep_type == 'fastq' else 'fa')}", "w"
    ) as f:
        for r in SeqIO.parse(open(rep_filename), rep_type):
            if r.name.split("|")[0] in good:
                SeqIO.write(r, f, rep_type)

    # write output to .abundance.txt
    with open(f"{output_prefix}.abundance.txt", "w") as f:
        f.write(count_header)
        writer = DictWriter(
            f,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for k in good:
            r = d[k]
            writer.writerow(r)

    logger.info(
        f"Output written to: {output_prefix}.gff\n"
        f"Output written to: {output_prefix}.rep.{'fq' if rep_type == 'fastq' else 'fa'}\n"
        f"Output written to: {output_prefix}.abundance.txt"
    )
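# Worked example of the group-member parsing above (not part of the original source;
# the member name is the hypothetical example quoted in the comment). A member like
# "i0HQ_54b0ca|c58773/f30p16/700" encodes FL / partial read support in its
# "f<FL>p<partial>" segment.
def _example_member_counts(member: str = "i0HQ_54b0ca|c58773/f30p16/700") -> tuple:
    if member.find("|") > 0:
        tmp = member.split("|")[1].split("/")[1]  # "f30p16"
    else:
        tmp = member.split("/")[1]
    fl_part, p_part = tmp.split("p")              # "f30", "16"
    return int(fl_part[1:]), int(p_part)          # (30, 16)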
def cleanup_scrubbed_files_redundancy(
    gff_filename: Union[str, Path],
    group_filename: Union[str, Path],
    count_filename: Union[str, Path],
    fastq_filename: Union[str, Path],
    output_prefix: str,
):
    junction_seen = defaultdict(
        lambda: defaultdict(lambda: [])
    )  # key (chr,strand) --> dict of (series of junctions) --> records

    for r in GFF.collapseGFFReader(gff_filename):
        n = len(r.ref_exons)
        if n == 1:
            junc_str = f"{str(r.start)},{str(r.end)}"
            junction_seen[r.chr, r.strand][junc_str] = [r]
        else:
            junc_str = ",".join(
                f"{str(r.ref_exons[i].end)},{str(r.ref_exons[i + 1].start)}"
                for i in range(n - 1)
            )
            junction_seen[r.chr, r.strand][junc_str].append(r)

    # write out cleaned GFF
    with open(f"{output_prefix}.gff", "w") as outf, open(
        f"{output_prefix}.merged_ids.txt", "w"
    ) as outf2:
        merged = {}
        keys = list(junction_seen.keys())
        keys.sort()
        for k in keys:
            for bunch in junction_seen[k].values():
                if len(bunch) == 1:  # just one record, write it out
                    r = bunch[0]
                    GFF.write_collapseGFF_format(outf, r)
                    merged[r.seqid] = [r.seqid]
                else:
                    # find the representative
                    r = bunch[0]
                    for r2 in bunch[1:]:
                        if r2.end - r2.start > r.end - r.start:
                            r = r2
                    GFF.write_collapseGFF_format(outf, r)
                    merged[r.seqid] = [x.seqid for x in bunch]
                outf2.write(f"{r.seqid}\t{','.join(merged[r.seqid])}\n")

    count_d, count_header = read_count_file(count_filename)
    # write out count file
    with open(f"{output_prefix}.abundance.txt", "w") as outf:
        outf.write(count_header)
        writer = DictWriter(
            outf,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for pbid, bunch in merged.items():
            # combine the counts
            r = count_d[bunch[0]]
            r["pbid"] = pbid
            for field in fields_to_add:
                r[field] = float(r[field])
            for _id in bunch[1:]:
                for field in fields_to_add:
                    r[field] += float(count_d[_id][field])
            writer.writerow(r)

    group_info = read_group_file(group_filename)
    # write out group file
    with open(f"{output_prefix}.group.txt", "w") as outf:
        for pbid, bunch in merged.items():
            # combine the groups
            g = [group_info[bunch[0]]]
            for _id in bunch[1:]:
                g.append(group_info[_id])
            outf.write(f"{pbid}\t{','.join(g)}\n")

    # write out fastq file if present
    if fastq_filename is not None:
        with open(f"{output_prefix}.rep.fq", "w") as outf:
            for r in SeqIO.parse(open(fastq_filename), "fastq"):
                if r.id.split("|")[0] in merged or r.id in merged:
                    SeqIO.write(r, outf, "fastq")

    logger.info(
        f"scrubbed files written: {output_prefix}.gff, {output_prefix}.group.txt, "
        f"{output_prefix}.abundance.txt, {output_prefix}.merged_ids.txt"
    )
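# Small illustration of the redundancy key used above (not part of the original
# source; coordinates are made up). Multi-exon records that share every internal
# junction produce the same comma-joined string, land in the same "bunch", and the
# longest one is kept as the representative.
def _example_junction_key() -> str:
    from collections import namedtuple

    Exon = namedtuple("Exon", ["start", "end"])
    ref_exons = [Exon(100, 200), Exon(300, 400), Exon(500, 600)]
    n = len(ref_exons)
    junc_str = ",".join(
        f"{ref_exons[i].end},{ref_exons[i + 1].start}" for i in range(n - 1)
    )
    return junc_str  # "200,300,400,500"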
def chain_split_file(
    ref_gff: Path,
    ref_group: Path,
    ref_name: str,
    addon_gff: Path,
    addon_group: Path,
    addon_name: str,
    fuzzy_junction: int,
    allow_5merge: bool,
    max_3_diff: int,
    n_chunks: int,
) -> Tuple[List[str], List[str]]:
    """
    Organize entries in both a gff and transcript group file and split both
    such that the original two files are split into chunks where
    gff.chunk.n covers the same entries as group.chunk.n
    """
    # read in the group_file as a dictionary in the form of
    # {
    #   'PB.1.1': ["transcript/1"],
    #   'PB.1.2': ["transcript/2", "transcript/3"]
    # }
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    recs = []
    tree = OrderedDict()
    i = 0
    # This should build a structure in the form of:
    # {"chrN":
    #     {
    #         "+" : bx.intervals.cluster.ClusterTree,
    #         "-" : bx.intervals.cluster.ClusterTree,
    #     },
    #  "chrN+1":
    #     {
    #         "+" : bx.intervals.cluster.ClusterTree,
    #         "-" : bx.intervals.cluster.ClusterTree,
    #     },
    # }
    # ClusterTree objects have the form
    #   [(x,y,[z]), (a,b,[c]), (m,n,[o])]
    # where each tuple is a range and a list of ids that lie within that range
    # e.g. (from the bx-python docs):
    #   tree = ClusterTree(0, 0); insert (6, 7, 1), (1, 2, 3), (9, 10, 2), (3, 4, 0), (3, 8, 4)
    #   tree.getregions() returns [(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])]

    # NOTE: GFF.collapseGFFReader is a specialized GFF reader that in the attributes
    # field stores a list of bx.intervals.intersection.Interval objects
    # describing the exons
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)

    split_files = []
    i = 0
    counter = 0
    f_gff = open(f"{addon_gff}.split{str(i)}", "w")
    f_group = open(f"{addon_group}.split{str(i)}", "w")
    # this loop is going to reorder everything
    # so that we have a GFF with a transcript followed by all the exons that
    # made up that transcript and a separate file with the matching
    # transcript_id transcript/read_group#
    # (see the sp.MegaPBTree above)
    for v1 in tree.values():
        for strand in ("+", "-"):
            v2 = v1[strand]
            for *_, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write(
                        f"{recs[cur].seqid}\t{','.join(addon_group_info[recs[cur].seqid])}\n"
                    )
                    counter += 1
                if counter >= (i + 1) * chunk_size:
                    i += 1
                    n = f_gff.tell()
                    f_gff.close()
                    f_group.close()
                    if n == 0:  # didn't write any records, delete these
                        Path(f_gff.name).unlink()
                        Path(f_group.name).unlink()
                    else:
                        split_files.append((f_gff.name, f_group.name))
                    if i >= n_chunks or counter >= len(recs):
                        break
                    f_gff = open(f"{addon_gff}.split{str(i)}", "w")
                    f_group = open(f"{addon_group}.split{str(i)}", "w")
    if not f_gff.closed:
        n = f_gff.tell()
        f_gff.close()
        f_group.close()
        if n == 0:  # didn't write any records, delete these
            Path(f_gff.name).unlink()
            Path(f_group.name).unlink()
        else:
            split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(
            target=chain_helper,
            args=(
                ref_gff,
                ref_group,
                split_gff,
                split_group,
                ref_name,
                f"{addon_name}.{str(i)}",
                fuzzy_junction,
                allow_5merge,
                max_3_diff,
            ),
        )
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, f"{addon_name}.{str(i)}"))
    for p in pools:
        p.join()
    return result_prefixes, split_files
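# Quick sanity sketch of the bx-python ClusterTree behavior the comments above rely
# on (not part of the original source; assumes bx-python is installed, and the
# intervals are the ones quoted from the bx-python docs in the comments).
def _example_clustertree() -> None:
    from bx.intervals.cluster import ClusterTree

    tree = ClusterTree(0, 0)  # merge distance 0, min interval count 0
    for start, end, rec_id in [(6, 7, 1), (1, 2, 3), (9, 10, 2), (3, 4, 0), (3, 8, 4)]:
        tree.insert(start, end, rec_id)
    # Per the docs cited above, this is expected to print:
    #   [(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])]
    print(tree.getregions())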
def make_fake_genome(
    genome_filename,
    gff_filename,
    ref_chr,
    ref_start,
    ref_end,
    ref_strand,
    output_prefix,
    output_name=None,
    genome_d=None,
):
    if genome_d is None:
        logger.info(f"Reading genome file {genome_filename}...")
        d = SeqIO.to_dict(SeqIO.parse(open(genome_filename), "fasta"))
    else:
        d = genome_d

    if output_name is None:
        output_name = f"fake_{genome_filename}"

    logger.info(f"Reading GFF file {gff_filename}...")
    good = []
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        if (
            r.chr == ref_chr
            and r.strand == ref_strand
            and (ref_start <= r.start < r.end <= ref_end)
            and len(r.ref_exons) > 1
        ):
            logger.info(f"Adding {r.seqid} to fake genome.")
            good.append(r)

    if len(good) == 0:
        raise RuntimeError(
            f"Did not find any transcripts strictly within {ref_chr}:{ref_start}-{ref_end} on strand {ref_strand}. Abort!"
        )

    c = ClusterTree(0, 0)
    for r in good:
        for e in r.ref_exons:
            c.insert(
                e.start - extra_bp_around_junctions,
                e.end + extra_bp_around_junctions,
                1,
            )

    regions = [(a, b) for (a, b, junk) in c.getregions()]
    regions[0] = (regions[0][0] - __padding_before_after__, regions[0][1])
    regions[-1] = (regions[-1][0], regions[-1][1] + __padding_before_after__)

    with open(output_prefix + ".fasta", "w") as f:
        f.write(">" + output_name + "\n")
        for a, b in regions:
            f.write(str(d[ref_chr][a:b].seq))
        f.write("\n")

    # for mapping, write <0-based index on fake genome>, <ref chrom>, <0-based index on ref genome>
    with open(output_prefix + ".mapping.txt", "w") as f:
        i = 0
        for a, b in regions:
            for j in range(a, b):
                f.write(f"{i},{ref_chr},{j}\n")
                i += 1

    with open(output_prefix + ".pbids.txt", "w") as f:
        f.write("\n".join(r.seqid for r in good) + "\n")

    logger.info(
        f"Output written to {output_prefix}.fasta, {output_prefix}.mapping.txt, {output_prefix}.pbids.txt."
    )
def read_gff_as_interval_tree(self):
    """
    Read a collapsed GFF file into an IntervalTree
    """
    for r in GFF.collapseGFFReader(self.gff_filename):
        self.tree[r.chr][r.strand].insert(r.start, r.end, r)
def main(
    count_filename: Path = typer.Argument(
        ..., help="Count file (generally ends with '.abundance.txt')"
    ),
    gff_filename: Path = typer.Argument(..., help="Annotation file"),
    rep_filename: Path = typer.Argument(
        ..., help="Sequence file (ends with '.fq', '.fastq', '.fa', or '.fasta')"
    ),
    fuzzy_junction: int = typer.Option(5, help="Fuzzy junction max dist (default: 5bp)"),
    sample_directory: Optional[Path] = typer.Option(
        None,
        help="Directory in which the sample data resides. By default uses the directory from which the script was called",
    ),
    output_prefix: Optional[str] = typer.Option(
        None, help="Prefix to use when naming the filtered files"
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    if not sample_directory:
        sample_directory = Path.cwd()

    if output_prefix is None:
        output_prefix = f"{gff_filename.stem}.filtered"

    rep_type = "fastq" if rep_filename.suffix in (".fq", ".fastq") else "fasta"

    sanity_check_collapse_input(
        count_filename,
        gff_filename,
        rep_filename,
        sample_directory,
    )

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith("PB.")
        recs[int(r.seqid.split(".")[1])].append(r)

    good = []
    with open(f"{output_prefix}.gff", "w") as f:
        keys = list(recs.keys())
        keys.sort()
        for k in keys:
            xxx = recs[k]
            filter_out_subsets(xxx, fuzzy_junction)
            for r in xxx:
                GFF.write_collapseGFF_format(f, r)
                good.append(r.seqid)

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    with open(
        f"{output_prefix}.rep.{('fq' if rep_type == 'fastq' else 'fa')}", "w"
    ) as f:
        for r in SeqIO.parse(open(rep_filename), rep_type):
            if r.name.split("|")[0] in good:
                SeqIO.write(r, f, rep_type)

    # write output to .abundance.txt
    with open(f"{output_prefix}.abundance.txt", "w") as f:
        f.write(count_header)
        writer = DictWriter(
            f,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for k in good:
            r = d[k]
            writer.writerow(r)

    logger.info(
        f"Output written to: {output_prefix}.gff\n"
        f"Output written to: {output_prefix}.rep.{'fq' if rep_type == 'fastq' else 'fa'}\n"
        f"Output written to: {output_prefix}.abundance.txt"
    )
def combine_split_chained_results(
    output_prefixes,
    final_prefix,
    ref_gff,
    ref_group,
    ref_name,
    ref_fq,
    addon_gff,
    addon_group,
    addon_name,
    addon_fq,
):
    """
    Each <output_prefix> will have .gff, .group.txt, .mega_info.txt.
    There should be NO overlap between the split files, so a clean merge should be possible!

    1. read the .gff files, record the group and mega (id-map) info
    2. sort the total records so we can properly put on a unified superPBID
    3. write out the unified result
    4. delete the split files
    """
    # sanity check that the files are all there
    split_files = []  # tuple of (gff, group, mega)
    for ref_name, o in output_prefixes:
        gff_file = Path(f"tmp_{o}.gff")
        mega_file = Path(f"tmp_{o}.mega_info.txt")
        group_file = Path(f"tmp_{o}.group.txt")
        if not gff_file.exists() or not mega_file.exists() or not group_file.exists():
            raise RuntimeError(
                f"Expects to see {gff_file},{mega_file},{group_file} but one or more files are missing! Abort!"
            )
        split_files.append((ref_name, o, gff_file, group_file, mega_file))

    use_fq = False
    if ref_fq is not None and addon_fq is not None:
        use_fq = True
        ref_fq_dict = {
            r.id.split("|")[0]: r for r in SeqIO.parse(open(ref_fq), "fastq")
        }
        addon_fq_dict = {
            r.id.split("|")[0]: r for r in SeqIO.parse(open(addon_fq), "fastq")
        }

    mega_info = {}  # ref id -> list of matching query_id, or empty list
    split_unmatched = set()

    for (ref_name, split_name, gff_file, group_file, mega_file) in split_files:
        for r in DictReader(open(mega_file), delimiter="\t"):
            if r[ref_name] != "NA":
                if r[ref_name] not in mega_info:
                    mega_info[r[ref_name]] = []
                if r[split_name] != "NA":
                    mega_info[r[ref_name]].append(r[split_name])
            else:  # ref is NA, non-ref is not NA
                split_unmatched.add(r[split_name])

    # make a rec list of matches of (ref_id, addon_id, representative record, combined group info)
    # where rec_ref or ref_addon could be None, but not both
    rec_list = []
    d_ref = {r.seqid: r for r in GFF.collapseGFFReader(ref_gff)}
    d_addon = {r.seqid: r for r in GFF.collapseGFFReader(addon_gff)}
    ref_group_info = sp.MegaPBTree.read_group(ref_group, None)
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)

    for ref_id, matches in mega_info.items():
        if len(matches) == 0:
            rec_list.append(
                sp.MatchRecord(
                    ref_id=ref_id,
                    addon_id="NA",
                    rec=d_ref[ref_id],
                    members=ref_group_info[ref_id],
                    seqrec=ref_fq_dict[ref_id] if use_fq else None,
                )
            )
        else:
            for addon_id in matches:
                r1 = d_ref[ref_id]
                r2 = d_addon[addon_id]
                if (r1.end - r1.start) > (r2.end - r2.start):
                    rec_list.append(
                        sp.MatchRecord(
                            ref_id=ref_id,
                            addon_id=addon_id,
                            rec=r1,
                            members=ref_group_info[ref_id] + addon_group_info[addon_id],
                            seqrec=ref_fq_dict[ref_id] if use_fq else None,
                        )
                    )
                else:
                    rec_list.append(
                        sp.MatchRecord(
                            ref_id=ref_id,
                            addon_id=addon_id,
                            rec=r2,
                            members=ref_group_info[ref_id] + addon_group_info[addon_id],
                            seqrec=addon_fq_dict[addon_id] if use_fq else None,
                        )
                    )

    for addon_id in split_unmatched:
        rec_list.append(
            sp.MatchRecord(
                ref_id="NA",
                addon_id=addon_id,
                rec=d_addon[addon_id],
                members=addon_group_info[addon_id],
                seqrec=addon_fq_dict[addon_id] if use_fq else None,
            )
        )

    sp.write_reclist_to_gff_n_info(rec_list, final_prefix, ref_name, addon_name, use_fq)
    for (ref_name, split_name, gff_file, group_file, mega_file) in split_files:
        gff_file.unlink()
        group_file.unlink()
        mega_file.unlink()
def collapse_fuzzy_junctions(
    gff_filename: Union[str, Path],
    group_filename: Union[str, Path],
    allow_extra_5exon: bool,
    internal_fuzzy_max_dist: int,
    max_5_diff: int,
    max_3_diff: int,
) -> defaultdict:
    def can_merge(m, r1, r2):
        if m == "exact":
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is continued only if (a) is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == "subset":
            r1, r2 = r2, r1  # rotate so r1 is always the longer one
        if m == "super" or m == "subset":
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == "+":
                return (
                    abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start)
                    <= internal_fuzzy_max_dist
                    and r1.ref_exons[-n2].start
                    <= r2.ref_exons[0].start
                    < r1.ref_exons[-n2].end
                )
            else:
                return (
                    abs(r1.ref_exons[0].end - r2.ref_exons[0].end)
                    <= internal_fuzzy_max_dist
                    and r1.ref_exons[n2 - 1].start
                    <= r2.ref_exons[-1].end
                    < r1.ref_exons[n2].end
                )
        return False

    d = {}
    recs = defaultdict(
        lambda: {"+": IntervalTree(), "-": IntervalTree()}
    )  # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions(
                r,
                r2,
                internal_fuzzy_max_dist=internal_fuzzy_max_dist,
                max_5_diff=max_5_diff,
                max_3_diff=max_3_diff,
            )
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split("\t")
            group_info[pbid] = members.split(",")

    # pick for each fuzzy group the one that has the most exons
    keys = list(fuzzy_match.keys())
    keys.sort(key=lambda x: int(x.split(".")[1]))

    with open(f"{gff_filename}.fuzzy", "w") as f_gff, open(
        f"{group_filename}.fuzzy", "w"
    ) as f_group:
        for k in keys:
            all_members = []
            best_pbid, best_size, best_num_exons = (
                fuzzy_match[k][0],
                len(group_info[fuzzy_match[k][0]]),
                len(d[fuzzy_match[k][0]].ref_exons),
            )
            all_members += group_info[fuzzy_match[k][0]]
            for pbid in fuzzy_match[k][1:]:
                _num_exons = len(d[pbid].ref_exons)
                _size = len(group_info[pbid])
                all_members += group_info[pbid]
                if _num_exons > best_num_exons or (
                    _num_exons == best_num_exons and _size > best_size
                ):
                    best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
            GFF.write_collapseGFF_format(f_gff, d[best_pbid])
            f_group.write(f'{best_pbid}\t{",".join(all_members)}\n')

    return fuzzy_match
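# Toy illustration of the representative choice above (not part of the original
# source; the records and group sizes are invented). Within a fuzzy-matched set,
# the isoform with the most exons wins; group size (number of supporting members)
# breaks ties.
def _example_pick_representative() -> str:
    candidates = {
        # pbid: (num_exons, group_size)
        "PB.1.1": (5, 10),
        "PB.1.2": (6, 3),
        "PB.1.3": (6, 7),
    }
    best_pbid, (best_exons, best_size) = next(iter(candidates.items()))
    for pbid, (num_exons, size) in candidates.items():
        if num_exons > best_exons or (num_exons == best_exons and size > best_size):
            best_pbid, best_exons, best_size = pbid, num_exons, size
    return best_pbid  # "PB.1.3"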
def write_reclist_to_gff_n_info(
    rec_list: List[Any],
    final_prefix: str,
    ref_name: str,
    addon_name: str,
    use_fq: bool = False,
) -> Dict[str, str]:
    # now go through the rec list and figure out in what order we are outputting the total records
    tree = defaultdict(lambda: {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)})
    tree_keys_numeric = set()
    tree_keys_alpha = set()
    for i, match_rec in enumerate(rec_list):
        tree[match_rec.rec.chr][match_rec.rec.strand].insert(
            match_rec.rec.start, match_rec.rec.end, i
        )

    for chrom in tree:
        try:
            k = int(chrom)
            tree_keys_numeric.add(k)
        except ValueError:
            tree_keys_alpha.add(chrom)
    tree_keys = sorted(tree_keys_numeric) + sorted(tree_keys_alpha)

    writer_info = DictWriter(
        Path(f"{final_prefix}.mega_info.txt").open("w"),
        fieldnames=["superPBID", ref_name, addon_name],
        delimiter="\t",
    )
    writer_info.writeheader()

    if use_fq:
        f_fq = Path(f"{final_prefix}.rep.fq").open("w")
    with open(f"{final_prefix}.gff", "w") as f_gff, open(
        f"{final_prefix}.group.txt", "w"
    ) as f_group:
        new_group_info = {}

        pb_i = 0
        for _chr in tree_keys:
            for _strand in ("+", "-"):
                for *_, _indices in tree[_chr][_strand].getregions():
                    # further sort these records by (start, end, num_exons)
                    _indices.sort(
                        key=lambda i: (
                            rec_list[i].rec.start,
                            rec_list[i].rec.end,
                            len(rec_list[i].rec.ref_exons),
                        )
                    )
                    pb_i += 1
                    for pb_j, recs_index in enumerate(_indices):
                        pbid = f"PB.{pb_i}.{pb_j + 1}"
                        match_rec = rec_list[recs_index]
                        new_group_info[pbid] = match_rec.members
                        match_rec.rec.seqid = pbid
                        GFF.write_collapseGFF_format(f_gff, match_rec.rec)
                        writer_info.writerow(
                            {
                                "superPBID": pbid,
                                ref_name: match_rec.ref_id,
                                addon_name: match_rec.addon_id,
                            }
                        )
                        f_group.write(f"{pbid}\t{','.join(match_rec.members)}\n")
                        if use_fq:
                            match_rec.seqrec.id = pbid
                            match_rec.seqrec.description = ""
                            SeqIO.write(match_rec.seqrec, f_fq, "fastq")

    return new_group_info
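# Illustration of the superPBID naming scheme above (not part of the original source;
# the per-locus isoform counts are invented). Records are grouped into loci by
# ClusterTree region; locus index pb_i and within-locus index pb_j + 1 form
# "PB.<locus>.<isoform>".
def _example_superpbid_names(num_isoforms_per_locus=(2, 1, 3)) -> list:
    names = []
    pb_i = 0
    for n_iso in num_isoforms_per_locus:
        pb_i += 1
        for pb_j in range(n_iso):
            names.append(f"PB.{pb_i}.{pb_j + 1}")
    return names  # ['PB.1.1', 'PB.1.2', 'PB.2.1', 'PB.3.1', 'PB.3.2', 'PB.3.3']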