def scrub_sample_GFFs(sample_dirs, gff_filename, count_filename, group_filename, fastq_filename, output_prefix, tree):
    for sample_name, d in sample_dirs.items():
        outf = open(os.path.join(d, output_prefix + '.gff.tmp'), 'w')
        for r in GFF.collapseGFFReader(os.path.join(d, gff_filename)):
            n = len(r.ref_exons)
            if n == 1:
                # single-exon transcript: no junctions to scrub, write as-is
                GFF.write_collapseGFF_format(outf, r)
            else:
                new_ref_exons = scrub_ref_exons(r, tree)
                if new_ref_exons is None:
                    print("No changes made due to error:", r.seqid, file=sys.stderr)
                else:
                    r.ref_exons = new_ref_exons
                GFF.write_collapseGFF_format(outf, r)
        outf.close()
        cleanup_scrubbed_files_redundancy(outf.name,
                                          os.path.join(d, group_filename),
                                          os.path.join(d, count_filename),
                                          os.path.join(d, fastq_filename) if fastq_filename is not None else None,
                                          os.path.join(d, output_prefix))
def __init__(self, gff_filename, group_filename, internal_fuzzy_max_dist=0, self_prefix=None, allow_5merge=False, fastq_filename=None):
    self.gff_filename = gff_filename
    self.group_filename = group_filename
    self.self_prefix = self_prefix
    self.internal_fuzzy_max_dist = internal_fuzzy_max_dist
    self.allow_5merge = allow_5merge
    self.record_d = dict((r.seqid, r) for r in GFF.collapseGFFReader(gff_filename))
    #sanity_check_seqids(self.record_d.keys())  # sanity check all IDs look like PB.1.2
    self.tree = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})  # chr --> strand --> tree
    self.fastq_dict = None
    if fastq_filename is not None:
        self.fastq_dict = MegaPBTree.read_fastq_to_dict(fastq_filename)
    self.read_gff_as_interval_tree()
    self.group_info = MegaPBTree.read_group(self.group_filename, self.self_prefix)  # ex: PB.1.1 --> [ RatHeart|i3_c123.... ]
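# A minimal usage sketch (not from the original sources): how the MegaPBTree constructor above and
# the add_sample() method shown later are typically driven when chaining two samples. The file
# names and the 'A'/'B' prefixes are placeholders, not real data.
tree = MegaPBTree('sampleA.collapsed.gff', 'sampleA.collapsed.group.txt',
                  internal_fuzzy_max_dist=5, self_prefix='A', allow_5merge=False,
                  fastq_filename='sampleA.collapsed.rep.fq')
tree.add_sample('sampleB.collapsed.gff', 'sampleB.collapsed.group.txt',
                sample_prefix='B', output_prefix='tmp_A_B',
                fastq_filename='sampleB.collapsed.rep.fq')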
def sample_sanity_check(group_filename, gff_filename, count_filename, fastq_filename=None):
    """
    Double check that the formats are expected and all PBIDs are concordant across the files
    :return: raise Exception if sanity check failed
    """
    print("Sanity checking. Retrieving PBIDs from {0},{1},{2}...".format(
        group_filename, gff_filename, count_filename), file=sys.stderr)
    ids1 = [line.strip().split()[0] for line in open(group_filename)]
    ids2 = [r.seqid for r in GFF.collapseGFFReader(gff_filename)]
    f = open(count_filename)
    while True:  # advance through the headers which start with #
        cur = f.tell()
        if not f.readline().startswith('#') or f.tell() == cur:  # first non-# seen or EOF
            f.seek(cur)
            break
    ids3 = [r['pbid'] for r in DictReader(f, delimiter='\t')]
    if len(set(ids2).difference(ids1)) > 0 or len(set(ids2).difference(ids3)) > 0:
        raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0} and {2}".format(
            group_filename, gff_filename, count_filename))
    if fastq_filename is not None:
        ids4 = [r.id.split('|')[0] for r in SeqIO.parse(open(fastq_filename), 'fastq')]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0}".format(
                fastq_filename, gff_filename))
def add_sample(self, gff_filename, group_filename, sample_prefix, output_prefix, fastq_filename=None):
    combined = []  # list of (<matches to r2 or None>, r2)
    unmatched_recs = set(self.record_d.keys())

    for r in GFF.collapseGFFReader(gff_filename):
        match_rec_list = [m for m in self.match_record_to_tree(r)]
        if len(match_rec_list) > 0:  # found match(es)! put longer of r1/r2 in
            combined.append((match_rec_list, r))
            for match_rec in match_rec_list:
                try:
                    unmatched_recs.remove(match_rec.seqid)
                except KeyError:
                    pass  # already deleted, OK, this can happen
        else:  # r is not present in current tree
            combined.append((None, r))
    # put whatever is left from the tree in
    for seqid in unmatched_recs:
        combined.append(([self.record_d[seqid]], None))

    # create a ClusterTree to re-calc the loci/transcripts
    final_tree = defaultdict(lambda: {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)})
    for i, (r1s, r2) in enumerate(combined):
        if r1s is None:
            final_tree[r2.chr][r2.strand].insert(r2.start, r2.end, i)
        else:
            if r2 is not None:
                rep = find_representative_in_iso_list(r1s + [r2])
            else:
                rep = find_representative_in_iso_list(r1s)
            final_tree[rep.chr][rep.strand].insert(rep.start, rep.end, i)

    self.write_cluster_tree_as_gff(final_tree, combined, group_filename, sample_prefix, output_prefix, fastq_filename2=fastq_filename)
def sample_sanity_check(group_filename, gff_filename, count_filename, fastq_filename=None):
    """
    Double check that the formats are expected and all PBIDs are concordant across the files
    :return: raise Exception if sanity check failed
    """
    print("Sanity checking. Retrieving PBIDs from {0},{1},{2}...".format(
        group_filename, gff_filename, count_filename), file=sys.stderr)
    ids1 = [line.strip().split()[0] for line in open(group_filename)]
    ids2 = [r.seqid for r in GFF.collapseGFFReader(gff_filename)]
    f = open(count_filename)
    for i in range(14):
        f.readline()  # just skip through the 14 header lines
    ids3 = [r['pbid'] for r in DictReader(f, delimiter='\t')]
    if len(set(ids2).difference(ids1)) > 0 or len(set(ids2).difference(ids3)) > 0:
        raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0} and {2}".format(
            group_filename, gff_filename, count_filename))
    if fastq_filename is not None:
        ids4 = [r.id.split('|')[0] for r in SeqIO.parse(open(fastq_filename), 'fastq')]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0}".format(
                fastq_filename, gff_filename))
def sanity_check_collapse_input(input_prefix):
    """
    Check that
    1. the count, gff, rep files exist
    2. the number of records agree among the three
    """
    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'
    if not os.path.exists(count_filename):
        print("File {0} does not exist. Abort!".format(count_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(gff_filename):
        print("File {0} does not exist. Abort!".format(gff_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(rep_filename):
        print("File {0} does not exist. Abort!".format(rep_filename), file=sys.stderr)
        sys.exit(-1)
    pbids1 = set([r.id for r in SeqIO.parse(open(rep_filename), 'fastq')])
    pbids2 = set([r.seqid for r in GFF.collapseGFFReader(gff_filename)])
    pbids3 = set(read_count_file(count_filename)[0].keys())
    if len(pbids1) != len(pbids2) or len(pbids2) != len(pbids3) or len(pbids1) != len(pbids3):
        print("The number of PBID records in the files disagree! Sanity check failed.", file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(rep_filename, len(pbids1)), file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(gff_filename, len(pbids2)), file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(count_filename, len(pbids3)), file=sys.stderr)
        sys.exit(-1)
    return count_filename, gff_filename, rep_filename
def add_sample(self, gff_filename, group_filename, sample_prefix, output_prefix, fastq_filename=None):
    combined = []  # list of (r1 if r2 is None | r2 if r1 is None | longer of r1 or r2 if both not None)
    unmatched_recs = list(self.record_d.keys())

    for r in GFF.collapseGFFReader(gff_filename):
        match_rec = self.match_record_to_tree(r)
        if match_rec is not None:  # found a match! put longer of r1/r2 in
            combined.append((match_rec, r))
            try:
                unmatched_recs.remove(match_rec.seqid)
            except ValueError:
                pass  # already deleted, OK, this happens for single-exon transcripts
        else:  # r is not present in current tree
            combined.append((None, r))
    # put whatever is left from the tree in
    for seqid in unmatched_recs:
        combined.append((self.record_d[seqid], None))

    # create a ClusterTree to re-calc the loci/transcripts
    final_tree = defaultdict(lambda: {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)})
    for i, (r1, r2) in enumerate(combined):
        if r2 is None or (r1 is not None and r1.end - r1.start > r2.end - r2.start):
            final_tree[r1.chr][r1.strand].insert(r1.start, r1.end, i)
        else:
            final_tree[r2.chr][r2.strand].insert(r2.start, r2.end, i)

    self.write_cluster_tree_as_gff(final_tree, combined, group_filename, sample_prefix, output_prefix, fastq_filename2=fastq_filename)
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix (ex: test.collapsed.min_fl_2)")
    parser.add_argument("--fuzzy_junction", type=int, default=5, help="Fuzzy junction max dist (default: 5bp)")
    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered'

    count_filename, gff_filename, rep_filename, rep_type = sanity_check_collapse_input(args.input_prefix)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq (or rep.fa, depending on the input type)
    rep_output = output_prefix + '.rep.' + ('fq' if rep_type == 'fastq' else 'fa')
    f = open(rep_output, 'w')
    for r in SeqIO.parse(open(rep_filename), rep_type):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, rep_type)
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb', 'norm_fl', 'norm_nfl', 'norm_nfl_amb'], delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", rep_output, file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
def chain_split_file(ref_gff, ref_group, ref_name, addon_gff, addon_group, addon_name, fuzzy_junction, allow_5merge, max_3_diff, n_chunks):
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    recs = []
    tree = OrderedDict()
    i = 0
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)

    split_files = []
    i = 0
    counter = 0
    f_gff = open(addon_gff + '.split' + str(i), 'w')
    f_group = open(addon_group + '.split' + str(i), 'w')
    for v1 in tree.values():
        for strand in ('+', '-'):
            v2 = v1[strand]
            for _start, _end, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write("{0}\t{1}\n".format(recs[cur].seqid, ",".join(addon_group_info[recs[cur].seqid])))
                    counter += 1
                if counter >= (i + 1) * chunk_size:
                    i += 1
                    f_gff.close()
                    f_group.close()
                    split_files.append((f_gff.name, f_group.name))
                    f_gff = open(addon_gff + '.split' + str(i), 'w')
                    f_group = open(addon_group + '.split' + str(i), 'w')
    if not f_gff.closed:
        f_gff.close()
        f_group.close()
        split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(target=chain_helper,
                    args=(ref_gff, ref_group, split_gff, split_group, ref_name, addon_name + '.' + str(i), fuzzy_junction, allow_5merge, max_3_diff))
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, addon_name + '.' + str(i)))
    for p in pools:
        p.join()
    return result_prefixes, split_files
def regroup_gff(pooled_gff, demux_count_file, output_prefix, out_group_dict, in_fafq=None):
    """
    :param pooled_gff: collapsed GFF of the pooled samples
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group to belong in (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq that was input to SAM
    """
    if in_fafq is not None:
        type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(lambda: set())  # pbid --> set of tissues it is in (EM, END, R)

    for r in DictReader(open(demux_count_file), delimiter=','):
        for k, v in r.items():
            if k == 'id':
                continue
            if int(v) > 0:
                in_tissue[r['id']].add(k)
    in_tissue = dict(in_tissue)

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.values():
        handles[g] = open("{o}_{g}_only.gff".format(o=output_prefix, g=g), 'w')
        if in_fafq is not None:
            handles_fafq[g] = open("{o}_{g}_only.{t}".format(o=output_prefix, g=g, t=type_fafq), 'w')

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        fafq_dict_keys = list(fafq_dict.keys())
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None:
                fafq_dict[m.group(1)] = fafq_dict[k]

    reader = GFF.collapseGFFReader(pooled_gff)
    for r in reader:
        groups_to_write_in = set()
        pbid = r.seqid
        if pbid not in in_tissue:
            print("WARNING: {0} does not belong to any group indicated by outgroup_dict".format(pbid), file=sys.stderr)
            continue  # skip records with no group assignment
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])
        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix (ex: test.collapsed.min_fl_2)")
    parser.add_argument("--fuzzy_junction", type=int, default=5, help="Fuzzy junction max dist (default: 5bp)")
    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(args.input_prefix)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb', 'norm_fl', 'norm_nfl', 'norm_nfl_amb'], delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
def sanity_check_collapse_input(input_prefix):
    """
    Check that
    1. the count, gff, rep files exist
    2. the number of records agree among the three
    """
    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filenames = [(input_prefix + '.rep.fq', 'fastq'), (input_prefix + '.rep.fastq', 'fastq'),
                     (input_prefix + '.rep.fa', 'fasta'), (input_prefix + '.rep.fasta', 'fasta')]

    rep_filename = None
    rep_type = None
    for x, filetype in rep_filenames:
        if os.path.exists(x):
            rep_filename = x
            rep_type = filetype

    if rep_filename is None:
        print("Expected to find input fasta or fastq files {0}.rep.fa or {0}.rep.fq. Not found. Abort!".format(input_prefix), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(count_filename):
        print("File {0} does not exist. Abort!".format(count_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(gff_filename):
        print("File {0} does not exist. Abort!".format(gff_filename), file=sys.stderr)
        sys.exit(-1)

    pbids1 = set([r.id for r in SeqIO.parse(open(rep_filename), rep_type)])
    pbids2 = set([r.seqid for r in GFF.collapseGFFReader(gff_filename)])
    pbids3 = set(read_count_file(count_filename)[0].keys())
    if len(pbids1) != len(pbids2) or len(pbids2) != len(pbids3) or len(pbids1) != len(pbids3):
        print("The number of PBID records in the files disagree! Sanity check failed.", file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(rep_filename, len(pbids1)), file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(gff_filename, len(pbids2)), file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(count_filename, len(pbids3)), file=sys.stderr)
        sys.exit(-1)
    return count_filename, gff_filename, rep_filename, rep_type
def make_fake_genome(genome_filename, gff_filename, ref_chr, ref_start, ref_end, ref_strand, output_prefix, output_name, genome_d=None):
    if genome_d is None:
        print("Reading genome file {0}...".format(genome_filename), file=sys.stderr)
        d = SeqIO.to_dict(SeqIO.parse(open(genome_filename), 'fasta'))
    else:
        d = genome_d

    print("Reading GFF file {0}...".format(gff_filename), file=sys.stderr)
    good = []
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        if r.chr == ref_chr and r.strand == ref_strand and \
                (ref_start <= r.start < r.end <= ref_end) and len(r.ref_exons) > 1:
            print("Adding {0} to fake genome.".format(r.seqid), file=sys.stderr)
            good.append(r)

    if len(good) == 0:
        print("Did not find any transcripts strictly within {0}:{1}-{2} on strand {3}. Abort!".format(
            ref_chr, ref_start, ref_end, ref_strand), file=sys.stderr)
        sys.exit(-1)

    c = ClusterTree(0, 0)
    for r in good:
        for e in r.ref_exons:
            c.insert(e.start - extra_bp_around_junctions, e.end + extra_bp_around_junctions, 1)

    regions = [(a, b) for (a, b, junk) in c.getregions()]
    regions[0] = (regions[0][0] - __padding_before_after__, regions[0][1])
    regions[-1] = (regions[-1][0], regions[-1][1] + __padding_before_after__)

    with open(output_prefix + '.fasta', 'w') as f:
        f.write(">" + output_name + "\n")
        for a, b in regions:
            f.write(str(d[ref_chr][a:b].seq))
        f.write("\n")

    # for mapping, write <0-based index on fake genome>, <ref chrom>, <0-based index on ref genome>
    with open(output_prefix + '.mapping.txt', 'w') as f:
        i = 0
        for a, b in regions:
            for j in range(a, b):
                f.write("{0},{1},{2}\n".format(i, ref_chr, j))
                i += 1

    with open(output_prefix + '.pbids.txt', 'w') as f:
        f.write("\n".join(r.seqid for r in good) + '\n')

    print("Output written to {0}.fasta, {0}.mapping.txt, {0}.pbids.txt.".format(output_prefix), file=sys.stderr)
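# Hedged example call for make_fake_genome() above; the genome/GFF paths, window coordinates and
# output names are placeholders. The ref_start/ref_end window is in the same coordinate system as
# the collapsed GFF records, and only multi-exon transcripts falling strictly inside it are kept.
make_fake_genome('hg38.fa', 'sample.collapsed.gff',
                 ref_chr='chr1', ref_start=1000000, ref_end=1050000, ref_strand='+',
                 output_prefix='fake.chr1_window', output_name='fake_chr1_locus')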
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix (ex: test.collapsed.min_fl_2)")
    args = parser.parse_args()
    output_prefix = args.input_prefix + '.nomono'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(args.input_prefix)

    good = []
    f = open(output_prefix + '.gff', 'w')
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        if len(r.ref_exons) > 1:
            good.append(r.seqid)
            GFF.write_collapseGFF_format(f, r)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb', 'norm_fl', 'norm_nfl', 'norm_nfl_amb'], delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon, internal_fuzzy_max_dist):

    def get_fl_from_id(members):
        try:
            # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
            return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)
        except ValueError:
            return 0

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is continued only if (a) is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1  # rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            else:
                return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[n2 - 1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
        return False

    d = {}
    recs = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})  # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            # args.max_5_diff / args.max_3_diff come from the enclosing script's argparse namespace
            m = compare_junctions.compare_junctions(r, r2, internal_fuzzy_max_dist=internal_fuzzy_max_dist,
                                                    max_5_diff=args.max_5_diff, max_3_diff=args.max_3_diff)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = [x for x in members.split(',')]

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    keys = sorted(fuzzy_match.keys(), key=lambda x: [int(p) for p in x.split('.')[1:]])

    f_gff = open(gff_filename + '.fuzzy', 'w')
    f_group = open(group_filename + '.fuzzy', 'w')
    for k in keys:
        all_members = []
        best_pbid, best_size, best_num_exons = fuzzy_match[k][0], len(group_info[fuzzy_match[k][0]]), len(d[fuzzy_match[k][0]].ref_exons)
        all_members += group_info[fuzzy_match[k][0]]
        for pbid in fuzzy_match[k][1:]:
            # note: get_fl_from_id only works on IsoSeq1 and 2 ID formats, will return 0 if IsoSeq3 format or other
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        GFF.write_collapseGFF_format(f_gff, d[best_pbid])
        f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))
    f_gff.close()
    f_group.close()
    return fuzzy_match
def get_gff_from_list(gff_filename, listfile, partial_ok=False):
    seqs = [line.strip() for line in open(listfile)]
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in seqs or r.seqid.split('|')[0] in seqs or \
                (partial_ok and any(r.seqid.startswith(x) for x in seqs)):
            GFF.write_collapseGFF_format(sys.stdout, r)
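# A small, hypothetical command-line wrapper around get_gff_from_list() above; the --partial flag
# name is an assumption and not part of the original script.
if __name__ == "__main__":
    from argparse import ArgumentParser
    parser = ArgumentParser(description="Print records from a collapsed GFF whose seqids appear in a list file")
    parser.add_argument("gff_filename")
    parser.add_argument("listfile", help="text file with one seqid (ex: PB.1.2) per line")
    parser.add_argument("--partial", action="store_true", default=False,
                        help="also report seqids that merely start with a listed prefix")
    args = parser.parse_args()
    get_gff_from_list(args.gff_filename, args.listfile, partial_ok=args.partial)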
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix (ex: filtered.microexon)")
    parser.add_argument("--micro_exon_size", type=int, default=12,
                        help="Filter away microexons < micro_exon_size (default: 12bp)")
    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered.microexon'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(args.input_prefix)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        for r in xxx:
            min_exon_size = min(e.end - e.start for e in r.ref_exons)
            if min_exon_size >= args.micro_exon_size:  # keep only if the smallest exon passes the cutoff
                GFF.write_collapseGFF_format(f, r)
                good.append(r.seqid)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb', 'norm_fl', 'norm_nfl', 'norm_nfl_amb'], delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
def read_gff_as_interval_tree(self):
    """
    Read a collapsed GFF file into an IntervalTree
    """
    for r in GFF.collapseGFFReader(self.gff_filename):
        self.tree[r.chr][r.strand].insert(r.start, r.end, r)
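# Sketch only: how the interval tree populated by read_gff_as_interval_tree() can be queried.
# overlapping_records() is a hypothetical helper, not part of the original class; find(start, end)
# is the same IntervalTree query used by collapse_fuzzy_junctions() elsewhere in these snippets.
def overlapping_records(self, r):
    """Yield previously stored records on the same chromosome and strand that overlap record r."""
    for r2 in self.tree[r.chr][r.strand].find(r.start, r.end):
        yield r2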
def chain_split_file(ref_gff, ref_group, ref_name, addon_gff, addon_group, addon_name, fuzzy_junction, allow_5merge, max_3_diff, n_chunks):
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    recs = []
    tree = OrderedDict()
    i = 0
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)
    #print("# of recs: {0}, cpus: {1}, chunk_size: {2}".format(n, n_chunks, chunk_size))

    split_files = []
    i = 0
    counter = 0
    f_gff = open(addon_gff + '.split' + str(i), 'w')
    f_group = open(addon_group + '.split' + str(i), 'w')
    for v1 in tree.values():
        for strand in ('+', '-'):
            v2 = v1[strand]
            for _start, _end, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write("{0}\t{1}\n".format(recs[cur].seqid, ",".join(addon_group_info[recs[cur].seqid])))
                    counter += 1
                # note: because we are limited by how the records are organized by (chrom, strand),
                # we may not end up using all the chunks; ex: if all records are on the same locus,
                # we end up writing everything to one split file
                if counter >= (i + 1) * chunk_size:
                    i += 1
                    f_gff.close()
                    f_group.close()
                    split_files.append((f_gff.name, f_group.name))
                    if i >= n_chunks or counter >= len(recs):
                        break
                    f_gff = open(addon_gff + '.split' + str(i), 'w')
                    f_group = open(addon_group + '.split' + str(i), 'w')
    if not f_gff.closed:
        f_gff.close()
        f_group.close()
        split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(target=chain_helper,
                    args=(ref_gff, ref_group, split_gff, split_group, ref_name, addon_name + '.' + str(i), fuzzy_junction, allow_5merge, max_3_diff))
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, addon_name + '.' + str(i)))
    for p in pools:
        p.join()
    #print("split files: {0}, result_prefix: {1}".format(split_files, result_prefixes))
    return result_prefixes, split_files
def cleanup_scrubbed_files_redundancy(gff_filename, group_filename, count_filename, fastq_filename, output_prefix):

    junction_seen = defaultdict(lambda: defaultdict(lambda: []))  # key (chr,strand) -> dict of (series of junctions) -> records
    for r in GFF.collapseGFFReader(gff_filename):
        n = len(r.ref_exons)
        if n == 1:
            junc_str = str(r.start) + ',' + str(r.end)
            junction_seen[r.chr, r.strand][junc_str] = [r]
        else:
            junc_str = ",".join(str(r.ref_exons[i].end) + ',' + str(r.ref_exons[i + 1].start) for i in range(n - 1))
            junction_seen[r.chr, r.strand][junc_str].append(r)

    # write out cleaned GFF
    outf = open(output_prefix + '.gff', 'w')
    outf2 = open(output_prefix + '.merged_ids.txt', 'w')
    merged = {}
    keys = sorted(junction_seen.keys())
    for k in keys:
        for bunch in junction_seen[k].values():
            if len(bunch) == 1:  # just one record, write it out
                r = bunch[0]
                GFF.write_collapseGFF_format(outf, r)
                merged[r.seqid] = [r.seqid]
            else:
                # find the representative
                r = bunch[0]
                for r2 in bunch[1:]:
                    if r2.end - r2.start > r.end - r.start:
                        r = r2
                GFF.write_collapseGFF_format(outf, r)
                merged[r.seqid] = [x.seqid for x in bunch]
            outf2.write("{0}\t{1}\n".format(r.seqid, ",".join(merged[r.seqid])))
    outf.close()
    outf2.close()

    count_d, count_header = read_count_file(count_filename)
    # write out count file
    outf = open(output_prefix + '.abundance.txt', 'w')
    outf.write(count_header)
    writer = DictWriter(outf, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb', 'norm_fl', 'norm_nfl', 'norm_nfl_amb'], delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for pbid, bunch in merged.items():
        # combine the counts; fields_to_add is the module-level list of count fields used by the original script
        r = count_d[bunch[0]]
        r['pbid'] = pbid
        for field in fields_to_add:
            r[field] = float(r[field])
        for _id in bunch[1:]:
            for field in fields_to_add:
                r[field] += float(count_d[_id][field])
        writer.writerow(r)
    outf.close()

    group_info = read_group_file(group_filename)
    # write out group file
    outf = open(output_prefix + '.group.txt', 'w')
    for pbid, bunch in merged.items():
        # combine the groups
        g = [group_info[bunch[0]]]
        for _id in bunch[1:]:
            g.append(group_info[_id])
        outf.write("{0}\t{1}\n".format(pbid, ",".join(g)))
    outf.close()

    # write out fastq file if present
    if fastq_filename is not None:
        outf = open(output_prefix + '.rep.fq', 'w')
        for r in SeqIO.parse(open(fastq_filename), 'fastq'):
            if r.id.split('|')[0] in merged or r.id in merged:
                SeqIO.write(r, outf, 'fastq')
        outf.close()

    print("scrubbed files written: {0}.gff, {0}.group.txt, {0}.abundance.txt, {0}.merged_ids.txt".format(output_prefix), file=sys.stderr)
#!/usr/bin/env python
import cupcake.io.GFF as GFF
from collections import defaultdict
import numpy as np

d = defaultdict(lambda: [])
for r in GFF.collapseGFFReader('hq_isoforms.fastq.no5merge.collapsed.filtered.gff'):
    d[r.seqid.split('.')[1]].append(r.seqid)

p = np.array([len(v) for v in d.values()])
print("Number of loci:", len(d))
print("Number of isoforms:", sum(p))
print("Avg. number of isoforms per loci:", np.mean(p))

f = open('hq_isoforms.fastq.no5merge.collapsed.filtered.isoform_per_loci.txt', 'w')
f.write("loci\tnum_isoform\n")
for k, v in d.items():
    f.write("PB.{0}\t{1}\n".format(k, len(v)))
f.close()
def filter_by_count(input_prefix, output_prefix, min_count, dun_use_group_count=False):

    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'

    if not dun_use_group_count:
        # read group
        group_max_count_fl = {}
        group_max_count_p = {}
        f = open(group_filename)
        for line in f:
            #ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
            pbid, members = line.strip().split('\t')
            group_max_count_fl[pbid] = 0
            group_max_count_p[pbid] = 0
            members = members.split(',')
            for m in members:
                i = m.find('|')
                if i > 0:
                    tmp = m.split('|')[1].split('/')[1]  #ex: tmp = f30p16
                else:
                    tmp = m.split('/')[1]
                fl_count, p_count = tmp.split('p')
                fl_count = int(fl_count[1:])
                p_count = int(p_count)
                group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
                group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)
        f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    for k, v in d.items():
        print(k, v)
    f.close()

    # group_max_count_p NOT used for now
    good = [x for x in d
            if int(d[x]['count_fl']) >= min_count and (dun_use_group_count or group_max_count_fl[x] >= min_count)]

    # write output GFF
    f = open(output_prefix + '.gff', 'w')
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in good:
            GFF.write_collapseGFF_format(f, r)
    f.close()

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb', 'norm_fl', 'norm_nfl', 'norm_nfl_amb'], delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix (ex: test.collapsed.min_fl_2)")
    parser.add_argument("--fuzzy_junction", type=int, default=5, help="Fuzzy junction max dist (default: 5bp)")
    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered'

    #group_filename = args.input_prefix + '.group.txt'
    count_filename = args.input_prefix + '.abundance.txt'
    gff_filename = args.input_prefix + '.gff'
    rep_filename = args.input_prefix + '.rep.fq'

    if not os.path.exists(count_filename):
        print("File {0} does not exist. Abort!".format(count_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(gff_filename):
        print("File {0} does not exist. Abort!".format(gff_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(rep_filename):
        print("File {0} does not exist. Abort!".format(rep_filename), file=sys.stderr)
        sys.exit(-1)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    f.close()

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb', 'norm_fl', 'norm_nfl', 'norm_nfl_amb'], delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
def combine_split_chained_results(output_prefixes, final_prefix, ref_gff, ref_group, ref_name, ref_fq, addon_gff, addon_group, addon_name, addon_fq):
    """
    Each <output_prefix> will have .gff, .group.txt, .mega_info.txt.
    There should be NO overlap between the split files, so clean merge should be possible!

    1. read the .gff files, record the group and mega (id-map) info
    2. sort the total records so can properly put on a unified superPBID
    3. write out the unified result
    4. delete the split files
    """
    # sanity check files are all there
    split_files = []  # tuple of (gff, group, mega)
    for ref_name, o in output_prefixes:
        gff_file = 'tmp_' + o + '.gff'
        mega_file = 'tmp_' + o + '.mega_info.txt'
        group_file = 'tmp_' + o + '.group.txt'
        if not os.path.exists(gff_file) or not os.path.exists(mega_file) or not os.path.exists(group_file):
            print("Expects to see {0},{1},{2} but one or more files are missing! Abort!".format(
                gff_file, mega_file, group_file), file=sys.stderr)
            sys.exit(-1)
        split_files.append((ref_name, o, gff_file, group_file, mega_file))

    use_fq = False
    if ref_fq is not None and addon_fq is not None:
        use_fq = True
        ref_fq_dict = dict((r.id.split('|')[0], r) for r in SeqIO.parse(open(ref_fq), 'fastq'))
        addon_fq_dict = dict((r.id.split('|')[0], r) for r in SeqIO.parse(open(addon_fq), 'fastq'))

    mega_info = {}  # ref id -> list of matching query_id, or empty list
    split_unmatched = set()

    for (ref_name, split_name, gff_file, group_file, mega_file) in split_files:
        for r in DictReader(open(mega_file), delimiter='\t'):
            if r[ref_name] != 'NA':
                if r[ref_name] not in mega_info:
                    mega_info[r[ref_name]] = []
                if r[split_name] != 'NA':
                    mega_info[r[ref_name]].append(r[split_name])
            else:  # ref is NA, non-ref is not NA
                split_unmatched.add(r[split_name])

    # make a rec list of matches of (ref_id, addon_id, representative record, combined group info)
    # where rec_ref or ref_addon could be None, but not both
    rec_list = []
    d_ref = dict((r.seqid, r) for r in GFF.collapseGFFReader(ref_gff))
    d_addon = dict((r.seqid, r) for r in GFF.collapseGFFReader(addon_gff))

    ref_group_info = sp.MegaPBTree.read_group(ref_group, None)
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)

    for ref_id, matches in mega_info.items():
        if len(matches) == 0:
            rec_list.append(sp.MatchRecord(ref_id=ref_id, addon_id='NA',
                                           rec=d_ref[ref_id],
                                           members=ref_group_info[ref_id],
                                           seqrec=ref_fq_dict[ref_id] if use_fq else None))
        else:
            for addon_id in matches:
                r1 = d_ref[ref_id]
                r2 = d_addon[addon_id]
                if (r1.end - r1.start) > (r2.end - r2.start):
                    rec_list.append(sp.MatchRecord(ref_id=ref_id, addon_id=addon_id,
                                                   rec=r1,
                                                   members=ref_group_info[ref_id] + addon_group_info[addon_id],
                                                   seqrec=ref_fq_dict[ref_id] if use_fq else None))
                else:
                    rec_list.append(sp.MatchRecord(ref_id=ref_id, addon_id=addon_id,
                                                   rec=r2,
                                                   members=ref_group_info[ref_id] + addon_group_info[addon_id],
                                                   seqrec=addon_fq_dict[addon_id] if use_fq else None))
    for addon_id in split_unmatched:
        rec_list.append(sp.MatchRecord(ref_id='NA', addon_id=addon_id,
                                       rec=d_addon[addon_id],
                                       members=addon_group_info[addon_id],
                                       seqrec=addon_fq_dict[addon_id] if use_fq else None))

    sp.write_reclist_to_gff_n_info(rec_list, final_prefix, ref_name, addon_name, use_fq)
    for (ref_name, split_name, gff_file, group_file, mega_file) in split_files:
        os.remove(gff_file)
        os.remove(group_file)
        os.remove(mega_file)
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon, internal_fuzzy_max_dist):

    def get_fl_from_id(members):
        # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
        return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is continued only if (a) is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1  # rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            else:
                return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[n2 - 1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
        return False

    d = {}
    recs = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})  # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions.compare_junctions(r, r2, internal_fuzzy_max_dist=internal_fuzzy_max_dist)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = [x for x in members.split(',')]

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    keys = sorted(fuzzy_match.keys(), key=lambda x: [int(p) for p in x.split('.')[1:]])

    f_gff = open(gff_filename + '.fuzzy', 'w')
    f_group = open(group_filename + '.fuzzy', 'w')
    for k in keys:
        all_members = []
        best_pbid, best_size, best_num_exons = fuzzy_match[k][0], len(group_info[fuzzy_match[k][0]]), len(d[fuzzy_match[k][0]].ref_exons)
        all_members += group_info[fuzzy_match[k][0]]
        for pbid in fuzzy_match[k][1:]:
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        GFF.write_collapseGFF_format(f_gff, d[best_pbid])
        f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))
    f_gff.close()
    f_group.close()
    return fuzzy_match
def summarize_junctions(sample_dirs, sample_names, gff_filename, output_prefix, genome_d=None, junction_known=None):
    """
    1. for each sample, read all the GFF, store the junction information (both 0-based)
    """
    junc_by_chr_strand = defaultdict(lambda: defaultdict(lambda: []))  # (chr,strand) --> (donor,acceptor) --> samples it shows up in (more than once possible)

    for sample_name, d in sample_dirs.items():
        for r in GFF.collapseGFFReader(os.path.join(d, gff_filename)):
            n = len(r.ref_exons)
            if n == 1:
                continue  # ignore single exon transcripts
            for i in range(n - 1):
                donor = r.ref_exons[i].end - 1  # make it 0-based
                accep = r.ref_exons[i + 1].start  # start is already 0-based
                junc_by_chr_strand[r.chr, r.strand][donor, accep].append(sample_name)

    # write junction report
    f1 = open(output_prefix + '.junction.bed', 'w')
    f1.write("track name=junctions description=\"{0}\" useScore=1\n".format(output_prefix))

    JUNC_DETAIL_FIELDS = ['chr', 'left', 'right', 'strand', 'num_transcript', 'num_sample', 'genome', 'annotation', 'label']

    with open(output_prefix + '.junction_detail.txt', 'w') as f:
        writer = DictWriter(f, JUNC_DETAIL_FIELDS, delimiter='\t')
        writer.writeheader()
        keys = sorted(junc_by_chr_strand.keys())
        for _chr, _strand in keys:
            v = junc_by_chr_strand[_chr, _strand]
            v_keys = sorted(v.keys())
            labels = cluster_junctions(v_keys)
            for i, (_donor, _accep) in enumerate(v_keys):
                rec = {'chr': _chr,
                       'left': _donor,
                       'right': _accep,
                       'strand': _strand,
                       'num_transcript': len(v[_donor, _accep]),
                       'num_sample': len(set(v[_donor, _accep]))}
                f1.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(_chr, _donor, _accep + 1, output_prefix, len(v[_donor, _accep]), _strand))
                # if genome is given, write acceptor-donor site
                if genome_d is None or _chr not in genome_d:
                    rec['genome'] = 'NA'
                else:
                    up, down = genome_d[_chr][_donor + 1:_donor + 3], genome_d[_chr][_accep - 2:_accep]
                    if _strand == '+':
                        rec['genome'] = "{0}-{1}".format(str(up.seq).upper(), str(down.seq).upper())
                    else:
                        rec['genome'] = "{0}-{1}".format(str(down.reverse_complement().seq).upper(), str(up.reverse_complement().seq).upper())
                # if annotation is given, check if matches with annotation
                if junction_known is None:
                    rec['annotation'] = 'NA'
                else:
                    if (_chr, _strand) in junction_known and (_donor, _accep) in junction_known[_chr, _strand]:
                        rec['annotation'] = 'Y'
                    else:
                        rec['annotation'] = 'N'
                rec['label'] = "{c}_{s}_{lab}".format(c=_chr, s=_strand, lab=labels[i])
                writer.writerow(rec)
    f1.close()
    return junc_by_chr_strand