def main(args):
    """Write the overlaps between depth BED entries and feature BED entries.

    Both inputs are read together through ``MultiLocusStream``.  Loci that
    have no entries from one of the two files are skipped.  For every depth
    entry in a locus, each range returned by ``union_range_array`` against
    that locus' feature entries is written as one line: the range's BED
    fields joined by tabs, followed by a tab and the depth entry's payload.

    Args:
        args: Object providing ``depth_bed`` and ``feature_bed`` (input
            paths) and ``output`` (output path; a falsy value selects
            STDOUT).  Paths ending in ``.gz`` are opened with ``gzip``.
    """
    # Raw string: '\.' inside a plain literal is an invalid escape sequence.
    gz_pattern = r'\.gz$'
    # Every handle opened by this function; all are closed in ``finally`` so
    # nothing leaks when streaming fails part-way through.
    opened = []
    try:
        if re.search(gz_pattern, args.depth_bed):
            inf1 = gzip.open(args.depth_bed)
        else:
            inf1 = open(args.depth_bed)
        opened.append(inf1)

        if re.search(gz_pattern, args.feature_bed):
            inf2 = gzip.open(args.feature_bed)
        else:
            inf2 = open(args.feature_bed)
        opened.append(inf2)

        of = sys.stdout
        if args.output:
            if re.search(gz_pattern, args.output):
                # NOTE(review): gzip mode 'w' is binary on Python 3 while
                # str is written below -- confirm the target interpreter.
                of = gzip.open(args.output, 'w')
            else:
                of = open(args.output, 'w')
            opened.append(of)

        bs1 = BedStream(inf1)
        bs2 = BedStream(inf2)
        mls = MultiLocusStream([bs1, bs2])
        for overlapped in mls:
            [b1s, b2s] = overlapped.get_payload()
            # Only loci covered by both inputs can produce output.
            if len(b1s) == 0 or len(b2s) == 0:
                continue
            for b1 in b1s:
                # is_sorted=True is passed through unchanged; presumably the
                # inputs must already be position-sorted -- TODO confirm.
                m = union_range_array(b1, b2s, is_sorted=True)
                for rng in m:
                    bed_fields = "\t".join([str(x) for x in rng.get_bed_array()])
                    of.write(bed_fields + "\t" + b1.get_payload() + "\n")
        # As before, the output is closed on success even when it is STDOUT.
        of.close()
    finally:
        # Closing an already-closed handle is harmless.
        for handle in opened:
            handle.close()
def main():
    """Report per-entry average exon coverage within each locus of a GPD stream.

    Command line: a positional ``input`` (``-`` selects STDIN) and an
    optional ``-o/--output`` path (STDOUT when omitted; a path ending in
    ``.gz`` is opened with ``gzip``).

    For every locus yielded by ``LocusStream(GPDStream(input))`` the exon
    ranges of all GPD entries in the locus are passed to
    ``ranges_to_coverage``.  For each entry, the ranges returned by
    ``union_range_array(exon, cov, payload=2)`` are summed as
    ``payload * length`` over all of its exons, and one tab-separated line
    is written: gene name, exon count, length, summed coverage / length.
    The current locus is echoed to STDERR as a progress indicator.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('-o', '--output',
                        help="output file or use STDOUT if not set")
    args = parser.parse_args()

    # Handles opened by this function; closed in ``finally`` so the input
    # file no longer leaks and nothing stays open after an error.
    opened = []
    try:
        if args.input == '-':
            args.input = sys.stdin
        else:
            args.input = open(args.input)
            opened.append(args.input)
        gs = GPDStream(args.input)
        ls = LocusStream(gs)

        of = sys.stdout
        if args.output:
            # Raw string: '\.' in a plain literal is an invalid escape sequence.
            if re.search(r'\.gz$', args.output):
                # NOTE(review): gzip mode 'w' is binary on Python 3 while
                # str is written below -- confirm the target interpreter.
                of = gzip.open(args.output, 'w')
            else:
                of = open(args.output, 'w')
            opened.append(of)

        for rng in ls:
            sys.stderr.write(rng.get_range_string() + " \r")
            gpds = rng.get_payload()
            # Pool the exon ranges of every entry in this locus.
            exs = []
            for gpd in gpds:
                exs += [y.get_range() for y in gpd.exons]
            cov = ranges_to_coverage(exs)
            # Use the locus-wide coverage data on each gpd entry now.
            for gpd in gpds:
                totcov = 0
                for exon in [x.get_range() for x in gpd.exons]:
                    # payload=2 is forwarded as-is; its meaning is defined by
                    # union_range_array -- TODO confirm.
                    gcovs = union_range_array(exon, cov, payload=2)
                    totcov += sum([x.get_payload() * x.length() for x in gcovs])
                of.write(gpd.get_gene_name() + "\t"
                         + str(gpd.get_exon_count()) + "\t"
                         + str(gpd.get_length()) + "\t"
                         + str(float(totcov) / float(gpd.get_length())) + "\n")
        sys.stderr.write("\n")
        # As before, the output is closed on success even when it is STDOUT.
        of.close()
    finally:
        # Closing an already-closed handle is harmless.
        for handle in opened:
            handle.close()
def main():
    """Compute, per GPD entry, the mean locus coverage over its exons.

    Arguments are parsed from the command line: ``input`` (``-`` means
    STDIN) and ``-o/--output`` (STDOUT when not given; a ``.gz`` suffix
    selects ``gzip``).

    Loci come from ``LocusStream(GPDStream(input))``.  Within a locus, the
    exon ranges of every entry are combined by ``ranges_to_coverage``; then
    for each entry the ranges from ``union_range_array(exon, cov,
    payload=2)`` contribute ``payload * length`` to a running total.  Each
    entry produces one tab-separated output line: gene name, exon count,
    length, and total / length.  Progress (the locus range string) goes to
    STDERR.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('-o', '--output',
                        help="output file or use STDOUT if not set")
    args = parser.parse_args()

    # Track the handles this function opens so they are always released,
    # including the input file, which was previously never closed.
    owned_handles = []
    try:
        if args.input == '-':
            args.input = sys.stdin
        else:
            args.input = open(args.input)
            owned_handles.append(args.input)
        gs = GPDStream(args.input)
        ls = LocusStream(gs)

        of = sys.stdout
        if args.output:
            # Raw string avoids the invalid '\.' escape of a plain literal.
            if re.search(r'\.gz$', args.output):
                # NOTE(review): gzip mode 'w' is binary on Python 3 while
                # str is written below -- confirm the target interpreter.
                of = gzip.open(args.output, 'w')
            else:
                of = open(args.output, 'w')
            owned_handles.append(of)

        for rng in ls:
            sys.stderr.write(rng.get_range_string() + " \r")
            gpds = rng.get_payload()
            # Collect the exon ranges of all entries in the locus.
            exs = []
            for gpd in gpds:
                exs += [y.get_range() for y in gpd.exons]
            cov = ranges_to_coverage(exs)
            # Use our coverage data on each gpd entry now.
            for gpd in gpds:
                totcov = 0
                for exon in [x.get_range() for x in gpd.exons]:
                    # payload=2 is passed through unchanged; semantics live
                    # in union_range_array -- TODO confirm.
                    gcovs = union_range_array(exon, cov, payload=2)
                    totcov += sum([x.get_payload() * x.length() for x in gcovs])
                of.write(gpd.get_gene_name() + "\t"
                         + str(gpd.get_exon_count()) + "\t"
                         + str(gpd.get_length()) + "\t"
                         + str(float(totcov) / float(gpd.get_length())) + "\n")
        sys.stderr.write("\n")
        # The original closed the output unconditionally (STDOUT included)
        # on success; that behaviour is kept.
        of.close()
    finally:
        # A second close() on an already-closed handle is a no-op.
        for handle in owned_handles:
            handle.close()
def main():
    """Intersect every GPD entry with BED depth records and report coverage.

    Command line: ``gpd_input`` and ``bed_depth_input`` (either may end in
    ``.gz`` to be read through ``gzip``) and an optional ``-o/--output``
    path (STDOUT when omitted).

    Both inputs are streamed together through ``MultiLocusStream``.  Loci
    without GPD entries are skipped.  For each GPD entry one tab-separated
    line is written: gene name, transcript name, exon count, length, then

    * ``clen``: summed length of the ranges returned by
      ``union_range_array(exon, beds, payload=2)`` whose integer payload is
      greater than zero,
    * ``clen / length``,
    * ``sum(range length * integer payload) / length``.

    When the locus has no BED records the last three columns are ``0``.
    """
    parser = argparse.ArgumentParser(
        description="For every gpd entry (sorted) intersect it with bed depth (sorted)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('gpd_input', help="GPD file")
    # Help text previously repeated "GPD file" for this argument.
    parser.add_argument('bed_depth_input', help="BED depth file")
    parser.add_argument('-o', '--output', help="output file")
    args = parser.parse_args()

    # Raw string: '\.' inside a plain literal is an invalid escape sequence.
    gz_pattern = r'\.gz$'
    # Handles opened here; closed in ``finally`` so they are released on
    # errors too, not only on the success path.
    opened = []
    try:
        if re.search(gz_pattern, args.gpd_input):
            inf1 = gzip.open(args.gpd_input)
        else:
            inf1 = open(args.gpd_input)
        opened.append(inf1)

        if re.search(gz_pattern, args.bed_depth_input):
            inf2 = gzip.open(args.bed_depth_input)
        else:
            inf2 = open(args.bed_depth_input)
        opened.append(inf2)

        gs = GPDStream(inf1)
        bs = BedStream(inf2)

        of = sys.stdout
        if args.output:
            if re.search(gz_pattern, args.output):
                # NOTE(review): gzip mode 'w' is binary on Python 3 while
                # str is written below -- confirm the target interpreter.
                of = gzip.open(args.output, 'w')
            else:
                of = open(args.output, 'w')
            opened.append(of)

        mls = MultiLocusStream([gs, bs])
        z = 0
        for ml in mls:
            z += 1
            #if z%1000 == 0: sys.stderr.write(ml.get_range_string()+" \r")
            [gpds, beds] = ml.get_payload()
            if len(gpds) == 0:
                continue
            if len(beds) == 0:
                # No depth records in this locus: report zero coverage.
                for gpd in gpds:
                    of.write(gpd.get_gene_name() + "\t"
                             + gpd.get_transcript_name() + "\t"
                             + str(gpd.get_exon_count()) + "\t"
                             + str(gpd.get_length()) + "\t0\t0\t0" + "\n")
                continue
            for gpd in gpds:
                # Gather the ranges returned for every exon of this entry.
                covs = []
                for ex in [x.get_range() for x in gpd.exons]:
                    # payload=2 is forwarded as-is; its meaning is defined by
                    # union_range_array -- TODO confirm.
                    c = union_range_array(ex, beds, payload=2)
                    covs += c
                clen = sum([x.length() for x in covs if int(x.get_payload()) > 0])
                tot = sum([x.length() * int(x.get_payload()) for x in covs])
                of.write(gpd.get_gene_name() + "\t"
                         + gpd.get_transcript_name() + "\t"
                         + str(gpd.get_exon_count()) + "\t"
                         + str(gpd.get_length()) + "\t"
                         + str(clen) + "\t"
                         + str(float(clen) / float(gpd.get_length())) + "\t"
                         + str(float(tot) / float(gpd.get_length())) + "\n")
        sys.stderr.write("\n")
        # As before, the output is closed on success even when it is STDOUT.
        of.close()
    finally:
        # Closing an already-closed handle is harmless.
        for handle in reversed(opened):
            handle.close()