def get_output(bedarray, z):
    """Render the coverage of ``bedarray`` as tab-separated text.

    Every element returned by ``ranges_to_coverage(bedarray)`` becomes one
    line of the form ``chr<TAB>start-1<TAB>end<TAB>payload``.

    Returns a two-element list ``[text, z]``; ``z`` is passed back untouched.
    """
    # The sorted copy is not used below; the call is kept so that this
    # function still invokes sort_genomic_ranges exactly as before.
    sort_genomic_ranges(bedarray[:])
    rows = []
    for region in ranges_to_coverage(bedarray):
        rows.append(
            region.chr + "\t" + str(region.start - 1) + "\t"
            + str(region.end) + "\t" + str(region.get_payload()) + "\n")
    return ["".join(rows), z]
def main(args):
    """Write per-region exon coverage for every locus of a GPD input.

    ``args.input`` is a path, or ``'-'`` for STDIN; ``args.output`` is a
    path, or a false value for STDOUT.  Names ending in ``.gz`` are opened
    through ``gzip``.

    For each locus yielded by ``LocusStream(GPDStream(input))`` the exon
    ranges of all its entries are passed to ``ranges_to_coverage``; every
    resulting region is written as its BED coordinates followed by its
    payload, tab-separated, one region per line.

    Both handles are closed before returning (including STDIN/STDOUT, as
    the original did), and now also when an error interrupts the loop.
    """
    # Raw string: '\.' inside a plain literal is an invalid escape sequence.
    gz_suffix = re.compile(r'\.gz$')

    inf = sys.stdin
    if args.input != '-':
        if gz_suffix.search(args.input):
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)
    try:
        of = sys.stdout
        if args.output:
            if gz_suffix.search(args.output):
                of = gzip.open(args.output, 'w')
            else:
                of = open(args.output, 'w')
        try:
            for locus in LocusStream(GPDStream(inf)):
                exranges = [exon.get_range()
                            for entry in locus.get_payload()
                            for exon in entry.exons]
                for cov in ranges_to_coverage(exranges):
                    coords = "\t".join(
                        [str(x) for x in cov.get_bed_coordinates()])
                    # The unary plus is kept from the original; it leaves a
                    # numeric payload unchanged.
                    of.write(coords + "\t" + str(+cov.get_payload()) + "\n")
        finally:
            of.close()
    finally:
        inf.close()
def get_depth_per_transcript(self, mindepth=1):
    """Summarise coverage depth for each transcript of this object.

    The exon ranges (``exon.rng``) of every transcript returned by
    ``self.get_transcripts()`` are pooled and passed to
    ``ranges_to_coverage``.  Each transcript's exons are then compared with
    every coverage region via ``overlap_size``.

    Returns a dict keyed by ``tx.get_id()``.  Each value is a dict with:
      'tx'                the transcript itself
      'average_coverage'  sum(overlap * payload) / transcript length
      'fraction_covered'  bases with payload >= mindepth / transcript length
      'mindepth'          the threshold that was used
      'length_covered'    bases with payload >= mindepth
    """
    pooled = [exon.rng
              for tx in self.get_transcripts()
              for exon in tx.exons]
    cov = ranges_to_coverage(pooled)

    results = {}
    for tx in self.get_transcripts():
        tlen = tx.get_length()
        # [overlap size, depth] for each coverage region touching an exon
        pieces = []
        for exon_rng in [exon.rng for exon in tx.exons]:
            measured = [[region.overlap_size(exon_rng), region.get_payload()]
                        for region in cov]
            pieces.extend([pair for pair in measured if pair[0] > 0])

        base_total = sum(size * depth for size, depth in pieces)
        deep_enough = sum(size for size, depth in pieces if depth >= mindepth)
        results[tx.get_id()] = {
            'tx': tx,
            'average_coverage': float(base_total) / float(tlen),
            'fraction_covered': float(deep_enough) / float(tlen),
            'mindepth': mindepth,
            'length_covered': deep_enough,
        }
    return results
def main():
    """Report the mean exon coverage of every GPD entry, locus by locus.

    Command line: ``input`` (a GPD path, or ``-`` for STDIN) and an optional
    ``-o/--output`` path (STDOUT when omitted; a name ending in ``.gz`` is
    written through ``gzip``).

    For each locus produced by ``LocusStream(GPDStream(input))`` the exon
    ranges of all entries are turned into coverage with
    ``ranges_to_coverage``.  One tab-separated line is written per entry:
    gene name, exon count, length, and the sum of ``payload * length`` over
    the ranges returned by ``union_range_array`` for its exons, divided by
    the entry's length.  The range string of the current locus is echoed to
    STDERR as a progress indicator.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('-o', '--output',
                        help="output file or use STDOUT if not set")
    args = parser.parse_args()

    # Track whether the input handle is ours: a file opened here used to be
    # left open, and is now closed once processing ends.
    owns_input = args.input != '-'
    inf = open(args.input) if owns_input else sys.stdin
    try:
        loci = LocusStream(GPDStream(inf))
        of = sys.stdout
        if args.output:
            # Raw string: '\.' in a plain literal is an invalid escape.
            if re.search(r'\.gz$', args.output):
                of = gzip.open(args.output, 'w')
            else:
                of = open(args.output, 'w')
        try:
            for locus in loci:
                sys.stderr.write(locus.get_range_string() + " \r")
                gpds = locus.get_payload()
                exon_ranges = [exon.get_range()
                               for entry in gpds
                               for exon in entry.exons]
                cov = ranges_to_coverage(exon_ranges)
                # Apply the locus-wide coverage to each gpd entry.
                for gpd in gpds:
                    totcov = 0
                    for exon in gpd.exons:
                        pieces = union_range_array(
                            exon.get_range(), cov, payload=2)
                        totcov += sum(
                            [p.get_payload() * p.length() for p in pieces])
                    of.write(
                        gpd.get_gene_name() + "\t"
                        + str(gpd.get_exon_count()) + "\t"
                        + str(gpd.get_length()) + "\t"
                        + str(float(totcov) / float(gpd.get_length()))
                        + "\n")
            sys.stderr.write("\n")
        finally:
            of.close()
    finally:
        if owns_input:
            inf.close()
def do_tx_line(ref_gpd, annots, reads, args):
    """Compute a binned coverage profile of ``reads`` along ``ref_gpd``.

    Reads extending past either end of ``ref_gpd.get_range()`` are skipped
    unless ``args.allow_overflowed_matches`` is set.  The exon ranges of
    ``ref_gpd.union(read)`` for the remaining reads are passed to
    ``ranges_to_coverage`` and laid out on a per-position depth list that
    spans ``ref_gpd.get_length()`` positions.

    With ``args.only_covered_ends`` the depth list is trimmed to the span
    between its first and last non-zero position.  For '-' strand
    transcripts the list is reversed.

    Returns ``None`` when no range was collected, when fewer than
    ``args.minimum_read_count`` reads were used, or when the (trimmed)
    depth list is shorter than ``args.minimum_read_count``.  Otherwise
    returns ``[vals, read_count, exp, length, exon_count]`` where ``vals``
    maps a bin label (the string of ``1 + int(100 * i / length)``) to the
    ``average`` of depth / read_count within that bin and ``exp`` is the
    mean depth over the trimmed list.

    ``annots`` is accepted but not used.
    """
    allbits = []
    read_count = 0
    for read in reads:
        if not args.allow_overflowed_matches:
            if read.get_range().start < ref_gpd.get_range().start:
                continue
            if read.get_range().end > ref_gpd.get_range().end:
                continue
        shared_tx = ref_gpd.union(read)
        allbits.extend([exon.rng for exon in shared_tx.exons])
        read_count += 1
    if not allbits:
        return None
    if read_count < args.minimum_read_count:
        return None

    cov = ranges_to_coverage(allbits)

    # One depth counter per position of the reference transcript.
    bps = [0] * ref_gpd.get_length()
    offset = 0  # transcript position at which the current exon starts
    for exon_rng in [exon.rng for exon in ref_gpd.exons]:
        hits = []
        for region in cov:
            shared = region.union(exon_rng)
            if shared:
                hits.append((shared, region.get_payload()))
        for shared, depth in hits:
            first = shared.start - exon_rng.start + offset
            last = shared.end - exon_rng.start + offset
            for pos in range(first, last + 1):
                bps[pos] += depth
        offset += exon_rng.length()

    trimmedbps = bps
    if args.only_covered_ends:
        covered = [pos for pos, depth in enumerate(bps) if depth != 0]
        start = covered[0] if covered else 0
        finish = covered[-1] if covered else len(bps) - 1
        trimmedbps = bps[start:finish + 1]

    exp = float(sum(trimmedbps)) / float(len(trimmedbps))
    if ref_gpd.get_strand() == '-':
        trimmedbps = trimmedbps[::-1]
    if len(trimmedbps) < args.minimum_read_count:
        return None

    # Bin the results.
    span = len(trimmedbps)
    vals = {}
    for pos, depth in enumerate(trimmedbps):
        label = str(1 + int(100 * float(pos) / float(span)))
        vals.setdefault(label, []).append(float(depth) / float(read_count))
    for label in vals:
        vals[label] = average(vals[label])
    return [vals, read_count, exp, span, ref_gpd.get_exon_count()]
def main():
    """Build a coverage file from an IDP output folder.

    Reads ``<input>/isoform.gpd`` to collect the exon ranges of each
    transcript name, then ``<input>/isoform.exp`` (tab-separated; first
    column transcript name, second column expression).  Each transcript's
    exon ranges are repeated ``int(expression * mult + offset)`` times, the
    pooled ranges go through ``ranges_to_coverage`` and every resulting
    region is written as ``chr<TAB>start-1<TAB>end<TAB>payload``.

    Output goes to ``-o/--output`` (through ``gzip`` when the name ends in
    ``.gz``) or to STDOUT; progress messages go to STDERR.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="IDP output folder")
    parser.add_argument('--offset', type=int, default=1,
                        help="add this much to all expressions")
    parser.add_argument('--mult', type=int, default=10,
                        help="multiply all expressions by this much")
    parser.add_argument('-o', '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()

    folder = args.input.rstrip('/')

    gpd_file = open(folder + '/isoform.gpd')
    sys.stderr.write("Reading isoform.gpd\n")
    txs = {}
    for line in gpd_file:
        gpd = GPD(line)
        ranges = txs.setdefault(gpd.get_transcript_name(), [])
        ranges.extend([exon.get_range() for exon in gpd.exons])
    gpd_file.close()

    sys.stderr.write("Reading isoform.exp file\n")
    exp_file = open(folder + '/isoform.exp')
    vals = []
    for line in exp_file:
        fields = line.rstrip().split("\t")
        copies = int((float(fields[1]) * args.mult) + args.offset)
        # One copy of the transcript's ranges per unit of scaled expression.
        vals.extend(txs[fields[0]] * copies)
    exp_file.close()

    sys.stderr.write("Generating coverage file " + str(len(vals)) + "\n")
    if not args.output:
        of = sys.stdout
    elif args.output[-3:] == '.gz':
        of = gzip.open(args.output, 'w')
    else:
        of = open(args.output, 'w')
    for region in ranges_to_coverage(vals):
        of.write(
            region.chr + "\t" + str(region.start - 1) + "\t"
            + str(region.end) + "\t" + str(region.get_payload()) + "\n")
    of.close()
def main():
    """Build a coverage file from a GTF whose transcripts carry a TPM.

    Command line: ``input`` (GTF path, ``.gz`` accepted, ``-`` for STDIN),
    ``--offset``, ``--mult``, ``--min_exons`` and ``-o/--output``.

    Lines starting with ``#`` are skipped.  For 'transcript' and 'exon'
    features the ``transcript_id`` attribute (column 9) identifies the
    transcript.  A 'transcript' line sets its weight to
    ``int(TPM * mult + offset)``; an 'exon' line adds
    ``Bed(column 1, column 4 - 1, column 5)`` to it.  Transcripts with at
    least ``--min_exons`` exons contribute their exons ``weight`` times to
    ``ranges_to_coverage``; each resulting region is written as
    ``chr<TAB>start-1<TAB>end<TAB>payload``.

    A 'transcript'/'exon' line lacking the expected attribute raises
    ``AttributeError`` (no match to take ``.group`` from), as before.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('--offset', type=int, default=0,
                        help="add this much to transcript tpms")
    parser.add_argument('--mult', type=int, default=10,
                        help="multiply this much to tpms")
    parser.add_argument('--min_exons', type=int, default=1,
                        help="require at least this many exons")
    parser.add_argument('-o', '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()

    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)

    # Raw strings: '\s' in a plain literal is an invalid escape sequence.
    # Compiled once here rather than being rebuilt for every line.
    tx_id_pattern = re.compile(r'transcript_id\s+"([^"]+)"')
    tpm_pattern = re.compile(r'TPM\s+"([^"]+)"')

    sys.stderr.write("Reading gtf file\n")
    txs = {}
    for line in inf:
        if line.startswith('#'):
            continue
        f = line.rstrip().split("\t")
        feature = f[2]
        if feature != 'exon' and feature != 'transcript':
            continue
        tx = tx_id_pattern.search(f[8]).group(1)
        if tx not in txs:
            txs[tx] = {'tpm': 0, 'exons': []}
        if feature == 'transcript':
            tpm = float(tpm_pattern.search(f[8]).group(1))
            txs[tx]['tpm'] = int((tpm * float(args.mult)) + args.offset)
        else:
            seqname = f[0]  # renamed from 'chr' to stop shadowing the builtin
            start = int(f[3]) - 1
            end = int(f[4])
            txs[tx]['exons'].append(Bed(seqname, start, end))
    inf.close()

    vals = []
    sys.stderr.write("Traversing annotation file\n")
    for tx in txs:
        exons = txs[tx]['exons']
        if len(exons) < args.min_exons:
            continue
        # One copy of the exons per unit of scaled TPM (none when <= 0).
        vals.extend(exons * txs[tx]['tpm'])

    sys.stderr.write("Generating coverage file " + str(len(vals)) + "\n")
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    for region in ranges_to_coverage(vals):
        of.write(region.chr + "\t" + str(region.start - 1) + "\t"
                 + str(region.end) + "\t" + str(region.get_payload()) + "\n")
    of.close()
def do_locus(locus):
    """Return the coverage lines for a single locus.

    The exon ranges of every entry in ``locus.get_payload()`` are passed to
    ``ranges_to_coverage``.  Each resulting region yields one string: its
    BED coordinates and its payload joined by tabs, newline-terminated.
    """
    exon_ranges = [exon.get_range()
                   for entry in locus.get_payload()
                   for exon in entry.exons]
    lines = []
    for region in ranges_to_coverage(exon_ranges):
        coords = [str(field) for field in region.get_bed_coordinates()]
        # The unary plus on the payload is carried over unchanged.
        lines.append("\t".join(coords) + "\t"
                     + str(+region.get_payload()) + "\n")
    return lines
def do_locus(locus):
    """Format the exon coverage of one locus as BED-like text lines.

    Collects ``exon.get_range()`` for all exons of all entries in the
    locus payload, runs ``ranges_to_coverage`` on them and returns a list
    with one ``coordinates<TAB>payload\\n`` string per coverage region.
    """
    exranges = []
    for entry in locus.get_payload():
        exranges.extend([exon.get_range() for exon in entry.exons])

    def as_line(cov):
        # BED coordinates first, then the payload (unary plus kept as is).
        coords = "\t".join(map(str, cov.get_bed_coordinates()))
        return coords + "\t" + str(+cov.get_payload()) + "\n"

    return [as_line(cov) for cov in ranges_to_coverage(exranges)]
def main():
    """Turn IDP isoform expression into a genome coverage listing.

    ``input`` is an IDP output folder containing ``isoform.gpd`` and
    ``isoform.exp``.  Exon ranges are gathered per transcript name from the
    GPD file; each line of the tab-separated expression file (name,
    expression) then contributes that transcript's ranges
    ``int(expression * mult + offset)`` times.  The pooled ranges are passed
    to ``ranges_to_coverage`` and written as
    ``chr<TAB>start-1<TAB>end<TAB>payload`` lines to ``-o/--output``
    (``gzip`` for names ending in ``.gz``) or STDOUT.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="IDP output folder")
    parser.add_argument('--offset', type=int, default=1,
                        help="add this much to all expressions")
    parser.add_argument('--mult', type=int, default=10,
                        help="multiply all expressions by this much")
    parser.add_argument('-o', '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()

    base = args.input.rstrip('/')

    handle = open(base + '/isoform.gpd')
    sys.stderr.write("Reading isoform.gpd\n")
    ranges_by_tx = {}
    for record in handle:
        gpd = GPD(record)
        name = gpd.get_transcript_name()
        collected = ranges_by_tx.setdefault(name, [])
        for exon in gpd.exons:
            collected.append(exon.get_range())
    handle.close()

    sys.stderr.write("Reading isoform.exp file\n")
    handle = open(base + '/isoform.exp')
    pooled = []
    for record in handle:
        columns = record.rstrip().split("\t")
        repeats = int((float(columns[1]) * args.mult) + args.offset)
        exons = ranges_by_tx[columns[0]]
        # Repeat the transcript's ranges once per unit of scaled expression.
        pooled.extend(exons * repeats)
    handle.close()

    sys.stderr.write("Generating coverage file " + str(len(pooled)) + "\n")
    out = sys.stdout
    if args.output:
        opener = gzip.open if args.output[-3:] == '.gz' else open
        out = opener(args.output, 'w')
    for cov in ranges_to_coverage(pooled):
        columns = [cov.chr, str(cov.start - 1), str(cov.end),
                   str(cov.get_payload())]
        out.write("\t".join(columns) + "\n")
    out.close()
def main():
    """Write one mean-coverage line per GPD entry, processed locus by locus.

    Arguments (command line): ``input`` is a GPD path or ``-`` for STDIN;
    ``-o/--output`` is the destination (STDOUT when omitted, ``gzip`` when
    the name ends in ``.gz``).

    Entries are grouped by ``LocusStream(GPDStream(input))``.  Within each
    locus all exon ranges are passed to ``ranges_to_coverage``; for every
    entry the ranges returned by ``union_range_array(exon, cov, payload=2)``
    contribute ``payload * length`` to a total, which is divided by the
    entry's length.  Output columns, tab-separated: gene name, exon count,
    length, mean coverage.  The range string of the locus being processed
    is written to STDERR.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('-o', '--output',
                        help="output file or use STDOUT if not set")
    args = parser.parse_args()

    # A file opened here was previously never closed; remember ownership so
    # it can be released without touching STDIN.
    opened_here = args.input != '-'
    source = open(args.input) if opened_here else sys.stdin
    try:
        ls = LocusStream(GPDStream(source))
        of = sys.stdout
        if args.output:
            # Raw string avoids the invalid '\.' escape of a plain literal.
            if re.search(r'\.gz$', args.output):
                of = gzip.open(args.output, 'w')
            else:
                of = open(args.output, 'w')
        try:
            for rng in ls:
                sys.stderr.write(rng.get_range_string() + " \r")
                gpds = rng.get_payload()
                exs = []
                for entry in gpds:
                    exs.extend([exon.get_range() for exon in entry.exons])
                cov = ranges_to_coverage(exs)
                # Use the coverage data on each gpd entry now.
                for gpd in gpds:
                    totcov = 0
                    for exon in gpd.exons:
                        gcovs = union_range_array(
                            exon.get_range(), cov, payload=2)
                        totcov += sum(
                            [x.get_payload() * x.length() for x in gcovs])
                    mean_cov = float(totcov) / float(gpd.get_length())
                    of.write(gpd.get_gene_name() + "\t"
                             + str(gpd.get_exon_count()) + "\t"
                             + str(gpd.get_length()) + "\t"
                             + str(mean_cov) + "\n")
            sys.stderr.write("\n")
        finally:
            of.close()
    finally:
        if opened_here:
            source.close()
def get_depth_per_transcript(self, mindepth=1):
    """Return coverage statistics for every transcript.

    Coverage is computed by ``ranges_to_coverage`` over the ``rng`` of all
    exons of all transcripts from ``self.get_transcripts()``.  For each
    transcript, every exon is compared with every coverage region using
    ``overlap_size``; regions with a positive overlap contribute.

    The result maps ``tx.get_id()`` to a dict holding 'tx',
    'average_coverage' (overlap-weighted payload sum over transcript
    length), 'fraction_covered' (overlapping bases whose payload is at
    least ``mindepth`` over transcript length), 'mindepth' and
    'length_covered' (the numerator of 'fraction_covered').
    """
    bedarray = []
    for tx in self.get_transcripts():
        bedarray.extend([exon.rng for exon in tx.exons])
    cov = ranges_to_coverage(bedarray)

    results = {}
    for tx in self.get_transcripts():
        tlen = tx.get_length()
        overlaps = []  # [overlap size, payload] pairs with overlap > 0
        for exon_rng in [exon.rng for exon in tx.exons]:
            candidates = [[region.overlap_size(exon_rng),
                           region.get_payload()] for region in cov]
            for pair in candidates:
                if pair[0] > 0:
                    overlaps.append(pair)

        weighted = sum([pair[0] * pair[1] for pair in overlaps])
        average_coverage = float(weighted) / float(tlen)
        covered = sum([pair[0] for pair in overlaps if pair[1] >= mindepth])
        fraction = float(covered) / float(tlen)
        results[tx.get_id()] = dict([
            ('tx', tx),
            ('average_coverage', average_coverage),
            ('fraction_covered', fraction),
            ('mindepth', mindepth),
            ('length_covered', covered),
        ])
    return results
def main():
    """Build a coverage file from read annotations and a genepred.

    The annotation input (path, ``.gz`` accepted, ``-`` for STDIN) is
    tab-separated; column 3 is read as the gene, column 4 as the transcript
    and column 5 as the match type.  With ``--full`` only lines whose type
    is ``full`` are counted.  The genepred supplies the exon ranges of each
    transcript name.

    Every counted transcript with at least ``--min_exons`` exons
    contributes its exon ranges once per counted line.  The pooled ranges
    go through ``ranges_to_coverage`` and each region is written as
    ``chr<TAB>start-1<TAB>end<TAB>payload`` to ``-o/--output`` (``gzip`` for
    ``.gz`` names) or STDOUT.  Progress is reported on STDERR.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('genepred', help="the genepred used for this alignqc")
    parser.add_argument('--min_exons', type=int, default=1,
                        help="At least this number of exons")
    parser.add_argument('--full', action='store_true',
                        help="only use full matches")
    parser.add_argument('-o', '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()

    if args.input == '-':
        inf = sys.stdin
    elif args.input[-3:] == '.gz':
        inf = gzip.open(args.input)
    else:
        inf = open(args.input)

    genes = {}
    sys.stderr.write("Reading annotation file\n")
    for line in inf:
        fields = line.rstrip().split("\t")
        gene, tx, match_type = fields[2], fields[3], fields[4]
        if args.full and match_type != 'full':
            continue
        record = genes.setdefault(gene, {'transcripts': {}, 'cnt': 0})
        record['cnt'] += 1
        record['transcripts'][tx] = record['transcripts'].get(tx, 0) + 1
    inf.close()

    txs = {}
    sys.stderr.write("Reading genepred file\n")
    with open(args.genepred) as gpd_file:
        for z, line in enumerate(gpd_file, 1):
            if z % 1000 == 0:
                sys.stderr.write(str(z) + " \r")
            gpd = GPD(line)
            txs[gpd.get_transcript_name()] = [
                ex.get_range() for ex in gpd.exons]
    sys.stderr.write("\n")

    vals = []
    sys.stderr.write("Traversing annotation file\n")
    for gene in genes:
        counts = genes[gene]['transcripts']
        for tx in counts:
            exons = txs[tx]
            if len(exons) < args.min_exons:
                continue
            # One copy of the exon ranges per counted annotation line.
            vals.extend(exons * counts[tx])

    sys.stderr.write("Generating coverage file " + str(len(vals)) + "\n")
    if not args.output:
        of = sys.stdout
    elif args.output[-3:] == '.gz':
        of = gzip.open(args.output, 'w')
    else:
        of = open(args.output, 'w')
    for region in ranges_to_coverage(vals):
        of.write(region.chr + "\t" + str(region.start - 1) + "\t"
                 + str(region.end) + "\t" + str(region.get_payload()) + "\n")
    of.close()
def main():
    """Convert a TPM-annotated GTF into a coverage listing.

    Command line: ``input`` (GTF path, ``.gz`` accepted, ``-`` for STDIN),
    ``--offset``, ``--mult``, ``--min_exons`` and ``-o/--output``.

    Lines beginning with ``#`` are ignored.  Only 'transcript' and 'exon'
    features are used; both are keyed by the ``transcript_id`` attribute in
    column 9.  A 'transcript' line stores ``int(TPM * mult + offset)`` as
    the transcript's weight; an 'exon' line stores
    ``Bed(column 1, column 4 - 1, column 5)``.  Transcripts with at least
    ``--min_exons`` exons add their exons ``weight`` times to the list
    passed to ``ranges_to_coverage``.  Each region is written as
    ``chr<TAB>start-1<TAB>end<TAB>payload``.

    A used line without the expected attribute raises ``AttributeError``
    (``.group`` on a failed search), exactly as before.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('--offset', type=int, default=0,
                        help="add this much to transcript tpms")
    parser.add_argument('--mult', type=int, default=10,
                        help="multiply this much to tpms")
    parser.add_argument('--min_exons', type=int, default=1,
                        help="require at least this many exons")
    parser.add_argument('-o', '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()

    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)

    # Raw strings replace plain literals containing the invalid escape
    # '\s'; compiling up front also avoids rebuilding them per line.
    find_tx_id = re.compile(r'transcript_id\s+"([^"]+)"').search
    find_tpm = re.compile(r'TPM\s+"([^"]+)"').search

    sys.stderr.write("Reading gtf file\n")
    txs = {}
    for line in inf:
        if line.startswith('#'):
            continue
        f = line.rstrip().split("\t")
        kind = f[2]
        if kind not in ('exon', 'transcript'):
            continue
        tx = find_tx_id(f[8]).group(1)
        if tx not in txs:
            txs[tx] = {'tpm': 0, 'exons': []}
        if kind == 'transcript':
            tpm = float(find_tpm(f[8]).group(1))
            txs[tx]['tpm'] = int((tpm * float(args.mult)) + args.offset)
        else:
            # 'chrom' instead of 'chr', which shadowed the builtin.
            chrom = f[0]
            start = int(f[3]) - 1
            end = int(f[4])
            txs[tx]['exons'].append(Bed(chrom, start, end))
    inf.close()

    vals = []
    sys.stderr.write("Traversing annotation file\n")
    for tx in txs:
        exons = txs[tx]['exons']
        weight = txs[tx]['tpm']
        if len(exons) < args.min_exons:
            continue
        # Repeat the exons once per unit of weight (nothing when <= 0).
        vals.extend(exons * weight)

    sys.stderr.write("Generating coverage file " + str(len(vals)) + "\n")
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    for cov in ranges_to_coverage(vals):
        of.write(cov.chr + "\t" + str(cov.start - 1) + "\t"
                 + str(cov.end) + "\t" + str(cov.get_payload()) + "\n")
    of.close()
def main():
    """Derive a coverage file from annotated reads and their genepred.

    Each tab-separated annotation line (from a path, a ``.gz`` path, or
    STDIN via ``-``) names a gene (column 3), a transcript (column 4) and a
    match type (column 5); ``--full`` keeps only type ``full``.  Lines are
    tallied per gene and transcript.  The genepred is read to map each
    transcript name to the ``range`` of its exons.

    For every tallied transcript having at least ``--min_exons`` exons the
    exon ranges are added once per tallied line.  ``ranges_to_coverage``
    turns them into regions, written as
    ``chr<TAB>start-1<TAB>end<TAB>payload`` to ``-o/--output`` (``gzip`` for
    ``.gz`` names) or STDOUT.  Status messages go to STDERR.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('genepred', help="the genepred used for this alignqc")
    parser.add_argument('--min_exons', type=int, default=1,
                        help="At least this number of exons")
    parser.add_argument('--full', action='store_true',
                        help="only use full matches")
    parser.add_argument('-o', '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()

    inf = sys.stdin
    if args.input != '-':
        reader = gzip.open if args.input[-3:] == '.gz' else open
        inf = reader(args.input)

    genes = {}
    sys.stderr.write("Reading annotation file\n")
    for line in inf:
        columns = line.rstrip().split("\t")
        gene = columns[2]
        tx = columns[3]
        match_type = columns[4]
        if args.full and match_type != 'full':
            continue
        if gene not in genes:
            genes[gene] = {'transcripts': {}, 'cnt': 0}
        tally = genes[gene]['transcripts']
        tally.setdefault(tx, 0)
        genes[gene]['cnt'] += 1
        tally[tx] += 1
    inf.close()

    txs = {}
    sys.stderr.write("Reading genepred file\n")
    seen = 0
    with open(args.genepred) as gpd_file:
        for line in gpd_file:
            seen += 1
            if seen % 1000 == 0:
                sys.stderr.write(str(seen) + " \r")
            gpd = GPD(line)
            exon_ranges = [ex.range for ex in gpd.exons]
            txs[gpd.get_transcript_name()] = exon_ranges
    sys.stderr.write("\n")

    vals = []
    sys.stderr.write("Traversing annotation file\n")
    for gene in genes:
        tally = genes[gene]['transcripts']
        for tx in tally:
            times = tally[tx]
            exons = txs[tx]
            if len(exons) < args.min_exons:
                continue
            # Add the exon ranges once per tallied annotation line.
            vals.extend(exons * times)

    sys.stderr.write("Generating coverage file " + str(len(vals)) + "\n")
    of = sys.stdout
    if args.output:
        writer = gzip.open if args.output[-3:] == '.gz' else open
        of = writer(args.output, 'w')
    for cov in ranges_to_coverage(vals):
        pieces = [cov.chr, str(cov.start - 1), str(cov.end),
                  str(cov.get_payload())]
        of.write("\t".join(pieces) + "\n")
    of.close()