def fetchProbeFragments(probe_bed, digest_bed, outfile, lookup_out):

    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()

    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out, "w") as lookup:

        lookup.write("probe\tfragment\n")

        for probe in Bed.iterator(IOTools.openFile(probe_bed)):

            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if not len(frag) == 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]
            bed.start = frag.start
            bed.end = frag.end
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")

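# Minimal usage sketch for fetchProbeFragments (not part of the original
# module): all file names are hypothetical, and digest_bed must be
# bgzip-compressed and tabix-indexed so that pysam.TabixFile can open it.
def _example_fetchProbeFragments():
    fetchProbeFragments(probe_bed="probes.bed.gz",
                        digest_bed="digest.bed.gz",
                        outfile="probe_fragments.bed.gz",
                        lookup_out="probe2fragment.tsv.gz")
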
def annotateCpGIslands(infiles, outfile):
    '''annotate transcript by absence/presence of CpG islands
    '''
    cpgfile, tssfile = infiles
    cpg = Bed.readAndIndex(IOTools.openFile(cpgfile))

    extension_upstream = PARAMS["cpg_search_upstream"]
    extension_downstream = PARAMS["cpg_search_downstream"]

    c = E.Counter()
    outf = IOTools.openFile(outfile, "w")
    outf.write(
        "transcript_id\tstrand\tstart\tend\trelative_start\trelative_end\n")

    for tss in Bed.iterator(IOTools.openFile(tssfile)):
        c.tss_total += 1

        if tss.strand == "+":
            start, end = tss.start - extension_upstream, \
                tss.start + extension_downstream
        else:
            start, end = tss.end - extension_downstream, \
                tss.end + extension_upstream

        try:
            matches = list(cpg[tss.contig].find(start, end))
        except KeyError:
            c.promotor_without_matches += 1
            continue

        if len(matches) == 0:
            c.promotor_without_matches += 1
            continue

        c.promotor_output += 1
        for match in matches:
            c.matches_total += 1
            genome_start, genome_end, x = match
            l = genome_end - genome_start

            # get relative location of match
            if tss.strand == "+":
                relative_start = genome_start - tss.start
            else:
                relative_start = tss.end - genome_end

            relative_end = relative_start + l

            outf.write("\t".join(map(
                str,
                (tss.name, tss.strand,
                 genome_start, genome_end,
                 relative_start, relative_end))) + "\n")
            c.matches_output += 1

    outf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)

def getProbeFragments(probe_bed, digest_bed, outfile, lookup_out):

    # First find the length of the restriction enzyme cut, required to
    # obtain the start and end coordinates from the pregenerated file.
    # First iteration, no comparison
    first_iteration = True

    length_RE_cut = 0
    last_bed = None

    for bed_digest in Bed.iterator(IOTools.openFile(digest_bed)):

        if first_iteration:
            first_iteration = False
        else:
            # If they are in the same contig they can be compared
            if bed_digest.contig == last_bed.contig:
                length_RE_cut = bed_digest.start - last_bed.end
                break

        last_bed = bed_digest

    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()

    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out, "w") as lookup:

        lookup.write("probe\tfragment\n")

        for probe in Bed.iterator(IOTools.openFile(probe_bed)):

            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if not len(frag) == 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]

            # The restriction enzyme cut on the left side of the fragment
            # is the end site of the last restriction enzyme fragment + 1
            # (+1 because according to the manual coordinates are specified
            # in 1-origin for the bed start.)
            bed.start = frag.start - length_RE_cut + 1
            bed.end = frag.end + length_RE_cut
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: bed2graph.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--output", dest="output", type="choice",
                      choices=("full", "name"),
                      help="output either ``full`` overlapping entries or "
                      "only the ``name``s. [default=%default].")

    parser.set_defaults(
        output="full",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("two arguments required")

    if args[0] == "-":
        infile1 = options.stdin
    else:
        infile1 = IOTools.openFile(args[0], "r")

    infile2 = IOTools.openFile(args[1], "r")

    idx = Bed.readAndIndex(infile2, with_values=True)

    output = options.output
    outfile = options.stdout

    if output == "name":
        outfile.write("name1\tname2\n")
        outf = lambda x: x.fields[0]
    else:
        outf = str

    for bed in Bed.iterator(infile1):
        try:
            overlaps = idx[bed.contig].find(bed.start, bed.end)
        except (KeyError, IndexError):
            # ignore missing contig and zero length intervals
            continue

        for o in overlaps:
            outfile.write("\t".join((outf(bed), outf(o[2]))) + "\n")

    E.Stop()

def aggregateWindowsReadCounts(infiles, outfile, regex="(.*)\..*"):
    '''aggregate several results from coverageBed into a single file.

    *regex* is used to extract the track name from the filename.
    The default removes any suffix.

    coverageBed outputs the following columns:

    1 Contig
    2 Start
    3 Stop
    4 Name
    5 The number of features in A that overlapped (by at least one
      base pair) the B interval.
    6 The number of bases in B that had non-zero coverage from features
      in A.
    7 The length of the entry in B.
    8 The fraction of bases in B that had non-zero coverage from
      features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    Windows without any counts will not be output.
    '''

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    src = " ".join(
        ['''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) '''
         % (x, column) for x in infiles])

    tmpfile = P.getTempFilename(".")

    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run()

    # build track names
    tracks = [re.search(regex, os.path.basename(x)).groups()[0]
              for x in infiles]

    outf = IOTools.openFile(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    for line in open(tmpfile, "r"):
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]
        if sum(values) == 0:
            continue
        assert len(genes) == 1, \
            "paste command failed, wrong number of genes per line: '%s'" % line
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))

    outf.close()

    os.unlink(tmpfile)

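# Worked check of the column arithmetic used above (making the assumption in
# the docstring explicit: each infile is a coverageBed output, i.e. the
# original bed columns plus four appended count columns, so the overlap count
# sits at total - 4 + 1 in awk's 1-based numbering).
def _example_count_column():
    for total_columns, count_column in ((8, 5), (10, 7), (16, 13)):
        assert total_columns - 4 + 1 == count_column
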
def quantifyAnomolies(bam, probes, outfile):

    bamfile = pysam.AlignmentFile(bam)
    results = dict()
    mapped = bamfile.mapped
    total = 0
    seeks = 0

    for probe in Bed.iterator(IOTools.openFile(probes)):

        c = collections.Counter()

        for read in bamfile.fetch(probe.contig,
                                  probe.start,
                                  probe.end,
                                  multiple_iterators=True):

            if read.is_unmapped:
                continue

            c["total"] += 1

            if not (read.is_secondary or read.is_supplementary):
                c["primary"] += 1
            else:
                continue

            if read.pos < (probe.start - 4) or read.aend > (probe.end + 4):
                c["undigested"] += 1

            if read.is_read1:
                c["read1"] += 1

            if not read.mate_is_unmapped:
                c["paired"] += 1

                if read.is_read1 and (read.mpos >= probe.start and
                                      read.mpos <= probe.end):
                    c["self_lig"] += 1

            # parentheses around the sum are required here; without them the
            # modulo binds only to c["total"] and the progress message
            # effectively never fires
            if (total + c["total"]) % 10000 == 0:
                E.debug("%s/%s done" % (total + c["total"], mapped))

        E.debug("%s processed, %i found" % (probe.name, c["total"]))
        results[probe.name] = c
        total += c["total"]

    headers = ["Probe", "total", "primary", "undigested",
               "read1", "paired", "self_lig"]

    with IOTools.openFile(outfile, "w") as outf:

        outf.write("\t".join(headers) + "\n")

        for probe in results:
            outf.write("\t".join(
                [probe] + [str(results[probe][col])
                           for col in headers[1:]]) + "\n")

def sites2fragments(infile, genomefile, outfile):
    '''Convert bedfile of digestion sites into bedfile of fragments'''

    contig_lengths = {
        line.split()[0]: int(line.split()[1][:-1])
        for line in IOTools.openFile(genomefile)}

    last_end = 0
    last_contig = None
    name = 0
    new_bed = Bed.Bed()
    new_bed["strand"] = "+"
    new_bed["score"] = "."

    with IOTools.openFile(outfile, "w") as outf:
        for bed in Bed.iterator(IOTools.openFile(infile)):

            if last_contig is not None and not bed.contig == last_contig:
                name += 1
                new_bed.start = last_end
                new_bed.contig = last_contig
                # close off the final fragment of the previous contig; this
                # must use the previous contig's length, not the new one's
                new_bed.end = contig_lengths[last_contig]
                new_bed["name"] = str(name)
                outf.write(str(new_bed) + "\n")
                last_end = 0

            last_contig = bed.contig
            new_bed.contig = last_contig
            new_bed.start = last_end
            new_bed.end = bed.start
            name += 1
            new_bed["name"] = str(name)
            outf.write(str(new_bed) + "\n")
            last_end = bed.end

        name += 1
        new_bed.start = last_end
        new_bed.contig = last_contig
        new_bed.end = contig_lengths[bed.contig]
        new_bed["name"] = str(name)
        outf.write(str(new_bed) + "\n")

    pysam.tabix_index(outfile, force=True, preset="bed")

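# Minimal invocation sketch for sites2fragments (not part of the original
# module); file names are hypothetical. Note that the contig-length parser
# drops the final character of the second column, so *genomefile* must match
# whatever format this pipeline generates; *outfile* is bgzipped and
# tabix-indexed in place by pysam.tabix_index, so pass an uncompressed path.
def _example_sites2fragments():
    sites2fragments(infile="digest_sites.bed.gz",
                    genomefile="contig_sizes.tsv",
                    outfile="fragments.bed")
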
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gtf2table.py 2888 2010-04-07 08:48:36Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-n", "--per-name", dest="per_name",
                      action="store_true",
                      help="compute counts per name [default=%default].")

    parser.add_option("-c", "--per-contig", dest="per_contig",
                      action="store_true",
                      help="compute counts per contig [default=%default].")

    parser.add_option("-t", "--per-track", dest="per_track",
                      action="store_true",
                      help="compute counts per track [default=%default].")

    parser.set_defaults(
        genome_file=None,
        per_name=False,
        per_track=False,
    )

    (options, args) = E.Start(parser, argv)

    # get files
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    counts = collections.defaultdict(Counter)

    if options.per_track:
        keyf = lambda x: x.track
    elif options.per_name:
        keyf = lambda x: x.name
    elif options.per_contig:
        keyf = lambda x: x.contig
    else:
        keyf = lambda x: "all"

    for bed in Bed.iterator(options.stdin):
        counts[keyf(bed)].add(bed)

    outf = options.stdout

    key = "track"
    outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers)))

    for key, count in counts.items():
        outf.write("%s\t%s\n" % (key, str(count)))

    E.Stop()

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--regex-filename", dest="re_name", type="string",
                      help="regex for filename component to be used "
                      "as the annotation label")

    parser.set_defaults(
        re_name="(.+)",
    )

    (options, args) = E.Start(parser, argv=argv)

    infiles = argv[-1]
    bed_files = infiles.split(",")

    if len(bed_files) == 1:
        raise IOError("Only one file detected, cannot merge "
                      "a single bed file")

    # get the regex for annotation names
    rx = re.compile(options.re_name)
    annot_names = [[y for y in rx.search(x).groups()] for x in bed_files]
    annot_names = list(itertools.chain(*annot_names))

    # output as BED4 format: chr, start, end, name
    for fx in range(len(bed_files)):
        bfile = bed_files[fx]
        with IOTools.openFile(bfile, "r") as ofile:
            intervals = Bed.iterator(ofile)
            track_name = [bx for bx in annot_names if re.search(bx, bfile)][0]
            for entry in intervals:
                entry["name"] = track_name
                options.stdout.write("%s\t%s\t%s\t%s\n" % (entry.contig,
                                                           entry.start,
                                                           entry.end,
                                                           entry.name))

    # write footer and output benchmark information.
    E.Stop()

def aggregateWindowsReadCounts(infiles, outfile):
    '''aggregate tag counts for each window.

    coverageBed outputs the following columns:

    1) Contig
    2) Start
    3) Stop
    4) Name
    5) The number of features in A that overlapped (by at least one
       base pair) the B interval.
    6) The number of bases in B that had non-zero coverage from features
       in A.
    7) The length of the entry in B.
    8) The fraction of bases in B that had non-zero coverage from
       features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    Tiles with no counts will not be output.
    '''

    to_cluster = True

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    src = " ".join(
        ['''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) '''
         % (x, column) for x in infiles])

    tmpfile = P.getTempFilename(".")

    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run()

    tracks = [re.sub(r"\..*", '', os.path.basename(x)) for x in infiles]

    outf = IOTools.openFile(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    for line in open(tmpfile, "r"):
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]
        if sum(values) == 0:
            continue
        assert len(genes) == 1, \
            "paste command failed, wrong number of genes per line: '%s'" % line
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))

    outf.close()

    os.unlink(tmpfile)

def countTagsInClusters(bedfile, bamfile, outfile):

    bam = pysam.AlignmentFile(bamfile)

    outlines = []

    for bed in Bed.iterator(IOTools.openFile(bedfile)):
        interval = (bed.start, bed.end)
        counts = iCLIP.count_intervals(bam, [interval], bed.contig).sum()
        outlines.append(["%s:%i-%i" % (bed.contig, bed.start, bed.end),
                         str(counts)])

    IOTools.writeLines(outfile, outlines, header=["position", "count"])

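# Minimal usage sketch for countTagsInClusters (not part of the original
# module); file names are hypothetical.
def _example_countTagsInClusters():
    countTagsInClusters(bedfile="clusters.bed.gz",
                        bamfile="sample.bam",
                        outfile="cluster_counts.tsv")
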
def exportSequencesFromBedFile(infile, outfile, masker=None, mode="intervals"):
    '''export sequences for intervals in :term:`bed`-formatted *infile*
    to :term:`fasta` formatted *outfile*
    '''

    track = P.snip(infile, ".bed.gz")

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"], P.get_params()["genome"]))
    outs = IOTools.open_file(outfile, "w")

    ids, seqs = [], []
    for bed in Bed.setName(Bed.iterator(IOTools.open_file(infile))):
        lcontig = fasta.getLength(bed.contig)

        if mode == "intervals":
            seqs.append(fasta.getSequence(bed.contig, "+", bed.start, bed.end))
            ids.append("%s_%s %s:%i..%i" %
                       (track, bed.name, bed.contig, bed.start, bed.end))

        elif mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_%s_l %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_%s_r %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

    masked = maskSequences(seqs, masker)
    outs.write("\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]))

    outs.close()

def buildAlignmentSizes(infiles, outfile):
    '''
    use bed files to sum the total number of bases
    that are aligned to the genomes
    '''

    outf = open(outfile, "w")
    outf.write("genome\tsize\n")

    for infile in infiles:
        genome = P.snip(os.path.basename(infile), ".bed.gz")
        c = 0
        inf = IOTools.openFile(infile)
        for bed in Bed.iterator(inf):
            c += bed.end - bed.start
        outf.write("%s\t%s\n" % (genome, str(c)))

    outf.close()

def buildQuicksectMask(bed_file):
    '''return Quicksect object containing the regions specified

    takes a bed file listing the regions to mask
    '''
    mask = IndexedGenome.Quicksect()

    n_regions = 0
    for bed in Bed.iterator(IOTools.openFile(bed_file)):
        # it is necessary to extend the region to make an accurate mask
        mask.add(bed.contig, (bed.start - 1), (bed.end + 1), 1)
        n_regions += 1

    E.info("Built Quicksect mask for %i regions" % n_regions)

    return(mask)

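# Brief usage sketch for buildQuicksectMask (not part of the original
# module). The lookup call below is an assumption about the CGAT
# IndexedGenome.Quicksect interface and is illustrative only; the bed path
# is hypothetical.
def _example_buildQuicksectMask():
    mask = buildQuicksectMask("mask_regions.bed.gz")
    overlaps = list(mask.get("chr1", 1000, 2000))  # assumed lookup API
    return overlaps
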
def bedsFromList(data):
    '''takes a list of interval data and yields a bed object
    for each interval'''
    for interval in data:
        bed = Bed.Bed()
        try:
            bed.contig, bed.start, bed.end = \
                interval[0], int(interval[1]), int(interval[2])
        except IndexError:
            raise ValueError("Insufficient fields to generate bed entry")
        except ValueError:
            raise ValueError("Fields 2 and 3 must be integer")
        bed.fields = interval[3:]

        yield bed

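# Usage sketch for bedsFromList: row-like data (e.g. parsed from a tsv) is
# converted lazily into Bed entries; fields beyond the first three are
# carried through as bed.fields. The values below are illustrative.
def _example_bedsFromList():
    rows = [("chr1", "100", "200", "peak1", "0", "+"),
            ("chr2", "300", "400", "peak2", "0", "-")]
    for bed in bedsFromList(rows):
        print(str(bed))
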
def __init__(self, filename, *args, **kwargs):

    assert filename is not None, "please supply filename for CounterOverlap"

    Counter.__init__(self, *args, **kwargs)

    self.filename = filename

    E.info("reading intervals from %s" % self.filename)

    self.index = Bed.readAndIndex(IOTools.openFile(self.filename, "r"),
                                  per_track=True)

    E.info("read intervals for %s tracks" % len(self.index))

    self.tracks = self.index.keys()
    self.headers = []
    for track in self.tracks:
        self.headers.extend(["%s_nover" % track, "%s_bases" % track])

def transcript2bed12(transcript):

    new_entry = Bed.Bed()
    start = min(entry.start for entry in transcript)
    end = max(entry.end for entry in transcript)

    try:
        thickStart = min(entry.start for entry in transcript
                         if entry.feature == "CDS")
        thickEnd = max(entry.end for entry in transcript
                       if entry.feature == "CDS")
    except ValueError:
        # if there is no CDS, then set first base of transcript as start
        if transcript[0].strand == "-":
            thickStart = end
            thickEnd = end
        else:
            thickStart = start
            thickEnd = start

    exons = GTF.asRanges(transcript, "exon")

    exon_starts = [es - start for (es, ee) in exons]
    exon_lengths = [ee - es for (es, ee) in exons]
    exon_count = len(exons)

    new_entry.contig = transcript[0].contig
    new_entry.start = start
    new_entry.end = end

    new_entry["strand"] = transcript[0].strand
    new_entry["name"] = transcript[0].transcript_id

    new_entry["thickStart"] = thickStart
    new_entry["thickEnd"] = thickEnd

    new_entry["blockCount"] = exon_count
    new_entry["blockStarts"] = ",".join(map(str, exon_starts))
    new_entry["blockSizes"] = ",".join(map(str, exon_lengths))

    return new_entry

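# Usage sketch for transcript2bed12 (not part of the original module):
# assumes the CGAT GTF module already used elsewhere in this code; the
# gzipped geneset path is hypothetical.
def _example_transcript2bed12():
    with IOTools.openFile("geneset.gtf.gz") as inf:
        for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
            print(str(transcript2bed12(transcript)))
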
def windows2bed12(windows, contig, strand, name, score):
    '''Convert a list of intervals into a single bed12 entry '''

    windows = sorted(windows)
    entry = Bed.Bed()

    #if strand == "-":
    #    windows = [(y+1, x+1) for x, y in windows]
    #    windows = sorted(windows)
    #else:
    #    windows = sorted(windows)

    entry.start = int(windows[0][0])
    entry.end = int(windows[-1][1])
    entry.contig = contig

    blockCount = int(len(windows))
    blockSizes = ",".join(
        [str(int(window[1] - window[0])) for window in windows])
    blockStarts = ",".join(
        [str(int(window[0] - windows[0][0])) for window in windows])
    thickStart = int(entry.start)
    thickEnd = int(entry.end)
    itemRGB = "255,0,0"

    entry.fields = [name, score, strand, thickStart, thickEnd, itemRGB,
                    blockCount, blockSizes, blockStarts]

    assert entry.end - entry.start > 0, \
        "Malformed Bed entry: entry size less than zero"
    assert all([blockSize > 0
                for blockSize in map(int, blockSizes.split(","))]), \
        "Malformed Bed entry: at least one block size less than zero"
    assert all([entry.start + blockStart <= entry.end
                for blockStart in map(int, blockStarts.split(","))]), \
        "Malformed Bed entry: block start after end of entry"

    return entry

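# Usage sketch for windows2bed12: three windows collapse into a single
# bed12 entry with blockCount=3; all values are illustrative.
def _example_windows2bed12():
    entry = windows2bed12([(100, 150), (200, 250), (300, 400)],
                          contig="chr1", strand="+",
                          name="cluster1", score=0)
    print(str(entry))
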
def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''compute correlation of interval properties between sets
    '''

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    tracks, idx = [], []
    for infile in infiles:
        track = P.snip(infile, ".bed.gz")
        tablename = "%s_intervals" % P.tablequote(track)
        cc = dbhandle.cursor()
        statement = "SELECT contig, start, end, %(field)s FROM %(tablename)s" % locals()
        cc.execute(statement)
        ix = IndexedGenome.IndexedGenome()
        for contig, start, end, peakval in cc:
            ix.add(contig, start, end, peakval)
        idx.append(ix)
        tracks.append(track)

    outs = IOTools.openFile(outfile, "w")
    outs.write("contig\tstart\tend\tid\t" + "\t".join(tracks) + "\n")

    for bed in Bed.iterator(infile=IOTools.openFile(reference, "r")):

        row = []
        for ix in idx:
            try:
                intervals = list(ix.get(bed.contig, bed.start, bed.end))
            except KeyError:
                row.append("")
                continue

            if len(intervals) == 0:
                peakval = ""
            else:
                peakval = str((max([x[2] for x in intervals])))
            row.append(peakval)

        outs.write(str(bed) + "\t" + "\t".join(row) + "\n")

    outs.close()

def _count(self, filename, idx):
    '''count filename against idx.'''

    overlapping_genes = set()
    genes = set()

    # iterate over exons
    infile = IOTools.openFile(filename, "r")
    it = Bed.bed_iterator(infile)

    nexons, nexons_overlapping = 0, 0
    nbases, nbases_overlapping = 0, 0
    for this in it:
        nexons += 1
        nbases += this.end - this.start

        try:
            intervals = list(idx[this.contig].find(max(0, this.start),
                                                   this.end))
        except KeyError:
            continue
        except Exception as msg:
            raise Exception("error while processing %s, msg=%s" %
                            (filename, msg))

        if len(intervals) == 0:
            continue

        nexons_overlapping += 1
        start, end = this.start, this.end
        # plain int here: the numpy.int alias was removed in numpy >= 1.24
        counts = numpy.zeros(end - start, int)
        for other_start, other_end, other_value in intervals:
            for x in range(max(start, other_start) - start,
                           min(end, other_end) - start):
                counts[x] += 1
        nbases_overlapping += sum([1 for x in counts if x > 0])

    infile.close()

    return nexons, nexons_overlapping, nbases, nbases_overlapping

def getExonLocations(filename):
    '''return a list of exon locations as Bed entries
    from a file containing one Ensembl gene ID per line
    '''
    fh = IOTools.openFile(filename, "r")
    ensembl_ids = []
    for line in fh:
        ensembl_ids.append(line.strip())
    fh.close()

    dbhandle = sqlite3.connect(PARAMS["annotations_database"])
    cc = dbhandle.cursor()

    gene_ids = []
    n_ids = 0
    for ID in ensembl_ids:
        gene_ids.append('gene_id="%s"' % ID)
        n_ids += 1

    statement = "select contig,start,end from geneset_cds_gtf where " + \
        " OR ".join(gene_ids)
    cc.execute(statement)

    region_list = []
    n_regions = 0
    for result in cc:
        b = Bed.Bed()
        b.contig, b.start, b.end = result
        region_list.append(b)
        n_regions += 1

    cc.close()

    E.info("Retrieved exon locations for %i genes. Got %i regions" %
           (n_ids, n_regions))

    return(region_list)

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped",
        type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions",
        type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions", type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour", dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches", dest="ignore_mismatches",
        action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = IOTools.read_map(
            IOTools.open_file(options.filename_map), has_header=True)
        id_map = dict([(y, x) for x, y in id_map.items()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(IOTools.open_file(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(IOTools.open_file(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(
            options.filename_transcriptome, "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-", "wh",
                                             template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-", "wb",
                                             template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    c = bams2bam_filter(genome_samfile,
                        output_samfile,
                        output_mismapped,
                        transcripts_samfile,
                        junctions_samfile,
                        transcripts,
                        regions=regions_to_remove,
                        unique=options.unique,
                        remove_contigs=options.remove_contigs,
                        colour_mismatches=options.colour_mismatches,
                        ignore_mismatches=options.ignore_mismatches,
                        ignore_transcripts=transcripts_samfile is None,
                        ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = IOTools.open_file(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.stop()

def main(argv=sys.argv):

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n", "--new-instance", dest="new_instance",
                      action="store_true",
                      help="create a new IGV instance [%default]")

    parser.add_option("-s", "--session", dest="session", type="string",
                      help="load session before creating plots "
                      "[%default]")

    parser.add_option("-d", "--snapshot-dir", dest="snapshotdir",
                      type="string",
                      help="directory to save snapshots in [%default]")

    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("png", "eps", "svg"),
                      help="output file format [%default]")

    parser.add_option("-o", "--host", dest="host", type="string",
                      help="host that IGV is running on [%default]")

    parser.add_option("-p", "--port", dest="port", type="int",
                      help="port that IGV listens at [%default]")

    parser.add_option("-e", "--extend", dest="extend", type="int",
                      help="extend each interval by a number of bases "
                      "[%default]")

    parser.add_option("-x", "--expand", dest="expand", type="float",
                      help="expand each region by a certain factor "
                      "[%default]")

    parser.add_option("--session-only", dest="session_only",
                      action="store_true",
                      help="plot session after opening, "
                      "ignore intervals "
                      "[%default]")

    parser.add_option("--keep", dest="keep_open",
                      action="store_true",
                      help="keep a newly created IGV session open "
                      "[%default]")

    parser.set_defaults(
        command="igv.sh",
        host='127.0.0.1',
        port=61111,
        snapshotdir=os.getcwd(),
        extend=0,
        format="png",
        expand=1.0,
        session=None,
        session_only=False,
        keep_open=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    igv_process = None
    if options.new_instance:
        E.info("starting new IGV process")
        igv_process = IGV.startIGV(command=options.command,
                                   port=options.port)
        E.info("new IGV process started")

    E.info("connection to process on %s:%s" % (options.host, options.port))
    E.info("saving images in %s" % options.snapshotdir)
    igv = IGV.IGV(host=options.host,
                  port=options.port,
                  snapshot_dir=os.path.abspath(options.snapshotdir))

    if options.session:
        E.info('loading session from %s' % options.session)
        igv.load(options.session)
        E.info('loaded session')

    if options.session_only:
        E.info('plotting session only ignoring any intervals')
        fn = "%s.%s" % (os.path.basename(options.session), options.format)
        E.info("writing snapshot to '%s'" %
               os.path.join(options.snapshotdir, fn))
        igv.save(fn)

    else:
        c = E.Counter()
        for bed in Bed.iterator(options.stdin):

            c.input += 1

            # IGV can not deal with white-space in filenames
            name = re.sub(r"\s", "_", bed.name)

            E.info("going to %s:%i-%i for %s" %
                   (bed.contig, bed.start, bed.end, name))

            start, end = bed.start, bed.end
            extend = options.extend
            if options.expand:
                d = end - start
                extend = max(extend, (options.expand * d - d) // 2)

            start -= extend
            end += extend

            igv.go("%s:%i-%i" % (bed.contig, start, end))

            fn = "%s.%s" % (name, options.format)
            E.info("writing snapshot to '%s'" % fn)
            igv.save(fn)

            c.snapshots += 1

        E.info(c)

    if igv_process is not None and not options.keep_open:
        E.info('shutting down IGV')
        igv_process.send_signal(signal.SIGKILL)

    E.Stop()

def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-a", "--aggregate-by", dest="aggregate", type="choice",
                      choices=("name", "contig", "track", "none"),
                      help="aggregate counts by feature [default=%default].")

    parser.add_option("-p", "--add-percent", dest="add_percent",
                      action="store_true",
                      help="add percentages [default=%default].")

    parser.set_defaults(
        genome_file=None,
        aggregate="none",
        add_percent=False,
    )

    (options, args) = E.Start(parser, argv)

    # get files
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        if options.add_percent:
            raise ValueError("--add-percent option requires --genome-file")
        fasta = None

    if options.add_percent and not options.aggregate == "contig":
        raise NotImplementedError(
            "--add-percent option requires --aggregate=contig")

    counts = collections.defaultdict(Counter)
    total = Counter()
    output_totals = True

    if options.aggregate == "track":
        keyf = lambda x: x.track
    elif options.aggregate == "name":
        keyf = lambda x: x.name
    elif options.aggregate == "contig":
        keyf = lambda x: x.contig
    else:
        keyf = lambda x: "all"
        output_totals = False

    for bed in Bed.iterator(options.stdin):
        counts[keyf(bed)].add(bed)
        total.add(bed)

    outf = options.stdout

    key = "track"
    if options.add_percent:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers_percent)))
    else:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers)))

    total_bases = 0
    for key, count in sorted(counts.items()):
        if options.add_percent:
            total_bases += fasta.getLength(key)
            count.setSize(fasta.getLength(key))
        outf.write("%s\t%s\n" % (key, str(count)))

    if output_totals:
        if options.add_percent:
            count.setSize(total_bases)
        outf.write("%s\t%s\n" % ("total", str(total)))

    E.Stop()

def formatProbeFragments(probe_fragments_bed, outfile):

    # Create strings for output collisions
    probe_collisions = ""
    exclus_collisions = ""

    with IOTools.openFile(outfile, "w") as outf:

        # dictionaries for collisions per chr
        chr_probe = {}
        chr_exclus = {}

        for bed_digest in Bed.iterator(IOTools.openFile(probe_fragments_bed)):

            # chromosome needs to specify only number (remove chr)
            chromosome = re.sub('chr', '', bed_digest.contig)

            out_array = []
            out_array.append(formatNameCompliance(bed_digest["name"]))
            out_array.append(str(chromosome))
            out_array.append(str(bed_digest.start))
            out_array.append(str(bed_digest.end))
            out_array.append(str(chromosome))
            out_array.append(str(bed_digest.start - 1000))
            out_array.append(str(bed_digest.end + 1000))
            out_array.append("1")
            out_array.append("A")

            outf.write("\t".join(out_array) + "\n")

            # Calculate collisions, all coordinates assumed in BED format.
            # It needs to be done per chr:
            # chr_probe -> probe_ranges -> probe_range
            # chr_exclus -> exclus_ranges -> exclus_range

            # Check if chromosome key already exists in one of the
            # dictionaries (ranges introduced before on that chromosome)
            if bed_digest.contig in chr_probe:
                # If it exists, get the array of probe ranges
                # and exclusion ranges already stored
                probe_ranges = chr_probe[bed_digest.contig]
                exclus_ranges = chr_exclus[bed_digest.contig]
            # If it doesn't, create the arrays
            else:
                probe_ranges = []
                exclus_ranges = []

            # Create a range for probes and exclusion fragments
            probe_range = []
            exclus_range = []

            probe_range.append(bed_digest.start)
            probe_range.append(bed_digest.end)
            probe_ranges.append(probe_range)

            exclus_range.append(bed_digest.start - 1000)
            exclus_range.append(bed_digest.end + 1000)
            exclus_ranges.append(exclus_range)

            # Substitute the ranges back to the corresponding chr in the
            # dictionary
            chr_probe[bed_digest.contig] = probe_ranges
            chr_exclus[bed_digest.contig] = exclus_ranges

    for chr in chr_probe:

        probe_ranges = chr_probe[chr]
        exclus_ranges = chr_exclus[chr]

        probe_intersection = set.intersection(
            *(set(range(start, finish)) for start, finish in probe_ranges))
        exclus_intersection = set.intersection(
            *(set(range(start, finish)) for start, finish in exclus_ranges))

        if len(probe_intersection) != 0:
            probe_collisions += "Probe collision " + str(chr)
            probe_collisions += " " + str(min(probe_intersection))
            probe_collisions += " " + str(max(probe_intersection))
            probe_collisions += "\n"

        if len(exclus_intersection) != 0:
            exclus_collisions += "Exclusion collision " + str(chr)
            exclus_collisions += " " + str(min(exclus_intersection))
            exclus_collisions += " " + str(max(exclus_intersection))
            exclus_collisions += "\n"

    return (probe_collisions, exclus_collisions)

def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: bed2bed.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    # IMS: new method: extend intervals by set amount
    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("merge", "filter-genome", "bins",
                               "block", "sanitize-genome", "shift", "extend"),
                      help="method to apply [default=%default]")

    parser.add_option("--num-bins", dest="num_bins", type="int",
                      help="number of bins into which to merge (used for "
                      "method `bins`) [default=%default]")

    parser.add_option("--bin-edges", dest="bin_edges", type="string",
                      help="bin_edges for binning method [default=%default]")

    parser.add_option("--binning-method", dest="binning_method",
                      type="choice",
                      choices=("equal-bases", "equal-intervals",
                               "equal-range"),
                      help="method used for binning (used for method `bins` "
                      "if no bin_edges is given) [default=%default]")

    parser.add_option("--merge-distance", dest="merge_distance", type="int",
                      help="distance in bases over which to merge intervals "
                      "that are not directly adjacent [default=%default]")

    parser.add_option("--merge-min-intervals", dest="merge_min_intervals",
                      type="int",
                      help="only output merged intervals that are built from "
                      "at least x intervals [default=%default]")

    parser.add_option("--merge-by-name", dest="merge_by_name",
                      action="store_true",
                      help="only merge intervals with the same name "
                      "[default=%default]")

    parser.add_option("--remove-inconsistent", dest="remove_inconsistent",
                      action="store_true",
                      help="when merging, do not output intervals where the "
                      "names of overlapping intervals do not match "
                      "[default=%default]")

    parser.add_option("--offset", dest="offset", type="int",
                      help="offset for shifting intervals [default=%default]")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-b", "--bam-file", dest="bam_file", type="string",
                      help="bam-formatted filename with genome.")

    parser.set_defaults(methods=[],
                        merge_distance=0,
                        binning_method="equal-bases",
                        merge_by_name=False,
                        genome_file=None,
                        bam_file=None,
                        num_bins=5,
                        merge_min_intervals=1,
                        bin_edges=None,
                        offset=10000,
                        test=None,
                        extend_distance=1000,
                        remove_inconsistent=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    contigs = None

    # Why provide full indexed genome, when a tsv of contig sizes would do?
    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    if options.bam_file:
        samfile = pysam.Samfile(options.bam_file)
        contigs = dict(zip(samfile.references, samfile.lengths))

    processor = Bed.iterator(options.stdin)

    for method in options.methods:
        if method == "filter-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = filterGenome(processor, contigs)
        elif method == "sanitize-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = sanitizeGenome(processor, contigs)
        elif method == "merge":
            processor = merge(
                processor,
                options.merge_distance,
                by_name=options.merge_by_name,
                min_intervals=options.merge_min_intervals,
                remove_inconsistent=options.remove_inconsistent)
        elif method == "bins":
            if options.bin_edges:
                # list() is needed: a bare map object has no len() in python 3
                bin_edges = list(map(float, options.bin_edges.split(",")))
                # IMS: check bin edges are valid
                if not len(bin_edges) == options.num_bins + 1:
                    raise ValueError(
                        "Number of bin edges must be one more than "
                        "number of bins")
            else:
                bin_edges = None
            processor, bin_edges = Bed.binIntervals(
                processor,
                num_bins=options.num_bins,
                method=options.binning_method,
                bin_edges=bin_edges)
            E.info("# split bed: bin_edges=%s" % (str(bin_edges)))
        elif method == "block":
            processor = Bed.blocked_iterator(processor)
        elif method == "shift":
            # IMS: test that contig sizes are available
            if not contigs:
                raise ValueError("please supply genome file")
            processor = shiftIntervals(processor, contigs,
                                       offset=options.offset)
        # IMS: new method: extend intervals by set amount
        elif method == "extend":
            if not contigs:
                raise ValueError("please supply genome file")
            processor = extendInterval(processor, contigs, options.offset)

    noutput = 0
    for bed in processor:
        options.stdout.write(str(bed) + "\n")
        noutput += 1

    E.info("noutput=%i" % (noutput))

    E.Stop()

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=["ave_dist", "min_dist", "corr"],
                      default="min_dist",
                      help="Method for calculating similarity between "
                      "profiles")

    parser.add_option("-s", "--spread", dest="spread", type="int",
                      default=10,
                      help="Amount to spread each tag by")

    parser.add_option("-k", "--keep-dist", dest="keep_dist",
                      action="store_true",
                      help="Keep the distribution of tag depths")

    # type="int" added: without it a value given on the command line
    # arrives as a string
    parser.add_option("-r", "--rands", dest="rands", type="int",
                      default=100,
                      help="Number of randomisations to use for calculating"
                      " mean and stdev of distance")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    profile1_file, profile2_file = args
    profile1_file = pysam.AlignmentFile(profile1_file)

    if profile2_file.endswith("bed") or profile2_file.endswith("bed.gz"):
        profile2_file = Bed.readAndIndex(profile2_file, with_values=True)
        profile2_counter = bed_counter
    else:
        profile2_file = pysam.AlignmentFile(profile2_file)
        profile2_counter = iCLIP.count_intervals

    if options.method == "min_dist":
        distance_func = iCLIP.findMinDistance
    elif options.method == "ave_dist":
        distance_func = iCLIP.calcAverageDistance
    else:
        def distance_func(profile1, profile2):
            return iCLIP.corr_profile(profile1, profile2, options.spread,
                                      profile2_ready=True)

    for exon in GTF.iterator(options.stdin):
        if exon.feature != "exon":
            continue

        contig = exon.contig
        strand = exon.strand
        transcript_id = exon.transcript_id
        start = exon.start
        end = exon.end

        profile1 = iCLIP.count_intervals(profile1_file,
                                         [(start, end)],
                                         contig=contig,
                                         strand=strand)
        profile2 = profile2_counter(profile2_file,
                                    [(start, end)],
                                    contig=contig,
                                    strand=strand)

        if profile1.sum() == 0 or profile2.sum() == 0:
            z = "NA"
            distance = "NA"
            options.stdout.write(
                "%(contig)s\t%(start)i\t%(end)i\t%(transcript_id)s\t"
                "%(strand)s\t%(distance)s\t%(z)s\n" % locals())
            continue

        if options.method == "corr":
            profile2 = iCLIP.spread(profile2, options.spread)

        distance = distance_func(profile1, profile2)

        rands = iCLIP.rand_apply(profile=profile1,
                                 exon=exon,
                                 n=options.rands,
                                 func=distance_func,
                                 keep_dist=options.keep_dist,
                                 profile2=profile2)

        z = (distance - rands.mean()) / rands.std()

        options.stdout.write(
            "%(contig)s\t%(start)i\t%(end)i\t%(transcript_id)s\t"
            "%(strand)s\t%(distance).3f\t%(z).2f\n" % locals())

    # write footer and output benchmark information.
    E.Stop()

def loadIntervalsFromBed(bedfile, track, outfile, bamfiles, offsets):
    '''load intervals from :term:`bed` formatted files into database.

    Re-evaluate the intervals by counting reads within
    the interval. In contrast to the initial pipeline, the
    genome is not binned. In particular, the meaning of the
    columns in the table changes to:

    nProbes: number of reads in interval
    PeakCenter: position with maximum number of reads in interval
    AvgVal: average coverage within interval
    '''

    tmpfile = P.getTempFile(".")

    headers = ("AvgVal", "DisttoStart", "GeneList", "Length",
               "PeakCenter", "PeakVal", "Position", "interval_id",
               "nCpGs", "nGenes", "nPeaks", "nProbes", "nPromoters",
               "contig", "start", "end")

    tmpfile.write("\t".join(headers) + "\n")

    (avgval, contig, disttostart, end, genelist, length,
     peakcenter, peakval, position, start, interval_id,
     ncpgs, ngenes, npeaks, nprobes, npromoters) = \
        0, "", 0, 0, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

    mlength = int(PARAMS["calling_merge_min_interval_length"])

    c = E.Counter()

    # count tags; the original body read from an undefined ``infile`` -
    # the ``bedfile`` parameter is clearly what is meant
    for bed in Bed.iterator(IOTools.openFile(bedfile, "r")):

        c.input += 1

        if "name" not in bed:
            bed.name = c.input

        # remove very short intervals
        if bed.end - bed.start < mlength:
            c.skipped_length += 1
            continue

        # NOTE: ``replicates`` is not defined in this function; in the
        # original pipeline it is expected to exist at module scope
        if replicates:
            npeaks, peakcenter, length, avgval, peakval, nprobes = \
                PipelineChipseq.countPeaks(
                    bed.contig, bed.start, bed.end, bamfiles, offsets)

            # nreads can be 0 if the intervals overlap only slightly
            # and due to the binning, no reads are actually in the
            # overlap region. However, most of these intervals should
            # be small and have already be deleted via the
            # merge_min_interval_length cutoff. do not output
            # intervals without reads.
            if nprobes == 0:
                c.skipped_reads += 1

        else:
            npeaks, peakcenter, length, avgval, peakval, nprobes = (
                1,
                bed.start + (bed.end - bed.start) // 2,
                bed.end - bed.start,
                1, 1, 1)

        c.output += 1
        tmpfile.write("\t".join(map(
            str,
            (avgval, disttostart, genelist, length,
             peakcenter, peakval, position, bed.name,
             ncpgs, ngenes, npeaks, nprobes, npromoters,
             bed.contig, bed.start, bed.end))) + "\n")

    if c.output == 0:
        E.warn("%s - no intervals" % track)

    tmpfile.close()

    tmpfilename = tmpfile.name
    tablename = "%s_intervals" % track.asTable()

    statement = '''
    cgat csv2db %(csv2db_options)s
    --allow-empty-file
    --add-index=interval_id
    --table=%(tablename)s
    < %(tmpfilename)s
    > %(outfile)s
    '''
    P.run()
    os.unlink(tmpfile.name)

    E.info("%s\n" % str(c))

def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf.")

    parser.add_option(
        "-f", "--id-format", dest="id_format", type="string",
        help="format for numeric identifier if --as-gtf is set and "
        "no name in bed file [%default].")

    parser.set_defaults(as_gtf=False,
                        id_format="%08i",
                        test=None)

    (options, args) = E.start(parser, add_pipe_options=True)

    as_gtf = options.as_gtf
    id_format = options.id_format

    # both output modes use a GTF.Entry container here (the original code
    # created the same object in either branch of an if/else)
    gff = GTF.Entry()
    gff.source = "bed"
    gff.feature = "exon"

    ninput, noutput, nskipped = 0, 0, 0

    id = 0
    for bed in Bed.iterator(options.stdin):

        ninput += 1

        gff.contig = bed.contig
        gff.start = bed.start
        gff.end = bed.end

        if bed.fields and len(bed.fields) >= 3:
            gff.strand = bed.fields[2]
        else:
            gff.strand = "."

        if bed.fields and len(bed.fields) >= 2:
            gff.score = bed.fields[1]

        if as_gtf:
            if bed.fields:
                gff.gene_id = bed.fields[0]
                gff.transcript_id = bed.fields[0]
            else:
                id += 1
                gff.gene_id = id_format % id
                gff.transcript_id = id_format % id
        else:
            if bed.fields:
                gff.source = bed.fields[0]

        options.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    E.stop()

def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $", usage=globals()["__doc__"]) parser.add_option("-i", "--infiles", dest="infiles", type="string", metavar="bed", action="append", help="supply list of bed files") parser.set_defaults(infiles=[]) # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) options.infiles.extend(args) if len(options.infiles) == 0: raise ValueError('please provide at least 1 bed file') E.info("concatenating bed files") # concatenate the list of files tmp = tempfile.NamedTemporaryFile(delete=False) tmp_merge = tempfile.NamedTemporaryFile(delete=False) infs = options.infiles for inf in infs: for bed in Bed.iterator(IOTools.openFile(inf)): tmp.write("%s\n" % bed) tmp.close() E.info("merging bed entries") # merge the bed entries in the file name = tmp.name tmp_bed = pybedtools.BedTool(name) tmp_bed.merge().saveas(tmp_merge.name) tmp_merge.close() E.info("indexing bed entries") # index the bed entries merged = IndexedGenome.Simple() for bed in Bed.iterator(IOTools.openFile(tmp_merge.name)): merged.add(bed.contig, bed.start, bed.end) counts = collections.defaultdict(int) # list of samples samples = options.infiles E.info("counting no. samples overlapping each interval") for sample in samples: found = set() for bed in Bed.iterator(IOTools.openFile(sample)): if merged.contains(bed.contig, bed.start, bed.end): key = [bed.contig] + \ [x for x in merged.get(bed.contig, bed.start, bed.end)] key = (key[0], key[1][0], key[1][1]) if key in found: continue found.add(key) # tuple of interval description as key - (contig, start, end) counts[key] += 1 # open outfile options.stdout.write("contig\tstart\tend\tcount\n") E.info("outputting result") for interval, count in counts.iteritems(): options.stdout.write( "\t".join(map(str, interval)) + "\t" + str(count) + "\n") # write footer and output benchmark information. E.Stop()
def __init__(self, filename): self.mIndices = Bed.readAndIndex(IOTools.openFile(filename, "r"), per_track=True)
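A hedged sketch of how such a per-track index is queried; with per_track=True the result maps track to contig to an interval index, so lookups go through two levels (the filename and coordinates are illustrative):

# Hypothetical lookup against a per-track bed index.
indices = Bed.readAndIndex(IOTools.openFile("regions.bed.gz", "r"),
                           per_track=True)
for track, index in indices.items():
    if "chr1" in index:
        hits = list(index["chr1"].find(1000, 2000))
        print("%s: %i intervals overlap chr1:1000-2000" % (track, len(hits)))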
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only "
                      "covers the segments [%default]")

    parser.add_option("--min-sequence-length", dest="min_length",
                      type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length",
                      type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand",
        action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))
    else:
        raise ValueError("please supply a genome via --genome-file")

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif (options.output_mode == "intervals" or
              options.output_mode == "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)
    E.info("%s" % counter)
    E.Stop()
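The 'leftright' mode mirrors each interval onto its flanks; a worked check of the coordinate arithmetic, clipped to the contig (values illustrative):

# For an interval [start, end) of length l, the left flank is
# [max(0, start - l), end - l) and the right flank is
# [start + l, min(lcontig, end + l)).
start, end, lcontig = 100, 150, 1000
l = end - start                              # 50
left = (max(0, start - l), end - l)          # (50, 100)
right = (start + l, min(lcontig, end + l))   # (150, 200)
assert left == (50, 100) and right == (150, 200)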
def main(argv=sys.argv):

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("igv", ),
                      help="method to create plots with [%default]")

    parser.add_option("-d", "--snapshot-dir", dest="snapshotdir",
                      type="string",
                      help="directory to save snapshots in [%default]")

    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("png", "eps", "svg"),
                      help="output file format [%default]")

    parser.add_option("-o", "--host", dest="host", type="string",
                      help="host that IGV is running on [%default]")

    parser.add_option("-p", "--port", dest="port", type="int",
                      help="port that IGV listens at [%default]")

    parser.add_option("-e", "--extend", dest="extend", type="int",
                      help="extend each interval by a number of bases "
                      "[%default]")

    parser.add_option("-x", "--expand", dest="expand", type="float",
                      help="expand each region by a certain factor "
                      "[%default]")

    parser.set_defaults(
        method="igv",
        host='127.0.0.1',
        port=61111,
        snapshotdir=os.getcwd(),
        extend=0,
        format="png",
        expand=1.0,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    E.info("connection to session on %s:%s" % (options.host, options.port))
    E.info("saving images in %s" % options.snapshotdir)
    igv = IGV.IGV(host=options.host,
                  port=options.port,
                  snapshot_dir=os.path.abspath(options.snapshotdir))

    c = E.Counter()
    for bed in Bed.iterator(options.stdin):
        c.input += 1

        # IGV can not deal with white-space in filenames
        name = re.sub(r"\s", "_", bed.name)

        E.info("going to %s:%i-%i for %s" %
               (bed.contig, bed.start, bed.end, name))

        start, end = bed.start, bed.end
        extend = options.extend
        if options.expand:
            d = end - start
            extend = max(extend, (options.expand * d - d) // 2)

        start -= extend
        end += extend

        igv.go("%s:%i-%i" % (bed.contig, start, end))

        fn = "%s.%s" % (name, options.format)
        E.info("writing snapshot to '%s'" % fn)
        igv.save(fn)

        c.snapshots += 1

    E.info(c)
    E.Stop()
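A worked check of the --expand arithmetic used above: expanding keeps the region centre fixed while scaling its width (values illustrative):

start, end, expand, extend = 1000, 2000, 2.0, 0
d = end - start
extend = max(extend, (expand * d - d) // 2)  # 500.0
start, end = start - extend, end + extend    # 500.0, 2500.0
assert end - start == expand * d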
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: bed2annotator2tsv.py 2885 2010-04-07 08:46:50Z andreas $", usage=globals()["__doc__"]) parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome.") parser.add_option("-f", "--features", dest="features", type="string", help="feature to collect [default=None].") parser.add_option("-i", "--files", dest="files", action="append", help="use multiple annotations [default=None].") parser.add_option( "-a", "--annotations", dest="annotations", type="string", help= "aggregate name for annotations if only single file is provided from STDIN [default=None]." ) parser.add_option( "--input-filename-map", dest="input_filename_map", type="string", help="filename with a map of gene_ids to categories [default=None].") parser.add_option("-l", "--max-length", dest="max_length", type="string", help="maximum segment length [default=None].") parser.add_option( "-m", "--merge", dest="merge", action="store_true", help="merge overlapping bed segments [default=%default].") parser.add_option("-s", "--section", dest="section", type="choice", choices=("segments", "annotations", "workspace"), help="annotator section [default=None].") parser.add_option( "--subset", dest="subsets", type="string", action="append", help= "add filenames to delimit subsets within the gff files. The syntax is filename.gff,label,filename.ids [default=None]." ) parser.set_defaults( genome_file=None, feature=None, remove_random=True, section="segments", annotations="annotations", max_length=100000, files=[], subsets=[], input_filename_map=None, merge=False, ) (options, args) = E.Start(parser) options.files += args if len(options.files) == 0: options.files.append("-") options.files = list( itertools.chain(*[re.split("[,; ]+", x) for x in options.files])) if options.subsets: subsets = collections.defaultdict(list) for s in options.subsets: filename_gff, label, filename_ids = s.split(",") subsets[filename_gff].append((label, filename_ids)) options.subsets = subsets if options.genome_file: fasta = IndexedFasta.IndexedFasta(options.genome_file) else: fasta = None if options.section == "segments": prefix = "##Segs" elif options.section == "annotations": prefix = "##Id" elif options.section == "workspace": prefix = "##Work" else: raise ValueError("unknown section %s" % options.section) if options.max_length: max_length = options.max_length else: max_length = 0 ninput, ntracks, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0, 0 if options.section in ("annotations"): contigs = set() it = itertools.groupby(Bed.iterator(options.stdin), key=lambda x: x.track["name"]) map_track2segments = {} for track, beds in it: ntracks += 1 map_track2segments[track] = [] first_segment = nsegments beds = list(beds) if options.merge: beds = Bed.merge(beds) for bed in beds: contig, start, end = bed.contig, bed.start, bed.end if options.remove_random and "random" in contig: continue if max_length > 0 and end - start > max_length: ndiscarded += 1 continue contigs.add(contig) map_track2segments[track].append(nsegments) options.stdout.write("%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, start, end)) nsegments += 1 options.stdout.write("##Ann\t%s\t%s\n" % (track, "\t".join( ["%i" % x for x in range(first_segment, nsegments)]))) E.info("track %s: annotated with %i segments" % (track, nsegments - first_segment)) ncontigs = len(contigs) E.info( 
"ninput=%i, ntracks=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" % (ninput, ntracks, ncontigs, nsegments, ndiscarded)) E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-m", "--min-overlap", dest="min_overlap", type="float", help="minimum overlap [%default]") parser.add_option("-a", "--bam-file", dest="filename_bam", metavar="bam", type="string", help="bam-file to use (required) [%default]") parser.add_option("-b", "--bed-file", dest="filename_bed", metavar="bed", type="string", help="bed-file to use (required) [%default]") parser.add_option( "-s", "--sort-bed", dest="sort_bed", action="store_true", help="sort the bed file by chromosomal location before " "processing. " "[%default]") parser.add_option( "--assume-sorted", dest="sort_bed", action="store_false", help="assume that the bed-file is sorted by chromosomal location. " "[%default]") parser.add_option( "--split-intervals", dest="split_intervals", action="store_true", help="treat split BAM intervals, for example spliced intervals, " "as separate intervals. Note that a single alignment might be " "counted several times as a result. " "[%default]") parser.set_defaults( min_overlap=0.5, filename_bam=None, filename_bed=None, sort_bed=True, split_intervals=False, ) # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) filename_bam = options.filename_bam filename_bed = options.filename_bed if filename_bam is None and filename_bed is None: if len(args) != 2: raise ValueError( "please supply a bam and a bed file or two bed-files.") filename_bam, filename_bed = args if filename_bed is None: raise ValueError("please supply a bed file to compare to.") if filename_bam is None: raise ValueError("please supply a bam file to compare with.") E.info("intersecting the two files") min_overlap = options.min_overlap options.stdout.write("category\talignments\n") # get number of columns of reference bed file for bed in Bed.iterator(IOTools.openFile(filename_bed)): ncolumns_bed = bed.columns break E.info("assuming %s is bed%i format" % (filename_bed, ncolumns_bed)) if ncolumns_bed < 4: raise ValueError("please supply a name attribute in the bed file") # get information about if filename_bam.endswith(".bam"): format = "-abam" samfile = pysam.Samfile(filename_bam, "rb") total = samfile.mapped # latest bedtools uses bed12 format when bam is input ncolumns_bam = 12 # count per read sort_key = lambda x: x.name else: format = "-a" total = IOTools.getNumLines(filename_bam) # get bed format ncolumns_bam = 0 for bed in Bed.iterator(IOTools.openFile(filename_bam)): ncolumns_bam = bed.columns break if ncolumns_bam > 0: E.info("assuming %s is bed%i fomat" % (filename_bam, ncolumns_bam)) if ncolumns_bam == 3: # count per interval sort_key = lambda x: (x.contig, x.start, x.end) else: # count per interval category sort_key = lambda x: x.name # use fields for bam/bed file (regions to count with) data_fields = [ "contig", "start", "end", "name", "score", "strand", "thickstart", "thickend", "rgb", "blockcount", "blockstarts", "blockends"][:ncolumns_bam] # add fields for second bed (regions to count in) data_fields.extend([ "contig2", "start2", "end2", "name2", "score2", "strand2", "thickstart2", "thickend2", "rgb2", "blockcount2", "blockstarts2", "blockends2"][:ncolumns_bed]) # add bases overlap data_fields.append("bases_overlap") data = collections.namedtuple("data", data_fields) options.stdout.write("total\t%i\n" % total) if 
total == 0: E.warn("no data in %s" % filename_bam) return # SNS: sorting optional, off by default if options.sort_bed: bedcmd = "<( zcat %s | sort -k1,1 -k2,2n)" % filename_bed else: bedcmd = filename_bed if options.split_intervals: split = "-split" else: split = "" # IMS: newer versions of intersectBed have a very high memory # requirement unless passed sorted bed files. statement = """bedtools intersect %(format)s %(filename_bam)s -b %(bedcmd)s %(split)s -sorted -bed -wo -f %(min_overlap)f""" % locals() E.info("starting counting process: %s" % statement) proc = E.run(statement, return_popen=True, stdout=subprocess.PIPE) E.info("counting") counts_per_alignment = collections.defaultdict(int) take_columns = len(data._fields) def iterate(infile): for line in infile: if not line.strip(): continue yield data._make(line[:-1].split()[:take_columns]) for read, overlaps in itertools.groupby( iterate(IOTools.force_str(proc.stdout)), key=sort_key): annotations = [x.name2 for x in overlaps] for anno in annotations: counts_per_alignment[anno] += 1 for key, counts in sorted(counts_per_alignment.items()): options.stdout.write("%s\t%i\n" % (key, counts)) # write footer and output benchmark information. E.Stop()
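The counting stage reduces to grouping intersect output by read and tallying the annotation column; a self-contained toy version with fabricated rows (sorted by read, as groupby requires):

import collections
import itertools

rows = [("r1", "promoter"), ("r1", "promoter"), ("r2", "intron")]
counts = collections.defaultdict(int)
for read, overlaps in itertools.groupby(rows, key=lambda x: x[0]):
    for _, anno in overlaps:
        counts[anno] += 1
assert counts == {"promoter": 2, "intron": 1}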
def buildIndex(self, filename): return Bed.readAndIndex(IOTools.openFile(filename, "r"))
def aggregateWindowsTagCounts(infiles, outfile, regex=r"(.*)\..*"):
    '''aggregate output from several ``bedtools coverage`` results.

    ``bedtools coverage`` outputs the following columns for a bed4
    file::

    1 Contig
    2 Start
    3 Stop
    4 Name
    5 The number of features in A that overlapped (by at least one
      base pair) the B interval.
    6 The number of bases in B that had non-zero coverage from
      features in A.
    7 The length of the entry in B.
    8 The fraction of bases in B that had non-zero coverage from
      features in A.

    This method autodetects the number of columns in the
    :term:`infiles` and selects:

    * bed4: use column 5
    * bed6: use column 7
    * bed12: use column 13

    Arguments
    ---------
    infiles : list
        Input filenames with the output from ``bedtools coverage``
    outfile : string
        Output filename in :term:`tsv` format.
    regex : string
        Regular expression used to extract the track name from the
        filename. The default removes any suffix.

    '''
    # get the number of columns in the bedtools coverage output
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    src = " ".join([
        """<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}')""" %
        (x, column) for x in infiles])

    tmpfile = P.get_temp_filename(".")
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run(statement)

    # build track names
    tracks = [
        re.search(regex, os.path.basename(x)).groups()[0] for x in infiles]

    outf = IOTools.open_file(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    # filter for uniqueness - keys with the same value as the
    # previous line will be ignored.
    last_gene = None
    c = E.Counter()
    for line in open(tmpfile, "r"):
        c.input += 1
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]

        assert len(genes) == 1, \
            "paste command failed, wrong number of genes per line: '%s'" % \
            line

        if genes[0] == last_gene:
            c.duplicates += 1
            continue

        c.output += 1
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))
        last_gene = genes[0]

    outf.close()
    os.unlink(tmpfile)

    E.info("aggregateWindowsTagCounts: %s" % c)
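A quick check of the column arithmetic above. The infiles are bedtools coverage outputs, i.e. the original bed columns plus four appended count columns, and the wanted value is the first appended column:

# coverage output columns -> awk column selected (bed4, bed6, bed12 input)
for coverage_columns, expected in ((8, 5), (10, 7), (16, 13)):
    column = coverage_columns - 4 + 1
    assert column == expected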
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-m", "--min-overlap", dest="min_overlap", type="float", help="minimum overlap [%default]") parser.add_option("-a", "--bam-file", dest="filename_bam", metavar="bam", type="string", help="bam-file to use (required) [%default]") parser.add_option("-b", "--bed-file", dest="filename_bed", metavar="bed", type="string", help="bed-file to use (required) [%default]") parser.add_option("-s", "--sort-bed", dest="sort_bed", action="store_true", help="sort the bed file by chromosomal location before " "processing. " "[%default]") parser.add_option( "--assume-sorted", dest="sort_bed", action="store_false", help="assume that the bed-file is sorted by chromosomal location. " "[%default]") parser.add_option( "--split-intervals", dest="split_intervals", action="store_true", help="treat split BAM intervals, for example spliced intervals, " "as separate intervals. Note that a single alignment might be " "counted several times as a result. " "[%default]") parser.set_defaults( min_overlap=0.5, filename_bam=None, filename_bed=None, sort_bed=True, split_intervals=False, ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) filename_bam = options.filename_bam filename_bed = options.filename_bed if filename_bam is None and filename_bed is None: if len(args) != 2: raise ValueError( "please supply a bam and a bed file or two bed-files.") filename_bam, filename_bed = args if filename_bed is None: raise ValueError("please supply a bed file to compare to.") if filename_bam is None: raise ValueError("please supply a bam file to compare with.") E.info("intersecting the two files") min_overlap = options.min_overlap options.stdout.write("category\talignments\n") # get number of columns of reference bed file for bed in Bed.iterator(IOTools.open_file(filename_bed)): ncolumns_bed = bed.columns break E.info("assuming %s is bed%i format" % (filename_bed, ncolumns_bed)) if ncolumns_bed < 4: raise ValueError("please supply a name attribute in the bed file") # get information about if filename_bam.endswith(".bam"): format = "-abam" samfile = pysam.AlignmentFile(filename_bam, "rb") total = samfile.mapped # latest bedtools uses bed12 format when bam is input ncolumns_bam = 12 # count per read sort_key = lambda x: x.name else: format = "-a" total = IOTools.get_num_lines(filename_bam) # get bed format ncolumns_bam = 0 for bed in Bed.iterator(IOTools.open_file(filename_bam)): ncolumns_bam = bed.columns break if ncolumns_bam > 0: E.info("assuming %s is bed%i fomat" % (filename_bam, ncolumns_bam)) if ncolumns_bam == 3: # count per interval sort_key = lambda x: (x.contig, x.start, x.end) else: # count per interval category sort_key = lambda x: x.name # use fields for bam/bed file (regions to count with) data_fields = [ "contig", "start", "end", "name", "score", "strand", "thickstart", "thickend", "rgb", "blockcount", "blockstarts", "blockends" ][:ncolumns_bam] # add fields for second bed (regions to count in) data_fields.extend([ "contig2", "start2", "end2", "name2", "score2", "strand2", "thickstart2", "thickend2", "rgb2", "blockcount2", "blockstarts2", "blockends2" ][:ncolumns_bed]) # add bases overlap data_fields.append("bases_overlap") data = collections.namedtuple("data", data_fields) options.stdout.write("total\t%i\n" % 
total) if total == 0: E.warn("no data in %s" % filename_bam) return # SNS: sorting optional, off by default if options.sort_bed: bedcmd = "<( gunzip < %s | sort -k1,1 -k2,2n)" % filename_bed else: bedcmd = filename_bed if options.split_intervals: split = "-split" else: split = "" # IMS: newer versions of intersectBed have a very high memory # requirement unless passed sorted bed files. statement = """bedtools intersect %(format)s %(filename_bam)s -b %(bedcmd)s %(split)s -sorted -bed -wo -f %(min_overlap)f""" % locals() E.info("starting counting process: %s" % statement) proc = E.run(statement, return_popen=True, stdout=subprocess.PIPE) E.info("counting") counts_per_alignment = collections.defaultdict(int) take_columns = len(data._fields) def iterate(infile): for line in infile: if not line.strip(): continue yield data._make(line[:-1].split()[:take_columns]) for read, overlaps in itertools.groupby(iterate( IOTools.force_str(proc.stdout)), key=sort_key): annotations = [x.name2 for x in overlaps] for anno in annotations: counts_per_alignment[anno] += 1 for key, counts in sorted(counts_per_alignment.items()): options.stdout.write("%s\t%i\n" % (key, counts)) # write footer and output benchmark information. E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-g", "--gtf-file", dest="filename_gtf", type="string", help="filename with gene models in gtf format [%default]") parser.add_option( "-m", "--filename-mismapped", dest="filename_mismapped", type="string", help="output bam file for mismapped reads [%default]") parser.add_option( "-j", "--junctions-bed-file", dest="filename_junctions", type="string", help="bam file with reads mapped across junctions [%default]") parser.add_option( "-r", "--filename-regions", dest="filename_regions", type="string", help="filename with regions to remove in bed format [%default]") parser.add_option( "-t", "--transcripts-gtf-file", dest="filename_transcriptome", type="string", help="bam file with reads mapped against transcripts [%default]") parser.add_option( "-p", "--map-tsv-file", dest="filename_map", type="string", help="filename mapping transcript numbers (used by " "--filename-transciptome) to transcript names " "(used by --filename-gtf) [%default]") parser.add_option( "-s", "--filename-stats", dest="filename_stats", type="string", help="filename to output stats to [%default]") parser.add_option( "-o", "--colour", dest="colour_mismatches", action="store_true", help="mismatches will use colour differences (CM tag) [%default]") parser.add_option( "-i", "--ignore-mismatches", dest="ignore_mismatches", action="store_true", help="ignore mismatches [%default]") parser.add_option( "-c", "--remove-contigs", dest="remove_contigs", type="string", help="','-separated list of contigs to remove [%default]") parser.add_option( "-f", "--force-output", dest="force", action="store_true", help="force overwriting of existing files [%default]") parser.add_option("-u", "--unique", dest="unique", action="store_true", help="remove reads not matching uniquely [%default]") parser.add_option("--output-sam", dest="output_sam", action="store_true", help="output in sam format [%default]") parser.set_defaults( filename_gtf=None, filename_mismapped=None, filename_junctions=None, filename_transcriptome=None, filename_map=None, remove_contigs=None, force=False, unique=False, colour_mismatches=False, ignore_mismatches=False, output_sam=False, filename_table=None, ) # add common options (-h/--help, ...) 
and parse command line (options, args) = E.Start(parser, argv=argv) if len(args) != 1: raise ValueError("please supply one bam file") bamfile_genome = args[0] genome_samfile = pysam.Samfile(bamfile_genome, "rb") if options.remove_contigs: options.remove_contigs = options.remove_contigs.split(",") if options.filename_map: E.info("reading map") id_map = IOTools.readMap( IOTools.openFile(options.filename_map), has_header=True) id_map = dict([(y, x) for x, y in id_map.iteritems()]) else: id_map = None transcripts = {} if options.filename_gtf: E.info("indexing geneset") mapped, missed = 0, 0 for gtf in GTF.transcript_iterator( GTF.iterator(IOTools.openFile(options.filename_gtf))): gtf.sort(key=lambda x: x.start) transcript_id = gtf[0].transcript_id if id_map: try: transcript_id = id_map[transcript_id] mapped += 1 except KeyError: missed += 1 continue transcripts[transcript_id] = gtf E.info("read %i transcripts from geneset (%i mapped, %i missed)" % (len(transcripts), mapped, missed)) regions_to_remove = None if options.filename_regions: E.info("indexing regions") regions_to_remove = IndexedGenome.Simple() for bed in Bed.iterator(IOTools.openFile(options.filename_regions)): regions_to_remove.add(bed.contig, bed.start, bed.end) E.info("read %i regions" % len(regions_to_remove)) if options.filename_transcriptome: transcripts_samfile = pysam.Samfile(options.filename_transcriptome, "rb") else: transcripts_samfile = None if options.output_sam: output_samfile = pysam.Samfile("-", "wh", template=genome_samfile) else: output_samfile = pysam.Samfile("-", "wb", template=genome_samfile) if options.filename_mismapped: if not options.force and os.path.exists(options.filename_mismapped): raise IOError("output file %s already exists" % options.filename_mismapped) output_mismapped = pysam.Samfile(options.filename_mismapped, "wb", template=genome_samfile) else: output_mismapped = None if options.filename_junctions: junctions_samfile = pysam.Samfile(options.filename_junctions, "rb") else: junctions_samfile = None c = _bams2bam.filter(genome_samfile, output_samfile, output_mismapped, transcripts_samfile, junctions_samfile, transcripts, regions=regions_to_remove, unique=options.unique, remove_contigs=options.remove_contigs, colour_mismatches=options.colour_mismatches, ignore_mismatches=options.ignore_mismatches, ignore_transcripts=transcripts_samfile is None, ignore_junctions=junctions_samfile is None) if options.filename_stats: outf = IOTools.openFile(options.filename_stats, "w") outf.write("category\tcounts\n%s\n" % c.asTable()) outf.close() if options.filename_transcriptome: transcripts_samfile.close() genome_samfile.close() output_samfile.close() if output_mismapped: output_mismapped.close() # write footer and output benchmark information. E.Stop()
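The region filter above relies on IndexedGenome.Simple; a small sketch of its add/contains interface, as used here and in the interval-counting script earlier (coordinates illustrative):

# Build a tiny index and test containment, mirroring the usage above.
regions = IndexedGenome.Simple()
regions.add("chr1", 1000, 2000)
print(regions.contains("chr1", 1500, 1600))  # overlapping query: True
print(regions.contains("chr2", 0, 100))      # different contig: False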
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gtf2table.py 2888 2010-04-07 08:48:36Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-b", "--bam-file", dest="bam_files", type="string",
        help="filename with read mapping information. Multiple files can "
        "be submitted in a comma-separated list [default=%default].")

    parser.add_option(
        "--control-bam-file", dest="control_bam_files", type="string",
        help="filename with read mapping information for input/control. "
        "Multiple files can be submitted in a comma-separated list "
        "[default=%default].")

    parser.add_option(
        "--filename-format", dest="filename_format", type="choice",
        choices=("bed", "gff", "gtf"),
        help="format of secondary stream [default=%default].")

    parser.add_option(
        "-c", "--counter", dest="counters", type="choice", action="append",
        choices=("overlap", "peaks", "composition-na", "composition-cpg",
                 "classifier-chipseq"),
        help="select counters to apply [default=%default].")

    parser.add_option(
        "-o", "--offset", dest="offsets", type="int", action="append",
        help="tag offsets for tag counting - supply as many as there "
        "are bam-files [default=%default].")

    parser.add_option(
        "--control-offset", dest="control_offsets", type="int",
        action="append",
        help="control tag offsets for tag counting - supply as many as "
        "there are bam-files [default=%default].")

    parser.add_option(
        "-a", "--all-fields", dest="all_fields", action="store_true",
        help="output all fields in original bed file, by default only "
        "the first 4 are output [default=%default].")

    parser.add_option(
        "--bed-headers", dest="bed_headers", type="string",
        help="supply ',' separated list of headers for bed component "
        "[default=%default].")

    parser.add_option(
        "-f", "--filename-gff", dest="filename_gff", type="string",
        action="append", metavar='bed',
        help="filename with extra gff files. The order is important "
        "[default=%default].")

    parser.set_defaults(
        genome_file=None,
        counters=[],
        bam_files=None,
        offsets=[],
        control_bam_files=None,
        control_offsets=[],
        all_fields=False,
        filename_format=None,
        bed_headers="contig,start,end,name",
        filename_gff=[],
    )

    (options, args) = E.Start(parser)

    # get files
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.bam_files:
        bam_files = []
        for bamfile in options.bam_files.split(","):
            bam_files.append(pysam.Samfile(bamfile, "rb"))
    else:
        bam_files = None

    if options.control_bam_files:
        control_bam_files = []
        for bamfile in options.control_bam_files.split(","):
            control_bam_files.append(pysam.Samfile(bamfile, "rb"))
    else:
        control_bam_files = None

    counters = []

    for c in options.counters:
        if c == "overlap":
            counters.append(CounterOverlap(filename=options.filename_bed,
                                           fasta=fasta,
                                           options=options))
        elif c == "peaks":
            counters.append(CounterPeaks(bam_files,
                                         options.offsets,
                                         control_bam_files,
                                         options.control_offsets,
                                         options=options))
        elif c == "composition-na":
            counters.append(CounterCompositionNucleotides(fasta=fasta,
                                                          options=options))
        elif c == "composition-cpg":
            counters.append(CounterCompositionCpG(fasta=fasta,
                                                  options=options))
        elif c == "classifier-chipseq":
            counters.append(ClassifierChIPSeq(
                filename_gff=options.filename_gff,
                fasta=fasta,
                options=options,
                prefix=None))

    options.stdout.write("\t".join(
        [x.strip() for x in options.bed_headers.split(",")]))
    options.stdout.write("\t" + "\t".join(
        [x.getHeader() for x in counters]) + "\n")

    for bed in Bed.iterator(options.stdin):
        for counter in counters:
            counter.update(bed)

        if options.all_fields:
            options.stdout.write(str(bed))
        else:
            options.stdout.write("\t".join(
                (bed.contig, str(bed.start), str(bed.end), bed.fields[0])))
        for counter in counters:
            options.stdout.write("\t%s" % str(counter))

        options.stdout.write("\n")

    E.Stop()
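Counters plugged into this loop only need three things: getHeader(), update(bed) and a str() rendering. A minimal hypothetical counter to illustrate the protocol:

# Hypothetical counter following the protocol used above.
class CounterLength:
    '''report the length of each bed interval.'''

    def __init__(self):
        self.length = 0

    def getHeader(self):
        return "length"

    def update(self, bed):
        self.length = bed.end - bed.start

    def __str__(self):
        return str(self.length)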
def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: bed2gff.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf.")

    parser.add_option(
        "-f", "--id-format", dest="id_format", type="string",
        help="format for numeric identifier if --as-gtf is set and no "
        "name in bed file [%default].")

    parser.set_defaults(as_gtf=False,
                        id_format="%08i",
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    as_gtf = options.as_gtf
    id_format = options.id_format

    # gff and gtf output are both built on a GTF.Entry
    gff = GTF.Entry()
    gff.source = "bed"
    gff.feature = "exon"

    ninput, noutput, nskipped = 0, 0, 0

    id = 0
    for bed in Bed.iterator(options.stdin):
        ninput += 1

        gff.contig = bed.contig
        gff.start = bed.start
        gff.end = bed.end

        if bed.fields and len(bed.fields) >= 3:
            gff.strand = bed.fields[2]
        else:
            gff.strand = "."

        if bed.fields and len(bed.fields) >= 2:
            gff.score = bed.fields[1]

        if as_gtf:
            if bed.fields:
                gff.gene_id = bed.fields[0]
                gff.transcript_id = bed.fields[0]
            else:
                id += 1
                gff.gene_id = id_format % id
                gff.transcript_id = id_format % id
        else:
            if bed.fields:
                gff.source = bed.fields[0]

        options.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-m", "--method", dest="method", type="choice", choices=["ave_dist", "min_dist", "corr"], default="min_dist", help="Method for calcuating similarity between profiles") parser.add_option("-s", "--spread", dest="spread", type="int", default=10, help="Amount to spread each tag by") parser.add_option("-k", "--keep-dist", dest="keep_dist", action="store_true", help="Keep the distribution of tag depths") parser.add_option("-r", "--rands", dest="rands", default=100, help="Number of randomisations to use for calculating" " mean and stdev of distance") # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) profile1_file, profile2_file = args profile1_file = pysam.AlignmentFile(profile1_file) if profile2_file.endswith("bed") or profile2_file.endswith("bed.gz"): profile2_file = Bed.readAndIndex(profile2_file, with_values=True) profile2_counter = bed_counter else: profile2_file = pysam.AlignmentFile(profile2_file) profile2_counter = iCLIP.count_intervals if options.method == "min_dist": distance_func = iCLIP.findMinDistance elif options.method == "ave_dist": distance_func = iCLIP.calcAverageDistance else: def distance_func(profile1, profile2): return iCLIP.corr_profile(profile1, profile2, options.spread, profile2_ready=True) for exon in GTF.iterator(options.stdin): if exon.feature != "exon": continue contig = exon.contig strand = exon.strand transcript_id = exon.transcript_id start = exon.start end = exon.end profile1 = iCLIP.count_intervals(profile1_file, [(start, end)], contig=contig, strand=strand) profile2 = profile2_counter(profile2_file, [(start, end)], contig=contig, strand=strand) if profile1.sum() == 0 or profile2.sum() == 0: z = "NA" distance = "NA" options.stdout.write( "%(contig)s\t%(start)i\t%(end)i\t%(transcript_id)s\t%(strand)s\t%(distance)s\t%(z)s\n" % locals()) continue if options.method == "corr": profile2 = iCLIP.spread(profile2, options.spread) distance = distance_func(profile1, profile2) rands = iCLIP.rand_apply(profile=profile1, exon=exon, n=options.rands, func=distance_func, keep_dist=options.keep_dist, profile2=profile2) z = (distance - rands.mean()) / rands.std() options.stdout.write( "%(contig)s\t%(start)i\t%(end)i\t%(transcript_id)s\t%(strand)s\t%(distance).3f\t%(z).2f\n" % locals()) # write footer and output benchmark information. E.Stop()
def readSegments(infile, indexed_workspace,
                 truncate=False,
                 format="gtf",
                 keep_ambiguous=False,
                 remove_overhangs=False):
    """read segments from infile.

    segments not overlapping with indexed_workspace are removed.

    If *truncate* is given, segments extending beyond the workspace
    are truncated.

    returns a list of segments for each contig in a dictionary
    """
    counter = E.Counter()

    segments = collections.defaultdict(list)

    def addSegment(contig, start, end, counter):
        if contig in indexed_workspace:
            r = indexed_workspace[contig].find(start, end)
            if not r:
                counter.nskipped += 1
                return
            if len(r) > 1:
                counter.nambiguous += 1
                if not keep_ambiguous:
                    return
            if truncate:
                for x in r:
                    wstart, wend = x.start, x.end
                    rstart, rend = max(start, wstart), min(end, wend)
                    if start < wstart or end > wend:
                        counter.ntruncated += 1
                    segments[contig].append((rstart, rend))
                    counter.added += 1
            elif remove_overhangs:
                for x in r:
                    wstart, wend = x.start, x.end
                    rstart, rend = max(start, wstart), min(end, wend)
                    if start < wstart or end > wend:
                        counter.overhangs += 1
                        break
                else:
                    segments[contig].append((start, end))
            else:
                segments[contig].append((start, end))
                counter.added += 1

            counter.nkept += 1

    if format == "gtf":
        gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(infile))

        for gene in gtf_iterator:
            # get start and end ignoring introns
            # contig, start, end = gene[0].contig, min([x.start for x in gene]), max([x.end for x in gene])

            contig, coords = gene[0].contig, [(x.start, x.end) for x in gene]
            counter.ninput += 1
            for start, end in coords:
                addSegment(contig, start, end, counter)

    elif format == "bed":
        bed_iterator = Bed.iterator(infile)
        for bed in bed_iterator:
            counter.ninput += 1
            addSegment(bed.contig, bed.start, bed.end, counter)

    E.info("read segments: %s" % str(counter))

    return segments
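A hedged usage sketch for readSegments, truncating bed segments to an indexed workspace (filenames illustrative):

workspace = Bed.readAndIndex(IOTools.openFile("workspace.bed.gz"))
with IOTools.openFile("segments.bed.gz") as infile:
    segments = readSegments(infile, workspace, truncate=True, format="bed")
for contig, intervals in sorted(segments.items()):
    print("%s\t%i" % (contig, len(intervals)))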