def test_chromsizes():
    """
    Check chromsizes lookup and writing chromsizes to a file.

    Invalid `mysql` / `timeout` arguments to `get_chromsizes_from_ucsc` must
    raise.  The remaining checks run inside a try/except so that an OSError
    only prints a notice instead of failing the test.
    """
    assert_raises(OSError, pybedtools.get_chromsizes_from_ucsc, 'dm3',
                  mysql='wrong path')
    assert_raises(ValueError, pybedtools.get_chromsizes_from_ucsc, 'dm3',
                  timeout=0)
    try:
        # print(...) with a single argument is valid on both Python 2 and 3,
        # unlike the bare print statement.
        print(pybedtools.chromsizes('dm3'))
        print(pybedtools.get_chromsizes_from_ucsc('dm3'))
        assert pybedtools.chromsizes('dm3') \
            == pybedtools.get_chromsizes_from_ucsc('dm3')

        hg17 = pybedtools.chromsizes('hg17')
        assert hg17['chr1'] == (0, 245522847)

        fn = pybedtools.chromsizes_to_file(hg17, fn='hg17.genome')
        expected = 'chr1\t245522847\n'
        # Close the handle explicitly rather than relying on garbage
        # collection.
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        # make sure the tempfile version works, too
        fn = pybedtools.chromsizes_to_file(hg17, fn=None)
        expected = 'chr1\t245522847\n'
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        assert_raises(OSError,
                      pybedtools.get_chromsizes_from_ucsc,
                      **dict(genome='hg17', mysql='nonexistent'))
    except OSError:
        sys.stdout.write(
            "mysql error -- test for chromsizes from UCSC didn't run")
    finally:
        # Remove the named output even when an assertion above fails, so a
        # failed run does not leave the file in the working directory.
        if os.path.exists('hg17.genome'):
            os.unlink('hg17.genome')
def test_issue_145():
    """
    Test named for issue 145: `genome_coverage` output produced with the
    `5` option cannot be printed as a BED file, but its underlying file can
    be read line by line.

    Writes `foo.bed` and `genome.txt` in the current working directory.
    """
    x = pybedtools.BedTool(
        """
    chr1 1 100 feature1 0 +
    chr1 1 100 feature1 0 +
    """,
        from_string=True,
    ).saveas("foo.bed")
    g = pybedtools.chromsizes_to_file({"chr1": (0, 200)}, "genome.txt")
    y = x.genome_coverage(g=g, **{"5": True})

    # trying to print causes pybedtools to interpret as a BED file, but it's
    # a histogram so line 2 raises error
    with pytest.raises(pybedtools.MalformedBedLineError):
        print(y)

    # solution is to iterate over lines of file; make sure this works
    # (the handle is closed explicitly instead of being left to the GC)
    with open(y.fn) as handle:
        for line in handle:
            print(line)

    # if streaming, iterate over y.fn directly:
    # NOTE(review): stream=True is not passed here, so y.fn is presumably a
    # filename string and this loop walks its characters -- confirm intent.
    y = x.genome_coverage(g=g, **{"5": True})
    for line in y.fn:
        print(line)
def bam_to_bigwig(bam, genome, output, scale=False):
    """
    Given a BAM file `bam` and assembly `genome`, create a bigWig file.

    With a truthy `scale`, values are scaled to reads per million mapped
    reads; with scale=False (the default) values indicate number of reads.

    Assumes that `bedGraphToBigWig` from UCSC tools is installed; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    Parameters
    ----------
    bam : passed to `pybedtools.BedTool` and, when scaling, to
        `mapped_read_count`
    genome : passed to `pybedtools.chromsizes`
    output : str
        Filename of the bigWig to create
    scale : bool
        If truthy, scale coverage by 1 / (mapped reads / 1e6)

    Raises
    ------
    ValueError
        If `bedGraphToBigWig` exits with a non-zero status.
    """
    # Imported locally because the module-level imports are not visible from
    # this block.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    kwargs = dict(bg=True, split=True, g=genome_file)
    if scale:
        readcount = mapped_read_count(bam)
        _scale = 1 / (readcount / 1e6)
        kwargs['scale'] = _scale
    x = pybedtools.BedTool(bam).genome_coverage(**kwargs)
    cmds = ['bedGraphToBigWig', x.fn, genome_file, output]

    # Pass the arguments as a list (no shell) so that paths containing spaces
    # or shell metacharacters are handled correctly, and surface failures
    # instead of silently ignoring the exit status.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         universal_newlines=True)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr: %s\nstdout: %s'
                         % (' '.join(cmds), stderr, stdout))
def bam2bigwig(bam, bigwig, genome, scale=1e6, verbose=False):
    """
    Uses BEDTools to go from BAM to bedgraph, then bedGraphToBigWig to get the
    final bigwig.

    :param bam: Input BAM filename
    :param bigwig: Output filename of bigWig to create
    :param genome: Assembly name passed to `pybedtools.chromsizes`
    :param scale: If not None, coverage is multiplied by
        `scale / total_reads`, where `total_reads` comes from
        `samtools view -F 0x4 -c`; with the default 1e6 this gives reads per
        million.  If None, coverage is left unscaled.
    :param verbose: Print messages to stderr
    :raises ValueError: if `samtools` or `bedGraphToBigWig` exits with a
        non-zero status, or if the BAM has no counted reads to scale by

    The intermediate bedGraph is moved to `bedgraph.bedgraph` in the current
    working directory and its filename is printed to stdout.
    """
    coverage_kwargs = dict(bg=True)
    if scale is not None:
        cmds = ['samtools', 'view', '-F', '0x4', '-c', bam]
        p = subprocess.Popen(cmds, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        if p.returncode:
            raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                             % (' '.join(cmds), stderr, stdout))
        total_reads = float(stdout)
        if verbose:
            sys.stderr.write('%s total reads\n' % total_reads)
            sys.stderr.flush()
        if not total_reads:
            raise ValueError(
                'no reads counted in %s; cannot scale coverage' % bam)
        reads_per_scale = total_reads / scale
        # The original computed reads_per_scale but then handed the raw
        # `scale` (1e6 by default) to bedtools, multiplying coverage by a
        # million.  The multiplier is the inverse of reads-per-`scale`.
        coverage_kwargs['scale'] = 1 / reads_per_scale

    chromsizes = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    coverage_kwargs['g'] = chromsizes

    t0 = time.time()
    bedgraph = pybedtools.BedTool(bam)\
        .genome_coverage(**coverage_kwargs)\
        .moveto('bedgraph.bedgraph')
    # print(...) with one argument works on both Python 2 and 3
    print(bedgraph.fn)
    if verbose:
        sys.stderr.write('Completed bedGraph in %.1fs\n' % (time.time() - t0))
        sys.stderr.flush()

    cmds = ['bedGraphToBigWig', bedgraph.fn, chromsizes, bigwig]
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        # Do not report completion when the conversion failed.
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    if verbose:
        sys.stderr.write('Completed bigWig %s\n' % bigwig)
        sys.stderr.flush()
def bam_to_bigwig(bam, genome, output, scale=False):
    """
    Given a BAM file `bam` and assembly `genome`, create a bigWig file.

    With a truthy `scale`, values are scaled to reads per million mapped
    reads; with scale=False (the default) values indicate number of reads.

    Assumes that `bedGraphToBigWig` from UCSC tools is installed; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    If the first conversion fails and its stderr mentions "bedSort", the
    bedGraph is sorted and the conversion is retried once.

    Parameters
    ----------
    bam : passed to `pybedtools.BedTool` and, when scaling, to
        `mapped_read_count`
    genome : passed to `pybedtools.chromsizes`
    output : str
        Filename of the bigWig to create
    scale : bool
        If truthy, scale coverage by 1 / (mapped reads / 1e6)

    Raises
    ------
    FileNotFoundError
        If the `bedGraphToBigWig` executable cannot be found.
    ValueError
        If the (possibly retried) conversion exits with a non-zero status.
    """

    def _run(cmds):
        # Run one conversion attempt and return (returncode, stdout, stderr).
        # Both attempts execute bedGraphToBigWig, so both report that tool
        # (not bedSort) when the executable is missing.
        try:
            p = subprocess.Popen(
                cmds,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
            )
            stdout, stderr = p.communicate()
        except FileNotFoundError:
            raise FileNotFoundError(
                "bedGraphToBigWig was not found on the path. This is an "
                "external tool from UCSC which can be downloaded from "
                "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatively, "
                "use `conda install ucsc-bedgraphtobigwig`")
        return p.returncode, stdout, stderr

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    kwargs = dict(bg=True, split=True, g=genome_file)
    if scale:
        readcount = mapped_read_count(bam)
        _scale = 1 / (readcount / 1e6)
        kwargs["scale"] = _scale
    x = pybedtools.BedTool(bam).genome_coverage(**kwargs)
    cmds = ["bedGraphToBigWig", x.fn, genome_file, output]

    returncode, stdout, stderr = _run(cmds)

    if returncode and "bedSort" in stderr:
        print("BAM header was not sorted; sorting bedGraph")
        y = x.sort()
        cmds[1] = y.fn
        returncode, stdout, stderr = _run(cmds)

    if returncode:
        raise ValueError("cmds: %s\nstderr: %s\nstdout: %s"
                         % (" ".join(cmds), stderr, stdout))
def wig_to_bigwig(wig, genome, output):
    """
    Convert `wig` to bigWig by running UCSC's `wigToBigWig`.

    Parameters
    ----------
    wig : object with an `fn` attribute giving the input filename
    genome : passed to `pybedtools.chromsizes`
    output : str
        Filename of the bigWig to create

    Returns
    -------
    `output`, unchanged.

    Raises
    ------
    ValueError
        If `wigToBigWig` exits with a non-zero status.
    """
    # Imported locally because the module-level imports are not visible from
    # this block.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['wigToBigWig', wig.fn, genome_file, output]

    # Pass the arguments as a list (no shell) so that paths containing spaces
    # or shell metacharacters are handled correctly, and surface failures
    # instead of silently ignoring the exit status.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         universal_newlines=True)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
def bedgraph_to_bigwig(bedgraph, genome, output):
    """
    Convert `bedgraph` to bigWig by running UCSC's `bedGraphToBigWig`.

    Parameters
    ----------
    bedgraph : object with an `fn` attribute giving the input filename
    genome : passed to `pybedtools.chromsizes`
    output : str
        Filename of the bigWig to create

    Returns
    -------
    `output`, unchanged.

    Raises
    ------
    ValueError
        If `bedGraphToBigWig` exits with a non-zero status.
    """
    # Imported locally because the module-level imports are not visible from
    # this block.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['bedGraphToBigWig', bedgraph.fn, genome_file, output]

    # Pass the arguments as a list (no shell) so that paths containing spaces
    # or shell metacharacters are handled correctly, and surface failures
    # instead of silently ignoring the exit status.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         universal_newlines=True)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
def bedgraph_to_bigwig(bedgraph, genome, output):
    """
    Run `bedGraphToBigWig` on `bedgraph.fn` and return `output`.

    The chromsizes for `genome` are written to a file which is handed to the
    tool together with the input and output filenames.  A non-zero exit
    status raises ValueError carrying the command line, stderr and stdout.
    """
    chromsizes_fn = pybedtools.chromsizes_to_file(
        pybedtools.chromsizes(genome))
    command = ['bedGraphToBigWig', bedgraph.fn, chromsizes_fn, output]

    proc = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    out, err = proc.communicate()

    # Success path first; anything else is reported with full context.
    if not proc.returncode:
        return output
    raise ValueError(
        "cmds: %s\nstderr:%s\nstdout:%s" % (" ".join(command), err, out))
def wig_to_bigwig(wig, genome, output):
    """
    Convert `wig` to bigWig by running UCSC's `wigToBigWig`.

    Parameters
    ----------
    wig : object with an `fn` attribute giving the input filename
    genome : passed to `pybedtools.chromsizes`
    output : str
        Filename of the bigWig to create

    Returns
    -------
    `output`, unchanged.

    Raises
    ------
    ValueError
        If `wigToBigWig` exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['wigToBigWig', wig.fn, genome_file, output]
    # Bind the process to `p`: the original discarded the Popen object, so the
    # following `p.communicate()` raised NameError on every call.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
def bigbed(
    x,
    genome,
    output,
    blockSize=256,
    itemsPerSlot=512,
    bedtype=None,
    _as=None,
    unc=False,
    tab=False,
):
    """
    Write `x` as a bigBed file with `bedToBigBed` and return `output`.

    `x` may be a BedTool or a string (which is wrapped in a BedTool); if its
    `fn` is not a string it is first saved to a file.  `genome` is an
    assembly string and `output` is the name of the bigBed file to create.

    The remaining arguments map onto bedToBigBed options: `blockSize`,
    `itemsPerSlot`, `bedtype` ("-type="; when None it is derived from the
    field count of `x`), `_as` ("-as="), `unc` ("-unc") and `tab` ("-tab").

    Assumes that a recent version of bedToBigBed from UCSC is on the path.
    Raises ValueError if the tool exits with a non-zero status.
    """
    bedtool = pybedtools.BedTool(x) if isinstance(x, six.string_types) else x
    if not isinstance(bedtool.fn, six.string_types):
        bedtool = bedtool.saveas()

    genome_fn = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))

    type_arg = bedtype
    if type_arg is None:
        type_arg = "bed%s" % bedtool.field_count()

    command = ["bedToBigBed", bedtool.fn, genome_fn, output]
    command.append("-blockSize=%s" % blockSize)
    command.append("-itemsPerSlot=%s" % itemsPerSlot)
    command.append("-type=%s" % type_arg)

    # Simple on/off switches, in the order the tool receives them.
    for enabled, flag in ((unc, "-unc"), (tab, "-tab")):
        if enabled:
            command.append(flag)
    if _as:
        command.append("-as=%s" % _as)

    proc = subprocess.Popen(
        command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    if not proc.returncode:
        return output
    raise ValueError(
        "cmds: %s\nstderr:%s\nstdout:%s" % (" ".join(command), err, out))
def bam_to_bigwig(bam, genome, output, scale=False):
    """
    Given a BAM file `bam` and assembly `genome`, create a bigWig file.

    With a truthy `scale`, values are scaled to reads per million mapped
    reads; with scale=False (the default) values indicate number of reads.

    Assumes that `bedGraphToBigWig` from UCSC tools is installed; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    If the first conversion fails and its stderr mentions "bedSort", the
    bedGraph is sorted and the conversion is retried once.

    Parameters
    ----------
    bam : passed to `pybedtools.BedTool` and, when scaling, to
        `mapped_read_count`
    genome : passed to `pybedtools.chromsizes`
    output : str
        Filename of the bigWig to create
    scale : bool
        If truthy, scale coverage by 1 / (mapped reads / 1e6)

    Raises
    ------
    FileNotFoundError
        If the `bedGraphToBigWig` executable cannot be found.
    ValueError
        If the (possibly retried) conversion exits with a non-zero status.
    """

    def _run(cmds):
        # Run one conversion attempt and return (returncode, stdout, stderr).
        # Both attempts execute bedGraphToBigWig, so both report that tool
        # (not bedSort) when the executable is missing.
        try:
            p = subprocess.Popen(cmds, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 universal_newlines=True)
            stdout, stderr = p.communicate()
        except FileNotFoundError:
            raise FileNotFoundError(
                "bedGraphToBigWig was not found on the path. This is an "
                "external tool from UCSC which can be downloaded from "
                "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatively, "
                "use `conda install ucsc-bedgraphtobigwig`"
            )
        return p.returncode, stdout, stderr

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    kwargs = dict(bg=True, split=True, g=genome_file)
    if scale:
        readcount = mapped_read_count(bam)
        _scale = 1 / (readcount / 1e6)
        kwargs['scale'] = _scale
    x = pybedtools.BedTool(bam).genome_coverage(**kwargs)
    cmds = ['bedGraphToBigWig', x.fn, genome_file, output]

    returncode, stdout, stderr = _run(cmds)

    if returncode and 'bedSort' in stderr:
        print('BAM header was not sorted; sorting bedGraph')
        y = x.sort()
        cmds[1] = y.fn
        returncode, stdout, stderr = _run(cmds)

    if returncode:
        raise ValueError('cmds: %s\nstderr: %s\nstdout: %s'
                         % (' '.join(cmds), stderr, stdout))
def wig_to_bigwig(wig, genome, output):
    """
    Convert `wig` to bigWig by running UCSC's `wigToBigWig`.

    Parameters
    ----------
    wig : object with an `fn` attribute giving the input filename
    genome : passed to `pybedtools.chromsizes`
    output : str
        Filename of the bigWig to create

    Returns
    -------
    `output`, unchanged.

    Raises
    ------
    ValueError
        If `wigToBigWig` exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = [
        'wigToBigWig',
        wig.fn,
        genome_file,
        output]
    # Bind the process to `p`: the original discarded the Popen object, so the
    # following `p.communicate()` raised NameError on every call.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
def test_chromsizes():
    """
    Check chromsizes lookup and writing chromsizes to a file.

    Invalid `mysql` / `fetchchromsizes` / `timeout` arguments to
    `get_chromsizes_from_ucsc` must raise.  The remaining checks run inside a
    try/except so that an OSError only prints a notice instead of failing the
    test.
    """
    with pytest.raises(OSError):
        pybedtools.get_chromsizes_from_ucsc(
            "dm3", mysql="wrong path", fetchchromsizes="wrongtoo")
    with pytest.raises(ValueError):
        pybedtools.get_chromsizes_from_ucsc("dm3", timeout=0)
    try:
        print(pybedtools.chromsizes("dm3"))
        print(pybedtools.get_chromsizes_from_ucsc("dm3"))
        assert pybedtools.chromsizes(
            "dm3") == pybedtools.get_chromsizes_from_ucsc("dm3")

        hg17 = pybedtools.chromsizes("hg17")
        assert hg17["chr1"] == (0, 245522847)

        fn = pybedtools.chromsizes_to_file(hg17, fn="hg17.genome")
        expected = "chr1\t245522847\n"
        # Close the handle explicitly rather than relying on garbage
        # collection.
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        # make sure the tempfile version works, too
        fn = pybedtools.chromsizes_to_file(hg17, fn=None)
        expected = "chr1\t245522847\n"
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        with pytest.raises(OSError):
            pybedtools.get_chromsizes_from_ucsc(**dict(
                genome="hg17", mysql="nonexistent", fetchchromsizes="missing"))
    except OSError:
        sys.stdout.write(
            "mysql error -- test for chromsizes from UCSC didn't run")
    finally:
        # Remove the named output even when an assertion above fails, so a
        # failed run does not leave the file in the working directory.
        if os.path.exists("hg17.genome"):
            os.unlink("hg17.genome")
def bedgraph_to_bigwig(bedgraph, genome, output):
    """
    Run `bedGraphToBigWig` on `bedgraph.fn` and return `output`.

    The chromsizes for `genome` are written to a file which is handed to the
    tool together with the input and output filenames.  A non-zero exit
    status raises ValueError carrying the command line, stderr and stdout.
    """
    sizes = pybedtools.chromsizes(genome)
    sizes_fn = pybedtools.chromsizes_to_file(sizes)

    argv = ['bedGraphToBigWig']
    argv.extend([bedgraph.fn, sizes_fn, output])

    pipes = dict(stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    child = subprocess.Popen(argv, **pipes)
    captured_out, captured_err = child.communicate()

    failed = bool(child.returncode)
    if failed:
        details = (" ".join(argv), captured_err, captured_out)
        raise ValueError("cmds: %s\nstderr:%s\nstdout:%s" % details)
    return output
def bedgraph2bigwig(bedgraph, bigwig, genome, verbose=False):
    """
    Create a bigWig from `bedgraph`.

    :param bedgraph:
        Input filename of bedgraph
    :param bigwig:
        Output filename of bigWig to create
    :param genome:
        String assembly name of genome
    :param verbose:
        Print messages to stderr
    :raises ValueError:
        If `bedGraphToBigWig` exits with a non-zero status
    """
    chromsizes = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['bedGraphToBigWig', bedgraph, chromsizes, bigwig]
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        # Without this check a failed conversion was silently ignored and,
        # in verbose mode, even reported as completed.
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    if verbose:
        sys.stderr.write('Completed bigWig %s\n' % bigwig)
        sys.stderr.flush()
def bigbed(x, genome, output, blockSize=256, itemsPerSlot=512,
           bedtype=None, _as=None, unc=False, tab=False):
    """
    Write `x` as a bigBed file with `bedToBigBed` and return `output`.

    `x` may be a BedTool or a string (which is wrapped in a BedTool); if its
    `fn` is not a string it is first saved to a file.  `genome` is an
    assembly string and `output` is the name of the bigBed file to create.

    The remaining arguments map onto bedToBigBed options: `blockSize`,
    `itemsPerSlot`, `bedtype` ("-type="; when None it is derived from the
    field count of `x`), `_as` ("-as="), `unc` ("-unc") and `tab` ("-tab").

    Assumes that a recent version of bedToBigBed from UCSC is on the path.
    Raises ValueError if the tool exits with a non-zero status.
    """
    bedtool = pybedtools.BedTool(x) if isinstance(x, str) else x
    if not isinstance(bedtool.fn, str):
        bedtool = bedtool.saveas()

    genome_fn = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))

    type_arg = bedtype
    if type_arg is None:
        type_arg = 'bed%s' % bedtool.field_count()

    command = ['bedToBigBed', bedtool.fn, genome_fn, output]
    command.append('-blockSize=%s' % blockSize)
    command.append('-itemsPerSlot=%s' % itemsPerSlot)
    command.append('-type=%s' % type_arg)

    # Simple on/off switches, in the order the tool receives them.
    for enabled, flag in ((unc, '-unc'), (tab, '-tab')):
        if enabled:
            command.append(flag)
    if _as:
        command.append('-as=%s' % _as)

    proc = subprocess.Popen(
        command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    if not proc.returncode:
        return output
    raise ValueError(
        "cmds: %s\nstderr:%s\nstdout:%s" % (" ".join(command), err, out))
def run_jaccard(Enrichment_Par):
    """
    Run a randomized Jaccard comparison of `Enrichment_Par.A` against
    `Enrichment_Par.B`.

    Reads the attributes `A`, `B`, `genome` and `n` (number of iterations)
    from `Enrichment_Par`.  Returns a dict with keys "jaccardp_value",
    "jaccard_obs" and "jaccard_exp", where the expected value is the mean of
    the randomized distribution.
    """
    params = Enrichment_Par

    # Keep only the first three columns (chrom, chromStart, chromEnd).
    columns = [0, 1, 2]
    trimmed_a = params.A.cut(columns)
    trimmed_b = params.B.cut(columns)

    chromsizes_fn = pybedtools.chromsizes_to_file(params.genome)
    outcome = trimmed_a.random_jaccard(
        trimmed_b,
        genome_fn=chromsizes_fn,
        iterations=params.n,
        shuffle_kwargs={'chrom': True},
    )
    observed = outcome[0]
    distribution = outcome[1]
    expected = numpy.mean(outcome[1])

    if expected == observed or (expected == 0 and observed == 0):
        p_value = 1
    else:
        # Fraction of randomized values above the observed one, folded so
        # that the smaller of the two tails is reported.
        n_above = sum(1 for value in distribution if value > observed)
        upper_tail = n_above / float(len(distribution))
        p_value = min(upper_tail, 1 - upper_tail)

    return {
        "jaccardp_value": p_value,
        "jaccard_obs": observed,
        "jaccard_exp": expected,
    }
def wig_to_bigwig(wig, genome, output):
    """
    Convert `wig` to bigWig by running UCSC's `wigToBigWig`.

    Parameters
    ----------
    wig : object with an `fn` attribute giving the input filename
    genome : passed to `pybedtools.chromsizes`
    output : str
        Filename of the bigWig to create

    Returns
    -------
    `output`, unchanged.

    Raises
    ------
    FileNotFoundError
        If the `wigToBigWig` executable cannot be found.
    ValueError
        If `wigToBigWig` exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ["wigToBigWig", wig.fn, genome_file, output]

    try:
        p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
    except FileNotFoundError:
        # Name the tool that is actually executed here; the previous message
        # pointed at bigWigToBedGraph and the bedGraphToBigWig conda package.
        raise FileNotFoundError(
            "wigToBigWig was not found on the path. This is an external "
            "tool from UCSC which can be downloaded from "
            "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatively, use "
            "`conda install ucsc-wigtobigwig`")

    if p.returncode:
        raise ValueError("cmds: %s\nstderr:%s\nstdout:%s"
                         % (" ".join(cmds), stderr, stdout))
    return output
def bam_to_bigwig(bam, genome, output):
    """
    Given a BAM file `bam` and assembly `genome`, create a bigWig file scaled
    such that the values represent scaled reads -- that is, reads per million
    mapped reads.

    Assumes that `bedGraphToBigWig` from UCSC tools is installed; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    Parameters
    ----------
    bam : passed to `pybedtools.BedTool` and `mapped_read_count`
    genome : passed to `pybedtools.chromsizes`
    output : str
        Filename of the bigWig to create

    Raises
    ------
    ValueError
        If `bedGraphToBigWig` exits with a non-zero status.
    """
    # Imported locally because the module-level imports are not visible from
    # this block.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    readcount = mapped_read_count(bam)
    scale = 1 / (readcount / 1e6)
    x = pybedtools.BedTool(bam)\
        .genome_coverage(bg=True, scale=scale, split=True, g=genome_file)
    cmds = ['bedGraphToBigWig', x.fn, genome_file, output]

    # Pass the arguments as a list (no shell) so that paths containing spaces
    # or shell metacharacters are handled correctly, and surface failures
    # instead of silently ignoring the exit status.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         universal_newlines=True)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr: %s\nstdout: %s'
                         % (' '.join(cmds), stderr, stdout))
def bam_to_bigwig(bam, genome, output, scale=False):
    """
    Build a bigWig of coverage from the BAM file `bam` for assembly `genome`.

    When `scale` is truthy the coverage is multiplied so that values are
    reads per million mapped reads; otherwise (scale=False, the default)
    values are plain read counts.

    Requires UCSC's `bedGraphToBigWig`; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for details on the
    format.  If the tool fails and mentions "bedSort" on stderr, the bedGraph
    is sorted and the conversion is attempted a second time.  A failure that
    remains raises ValueError with the command line, stderr and stdout.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))

    def _convert(bedgraph_fn):
        # One bedGraphToBigWig invocation; returns the command and its result.
        command = ['bedGraphToBigWig', bedgraph_fn, genome_file, output]
        proc = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        out, err = proc.communicate()
        return command, proc.returncode, out, err

    coverage_kwargs = {'bg': True, 'split': True, 'g': genome_file}
    if scale:
        coverage_kwargs['scale'] = 1 / (mapped_read_count(bam) / 1e6)
    bedgraph = pybedtools.BedTool(bam).genome_coverage(**coverage_kwargs)

    command, returncode, out, err = _convert(bedgraph.fn)

    if returncode and 'bedSort' in err:
        print('BAM header was not sorted; sorting bedGraph')
        # Keep a reference to the sorted BedTool while the tool runs.
        sorted_bedgraph = bedgraph.sort()
        command, returncode, out, err = _convert(sorted_bedgraph.fn)

    if returncode:
        raise ValueError(
            'cmds: %s\nstderr: %s\nstdout: %s' % (' '.join(command), err, out))
def wig_to_bigwig(wig, genome, output):
    """
    Convert `wig` to bigWig by running UCSC's `wigToBigWig`.

    Parameters
    ----------
    wig : object with an `fn` attribute giving the input filename
    genome : passed to `pybedtools.chromsizes`
    output : str
        Filename of the bigWig to create

    Returns
    -------
    `output`, unchanged.

    Raises
    ------
    FileNotFoundError
        If the `wigToBigWig` executable cannot be found.
    ValueError
        If `wigToBigWig` exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = [
        'wigToBigWig',
        wig.fn,
        genome_file,
        output]

    try:
        p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
    except FileNotFoundError:
        # Name the tool that is actually executed here; the previous message
        # pointed at bigWigToBedGraph and the bedGraphToBigWig conda package.
        raise FileNotFoundError(
            "wigToBigWig was not found on the path. This is an external "
            "tool from UCSC which can be downloaded from "
            "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatively, use "
            "`conda install ucsc-wigtobigwig`"
        )

    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
def test_issue_145():
    """
    Test named for issue 145: `genome_coverage` output produced with the
    `5` option cannot be printed as a BED file, but its underlying file can
    be read line by line.

    Writes `foo.bed` and `genome.txt` in the current working directory.
    """
    x = pybedtools.BedTool("""
    chr1 1 100 feature1 0 +
    chr1 1 100 feature1 0 +
    """, from_string=True).saveas('foo.bed')
    g = pybedtools.chromsizes_to_file({'chr1': (0, 200)}, 'genome.txt')
    y = x.genome_coverage(g=g, **{'5': True})

    # trying to print causes pybedtools to interpret as a BED file, but it's
    # a histogram so line 2 raises error
    with pytest.raises(pybedtools.MalformedBedLineError):
        print(y)

    # solution is to iterate over lines of file; make sure this works
    # (the handle is closed explicitly instead of being left to the GC)
    with open(y.fn) as handle:
        for line in handle:
            print(line)

    # if streaming, iterate over y.fn directly:
    # NOTE(review): stream=True is not passed here, so y.fn is presumably a
    # filename string and this loop walks its characters -- confirm intent.
    y = x.genome_coverage(g=g, **{'5': True})
    for line in y.fn:
        print(line)
import os import subprocess import pybedtools from trackhub import helpers data_dir = helpers.data_dir() chromsizes = pybedtools.genome_registry.dm3.euchromatic g = pybedtools.chromsizes_to_file(chromsizes) # Make some randomized bigBed files for i in range(3): x = ( pybedtools.BedTool("chr2L 0 10000000", from_string=True) .window_maker(g=g, w=1000 * (i + 1)) .shuffle(g=g, seed=i) .sort() ) out = os.path.join(data_dir, "random-dm3-%s.bigBed" % i) cmds = ["bedToBigBed", x.fn, g, out] p = subprocess.Popen(cmds) p.communicate() # make some sine waves for bigWigs import numpy as np def sine(factor):
#!/usr/bin/env python import os import subprocess import logging import hashlib import urllib import pybedtools import gffutils import metaseq logging.basicConfig(level=logging.DEBUG, format='[%(name)s] [%(asctime)s]: %(message)s') logger = logging.getLogger('metaseq data download') hg19 = pybedtools.chromsizes('hg19') genomes_file = pybedtools.chromsizes_to_file('hg19', 'hg19') usage = """ Downloads data from UCSC, GEO, and Ensembl. """ import argparse ap = argparse.ArgumentParser(usage=usage) ap.add_argument('--data-dir', default=metaseq.data_dir(), help='Location to store downloaded and prepped data. ' 'Default is %(default)s') args = ap.parse_args() CHROM = 'chr17' COORD = "%s:%s-%s" % (CHROM, 0, hg19[CHROM][-1])
#!/usr/bin/env python import os import subprocess import logging import hashlib import urllib import pybedtools import gffutils import metaseq logging.basicConfig( level=logging.DEBUG, format='[%(name)s] [%(asctime)s]: %(message)s') logger = logging.getLogger('metaseq data download') hg19 = pybedtools.chromsizes('hg19') genomes_file = pybedtools.chromsizes_to_file('hg19', 'hg19') usage = """ Downloads data from UCSC, GEO, and Ensembl. """ import argparse ap = argparse.ArgumentParser(usage=usage) ap.add_argument( '--data-dir', default=metaseq.data_dir(), help='Location to store downloaded and prepped data. ' 'Default is %(default)s') args = ap.parse_args() CHROM = 'chr17'
def parallel_apply(orig_bedtool, method, genome=None, genome_fn=None,
                   method_args=None, method_kwargs=None, shuffle_kwargs=None,
                   shuffle=True, reduce_func=None, processes=1, sort=False,
                   _orig_pool=None, iterations=1000, debug=False,
                   report_iterations=False):
    """
    Call an arbitrary BedTool method many times in parallel.

    An example use-case is to generate a null distribution of intersections,
    and then compare this to the actual intersections.

    **Important:** due to a known file handle leak in BedTool.__len__, it's
    best to simply check the number of lines in the file, as in the below
    function. This works because BEDTools programs strip any non-interval lines
    in the results.

    >>> # set up example BedTools
    >>> a = pybedtools.example_bedtool('a.bed')
    >>> b = pybedtools.example_bedtool('b.bed')

    >>> # Method of `a` to call:
    >>> method = 'intersect'

    >>> # Kwargs provided to `a.intersect` each iteration
    >>> method_kwargs = dict(b=b, u=True)

    >>> # Function that will be called on the results of
    >>> # `a.intersect(**method_kwargs)`.
    >>> def reduce_func(x):
    ...     return sum(1 for _ in open(x.fn))

    >>> # Create a small artificial genome for this test (generally you'd
    >>> # use an assembly name, like "hg19"):
    >>> genome = dict(chr1=(0, 1000))

    >>> # Do 10 iterations using 1 process for this test (generally you'd
    >>> # use 1000+ iterations, and as many processes as you have CPUs)
    >>> results = pybedtools.parallel.parallel_apply(a, method, genome=genome,
    ... method_kwargs=method_kwargs, iterations=10, processes=1,
    ... reduce_func=reduce_func, debug=True, report_iterations=True)

    >>> # get results
    >>> print(list(results))
    [2, 2, 3, 0, 3, 3, 0, 0, 2, 4]

    >>> # We can compare this to the actual intersection:
    >>> reduce_func(a.intersect(**method_kwargs))
    3

    Alternatively, we could use the `a.jaccard` method, which already does the
    reduction to a dictionary.  However, the Jaccard method requires the input
    to be sorted.  Here, we specify `sort=True` to sort each shuffled BedTool
    before calling its `jaccard` method.

    >>> from pybedtools.parallel import parallel_apply
    >>> a = pybedtools.example_bedtool('a.bed')
    >>> results = parallel_apply(a, method='jaccard', method_args=(b,),
    ... genome=genome, iterations=3, processes=1, sort=True, debug=True)
    >>> for i in results:
    ...     print(sorted(i.items()))
    [('intersection', 15), ('jaccard', 0.0238095), ('n_intersections', 2), ('union-intersection', 630)]
    [('intersection', 15), ('jaccard', 0.0238095), ('n_intersections', 2), ('union-intersection', 630)]
    [('intersection', 45), ('jaccard', 0.0818182), ('n_intersections', 1), ('union-intersection', 550)]

    Parameters
    ----------
    orig_bedtool : BedTool

    method : str
        The method of `orig_bedtool` to run

    method_args : tuple
        Passed directly to getattr(orig_bedtool, method)()

    method_kwargs : dict
        Passed directly to getattr(orig_bedtool, method)()

    shuffle : bool
        If True, then `orig_bedtool` will be shuffled at each iteration and
        that shuffled version's `method` will be called with `method_args` and
        `method_kwargs`.

    shuffle_kwargs : dict
        If `shuffle` is True, these are passed to `orig_bedtool.shuffle()`.
        You do not need to pass the genome here; that's handled separately by
        the `genome` and `genome_fn` kwargs.  The dict is not modified.

    iterations : int
        Number of iterations to perform

    genome : string or dict
        If string, then assume it is the assembly name (e.g., hg19) and get
        a dictionary of chromsizes for that assembly, then converts to
        a filename.

    genome_fn : str
        Mutually exclusive with `genome`; `genome_fn` must be an existing
        filename with the chromsizes.  Use the `genome` kwarg instead if you'd
        rather supply an assembly or dict.

    reduce_func : callable
        Function or other callable object that accepts, as its only argument,
        the results from `orig_bedtool.method()`.  For example, if you care
        about the number of results, then you can use `reduce_func=len`.

    processes : int
        Number of processes to run.  If `processes=1`, then multiprocessing is
        not used (making it much easier to debug).  This argument is ignored if
        `_orig_pool` is provided.

    sort : bool
        If both `shuffle` and `sort` are True, then the shuffled BedTool will
        then be sorted.  Use this if `method` requires sorted input.

    _orig_pool : multiprocessing.Pool instance
        If provided, uses `_orig_pool` instead of creating one.  In this case,
        `processes` will be ignored.  A pool supplied here is left open; a
        pool created internally is shut down when the generator finishes.

    debug : bool
        If True, then use the current iteration index as the seed to shuffle.

    report_iterations : bool
        If True, then report the number of iterations to stderr.  Only the
        multiprocessing path writes these progress messages.

    Yields
    ------
    One result per iteration, in iteration order.

    Raises
    ------
    ValueError
        If `method_args` is not a list or tuple, if both `genome` and
        `genome_fn` are given, or if `shuffle` is True and neither is given.
    """
    shuffle_kwargs = shuffle_kwargs or {}
    method_args = method_args or ()
    if not isinstance(method_args, (list, tuple)):
        raise ValueError(
            "method_args must be a list or tuple, got %s" % type(method_args))
    method_kwargs = method_kwargs or {}

    if genome_fn and genome:
        raise ValueError("only one of genome_fn or genome should be provided")

    if shuffle:
        if not genome_fn:
            if not genome:
                raise ValueError("shuffle=True, so either genome_fn"
                                 " or genome must be provided")
            genome_fn = pybedtools.chromsizes_to_file(genome)

    _parallel_wrap_kwargs = dict(
        orig_bedtool=orig_bedtool,
        shuffle_kwargs=shuffle_kwargs,
        genome_fn=genome_fn,
        method=method,
        method_args=method_args,
        method_kwargs=method_kwargs,
        shuffle=shuffle,
        reduce_func=reduce_func,
        sort=sort,
    )

    def add_seed(i, kwargs):
        # Return kwargs for iteration `i`.  A fresh dict is built for every
        # seeded iteration: apply_async serializes its arguments later, from
        # another thread, so mutating one shared dict could give several
        # queued tasks the same (latest) seed.  Copying also leaves the
        # caller's `shuffle_kwargs` untouched.
        if debug and shuffle:
            seeded = dict(kwargs)
            seeded['shuffle_kwargs'] = dict(kwargs['shuffle_kwargs'], seed=i)
            return seeded
        return kwargs

    if processes == 1:
        for it in range(iterations):
            yield _parallel_wrap(**add_seed(it, _parallel_wrap_kwargs))
        # A plain return ends the generator.  `raise StopIteration` inside a
        # generator body is converted to RuntimeError under PEP 479.
        return

    if _orig_pool:
        p = _orig_pool
        created_pool = False
    else:
        p = multiprocessing.Pool(processes)
        created_pool = True

    try:
        results = [
            p.apply_async(_parallel_wrap, (),
                          add_seed(it, _parallel_wrap_kwargs))
            for it in range(iterations)]
        for i, r in enumerate(results):
            yield r.get()
            if report_iterations:
                sys.stderr.write('%s\r' % i)
                sys.stderr.flush()
    finally:
        # Only release a pool created here; a caller-supplied pool stays
        # under the caller's control.
        if created_pool:
            p.terminate()
            p.join()
def parallel_apply(orig_bedtool, method, genome=None, genome_fn=None,
                   method_args=None, method_kwargs=None, shuffle_kwargs=None,
                   shuffle=True, reduce_func=None, processes=1, sort=False,
                   _orig_pool=None, iterations=1000, debug=False,
                   report_iterations=False):
    """
    Call an arbitrary BedTool method many times in parallel.

    An example use-case is to generate a null distribution of intersections,
    and then compare this to the actual intersections.

    **Important:** due to a known file handle leak in BedTool.__len__, it's
    best to simply check the number of lines in the file, as in the below
    function. This works because BEDTools programs strip any non-interval lines
    in the results.

    >>> # set up example BedTools
    >>> a = pybedtools.example_bedtool('a.bed')
    >>> b = pybedtools.example_bedtool('b.bed')

    >>> # Method of `a` to call:
    >>> method = 'intersect'

    >>> # Kwargs provided to `a.intersect` each iteration
    >>> method_kwargs = dict(b=b, u=True)

    >>> # Function that will be called on the results of
    >>> # `a.intersect(**method_kwargs)`.
    >>> def reduce_func(x):
    ...     return sum(1 for _ in open(x.fn))

    >>> # Create a small artificial genome for this test (generally you'd
    >>> # use an assembly name, like "hg19"):
    >>> genome = dict(chr1=(0, 1000))

    >>> # Do 10 iterations using 1 process for this test (generally you'd
    >>> # use 1000+ iterations, and as many processes as you have CPUs)
    >>> results = pybedtools.parallel.parallel_apply(a, method, genome=genome,
    ... method_kwargs=method_kwargs, iterations=10, processes=1,
    ... reduce_func=reduce_func, debug=True, report_iterations=True)

    >>> # get results
    >>> print(list(results))
    [2, 2, 3, 0, 3, 3, 0, 0, 2, 4]

    >>> # We can compare this to the actual intersection:
    >>> reduce_func(a.intersect(**method_kwargs))
    3

    Alternatively, we could use the `a.jaccard` method, which already does the
    reduction to a dictionary.  However, the Jaccard method requires the input
    to be sorted.  Here, we specify `sort=True` to sort each shuffled BedTool
    before calling its `jaccard` method.

    >>> from pybedtools.parallel import parallel_apply
    >>> a = pybedtools.example_bedtool('a.bed')
    >>> results = parallel_apply(a, method='jaccard', method_args=(b,),
    ... genome=genome, iterations=3, processes=1, sort=True, debug=True)
    >>> for i in results:
    ...     print(sorted(i.items()))
    [('intersection', 15), ('jaccard', 0.0238095), ('n_intersections', 2), ('union-intersection', 630)]
    [('intersection', 15), ('jaccard', 0.0238095), ('n_intersections', 2), ('union-intersection', 630)]
    [('intersection', 45), ('jaccard', 0.0818182), ('n_intersections', 1), ('union-intersection', 550)]

    Parameters
    ----------
    orig_bedtool : BedTool

    method : str
        The method of `orig_bedtool` to run

    method_args : tuple
        Passed directly to getattr(orig_bedtool, method)()

    method_kwargs : dict
        Passed directly to getattr(orig_bedtool, method)()

    shuffle : bool
        If True, then `orig_bedtool` will be shuffled at each iteration and
        that shuffled version's `method` will be called with `method_args` and
        `method_kwargs`.

    shuffle_kwargs : dict
        If `shuffle` is True, these are passed to `orig_bedtool.shuffle()`.
        You do not need to pass the genome here; that's handled separately by
        the `genome` and `genome_fn` kwargs.  The dict is not modified.

    iterations : int
        Number of iterations to perform

    genome : string or dict
        If string, then assume it is the assembly name (e.g., hg19) and get
        a dictionary of chromsizes for that assembly, then converts to
        a filename.

    genome_fn : str
        Mutually exclusive with `genome`; `genome_fn` must be an existing
        filename with the chromsizes.  Use the `genome` kwarg instead if you'd
        rather supply an assembly or dict.

    reduce_func : callable
        Function or other callable object that accepts, as its only argument,
        the results from `orig_bedtool.method()`.  For example, if you care
        about the number of results, then you can use `reduce_func=len`.

    processes : int
        Number of processes to run.  If `processes=1`, then multiprocessing is
        not used (making it much easier to debug).  This argument is ignored if
        `_orig_pool` is provided.

    sort : bool
        If both `shuffle` and `sort` are True, then the shuffled BedTool will
        then be sorted.  Use this if `method` requires sorted input.

    _orig_pool : multiprocessing.Pool instance
        If provided, uses `_orig_pool` instead of creating one.  In this case,
        `processes` will be ignored.  A pool supplied here is left open; a
        pool created internally is shut down when the generator finishes.

    debug : bool
        If True, then use the current iteration index as the seed to shuffle.

    report_iterations : bool
        If True, then report the number of iterations to stderr.  Only the
        multiprocessing path writes these progress messages.

    Yields
    ------
    One result per iteration, in iteration order.

    Raises
    ------
    ValueError
        If `method_args` is not a list or tuple, if both `genome` and
        `genome_fn` are given, or if `shuffle` is True and neither is given.
    """
    shuffle_kwargs = shuffle_kwargs or {}
    method_args = method_args or ()
    if not isinstance(method_args, (list, tuple)):
        raise ValueError("method_args must be a list or tuple, got %s" %
                         type(method_args))
    method_kwargs = method_kwargs or {}

    if genome_fn and genome:
        raise ValueError("only one of genome_fn or genome should be provided")

    if shuffle:
        if not genome_fn:
            if not genome:
                raise ValueError("shuffle=True, so either genome_fn"
                                 " or genome must be provided")
            genome_fn = pybedtools.chromsizes_to_file(genome)

    _parallel_wrap_kwargs = dict(
        orig_bedtool=orig_bedtool,
        shuffle_kwargs=shuffle_kwargs,
        genome_fn=genome_fn,
        method=method,
        method_args=method_args,
        method_kwargs=method_kwargs,
        shuffle=shuffle,
        reduce_func=reduce_func,
        sort=sort,
    )

    def add_seed(i, kwargs):
        # Return kwargs for iteration `i`.  A fresh dict is built for every
        # seeded iteration: apply_async serializes its arguments later, from
        # another thread, so mutating one shared dict could give several
        # queued tasks the same (latest) seed.  Copying also leaves the
        # caller's `shuffle_kwargs` untouched.
        if debug and shuffle:
            seeded = dict(kwargs)
            seeded['shuffle_kwargs'] = dict(kwargs['shuffle_kwargs'], seed=i)
            return seeded
        return kwargs

    if processes == 1:
        for it in range(iterations):
            yield _parallel_wrap(**add_seed(it, _parallel_wrap_kwargs))
        # A plain return ends the generator.  `raise StopIteration` inside a
        # generator body is converted to RuntimeError under PEP 479.
        return

    if _orig_pool:
        p = _orig_pool
        created_pool = False
    else:
        p = multiprocessing.Pool(processes)
        created_pool = True

    try:
        results = [
            p.apply_async(_parallel_wrap, (),
                          add_seed(it, _parallel_wrap_kwargs))
            for it in range(iterations)
        ]
        for i, r in enumerate(results):
            yield r.get()
            if report_iterations:
                sys.stderr.write('%s\r' % i)
                sys.stderr.flush()
    finally:
        # Only release a pool created here; a caller-supplied pool stays
        # under the caller's control.
        if created_pool:
            p.terminate()
            p.join()
def main():
    """
    Creates a pairwise matrix containing overlapping feature counts for many
    BED files.

    Command-line arguments are parsed with argparse.  Depending on the flags,
    each cell of the matrix is computed with `enrichment_score`
    (--enrichment), `frac_of_a` (--frac) or `actual_intersection` (default).
    The matrix is written tab-delimited to stdout with sorted keys as both the
    header row and the row labels.

    Exits with status 1 (after printing help) if neither BED files nor --test
    were given, and with an argparse usage error if --enrichment is requested
    without --genome.
    """
    ap = argparse.ArgumentParser(usage=usage)
    ap.add_argument('beds', nargs="*",
                    help='BED/GTF/GFF/VCF filenames, e.g., '
                    'in a directory of bed files, you can use *.bed')
    ap.add_argument('--frac', action='store_true',
                    help='Instead of counts, report fraction overlapped')
    ap.add_argument('--enrichment', action='store_true',
                    help='Run randomizations (default 1000, specify otherwise '
                    'with --iterations) on each pairwise comparison and '
                    'compute the enrichment score as '
                    '(actual intersection count + 1) / (median randomized + 1)'
                    )
    ap.add_argument('--genome',
                    help='Required argument if --enrichment is '
                    'used. Needs to be a string assembly name like "dm3" or '
                    '"hg19"')
    ap.add_argument('--iterations', default=1000, type=int,
                    help='Number of randomizations to perform for enrichement '
                    'scores')
    ap.add_argument('--processes', default=None, type=int,
                    help='Number of CPUs to use for randomization')
    ap.add_argument('--test', action='store_true',
                    help='Ignore any input BED '
                    'files and use test BED files')
    ap.add_argument('-v', '--verbose', action='store_true',
                    help='Be verbose: print which files are '
                    'currently being intersected and timing info at the end.')
    args = ap.parse_args()

    if not args.beds and not args.test:
        ap.print_help()
        sys.exit(1)

    # --genome is documented as required alongside --enrichment; fail early
    # with a usage error instead of passing None on to pybedtools.chromsizes.
    if args.enrichment and not args.genome:
        ap.error('--genome is required when --enrichment is used')

    if args.test:
        # insulator binding sites from ChIP-chip -- 4 proteins, 2 cell types
        # Genes Dev. 2009 23(11):1338-1350
        args.beds = [example_filename(i) for i in [
            'Cp190_Kc_Bushey_2009.bed',
            'Cp190_Mbn2_Bushey_2009.bed',
            'CTCF_Kc_Bushey_2009.bed',
            'CTCF_Mbn2_Bushey_2009.bed',
            'SuHw_Kc_Bushey_2009.bed',
            'SuHw_Mbn2_Bushey_2009.bed',
            'BEAF_Mbn2_Bushey_2009.bed',
            'BEAF_Kc_Bushey_2009.bed'
        ]]

    # Choose the per-cell function and any extra kwargs it needs.
    if args.enrichment:
        FUNC = enrichment_score
        genome_fn = pybedtools.chromsizes_to_file(
            pybedtools.chromsizes(args.genome))
        kwargs = dict(genome_fn=genome_fn, iterations=args.iterations,
                      processes=args.processes)
    elif args.frac:
        FUNC = frac_of_a
        kwargs = {}
    else:
        FUNC = actual_intersection
        kwargs = {}

    t0 = time.time()
    matrix = create_matrix(beds=args.beds, func=FUNC, verbose=args.verbose,
                           **kwargs)
    t1 = time.time()

    nfiles = len(args.beds)

    if args.verbose:
        sys.stderr.write(
            'Time to construct %s x %s matrix: %.1fs' % (nfiles, nfiles,
                                                         (t1 - t0)) + '\n')

    # Header row starts with an empty cell so columns line up with row labels.
    keys = sorted(matrix.keys())
    sys.stdout.write("\t" + "\t".join(keys) + '\n')
    for k in keys:
        row = [k] + [str(matrix[k][j]) for j in keys]
        sys.stdout.write('\t'.join(row) + '\n')
# Write one FASTA record per chromosome: a ">" header line followed by
# random sequence emitted in 80-base pieces plus the remainder.
# NOTE(review): `fasta`, `chromsizes`, `random_dna` and `data_dir` are defined
# outside this fragment; `size[1]` is used as the chromosome length.
for chrom, size in chromsizes.items():
    fasta.write(">" + chrom + "\n")
    n, r = divmod(size[1], 80)
    for _ in range(n):
        fasta.write(random_dna(80))
    fasta.write(random_dna(r))

# faToTwoBit reads the file by name, so push buffered writes to disk first;
# otherwise the external tool may see a truncated FASTA file.
fasta.flush()
cmds = ["faToTwoBit", fasta.name, fasta.name[0:-2] + "2bit"]
p = subprocess.check_call(cmds)
fasta.close()
os.unlink(fasta.name)

g = pybedtools.chromsizes_to_file(chromsizes)

# Make some randomized bigBed files
for i in range(3):
    x = pybedtools.BedTool(
        "chr1 0 100000",
        from_string=True)\
        .window_maker(g=g, w=100 * (i + 1))\
        .shuffle(g=g, seed=i)\
        .sort()
    out = os.path.join(data_dir, 'random-no1-%s.bigBed' % i)
    cmds = ['bedToBigBed', x.fn, g, out]
    p = subprocess.check_call(cmds)
def enrichment(id, a, b, background, organism, name=None, score=None,
               strand=None, n=10, run=()):
    """Perform enrichment analysis between two BED files.

    id - run identifier; used in log messages and passed to write_progress
    a - path to Feature of Interest BED file (FOI)
    b - path to Genomic Feature BED file (GF)
    background - path to a custom background file; it is loaded as a BedTool
        only if the path exists
    organism - assembly name passed to pybedtools.get_chromsizes_from_ucsc
    name, score, strand - passed to make_filter to build the filter applied
        to the GF file
    n - number of Monte-Carlo iterations
    run - collection of analysis names to execute; recognised values are
        'pvalue', 'pybedtool', 'jaccard', 'kolmogorov', 'proximity' and
        'hypergeometric'.  Results of analyses that are not requested are
        reported as "NA".

    Returns an Enrichment record.  If either the FOI or the filtered GF set
    is empty, every statistic in the record is "NA".
    """
    write_debug("START", True)
    r = {}
    e = Enrichment_Par(a=a, b=b, organism=organism, n=n,
                       background=background)
    # Was `e.replace(...)`; every other update of `e` goes through `_replace`.
    # NOTE(review): assumes Enrichment_Par is namedtuple-like with a
    # `Background` field -- confirm against its definition.
    # The truthiness check keeps os.path.exists from being called with None.
    if e.background and os.path.exists(e.background):
        e = e._replace(Background=BedTool(e.background))
    e = e._replace(A=BedTool(str(e.a)))
    e = e._replace(B=BedTool(str(e.b)))
    e = e._replace(genome=pybedtools.get_chromsizes_from_ucsc(e.organism))
    e = e._replace(genome_fn=pybedtools.chromsizes_to_file(e.genome))
    e = e._replace(organism=str(e.organism))
    flt = make_filter(name, score, strand)
    e = e._replace(B=e.B.filter(flt).saveas())
    e = e._replace(nA=len(e.A))
    e = e._replace(nB=len(e.B))

    # Exits if there are 0 GFs or 0 FOI
    if not e.nA or not e.nB:
        logger.info("Filter resulted in 0 Features of Interest. Terminating Run. {} (id={})".format(b, id))
        # Four identifying fields followed by 13 "NA" statistics.
        return Enrichment(e.a, basename(e.b), e.nA, e.nB, *(["NA"] * 13))

    e.A.set_chromsizes(e.genome)
    e.B.set_chromsizes(e.genome)
    e = e._replace(obs=len(e.A.intersect(e.B, u=True)))

    # This is the Monte-Carlo step. If custom background present, it is used
    if 'pvalue' in run:
        logger.info("Running Monte Carlo ({}): (id={})".format(b, id))
        write_progress(id, "Running Monte Carlo {}".format(b))
        r.update(run_montecarlo(e))
    else:
        r['p_value'], r['exp'] = "NA", "NA"
        logger.info("Skipping Monte Carlo ({}): (id={})".format(b, id))

    ## Uncomment to print global parameters in debug file
    #write_debug("Global parameters",a = e.a,b=e.b,A= e.A,B=e.B,background = e.background,
    #    n=e.n,flt = e.flt,genome = e.genome,genome_fn = e.genome_fn,organism = e.organism,obs = e.obs)

    # expected calculated using pybed method CANNOT use custom background
    if 'pybedtool' in run:
        logger.info("Running Random Intersections ({}): (id={})".format(b, id))
        write_progress(id, "Running Random Intersections: {0}".format(b))
        r.update(run_pybedtool(e))
    else:
        r['pybedp_value'], r['pybed_exp'] = "NA", "NA"
        logger.info("Skipping Random Intersections")

    # expected calculated using jaccard method
    if 'jaccard' in run:
        logger.info("Running Jaccard ({}): (id={})".format(e.b, id))
        write_progress(id, "Running Jaccard {}".format(e.b))
        r.update(run_jaccard(e))
    else:
        r['jaccardp_value'], r['jaccard_obs'], r['jaccard_exp'] = "NA", "NA", "NA"
        logger.info("Skipping Jaccard ({}): (id={})".format(b, id))

    # run Kolmogorov-Smirnov test
    if 'kolmogorov' in run:
        logger.info("Running Kolmogorov-Smornov {} (id={})".format(b, id))
        write_progress(id, "Running Kolmogorov-Smornov{}".format(b))
        r.update(run_kolmogorov(e))
    else:
        r['kol_smor_p_value'] = "NA"
        logger.info("Skipping Kolmogorov-Smornov {} (id={})".format(b, id))

    # run proximity analysis
    if 'proximity' in run:
        logger.info("Running proximity {} (id={})".format(b, id))
        write_progress(id, "Running proximity analysis{}".format(b))
        r.update(run_proximity(e))
    else:
        logger.info("Skipping Proximity")
        r['obsprox'], r['expprox'], r['proximityp_value'] = "NA", "NA", "NA"

    # run hypergeometric distribution analysis
    if 'hypergeometric' in run:
        write_progress(id, "Running")
        logger.info("Running hypergeometric analysis {} (id={})".format(b, id))
        r.update(run_hypergeometric(e))
    else:
        logger.info("Skipping hypergeometric")
        r['hypergeometric_p_value'] = "NA"

    ## Uncomment to print global parameters in debug file
    #write_debug("Global parameters",a = e.a,b=e.b,A= e.A,B=e.B,background = e.background,
    #    n=e.n,flt = e.flt,genome = e.genome,genome_fn = e.genome_fn,organism = e.organism,obs = e.obs)

    # the order of these arguments IS IMPORTANT
    return Enrichment(
        e.a, basename(e.b), e.nA, e.nB, e.obs,
        r['exp'], r['p_value'], r['obsprox'], r['expprox'],
        r['pybedp_value'], r['pybed_exp'],
        r['jaccard_obs'], r['jaccardp_value'], r['jaccard_exp'],
        r['proximityp_value'], r['kol_smor_p_value'],
        r['hypergeometric_p_value'])