def test_chromsizes_from_genomepy():
    """Check chromsizes lookups that go through genomepy for a FASTA file.

    The test is skipped when ``genomepy`` is not present in ``sys.modules``.
    Index files written next to the FASTA while the test runs are removed
    again, whether or not the assertions pass.
    """
    import pybedtools

    if "genomepy" not in sys.modules:
        pytest.skip("genomepy not instlled -- skipping test")

    genome = "pybedtools/test/data/genome.fa"
    try:
        d = pybedtools.helpers.get_chromsizes_from_genomepy(genome)
        print(pybedtools.chromsizes(genome))
        assert d["chr1"] == (0, 10)

        d = pybedtools.chromsizes(genome)
        assert d["chr3"] == (0, 30)
    finally:
        # Make sure all genomepy files get deleted
        fnames = [genome + ext for ext in [".fai", ".sizes"]]
        fnames.append(genome.replace(".fa", ".gaps.bed"))
        for fname in fnames:
            if os.path.exists(fname):
                os.unlink(fname)

    # An unknown genome is expected to yield None; compare by identity so a
    # custom __eq__ on the return value cannot make the check pass by accident.
    assert pybedtools.helpers.get_chromsizes_from_genomepy(
        "non-existing") is None
def test_chromsizes():
    """Exercise UCSC chromsizes retrieval and ``chromsizes_to_file``.

    If an ``OSError`` escapes the main body, a notice is written to stdout
    instead of failing the test.
    """
    assert_raises(OSError, pybedtools.get_chromsizes_from_ucsc, 'dm3',
                  mysql='wrong path')
    assert_raises(ValueError, pybedtools.get_chromsizes_from_ucsc, 'dm3',
                  timeout=0)
    try:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(pybedtools.chromsizes('dm3'))
        print(pybedtools.get_chromsizes_from_ucsc('dm3'))
        assert pybedtools.chromsizes('dm3') == \
            pybedtools.get_chromsizes_from_ucsc('dm3')

        hg17 = pybedtools.chromsizes('hg17')
        assert hg17['chr1'] == (0, 245522847)

        fn = pybedtools.chromsizes_to_file(hg17, fn='hg17.genome')
        expected = 'chr1\t245522847\n'
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        # make sure the tempfile version works, too
        fn = pybedtools.chromsizes_to_file(hg17, fn=None)
        expected = 'chr1\t245522847\n'
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        assert_raises(OSError,
                      pybedtools.get_chromsizes_from_ucsc,
                      **dict(genome='hg17', mysql='nonexistent'))
    except OSError:
        sys.stdout.write("mysql error -- test for chromsizes from UCSC didn't run")
    finally:
        # Remove the artifact even when an assertion above fails.
        if os.path.exists('hg17.genome'):
            os.unlink('hg17.genome')
def test_genomepy_not_installed():
    """Check the fallbacks once ``genomepy`` is dropped from ``sys.modules``.

    The genomepy helper must return ``None`` and ``pybedtools.chromsizes``
    must raise ``OSError`` for both the FASTA path and an unknown name.
    """
    import pybedtools

    # Forget any cached genomepy module before calling the helper.
    sys.modules.pop("genomepy", None)

    genome = "pybedtools/test/data/genome.fa"
    assert pybedtools.helpers.get_chromsizes_from_genomepy(genome) is None

    for name in (genome, "non-existing"):
        with pytest.raises(OSError):
            pybedtools.chromsizes(name)
def bam_to_bigwig(bam, genome, output, scale=False):
    """
    Given a BAM file `bam` and assembly `genome`, create a bigWig file.

    With the default ``scale=False`` the values are raw read counts. With
    ``scale=True`` they are multiplied by ``1 / (mapped reads / 1e6)``, i.e.
    expressed as reads per million mapped reads.

    Assumes that `bedGraphToBigWig` from UCSC tools is installed; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    Raises ValueError if `bedGraphToBigWig` exits with a non-zero status.
    """
    import subprocess  # local import keeps this block self-contained

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    kwargs = dict(bg=True, split=True, g=genome_file)
    if scale:
        readcount = mapped_read_count(bam)
        _scale = 1 / (readcount / 1e6)
        kwargs['scale'] = _scale
    x = pybedtools.BedTool(bam).genome_coverage(**kwargs)

    # Pass the arguments as a list, without a shell, so that file names
    # containing spaces or shell metacharacters are handled correctly.
    cmds = ['bedGraphToBigWig', x.fn, genome_file, output]
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
def get_random_genomic_locations(
        n_regions, width_mean=500, width_std=400, min_width=300,
        genome_assembly="hg38"):
    """Return ``n_regions`` random regions placed within ``genome_assembly``.

    Chromosomes are drawn with probability proportional to their size
    (names containing "_" are excluded), widths come from the absolute value
    of a normal distribution, and ``bedtools shuffle`` places the regions.
    The result is passed through ``bed_to_index``.
    """
    from ngs_toolkit.utils import bed_to_index

    # Sizes of the chromosomes kept for sampling, keyed by name.
    sizes = {}
    for name, bounds in dict(pybedtools.chromsizes(genome_assembly)).items():
        if "_" not in name:
            sizes[name] = bounds[-1]
    total = sum(sizes.values())
    weights = {name: size / total for name, size in sizes.items()}

    chroms = pd.Series(
        np.random.choice(
            a=list(weights.keys()), size=n_regions, p=list(weights.values())))
    starts = np.array([0] * n_regions)
    ends = np.absolute(
        np.random.normal(width_mean, width_std, n_regions)).astype(int)

    df = pd.DataFrame([chroms.tolist(), starts.tolist(), ends.tolist()]).T
    # Regions narrower than min_width are widened by min_width.
    too_narrow = (df[2] - df[1]) < min_width
    df.loc[too_narrow, 2] += min_width

    shuffled = pybedtools.BedTool.from_dataframe(df).shuffle(
        genome=genome_assembly,
        chromFirst=True,
        noOverlapping=True,
        chrom=True)
    bed = shuffled.sort().to_dataframe()
    return bed_to_index(bed)
def bam2bigwig(bam, bigwig, genome, scale=1e6, verbose=False):
    """
    Uses BEDTools to go from BAM to bedgraph, then bedGraphToBigWig to get the
    final bigwig.

    :param bam: Input BAM filename
    :param bigwig: Output filename of bigWig to create
    :param genome: String assembly name of genome
    :param scale: If not None, coverage is reported per `scale` mapped reads
        (default: reads per million). If None, coverage is left unscaled.
    :param verbose: Print progress messages to stderr

    Raises ValueError if the BAM has no mapped reads while scaling is
    requested, or if bedGraphToBigWig exits with a non-zero status.
    """
    coverage_kwargs = dict(bg=True)
    if scale is not None:
        cmds = ['samtools', 'view', '-F', '0x4', '-c', bam]
        p = subprocess.Popen(cmds, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        total_reads = float(stdout)
        if total_reads == 0:
            raise ValueError('no mapped reads in %s; cannot scale' % bam)
        reads_per_scale = total_reads / scale
        if verbose:
            sys.stderr.write('%s total reads\n' % total_reads)
            sys.stderr.flush()
        # The multiplier must be 1 / reads_per_scale. Passing `scale` itself
        # would multiply raw coverage by e.g. one million, regardless of
        # library size, and leave reads_per_scale unused.
        coverage_kwargs['scale'] = 1 / reads_per_scale

    chromsizes = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    coverage_kwargs['g'] = chromsizes

    t0 = time.time()
    bedgraph = pybedtools.BedTool(bam)\
        .genome_coverage(**coverage_kwargs)\
        .moveto('bedgraph.bedgraph')
    print(bedgraph.fn)
    if verbose:
        sys.stderr.write('Completed bedGraph in %.1fs\n' % (time.time() - t0))
        sys.stderr.flush()

    cmds = ['bedGraphToBigWig', bedgraph.fn, chromsizes, bigwig]
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        # Do not report completion when the converter failed.
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    if verbose:
        sys.stderr.write('Completed bigWig %s\n' % bigwig)
        sys.stderr.flush()
def get_random_block(chrom, gene_trees, genome_fasta, block_range):
    '''
    Draw a random block on `chrom` and return its sequence mapping.

    The block size and strand are drawn once; the start position is redrawn
    (after calling set_and_increment_seed) until the block neither overlaps
    an entry of gene_trees[chrom] nor contains an 'N' in its sequence.
    '''
    chr_sizes = pybedtools.chromsizes('hg38')
    chr_features = gene_trees[chrom]
    chr_range = chr_sizes[('chr%s' % chrom)]
    block_size = np.random.randint(block_range[0], block_range[1])

    def _draw_bounds():
        # Random start such that the whole block fits inside the chromosome.
        start = np.random.randint(chr_range[0], chr_range[1] - block_size)
        return start, start + block_size

    def _fetch(start, end):
        # Build a one-line BED record and look up its sequence.
        bed_line = '%s\t%d\t%d\t.\t1\t%s' % (chrom, start, end, strand)
        interval = BedTool(bed_line, from_string=True)
        seqs, _ = get_seq(interval, genome_fasta)
        return seqs, ''.join(seqs.values())

    block_start, block_end = _draw_bounds()
    strand = np.random.choice(['+', '-'])
    block_seq, seq = _fetch(block_start, block_end)

    while True:
        if not chr_features.overlaps(block_start, block_end) \
                and 'N' not in seq:
            return block_seq
        # block is invalid and must be reselected
        set_and_increment_seed()
        block_start, block_end = _draw_bounds()
        block_seq, seq = _fetch(block_start, block_end)
def __init__(self, file, genome, chrom, matrix_dim, incr_column=None):
    """
    Set up the coordinate system, then build and dump the matrix.

    file : stored as self.file; the data source used by self.build()
    genome : either a string (chromosome sizes are looked up with
        pbt.chromsizes) or a dict of chrom:(start, stop)
    chrom : "genome" to lay all chromosomes end to end, otherwise a
        chromosome name or a string that get_interval_from_string can
        parse into (chrom, start, end)
    matrix_dim : passed to the parent constructor together with the total
        coordinate length
    incr_column : stored as self.incr_column

    Raises ValueError when `genome` is neither a string nor a dict.
    """
    self.file = file
    self.genome = genome
    self.chrom = chrom
    self.use_chrom_range = False

    # grab the dict of chrom lengths for this genome
    if isinstance(self.genome, basestring):
        self.chromdict = pbt.chromsizes(self.genome)
    elif isinstance(self.genome, dict):
        self.chromdict = self.genome
    else:
        raise ValueError('`genome` must be either a string assembly name '
                         ' or a dictionary of chrom:(start, stop)')

    if self.chrom != "genome":
        chrom_range_tuple = get_interval_from_string(self.chrom)
        if chrom_range_tuple is None:
            # grab the length of the requested chromosome
            self.chrom_length = self.chromdict[self.chrom][1]
        else:
            # a sub-range was requested: the coordinate system covers only
            # that range
            (self.chrom, self.range_start, self.range_end) = \
                chrom_range_tuple
            self.chrom_length = self.range_end - self.range_start
            self.use_chrom_range = True
        # trailing comma: the length printed below continues this line
        print self.chrom, "size: ",
    else:
        # using the entire genome for our coordinate system
        self.chrom_length = 0
        curr_offset = 0
        self.chrom_offsets = {}
        self.chrom_offsets_list = []
        self.chrom_names_list = []
        # each chromosome starts at the summed length of those before it
        for chrom in self.chromdict:
            self.chrom_offsets[chrom] = curr_offset
            self.chrom_offsets_list.append(curr_offset)
            self.chrom_names_list.append(chrom)
            self.chrom_length += self.chromdict[chrom][1]
            curr_offset += self.chromdict[chrom][1]
        print "genome size: ",
    print self.chrom_length

    super(HilbertMatrix, self).__init__(matrix_dim, self.chrom_length)

    print "using matrix of size", self.matrix_dim, "there are", \
          self.ncells, "cells in the matrix and each cell represents", \
          int(self.dist_per_cell), "base pairs."

    self.incr_column = incr_column
    self.num_intervals = 0
    self.total_interval_length = 0
    # NOTE(review): these two locals are never read in this method
    chrom_offsets = []
    chrom_names = []
    self.temp_files = []

    # populate the matrix with the data contained in self.file
    self.build()
    self.dump_matrix()
def gc_content(vf, fa, flank=50):
    """Return a Series named "GC", indexed by feature name.

    Each feature in `vf` is extended by `flank` bases on both sides (using
    hg19 chromosome sizes), ``nucleotide_content`` is run against the FASTA
    `fa`, and field 5 of each result line is stored as a float.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("inside gc_content")
    v = BedTool(vf)
    flanks = v.slop(g=pybedtools.chromsizes('hg19'), b=flank)
    nc = flanks.nucleotide_content(fi=fa)
    results = {r.name: float(r[5]) for r in nc}
    print("exiting gc_content")
    return Series(results, name="GC")
def bam_to_bigwig(bam, genome, output, scale=False):
    """
    Given a BAM file `bam` and assembly `genome`, create a bigWig file.

    With the default ``scale=False`` the values are raw read counts. With
    ``scale=True`` they are multiplied by ``1 / (mapped reads / 1e6)``, i.e.
    expressed as reads per million mapped reads.

    Assumes that `bedGraphToBigWig` from UCSC tools is installed; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    If the first conversion fails and its stderr mentions "bedSort", the
    bedGraph is sorted and the conversion is retried once.

    Raises FileNotFoundError when `bedGraphToBigWig` is not on the path and
    ValueError when the (possibly retried) conversion exits non-zero.
    """

    def _run(cmds):
        # Run the converter once, capturing stdout/stderr as text.
        try:
            p = subprocess.Popen(
                cmds,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
            )
            stdout, stderr = p.communicate()
        except FileNotFoundError:
            raise FileNotFoundError(
                "bedGraphToBigWig was not found on the path. This is an external "
                "tool from UCSC which can be downloaded from "
                "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatatively, use "
                "`conda install ucsc-bedgraphtobigwig`")
        return p.returncode, stdout, stderr

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    kwargs = dict(bg=True, split=True, g=genome_file)
    if scale:
        readcount = mapped_read_count(bam)
        _scale = 1 / (readcount / 1e6)
        kwargs["scale"] = _scale
    x = pybedtools.BedTool(bam).genome_coverage(**kwargs)
    cmds = ["bedGraphToBigWig", x.fn, genome_file, output]

    returncode, stdout, stderr = _run(cmds)

    if returncode and "bedSort" in stderr:
        print("BAM header was not sorted; sorting bedGraph")
        y = x.sort()
        cmds[1] = y.fn
        # The retry runs bedGraphToBigWig again, so a missing executable is
        # reported under that name rather than as "bedSort".
        returncode, stdout, stderr = _run(cmds)

    if returncode:
        raise ValueError("cmds: %s\nstderr: %s\nstdout: %s" %
                         (" ".join(cmds), stderr, stdout))
def wig_to_bigwig(wig, genome, output):
    """Convert `wig` to bigWig with UCSC `wigToBigWig`; return `output`.

    `wig` must expose the input path as ``wig.fn``; `genome` is looked up
    with ``pybedtools.chromsizes``. Raises ValueError if the tool exits with
    a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['wigToBigWig', wig.fn, genome_file, output]
    # Keep the handle: it was previously discarded, so the `p` used below
    # was an undefined name.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
def bedgraph_to_bigwig(bedgraph, genome, output):
    """Convert `bedgraph` to bigWig with `bedGraphToBigWig`; return `output`.

    `bedgraph` must expose the input path as ``bedgraph.fn``; `genome` is
    looked up with ``pybedtools.chromsizes``. Raises ValueError if the tool
    exits with a non-zero status.
    """
    import subprocess  # local import keeps this block self-contained

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    # Argument list without a shell: safe for paths containing spaces or
    # shell metacharacters.
    cmds = ['bedGraphToBigWig', bedgraph.fn, genome_file, output]
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
def wig_to_bigwig(wig, genome, output):
    """Convert `wig` to bigWig with UCSC `wigToBigWig`; return `output`.

    `wig` must expose the input path as ``wig.fn``; `genome` is looked up
    with ``pybedtools.chromsizes``. Raises ValueError if the tool exits with
    a non-zero status.
    """
    import subprocess  # local import keeps this block self-contained

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    # Argument list without a shell: safe for paths containing spaces or
    # shell metacharacters.
    cmds = ['wigToBigWig', wig.fn, genome_file, output]
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
def bedgraph_to_bigwig(bedgraph, genome, output):
    """Run `bedGraphToBigWig` on ``bedgraph.fn`` and return `output`.

    Chromosome sizes for `genome` are written to a file that is handed to the
    tool. A non-zero exit status raises ValueError carrying the command line
    and the captured stderr/stdout.
    """
    sizes = pybedtools.chromsizes(genome)
    genome_file = pybedtools.chromsizes_to_file(sizes)
    argv = ['bedGraphToBigWig', bedgraph.fn, genome_file, output]

    proc = subprocess.Popen(
        argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()

    if not proc.returncode:
        return output
    raise ValueError("cmds: %s\nstderr:%s\nstdout:%s" %
                     (" ".join(argv), err, out))
def __init__(self, genome, windowsize, chrom=None, window_cache_dir=".",
             npz_dir='.', metric='mean0'):
    """
    Class to handle converting bigWig files into NumPy arrays.

    bigWigAverageOverBed needs to be available on the path.

    The arrays are saved to disk as .npz files, which can then be
    memory-mapped for fast, lightweight re-use. Each .npz file contains the
    coordinates of the bin midpoints (x) and values in each bin (y).

    The class is designed to be set up once, then used many times on
    different bigWig files.

    General usage:

        >>> b = Binner('mm9', 1000, chrom='chr19')
        >>> b.to_npz('PolII.bigwig')
        >>> pol = np.load('PolII.bigwig.chr19.npz')

    Assuming matplotlib is installed,

        >>> from matplotlib import pyplot as plt
        >>> plt.plot(pol['x'], pol['y'])
        >>> plt.show()

    Parameters
    ----------
    genome : str
        Assembly name to use (e.g., hg19, dm3). This is used for creating
        the right number of windows.

    windowsize : int
        Bp to use in each window

    chrom : None or str
        If None, all chromosomes will be used; otherwise you can specify
        a single chromosome here.

    window_cache_dir : str
        Path where BED files containing windowed chromosome coordinates
        will be stored. These files are cached to avoid creating them
        every time, and have the filename pattern
        {window_cache_dir}/{chrom}.{windowsize}bp_windows.bed

    npz_dir, metric
        Accepted by the signature but not used in the constructor body
        shown here.
    """
    sizes = pybedtools.chromsizes(genome)
    self.chromsizes = sizes
    # Either every chromosome (sorted by name) or just the requested one.
    self.chroms = sorted(sizes.keys()) if chrom is None else [chrom]
    self.windowsize = windowsize
    self.window_cache_dir = window_cache_dir
def test_chromsizes_in_5prime_3prime():
    """Check how five_prime/three_prime treat the `genome` chromsizes argument.

    Each case maps a featurefunc over the example file a.bed with 1 and 10 as
    the positional arguments and "_TSS" appended to the name, then compares
    the result with the expected text; str(a) is the assertion message.
    """

    # standard 5'
    a = pybedtools.example_bedtool('a.bed')\
        .each(featurefuncs.five_prime, 1, 10, add_to_name="_TSS",
              genome=pybedtools.chromsizes("hg19"))\
        .saveas()
    assert a == fix(
        """
        chr1 0 11 feature1_TSS 0 +
        chr1 99 110 feature2_TSS 0 +
        chr1 490 501 feature3_TSS 0 -
        chr1 899 910 feature4_TSS 0 +
        """), str(a)

    # add genomes sizes; last feature should be truncated
    a = pybedtools.example_bedtool('a.bed')\
        .each(featurefuncs.five_prime, 1, 10, add_to_name="_TSS",
              genome=dict(chr1=(0, 900)))\
        .saveas()
    assert a == fix(
        """
        chr1 0 11 feature1_TSS 0 +
        chr1 99 110 feature2_TSS 0 +
        chr1 490 501 feature3_TSS 0 -
        chr1 899 900 feature4_TSS 0 +
        """), str(a)

    # same thing but for 3'.
    # Note that the last feature chr1:949-960 is completely truncated because
    # it would entirely fall outside of the chromosome
    a = pybedtools.example_bedtool('a.bed')\
        .each(featurefuncs.three_prime, 1, 10, add_to_name="_TSS",
              genome=dict(chr1=(0, 900)))\
        .saveas()
    assert a == fix(
        """
        chr1 99 110 feature1_TSS 0 +
        chr1 199 210 feature2_TSS 0 +
        chr1 140 151 feature3_TSS 0 -
        chr1 900 900 feature4_TSS 0 +
        """), str(a)

    # be a lot harsher with the chromsizes to ensure features on both strands
    # get truncated correctly
    a = pybedtools.example_bedtool('a.bed')\
        .each(featurefuncs.three_prime, 1, 10, add_to_name="_TSS",
              genome=dict(chr1=(0, 120)))\
        .saveas()
    assert a == fix(
        """
        chr1 99 110 feature1_TSS 0 +
        chr1 120 120 feature2_TSS 0 +
        chr1 120 120 feature3_TSS 0 -
        chr1 120 120 feature4_TSS 0 +
        """), str(a)
def __init__(self, genome, windowsize, chrom=None, window_cache_dir=".",
             npz_dir='.', metric='mean0'):
    """Record chromosome sizes, the chromosomes to use, and window settings.

    `genome` is looked up with pybedtools.chromsizes. When `chrom` is None
    every chromosome (sorted by name) is used, otherwise only `chrom`.
    `npz_dir` and `metric` are accepted but not used in the body shown here.
    """
    self.chromsizes = pybedtools.chromsizes(genome)
    if chrom is not None:
        self.chroms = [chrom]
    else:
        self.chroms = sorted(self.chromsizes.keys())
    self.windowsize = windowsize
    self.window_cache_dir = window_cache_dir
def fix_macs_wig(fn, genome, output=None, add_chr=False, to_ignore=None):
    """
    wig files created by MACS often are extended outside the chromosome
    ranges. This function edits an input WIG file to fit within the
    chromosome boundaries defined by `genome`.

    If `add_chr` is True, then prefix each chromosome name with "chr".

    Also gets rid of any track lines so the file is ready for conversion to
    bigWig.

    Returns the output filename.

    fn : str
        Input WIG filename. Can be gzipped, if extension ends in .gz.

    genome : str or dict
        Assembly name (looked up with pybedtools.chromsizes) or a dict of
        chrom:(start, stop) used as-is.

    output : str or None
        If None, writes to temp file

    to_ignore : list
        List of chromosomes to ignore.

    Raises ValueError if a data line appears before any variableStep header.
    """
    if output is None:
        output = pybedtools.BedTool._tmp()
    if to_ignore is None:
        to_ignore = []
    # A dict is already in the chrom:(start, stop) form needed below.
    if not isinstance(genome, dict):
        genome = pybedtools.chromsizes(genome)

    # Text mode in both cases so lines compare against str on Python 3.
    opener = gzip.open if fn.endswith('.gz') else open

    chrom = None
    span = None
    with open(output, 'w') as fout, opener(fn, 'rt') as f:
        for line in f:
            if not line.strip():
                continue
            if line.startswith('track'):
                continue
            if line.startswith('variableStep'):
                a, b, c = line.strip().split()
                prefix, chrom = b.split('=')
                if add_chr:
                    chrom = 'chr' + chrom
                if chrom in to_ignore:
                    continue
                fout.write(' '.join([a, prefix + '=' + chrom, c]) + '\n')
                span = int(c.split('=')[1])
                continue
            if chrom is None:
                raise ValueError(
                    'data line found before any variableStep header in %s'
                    % fn)
            pos, val = line.strip().split()
            if chrom in to_ignore:
                continue
            # Drop positions whose span would reach or pass the chromosome end.
            if (int(pos) + span) >= genome[chrom][1]:
                continue
            fout.write(line)
    return output
def bigbed(
    x,
    genome,
    output,
    blockSize=256,
    itemsPerSlot=512,
    bedtype=None,
    _as=None,
    unc=False,
    tab=False,
):
    """
    Write `x` as a bigBed file with UCSC `bedToBigBed`; return `output`.

    `x` is a BedTool object or a filename, `genome` is an assembly string and
    `output` is the name of the bigBed file to create.

    The remaining arguments become bedToBigBed options. When `bedtype` is
    left as None it is derived from the number of fields in `x`
    ("bed<field count>") and passed as "-type=".

    Assumes that a recent version of bedToBigBed from UCSC is on the path.
    Raises ValueError if bedToBigBed exits with a non-zero status.
    """
    if isinstance(x, six.string_types):
        x = pybedtools.BedTool(x)
    # bedToBigBed needs a file on disk, so save anything not file-backed.
    if not isinstance(x.fn, six.string_types):
        x = x.saveas()

    sizes_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    if bedtype is None:
        bedtype = "bed%s" % x.field_count()

    argv = ["bedToBigBed", x.fn, sizes_file, output]
    argv.append("-blockSize=%s" % blockSize)
    argv.append("-itemsPerSlot=%s" % itemsPerSlot)
    argv.append("-type=%s" % bedtype)
    for flag, enabled in (("-unc", unc), ("-tab", tab)):
        if enabled:
            argv.append(flag)
    if _as:
        argv.append("-as=%s" % _as)

    proc = subprocess.Popen(
        argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    if not proc.returncode:
        return output
    raise ValueError("cmds: %s\nstderr:%s\nstdout:%s" %
                     (" ".join(argv), err, out))
def plot_chr_counts(assembly, dataframe):
    """Return a one-row gridplot with two bar charts side by side.

    Left: counts of each value in ``dataframe['chrom']``. Right: chromosome
    lengths (stop - start) for `assembly`. Both charts share the same key
    order: chromosomes sorted by ascending count in `dataframe`.
    """
    bounds_by_chrom = pybedtools.chromsizes(assembly)
    lengths = {}
    for name in bounds_by_chrom:
        bounds = bounds_by_chrom[name]
        lengths[name] = bounds[1] - bounds[0]

    counts = dataframe['chrom'].value_counts()
    keys = counts.sort_values(ascending=True).index.tolist()

    counts_plot = plot_vbar(
        pd.Series(dataframe['chrom']),
        count=True,
        keys=keys,
        title='Counts per chromossome')
    sizes_plot = plot_vbar(
        pd.Series(lengths),
        keys=keys,
        title=assembly + ' Chromossome size')
    return gridplot([[counts_plot, sizes_plot]])
def maybe_read_chromsizes(genome):
    """Return chromosome sizes as an ordered mapping of chrom -> (0, size).

    `genome` is first treated as the path of a whitespace-delimited
    two-column file (chromosome name, size). Blank lines are skipped. If the
    file cannot be opened, `genome` is instead passed to
    ``pybedtools.chromsizes`` as an assembly name.

    Raises ValueError for a non-blank line that does not have exactly two
    fields or whose size is not an integer.
    """
    try:
        chromsizes = OrderedDict()
        with open(genome) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # e.g. a trailing empty line at the end of the file
                    continue
                chrom, size = fields
                chromsizes[chrom] = (0, int(size))
    except (IOError, OSError):
        # IOError is listed explicitly because it is not a subclass of
        # OSError on Python 2.
        chromsizes = pybedtools.chromsizes(genome)
    return chromsizes
def get_chr_size(genome_build="hg19", cannonical=False):
    """
    Return a DataFrame with columns "Chr" and "Size" for `genome_build`.

    With ``cannonical=False`` (the default) chromosomes whose name contains
    "_" are left out; with ``cannonical=True`` every chromosome is listed.
    Returns None when the resulting frame is empty.

    NOTE(review): the flag reads as if it were inverted (the default already
    drops the "_" contigs) -- confirm the intended meaning with callers
    before changing it.
    """
    sizes = pybedtools.chromsizes(genome=genome_build)
    if cannonical:
        names = [name for name in sizes]
    else:
        names = [name for name in sizes if '_' not in name]

    rows = [[name, sizes.get(name)[1]] for name in names]
    df = pd.DataFrame(rows, columns=["Chr", "Size"])
    if df.empty:
        return None
    return df
def bam_to_bigwig(bam, genome, output, scale=False):
    """
    Given a BAM file `bam` and assembly `genome`, create a bigWig file.

    With the default ``scale=False`` the values are raw read counts. With
    ``scale=True`` they are multiplied by ``1 / (mapped reads / 1e6)``, i.e.
    expressed as reads per million mapped reads.

    Assumes that `bedGraphToBigWig` from UCSC tools is installed; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    If the first conversion fails and its stderr mentions 'bedSort', the
    bedGraph is sorted and the conversion is retried once.

    Raises FileNotFoundError when `bedGraphToBigWig` is not on the path and
    ValueError when the (possibly retried) conversion exits non-zero.
    """

    def _convert(argv):
        # One bedGraphToBigWig invocation; output is captured as text.
        try:
            proc = subprocess.Popen(argv,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    universal_newlines=True)
            out, err = proc.communicate()
        except FileNotFoundError:
            raise FileNotFoundError(
                "bedGraphToBigWig was not found on the path. This is an external "
                "tool from UCSC which can be downloaded from "
                "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatatively, use "
                "`conda install ucsc-bedgraphtobigwig`"
            )
        return proc.returncode, out, err

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    kwargs = dict(bg=True, split=True, g=genome_file)
    if scale:
        readcount = mapped_read_count(bam)
        _scale = 1 / (readcount / 1e6)
        kwargs['scale'] = _scale
    x = pybedtools.BedTool(bam).genome_coverage(**kwargs)
    cmds = ['bedGraphToBigWig', x.fn, genome_file, output]

    returncode, stdout, stderr = _convert(cmds)

    if returncode and 'bedSort' in stderr:
        print('BAM header was not sorted; sorting bedGraph')
        y = x.sort()
        cmds[1] = y.fn
        # The retry is still bedGraphToBigWig; a missing executable must not
        # be reported as a missing "bedSort".
        returncode, stdout, stderr = _convert(cmds)

    if returncode:
        raise ValueError('cmds: %s\nstderr: %s\nstdout: %s'
                         % (' '.join(cmds), stderr, stdout))
def bedgraph_to_bigwig(bedgraph, genome, output):
    """Create the bigWig `output` from ``bedgraph.fn`` and return its name.

    The chromosome sizes of `genome` are written to a file for
    `bedGraphToBigWig`. ValueError is raised, with the command and captured
    output, when the tool exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(
        pybedtools.chromsizes(genome))
    command = ['bedGraphToBigWig', bedgraph.fn, genome_file, output]

    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    captured_out, captured_err = process.communicate()

    failed = bool(process.returncode)
    if failed:
        details = (" ".join(command), captured_err, captured_out)
        raise ValueError("cmds: %s\nstderr:%s\nstdout:%s" % details)
    return output
def wig_to_bigwig(wig, genome, output):
    """Convert ``wig.fn`` to the bigWig file `output` and return `output`.

    Uses UCSC `wigToBigWig` with the chromosome sizes of `genome`. Raises
    ValueError, including the command and captured output, when the tool
    exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['wigToBigWig', wig.fn, genome_file, output]
    # Bind the process object: without the assignment the `p` referenced
    # below does not exist and the function fails with NameError.
    p = subprocess.Popen(cmds,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
def test_chromsizes():
    """Exercise UCSC chromsizes retrieval and ``chromsizes_to_file``.

    If an ``OSError`` escapes the main body, a notice is written to stdout
    instead of failing the test.
    """
    with pytest.raises(OSError):
        pybedtools.get_chromsizes_from_ucsc("dm3",
                                            mysql="wrong path",
                                            fetchchromsizes="wrongtoo")
    with pytest.raises(ValueError):
        pybedtools.get_chromsizes_from_ucsc("dm3", timeout=0)
    try:
        print(pybedtools.chromsizes("dm3"))
        print(pybedtools.get_chromsizes_from_ucsc("dm3"))
        assert pybedtools.chromsizes(
            "dm3") == pybedtools.get_chromsizes_from_ucsc("dm3")

        hg17 = pybedtools.chromsizes("hg17")
        assert hg17["chr1"] == (0, 245522847)

        fn = pybedtools.chromsizes_to_file(hg17, fn="hg17.genome")
        expected = "chr1\t245522847\n"
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        # make sure the tempfile version works, too
        fn = pybedtools.chromsizes_to_file(hg17, fn=None)
        expected = "chr1\t245522847\n"
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        with pytest.raises(OSError):
            pybedtools.get_chromsizes_from_ucsc(**dict(
                genome="hg17", mysql="nonexistent", fetchchromsizes="missing"))
    except OSError:
        sys.stdout.write(
            "mysql error -- test for chromsizes from UCSC didn't run")
    finally:
        # Remove the artifact even when an assertion above fails.
        if os.path.exists("hg17.genome"):
            os.unlink("hg17.genome")
def hypmut_bw(vcf,bw,step=1000,nstep=50,shr=0.8,genome='mm9'):
    """
    Return merged regions where binned variant density correlates with a
    bigWig signal.

    vcf    : variants, wrapped in a BedTool and counted per bin
    bw     : bigWig path handed to gs.countFragmentsInRegions_worker
    step   : bin width, also used as stepSize/binLength for the bigWig
    nstep  : number of consecutive bins in each correlation window
    shr    : windows with a correlation coefficient above this are kept
    genome : assembly name used for chromosome sizes and for bed_bins

    Kept windows span nstep*step bases from their first bin's start; the
    windows of all chromosomes are concatenated and merged into one BedTool.
    """
    gnm = pb.chromsizes(genome)
    bin_bed = bed_bins(step, genome=genome)
    vcf_b = pb.BedTool(vcf)
    bed_out = variants_bed_counts(vcf_b,bin_bed)
    out = bed_out.to_dataframe()
    # count per bin divided by the bin width
    out['density'] = out['score']/step
    hyp_bw = pd.DataFrame()
    for chrom in gnm.keys():
        out_chr = out[out['chrom']==chrom]
        x = out_chr['start'].values # + int(step/2)
        y = out_chr['density'].values
        st = gnm[chrom][0]
        end = gnm[chrom][1]
        vl = gs.countFragmentsInRegions_worker(chrom, int(st), int(end), [bw], stepSize=step, binLength=step, save_data=False)
        # NOTE(review): xb is computed but not used afterwards
        xb = np.arange(st,end,step)[:-1]
        # presumably one signal value per bin; the last one is dropped -- TODO confirm
        yb = np.squeeze(vl[0])[:-1]
        # window start positions: the last nstep bins are left out
        x_r = x[:-nstep]
        cr_f = np.zeros(x_r.shape)
        # rolling correlation: for pandas DF -- df['A'].rolling(10).corr(df['B'])
        for i in range(len(x_r)):
            cr_f[i] = np.corrcoef(y[i:i+nstep],yb[i:i+nstep])[0,1]
        x_c = x_r[cr_f>shr]
        hyp_cor= pd.DataFrame()
        hyp_cor['start']= x_c
        hyp_cor['end']= x_c+nstep*step
        hyp_cor['chrom']= chrom
        hyp_cor = hyp_cor[['chrom','start','end']].astype({'start':int,'end':int})
        hyp_bw = pd.concat([hyp_bw, hyp_cor])
    hyp_bed = pb.BedTool.from_dataframe(hyp_bw).merge()
    return hyp_bed
def seq_context(vf, fa):
    """Return a DataFrame of sequence-context features, indexed by name.

    Each feature in `vf` is extended by one base on both sides (hg19
    chromosome sizes) and passed to ``nucleotide_content`` with the sequence
    and a count of the pattern "CG". Columns: ``seq_A``, ``seq_C``,
    ``seq_G``, ``seq_T`` (1.0 where the middle base of the returned sequence
    is that letter, else 0.0) and ``in_cpg`` (field 14 as a float).
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("inside seq_context")
    v = BedTool(vf)
    flanks = v.slop(g=pybedtools.chromsizes('hg19'), b=1)
    nc = flanks.nucleotide_content(fi=fa, seq=True, pattern="CG", C=True)

    # Collect both values in one pass so the result is read only once.
    cpg_by_name = {}
    base_by_name = {}
    for r in nc:
        cpg_by_name[r.name] = float(r[14])
        # field 13 is the 3-base sequence; index 1 is the original position
        base_by_name[r.name] = r[13][1].upper()
    cpg_context = Series(cpg_by_name)
    nucleotide = Series(base_by_name)

    results = {}
    for b in 'ACGT':
        results['seq_' + b] = (nucleotide == b).apply(float)
    results['in_cpg'] = cpg_context
    print("exiting seq_context")
    return DataFrame(results)
def get_genomic_bins(n_bins, genome_assembly="hg38", resolution=None):
    """Tile ``genome_assembly`` into windows and return the first ``n_bins``.

    The window width is the total genome length divided by ``n_bins``.
    If ``resolution`` is given (an int, or a string such as "10kb"), each
    window's end is replaced by ``start + resolution``. The result is passed
    through ``bed_to_index``.
    """
    from ngs_toolkit.utils import bed_to_index

    bed = pybedtools.BedTool.from_dataframe(
        pd.DataFrame(dict(
            pybedtools.chromsizes(genome_assembly))).T.reset_index())
    # bedtools expects an integer window size; true division would hand it
    # a float such as "1234.5".
    window = sum([i.length for i in bed]) // n_bins
    w = bed.makewindows(genome=genome_assembly, w=window).to_dataframe()
    if resolution is not None:
        if isinstance(resolution, str):
            resolution = int(resolution.replace("kb", "000"))
        w["end"] = w["start"] + resolution
    return bed_to_index(w.head(n_bins))
def extractIntervals(self):
    """Build fixed-width windows around the midpoint of each merged peak.

    Every peak in self.merged is reduced to a 1-bp interval at its midpoint,
    which is then extended by self.flankLength on both sides within the
    chromosome sizes of self.referenceGenome. Stores the result in
    self.slopped, self.startvals and self.endvals, and returns
    (self.chromosomes, self.startvals, self.endvals).
    """
    def _midpoint_interval(peak):
        # (chrom, mid, mid + 1) for one peak record
        mid = round((int(peak[1]) + int(peak[2]))/2)
        return (peak[0], mid, mid+1)

    midpoints = BedTool([_midpoint_interval(peak) for peak in self.merged])
    sizes = pybedtools.chromsizes(self.referenceGenome)
    self.slopped = midpoints.slop(b=self.flankLength, g=sizes)

    self.startvals = [int(feature[1]) for feature in self.slopped]
    self.endvals = [int(feature[2]) for feature in self.slopped]
    return self.chromosomes, self.startvals, self.endvals
def bedgraph2bigwig(bedgraph, bigwig, genome, verbose=False):
    """
    Create a bigWig from `bedgraph`.

    :param bedgraph: Input filename of bedgraph
    :param bigwig: Output filename of bigWig to create
    :param genome: String assembly name of genome
    :param verbose: Print messages to stderr
    :raises ValueError: if bedGraphToBigWig exits with a non-zero status
    """
    chromsizes = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['bedGraphToBigWig', bedgraph, chromsizes, bigwig]
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        # Surface the failure instead of reporting completion below.
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    if verbose:
        sys.stderr.write('Completed bigWig %s\n' % bigwig)
        sys.stderr.flush()
def bigbed(x, genome, output, blockSize=256, itemsPerSlot=512,
           bedtype=None, _as=None, unc=False, tab=False):
    """
    Convert `x` to bigBed format with `bedToBigBed` and return `output`.

    `x` is a BedTool object (or a filename, which is wrapped in one)
    `genome` is an assembly string
    `output` is the name of the bigBed file to create.

    `blockSize`, `itemsPerSlot`, `unc`, `tab` and `_as` map to the
    bedToBigBed options of the same name. `bedtype` becomes "-type="; when
    left as None it is set to "bed" followed by the field count of `x`.

    Assumes that a recent version of bedToBigBed from UCSC is on the path.
    Raises ValueError if bedToBigBed exits with a non-zero status.
    """
    if isinstance(x, str):
        x = pybedtools.BedTool(x)
    # Anything that is not backed by a file name is saved to disk first.
    if not isinstance(x.fn, str):
        x = x.saveas()

    sizes_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    if bedtype is None:
        bedtype = 'bed%s' % x.field_count()

    options = [
        '-blockSize=%s' % blockSize,
        '-itemsPerSlot=%s' % itemsPerSlot,
        '-type=%s' % bedtype,
    ]
    if unc:
        options.append('-unc')
    if tab:
        options.append('-tab')
    if _as:
        options.append('-as=%s' % _as)
    command = ['bedToBigBed', x.fn, sizes_file, output] + options

    process = subprocess.Popen(command,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    out, err = process.communicate()
    if not process.returncode:
        return output
    raise ValueError("cmds: %s\nstderr:%s\nstdout:%s" %
                     (" ".join(command), err, out))
def snp_stats(vf, af, stat='avg_het', flank=500):
    """Return a Series named `stat` with a per-feature rate, keyed by name.

    Features in `vf` are extended by `flank` bases on both sides (hg19
    chromosome sizes) and intersected with the annotations in `af`. For each
    extended feature the values in column 5 of the overlapping annotations
    are summed and divided by ``2 * flank``. Features without any overlap do
    not appear in the result.
    """
    v = BedTool(vf)
    feats = BedTool(af)
    flanks = v.slop(g=pybedtools.chromsizes('hg19'), b=flank)
    intersection = feats.intersect(flanks, wb=True)
    results = {}
    if len(intersection) > 0:
        # groupby needs its input sorted on the grouping columns; sort the
        # intersection file in place.
        sort_cmd = 'sort -k6,6 -k7,7n -k8,8n -k9,9 %s -o %s' % (intersection.fn, intersection.fn)
        call(sort_cmd, shell=True)
        annots = intersection.groupby(g=[6,7,8,9], c=5, ops='collapse')

        for entry in annots:
            rates = entry[4].split(',')
            # sum() with a float start value replaces the `reduce` builtin,
            # which no longer exists on Python 3.
            tot = sum((float(r) for r in rates), 0.)
            rate = tot / (flank * 2)
            results[entry.name] = rate

    return Series(results, name=stat)
def wig_to_bigwig(wig, genome, output):
    """Convert ``wig.fn`` to the bigWig file `output` and return `output`.

    Uses UCSC `wigToBigWig` with the chromosome sizes of `genome`.

    Raises FileNotFoundError when `wigToBigWig` is not on the path and
    ValueError when it exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ["wigToBigWig", wig.fn, genome_file, output]
    try:
        p = subprocess.Popen(cmds,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
    except FileNotFoundError:
        # Name the executable that was actually launched, and its package.
        raise FileNotFoundError(
            "wigToBigWig was not found on the path. This is an external "
            "tool from UCSC which can be downloaded from "
            "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatively, use "
            "`conda install ucsc-wigtobigwig`")
    if p.returncode:
        raise ValueError("cmds: %s\nstderr:%s\nstdout:%s" %
                         (" ".join(cmds), stderr, stdout))
    return output
def snp_stats(vf, af, stat='avg_het', flank=500):
    """Return a Series named `stat` with a per-feature rate, keyed by name.

    Features in `vf` are extended by `flank` bases on both sides (hg19
    chromosome sizes) and intersected with the annotations in `af`. For each
    extended feature the values in column 5 of the overlapping annotations
    are summed and divided by ``2 * flank``. Features without any overlap do
    not appear in the result.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("inside snp_stats")
    v = BedTool(vf)
    feats = BedTool(af)
    flanks = v.slop(g=pybedtools.chromsizes('hg19'), b=flank)
    intersection = feats.intersect(flanks, wb=True)
    results = {}
    if len(intersection) > 0:
        # groupby needs its input sorted on the grouping columns; sort the
        # intersection file in place.
        sort_cmd = 'sort -k6,6 -k7,7n -k8,8n -k9,9 %s -o %s' % (intersection.fn, intersection.fn)
        call(sort_cmd, shell=True)
        annots = intersection.groupby(g=[6,7,8,9], c=5, o='collapse')

        for entry in annots:
            rates = entry[4].split(',')
            # sum() with a float start value replaces the `reduce` builtin,
            # which no longer exists on Python 3.
            tot = sum((float(r) for r in rates), 0.)
            rate = tot / (flank * 2)
            results[entry.name] = rate

    print("exiting snp_stats")
    return Series(results, name=stat)
def bed_bins(step, genome='mm9'):
    """Return a BedTool of consecutive `step`-wide bins covering `genome`.

    Bin edges start at each chromosome's start coordinate and advance by
    `step`; a final partial bin at the chromosome end is not emitted. Every
    bin gets a score of 0. Columns are chrom, start, end, score.
    """
    gnm = pb.chromsizes(genome)
    cols = ['chrom', 'start', 'end', 'score']

    per_chrom = []
    for chrm in gnm.keys():
        edges = np.arange(gnm[chrm][0], gnm[chrm][1], step)
        bins = pd.DataFrame()
        bins['start'] = edges[:-1]
        bins['end'] = edges[1:]
        bins['chrom'] = chrm
        bins['score'] = 0
        per_chrom.append(bins)

    # Concatenate once: growing a frame inside the loop copies all earlier
    # rows on every iteration, and seeding it with an empty DataFrame relies
    # on pandas ignoring empty frames when it determines dtypes.
    if per_chrom:
        all_bins = pd.concat(per_chrom)[cols]
    else:
        all_bins = pd.DataFrame(columns=cols)
    bin_bed = pb.BedTool.from_dataframe(all_bins)
    return bin_bed
def bam_to_bigwig(bam, genome, output):
    """
    Given a BAM file `bam` and assembly `genome`, create a bigWig file scaled
    such that the values represent scaled reads -- that is, reads per million
    mapped reads.

    Assumes that `bedGraphToBigWig` from UCSC tools is installed; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    Raises ValueError if `bedGraphToBigWig` exits with a non-zero status.
    """
    import subprocess  # local import keeps this block self-contained

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    readcount = mapped_read_count(bam)
    scale = 1 / (readcount / 1e6)
    x = pybedtools.BedTool(bam)\
        .genome_coverage(bg=True, scale=scale, split=True, g=genome_file)

    # Argument list without a shell: safe for paths containing spaces or
    # shell metacharacters.
    cmds = ['bedGraphToBigWig', x.fn, genome_file, output]
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
def wig_to_bigwig(wig, genome, output):
    """Create the bigWig `output` from ``wig.fn`` and return its name.

    Runs UCSC `wigToBigWig` with a chromosome-sizes file for `genome`.

    Raises FileNotFoundError when `wigToBigWig` is not on the path and
    ValueError when it exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['wigToBigWig', wig.fn, genome_file, output]
    try:
        p = subprocess.Popen(cmds,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
    except FileNotFoundError:
        # Report the tool this function runs, not an unrelated UCSC tool.
        raise FileNotFoundError(
            "wigToBigWig was not found on the path. This is an external "
            "tool from UCSC which can be downloaded from "
            "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatively, use "
            "`conda install ucsc-wigtobigwig`"
        )
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
def complement_bedx(bedx, genome):
    """Run ``complement`` on `bedx` and return the result as a BED3 value.

    `bedx` must be an instance of one of gqltypes.complementable_types; its
    ``val`` is opened as a BedTool. `genome` is either an assembly name
    (``str``) that pybedtools.chromsizes can resolve, or a gqltypes.GENOME
    whose ``val`` is passed as the genome file.

    The result is registered with add_tmp_file before it is returned. As a
    side effect pybedtools.settings.KEEP_TEMPFILES is set to True.

    Raises ToolsException for an unsupported `bedx` type, an unresolvable
    genome name, or a `genome` of the wrong type.
    """
    pybedtools.settings.KEEP_TEMPFILES = True

    allowed_types = gqltypes.complementable_types

    if not (type(bedx) in allowed_types):
        # Report the offending argument itself; the name `ident` used here
        # before is not defined in this function. Fall back to the Python
        # type name when the object carries no `name` attribute.
        bedx_name = getattr(bedx, 'name', type(bedx).__name__)
        raise ToolsException('Type mismatch in COMPLEMENT. ' +
                             bedx_name + ' not supported.',
                             'complement_bedx')

    kwargs = {}

    if type(genome) is str:
        try:
            # Called only to verify that the genome can be resolved.
            pybedtools.chromsizes(genome)
            kwargs['genome'] = genome
        except Exception:
            raise ToolsException(
                'Error locating and/or retrieve genome ' +
                genome + ' in COMPLEMENT.',
                'complement_bedx')
    else:
        if type(genome) is gqltypes.GENOME:
            kwargs['g'] = genome.val
        else:
            raise ToolsException(
                'Type mismatch in COMPLEMENT. GENOME expect ' +
                'but ' + genome.name + ' encountered.',
                'complement_bedx')

    a = pybedtools.BedTool(bedx.val)
    r = a.complement(**kwargs)

    output_type = gqltypes.BED3
    result = output_type(r.fn, True)

    add_tmp_file(result)

    return result
## Log for number of lines processed ... if (counter % LOG_EVERY_N) == 0: print '[INFO ' + strftime("%Y-%m-%d %H:%M:%S", gmtime()) + ' ] ' + str( LOG_EVERY_N * factor) + ' fragments processed ...' factor += 1 else: lastLine = LOG_EVERY_N * (factor - 1) if counter != lastLine: ## Last Line processed log print '[INFO ' + strftime("%Y-%m-%d %H:%M:%S", gmtime()) + ' ] ' + str(counter) + ' fragments processed ...' fp.close() ## generate a list of bins totalBins = int(pybedtools.chromsizes('hg19')[args.chrom][1]) / int(args.res) bins = list() for splits in range(0, totalBins): bins.append(str(args.chrom) + ":" + str(splits + 1)) ## generate a list of bins print '\nSaving results to a matrix-like tab-delimited file' start = int(args.subset.split('-')[0]) / int(args.res) stop = (int(args.subset.split('-')[1]) / int(args.res)) + 1 ## Build matrix ------ fp = open(args.outFile, 'w') for z in range(start, stop): ## Header print if z == start: fp.write('%s\t%s\t' % ('bins', bins[z]))
#!/usr/bin/env python import os import subprocess import logging import hashlib import urllib import pybedtools import gffutils import metaseq logging.basicConfig( level=logging.DEBUG, format='[%(name)s] [%(asctime)s]: %(message)s') logger = logging.getLogger('metaseq data download') hg19 = pybedtools.chromsizes('hg19') genomes_file = pybedtools.chromsizes_to_file('hg19', 'hg19') usage = """ Downloads data from UCSC, GEO, and Ensembl. """ import argparse ap = argparse.ArgumentParser(usage=usage) ap.add_argument( '--data-dir', default=metaseq.data_dir(), help='Location to store downloaded and prepped data. ' 'Default is %(default)s') args = ap.parse_args() CHROM = 'chr17'
def main():
    """
    Creates a pairwise matrix containing overlapping feature counts for many
    BED files

    Parses the command line, picks the pairwise function (enrichment score,
    fraction overlapped, or raw intersection count), builds the matrix with
    ``create_matrix`` and writes it to stdout as a tab-delimited table whose
    rows and columns are the sorted matrix keys.

    Exits with status 1 (after printing help) when neither BED files nor
    ``--test`` are given, and with an argparse usage error when
    ``--enrichment`` is requested without ``--genome``.
    """
    ap = argparse.ArgumentParser(usage=usage)
    ap.add_argument('beds', nargs="*", help='BED/GTF/GFF/VCF filenames, e.g., '
                    'in a directory of bed files, you can use *.bed')
    ap.add_argument('--frac', action='store_true',
                    help='Instead of counts, report fraction overlapped')
    ap.add_argument('--enrichment', action='store_true',
                    help='Run randomizations (default 1000, specify otherwise '
                    'with --iterations) on each pairwise comparison and '
                    'compute the enrichment score as '
                    '(actual intersection count + 1) / (median randomized + 1)'
                    )
    ap.add_argument('--genome', help='Required argument if --enrichment is '
                    'used. Needs to be a string assembly name like "dm3" or '
                    '"hg19"')
    ap.add_argument('--iterations', default=1000, type=int,
                    help='Number of randomizations to perform for enrichement '
                    'scores')
    ap.add_argument('--processes', default=None, type=int,
                    help='Number of CPUs to use for randomization')
    ap.add_argument('--test', action='store_true', help='Ignore any input BED '
                    'files and use test BED files')
    ap.add_argument('-v', '--verbose', action='store_true',
                    help='Be verbose: print which files are '
                    'currently being intersected and timing info at the end.')
    args = ap.parse_args()

    if not args.beds and not args.test:
        ap.print_help()
        sys.exit(1)

    # --genome is documented as mandatory with --enrichment; fail early with
    # a usage message instead of handing None to pybedtools.chromsizes().
    if args.enrichment and not args.genome:
        ap.error('--genome is required when --enrichment is used')

    if args.test:
        # insulator binding sites from ChIP-chip -- 4 proteins, 2 cell types
        # Genes Dev. 2009 23(11):1338-1350
        args.beds = [example_filename(i) for i in [
            'Cp190_Kc_Bushey_2009.bed',
            'Cp190_Mbn2_Bushey_2009.bed',
            'CTCF_Kc_Bushey_2009.bed',
            'CTCF_Mbn2_Bushey_2009.bed',
            'SuHw_Kc_Bushey_2009.bed',
            'SuHw_Mbn2_Bushey_2009.bed',
            'BEAF_Mbn2_Bushey_2009.bed',
            'BEAF_Kc_Bushey_2009.bed'
        ]]

    # Select the pairwise function; --enrichment takes precedence over --frac.
    if args.enrichment:
        FUNC = enrichment_score
        genome_fn = pybedtools.chromsizes_to_file(
            pybedtools.chromsizes(args.genome))
        kwargs = dict(genome_fn=genome_fn, iterations=args.iterations,
                      processes=args.processes)
    elif args.frac:
        FUNC = frac_of_a
        kwargs = {}
    else:
        FUNC = actual_intersection
        kwargs = {}

    t0 = time.time()
    matrix = create_matrix(beds=args.beds, func=FUNC, verbose=args.verbose,
                           **kwargs)
    t1 = time.time()

    nfiles = len(args.beds)

    if args.verbose:
        sys.stderr.write('Time to construct %s x %s matrix: %.1fs'
                         % (nfiles, nfiles, (t1 - t0)) + '\n')

    # Header row starts with an empty cell; each following row is the key
    # followed by its values in the same (sorted) column order.
    keys = sorted(matrix.keys())
    sys.stdout.write("\t" + "\t".join(keys) + '\n')
    for k in keys:
        sys.stdout.write(k)
        for j in keys:
            sys.stdout.write('\t' + str(matrix[k][j]))
        sys.stdout.write('\n')
def __init__(self, file, genome, chrom, matrix_dim, incr_column=None, default_chroms=True): """ Subclass of HilbertNormalized that represents a genomic HilbertMatrix. If `default_chroms` is True, then only use the pybedtools-defined "default" chromosomes. For example, this will be only the autosomes and X and Y for human, or just the euchromatic chromosomes for dm3. """ self.file = file self.genome = genome self.chrom = chrom self.use_chrom_range = False # grab the dict of chrom lengths for this genome if isinstance(self.genome, basestring): self.chromdict = pbt.chromsizes(self.genome) if default_chroms: try: self.chromdict = self.chromdict.default except AttributeError: raise ValueError( "Early version of pybedtools, or no chromosome " "default set for genome %s. Use " "`default_chroms=False` instead." % self.genome) elif isinstance(self.genome, dict): self.chromdict = self.genome else: raise ValueError( '`genome` must be either a string assembly name ' ' or a dictionary of chrom:(start, stop)') if self.chrom != "genome": chrom_range_tuple = get_interval_from_string(self.chrom) if chrom_range_tuple is None: # grab the length of the requested chromosome self.chrom_length = self.chromdict[self.chrom][1] else: (self.chrom, self.range_start, self.range_end) = chrom_range_tuple self.chrom_length = self.range_end - self.range_start self.use_chrom_range = True print self.chrom, "size: ", else: # using the entire genome for our coordinate system self.chrom_length = 0 curr_offset = 0 self.chrom_offsets = {} self.chrom_offsets_list = [] self.chrom_names_list = [] self.chrom_d = {} for chrom in self.chromdict: self.chrom_offsets[chrom] = curr_offset self.chrom_d[chrom] = curr_offset / (matrix_dim * matrix_dim) self.chrom_offsets_list.append(curr_offset) self.chrom_names_list.append(chrom) self.chrom_length += self.chromdict[chrom][1] curr_offset += self.chromdict[chrom][1] print "genome size: ", print self.chrom_length super(HilbertMatrix, self).__init__(matrix_dim, 
self.chrom_length) print "using matrix of size", self.matrix_dim, "there are", \ self.ncells, "cells in the matrix and each cell represents", \ int(self.dist_per_cell), "base pairs." self.incr_column = incr_column self.num_intervals = 0 self.total_interval_length = 0 chrom_offsets = [] chrom_names = [] self.temp_files = [] # populate the matrix with the data contained in self.file self.build() self.dump_matrix()
def rebin(configfile, genome, binsize=200, quiet=False, binned_dir='binned'):
    """
    Split bigwig/bedGraph files by chromosome, and interpolate signal into
    `binsize` bins.

    For every ``(celltype, mark, path, control)`` entry of
    ``Config(configfile).config`` one file per chromosome is written to
    ``{binned_dir}/{celltype}-{mark}-{binsize}-{chrom}.binned``.  Each file
    holds a ``celltype<TAB>chrom`` line, a ``mark`` line, and then one
    interpolated value per bin.

    :param configfile: passed to ``Config``.
    :param genome: passed to ``pybedtools.chromsizes``; the last element of
        each chromosome's entry is used as the chromosome end.
    :param binsize: spacing of the interpolation grid.
    :param quiet: if True, the module logger is disabled for the duration
        of the call and re-enabled afterwards, even on error.
    :param binned_dir: output directory, created if missing.
    :raises subprocess.CalledProcessError: if ``bigWigToBedGraph`` exits
        with a non-zero status.
    """
    # Local import so this block does not depend on a file-level import.
    import subprocess

    if quiet:
        logger.disabled = True
    try:
        config = Config(configfile)
        chromsizes = pybedtools.chromsizes(genome)
        if not os.path.exists(binned_dir):
            os.makedirs(binned_dir)

        for celltype, mark, path, control in config.config:
            # `{chrom}` is left as a placeholder to be filled in per file.
            output_pattern = (
                '{binned_dir}/{celltype}-{mark}-{binsize}-'
                '{{chrom}}.binned'.format(
                    binned_dir=binned_dir, celltype=celltype, mark=mark,
                    binsize=binsize))
            logger.info('{path} -> {output_pattern}'.format(
                path=path, output_pattern=output_pattern))

            # convert to bedGraph if needed
            if is_bigwig(path):
                bg = path + '.bedgraph'
                if not os.path.exists(bg):
                    logger.info('converting to bedgraph')
                    # Argument list instead of a shell string: paths with
                    # spaces or shell metacharacters are passed verbatim,
                    # and a failed conversion is no longer ignored.
                    try:
                        subprocess.check_call(
                            ['bigWigToBedGraph', path, bg])
                    except Exception:
                        # Do not leave a partial file behind that a later
                        # run would pick up as "already exists".
                        if os.path.exists(bg):
                            os.unlink(bg)
                        raise
                else:
                    logger.info('%s already exists, using it' % bg)
            else:
                bg = path

            def write_interpolated_results(x, y, chrom):
                """
                interpolation and file-creation happens here
                """
                logger.info(chrom)
                filename = output_pattern.format(chrom=chrom)
                max_pos = chromsizes[chrom][-1]
                xi = np.arange(0, max_pos, binsize)
                # bins outside the observed signal are written as -1
                yi = np.interp(xi, np.array(x), np.array(y),
                               left=-1, right=-1)
                with open(filename, 'w') as fout:
                    fout.write('%s\t%s\n' % (celltype, chrom))
                    fout.write('%s\n' % mark)
                    for yii in yi:
                        fout.write('%s\n' % yii)

            chrom = None
            x = []
            y = []
            for i in pybedtools.BedTool(bg):
                # A change of chromosome flushes what has been collected
                # (assumes the bedGraph is grouped by chromosome).
                if (i.chrom != chrom) and (chrom is not None):
                    write_interpolated_results(x, y, chrom)
                    x = []
                    y = []

                # use the midpoint of each bedgraph feature
                x.append(i.start + (i.stop - i.start) // 2)
                y.append(float(i[-1]))
                chrom = i.chrom

            # last one; nothing to write if the file had no features
            if chrom is not None:
                write_interpolated_results(x, y, chrom)
            else:
                logger.warning('%s contains no features, skipping' % bg)
    finally:
        if quiet:
            logger.disabled = False
def __init__(self, config, debug=False):
    """
    Build one Hilbert matrix per (chromosome, data file) pair described by
    a configuration.

    This class is meant for programmatic use; the HilbertGUI subclass adds
    the GUI elements on top of it.

    :param config:
        Handed to ``self._parse_config``.  If a string, it is treated as
        the filename of a YAML config file; if a dictionary, as the config
        itself.  Each entry of ``config['data']`` names a file and a
        matplotlib colormap; an optional true ``bigwig`` key selects
        ``HilbertMatrixBigWig`` instead of ``HilbertMatrix``.  All matrices
        are meant to be drawn on the same Axes, so colors get hard to read
        beyond roughly three files.

        Example config dict::

            {
             'dim': 128,
             'genome': 'hg19',
             'chrom': 'chr10',
             'data': [
                   {'filename': '../data/cpg-islands.hg19.chr10.bed',
                    'colormap': 'Blues'},

                   {'filename': '../data/refseq.chr10.exons.bed',
                    'colormap': 'Reds'}
                   ]
            }

        Example YAML file::

            dim: 128
            chrom: chr10
            genome: hg19
            data:
                -
                    filename: ../data/cpg-islands.hg19.chr10.bed
                    colormap: Blues

                -
                    filename: ../data/refseq.chr10.exons.bed
                    colormap: Reds

    :param debug:
        If True, then print some extra debugging info
    """
    self.config = self._parse_config(config)
    self.matrix_dim = self.config['dim']

    # Arguments shared by every matrix built below.
    hilbert_matrix_kwargs = dict(
        matrix_dim=self.config['dim'],
        genome=self.config['genome'])

    # Two-level mapping, chromosome -> filename -> matrix object:
    #
    #   self.hilberts[chrom][filename] = HM
    self.hilberts = defaultdict(dict)

    # Colormaps do not depend on the chromosome, so one level suffices:
    #
    #   self.colormaps[filename] = cmap
    self.colormaps = {}

    # The literal value 'chroms' expands to the genome's default
    # chromosomes; a single name is wrapped into a one-element list.
    chroms = self.config['chrom']
    if chroms == 'chroms':
        chroms = pbt.chromsizes(self.config['genome']).default.keys()
    if isinstance(chroms, basestring):
        chroms = [chroms]
    self.chroms = chroms

    self.fns = []
    for entry in self.config['data']:
        if entry.get('bigwig', False):
            HilbertClass = HilbertMatrixBigWig
        else:
            HilbertClass = HilbertMatrix

        fn = entry['filename']
        self.fns.append(fn)
        self.colormaps[fn] = getattr(matplotlib.cm, entry['colormap'])

        for chrom in self.chroms:
            matrix = HilbertClass(fn, chrom=chrom, **hilbert_matrix_kwargs)
            matrix.mask_low_values()
            self.hilberts[chrom][fn] = matrix

    self.debug = debug
    self.nfiles = len(self.config['data'])
    self.nchroms = len(chroms)
    self.annotation_ax = None
# Python packages or genomic Python packages, this may take a while. # # pip install . import metaseq import pybedtools import numpy as np from matplotlib import pyplot as plt bam = metaseq.genomic_signal('Mcf7Max.sorted.bam', 'bam') cpg = pybedtools.BedTool('cpg.bed') tss = pybedtools.BedTool('HIF_sites_invovled_in_looping_not_at_promoter.bed') # extend by 5 kb up/downstream tss = tss.slop(b=5000, g=pybedtools.chromsizes('hg19')) tss_with_cpg = tss.intersect(cpg, u=True) tss_without_cpg = tss.intersect(cpg, v=True) # change this to as many CPUs as you have in order to run in parallel processes = 1 # each read will be extended 3' to a total size of this many bp fragment_size = 200 # the region +/-5kb around each TSS will be split into a total of 100 bins, # change as needed bins = 100 x = np.linspace(-5000, 5000, bins)
def average_gerp(vf, af, flank=50):
    """
    Run ``gerp`` on the features of `vf` after padding them with `flank`.

    :param vf: anything ``BedTool`` accepts; its features are padded with
        ``slop(b=flank)`` using the hg19 chromosome sizes.
    :param af: forwarded unchanged as the second argument of ``gerp``.
    :param flank: value passed as ``b`` to ``slop`` (default 50).
    :returns: whatever ``gerp`` returns when called with the padded file
        and ``name="avg_gerp"``.
    """
    variants = BedTool(vf)
    hg19_sizes = pybedtools.chromsizes('hg19')
    padded = variants.slop(g=hg19_sizes, b=flank)
    return gerp(padded.fn, af, name="avg_gerp")