def create_new_header(infile, mappings, outfile):
    """Create new header in BigWig, with UCSC chromosome names."""
    with pyBigWig.open(infile) as bw:
        if set(bw.chroms().keys()).issubset(mappings.values()):
            # If chromosome names are already UCSC, just rename input file to output name.
            # Exit with status 0 since this is normal behavior.
            os.rename(infile, outfile)
            sys.exit(0)

        hdr = [(mappings[chrom], length) for chrom, length in bw.chroms().items() if chrom in mappings]

        if not hdr:
            msg = "None of the chromosomes in the input file has a valid UCSC pair. No mapping will be done."
            print(warning(msg))
            os.rename(infile, outfile)
            sys.exit(0)

        seq_num = 0
        with pyBigWig.open(outfile, 'w') as bw_output:
            bw_output.addHeader(hdr)
            for chrom, length in bw.chroms().items():
                ints = bw.intervals(chrom, 0, length)
                if ints and chrom in mappings:
                    bw_output.addEntries([mappings[chrom]] * len(ints),
                                         [x[0] for x in ints],
                                         ends=[x[1] for x in ints],
                                         values=[x[2] for x in ints])
                elif chrom not in mappings:
                    seq_num += 1
                    print('UCSC chromosome/contig mapping for {} is missing'.format(chrom))

        if seq_num > 0:
            print(warning("UCSC chromosome/contig mapping for {} sequence(s) is missing. "
                          "These sequence(s) will not be included in the bigWig file.".format(seq_num)))

def getChromSizes(bigwigFilesList):
    """
    Get chromosome sizes from bigWig file with pyBigWig

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Chromosome name(s) and size(s).
    >>> assert(getChromSizes([test.bwFile1, test.bwFile2]) == ([('3R', 200)], set([])))
    """
    def print_chr_names_and_size(chr_set):
        sys.stderr.write("chromosome\tlength\n")
        for name, size in chr_set:
            sys.stderr.write("{0:>15}\t{1:>10}\n".format(name, size))

    bigwigFilesList = bigwigFilesList[:]

    common_chr = set()
    for fname in bigwigFilesList:
        fh = pyBigWig.open(fname)
        common_chr = common_chr.union(set(fh.chroms().items()))
        fh.close()

    non_common_chr = set()
    for bw in bigwigFilesList:
        _names_and_size = set(pyBigWig.open(bw).chroms().items())
        if len(common_chr & _names_and_size) == 0:
            # try to add or remove 'chr' from the chromosome name
            _corr_names_size = set()
            for chrom_name, size in _names_and_size:
                if chrom_name.startswith('chr'):
                    _corr_names_size.add((chrom_name[3:], size))
                else:
                    _corr_names_size.add(('chr' + chrom_name, size))
            if len(common_chr & _corr_names_size) == 0:
                message = "No common chromosomes found. Are the bigwig files " \
                          "from the same species and same assemblies?\n"
                sys.stderr.write(message)
                print_chr_names_and_size(common_chr)

                sys.stderr.write("\nand the following is the list of the unmatched chromosome and chromosome\n"
                                 "lengths from file\n{}\n".format(bw))
                print_chr_names_and_size(_names_and_size)
                exit(1)
            else:
                _names_and_size = _corr_names_size

        non_common_chr |= common_chr ^ _names_and_size
        common_chr = common_chr & _names_and_size

    if len(non_common_chr) > 0:
        sys.stderr.write("\nThe following chromosome names did not match between the bigwig files\n")
        print_chr_names_and_size(non_common_chr)

    # get the list of common chromosome names and sizes
    return sorted(common_chr), non_common_chr

def _generate_chunk_output_file(self, i=None):
    records = [
        ("chr1", 1, 2, 1.5),
        ("chr1", 2, 3, 4.5),
        ("chr1", 3, 4, 1.9),
        ("chr1", 4, 5, 0.45),
        ("chr2", 8, 9, 1.0),
        ("chr2", 9, 10, 6.7)
    ]
    fn = tempfile.NamedTemporaryFile(suffix=".bw").name
    _records = records[(i * 3):(i * 3) + 3]
    assert len(_records) == 3
    ranges = {}
    for rec in _records:
        seqid = rec[0]
        pos = rec[1]
        ranges.setdefault(seqid, (sys.maxint, 0))
        ranges[seqid] = (min(ranges[seqid][0], pos),
                         max(ranges[seqid][1], pos))
    bw = pyBigWig.open(fn, "w")
    regions = [(s, ranges[s][1] + 1) for s in sorted(ranges.keys())]
    bw.addHeader(regions)
    bw.addEntries([rec[0] for rec in _records],
                  [rec[1] - 1 for rec in _records],
                  ends=[rec[2] - 1 for rec in _records],
                  values=[rec[3] for rec in _records])
    bw.close()
    return fn

def main():
    usage = 'usage: %prog [options] <in_bw_file> <out_h5_file>'
    parser = OptionParser(usage)
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input BigWig and output HDF5.')
    else:
        bw_file = args[0]
        hdf5_file = args[1]

    # open files
    bw_in = pyBigWig.open(bw_file)
    h5_out = h5py.File(hdf5_file, 'w')

    # for each chromosome
    chrom_lengths = bw_in.chroms()
    for chrom in chrom_lengths:
        if options.verbose:
            print(chrom)

        # read values
        x = bw_in.values(chrom, 0, chrom_lengths[chrom], numpy=True).astype('float16')

        # write gzipped into HDF5
        h5_out.create_dataset(chrom, data=x, dtype='float16', compression='gzip', shuffle=True)

    # close files
    h5_out.close()
    bw_in.close()

def gerprunner():
    import pyBigWig
    b = pyBigWig.open("/scratch/ucgd/lustre/u1021864/serial/hg19.gerp.bw")
    # x = list(range(1,23)); x.append("X"), x.append("Y")
    input = sys.argv[1]
    iterator = JimFile(input)
    iterable = windower(iterator, chunker(1))
    cutoff = 1e-3

    def genchunks():
        nsmall = 0
        for i, chunk in enumerate(iterable):
            #if len(chunk) < 5:
            #    continue
            score = b.stats("chr" + chunk[0].chrom, chunk[0].start, chunk[-1].end)
            yield chunk, score[0]
            if i % 100000 == 0:
                print i, chunk[0].chrom, chunk[0].start, score
        print >>sys.stderr, nsmall, "removed for being too short"
        print >>sys.stderr, i, "total chunks"

    vcf_path = "/scratch/ucgd/lustre/u1021864/serial/clinvar-anno.vcf.gz"
    res = eval2(genchunks(), vcf_path,
                "/scratch/ucgd/lustre/u1021864/serial/esp-common.vcf.gz")
    print metrics(res[True], res[False], "gerp.auc.png")

def testBigBed(self):
    fname = "http://www.encodeproject.org/files/ENCFF001JBR/@@download/ENCFF001JBR.bigBed"
    bb = pyBigWig.open(fname)
    assert(bb is not None)
    assert(bb.isBigWig() == 0)
    assert(bb.isBigBed() == 1)
    SQL = """table RnaElements
"BED6 + 3 scores for RNA Elements data "
    (
    string chrom;      "Reference sequence chromosome or scaffold"
    uint   chromStart; "Start position in chromosome"
    uint   chromEnd;   "End position in chromosome"
    string name;       "Name of item"
    uint   score;      "Normalized score from 0-1000"
    char[1] strand;    "+ or - or . for unknown"
    float  level;      "Expression level such as RPKM or FPKM. Set to -1 for no data."
    float  signif;     "Statistical significance such as IDR. Set to -1 for no data."
    uint   score2;     "Additional measurement/count e.g. number of reads. Set to 0 for no data."
    )
"""
    output = bb.SQL()
    if isinstance(output, bytes):
        output = output.decode('ASCII')
    assert(output == SQL)
    o = bb.entries('chr1', 10000000, 10020000)
    expected = [(10009333, 10009640, '61035\t130\t-\t0.026\t0.42\t404'),
                (10014007, 10014289, '61047\t136\t-\t0.029\t0.42\t404'),
                (10014373, 10024307, '61048\t630\t-\t5.420\t0.00\t2672399')]
    assert(o == expected)
    bb.close()

def mhsmidkernelsmooth(bamfile, bwfile, maxinsert=80, mininsert=1, paired=False, kernelsize=30):
    bamfor = Baminfo.Baminfo(bamfile)
    bw = pyBigWig.open(bwfile, "w")
    bw.addHeader(list(bamfor.chrlen.items()))
    for chromosome in bamfor.chrlen:
        end = bamfor.chrlen[chromosome]
        mhsmidcount = mhsbam.mhsmidcount(bamfile=bamfile, chromosome=chromosome,
                                         start=1, end=end, maxinsert=maxinsert,
                                         mininsert=mininsert, paired=paired)
        mhsmidsmoothed = kernelsmooth(mhsmidcount, 1, end, end, kernelsize)
        if mhsmidsmoothed:
            starts = list()
            values = list()
            for start in sorted(mhsmidsmoothed):
                starts.append(start)
                values.append(float(mhsmidsmoothed[start]))
            bw.addEntries(chromosome, starts=starts, values=values, span=1, step=1)
    bw.close()

def coverage_from_bigwig(self, bigwig_file, stepsize=100):
    """Return list of arrays describing the coverage of each genomicRegions from <bigwig_file>.

    *Keyword arguments:*

    - bigwig_file -- path to bigwig file
    - stepsize -- used stepsize

    *Output:*

    Class variable <coverage>: a list where the elements correspond to the GenomicRegion. The list elements give
    the number of reads falling into the GenomicRegion.
    """
    self.coverage = []
    bwf = pyBigWig.open(bigwig_file)

    for gr in self.genomicRegions:
        steps = int(len(gr) / stepsize)
        try:
            ds = bwf.stats(gr.chrom, gr.initial, gr.final, type="mean", nBins=steps)
            ds = [x if x else 0 for x in ds]
        except:
            ds = [0] * steps
        self.coverage.append(np.array(ds))

    bwf.close()

def coveragetobw(bamfile, bwfile, maxinsert, mininsert, paired=False):
    bamfor = Baminfo.Baminfo(bamfile)
    bw = pyBigWig.open(bwfile, "w")
    bw.addHeader(list(bamfor.chrlen.items()))
    for chromosome in bamfor.chrlen:
        end = bamfor.chrlen[chromosome]
        coveragecount = mhsbam.coveragecount(bamfile=bamfile, chromosome=chromosome,
                                             start=1, end=end, maxinsert=maxinsert,
                                             mininsert=mininsert, paired=paired)
        if coveragecount:
            starts = list()
            values = list()
            for start in sorted(coveragecount):
                starts.append(start)
                values.append(float(coveragecount[start]))
            bw.addEntries(chromosome, starts=starts, values=values, span=1, step=1)
    bw.close()

def dhscutkernelsmooth(bamfile, bwfile, library='Duke', kernelsize=200):
    bamfor = Baminfo.Baminfo(bamfile)
    bw = pyBigWig.open(bwfile, "w")
    bw.addHeader(list(bamfor.chrlen.items()))
    for chromosome in bamfor.chrlen:
        end = bamfor.chrlen[chromosome]
        dhscut = dhsbam.dhcutcount(bamfile=bamfile, chromosome=chromosome,
                                   start=1, end=end, library=library)
        dhscutsmoothed = kernelsmooth(dhscut, 1, end, end, kernelsize)
        if dhscutsmoothed:
            starts = list()
            values = list()
            for start in sorted(dhscutsmoothed):
                starts.append(start)
                values.append(float(dhscutsmoothed[start]))
            bw.addEntries(chromosome, starts=starts, values=values, span=1, step=1)
    bw.close()

def doWrite2(self):
    '''
    Test all three modes of storing entries. Also test to ensure that we get
    error messages when doing something silly

    This is a modified version of the writing example from libBigWig
    '''
    chroms = ["1"] * 6
    starts = [0, 100, 125, 200, 220, 230, 500, 600, 625, 700, 800, 850]
    ends = [5, 120, 126, 205, 226, 231]
    values = [0.0, 1.0, 200.0, -2.0, 150.0, 25.0, 0.0, 1.0, 200.0, -2.0,
              150.0, 25.0, -5.0, -20.0, 25.0, -5.0, -20.0, 25.0]
    ofile = tempfile.NamedTemporaryFile(delete=False)
    oname = ofile.name
    ofile.close()
    bw = pyBigWig.open(oname, "w")
    bw.addHeader([("1", 1000000), ("2", 1500000)])

    #Intervals
    bw.addEntries(chroms[0:3], starts[0:3], ends=ends[0:3], values=values[0:3])
    bw.addEntries(chroms[3:6], starts[3:6], ends=ends[3:6], values=values[3:6])

    #IntervalSpans
    bw.addEntries("1", starts[6:9], values=values[6:9], span=20)
    bw.addEntries("1", starts[9:12], values=values[9:12], span=20)

    #IntervalSpanSteps, this should instead take an int
    bw.addEntries("1", 900, values=values[12:15], span=20, step=30)
    bw.addEntries("1", 990, values=values[15:18], span=20, step=30)

    #Attempt to add incorrect values. These MUST raise an exception
    try:
        bw.addEntries(chroms[0:3], starts[0:3], ends=ends[0:3], values=values[0:3])
        assert(1 == 0)
    except RuntimeError:
        pass
    try:
        bw.addEntries("1", starts[6:9], values=values[6:9], span=20)
        assert(1 == 0)
    except RuntimeError:
        pass
    try:
        bw.addEntries("3", starts[6:9], values=values[6:9], span=20)
        assert(1 == 0)
    except RuntimeError:
        pass
    try:
        bw.addEntries("1", 900, values=values[12:15], span=20, step=30)
        assert(1 == 0)
    except RuntimeError:
        pass

    #Add a few intervals on a new chromosome
    bw.addEntries(["2"] * 3, starts[0:3], ends=ends[0:3], values=values[0:3])
    bw.close()

    #check md5sum, this is the simplest method to check correctness
    h = hashlib.md5(open(oname, "rb").read()).hexdigest()
    assert(h == "b1ca91d2ff42afdd2efa19a007c1ded4")
    #Clean up
    os.remove(oname)

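# For reference, a standalone sketch of the three addEntries() call forms
# exercised by the test above. The file name and values are illustrative only.
import tempfile
import pyBigWig

fname = tempfile.NamedTemporaryFile(suffix=".bw", delete=False).name
bw = pyBigWig.open(fname, "w")
bw.addHeader([("chr1", 1000)])
# 1) "Intervals": per-entry chrom, start, end and value (bedGraph-like)
bw.addEntries(["chr1", "chr1"], [0, 100], ends=[50, 150], values=[1.0, 2.0])
# 2) "IntervalSpans": one chrom, explicit starts, one shared span
bw.addEntries("chr1", [200, 300], values=[3.0, 4.0], span=20)
# 3) "IntervalSpanSteps": one chrom, single integer start, fixed span and step
bw.addEntries("chr1", 500, values=[5.0, 6.0], span=20, step=30)
bw.close()
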
def fetch_from_bigbed(path, chrom, start, end):
    import pyBigWig
    bed = pyBigWig.open(path)
    assert bed.isBigBed(), "Oops, for some reason I was expecting a bed file: {}".format(path)

    chrom = match_chrom_format(chrom, bed.chroms().keys())

    for cur_start, cur_end, bed_line in bed.entries(chrom, start, end):
        bed_line = bed_line.split()
        yield tx_from_bedfields([chrom, cur_start, cur_end] + bed_line)

def __init__(self, wig_location):
    """
    Arguments
    ---------
    wig_location: Path to bigwig

    """
    self.wig_location = wig_location
    try:
        self.wig = pyBigWig.open(self.wig_location)
    except Exception as e:
        raise MocaException('Error reading wig file: {}'.format(e))

def readValuesPyBigWig(self, reference, start, end):
    """
    Use pyBigWig package to read a BigWig file for the
    given range and return a protocol object.

    pyBigWig returns an array of values that fill the query range.
    Not sure if it is possible to get the step and span.

    This method trims NaN values from the start and end.

    pyBigWig throws an exception if end is outside of the
    reference range. This function checks the query range
    and throws its own exceptions to avoid the ones thrown
    by pyBigWig.
    """
    if not self.checkReference(reference):
        raise exceptions.ReferenceNameNotFoundException(reference)
    if start < 0:
        start = 0
    bw = pyBigWig.open(self._sourceFile)
    referenceLen = bw.chroms(reference)
    if referenceLen is None:
        raise exceptions.ReferenceNameNotFoundException(reference)
    if end > referenceLen:
        end = referenceLen
    if start >= end:
        raise exceptions.ReferenceRangeErrorException(
            reference, start, end)

    data = protocol.Continuous()
    curStart = start
    curEnd = curStart + self._INCREMENT
    while curStart < end:
        if curEnd > end:
            curEnd = end
        for i, val in enumerate(bw.values(reference, curStart, curEnd)):
            if not math.isnan(val):
                if len(data.values) == 0:
                    data.start = curStart + i
                data.values.append(val)
                if len(data.values) == self._MAX_VALUES:
                    yield data
                    data = protocol.Continuous()
            elif len(data.values) > 0:
                # data.values.append(float('NaN'))
                yield data
                data = protocol.Continuous()
        curStart = curEnd
        curEnd = curStart + self._INCREMENT

    bw.close()
    if len(data.values) > 0:
        yield data

def bedGraphToBigWig(chromSizes, bedGraphPath, bigWigPath, sort=True):
    """
    takes a bedgraph file, orders it and converts it to
    a bigwig file using pyBigWig.
    """
    from tempfile import NamedTemporaryFile
    from os import remove, system

    # Make a list of tuples for the bigWig header, this MUST be sorted
    # identically to the bedGraph file
    sort_cmd = cfg.config.get('external_tools', 'sort')
    _file = NamedTemporaryFile(delete=False)
    for chrom, size in chromSizes:
        _file.write(toBytes("{}\t{}\n".format(chrom, size)))
    _file.close()
    system("LC_ALL=C {} -k1,1 -k2,2n {} > {}.sorted".format(sort_cmd, _file.name, _file.name))
    cl = []
    f = open("{}.sorted".format(_file.name))
    for line in f:
        chrom, chromLen = line.split()
        cl.append((chrom, int(chromLen)))
    f.close()
    remove(_file.name)
    remove("{}.sorted".format(_file.name))

    # check if the file is empty
    if os.stat(bedGraphPath).st_size < 10:
        import sys
        sys.stderr.write(
            "Error: The generated bedGraphFile was empty. Please adjust\n"
            "your deepTools settings and check your input files.\n")
        exit(1)

    if sort:
        # temporary file to store sorted bedgraph file
        _file = NamedTemporaryFile(delete=False)
        tempfilename1 = _file.name
        system("LC_ALL=C {} -k1,1 -k2,2n {} > {}".format(sort_cmd, bedGraphPath, tempfilename1))
        bedGraphPath = tempfilename1

    bw = pyBigWig.open(bigWigPath, "w")
    assert(bw is not None)
    # The lack of maxZooms will change the results a bit, perhaps the defaults are better
    bw.addHeader(cl, maxZooms=10)
    f = open(bedGraphPath)
    for line in f:
        interval = line.split()
        bw.addEntries([interval[0]], [int(interval[1])],
                      ends=[int(interval[2])], values=[float(interval[3])])
    f.close()
    bw.close()

    if sort:
        remove(tempfilename1)

def test_bigwig(self):
    import pyBigWig
    f = pyBigWig.open(self.bw_file)
    for i_rec, rec in enumerate(self.csv_records):
        seqid = re.sub('\"', "", rec[0])
        tpl = int(rec[1]) - 1
        s = int(f.values(seqid, tpl, tpl + 1)[0])
        ipd_minus = (s % 65536) / 100.0
        ipd_plus = (s >> 16) / 100.0
        if rec[2] == "1":
            self.assertAlmostEqual(ipd_minus, float(rec[8]), places=1)
        else:
            self.assertAlmostEqual(ipd_plus, float(rec[8]), places=1)

def gather_bigwig(input_files, output_file):
    import pyBigWig
    chr_lengths = {}
    FileInfo = namedtuple("FileInfo", ("file_name", "seqid", "length"))
    files_info = []
    for file_name in input_files:
        log.info("Reading header info from {f}...".format(f=file_name))
        if op.getsize(file_name) == 0:
            continue
        bw_chunk = pyBigWig.open(file_name)
        for (seqid, length) in bw_chunk.chroms().iteritems():
            chr_lengths.setdefault(seqid, 0)
            chr_lengths[seqid] = max(length, chr_lengths[seqid])
        # record the lexicographically smallest seqid so the chunks can be
        # ordered consistently below
        seqid_min = sorted(bw_chunk.chroms().keys())[0]
        files_info.append(FileInfo(file_name, seqid_min, bw_chunk.chroms()[seqid_min]))
        bw_chunk.close()
    if len(files_info) == 0:
        with open(output_file, "wb") as f:
            return output_file
    files_info.sort(lambda a, b: cmp((a.seqid, a.length), (b.seqid, b.length)))
    bw = pyBigWig.open(output_file, "w")
    regions = [(s, chr_lengths[s]) for s in sorted(chr_lengths.keys())]
    bw.addHeader(regions)
    for file_info in files_info:
        log.info("Reading values from {f}...".format(f=file_info.file_name))
        bw_chunk = pyBigWig.open(file_info.file_name)
        for seqid in sorted(bw_chunk.chroms().keys()):
            seqids, starts, ends, values = [], [], [], []
            chr_max = bw_chunk.chroms()[seqid]
            for i, val in enumerate(bw_chunk.values(seqid, 0, chr_max)):
                if not math.isnan(val):
                    seqids.append(seqid)
                    starts.append(i)
                    ends.append(i + 1)
                    values.append(val)
            bw.addEntries(seqids, starts, ends=ends, values=values)
        bw_chunk.close()
    bw.close()
    return output_file

def big_wig_corr(full, semi, regions):
    full = pyBigWig.open(full)
    semi = pyBigWig.open(semi)
    regions = pybedtools.BedTool(regions)
    full_result = []
    semi_result = []
    for interval in regions:
        gene_full_values = np.array(full.values(interval.chrom, interval.start, interval.stop))
        gene_semi_values = np.array(semi.values(interval.chrom, interval.start, interval.stop))
        filtered_gene_full_values = gene_full_values[~np.isnan(gene_full_values) & (gene_full_values != 0)]
        filtered_gene_semi_values = gene_semi_values[~np.isnan(gene_full_values) & (gene_full_values != 0)]
        filtered_gene_semi_values = np.nan_to_num(filtered_gene_semi_values)
        full_result.append(filtered_gene_full_values)
        semi_result.append(filtered_gene_semi_values)
    full_result = np.concatenate(full_result)
    semi_result = np.concatenate(semi_result)
    return stats.pearsonr(full_result, semi_result)

def convert_bigwig(mapping_table, bw_in_filename, bw_out_filename, verbose=False):
    """ convert chromosome names of a bigwig file according to given mapping_table

    it checks which chromosome names can be correctly mapped, all other
    chromosomes are skipped
    """
    bw = pyBigWig.open(bw_in_filename)
    curr_chroms = bw.chroms()

    final_mapping_table = {}
    new_chroms = {}

    for c in curr_chroms:
        if c not in mapping_table:
            if (verbose):
                print("skip original chrom \'" + c + "\' - cannot be found in mapping table! Right GENOME & FROM_FORMAT?")
            continue
        final_mapping_table[c] = mapping_table[c]
        new_chroms[mapping_table[c]] = curr_chroms[c]

    if (len(new_chroms) <= 0):
        print("No chromosomes found for mapping! Wrong 'FROM_FORMAT'?")
        sys.exit(1)

    bw_out = pyBigWig.open(bw_out_filename, "w")
    bw_out.addHeader(list(new_chroms.items()))

    for c in final_mapping_table:
        c_int = bw.intervals(c)
        c_map = final_mapping_table[c]
        if verbose:
            print("convert chromosome: ", c, " --> ", c_map)
        bw_out.addEntries(list(itertools.repeat(c_map, len(c_int))),
                          [x[0] for x in c_int],
                          ends=[x[1] for x in c_int],
                          values=[x[2] for x in c_int])

    bw_out.close()
    bw.close()

    if (verbose):
        print("\nbigwig conversion finished!\n")

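# A hypothetical invocation of convert_bigwig for an Ensembl-to-UCSC rename;
# the mapping and file names below are placeholders.
mapping = {"1": "chr1", "2": "chr2", "MT": "chrM"}
convert_bigwig(mapping, "input.ensembl.bw", "output.ucsc.bw", verbose=True)
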
def getBigWigMean(regions, bigwig_file, non_nan):
    bw = pyBigWig.open(bigwig_file)
    Profile = []
    if non_nan == 1:
        # average over non_nan region
        for region in regions:
            tmp = bw.stats(str(region.chrom), region.start, region.stop)  # nan considered missing
            if tmp[0] is None:
                # average over non_nan region
                tmp[0] = 0
            Profile.append(tmp[0])
    else:
        # average over whole region, default
        for region in regions:
            values = bw.values(str(region.chrom), region.start, region.stop)  # nan considered as 0
            Profile.append(np.mean(np.nan_to_num(values)))  # average over the whole region
    return Profile

def __init__(self, cov_file):
    self.cov_file = cov_file
    self.bigwig = False

    cov_ext = os.path.splitext(self.cov_file)[1].lower()
    if cov_ext in ['.bw', '.bigwig']:
        self.cov_open = pyBigWig.open(self.cov_file, 'r')
        self.bigwig = True
    elif cov_ext in ['.h5', '.hdf5', '.w5', '.wdf5']:
        self.cov_open = h5py.File(self.cov_file, 'r')
    else:
        print('Cannot identify coverage file extension "%s".' % cov_ext,
              file=sys.stderr)
        exit(1)

def initBigWig(exampleBW, outname):
    """!
    Initiates a bigWig file

    @param exampleBW String: name of the bigWig file from which to use the header info
    @param outname String: prefix for the file name of the bigWig file to be created

    @return Opens a bigWig file and adds a header so that it's ready to
    receive more information.
    """
    # read the header info from an example bigWig
    bw_check = pyBigWig.open(exampleBW)
    # extract the chromosome information
    chrom_info = bw_check.chroms()
    # wrangle the chrom info into a list of tuples
    chrom_info = [tuple(i) for i in chrom_info.items()]

    # open a new bigWig file
    bw_out = pyBigWig.open(outname + '.bw', "w")
    bw_out.addHeader(chrom_info)

    return bw_out

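# A minimal usage sketch for initBigWig(); "template.bw" is a placeholder for
# an existing file whose chromosome sizes are reused, and the entry written
# here is purely illustrative. The caller is responsible for closing the handle.
bw_out = initBigWig("template.bw", "my_output")
bw_out.addEntries(["chr1"], [0], ends=[100], values=[1.0])
bw_out.close()  # finalize "my_output.bw"
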
def get_chrom_info(self, chrom_name):
    pyBigWig_object = pyBigWig.open(self.bigWig_file)
    chrom_stats_dict = {
        'chrom_name': chrom_name,
        'chrom_len': pyBigWig_object.chroms(chrom_name),
        'chrom_mean': pyBigWig_object.stats(chrom_name, type='mean', exact=True)[0],
        'chrom_std': pyBigWig_object.stats(chrom_name, type='std', exact=True)[0]
    }
    pyBigWig_object.close()
    return chrom_stats_dict

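# Hypothetical call; `tracker` stands in for whatever object owns
# get_chrom_info and has its bigWig_file attribute set to a real path.
info = tracker.get_chrom_info("chr1")
print(info['chrom_len'], info['chrom_mean'], info['chrom_std'])
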
def group_and_process_data(bigwig_data, input_data, feature_name):
    bw_file = pw.open(bigwig_data)
    if not bw_file.isBigWig():
        print("The given file is not in BigWig format!!!")

    # process each chromosome group and keep the processed frames, so the
    # concatenation below sees the extracted data rather than the originals
    data_grouped = [extract_data(group, feature_name, bw_file)
                    for key, group in input_data.groupby("CHROM")]
    data_combined = pd.concat(data_grouped)
    return data_combined

def _to_bigwig(self, path, chromosome_sizes, rpm=True, divide=False, value_col=None, dryrun=False):
    try:
        import pyBigWig
    except ModuleNotFoundError:
        print("pybigwig must be installed to create bigwigs. Use `conda install -c bioconda pybigwig` or `pip install pybigwig` to install it.")
        import sys
        sys.exit(1)

    if not divide:
        gr = self.to_rle(rpm=rpm, strand=False, value_col=value_col).to_ranges()
    else:
        gr = self.to_rle(rpm=rpm, strand=False, value_col=value_col)
        divide_by = self.to_rle(rpm=rpm, strand=False)
        c = (gr / divide_by)
        new_pyrles = {}
        for k, v in c.items():
            v.values = np.log2(v.values)
            v.defragment()
            new_pyrles[k] = v

        gr = c.defragment().to_ranges()

    unique_chromosomes = gr.chromosomes

    subset = ['Chromosome', 'Start', 'End', 'Score']

    gr = gr[subset].unstrand()

    gr = gr.sort()

    if dryrun:
        return gr

    if not isinstance(chromosome_sizes, dict):
        size_df = chromosome_sizes.df
        chromosome_sizes = {k: v for k, v in zip(size_df.Chromosome, size_df.End)}

    header = [(c, int(chromosome_sizes[c])) for c in unique_chromosomes]

    bw = pyBigWig.open(path, "w")
    bw.addHeader(header)

    for chromosome, df in gr:
        chromosomes = df.Chromosome.tolist()
        starts = df.Start.tolist()
        ends = df.End.tolist()
        values = df.Score.tolist()
        bw.addEntries(chromosomes, starts, ends=ends, values=values)

    # close the handle so the bigWig index and zoom levels are written out
    bw.close()

def get_bigWig_scores(map_args, def_param=(scores1, scores2)):
    """
    Inner loop for multithreading over bigWig score features.
    """
    (i, train, Peak, opt) = map_args
    bw = pyBigWig.open(Peak)
    row = train.iloc[i]
    anchor1, anchor2 = prepare_anchors(row, opt.cons_extension)
    con1 = sum(bw.values(anchor1.chrom, anchor1.start, anchor1.end))
    con2 = sum(bw.values(anchor2.chrom, anchor2.start, anchor2.end))
    lock.acquire()
    scores1[i] = (con1 + con2) / 2.0
    scores2[i] = np.std([con1, con2])
    lock.release()

def load_big_file(name, rel_path='data', is_abs_path=False):
    """
    Load bigwig file

    :param name: Name of the file or absolute path if is_abs_path is set to True
    :type name: str
    :param rel_path: Relative path without the name from current directory
    :type rel_path: str
    :param is_abs_path: If True, name is interpreted as absolute path.
    :type is_abs_path: bool
    :return: bigWigFile object
    """
    path = set_path(name, rel_path=rel_path, is_abs_path=is_abs_path)
    file = pyBigWig.open(path)
    return file

def __init__(self, wig_location):
    """
    Parameters
    ---------
    wig_location : string
        Path to wig file
    """
    self.wig_location = wig_location
    try:
        self.wig = pyBigWig.open(self.wig_location)
    except Exception as e:
        raise Exception('Error reading wig file {} : {}'.format(
            os.path.abspath(self.wig_location), e))

def calculateScalerForNorm(resultMeta):
    norm_sum = 0
    for i in range(len(resultMeta)):
        tempFile = pyBigWig.open(resultMeta[i][0])
        norm_sum = norm_sum + float(tempFile.header().get('sumData'))
        tempFile.close()
    scalerNorm = read.CTRLBW_SUM / float(norm_sum)
    print("Scaler:")
    print(scalerNorm)
    return scalerNorm

def compare_main(args):
    operation_dict = {
        "log2ratio": compare_log2ratio,
        "add": compare_add,
        "subtract": compare_subtract,
        "divide": compare_divide,
        "recipratio": compare_recipratio
    }
    # read in files
    inf1 = pyBigWig.open(args.infile1)
    inf2 = pyBigWig.open(args.infile2)

    arrays1 = bigwig_to_arrays(inf1, res=args.res)
    arrays2 = bigwig_to_arrays(inf2, res=args.res)

    # perform operation
    arrays_out = operation_dict[args.operation](arrays1, arrays2)

    # write out file
    write_arrays_to_bigwig(args.outfile, arrays_out, inf1.chroms(),
                           res=args.res, dropNaNsandInfs=args.dropNaNsandInfs)
    inf1.close()
    inf2.close()

def extract_bigwig_worker(lines, bwFile=None, stepSize=1, stranded=1, bw=None):
    '''
    Helper mapper for querying BigWig
    '''
    bw = pybw.open(bwFile)
    chromL = bw.chroms()
    lines = [x for x in lines if x]
    nField = lines[0].strip().split('\t').__len__()
    res = []
    for line in lines:
        # def parse(line, nField=nField):
        if line is None:
            return None
        cols = line.strip().split('\t')
        if nField >= 6:
            chrom, start, end, (id, score, strand) = cols[0], int(cols[1]), int(cols[2]), cols[3:6]
        else:
            strand = '+'
            if nField == 5:
                chrom, start, end, id, _ = cols[0], int(cols[1]), int(cols[2]), cols[3], cols[4]
                # assert 0, 'operation not defined when bedFile has 5 fields:\n%s' % lines[0]
            elif nField == 4:
                chrom, start, end, id = cols[0], int(cols[1]), int(cols[2]), cols[3]
            else:
                chrom, start, end = cols[0], int(cols[1]), int(cols[2])
                id = 'NoID'
        if chrom not in bw.chroms():
            o = None
        else:
            start = max(0, start)
            end = min(chromL[chrom], end)
            sec = bw.values(chrom, start, end, numpy=0)
            if strand != '-' or not stranded:
                vals = sec[::stepSize]
            else:
                vals = sec[::-stepSize]
            o = vals
        # return (id, o)
        res += [(id, o)]
    # res = map(parse, lines)
    bw.close()
    return res

def create_bw(name, chr_list, len_list):
    """ create a bigwiggle with random binary values """
    file = bg.open(name + ".bw", "w")
    header = [(chr_list[i], len_list[i] + 1) for i in range(len(chr_list))]
    print(header)
    file.addHeader(header)
    for i in range(len(chr_list)):
        # one random 0/1 value per position; starts, ends and values must all
        # have the same length for pyBigWig's list form of addEntries
        valeurs = [float(rd.randint(0, 1)) for _ in range(len_list[i])]
        places = [k for k in range(len_list[i])]
        ends = [k + 1 for k in places]
        chrome = [chr_list[i]] * len_list[i]
        file.addEntries(chrome, places, ends=ends, values=valeurs)
    file.close()

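# Hypothetical call: a two-chromosome file of random 0/1 values; names and
# lengths are placeholders.
create_bw("random_binary", ["chr1", "chr2"], [100, 200])
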
def main(LineArgs):
    T0 = time.time()
    # Data Input
    AcetylFilePath = LineArgs.RespVarFilePath
    InputFilePath = LineArgs.InputFilePath
    OutputFilePath = LineArgs.OutputFilePath
    TranscriptPath = LineArgs.TranscriptPath
    # load transcript defs
    TranscriptDF = pd.read_csv(TranscriptPath, header="infer", sep="\t")
    # load ChIP-seq data
    Ac_BHW = pyBigWig.open(AcetylFilePath)
    Inp_BHW = pyBigWig.open(InputFilePath)
    # Precompute ranges for signal calculations
    BinRanges = computeBinRanges(40, 250)
    Header = buildHeader(40)
    # get signal from region designated
    print("start resp var collect:")
    TranscriptDF["SignalOutput"] = TranscriptDF.apply(getSignal,
                                                      args=(Ac_BHW, Inp_BHW, BinRanges),
                                                      axis=1)
    TranscriptDF[Header] = pd.DataFrame(TranscriptDF.SignalOutput.values.tolist(),
                                        index=TranscriptDF.index)
    # remove unparsed col of signal types, and unneeded cols
    TranscriptDF.drop(["SignalOutput", "Bins"], inplace=True, axis=1)
    print("Printing to file", OutputFilePath)
    TranscriptDF.to_csv(OutputFilePath, header=True, sep="\t", index=False)
    #
    T1 = time.time()
    Time = T1 - T0
    print("Total Raw feature time to complete,", str(Time) + "s")
    print("Start Feature calculation pipe:")
    RespVarDF = calcRespVars(TranscriptDF)
    OutputFilePath = OutputFilePath.replace(".txt", "_FinalSignal.txt")
    print("Printing to file", OutputFilePath)
    RespVarDF.to_csv(OutputFilePath, header=True, sep="\t", index=True)

def make_bigwig(bigwig_in_path, bigwig_out_path, header_count_path):
    """make bigwig from bam"""
    bw = pyBigWig.open(str(bigwig_in_path))
    bw_out = pyBigWig.open(str(bigwig_out_path), "w")
    header = []
    with open(str(header_count_path), "r") as header_count:
        for line in header_count:
            contig = line.split(' ')[0]
            length = bw.chroms(contig)
            if not length:
                continue
            header.append((contig, length))
    if not header:
        return None
    bw_out.addHeader(header)
    with open(str(header_count_path), "r") as header_count:
        for line in header_count:
            contig = line.split(' ')[0]
            length = bw.chroms(contig)
            if not length:
                continue
            values = bw.values(contig, 0, length)
            entry = (contig, values)
            # values cover positions 0..length, so the span-1 entries must
            # start at 0 (starting at 1 would shift everything by one base)
            bw_out.addEntries(entry[0], 0, values=entry[1], span=1, step=1,
                              validate=False)
    bw_out.close()
    return None

def __init__(self, reference_sequence, target_path, features, seed=436,
             validation_holdout=['chr6', 'chr7'],
             test_holdout=['chr8', 'chr9'],
             sequence_length=1000, bin_size=200, step_size=100,
             bins_start=200, bins_end=800, feature_thresholds=0.5,
             mode="train", save_datasets=[], output_dir=None,
             additional_bw_files=None):
    super(RandomFilesSampler, self).__init__(
        reference_sequence,
        target_path,
        features,
        seed=seed,
        validation_holdout=validation_holdout,
        test_holdout=test_holdout,
        sequence_length=sequence_length,
        bin_size=bin_size,
        step_size=step_size,
        bins_start=bins_start,
        bins_end=bins_end,
        feature_thresholds=feature_thresholds,
        mode=mode,
        save_datasets=save_datasets,
        output_dir=output_dir)
    self._sample_from_mode = {}
    self._randcache = {}
    for mode in self.modes:
        self._sample_from_mode[mode] = None
        self._randcache[mode] = {"cache_indices": [], "sample_next": 0}

    self.sample_from_intervals = []
    self.interval_lengths = []
    self.initialized = False

    # add additional bigWig files
    self.all_bw_files = []
    if additional_bw_files is not None:
        for file_name in additional_bw_files:
            self.all_bw_files.append(pyBigWig.open(file_name))

def dyad_coverage_sample(sample, genes, minp, maxp, smoothing=None):
    '''Finds the distribution of distances between fragments and dyad for a single sample.'''
    print('Finds the distribution of distances between fragments and dyad of sample {}'.format(sample))
    if not smoothing:
        smoothing = 0
    smoothing = math.ceil(smoothing / 2.0)
    bw = pbw.open(sample + '-cov.bw')
    distances = [[] for i in range(0, maxp - minp + smoothing * 2 + 1)]
    for index, columns in genes.iterrows():
        chromosome = columns[1]
        max_end = bw.chroms(chromosome)
        if not max_end:
            max_end = 0
        negative = columns[4] == NEGATIVE_STRAND
        theo_start = int(columns[6]) + minp - smoothing
        start = max(theo_start, 0)
        end = min(int(columns[6]) + maxp + smoothing + 1, max_end)
        distance = signal(bw, chromosome, start, end) if end > start else []
        if negative:
            distance.reverse()
        for i in range(0, maxp - minp + smoothing * 2 + 1):
            distance_index = i - (start - theo_start)
            value = distance[distance_index] if distance_index in range(0, len(distance)) else 0
            distances[i].append(value if value and not math.isnan(value) else 0)
    for i in range(0, maxp - minp + smoothing * 2 + 1):
        genes['dyad position ' + str(i + minp - smoothing)] = distances[i]
    genes_output = sample + '-genes.txt'
    genes.to_csv(genes_output, sep='\t', index=False)
    sums = pd.DataFrame(index=list(range(minp - smoothing, maxp + smoothing + 1)))
    sums['Frequency'] = [genes['dyad position ' + str(i)].sum() for i in range(minp - smoothing, maxp + smoothing + 1)]
    dyads = pd.DataFrame(index=list(range(minp, maxp + 1)), columns=['Frequency', 'Relative Frequency'])
    for i in range(minp, maxp + 1):
        dyads.at[i, 'Frequency'] = mean([sums.at[j, 'Frequency'] for j in range(i - smoothing, i + smoothing)])
    frequency_sum = dyads['Frequency'].sum()
    for i in range(minp, maxp + 1):
        dyads.at[i, 'Relative Frequency'] = dyads.at[i, 'Frequency'] / frequency_sum
    dyad_output = sample + '-dyad.txt'
    dyads.to_csv(dyad_output, sep='\t')
    x = dyads.index.values
    plt.figure()
    plt.title(sample)
    plt.xlabel('Position relative to dyad (bp)')
    plt.ylabel('Relative Frequency')
    plt.xlim(x[0], x[len(x) - 1])
    plt.xticks(list(range(x[0], x[len(x) - 1] + 1, 25)))
    plt.plot(dyads.index.values, dyads['Relative Frequency'].values, color='red')
    plot_output = sample + '-dyad.png'
    plt.savefig(plot_output)
    plt.clf()

def parse_intron(options, chrom, start, end, strand, intron_info):
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # load matrix
    matrix3 = load_matrix3()
    # parse options
    phastcons_f = pyBigWig.open(options['--bigwig'])
    min_distance = int(options['--min-distance'])
    min_score = float(options['--min-score'])
    min_phastcons = float(options['--min-phastcons'])
    # start to parse rs sites
    rs_list = []
    for m in re.finditer('AGGT', intron_fa):
        if strand == '+':
            pos = start + m.start() + 2
            left_dist, right_dist, dist_flag = cal_distance(pos, start, end, min_distance)
            if not dist_flag:  # not enough distance
                continue
            ss3_seq = dna_to_rna(fa.fetch(chrom, pos - 20, pos + 3))
            if ss3_seq.find('N') != -1:  # ensure there is no N
                continue
            ss3, score_flag = cal_score(ss3_seq, matrix3, min_score)
            if not score_flag:  # not high score
                continue
        else:
            pos = end - m.start() - 2
            left_dist, right_dist, dist_flag = cal_distance(pos, start, end, min_distance)
            if not dist_flag:  # not enough distance
                continue
            ss3_seq = dna_to_rna(fa.fetch(chrom, pos - 3, pos + 20), strand='-')
            if ss3_seq.find('N') != -1:  # ensure there is no N
                continue
            ss3, score_flag = cal_score(ss3_seq, matrix3, min_score)
            if not score_flag:  # not high score
                continue
        phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
        if phastcons is None or phastcons < min_phastcons:  # not conserved
            continue
        rs_feature = '%d|%d|%d|%f|%f' % (pos, left_dist, right_dist, ss3, phastcons)
        rs_list.append(rs_feature)
    if rs_list:
        return (intron_info, rs_list)
    else:
        return (None, None)

def _bigwig_extractor(datafile, intervals, out=None, **kwargs):
    nan_as_zero = kwargs.get('nan_as_zero', True)
    if out is None:
        width = intervals[0].stop - intervals[0].start
        out = np.zeros((len(intervals), width), dtype=np.float32)

    bw = pyBigWig.open(datafile)
    for index, interval in enumerate(intervals):
        out[index] = bw.values(interval.chrom, interval.start, interval.stop)
        if nan_as_zero:
            nan_to_zero(out[index])
    bw.close()

    return out

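# A hypothetical call, assuming intervals expose chrom/start/stop attributes
# (a namedtuple works) and all share one width; the file name is a placeholder.
from collections import namedtuple

Interval = namedtuple("Interval", ["chrom", "start", "stop"])
mat = _bigwig_extractor("signal.bw",
                        [Interval("chr1", 0, 100), Interval("chr1", 200, 300)])
print(mat.shape)  # (2, 100)
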
def check():
    bw = pyBigWig.open('sample1.bw')
    header_keys = list(bw.header().keys())
    for k in [
        'maxVal', 'minVal', 'nBasesCovered', 'nLevels', 'sumData',
        'sumSquared', 'version'
    ]:
        assert k in header_keys

    # bigWig version should be independent of BAM input, so we can check
    # the value
    assert bw.header()['version'] == 4

    first_chrom = list(bw.chroms().keys())[0]
    assert isinstance(bw.stats(first_chrom)[0], float)

def run_file(args, chrom_genes):
    '''
    For genes in a chromosome, get tss/gene body and
    run fourier transform on the gene
    '''
    bw = pbw.open(args.in_bigwig, 'r')
    gene_count = 0
    with open(args.out_bed, 'w') as out:
        out.write('name\ttype\tid\tperiodicity\tintensity\n')
        for count, gene in chrom_genes.iterrows():
            run_gene(args, gene, out, bw)
            gene_count += 1  # keep the progress message accurate
            if count % 1000 == 0:
                print 'Parsed {gene_count} for {filename} at {chrom}'.format(
                    gene_count=gene_count,
                    filename=out.name,
                    chrom=args.chrom)
    bw.close()
    return 0

def __init__(
        self,
        bws='/stor/work/Lambowitz/yaojun/Work/cfNA/tgirt_map/bed_files/merged_bed/coverage/unfragmented.{strand}.bigWig',
        exon_file='/stor/work/Lambowitz/ref/hg19_ref/genes/exons.gencode.bed.gz',
        cutoff=2,
        force=False):
    records = []
    self.high_cov_exons = '/stor/scratch/Lambowitz/cdw2854/high_cov_exon.bed'
    self.exon_file = exon_file
    self.bws = {
        strand: pbw.open(bws.format(strand=strand_label))
        for strand, strand_label in zip(['-', '+'], ['rvs', 'fwd'])
    }
    if not os.path.isfile(self.high_cov_exons) or force:
        self.initiate(cutoff=cutoff)

def doWrite(self, bw):
    ofile = tempfile.NamedTemporaryFile(delete=False)
    oname = ofile.name
    ofile.close()
    bw2 = pyBigWig.open(oname, "w")
    assert(bw2 is not None)
    #Since this is an unordered dict(), iterating over the items can swap the order!
    chroms = [("1", bw.chroms("1")), ("10", bw.chroms("10"))]
    assert(len(bw.chroms()) == 2)
    bw2.addHeader(chroms, maxZooms=1)
    #Copy the input file
    for c in chroms:
        ints = bw.intervals(c[0])
        chroms2 = []
        starts = []
        ends = []
        values = []
        for entry in ints:
            chroms2.append(c[0])
            starts.append(entry[0])
            ends.append(entry[1])
            values.append(entry[2])
        bw2.addEntries(chroms2, starts, ends=ends, values=values)
    bw2.close()

    #Ensure that the copied file has the same entries and max/min/etc.
    bw2 = pyBigWig.open(oname)
    assert(bw.header() == bw2.header())
    assert(bw.chroms() == bw2.chroms())
    for c in chroms:
        ints1 = bw.intervals(c[0])
        ints2 = bw2.intervals(c[0])
        assert(ints1 == ints2)
    bw.close()
    bw2.close()

    #Clean up
    os.remove(oname)

def test_bigwig(self):
    """
    Check that encoded ipdRatios in the BigWig output are consistent
    with modified bases in the GFF file (albeit with lower precision).
    """
    import pyBigWig
    f = pyBigWig.open(self.bw_file)
    for (seqid, start, strand), rec in self.gff_dict.iteritems():
        s = int(f.values(seqid, start - 1, start)[0])
        ipd_minus = (s % 65536) / 100.0
        ipd_plus = (s >> 16) / 100.0
        if strand == "+":
            self.assertAlmostEqual(rec.IPDRatio, ipd_plus, places=1)
        else:
            self.assertAlmostEqual(rec.IPDRatio, ipd_minus, places=1)

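# The decoding above implies a packing scheme: both strands' IPD ratios are
# stored in one 32-bit value, plus strand in the high 16 bits and minus strand
# in the low 16 bits, each scaled by 100 (two decimal places). A round-trip
# sketch of that scheme; the encoder here is inferred from the test, not taken
# from the writer code itself.
def encode_ipd(ipd_plus, ipd_minus):
    return (int(round(ipd_plus * 100)) << 16) | (int(round(ipd_minus * 100)) & 0xFFFF)

def decode_ipd(s):
    s = int(s)
    return (s >> 16) / 100.0, (s % 65536) / 100.0

assert decode_ipd(encode_ipd(2.34, 1.05)) == (2.34, 1.05)
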
def test_extract_bigwig_to_numpy(tmpdir):
    """
    Tests extract_bigwig_to_numpy function with padding values. \
    Uses pyBigWig to construct sample bigWig to draw values from. \
    Compares expected numpy array with computed numpy array.
    """
    tmpbigwig = os.path.join(tmpdir, "tmp.bigwig")
    bw = pyBigWig.open(tmpbigwig, "w")
    bw.addHeader([('chr1', 20)], maxZooms=0)
    bw.addEntries(['chr1', 'chr1'], [0, 11], ends=[5, 20], values=[3.0, 7.0])
    bw.close()
    bw = pyBigWig.open(tmpbigwig)
    sizes = {'chr1': 20}
    pad = 5
    interval1 = ['chr1', 2, 4]  # -3 to 9 after padding
    interval2 = ['chr1', 14, 17]  # 9 to 22 after padding
    output1 = bigwigio.extract_bigwig_to_numpy(interval1, bw, pad, sizes)
    output2 = bigwigio.extract_bigwig_to_numpy(interval2, bw, pad, sizes)
    expected1 = np.array([0, 0, 0, 3.0, 3.0, 3.0, 3.0, 3.0, 0, 0, 0, 0])
    expected2 = np.array(
        [0, 0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 0, 0])
    assert np.allclose(expected1, output1)
    assert np.allclose(expected2, output2)

def get_signal_dict(self, bigwigs):
    """ Get dict of signal[region.tup][bigwig] = signal """
    signal_dict = {region.tup(): {bigwig: [] for bigwig in bigwigs} for region in self}
    for bigwig in bigwigs:
        pybw = pyBigWig.open(bigwig, "rb")
        for region in self:
            signal_dict[region.tup()][bigwig] = region.get_signal(pybw)
        pybw.close()
    return (signal_dict)

def __init__(self, infile):
    u"""
    Initialize
    """
    self.bigbed, self.bigwig = None, None
    if os.path.exists(infile):
        bigfile = pyBigWig.open(infile)
        if bigfile.isBigWig():
            self.bigwig = bigfile
        elif bigfile.isBigBed():
            self.bigbed = bigfile
        else:
            raise ValueError('%s is not a legit file' % infile)
        self.chroms = bigfile.chroms()
    else:
        # without this branch, bigfile would be unbound below
        raise ValueError('%s does not exist' % infile)

def GetNormalizationFactorBasedOnRegionList(bigwig_fn, regionlist):
    """
    regionlist is a list of lists. Each list entry is formatted like this:
    ["chr1", 1, 10].
    """
    try:
        bw = pyBigWig.open(bigwig_fn)
        TotalSignal = []
        for entry in regionlist:
            # unpack the ["chrom", start, stop] entry before querying
            RegionChr, RegionStart, RegionStop = entry
            EntrySignal = sum(
                bw.values(RegionChr, RegionStart, RegionStop, numpy=True))
            TotalSignal.append(EntrySignal)
        return (sum(TotalSignal) / 1E3)
    except:
        return (None)

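# Hypothetical usage; file name and regions are placeholders. A None return
# signals that the file or one of the regions could not be read.
factor = GetNormalizationFactorBasedOnRegionList(
    "signal.bw", [["chr1", 0, 1000], ["chr2", 500, 1500]])
if factor is None:
    print("normalization factor could not be computed")
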
def main(BIGBED, trackDB, BED, out_path):
    '''
    Create an output dir if needed, generate decoding library, extract gene
    name, use this to extract Ensembl ID. Write to bed file with gene name
    and Ensembl id.
    '''
    # create outdir if needed
    if not os.path.isdir(os.path.join(out_path, 'GTRD_BED')):
        os.mkdir(os.path.join(out_path, 'GTRD_BED'))

    # Legacy as requires internet connection
    # load mygene tool
    #mg = mygene.MyGeneInfo()

    # load annotation to retrieve ensembl ids
    ensemblDecoder = ensemblIDextract(BED)

    # load schema dictionary
    TF_ID = trackDBparser(trackDB)
    # extract gene name using this schema
    gene_name = TF_ID[os.path.basename(BIGBED).split('_')[1]][0]

    # Legacy as requires internet connection
    # use mygene tool to extract ensembl id
    #ensembl_id = mg.query(gene_name, fields = 'ensembl.gene', species = 'fruitfly')['hits'][0]['ensembl']['gene']

    # use reference annotation to retrieve ensembl id
    ensembl_id = ensemblDecoder[gene_name]

    # open bigbed file
    bb = pbw.open(BIGBED)
    # integrate this information with coordinates of binding events, write .bed
    with open(os.path.join(out_path, 'GTRD_BED',
                           os.path.basename(BIGBED).replace('.bb', '.bed')), 'w') as outfile:
        for chrom, limit in bb.chroms().items():
            for bindingEvent in bb.entries(chrom, 0, limit):
                start = bindingEvent[0]
                stop = bindingEvent[1]
                outfile.write('\t'.join([
                    str(x) for x in [chrom, start, stop, ensembl_id, gene_name]
                ]) + '\n')

def bw_to_dict(self, chrs, window_size=25):
    """
    Function taken from evaluation scripts -
    https://github.com/ENCODE-DCC/imputation_challenge/blob/master/build_npy_from_bigwig.py

    Each chromosome is binned into ((chrom_len-1)//window_size)+1
    nonoverlapping bins of size window_size.
    NaN values are converted into zeros before averaging over the bins.

    Because the ends of the bigwig files contain NaNs - regions somehow not
    measured, a naive bin-then-average is liable to cause problems in the
    first bin which contains NaNs.
    Perhaps the simplest solution would just be to use nanmean, and only
    replace NaNs after averaging. But I've stuck with the provided script for now.
    """
    bw = pyBigWig.open(self.track)
    for c in chrs:
        print('Reading chromosome {} from bigwig...'.format(c), flush=True)
        chrom_len = bw.chroms()[c]
        # print(chrom_len, window_size)
        num_step = ((chrom_len - 1) // window_size) + 1  # number of bins ensuring all positions are included
        raw = bw.values(c, 0, chrom_len, numpy=True)
        # reshape raw vector as (num_step, window_size)
        raw.resize(num_step * window_size)  # typically greater than chrom len
        # print number of nans (effectively 0s - we should ignore 0s somehow)
        # print(np.sum(np.isnan(raw)))
        raw = np.nan_to_num(raw)  # pyBigWig returns nan for values out of bounds - convert to zero
        raw = np.reshape(raw, (-1, window_size))
        # bin it
        result_per_chr = raw.mean(axis=1)  # average over bins

        # special treatment for last step [i.e. last step with non nan values] (where the first nan is)
        # above averaging method does not work with the end step - because we've added zeros instead of nans
        # bw.intervals(c)[-1] is the last interval in bigwig
        # (248933861, 248934005, 0.08760000020265579)
        last_interval_end = bw.intervals(c)[-1][1]  # the end of the last interval; after this we will have nans
        last_step = last_interval_end // window_size  # where does our last valid window end
        start = last_step * window_size  # where should our first special treatment window start
        end = min((last_step + 1) * window_size, chrom_len)
        stat = bw.stats(c, start, end, exact=True)
        # pdb.set_trace()
        if stat[0] is None:
            result_per_chr[last_step] = 0.0
        else:
            result_per_chr[last_step] = stat[0]

        self.binned_chroms[c] = np.array(result_per_chr)

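# The docstring above suggests the simpler alternative: average with NaNs
# ignored and replace NaNs only after averaging. A sketch of that idea under
# the same window_size binning; this is not the evaluation scripts' official
# method, and the function name is hypothetical.
import numpy as np
import pyBigWig

def binned_means_ignoring_nan(bigwig_path, chrom, window_size=25):
    bw = pyBigWig.open(bigwig_path)
    chrom_len = bw.chroms()[chrom]
    num_step = ((chrom_len - 1) // window_size) + 1
    raw = bw.values(chrom, 0, chrom_len, numpy=True)
    bw.close()
    # pad to a whole number of bins with NaN (not zeros) so the tail bin is
    # averaged only over measured positions
    padded = np.full(num_step * window_size, np.nan)
    padded[:chrom_len] = raw
    mask = ~np.isnan(padded)
    sums = np.where(mask, padded, 0.0).reshape(-1, window_size).sum(axis=1)
    counts = mask.reshape(-1, window_size).sum(axis=1)
    # bins with no measured positions become 0, mirroring the zero-fill above
    return np.divide(sums, counts, out=np.zeros_like(sums), where=counts > 0)
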
def bdg2bw(bdgFile, bwFile, chromSize):
    with open(chromSize) as f:
        cs = [line.strip().split('\t') for line in f.readlines()]

    bw = pyBigWig.open(bwFile, "w")
    bw.addHeader([(str(x[0]), int(x[1])) for x in cs])

    with open(bdgFile, "r") as bdg:
        for line in bdg:
            if len(line.strip().split("\t")) == 4:
                chr, start, end, val = line.strip().split("\t")
                bw.addEntries(chroms=[chr], starts=[int(start)],
                              ends=[int(end)], values=[float(val)])
            else:
                print("[%s] Warning: skipping bedGraph entry: %s" % (timestamp(), line.strip()))

    bw.close()

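# Hypothetical invocation; paths are placeholders. The chrom-sizes file is two
# tab-separated columns (name, length) and the bedGraph is assumed to be
# coordinate-sorted, as bigWig writing requires ordered entries.
bdg2bw("signal.bdg", "signal.bw", "genome.chrom.sizes")
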
def dhstobw(bamfile, bwfile, library='Duke'):  # Washington is under processing
    """
    :param bamfile:
    :param bwfile:
    :param library: Duke or Washington
        Duke:       |=====>   <=====|
        Washington: |===========|
        Output cutting site '|'
    :return:
    """
    bamfor = Baminfo.Baminfo(bamfile)
    bw = pyBigWig.open(bwfile, "w")
    bw.addHeader(list(bamfor.chrlen.items()))
    for chromosome in bamfor.chrlen:
        end = bamfor.chrlen[chromosome]
        dhscut = dhsbam.dhcutcount(bamfile=bamfile, chromosome=chromosome,
                                   start=1, end=end, library=library)
        if dhscut:
            starts = list()
            values = list()
            for start in sorted(dhscut):
                starts.append(start)
                values.append(float(dhscut[start]))
            bw.addEntries(chromosome, starts=starts, values=values, span=1, step=1)
    bw.close()

def main():
    usage = 'usage: %prog [options] <in_h5_file> <out_bw_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='chr', default=None, help='Comma-separated chromosome list')
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input HDF5 and output BigWig.')
    else:
        hdf5_file = args[0]
        bw_file = args[1]

    # open files
    h5_in = h5py.File(hdf5_file, 'r')
    bw_out = pyBigWig.open(bw_file, 'w')

    # construct header
    if options.chr is not None:
        chroms = ['chr%s' % c for c in options.chr.split(',')]
    else:
        chroms = sorted(h5_in.keys())
    header = []
    for chrom in chroms:
        # chromosome and length
        header.append((chrom, len(h5_in[chrom])))

    # write header
    bw_out.addHeader(header)

    for chrom, length in header:
        if options.verbose:
            print(chrom)

        # read values
        x = np.array(h5_in[chrom])

        # write into bigWig
        bw_out.addEntries(chrom, 0, values=x, span=1, step=1)

    # close files
    h5_in.close()
    bw_out.close()

def _create_bigwig(bed_column, outpath, genome_size_dict):
    # type: (pd.Series, str, Dict[str, int]) -> None

    logging.info("Creating bigwig " + outpath)

    bed_column = bed_column.reset_index()
    values = [float(f) for _, _, _, f in bed_column.values]
    unique_chromosomes = list(bed_column.Chromosome.drop_duplicates())
    chromosomes = list(bed_column.Chromosome)
    starts = _to_int(list(bed_column.Bin))
    ends = _to_int(list(bed_column.End + 1))

    header = [(c, int(genome_size_dict[c])) for c in unique_chromosomes]

    bw = pyBigWig.open(outpath, "w")
    bw.addHeader(header)
    bw.addEntries(chromosomes, starts, ends=ends, values=values)
    bw.close()

def coverage_from_bigwig(self, bigwig_file, stepsize=100):
    """Return list of arrays describing the coverage of each genomicRegions from <bigwig_file>.

    *Keyword arguments:*

    - bigwig_file -- path to bigwig file
    - stepsize -- used stepsize

    *Output:*

    Class variable <coverage>: a list where the elements correspond to the GenomicRegion. The list elements give
    the number of reads falling into the GenomicRegion.
    """
    try:
        from ngslib import BigWigFile
        self.coverage = []
        bwf = BigWigFile(bigwig_file)

        for gr in self.genomicRegions:
            depth = bwf.pileup(gr.chrom, max(0, int(gr.initial - stepsize / 2)),
                               max(1, int(gr.final + stepsize / 2)))
            ds = [depth[d] for d in range(0, gr.final - gr.initial, stepsize)]
            self.coverage.append(np.array(ds))

        bwf.close()
    except ImportError, e:
        import pyBigWig
        self.coverage = []
        bwf = pyBigWig.open(bigwig_file)

        for gr in self.genomicRegions:
            steps = int(len(gr) / stepsize)
            ds = bwf.stats(gr.chrom, gr.initial, gr.final, type="mean", nBins=steps)
            ds = [x if x else 0 for x in ds]
            self.coverage.append(np.array(ds))

        bwf.close()

def main():
    usage = 'usage: %prog [options] <in_zarr_file> <out_bw_file>'
    parser = OptionParser(usage)
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input Zarr and output BigWig.')
    else:
        zarr_file = args[0]
        bw_file = args[1]

    # open files
    zarr_in = zarr.open_group(zarr_file, 'r')
    bw_out = pyBigWig.open(bw_file, 'w')

    # construct header
    header = []
    chroms = sorted(zarr_in.keys())
    for chrom in chroms:
        # chromosome and length
        header.append((chrom, len(zarr_in[chrom])))

    # write header
    bw_out.addHeader(header)

    for chrom, length in header:
        if options.verbose:
            print(chrom)

        # read values
        x = np.array(zarr_in[chrom])

        # write into bigWig
        bw_out.addEntries(chrom, 0, values=x, span=1, step=1)

    # close files
    bw_out.close()

def countFragmentsInRegions_worker(chrom, start, end,
                                   bigWigFiles,
                                   stepSize, binLength,
                                   save_data,
                                   bedRegions=None):
    """ returns the average score in each bigwig file at each 'stepSize'
    position within the interval start, end for a 'binLength' window.
    Because the idea is to get counts for window positions at
    different positions for sampling the bins are equally spaced
    and *not adjacent*.

    If a list of bedRegions is given, then the number of reads
    that overlaps with each region is counted.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Fragment coverage.
    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 50, 25, False)[0])
    array([[ 1.,  1.,  2.,  2.],
           [ 1.,  1.,  1.,  3.]])

    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 200, 200, False)[0])
    array([[ 1.5],
           [ 1.5]])

    BED regions:
    >>> bedRegions = [(test.chrom, 45, 55), (test.chrom, 95, 105), (test.chrom, 145, 155)]
    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 200, 200, False,
    ... bedRegions=bedRegions)[0])
    array([[ 1. ,  1.5,  2. ],
           [ 1. ,  1. ,  2. ]])
    """
    assert start < end, "start {} bigger than end {}".format(start, end)

    # array to keep the scores for the regions
    sub_score_per_bin = []

    rows = 0

    bigwig_handlers = [pyBigWig.open(bw) for bw in bigWigFiles]

    regions_to_consider = []
    if bedRegions:
        for chrom, start, end in bedRegions:
            regions_to_consider.append((chrom, start, end, end - start))
    else:
        for i in xrange(start, end, stepSize):
            if (i + binLength) > end:
                regions_to_consider.append((chrom, i, end, end - i))  # last bin (may be smaller)
            else:
                regions_to_consider.append((chrom, i, i + binLength, binLength))

    if save_data:
        _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
        _file_name = _file.name
    else:
        _file_name = ''

    warnings.simplefilter("default")
    i = 0

    for chrom, start, end, binLength in regions_to_consider:
        avgReadsArray = []
        i += 1

        for idx, bwh in enumerate(bigwig_handlers):
            if chrom not in bwh.chroms().keys():
                unmod_name = chrom
                if chrom.startswith('chr'):
                    # remove the chr part from chromosome name
                    chrom = chrom[3:]
                else:
                    # prefix with 'chr' the chromosome name
                    chrom = 'chr' + chrom
                if chrom not in bwh.chroms().keys():
                    exit('Chromosome name {} not found in bigwig file\n {}\n'.format(unmod_name, bigWigFiles[idx]))

            score = bwh.stats(chrom, start, end)

            if score is None or score == [None] or np.isnan(score[0]):
                score = [np.nan]
            avgReadsArray.append(score[0])  # mean of fragment coverage for region
        # print "{} Region: {}:{:,}-{:,} {} {} {}".format(i, chrom, start, end, binLength, avgReadsArray[0], avgReadsArray[1])

        sub_score_per_bin.extend(avgReadsArray)
        rows += 1
        if save_data:
            _file.write("\t".join(map(str, [chrom, start, end])) + "\t")
            _file.write("\t".join(["{}".format(x) for x in avgReadsArray]) + "\n")

    if save_data:
        _file.close()
    warnings.resetwarnings()

    # the output is a matrix having as many rows as the variable 'rows'
    # and as many columns as bigwig files. The rows correspond to
    # each of the regions processed by the worker.
    # np.array([[score1_1, score1_2],
    #           [score2_1, score2_2]])
    return np.array(sub_score_per_bin).reshape(rows, len(bigWigFiles)), _file_name

def run_after(self, rtc, output_dir):
    bw = pyBigWig.open(rtc.task.output_files[0])
    nrec = bw.header()["nBasesCovered"]
    self.assertEqual(nrec, 6, "{n} != 6".format(n=nrec))
    self.assertAlmostEqual(bw.stats("chr1", 2, 3)[0], 1.9, places=5)
    self.assertAlmostEqual(bw.stats("chr2", 7, 8)[0], 1.0, places=5)

def preload(self, regions, tmpDir=None):
    """
    Given a sample and a set of regions, write a bigWig file containing the underlying signal.

    This function returns the file name, which needs to be deleted by the calling function at some point.

    This sends queries one chromosome at a time, due to memory limits on deepBlue
    """
    startTime = datetime.datetime.now()
    regions2 = mergeRegions(regions)

    # Make a temporary file
    f = tempfile.NamedTemporaryFile(delete=False, dir=tmpDir)
    fname = f.name
    f.close()

    # Start with the bigWig file
    bw = pyBigWig.open(fname, "w")
    bw.addHeader(self.chromsTuple, maxZooms=0)  # This won't work in IGV!

    # Make a string out of everything in a reasonable order
    for k, v in self.chromsTuple:
        # Munge chromosome names as appropriate
        chrom = mungeChromosome(k, regions2.keys())
        if not chrom:
            continue
        if chrom not in regions2 or len(regions2) == 0:
            continue
        regionsStr = "\n".join(["{}\t{}\t{}".format(k, reg[0], reg[1]) for reg in regions2[chrom]])
        regionsStr += "\n"

        # Send the regions
        (status, regionsID) = self.server.input_regions(self.genome, regionsStr, self.userKey)
        if status != "okay":
            raise RuntimeError("Received the following error while sending regions for '{}': {}".format(regionsID, self.sample))

        # Get the experiment information
        (status, queryID) = self.server.select_experiments(self.sample, k, None, None, self.userKey)
        if status != "okay":
            raise RuntimeError("Received the following error while running select_experiments on file '{}': {}".format(self.sample, queryID))
        if not queryID:
            raise RuntimeError("Somehow, we received None as a query ID (file '{}')".format(self.sample))

        # Intersect
        (status, intersectID) = self.server.intersection(queryID, regionsID, self.userKey)
        if status != "okay":
            raise RuntimeError("Received the following error while running intersection on file '{}': {}".format(self.sample, intersectID))
        if not intersectID:
            raise RuntimeError("Somehow, we received None as an intersect ID (file '{}')".format(self.sample))

        # Query the regions
        (status, reqID) = self.server.get_regions(intersectID, "START,END,VALUE", self.userKey)
        if status != "okay":
            raise RuntimeError("Received the following error while fetching regions in file '{}': {}".format(self.sample, reqID))

        # Wait for the server to process the data
        (status, info) = self.server.info(reqID, self.userKey)
        request_status = info[0]["state"]
        while request_status != "done" and request_status != "failed":
            time.sleep(0.1)
            (status, info) = self.server.info(reqID, self.userKey)
            request_status = info[0]["state"]

        # Get the actual data
        (status, resp) = self.server.get_request_data(reqID, self.userKey)
        if status != "okay":
            raise RuntimeError("Received the following error while fetching data in file '{}': {}".format(self.sample, resp))

        for intervals in resp.split("\n"):
            interval = intervals.split("\t")
            if interval[0] == '':
                continue
            bw.addEntries([k], [int(interval[0])], ends=[int(interval[1])], values=[float(interval[2])])

    bw.close()
    sys.stderr.write("{} done (took {})\n".format(self.sample, datetime.datetime.now() - startTime))
    sys.stderr.flush()

    return fname