def main():
    """Compute average, minimum and maximum score over each interval.

    Command line (via doc_optparse): score_file interval_file [out_file].
    Writes one tab-separated line per interval:
    chrom, start, stop, avg, min, max ("nan" when no scored bases).
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_fname = args[0]
        interval_fname = args[1]
        if len(args) > 2:
            out_file = open(args[2], 'w')
        else:
            out_file = sys.stdout
        binned = bool(options.binned)
        mask_fname = options.mask
    except Exception:
        doc_optparse.exit()
    # Scores come either from a directory of binned arrays or a wiggle file.
    if binned:
        scores_by_chrom = load_scores_ba_dir(score_fname)
    else:
        scores_by_chrom = load_scores_wiggle(score_fname)
    if mask_fname:
        # Positions set in the mask are excluded from all statistics.
        with open(mask_fname) as mask_file:
            masks = binned_bitsets_from_file(mask_file)
    else:
        masks = None
    with open(interval_fname) as interval_file:
        for line in interval_file:
            fields = line.split()
            chrom, start, stop = fields[0], int(fields[1]), int(fields[2])
            total = 0
            count = 0
            min_score = 100000000
            max_score = -100000000
            for i in range(start, stop):
                # NOTE(review): the truthiness test below also skips positions
                # whose score is exactly 0 — confirm that is intended.
                if chrom in scores_by_chrom and scores_by_chrom[chrom][i]:
                    # Skip if base is masked
                    if masks and chrom in masks:
                        if masks[chrom][i]:
                            continue
                    # Get the score, only count if not 'nan'
                    score = scores_by_chrom[chrom][i]
                    if not isNaN(score):
                        total += score
                        count += 1
                        max_score = max(score, max_score)
                        min_score = min(score, min_score)
            if count > 0:
                avg = total / count
            else:
                # No scored bases in this interval.
                avg = "nan"
                min_score = "nan"
                max_score = "nan"
            print("\t".join(
                map(str, [chrom, start, stop, avg, min_score, max_score])),
                file=out_file)
    # Only close handles we opened; closing sys.stdout breaks later output.
    if out_file is not sys.stdout:
        out_file.close()
def main():
    """Emit windows over each region, splitting around excluded intervals.

    argv: region_file exclude_file window_size.  Each region line gives a
    chromosome name and a length (regions implicitly start at 0); windows
    are generated by do_windows() over every stretch not covered by the
    exclude bitsets.
    """
    region_fname = sys.argv[1]
    exclude_fname = sys.argv[2]
    window_size = int(sys.argv[3])
    exclude_bitsets = binned_bitsets_from_file(open(exclude_fname))
    for line in open(region_fname):
        fields = line.split()
        chrom = fields[0]
        start, end = 0, int(fields[1])
        if chrom not in exclude_bitsets:
            # Nothing excluded on this chromosome: window the whole region.
            do_windows(chrom, start, end, window_size)
            continue
        bits = exclude_bitsets[chrom]
        assert end < bits.size
        cursor = 0
        while True:
            # Next uncovered stretch: [run_start, cursor).
            run_start = bits.next_clear(cursor)
            if run_start > end:
                break
            cursor = bits.next_set(run_start)
            do_windows(chrom, run_start, min(cursor, end), window_size)
def print_bits_as_bed(bits):
    """Print each maximal run of set bits in `bits` as one BED line.

    NOTE(review): the chromosome name comes from the module-level loop
    variable `chrom` below; this function must only be called from inside
    that loop.  Passing the name explicitly would be cleaner — confirm no
    external callers before changing the signature.
    """
    end = 0
    while True:
        start = bits.next_set(end)
        if start == bits.size:
            break
        end = bits.next_clear(start)
        # Single-argument print() behaves identically under Python 2 and 3
        # (the original used a Python 2 print statement).
        print("%s\t%d\t%d" % (chrom, start, end))


options, args = doc_optparse.parse(__doc__)
try:
    in_fname, in2_fname = args
except ValueError:
    doc_optparse.exit()
# Read both bed files into per-chromosome bitsets
bitsets1 = binned_bitsets_from_file(open(in_fname))
bitsets2 = binned_bitsets_from_file(open(in2_fname))
# Base-wise subtraction: clear from bits1 every base covered by bits2,
# then print the remaining runs.  (The original also re-checked
# `chrom not in bitsets1` while iterating bitsets1 — dead code, removed.)
for chrom in bitsets1:
    bits1 = bitsets1[chrom]
    if chrom in bitsets2:
        bits2 = bitsets2[chrom]
        bits2.invert()
        bits1.iand(bits2)
    print_bits_as_bed(bits1)
def main():
    """Aggregate score data over user-specified interval columns.

    Command line (via doc_optparse): score_file interval_file chrom_col
    start_col stop_col [out_file] (columns are 1-based).  For each valid
    interval row, the average, minimum and maximum score are appended to
    the original columns.  Comment lines are passed through; invalid data
    lines are counted and reported at the end.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_fname = args[0]
        interval_fname = args[1]
        chrom_col = args[2]
        start_col = args[3]
        stop_col = args[4]
        if len(args) > 5:
            out_file = open(args[5], 'w')
        else:
            out_file = sys.stdout
        binned = bool(options.binned)
        mask_fname = options.mask
    except Exception:
        doc_optparse.exit()
    if score_fname == 'None':
        stop_err('This tool works with data from genome builds hg16, hg17 or hg18. Click the pencil icon in your history item to set the genome build if appropriate.')
    try:
        # Convert 1-based user columns to 0-based indices.
        chrom_col = int(chrom_col) - 1
        start_col = int(start_col) - 1
        stop_col = int(stop_col) - 1
    except ValueError:
        stop_err('Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.')
    if chrom_col < 0 or start_col < 0 or stop_col < 0:
        stop_err('Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.')
    if binned:
        scores_by_chrom = load_scores_ba_dir(score_fname)
    else:
        try:
            chrom_buffer = int(options.chrom_buffer)
        except (TypeError, ValueError):
            # Option absent or non-numeric: fall back to the default buffer.
            chrom_buffer = 3
        scores_by_chrom = load_scores_wiggle(score_fname, chrom_buffer)
    if mask_fname:
        masks = binned_bitsets_from_file(open(mask_fname))
    else:
        masks = None
    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = ''
    for i, line in enumerate(open(interval_fname)):
        valid = True
        line = line.rstrip('\r\n')
        if line and not line.startswith('#'):
            fields = line.split()
            try:
                chrom, start, stop = fields[chrom_col], int(fields[start_col]), int(fields[stop_col])
            except (IndexError, ValueError):
                # Bookkeeping for bad rows happens in the `else` branch below;
                # the original also incremented skipped_lines here, which
                # double-counted every invalid line.
                valid = False
            if valid:
                total = 0
                count = 0
                min_score = 100000000
                max_score = -100000000
                for j in range(start, stop):
                    if chrom in scores_by_chrom:
                        try:
                            # Skip if base is masked
                            if masks and chrom in masks:
                                if masks[chrom][j]:
                                    continue
                            # Get the score, only count if not 'nan'
                            score = scores_by_chrom[chrom][j]
                            if not isnan(score):
                                total += score
                                count += 1
                                max_score = max(score, max_score)
                                min_score = min(score, min_score)
                        except Exception:
                            # Out-of-range position or unreadable score: skip base.
                            continue
                if count > 0:
                    avg = total / count
                else:
                    avg = "nan"
                    min_score = "nan"
                    max_score = "nan"
                # Append avg/min/max to the original columns.
                out_line = list(fields)
                out_line.append(avg)
                out_line.append(min_score)
                out_line.append(max_score)
                print("\t".join(map(str, out_line)), file=out_file)
            else:
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
        elif line.startswith('#'):
            # We'll save the original comments
            print(line, file=out_file)
    # Only close handles we opened; never close sys.stdout.
    if out_file is not sys.stdout:
        out_file.close()
    if skipped_lines > 0:
        print('Data issue: skipped %d invalid lines starting at line #%d which is "%s"' % (skipped_lines, first_invalid_line, invalid_line))
        # enumerate() is 0-based, so i + 1 is the number of lines read;
        # the original compared against i, misreporting the all-invalid case.
        if skipped_lines == i + 1:
            print('Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.')
covered in both of the inputs will be output. usage: %prog bed_file_1 bed_file_2 """ from __future__ import print_function from bx.bitset_builders import binned_bitsets_from_file from bx.cookbook import doc_optparse options, args = doc_optparse.parse(__doc__) try: in_fname, in2_fname = args except ValueError: doc_optparse.exit() bits1 = binned_bitsets_from_file(open(in_fname)) bits2 = binned_bitsets_from_file(open(in2_fname)) bitsets = dict() for key in bits1: if key in bits2: bits1[key].iand(bits2[key]) bitsets[key] = bits1[key] for chrom in bitsets: bits = bitsets[chrom] end = 0 while True: start = bits.next_set(end) if start == bits.size:
def main():
    """Aggregate score data over user-specified interval columns.

    Command line (via doc_optparse): score_file interval_file chrom_col
    start_col stop_col [out_file] (columns are 1-based).  Appends average,
    minimum and maximum score to each valid interval row; passes comment
    lines through and reports skipped invalid lines at the end.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_fname = args[0]
        interval_fname = args[1]
        chrom_col = args[2]
        start_col = args[3]
        stop_col = args[4]
        if len(args) > 5:
            out_file = open(args[5], 'w')
        else:
            out_file = sys.stdout
        binned = bool(options.binned)
        mask_fname = options.mask
    except Exception:
        doc_optparse.exit()
    if score_fname == 'None':
        stop_err(
            'This tool works with data from genome builds hg16, hg17 or hg18. Click the pencil icon in your history item to set the genome build if appropriate.'
        )
    try:
        # Convert 1-based user columns to 0-based indices.
        chrom_col = int(chrom_col) - 1
        start_col = int(start_col) - 1
        stop_col = int(stop_col) - 1
    except Exception:
        stop_err(
            'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.'
        )
    if chrom_col < 0 or start_col < 0 or stop_col < 0:
        stop_err(
            'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.'
        )
    if binned:
        scores_by_chrom = load_scores_ba_dir(score_fname)
    else:
        try:
            chrom_buffer = int(options.chrom_buffer)
        except Exception:
            # Option absent or non-numeric: fall back to the default buffer.
            chrom_buffer = 3
        scores_by_chrom = load_scores_wiggle(score_fname, chrom_buffer)
    if mask_fname:
        masks = binned_bitsets_from_file(open(mask_fname))
    else:
        masks = None
    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = ''
    for i, line in enumerate(open(interval_fname)):
        valid = True
        line = line.rstrip('\r\n')
        if line and not line.startswith('#'):
            fields = line.split()
            try:
                chrom, start, stop = fields[chrom_col], int(
                    fields[start_col]), int(fields[stop_col])
            except Exception:
                # Bookkeeping for bad rows happens in the `else` branch below;
                # the original also incremented skipped_lines here, which
                # double-counted every invalid line.
                valid = False
            if valid:
                total = 0
                count = 0
                min_score = 100000000
                max_score = -100000000
                for j in range(start, stop):
                    if chrom in scores_by_chrom:
                        try:
                            # Skip if base is masked
                            if masks and chrom in masks:
                                if masks[chrom][j]:
                                    continue
                            # Get the score, only count if not 'nan'
                            score = scores_by_chrom[chrom][j]
                            if not isnan(score):
                                total += score
                                count += 1
                                max_score = max(score, max_score)
                                min_score = min(score, min_score)
                        except Exception:
                            # Out-of-range position or unreadable score: skip base.
                            continue
                if count > 0:
                    avg = total / count
                else:
                    avg = "nan"
                    min_score = "nan"
                    max_score = "nan"
                # Append avg/min/max to the original columns.
                out_line = list(fields)
                out_line.append(avg)
                out_line.append(min_score)
                out_line.append(max_score)
                print("\t".join(map(str, out_line)), file=out_file)
            else:
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
        elif line.startswith('#'):
            # We'll save the original comments
            print(line, file=out_file)
    # Only close handles we opened; never close sys.stdout.
    if out_file is not sys.stdout:
        out_file.close()
    if skipped_lines > 0:
        print(
            'Data issue: skipped %d invalid lines starting at line #%d which is "%s"'
            % (skipped_lines, first_invalid_line, invalid_line))
        # enumerate() is 0-based, so i + 1 is the number of lines read;
        # the original compared against i, misreporting the all-invalid case.
        if skipped_lines == i + 1:
            print(
                'Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.'
            )
def main():
    """Windowed McDonald-Kreitman style scan over a MAF alignment.

    Builds divergence bitsets from the alignment, partitions SNP and
    divergence sites into AR (neutral) and non-AR space, then reports a
    p-value per sliding window.
    """
    parser = OptionParser(
        usage=
        "usage: %prog [options] maf_file snp_file neutral_file window_size step_size"
    )
    parser.add_option("-o", "--outfile", help="Specify file for output")
    parser.add_option("-s", "--species", type="string", default="panTro2")
    parser.add_option("-b", "--build", type="string", default="hg18")
    options, args = parser.parse_args()
    if len(args) != 5:
        parser.error("Incorrect number of arguments")
    else:
        maf_filename, snp_filename, neutral_filename = args[0], args[1], args[2]
        window_size, step_size = int(args[3]), int(args[4])
    if options.outfile is not None:
        out_file = open(options.outfile, 'w')

    # SNP and neutral-region bitsets, keyed by chromosome.
    AR_snp_bitsets = binned_bitsets_from_file(open(snp_filename))
    neutral_bitsets = binned_bitsets_from_file(open(neutral_filename))

    # Build per-chromosome divergence bitsets from the MAF alignment.
    AR_div_bitsets = {}
    chr_lens = {}
    for block in bx.align.maf.Reader(open(maf_filename)):
        ref = block.get_component_by_src_start(options.build)
        other = block.get_component_by_src_start(options.species)
        if ref is None or other is None:
            continue
        # Chromosome and start of the reference-species alignment.
        chrom = ref.src.split('.')[1]
        if chrom not in AR_div_bitsets:
            # First sighting: allocate the bitset and record the source length.
            AR_div_bitsets[chrom] = bx.bitset.BinnedBitSet()
            chr_lens[chrom] = ref.get_src_size()
        bitset = AR_div_bitsets[chrom]
        # Walk the aligned columns, marking diverged reference positions.
        pos = ref.start
        for base1, base2 in zip(ref.text.upper(), other.text.upper()):
            if base1 == '-':
                # Gap in the reference: no reference coordinate consumed.
                continue
            if base2 == '-':
                pos += 1
                continue
            # Diverged site, unless it is already a known SNP.
            if base1 != base2 and not AR_snp_bitsets[chrom][pos]:
                bitset.set(pos)
            pos += 1

    def duplicate(bits):
        # Copy a BinnedBitSet by OR-ing it into a fresh one.
        fresh = bx.bitset.BinnedBitSet()
        fresh.ior(bits)
        return fresh

    # Independent copies that will be restricted to the non-AR space.
    nonAR_snp_bitsets = {c: duplicate(b) for c, b in AR_snp_bitsets.items()}
    nonAR_div_bitsets = {c: duplicate(b) for c, b in AR_div_bitsets.items()}

    # Restrict the AR sets to the neutral intervals.
    for chrom in AR_snp_bitsets:
        AR_snp_bitsets[chrom].iand(neutral_bitsets[chrom])
    for chrom in AR_div_bitsets:
        AR_div_bitsets[chrom].iand(neutral_bitsets[chrom])
    # Invert neutral so it now describes the non-AR space, then restrict
    # the non-AR copies to it.
    for chrom in neutral_bitsets:
        neutral_bitsets[chrom].invert()
    for chrom in nonAR_snp_bitsets:
        nonAR_snp_bitsets[chrom].iand(neutral_bitsets[chrom])
    for chrom in nonAR_div_bitsets:
        nonAR_div_bitsets[chrom].iand(neutral_bitsets[chrom])

    # Slide a window along each chromosome and test each one.
    for chrom in AR_div_bitsets:
        for window in range(0, chr_lens[chrom] - window_size, step_size):
            AR_snp = AR_snp_bitsets[chrom].count_range(window, window_size)
            AR_div = AR_div_bitsets[chrom].count_range(window, window_size)
            nonAR_snp = nonAR_snp_bitsets[chrom].count_range(window, window_size)
            nonAR_div = nonAR_div_bitsets[chrom].count_range(window, window_size)
            # Chi-square when every cell count is at least 6; otherwise
            # fall back to Fisher's exact test.
            if min(nonAR_snp, nonAR_div, AR_snp, AR_div) >= 6:
                MK_pval = MK_chi_pvalue(nonAR_snp, nonAR_div, AR_snp, AR_div)
            else:
                MK_pval = MK_fisher_pvalue(nonAR_snp, nonAR_div, AR_snp, AR_div)
            row = "%s\t%d\t%d\t%d\t%d\t%d\t%d\t%1.15f" % (
                chrom, window, window + window_size, nonAR_snp, nonAR_div,
                AR_snp, AR_div, MK_pval)
            if options.outfile is not None:
                out_file.write(row + "\n")
            else:
                print(row)
    if options.outfile is not None:
        out_file.close()
def main():
    """Tabulate SNP and divergence counts in features vs. ancestral repeats.

    Args (via doc_optparse): feature_bed ar_bed snp_bed div_directory.
    Prints per-interval counts plus overall feature/AR totals; progress
    goes to stderr.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        lens = {}
        if options.lens:
            # Optional chromosome-length table: "<chrom> <length>" per line.
            for row in open(options.lens):
                chrom, length = row.split()
                lens[chrom] = int(length)
        suffix = options.suffix if options.suffix else ""
        print("\nReading feature", end=' ', file=sys.stderr)
        interval_file = open(args[0])
        feature = binned_bitsets_from_file(interval_file, lens=lens)
        interval_file.close()
        # Second pass over the same file to keep raw interval coordinates.
        intervals = {}
        interval_file = open(args[0])
        for row in interval_file:
            fields = row.split()
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            intervals.setdefault(chrom, []).append([start, end])
        interval_file.close()
        print("\nReading ar", end=' ', file=sys.stderr)
        ar = binned_bitsets_from_file(open(args[1]), lens=lens)
        print("\nReading snps", end=' ', file=sys.stderr)
        snp = binned_bitsets_from_file(open(args[2]), lens=lens)
        snp_mask = clone_inverted(snp)
        snp_copy = clone(snp)
        print("\nMasking AR", end=' ', file=sys.stderr)
        ar_mask = clone_inverted(ar)
        print(file=sys.stderr)
        dirname = args[3]
        mask = binned_bitsets_from_file(open(options.mask), lens=lens) if options.mask else None
    except Exception:
        doc_optparse.exit()

    if mask:
        # Restrict both feature and AR space to the masked regions.
        for chrom in mask.keys():
            if chrom in feature:
                feature[chrom].iand(mask[chrom])
            if chrom in ar:
                ar[chrom].iand(mask[chrom])

    # Running totals over all chromosomes.
    feature_div_count = 0
    feature_snp_count = 0
    ar_div_count = 0
    ar_snp_count = 0

    for chrom in feature.keys():
        # Only chromosomes present in every input can be counted.
        if chrom not in snp or chrom not in ar:
            continue
        print("reading %s ..." % chrom, end=' ', file=sys.stderr)
        try:
            div = binned_bitsets_from_file(open(dirname + "/%s.bed" % (chrom + suffix)), lens=lens)
        except Exception:
            print("%s.bed not found" % chrom, file=sys.stderr)
            continue
        # div/snp sites count snp-only
        div[chrom].iand(snp_mask[chrom])
        div_copy = clone(div)

        # -- AR pass: clip SNP and divergence sets to the AR space --
        print("AR:", chrom, end=' ', file=sys.stderr)
        snp[chrom].iand(ar[chrom])
        div[chrom].iand(ar[chrom])
        snp_count = snp[chrom].count_range(0, snp[chrom].size)
        ar_snp_count += snp_count
        print(snp_count, end=' ', file=sys.stderr)
        try:
            div_count = div[chrom].count_range(0, div[chrom].size)
            ar_div_count += div_count
            print(div_count, file=sys.stderr)
        except Exception:
            print(chrom, "failed", file=sys.stderr)

        # Restore the unclipped sets before the feature pass.
        div = div_copy
        snp[chrom] = snp_copy[chrom]

        # -- feature pass: clip to non-AR space, then to the features --
        print("feature:", chrom, end=' ', file=sys.stderr)
        feature[chrom].iand(ar_mask[chrom])  # clip to non-AR only
        snp[chrom].iand(feature[chrom])
        div[chrom].iand(feature[chrom])
        feature_snp_count += snp[chrom].count_range(0, snp[chrom].size)
        print(snp[chrom].count_range(0, snp[chrom].size),
              div[chrom].count_range(0, div[chrom].size), file=sys.stderr)
        feature_div_count += div[chrom].count_range(0, div[chrom].size)
        # NOTE(review): this second identical diagnostic print mirrors the
        # original source; it may be an unintentional duplicate.
        print(snp[chrom].count_range(0, snp[chrom].size),
              div[chrom].count_range(0, div[chrom].size), file=sys.stderr)

        # Per-interval counts for this chromosome.
        if chrom in intervals:
            for start, end in intervals[chrom]:
                ind_div_count = div[chrom].count_range(start, end - start)
                ind_snp_count = snp[chrom].count_range(start, end - start)
                print(chrom, start, end, ind_div_count, ind_snp_count)

    print("feature snp\t%d" % feature_snp_count)
    print("feature div\t%d" % feature_div_count)
    print("ar snp\t%d" % ar_snp_count)
    print("ar div\t%d" % ar_div_count)
#!/usr/bin/env python
"""
Print number of bases covered by all intervals in a bed file (bases covered
by more than one interval are counted only once). Multiple bed files can be
provided on the command line or to stdin.

usage: %prog bed files ...
"""
from __future__ import print_function

import sys
from itertools import chain

from bx.bitset_builders import binned_bitsets_from_file

bed_filenames = sys.argv[1:]
if bed_filenames:
    # Treat the named bed files as one concatenated stream; from_iterable
    # keeps the generator lazy (the original chain(*gen) opened every file
    # up front) and avoids `_` as a real variable name.
    in_stream = chain.from_iterable(open(fname) for fname in bed_filenames)
else:
    # Renamed from `input`, which shadowed the builtin.
    in_stream = sys.stdin
bitsets = binned_bitsets_from_file(in_stream)
# Each bitset holds one bit per covered base, so counting set bits over the
# full range never double-counts overlapping intervals.
total = sum(bits.count_range(0, bits.size) for bits in bitsets.values())
print(total)
def read_len(f): """Read a 'LEN' file and return a mapping from chromosome to length""" mapping = dict() for line in f: fields = line.split() mapping[fields[0]] = int(fields[1]) return mapping options, args = doc_optparse.parse(__doc__) try: in_fname, len_fname = args except Exception: doc_optparse.exit() bitsets = binned_bitsets_from_file(open(in_fname)) lens = read_len(open(len_fname)) for chrom in lens: if chrom in bitsets: bits = bitsets[chrom] bits.invert() len = lens[chrom] end = 0 while True: start = bits.next_set(end) if start == bits.size: break end = bits.next_clear(start) if end > len:
""" For each interval in `bed1` print the fraction of bases covered by `bed2`. usage: %prog bed1 bed2 [mask] """ from __future__ import division, print_function import sys from bx.bitset import BinnedBitSet from bx.bitset_builders import binned_bitsets_from_file bed1_fname, bed2_fname = sys.argv[1:3] bitsets = binned_bitsets_from_file(open(bed2_fname)) def clone(bits): b = BinnedBitSet(bits.size) b.ior(bits) return b if len(sys.argv) > 3: mask_fname = sys.argv[3] mask = binned_bitsets_from_file(open(mask_fname)) new_bitsets = dict() for key in bitsets: if key in mask: b = clone(mask[key])