import argparse
import bisect
import collections
import logging
import multiprocessing
import os
import random

import numpy as np
from bx.intervals.cluster import ClusterTree

# BEDFeature and extract_bigwig_data are provided elsewhere in this
# package; the module name below is assumed.
from bedutils import BEDFeature, extract_bigwig_data


def conservation_serial(bed_file, window_sizes, chrom_bigwig_dict):
    # output header fields
    fields = ['name', 'position', 'transcript_length', 'mean']
    fields.extend(map(str, window_sizes))
    print '\t'.join(fields)
    # process bed file
    for f in BEDFeature.parse(open(bed_file)):
        # retrieve conservation data
        bigwig_file = chrom_bigwig_dict[f.chrom]
        arr = extract_bigwig_data(f, bigwig_file)
        # measure conservation at various sliding windows
        window_scores = []
        for window_size in window_sizes:
            window_scores.append(best_sliding_window(arr, window_size, np.mean))
        # calc mean conservation over finite values only
        finitearr = arr[np.isfinite(arr)]
        if len(finitearr) == 0:
            mean_cons = np.nan
        else:
            mean_cons = np.mean(finitearr)
        fields = [f.name,
                  '%s:%d-%d[%s]' % (f.chrom, f.tx_start, f.tx_end, f.strand),
                  str(len(arr)),
                  str(mean_cons)]
        fields.extend(map(str, window_scores))
        print '\t'.join(fields)
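
# best_sliding_window is called above but not defined in this section. A
# minimal sketch, assuming 'best' means the maximum of func applied to the
# finite values of each window of the given size; the real implementation
# may differ.
def best_sliding_window(arr, window_size, func):
    if len(arr) < window_size:
        return np.nan
    best = np.nan
    for i in xrange(len(arr) - window_size + 1):
        window = arr[i:i + window_size]
        finite = window[np.isfinite(window)]
        if len(finite) == 0:
            continue
        score = func(finite)
        if np.isnan(best) or score > best:
            best = score
    return best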
def _consumer(input_queue, output_queue):
    # chrom_bigwig_dict and window_sizes are free variables here: this
    # worker expects them at module scope (inherited on fork)
    while True:
        line = input_queue.get()
        if line is None:
            break
        f = BEDFeature.from_string(line)
        # retrieve conservation data
        bigwig_file = chrom_bigwig_dict[f.chrom]
        arr = extract_bigwig_data(f, bigwig_file)
        # measure conservation at various sliding windows
        window_scores = []
        for window_size in window_sizes:
            window_scores.append(best_sliding_window(arr, window_size, np.mean))
        # measure average conservation over finite values only
        finitearr = arr[np.isfinite(arr)]
        if len(finitearr) == 0:
            mean_cons = np.nan
        else:
            mean_cons = np.mean(finitearr)
        fields = [f.name,
                  '%s:%d-%d[%s]' % (f.chrom, f.tx_start, f.tx_end, f.strand),
                  str(len(arr)),
                  str(mean_cons)]
        fields.extend(map(str, window_scores))
        output_queue.put('\t'.join(fields))
    # signal that this worker is done
    output_queue.put(None)
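
# The driver that feeds _consumer is not shown in this section. A minimal
# sketch of the parallel path, assuming fork-based multiprocessing so the
# workers see chrom_bigwig_dict and window_sizes at module scope; the name
# conservation_parallel is hypothetical.
def conservation_parallel(bed_file, window_sizes, chrom_bigwig_dict,
                          num_processes):
    input_queue = multiprocessing.Queue()
    output_queue = multiprocessing.Queue()
    # start consumer processes
    procs = [multiprocessing.Process(target=_consumer,
                                     args=(input_queue, output_queue))
             for i in xrange(num_processes)]
    for p in procs:
        p.start()
    # feed BED lines to the consumers, then one sentinel per process
    for line in open(bed_file):
        input_queue.put(line)
    for i in xrange(num_processes):
        input_queue.put(None)
    # collect results until every consumer has sent its sentinel
    done = 0
    while done < num_processes:
        result = output_queue.get()
        if result is None:
            done += 1
        else:
            print result
    for p in procs:
        p.join()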
def conservation_serial(bed_file, chrom_bigwig_dict, sig_threshold,
                        window_sizes, num_processes):
    # num_processes is unused in the serial path
    # process bed file
    for f in BEDFeature.parse(open(bed_file)):
        fields = feature_conservation(f, chrom_bigwig_dict, sig_threshold,
                                      window_sizes)
        print '\t'.join(fields)
def _consumer(input_queue, output_queue):
    # chrom_bigwig_dict, sig_threshold and window_sizes are free variables
    # here: this worker expects them at module scope (inherited on fork)
    while True:
        line = input_queue.get()
        if line is None:
            break
        f = BEDFeature.from_string(line)
        fields = feature_conservation(f, chrom_bigwig_dict, sig_threshold,
                                      window_sizes)
        output_queue.put('\t'.join(fields))
    # signal that this worker is done
    output_queue.put(None)
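
# feature_conservation is called by both refactored functions above but is
# not defined in this section. A minimal sketch reconstructed from the
# inline logic of the first conservation_serial; the source does not show
# how sig_threshold is used, so it is accepted but unused here.
def feature_conservation(f, chrom_bigwig_dict, sig_threshold, window_sizes):
    # retrieve conservation data
    bigwig_file = chrom_bigwig_dict[f.chrom]
    arr = extract_bigwig_data(f, bigwig_file)
    # measure conservation at various sliding windows
    window_scores = [best_sliding_window(arr, window_size, np.mean)
                     for window_size in window_sizes]
    # mean conservation over finite values only
    finitearr = arr[np.isfinite(arr)]
    if len(finitearr) == 0:
        mean_cons = np.nan
    else:
        mean_cons = np.mean(finitearr)
    fields = [f.name,
              '%s:%d-%d[%s]' % (f.chrom, f.tx_start, f.tx_end, f.strand),
              str(len(arr)),
              str(mean_cons)]
    fields.extend(map(str, window_scores))
    return fields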
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('bed_file')
    args = parser.parse_args()
    # build one cluster tree per chromosome and insert every exon
    trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    for f in BEDFeature.parse(open(args.bed_file)):
        tree = trees[f.chrom]
        for start, end in f.exons:
            tree.insert(start, end, 1)
    # total the merged (non-redundant) exonic footprint
    footprint = 0
    for chrom in sorted(trees):
        chrom_footprint = 0
        tree = trees[chrom]
        for start, end, indexes in tree.getregions():
            chrom_footprint += (end - start)
        #print chrom, chrom_footprint
        footprint += chrom_footprint
    print 'total', footprint
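
# ClusterTree(0, 1) merges intervals that are within 0 bp of each other and
# reports clusters containing at least 1 interval, so getregions() yields
# the non-redundant union of the inserted exons. A small illustration,
# assuming the bx-python ClusterTree semantics:
#
#   tree = ClusterTree(0, 1)
#   tree.insert(10, 20, 1)   # exon A
#   tree.insert(15, 30, 2)   # exon B overlaps A
#   tree.getregions()        # -> [(10, 30, [1, 2])]
#
# so the merged footprint is 30 - 10 = 20 bases, not 10 + 15 = 25.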
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', type=int, dest='num_samples', default=100000)
    parser.add_argument('genome_bed_file')
    parser.add_argument('region_lengths_file')
    # parse arguments
    args = parser.parse_args()
    genome_bed_file = args.genome_bed_file
    region_lengths_file = args.region_lengths_file
    num_samples = max(1, args.num_samples)
    # check arguments
    if not os.path.exists(genome_bed_file):
        parser.error('genome bed file %s not found' % (genome_bed_file))
    if not os.path.exists(region_lengths_file):
        parser.error('region lengths file %s not found' % (region_lengths_file))
    # get region lengths from the region lengths file
    logging.debug('Reading region lengths')
    region_lengths = []
    with open(region_lengths_file) as f:
        region_lengths.extend(int(x.strip()) for x in f)
    # first pass to index windows: record each feature's cumulative start
    # offset so a random genome position can be mapped back to a feature
    logging.debug('Reading genome bed file')
    features = []
    genome_cumsums = []
    genome_size = 0
    max_feature_size = 0
    for f in BEDFeature.parse(open(genome_bed_file)):
        length = sum((e[1] - e[0]) for e in f.exons)
        if length == 0:
            continue
        features.append(f)
        genome_cumsums.append(genome_size)
        genome_size += length
        max_feature_size = max(max_feature_size, length)
    logging.debug('Genome bed size %d' % (genome_size))
    # get windows
    trials = 0
    windows = []
    for i in xrange(num_samples):
        # choose random region length; a length longer than every feature
        # could never be placed and would loop forever, so skip it
        length = region_lengths[random.randrange(len(region_lengths))]
        if length > max_feature_size:
            logging.warning('region length %d exceeds largest feature, skipping' % (length))
            continue
        while True:
            trials += 1
            # choose random window start and map it back to a feature
            genome_start = random.randrange(genome_size - length)
            feature_index = bisect.bisect_right(genome_cumsums, genome_start) - 1
            f = features[feature_index]
            feature_length = (f.tx_end - f.tx_start)
            # can use available length of this feature
            genome_feature_start = genome_cumsums[feature_index]
            feature_offset = genome_start - genome_feature_start
            avail_length = feature_length - feature_offset
            if avail_length < length:
                continue
            windows.append((f.chrom,
                            f.tx_start + feature_offset,
                            f.tx_start + feature_offset + length))
            break
        if (i > 0) and (i % 1000) == 0:
            logging.debug('Finished %d' % (i))
    logging.debug('Sampled %d windows (%d trials)' % (len(windows), trials))
    for window in sorted(windows):
        print '\t'.join(map(str, window))
    return 0
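
# A worked example of the cumulative-sum lookup above: for features with
# exonic lengths [100, 50, 200], genome_cumsums is [0, 100, 150] and
# genome_size is 350. A random genome_start of 120 maps to
#
#   bisect.bisect_right([0, 100, 150], 120) - 1  ->  1
#
# i.e. offset 120 - 100 = 20 into the second feature, so the sampled
# window starts 20 bases past that feature's transcript start.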