def _consumer(input_queue, output_queue):
    # Worker process: pull BED lines from input_queue, measure per-feature
    # conservation, and push tab-delimited result rows to output_queue.
    # NOTE(review): relies on module-level globals `chrom_bigwig_dict` and
    # `window_sizes` being defined before the worker starts -- confirm.
    while True:
        line = input_queue.get()
        if line is None:
            break  # sentinel: no more work for this worker
        f = BEDFeature.from_string(line)
        # retrieve conservation data
        bigwig_file = chrom_bigwig_dict[f.chrom]
        arr = extract_bigwig_data(f, bigwig_file)
        # measure conservation at various sliding windows
        window_scores = []
        for window_size in window_sizes:
            window_scores.append(best_sliding_window(arr, window_size, np.mean))
        # measure average conservation over finite values only
        # (ignores NaN/inf positions in the signal array)
        finitearr = arr[np.isfinite(arr)]
        if len(finitearr) == 0:
            mean_cons = np.nan
        else:
            mean_cons = np.mean(finitearr)
        fields = [f.name,
                  '%s:%d-%d[%s]' % (f.chrom, f.tx_start, f.tx_end, f.strand),
                  str(len(arr)),
                  str(mean_cons)]
        fields.extend(map(str, window_scores))
        result = '\t'.join(fields)
        output_queue.put(result)
    # propagate the sentinel so the downstream reader knows this worker is done
    output_queue.put(None)
def _consumer(worker_index, input_queue, output_queue):
    """Worker process: read BED lines from input_queue, compute conservation
    fields for each feature, and put tab-delimited result rows on output_queue.

    Per-worker histograms are saved to 'w<worker_index>.npz' when the input
    sentinel (None) is received; a None sentinel is then put on output_queue
    to signal completion to the collector.
    """
    # FIX: np.float was a deprecated alias for the builtin float and was
    # removed in NumPy 1.24; the builtin gives the identical float64 dtype.
    hists = collections.defaultdict(
        lambda: np.zeros(NUM_BINS - 1, dtype=float))
    while True:
        line = input_queue.get()
        if line is None:
            break  # sentinel: no more work
        f = BEDFeature.from_string(line)
        # NOTE(review): chrom_bigwig_dict is a module-level global inherited
        # by the worker process -- confirm it is populated before workers start.
        fields = bed_feature_conservation(f, chrom_bigwig_dict, hists)
        result = '\t'.join(fields)
        output_queue.put(result)
    np.savez('w%d.npz' % (worker_index), **hists)
    output_queue.put(None)
def main():
    """Command-line entry point: measure conservation for every feature in a
    BED file using per-chromosome bigWig files.

    Writes '<bed_prefix>.results.txt' (tab-delimited rows) and
    '<bed_prefix>.hists.npz' (per-key histograms). Returns 0 on success.
    """
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="store_true",
                        dest="verbose", default=False)
    parser.add_argument('-p', '--num-processes', type=int,
                        dest='num_processes', default=1)
    parser.add_argument("--pattern", dest="pattern",
                        default=r'{{CHROM}}.phyloP46way.bw')
    parser.add_argument("bigwig_file_dir")
    parser.add_argument("bed_file")
    args = parser.parse_args()
    # set logging level
    if args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # check command line parameters
    if which(BIGWIG_TO_BEDGRAPH_BIN) is None:
        parser.error('bigWigToBedGraph binary not found in PATH')
    if not os.path.exists(args.bed_file):
        parser.error("BED file %s not found" % (args.bed_file))
    prefix = os.path.splitext(args.bed_file)[0]
    results_file = prefix + '.results.txt'
    hists_file = prefix + '.hists.npz'
    # find bigwig files
    logging.info("Indexing bigWig files")
    chrom_bigwig_dict = find_bigwig_files(args.bigwig_file_dir, args.pattern)
    # process bed file
    logging.info("Measuring conservation")
    if args.num_processes > 1:
        conservation_parallel(args.bed_file, chrom_bigwig_dict,
                              args.num_processes, results_file, hists_file)
    else:
        # FIX: np.float was a deprecated alias for the builtin float and was
        # removed in NumPy 1.24; the builtin gives the identical float64 dtype.
        hists = collections.defaultdict(
            lambda: np.zeros(NUM_BINS - 1, dtype=float))
        # FIX: manage the input BED file with 'with' too -- the original
        # leaked the open file handle returned by open(args.bed_file)
        with open(results_file, 'w') as outfile, \
                open(args.bed_file) as bedfileh:
            for f in BEDFeature.parse(bedfileh):
                fields = bed_feature_conservation(f, chrom_bigwig_dict, hists)
                # write() instead of the Python-2-only 'print >>' statement;
                # the emitted bytes are identical
                outfile.write('\t'.join(fields) + '\n')
        np.savez(hists_file, **hists)
    return 0
def _consumer(worker_index, input_queue, output_queue):
    """Worker process: consume BED lines from input_queue until a None
    sentinel arrives, emitting one tab-delimited result row per feature.

    On shutdown, saves this worker's histograms to 'w<worker_index>.npz'
    and puts a None sentinel on output_queue so the collector can tell
    this worker has finished.
    """
    # FIX: np.float was a deprecated alias for the builtin float and was
    # removed in NumPy 1.24; the builtin gives the identical float64 dtype.
    hists = collections.defaultdict(
        lambda: np.zeros(NUM_BINS - 1, dtype=float))
    while True:
        line = input_queue.get()
        if line is None:
            break  # sentinel: no more work
        f = BEDFeature.from_string(line)
        # NOTE(review): chrom_bigwig_dict is a module-level global inherited
        # by the worker process -- confirm it is populated before workers start.
        fields = bed_feature_conservation(f, chrom_bigwig_dict, hists)
        result = '\t'.join(fields)
        output_queue.put(result)
    np.savez('w%d.npz' % (worker_index), **hists)
    output_queue.put(None)
def main():
    """Command-line entry point: measure conservation for every feature in a
    BED file using per-chromosome bigWig files.

    Writes '<bed_prefix>.results.txt' (tab-delimited rows) and
    '<bed_prefix>.hists.npz' (per-key histograms). Returns 0 on success.
    """
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="store_true",
                        dest="verbose", default=False)
    parser.add_argument('-p', '--num-processes', type=int,
                        dest='num_processes', default=1)
    parser.add_argument("--pattern", dest="pattern",
                        default=r'{{CHROM}}.phyloP46way.bw')
    parser.add_argument("bigwig_file_dir")
    parser.add_argument("bed_file")
    args = parser.parse_args()
    # set logging level
    if args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # check command line parameters
    if which(BIGWIG_TO_BEDGRAPH_BIN) is None:
        parser.error('bigWigToBedGraph binary not found in PATH')
    if not os.path.exists(args.bed_file):
        parser.error("BED file %s not found" % (args.bed_file))
    prefix = os.path.splitext(args.bed_file)[0]
    results_file = prefix + '.results.txt'
    hists_file = prefix + '.hists.npz'
    # find bigwig files
    logging.info("Indexing bigWig files")
    chrom_bigwig_dict = find_bigwig_files(args.bigwig_file_dir, args.pattern)
    # process bed file
    logging.info("Measuring conservation")
    if args.num_processes > 1:
        conservation_parallel(args.bed_file, chrom_bigwig_dict,
                              args.num_processes, results_file, hists_file)
    else:
        # FIX: np.float was a deprecated alias for the builtin float and was
        # removed in NumPy 1.24; the builtin gives the identical float64 dtype.
        hists = collections.defaultdict(
            lambda: np.zeros(NUM_BINS - 1, dtype=float))
        # FIX: manage the input BED file with 'with' too -- the original
        # leaked the open file handle returned by open(args.bed_file)
        with open(results_file, 'w') as outfile, \
                open(args.bed_file) as bedfileh:
            for f in BEDFeature.parse(bedfileh):
                fields = bed_feature_conservation(f, chrom_bigwig_dict, hists)
                # write() instead of the Python-2-only 'print >>' statement;
                # the emitted bytes are identical
                outfile.write('\t'.join(fields) + '\n')
        np.savez(hists_file, **hists)
    return 0
def main(): logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") parser = argparse.ArgumentParser() parser.add_argument("--source", dest="source", default='bed_to_gtf') parser.add_argument("bed_file") args = parser.parse_args() bed_file = args.bed_file source = args.source for x in BEDFeature.parse(open(bed_file)): f = GTFFeature() f.seqid = x.chrom f.source = source f.feature_type = 'transcript' f.start = x.tx_start f.end = x.tx_end f.score = x.score f.strand = x.strand f.phase = '.' f.attrs = {'transcript_id': x.name, 'gene_id': x.name} features = [f] for i,e in enumerate(x.exons): start, end = e f = GTFFeature() f.seqid = x.chrom f.source = source f.feature_type = 'exon' f.start = start f.end = end f.score = x.score f.strand = x.strand f.phase = '.' f.attrs = dict(features[0].attrs) f.attrs["exon_number"] = i features.append(f) for f in features: print str(f)
def main(): logging.basicConfig( level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") parser = argparse.ArgumentParser() parser.add_argument("--source", dest="source", default='bed_to_gtf') parser.add_argument("bed_file") args = parser.parse_args() bed_file = args.bed_file source = args.source for x in BEDFeature.parse(open(bed_file)): f = GTFFeature() f.seqid = x.chrom f.source = source f.feature_type = 'transcript' f.start = x.tx_start f.end = x.tx_end f.score = x.score f.strand = x.strand f.phase = '.' f.attrs = {'transcript_id': x.name, 'gene_id': x.name} features = [f] for i, e in enumerate(x.exons): start, end = e f = GTFFeature() f.seqid = x.chrom f.source = source f.feature_type = 'exon' f.start = start f.end = end f.score = x.score f.strand = x.strand f.phase = '.' f.attrs = dict(features[0].attrs) f.attrs["exon_number"] = i features.append(f) for f in features: print str(f)
def conservation_serial(bed_file, window_sizes, chrom_bigwig_dict): # output header fields fields = ['name', 'position', 'transcript_length', 'mean'] fields.extend(map(str,window_sizes)) print '\t'.join(fields) # process bed file for f in BEDFeature.parse(open(bed_file)): # retrieve conservation data bigwig_file = chrom_bigwig_dict[f.chrom] arr = extract_bigwig_data(f, bigwig_file) # measure conservation at various sliding windows window_scores = [] for window_size in window_sizes: window_scores.append(best_sliding_window(arr, window_size, np.mean)) # calc mean conservation finitearr = arr[np.isfinite(arr)] if len(finitearr) == 0: mean_cons = np.nan else: mean_cons = np.mean(finitearr) fields = [f.name, '%s:%d-%d[%s]' % (f.chrom, f.tx_start, f.tx_end, f.strand), str(len(arr)), str(mean_cons)] fields.extend(map(str,window_scores)) print '\t'.join(fields)