def test_read_intervals(tmpdir):
    """Write intervals to a file and read it through read_intervals \
    API. Compare the output of read_intervals with original data."""
    intervals = {"chrom": ["chr1", "chr2"],
                 "start": [0, 100],
                 "end": [100, 500]}
    input_df = pd.DataFrame(intervals)
    bedfile = os.path.join(tmpdir, "intervals.bed")
    input_df.to_csv(bedfile, sep="\t", header=False, index=False)
    output_df = bedio.read_intervals(bedfile)
    assert input_df.equals(output_df)
def peak2bw(input_file, sizesfile, out_dir):
    """Convert peak files to bigWig.

    Args:
        input_file: Clean peak file to be converted to bigWig. Needs to be
            either a BED or narrowPeak file.
        sizesfile: BED file containing chromosome sizes.
        out_dir: Directory to save the outputs to.

    Returns:
        Path to the output file.

    """
    # Create the output folder
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Set name for output file
    prefix = os.path.basename(input_file)
    out_bg_name = os.path.join(out_dir, prefix + '.bedGraph')
    out_bw_name = os.path.join(out_dir, prefix + '.bw')

    # Read input files
    _logger.info('Reading input file')
    # Skip first line if the file is narrowPeak
    skip = False
    if input_file.endswith("narrowPeak"):
        skip = True
    peaks = read_intervals(input_file, skip=skip)
    _logger.info('Read ' + str(len(peaks)) + ' peaks.')
    sizes = read_sizes(sizesfile)

    # Add score of 1 for all peaks
    _logger.info('Adding score')
    peaks['score'] = 1

    # Write bedGraph
    _logger.info('Writing peaks to bedGraph file')
    # Note: peaks will be subset to chromosomes in sizes file.
    df_to_bedGraph(peaks, out_bg_name, sizes)

    # Write bigWig and delete bedGraph
    _logger.info('Writing peaks to bigWig file {}'.format(out_bw_name))
    bedgraph_to_bigwig(out_bg_name, sizesfile, deletebg=True, sort=True)

    _logger.info('Done!')
    return out_bw_name
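# A minimal usage sketch for peak2bw (the file names below are hypothetical,
# not part of this repository): convert a narrowPeak file of clean peak calls
# into a bigWig track that can later serve as classification labels.
#
#     clean_peak_bw = peak2bw(
#         input_file="sample.clean.narrowPeak",   # hypothetical peak file
#         sizesfile="hg38.chrom.sizes",           # hypothetical sizes file
#         out_dir="outputs/bigwig_peakfiles")
#     # Expected result: a bedGraph is written, converted to
#     # "outputs/bigwig_peakfiles/sample.clean.narrowPeak.bw", and that
#     # bigWig path is returned.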
def test_read_intervals_skip(tmpdir):
    """Write intervals to a file and read it using read_intervals \
    API, by skipping the first row. Compare the output with \
    original data with first row skipped."""
    intervals = {"chrom": ["chr1", "chr2"],
                 "start": [0, 100],
                 "end": [100, 500]}
    intervals_skip = {"chrom": ["chr2"], "start": [100], "end": [500]}
    input_df = pd.DataFrame(intervals)
    input_df_skip = pd.DataFrame(intervals_skip)
    bedfile = os.path.join(tmpdir, "intervals.bed")
    input_df.to_csv(bedfile, sep="\t", header=False, index=False)
    output_df = bedio.read_intervals(bedfile, skip=1)
    assert input_df_skip.equals(output_df)
def main():
    """Main."""
    root_dir = os.path.abspath(
        os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), ".."))
    args = parse_args(root_dir)
    genomes = {
        "hg19": os.path.join(root_dir, "reference", "hg19.chrom.sizes"),
        "hg38": os.path.join(root_dir, "reference", "hg38.chrom.sizes")
    }
    if args.genome in genomes:
        args.genome = genomes[args.genome]

    # Set log level
    _logger.debug(args)

    # check gpu
    # TODO: add cpu support
    if not torch.cuda.is_available():
        raise Exception("No GPU available. Check your machine configuration.")

    # all output will be written in the exp_dir folder
    args.exp_dir = make_experiment_dir(args.exp_name, args.out_home,
                                       timestamp=True)

    # Convert layer names to a list
    if args.layers is not None:
        args.layers = args.layers.strip("[]").split(",")

    if args.seed is not None and args.seed > 0:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    # train & resume
    ##########################################################################
    if args.mode == "train":
        # If h5 files are provided, load them.
        if args.train_h5_files is not None:
            args.train_files = gather_files_from_cmdline(args.train_h5_files,
                                                         extension=".h5")
            args.val_files = gather_files_from_cmdline(args.val_h5_files,
                                                       extension=".h5")
        # If h5 files not given, generate them.
        else:
            args.cleanpeakfile = gather_files_from_cmdline(
                args.cleanpeakfile, extension=(".bed", ".narrowPeak"))
            args.noisybw = gather_files_from_cmdline(args.noisybw,
                                                     extension=".bw")
            args.cleanbw = gather_files_from_cmdline(args.cleanbw,
                                                     extension=".bw")

            # We have to make sure there is a 1-1 correspondence between
            # files.
            assert len(args.cleanpeakfile) == len(args.noisybw)
            assert len(args.cleanbw) == len(args.noisybw)

            train_files = []
            val_files = []
            for idx in range(len(args.cleanbw)):
                cleanbw = args.cleanbw[idx]
                noisybw = args.noisybw[idx]
                cleanpeakfile = args.cleanpeakfile[idx]

                # Read in the narrowPeak or BED files for clean data peak
                # labels, convert them to bigwig
                out_path = os.path.join(args.exp_dir, "bigwig_peakfiles")
                cleanpeakbw = peak2bw(cleanpeakfile, args.genome, out_path)

                # Generate training, validation, holdout intervals files
                out_path = os.path.join(args.exp_dir, "intervals")
                train_intervals, val_intervals, holdout_intervals = \
                    get_intervals(args.genome, args.interval_size, out_path,
                                  val=args.val_chrom,
                                  holdout=args.holdout_chrom,
                                  nonpeak=args.nonpeak,
                                  peakfile=cleanpeakbw)

                # Convert the input bigwig files and the clean peak files
                # into h5 for training.
                out_path = os.path.join(args.exp_dir, "bw2h5")
                nonzero = True
                prefix = os.path.basename(cleanbw) + ".train"
                train_file = bw2h5(noisybw, cleanbw, args.layersbw,
                                   cleanpeakbw, args.read_buffer, nonzero,
                                   train_intervals, out_path, prefix,
                                   args.pad)
                train_files.append(train_file)

                prefix = os.path.basename(cleanbw) + ".val"
                val_file = bw2h5(noisybw, cleanbw, args.layersbw,
                                 cleanpeakbw, args.read_buffer, nonzero,
                                 val_intervals, out_path, prefix, args.pad)
                val_files.append(val_file)

            args.train_files = train_files
            args.val_files = val_files

        _logger.debug("Training data: " + "\n".join(args.train_files))
        _logger.debug("Validation data: " + "\n".join(args.val_files))

        # Get model parameters
        with h5py.File(args.train_files[0], 'r') as f:
            if args.pad is not None:
                args.interval_size = f['input'].shape[1] - 2 * args.pad
            else:
                args.interval_size = f['input'].shape[1]

        ngpus_per_node = torch.cuda.device_count()
        # WAR: gloo distributed doesn't work if world size is 1.
        # This is fixed in newer torch version -
        # https://github.com/facebookincubator/gloo/issues/209
        if ngpus_per_node == 1:
            args.distributed = False
            args.gpu_idx = 0

        config_dir = os.path.join(args.exp_dir, "configs")
        if not os.path.exists(config_dir):
            os.mkdir(config_dir)

        if args.distributed:
            _logger.info('Distributing to %s GPUS' % str(ngpus_per_node))
            args.world_size = ngpus_per_node
            mp.spawn(train_worker, nprocs=ngpus_per_node,
                     args=(ngpus_per_node, args), join=True)
        else:
            assert_device_available(args.gpu_idx)
            _logger.info('Running on GPU: %s' % str(args.gpu_idx))
            args.world_size = 1
            train_worker(args.gpu_idx, ngpus_per_node, args, timers=Timers)

    # infer & eval
    ##########################################################################
    if args.mode == "denoise" or args.mode == "eval":
        files = []
        if args.denoise_h5_files is not None:
            files = gather_files_from_cmdline(args.denoise_h5_files,
                                              extension=".h5")
            infer_intervals = args.intervals_file
        else:
            cleanpeakbw = None
            if args.mode == "eval":
                # Read in the narrowPeak or BED files for clean data peak
                # labels, convert them to bigwig
                out_path = os.path.join(args.exp_dir, "bigwig_peakfiles")
                cleanpeakbw = peak2bw(args.cleanpeakfile, args.genome,
                                      out_path)
                args.cleanbw = gather_files_from_cmdline(args.cleanbw,
                                                         extension=".bw")

            out_path = os.path.join(args.exp_dir, "intervals")
            infer_intervals = get_intervals(args.genome, args.interval_size,
                                            out_path,
                                            peakfile=cleanpeakbw,
                                            regions=args.regions)

            # Convert the input bigwig files and the clean peak files into
            # h5 for inference.
            args.noisybw = gather_files_from_cmdline(args.noisybw,
                                                     extension=".bw")
            for idx in range(len(args.noisybw)):
                out_path = os.path.join(args.exp_dir, "bw2h5")
                nonzero = False
                cleanbw = None
                noisybw = args.noisybw[idx]
                if args.mode == "eval":
                    cleanbw = args.cleanbw[idx]
                prefix = os.path.basename(noisybw) + "." + args.mode
                infer_file = bw2h5(noisybw, cleanbw, args.layersbw,
                                   cleanpeakbw, args.read_buffer, nonzero,
                                   infer_intervals, out_path, prefix,
                                   args.pad)
                files.append(infer_file)

        for x in range(len(files)):
            infile = files[x]
            args.input_files = [infile]
            if args.mode == "denoise":
                _logger.debug("Inference data: " + infile)

                # Check that intervals, sizes and h5 file are all compatible.
                _logger.info('Checking input files for compatibility')
                intervals = read_intervals(infer_intervals)
                sizes = read_sizes(args.genome)
                check_intervals(intervals, sizes, infile)

                # Delete intervals and sizes objects in main thread
                del intervals
                del sizes
            else:
                _logger.debug("Evaluation data: " + infile)

            # Get model parameters
            with h5py.File(files[x], 'r') as f:
                if args.pad is not None:
                    args.interval_size = f['input'].shape[1] - 2 * args.pad
                else:
                    args.interval_size = f['input'].shape[1]

            # Make sure that interval_size is a multiple of the
            # out_resolution
            if args.out_resolution is not None:
                assert (args.interval_size % args.out_resolution == 0)

            prefix = os.path.basename(infile).split(".")[0]

            # setup queue and kick off writer process
            #############################################################
            manager = mp.Manager()
            res_queue = manager.Queue()
            if args.mode == "denoise":
                # Create a keyword argument dictionary to pass into the
                # multiprocessor
                keyword_args = {
                    "infer": True,
                    "intervals_file": infer_intervals,
                    "exp_dir": args.exp_dir,
                    "task": args.task,
                    "peaks": args.peaks,
                    "tracks": args.tracks,
                    "num_workers": args.num_workers,
                    "infer_threshold": args.threshold,
                    "reg_rounding": args.reg_rounding,
                    "batches_per_worker": args.batches_per_worker,
                    "gen_bigwig": args.gen_bigwig,
                    "sizes_file": args.genome,
                    "res_queue": res_queue,
                    "prefix": prefix,
                    "deletebg": args.deletebg,
                    "out_resolution": args.out_resolution
                }
                write_proc = mp.Process(target=writer, kwargs=keyword_args)
                write_proc.start()
            #############################################################

            ngpus_per_node = torch.cuda.device_count()
            # WAR: gloo distributed doesn't work if world size is 1.
            # This is fixed in newer torch version -
            # https://github.com/facebookincubator/gloo/issues/209
            if ngpus_per_node == 1:
                args.distributed = False
                args.gpu_idx = 0

            worker = infer_worker if args.mode == "denoise" else eval_worker
            if args.distributed:
                args.world_size = ngpus_per_node
                mp.spawn(worker, nprocs=ngpus_per_node,
                         args=(ngpus_per_node, args, res_queue), join=True)
            else:
                assert_device_available(args.gpu_idx)
                args.world_size = 1
                worker(args.gpu_idx, ngpus_per_node, args, res_queue)

            # finish off writing
            #############################################################
            res_queue.put("done")
            if args.mode == "denoise":
                _logger.info("Waiting for writer to finish...")
                write_proc.join()
            #############################################################

    # Save config parameters
    dst_config_path = os.path.join(args.out_home,
                                   args.mode + "_config.yaml")
    save_config(dst_config_path, args)
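# The denoise path above follows a producer/consumer layout: the GPU workers
# push results onto a managed queue while a separate writer process drains it,
# and a "done" sentinel tells the writer to stop. A self-contained sketch of
# that pattern using only the standard multiprocessing API (generic names,
# not part of this codebase):
#
#     import multiprocessing as mp
#
#     def example_writer(res_queue):
#         # Drain the queue until the sentinel arrives.
#         while True:
#             item = res_queue.get()
#             if item == "done":
#                 break
#             print("writing", item)
#
#     if __name__ == "__main__":
#         manager = mp.Manager()
#         queue = manager.Queue()
#         proc = mp.Process(target=example_writer, args=(queue,))
#         proc.start()
#         for result in range(3):      # stand-in for per-batch results
#             queue.put(result)
#         queue.put("done")            # same sentinel used in main()
#         proc.join()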
    # Output file names
    if args.prefix is None:
        prefix = 'summarized_peaks'
    else:
        prefix = args.prefix
    out_bed_path = os.path.join(args.out_dir, prefix + '.bed')
    out_bg_path = os.path.join(args.out_dir, prefix + '.bedGraph')

    # Collapse peaks
    _logger.info('Writing peaks to bedGraph file {}'.format(out_bg_path))
    subprocess.call(['bigWigToBedGraph', args.peakbw, out_bg_path])

    # Read collapsed peaks
    _logger.info('Reading peaks')
    peaks = read_intervals(out_bg_path)
    peaks.columns = ['#chrom', 'start', 'end']

    # Add length of peaks
    _logger.info('Calculating peak statistics')
    peaks['len'] = peaks['end'] - peaks['start']

    # Extract scores in peaks
    peakscores = extract_bigwig_intervals(peaks, args.trackbw, stack=False)

    # Add mean score in peak
    peaks['mean'] = peakscores.apply(np.mean)

    # Add max score
    peaks['max'] = peakscores.apply(np.max)
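    # extract_bigwig_intervals with stack=False is assumed here to return one
    # array of per-base coverage values per peak, so pandas' apply reduces
    # each peak to a single summary statistic. A small stand-in illustration
    # with plain pandas/numpy (toy data, not from the pipeline):
    #
    #     import numpy as np
    #     import pandas as pd
    #
    #     toy_scores = pd.Series([np.array([1.0, 2.0, 3.0]),
    #                             np.array([4.0, 4.0])])
    #     toy_scores.apply(np.mean)   # -> 2.0 and 4.0, one value per peak
    #     toy_scores.apply(np.max)    # -> 3.0 and 4.0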
                        help='Number of additional bases added as \
                        padding to the intervals in the h5 file \
                        containing labels. Use the same --pad value \
                        that was supplied to bw2h5.py when creating \
                        --label_file. Not required if --label_file \
                        is a bigWig file.')
    args = parser.parse_args()
    return args


args = parse_args()

# Load intervals if supplied
_logger.info('Loading intervals')
if args.intervals is not None:
    intervals = read_intervals(args.intervals)
# If not, use whole chromosome lengths
elif args.sizes is not None:
    intervals = read_sizes(args.sizes, as_intervals=True)
else:
    intervals = None

# Calculate regression metrics
if args.task == 'regression':
    # Load labels
    _logger.info("Loading labels for regression")
    y = read_data_file(args.label_file, 'label_reg', intervals,
                       pad=args.pad)

    # Load data
    _logger.info("Loading data for regression")
def bw2h5(noisybw, cleanbw, layersbw, cleanpeakbw, batch_size, nonzero,
          intervals_file, out_dir, prefix, pad):
    """Convert bigWig files to h5.

    Args:
        noisybw: BigWig file containing noisy data.
        cleanbw: BigWig file containing clean data.
        layersbw: BigWig file containing layers data.
        cleanpeakbw: BigWig file containing clean peaks data to be used as
            labels for training.
        batch_size: Number of lines to read at a time, since all of the
            data does not fit in memory.
        nonzero: Only save intervals that have non-zero values.
        intervals_file: File containing the intervals corresponding to
            training, validation or inference.
        out_dir: Directory to save the output files to.
        prefix: Prefix to attach to the name, to make the files unique.
        pad: Padding values.

    Returns:
        Path to the output file.

    """
    # Read intervals
    _logger.info('Reading intervals')
    intervals = read_intervals(intervals_file)
    _logger.info('Read {} intervals'.format(len(intervals)))

    # Create output directory
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Optionally, select intervals with nonzero coverage
    if nonzero:
        _logger.info('Selecting intervals with nonzero coverage')
        nonzero_intervals = check_bigwig_intervals_nonzero(intervals, noisybw)
        _logger.info("Retaining {} of {} nonzero noisy intervals".format(
            sum(nonzero_intervals), len(intervals)))
        intervals = intervals[nonzero_intervals]

    _logger.debug('Collecting %d intervals' % len(intervals))

    # Calculate number of batches
    batches_per_epoch = int(np.ceil(len(intervals) / batch_size))
    _logger.info('Writing data in ' + str(batches_per_epoch) + ' batches.')

    # Split intervals into batches
    batch_starts = np.array(range(0, len(intervals), batch_size))
    batch_ends = batch_starts + batch_size
    batch_ends[-1] = len(intervals)

    # Get output hdf5 filename
    output_file_path = os.path.join(out_dir, prefix + '.h5')

    # Write batches to hdf5 file
    _logger.info('Extracting data for each batch and writing to h5 file')
    for i in range(batches_per_epoch):
        # Print current batch
        if i % 10 == 0:
            _logger.info("batch " + str(i) + " of " + str(batches_per_epoch))

        # Create dictionary to store data
        batch_data = {}

        # Subset intervals
        batch_intervals = intervals.iloc[batch_starts[i]:batch_ends[i], :]

        # Read noisy data
        batch_data['input'] = extract_bigwig_intervals(
            batch_intervals, noisybw, pad=pad)

        # Add other input layers
        if layersbw is not None:
            # Read additional layers
            layers = gather_key_files_from_cmdline(layersbw, extension='.bw')
            for key in layers.keys():
                batch_data[key] = extract_bigwig_intervals(
                    batch_intervals, layers[key], pad=pad)

        # Add labels
        if cleanbw and cleanpeakbw:
            # Read clean data: regression labels
            batch_data['label_reg'] = extract_bigwig_intervals(
                batch_intervals, cleanbw, pad=pad)

            # Read clean data: classification labels
            batch_data['label_cla'] = extract_bigwig_intervals(
                batch_intervals, cleanpeakbw, pad=pad)

        _logger.debug(len(batch_data))
        _logger.debug("Saving batch " + str(i) + " with keys " + str(
            batch_data.keys()))

        # Create dataset, or expand and append batch.
        if i == 0:
            dict_to_h5(batch_data, h5file=output_file_path, create_new=True)
        else:
            dict_to_h5(batch_data, h5file=output_file_path, create_new=False)

    _logger.info('Done! Saved to %s' % output_file_path)
    return output_file_path
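# A minimal usage sketch for bw2h5 (file names and parameter values below are
# hypothetical): encode a noisy track, a clean track and peak labels into one
# h5 file over the training intervals, keeping only intervals with signal.
#
#     train_h5 = bw2h5(
#         noisybw="sample.noisy.bw",             # hypothetical input track
#         cleanbw="sample.clean.bw",             # hypothetical label track
#         layersbw=None,                         # no extra input layers
#         cleanpeakbw="sample.clean.peaks.bw",   # peak labels from peak2bw
#         batch_size=10000,
#         nonzero=True,
#         intervals_file="training_intervals.bed",
#         out_dir="outputs/bw2h5",
#         prefix="sample.train",
#         pad=None)
#     # Expected result: "outputs/bw2h5/sample.train.h5" containing the
#     # datasets 'input', 'label_reg' and 'label_cla'.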