def test_sizes(tmpdir):
    """Write sizes to a file and read it using read_sizes API.

    Compare the output of read_sizes with original data.
    """
    sizes = {"chrom": ["chr1", "chr2"], "length": [1000, 200]}
    input_df = pd.DataFrame(sizes)
    bedfile = os.path.join(tmpdir, "sizes.bed")
    input_df.to_csv(bedfile, sep="\t", header=False, index=False)
    output_df = bedio.read_sizes(bedfile)
    assert input_df.equals(output_df)
def peak2bw(input_file, sizesfile, out_dir):
    """Convert peak files to bigwig.

    Args:
        input_file: Clean peak file to be converted to bigwig.
            Needs to be either bed or narrowPeak file.
        sizesfile: BED file containing chromosome sizes.
        out_dir: Directory to save the outputs to.

    Returns:
        Path to the output file.

    """
    # Create the output folder
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Set name for output file
    prefix = os.path.basename(input_file)
    out_bg_name = os.path.join(out_dir, prefix + '.bedGraph')
    out_bw_name = os.path.join(out_dir, prefix + '.bw')

    # Read input files
    _logger.info('Reading input file')
    # Skip first line if the file is narrowPeak
    skip = False
    if input_file.endswith("narrowPeak"):
        skip = True
    peaks = read_intervals(input_file, skip=skip)
    _logger.info('Read ' + str(len(peaks)) + ' peaks.')
    sizes = read_sizes(sizesfile)

    # Add score of 1 for all peaks
    _logger.info('Adding score')
    peaks['score'] = 1

    # Write bedGraph
    _logger.info('Writing peaks to bedGraph file')
    # Note: peaks will be subset to chromosomes in sizes file.
    df_to_bedGraph(peaks, out_bg_name, sizes)

    # Write bigWig and delete bedGraph
    _logger.info('Writing peaks to bigWig file {}'.format(out_bw_name))
    bedgraph_to_bigwig(out_bg_name, sizesfile, deletebg=True, sort=True)

    _logger.info('Done!')
    return out_bw_name
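# A minimal usage sketch for peak2bw; the paths below are hypothetical
# placeholders, and the helpers it relies on (read_intervals, read_sizes,
# df_to_bedGraph, bedgraph_to_bigwig) must be importable as above:
#
#     bw_path = peak2bw("sample.clean_peaks.narrowPeak",
#                       "hg38.chrom.sizes",
#                       "outputs/bigwig_peakfiles")
#     # Expected result, per the naming scheme above:
#     # outputs/bigwig_peakfiles/sample.clean_peaks.narrowPeak.bw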
def test_sizes_as_intervals(tmpdir):
    """Write sizes to a file and read it as intervals using read_sizes API.

    Compare the output of read_sizes with original data.
    """
    sizes = {"chrom": ["chr1", "chr2"], "length": [1000, 200]}
    sizes_intervals = {
        "chrom": ["chr1", "chr2"],
        "start": [0, 0],
        "end": [1000, 200]
    }
    input_df = pd.DataFrame(sizes)
    sizes_intervals_df = pd.DataFrame(sizes_intervals)
    bedfile = os.path.join(tmpdir, "sizes.bed")
    input_df.to_csv(bedfile, sep="\t", header=False, index=False)
    output_df = bedio.read_sizes(bedfile, as_intervals=True)
    assert sizes_intervals_df.equals(output_df)
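# Sketch of the read_sizes behavior these two tests exercise; the expected
# column layouts come from the DataFrames above, and the file path is a
# hypothetical placeholder:
#
#     sizes_df = bedio.read_sizes("sizes.bed")
#     # columns: chrom, length
#     intervals_df = bedio.read_sizes("sizes.bed", as_intervals=True)
#     # columns: chrom, start (all 0), end (equal to length)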
def main():
    """Main."""
    root_dir = os.path.abspath(
        os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), ".."))
    args = parse_args(root_dir)

    genomes = {
        "hg19": os.path.join(root_dir, "reference", "hg19.chrom.sizes"),
        "hg38": os.path.join(root_dir, "reference", "hg38.chrom.sizes")
    }
    if args.genome in genomes:
        args.genome = genomes[args.genome]

    # Set log level
    _logger.debug(args)

    # check gpu
    # TODO: add cpu support
    if not torch.cuda.is_available():
        raise Exception("No GPU available. Check your machine configuration.")

    # all output will be written in the exp_dir folder
    args.exp_dir = make_experiment_dir(args.exp_name, args.out_home,
                                       timestamp=True)

    # Convert layer names to a list
    if args.layers is not None:
        args.layers = args.layers.strip("[]").split(",")

    if args.seed is not None and args.seed > 0:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    # train & resume
    ##########################################################################
    if args.mode == "train":
        # If h5 files are provided, load them.
        if args.train_h5_files is not None:
            args.train_files = gather_files_from_cmdline(args.train_h5_files,
                                                         extension=".h5")
            args.val_files = gather_files_from_cmdline(args.val_h5_files,
                                                       extension=".h5")
        # If h5 files not given, generate them.
        else:
            args.cleanpeakfile = gather_files_from_cmdline(
                args.cleanpeakfile, extension=(".bed", ".narrowPeak"))
            args.noisybw = gather_files_from_cmdline(args.noisybw,
                                                     extension=".bw")
            args.cleanbw = gather_files_from_cmdline(args.cleanbw,
                                                     extension=".bw")

            # We have to make sure there is a 1-1 correspondence between
            # files.
            assert len(args.cleanpeakfile) == len(args.noisybw)
            assert len(args.cleanbw) == len(args.noisybw)

            train_files = []
            val_files = []
            for idx in range(len(args.cleanbw)):
                cleanbw = args.cleanbw[idx]
                noisybw = args.noisybw[idx]
                cleanpeakfile = args.cleanpeakfile[idx]

                # Read in the narrowPeak or BED files for clean data peak
                # labels, convert them to bigwig
                out_path = os.path.join(args.exp_dir, "bigwig_peakfiles")
                cleanpeakbw = peak2bw(cleanpeakfile, args.genome, out_path)

                # Generate training, validation, holdout intervals files
                out_path = os.path.join(args.exp_dir, "intervals")
                train_intervals, val_intervals, holdout_intervals = \
                    get_intervals(args.genome, args.interval_size, out_path,
                                  val=args.val_chrom,
                                  holdout=args.holdout_chrom,
                                  nonpeak=args.nonpeak,
                                  peakfile=cleanpeakbw)

                # Convert the input bigwig files and the clean peak files
                # into h5 for training.
                out_path = os.path.join(args.exp_dir, "bw2h5")
                nonzero = True
                prefix = os.path.basename(cleanbw) + ".train"
                train_file = bw2h5(noisybw, cleanbw, args.layersbw,
                                   cleanpeakbw, args.read_buffer, nonzero,
                                   train_intervals, out_path, prefix,
                                   args.pad)
                train_files.append(train_file)

                prefix = os.path.basename(cleanbw) + ".val"
                val_file = bw2h5(noisybw, cleanbw, args.layersbw, cleanpeakbw,
                                 args.read_buffer, nonzero, val_intervals,
                                 out_path, prefix, args.pad)
                val_files.append(val_file)

            args.train_files = train_files
            args.val_files = val_files

        _logger.debug("Training data: " + "\n".join(args.train_files))
        _logger.debug("Validation data: " + "\n".join(args.val_files))

        # Get model parameters
        with h5py.File(args.train_files[0], 'r') as f:
            if args.pad is not None:
                args.interval_size = f['input'].shape[1] - 2 * args.pad
            else:
                args.interval_size = f['input'].shape[1]

        ngpus_per_node = torch.cuda.device_count()
        # WAR: gloo distributed doesn't work if world size is 1.
        # This is fixed in newer torch version -
        # https://github.com/facebookincubator/gloo/issues/209
        if ngpus_per_node == 1:
            args.distributed = False
            args.gpu_idx = 0

        config_dir = os.path.join(args.exp_dir, "configs")
        if not os.path.exists(config_dir):
            os.mkdir(config_dir)

        if args.distributed:
            _logger.info('Distributing to %s GPUS' % str(ngpus_per_node))
            args.world_size = ngpus_per_node
            mp.spawn(train_worker, nprocs=ngpus_per_node,
                     args=(ngpus_per_node, args), join=True)
        else:
            assert_device_available(args.gpu_idx)
            _logger.info('Running on GPU: %s' % str(args.gpu_idx))
            args.world_size = 1
            train_worker(args.gpu_idx, ngpus_per_node, args, timers=Timers)

    # infer & eval
    ##########################################################################
    if args.mode == "denoise" or args.mode == "eval":
        files = []
        if args.denoise_h5_files is not None:
            files = gather_files_from_cmdline(args.denoise_h5_files,
                                              extension=".h5")
            infer_intervals = args.intervals_file
        else:
            cleanpeakbw = None
            if args.mode == "eval":
                # Read in the narrowPeak or BED files for clean data peak
                # labels, convert them to bigwig
                out_path = os.path.join(args.exp_dir, "bigwig_peakfiles")
                cleanpeakbw = peak2bw(args.cleanpeakfile, args.genome,
                                      out_path)
                args.cleanbw = gather_files_from_cmdline(args.cleanbw,
                                                         extension=".bw")

            out_path = os.path.join(args.exp_dir, "intervals")
            infer_intervals = get_intervals(args.genome, args.interval_size,
                                            out_path, peakfile=cleanpeakbw,
                                            regions=args.regions)

            # Convert the input bigwig files and the clean peak files into
            # h5 for inference.
            args.noisybw = gather_files_from_cmdline(args.noisybw,
                                                     extension=".bw")
            for idx in range(len(args.noisybw)):
                out_path = os.path.join(args.exp_dir, "bw2h5")
                nonzero = False
                cleanbw = None
                noisybw = args.noisybw[idx]
                if args.mode == "eval":
                    cleanbw = args.cleanbw[idx]
                prefix = os.path.basename(noisybw) + "." + args.mode
                infer_file = bw2h5(noisybw, cleanbw, args.layersbw,
                                   cleanpeakbw, args.read_buffer, nonzero,
                                   infer_intervals, out_path, prefix,
                                   args.pad)
                files.append(infer_file)

        for x in range(len(files)):
            infile = files[x]
            args.input_files = [infile]
            if args.mode == "denoise":
                _logger.debug("Inference data: " + infile)

                # Check that intervals, sizes and h5 file are all compatible.
                _logger.info('Checking input files for compatibility')
                intervals = read_intervals(infer_intervals)
                sizes = read_sizes(args.genome)
                check_intervals(intervals, sizes, infile)

                # Delete intervals and sizes objects in main thread
                del intervals
                del sizes
            else:
                _logger.debug("Evaluation data: " + infile)

            # Get model parameters
            with h5py.File(files[x], 'r') as f:
                if args.pad is not None:
                    args.interval_size = f['input'].shape[1] - 2 * args.pad
                else:
                    args.interval_size = f['input'].shape[1]

            # Make sure that interval_size is a multiple of the
            # out_resolution
            if args.out_resolution is not None:
                assert (args.interval_size % args.out_resolution == 0)

            prefix = os.path.basename(infile).split(".")[0]

            # setup queue and kick off writer process
            #############################################################
            manager = mp.Manager()
            res_queue = manager.Queue()

            if args.mode == "denoise":
                # Create a keyword argument dictionary to pass into the
                # multiprocessor
                keyword_args = {"infer": True,
                                "intervals_file": infer_intervals,
                                "exp_dir": args.exp_dir,
                                "task": args.task,
                                "peaks": args.peaks,
                                "tracks": args.tracks,
                                "num_workers": args.num_workers,
                                "infer_threshold": args.threshold,
                                "reg_rounding": args.reg_rounding,
                                "batches_per_worker": args.batches_per_worker,
                                "gen_bigwig": args.gen_bigwig,
                                "sizes_file": args.genome,
                                "res_queue": res_queue,
                                "prefix": prefix,
                                "deletebg": args.deletebg,
                                "out_resolution": args.out_resolution}
                write_proc = mp.Process(target=writer, kwargs=keyword_args)
                write_proc.start()
            #############################################################

            ngpus_per_node = torch.cuda.device_count()
            # WAR: gloo distributed doesn't work if world size is 1.
            # This is fixed in newer torch version -
            # https://github.com/facebookincubator/gloo/issues/209
            if ngpus_per_node == 1:
                args.distributed = False
                args.gpu_idx = 0

            worker = infer_worker if args.mode == "denoise" else eval_worker
            if args.distributed:
                args.world_size = ngpus_per_node
                mp.spawn(worker, nprocs=ngpus_per_node,
                         args=(ngpus_per_node, args, res_queue), join=True)
            else:
                assert_device_available(args.gpu_idx)
                args.world_size = 1
                worker(args.gpu_idx, ngpus_per_node, args, res_queue)

            # finish off writing
            #############################################################
            res_queue.put("done")
            if args.mode == "denoise":
                _logger.info("Waiting for writer to finish...")
                write_proc.join()
            #############################################################

    # Save config parameters
    dst_config_path = os.path.join(args.out_home, args.mode + "_config.yaml")
    save_config(dst_config_path, args)
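# A minimal, self-contained sketch of the writer-process pattern used in
# main() above: a manager queue feeds a separate process that drains results
# until it sees the "done" sentinel. example_writer and the queued items are
# illustrative stand-ins, not the real writer() interface:
#
#     import multiprocessing as mp
#
#     def example_writer(res_queue):
#         while True:
#             item = res_queue.get()
#             if item == "done":
#                 break
#             print("writing", item)  # a real writer would flush to disk
#
#     if __name__ == "__main__":
#         manager = mp.Manager()
#         queue = manager.Queue()
#         proc = mp.Process(target=example_writer, args=(queue,))
#         proc.start()
#         queue.put("batch_0")   # results produced by the inference workers
#         queue.put("done")      # sentinel, as sent by main() above
#         proc.join()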
def get_intervals(sizesfile, intervalsize, out_dir, val=None, holdout=None,
                  nonpeak=None, peakfile=None, regions=None):
    """Read chromosome sizes and generate intervals.

    Args:
        sizesfile: BED file containing sizes of each chromosome.
        intervalsize: Size of the intervals at each row.
        out_dir: Directory to save the output files to.
        val: Chromosome to reserve for validation.
        holdout: Chromosome to reserve for evaluation.
        nonpeak: Ratio of nonpeak to peak intervals desired in training
            dataset.
        peakfile: File with clean peaks to know which intervals have
            non-zero values. Only useful if nonpeak is greater than one.
        regions: Regions to restrict intervals to; either a path to a BED
            file or a comma-separated list of chromosomes or
            chrom:start-end ranges.

    Returns:
        Paths of files saved.

    """
    # Read chromosome sizes
    sizes = read_sizes(sizesfile)

    # Create the output dir
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Generate intervals
    if not (val is None or holdout is None):
        # Generate training intervals
        _logger.info("Generating training intervals")
        train_sizes = sizes[sizes['chrom'] != val]
        train_sizes = train_sizes[train_sizes['chrom'] != holdout]
        train = _get_tiling_intervals(intervalsize, sizes=train_sizes)

        # Optional - Set fraction of training intervals to contain peaks
        if nonpeak is not None:
            _logger.info('Finding intervals with peaks')
            train['peak'] = check_bigwig_intervals_peak(train, peakfile)
            _logger.info('{} of {} intervals contain peaks.'.format(
                train['peak'].sum(), len(train)))
            train_peaks = train[train['peak']].copy()
            train_nonpeaks = train[~train['peak']].sample(
                nonpeak * len(train_peaks))
            train = train_peaks.append(train_nonpeaks)
            train = train.iloc[:, :3]
            _logger.info('Generated {} peak and {} non-peak training '
                         'intervals.'.format(len(train_peaks),
                                             len(train_nonpeaks)))

        # Write to file
        out_file_name = str(intervalsize) + '.training_intervals.bed'
        train_file_path = os.path.join(out_dir, out_file_name)
        df_to_bed(train, train_file_path)

        # Generate validation intervals - do not overlap
        _logger.info("Generating val intervals")
        val_sizes = sizes[sizes['chrom'] == val]
        val = _get_tiling_intervals(intervalsize, sizes=val_sizes)

        # Write to file
        out_file_name = str(intervalsize) + '.val_intervals.bed'
        val_file_path = os.path.join(out_dir, out_file_name)
        df_to_bed(val, val_file_path)

        # Generate holdout intervals - do not overlap
        holdout_sizes = sizes[sizes['chrom'] == holdout]
        holdout = _get_tiling_intervals(intervalsize, sizes=holdout_sizes)

        # Write to file
        out_file_name = str(intervalsize) + '.holdout_intervals.bed'
        holdout_file_path = os.path.join(out_dir, out_file_name)
        df_to_bed(holdout, holdout_file_path)

        return train_file_path, val_file_path, holdout_file_path

    elif regions is not None:
        # If given regions is a file, then just return the file path
        if regions.endswith(".bed"):
            return regions
        else:
            final_intervals = pd.DataFrame()
            regions = regions.strip("[]").split(",")
            for region in regions:
                # If regions are specified with intervals like chr1:0-50,
                # then split the region into chrom and its range.
                if region.find(":") != -1:
                    chrom, chrom_range = region.split(":")
                    chrom_range = chrom_range.split("-")
                    chrom_range = [int(value) for value in chrom_range]
                    chrom_range.insert(0, chrom)
                    intervals = _get_tiling_intervals(
                        intervalsize, chrom_range=chrom_range)
                else:
                    chrom = region
                    chrom_sizes = sizes[sizes['chrom'] == chrom]
                    chrlength = chrom_sizes.iloc[0, 1]
                    intervals = _get_tiling_intervals(
                        intervalsize, chrom_range=[chrom, 0, chrlength])

                final_intervals = final_intervals.append(intervals,
                                                         ignore_index=True)

            # Write the intervals to file
            out_file_name = str(intervalsize) + '.regions_intervals.bed'
            region_file_path = os.path.join(out_dir, out_file_name)
            df_to_bed(final_intervals, region_file_path)
            return region_file_path

    # If validation and holdout chromosome are not specified,
    # we use whole genome.
    else:
        # Generate intervals tiling across all chromosomes in the sizes file
        _logger.info("Generating intervals tiling across all chromosomes "
                     "in sizes file: " + sizesfile)
        intervals = _get_tiling_intervals(intervalsize, sizes=sizes)

        # Write to file
        out_file_name = str(intervalsize) + '.genome_intervals.bed'
        wg_file_path = os.path.join(out_dir, out_file_name)
        df_to_bed(intervals, wg_file_path)

        _logger.info('Done!')
        return wg_file_path
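# A hedged usage sketch for get_intervals; the file paths, chromosome names,
# and interval size below are hypothetical placeholders:
#
#     # Train/val/holdout split (requires both val and holdout chromosomes)
#     train_bed, val_bed, holdout_bed = get_intervals(
#         "hg38.chrom.sizes", 50000, "outputs/intervals",
#         val="chr20", holdout="chr10")
#
#     # Whole-genome tiling when no val/holdout chromosomes are given
#     wg_bed = get_intervals("hg38.chrom.sizes", 50000, "outputs/intervals")
#
#     # Tiling restricted to specific regions
#     region_bed = get_intervals("hg38.chrom.sizes", 50000,
#                                "outputs/intervals",
#                                regions="chr1,chr2:0-1000000")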
                             that was supplied to bw2h5.py when creating \
                             --label_file. Not required if --label_file \
                             is a bigWig file.')
    args = parser.parse_args()
    return args


args = parse_args()

# Load intervals if supplied
_logger.info('Loading intervals')
if args.intervals is not None:
    intervals = read_intervals(args.intervals)
# If not, use whole chromosome lengths
elif args.sizes is not None:
    intervals = read_sizes(args.sizes, as_intervals=True)
else:
    intervals = None

# Calculate regression metrics
if args.task == 'regression':
    # Load labels
    _logger.info("Loading labels for regression")
    y = read_data_file(args.label_file, 'label_reg', intervals,
                       pad=args.pad)

    # Load data
    _logger.info("Loading data for regression")
    if args.test_file is None:
        x = read_data_file(args.label_file, 'input', pad=args.pad)
    else: