def main():
    """Write the peaks of an interval file to a bigWig file.

    The command-line arguments come from ``parse_args()``. Peaks are read
    from ``args.input``, every peak is given a score of 1, the result is
    written to ``<out_dir>/<prefix>.bedGraph`` and that bedGraph is then
    handed to ``bedgraph_to_bigwig`` with ``deletebg=True`` and
    ``sort=True``.
    """
    args = parse_args()

    # Without an explicit prefix the output is named after the input file.
    prefix = (os.path.basename(args.input)
              if args.prefix is None else args.prefix)
    bedgraph_path = os.path.join(args.out_dir, prefix + '.bedGraph')

    # Load the peaks, then the chromosome sizes.
    _logger.info('Reading input file')
    peaks = read_intervals(args.input, skip=args.skip)
    num_peaks = str(len(peaks))
    _logger.info('Read ' + num_peaks + ' peaks.')
    sizes = read_sizes(args.sizes)

    # Every peak receives the same constant score.
    _logger.info('Adding score')
    peaks['score'] = 1

    # Intermediate bedGraph. Per the original author's note, the peaks are
    # subset to the chromosomes present in the sizes file at this step.
    _logger.info('Writing peaks to bedGraph file')
    df_to_bedGraph(peaks, bedgraph_path, sizes)

    # Convert to bigWig; deletebg=True asks the helper to drop the bedGraph.
    bigwig_name = prefix + '.bw'
    _logger.info('Writing peaks to bigWig file {}'.format(bigwig_name))
    bedgraph_to_bigwig(bedgraph_path, args.sizes, deletebg=True, sort=True)
    _logger.info('Done!')
def _run_training(args):
    """Run the "train" mode: resolve data files and launch the train worker.

    Mutates ``args`` in place (file lists, ``interval_size``, ``batch_size``,
    ``distributed``, ``world_size``) before handing it to ``train_worker``.
    """
    args.files_train = gather_files_from_cmdline(args.files_train)
    args.val_files = gather_files_from_cmdline(args.val_files)
    _logger.debug("Training data: " + "\n".join(args.files_train))
    _logger.debug("Validation data: " + "\n".join(args.val_files))

    # Get model parameters from the first training file.
    with h5py.File(args.files_train[0], 'r') as f:
        args.interval_size = f['input'].shape[1]
        args.batch_size = 1

    ngpus_per_node = torch.cuda.device_count()
    # WAR: gloo distributed doesn't work if world size is 1.
    # This is fixed in newer torch version -
    # https://github.com/facebookincubator/gloo/issues/209
    args.distributed = False if ngpus_per_node == 1 else args.distributed

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    config_dir = os.path.join(args.exp_dir, "configs")
    os.makedirs(config_dir, exist_ok=True)

    if args.distributed:
        _logger.info('Distributing to %s GPUS', ngpus_per_node)
        args.world_size = ngpus_per_node
        mp.spawn(train_worker, nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args), join=True)
    else:
        assert_device_available(args.gpu)
        _logger.info('Running on GPU: %s', args.gpu)
        args.world_size = 1
        train_worker(args.gpu, ngpus_per_node, args, timers=Timers)


def _run_infer_or_eval(args):
    """Run the "infer" or "eval" mode once per input file.

    For each file a writer process is started with a shared result queue,
    the infer/eval worker is run (spawned across GPUs when distributed),
    and the writer is then sent the "done" sentinel and joined.
    """
    is_infer = args.mode == "infer"
    files = gather_files_from_cmdline(args.files)
    for infile in files:
        # Workers read args.files, so narrow it to the current file.
        args.files = [infile]
        if is_infer:
            _logger.debug("Inference data: %s", args.files)

            # Check that intervals, sizes and h5 file are all compatible.
            _logger.info('Checking input files for compatibility')
            intervals = read_intervals(args.intervals_file)
            sizes = read_sizes(args.sizes_file)
            check_intervals(intervals, sizes, infile)

            # Delete intervals and sizes objects in main thread
            del intervals
            del sizes
        else:
            _logger.debug("Evaluation data: %s", args.files)

        # Get model parameters
        with h5py.File(infile, 'r') as f:
            args.interval_size = f['input'].shape[1]
            args.batch_size = 1

        prefix = os.path.basename(infile).split(".")[0]

        # Set up the result queue and kick off the writer process.
        manager = mp.Manager()
        res_queue = manager.Queue()
        # Keyword arguments passed to the writer process.
        keyword_args = {"infer": is_infer,
                        "intervals_file": args.intervals_file,
                        "exp_dir": args.exp_dir,
                        "result_fname": args.result_fname,
                        "task": args.task,
                        "num_workers": args.num_workers,
                        "infer_threshold": args.infer_threshold,
                        "reg_rounding": args.reg_rounding,
                        "cla_rounding": args.cla_rounding,
                        "batches_per_worker": args.batches_per_worker,
                        "gen_bigwig": args.gen_bigwig,
                        "sizes_file": args.sizes_file,
                        "res_queue": res_queue,
                        "prefix": prefix,
                        "deletebg": args.deletebg}
        write_proc = mp.Process(target=writer, kwargs=keyword_args)
        write_proc.start()

        try:
            ngpus_per_node = torch.cuda.device_count()
            # WAR: gloo distributed doesn't work if world size is 1.
            # This is fixed in newer torch version -
            # https://github.com/facebookincubator/gloo/issues/209
            args.distributed = (False if ngpus_per_node == 1
                                else args.distributed)

            worker = infer_worker if is_infer else eval_worker
            if args.distributed:
                args.world_size = ngpus_per_node
                mp.spawn(worker, nprocs=ngpus_per_node,
                         args=(ngpus_per_node, args, res_queue), join=True)
            else:
                assert_device_available(args.gpu)
                args.world_size = 1
                worker(args.gpu, ngpus_per_node, args, res_queue)
        finally:
            # Always signal and join the writer, even when the worker
            # raised, so the child process is not left blocked on the queue.
            res_queue.put("done")
            _logger.info("Waiting for writer to finish...")
            write_proc.join()


def main():
    """Entry point: parse arguments and run training, inference or evaluation.

    Steps:
      1. Parse the command line (``root_dir`` is the parent of the directory
         containing the invoked script) and optionally enable debug logging.
      2. Require a CUDA device and create the experiment output directory.
      3. Dispatch on ``args.mode`` ("train", or "infer"/"eval").
      4. Save the configuration as ``<out_home>/<mode>_config.yaml``.

    Raises:
        RuntimeError: if no CUDA device is available.
    """
    root_dir = os.path.abspath(
        os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), ".."))
    args = parse_args(root_dir)

    # Set log level
    if args.debug:
        _handler.setLevel(logging.DEBUG)
        _logger.setLevel(logging.DEBUG)

    _logger.debug(args)

    # check gpu
    # TODO: add cpu support
    if not torch.cuda.is_available():
        raise RuntimeError(
            "No GPU available. Check your machine configuration.")

    # all output will be written in the exp_dir folder
    args.exp_dir = make_experiment_dir(
        args.label, args.out_home, timestamp=True)

    # Convert layer names to a list, e.g. "[a,b]" -> ["a", "b"]
    if args.layers is not None:
        args.layers = args.layers.strip("[]").split(",")

    # train & resume
    if args.mode == "train":
        _run_training(args)

    # infer & eval
    if args.mode == "infer" or args.mode == "eval":
        _run_infer_or_eval(args)

    # Save config parameters
    dst_config_path = os.path.join(args.out_home,
                                   args.mode + "_config.yaml")
    save_config(dst_config_path, args)
that was supplied to bw2h5.py when creating \ --label_file. Not required if --label_file \ is a bigWig file.') args = parser.parse_args() return args args = parse_args() # Load intervals if supplied _logger.info('Loading intervals') if args.intervals is not None: intervals = read_intervals(args.intervals) # If not, use whole chromosome lengths elif args.sizes is not None: intervals = read_sizes(args.sizes, as_intervals=True) else: intervals = None # Calculate regression metrics if args.task == 'regression': # Load labels _logger.info("Loading labels for regression") y = read_data_file(args.label_file, 'label_reg', intervals, pad=args.pad) # Load data _logger.info("Loading data for regression") if args.test_file is None: x = read_data_file(args.label_file, 'input', pad=args.pad) else:
def _interval_file_path(args, base_name):
    """Return the output path for ``base_name``, prefixed if requested."""
    if args.prefix is None:
        file_name = base_name
    else:
        file_name = args.prefix + '.' + base_name
    return os.path.join(args.out_dir, file_name)


def main():
    """Read chromosome sizes and generate intervals.

    With ``args.wg`` set, a single BED file of intervals tiling every
    chromosome in the sizes file is written. Otherwise training, validation
    and (optionally) holdout interval files are written:

    * training intervals tile all chromosomes except ``args.val`` and
      ``args.holdout`` using ``args.shift``; when ``args.peakfile`` is
      given they are reduced to the peak-containing intervals plus a random
      sample of ``args.nonpeak * <number of peak intervals>`` others;
    * validation and holdout intervals tile ``args.val`` / ``args.holdout``
      without passing a shift.
    """
    args = parse_args()

    # Read chromosome sizes
    sizes = read_sizes(args.sizes)

    if args.wg:
        # Generate intervals tiling across all chromosomes in the sizes file
        _logger.info("Generating intervals tiling across all chromosomes "
                     "in sizes file: %s", args.sizes)
        intervals = get_tiling_intervals(sizes, args.intervalsize, args.shift)
        df_to_bed(intervals,
                  _interval_file_path(args, 'genome_intervals.bed'))
    else:
        # Generate training intervals - can overlap
        _logger.info("Generating training intervals")
        train_sizes = sizes[sizes['chrom'] != args.val]
        if args.holdout is not None:
            train_sizes = train_sizes[train_sizes['chrom'] != args.holdout]
        train = get_tiling_intervals(
            train_sizes, args.intervalsize, args.shift)

        # Optional - Set fraction of training intervals to contain peaks
        if args.peakfile is not None:
            # Local import: only this branch needs pandas directly.
            import pandas as pd

            _logger.info('Finding intervals with peaks')
            train['peak'] = check_bigwig_intervals_peak(train, args.peakfile)
            _logger.info('{} of {} intervals contain peaks.'.format(
                train['peak'].sum(), len(train)))
            has_peak = train['peak'].astype(bool)
            train_peaks = train[has_peak].copy()
            # Element-wise negation; `train['peak'] is False` is an identity
            # test on the Series and never selects the non-peak rows.
            train_nonpeaks = train[~has_peak].sample(
                args.nonpeak * len(train_peaks))
            # DataFrame.append was removed in pandas 2.0; concat is the
            # equivalent row-wise combination.
            train = pd.concat([train_peaks, train_nonpeaks])
            # Keep only the first three columns (drops the 'peak' helper).
            train = train.iloc[:, :3]
            _logger.info('Generated {} peak and {} non-peak '
                         'training intervals.'.format(
                             len(train_peaks), len(train_nonpeaks)))

        df_to_bed(train, _interval_file_path(args, 'training_intervals.bed'))

        # Generate validation intervals - do not overlap
        _logger.info("Generating val intervals")
        val_sizes = sizes[sizes['chrom'] == args.val]
        val = get_tiling_intervals(val_sizes, args.intervalsize)
        df_to_bed(val, _interval_file_path(args, 'val_intervals.bed'))

        # Generate holdout intervals - do not overlap
        if args.holdout is not None:
            _logger.info("Generating holdout intervals")
            holdout_sizes = sizes[sizes['chrom'] == args.holdout]
            holdout = get_tiling_intervals(holdout_sizes, args.intervalsize)
            df_to_bed(holdout,
                      _interval_file_path(args, 'holdout_intervals.bed'))

    _logger.info('Done!')