Example #1
File: peak2bw.py  Project: ve008/AtacWorks
def main():
    """Convert peak files to bigwig."""
    args = parse_args()

    # Set name for output file
    if args.prefix is None:
        # Output file gets name from input
        prefix = os.path.basename(args.input)
    else:
        prefix = args.prefix
    out_bg_name = os.path.join(args.out_dir, prefix + '.bedGraph')

    # Read input files
    _logger.info('Reading input file')
    peaks = read_intervals(args.input, skip=args.skip)
    _logger.info('Read ' + str(len(peaks)) + ' peaks.')
    sizes = read_sizes(args.sizes)

    # Add score of 1 for all peaks
    _logger.info('Adding score')
    peaks['score'] = 1

    # Write bedGraph
    _logger.info('Writing peaks to bedGraph file')

    # Note: peaks will be subset to chromosomes in sizes file.
    df_to_bedGraph(peaks, out_bg_name, sizes)

    # Write bigWig and delete bedGraph
    _logger.info('Writing peaks to bigWig file {}'.format(prefix + '.bw'))
    bedgraph_to_bigwig(out_bg_name, args.sizes,
                       deletebg=True, sort=True)

    _logger.info('Done!')
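
The example above attaches a constant score of 1 to every peak interval, writes the result as a bedGraph, and then converts it to bigWig (presumably delegating to a standard converter such as UCSC's bedGraphToBigWig). Below is a minimal sketch of the bedGraph step using only pandas; the column names and output path are assumptions, and read_intervals/df_to_bedGraph are AtacWorks helpers that this only approximates.

# Rough sketch, not AtacWorks code: the peak -> bedGraph step above,
# written with plain pandas. Column names are assumed for illustration.
import pandas as pd

# A BED-like peak table: chrom, start, end
peaks = pd.DataFrame({
    'chrom': ['chr1', 'chr1', 'chr2'],
    'start': [100, 500, 40],
    'end':   [200, 650, 90],
})

# Constant score of 1 for every peak, as in peak2bw.py
peaks['score'] = 1

# bedGraph is just chrom/start/end/value, tab-separated, no header
peaks.to_csv('peaks.bedGraph', sep='\t', header=False, index=False)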
Example #2
File: main.py  Project: ve008/AtacWorks
def main():
    """Main."""
    root_dir = os.path.abspath(
        os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), ".."))
    args = parse_args(root_dir)
    # Set log level
    if args.debug:
        _handler.setLevel(logging.DEBUG)
        _logger.setLevel(logging.DEBUG)

    _logger.debug(args)

    # check gpu
    # TODO: add cpu support
    if not torch.cuda.is_available():
        raise Exception("No GPU available. Check your machine configuration.")

    # all output will be written in the exp_dir folder
    args.exp_dir = make_experiment_dir(
        args.label, args.out_home, timestamp=True)

    # Convert layer names to a list
    if args.layers is not None:
        args.layers = args.layers.strip("[]").split(",")

    # train & resume
    ##########################################################################
    if args.mode == "train":
        args.files_train = gather_files_from_cmdline(args.files_train)
        args.val_files = gather_files_from_cmdline(args.val_files)
        _logger.debug("Training data:   " + "\n".join(args.files_train))
        _logger.debug("Validation data: " + "\n".join(args.val_files))

        # Get model parameters
        with h5py.File(args.files_train[0], 'r') as f:
            args.interval_size = f['input'].shape[1]
            args.batch_size = 1

        ngpus_per_node = torch.cuda.device_count()
        # WAR: gloo distributed doesn't work if world size is 1.
        # This is fixed in newer torch version -
        # https://github.com/facebookincubator/gloo/issues/209
        args.distributed = False if ngpus_per_node == 1 else args.distributed

        config_dir = os.path.join(args.exp_dir, "configs")
        if not os.path.exists(config_dir):
            os.mkdir(config_dir)
        if args.distributed:
            _logger.info('Distributing to %s GPUs' % str(ngpus_per_node))
            args.world_size = ngpus_per_node
            mp.spawn(train_worker, nprocs=ngpus_per_node,
                     args=(ngpus_per_node, args), join=True)
        else:
            assert_device_available(args.gpu)
            _logger.info('Running on GPU: %s' % str(args.gpu))
            args.world_size = 1
            train_worker(args.gpu, ngpus_per_node, args, timers=Timers)

    # infer & eval
    ##########################################################################
    if args.mode == "infer" or args.mode == "eval":
        files = args.files
        files = gather_files_from_cmdline(files)
        for x in range(len(files)):
            infile = files[x]
            args.files = [infile]
            if args.mode == "infer":
                _logger.debug("Inference data: ", args.files)

                # Check that intervals, sizes and h5 file are all compatible.
                _logger.info('Checking input files for compatibility')
                intervals = read_intervals(args.intervals_file)
                sizes = read_sizes(args.sizes_file)
                check_intervals(intervals, sizes, args.files[0])

                # Delete intervals and sizes objects in main thread
                del intervals
                del sizes
            else:
                _logger.debug("Evaluation data: ", args.files)
            # Get model parameters
            with h5py.File(files[x], 'r') as f:
                args.interval_size = f['input'].shape[1]
                args.batch_size = 1

            prefix = os.path.basename(infile).split(".")[0]
            # setup queue and kick off writer process
            #############################################################
            manager = mp.Manager()
            res_queue = manager.Queue()
            # Create a keyword argument dictionary to pass into the
            # multiprocessor
            keyword_args = {"infer": args.mode == "infer",
                            "intervals_file": args.intervals_file,
                            "exp_dir": args.exp_dir,
                            "result_fname": args.result_fname,
                            "task": args.task,
                            "num_workers": args.num_workers,
                            "infer_threshold": args.infer_threshold,
                            "reg_rounding": args.reg_rounding,
                            "cla_rounding": args.cla_rounding,
                            "batches_per_worker": args.batches_per_worker,
                            "gen_bigwig": args.gen_bigwig,
                            "sizes_file": args.sizes_file,
                            "res_queue": res_queue, "prefix": prefix,
                            "deletebg": args.deletebg}
            write_proc = mp.Process(target=writer, kwargs=keyword_args)
            write_proc.start()
            #############################################################

            ngpus_per_node = torch.cuda.device_count()
            # WAR: gloo distributed doesn't work if world size is 1.
            # This is fixed in newer torch version -
            # https://github.com/facebookincubator/gloo/issues/209
            args.distributed = False if ngpus_per_node == 1 else \
                args.distributed

            worker = infer_worker if args.mode == "infer" else eval_worker
            if args.distributed:
                args.world_size = ngpus_per_node
                mp.spawn(worker, nprocs=ngpus_per_node, args=(
                    ngpus_per_node, args, res_queue), join=True)
            else:
                assert_device_available(args.gpu)
                args.world_size = 1
                worker(args.gpu, ngpus_per_node, args, res_queue)

            # finish off writing
            #############################################################
            res_queue.put("done")
            _logger.info("Waiting for writer to finish...")
            write_proc.join()
            #############################################################
    # Save config parameters
    dst_config_path = os.path.join(args.out_home,
                                   args.mode + "_config.yaml")
    save_config(dst_config_path, args)
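
For inference and evaluation, main() above pushes per-batch results onto a multiprocessing queue that a separate writer process drains, then signals completion with a "done" sentinel before joining. Here is a minimal sketch of that pattern using only the standard library; writer() is a stand-in for the project's own writer function, not its implementation.

# Minimal sketch of the queue-plus-writer-process pattern from main.py,
# stdlib multiprocessing only.
import multiprocessing as mp


def writer(res_queue):
    """Consume results until the 'done' sentinel arrives."""
    while True:
        item = res_queue.get()
        if item == 'done':
            break
        print('writing result:', item)


def main():
    manager = mp.Manager()
    res_queue = manager.Queue()

    # Start the writer before any worker produces results
    write_proc = mp.Process(target=writer, args=(res_queue,))
    write_proc.start()

    # Workers (here, the main process) push results onto the queue
    for batch_id in range(3):
        res_queue.put({'batch': batch_id})

    # Signal completion and wait for the writer to drain the queue
    res_queue.put('done')
    write_proc.join()


if __name__ == '__main__':
    main()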
Example #3
                        that was supplied to bw2h5.py when creating \
                        --label_file. Not required if --label_file \
                        is a bigWig file.')
    args = parser.parse_args()
    return args


args = parse_args()

# Load intervals if supplied
_logger.info('Loading intervals')
if args.intervals is not None:
    intervals = read_intervals(args.intervals)
# If not, use whole chromosome lengths
elif args.sizes is not None:
    intervals = read_sizes(args.sizes, as_intervals=True)
else:
    intervals = None

# Calculate regression metrics
if args.task == 'regression':

    # Load labels
    _logger.info("Loading labels for regression")
    y = read_data_file(args.label_file, 'label_reg', intervals, pad=args.pad)

    # Load data
    _logger.info("Loading data for regression")
    if args.test_file is None:
        x = read_data_file(args.label_file, 'input', pad=args.pad)
    else:
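
Both main.py and the snippet above read model inputs and labels out of HDF5 files through dataset keys such as 'input' and 'label_reg'. A small h5py sketch of that access pattern follows; the exact file layout is an assumption for illustration.

# Sketch of the HDF5 access pattern used by the examples (not AtacWorks
# code). Dataset names 'input' and 'label_reg' come from the snippets
# above; the file layout is otherwise assumed.
import h5py
import numpy as np

with h5py.File('labels.h5', 'r') as f:
    x = np.asarray(f['input'])            # model input coverage track
    y = np.asarray(f['label_reg'])        # regression labels
    interval_size = f['input'].shape[1]   # as read in main.py above

print(x.shape, y.shape, interval_size)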
Example #4
def main():
    """Read chromosome sizes and generate intervals."""
    args = parse_args()

    # Read chromosome sizes
    sizes = read_sizes(args.sizes)

    # Generate intervals
    if args.wg:

        # Generate intervals tiling across all chromosomes in the sizes file
        _logger.info("Generating intervals tiling across all chromosomes \
            in sizes file: " + args.sizes)
        intervals = get_tiling_intervals(sizes, args.intervalsize, args.shift)

        # Write to file
        if args.prefix is None:
            out_file_name = 'genome_intervals.bed'
        else:
            out_file_name = args.prefix + '.genome_intervals.bed'
        out_file_path = os.path.join(args.out_dir, out_file_name)
        df_to_bed(intervals, out_file_path)

    else:

        # Generate training intervals - can overlap
        _logger.info("Generating training intervals")
        train_sizes = sizes[sizes['chrom'] != args.val]
        if args.holdout is not None:
            train_sizes = train_sizes[train_sizes['chrom'] != args.holdout]
        train = get_tiling_intervals(train_sizes, args.intervalsize,
                                     args.shift)

        # Optional - Set fraction of training intervals to contain peaks
        if args.peakfile is not None:
            _logger.info('Finding intervals with peaks')
            train['peak'] = check_bigwig_intervals_peak(train, args.peakfile)
            _logger.info('{} of {} intervals contain peaks.'.format(
                train['peak'].sum(), len(train)))
            train_peaks = train[train['peak']].copy()
            train_nonpeaks = train[~train['peak']].sample(
                args.nonpeak * len(train_peaks))
            train = train_peaks.append(train_nonpeaks)
            train = train.iloc[:, :3]
            _logger.info('Generated {} peak and {} non-peak '
                         'training intervals.'.format(len(train_peaks),
                                                      len(train_nonpeaks)))

        # Write to file
        if args.prefix is None:
            out_file_name = 'training_intervals.bed'
        else:
            out_file_name = args.prefix + '.training_intervals.bed'
        out_file_path = os.path.join(args.out_dir, out_file_name)
        df_to_bed(train, out_file_path)

        # Generate validation intervals - do not overlap
        _logger.info("Generating val intervals")
        val_sizes = sizes[sizes['chrom'] == args.val]
        val = get_tiling_intervals(val_sizes, args.intervalsize)

        # Write to file
        if args.prefix is None:
            out_file_name = 'val_intervals.bed'
        else:
            out_file_name = args.prefix + '.val_intervals.bed'
        out_file_path = os.path.join(args.out_dir, out_file_name)
        df_to_bed(val, out_file_path)

        # Generate holdout intervals - do not overlap
        if args.holdout is not None:
            _logger.info("Generating holdout intervals")
            holdout_sizes = sizes[sizes['chrom'] == args.holdout]
            holdout = get_tiling_intervals(holdout_sizes, args.intervalsize)

            # Write to file
            if args.prefix is None:
                out_file_name = 'holdout_intervals.bed'
            else:
                out_file_name = args.prefix + '.holdout_intervals.bed'
            out_file_path = os.path.join(args.out_dir, out_file_name)
            df_to_bed(holdout, out_file_path)

    _logger.info('Done!')
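
get_tiling_intervals is used above to cover chromosomes with fixed-width windows, optionally stepping by a shift smaller than the window so that training intervals overlap. The following is an illustrative re-implementation sketch, not the AtacWorks code; the 'length' column name in the sizes table is an assumption.

# Sketch of what a tiling-interval generator like get_tiling_intervals
# presumably does: slide a fixed-size window across each chromosome,
# stepping by `shift` (non-overlapping when shift == intervalsize).
import pandas as pd


def tile_intervals(sizes, intervalsize, shift=None):
    """Return a BED-like DataFrame of windows tiling each chromosome."""
    # Non-overlapping windows when shift is omitted
    shift = intervalsize if shift is None else shift
    rows = []
    for chrom, length in zip(sizes['chrom'], sizes['length']):
        start = 0
        while start + intervalsize <= length:
            rows.append((chrom, start, start + intervalsize))
            start += shift
    return pd.DataFrame(rows, columns=['chrom', 'start', 'end'])


sizes = pd.DataFrame({'chrom': ['chr1', 'chr2'], 'length': [1000, 600]})
print(tile_intervals(sizes, intervalsize=200, shift=100).head())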