Example #1
def test_sizes(tmpdir):
    """Write sizes to a file and read it using read_sizes API. \
    Compare the output of read_sizes with original data."""
    sizes = {"chrom": ["chr1", "chr2"], "length": [1000, 200]}
    input_df = pd.DataFrame(sizes)
    bedfile = os.path.join(tmpdir, "sizes.bed")
    input_df.to_csv(bedfile, sep="\t", header=False, index=False)
    output_df = bedio.read_sizes(bedfile)
    assert input_df.equals(output_df)
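
For reference, a minimal sketch of what this test leaves on disk and how it can be cross-checked; the column names come from the test above, while the path and the pandas call are only illustrative:

# sizes.bed, as written by input_df.to_csv(..., sep="\t", header=False, index=False):
#   chr1    1000
#   chr2    200
import pandas as pd

# Reading the file back directly should match what read_sizes returns.
df = pd.read_csv("sizes.bed", sep="\t", header=None, names=["chrom", "length"])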
Example #2
def peak2bw(input_file, sizesfile, out_dir):
    """Convert peak files to bigwig.

    Args:
        input_file: Clean peak file to be converted to bigwig. Needs to be
            either a BED or narrowPeak file.
        sizesfile: BED file containing chromosome sizes.
        out_dir: Directory to save the outputs to.

    Returns:
        Path to the output file.

    """
    # Create the output folder
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # Set name for output file
    prefix = os.path.basename(input_file)
    out_bg_name = os.path.join(out_dir, prefix + '.bedGraph')

    out_bw_name = os.path.join(out_dir, prefix + '.bw')
    # Read input files
    _logger.info('Reading input file')
    # Skip first line if the file is narrowPeak
    skip = False
    if input_file.endswith("narrowPeak"):
        skip = True
    peaks = read_intervals(input_file, skip=skip)
    _logger.info('Read ' + str(len(peaks)) + ' peaks.')
    sizes = read_sizes(sizesfile)

    # Add score of 1 for all peaks
    _logger.info('Adding score')
    peaks['score'] = 1

    # Write bedGraph
    _logger.info('Writing peaks to bedGraph file')

    # Note: peaks will be subset to chromosomes in sizes file.
    df_to_bedGraph(peaks, out_bg_name, sizes)

    # Write bigWig and delete bedGraph
    _logger.info('Writing peaks to bigWig file {}'.format(out_bw_name))
    bedgraph_to_bigwig(out_bg_name, sizesfile,
                       deletebg=True, sort=True)

    _logger.info('Done!')

    return out_bw_name
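
A minimal usage sketch under assumed file names (only the call signature and the output naming come from the function above; the paths themselves are hypothetical):

# Convert a narrowPeak file to a score-1 bigWig using a chromosome sizes file.
out_bw = peak2bw("sample.narrowPeak", "hg38.chrom.sizes", "out_dir")
print(out_bw)  # -> out_dir/sample.narrowPeak.bw

The intermediate bedGraph written alongside it is removed once the bigWig has been produced (deletebg=True in the bedgraph_to_bigwig call).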
Example #3
def test_sizes_as_intervals(tmpdir):
    """Write sizes to a file and read it as intervals using \
    read_sizes API. Compare the output of read_sizes with \
    original data."""
    sizes = {"chrom": ["chr1", "chr2"], "length": [1000, 200]}
    sizes_intervals = {
        "chrom": ["chr1", "chr2"],
        "start": [0, 0],
        "end": [1000, 200]
    }
    input_df = pd.DataFrame(sizes)
    sizes_intervals_df = pd.DataFrame(sizes_intervals)
    bedfile = os.path.join(tmpdir, "sizes.bed")
    input_df.to_csv(bedfile, sep="\t", header=False, index=False)
    output_df = bedio.read_sizes(bedfile, as_intervals=True)
    assert sizes_intervals_df.equals(output_df)
Example #4
def main():
    """Main."""
    root_dir = os.path.abspath(
        os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), ".."))

    args = parse_args(root_dir)

    genomes = {
        "hg19": os.path.join(root_dir, "reference", "hg19.chrom.sizes"),
        "hg38": os.path.join(root_dir, "reference", "hg38.chrom.sizes")
    }
    if args.genome in genomes:
        args.genome = genomes[args.genome]

    # Log the parsed arguments
    _logger.debug(args)

    # check gpu
    # TODO: add cpu support
    if not torch.cuda.is_available():
        raise Exception("No GPU available. Check your machine configuration.")

    # all output will be written in the exp_dir folder
    args.exp_dir = make_experiment_dir(args.exp_name,
                                       args.out_home,
                                       timestamp=True)

    # Convert layer names to a list
    if args.layers is not None:
        args.layers = args.layers.strip("[]").split(",")

    if args.seed is not None and args.seed > 0:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)
    # train & resume
    ##########################################################################
    if args.mode == "train":

        # If h5 files are provided, load them.
        if args.train_h5_files is not None:
            args.train_files = gather_files_from_cmdline(args.train_h5_files,
                                                         extension=".h5")
            args.val_files = gather_files_from_cmdline(args.val_h5_files,
                                                       extension=".h5")

        # If h5 files not given, generate them.
        else:
            args.cleanpeakfile = gather_files_from_cmdline(
                args.cleanpeakfile, extension=(".bed", ".narrowPeak"))
            args.noisybw = gather_files_from_cmdline(args.noisybw,
                                                     extension=".bw")
            args.cleanbw = gather_files_from_cmdline(args.cleanbw,
                                                     extension=".bw")

            # We have to make sure there is a 1-1 correspondence between files.
            assert len(args.cleanpeakfile) == len(args.noisybw)
            assert len(args.cleanbw) == len(args.noisybw)

            train_files = []
            val_files = []
            for idx in range(len(args.cleanbw)):
                cleanbw = args.cleanbw[idx]
                noisybw = args.noisybw[idx]
                cleanpeakfile = args.cleanpeakfile[idx]
                # Read in the narrowPeak or BED files for clean data peak
                # labels, convert them to bigwig
                out_path = os.path.join(args.exp_dir, "bigwig_peakfiles")
                cleanpeakbw = peak2bw(cleanpeakfile, args.genome, out_path)
                # Generate training, validation, holdout intervals files
                out_path = os.path.join(args.exp_dir, "intervals")
                train_intervals, val_intervals, holdout_intervals = \
                    get_intervals(args.genome, args.interval_size,
                                  out_path,
                                  val=args.val_chrom,
                                  holdout=args.holdout_chrom,
                                  nonpeak=args.nonpeak,
                                  peakfile=cleanpeakbw)

                # Convert the input bigwig files and the clean peak files into
                # h5 for training.
                out_path = os.path.join(args.exp_dir, "bw2h5")
                nonzero = True
                prefix = os.path.basename(cleanbw) + ".train"
                train_file = bw2h5(noisybw, cleanbw, args.layersbw,
                                   cleanpeakbw, args.read_buffer, nonzero,
                                   train_intervals, out_path, prefix, args.pad)
                train_files.append(train_file)
                prefix = os.path.basename(cleanbw) + ".val"
                val_file = bw2h5(noisybw, cleanbw, args.layersbw, cleanpeakbw,
                                 args.read_buffer, nonzero, val_intervals,
                                 out_path, prefix, args.pad)
                val_files.append(val_file)

            args.train_files = train_files
            args.val_files = val_files
        _logger.debug("Training data:   " + "\n".join(args.train_files))
        _logger.debug("Validation data: " + "\n".join(args.val_files))

        # Get model parameters
        with h5py.File(args.train_files[0], 'r') as f:
            if args.pad is not None:
                args.interval_size = f['input'].shape[1] - 2 * args.pad
            else:
                args.interval_size = f['input'].shape[1]

        ngpus_per_node = torch.cuda.device_count()
        # WAR: gloo distributed doesn't work if world size is 1.
        # This is fixed in newer torch version -
        # https://github.com/facebookincubator/gloo/issues/209
        if ngpus_per_node == 1:
            args.distributed = False
            args.gpu_idx = 0

        config_dir = os.path.join(args.exp_dir, "configs")
        if not os.path.exists(config_dir):
            os.mkdir(config_dir)
        if args.distributed:
            _logger.info('Distributing to %s GPUs' % str(ngpus_per_node))
            args.world_size = ngpus_per_node
            mp.spawn(train_worker,
                     nprocs=ngpus_per_node,
                     args=(ngpus_per_node, args),
                     join=True)
        else:
            assert_device_available(args.gpu_idx)
            _logger.info('Running on GPU: %s' % str(args.gpu_idx))
            args.world_size = 1
            train_worker(args.gpu_idx, ngpus_per_node, args, timers=Timers)

    # infer & eval
    ##########################################################################
    if args.mode == "denoise" or args.mode == "eval":

        files = []
        if args.denoise_h5_files is not None:
            files = gather_files_from_cmdline(args.denoise_h5_files,
                                              extension=".h5")
            infer_intervals = args.intervals_file
        else:
            cleanpeakbw = None
            if args.mode == "eval":
                # Read in the narrowPeak or BED files for clean data peak
                # labels, convert them to bigwig
                out_path = os.path.join(args.exp_dir, "bigwig_peakfiles")
                cleanpeakbw = peak2bw(args.cleanpeakfile, args.genome,
                                      out_path)
                args.cleanbw = gather_files_from_cmdline(args.cleanbw,
                                                         extension=".bw")

            out_path = os.path.join(args.exp_dir, "intervals")
            infer_intervals = get_intervals(args.genome,
                                            args.interval_size,
                                            out_path,
                                            peakfile=cleanpeakbw,
                                            regions=args.regions)

            # Convert the input bigwig files and the clean peak files into h5
            # for inference or evaluation.
            args.noisybw = gather_files_from_cmdline(args.noisybw,
                                                     extension=".bw")

            for idx in range(len(args.noisybw)):
                out_path = os.path.join(args.exp_dir, "bw2h5")
                nonzero = False
                cleanbw = None
                noisybw = args.noisybw[idx]
                if args.mode == "eval":
                    cleanbw = args.cleanbw[idx]
                prefix = os.path.basename(noisybw) + "." + args.mode
                infer_file = bw2h5(noisybw, cleanbw, args.layersbw,
                                   cleanpeakbw, args.read_buffer, nonzero,
                                   infer_intervals, out_path, prefix, args.pad)
                files.append(infer_file)

        for x in range(len(files)):
            infile = files[x]
            args.input_files = [infile]
            if args.mode == "denoise":
                _logger.debug("Inference data: ", infile)

                # Check that intervals, sizes and h5 file are all compatible.
                _logger.info('Checking input files for compatibility')
                intervals = read_intervals(infer_intervals)
                sizes = read_sizes(args.genome)
                check_intervals(intervals, sizes, infile)

                # Delete intervals and sizes objects in main thread
                del intervals
                del sizes
            else:
                _logger.debug("Evaluation data: ", infile)
            # Get model parameters
            with h5py.File(files[x], 'r') as f:
                if args.pad is not None:
                    args.interval_size = f['input'].shape[1] - 2 * args.pad
                else:
                    args.interval_size = f['input'].shape[1]

            # Make sure that interval_size is a multiple of the out_resolution
            if args.out_resolution is not None:
                assert (args.interval_size % args.out_resolution == 0)

            prefix = os.path.basename(infile).split(".")[0]
            # setup queue and kick off writer process
            #############################################################
            manager = mp.Manager()
            res_queue = manager.Queue()

            if args.mode == "denoise":
                # Create a keyword argument dictionary to pass into the
                # multiprocessor
                keyword_args = {
                    "infer": True,
                    "intervals_file": infer_intervals,
                    "exp_dir": args.exp_dir,
                    "task": args.task,
                    "peaks": args.peaks,
                    "tracks": args.tracks,
                    "num_workers": args.num_workers,
                    "infer_threshold": args.threshold,
                    "reg_rounding": args.reg_rounding,
                    "batches_per_worker": args.batches_per_worker,
                    "gen_bigwig": args.gen_bigwig,
                    "sizes_file": args.genome,
                    "res_queue": res_queue,
                    "prefix": prefix,
                    "deletebg": args.deletebg,
                    "out_resolution": args.out_resolution
                }
                write_proc = mp.Process(target=writer, kwargs=keyword_args)
                write_proc.start()
            #############################################################

            ngpus_per_node = torch.cuda.device_count()
            # WAR: gloo distributed doesn't work if world size is 1.
            # This is fixed in newer torch version -
            # https://github.com/facebookincubator/gloo/issues/209
            if ngpus_per_node == 1:
                args.distributed = False
                args.gpu_idx = 0

            worker = infer_worker if args.mode == "denoise" else eval_worker
            if args.distributed:
                args.world_size = ngpus_per_node
                mp.spawn(worker,
                         nprocs=ngpus_per_node,
                         args=(ngpus_per_node, args, res_queue),
                         join=True)
            else:
                assert_device_available(args.gpu_idx)
                args.world_size = 1
                worker(args.gpu_idx, ngpus_per_node, args, res_queue)

            # finish off writing
            #############################################################
            res_queue.put("done")
            if args.mode == "denoise":
                _logger.info("Waiting for writer to finish...")
                write_proc.join()
            #############################################################
    # Save config parameters
    dst_config_path = os.path.join(args.out_home, args.mode + "_config.yaml")
    save_config(dst_config_path, args)
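
Putting the train branch together, the per-sample data preparation reduces to the three calls sketched below; this is a condensed sketch, with hypothetical paths and placeholder values standing in for the parsed command-line arguments (interval size, chromosomes, read buffer, padding):

import os

# Hypothetical paths standing in for the parsed arguments.
genome_sizes = "reference/hg38.chrom.sizes"
exp_dir = "experiments/denoising_run"

# 1. Convert clean peak calls (BED/narrowPeak) into a bigWig of score-1 peaks.
clean_peak_bw = peak2bw("clean.narrowPeak", genome_sizes,
                        os.path.join(exp_dir, "bigwig_peakfiles"))

# 2. Tile the genome into fixed-size intervals, reserving one chromosome for
#    validation and one as a holdout set.
train_iv, val_iv, holdout_iv = get_intervals(
    genome_sizes, 50000, os.path.join(exp_dir, "intervals"),
    val="chr20", holdout="chr10", nonpeak=1, peakfile=clean_peak_bw)

# 3. Encode the noisy/clean coverage over the training intervals into an h5
#    file (argument order as in the calls inside main(); the values here are
#    assumed).
train_h5 = bw2h5("noisy.bw", "clean.bw", None, clean_peak_bw,
                 10000, True, train_iv, os.path.join(exp_dir, "bw2h5"),
                 "clean.bw.train", None)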
Example #5
def get_intervals(sizesfile,
                  intervalsize,
                  out_dir,
                  val=None,
                  holdout=None,
                  nonpeak=None,
                  peakfile=None,
                  regions=None):
    """Read chromosome sizes and generate intervals.

    Args:
        sizesfile: BED file containing sizes of each chromosome.
        intervalsize: Size of the intervals at each row.
        out_dir: Directory to save the output files to.
        val: Chromosome to reserve for validation.
        holdout: Chromosome to reserve for evaluation.
        nonpeak: Ratio of nonpeak to peak intervals desired in the training
            dataset.
        peakfile: File with clean peaks, used to find which intervals contain
            peaks. Only used when nonpeak is specified.
        regions: Comma-separated chromosomes or chrom:start-end ranges (or a
            path to a BED file of intervals) to restrict interval generation
            to.

    Returns:
        Paths of the files saved.

    """
    # Read chromosome sizes
    sizes = read_sizes(sizesfile)

    # Create the output dir
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # Generate intervals
    if not (val is None or holdout is None):
        # Generate training intervals
        _logger.info("Generating training intervals")
        train_sizes = sizes[sizes['chrom'] != val]
        train_sizes = train_sizes[train_sizes['chrom'] != holdout]
        train = _get_tiling_intervals(intervalsize, sizes=train_sizes)

        # Optional - Set fraction of training intervals to contain peaks
        if nonpeak is not None:
            _logger.info('Finding intervals with peaks')
            train['peak'] = check_bigwig_intervals_peak(train, peakfile)
            _logger.info('{} of {} intervals contain peaks.'.format(
                train['peak'].sum(), len(train)))
            train_peaks = train[train['peak']].copy()
            train_nonpeaks = train[~train['peak']].sample(
                nonpeak * len(train_peaks))
            train = train_peaks.append(train_nonpeaks)
            train = train.iloc[:, :3]
            _logger.info('Generated {} peak and {} non-peak training '
                         'intervals.'.format(len(train_peaks),
                                             len(train_nonpeaks)))

        # Write to file
        out_file_name = str(intervalsize) + '.training_intervals.bed'
        train_file_path = os.path.join(out_dir, out_file_name)
        df_to_bed(train, train_file_path)

        # Generate validation intervals - do not overlap
        _logger.info("Generating val intervals")
        val_sizes = sizes[sizes['chrom'] == val]
        val = _get_tiling_intervals(intervalsize, sizes=val_sizes)

        # Write to file
        out_file_name = str(intervalsize) + '.val_intervals.bed'
        val_file_path = os.path.join(out_dir, out_file_name)
        df_to_bed(val, val_file_path)

        # Generate holdout intervals - do not overlap
        holdout_sizes = sizes[sizes['chrom'] == holdout]
        holdout = _get_tiling_intervals(intervalsize, sizes=holdout_sizes)

        # Write to file
        out_file_name = str(intervalsize) + '.holdout_intervals.bed'
        holdout_file_path = os.path.join(out_dir, out_file_name)
        df_to_bed(holdout, holdout_file_path)
        return train_file_path, val_file_path, holdout_file_path

    elif regions is not None:
        # If the given regions argument is a BED file, just return its path
        if regions.endswith(".bed"):
            return regions
        else:
            final_intervals = pd.DataFrame()
            regions = regions.strip("[]").split(",")
            for region in regions:
                # If the region is specified as an interval like chr1:0-50,
                # split it into a chromosome and its range.
                if region.find(":") != -1:
                    chrom, chrom_range = region.split(":")
                    chrom_range = chrom_range.split("-")
                    chrom_range = [int(value) for value in chrom_range]
                    chrom_range.insert(0, chrom)
                    intervals = _get_tiling_intervals(intervalsize,
                                                      chrom_range=chrom_range)
                else:
                    chrom = region
                    chrom_sizes = sizes[sizes['chrom'] == chrom]
                    chrlength = chrom_sizes.iloc[0, 1]
                    intervals = _get_tiling_intervals(
                        intervalsize, chrom_range=[chrom, 0, chrlength])

                final_intervals = final_intervals.append(intervals,
                                                         ignore_index=True)

            # Write the intervals to file
            out_file_name = str(intervalsize) + '.regions_intervals.bed'
            region_file_path = os.path.join(out_dir, out_file_name)
            df_to_bed(final_intervals, region_file_path)
            return region_file_path

    # If validation and holdout chromosome are not specified,
    # we use whole genome.
    else:
        # Generate intervals tiling across all chromosomes in the sizes file
        _logger.info("Generating intervals tiling across all chromosomes \
            in sizes file: " + sizesfile)
        intervals = _get_tiling_intervals(intervalsize, sizes=sizes)

        # Write to file
        out_file_name = str(intervalsize) + '.genome_intervals.bed'
        wg_file_path = os.path.join(out_dir, out_file_name)
        df_to_bed(intervals, wg_file_path)
        _logger.info('Done!')
        return wg_file_path
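
A hedged sketch of the three calling modes handled above; the interval size, chromosome names and paths are illustrative rather than taken from the source:

# 1. Train/val/holdout split: returns three BED file paths.
train_bed, val_bed, holdout_bed = get_intervals(
    "hg38.chrom.sizes", 50000, "intervals",
    val="chr20", holdout="chr10", nonpeak=1, peakfile="clean.peaks.bw")

# 2. Restricted regions: chromosomes and/or chrom:start-end ranges give a
#    single BED file path.
regions_bed = get_intervals("hg38.chrom.sizes", 50000, "intervals",
                            regions="[chr1,chr2:0-1000000]")

# 3. No val/holdout and no regions: tile the whole genome.
genome_bed = get_intervals("hg38.chrom.sizes", 50000, "intervals")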
                        that was supplied to bw2h5.py when creating \
                        --label_file. Not required if --label_file \
                        is a bigWig file.')
    args = parser.parse_args()
    return args


args = parse_args()

# Load intervals if supplied
_logger.info('Loading intervals')
if args.intervals is not None:
    intervals = read_intervals(args.intervals)
# If not, use whole chromosome lengths
elif args.sizes is not None:
    intervals = read_sizes(args.sizes, as_intervals=True)
else:
    intervals = None

# Calculate regression metrics
if args.task == 'regression':

    # Load labels
    _logger.info("Loading labels for regression")
    y = read_data_file(args.label_file, 'label_reg', intervals, pad=args.pad)

    # Load data
    _logger.info("Loading data for regression")
    if args.test_file is None:
        x = read_data_file(args.label_file, 'input', pad=args.pad)
    else: