Code example #1
def test_read_intervals(tmpdir):
    """Write intervals to a file and read it through read_intervals \
    API. Compare the output of read_intervals with original data."""
    intervals = {
        "chrom": ["chr1", "chr2"],
        "start": [0, 100],
        "end": [100, 500]
    }
    input_df = pd.DataFrame(intervals)
    bedfile = os.path.join(tmpdir, "intervals.bed")
    input_df.to_csv(bedfile, sep="\t", header=False, index=False)
    output_df = bedio.read_intervals(bedfile)
    assert input_df.equals(output_df)
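
The round-trip above implies that bedio.read_intervals returns a DataFrame with the same chrom/start/end columns that were written to the headerless BED file. A minimal plain-pandas stand-in with the same observable behavior (an illustrative sketch, not bedio's actual implementation) could look like this:

import pandas as pd

def read_intervals_sketch(bedfile, skip=0):
    # Hypothetical stand-in for bedio.read_intervals: read a headerless,
    # tab-separated BED file into a chrom/start/end DataFrame, optionally
    # skipping the first `skip` rows (matching the skip test in example #3).
    return pd.read_csv(bedfile, sep="\t", header=None, skiprows=skip,
                       names=["chrom", "start", "end"])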
Code example #2
File: peak2bw.py Project: zyzhang1992/AtacWorks
def peak2bw(input_file, sizesfile, out_dir):
    """Convert peak files to bigwig.

    Args:
        input_file: Clean peak file to be converted to bigwig. Needs to be
            either a BED or narrowPeak file.
        sizesfile: BED file containing chromosome sizes.
        out_dir: Directory to save the outputs to.

    Returns:
        Path to the output file.

    """
    # Create the output folder
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # Set name for output file
    prefix = os.path.basename(input_file)
    out_bg_name = os.path.join(out_dir, prefix + '.bedGraph')

    out_bw_name = os.path.join(out_dir, prefix + '.bw')
    # Read input files
    _logger.info('Reading input file')
    # Skip first line if the file is narrowPeak
    skip = False
    if input_file.endswith("narrowPeak"):
        skip = True
    peaks = read_intervals(input_file, skip=skip)
    _logger.info('Read ' + str(len(peaks)) + ' peaks.')
    sizes = read_sizes(sizesfile)

    # Add score of 1 for all peaks
    _logger.info('Adding score')
    peaks['score'] = 1

    # Write bedGraph
    _logger.info('Writing peaks to bedGraph file')

    # Note: peaks will be subset to chromosomes in sizes file.
    df_to_bedGraph(peaks, out_bg_name, sizes)

    # Write bigWig and delete bedGraph
    _logger.info('Writing peaks to bigWig file {}'.format(out_bw_name))
    bedgraph_to_bigwig(out_bg_name, sizesfile,
                       deletebg=True, sort=True)

    _logger.info('Done!')

    return out_bw_name
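
A usage sketch for peak2bw; the input paths below are placeholders, not files shipped with AtacWorks:

# Illustrative call only; "sample.narrowPeak" and "hg38.chrom.sizes" are
# hypothetical paths. The first line of a narrowPeak input is skipped and
# every peak is written with score 1 before conversion to bigWig.
bw_path = peak2bw("sample.narrowPeak", "hg38.chrom.sizes", "outputs")
print(bw_path)  # outputs/sample.narrowPeak.bw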
Code example #3
def test_read_intervals_skip(tmpdir):
    """Write intervals to a file and read it using read_intervals \
    API, by skipping the first row. Compare the output with \
    original data with first row skipped."""
    intervals = {
        "chrom": ["chr1", "chr2"],
        "start": [0, 100],
        "end": [100, 500]
    }
    intervals_skip = {"chrom": ["chr2"], "start": [100], "end": [500]}
    input_df = pd.DataFrame(intervals)
    input_df_skip = pd.DataFrame(intervals_skip)
    bedfile = os.path.join(tmpdir, "intervals.bed")
    input_df.to_csv(bedfile, sep="\t", header=False, index=False)
    output_df = bedio.read_intervals(bedfile, skip=1)
    assert input_df_skip.equals(output_df)
Code example #4
def main():
    """Main."""
    root_dir = os.path.abspath(
        os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), ".."))

    args = parse_args(root_dir)

    genomes = {
        "hg19": os.path.join(root_dir, "reference", "hg19.chrom.sizes"),
        "hg38": os.path.join(root_dir, "reference", "hg38.chrom.sizes")
    }
    if args.genome in genomes:
        args.genome = genomes[args.genome]

    # Log parsed arguments
    _logger.debug(args)

    # check gpu
    # TODO: add cpu support
    if not torch.cuda.is_available():
        raise Exception("No GPU available. Check your machine configuration.")

    # all output will be written in the exp_dir folder
    args.exp_dir = make_experiment_dir(args.exp_name,
                                       args.out_home,
                                       timestamp=True)

    # Convert layer names to a list
    if args.layers is not None:
        args.layers = args.layers.strip("[]").split(",")

    if args.seed is not None and args.seed > 0:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)
    # train & resume
    ##########################################################################
    if args.mode == "train":

        # If h5 files are provided, load them.
        if args.train_h5_files is not None:
            args.train_files = gather_files_from_cmdline(args.train_h5_files,
                                                         extension=".h5")
            args.val_files = gather_files_from_cmdline(args.val_h5_files,
                                                       extension=".h5")

        # If h5 files not given, generate them.
        else:
            args.cleanpeakfile = gather_files_from_cmdline(
                args.cleanpeakfile, extension=(".bed", ".narrowPeak"))
            args.noisybw = gather_files_from_cmdline(args.noisybw,
                                                     extension=".bw")
            args.cleanbw = gather_files_from_cmdline(args.cleanbw,
                                                     extension=".bw")

            # We have to make sure there is a 1-1 correspondence between files.
            assert len(args.cleanpeakfile) == len(args.noisybw)
            assert len(args.cleanbw) == len(args.noisybw)

            train_files = []
            val_files = []
            for idx in range(len(args.cleanbw)):
                cleanbw = args.cleanbw[idx]
                noisybw = args.noisybw[idx]
                cleanpeakfile = args.cleanpeakfile[idx]
                # Read in the narrowPeak or BED files for clean data peak
                # labels, convert them to bigwig
                out_path = os.path.join(args.exp_dir, "bigwig_peakfiles")
                cleanpeakbw = peak2bw(cleanpeakfile, args.genome, out_path)
                # Generate training, validation, holdout intervals files
                out_path = os.path.join(args.exp_dir, "intervals")
                train_intervals, val_intervals, holdout_intervals = \
                    get_intervals(args.genome, args.interval_size,
                                  out_path,
                                  val=args.val_chrom,
                                  holdout=args.holdout_chrom,
                                  nonpeak=args.nonpeak,
                                  peakfile=cleanpeakbw)

                # Convert the input bigwig files and the clean peak files into
                # h5 for training.
                out_path = os.path.join(args.exp_dir, "bw2h5")
                nonzero = True
                prefix = os.path.basename(cleanbw) + ".train"
                train_file = bw2h5(noisybw, cleanbw, args.layersbw,
                                   cleanpeakbw, args.read_buffer, nonzero,
                                   train_intervals, out_path, prefix, args.pad)
                train_files.append(train_file)
                prefix = os.path.basename(cleanbw) + ".val"
                val_file = bw2h5(noisybw, cleanbw, args.layersbw, cleanpeakbw,
                                 args.read_buffer, nonzero, val_intervals,
                                 out_path, prefix, args.pad)
                val_files.append(val_file)

            args.train_files = train_files
            args.val_files = val_files
        _logger.debug("Training data:   " + "\n".join(args.train_files))
        _logger.debug("Validation data: " + "\n".join(args.val_files))

        # Get model parameters
        with h5py.File(args.train_files[0], 'r') as f:
            if args.pad is not None:
                args.interval_size = f['input'].shape[1] - 2 * args.pad
            else:
                args.interval_size = f['input'].shape[1]

        ngpus_per_node = torch.cuda.device_count()
        # WAR: gloo distributed doesn't work if world size is 1.
        # This is fixed in newer torch version -
        # https://github.com/facebookincubator/gloo/issues/209
        if ngpus_per_node == 1:
            args.distributed = False
            args.gpu_idx = 0

        config_dir = os.path.join(args.exp_dir, "configs")
        if not os.path.exists(config_dir):
            os.mkdir(config_dir)
        if args.distributed:
            _logger.info('Distributing to %s GPUs' % str(ngpus_per_node))
            args.world_size = ngpus_per_node
            mp.spawn(train_worker,
                     nprocs=ngpus_per_node,
                     args=(ngpus_per_node, args),
                     join=True)
        else:
            assert_device_available(args.gpu_idx)
            _logger.info('Running on GPU: %s' % str(args.gpu_idx))
            args.world_size = 1
            train_worker(args.gpu_idx, ngpus_per_node, args, timers=Timers)

    # infer & eval
    ##########################################################################
    if args.mode == "denoise" or args.mode == "eval":

        files = []
        if args.denoise_h5_files is not None:
            files = gather_files_from_cmdline(args.denoise_h5_files,
                                              extension=".h5")
            infer_intervals = args.intervals_file
        else:
            cleanpeakbw = None
            if args.mode == "eval":
                # Read in the narrowPeak or BED files for clean data peak
                # labels, convert them to bigwig
                out_path = os.path.join(args.exp_dir, "bigwig_peakfiles")
                cleanpeakbw = peak2bw(args.cleanpeakfile, args.genome,
                                      out_path)
                args.cleanbw = gather_files_from_cmdline(args.cleanbw,
                                                         extension=".bw")

            out_path = os.path.join(args.exp_dir, "intervals")
            infer_intervals = get_intervals(args.genome,
                                            args.interval_size,
                                            out_path,
                                            peakfile=cleanpeakbw,
                                            regions=args.regions)

            # Convert the input bigwig files and the clean peak files into h5
            # for inference.
            args.noisybw = gather_files_from_cmdline(args.noisybw,
                                                     extension=".bw")

            for idx in range(len(args.noisybw)):
                out_path = os.path.join(args.exp_dir, "bw2h5")
                nonzero = False
                cleanbw = None
                noisybw = args.noisybw[idx]
                if args.mode == "eval":
                    cleanbw = args.cleanbw[idx]
                prefix = os.path.basename(noisybw) + "." + args.mode
                infer_file = bw2h5(noisybw, cleanbw, args.layersbw,
                                   cleanpeakbw, args.read_buffer, nonzero,
                                   infer_intervals, out_path, prefix, args.pad)
                files.append(infer_file)

        for x in range(len(files)):
            infile = files[x]
            args.input_files = [infile]
            if args.mode == "denoise":
                _logger.debug("Inference data: ", infile)

                # Check that intervals, sizes and h5 file are all compatible.
                _logger.info('Checking input files for compatibility')
                intervals = read_intervals(infer_intervals)
                sizes = read_sizes(args.genome)
                check_intervals(intervals, sizes, infile)

                # Delete intervals and sizes objects in main thread
                del intervals
                del sizes
            else:
                _logger.debug("Evaluation data: ", infile)
            # Get model parameters
            with h5py.File(files[x], 'r') as f:
                if args.pad is not None:
                    args.interval_size = f['input'].shape[1] - 2 * args.pad
                else:
                    args.interval_size = f['input'].shape[1]

            # Make sure that interval_size is a multiple of the out_resolution
            if args.out_resolution is not None:
                assert (args.interval_size % args.out_resolution == 0)

            prefix = os.path.basename(infile).split(".")[0]
            # setup queue and kick off writer process
            #############################################################
            manager = mp.Manager()
            res_queue = manager.Queue()

            if args.mode == "denoise":
                # Create a keyword argument dictionary to pass into the
                # multiprocessor
                keyword_args = {
                    "infer": True,
                    "intervals_file": infer_intervals,
                    "exp_dir": args.exp_dir,
                    "task": args.task,
                    "peaks": args.peaks,
                    "tracks": args.tracks,
                    "num_workers": args.num_workers,
                    "infer_threshold": args.threshold,
                    "reg_rounding": args.reg_rounding,
                    "batches_per_worker": args.batches_per_worker,
                    "gen_bigwig": args.gen_bigwig,
                    "sizes_file": args.genome,
                    "res_queue": res_queue,
                    "prefix": prefix,
                    "deletebg": args.deletebg,
                    "out_resolution": args.out_resolution
                }
                write_proc = mp.Process(target=writer, kwargs=keyword_args)
                write_proc.start()
            #############################################################

            ngpus_per_node = torch.cuda.device_count()
            # WAR: gloo distributed doesn't work if world size is 1.
            # This is fixed in newer torch version -
            # https://github.com/facebookincubator/gloo/issues/209
            if ngpus_per_node == 1:
                args.distributed = False
                args.gpu_idx = 0

            worker = infer_worker if args.mode == "denoise" else eval_worker
            if args.distributed:
                args.world_size = ngpus_per_node
                mp.spawn(worker,
                         nprocs=ngpus_per_node,
                         args=(ngpus_per_node, args, res_queue),
                         join=True)
            else:
                assert_device_available(args.gpu_idx)
                args.world_size = 1
                worker(args.gpu_idx, ngpus_per_node, args, res_queue)

            # finish off writing
            #############################################################
            res_queue.put("done")
            if args.mode == "denoise":
                _logger.info("Waiting for writer to finish...")
                write_proc.join()
            #############################################################
    # Save config parameters
    dst_config_path = os.path.join(args.out_home, args.mode + "_config.yaml")
    save_config(dst_config_path, args)
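
In the denoise branch above, results are handed to a dedicated writer process through a multiprocessing queue and a "done" sentinel. Reduced to a self-contained sketch of the same pattern (names here are illustrative, not the AtacWorks API):

import multiprocessing as mp

def writer_sketch(res_queue):
    # Drain items from the queue until the "done" sentinel arrives.
    while True:
        item = res_queue.get()
        if item == "done":
            break
        print("writing", item)

if __name__ == "__main__":
    manager = mp.Manager()
    res_queue = manager.Queue()
    write_proc = mp.Process(target=writer_sketch, args=(res_queue,))
    write_proc.start()
    for batch in range(3):      # stand-in for inference results
        res_queue.put(batch)
    res_queue.put("done")       # tell the writer to finish
    write_proc.join()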
Code example #5
# Output file names
if args.prefix is None:
    prefix = 'summarized_peaks'
else:
    prefix = args.prefix
out_bed_path = os.path.join(args.out_dir, prefix + '.bed')
out_bg_path = os.path.join(args.out_dir, prefix + '.bedGraph')

# Collapse peaks
_logger.info('Writing peaks to bedGraph file {}'.format(out_bg_path))
subprocess.call(['bigWigToBedGraph', args.peakbw, out_bg_path])

# Read collapsed peaks
_logger.info('Reading peaks')
peaks = read_intervals(out_bg_path)
peaks.columns = ['#chrom', 'start', 'end']

# Add length of peaks
_logger.info('Calculating peak statistics')
peaks['len'] = peaks['end'] - peaks['start']

# Extract scores in peaks
peakscores = extract_bigwig_intervals(peaks, args.trackbw, stack=False)

# Add mean score in peak
peaks['mean'] = peakscores.apply(np.mean)

# Add max score
peaks['max'] = peakscores.apply(np.max)
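
With stack=False, extract_bigwig_intervals evidently returns one array of per-base values per peak, so Series.apply can reduce each peak to a scalar. The reduction step in isolation, on synthetic data rather than real bigWig output:

import numpy as np
import pandas as pd

# One array of per-base scores per peak, mimicking stack=False output.
peakscores = pd.Series([np.array([0.1, 0.4, 0.2]), np.array([1.0, 0.5])])
peaks = pd.DataFrame({"start": [0, 100], "end": [3, 102]})
peaks["len"] = peaks["end"] - peaks["start"]
peaks["mean"] = peakscores.apply(np.mean)   # per-peak mean score
peaks["max"] = peakscores.apply(np.max)     # per-peak maximum score
print(peaks)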
Code example #6
                        help='Number of additional bases added as \
                        padding to the intervals in the h5 file \
                        containing labels. Use the same --pad value \
                        that was supplied to bw2h5.py when creating \
                        --label_file. Not required if --label_file \
                        is a bigWig file.')
    args = parser.parse_args()
    return args


args = parse_args()

# Load intervals if supplied
_logger.info('Loading intervals')
if args.intervals is not None:
    intervals = read_intervals(args.intervals)
# If not, use whole chromosome lengths
elif args.sizes is not None:
    intervals = read_sizes(args.sizes, as_intervals=True)
else:
    intervals = None

# Calculate regression metrics
if args.task == 'regression':

    # Load labels
    _logger.info("Loading labels for regression")
    y = read_data_file(args.label_file, 'label_reg', intervals, pad=args.pad)

    # Load data
    _logger.info("Loading data for regression")
Code example #7
File: bw2h5.py Project: zyzhang1992/AtacWorks
def bw2h5(noisybw, cleanbw, layersbw, cleanpeakbw, batch_size,
          nonzero, intervals_file, out_dir, prefix, pad):
    """Convert bigwig files to h5.

    Args:
        noisybw: BigWig file containing noisy data.
        cleanbw: BigWig file containing clean data.
        layersbw: BigWig file containing layers data.
        cleanpeakbw: BigWig file containing clean peaks data to be used as
            labels for training.
        batch_size: Number of lines to read at a time, since all of the data
            does not fit in memory.
        nonzero: Only save intervals that have non-zero values.
        intervals_file: File containing the intervals used for training,
            validation or inference.
        out_dir: Directory to save the output files to.
        prefix: Prefix to attach to the name, to make the files unique.
        pad: Padding values.

    Returns:
        Path to output files.

    """
    # Read intervals
    _logger.info('Reading intervals')
    intervals = read_intervals(intervals_file)
    _logger.info('Read {} intervals'.format(len(intervals)))

    # Create output directory
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # Optionally, select intervals with nonzero coverage
    if nonzero:
        _logger.info('Selecting intervals with nonzero coverage')
        nonzero_intervals = check_bigwig_intervals_nonzero(
            intervals, noisybw)
        _logger.info("Retaining {} of {} nonzero noisy intervals".format(
            sum(nonzero_intervals), len(intervals)))
        intervals = intervals[nonzero_intervals]

    _logger.debug('Collecting %d intervals' % len(intervals))

    # Calculate number of batches
    batches_per_epoch = int(np.ceil(len(intervals) / batch_size))
    _logger.info('Writing data in ' + str(batches_per_epoch) + ' batches.')

    # Split intervals into batches
    batch_starts = np.array(range(0, len(intervals), batch_size))
    batch_ends = batch_starts + batch_size
    batch_ends[-1] = len(intervals)

    # Get output hdf5 filename
    output_file_path = os.path.join(out_dir, prefix + '.h5')

    # Write batches to hdf5 file
    _logger.info('Extracting data for each batch and writing to h5 file')
    for i in range(batches_per_epoch):

        # Print current batch
        if i % 10 == 0:
            _logger.info("batch " + str(i) + " of " + str(batches_per_epoch))

        # Create dictionary to store data
        batch_data = {}

        # Subset intervals
        batch_intervals = intervals.iloc[batch_starts[i]:batch_ends[i], :]

        # Read noisy data
        batch_data['input'] = extract_bigwig_intervals(
            batch_intervals, noisybw, pad=pad
        )

        # Add other input layers
        if layersbw is not None:
            # Read additional layers
            layers = gather_key_files_from_cmdline(layersbw, extension='.bw')
            for key in layers.keys():
                batch_data[key] = extract_bigwig_intervals(
                    batch_intervals, layers[key], pad=pad
                )

        # Add labels
        if cleanbw and cleanpeakbw:
            # Read clean data: regression labels
            batch_data['label_reg'] = extract_bigwig_intervals(
                batch_intervals, cleanbw, pad=pad
            )

            # Read clean data: classification labels
            batch_data['label_cla'] = extract_bigwig_intervals(
                batch_intervals, cleanpeakbw, pad=pad
            )

        _logger.debug(len(batch_data))
        _logger.debug("Saving batch " + str(i) + " with keys " + str(
            batch_data.keys()))

        # Create dataset, or expand and append batch.
        if i == 0:
            dict_to_h5(batch_data, h5file=output_file_path, create_new=True)
        else:
            dict_to_h5(batch_data, h5file=output_file_path, create_new=False)

    _logger.info('Done! Saved to %s' % output_file_path)
    return output_file_path
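
The batch bookkeeping in bw2h5 is plain index arithmetic: with 25 intervals and a batch size of 10 it yields three slices, the last one truncated. A quick stand-alone check of that arithmetic (synthetic numbers, no AtacWorks data needed):

import numpy as np

n_intervals, batch_size = 25, 10
batches_per_epoch = int(np.ceil(n_intervals / batch_size))  # 3
batch_starts = np.array(range(0, n_intervals, batch_size))  # [ 0 10 20]
batch_ends = batch_starts + batch_size                      # [10 20 30]
batch_ends[-1] = n_intervals                                # [10 20 25]
print(batches_per_epoch, batch_starts, batch_ends)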