Example #1
0
def test_df_to_bed_header(tmpdir):
    """Round-trip a DataFrame of intervals through df_to_bed with a header.

    Writes a small interval DataFrame (with header) using the df_to_bed
    API, reads the resulting BED file back with pandas, and checks the
    round-tripped frame equals the original.
    """
    original = pd.DataFrame({
        "chrom": ["chr1", "chr2"],
        "start": [0, 0],
        "end": [1000, 200],
    })
    out_path = os.path.join(tmpdir, "sizes.bed")
    bedio.df_to_bed(original, out_path, header=True)
    round_tripped = pd.read_csv(out_path, sep="\t")
    assert original.equals(round_tripped)
Example #2
0
def get_intervals(sizesfile,
                  intervalsize,
                  out_dir,
                  val=None,
                  holdout=None,
                  nonpeak=None,
                  peakfile=None,
                  regions=None):
    """Read chromosome sizes and generate BED intervals.

    Args:
        sizesfile: BED file containing sizes of each chromosome.
        intervalsize: Size of the intervals at each row.
        out_dir: Directory to save the output files to.
        val: Chromosome to reserve for validation.
        holdout: Chromosome to reserve for evaluation.
        nonpeak: Ratio of nonpeak to peak intervals desired in the
            training dataset.
        peakfile: File with clean peaks to know which intervals have
            non-zero values. Only useful if nonpeak is greater than one.
        regions: Either the path to an existing BED file (returned as-is)
            or a string like "[chr1,chr2:0-1000]" naming chromosomes or
            chrom:start-end ranges to tile.

    Returns:
        Paths of files saved: a (train, val, holdout) tuple when both
        ``val`` and ``holdout`` are given, otherwise a single file path.
    """
    # Read chromosome sizes
    sizes = read_sizes(sizesfile)

    # Create the output dir
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Generate intervals
    if not (val is None or holdout is None):
        # Generate training intervals from every chromosome except the
        # validation and holdout chromosomes.
        _logger.info("Generating training intervals")
        train_sizes = sizes[sizes['chrom'] != val]
        train_sizes = train_sizes[train_sizes['chrom'] != holdout]
        train = _get_tiling_intervals(intervalsize, sizes=train_sizes)

        # Optional - Set fraction of training intervals to contain peaks
        if nonpeak is not None:
            _logger.info('Finding intervals with peaks')
            train['peak'] = check_bigwig_intervals_peak(train, peakfile)
            _logger.info('{} of {} intervals contain peaks.'.format(
                train['peak'].sum(), len(train)))
            train_peaks = train[train['peak']].copy()
            # BUG FIX: the original filtered with `train['peak'] is False`,
            # which tests the Series object's *identity* against False and
            # never selects the non-peak rows. Negate the boolean mask
            # instead. The sample size is cast to int since `nonpeak` is a
            # ratio and may be fractional.
            train_nonpeaks = train[~train['peak']].sample(
                int(nonpeak * len(train_peaks)))
            # pd.concat replaces DataFrame.append (removed in pandas 2.0).
            train = pd.concat([train_peaks, train_nonpeaks])
            # Keep only the first three columns (chrom, start, end).
            train = train.iloc[:, :3]
            _logger.info('Generated {} peak and {} non-peak\
                     training intervals.'.format(len(train_peaks),
                                                 len(train_nonpeaks)))

        # Write to file
        out_file_name = str(intervalsize) + '.training_intervals.bed'
        train_file_path = os.path.join(out_dir, out_file_name)
        df_to_bed(train, train_file_path)

        # Generate validation intervals - do not overlap.
        # Use a new name rather than rebinding the `val` parameter.
        _logger.info("Generating val intervals")
        val_sizes = sizes[sizes['chrom'] == val]
        val_intervals = _get_tiling_intervals(intervalsize, sizes=val_sizes)

        # Write to file
        out_file_name = str(intervalsize) + '.val_intervals.bed'
        val_file_path = os.path.join(out_dir, out_file_name)
        df_to_bed(val_intervals, val_file_path)

        # Generate holdout intervals - do not overlap
        holdout_sizes = sizes[sizes['chrom'] == holdout]
        holdout_intervals = _get_tiling_intervals(intervalsize,
                                                  sizes=holdout_sizes)

        # Write to file
        out_file_name = str(intervalsize) + '.holdout_intervals.bed'
        holdout_file_path = os.path.join(out_dir, out_file_name)
        df_to_bed(holdout_intervals, holdout_file_path)
        return train_file_path, val_file_path, holdout_file_path

    elif regions is not None:
        # If given regions is a file, then just return the file path
        if regions.endswith(".bed"):
            return regions
        else:
            interval_frames = []
            regions = regions.strip("[]").split(",")
            for region in regions:
                # If regions are specified with intervals like chr1:0-50
                # then split the region into chrom and its range.
                if region.find(":") != -1:
                    chrom, chrom_range = region.split(":")
                    chrom_range = chrom_range.split("-")
                    chrom_range = [int(value) for value in chrom_range]
                    chrom_range.insert(0, chrom)
                    intervals = _get_tiling_intervals(intervalsize,
                                                      chrom_range=chrom_range)
                else:
                    # Bare chromosome name: tile the whole chromosome,
                    # looking its length up in the sizes table.
                    chrom = region
                    chrom_sizes = sizes[sizes['chrom'] == chrom]
                    chrlength = chrom_sizes.iloc[0, 1]
                    intervals = _get_tiling_intervals(
                        intervalsize, chrom_range=[chrom, 0, chrlength])

                interval_frames.append(intervals)

            # Concatenate once at the end instead of the quadratic
            # (and now-removed) DataFrame.append-per-iteration.
            if interval_frames:
                final_intervals = pd.concat(interval_frames,
                                            ignore_index=True)
            else:
                final_intervals = pd.DataFrame()

            # Write the intervals to file
            out_file_name = str(intervalsize) + '.regions_intervals.bed'
            region_file_path = os.path.join(out_dir, out_file_name)
            df_to_bed(final_intervals, region_file_path)
            return region_file_path

    # If validation and holdout chromosome are not specified,
    # we use whole genome.
    else:
        # Generate intervals tiling across all chromosomes in the sizes file
        _logger.info("Generating intervals tiling across all chromosomes \
            in sizes file: " + sizesfile)
        intervals = _get_tiling_intervals(intervalsize, sizes=sizes)

        # Write to file
        out_file_name = str(intervalsize) + '.genome_intervals.bed'
        wg_file_path = os.path.join(out_dir, out_file_name)
        df_to_bed(intervals, wg_file_path)
        _logger.info('Done!')
        return wg_file_path
Example #3
0
# Extract scores in peaks.
# NOTE(review): `peaks`, `args`, `out_bed_path` and `out_bg_path` are
# defined earlier in this script, outside this excerpt — presumably
# `peaks` is a DataFrame of BED intervals; confirm against the caller.
peakscores = extract_bigwig_intervals(peaks, args.trackbw, stack=False)

# Add mean score in peak
peaks['mean'] = peakscores.apply(np.mean)

# Add max score
peaks['max'] = peakscores.apply(np.max)

# Add summit: the offset of the maximum value within each peak, and the
# absolute summit coordinate obtained by adding the peak start.
# TODO: we might want to make this more complicated - if there
# are multiple positions with same value, pick the central one?
peaks['relativesummit'] = peakscores.apply(np.argmax)
peaks['summit'] = peaks['start'] + peaks['relativesummit']

# Discard peaks below minimum length
if args.minlen is not None:
    num_before_cut = len(peaks)
    # assumes a 'len' column was added upstream — TODO confirm
    peaks = peaks[peaks['len'] >= args.minlen]
    _logger.info("reduced number of peaks from {} to {}.".format(
        num_before_cut, len(peaks)))
# TODO: we may also want to merge small peaks together

# Write to BED
_logger.info('Writing peaks to BED file {}'.format(out_bed_path))
df_to_bed(peaks, out_bed_path, header=True)

# Delete the intermediate bedGraph file.
# NOTE(review): shells out to `rm`; os.remove(out_bg_path) would be more
# portable — confirm os is imported before changing.
_logger.info('Deleting bedGraph file')
subprocess.call(['rm', out_bg_path])