def test_df_to_bed_header(tmpdir):
    """Create a pandas dataframe of intervals and write it to a file,
    along with the header, using the df_to_bed API. Read the output file
    and compare with the original data."""
    sizes_intervals = {
        "chrom": ["chr1", "chr2"],
        "start": [0, 0],
        "end": [1000, 200]
    }
    sizes_df = pd.DataFrame(sizes_intervals)
    bedfile = os.path.join(tmpdir, "sizes.bed")
    bedio.df_to_bed(sizes_df, bedfile, header=True)
    read_output = pd.read_csv(bedfile, sep="\t")
    assert sizes_df.equals(read_output)
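# Companion sketch (not in the original suite): it assumes that df_to_bed
# writes a plain, header-less BED file when header=True is not passed; the
# test name and that assumed default are illustrative, not confirmed API.
def test_df_to_bed_no_header(tmpdir):
    """Write a dataframe of intervals without a header using the df_to_bed
    API, read the output file back, and compare with the original data."""
    sizes_intervals = {
        "chrom": ["chr1", "chr2"],
        "start": [0, 0],
        "end": [1000, 200]
    }
    sizes_df = pd.DataFrame(sizes_intervals)
    bedfile = os.path.join(tmpdir, "sizes_no_header.bed")
    bedio.df_to_bed(sizes_df, bedfile)
    # Supply column names when reading, since the file has no header line.
    read_output = pd.read_csv(bedfile, sep="\t", header=None,
                              names=["chrom", "start", "end"])
    assert sizes_df.equals(read_output)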
def get_intervals(sizesfile, intervalsize, out_dir, val=None, holdout=None,
                  nonpeak=None, peakfile=None, regions=None):
    """Read chromosome sizes and generate intervals.

    Args:
        sizesfile: BED file containing the size of each chromosome.
        intervalsize: Size of the interval in each row.
        out_dir: Directory to save the output files to.
        val: Chromosome to reserve for validation.
        holdout: Chromosome to reserve for evaluation.
        nonpeak: Ratio of nonpeak to peak intervals desired in the
            training dataset.
        peakfile: File with clean peaks, used to determine which intervals
            have non-zero values. Only used if nonpeak is set.
        regions: Regions to restrict intervals to; either a path to a BED
            file or a comma-separated list of chromosomes or
            chrom:start-end ranges.

    Returns:
        Paths of the files saved.

    """
    # Read chromosome sizes
    sizes = read_sizes(sizesfile)

    # Create the output dir
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Generate intervals
    if not (val is None or holdout is None):
        # Generate training intervals
        _logger.info("Generating training intervals")
        train_sizes = sizes[sizes['chrom'] != val]
        train_sizes = train_sizes[train_sizes['chrom'] != holdout]
        train = _get_tiling_intervals(intervalsize, sizes=train_sizes)

        # Optional - set the fraction of training intervals that contain peaks
        if nonpeak is not None:
            _logger.info('Finding intervals with peaks')
            train['peak'] = check_bigwig_intervals_peak(train, peakfile)
            _logger.info('{} of {} intervals contain peaks.'.format(
                train['peak'].sum(), len(train)))
            train_peaks = train[train['peak']].copy()
            # Sample non-peak intervals in the desired ratio to peak intervals
            train_nonpeaks = train[~train['peak']].sample(
                nonpeak * len(train_peaks))
            train = train_peaks.append(train_nonpeaks)
            train = train.iloc[:, :3]
            _logger.info('Generated {} peak and {} non-peak'
                         ' training intervals.'.format(
                             len(train_peaks), len(train_nonpeaks)))

        # Write to file
        out_file_name = str(intervalsize) + '.training_intervals.bed'
        train_file_path = os.path.join(out_dir, out_file_name)
        df_to_bed(train, train_file_path)

        # Generate validation intervals - do not overlap
        _logger.info("Generating val intervals")
        val_sizes = sizes[sizes['chrom'] == val]
        val = _get_tiling_intervals(intervalsize, sizes=val_sizes)

        # Write to file
        out_file_name = str(intervalsize) + '.val_intervals.bed'
        val_file_path = os.path.join(out_dir, out_file_name)
        df_to_bed(val, val_file_path)

        # Generate holdout intervals - do not overlap
        holdout_sizes = sizes[sizes['chrom'] == holdout]
        holdout = _get_tiling_intervals(intervalsize, sizes=holdout_sizes)

        # Write to file
        out_file_name = str(intervalsize) + '.holdout_intervals.bed'
        holdout_file_path = os.path.join(out_dir, out_file_name)
        df_to_bed(holdout, holdout_file_path)

        return train_file_path, val_file_path, holdout_file_path

    elif regions is not None:
        # If the given regions argument is a BED file, just return its path
        if regions.endswith(".bed"):
            return regions
        else:
            final_intervals = pd.DataFrame()
            regions = regions.strip("[]").split(",")
            for region in regions:
                # If a region is specified as a range like chr1:0-50,
                # split it into the chromosome and its range.
                if region.find(":") != -1:
                    chrom, chrom_range = region.split(":")
                    chrom_range = chrom_range.split("-")
                    chrom_range = [int(value) for value in chrom_range]
                    chrom_range.insert(0, chrom)
                    intervals = _get_tiling_intervals(
                        intervalsize, chrom_range=chrom_range)
                else:
                    chrom = region
                    chrom_sizes = sizes[sizes['chrom'] == chrom]
                    chrlength = chrom_sizes.iloc[0, 1]
                    intervals = _get_tiling_intervals(
                        intervalsize, chrom_range=[chrom, 0, chrlength])
                final_intervals = final_intervals.append(intervals,
                                                         ignore_index=True)

            # Write the intervals to file
            out_file_name = str(intervalsize) + '.regions_intervals.bed'
            region_file_path = os.path.join(out_dir, out_file_name)
            df_to_bed(final_intervals, region_file_path)
            return region_file_path

    # If validation and holdout chromosomes are not specified,
    # use the whole genome.
    else:
        # Generate intervals tiling across all chromosomes in the sizes file
        _logger.info("Generating intervals tiling across all chromosomes "
                     "in sizes file: " + sizesfile)
        intervals = _get_tiling_intervals(intervalsize, sizes=sizes)

        # Write to file
        out_file_name = str(intervalsize) + '.genome_intervals.bed'
        wg_file_path = os.path.join(out_dir, out_file_name)
        df_to_bed(intervals, wg_file_path)

        _logger.info('Done!')
        return wg_file_path
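# Illustrative sketch (not part of the module above): _get_tiling_intervals is
# called with either sizes= or chrom_range= but its body is not shown here.
# The helper below demonstrates the assumed behaviour for a single
# [chrom, start, end] range: fixed-size, non-overlapping windows. The name
# tile_chrom_range and the handling of the final partial window are
# assumptions for illustration, not the project's implementation.
def tile_chrom_range(intervalsize, chrom_range):
    """Tile a [chrom, start, end] range into fixed-size intervals."""
    chrom, start, end = chrom_range
    starts = list(range(start, end, intervalsize))
    # Clip the last window to the end of the range (an assumption; the real
    # helper may instead drop or extend a final partial window).
    ends = [min(s + intervalsize, end) for s in starts]
    return pd.DataFrame({"chrom": chrom, "start": starts, "end": ends})


# Example: tile_chrom_range(50, ["chr1", 0, 120]) yields rows
# (chr1, 0, 50), (chr1, 50, 100), (chr1, 100, 120).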
# Extract scores in peaks
peakscores = extract_bigwig_intervals(peaks, args.trackbw, stack=False)

# Add mean score in peak
peaks['mean'] = peakscores.apply(np.mean)

# Add max score
peaks['max'] = peakscores.apply(np.max)

# Add summit
# TODO: we might want to make this more complicated - if there
# are multiple positions with the same value, pick the central one?
peaks['relativesummit'] = peakscores.apply(np.argmax)
peaks['summit'] = peaks['start'] + peaks['relativesummit']

# Discard peaks below minimum length
if args.minlen is not None:
    num_before_cut = len(peaks)
    peaks = peaks[peaks['len'] >= args.minlen]
    _logger.info("Reduced number of peaks from {} to {}.".format(
        num_before_cut, len(peaks)))
    # TODO: we may also want to merge small peaks together

# Write to BED
_logger.info('Writing peaks to BED file {}'.format(out_bed_path))
df_to_bed(peaks, out_bed_path, header=True)

# Delete bedGraph
_logger.info('Deleting bedGraph file')
subprocess.call(['rm', out_bg_path])
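# Minimal, self-contained sketch of the summarization step above on synthetic
# data. It assumes extract_bigwig_intervals(..., stack=False) returns a pandas
# Series with one numpy array of per-base scores per peak; that return type,
# and the example values, are assumptions for illustration only.
import numpy as np
import pandas as pd

example_peaks = pd.DataFrame({"chrom": ["chr1", "chr1"],
                              "start": [100, 400],
                              "end": [105, 404]})
example_scores = pd.Series([np.array([1.0, 3.0, 2.0, 3.0, 0.0]),
                            np.array([0.0, 5.0, 1.0, 2.0])])
example_peaks['mean'] = example_scores.apply(np.mean)
example_peaks['max'] = example_scores.apply(np.max)
# np.argmax returns the first position holding the maximum, so ties resolve
# to the leftmost base (index 1 for the first peak above).
example_peaks['relativesummit'] = example_scores.apply(np.argmax)
example_peaks['summit'] = example_peaks['start'] + example_peaks['relativesummit']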