def cdf_match(
    src, ref, min_val=None, max_val=None, nbins=100, minobs=None, **kwargs
):
    """
    Scale ``src`` so that its CDF matches the CDF of ``ref``.

    ``nbins`` evenly spaced percentiles between 0 and 100 are evaluated
    for both datasets. If ``minobs`` is given, the percentiles are first
    adjusted with ``utils.resize_percentiles`` based on ``src``. The
    resulting percentile values are passed through
    ``utils.unique_percentiles_interpolate`` and then handed to
    ``gen_cdf_match`` together with ``k=5``.

    NOTE(review): this function used to be documented as not ensuring
    unique percentiles (with possible NaN values in the output); the
    percentile values are now post-processed by
    ``utils.unique_percentiles_interpolate`` -- confirm whether that
    caveat still applies.

    Parameters
    ----------
    src: numpy.array
        input dataset which will be scaled
    ref: numpy.array
        src will be scaled to this dataset
    min_val: float, optional
        Minimum allowed value; forwarded unchanged to ``gen_cdf_match``,
        which is expected to cap the output at this value.
    max_val: float, optional
        Maximum allowed value; forwarded unchanged to ``gen_cdf_match``,
        which is expected to cap the output at this value.
    nbins: int, optional
        Number of evenly spaced percentiles between 0 and 100 that are
        used for the estimation of the CDF.
    minobs: int, optional
        Minimum desired number of observations in a bin. When None
        (default) the percentiles are left as generated.
    **kwargs: dict
        keywords to be passed on to the gen_cdf_match() function

    Returns
    -------
    CDF matched values: numpy.array
        dataset src with CDF as ref (the return value of
        ``gen_cdf_match``)
    """
    percentiles = np.linspace(0, 100, nbins)
    if minobs is not None:
        percentiles = utils.resize_percentiles(src, percentiles, minobs)

    def _unique_percentile_values(values):
        # Percentile values of one dataset, post-processed so that they
        # can be used as matching nodes.
        raw = np.array(np.percentile(values, percentiles))
        return utils.unique_percentiles_interpolate(
            raw, percentiles=percentiles
        )

    # src is evaluated before ref, as in the plain sequential form.
    src_nodes = _unique_percentile_values(src)
    ref_nodes = _unique_percentile_values(ref)

    return gen_cdf_match(
        src,
        src_nodes,
        ref_nodes,
        ref=ref,
        min_val=min_val,
        max_val=max_val,
        k=5,
        **kwargs,
    )
def calc_parameters(self, data):
    """
    Compute the percentile values of every column, for CDF matching.

    For each column of ``data`` the percentiles listed in
    ``self.percentiles`` are evaluated with ``numpy.percentile`` and the
    result is passed through ``unique_percentiles_interpolate``.

    Parameters
    ----------
    data: pandas.DataFrame
        temporally matched dataset

    Returns
    -------
    parameters: dictionary
        keys -> Names of columns in the input data frame
        values -> percentile values of that column as returned by
        ``unique_percentiles_interpolate``
    """

    def _column_percentiles(name):
        # Raw percentiles of one column, then post-processed by the helper.
        raw = np.percentile(data[name].values, self.percentiles)
        return unique_percentiles_interpolate(
            raw, percentiles=self.percentiles
        )

    return {name: _column_percentiles(name) for name in data.columns}
def cdf_match(src, ref, min_val=None, max_val=None, nbins=100):
    """
    Scale ``src`` so that its CDF matches the CDF of ``ref``.

    ``nbins`` evenly spaced percentiles between 0 and 100 are evaluated
    for both datasets, the percentile values are passed through
    ``unique_percentiles_interpolate`` and then handed to
    ``gen_cdf_match`` together with ``k=5``.

    NOTE(review): this function used to be documented as not ensuring
    unique percentiles (with possible NaN values in the output); the
    percentile values are now post-processed by
    ``unique_percentiles_interpolate`` -- confirm whether that caveat
    still applies.

    Parameters
    ----------
    src: numpy.array
        input dataset which will be scaled
    ref: numpy.array
        src will be scaled to this dataset
    min_val: float, optional
        Minimum allowed value; forwarded unchanged to ``gen_cdf_match``,
        which is expected to cap the output at this value.
    max_val: float, optional
        Maximum allowed value; forwarded unchanged to ``gen_cdf_match``,
        which is expected to cap the output at this value.
    nbins: int, optional
        Number of evenly spaced percentiles between 0 and 100 that are
        used for the estimation of the CDF.

    Returns
    -------
    CDF matched values: numpy.array
        dataset src with CDF as ref (the return value of
        ``gen_cdf_match``)
    """
    percentiles = np.linspace(0, 100, nbins)

    # Same treatment for both datasets; src is processed before ref.
    src_nodes, ref_nodes = (
        unique_percentiles_interpolate(
            np.array(np.percentile(values, percentiles)),
            percentiles=percentiles,
        )
        for values in (src, ref)
    )

    return gen_cdf_match(
        src, src_nodes, ref_nodes, min_val=min_val, max_val=max_val, k=5
    )
def test_unique_percentile_interpolation():
    """
    Check ``unique_percentiles_interpolate`` on a sample that contains
    many repeated values: the output must have the same length as the
    input percentiles and match the reference values.
    """
    percentiles = [0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 100]
    samples = np.array([1, 1, 1, 2, 2, 2, 5, 5, 6, 10, 10, 10, 10])
    expected = [
        1.0,
        1.025,
        1.05,
        1.1,
        2.0,
        3.5,
        5.0,
        5.3,
        8.4,
        8.93333333,
        9.46666667,
        9.73333333,
        10.0,
    ]

    raw_perc = ml_percentile(samples, percentiles)
    unique_perc = unique_percentiles_interpolate(
        raw_perc, percentiles=percentiles
    )

    # The helper must not add or drop percentile nodes.
    assert len(raw_perc) == len(unique_perc)
    nptest.assert_almost_equal(unique_perc, expected)
def test_unique_percentile_interpolation():
    """
    Verify the values produced by ``unique_percentiles_interpolate`` for
    the percentiles of a sample with many repeated entries.
    """
    sample = np.array([1, 1, 1, 2, 2, 2, 5, 5, 6, 10, 10, 10, 10])
    levels = [0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 100]
    reference = [
        1., 1.025, 1.05, 1.1, 2., 3.5, 5.,
        5.3, 8.4, 8.93333333, 9.46666667, 9.73333333, 10.,
    ]

    perc_in = ml_percentile(sample, levels)
    perc_out = unique_percentiles_interpolate(perc_in, percentiles=levels)

    # Output length has to equal the number of input percentile values.
    assert len(perc_in) == len(perc_out)
    nptest.assert_almost_equal(perc_out, reference)