def test_compute_array_bag():
    x = da.arange(5, chunks=2)
    b = db.from_sequence([1, 2, 3])
    pytest.raises(ValueError, lambda: compute(x, b))

    xx, bb = compute(x, b, scheduler="single-threaded")
    assert np.allclose(xx, np.arange(5))
    assert bb == [1, 2, 3]

def test_compute_with_literal():
    x = da.arange(5, chunks=2)
    y = 10
    xx, yy = compute(x, y)
    assert (xx == x.compute()).all()
    assert yy == y

    assert compute(5) == (5,)

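# compute() treats non-dask arguments as literals and passes them through
# unchanged, so mixed lazy/concrete inputs can be materialized in one call.
# A minimal, self-contained sketch of that behavior (the values here are
# illustrative, not from the tests above):
import numpy as np
import dask
import dask.array as da

x = da.ones((4,), chunks=2) * 3
lazy_sum = x.sum()
total, label, five = dask.compute(lazy_sum, "just a string", 5)
assert total == 12.0 and label == "just a string" and five == 5
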
def test_compute_array_bag():
    x = da.arange(5, chunks=2)
    b = db.from_sequence([1, 2, 3])
    pytest.raises(ValueError, lambda: compute(x, b))

    # ``dask.async`` is not importable on Python 3 (``async`` is a keyword);
    # the synchronous scheduler lives at ``dask.local.get_sync``.
    xx, bb = compute(x, b, get=dask.local.get_sync)
    assert np.allclose(xx, np.arange(5))
    assert bb == [1, 2, 3]

def test_compute_nested():
    a = delayed(1) + 5
    b = a + 1
    c = a + 2
    assert (compute({'a': a, 'b': [1, 2, b]}, (c, 2)) ==
            ({'a': 6, 'b': [1, 2, 7]}, (8, 2)))

    res = compute([a, b], c, traverse=False)
    assert res[0][0] is a
    assert res[0][1] is b
    assert res[1] == 8

def test_num_workers_config(scheduler):
    # Regression test for issue #4082
    f = delayed(pure=False)(time.sleep)
    # Be generous with the initial sleep times, as processes have been
    # observed to take >0.5s to spin up.
    num_workers = 3
    a = [f(1.0) for i in range(num_workers)]
    with dask.config.set(num_workers=num_workers, chunksize=1), Profiler() as prof:
        compute(*a, scheduler=scheduler)

    workers = {i.worker_id for i in prof.results}
    assert len(workers) == num_workers

def test_num_workers_config(scheduler):
    pytest.importorskip("cloudpickle")
    # Regression test for issue #4082
    f = delayed(pure=False)(time.sleep)
    # Be generous with the initial sleep times, as processes have been
    # observed to take >0.5s to spin up.
    a = [f(1.0), f(1.0), f(1.0), f(0.1)]
    num_workers = 3
    with dask.config.set(num_workers=num_workers), Profiler() as prof:
        compute(*a, scheduler=scheduler)

    workers = {i.worker_id for i in prof.results}
    assert len(workers) == num_workers

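# The tests above rely on dask's Profiler diagnostic to count the distinct
# workers that ran tasks under a num_workers config. A minimal sketch of
# that pattern (the task bodies and worker count are illustrative
# assumptions, not taken from the tests):
import time
import dask
from dask.diagnostics import Profiler

tasks = [dask.delayed(time.sleep, pure=False)(0.2) for _ in range(4)]
with dask.config.set(num_workers=2), Profiler() as prof:
    dask.compute(*tasks, scheduler="threads")
print({r.worker_id for r in prof.results})  # ids of the workers that ran tasks
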
def dask_pipeline(df, schema, canvas, glyph, summary):
    create, info, append, combine, finalize = compile_components(summary, schema)
    extend = glyph._build_extend(info, append)

    x_range = canvas.x_range or compute_x_bounds(glyph, df)
    y_range = canvas.y_range or compute_y_bounds(glyph, df)
    x_min, x_max, y_min, y_max = compute(*(x_range + y_range))
    x_range, y_range = (x_min, x_max), (y_min, y_max)

    df = subselect(glyph, df, canvas)
    vt = canvas.view_transform(x_range, y_range)
    shape = (canvas.plot_height, canvas.plot_width)

    def chunk(df):
        aggs = create(shape)
        extend(aggs, df, vt)
        return aggs

    name = tokenize(df._name, canvas, glyph, summary)
    keys = df._keys()
    keys2 = [(name, i) for i in range(len(keys))]
    dsk = {k2: (chunk, k) for (k2, k) in zip(keys2, keys)}
    dsk[name] = (finalize, (combine, keys2))
    dsk.update(df.dask)
    dsk = df._optimize(dsk, name)
    get = _globals['get'] or df._default_get
    return get(dsk, name)

def test_compute_array():
    arr = np.arange(100).reshape((10, 10))
    darr = da.from_array(arr, chunks=(5, 5))
    darr1 = darr + 1
    darr2 = darr + 2
    out1, out2 = compute(darr1, darr2)
    assert np.allclose(out1, arr + 1)
    assert np.allclose(out2, arr + 2)

def test_compute_array_dataframe():
    arr = np.arange(100).reshape((10, 10))
    darr = da.from_array(arr, chunks=(5, 5)) + 1
    df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 5, 3, 3]})
    ddf = dd.from_pandas(df, npartitions=2).a + 2
    arr_out, df_out = compute(darr, ddf)
    assert np.allclose(arr_out, arr + 1)
    # ``pd.util.testing`` was removed in pandas 2.0; use ``pd.testing``.
    pd.testing.assert_series_equal(df_out, df.a + 2)

def test_compute_dataframe():
    df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 5, 3, 3]})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf1 = ddf.a + 1
    ddf2 = ddf.a + ddf.b
    out1, out2 = compute(ddf1, ddf2)
    # ``pd.util.testing`` was removed in pandas 2.0; use ``pd.testing``.
    pd.testing.assert_series_equal(out1, df.a + 1)
    pd.testing.assert_series_equal(out2, df.a + df.b)

def test_compute_dataframe():
    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 5, 3, 3]})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf1 = ddf.a + 1
    ddf2 = ddf.a + ddf.b
    out1, out2 = compute(ddf1, ddf2)
    dd.utils.assert_eq(out1, df.a + 1)
    dd.utils.assert_eq(out2, df.a + df.b)

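# Passing several collections to a single compute() call lets dask merge
# their graphs, so shared ancestors (here the common base frame) are
# evaluated only once. A minimal sketch of that sharing (the frame and
# names are illustrative):
import pandas as pd
import dask
import dask.dataframe as dd

pdf = pd.DataFrame({"a": range(8), "b": range(8)})
ddf = dd.from_pandas(pdf, npartitions=2)
total, bumped = dask.compute(ddf.a.sum(), ddf.b + 1)  # one graph, one pass
assert total == pdf.a.sum()
pd.testing.assert_series_equal(bumped, pdf.b + 1)
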
def test_persist_nested():
    a = delayed(1) + 5
    b = a + 1
    c = a + 2
    result = persist({"a": a, "b": [1, 2, b]}, (c, 2))
    assert isinstance(result[0]["a"], Delayed)
    assert isinstance(result[0]["b"][2], Delayed)
    assert isinstance(result[1][0], Delayed)
    assert compute(*result) == ({"a": 6, "b": [1, 2, 7]}, (8, 2))

    res = persist([a, b], c, traverse=False)
    assert res[0][0] is a
    assert res[0][1] is b
    assert res[1].compute() == 8

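# persist() is compute()'s lazy sibling: it executes the graph but hands
# back equivalent dask objects that hold the finished results, preserving
# nested structure. A minimal sketch (values are illustrative):
import dask
from dask.delayed import Delayed

a = dask.delayed(10) + 1
(state,) = dask.persist({"x": a, "y": [a, 2]})
assert isinstance(state["x"], Delayed)                    # still a dask object...
assert dask.compute(state)[0] == {"x": 11, "y": [11, 2]}  # ...but already done
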
def sort_index(df, npartitions=None, shuffle='tasks', drop=True, upsample=1.0,
               divisions=None, partition_size=128e6, **kwargs):
    """ See _Frame.set_index for docstring """
    if npartitions == 'auto':
        repartition = True
        npartitions = max(100, df.npartitions)
    else:
        if npartitions is None:
            npartitions = df.npartitions
        repartition = False

    index2 = index_to_series(df.index)
    if divisions is None:
        divisions = index2._repartition_quantiles(npartitions, upsample=upsample)
        if repartition:
            parts = df.to_delayed()
            sizes = [delayed(sizeof)(part) for part in parts]
        else:
            sizes = []
        iparts = index2.to_delayed()
        mins = [ipart.min() for ipart in iparts]
        maxes = [ipart.max() for ipart in iparts]
        divisions, sizes, mins, maxes = base.compute(divisions, sizes, mins, maxes)
        divisions = divisions.tolist()

        empty_dataframe_detected = pd.isnull(divisions).all()
        if repartition or empty_dataframe_detected:
            total = sum(sizes)
            npartitions = max(math.ceil(total / partition_size), 1)
            npartitions = min(npartitions, df.npartitions)
            n = len(divisions)
            try:
                divisions = np.interp(x=np.linspace(0, n - 1, npartitions + 1),
                                      xp=np.linspace(0, n - 1, n),
                                      fp=divisions).tolist()
            except (TypeError, ValueError):  # str type
                indexes = np.linspace(0, n - 1, npartitions + 1).astype(int)
                divisions = [divisions[i] for i in indexes]

    return set_partition(df, divisions, shuffle=shuffle, drop=drop, **kwargs)

def test_num_workers_config(scheduler):
    # Regression test for issue #4082

    @delayed
    def f(x):
        time.sleep(0.5)
        return x

    a = [f(i) for i in range(5)]
    num_workers = 3
    with dask.config.set(num_workers=num_workers), Profiler() as prof:
        a = compute(*a, scheduler=scheduler)

    workers = {i.worker_id for i in prof.results}
    assert len(workers) == num_workers

def categorize(df, columns=None, **kwargs):
    """ Convert columns of dataframe to category dtype

    This aids performance, both in-memory and in spilling to disk
    """
    if columns is None:
        dtypes = df.dtypes
        columns = [name for name, dt in zip(dtypes.index, dtypes.values)
                   if dt == 'O']
    if not isinstance(columns, (list, tuple)):
        columns = [columns]

    distincts = [df[col].drop_duplicates() for col in columns]
    values = compute(*distincts, **kwargs)

    func = partial(_categorize_block, categories=dict(zip(columns, values)))
    return df.map_partitions(func, columns=df.columns)

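# The function above batches one drop_duplicates() per column into a single
# compute() call, so the data is scanned in one graph execution rather than
# once per column. A minimal sketch of that pattern against the public dask
# API (the frame is illustrative):
import pandas as pd
import dask
import dask.dataframe as dd

pdf = pd.DataFrame({"color": ["red", "blue", "red"], "size": ["S", "S", "L"]})
ddf = dd.from_pandas(pdf, npartitions=2)
distincts = [ddf[col].drop_duplicates() for col in ["color", "size"]]
color_vals, size_vals = dask.compute(*distincts)  # one execution, all columns
print(sorted(color_vals), sorted(size_vals))
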
def shape_bounds_st_and_axis(df, canvas, glyph):
    x_range = canvas.x_range or glyph._compute_x_bounds(df)
    y_range = canvas.y_range or glyph._compute_y_bounds(df)
    x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range))
    x_range, y_range = (x_min, x_max), (y_min, y_max)

    width = canvas.plot_width
    height = canvas.plot_height

    x_st = canvas.x_axis.compute_scale_and_translate(x_range, width)
    y_st = canvas.y_axis.compute_scale_and_translate(y_range, height)
    st = x_st + y_st
    shape = (height, width)

    x_axis = canvas.x_axis.compute_index(x_st, width)
    y_axis = canvas.y_axis.compute_index(y_st, height)
    axis = [y_axis, x_axis]

    return shape, bounds, st, axis

def test_optimize_nested():
    a = dask.delayed(inc)(1)
    b = dask.delayed(inc)(a)
    c = a + b

    result = optimize({'a': a, 'b': [1, 2, b]}, (c, 2))
    a2 = result[0]['a']
    b2 = result[0]['b'][2]
    c2 = result[1][0]

    assert isinstance(a2, Delayed)
    assert isinstance(b2, Delayed)
    assert isinstance(c2, Delayed)
    assert dict(a2.dask) == dict(b2.dask) == dict(c2.dask)
    assert compute(*result) == ({'a': 2, 'b': [1, 2, 3]}, (5, 2))

    res = optimize([a, b], c, traverse=False)
    assert res[0][0] is a
    assert res[0][1] is b
    assert res[1].compute() == 5

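# optimize() rewrites several collections onto one shared, optimized graph
# without executing anything; the test above checks exactly that sharing.
# A minimal sketch (the inc helper is defined inline as an assumption):
import dask

inc = lambda i: i + 1
a = dask.delayed(inc)(1)
b = dask.delayed(inc)(a)
a2, b2 = dask.optimize(a, b)
assert dict(a2.dask) == dict(b2.dask)  # one merged graph behind both
assert dask.compute(a2, b2) == (2, 3)
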
def dask_pipeline(df, schema, canvas, glyph, summary):
    create, info, append, combine, finalize = compile_components(summary, schema)
    x_mapper = canvas.x_axis.mapper
    y_mapper = canvas.y_axis.mapper
    extend = glyph._build_extend(x_mapper, y_mapper, info, append)

    x_range = canvas.x_range or glyph._compute_x_bounds(df)
    y_range = canvas.y_range or glyph._compute_y_bounds(df)
    x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range))
    x_range, y_range = (x_min, x_max), (y_min, y_max)

    width = canvas.plot_width
    height = canvas.plot_height
    x_st = canvas.x_axis.scale_and_translation(x_range, width)
    y_st = canvas.y_axis.scale_and_translation(y_range, height)
    st = x_st + y_st
    shape = (height, width)

    x_axis = canvas.x_axis.compute_index(width, x_st)
    y_axis = canvas.y_axis.compute_index(height, y_st)

    def chunk(df):
        aggs = create(shape)
        extend(aggs, df, st, bounds)
        return aggs

    name = tokenize(df._name, canvas, glyph, summary)
    keys = df._keys()
    keys2 = [(name, i) for i in range(len(keys))]
    dsk = {k2: (chunk, k) for (k2, k) in zip(keys2, keys)}
    dsk[name] = (apply, finalize, [(combine, keys2)],
                 dict(coords=[y_axis, x_axis], dims=['y_axis', 'x_axis']))
    dsk.update(df.dask)
    dsk = df._optimize(dsk, name)
    get = _globals['get'] or df._default_get
    return get(dsk, name)

def dask_pipeline(df, schema, canvas, glyph, summary):
    create, info, append, combine, finalize = compile_components(summary, schema)
    x_mapper = canvas.x_axis.mapper
    y_mapper = canvas.y_axis.mapper
    extend = glyph._build_extend(x_mapper, y_mapper, info, append)

    x_range = canvas.x_range or glyph._compute_x_bounds(df)
    y_range = canvas.y_range or glyph._compute_y_bounds(df)
    x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range))
    x_range, y_range = (x_min, x_max), (y_min, y_max)

    width = canvas.plot_width
    height = canvas.plot_height
    x_st = canvas.x_axis.compute_scale_and_translate(x_range, width)
    y_st = canvas.y_axis.compute_scale_and_translate(y_range, height)
    st = x_st + y_st
    shape = (height, width)

    x_axis = canvas.x_axis.compute_index(x_st, width)
    y_axis = canvas.y_axis.compute_index(y_st, height)

    def chunk(df):
        aggs = create(shape)
        extend(aggs, df, st, bounds)
        return aggs

    name = tokenize(df._name, canvas, glyph, summary)
    keys = df._keys()
    keys2 = [(name, i) for i in range(len(keys))]
    dsk = {k2: (chunk, k) for (k2, k) in zip(keys2, keys)}
    dsk[name] = (apply, finalize, [(combine, keys2)],
                 dict(coords=[y_axis, x_axis], dims=['y_axis', 'x_axis']))
    dsk.update(df.dask)
    dsk = df._optimize(dsk, name)
    get = _globals['get'] or df._default_get
    return get(dsk, name)

def _compute_partition_stats(
    column: Series, allow_overlap: bool = False, **kwargs
) -> Tuple[List, List, List[int]]:
    """For a given column, compute the min, max, and len of each partition.

    And make sure that the partitions are sorted relative to each other.
    NOTE: this does not guarantee that every partition is internally sorted.
    """
    mins = column.map_partitions(M.min, meta=column)
    maxes = column.map_partitions(M.max, meta=column)
    lens = column.map_partitions(len, meta=column)
    mins, maxes, lens = compute(mins, maxes, lens, **kwargs)

    mins = remove_nans(mins)
    maxes = remove_nans(maxes)
    non_empty_mins = [m for m, length in zip(mins, lens) if length != 0]
    non_empty_maxes = [m for m, length in zip(maxes, lens) if length != 0]
    if (sorted(non_empty_mins) != non_empty_mins
            or sorted(non_empty_maxes) != non_empty_maxes):
        raise ValueError(
            f"Partitions are not sorted ascending by {column.name or 'the index'}",
            f"In your dataset the (min, max, len) values of {column.name or 'the index'} "
            f"for each partition are: {list(zip(mins, maxes, lens))}",
        )
    if not allow_overlap and any(
        a <= b for a, b in zip(non_empty_mins[1:], non_empty_maxes[:-1])
    ):
        warnings.warn(
            "Partitions have overlapping values, so divisions are non-unique. "
            "Use `set_index(sorted=True)` with no `divisions` to allow dask to fix the overlap. "
            f"In your dataset the (min, max, len) values of {column.name or 'the index'} "
            f"for each partition are: {list(zip(mins, maxes, lens))}",
            UserWarning,
        )

    lens = methods.tolist(lens)
    if not allow_overlap:
        return (mins, maxes, lens)
    else:
        return (non_empty_mins, non_empty_maxes, lens)

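# The helper above gathers per-partition min/max/len in one compute() call
# instead of three separate passes. A minimal sketch of the same pattern
# with the public API (the series and partitioning are illustrative; as in
# the source above, the dask series itself is passed as meta):
import pandas as pd
import dask
import dask.dataframe as dd

s = dd.from_pandas(pd.Series([1, 3, 2, 7, 6, 9]), npartitions=3)
mins = s.map_partitions(lambda p: p.min(), meta=s)
maxes = s.map_partitions(lambda p: p.max(), meta=s)
lens = s.map_partitions(len, meta=s)
mins, maxes, lens = dask.compute(mins, maxes, lens)
print(list(zip(mins, maxes, lens)))  # e.g. [(1, 3, 2), (2, 7, 2), (6, 9, 2)]
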
def dask_pipeline(df, schema, canvas, glyph, summary):
    create, info, append, combine, finalize = compile_components(summary, schema)
    x_mapper = canvas.x_axis_type.mapper
    y_mapper = canvas.y_axis_type.mapper
    extend = glyph._build_extend(x_mapper, y_mapper, info, append)

    x_range = canvas.x_range or compute_x_bounds(glyph, df)
    y_range = canvas.y_range or compute_y_bounds(glyph, df)
    x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range))
    x_range, y_range = (x_min, x_max), (y_min, y_max)

    x_axis = canvas.x_axis_type(x_range)
    y_axis = canvas.y_axis_type(y_range)
    xvt = x_axis.view_transform(canvas.plot_width)
    yvt = y_axis.view_transform(canvas.plot_height)
    vt = xvt + yvt
    shape = (canvas.plot_height, canvas.plot_width)

    def chunk(df):
        aggs = create(shape)
        extend(aggs, df, vt, bounds)
        return aggs

    name = tokenize(df._name, canvas, glyph, summary)
    keys = df._keys()
    keys2 = [(name, i) for i in range(len(keys))]
    dsk = {k2: (chunk, k) for (k2, k) in zip(keys2, keys)}
    dsk[name] = (apply, finalize, [(combine, keys2)],
                 dict(x_axis=x_axis, y_axis=y_axis))
    dsk.update(df.dask)
    dsk = df._optimize(dsk, name)
    get = _globals['get'] or df._default_get
    return get(dsk, name)

def shape_bounds_st_and_axis(xr_ds, canvas, glyph):
    if not canvas.x_range or not canvas.y_range:
        x_extents, y_extents = glyph.compute_bounds_dask(xr_ds)
    else:
        x_extents, y_extents = None, None

    x_range = canvas.x_range or x_extents
    y_range = canvas.y_range or y_extents
    x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range))
    x_range, y_range = (x_min, x_max), (y_min, y_max)

    width = canvas.plot_width
    height = canvas.plot_height

    x_st = canvas.x_axis.compute_scale_and_translate(x_range, width)
    y_st = canvas.y_axis.compute_scale_and_translate(y_range, height)
    st = x_st + y_st
    shape = (height, width)

    x_axis = canvas.x_axis.compute_index(x_st, width)
    y_axis = canvas.y_axis.compute_index(y_st, height)
    axis = OrderedDict([(glyph.x_label, x_axis), (glyph.y_label, y_axis)])

    return shape, bounds, st, axis

for dataset in datasets:
    dat = tcdp.load_data(dataset, art='R')
    # dat = tcdp.pd.read_csv(dataset)
    print(dat.shape[1])
    # num = 50
    # group = tcdp.bestFeatures(dat, num, art='C')
    # res = tcdp.calculateFitness(dat, group, art='C')
    # res, v = tcdp.compute(res)[0]
    # print("For dataset %s which selects %d features from original, its mean score is %f, "
    #       "standard variance is %f" % (str(dataset[5:-4]), num, res, v))
    group = dat.drop(dat.columns[-1], axis=1, inplace=False)
    datapath = 'result/' + dataset[5:-4]
    # datapath = 'result/R' + dataset[7:-4]
    res = tcdp.eval(dat.iloc[:, :-1], dat.iloc[:, -1], art='R')
    res, v = compute(res)[0]
    logger_new.info("For dataset %s, mean score is %f, standard variance is %f"
                    % (str(dataset[5:-4]), res, v))


def relative_absolute_error(y_true: pd.Series, y_pred: pd.Series):
    y_true_mean = y_true.mean()
    n = len(y_true)
    # Relative Absolute Error
    # err = math.sqrt(sum(np.square(y_true - y_pred)) / math.sqrt(sum(np.square(y_true - y_true_mean))))
    err = sum(abs(y_true - y_pred)) / sum(abs(y_true - y_true_mean))
    return err


# Relative absolute error improves as it shrinks, so the scorer must negate
# it: greater_is_better=False tells scikit-learn to do exactly that.
score = make_scorer(relative_absolute_error, greater_is_better=False)

def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    cv = self.cv
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    X, y = indexable(X, y)
    cv = check_cv(cv, X, y, classifier=is_classifier(self.estimator))

    base_estimator = clone(self.estimator)

    out = [_fit_and_score(clone(base_estimator), X, y, self.scorer_,
                          train, test, self.verbose, parameters,
                          self.fit_params, return_parameters=True,
                          error_score=self.error_score)
           for parameters in parameter_iterable
           for train, test in cv]

    self._dask_value = value(out)
    out, = compute(value(out))

    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(_CVScoreTuple(parameters, score, np.array(all_scores)))

    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self

def _calculate_divisions(
    df: DataFrame,
    partition_col: Series,
    repartition: bool,
    npartitions: int,
    upsample: float = 1.0,
    partition_size: float = 128e6,
) -> Tuple[List, List, List]:
    """
    Utility function to calculate divisions for calls to `map_partitions`
    """
    sizes = df.map_partitions(sizeof) if repartition else []
    divisions = partition_col._repartition_quantiles(npartitions, upsample=upsample)
    mins = partition_col.map_partitions(M.min)
    maxes = partition_col.map_partitions(M.max)

    try:
        divisions, sizes, mins, maxes = compute(divisions, sizes, mins, maxes)
    except TypeError as e:
        # When there are nulls and a column is non-numeric, a TypeError is
        # sometimes raised as a result of 1) computing mins/maxes above,
        # 2) every null being switched to NaN, and 3) NaN being a float.
        # Also, Pandas ExtensionDtypes may cause TypeErrors when dealing with
        # special nulls such as pd.NaT or pd.NA. If this happens, we hint the
        # user about eliminating nulls beforehand.
        if not is_numeric_dtype(partition_col.dtype):
            obj, suggested_method = (
                ("column", f"`.dropna(subset=['{partition_col.name}'])`")
                if any(partition_col._name == df[c]._name for c in df)
                else ("series", "`.loc[series[~series.isna()]]`")
            )
            raise NotImplementedError(
                f"Divisions calculation failed for non-numeric {obj} '{partition_col.name}'.\n"
                f"This is probably due to the presence of nulls, which Dask does not entirely support in the index.\n"
                f"We suggest you try with {suggested_method}."
            ) from e
        # For numeric types there shouldn't be problems with nulls, so we
        # re-raise this particular TypeError as-is.
        else:
            raise e

    divisions = methods.tolist(divisions)
    if type(sizes) is not list:
        sizes = methods.tolist(sizes)
    mins = methods.tolist(mins)
    maxes = methods.tolist(maxes)

    empty_dataframe_detected = pd.isna(divisions).all()
    if repartition or empty_dataframe_detected:
        total = sum(sizes)
        npartitions = max(math.ceil(total / partition_size), 1)
        npartitions = min(npartitions, df.npartitions)
        n = len(divisions)
        try:
            divisions = np.interp(
                x=np.linspace(0, n - 1, npartitions + 1),
                xp=np.linspace(0, n - 1, n),
                fp=divisions,
            ).tolist()
        except (TypeError, ValueError):  # str type
            indexes = np.linspace(0, n - 1, npartitions + 1).astype(int)
            divisions = [divisions[i] for i in indexes]
    else:
        # Drop duplicate divisions returned by partition quantiles
        divisions = list(toolz.unique(divisions[:-1])) + [divisions[-1]]

    mins = remove_nans(mins)
    maxes = remove_nans(maxes)
    if pd.api.types.is_categorical_dtype(partition_col.dtype):
        dtype = partition_col.dtype
        mins = pd.Categorical(mins, dtype=dtype).codes.tolist()
        maxes = pd.Categorical(maxes, dtype=dtype).codes.tolist()

    return divisions, mins, maxes

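# Both sort_index above and this helper downsample an oversampled division
# list with np.interp when repartitioning. A minimal sketch of just that
# step, shrinking 9 quantile boundaries down to 4 partitions' worth (the
# numbers are illustrative):
import numpy as np

divisions = [0, 10, 20, 30, 40, 50, 60, 70, 80]  # illustrative quantiles
n, npartitions = len(divisions), 4
resampled = np.interp(
    x=np.linspace(0, n - 1, npartitions + 1),  # where we want boundaries
    xp=np.linspace(0, n - 1, n),               # where the quantiles sit
    fp=divisions,
).tolist()
print(resampled)  # [0.0, 20.0, 40.0, 60.0, 80.0]
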