def test_concat_categorical(known, cat_index, divisions):
    """Check that ``dd.concat`` keeps track of known/unknown categoricals.

    Three pandas frames with categorical ``w`` and ``y`` columns are
    concatenated with pandas and with dask, and the results are compared
    for whole frames, single series and frames with a missing column,
    under both inner and outer joins.

    Parameters
    ----------
    known : bool
        When false, the categories of ``y`` (and of the index) are cleared
        on the first dask frame before concatenating.
    cat_index : bool
        When true, every frame is indexed by its categorical ``y`` column.
    divisions : bool
        Forwarded as ``sort=`` to ``dd.from_pandas`` and as
        ``interleave_partitions=`` to ``dd.concat``.
    """

    def make_frame(w_letter, start, y_letters):
        # Column order (w, x, y, z) is part of the expected schema.
        return pd.DataFrame(
            {
                "w": list(w_letter * 5),
                "x": np.arange(start, start + 5),
                "y": list(y_letters),
                "z": np.arange(start, start + 5, dtype="f8"),
            }
        )

    frames = [
        make_frame("x", 0, "abcbc"),
        make_frame("y", 5, "abbba"),
        make_frame("z", 10, "bcbcc"),
    ]
    for frame in frames:
        for col in ("w", "y"):
            frame[col] = frame[col].astype("category")

    if cat_index:
        frames = [frame.set_index(frame.y) for frame in frames]

    dframes = [
        dd.from_pandas(frame, npartitions=2, sort=divisions) for frame in frames
    ]

    if not known:
        first = dframes[0]
        first._meta = clear_known_categories(first._meta, ["y"], index=True)

    def check_and_return(ddfs, dfs, join):
        expected = concat(dfs, join=join)
        res = dd.concat(ddfs, join=join, interleave_partitions=divisions)
        assert_eq(res, expected)
        if known:
            parts = compute_as_if_collection(
                dd.DataFrame, res.dask, res.__dask_keys__()
            )
            empties = [part.iloc[:0] for part in parts]
            for empty in empties:
                # The comparison itself is the check: it will error if the
                # schemas don't align.
                res._meta == empty
        assert not cat_index or has_known_categories(res.index) == known
        return res

    for join in ("inner", "outer"):
        # Whole frames
        res = check_and_return(dframes, frames, join)
        assert has_known_categories(res.w)
        assert has_known_categories(res.y) == known

        # Categorical series
        res = check_and_return(
            [ddf.y for ddf in dframes], [frame.y for frame in frames], join
        )
        assert has_known_categories(res) == known

        # Non-categorical series carrying a categorical index
        if cat_index:
            res = check_and_return(
                [ddf.x for ddf in dframes], [frame.x for frame in frames], join
            )

        # First partition is missing some columns
        res = check_and_return(
            [dframes[0][["x", "y"]]] + dframes[1:],
            [frames[0][["x", "y"]]] + frames[1:],
            join,
        )
        assert not hasattr(res, "w") or has_known_categories(res.w)
        assert has_known_categories(res.y) == known
def test_read_csv_include_path_column_is_dtype_category(dd_read, files):
    """The path column is a known categorical, lazily and after computing.

    The second read uses ``collection=False`` and checks the first
    computed element of the returned sequence.
    """
    pattern = "2014-01-*.csv"
    with filetexts(files, mode="b"):
        ddf = dd_read(pattern, include_path_column=True)
        assert ddf.path.dtype == "category"
        assert has_known_categories(ddf.path)

        pieces = dd_read(pattern, include_path_column=True, collection=False)
        first = pieces[0].compute()
        assert first.path.dtype == "category"
        assert has_known_categories(first.path)
def test_read_csv_include_path_column_is_dtype_category(dd_read, files):
    """Verify the ``path`` column dtype for collection and non-collection reads.

    Both the lazy collection and the first computed piece of a
    ``collection=False`` read must expose ``path`` as a categorical with
    known categories.
    """
    glob = "2014-01-*.csv"
    with filetexts(files, mode="b"):
        lazy = dd_read(glob, include_path_column=True)
        assert lazy.path.dtype == "category"
        assert has_known_categories(lazy.path)

        parts = dd_read(glob, include_path_column=True, collection=False)
        computed = parts[0].compute()
        assert computed.path.dtype == "category"
        assert has_known_categories(computed.path)
def test_read_csv_include_path_column_is_dtype_category(dd_read, files):
    """The path column stays a known categorical before and after compute."""
    glob = "2014-01-*.csv"
    with filetexts(files, mode="b"):
        lazy = dd_read(glob, include_path_column=True)
        assert lazy.path.dtype == "category"
        assert has_known_categories(lazy.path)

        # Read again and materialise the whole collection this time.
        computed = dd_read(glob, include_path_column=True).compute()
        assert computed.path.dtype == "category"
        assert has_known_categories(computed.path)
def test_read_csv_include_path_column_with_multiple_partitions_per_file(dd_read, files):
    """Path column remains a known categorical when read with a 10-byte blocksize.

    The read must produce more than three partitions.
    """
    glob = "2014-01-*.csv"
    read_options = {"blocksize": "10B", "include_path_column": True}
    with filetexts(files, mode="b"):
        lazy = dd_read(glob, **read_options)
        assert lazy.npartitions > 3
        assert lazy.path.dtype == "category"
        assert has_known_categories(lazy.path)

        # Read again and materialise the whole collection.
        computed = dd_read(glob, **read_options).compute()
        assert computed.path.dtype == "category"
        assert has_known_categories(computed.path)
def test_categorical_dtypes():
    """``read_csv`` with a category dtype yields unknown categories lazily.

    After computing, the categories are the sorted union of the values
    found in both files.
    """
    csv_files = {
        "foo.1.csv": normalize_text(
            """
            fruit,count
            apple,10
            apple,25
            pear,100
            orange,15
            """
        ),
        "foo.2.csv": normalize_text(
            """
            fruit,count
            apple,200
            banana,300
            orange,400
            banana,10
            """
        ),
    }

    with filetexts(csv_files):
        ddf = dd.read_csv("foo.*.csv", dtype={"fruit": "category"}, blocksize=25)
        assert ddf.fruit.dtype == "category"
        assert not has_known_categories(ddf.fruit)

        computed = ddf.compute()
        assert computed.fruit.dtype == "category"
        found = sorted(computed.fruit.cat.categories)
        assert found == ["apple", "banana", "orange", "pear"]
def test_categorical_dtypes():
    """Reading CSVs as ``category`` gives unknown categories until computed."""
    first_csv = normalize_text(
        """
        fruit,count
        apple,10
        apple,25
        pear,100
        orange,15
        """
    )
    second_csv = normalize_text(
        """
        fruit,count
        apple,200
        banana,300
        orange,400
        banana,10
        """
    )
    expected_categories = ["apple", "banana", "orange", "pear"]

    with filetexts({"foo.1.csv": first_csv, "foo.2.csv": second_csv}):
        lazy = dd.read_csv("foo.*.csv", dtype={"fruit": "category"}, blocksize=25)
        assert lazy.fruit.dtype == "category"
        assert not has_known_categories(lazy.fruit)

        materialised = lazy.compute()
        assert materialised.fruit.dtype == "category"
        assert sorted(materialised.fruit.cat.categories) == expected_categories
def test_categorical_dtypes():
    """Category columns from ``read_csv`` are unknown lazily, complete on compute.

    Two CSV files with overlapping fruit names are read with a small
    blocksize; the computed frame must carry every fruit as a category.
    """
    sources = {}
    sources["foo.1.csv"] = normalize_text(
        """
        fruit,count
        apple,10
        apple,25
        pear,100
        orange,15
        """
    )
    sources["foo.2.csv"] = normalize_text(
        """
        fruit,count
        apple,200
        banana,300
        orange,400
        banana,10
        """
    )

    with filetexts(sources):
        frame = dd.read_csv("foo.*.csv", dtype={"fruit": "category"}, blocksize=25)
        assert frame.fruit.dtype == "category"
        assert not has_known_categories(frame.fruit)

        result = frame.compute()
        assert result.fruit.dtype == "category"
        categories = sorted(result.fruit.cat.categories)
        assert categories == ["apple", "banana", "orange", "pear"]
def check_and_return(ddfs, dfs, join):
    """Concatenate with pandas and dask, compare, and return the dask result.

    Relies on ``divisions``, ``known`` and ``cat_index`` from the
    enclosing scope.
    """
    expected = concat(dfs, join=join)
    res = dd.concat(ddfs, join=join, interleave_partitions=divisions)
    assert_eq(res, expected)

    if known:
        parts = compute_as_if_collection(dd.DataFrame, res.dask, res.__dask_keys__())
        empties = [part.iloc[:0] for part in parts]
        for empty in empties:
            # The comparison itself is the check: it will error if the
            # schemas don't align.
            res._meta == empty

    assert not cat_index or has_known_categories(res.index) == known
    return res
def test_append_categorical():
    """``append`` on frames, series and indexes tracks categorical knownness.

    The dask inputs are built from frames indexed by their own categorical
    ``y``; the pandas expectations use copies whose ``y`` categories are
    set to ``a``, ``b``, ``c``.
    """
    frames = [
        pd.DataFrame(
            {
                "x": np.arange(5, 10),
                "y": list("abbba"),
                "z": np.arange(5, 10, dtype="f8"),
            }
        ),
        pd.DataFrame(
            {
                "x": np.arange(10, 15),
                "y": list("bcbcc"),
                "z": np.arange(10, 15, dtype="f8"),
            }
        ),
    ]

    expected_frames = []
    for frame in frames:
        frame["y"] = frame["y"].astype("category")
        # Copy before the original gets its index replaced just below.
        unified = frame.copy()
        unified["y"] = unified["y"].cat.set_categories(list("abc"))
        frame.index = frame["y"]
        expected_frames.append(unified.set_index(unified["y"]))

    df1, df2 = expected_frames

    for known in (True, False):
        dframes = [dd.from_pandas(frame, npartitions=2, sort=False) for frame in frames]
        if not known:
            first = dframes[0]
            first._meta = clear_known_categories(first._meta, ["y"], index=True)
        ddf1, ddf2 = dframes

        # Frames
        res = ddf1.append(ddf2)
        assert_eq(res, df1.append(df2))
        assert has_known_categories(res.index) == known
        assert has_known_categories(res.y) == known

        # Series
        res = ddf1.y.append(ddf2.y)
        assert_eq(res, df1.y.append(df2.y))
        assert has_known_categories(res.index) == known
        assert has_known_categories(res) == known

        # Indexes
        res = ddf1.index.append(ddf2.index)
        assert_eq(res, df1.index.append(df2.index))
        assert has_known_categories(res) == known
def categorize(df, columns=None, index=None, split_every=None, **kwargs):
    """Convert columns of the DataFrame to category dtype.

    The categories are collected from every partition with a tree
    reduction, computed eagerly, and then applied to each partition.

    Parameters
    ----------
    columns : list, optional
        A list of column names to convert to categoricals. By default any
        column with an object dtype is converted to a categorical, and any
        unknown categoricals are made known. A scalar is treated as a
        single column name.
    index : bool, optional
        Whether to categorize the index. By default, object indices are
        converted to categorical, and unknown categorical indices are made
        known. Set True to always categorize the index, False to never.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used.
        Default is 16.
    kwargs
        Keyword arguments are passed on to compute.

    Raises
    ------
    ValueError
        If ``split_every`` is neither None, False, nor an integer >= 2.
    """
    meta = df._meta

    if columns is None:
        columns = list(meta.select_dtypes(["object", "category"]).columns)
    elif is_scalar(columns):
        columns = [columns]

    def _needs_work(name):
        # Known categorical columns are already in their final form.
        col = meta[name]
        return not (is_categorical_dtype(col) and has_known_categories(col))

    columns = [name for name in columns if _needs_work(name)]

    if index is not False:
        if is_categorical_dtype(meta.index):
            index = not has_known_categories(meta.index)
        elif index is None:
            index = meta.index.dtype == object

    # Nothing to do
    if index is False and not len(columns):
        return df

    if split_every is None:
        split_every = 16
    elif split_every is False:
        split_every = df.npartitions
    elif not isinstance(split_every, Integral) or split_every < 2:
        raise ValueError("split_every must be an integer >= 2")

    token = tokenize(df, columns, index, split_every)
    chunk_name = "get-categories-chunk-" + token
    agg_prefix = "get-categories-agg-" + token

    # One task per partition that extracts its categories.
    dsk = {
        (chunk_name, i): (_get_categories, key, columns, index)
        for i, key in enumerate(df.__dask_keys__())
    }

    # Reduce in layers of at most ``split_every`` inputs until one final
    # aggregation can take everything that is left.
    current = chunk_name
    width = df.npartitions
    depth = 0
    while width > split_every:
        level = agg_prefix + str(depth)
        groups = list(partition_all(split_every, range(width)))
        for part_i, inds in enumerate(groups):
            dsk[(level, part_i)] = (
                _get_categories_agg,
                [(current, i) for i in inds],
            )
        width = len(groups)
        current = level
        depth += 1

    final_key = (agg_prefix, 0)
    dsk[final_key] = (_get_categories_agg, [(current, i) for i in range(width)])
    dsk.update(df.dask)

    # Compute the categories
    categories, index = compute_as_if_collection(
        df.__class__, dsk, final_key, **kwargs
    )

    # some operations like get_dummies() rely on the order of categories
    categories = {name: cats.sort_values() for name, cats in categories.items()}

    # Categorize each partition
    return df.map_partitions(_categorize_block, categories, index)
def known(self):
    """Return whether the underlying series has fully known categories."""
    series = self._series
    return has_known_categories(series)
def pivot_table(df, index=None, columns=None, values=None, aggfunc="mean"):
    """
    Create a spreadsheet-style pivot table as a DataFrame.

    Target ``columns`` must have category dtype with known categories so
    that the result's ``columns`` can be inferred. ``index``, ``columns``
    and ``aggfunc`` must all be scalar. ``values`` can be scalar or
    list-like.

    Parameters
    ----------
    df : DataFrame
    index : scalar
        column to be index
    columns : scalar
        column to be columns
    values : scalar or list(scalar)
        column(s) to aggregate
    aggfunc : {'mean', 'sum', 'count', 'first', 'last'}, default 'mean'

    Returns
    -------
    table : DataFrame

    Raises
    ------
    ValueError
        If any argument fails the checks described above.

    See Also
    --------
    pandas.DataFrame.pivot_table
    """
    if index is None or not is_scalar(index):
        raise ValueError("'index' must be the name of an existing column")
    if columns is None or not is_scalar(columns):
        raise ValueError("'columns' must be the name of an existing column")

    target = df[columns]
    if not methods.is_categorical_dtype(target):
        raise ValueError("'columns' must be category dtype")
    if not has_known_categories(target):
        raise ValueError(
            "'columns' must have known categories. Please use "
            "`df[columns].cat.as_known()` beforehand to ensure "
            "known categories"
        )

    scalar_values = is_scalar(values)
    if not scalar_values:
        if not (is_list_like(values) and all(is_scalar(v) for v in values)):
            raise ValueError("'values' must refer to an existing column or columns")

    available_aggfuncs = ["mean", "sum", "count", "first", "last"]
    if not is_scalar(aggfunc) or aggfunc not in available_aggfuncs:
        raise ValueError(
            "aggfunc must be either "
            + ", ".join(f"'{x}'" for x in available_aggfuncs)
        )

    # _emulate can't work for empty data
    # the result must have CategoricalIndex columns
    columns_contents = pd.CategoricalIndex(target.cat.categories, name=columns)
    if scalar_values:
        new_columns = columns_contents
    else:
        new_columns = pd.MultiIndex.from_product(
            (sorted(values), columns_contents), names=[None, columns]
        )

    meta_index = pd.Index(df._meta[index])
    if aggfunc in ["first", "last"]:
        # Infer datatype as non-numeric values are allowed
        if scalar_values:
            meta = pd.DataFrame(
                columns=new_columns, dtype=df[values].dtype, index=meta_index
            )
        else:
            meta = pd.DataFrame(columns=new_columns, index=meta_index)
            value_dtypes = df[values].dtypes
            for value_col in values:
                meta[value_col] = meta[value_col].astype(value_dtypes[value_col])
    else:
        # Use float64 as other aggregate functions require numerical data
        meta = pd.DataFrame(columns=new_columns, dtype=np.float64, index=meta_index)

    kwargs = {"index": index, "columns": columns, "values": values}

    def _reduce(chunk, aggregate, token):
        # Every aggregation differs only in its chunk/aggregate pair and token.
        return apply_concat_apply(
            [df],
            chunk=chunk,
            aggregate=aggregate,
            meta=meta,
            token=token,
            chunk_kwargs=kwargs,
        )

    if aggfunc == "sum":
        return _reduce(methods.pivot_sum, methods.pivot_agg, "pivot_table_sum")
    if aggfunc == "count":
        return _reduce(methods.pivot_count, methods.pivot_agg, "pivot_table_count")
    if aggfunc == "mean":
        pv_sum = _reduce(methods.pivot_sum, methods.pivot_agg, "pivot_table_sum")
        pv_count = _reduce(methods.pivot_count, methods.pivot_agg, "pivot_table_count")
        return pv_sum / pv_count
    if aggfunc == "first":
        return _reduce(
            methods.pivot_first, methods.pivot_agg_first, "pivot_table_first"
        )
    if aggfunc == "last":
        return _reduce(methods.pivot_last, methods.pivot_agg_last, "pivot_table_last")
    raise ValueError
def get_dummies(
    data,
    prefix=None,
    prefix_sep="_",
    dummy_na=False,
    columns=None,
    sparse=False,
    drop_first=False,
    dtype=np.uint8,
    **kwargs,
):
    """
    Convert categorical variable into dummy/indicator variables.

    Data must have category dtype to infer result's ``columns``.
    pandas objects are passed straight through to ``pandas.get_dummies``.

    Parameters
    ----------
    data : Series, or DataFrame
        For Series, the dtype must be categorical.
        For DataFrame, at least one column must be categorical.
    prefix : string, list of strings, or dict of strings, default None
        String to append DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix.`
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy columns should be sparse or not.  Returns
        SparseDataFrame if `data` is a Series or if all columns are included.
        Otherwise returns a DataFrame with some SparseBlocks.

        .. versionadded:: 0.18.2

    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.
    dtype : dtype, default np.uint8
        Data type for new columns. Only a single dtype is allowed.

        .. versionadded:: 0.18.2

    Returns
    -------
    dummies : DataFrame

    Raises
    ------
    NotImplementedError
        If a dask Series or DataFrame holds non-categorical data to encode,
        or categoricals whose categories are unknown.

    Examples
    --------
    Dask's version only works with Categorical data, as this is the only way to
    know the output shape without computing all the data.

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> s = dd.from_pandas(pd.Series(list('abca')), npartitions=2)
    >>> dd.get_dummies(s)
    Traceback (most recent call last):
        ...
    NotImplementedError: `get_dummies` with non-categorical dtypes is not supported...

    With categorical data:

    >>> s = dd.from_pandas(pd.Series(list('abca'), dtype='category'), npartitions=2)
    >>> dd.get_dummies(s)  # doctest: +NORMALIZE_WHITESPACE
    Dask DataFrame Structure:
                       a      b      c
    npartitions=2
    0              uint8  uint8  uint8
    2                ...    ...    ...
    3                ...    ...    ...
    Dask Name: get_dummies, 4 tasks
    >>> dd.get_dummies(s).compute()  # doctest: +ELLIPSIS
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    See Also
    --------
    pandas.get_dummies
    """
    # In-memory pandas data needs none of the categorical checks below.
    if isinstance(data, (pd.Series, pd.DataFrame)):
        return pd.get_dummies(
            data,
            prefix=prefix,
            prefix_sep=prefix_sep,
            dummy_na=dummy_na,
            columns=columns,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
            **kwargs,
        )

    not_cat_msg = (
        "`get_dummies` with non-categorical dtypes is not "
        "supported. Please use `df.categorize()` beforehand to "
        "convert to categorical dtype."
    )
    unknown_cat_msg = (
        "`get_dummies` with unknown categories is not "
        "supported. Please use `column.cat.as_known()` or "
        "`df.categorize()` beforehand to ensure known "
        "categories"
    )

    if isinstance(data, Series):
        if not methods.is_categorical_dtype(data):
            raise NotImplementedError(not_cat_msg)
        if not has_known_categories(data):
            raise NotImplementedError(unknown_cat_msg)
    elif isinstance(data, DataFrame):
        if columns is None:
            # Without an explicit selection, any object column is rejected.
            if (data.dtypes == "object").any():
                raise NotImplementedError(not_cat_msg)
            columns = data._meta.select_dtypes(include=["category"]).columns
        elif any(not methods.is_categorical_dtype(data[c]) for c in columns):
            raise NotImplementedError(not_cat_msg)

        if any(not has_known_categories(data[c]) for c in columns):
            raise NotImplementedError(unknown_cat_msg)

    # Use the ``get_dummies`` of whichever top-level package provides the
    # meta object's class.
    meta_module = type(data._meta).__module__
    package_name = meta_module.partition(".")[0]
    dummies = sys.modules[package_name].get_dummies

    return map_partitions(
        dummies,
        data,
        prefix=prefix,
        prefix_sep=prefix_sep,
        dummy_na=dummy_na,
        columns=columns,
        sparse=sparse,
        drop_first=drop_first,
        dtype=dtype,
        **kwargs,
    )