def array_to_series(array):
    """Build a ``Series`` from a pyarrow array-like object.

    Chunked inputs (``pa.ChunkedArray`` or ``pa.Column``) are converted
    chunk by chunk through a recursive call and the pieces are concatenated
    with ``Series._concat``.

    For a single array the device buffers come from ``make_device_arrays``;
    the first buffer is used as the validity mask and the second as the
    data (for string arrays the second and third buffers are used as
    offsets and character data).  Dictionary arrays become a
    ``CategoricalColumn`` and string arrays become an ``nvstrings`` object.

    Parameters
    ----------
    array : pa.Array, pa.ChunkedArray or pa.Column
        The Arrow data to convert.

    Returns
    -------
    Series
        The converted series, with the mask attached when the array has
        nulls and the series does not already carry a null mask.
    """
    # Chunked containers: convert every chunk, then stitch them together.
    if isinstance(array, pa.ChunkedArray):
        chunks = array.chunks
    elif isinstance(array, pa.Column):
        chunks = array.data.chunks
    else:
        chunks = None
    if chunks is not None:
        return Series._concat([array_to_series(chunk) for chunk in chunks])

    array_len = len(array)
    null_count = array.null_count
    buffers = make_device_arrays(array)
    mask = buffers[0]
    data = buffers[1]
    dtype = arrow_to_pandas_dtype(array.type)
    arrow_type = array.type

    if pa.types.is_dictionary(arrow_type):
        from cudf.dataframe import CategoricalColumn

        codes = array_to_series(array.indices)
        categories = array_to_series(array.dictionary)
        data = CategoricalColumn(
            data=codes.data,
            mask=mask,
            null_count=null_count,
            categories=categories,
            ordered=arrow_type.ordered,
        )
    elif pa.types.is_string(arrow_type):
        import nvstrings

        # Only the offsets belonging to this (possibly sliced) array are
        # passed on: ``array_len + 1`` entries starting at ``array.offset``.
        start = array.offset
        offsets = buffers[1][start : start + array_len + 1]
        chars = buffers[2]
        if chars is not None:
            chars = chars.device_ctypes_pointer.value
        # ``mask`` is deliberately rebound to the raw pointer value here; the
        # null-mask check at the end of the function sees this rebound value.
        if mask is not None:
            mask = mask.device_ctypes_pointer.value
        data = nvstrings.from_offsets(
            chars,
            offsets.device_ctypes_pointer.value,
            array_len,
            mask,
            null_count,
            True,
        )
    elif data is not None:
        start = array.offset
        data = data[start : start + array_len]

    series = Series(data, dtype=dtype)
    needs_mask = (
        null_count > 0 and mask is not None and not series.has_null_mask
    )
    if needs_mask:
        return series.set_mask(mask, null_count)
    return series
def _tile(A, reps):
    """Repeat the series ``A`` end to end ``reps`` times.

    Parameters
    ----------
    A : Series
        Series to repeat.
    reps : int
        Number of copies to concatenate.

    Returns
    -------
    Series
        The concatenation of ``reps`` copies of ``A`` when ``reps`` is
        positive; otherwise a series built from ``Buffer.null`` with the
        dtype of ``A``.
    """
    copies = [A] * reps
    if reps > 0:
        return Series._concat(objs=copies, index=None)
    # Nothing to concatenate: fall back to a null buffer of matching dtype.
    return Series(Buffer.null(dtype=A.dtype))
def _melt_resolve_columns(frame, columns, arg_name):
    """Normalize a ``melt`` column selector to a list and validate it.

    Parameters
    ----------
    frame : DataFrame
        Frame whose ``columns`` the selector must refer to.
    columns : scalar, list-like or None
        The user-supplied ``id_vars`` / ``value_vars`` argument.
    arg_name : str
        Argument name used in the error message.

    Returns
    -------
    list or None
        ``None`` when ``columns`` is ``None``; otherwise a list of column
        names.

    Raises
    ------
    KeyError
        If any requested column is not present in ``frame``.
    """
    if columns is None:
        return None

    # A string is a single column name, never a sequence of one-character
    # names; ``is_list_like`` is False for strings and True for list, tuple
    # and ndarray inputs.
    if isinstance(columns, np.ndarray):
        # ``tolist`` yields native Python scalars rather than NumPy ones.
        columns = columns.tolist()
    elif pd.api.types.is_list_like(columns):
        columns = list(columns)
    else:
        columns = [columns]

    missing = set(columns) - set(frame.columns)
    if missing:
        raise KeyError(
            "The following '{arg_name}' are not present"
            " in the DataFrame: {missing}"
            "".format(arg_name=arg_name, missing=list(missing)))
    return columns


def melt(frame, id_vars=None, value_vars=None, var_name='variable',
         value_name='value'):
    """Unpivots a DataFrame from wide format to long format,
    optionally leaving identifier variables set.

    Parameters
    ----------
    frame : DataFrame
    id_vars : scalar, tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
        default: None
    value_vars : scalar, tuple, list, or ndarray, optional
        Column(s) to unpivot.
        default: all columns that are not set as `id_vars`.
    var_name : scalar
        Name to use for the `variable` column.
        default: 'variable'
    value_name : str
        Name to use for the `value` column.
        default: 'value'

    Returns
    -------
    out : DataFrame
        Melted result

    Raises
    ------
    KeyError
        If a name in `id_vars` or `value_vars` is not a column of `frame`,
        or if `id_vars` and `value_vars` overlap.
    NotImplementedError
        If any selected column has a categorical dtype.
    ValueError
        If the `value_vars` columns do not all share one dtype.

    Difference from pandas:
     * Does not support 'col_level' because cuDF does not have multi-index

    Examples
    --------

    .. code-block:: python

        import cudf
        import numpy as np

        df = cudf.DataFrame({'A': {0: 1, 1: 1, 2: 5},
                             'B': {0: 1, 1: 3, 2: 6},
                             'C': {0: 1.0, 1: np.nan, 2: 4.0},
                             'D': {0: 2.0, 1: 5.0, 2: 6.0}})
        df2 = cudf.melt(frame=df, id_vars=['A', 'B'], value_vars=['C', 'D'])
        print(df2)

    Output:

    .. code-block:: python

             A    B variable value
        0    1    1        C   1.0
        1    1    3        C
        2    5    6        C   4.0
        3    1    1        D   2.0
        4    1    3        D   5.0
        5    5    6        D   6.0

    """
    from collections import OrderedDict

    # Arg cleaning
    id_vars = _melt_resolve_columns(frame, id_vars, 'id_vars')
    if id_vars is None:
        id_vars = []

    value_vars = _melt_resolve_columns(frame, value_vars, 'value_vars')
    if value_vars is None:
        # then all remaining columns in frame
        value_vars = list(frame.columns.drop(id_vars))

    # Error for unimplemented support for datatype
    dtypes = [frame[col].dtype for col in id_vars + value_vars]
    if any(pd.api.types.is_categorical_dtype(t) for t in dtypes):
        raise NotImplementedError('Categorical columns are not yet '
                                  'supported for function')

    # Check dtype homogeneity in value_var
    # Because heterogeneous concat is unimplemented
    value_dtypes = [frame[col].dtype for col in value_vars]
    if value_dtypes and any(t != value_dtypes[0] for t in value_dtypes):
        raise ValueError('all cols in value_vars must have the same dtype')

    # overlap
    overlap = set(id_vars).intersection(value_vars)
    if overlap:
        raise KeyError(
            "'value_vars' and 'id_vars' cannot have overlap."
            " The following 'value_vars' are ALSO present"
            " in 'id_vars': {overlap}"
            "".format(overlap=list(overlap)))

    N = len(frame)
    K = len(value_vars)

    def _tile(A, reps):
        series_list = [A] * reps
        if reps > 0:
            return Series._concat(objs=series_list, index=None)
        else:
            return Series(Buffer.null(dtype=A.dtype))

    # Step 1: tile id_vars
    mdata = OrderedDict()
    for col in id_vars:
        mdata[col] = _tile(frame[col], K)

    # Step 2: add variable
    # Codes run from 0 to K - 1.  Pick the narrowest signed integer type that
    # can hold them (int8 up to 128 value columns) so that wide frames do not
    # overflow the code buffer.
    code_dtype = np.min_scalar_type(-max(K, 1)).type
    var_cols = [
        Series(Buffer(cudautils.full(size=N, value=i, dtype=code_dtype)))
        for i in range(K)
    ]
    temp = Series._concat(objs=var_cols, index=None)
    mdata[var_name] = Series(CategoricalColumn(
        categories=tuple(value_vars), data=temp._column.data, ordered=False))

    # Step 3: add values
    mdata[value_name] = Series._concat(
        objs=[frame[val] for val in value_vars],
        index=None)

    return DataFrame(mdata)