def get_new_values(self): values = self.values # place the values length, width = self.full_shape stride = values.shape[1] result_width = width * stride result_shape = (length, result_width) # if our mask is all True, then we can use our existing dtype if self.mask.all(): dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) else: dtype, fill_value = _maybe_promote(values.dtype, self.fill_value) new_values = np.empty(result_shape, dtype=dtype) new_values.fill(fill_value) new_mask = np.zeros(result_shape, dtype=bool) # is there a simpler / faster way of doing this? for i in range(values.shape[1]): chunk = new_values[:, i * width:(i + 1) * width] mask_chunk = new_mask[:, i * width:(i + 1) * width] chunk.flat[self.mask] = self.sorted_values[:, i] mask_chunk.flat[self.mask] = True return new_values, new_mask
def get_new_values(self): values = self.values # place the values length, width = self.full_shape stride = values.shape[1] result_width = width * stride result_shape = (length, result_width) # if our mask is all True, then we can use our existing dtype if self.mask.all(): dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) else: dtype, fill_value = _maybe_promote(values.dtype, self.fill_value) new_values = np.empty(result_shape, dtype=dtype) new_values.fill(fill_value) new_mask = np.zeros(result_shape, dtype=bool) # is there a simpler / faster way of doing this? for i in range(values.shape[1]): chunk = new_values[:, i * width : (i + 1) * width] mask_chunk = new_mask[:, i * width : (i + 1) * width] chunk.flat[self.mask] = self.sorted_values[:, i] mask_chunk.flat[self.mask] = True return new_values, new_mask
def get_new_values(self): values = self.values # place the values length, width = self.full_shape stride = values.shape[1] result_width = width * stride result_shape = (length, result_width) mask = self.mask mask_all = mask.all() # we can simply reshape if we don't have a mask if mask_all and len(values): new_values = (self.sorted_values.reshape( length, width, stride).swapaxes(1, 2).reshape(result_shape)) new_mask = np.ones(result_shape, dtype=bool) return new_values, new_mask # if our mask is all True, then we can use our existing dtype if mask_all: dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) else: dtype, fill_value = _maybe_promote(values.dtype, self.fill_value) new_values = np.empty(result_shape, dtype=dtype) new_values.fill(fill_value) new_mask = np.zeros(result_shape, dtype=bool) name = np.dtype(dtype).name sorted_values = self.sorted_values # we need to convert to a basic dtype # and possibly coerce an input to our output dtype # e.g. ints -> floats if needs_i8_conversion(values): sorted_values = sorted_values.view('i8') new_values = new_values.view('i8') name = 'int64' elif is_bool_dtype(values): sorted_values = sorted_values.astype('object') new_values = new_values.astype('object') name = 'object' else: sorted_values = sorted_values.astype(name, copy=False) # fill in our values & mask f = getattr(_reshape, "unstack_{}".format(name)) f(sorted_values, mask.view('u1'), stride, length, width, new_values, new_mask.view('u1')) # reconstruct dtype if needed if needs_i8_conversion(values): new_values = new_values.view(values.dtype) return new_values, new_mask
def fill_value(self, value): if not is_scalar(value): raise ValueError('fill_value must be a scalar') # if the specified value triggers type promotion, raise ValueError new_dtype, fill_value = _maybe_promote(self.dtype, value) if is_dtype_equal(self.dtype, new_dtype): self._fill_value = fill_value else: msg = 'unable to set fill_value {0} to {1} dtype' raise ValueError(msg.format(value, self.dtype))
def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True): """ Specialized Cython take which sets NaN values in one pass """ if indexer is None or (indexer[0] is None and indexer[1] is None): row_idx = np.arange(arr.shape[0], dtype=np.int64) col_idx = np.arange(arr.shape[1], dtype=np.int64) indexer = row_idx, col_idx dtype, fill_value = arr.dtype, arr.dtype.type() else: row_idx, col_idx = indexer if row_idx is None: row_idx = np.arange(arr.shape[0], dtype=np.int64) else: row_idx = _ensure_int64(row_idx) if col_idx is None: col_idx = np.arange(arr.shape[1], dtype=np.int64) else: col_idx = _ensure_int64(col_idx) indexer = row_idx, col_idx if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() mask_info = None, False else: # check for promotion based on types only (do this first because # it's faster than computing a mask) dtype, fill_value = _maybe_promote(arr.dtype, fill_value) if dtype != arr.dtype and (out is None or out.dtype != dtype): # check if promotion is actually required based on indexer if mask_info is not None: (row_mask, col_mask), (row_needs, col_needs) = mask_info else: row_mask = row_idx == -1 col_mask = col_idx == -1 row_needs = row_mask.any() col_needs = col_mask.any() mask_info = (row_mask, col_mask), (row_needs, col_needs) if row_needs or col_needs: if out is not None and out.dtype != dtype: raise TypeError('Incompatible type for fill_value') else: # if not, then depromote, set fill_value to dummy # (it won't be used but we don't want the cython code # to crash when trying to cast it to dtype) dtype, fill_value = arr.dtype, arr.dtype.type() # at this point, it's guaranteed that dtype can hold both the arr values # and the fill_value if out is None: out_shape = len(row_idx), len(col_idx) out = np.empty(out_shape, dtype=dtype) func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None) if func is None and arr.dtype != out.dtype: func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None) if func is not None: func = _convert_wrapper(func, out.dtype) if func is None: def func(arr, indexer, out, fill_value=np.nan): _take_2d_multi_generic(arr, indexer, out, fill_value=fill_value, mask_info=mask_info) func(arr, indexer, out=out, fill_value=fill_value) return out
def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True): """ Specialized Cython take which sets NaN values in one pass Parameters ---------- arr : ndarray Input array indexer : ndarray 1-D array of indices to take, subarrays corresponding to -1 value indicies are filed with fill_value axis : int, default 0 Axis to take from out : ndarray or None, default None Optional output array, must be appropriate type to hold input and fill_value together, if indexer has any -1 value entries; call _maybe_promote to determine this type for any fill_value fill_value : any, default np.nan Fill value to replace -1 values with mask_info : tuple of (ndarray, boolean) If provided, value should correspond to: (indexer != -1, (indexer != -1).any()) If not provided, it will be computed internally if necessary allow_fill : boolean, default True If False, indexer is assumed to contain no -1 values so no filling will be done. This short-circuits computation of a mask. Result is undefined if allow_fill == False and -1 is present in indexer. """ # dispatch to internal type takes if is_categorical(arr): return arr.take_nd(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_datetimetz(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) if indexer is None: indexer = np.arange(arr.shape[axis], dtype=np.int64) dtype, fill_value = arr.dtype, arr.dtype.type() else: indexer = _ensure_int64(indexer) if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() mask_info = None, False else: # check for promotion based on types only (do this first because # it's faster than computing a mask) dtype, fill_value = _maybe_promote(arr.dtype, fill_value) if dtype != arr.dtype and (out is None or out.dtype != dtype): # check if promotion is actually required based on indexer if mask_info is not None: mask, needs_masking = mask_info else: mask = indexer == -1 needs_masking = mask.any() mask_info = mask, needs_masking if needs_masking: if out is not None and out.dtype != dtype: raise TypeError('Incompatible type for fill_value') else: # if not, then depromote, set fill_value to dummy # (it won't be used but we don't want the cython code # to crash when trying to cast it to dtype) dtype, fill_value = arr.dtype, arr.dtype.type() flip_order = False if arr.ndim == 2: if arr.flags.f_contiguous: flip_order = True if flip_order: arr = arr.T axis = arr.ndim - axis - 1 if out is not None: out = out.T # at this point, it's guaranteed that dtype can hold both the arr values # and the fill_value if out is None: out_shape = list(arr.shape) out_shape[axis] = len(indexer) out_shape = tuple(out_shape) if arr.flags.f_contiguous and axis == arr.ndim - 1: # minor tweak that can make an order-of-magnitude difference # for dataframes initialized directly from 2-d ndarrays # (s.t. df.values is c-contiguous and df._data.blocks[0] is its # f-contiguous transpose) out = np.empty(out_shape, dtype=dtype, order='F') else: out = np.empty(out_shape, dtype=dtype) func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info) indexer = _ensure_int64(indexer) func(arr, indexer, out, fill_value) if flip_order: out = out.T return out
def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True): """ Specialized Cython take which sets NaN values in one pass """ if indexer is None or (indexer[0] is None and indexer[1] is None): row_idx = np.arange(arr.shape[0], dtype=np.int64) col_idx = np.arange(arr.shape[1], dtype=np.int64) indexer = row_idx, col_idx dtype, fill_value = arr.dtype, arr.dtype.type() else: row_idx, col_idx = indexer if row_idx is None: row_idx = np.arange(arr.shape[0], dtype=np.int64) else: row_idx = _ensure_int64(row_idx) if col_idx is None: col_idx = np.arange(arr.shape[1], dtype=np.int64) else: col_idx = _ensure_int64(col_idx) indexer = row_idx, col_idx if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() mask_info = None, False else: # check for promotion based on types only (do this first because # it's faster than computing a mask) dtype, fill_value = _maybe_promote(arr.dtype, fill_value) if dtype != arr.dtype and (out is None or out.dtype != dtype): # check if promotion is actually required based on indexer if mask_info is not None: (row_mask, col_mask), (row_needs, col_needs) = mask_info else: row_mask = row_idx == -1 col_mask = col_idx == -1 row_needs = row_mask.any() col_needs = col_mask.any() mask_info = (row_mask, col_mask), (row_needs, col_needs) if row_needs or col_needs: if out is not None and out.dtype != dtype: raise TypeError('Incompatible type for fill_value') else: # if not, then depromote, set fill_value to dummy # (it won't be used but we don't want the cython code # to crash when trying to cast it to dtype) dtype, fill_value = arr.dtype, arr.dtype.type() # at this point, it's guaranteed that dtype can hold both the arr values # and the fill_value if out is None: out_shape = len(row_idx), len(col_idx) out = np.empty(out_shape, dtype=dtype) func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None) if func is None and arr.dtype != out.dtype: func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None) if func is not None: func = _convert_wrapper(func, out.dtype) if func is None: def func(arr, indexer, out, fill_value=np.nan): _take_2d_multi_object(arr, indexer, out, fill_value=fill_value, mask_info=mask_info) func(arr, indexer, out=out, fill_value=fill_value) return out