def _create_df_multiple_dtypes(self, arr_new, columns, column_locs, columns_other, locs_other):
    """
    Assemble a new DataFrame when only some accessor-dtype columns changed kind.

    Parameters
    ----------
    arr_new : ndarray
        Converted values; its ``dtype.kind`` is the destination kind.
    columns : sequence of str
        Names of the columns that were converted.
    column_locs : sequence
        Iterated in lockstep with `columns`; the values themselves are unused.
    columns_other : sequence of str
        Names of the columns that keep the accessor dtype.
    locs_other : indexer
        Column indexer applied to the accessor-dtype array to keep the
        unconverted columns.

    Returns
    -------
    Whatever ``self._df._construct_from_new`` returns.
    """
    frame = self._df
    acc_kind = self._dtype_acc
    kind_new = arr_new.dtype.kind

    # Converted columns are appended after any existing columns of that kind
    if kind_new in frame._data:
        offset = frame._data[kind_new].shape[1]
    else:
        offset = 0

    blocks = {}
    for kind, block in frame._data.items():
        if kind == acc_kind:
            blocks[acc_kind] = block[:, locs_other]
        elif kind == kind_new:
            stacked = np.column_stack((block, arr_new))
            blocks[kind_new] = np.asfortranarray(stacked)
        else:
            blocks[kind] = block.copy('F')
    blocks.setdefault(kind_new, arr_new)

    # Columns of every other kind keep their position untouched
    info = {
        col: utils.Column(kind, loc, order)
        for col, kind, loc, order in frame._col_info_iter(with_order=True)
        if kind != acc_kind
    }

    # Columns that changed kind
    for i, (col, _) in enumerate(zip(columns, column_locs)):
        info[col] = utils.Column(kind_new, offset + i,
                                 frame._column_info[col].order)

    # Columns that stayed with the accessor dtype are renumbered from zero
    for i, col in enumerate(columns_other):
        info[col] = utils.Column(acc_kind, i, frame._column_info[col].order)

    return frame._construct_from_new(blocks, info, frame._columns.copy())
def _create_df_all(self, arr, dtype):
    """
    Build a new DataFrame after every accessor-dtype column was transformed.

    Parameters
    ----------
    arr : ndarray
        Transformed values, one column per accessor-dtype column, in the
        same column order as the original accessor-dtype array.
    dtype : str
        Kind character of `arr`.

    Returns
    -------
    Whatever ``self._df._construct_from_new`` returns.

    Notes
    -----
    When `dtype` equals the accessor dtype the columns stay where they are,
    so the column info is copied as is. Otherwise the accessor-dtype array
    is dropped, `arr` is appended to the right of any existing data of kind
    `dtype`, and the converted columns are re-pointed at their new kind and
    location.
    """
    frame = self._df
    acc_kind = self._dtype_acc

    if dtype == acc_kind:
        new_data = {}
        for old_dtype, old_data in frame._data.items():
            if old_dtype == acc_kind:
                new_data[acc_kind] = arr
            else:
                new_data[old_dtype] = old_data.copy('F')
        new_column_info = frame._copy_column_info()
    else:
        # Converted columns land after the existing columns of the new kind
        add_loc = frame._data[dtype].shape[1] if dtype in frame._data else 0

        # Carry over every array except the accessor-dtype one, whose
        # columns now live in `arr`
        new_data = {
            old_dtype: old_data.copy('F')
            for old_dtype, old_data in frame._data.items()
            if old_dtype != acc_kind
        }
        if dtype in new_data:
            new_data[dtype] = np.asfortranarray(
                np.column_stack((new_data[dtype], arr)))
        else:
            new_data[dtype] = arr

        new_column_info = {}
        for col, col_obj in frame._column_info.items():
            old_dtype, loc, order = col_obj.values
            if old_dtype == acc_kind:
                new_column_info[col] = utils.Column(dtype, loc + add_loc, order)
            else:
                new_column_info[col] = utils.Column(old_dtype, loc, order)

    return frame._construct_from_new(new_data, new_column_info,
                                     frame._columns.copy())
def _create_df_all(self, arr, dtype):
    """
    Build a new DataFrame after every 'S' (byte-string) column was transformed.

    Parameters
    ----------
    arr : ndarray
        Transformed values, one column per 'S' column, in the same column
        order as the original 'S' array.
    dtype : str
        Kind character of `arr`.

    Returns
    -------
    Whatever ``self._df._construct_from_new`` returns.

    Notes
    -----
    When `dtype` is 'S' the columns stay where they are, so the column info
    is copied as is. Otherwise the 'S' array is dropped, `arr` is appended
    to the right of any existing data of kind `dtype`, and the converted
    columns are re-pointed at their new kind and location.
    """
    frame = self._df

    if dtype == 'S':
        new_data = {}
        for old_dtype, old_data in frame._data.items():
            if old_dtype == 'S':
                new_data['S'] = arr
            else:
                new_data[old_dtype] = old_data.copy('F')
        new_column_info = frame._copy_column_info()
    else:
        # Converted columns land after the existing columns of the new kind
        add_loc = frame._data[dtype].shape[1] if dtype in frame._data else 0

        # Carry over every array except the 'S' one, whose columns now
        # live in `arr`
        new_data = {
            old_dtype: old_data.copy('F')
            for old_dtype, old_data in frame._data.items()
            if old_dtype != 'S'
        }
        if dtype in new_data:
            new_data[dtype] = np.asfortranarray(
                np.column_stack((new_data[dtype], arr)))
        else:
            new_data[dtype] = arr

        new_column_info = {}
        for col, old_dtype, loc, order in frame._col_info_iter(with_order=True):
            if old_dtype == 'S':
                new_column_info[col] = utils.Column(dtype, loc + add_loc, order)
            else:
                new_column_info[col] = utils.Column(old_dtype, loc, order)

    return frame._construct_from_new(new_data, new_column_info,
                                     frame._columns.copy())
def apply(self, func, *args, **kwargs):
    """
    Run `func` over the groups through ``_gb.apply`` and prepend the
    grouping columns to the result.

    Parameters
    ----------
    func : callable
        Forwarded to ``_gb.apply`` together with ``*args`` and ``**kwargs``.

    Returns
    -------
    DataFrame
        Grouping columns first (suffixed with ``'_group'`` when the applied
        result already has a column of that name), followed by the columns
        produced by ``_gb.apply``.

    Raises
    ------
    TypeError
        If `func` is not callable.
    """
    if not isinstance(func, Callable):
        raise TypeError(
            'The `func` variable must be a function or any callable object'
        )

    group_labels = self._group_labels
    n_groups = len(self._group_position)
    applied = _gb.apply(group_labels, n_groups, self._df, func, *args, **kwargs)
    new_data, applied_info, applied_columns, group_repeats = applied

    group_data = self._get_group_col_data()
    group_info = self._get_new_column_info()
    group_columns = self._group_columns.copy()
    n_group_cols = len(group_columns)

    # Applied columns sit to the right of the grouping columns, both within
    # their dtype array and in the overall column order
    final_info = {}
    for col in applied_columns:
        kind, loc, order = applied_info[col].values
        group_arrs = group_data.get(kind, 0)
        shift = group_arrs[0].shape[1] if group_arrs != 0 else 0
        final_info[col] = utils.Column(kind, loc + shift, order + n_group_cols)

    # Avoid name clashes between grouping columns and applied columns
    shown_names = [
        col + '_group' if col in final_info else col
        for col in group_columns
    ]

    next_loc = defaultdict(int)
    for position, (col, shown) in enumerate(zip(group_columns, shown_names)):
        kind = group_info[col].dtype
        final_info[shown] = utils.Column(kind, next_loc[kind], position)
        next_loc[kind] += 1

    all_columns = np.concatenate((shown_names, applied_columns))

    for kind, arrs in group_data.items():
        # One row per group is expanded to match the rows each group produced
        block = np.repeat(np.concatenate(arrs, 1), group_repeats, axis=0)
        if kind in new_data:
            new_data[kind] = np.concatenate((block, new_data[kind]), 1)
        else:
            new_data[kind] = block

    return DataFrame._construct_from_new(new_data, final_info, all_columns)
def _cov_corr(self, name: str) -> DataFrame:
    """
    Shared driver for the grouped statistics dispatched as
    ``_gb.<name>_<dtype word>``.

    Parameters
    ----------
    name : str
        Prefix of the ``_gb`` routine to call.

    Returns
    -------
    DataFrame
        Grouping columns, then a ``'Column Name'`` object column, then one
        float column per non-grouping column of kind 'f', 'i' or 'b'.
    """
    frame = self._df

    # Collect the non-grouping numeric/boolean columns
    stat_columns: List[str] = []
    stat_dtype_loc: List[Tuple[str, int]] = []
    has_float = False
    for col in frame._columns:
        if col in self._group_columns:
            continue
        kind, loc, _ = frame._column_info[col].values
        if kind in 'fib':
            has_float = has_float or kind == 'f'
            stat_columns.append(col)
            stat_dtype_loc.append((kind, loc))
    np_dtype = 'float64' if has_float else 'int64'

    values = frame._values_number_drop(stat_columns, stat_dtype_loc, np_dtype)
    routine = getattr(
        _gb, name + '_' + utils.convert_kind_to_dtype(values.dtype.kind))
    result = routine(self._group_labels, len(self), values, [])

    # Each group row is repeated once per statistic column
    n_stat = len(stat_columns)
    arrays_by_kind: Dict[str, List[ndarray]] = defaultdict(list)
    for kind, arrs in self._get_group_col_data().items():
        arrays_by_kind[kind] = [np.repeat(arrs[0], n_stat, axis=0)]

    new_column_info = self._get_new_column_info()
    n_group_cols = len(self._group_columns)
    new_columns = self._group_columns.copy()

    # Label column naming the row variable of each result row
    label_loc = utils.get_num_cols(arrays_by_kind.get('O', []))
    labels = np.tile(stat_columns, len(self))[:, np.newaxis]
    arrays_by_kind['O'].append(labels)
    new_columns.append('Column Name')
    new_column_info['Column Name'] = utils.Column('O', label_loc, n_group_cols)

    float_start = utils.get_num_cols(arrays_by_kind.get('f', []))
    for offset, col in enumerate(stat_columns):
        new_column_info[col] = utils.Column(
            'f', float_start + offset, n_group_cols + 1 + offset)
        new_columns.append(col)
    arrays_by_kind['f'].append(result)

    new_data = utils.concat_stat_arrays(arrays_by_kind)
    return DataFrame._construct_from_new(new_data, new_column_info, new_columns)
def _create_df(self, arr, dtype, columns):
    """
    Wrap a single array of one kind into a new DataFrame.

    Parameters
    ----------
    arr : ndarray
        Data stored under the key `dtype`.
    dtype : str
        Kind character used as the data-dictionary key.
    columns : iterable of str
        Column names; the i-th name gets location i and order i.

    Returns
    -------
    Whatever ``self._df._construct_from_new`` returns.
    """
    new_column_info = {}
    for position, col in enumerate(columns):
        new_column_info[col] = utils.Column(dtype, position, position)
    return self._df._construct_from_new({dtype: arr}, new_column_info, columns)
def size(self):
    """
    Return the grouping columns plus one integer column produced by
    ``_gb.size``.

    The new column is named by ``self._get_agg_name('size')`` and is placed
    last, both in the integer array and in the column order.
    """
    agg_name = self._get_agg_name('size')
    all_columns = np.array(self._group_columns + [agg_name], dtype='O')

    counts = _gb.size(self._group_labels, len(self._group_position))
    counts = counts[:, np.newaxis]

    arrays_by_kind = self._get_group_col_data()
    arrays_by_kind['i'].append(counts)
    new_data = utils.concat_stat_arrays(arrays_by_kind)

    new_column_info = self._get_new_column_info()
    last_int_loc = new_data['i'].shape[1] - 1
    last_order = len(all_columns) - 1
    new_column_info[agg_name] = utils.Column('i', last_int_loc, last_order)

    return DataFrame._construct_from_new(new_data, new_column_info, all_columns)
def cumcount(self) -> DataFrame:
    """
    Return the grouping columns plus one integer column produced by
    ``_gb.cumcount``.

    The new column is named by ``self._get_agg_name('cumcount')`` and is
    placed last, both in the integer array and in the column order.
    """
    # todo: add ascending=False
    agg_name = self._get_agg_name('cumcount')
    all_columns = np.array(self._group_columns + [agg_name], dtype='O')

    running = _gb.cumcount(self._group_labels, len(self._group_position))
    running = running[:, np.newaxis]

    arrays_by_kind = self._get_group_col_data_all()
    arrays_by_kind['i'].append(running)
    new_data = utils.concat_stat_arrays(arrays_by_kind)

    new_column_info = self._get_new_column_info()
    last_int_loc = new_data['i'].shape[1] - 1
    last_order = len(all_columns) - 1
    new_column_info[agg_name] = utils.Column('i', last_int_loc, last_order)

    return DataFrame._construct_from_new(new_data, new_column_info, all_columns)
def read_csv(fp, sep=',', header=0, skiprows=None, usecols=None):
    """
    Read a delimited text file into a DataFrame via ``_rf.read_csv``.

    Parameters
    ----------
    fp :
        Passed unchanged to ``_get_file_legnth`` and ``_rf.read_csv``.
    sep : str
        Exactly one character; forwarded as ``ord(sep)``.
    header : int
        Must be >= -1. When `skiprows` is a sequence and `header` is not -1,
        the value is adjusted below before being forwarded.
    skiprows : None, int or sequence of int
        Non-negative. An int is forwarded as is; a sequence is forwarded as
        a set.
    usecols : None or non-empty list
        Forwarded unchanged.

    Returns
    -------
    DataFrame

    Raises
    ------
    TypeError
        If `sep` is not a str, `header` is not an int, or `usecols` is
        neither None nor a list.
    ValueError
        If `sep` is not one character, `header` < -1, `usecols` is an empty
        list, or any `skiprows` value is negative.
    """
    if not isinstance(sep, str):
        raise TypeError('`sep` must be a string')
    if len(sep) != 1:
        raise ValueError('`sep` must only be one character in length')

    if not isinstance(header, int):
        raise TypeError('`header` must be an integer')
    if header < -1:
        raise ValueError('`header` must be greater than or equal to -1')

    if isinstance(usecols, list):
        if len(usecols) == 0:
            raise ValueError('`usecols` must be a non-empty list of integers or column names')
    elif usecols is not None:
        raise TypeError('`usecols` must be a list of integers or column names')

    nrows = _get_file_legnth(fp)

    # Exactly one of these carries the skip information to the reader
    skiprows_set = set()
    skiprows_int = 0
    if skiprows is None:
        pass
    elif isinstance(skiprows, int):
        if skiprows < 0:
            raise ValueError('`skiprows` must be one or more non-negative integers')
        skiprows_int = skiprows
    else:
        skiprows_arr = np.asarray(skiprows)
        if (skiprows_arr < 0).any():
            raise ValueError('All values in the `skiprows` sequence must be >= 0')
        if header == -1:
            skiprows_set = set(skiprows_arr)
        else:
            # NOTE(review): this appears to translate `header` from an index
            # among the kept rows into a row number of the file - confirm
            max_row = skiprows_arr.max()
            if header > max_row - len(skiprows_arr):
                header += len(skiprows_arr)
            else:
                max_rows = np.arange(max_row)
                kept_rows = max_rows[~np.isin(max_rows, skiprows_arr)]
                header = kept_rows[header]
            # Only skipped rows located after the header row are forwarded
            skiprows_set = set(skiprows_arr[skiprows_arr > header])

    tuple_return = _rf.read_csv(fp, nrows, ord(sep), header, skiprows_int, skiprows_set, usecols)
    a_bool, a_int, a_float, a_str, columns, dtypes, dtype_loc = tuple_return

    new_column_info = {}
    # Integer dtype codes returned by the reader -> kind characters
    dtype_map = {1: 'b', 2: 'i', 3: 'f', 4: 'O'}
    final_dtype_locs = defaultdict(list)
    for i, (col, dtype, loc) in enumerate(zip(columns, dtypes, dtype_loc)):
        new_column_info[col] = utils.Column(dtype_map[dtype], loc, i)
        final_dtype_locs[dtype_map[dtype]].append(loc)

    new_data = {}
    loc_order_changed = set()
    for arr, dtype in zip((a_bool, a_int, a_float, a_str), ('b', 'i', 'f', 'O')):
        num_cols = arr.shape[1]
        if num_cols != 0:
            locs = final_dtype_locs[dtype]
            if len(locs) == num_cols:
                new_data[dtype] = arr
            else:
                # The array has more columns than were assigned to this
                # kind: keep only the referenced ones, in column order
                loc_order_changed.add(dtype)
                new_data[dtype] = arr[:, locs]

    if loc_order_changed:
        # Renumber locations sequentially to match the subset arrays
        cur_dtype_loc = defaultdict(int)
        for col in columns:
            dtype, loc, order = new_column_info[col].values
            if dtype in loc_order_changed:
                new_column_info[col].loc = cur_dtype_loc[dtype]
                cur_dtype_loc[dtype] += 1

    new_columns = np.array(columns, dtype='O')
    return DataFrame._construct_from_new(new_data, new_column_info, new_columns)
def _roll_generic(self, name, columns, **kwargs): if columns is None: columns = self._df.columns elif isinstance(columns, str): columns = [columns] elif not isinstance(columns, list): raise TypeError( '`columns` must either be a string, a list of column names, or None' ) col_order = dict(zip(columns, range(len(columns)))) dtype_locs = defaultdict(list) dtype_cols = defaultdict(list) col_info = self._df._column_info for i, col in enumerate(columns): try: dtype, loc, order = col_info[col].values except KeyError: raise KeyError(f'{col} is not a column name') dtype_locs[dtype].append(loc) dtype_cols[dtype].append(col) kept_dtype_loc = defaultdict(list) new_col_info = {} dtype_ct = defaultdict(int) for i, col in enumerate(self._kept_columns): dtype, loc, _ = col_info[col].values new_loc = len(kept_dtype_loc[dtype]) kept_dtype_loc[dtype].append(loc) new_col_info[col] = utils.Column(dtype, new_loc, i) dtype_ct[dtype] += 1 data_dict = defaultdict(list) for dtype, locs in dtype_locs.items(): func_name = name + '_' + utils.convert_kind_to_dtype_generic(dtype) data = self._df._data[dtype] result = getattr(_roll, func_name)(data, np.array(locs), self._left, self._right, self._min_window, **kwargs) result_dtype = result.dtype.kind data_dict[result_dtype].append(result) for col in dtype_cols[dtype]: order = col_order[col] new_col = col if col in self._kept_columns: new_col = col + '_rolling' columns[columns.index(col)] = new_col new_col_info[new_col] = utils.Column( result_dtype, dtype_ct[result_dtype], order + len(self._kept_columns)) dtype_ct[result_dtype] += 1 new_data = {} for dtype, locs in kept_dtype_loc.items(): data = self._df._data[dtype][:, locs] if data.ndim == 1: data = data[:, np.newaxis] new_data[dtype] = data for dtype, data in data_dict.items(): if dtype not in new_data: new_data[dtype] = np.column_stack((*data, )) else: new_data[dtype] = np.column_stack((new_data[dtype], *data)) new_columns = np.concatenate((self._kept_columns, columns)) return 
DataFrame._construct_from_new(new_data, new_col_info, new_columns)
def _roll_agg(self,
              agg_cols: Dict = None,
              new_names: Dict = None,
              new_order: Dict = None,
              num_agg_cols: int = None,
              func_kwargs: Dict = None):
    """
    Run several rolling aggregations and combine them with the kept columns.

    Parameters
    ----------
    agg_cols : dict
        Maps an aggregation (a name string or a callable) to the list of
        columns it is applied to.
    new_names : dict
        Maps the same keys to the output names, aligned with `agg_cols`.
    new_order : dict
        Maps the same keys to output positions (relative to the end of the
        kept columns), aligned with `agg_cols`.
    num_agg_cols : int
        Number of placeholder slots reserved for aggregated columns.
    func_kwargs : dict
        Maps the same keys to a list of per-column kwargs (or None),
        aligned with `agg_cols`.

    Returns
    -------
    DataFrame
    """
    col_info = self._df._column_info
    # Kept columns are packed to the front of each dtype array
    kept_dtype_loc = defaultdict(list)
    new_column_info = {}
    dtype_ct = defaultdict(int)
    for i, col in enumerate(self._kept_columns):
        dtype, loc, _ = col_info[col].values
        new_loc = len(kept_dtype_loc[dtype])
        kept_dtype_loc[dtype].append(loc)
        new_column_info[col] = utils.Column(dtype, new_loc, i)
        dtype_ct[dtype] += 1

    data_dict = defaultdict(list)
    # Placeholders are overwritten by position as results come in
    new_columns = self._kept_columns.copy() + [''] * num_agg_cols

    # NOTE: the loop variable `agg_cols` shadows the parameter of the same name
    for name, agg_cols in agg_cols.items():

        agg_dtype_locs = defaultdict(list)
        agg_dtype_names = defaultdict(list)
        agg_dtype_new_names = defaultdict(list)
        agg_dtype_order = defaultdict(list)
        non_agg_dtype_locs = defaultdict(list)
        agg_dtype_kwargs = defaultdict(list)

        if isinstance(name, str):
            # name can also be a custom function
            name_kwargs = get_func_kwargs(name)
            ignore_str = name_kwargs.get('ignore_str', True)
            ignore_date = name_kwargs.get('ignore_date', True)
            keep_date_type = name_kwargs.get('keep_date_type', True)
        else:
            ignore_str = False
            ignore_date = False
            keep_date_type = True

        cur_new_names = new_names[name]
        cur_new_order = new_order[name]
        kwargs_list = func_kwargs[name]

        # Split the frame's columns, per dtype, into aggregated / not aggregated
        for col in self._df._columns:
            dtype, loc, _ = self._df._column_info[col].values
            try:
                idx = agg_cols.index(col)
            except ValueError:
                non_agg_dtype_locs[dtype].append(loc)
            else:
                agg_dtype_locs[dtype].append(loc)
                agg_dtype_names[dtype].append(col)
                agg_dtype_new_names[dtype].append(cur_new_names[idx])
                agg_dtype_order[dtype].append(cur_new_order[idx])
                agg_dtype_kwargs[dtype].append(kwargs_list[idx])

        for dtype, data in self._df._data.items():
            if dtype not in agg_dtype_locs:
                continue
            if ignore_str and dtype == 'O':
                continue
            if ignore_date and dtype in 'mM':
                continue

            # Date/time data is handed to the routine as int64
            if dtype in 'mM':
                data = data.view('int64')

            # The first non-None kwargs dict among this dtype's columns is
            # used for the whole dtype
            kwargs = {}
            for kw in agg_dtype_kwargs[dtype]:
                if kw is not None:
                    kwargs = kw
                    break

            if isinstance(name, str):
                func_name = name + '_' + utils.convert_kind_to_dtype_generic(
                    dtype)
            else:
                func_name = 'custom_' + utils.convert_kind_to_dtype_generic(
                    dtype)
                # 'name' is actually a function here
                # NOTE(review): when `kwargs` came from `func_kwargs`, these
                # assignments write into the caller's dict - confirm intended
                kwargs['func'] = name
                kwargs['col_dict'] = dict(
                    zip(agg_dtype_locs[dtype], agg_dtype_names[dtype]))

            func = getattr(_roll, func_name)

            arr = func(data, np.array(agg_dtype_locs[dtype]), self._left,
                       self._right, self._min_window, **kwargs)

            if dtype in 'mM' and keep_date_type:
                new_kind = dtype
                arr = arr.astype(utils.convert_kind_to_dtype(dtype))
            else:
                new_kind = arr.dtype.kind

            # Results follow the kept columns and earlier results of this kind
            cur_loc = utils.get_num_cols(data_dict.get(
                new_kind, [])) + dtype_ct[new_kind]
            data_dict[new_kind].append(arr)

            # presumably the routine returns its columns sorted by original
            # location; names and orders are realigned to that - TODO confirm
            old_locs = agg_dtype_locs[dtype]
            order = np.argsort(old_locs).tolist()

            cur_names = np.array(agg_dtype_new_names[dtype])[order]
            cur_order = len(self._kept_columns) + np.array(
                agg_dtype_order[dtype])[order]

            for i, cur_name in enumerate(cur_names):
                new_column_info[cur_name] = utils.Column(
                    new_kind, cur_loc + i, cur_order[i])
                new_columns[cur_order[i]] = cur_name

    new_data = {}
    for dtype, locs in kept_dtype_loc.items():
        data = self._df._data[dtype][:, locs]
        if data.ndim == 1:
            data = data[:, np.newaxis]
        new_data[dtype] = data

    for dtype, data in data_dict.items():
        if dtype not in new_data:
            new_data[dtype] = np.column_stack((*data, ))
        else:
            new_data[dtype] = np.column_stack((new_data[dtype], *data))

    return DataFrame._construct_from_new(
        new_data, new_column_info, np.asarray(new_columns, dtype='O'))
def _single_agg(self,
                agg_cols: Dict = None,
                new_names: Dict = None,
                new_order: Dict = None,
                num_agg_cols: int = None,
                func_kwargs: Dict = None) -> DataFrame:
    """
    Run several grouped aggregations and combine them with the grouping columns.

    Parameters
    ----------
    agg_cols : dict
        Maps an aggregation (a name string or a callable) to the list of
        columns it is applied to.
    new_names : dict
        Maps the same keys to the output names, aligned with `agg_cols`.
    new_order : dict
        Maps the same keys to output positions (relative to the end of the
        grouping columns), aligned with `agg_cols`.
    num_agg_cols : int
        Number of placeholder slots reserved for aggregated columns.
    func_kwargs : dict
        Maps the same keys to a list of per-column kwargs (or None),
        aligned with `agg_cols`.

    Returns
    -------
    DataFrame
    """
    labels = self._group_labels
    size = len(self._group_position)

    data_dict = self._get_group_col_data()
    new_column_info = self._get_new_column_info()
    # Placeholders are overwritten by position as results come in
    new_columns = self._group_columns.copy() + [''] * num_agg_cols

    # NOTE: the loop variable `agg_cols` shadows the parameter of the same name
    for name, agg_cols in agg_cols.items():

        agg_dtype_locs = defaultdict(list)
        agg_dtype_names = defaultdict(list)
        agg_dtype_new_names = defaultdict(list)
        agg_dtype_order = defaultdict(list)
        non_agg_dtype_locs = defaultdict(list)
        agg_dtype_kwargs = defaultdict(list)

        if isinstance(name, str):
            # name can also be a custom function
            name_kwargs = get_func_kwargs(name)
            ignore_str = name_kwargs.get('ignore_str', True)
            add_positions = name_kwargs.get('add_positions', False)
            ignore_date = name_kwargs.get('ignore_date', True)
            keep_date_type = name_kwargs.get('keep_date_type', True)
        else:
            ignore_str = False
            add_positions = False
            ignore_date = False
            keep_date_type = True

        cur_new_names = new_names[name]
        cur_new_order = new_order[name]
        kwargs_list = func_kwargs[name]

        # Split the frame's columns, per dtype, into aggregated / not aggregated
        for col in self._df._columns:
            dtype, loc, _ = self._df._column_info[col].values
            try:
                idx = agg_cols.index(col)
            except ValueError:
                non_agg_dtype_locs[dtype].append(loc)
            else:
                agg_dtype_locs[dtype].append(loc)
                agg_dtype_names[dtype].append(col)
                agg_dtype_new_names[dtype].append(cur_new_names[idx])
                agg_dtype_order[dtype].append(cur_new_order[idx])
                agg_dtype_kwargs[dtype].append(kwargs_list[idx])

        for dtype, data in self._df._data.items():
            if dtype not in agg_dtype_locs:
                continue
            if ignore_str and dtype == 'O':
                continue
            if ignore_date and dtype in 'mM':
                continue

            # Date/time data is handed to the routine as int64
            if dtype in 'mM':
                data = data.view('int64')

            # The first non-None kwargs dict among this dtype's columns is
            # used for the whole dtype
            kwargs = {}
            for kw in agg_dtype_kwargs[dtype]:
                if kw is not None:
                    kwargs = kw
                    break

            if isinstance(name, str):
                func_name = name + '_' + utils.convert_kind_to_dtype_generic(
                    dtype)
            else:
                func_name = 'custom_' + utils.convert_kind_to_dtype_generic(
                    dtype)
                # 'name' is actually a function here
                # NOTE(review): when `kwargs` came from `func_kwargs`, these
                # assignments write into the caller's dict - confirm intended
                kwargs['func'] = name
                kwargs['col_dict'] = dict(
                    zip(agg_dtype_locs[dtype], agg_dtype_names[dtype]))

            func = getattr(_gb, func_name)

            # The non-aggregated locations are what is passed to the routine
            if add_positions:
                arr = func(labels, size, data, non_agg_dtype_locs[dtype],
                           self._group_position, **kwargs)
            else:
                arr = func(labels, size, data, non_agg_dtype_locs[dtype],
                           **kwargs)

            if dtype in 'mM' and keep_date_type:
                new_kind = dtype
                arr = arr.astype(utils.convert_kind_to_dtype(dtype))
            else:
                new_kind = arr.dtype.kind

            # Results follow whatever is already collected for this kind
            cur_loc = utils.get_num_cols(data_dict.get(new_kind, []))
            data_dict[new_kind].append(arr)

            # presumably the routine returns its columns sorted by original
            # location; names and orders are realigned to that - TODO confirm
            old_locs = agg_dtype_locs[dtype]
            order = np.argsort(old_locs).tolist()

            cur_names = np.array(agg_dtype_new_names[dtype])[order]
            cur_order = len(self._group_columns) + np.array(
                agg_dtype_order[dtype])[order]

            for i, cur_name in enumerate(cur_names):
                new_column_info[cur_name] = utils.Column(
                    new_kind, cur_loc + i, cur_order[i])
                new_columns[cur_order[i]] = cur_name

    new_data = utils.concat_stat_arrays(data_dict)
    new_columns = np.array(new_columns, dtype='O')
    return DataFrame._construct_from_new(new_data, new_column_info,
                                         new_columns)
def _str_generic_concat(self, name, column, keep, return_dtype, **kwargs): if not isinstance(keep, (bool, np.bool_)): raise TypeError('`keep` must be a boolean') if column is None: columns = [] locs = [] for col in self._df._columns: dtype, loc, _ = self._df._column_info[col].values if dtype == 'O': columns.append(col) locs.append(loc) else: columns, locs = self._validate_columns(column) data = self._df._data['O'] count = 0 if return_dtype != 'O': if return_dtype in self._df._data: count = self._df._data[return_dtype].shape[1] else: count = self._df._data['O'].shape[1] - len(columns) kwargs['count'] = count final_arr, final_cols, group_len = getattr(_sf, name)(data[:, locs], **kwargs) dtype_new = final_arr.dtype.kind if len(columns) > 1: final_cols = np.repeat(columns, group_len).astype('O') + '_' + final_cols new_column_info = {} new_data = {} add_loc = 0 add_order = 0 if keep: df = self._df.drop(columns=columns) if dtype_new in df._data: add_loc = df._data[dtype_new].shape[1] add_order = df.shape[1] for dtype, arr in df._data.items(): if dtype == dtype_new: for i in range(arr.shape[1]): final_arr[:, i] = arr[:, i] new_data[dtype_new] = final_arr else: new_data[dtype] = arr if dtype_new not in df._data: new_data[dtype_new] = final_arr new_column_info = df._copy_column_info() new_columns = np.concatenate((df._columns, final_cols)) else: new_data = {dtype_new: final_arr} new_columns = final_cols for i, col in enumerate(final_cols): new_column_info[col] = utils.Column(dtype_new, i + add_loc, i + add_order) return self._df._construct_from_new(new_data, new_column_info, new_columns)
def _group_agg(self,
               name: str,
               ignore_str: bool = True,
               add_positions: bool = False,
               keep_group_cols: bool = True,
               ignore_date: bool = True,
               keep_date_type: bool = True,
               **kwargs) -> DataFrame:
    """
    Apply one grouped aggregation to every non-grouping column.

    Parameters
    ----------
    name : str
        Prefix of the ``_gb`` routine; the routine called is
        ``name + '_' + <dtype word>``.
    ignore_str : bool
        Skip the object ('O') data when True.
    add_positions : bool
        Also pass ``self._group_position`` to the routine when True.
    keep_group_cols : bool
        Include the grouping columns in the result when True.
    ignore_date : bool
        Skip the 'm' and 'M' data when True.
    keep_date_type : bool
        Cast results computed from 'm'/'M' data back to that kind when True.
    **kwargs
        Forwarded to the ``_gb`` routine.

    Returns
    -------
    DataFrame
    """
    labels = self._group_labels
    size = len(self._group_position)

    # Non-grouping column names, bucketed by their current dtype kind
    old_dtype_col: Dict[str, List[str]] = defaultdict(list)
    for col, col_obj in self._df._column_info.items():
        if col not in self._group_columns:
            old_dtype_col[col_obj.dtype].append(col)

    if keep_group_cols:
        data_dict = self._get_group_col_data()
        new_column_info = self._get_new_column_info()
        new_columns = self._group_columns.copy()
    else:
        data_dict = defaultdict(list)
        new_column_info = {}
        new_columns = []

    for dtype, data in self._df._data.items():
        if ignore_str and dtype == 'O':
            continue
        if ignore_date and dtype in 'mM':
            continue
        # number of grouped columns
        group_locs: list = self._group_dtype_loc.get(dtype, [])
        # Only aggregate when this dtype has at least one non-grouping column
        if len(group_locs) != data.shape[1]:
            func_name = name + '_' + utils.convert_kind_to_dtype_generic(
                dtype)
            func = getattr(_gb, func_name)
            # Date/time data is handed to the routine as int64
            if dtype in 'mM':
                data = data.view('int64')

            if add_positions:
                arr = func(labels, size, data, group_locs,
                           self._group_position, **kwargs)
            else:
                arr = func(labels, size, data, group_locs, **kwargs)
        else:
            continue

        if dtype in 'mM' and keep_date_type:
            new_kind = dtype
            arr = arr.astype(utils.convert_kind_to_dtype(dtype))
        else:
            new_kind = arr.dtype.kind

        # Results follow whatever is already collected for this kind
        cur_loc = utils.get_num_cols(data_dict.get(new_kind, []))
        data_dict[new_kind].append(arr)

        for col in old_dtype_col[dtype]:
            # Count the grouping columns located before this column so its
            # location can be shifted left by that many
            count_less = 0
            old_kind, old_loc, old_order = self._df._column_info[
                col].values
            for k in self._group_dtype_loc.get(dtype, []):
                count_less += old_loc > k

            # The order is a placeholder; it is assigned in the loop below
            new_column_info[col] = utils.Column(
                new_kind, cur_loc + old_loc - count_less, 0)

    # Assign final orders: `j` numbers the kept grouping columns, `i`
    # numbers the aggregated columns starting after `new_columns`
    i = len(new_columns)
    j = 0
    for col in self._df._columns:
        if col not in new_column_info:
            continue
        if col in self._group_columns and keep_group_cols:
            new_column_info[col].order = j
            j += 1
            continue

        new_columns.append(col)
        new_column_info[col].order = i
        i += 1

    new_data = utils.concat_stat_arrays(data_dict)
    return DataFrame._construct_from_new(new_data, new_column_info,
                                         new_columns)
def _get_new_column_info(self) -> ColInfoT:
    """
    Return a new dict holding a fresh ``utils.Column`` for every entry of
    ``self._column_info``, built from that entry's ``values``.
    """
    return {
        col: utils.Column(*col_obj.values)
        for col, col_obj in self._column_info.items()
    }
def _generic_concat(self, name, column, keep, **kwargs): if not isinstance(keep, (bool, np.bool_)): raise TypeError('`keep` must be a boolean') if column is None: columns = [] locs = [] for col in self._df._columns: dtype, loc, _ = self._df._column_info[col].values if dtype == self._dtype_acc: columns.append(col) locs.append(loc) else: columns, locs = self._validate_columns(column) data = self._df._data[self._dtype_acc] arrs = [] all_cols = [] for loc in locs: arr, new_columns = getattr(_sf, name)(data[:, loc], **kwargs) arrs.append(arr) all_cols.append(new_columns) dtype_new = arrs[0].dtype.kind if len(arrs) == 1: final_arr = arrs[0] final_cols = all_cols[0] else: final_arr = np.column_stack(arrs) all_cols_new = [] for cols, orig_name in zip(all_cols, columns): all_cols_new.append(cols + '_' + orig_name) final_cols = np.concatenate(all_cols_new) new_column_info = {} new_data = {} add_loc = 0 add_order = 0 if keep: df = self._df.drop(columns=columns) if dtype_new in df._data: add_loc = df._data[dtype_new].shape[1] add_order = df.shape[1] for dtype, arr in df._data.items(): if dtype == dtype_new: new_data[dtype_new] = np.column_stack((arr, final_arr)) else: new_data[dtype] = arr.copy('F') if dtype_new not in df._data: new_data[dtype_new] = final_arr new_column_info = df._copy_column_info() new_columns = np.concatenate((df._columns, final_cols)) else: new_data = {dtype_new: final_arr} new_columns = final_cols for i, col in enumerate(final_cols): new_column_info[col] = utils.Column(dtype_new, i + add_loc, i + add_order) return self._df._construct_from_new(new_data, new_column_info, new_columns)
def _create_groups(
        self, columns: Union[str, List[str]]) -> Tuple[ndarray, ndarray]:
    """
    Record the grouping columns and compute the group assignment.

    Fills ``self._group_dtype_loc`` (dtype kind -> locations of the grouping
    columns in the frame's array of that kind) and ``self._column_info``
    (grouping column -> Column with its location among the grouping columns
    of its kind and its position in `columns`). It then returns the result
    of the ``_gb.get_group_assignment_*`` routine selected by the dtype(s)
    and dimensionality of the grouping data.

    Parameters
    ----------
    columns : list of str
        Grouping column names.
        NOTE(review): the annotation also allows a plain str, but the body
        enumerates `columns` and takes ``len(columns)``, so a str would be
        processed character by character - confirm callers always pass a
        list.
    """
    self._group_dtype_loc: Dict[str, List[int]] = defaultdict(list)
    self._column_info: ColInfoT = {}
    for i, col in enumerate(columns):
        dtype, loc, _ = self._df._column_info[
            col].values  # type: str, int, int
        cur_loc = len(self._group_dtype_loc[dtype])
        self._group_dtype_loc[dtype].append(loc)
        self._column_info[col] = utils.Column(dtype, cur_loc, i)

    if len(columns) == 1:
        # since there is just one column, dtype is from the for-loop
        final_arr = self._df._data[dtype][:, loc]
        # Date/time data is grouped on its int64 view
        if dtype in 'mM':
            final_arr = final_arr.view('int64')
        dtype = final_arr.dtype.kind
        func_name = 'get_group_assignment_' + utils.convert_kind_to_dtype(
            dtype) + '_1d'
        return getattr(_gb, func_name)(final_arr)
    elif len(self._group_dtype_loc
             ) == 1 or 'O' not in self._group_dtype_loc:
        # Several columns with no object/other mix: one 2-D array
        arrs = []
        for dtype, locs in self._group_dtype_loc.items():
            arr = self._df._data[dtype][:, locs]
            if dtype in 'mM':
                arr = arr.view('int64')
            arrs.append(arr)
        if len(arrs) == 1:
            final_arr = arrs[0]
        else:
            final_arr = np.column_stack(arrs)

        dtype = final_arr.dtype.kind
        func_name = 'get_group_assignment_' + utils.convert_kind_to_dtype(
            dtype) + '_2d'
        final_arr = np.ascontiguousarray(final_arr)
        return getattr(_gb, func_name)(final_arr)
    else:
        # Object columns mixed with other kinds: the object data and the
        # remaining data are passed as two separate arrays
        arrs = []
        for dtype, locs in self._group_dtype_loc.items():
            if dtype == 'O':
                arr_str = self._df._data['O'][:, locs]
            else:
                arr = self._df._data[dtype][:, locs]
                if dtype in 'mM':
                    arr = arr.view('int64')
                arrs.append(arr)
        if len(arrs) == 1:
            arr_numbers = arrs[0]
        else:
            arr_numbers = np.column_stack(arrs)

        dtype = arr_numbers.dtype.kind
        # Single-column pieces are flattened to 1-D; the dimensionality of
        # each piece is part of the routine name
        if arr_str.shape[1] == 1:
            arr_str = arr_str[:, 0]
        if arr_numbers.shape[1] == 1:
            arr_numbers = arr_numbers[:, 0]

        str_ndim = str(arr_str.ndim) + 'd_'
        num_ndim = str(arr_numbers.ndim) + 'd'
        dtype_str = utils.convert_kind_to_dtype(dtype) + '_'
        func_name = 'get_group_assignment_str_' + str_ndim + dtype_str + num_ndim
        arr_numbers = np.ascontiguousarray(arr_numbers)
        return getattr(_gb, func_name)(arr_str, arr_numbers)