Exemple #1
0
 def size(self):
     """Build a DataFrame of the group columns plus one integer column.

     The extra column is named by ``self._get_agg_name('size')`` and holds
     the array returned by ``_gb.size`` (presumably the number of rows in
     each group -- confirm against the ``_gb`` implementation).
     """
     count_name = self._get_agg_name('size')
     out_columns = np.array([*self._group_columns, count_name], dtype='O')

     # Add a trailing axis so the result forms a single column that can be
     # placed beside the other integer data.
     counts = _gb.size(self._group_labels,
                       len(self._group_position))[:, np.newaxis]

     arrays_by_kind = self._get_group_col_data()
     arrays_by_kind['i'].append(counts)
     stacked = utils.concat_stat_arrays(arrays_by_kind)

     # The new column was appended last, so it is the final integer column
     # and the final column in display order.
     column_info = self._get_new_column_info()
     last_int_loc = stacked['i'].shape[1] - 1
     last_order = len(out_columns) - 1
     column_info[count_name] = utils.Column('i', last_int_loc, last_order)

     return DataFrame._construct_from_new(stacked, column_info, out_columns)
Exemple #2
0
    def apply(self, func, *args, **kwargs):
        """Run ``func`` through ``_gb.apply`` and prepend the group columns.

        ``func`` together with ``*args`` / ``**kwargs`` is forwarded to
        ``_gb.apply`` along with the group labels and the underlying
        DataFrame.  The group column data is then repeated row-wise by the
        ``group_repeats`` value that ``_gb.apply`` returns and placed in
        front of the applied result.  A group column whose name also appears
        in the applied result is renamed with a ``'_group'`` suffix.

        Raises
        ------
        TypeError
            If ``func`` is not callable.
        """
        if not isinstance(func, Callable):
            raise TypeError(
                'The `func` variable must be a function or any callable object'
            )

        applied = _gb.apply(self._group_labels, len(self._group_position),
                            self._df, func, *args, **kwargs)
        new_data, applied_info, applied_columns, group_repeats = applied

        group_data = self._get_group_col_data()
        group_info = self._get_new_column_info()
        group_columns = self._group_columns.copy()
        n_group_cols = len(group_columns)

        # Shift every applied column to the right of the group columns, both
        # inside its per-kind array and in overall display order.
        final_info = {}
        for col in applied_columns:
            kind, loc, order = applied_info[col].values
            existing = group_data.get(kind, 0)
            shift = existing[0].shape[1] if existing != 0 else 0
            final_info[col] = utils.Column(kind, loc + shift,
                                           order + n_group_cols)

        # Disambiguate group columns that clash with an applied column name.
        renamed_group_columns = [
            col + '_group' if col in final_info else col
            for col in group_columns
        ]

        # Group columns occupy the leading locations of each kind.
        next_loc = defaultdict(int)
        pairs = zip(group_columns, renamed_group_columns)
        for position, (col, out_name) in enumerate(pairs):
            kind = group_info[col].dtype
            final_info[out_name] = utils.Column(kind, next_loc[kind], position)
            next_loc[kind] += 1

        all_columns = np.concatenate((renamed_group_columns, applied_columns))

        for kind, arrays in group_data.items():
            block = np.repeat(np.concatenate(arrays, 1), group_repeats, axis=0)
            if kind in new_data:
                new_data[kind] = np.concatenate((block, new_data[kind]), 1)
            else:
                new_data[kind] = block

        return DataFrame._construct_from_new(new_data, final_info,
                                             all_columns)
Exemple #3
0
    def _cov_corr(self, name: str) -> DataFrame:
        """Shared driver that dispatches to a ``_gb`` kernel chosen by ``name``.

        Every non-group column of kind ``'f'``, ``'i'`` or ``'b'`` takes part.
        The kernel looked up on ``_gb`` is ``name + '_' + <dtype word>``,
        where the dtype word comes from the kind of the combined numeric
        array.  The result has the group columns (each row repeated once per
        participating column), a ``'Column Name'`` object column, and one
        float column per participating column.
        """
        calc_columns: List[str] = []
        calc_dtype_loc: List[Tuple[str, int]] = []
        has_float = False
        for col in self._df._columns:
            if col in self._group_columns:
                continue
            kind, loc, _ = self._df._column_info[col].values
            if kind not in 'fib':
                continue
            has_float = has_float or kind == 'f'
            calc_columns.append(col)
            calc_dtype_loc.append((kind, loc))
        # A single float column is enough to request float64 for everything.
        np_dtype = 'float64' if has_float else 'int64'

        values = self._df._values_number_drop(calc_columns, calc_dtype_loc,
                                              np_dtype)
        func_name = name + '_' + utils.convert_kind_to_dtype(values.dtype.kind)
        result = getattr(_gb, func_name)(self._group_labels, len(self),
                                         values, [])

        # Repeat the rows of the first group array of each kind once per
        # participating column.
        n_calc = len(calc_columns)
        data_dict_final = defaultdict(list)
        for kind, arrs in self._get_group_col_data().items():
            data_dict_final[kind] = [np.repeat(arrs[0], n_calc, axis=0)]

        new_column_info = self._get_new_column_info()
        n_group_cols = len(self._group_columns)
        new_columns = self._group_columns.copy()

        # Label column placed directly after the group columns.
        label_name = 'Column Name'
        obj_loc = utils.get_num_cols(data_dict_final.get('O', []))
        label_values = np.tile(calc_columns, len(self))[:, np.newaxis]
        data_dict_final['O'].append(label_values)
        new_columns.append(label_name)
        new_column_info[label_name] = utils.Column('O', obj_loc, n_group_cols)

        # One float result column per participating column, after the label.
        float_start = utils.get_num_cols(data_dict_final.get('f', []))
        for offset, col in enumerate(calc_columns):
            new_column_info[col] = utils.Column('f', offset + float_start,
                                                offset + n_group_cols + 1)
        new_columns.extend(calc_columns)

        data_dict_final['f'].append(result)
        new_data = utils.concat_stat_arrays(data_dict_final)

        return DataFrame._construct_from_new(new_data, new_column_info,
                                             new_columns)
Exemple #4
0
 def cumcount(self) -> DataFrame:
     """Build a DataFrame of the group column data plus one integer column.

     The extra column is named by ``self._get_agg_name('cumcount')`` and
     holds the array returned by ``_gb.cumcount`` (presumably a running
     count within each group -- confirm against the ``_gb``
     implementation).  Group data comes from
     ``self._get_group_col_data_all()``.
     """
     # todo: add ascending=False
     count_name = self._get_agg_name('cumcount')
     out_columns = np.array([*self._group_columns, count_name], dtype='O')

     # Add a trailing axis so the result forms a single column.
     running = _gb.cumcount(self._group_labels,
                            len(self._group_position))[:, np.newaxis]

     arrays_by_kind = self._get_group_col_data_all()
     arrays_by_kind['i'].append(running)
     stacked = utils.concat_stat_arrays(arrays_by_kind)

     # The new column was appended last, so it is the final integer column
     # and the final column in display order.
     column_info = self._get_new_column_info()
     last_int_loc = stacked['i'].shape[1] - 1
     last_order = len(out_columns) - 1
     column_info[count_name] = utils.Column('i', last_int_loc, last_order)

     return DataFrame._construct_from_new(stacked, column_info, out_columns)
Exemple #5
0
def read_csv(fp, sep=',', header=0, skiprows=None, usecols=None):
    """Read a delimited text file into a DataFrame.

    All arguments are validated before the file is touched, so an invalid
    argument is reported without reading ``fp``.

    Parameters
    ----------
    fp
        Passed unchanged to the file-length helper and to the low-level
        reader ``_rf.read_csv`` (presumably a file path -- TODO confirm).
    sep : str
        Field delimiter; must be exactly one character.
    header : int
        Index of the header row, counted among the rows that are *not*
        skipped.  Must be >= -1 (``-1`` presumably means "no header row" --
        confirm against ``_rf.read_csv``).
    skiprows : int, sequence of int, or None
        A non-negative integer is forwarded as a count; a sequence lists
        individual non-negative row numbers to skip.  An empty sequence
        skips nothing.
    usecols : list or None
        Non-empty list of integers or column names, forwarded to the reader.

    Raises
    ------
    TypeError
        If ``sep`` is not a string, ``header`` is not an integer, or
        ``usecols`` is neither a list nor None.
    ValueError
        If ``sep`` is not one character long, ``header`` is below -1,
        ``usecols`` is an empty list, or ``skiprows`` contains a negative
        value.
    """
    if not isinstance(sep, str):
        raise TypeError('`sep` must be a string')
    if len(sep) != 1:
        raise ValueError('`sep` must only be one character in length')
    if not isinstance(header, int):
        raise TypeError('`header` must be an integer')
    if header < -1:
        raise ValueError('`header` must be greater than or equal to -1')

    if isinstance(usecols, list):
        if not usecols:
            raise ValueError('`usecols` must be a non-empty list of integers or column names')
    elif usecols is not None:
        raise TypeError('`usecols` must be a list of integers or column names')

    skiprows_set = set()
    skiprows_int = 0
    if skiprows is None:
        pass
    elif isinstance(skiprows, int):
        if skiprows < 0:
            raise ValueError('`skiprows` must be one or more non-negative integers')
        skiprows_int = skiprows
    else:
        skiprows_arr = np.asarray(skiprows)
        if (skiprows_arr < 0).any():
            raise ValueError('All values in the `skiprows` sequence must be >= 0')
        if skiprows_arr.size == 0:
            # Nothing to skip.  Without this guard the `.max()` below raises
            # on a zero-size array.
            pass
        elif header == -1:
            skiprows_set = set(skiprows_arr)
        else:
            max_row = skiprows_arr.max()
            if header > max_row - len(skiprows_arr):
                # The header lies beyond the last skipped row, so only its
                # index needs shifting (assumes the entries are unique).
                header += len(skiprows_arr)
            else:
                # Translate `header` from "index among kept rows" to a
                # physical row number, then keep only the skipped rows that
                # come after it.
                max_rows = np.arange(max_row)
                kept_rows = max_rows[~np.isin(max_rows, skiprows_arr)]
                header = kept_rows[header]
                skiprows_set = set(skiprows_arr[skiprows_arr > header])

    # Only touch the file once every argument has been accepted.
    nrows = _get_file_legnth(fp)

    tuple_return = _rf.read_csv(fp, nrows, ord(sep), header, skiprows_int, skiprows_set, usecols)

    a_bool, a_int, a_float, a_str, columns, dtypes, dtype_loc = tuple_return

    # The reader reports each column's type as an integer code.
    dtype_map = {1: 'b', 2: 'i', 3: 'f', 4: 'O'}
    new_column_info = {}
    final_dtype_locs = defaultdict(list)
    for i, (col, dtype, loc) in enumerate(zip(columns, dtypes, dtype_loc)):
        kind = dtype_map[dtype]
        new_column_info[col] = utils.Column(kind, loc, i)
        final_dtype_locs[kind].append(loc)

    new_data = {}
    loc_order_changed = set()
    for arr, dtype in zip((a_bool, a_int, a_float, a_str), ('b', 'i', 'f', 'O')):
        num_cols = arr.shape[1]
        if num_cols != 0:
            locs = final_dtype_locs[dtype]
            if len(locs) == num_cols:
                new_data[dtype] = arr
            else:
                # The array holds more columns than ended up with this kind;
                # keep only the ones in use and renumber them below.
                loc_order_changed.add(dtype)
                new_data[dtype] = arr[:, locs]

    if loc_order_changed:
        # After the column selection above, locations of the affected kinds
        # are consecutive in column order.
        cur_dtype_loc = defaultdict(int)
        for col in columns:
            dtype, _, _ = new_column_info[col].values
            if dtype in loc_order_changed:
                new_column_info[col].loc = cur_dtype_loc[dtype]
                cur_dtype_loc[dtype] += 1
    new_columns = np.array(columns, dtype='O')
    return DataFrame._construct_from_new(new_data, new_column_info, new_columns)
Exemple #6
0
    def _roll_generic(self, name, columns, **kwargs):
        if columns is None:
            columns = self._df.columns
        elif isinstance(columns, str):
            columns = [columns]
        elif not isinstance(columns, list):
            raise TypeError(
                '`columns` must either be a string, a list of column names, or None'
            )

        col_order = dict(zip(columns, range(len(columns))))

        dtype_locs = defaultdict(list)
        dtype_cols = defaultdict(list)
        col_info = self._df._column_info
        for i, col in enumerate(columns):
            try:
                dtype, loc, order = col_info[col].values
            except KeyError:
                raise KeyError(f'{col} is not a column name')

            dtype_locs[dtype].append(loc)
            dtype_cols[dtype].append(col)

        kept_dtype_loc = defaultdict(list)
        new_col_info = {}
        dtype_ct = defaultdict(int)
        for i, col in enumerate(self._kept_columns):
            dtype, loc, _ = col_info[col].values
            new_loc = len(kept_dtype_loc[dtype])
            kept_dtype_loc[dtype].append(loc)
            new_col_info[col] = utils.Column(dtype, new_loc, i)
            dtype_ct[dtype] += 1

        data_dict = defaultdict(list)
        for dtype, locs in dtype_locs.items():
            func_name = name + '_' + utils.convert_kind_to_dtype_generic(dtype)
            data = self._df._data[dtype]
            result = getattr(_roll, func_name)(data, np.array(locs),
                                               self._left, self._right,
                                               self._min_window, **kwargs)
            result_dtype = result.dtype.kind
            data_dict[result_dtype].append(result)
            for col in dtype_cols[dtype]:
                order = col_order[col]
                new_col = col
                if col in self._kept_columns:
                    new_col = col + '_rolling'
                    columns[columns.index(col)] = new_col
                new_col_info[new_col] = utils.Column(
                    result_dtype, dtype_ct[result_dtype],
                    order + len(self._kept_columns))
                dtype_ct[result_dtype] += 1

        new_data = {}
        for dtype, locs in kept_dtype_loc.items():
            data = self._df._data[dtype][:, locs]
            if data.ndim == 1:
                data = data[:, np.newaxis]
            new_data[dtype] = data

        for dtype, data in data_dict.items():
            if dtype not in new_data:
                new_data[dtype] = np.column_stack((*data, ))
            else:
                new_data[dtype] = np.column_stack((new_data[dtype], *data))

        new_columns = np.concatenate((self._kept_columns, columns))
        return DataFrame._construct_from_new(new_data, new_col_info,
                                             new_columns)
Exemple #7
0
    def _roll_agg(self,
                  agg_cols: Dict = None,
                  new_names: Dict = None,
                  new_order: Dict = None,
                  num_agg_cols: int = None,
                  func_kwargs: Dict = None):
        """Run several rolling aggregations and assemble one DataFrame.

        The result starts with ``self._kept_columns`` followed by
        ``num_agg_cols`` aggregated columns.  All five arguments are required
        in practice; the ``None`` defaults are not handled.

        Parameters
        ----------
        agg_cols : dict
            Maps each aggregation (a name string, or a callable for a custom
            function) to the list of column names it is applied to.
        new_names : dict
            For each aggregation, the output column name of every entry in
            ``agg_cols``, in the same positions.
        new_order : dict
            For each aggregation, the position of every output column among
            the aggregated columns (kept columns are added as an offset).
        num_agg_cols : int
            Total number of aggregated output columns.
        func_kwargs : dict
            For each aggregation, a list holding a keyword-argument dict or
            ``None`` per column.  These dicts are never modified.
        """
        col_info = self._df._column_info

        # Kept columns come first and take the leading locations per kind.
        kept_dtype_loc = defaultdict(list)
        new_column_info = {}
        dtype_ct = defaultdict(int)
        for i, col in enumerate(self._kept_columns):
            dtype, loc, _ = col_info[col].values
            new_loc = len(kept_dtype_loc[dtype])
            kept_dtype_loc[dtype].append(loc)
            new_column_info[col] = utils.Column(dtype, new_loc, i)
            dtype_ct[dtype] += 1

        data_dict = defaultdict(list)
        # Placeholders are overwritten with the real names as they are found.
        new_columns = self._kept_columns.copy() + [''] * num_agg_cols

        for name, cur_agg_cols in agg_cols.items():

            agg_dtype_locs = defaultdict(list)
            agg_dtype_names = defaultdict(list)
            agg_dtype_new_names = defaultdict(list)
            agg_dtype_order = defaultdict(list)
            agg_dtype_kwargs = defaultdict(list)

            if isinstance(name, str):
                # name can also be a custom function
                name_kwargs = get_func_kwargs(name)
                ignore_str = name_kwargs.get('ignore_str', True)
                ignore_date = name_kwargs.get('ignore_date', True)
                keep_date_type = name_kwargs.get('keep_date_type', True)
            else:
                ignore_str = False
                ignore_date = False
                keep_date_type = True

            cur_new_names = new_names[name]
            cur_new_order = new_order[name]
            kwargs_list = func_kwargs[name]

            # Bucket the columns of this aggregation by kind.
            for col in self._df._columns:

                dtype, loc, _ = col_info[col].values
                try:
                    idx = cur_agg_cols.index(col)
                except ValueError:
                    # Column is not part of this aggregation.
                    continue
                agg_dtype_locs[dtype].append(loc)
                agg_dtype_names[dtype].append(col)
                agg_dtype_new_names[dtype].append(cur_new_names[idx])
                agg_dtype_order[dtype].append(cur_new_order[idx])
                agg_dtype_kwargs[dtype].append(kwargs_list[idx])

            for dtype, data in self._df._data.items():
                if dtype not in agg_dtype_locs:
                    continue
                if ignore_str and dtype == 'O':
                    continue
                if ignore_date and dtype in 'mM':
                    continue

                if dtype in 'mM':
                    data = data.view('int64')

                # The first non-None kwargs found for this kind is used for
                # every column of the kind.  Copy it so the keys added below
                # never leak into the caller's dictionary.
                kwargs = {}
                for kw in agg_dtype_kwargs[dtype]:
                    if kw is not None:
                        kwargs = dict(kw)
                        break

                if isinstance(name, str):
                    func_name = name + '_' + utils.convert_kind_to_dtype_generic(
                        dtype)
                else:
                    func_name = 'custom_' + utils.convert_kind_to_dtype_generic(
                        dtype)
                    # 'name' is actually a function here
                    kwargs['func'] = name
                    kwargs['col_dict'] = dict(
                        zip(agg_dtype_locs[dtype], agg_dtype_names[dtype]))

                func = getattr(_roll, func_name)

                arr = func(data, np.array(agg_dtype_locs[dtype]), self._left,
                           self._right, self._min_window, **kwargs)

                if dtype in 'mM' and keep_date_type:
                    new_kind = dtype
                    arr = arr.astype(utils.convert_kind_to_dtype(dtype))
                else:
                    new_kind = arr.dtype.kind

                # New columns sit after the kept columns of the same kind and
                # after any results already collected for that kind.
                cur_loc = utils.get_num_cols(data_dict.get(
                    new_kind, [])) + dtype_ct[new_kind]
                data_dict[new_kind].append(arr)

                # Output names/orders are matched to the result columns by
                # ascending source location.
                old_locs = agg_dtype_locs[dtype]
                order = np.argsort(old_locs).tolist()

                cur_names = np.array(agg_dtype_new_names[dtype])[order]
                cur_order = len(self._kept_columns) + np.array(
                    agg_dtype_order[dtype])[order]

                for i, cur_name in enumerate(cur_names):
                    new_column_info[cur_name] = utils.Column(
                        new_kind, cur_loc + i, cur_order[i])
                    new_columns[cur_order[i]] = cur_name

        new_data = {}
        for dtype, locs in kept_dtype_loc.items():
            data = self._df._data[dtype][:, locs]
            if data.ndim == 1:
                data = data[:, np.newaxis]
            new_data[dtype] = data

        for dtype, data in data_dict.items():
            if dtype not in new_data:
                new_data[dtype] = np.column_stack((*data, ))
            else:
                new_data[dtype] = np.column_stack((new_data[dtype], *data))

        return DataFrame._construct_from_new(
            new_data, new_column_info, np.asarray(new_columns, dtype='O'))
Exemple #8
0
    def _single_agg(self,
                    agg_cols: Dict = None,
                    new_names: Dict = None,
                    new_order: Dict = None,
                    num_agg_cols: int = None,
                    func_kwargs: Dict = None) -> DataFrame:
        """Run several group aggregations and assemble one DataFrame.

        The result starts with the group columns followed by ``num_agg_cols``
        aggregated columns.  All five arguments are required in practice; the
        ``None`` defaults are not handled.

        Parameters
        ----------
        agg_cols : dict
            Maps each aggregation (a name string, or a callable for a custom
            function) to the list of column names it is applied to.
        new_names : dict
            For each aggregation, the output column name of every entry in
            ``agg_cols``, in the same positions.
        new_order : dict
            For each aggregation, the position of every output column among
            the aggregated columns (group columns are added as an offset).
        num_agg_cols : int
            Total number of aggregated output columns.
        func_kwargs : dict
            For each aggregation, a list holding a keyword-argument dict or
            ``None`` per column.  These dicts are never modified.
        """
        labels = self._group_labels
        size = len(self._group_position)

        data_dict = self._get_group_col_data()
        new_column_info = self._get_new_column_info()
        # Placeholders are overwritten with the real names as they are found.
        new_columns = self._group_columns.copy() + [''] * num_agg_cols

        for name, cur_agg_cols in agg_cols.items():

            agg_dtype_locs = defaultdict(list)
            agg_dtype_names = defaultdict(list)
            agg_dtype_new_names = defaultdict(list)
            agg_dtype_order = defaultdict(list)
            non_agg_dtype_locs = defaultdict(list)
            agg_dtype_kwargs = defaultdict(list)

            if isinstance(name, str):
                # name can also be a custom function
                name_kwargs = get_func_kwargs(name)
                ignore_str = name_kwargs.get('ignore_str', True)
                add_positions = name_kwargs.get('add_positions', False)
                ignore_date = name_kwargs.get('ignore_date', True)
                keep_date_type = name_kwargs.get('keep_date_type', True)
            else:
                ignore_str = False
                add_positions = False
                ignore_date = False
                keep_date_type = True

            cur_new_names = new_names[name]
            cur_new_order = new_order[name]
            kwargs_list = func_kwargs[name]

            # Bucket every column by kind: aggregated ones with their output
            # metadata, the rest as locations handed to the kernel.
            for col in self._df._columns:

                dtype, loc, _ = self._df._column_info[col].values
                try:
                    idx = cur_agg_cols.index(col)
                except ValueError:
                    non_agg_dtype_locs[dtype].append(loc)
                else:
                    agg_dtype_locs[dtype].append(loc)
                    agg_dtype_names[dtype].append(col)
                    agg_dtype_new_names[dtype].append(cur_new_names[idx])
                    agg_dtype_order[dtype].append(cur_new_order[idx])
                    agg_dtype_kwargs[dtype].append(kwargs_list[idx])

            for dtype, data in self._df._data.items():
                if dtype not in agg_dtype_locs:
                    continue
                if ignore_str and dtype == 'O':
                    continue
                if ignore_date and dtype in 'mM':
                    continue

                if dtype in 'mM':
                    data = data.view('int64')

                # The first non-None kwargs found for this kind is used for
                # every column of the kind.  Copy it so the keys added below
                # never leak into the caller's dictionary.
                kwargs = {}
                for kw in agg_dtype_kwargs[dtype]:
                    if kw is not None:
                        kwargs = dict(kw)
                        break

                if isinstance(name, str):
                    func_name = name + '_' + utils.convert_kind_to_dtype_generic(
                        dtype)
                else:
                    func_name = 'custom_' + utils.convert_kind_to_dtype_generic(
                        dtype)
                    # 'name' is actually a function here
                    kwargs['func'] = name
                    kwargs['col_dict'] = dict(
                        zip(agg_dtype_locs[dtype], agg_dtype_names[dtype]))

                func = getattr(_gb, func_name)

                if add_positions:
                    arr = func(labels, size, data, non_agg_dtype_locs[dtype],
                               self._group_position, **kwargs)
                else:
                    arr = func(labels, size, data, non_agg_dtype_locs[dtype],
                               **kwargs)

                if dtype in 'mM' and keep_date_type:
                    new_kind = dtype
                    arr = arr.astype(utils.convert_kind_to_dtype(dtype))
                else:
                    new_kind = arr.dtype.kind

                # New columns go after everything already collected for the
                # resulting kind.
                cur_loc = utils.get_num_cols(data_dict.get(new_kind, []))
                data_dict[new_kind].append(arr)

                # Output names/orders are matched to the result columns by
                # ascending source location.
                old_locs = agg_dtype_locs[dtype]
                order = np.argsort(old_locs).tolist()

                cur_names = np.array(agg_dtype_new_names[dtype])[order]
                cur_order = len(self._group_columns) + np.array(
                    agg_dtype_order[dtype])[order]

                for i, cur_name in enumerate(cur_names):
                    new_column_info[cur_name] = utils.Column(
                        new_kind, cur_loc + i, cur_order[i])
                    new_columns[cur_order[i]] = cur_name

        new_data = utils.concat_stat_arrays(data_dict)
        new_columns = np.array(new_columns, dtype='O')
        return DataFrame._construct_from_new(new_data, new_column_info,
                                             new_columns)
Exemple #9
0
    def _group_agg(self,
                   name: str,
                   ignore_str: bool = True,
                   add_positions: bool = False,
                   keep_group_cols: bool = True,
                   ignore_date: bool = True,
                   keep_date_type: bool = True,
                   **kwargs) -> DataFrame:
        """Apply one ``_gb`` aggregation to every non-group column.

        Parameters
        ----------
        name : str
            Prefix of the ``_gb`` kernel; the kernel used for a kind is
            ``name + '_' + <dtype word>``.
        ignore_str : bool
            Skip the ``'O'`` kind entirely.
        add_positions : bool
            Also pass ``self._group_position`` to the kernel.
        keep_group_cols : bool
            Put the group columns in front of the aggregated ones.
        ignore_date : bool
            Skip the ``'m'`` / ``'M'`` kinds entirely.
        keep_date_type : bool
            Cast a date-kind result back to its date dtype instead of keeping
            whatever kind the kernel returned.
        **kwargs
            Forwarded to the kernel.
        """
        labels = self._group_labels
        size = len(self._group_position)

        # Non-group column names, bucketed by their original kind.
        cols_by_kind: Dict[str, List[str]] = defaultdict(list)
        for col, col_obj in self._df._column_info.items():
            if col in self._group_columns:
                continue
            cols_by_kind[col_obj.dtype].append(col)

        if keep_group_cols:
            data_dict = self._get_group_col_data()
            new_column_info = self._get_new_column_info()
            new_columns = self._group_columns.copy()
        else:
            data_dict = defaultdict(list)
            new_column_info = {}
            new_columns = []

        for dtype, data in self._df._data.items():
            if ignore_str and dtype == 'O':
                continue
            if ignore_date and dtype in 'mM':
                continue
            group_locs: list = self._group_dtype_loc.get(dtype, [])
            if len(group_locs) == data.shape[1]:
                # Every column of this kind is a group column.
                continue

            func = getattr(
                _gb, name + '_' + utils.convert_kind_to_dtype_generic(dtype))
            is_date = dtype in 'mM'
            if is_date:
                data = data.view('int64')

            if add_positions:
                arr = func(labels, size, data, group_locs,
                           self._group_position, **kwargs)
            else:
                arr = func(labels, size, data, group_locs, **kwargs)

            if is_date and keep_date_type:
                new_kind = dtype
                arr = arr.astype(utils.convert_kind_to_dtype(dtype))
            else:
                new_kind = arr.dtype.kind
            cur_loc = utils.get_num_cols(data_dict.get(new_kind, []))
            data_dict[new_kind].append(arr)

            for col in cols_by_kind[dtype]:
                _, old_loc, _ = self._df._column_info[col].values
                # Shift left by the number of group columns of this kind that
                # sat before the column in the source array.
                shift = sum(
                    old_loc > k for k in self._group_dtype_loc.get(dtype, []))
                # The display order is filled in by the pass below.
                new_column_info[col] = utils.Column(
                    new_kind, cur_loc + old_loc - shift, 0)

        # Assign display order following the source DataFrame's column order.
        next_order = len(new_columns)
        group_order = 0
        for col in self._df._columns:
            if col not in new_column_info:
                continue
            if keep_group_cols and col in self._group_columns:
                new_column_info[col].order = group_order
                group_order += 1
            else:
                new_columns.append(col)
                new_column_info[col].order = next_order
                next_order += 1

        new_data = utils.concat_stat_arrays(data_dict)

        return DataFrame._construct_from_new(new_data, new_column_info,
                                             new_columns)