Example #1
0
def statistic_summary(table, group_by=None, **params):
    """Summarise the selected columns of ``table`` with the chosen statistics.

    The names in ``params['input_cols']`` are resolved to positional column
    indices (stored back under ``params['column_indices']``), the output
    column names are derived from ``params['statistics']`` and the actual
    computation is delegated to ``_statistic_summary``.

    Args:
        table: pandas DataFrame containing the input columns.
        group_by: Grouping columns. When not ``None`` the computation is
            dispatched through ``_function_by_group2``.
        **params: Must contain ``input_cols`` and ``statistics``; may contain
            ``percentile_amounts`` and ``trimmed_mean_amounts``.

    Returns:
        The return value of ``_function_by_group2`` when ``group_by`` is
        given; otherwise the dict returned by ``_statistic_summary`` with
        its ``'out_table'`` entry wrapped in a DataFrame.
    """
    check_required_parameters(_statistic_summary, params, ['table'])

    params['column_indices'] = [
        table.columns.get_loc(name) for name in params['input_cols']
    ]

    out_columns = ['column_name'] + params['statistics'].copy()

    # Each "amount"-based statistic is replaced by one column per distinct
    # requested amount: (statistic, params key with amounts, name template).
    expansions = (
        ('percentile', 'percentile_amounts', 'percentile_{}'),
        ('trimmed_mean', 'trimmed_mean_amounts', 'trimmed_mean_{}'),
    )
    for stat_name, amounts_key, template in expansions:
        if stat_name not in out_columns:
            continue
        out_columns.remove(stat_name)
        if amounts_key in params:
            out_columns.extend(
                template.format(_amounts_colname(amount))
                for amount in _unique_list(params[amounts_key]))

    if 'mode' in out_columns:
        # 'mode' is always reported as the last column.
        out_columns.remove('mode')
        out_columns.append('mode')

    if group_by is None:
        result = _statistic_summary(table.values, **params)
        result['out_table'] = pd.DataFrame(result['out_table'],
                                           columns=out_columns)
        return result

    return _function_by_group2(_statistic_summary,
                               table,
                               columns=out_columns,
                               group_by=group_by,
                               **params)
Example #2
0
def add_shift(table, group_by=None, **params):
    """Append shifted copies of one column to ``table``.

    ``params['input_col']`` and, when present, ``params['order_by']`` are
    translated from column names to positional indices before the work is
    delegated to ``_add_shift``. One output column named
    ``<shifted_col>_<shift>`` is added per entry of ``params['shift_list']``;
    ``shifted_col`` falls back to the input column name when it is missing
    or falsy.

    Args:
        table: pandas DataFrame to extend.
        group_by: Grouping columns. When not ``None`` the computation is
            dispatched through ``_function_by_group2``.
        **params: ``input_col``, ``shift_list`` and optionally
            ``shifted_col`` and ``order_by``.

    Returns:
        The return value of ``_function_by_group2`` when ``group_by`` is
        given; otherwise the dict returned by ``_add_shift`` with its
        ``'out_table'`` entry wrapped in a DataFrame.
    """
    check_required_parameters(_add_shift, params, ['table'])

    out_columns = table.columns.tolist()

    source_col = params.get('input_col')
    name_prefix = params.get('shifted_col') or source_col
    shifts = params.get('shift_list')
    sort_cols = params.get('order_by')

    # The worker operates on positions, not on column labels.
    params['input_col'] = table.columns.get_loc(source_col)
    if sort_cols is not None:
        params['order_by'] = [
            table.columns.get_loc(col_name) for col_name in sort_cols
        ]

    out_columns.extend(
        '{shifted_col}_{shift}'.format(shifted_col=name_prefix, shift=step)
        for step in shifts)

    if group_by is not None:
        return _function_by_group2(_add_shift,
                                   table,
                                   columns=out_columns,
                                   group_by=group_by,
                                   **params)

    result = _add_shift(table.values.tolist(), **params)
    result['out_table'] = pd.DataFrame(result['out_table'],
                                       columns=out_columns)
    return result
Example #3
0
def statistic_summary(table, group_by=None, **params):
    """Summarise the selected columns of ``table`` with the chosen statistics.

    Missing parameters are filled in through
    ``get_default_from_parameters_if_required``; ``percentile_amounts`` is
    validated against the bounds 0 and 100 and ``trimmed_mean_amounts``
    against 0 and 0.5 (inclusivity is defined by the validator helpers).
    Input column names are then resolved to positional indices, the output
    column names are derived from ``params['statistics']`` and the
    computation is delegated to ``_statistic_summary``.

    Args:
        table: pandas DataFrame containing the input columns.
        group_by: Grouping columns. When not ``None`` the computation is
            dispatched through ``_function_by_group2``.
        **params: ``input_cols``, ``statistics``, ``percentile_amounts``,
            ``trimmed_mean_amounts`` and optionally ``workers``.

    Returns:
        The return value of ``_function_by_group2`` when ``group_by`` is
        given; otherwise the dict returned by ``_statistic_summary`` with
        its ``'out_table'`` entry wrapped in a DataFrame.
    """
    check_required_parameters(_statistic_summary, params, ['table'])
    params = get_default_from_parameters_if_required(params,
                                                     _statistic_summary)
    validate(
        all_elements_from_to(params, 0, 100, 'percentile_amounts'),
        all_elements_from_under(params, 0, 0.5, 'trimmed_mean_amounts'),
    )

    params['column_indices'] = [
        table.columns.get_loc(name) for name in params['input_cols']
    ]

    raw_columns = ['column_name'] + params['statistics'].copy()
    # The 'nrow' statistic is published under the name 'num_of_row'.
    out_columns = [
        'num_of_row' if name == 'nrow' else name for name in raw_columns
    ]

    # Each "amount"-based statistic is replaced by one column per distinct
    # requested amount: (statistic, params key with amounts, name template).
    expansions = (
        ('percentile', 'percentile_amounts', 'percentile_{}'),
        ('trimmed_mean', 'trimmed_mean_amounts', 'trimmed_mean_{}'),
    )
    for stat_name, amounts_key, template in expansions:
        if stat_name not in out_columns:
            continue
        out_columns.remove(stat_name)
        if params[amounts_key] is not None:
            out_columns.extend(
                template.format(_amounts_colname(amount))
                for amount in _unique_list(params[amounts_key]))

    if 'mode' in out_columns:
        # 'mode' is always reported as the last column.
        out_columns.remove('mode')
        out_columns.append('mode')

    if group_by is not None:
        return _function_by_group2(_statistic_summary,
                                   table,
                                   columns=out_columns,
                                   group_by=group_by,
                                   **params)

    # 'workers' is only forwarded on the grouped path.
    params.pop('workers', None)
    result = _statistic_summary(table.values, **params)
    result['out_table'] = pd.DataFrame(result['out_table'],
                                       columns=out_columns)
    return result
Example #4
0
def _summary_output_columns(statistics, params, rename_nrow=False):
    """Return the output column names produced for ``statistics``.

    ``'percentile'`` and ``'trimmed_mean'`` are replaced by one column per
    distinct entry of ``params['percentile_amounts']`` /
    ``params['trimmed_mean_amounts']`` (skipped when that entry is ``None``),
    and ``'mode'`` is moved to the end. With ``rename_nrow`` the ``'nrow'``
    statistic is listed as ``'num_of_row'``.
    """
    columns = ['column_name'] + list(statistics)
    if rename_nrow:
        columns = ['num_of_row' if x == 'nrow' else x for x in columns]
    expansions = (
        ('percentile', 'percentile_amounts', 'percentile_{}'),
        ('trimmed_mean', 'trimmed_mean_amounts', 'trimmed_mean_{}'),
    )
    for stat_name, amounts_key, template in expansions:
        if stat_name in columns:
            columns.remove(stat_name)
            amounts = params[amounts_key]
            if amounts is not None:
                columns.extend(
                    template.format(_amounts_colname(amount))
                    for amount in _unique_list(amounts))
    if 'mode' in columns:
        columns.remove('mode')
        columns.append('mode')
    return columns


def statistic_summary(table, group_by=None, **params):
    """Summarise the selected columns of ``table``, optionally per group.

    Without ``group_by`` the call is forwarded to
    ``_statistic_summary_original``. With ``group_by`` the requested
    statistics are split in two partitions: the ones listed in
    ``groupby_statistics`` below are computed by
    ``_statistic_summary_groupby``; all others are computed by
    ``_statistic_summary_list`` through ``_function_by_group2``. Both partial
    results are aligned on ``group_by + ['column_name']`` and concatenated.

    Null group keys are temporarily replaced by the sentinel ``'\\u0003'`` in
    sentinel-named helper columns; this is done on a shallow copy, so the
    caller's ``table`` is not modified.

    Args:
        table: pandas DataFrame containing the input and grouping columns.
        group_by: List of grouping column names, or ``None``.
        **params: ``input_cols``, ``statistics``, ``percentile_amounts``,
            ``trimmed_mean_amounts`` (defaults are filled in through
            ``get_default_from_parameters_if_required``).

    Returns:
        The return value of ``_statistic_summary_original`` when ``group_by``
        is ``None``; otherwise ``{'out_table': DataFrame}`` with the columns
        ``group_by + ['column_name'] + <statistic columns>``.
    """
    check_required_parameters(_statistic_summary_list, params, ['table'])
    params = get_default_from_parameters_if_required(
        params, _statistic_summary_original)
    param_validation_check = [
        all_elements_from_to(params, 0, 100, 'percentile_amounts'),
        all_elements_from_under(params, 0, 0.5, 'trimmed_mean_amounts')
    ]
    validate(*param_validation_check)
    if group_by is None:
        return _statistic_summary_original(table, **params)

    if pd.isnull(table[group_by]).values.any():
        group_by_unicode = [str(i) + '\u0003' for i in group_by]
        # Add the sentinel columns to a shallow copy so that the caller's
        # DataFrame does not gain helper columns as a side effect.
        table = table.copy(deep=False)
        table[group_by_unicode] = table[group_by].fillna('\u0003')
        group_by = group_by_unicode

    # Statistics handled by _statistic_summary_groupby; everything else goes
    # through _statistic_summary_list.
    groupby_statistics = ('max', 'min', 'range', 'sum', 'avg', 'variance',
                          'stddev', 'nrow', 'num_of_value', 'null_count',
                          'median')
    params1 = {
        'input_cols': params['input_cols'],
        'statistics': [
            st for st in params['statistics'] if st in groupby_statistics
        ],
    }
    params2 = {
        'statistics': [
            st for st in params['statistics'] if st not in groupby_statistics
        ],
    }
    for key in ('percentile', 'trimmed_mean', 'percentile_amounts',
                'trimmed_mean_amounts'):
        if key in params:
            params2[key] = params[key]

    key_columns = group_by + ['column_name']

    if params1['statistics']:
        result1 = _statistic_summary_groupby(table,
                                             group_by=group_by,
                                             **params1)
        result1.index = result1[key_columns]
    else:
        result1 = None

    if params2['statistics']:
        params2['input_cols'] = params['input_cols']
        params2['column_indices'] = [
            table.columns.get_loc(i) for i in params2['input_cols']
        ]
        columns = _summary_output_columns(params2['statistics'],
                                          params2,
                                          rename_nrow=True)
        result2 = _function_by_group2(_statistic_summary_list,
                                      table,
                                      columns=columns,
                                      group_by=group_by,
                                      **params2)['out_table']
        result2.index = result2[key_columns]
        if result1 is not None:
            # Keep the key columns only once (result1 still carries them).
            result2 = result2[[
                i for i in result2.columns if i not in key_columns
            ]]
            # TODO: pass sort=False to pd.concat after upgrading pandas.
            result = pd.concat([result2, result1],
                               axis=1).reset_index(drop=True)
        else:
            result = result2
    else:
        # No list-based statistics: build the (group, column_name) skeleton.
        groups = table[group_by].drop_duplicates().values
        skeleton_rows = [
            list(group) + [col]
            for group in groups
            for col in params['input_cols']
        ]
        result2 = pd.DataFrame(skeleton_rows, columns=key_columns)
        if result1 is None:
            # Neither partition has statistics (empty 'statistics'): return
            # the bare skeleton instead of failing on a None result1.
            result = result2
        else:
            result2.index = result2[key_columns]
            result1 = result1[[
                i for i in result1.columns if i not in key_columns
            ]]
            # TODO: pass sort=False to pd.concat after upgrading pandas.
            result = pd.concat([result2, result1],
                               axis=1).reset_index(drop=True)

    # Restore the column order implied by the requested statistics.
    columns = _summary_output_columns(params['statistics'], params)
    result = result[group_by + columns]
    if '\u0003' in result.values:
        # Turn the null sentinel back into NaN and restore the column names.
        result = result.replace('\u0003', np.nan)
        result.columns = [_remove_unicode(i) for i in result.columns]
    return {'out_table': result}