Esempio n. 1
0
def __ncp_set_valued(original_series, anonymized_series):
    """
    Compute the NCP of a set-valued attribute by flattening both series.

    Both inputs are flattened with ``flatten_set_valued_series`` and rebuilt
    as pandas Series (categorical when the corresponding input series has a
    categorical dtype); the result is delegated to
    ``__calculate_ncp_attribute``.

    Parameters
    ----------
    original_series: Series
        Set-valued series before anonymization.
    anonymized_series: Series
        Set-valued series after anonymization.

    Returns
    -------
    any
        Whatever ``__calculate_ncp_attribute`` returns for the flattened pair.
    """
    def _as_flat_series(source, values, positions):
        # Only the dtype of the source series decides whether the flattened
        # series is categorical; the flag returned by the flattener is unused.
        series_kwargs = {"index": positions}
        if is_categorical_dtype(source):
            series_kwargs["dtype"] = "category"
        series_kwargs["name"] = source.name
        return pd.Series(values, **series_kwargs)

    # Flatten both inputs before building any Series.
    orig_values, orig_positions, _ = flatten_set_valued_series(original_series)
    anon_values, anon_positions, _ = flatten_set_valued_series(
        anonymized_series)

    flat_original = _as_flat_series(original_series, orig_values,
                                    orig_positions)
    flat_anonymized = _as_flat_series(anonymized_series, anon_values,
                                      anon_positions)
    return __calculate_ncp_attribute(flat_original, flat_anonymized)
Esempio n. 2
0
def recode_set_valued(series, recoding_rules, hierarchies):
    """
    Recode a set-valued series by flattening it first.

    The series is flattened with ``flatten_set_valued_series``, rebuilt as a
    pandas Series (categorical if the input has a categorical dtype or the
    flattener reports category data), and passed to ``recode``.

    Parameters
    ----------
    series: Series
        Series to be recoded.
    recoding_rules: dict
        Dictionary containing recoding rules; forwarded to ``recode``.
    hierarchies: dict
        Dictionary containing generalization hierarchies; forwarded to
        ``recode``.

    Returns
    -------
    any
        First element of the recoded flattened series.
    """
    values, positions, looks_categorical = flatten_set_valued_series(series)

    series_kwargs = {"index": positions}
    if is_categorical_dtype(series) or looks_categorical:
        series_kwargs["dtype"] = "category"
    series_kwargs["name"] = series.name

    recoded = recode(pd.Series(values, **series_kwargs),
                     recoding_rules,
                     hierarchies)
    # Only the first recoded value is returned to the caller.
    return recoded.iloc[0]
Esempio n. 3
0
def __calculate_ncp_attribute(original_series, anonymized_series):
    """
    Dispatch the NCP computation of one attribute to the matching handler.

    The checks run in this order and the first match wins: series that must
    be flattened, hierarchy-anonymized values, datetimes, categoricals,
    numerics, token lists, and finally generic set-valued data.
    (NCP presumably stands for "Normalized Certainty Penalty" -- confirm
    against the module documentation.)

    Parameters
    ----------
    original_series: Series
        Attribute values before anonymization.
    anonymized_series: Series
        Attribute values after anonymization.

    Returns
    -------
    any
        The value produced by the selected handler.
    """
    if must_be_flattened(original_series):
        values, positions, looks_categorical = flatten_set_valued_series(
            original_series)
        series_kwargs = {"index": positions}
        if is_categorical_dtype(original_series) or looks_categorical:
            series_kwargs["dtype"] = "category"
        series_kwargs["name"] = original_series.name
        # Re-dispatch with the flattened original; the anonymized series is
        # passed through unchanged.
        return __calculate_ncp_attribute(pd.Series(values, **series_kwargs),
                                         anonymized_series)

    if is_node(anonymized_series):  # Has been anonymized using a hierarchy
        return __ncp_numerical_hierarchy(original_series, anonymized_series)

    if is_datetime64_any_dtype(original_series):
        return __ncp_date(original_series, anonymized_series)

    if is_categorical_dtype(original_series):
        return __ncp_categorical(original_series, anonymized_series)

    if is_numeric_dtype(original_series):
        return __ncp_numerical(original_series, anonymized_series)

    if is_token_list(original_series):
        return __ncp_tokens(original_series, anonymized_series)

    # Fallback: treat the attribute as set-valued.
    return __ncp_set_valued(original_series, anonymized_series)
Esempio n. 4
0
def __get_attribute_span(series):
    """
    Return the span of an attribute.

    Parameters
    ----------
    series: Series
        Attribute values.

    Returns
    -------
    any
        * categorical dtype: number of distinct values,
        * datetime dtype: whole days between the maximum and the minimum,
        * numeric dtype: maximum minus minimum,
        * anything else: the series is flattened, aggregated per ``id`` and
          the span of the aggregated series is returned.
    """
    if is_categorical_dtype(series):
        return len(series.unique())

    if is_datetime64_any_dtype(series):
        return (series.max() - series.min()).days

    if is_numeric_dtype(series):
        return series.max() - series.min()

    # Set-valued data: flatten, collapse each id to one value, then recurse.
    values, positions, looks_categorical = flatten_set_valued_series(series)
    series_kwargs = {"index": positions, "name": series.name}
    if looks_categorical:
        series_kwargs["dtype"] = "category"
    flat = pd.Series(values, **series_kwargs)
    flat.index.name = "id"

    per_id = flat.groupby(by="id")
    if looks_categorical:
        collapsed = per_id.agg(agg_categorical).astype('category')
    else:
        collapsed = per_id.agg(agg_mean)
    return __get_attribute_span(collapsed)
Esempio n. 5
0
def __split_partition(series):
    """
    Split the index of a series into two parts.

    Parameters
    ----------
    series: Series
        Attribute values of the partition to split.

    Returns
    -------
    tuple
        Pair ``(left, right)`` of index objects.

        * categorical or datetime dtype: the sorted distinct values are cut
          in half; rows holding a value of the first half go left, the rest
          go right,
        * numeric dtype: rows below the median go left, rows at or above
          the median go right,
        * anything else: the series is flattened, aggregated per ``id`` and
          the aggregated series is split instead.
    """
    if is_categorical_dtype(series) or is_datetime64_any_dtype(series):
        ordered = series.sort_values().unique()
        half = len(ordered) // 2
        lower = set(ordered[:half])
        upper = set(ordered[half:])
        return series.index[series.isin(lower)], series.index[series.isin(upper)]

    if is_numeric_dtype(series):
        pivot = series.median()
        left = series.index[series < pivot]
        # Explicit >= comparison (not a negated mask) so rows that compare
        # false on both sides, e.g. NaN, end up in neither part.
        right = series.index[series >= pivot]
        return (left, right)

    # Set-valued data: flatten, collapse each id to one value, then recurse.
    values, positions, looks_categorical = flatten_set_valued_series(series)
    series_kwargs = {"index": positions, "name": series.name}
    if looks_categorical:
        series_kwargs["dtype"] = "category"
    flat = pd.Series(values, **series_kwargs)
    flat.index.name = "id"

    per_id = flat.groupby(by="id")
    if looks_categorical:
        collapsed = per_id.agg(agg_categorical).astype('category')
    else:
        collapsed = per_id.agg(agg_mean)
    return __split_partition(collapsed)