Ejemplo n.º 1
0
def rule_based_roles_guess(stat: DataFrame) -> Dict[str, ColumnRole]:
    """Assign a role to every feature described in ``stat``.

    Rows for which any ``rule_*`` column is truthy are treated as numbers;
    all remaining rows are treated as categories. Each group is then split
    further by thresholds on the score and cardinality columns.

    Args:
        stat: DataFrame indexed by feature name. Must provide the ``rule_*``
            columns (including ``rule_7``) and the ``binned_scores``,
            ``raw_scores``, ``unique``, ``freq_scores``, ``encoded_scores``
            and ``unique_rate`` columns.

    Returns:
        Dict mapping feature name to its guessed role.

    """
    rule_columns = [col for col in stat.columns if "rule_" in col]
    numbers = stat[stat[rule_columns].any(axis=1)].copy()
    categories = stat.drop(numbers.index)

    # numeric split: discretize when binning more than doubles the raw score
    # and rule_7 did not fire
    binned_gain = numbers["binned_scores"] / numbers["raw_scores"]
    is_discrete = (~numbers["rule_7"]) & (binned_gain > 2)

    # categorical split masks
    is_low_cardinal = categories["unique"] < 10
    is_freq = (categories["freq_scores"] / categories["encoded_scores"]) > 1.3
    is_ordinal = categories["unique_rate"] > 0.01

    roles_dict = {}

    # numbers with discrete features (one role object shared by all of them)
    discrete_role = NumericRole(np.float32, discretization=True)
    roles_dict.update(dict.fromkeys(numbers[is_discrete].index, discrete_role))

    # classic numbers (again a single shared role object)
    plain_role = NumericRole(np.float32)
    roles_dict.update(dict.fromkeys(numbers[~is_discrete].index, plain_role))

    # Categories are processed in a fixed order: low-cardinality ("int"),
    # then frequency-encoded ("freq"), then everything left ("auto").
    # A feature matching both of the first two ends up with "freq", because
    # the later pass overwrites the earlier entry.
    passes = (
        ("int", is_low_cardinal),
        ("freq", is_freq),
        ("auto", (~is_freq) & (~is_low_cardinal)),
    )
    for encoding, mask in passes:
        names = categories[mask].index
        ordinal_flags = is_ordinal[mask].values
        for name, flag in zip(names, ordinal_flags):
            roles_dict[name] = CategoryRole(np.float32, encoding_type=encoding, ordinal=flag)

    return roles_dict
Ejemplo n.º 2
0
def rule_based_cat_handler_guess(stat: DataFrame) -> Dict[str, ColumnRole]:
    """Pick a category encoding for each feature from its rule flags.

    Args:
        stat: DataFrame indexed by feature name, with a ``dtype`` column and
            boolean-like ``freq_rule_*``, ``auto_rule_*`` and ``ord_rule_*``
            columns.

    Returns:
        Dict mapping feature name to a ``CategoryRole``.

    """

    def rows_flagged(marker):
        # rows where at least one column whose name contains `marker` is truthy
        flag_columns = [col for col in stat.columns if marker in col]
        return stat[stat[flag_columns].any(axis=1)]

    # (encoding type, ordinal flag, matching rows). Order matters: a feature
    # matched by several groups keeps the role from the last group listed.
    # The "ord" group is encoded as "auto" with ordinal=True.
    plan = (
        ("freq", False, rows_flagged("freq_rule_")),
        ("auto", False, rows_flagged("auto_rule_")),
        ("auto", True, rows_flagged("ord_rule_")),
    )

    roles_dict = {}
    for encoding, is_ordinal, subset in plan:
        names = list(subset.index)
        column_dtypes = list(subset["dtype"])
        for name, column_dtype in zip(names, column_dtypes):
            roles_dict[name] = CategoryRole(dtype=column_dtype, encoding_type=encoding, ordinal=is_ordinal)

    return roles_dict
Ejemplo n.º 3
0
def sampled_app_roles():
    """Return the role-to-column(s) mapping for the sampled application data.

    Keys are role objects; values are a single column name or a list of
    column names that take that role.
    """
    roles = {}
    roles[TargetRole()] = "TARGET"
    roles[CategoryRole(dtype=str)] = ["NAME_CONTRACT_TYPE", "NAME_TYPE_SUITE"]
    roles[NumericRole(np.float32)] = ["AMT_CREDIT", "AMT_GOODS_PRICE"]
    roles[DatetimeRole(seasonality=["y", "m", "wd"])] = ["BIRTH_DATE", "EMP_DATE"]
    roles[FoldsRole()] = "__fold__"
    return roles
Ejemplo n.º 4
0
def get_numeric_roles_stat(train: NumpyOrPandas,
                           subsample: Optional[Union[float, int]] = 100000,
                           random_state: int = 42,
                           manual_roles: Optional[RolesDict] = None,
                           n_jobs: int = 1) -> DataFrame:
    """Calculate statistics about different encodings performances.

    We need it to calculate rules about advanced roles guessing.
    Only for numeric data: only features whose role name is ``'Numeric'``
    are analysed.

    Args:
        train: Dataset.
        subsample: Number of rows to keep after a seeded random permutation;
            ``None`` keeps all rows. The value is used directly as a slice
            bound. NOTE(review): the annotation allows ``float``, but a
            float slice bound raises ``TypeError`` - confirm intended
            semantics.
        random_state: Seed for the subsampling permutation.
        manual_roles: Roles set by the user; only membership of a feature
            name is checked here (stored in ``flg_manual``).
        n_jobs: Passed through to ``get_score_from_pipe``.

    Returns:
        DataFrame indexed by numeric feature name with columns
        ``flg_manual``, ``unique``, ``unique_rate``, ``top_freq_values``,
        ``raw_scores``, ``binned_scores``, ``encoded_scores``,
        ``freq_scores`` and ``nan_rate``. When there are no numeric
        features, the empty frame is returned immediately.

    """
    if manual_roles is None:
        manual_roles = {}

    # keep only features whose current role is numeric
    roles_to_identify = [
        f for f in train.features if train.roles[f].name == 'Numeric'
    ]
    flg_manual_set = [f in manual_roles for f in roles_to_identify]

    res = DataFrame(columns=[
        'flg_manual', 'unique', 'unique_rate', 'top_freq_values', 'raw_scores',
        'binned_scores', 'encoded_scores', 'freq_scores', 'nan_rate'
    ],
                    index=roles_to_identify)
    res['flg_manual'] = flg_manual_set

    if not roles_to_identify:
        return res

    train = train[:, roles_to_identify].to_numpy()

    if train.folds is None:
        # NOTE(review): the fold seed is fixed at 42 and does not follow
        # `random_state` - confirm whether that is intentional.
        train.folds = set_sklearn_folds(train.task,
                                        train.target,
                                        cv=5,
                                        random_state=42,
                                        group=train.group)

    if subsample is not None:
        idx = np.random.RandomState(random_state).permutation(
            train.shape[0])[:subsample]
        train = train[idx]

    data = train.data

    # check task specific
    target, encoder = get_target_and_encoder(train)

    # True where a value is NaN; reused below for the per-column statistics
    empty_slice = np.isnan(data)

    # check scores as is
    res['raw_scores'] = get_score_from_pipe(train,
                                            target,
                                            empty_slice=empty_slice,
                                            n_jobs=n_jobs)

    # check unique values (NaNs excluded)
    unique_values = [
        np.unique(data[:, x][~empty_slice[:, x]], return_counts=True)
        for x in range(data.shape[1])
    ]
    # default=0 keeps an all-NaN column (no values, so no counts) from
    # raising "max() arg is an empty sequence"
    top_freq_values = np.array([max(x[1], default=0) for x in unique_values])
    unique_values = np.array([len(x[0]) for x in unique_values])
    res['unique'] = unique_values
    res['top_freq_values'] = top_freq_values
    res['unique_rate'] = res['unique'] / train.shape[0]

    # check binned categorical score
    trf = SequentialTransformer([QuantileBinning(), encoder()])
    res['binned_scores'] = get_score_from_pipe(train,
                                               target,
                                               pipe=trf,
                                               empty_slice=empty_slice,
                                               n_jobs=n_jobs)

    # check label encoded scores
    trf = SequentialTransformer(
        [ChangeRoles(CategoryRole(np.float32)),
         LabelEncoder(),
         encoder()])
    res['encoded_scores'] = get_score_from_pipe(train,
                                                target,
                                                pipe=trf,
                                                empty_slice=empty_slice,
                                                n_jobs=n_jobs)

    # check frequency encoding
    trf = SequentialTransformer(
        [ChangeRoles(CategoryRole(np.float32)),
         FreqEncoder()])
    res['freq_scores'] = get_score_from_pipe(train,
                                             target,
                                             pipe=trf,
                                             empty_slice=empty_slice,
                                             n_jobs=n_jobs)

    # share of NaN values per column
    res['nan_rate'] = empty_slice.mean(axis=0)

    return res
Ejemplo n.º 5
0
    data["DAYS_EMPLOYED"], None, 0).astype(np.dtype("timedelta64[D]"))
# Remove the raw day-offset columns from `data` in place.
data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True)

# Create folds: one random integer in [0, 5) per row.
# NOTE(review): no seed is set in this fragment, so the assignment may differ
# between runs unless the RNG is seeded earlier in the script.
print("Create folds")
data["__fold__"] = np.random.randint(0, 5, len(data))

# Print data head
print("Print data head")
print(data.head())

# Set roles for columns: each key is a role object, each value is the column
# name (or list of column names) that takes that role.
print("Set roles for columns")
check_roles = {
    TargetRole(): "TARGET",
    CategoryRole(dtype=str): ["NAME_CONTRACT_TYPE", "NAME_TYPE_SUITE"],
    NumericRole(np.float32): ["AMT_CREDIT", "AMT_GOODS_PRICE"],
    DatetimeRole(seasonality=["y", "m", "wd"]): ["BIRTH_DATE", "EMP_DATE"],
    FoldsRole(): "__fold__",
}

# Create the task description ("binary").
task = Task("binary")
# Create the PandasDataset from the frame and the parsed roles, and report
# how long the construction took.
print("Creating PandasDataset")
start_time = time.time()
pd_dataset = PandasDataset(data, roles_parser(check_roles), task=task)
print("PandasDataset created. Time = {:.3f} sec".format(time.time() -
                                                        start_time))

# # Print pandas dataset feature roles