def rule_based_roles_guess(stat: DataFrame) -> Dict[str, ColumnRole]:
    """Build a feature-to-role mapping from precomputed statistics.

    Rows of ``stat`` for which at least one column whose name contains
    ``"rule_"`` is truthy are handled as numbers; every other row is handled
    as a category.

    Args:
        stat: DataFrame whose index labels become the keys of the result.
            The columns ``rule_7``, ``binned_scores``, ``raw_scores``,
            ``unique``, ``freq_scores``, ``encoded_scores`` and
            ``unique_rate`` are read.

    Returns:
        Dict mapping each index label of ``stat`` to a ``NumericRole`` or a
        ``CategoryRole``.

    """
    rule_columns = [col for col in stat.columns if "rule_" in col]
    numbers = stat[stat[rule_columns].any(axis=1)]
    categories = stat.drop(numbers.index)

    # Rules that decide how each feature is handled.
    discrete_mask = (~numbers["rule_7"]) & ((numbers["binned_scores"] / numbers["raw_scores"]) > 2)
    int_mask = categories["unique"] < 10
    freq_mask = (categories["freq_scores"] / categories["encoded_scores"]) > 1.3
    ordinal_flags = categories["unique_rate"] > 0.01

    roles = {}

    # Numbers that should be discretized; all of them share one role object.
    discrete_role = NumericRole(np.float32, discretization=True)
    roles.update(dict.fromkeys(numbers[discrete_mask].index, discrete_role))

    # Plain numbers; again a single shared role object.
    plain_role = NumericRole(np.float32)
    roles.update(dict.fromkeys(numbers[~discrete_mask].index, plain_role))

    def _assign_categories(selector, encoding):
        # One fresh CategoryRole per selected feature, carrying its own
        # ordinal flag.
        names = categories[selector].index
        flags = ordinal_flags[selector].values
        for name, flag in zip(names, flags):
            roles[name] = CategoryRole(np.float32, encoding_type=encoding, ordinal=flag)

    # Order matters: a feature matching both the "int" and the "freq" rule
    # ends up with the "freq" encoding because it is assigned last.
    _assign_categories(int_mask, "int")
    _assign_categories(freq_mask, "freq")
    # Everything that matched neither rule.
    _assign_categories(~(freq_mask | int_mask), "auto")

    return roles
def rule_based_cat_handler_guess(stat: DataFrame) -> Dict[str, ColumnRole]:
    """Build a feature-to-``CategoryRole`` mapping from rule columns.

    A row of ``stat`` is selected for an encoding when any column whose name
    contains the matching marker (``"freq_rule_"``, ``"auto_rule_"`` or
    ``"ord_rule_"``) is truthy. Rows matching no marker get no entry.

    Args:
        stat: DataFrame whose index labels become the keys of the result.
            The ``dtype`` column supplies the dtype passed to each role.

    Returns:
        Dict mapping index labels of ``stat`` to ``CategoryRole`` objects.

    """

    def _rows_flagged_by(marker):
        # Rows where at least one column containing `marker` is truthy.
        flag_columns = [col for col in stat.columns if marker in col]
        return stat[stat[flag_columns].any(axis=1)]

    # (column-name marker, encoding_type, ordinal). The "ord" rules map to
    # the "auto" encoding with ordinal=True. Later entries overwrite earlier
    # ones for features that match several markers.
    plan = (
        ("freq_rule_", "freq", False),
        ("auto_rule_", "auto", False),
        ("ord_rule_", "auto", True),
    )
    selections = [(_rows_flagged_by(marker), encoding, is_ordinal) for marker, encoding, is_ordinal in plan]

    roles = {}
    for subset, encoding, is_ordinal in selections:
        for name, dtype in zip(subset.index, subset["dtype"]):
            roles[name] = CategoryRole(dtype=dtype, encoding_type=encoding, ordinal=is_ordinal)

    return roles
def sampled_app_roles():
    """Return a fixed mapping from role objects to column name(s).

    Returns:
        Dict keyed by freshly created role instances. Each value is either a
        single column name or a list of column names.

    """
    role_to_columns = [
        (TargetRole(), "TARGET"),
        (CategoryRole(dtype=str), ["NAME_CONTRACT_TYPE", "NAME_TYPE_SUITE"]),
        (NumericRole(np.float32), ["AMT_CREDIT", "AMT_GOODS_PRICE"]),
        (DatetimeRole(seasonality=["y", "m", "wd"]), ["BIRTH_DATE", "EMP_DATE"]),
        (FoldsRole(), "__fold__"),
    ]
    return dict(role_to_columns)
def get_numeric_roles_stat(
    train: NumpyOrPandas,
    subsample: Optional[Union[float, int]] = 100000,
    random_state: int = 42,
    manual_roles: Optional[RolesDict] = None,
    n_jobs: int = 1,
) -> DataFrame:
    """Calculate statistics about the performance of different encodings.

    The statistics are used by the rules for advanced roles guessing. Only
    features whose role name is ``'Numeric'`` are analysed.

    Args:
        train: Dataset.
        subsample: Size of the subsample; ``None`` disables subsampling.
        random_state: Seed for the subsample permutation.
        manual_roles: Roles set by the user; only membership of a feature
            name in it is checked (to fill ``flg_manual``).
        n_jobs: Passed through to ``get_score_from_pipe``.

    Returns:
        DataFrame indexed by the analysed feature names with the columns
        ``flg_manual``, ``unique``, ``unique_rate``, ``top_freq_values``,
        ``raw_scores``, ``binned_scores``, ``encoded_scores``,
        ``freq_scores`` and ``nan_rate``. When no feature is numeric, only
        ``flg_manual`` is filled and the frame has no rows.

    """
    if manual_roles is None:
        manual_roles = {}

    roles_to_identify = []
    flg_manual_set = []
    # Collect numeric features and remember which ones were set manually.
    for f in train.features:
        role = train.roles[f]
        if role.name == 'Numeric':
            roles_to_identify.append(f)
            flg_manual_set.append(f in manual_roles)

    res = DataFrame(
        columns=[
            'flg_manual',
            'unique',
            'unique_rate',
            'top_freq_values',
            'raw_scores',
            'binned_scores',
            'encoded_scores',
            'freq_scores',
            'nan_rate',
        ],
        index=roles_to_identify,
    )
    res['flg_manual'] = flg_manual_set

    if not roles_to_identify:
        return res

    train = train[:, roles_to_identify].to_numpy()

    if train.folds is None:
        # NOTE(review): the fold seed is fixed at 42 and does not follow
        # `random_state` -- confirm this is intended.
        train.folds = set_sklearn_folds(train.task, train.target, cv=5, random_state=42, group=train.group)

    if subsample is not None:
        idx = np.random.RandomState(random_state).permutation(train.shape[0])[:subsample]
        train = train[idx]

    data = train.data

    # Task-specific target and target-encoder class.
    target, encoder = get_target_and_encoder(train)

    empty_slice = np.isnan(data)

    # Score of the raw values.
    res['raw_scores'] = get_score_from_pipe(train, target, empty_slice=empty_slice, n_jobs=n_jobs)

    # Unique non-NaN values and their counts, per column.
    value_counts = [
        np.unique(data[:, col][~empty_slice[:, col]], return_counts=True)
        for col in range(data.shape[1])
    ]
    # A column that is entirely NaN has no counts at all; report 0 for it
    # instead of letting max() fail on an empty sequence.
    top_freq_values = np.array([max(counts, default=0) for _, counts in value_counts])
    unique_values = np.array([len(values) for values, _ in value_counts])
    res['unique'] = unique_values
    res['top_freq_values'] = top_freq_values
    res['unique_rate'] = res['unique'] / train.shape[0]

    # Score after quantile binning followed by the target encoder.
    trf = SequentialTransformer([QuantileBinning(), encoder()])
    res['binned_scores'] = get_score_from_pipe(train, target, pipe=trf, empty_slice=empty_slice, n_jobs=n_jobs)

    # Score after label encoding followed by the target encoder.
    trf = SequentialTransformer([ChangeRoles(CategoryRole(np.float32)), LabelEncoder(), encoder()])
    res['encoded_scores'] = get_score_from_pipe(train, target, pipe=trf, empty_slice=empty_slice, n_jobs=n_jobs)

    # Score after frequency encoding.
    trf = SequentialTransformer([ChangeRoles(CategoryRole(np.float32)), FreqEncoder()])
    res['freq_scores'] = get_score_from_pipe(train, target, pipe=trf, empty_slice=empty_slice, n_jobs=n_jobs)

    res['nan_rate'] = empty_slice.mean(axis=0)

    return res
data["DAYS_EMPLOYED"], None, 0).astype(np.dtype("timedelta64[D]")) data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True) # Create folds print("Create folds") data["__fold__"] = np.random.randint(0, 5, len(data)) # Print data head print("Print data head") print(data.head()) # # Set roles for columns print("Set roles for columns") check_roles = { TargetRole(): "TARGET", CategoryRole(dtype=str): ["NAME_CONTRACT_TYPE", "NAME_TYPE_SUITE"], NumericRole(np.float32): ["AMT_CREDIT", "AMT_GOODS_PRICE"], DatetimeRole(seasonality=["y", "m", "wd"]): ["BIRTH_DATE", "EMP_DATE"], FoldsRole(): "__fold__", } # create Task task = Task("binary") # # Creating PandasDataSet print("Creating PandasDataset") start_time = time.time() pd_dataset = PandasDataset(data, roles_parser(check_roles), task=task) print("PandasDataset created. Time = {:.3f} sec".format(time.time() - start_time)) # # Print pandas dataset feature roles