def __init__(
    self,
    treatment_col: np.ndarray,
    target: np.ndarray,
    mode: bool,
    task: Task,
    n_folds: int = 5,
):
    """Generates time series data split. Sorter - include left, exclude right.

    Args:
        treatment_col: Treatment column: 0 - control group, 1 - treatment group.
        target: Target values.
        mode: Flag: rows whose treatment flag equals ``mode`` are kept constant,
            the rest are split into folds.
        task: Task.
        n_folds: Number of folds for the splitted part.

    """
    self.task = task
    self.n_folds = n_folds
    self.mode = mode

    idx = np.arange(treatment_col.shape[0])
    # FIX: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `bool` is the documented replacement and behaves identically here.
    flg = treatment_col.astype(bool) == self.mode
    # rows matching `mode` stay fixed; the others are distributed over folds
    self.constant_idx = idx[flg]
    self.splitted_idx = idx[~flg]
    self.folds = set_sklearn_folds(self.task, target[self.splitted_idx], self.n_folds)
def get_category_roles_stat(train: NumpyOrPandas,
                            subsample: Optional[Union[float, int]] = 100000,
                            random_state: int = 42,
                            n_jobs: int = 1):
    """Search for optimal processing of categorical values.

    Categorical means defined by user or object types.

    Args:
        train: Dataset.
        subsample: Size of subsample: ``int`` is an absolute number of rows,
            ``float`` is a fraction of rows. ``None`` - use all rows.
        random_state: Seed of random numbers generator used for subsampling.
        n_jobs: Number of jobs.

    Returns:
        DataFrame with per-feature scores for each candidate encoding.

    """
    roles_to_identify = []
    dtypes = []

    # collect candidate features: user/object categories with auto encoding
    for f in train.features:
        role = train.roles[f]
        if role.name == 'Category' and role.encoding_type == 'auto':
            roles_to_identify.append(f)
            dtypes.append(role.dtype)

    res = DataFrame(
        columns=['unique', 'top_freq_values', 'dtype', 'encoded_scores', 'freq_scores', 'ord_scores'],
        index=roles_to_identify,
    )
    res['dtype'] = dtypes

    if len(roles_to_identify) == 0:
        return res

    train = train[:, roles_to_identify].to_pandas()

    if train.folds is None:
        # NOTE(review): fold seed is fixed to 42 rather than `random_state` —
        # presumably for fold reproducibility independent of subsampling; confirm intent.
        train.folds = set_sklearn_folds(train.task, train.target.values, cv=5, random_state=42, group=train.group)

    if subsample is not None:
        # FIX: the declared `float` subsample previously crashed at slicing;
        # interpret a float as a fraction of rows, an int as an absolute count.
        if isinstance(subsample, float):
            subsample = int(train.shape[0] * subsample)
        idx = np.random.RandomState(random_state).permutation(train.shape[0])[:subsample]
        train = train[idx]

    # task-specific target representation and target-based encoder factory
    target, encoder = get_target_and_encoder(train)

    empty_slice = train.data.isnull().values

    # check label encoded scores
    trf = SequentialTransformer([LabelEncoder(), encoder()])
    res['encoded_scores'] = get_score_from_pipe(train, target, pipe=trf, empty_slice=empty_slice, n_jobs=n_jobs)

    # check frequency encoding
    trf = FreqEncoder()
    res['freq_scores'] = get_score_from_pipe(train, target, pipe=trf, empty_slice=empty_slice, n_jobs=n_jobs)

    # check ordinal encoding
    trf = OrdinalEncoder()
    res['ord_scores'] = get_score_from_pipe(train, target, pipe=trf, empty_slice=empty_slice, n_jobs=n_jobs)

    return res
def get_numeric_roles_stat(train: NumpyOrPandas,
                           subsample: Optional[Union[float, int]] = 100000,
                           random_state: int = 42,
                           manual_roles: Optional[RolesDict] = None,
                           n_jobs: int = 1) -> DataFrame:
    """Calculate statistics about different encodings performances.

    We need it to calculate rules about advanced roles guessing.
    Only for numeric data.

    Args:
        train: Dataset.
        subsample: Size of subsample: ``int`` is an absolute number of rows,
            ``float`` is a fraction of rows. ``None`` - use all rows.
        random_state: Seed of random numbers generator used for subsampling.
        manual_roles: Dict of user-defined roles; flags features set manually.
        n_jobs: Number of jobs.

    Returns:
        DataFrame with per-feature stats and scores for each candidate encoding.

    """
    if manual_roles is None:
        manual_roles = {}

    roles_to_identify = []
    flg_manual_set = []

    # collect numeric candidates and remember which were set manually
    for f in train.features:
        role = train.roles[f]
        if role.name == 'Numeric':
            roles_to_identify.append(f)
            flg_manual_set.append(f in manual_roles)

    res = DataFrame(
        columns=['flg_manual', 'unique', 'unique_rate', 'top_freq_values', 'raw_scores',
                 'binned_scores', 'encoded_scores', 'freq_scores', 'nan_rate'],
        index=roles_to_identify,
    )
    res['flg_manual'] = flg_manual_set

    if len(roles_to_identify) == 0:
        return res

    train = train[:, roles_to_identify].to_numpy()

    if train.folds is None:
        # NOTE(review): fold seed is fixed to 42 rather than `random_state` —
        # presumably for fold reproducibility independent of subsampling; confirm intent.
        train.folds = set_sklearn_folds(train.task, train.target, cv=5, random_state=42, group=train.group)

    if subsample is not None:
        # FIX: the declared `float` subsample previously crashed at slicing;
        # interpret a float as a fraction of rows, an int as an absolute count.
        if isinstance(subsample, float):
            subsample = int(train.shape[0] * subsample)
        idx = np.random.RandomState(random_state).permutation(train.shape[0])[:subsample]
        train = train[idx]

    data = train.data

    # task-specific target representation and target-based encoder factory
    target, encoder = get_target_and_encoder(train)

    empty_slice = np.isnan(data)

    # check scores as is
    res['raw_scores'] = get_score_from_pipe(train, target, empty_slice=empty_slice, n_jobs=n_jobs)

    # per-column unique values (NaNs excluded) with their counts
    unique_values = [
        np.unique(data[:, x][~np.isnan(data[:, x])], return_counts=True)
        for x in range(data.shape[1])
    ]
    # FIX: `default=0` guards against all-NaN columns, whose unique set is empty
    # and would make a bare max() raise ValueError.
    top_freq_values = np.array([max(x[1], default=0) for x in unique_values])
    unique_values = np.array([len(x[0]) for x in unique_values])
    res['unique'] = unique_values
    res['top_freq_values'] = top_freq_values
    res['unique_rate'] = res['unique'] / train.shape[0]

    # check binned categorical score
    trf = SequentialTransformer([QuantileBinning(), encoder()])
    res['binned_scores'] = get_score_from_pipe(train, target, pipe=trf, empty_slice=empty_slice, n_jobs=n_jobs)

    # check label encoded scores
    trf = SequentialTransformer(
        [ChangeRoles(CategoryRole(np.float32)), LabelEncoder(), encoder()])
    res['encoded_scores'] = get_score_from_pipe(train, target, pipe=trf, empty_slice=empty_slice, n_jobs=n_jobs)

    # check frequency encoding
    trf = SequentialTransformer(
        [ChangeRoles(CategoryRole(np.float32)), FreqEncoder()])
    res['freq_scores'] = get_score_from_pipe(train, target, pipe=trf, empty_slice=empty_slice, n_jobs=n_jobs)

    res['nan_rate'] = empty_slice.mean(axis=0)

    return res