Example no. 1
0
    def __init__(
        self,
        treatment_col: np.ndarray,
        target: np.ndarray,
        mode: bool,
        task: Task,
        n_folds: int = 5,
    ):
        """Split row indexes into a constant part and a folded part by treatment flag.

        Rows whose treatment flag equals ``mode`` form the constant index set
        (always present); the remaining rows get sklearn-style folds.

        Args:
            treatment_col: Treatment column: 0 - control group, 1 - treatment group.
            target: Target values.
            mode: Which treatment value forms the constant group
                (``True`` - treatment rows are constant, ``False`` - control rows are).
            task: Task object passed through to the fold generator.
            n_folds: Number of folds for the non-constant part.

        """
        self.task = task
        self.n_folds = n_folds
        self.mode = mode

        idx = np.arange(treatment_col.shape[0])
        # Builtin `bool`, not `np.bool`: the alias was deprecated in NumPy 1.20
        # and removed in 1.24, so the old spelling raises AttributeError.
        flg = treatment_col.astype(bool) == self.mode

        self.constant_idx = idx[flg]
        self.splitted_idx = idx[~flg]

        # Folds are computed only over the non-constant rows.
        self.folds = set_sklearn_folds(self.task, target[self.splitted_idx],
                                       self.n_folds)
Example no. 2
0
def get_category_roles_stat(train: NumpyOrPandas,
                            subsample: Optional[Union[float, int]] = 100000,
                            random_state: int = 42,
                            n_jobs: int = 1):
    """Search for optimal processing of categorical values.

    Categorical means defined by user or object types.

    Args:
        train: Dataset.
        subsample: Size of subsample: ``int`` - absolute number of rows,
            ``float`` - fraction of rows, ``None`` - use all rows.
        random_state: Seed of random numbers generator.
        n_jobs: Number of jobs.

    Returns:
        DataFrame with per-feature encoding scores, indexed by feature name.

    """
    roles_to_identify = []
    dtypes = []

    # collect categorical features whose encoding was not fixed by the user
    for f in train.features:
        role = train.roles[f]
        if role.name == 'Category' and role.encoding_type == 'auto':
            roles_to_identify.append(f)
            dtypes.append(role.dtype)

    res = DataFrame(columns=[
        'unique', 'top_freq_values', 'dtype', 'encoded_scores', 'freq_scores',
        'ord_scores'
    ],
                    index=roles_to_identify)

    res['dtype'] = dtypes

    if len(roles_to_identify) == 0:
        return res

    train = train[:, roles_to_identify].to_pandas()

    if train.folds is None:
        # Bugfix: use the caller-provided seed instead of a hard-coded 42,
        # so `random_state` actually controls fold generation.
        train.folds = set_sklearn_folds(train.task,
                                        train.target.values,
                                        cv=5,
                                        random_state=random_state,
                                        group=train.group)

    if subsample is not None:
        # A float subsample is a fraction of the dataset; slicing with a raw
        # float would raise TypeError, so convert to a row count first.
        if isinstance(subsample, float):
            subsample = int(subsample * train.shape[0])
        idx = np.random.RandomState(random_state).permutation(
            train.shape[0])[:subsample]
        train = train[idx]

    # task-specific target representation and score encoder
    target, encoder = get_target_and_encoder(train)

    empty_slice = train.data.isnull().values

    # check label encoded scores
    trf = SequentialTransformer([LabelEncoder(), encoder()])
    res['encoded_scores'] = get_score_from_pipe(train,
                                                target,
                                                pipe=trf,
                                                empty_slice=empty_slice,
                                                n_jobs=n_jobs)

    # check frequency encoding
    trf = FreqEncoder()
    res['freq_scores'] = get_score_from_pipe(train,
                                             target,
                                             pipe=trf,
                                             empty_slice=empty_slice,
                                             n_jobs=n_jobs)

    # check ordinal encoding
    trf = OrdinalEncoder()
    res['ord_scores'] = get_score_from_pipe(train,
                                            target,
                                            pipe=trf,
                                            empty_slice=empty_slice,
                                            n_jobs=n_jobs)

    return res
Example no. 3
0
def get_numeric_roles_stat(train: NumpyOrPandas,
                           subsample: Optional[Union[float, int]] = 100000,
                           random_state: int = 42,
                           manual_roles: Optional[RolesDict] = None,
                           n_jobs: int = 1) -> DataFrame:
    """Calculate statistics about different encodings performances.

    We need it to calculate rules about advanced roles guessing.
    Only for numeric data.

    Args:
        train: Dataset.
        subsample: Size of subsample: ``int`` - absolute number of rows,
            ``float`` - fraction of rows, ``None`` - use all rows.
        random_state: Seed of random numbers generator.
        manual_roles: Dict of roles explicitly set by the user.
        n_jobs: Number of jobs.

    Returns:
        DataFrame with per-feature statistics, indexed by feature name.

    """
    if manual_roles is None:
        manual_roles = {}

    roles_to_identify = []
    flg_manual_set = []
    # collect numeric features and remember which were set manually
    for f in train.features:
        role = train.roles[f]
        if role.name == 'Numeric':
            roles_to_identify.append(f)
            flg_manual_set.append(f in manual_roles)

    res = DataFrame(columns=[
        'flg_manual', 'unique', 'unique_rate', 'top_freq_values', 'raw_scores',
        'binned_scores', 'encoded_scores', 'freq_scores', 'nan_rate'
    ],
                    index=roles_to_identify)
    res['flg_manual'] = flg_manual_set

    if len(roles_to_identify) == 0:
        return res

    train = train[:, roles_to_identify].to_numpy()

    if train.folds is None:
        # Bugfix: use the caller-provided seed instead of a hard-coded 42,
        # so `random_state` actually controls fold generation.
        train.folds = set_sklearn_folds(train.task,
                                        train.target,
                                        cv=5,
                                        random_state=random_state,
                                        group=train.group)

    if subsample is not None:
        # A float subsample is a fraction of the dataset; slicing with a raw
        # float would raise TypeError, so convert to a row count first.
        if isinstance(subsample, float):
            subsample = int(subsample * train.shape[0])
        idx = np.random.RandomState(random_state).permutation(
            train.shape[0])[:subsample]
        train = train[idx]

    data = train.data

    # task-specific target representation and score encoder
    # (replaces the dead `target = train.target` binding, which was
    # immediately overwritten here)
    target, encoder = get_target_and_encoder(train)

    empty_slice = np.isnan(data)

    # check scores as is
    res['raw_scores'] = get_score_from_pipe(train,
                                            target,
                                            empty_slice=empty_slice,
                                            n_jobs=n_jobs)

    # check unique values (NaNs excluded per column)
    unique_values = [
        np.unique(data[:, x][~np.isnan(data[:, x])], return_counts=True)
        for x in range(data.shape[1])
    ]
    # Guard against an all-NaN column: its counts array is empty and the
    # unguarded `max` would raise ValueError.
    top_freq_values = np.array(
        [x[1].max() if x[1].size > 0 else 0 for x in unique_values])
    unique_values = np.array([len(x[0]) for x in unique_values])
    res['unique'] = unique_values
    res['top_freq_values'] = top_freq_values
    res['unique_rate'] = res['unique'] / train.shape[0]

    # check binned categorical score
    trf = SequentialTransformer([QuantileBinning(), encoder()])
    res['binned_scores'] = get_score_from_pipe(train,
                                               target,
                                               pipe=trf,
                                               empty_slice=empty_slice,
                                               n_jobs=n_jobs)

    # check label encoded scores
    trf = SequentialTransformer(
        [ChangeRoles(CategoryRole(np.float32)),
         LabelEncoder(),
         encoder()])
    res['encoded_scores'] = get_score_from_pipe(train,
                                                target,
                                                pipe=trf,
                                                empty_slice=empty_slice,
                                                n_jobs=n_jobs)

    # check frequency encoding
    trf = SequentialTransformer(
        [ChangeRoles(CategoryRole(np.float32)),
         FreqEncoder()])
    res['freq_scores'] = get_score_from_pipe(train,
                                             target,
                                             pipe=trf,
                                             empty_slice=empty_slice,
                                             n_jobs=n_jobs)

    res['nan_rate'] = empty_slice.mean(axis=0)

    return res