Example #1
def _fill_remaining_gaps(data: InputData, operation_type: str):
    """ Function for filling in the nans in the table with features """
    # TODO discuss: move this "filling" to the chain method - we use such method too much here (for all tables)
    #  np.isnan(features).any() and np.isnan(features) doesn't work with non-numeric arrays
    features = data.features
    is_operation_not_for_text = operation_type != 'text_clean'
    if data.data_type == DataTypesEnum.table and is_operation_not_for_text:
        # Get indices of columns containing string objects
        categorical_ids, _ = OneHotEncodingImplementation.str_columns_check(
            features)

        # Apply the mean (no categorical columns) or most_frequent (categorical present) strategy
        if len(categorical_ids) == 0:
            data.features = SimpleImputer().fit_transform(features)
        else:
            data.features = SimpleImputer(
                strategy='most_frequent').fit_transform(features)
    return data
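
The mean vs. most_frequent branching above can be reproduced with scikit-learn alone; the sketch below (values are made up for illustration) shows both strategies on small arrays:

import numpy as np
from sklearn.impute import SimpleImputer

numeric = np.array([[1.0, np.nan],
                    [3.0, 4.0]])
# default strategy is 'mean': the NaN becomes the column mean (4.0)
print(SimpleImputer().fit_transform(numeric))

mixed = np.array([['a', 2.0],
                  ['a', np.nan],
                  ['b', 5.0]], dtype=object)
# 'most_frequent' also handles object/string columns
print(SimpleImputer(strategy='most_frequent').fit_transform(mixed))
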
Example #2
def _fill_remaining_gaps(data: InputData, operation_type: str):
    """ Function for filling in the nans in the table with features """
    # TODO discuss: move this "filling" to the pipeline method - we use such method too much here (for all tables)
    #  np.isnan(features).any() and np.isnan(features) doesn't work with non-numeric arrays
    features = data.features

    if data.data_type == DataTypesEnum.table and data.task.task_type != TaskTypesEnum.ts_forecasting:
        # Get indices of columns containing string objects
        categorical_ids, _ = OneHotEncodingImplementation.str_columns_check(
            features)

        # Apply the mean (no categorical columns) or most_frequent (categorical present) strategy
        if len(categorical_ids) == 0:
            data.features = ImputationImplementation().fit_transform(
                data).predict
        else:
            data.features = ImputationImplementation(
                strategy='most_frequent').fit_transform(data).predict
    return data
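
Both examples rely on OneHotEncodingImplementation.str_columns_check to find categorical columns. The real helper is FEDOT-specific; the rough sketch below only illustrates the idea (the names and the first-row type check are assumptions, not the library code):

import numpy as np

def str_columns_check(features):
    """ Return ids of columns holding strings and ids of the remaining columns (illustrative only) """
    categorical_ids = []
    non_categorical_ids = []
    for column_id in range(features.shape[1]):
        # treat a column as categorical if its first value is a string (assumption for this sketch)
        if isinstance(features[0, column_id], str):
            categorical_ids.append(column_id)
        else:
            non_categorical_ids.append(column_id)
    return categorical_ids, non_categorical_ids

table = np.array([['red', 1.0], ['blue', np.nan]], dtype=object)
categorical_ids, numeric_ids = str_columns_check(table)  # -> [0], [1]
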
Example #3
    def _preprocess(self, data: InputData):
        preprocessing_func = preprocessing_func_for_data(data, self)

        if not self.cache.actual_cached_state:
            # if fitted preprocessor not found in cache
            preprocessing_strategy = \
                preprocessing_func().fit(data.features)
        else:
            # if fitted preprocessor already exists
            preprocessing_strategy = self.cache.actual_cached_state.preprocessor

        data.features = preprocessing_strategy.apply(data.features)

        return data, preprocessing_strategy
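
The branch above implements a fit-once-then-reuse pattern around a fitted preprocessor. A minimal sketch of the same idea, assuming a plain dict as the cache and scikit-learn's StandardScaler in place of the FEDOT-specific strategy (both are stand-ins, not the library code):

import numpy as np
from sklearn.preprocessing import StandardScaler

_cache = {}

def preprocess(features, cache_key='preprocessor'):
    if cache_key not in _cache:
        # no fitted preprocessor in the cache - fit a new one and store it
        _cache[cache_key] = StandardScaler().fit(features)
    # reuse the fitted preprocessor on subsequent calls
    strategy = _cache[cache_key]
    return strategy.transform(features), strategy

features = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
scaled, fitted_strategy = preprocess(features)
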
Example #4
def _prepare_exog_features(data_for_prediction: InputData,
                           exog_data: InputData, last_prediction: np.ndarray,
                           forecast_step: int,
                           forecast_length: int) -> InputData:
    new_features = []
    if len(data_for_prediction.features.shape) == 1:
        # if one exog feature
        exog_features_num = 1
    else:
        # if several exog features
        exog_features_num = data_for_prediction.features.shape[1]

    new_part_len = 0
    if exog_features_num > 1:
        for exog_feat_id in range(exog_features_num):
            exog_feature = data_for_prediction.features[:, exog_feat_id]
            new_exog_values = \
                exog_data.features[forecast_step * forecast_length:
                                   ((forecast_step + 1) * forecast_length), exog_feat_id]
            new_feature = np.append(exog_feature, new_exog_values)
            new_features.append(new_feature)
            new_part_len = len(new_features[0])
    else:
        exog_feature = data_for_prediction.features
        new_exog_values = \
            exog_data.features[forecast_step * forecast_length:
                               ((forecast_step + 1) * forecast_length)]
        new_features = np.append(exog_feature, new_exog_values)
        new_part_len = len(new_features)

    # add predicted time series to features for next prediction
    predicted_ts = np.append(data_for_prediction.target, last_prediction)

    # cut the prediction if it is too long (relevant for the last forecast step)
    predicted_ts = predicted_ts[0:new_part_len]
    # new_features.append(predicted_ts)
    data_for_prediction.target = predicted_ts
    data_for_prediction.features = np.stack(np.asarray(new_features)).T

    return data_for_prediction
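
The step-wise slicing of the exogenous series can be checked in isolation; in the sketch below forecast_length and the series values are made up, but the indexing mirrors the function above:

import numpy as np

forecast_length = 2
exog_known = np.array([1.0, 2.0, 3.0, 4.0])    # exogenous values already available as features
exog_future = np.array([5.0, 6.0, 7.0, 8.0])   # exogenous values covering the forecast horizon

for forecast_step in range(2):
    # take the forecast_length exogenous values that belong to this step
    new_exog_values = exog_future[forecast_step * forecast_length:
                                  (forecast_step + 1) * forecast_length]
    exog_known = np.append(exog_known, new_exog_values)

print(exog_known)  # [1. 2. 3. 4. 5. 6. 7. 8.]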