Ejemplo n.º 1
0
def test_time_series_iterator_and_multiprocessed_inference():
    """Fit ``TabularAutoML`` with a ``TimeSeriesIterator`` and predict from a CSV file.

    The training part is passed to ``fit_predict`` as a dict of arrays together
    with a custom ``cv_iter``. The hold-out part is written to a temporary CSV
    file, scored with ``batch_size=100`` and ``n_jobs=4``, and the file is
    removed afterwards. ROC-AUC of the out-of-fold and hold-out predictions is
    logged at DEBUG level.
    """
    import os  # local import: only used to clean up the temporary CSV below

    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s',
                        level=logging.DEBUG)

    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')

    # turn day offsets into date strings relative to 2018-01-01
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') +
        data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    data['EMP_DATE'] = (np.datetime64('2018-01-01') +
                        np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
                            np.dtype('timedelta64[D]'))).astype(str)

    data['report_dt'] = np.datetime64('2018-01-01')

    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

    train, test = train_test_split(data, test_size=2000, random_state=42)
    # create time series iterator that is passed as cv_func
    # NOTE(review): recent pandas releases may reject the unit-less
    # ``np.datetime64`` dtype here - confirm against the pinned pandas version.
    cv_iter = TimeSeriesIterator(train['EMP_DATE'].astype(np.datetime64),
                                 n_splits=5,
                                 sorted_kfold=False)

    # train dataset may be passed as dict of np.ndarray
    train = {
        'data': train[['AMT_CREDIT', 'AMT_ANNUITY']].values,
        'target': train['TARGET'].values
    }

    task = Task('binary', )

    automl = TabularAutoML(
        task=task,
        timeout=200,
    )
    oof_pred = automl.fit_predict(train,
                                  train_features=['AMT_CREDIT', 'AMT_ANNUITY'],
                                  cv_iter=cv_iter)
    # prediction can be made on file by
    test_file = 'temp_test_data.csv'
    test.to_csv(test_file, index=False)
    try:
        test_pred = automl.predict(test_file, batch_size=100, n_jobs=4)
    finally:
        # never leave the temporary file behind, even when inference fails
        if os.path.exists(test_file):
            os.remove(test_file)

    logging.debug('Check scores...')
    oof_prediction = oof_pred.data[:, 0]
    # score only the rows that actually received an out-of-fold prediction
    not_empty = np.logical_not(np.isnan(oof_prediction))
    logging.debug('OOF score: {}'.format(
        roc_auc_score(train['target'][not_empty], oof_prediction[not_empty])))
    logging.debug('TEST score: {}'.format(
        roc_auc_score(test['TARGET'].values, test_pred.data[:, 0])))
Ejemplo n.º 2
0
def test_default_tabular(sampled_app_train_test, sampled_app_roles,
                         binary_task):
    """Run ``TabularAutoML`` with default settings and print ROC-AUC scores.

    Args:
        sampled_app_train_test: Pair ``(train, test)`` of datasets.
        sampled_app_roles: Roles passed to ``fit_predict``.
        binary_task: Task passed to ``TabularAutoML``.

    """
    train_part, holdout_part = sampled_app_train_test

    model = TabularAutoML(task=binary_task)
    oof = model.fit_predict(train_part, roles=sampled_app_roles, verbose=2)
    holdout_pred = model.predict(holdout_part)

    # report the out-of-fold score first, then the hold-out score
    oof_auc = roc_auc_score(train_part['TARGET'].values, oof.data[:, 0])
    print("Score for out-of-fold predictions: {}".format(oof_auc))

    holdout_auc = roc_auc_score(holdout_part['TARGET'].values,
                                holdout_pred.data[:, 0])
    print("Score for hold-out: {}".format(holdout_auc))
Ejemplo n.º 3
0

# Read the sampled application data and make a stratified 80/20 split on TARGET.
data = pd.read_csv("./data/sampled_app_train.csv")
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    stratify=data["TARGET"],
    random_state=42,
)


def sample(optimization_search_space, trial, suggested_params):
    """Draw a conditional search space for one tuning trial.

    Args:
        optimization_search_space: Not used by this sampler.
        trial: Object exposing ``suggest_uniform(name, low=..., high=...)``.
        suggested_params: Base parameters; a shallow copy is extended, the
            argument itself is left untouched.

    Returns:
        Copy of ``suggested_params`` with ``feature_fraction`` and
        ``min_sum_hessian_in_leaf`` set from the trial.

    """
    params = copy.copy(suggested_params)

    fraction = trial.suggest_uniform("feature_fraction", low=0.5, high=1.0)
    params["feature_fraction"] = fraction

    # the range of the second parameter depends on the first draw
    if fraction > 0.7:
        hessian = trial.suggest_uniform("min_sum_hessian_in_leaf", low=0.5, high=1)
    else:
        hessian = trial.suggest_uniform("min_sum_hessian_in_leaf", low=0, high=0.5)
    params["min_sum_hessian_in_leaf"] = hessian

    return params


# Fit AutoML, letting the LightGBM tuner use the custom `sample` search space.
automl = TabularAutoML(task=Task("binary"), lgb_params={"optimization_search_space": sample})
oof_predictions = automl.fit_predict(
    train_data,
    roles={"target": "TARGET", "drop": ["SK_ID_CURR"]},
)
te_pred = automl.predict(test_data)

# Report ROC-AUC for the out-of-fold and the hold-out predictions.
print(
    "Score for out-of-fold predictions: {}".format(
        roc_auc_score(train_data["TARGET"].values, oof_predictions.data[:, 0])
    )
)
print(
    "Score for hold-out: {}".format(
        roc_auc_score(test_data["TARGET"].values, te_pred.data[:, 0])
    )
)
Ejemplo n.º 4
0
def test_tabular_automl_preset_without_params():
    """Fit ``TabularAutoML`` with default parameters and check a pickle round trip.

    The model is fitted on the sampled application data, OOF and hold-out
    ROC-AUC scores are printed, then the fitted model is pickled to
    ``automl.pickle``, loaded back and used for prediction again. The pickle
    file is removed even when one of those steps fails.
    """
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s',
                        level=logging.DEBUG)

    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')

    # turn day offsets into date strings relative to 2018-01-01
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') +
        data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    data['EMP_DATE'] = (np.datetime64('2018-01-01') +
                        np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
                            np.dtype('timedelta64[D]'))).astype(str)

    data['report_dt'] = np.datetime64('2018-01-01')

    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

    train, test = train_test_split(data, test_size=2000, random_state=42)

    roles = {
        'target':
        'TARGET',
        DatetimeRole(base_date=True, seasonality=(), base_feats=False):
        'report_dt',
    }

    task = Task('binary', )

    automl = TabularAutoML(
        task=task,
        timeout=3600,
    )
    oof_pred = automl.fit_predict(train, roles=roles)
    test_pred = automl.predict(test)

    # keep only rows that have at least one non-NaN out-of-fold prediction
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    logging.debug('Check scores...')
    print('OOF score: {}'.format(
        roc_auc_score(train[roles['target']].values[not_nan],
                      oof_pred.data[not_nan][:, 0])))
    print('TEST score: {}'.format(
        roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))

    pickle_path = 'automl.pickle'
    try:
        logging.debug('Pickle automl')
        with open(pickle_path, 'wb') as f:
            pickle.dump(automl, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            automl = pickle.load(f)

        logging.debug('Predict loaded automl')
        test_pred = automl.predict(test)
        logging.debug('TEST score, loaded: {}'.format(
            roc_auc_score(test['TARGET'].values, test_pred.data[:, 0])))
    finally:
        # clean up the artifact on failure as well as on success
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
Ejemplo n.º 5
0
    task=task,
    timeout=600,
    general_params={
        "use_algos": [
            [
                "linear_l2",
                "lgb",
            ],
            ["linear_l2", "lgb"],
        ],
        "nested_cv": True,
        "skip_conn": True,
    },
    nested_cv_params={
        "cv": 5,
        "n_folds": None
    },
)

# Fit on the training part and score the hold-out part.
oof_pred = automl.fit_predict(train, roles=roles)
test_pred = automl.predict(test)

# Rows that have at least one non-NaN out-of-fold prediction.
not_nan = np.any(np.logical_not(np.isnan(oof_pred.data)), axis=1)

print(
    "OOF score: {}".format(
        roc_auc_score(train[roles["target"]].values[not_nan], oof_pred.data[not_nan][:, 0])
    )
)
print(
    "TEST score: {}".format(
        roc_auc_score(test[roles["target"]].values, test_pred.data[:, 0])
    )
)
Ejemplo n.º 6
0
# The training set is handed over as a dict of np.ndarray instead of a frame.
train = {
    "data": train[["AMT_CREDIT", "AMT_ANNUITY"]].values,
    "target": train["TARGET"].values,
}

task = Task("binary")

automl = TabularAutoML(task=task, timeout=200)
oof_pred = automl.fit_predict(
    train,
    train_features=["AMT_CREDIT", "AMT_ANNUITY"],
    cv_iter=cv_iter,
)

# Inference can also be run straight from a CSV file on disk.
test.to_csv("temp_test_data.csv", index=False)
test_pred = automl.predict("temp_test_data.csv", batch_size=100, n_jobs=4)

print("Check scores...")
oof_prediction = oof_pred.data[:, 0]
# Score only the rows that received an out-of-fold prediction.
not_empty = ~np.isnan(oof_prediction)

print(
    "OOF score: {}".format(
        roc_auc_score(train["target"][not_empty], oof_prediction[not_empty])
    )
)
print(
    "TEST score: {}".format(
        roc_auc_score(test["TARGET"].values, test_pred.data[:, 0])
    )
)
Ejemplo n.º 7
0
class RLearner(MetaLearner):
    r"""RLearner

    m(x) - the conditional mean outcome
    e(x) - the propensity score
    tau(x) - the treatment effect

    .. math::
        \tau(\cdot) = argmin_{\tau} \sum_{i} \Big[ (Y_i - m(X_i)) - (W_i - e(X_i))\tau(X_i) \Big]^2

    """

    # Small constant added to the residualized treatment in `_fit_effect_learner`
    # before that value is used as a divisor.
    _epsi = 10 ** -5

    def __init__(
        self,
        propensity_learner: Optional[AutoML] = None,
        mean_outcome_learner: Optional[AutoML] = None,
        effect_learner: Optional[AutoML] = None,
        base_task: Optional[Task] = Task("binary"),
        timeout: Optional[int] = None,
        cpu_limit: int = 4,
        gpu_ids: Optional[str] = "all",
    ):
        """
        Args:
            propensity_learner: AutoML model, if `None` then will be used model by default (task must be 'binary')
            mean_outcome_learner: AutoML model, if `None` then will be used model by default
            effect_learner: AutoML model, if `None` then will be used model by default (task must be 'reg')
            base_task: Task of the default mean outcome learner; used only when `mean_outcome_learner` is `None`.
            timeout: Timeout. When no learner is supplied, each default learner gets a third of it.
            cpu_limit: CPU limit that is passed to each automl.
            gpu_ids: GPU IDs that are passed to each automl.

        Raises:
            RuntimeError: If `propensity_learner` is not a 'binary' task, if `effect_learner`
                is not a 'reg' task, or if both `mean_outcome_learner` and `base_task` are `None`.

        """
        if propensity_learner is not None and self._get_task(propensity_learner).name != "binary":
            raise RuntimeError("Task of 'propensity_learner' must be 'binary'")

        if mean_outcome_learner is None and base_task is None:
            raise RuntimeError("Must specify 'mean_outcome_learner' or base_task")

        if effect_learner is not None and self._get_task(effect_learner).name != "reg":
            raise RuntimeError("Task of effect_learner must be 'reg'")

        super().__init__(base_task, timeout, cpu_limit, gpu_ids)

        self.propensity_learner: AutoML
        self.mean_outcome_learner: AutoML
        self.effect_learner: AutoML

        # The time budget is split between the three default learners only when
        # none of them was supplied by the caller.
        no_learners = (propensity_learner is None) and (mean_outcome_learner is None) and (effect_learner is None)
        tabular_timeout = timeout / 3 if no_learners and timeout is not None else None

        if propensity_learner is None:
            self.propensity_learner = TabularAutoML(task=Task("binary"), timeout=tabular_timeout)
        else:
            self.propensity_learner = propensity_learner

        if mean_outcome_learner is not None:
            self.mean_outcome_learner = mean_outcome_learner
            # a user-supplied learner overrides the task given as `base_task`
            self.base_task = self._get_task(mean_outcome_learner)
        elif base_task is not None:
            self.mean_outcome_learner = TabularAutoML(task=base_task, timeout=tabular_timeout)

        if effect_learner is None:
            self.effect_learner = TabularAutoML(task=Task("reg"), timeout=tabular_timeout)
        else:
            self.effect_learner = effect_learner

    def _fit(self, train_data: DataFrame, roles: Dict, verbose: int = 0):
        """Fit meta-learner

        Fits the propensity learner, then the mean outcome learner, then the
        effect learner; the timer is checked between the stages.

        Args:
            train_data: Dataset to train
            roles: Roles dict with 'treatment' roles
            verbose: Verbose

        """
        propensity_pred = self._fit_predict_propensity_learner(train_data, roles, verbose)
        self._check_timer()
        mean_outcome_pred = self._fit_predict_mean_outcome_learner(train_data, roles, verbose)
        self._check_timer()
        self._fit_effect_learner(train_data, roles, propensity_pred, mean_outcome_pred, verbose)

    def _predict(self, data: Any) -> Tuple[np.ndarray, None, None]:
        """Predict treatment effects

        Args:
            data: Dataset to perform inference.

        Returns:
            treatment_effect: Predictions of treatment effects
            None: Plug
            None: Plug

        """
        return self.effect_learner.predict(data).data.ravel(), None, None

    def _fit_predict_propensity_learner(self, train_data: DataFrame, roles: Dict, verbose: int = 0):
        """Fit propensity score

        The treatment column becomes the target and the original target column
        is dropped from a copy of the data.

        Args:
            train_data: Dataset to train
            roles: Roles dict with 'treatment' roles
            verbose: Verbose

        Returns:
            Flattened predictions returned by `fit_predict` of the propensity learner.

        """
        propensity_roles = copy.deepcopy(roles)

        target_role, target_col = _get_target_role(roles)
        propensity_roles.pop(target_role)

        treatment_role, treatment_col = _get_treatment_role(roles)
        propensity_roles.pop(treatment_role)
        propensity_roles["target"] = treatment_col

        train_cp = train_data.copy()
        train_cp.drop(target_col, axis=1, inplace=True)

        propensity_pred = self.propensity_learner.fit_predict(train_cp, propensity_roles, verbose=verbose).data.ravel()

        return propensity_pred

    def _fit_predict_mean_outcome_learner(self, train_data: DataFrame, roles: Dict, verbose: int = 0):
        """Fit mean outcome

        The treatment column and its role are removed, so the outcome is
        modelled without the treatment.

        Args:
            train_data: Dataset to train
            roles: Roles dict with 'treatment' roles
            verbose: Verbose

        Returns:
            Flattened predictions returned by `fit_predict` of the mean outcome learner.

        """
        outcome_roles = copy.deepcopy(roles)

        treatment_role, treatment_col = _get_treatment_role(roles)
        outcome_roles.pop(treatment_role)

        train_cp = train_data.copy()
        train_cp.drop(treatment_col, axis=1, inplace=True)

        # forward `verbose` like the propensity and effect learners do
        mean_outcome_pred = self.mean_outcome_learner.fit_predict(train_cp, outcome_roles, verbose=verbose).data.ravel()

        return mean_outcome_pred

    def _fit_effect_learner(
        self,
        train_data: DataFrame,
        roles: Dict,
        propensity_pred: np.ndarray,
        mean_outcome_pred: np.ndarray,
        verbose: int = 0,
    ):
        """Fit treatment effects

        The target is replaced by the outcome residual divided by the treatment
        residual, and the squared treatment residual is used as sample weight.

        Args:
            train_data: Dataset to train
            roles: Roles dict with 'treatment' roles
            propensity_pred: oof-prediction of propensity_learner
            mean_outcome_pred: oof-prediction of mean_outcome_learner
            verbose: Verbose

        """
        effect_roles = copy.deepcopy(roles)

        _, target_col = _get_target_role(roles)
        train_target = train_data[target_col]

        treatment_role, treatment_col = _get_treatment_role(roles)
        train_treatment = train_data[treatment_col]
        effect_roles.pop(treatment_role)

        # treatment residual W - e(X), shifted by `_epsi`
        weights = train_treatment - propensity_pred + self._epsi

        train_cp = train_data.copy()
        train_cp.drop(treatment_col, axis=1, inplace=True)
        train_cp[target_col] = (train_target - mean_outcome_pred) / weights
        train_cp["__WEIGHTS__"] = weights ** 2
        effect_roles["weights"] = "__WEIGHTS__"

        # rows whose transformed target is null cannot be used for training
        train_cp = train_cp[train_cp[target_col].notnull()]

        self.effect_learner.fit_predict(train_cp, effect_roles, verbose=verbose)