def test_counterfactual_unit_selection():
    """Check that CounterfactualUnitSelector beats the random allocation.

    A two-arm uplift dataset is generated and split 80/20.  The realised
    value of every row is computed from the observed conversions and the
    per-arm costs; its mean over the test split is the random-allocation
    baseline.  The selector is fitted on the train split and scored on the
    test split.  The test passes when the mean realised value of the test
    rows whose actual arm matches the selector's recommendation exceeds
    the baseline.
    """
    df, _ = make_uplift_classification(
        n_samples=2000, treatment_name=['control', 'treatment'])
    # ``map`` produces an integer column directly; ``replace`` on an object
    # column relies on implicit downcasting, which newer pandas deprecates.
    df['treatment_numeric'] = df['treatment_group_key'].map({
        'control': 0,
        'treatment': 1
    })
    df_train, df_test = train_test_split(df,
                                         test_size=0.2,
                                         random_state=RANDOM_SEED)

    test_idx = df_test.index

    conversion_cost_dict = {'control': 0, 'treatment': 2.5}
    impression_cost_dict = {'control': 0, 'treatment': 0}

    cc_array, ic_array, conditions = get_treatment_costs(
        treatment=df['treatment_group_key'],
        control_name='control',
        cc_dict=conversion_cost_dict,
        ic_dict=impression_cost_dict)
    conversion_value_array = np.full(df.shape[0], 20)

    actual_value = get_actual_value(treatment=df['treatment_group_key'],
                                    observed_outcome=df['conversion'],
                                    conversion_value=conversion_value_array,
                                    conditions=conditions,
                                    conversion_cost=cc_array,
                                    impression_cost=ic_array)

    test_actual_value = actual_value.loc[test_idx]
    random_allocation_value = test_actual_value.mean()

    # NOTE(review): payoffs look derived from the conversion value (20) and
    # the treatment conversion cost (2.5), e.g. 17.5 = 20 - 2.5 -- confirm.
    nevertaker_payoff = 0
    alwaystaker_payoff = -2.5
    complier_payoff = 17.5
    defier_payoff = -20

    cus = CounterfactualUnitSelector(learner=LogisticRegressionCV(),
                                     nevertaker_payoff=nevertaker_payoff,
                                     alwaystaker_payoff=alwaystaker_payoff,
                                     complier_payoff=complier_payoff,
                                     defier_payoff=defier_payoff)

    # ``columns=`` replaces the positional ``axis`` argument (``drop(..., 1)``),
    # which pandas no longer accepts.
    cus.fit(data=df_train.drop(columns='treatment_group_key'),
            treatment='treatment_numeric',
            outcome='conversion')

    cus_pred = cus.predict(data=df_test.drop(columns='treatment_group_key'),
                           treatment='treatment_numeric',
                           outcome='conversion')

    # Recommend treatment (1) wherever the predicted value is positive.
    best_cus = np.where(cus_pred > 0, 1, 0)
    actual_is_cus = df_test['treatment_numeric'] == best_cus.ravel()
    cus_value = test_actual_value[actual_is_cus].mean()

    assert cus_value > random_allocation_value
Ejemplo n.º 2
0
    def test_uplift_random_forest_smoke(self):
        """Smoke-test uplift predictions for three divergence metrics.

        Generates a two-arm uplift dataset, loads it into H2O with the
        treatment and response columns converted to factors, and calls
        ``uplift_train_predict`` once each for "KL", "euclidean" and
        "chi_squared".  The mean of the ``uplift_predict`` column is
        printed for each run and asserted to lie in a fixed expected range.
        """
        from causalml.dataset import make_uplift_classification
        n_samples = 10000
        seed = 12345

        h2o.init(strict_version_check=False)
        # Data generation
        # Generate dataset with 50 features, treatment/control flag feature
        # and outcome feature (In this case "conversion")
        train, x_names = make_uplift_classification(
            n_samples=n_samples,  # n_samples*2 rows will be generated
            # (#n_samples for each treatment_name)
            treatment_name=['control', 'treatment'],
            n_classification_features=50,
            # Dataset contains only features valid for modeling
            # Do not confuse model with irrelevant or redundant features
            n_classification_informative=50,
            random_seed=seed)

        treatment_column = "treatment_group_key"
        response_column = "conversion"

        train_h2o = h2o.H2OFrame(train)
        train_h2o[treatment_column] = train_h2o[treatment_column].asfactor()
        train_h2o[response_column] = train_h2o[response_column].asfactor()

        def mean_uplift(metric):
            # Run one model for ``metric`` and reduce its predictions to the
            # scalar mean of the "uplift_predict" column.
            prediction = uplift_train_predict(metric, x_names,
                                              treatment_column,
                                              response_column, train_h2o,
                                              seed)
            return prediction.mean().as_data_frame()["uplift_predict"][0]

        # Each mean is computed once and reused for printing, the range
        # check and the failure message, instead of re-querying H2O each time.
        mean_kl = mean_uplift("KL")
        mean_euc = mean_uplift("euclidean")
        mean_chi = mean_uplift("chi_squared")

        print("KL:" + str(mean_kl))
        print("EUC:" + str(mean_euc))
        print("CHI:" + str(mean_chi))

        assert 0.007 < mean_kl < 0.008, \
            "Not expected output: Mean uplift is suspiciously different. " \
            + str(mean_kl)

        assert 0.0075 < mean_euc < 0.0085, \
            "Not expected output: Mean uplift is suspiciously different." \
            + str(mean_euc)

        assert 0.01 < mean_chi < 0.02, \
            "Not expected output: Mean uplift is suspiciously different." \
            + str(mean_chi)
Ejemplo n.º 3
0
    def _generate_data():
        """Return an uplift classification dataset built with a fixed seed.

        Reads ``generated`` from the enclosing scope (not visible here).
        When it is falsy, NumPy's global RNG is seeded with ``RANDOM_SEED``
        and the result of ``make_uplift_classification`` is returned.
        """
        # NOTE(review): ``data`` is assigned inside this function, so it is a
        # local name.  If ``generated`` is ever truthy the assignment is
        # skipped and ``return data`` raises UnboundLocalError.  Nothing in
        # this function sets ``generated``, so no caching happens here.
        if not generated:
            np.random.seed(RANDOM_SEED)
            data = make_uplift_classification(n_samples=N_SAMPLE,
                                              treatment_name=TREATMENT_NAMES,
                                              y_name=CONVERSION,
                                              random_seed=RANDOM_SEED)

        return data
Ejemplo n.º 4
0
def test_counterfactual_value_optimization():
    """Check that CounterfactualValueEstimator beats the random allocation.

    A three-arm uplift dataset is generated and split 80/20.  The realised
    value of each row is computed from observed conversions and per-arm
    costs; its mean over the test split is the baseline.  A T-learner
    supplies treatment-effect estimates and a logistic regression on
    features plus arm dummies supplies conversion probabilities.  The test
    passes when the mean realised value of test rows whose actual arm
    equals the estimator's best arm exceeds the baseline.
    """
    treatment_col = 'treatment_group_key'
    outcome_col = 'conversion'

    frame, feature_names = make_uplift_classification(
        n_samples=2000,
        treatment_name=['control', 'treatment1', 'treatment2'])
    train_frame, test_frame = train_test_split(
        frame, test_size=0.2, random_state=RANDOM_SEED)
    train_rows, test_rows = train_frame.index, test_frame.index

    # Per-row conversion / impression costs and the ordered arm names.
    cc_costs, ic_costs, arm_names = get_treatment_costs(
        treatment=frame[treatment_col],
        control_name='control',
        cc_dict={'control': 0, 'treatment1': 2.5, 'treatment2': 5},
        ic_dict={'control': 0, 'treatment1': 0, 'treatment2': 0.02})
    conv_values = np.full(frame.shape[0], 20)

    realized_value = get_actual_value(
        treatment=frame[treatment_col],
        observed_outcome=frame[outcome_col],
        conversion_value=conv_values,
        conditions=arm_names,
        conversion_cost=cc_costs,
        impression_cost=ic_costs)
    test_realized = realized_value.loc[test_rows]
    random_allocation_value = test_realized.mean()

    # Treatment-effect estimates for the test rows.
    t_learner = BaseTClassifier(learner=LogisticRegression(),
                                control_name='control')
    t_learner.fit(train_frame[feature_names].values,
                  train_frame[treatment_col],
                  train_frame[outcome_col])
    cate_hat = t_learner.predict(test_frame[feature_names].values)

    # Conversion-probability model on features plus one-hot arm indicators.
    outcome_model = LogisticRegression()
    design = np.c_[frame[feature_names], pd.get_dummies(frame[treatment_col])]
    outcome_model.fit(design[train_rows], train_frame[outcome_col])
    p_convert = outcome_model.predict_proba(design[test_rows])[:, 1]

    estimator = CounterfactualValueEstimator(
        treatment=test_frame[treatment_col],
        control_name='control',
        treatment_names=arm_names[1:],
        y_proba=p_convert,
        cate=cate_hat,
        value=conv_values[test_rows],
        conversion_cost=cc_costs[test_rows],
        impression_cost=ic_costs[test_rows])

    # Map the best-arm indices back to arm names and keep only the test
    # rows whose actual arm equals the recommendation.
    recommended = [arm_names[i] for i in estimator.predict_best()]
    followed = frame.loc[test_rows, treatment_col] == recommended
    cve_value = test_realized[followed].mean()

    assert cve_value > random_allocation_value
Ejemplo n.º 5
0
    def test_causalml_make_uplift_classification_smoke(self):
        """Smoke-test that causalml imports and generates a dataset.

        Prints the installed causalml version, then generates a small
        two-arm uplift dataset and asserts that the returned frame is not
        empty and that the returned feature-name list is truthy.
        """
        import causalml
        from causalml.dataset import make_uplift_classification

        print("Causalml library smoke test")
        print("Version of causalml: {0}".format(causalml.__version__))

        generation_kwargs = dict(
            n_samples=500,
            treatment_name=['control', 'treatment'],
            n_classification_features=10,
            n_classification_informative=10,
            random_seed=12345,
        )
        train, x_names = make_uplift_classification(**generation_kwargs)

        assert not train.empty
        assert x_names