예제 #1
0
def test_BaseDRLearner(generate_regression_data):
    """Check the ATE and CATE estimates of ``BaseDRLearner``.

    The learner is built from an ``XGBRegressor`` base learner and a
    ``LinearRegression`` treatment-effect learner. The test asserts that
    the ATE point estimate lies inside its own bounds and within
    ``ERROR_THRESHOLD`` of the mean true effect, then that the summed
    cumulative gain of the CATE predictions exceeds that of the ``Random``
    column returned by ``get_cumgain``.
    """
    y, X, treatment, tau, b, e = generate_regression_data()

    learner = BaseDRLearner(learner=XGBRegressor(),
                            treatment_effect_learner=LinearRegression())

    # check the accuracy of the ATE estimation
    ate_p, lb, ub = learner.estimate_ate(X=X, treatment=treatment, y=y, p=e)
    assert (ate_p >= lb) and (ate_p <= ub)
    assert ape(tau.mean(), ate_p) < ERROR_THRESHOLD

    # check the accuracy of the CATE estimation with the bootstrap CI
    cate_p, _, _ = learner.fit_predict(X=X,
                                       treatment=treatment,
                                       y=y,
                                       p=e,
                                       return_ci=True,
                                       n_bootstraps=10)

    auuc_metrics = pd.DataFrame({
        'cate_p': cate_p.flatten(),
        'W': treatment,
        'y': y,
        'treatment_effect_col': tau
    })

    # The true effect is stored under 'treatment_effect_col', so that is the
    # column name handed to get_cumgain. The previous value 'tau' matched no
    # column of auuc_metrics.
    # NOTE(review): presumably get_cumgain then scored the true-effect column
    # as if it were a model prediction -- confirm against causalml.metrics.
    cumgain = get_cumgain(auuc_metrics,
                          outcome_col='y',
                          treatment_col='W',
                          treatment_effect_col='treatment_effect_col')

    # Check if the cumulative gain when using the model's prediction is
    # higher than it would be under random targeting
    assert cumgain['cate_p'].sum() > cumgain['Random'].sum()
예제 #2
0
def test_BaseTClassifier(generate_classification_data):
    """Verify that ``BaseTClassifier`` predictions beat random targeting.

    A ``LogisticRegression``-based learner is fitted on 80% of the generated
    classification data and scored on the remaining 20%. The test passes when
    the summed cumulative gain of the predictions exceeds that of the
    ``Random`` column returned by ``get_cumgain``.
    """
    np.random.seed(RANDOM_SEED)

    data, feature_names = generate_classification_data()

    # Recode the treatment key as a binary flag: 0 for control, 1 otherwise.
    is_control = data['treatment_group_key'] == CONTROL_NAME
    data['treatment_group_key'] = np.where(is_control, 0, 1)

    train_df, test_df = train_test_split(data,
                                         test_size=0.2,
                                         random_state=RANDOM_SEED)

    model = BaseTClassifier(learner=LogisticRegression())
    model.fit(
        X=train_df[feature_names].values,
        treatment=train_df['treatment_group_key'].values,
        y=train_df[CONVERSION].values,
    )

    predicted_uplift = model.predict(
        X=test_df[feature_names].values,
        treatment=test_df['treatment_group_key'].values,
    )

    # Prediction, treatment flag, outcome and true effect, one row per unit.
    metrics_frame = pd.DataFrame({
        'tau_pred': predicted_uplift.flatten(),
        'W': test_df['treatment_group_key'].values,
        CONVERSION: test_df[CONVERSION].values,
        'treatment_effect_col': test_df['treatment_effect'].values,
    })

    gains = get_cumgain(metrics_frame,
                        outcome_col=CONVERSION,
                        treatment_col='W',
                        treatment_effect_col='treatment_effect_col')

    # The model's ranking must accumulate more gain than random targeting.
    model_gain = gains['tau_pred'].sum()
    random_gain = gains['Random'].sum()
    assert model_gain > random_gain
예제 #3
0
def test_BaseSClassifier(generate_classification_data):
    """Verify that ``BaseSClassifier`` predictions beat random targeting.

    An ``XGBClassifier``-based learner is fitted on the training split and
    its test-split predictions are passed to ``get_cumgain`` together with
    the treatment flag and the observed outcome (no true-effect column).
    """
    np.random.seed(RANDOM_SEED)

    data, feature_names = generate_classification_data()

    # Recode the treatment key as a binary flag: 0 for control, 1 otherwise.
    is_control = data['treatment_group_key'] == CONTROL_NAME
    data['treatment_group_key'] = np.where(is_control, 0, 1)

    train_df, test_df = train_test_split(data,
                                         test_size=0.2,
                                         random_state=RANDOM_SEED)

    model = BaseSClassifier(learner=XGBClassifier())
    model.fit(
        X=train_df[feature_names].values,
        treatment=train_df['treatment_group_key'].values,
        y=train_df[CONVERSION].values,
    )

    test_treatment = test_df['treatment_group_key'].values
    predictions = model.predict(X=test_df[feature_names].values,
                                treatment=test_treatment)

    # Stack prediction, treatment flag and outcome side by side as columns.
    stacked = np.c_[predictions, test_treatment, test_df[CONVERSION].values]
    metrics_frame = pd.DataFrame(stacked,
                                 columns=['y_pred', 'W', CONVERSION])

    gains = get_cumgain(metrics_frame,
                        outcome_col=CONVERSION,
                        treatment_col='W')

    # The model's ranking must accumulate more gain than random targeting.
    model_gain = gains['y_pred'].sum()
    random_gain = gains['Random'].sum()
    assert model_gain > random_gain
예제 #4
0
def test_drivlearner():
    """Check the ATE and CATE estimates of ``BaseDRIVLearner``.

    The data are simulated inline: uniform covariates, a random binary
    assignment, a treatment drawn with probability ``e`` (forced to zero for
    unassigned units), and an outcome built from the baseline ``b``, the
    effect ``tau`` and Gaussian noise. The test asserts that the ATE point
    estimate lies inside its own bounds and within ``ERROR_THRESHOLD`` of the
    mean true effect, then that the summed cumulative gain of the CATE
    predictions exceeds that of the ``Random`` column from ``get_cumgain``.
    """
    np.random.seed(RANDOM_SEED)
    n = 1000
    p = 8
    sigma = 1.0

    X = np.random.uniform(size=n * p).reshape((n, -1))
    b = np.sin(np.pi * X[:, 0] *
               X[:, 1]) + 2 * (X[:, 2] - 0.5)**2 + X[:, 3] + 0.5 * X[:, 4]
    assignment = (np.random.uniform(size=n) > 0.5).astype(int)
    eta = 0.1
    # Clip the raw score into [eta, 1 - eta].
    e_raw = np.maximum(
        np.repeat(eta, n),
        np.minimum(np.sin(np.pi * X[:, 0] * X[:, 1]), np.repeat(1 - eta, n)))
    # Units that were not assigned get a zero probability of treatment.
    e = e_raw.copy()
    e[assignment == 0] = 0
    tau = (X[:, 0] + X[:, 1]) / 2

    w = np.random.binomial(1, e, size=n)
    treatment = w
    y = b + (w - 0.5) * tau + sigma * np.random.normal(size=n)

    learner = BaseDRIVLearner(learner=XGBRegressor(),
                              treatment_effect_learner=LinearRegression())

    # check the accuracy of the ATE estimation
    ate_p, lb, ub = learner.estimate_ate(X=X,
                                         assignment=assignment,
                                         treatment=treatment,
                                         y=y,
                                         p=(np.ones(n) * 1e-6, e_raw))
    assert (ate_p >= lb) and (ate_p <= ub)
    assert ape(tau.mean(), ate_p) < ERROR_THRESHOLD

    # check the accuracy of the CATE estimation with the bootstrap CI
    cate_p, _, _ = learner.fit_predict(X=X,
                                       assignment=assignment,
                                       treatment=treatment,
                                       y=y,
                                       p=(np.ones(n) * 1e-6, e_raw),
                                       return_ci=True,
                                       n_bootstraps=10)

    auuc_metrics = pd.DataFrame({
        'cate_p': cate_p.flatten(),
        'W': treatment,
        'y': y,
        'treatment_effect_col': tau
    })

    # The true effect is stored under 'treatment_effect_col', so that is the
    # column name handed to get_cumgain. The previous value 'tau' matched no
    # column of auuc_metrics.
    # NOTE(review): presumably get_cumgain then scored the true-effect column
    # as if it were a model prediction -- confirm against causalml.metrics.
    cumgain = get_cumgain(auuc_metrics,
                          outcome_col='y',
                          treatment_col='W',
                          treatment_effect_col='treatment_effect_col')

    # Check if the cumulative gain when using the model's prediction is
    # higher than it would be under random targeting
    assert cumgain['cate_p'].sum() > cumgain['Random'].sum()
예제 #5
0
파일: synthetic.py 프로젝트: uber/causalml
def get_synthetic_auuc(
    synthetic_preds,
    drop_learners=None,
    outcome_col="y",
    treatment_col="w",
    treatment_effect_col="tau",
    plot=True,
):
    """Get auuc values for cumulative gains of model estimates in quantiles.

    For details, reference get_cumgain() and plot_gain()
    Args:
        synthetic_preds (dict): dictionary of predictions generated by get_synthetic_preds()
        or get_synthetic_preds_holdout()
        drop_learners (list, optional): names of learner columns to exclude from the
        evaluation; nothing is dropped when omitted
        outcome_col (str, optional): the key of the actual outcome in the generated data
        treatment_col (str, optional): the key of the treatment indicator (0 or 1) in the
        generated data
        treatment_effect_col (str, optional): the key of the true treatment effect in the
        generated data; it is only used when present
        plot (boolean,optional): plot the cumulative gain chart or not

    Returns:
        (pandas.DataFrame): auuc values by learner for cumulative gains of model estimates,
        with columns "Learner" and "cum_gain_auuc", sorted by descending auuc
    """
    if drop_learners is None:
        drop_learners = []

    # Shallow copy so that popping the generated data leaves the caller's
    # dictionary untouched.
    synthetic_preds_df = synthetic_preds.copy()
    generated_data = synthetic_preds_df.pop(KEY_GENERATED_DATA)
    synthetic_preds_df = pd.DataFrame(synthetic_preds_df)
    synthetic_preds_df = synthetic_preds_df.drop(drop_learners, axis=1)

    # The caller-supplied names only select entries of the generated data.
    # Inside the working frame the columns are always "y", "w" and "tau", so
    # every downstream call below must use these fixed names.
    synthetic_preds_df["y"] = generated_data[outcome_col]
    synthetic_preds_df["w"] = generated_data[treatment_col]
    if treatment_effect_col in generated_data.keys():
        synthetic_preds_df["tau"] = generated_data[treatment_effect_col]

    cumlift = get_cumgain(
        synthetic_preds_df,
        outcome_col="y",
        treatment_col="w",
        treatment_effect_col="tau",
    )
    auuc_df = pd.DataFrame(cumlift.columns)
    auuc_df.columns = ["Learner"]
    auuc_df["cum_gain_auuc"] = [
        auc(cumlift.index.values / 100, cumlift[learner].values)
        for learner in cumlift.columns
    ]
    auuc_df = auuc_df.sort_values("cum_gain_auuc", ascending=False)

    if plot:
        # Use the frame's own column names here as well; passing the
        # caller-supplied names would not match the frame whenever they
        # differ from the defaults.
        plot_gain(
            synthetic_preds_df,
            outcome_col="y",
            treatment_col="w",
            treatment_effect_col="tau",
        )

    return auuc_df
예제 #6
0
def test_BaseXClassifier(generate_classification_data):
    """Verify that ``BaseXClassifier`` predictions beat random targeting.

    Two constructor styles are exercised in turn: one naming all four
    learners and one naming a shared outcome learner and effect learner.
    Only the predictions of the second model are evaluated with
    ``get_cumgain``.
    """
    np.random.seed(RANDOM_SEED)

    data, feature_names = generate_classification_data()

    # Recode the treatment key as a binary flag: 0 for control, 1 otherwise.
    is_control = data['treatment_group_key'] == CONTROL_NAME
    data['treatment_group_key'] = np.where(is_control, 0, 1)

    # Propensity scores come from a logistic regression fitted on all rows,
    # before the train/test split.
    propensity_model = LogisticRegression()
    propensity_model.fit(X=data[feature_names].values,
                         y=data['treatment_group_key'].values)
    class_probabilities = propensity_model.predict_proba(
        data[feature_names].values)
    data['propensity_score'] = class_probabilities[:, 1]

    train_df, test_df = train_test_split(data,
                                         test_size=0.2,
                                         random_state=RANDOM_SEED)

    def fit_then_predict(model):
        # Fit on the training split, then predict on the test split using
        # the precomputed propensity scores.
        model.fit(X=train_df[feature_names].values,
                  treatment=train_df['treatment_group_key'].values,
                  y=train_df[CONVERSION].values)
        return model.predict(X=test_df[feature_names].values,
                             p=test_df['propensity_score'].values)

    # specify all 4 learners (the result is not evaluated further)
    fit_then_predict(
        BaseXClassifier(control_outcome_learner=XGBClassifier(),
                        control_effect_learner=XGBRegressor(),
                        treatment_outcome_learner=XGBClassifier(),
                        treatment_effect_learner=XGBRegressor()))

    # specify 2 learners
    predicted_uplift = fit_then_predict(
        BaseXClassifier(outcome_learner=XGBClassifier(),
                        effect_learner=XGBRegressor()))

    # calculate metrics
    metrics_frame = pd.DataFrame({
        'tau_pred': predicted_uplift.flatten(),
        'W': test_df['treatment_group_key'].values,
        CONVERSION: test_df[CONVERSION].values,
        'treatment_effect_col': test_df['treatment_effect'].values,
    })

    gains = get_cumgain(metrics_frame,
                        outcome_col=CONVERSION,
                        treatment_col='W',
                        treatment_effect_col='treatment_effect_col')

    # The model's ranking must accumulate more gain than random targeting.
    model_gain = gains['tau_pred'].sum()
    random_gain = gains['Random'].sum()
    assert model_gain > random_gain
예제 #7
0
def test_BaseRClassifier_with_sample_weights(generate_classification_data):
    """Verify ``BaseRClassifier`` beats random targeting when given sample weights.

    Each row receives a random integer weight of 1 or 2. The learner is
    fitted with those weights and with precomputed propensity scores, and
    its test-split predictions are evaluated with ``get_cumgain``.
    """
    np.random.seed(RANDOM_SEED)

    data, feature_names = generate_classification_data()

    # Recode the treatment key as a binary flag: 0 for control, 1 otherwise.
    is_control = data["treatment_group_key"] == CONTROL_NAME
    data["treatment_group_key"] = np.where(is_control, 0, 1)
    # randint's upper bound is exclusive, so the weights are 1 or 2.
    data["sample_weights"] = np.random.randint(low=1,
                                               high=3,
                                               size=data.shape[0])

    # Propensity scores come from a logistic regression fitted on all rows,
    # before the train/test split.
    propensity_model = LogisticRegression()
    propensity_model.fit(X=data[feature_names].values,
                         y=data["treatment_group_key"].values)
    class_probabilities = propensity_model.predict_proba(
        data[feature_names].values)
    data["propensity_score"] = class_probabilities[:, 1]

    train_df, test_df = train_test_split(data,
                                         test_size=0.2,
                                         random_state=RANDOM_SEED)

    model = BaseRClassifier(outcome_learner=XGBClassifier(),
                            effect_learner=XGBRegressor())
    model.fit(
        X=train_df[feature_names].values,
        p=train_df["propensity_score"].values,
        treatment=train_df["treatment_group_key"].values,
        y=train_df[CONVERSION].values,
        sample_weight=train_df["sample_weights"],
    )

    predicted_uplift = model.predict(X=test_df[feature_names].values)

    metrics_frame = pd.DataFrame({
        "tau_pred": predicted_uplift.flatten(),
        "W": test_df["treatment_group_key"].values,
        CONVERSION: test_df[CONVERSION].values,
        "treatment_effect_col": test_df["treatment_effect"].values,
    })

    gains = get_cumgain(
        metrics_frame,
        outcome_col=CONVERSION,
        treatment_col="W",
        treatment_effect_col="treatment_effect_col",
    )

    # The model's ranking must accumulate more gain than random targeting.
    model_gain = gains["tau_pred"].sum()
    random_gain = gains["Random"].sum()
    assert model_gain > random_gain
예제 #8
0
def test_BaseXLearner(generate_regression_data):
    """Check the ATE and CATE estimates of ``BaseXLearner``.

    Covers four things: the ATE estimate of an ``XGBRegressor``-based
    learner, that ``estimate_ate(..., pretrain=True)`` reproduces the same
    estimate and bounds, that the CATE predictions beat the ``Random``
    column from ``get_cumgain``, and the ATE estimate of a learner built
    with separate outcome and effect learners.
    """
    y, X, treatment, tau, b, e = generate_regression_data()

    learner = BaseXLearner(learner=XGBRegressor())

    # check the accuracy of the ATE estimation
    ate_p, lb, ub = learner.estimate_ate(X=X, treatment=treatment, y=y, p=e)
    assert (ate_p >= lb) and (ate_p <= ub)
    assert ape(tau.mean(), ate_p) < ERROR_THRESHOLD

    # check pre-train model
    ate_p_pt, lb_pt, ub_pt = learner.estimate_ate(X=X,
                                                  treatment=treatment,
                                                  y=y,
                                                  p=e,
                                                  pretrain=True)
    assert (ate_p_pt == ate_p) and (lb_pt == lb) and (ub_pt == ub)

    # check the accuracy of the CATE estimation with the bootstrap CI
    cate_p, _, _ = learner.fit_predict(X=X,
                                       treatment=treatment,
                                       y=y,
                                       p=e,
                                       return_ci=True,
                                       n_bootstraps=10)

    auuc_metrics = pd.DataFrame({
        "cate_p": cate_p.flatten(),
        "W": treatment,
        "y": y,
        "treatment_effect_col": tau,
    })

    # The true effect is stored under "treatment_effect_col", so that is the
    # column name handed to get_cumgain. The previous value "tau" matched no
    # column of auuc_metrics.
    # NOTE(review): presumably get_cumgain then scored the true-effect column
    # as if it were a model prediction -- confirm against causalml.metrics.
    cumgain = get_cumgain(auuc_metrics,
                          outcome_col="y",
                          treatment_col="W",
                          treatment_effect_col="treatment_effect_col")

    # Check if the cumulative gain when using the model's prediction is
    # higher than it would be under random targeting
    assert cumgain["cate_p"].sum() > cumgain["Random"].sum()

    # basic test of using outcome_learner and effect_learner
    learner = BaseXLearner(
        learner=XGBRegressor(),
        control_outcome_learner=RandomForestRegressor(),
        treatment_outcome_learner=RandomForestRegressor(),
        control_effect_learner=RandomForestRegressor(),
        treatment_effect_learner=RandomForestRegressor(),
    )
    # check the accuracy of the ATE estimation
    ate_p, lb, ub = learner.estimate_ate(X=X, treatment=treatment, y=y, p=e)
    assert (ate_p >= lb) and (ate_p <= ub)
    assert ape(tau.mean(), ate_p) < ERROR_THRESHOLD
예제 #9
0
def test_UpliftTreeClassifier(generate_classification_data):
    """Verify that ``UpliftTreeClassifier`` beats random targeting.

    Fitting and prediction run under ``cProfile``; the statistics, sorted
    by cumulative time, are written to ``UpliftTreeClassifier.prof`` in the
    working directory. The predictions are then evaluated on a synthetic
    population of test units that were either in the control group or
    received the treatment the model recommends.
    """
    data, feature_names = generate_classification_data()
    train_df, test_df = train_test_split(data,
                                         test_size=0.2,
                                         random_state=RANDOM_SEED)

    # Train the uplift tree classifier
    tree_model = UpliftTreeClassifier(control_name=TREATMENT_NAMES[0])

    profiler = cProfile.Profile(subcalls=True, builtins=True, timeunit=.001)
    profiler.enable()
    tree_model.fit(train_df[feature_names].values,
                   treatment=train_df['treatment_group_key'].values,
                   y=train_df[CONVERSION].values)

    # Only the fourth element of the full output is used below.
    _, _, _, uplift_scores = tree_model.predict(test_df[feature_names].values,
                                                full_output=True)
    profiler.disable()
    with open('UpliftTreeClassifier.prof', 'w') as prof_file:
        stats = pstats.Stats(profiler, stream=prof_file)
        stats.sort_stats('cumulative').print_stats()

    # Keep the treatment columns only.
    result = pd.DataFrame(uplift_scores).drop(columns=CONTROL_NAME)

    # Recommend control when every treatment score is negative, otherwise
    # the treatment with the highest score.
    all_negative = (result < 0).all(axis=1)
    best_treatment = np.where(all_negative, CONTROL_NAME,
                              result.idxmax(axis=1))

    # Create a synthetic population:

    # Indicator variables for whether a unit happened to have the
    # recommended treatment or was in the control group
    observed_group = test_df['treatment_group_key']
    actual_is_best = np.where(observed_group == best_treatment, 1, 0)
    actual_is_control = np.where(observed_group == CONTROL_NAME, 1, 0)

    synthetic = (actual_is_best == 1) | (actual_is_control == 1)
    synth = result[synthetic]

    # Add the evaluation columns, then drop the per-treatment score columns.
    with_metrics = synth.assign(
        is_treated=1 - actual_is_control[synthetic],
        conversion=test_df.loc[synthetic, CONVERSION].values,
        uplift_tree=synth.max(axis=1))
    metrics_frame = with_metrics.drop(columns=result.columns)

    gains = get_cumgain(metrics_frame,
                        outcome_col=CONVERSION,
                        treatment_col='is_treated')

    # The uplift tree must accumulate more gain than random targeting.
    model_gain = gains['uplift_tree'].sum()
    random_gain = gains['Random'].sum()
    assert model_gain > random_gain
예제 #10
0
def test_BaseRClassifier_with_sample_weights(generate_classification_data):
    """Verify ``BaseRClassifier`` and ``XGBRRegressor`` accept sample weights.

    Each row receives a random integer weight of 1 or 2. ``BaseRClassifier``
    is fitted with the weights and must beat the ``Random`` column from
    ``get_cumgain``. ``XGBRRegressor`` is then fitted the same way and must
    return one prediction per test row.
    """
    np.random.seed(RANDOM_SEED)

    data, feature_names = generate_classification_data()

    # Recode the treatment key as a binary flag: 0 for control, 1 otherwise.
    is_control = data['treatment_group_key'] == CONTROL_NAME
    data['treatment_group_key'] = np.where(is_control, 0, 1)
    # randint's upper bound is exclusive, so the weights are 1 or 2.
    data['sample_weights'] = np.random.randint(low=1,
                                               high=3,
                                               size=data.shape[0])

    # Propensity scores come from a logistic regression fitted on all rows,
    # before the train/test split.
    propensity_model = LogisticRegression()
    propensity_model.fit(X=data[feature_names].values,
                         y=data['treatment_group_key'].values)
    class_probabilities = propensity_model.predict_proba(
        data[feature_names].values)
    data['propensity_score'] = class_probabilities[:, 1]

    train_df, test_df = train_test_split(data,
                                         test_size=0.2,
                                         random_state=RANDOM_SEED)

    model = BaseRClassifier(outcome_learner=XGBClassifier(),
                            effect_learner=XGBRegressor())
    model.fit(
        X=train_df[feature_names].values,
        p=train_df['propensity_score'].values,
        treatment=train_df['treatment_group_key'].values,
        y=train_df[CONVERSION].values,
        sample_weight=train_df['sample_weights'],
    )

    predicted_uplift = model.predict(X=test_df[feature_names].values)

    metrics_frame = pd.DataFrame({
        'tau_pred': predicted_uplift.flatten(),
        'W': test_df['treatment_group_key'].values,
        CONVERSION: test_df[CONVERSION].values,
        'treatment_effect_col': test_df['treatment_effect'].values,
    })

    gains = get_cumgain(metrics_frame,
                        outcome_col=CONVERSION,
                        treatment_col='W',
                        treatment_effect_col='treatment_effect_col')

    # The model's ranking must accumulate more gain than random targeting.
    model_gain = gains['tau_pred'].sum()
    random_gain = gains['Random'].sum()
    assert model_gain > random_gain

    # XGBRRegressor must also produce treatment effect estimates when
    # sample_weight is passed: one prediction per test row.
    xgb_r_model = XGBRRegressor()
    xgb_r_model.fit(
        X=train_df[feature_names].values,
        p=train_df['propensity_score'].values,
        treatment=train_df['treatment_group_key'].values,
        y=train_df[CONVERSION].values,
        sample_weight=train_df['sample_weights'],
    )
    xgb_r_predictions = xgb_r_model.predict(X=test_df[feature_names].values)
    expected_rows = len(test_df['sample_weights'].values)
    assert len(xgb_r_predictions) == expected_rows
예제 #11
0
def test_UpliftRandomForestClassifier(generate_classification_data):
    """Verify that ``UpliftRandomForestClassifier`` beats random targeting.

    The forest is fitted on the training split. Its test-split predictions
    are evaluated on a synthetic population of units that were either in the
    control group or received the treatment the model recommends.
    """
    data, feature_names = generate_classification_data()
    train_df, test_df = train_test_split(data,
                                         test_size=0.2,
                                         random_state=RANDOM_SEED)

    # Train the uplift random forest classifier
    forest = UpliftRandomForestClassifier(min_samples_leaf=50,
                                          control_name=TREATMENT_NAMES[0])
    forest.fit(train_df[feature_names].values,
               treatment=train_df['treatment_group_key'].values,
               y=train_df[CONVERSION].values)

    uplift_scores = forest.predict(test_df[feature_names].values)
    result = pd.DataFrame(uplift_scores, columns=forest.classes_)

    # Recommend control when every score is negative, otherwise the column
    # with the highest score.
    all_negative = (result < 0).all(axis=1)
    best_treatment = np.where(all_negative,
                              CONTROL_NAME,
                              result.idxmax(axis=1))

    # Create a synthetic population:

    # Indicator variables for whether a unit happened to have the
    # recommended treatment or was in the control group
    observed_group = test_df['treatment_group_key']
    actual_is_best = np.where(observed_group == best_treatment, 1, 0)
    actual_is_control = np.where(observed_group == CONTROL_NAME, 1, 0)

    synthetic = (actual_is_best == 1) | (actual_is_control == 1)
    synth = result[synthetic]

    # Add the evaluation columns, then drop the per-class score columns.
    with_metrics = synth.assign(
        is_treated=1 - actual_is_control[synthetic],
        conversion=test_df.loc[synthetic, CONVERSION].values,
        uplift_tree=synth.max(axis=1),
    )
    metrics_frame = with_metrics.drop(columns=list(forest.classes_))

    gains = get_cumgain(metrics_frame,
                        outcome_col=CONVERSION,
                        treatment_col='is_treated',
                        steps=20)

    # The forest must accumulate more gain than random targeting.
    model_gain = gains['uplift_tree'].sum()
    random_gain = gains['Random'].sum()
    assert model_gain > random_gain
예제 #12
0
파일: test_cevae.py 프로젝트: uber/causalml
def test_CEVAE():
    """Check that ``CEVAE`` treatment-effect estimates beat random targeting.

    The model is fitted on data from ``simulate_hidden_confounder`` and its
    flattened predictions are compared against the ``Random`` column
    returned by ``get_cumgain``.
    """
    y, X, treatment, tau, b, e = simulate_hidden_confounder(n=10000,
                                                            p=5,
                                                            sigma=1.0,
                                                            adj=0.0)

    outcome_dist = "normal"
    latent_dim = 20
    hidden_dim = 200
    num_epochs = 50
    batch_size = 100
    learning_rate = 1e-3
    learning_rate_decay = 0.1

    cevae = CEVAE(
        outcome_dist=outcome_dist,
        latent_dim=latent_dim,
        hidden_dim=hidden_dim,
        num_epochs=num_epochs,
        batch_size=batch_size,
        learning_rate=learning_rate,
        learning_rate_decay=learning_rate_decay,
    )

    cevae.fit(
        X=torch.tensor(X, dtype=torch.float),
        treatment=torch.tensor(treatment, dtype=torch.float),
        y=torch.tensor(y, dtype=torch.float),
    )

    # check the accuracy of the individual treatment effect (ITE) estimates
    ite = cevae.predict(X).flatten()

    auuc_metrics = pd.DataFrame({
        "ite": ite,
        "W": treatment,
        "y": y,
        "treatment_effect_col": tau
    })

    # The true effect is stored under "treatment_effect_col", so that is the
    # column name handed to get_cumgain. The previous value "tau" matched no
    # column of auuc_metrics.
    # NOTE(review): presumably get_cumgain then scored the true-effect column
    # as if it were a model prediction -- confirm against causalml.metrics.
    cumgain = get_cumgain(auuc_metrics,
                          outcome_col="y",
                          treatment_col="W",
                          treatment_effect_col="treatment_effect_col")

    # Check if the cumulative gain when using the model's prediction is
    # higher than it would be under random targeting
    assert cumgain["ite"].sum() > cumgain["Random"].sum()
예제 #13
0
def test_BaseSClassifier(generate_classification_data):
    """Verify that ``BaseSClassifier`` predictions beat random targeting.

    An ``XGBClassifier``-based learner is fitted on the training split and
    its test-split predictions are evaluated with ``get_cumgain`` against
    the true treatment effect column.
    """
    np.random.seed(RANDOM_SEED)

    data, feature_names = generate_classification_data()

    # Recode the treatment key as a binary flag: 0 for control, 1 otherwise.
    is_control = data["treatment_group_key"] == CONTROL_NAME
    data["treatment_group_key"] = np.where(is_control, 0, 1)

    train_df, test_df = train_test_split(data,
                                         test_size=0.2,
                                         random_state=RANDOM_SEED)

    model = BaseSClassifier(learner=XGBClassifier())
    model.fit(
        X=train_df[feature_names].values,
        treatment=train_df["treatment_group_key"].values,
        y=train_df[CONVERSION].values,
    )

    predicted_uplift = model.predict(
        X=test_df[feature_names].values,
        treatment=test_df["treatment_group_key"].values,
    )

    metrics_frame = pd.DataFrame({
        "tau_pred": predicted_uplift.flatten(),
        "W": test_df["treatment_group_key"].values,
        CONVERSION: test_df[CONVERSION].values,
        "treatment_effect_col": test_df["treatment_effect"].values,
    })

    gains = get_cumgain(
        metrics_frame,
        outcome_col=CONVERSION,
        treatment_col="W",
        treatment_effect_col="treatment_effect_col",
    )

    # The model's ranking must accumulate more gain than random targeting.
    model_gain = gains["tau_pred"].sum()
    random_gain = gains["Random"].sum()
    assert model_gain > random_gain
예제 #14
0
def test_UpliftTreeClassifier(generate_classification_data):
    """Verify ``UpliftTreeClassifier`` gain and first-level node counts.

    Fitting and prediction run under ``cProfile``; the statistics, sorted
    by cumulative time, are written to ``UpliftTreeClassifier.prof`` in the
    working directory. The test checks that the predictions beat the
    ``Random`` column from ``get_cumgain``. It also checks that the root's
    ``nodeSummary[0][1]`` count equals the sum over its branches, both after
    ``fit`` and after ``fill`` with the test split.
    """
    data, feature_names = generate_classification_data()
    train_df, test_df = train_test_split(data,
                                         test_size=0.2,
                                         random_state=RANDOM_SEED)

    # Train the uplift tree classifier
    tree_model = UpliftTreeClassifier(control_name=TREATMENT_NAMES[0],
                                      random_state=RANDOM_SEED)

    profiler = cProfile.Profile(subcalls=True, builtins=True, timeunit=.001)
    profiler.enable()
    tree_model.fit(train_df[feature_names].values,
                   treatment=train_df['treatment_group_key'].values,
                   y=train_df[CONVERSION].values)

    uplift_scores = tree_model.predict(test_df[feature_names].values)
    profiler.disable()
    with open('UpliftTreeClassifier.prof', 'w') as prof_file:
        stats = pstats.Stats(profiler, stream=prof_file)
        stats.sort_stats('cumulative').print_stats()

    # Label the columns by class, then keep the treatment columns only.
    result = pd.DataFrame(uplift_scores, columns=tree_model.classes_)
    result = result.drop(columns=CONTROL_NAME)

    # Recommend control when every treatment score is negative, otherwise
    # the treatment with the highest score.
    all_negative = (result < 0).all(axis=1)
    best_treatment = np.where(all_negative, CONTROL_NAME,
                              result.idxmax(axis=1))

    # Create a synthetic population:

    # Indicator variables for whether a unit happened to have the
    # recommended treatment or was in the control group
    observed_group = test_df['treatment_group_key']
    actual_is_best = np.where(observed_group == best_treatment, 1, 0)
    actual_is_control = np.where(observed_group == CONTROL_NAME, 1, 0)

    synthetic = (actual_is_best == 1) | (actual_is_control == 1)
    synth = result[synthetic]

    # Add the evaluation columns, then drop the per-treatment score columns.
    with_metrics = synth.assign(
        is_treated=1 - actual_is_control[synthetic],
        conversion=test_df.loc[synthetic, CONVERSION].values,
        uplift_tree=synth.max(axis=1))
    metrics_frame = with_metrics.drop(columns=result.columns)

    gains = get_cumgain(metrics_frame,
                        outcome_col=CONVERSION,
                        treatment_col='is_treated')

    # The uplift tree must accumulate more gain than random targeting.
    model_gain = gains['uplift_tree'].sum()
    random_gain = gains['Random'].sum()
    assert model_gain > random_gain

    # Check if the total count is split correctly, at least for control group in the first level
    def validate_cnt(cur_tree):
        """Return ``[parent count, summed branch counts]`` for ``cur_tree``."""
        parent_count = cur_tree.nodeSummary[0][1]
        branches = (cur_tree.trueBranch, cur_tree.falseBranch)
        # assume the depth is at least 2
        assert branches[0] or branches[1]
        children_count = 0
        for branch in branches:
            if branch:
                children_count += branch.nodeSummary[0][1]
        return [parent_count, children_count]

    parent_count, children_count = validate_cnt(tree_model.fitted_uplift_tree)
    assert (parent_count > 0 and parent_count == children_count)

    # Check if it works as expected after filling with validation data
    tree_model.fill(test_df[feature_names].values,
                    treatment=test_df['treatment_group_key'].values,
                    y=test_df[CONVERSION].values)
    parent_count, children_count = validate_cnt(tree_model.fitted_uplift_tree)
    assert (parent_count > 0 and parent_count == children_count)
예제 #15
0
def test_UpliftRandomForestClassifier(
    generate_classification_data, backend, joblib_prefer
):
    """Verify ``UpliftRandomForestClassifier`` under several joblib backends.

    The forest is fitted inside ``parallel_backend(backend)``. Predictions
    are taken once under that backend and once each under the ``loky``,
    ``threading`` and ``multiprocessing`` backends with two jobs; all four
    must be equal. The predictions are then evaluated on a synthetic
    population and must beat the ``Random`` column from ``get_cumgain``.
    """
    data, feature_names = generate_classification_data()
    train_df, test_df = train_test_split(data, test_size=0.2, random_state=RANDOM_SEED)

    with parallel_backend(backend):
        # Train the uplift random forest classifier
        forest = UpliftRandomForestClassifier(
            min_samples_leaf=50,
            control_name=TREATMENT_NAMES[0],
            random_state=RANDOM_SEED,
            joblib_prefer=joblib_prefer,
        )
        forest.fit(
            train_df[feature_names].values,
            treatment=train_df["treatment_group_key"].values,
            y=train_df[CONVERSION].values,
        )

        # Predict under the outer backend first, then under each explicit
        # two-job backend in turn.
        predictions = {"single": forest.predict(test_df[feature_names].values)}
        for backend_name in ("loky", "threading", "multiprocessing"):
            with parallel_backend(backend_name, n_jobs=2):
                predictions[backend_name + "_2"] = forest.predict(
                    test_df[feature_names].values
                )

        # assert that the predictions coincide for single and all parallel computations
        reference, *others = predictions.values()
        assert all(np.array_equal(reference, other) for other in others)

        # The first entry (the "single" run) is used for the evaluation.
        result = pd.DataFrame(reference, columns=forest.classes_[1:])

        # Recommend control when every score is negative, otherwise the
        # column with the highest score.
        all_negative = (result < 0).all(axis=1)
        best_treatment = np.where(all_negative, CONTROL_NAME, result.idxmax(axis=1))

        # Create a synthetic population:

        # Indicator variables for whether a unit happened to have the
        # recommended treatment or was in the control group
        observed_group = test_df["treatment_group_key"]
        actual_is_best = np.where(observed_group == best_treatment, 1, 0)
        actual_is_control = np.where(observed_group == CONTROL_NAME, 1, 0)

        synthetic = (actual_is_best == 1) | (actual_is_control == 1)
        synth = result[synthetic]

        # Add the evaluation columns, then drop the per-class score columns.
        with_metrics = synth.assign(
            is_treated=1 - actual_is_control[synthetic],
            conversion=test_df.loc[synthetic, CONVERSION].values,
            uplift_tree=synth.max(axis=1),
        )
        metrics_frame = with_metrics.drop(columns=list(forest.classes_[1:]))

        gains = get_cumgain(
            metrics_frame, outcome_col=CONVERSION, treatment_col="is_treated"
        )

        # The forest must accumulate more gain than random targeting.
        model_gain = gains["uplift_tree"].sum()
        random_gain = gains["Random"].sum()
        assert model_gain > random_gain