Code example #1
def run_parameterized_estimators(
    df, df2=None,
    stopwords=None,
    vocab_size=2000,
    use_counts=False,
    threshold=0.8,
    only_zeros=True,
    inner_alpha='optimal',
    outer_alpha='optimal',
    g_weight=1, Q_weight=1, mlm_weight=1, run_cb=False):
    """ Run all the ATE estimators based on models:
            regression expansion (+pu classifier), bert adjustment, and
                regression expansion + bert.
    """
    X, vocab, vectorizer = prepare_covariates(df, stopwords, vocab_size, use_counts)
    T_true = df['T_true'].to_numpy()
    T_proxy = df['T_proxy'].to_numpy()

    # PU classifier expansion (only_zeros comes from the function argument)
    pu = label_expansion.PUClassifier(
        inner_alpha=inner_alpha,
        outer_alpha=outer_alpha)
    pu.fit(X, T_proxy)
    T_plus_pu = label_expansion.expand_variable(pu, X, T_proxy,
        threshold=threshold,
        only_zeros=only_zeros)
    ATE_pu = util.ATE_adjusted(df.C_true, T_plus_pu, df.Y_sim)

    # Plain regression expansion
    # NOTE: scikit-learn >= 1.3 spells this loss "log_loss"
    reg = SGDClassifier(loss="log", penalty="l2", alpha=outer_alpha)
    reg.fit(X, T_proxy)
    T_plus_reg = label_expansion.expand_variable(reg, X, T_proxy,
        threshold=threshold,
        only_zeros=only_zeros)
    ATE_reg = util.ATE_adjusted(df.C_true, T_plus_reg, df.Y_sim)

    if run_cb:
        cbw = CausalBert.CausalBertWrapper(g_weight=g_weight, Q_weight=Q_weight, mlm_weight=mlm_weight)
        cbw.train(df['text'], df.C_true, df.T_proxy, df.Y_sim, epochs=3)
        ATE_cb_Tproxy = cbw.ATE(df.C_true, df['text'], Y=df.Y_sim, platt_scaling=False)

        cbw = CausalBert.CausalBertWrapper(g_weight=g_weight, Q_weight=Q_weight, mlm_weight=mlm_weight)
        cbw.train(df['text'], df.C_true, T_plus_pu, df.Y_sim, epochs=3)
        ATE_cb_Tplus = cbw.ATE(df.C_true, df['text'], Y=df.Y_sim, platt_scaling=False)

    else:
        ATE_cb_Tproxy, ATE_cb_Tplus = -1, -1

    return ATE_pu, ATE_reg, ATE_cb_Tproxy, ATE_cb_Tplus
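A minimal usage sketch for run_parameterized_estimators follows; the import, the input file name, and the alpha values are illustrative assumptions (the repo's util, label_expansion, and CausalBert modules plus a prepare_covariates helper are assumed importable, and df is assumed to carry text, T_true, T_proxy, C_true, and Y_sim columns):

# Hypothetical driver for the function above; file name and hyperparameters are examples.
import pandas as pd

df = pd.read_csv('simulated_reviews.csv')  # assumed columns: text, T_true, T_proxy, C_true, Y_sim
ATE_pu, ATE_reg, ATE_cb_Tproxy, ATE_cb_Tplus = run_parameterized_estimators(
    df,
    vocab_size=2000,
    threshold=0.8,
    inner_alpha=1e-3,   # numeric regularization strengths (SGDClassifier's alpha must be a float)
    outer_alpha=1e-3,
    run_cb=False)       # set True to also train the two CausalBert adjusters
print({'ate_pu': ATE_pu, 'ate_reg': ATE_reg})
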
Code example #2
def run_experiment(args):
    """ Run an experiment with the given args and seed.

        Returns {causal estimator: ATE estimate}
    """
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    df = get_data(args)

    ATE_estimates = []

    if 'T_true' in df:
        ATE_estimates.append(
            ('unadj_T', util.ATE_unadjusted(df.T_true, df.Y_sim)))
        ATE_estimates.append(
            ('ate_T', util.ATE_adjusted(df.C_true, df.T_true, df.Y_sim))
        )
        ATE_estimates.append(
            ('ate_matrix', util.ATE_matrix(df.T_true, df.T_proxy, df.C_true, df.Y_sim))
        )

    if 'T_proxy' in df:
        ATE_estimates.append(
            ('unadj_T_proxy', util.ATE_unadjusted(df.T_proxy, df.Y_sim)))
        ATE_estimates.append(
            ('ate_T_proxy', util.ATE_adjusted(df.C_true, df.T_proxy, df.Y_sim)))

        ATE_T_plus_reg, T_plus_reg = run_label_expansion(df, args, 
            inner_alpha=args.ina, outer_alpha=args.outa, threshold=args.thre)
        ATE_estimates.append(('ate_T_plus_reg', ATE_T_plus_reg))

        ATE_T_plus_pu, T_plus_pu = run_label_expansion(df, args, single_class=True, 
            inner_alpha=args.ina, outer_alpha=args.outa, threshold=args.thre)
        ATE_estimates.append(('ate_T_plus_pu', ATE_T_plus_pu))

        if args.run_cb:
            cbw = CausalBert.CausalBertWrapper(g_weight=args.g_weight, Q_weight=args.Q_weight, mlm_weight=args.mlm_weight)
            cbw.train(df['text'], df.C_true, df.T_proxy, df.Y_sim, epochs=3)
            ATE_cb_Tproxy = cbw.ATE(df.C_true, df['text'], Y=df.Y_sim, platt_scaling=False)
            ATE_estimates.append(('ate_cb_T_proxy', ATE_cb_Tproxy))

            cbw = CausalBert.CausalBertWrapper(g_weight=args.g_weight, Q_weight=args.Q_weight, mlm_weight=args.mlm_weight)
            cbw.train(df['text'], df.C_true, T_plus_pu, df.Y_sim, epochs=3)
            ATE_cb_Tplus = cbw.ATE(df.C_true, df['text'], Y=df.Y_sim, platt_scaling=False)
            ATE_estimates.append(('ate_cb_T_plus', ATE_cb_Tplus))

    return dict(ATE_estimates)
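A sketch of how run_experiment might be driven without the project's CLI; the argparse.Namespace fields mirror the attributes referenced above (seed, ina, outa, thre, vs, run_cb, and the CausalBert weights), but the exact flag names, defaults, and whatever get_data(args) additionally requires are assumptions:

import argparse

# Hypothetical stand-in for the parsed command-line arguments; any extra fields
# needed by get_data(args) are omitted here.
args = argparse.Namespace(
    seed=0,
    run_cb=False,
    g_weight=1.0, Q_weight=0.1, mlm_weight=1.0,   # only used when run_cb=True
    ina=1e-3, outa=1e-3, thre=0.8,                # inner/outer alpha and expansion threshold
    vs=2000)                                      # vocab size read by run_label_expansion
estimates = run_experiment(args)                  # {estimator name: ATE estimate}
for name, ate in estimates.items():
    print(name, ate)
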
Code example #3
File: main.py  Project: rpryzant/causal-text
def run_label_expansion(df,
                        args,
                        stopwords=None,
                        use_counts=False,
                        single_class=False,
                        only_zeros=True,
                        inner_alpha='optimal',
                        outer_alpha='optimal',
                        threshold=0.8):
    X, vocab, vectorizer = prepare_covariates(df, stopwords, args.vs,
                                              use_counts)
    T_proxy = df['T_proxy'].to_numpy()

    # positive-unlabeled (one-class) learning
    if single_class:
        model = label_expansion.PUClassifier(inner_alpha=inner_alpha,
                                             outer_alpha=outer_alpha)

    # use a logistic regression
    else:
        model = SGDClassifier(loss="log", penalty="l2", alpha=outer_alpha)

    model.fit(X, T_proxy)
    T_plus = label_expansion.expand_variable(model,
                                             X,
                                             T_proxy,
                                             threshold=threshold,
                                             only_zeros=only_zeros)
    ATE_plus = util.ATE_adjusted(df.C_true, T_plus, df.Y_sim)

    return ATE_plus, T_plus
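A short usage sketch contrasting the two branches of run_label_expansion; it assumes df already holds T_proxy, C_true, and Y_sim, that args.vs is the vocabulary size, and it passes explicit float alphas rather than the 'optimal' string default:

# Plain regression expansion vs. PU (one-class) expansion; values are illustrative.
ate_reg, T_plus_reg = run_label_expansion(
    df, args, single_class=False,
    inner_alpha=1e-3, outer_alpha=1e-3, threshold=0.8)
ate_pu, T_plus_pu = run_label_expansion(
    df, args, single_class=True,
    inner_alpha=1e-3, outer_alpha=1e-3, threshold=0.8)
print('regression expansion ATE:', ate_reg)
print('PU expansion ATE:', ate_pu)
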
Code example #4
            # if i > 100: break

        data = (torch.tensor(out[x])
                for x in ['W_ids', 'W_len', 'W_mask', 'C', 'T', 'Y'])
        data = TensorDataset(*data)
        sampler = RandomSampler(
            data) if sampler == 'random' else SequentialSampler(data)
        dataloader = DataLoader(data,
                                sampler=sampler,
                                batch_size=self.batch_size)
        # collate_fn=collate_CandT)

        return dataloader


if __name__ == '__main__':
    import pandas as pd
    df = pd.read_csv('TEST_DF')  # placeholder path from the original test stub
    cb = CausalBertWrapper(batch_size=2,
                           g_weight=0.0,
                           Q_weight=0,
                           mlm_weight=1)
    cb.train(df['review'], df.C_true, df.T_true, df.Y_sim)
    # print(cb.inference(df['review'], df.C_true, df.T_proxy))
    # print(cb.ATE(df.C_true, df['review'], df.Y_sim, platt_scaling=True))

    import util
    print(util.ATE_adjusted(df.C_true, df.T_true, df.Y_sim))

    quit()
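For reference, a sketch of how the returned dataloader might be consumed; the method name build_dataloader and its argument order are assumptions read off the snippet above, and the batch layout follows the tensor order (W_ids, W_len, W_mask, C, T, Y) built there:

# Hypothetical consumption loop (method name and signature assumed, not confirmed).
loader = cb.build_dataloader(df['review'], df.C_true, df.T_true, df.Y_sim)
for W_ids, W_len, W_mask, C, T, Y in loader:
    # each element is a torch.Tensor with at most `batch_size` rows
    print(W_ids.shape, C.shape, T.shape, Y.shape)
    break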