Example #1
0
def causal_ATE():
    import numpy as np
    from causalinference import CausalModel
    from utils import random_data

    D = np.array([0, 0, 0, 1, 1, 1])
    X = np.array([[7, 8], [3, 10], [7, 10], [4, 7], [5, 10], [9, 8]])
    Y = random_data(D_cur=D, X_cur=X)
    print(Y)

    causal = CausalModel(Y, D, X)
    #causal.est_via_ols()
    #print(causal.estimates)

    causal.est_propensity_s()
    print(causal.propensity)
    # You can do the matching yourself based on the propensity scores.

    # estimated propensity scores
    print(causal.propensity['fitted'])
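The closing comment suggests doing the matching yourself on the fitted propensity scores. A minimal nearest-neighbour sketch of that idea (not part of the original snippet; it assumes the causal and D objects created inside the function above):

import numpy as np

pscore = causal.propensity['fitted']  # one fitted score per unit
treated = np.flatnonzero(D == 1)
control = np.flatnonzero(D == 0)
for t in treated:
    # pick the control unit closest in propensity-score distance (illustrative only)
    match = control[np.argmin(np.abs(pscore[control] - pscore[t]))]
    print(t, match, abs(pscore[t] - pscore[match]))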
Example #3
0
    def predict(self, dataset: DatasetInterface):
        import numpy as np  # replaces the deprecated/removed pd.np alias used originally

        data = dataset.get_data()

        size = len(data) // 2
        Y, X = data['E3'].to_numpy(), data[['E1', 'E2']].to_numpy()

        # Randomly assign half the units to control (0) and half to treatment (1)
        D = np.concatenate((np.zeros(size), np.ones(len(data) - size)))
        np.random.shuffle(D)
        causal = CausalModel(Y, D, X)

        # In causalinference these estimators store their results on the model
        # (causal.propensity, causal.estimates) and return None, so the
        # assignments below only capture None.
        one = causal.est_propensity()
        two = causal.est_propensity_s()
        three = causal.est_via_ols()
        four = causal.est_via_weighting()

        help(causal)

        f = 4
Example #4
0
def causal_inference(
        platform, treatment_name,
        filename_prefix,
        filter_kwargs, exclude_kwargs,
        num_rows=None, quad_psm=False, simple_bin=None, trim_val=0,
        paired_psm=None, iterations=1, sample_num=None):
    """
    Args:
        platform - r for reddit or s for stackoverflow
        filename_prefix - will affect the output files
        filter_kwargs - which entities to include (e.g. has wiki link or not?)
        exlude_kwargs - which entities to exclude
        num_rows - the number of rows to use (defaults to all)
        
    Use causalinference module to perform causal inference analysis

    """
    print(treatment_name)
    def mark_time(desc):
        """return a tuple of time, description of time"""
        return (time.time(), desc)
    start = time.time()
    summary = {}
    treatment_effects = defaultdict(list)
    goal = 0
    fails = 0
    for iteration in range(iterations):
        if float(iteration) / iterations >= goal:
            # print('{}/{}|'.format(iteration, iterations), end='')
            goal += 0.1
        out = []
        times, atts = [], []
        ndifs, big_ndifs_counts = [], []
        times.append(mark_time('function_start'))
        qs, features, outcomes = get_qs_features_and_outcomes(
            platform, num_rows=num_rows, filter_kwargs=filter_kwargs, exclude_kwargs=exclude_kwargs)
        if 'is_top' in filter_kwargs:
            outcomes = ['num_pageviews']
        features.append(treatment_name)
        features.append('uid')
        db_name = connection.settings_dict['NAME']
        filename = '{pre}_Tr_{treatment}_on_{platform}_{subset}_{db}_trim{trim_val}_samples{samples}.txt'.format(**{
            'pre': filename_prefix,
            'treatment': treatment_name,
            'platform': platform,
            'subset': num_rows if num_rows else 'All',
            'db': db_name,
            'trim_val': trim_val,
            'samples': sample_num if sample_num else '0,1,2'
        })
        field_names = features + outcomes
        rows = qs.values_list(*field_names)

        if iterations > 1:
            samples = []
            for _ in rows:
                # np.random.randint excludes the upper bound, so pass len(rows)
                # to allow every row (including the last) to be resampled
                rand_index = np.random.randint(0, len(rows))
                samples.append(rows[rand_index])
            records = values_list_to_records(samples, field_names)
        else:
            records = values_list_to_records(rows, field_names)
        times.append(mark_time('records_loaded'))

        feature_rows = []
        successful_fields = []
        for feature in features:
            feature_row = getattr(records, feature)
            if feature == treatment_name:
                D = feature_row
                continue
            elif feature == 'uid':
                ids = feature_row
                continue
            try:
                has_any_nans = any(np.isnan(feature_row))
            except Exception:
                continue
            if not np.any(feature_row):
                continue
            elif has_any_nans:
                continue
            else:
                if max(feature_row) > 1 or min(feature_row) < 0:
                    if feature in [
                        'user_link_karma',
                        'seconds_since_user_creation',
                        'user_comment_karma',
                        'user_reputation',
                    ]:
                        minval = min(feature_row)
                        if minval <= 0:
                            shifted = np.add(-1 * minval + 1, feature_row)
                        else:
                            shifted = feature_row
                        adjusted_feature = np.log(shifted)
                    else:
                        adjusted_feature = feature_row
                    adjusted_feature = (
                        adjusted_feature - np.mean(adjusted_feature)) / np.std(adjusted_feature)
                    feature_rows.append(adjusted_feature)
                else:
                    feature_rows.append(feature_row)
                successful_fields.append(feature)
        outcome_rows = []
        for outcome in outcomes:
            outcome_row = getattr(records, outcome)
            outcome_rows.append(outcome_row)

        times.append(mark_time('rows_loaded'))
        exclude_from_ps = [
        ]
        skip_fields = [
            'user_is_deleted', 'user_is_mod', 'user_is_suspended',
            'title_includes_question_mark',
        ]

        X = np.transpose(np.array(feature_rows))
        X_c = X[D == 0]
        X_t = X[D == 1]
        print('len X_c', len(X_c))
        print('len X_t', len(X_t))
        to_delete, cols_deleted = [], 0
        for col_num, col in enumerate(X_c.T):
            if not np.any(col):
                to_delete.append(col_num)
        for col_num, col in enumerate(X_t.T):
            if not np.any(col):
                to_delete.append(col_num)
        # A column can be flagged twice (all-zero in both groups) and the two
        # scans yield unordered indices, so dedupe and sort before the index
        # shift below
        for col_num in sorted(set(to_delete)):
            X = np.delete(X, col_num - cols_deleted, 1)
            successful_fields.remove(successful_fields[col_num - cols_deleted])
            cols_deleted += 1

        dummies = {
            'months':	[
                'jan', 'feb', 'mar', 'apr',
                'may', 'jun', 'jul', 'aug', 'sep',
                'octo', 'nov',
            ],
            'hours': ['zero_to_six', 'six_to_twelve', 'twelve_to_eighteen', ],
            'contexts': ['in_todayilearned',
                         'in_borntoday', 'in_wikipedia', 'in_CelebrityBornToday',
                         'in_The_Donald', ],
            'years': ['year2008', 'year2009', 'year2010',
                      'year2011', 'year2012', 'year2013',
                      'year2014', 'year2015', ],
            'days': [
                'mon', 'tues', 'wed', 'thurs',
                'fri', 'sat',
            ],
        }
        while True:
            can_break = True
            sums = defaultdict(int)
            total = X.shape[0]
            to_delete, cols_deleted = [], 0

            for col_num in range(X.shape[1]):
                for dummy_category, names in dummies.items():
                    if successful_fields[col_num] in names:
                        col = X.T[col_num]
                        sums[dummy_category] += np.sum(col)

            for dummy_category, names in dummies.items():
                if sums[dummy_category] == total:
                    for col_num in range(X.shape[1]):
                        if successful_fields[col_num] in names:
                            can_break = False
                            to_delete.append(col_num)
                            names.remove(successful_fields[col_num])
                            break
            for col_num in to_delete:
                X = np.delete(X, col_num - cols_deleted, 1)
                successful_fields.remove(
                    successful_fields[col_num - cols_deleted])
                cols_deleted += 1
            if can_break:
                break
        Y = np.transpose(np.array(outcome_rows))
        causal = CausalModel(Y, D, X, ids=ids)
        times.append(mark_time('CausalModel'))
        out.append(str(causal.summary_stats))
        ndifs.append(causal.summary_stats['sum_of_abs_ndiffs'])
        big_ndifs_counts.append(causal.summary_stats['num_large_ndiffs'])
        # causal.est_via_ols()
        # times.append(mark_time('est_via_ols'))
        if not quad_psm:
            causal.est_propensity(successful_fields, exclude_from_ps)
            times.append(mark_time('propensity'))
        else:
            causal.est_propensity_s()
            times.append(mark_time('propensity_s'))
        varname_to_field = {
            "X{}".format(i): field for i, field in enumerate(
                successful_fields) if field not in exclude_from_ps
        }
        outname_to_field = {
            "Y{}".format(i): field for i, field in enumerate(outcomes)}
        for dic in [varname_to_field, outname_to_field]:
            for key, val in dic.items():
                out.append("{}:{}".format(key, val))
        out.append(str(causal.propensity))
        if trim_val == 's':
            causal.trim_s()
        else:
            causal.cutoff = float(trim_val)
            causal.trim()
        times.append(mark_time('trim_{}'.format(causal.cutoff)))
        out.append('TRIM PERFORMED: {}'.format(causal.cutoff))
        out.append(str(causal.summary_stats))
        ndifs.append(causal.summary_stats['sum_of_abs_ndiffs'])
        big_ndifs_counts.append(causal.summary_stats['num_large_ndiffs'])

        if paired_psm:
            psm_est, psm_summary, psm_rows = causal.est_via_psm()
            out.append('PSM PAIR REGRESSION')
            out.append(str(psm_summary))
            out.append(str(psm_est))
            diff_avg = 0
            for row in psm_rows:
                diff_avg += abs(row[1] - row[3])
            diff_avg /= len(psm_rows)
            out.append('Pscore diff average: {}'.format(diff_avg))

            with open('PSM_PAIRS' + filename, 'w') as outfile:
                for psm_row in psm_rows:
                    psm_row = [str(entry) for entry in psm_row]
                    outfile.write(','.join(psm_row))
            atts = psm_est['ols']['att']
        else:
            if simple_bin:
                causal.blocks = int(simple_bin)
                causal.stratify()
                times.append(mark_time('stratify_{}'.format(simple_bin)))
            else:
                try:
                    causal.stratify_s()
                except ValueError as err:
                    fails += 1
                    continue
                times.append(mark_time('stratify_s'))
            out.append(str(causal.strata))
            try:
                causal.est_via_blocking(successful_fields, skip_fields)
                out += causal.estimates['blocking']['coef_rows']
                summary['blocking'] = [[filename]]
                summary['blocking'] += causal.estimates['blocking'].as_rows()
                times.append(mark_time('est_via_blocking'))
                atts = causal.estimates['blocking']['att']
                w_avg_ndiff = 0
                w_num_large_ndiffs = 0
                for stratum in causal.strata:
                    val = stratum.summary_stats['sum_of_abs_ndiffs']
                    if np.isnan(val):
                        # could be nan because standard dev for a variable was 0
                        continue
                    count = stratum.raw_data['N']
                    fraction = count / causal.raw_data['N']
                    w_avg_ndiff += fraction * val
                    w_num_large_ndiffs += fraction * \
                        stratum.summary_stats['num_large_ndiffs']
                out.append(
                    'WEIGHTED AVERAGE OF SUM OF ABSOLUTE VALUE OF ALL NDIFs')
                ndifs.append(w_avg_ndiff)
                big_ndifs_counts.append(w_num_large_ndiffs)
                out.append(','.join([str(ndif) for ndif in ndifs]))
                out.append('# of BIG NDIFS')
                out.append(','.join([str(count)
                                     for count in big_ndifs_counts]))

                varname_to_field = {
                    "X{}".format(i): field for i, field in enumerate(
                        successful_fields) if field not in skip_fields
                }
                out.append('VARS USED IN BLOCK REGRESSIONS')
                for dic in [varname_to_field]:
                    for key, val in dic.items():
                        out.append("{}:{}".format(key, val))
            except np.linalg.LinAlgError as err:
                msg = 'LinAlgError with est_via_blocking: {}'.format(err)
                err_handle(msg, out)
        out.append(str(causal.estimates))
        timing_info = {}
        prev = times[0][0]
        for cur_time, desc in times[1:]:
            timing_info[desc] = cur_time - prev
            prev = cur_time
        for key, val in timing_info.items():
            out.append("{}:{}".format(key, val))
        if iterations == 1:
            if FILTER_LANG:
                filename = 'lang_filtered_' + filename
            with open(filename, 'w') as outfile:
                outfile.write('\n'.join(out))
        else:
            for att_num, att in enumerate(atts):
                treatment_effects[outcomes[att_num]].append(att)
    if iterations > 1:
        boot_rows = [
            ['Bootstrap results for {} iterations of full resampling'.format(
                iterations), str(0.005), str(0.995)]
        ]
        for outcome, att_lst in treatment_effects.items():
            sor = sorted(att_lst)
            n = len(att_lst)
            bot = int(0.005 * n)
            top = int(0.995 * n)
            boot_rows.append([
                outcome, sor[bot], sor[top]
            ])
        boot_rows.append([time.time() - start])
        with open('csv_files/' + 'BOOT_' + filename, 'w', newline='') as outfile:
            writer = csv.writer(outfile)
            writer.writerows(boot_rows)
        summary = causal_inference(
            platform, 
            treatment_name,
            filename_prefix,
            filter_kwargs, exclude_kwargs,
            num_rows, quad_psm, simple_bin, trim_val,
            paired_psm, iterations=1, sample_num=sample_num)
    return summary
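A hypothetical invocation might look like the following; every argument value here is an illustrative placeholder rather than a configuration taken from the original project:

summary = causal_inference(
    platform='r',                     # 'r' for reddit, 's' for stackoverflow
    treatment_name='has_wiki_link',   # hypothetical treatment field
    filename_prefix='example_run',
    filter_kwargs={'is_top': True},   # hypothetical inclusion filter
    exclude_kwargs={},
    num_rows=10000,
    trim_val=0.1,
    iterations=1,
)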
Example #5
0
from causalinference import CausalModel
from causalinference.utils import random_data

Y, D, X = random_data()
causal = CausalModel(Y, D, X)

#Propensity Score
causal.est_propensity_s()
Example #6
0
    def run_causal_inference(
            self, merged):  # Writes the causal model output to a txt file

        log = open(self.default_output_file_name, 'a')
        log.write(
            "########  We have started the Causal Inference Analysis ##########"
            + "\n")
        log.flush()

        # Initialize arrays, data references, and reformat some of the columns

        merged['bill_rate'] = merged.bill_rate.astype('float')
        all_bill_rates = merged.bill_rate.astype('float')
        merged['work_experience'] = merged.work_experience.astype('float')
        all_work_experience = merged.work_experience
        all_education_id = merged.education_id
        #all_new_age_range_id = merged.new_age_range_id
        all_job_category_id = merged.job_category_id
        all_genders = merged.final_gender
        all_genders = merged.gender_computer  # Just looking at gender computer
        gender_array = []
        bill_rate_array = []
        all_covariates_array = []

        # Converting covariates to a matrix on a dichotomous scale

        def make_dichotomous_matrix(id_value, covariate, final_matrix):
            # Append one 0/1 indicator per distinct value of the covariate
            # (column order follows set iteration order)
            for option in list(set(covariate)):
                if (id_value == option):
                    final_matrix.append(1)
                else:
                    final_matrix.append(0)
            return final_matrix

        for gender in all_genders:
            if (gender == "male"):
                gender_array.append(0)
            elif (gender == "female"):  # Female as the treatment group
                gender_array.append(1)

        for rate in all_bill_rates:
            rate = round(float(rate), 2)
            bill_rate_array.append(rate)

        for row in merged.itertuples():
            job_category_matrix = []
            education_matrix = []
            #new_age_range_id_matrix = []
            individual_covariate_matrix = []

            job_category_matrix = make_dichotomous_matrix(
                row.job_category_id, all_job_category_id, job_category_matrix)
            education_matrix = make_dichotomous_matrix(row.education_id,
                                                       all_education_id,
                                                       education_matrix)
            #new_age_range_id_matrix = make_dichotomous_matrix(row.new_age_range_id, all_new_age_range_id, new_age_range_id_matrix)

            individual_covariate_matrix.extend(job_category_matrix)
            individual_covariate_matrix.extend(education_matrix)
            #individual_covariate_matrix.extend(new_age_range_id_matrix)
            individual_covariate_matrix.append(row.work_experience)
            all_covariates_array.append(individual_covariate_matrix)

        # Sanity checks
        print "Bill rate array length: {0}".format(len(bill_rate_array))
        print "Gender array length: {0}".format(len(gender_array))
        print "All covariates array length: {0}".format(
            len(all_covariates_array))

        # Create the causal model
        Y = np.array(bill_rate_array)
        D = np.array(gender_array)
        X = np.array(all_covariates_array)
        # np.seterr(divide='ignore', invalid='ignore')

        causal = CausalModel(Y, D, X)
        print "We've made the Causal Model!"
        log.write("We've made the Causal Model!" + "\n")

        log.write("---ORIGINAL STATS--- " + "\n")
        log.write(str(causal.summary_stats) + "\n")

        log.write("---MATCHING---" + "\n")
        causal.est_via_matching(bias_adj=True)
        print "We finished matching!!"
        log.write(str(causal.estimates) + "\n")
        log.write(str(causal.summary_stats) + "\n")

        log.write("---PROPENSITY SCORES---" + "\n")
        causal.est_propensity_s()
        print "We finished estimating propensity scores!!"
        log.write(str(causal.propensity) + "\n")
        log.write(str(causal.summary_stats) + "\n")

        log.write("---TRIMMING---" + "\n")
        causal.trim_s()
        causal.cutoff  # cutoff chosen by trim_s (attribute access only, not logged)
        print("We finished trimming!!")
        log.write(str(causal.summary_stats) + "\n")

        log.write("---STRATIFYING---" + "\n")
        causal.stratify()
        print "We finished stratifying!!"
        log.write(str(causal.strata) + "\n")
        log.write(str(causal.summary_stats) + "\n")

        log.write(
            "---TREATMENT ESTIMATES (AFTER TRIMMING AND STRATIFYING)---" +
            "\n")
        causal.est_via_matching(bias_adj=True)
        print "We finished estimating via matching (after trimming)!!"
        log.write(str(causal.estimates) + "\n")
        log.write(str(causal.summary_stats) + "\n")

        print "We are all done with the causal inference analysis!"
        log.flush()
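As an aside, the hand-rolled dichotomous (one-hot) matrices above can be built more compactly with pandas. A rough, hypothetical equivalent (it assumes the same merged DataFrame; the names indicator_cols and X_alt are illustrative, and the column order will differ from the set-based loop):

import numpy as np
import pandas as pd

indicator_cols = pd.get_dummies(
    merged[['job_category_id', 'education_id']].astype('category'))
X_alt = np.column_stack([indicator_cols.to_numpy(dtype=float),
                         merged['work_experience'].astype(float).to_numpy()])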
Example #7
0
    pd.read_stata('../data/raw/cps_controls.dta')
], ignore_index=True)
## preliminary analysis
# %%
treated = rct_data[rct_data.treat == 1]
synthetic_cps1 = pd.concat([
    treated, observational_data[observational_data.data_id == 'CPS1']
]).assign(treat=lambda x: x.treat.astype(bool))
#synthetic_psid = pd.concat([treated, observational_data[observational_data.data_id == 'PSID']])
# %%
# we use the CausalModel class from the causalinference package

causal = CausalModel(Y=synthetic_cps1['re78'].values,
                     D=synthetic_cps1['treat'].values,
                     X=synthetic_cps1[[
                         col for col in synthetic_cps1.columns
                         if col not in ('treat', 're78', 'data_id')
                     ]].values)

causal.est_via_ols(adj=1)
# adj=1 adjusts for the covariates but includes no treatment-covariate
# interactions, i.e. it assumes a constant ("homogeneous") treatment effect

print(causal.estimates)
# %%
print(causal.summary_stats)
# %%
# this estimates the propensity score, so that propensity-based methods can be employed
causal.est_propensity_s()
print(causal.propensity)
# %%
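With the propensity scores estimated, the usual next steps in causalinference are trimming, stratifying, and re-estimating. A short sketch of how the notebook could continue (illustrative only, not from the original analysis):

# %%
# Trim units with extreme propensity scores, stratify on the score, re-estimate
causal.trim_s()
causal.stratify_s()
print(causal.summary_stats)

causal.est_via_blocking()
causal.est_via_matching(bias_adj=True)
print(causal.estimates)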
Example #8
0

# re-run causal analysis with test data to make sure you didn't break it :)
import numpy as np
from causalinference import CausalModel

from causalinference.utils.tools import vignette_data

Y, D, X = vignette_data()

Y = np.array([Y, Y]).T
ids = np.array(range(len(Y)))

causal = CausalModel(Y, D, X, ids=ids)

print(causal.summary_stats)
causal.est_via_ols()
print(causal.estimates)
causal.est_propensity_s()
print(causal.propensity)
causal.trim_s()
causal.stratify_s()
print(causal.strata)
causal.est_via_blocking([], [])
print(causal.estimates)
print(causal.estimates.keys())

## expected ATT is 9.553
Example #9
0
from causalinference import CausalModel
from causalinference.utils import random_data

Y, D, X = random_data()
causal = CausalModel(Y, D, X)

# The weighting and blocking estimators need a propensity model and strata first
causal.est_propensity_s()
causal.stratify_s()

# Estimate average treatment effects with four different estimators
causal.est_via_ols()
causal.est_via_weighting()
causal.est_via_blocking()
causal.est_via_matching(bias_adj=True)
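The fitted results from all four estimators are stored on the model and can be inspected together:

print(causal.estimates)
print(causal.estimates.keys())   # e.g. 'ols', 'weighting', 'blocking', 'matching'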