def causal_ATE():
    import numpy as np
    from causalinference import CausalModel
    from utils import random_data

    D = np.array([0, 0, 0, 1, 1, 1])
    X = np.array([[7, 8], [3, 10], [7, 10], [4, 7], [5, 10], [9, 8]])
    Y = random_data(D_cur=D, X_cur=X)
    print(Y)

    causal = CausalModel(Y, D, X)
    # causal.est_via_ols()
    # print(causal.estimates)
    causal.est_propensity_s()
    print(causal.propensity)
    # One can do the matching manually, based on the propensity scores.
    # Estimated propensity scores:
    print(causal.propensity['fitted'])
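# The comment above notes that matching can be done manually on the
# estimated propensity scores. A minimal sketch of greedy one-nearest-
# neighbor matching (with replacement) on causal.propensity['fitted'];
# the helper name is illustrative, not part of causalinference's API:
import numpy as np

def match_on_pscore(pscores, D):
    """Pair each treated unit with the control unit closest in pscore."""
    treated = np.flatnonzero(D == 1)
    controls = np.flatnonzero(D == 0)
    return [
        (t, controls[np.argmin(np.abs(pscores[controls] - pscores[t]))])
        for t in treated
    ]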
def predict(self, dataset: DatasetInterface):
    data = dataset.get_data()
    size = len(data) // 2
    Y, X = data['E3'].to_numpy(), data[['E1', 'E2']].to_numpy()
    # Randomly assign half the rows to control (0) and half to treatment
    # (1). Uses numpy directly: the pd.np alias was deprecated and has
    # been removed from pandas.
    D = np.concatenate((np.zeros(size), np.ones(len(data) - size)))
    np.random.shuffle(D)

    causal = CausalModel(Y, D, X)
    # Each estimator returns None and stores its results on the model,
    # so there is nothing useful to assign.
    causal.est_propensity()
    causal.est_propensity_s()
    causal.est_via_ols()
    causal.est_via_weighting()
# (Module-level imports assumed: time, csv, numpy as np,
# collections.defaultdict, django.db.connection, CausalModel, and the
# helpers get_qs_features_and_outcomes, values_list_to_records, err_handle.)
def causal_inference(
        platform, treatment_name, filename_prefix, filter_kwargs,
        exclude_kwargs, num_rows=None, quad_psm=False, simple_bin=None,
        trim_val=0, paired_psm=None, iterations=1, sample_num=None):
    """Use the causalinference module to perform causal inference analysis.

    Args:
        platform - r for reddit or s for stackoverflow
        treatment_name - the treatment variable (a field of the queryset)
        filename_prefix - will affect the output files
        filter_kwargs - which entities to include (e.g. has wiki link or not?)
        exclude_kwargs - which entities to exclude
        num_rows - the number of rows to use (defaults to all)
    """
    print(treatment_name)

    def mark_time(desc):
        """Return a (time, description) tuple."""
        return (time.time(), desc)

    start = time.time()
    summary = {}
    treatment_effects = defaultdict(list)
    goal = 0
    fails = 0
    for iteration in range(iterations):
        if float(iteration) / iterations >= goal:
            # print('{}/{}|'.format(iteration, iterations), end='')
            goal += 0.1
        out = []
        times, atts = [], []
        ndifs, big_ndifs_counts = [], []
        times.append(mark_time('function_start'))
        qs, features, outcomes = get_qs_features_and_outcomes(
            platform, num_rows=num_rows, filter_kwargs=filter_kwargs,
            exclude_kwargs=exclude_kwargs)
        if 'is_top' in filter_kwargs:
            outcomes = ['num_pageviews']
        features.append(treatment_name)
        features.append('uid')
        db_name = connection.settings_dict['NAME']
        filename = ('{pre}_Tr_{treatment}_on_{platform}_{subset}_{db}'
                    '_trim{trim_val}_samples{samples}.txt').format(**{
                        'pre': filename_prefix,
                        'treatment': treatment_name,
                        'platform': platform,
                        'subset': num_rows if num_rows else 'All',
                        'db': db_name,
                        'trim_val': trim_val,
                        'samples': sample_num if sample_num else '0,1,2',
                    })
        field_names = features + outcomes
        rows = qs.values_list(*field_names)
        if iterations > 1:
            # Bootstrap: resample the rows with replacement. Note that
            # np.random.randint's upper bound is exclusive.
            samples = []
            for _ in rows:
                rand_index = np.random.randint(0, len(rows))
                samples.append(rows[rand_index])
            records = values_list_to_records(samples, field_names)
        else:
            records = values_list_to_records(rows, field_names)
        times.append(mark_time('records_loaded'))
        feature_rows = []
        successful_fields = []
        for feature in features:
            feature_row = getattr(records, feature)
            if feature == treatment_name:
                D = feature_row
                continue
            elif feature == 'uid':
                ids = feature_row
                continue
            try:
                has_any_nans = any(np.isnan(feature_row))
            except Exception:
                continue
            if not np.any(feature_row):
                continue
            elif has_any_nans:
                continue
            else:
                if max(feature_row) > 1 or min(feature_row) < 0:
                    if feature in [
                        'user_link_karma', 'seconds_since_user_creation',
                        'user_comment_karma', 'user_reputation',
                    ]:
                        # Log-transform heavy-tailed features, shifting
                        # first so all values are positive.
                        minval = min(feature_row)
                        if minval <= 0:
                            shifted = np.add(-1 * minval + 1, feature_row)
                        else:
                            shifted = feature_row
                        adjusted_feature = np.log(shifted)
                    else:
                        adjusted_feature = feature_row
                    # Standardize to zero mean and unit variance.
                    adjusted_feature = (
                        adjusted_feature - np.mean(adjusted_feature)
                    ) / np.std(adjusted_feature)
                    feature_rows.append(adjusted_feature)
                else:
                    feature_rows.append(feature_row)
                successful_fields.append(feature)
        outcome_rows = []
        for outcome in outcomes:
            outcome_row = getattr(records, outcome)
            outcome_rows.append(outcome_row)
        times.append(mark_time('rows_loaded'))
        exclude_from_ps = []
        skip_fields = [
            'user_is_deleted', 'user_is_mod', 'user_is_suspended',
            'title_includes_question_mark',
        ]
        X = np.transpose(np.array(feature_rows))
        X_c = X[D == 0]
        X_t = X[D == 1]
        print('len X_c', len(X_c))
        print('len X_t', len(X_t))
        # Drop covariate columns that are all zero within either group
        # (sorted and deduped so the deletion offsets stay correct).
        to_delete, cols_deleted = [], 0
        for col_num, col in enumerate(X_c.T):
            if not np.any(col):
                to_delete.append(col_num)
        for col_num, col in enumerate(X_t.T):
            if not np.any(col):
                to_delete.append(col_num)
        for col_num in sorted(set(to_delete)):
            X = np.delete(X, col_num - cols_deleted, 1)
            successful_fields.remove(
                successful_fields[col_num - cols_deleted])
            cols_deleted += 1
        dummies = {
            'months': [
                'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug',
                'sep', 'octo', 'nov',
            ],
            'hours': ['zero_to_six', 'six_to_twelve', 'twelve_to_eighteen'],
            'contexts': [
                'in_todayilearned', 'in_borntoday', 'in_wikipedia',
                'in_CelebrityBornToday', 'in_The_Donald',
            ],
            'years': [
                'year2008', 'year2009', 'year2010', 'year2011', 'year2012',
                'year2013', 'year2014', 'year2015',
            ],
            'days': ['mon', 'tues', 'wed', 'thurs', 'fri', 'sat'],
        }
        # If the dummies in a category sum to the number of rows, the
        # category is collinear with the intercept; drop dummies until
        # each category has a left-out level.
        while True:
            can_break = True
            sums = defaultdict(int)
            total = X.shape[0]
            to_delete, cols_deleted = [], 0
            for col_num in range(X.shape[1]):
                for dummy_category, names in dummies.items():
                    if successful_fields[col_num] in names:
                        col = X.T[col_num]
                        sums[dummy_category] += np.sum(col)
            for dummy_category, names in dummies.items():
                if sums[dummy_category] == total:
                    for col_num in range(X.shape[1]):
                        if successful_fields[col_num] in names:
                            can_break = False
                            to_delete.append(col_num)
                            names.remove(successful_fields[col_num])
                            break
            for col_num in sorted(set(to_delete)):
                X = np.delete(X, col_num - cols_deleted, 1)
                successful_fields.remove(
                    successful_fields[col_num - cols_deleted])
                cols_deleted += 1
            if can_break:
                break
        Y = np.transpose(np.array(outcome_rows))
        causal = CausalModel(Y, D, X, ids=ids)
        times.append(mark_time('CausalModel'))
        out.append(str(causal.summary_stats))
        ndifs.append(causal.summary_stats['sum_of_abs_ndiffs'])
        big_ndifs_counts.append(causal.summary_stats['num_large_ndiffs'])
        # causal.est_via_ols()
        # times.append(mark_time('est_via_ols'))
        if not quad_psm:
            causal.est_propensity(successful_fields, exclude_from_ps)
            times.append(mark_time('propensity'))
        else:
            causal.est_propensity_s()
            times.append(mark_time('propensity_s'))
        varname_to_field = {
            "X{}".format(i): field
            for i, field in enumerate(successful_fields)
            if field not in exclude_from_ps
        }
        outname_to_field = {
            "Y{}".format(i): field for i, field in enumerate(outcomes)
        }
        for dic in [varname_to_field, outname_to_field]:
            for key, val in dic.items():
                out.append("{}:{}".format(key, val))
        out.append(str(causal.propensity))
        if trim_val == 's':
            causal.trim_s()
        else:
            causal.cutoff = float(trim_val)
            causal.trim()
        times.append(mark_time('trim_{}'.format(causal.cutoff)))
        out.append('TRIM PERFORMED: {}'.format(causal.cutoff))
        out.append(str(causal.summary_stats))
        ndifs.append(causal.summary_stats['sum_of_abs_ndiffs'])
        big_ndifs_counts.append(causal.summary_stats['num_large_ndiffs'])
        if paired_psm:
            psm_est, psm_summary, psm_rows = causal.est_via_psm()
            out.append('PSM PAIR REGRESSION')
            out.append(str(psm_summary))
            out.append(str(psm_est))
            diff_avg = 0
            for row in psm_rows:
                diff_avg += abs(row[1] - row[3])
            diff_avg /= len(psm_rows)
            out.append('Pscore diff average: {}'.format(diff_avg))
            with open('PSM_PAIRS' + filename, 'w') as outfile:
                for psm_row in psm_rows:
                    psm_row = [str(entry) for entry in psm_row]
                    outfile.write(','.join(psm_row) + '\n')
            atts = psm_est['ols']['att']
        else:
            if simple_bin:
                causal.blocks = int(simple_bin)
                causal.stratify()
                times.append(mark_time('stratify_{}'.format(simple_bin)))
            else:
                try:
                    causal.stratify_s()
                except ValueError:
                    fails += 1
                    continue
                times.append(mark_time('stratify_s'))
            out.append(str(causal.strata))
            try:
                causal.est_via_blocking(successful_fields, skip_fields)
                out += causal.estimates['blocking']['coef_rows']
                summary['blocking'] = [[filename]]
                summary['blocking'] += causal.estimates['blocking'].as_rows()
                times.append(mark_time('est_via_blocking'))
                atts = causal.estimates['blocking']['att']
                # Weight each stratum's balance statistics by its share
                # of the sample.
                w_avg_ndiff = 0
                w_num_large_ndiffs = 0
                for stratum in causal.strata:
                    val = stratum.summary_stats['sum_of_abs_ndiffs']
                    if np.isnan(val):
                        # Could be nan because a variable's standard
                        # deviation was 0.
                        continue
                    count = stratum.raw_data['N']
                    fraction = count / causal.raw_data['N']
                    w_avg_ndiff += fraction * val
                    w_num_large_ndiffs += (
                        fraction * stratum.summary_stats['num_large_ndiffs'])
                out.append(
                    'WEIGHTED AVERAGE OF SUM OF ABSOLUTE VALUE OF ALL NDIFs')
                ndifs.append(w_avg_ndiff)
                big_ndifs_counts.append(w_num_large_ndiffs)
                out.append(','.join([str(ndif) for ndif in ndifs]))
                out.append('# of BIG NDIFS')
                out.append(
                    ','.join([str(count) for count in big_ndifs_counts]))
                varname_to_field = {
                    "X{}".format(i): field
                    for i, field in enumerate(successful_fields)
                    if field not in skip_fields
                }
                out.append('VARS USED IN BLOCK REGRESSIONS')
                for dic in [varname_to_field]:
                    for key, val in dic.items():
                        out.append("{}:{}".format(key, val))
            except np.linalg.LinAlgError as err:
                msg = 'LinAlgError with est_via_blocking: {}'.format(err)
                err_handle(msg, out)
        out.append(str(causal.estimates))
        timing_info = {}
        prev = times[0][0]
        for cur_time, desc in times[1:]:
            timing_info[desc] = cur_time - prev
            prev = cur_time
        for key, val in timing_info.items():
            out.append("{}:{}".format(key, val))
        if iterations == 1:
            if FILTER_LANG:
                filename = 'lang_filtered_' + filename
            with open(filename, 'w') as outfile:
                outfile.write('\n'.join(out))
        else:
            for att_num, att in enumerate(atts):
                treatment_effects[outcomes[att_num]].append(att)
    if iterations > 1:
        boot_rows = [
            ['Bootstrap results for {} iterations of full resampling'.format(
                iterations), str(0.005), str(0.995)]
        ]
        for outcome, att_lst in treatment_effects.items():
            sor = sorted(att_lst)
            n = len(att_lst)
            bot = int(0.005 * n)
            top = int(0.995 * n)
            boot_rows.append([outcome, sor[bot], sor[top]])
        boot_rows.append([time.time() - start])
        with open('csv_files/' + 'BOOT_' + filename, 'w',
                  newline='') as outfile:
            writer = csv.writer(outfile)
            writer.writerows(boot_rows)
        # Run once more without resampling to get the point-estimate
        # summary alongside the bootstrap intervals.
        summary = causal_inference(
            platform, treatment_name, filename_prefix, filter_kwargs,
            exclude_kwargs, num_rows, quad_psm, simple_bin, trim_val,
            paired_psm, iterations=1, sample_num=sample_num)
    return summary
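# A hypothetical invocation, assuming the Django setup and helper modules
# this function relies on are importable; the treatment name and kwargs
# shown here are illustrative only:
#
# summary = causal_inference(
#     platform='r', treatment_name='has_wiki_link',
#     filename_prefix='example', filter_kwargs={'is_top': True},
#     exclude_kwargs={}, trim_val='s', iterations=1)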
from causalinference import CausalModel
from causalinference.utils import random_data

Y, D, X = random_data()
causal = CausalModel(Y, D, X)

# Propensity score
causal.est_propensity_s()
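# After fitting, the model is inspectable: printing causal.propensity
# shows the estimated logistic-regression results, and
# causal.propensity['fitted'] holds the per-unit propensity scores.
print(causal.propensity)
print(causal.propensity['fitted'])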
def run_causal_inference(self, merged):
    # Returns txt file of the causal model output
    log = open(self.default_output_file_name, 'a')
    log.write(
        "######## We have started the Causal Inference Analysis ##########"
        + "\n")
    log.flush()

    # Initialize arrays, data references, and reformat some of the columns
    merged['bill_rate'] = merged.bill_rate.astype('float')
    all_bill_rates = merged.bill_rate.astype('float')
    merged['work_experience'] = merged.work_experience.astype('float')
    all_work_experience = merged.work_experience
    all_education_id = merged.education_id
    # all_new_age_range_id = merged.new_age_range_id
    all_job_category_id = merged.job_category_id
    all_genders = merged.final_gender
    all_genders = merged.gender_computer  # Just looking at gender computer

    gender_array = []
    bill_rate_array = []
    all_covariates_array = []

    # Converting covariates to a matrix on a dichotomous scale
    def make_dichotomous_matrix(id_value, covariate, final_matrix):
        # Sorted so the one-hot columns come out in a stable order.
        for option in sorted(set(covariate)):
            if id_value == option:
                final_matrix.append(1)
            else:
                final_matrix.append(0)
        return final_matrix

    for gender in all_genders:
        if gender == "male":
            gender_array.append(0)
        elif gender == "female":  # Female as the treatment group
            gender_array.append(1)

    for rate in all_bill_rates:
        rate = round(float(rate), 2)
        bill_rate_array.append(rate)

    for row in merged.itertuples():
        job_category_matrix = []
        education_matrix = []
        # new_age_range_id_matrix = []
        individual_covariate_matrix = []
        job_category_matrix = make_dichotomous_matrix(
            row.job_category_id, all_job_category_id, job_category_matrix)
        education_matrix = make_dichotomous_matrix(
            row.education_id, all_education_id, education_matrix)
        # new_age_range_id_matrix = make_dichotomous_matrix(
        #     row.new_age_range_id, all_new_age_range_id,
        #     new_age_range_id_matrix)
        individual_covariate_matrix.extend(job_category_matrix)
        individual_covariate_matrix.extend(education_matrix)
        # individual_covariate_matrix.extend(new_age_range_id_matrix)
        individual_covariate_matrix.append(row.work_experience)
        all_covariates_array.append(individual_covariate_matrix)

    # Sanity checks (the three lengths must match for CausalModel)
    print("Bill rate array length: {0}".format(len(bill_rate_array)))
    print("Gender array length: {0}".format(len(gender_array)))
    print("All covariates array length: {0}".format(
        len(all_covariates_array)))

    # Create the causal model
    Y = np.array(bill_rate_array)
    D = np.array(gender_array)
    X = np.array(all_covariates_array)
    # np.seterr(divide='ignore', invalid='ignore')
    causal = CausalModel(Y, D, X)
    print("We've made the Causal Model!")
    log.write("We've made the Causal Model!" + "\n")

    log.write("---ORIGINAL STATS--- " + "\n")
    log.write(str(causal.summary_stats) + "\n")

    log.write("---MATCHING---" + "\n")
    causal.est_via_matching(bias_adj=True)
    print("We finished matching!!")
    log.write(str(causal.estimates) + "\n")
    log.write(str(causal.summary_stats) + "\n")

    log.write("---PROPENSITY SCORES---" + "\n")
    causal.est_propensity_s()
    print("We finished estimating propensity scores!!")
    log.write(str(causal.propensity) + "\n")
    log.write(str(causal.summary_stats) + "\n")

    log.write("---TRIMMING---" + "\n")
    causal.trim_s()
    # trim_s chooses the cutoff; record it rather than discarding it.
    log.write("Cutoff: " + str(causal.cutoff) + "\n")
    print("We finished trimming!!")
    log.write(str(causal.summary_stats) + "\n")

    log.write("---STRATIFYING---" + "\n")
    causal.stratify()
    print("We finished stratifying!!")
    log.write(str(causal.strata) + "\n")
    log.write(str(causal.summary_stats) + "\n")

    log.write(
        "---TREATMENT ESTIMATES (AFTER TRIMMING AND STRATIFYING)---" + "\n")
    causal.est_via_matching(bias_adj=True)
    print("We finished estimating via matching (after trimming)!!")
    log.write(str(causal.estimates) + "\n")
    log.write(str(causal.summary_stats) + "\n")
    print("We are all done with the causal inference analysis!")
    log.flush()
# %%
# NOTE: the opening of this statement was truncated upstream; a plausible
# reconstruction, assuming the observational pool is built from the PSID
# and CPS control files (the PSID file name is a guess based on the
# data_id values used below):
observational_data = pd.concat([
    pd.read_stata('../data/raw/psid_controls.dta'),
    pd.read_stata('../data/raw/cps_controls.dta'),
], ignore_index=True)

## preliminary analysis
# %%
treated = rct_data[rct_data.treat == 1]
synthetic_cps1 = pd.concat([
    treated,
    observational_data[observational_data.data_id == 'CPS1']
]).assign(treat=lambda x: x.treat.astype(bool))
# synthetic_psid = pd.concat([treated, observational_data[observational_data.data_id == 'PSID']])

# %%
# We use the CausalModel class from the causalinference package.
causal = CausalModel(
    Y=synthetic_cps1['re78'].values,
    D=synthetic_cps1['treat'].values,
    X=synthetic_cps1[[
        col for col in synthetic_cps1.columns
        if col not in ('treat', 're78', 'data_id')
    ]].values)

# adj=1 includes the covariates with a single common treatment
# coefficient, i.e. a "constant treatment effect" model.
causal.est_via_ols(adj=1)
print(causal.estimates)

# %%
print(causal.summary_stats)

# %%
# Estimate the propensity score so that propensity-based methods can be
# employed.
causal.est_propensity_s()
print(causal.propensity)

# %%
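# %%
# For contrast with the constant-effect model above: est_via_ols's adj
# argument controls covariate adjustment (adj=0 uses no covariates,
# adj=1 adds them with common coefficients, adj=2, the default, also
# interacts them with treatment, allowing heterogeneous effects).
causal.est_via_ols(adj=2)
print(causal.estimates)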
# Re-run causal analysis with test data to make sure you didn't break it :)
import numpy as np
from causalinference import CausalModel
from causalinference.utils.tools import vignette_data

Y, D, X = vignette_data()
Y = np.array([Y, Y]).T  # stack the outcome twice to get a two-column Y
ids = np.array(range(len(Y)))

causal = CausalModel(Y, D, X, ids=ids)
print(causal.summary_stats)

causal.est_via_ols()
print(causal.estimates)

causal.est_propensity_s()
print(causal.propensity)

causal.trim_s()
causal.stratify_s()
print(causal.strata)

causal.est_via_blocking([], [])
print(causal.estimates)
print(causal.estimates.keys())

# Expected ATT is 9.553
from causalinference import CausalModel
from causalinference.utils import random_data

Y, D, X = random_data()
causal = CausalModel(Y, D, X)

# Estimate the ATE several ways
causal.est_via_ols()

# Weighting and blocking both require an estimated propensity score, and
# blocking additionally requires the sample to be stratified first.
causal.est_propensity_s()
causal.est_via_weighting()
causal.stratify_s()
causal.est_via_blocking()

causal.est_via_matching(bias_adj=True)
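# Each call stores its result under its own key; causal.estimates can be
# read like a dictionary, so the point estimates are easy to compare:
for method in ('ols', 'weighting', 'blocking', 'matching'):
    print(method, causal.estimates[method]['ate'])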