def qc_prop_matching(self, rel_cols, label):
    """
    Evaluate the need for a propensity score matching.

    Can also be used to quality control a propensity score matched
    population. Trains classifiers on reduced copies of the dataframes
    yielded by iterating ``self`` and plots the resulting scores.

    :param rel_cols: relevant columns (any iterable of column names). The
        object passed in is never modified.
    :param label: Label or class which should be regressed
        (cohort1/cohort2, case/control, treatment/untreated etc.).
    """
    # Work on a private list: the caller's sequence stays untouched and
    # non-list inputs (tuple, pandas Index, ...) are accepted as well.
    selected = list(rel_cols)

    # The model covariates are all relevant columns except the label itself.
    # Filtering (instead of a single ``remove``) also drops repeated labels.
    covariates = [col for col in selected if col != label]

    # ``label`` is handed to the matcher as ``yvar``, so it has to be part of
    # the reduced frames even when the caller did not list it in ``rel_cols``.
    if label not in selected:
        selected.append(label)

    # create reduced copies of the dataframes for propensity score quality control
    qc_dfs = [df[selected] for df in self]

    # construct formula
    formula = construct_formula(label, covariates)

    # create Matcher
    m = Matcher(*qc_dfs, yvar=label, formula=formula)

    # train classifier to assess predictability
    m.fit_scores(balance=True, nmodels=10)

    # calculate and visualize propensity scores
    m.predict_scores()
    m.plot_scores()
def calc_propensity_scores(file_name):
    """Fit propensity scores for ``datasets/<file_name>.csv`` and return them.

    The CSV is read with its first column as index and restricted to the
    module-level ``fields`` columns. Every column whose first-row value
    cannot be converted to ``float`` is treated as categorical and one-hot
    encoded with ``drop_first=True``. Rows with ``T == 1`` form the test
    group and rows with ``T == 0`` the control group; ``Y`` is excluded from
    the matcher.

    Side effects: the score plot is saved to
    ``output/pm_results_<file_name>.png`` and the matcher's data is written
    to ``datasets/<file_name>_p.csv``.

    :param file_name: dataset name without directory and ``.csv`` suffix.
    :return: the ``scores`` column of the matcher's data.
    :raises ValueError: if the selected data contains no rows or columns.
    """
    data = pd.read_csv("datasets/{}.csv".format(file_name), index_col=0)[fields]
    if data.empty:
        # Without a first row the categorical probe below cannot work; fail
        # with a clear message instead of an unrelated error later on.
        raise ValueError("Dataset '{}' contains no data.".format(file_name))

    # Probe the first row only: a value that cannot be parsed as a number
    # marks its column as categorical.
    first_row = data.iloc[0]
    categorical_c = []
    for column in data.columns:
        try:
            float(first_row.loc[column])
        except (TypeError, ValueError):
            # Only conversion failures count; anything else (for example
            # KeyboardInterrupt) must propagate.
            categorical_c.append(column)

    data_dummy = pd.get_dummies(data, columns=categorical_c, drop_first=True)
    control = data_dummy[data_dummy["T"] == 0]
    test = data_dummy[data_dummy["T"] == 1]

    m = Matcher(test, control, yvar="T", exclude=["Y"])

    # Fixed seed so that fitting the scores is reproducible between runs.
    np.random.seed(20170925)
    m.fit_scores(balance=False, nmodels=1)
    m.predict_scores()

    m.plot_scores()
    plt.savefig("output/pm_results_{}.png".format(file_name))

    m.data.to_csv("datasets/{}_p.csv".format(file_name))
    return m.data["scores"]
control = data3[data3['treatment'] == 0][[ 'Unique_Index', 'state', 'city_ID', 'year', 'logprice_adjusted', 'ImportParcelID', 'timeid', 'treatment', 'YearBuilt', 'NoOfStories', 'TotalRooms', 'TotalBedrooms', 'area', 'LandAssessedValue_persqft' ]] control = control.fillna(control.mean()) m = Matcher(treated, control, yvar="treatment", exclude=[ 'Unique_Index', 'state', 'city_ID', 'year', 'ImportParcelID', 'timeid', 'logprice_adjusted' ]) m.fit_scores(balance=True, nmodels=50) m.predict_scores() m.match(method="min", nmatches=3, threshold=0.0001) m.assign_weight_vector() Matched = pd.concat([Matched, m.matched_data], sort=False) except: pass #%% sort out cities that have both CT Matched = pd.read_csv('Matched4-1to3-add landvaluepersqft-balance false.csv') treatment_city = Matched.groupby('city')['treatment'].value_counts().to_frame() treatment_city.rename(columns={'treatment': 'count'}, inplace=True) treatment_city = treatment_city.reset_index() treatment_city['treatment'] = treatment_city['treatment'].astype(int) treatment_city = treatment_city.pivot(index='city', columns='treatment',
def _pooled_fill_value(columns):
    """Return the imputation value for the pooled series in ``columns``.

    Object-dtype data (judged by the first series) is imputed with its most
    frequent value, everything else with the mean. Returns ``None`` when an
    object column holds no non-null value at all.
    """
    pooled = pd.concat(columns, axis=0)
    if str(columns[0].dtype) == 'object':
        counts = pooled.value_counts()
        return counts.index[0] if len(counts) else None
    return pooled.mean()


def propensity_match(exposure,
                     control,
                     covariates=('age', 'apache_prob', 'sepsis',
                                 'infection_skin_soft_tissue',
                                 'immunocompromised'),
                     outcome_var='aki',
                     seed=389202,
                     balance=False,
                     n_models=100,
                     verbose=False):
    """Match each exposure record to a control record on propensity score.

    Both dataframes are copied, so the caller's objects are not modified.
    A ``status`` column is written into the copies (1 for exposure, 0 for
    control) and used as the matcher's ``yvar``; an existing ``status``
    column is preserved as ``status_original``. Every exposure column that
    is not listed in ``covariates`` is excluded from the model. Missing
    values in the modelled columns are imputed with the pooled mean (or the
    pooled most frequent value for object columns) and reported via
    ``logger.warning``.

    :param exposure: dataframe of exposed records.
    :param control: dataframe of control records.
    :param covariates: column names used to model the propensity score.
    :param outcome_var: currently unused; kept for interface compatibility.
    :param seed: seed passed to ``np.random.seed`` before any fitting.
    :param balance: forwarded to ``Matcher.fit_scores``; ``n_models`` is
        only used when this is truthy.
    :param n_models: number of models fitted when ``balance`` is truthy.
    :param verbose: if truthy, plot the scores and display the categorical
        and continuous comparison tables.
    :return: the ``Matcher`` after scoring and matching.
    :raises ValueError: if none of ``covariates`` is a column of
        ``exposure``.
    """
    np.random.seed(seed)
    exposure = exposure.copy()
    control = control.copy()

    exposure_var = 'status'

    # make sure we don't overwrite the legit column status; control is only
    # backed up when it actually carries the column
    if exposure_var in exposure.columns:
        exposure['status_original'] = exposure[exposure_var]
        if exposure_var in control.columns:
            control['status_original'] = control[exposure_var]

    exposure.loc[:, exposure_var] = 1
    control.loc[:, exposure_var] = 0

    # split the exposure columns into modelled and excluded ones
    cols_exclude, cols_include = [], []
    for c in exposure.columns:
        if c == exposure_var:
            continue
        if c in covariates:
            cols_include.append(c)
        else:
            cols_exclude.append(c)

    if not cols_include:
        raise ValueError(
            'None of the covariates appear in the exposure dataframe.')

    logger.info((f'Columns included: {cols_include}'))

    # warn about missing data and missing columns
    excluded = set(cols_exclude)
    for c in exposure.columns:
        in_control = c in control
        modelled = c not in excluded
        n_exposure = exposure[c].isnull().sum()
        n_control = control[c].isnull().sum() if in_control else 0

        # Only compute a fill value when something will actually be filled;
        # this also keeps excluded columns of awkward dtypes from raising.
        fill_value = None
        if modelled and (n_exposure > 0 or n_control > 0):
            parts = [exposure[c]]
            if in_control:
                parts.append(control[c])
            fill_value = _pooled_fill_value(parts)

        if modelled and n_exposure > 0:
            logger.warning(
                f'Column {c} missing {n_exposure} observations in exposure dataframe.')
            if fill_value is not None:
                # Assign back instead of a chained in-place fillna, which
                # does not update the frame under pandas Copy-on-Write.
                exposure[c] = exposure[c].fillna(fill_value)

        if not in_control:
            logger.warning(f'Did not find column {c} in control dataframe.')
        elif modelled and n_control > 0:
            logger.warning(
                f'Column {c} missing {n_control} observations in control dataframe.'
            )
            if fill_value is not None:
                control[c] = control[c].fillna(fill_value)

    m = Matcher(exposure, control, yvar=exposure_var, exclude=cols_exclude)

    # predict the y outcome balancing the classes
    # repeat n_models times to be sure we use a lot of majority class data
    if balance:
        m.fit_scores(balance=balance, nmodels=n_models)
    else:
        m.fit_scores(balance=False)

    m.predict_scores()

    if verbose:
        m.plot_scores()

    # finds the closest match for each minority record
    m.match(method="min", nmatches=1, threshold=0.0005)

    # NOTE: m.record_frequency() is deliberately not called here; the
    # original author noted that it errors without categorical variables.

    if verbose:
        # NOTE(review): ``display`` is not defined in this block; presumably
        # provided by the notebook environment - confirm.
        cc = m.compare_categorical(return_table=True)
        display(cc)
        cc = m.compare_continuous(return_table=True)
        display(cc)

    return m