Example #1
    def qc_prop_matching(self, rel_cols, label):
        """
        Evaluates the need for propensity score matching and can be used to
        quality-control a propensity-score-matched population. Trains
        classifiers and creates a plot.

        :param rel_cols: list of relevant columns
        :param label: label or class which should be regressed \
        (cohort1/cohort2, case/control, treated/untreated, etc.)
        """

        cols = rel_cols[:]  # copy so the caller's list is not mutated below

        # create reduced copies of the dataframes for propensity score quality control
        qc_dfs = []
        for df in self:
            qc_dfs.append(df[cols])

        # drop the label from the covariate list if it was included
        if label in cols:
            cols.remove(label)

        # construct formula
        formula = construct_formula(label, cols)

        # create Matcher
        m = Matcher(*qc_dfs, yvar=label, formula=formula)
        # train classifiers to assess predictability
        m.fit_scores(balance=True, nmodels=10)
        # calculate and visualize propensity scores
        m.predict_scores()
        m.plot_scores()
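
Note that construct_formula is not defined in this snippet. Since pymatch's Matcher accepts a patsy-style formula string, a minimal sketch of such a helper could look like the following (the name matches the call above, but the exact behavior of the original helper is an assumption):

def construct_formula(label, cols):
    # Hypothetical sketch: builds a patsy-style formula such as
    # "treatment ~ age + bmi + sex" from the label and covariate names.
    return "{} ~ {}".format(label, " + ".join(cols))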
Example #2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pymatch.Matcher import Matcher


def calc_propensity_scores(file_name):
    # `fields` is a module-level list of column names defined elsewhere in the source
    data = pd.read_csv("datasets/{}.csv".format(file_name), index_col=0)[fields]

    # treat any column whose first value cannot be parsed as a float as categorical
    categorical_c = []
    for a in data.columns:
        try:
            float(data.iloc[0].loc[a])
        except (ValueError, TypeError):
            categorical_c.append(a)

    # one-hot encode the categorical columns
    data_dummy = pd.get_dummies(data, columns=categorical_c, drop_first=True)

    # split into control (T == 0) and test (T == 1) groups
    control = data_dummy[data_dummy["T"] == 0]
    test = data_dummy[data_dummy["T"] == 1]

    # fit a single propensity model, excluding the outcome column "Y"
    m = Matcher(test, control, yvar="T", exclude=["Y"])
    np.random.seed(20170925)
    m.fit_scores(balance=False, nmodels=1)
    m.predict_scores()
    m.plot_scores()
    plt.savefig("output/pm_results_{}.png".format(file_name))
    m.data.to_csv("datasets/{}_p.csv".format(file_name))
    return m.data["scores"]
Example #3
            control = data3[data3['treatment'] == 0][[
                'Unique_Index', 'state', 'city_ID', 'year',
                'logprice_adjusted', 'ImportParcelID', 'timeid', 'treatment',
                'YearBuilt', 'NoOfStories', 'TotalRooms', 'TotalBedrooms',
                'area', 'LandAssessedValue_persqft'
            ]]
            control = control.fillna(control.mean())  # mean-impute missing covariates
            m = Matcher(treated,
                        control,
                        yvar="treatment",
                        exclude=[
                            'Unique_Index', 'state', 'city_ID', 'year',
                            'ImportParcelID', 'timeid', 'logprice_adjusted'
                        ])
            m.fit_scores(balance=True, nmodels=50)
            m.predict_scores()
            # match each treated record to its three closest controls by score
            m.match(method="min", nmatches=3, threshold=0.0001)
            m.assign_weight_vector()
            Matched = pd.concat([Matched, m.matched_data], sort=False)
        except Exception:
            # skip cities where matching fails (e.g. too few observations)
            pass

#%% sort out cities that have both control and treatment (CT) observations
Matched = pd.read_csv('Matched4-1to3-add landvaluepersqft-balance false.csv')
treatment_city = Matched.groupby('city')['treatment'].value_counts().to_frame()
treatment_city.rename(columns={'treatment': 'count'}, inplace=True)
treatment_city = treatment_city.reset_index()
treatment_city['treatment'] = treatment_city['treatment'].astype(int)

treatment_city = treatment_city.pivot(index='city',
                                      columns='treatment',
                                      values='count')  # assumption: the snippet is cut off here; 'count' is the column built above
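
The example is truncated at the pivot. Given the pivoted count table, a plausible continuation of the "cities that have both" filter (an assumption, not the original code) is:

# cities missing either treatment group get NaN after the pivot; drop them,
# then keep only matched rows from cities present in both groups
both_ct = treatment_city.dropna()
Matched = Matched[Matched['city'].isin(both_ct.index)]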
Example #4
import logging

import numpy as np
import pandas as pd
from IPython.display import display
from pymatch.Matcher import Matcher

logger = logging.getLogger(__name__)


def propensity_match(exposure,
                     control,
                     covariates=('age', 'apache_prob', 'sepsis',
                                 'infection_skin_soft_tissue',
                                 'immunocompromised'),
                     outcome_var='aki',
                     seed=389202,
                     balance=False,
                     n_models=100,
                     verbose=False):

    np.random.seed(seed)

    exposure = exposure.copy()
    control = control.copy()

    # preserve any existing 'status' column before it is overwritten with the exposure flag
    if 'status' in exposure.columns:
        exposure['status_original'] = exposure['status']
        control['status_original'] = control['status']
    exposure_var = 'status'
    exposure.loc[:, exposure_var] = 1
    control.loc[:, exposure_var] = 0

    # split columns into covariates (kept) and everything else (excluded)
    cols_exclude, cols_include = [], []
    for c in exposure.columns:
        if c == exposure_var:
            continue
        if c not in covariates:
            cols_exclude.append(c)
        else:
            cols_include.append(c)

    if len(cols_include) == 0:
        raise ValueError(
            'None of the covariates appear in the exposure dataframe.')
    logger.info(f'Columns included: {cols_include}')

    # impute missing data, warning about missing values and missing columns
    for c in exposure.columns:
        # impute with the most frequent value for string columns, the mean otherwise
        if str(exposure[c].dtype) == 'object':
            mu = pd.concat([exposure[c], control[c]],
                           axis=0).value_counts().index[0]
        else:
            mu = pd.concat([exposure[c], control[c]], axis=0).mean()

        n = exposure[c].isnull().sum()
        if (n > 0) and (c not in cols_exclude):
            logger.warning(
                f'Column {c} missing {n} observations in exposure dataframe.')
            exposure[c] = exposure[c].fillna(mu)

        if c not in control:
            logger.warning(f'Did not find column {c} in control dataframe.')
        else:
            n = control[c].isnull().sum()
            if (n > 0) and (c not in cols_exclude):
                logger.warning(
                    f'Column {c} missing {n} observations in control dataframe.')
                control[c] = control[c].fillna(mu)

    # print('Dataframe being used:')
    # display(exposure[cols].head())
    m = Matcher(exposure, control, yvar=exposure_var, exclude=cols_exclude)

    # fit the propensity models; with balance=True, fit n_models models on
    # balanced resamples so that most of the majority-class data gets used
    if balance:
        m.fit_scores(balance=balance, nmodels=n_models)
    else:
        m.fit_scores(balance=False)

    m.predict_scores()

    if verbose:
        m.plot_scores()

    # m.tune_threshold(method='random')
    m.match(
        method="min", nmatches=1,
        threshold=0.0005)  # finds the closest match for each minority record
    # m.record_frequency()

    # note: compare_categorical errors out if there are no categorical variables
    if verbose:
        cc = m.compare_categorical(return_table=True)
        display(cc)
        cc = m.compare_continuous(return_table=True)
        display(cc)

    return m
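
A hypothetical call, assuming exposure_df and control_df are cohort dataframes that both contain the default covariate columns (the dataframe names are illustrative):

# Hypothetical usage: exposure_df / control_df share the covariate columns above.
m = propensity_match(exposure_df, control_df, balance=True, verbose=True)
matched = m.matched_data  # pymatch stores the matched rows here, as in Example #3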