    def test_MICE1(self):

        df = gendat()
        imp_data = mice.MICEData(df)
        mi = mice.MICE("y ~ x1 + x2 + x1:x2", sm.OLS, imp_data)

        from statsmodels.regression.linear_model import RegressionResultsWrapper

        for j in range(3):
            x = mi.next_sample()
            assert (issubclass(x.__class__, RegressionResultsWrapper))
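Note: gendat() is a helper defined elsewhere in statsmodels' test module and is not shown in these excerpts. A rough stand-in that produces data of the same shape (200 rows; columns x1-x5 plus y, with blocks of missing values and a binary x3) might look like the sketch below; the exact indices and values asserted in some tests depend on the real helper, so this is illustrative only.

import numpy as np
import pandas as pd

def gendat_standin(seed=0):
    # Illustrative stand-in for the gendat() test helper; not the original.
    rng = np.random.RandomState(seed)
    n = 200
    x = rng.normal(size=(n, 5))
    x[:, 2] = (x[:, 2] > 0).astype(float)        # binary column, imputed with a Binomial GLM in the tests
    y = x.sum(1) + rng.normal(size=n)
    df = pd.DataFrame(x, columns=["x%d" % (j + 1) for j in range(5)])
    df["y"] = y
    df.loc[0:59, "x1"] = np.nan                  # blocks of missing values, roughly matching the tests
    df.loc[0:39, "x2"] = np.nan
    df.loc[range(10, 30, 2), "x3"] = np.nan
    df.loc[150:179, "x4"] = np.nan
    df.loc[180:194, "y"] = np.nan
    return df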
Example 2
    def test_plot_imputed_hist(self):

        df = gendat()
        imp_data = mice.MICEData(df)
        imp_data.update_all()

        plt.clf()
        for plot_points in False, True:
            fig = imp_data.plot_imputed_hist('x4')
            fig.get_axes()[0].set_title('plot_imputed_hist')
            close_or_save(pdf, fig)
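Note: the plotting tests rely on a module-level pdf handle and a close_or_save helper defined elsewhere in statsmodels' test file. A stand-in consistent with how they are used here (an assumption, not the original code) is:

import matplotlib.pyplot as plt

pdf = None  # set to a matplotlib PdfPages object to save figures instead of discarding them

def close_or_save(pdf, fig):
    # Save the figure if a PDF handle is available, then close it.
    if pdf is not None:
        pdf.savefig(fig)
    plt.close(fig)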
Example 3
    def test_plot_bivariate(self, close_figures):

        df = gendat()
        imp_data = mice.MICEData(df)
        imp_data.update_all()

        plt.clf()
        for plot_points in False, True:
            fig = imp_data.plot_bivariate('x2', 'x4', plot_points=plot_points)
            fig.get_axes()[0].set_title('plot_bivariate')
            close_or_save(pdf, fig)
Example 4
    def test_fit_obs(self):

        df = gendat()
        imp_data = mice.MICEData(df)
        imp_data.update_all()

        plt.clf()
        for plot_points in False, True:
            fig = imp_data.plot_fit_obs('x4', plot_points=plot_points)
            fig.get_axes()[0].set_title('plot_fit_scatterplot')
            close_or_save(pdf, fig)
Example 5
    def test_MICE(self):

        df = gendat()
        imp_data = mice.MICEData(df)
        mi = mice.MICE("y ~ x1 + x2 + x1:x2", sm.OLS, imp_data)
        result = mi.fit(1, 3)

        assert (issubclass(result.__class__, mice.MICEResults))

        # Smoke test for results
        smr = result.summary()
Example 6
    def test_MICE2(self):

        from statsmodels.genmod.generalized_linear_model import GLMResultsWrapper

        df = gendat()
        imp_data = mice.MICEData(df)
        mi = mice.MICE("x3 ~ x1 + x2", sm.GLM, imp_data,
                       init_kwds={"family": sm.families.Binomial()})

        for j in range(3):
            x = mi.next_sample()
            assert(isinstance(x, GLMResultsWrapper))
            assert(isinstance(x.family, sm.families.Binomial))
Example 7
    def test_next_sample(self):

        df = gendat()
        imp_data = mice.MICEData(df)

        all_x = []
        for j in range(2):
            x = imp_data.next_sample()
            assert (isinstance(x, pd.DataFrame))
            assert_equal(df.shape, x.shape)
            all_x.append(x)

        # The returned dataframes are all the same object
        assert (all_x[0] is all_x[1])
Example 8
    def test_plot_missing_pattern(self):

        df = gendat()
        imp_data = mice.MICEData(df)

        for row_order in "pattern", "raw":
            for hide_complete_rows in False, True:
                for color_row_patterns in False, True:
                    plt.clf()
                    fig = imp_data.plot_missing_pattern(
                        row_order=row_order,
                        hide_complete_rows=hide_complete_rows,
                        color_row_patterns=color_row_patterns)
                    close_or_save(pdf, fig)
Example 9
    def runDataImputation(self):
        df = self.rawdata.copy()
        df = df.apply(pd.to_numeric)

        # Rename columns: statsmodels imputation does not like weird column names
        impIndex = []
        origIndex = []
        colCounter = 1
        for ithCol in df:
            impIndex.append('Var' + str(colCounter))
            origIndex.append(ithCol.replace('\n', ', ').replace('\r', ''))
            colCounter = colCounter + 1
        df.columns = impIndex

        # Get columns with nans
        naSums = df.isnull().sum()
        naCols = naSums[naSums > 0]
        if len(naCols) < 1:  #don't run imputation if there are no missing data
            QtWidgets.QMessageBox.question(
                self, 'Info', 'There are no missing data points to fill in...',
                QtWidgets.QMessageBox.Ok)
            return
        datetimeIdx = df.index
        self.imputeeData = df[naCols.index]

        # Run imputation
        imp = mice.MICEData(df)
        #print(df.isna().sum())
        imp.update_all()
        #print(imp.data.isna().sum())

        # Get columns with filled-in nans
        self.imputedData = imp.data[naCols.index]
        self.imputedData.set_index(datetimeIdx, inplace=True)

        # Append results to menu-bar
        if hasattr(self, 'imputationMenu'):
            self.imputationMenu.clear()
        else:
            self.imputationMenu = self.myQMenuBar.addMenu(
                'Data Imputation Results')
        for ithCol in self.imputedData:
            ithImputationAction = QtWidgets.QAction(ithCol, self)
            ithImputationAction.triggered.connect(
                lambda checked, item=ithCol: self.showImputationResult(item))
            self.imputationMenu.addAction(ithImputationAction)

        df.columns = origIndex
Example 10
    def test_set_imputer(self):
        """
        Test with specified perturbation method.
        """

        from statsmodels.regression.linear_model import RegressionResultsWrapper
        from statsmodels.genmod.generalized_linear_model import GLMResultsWrapper

        df = gendat()
        orig = df.copy()
        mx = pd.notnull(df)
        nrow, ncol = df.shape

        imp_data = mice.MICEData(df)
        imp_data.set_imputer('x1', 'x3 + x4 + x3*x4')
        imp_data.set_imputer('x2', 'x4 + I(x5**2)')
        imp_data.set_imputer('x3',
                             model_class=sm.GLM,
                             init_kwds={"family": sm.families.Binomial()})

        imp_data.update_all()
        assert_equal(imp_data.data.shape[0], nrow)
        assert_equal(imp_data.data.shape[1], ncol)
        assert_allclose(orig[mx], imp_data.data[mx])
        for j in range(1, 6):
            if j == 3:
                assert_equal(isinstance(imp_data.models['x3'], sm.GLM), True)
                assert_equal(
                    isinstance(imp_data.models['x3'].family,
                               sm.families.Binomial), True)
                assert_equal(
                    isinstance(imp_data.results['x3'], GLMResultsWrapper),
                    True)
            else:
                assert_equal(isinstance(imp_data.models['x%d' % j], sm.OLS),
                             True)
                assert_equal(
                    isinstance(imp_data.results['x%d' % j],
                               RegressionResultsWrapper), True)

        fml = 'x1 ~ x3 + x4 + x3*x4'
        assert_equal(imp_data.conditional_formula['x1'], fml)

        fml = 'x4 ~ x1 + x2 + x3 + x5 + y'
        assert_equal(imp_data.conditional_formula['x4'], fml)

        assert_equal(imp_data._cycle_order,
                     ['x5', 'x3', 'x4', 'y', 'x2', 'x1'])
Example 11
    def test_settingwithcopywarning(self):
        "Test that MICEData does not throw a SettingWithCopyWarning when imputing (https://github.com/statsmodels/statsmodels/issues/5430)"

        df = gendat()
        # There need to be some ints in here for the error to be thrown
        df['intcol'] = np.arange(len(df))
        df['intcol'] = df.intcol.astype('int32')

        miceData = mice.MICEData(df)

        with pd.option_context('mode.chained_assignment', 'warn'):
            with warnings.catch_warnings(record=True) as ws:
                warnings.simplefilter('always')
                miceData.update_all()

                assert len(ws) == 0
Example 12
    def imputation_mice(self, var_name, reddy_mice, reddy_info):
        # not sure if this var_data is doing anything
        var_data = reddy_mice.copy()
        print("How many null values in " + var_name + " to change? " + str(reddy_info[var_name].isnull().sum()))
        imp_var = mice.MICEData(var_data)
        # create formula: var_name ~ sum(other_vars)
        other_vars = reddy_mice.loc[:, reddy_mice.columns != var_name].columns
        fml_var = var_name + ' ~ ' + ' + '.join(other_vars)
        # perform mice imputation
        mice_var = mice.MICE(fml_var, lm.OLS, imp_var)
        results = mice_var.fit(10, 10)  # fit(#cycles to skip, #datasets to impute)
        reddy_info[var_name] = mice_var.data.data[var_name].values
        if reddy_info[var_name].isnull().sum() != 0:
            raise AssertionError("Not all values could be imputed.")
        else:
            #print(reddy_info[var_name].isnull().sum())
            print(var_name + " successfully imputed")
        return reddy_info
Example 13
    def test_pertmeth(self):
        # Test with specified perturbation method.

        df = gendat()
        orig = df.copy()
        mx = pd.notnull(df)
        nrow, ncol = df.shape

        for pert_meth in "gaussian", "boot":

            imp_data = mice.MICEData(df, perturbation_method=pert_meth)

            for k in range(2):
                imp_data.update_all()
                assert_equal(imp_data.data.shape[0], nrow)
                assert_equal(imp_data.data.shape[1], ncol)
                assert_allclose(orig[mx], imp_data.data[mx])

        assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1'])
Example 14
    def test_default(self):
        # Test with all defaults.

        df = gendat()
        orig = df.copy()
        mx = pd.notnull(df)
        imp_data = mice.MICEData(df)
        nrow, ncol = df.shape

        assert_allclose(imp_data.ix_miss['x1'], np.arange(60))
        assert_allclose(imp_data.ix_obs['x1'], np.arange(60, 200))
        assert_allclose(imp_data.ix_miss['x2'], np.arange(40))
        assert_allclose(imp_data.ix_miss['x3'], np.arange(10, 30, 2))
        assert_allclose(
            imp_data.ix_obs['x3'],
            np.concatenate((np.arange(10), np.arange(11, 30,
                                                     2), np.arange(30, 200))))
        assert_equal([set(imp_data.data[col]) for col in imp_data.data],
                     [set(df[col].dropna()) for col in df])

        for k in range(3):
            imp_data.update_all()
            assert_equal(imp_data.data.shape[0], nrow)
            assert_equal(imp_data.data.shape[1], ncol)
            assert_allclose(orig[mx], imp_data.data[mx])
            assert_equal([set(imp_data.data[col]) for col in imp_data.data],
                         [set(df[col].dropna()) for col in df])

        fml = 'x1 ~ x2 + x3 + x4 + x5 + y'
        assert_equal(imp_data.conditional_formula['x1'], fml)

        assert_equal(imp_data._cycle_order,
                     ['x5', 'x3', 'x4', 'y', 'x2', 'x1'])

        # Should make a copy
        assert (not (df is imp_data.data))

        (endog_obs, exog_obs, exog_miss, predict_obs_kwds,
         predict_miss_kwds) = imp_data.get_split_data('x3')
        assert_equal(len(endog_obs), 190)
        assert_equal(exog_obs.shape, [190, 6])
        assert_equal(exog_miss.shape, [10, 6])
Example 15
    def test_combine(self):

        np.random.seed(3897)
        x1 = np.random.normal(size=300)
        x2 = np.random.normal(size=300)
        y = x1 + x2 + np.random.normal(size=300)
        x1[0:100] = np.nan
        x2[250:] = np.nan
        df = pd.DataFrame({"x1": x1, "x2": x2, "y": y})
        idata = mice.MICEData(df)
        mi = mice.MICE("y ~ x1 + x2", sm.OLS, idata, n_skip=20)
        result = mi.fit(10, 20)

        fmi = np.asarray([0.1920533, 0.1587287, 0.33174032])
        assert_allclose(result.frac_miss_info, fmi, atol=1e-5)

        params = np.asarray([-0.05397474, 0.97273307, 1.01652293])
        assert_allclose(result.params, params, atol=1e-5)

        tvalues = np.asarray([-0.84781698, 15.10491582, 13.59998039])
        assert_allclose(result.tvalues, tvalues, atol=1e-5)
Example 16
def test_micedata_miss1():
    # test for #4375
    np.random.seed(0)
    data = pd.DataFrame(np.random.rand(50, 4))
    data.columns = ['var1', 'var2', 'var3', 'var4']
    # one column with a single missing value, another with two
    data.iloc[1, 1] = np.nan
    data.iloc[[1, 3], 2] = np.nan

    data_imp = mice.MICEData(data)
    data_imp.update_all()

    assert_equal(data_imp.data.isnull().values.sum(), 0)

    ix_miss = {'var1': np.array([], dtype=np.int64),
               'var2': np.array([1], dtype=np.int64),
               'var3': np.array([1, 3], dtype=np.int64),
               'var4': np.array([], dtype=np.int64)}

    for k in ix_miss:
        assert_equal(data_imp.ix_miss[k], ix_miss[k])
Example 17
    def test_settingwithcopywarning(self):
        "Test that MICEData does not throw a SettingWithCopyWarning when imputing (https://github.com/statsmodels/statsmodels/issues/5430)"

        df = gendat()
        # There need to be some ints in here for the error to be thrown
        df['intcol'] = np.arange(len(df))
        df['intcol'] = df.intcol.astype('int32')

        miceData = mice.MICEData(df)

        with pd.option_context('mode.chained_assignment', 'warn'):
            with warnings.catch_warnings(record=True) as ws:
                warnings.simplefilter('always')
                miceData.update_all()

                # on Python 3.4, throws warning
                # "DeprecationWarning('pandas.core.common.is_categorical_dtype is deprecated. import from the public API:
                # pandas.api.types.is_categorical_dtype instead',)"
                # ignore this warning, as this is not what is being tested in this test
                assert ((len(ws) == 0)
                        or all([w.category == DeprecationWarning for w in ws]))
Example 18
    def test_combine(self):

        np.random.seed(3897)
        x1 = np.random.normal(size=300)
        x2 = np.random.normal(size=300)
        y = x1 + x2 + np.random.normal(size=300)
        x1[0:100] = np.nan
        x2[250:] = np.nan
        df = pd.DataFrame({"x1": x1, "x2": x2, "y": y})
        idata = mice.MICEData(df)
        mi = mice.MICE("y ~ x1 + x2", sm.OLS, idata, n_skip=20)
        result = mi.fit(10, 20)

        fmi = np.asarray([0.1778143, 0.11057262, 0.29626521])
        assert_allclose(result.frac_miss_info, fmi, atol=1e-5)

        params = np.asarray([-0.03486102, 0.96236808, 0.9970371])
        assert_allclose(result.params, params, atol=1e-5)

        tvalues = np.asarray([-0.54674776, 15.28091069, 13.61359403])
        assert_allclose(result.tvalues, tvalues, atol=1e-5)
Example 19
def Interpolation_mice(df: pd.DataFrame) -> pd.DataFrame:
    imp = mice.MICEData(df)
    fml = df.columns[0] + " ~ " + df.columns[1]
    for i in range(2, len(df.columns)):
        fml += " + " + df.columns[i]
    # fml = 'y ~ x1 + x2 + x3 + x4'
    mi = mice.MICE(fml, sm.OLS, imp)
    results = mi.fit(10, 10)
    dm = imp.next_sample()

    # dm.to_csv("data_mice_10.csv")

    # results = []
    # for k in range(10):
    #    x = mi.next_sample()
    #    results.append(x)

    # TODO: what exactly does `results` contain?
    # FINISHED
    return dm
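A hypothetical call to Interpolation_mice, assuming statsmodels.api (as sm) and statsmodels.imputation.mice are imported in the same module; the DataFrame below is made up for illustration only.

import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
demo = pd.DataFrame(rng.normal(size=(100, 4)), columns=["y", "x1", "x2", "x3"])
demo.loc[rng.rand(100) < 0.15, "x1"] = np.nan   # knock out ~15% of x1

filled = Interpolation_mice(demo)
print(filled.isna().sum())                      # all zeros after imputation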
Example 20
    def test_phreg(self):

        np.random.seed(8742)
        n = 300
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        event_time = np.random.exponential(size=n) * np.exp(x1)
        obs_time = np.random.exponential(size=n)
        time = np.where(event_time < obs_time, event_time, obs_time)
        status = np.where(time == event_time, 1, 0)
        df = pd.DataFrame({"time": time, "status": status, "x1": x1, "x2": x2})
        df.loc[10:40, 'time'] = np.nan
        df.loc[10:40, 'status'] = np.nan
        df.loc[30:50, 'x1'] = np.nan
        df.loc[40:60, 'x2'] = np.nan

        from statsmodels.duration.hazard_regression import PHReg

        # Save the dataset size at each iteration.
        hist = []

        def cb(imp):
            hist.append(imp.data.shape)

        for pm in "gaussian", "boot":
            idata = mice.MICEData(df,
                                  perturbation_method=pm,
                                  history_callback=cb)
            idata.set_imputer(
                "time",
                "0 + x1 + x2",
                model_class=PHReg,
                init_kwds={"status": mice.PatsyFormula("status")},
                predict_kwds={"pred_type": "hr"},
                perturbation_method=pm)

            x = idata.next_sample()
            assert (isinstance(x, pd.DataFrame))

        assert (all([x == (299, 4) for x in hist]))
Example 21
    def MICE_impute(self):
        print()
        print('Using MICE algorithm...')
        df_mice = self.df.copy()
        # mapping Embarked using numeric values
        embarked_mapping = {"S": 1, "C": 2, "Q": 3}
        df_mice['Embarked'] = df_mice['Embarked'].map(embarked_mapping)
        # mapping Cabin using numeric values
        deck = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4, "F": 5, "G": 6, "U": 7}
        df_mice['Cabin'] = df_mice['Cabin'].fillna("U")
        df_mice['Cabin'] = df_mice['Cabin'].map(
            lambda x: re.compile("([a-zA-Z]+)").search(x).group())
        df_mice['Cabin'] = df_mice['Cabin'].map(deck)
        df_mice['Cabin'].replace({7: np.nan}, inplace=True)

        numeric_features = [
            column for column in df_mice.columns
            if df_mice[column].dtype != 'object'
        ]
        imp = mice.MICEData(df_mice[numeric_features])
        imp.set_imputer('')
        for i in range(100):
            imp.update_all()
        operated_cols = [
            column for column in numeric_features
            if self.df[column].isnull().sum()
        ]
        print(f'Operating on following features : {operated_cols}')
        # copying the imputed values to the original df
        for i in operated_cols:
            df_mice[i] = imp.data[i]

        # reverse mapping the values
        embarked_mapping = {1: "S", 2: "C", 3: "Q"}
        df_mice['Embarked'] = df_mice['Embarked'].map(embarked_mapping)
        deck_mapping = {0: "A", 1: "B", 2: "C", 3: "D", 4: "E", 5: "F", 6: "G"}
        df_mice['Cabin'] = df_mice['Cabin'].map(deck_mapping)
        return df_mice
Example 22
def impute(rawData):
    # impute missing lab and vitals values

    shortData = rawData[[
        'age', 'female', 'sbp', 'dbp', 'crp', 'dDimer', 'ferritin',
        'platelets', 'creatinine', 'tbili', 'hsTrop', 'il6', 'lymph', 'hgb',
        'lac', 'ldh', 'albumin', 'hr', 'rr', 'temp', 'icu', 'hypertension',
        'diabetes', 'asthma', 'copd', 'chronic_lung', 'home_o2', 'osa',
        'immunocompromised', 'pregnant', 'primaryOutcome', 'highFi02',
        'lastTime', 'o2Sat'
    ]]

    imputedData = mice.MICEData(shortData)

    for var in [
            'age', 'female', 'sbp', 'dbp', 'crp', 'dDimer', 'ferritin',
            'platelets', 'creatinine', 'tbili', 'hsTrop', 'il6', 'lymph',
            'hgb', 'lac', 'ldh', 'albumin', 'hr', 'rr', 'temp'
    ]:
        imputedData.set_imputer(var, formula=ols_formula(shortData, var))

    imputedData.update_all(20)
    return imputedData.data, imputedData
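The ols_formula helper used above is project-specific and not shown in this excerpt. A plausible reconstruction (an assumption about its behavior, not the original) builds a main-effects formula regressing the given variable on every other column:

def ols_formula(df, dependent_var):
    # Hypothetical reconstruction: regress dependent_var on all other columns.
    other_cols = [c for c in df.columns if c != dependent_var]
    return dependent_var + " ~ " + " + ".join(other_cols)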
Example 23
# -*- coding: utf-8 -*-
"""
Created on Sun Sep  1 23:39:09 2019

@author: 92156
"""

import statsmodels.api as sm
from statsmodels.imputation import mice

# `data` is assumed to be a pandas DataFrame containing y, x1, ..., x4
imp = mice.MICEData(data)
fml = 'y ~ x1 + x2 + x3 + x4'
mi = mice.MICE(fml, sm.OLS, imp)  # avoid rebinding the name `mice` over the module
results = mi.fit(10, 10)
print(results.summary())
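The snippet above is not runnable on its own because data is never defined. A self-contained variant with synthetic data (added here for illustration, not part of the original) would be:

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.imputation import mice

rng = np.random.RandomState(0)
data = pd.DataFrame(rng.normal(size=(200, 5)), columns=["y", "x1", "x2", "x3", "x4"])
data.loc[rng.rand(200) < 0.2, "x1"] = np.nan   # introduce missing values to impute

imp = mice.MICEData(data)
mi = mice.MICE("y ~ x1 + x2 + x3 + x4", sm.OLS, imp)
results = mi.fit(10, 10)
print(results.summary())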
Example 24
def eda():

    files = ['train.csv','test.csv']  
    for file in files:
    
        #read csv file
        try:
            df = pd.read_csv(file)
        except FileNotFoundError:
            print(file + " not found")
            continue
        
        #remove columns unlikely to contain inferential value
        df = df.drop(['Cabin','PassengerId','Name','Ticket'], axis=1)
        
        #convert gender and port into numerics, 
        #so we can do imputation on missing Age values 
        df1 = df.copy(deep=True)
        df1['Sex'] = pd.factorize(df['Sex'])[0]
        df1['Embarked'] = pd.factorize(df['Embarked'])[0]
        
        # use MICE imputation to fix ages
        imp = mice.MICEData(df1)
        imp.update_all(100)  #creates new data frame with imputed values
        df = df.drop(['Age'], axis=1) #drop the original age column
        df = pd.concat([df, imp.data['Age']], axis=1) #add the imputed column back in
        
        #tried binning but could not factorize these in ascending order 
        #and did not like bin labels.
        #bin Age into Toddler, Child, Adolescent, Adult, Elderly
        #df = df.filter(['Age'], axis=1)
        #print(df)
        #age_bins = [0, 2, 7, 21, 60, 100]   
        #out = pd.cut(df['Age'], bins=age_bins)
        #df = pd.concat((df, out), axis=1)
        #df.columns.values[1] = "Age_Bin"
        #df = df.drop(['Age'], axis=1)
        #df = pd.concat([df, df], axis=1)
        
        #create five categories for age, representing boundaries between social mores
        df.loc[df['Age'] < 3, 'Age_Bin'] = '1-Toddler'
        df.loc[(df['Age'] >= 3)  & (df['Age'] < 13), 'Age_Bin'] = '2-Child'
        df.loc[(df['Age'] >= 13) & (df['Age'] < 20), 'Age_Bin'] = '3-Teen'
        df.loc[(df['Age'] >= 20) & (df['Age'] < 60), 'Age_Bin'] = '4-Adult'
        df.loc[df['Age']  >= 60, 'Age_Bin'] = '5-Senior'
        
        #distinguish between traveling alone, small families, and large families
        df['family_size'] = df['SibSp'] + df['Parch']
        df.loc[df['family_size'] == 0, 'Family'] = '2-None'
        df.loc[(df['family_size'] > 0) & (df['family_size'] < 4), 'Family'] = '1-Small'
        df.loc[df['family_size'] >= 4, 'Family'] = '3-Large'
        
        #create 1-hot variables for each category value (level), then drop the original columns
        df = pd.concat([df,pd.get_dummies(df['Age_Bin'], prefix='Age_Bin')],axis=1)
        df = pd.concat([df,pd.get_dummies(df['Sex'], prefix='Gender')],axis=1)
        df = pd.concat([df,pd.get_dummies(df['Embarked'], prefix='Embarked')],axis=1)
        df = pd.concat([df,pd.get_dummies(df['Family'], prefix='Family')],axis=1)
        df.drop(['Age_Bin','Sex','Embarked','Family'],axis=1, inplace=True)
        
        try:
            df.to_csv('mod_' + file, encoding='utf-8')  #save results to disk
            print('Successfully wrote file mod_' + file)
        except OSError:
            print("Could not write file mod_" + file)
            
    return
Example 25
    def test_MICE1_regularized(self):

        df = gendat()
        imp = mice.MICEData(df, perturbation_method='boot')
        imp.set_imputer('x1', 'x2 + y', fit_kwds={'alpha': 1, 'L1_wt': 0})
        imp.update_all()
Example 26
lmodr = smf.ols(
   'logit(race/100) ~ fire + theft + age + np.log(income)',
    chmiss).fit()
(ilogit(lmodr.predict(chmiss))*100)[mv]


#	

chredlin.race.iloc[np.where(chmiss.race.isna())]


# ## Multiple Imputation
#	

import statsmodels.imputation.mice as smi
imp = smi.MICEData(chmiss)
fm = 'involact ~ race + fire + theft + age + np.log(income)'
mmod = smi.MICE(fm, sm.OLS, imp)
results = mmod.fit(10, 50)
print(results.summary())


# ## Discussion
# ## Exercises

# ## Packages Used

import sys
import matplotlib
import statsmodels as sm
import seaborn as sns