def test_MICE1(self):
    """Every MICE.next_sample() draw is a RegressionResultsWrapper."""
    from statsmodels.regression.linear_model import RegressionResultsWrapper

    imp_data = mice.MICEData(gendat())
    analysis = mice.MICE("y ~ x1 + x2 + x1:x2", sm.OLS, imp_data)

    n_draws = 3
    for _ in range(n_draws):
        fitted = analysis.next_sample()
        assert issubclass(fitted.__class__, RegressionResultsWrapper)
def test_plot_imputed_hist(self):
    """Smoke test: draw the imputed-value histogram for 'x4'."""
    imp_data = mice.MICEData(gendat())
    imp_data.update_all()

    plt.clf()
    # The loop value is not forwarded to plot_imputed_hist, so both passes
    # issue exactly the same call.
    for _flag in (False, True):
        fig = imp_data.plot_imputed_hist('x4')
        first_axes = fig.get_axes()[0]
        first_axes.set_title('plot_imputed_hist')
        close_or_save(pdf, fig)
def test_plot_bivariate(self, close_figures):
    """Smoke test: bivariate plot of 'x2' against 'x4', with and without points."""
    imp_data = mice.MICEData(gendat())
    imp_data.update_all()

    plt.clf()
    for show_points in (False, True):
        fig = imp_data.plot_bivariate('x2', 'x4', plot_points=show_points)
        first_axes = fig.get_axes()[0]
        first_axes.set_title('plot_bivariate')
        close_or_save(pdf, fig)
def test_fit_obs(self):
    """Smoke test: fitted-versus-observed plot for 'x4', with and without points."""
    imp_data = mice.MICEData(gendat())
    imp_data.update_all()

    plt.clf()
    for show_points in (False, True):
        fig = imp_data.plot_fit_obs('x4', plot_points=show_points)
        first_axes = fig.get_axes()[0]
        first_axes.set_title('plot_fit_scatterplot')
        close_or_save(pdf, fig)
def test_MICE(self):
    """MICE.fit returns a MICEResults object whose summary can be built."""
    imp_data = mice.MICEData(gendat())
    analysis = mice.MICE("y ~ x1 + x2 + x1:x2", sm.OLS, imp_data)

    result = analysis.fit(1, 3)
    assert issubclass(result.__class__, mice.MICEResults)

    # Smoke test for results: building the summary must not raise.
    result.summary()
def test_MICE2(self):
    """MICE with a binomial GLM analysis model yields GLM results on each draw."""
    from statsmodels.genmod.generalized_linear_model import GLMResultsWrapper

    imp_data = mice.MICEData(gendat())
    glm_options = {"family": sm.families.Binomial()}
    analysis = mice.MICE("x3 ~ x1 + x2", sm.GLM, imp_data,
                         init_kwds=glm_options)

    for _ in range(3):
        fitted = analysis.next_sample()
        assert isinstance(fitted, GLMResultsWrapper)
        assert isinstance(fitted.family, sm.families.Binomial)
def test_next_sample(self):
    """MICEData.next_sample returns the same DataFrame object on every call."""
    df = gendat()
    imp_data = mice.MICEData(df)

    draws = []
    for _ in range(2):
        draw = imp_data.next_sample()
        assert isinstance(draw, pd.DataFrame)
        assert_equal(df.shape, draw.shape)
        draws.append(draw)

    # The returned dataframes are all the same object
    first, second = draws
    assert first is second
def test_plot_missing_pattern(self):
    """Smoke test: plot_missing_pattern for every combination of its options."""
    imp_data = mice.MICEData(gendat())

    # Same iteration order as three nested loops: row_order outermost,
    # color_row_patterns innermost.
    option_grid = [
        {
            "row_order": row_order,
            "hide_complete_rows": hide_complete,
            "color_row_patterns": color_rows,
        }
        for row_order in ("pattern", "raw")
        for hide_complete in (False, True)
        for color_rows in (False, True)
    ]

    for options in option_grid:
        plt.clf()
        fig = imp_data.plot_missing_pattern(**options)
        close_or_save(pdf, fig)
def runDataImputation(self):
    """Impute missing values of ``self.rawdata`` with MICE and list the results.

    The columns that contain missing values are stored (before imputation)
    in ``self.imputeeData`` and (after imputation) in ``self.imputedData``.
    One menu action per imputed column is added to the
    'Data Imputation Results' menu; triggering an action calls
    ``self.showImputationResult`` with that column's name.  When no column
    has missing values a message box is shown and nothing else happens.
    """
    frame = self.rawdata.copy().apply(pd.to_numeric)

    # statsmodels' imputation does not like weird column names, so the
    # columns are renamed to Var1, Var2, ... while it runs.
    originalNames = [
        label.replace('\n', ', ').replace('\r', '') for label in frame
    ]
    placeholderNames = [
        'Var' + str(position)
        for position in range(1, len(originalNames) + 1)
    ]
    frame.columns = placeholderNames

    # Columns that contain at least one NaN.
    missingCounts = frame.isnull().sum()
    columnsWithGaps = missingCounts[missingCounts > 0]
    if len(columnsWithGaps) == 0:
        # Don't run the imputation if there are no missing data.
        QtWidgets.QMessageBox.question(
            self, 'Info', 'There are no missing data points to fill in...',
            QtWidgets.QMessageBox.Ok)
        return

    rowIndex = frame.index
    gapColumnNames = columnsWithGaps.index
    self.imputeeData = frame[gapColumnNames]

    # Run the imputation.
    imputer = mice.MICEData(frame)
    imputer.update_all()

    # Keep only the columns whose NaNs were filled in, on the original index.
    self.imputedData = imputer.data[gapColumnNames]
    self.imputedData.set_index(rowIndex, inplace=True)

    # Publish the results in the menu bar, reusing the menu if it exists.
    if not hasattr(self, 'imputationMenu'):
        self.imputationMenu = self.myQMenuBar.addMenu(
            'Data Imputation Results')
    else:
        self.imputationMenu.clear()

    for columnName in self.imputedData:
        menuAction = QtWidgets.QAction(columnName, self)
        menuAction.triggered.connect(
            lambda checked, item=columnName: self.showImputationResult(item))
        self.imputationMenu.addAction(menuAction)

    # NOTE(review): this only renames the local working frame, which is
    # not used after this point.
    frame.columns = originalNames
def test_set_imputer(self):
    """
    Test MICEData.set_imputer with custom formulas and a GLM imputer.
    """
    from statsmodels.genmod.generalized_linear_model import GLMResultsWrapper
    from statsmodels.regression.linear_model import RegressionResultsWrapper

    df = gendat()
    reference = df.copy()
    observed = pd.notnull(df)
    nrow, ncol = df.shape

    imp_data = mice.MICEData(df)
    imp_data.set_imputer('x1', 'x3 + x4 + x3*x4')
    imp_data.set_imputer('x2', 'x4 + I(x5**2)')
    imp_data.set_imputer('x3', model_class=sm.GLM,
                         init_kwds={"family": sm.families.Binomial()})

    imp_data.update_all()
    assert_equal(imp_data.data.shape[0], nrow)
    assert_equal(imp_data.data.shape[1], ncol)
    # Observed cells must be left untouched by the imputation.
    assert_allclose(reference[observed], imp_data.data[observed])

    for j in range(1, 6):
        if j != 3:
            name = 'x%d' % j
            is_ols = isinstance(imp_data.models[name], sm.OLS)
            assert_equal(is_ols, True)
            has_ols_results = isinstance(imp_data.results[name],
                                         RegressionResultsWrapper)
            assert_equal(has_ols_results, True)
            continue

        # x3 was configured with a binomial GLM.
        x3_model = imp_data.models['x3']
        assert_equal(isinstance(x3_model, sm.GLM), True)
        assert_equal(
            isinstance(imp_data.models['x3'].family, sm.families.Binomial),
            True)
        assert_equal(
            isinstance(imp_data.results['x3'], GLMResultsWrapper), True)

    expected_formulas = [
        ('x1', 'x1 ~ x3 + x4 + x3*x4'),
        ('x4', 'x4 ~ x1 + x2 + x3 + x5 + y'),
    ]
    for name, fml in expected_formulas:
        assert_equal(imp_data.conditional_formula[name], fml)

    assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1'])
def test_settingwithcopywarning(self):
    """MICEData.update_all must not emit any warning while imputing.

    Guards against the SettingWithCopyWarning reported in
    https://github.com/statsmodels/statsmodels/issues/5430
    """
    df = gendat()
    # There need to be some ints in here for the error to be thrown
    df['intcol'] = np.arange(len(df))
    df['intcol'] = df.intcol.astype('int32')

    imputer = mice.MICEData(df)

    with pd.option_context('mode.chained_assignment', 'warn'), \
            warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        imputer.update_all()
        n_caught = len(caught)

    assert n_caught == 0
def imputation_mice(self, var_name, reddy_mice, reddy_info):
    """Fill the missing values of one column of ``reddy_info`` using MICE.

    An OLS analysis model ``var_name ~ <all other columns of reddy_mice>``
    is fitted through MICE on a copy of ``reddy_mice``; the column
    ``var_name`` of the resulting imputed data then overwrites
    ``reddy_info[var_name]``.

    Returns ``reddy_info`` (modified in place).  Raises AssertionError if
    the column still contains nulls afterwards.
    """
    # Work on a copy of the input frame (unclear whether this is needed).
    working_copy = reddy_mice.copy()

    nulls_before = reddy_info[var_name].isnull().sum()
    print("How many null values in " + var_name + " to change? "
          + str(nulls_before))

    imputation_data = mice.MICEData(working_copy)

    # Formula: var_name ~ sum of every other column.
    predictors = reddy_mice.columns[reddy_mice.columns != var_name]
    right_hand_side = ' +'.join(' {0}'.format(name) for name in predictors)
    formula = var_name + ' ~' + right_hand_side

    # fit(#cycles to skip, #datasets to impute)
    analysis = mice.MICE(formula, lm.OLS, imputation_data)
    analysis.fit(10, 10)

    reddy_info[var_name] = analysis.data.data[var_name].values

    if reddy_info[var_name].isnull().sum() != 0:
        raise AssertionError("All values could not be imputed.")

    print(var_name + " successfully imputed")
    return reddy_info
def test_pertmeth(self):
    """Test MICEData with each explicitly specified perturbation method."""
    df = gendat()
    reference = df.copy()
    observed = pd.notnull(df)
    nrow, ncol = df.shape

    for method in ("gaussian", "boot"):
        imp_data = mice.MICEData(df, perturbation_method=method)

        for _ in range(2):
            imp_data.update_all()
            n_rows_now, n_cols_now = imp_data.data.shape[0], imp_data.data.shape[1]
            assert_equal(n_rows_now, nrow)
            assert_equal(n_cols_now, ncol)
            # Observed cells must survive the update unchanged.
            assert_allclose(reference[observed], imp_data.data[observed])

    expected_order = ['x5', 'x3', 'x4', 'y', 'x2', 'x1']
    assert_equal(imp_data._cycle_order, expected_order)
def test_default(self):
    """Test MICEData with all defaults."""
    df = gendat()
    reference = df.copy()
    observed = pd.notnull(df)
    imp_data = mice.MICEData(df)
    nrow, ncol = df.shape

    def check_values_come_from_observed():
        # Per column, the set of values in the imputed data must equal the
        # set of non-missing values of the input.
        imputed_sets = [set(imp_data.data[col]) for col in imp_data.data]
        observed_sets = [set(df[col].dropna()) for col in df]
        assert_equal(imputed_sets, observed_sets)

    # Row positions of missing / observed entries.
    assert_allclose(imp_data.ix_miss['x1'], np.arange(60))
    assert_allclose(imp_data.ix_obs['x1'], np.arange(60, 200))
    assert_allclose(imp_data.ix_miss['x2'], np.arange(40))
    assert_allclose(imp_data.ix_miss['x3'], np.arange(10, 30, 2))
    x3_observed_rows = np.concatenate(
        (np.arange(10), np.arange(11, 30, 2), np.arange(30, 200)))
    assert_allclose(imp_data.ix_obs['x3'], x3_observed_rows)

    check_values_come_from_observed()

    for _ in range(3):
        imp_data.update_all()
        assert_equal(imp_data.data.shape[0], nrow)
        assert_equal(imp_data.data.shape[1], ncol)
        assert_allclose(reference[observed], imp_data.data[observed])
        check_values_come_from_observed()

    assert_equal(imp_data.conditional_formula['x1'],
                 'x1 ~ x2 + x3 + x4 + x5 + y')

    assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1'])

    # Should make a copy
    assert df is not imp_data.data

    split = imp_data.get_split_data('x3')
    (endog_obs, exog_obs, exog_miss,
     predict_obs_kwds, predict_miss_kwds) = split
    assert_equal(len(endog_obs), 190)
    assert_equal(exog_obs.shape, [190, 6])
    assert_equal(exog_miss.shape, [10, 6])
def test_combine(self):
    """Compare combined MICE estimates on seeded data with stored reference values."""
    np.random.seed(3897)
    n_obs = 300
    x1 = np.random.normal(size=n_obs)
    x2 = np.random.normal(size=n_obs)
    noise = np.random.normal(size=n_obs)
    y = x1 + x2 + noise

    # Knock out the first 100 values of x1 and the last 50 of x2.
    x1[0:100] = np.nan
    x2[250:] = np.nan

    df = pd.DataFrame({"x1": x1, "x2": x2, "y": y})
    idata = mice.MICEData(df)
    mi = mice.MICE("y ~ x1 + x2", sm.OLS, idata, n_skip=20)
    result = mi.fit(10, 20)

    expected = [
        ("frac_miss_info", [0.1920533, 0.1587287, 0.33174032]),
        ("params", [-0.05397474, 0.97273307, 1.01652293]),
        ("tvalues", [-0.84781698, 15.10491582, 13.59998039]),
    ]
    for attribute, reference in expected:
        assert_allclose(getattr(result, attribute), np.asarray(reference),
                        atol=1e-5)
def test_micedata_miss1():
    """Regression test for #4375: a column with a single missing value."""
    np.random.seed(0)
    data = pd.DataFrame(np.random.rand(50, 4))
    data.columns = ['var1', 'var2', 'var3', 'var4']

    # var2 gets one missing value, var3 gets two.
    data.iloc[1, 1] = np.nan
    data.iloc[[1, 3], 2] = np.nan

    data_imp = mice.MICEData(data)
    data_imp.update_all()

    n_still_missing = data_imp.data.isnull().values.sum()
    assert_equal(n_still_missing, 0)

    expected_rows = (
        ('var1', []),
        ('var2', [1]),
        ('var3', [1, 3]),
        ('var4', []),
    )
    ix_miss = {name: np.array(rows, dtype=np.int64)
               for name, rows in expected_rows}

    for name, rows in ix_miss.items():
        assert_equal(data_imp.ix_miss[name], rows)
def test_settingwithcopywarning(self):
    """MICEData.update_all must emit nothing but DeprecationWarnings.

    Guards against the SettingWithCopyWarning reported in
    https://github.com/statsmodels/statsmodels/issues/5430
    """
    df = gendat()
    # There need to be some ints in here for the error to be thrown
    df['intcol'] = np.arange(len(df))
    df['intcol'] = df.intcol.astype('int32')

    imputer = mice.MICEData(df)

    with pd.option_context('mode.chained_assignment', 'warn'):
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter('always')
            imputer.update_all()

    # On Python 3.4 pandas raises a DeprecationWarning about
    # pandas.core.common.is_categorical_dtype (use
    # pandas.api.types.is_categorical_dtype instead).  That is not what this
    # test is about, so DeprecationWarnings are tolerated.
    unexpected = [w for w in caught if w.category != DeprecationWarning]
    assert not unexpected
def test_combine(self):
    """Compare combined MICE estimates on seeded data with stored reference values."""
    np.random.seed(3897)
    n_obs = 300
    x1 = np.random.normal(size=n_obs)
    x2 = np.random.normal(size=n_obs)
    noise = np.random.normal(size=n_obs)
    y = x1 + x2 + noise

    # Knock out the first 100 values of x1 and the last 50 of x2.
    x1[0:100] = np.nan
    x2[250:] = np.nan

    df = pd.DataFrame({"x1": x1, "x2": x2, "y": y})
    idata = mice.MICEData(df)
    mi = mice.MICE("y ~ x1 + x2", sm.OLS, idata, n_skip=20)
    result = mi.fit(10, 20)

    expected = [
        ("frac_miss_info", [0.1778143, 0.11057262, 0.29626521]),
        ("params", [-0.03486102, 0.96236808, 0.9970371]),
        ("tvalues", [-0.54674776, 15.28091069, 13.61359403]),
    ]
    for attribute, reference in expected:
        assert_allclose(getattr(result, attribute), np.asarray(reference),
                        atol=1e-5)
def Interpolation_mice(df: pd.DataFrame) -> pd.DataFrame:
    """Run MICE on ``df`` and return the next imputed data set.

    The analysis model is an OLS regression of the first column on all the
    remaining columns (for example 'y ~ x1 + x2 + x3 + x4').  It is fitted
    with ``fit(10, 10)``, after which one more sample is drawn from the
    imputation data and returned.
    """
    imputation_data = mice.MICEData(df)

    response, first_predictor = df.columns[0], df.columns[1]
    predictors = [first_predictor, *df.columns[2:]]
    formula = response + " ~ " + " + ".join(predictors)

    analysis = mice.MICE(formula, sm.OLS, imputation_data)
    # TODO: what exactly does the fit result contain?  It is not used here.
    analysis.fit(10, 10)

    return imputation_data.next_sample()
def test_phreg(self):
    """Impute a censored time variable with a PHReg imputer."""
    from statsmodels.duration.hazard_regression import PHReg

    np.random.seed(8742)
    n = 300
    x1 = np.random.normal(size=n)
    x2 = np.random.normal(size=n)
    event_time = np.random.exponential(size=n) * np.exp(x1)
    obs_time = np.random.exponential(size=n)
    time = np.where(event_time < obs_time, event_time, obs_time)
    status = np.where(time == event_time, 1, 0)

    df = pd.DataFrame({"time": time, "status": status, "x1": x1, "x2": x2})

    # Inclusive label ranges to blank out, applied in this order.
    gaps = [
        ('time', 10, 40),
        ('status', 10, 40),
        ('x1', 30, 50),
        ('x2', 40, 60),
    ]
    for column, first, last in gaps:
        df.loc[first:last, column] = np.nan

    # Save the dataset size at each iteration.
    shapes_seen = []

    def record_shape(imp):
        shapes_seen.append(imp.data.shape)

    for method in ("gaussian", "boot"):
        idata = mice.MICEData(df, perturbation_method=method,
                              history_callback=record_shape)
        idata.set_imputer(
            "time", "0 + x1 + x2",
            model_class=PHReg,
            init_kwds={"status": mice.PatsyFormula("status")},
            predict_kwds={"pred_type": "hr"},
            perturbation_method=method)

        sample = idata.next_sample()
        assert isinstance(sample, pd.DataFrame)

    assert all([shape == (299, 4) for shape in shapes_seen])
def MICE_impute(self):
    """Return a copy of ``self.df`` with missing numeric values imputed by MICE.

    'Embarked' and the deck letter of 'Cabin' are encoded as numbers so
    that they take part in the imputation, and are decoded back to letters
    afterwards.  Only columns that have missing values in ``self.df`` are
    overwritten with imputed values.  ``self.df`` itself is not modified.

    Returns
    -------
    pandas.DataFrame
        The imputed copy of ``self.df``.
    """
    print()
    print('Using MICE algorithm...')
    df_mice = self.df.copy()

    # mapping Embarked using numeric values
    embarked_mapping = {"S": 1, "C": 2, "Q": 3}
    df_mice['Embarked'] = df_mice['Embarked'].map(embarked_mapping)

    # mapping Cabin using numeric values; "U" (code 7) stands for unknown
    deck = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4, "F": 5, "G": 6, "U": 7}
    # Compiled once instead of once per row.
    deck_letters = re.compile("([a-zA-Z]+)")
    cabin = df_mice['Cabin'].fillna("U")
    cabin = cabin.map(lambda x: deck_letters.search(x).group())
    cabin = cabin.map(deck)
    # Assign the result back explicitly: an in-place replace on the
    # selected column (df['Cabin'].replace(..., inplace=True)) is a chained
    # operation that is not guaranteed to update the frame.
    df_mice['Cabin'] = cabin.replace({7: np.nan})

    numeric_features = [
        column for column in df_mice.columns
        if df_mice[column].dtype != 'object'
    ]

    # The default imputers are used; the former `imp.set_imputer('')` call
    # named no column of the data and has been dropped.
    imp = mice.MICEData(df_mice[numeric_features])
    for _ in range(100):
        imp.update_all()

    operated_cols = [
        column for column in numeric_features
        if self.df[column].isnull().sum()
    ]
    print(f'Operating on following features : {operated_cols}')

    # copying the imputed values to the original df
    for column in operated_cols:
        df_mice[column] = imp.data[column]

    # reverse mapping the values
    embarked_mapping = {1: "S", 2: "C", 3: "Q"}
    df_mice['Embarked'] = df_mice['Embarked'].map(embarked_mapping)
    deck_mapping = {0: "A", 1: "B", 2: "C", 3: "D", 4: "E", 5: "F", 6: "G"}
    df_mice['Cabin'] = df_mice['Cabin'].map(deck_mapping)

    return df_mice
def impute(rawData):
    """Impute missing lab and vitals values with MICE.

    A fixed subset of columns of ``rawData`` is handed to MICEData.  Each
    lab/vitals column gets an imputer whose formula comes from
    ``ols_formula``; the remaining selected columns keep the default
    imputer.  Twenty update cycles are run.

    Returns the imputed data frame and the MICEData object.
    """
    labs_and_vitals = [
        'age', 'female', 'sbp', 'dbp', 'crp', 'dDimer', 'ferritin',
        'platelets', 'creatinine', 'tbili', 'hsTrop', 'il6', 'lymph',
        'hgb', 'lac', 'ldh', 'albumin', 'hr', 'rr', 'temp',
    ]
    other_columns = [
        'icu', 'hypertension', 'diabetes', 'asthma', 'copd',
        'chronic_lung', 'home_o2', 'osa', 'immunocompromised', 'pregnant',
        'primaryOutcome', 'highFi02', 'lastTime', 'o2Sat',
    ]
    shortData = rawData[labs_and_vitals + other_columns]

    imputedData = mice.MICEData(shortData)
    for column in labs_and_vitals:
        column_formula = ols_formula(shortData, column)
        imputedData.set_imputer(column, formula=column_formula)

    imputedData.update_all(20)
    return imputedData.data, imputedData
# -*- coding: utf-8 -*-
"""
Created on Sun Sep  1 23:39:09 2019

@author: 92156
"""
import statsmodels.api as sm
from statsmodels.imputation import mice

# Minimal MICE example: impute `data`, fit an OLS analysis model on the
# imputed data sets and print the combined summary.
#
# NOTE(review): `data` is not defined in this script.  It has to be a
# pandas DataFrame (the formula below expects columns y, x1, x2, x3, x4)
# created before this point.
imp = mice.MICEData(data)
fml = 'y ~ x1 + x2 + x3 + x4'
# Use a separate name for the model so the `mice` module is not shadowed.
mice_model = mice.MICE(fml, sm.OLS, imp)
results = mice_model.fit(10, 10)
print(results.summary())
def eda():
    """Prepare 'train.csv' and 'test.csv' for modelling and save the results.

    For each input file that can be read:

    * drop 'Cabin', 'PassengerId', 'Name' and 'Ticket';
    * impute the missing 'Age' values with MICE (100 update cycles) on a
      copy in which 'Sex' and 'Embarked' are factorized to integers;
    * derive the categorical 'Age_Bin' and 'Family' columns;
    * replace 'Age_Bin', 'Sex', 'Embarked' and 'Family' by one-hot columns;
    * write the result to 'mod_<file>'.

    A file that cannot be read is reported and skipped.

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    files = ['train.csv', 'test.csv']
    # (source column, prefix of the generated one-hot columns)
    one_hot_columns = [
        ('Age_Bin', 'Age_Bin'),
        ('Sex', 'Gender'),
        ('Embarked', 'Embarked'),
        ('Family', 'Family'),
    ]

    for file in files:
        # read csv file
        try:
            df = pd.read_csv(file)
        except OSError:
            # Skip this file: continuing would hit an unbound `df` on the
            # first file, or silently reuse the previous file's data.
            print(file + " not found")
            continue

        # remove columns unlikely to contain inferential value
        df = df.drop(['Cabin', 'PassengerId', 'Name', 'Ticket'], axis=1)

        # convert gender and port into numerics,
        # so we can do imputation on missing Age values
        df1 = df.copy(deep=True)
        df1['Sex'] = pd.factorize(df['Sex'])[0]
        df1['Embarked'] = pd.factorize(df['Embarked'])[0]

        # use MICE imputation to fix ages
        imp = mice.MICEData(df1)
        imp.update_all(100)

        # swap the original Age column for the imputed one
        df = df.drop(['Age'], axis=1)
        df = pd.concat([df, imp.data['Age']], axis=1)

        # Binning with pd.cut was tried and abandoned (the bins could not
        # be factorized in ascending order and the labels were unwanted),
        # hence the explicit masks below.
        # create five categories for age, representing boundaries between
        # social mores
        df.loc[df['Age'] < 3, 'Age_Bin'] = '1-Toddler'
        df.loc[(df['Age'] >= 3) & (df['Age'] < 13), 'Age_Bin'] = '2-Child'
        df.loc[(df['Age'] >= 13) & (df['Age'] < 20), 'Age_Bin'] = '3-Teen'
        df.loc[(df['Age'] >= 20) & (df['Age'] < 60), 'Age_Bin'] = '4-Adult'
        df.loc[df['Age'] >= 60, 'Age_Bin'] = '5-Senior'

        # distinguish between traveling alone, small families, and large
        # families
        df['family_size'] = df['SibSp'] + df['Parch']
        df.loc[df['family_size'] == 0, 'Family'] = '2-None'
        df.loc[(df['family_size'] > 0) & (df['family_size'] < 4),
               'Family'] = '1-Small'
        df.loc[df['family_size'] >= 4, 'Family'] = '3-Large'

        # create 1-hot variables for each category value (level), then drop
        # the original columns
        for column, prefix in one_hot_columns:
            dummies = pd.get_dummies(df[column], prefix=prefix)
            df = pd.concat([df, dummies], axis=1)
        df.drop([column for column, _ in one_hot_columns],
                axis=1, inplace=True)

        # save results to disk
        try:
            df.to_csv('mod_' + file, encoding='utf-8')
        except OSError:
            print("Could not write file mod_" + file)
        else:
            print('Successful writing file mod_' + file)

    return ()
def test_MICE1_regularized(self):
    """Smoke test: one update cycle with fit keywords passed to the 'x1' imputer."""
    imp_data = mice.MICEData(gendat(), perturbation_method='boot')
    fit_options = {'alpha': 1, 'L1_wt': 0}
    imp_data.set_imputer('x1', 'x2 + y', fit_kwds=fit_options)
    imp_data.update_all()
lmodr = smf.ols( 'logit(race/100) ~ fire + theft + age + np.log(income)', chmiss).fit() (ilogit(lmodr.predict(chmiss))*100)[mv] # chredlin.race.iloc[np.where(chmiss.race.isna())] # ## Multiple Imputation # import statsmodels.imputation.mice as smi imp = smi.MICEData(chmiss) fm = 'involact ~ race + fire + theft + age + np.log(income)' mmod = smi.MICE(fm, sm.OLS, imp) results = mmod.fit(10, 50) print(results.summary()) # ## Discussion # ## Exercises # ## Packages Used import sys import matplotlib import statsmodels as sm import seaborn as sns