def test_statstics_fit_transform():
    """Check ``statistics_`` when fit() and transform() see different data.

    The imputer is fitted on one matrix and then used to transform a
    second, different matrix. The column means recorded under the
    ``'col_means'`` key of ``statistics_`` must still equal the NaN-aware
    column means of the matrix passed to fit().
    """
    fit_data = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    # Reference values are computed up front, before the imputer sees the data.
    expected_col_means = np.nanmean(fit_data, axis=0)

    transform_data = np.array([
        [0, 0, 0, 0],
        [2, 2, 2, 1],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [9, 9, 8, 8],
        [16, 15, 18, 19],
    ])

    forest = MissForest()
    fitted = forest.fit(fit_data)
    # The transformed output is irrelevant here; only the stored statistics
    # are inspected afterwards.
    fitted.transform(transform_data)

    recorded_col_means = forest.statistics_.get('col_means')
    assert_array_equal(recorded_col_means, expected_col_means)
def imputer(self, _steps, _answers, train_dataset, _X_train, _y_train,
            test_dataset, _X_test, _y_test, _headers):
    """Impute missing feature values and rebuild the train/test datasets.

    Every entry of ``_steps`` equal to ``'imputer'`` selects an imputer via
    ``_answers[i]['imputer']``. The imputer is fitted on the training
    features and then applied to both the training and the test features.
    The (possibly imputed) features are finally recombined with their
    targets into two DataFrames.

    Args:
        _steps: Sequence of step names; only ``'imputer'`` entries are acted on.
        _answers: Sequence indexed like ``_steps``; each entry maps a step
            name to the chosen option (``'Miss Forest'`` or
            ``'KNN Miss Values'``).
        train_dataset: Unused; kept for interface compatibility.
        _X_train: Training features.
        _y_train: Training targets.
        test_dataset: Unused; kept for interface compatibility.
        _X_test: Test features.
        _y_test: Test targets.
        _headers: Column names; all but the last name the features, the
            last names the target column.

    Returns:
        Tuple ``(new_train_dataset, new_test_dataset)`` of DataFrames.

    Raises:
        ValueError: If an ``'imputer'`` step names an unsupported imputer.
    """
    self.steps = _steps
    self.answers = _answers
    self.X_train = _X_train
    self.y_train = _y_train
    self.X_test = _X_test
    self.y_test = _y_test
    self.headers = _headers
    self.train_pipe_steps = []

    for i, s in enumerate(self.steps):
        if s != 'imputer':
            continue

        choice = self.answers[i][s]
        if choice == 'Miss Forest':
            model = MissForest()
        elif choice == 'KNN Miss Values':
            model = KNNImputer(n_neighbors=2)
        else:
            # Previously an unknown choice left the imputer variable unbound
            # (UnboundLocalError) or silently reused the one from an earlier
            # step; fail loudly with the offending value instead.
            raise ValueError(
                f"Unsupported imputer choice {choice!r}; expected "
                "'Miss Forest' or 'KNN Miss Values'."
            )

        # Fit on the training features only, then apply the same fitted
        # imputer to both splits.
        model.fit(self.X_train, self.y_train)
        self.X_train = model.transform(self.X_train)
        self.X_test = model.transform(self.X_test)

    # NOTE(review): the original indentation was lost; the dataset assembly
    # is placed after the loop so it also runs when no 'imputer' step is
    # configured -- confirm against the original file.
    feature_names = self.headers[:-1]
    target_name = self.headers[-1]

    self.new_train_dataset = pd.DataFrame(self.X_train, columns=feature_names)
    self.new_train_dataset[target_name] = self.y_train

    self.new_test_dataset = pd.DataFrame(self.X_test, columns=feature_names)
    self.new_test_dataset[target_name] = self.y_test

    return self.new_train_dataset, self.new_test_dataset
# Prepare the test set with the project's title-extraction and dummy-encoding
# helpers (defined elsewhere in the project).
test_data = title_extract(my_test_data1)
test_data = dummy_encode(test_data, 2, 6, 0, 7)

# Feature scaling (Age): standardize the 'Age' column of the cross-validation
# and test sets.
# NOTE(review): the scaler is re-fitted on each dataset separately
# (fit_transform is called twice), so each set is standardized with its own
# statistics -- confirm this is intended rather than reusing a scaler fitted
# on the training data.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
CV_data[['Age']] = sc.fit_transform(CV_data[['Age']])
test_data[['Age']] = sc.fit_transform(test_data[['Age']])

from missingpy import MissForest

# Make an instance and perform the imputation: the imputer is fitted on the
# training features only (the 'Survived' and 'Weight' columns are dropped).
imputer = MissForest(random_state=0)
my_imp = imputer.fit(train_data.drop(['Survived', 'Weight'], axis=1))

# Impute the cross-validation features, then rebuild a DataFrame and put the
# 'Survived' column back in front.
# NOTE(review): columns=CV_data.columns[1:] presumably relies on 'Survived'
# being the first column of CV_data -- verify.
CV_data_missforest = imputer.transform(CV_data.drop('Survived', axis=1))
CV_data_missforest = pd.DataFrame(CV_data_missforest, columns=CV_data.columns[1:])
CV_data_missforest = pd.concat([CV_data.Survived, CV_data_missforest], axis=1)

# Impute the test set (transformed as-is, no column dropped) and restore its
# column names.
test_data_missforest = imputer.transform(test_data)
test_data_missforest = pd.DataFrame(test_data_missforest, columns=test_data.columns)

## Now that the individuals in the training set have their new weights, and
## the missing values in the cross-validation and test sets have been imputed
## using the MissForest imputation method, the datasets are exported so the
## weighted logistic model can be fitted in R.
## NOTE(review): the original comment gave "Python doesn't allow for fitting a
## weighted model" as the reason; that may no longer hold -- confirm before
## relying on it.
train_data.to_excel(r'train_data.xlsx', index = False)
CV_data_missforest.to_excel(r'CV_data.xlsx', index = False)
test_data_missforest.to_excel(r'test_data.xlsx', index = False)