def test_statstics_fit_transform():
    # Test statistics_ when data in fit() and transform() are different
    X = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    statistics_mean = np.nanmean(X, axis=0)

    Y = np.array([
        [0, 0, 0, 0],
        [2, 2, 2, 1],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [9, 9, 8, 8],
        [16, 15, 18, 19],
    ])

    imputer = MissForest()
    imputer.fit(X).transform(Y)
    assert_array_equal(imputer.statistics_.get('col_means'), statistics_mean)
Beispiel #2
0
    def imputer(self, _steps, _answers, train_dataset, _X_train, _y_train,
                test_dataset, _X_test, _y_test, _headers):
        self.steps = _steps
        self.answers = _answers
        self.X_train = _X_train
        self.y_train = _y_train
        self.X_test = _X_test
        self.y_test = _y_test
        self.headers = _headers

        self.train_pipe_steps = []

        for i, s in enumerate(self.steps):
            if (s == 'imputer'):
                if (self.answers[i][s] == 'Miss Forest'):
                    imputer = MissForest()

                if (self.answers[i][s] == 'KNN Miss Values'):

                    imputer = KNNImputer(n_neighbors=2)

        imputer.fit(self.X_train, self.y_train)
        self.X_train = imputer.transform(self.X_train)
        self.X_test = imputer.transform(self.X_test)

        self.new_train_dataset = pd.DataFrame(self.X_train,
                                              columns=self.headers[:-1])
        self.new_train_dataset[self.headers[-1]] = self.y_train

        self.new_test_dataset = pd.DataFrame(self.X_test,
                                             columns=self.headers[:-1])
        self.new_test_dataset[self.headers[-1]] = self.y_test

        return self.new_train_dataset, self.new_test_dataset
Beispiel #3
0
test_data = title_extract(my_test_data1)
test_data = dummy_encode(test_data, 2, 6, 0, 7)

# Feature scaling (Age)
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
CV_data[['Age']] = sc.fit_transform(CV_data[['Age']])
test_data[['Age']] = sc.fit_transform(test_data[['Age']])


from missingpy import MissForest

# Make an instance and perform the imputation
imputer = MissForest(random_state=0)
my_imp = imputer.fit(train_data.drop(['Survived', 'Weight'], axis=1))

CV_data_missforest = imputer.transform(CV_data.drop('Survived', axis=1))
CV_data_missforest = pd.DataFrame(CV_data_missforest, columns=CV_data.columns[1:])
CV_data_missforest = pd.concat([CV_data.Survived, CV_data_missforest], axis=1)

test_data_missforest = imputer.transform(test_data)
test_data_missforest = pd.DataFrame(test_data_missforest, columns=test_data.columns)

## Now that the individuals in the training set have their new weights, and the missing values in the cross-validation and test set have been imputed
## using the MissForest imputation method, we will now fit the logistic model in R since Python doesn't allow for fitting a weighted model

train_data.to_excel(r'train_data.xlsx', index = False)
CV_data_missforest.to_excel(r'CV_data.xlsx', index = False)
test_data_missforest.to_excel(r'test_data.xlsx', index = False)