Example #1
0
class Imputer(SklearnImputer):
    """Imputer that pulls a single named input column before imputing.

    The instance builds its own ``FeatureExtractor`` for ``input_features``
    and runs every ``X`` handed to ``fit``/``transform`` through it before
    delegating to the parent imputer.

    NOTE(review): ``self.op`` is read in ``__init__`` but is not defined in
    this block; it is presumably attached to the class elsewhere -- confirm.
    """

    def __init__(self,
                 missing_values="NaN",
                 strategy="mean",
                 axis=0,
                 verbose=0,
                 copy=True,
                 input_features=None,
                 output_features=None):
        """Configure the imputer and its private feature extractor.

        :param missing_values: forwarded unchanged to the parent imputer.
        :param strategy: forwarded unchanged to the parent imputer.
        :param axis: forwarded unchanged to the parent imputer.
        :param verbose: forwarded unchanged to the parent imputer.
        :param copy: forwarded unchanged to the parent imputer.
        :param input_features: name of the scalar column to extract.
        :param output_features: name of the imputed output; also used to
            derive the extractor's ``'extracted_<name>'`` output vector.
        :raises TypeError: if ``output_features`` is None (it has to be
            concatenated onto the ``'extracted_'`` prefix).
        """
        self.name = "{}_{}".format(self.op, uuid.uuid1())
        self.input_features = input_features
        self.output_features = output_features
        self.input_shapes = {'data_shape': [{'shape': 'scalar'}]}
        # The default of None can never work: it would be concatenated onto
        # a str below. Fail with the same exception type the concatenation
        # raised, but with a message that tells the caller what to pass.
        if output_features is None:
            raise TypeError(
                "Imputer requires 'output_features' to be a column name "
                "string, got None")
        self.feature_extractor = FeatureExtractor(
            input_scalars=[input_features],
            output_vector='extracted_' + output_features,
            output_vector_items=[output_features])
        SklearnImputer.__init__(self, missing_values, strategy, axis, verbose,
                                copy)

    def fit(self, X, y=None):
        """Fit the parent imputer on the extracted column of ``X``.

        ``y`` is accepted for API compatibility and is not used.

        :return: ``self``.
        """
        super(Imputer, self).fit(self.feature_extractor.transform(X))
        return self

    def transform(self, X):
        """Impute the extracted column of ``X``.

        :return: the parent imputer's output wrapped in a ``pd.DataFrame``.
        """
        return pd.DataFrame(
            super(Imputer,
                  self).transform(self.feature_extractor.transform(X)))

    def serialize_to_bundle(self, path, model_name):
        """Delegate serialization of this imputer to ``ImputerSerializer``."""
        ImputerSerializer().serialize_to_bundle(self, path, model_name)
Example #2
0
class TestImputerExtension(unittest.TestCase):
    """Serialization test for the ``Imputer`` extension class."""

    def setUp(self):
        """Build a two-column frame with missing values and a temp dir."""
        # np.nan (lowercase) is used instead of the np.NaN alias, which was
        # removed in NumPy 2.0; np.nan is valid on every NumPy version.
        self.df = pd.DataFrame(
            [[0.85281608, 1.50669264], [-1.04544152, np.nan],
             [0.41515407, -0.29941475], [np.nan, -0.96775275],
             [np.nan, -0.85734022]],
            columns=['a', 'b'])
        self.feature_extractor = FeatureExtractor(input_scalars=['a'],
                                                  output_vector='a_extracted')
        self.tmp_dir = tempfile.mkdtemp(prefix="mleap.python.tests")

    def tearDown(self):
        """Remove the temporary bundle directory created in ``setUp``."""
        shutil.rmtree(self.tmp_dir)

    def test_imputer_extension_serialization_succeeds(self):
        # Fit on column 'a', serialize, then compare model.json and
        # node.json in the bundle directory with the expected content.
        imputer = Imputer(input_features='a', output_features='a_imputed')

        imputer.fit(self.feature_extractor.transform(self.df))
        imputer.serialize_to_bundle(self.tmp_dir, imputer.name)

        expected_model = {
            "op": "imputer",
            "attributes": {
                "surrogate_value": {
                    # pandas' mean() skips NaN entries by default.
                    "double": self.df.a.mean()
                },
                "strategy": {
                    "string": "mean"
                }
            }
        }

        with open("{}/{}.node/model.json".format(self.tmp_dir,
                                                 imputer.name)) as json_data:
            actual_model = json.load(json_data)

        self.assertEqual(expected_model, actual_model)

        with open("{}/{}.node/node.json".format(self.tmp_dir,
                                                imputer.name)) as json_data:
            node = json.load(json_data)

        self.assertEqual(imputer.name, node['name'])
        self.assertEqual("a", node['shape']['inputs'][0]['name'])
        self.assertEqual("a_imputed", node['shape']['outputs'][0]['name'])
Example #3
0
class TestImputer(unittest.TestCase):
    """Bundle-serialization tests for ``SimpleImputer``.

    Each test initialises the imputer with ``mlinit``, fits it on the output
    of ``self.feature_extractor`` and then either expects
    ``serialize_to_bundle`` to raise ``NotImplementedError`` or checks the
    ``model.json`` / ``node.json`` files written under ``self.tmp_dir``.
    """

    def setUp(self):
        """Build a two-column frame with missing values and a temp dir."""
        # np.nan (lowercase) is used instead of the np.NaN alias, which was
        # removed in NumPy 2.0; np.nan is valid on every NumPy version.
        self.df = pd.DataFrame(
            [[0.85281608, 1.50669264], [-1.04544152, np.nan],
             [0.41515407, -0.29941475], [np.nan, -0.96775275],
             [np.nan, -0.85734022]],
            columns=['a', 'b'])
        self.feature_extractor = FeatureExtractor(input_scalars=['a'],
                                                  output_vector='a_extracted')
        self.tmp_dir = tempfile.mkdtemp(prefix="mleap.python.tests")

    def tearDown(self):
        """Remove the temporary bundle directory created in ``setUp``."""
        shutil.rmtree(self.tmp_dir)

    def _init_and_fit(self, imputer, output_features='a_imputed', df=None):
        """Run ``mlinit`` and ``fit`` on ``imputer``; return the imputer.

        Uses the current ``self.feature_extractor`` both as ``prior_tf`` and
        to transform the frame (``self.df`` unless ``df`` is given).
        """
        frame = self.df if df is None else df
        imputer.mlinit(prior_tf=self.feature_extractor,
                       output_features=output_features)
        imputer.fit(self.feature_extractor.transform(frame))
        return imputer

    def _assert_serialization_not_implemented(self, imputer):
        """Assert that serializing ``imputer`` raises NotImplementedError."""
        with self.assertRaises(NotImplementedError):
            imputer.serialize_to_bundle(self.tmp_dir, imputer.name)

    def _load_node_file(self, imputer, file_name):
        """Load ``<tmp_dir>/<imputer.name>.node/<file_name>`` as JSON."""
        with open("{}/{}.node/{}".format(self.tmp_dir, imputer.name,
                                         file_name)) as json_data:
            return json.load(json_data)

    def _assert_bundle_matches(self, imputer, expected_model):
        """Serialize ``imputer`` and verify model.json and node.json."""
        imputer.serialize_to_bundle(self.tmp_dir, imputer.name)

        actual_model = self._load_node_file(imputer, "model.json")
        self.assertEqual(expected_model, actual_model)

        node = self._load_node_file(imputer, "node.json")
        self.assertEqual(imputer.name, node['name'])
        self.assertEqual("a_extracted", node['shape']['inputs'][0]['name'])
        self.assertEqual("a_imputed", node['shape']['outputs'][0]['name'])

    def test_imputer_serialization_fails_with_strategy_set_to_most_frequent(
            self):
        imputer = self._init_and_fit(SimpleImputer(strategy='most_frequent'))
        self._assert_serialization_not_implemented(imputer)

    def test_imputer_serialization_fails_with_strategy_set_to_constant(self):
        imputer = self._init_and_fit(SimpleImputer(strategy='constant'))
        self._assert_serialization_not_implemented(imputer)

    def test_imputer_serialization_fails_with_add_indicator_set_to_true(self):
        imputer = self._init_and_fit(SimpleImputer(add_indicator=True))
        self._assert_serialization_not_implemented(imputer)

    def test_imputer_serialization_fails_when_fit_on_multiple_features(self):
        imputer = SimpleImputer()
        # Swap in a two-column extractor before mlinit/fit so the imputer is
        # fit on both 'a' and 'b'.
        self.feature_extractor = FeatureExtractor(input_scalars=['a', 'b'],
                                                  output_vector='ab_extracted')
        self._init_and_fit(imputer, output_features='ab_imputed')
        self._assert_serialization_not_implemented(imputer)

    def test_imputer_serialization_succeeds_with_strategy_set_to_mean(self):
        imputer = self._init_and_fit(SimpleImputer(strategy='mean'))

        expected_model = {
            "op": "imputer",
            "attributes": {
                "surrogate_value": {
                    "double": self.df.a.mean()
                },
                "strategy": {
                    "string": "mean"
                }
            }
        }
        self._assert_bundle_matches(imputer, expected_model)

    def test_imputer_serialization_succeeds_with_strategy_set_to_median(self):
        imputer = self._init_and_fit(SimpleImputer(strategy='median'))

        expected_model = {
            "op": "imputer",
            "attributes": {
                "surrogate_value": {
                    "double": self.df.a.median()
                },
                "strategy": {
                    "string": "median"
                }
            }
        }
        self._assert_bundle_matches(imputer, expected_model)

    def test_imputer_serialization_succeeds_with_missing_values_set_to_zero(
            self):
        # Replace NaN with 0 and tell the imputer that 0.0 marks a missing
        # value; the expected surrogate is still taken from the NaN-skipping
        # mean of the original column.
        df2 = self.df.fillna(0)

        imputer = self._init_and_fit(
            SimpleImputer(strategy='mean', missing_values=0.0), df=df2)

        expected_model = {
            "op": "imputer",
            "attributes": {
                "surrogate_value": {
                    "double": self.df.a.mean(),
                },
                "strategy": {
                    "string": "mean",
                },
                "missing_value": {
                    "double": 0.0,
                }
            }
        }
        self._assert_bundle_matches(imputer, expected_model)