Beispiel #1
0
 def test_validate_sklearn_knn_models_multiclass(self):
     """Exported KNN classifier PMML must validate against the schema.

     Fix: output filename typo 'numlti' -> 'multi'.
     """
     model = KNeighborsClassifier()
     pipe = Pipeline([('model', model)])
     pipe.fit(self.X, self.y)
     file_name = 'knn_model_multi_class_classification.pmml'
     skl_to_pmml(pipe, self.features, 'species', file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
Beispiel #2
0
    def test_sklearn_04(self):
        """GBC on titanic: exported PMML must use model-chain segmentation."""
        df = pd.read_csv("nyoka/tests/titanic_train.csv")
        feats = df.columns
        label = 'Survived'
        out_file = "gb_pmml.pmml"

        pipe = Pipeline([
            ("imp", Imputer(strategy="median")),
            ("gbc", GradientBoostingClassifier(n_estimators=10))
        ])
        pipe.fit(df[feats], df[label])
        skl_to_pmml(pipe, feats, label, out_file)

        reconstructed = pml.parse(out_file, True)
        segmentation = reconstructed.MiningModel[0].Segmentation

        ##1 boosted classifier serializes as a model chain
        self.assertEqual(segmentation.multipleModelMethod, "modelChain")

        ##2 exactly two segments are emitted
        self.assertEqual(len(segmentation.Segment), 2)

        ##3 second segment squashes scores with logit normalization
        self.assertEqual(
            segmentation.Segment[1].RegressionModel.normalizationMethod,
            "logit")
Beispiel #3
0
    def test_sklearn_02(self):
        """KNN on iris: exported PMML keeps distance measure and k."""
        iris = datasets.load_iris()
        frame = pd.DataFrame(iris.data, columns=iris.feature_names)
        frame['Species'] = iris.target
        label = 'Species'
        feats = frame.columns.drop(label)
        out_file = "knn_pmml.pmml"

        pipe = Pipeline([('scaling', StandardScaler()),
                         ('knn', KNeighborsClassifier(n_neighbors=5))])
        pipe.fit(frame[feats], frame[label])
        skl_to_pmml(pipe, feats, label, out_file)

        knn_model = pml.parse(out_file, True).NearestNeighborModel[0]

        ##1 euclidean comparison measure is present
        self.assertIsNotNone(knn_model.ComparisonMeasure.euclidean)

        ##2 the measure kind is a distance, not a similarity
        self.assertEqual(knn_model.ComparisonMeasure.kind, "distance")

        ##3 neighbour count survives the round trip
        self.assertEqual(pipe.steps[-1][-1].n_neighbors,
                         knn_model.numberOfNeighbors)
Beispiel #4
0
 def test_validate_sklearn_lda_models_multiclass(self):
     """Exported LDA classifier PMML must validate against the schema.

     Fix: output filename typo 'numlti' -> 'multi'.
     """
     model = LinearDiscriminantAnalysis()
     pipe = Pipeline([('model', model)])
     pipe.fit(self.X, self.y)
     file_name = 'lda_model_multi_class_classification.pmml'
     skl_to_pmml(pipe, self.features, 'species', file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
 def test_35_isolation_forest(self):
     """Isolation-forest scores/labels must match the Zementis server.

     Fix: the banner printed 'test 34' with an unbalanced parenthesis;
     it now reports the correct test number.
     """
     print("\ntest 35 (Isolation Forest)\n")
     # server anomaly labels -> sklearn predict() convention (-1 = anomaly)
     detection_map = {
         'true': -1,
         'false': 1
     }
     X = numpy.array([
         [1,2,3,4],
         [2,1,3,4],
         [3,2,1,4],
         [3,2,4,1],
         [4,3,2,1],
         [2,4,3,1]
     ], dtype=numpy.float32)
     test_data = numpy.array([[0,4,0,7],[4,0,4,7]])
     features = ['a','b','c','d']
     model = IsolationForest(n_estimators=40,contamination=0)
     pipeline_obj = Pipeline([
         ("model", model)
     ])
     pipeline_obj.fit(X)
     file_name = 'test35sklearn.pmml'
     skl_to_pmml(pipeline_obj, features, '', file_name)
     model_pred = pipeline_obj.predict(test_data)
     model_scores = model.score_samples(test_data)
     model_name  = self.adapa_utility.upload_to_zserver(file_name)
     # NOTE(review): server scores test_forest.csv while the local model
     # scores `test_data`; assumes the CSV holds the same rows — confirm.
     z_predictions = self.adapa_utility.score_in_zserver(model_name,'nyoka/tests/test_forest.csv','ANOMALY')
     cnt = 0
     for idx, value in enumerate(z_predictions):
         score, is_anomaly = value.split(",")
         # server reports the negated score relative to score_samples()
         score = -1 * float(score)
         if "{:.6f}".format(score) != "{:.6f}".format(model_scores[idx]) or model_pred[idx] != detection_map[is_anomaly]:
             cnt += 1
     self.assertEqual(cnt,0)
Beispiel #6
0
 def test_validate_sklearn_tree_models_multiclass(self):
     """PCA + DecisionTree export must validate against the schema.

     Fix: output filename typo 'numlti' -> 'multi'.
     """
     model = DecisionTreeClassifier()
     pipe = Pipeline([('pca', PCA()), ('model', model)])
     pipe.fit(self.X, self.y)
     file_name = 'tree_model_multi_class_classification.pmml'
     skl_to_pmml(pipe, self.features, 'species', file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
Beispiel #7
0
 def test_validate_sklearn_svm_models_binary_class(self):
     """Exported SVC pipeline PMML must validate against the schema."""
     file_name = 'svm_model_binary_classification.pmml'
     pipeline = Pipeline([('scaler', MaxAbsScaler()), ('model', SVC())])
     pipeline.fit(self.X, self.y_bin)
     skl_to_pmml(pipeline, self.features, 'binary', file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
Beispiel #8
0
 def test_validate_sklearn_linear_models_binary_class(self):
     """Exported LogisticRegression PMML must validate against the schema.

     Fix: pipeline step-name typo 'sclaer' -> 'scaler'.
     """
     model = LogisticRegression()
     pipe = Pipeline([('scaler', StandardScaler()), ('model', model)])
     pipe.fit(self.X, self.y_bin)
     file_name = 'linear_model_binary_classification.pmml'
     skl_to_pmml(pipe, self.features, 'binary', file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
    def test_03_logistic_regression_with_scaler(self):
        """Mapped multi-class logistic regression must score identically
        locally and on the Zementis server."""
        print(
            "\ntest 03 (logistic regression with preprocessing) [multi-class]\n"
        )
        X, X_test, y, features, target, test_file = self.data_utility.get_data_for_multi_class_classification(
        )

        # scale sepal columns, pass petal columns through untouched
        mapper = DataFrameMapper([
            (["sepal length (cm)", "sepal width (cm)"], MinMaxScaler()),
            (["petal length (cm)", "petal width (cm)"], None),
        ])
        pipe = Pipeline([("mapper", mapper), ("model", LogisticRegression())])
        pipe.fit(X, y)

        file_name = 'test03sklearn.pmml'
        skl_to_pmml(pipe, features, target, file_name)
        model_name = self.adapa_utility.upload_to_zserver(file_name)
        predictions, probabilities = self.adapa_utility.score_in_zserver(
            model_name, test_file)

        self.assertEqual(
            self.adapa_utility.compare_predictions(predictions,
                                                   pipe.predict(X_test)),
            True)
        self.assertEqual(
            self.adapa_utility.compare_probability(probabilities,
                                                   pipe.predict_proba(X_test)),
            True)
Beispiel #10
0
    def test_sklearn_03(self):
        """Random forest: one PMML segment per tree, combined by vote."""
        iris = datasets.load_iris()
        frame = pd.DataFrame(iris.data, columns=iris.feature_names)
        frame['Species'] = iris.target
        label = 'Species'
        feats = frame.columns.drop(label)
        out_file = "rf_pmml.pmml"

        model = RandomForestClassifier(n_estimators=100)
        mapper = DataFrameMapper([
            (['sepal length (cm)', 'sepal width (cm)'], StandardScaler()),
            (['petal length (cm)', 'petal width (cm)'], Imputer()),
        ])
        pipe = Pipeline([("mapping", mapper), ("rfc", model)])
        pipe.fit(frame[feats], frame[label])
        skl_to_pmml(pipe, feats, label, out_file)

        segmentation = pml.parse(out_file, True).MiningModel[0].Segmentation

        ## 1 every tree in the ensemble becomes its own segment
        self.assertEqual(model.n_estimators, len(segmentation.Segment))

        ##2 the forest combines trees by majority vote
        self.assertEqual(segmentation.multipleModelMethod, "majorityVote")
 def test_36_one_class_svm(self):
     """One-class SVM scores/labels must match the Zementis server.

     Fix: the printed banner was missing its closing parenthesis.
     """
     print("\ntest 36 (One Class SVM)\n")
     # server anomaly labels -> sklearn predict() convention (-1 = anomaly)
     detection_map = {
         'true': -1,
         'false': 1
     }
     df = pd.read_csv("nyoka/tests/train_ocsvm.csv")
     df_test = pd.read_csv("nyoka/tests/test_ocsvm.csv")
     features = df.columns
     model = OneClassSVM(nu=0.1)
     pipeline_obj = Pipeline([
         ("model", model)
     ])
     pipeline_obj.fit(df)
     file_name = 'test36sklearn.pmml'
     skl_to_pmml(pipeline_obj, features, '', file_name)
     model_pred = pipeline_obj.predict(df_test)
     model_scores = pipeline_obj.decision_function(df_test)
     model_name  = self.adapa_utility.upload_to_zserver(file_name)
     z_predictions = self.adapa_utility.score_in_zserver(model_name,'nyoka/tests/test_ocsvm.csv','ANOMALY')
     # count rows where either the score or the anomaly label disagrees
     cnt = 0
     for idx, value in enumerate(z_predictions):
         score, is_anomaly = value.split(",")
         score = float(score)
         if "{:.6f}".format(score) != "{:.6f}".format(model_scores[idx]) or model_pred[idx] != detection_map[is_anomaly]:
             cnt += 1
     self.assertEqual(cnt,0)
Beispiel #12
0
 def test_validate_sklearn_gboost_models_binary_class(self):
     """GradientBoosting + RobustScaler export must emit valid PMML."""
     file_name = 'gboost_model_binary_classification.pmml'
     pipeline = Pipeline([('scaler', RobustScaler()),
                          ('model', GradientBoostingClassifier())])
     pipeline.fit(self.X, self.y_bin)
     skl_to_pmml(pipeline, self.features, 'binary', file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
Beispiel #13
0
 def test_validate_sklearn_knn_models_regression(self):
     """KNN regressor export must emit valid PMML."""
     file_name = 'knn_model_regression.pmml'
     pipeline = Pipeline([('model', KNeighborsRegressor())])
     pipeline.fit(self.X_reg, self.y_reg)
     skl_to_pmml(pipeline, self.features_reg, 'target', file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
Beispiel #14
0
 def test_validate_sklearn_kmeans_models(self):
     """KMeans clustering export must emit valid PMML."""
     file_name = 'kmeans_model.pmml'
     pipeline = Pipeline([('model', KMeans())])
     pipeline.fit(self.X)
     skl_to_pmml(pipeline, self.features, 'target', file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
Beispiel #15
0
    def test_sklearn_01(self):
        """SVC: intercepts and RBF gamma must survive the PMML round trip."""
        iris = datasets.load_iris()
        frame = pd.DataFrame(iris.data, columns=iris.feature_names)
        frame['Species'] = iris.target
        label = 'Species'
        feats = frame.columns.drop(label)
        out_file = "svc_pmml.pmml"

        model = SVC()
        pipe = Pipeline([('svm', model)])
        pipe.fit(frame[feats], frame[label])
        skl_to_pmml(pipe, feats, label, out_file)

        svm_model = pml.parse(out_file, True).SupportVectorMachineModel[0]

        ## 1 each machine's absolute coefficient equals the fitted intercept
        for mod_val, recon_val in zip(model.intercept_,
                                      svm_model.SupportVectorMachine):
            self.assertEqual(
                "{:.16f}".format(mod_val),
                "{:.16f}".format(recon_val.Coefficients.absoluteValue))

        ## 2 RBF kernel gamma matches the fitted value
        self.assertEqual(svm_model.RadialBasisKernelType.gamma, model._gamma)
Beispiel #16
0
 def test_validate_sklearn_gnb_models_binary_class(self):
     """GaussianNB export must emit valid PMML."""
     file_name = 'gnb_model_binary_classification.pmml'
     pipeline = Pipeline([('model', GaussianNB())])
     pipeline.fit(self.X, self.y_bin)
     skl_to_pmml(pipeline, self.features, 'binary', file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
Beispiel #17
0
 def test_sklearn_30(self):
     """KMeans export must produce a PMML file on disk."""
     iris = datasets.load_iris()
     pipeline = Pipeline([('model', KMeans())])
     pipeline.fit(iris.data)
     file_name = 'kmeans_model.pmml'
     skl_to_pmml(pipeline, iris.feature_names, 'target', file_name)
     self.assertEqual(os.path.isfile(file_name), True)
Beispiel #18
0
 def test_validate_sklearn_linear_models_regression(self):
     """Imputer + polynomial features + LinearRegression must emit valid PMML."""
     pipeline = Pipeline([('impute', Imputer()),
                          ('feat', PolynomialFeatures()),
                          ('model', LinearRegression())])
     pipeline.fit(self.X_reg, self.y_reg)
     file_name = 'linear_model_regression.pmml'
     skl_to_pmml(pipeline, self.features_reg, 'target', file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
Beispiel #19
0
 def test_validate_sklearn_mlp_models_multiclass(self):
     """MLP classifier export must validate against the schema.

     Fix: output filename typo 'numlti' -> 'multi'.
     """
     from sklearn.neural_network import MLPClassifier
     model = MLPClassifier()
     pipe = Pipeline([('model', model)])
     pipe.fit(self.X, self.y)
     file_name = 'mlp_model_multi_class_classification.pmml'
     skl_to_pmml(pipe, self.features, 'species', file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
 def test_sklearn_40(self):
     """Passing a bare estimator (no Pipeline) must raise TypeError."""
     iris = datasets.load_iris()
     frame = pd.DataFrame(iris.data,columns=iris.feature_names)
     frame['Species'] = iris.target
     label = 'Species'
     feats = frame.columns.drop(label)
     model = GaussianProcessClassifier()
     model.fit(frame[feats],frame[label])
     with self.assertRaises(TypeError):
         skl_to_pmml(model,feats,label,"no_pipeline.pmml")
Beispiel #21
0
 def test_validate_isolation_forest(self):
     """IsolationForest pipeline export must validate against the schema."""
     iris = datasets.load_iris()
     model = IsolationForest()
     pipeline = Pipeline([('standard_scaler', StandardScaler()),
                          ('Imputer', Imputer()), ('model', model)])
     pipeline.fit(iris.data)
     file_name = model.__class__.__name__ + '.pmml'
     skl_to_pmml(pipeline, iris.feature_names, pmml_f_name=file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
Beispiel #22
0
 def test_validate_sklearn_rf_models_binary_class(self):
     """RandomForest behind a LabelBinarizer mapper must emit valid PMML."""
     df = pd.DataFrame(data=self.X, columns=self.features)
     df['new'] = [i % 3 for i in range(self.X.shape[0])]
     df['binary'] = self.y_bin
     mapper = DataFrameMapper([('new', LabelBinarizer())])
     pipe = Pipeline([('mapper', mapper),
                      ('model', RandomForestClassifier())])
     pipe.fit(df[self.features + ['new']], df.binary)
     file_name = 'rf_model_binary_classification.pmml'
     skl_to_pmml(pipe, self.features + ['new'], 'binary', file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
Beispiel #23
0
 def test_validate_sklearn_gboost_models_multiclass(self):
     """GradientBoosting behind a LabelEncoder mapper must emit valid PMML.

     Fix: output filename typo 'numlti' -> 'multi'.
     """
     df = pd.DataFrame(data=self.X, columns=self.features)
     df['new'] = [i % 3 for i in range(self.X.shape[0])]
     df['species'] = self.y
     model = GradientBoostingClassifier()
     pipe = Pipeline([('mapper', DataFrameMapper([('new', LabelEncoder())
                                                  ])), ('model', model)])
     pipe.fit(df.drop(['species'], axis=1), df.species)
     file_name = 'gboost_model_multi_class_classification.pmml'
     skl_to_pmml(pipe, self.features + ['new'], 'species', file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
Beispiel #24
0
 def test_validate_sklearn_linear_models_multiclass(self):
     """LogisticRegression behind a Binarizer mapper must emit valid PMML."""
     df = pd.DataFrame(data=self.X, columns=self.features)
     df['species'] = self.y
     mapper = DataFrameMapper([(['sepal length (cm)'], Binarizer())])
     pipe = Pipeline([('mapper', mapper), ('model', LogisticRegression())])
     pipe.fit(df[self.features], df.species)
     file_name = 'linear_model_multi_class_classification.pmml'
     skl_to_pmml(pipe, self.features, 'species', file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
Beispiel #25
0
 def test_sklearn_37(self):
     """A scaler+imputer pipeline must still produce a PMML file."""
     iris = datasets.load_iris()
     frame = pd.DataFrame(iris.data, columns=iris.feature_names)
     frame['Species'] = iris.target
     label = 'Species'
     feats = frame.columns.drop(label)
     pipe = Pipeline([('new', StandardScaler()),
                      ('imputer', Imputer()),
                      ('model', LogisticRegression())])
     pipe.fit(frame[feats], frame[label])
     skl_to_pmml(pipe, feats, label, "imputer.pmml")
     self.assertEqual(os.path.isfile("imputer.pmml"), True)
Beispiel #26
0
 def test_sklearn_31(self):
     """GradientBoosting multi-class export must produce a PMML file.

     Fix: output filename typo 'numlti' -> 'multi'.
     """
     iris = datasets.load_iris()
     irisd = pd.DataFrame(iris.data, columns=iris.feature_names)
     irisd['Species'] = iris.target
     target = 'Species'
     features = irisd.columns.drop('Species')
     model = GradientBoostingClassifier()
     pipe = Pipeline([('scaler', MaxAbsScaler()), ('model', model)])
     pipe.fit(irisd[features], irisd[target])
     file_name = 'gbc_model_multi_class_classification.pmml'
     skl_to_pmml(pipe, iris.feature_names, target, file_name)
     self.assertEqual(os.path.isfile(file_name), True)
Beispiel #27
0
 def test_validate_ocsvm(self):
     """OneClassSVM pipeline export must validate against the schema."""
     iris = datasets.load_iris()
     model = OneClassSVM()
     pipeline = Pipeline([('standard_scaler', StandardScaler()),
                          ('Imputer', Imputer()), ('model', model)])
     pipeline.fit(iris.data, iris.target)
     file_name = model.__class__.__name__ + '.pmml'
     skl_to_pmml(pipeline, iris.feature_names, pmml_f_name=file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
Beispiel #28
0
 def test_validate_lag(self):
     """LogisticRegression behind a stddev Lag must emit valid PMML."""
     iris = datasets.load_iris()
     model = LogisticRegression()
     pipeline = Pipeline([('lag', Lag(aggregation="stddev", value=3)),
                          ('model', model)])
     pipeline.fit(iris.data, iris.target)
     file_name = model.__class__.__name__ + 'lag_stddev.pmml'
     skl_to_pmml(pipeline, iris.feature_names, 'species',
                 pmml_f_name=file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)
Beispiel #29
0
 def test_sklearn_39(self):
     """GaussianProcessClassifier export must raise NotImplementedError."""
     iris = datasets.load_iris()
     frame = pd.DataFrame(iris.data, columns=iris.feature_names)
     frame['Species'] = iris.target
     label = 'Species'
     feats = frame.columns.drop(label)
     pipe = Pipeline([('model', GaussianProcessClassifier())])
     pipe.fit(frame[feats], frame[label])
     with self.assertRaises(NotImplementedError):
         skl_to_pmml(pipe, numpy.array(feats), label, "gpc.pmml")
Beispiel #30
0
 def test_validate_sklearn_sgd_with_text(self):
     """SGD classifier over TF-IDF text features must emit valid PMML."""
     categories = ['alt.atheism', 'talk.religion.misc']
     data = fetch_20newsgroups(subset='train', categories=categories)
     model = SGDClassifier(loss="log")
     pipeline = Pipeline([('vect', TfidfVectorizer()), ('clf', model)])
     # only four documents are needed to exercise the export path
     pipeline.fit(data.data[:4], data.target[:4])
     file_name = model.__class__.__name__ + '_TfIdfVec_.pmml'
     skl_to_pmml(pipeline, ['input'], 'output', file_name)
     self.assertEqual(self.schema.is_valid(file_name), True)