Example 1
    def test_revise_model(self):
        """Tests revise api in scoring engine"""
        kmodel = self.context.models.clustering.kmeans.train(
            self.frame_train, ["Vec1", "Vec2", "Vec3", "Vec4", "Vec5"], 5)

        result_frame = kmodel.predict(self.frame_test)
        old_model_path = kmodel.export_to_mar(
            self.get_export_file(self.get_name("kmeans")))

        # create a revised model
        kmodel_revised = self.context.models.clustering.kmeans.train(
            self.frame_train, ["Vec1", "Vec2", "Vec3", "Vec4"], 4,
            max_iterations=10)
        result_revised = kmodel_revised.predict(self.frame_test)
        test_rows = result_revised.to_pandas(50)
        revised_model_path = kmodel_revised.export_to_mar(
            self.get_export_file(self.get_name("kmeans_revised")))

        with scoring_utils.scorer(old_model_path, self.id()) as scorer:
            res = scorer.revise(revised_model_path)
            self.assertEqual(res.json()["status"], "success")

            for _, i in test_rows.iterrows():
                res = scorer.score(
                    [dict(zip(["Vec1", "Vec2", "Vec3", "Vec4"],
                    list(i[0:4])))])
                self.assertEqual(i["cluster"], res.json()["data"][0]['score'])
Example 2
    def test_arimax_scoring(self):
        """Tests standard usage of arimax."""
        output = self.context.models.timeseries.arimax.train(
            self.train_frame, self.ts_column, self.x_columns, 1, 0, 1, 0)

        timeseries_column = self.actual_data.take(n=self.actual_data.count(),
                                                  columns=self.ts_column)
        y = [item for sublist in timeseries_column for item in sublist]

        x_columns = self.actual_data.take(n=self.actual_data.count(),
                                          columns=self.x_columns)
        x = [item for sublist in x_columns for item in sublist]

        predict_frame = output.predict(self.actual_data, self.ts_column,
                                       self.x_columns)
        predict_data = predict_frame.take(n=self.actual_data.count(),
                                          columns="predicted_y")

        expected_score = [item for sublist in predict_data for item in sublist]
        model_path = output.export_to_mar(
            self.get_export_file(self.get_name("arimax")))

        with scoring_utils.scorer(model_path, self.id()) as scorer:
            r = scorer.score([{"y": y, "x_values": x}])
            scored = r.json()["data"][0]["score"]

            self.assertEqual(scored, expected_score)
Example 3
    def test_revise_model(self):
        """Tests revise api in scoring engine"""
        kmodel = self.context.models.clustering.kmeans.train(
            self.frame_train, ["Vec1", "Vec2", "Vec3", "Vec4", "Vec5"], 5)

        kmodel.predict(self.frame_test)
        old_model_path = kmodel.export_to_mar(
            self.get_export_file(self.get_name("kmeans")))

        # create a revised model
        kmodel_revised = self.context.models.clustering.kmeans.train(
            self.frame_train, ["Vec1", "Vec2", "Vec3", "Vec4"],
            4,
            max_iterations=10)
        result_revised = kmodel_revised.predict(self.frame_test)
        test_rows = result_revised.to_pandas(50)
        revised_model_path = kmodel_revised.export_to_mar(
            self.get_export_file(self.get_name("kmeans_revised")))

        with scoring_utils.scorer(old_model_path, self.id()) as scorer:
            res = scorer.revise(revised_model_path)
            self.assertEqual(res.json()["status"], "success")

            for _, i in test_rows.iterrows():
                res = scorer.score([
                    dict(zip(["Vec1", "Vec2", "Vec3", "Vec4"], list(i[0:4])))
                ])
                self.assertEqual(i["cluster"], res.json()["data"][0]['score'])
Example 4
    def test_scoring_pipeline(self):
        """Test scoring_pipeline"""
        model = self.context.models.classification.naive_bayes.train(
            self.frame, ['f1', 'f2', 'f3'], "label")
        res = model.predict(self.frame, ['f1', 'f2', 'f3'])
        analysis = res.to_pandas()
        file_name = self.get_name("naive_bayes")
        model_path = model.export_to_mar(self.get_export_file(file_name))

        self.tarfile = "pipeline.tar"
        pipeline_funcs = os.path.join(config.root, "regression-tests",
                                      "sparktkregtests", "testcases",
                                      "scoretests", "pipeline_funcs.py")
        pipeline_config = os.path.join(config.root, "regression-tests",
                                       "sparktkregtests", "testcases",
                                       "scoretests", "pipeline_config.json")

        tar = tarfile.open(self.tarfile, "w:gz")
        tar.add(pipeline_funcs, "pipeline_funcs.py")
        tar.add(pipeline_config, "pipeline_config.json")
        tar.close()

        with scoring_utils.scorer(model_path,
                                  self.id(),
                                  pipeline=True,
                                  pipeline_filename=self.tarfile) as scorer:
            for _, i in analysis.iterrows():
                r = scorer.score([
                    dict(
                        zip(['f1', 'f2', 'f3'], map(lambda x: int(x),
                                                    (i[1:4]))))
                ])
                self.assertEqual(r.json(), i['predicted_class'])
Example 5
    def test_revise_model(self):
        """Tests revise api in scoring engine"""
        model = self.context.models.regression.linear_regression.train(
            self.frame, ['c1', 'c2', 'c3', 'c4'], 'label')
        old_model_path = model.export_to_mar(
            self.get_export_file(self.get_name("lin_reg")))

        # create a revised model
        model_revised = self.context.models.regression.linear_regression.train(
                self.frame,
                ['c1', 'c2', 'c3'], 'label',
                max_iterations=10)
        result_revised = model_revised.predict(self.frame, ['c1', 'c2', 'c3'])
        test_rows = result_revised.to_pandas(50)
        revised_model_path = model_revised.export_to_mar(
            self.get_export_file(self.get_name("lin_reg_revised")))

        with scoring_utils.scorer(old_model_path, self.id()) as scorer:
            res = scorer.revise(revised_model_path)
            self.assertEqual(res.json()["status"], "success")

            for _, i in test_rows.iterrows():
                res = scorer.score(
                    [dict(zip(["c1", "c2", "c3"],
                    list(i[0:3])))])
                self.assertEqual(i['predicted_value'],
                    res.json()["data"][0]['Prediction'])
Example 6
    def test_arima_scoring(self):
        """Tests standard usage of arima."""
        timeseries_column = self.train_frame.take(
            n=self.train_frame.count(), columns=self.ts_column)
        timeseries_data = [
            item for sublist in timeseries_column for item in sublist]

        output = self.context.models.timeseries.arima.train(
            timeseries_data, 1, 0, 1)
        predict = output.predict(0)
        prediction = predict[:99]
        model_path = output.export_to_mar(
            self.get_export_file(self.get_name("arima")))
        with scoring_utils.scorer(
                model_path, self.id()) as scorer:
            r = scorer.score([{"future": 0, "timeseries": timeseries_data}])
            scored = r.json()["data"][0]["predicted_values"]
            self.assertEqual(scored, predict)
Example 7
    def test_model_scoring(self):
        """Test publishing a linear regression model"""
        model = self.context.models.regression.linear_regression.train(
            self.frame, ['c1', 'c2', 'c3', 'c4'], 'label')

        predict = model.predict(self.frame, ['c1', 'c2', 'c3', 'c4'])
        test_rows = predict.to_pandas(50)

        file_name = self.get_name("linear_regression")
        model_path = model.export_to_mar(self.get_export_file(file_name))
        with scoring_utils.scorer(model_path, self.id()) as scorer:
            for _, i in test_rows.iterrows():
                res = scorer.score(
                    [dict(zip(["c1", "c2", "c3", "c4"], list(i[0:4])))])
                self.assertEqual(i['predicted_value'],
                                 res.json()["data"][0]['Prediction'])
Example 8
    def test_model_scoring(self):
        """Test publishing a linear regression model"""
        model = self.context.models.regression.linear_regression.train(
            self.frame, ['c1', 'c2', 'c3', 'c4'], "label")

        predict = model.predict(self.frame, ['c1', 'c2', 'c3', 'c4'])
        test_rows = predict.to_pandas(predict.count())

        file_name = self.get_name("linear_regression")
        model_path = model.export_to_mar(self.get_export_file(file_name))
        with scoring_utils.scorer(
                model_path, self.id()) as scorer:
            for _, i in test_rows.iterrows():
                res = scorer.score(
                    [dict(zip(["c1", "c2", "c3", "c4"], list(i[0:4])))])
                self.assertEqual(
                    i["predicted_value"], res.json()["data"][0]['Prediction'])
Example 9
    def test_model_scoring(self):
        """Test training intializes theta, pi and labels"""
        model = self.context.models.classification.naive_bayes.train(self.frame, "label", ['f1', 'f2', 'f3'])

        res = model.predict(self.frame, ['f1', 'f2', 'f3'])

        analysis = res.to_pandas()
        file_name = self.get_name("naive_bayes")
        model_path = model.export_to_mar(self.get_export_file(file_name))
        with scoring_utils.scorer(
                model_path, self.id()) as scorer:
            for _, i in analysis.iterrows():
                r = scorer.score(
                    [dict(zip(['f1', 'f2', 'f3'],
                    map(lambda x: int(x), (i[1:4]))))])
                self.assertEqual(
                    r.json()["data"][0]['Score'], i['predicted_class'])
Example 10
    def test_model_scoring(self):
        """Tests standard usage of the kmeans cluster algorithm."""
        kmodel = self.context.models.clustering.kmeans.train(
            self.frame_train, ["Vec1", "Vec2", "Vec3", "Vec4", "Vec5"], 5)

        result_frame = kmodel.predict(self.frame_test)
        test_rows = result_frame.to_pandas(50)
        result = kmodel.export_to_mar(self.get_export_file(self.get_name("kmeans")))

        with scoring_utils.scorer(
                result, self.id()) as scorer:
            for _, i in test_rows.iterrows():
                res = scorer.score(
                    [dict(zip(["Vec1", "Vec2", "Vec3", "Vec4", "Vec5"],
                    list(i[0:5])))])

                self.assertEqual(i["cluster"], res.json()["data"][0]['score'])
Example 11
    def test_max_scoring(self):
        """Tests standard usage of max."""
        output = self.context.models.timeseries.max.train(
            self.train_frame, self.ts_column, self.x_columns, 1, 0)

        predict_frame = output.predict(self.actual_data, self.ts_column,
                                       self.x_columns)
        timeseries_column = self.actual_data.take(n=self.actual_data.count(),
                                                  columns=self.ts_column)
        y = [item for sublist in timeseries_column for item in sublist]
        x_columns = self.actual_data.take(n=self.actual_data.count(),
                                          columns=self.x_columns)
        x = [item for sublist in x_columns for item in sublist]
        predict_data = predict_frame.take(n=self.actual_data.count(),
                                          columns="predicted_y")
        expected_score = [item for sublist in predict_data for item in sublist]
        model_path = output.export_to_mar(
            self.get_export_file(self.get_name("max")))

        with scoring_utils.scorer(
                model_path, self.id()) as scorer:
            r = scorer.score([{"y": y, "x_values": x}])
            scored = r.json()["data"][0]["score"]
            self.assertEqual(scored, expected_score)
Example 12
    def test_model_scoring(self):
        """Test lda model scoring"""
        model = self.context.models.clustering.lda.train(
            self.lda_frame, 'paper', 'word', 'count',
            num_topics=5, max_iterations=10, seed=0)

        test_phrase = ["word-0-0", "word-1-0",
                       "word-2-0", "word-3-0", "word-4-0"]

        file_name = self.get_name("lda")
        model_path = model.export_to_mar(self.get_export_file(file_name))

        res = model.predict(test_phrase)["topics_given_doc"]

        with scoring_utils.scorer(
                model_path, self.id()) as scorer:
            result = scorer.score([{"paper":test_phrase}]).json()
            for i, j in zip(res, result[u"data"][0]["topics_given_doc"]):
                self.assertAlmostEqual(i, j)
Example 13
    def test_model_scoring(self):
        """Tests standard usage of the kmeans cluster algorithm."""
        kmodel = self.context.models.clustering.kmeans.train(
            self.frame_train, ["Vec1", "Vec2", "Vec3", "Vec4", "Vec5"], 5)

        result_frame = kmodel.predict(self.frame_test)
        test_rows = result_frame.to_pandas(50)
        result = kmodel.export_to_mar(
            self.get_export_file(self.get_name("kmeans")))

        with scoring_utils.scorer(result, self.id()) as scorer:
            for _, i in test_rows.iterrows():
                res = scorer.score([
                    dict(
                        zip(["Vec1", "Vec2", "Vec3", "Vec4", "Vec5"],
                            list(i[0:5])))
                ])

                self.assertEqual(i["cluster"], res.json()["data"][0]['score'])
Example 14
    def test_reg_scoring(self):
        """Test random forest regressor scoring  model"""
        rfmodel = self.context.models.regression.random_forest_regressor.train(
            self.frame, "class", ["feat1", "feat2"], seed=0)

        predresult = rfmodel.predict(self.frame)
        preddf = predresult.to_pandas(predresult.count())

        file_name = self.get_name("random_forest_regressor")
        model_path = rfmodel.export_to_mar(self.get_export_file(file_name))

        with scoring_utils.scorer(
                model_path, self.id()) as scorer:
            for i, row in preddf.iterrows():
                res = scorer.score(
                    [dict(zip(["feat1", "feat2"],
                              map(lambda x: x, row[0:2])))])

                self.assertAlmostEqual(
                    float(row[3]), float(res.json()["data"][0]['Prediction']))
Example 15
    def test_reg_scoring(self):
        """Test random forest regressor scoring  model"""
        rfmodel = self.context.models.regression.random_forest_regressor.train(
            self.frame, ["feat1", "feat2"], "class", seed=0)

        predresult = rfmodel.predict(self.frame)
        preddf = predresult.to_pandas(predresult.count())

        file_name = self.get_name("random_forest_regressor")
        model_path = rfmodel.export_to_mar(self.get_export_file(file_name))

        with scoring_utils.scorer(model_path, self.id()) as scorer:
            for i, row in preddf.iterrows():
                res = scorer.score([
                    dict(zip(["feat1", "feat2"], map(lambda x: x, row[0:2])))
                ])

                self.assertAlmostEqual(
                    float(row[3]), float(res.json()["data"][0]['Prediction']))
Example 16
    def test_model_scoring(self):
        """ Verify that SvmModel operates as expected.  """
        # Test set is a 3x3 square lattice of points
        #   with a fully accurate, linear, unbiased divider.

        train_lattice = ["+++", "++-", "---"]

        training_frame = self.lattice2frame(train_lattice)
        svm_model = self.context.models.classification.svm.train(
            training_frame, ["x", "y"], u"model_class")

        file_name = self.get_name("svm")
        model_path = svm_model.export_to_mar(self.get_export_file(file_name))

        test_rows = training_frame.to_pandas(training_frame.count())

        with scoring_utils.scorer(model_path, self.id()) as scorer:
            for _, i in test_rows.iterrows():
                res = scorer.score([dict(zip(["x", "y"], list(i[0:2])))])
                self.assertEqual(i[2], res.json()["data"][0]['Prediction'])
Example 17
    def test_model_scoring(self):
        """Test publishing a gmm model"""
        model = self.context.models.clustering.gmm.train(
            self.frame, ["x1", "x2"],
            column_scalings=[1.0, 1.0],
            k=5,
            max_iterations=500,
            seed=20,
            convergence_tol=0.0001)

        predict = model.predict(self.frame)
        test_rows = predict.to_pandas(predict.count())

        file_name = self.get_name("gmm")
        model_path = model.export_to_mar(self.get_export_file(file_name))

        with scoring_utils.scorer(model_path, self.id()) as scorer:
            for i, row in test_rows.iterrows():
                res = scorer.score([dict(zip(["x1", "x2"], list(row[0:2])))])
                self.assertEqual(row["predicted_cluster"],
                                 res.json()["data"][0]['Score'])
Example 18
    def test_model_scoring(self):
        """Test publishing a logistic regression model"""
        model = self.context.models.classification.logistic_regression.train(
            self.frame, ["vec0", "vec1", "vec2", "vec3", "vec4"], 'res')

        predict = model.predict(self.frame,
                                ["vec0", "vec1", "vec2", "vec3", "vec4"])
        test_rows = predict.to_pandas(100)

        file_name = self.get_name("logistic_regression")
        model_path = model.export_to_mar(self.get_export_file(file_name))

        with scoring_utils.scorer(model_path, self.id()) as scorer:
            for i, row in test_rows.iterrows():
                res = scorer.score([
                    dict(
                        zip(["vec0", "vec1", "vec2", "vec3", "vec4"],
                            list(row[0:5])))
                ])

                self.assertEqual(row["predicted_label"],
                                 res.json()["data"][0]['PredictedLabel'])
Example 19
    def test_model_scoring(self):
        """Test publishing a gmm model"""
        model = self.context.models.clustering.gmm.train(
            self.frame,
            ["x1", "x2"],
            column_scalings=[1.0, 1.0],
            k=5,
            max_iterations=500,
            seed=20,
            convergence_tol=0.0001,
        )

        predict = model.predict(self.frame)
        test_rows = predict.to_pandas(predict.count())

        file_name = self.get_name("gmm")
        model_path = model.export_to_mar(self.get_export_file(file_name))

        with scoring_utils.scorer(model_path, self.id()) as scorer:
            for i, row in test_rows.iterrows():
                res = scorer.score([dict(zip(["x1", "x2"], list(row[0:2])))])
                self.assertEqual(row["predicted_cluster"], res.json()["data"][0]["Score"])
Example 20
    def test_model_scoring(self):
        """Test publishing a logistic regression model"""
        model = self.context.models.classification.logistic_regression.train(
            self.frame, ["vec0", "vec1", "vec2", "vec3", "vec4"],
            'res')

        predict = model.predict(
            self.frame,
            ["vec0", "vec1", "vec2", "vec3", "vec4"])
        test_rows = predict.to_pandas(100)

        file_name = self.get_name("logistic_regression")
        model_path = model.export_to_mar(self.get_export_file(file_name))

        with scoring_utils.scorer(
                model_path, self.id()) as scorer:
            for i, row in test_rows.iterrows():
                res = scorer.score([
                    dict(zip(["vec0", "vec1", "vec2", "vec3", "vec4"],
                             list(row[0:5])))])

                self.assertEqual(
                    row["predicted_label"], res.json()["data"][0]['PredictedLabel'])
Example 21
    def test_model_scoring(self):
        """ Verify that SvmModel operates as expected.  """
        # Test set is a 3x3 square lattice of points
        #   with a fully accurate, linear, unbiased divider.

        train_lattice = ["+++",
                         "++-",
                         "---"]

        training_frame = self.lattice2frame(train_lattice)
        svm_model = self.context.models.classification.svm.train(
            training_frame, u"model_class", ["x", "y"])

        file_name = self.get_name("svm")
        model_path = svm_model.export_to_mar(self.get_export_file(file_name))

        test_rows = training_frame.to_pandas(training_frame.count())
        
        with scoring_utils.scorer(
                model_path, self.id()) as scorer:
            for _, i in test_rows.iterrows():
                res = scorer.score([dict(zip(["x", "y"], list(i[0:2])))])
                self.assertEqual(i[2], res.json()["data"][0]['Prediction'])
Example 22
    def test_model_scoring(self):
        """Test pca scoring"""
        model = self.context.models.dimreduction.pca.train(
            self.frame,
            ["X1", "X2", "X3", "X4", "X5",
            "X6", "X7", "X8", "X9", "X10"],
            False, 10)

        file_name = self.get_name("pca")
        model_path = model.export_to_mar(self.get_export_file(file_name))

        with scoring_utils.scorer(
                model_path, self.id()) as scorer:
            baseline = model.predict(self.frame, mean_centered=False)
            testvals = baseline.to_pandas(50)

            for _, i in testvals.iterrows():
                r = scorer.score(
                    [dict(zip(["X1", "X2", "X3", "X4", "X5",
                               "X6", "X7", "X8", "X9", "X10"],
                              map(lambda x: x, i[0:10])))])
                map(lambda x, y: self.assertAlmostEqual(float(x), float(y)),
                    r.json()["data"][-1]["principal_components"], i[10:])
Example 23
    def test_scoring_pipeline(self):
        """Test scoring_pipeline"""
        model = self.context.models.classification.naive_bayes.train(
            self.frame, ['f1', 'f2', 'f3'], "label")
        res = model.predict(self.frame, ['f1', 'f2', 'f3'])
        analysis = res.to_pandas()
        file_name = self.get_name("naive_bayes")
        model_path = model.export_to_mar(self.get_export_file(file_name))

        self.tarfile = "pipeline.tar"
        pipeline_funcs = os.path.join(config.root, "regression-tests",
                                      "sparktkregtests", "testcases",
                                      "scoretests", "pipeline_funcs.py")
        pipeline_config = os.path.join(config.root, "regression-tests",
                                       "sparktkregtests", "testcases",
                                       "scoretests", "pipeline_config.json")
        tar = tarfile.open(self.tarfile, "w:gz")
        tar.add(pipeline_funcs, "pipeline_funcs.py")
        tar.add(pipeline_config, "pipeline_config.json")
        tar.close()

        with scoring_utils.scorer(
                model_path, self.id(), pipeline=True,
                pipeline_filename=self.tarfile) as scorer:
            for _, i in analysis.iterrows():
                r = scorer.score(
                    [dict(zip(['f1', 'f2', 'f3'],
                    map(lambda x: int(x), (i[1:4]))))])
                self.assertEqual(
                    r.json(), i['predicted_class'])
Example 24
    def test_model_scoring(self):
        """Test pca scoring"""
        model = self.context.models.dimreduction.pca.train(
            self.frame,
            ["X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10"],
            False, 10)

        file_name = self.get_name("pca")
        model_path = model.export_to_mar(self.get_export_file(file_name))

        with scoring_utils.scorer(model_path, self.id()) as scorer:
            baseline = model.predict(self.frame, mean_centered=False)
            testvals = baseline.to_pandas(50)

            for _, i in testvals.iterrows():
                r = scorer.score([
                    dict(
                        zip([
                            "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8",
                            "X9", "X10"
                        ], map(lambda x: x, i[0:10])))
                ])
                map(lambda x, y: self.assertAlmostEqual(float(x), float(y)),
                    r.json()["data"][-1]["principal_components"], i[10:])
Example 25
    def test_revise_model(self):
        """Tests revise api in scoring engine"""
        model = self.context.models.regression.linear_regression.train(
            self.frame, ['c1', 'c2', 'c3', 'c4'], 'label')
        old_model_path = model.export_to_mar(
            self.get_export_file(self.get_name("lin_reg")))

        # create a revised model
        model_revised = self.context.models.regression.linear_regression.train(
            self.frame, ['c1', 'c2', 'c3'], 'label', max_iterations=10)
        result_revised = model_revised.predict(self.frame, ['c1', 'c2', 'c3'])
        test_rows = result_revised.to_pandas(50)
        revised_model_path = model_revised.export_to_mar(
            self.get_export_file(self.get_name("lin_reg_revised")))

        with scoring_utils.scorer(old_model_path, self.id()) as scorer:
            res = scorer.revise(revised_model_path)
            self.assertEqual(res.json()["status"], "success")

            for _, i in test_rows.iterrows():
                res = scorer.score(
                    [dict(zip(["c1", "c2", "c3"], list(i[0:3])))])
                self.assertEqual(i['predicted_value'],
                                 res.json()["data"][0]['Prediction'])