Example #1
 def test_support_for_weightCol(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8), 1.0),
                                      (1.0, Vectors.sparse(2, [], []), 1.0),
                                      (2.0, Vectors.dense(0.5, 0.5), 1.0)],
                                     ["label", "features", "weight"])
     # classifier inherits HasWeightCol
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr, weightCol="weight")
     self.assertIsNotNone(ovr.fit(df))
     # classifier doesn't inherit HasWeightCol
     fm = FMClassifier()
     ovr2 = OneVsRest(classifier=fm, weightCol="weight")
     self.assertIsNotNone(ovr2.fit(df))
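A quick way to check up front whether a base classifier will honor weightCol is to test for the HasWeightCol mixin; a minimal sketch, assuming the standard pyspark.ml.param.shared module:

from pyspark.ml.classification import FMClassifier, LogisticRegression
from pyspark.ml.param.shared import HasWeightCol

# LogisticRegression mixes in HasWeightCol, so OneVsRest forwards the weight column to it.
isinstance(LogisticRegression(), HasWeightCol)  # True
# FMClassifier does not, so OneVsRest fits its sub-models unweighted.
isinstance(FMClassifier(), HasWeightCol)  # False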
Example #2
 def test_parallelism_doesnt_change_output(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     ovrPar1 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=1)
     modelPar1 = ovrPar1.fit(df)
     ovrPar2 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=2)
     modelPar2 = ovrPar2.fit(df)
     for i, model in enumerate(modelPar1.models):
         self.assertTrue(np.allclose(model.coefficients.toArray(),
                                     modelPar2.models[i].coefficients.toArray(), atol=1E-4))
         self.assertTrue(np.allclose(model.intercept, modelPar2.models[i].intercept, atol=1E-4))
Example #3
    def test_one_vs_rest(self):
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "sample_multiclass_classification_data.txt")
        data = self.spark.read.format("libsvm").load(input_path)
        lr = LogisticRegression(maxIter=100, tol=0.0001, regParam=0.01)
        ovr = OneVsRest(classifier=lr)
        model = ovr.fit(data)

        feature_count = data.first()[1].size
        model_onnx = convert_sparkml(model, 'Sparkml OneVsRest', [
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        self.assertIsNotNone(model_onnx)

        # run the model
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                 basename="SparkmlOneVsRest")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)
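For reference, run_onnx_model is a test helper; a minimal sketch of the equivalent call with the plain onnxruntime API (onnxruntime itself is an assumption, it does not appear in the example above):

import onnxruntime

# Load the saved model and feed it the same float32 feature matrix.
sess = onnxruntime.InferenceSession(onnx_model_path)
input_name = sess.get_inputs()[0].name
output = sess.run(['prediction'], {input_name: data_np})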
Example #4
def binary_classification(tr_data=None,
                          t_data=None,
                          proc_type='train',
                          example=None):
    """Performs binary classification pipelining."""
    lr = LogisticRegression(tol=1e-6, fitIntercept=True)
    ovr = OneVsRest(classifier=lr)

    pipeline = Pipeline(stages=[ovr])

    if proc_type in ('load', 'test'):
        model = PipelineModel.load(BC_PATH)
    else:
        model = pipeline.fit(tr_data)
        if os.path.exists(BC_PATH):
            shutil.rmtree(BC_PATH)
        model.save(BC_PATH)

    if proc_type == 'test':
        result = model.transform(example).collect()

        return result, 0.
    else:
        prediction = model.transform(t_data)
        evaluator = MulticlassClassificationEvaluator(
            labelCol='label', predictionCol='prediction', metricName='f1')
        f1Score = evaluator.evaluate(prediction)

        return None, f1Score
Example #5
def test_should_log_model(dataset_binomial, dataset_multinomial):
    mlflow.pyspark.ml.autolog(log_models=True)
    lor = LogisticRegression()

    ova1 = OneVsRest(classifier=lor)
    mlor_model = lor.fit(dataset_multinomial)
    assert _should_log_model(mlor_model)

    ova1_model = ova1.fit(dataset_multinomial)
    assert _should_log_model(ova1_model)

    with mock.patch(
            "mlflow.pyspark.ml._log_model_allowlist",
        {
            "pyspark.ml.regression.LinearRegressionModel",
            "pyspark.ml.classification.OneVsRestModel"
        },
    ), mock.patch("mlflow.pyspark.ml._logger.warning") as mock_warning:
        lr = LinearRegression()
        lr_model = lr.fit(dataset_binomial)
        assert _should_log_model(lr_model)
        lor_model = lor.fit(dataset_binomial)
        assert not _should_log_model(lor_model)
        mock_warning.assert_called_once_with(
            _get_warning_msg_for_skip_log_model(lor_model))
        assert not _should_log_model(ova1_model)
Example #6
def one_vs_rest(training, test):
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(training)
    result = model.transform(test)
    accuracy = 1.0 * result.rdd.filter(
        lambda l: l.label == l.prediction).count() / test.count()
    print "OneVsRest 模型的正确率为:", accuracy
Example #7
def one_vs_rest_classifier(trainingDataFrame, classifier=None):
    if not classifier:
        classifier = LogisticRegression(regParam=0.01)
    ovr = OneVsRest(classifier=classifier)
    ovrModel = ovr.fit(trainingDataFrame)
    result = {}
    result["model"] = ovrModel
    return result
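A hedged usage sketch for this helper (trainingDataFrame and the LinearSVC swap are illustrative, not from the original):

from pyspark.ml.classification import LinearSVC

result = one_vs_rest_classifier(trainingDataFrame)               # default LogisticRegression base
result = one_vs_rest_classifier(trainingDataFrame, LinearSVC())  # any binary classifier works
predictions = result["model"].transform(trainingDataFrame)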
Example #8
 def test_output_columns(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr, parallelism=1)
     model = ovr.fit(df)
     output = model.transform(df)
     self.assertEqual(output.columns, ["label", "features", "rawPrediction", "prediction"])
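Both output column names are configurable on the estimator; a small sketch, assuming a Spark version that emits rawPrediction as the test above asserts:

# Rename the columns OneVsRestModel appends during transform().
ovr = OneVsRest(classifier=lr, predictionCol="ovrPrediction",
                rawPredictionCol="ovrRawPrediction")
model = ovr.fit(df)
model.transform(df).columns  # ends with ["ovrRawPrediction", "ovrPrediction"]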
Example #9
 def test_raw_prediction_column_is_of_vector_type(self):
     # SPARK-35142: `OneVsRestModel` outputs raw prediction as a string column
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr, parallelism=1)
     model = ovr.fit(df)
     row = model.transform(df).head()
     self.assertIsInstance(row["rawPrediction"], DenseVector)
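The raw prediction vector holds one per-class confidence, and the predicted label is its argmax; a quick check, assuming numpy is available:

import numpy as np

row = model.transform(df).head()
# The index of the largest raw score is the predicted class.
assert int(np.argmax(row["rawPrediction"].toArray())) == int(row["prediction"])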
Example #10
 def test_copy(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     ovr1 = ovr.copy({lr.maxIter: 10})
     self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
     self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
     model = ovr.fit(df)
     model1 = model.copy({model.predictionCol: "indexed"})
     self.assertEqual(model1.getPredictionCol(), "indexed")
Example #11
    def test_onevsrest(self):
        temp_path = tempfile.mkdtemp()
        df = self.spark.createDataFrame([(0.0, 0.5, Vectors.dense(1.0, 0.8)),
                                         (1.0, 0.5, Vectors.sparse(2, [], [])),
                                         (2.0, 1.0, Vectors.dense(0.5, 0.5))] *
                                        10, ["label", "wt", "features"])

        lr = LogisticRegression(maxIter=5, regParam=0.01)
        ovr = OneVsRest(classifier=lr)

        def reload_and_compare(ovr, suffix):
            model = ovr.fit(df)
            ovrPath = temp_path + "/{}".format(suffix)
            ovr.save(ovrPath)
            loadedOvr = OneVsRest.load(ovrPath)
            self._compare_pipelines(ovr, loadedOvr)
            modelPath = temp_path + "/{}Model".format(suffix)
            model.save(modelPath)
            loadedModel = OneVsRestModel.load(modelPath)
            self._compare_pipelines(model, loadedModel)

        reload_and_compare(OneVsRest(classifier=lr), "ovr")
        reload_and_compare(OneVsRest(classifier=lr).setWeightCol("wt"), "ovrw")
Example #12
def test_should_log_model(dataset_binomial, dataset_multinomial, dataset_text):
    mlflow.pyspark.ml.autolog(log_models=True)
    lor = LogisticRegression()

    ova1 = OneVsRest(classifier=lor)
    with mlflow.start_run():
        mlor_model = lor.fit(dataset_multinomial)
    assert _should_log_model(mlor_model)

    with mlflow.start_run():
        ova1_model = ova1.fit(dataset_multinomial)
    assert _should_log_model(ova1_model)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=2)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    with mlflow.start_run():
        pipeline_model = pipeline.fit(dataset_text)
    assert _should_log_model(pipeline_model)

    nested_pipeline = Pipeline(
        stages=[tokenizer, Pipeline(stages=[hashingTF, lr])])
    with mlflow.start_run():
        nested_pipeline_model = nested_pipeline.fit(dataset_text)
    assert _should_log_model(nested_pipeline_model)

    with mock.patch(
            "mlflow.pyspark.ml._log_model_allowlist",
        {
            "pyspark.ml.regression.LinearRegressionModel",
            "pyspark.ml.classification.OneVsRestModel",
            "pyspark.ml.pipeline.PipelineModel",
        },
    ), mock.patch("mlflow.pyspark.ml._logger.warning") as mock_warning:
        lr = LinearRegression()
        with mlflow.start_run():
            lr_model = lr.fit(dataset_binomial)
        assert _should_log_model(lr_model)
        with mlflow.start_run():
            lor_model = lor.fit(dataset_binomial)
        assert not _should_log_model(lor_model)
        mock_warning.assert_called_once_with(
            _get_warning_msg_for_skip_log_model(lor_model))
        assert not _should_log_model(ova1_model)
        assert not _should_log_model(pipeline_model)
        assert not _should_log_model(nested_pipeline_model)
Example #13
def SVMCV(trainingData, testData):
    start_time = time.time()

    svm = LinearSVC()
    ovr = OneVsRest(classifier=svm)

    # Parameters to tune
    paramGrid = ParamGridBuilder() \
        .addGrid(svm.regParam, [1, 0]) \
        .addGrid(svm.maxIter, [100, 1000]) \
        .build()

    cv = CrossValidator(estimator=ovr, estimatorParamMaps=paramGrid, evaluator=MulticlassClassificationEvaluator(),
                        numFolds=5)

    model = cv.fit(trainingData)

    prediction = model.transform(testData)

    result = prediction.select('features', 'label', 'prediction')

    # Compute accuracy
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(prediction)
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
    f1score = evaluator.evaluate(prediction)

    # Confusion Matrix
    class_temp = prediction.select("label").groupBy("label") \
        .count().sort('count', ascending=False).toPandas()
    class_temp = class_temp["label"].values.tolist()

    y_true = prediction.select("label")
    y_true = y_true.toPandas()

    y_pred = prediction.select("prediction")
    y_pred = y_pred.toPandas()

    cnf_matrix = confusion_matrix(y_true, y_pred, labels=class_temp)
    print("Accuracy K-Fodls: ", accuracy)
    print("F1-Score K-Fodls: ", f1score)
    print("Confusion Matrix: ")
    print(cnf_matrix)
    print("SVM K-Folds Execution TIME:", time.time() - start_time)
    return (f1score, cnf_matrix, cv)
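Beyond the returned tuple, the fitted CrossValidatorModel exposes per-candidate metrics and the refit winner; a sketch using the names above:

# model is the CrossValidatorModel produced by cv.fit(trainingData).
for params, metric in zip(paramGrid, model.avgMetrics):
    print({p.name: v for p, v in params.items()}, "->", metric)
best_ovr = model.bestModel  # the OneVsRestModel refit on the full training set
print(best_ovr.getClassifier().extractParamMap())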
Example #14
    def _run_test_save_load_nested_estimator(self, LogisticRegressionCls):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [
                (Vectors.dense([0.0]), 0.0),
                (Vectors.dense([0.4]), 1.0),
                (Vectors.dense([0.5]), 0.0),
                (Vectors.dense([0.6]), 1.0),
                (Vectors.dense([1.0]), 1.0),
            ] * 10,
            ["features", "label"],
        )

        ova = OneVsRest(classifier=LogisticRegressionCls())
        lr1 = LogisticRegressionCls().setMaxIter(100)
        lr2 = LogisticRegressionCls().setMaxIter(150)
        grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
        evaluator = MulticlassClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=ova,
                            estimatorParamMaps=grid,
                            evaluator=evaluator)
        cvModel = cv.fit(dataset)
        cvPath = temp_path + "/cv"
        cv.save(cvPath)
        loadedCV = CrossValidator.load(cvPath)
        self.assert_param_maps_equal(loadedCV.getEstimatorParamMaps(), grid)
        self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
        self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)

        originalParamMap = cv.getEstimatorParamMaps()
        loadedParamMap = loadedCV.getEstimatorParamMaps()
        for i, param in enumerate(loadedParamMap):
            for p in param:
                if p.name == "classifier":
                    self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
                else:
                    self.assertEqual(param[p], originalParamMap[i][p])

        # test save/load of CrossValidatorModel
        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedModel = CrossValidatorModel.load(cvModelPath)
        self.assert_param_maps_equal(loadedModel.getEstimatorParamMaps(), grid)
        self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
Example #15
    def test_getAllNestedStages(self):
        def _check_uid_set_equal(stages, expected_stages):
            uids = set(map(lambda x: x.uid, stages))
            expected_uids = set(map(lambda x: x.uid, expected_stages))
            self.assertEqual(uids, expected_uids)

        df1 = self.spark.createDataFrame(
            [
                (Vectors.dense([1.0, 2.0]), 1.0),
                (Vectors.dense([-1.0, -2.0]), 0.0),
            ],
            ["features", "label"],
        )
        df2 = self.spark.createDataFrame(
            [
                (1.0, 2.0, 1.0),
                (1.0, 2.0, 0.0),
            ],
            ["a", "b", "label"],
        )
        vs = VectorAssembler(inputCols=["a", "b"], outputCol="features")
        lr = LogisticRegression()
        pipeline = Pipeline(stages=[vs, lr])
        pipelineModel = pipeline.fit(df2)
        ova = OneVsRest(classifier=lr)
        ovaModel = ova.fit(df1)

        ova_pipeline = Pipeline(stages=[vs, ova])
        nested_pipeline = Pipeline(stages=[ova_pipeline])

        _check_uid_set_equal(
            MetaAlgorithmReadWrite.getAllNestedStages(pipeline),
            [pipeline, vs, lr])
        _check_uid_set_equal(
            MetaAlgorithmReadWrite.getAllNestedStages(pipelineModel),
            [pipelineModel] + pipelineModel.stages,
        )
        _check_uid_set_equal(MetaAlgorithmReadWrite.getAllNestedStages(ova),
                             [ova, lr])
        _check_uid_set_equal(
            MetaAlgorithmReadWrite.getAllNestedStages(ovaModel),
            [ovaModel, lr] + ovaModel.models)
        _check_uid_set_equal(
            MetaAlgorithmReadWrite.getAllNestedStages(nested_pipeline),
            [nested_pipeline, ova_pipeline, vs, ova, lr],
        )
Example #16
def ovr_classifier(training, testing):
    from pyspark.ml.classification import LogisticRegression, OneVsRest

    # Instantiate the base classifier.
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
    # Instantiate the One-vs-Rest classifier.
    ovr = OneVsRest(classifier=lr)
    # Train the multiclass model.
    ovr_model = ovr.fit(training)

    # Test the model.
    ovr_predictions = ovr_model.transform(testing)

    # Evaluate the model.
    ovr_evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
    ovr_accuracy = ovr_evaluator.evaluate(ovr_predictions)
    return ovr_accuracy
Example #17
 def test_onevsrest(self):
     temp_path = tempfile.mkdtemp()
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))] * 10,
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     model = ovr.fit(df)
     ovrPath = temp_path + "/ovr"
     ovr.save(ovrPath)
     loadedOvr = OneVsRest.load(ovrPath)
     self._compare_pipelines(ovr, loadedOvr)
     modelPath = temp_path + "/ovrModel"
     model.save(modelPath)
     loadedModel = OneVsRestModel.load(modelPath)
     self._compare_pipelines(model, loadedModel)
Example #18
    def _run_test_save_load_nested_estimator(self, LogisticRegressionCls):
        # This tests saving and loading the trained model only.
        # Save/load for TrainValidationSplit will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [
                (Vectors.dense([0.0]), 0.0),
                (Vectors.dense([0.4]), 1.0),
                (Vectors.dense([0.5]), 0.0),
                (Vectors.dense([0.6]), 1.0),
                (Vectors.dense([1.0]), 1.0),
            ] * 10,
            ["features", "label"],
        )
        ova = OneVsRest(classifier=LogisticRegressionCls())
        lr1 = LogisticRegressionCls().setMaxIter(100)
        lr2 = LogisticRegressionCls().setMaxIter(150)
        grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
        evaluator = MulticlassClassificationEvaluator()

        tvs = TrainValidationSplit(estimator=ova,
                                   estimatorParamMaps=grid,
                                   evaluator=evaluator)
        tvsModel = tvs.fit(dataset)
        tvsPath = temp_path + "/tvs"
        tvs.save(tvsPath)
        loadedTvs = TrainValidationSplit.load(tvsPath)
        self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), grid)
        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
        self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)

        originalParamMap = tvs.getEstimatorParamMaps()
        loadedParamMap = loadedTvs.getEstimatorParamMaps()
        for i, param in enumerate(loadedParamMap):
            for p in param:
                if p.name == "classifier":
                    self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
                else:
                    self.assertEqual(param[p], originalParamMap[i][p])

        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assert_param_maps_equal(loadedModel.getEstimatorParamMaps(), grid)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
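TrainValidationSplitModel likewise records one metric per candidate in validationMetrics; a brief sketch with the names above:

# Each entry of validationMetrics lines up with one ParamMap in the grid.
for params, metric in zip(grid, tvsModel.validationMetrics):
    print({p.name: v for p, v in params.items()}, "->", metric)
print(type(tvsModel.bestModel))  # OneVsRestModel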
Example #19
def test_param_map_captures_wrapped_params(dataset_binomial):
    lor = LogisticRegression(maxIter=3, standardization=False)
    ova = OneVsRest(classifier=lor, labelCol="abcd")

    param_map = _get_instance_param_map(ova)
    assert param_map["labelCol"] == "abcd"
    assert param_map["classifier"] == lor.uid
    assert param_map[f"{lor.uid}.maxIter"] == 3
    assert not param_map[f"{lor.uid}.standardization"]
    assert param_map[f"{lor.uid}.tol"] == lor.getOrDefault(lor.tol)

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        ova.fit(dataset_binomial.withColumn("abcd", dataset_binomial.label))
    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    assert run_data.params == truncate_param_dict(
        stringify_dict_values(_get_instance_param_map(ova)))
Example #20
def binary_classification(tr_data, t_data):
    """Performs binary classification pipelining."""
    lr = LogisticRegression(tol=1e-6, fitIntercept=True)
    ovr = OneVsRest(classifier=lr)

    pipeline = Pipeline(stages=[ovr])

    model = pipeline.fit(tr_data)
    prediction = model.transform(t_data)
    # prediction.show(5)

    evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                                  predictionCol='prediction',
                                                  metricName='f1')
    f1Score = evaluator.evaluate(prediction)

    result = prediction.collect()

    return result, f1Score
Example #21
def test_meta_estimator_fit(dataset_binomial):
    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        svc = LinearSVC()
        ova = OneVsRest(classifier=svc)
        ova_model = ova.fit(dataset_binomial)

    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    assert run_data.params == truncate_param_dict(stringify_dict_values(get_params_to_log(ova)))
    assert run_data.tags == get_expected_class_tags(ova)
    assert MODEL_DIR in run_data.artifacts
    loaded_model = load_model_by_run_id(run_id)
    assert loaded_model.stages[0].uid == ova_model.uid

    # assert no nested run spawned
    query = "tags.{} = '{}'".format(MLFLOW_PARENT_RUN_ID, run.info.run_id)
    assert len(mlflow.search_runs([run.info.experiment_id])) == 1
    assert len(mlflow.search_runs([run.info.experiment_id], query)) == 0
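The artifact logged by autolog can be read back through the standard mlflow.spark API; a sketch assuming the default "model" artifact path:

import mlflow.spark

# Returns a PipelineModel wrapping the fitted OneVsRestModel stage.
loaded = mlflow.spark.load_model("runs:/{}/model".format(run_id))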
Example #22
def main():
    spark = create_session('wash_post_shootings')
    spark.sparkContext.setLogLevel('ERROR')

    try:
        df = create_df(spark)

        sh = Shootings(df)
        sh.show()

        df = sh.get_df()

        (train, test) = df.randomSplit([0.8, 0.2])

        # instantiate the base classifier.
        lr = LogisticRegression(maxIter=10,
                                tol=1E-6,
                                fitIntercept=True,
                                featuresCol='features',
                                labelCol='label')

        # instantiate the One Vs Rest Classifier.
        ovr = OneVsRest(classifier=lr)

        # train the multiclass model.
        ovrModel = ovr.fit(train)

        # score the model on test data.
        predictions = ovrModel.transform(test)

        # obtain evaluator.
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

        # compute the classification error on test data.
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g" % (1.0 - accuracy))
        print("Accuracy = %.2f" % (accuracy * 100))

    except Exception as e:
        print(e)
    finally:
        spark.sparkContext.stop()
Example #23
def main():
    parser = argparse.ArgumentParser(description='Pyspark Training')
    parser.add_argument('--data_dir',
                        type=str,
                        default='../../data',
                        help='Data location.')
    args = parser.parse_args()

    # Get the MNIST data.
    X, y = load_mnist(args.data_dir)

    # Create a train and test set split.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)

    # Convert numpy arrays to Pyspark dataframe.
    df = create_df(y_train, X_train)

    # Create a train and validation set.
    (train, val) = df.randomSplit([0.1, 0.90])

    # instantiate logistic regression with hyperparameters.
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

    # instantiate the One Vs Rest Classifier.
    ovr = OneVsRest(classifier=lr)

    # train the multiclass model
    ovrModel = ovr.fit(train)

    # score the model on test data.
    predictions = ovrModel.transform(val)

    # obtain evaluator.
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

    # compute the classification error on validation data.
    accuracy = evaluator.evaluate(predictions)
    print("Validation Accuracy = {}".format(accuracy))
    print("Validation Error = {}".format(1.0 - accuracy))
Example #24
def test_param_map_captures_wrapped_params(dataset_binomial):
    lor = LogisticRegression(maxIter=3, standardization=False)
    ova = OneVsRest(classifier=lor, labelCol="abcd")

    param_map = get_params_to_log(ova)
    assert param_map["labelCol"] == "abcd"
    assert param_map["classifier"] == "LogisticRegression"
    assert param_map["LogisticRegression.maxIter"] == 3
    assert not param_map["LogisticRegression.standardization"]
    assert param_map["LogisticRegression.tol"] == lor.getOrDefault(lor.tol)

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        ova.fit(dataset_binomial.withColumn("abcd", dataset_binomial.label))
        metadata = _gen_estimator_metadata(ova)
        estimator_info = load_json_artifact("estimator_info.json")
        assert metadata.hierarchy == estimator_info["hierarchy"]
    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    assert run_data.params == truncate_param_dict(stringify_dict_values(get_params_to_log(ova)))
Example #25
def test_get_params_to_log(spark_session):  # pylint: disable=unused-argument
    lor = LogisticRegression(maxIter=3, standardization=False)
    lor_params = get_params_to_log(lor)
    assert (
        lor_params["maxIter"] == 3
        and not lor_params["standardization"]
        and lor_params["family"] == lor.getOrDefault(lor.family)
    )

    ova = OneVsRest(classifier=lor, labelCol="abcd")
    ova_params = get_params_to_log(ova)
    assert (
        ova_params["classifier"] == "LogisticRegression"
        and ova_params["labelCol"] == "abcd"
        and ova_params["LogisticRegression.maxIter"] == 3
        and ova_params["LogisticRegression.family"] == lor.getOrDefault(lor.family)
    )

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])
    inner_pipeline = Pipeline(stages=[hashingTF, ova])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])

    pipeline_params = get_params_to_log(pipeline)
    nested_pipeline_params = get_params_to_log(nested_pipeline)

    assert pipeline_params["stages"] == ["Tokenizer", "HashingTF", "OneVsRest"]
    assert nested_pipeline_params["stages"] == ["Tokenizer", "Pipeline_2"]
    assert nested_pipeline_params["Pipeline_2.stages"] == ["HashingTF", "OneVsRest"]
    assert nested_pipeline_params["OneVsRest.classifier"] == "LogisticRegression"

    for params_to_test in [pipeline_params, nested_pipeline_params]:
        assert (
            params_to_test["Tokenizer.inputCol"] == "text"
            and params_to_test["Tokenizer.outputCol"] == "words"
        )
        assert params_to_test["HashingTF.outputCol"] == "features"
        assert params_to_test["OneVsRest.classifier"] == "LogisticRegression"
        assert params_to_test["LogisticRegression.maxIter"] == 3
Example #26
 def test_save_load(self):
     temp_path = tempfile.mkdtemp()
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     model = ovr.fit(df)
     ovrPath = temp_path + "/ovr"
     ovr.save(ovrPath)
     loadedOvr = OneVsRest.load(ovrPath)
     self.assertEqual(loadedOvr.getFeaturesCol(), ovr.getFeaturesCol())
     self.assertEqual(loadedOvr.getLabelCol(), ovr.getLabelCol())
     self.assertEqual(loadedOvr.getClassifier().uid,
                      ovr.getClassifier().uid)
     modelPath = temp_path + "/ovrModel"
     model.save(modelPath)
     loadedModel = OneVsRestModel.load(modelPath)
     for m, n in zip(model.models, loadedModel.models):
         self.assertEqual(m.uid, n.uid)
Example #27
def linear_classifier_run(df_training, df_test, whichModel, isSmallSet = False):
    # gather train and test sets, if small set include Sex for accuracy testing
    train = gather_features(df_training).select("Scaled_features", "Sex")
    if isSmallSet:
        test = gather_features(df_test).select("Scaled_features", "Sex")
    else:
        test = gather_features(df_test, isTestSet = True).select("Scaled_features")

    # select classifier
    if whichModel == 'logisticRegression':    
        classifier = LogisticRegression(labelCol="Sex", featuresCol="Scaled_features", maxIter = 10)
    elif whichModel == 'onevsall':
        lr = LogisticRegression(labelCol="Sex", featuresCol="Scaled_features", maxIter=10)
        classifier = OneVsRest(classifier=lr, labelCol="Sex", featuresCol="Scaled_features")
    elif whichModel == 'decisionTree':
        classifier = DecisionTreeClassifier(labelCol="Sex", featuresCol="Scaled_features", maxDepth = 3)
    elif whichModel == 'randomForest':
        classifier = RandomForestClassifier(labelCol="Sex", featuresCol="Scaled_features")
    elif whichModel == 'gbt':
        classifier = GBTClassifier(labelCol="Sex", featuresCol="Scaled_features", maxIter = 10)
    elif whichModel == 'nb':
        classifier = NaiveBayes(labelCol="Sex", featuresCol="Scaled_features", smoothing=1.0, modelType="multinomial")
    else: 
        raise NameError("Model must be one of the following: logisticRegression, onevsall, decisionTree, randomForest, gbt or nb")
        
    # train the model with selected classifier
    model = classifier.fit(train)
        
    # predict test set
    print('Predicting with ', whichModel)
    predict_test = model.transform(test)
    # write to a text file
    predict_test.select('prediction').rdd.map(lambda x : str(int(x[0]))).saveAsTextFile(output_file)
    print('Output has been written to txt file')
    
    # test accuracy if small set
    if isSmallSet:
        results = predict_test.select("Sex","prediction").withColumn('Success', (predict_test['Sex'] == predict_test['prediction']))
        print('Accuracy of', whichModel, '= ', results.select("Success").where("Success == true").count() / results.count())
Example #28
def test_should_log_model_with_wildcards_in_allowlist(dataset_binomial,
                                                      dataset_multinomial):
    mlflow.pyspark.ml.autolog(log_models=True)
    lor = LogisticRegression()
    ova1 = OneVsRest(classifier=lor)
    ova1_model = ova1.fit(dataset_multinomial)

    with mock.patch(
            "mlflow.pyspark.ml._log_model_allowlist",
        {
            "pyspark.ml.regression.*",
            "pyspark.ml.classification.LogisticRegressionModel",
            "pyspark.ml.feature.*",
        },
    ):
        lr = LinearRegression()
        with mlflow.start_run():
            lr_model = lr.fit(dataset_binomial)
        assert _should_log_model(lr_model)
        with mlflow.start_run():
            lor_model = lor.fit(dataset_binomial)
        assert _should_log_model(lor_model)
        assert not _should_log_model(ova1_model)
Example #29
    def train_validate(self, df):
        # Split the data into training and test sets (30% held out for testing)
        (training, test) = df.randomSplit([0.7, 0.3])

        # Configure an ML pipeline of four stages: tokenizer, remover, hashingTF, and the classifier.
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                   outputCol="filtered")
        hashingTF = HashingTF(numFeatures=10000,
                              inputCol=remover.getOutputCol(),
                              outputCol="features")

        ####################
        # lr = LogisticRegression(maxIter=10, regParam=0.001)
        # pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr])
        ####################

        # instantiate the base classifier.
        lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
        # instantiate the One Vs Rest Classifier.
        ovr = OneVsRest(classifier=lr)
        pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, ovr])
        #####################

        # Fit the pipeline to training documents.
        model = pipeline.fit(training)

        # Make predictions on test documents and print columns of interest.
        prediction = model.transform(test)

        # obtain evaluator.
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

        # compute the classification error on test data.
        accuracy = evaluator.evaluate(prediction)
        print("Test Error : " + str(1 - accuracy))
        return model
Example #30
def test_get_instance_param_map(spark_session):  # pylint: disable=unused-argument
    lor = LogisticRegression(maxIter=3, standardization=False)
    lor_params = _get_instance_param_map(lor)
    assert (lor_params["maxIter"] == 3 and not lor_params["standardization"]
            and lor_params["family"] == lor.getOrDefault(lor.family))

    ova = OneVsRest(classifier=lor, labelCol="abcd")
    ova_params = _get_instance_param_map(ova)
    assert (ova_params["classifier"] == lor.uid
            and ova_params["labelCol"] == "abcd"
            and ova_params[f"{lor.uid}.maxIter"] == 3 and
            ova_params[f"{lor.uid}.family"] == lor.getOrDefault(lor.family))

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])
    inner_pipeline = Pipeline(stages=[hashingTF, ova])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])

    pipeline_params = _get_instance_param_map(pipeline)
    nested_pipeline_params = _get_instance_param_map(nested_pipeline)

    assert pipeline_params["stages"] == [tokenizer.uid, hashingTF.uid, ova.uid]
    assert nested_pipeline_params["stages"] == [
        tokenizer.uid,
        {
            inner_pipeline.uid: [hashingTF.uid, ova.uid]
        },
    ]

    for params_to_test in [pipeline_params, nested_pipeline_params]:
        assert (params_to_test[f"{tokenizer.uid}.inputCol"] == "text"
                and params_to_test[f"{tokenizer.uid}.outputCol"] == "words")
        assert params_to_test[f"{hashingTF.uid}.outputCol"] == "features"
        assert params_to_test[f"{ova.uid}.classifier"] == lor.uid
        assert params_to_test[f"{lor.uid}.maxIter"] == 3