def test_support_for_weightCol(self):
    """OneVsRest must train with a weight column whether or not the base
    classifier itself mixes in HasWeightCol."""
    weighted_df = self.spark.createDataFrame(
        [
            (0.0, Vectors.dense(1.0, 0.8), 1.0),
            (1.0, Vectors.sparse(2, [], []), 1.0),
            (2.0, Vectors.dense(0.5, 0.5), 1.0),
        ],
        ["label", "features", "weight"],
    )
    # classifier inherits hasWeightCol
    base_lr = LogisticRegression(maxIter=5, regParam=0.01)
    weighted_ovr = OneVsRest(classifier=base_lr, weightCol="weight")
    self.assertIsNotNone(weighted_ovr.fit(weighted_df))
    # classifier doesn't inherit hasWeightCol
    fm = FMClassifier()
    fm_ovr = OneVsRest(classifier=fm, weightCol="weight")
    self.assertIsNotNone(fm_ovr.fit(weighted_df))
def test_parallelism_doesnt_change_output(self):
    """Fitting with parallelism=1 and parallelism=2 must yield the same
    per-class coefficients and intercepts (within tolerance)."""
    df = self.spark.createDataFrame(
        [
            (0.0, Vectors.dense(1.0, 0.8)),
            (1.0, Vectors.sparse(2, [], [])),
            (2.0, Vectors.dense(0.5, 0.5)),
        ],
        ["label", "features"],
    )
    serial_model = OneVsRest(
        classifier=LogisticRegression(maxIter=5, regParam=.01),
        parallelism=1).fit(df)
    parallel_model = OneVsRest(
        classifier=LogisticRegression(maxIter=5, regParam=.01),
        parallelism=2).fit(df)
    # Compare the per-class submodels pairwise.
    for serial, parallel in zip(serial_model.models, parallel_model.models):
        self.assertTrue(np.allclose(serial.coefficients.toArray(),
                                    parallel.coefficients.toArray(),
                                    atol=1E-4))
        self.assertTrue(np.allclose(serial.intercept, parallel.intercept,
                                    atol=1E-4))
def test_one_vs_rest(self):
    """Convert a fitted OneVsRest model to ONNX and verify that ONNX
    predictions match Spark's on the same data."""
    this_script_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(
        this_script_dir, "data", "sample_multiclass_classification_data.txt")
    data = self.spark.read.format("libsvm").load(input_path)
    lr = LogisticRegression(maxIter=100, tol=0.0001, regParam=0.01)
    model = OneVsRest(classifier=lr).fit(data)
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(
        model,
        'Sparkml OneVsRest',
        [('features', FloatTensorType([1, feature_count]))],
        spark_session=self.spark,
    )
    self.assertTrue(model_onnx is not None)
    # run the model: score with Spark, then replay the rows through ONNX
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlOneVsRest")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def binary_classification(tr_data=None, t_data=None, proc_type='train', example=None):
    """Performs binary classification pipelining.

    proc_type selects the mode: 'train' fits and persists a new pipeline at
    BC_PATH, 'load' reuses the saved one, and 'test' scores `example` instead
    of evaluating on `t_data`. Returns (rows, f1) with one of the two unset.
    """
    ovr = OneVsRest(classifier=LogisticRegression(tol=1e-6, fitIntercept=True))
    pipeline = Pipeline(stages=[ovr])
    if proc_type in ('load', 'test'):
        model = PipelineModel.load(BC_PATH)
    else:
        model = pipeline.fit(tr_data)
        # Replace any previously persisted model.
        if os.path.exists(BC_PATH):
            shutil.rmtree(BC_PATH)
        model.save(BC_PATH)
    if proc_type == 'test':
        return model.transform(example).collect(), 0.
    prediction = model.transform(t_data)
    evaluator = MulticlassClassificationEvaluator(
        labelCol='label', predictionCol='prediction', metricName='f1')
    return None, evaluator.evaluate(prediction)
def test_should_log_model(dataset_binomial, dataset_multinomial):
    """_should_log_model honours the model allowlist: allowlisted model
    classes are logged, everything else is skipped with a warning."""
    mlflow.pyspark.ml.autolog(log_models=True)
    lor = LogisticRegression()
    ova1 = OneVsRest(classifier=lor)
    mlor_model = lor.fit(dataset_multinomial)
    assert _should_log_model(mlor_model)
    ova1_model = ova1.fit(dataset_multinomial)
    assert _should_log_model(ova1_model)
    with mock.patch(
        "mlflow.pyspark.ml._log_model_allowlist",
        {
            "pyspark.ml.regression.LinearRegressionModel",
            "pyspark.ml.classification.OneVsRestModel"
        },
    ), mock.patch("mlflow.pyspark.ml._logger.warning") as mock_warning:
        lr = LinearRegression()
        lr_model = lr.fit(dataset_binomial)
        assert _should_log_model(lr_model)
        lor_model = lor.fit(dataset_binomial)
        assert not _should_log_model(lor_model)
        # BUG FIX: `mock_warning.called_once_with(...)` is not a Mock
        # assertion method — it is a silently-created attribute, so the
        # original line verified nothing. Use the real assertion.
        mock_warning.assert_called_once_with(
            _get_warning_msg_for_skip_log_model(lor_model))
        assert not _should_log_model(ova1_model)
def one_vs_rest(training, test):
    """Train a OneVsRest(LogisticRegression) model and print its accuracy.

    Accuracy is the fraction of test rows whose prediction equals the label.
    """
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(training)
    result = model.transform(test)
    accuracy = 1.0 * result.rdd.filter(
        lambda l: l.label == l.prediction).count() / test.count()
    # BUG FIX: the original used a Python 2 `print` statement, a SyntaxError
    # under Python 3; the message text is unchanged.
    print("OneVsRest 模型的正确率为:", accuracy)
def one_vs_rest_classifier(trainingDataFrame, classifier=None):
    """Fit a OneVsRest model on `trainingDataFrame`.

    Falls back to LogisticRegression(regParam=0.01) when no classifier is
    given. Returns a dict with the fitted model under the "model" key.
    """
    base = classifier or LogisticRegression(regParam=0.01)
    ovrModel = OneVsRest(classifier=base).fit(trainingDataFrame)
    return {"model": ovrModel}
def test_output_columns(self):
    """transform() appends rawPrediction and prediction to the input columns."""
    df = self.spark.createDataFrame(
        [
            (0.0, Vectors.dense(1.0, 0.8)),
            (1.0, Vectors.sparse(2, [], [])),
            (2.0, Vectors.dense(0.5, 0.5)),
        ],
        ["label", "features"],
    )
    base = LogisticRegression(maxIter=5, regParam=0.01)
    fitted = OneVsRest(classifier=base, parallelism=1).fit(df)
    transformed = fitted.transform(df)
    self.assertEqual(transformed.columns,
                     ["label", "features", "rawPrediction", "prediction"])
def test_raw_prediction_column_is_of_vector_type(self):
    """Regression test for SPARK-35142: `OneVsRestModel` used to output the
    raw prediction as a string column instead of a vector."""
    df = self.spark.createDataFrame(
        [
            (0.0, Vectors.dense(1.0, 0.8)),
            (1.0, Vectors.sparse(2, [], [])),
            (2.0, Vectors.dense(0.5, 0.5)),
        ],
        ["label", "features"],
    )
    base = LogisticRegression(maxIter=5, regParam=0.01)
    fitted = OneVsRest(classifier=base, parallelism=1).fit(df)
    first_row = fitted.transform(df).head()
    self.assertIsInstance(first_row["rawPrediction"], DenseVector)
def test_copy(self):
    """copy() with extra params must not mutate the original estimator/model."""
    df = self.spark.createDataFrame(
        [
            (0.0, Vectors.dense(1.0, 0.8)),
            (1.0, Vectors.sparse(2, [], [])),
            (2.0, Vectors.dense(0.5, 0.5)),
        ],
        ["label", "features"],
    )
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    ovr_copy = ovr.copy({lr.maxIter: 10})
    # The copy picks up maxIter=10; the original stays at 5.
    self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
    self.assertEqual(ovr_copy.getClassifier().getMaxIter(), 10)
    model = ovr.fit(df)
    model_copy = model.copy({model.predictionCol: "indexed"})
    self.assertEqual(model_copy.getPredictionCol(), "indexed")
def test_onevsrest(self):
    """Round-trip save/load for OneVsRest and OneVsRestModel, with and
    without a weight column."""
    temp_path = tempfile.mkdtemp()
    df = self.spark.createDataFrame(
        [
            (0.0, 0.5, Vectors.dense(1.0, 0.8)),
            (1.0, 0.5, Vectors.sparse(2, [], [])),
            (2.0, 1.0, Vectors.dense(0.5, 0.5)),
        ] * 10,
        ["label", "wt", "features"],
    )
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)

    def reload_and_compare(ovr, suffix):
        # Persist the estimator and its fitted model, reload both, and
        # compare params against the in-memory originals.
        model = ovr.fit(df)
        ovrPath = temp_path + "/{}".format(suffix)
        ovr.save(ovrPath)
        self._compare_pipelines(ovr, OneVsRest.load(ovrPath))
        modelPath = temp_path + "/{}Model".format(suffix)
        model.save(modelPath)
        self._compare_pipelines(model, OneVsRestModel.load(modelPath))

    reload_and_compare(OneVsRest(classifier=lr), "ovr")
    reload_and_compare(OneVsRest(classifier=lr).setWeightCol("wt"), "ovrw")
def test_should_log_model(dataset_binomial, dataset_multinomial, dataset_text):
    """_should_log_model honours the model allowlist, including pipeline and
    nested-pipeline models whose inner stages are not allowlisted."""
    mlflow.pyspark.ml.autolog(log_models=True)
    lor = LogisticRegression()
    ova1 = OneVsRest(classifier=lor)
    with mlflow.start_run():
        mlor_model = lor.fit(dataset_multinomial)
    assert _should_log_model(mlor_model)
    with mlflow.start_run():
        ova1_model = ova1.fit(dataset_multinomial)
    assert _should_log_model(ova1_model)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=2)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    with mlflow.start_run():
        pipeline_model = pipeline.fit(dataset_text)
    assert _should_log_model(pipeline_model)
    nested_pipeline = Pipeline(
        stages=[tokenizer, Pipeline(stages=[hashingTF, lr])])
    with mlflow.start_run():
        nested_pipeline_model = nested_pipeline.fit(dataset_text)
    assert _should_log_model(nested_pipeline_model)

    with mock.patch(
        "mlflow.pyspark.ml._log_model_allowlist",
        {
            "pyspark.ml.regression.LinearRegressionModel",
            "pyspark.ml.classification.OneVsRestModel",
            "pyspark.ml.pipeline.PipelineModel",
        },
    ), mock.patch("mlflow.pyspark.ml._logger.warning") as mock_warning:
        lr = LinearRegression()
        with mlflow.start_run():
            lr_model = lr.fit(dataset_binomial)
        assert _should_log_model(lr_model)
        with mlflow.start_run():
            lor_model = lor.fit(dataset_binomial)
        assert not _should_log_model(lor_model)
        # BUG FIX: `mock_warning.called_once_with(...)` is a silently-created
        # Mock attribute, not an assertion — the original verified nothing.
        mock_warning.assert_called_once_with(
            _get_warning_msg_for_skip_log_model(lor_model))
        assert not _should_log_model(ova1_model)
        assert not _should_log_model(pipeline_model)
        assert not _should_log_model(nested_pipeline_model)
def SVMCV(trainingData, testData):
    """Tune a OneVsRest(LinearSVC) with 5-fold cross-validation.

    Prints accuracy, F1, confusion matrix and elapsed time, and returns
    (f1score, confusion_matrix, the CrossValidator instance).
    """
    start_time = time.time()
    svm = LinearSVC()
    ovr = OneVsRest(classifier=svm)
    # Parametri su cui effettuare il tuning
    paramGrid = ParamGridBuilder() \
        .addGrid(svm.regParam, [1, 0]) \
        .addGrid(svm.maxIter, [100, 1000]) \
        .build()
    cv = CrossValidator(estimator=ovr, estimatorParamMaps=paramGrid,
                        evaluator=MulticlassClassificationEvaluator(),
                        numFolds=5)
    model = cv.fit(trainingData)
    prediction = model.transform(testData)
    # Calcolo accuracy
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(prediction)
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="f1")
    f1score = evaluator.evaluate(prediction)
    # Confusion Matrix (labels ordered by descending frequency)
    class_temp = prediction.select("label").groupBy("label") \
        .count().sort('count', ascending=False).toPandas()
    class_temp = class_temp["label"].values.tolist()
    y_true = prediction.select("label").toPandas()
    y_pred = prediction.select("prediction").toPandas()
    cnf_matrix = confusion_matrix(y_true, y_pred, labels=class_temp)
    # BUG FIX: corrected "K-Fodls" typo in the printed labels; also removed
    # the unused `result` DataFrame the original built and never read.
    print("Accuracy K-Folds: ", accuracy)
    print("F1-Score K-Folds: ", f1score)
    print("Confusion Matrix: ")
    print(cnf_matrix)
    print("SVM K-Folds Execution TIME:", time.time() - start_time)
    return (f1score, cnf_matrix, cv)
def _run_test_save_load_nested_estimator(self, LogisticRegressionCls):
    """Round-trip save/load for a CrossValidator (and its fitted model)
    whose estimator is a OneVsRest with a tuned nested classifier."""
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [
            (Vectors.dense([0.0]), 0.0),
            (Vectors.dense([0.4]), 1.0),
            (Vectors.dense([0.5]), 0.0),
            (Vectors.dense([0.6]), 1.0),
            (Vectors.dense([1.0]), 1.0),
        ] * 10,
        ["features", "label"],
    )
    ova = OneVsRest(classifier=LogisticRegressionCls())
    lr1 = LogisticRegressionCls().setMaxIter(100)
    lr2 = LogisticRegressionCls().setMaxIter(150)
    grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
    evaluator = MulticlassClassificationEvaluator()

    # test save/load of CrossValidator
    cv = CrossValidator(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    cvPath = temp_path + "/cv"
    cv.save(cvPath)
    loadedCV = CrossValidator.load(cvPath)
    self.assert_param_maps_equal(loadedCV.getEstimatorParamMaps(), grid)
    self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
    self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
    # Estimator-valued params are compared by uid; plain params by value.
    for original_map, loaded_map in zip(cv.getEstimatorParamMaps(),
                                        loadedCV.getEstimatorParamMaps()):
        for p in loaded_map:
            if p.name == "classifier":
                self.assertEqual(loaded_map[p].uid, original_map[p].uid)
            else:
                self.assertEqual(loaded_map[p], original_map[p])

    # test save/load of CrossValidatorModel
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedModel = CrossValidatorModel.load(cvModelPath)
    self.assert_param_maps_equal(loadedModel.getEstimatorParamMaps(), grid)
    self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
def test_getAllNestedStages(self):
    """MetaAlgorithmReadWrite.getAllNestedStages must enumerate every stage,
    including stages nested inside pipelines and meta-estimators."""

    def _check_uid_set_equal(stages, expected_stages):
        self.assertEqual({s.uid for s in stages},
                         {s.uid for s in expected_stages})

    df1 = self.spark.createDataFrame(
        [
            (Vectors.dense([1.0, 2.0]), 1.0),
            (Vectors.dense([-1.0, -2.0]), 0.0),
        ],
        ["features", "label"],
    )
    df2 = self.spark.createDataFrame(
        [
            (1.0, 2.0, 1.0),
            (1.0, 2.0, 0.0),
        ],
        ["a", "b", "label"],
    )
    vs = VectorAssembler(inputCols=["a", "b"], outputCol="features")
    lr = LogisticRegression()
    pipeline = Pipeline(stages=[vs, lr])
    pipelineModel = pipeline.fit(df2)
    ova = OneVsRest(classifier=lr)
    ovaModel = ova.fit(df1)
    ova_pipeline = Pipeline(stages=[vs, ova])
    nested_pipeline = Pipeline(stages=[ova_pipeline])

    _check_uid_set_equal(
        MetaAlgorithmReadWrite.getAllNestedStages(pipeline),
        [pipeline, vs, lr])
    _check_uid_set_equal(
        MetaAlgorithmReadWrite.getAllNestedStages(pipelineModel),
        [pipelineModel] + pipelineModel.stages,
    )
    _check_uid_set_equal(
        MetaAlgorithmReadWrite.getAllNestedStages(ova), [ova, lr])
    _check_uid_set_equal(
        MetaAlgorithmReadWrite.getAllNestedStages(ovaModel),
        [ovaModel, lr] + ovaModel.models)
    _check_uid_set_equal(
        MetaAlgorithmReadWrite.getAllNestedStages(nested_pipeline),
        [nested_pipeline, ova_pipeline, vs, ova, lr],
    )
def ovr_classifier(training, testing):
    """Train OneVsRest(LogisticRegression) on `training` and return its
    accuracy on `testing`."""
    from pyspark.ml.classification import LogisticRegression, OneVsRest
    # Base binary classifier wrapped by the one-vs-rest meta estimator.
    base = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
    ovr_model = OneVsRest(classifier=base).fit(training)
    # Score the held-out set and evaluate.
    ovr_predictions = ovr_model.transform(testing)
    ovr_evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
    return ovr_evaluator.evaluate(ovr_predictions)
def test_onevsrest(self):
    """Estimator and fitted model must survive a save/load round trip."""
    temp_path = tempfile.mkdtemp()
    df = self.spark.createDataFrame(
        [
            (0.0, Vectors.dense(1.0, 0.8)),
            (1.0, Vectors.sparse(2, [], [])),
            (2.0, Vectors.dense(0.5, 0.5)),
        ] * 10,
        ["label", "features"],
    )
    estimator = OneVsRest(
        classifier=LogisticRegression(maxIter=5, regParam=0.01))
    fitted = estimator.fit(df)
    # Round-trip the estimator.
    estimator_path = temp_path + "/ovr"
    estimator.save(estimator_path)
    self._compare_pipelines(estimator, OneVsRest.load(estimator_path))
    # Round-trip the fitted model.
    model_path = temp_path + "/ovrModel"
    fitted.save(model_path)
    self._compare_pipelines(fitted, OneVsRestModel.load(model_path))
def _run_test_save_load_nested_estimator(self, LogisticRegressionCls):
    """Round-trip save/load for a TrainValidationSplit (and its fitted
    model) whose estimator is a OneVsRest with a tuned nested classifier."""
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [
            (Vectors.dense([0.0]), 0.0),
            (Vectors.dense([0.4]), 1.0),
            (Vectors.dense([0.5]), 0.0),
            (Vectors.dense([0.6]), 1.0),
            (Vectors.dense([1.0]), 1.0),
        ] * 10,
        ["features", "label"],
    )
    ova = OneVsRest(classifier=LogisticRegressionCls())
    lr1 = LogisticRegressionCls().setMaxIter(100)
    lr2 = LogisticRegressionCls().setMaxIter(150)
    grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
    evaluator = MulticlassClassificationEvaluator()

    tvs = TrainValidationSplit(estimator=ova, estimatorParamMaps=grid,
                               evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), grid)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
    self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
    # Estimator-valued params are compared by uid; plain params by value.
    for original_map, loaded_map in zip(tvs.getEstimatorParamMaps(),
                                        loadedTvs.getEstimatorParamMaps()):
        for p in loaded_map:
            if p.name == "classifier":
                self.assertEqual(loaded_map[p].uid, original_map[p].uid)
            else:
                self.assertEqual(loaded_map[p], original_map[p])

    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assert_param_maps_equal(loadedModel.getEstimatorParamMaps(), grid)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def test_param_map_captures_wrapped_params(dataset_binomial):
    """The instance param map must include params of the classifier nested
    inside OneVsRest, keyed by the classifier's uid."""
    lor = LogisticRegression(maxIter=3, standardization=False)
    ova = OneVsRest(classifier=lor, labelCol="abcd")
    param_map = _get_instance_param_map(ova)
    assert param_map["labelCol"] == "abcd"
    assert param_map["classifier"] == lor.uid
    assert param_map[f"{lor.uid}.maxIter"] == 3
    assert not param_map[f"{lor.uid}.standardization"]
    assert param_map[f"{lor.uid}.tol"] == lor.getOrDefault(lor.tol)

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        ova.fit(dataset_binomial.withColumn("abcd", dataset_binomial.label))

    # The logged run params must match the (truncated, stringified) map.
    run_data = get_run_data(run.info.run_id)
    assert run_data.params == truncate_param_dict(
        stringify_dict_values(_get_instance_param_map(ova)))
def binary_classification(tr_data, t_data):
    """Performs binary classification pipelining.

    Fits a OneVsRest(LogisticRegression) pipeline on `tr_data`, scores
    `t_data`, and returns (prediction rows, f1 score).
    """
    ovr = OneVsRest(classifier=LogisticRegression(tol=1e-6, fitIntercept=True))
    model = Pipeline(stages=[ovr]).fit(tr_data)
    prediction = model.transform(t_data)
    # prediction.show(5)
    evaluator = MulticlassClassificationEvaluator(
        labelCol='label', predictionCol='prediction', metricName='f1')
    f1Score = evaluator.evaluate(prediction)
    return prediction.collect(), f1Score
def test_meta_estimator_fit(dataset_binomial):
    """Autologging a meta estimator logs params, tags and the model on the
    parent run, and spawns no nested runs."""
    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        ova = OneVsRest(classifier=LinearSVC())
        ova_model = ova.fit(dataset_binomial)
    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    expected_params = truncate_param_dict(
        stringify_dict_values(get_params_to_log(ova)))
    assert run_data.params == expected_params
    assert run_data.tags == get_expected_class_tags(ova)
    assert MODEL_DIR in run_data.artifacts
    loaded_model = load_model_by_run_id(run_id)
    assert loaded_model.stages[0].uid == ova_model.uid
    # assert no nested run spawned
    query = "tags.{} = '{}'".format(MLFLOW_PARENT_RUN_ID, run.info.run_id)
    assert len(mlflow.search_runs([run.info.experiment_id])) == 1
    assert len(mlflow.search_runs([run.info.experiment_id], query)) == 0
def main():
    """Entry point: train and evaluate a OneVsRest model on the Washington
    Post shootings dataset, always stopping the Spark context on exit."""
    spark = create_session('wash_post_shootings')
    spark.sparkContext.setLogLevel('ERROR')
    try:
        sh = Shootings(create_df(spark))
        sh.show()
        df = sh.get_df()
        train, test = df.randomSplit([0.8, 0.2])
        # instantiate the base classifier.
        base = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True,
                                  featuresCol='features', labelCol='label')
        # instantiate the One Vs Rest Classifier and train the model.
        fitted = OneVsRest(classifier=base).fit(train)
        # score the model on test data.
        predictions = fitted.transform(test)
        # compute the classification error on test data.
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g" % (1.0 - accuracy))
        print("Accuracy = %.2f" % (accuracy * 100))
    except Exception as e:
        # Best-effort reporting; shutdown still runs via finally.
        print(e)
    finally:
        spark.sparkContext.stop()
def main():
    """Entry point: train OneVsRest(LogisticRegression) on MNIST and report
    validation accuracy/error."""
    parser = argparse.ArgumentParser(description='Pyspark Training')
    parser.add_argument('--data_dir', type=str, default='../../data',
                        help='Data location.')
    args = parser.parse_args()
    # Get the MNIST data.
    X, y = load_mnist(args.data_dir)
    # Create a train and test set split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    # Convert numpy arrays to Pyspark dataframe, then split off a
    # validation set.
    df = create_df(y_train, X_train)
    train, val = df.randomSplit([0.1, 0.90])
    # Base classifier with hyperparameters, wrapped in One Vs Rest.
    base = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
    fitted = OneVsRest(classifier=base).fit(train)
    # Score the model on the validation set.
    predictions = fitted.transform(val)
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Validation Accuracy = {}".format(accuracy))
    print("Validation Error = {}".format(1.0 - accuracy))
def test_param_map_captures_wrapped_params(dataset_binomial):
    """Params of the classifier nested inside OneVsRest must appear in the
    logged param map, keyed by the classifier's class name."""
    lor = LogisticRegression(maxIter=3, standardization=False)
    ova = OneVsRest(classifier=lor, labelCol="abcd")
    param_map = get_params_to_log(ova)
    assert param_map["labelCol"] == "abcd"
    assert param_map["classifier"] == "LogisticRegression"
    assert param_map["LogisticRegression.maxIter"] == 3
    assert not param_map["LogisticRegression.standardization"]
    assert param_map["LogisticRegression.tol"] == lor.getOrDefault(lor.tol)

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        ova.fit(dataset_binomial.withColumn("abcd", dataset_binomial.label))
        # The logged estimator hierarchy must match the generated metadata.
        metadata = _gen_estimator_metadata(ova)
        estimator_info = load_json_artifact("estimator_info.json")
        assert metadata.hierarchy == estimator_info["hierarchy"]
    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    assert run_data.params == truncate_param_dict(
        stringify_dict_values(get_params_to_log(ova)))
def test_get_params_to_log(spark_session):  # pylint: disable=unused-argument
    """get_params_to_log flattens nested estimator params, prefixing each
    with the owning stage's class name."""
    lor = LogisticRegression(maxIter=3, standardization=False)
    lor_params = get_params_to_log(lor)
    assert lor_params["maxIter"] == 3
    assert not lor_params["standardization"]
    assert lor_params["family"] == lor.getOrDefault(lor.family)

    ova = OneVsRest(classifier=lor, labelCol="abcd")
    ova_params = get_params_to_log(ova)
    assert ova_params["classifier"] == "LogisticRegression"
    assert ova_params["labelCol"] == "abcd"
    assert ova_params["LogisticRegression.maxIter"] == 3
    assert ova_params["LogisticRegression.family"] == lor.getOrDefault(lor.family)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])
    inner_pipeline = Pipeline(stages=[hashingTF, ova])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])
    pipeline_params = get_params_to_log(pipeline)
    nested_pipeline_params = get_params_to_log(nested_pipeline)
    assert pipeline_params["stages"] == ["Tokenizer", "HashingTF", "OneVsRest"]
    assert nested_pipeline_params["stages"] == ["Tokenizer", "Pipeline_2"]
    assert nested_pipeline_params["Pipeline_2.stages"] == ["HashingTF", "OneVsRest"]
    assert nested_pipeline_params["OneVsRest.classifier"] == "LogisticRegression"
    # Stage params common to both the flat and nested pipeline.
    for params_to_test in [pipeline_params, nested_pipeline_params]:
        assert params_to_test["Tokenizer.inputCol"] == "text"
        assert params_to_test["Tokenizer.outputCol"] == "words"
        assert params_to_test["HashingTF.outputCol"] == "features"
        assert params_to_test["OneVsRest.classifier"] == "LogisticRegression"
        assert params_to_test["LogisticRegression.maxIter"] == 3
def test_save_load(self):
    """OneVsRest estimator params and model submodel uids survive a
    save/load round trip."""
    temp_path = tempfile.mkdtemp()
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame(
        [
            (0.0, Vectors.dense(1.0, 0.8)),
            (1.0, Vectors.sparse(2, [], [])),
            (2.0, Vectors.dense(0.5, 0.5)),
        ],
        ["label", "features"],
    )
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(df)
    # Estimator round trip: key params preserved.
    ovrPath = temp_path + "/ovr"
    ovr.save(ovrPath)
    loadedOvr = OneVsRest.load(ovrPath)
    self.assertEqual(loadedOvr.getFeaturesCol(), ovr.getFeaturesCol())
    self.assertEqual(loadedOvr.getLabelCol(), ovr.getLabelCol())
    self.assertEqual(loadedOvr.getClassifier().uid, ovr.getClassifier().uid)
    # Model round trip: per-class submodels keep their uids.
    modelPath = temp_path + "/ovrModel"
    model.save(modelPath)
    loadedModel = OneVsRestModel.load(modelPath)
    for saved, loaded in zip(model.models, loadedModel.models):
        self.assertEqual(saved.uid, loaded.uid)
def linear_classifier_run(df_training, df_test, whichModel, isSmallSet=False):
    """Train the selected classifier on gathered features, write test-set
    predictions to `output_file`, and (for small sets) print accuracy.

    whichModel: one of 'logisticRegression', 'onevsall', 'decisionTree',
    'randomForest', 'gbt', 'nb'. Raises NameError for anything else.
    """
    # gather train and test sets, if small set include Sex for accuracy testing
    train = gather_features(df_training).select("Scaled_features", "Sex")
    if isSmallSet == True:
        test = gather_features(df_test).select("Scaled_features", "Sex")
    else:
        test = gather_features(df_test, isTestSet=True).select("Scaled_features")
    # select classifier
    if whichModel == 'logisticRegression':
        classifier = LogisticRegression(labelCol="Sex", featuresCol="Scaled_features", maxIter=10)
    elif whichModel == 'onevsall':
        lr = LogisticRegression(labelCol="Sex", featuresCol="Scaled_features", maxIter=10)
        classifier = OneVsRest(classifier=lr, labelCol="Sex", featuresCol="Scaled_features")
    elif whichModel == 'decisionTree':
        classifier = DecisionTreeClassifier(labelCol="Sex", featuresCol="Scaled_features", maxDepth=3)
    elif whichModel == 'randomForest':
        # BUG FIX: this branch previously built a second DecisionTreeClassifier,
        # so 'randomForest' never trained a random forest.
        from pyspark.ml.classification import RandomForestClassifier
        classifier = RandomForestClassifier(labelCol="Sex", featuresCol="Scaled_features")
    elif whichModel == 'gbt':
        classifier = GBTClassifier(labelCol="Sex", featuresCol="Scaled_features", maxIter=10)
    elif whichModel == 'nb':
        classifier = NaiveBayes(labelCol="Sex", featuresCol="Scaled_features", smoothing=1.0, modelType="multinomial")
    else:
        raise NameError("Model must be one of the following: logisticRegression, onevsall, decisionTree, randomForest, gbt or nb")
    # train the model with selected classifier
    model = classifier.fit(train)
    # predict test set
    # BUG FIX: report the model actually selected (the function's own
    # parameter) instead of the unrelated global `input_linear_method`.
    print('Predicting with ', whichModel)
    predict_test = model.transform(test)
    # write to a text file
    predict_test.select('prediction').rdd.map(lambda x: str(int(x[0]))).saveAsTextFile(output_file)
    print('Output has been written to txt file')
    # test accuracy if small set
    if isSmallSet == True:
        results = predict_test.select("Sex", "prediction").withColumn(
            'Success', (predict_test['Sex'] == predict_test['prediction']))
        print('Accuracy of', whichModel, '= ',
              results.select("Success").where("Success == true").count() /
              results.count())
def test_should_log_model_with_wildcards_in_allowlist(dataset_binomial, dataset_multinomial):
    """Wildcard allowlist entries ("pkg.*") match any model class in that
    package; non-matching models are not logged."""
    mlflow.pyspark.ml.autolog(log_models=True)
    lor = LogisticRegression()
    ova1 = OneVsRest(classifier=lor)
    ova1_model = ova1.fit(dataset_multinomial)
    allowlist = {
        "pyspark.ml.regression.*",
        "pyspark.ml.classification.LogisticRegressionModel",
        "pyspark.ml.feature.*",
    }
    with mock.patch("mlflow.pyspark.ml._log_model_allowlist", allowlist):
        lr = LinearRegression()
        with mlflow.start_run():
            lr_model = lr.fit(dataset_binomial)
        # Matched by the regression wildcard.
        assert _should_log_model(lr_model)
        with mlflow.start_run():
            lor_model = lor.fit(dataset_binomial)
        # Matched by the exact class entry.
        assert _should_log_model(lor_model)
        # OneVsRestModel matches no entry.
        assert not _should_log_model(ova1_model)
def train_validate(self, df):
    """Fit a text-classification pipeline (tokenize -> stopword removal ->
    hashing TF -> OneVsRest(LogisticRegression)) on a 70/30 split, print the
    test error, and return the fitted pipeline model."""
    # Split the data into training and test sets (30% held out for testing)
    training, test = df.randomSplit([0.7, 0.3])
    # Feature stages: tokenizer -> stop-word remover -> hashing TF.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                               outputCol="filtered")
    hashingTF = HashingTF(numFeatures=10000,
                          inputCol=remover.getOutputCol(),
                          outputCol="features")
    # instantiate the base classifier and the One Vs Rest wrapper.
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
    ovr = OneVsRest(classifier=lr)
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, ovr])
    # Fit the pipeline to training documents.
    model = pipeline.fit(training)
    # Make predictions on test documents.
    prediction = model.transform(test)
    # compute the classification error on test data.
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    accuracy = evaluator.evaluate(prediction)
    print("Test Error : " + str(1 - accuracy))
    return model
def test_get_instance_param_map(spark_session):  # pylint: disable=unused-argument
    """_get_instance_param_map flattens nested estimator params, keying each
    by the owning stage's uid."""
    lor = LogisticRegression(maxIter=3, standardization=False)
    lor_params = _get_instance_param_map(lor)
    assert lor_params["maxIter"] == 3
    assert not lor_params["standardization"]
    assert lor_params["family"] == lor.getOrDefault(lor.family)

    ova = OneVsRest(classifier=lor, labelCol="abcd")
    ova_params = _get_instance_param_map(ova)
    assert ova_params["classifier"] == lor.uid
    assert ova_params["labelCol"] == "abcd"
    assert ova_params[f"{lor.uid}.maxIter"] == 3
    assert ova_params[f"{lor.uid}.family"] == lor.getOrDefault(lor.family)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])
    inner_pipeline = Pipeline(stages=[hashingTF, ova])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])
    pipeline_params = _get_instance_param_map(pipeline)
    nested_pipeline_params = _get_instance_param_map(nested_pipeline)
    assert pipeline_params["stages"] == [tokenizer.uid, hashingTF.uid, ova.uid]
    # Nested pipelines are represented as {pipeline_uid: [stage uids]}.
    assert nested_pipeline_params["stages"] == [
        tokenizer.uid,
        {inner_pipeline.uid: [hashingTF.uid, ova.uid]},
    ]
    # Stage params common to both the flat and nested pipeline.
    for params_to_test in [pipeline_params, nested_pipeline_params]:
        assert params_to_test[f"{tokenizer.uid}.inputCol"] == "text"
        assert params_to_test[f"{tokenizer.uid}.outputCol"] == "words"
        assert params_to_test[f"{hashingTF.uid}.outputCol"] == "features"
        assert params_to_test[f"{ova.uid}.classifier"] == lor.uid
        assert params_to_test[f"{lor.uid}.maxIter"] == 3