def test_output_columns(self):
    """Check the column list produced by OneVsRestModel.transform.

    The transformed frame is expected to hold the two input columns
    followed by ``rawPrediction`` and ``prediction``, in that order.
    """
    rows = [
        (0.0, Vectors.dense(1.0, 0.8)),
        (1.0, Vectors.sparse(2, [], [])),
        (2.0, Vectors.dense(0.5, 0.5)),
    ]
    dataset = self.spark.createDataFrame(rows, ["label", "features"])

    base_classifier = LogisticRegression(maxIter=5, regParam=0.01)
    estimator = OneVsRest(classifier=base_classifier, parallelism=1)
    transformed = estimator.fit(dataset).transform(dataset)

    expected_columns = ["label", "features", "rawPrediction", "prediction"]
    self.assertEqual(transformed.columns, expected_columns)
def test_copy(self):
    """Check copy() with extra params on a OneVsRest estimator and its model.

    The estimator copy must see the overridden ``maxIter`` while the
    original keeps its own value; the model copy must expose the
    overridden prediction column.
    """
    rows = [
        (0.0, Vectors.dense(1.0, 0.8)),
        (1.0, Vectors.sparse(2, [], [])),
        (2.0, Vectors.dense(0.5, 0.5)),
    ]
    dataset = self.spark.createDataFrame(rows, ["label", "features"])

    base_classifier = LogisticRegression(maxIter=5, regParam=0.01)
    estimator = OneVsRest(classifier=base_classifier)
    estimator_copy = estimator.copy({base_classifier.maxIter: 10})
    self.assertEqual(estimator.getClassifier().getMaxIter(), 5)
    self.assertEqual(estimator_copy.getClassifier().getMaxIter(), 10)

    fitted = estimator.fit(dataset)
    fitted_copy = fitted.copy({fitted.predictionCol: "indexed"})
    self.assertEqual(fitted_copy.getPredictionCol(), "indexed")
def test_support_for_weightCol(self):
    """Fit OneVsRest with ``weightCol`` set for two different base classifiers.

    Both fits are only required to return a non-None model.
    """
    rows = [
        (0.0, Vectors.dense(1.0, 0.8), 1.0),
        (1.0, Vectors.sparse(2, [], []), 1.0),
        (2.0, Vectors.dense(0.5, 0.5), 1.0),
    ]
    dataset = self.spark.createDataFrame(rows, ["label", "features", "weight"])

    # Base classifier that inherits hasWeightCol.
    logistic = LogisticRegression(maxIter=5, regParam=0.01)
    weighted_ovr = OneVsRest(classifier=logistic, weightCol="weight")
    self.assertIsNotNone(weighted_ovr.fit(dataset))

    # Base classifier that does not inherit hasWeightCol.
    tree = DecisionTreeClassifier()
    tree_ovr = OneVsRest(classifier=tree, weightCol="weight")
    self.assertIsNotNone(tree_ovr.fit(dataset))
def test_parallelism_doesnt_change_output(self):
    """Fit the same OneVsRest setup with parallelism 1 and 2 and compare.

    Every per-class sub-model must have coefficients and intercept that
    agree between the two fits within an absolute tolerance of 1e-4.
    """
    rows = [
        (0.0, Vectors.dense(1.0, 0.8)),
        (1.0, Vectors.sparse(2, [], [])),
        (2.0, Vectors.dense(0.5, 0.5)),
    ]
    dataset = self.spark.createDataFrame(rows, ["label", "features"])

    def fit_with(parallelism):
        # A fresh classifier per fit, exactly as many objects as before.
        classifier = LogisticRegression(maxIter=5, regParam=.01)
        return OneVsRest(classifier=classifier, parallelism=parallelism).fit(dataset)

    serial_model = fit_with(1)
    parallel_model = fit_with(2)

    # Index into the second model so a length mismatch still raises.
    for idx, serial_sub in enumerate(serial_model.models):
        serial_coefs = serial_sub.coefficients.toArray()
        parallel_coefs = parallel_model.models[idx].coefficients.toArray()
        self.assertTrue(np.allclose(serial_coefs, parallel_coefs, atol=1E-4))

        parallel_intercept = parallel_model.models[idx].intercept
        self.assertTrue(np.allclose(serial_sub.intercept, parallel_intercept, atol=1E-4))
def test_onevsrest(self):
    """Round-trip a OneVsRest estimator and its fitted model through save/load.

    Both the estimator and the model are written below a fresh temporary
    directory, loaded back, and compared with ``self._compare_pipelines``.
    The temporary directory is removed when the test finishes, whether it
    passes or fails.
    """
    # Local import so this method does not depend on the file's import block.
    import shutil

    temp_path = tempfile.mkdtemp()
    try:
        df = self.spark.createDataFrame(
            [
                (0.0, Vectors.dense(1.0, 0.8)),
                (1.0, Vectors.sparse(2, [], [])),
                (2.0, Vectors.dense(0.5, 0.5)),
            ] * 10,
            ["label", "features"],
        )
        lr = LogisticRegression(maxIter=5, regParam=0.01)
        ovr = OneVsRest(classifier=lr)
        model = ovr.fit(df)

        ovrPath = temp_path + "/ovr"
        ovr.save(ovrPath)
        loadedOvr = OneVsRest.load(ovrPath)
        self._compare_pipelines(ovr, loadedOvr)

        modelPath = temp_path + "/ovrModel"
        model.save(modelPath)
        loadedModel = OneVsRestModel.load(modelPath)
        self._compare_pipelines(model, loadedModel)
    finally:
        # mkdtemp() leaves deletion to the caller; without this every run
        # leaks a directory holding the saved estimator and model.
        shutil.rmtree(temp_path, ignore_errors=True)
def test_save_load(self):
    """Save and reload a OneVsRest estimator and model, then compare them.

    The reloaded estimator must agree with the original on features column,
    label column and classifier uid; the reloaded model must carry
    sub-models with the same uids. The temporary directory used for the
    round trip is removed when the test finishes, whether it passes or
    fails.
    """
    # Local import so this method does not depend on the file's import block.
    import shutil

    temp_path = tempfile.mkdtemp()
    try:
        sqlContext = SQLContext(self.sc)
        df = sqlContext.createDataFrame(
            [
                (0.0, Vectors.dense(1.0, 0.8)),
                (1.0, Vectors.sparse(2, [], [])),
                (2.0, Vectors.dense(0.5, 0.5)),
            ],
            ["label", "features"],
        )
        lr = LogisticRegression(maxIter=5, regParam=0.01)
        ovr = OneVsRest(classifier=lr)
        model = ovr.fit(df)

        ovrPath = temp_path + "/ovr"
        ovr.save(ovrPath)
        loadedOvr = OneVsRest.load(ovrPath)
        self.assertEqual(loadedOvr.getFeaturesCol(), ovr.getFeaturesCol())
        self.assertEqual(loadedOvr.getLabelCol(), ovr.getLabelCol())
        self.assertEqual(loadedOvr.getClassifier().uid, ovr.getClassifier().uid)

        modelPath = temp_path + "/ovrModel"
        model.save(modelPath)
        loadedModel = OneVsRestModel.load(modelPath)
        for m, n in zip(model.models, loadedModel.models):
            self.assertEqual(m.uid, n.uid)
    finally:
        # mkdtemp() leaves deletion to the caller; without this every run
        # leaks a directory holding the saved estimator and model.
        shutil.rmtree(temp_path, ignore_errors=True)
.appName("PythonOneVsRestExample") \ .getOrCreate() # $example on$ # load data file. inputData = spark.read.format("libsvm") \ .load("data/mllib/sample_multiclass_classification_data.txt") # generate the train/test split. (train, test) = inputData.randomSplit([0.8, 0.2]) # instantiate the base classifier. lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True) # instantiate the One Vs Rest Classifier. ovr = OneVsRest(classifier=lr) # train the multiclass model. ovrModel = ovr.fit(train) # score the model on test data. predictions = ovrModel.transform(test) # obtain evaluator. evaluator = MulticlassClassificationEvaluator(metricName="precision") # compute the classification error on test data. precision = evaluator.evaluate(predictions) print("Test Error : " + str(1 - precision)) # $example off$
# Read the data from HDFS.
path = '/home/mnist-test/data/train'
df = spark.read.csv(path, header=True, inferSchema=True)

# Drop rows containing null values.
df = df.dropna()

# Convert the data into "features" / "labels" columns.
rf = RFormula(
    formula="label ~ .",
    featuresCol="features",
    labelCol="labels",
)
rf_model = rf.fit(df)
df = rf_model.transform(df).select(["features", "labels"])

# Split the dataset into training and test parts.
train_df, test_df = df.randomSplit([0.8, 0.2])

# Build the GBDT model.
gbdt = GBTClassifier(
    maxIter=10,
    maxDepth=3,
    labelCol="labels",
    featuresCol="features",
)

# Build the One Vs Rest classifier around it, fit, and predict.
ovr = OneVsRest(classifier=gbdt)
ovr_model = ovr.fit(train_df)
predict_res = ovr_model.transform(test_df)

# Evaluate the predictions.
evaluator = MulticlassClassificationEvaluator(
    labelCol="labels",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predict_res)
if c not in habitats:
    inputCols.append(c)
print("input ", len(inputCols))

# Go through pandas to obtain a Spark DataFrame.
pdf = pd.DataFrame(df2)
df = spark.createDataFrame(pdf)
df.show()

# Assemble the collected input columns into one "features" vector and keep
# it together with the habitat_* columns.
featureassembler = VectorAssembler(inputCols=inputCols, outputCol="features")
XY = featureassembler.transform(df).select(
    "features",
    "habitat_d",
    "habitat_g",
    "habitat_l",
    "habitat_m",
    "habitat_p",
    "habitat_u",
    "habitat_w",
)
XY.show()

train, test = XY.randomSplit([.8, .2])

# Train and score one OneVsRest model per habitat column, using that column
# as the label.
accuracyList = []
for habitat in habitats:
    _train = train.withColumnRenamed(habitat, "label")
    _test = (
        test.withColumnRenamed(habitat, "label")
        .select("features", "label")
    )

    lr = LogisticRegression(maxIter=30, tol=1E-6, fitIntercept=True)
    ovr = OneVsRest(classifier=lr)
    ovrModel = ovr.fit(_train)
    predictions = ovrModel.transform(_test)

    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    accuracyList.append(accuracy)
    print("Test Error = %g" % (1.0 - accuracy))
def test_gen_estimator_metadata(spark_session):  # pylint: disable=unused-argument
    """Check the metadata generated for a nested pipeline with a CrossValidator.

    The top-level pipeline holds two tokenizer/hashing sub-pipelines and a
    CrossValidator tuning a third sub-pipeline (VectorAssembler + OneVsRest).
    The test pins the expected name hierarchy, the uid -> indexed-name map,
    and that the first param-search estimator is the CrossValidator.
    """
    first_tokenizer = Tokenizer(inputCol="text1", outputCol="words1")
    first_hashing = HashingTF(
        inputCol=first_tokenizer.getOutputCol(), outputCol="features1"
    )
    second_tokenizer = Tokenizer(inputCol="text2", outputCol="words2")
    second_hashing = HashingTF(
        inputCol=second_tokenizer.getOutputCol(), outputCol="features2"
    )
    assembler = VectorAssembler(
        inputCols=["features1", "features2"], outputCol="features"
    )
    logistic = LogisticRegression(maxIter=10)
    one_vs_rest = OneVsRest(classifier=logistic)

    text_pipeline_a = Pipeline(stages=[first_tokenizer, first_hashing])
    text_pipeline_b = Pipeline(stages=[second_tokenizer, second_hashing])
    tuned_pipeline = Pipeline(stages=[assembler, one_vs_rest])

    grid_builder = ParamGridBuilder()
    grid_builder = grid_builder.addGrid(logistic.maxIter, [10, 20])
    grid_builder = grid_builder.addGrid(logistic.regParam, [0.1, 0.01])
    param_grid = grid_builder.build()

    evaluator = MulticlassClassificationEvaluator()
    cross_validator = CrossValidator(
        estimator=tuned_pipeline,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=2,
    )
    root_pipeline = Pipeline(
        stages=[text_pipeline_a, text_pipeline_b, cross_validator]
    )

    metadata = _gen_estimator_metadata(root_pipeline)

    # Expected hierarchy, assembled bottom-up from its three branches.
    expected_branch_a = {
        "name": "Pipeline_2",
        "stages": [{"name": "Tokenizer_1"}, {"name": "HashingTF_1"}],
    }
    expected_branch_b = {
        "name": "Pipeline_3",
        "stages": [{"name": "Tokenizer_2"}, {"name": "HashingTF_2"}],
    }
    expected_cross_validator = {
        "name": "CrossValidator",
        "evaluator": {"name": "MulticlassClassificationEvaluator"},
        "tuned_estimator": {
            "name": "Pipeline_4",
            "stages": [
                {"name": "VectorAssembler"},
                {
                    "name": "OneVsRest",
                    "classifier": {"name": "LogisticRegression"},
                },
            ],
        },
    }
    expected_hierarchy = {
        "name": "Pipeline_1",
        "stages": [
            expected_branch_a,
            expected_branch_b,
            expected_cross_validator,
        ],
    }
    assert metadata.hierarchy == expected_hierarchy

    expected_names = {
        root_pipeline.uid: "Pipeline_1",
        text_pipeline_a.uid: "Pipeline_2",
        first_tokenizer.uid: "Tokenizer_1",
        first_hashing.uid: "HashingTF_1",
        text_pipeline_b.uid: "Pipeline_3",
        second_tokenizer.uid: "Tokenizer_2",
        second_hashing.uid: "HashingTF_2",
        cross_validator.uid: "CrossValidator",
        tuned_pipeline.uid: "Pipeline_4",
        assembler.uid: "VectorAssembler",
        one_vs_rest.uid: "OneVsRest",
        logistic.uid: "LogisticRegression",
        evaluator.uid: "MulticlassClassificationEvaluator",
    }
    assert metadata.uid_to_indexed_name_map == expected_names

    first_search_uid = metadata.param_search_estimators[0].uid
    assert metadata.uid_to_indexed_name_map[first_search_uid] == "CrossValidator"
i = 0
df_list_test = []
for element in x_train:
    # One (label, dense feature vector) row per training sample; `i` walks
    # y_train in step with x_train.
    tup = (int(y_train[i]), Vectors.dense(element))
    df_list.append(tup)
    i += 1

Train_sparkframe = spark.createDataFrame(
    df_list,
    schema=['label', 'features'],
)

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import OneVsRest, LinearSVC

lr = LogisticRegression()
ovr = OneVsRest(classifier=lr)

# Print wall-clock timestamps before and after the fit.
print(datetime.datetime.now())
mlrModel = ovr.fit(Train_sparkframe)
print(datetime.datetime.now())


# In[ ]:
def test_save_load_pipeline_estimator(self):
    """Save and reload TrainValidationSplit models whose estimator is a Pipeline.

    Two estimators are tuned over the same param grid: a flat pipeline
    (tokenizer, hashingTF, OneVsRest) and a nested one where hashingTF and
    OneVsRest sit in an inner pipeline. For each, the reloaded best model
    must match the original on uid, stage count and per-stage uid. The
    temporary directory used for saving is removed when the test finishes,
    whether it passes or fails.
    """
    # Local import so this method does not depend on the file's import block.
    import shutil

    temp_path = tempfile.mkdtemp()
    try:
        training = self.spark.createDataFrame([
            (0, "a b c d e spark", 1.0),
            (1, "b d", 0.0),
            (2, "spark f g h", 1.0),
            (3, "hadoop mapreduce", 0.0),
            (4, "b spark who", 1.0),
            (5, "g d a y", 0.0),
            (6, "spark fly", 1.0),
            (7, "was mapreduce", 0.0),
        ], ["id", "text", "label"])

        # Configure an ML pipeline, which consists of three stages:
        # tokenizer, hashingTF, and ova (OneVsRest).
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

        ova = OneVsRest(classifier=LogisticRegression())
        lr1 = LogisticRegression().setMaxIter(5)
        lr2 = LogisticRegression().setMaxIter(10)

        pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])

        # The grid varies both a plain param and the OneVsRest base classifier.
        paramGrid = ParamGridBuilder() \
            .addGrid(hashingTF.numFeatures, [10, 100]) \
            .addGrid(ova.classifier, [lr1, lr2]) \
            .build()

        tvs = TrainValidationSplit(
            estimator=pipeline,
            estimatorParamMaps=paramGrid,
            evaluator=MulticlassClassificationEvaluator())

        # Run train validation split, and choose the best set of parameters.
        tvsModel = tvs.fit(training)

        # Test save/load of TrainValidationSplitModel.
        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
        self.assertEqual(len(loadedModel.bestModel.stages), len(tvsModel.bestModel.stages))
        for loadedStage, originalStage in zip(loadedModel.bestModel.stages,
                                              tvsModel.bestModel.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)

        # Test nested pipeline.
        nested_pipeline = Pipeline(
            stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
        tvs2 = TrainValidationSplit(
            estimator=nested_pipeline,
            estimatorParamMaps=paramGrid,
            evaluator=MulticlassClassificationEvaluator())

        # Run train validation split, and choose the best set of parameters.
        tvsModel2 = tvs2.fit(training)

        # Test save/load of TrainValidationSplitModel.
        tvsModelPath2 = temp_path + "/tvsModel2"
        tvsModel2.save(tvsModelPath2)
        loadedModel2 = TrainValidationSplitModel.load(tvsModelPath2)
        self.assertEqual(loadedModel2.bestModel.uid, tvsModel2.bestModel.uid)

        # Stage 1 of the best model is the fitted inner pipeline.
        loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1]
        original_nested_pipeline_model = tvsModel2.bestModel.stages[1]
        self.assertEqual(loaded_nested_pipeline_model.uid,
                         original_nested_pipeline_model.uid)
        self.assertEqual(len(loaded_nested_pipeline_model.stages),
                         len(original_nested_pipeline_model.stages))
        for loadedStage, originalStage in zip(
                loaded_nested_pipeline_model.stages,
                original_nested_pipeline_model.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)
    finally:
        # mkdtemp() leaves deletion to the caller; without this every run
        # leaks a directory holding both saved models.
        shutil.rmtree(temp_path, ignore_errors=True)
tmpTrainX = tmpTrainRDD.toDF()

# Labels come from a CSV: column 0 is matched against the image file name,
# column 1 is parsed as the integer class label.
csvTrainTmp = spark.read.format("csv").option(
    "header", "true").load(imageDir + "/train25.csv")
#csvTrainTmp = spark.read.format("csv").option("header", "true").load(imageDir + "/test1.csv")
csvTrainRDD = csvTrainTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTrain = csvTrainRDD.toDF()

# Attach the label to each image row; the duplicate join key is dropped.
finalTrainDataFrame = tmpTrainX.join(csvTrain,
                                     tmpTrainX.fileName == csvTrain.image,
                                     'inner').drop(csvTrain.image)

featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")
method = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)
ovr = OneVsRest(classifier=method)

# Featurize once and keep the result cached: it is used for both the fit
# and the prediction below.
featureVector = featurizer.transform(finalTrainDataFrame).persist()
model_gbt = ovr.fit(featureVector)
model_gbt.write().overwrite().save(
    'hdfs://192.168.65.188:8020/paih/model-gradiant-boosted-tree-classifier')

# Predict with the model trained above. The original called
# model_lr.transform here, which is not the model this section fits or
# reports on.
predictions = model_gbt.transform(featureVector).persist()
predictionAndLabels = predictions.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# Evaluate once and reuse the value instead of running the evaluation twice.
train_accuracy_gbt = evaluator.evaluate(predictionAndLabels)
print("Train Data set accuracy with Gradiant boosted tree classifier = " +
      str(train_accuracy_gbt) + " and error " +
      str(1 - train_accuracy_gbt))
#predictions.show()
#apply evaluator on constructed model for training data.
training = splits[0]
test = splits[1]

#-------------------------------------------------------------------------------------------------------------------
# Text pipeline: split tweets on whitespace, hash the words into term
# frequencies, rescale with IDF, then classify with one-vs-rest LinearSVC.
tokenizer_svm = RegexTokenizer(
    inputCol="tweet",
    outputCol="words",
    pattern="\\s+",
)
hashing_tf_svm = HashingTF(inputCol="words", outputCol="tf")
idf_svm = IDF(inputCol="tf", outputCol="features")

svm = LinearSVC()
ovr = OneVsRest(classifier=svm)

pipeline_svm = Pipeline(stages=[
    tokenizer_svm,
    hashing_tf_svm,
    idf_svm,
    ovr,
])

# Fit on the training split and score the test split.
model_svm = pipeline_svm.fit(training)
result_svm = model_svm.transform(test)
result_svm.show()

predictionAndLabels = result_svm.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = "
      + str(evaluator.evaluate(predictionAndLabels)))

# Persist the fitted pipeline, replacing any previous save at this path.
model_svm.write().overwrite().save("model-svm")
# Turn train_df into a two-column frame (label, feature vector) for the
# steps below: the first 1024 columns are assembled into one vector and
# column "_c1024" is taken as the label.
assembler_train = VectorAssembler(
    inputCols=train_df.columns[:1024],
    outputCol="fectures",
)
train_vectors_withlabel = (
    assembler_train.transform(train_df)
    .selectExpr("_c1024 as label_train", "fectures")
)

# Fit PCA on the training vectors only, with k=200 components, and reuse
# the fitted model to project both the training and the test data.
# NOTE(review): an earlier comment mentioned k=50 and "90% of the data";
# the code uses k=200 - confirm which is intended.
pca = PCA(k=200, inputCol="fectures", outputCol="pca_vector")
model_200 = pca.fit(train_vectors_withlabel)

pca_train_result = (
    model_200.transform(train_vectors_withlabel)
    .selectExpr('label_train as label', 'pca_vector as features')
)
pca_test_result = (
    model_200.transform(test_vectors_withlabel)
    .selectExpr('label_test as label', 'pca_vector as features')
)

# One-vs-rest logistic regression on the projected features.
lr = LogisticRegression(maxIter=200, tol=1E-6, fitIntercept=True)
ovr2 = OneVsRest(classifier=lr)
model2 = ovr2.fit(pca_train_result)
result = model2.transform(pca_test_result)

# Keep only the label and the prediction, with the prediction cast to int.
result_lp = result.selectExpr("label", "cast (prediction as int) prediction")
final_result = result_lp.rdd

# Helpers for scoring the predictions.
neutral_zero_value = 0


def seqOp(a, b):
    # Leaves the running count `a` unchanged when the two fields of `b`
    # agree and adds one when they differ, i.e. it counts mismatches.
    # Presumably `b` is a (label, prediction) row - confirm at the call site.
    return a if b[0] == b[1] else a + 1
def RandomForestClassifier(self): print("********************************************************************************************************************************************") print("Random Forest") self.t0 = time() rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees = 100, maxDepth = 4, maxBins = 32,impurity="entropy") pipeline = Pipeline(stages=[self.labelIndexer, self.featureIndexer, rf, self.labelConverter]) model = pipeline.fit(self.trainingData) self.tm = time() - self.t0 print ("Modeli egitme zamani {} saniye ".format(self.tm)) self.t0 = time() self.predictions = model.transform(self.testData) self.tt = time() - self.t0 print ("Test verisini siniflandirma zamani {} saniye ".format(self.tt)) self.t0 = time() predictions_train = model.transform(self.trainingData) self.te = time() - self.t0 print ("Egitim verisini siniflandirma zamani {} saniye ".format(self.te)) self.predictions.select("features", "label", "predictedLabel", "probability").show(5) evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy") self.t0 = time() self.accuracy = evaluator.evaluate(self.predictions) self.tt2 = time() -self.t0 print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, self.accuracy)) self.t0 = time() self.train_accuracy = evaluator.evaluate(predictions_train) self.te2 = time() -self.t0 print ("Tahmini yapilis zamani {} saniye . 
Egitim Verisinin dogrulanmasi {} saniye ".format(self.te2, self.train_accuracy)) print("Test Dogruluk = %g" % (self.accuracy)) self.testError = (1.0 - self.accuracy) print("Test Test Error = %g" % (1.0 - self.accuracy)) print("Egitim Dogruluk = %g" % (self.train_accuracy)) self.train_Error = (1.0 - self.train_accuracy) print("Egitim Error = %g" % (1.0 - self.train_accuracy)) rfModel = model.stages[2] evaluatorf1 = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="f1") self.f1 = evaluatorf1.evaluate(self.predictions) self.train_f1 = evaluatorf1.evaluate(predictions_train) print("test f1 = %g" % self.f1) print("egitim f1 = %g" % self.train_f1) evaluatorwp = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedPrecision") self.wp = evaluatorwp.evaluate(self.predictions) self.train_wp = evaluatorwp.evaluate(predictions_train) print("test weightedPrecision = %g" % self.wp) print("egitim weightedPrecision = %g" % self.train_wp) evaluatorwr = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedRecall") self.wr = evaluatorwr.evaluate(self.predictions) self.train_wr = evaluatorwr.evaluate(predictions_train) print("test weightedRecall = %g" % self.wr) print("egitim weightedRecall = %g" % self.train_wr) rfModel = model.stages[2] #print (rfModel._call_java('toDebugString')) messagebox.showinfo("Başarılı","Model Eğitildi") self.skorEkle() self.ModelBtn.grid_remove() self.SonucBtn.grid(row=7,column=2) self.ExportCsvBtn.grid(row=8,column=2) svm = LinearSVC(maxIter=5, regParam=0.01) LSVC = LinearSVC() ovr = OneVsRest(classifier=LSVC) paramGrid = ParamGridBuilder().addGrid(LSVC.maxIter, [10, 100]).addGrid(LSVC.regParam,[0.001, 0.01, 1.0,10.0]).build() crossval = CrossValidator(estimator=ovr, estimatorParamMaps=paramGrid, evaluator=MulticlassClassificationEvaluator(metricName="f1"), numFolds=2) Train_sparkframe = 
self.trainingData.select("features", "label") cvModel = crossval.fit(Train_sparkframe) bestModel = cvModel.bestModel