Example #1
 def test_output_columns(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr, parallelism=1)
     model = ovr.fit(df)
     output = model.transform(df)
     self.assertEqual(output.columns, ["label", "features", "rawPrediction", "prediction"])
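The test above relies on a suite-provided `self.spark` fixture. A minimal standalone sketch of the same check, assuming a local PySpark installation (names here are illustrative):

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.master("local[2]").appName("ovr-columns").getOrCreate()
df = spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                            (1.0, Vectors.sparse(2, [], [])),
                            (2.0, Vectors.dense(0.5, 0.5))],
                           ["label", "features"])
model = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=0.01)).fit(df)
print(model.transform(df).columns)  # recent Spark adds rawPrediction alongside prediction
spark.stop()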
Example #2
 def test_copy(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     ovr1 = ovr.copy({lr.maxIter: 10})
     self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
     self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
     model = ovr.fit(df)
     model1 = model.copy({model.predictionCol: "indexed"})
     self.assertEqual(model1.getPredictionCol(), "indexed")
Example #3
 def test_support_for_weightCol(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8), 1.0),
                                      (1.0, Vectors.sparse(2, [], []), 1.0),
                                      (2.0, Vectors.dense(0.5, 0.5), 1.0)],
                                     ["label", "features", "weight"])
     # classifier inherits hasWeightCol
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr, weightCol="weight")
     self.assertIsNotNone(ovr.fit(df))
     # classifier doesn't inherit hasWeightCol
     dt = DecisionTreeClassifier()
     ovr2 = OneVsRest(classifier=dt, weightCol="weight")
     self.assertIsNotNone(ovr2.fit(df))
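The uniform weights above only verify that the API accepts a weight column. A short sketch with non-uniform weights (`spark` is a SparkSession, as in the sketch under Example #1):

weighted = spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8), 2.0),
                                  (1.0, Vectors.sparse(2, [], []), 0.5),
                                  (2.0, Vectors.dense(0.5, 0.5), 1.0)],
                                 ["label", "features", "weight"])
# each binary sub-problem is fit with the per-row weights when the base classifier supports weightCol
ovrW = OneVsRest(classifier=LogisticRegression(maxIter=5), weightCol="weight")
print(ovrW.fit(weighted).transform(weighted).select("label", "prediction").collect())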
Example #4
 def test_parallelism_doesnt_change_output(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     ovrPar1 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=1)
     modelPar1 = ovrPar1.fit(df)
     ovrPar2 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=2)
     modelPar2 = ovrPar2.fit(df)
     for i, model in enumerate(modelPar1.models):
         self.assertTrue(np.allclose(model.coefficients.toArray(),
                                     modelPar2.models[i].coefficients.toArray(), atol=1E-4))
         self.assertTrue(np.allclose(model.intercept, modelPar2.models[i].intercept, atol=1E-4))
Example #5
 def test_onevsrest(self):
     temp_path = tempfile.mkdtemp()
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))] * 10,
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     model = ovr.fit(df)
     ovrPath = temp_path + "/ovr"
     ovr.save(ovrPath)
     loadedOvr = OneVsRest.load(ovrPath)
     self._compare_pipelines(ovr, loadedOvr)
     modelPath = temp_path + "/ovrModel"
     model.save(modelPath)
     loadedModel = OneVsRestModel.load(modelPath)
     self._compare_pipelines(model, loadedModel)
Example #6
 def test_save_load(self):
     temp_path = tempfile.mkdtemp()
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame(
         [(0.0, Vectors.dense(1.0, 0.8)), (1.0, Vectors.sparse(2, [], [])), (2.0, Vectors.dense(0.5, 0.5))],
         ["label", "features"],
     )
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     model = ovr.fit(df)
     ovrPath = temp_path + "/ovr"
     ovr.save(ovrPath)
     loadedOvr = OneVsRest.load(ovrPath)
     self.assertEqual(loadedOvr.getFeaturesCol(), ovr.getFeaturesCol())
     self.assertEqual(loadedOvr.getLabelCol(), ovr.getLabelCol())
     self.assertEqual(loadedOvr.getClassifier().uid, ovr.getClassifier().uid)
     modelPath = temp_path + "/ovrModel"
     model.save(modelPath)
     loadedModel = OneVsRestModel.load(modelPath)
     for m, n in zip(model.models, loadedModel.models):
         self.assertEqual(m.uid, n.uid)
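     # a stricter check (sketch): beyond matching uids, the loaded model should score df identically
     self.assertEqual(model.transform(df).collect(), loadedModel.transform(df).collect())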
Example #7
    spark = SparkSession.builder \
        .appName("PythonOneVsRestExample") \
        .getOrCreate()

    # $example on$
    # load data file.
    inputData = spark.read.format("libsvm") \
        .load("data/mllib/sample_multiclass_classification_data.txt")

    # generate the train/test split.
    (train, test) = inputData.randomSplit([0.8, 0.2])

    # instantiate the base classifier.
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

    # instantiate the One Vs Rest Classifier.
    ovr = OneVsRest(classifier=lr)

    # train the multiclass model.
    ovrModel = ovr.fit(train)

    # score the model on test data.
    predictions = ovrModel.transform(test)

    # obtain evaluator.
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

    # compute the classification error on test data.
    accuracy = evaluator.evaluate(predictions)
    print("Test Error : " + str(1 - accuracy))
    # $example off$
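    # the full example this fragment comes from ends by stopping the session
    spark.stop()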
Example #8
# read the data from HDFS
path = '/home/mnist-test/data/train'
df = spark.read.csv(path, header=True, inferSchema=True)
df = df.dropna()  # drop rows with null values

# convert the data into features/labels columns
rf = RFormula(formula="label ~ .", featuresCol="features", labelCol="labels")
rf_model = rf.fit(df)
df = rf_model.transform(df).select(["features", "labels"])

# split the dataset
train_df, test_df = df.randomSplit([0.8, 0.2])

# build the GBDT (gradient-boosted trees) base model
gbdt = GBTClassifier(maxIter=10,
                     maxDepth=3,
                     labelCol="labels",
                     featuresCol="features")

# build the One-vs-Rest classifier.
ovr = OneVsRest(classifier=gbdt)
ovr_model = ovr.fit(train_df)
predict_res = ovr_model.transform(test_df)

# evaluate
evaluator = MulticlassClassificationEvaluator(labelCol="labels",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predict_res)
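# GBTClassifier is binary-only; OneVsRest is what makes it usable here, by
# training one binary GBT model per class (a short sketch of inspecting that):
print("Test set accuracy = %g" % accuracy)
print("number of sub-models:", len(ovr_model.models))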
Example #9
    if c not in habitats:
        inputCols.append(c)

print("input ", len(inputCols))

pdf = pd.DataFrame(df2)
df = spark.createDataFrame(pdf)
df.show()

featureassembler = VectorAssembler(inputCols=inputCols, outputCol="features")
XY = featureassembler.transform(df).select("features", "habitat_d",
                                           "habitat_g", "habitat_l",
                                           "habitat_m", "habitat_p",
                                           "habitat_u", "habitat_w")
XY.show()
train, test = XY.randomSplit([.8, .2])

accuracyList = []
for habitat in habitats:
    _train = train.withColumnRenamed(habitat, "label")
    _test = test.withColumnRenamed(habitat,
                                   "label").select("features", "label")
    lr = LogisticRegression(maxIter=30, tol=1E-6, fitIntercept=True)
    ovr = OneVsRest(classifier=lr)
    ovrModel = ovr.fit(_train)
    predictions = ovrModel.transform(_test)
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    accuracyList.append(accuracy)
    print("Test Error = %g" % (1.0 - accuracy))
Example #10
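This test comes from MLflow's PySpark autologging suite. A hedged sketch of the imports it assumes (the `_gen_estimator_metadata` module path is an assumption and may vary across MLflow versions):

from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, VectorAssembler
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from mlflow.pyspark.ml import _gen_estimator_metadata  # assumed location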
def test_gen_estimator_metadata(spark_session):  # pylint: disable=unused-argument
    tokenizer1 = Tokenizer(inputCol="text1", outputCol="words1")
    hashingTF1 = HashingTF(inputCol=tokenizer1.getOutputCol(),
                           outputCol="features1")

    tokenizer2 = Tokenizer(inputCol="text2", outputCol="words2")
    hashingTF2 = HashingTF(inputCol=tokenizer2.getOutputCol(),
                           outputCol="features2")

    vecAssembler = VectorAssembler(inputCols=["features1", "features2"],
                                   outputCol="features")

    lor = LogisticRegression(maxIter=10)
    ova = OneVsRest(classifier=lor)
    sub_pipeline1 = Pipeline(stages=[tokenizer1, hashingTF1])
    sub_pipeline2 = Pipeline(stages=[tokenizer2, hashingTF2])
    sub_pipeline3 = Pipeline(stages=[vecAssembler, ova])

    paramGrid = (ParamGridBuilder().addGrid(lor.maxIter, [10, 20]).addGrid(
        lor.regParam, [0.1, 0.01]).build())
    eva = MulticlassClassificationEvaluator()
    crossval = CrossValidator(estimator=sub_pipeline3,
                              estimatorParamMaps=paramGrid,
                              evaluator=eva,
                              numFolds=2)

    top_pipeline = Pipeline(stages=[sub_pipeline1, sub_pipeline2, crossval])

    metadata = _gen_estimator_metadata(top_pipeline)

    expected_hierarchy = {
        "name":
        "Pipeline_1",
        "stages": [
            {
                "name": "Pipeline_2",
                "stages": [{
                    "name": "Tokenizer_1"
                }, {
                    "name": "HashingTF_1"
                }]
            },
            {
                "name": "Pipeline_3",
                "stages": [{
                    "name": "Tokenizer_2"
                }, {
                    "name": "HashingTF_2"
                }]
            },
            {
                "name": "CrossValidator",
                "evaluator": {
                    "name": "MulticlassClassificationEvaluator"
                },
                "tuned_estimator": {
                    "name":
                    "Pipeline_4",
                    "stages": [
                        {
                            "name": "VectorAssembler"
                        },
                        {
                            "name": "OneVsRest",
                            "classifier": {
                                "name": "LogisticRegression"
                            }
                        },
                    ],
                },
            },
        ],
    }
    assert metadata.hierarchy == expected_hierarchy
    assert metadata.uid_to_indexed_name_map == {
        top_pipeline.uid: "Pipeline_1",
        sub_pipeline1.uid: "Pipeline_2",
        tokenizer1.uid: "Tokenizer_1",
        hashingTF1.uid: "HashingTF_1",
        sub_pipeline2.uid: "Pipeline_3",
        tokenizer2.uid: "Tokenizer_2",
        hashingTF2.uid: "HashingTF_2",
        crossval.uid: "CrossValidator",
        sub_pipeline3.uid: "Pipeline_4",
        vecAssembler.uid: "VectorAssembler",
        ova.uid: "OneVsRest",
        lor.uid: "LogisticRegression",
        eva.uid: "MulticlassClassificationEvaluator",
    }
    assert (metadata.uid_to_indexed_name_map[
        metadata.param_search_estimators[0].uid] == "CrossValidator")
Example #11
# build a list of (label, features) tuples, one per training row
df_list = []

for i, element in enumerate(x_train):  # row
    df_list.append((int(y_train[i]), Vectors.dense(element)))

Train_sparkframe = spark.createDataFrame(df_list, schema=['label', 'features'])

import datetime

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import OneVsRest, LogisticRegression

lr = LogisticRegression()
ovr = OneVsRest(classifier=lr)
print(datetime.datetime.now())
# Fit the model
mlrModel = ovr.fit(Train_sparkframe)
print(datetime.datetime.now())
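# a sketch of scoring the fitted model; the fragment defines no test frame,
# so this evaluates on the training frame only to illustrate the API
predictions = mlrModel.transform(Train_sparkframe)
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Train accuracy = %g" % evaluator.evaluate(predictions))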
Example #12
    def test_save_load_pipeline_estimator(self):
        temp_path = tempfile.mkdtemp()
        training = self.spark.createDataFrame([
            (0, "a b c d e spark", 1.0),
            (1, "b d", 0.0),
            (2, "spark f g h", 1.0),
            (3, "hadoop mapreduce", 0.0),
            (4, "b spark who", 1.0),
            (5, "g d a y", 0.0),
            (6, "spark fly", 1.0),
            (7, "was mapreduce", 0.0),
        ], ["id", "text", "label"])

        # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and OneVsRest.
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                              outputCol="features")

        ova = OneVsRest(classifier=LogisticRegression())
        lr1 = LogisticRegression().setMaxIter(5)
        lr2 = LogisticRegression().setMaxIter(10)

        pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])

        paramGrid = ParamGridBuilder() \
            .addGrid(hashingTF.numFeatures, [10, 100]) \
            .addGrid(ova.classifier, [lr1, lr2]) \
            .build()

        tvs = TrainValidationSplit(
            estimator=pipeline,
            estimatorParamMaps=paramGrid,
            evaluator=MulticlassClassificationEvaluator())

        # Run train validation split, and choose the best set of parameters.
        tvsModel = tvs.fit(training)

        # test save/load of TrainValidationSplitModel
        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
        self.assertEqual(len(loadedModel.bestModel.stages),
                         len(tvsModel.bestModel.stages))
        for loadedStage, originalStage in zip(loadedModel.bestModel.stages,
                                              tvsModel.bestModel.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)

        # Test nested pipeline
        nested_pipeline = Pipeline(
            stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
        tvs2 = TrainValidationSplit(
            estimator=nested_pipeline,
            estimatorParamMaps=paramGrid,
            evaluator=MulticlassClassificationEvaluator())

        # Run train validation split, and choose the best set of parameters.
        tvsModel2 = tvs2.fit(training)
        # test save/load of TrainValidationSplitModel
        tvsModelPath2 = temp_path + "/tvsModel2"
        tvsModel2.save(tvsModelPath2)
        loadedModel2 = TrainValidationSplitModel.load(tvsModelPath2)
        self.assertEqual(loadedModel2.bestModel.uid, tvsModel2.bestModel.uid)
        loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1]
        original_nested_pipeline_model = tvsModel2.bestModel.stages[1]
        self.assertEqual(loaded_nested_pipeline_model.uid,
                         original_nested_pipeline_model.uid)
        self.assertEqual(len(loaded_nested_pipeline_model.stages),
                         len(original_nested_pipeline_model.stages))
        for loadedStage, originalStage in zip(
                loaded_nested_pipeline_model.stages,
                original_nested_pipeline_model.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)
Example #13
tmpTrainX = tmpTrainRDD.toDF()
csvTrainTmp = spark.read.format("csv").option(
    "header", "true").load(imageDir + "/train25.csv")
#csvTrainTmp = spark.read.format("csv").option("header", "true").load(imageDir + "/test1.csv")
csvTrainRDD = csvTrainTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTrain = csvTrainRDD.toDF()
finalTrainDataFrame = tmpTrainX.join(csvTrain,
                                     tmpTrainX.fileName == csvTrain.image,
                                     'inner').drop(csvTrain.image)

featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")

method = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)
ovr = OneVsRest(classifier=method)
featureVector = featurizer.transform(finalTrainDataFrame).persist()
model_gbt = ovr.fit(featureVector)
model_gbt.write().overwrite().save(
    'hdfs://192.168.65.188:8020/paih/model-gradiant-boosted-tree-classifier')

predictions = model_gbt.transform(featureVector).persist()
predictionAndLabels = predictions.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Train Data set accuracy with Gradiant boosted tree classifier = " +
      str(evaluator.evaluate(predictionAndLabels)) + " and error " +
      str(1 - evaluator.evaluate(predictionAndLabels)))
#predictions.show()

# apply the evaluator to the constructed model on the training data.
Example #14
    training = splits[0]
    test = splits[1]

    #-------------------------------------------------------------------------------------------------------------------

    tokenizer_svm = RegexTokenizer(inputCol="tweet",
                                   outputCol="words",
                                   pattern="\\s+")

    hashing_tf_svm = HashingTF(inputCol="words", outputCol="tf")

    idf_svm = IDF(inputCol="tf", outputCol="features")

    svm = LinearSVC()

    ovr = OneVsRest(classifier=svm)

    pipeline_svm = Pipeline(
        stages=[tokenizer_svm, hashing_tf_svm, idf_svm, ovr])

    model_svm = pipeline_svm.fit(training)
    result_svm = model_svm.transform(test)
    result_svm.show()

    predictionAndLabels = result_svm.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Test set accuracy = " +
          str(evaluator.evaluate(predictionAndLabels)))

    model_svm.write().overwrite().save("model-svm")
Example #15
# transform train_df into a DataFrame with two columns, label and features, for further processing
assembler_train = VectorAssembler(inputCols=train_df.columns[:1024], outputCol="features")
train_vectors_withlabel = assembler_train.transform(train_df).selectExpr("_c1024 as label_train", "features")

# fit the PCA on the train vectors first
# we set k=200 so that we keep about 90% of the information in the MNIST data
# after the fit we get the PCA model (model_200),
# which we can use to transform both the test and the train data
pca = PCA(k=200, inputCol="features", outputCol="pca_vector")
model_200 = pca.fit(train_vectors_withlabel)
pca_train_result = model_200.transform(train_vectors_withlabel).selectExpr('label_train as label',
                                                                           'pca_vector as features')
pca_test_result = model_200.transform(test_vectors_withlabel).selectExpr('label_test as label', 'pca_vector as features')

lr = LogisticRegression(maxIter=200, tol=1E-6, fitIntercept=True)
ovr2 = OneVsRest(classifier=lr)
model2 = ovr2.fit(pca_train_result)
result = model2.transform(pca_test_result)
result_lp = result.selectExpr("label", "cast (prediction as int) prediction")
final_result = result_lp.rdd

# calculate the accuracy

neutral_zero_value = 0


def seqOp(a, b):
    # b is a (label, prediction) row; accumulate the number of mismatches
    if b[0] == b[1]:
        return a
    else:
        return a + 1
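# the fragment stops before the aggregation that uses seqOp; a sketch of the
# likely completion with a standard RDD aggregate over (label, prediction) rows
def combOp(a, b):
    return a + b

errors = final_result.aggregate(neutral_zero_value, seqOp, combOp)
accuracy = 1.0 - errors / float(final_result.count())
print("accuracy = %g" % accuracy)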
Example #16
 def RandomForestClassifier(self):
   print("********************************************************************************************************************************************")
   print("Random Forest")
   self.t0 = time()
   rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=100, maxDepth=4, maxBins=32, impurity="entropy")
   pipeline = Pipeline(stages=[self.labelIndexer, self.featureIndexer, rf, self.labelConverter])
   model = pipeline.fit(self.trainingData)
   self.tm = time() - self.t0
   print ("Modeli egitme zamani {} saniye ".format(self.tm))

   self.t0 = time()
   self.predictions = model.transform(self.testData)
   self.tt = time() - self.t0
   print ("Test verisini siniflandirma zamani {} saniye ".format(self.tt))

   self.t0 = time()
   predictions_train = model.transform(self.trainingData)
   self.te = time() - self.t0
   print ("Egitim verisini siniflandirma zamani {} saniye ".format(self.te))
   
   self.predictions.select("features", "label", "predictedLabel", "probability").show(5)
   evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
   
   self.t0 = time()
   self.accuracy = evaluator.evaluate(self.predictions)
   self.tt2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Testin dogrulanmasi {} saniye ".format(self.tt2, self.accuracy))
   
   self.t0 = time()
   self.train_accuracy = evaluator.evaluate(predictions_train)
   self.te2 = time() -self.t0
   print ("Tahmini yapilis zamani {} saniye . Egitim Verisinin dogrulanmasi {} saniye ".format(self.te2, self.train_accuracy))
   
   print("Test Dogruluk = %g" % (self.accuracy))
   self.testError = (1.0 - self.accuracy)
   print("Test Test Error = %g" % (1.0 - self.accuracy))

   print("Egitim Dogruluk = %g" % (self.train_accuracy))
   self.train_Error = (1.0 - self.train_accuracy)
   print("Egitim Error = %g" % (1.0 - self.train_accuracy))

   evaluatorf1 = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="f1")
   self.f1 = evaluatorf1.evaluate(self.predictions)
   self.train_f1 = evaluatorf1.evaluate(predictions_train)
   print("test f1 = %g" % self.f1)
   print("egitim f1 = %g" % self.train_f1)
 
   evaluatorwp = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedPrecision")
   self.wp = evaluatorwp.evaluate(self.predictions)
   self.train_wp = evaluatorwp.evaluate(predictions_train)
   print("test weightedPrecision = %g" % self.wp)
   print("egitim weightedPrecision = %g" % self.train_wp)
 
   evaluatorwr = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="weightedRecall")
   self.wr = evaluatorwr.evaluate(self.predictions)
   self.train_wr = evaluatorwr.evaluate(predictions_train)
   print("test weightedRecall = %g" % self.wr)
   print("egitim weightedRecall = %g" % self.train_wr)

   rfModel = model.stages[2]
   #print (rfModel._call_java('toDebugString'))
   messagebox.showinfo("Success", "Model trained")
   self.skorEkle()
   self.ModelBtn.grid_remove()
   self.SonucBtn.grid(row=7,column=2)
   self.ExportCsvBtn.grid(row=8,column=2)
   
   # train a One-vs-Rest LinearSVC and tune it with cross-validation
   LSVC = LinearSVC()
   ovr = OneVsRest(classifier=LSVC)
   paramGrid = ParamGridBuilder().addGrid(LSVC.maxIter, [10, 100]).addGrid(LSVC.regParam, [0.001, 0.01, 1.0, 10.0]).build()
   crossval = CrossValidator(estimator=ovr,
                             estimatorParamMaps=paramGrid,
                             evaluator=MulticlassClassificationEvaluator(metricName="f1"),
                             numFolds=2)
   Train_sparkframe = self.trainingData.select("features", "label")
   cvModel = crossval.fit(Train_sparkframe)
   bestModel = cvModel.bestModel
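   # a sketch of scoring the selected model on the held-out split, assuming
   # self.testData carries the same "features"/"label" columns
   predictions = bestModel.transform(self.testData.select("features", "label"))
   print("CV best model f1 = %g" % MulticlassClassificationEvaluator(metricName="f1").evaluate(predictions))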