Example #1
0
 def test_fm_load(self):
     """Fit an FMClassifier, save the model, reload it, and verify the
     reloaded model matches the original (solver param and predictions).
     """
     import shutil  # local import: only needed for temp-dir cleanup

     df = self.spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                      (0.0, Vectors.sparse(1, [], []))],
                                     ["label", "features"])
     fm = FMClassifier(factorSize=2, maxIter=50, stepSize=2.0)
     model = fm.fit(df)
     self.assertEqual(model.getSolver(), "adamW")
     transformed1 = model.transform(df)
     path = tempfile.mkdtemp()
     model_path = path + "/fm"
     try:
         model.save(model_path)
         model2 = FMClassificationModel.load(model_path)
         self.assertEqual(model2.getSolver(), "adamW")
         transformed2 = model2.transform(df)
         self.assertEqual(transformed1.take(2), transformed2.take(2))
     finally:
         # mkdtemp() never cleans up after itself; remove the directory so
         # repeated test runs do not leak temp dirs.
         shutil.rmtree(path, ignore_errors=True)
Example #2
0
def models():
    """Train and evaluate a suite of classifiers, printing the F1 score of
    each.

    Relies on module-level helpers ``evaluate`` and ``featurize_lda``; the
    later calls pass extra arguments selecting the feature column ("ngrams"
    vs "words") and the weighting scheme ("TF-IDF").
    """
    rf_classifier = RandomForestClassifier(labelCol="label",
                                           featuresCol="features")
    print("Random Forest F1 = %g" % evaluate(rf_classifier))
    lsvc = LinearSVC(maxIter=50)
    print("Linear SVC F1 = %g" % evaluate(lsvc))
    gbt = GBTClassifier()
    print("GBT F1 = %g" % evaluate(gbt))

    mlp = MultilayerPerceptronClassifier(seed=1234, featuresCol='features')
    print("MLP F1 = %g" % evaluate(mlp))

    # Report FM the same way as the other models instead of discarding the
    # score that evaluate() returns.
    fm = FMClassifier()
    print("FM F1 = %g" % evaluate(fm))
    featurize_lda()
    # NGrams
    # print("NGram Random Forest F1 = %g" % evaluate(rf_classifier, "ngrams"))
    # print("Ngram Linear SVC F1 = %g" % evaluate(lsvc, "ngrams"))
    # print("Ngram GBT F1 = %g" % evaluate(gbt, "ngrams"))
    # TF-IDF
    print("Ngram TF-IDF Random Forest F1 = %g" %
          evaluate(rf_classifier, "ngrams", "TF-IDF"))
    print("Ngram TF-IDF Linear SVC F1 = %g" %
          evaluate(lsvc, "ngrams", "TF-IDF"))
    print("Ngram TF-IDF GBT F1 = %g" % evaluate(gbt, "ngrams", "TF-IDF"))
    print("Words TF-IDF Random Forest F1 = %g" %
          evaluate(rf_classifier, "words", "TF-IDF"))
    print("Words TF-IDF Linear SVC F1 = %g" %
          evaluate(lsvc, "words", "TF-IDF"))
    print("Words TF-IDF GBT F1 = %g" % evaluate(gbt, "words", "TF-IDF"))
Example #3
0
 def test_fm_classification_summary(self):
     """Exercise the FMClassificationModel training summary API and check
     that every accessor is callable and returns the expected type/value.
     """
     df = self.spark.createDataFrame(
         [
             (1.0, Vectors.dense(2.0)),
             (0.0, Vectors.dense(2.0)),
             (0.0, Vectors.dense(6.0)),
             (1.0, Vectors.dense(3.0)),
         ],
         ["label", "features"],
     )
     fm = FMClassifier(maxIter=5)
     model = fm.fit(df)
     self.assertTrue(model.hasSummary)
     # `summary` is a property on PySpark classification models, not a
     # method; calling it as `model.summary()` raises TypeError.
     s = model.summary
     # test that api is callable and returns expected types
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.scoreCol, "probability")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.predictionCol, "prediction")
     objHist = s.objectiveHistory
     self.assertTrue(
         isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.labels, list))
     self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
     self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
     self.assertTrue(isinstance(s.precisionByLabel, list))
     self.assertTrue(isinstance(s.recallByLabel, list))
     self.assertTrue(isinstance(s.fMeasureByLabel(), list))
     self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
     self.assertTrue(isinstance(s.roc, DataFrame))
     self.assertAlmostEqual(s.areaUnderROC, 0.625, 2)
     self.assertTrue(isinstance(s.pr, DataFrame))
     self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
     self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
     self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
     self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2)
     self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2)
     self.assertAlmostEqual(s.weightedRecall, 0.75, 2)
     self.assertAlmostEqual(s.weightedPrecision, 0.8333333333333333, 2)
     self.assertAlmostEqual(s.weightedFMeasure(), 0.7333333333333334, 2)
     self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.7333333333333334, 2)
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertTrue(isinstance(sameSummary, FMClassificationSummary))
     self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
Example #4
0
 def test_support_for_weightCol(self):
     """OneVsRest must accept a weightCol whether or not the base
     classifier itself mixes in HasWeightCol."""
     rows = [(0.0, Vectors.dense(1.0, 0.8), 1.0),
             (1.0, Vectors.sparse(2, [], []), 1.0),
             (2.0, Vectors.dense(0.5, 0.5), 1.0)]
     df = self.spark.createDataFrame(rows, ["label", "features", "weight"])

     # Base classifier inherits hasWeightCol.
     base_lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr_lr = OneVsRest(classifier=base_lr, weightCol="weight")
     self.assertIsNotNone(ovr_lr.fit(df))

     # Base classifier doesn't inherit hasWeightCol.
     base_fm = FMClassifier()
     ovr_fm = OneVsRest(classifier=base_fm, weightCol="weight")
     self.assertIsNotNone(ovr_fm.fit(df))
Example #5
0
    def train_model(self, train_df, assembler, step):
        """Build and fit a two-stage pipeline: feature assembly -> FM.

        :param train_df: training DataFrame
        :param assembler: feature-assembling transformer (first stage)
        :param step: learning rate (stepSize) for the FM classifier
        :return: the fitted PipelineModel
        """
        # NOTE(review): labelCol "labl" looks like a typo for "label" —
        # confirm against the column names produced upstream.
        classifier = FMClassifier(labelCol="labl",
                                  featuresCol="features",
                                  stepSize=step)
        stages = [assembler, classifier]
        pipeline = Pipeline(stages=stages)
        return pipeline.fit(train_df)
Example #6
0
        "data/mllib/sample_libsvm_data.txt")

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    labelIndexer = StringIndexer(inputCol="label",
                                 outputCol="indexedLabel").fit(data)
    # Scale features.
    featureScaler = MinMaxScaler(inputCol="features",
                                 outputCol="scaledFeatures").fit(data)

    # Split the data into training and test sets (30% held out for testing)
    # NOTE(review): no seed is passed, so the split differs between runs.
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a FM model on the indexed labels and scaled features.
    fm = FMClassifier(labelCol="indexedLabel",
                      featuresCol="scaledFeatures",
                      stepSize=0.001)

    # Create a Pipeline: label indexing -> feature scaling -> FM classifier.
    pipeline = Pipeline(stages=[labelIndexer, featureScaler, fm])

    # Train model.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and compute test accuracy
# Survival analysis (AFT: Accelerated Failure Time regression)
from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.linalg import Vectors

# Columns: label = observed time, censor = 1.0 if the event occurred
# (uncensored) / 0.0 if censored, features = covariates.
training = spark.createDataFrame([(1.218, 1.0, Vectors.dense(1.560, -0.605)),
                                  (2.949, 0.0, Vectors.dense(0.346, 2.158)),
                                  (3.627, 0.0, Vectors.dense(1.380, 0.231)),
                                  (0.273, 1.0, Vectors.dense(0.520, 1.151)),
                                  (4.199, 0.0, Vectors.dense(0.795, -0.226))],
                                 ["label", "censor", "features"])
# Request the 30% and 60% quantiles in an output column named "quantiles".
quantileProbabilities = [0.3, 0.6]
aft = AFTSurvivalRegression(quantileProbabilities=quantileProbabilities,
                            quantilesCol="quantiles")

model = aft.fit(training)

# Print the coefficients, intercept and scale parameter for AFT survival regression
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))
print("Scale: " + str(model.scale))
model.transform(training).show(truncate=False)

# Factorization machines
from pyspark.ml.classification import FMClassifier

# Load training data in LIBSVM format.
training = spark.read.format("libsvm").load("sample_libsvm_data.txt")
# factorSize controls the dimensionality of the pairwise-interaction factors.
cls = FMClassifier(maxIter=10, regParam=0.3, factorSize=16)

fmModel = cls.fit(training)