def test_fm_load(self):
    """Round-trip an FMClassificationModel through save/load and check it is unchanged."""
    dataset = self.spark.createDataFrame(
        [
            (1.0, Vectors.dense(1.0)),
            (0.0, Vectors.sparse(1, [], [])),
        ],
        ["label", "features"],
    )
    estimator = FMClassifier(factorSize=2, maxIter=50, stepSize=2.0)
    fitted = estimator.fit(dataset)
    self.assertEqual(fitted.getSolver(), "adamW")
    predictions_before = fitted.transform(dataset)

    # Persist the fitted model into a scratch directory, then load it back.
    save_path = tempfile.mkdtemp() + "/fm"
    fitted.save(save_path)
    reloaded = FMClassificationModel.load(save_path)
    self.assertEqual(reloaded.getSolver(), "adamW")

    # The reloaded model must produce identical predictions on the same data.
    predictions_after = reloaded.transform(dataset)
    self.assertEqual(predictions_before.take(2), predictions_after.take(2))
def models():
    """Train each classifier via ``evaluate`` and print its F1 score.

    Runs a suite of Spark ML classifiers over the default featurization,
    then repeats selected models over n-gram and TF-IDF featurizations.
    Relies on module-level ``evaluate`` and ``featurize_lda`` helpers.
    """
    rf_classifier = RandomForestClassifier(labelCol="label", featuresCol="features")
    print("Random Forest F1 = %g" % evaluate(rf_classifier))
    lsvc = LinearSVC(maxIter=50)
    print("Linear SVC F1 = %g" % evaluate(lsvc))
    gbt = GBTClassifier()
    print("GBT F1 = %g" % evaluate(gbt))
    mlp = MultilayerPerceptronClassifier(seed=1234, featuresCol='features')
    print("MLP F1 = %g" % evaluate(mlp))
    fm = FMClassifier()
    # Fix: report the FM score in the same "<name> F1 = %g" format as every
    # other model; previously the evaluate() result was computed and discarded.
    print("FM F1 = %g" % evaluate(fm))
    featurize_lda()
    # NGrams (raw counts) — kept for reference:
    # print("NGram Random Forest F1 = %g" % evaluate(rf_classifier, "ngrams"))
    # print("Ngram Linear SVC F1 = %g" % evaluate(lsvc, "ngrams"))
    # print("Ngram GBT F1 = %g" % evaluate(gbt, "ngrams"))
    # TF-IDF
    print("Ngram TF-IDF Random Forest F1 = %g" % evaluate(rf_classifier, "ngrams", "TF-IDF"))
    print("Ngram TF-IDF Linear SVC F1 = %g" % evaluate(lsvc, "ngrams", "TF-IDF"))
    print("Ngram TF-IDF GBT F1 = %g" % evaluate(gbt, "ngrams", "TF-IDF"))
    print("Words TF-IDF Random Forest F1 = %g" % evaluate(rf_classifier, "words", "TF-IDF"))
    print("Words TF-IDF Linear SVC F1 = %g" % evaluate(lsvc, "words", "TF-IDF"))
    print("Words TF-IDF GBT F1 = %g" % evaluate(gbt, "words", "TF-IDF"))
def test_fm_classification_summary(self):
    """FMClassificationModel exposes a training summary with the expected metrics."""
    df = self.spark.createDataFrame(
        [
            (1.0, Vectors.dense(2.0)),
            (0.0, Vectors.dense(2.0)),
            (0.0, Vectors.dense(6.0)),
            (1.0, Vectors.dense(3.0)),
        ],
        ["label", "features"],
    )
    fm = FMClassifier(maxIter=5)
    model = fm.fit(df)
    self.assertTrue(model.hasSummary)
    # Fix: `summary` is a property (HasTrainingSummary), not a method --
    # `model.summary()` would try to *call* the summary object and fail.
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.scoreCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.predictionCol, "prediction")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.labels, list))
    self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
    self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
    self.assertTrue(isinstance(s.precisionByLabel, list))
    self.assertTrue(isinstance(s.recallByLabel, list))
    self.assertTrue(isinstance(s.fMeasureByLabel(), list))
    self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
    self.assertTrue(isinstance(s.roc, DataFrame))
    self.assertAlmostEqual(s.areaUnderROC, 0.625, 2)
    self.assertTrue(isinstance(s.pr, DataFrame))
    self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
    self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
    self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
    self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2)
    self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2)
    self.assertAlmostEqual(s.weightedRecall, 0.75, 2)
    self.assertAlmostEqual(s.weightedPrecision, 0.8333333333333333, 2)
    self.assertAlmostEqual(s.weightedFMeasure(), 0.7333333333333334, 2)
    self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.7333333333333334, 2)
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertTrue(isinstance(sameSummary, FMClassificationSummary))
    self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
def test_support_for_weightCol(self):
    """OneVsRest accepts weightCol regardless of whether the base classifier has it."""
    df = self.spark.createDataFrame(
        [
            (0.0, Vectors.dense(1.0, 0.8), 1.0),
            (1.0, Vectors.sparse(2, [], []), 1.0),
            (2.0, Vectors.dense(0.5, 0.5), 1.0),
        ],
        ["label", "features", "weight"],
    )
    # Base classifier that inherits HasWeightCol.
    base_lr = LogisticRegression(maxIter=5, regParam=0.01)
    self.assertIsNotNone(OneVsRest(classifier=base_lr, weightCol="weight").fit(df))
    # Base classifier that does not inherit HasWeightCol.
    base_fm = FMClassifier()
    self.assertIsNotNone(OneVsRest(classifier=base_fm, weightCol="weight").fit(df))
def train_model(self, train_df, assembler, step):
    """Fit a two-stage Pipeline (feature assembler -> FMClassifier) on ``train_df``.

    ``step`` is forwarded as the FM step size. The label column is expected
    to be named "labl" in the input DataFrame. Returns the fitted PipelineModel.
    """
    classifier = FMClassifier(labelCol="labl", featuresCol="features", stepSize=step)
    fm_pipeline = Pipeline(stages=[assembler, classifier])
    return fm_pipeline.fit(train_df)
# NOTE(review): this fragment starts mid-statement -- the opening
# `….load(` call (presumably `spark.read.format("libsvm").load(`) lies
# outside the visible chunk, so the first line below is that call's tail.
    "data/mllib/sample_libsvm_data.txt")

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# Scale features.
featureScaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures").fit(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a FM model.
fm = FMClassifier(labelCol="indexedLabel", featuresCol="scaledFeatures", stepSize=0.001)

# Create a Pipeline.
pipeline = Pipeline(stages=[labelIndexer, featureScaler, fm])

# Train model.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Select (prediction, true label) and compute test accuracy
# (the evaluator code continues past the end of this chunk).
# 生存分析 from pyspark.ml.regression import AFTSurvivalRegression from pyspark.ml.linalg import Vectors training = spark.createDataFrame([(1.218, 1.0, Vectors.dense(1.560, -0.605)), (2.949, 0.0, Vectors.dense(0.346, 2.158)), (3.627, 0.0, Vectors.dense(1.380, 0.231)), (0.273, 1.0, Vectors.dense(0.520, 1.151)), (4.199, 0.0, Vectors.dense(0.795, -0.226))], ["label", "censor", "features"]) quantileProbabilities = [0.3, 0.6] aft = AFTSurvivalRegression(quantileProbabilities=quantileProbabilities, quantilesCol="quantiles") model = aft.fit(training) # Print the coefficients, intercept and scale parameter for AFT survival regression print("Coefficients: " + str(model.coefficients)) print("Intercept: " + str(model.intercept)) print("Scale: " + str(model.scale)) model.transform(training).show(truncate=False) # 分解机 from pyspark.ml.classification import FMClassifier training = spark.read.format("libsvm").load("sample_libsvm_data.txt") cls = FMClassifier(maxIter=10, regParam=0.3, factorSize=16) fmModel = cls.fit(training)