from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


def Logistic():
    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)

    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

    # Learn a LogisticRegression model. This uses the parameters stored in lr.
    lrModel = lr.fit(train)
    lrModel.write().overwrite().save("save/bert_logistic")

    # Make predictions on test data using the Transformer.transform() method.
    # LogisticRegression.transform will only use the 'features' column.
    predictions = lrModel.transform(test)

    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Accuracy = %g" % accuracy)
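    # Illustrative sketch, not part of the original function: the model saved above
    # can later be read back with LogisticRegressionModel.load(); the variable name
    # `loadedModel` is hypothetical.
    from pyspark.ml.classification import LogisticRegressionModel
    loadedModel = LogisticRegressionModel.load("save/bert_logistic")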
# Get some stats on the datasets
df_train.describe("V1", "Class").show()
df_test.describe("V1", "Class").show()

# ## Specify a logistic regression model

# Use the
# [LogisticRegression](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.classification.LogisticRegression)
# class to specify a logistic regression model:
from pyspark.ml.classification import LogisticRegression
log_reg = LogisticRegression(featuresCol="Features", labelCol="Class")

# Use the `explainParams` method to get a full list of parameters:
print(log_reg.explainParams())

# ## Fit the logistic regression model

# Use the `fit` method to fit the logistic regression model on the train DataFrame,
# and use `%time` to measure how long the model fit operation took:
%time log_reg_model = log_reg.fit(df_train)

# The result is an instance of the
# [LogisticRegressionModel](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.classification.LogisticRegressionModel)
# class:
type(log_reg_model)

# The model parameters are stored in the `intercept` and `coefficients` attributes:
log_reg_model.intercept
log_reg_model.coefficients
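# ## Evaluate the model (illustrative sketch)

# A minimal sketch of scoring df_test and computing areaUnderROC, assuming df_test
# has the same "Features" and "Class" columns used above; the evaluator settings
# here are illustrative and not part of the original notebook.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

test_predictions = log_reg_model.transform(df_test)
evaluator = BinaryClassificationEvaluator(labelCol="Class",
                                          rawPredictionCol="rawPrediction",
                                          metricName="areaUnderROC")
print("Test areaUnderROC = %g" % evaluator.evaluate(test_predictions))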
    fit(data)

featureIndexer = VectorIndexer().\
    setInputCol('features').\
    setOutputCol('indexedFeatures').\
    fit(data)

# Set the parameters of the LogisticRegression algorithm
lr = LogisticRegression().\
    setLabelCol('indexedLabel').\
    setFeaturesCol('indexedFeatures').\
    setMaxIter(100).\
    setRegParam(0.3).\
    setElasticNetParam(0.8)
print('LogisticRegression parameters:\n' + lr.explainParams())

# Set up an IndexToString transformer to convert the predicted indices back to string labels
labelConverter = IndexToString().\
    setInputCol('prediction').\
    setOutputCol('predictedLabel').\
    setLabels(labelIndexer.labels)

# Build a machine learning pipeline by setting its stages; the output of each stage is the input of the next
lrPipeline = Pipeline().\
    setStages([labelIndexer, featureIndexer, lr, labelConverter])

# Randomly split the dataset into a training set (70%) and a test set (30%)
trainingData, testData = data.randomSplit([0.7, 0.3])
lrPipelineModel = lrPipeline.fit(trainingData)
lrPredictions = lrPipelineModel.transform(testData)

'''
A Pipeline is essentially an Estimator: when fit() is called on a Pipeline, it produces a PipelineModel, which is a
Transformer.
'''
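# A minimal sketch of evaluating the pipeline's predictions, assuming the
# 'indexedLabel' and 'prediction' columns produced above; the evaluator and metric
# choice are illustrative, not part of the original example.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='indexedLabel',
                                              predictionCol='prediction',
                                              metricName='accuracy')
print('Test accuracy = %g' % evaluator.evaluate(lrPredictions))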
# Prepare training data from a list of (label, features) tuples.
training = spark.createDataFrame([(1.0, Vectors.dense([0.0, 1.1, 0.1])),
                                  (0.0, Vectors.dense([2.0, 1.0, -1.0])),
                                  (0.0, Vectors.dense([2.0, 1.3, 1.0])),
                                  (1.0, Vectors.dense([0.0, 1.2, -0.5]))],
                                 ["label", "features"])

# In[5]:

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)

# In[6]:

# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# In[7]:

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)

# In[8]:

model1

# In[9]:

# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
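# A short sketch of the extractParamMap() call the comment above describes; it
# mirrors the fuller variants of this example shown elsewhere in this section.
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())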
# COMMAND ----------

# MAGIC %md The evaluator currently accepts 2 kinds of metrics - areaUnderROC and areaUnderPR.
# MAGIC We can set it to areaUnderPR by using evaluator.setMetricName("areaUnderPR").

# COMMAND ----------

# MAGIC %md
# MAGIC Now we will try tuning the model with the ParamGridBuilder and the CrossValidator.
# MAGIC
# MAGIC If you are unsure what params are available for tuning, you can use explainParams() to print a list of all params.

# COMMAND ----------

print(lr.explainParams())

# COMMAND ----------

# MAGIC %md As we indicate 5 values for regParam, 4 values for maxIter, and 5 values for elasticNetParam, this grid will have 5 x 4 x 5 = 100 parameter settings for CrossValidator to choose from. We will create a 5-fold cross validator.

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.1, 0.5, 1.0, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.5, 0.8, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10, 20])
             .build())
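# COMMAND ----------

# MAGIC %md A minimal sketch of the 5-fold cross validator described above; the
# MAGIC `evaluator` and `trainingData` names are assumed from earlier cells and are
# MAGIC illustrative, not part of the original notebook.

# COMMAND ----------

cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,  # assumed BinaryClassificationEvaluator from an earlier cell
                    numFolds=5)
# cvModel = cv.fit(trainingData)  # fitting 100 settings x 5 folds can take a while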
print_performance_metrics(gbPredictions)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ## Cross Validation

# COMMAND ----------

# MAGIC %md
# MAGIC For each model you can run the command below to see its params and a brief explanation of each.

# COMMAND ----------

print(lr.explainParams())

# COMMAND ----------

print(gb.explainParams())

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC #### Logistic Regression - Param Grid

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
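# COMMAND ----------

# MAGIC %md A minimal sketch of a param grid for the logistic regression model; the
# MAGIC grid values and the name `lrParamGrid` are illustrative, not the notebook's
# MAGIC original choices.

# COMMAND ----------

lrParamGrid = (ParamGridBuilder()
               .addGrid(lr.regParam, [0.01, 0.1, 0.5])
               .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
               .build())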
bInput = spark.read.format("parquet").load("/data/binary-classification")\
  .selectExpr("features", "cast(label as double) as label")

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression()
print(lr.explainParams())  # see all parameters
lrModel = lr.fit(bInput)

# COMMAND ----------

print(lrModel.coefficients)
print(lrModel.intercept)

# COMMAND ----------

summary = lrModel.summary
print(summary.areaUnderROC)
summary.roc.show()
summary.pr.show()

# COMMAND ----------

summary.objectiveHistory

# COMMAND ----------

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier()
print(dt.explainParams())
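# COMMAND ----------

# Illustrative follow-on, not from the original cell: fit the decision tree defined
# above on the same binary-classification input for comparison with lrModel.
dtModel = dt.fit(bInput)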
sc = SparkContext(appName="ML Example")
sc.setLogLevel("FATAL")
sqlContext = SQLContext(sc)

# Prepare training data from a list of (label, features) tuples.
training = sqlContext.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)

# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)

# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())

# We may alternatively specify parameters using a Python dictionary as a paramMap
paramMap = {lr.maxIter: 20}
paramMap[lr.maxIter] = 30  # Specify 1 Param, overwriting the original maxIter.
paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55})  # Specify multiple Params.
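# A brief sketch of how such a paramMap is then used: passing it to fit() overrides
# any parameters previously set on lr (this mirrors the fuller variants of this
# example elsewhere in this section); the name `modelWithParams` is hypothetical.
modelWithParams = lr.fit(training, paramMap)
print("Model fit using parameters: ")
print(modelWithParams.extractParamMap())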
dfTitanic = sqlContext.createDataFrame(titanic[["label", "Pclass", "Parch"]])

assembler = VectorAssembler(
    inputCols=["Pclass", "Parch"],  # ["your", "independent", "variables"]
    outputCol="features")
transformed = assembler.transform(dfTitanic)

trainingData, testData = transformed.randomSplit([0.75, 0.25])

from pyspark.ml.classification import LogisticRegression

# Create initial LogisticRegression model
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
print(lr.explainParams())

# Train model with Training Data
lrModel = lr.fit(trainingData)

# Make predictions on test data using the transform() method.
# LogisticRegression.transform() will only use the 'features' column.
predictions = lrModel.transform(testData)
predictions.printSchema()

# check predictions
predictions.take(10)

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Print intercept and coefficients
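# A sketch of the steps the comments above lead into; the evaluator configuration is
# illustrative, assuming the binary 'label' column created earlier.
print(lrModel.intercept)
print(lrModel.coefficients)

evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="rawPrediction",
                                          metricName="areaUnderROC")
print("areaUnderROC = %g" % evaluator.evaluate(predictions))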
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Pipeline").getOrCreate()

# Example: Estimator, Transformer, and Param
training = spark.createDataFrame([(1.0, Vectors.dense([0.0, 1.1, 0.1])),
                                  (0.0, Vectors.dense([2.0, 1.0, -1.0])),
                                  (0.0, Vectors.dense([2.0, 1.3, 1.0])),
                                  (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

lr = LogisticRegression(maxIter=10, regParam=0.01)
print("LogisticRegression: ", lr.explainParams())

model1 = lr.fit(training)
print("Model1 was fit using params:")
print(model1.extractParamMap())

# Build a paramMap, then combine it with a second map that renames the probability column
paramMap = {lr.maxIter: 20}
paramMap[lr.maxIter] = 30
paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55})
paramMap2 = {lr.probabilityCol: "myProbability"}
paramMapCombined = paramMap.copy()
paramMapCombined.update(paramMap2)

model2 = lr.fit(training, paramMapCombined)
print("Model2 was fit using params:")
print(model2.extractParamMap())

test = spark.createDataFrame([
    (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
    (0.0, Vectors.dense([3.0, 2.0, -0.1])),
    (1.0, Vectors.dense([0.0, 2.2, -1.5]))], ["label", "features"])

prediction = model2.transform(test)
result = prediction.select("features", "label", "myProbability", "prediction").collect()
for row in result:
    print("features=%s, label=%s -> prob=%s, prediction=%s"
          % (row.features, row.label, row.myProbability, row.prediction))
def estimator_transformer():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Prepare training data from a list of (label, features) tuples.
    training = spark.createDataFrame([(1.0, Vectors.dense([0.0, 1.1, 0.1])),
                                      (0.0, Vectors.dense([2.0, 1.0, -1.0])),
                                      (0.0, Vectors.dense([2.0, 1.3, 1.0])),
                                      (1.0, Vectors.dense([0.0, 1.2, -0.5]))],
                                     ["label", "features"])

    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)

    # Print out the parameters, documentation, and any default values.
    print("\nLogisticRegression parameters:\n" + lr.explainParams() + "\n")
    lr.setMaxIter(10).setRegParam(0.01).setAggregationDepth(5)

    # Learn a LogisticRegression model. This uses the parameters stored in lr.
    model1 = lr.fit(training)

    # Since model1 is a Model (i.e., a transformer produced by an Estimator),
    # we can view the parameters it used during fit().
    # This prints the parameter (name: value) pairs, where names are unique IDs for this
    # LogisticRegression instance.
    print("\nModel 1 was fit using parameters: ")
    print(model1.extractParamMap())

    # We may alternatively specify parameters using a Python dictionary as a paramMap
    paramMap = {lr.maxIter: 20}
    paramMap[lr.maxIter] = 30  # Specify 1 Param, overwriting the original maxIter.
    paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55})  # Specify multiple Params.

    # You can combine paramMaps, which are python dictionaries.
    paramMap2 = {lr.probabilityCol: "myProbability"}  # Change output column name
    paramMapCombined = paramMap.copy()
    paramMapCombined.update(paramMap2)

    # Now learn a new model using the paramMapCombined parameters.
    # paramMapCombined overrides all parameters set earlier via lr.set* methods.
    model2 = lr.fit(training, paramMapCombined)
    print("\nModel 2 was fit using parameters: ")
    print(model2.extractParamMap())

    # Prepare test data
    test = spark.createDataFrame([(1.0, Vectors.dense([-1.0, 1.5, 1.3])),
                                  (0.0, Vectors.dense([3.0, 2.0, -0.1])),
                                  (1.0, Vectors.dense([0.0, 2.2, -1.5]))],
                                 ["label", "features"])

    # Make predictions on test data using the Transformer.transform() method.
    # LogisticRegression.transform will only use the 'features' column.
    # Note that model2.transform() outputs a "myProbability" column instead of the usual
    # 'probability' column since we renamed the lr.probabilityCol parameter previously.
    prediction = model2.transform(test)
    result = prediction.select("features", "label", "myProbability", "prediction") \
        .collect()

    for row in result:
        print("features=%s, label=%s -> prob=%s, prediction=%s"
              % (row.features, row.label, row.myProbability, row.prediction))

    spark.stop()
# COMMAND ----------

bInput = spark.read.format("parquet").load("/databricks-datasets/definitive-guide/data/binary-classification")\
  .selectExpr("features", "cast(label as double) as label")

# COMMAND ----------

print(bInput.count())
bInput.show(5, False)

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression()
print(lr.explainParams())  # see all parameters
lrModel = lr.fit(bInput)

# COMMAND ----------

print(lrModel.coefficients)
print(lrModel.intercept)

# COMMAND ----------

summary = lrModel.summary
print(summary.areaUnderROC)
summary.roc.show()
summary.pr.show()

# COMMAND ----------
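# Illustrative follow-on, not from the original notebook: objectiveHistory on the
# training summary is a list of the objective value at each iteration, so it can be
# printed directly to inspect convergence.
for i, objective in enumerate(summary.objectiveHistory):
    print(i, objective)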
def main():
    #spark = SparkSession.builder.master("yarn").appName("spark_demo").getOrCreate()
    spark = SparkSession.builder.getOrCreate()
    print("Session created!")
    sc = spark.sparkContext
    print("The url to track the job: http://namenode-01:8088/proxy/" + sc.applicationId)

    sampleHDFS_train = sys.argv[1]
    sampleHDFS_test = sys.argv[2]
    outputHDFS = sys.argv[3]

    featureLst, colLst = getFeatureName()

    # Read the data from HDFS and convert the RDDs to DataFrames
    # Training data
    rdd_train = sc.textFile(sampleHDFS_train)
    rowRDD_train = rdd_train.map(lambda x: getDict(x.split('\t'), colLst))
    trainDF = spark.createDataFrame(rowRDD_train)
    # Test data
    rdd_test = sc.textFile(sampleHDFS_test)
    rowRDD_test = rdd_test.map(lambda x: getDict(x.split('\t'), colLst))
    testDF = spark.createDataFrame(rowRDD_test)

    # Assemble the training features listed in featureLst
    vectorAssembler = VectorAssembler().setInputCols(featureLst).setOutputCol("features")

    #### Training ####
    print("step 1")
    lr = LogisticRegression(regParam=0.01, maxIter=100)  # regParam is the regularization parameter
    pipeline = Pipeline(stages=[vectorAssembler, lr])
    model = pipeline.fit(trainDF)
    # Print the parameters
    print("\n-------------------------------------------------------------------------")
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
    print("-------------------------------------------------------------------------\n")

    #### Predict and save the results ####
    print("step 2")
    labelsAndPreds = model.transform(testDF).withColumn("probability_xj", to_array(col("probability"))[1])\
        .select("uid", "label", "prediction", "probability_xj")
    labelsAndPreds.show()
    labelsAndPreds.write.mode("overwrite").options(header="true").csv(outputHDFS + "/target/output")

    #### Evaluate precision and recall at different thresholds ####
    print("step 3")
    labelsAndPreds_label_1 = labelsAndPreds.where(labelsAndPreds.label == 1)
    labelsAndPreds_label_0 = labelsAndPreds.where(labelsAndPreds.label == 0)
    labelsAndPreds_label_1.show(3)
    labelsAndPreds_label_0.show(3)
    t_cnt = labelsAndPreds_label_1.count()
    f_cnt = labelsAndPreds_label_0.count()
    print("thre\ttp\tfn\tfp\ttn\tprecision\trecall")
    for thre in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        # tp/fn are counted over the actual positives, fp/tn over the actual negatives
        tp = labelsAndPreds_label_1.where(labelsAndPreds_label_1.probability_xj > thre).count()
        fn = t_cnt - tp
        fp = labelsAndPreds_label_0.where(labelsAndPreds_label_0.probability_xj > thre).count()
        tn = f_cnt - fp
        print("%.1f\t%d\t%d\t%d\t%d\t%.4f\t%.4f"
              % (thre, tp, fn, fp, tn, float(tp) / (tp + fp), float(tp) / t_cnt))

    # Save the model
    model.write().overwrite().save(outputHDFS + "/target/model/lrModel")
    # Load the model
    #model.load(outputHDFS + "/target/model/lrModel")

    print("output:", outputHDFS)
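    # Illustrative sketch, not in the original script: the saved pipeline model is
    # normally reloaded with PipelineModel.load rather than model.load on an
    # existing instance; the name `loadedModel` is hypothetical.
    from pyspark.ml import PipelineModel
    loadedModel = PipelineModel.load(outputHDFS + "/target/model/lrModel")
    loadedModel.transform(testDF).show(3)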