from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# 'train' and 'test' are assumed to be DataFrames with 'label' and 'features'
# columns, prepared elsewhere in the original script.
def Logistic():
    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

    # Learn a LogisticRegression model. This uses the parameters stored in lr.
    lrModel = lr.fit(train)
    lrModel.write().overwrite().save("save/bert_logistic")

    # Make predictions on test data using the Transformer.transform() method.
    # LogisticRegression.transform will only use the 'features' column.
    # The output DataFrame gains the usual 'rawPrediction', 'probability', and
    # 'prediction' columns alongside the input columns.
    predictions = lrModel.transform(test)
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Accuracy = %g " % accuracy)
# Get some stats on the datasets
df_train.describe("V1","Class").show()
df_test.describe("V1","Class").show()


# ## Specify a logistic regression model

# Use the
# [LogisticRegression](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.classification.LogisticRegression)
# class to specify a logistic regression model:
from pyspark.ml.classification import LogisticRegression
log_reg = LogisticRegression(featuresCol="Features", labelCol="Class")

# Use the `explainParams` method to get a full list of parameters:
print(log_reg.explainParams())

# ## Fit the logistic regression model

# Use the `fit` method to fit the logistic regression model on the train DataFrame,
# and use `%time` to measure how long the fit takes:
%time log_reg_model = log_reg.fit(df_train)

# The result is an instance of the
# [LogisticRegressionModel](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.classification.LogisticRegressionModel)
# class:
type(log_reg_model)

# The model parameters are stored in the `intercept` and `coefficients` attributes:
log_reg_model.intercept
log_reg_model.coefficients
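
# A minimal sketch, assuming df_test carries the same `Features` column used in
# training, of scoring the fitted model and inspecting a few predictions:
test_predictions = log_reg_model.transform(df_test)
test_predictions.select("Class", "prediction", "probability").show(5)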
Example #3

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer

# 'data' is assumed to be a DataFrame with a string 'label' column and a vector
# 'features' column, loaded earlier in the original example. The truncated first
# stage is reconstructed here as a StringIndexer fit on the label column.
labelIndexer = StringIndexer().\
    setInputCol('label').\
    setOutputCol('indexedLabel').\
    fit(data)

featureIndexer = VectorIndexer().\
    setInputCol('features').\
    setOutputCol('indexedFeatures').\
    fit(data)

# Set the parameters of the LogisticRegression algorithm
lr = LogisticRegression().\
    setLabelCol('indexedLabel').\
    setFeaturesCol('indexedFeatures').\
    setMaxIter(100).\
    setRegParam(0.3).\
    setElasticNetParam(0.8)

print('LogisticRegression parameters:\n' + lr.explainParams())

# Set up an IndexToString transformer
labelConverter = IndexToString().\
    setInputCol('prediction').\
    setOutputCol('predictedLabel').\
    setLabels(labelIndexer.labels)
# Convert the predicted indices back to their original string labels, then build an ML pipeline whose stages run in order, each stage's output feeding the next stage's input
lrPipeline = Pipeline().\
    setStages([labelIndexer, featureIndexer, lr, labelConverter])
# Randomly split the dataset into training (70%) and test (30%) sets
trainingData, testData = data.randomSplit([0.7, 0.3])
lrPipelineModel = lrPipeline.fit(trainingData)
lrPredictions = lrPipelineModel.transform(testData)
'''
A Pipeline is essentially an Estimator: calling fit() on a Pipeline produces a
PipelineModel, which is a Transformer.
'''
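
# A minimal sketch, assuming the lrPredictions DataFrame built above, of scoring
# the pipeline's output; the accuracy metric here is chosen only for illustration.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy = %g" % evaluator.evaluate(lrPredictions))
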
# Prepare training data from a list of (label, features) tuples.
training = spark.createDataFrame([(1.0, Vectors.dense([0.0, 1.1, 0.1])),
                                  (0.0, Vectors.dense([2.0, 1.0, -1.0])),
                                  (0.0, Vectors.dense([2.0, 1.3, 1.0])),
                                  (1.0, Vectors.dense([0.0, 1.2, -0.5]))],
                                 ["label", "features"])

# In[5]:

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)

# In[6]:

# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# In[7]:

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)

# In[8]:

model1

# In[9]:

# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())

# COMMAND ----------

# MAGIC %md The evaluator currently accepts 2 kinds of metrics - areaUnderROC and areaUnderPR.
# MAGIC We can set it to areaUnderPR by using evaluator.setMetricName("areaUnderPR").
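# MAGIC
# MAGIC A minimal sketch of switching the metric, assuming a `predictions` DataFrame with the default `rawPrediction` and `label` columns (those names are assumptions, not from this notebook):

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Column names below are illustrative defaults, not taken from the original notebook.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
evaluator.setMetricName("areaUnderPR")  # switch to area under the precision-recall curve
# print(evaluator.evaluate(predictions))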

# COMMAND ----------

# MAGIC %md
# MAGIC Now we will try tuning the model with the ParamGridBuilder and the CrossValidator.
# MAGIC 
# MAGIC If you are unsure what params are available for tuning, you can use explainParams() to print a list of all params.

# COMMAND ----------

print(lr.explainParams())

# COMMAND ----------

# MAGIC %md Since we specify 5 values for regParam, 4 values for maxIter, and 5 values for elasticNetParam, this grid gives CrossValidator 5 x 4 x 5 = 100 parameter settings to choose from. We will create a 5-fold cross-validator.

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.1, 0.5, 1.0, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.5, 0.8, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10, 20])
             .build())
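
# COMMAND ----------

# MAGIC %md A minimal sketch of feeding the grid above into the 5-fold CrossValidator mentioned earlier; the evaluator choice and the name of the training DataFrame (`train`) are assumptions, since the excerpt ends here.

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Illustrative wiring; `train` is a hypothetical training DataFrame.
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=5)
# cvModel = cv.fit(train)  # selects the best of the 100 parameter settings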
Example #6
print_performance_metrics(gbPredictions)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ## Cross Validation

# COMMAND ----------

# MAGIC %md
# MAGIC For each model you can run the below comand to see its params and a brief explanation of each.

# COMMAND ----------

print(lr.explainParams())

# COMMAND ----------

print(gb.explainParams())

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC #### Logistic Regression - Param Grid

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
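
# COMMAND ----------

# MAGIC %md The excerpt is cut off here; a minimal sketch of a logistic regression param grid (the specific values are illustrative assumptions, not from the original notebook):

# COMMAND ----------

lrParamGrid = (ParamGridBuilder()
               .addGrid(lr.regParam, [0.01, 0.1, 0.5])
               .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
               .build())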
Example #7
bInput = spark.read.format("parquet").load("/data/binary-classification")\
  .selectExpr("features", "cast(label as double) as label")

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression()
print(lr.explainParams())  # see all parameters
lrModel = lr.fit(bInput)

# COMMAND ----------

print(lrModel.coefficients)
print(lrModel.intercept)

# COMMAND ----------

summary = lrModel.summary
print(summary.areaUnderROC)
summary.roc.show()
summary.pr.show()

# COMMAND ----------

summary.objectiveHistory

# COMMAND ----------

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier()
print(dt.explainParams())
Example #8

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors

sc = SparkContext(appName="ML Example")
sc.setLogLevel("FATAL")
sqlContext = SQLContext(sc)

# Prepare training data from a list of (label, features) tuples.
training = sqlContext.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)

# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())

# We may alternatively specify parameters using a Python dictionary as a paramMap
paramMap = {lr.maxIter: 20}
paramMap[lr.maxIter] = 30 # Specify 1 Param, overwriting the original maxIter.
paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55}) # Specify multiple Params.
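
# The excerpt stops here; a minimal sketch of how such a paramMap is typically
# passed to fit(), mirroring the fuller example later in this collection:
model2 = lr.fit(training, paramMap)
print("Model 2 was fit using parameters: ")
print(model2.extractParamMap())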
Example #9
dfTitanic = sqlContext.createDataFrame(titanic[["label", "Pclass", "Parch"]])

assembler = VectorAssembler(
    inputCols=["Pclass", "Parch"], # ["your", "independent", "variables"],
    outputCol="features")

transformed = assembler.transform(dfTitanic)

trainingData, testData = transformed.randomSplit([0.75, 0.25])

from pyspark.ml.classification import LogisticRegression

# Create initial LogisticRegression model
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
print(lr.explainParams())

# Train model with Training Data
lrModel = lr.fit(trainingData)

# Make predictions on test data using the transform() method.
# LogisticRegression.transform() will only use the 'features' column.
predictions = lrModel.transform(testData)
predictions.printSchema()

# check predictions
predictions.take(10)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
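
# The snippet is truncated below; a minimal sketch of applying the imported
# evaluator to the predictions above (column names are the Spark defaults):
evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="rawPrediction")
print("Area under ROC = %g" % evaluator.evaluate(predictions))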

# Print intercept and coefficients
print("Intercept: ", lrModel.intercept)
print("Coefficients: ", lrModel.coefficients)
Example #10
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession

spark=SparkSession.builder.appName("Pipeline").getOrCreate()
#Example: Estimator, Transformer, and Param
training=spark.createDataFrame([(1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))],["label", "features"])
lr=LogisticRegression(maxIter=10, regParam=0.01)
print("LogisticRegression: ", lr.explainParams())
model1=lr.fit(training)
print("Model1 was fit using params:")
print(model1.extractParamMap())
paramMap={lr.maxIter:20}
paramMap[lr.maxIter]=30
paramMap.update({lr.regParam:0.1, lr.threshold:0.55})
paramMap2={lr.probabilityCol:"myProbability"}
paramMapCombined=paramMap.copy()
paramMapCombined.update(paramMap2)
model2=lr.fit(training, paramMapCombined)
print("Model2 was fit using params:")
print(model2.extractParamMap())
test=spark.createDataFrame([
    (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
    (0.0, Vectors.dense([3.0, 2.0, -0.1])),
    (1.0, Vectors.dense([0.0, 2.2, -1.5]))], ["label", "features"])
prediction=model2.transform(test)
result=prediction.select("features", "label", "myProbability", "prediction").collect()
for row in result:
    print("features=%s, label=%s -> prob=%s, prediction=%s" %
          (row.features, row.label, row.myProbability, row.prediction))
Example #11
def estimator_transformer():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    # Prepare training data from a list of (label, features) tuples.
    training = spark.createDataFrame([(1.0, Vectors.dense([0.0, 1.1, 0.1])),
                                      (0.0, Vectors.dense([2.0, 1.0, -1.0])),
                                      (0.0, Vectors.dense([2.0, 1.3, 1.0])),
                                      (1.0, Vectors.dense([0.0, 1.2, -0.5]))],
                                     ["label", "features"])

    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # Print out the parameters, documentation, and any default values.
    print("\nLogisticRegression parameters:\n" + lr.explainParams() + "\n")
    lr.setMaxIter(10).setRegParam(0.01).setAggregationDepth(5)
    # Learn a LogisticRegression model. This uses the parameters stored in lr.
    model1 = lr.fit(training)

    # Since model1 is a Model (i.e., a transformer produced by an Estimator),
    # we can view the parameters it used during fit().
    # This prints the parameter (name: value) pairs, where names are unique IDs for this
    # LogisticRegression instance.
    print("\nModel 1 was fit using parameters: ")
    print(model1.extractParamMap())

    # We may alternatively specify parameters using a Python dictionary as a paramMap
    paramMap = {lr.maxIter: 20}
    paramMap[
        lr.maxIter] = 30  # Specify 1 Param, overwriting the original maxIter.
    paramMap.update({
        lr.regParam: 0.1,
        lr.threshold: 0.55
    })  # Specify multiple Params.

    # You can combine paramMaps, which are python dictionaries.
    paramMap2 = {
        lr.probabilityCol: "myProbability"
    }  # Change output column name
    paramMapCombined = paramMap.copy()
    paramMapCombined.update(paramMap2)

    # Now learn a new model using the paramMapCombined parameters.
    # paramMapCombined overrides all parameters set earlier via lr.set* methods.
    model2 = lr.fit(training, paramMapCombined)
    print("\nModel 2 was fit using parameters: ")
    print(model2.extractParamMap())

    # Prepare test data
    test = spark.createDataFrame([(1.0, Vectors.dense([-1.0, 1.5, 1.3])),
                                  (0.0, Vectors.dense([3.0, 2.0, -0.1])),
                                  (1.0, Vectors.dense([0.0, 2.2, -1.5]))],
                                 ["label", "features"])

    # Make predictions on test data using the Transformer.transform() method.
    # LogisticRegression.transform will only use the 'features' column.
    # Note that model2.transform() outputs a "myProbability" column instead of the usual
    # 'probability' column since we renamed the lr.probabilityCol parameter previously.
    prediction = model2.transform(test)
    result = prediction.select("features", "label", "myProbability", "prediction") \
        .collect()

    for row in result:
        print("features=%s, label=%s -> prob=%s, prediction=%s" %
              (row.features, row.label, row.myProbability, row.prediction))
    spark.stop()
# COMMAND ----------

bInput = spark.read.format("parquet").load("/databricks-datasets/definitive-guide/data/binary-classification")\
  .selectExpr("features", "cast(label as double) as label")

# COMMAND ----------

print(bInput.count())
bInput.show(5, False)

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression()
print(lr.explainParams())  # see all parameters
lrModel = lr.fit(bInput)

# COMMAND ----------

print(lrModel.coefficients)
print(lrModel.intercept)

# COMMAND ----------

summary = lrModel.summary
print(summary.areaUnderROC)
summary.roc.show()
summary.pr.show()

# COMMAND ----------
Example #13
def main():
    #spark = SparkSession.builder.master("yarn").appName("spark_demo").getOrCreate()
    spark = SparkSession.builder.getOrCreate()
    print "Session created!"
    sc = spark.sparkContext
    print "The url to track the job: http://namenode-01:8088/proxy/" + sc.applicationId

    sampleHDFS_train = sys.argv[1]
    sampleHDFS_test = sys.argv[2]
    outputHDFS = sys.argv[3]

    featureLst, colLst = getFeatureName()

    # Read the data from HDFS and convert the RDDs to DataFrames
    # Training data
    rdd_train = sc.textFile(sampleHDFS_train)
    rowRDD_train = rdd_train.map(lambda x: getDict(x.split('\t'), colLst))
    trainDF = spark.createDataFrame(rowRDD_train)
    # Test data
    rdd_test = sc.textFile(sampleHDFS_test)
    rowRDD_test = rdd_test.map(lambda x: getDict(x.split('\t'), colLst))
    testDF = spark.createDataFrame(rowRDD_test)

    # featureLst holds the feature columns used for training
    vectorAssembler = VectorAssembler().setInputCols(featureLst).setOutputCol(
        "features")

    #### Training ####
    print("step 1")
    lr = LogisticRegression(regParam=0.01, maxIter=100)  # regParam: regularization strength

    pipeline = Pipeline(stages=[vectorAssembler, lr])
    model = pipeline.fit(trainDF)
    # Print the parameters
    print("\n-------------------------------------------------------------------------")
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
    print("-------------------------------------------------------------------------\n")

    #### Predict and save the results ####
    print("step 2")
    labelsAndPreds = model.transform(testDF).withColumn("probability_xj", to_array(col("probability"))[1])\
                                            .select("uid", "label", "prediction", "probability_xj")
    labelsAndPreds.show()
    labelsAndPreds.write.mode("overwrite").options(
        header="true").csv(outputHDFS + "/target/output")

    #### Evaluate precision and recall at different thresholds
    print("step 3")
    labelsAndPreds_label_1 = labelsAndPreds.where(labelsAndPreds.label == 1)
    labelsAndPreds_label_0 = labelsAndPreds.where(labelsAndPreds.label == 0)
    labelsAndPreds_label_1.show(3)
    labelsAndPreds_label_0.show(3)
    t_cnt = labelsAndPreds_label_1.count()
    f_cnt = labelsAndPreds_label_0.count()
    print "thre\ttp\ttn\tfp\tfn\taccuracy\trecall"
    for thre in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        tp = labelsAndPreds_label_1.where(
            labelsAndPreds_label_1.probability_xj > thre).count()
        tn = t_cnt - tp
        fp = labelsAndPreds_label_0.where(
            labelsAndPreds_label_0.probability_xj > thre).count()
        fn = f_cnt - fp
        print("%.1f\t%d\t%d\t%d\t%d\t%.4f\t%.4f" %
              (thre, tp, tn, fp, fn, float(tp) / (tp + fp), float(tp) /
               (t_cnt)))

    # Save the model
    model.write().overwrite().save(outputHDFS + "/target/model/lrModel")
    # Load the model
    #model.load(outputHDFS + "/target/model/lrModel")

    print "output:", outputHDFS