Example #1
# Imports assumed by this snippet (omitted in the original); `spark` is an
# active SparkSession.
from pyspark.sql.types import StructType, StructField, DoubleType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


def train():
    schema = StructType([
        StructField("Pregnancies", DoubleType()),
        StructField("Glucose", DoubleType()),
        StructField("BloodPressure", DoubleType()),
        StructField("SkinThickness", DoubleType()),
        StructField("Insulin", DoubleType()),
        StructField("BMI", DoubleType()),
        StructField("DiabetesPedigreeFunction", DoubleType()),
        StructField("Age", DoubleType()),
        StructField("Outcome", DoubleType())
    ])
    df = spark.read.schema(schema).csv("/home/admin/Downloads/diabetes.csv",
                                       header=True)
    df_assembler = VectorAssembler(inputCols=[
        'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
        'BMI', 'DiabetesPedigreeFunction', 'Age'
    ],
                                   outputCol="features")
    df = df_assembler.transform(df)
    model_df = df.select(['features', 'Outcome'])
    train_df, test_df = model_df.randomSplit([0.75, 0.25])
    rf_classifier = RandomForestClassifier(labelCol='Outcome',
                                           numTrees=50).fit(train_df)
    rf_predictions = rf_classifier.transform(test_df)
    rf_accuracy = MulticlassClassificationEvaluator(
        labelCol='Outcome', metricName='accuracy').evaluate(rf_predictions)
    print(rf_accuracy)
    # Save the fitted model (Spark writes a directory of metadata/Parquet
    # files, not a pickle file)
    rf_classifier.save("/home/admin/Downloads/RF_model")
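
# The saved model can be reloaded in a later session; a minimal sketch,
# assuming the same path used by train() above:
from pyspark.ml.classification import RandomForestClassificationModel

loaded_rf = RandomForestClassificationModel.load("/home/admin/Downloads/RF_model")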
Example #2

# Create the Random Forest model, train it, and make predictions
import datetime  # assumed import; used for the crude wall-time stamps below

# Timestamp before training
now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)

rf = RandomForestClassifier(labelCol='attack_cat_index',
                            featuresCol='features',
                            seed=1234,
                            maxBins=136,
                            maxDepth=25,
                            featureSubsetStrategy='all')
rf = rf.fit(train)

# Timestamp after training
now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)

result = rf.transform(test)

# Evaluate the predictions
evaluator = MulticlassClassificationEvaluator(labelCol="attack_cat_index",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(result)
print("Accuracy = {}".format(accuracy))

evaluator = MulticlassClassificationEvaluator(labelCol="attack_cat_index",
                                              metricName="weightedPrecision")
weightedPrecision = evaluator.evaluate(result)
print("weightedPrecision = {}".format(weightedPrecision))

evaluator = MulticlassClassificationEvaluator(labelCol="attack_cat_index",
                                              metricName="f1")
f1 = evaluator.evaluate(result)
print("f1 = {}".format(f1))
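
# Rather than constructing a fresh evaluator per metric, a single evaluator
# can be reused by switching metricName; a minimal sketch over the same
# predictions:
evaluator = MulticlassClassificationEvaluator(labelCol="attack_cat_index")
for metric in ["accuracy", "weightedPrecision", "weightedRecall", "f1"]:
    print("{} = {}".format(metric, evaluator.setMetricName(metric).evaluate(result)))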
Example #3
print('AUC ROC of Decision Tree model is %f' % evaluator1.evaluate(pred))
print('F1 score of Decision Tree model is %f' % evaluator2.evaluate(pred))
metrics.confusionMatrix().toArray().transpose()

# <a id="context323"></a>
# #### 3.2.3. Random Forest

# In[15]:

from pyspark.ml.classification import RandomForestClassifier

# Fit the model on the training data; numTrees is the main hyperparameter
rfModel = RandomForestClassifier(numTrees=100).fit(trainData)

# Make predictions on the test data
pred = rfModel.transform(testData)

pred.select('label', 'prediction').show()

evaluator1 = BinaryClassificationEvaluator(labelCol='label',
                                           metricName="areaUnderROC")
evaluator2 = MulticlassClassificationEvaluator(labelCol='label',
                                               metricName="f1")
metrics = MulticlassMetrics(pred.select('label', 'prediction').rdd.map(tuple))

print('AUC ROC of Random Forest model is %f' % evaluator1.evaluate(pred))
print('F1 score of Random Forest model is %f' % evaluator2.evaluate(pred))
metrics.confusionMatrix().toArray().transpose()
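
# Since numTrees is called out as the hyperparameter, it could be tuned
# rather than fixed at 100. A minimal sketch reusing the f1 evaluator defined
# above (the grid values are illustrative assumptions):
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

rf = RandomForestClassifier()
grid = ParamGridBuilder().addGrid(rf.numTrees, [50, 100, 200]).build()
cv = CrossValidator(estimator=rf, estimatorParamMaps=grid,
                    evaluator=evaluator2, numFolds=3)
cvModel = cv.fit(trainData)
print(cvModel.avgMetrics)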

# <a id="context4"></a>
# ## 4. Summary
Example #4
# Index the label column
# Split into training and test sets
#labelIndexer = StringIndexer(inputCol = "affairs", outputCol = "indexedLabel").fit(df)
#data = labelIndexer.transform(df)
Data = feature_model.transform(data)
print("所有的特征名称:{0}".format(Data.columns))
train_data, test_data = Data.randomSplit([0.7, 0.3], seed=1994)
print("训练样本数:%d\n测试样本数:%d" % (train_data.count(), test_data.count()))

# Random forest
rf = RandomForestClassifier(numTrees=100,
                            featuresCol='features',
                            labelCol="labels",
                            seed=7).fit(train_data)
Predictions = rf.transform(test_data)

#f1 = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='affairs',metricName='f1',metricLabel=1).evaluate(lrPredictions)
#accuracy = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='affairs',metricName='accuracy',metricLabel=1).evaluate(lrPredictions)
#weightedPrecision = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='affairs',metricName='weightedPrecision',metricLabel=1).evaluate(lrPredictions)
#weightedRecall = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='affairs',metricName='weightedRecall',metricLabel=1).evaluate(lrPredictions)

# Classification report
from sklearn.metrics import classification_report  # assumed import, omitted in the snippet
report = Predictions.select("prediction", "labels", "features",
                            "probability").toPandas()
print(
    classification_report(y_true=report['labels'],
                          y_pred=report['prediction']))
# Evaluate model performance with a confusion matrix [[TP, FN], [TN, FP]]
TP = Predictions.filter(Predictions['prediction'] == 1).filter(
    Predictions['labels'] == 1).count()
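
# The remaining confusion-matrix cells follow the same filter pattern as TP;
# a minimal sketch continuing the snippet above:
TN = Predictions.filter(Predictions['prediction'] == 0).filter(
    Predictions['labels'] == 0).count()
FP = Predictions.filter(Predictions['prediction'] == 1).filter(
    Predictions['labels'] == 0).count()
FN = Predictions.filter(Predictions['prediction'] == 0).filter(
    Predictions['labels'] == 1).count()
print("accuracy = {}".format(float(TP + TN) / (TP + TN + FP + FN)))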
Example #5

from pyspark.ml.classification import RandomForestClassifier

rf_classifier = RandomForestClassifier(labelCol='affairs',
                                       numTrees=50).fit(train_df)
rf_predictions = rf_classifier.transform(test_df)
rf_predictions.show()
rf_predictions.groupBy('prediction').count().show()
rf_predictions.select(['probability', 'affairs', 'prediction']).show(10, False)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

rf_accuracy = MulticlassClassificationEvaluator(
    labelCol='affairs', metricName='accuracy').evaluate(rf_predictions)
print('The accuracy of RF on test data is {0:.0%}'.format(rf_accuracy))
print(rf_accuracy)

rf_precision = MulticlassClassificationEvaluator(
    labelCol='affairs', metricName='weightedPrecision').evaluate(rf_predictions)
print('The precision rate on test data is {0:.0%}'.format(rf_precision))

rf_precision

rf_auc = BinaryClassificationEvaluator(
    labelCol='affairs').evaluate(rf_predictions)
print(rf_auc)

# Feature importance
rf_classifier.featureImportances
# Map the importance indices back to column names via the assembler metadata
df.schema["features"].metadata["ml_attr"]["attrs"]

# Save the model 
rf_classifier.save("C:\\Users\\Hernan\\Data Science\\SPARK\\machine-learning-with-pyspark\\chapter_6_Random_Forests\\RF_model")

from pyspark.ml.classification import RandomForestClassificationModel
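
# The import above suggests the snippet continued by reloading the saved
# model; a minimal sketch using the same path passed to save():
rf_loaded = RandomForestClassificationModel.load(
    "C:\\Users\\Hernan\\Data Science\\SPARK\\machine-learning-with-pyspark\\chapter_6_Random_Forests\\RF_model")
rf_loaded.transform(test_df).show(5)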
Example #6
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.sql import DataFrame


# Note: the fitted object returned by RandomForestClassifier.fit() is a
# RandomForestClassificationModel, which is what defines transform().
def predict(model: RandomForestClassificationModel,
            testing_data: DataFrame) -> DataFrame:
    """Node for making predictions given a pre-trained model and a testing dataset.
    """
    predictions = model.transform(testing_data)
    return predictions
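
# Hypothetical usage sketch; rf_model and testing_data are assumed to be
# produced by upstream pipeline nodes:
predictions = predict(rf_model, testing_data)
predictions.select("prediction").show(5)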
Example #7
print(kars_train.count(), kars_test.count())

# Create a Random Forest classifier
#tree = DecisionTreeClassifier(labelCol="origin_idx")
forest = RandomForestClassifier(labelCol="origin_idx", numTrees=5)

# Learn from training data
#tree = tree.fit(kars_train)
forest = forest.fit(kars_train)
print("\nforest.trees:")
for i in forest.trees:
    print(" ", i)
print()

# Make predictions on testing data
prediction = forest.transform(kars_test)

prediction.show(9, False)

print("\nforest.featureImportances:", forest.featureImportances, '\n')
# Confusion matrix
confusion_matrix = prediction.groupBy("origin_idx", "prediction").count()
confusion_matrix.show()
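
# The long-format counts above can be pivoted into a conventional matrix
# (rows = true origin_idx, columns = predicted class); a minimal sketch:
confusion_matrix.groupBy("origin_idx").pivot("prediction").sum("count").show()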

# Accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="origin_idx",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))

spark.stop()
Example #8
    # A Pipeline object that combines all the transformations we defined above.

    # Use the pipeline object to transform our dataframe
    mushrooms_trans = pipeline \
                        .fit(mushrooms) \
                        .transform(mushrooms) \
                        .cache()

    # Train-test split
    mushrooms_train, mushrooms_val = mushrooms_trans.randomSplit([0.7, 0.3],
                                                                 seed=2017)

    model = RandomForestClassifier(labelCol='poisonous', featuresCol='features', numTrees=200) \
            .fit(mushrooms_train)

    pred = model.transform(mushrooms_val)

    results = pred.select(['probability', 'prediction', 'poisonous'])
    # Select the columns relevant for evaluation
    # `results` looks like this:
    # +--------------------+----------+---------+
    # |         probability|prediction|poisonous|
    # +--------------------+----------+---------+
    # |[0.97024593961675...|       0.0|      0.0|
    # |[0.96303265951929...|       0.0|      0.0|
    # |[0.95909221894651...|       0.0|      0.0|
    # |[0.95958294573868...|       0.0|      0.0|
    # |[0.95580449199223...|       0.0|      0.0|
    # +--------------------+----------+---------+

    results_collect = results.collect()
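
    # The collected rows are presumably used for manual metric computation;
    # Spark's own evaluator can score the predictions without collecting.
    # A minimal sketch, assuming the column names shown above:
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    auc = BinaryClassificationEvaluator(labelCol='poisonous').evaluate(pred)
    print('AUC = {}'.format(auc))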
Example #9

    #----------------- Decision and Random Forest -----------------

    # Final assembly
    inputCols = ['norm_cols'
                 ] + [cname + "classVec" for cname in categorical_cols]
    final_assembler = VectorAssembler(inputCols=inputCols,
                                      outputCol='features')
    stages += [final_assembler]

    pipeline = Pipeline(stages=stages)
    # Fit the pipeline once on the training data and reuse the fitted model
    # on the test set, so indexers and encoders share one mapping
    pipeline_model = pipeline.fit(train)
    train_final = pipeline_model.transform(train)
    test_final = pipeline_model.transform(test)

    dt = DecisionTreeClassifier(featuresCol='features',
                                labelCol='label').fit(train_final)
    res_dt = dt.transform(test_final)

    rf = RandomForestClassifier(featuresCol='features',
                                labelCol='label',
                                numTrees=20).fit(train_final)
    res_rf = rf.transform(test_final)

    # res_lr comes from a logistic-regression block omitted from this excerpt
    res_lr.select('prediction', 'label').write.csv(sys.argv[2] + "lr",
                                                   header=True)
    res_dt.select('prediction', 'label').write.csv(sys.argv[2] + "dt",
                                                   header=True)
    res_rf.select('prediction', 'label').write.csv(sys.argv[2] + "rf",
                                                   header=True)

    spark.stop()
Example #10
# Apply the fitted Age imputer to the test set, then drop the raw column
dftest = impage.transform(dftest)
dftest = dftest.drop('Age')

# In[22]:

# Index Pclass, one-hot-encode it, and drop the intermediate columns
dftest = sipclass.transform(dftest)
dftest = dftest.drop('Pclass')
dftest = ohe.transform(dftest)
dftest = dftest.drop('idxPclass')

# In[23]:

# Assemble the remaining columns into a single 'features' vector
dftest = va.transform(dftest)
dftest = dftest.drop('SibSp', 'Parch', 'Fare', 'impAge', 'ohePclass')
dftest.show()

# In[24]:

# predict using random forest classifier on test data
predictions = rfc.transform(dftest)
predictions.show()

# In[25]:

# evaluate prediction results
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="Survived",
                                              predictionCol="prediction",
                                              metricName="accuracy")
evaluator.evaluate(predictions)
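
# The stage-by-stage transforms above could be chained through a single
# PipelineModel instead of manual transform/drop calls; a minimal sketch,
# assuming the fitted stages (impage, sipclass, ohe, va, rfc) from the
# earlier cells and dftest_raw as a hypothetical untransformed test frame:
from pyspark.ml import PipelineModel

pm = PipelineModel(stages=[impage, sipclass, ohe, va, rfc])
predictions = pm.transform(dftest_raw)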