# Esempio n. 1
# 0
def train():
    """Train a random-forest classifier on the Pima diabetes CSV and save it.

    NOTE(review): relies on module-level names (`spark`, `StructType`,
    `StructField`, `DoubleType`, `VectorAssembler`, `RandomForestClassifier`,
    `MulticlassClassificationEvaluator`) that are not imported in this
    fragment — confirm the original file's imports.
    """
    # Explicit schema: every column read as a double; "Outcome" is the label.
    schema = StructType([
        StructField("Pregnancies", DoubleType()),
        StructField("Glucose", DoubleType()),
        StructField("BloodPressure", DoubleType()),
        StructField("SkinThickness", DoubleType()),
        StructField("Insulin", DoubleType()),
        StructField("BMI", DoubleType()),
        StructField("DiabetesPedigreeFunction", DoubleType()),
        StructField("Age", DoubleType()),
        StructField("Outcome", DoubleType())
    ])
    df = spark.read.schema(schema).csv("/home/admin/Downloads/diabetes.csv",
                                       header=True)
    # Collapse the eight feature columns into a single "features" vector.
    df_assembler = VectorAssembler(inputCols=[
        'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
        'BMI', 'DiabetesPedigreeFunction', 'Age'
    ],
                                   outputCol="features")
    df = df_assembler.transform(df)
    model_df = df.select(['features', 'Outcome'])
    # 75/25 split; no seed, so the split differs between runs.
    train_df, test_df = model_df.randomSplit([0.75, 0.25])
    # fit() returns the trained RandomForestClassificationModel, so despite
    # its name `rf_classifier` is a fitted model, not the estimator.
    rf_classifier = RandomForestClassifier(labelCol='Outcome',
                                           numTrees=50).fit(train_df)
    rf_predictions = rf_classifier.transform(test_df)
    # Test-set accuracy via the multiclass evaluator.
    rf_accuracy = MulticlassClassificationEvaluator(
        labelCol='Outcome', metricName='accuracy').evaluate(rf_predictions)
    print(rf_accuracy)
    # Persist the trained model in Spark ML format (a directory of metadata
    # and tree data — not a pickle file, despite the original comment).
    rf_classifier.save("/home/admin/Downloads/RF_model")
import pyspark.sql.functions as func
import pyspark

# Build a local 4-core Spark context and session for training.
conf = SparkConf().setAppName("Wine Quality Prediction").setMaster("local[4]")
sc = SparkContext(conf=conf)

spark = SparkSession.builder.getOrCreate()

# Read the semicolon-delimited training CSV from S3, inferring the schema.
data = spark.read.format('csv').options(header='true', inferSchema='true', delimiter=';').csv("s3://pa2smit/TrainingDataset.csv")

print("\nPrinting Training Schema\n")
data.printSchema()
data.count()

# Every column except the (quote-mangled) label column is a feature.
# NOTE(review): the label literally named '""""quality"""""' comes from the
# CSV's embedded quotes — verify against the actual file header.
featureColumns = [col for col in data.columns if col != '""""quality"""""']

assembler = VectorAssembler(inputCols=featureColumns, outputCol='values')
transformData = assembler.transform(data)

rf = RandomForestClassifier(featuresCol='values', labelCol='""""quality"""""',numTrees=100, maxBins=484, maxDepth=25, minInstancesPerNode=5, seed=34)
rfModel = rf.fit(transformData)

# f1 on the training data itself (no held-out split in this script).
evaluator = MulticlassClassificationEvaluator(labelCol='""""quality"""""', predictionCol="prediction", metricName="f1")
rfTrainingPredictions = rfModel.transform(transformData)

print("\nModel Training Completed ...\n")
print("\nRandom Forest f1 of traning data = %g\n" % evaluator.evaluate(rfTrainingPredictions))

# BUG FIX: save the *fitted* model, not the untrained estimator `rf` —
# saving `rf` would persist only hyper-parameters, no trees.
rfModel.save("s3://pa2smit/wine_model.model")
# NOTE(review): this fragment is a different example pasted after the one
# above; `rf_accuracy`, `rf_predictions`, `rf_classifier`, `df`,
# `df_assembler` and `test_df` are never defined at module level here
# (similar names exist only inside `train()`), so these lines raise
# NameError as written.
print(rf_accuracy)

# Weighted precision on the test predictions for the 'affairs' label.
rf_precision=MulticlassClassificationEvaluator(labelCol='affairs',metricName='weightedPrecision').evaluate(rf_predictions)
print('The precision rate on test data is {0:.0%}'.format(rf_precision))

# Bare expression: a no-op in a script (only echoes a value in a notebook).
rf_precision

# Area under ROC for the binary 'affairs' label.
rf_auc=BinaryClassificationEvaluator(labelCol='affairs').evaluate(rf_predictions)
print(rf_auc)

# Feature importance (bare expressions — notebook-style output only).
rf_classifier.featureImportances
df.schema["features"].metadata["ml_attr"]["attrs"]

# Save the model 
rf_classifier.save("C:\\Users\\Hernan\\Data Science\\SPARK\\machine-learning-with-pyspark\\chapter_6_Random_Forests\\RF_model")

from pyspark.ml.classification import RandomForestClassificationModel

# Reload the saved model from disk and re-score the held-out split.
rf=RandomForestClassificationModel.load("C:\\Users\\Hernan\\Data Science\\SPARK\\machine-learning-with-pyspark\\chapter_6_Random_Forests\\RF_model")
test_df.show(5)
model_preditions=rf.transform(test_df)  # NOTE(review): "preditions" typo kept as-is
model_preditions.show()

# Score a single hand-built row through the same assembler + loaded model.
single_df = spark.createDataFrame([[5.0,33.0,5.0,1.0,5.0,0.0]], ['rate_marriage', 'age', 'yrs_married', 'children', 'religious', 'affairs'])
single_df = df_assembler.transform(single_df)
single_df = single_df.select(['features','affairs'])

model_predition=rf.transform(single_df)
model_predition.show()
# Esempio n. 4
# 0
# NOTE(review): `config` is not defined in this fragment — presumably a
# SparkConf built elsewhere; verify against the original script.
sc = SparkContext(conf=config)
myspark = SparkSession.builder.getOrCreate()

# Read the data and Print the schema
print("\nProgram Starting...\n")
defTrain = myspark.read.format('csv').options(header='true', inferSchema='true', delimiter=';').csv("s3://winedataset/TrainingDataset.csv")
print("\nTraining Schema\n")
defTrain.printSchema()
defTrain.count()

# Every column except the (quote-mangled) label column is a feature.
featureData = [col for col in defTrain.columns if (col != '""""quality"""""')]

assembler = VectorAssembler(inputCols=featureData, outputCol='features')

dataDF = assembler.transform(defTrain)

print("\n\nPrinting Training Schema with Features Table\n\n")
dataDF.printSchema()

# Random forest on the training dataset.
# NOTE(review): a *classifier* is scored below with a RegressionEvaluator
# (RMSE) — unusual, but kept as in the original.
rf = RandomForestClassifier(featuresCol='features', labelCol='""""quality"""""',
                            numTrees=100, maxBins=484, maxDepth=25, minInstancesPerNode=5, seed=34)
rfPipeline = Pipeline(stages=[assembler, rf])
# BUG FIX: the original fitted on `trainingDF`, a name never defined in this
# fragment (NameError); fit on the dataset actually read above.
rfPipelineModel = rfPipeline.fit(defTrain)
evaluator = RegressionEvaluator(
    labelCol='""""quality"""""', predictionCol="prediction", metricName="rmse")
rfTrainingPredictions = rfPipelineModel.transform(defTrain)

# BUG FIX: persist the fitted PipelineModel, not the untrained estimator
# `rf` (which carries only hyper-parameters, no trained trees).
rfPipelineModel.save("s3://myprogrambucket/rfwine_model.model")
# Esempio n. 5
# 0
# Fit the IDF stage and weight the hashed term frequencies.
# NOTE(review): `idf`, `featurized` and `PROJECT_HOME` are defined outside
# this fragment — confirm against the original file.
model = idf.fit(featurized)
result = model.transform(featurized)

#save idf and idf model
# (saving is currently disabled; the commented lines show how to persist
# and reload the IDF estimator / fitted model.)
idf_path = PROJECT_HOME + 'tmp/idf'
#idf.save(idf_path)
idfmodel_path = PROJECT_HOME + 'tmp/idfmodel'
#model.save(idfmodel_path)
#load via following
#loadedModel = IDFModel.load(idfmodel_path)

#fit single rf model
rf = RandomForestClassifier(numTrees=100, labelCol="label", seed=42)
rf_model = rf.fit(result)
# Persist both the estimator (hyper-parameters only) and the fitted model.
rf_path = PROJECT_HOME + 'tmp/rf'
rf.save(rf_path)
rfmodel_path = PROJECT_HOME + 'tmp/rfmodel'
rf_model.save(rfmodel_path)
"""
#Prepare Train Test Split
train, test = result.randomSplit([0.8, 0.2], seed=42)

# Configure an ML pipeline, which consists of tree stages: hashingTF, idf and RandomForestClassifier.
rf = RandomForestClassifier(labelCol="label", seed=42)
pipeline = Pipeline(stages=[rf])

#grid search
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [100]).addGrid(rf.maxDepth, [5]).build()

crossval = CrossValidator(estimator=pipeline,
                            estimatorParamMaps=paramGrid,
# Random 70/30 split of the training data (no seed — varies between runs).

splitValue = 0.7
trainingDF, testDF = defTrain.randomSplit([splitValue, 1 - splitValue])
print("\nSplitted Data into Training and Testing Dataset\n")

# Random forest on the training split.
# NOTE(review): a *classifier* is scored with a RegressionEvaluator (RMSE)
# — unusual, but kept as in the original.

rf = RandomForestClassifier(featuresCol='features',
                            labelCol='""""quality"""""',
                            numTrees=100,
                            maxBins=484,
                            maxDepth=25,
                            minInstancesPerNode=5,
                            seed=34)
rfPipeline = Pipeline(stages=[assembler, rf])
rfPipelineModel = rfPipeline.fit(trainingDF)
evaluator = RegressionEvaluator(labelCol='""""quality"""""',
                                predictionCol="prediction",
                                metricName="rmse")
# NOTE(review): "training" error is computed on the *full* dataset
# `defTrain`, not on `trainingDF` — confirm this is intended.
rfTrainingPredictions = rfPipelineModel.transform(defTrain)
rfTestPredictions = rfPipelineModel.transform(testDF)

print(
    "\nCompleted Model Training...\n\nRandom Forest RMSE on traning data = %g\n"
    % evaluator.evaluate(rfTrainingPredictions))
print("\nRandom Forest RMSE on test data = %g\n" %
      evaluator.evaluate(rfTestPredictions))

# BUG FIX: persist the fitted PipelineModel (as the path name implies),
# not the untrained estimator `rf`.
rfPipelineModel.save("rfPipelineModel")
# Esempio n. 7
# 0
from pyspark.ml.classification import RandomForestClassifier

# NOTE(review): `train_df` and `test_df` are not defined in this fragment —
# presumably produced by an earlier split in the original script.
rf_classifier = RandomForestClassifier(labelCol='故障', numTrees=50).fit(
    train_df
)  # numTrees=50 sets the forest size; other params include maxDepth (tree depth); fit() returns a RandomForestClassificationModel

rf_predictions = rf_classifier.transform(test_df)

print('{}{}'.format(
    '评估每个属性的重要性:',
    rf_classifier.featureImportances))  # featureImportances: importance score of each feature

# Show probability, label and prediction for the first 10 rows, untruncated.
rf_predictions.select(['probability', '故障', 'prediction']).show(10, False)

print("------查阅pyspark api,没有发现有训练准确率的字段,所以还需要计算预测的准确率------")

from pyspark.ml.evaluation import BinaryClassificationEvaluator  # binary-classification evaluator: expects raw-prediction and label columns
from pyspark.ml.evaluation import MulticlassClassificationEvaluator  # multiclass evaluator: expects prediction and label columns

# Test-set accuracy of the random forest.
rf_accuracy = MulticlassClassificationEvaluator(
    labelCol='故障', metricName='accuracy').evaluate(rf_predictions)
print('MulticlassClassificationEvaluator 随机深林测试的准确性: {0:.0%}'.format(
    rf_accuracy))

# Area under ROC on the test set (default metric of this evaluator).
rf_auc = BinaryClassificationEvaluator(labelCol='故障').evaluate(rf_predictions)
print('BinaryClassificationEvaluator 随机深林测试的准确性: {0:.0%}'.format(rf_auc))

print('-----------保持模型,用于下次使用----------------')

# Persist the trained model for later reuse.
rf_classifier.save("RF_model")