/
ML_Spark.py
101 lines (76 loc) · 4.36 KB
/
ML_Spark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import time
start_time = time.time()
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
from coalesce import coalesce
from pyspark.sql.functions import col
from pyspark.ml.feature import *
from pyspark.ml.regression import RandomForestRegressor, DecisionTreeRegressor
from pyspark.ml import *
import time
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.storagelevel import StorageLevel
import sys
reload(sys)
sys.setdefaultencoding('utf8')
print("------ CONFIG -------")
# CONF CLUSTER
conf=SparkConf()\
.set('spark.network.timeout','5000000s')\
.set('spark.executor.heartbeatInterval','4500000s')
sc = SparkContext(conf=conf)
sc.setCheckpointDir("checkpointdir2")
sqlContext=SQLContext(sc)
print("------ reading data -------")
data=sqlContext.read.parquet("data.parquet")
print(sc.uiWebUrl)
print("------ train test split --------")
(trainingData,testData)=data.randomSplit([0.7,0.3])
print("--------- Indexing categorical values for OHE------------")
codeIndexer = StringIndexer(inputCol='code', outputCol='code_index').setHandleInvalid("keep")
lieuIndexer = StringIndexer(inputCol='lieu', outputCol='lieu_index').setHandleInvalid("keep")
#putting indexed data into a OHE for a better modelling approach
ohe = OneHotEncoderEstimator(inputCols=['lieu_index','code_index'],outputCols=['lieu_category','code_category']).setHandleInvalid("keep")
#Here i combine numerical features into a vector, rescale them between 0 and 1 and then merge the new scaled numerical features with features from the OHE
numericalAssembler=VectorAssembler(inputCols=['time'],outputCol='numerical_features')
numericalScaler=MinMaxScaler(inputCol='numerical_features',outputCol='scaled_numerical_features')
assembler= VectorAssembler(inputCols=['lieu_category','code_category','scaled_numerical_features'],outputCol='features')
labelCol="label"
print("--------------- Random Forest Model definition ----------------")
rf=RandomForestRegressor(labelCol=labelCol, featuresCol='features', predictionCol='prediction', maxBins=100, numTrees=100, maxDepth=11, subsamplingRate=0.1)
print("------------- transformer pipeline --------------")
transformerStages=[codeIndexer,lieuIndexer,ohe,numericalAssembler,numericalScaler,assembler]
transformerPipeline=Pipeline(stages=transformerStages)
transformer=transformerPipeline.fit(trainingData)
transformedTrainingData=transformer.transform(trainingData).checkpoint()
trainingData.unpersist()
print("--------------- TRANSFORMER serialization ----------------")
transformer.write().overwrite().save("TransformerTestCluster")
print("------------ REGRESSION MODEL ---------------")
modelPipeline=Pipeline(stages=[rf])
model=modelPipeline.fit(transformedTrainingData)
model.write().overwrite().save("ModelTestCluster")
print("------------- READING SERIALIZED TRANSFORMER AND MODEL --------------")
rfTransformerPipeline=PipelineModel.load("TransformerTestCluster")
rfModel=PipelineModel.load("ModelTestCluster")
print("---------- Prediction on Test Data -------------")
transformedTestData = rfTransformerPipeline.transform(testData)
predictions=rfModel.transform(transformedTestData)
print("---------- MODEL EVALUATION ------------")
RMSE=RegressionEvaluator(predictionCol="prediction",labelCol=labelCol,metricName="rmse")
R2=RegressionEvaluator(predictionCol="prediction",labelCol=labelCol,metricName="r2")
MAE=RegressionEvaluator(predictionCol="prediction",labelCol=labelCol,metricName="mae")
# Calculation of 3 metrics to check performances of the regression approach (you can change the RF with other regression models)
# see other regression algorithms here: https://spark.apache.org/docs/2.2.0/ml-classification-regression.html#regression
errorRMSE=RMSE.evaluate(predictions)
errorR2=R2.evaluate(predictions)
errorMAE=MAE.evaluate(predictions)
print("-------- RMSE ---------")
print(errorRMSE)
print("-------- R2 ---------")
print(errorR2)
print("-------- MAE ---------")
print(errorMAE)
#check time into the terminal (but can be checked with tools from the cluster)
print("--- %s seconds ---" % (time.time() - start_time))