-
Notifications
You must be signed in to change notification settings - Fork 0
/
part_4.py
78 lines (57 loc) · 2.16 KB
/
part_4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from pyspark.sql import *
from pyspark.ml import *
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml import Pipeline
from builtins import round
from user_definition import *
ss = SparkSession.builder.getOrCreate()

# step 1: load train/validation sets, repartition for parallelism, and cache —
# CrossValidator re-scans the training data once per fold/param combination,
# so caching avoids re-reading the parquet files repeatedly.
train_df = ss.read.parquet(train_folder).repartition(8).cache()
valid_df = ss.read.parquet(valid_folder).repartition(8).cache()
print(train_df.count())
print('')
print(valid_df.count())
print('')

# One shared evaluator; areaUnderROC is the BinaryClassificationEvaluator
# default metric (used both inside CrossValidator and for the final report).
evaluator = BinaryClassificationEvaluator()


def _cv_fit_predict(estimator, param, values):
    """Cross-validate `estimator` over a single-parameter grid.

    Parameters:
        estimator: an untrained classifier instance.
        param: the estimator's Param object to tune (e.g. rf.numTrees).
        values: iterable of candidate values for `param`
                (from user_definition, e.g. num_trees / max_depth).

    Returns:
        (best_model, predictions) where best_model is the best fitted model
        found by n_fold cross-validation on train_df, and predictions is
        best_model applied to valid_df.
    """
    grid = ParamGridBuilder().addGrid(param, values).build()
    cv = CrossValidator(estimator=estimator,
                        evaluator=evaluator,
                        numFolds=n_fold,
                        estimatorParamMaps=grid)
    best_model = cv.fit(train_df).bestModel
    return best_model, best_model.transform(valid_df)


# step 2: random forest, tuning the number of trees.
rf = RandomForestClassifier()
rf_best, rf_predicts = _cv_fit_predict(rf, rf.numTrees, num_trees)
print('RandomForestClassifier')
# getNumTrees is a property on RandomForestClassificationModel (no call).
print(rf_best.getNumTrees)
print(round(evaluator.evaluate(rf_predicts), n_digits))
print('')

# step 3: gradient-boosted trees, tuning the maximum tree depth.
gbt = GBTClassifier()
gbt_best, gbt_predicts = _cv_fit_predict(gbt, gbt.maxDepth, max_depth)
print('GBTClassifier')
print(gbt_best.getMaxDepth())
print(round(evaluator.evaluate(gbt_predicts), n_digits))
ss.stop()
# spark-submit --executor-memory 12g
# --driver-memory 12g --executor-cores 6