-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
121 lines (96 loc) · 3.31 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""
Contains the code handling how the model is trained
"""
# external imports
import subprocess
import pyspark.sql.functions as sqlf
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.feature import (
RegexTokenizer,
StopWordsRemover,
CountVectorizer,
OneHotEncoder,
StringIndexer,
VectorAssembler,
)
# local imports
import get_config
import utilities
def train_model(data_in, model_type="LogisticRegression"):
    """
    Fit a text-classification pipeline on the supplied training data.

    The pipeline tokenises the 'Body' column, filters a fixed list of
    stop words, turns the remaining tokens into count vectors, indexes
    the 'tag' column into a numeric 'label' column and finally fits the
    model obtained from ``utilities.get_model(model_type)``.

    Args:
        data_in (SPARK dataframe) - training data; must contain the
            columns 'tag' and 'Body'
        model_type (str) - name handed to ``utilities.get_model`` to
            select the final pipeline stage
    Returns:
        The fitted pipeline model (result of ``Pipeline.fit``)
    """
    # NOTE(review): setStopWords replaces the remover's stop-word list with
    # exactly these terms; it does not extend the built-in default list.
    # Confirm that is intended.
    custom_stop_words = ["http", "https", "amp", "rt", "t", "c", "the"]

    # Body -> words
    tokenizer = RegexTokenizer(inputCol="Body", outputCol="words", pattern="\\W")

    # words -> filtered
    stop_word_filter = StopWordsRemover(inputCol="words", outputCol="filtered")
    stop_word_filter = stop_word_filter.setStopWords(custom_stop_words)

    # filtered -> features (vocabulary capped at 500 terms)
    vectorizer = CountVectorizer(
        inputCol="filtered",
        outputCol="features",
        vocabSize=500,
        minDF=1.0,
    )

    # tag -> label
    label_indexer = StringIndexer(inputCol="tag", outputCol="label")

    # presumably a pyspark.ml estimator reading 'features' / 'label';
    # verify against utilities.get_model
    classifier = utilities.get_model(model_type)

    stages = [tokenizer, stop_word_filter, vectorizer, label_indexer, classifier]
    return Pipeline(stages=stages).fit(data_in)
def train_and_eval_model(conf, spark_in, sc_in):
    """
    Train a model on the data found at the path given in the configuration,
    evaluate it on a held-out split and save it to the configured model path.

    Args:
        conf (dict) - contains the configuration; the keys read here are
            'tagged_emails_pipeline_path', 'model_type' and 'model_path'
        spark_in - SparkSession object
        sc_in - SparkContext object (not used by this function; kept so
            existing callers keep working)
    Returns:
        None. Evaluation results are printed by eval_model and the fitted
        pipeline is written to conf['model_path'].
    Raises:
        subprocess.CalledProcessError - if the 'aws s3 rm' command exits
            with a non-zero status
    """
    data = spark_in.read.parquet(conf["tagged_emails_pipeline_path"])
    # Rows without a body cannot be tokenised, so drop them up front.
    data = data.where(sqlf.col("Body").isNotNull())

    # Fixed seed keeps the 70/30 train/test split reproducible between runs.
    (trainingData, testData) = data.randomSplit([0.7, 0.3], seed=100)

    pipelineModel = train_model(trainingData, conf["model_type"])
    preds = pipelineModel.transform(testData)
    eval_model(preds, pipelineModel)

    # Remove any previously stored model before writing the new one.
    subprocess.check_call(["aws", "s3", "rm", conf["model_path"], "--recursive"])
    try:
        pipelineModel.save(conf["model_path"])
    except Exception:
        # A plain save fails when the target still exists; retry with an
        # explicit overwrite. Catching Exception (rather than a bare except)
        # lets KeyboardInterrupt / SystemExit propagate instead of
        # triggering another write.
        pipelineModel.write().overwrite().save(conf["model_path"])
def eval_model(test_preds, model):
    """
    Evaluate the ml model given its predictions on the test data and print
    the results.

    Args:
        test_preds - dataframe of transformed test data; must contain the
            columns 'prediction' and 'label'
        model - the ml pipelined model (not used by this function; kept so
            existing callers keep working)
    Returns:
        None. The confusion matrix and the weighted precision, recall and
        F1 score are printed to stdout.
    """
    metrics = MulticlassMetrics(test_preds.select("prediction", "label").rdd)

    # Overall statistics.
    # The label-less precision()/recall()/fMeasure() calls were deprecated in
    # Spark 2.0 and removed in 3.0, and where available they all reported the
    # same value (overall accuracy). The weighted variants are averaged over
    # the labels and exist across Spark versions.
    precision = metrics.weightedPrecision
    recall = metrics.weightedRecall
    f1_score = metrics.weightedFMeasure()

    print("Confusion matrix")
    print(metrics.confusionMatrix())
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1_score)
if __name__ == "__main__":
    # Script entry point: load the configuration, obtain a Spark session and
    # context, then train, evaluate and save the model.
    config = get_config.get_config()
    # NOTE(review): the string passed here is "preprocessing" although this
    # script trains the model - confirm whether that is intended.
    session, context = utilities.get_spark_session("preprocessing")
    train_and_eval_model(config, session, context)
    print("Done.")