forked from MahmoudFarid/prediction_spark
/
modifications.py
212 lines (158 loc) · 8.98 KB
/
modifications.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
import sys
import os
from datetime import datetime, date, timedelta
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, udf, row_number, last, max, round
from pyspark.sql.window import Window
from pyspark.sql.types import DateType, StringType, IntegerType, DoubleType
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.linalg import Vectors, DenseVector, VectorUDT
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
spark = SparkSession.builder.config('spark.sql.crossJoin.enabled', 'true').getOrCreate()
prediction_features = ['overall', 'doctor', 'specialty', 'procedure', 'priority']
change_to_month_func = udf(lambda record: int(datetime.strftime(datetime.strptime(record, '%d/%m/%Y'), '%Y%m')),
IntegerType())
change_to_date_func = udf(lambda record: datetime.strptime(str(record), '%Y%m'), DateType())
change_date_to_month = udf(lambda record: datetime(record.year, record.month, 1), DateType())
to_vector = udf(lambda record: Vectors.dense(record), VectorUDT())
to_vectors = udf(lambda col_a, col_b: Vectors.sparse(col_a, col_b))
negative_to_zero = udf(lambda col: 0.0 if col < 0.0 else col, DoubleType())
def perform_prediction(csv, predict_by='Overall', predict_period=3):
"""
The function is the entry point to the prediction module.
:param csv: --string: path to the csv file containing the data
:param predict_by: -- string: The choices should be 'Overall', 'Doctor', 'Specialty', 'Procedure',
and 'Priority'
:param predict_period: -- integer: The choices should be 3, 6, 12, 24, 36
:return:
"""
if predict_by.lower() not in prediction_features:
raise ValueError("Unknown prediction feature. Predict_by should be one of %s" % prediction_features)
# Reading data from CSV
csv_file = spark.read.csv(csv, header=True)
# Changing date format
date_col = get_date_column_name(csv_file.columns)
if not date_col:
raise ValueError("Date column should contain the word 'date'")
csv_file_date = csv_file.withColumn('Date', change_to_month_func(col(date_col)))
if predict_by.lower() == 'overall':
training_df = overall_prediction_grouping(csv_file_date)
model, evaluator = create_model(training_df, features_col='id', label_col='count(Date)')
prediction_dataset = create_prediction_df(training_df, predict_period)
prediction_df = model.transform(prediction_dataset)
final_prediction_df = prediction_df.select('Date', round('prediction', 0)).withColumn(
'prediction', negative_to_zero(col('round(prediction, 0)')))
# print "The mean average error is %s" % evaluator.evaluate(prediction_df, {evaluator.metricName: "mae"})
return final_prediction_df.select(change_date_to_month(col('Date')).alias('Date'), 'prediction').coalesce(
1).write.csv('prediction_%s_%s' % (predict_by, predict_period),
mode='overwrite', header=True)
else:
if predict_by.lower() not in [c.lower() for c in csv_file_date.columns]:
raise ValueError("Predict by field should be one the CSV file columns")
str_name = predict_by.lower()
column_name = str_name[0].capitalize() + str_name[1:]
training_df = specialized_prediction_grouping(csv_file_date, column=column_name)
model, evaluator = create_model(training_df, 'features', 'count(%s)' % column_name)
specialized_prediction_dataset = create_prediction_df(training_df, predict_period, other_column=column_name)
prediction_df = model.transform(specialized_prediction_dataset)
final_prediction_df = prediction_df.select('Date', round('prediction', 0), column_name).withColumn(
'prediction', negative_to_zero(col('round(prediction, 0)')))
# print "The mean average error is %s" % evaluator.evaluate(prediction_df, {evaluator.metricName: "mae"})
return final_prediction_df.select(change_date_to_month(col('Date')).alias(
'Date'), column_name, 'prediction').coalesce(1).write.csv(
'prediction_%s_%s' % (predict_by, predict_period),
mode='overwrite', header=True)
def overall_prediction_grouping(csv):
"""
Grouping dataset by date
:param csv: -- dataframe: containing all the data
:return: -- dataframe: grouped
"""
grouped = csv.groupby('Date').agg({'Date': 'count'})
grouped_with_date = grouped.withColumn('Date', change_to_date_func(col('Date')))
window_row = Window().orderBy('Date')
grouped_indexed = grouped_with_date.withColumn('id', row_number().over(window_row))
return grouped_indexed.withColumn('id', to_vector(col('id')))
def create_model(training_data, features_col, label_col):
"""
Create machine learning model
:param training_data: -- dataframe: training dataset
:param features_col: -- col: containing all the features needed.
:param label_col: -- col: label
:return: model created and its evaluator
"""
# Create Generalized Linear Regression Model
glr = GeneralizedLinearRegression()
# Create params for the model
params = ParamGridBuilder().baseOn({glr.labelCol: label_col}).baseOn({glr.featuresCol: features_col}).addGrid(
glr.family, ["gaussian", "poisson"]).build()
# Model Evaluator
glr_evaluator = RegressionEvaluator(labelCol=label_col)
# Create model with Cross Validation to get the best results
glr_cv = CrossValidator(estimator=glr, estimatorParamMaps=params, evaluator=glr_evaluator)
dt_cv_model = glr_cv.fit(training_data)
return dt_cv_model, glr_evaluator
def create_prediction_df(training_df, prediction_period, other_column=None):
"""
:param training_df: -- dataframe: for training
:param prediction_period: -- integer: number of period to predict on
:param other_column: -- col column for prediction other than the overall rank
:return: -- dataframe and corresponding dates
"""
if other_column:
last_date = training_df.select('Date').distinct().orderBy('Date').select(last('Date')).collect()[0][0]
last_id = training_df.select('id').distinct().orderBy('id').select(max('id')).collect()[0][0]
date_rows = list(Row(float(last_id + i), date(last_date.year, last_date.month, 1) + timedelta(days=i * 31))
for i in range(1, prediction_period + 1))
date_df = spark.createDataFrame(date_rows, ['id', 'Date'])
specialized_rows = training_df.select(other_column, other_column + '_idx').distinct()
prediction_df = specialized_rows.join(date_df)
assembler = VectorAssembler(inputCols=['id', other_column + '_idx'], outputCol='features')
return assembler.transform(prediction_df)
else:
last_date = training_df.orderBy('Date').select(last('Date')).collect()[0][0]
last_id = training_df.orderBy('id').select(max('id')).collect()[0][0][0]
prediction_rows = list(Row(float(last_id + i), date(last_date.year, last_date.month, 1) +
timedelta(days=i * 31)) for i in range(1, prediction_period + 1))
prediction_df = spark.createDataFrame(prediction_rows, ['id', 'Date'])
return prediction_df.withColumn('id', to_vector(col('id')))
def specialized_prediction_grouping(csv, column):
"""
Grouping dataset by date
:param csv: -- dataframe: containing all the data
:param column: -- string: column name used for aggregation
:return: -- dataframe: grouped
"""
grouped = csv.groupby('Date', column).agg({column: 'count'})
grouped_with_date = grouped.withColumn('Date', change_to_date_func(col('Date')))
window_row = Window().partitionBy(column).orderBy('Date')
grouped_with_date_and_id = grouped_with_date.withColumn('id', row_number().over(window_row))
str_indexer = StringIndexer(inputCol=column, outputCol=column + '_idx')
model_str_indexer = str_indexer.fit(grouped_with_date_and_id)
grouped_with_date_and_id_indexed = model_str_indexer.transform(grouped_with_date_and_id)
assembler = VectorAssembler(inputCols=['id', column + '_idx'], outputCol='features')
return assembler.transform(grouped_with_date_and_id_indexed)
def get_date_column_name(column_names):
"""
:param column_names: -- list: containing dataframe column names
:return: -- string: date column name
"""
for column in column_names:
if 'date' in column.lower():
return column
return ''
if __name__ == '__main__':
try:
path = sys.argv[1]
if 'csv' not in path:
raise ValueError("File should be in csv format")
if 'dmission' not in path:
raise ValueError("This file for Admissions only!")
predict_by = sys.argv[2]
period = int(sys.argv[3])
perform_prediction(path, predict_by=predict_by, predict_period=period)
except IndexError:
print "Usage: run.py <CSV path> <predict on> <period>"
sys.exit(1)