import json
import os
import pickle
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column as fc  # recipe targets TF 1.x / early 2.x (tf.estimator)

# Platform SDK imports -- module paths follow the Adobe Experience Platform
# Python SDK and may differ between SDK versions. get_client_context and
# QUERY_SERVICE_URL are expected to come from the recipe's helper modules.
from platform_sdk.dataset_reader import DatasetReader
from platform_sdk.models import Dataset
from platform_sdk.dataset_writer import DatasetWriter


def load(config_properties):
    print("Training Data Load Start")

    #########################################
    # Load Data
    #########################################
    print("QUERY_SERVICE_URL from platform sdk is ", QUERY_SERVICE_URL)
    client_context = get_client_context(config_properties)

    dataset_reader = DatasetReader(client_context,
                                   config_properties['trainingDataSetId'])

    timeframe = config_properties.get("timeframe")
    tenant_id = config_properties.get("tenant_id")
    if timeframe is not None:
        # Restrict the read to rows whose date falls inside the lookback window
        date_before = datetime.utcnow().date()
        date_after = date_before - timedelta(minutes=int(timeframe))
        dataframe = dataset_reader.where(
            dataset_reader[tenant_id + '.date'].gt(str(date_after))
            .And(dataset_reader[tenant_id + '.date'].lt(str(date_before)))).read()
    else:
        dataframe = dataset_reader.read()

    #########################################
    # Data Preparation/Feature Engineering
    #########################################
    if '_id' in dataframe.columns:
        # Rename columns to strip the tenant ID prefix
        dataframe = dataframe.rename(
            columns=lambda x: str(x)[str(x).find('.') + 1:])
        # Drop _id, eventType and timestamp
        dataframe.drop(['_id', 'eventType', 'timestamp'], axis=1, inplace=True)
    print(dataframe.head())

    # Derive week/year features from the date column
    # (dt.week assumes pandas < 2.0; newer pandas uses dt.isocalendar().week)
    dataframe.date = pd.to_datetime(dataframe.date)
    dataframe['week'] = dataframe.date.dt.week
    dataframe['year'] = dataframe.date.dt.year

    # One-hot encode the store type and binarize the holiday flag
    dataframe = pd.concat(
        [dataframe, pd.get_dummies(dataframe['storeType'])], axis=1)
    dataframe.drop('storeType', axis=1, inplace=True)
    dataframe['isHoliday'] = dataframe['isHoliday'].astype(int)

    # 45-row lead/lag sales features plus the relative change against the lag
    dataframe['weeklySalesAhead'] = dataframe.shift(-45)['weeklySales']
    dataframe['weeklySalesLag'] = dataframe.shift(45)['weeklySales']
    dataframe['weeklySalesDiff'] = (
        dataframe['weeklySales'] - dataframe['weeklySalesLag']
    ) / dataframe['weeklySalesLag']
    dataframe.dropna(axis=0, inplace=True)

    # Index by date so the time-based train/validation split can slice by label
    dataframe = dataframe.set_index(dataframe.date)
    dataframe.drop('date', axis=1, inplace=True)

    print("Training Data Load Finish")
    return dataframe
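# A minimal sketch of calling load() directly, assuming a hand-built
# configuration dict; the dataset ID and tenant ID values below are
# hypothetical placeholders (the ML Runtime normally supplies them).
sample_config = {
    'trainingDataSetId': '<training-dataset-id>',
    'tenant_id': '_yourtenant',
    'timeframe': '10080',  # 7-day lookback in minutes; omit to read all rows
}
training_df = load(sample_config)
print(training_df.shape)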
def train(self, config={}):
    #########################################
    # Set Up
    #########################################
    # Reset TF_CONFIG to an empty config so the estimator trains single-machine
    # (reading os.environ['TF_CONFIG'] first would raise KeyError when unset)
    tf_config = json.loads('{}')
    os.environ['TF_CONFIG'] = json.dumps(tf_config)

    #########################################
    # Load Data
    #########################################
    client_context = get_client_context(config)
    dataset_reader = DatasetReader(client_context, config['trainingDataSetId'])
    dataframe = dataset_reader.read()

    #########################################
    # Data Preparation/Feature Engineering
    #########################################
    if '_id' in dataframe.columns:
        # Rename columns to strip the tenant ID prefix
        dataframe = dataframe.rename(
            columns=lambda x: str(x)[str(x).find('.') + 1:])
        # Drop _id, eventType and timestamp
        dataframe.drop(['_id', 'eventType', 'timestamp'], axis=1, inplace=True)

    dataframe.date = pd.to_datetime(dataframe.date)
    dataframe['week'] = dataframe.date.dt.week
    dataframe['year'] = dataframe.date.dt.year

    dataframe = dataframe.sort_values(by=['date', 'store'])
    dataframe = pd.concat(
        [dataframe, pd.get_dummies(dataframe['storeType'])], axis=1)
    dataframe.drop('storeType', axis=1, inplace=True)
    dataframe['isHoliday'] = dataframe['isHoliday'].astype(int)

    dataframe['weeklySalesAhead'] = dataframe.shift(-45)['weeklySales']
    dataframe['weeklySalesLag'] = dataframe.shift(45)['weeklySales']
    dataframe['weeklySalesDiff'] = (
        dataframe['weeklySales'] - dataframe['weeklySalesLag']
    ) / dataframe['weeklySalesLag']
    dataframe.dropna(axis=0, inplace=True)

    dataframe = dataframe.set_index(dataframe.date)
    dataframe.drop('date', axis=1, inplace=True)

    #########################################
    # Train / Validation Split
    #########################################
    train_start = '2010-02-12'
    train_end = '2012-01-27'
    val_start = '2012-02-03'
    train = dataframe[train_start:train_end]
    val = dataframe[val_start:]

    X_train = train.drop('weeklySalesAhead', axis=1)
    y_train = train['weeklySalesAhead'].values

    X_val = val.drop('weeklySalesAhead', axis=1)
    y_val = val['weeklySalesAhead'].values

    # Every remaining column is numeric after the encoding above
    features = []
    for feature in X_train.columns:
        features.append(fc.numeric_column(feature, dtype=tf.float32))

    def gen_input_fn(features, labels, epochs=10, shuffle=True, batch_size=32):
        def input_function():
            dataset = tf.data.Dataset.from_tensor_slices(
                (dict(features), labels))
            if shuffle:
                dataset = dataset.shuffle(1000)
            dataset = dataset.batch(batch_size).repeat(epochs)
            return dataset
        return input_function

    train_input_fn = gen_input_fn(X_train, y_train)
    eval_input_fn = gen_input_fn(X_val, y_val, shuffle=False, epochs=1)

    #########################################
    # BoostedTreesRegressor Model
    #########################################
    learning_rate = float(config['learning_rate'])
    n_estimators = int(config['n_estimators'])
    max_depth = int(config['max_depth'])

    filename = config['modelPATH'] + '/my_model'
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    model = tf.estimator.BoostedTreesRegressor(features,
                                               model_dir=filename,
                                               n_batches_per_layer=5,
                                               n_trees=n_estimators,
                                               max_depth=max_depth,
                                               learning_rate=learning_rate)
    model.train(train_input_fn, max_steps=n_estimators)

    #########################################
    # Process Metrics
    #########################################
    pred_dict = list(model.predict(eval_input_fn))
    y_pred = pd.Series([pred['predictions'][0] for pred in pred_dict])

    mape = np.mean(np.abs((y_val - y_pred) / y_val))
    mae = np.mean(np.abs(y_val - y_pred))
    rmse = np.sqrt(np.mean((y_val - y_pred) ** 2))

    metrics_dict = {
        'MAPE': round(mape, 3),
        'MAE': round(mae, 3),
        'RMSE': round(rmse, 3),
    }
    with open(os.path.join(config['modelPATH'], 'metrics_dict.pkl'), 'wb') as f:
        pickle.dump(metrics_dict, f)
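# A minimal sketch of reading back the metrics train() pickles, assuming the
# same 'modelPATH' value that was passed to train(); './model' is a
# placeholder path.
with open(os.path.join('./model', 'metrics_dict.pkl'), 'rb') as f:
    metrics = pickle.load(f)
print(metrics)  # e.g. {'MAPE': ..., 'MAE': ..., 'RMSE': ...}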
def score(self, config={}):
    # Reset TF_CONFIG to an empty config, mirroring the setup in train()
    tf_config = json.loads('{}')
    os.environ['TF_CONFIG'] = json.dumps(tf_config)

    #########################################
    # Load Data
    #########################################
    client_context = get_client_context(config)
    dataset_reader = DatasetReader(client_context, config['scoringDataSetId'])
    dataframe = dataset_reader.read()

    #########################################
    # Data Preparation/Feature Engineering
    #########################################
    if '_id' in dataframe.columns:
        # Rename columns to strip the tenant ID prefix
        dataframe = dataframe.rename(
            columns=lambda x: str(x)[str(x).find('.') + 1:])
        # Drop _id, eventType and timestamp
        dataframe.drop(['_id', 'eventType', 'timestamp'], axis=1, inplace=True)

    dataframe.date = pd.to_datetime(dataframe.date)
    dataframe['week'] = dataframe.date.dt.week
    dataframe['year'] = dataframe.date.dt.year

    dataframe = dataframe.sort_values(by=['date', 'store'])
    dataframe = pd.concat(
        [dataframe, pd.get_dummies(dataframe['storeType'])], axis=1)
    dataframe.drop('storeType', axis=1, inplace=True)
    dataframe['isHoliday'] = dataframe['isHoliday'].astype(int)

    dataframe['weeklySalesAhead'] = dataframe.shift(-45)['weeklySales']
    dataframe['weeklySalesLag'] = dataframe.shift(45)['weeklySales']
    dataframe['weeklySalesDiff'] = (
        dataframe['weeklySales'] - dataframe['weeklySalesLag']
    ) / dataframe['weeklySalesLag']
    dataframe.dropna(axis=0, inplace=True)

    dataframe = dataframe.set_index(dataframe.date)
    dataframe.drop('date', axis=1, inplace=True)

    #########################################
    # Test Set / Input Function
    #########################################
    X_test = dataframe.drop('weeklySalesAhead', axis=1)
    y_test = dataframe['weeklySalesAhead'].values

    features = []
    for feature in X_test.columns:
        features.append(fc.numeric_column(feature, dtype=tf.float32))

    def gen_input_fn(features, labels, epochs=10, shuffle=True, batch_size=32):
        def input_function():
            dataset = tf.data.Dataset.from_tensor_slices(
                (dict(features), labels))
            if shuffle:
                dataset = dataset.shuffle(1000)
            dataset = dataset.batch(batch_size).repeat(epochs)
            return dataset
        return input_function

    test_input_fn = gen_input_fn(X_test, y_test, shuffle=False, epochs=1)

    #########################################
    # BoostedTreesRegressor Model
    #########################################
    # model_dir must point at the checkpoint directory written during training
    model = tf.estimator.BoostedTreesRegressor(
        features, n_batches_per_layer=5, model_dir=config['modelPATH'])

    #########################################
    # Write Results
    #########################################
    pred_dict = list(model.predict(test_input_fn))
    y_pred = pd.Series([pred['predictions'][0] for pred in pred_dict])

    X_test['prediction'] = y_pred.values
    output = X_test[['store', 'prediction']].reset_index()
    output['date'] = output['date'].astype(str)

    client_context = get_client_context(config)
    tenant_id = config['tenantId']

    # Re-apply the tenant ID prefix and add the required system columns
    output = output.add_prefix(tenant_id + '.')
    output = output.join(pd.DataFrame(
        {
            '_id': '',
            'timestamp': '2019-01-01T00:00:00',
            'eventType': ''
        }, index=output.index))

    dataset = Dataset(client_context).get_by_id(
        config['scoringResultsDataSetId'])
    dataset_writer = DatasetWriter(client_context, dataset)
    dataset_writer.write(output, file_format='json')
    print('Write Done')
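# A minimal sketch of driving train() and score() end to end, assuming a
# hypothetical Pipeline wrapper class (the real entry points are resolved by
# the Data Science Workspace ML Runtime) and placeholder dataset IDs. Note
# that train() checkpoints under modelPATH + '/my_model' while score() reads
# modelPATH directly; in the runtime these values are supplied per phase.
class Pipeline:
    train = train
    score = score

config = {
    'trainingDataSetId': '<training-dataset-id>',
    'scoringDataSetId': '<scoring-dataset-id>',
    'scoringResultsDataSetId': '<results-dataset-id>',
    'tenantId': '_yourtenant',
    'learning_rate': '0.1',
    'n_estimators': '100',
    'max_depth': '3',
    'modelPATH': './model',
}
pipeline = Pipeline()
pipeline.train(config)
pipeline.score(config)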