def test_score(predicted, actual, plot_file):
    scores = []
    for i in range(len(predicted)):
        scores.append(utils.smape(actual=actual[i], predicted=predicted[i]))

    plt.figure(figsize=(10, 8))
    plt.plot(range(len(scores)), scores, 'b-', label='score')
    plt.legend(loc='lower right')
    plt.savefig(plot_file)
    plt.show()

    return scores
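# test_score above relies on utils.smape, which is defined elsewhere in the
# project. A minimal sketch of a typical SMAPE implementation follows; the name
# smape_sketch and the exact zero-handling are assumptions for illustration,
# not the project's actual utils.smape.
import numpy as np

def smape_sketch(actual, predicted):
    """Symmetric mean absolute percentage error, in percent (range 0-200)."""
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    denominator = (np.abs(actual) + np.abs(predicted)) / 2.0
    diff = np.abs(predicted - actual)
    # terms where both values are zero are defined as zero error
    ratio = np.divide(diff, denominator, out=np.zeros_like(diff), where=denominator != 0)
    return 100.0 * np.mean(ratio)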
def error(self, data, times=None, metric='mape'):
    """
    Model prediction error.

    Parameters
    ----------
    metric : str
        Error metric to use. It can be "mape", "smape", "logaccratio",
        and "rmse". Default: mape.
    """
    if times is None:
        times = numpy.arange(len(data))
    y = self.simulate(times)
    if metric == 'mape':
        return mape(y, data)
    elif metric == 'smape':
        return smape(y, data)
    elif metric == 'logaccratio':
        return logaccratio(y, data)
    elif metric == 'rmse':
        return numpy.sqrt(self.cost_)
    else:
        raise ValueError("No such metric: {}".format(metric))
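# The error() method above dispatches to module-level metric helpers defined
# elsewhere. The sketches below show what mape() and logaccratio() commonly
# compute; the names and exact formulas are assumptions, not necessarily this
# module's definitions.
import numpy

def mape_sketch(y, data):
    """Mean absolute percentage error of predictions y against observations data (assumes non-zero observations)."""
    y = numpy.asarray(y, dtype=float)
    data = numpy.asarray(data, dtype=float)
    return 100.0 * numpy.mean(numpy.abs((data - y) / data))

def logaccratio_sketch(y, data):
    """Mean absolute log accuracy ratio, |log(predicted / actual)| (assumes positive values)."""
    y = numpy.asarray(y, dtype=float)
    data = numpy.asarray(data, dtype=float)
    return numpy.mean(numpy.abs(numpy.log(y / data)))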
def stacked_lstm_multi_step_forecast(series,
                                     validation_series,
                                     input_length,
                                     horizon,
                                     del_outliers=False,
                                     normalize=False,
                                     plot=False):
    """
    Perform forecasting of a time series using an lstm neural network. The network is trained using samples of shape
    input_length (corresponding to the last input_length days) to predict an array of horizon values (corresponding to
    horizon days). In this case, the network predicts horizon days at a time.

    Performance of the trained network is assessed on a validation series. The size of the validation series must be
    horizon.

    :param series: training time series
    :param validation_series: validation time series of length horizon
    :param input_length: number of past days used as input for the network
    :param horizon: number of days to forecast
    :param del_outliers: whether to remove outliers from the training series
    :param normalize: whether to normalize the training series
    :param plot: whether to plot the forecast against the validation series
    :return: SMAPE for the validation series, the forecast validation series
    """

    # whether to remove outliers in the training series
    if del_outliers:
        working_series = remove_outliers(series)
    else:
        working_series = series

    # whether to normalize the training series
    if normalize:
        scaler, working_series = normalize_series(working_series)
    else:
        scaler = None

    # input sequence is our data, np.log1p is applied to the data and mae error is used to approximate SMAPE error
    train_series = np.log1p(working_series)

    # we use the last n_steps_in days as input and predict n_steps_out
    n_steps_in, n_steps_out = input_length, horizon

    # split into samples
    train_samples, train_targets = split_sequence(train_series, n_steps_in, n_steps_out)

    # here we work with the original series so only the actual values
    n_features = 1
    train_samples = train_samples.reshape((train_samples.shape[0], train_samples.shape[1], n_features))

    # create the model
    model = Sequential()
    model.add(LSTM(256, activation='relu', input_shape=(n_steps_in, n_features)))

    # we predict n_steps_out values
    model.add(Dense(n_steps_out))

    # we use 'mae' with data transformed with log1p and expm1 to approach SMAPE error
    model.compile(optimizer='adam', loss='mae')

    # fit model
    model.fit(train_samples, train_targets, epochs=200, verbose=0)

    # perform prediction
    # input is the last n_steps_in values of the train series (working_series is not log1p transformed)
    validation_in_sample = np.log1p(np.array(working_series.values[-n_steps_in:]))
    validation_in_sample = validation_in_sample.reshape((1, n_steps_in, n_features))
    validation_forecast = model.predict(validation_in_sample, verbose=0)

    # dataframe which contains the result
    forecast_dataframe = pd.DataFrame(index=validation_series.index)

    # if data was normalized, we need to apply the reverse transform
    if normalize:
        # first reverse log1p using expm1
        validation_forecast = np.expm1(validation_forecast)

        # use scaler to reverse normalizing
        denormalized_forecast = scaler.inverse_transform(validation_forecast.reshape(-1, 1))
        denormalized_forecast = [val[0] for val in denormalized_forecast]

        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = denormalized_forecast
    else:
        # save the forecast in the dataframe (take the first row of the (1, horizon) prediction)
        forecast_dataframe['forecast'] = np.expm1(validation_forecast[0])

    if plot:
        plt.figure(figsize=(10, 6))
        plt.plot(series[-100:], color="blue", linestyle="-")
        plt.plot(validation_series, color="green", linestyle="-")
        plt.plot(forecast_dataframe, color="red", linestyle="--")
        plt.legend(["Train series", "Validation series", "Predicted series"])
        plt.title("Validation of LSTM with input size " + str(n_steps_in) + " output size " + str(n_steps_out))
        plt.show()

    return smape(validation_series, forecast_dataframe['forecast']), forecast_dataframe['forecast']
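# split_sequence is defined elsewhere in the project. A minimal sketch of the
# usual sliding-window construction it is assumed to follow; the name
# split_sequence_sketch is hypothetical and for illustration only.
import numpy as np

def split_sequence_sketch(sequence, n_steps_in, n_steps_out):
    """Split a 1-D sequence into (samples, targets) windows for supervised learning."""
    values = np.asarray(sequence)
    samples, targets = [], []
    for start in range(len(values) - n_steps_in - n_steps_out + 1):
        # an input window of n_steps_in values followed by n_steps_out target values
        samples.append(values[start:start + n_steps_in])
        targets.append(values[start + n_steps_in:start + n_steps_in + n_steps_out])
    return np.array(samples), np.array(targets)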
def nn_with_past_outliers_single_step_forecast(series,
                                               validation_series,
                                               input_length,
                                               horizon,
                                               del_outliers=False,
                                               normalize=False,
                                               plot=False):
    """
    Perform forecasting of a time series using a simple neural network with a single 128 neurons hidden layer. The
    network is trained using samples of shape input_length (corresponding to the last input_length days) to predict an
    array of horizon values (corresponding to horizon days). In this case, the network predicts one day at a time.

    Performance of the trained network is assessed on a validation series. This is computed by repeating one day
    predictions and shifting the input values. The size of the validation series must be horizon.

    This function differs from nn_single_step_forecast in that, in addition to the last input_length days, we also use
    the value from the same day the previous year as an input to the network. This value is normalized but contains
    the outliers. The hope is to gain information from the previous year.

    :param series: training time series
    :param validation_series: validation time series of length horizon
    :param input_length: number of past days used as input for the network
    :param horizon: number of days to forecast
    :param del_outliers: whether to remove outliers from the training series
    :param normalize: whether to normalize the training series
    :param plot: whether to plot the forecast against the validation series
    :return: SMAPE for the validation series, the forecast validation series
    """

    # whether to remove outliers in the training series
    if del_outliers:
        working_series = remove_outliers(series)
    else:
        working_series = series

    # whether to normalize the training series
    if normalize:
        scaler, working_series = normalize_series(working_series)
        scaler_bis, working_series_with_outliers = normalize_series(series)
    else:
        scaler = None
        working_series_with_outliers = series

    # input sequence is our data, np.log1p is applied to the data and mae error is used to approximate SMAPE error
    train_series = np.log1p(working_series)

    # we use the last n_steps_in days as input and predict one step
    n_steps_in, n_steps_out = input_length, 1

    # split into samples, using sample from previous year
    # implementation from multi steps can be used here since single step is special case of multi steps
    train_samples, train_targets = split_sequence_nn_with_past_outliers_multi_step(
        train_series, working_series_with_outliers, n_steps_in, n_steps_out)

    # create the model
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=n_steps_in + 1))

    # we predict n_steps_out values
    model.add(Dense(n_steps_out))

    # we use 'mae' with data transformed with log1p and expm1 to approach SMAPE error
    model.compile(optimizer='adam', loss='mae')

    # fit model
    model.fit(train_samples, train_targets, epochs=200, verbose=0)

    # perform prediction
    # we start by transforming the normalized series into log1p, new one day predictions will be added to this series
    # as we predict them and these predictions will be used for the next forecasting step
    working_series_values = np.log1p(working_series.values)

    # perform horizon predictions
    for i in range(horizon):
        # input contains the value from the previous year for the forecast day
        validation_in_sample = np.append(np.array(working_series_with_outliers[-365 + 1]),
                                         np.array(working_series_values[-n_steps_in:]))
        validation_in_sample = validation_in_sample.reshape((1, n_steps_in + 1))
        validation_forecast = model.predict(validation_in_sample, verbose=0)
        working_series_values = np.append(working_series_values, validation_forecast)
        working_series_with_outliers = np.append(working_series_with_outliers, validation_forecast)

    # take last horizon values from the series (this is the forecast for the validation series)
    validation_forecast = working_series_values[-horizon:]

    # dataframe which contains the result
    forecast_dataframe = pd.DataFrame(index=validation_series.index)

    # if data was normalized, we need to apply the reverse transform
    if normalize:
        # first reverse log1p using expm1
        validation_forecast = np.expm1(validation_forecast)

        # use scaler to reverse normalizing
        denormalized_forecast = scaler.inverse_transform(validation_forecast.reshape(-1, 1))
        denormalized_forecast = [val[0] for val in denormalized_forecast]

        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = denormalized_forecast
    else:
        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = np.expm1(validation_forecast)

    if plot:
        plt.figure(figsize=(10, 6))
        plt.plot(series[-100:], color="blue", linestyle="-")
        plt.plot(validation_series, color="green", linestyle="-")
        plt.plot(forecast_dataframe, color="red", linestyle="--")
        plt.legend(["Train series", "Validation series", "Predicted series"])
        plt.title("Validation of simple NN with input size " + str(n_steps_in) + " output size " + str(n_steps_out))
        plt.show()

    return smape(validation_series, forecast_dataframe['forecast']), forecast_dataframe['forecast']
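# normalize_series is used by the forecasting functions above but defined
# elsewhere. A plausible sketch using scikit-learn's MinMaxScaler follows; the
# name normalize_series_sketch and the [0, 1] range are assumptions about the
# helper, not its actual implementation.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def normalize_series_sketch(series):
    """Scale a pandas Series to [0, 1]; return the fitted scaler and the scaled series."""
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_values = scaler.fit_transform(series.values.reshape(-1, 1))
    scaled_series = pd.Series(scaled_values.ravel(), index=series.index)
    return scaler, scaled_series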
def train_model(city,
                air_code,
                model_name='model_a',
                train_mother=True,
                train_child=True,
                test=False,
                special_date=False,
                special_startday=None,
                special_endday=None,
                min_aq=-999,
                max_aq=1e9):
    # prepare training data set
    #####################################################################
    meo_codes = [
        'temperature', 'pressure', 'humidity', 'wind_direction',
        'wind_speed/kph'
    ]
    aq_stations_file = './data/{}_aq_stations.csv'.format(city.lower())
    aq_data_root = './data/from_aq/'
    meo_data_root = './data/from_grid/'
    #####################################################################

    ###############################################################
    window = 1 * 24
    predict_step = 1 * 24
    predict_hours = 2 * 24
    normalized = 2
    file_norm = './data/norm_pars_{}_{}_{}.csv'.format(city.lower(), air_code, model_name)
    folder_norm = os.path.join('./model/', model_name, city, air_code)
    ##############################################################

    if not os.path.isdir(folder_norm):
        os.makedirs(folder_norm)

    ########################################################################
    train_rate = 0.8
    ######################################################################

    x_stations_obs = {}
    x_stations_pre = {}
    y_stations = {}
    all_data = []
    total_samples = 0

    data_aq_station = pd.read_csv(aq_stations_file, usecols=[1])
    aq_stations = data_aq_station.values

    for station in aq_stations:
        meo_file = os.path.join(
            meo_data_root, 'from_grid_{}_{}.csv'.format(city.lower(), station[0]))
        aq_file = os.path.join(
            aq_data_root, 'from_aq_{}_{}.csv'.format(city.lower(), station[0]))

        x_obs = []
        x_pre = []
        y = []

        data_aq = pd.read_csv(aq_file)
        data_aq = data_aq[air_code]
        data_aq = data_aq.values
        data_aq = data_aq.astype('float32')

        data_meo = pd.read_csv(meo_file)
        data_meo = data_meo[meo_codes]
        data_meo = data_meo.values
        data_meo = data_meo.astype('float32')

        len_hours = min(data_aq.shape[0], data_meo.shape[0])
        data_aq = data_aq.reshape((data_aq.shape[0], 1))
        data = np.concatenate(
            (data_aq[:len_hours, :], data_meo[:len_hours, :]), axis=1)
        all_data.append(data)

        for i in range(0, (len_hours - window - predict_hours), predict_step):
            if min(data[i:i + window + predict_hours, 0]) < 0 or min(
                    data[i:i + window + predict_hours, 0]) < min_aq or max(
                        data[i:i + window + predict_hours, 0]) > max_aq:
                continue
            day = i // 24
            if special_date:
                if day >= special_startday and day < special_endday:
                    y.append(data[i + window:i + window + predict_hours, 0])
                    x_obs.append(data[i:i + window, :])
                    x_pre.append(data[i + window:i + window + predict_hours, 1:])
            else:
                y.append(data[i + window:i + window + predict_hours, 0])
                x_obs.append(data[i:i + window, :])
                x_pre.append(data[i + window:i + window + predict_hours, 1:])

        y = np.array(y)
        x_obs = np.array(x_obs)
        x_pre = np.array(x_pre)
        x_stations_obs[station[0]] = x_obs
        x_stations_pre[station[0]] = x_pre
        y_stations[station[0]] = y
        total_samples += y.shape[0]

    data_for_statistic = all_data[0]
    for i in range(1, len(all_data)):
        data_for_statistic = np.concatenate((data_for_statistic, all_data[i]))
    means = data_for_statistic.mean(axis=0)
    stds = data_for_statistic.std(axis=0)
    maxs = data_for_statistic.max(axis=0)
    mins = data_for_statistic.min(axis=0)

    with open(file_norm, 'w') as f:
        f.write(',')
        f.write(air_code + ',')
        for code in meo_codes:
            f.write(code + ',')
        f.write('\n')
        f.write('means:,')
        for mean in means:
            f.write(str(mean) + ',')
        f.write('\n')
        f.write('stds:,')
        for std in stds:
            f.write(str(std) + ',')
        f.write('\n')
        f.write('maxs:,')
        for max_ in maxs:
            f.write(str(max_) + ',')
        f.write('\n')
        f.write('mins:,')
        for min_ in mins:
            f.write(str(min_) + ',')
        f.write('\n')

    if mins[0] < 0:
        mins[0] = 0
    norm_y = utils.normalization(normalized, means[0], stds[0], maxs[0], mins[0])
    norm_x_obs = utils.normalization(normalized, means, stds, maxs, mins)
    norm_x_pre = utils.normalization(normalized, means[1:], stds[1:], maxs[1:], mins[1:])

    if train_mother:
        norm_y.save(os.path.join(folder_norm, 'norm_y.json'))
        norm_x_obs.save(os.path.join(folder_norm, 'norm_x_obs.json'))
        norm_x_pre.save(os.path.join(folder_norm, 'norm_x_pre.json'))

    if special_date:
        for station in aq_stations:
            key = station[0]
            if y_stations[key].shape[0] == 0:
                continue
            days = np.array(range(y_stations[key].shape[0]))
            np.random.shuffle(days)
            x_stations_obs[key] = x_stations_obs[key][days]
            x_stations_pre[key] = x_stations_pre[key][days]
            y_stations[key] = y_stations[key][days]

    i = 0
    for station in aq_stations:
        key = station[0]
        if y_stations[key].shape[0] == 0:
            continue
        train_row = round(train_rate * y_stations[key].shape[0])
        if i == 0:
            x_train_1 = x_stations_obs[key][:train_row, :, :]
            x_train_2 = x_stations_pre[key][:train_row, :, :]
            y_train = y_stations[key][:train_row, :]
            x_test_1 = x_stations_obs[key][train_row:, :, :]
            x_test_2 = x_stations_pre[key][train_row:, :, :]
            y_test = y_stations[key][train_row:, :]
        else:
            x_train_1 = np.concatenate(
                (x_train_1, x_stations_obs[key][:train_row, :, :]))
            x_train_2 = np.concatenate(
                (x_train_2, x_stations_pre[key][:train_row, :, :]))
            y_train = np.concatenate((y_train, y_stations[key][:train_row, :]))
            x_test_1 = np.concatenate(
                (x_test_1, x_stations_obs[key][train_row:, :, :]))
            x_test_2 = np.concatenate(
                (x_test_2, x_stations_pre[key][train_row:, :, :]))
            y_test = np.concatenate((y_test, y_stations[key][train_row:, :]))
        i += 1

    x_train_1 = norm_x_obs(x_train_1)
    x_train_2 = norm_x_pre(x_train_2)
    y_train = norm_y(y_train)
    x_test_1 = norm_x_obs(x_test_1)
    x_test_2 = norm_x_pre(x_test_2)
    y_test = norm_y(y_test)

    # parameters
    ######################################################################
    lr = 1e-5
    batch_size = 512
    epochs = 2000
    dim_rnn = [256, 256, 512]
    dim_dense = [512, 256, 128, 64]
    drop = 0.2
    activations = ['relu', 'sigmoid']
    root_model = os.path.join('./model/', model_name, city, air_code)
    model_structure_file = './model/{}/{}.png'.format(model_name, model_name)
    #####################################################################

    if not os.path.isdir(root_model):
        os.mkdir(root_model)

    test_total_scores = {}

    # -----------------------------------------------------------------------------------------------------
    '''
    train a model with data in all stations
    '''
    if train_mother:
        input_shape_obs = (window, len(meo_codes) + 1)
        input_shape_pre = (predict_hours, len(meo_codes))
        output_shape = predict_hours

        optimizer = keras.optimizers.RMSprop(lr=lr)
        loss_func = my_loss.loss_smape_rmse
        model = models.model_f(input_shape_obs,
                               input_shape_pre,
                               output_shape,
                               opt=optimizer,
                               loss=loss_func,
                               dim_rnn=dim_rnn,
                               dim_dense=dim_dense,
                               drop=drop,
                               activations=activations)
        model.summary()
        plot_model(model,
                   to_file=model_structure_file,
                   show_shapes=True,
                   show_layer_names=True)

        print('x_train_obs shape:', x_train_1.shape)
        print('x_train_pre shape:', x_train_2.shape)
        print('train samples:', x_train_1.shape[0])
        print('test samples:', x_test_1.shape[0])

        hist = model.fit([x_train_1, x_train_2],
                         y_train,
                         batch_size=batch_size,
                         epochs=epochs,
                         verbose=1,
                         validation_data=([x_test_1, x_test_2], y_test))
        model.save(os.path.join(root_model, model_name + '.h5'))

        _predicted, _y, scores = test_model(root_model, model_name, model,
                                            [x_test_1, x_test_2], [y_test],
                                            norm_y)
        test_total_scores[model_name] = utils.smape(_y, _predicted)
        print("total_score:{}".format(utils.smape(_y, _predicted)))
        compare_predict_actual(
            _y, _predicted,
            os.path.join(root_model, '{}_test.png'.format(model_name)))
        K.clear_session()
    # ------------------------------------------------------------------------------------------------------------

    # -----------------------------------------------------------------------------------------------------------------------------------
    '''
    train models for each station with the mother model
    '''
    if train_child:
        for station in aq_stations:
            s_model_name = model_name + '_' + station[0]
            key = station[0]
            print('*' * 5 + key + '*' * 5)

            train_row = round(train_rate * y_stations[key].shape[0])
            s_x_train_1 = x_stations_obs[key][:train_row, :, :]
            s_x_train_2 = x_stations_pre[key][:train_row, :, :]
            s_y_train = y_stations[key][:train_row, :]
            s_x_test_1 = x_stations_obs[key][train_row:, :, :]
            s_x_test_2 = x_stations_pre[key][train_row:, :, :]
            s_y_test = y_stations[key][train_row:, :]

            s_x_train_1 = norm_x_obs(s_x_train_1)
            s_x_train_2 = norm_x_pre(s_x_train_2)
            s_y_train = norm_y(s_y_train)
            s_x_test_1 = norm_x_obs(s_x_test_1)
            s_x_test_2 = norm_x_pre(s_x_test_2)
            s_y_test = norm_y(s_y_test)

            input_shape_obs = (window, len(meo_codes) + 1)
            input_shape_pre = (predict_hours, len(meo_codes))
            output_shape = predict_hours

            model = keras.models.load_model(
                os.path.join(root_model, model_name + '.h5'),
                custom_objects={'loss_smape_rmse': my_loss.loss_smape_rmse})
            reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                          factor=0.2,
                                          patience=5,
                                          cooldown=10,
                                          min_lr=1e-8)
            early_stopping = EarlyStopping(monitor='val_loss', patience=100)
            model.summary()

            print('x_train_obs shape:', s_x_train_1.shape)
            print('x_train_pre shape:', s_x_train_2.shape)
            print('train samples:', s_x_train_1.shape[0])
            print('test samples:', s_x_test_1.shape[0])

            hist = model.fit([s_x_train_1, s_x_train_2],
                             s_y_train,
                             batch_size=batch_size,
                             epochs=epochs,
                             verbose=1,
                             shuffle=True,
                             validation_data=([s_x_test_1, s_x_test_2], s_y_test),
                             callbacks=[reduce_lr, early_stopping])
            model.save(os.path.join(root_model, s_model_name + '.h5'))

            _predicted, _y, scores = test_model(root_model, s_model_name, model,
                                                [s_x_test_1, s_x_test_2],
                                                [s_y_test], norm_y)
            test_total_scores[s_model_name] = utils.smape(_y, _predicted)
            print("total_score:{}".format(utils.smape(_y, _predicted)))
            compare_predict_actual(
                _y, _predicted,
                os.path.join(root_model, '{}_test.png'.format(s_model_name)))
            K.clear_session()
    # ---------------------------------------------------------------------------------------------------------------------------------------

    # ****************************************************************************#
    '''
    save scores
    '''
    with open(os.path.join(root_model, 'scores.json'), 'w') as f:
        json.dump(test_total_scores, f)
    # ***************************************************************************#

    if test:
        test_total_scores = {}
        model = keras.models.load_model(
            os.path.join(root_model, model_name + '.h5'),
            custom_objects={'loss_smape_rmse': my_loss.loss_smape_rmse})
        _predicted, _y, scores = test_model(root_model, model_name, model,
                                            [x_test_1, x_test_2], [y_test],
                                            norm_y)
        test_total_scores[model_name] = utils.smape(_y, _predicted)
        print("total_score:{}".format(utils.smape(_y, _predicted)))
        K.clear_session()

        for station in aq_stations:
            s_model_name = model_name + '_' + station[0]
            key = station[0]

            train_row = round(train_rate * y_stations[key].shape[0])
            s_x_train_1 = x_stations_obs[key][:train_row, :, :]
            s_x_train_2 = x_stations_pre[key][:train_row, :, :]
            s_y_train = y_stations[key][:train_row, :]
            s_x_test_1 = x_stations_obs[key][train_row:, :, :]
            s_x_test_2 = x_stations_pre[key][train_row:, :, :]
            s_y_test = y_stations[key][train_row:, :]

            s_x_train_1 = norm_x_obs(s_x_train_1)
            s_x_train_2 = norm_x_pre(s_x_train_2)
            s_y_train = norm_y(s_y_train)
            s_x_test_1 = norm_x_obs(s_x_test_1)
            s_x_test_2 = norm_x_pre(s_x_test_2)
            s_y_test = norm_y(s_y_test)

            model = keras.models.load_model(
                os.path.join(root_model, s_model_name + '.h5'),
                custom_objects={'loss_smape_rmse': my_loss.loss_smape_rmse})
            _predicted, _y, scores = test_model(root_model, s_model_name, model,
                                                [s_x_test_1, s_x_test_2],
                                                [s_y_test], norm_y)
            test_total_scores[s_model_name] = utils.smape(_y, _predicted)
            print("total_score:{}".format(utils.smape(_y, _predicted)))
            K.clear_session()

        with open(os.path.join(root_model, 'ztest_scores.json'), 'w') as f:
            json.dump(test_total_scores, f)
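# train_model above relies on my_loss.loss_smape_rmse, a custom Keras loss
# defined elsewhere. Judging only by its name, it combines a SMAPE-style term
# with an RMSE term; the sketch below is a hypothetical illustration of such a
# loss, not the project's actual definition.
from keras import backend as K

def loss_smape_rmse_sketch(y_true, y_pred):
    # SMAPE-style term on the (normalized) network outputs
    smape_term = K.mean(2.0 * K.abs(y_pred - y_true) /
                        (K.abs(y_true) + K.abs(y_pred) + K.epsilon()))
    # RMSE term
    rmse_term = K.sqrt(K.mean(K.square(y_pred - y_true)))
    return smape_term + rmse_term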
def test_standard(self, predict_file, predict_time):
    '''
    Calculate scores based on the official evaluation.
    '''
    root = os.path.join(os.path.split(predict_file)[0], 'test')
    predict_file_name = os.path.splitext(os.path.split(predict_file)[1])[0]
    forecast_hours = self.request_paras['forecast']

    # live data
    bien = self.request_paras['bien']
    request_real_data_aq(bien['cities'], forecast_hours, root, predict_time)
    utils.clip_real_data_aq(root,
                            self.request_paras['stations'],
                            now=predict_time,
                            backward_hours=forecast_hours,
                            interpolate=False)

    p_aq_codes = {
        'PM2.5': 'PM25_Concentration',
        'PM10': 'PM10_Concentration',
        'O3': 'O3_Concentration'
    }

    # observed air quality per station and pollutant
    obs = {}
    for station_id in self.station_infos:
        station_info = self.station_infos[station_id]
        aq_file = os.path.join(root, 'bien', 'aq', station_id + '.csv')
        pd_aq = pd.read_csv(aq_file)
        obs_station = {}
        for p in station_info['pollutions']:
            aq = pd_aq[p_aq_codes[p]]
            obs_station[p] = aq
        obs[station_id] = obs_station

    # model predictions per station and pollutant
    pd_predict = pd.read_csv(predict_file,
                             usecols=['test_id', 'PM2.5', 'PM10', 'O3'],
                             index_col=['test_id'])
    pre = {}
    for station_id in self.station_infos:
        station_info = self.station_infos[station_id]
        station_ids = []
        for i in range(forecast_hours):
            station_ids.append('{}#{}'.format(station_id, i))
        pre_station = {}
        for p in station_info['pollutions']:
            # .loc replaces the deprecated .ix for label-based selection
            pre_station[p] = pd_predict[p].loc[station_ids]
        pre[station_id] = pre_station

    # scores
    score = {}
    for station_id in self.station_infos:
        station_info = self.station_infos[station_id]
        obss = []
        for p in station_info['pollutions']:
            obss.append(obs[station_id][p].values)
        obss = np.array(obss)
        for i in range(obss.shape[1]):
            if np.isnan(obss[:, i]).any() or (obss[:, i] < 0).any():
                obss[:, i] = np.nan
        i = 0
        for p in station_info['pollutions']:
            obs[station_id][p] = obss[i]
            i += 1

        score_station = {}
        for p in station_info['pollutions']:
            predicts = pre[station_id][p].values
            observes = obs[station_id][p]
            score_station[p] = utils.smape(observes, predicts)
            dic = {'obs': observes, 'pre': predicts}
            pd_data = pd.DataFrame(dic)
            pd_data.to_csv(
                os.path.join(
                    root, predict_file_name +
                    '{}_{}_scores.csv'.format(station_id, p)))
        score[station_id] = score_station

    with open(os.path.join(root, predict_file_name + '_scores.csv'), 'w') as f:
        for station_id in self.station_infos:
            station_info = self.station_infos[station_id]
            f.write(station_id)
            f.write('\n')
            for p in station_info['pollutions']:
                f.write(p + ',')
                f.write(str(score[station_id][p]) + ',')
            f.write('\n')

    with open(os.path.join(root, predict_file_name + '_scorelist.csv'), 'w') as f:
        f.write('station_id,PM2.5,PM10,O3\n')
        for station_id in self.station_infos:
            station_info = self.station_infos[station_id]
            f.write(station_id)
            f.write(',')
            line = ''
            for p in station_info['pollutions']:
                line += (str(score[station_id][p]) + ',')
            line = line[:-1]
            f.write(line)
            f.write('\n')

    return score
def arima_forecast(series,
                   validation_series,
                   horizon,
                   order,
                   seasonal_order,
                   del_outliers=False,
                   normalize=False,
                   plot=False):
    """
    Creates an arima model with the provided order and seasonal order and assesses the performance of the model on a
    validation series.

    :param series: training time series
    :param validation_series: validation time series of length horizon
    :param horizon: number of periods to forecast
    :param order: (p, d, q) order of the arima model
    :param seasonal_order: seasonal order of the arima model
    :param del_outliers: whether to remove outliers from the training series
    :param normalize: whether to normalize the training series
    :param plot: whether to plot the forecast against the validation series
    :return: SMAPE for the validation series, the forecast validation series
    """

    # whether to remove outliers in the training series
    if del_outliers:
        working_series = remove_outliers(series)
    else:
        working_series = series

    # whether to normalize the training series
    if normalize:
        scaler, working_series = normalize_series(working_series)
    else:
        scaler = None

    # input sequence is our data
    train_series = working_series

    # create the model with the provided orders and fit it
    model = arima.ARIMA(order=order,
                        seasonal_order=seasonal_order,
                        suppress_warnings=True)
    model.fit(train_series)

    # perform predictions
    f_autoarima = model.predict(n_periods=horizon)

    # dataframe which contains the result
    forecast_dataframe = pd.DataFrame(index=validation_series.index)

    # if data was normalized, we need to apply the reverse transform
    if normalize:
        # no log transform was applied here, so the forecast only needs denormalizing
        validation_forecast = f_autoarima

        # use scaler to reverse normalizing
        denormalized_forecast = scaler.inverse_transform(validation_forecast.reshape(-1, 1))
        denormalized_forecast = [val[0] for val in denormalized_forecast]

        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = denormalized_forecast
    else:
        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = f_autoarima

    if plot:
        plt.figure(figsize=(10, 6))
        plt.plot(series[-100:], color="blue", linestyle="-")
        plt.plot(validation_series, color="green", linestyle="-")
        plt.plot(forecast_dataframe, color="red", linestyle="--")
        plt.legend(["Train series", "Validation series", "Predicted series"])
        plt.title("Validation of arima model with order " + str(order) + " seasonal order " + str(seasonal_order))
        plt.show()

    return smape(validation_series, forecast_dataframe['forecast']), forecast_dataframe['forecast']
def auto_arima_forecast(series,
                        validation_series,
                        horizon,
                        del_outliers=False,
                        normalize=False,
                        plot=False):
    """
    Fits an auto arima model on the series to find the best parameters. Performance of the trained model is assessed
    on a validation series.

    :param series: training time series
    :param validation_series: validation time series of length horizon
    :param horizon: number of periods to forecast
    :param del_outliers: whether to remove outliers from the training series
    :param normalize: whether to normalize the training series
    :param plot: whether to plot the forecast against the validation series
    :return: SMAPE for the validation series, the forecast validation series, order, seasonal_order
    """

    # whether to remove outliers in the training series
    if del_outliers:
        working_series = remove_outliers(series)
    else:
        working_series = series

    # whether to normalize the training series
    if normalize:
        scaler, working_series = normalize_series(working_series)
    else:
        scaler = None

    # input sequence is our data
    train_series = working_series

    # perform search for best parameters and fit
    model = auto_arima(train_series,
                       seasonal=True,
                       max_D=2,
                       m=7,
                       trace=True,
                       error_action='ignore',
                       suppress_warnings=True,
                       stepwise=True)
    order = model.get_params()['order']
    seasonal_order = model.get_params()['seasonal_order']

    # apparently useless, auto_arima already fits the model
    model.fit(train_series)

    # perform predictions
    f_autoarima = model.predict(n_periods=horizon)

    # dataframe which contains the result
    forecast_dataframe = pd.DataFrame(index=validation_series.index)

    # if data was normalized, we need to apply the reverse transform
    if normalize:
        # no log transform was applied here, so the forecast only needs denormalizing
        validation_forecast = f_autoarima

        # use scaler to reverse normalizing
        denormalized_forecast = scaler.inverse_transform(validation_forecast.reshape(-1, 1))
        denormalized_forecast = [val[0] for val in denormalized_forecast]

        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = denormalized_forecast
    else:
        # save the forecast in the dataframe
        forecast_dataframe['forecast'] = f_autoarima

    if plot:
        plt.figure(figsize=(10, 6))
        plt.plot(series[-100:], color="blue", linestyle="-")
        plt.plot(validation_series, color="green", linestyle="-")
        plt.plot(forecast_dataframe, color="red", linestyle="--")
        plt.legend(["Train series", "Validation series", "Predicted series"])
        plt.title("Validation of auto arima model")
        plt.show()

    return smape(validation_series, forecast_dataframe['forecast']), forecast_dataframe['forecast'], order, seasonal_order
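# A hedged usage sketch for the two arima helpers above: hold out the last
# `horizon` days of a series as validation, search for a model with
# auto_arima_forecast, then reuse the resulting orders with arima_forecast.
# The synthetic daily_series below is purely illustrative, not project data.
import numpy as np
import pandas as pd

if __name__ == '__main__':
    horizon = 14
    index = pd.date_range('2019-01-01', periods=200, freq='D')
    # weekly seasonal pattern plus noise, just to have something to fit
    daily_series = pd.Series(100 + 10 * np.sin(np.arange(200) * 2 * np.pi / 7) +
                             np.random.normal(0, 2, 200), index=index)
    train_part = daily_series[:-horizon]
    validation_part = daily_series[-horizon:]

    auto_smape, auto_forecast, order, seasonal_order = auto_arima_forecast(
        train_part, validation_part, horizon)
    fixed_smape, fixed_forecast = arima_forecast(
        train_part, validation_part, horizon, order, seasonal_order)
    print('auto arima SMAPE: {:.2f}, fixed order SMAPE: {:.2f}'.format(auto_smape, fixed_smape))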