def test_creation(self):
    series_test = TimeSeries.from_dataframe(self.dataframe1)
    self.assertTrue(np.all(series_test.pd_dataframe().values == self.dataframe1.values))

    # A series cannot be shorter than three entries unless a frequency is passed to the constructor
    with self.assertRaises(ValueError):
        TimeSeries(self.dataframe1.iloc[:2, :])
    TimeSeries.from_dataframe(self.dataframe1.iloc[:2, :], freq="D")

def create_time_series(resampling_methods, chunk_ids, chunk_type, original_chunks, parameter,
                       window_idx, configs, mean=0, std=1):
    # Apply filler because some time series have missing measurements, which would lead to a
    # ValueError during prediction
    filler = MissingValuesFiller()

    for resampling in resampling_methods:
        series_per_resampling = dict()
        pred_scalers = dict()

        for chunk_id in chunk_ids:
            # Copy to avoid pandas' SettingWithCopyWarning when adding the scaled column
            current_chunk = original_chunks[original_chunks['CHUNK_ID_FILLED_TH'] == chunk_id].copy()

            # Scale chunk values if configured and create a filled time series
            if configs.scaling_method == 'standard':
                current_chunk[f'SCALED_{resampling}'] = apply_standard_scaling(
                    current_chunk[f'VITAL_PARAMTER_VALUE_{resampling}_RESAMPLING'], mean, std)

                series_per_resampling[chunk_id] = filler.transform(TimeSeries.from_dataframe(
                    df=current_chunk,
                    time_col='CHARTTIME',
                    value_cols=[f'SCALED_{resampling}'],
                    freq='H'))

            elif configs.scaling_method == 'min-max':
                # Darts' Scaler uses MinMaxScaler by default
                current_scaler = Scaler()

                series_per_resampling[chunk_id] = current_scaler.fit_transform(filler.transform(
                    TimeSeries.from_dataframe(
                        df=current_chunk,
                        time_col='CHARTTIME',
                        value_cols=[f'VITAL_PARAMTER_VALUE_{resampling}_RESAMPLING'],
                        freq='H')))

                if chunk_type == 'pred' and \
                        ((configs.with_exogenous_input and resampling != 'MEDIAN')
                         or not configs.with_exogenous_input):
                    pred_scalers[chunk_id] = current_scaler

            else:
                # Apply no scaling
                series_per_resampling[chunk_id] = filler.transform(TimeSeries.from_dataframe(
                    df=current_chunk,
                    time_col='CHARTTIME',
                    value_cols=[f'VITAL_PARAMTER_VALUE_{resampling}_RESAMPLING'],
                    freq='H'))

        # Save series dict
        path = get_script_path(configs)
        write_pickle_file(f'{path}/time_series/time_series_{parameter}_win{window_idx}_{chunk_type}_'
                          f'{resampling.capitalize()}.pickle', series_per_resampling)

        # Save scaler dict if it was filled
        if pred_scalers:
            write_pickle_file(f'{path}/scalers/scalers_{parameter}_win{window_idx}_'
                              f'{resampling.capitalize()}.pickle', pred_scalers)

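# Note: apply_standard_scaling is not defined in this snippet. A minimal sketch
# consistent with how it is called above (a value series plus a pre-computed
# mean and std, e.g. from the training split) would be plain z-scoring; this is
# an assumption, not the original implementation:
def apply_standard_scaling(values, mean, std):
    # Standardize to zero mean / unit variance using externally supplied statistics
    return (values - mean) / std
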
def lstm():
    for company in lstCompanies:
        df = pd.DataFrame(list(db[company].find({})))
        df = df.drop('_id', axis=1)
        df['Open'] = df['Open'].astype('float')
        df['Close'] = df['Close'].astype('float')
        series = TimeSeries.from_dataframe(
            df, 'Date', ['Close'], freq='B', fill_missing_dates=True)  # 'B' = business day
        series = auto_fillna(series)
        model = RNNModel(
            model='LSTM',            # RNN module type: "RNN", "LSTM" or "GRU"
            output_length=1,         # Number of time steps output by the forecasting module
            hidden_size=25,          # Size of the feature maps for each hidden RNN layer (h_n)
            n_rnn_layers=1,          # Number of layers in the RNN module
            input_length=12,         # Number of time steps fed to the fit function per training instance
            batch_size=16,           # Number of samples to work through before updating the model parameters
            n_epochs=200,            # Number of passes through the entire training dataset
            optimizer_kwargs={'lr': 1e-3},
            model_name='{}_RNN'.format(company))
        model.fit(series)
        lstmPred = model.predict(1).values()[0][0]
        db.prediction.insert_one({
            "Date": datetime.datetime.today(),
            "Company": company,
            "Prediction": round(float(lstmPred), 2)
        })

def test_eq(self):
    seriesA = TimeSeries.from_dataframe(self.dataframe1)
    self.assertTrue(self.series1 == seriesA)
    self.assertFalse(self.series1 != seriesA)

    # with different dates
    dataframeB = self.dataframe1.copy()
    dataframeB.index = pd.date_range("20130102", "20130111")
    seriesB = TimeSeries.from_dataframe(dataframeB)
    self.assertFalse(self.series1 == seriesB)

    # with one different value
    dataframeC = self.dataframe1.copy()
    dataframeC.iloc[2, 2] = 0
    seriesC = TimeSeries.from_dataframe(dataframeC)
    self.assertFalse(self.series1 == seriesC)

def test_kalman(self):
    """KalmanFilter test.

    Creates an increasing sequence of numbers, adds noise, and checks that the
    Kalman filter produces values closer to the true signal.
    """
    testing_signal = np.arange(1, 5, 0.1)

    noise = np.random.normal(0, 0.7, testing_signal.shape)
    testing_signal_with_noise = testing_signal + noise

    df = pd.DataFrame(data=testing_signal_with_noise, columns=["signal"])
    testing_signal_with_noise_ts = TimeSeries.from_dataframe(df, value_cols=["signal"])

    kf = KalmanFilter(dim_x=1)
    kf.fit(testing_signal_with_noise_ts)
    filtered_ts = kf.filter(testing_signal_with_noise_ts, num_samples=1)
    filtered_values = filtered_ts.univariate_values()

    noise_distance = testing_signal_with_noise - testing_signal
    prediction_distance = filtered_values - testing_signal

    self.assertGreater(noise_distance.std(), prediction_distance.std())
    self.assertEqual(filtered_ts.width, 1)
    self.assertEqual(filtered_ts.n_samples, 1)

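# Complementary sketch (hedged: assumes the same darts KalmanFilter API used in
# the test above). With num_samples > 1, filter() returns a stochastic
# TimeSeries from which quantile trajectories can be extracted:
import numpy as np
from darts import TimeSeries
from darts.models import KalmanFilter

noisy_series = TimeSeries.from_values(
    np.arange(1, 5, 0.1) + np.random.normal(0, 0.7, 40))
kf = KalmanFilter(dim_x=1)
kf.fit(noisy_series)
sampled = kf.filter(noisy_series, num_samples=100)
median_trajectory = sampled.quantile_timeseries(0.5)  # deterministic median path
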
def eval_sarima_model(serialized_model, dataset):
    sarima_model = pickle.loads(serialized_model)
    df = pd.DataFrame.from_dict(dataset)
    ts = TimeSeries.from_dataframe(df, time_col='time_interval', value_cols=['count'])
    train, val = ts.split_after(0.8)  # 80% train, 20% val

    no_retrain = sarima_model.predict(len(val))
    # backtest = sarima_model.historical_forecasts(
    #     series=ts,
    #     start=0.8,
    #     forecast_horizon=1,
    #     stride=1,
    # )

    scores = dict()
    scores['retrained'] = dict()
    scores['not_retrained'] = dict()
    # scores['retrained']['r2'] = r2_score(val, backtest[1:])
    # scores['retrained']['mase_score'] = mase(val, backtest[1:], train)
    # scores['retrained']['mae_score'] = mae(val, backtest[1:])

    logging.debug(no_retrain)
    logging.debug(val)

    scores['r2'] = r2_score(val, no_retrain)
    scores['mase_score'] = mase(val, no_retrain, train)
    scores['mae_score'] = mae(val, no_retrain)
    scores['rmse_score'] = np.sqrt(mse(val, no_retrain))
    try:
        # scores['retrained']['mape_score'] = mape(val, backtest[1:])
        scores['mape_score'] = mape(val, no_retrain)
    except Exception:
        # darts' mape raises an error when the actual series contains zeros
        # scores['retrained']['mape_score'] = "Could not be calculated (Zero value in time series)"
        scores['mape_score'] = "Could not be calculated (Zero value in time series)"

    return scores

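# Hypothetical usage sketch for eval_sarima_model (the dataset layout is
# inferred from the code above: parallel 'time_interval' and 'count' lists;
# 'fitted_sarima_model' stands for a model already fitted on the training
# split, as produced by get_sarima_model further below):
dates = pd.date_range('2021-01-01', periods=50, freq='D')
dataset = {'time_interval': [str(d) for d in dates],
           'count': list(np.random.poisson(20, size=50))}
scores = eval_sarima_model(pickle.dumps(fitted_sarima_model), dataset)
print(scores['mae_score'], scores['rmse_score'])
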
def eval_tcn_model(serialized_model, dataset):
    tcn_model = pickle.loads(serialized_model)
    df = pd.DataFrame.from_dict(dataset)
    ts = TimeSeries.from_dataframe(df, time_col='time_interval', value_cols=['count'])
    train, val = ts.split_after(0.8)  # 80% train, 20% val

    scaler = Scaler()
    ts = scaler.fit_transform(ts)
    val_transformed = scaler.transform(val)
    train_transformed = scaler.transform(train)

    backtest = tcn_model.historical_forecasts(
        series=ts,
        start=0.8,
        forecast_horizon=1,
        stride=1,
        retrain=False,
    )

    val_transformed = scaler.inverse_transform(val_transformed)
    backtest = scaler.inverse_transform(backtest)
    train_transformed = scaler.inverse_transform(train_transformed)

    scores = dict()
    scores['r2'] = r2_score(val_transformed, backtest[1:])
    scores['mase_score'] = mase(val_transformed, backtest[1:], train_transformed)
    scores['mae_score'] = mae(val_transformed, backtest[1:])
    scores['rmse_score'] = np.sqrt(mse(val_transformed, backtest[1:]))
    try:
        scores['mape_score'] = mape(val_transformed, backtest[1:])
    except Exception:
        # darts' mape raises an error when the actual series contains zeros
        scores['mape_score'] = "Could not be calculated (Zero value in time series)"

    return scores

def test_strip(self):
    dataframe1 = pd.DataFrame(
        {
            "0": 2 * [np.nan] + list(range(7)) + [np.nan],
            "1": [np.nan] + list(range(7)) + 2 * [np.nan],
        },
        index=self.times1,
    )
    series1 = TimeSeries.from_dataframe(dataframe1)

    self.assertTrue((series1.strip().time_index == self.times1[1:-1]).all())

def test_ts_from_x(self):
    ts = linear_timeseries(length=10).with_static_covariates(
        pd.Series([0.0, 1.0], index=["st1", "st2"]))

    self.helper_test_cov_transfer(ts, TimeSeries.from_xarray(ts.data_array()))
    self.helper_test_cov_transfer(
        ts,
        TimeSeries.from_dataframe(ts.pd_dataframe(), static_covariates=ts.static_covariates),
    )
    # ts.pd_series() loses component names -> static covariates have different component names
    self.helper_test_cov_transfer_values(
        ts,
        TimeSeries.from_series(ts.pd_series(), static_covariates=ts.static_covariates),
    )
    self.helper_test_cov_transfer(
        ts,
        TimeSeries.from_times_and_values(
            times=ts.time_index,
            values=ts.all_values(),
            columns=ts.components,
            static_covariates=ts.static_covariates,
        ),
    )
    self.helper_test_cov_transfer(
        ts,
        TimeSeries.from_values(
            values=ts.all_values(),
            columns=ts.components,
            static_covariates=ts.static_covariates,
        ),
    )

    f_csv = os.path.join(self.temp_work_dir, "temp_ts.csv")
    f_pkl = os.path.join(self.temp_work_dir, "temp_ts.pkl")
    ts.to_csv(f_csv)
    ts.to_pickle(f_pkl)
    ts_json = ts.to_json()

    self.helper_test_cov_transfer(
        ts,
        TimeSeries.from_csv(f_csv, time_col="time", static_covariates=ts.static_covariates),
    )
    self.helper_test_cov_transfer(ts, TimeSeries.from_pickle(f_pkl))
    self.helper_test_cov_transfer(
        ts,
        TimeSeries.from_json(ts_json, static_covariates=ts.static_covariates))

def plot_sarima_predictions(serialized_model, dataset):
    df = pd.DataFrame.from_dict(dataset)
    model = pickle.loads(serialized_model)
    ts = TimeSeries.from_dataframe(df, time_col='time_interval', value_cols=['count'])
    model.fit(series=ts)
    prediction = model.predict(7)  # Predict a week ahead
    ts.plot(label='Actual', lw=3, c='black')
    prediction.plot(label='SARIMA Prediction', lw=3, c='blue')

def get_sarima_backtest(serialized_model, dataset):
    df = pd.DataFrame.from_dict(dataset)
    ts = TimeSeries.from_dataframe(df, time_col='time_interval', value_cols=['count'])
    train, val = ts.split_after(0.8)  # 80% train, 20% val
    sarima_model = pickle.loads(serialized_model)
    sarima_model.fit(train)
    backtest = sarima_model.predict(len(val))
    ts.plot(label='Actual', lw=3, c='black')
    backtest.plot(label='SARIMA Model', lw=3, c='blue')

def setUp(self):
    self.temp_work_dir = tempfile.mkdtemp(prefix="darts")
    times = pd.date_range("20130101", "20130410")
    pd_series = pd.Series(range(100), index=times)
    self.series = TimeSeries.from_series(pd_series)
    df = pd.DataFrame({"var1": range(100), "var2": range(100)}, index=times)
    self.multivariate_series = TimeSeries.from_dataframe(df)

def convert_list_to_TimeSeries(selected_list):
    import pandas as pd
    from darts import TimeSeries

    selected_list_as_df = selected_list.to_frame()
    selected_list_as_df.reset_index(level=0, inplace=True)
    start = 'Jan 1, 1970 00:00'
    # Interpret the positional index as hours counted from the Unix epoch
    selected_list_as_df['timestamp'] = pd.to_datetime(
        selected_list_as_df.index, origin=start, unit='h')
    selected_list_as_TS = TimeSeries.from_dataframe(
        selected_list_as_df, 'timestamp', selected_list.name, freq='H')
    return selected_list_as_TS

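# Hypothetical usage of convert_list_to_TimeSeries: a named pandas Series with
# a positional (integer) index is mapped onto hourly timestamps counted from
# the Unix epoch (an assumption based on the origin/unit arguments above).
import pandas as pd

hourly_values = pd.Series([1.0, 2.0, 3.0, 4.0], name='load')
ts = convert_list_to_TimeSeries(hourly_values)
print(ts.start_time())  # 1970-01-01 00:00:00 under the assumptions above
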
def plot_tcn_predictions(serialized_model, dataset):
    df = pd.DataFrame.from_dict(dataset)
    model = pickle.loads(serialized_model)
    ts = TimeSeries.from_dataframe(df, time_col='time_interval', value_cols=['count'])
    scaler = Scaler()
    ts = scaler.fit_transform(ts)
    model.fit(series=ts)
    prediction = scaler.inverse_transform(model.predict(7))  # Predict a week ahead
    prediction.plot(label='TCN Prediction', lw=3, c='red')

def test_prophet_model_with_stdout_suppression(self):
    model = Prophet(suppress_stdout_stderror=True)
    model._execute_and_suppress_output = Mock(return_value=True)
    model._model_builder = Mock(return_value=Mock(fit=Mock(return_value=True)))

    df = pd.DataFrame(
        {
            "ds": pd.date_range(start="2022-01-01", periods=30, freq="D"),
            "y": np.linspace(0, 10, 30),
        }
    )
    ts = TimeSeries.from_dataframe(df, time_col="ds", value_cols="y")

    model.fit(ts)
    # Suppression should be invoked exactly once during fit
    model._execute_and_suppress_output.assert_called_once()

def _load_from_disk(
    self, path_to_file: Path, metadata: DatasetLoaderMetadata
) -> Union[TimeSeries, List[TimeSeries]]:
    df = pd.read_csv(path_to_file)
    if metadata.header_time is not None:
        df = self._format_time_column(df)
        series = TimeSeries.from_dataframe(
            df=df, time_col=metadata.header_time, freq=metadata.freq)
        if (self._metadata.multivariate is not None
                and self._metadata.multivariate is False):
            try:
                series = self._to_multi_series(series.pd_dataframe())
            except Exception as e:
                raise DatasetLoadingException(
                    "Could not convert to multi-series. Reason:" + e.__repr__()) from None
    else:
        df.sort_index(inplace=True)
        series = TimeSeries.from_dataframe(df)
    return series

def get_sarima_predictions(model, dataset):
    logging.debug(dataset)
    df = pd.DataFrame.from_dict(dataset)
    ts = TimeSeries.from_dataframe(df, time_col='time_interval', value_cols=['count'])
    model.fit(series=ts)
    prediction = model.predict(7)  # Predict a week ahead
    prediction_json = json.loads(prediction.to_json())
    dates = prediction_json['index']
    counts = prediction_json['data']
    prediction_dataset = to_dataset(dates, counts)
    logging.debug(prediction_dataset)
    return prediction_dataset

def predict_series(df_delito_state, delito):
    df_pred = df_delito_state.copy()
    df_pred = pd.DataFrame(df_pred)
    df_pred['Year'] = pd.date_range('2015-01', '2021-01', freq='M')
    series = TimeSeries.from_dataframe(df_pred, 'Year', delito)
    # train, val = series.split_before(pd.Timestamp('20200201'))
    train, val = series.split_before(pd.Timestamp('20191230'))
    model = Prophet()
    # model = ExponentialSmoothing()
    model.fit(train)
    prediction = model.predict(len(val))
    prediction = prediction.pd_dataframe()
    # Clip negative forecasts to zero
    prediction[prediction < 0] = 0
    return prediction

def denoising_input(self):
    np.random.seed(self.RANDOM_SEED)
    ts_periodic = tg.sine_timeseries(length=500)
    ts_gaussian = tg.gaussian_timeseries(length=500)
    ts_random_walk = tg.random_walk_timeseries(length=500)

    ts_cov1 = ts_periodic.stack(ts_gaussian)
    ts_cov1 = ts_cov1.pd_dataframe()
    ts_cov1.columns = ["Periodic", "Gaussian"]
    ts_cov1 = TimeSeries.from_dataframe(ts_cov1)
    ts_sum1 = ts_periodic + ts_gaussian

    ts_cov2 = ts_sum1.stack(ts_random_walk)
    ts_sum2 = ts_sum1 + ts_random_walk

    return ts_sum1, ts_cov1, ts_sum2, ts_cov2

def get_tcn_predictions(model, dataset):
    df = pd.DataFrame.from_dict(dataset)
    ts = TimeSeries.from_dataframe(df, time_col='time_interval', value_cols=['count'])
    scaler = Scaler()
    ts = scaler.fit_transform(ts)
    model.fit(series=ts)
    prediction = scaler.inverse_transform(model.predict(7))  # Predict a week ahead
    prediction_json = json.loads(prediction.to_json())
    dates = prediction_json['index']
    counts = prediction_json['data']
    prediction_dataset = to_dataset(dates, counts)
    logging.debug(prediction_dataset)
    return prediction_dataset

def test_rescale(self):
    with self.assertRaises(ValueError):
        self.series1.rescale_with_value(1)

    seriesA = self.series2.rescale_with_value(0)
    self.assertTrue(np.all(seriesA.values() == 0))

    seriesB = self.series2.rescale_with_value(1)
    self.assertEqual(
        seriesB,
        TimeSeries.from_dataframe(
            pd.DataFrame(
                {
                    "0": np.arange(1, 11),
                    "1": np.arange(1, 11),
                    "2": np.arange(1, 11),
                },
                index=self.dataframe2.index,
            ).astype(float)),
    )

def get_lstm_backtest(serialized_model, dataset):
    df = pd.DataFrame.from_dict(dataset)
    ts = TimeSeries.from_dataframe(df, time_col='time_interval', value_cols=['count'])
    scaler = Scaler()
    ts = scaler.fit_transform(ts)
    model = pickle.loads(serialized_model)
    backtest = model.historical_forecasts(series=ts,
                                          start=0.8,
                                          forecast_horizon=1,
                                          stride=1,
                                          retrain=False,
                                          verbose=False)
    backtest = scaler.inverse_transform(backtest[1:])
    ts = scaler.inverse_transform(ts)
    backtest.plot(label='LSTM Model', lw=3, c='orange')

def make_lstm_prediction():
    for ticker in lst_tickers_of_interest:
        df_ticker = pd.DataFrame(
            list(col_price_history.find({'Ticker': ticker})))[["DailyChangePct", "Date"]].set_index('Date')
        df_ticker.index = pd.to_datetime(df_ticker.index)
        df_ticker = df_ticker.reindex(index=df_ticker.index[::-1])

        series = TimeSeries.from_dataframe(df_ticker,
                                           time_col=None,
                                           value_cols='DailyChangePct',
                                           freq='B',
                                           fill_missing_dates=True)
        series = auto_fillna(series)

        SEQ_LENGTH = 6
        HIDDEN_SIZE = 5
        OUTPUT_LEN = 1
        NUM_LAYERS = 1

        model = RNNModel(model='LSTM',
                         output_length=OUTPUT_LEN,
                         hidden_size=HIDDEN_SIZE,
                         n_rnn_layers=NUM_LAYERS,
                         input_length=SEQ_LENGTH,
                         batch_size=16,
                         n_epochs=10,
                         optimizer_kwargs={'lr': 1e-3},
                         model_name=f'{ticker}_RNN',
                         log_tensorboard=False)
        model.fit(series)
        lstm_prediction = model.predict(1).values()[0][0]

        lstm_prediction_history.insert_one({
            "Date": datetime.datetime.today(),
            "Ticker": ticker,
            "LSTM_prediction": float(lstm_prediction)
        })

def _to_multi_series(self, series: pd.DataFrame) -> List[TimeSeries]:
    """
    Load the Uber TLC dataset as a list of univariate timeseries, one for each locationID.
    """
    ts_list = []  # list of timeseries
    for label in series:
        srs = series[label]

        # filter column down to the period of recording
        start_date = min(srs.fillna(method="ffill").dropna().index)
        end_date = max(srs.fillna(method="bfill").dropna().index)
        active_range = (srs.index >= start_date) & (srs.index <= end_date)
        srs = srs[active_range]

        # convert to timeseries
        tmp = pd.DataFrame({"locationID": srs})
        tmp["date"] = tmp.index
        ts = TimeSeries.from_dataframe(tmp, "date", ["locationID"])
        ts_list.append(ts)
    return ts_list

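# Hypothetical illustration of the wide-format input _to_multi_series expects:
# a DatetimeIndex with one column per locationID and NaN outside each
# location's recording period ('loader' stands for the dataset-loader
# instance; column names are illustrative):
import numpy as np
import pandas as pd

idx = pd.date_range('2015-01-01', periods=6, freq='H')
wide = pd.DataFrame(
    {'B02512': [np.nan, 3.0, 5.0, 4.0, np.nan, np.nan],
     'B02598': [1.0, 2.0, 2.0, 3.0, 4.0, 5.0]},
    index=idx)
series_list = loader._to_multi_series(wide)  # one univariate TimeSeries per column
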
def _to_multi_series(self, series: pd.DataFrame) -> List[TimeSeries]:
    """
    Load the electricity dataset as a list of univariate series, one for each household.
    """
    ts_list = []  # list of timeseries
    for label in series:
        srs = series[label]

        # filter column down to the period of recording
        srs = srs.replace(0.0, np.nan)
        start_date = min(srs.fillna(method="ffill").dropna().index)
        end_date = max(srs.fillna(method="bfill").dropna().index)
        active_range = (srs.index >= start_date) & (srs.index <= end_date)
        srs = srs[active_range].fillna(0.0)

        # convert to timeseries
        tmp = pd.DataFrame({"power_usage": srs})
        tmp["date"] = tmp.index
        ts = TimeSeries.from_dataframe(tmp, "date", ["power_usage"])
        ts_list.append(ts)
    return ts_list

def get_tcn_backtest(serialized_model, dataset, topic):
    df = pd.DataFrame.from_dict(dataset)
    ts = TimeSeries.from_dataframe(df, time_col='time_interval', value_cols=['count'])
    scaler = Scaler()
    ts = scaler.fit_transform(ts)
    model = pickle.loads(serialized_model)
    backtest = model.historical_forecasts(series=ts,
                                          start=0.8,
                                          forecast_horizon=1,
                                          stride=1,
                                          retrain=False,
                                          verbose=False)
    backtest = scaler.inverse_transform(backtest[1:])
    ts = scaler.inverse_transform(ts)
    backtest.plot(label='TCN Model', lw=3, c='red')
    plt.title("{} Daily".format(topic))
    plt.xlabel("Date")
    plt.ylabel("Count")

def get_sarima_model(dataset=None, plot=False, verbose=False):
    if dataset is None:
        df = pd.read_csv("jeans_day.csv")
    else:
        df = pd.DataFrame.from_dict(dataset)
    ts = TimeSeries.from_dataframe(df, time_col='time_interval', value_cols=['count'])
    train, val = ts.split_after(0.8)  # 80% train, 20% val

    params = dict()
    params['m'] = [7]  # Weekly seasonality

    sarima = AutoARIMA.gridsearch(parameters=params,
                                  series=train,
                                  val_series=val,
                                  verbose=verbose,
                                  metric=mse)

    logging.debug("CHOSEN PARAMETERS:")
    params = sarima[1]
    sarima_model = sarima[0]
    sarima_model.fit(series=train)
    print(params)

    if plot:
        backtest = sarima_model.predict(len(val))
        print(val)
        print(backtest)
        print("R2: {}".format(r2_score(val, backtest, intersect=False)))
        print("MAPE: {}".format(mape(val, backtest)))
        print("MASE: {}".format(mase(val, backtest, train)))
        print("MAE: {}".format(mae(val, backtest)))
        backtest.plot(label='backtest')
        ts.plot(label='actual')
        plt.legend()
        plt.show()
    else:
        return [sarima_model, params]

# Extract relevant chunks
relevant_series = dict()

# Collect all series with minimal length
for chunk_id in pd.unique(resampled.CHUNK_ID_FILLED_TH):
    current_series = resampled[resampled['CHUNK_ID_FILLED_TH'] == chunk_id]

    # At least input_chunk_length + output_chunk_length = 12 + 1 = 13 data points are required
    if len(current_series) > 12:
        relevant_series[chunk_id] = filler.transform(
            TimeSeries.from_dataframe(
                df=current_series,
                time_col='CHARTTIME',
                value_cols=[f'VITAL_PARAMTER_VALUE_{endogenous_input.upper()}_RESAMPLING'],
                freq='H'))

# Extract all relevant chunk IDs
relevant_chunk_ids = list(relevant_series.keys())

# Calculate the number of chunks corresponding to 20% of the chunks
twenty_percent = int((20 * len(relevant_chunk_ids)) / 100)

# Iterate five times over a different 20% of the chunks (= 5 windows) to predict all chunks
for window_idx in range(n_windows):
    print(f'{window_idx}. window\n', file=sys.stderr)

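    # The loop body is not shown in this excerpt; a hypothetical sketch of how
    # each window could select its disjoint 20% slice of chunk IDs for
    # prediction while training on the remaining 80%:
    pred_ids = relevant_chunk_ids[window_idx * twenty_percent:(window_idx + 1) * twenty_percent]
    train_ids = [c for c in relevant_chunk_ids if c not in pred_ids]
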
#######################################################
df = pd.read_csv("data/input_data.csv")

# Rename columns
header = df.columns
df.rename(columns={header[0]: "time",
                   header[1]: "wind actual",
                   header[2]: "price forecast",
                   header[3]: "price actual"},
          inplace=True)
# Note: the original reset_index()/set_index("time") calls were no-ops (their
# results were never assigned) and have been dropped; "time" must stay a
# regular column for the from_dataframe call below.
df["time"] = pd.to_datetime(df["time"], utc=True)

# Transform DataFrame to Time Series Object
df_series = TimeSeries.from_dataframe(df[["time", "price actual"]],
                                      time_col='time',
                                      value_cols="price actual")

### Train and Test Model #######################################################

# Train Test Split
train, val = df_series.split_before(pd.Timestamp("2021-03-01 00:00:00+00:00"))

# Normalize the time series (note: we avoid fitting the transformer on the validation set)
transformer = Scaler()
train_transformed = transformer.fit_transform(train)
val_transformed = transformer.transform(val)
series_transformed = transformer.transform(df_series)

# Define the LSTM Model parameters
my_model = RNNModel(
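    # (The original snippet is truncated at this point; the arguments below are
    # a hypothetical completion based on darts' documented RNNModel parameters,
    # with purely illustrative values.)
    model='LSTM',
    input_chunk_length=24,
    training_length=36,
    hidden_dim=20,
    n_rnn_layers=1,
    dropout=0.2,
    batch_size=16,
    n_epochs=100,
    optimizer_kwargs={'lr': 1e-3},
    model_name='price_LSTM',
)
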
# Import libraries
import pandas as pd
import numpy as np
from darts import TimeSeries
import plotly.offline as py
import io
import matplotlib.pyplot as plt

plt.style.use("fivethirtyeight")  # for pretty graphs

# Upload training data
from google.colab import files
uploaded = files.upload()
data = pd.read_csv(io.BytesIO(uploaded['AirPassengers.csv']))

series = TimeSeries.from_dataframe(data, 'Month', '#Passengers')
train, val = series.split_after(pd.Timestamp('19590101'))

"""ExponentialSmoothing implementation"""
from darts.models import ExponentialSmoothing

model = ExponentialSmoothing()
model.fit(train)
prediction_exponential = model.predict(len(val))

series.plot(label='actual', lw=3)
prediction_exponential.plot(label='forecast', lw=3)
plt.legend()
plt.xlabel('Year')
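
# Optional follow-up (a small sketch using darts' metrics module): quantify the
# forecast error on the held-out split alongside the plot above.
from darts.metrics import mape

print('MAPE: {:.2f}%'.format(mape(val, prediction_exponential)))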