def var_predict(df, n_forwards=(1, 3), n_lags=4, test_ratio=0.2):
    """Forecast a multivariate series with a VAR model and export results to CSV.

    :param df: pandas.DataFrame, index: time, columns: sensor id.
    :param n_forwards: tuple of forecasting horizons.
    :param n_lags: order of the VAR model.
    :param test_ratio: fraction of samples held out as the test split.
    :return: ([prediction DataFrame per horizon], df_test). Also writes
        ./df_predict.csv and ./df_test.csv as a side effect.
    """
    n_sample, n_output = df.shape
    n_test = int(round(n_sample * test_ratio))
    n_train = n_sample - n_test
    df_train, df_test = df[:n_train], df[n_train:]

    # Normalize with training-set statistics only, then fit the VAR model.
    scaler = StandardScaler(mean=df_train.values.mean(), std=df_train.values.std())
    var_result = VAR(scaler.transform(df_train.values)).fit(n_lags)
    max_n_forwards = np.max(n_forwards)

    # Roll the fitted model across the test period, filling one row per
    # (horizon, test step).
    result = np.zeros(shape=(len(n_forwards), n_test, n_output))
    start = n_train - n_lags - max_n_forwards + 1
    for t in range(start, n_sample - n_lags):
        window = scaler.transform(df.values[t:t + n_lags])
        step_preds = var_result.forecast(window, max_n_forwards)
        for i, n_forward in enumerate(n_forwards):
            row = t - n_train + n_lags + n_forward - 1
            if 0 <= row < n_test:
                result[i, row, :] = step_preds[n_forward - 1, :]

    df_predicts = []
    for i, n_forward in enumerate(n_forwards):
        df_predict = pd.DataFrame(scaler.inverse_transform(result[i]),
                                  index=df_test.index,
                                  columns=df_test.columns)
        df_predicts.append(df_predict)
    # NOTE(review): only the last horizon's frame is written here, and
    # index=False drops the timestamps — confirm this is intended.
    df_predict.to_csv("./df_predict.csv", sep=',', index=False)
    df_test.to_csv("./df_test.csv", sep=',', index=False)
    return df_predicts, df_test
def var_predict(df, n_forwards=(1, 3), n_lags=4, test_ratio=0.2):
    """Multivariate time series forecasting using a Vector Auto-Regressive model.

    :param df: pandas.DataFrame, index: time, columns: sensor id, content: data.
    :param n_forwards: a tuple of horizons.
    :param n_lags: the order of the VAR model.
    :param test_ratio: fraction of samples held out as the test split.
    :return: [list of prediction in different horizon], df_test
    """
    n_sample, n_output = df.shape
    n_test = int(round(n_sample * test_ratio))
    n_train = n_sample - n_test
    df_train, df_test = df[:n_train], df[n_train:]

    # Fit VAR on normalized training data; statistics come from the train
    # split only so the test period does not leak into the scaler.
    scaler = StandardScaler(mean=df_train.values.mean(), std=df_train.values.std())
    fitted = VAR(scaler.transform(df_train.values)).fit(n_lags)
    max_n_forwards = np.max(n_forwards)

    # One (n_test, n_output) slab of forecasts per requested horizon.
    result = np.zeros(shape=(len(n_forwards), n_test, n_output))
    start = n_train - n_lags - max_n_forwards + 1
    for t in range(start, n_sample - n_lags):
        window = scaler.transform(df.values[t:t + n_lags])
        step_preds = fitted.forecast(window, max_n_forwards)
        for i, n_forward in enumerate(n_forwards):
            row = t - n_train + n_lags + n_forward - 1
            if 0 <= row < n_test:
                result[i, row, :] = step_preds[n_forward - 1, :]

    # De-normalize each horizon's slab back into a DataFrame aligned with
    # the test split's index/columns.
    df_predicts = []
    for i, n_forward in enumerate(n_forwards):
        df_predicts.append(pd.DataFrame(scaler.inverse_transform(result[i]),
                                        index=df_test.index,
                                        columns=df_test.columns))
    return df_predicts, df_test
def evaluate(self, sess, **kwargs):
    """Run the test model over the first 32 clusters and collect predictions.

    For each cluster: builds the random-walk adjacency, streams its test
    TFRecords through the test model, inverse-transforms the predictions at
    `self.horizon`, and keeps the first `node_count` columns. Also logs
    loss/test_loss summaries per cluster.

    :param sess: active tf.Session.
    :return: np.ndarray of predictions, clusters concatenated along axis 1.
    """
    # NOTE(review): the original computed `half_length = len(self.clusters) / 2`
    # but never used it — the loop always takes the first 32 clusters.
    # The dead local has been removed; confirm 32 (vs. half) is intended.
    sclusters = self.clusters[0:32]

    # Loop-invariant work hoisted out of the per-cluster loop: the scaler
    # file and the global step do not depend on the cluster.
    global_step = sess.run(tf.train.get_or_create_global_step())
    scaler_path = self._kwargs['data'].get('dataset_dir') + '/scaler.npy'
    scaler_data_ = np.load(scaler_path)
    mean, var = scaler_data_[0], scaler_data_[1]
    # NOTE(review): the second entry is named `var` but passed as `std` —
    # verify the layout of scaler.npy (mean, std) vs (mean, variance).
    scaler = StandardScaler(mean=mean, std=var)

    y_preds_all = []
    for cluster in sclusters:
        node_count, adj_mx = self.cluster_data(cluster)
        adj_mx = utils.calculate_random_walk_matrix(adj_mx).T
        adj_mx = self._build_sparse_matrix(adj_mx)

        # change val to test before run
        test_data_path = self._kwargs['data'].get('dataset_dir') + '/test_' + str(cluster) + '.tfrecords'
        test_dataset = tf.data.TFRecordDataset([test_data_path])
        test_dataset = test_dataset.map(self._parse_record_fn)
        test_dataset = test_dataset.make_one_shot_iterator()
        test_next_element = test_dataset.get_next()

        test_results = self.run_epoch_generator(sess,
                                                self._test_model,
                                                test_next_element,
                                                adj_mx,
                                                return_output=True,
                                                training=False)
        test_loss, y_preds = test_results['loss'], test_results['outputs']
        utils.add_simple_summary(self._writer, ['loss/test_loss'], [test_loss],
                                 global_step=global_step)

        # Keep only the requested horizon and de-normalize; clusters may be
        # padded, so trim to the cluster's real node count.
        y_preds = np.concatenate(y_preds, axis=0)
        y_preds = scaler.inverse_transform(y_preds[:, self.horizon - 1, :, 0])
        y_preds = y_preds[:, 0:node_count]
        y_preds_all.append(y_preds)

    y_preds_all = np.concatenate(y_preds_all, axis=1)
    return y_preds_all
def test_reverse_transform_df(self):
    """inverse_transform on a DataFrame rescales values and preserves the frame."""
    scaler = StandardScaler(mean=35., std=35.)
    frame = pd.DataFrame([[0., -1.], [-1, -0.5], [1., 0.]])
    restored = scaler.inverse_transform(frame)
    expected = np.array([[35., 0.], [0., 17.5], [70., 35.]])
    self.assertTrue(np.array_equal(expected, restored.values))
def test_reverse_transform(self):
    """inverse_transform on an ndarray applies x * std + mean elementwise."""
    scaler = StandardScaler(mean=35., std=35.)
    normalized = np.array([[0., -1.], [-1, -0.5], [1., 0.]])
    restored = scaler.inverse_transform(normalized)
    expected = np.array([[35., 0.], [0., 17.5], [70., 35.]])
    self.assertTrue(np.array_equal(expected, restored))
def main(args):
    """Train a DCRNN model from a YAML config, then evaluate on the test split.

    :param args: parsed CLI arguments providing `config_filename` and
        `use_cpu_only`.
    Side effects: trains the model under <dataset_dir>/model/ and writes one
    prediction HDF5 per horizon to <dataset_dir>/results/.
    """
    with open(args.config_filename) as f:
        # NOTE(review): yaml.load without an explicit Loader is deprecated
        # (PyYAML >= 5.1) and unsafe on untrusted input; the config is
        # local here, but prefer yaml.safe_load.
        supervisor_config = yaml.load(f)

        graph_pkl_filename = supervisor_config['data'].get('graph_pkl_filename')
        sensor_ids, sensor_id_to_ind, adj_mx = load_graph_data(graph_pkl_filename)
        supervisor_config['model']['num_nodes'] = num_nodes = len(sensor_ids)

        # Data preprocessing: split the traffic frame and build seq2seq batches.
        traffic_df_filename = supervisor_config['data']['hdf_filename']
        df_data = pd.read_hdf(traffic_df_filename)

        data_config = supervisor_config.get('data')
        model_config = supervisor_config.get('model')
        validation_ratio = data_config.get('validation_ratio')
        test_ratio = data_config.get('test_ratio')
        df_train, df_val, df_test = train_val_test_split(
            df_data, val_ratio=validation_ratio, test_ratio=test_ratio)

        batch_size = data_config.get('batch_size')
        val_batch_size = data_config.get('val_batch_size')
        test_batch_size = data_config.get('test_batch_size')
        horizon = model_config.get('horizon')
        seq_len = model_config.get('seq_len')

        # Scale with training statistics only, so the val/test periods do
        # not leak into normalization.
        scaler = StandardScaler(mean=df_train.values.mean(), std=df_train.values.std())
        data_train = generate_seq2seq_data(df_train, batch_size, seq_len,
                                           horizon, num_nodes, 'train', scaler)
        data_val = generate_seq2seq_data(df_val, val_batch_size, seq_len,
                                         horizon, num_nodes, 'val', scaler)
        data_train.update(data_val)
        data_test = generate_seq2seq_data(df_test, test_batch_size, seq_len,
                                          horizon, num_nodes, 'test', scaler)

        # Build the session config once (the original constructed a default
        # ConfigProto and then threw it away on the CPU-only path).
        if args.use_cpu_only:
            tf_config = tf.ConfigProto(device_count={'GPU': 0})
        else:
            tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True

        with tf.Session(config=tf_config) as sess:
            supervisor = DCRNNSupervisor(adj_mx, data_train, supervisor_config)
            data_tag = data_config.get('dataset_dir')
            model_dir = data_tag + '/model/'
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)

            # Train
            supervisor.train(sess=sess)

            # Test: pick the most recently written config under model/.
            yaml_files = glob.glob('%s/model/*/*.yaml' % data_tag, recursive=True)
            yaml_files.sort(key=os.path.getmtime)
            config_filename = yaml_files[-1]
            with open(config_filename) as cf:
                config = yaml.load(cf)

            # Load model and evaluate
            supervisor.load(sess, config['train']['model_filename'])
            y_preds = supervisor.evaluate(sess, data_test)

            n_test_samples = data_test['y_test'].shape[0]
            results_dir = '%s/results/' % data_tag
            if not os.path.exists(results_dir):
                os.makedirs(results_dir)
            for horizon_i in range(data_test['y_test'].shape[1]):
                y_pred = scaler.inverse_transform(y_preds[:, horizon_i, :, 0])
                # Align predictions with the matching slice of ground truth.
                eval_dfs = df_test[seq_len + horizon_i:seq_len + horizon_i + n_test_samples]
                df = pd.DataFrame(y_pred, index=eval_dfs.index, columns=eval_dfs.columns)
                filename = os.path.join(
                    results_dir,
                    'dcrnn_speed_prediction_%s.h5' % str(horizon_i + 1))
                df.to_hdf(filename, 'results')
            print('Predictions saved as %s/results/dcrnn_prediction_[1-12].h5...' % data_tag)