def create_submission(model_dir):
    """Creates a submission csv for the given model, skipping the work if
    the file already exists."""
    submission_path = os.path.join(
        DATASET_PATH, 'submissions', 'seq2seq',
        '%s.csv' % os.path.basename(model_dir))
    if os.path.exists(submission_path):
        return
    meta_model = _load_meta_model(model_dir)
    _, test, submission, metadata = load_data()

    test_preds = {}
    submission_series_id = submission[
        submission.prediction_window == 'hourly'].series_id.unique()
    for series_id in submission_series_id:
        window = submission[
            submission.series_id == series_id].prediction_window.values[0]
        df = test[test.series_id == series_id].copy()
        df.reset_index(inplace=True)
        if len(df) > 7*24:
            # Keep only the most recent week of hourly measurements
            df = df.loc[len(df) - 7*24:]
        x, mean_value, next_day = prepare_x(window, df, metadata, series_id)
        # Predictions are normalized, so rescale them with the series mean
        pred = meta_model.predict(x, window, next_day)*mean_value
        test_preds[series_id] = pred[0, :, 0]

    base_submission = submission
    new_consumption = base_submission.consumption.values.copy()
    submission_series_id = base_submission.series_id.values
    for series_id in test_preds:
        new_consumption[submission_series_id == series_id] = \
            test_preds[series_id]
    new_submission = base_submission.copy()
    new_submission['consumption'] = new_consumption
    new_submission.to_csv(submission_path, index=False)
    print(os.path.basename(submission_path))
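# Usage sketch: the directory layout below is an assumption for illustration;
# create_submission derives the csv name from the model directory basename and
# returns early if that submission already exists.
import glob

for model_dir in sorted(glob.glob('/path/to/models/seq2seq_*')):
    create_submission(model_dir)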
def load_and_arrange_data(conf, verbose=False):
    """Prepares train and validation arrays for the requested
    cross-validation fold."""
    train, test, _, metadata = load_data()
    if 'random_seed' in conf:
        train_ids, val_ids = stratified_cv_series_id(
            train.series_id.unique(), fold_idx=conf['fold_idx'],
            random_seed=conf['random_seed'])
    else:
        train_ids, val_ids = split_series_id(train.series_id.unique(),
                                             fold_idx=conf['fold_idx'])
    val = train[train.series_id.isin(val_ids)]
    val.reset_index(inplace=True, drop=True)
    train = train[train.series_id.isin(train_ids)]
    train.reset_index(inplace=True, drop=True)
    # The test series are appended to the training set; only the validation
    # split is held out
    train = pd.concat([train, test])
    train.reset_index(inplace=True, drop=True)

    train_x, train_y = prepare_data_for_train(
        train, metadata, conf['input_days'], conf['window'],
        conf['only_working_day'], verbose=verbose)
    val_x, val_y = prepare_data_for_train(
        val, metadata, conf['input_days'], conf['window'],
        conf['only_working_day'], verbose=verbose)
    train_x = np.concatenate([train_x[key] for key in conf['inputs']], axis=2)
    val_x = np.concatenate([val_x[key] for key in conf['inputs']], axis=2)
    return train_x, train_y, val_x, val_y
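# Minimal conf sketch for load_and_arrange_data: the keys mirror the ones the
# function reads, but the concrete values (fold index, seed, input names) are
# assumptions for illustration only.
example_conf = {
    'fold_idx': 0,
    'random_seed': 7,          # optional; enables the stratified split
    'input_days': 7,
    'window': 'hourly',
    'only_working_day': False,
    'inputs': ['consumption', 'temperature'],  # hypothetical input keys
}
# train_x, train_y, val_x, val_y = load_and_arrange_data(example_conf)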
def main():
    print(colored('\tPreparing data', 'blue'))
    train, test, _, metadata = load_data()
    add_is_off_column(train, metadata)
    add_is_off_column(test, metadata)
    add_is_holiday_column(train)
    add_is_holiday_column(test)
    train.to_csv(TRAIN_PATH, index=False)
    test.to_csv(TEST_PATH, index=False)
def main():
    print(colored('\tFinding clusters', 'blue'))
    train, test, _, metadata = load_data()
    train_clusters = get_clusters_with_time_continuity(train, metadata)
    train_clusters = [cluster_to_int(cluster) for cluster in train_clusters]
    with open(TRAIN_CLUSTERS_PATH, 'w') as f:
        json.dump(train_clusters, f)
    test_clusters = get_clusters_with_time_continuity(test, metadata)
    test_clusters = [cluster_to_int(cluster) for cluster in test_clusters]
    with open(TEST_CLUSTERS_PATH, 'w') as f:
        json.dump(test_clusters, f)
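# Sketch of reading the saved clusters back. The helper name is an assumption;
# json.load simply returns the lists that cluster_to_int made serialisable.
def load_clusters(path=TRAIN_CLUSTERS_PATH):
    with open(path) as f:
        return json.load(f)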
def load_and_arrange_data(conf, verbose=False):
    """
    Prepares the data for training using cross-validation
    """
    train, test, _, metadata = load_data()
    if 'random_seed' in conf:
        print('Using random seed')
        train_ids, val_ids = stratified_cv_series_id(
            train.series_id.unique(), fold_idx=conf['fold_idx'],
            random_seed=conf['random_seed'])
    else:
        train_ids, val_ids = split_series_id(train.series_id.unique(),
                                             fold_idx=conf['fold_idx'])
    val = train[train.series_id.isin(val_ids)]
    val.reset_index(inplace=True, drop=True)
    train = train[train.series_id.isin(train_ids)]
    train.reset_index(inplace=True, drop=True)
    train = pd.concat([train, test])
    train.reset_index(inplace=True, drop=True)

    _train_x, train_y = prepare_data_for_train(
        train, conf['input_days'], conf['window'], verbose=verbose)
    _val_x, val_y = prepare_data_for_train(
        val, conf['input_days'], conf['window'], verbose=verbose)

    train_x, val_x = {}, {}
    train_x['past_features'] = np.concatenate(
        [_train_x['past_%s' % key] for key in conf['past_features']], axis=2)
    val_x['past_features'] = np.concatenate(
        [_val_x['past_%s' % key] for key in conf['past_features']], axis=2)
    train_x['future_features'] = np.concatenate(
        [_train_x['future_%s' % key] for key in conf['future_features']],
        axis=2)
    val_x['future_features'] = np.concatenate(
        [_val_x['future_%s' % key] for key in conf['future_features']],
        axis=2)
    train_x['cluster_features'] = np.concatenate(
        [_train_x[key] for key in conf['cluster_features']], axis=1)
    val_x['cluster_features'] = np.concatenate(
        [_val_x[key] for key in conf['cluster_features']], axis=1)
    return train_x, train_y, val_x, val_y
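# Conf sketch for this seq2seq variant: the feature names are hypothetical
# placeholders; the function only requires that prepare_data_for_train emits
# matching 'past_%s' and 'future_%s' keys.
example_conf = {
    'fold_idx': 0,
    'input_days': 7,
    'window': 'hourly',
    'past_features': ['consumption', 'temperature'],  # assumed names
    'future_features': ['temperature'],               # assumed names
    'cluster_features': ['cluster_id'],               # assumed name
}
# train_x, train_y, val_x, val_y = load_and_arrange_data(example_conf)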
def _load_and_arrange_data(conf):
    train, test, submission, metadata = load_data()
    if 'random_seed' in conf:
        train_ids, val_ids = stratified_cv_series_id(
            train.series_id.unique(), fold_idx=conf['fold_idx'],
            random_seed=conf['random_seed'])
    else:
        train_ids, val_ids = split_series_id(train.series_id.unique(),
                                             fold_idx=conf['fold_idx'])
    val = train[train.series_id.isin(val_ids)]
    val.reset_index(inplace=True, drop=True)
    train = train[train.series_id.isin(train_ids)]
    train.reset_index(inplace=True, drop=True)
    train = pd.concat([train, test])
    train.reset_index(inplace=True, drop=True)

    train_x, train_y = prepare_data_for_train(
        train, metadata, conf['input_days'], conf['window'], verbose=False)
    val_x, val_y = prepare_data_for_train(
        val, metadata, conf['input_days'], conf['window'], verbose=False)
    # Drop the inputs the configuration asks to remove
    train_x = {key: train_x[key] for key in train_x
               if key not in conf['remove_inputs']}
    val_x = {key: val_x[key] for key in val_x
             if key not in conf['remove_inputs']}
    return train_x, train_y, val_x, val_y
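# Conf sketch for _load_and_arrange_data: unlike the seq2seq variant it keeps
# every input from prepare_data_for_train except those in 'remove_inputs'.
# The key names below are assumptions.
example_conf = {
    'fold_idx': 0,
    'input_days': 7,
    'window': 'hourly',
    'remove_inputs': ['is_holiday'],  # hypothetical input key to drop
}
# train_x, train_y, val_x, val_y = _load_and_arrange_data(example_conf)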
def all_series_ids():
    train, test, _, _ = load_data()
    all_series_id = (train.series_id.unique().tolist()
                     + test.series_id.unique().tolist())
    return all_series_id