def load_BikeNYC(window_len=6, nb_flow=2, len_test=240):
    """Load the BikeNYC flow data and split it into train/test windows.

    The function reads ``NYC14_M16x8_T60_NewEnd.h5`` from ``data_path``,
    drops days that do not have 24 timestamps, keeps the first ``nb_flow``
    entries along axis 1, clamps negative values to zero and scales the data
    with a ``MinMaxNormalization`` fitted on everything except the last
    ``len_test`` frames.

    Side effect: the fitted scaler is pickled to ``preprocessing.pkl`` in the
    current working directory.

    Args:
        window_len: Window length forwarded to ``data_slide_window``.
        nb_flow: Number of leading entries kept along axis 1 of the data.
        len_test: Number of trailing samples held out as the test split
            (also the number of trailing frames excluded from scaler fitting).

    Returns:
        Tuple ``(xtr, ytr, xte, yte)`` where the test parts are the last
        ``len_test`` items of the windows produced by ``data_slide_window``.
    """
    # Load the raw data and discard days with missing time slots.
    data, timestamps = load_stdata(data_path + 'NYC14_M16x8_T60_NewEnd.h5')
    data, timestamps = remove_incomplete_days(data, timestamps, T=24)

    data = data[:, :nb_flow]
    data[data < 0] = 0.

    # Fit the scaler on the training portion only, then scale everything.
    mmn = MinMaxNormalization()
    mmn.fit(data[:-len_test])
    data_mmn = mmn.transform(data)

    # Persist the scaler; the context manager closes the file even if
    # pickling fails part-way.
    with open('preprocessing.pkl', 'wb') as fpkl:
        pickle.dump(mmn, fpkl)

    X, Y = data_slide_window(data=data_mmn, window_len=window_len)
    xtr, xte = X[:-len_test], X[-len_test:]
    ytr, yte = Y[:-len_test], Y[-len_test:]
    return xtr, ytr, xte, yte
def load_BikeNYC_new(window_len=6, nb_flow=2, len_test=240):
    """Load BikeNYC data, shuffle the windows and build train/test samples.

    Same pipeline as the plain BikeNYC loader up to windowing, except that
    the scaler is ``MinMaxNormalization_01``. After windowing, ``X`` and
    ``Y`` are shuffled together with ``shuffle_data``, split so that the last
    ``len_test`` items form the test set, and the inputs are passed through
    ``generate_new_sample``.

    Side effect: the fitted scaler is pickled to ``preprocessing.pkl`` in the
    current working directory.

    Args:
        window_len: Window length forwarded to ``data_slide_window`` and, as
            ``T``, to ``generate_new_sample``.
        nb_flow: Number of leading entries kept along axis 1 of the data.
        len_test: Number of shuffled samples held out as the test split
            (also the number of trailing frames excluded from scaler fitting).

    Returns:
        Tuple ``(xtr, ytr, xte, yte)``; ``xtr``/``xte`` are whatever
        ``generate_new_sample`` returns, ``ytr``/``yte`` are NumPy arrays.
    """
    # Load the raw data and discard days with missing time slots.
    data, timestamps = load_stdata(data_path + 'NYC14_M16x8_T60_NewEnd.h5')
    data, timestamps = remove_incomplete_days(data, timestamps, T=24)

    data = data[:, :nb_flow]
    data[data < 0] = 0.

    # Fit the scaler on the leading (pre-shuffle) portion, then scale all.
    mmn = MinMaxNormalization_01()
    mmn.fit(data[:-len_test])
    data_mmn = mmn.transform(data)

    # Persist the scaler; the context manager closes the file even if
    # pickling fails part-way.
    with open('preprocessing.pkl', 'wb') as fpkl:
        pickle.dump(mmn, fpkl)

    X, Y = data_slide_window(data=data_mmn, window_len=window_len)

    # NOTE(review): shuffling before the split means the test windows are
    # drawn from the whole period, including frames the scaler was fitted
    # on -- confirm this is intended.
    X, Y = shuffle_data(X, Y)
    xtr, xte = X[:-len_test], X[-len_test:]
    ytr, yte = Y[:-len_test], Y[-len_test:]

    xtr = generate_new_sample(xtr, T=window_len)
    xte = generate_new_sample(xte, T=window_len)
    ytr = np.array(ytr)
    yte = np.array(yte)
    return xtr, ytr, xte, yte
def load_data(T=48, nb_flow=2, len_test=None,
              preprocess_name='preprocessing.pkl', meta_data=True,
              meteorol_data=True, holiday_data=True, window_len=12):
    """Load TaxiBJ flow data (files for years 13-16) with optional metadata.

    For each year the file ``TaxiBJ/BJ{year}_M32x32_T30_InOut.h5`` under
    ``datapath`` is read, days without ``T`` timestamps are removed, the
    first ``nb_flow`` entries along axis 1 are kept and negative values are
    clamped to zero. A ``MinMaxNormalization`` is fitted on the stacked data
    minus the last ``len_test`` frames and applied to every year. Sliding
    windows are then built, shuffled jointly with their timestamps and
    metadata, and split into train/test parts.

    Side effect: the fitted scaler is pickled to ``preprocess_name``.

    Args:
        T: Expected number of timestamps per day.
        nb_flow: Number of leading entries kept along axis 1 of the data.
        len_test: Number of samples held out for testing. Required; the
            ``None`` default is rejected.
        preprocess_name: Path of the pickle file receiving the scaler.
        meta_data: Include the ``timestamp2vec`` time features.
        meteorol_data: Include the ``load_meteorol`` features.
        holiday_data: Include the ``load_holiday`` features.
        window_len: Window length for ``data_slide_window_timestamps``.

    Returns:
        Tuple ``(X_train, Y_train, X_test, Y_test, mmn, metadata_dim,
        timestamp_train, timestamp_test)``. ``metadata_dim`` is ``None``
        when no metadata features were stacked.

    Raises:
        TypeError: If ``len_test`` is ``None``.
    """
    # Fail before any file I/O: the slices below negate len_test, which
    # previously surfaced as an opaque unary-minus TypeError after loading.
    if len_test is None:
        raise TypeError(
            'len_test must be an integer number of test samples, not None')

    data_all = []
    timestamps_all = []
    for year in range(13, 17):
        fname = os.path.join(
            datapath, 'TaxiBJ', 'BJ{}_M32x32_T30_InOut.h5'.format(year))
        print("file name: ", fname)
        data, timestamps = load_stdata(fname)
        # Remove days which do not have T timestamps.
        data, timestamps = remove_incomplete_days(data, timestamps, T)
        data = data[:, :nb_flow]
        data[data < 0] = 0.
        data_all.append(data)
        timestamps_all.append(timestamps)
        print("\n")

    # Min-max scaling, fitted on the training portion only.
    data_train = np.vstack(copy(data_all))[:-len_test]
    print('train_data shape: ', data_train.shape)
    mmn = MinMaxNormalization()
    mmn.fit(data_train)
    data_all_mmn = [mmn.transform(d) for d in data_all]
    data_all_mmn_vstack = np.vstack(copy(data_all_mmn))

    # Flatten the per-year timestamp sequences into a single list.
    timestamps_all_vstack = []
    for timestamps_element in timestamps_all:
        timestamps_all_vstack.extend(timestamps_element)

    # Persist the scaler; the context manager closes the file even if
    # pickling fails part-way.
    with open(preprocess_name, 'wb') as fpkl:
        pickle.dump(mmn, fpkl)

    meta_parts = []
    if meta_data:
        time_feature = timestamp2vec(timestamps_all_vstack)
        meta_parts.append(time_feature)
    if holiday_data:
        holiday_feature = load_holiday(timestamps_all_vstack)
        meta_parts.append(holiday_feature)
    if meteorol_data:
        meteorol_feature = load_meteorol(timestamps_all_vstack)
        meta_parts.append(meteorol_feature)

    if len(meta_parts) > 0:
        meta_feature = np.hstack(meta_parts)
    else:
        meta_feature = np.asarray(meta_parts)

    metadata_dim = (
        meta_feature.shape[1] if len(meta_feature.shape) > 1 else None)
    # Guard the comparison: metadata_dim is None when nothing was stacked,
    # and None < 1 raises TypeError on Python 3.
    if metadata_dim is not None and metadata_dim < 1:
        metadata_dim = None

    if meta_data and holiday_data and meteorol_data:
        print('time feature:', time_feature.shape,
              'holiday feature:', holiday_feature.shape,
              'meteorol feature: ', meteorol_feature.shape,
              'mete feature: ', meta_feature.shape)

    X, Y, timestamps_X, meta_feature_X = data_slide_window_timestamps(
        data_all_mmn_vstack, window_len=window_len,
        timestamps=timestamps_all_vstack, meta_data=meta_feature)

    # Shuffle all four sequences with the same permutation (assumes
    # shuffle_data_many keeps them aligned -- TODO confirm).
    s = shuffle_data_many([X, Y, timestamps_X, meta_feature_X])
    X, Y, timestamps_X, meta_feature = s[0], s[1], s[2], s[3]

    X_train, X_test = X[:-len_test], X[-len_test:]
    Y_train, Y_test = Y[:-len_test], Y[-len_test:]
    # These two names were returned but never assigned (NameError on every
    # call); split the shuffled timestamps the same way as X and Y.
    timestamp_train = timestamps_X[:-len_test]
    timestamp_test = timestamps_X[-len_test:]

    if metadata_dim is not None:
        meta_feature_train = meta_feature[:-len_test]
        meta_feature_test = meta_feature[-len_test:]
        # NOTE(review): .append presumes X_train/X_test are lists here;
        # verify against data_slide_window_timestamps / shuffle_data_many.
        X_train.append(meta_feature_train)
        X_test.append(meta_feature_test)

    return (X_train, Y_train, X_test, Y_test, mmn, metadata_dim,
            timestamp_train, timestamp_test)