Example #1
0
def load_BikeNYC(window_len=6, nb_flow=2, len_test=240):
    """Load the BikeNYC dataset and return min-max-scaled train/test windows.

    Parameters
    ----------
    window_len : int
        Number of consecutive timestamps per input sample.
    nb_flow : int
        Number of flow channels to keep (presumably inflow/outflow).
    len_test : int
        Number of trailing samples reserved for the test split.

    Returns
    -------
    xtr, ytr, xte, yte
        Training inputs/targets and test inputs/targets.

    Side effects
    ------------
    Writes the fitted ``MinMaxNormalization`` scaler to ``preprocessing.pkl``
    so predictions can be de-normalized later.
    """
    # load original data
    data, timestamps = load_stdata(data_path + 'NYC14_M16x8_T60_NewEnd.h5')

    # remove days that do not have 24 timestamps
    data, timestamps = remove_incomplete_days(data, timestamps, T=24)
    data = data[:, :nb_flow]
    data[data < 0] = 0.  # clamp negative counts

    # Min-Max scale: fit on the training portion only to avoid leaking
    # test statistics into the normalization.
    data_train = data[:-len_test]
    mmn = MinMaxNormalization()
    mmn.fit(data_train)
    data_mmn = mmn.transform(data)

    # Save the fitted scaler. Bug fix: the original left the file handle
    # open if pickle.dump raised; a context manager always closes it.
    with open('preprocessing.pkl', 'wb') as fpkl:
        pickle.dump(mmn, fpkl)

    X, Y = data_slide_window(data=data_mmn, window_len=window_len)
    xtr, ytr = X[:-len_test], Y[:-len_test]
    xte, yte = X[-len_test:], Y[-len_test:]
    return xtr, ytr, xte, yte
Example #2
0
def load_BikeNYC_new(window_len=6, nb_flow=2, len_test=240):
    """Load BikeNYC, scale to [0, 1], shuffle, and return reshaped splits.

    Unlike :func:`load_BikeNYC`, this variant uses
    ``MinMaxNormalization_01``, shuffles the windows before splitting, and
    post-processes the inputs with ``generate_new_sample``.

    Parameters
    ----------
    window_len : int
        Number of consecutive timestamps per input sample.
    nb_flow : int
        Number of flow channels to keep (presumably inflow/outflow).
    len_test : int
        Number of trailing samples reserved for the test split.

    Returns
    -------
    xtr, ytr, xte, yte
        Training inputs/targets and test inputs/targets; targets are
        returned as numpy arrays.

    Side effects
    ------------
    Writes the fitted scaler to ``preprocessing.pkl``.
    """
    # load original data
    data, timestamps = load_stdata(data_path + 'NYC14_M16x8_T60_NewEnd.h5')

    # remove days that do not have 24 timestamps
    data, timestamps = remove_incomplete_days(data, timestamps, T=24)
    data = data[:, :nb_flow]
    data[data < 0] = 0.  # clamp negative counts

    # Fit the [0, 1] scaler on the training portion only to avoid leaking
    # test statistics into the normalization.
    data_train = data[:-len_test]
    mmn = MinMaxNormalization_01()
    mmn.fit(data_train)
    data_mmn = mmn.transform(data)

    # Save the fitted scaler. Bug fix: the original left the file handle
    # open if pickle.dump raised; a context manager always closes it.
    with open('preprocessing.pkl', 'wb') as fpkl:
        pickle.dump(mmn, fpkl)

    X, Y = data_slide_window(data=data_mmn, window_len=window_len)
    # Shuffle before the chronological split, so train/test are a random
    # partition rather than a temporal one.
    X, Y = shuffle_data(X, Y)
    xtr, ytr = X[:-len_test], Y[:-len_test]
    xte, yte = X[-len_test:], Y[-len_test:]
    xtr = generate_new_sample(xtr, T=window_len)
    xte = generate_new_sample(xte, T=window_len)
    ytr = np.array(ytr)
    yte = np.array(yte)
    return xtr, ytr, xte, yte
Example #3
0
def load_data(T=48,
              nb_flow=2,
              len_test=None,
              preprocess_name='preprocessing.pkl',
              meta_data=True,
              meteorol_data=True,
              holiday_data=True,
              window_len=12):
    """Load TaxiBJ (years 2013-2016), scale, window, shuffle, and split.

    Parameters
    ----------
    T : int
        Timestamps per day; days without exactly T slots are dropped.
    nb_flow : int
        Number of flow channels to keep.
    len_test : int
        Number of trailing samples reserved for the test split.
    preprocess_name : str
        Path where the fitted scaler is pickled.
    meta_data, meteorol_data, holiday_data : bool
        Which external feature groups to stack into the meta feature.
    window_len : int
        Number of consecutive timestamps per input sample.

    Returns
    -------
    X_train, Y_train, X_test, Y_test, mmn, metadata_dim,
    timestamp_train, timestamp_test
    """
    data_all = []
    timestamps_all = []
    for year in range(13, 17):
        fname = os.path.join(datapath, 'TaxiBJ',
                             'BJ{}_M32x32_T30_InOut.h5'.format(year))
        print("file name: ", fname)
        data, timestamps = load_stdata(fname)
        # remove days which do not have exactly T timestamps
        data, timestamps = remove_incomplete_days(data, timestamps, T)
        data = data[:, :nb_flow]
        data[data < 0] = 0.  # clamp negative counts
        data_all.append(data)
        timestamps_all.append(timestamps)
        print("\n")

    # Fit the min-max scaler on training data only to avoid leaking test
    # statistics into the normalization.
    data_train = np.vstack(copy(data_all))[:-len_test]
    print('train_data shape: ', data_train.shape)
    mmn = MinMaxNormalization()
    mmn.fit(data_train)
    data_all_mmn = [mmn.transform(d) for d in data_all]
    data_all_mmn_vstack = np.vstack(copy(data_all_mmn))
    # Flatten the per-year timestamp lists into one chronological list.
    timestamps_all_vstack = []
    for timestamps_element in timestamps_all:
        timestamps_all_vstack += timestamps_element

    # Save the fitted scaler. Bug fix: the original left the file handle
    # open if pickle.dump raised; a context manager always closes it.
    with open(preprocess_name, 'wb') as fpkl:
        pickle.dump(mmn, fpkl)

    meta_feature = []
    if meta_data:
        # load time feature
        time_feature = timestamp2vec(timestamps_all_vstack)
        meta_feature.append(time_feature)
    if holiday_data:
        # load holiday feature
        holiday_feature = load_holiday(timestamps_all_vstack)
        meta_feature.append(holiday_feature)
    if meteorol_data:
        # load meteorological feature
        meteorol_feature = load_meteorol(timestamps_all_vstack)
        meta_feature.append(meteorol_feature)

    meta_feature = np.hstack(
        meta_feature) if len(meta_feature) > 0 else np.asarray(meta_feature)
    metadata_dim = meta_feature.shape[1] if len(
        meta_feature.shape) > 1 else None
    # Bug fix: the original did `if metadata_dim < 1:` which raises
    # TypeError on Python 3 when metadata_dim is None.
    if metadata_dim is not None and metadata_dim < 1:
        metadata_dim = None
    if meta_data and holiday_data and meteorol_data:
        print('time feature:', time_feature.shape, 'holiday feature:',
              holiday_feature.shape, 'meteorol feature: ',
              meteorol_feature.shape, 'mete feature: ', meta_feature.shape)

    X, Y, timestamps_X, meta_feature_X = data_slide_window_timestamps(
        data_all_mmn_vstack,
        window_len=window_len,
        timestamps=timestamps_all_vstack,
        meta_data=meta_feature)
    # Shuffle all four sequences with the same permutation, then split.
    s = shuffle_data_many([X, Y, timestamps_X, meta_feature_X])
    X, Y, timestamps_X, meta_feature = s[0], s[1], s[2], s[3]
    X_train, X_test = X[:-len_test], X[-len_test:]
    Y_train, Y_test = Y[:-len_test], Y[-len_test:]
    # Bug fix: the original returned timestamp_train/timestamp_test without
    # ever defining them (NameError); derive them from the shuffled
    # timestamps with the same split.
    timestamp_train = timestamps_X[:-len_test]
    timestamp_test = timestamps_X[-len_test:]

    # Bug fix: the original appended meta_feature_train/meta_feature_test
    # unconditionally, raising NameError when metadata_dim is None; only
    # append the meta features when they actually exist.
    if metadata_dim is not None:
        meta_feature_train = meta_feature[:-len_test]
        meta_feature_test = meta_feature[-len_test:]
        X_train.append(meta_feature_train)
        X_test.append(meta_feature_test)

    return X_train, Y_train, X_test, Y_test, mmn, metadata_dim, timestamp_train, timestamp_test