Example #1
def process_data(df_raw_data, time_steps, train_num):
    """
    processing raw data
    :param df_raw_data:
    :param time_steps:
    :param train_num:
    :return:
    """
    # df_raw_data['PM25'].astype(float)
    # df_raw_data = df_raw_data[df_raw_data['PM25'].astype(float) < 100]
    # df_raw_data = drop_outlier(df_raw_data, ['PM25'], 6)
    # draw.draw_time_series(df_raw_data, ['PM25'])
    # time_steps = data.get_time_steps()
    df_raw_data = group_by_diff_time_span(df_raw_data, 'hour')
    max_time_step = max(time_steps.values())
    # pop the date features
    df_date = df_raw_data.pop('Month')
    if 'Day' in df_raw_data.columns:
        df_date = pd.concat([df_date, df_raw_data.pop('Day')], axis=1)
    if 'Hour' in df_raw_data.columns:
        df_date = pd.concat([df_date, df_raw_data.pop('Hour')], axis=1)
    df_date = df_date.loc[max_time_step:]

    # processing the sequence features
    # df_raw_data = df_raw_data[list(time_steps.keys())]
    df_raw_data = data.process_sequence_features(df_raw_data, 0, time_steps, max_time_step, padding_value=-1)
    df_raw_data = df_raw_data.loc[max_time_step:]

    # encoding the date features
    df_date_encoded = data.encoding_features(df_date, ['Month', 'Hour', 'Day'])

    # normalization
    y_scaled, y_scaler = data.min_max_scale(np.array(df_raw_data.pop('PM25')).reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(df_raw_data)
    date_encoded = np.array(df_date_encoded)

    # split into training and test samples
    X_train = np.append(X_scaled[:train_num, :], date_encoded[:train_num, :], axis=1)
    X_test = np.append(X_scaled[train_num:, :], date_encoded[train_num:, :], axis=1)
    y_train = y_scaled[:train_num].ravel()
    y_test = y_scaled[train_num:].ravel()

    return X_train, X_test, y_train, y_test, y_scaler
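A minimal usage sketch for process_data. The CSV path, column names, and look-back values are assumptions, and pd plus the data helpers are assumed to be in scope as in the snippets themselves:

df = pd.read_csv('data/pm25_hourly.csv')  # hypothetical path and schema
time_steps = {'PM25': 24, 'Press': 24, 'Temp': 24}  # assumed per-feature look-back lengths
train_num = 365 * 24  # first year of hourly records as training data
X_train, X_test, y_train, y_test, y_scaler = process_data(df, time_steps, train_num)
print(X_train.shape, X_test.shape)  # scaled sequence features plus encoded date columns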
Example #2
def load_data(path, time_steps, lstm_num, cols=None, dtype=str):
    """
    load data
    :param path: data file path
    :param cols: which features
    :return: X, X_scaler, y, y_scaler
    """
    df_raw = pd.read_csv(path, usecols=cols, dtype=dtype)
    df_date = df_raw.pop('Month')
    df_date = pd.concat([df_date, df_raw.pop('Day')], axis=1)
    df_date = pd.concat([df_date, df_raw.pop('Hour')], axis=1)
    df_date = df_date.loc[time_steps:]

    # processing the sequence features
    df_raw = data.process_sequence_features(df_raw, time_steps=time_steps)
    # encoding the date features
    df_date_encode = data.encoding_features(df_date, ['Month', 'Hour', 'Day'])

    # normalization
    y_scaled, y_scaler = data.min_max_scale(
        np.array(df_raw.pop('PM25')).reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(df_raw)
    date_encode = np.array(df_date_encode)

    # reshape y
    y = y_scaled.reshape((y_scaled.shape[0], 1, y_scaled.shape[1]))
    # reshape X
    X_scaled = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))
    date_encode = date_encode.reshape(
        (date_encode.shape[0], 1, date_encode.shape[1]))
    X = []
    # split: each group of time-series features (PM2.5, Press, etc.) feeds its own LSTM branch
    split_size = X_scaled.shape[2] // lstm_num
    for i in range(lstm_num):
        X.append(X_scaled[:, :, i * split_size:(i + 1) * split_size])
    # date/time features
    X.append(date_encode)

    return X, X_scaler, y, y_scaler
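A hedged usage sketch for load_data; the path and column list are assumptions. The function splits the scaled features evenly across lstm_num branches and appends the encoded date features as the final input:

X, X_scaler, y, y_scaler = load_data('data/pm25_hourly.csv',  # hypothetical path
                                     time_steps=24,
                                     lstm_num=3,
                                     cols=['Month', 'Day', 'Hour', 'PM25', 'Press', 'Temp'])
print(len(X), X[0].shape, y.shape)  # lstm_num + 1 inputs: one per branch plus the dates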
Example #3
def process_data(df_raw, time_steps, train_num):
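    """
    Process raw data into per-branch train/test inputs for a dual-input model:
    one branch of scaled sequence features and one of encoded date features.
    :param df_raw: raw DataFrame with date columns and sequence features
    :param time_steps: dict mapping each sequence feature to its look-back length
    :param train_num: number of samples to use for training
    :return: train_X, test_X, train_y, test_y, y_scaler
    """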
    max_time_step = max(time_steps.values())
    # pop the date features
    df_date = df_raw.pop('Month')
    df_date = pd.concat([df_date, df_raw.pop('Day')], axis=1)
    df_date = pd.concat([df_date, df_raw.pop('Hour')], axis=1)
    df_date = df_date.loc[max_time_step:]

    # processing the sequence features
    df_raw = data.process_sequence_features(df_raw, time_steps=time_steps)
    df_raw = df_raw.loc[max_time_step:]
    # encoding the date features
    df_date_encode = data.encoding_features(df_date, ['Month', 'Hour', 'Day'])

    # normalization
    y_scaled, y_scaler = data.min_max_scale(np.array(df_raw.pop('PM25')).reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(df_raw)
    date_encode = np.array(df_date_encode)

    # reshape y
    train_y = y_scaled[:train_num]
    test_y = y_scaled[train_num:]
    train_y = train_y.reshape((train_y.shape[0], 1, train_y.shape[1]))
    test_y = test_y.reshape((test_y.shape[0], 1, test_y.shape[1]))
    # reshape X
    X_scaled = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))
    date_encode = date_encode.reshape((date_encode.shape[0], 1, date_encode.shape[1]))
    train_X = []
    test_X = []
    # split: the time-series features (PM2.5, Press, etc.) feed a single LSTM branch
    train_X.append(X_scaled[:train_num, :, :])
    test_X.append(X_scaled[train_num:, :, :])
    # date/time features
    train_X.append(date_encode[:train_num, :, :])
    test_X.append(date_encode[train_num:, :, :])

    return train_X, test_X, train_y, test_y, y_scaler
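Example usage under the same assumptions (hypothetical CSV path and look-back dict):

df = pd.read_csv('data/pm25_hourly.csv')  # hypothetical path
train_X, test_X, train_y, test_y, y_scaler = process_data(
    df, time_steps={'PM25': 24, 'Press': 24}, train_num=365 * 24)
print(train_X[0].shape, train_X[1].shape)  # [0]: sequence features, [1]: encoded dates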
Example #4
def train(df_raw, time_steps=1, train_num=365 * 24):
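    """
    Train a stacked LSTM on the scaled sequence features, plot the loss
    curves, and compare predictions against the original data.
    """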
    # processing the sequence features
    df_raw = data.process_sequence_features(df_raw, time_steps=time_steps)

    # normalization
    y_scaled, y_scaler = data.min_max_scale(
        np.array(df_raw.pop('PM25')).reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(df_raw)

    # split data to train data and test data
    train_X, train_y, test_X, test_y = data.split_data(X_scaled,
                                                       y_scaled,
                                                       train_num=train_num)

    # reshape data
    train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
    train_y = train_y.reshape((train_y.shape[0], 1, train_y.shape[1]))
    test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
    test_y = test_y.reshape((test_y.shape[0], 1, test_y.shape[1]))

    # build a Sequential model
    model = Sequential()
    model.add(
        LSTM(50,
             input_shape=(train_X.shape[1], train_X.shape[2]),
             return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(100, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(100, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(50, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(Dense(units=1024, activation='linear'))
    # model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(units=1024, activation='linear'))
    # model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(units=1))
    model.compile(loss='mse', optimizer='RMSprop')
    history = model.fit(train_X,
                        train_y,
                        epochs=100,
                        batch_size=1024,
                        validation_data=(test_X, test_y),
                        verbose=2,
                        shuffle=False)

    # draw the loss curve
    plt.figure(1)
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')
    plt.legend()

    # draw to compare the original data and the predicted data, and print the evaluation metrics
    pred_y = model.predict(test_X)
    test_y = data.inverse_to_original_data(train_y.reshape(1, -1),
                                           test_y.reshape(1, -1),
                                           scaler=y_scaler,
                                           train_num=train_num)
    pred_y = data.inverse_to_original_data(train_y.reshape(1, -1),
                                           pred_y.reshape(1, -1),
                                           scaler=y_scaler,
                                           train_num=train_num)
    evaluate.print_metrics(test_y, pred_y)
    evaluate.print_curve(test_y, pred_y)
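A minimal driver for this trainer; the CSV path and feature columns are assumptions, and plt.show() is added because train only draws onto the current figures:

df = pd.read_csv('data/pm25_hourly.csv',             # hypothetical path
                 usecols=['PM25', 'Press', 'Temp'],  # assumed feature columns
                 dtype=float)
train(df, time_steps=1, train_num=365 * 24)
plt.show()  # display the loss curve and the prediction comparison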
Example #5
def train(df_raw,
          model_path,
          weight_path,
          lstm_config,
          dense_config,
          epochs=100,
          batch_size=100,
          time_steps=1,
          test_split=0.3):
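    """
    Train a multi-branch LSTM (one branch per group of sequence features plus
    one for the encoded date features), checkpoint the best weights, and
    return test targets and predictions on the original scale.
    """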
    # pop the date features
    df_date = df_raw.pop('Month')
    df_date = pd.concat([df_date, df_raw.pop('Day')], axis=1)
    df_date = pd.concat([df_date, df_raw.pop('Hour')], axis=1)
    df_date = df_date.loc[time_steps:]

    # processing the sequence features
    df_raw = data.process_sequence_features(df_raw, time_steps=time_steps)
    # encoding the date features
    df_date_encode = data.encoding_features(df_date, ['Month', 'Hour', 'Day'])

    # normalization
    y_scaled, y_scaler = data.min_max_scale(
        np.array(df_raw.pop('PM25')).reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(df_raw)
    date_encode = np.array(df_date_encode)

    # reshape y
    train_num = int(len(X_scaled) * (1 - test_split))
    train_y = y_scaled[:train_num]
    test_y = y_scaled[train_num:]
    train_y = train_y.reshape((train_y.shape[0], 1, train_y.shape[1]))
    test_y = test_y.reshape((test_y.shape[0], 1, test_y.shape[1]))
    # reshape X
    X_scaled = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))
    date_encode = date_encode.reshape(
        (date_encode.shape[0], 1, date_encode.shape[1]))
    train_X = []
    test_X = []
    # split: each group of time-series features (PM2.5, Press, etc.) feeds its own LSTM branch
    for i in range(lstm_config['num']):
        train_X.append(X_scaled[:train_num, :,
                                i * time_steps:(i + 1) * time_steps])
        test_X.append(X_scaled[train_num:, :,
                               i * time_steps:(i + 1) * time_steps])
    # date/time features
    train_X.append(date_encode[:train_num, :, :])
    test_X.append(date_encode[train_num:, :, :])

    # build model
    model = build_model(model_path, weight_path, lstm_config, dense_config,
                        time_steps)

    # checkpoint
    checkpoint = ModelCheckpoint(weight_path,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='min')
    callbacks_list = [checkpoint]
    history = model.fit(train_X,
                        train_y,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=(test_X, test_y),
                        verbose=1,
                        callbacks=callbacks_list,
                        shuffle=False)

    # draw the loss curve
    plt.figure(0)
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')
    plt.legend()

    # draw to compare the original data and the predicted data, and print the evaluation metrics
    pred_y = model.predict(test_X)
    test_y = data.inverse_to_original_data(train_y.reshape(1, -1),
                                           test_y.reshape(1, -1),
                                           scaler=y_scaler,
                                           train_num=train_num)
    pred_y = data.inverse_to_original_data(train_y.reshape(1, -1),
                                           pred_y.reshape(1, -1),
                                           scaler=y_scaler,
                                           train_num=train_num)
    return test_y, pred_y
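A hedged usage sketch: only lstm_config['num'] is read in this snippet, so both config dicts below are placeholders for whatever build_model (not shown) actually expects, and the file paths are hypothetical:

df = pd.read_csv('data/pm25_hourly.csv')  # hypothetical path
lstm_config = {'num': 3}  # assumed: one LSTM branch per group of sequence features
dense_config = {}         # placeholder; real keys depend on build_model
test_y, pred_y = train(df, 'model.json', 'weights.h5',
                       lstm_config, dense_config,
                       epochs=50, batch_size=128,
                       time_steps=24, test_split=0.3)
evaluate.print_metrics(test_y, pred_y)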