def process_data(df_raw_data, time_steps, train_num):
    """
    Turn the raw frame into scaled, flat train/test arrays.

    :param df_raw_data: raw records; the 'Month' and 'PM25' columns are required,
        'Day' and 'Hour' are used when present
    :param time_steps: mapping whose values are step counts; the largest value is
        the index label from which rows are kept
    :param train_num: number of leading samples that go into the training split
    :return: X_train, X_test, y_train, y_test, y_scaler
    """
    hourly = group_by_diff_time_span(df_raw_data, 'hour')
    longest_step = max(time_steps.values())

    # Move the calendar columns out of the feature frame. 'Month' must exist,
    # the other two are only taken when the frame has them.
    calendar = hourly.pop('Month')
    for optional_col in ('Day', 'Hour'):
        if optional_col in hourly.columns:
            calendar = pd.concat([calendar, hourly.pop(optional_col)], axis=1)
    calendar = calendar.loc[longest_step:]

    # Build the sequence features, then keep rows from label `longest_step` on.
    hourly = data.process_sequence_features(
        hourly, 0, time_steps, longest_step, padding_value=-1)
    hourly = hourly.loc[longest_step:]

    # NOTE(review): all three names are always passed here even though 'Day' and
    # 'Hour' are optional above -- confirm the helper tolerates missing columns.
    calendar_encoded = data.encoding_features(calendar, ['Month', 'Hour', 'Day'])

    # Scale the target (as a single column) and the remaining features separately.
    target_column = np.array(hourly.pop('PM25')).reshape(-1, 1)
    y_scaled, y_scaler = data.min_max_scale(target_column)
    X_scaled, _ = data.min_max_scale(hourly)
    calendar_matrix = np.array(calendar_encoded)

    # Split the samples: the first `train_num` rows train, the rest test.
    # Scaled features and encoded date features are joined column-wise.
    X_train = np.concatenate(
        (X_scaled[:train_num, :], calendar_matrix[:train_num, :]), axis=1)
    X_test = np.concatenate(
        (X_scaled[train_num:, :], calendar_matrix[train_num:, :]), axis=1)

    # Targets are returned as flat 1-D arrays.
    y_train = np.array(y_scaled[:train_num]).reshape(-1)
    y_test = np.array(y_scaled[train_num:]).reshape(-1)

    return X_train, X_test, y_train, y_test, y_scaler
def load_data(path, time_steps, lstm_num, cols=None, dtype=str):
    """
    Read a CSV file and shape it into per-LSTM inputs plus one date/time input.

    :param path: data file path
    :param time_steps: forwarded to data.process_sequence_features; also the index
        label from which the date rows are kept
    :param lstm_num: number of equally wide slices cut from the scaled feature axis
    :param cols: which columns to read (forwarded to pandas.read_csv as usecols)
    :param dtype: dtype forwarded to pandas.read_csv
    :return: X, X_scaler, y, y_scaler -- X is a list holding `lstm_num` feature
        slices followed by the encoded date features
    """

    def with_step_axis(matrix):
        # (rows, width) -> (rows, 1, width)
        return matrix.reshape((matrix.shape[0], 1, matrix.shape[1]))

    frame = pd.read_csv(path, usecols=cols, dtype=dtype)

    # Move the calendar columns out of the feature frame.
    calendar = frame.pop('Month')
    for date_col in ('Day', 'Hour'):
        calendar = pd.concat([calendar, frame.pop(date_col)], axis=1)
    calendar = calendar.loc[time_steps:]

    # Sequence features first, then the encoded date features.
    frame = data.process_sequence_features(frame, time_steps=time_steps)
    calendar_encoded = data.encoding_features(calendar, ['Month', 'Hour', 'Day'])

    # Scale the target (as a single column) and the remaining features separately.
    target_column = np.array(frame.pop('PM25')).reshape(-1, 1)
    y_scaled, y_scaler = data.min_max_scale(target_column)
    X_scaled, X_scaler = data.min_max_scale(frame)

    y = with_step_axis(y_scaled)
    features = with_step_axis(X_scaled)
    date_block = with_step_axis(np.array(calendar_encoded))

    # Split the feature axis so that each time-series feature (PM2.5, Press, ...)
    # becomes the input of its own LSTM model.
    width = features.shape[2] // lstm_num
    X = [features[:, :, k * width:(k + 1) * width] for k in range(lstm_num)]

    # The date/time features are the last input.
    X.append(date_block)

    return X, X_scaler, y, y_scaler
def process_data(df_raw, time_steps, train_num):
    """
    Build the train/test inputs (sequence features + date features) and targets.

    :param df_raw: raw records with 'Month', 'Day', 'Hour' and 'PM25' columns;
        the three date columns are popped from this frame
    :param time_steps: mapping whose values are step counts; the largest value is
        the index label from which rows are kept
    :param train_num: number of leading samples that go into the training split
    :return: train_X, test_X, train_y, test_y, y_scaler -- each X is a two-item
        list: [scaled sequence features, encoded date features]
    """

    def with_step_axis(matrix):
        # (rows, width) -> (rows, 1, width)
        return matrix.reshape((matrix.shape[0], 1, matrix.shape[1]))

    longest_step = max(time_steps.values())

    # Move the calendar columns out of the feature frame.
    calendar = df_raw.pop('Month')
    for date_col in ('Day', 'Hour'):
        calendar = pd.concat([calendar, df_raw.pop(date_col)], axis=1)
    calendar = calendar.loc[longest_step:]

    # Build the sequence features, then keep rows from label `longest_step` on.
    sequences = data.process_sequence_features(df_raw, time_steps=time_steps)
    sequences = sequences.loc[longest_step:]

    calendar_encoded = data.encoding_features(calendar, ['Month', 'Hour', 'Day'])

    # Scale the target (as a single column) and the remaining features separately.
    target_column = np.array(sequences.pop('PM25')).reshape(-1, 1)
    y_scaled, y_scaler = data.min_max_scale(target_column)
    X_scaled, _ = data.min_max_scale(sequences)

    # Targets: split first, then add the middle axis.
    train_y = with_step_axis(y_scaled[:train_num])
    test_y = with_step_axis(y_scaled[train_num:])

    features = with_step_axis(X_scaled)
    date_block = with_step_axis(np.array(calendar_encoded))

    # All time-series features (PM2.5, Press, ...) form the input of a single
    # LSTM model; the date/time features are the second input.
    train_X = [features[:train_num, :, :], date_block[:train_num, :, :]]
    test_X = [features[train_num:, :, :], date_block[train_num:, :, :]]

    return train_X, test_X, train_y, test_y, y_scaler
def train(df_raw, time_steps=1, train_num=365 * 24):
    """
    Fit a stacked-LSTM regressor on the 'PM25' column and report hold-out results.

    The function scales the data, splits it with data.split_data, trains the
    network, plots the training/validation loss and finally hands the
    inverse-scaled true and predicted values to evaluate.print_metrics and
    evaluate.print_curve.

    :param df_raw: raw records; must contain a 'PM25' column
    :param time_steps: forwarded to data.process_sequence_features
    :param train_num: forwarded to data.split_data and
        data.inverse_to_original_data as the training-sample count
    :return: None
    """

    def with_step_axis(matrix):
        # (rows, width) -> (rows, 1, width), the 3-D layout fed to the LSTM layers
        return matrix.reshape((matrix.shape[0], 1, matrix.shape[1]))

    # Processing the sequence features.
    df_raw = data.process_sequence_features(df_raw, time_steps=time_steps)

    # Normalization: the target (as a single column) and the features separately.
    # The feature scaler is not needed afterwards.
    y_scaled, y_scaler = data.min_max_scale(
        np.array(df_raw.pop('PM25')).reshape(-1, 1))
    X_scaled, _ = data.min_max_scale(df_raw)

    # Split into train and test parts, then add the middle axis.
    train_X, train_y, test_X, test_y = data.split_data(
        X_scaled, y_scaled, train_num=train_num)
    train_X = with_step_axis(train_X)
    train_y = with_step_axis(train_y)
    test_X = with_step_axis(test_X)
    test_y = with_step_axis(test_y)

    # Four LSTM layers (each followed by dropout), two wide linear Dense layers
    # and a single-unit output layer.
    layers = [
        LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2]),
             return_sequences=True),
        Dropout(0.3),
        LSTM(100, return_sequences=True),
        Dropout(0.3),
        LSTM(100, return_sequences=True),
        Dropout(0.3),
        LSTM(50, return_sequences=True),
        Dropout(0.3),
        Dense(units=1024, activation='linear'),
        Dropout(0.5),
        Dense(units=1024, activation='linear'),
        Dropout(0.5),
        Dense(units=1),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(loss='mse', optimizer='RMSprop')

    # NOTE(review): the test split doubles as validation data here.
    history = model.fit(train_X, train_y, epochs=100, batch_size=1024,
                        validation_data=(test_X, test_y), verbose=2,
                        shuffle=False)

    # Draw the loss curves. The legend call is what makes the 'train'/'test'
    # labels visible; without it the labels are never rendered.
    plt.figure(1)
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')
    plt.legend()

    # Compare the original data with the predictions and print the metrics.
    pred_y = model.predict(test_X)
    test_y = data.inverse_to_original_data(
        train_y.reshape(1, -1), test_y.reshape(1, -1),
        scaler=y_scaler, train_num=train_num)
    pred_y = data.inverse_to_original_data(
        train_y.reshape(1, -1), pred_y.reshape(1, -1),
        scaler=y_scaler, train_num=train_num)
    evaluate.print_metrics(test_y, pred_y)
    evaluate.print_curve(test_y, pred_y)
def train(df_raw, model_path, weight_path, lstm_config, dense_config, epochs=100,
          batch_size=100, time_steps=1, test_split=0.3):
    """
    Train the multi-input model from build_model and return hold-out results.

    :param df_raw: raw records with 'Month', 'Day', 'Hour' and 'PM25' columns;
        the three date columns are popped from this frame
    :param model_path: forwarded to build_model
    :param weight_path: forwarded to build_model; also the file the checkpoint
        callback writes the best weights to
    :param lstm_config: mapping; lstm_config['num'] is the number of LSTM inputs,
        the whole mapping is forwarded to build_model
    :param dense_config: forwarded to build_model
    :param epochs: number of training epochs
    :param batch_size: training batch size
    :param time_steps: forwarded to data.process_sequence_features and
        build_model; also the width of each LSTM input slice and the index label
        from which the date rows are kept
    :param test_split: fraction of the samples (taken from the end) held out
    :return: test_y, pred_y -- both as returned by data.inverse_to_original_data
    """

    def with_step_axis(matrix):
        # (rows, width) -> (rows, 1, width)
        return matrix.reshape((matrix.shape[0], 1, matrix.shape[1]))

    # Pop the date features.
    df_date = df_raw.pop('Month')
    for date_col in ('Day', 'Hour'):
        df_date = pd.concat([df_date, df_raw.pop(date_col)], axis=1)
    df_date = df_date.loc[time_steps:]

    # Processing the sequence features, then encoding the date features.
    df_raw = data.process_sequence_features(df_raw, time_steps=time_steps)
    df_date_encode = data.encoding_features(df_date, ['Month', 'Hour', 'Day'])

    # Normalization: the target (as a single column) and the features separately.
    # The feature scaler is not needed afterwards.
    y_scaled, y_scaler = data.min_max_scale(
        np.array(df_raw.pop('PM25')).reshape(-1, 1))
    X_scaled, _ = data.min_max_scale(df_raw)

    # Index of the first test sample; computed once and reused for every slice.
    split_idx = int(len(X_scaled) * (1 - test_split))

    train_y = with_step_axis(y_scaled[:split_idx])
    test_y = with_step_axis(y_scaled[split_idx:])

    X_scaled = with_step_axis(X_scaled)
    date_encode = with_step_axis(np.array(df_date_encode))

    # Split the feature axis so that each time-series feature (PM2.5, Press, ...)
    # becomes the input of its own LSTM model; each slice is `time_steps` wide.
    feature_slices = [
        slice(i * time_steps, (i + 1) * time_steps)
        for i in range(lstm_config['num'])
    ]
    train_X = [X_scaled[:split_idx, :, cols] for cols in feature_slices]
    test_X = [X_scaled[split_idx:, :, cols] for cols in feature_slices]

    # The date/time features are the last input.
    train_X.append(date_encode[:split_idx, :, :])
    test_X.append(date_encode[split_idx:, :, :])

    model = build_model(model_path, weight_path, lstm_config, dense_config,
                        time_steps)

    # Keep only the weights with the lowest validation loss.
    checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min')
    # NOTE(review): the test split doubles as validation data here.
    history = model.fit(train_X, train_y, epochs=epochs, batch_size=batch_size,
                        validation_data=(test_X, test_y), verbose=1,
                        callbacks=[checkpoint], shuffle=False)

    # Draw the loss curves. The legend call is what makes the 'train'/'test'
    # labels visible; without it the labels are never rendered.
    plt.figure(0)
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')
    plt.legend()

    # Bring the true and predicted targets back to the original scale.
    pred_y = model.predict(test_X)
    test_y = data.inverse_to_original_data(
        train_y.reshape(1, -1), test_y.reshape(1, -1),
        scaler=y_scaler, train_num=split_idx)
    pred_y = data.inverse_to_original_data(
        train_y.reshape(1, -1), pred_y.reshape(1, -1),
        scaler=y_scaler, train_num=split_idx)

    return test_y, pred_y