def process_data(df_raw_data, time_steps, train_num):
    """
    Process raw data into flat train/test arrays.
    :param df_raw_data: raw DataFrame with sequence and date features
    :param time_steps: dict mapping each sequence feature to its number of time steps
    :param train_num: number of samples to use for training
    :return: X_train, X_test, y_train, y_test, y_scaler
    """
    # optional preprocessing (disabled):
    # df_raw_data = df_raw_data[df_raw_data['PM25'].astype(float) < 100]
    # df_raw_data = drop_outlier(df_raw_data, ['PM25'], 6)
    # draw.draw_time_series(df_raw_data, ['PM25'])
    df_raw_data = group_by_diff_time_span(df_raw_data, 'hour')
    max_time_step = max(time_steps.values())

    # pop the date features
    df_date = df_raw_data.pop('Month')
    if 'Day' in df_raw_data.columns:
        df_date = pd.concat([df_date, df_raw_data.pop('Day')], axis=1)
    if 'Hour' in df_raw_data.columns:
        df_date = pd.concat([df_date, df_raw_data.pop('Hour')], axis=1)
    df_date = df_date.loc[max_time_step:]

    # process the sequence features
    df_raw_data = data.process_sequence_features(df_raw_data, 0, time_steps,
                                                 max_time_step, padding_value=-1)
    df_raw_data = df_raw_data.loc[max_time_step:]

    # encode the date features
    df_date_encoded = data.encoding_features(df_date, ['Month', 'Hour', 'Day'])

    # normalization
    y_scaled, y_scaler = data.min_max_scale(np.array(df_raw_data.pop('PM25')).reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(df_raw_data)
    date_encoded = np.array(df_date_encoded)

    # split into training and test samples
    X_train = np.append(X_scaled[:train_num, :], date_encoded[:train_num, :], axis=1)
    X_test = np.append(X_scaled[train_num:, :], date_encoded[train_num:, :], axis=1)
    y_train = np.array(y_scaled[:train_num]).reshape(1, -1)[0]
    y_test = np.array(y_scaled[train_num:]).reshape(1, -1)[0]
    return X_train, X_test, y_train, y_test, y_scaler
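# Usage sketch for process_data: the feature names, step counts, and file path
# below are illustrative assumptions, not values taken from this project.
def example_process_data_usage():
    """Hedged sketch. `time_steps` maps each sequence feature to its own
    look-back length; train_num=365 * 24 assumes a year of hourly records."""
    time_steps = {'PM25': 24, 'Press': 8, 'Temp': 8}  # hypothetical per-feature steps
    df_raw = pd.read_csv('../DataSet/Processed/Train/example.csv', dtype=str)  # hypothetical path
    X_train, X_test, y_train, y_test, y_scaler = process_data(
        df_raw, time_steps, train_num=365 * 24)
    # X_* are 2-D arrays (scaled sequence features + encoded date features);
    # y_* are 1-D scaled targets, so apply y_scaler.inverse_transform to predictions.
    return X_train, X_test, y_train, y_test, y_scaler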
def load_data(path, time_steps, lstm_num, cols=None, dtype=str):
    """
    Load data and shape it for a multi-input LSTM model.
    :param path: data file path
    :param time_steps: number of time steps per sequence feature
    :param lstm_num: number of LSTM branches the sequence features are split across
    :param cols: which feature columns to read
    :param dtype: dtype passed to pd.read_csv
    :return: X, X_scaler, y, y_scaler
    """
    df_raw = pd.read_csv(path, usecols=cols, dtype=dtype)

    # pop the date features
    df_date = df_raw.pop('Month')
    df_date = pd.concat([df_date, df_raw.pop('Day')], axis=1)
    df_date = pd.concat([df_date, df_raw.pop('Hour')], axis=1)
    df_date = df_date.loc[time_steps:]

    # process the sequence features
    df_raw = data.process_sequence_features(df_raw, time_steps=time_steps)

    # encode the date features
    df_date_encode = data.encoding_features(df_date, ['Month', 'Hour', 'Day'])

    # normalization
    y_scaled, y_scaler = data.min_max_scale(np.array(df_raw.pop('PM25')).reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(df_raw)
    date_encode = np.array(df_date_encode)

    # reshape y to (samples, 1, features)
    y = y_scaled.reshape((y_scaled.shape[0], 1, y_scaled.shape[1]))

    # reshape X to (samples, 1, features)
    X_scaled = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))
    date_encode = date_encode.reshape((date_encode.shape[0], 1, date_encode.shape[1]))

    X = []
    # split so that each time-series feature (PM2.5, Press, etc.) feeds its own LSTM model
    split_size = int(X_scaled.shape[2] / lstm_num)
    for i in range(lstm_num):
        X.append(X_scaled[:, :, i * split_size:(i + 1) * split_size])
    # date/time features
    X.append(date_encode)
    return X, X_scaler, y, y_scaler
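# Usage sketch: shows how the list returned by load_data feeds a multi-branch
# model. The path, column list, and lstm_num below are illustrative assumptions.
def example_load_data_usage():
    """Hedged sketch. Each X[i] has shape (samples, 1, split_size) and is meant
    for one LSTM branch; the final element X[-1] holds the encoded date features."""
    X, X_scaler, y, y_scaler = load_data(
        '../DataSet/Processed/Train/example.csv',  # hypothetical path
        time_steps=3, lstm_num=2,
        cols=['PM25', 'Press', 'Month', 'Day', 'Hour'])
    for i, branch in enumerate(X):
        # lstm_num sequence inputs followed by one date/time input
        print('input %d shape: %s' % (i, branch.shape))
    return X, y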
def process_data(df_raw, time_steps, train_num):
    """
    Process raw data into the reshaped train/test inputs of a two-input model.
    :param df_raw: raw DataFrame with sequence and date features
    :param time_steps: dict mapping each sequence feature to its number of time steps
    :param train_num: number of samples to use for training
    :return: train_X, test_X, train_y, test_y, y_scaler
    """
    max_time_step = max(time_steps.values())

    # pop the date features
    df_date = df_raw.pop('Month')
    df_date = pd.concat([df_date, df_raw.pop('Day')], axis=1)
    df_date = pd.concat([df_date, df_raw.pop('Hour')], axis=1)
    df_date = df_date.loc[max_time_step:]

    # process the sequence features
    df_raw = data.process_sequence_features(df_raw, time_steps=time_steps)
    df_raw = df_raw.loc[max_time_step:]

    # encode the date features
    df_date_encode = data.encoding_features(df_date, ['Month', 'Hour', 'Day'])

    # normalization
    y_scaled, y_scaler = data.min_max_scale(np.array(df_raw.pop('PM25')).reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(df_raw)
    date_encode = np.array(df_date_encode)

    # reshape y to (samples, 1, features)
    train_y = y_scaled[:train_num]
    test_y = y_scaled[train_num:]
    train_y = train_y.reshape((train_y.shape[0], 1, train_y.shape[1]))
    test_y = test_y.reshape((test_y.shape[0], 1, test_y.shape[1]))

    # reshape X to (samples, 1, features)
    X_scaled = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))
    date_encode = date_encode.reshape((date_encode.shape[0], 1, date_encode.shape[1]))

    train_X = []
    test_X = []
    # use the time-series features (PM2.5, Press, etc.) together as the input of a single LSTM model
    train_X.append(X_scaled[:train_num, :, :])
    test_X.append(X_scaled[train_num:, :, :])
    # date/time features
    train_X.append(date_encode[:train_num, :, :])
    test_X.append(date_encode[train_num:, :, :])
    return train_X, test_X, train_y, test_y, y_scaler
def train(df_raw, time_steps=1, train_num=365 * 24):
    # process the sequence features
    df_raw = data.process_sequence_features(df_raw, time_steps=time_steps)

    # normalization
    y_scaled, y_scaler = data.min_max_scale(np.array(df_raw.pop('PM25')).reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(df_raw)

    # split the data into train and test sets
    train_X, train_y, test_X, test_y = data.split_data(X_scaled, y_scaled, train_num=train_num)

    # reshape the data to (samples, time steps, features)
    train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
    train_y = train_y.reshape((train_y.shape[0], 1, train_y.shape[1]))
    test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
    test_y = test_y.reshape((test_y.shape[0], 1, test_y.shape[1]))

    # build a Sequential model: four stacked LSTM layers followed by two wide Dense layers
    model = Sequential()
    model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2]),
                   return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(100, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(100, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(50, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(Dense(units=1024, activation='linear'))
    # model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(units=1024, activation='linear'))
    # model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(units=1))
    model.compile(loss='mse', optimizer='RMSprop')
    history = model.fit(train_X, train_y, epochs=100, batch_size=1024,
                        validation_data=(test_X, test_y), verbose=2, shuffle=False)

    # draw the loss curves
    plt.figure(1)
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')

    # compare the predictions with the original data and print the evaluation metrics
    pred_y = model.predict(test_X)
    test_y = data.inverse_to_original_data(train_y.reshape(1, -1), test_y.reshape(1, -1),
                                           scaler=y_scaler, train_num=train_num)
    pred_y = data.inverse_to_original_data(train_y.reshape(1, -1), pred_y.reshape(1, -1),
                                           scaler=y_scaler, train_num=train_num)
    evaluate.print_metrics(test_y, pred_y)
    evaluate.print_curve(test_y, pred_y)
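# Usage sketch: wires train() to a CSV via data.get_raw_data, the loader used
# elsewhere in this project. The path and column names are illustrative assumptions.
def example_train_usage():
    df_raw = data.get_raw_data(
        '../DataSet/Processed/Train/example.csv',  # hypothetical path
        ['PM25', 'Press', 'Temp', 'Humi'],         # hypothetical feature columns
        dtype=float)
    # one year of hourly records for training, the remainder for validation
    train(df_raw, time_steps=3, train_num=365 * 24)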
def main(is_train=True):
    data_path = '../DataSet/Processed/Train/261630033_2016_2017_v1.csv'
    model_path = '../Models/Test/model_epochs10_batch24.best.json'
    weight_path = '../Models/Test/weights_epochs10_batch24.best.hdf5'

    df_raw = data.get_raw_data(data_path, ['PM25'], dtype=float)
    seq_data = np.array(df_raw).reshape(1, -1)[0]
    test_split = 0.4
    time_steps = 4

    # build sliding windows: each sample holds `time_steps` inputs plus the next value as the label
    new_data = []
    for i in range(len(df_raw) - time_steps):
        new_data.append(list(seq_data[i:i + time_steps + 1]))
    new_data = np.array(new_data)

    train_num = int(len(new_data) * (1 - test_split))
    y_scaled, y_scaler = data.min_max_scale(new_data[:, -1].reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(new_data[:, 0:time_steps])
    y_train = y_scaled[:train_num, :].reshape(1, -1)[0]
    y_test = y_scaled[train_num:, :].reshape(1, -1)[0]
    X_train = X_scaled[:train_num, :]
    X_test = X_scaled[train_num:, :]
    X_train = X_train.reshape(X_train.shape[0], time_steps, 1)
    X_test = X_test.reshape(X_test.shape[0], time_steps, 1)

    if is_train:
        if os.path.exists(model_path):
            json_string = open(model_path).read()
            model = model_from_json(json_string)
            # load existing weights if available
            if os.path.exists(weight_path):
                print('load weights ' + weight_path)
                model.load_weights(weight_path)
        else:
            model = Sequential()
            model.add(LSTM(32, input_shape=(X_train.shape[1], X_train.shape[2]),
                           return_sequences=True))
            model.add(LSTM(32, return_sequences=False))
            model.add(Dense(units=64, activation='linear'))
            model.add(Dense(units=1))
            open(model_path, 'w').write(model.to_json())
        model.compile(loss='mse', optimizer='RMSprop')
        checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1,
                                     save_best_only=True, mode='min')
        callbacks_list = [checkpoint]
        history = model.fit(X_train, y_train, epochs=20, batch_size=24,
                            validation_data=(X_test, y_test), verbose=1,
                            callbacks=callbacks_list, shuffle=False)
        evaluate.draw_loss_curve(figure_num='PM2.5',
                                 train_loss=history.history['loss'],
                                 val_loss=history.history['val_loss'])
    else:
        json_string = open(model_path).read()
        model = model_from_json(json_string)
        model.load_weights(weight_path)

    y_pred = model.predict(X_test)
    y_true = y_scaler.inverse_transform(y_test.reshape(-1, 1))
    y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1))
    # evaluate with a one-step shift between the true and predicted series
    df_all_metrics = evaluate.all_metrics(y_true[:len(y_true) - 1], y_pred[1:])
    evaluate.draw_fitting_curve(y_true[:len(y_true) - 1], y_pred[1:])
def train(df_raw, model_path, weight_path, lstm_config, dense_config,
          epochs=100, batch_size=100, time_steps=1, test_split=0.3):
    # pop the date features
    df_date = df_raw.pop('Month')
    df_date = pd.concat([df_date, df_raw.pop('Day')], axis=1)
    df_date = pd.concat([df_date, df_raw.pop('Hour')], axis=1)
    df_date = df_date.loc[time_steps:]

    # process the sequence features
    df_raw = data.process_sequence_features(df_raw, time_steps=time_steps)

    # encode the date features
    df_date_encode = data.encoding_features(df_date, ['Month', 'Hour', 'Day'])

    # normalization
    y_scaled, y_scaler = data.min_max_scale(np.array(df_raw.pop('PM25')).reshape(-1, 1))
    X_scaled, X_scaler = data.min_max_scale(df_raw)
    date_encode = np.array(df_date_encode)

    # number of training samples
    train_num = int(len(X_scaled) * (1 - test_split))

    # reshape y to (samples, 1, features)
    train_y = y_scaled[:train_num]
    test_y = y_scaled[train_num:]
    train_y = train_y.reshape((train_y.shape[0], 1, train_y.shape[1]))
    test_y = test_y.reshape((test_y.shape[0], 1, test_y.shape[1]))

    # reshape X to (samples, 1, features)
    X_scaled = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))
    date_encode = date_encode.reshape((date_encode.shape[0], 1, date_encode.shape[1]))

    train_X = []
    test_X = []
    # split so that each time-series feature (PM2.5, Press, etc.) feeds its own LSTM model
    for i in range(lstm_config['num']):
        train_X.append(X_scaled[:train_num, :, i * time_steps:(i + 1) * time_steps])
        test_X.append(X_scaled[train_num:, :, i * time_steps:(i + 1) * time_steps])
    # date/time features
    train_X.append(date_encode[:train_num, :, :])
    test_X.append(date_encode[train_num:, :, :])

    # build model
    model = build_model(model_path, weight_path, lstm_config, dense_config, time_steps)

    # checkpoint that keeps the weights with the lowest validation loss
    checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    history = model.fit(train_X, train_y, epochs=epochs, batch_size=batch_size,
                        validation_data=(test_X, test_y), verbose=1,
                        callbacks=callbacks_list, shuffle=False)

    # draw the loss curves
    plt.figure(0)
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')

    # invert the scaling so both series are in the original units, then return them
    pred_y = model.predict(test_X)
    test_y = data.inverse_to_original_data(train_y.reshape(1, -1), test_y.reshape(1, -1),
                                           scaler=y_scaler, train_num=train_num)
    pred_y = data.inverse_to_original_data(train_y.reshape(1, -1), pred_y.reshape(1, -1),
                                           scaler=y_scaler, train_num=train_num)
    return test_y, pred_y
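# Usage sketch for the multi-branch trainer. Only the 'num' key of lstm_config
# is confirmed by the code above; every other key, value, and column name here
# is an assumption that depends on how build_model is implemented.
def example_multi_lstm_train_usage():
    lstm_config = {'num': 6, 'units': 50}  # 'num' = number of LSTM branches; 'units' hypothetical
    dense_config = {'units': 1024}         # hypothetical
    df_raw = data.get_raw_data(
        '../DataSet/Processed/Train/example.csv',           # hypothetical path
        ['PM25', 'Press', 'Temp', 'Month', 'Day', 'Hour'],  # hypothetical columns
        dtype=float)
    test_y, pred_y = train(df_raw,
                           model_path='../Models/Test/model.best.json',
                           weight_path='../Models/Test/weights.best.hdf5',
                           lstm_config=lstm_config,
                           dense_config=dense_config,
                           epochs=100, batch_size=100,
                           time_steps=3, test_split=0.3)
    evaluate.print_metrics(test_y, pred_y)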