data_test = np.array(data_test) Y_test = np.array(Y_test, dtype=np.float) # -- # X_train = data_train # X_test = data_test X_train = dict() X_test = dict() # -- print('Constructing time series data set ..') # for rnn X_train['rnn'] = construct_time_steps(data_train[:-1], time_steps) X_test['rnn'] = construct_time_steps(data_test[:-1], time_steps) X_train['concatenate'] = concatenate_time_steps(data_train[:-1], time_steps) Y_train = Y_train[time_steps:] X_test['concatenate'] = concatenate_time_steps(data_test[:-1], time_steps) Y_test = Y_test[time_steps:] # -- Y_real = np.copy(Y_test) Y_train = higher(Y_train, interval_hours) Y_test = higher(Y_test, interval_hours) # Y_train = highest(Y_train, degree=degree) # Y_test = highest(Y_test, degree=degree)
# Exclusive upper bounds of PM2.5 levels 1..9; values at or above the last
# bound belong to level 10.
_PM25_LEVEL_BOUNDS = (11.5, 23.5, 35.5, 41.5, 47.5, 53.5, 58.5, 64.5, 70.5)


def _split_pair(text, sep):
    """Split ``text`` at the first ``sep`` into a two-item list.

    Raises ValueError when ``sep`` is absent (behaviour of ``str.index``).
    """
    cut = text.index(sep)
    return [text[:cut], text[cut + 1:]]


def _parse_vector(text):
    """Parse the ``str()`` form of a 1-D numpy array back into a float array."""
    tokens = text.replace('[', '').replace(']', '').replace('\n', '').split(' ')
    return np.array([tok for tok in tokens if tok], dtype=float)


def _dump_pickle(path, obj):
    """Pickle ``obj`` to ``path``; the file is closed even if dumping fails."""
    with open(path, 'wb') as fw:
        cPickle.dump(obj, fw)


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    NOTE(review): unpickling executes arbitrary code; only load files that
    this module wrote itself.
    """
    with open(path, 'rb') as fr:
        return cPickle.load(fr)


def _expand_wind_direction(samples, pollution_kind, site_count, mean_X, std_X):
    """Replace every site's normalised WIND_DIREC column by two columns.

    For each site the wind-direction value is de-normalised with the training
    mean/std of its original column and passed to ``data_coordinate_angle``;
    the two returned components are inserted in its place. Rows are processed
    independently, so any number of rows is handled.
    """
    wind_col = pollution_kind.index('WIND_DIREC')
    kinds_per_site = len(pollution_kind)
    rows = np.asarray(samples).tolist()
    for row in rows:
        for site in range(site_count):
            col = wind_col + site * kinds_per_site
            # Each earlier site already grew the row by one element, hence the
            # "+ site" offset when popping. The pair is inserted at ``col``
            # (not ``col + site``); saved models were trained on this layout,
            # so it must not be changed.
            raw_angle = row.pop(col + site) * std_X[col] + mean_X[col]
            coordin = data_coordinate_angle(raw_angle)
            row.insert(col, coordin[1])
            row.insert(col, coordin[0])
    return np.array(rows)


def _forward_window_max(X, Y, interval_hours):
    """Replace ``Y[i]`` by the maximum of ``Y[i : i + interval_hours]``.

    ``Y`` is modified in place. Samples without a complete window are cut off
    from both ``X`` and ``Y``. A NaN at the start of a window stays NaN; NaNs
    later in the window are ignored, because every comparison with NaN is
    false. If ``interval_hours <= 1`` or ``Y`` is shorter than one window, the
    inputs are returned untouched (original behaviour, kept for compatibility).
    """
    if interval_hours <= 1:
        return X, Y
    usable = len(Y) - interval_hours + 1
    if usable <= 0:
        return X, Y
    for i in range(usable):
        peak = Y[i]
        for j in range(1, interval_hours):
            if Y[i + j] > peak:
                peak = Y[i + j]
        Y[i] = peak
    return X[:usable], Y[:usable]


def _pm25_level(value):
    """Map a PM2.5 value to its level 1..10.

    Negative values are reported on stdout and mapped to level 1.
    """
    if value >= 0:
        return int(np.searchsorted(_PM25_LEVEL_BOUNDS, value, side='right')) + 1
    print('error value: %d' % value)
    return 1


def _confusion_counts(observed, predicted):
    """Return (a, b, c, d) counts for two boolean sequences.

    a: observed and predicted; b: predicted only; c: observed only;
    d: neither.
    """
    observed = np.asarray(observed, dtype=bool)
    predicted = np.asarray(predicted, dtype=bool)
    return (int(np.sum(observed & predicted)),
            int(np.sum(~observed & predicted)),
            int(np.sum(observed & ~predicted)),
            int(np.sum(~observed & ~predicted)))


def ensemble_model(target_kind, local, city, target_site, training_year,
                   testing_year, training_duration, testing_duration,
                   interval_hours, data, is_training):
    """Train or apply the stacked XGBoost + LSTM forecaster for one site.

    In training mode the data are read with ``read_data_sets``, normalised,
    turned into time-step windows, and used to fit an XGBoost regressor, an
    LSTM network, and on top of both a stacked XGBoost regressor plus an
    XGBoost alert classifier. Normalisation parameters and models are written
    below ``root_path + "model/<local>/<city>/<interval>h/<target_kind>/"``,
    and evaluation statistics for the testing period are printed.

    In prediction mode the stored parameters and models are loaded and applied
    to ``data``.

    Args:
        target_kind: Pollutant to forecast; only ``'PM2.5'`` is supported.
        local, city: Keys into ``pollution_site_map``; also path components
            of the model folder.
        target_site: Site whose value is forecast.
        training_year, testing_year: Strings of the form ``'2014-2015'``.
        training_duration, testing_duration: ``'<begin>-<end>'`` strings; the
            text before ``'/'`` of the training bounds is used in the model
            file names.
        interval_hours: Size of the forward max window (int-convertible).
        data: Raw feature rows, used only when ``is_training`` is false.
        is_training: Train and evaluate when true, otherwise load and predict.

    Returns:
        The de-normalised predictions of the stacked regressor.

    Raises:
        ValueError: If ``target_kind`` is unsupported or a range string lacks
            its separator.
    """
    print('is_training(%s) = %s' % (target_site, is_training))
    # e.g. the Taipei stations Zhongshan, Guting, Shilin, Songshan, Wanhua
    site_list = pollution_site_map[local][city]

    # change format from 2014-2015 to ['2014', '2015']
    training_year = _split_pair(training_year, '-')
    testing_year = _split_pair(testing_year, '-')
    training_duration = _split_pair(training_duration, '-')
    testing_duration = _split_pair(testing_duration, '-')
    interval_hours = int(interval_hours)

    # clear redundancy work
    if training_year[0] == training_year[1]:
        training_year.pop(1)
    if testing_year[0] == testing_year[1]:
        testing_year.pop(1)
    else:
        input(
            'The range of testing year should not more than one year or crossing the bound of years.'
        )

    # fill in the years lying between the first and last training year
    first_year = int(training_year[0])
    for offset in range(int(training_year[-1]) - first_year):
        year = str(offset + first_year)
        if year not in training_year:
            training_year.insert(offset, year)

    # Training Parameters
    # WIND_DIREC is a special feature that needs extra processing; it can only
    # be an element of the input vector for now.
    if target_kind == 'PM2.5':
        pollution_kind = [
            'PM2.5', 'O3', 'AMB_TEMP', 'RH', 'WIND_SPEED', 'WIND_DIREC'
        ]
    else:
        # Previously this surfaced later as an obscure UnboundLocalError.
        raise ValueError('unsupported target_kind: %r' % (target_kind,))
    data_update = False
    seed = 0

    # Network Parameters
    has_wind = 'WIND_DIREC' in pollution_kind
    input_size = len(site_list) * len(pollution_kind)
    if has_wind:
        # every wind-direction column is expanded into two columns
        input_size += len(site_list)
    time_steps = 12
    hidden_size = 20
    output_size = 1

    folder = root_path + "model/%s/%s/%sh/%s/" % (local, city, interval_hours,
                                                  target_kind)
    training_begining = _split_pair(training_duration[0], '/')[0]
    training_deadline = _split_pair(training_duration[-1], '/')[0]
    parameter_path = folder + "%s_parameter.pickle" % target_site
    name_fields = (target_site, training_year[0], training_begining,
                   training_year[-1], training_deadline, interval_hours)

    print('site: %s' % target_site)
    print('Training for %s/%s to %s/%s' %
          (training_year[0], training_duration[0], training_year[-1],
           training_duration[-1]))
    print('Testing for %s/%s to %s/%s' %
          (testing_year[0], testing_duration[0], testing_year[-1],
           testing_duration[-1]))

    if is_training:
        # reading data
        print('Reading data .. ')
        start_time = time.time()
        print('preparing training set ..')
        X_train = read_data_sets(sites=site_list + [target_site],
                                 date_range=np.atleast_1d(training_year),
                                 beginning=training_duration[0],
                                 finish=training_duration[-1],
                                 feature_selection=pollution_kind,
                                 update=data_update)
        X_train = np.array(missing_check(X_train))
        # the trailing len(pollution_kind) columns belong to target_site
        Y_train = X_train[:, -len(pollution_kind):]
        Y_train = Y_train[:, pollution_kind.index(target_kind)]
        X_train = X_train[:, :-len(pollution_kind)]

        print('preparing testing set ..')
        X_test = np.array(
            read_data_sets(sites=site_list + [target_site],
                           date_range=np.atleast_1d(testing_year),
                           beginning=testing_duration[0],
                           finish=testing_duration[-1],
                           feature_selection=pollution_kind,
                           update=data_update))
        # test labels are taken before missing_check so that missing targets
        # can be dropped further below
        Y_test = X_test[:, -len(pollution_kind):]
        Y_test = Y_test[:, pollution_kind.index(target_kind)]
        X_test = missing_check(X_test[:, :-len(pollution_kind)])

        final_time = time.time()
        print('Reading data .. ok, ', end='')
        time_spent_printer(start_time, final_time)

        print(len(X_train), 'train sequences')
        print(len(X_test), 'test sequences')

        if (len(X_train) < time_steps) or (len(X_test) < time_steps):
            input('time_steps(%d) too long.' % time_steps)

        # normalize
        print('Normalize ..')
        mean_X_train = np.mean(X_train, axis=0)
        std_X_train = np.std(X_train, axis=0)
        if 0 in std_X_train:
            input("Denominator can't be 0.")
        X_train = (X_train - mean_X_train) / std_X_train
        X_test = (np.asarray(X_test) - mean_X_train) / std_X_train

        mean_y_train = np.mean(Y_train)
        std_y_train = np.std(Y_train)
        if not std_y_train:
            input("Denominator can't be 0.")
        Y_train = (Y_train - mean_y_train) / std_y_train
        print('mean_y_train: %f std_y_train: %f' %
              (mean_y_train, std_y_train))

        # stored as one comma-separated string; the prediction branch parses it
        _dump_pickle(
            parameter_path,
            str(mean_X_train) + ',' + str(std_X_train) + ',' +
            str(mean_y_train) + ',' + str(std_y_train))

        # feature process
        if has_wind:
            X_train = _expand_wind_direction(X_train, pollution_kind,
                                             len(site_list), mean_X_train,
                                             std_X_train)
            X_test = _expand_wind_direction(X_test, pollution_kind,
                                            len(site_list), mean_X_train,
                                            std_X_train)
        # np.float was removed in NumPy 1.24; the builtin is equivalent
        Y_test = np.array(Y_test, dtype=float)

        # --
        print('Constructing time series data set ..')
        # for rnn
        X_rnn_train = construct_time_steps(X_train[:-1], time_steps)
        X_rnn_test = construct_time_steps(X_test[:-1], time_steps)

        X_train = concatenate_time_steps(X_train[:-1], time_steps)
        Y_train = Y_train[time_steps:]

        X_test = concatenate_time_steps(X_test[:-1], time_steps)
        Y_test = Y_test[time_steps:]

        X_train, Y_train = _forward_window_max(X_train, Y_train,
                                               interval_hours)
        X_test, Y_test = _forward_window_max(X_test, Y_test, interval_hours)
        X_rnn_train = X_rnn_train[:len(X_train)]
        X_rnn_test = X_rnn_test[:len(X_test)]

        # delete samples whose target is missing: the comparison is false for
        # NaN, so "not (y > -10000)" flags missing values
        missing = np.flatnonzero(~(Y_test > -10000))
        Y_test = np.delete(Y_test, missing, 0)
        X_test = np.delete(X_test, missing, 0)
        X_rnn_test = np.delete(X_rnn_test, missing, 0)
        Y_test = np.array(Y_test, dtype=float)

        # --
        X_rnn_train = np.array(X_rnn_train)
        X_rnn_test = np.array(X_rnn_test)
        X_train = np.array(X_train)
        Y_train = np.array(Y_train)
        X_test = np.array(X_test)

        # re-seeding before each shuffle is meant to apply the same
        # permutation to all three equally long arrays
        np.random.seed(seed)
        np.random.shuffle(X_train)
        np.random.seed(seed)
        np.random.shuffle(Y_train)
        np.random.seed(seed)
        np.random.shuffle(X_rnn_train)

    else:
        # mean and std written by a previous training run
        stored = _load_pickle(parameter_path)
        [mean_X_train, std_X_train, mean_y_train,
         std_y_train] = stored.split(',')
        mean_X_train = _parse_vector(mean_X_train)
        std_X_train = _parse_vector(std_X_train)
        mean_y_train = float(mean_y_train)
        std_y_train = float(std_y_train)

        # reading data
        print('preparing testing set ..')
        X_test = missing_check(np.array(data))

        # normalize
        print('Normalize ..')
        if 0 in std_X_train:
            input("Denominator can't be 0.")
        X_test = (np.asarray(X_test) - mean_X_train) / std_X_train

        # feature process
        if has_wind:
            X_test = _expand_wind_direction(X_test, pollution_kind,
                                            len(site_list), mean_X_train,
                                            std_X_train)

        # --
        print('Constructing time series data set ..')
        X_rnn_test = construct_time_steps(X_test, time_steps)
        X_test = concatenate_time_steps(X_test, time_steps)

        # --
        X_rnn_test = np.array(X_rnn_test)
        X_test = np.array(X_test)

    # -- xgboost --
    print('- xgboost -')
    filename = ("xgboost_%s_training_%s_m%s_to_%s_m%s_interval_%s" %
                name_fields)
    print(filename)

    if is_training:
        xgb_model = xgb.XGBRegressor().fit(X_train, Y_train)
        _dump_pickle(folder + filename, xgb_model)
    else:
        xgb_model = _load_pickle(folder + filename)

    xgb_pred = xgb_model.predict(X_test)

    # -- rnn --
    print('- rnn -')
    filename = ("sa_DropoutLSTM_%s_training_%s_m%s_to_%s_m%s_interval_%s" %
                name_fields)
    print(filename)

    print("Expected args: p_W, p_U, p_dense, p_emb, weight_decay, batch_size")
    print("Using default args:")
    param = ["", "0.5", "0.5", "0.5", "0.5", "1e-6", "128"]
    args = [float(a) for a in param[1:]]
    print(args)
    p_W, p_U, p_dense, p_emb, weight_decay, batch_size = args
    batch_size = int(batch_size)

    # --
    print('Build rnn model...')
    start_time = time.time()
    rnn_model = Sequential()

    # layer 1
    rnn_model.add(
        BatchNormalization(epsilon=0.001,
                           mode=0,
                           axis=-1,
                           momentum=0.99,
                           weights=None,
                           beta_init='zero',
                           gamma_init='one',
                           gamma_regularizer=None,
                           beta_regularizer=None,
                           input_shape=(time_steps, input_size)))
    rnn_model.add(
        LSTM(hidden_size,
             W_regularizer=l2(weight_decay),
             U_regularizer=l2(weight_decay),
             b_regularizer=l2(weight_decay),
             dropout_W=p_W,
             dropout_U=p_U))
    rnn_model.add(Dropout(p_dense))

    # output layer
    rnn_model.add(
        BatchNormalization(epsilon=0.001,
                           mode=0,
                           axis=-1,
                           momentum=0.99,
                           weights=None,
                           beta_init='zero',
                           gamma_init='one',
                           gamma_regularizer=None,
                           beta_regularizer=None))
    rnn_model.add(
        Dense(output_size,
              W_regularizer=l2(weight_decay),
              b_regularizer=l2(weight_decay)))

    optimiser = 'adam'
    rnn_model.compile(loss='mean_squared_error', optimizer=optimiser)

    final_time = time.time()
    time_spent_printer(start_time, final_time)

    if is_training:
        print("Train...")
        start_time = time.time()
        rnn_model.fit(X_rnn_train, Y_train, batch_size=batch_size, epochs=50)

        # Potentially save weights
        rnn_model.save_weights(folder + filename, overwrite=True)

        final_time = time.time()
        time_spent_printer(start_time, final_time)
    else:
        print('loading model ..')
        rnn_model.load_weights(folder + filename)

    rnn_pred = rnn_model.predict(X_rnn_test, batch_size=500, verbose=1)
    final_time = time.time()
    time_spent_printer(start_time, final_time)

    # -- ensemble --
    print('stacking ..')
    if is_training:
        xgb_output = xgb_model.predict(X_train).reshape(len(X_train), 1)
        rnn_output = rnn_model.predict(X_rnn_train, batch_size=500, verbose=1)
        ensemble_X_train = np.hstack((X_train, xgb_output, rnn_output))

        # alert labels on the de-normalised target: [1, 0] = [high, low]
        Y_alert_train = [
            1 if (y * std_y_train + mean_y_train) > high_alert else 0
            for y in Y_train
        ]

    xgb_pred = xgb_pred.reshape(len(X_test), 1)
    rnn_pred = rnn_pred.reshape(len(X_test), 1)
    ensemble_X_test = np.hstack((X_test, xgb_pred, rnn_pred))

    print('- ensemble -')
    filename = ("ensemble_%s_training_%s_m%s_to_%s_m%s_interval_%s" %
                name_fields)
    filename2 = ("classification_%s_training_%s_m%s_to_%s_m%s_interval_%s" %
                 name_fields)

    # local names deliberately differ from the function name to avoid
    # shadowing ``ensemble_model`` inside its own body
    if is_training:
        stacked_regressor = xgb.XGBRegressor().fit(ensemble_X_train, Y_train)
        classification_model = xgb.XGBClassifier().fit(ensemble_X_train,
                                                       Y_alert_train)
        _dump_pickle(folder + filename, stacked_regressor)
        _dump_pickle(folder + filename2, classification_model)
    else:
        stacked_regressor = _load_pickle(folder + filename)
        classification_model = _load_pickle(folder + filename2)

    pred = stacked_regressor.predict(ensemble_X_test)
    alert_pred = classification_model.predict(ensemble_X_test)

    # --
    predictions = mean_y_train + std_y_train * pred

    # Evaluation needs the observed targets, which exist only in training mode.
    if is_training:
        print('rmse: %.5f' % (np.mean((Y_test - predictions)**2, 0)**0.5))

        # accuracy on the ten PM2.5 levels and on the four coarse groups
        # (levels 1-3, 4-6, 7-9, 10)
        level_hits = 0.0
        group_hits = 0.0
        total = 0.0
        for predicted, observed in zip(predictions, Y_test):
            pred_level = _pm25_level(predicted)
            real_level = _pm25_level(observed)
            total += 1
            if real_level == pred_level:
                level_hits += 1
            if (real_level - 1) // 3 == (pred_level - 1) // 3:
                group_hits += 1

        print('Ten level accuracy: %.5f' % (level_hits / total))
        print('Four level accuracy: %.5f' % (group_hits / total))
        print('--')

        # confusion counts: (obs high & pred high, obs low & pred high,
        #                    obs high & pred low,  obs low & pred low)
        high_counts = _confusion_counts(Y_test >= high_alert,
                                        predictions >= high_alert)
        low_counts = _confusion_counts(Y_test >= low_alert,
                                       predictions >= low_alert)
        alert_counts = _confusion_counts(Y_test >= high_alert, alert_pred)

        print('high label: (%d, %d, %d, %d)' % high_counts)
        print('low label: (%d, %d, %d, %d)' % low_counts)
        print('alert: (%d, %d, %d, %d)' % alert_counts)

    return predictions
X_test = np.array(X_test) Y_test = np.array(Y_test, dtype=np.float) # ---------------------------------------------- Data Frame -------------------------------------------------------- print('Constructing time series data set ..', end='') # for cnn X_cnn_train = construct_time_steps(X_cnn_train[:-1], cnn_time_steps) X_cnn_test = construct_time_steps(X_cnn_test[:-1], cnn_time_steps) # for rnn # X_rnn_train = construct_time_steps(X_train[:-1], time_steps) # X_rnn_test = construct_time_steps(X_test[:-1], time_steps) # for others X_train = concatenate_time_steps(X_train[:-1], time_steps) X_test = concatenate_time_steps(X_test[:-1], time_steps) # -- if cnn_time_steps > time_steps: X_train = X_train[cnn_time_steps - time_steps:] # X_rnn_train = X_rnn_train[cnn_time_steps-time_steps:] Y_train = Y_train[cnn_time_steps:] X_test = X_test[cnn_time_steps - time_steps:] # X_rnn_test = X_rnn_test[cnn_time_steps-time_steps:] Y_test = Y_test[cnn_time_steps:] else: X_cnn_train = X_cnn_train[time_steps - cnn_time_steps:] Y_train = Y_train[time_steps:]