def data_collection(local, city, feature_selection, msg):
    """Collect the selected features of every monitoring site in the given city from
    `msg` (a dict keyed by publish time) into a feature matrix, one row per publish time."""
    sites = pollution_site_map[local][city]
    feature_matrix = []
    for publishTime in sorted(msg.keys()):
        feature_vector = []
        for site in sites:
            usite = site.decode('utf8')
            for feature_elem in feature_selection:
                if usite in msg[publishTime]:
                    if feature_elem == 'WIND_SPEED':
                        if 'WindSpeed' in msg[publishTime][usite]:
                            feature_vector.append(msg[publishTime][usite]['WindSpeed'])
                        else:
                            feature_vector.append('NaN')
                    elif feature_elem == 'WIND_DIREC':
                        if 'WindDirec' in msg[publishTime][usite]:
                            feature_vector.append(msg[publishTime][usite]['WindDirec'])
                        else:
                            feature_vector.append('NaN')
                    # ----------------------------
                    # AMB_TEMP and RH are filled with fixed placeholder values here
                    # (a per-region constant for temperature, 50 for relative humidity).
                    elif feature_elem == 'AMB_TEMP':
                        if local == '北部':    # northern region
                            feature_vector.append(18)
                        elif local == '中部':  # central region
                            feature_vector.append(20)
                        elif local == '高屏':  # Kaohsiung-Pingtung region
                            feature_vector.append(22)
                    elif feature_elem == 'RH':
                        feature_vector.append(50)
                    # ----------------------------
                    elif feature_elem == 'PM2.5':
                        if 'PM2_5' in msg[publishTime][usite]:
                            feature_vector.append(msg[publishTime][usite]['PM2_5'])
                        else:
                            feature_vector.append('NaN')
                    else:
                        feature_vector.append(msg[publishTime][usite][feature_elem])
                else:
                    feature_vector.append('NaN')
        feature_matrix.append(feature_vector)
    feature_matrix = missing_check(feature_matrix)
    return feature_matrix
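# `missing_check` is called above but not defined in this section. Below is a minimal
# sketch of what it is assumed to do, based on how it is used here (replace 'NaN'
# placeholders and other non-numeric entries with the last valid value in the same
# column); the real helper in this repository may differ.
def missing_check_sketch(feature_matrix):
    cleaned = []
    previous_row = None
    for row in feature_matrix:
        fixed = []
        for col, value in enumerate(row):
            try:
                fixed.append(float(value))
            except (TypeError, ValueError):
                # Fall back to the last valid value for this column, or 0 if none exists.
                fixed.append(previous_row[col] if previous_row is not None else 0.0)
        cleaned.append(fixed)
        previous_row = fixed
    return cleaned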
# Set start time of data loading.
print('Loading data .. ')
start_time = time.time()
initial_time = time.time()

# Load training data, where: size(X_train) = (data_size, map_l, map_w, map_h), not sequentialized yet.
print('Preparing training dataset ..')
X_train = read_data_map(path=data_path, site=target_site, feature_selection=pollution_kind,
                        date_range=np.atleast_1d(training_year),
                        beginning=training_duration[0], finish=training_duration[-1],
                        update=data_update)
X_train = missing_check(X_train)
Y_train = np.array(X_train)[:, center_i, center_j,
                            [6 + pollution_kind.index(i) for i in target_kind]]

# Load testing data, where: size(X_test) = (data_size, map_l, map_w, map_h), not sequentialized yet.
print('Preparing testing dataset ..')
X_test = read_data_map(path=data_path, site=target_site, feature_selection=pollution_kind,
                       date_range=np.atleast_1d(testing_year),
                       beginning=testing_duration[0], finish=testing_duration[-1],
                       update=data_update)
X_test = missing_check(X_test)
Y_test = np.array(X_test)[:, center_i, center_j,
                          [6 + pollution_kind.index(i) for i in target_kind]]
print('Training for %s/%s to %s/%s' % (training_year[0], training_duration[0],
                                        training_year[-1], training_duration[-1]))
print('Testing for %s/%s to %s/%s' % (testing_year[0], testing_duration[0],
                                      testing_year[-1], testing_duration[-1]))

if is_training:
    print('Training ..')
else:
    print('Testing ..')

# reading data
print('Reading data .. ')
start_time = time.time()

print('preparing training set ..')
X_train = read_data_sets(sites=site_list + [target_site],
                         date_range=np.atleast_1d(training_year),
                         beginning=training_duration[0], finish=training_duration[-1],
                         feature_selection=pollution_kind, update=data_update)
X_train = missing_check(X_train)
Y_train = np.array(X_train)[:, -len(pollution_kind):]
Y_train = Y_train[:, pollution_kind.index(target_kind)]
X_train = np.array(X_train)[:, :-len(pollution_kind)]

print('preparing testing set ..')
X_test = read_data_sets(sites=site_list + [target_site],
                        date_range=np.atleast_1d(testing_year),
                        beginning=testing_duration[0], finish=testing_duration[-1],
                        feature_selection=pollution_kind, update=data_update)
Y_test = np.array(X_test)[:, -len(pollution_kind):]
Y_test = Y_test[:, pollution_kind.index(target_kind)]
X_test = missing_check(np.array(X_test)[:, :-len(pollution_kind)])

final_time = time.time()
print('Reading data .. ok, ', end='')
time_spent_printer(start_time, final_time)
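# `time_spent_printer` is called above but not shown in this section. A plausible minimal
# implementation, assuming it only reports the elapsed wall-clock time between two
# time.time() stamps; the actual helper may format its output differently.
def time_spent_printer_sketch(start_time, final_time):
    elapsed = final_time - start_time
    minutes, seconds = divmod(elapsed, 60)
    print('time spent: %d min %.2f sec (%.2f sec in total)' % (int(minutes), seconds, elapsed))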
def load_data(self, data_file, site_list, target_site, target_kind, training_year,
              training_duration, pollution_kind, SEQ_LENGTH_1, SEQ_LENGTH_2,
              data_update=False):
    print('Reading data .. ')
    X = read_data_sets(sites=site_list + [target_site],
                       date_range=np.atleast_1d(training_year),
                       beginning=training_duration[0], finish=training_duration[-1],
                       feature_selection=pollution_kind, update=data_update)
    X = missing_check(X)
    Y = np.array(X)[:, -len(pollution_kind):]
    Y = Y[:, pollution_kind.index(target_kind)]

    SeqY = []
    for y in range(len(Y)):
        if (y + (SEQ_LENGTH_2 - 1)) < len(Y):
            Seqy = []
            for time_step in range(SEQ_LENGTH_2):
                Seqy.append(Y[y + time_step])
            SeqY.append(Seqy)
            del Seqy
        else:
            break

    X = np.array(X)[:, :-len(pollution_kind)]

    # feature process
    if 'WIND_DIREC' in pollution_kind:
        index_of_kind = pollution_kind.index('WIND_DIREC')
        length_of_kind_list = len(pollution_kind)
        len_of_sites_list = len(site_list)
        X = X.tolist()
        for i in range(len(X)):
            for j in range(len_of_sites_list):
                specific_index = index_of_kind + j * length_of_kind_list
                coordin = data_coordinate_angle(X[i].pop(specific_index + j))
                X[i].insert(specific_index + j, coordin[1])
                X[i].insert(specific_index + j, coordin[0])
        X = np.array(X)

    X = construct_time_steps(X[:-1], SEQ_LENGTH_1)

    if SEQ_LENGTH_1 < SEQ_LENGTH_2:
        self.X = X[0:len(SeqY)]
    elif SEQ_LENGTH_1 > SEQ_LENGTH_2:
        SeqY = SeqY[:len(X)]

    with open(data_file, 'w') as f:
        for line in SeqY:
            for elem_no in range(len(line)):
                f.write(str(line[elem_no]))
                if elem_no < (len(line) - 1):
                    f.write(' ')
            f.write('\n')
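# `construct_time_steps` (and the related `concatenate_time_steps` used elsewhere in this
# section) are not defined here. The sketch below shows the assumed behaviour: slide a
# window of `time_steps` consecutive hourly feature vectors over the data, either keeping
# each window as a (time_steps, features) block for the RNN or flattening it into one long
# vector for the tree models. The real helpers may handle edges differently.
import numpy as np

def construct_time_steps_sketch(X, time_steps):
    # (n, features) -> (n - time_steps + 1, time_steps, features)
    X = np.asarray(X)
    return np.array([X[i:i + time_steps] for i in range(len(X) - time_steps + 1)])

def concatenate_time_steps_sketch(X, time_steps):
    # (n, features) -> (n - time_steps + 1, time_steps * features)
    windows = construct_time_steps_sketch(X, time_steps)
    return windows.reshape(len(windows), -1)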
def ensemble_model(target_kind, local, city, target_site, training_year, testing_year,
                   training_duration, testing_duration, interval_hours, data, is_training):
    print('is_training(%s) = %s' % (target_site, is_training))
    site_list = pollution_site_map[local][city]  # e.g. ['中山', '古亭', '士林', '松山', '萬華']

    # change format from 2014-2015 to ['2014', '2015']
    training_year = [training_year[:training_year.index('-')],
                     training_year[training_year.index('-') + 1:]]
    testing_year = [testing_year[:testing_year.index('-')],
                    testing_year[testing_year.index('-') + 1:]]
    training_duration = [training_duration[:training_duration.index('-')],
                         training_duration[training_duration.index('-') + 1:]]
    testing_duration = [testing_duration[:testing_duration.index('-')],
                        testing_duration[testing_duration.index('-') + 1:]]
    interval_hours = int(interval_hours)
    # is_training = False

    # clear redundancy work
    if training_year[0] == training_year[1]:
        training_year.pop(1)
    if testing_year[0] == testing_year[1]:
        testing_year.pop(1)
    else:
        input('The testing range must not span more than one year or cross a year boundary.')

    # checking years
    rangeofYear = int(training_year[-1]) - int(training_year[0])
    for i in range(rangeofYear):
        if not (str(i + int(training_year[0])) in training_year):
            training_year.insert(i, str(i + int(training_year[0])))

    # Training Parameters
    # WIND_DIREC is a special feature that needs extra processing; for now it can only be
    # an element of the input vector.
    if target_kind == 'PM2.5':
        pollution_kind = ['PM2.5', 'O3', 'AMB_TEMP', 'RH', 'WIND_SPEED', 'WIND_DIREC']
    # target_kind = 'PM2.5'
    data_update = False
    # batch_size = 24 * 7
    seed = 0

    # Network Parameters
    input_size = (len(site_list) * len(pollution_kind) + len(site_list)) \
        if 'WIND_DIREC' in pollution_kind else (len(site_list) * len(pollution_kind))
    time_steps = 12
    # hidden_size = 20
    output_size = 1

    testing_month = testing_duration[0][:testing_duration[0].index('/')]
    folder = root_path + "model/%s/%s/%sh/%s/" % (local, city, interval_hours, target_kind)
    training_begining = training_duration[0][:training_duration[0].index('/')]
    training_deadline = training_duration[-1][:training_duration[-1].index('/')]

    print('site: %s' % target_site)
    print('Training for %s/%s to %s/%s' % (training_year[0], training_duration[0],
                                           training_year[-1], training_duration[-1]))
    print('Testing for %s/%s to %s/%s' % (testing_year[0], testing_duration[0],
                                          testing_year[-1], testing_duration[-1]))

    # for interval: replace each label with the average over the next interval_hours hours
    def ave(X, Y, interval_hours):
        reserve_hours = interval_hours - 1
        deadline = 0
        for i in range(len(Y)):
            # check whether the reserved data is enough or not
            if (len(Y) - i - 1) < reserve_hours:
                deadline = i
                break  # not enough
            for j in range(reserve_hours):
                Y[i] += Y[i + j + 1]
            Y[i] /= interval_hours
        if deadline:
            X = X[:deadline]
            Y = Y[:deadline]
        return X, Y

    # for interval: replace each label with the mean of the highest values in the window
    def higher(X, Y, interval_hours):
        reserve_hours = 1  # choose the first n biggest values
        if interval_hours > reserve_hours:
            deadline = 0
            for i in range(len(Y)):
                # check whether the reserved data is enough or not
                if (len(Y) - i) < interval_hours:
                    deadline = i
                    break  # not enough
                higher_list = []
                for j in range(interval_hours):
                    if len(higher_list) < reserve_hours:
                        higher_list.append(Y[i + j])
                    elif Y[i + j] > higher_list[0]:
                        higher_list[0] = Y[i + j]
                    higher_list = sorted(higher_list)
                Y[i] = np.array(higher_list).sum() / reserve_hours
            if deadline:
                X = X[:deadline]
                Y = Y[:deadline]
        return X, Y

    if is_training:
        # reading data
        print('Reading data .. ')
        start_time = time.time()

        print('preparing training set ..')
        X_train = read_data_sets(sites=site_list + [target_site],
                                 date_range=np.atleast_1d(training_year),
                                 beginning=training_duration[0], finish=training_duration[-1],
                                 feature_selection=pollution_kind, update=data_update)
        X_train = missing_check(X_train)
        Y_train = np.array(X_train)[:, -len(pollution_kind):]
        Y_train = Y_train[:, pollution_kind.index(target_kind)]
        X_train = np.array(X_train)[:, :-len(pollution_kind)]

        print('preparing testing set ..')
        X_test = read_data_sets(sites=site_list + [target_site],
                                date_range=np.atleast_1d(testing_year),
                                beginning=testing_duration[0], finish=testing_duration[-1],
                                feature_selection=pollution_kind, update=data_update)
        Y_test = np.array(X_test)[:, -len(pollution_kind):]
        Y_test = Y_test[:, pollution_kind.index(target_kind)]
        X_test = missing_check(np.array(X_test)[:, :-len(pollution_kind)])

        final_time = time.time()
        print('Reading data .. ok, ', end='')
        time_spent_printer(start_time, final_time)

        print(len(X_train), 'train sequences')
        print(len(X_test), 'test sequences')
        if (len(X_train) < time_steps) or (len(X_test) < time_steps):
            input('time_steps(%d) too long.' % time_steps)

        # normalize
        print('Normalize ..')
        mean_X_train = np.mean(X_train, axis=0)
        std_X_train = np.std(X_train, axis=0)
        if 0 in std_X_train:
            input("Denominator can't be 0.")
        X_train = np.array([(x_train - mean_X_train) / std_X_train for x_train in X_train])
        X_test = np.array([(x_test - mean_X_train) / std_X_train for x_test in X_test])

        mean_y_train = np.mean(Y_train)
        std_y_train = np.std(Y_train)
        if not std_y_train:
            input("Denominator can't be 0.")
        Y_train = [(y - mean_y_train) / std_y_train for y in Y_train]
        print('mean_y_train: %f std_y_train: %f' % (mean_y_train, std_y_train))

        fw = open(folder + "%s_parameter.pickle" % target_site, 'wb')
        cPickle.dump(str(mean_X_train) + ',' + str(std_X_train) + ',' +
                     str(mean_y_train) + ',' + str(std_y_train), fw)
        fw.close()

        # feature process
        if 'WIND_DIREC' in pollution_kind:
            index_of_kind = pollution_kind.index('WIND_DIREC')
            length_of_kind_list = len(pollution_kind)
            len_of_sites_list = len(site_list)
            X_train = X_train.tolist()
            X_test = X_test.tolist()
            for i in range(len(X_train)):
                for j in range(len_of_sites_list):
                    specific_index = index_of_kind + j * length_of_kind_list
                    coordin = data_coordinate_angle(
                        (X_train[i].pop(specific_index + j)) * std_X_train[specific_index]
                        + mean_X_train[specific_index])
                    X_train[i].insert(specific_index, coordin[1])
                    X_train[i].insert(specific_index, coordin[0])
                    if i < len(X_test):
                        coordin = data_coordinate_angle(
                            (X_test[i].pop(specific_index + j)) * std_X_train[specific_index]
                            + mean_X_train[specific_index])
                        X_test[i].insert(specific_index, coordin[1])
                        X_test[i].insert(specific_index, coordin[0])
            X_train = np.array(X_train)
            X_test = np.array(X_test)

        Y_test = np.array(Y_test, dtype=np.float)

        # --
        print('Constructing time series data set ..')
        # for rnn
        X_rnn_train = construct_time_steps(X_train[:-1], time_steps)
        X_rnn_test = construct_time_steps(X_test[:-1], time_steps)

        X_train = concatenate_time_steps(X_train[:-1], time_steps)
        Y_train = Y_train[time_steps:]
        X_test = concatenate_time_steps(X_test[:-1], time_steps)
        Y_test = Y_test[time_steps:]

        [X_train, Y_train] = higher(X_train, Y_train, interval_hours)
        [X_test, Y_test] = higher(X_test, Y_test, interval_hours)

        X_rnn_train = X_rnn_train[:len(X_train)]
        X_rnn_test = X_rnn_test[:len(X_test)]

        # delete data which have missing values
        i = 0
        while i < len(Y_test):
            # if Y_test[i] is missing (NaN), the comparison below is False and the check fires
            if not (Y_test[i] > -10000):
                Y_test = np.delete(Y_test, i, 0)
                X_test = np.delete(X_test, i, 0)
                X_rnn_test = np.delete(X_rnn_test, i, 0)
                i = -1
            i += 1
        Y_test = np.array(Y_test, dtype=np.float)

        # --
        X_rnn_train = np.array(X_rnn_train)
        X_rnn_test = np.array(X_rnn_test)
        X_train = np.array(X_train)
        Y_train = np.array(Y_train)
        X_test = np.array(X_test)

        np.random.seed(seed)
        np.random.shuffle(X_train)
        np.random.seed(seed)
        np.random.shuffle(Y_train)
        np.random.seed(seed)
        np.random.shuffle(X_rnn_train)

    else:  # is_training == False
        # mean and std
        fr = open(folder + "%s_parameter.pickle" % target_site, 'rb')
        [mean_X_train, std_X_train, mean_y_train, std_y_train] = (cPickle.load(fr)).split(',')
        mean_X_train = mean_X_train.replace('[', '').replace(']', '').replace('\n', '').split(' ')
        while '' in mean_X_train:
            mean_X_train.pop(mean_X_train.index(''))
        mean_X_train = np.array(mean_X_train, dtype=np.float)
        std_X_train = std_X_train.replace('[', '').replace(']', '').replace('\n', '').split(' ')
        while '' in std_X_train:
            std_X_train.pop(std_X_train.index(''))
        std_X_train = np.array(std_X_train, dtype=np.float)
        mean_y_train = float(mean_y_train)
        std_y_train = float(std_y_train)
        fr.close()

        # reading data
        print('preparing testing set ..')
        X_test = data
        X_test = missing_check(np.array(X_test))

        # normalize
        print('Normalize ..')
        if 0 in std_X_train:
            input("Denominator can't be 0.")
        X_test = np.array([(x_test - mean_X_train) / std_X_train for x_test in X_test])

        # feature process
        if 'WIND_DIREC' in pollution_kind:
            index_of_kind = pollution_kind.index('WIND_DIREC')
            length_of_kind_list = len(pollution_kind)
            len_of_sites_list = len(site_list)
            X_test = X_test.tolist()
            for i in range(len(X_test)):
                for j in range(len_of_sites_list):
                    specific_index = index_of_kind + j * length_of_kind_list
                    coordin = data_coordinate_angle(
                        (X_test[i].pop(specific_index + j)) * std_X_train[specific_index]
                        + mean_X_train[specific_index])
                    X_test[i].insert(specific_index, coordin[1])
                    X_test[i].insert(specific_index, coordin[0])
            X_test = np.array(X_test)

        # --
        print('Constructing time series data set ..')
        X_rnn_test = construct_time_steps(X_test, time_steps)
        X_test = concatenate_time_steps(X_test, time_steps)
        # --
        X_rnn_test = np.array(X_rnn_test)
        X_test = np.array(X_test)

    # -- xgboost --
    print('- xgboost -')
    filename = ("xgboost_%s_training_%s_m%s_to_%s_m%s_interval_%s"
                % (target_site, training_year[0], training_begining,
                   training_year[-1], training_deadline, interval_hours))
    print(filename)

    if is_training:
        xgb_model = xgb.XGBRegressor().fit(X_train, Y_train)
        fw = open(folder + filename, 'wb')
        cPickle.dump(xgb_model, fw)
        fw.close()
    else:
        fr = open(folder + filename, 'rb')
        xgb_model = cPickle.load(fr)
        fr.close()

    xgb_pred = xgb_model.predict(X_test)
    # print('rmse(xgboost): %.5f' % (np.mean((Y_test - (mean_y_train + std_y_train * xgb_pred))**2, 0)**0.5))

    # -- rnn --
    print('- rnn -')
    filename = ("sa_DropoutLSTM_%s_training_%s_m%s_to_%s_m%s_interval_%s"
                % (target_site, training_year[0], training_begining,
                   training_year[-1], training_deadline, interval_hours))
    print(filename)

    # Network Parameters
    time_steps = 12
    hidden_size = 20

    print("Expected args: p_W, p_U, p_dense, p_emb, weight_decay, batch_size")
    print("Using default args:")
    param = ["", "0.5", "0.5", "0.5", "0.5", "1e-6", "128"]
    args = [float(a) for a in param[1:]]
    print(args)
    p_W, p_U, p_dense, p_emb, weight_decay, batch_size = args
    batch_size = int(batch_size)

    # --
    print('Build rnn model...')
    start_time = time.time()

    rnn_model = Sequential()
    # layer 1
    rnn_model.add(
        BatchNormalization(epsilon=0.001, mode=0, axis=-1, momentum=0.99, weights=None,
                           beta_init='zero', gamma_init='one', gamma_regularizer=None,
                           beta_regularizer=None, input_shape=(time_steps, input_size)))
    rnn_model.add(
        LSTM(hidden_size, W_regularizer=l2(weight_decay), U_regularizer=l2(weight_decay),
             b_regularizer=l2(weight_decay), dropout_W=p_W, dropout_U=p_U))  # return_sequences=True
    rnn_model.add(Dropout(p_dense))

    # output layer
    rnn_model.add(
        BatchNormalization(epsilon=0.001, mode=0, axis=-1, momentum=0.99, weights=None,
                           beta_init='zero', gamma_init='one', gamma_regularizer=None,
                           beta_regularizer=None))
    rnn_model.add(
        Dense(output_size, W_regularizer=l2(weight_decay), b_regularizer=l2(weight_decay)))

    # optimiser = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=False)
    optimiser = 'adam'
    rnn_model.compile(loss='mean_squared_error', optimizer=optimiser)

    final_time = time.time()
    time_spent_printer(start_time, final_time)

    if is_training:
        print("Train...")
        start_time = time.time()
        rnn_model.fit(X_rnn_train, Y_train, batch_size=batch_size, epochs=50)
        # Potentially save weights
        rnn_model.save_weights(folder + filename, overwrite=True)
        final_time = time.time()
        time_spent_printer(start_time, final_time)
    else:
        print('loading model ..')
        # print('loading model from %s' % (folder + filename + ".hdf5"))
        rnn_model.load_weights(folder + filename)

    rnn_pred = rnn_model.predict(X_rnn_test, batch_size=500, verbose=1)
    final_time = time.time()
    time_spent_printer(start_time, final_time)
    # print('rmse(rnn): %.5f' % (np.mean((np.atleast_2d(Y_test).T - (mean_y_train + std_y_train * rnn_pred))**2, 0)**0.5))

    # -- ensemble --
    print('stacking ..')
    if is_training:
        xgb_output = xgb_model.predict(X_train).reshape(len(X_train), 1)
        # rf_output = rf_model.predict(X_train).reshape(len(X_train), 1)
        rnn_output = rnn_model.predict(X_rnn_train, batch_size=500, verbose=1)
        # ensemble_X_train = np.hstack((X_train, xgb_output, rf_output, rnn_output))
        ensemble_X_train = np.hstack((X_train, xgb_output, rnn_output))

        Y_alert_train = [y * std_y_train + mean_y_train for y in Y_train]
        for element in range(len(Y_train)):
            if Y_alert_train[element] > high_alert:
                Y_alert_train[element] = 1  # [1, 0] = [high, low]
            else:
                Y_alert_train[element] = 0

    xgb_pred = xgb_pred.reshape(len(X_test), 1)
    # rf_pred = rf_pred.reshape(len(X_test), 1)
    rnn_pred = rnn_pred.reshape(len(X_test), 1)
    # ensemble_X_test = np.hstack((X_test, xgb_pred, rf_pred, rnn_pred))
    ensemble_X_test = np.hstack((X_test, xgb_pred, rnn_pred))

    # Y_alert_test = np.zeros(len(Y_test))
    # for element in range(len(Y_test)):
    #     if Y_test[element] > high_alert:
    #         Y_alert_test[element] = 1  # [1, 0] = [high, low]

    print('- ensemble -')
    filename = ("ensemble_%s_training_%s_m%s_to_%s_m%s_interval_%s"
                % (target_site, training_year[0], training_begining,
                   training_year[-1], training_deadline, interval_hours))
    filename2 = ("classification_%s_training_%s_m%s_to_%s_m%s_interval_%s"
                 % (target_site, training_year[0], training_begining,
                    training_year[-1], training_deadline, interval_hours))

    if is_training:
        ensemble_model = xgb.XGBRegressor().fit(ensemble_X_train, Y_train)
        classification_model = xgb.XGBClassifier().fit(ensemble_X_train, Y_alert_train)
        fw = open(folder + filename, 'wb')
        cPickle.dump(ensemble_model, fw)
        fw.close()
        fw2 = open(folder + filename2, 'wb')
        cPickle.dump(classification_model, fw2)
        fw2.close()
    else:
        fr = open(folder + filename, 'rb')
        ensemble_model = cPickle.load(fr)
        fr.close()
        fr2 = open(folder + filename2, 'rb')
        classification_model = cPickle.load(fr2)
        fr2.close()
    pred = ensemble_model.predict(ensemble_X_test)
    alert_pred = classification_model.predict(ensemble_X_test)

    # --
    predictions = mean_y_train + std_y_train * pred
    # print('mse: %.5f' % mean_squared_error(Y_test, predictions))

    if is_training:
        print('rmse: %.5f' % (np.mean((Y_test - predictions)**2, 0)**0.5))

        def target_level(target, kind='PM2.5'):
            # target should be a 1d-list
            if kind == 'PM2.5':
                if (target >= 0) and (target < 11.5):        # 0-11
                    return 1
                elif (target >= 11.5) and (target < 23.5):   # 12-23
                    return 2
                elif (target >= 23.5) and (target < 35.5):   # 24-35
                    return 3
                elif (target >= 35.5) and (target < 41.5):   # 36-41
                    return 4
                elif (target >= 41.5) and (target < 47.5):   # 42-47
                    return 5
                elif (target >= 47.5) and (target < 53.5):   # 48-53
                    return 6
                elif (target >= 53.5) and (target < 58.5):   # 54-58
                    return 7
                elif (target >= 58.5) and (target < 64.5):   # 59-64
                    return 8
                elif (target >= 64.5) and (target < 70.5):   # 65-70
                    return 9
                elif target >= 70.5:                         # others (71+)
                    return 10
                else:
                    print('error value: %d' % target)
                    return 1

        pred_label = np.zeros(len(predictions))
        real_target = np.zeros(len(Y_test))

        pred_label_true = 0.
        pred_label_false = 0.
        four_label_true = 0.0
        four_label_false = 0.0

        # calculate the accuracy of ten levels
        for i in range(len(predictions)):
            pred_label[i] = target_level(predictions[i])
            real_target[i] = target_level(Y_test[i])

            if real_target[i] == pred_label[i]:
                pred_label_true += 1
            else:
                pred_label_false += 1

            # four label
            if (real_target[i] >= 1 and real_target[i] <= 3) and (pred_label[i] >= 1 and pred_label[i] <= 3):
                four_label_true += 1
            elif (real_target[i] >= 4 and real_target[i] <= 6) and (pred_label[i] >= 4 and pred_label[i] <= 6):
                four_label_true += 1
            elif (real_target[i] >= 7 and real_target[i] <= 9) and (pred_label[i] >= 7 and pred_label[i] <= 9):
                four_label_true += 1
            elif (real_target[i] >= 10) and (pred_label[i] >= 10):
                four_label_true += 1
            else:
                four_label_false += 1

        # print('standard_prob_accuracy: %.5f' % (standard_prob_true / (standard_prob_true + standard_prob_false)))
        print('Ten level accuracy: %.5f' % (pred_label_true / (pred_label_true + pred_label_false)))
        print('Four level accuracy: %.5f' % (four_label_true / (four_label_true + four_label_false)))
        print('--')

        # --
        ha = 0.0  # observation high, predict high
        hb = 0.0  # observation low, predict high
        hc = 0.0  # observation high, predict low
        hd = 0.0  # observation low, predict low

        la = 0.0  # observation very high, predict very high
        lb = 0.0
        lc = 0.0
        ld = 0.0

        alert_a = 0.0
        alert_b = 0.0
        alert_c = 0.0
        alert_d = 0.0

        integration_a = 0.0
        integration_b = 0.0
        integration_c = 0.0
        integration_d = 0.0

        for each_value in range(len(Y_test)):
            if Y_test[each_value] >= high_alert:  # observation high
                # regression
                if predictions[each_value] >= high_alert:  # forecast high (with tolerance)
                    ha += 1
                else:
                    hc += 1
                # classification
                if alert_pred[each_value]:  # [1, 0] = [high, low]
                    alert_a += 1
                else:
                    alert_c += 1
                # integration
                if alert_pred[each_value] or (predictions[each_value] >= high_alert):
                    integration_a += 1
                else:
                    integration_c += 1
            else:  # observation low
                # regression
                if predictions[each_value] >= high_alert:
                    hb += 1
                else:
                    hd += 1
                # classification
                if alert_pred[each_value]:
                    alert_b += 1
                else:
                    alert_d += 1
                # integration
                if alert_pred[each_value] or (predictions[each_value] >= high_alert):
                    integration_b += 1
                else:
                    integration_d += 1

            # --------------------------------------------------------
            if Y_test[each_value] >= low_alert:  # observation higher
                if predictions[each_value] >= low_alert:
                    la += 1
                else:
                    lc += 1
            else:  # observation very low
                if predictions[each_value] >= low_alert:
                    lb += 1
                else:
                    ld += 1

        # print('Two level accuracy: %f' % (two_label_true / (two_label_true + two_label_false)))
        print('high label: (%d, %d, %d, %d)' % (ha, hb, hc, hd))
        print('low label: (%d, %d, %d, %d)' % (la, lb, lc, ld))
        print('alert: (%d, %d, %d, %d)' % (alert_a, alert_b, alert_c, alert_d))

    return predictions
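# `data_coordinate_angle` is used throughout the WIND_DIREC feature processing above but is
# not defined in this section. Judging by how its two return values replace a single wind
# direction entry, it presumably decomposes a direction in degrees into two planar
# components (e.g. cos/sin). A minimal sketch under that assumption; the real helper may
# use a different angle convention (meteorological vs. mathematical).
import math

def data_coordinate_angle_sketch(degree):
    radian = math.radians(float(degree))
    return [math.cos(radian), math.sin(radian)]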
# if True:  # is_training:
# reading data
print('Reading data .. ')
start_time = time.time()
initial_time = time.time()

print('preparing training set ..')
raw_data_train = read_data_sets(sites=site_list + [target_site],
                                date_range=np.atleast_1d(training_year),
                                beginning=training_duration[0], finish=training_duration[-1],
                                feature_selection=pollution_kind, update=data_update)
raw_data_train = missing_check(raw_data_train)
Y_train = np.array(raw_data_train)[:, -len(pollution_kind):]
Y_train = Y_train[:, pollution_kind.index(target_kind)]
raw_data_train = np.array(raw_data_train)[:, :-len(pollution_kind)]

print('preparing testing set ..')
raw_data_test = read_data_sets(sites=site_list + [target_site],
                               date_range=np.atleast_1d(testing_year),
                               beginning=testing_duration[0], finish=testing_duration[-1],
                               feature_selection=pollution_kind, update=data_update)
Y_test = np.array(raw_data_test)[:, -len(pollution_kind):]
Y_test = Y_test[:, pollution_kind.index(target_kind)]
# same missing-value check and label-column removal as the training set
raw_data_test = missing_check(np.array(raw_data_test)[:, :-len(pollution_kind)])
def rnn(pollution_kind, local, city, target_site, training_year, testing_year,
        training_duration, testing_duration, interval_hours, data, is_training):
    print('is_training(%s) = %s' % (target_site, is_training))

    # format of training_year and testing_year should be (start year)-(end year), like 2014-2015
    # format of training_duration and testing_duration should be (start date)-(end date), like 1/1-12/31
    # local = os.sys.argv[1]
    # city = os.sys.argv[2]
    site_list = pollution_site_map[local][city]

    # change format from 2014-2015 to ['2014', '2015']
    training_year = [training_year[:training_year.index('-')],
                     training_year[training_year.index('-') + 1:]]
    testing_year = [testing_year[:testing_year.index('-')],
                    testing_year[testing_year.index('-') + 1:]]
    training_duration = [training_duration[:training_duration.index('-')],
                         training_duration[training_duration.index('-') + 1:]]
    testing_duration = [testing_duration[:testing_duration.index('-')],
                        testing_duration[testing_duration.index('-') + 1:]]
    interval_hours = int(interval_hours)  # predict the label averaged over this many hours ahead; default is 1
    # is_training = os.sys.argv[9]  # True / False

    # clear redundancy work
    if training_year[0] == training_year[1]:
        training_year.pop(1)
    if testing_year[0] == testing_year[1]:
        testing_year.pop(1)

    # Training Parameters
    # WIND_DIREC is a special feature that needs extra processing; for now it can only be
    # an element of the input vector.
    # pollution_kind = ['PM2.5', 'O3', 'AMB_TEMP', 'RH', 'WIND_SPEED', 'WIND_DIREC']
    target_kind = 'PM2.5'
    data_update = False
    # batch_size = 24 * 7
    seed = 0

    # Network Parameters
    input_size = (len(site_list) * len(pollution_kind) + len(site_list)) \
        if 'WIND_DIREC' in pollution_kind else (len(site_list) * len(pollution_kind))
    time_steps = 12
    hidden_size = 20
    output_size = 1

    # print("Expected args: p_W, p_U, p_dense, p_emb, weight_decay, batch_size, maxlen")
    # print("Using default args:")
    param = ["", "0.5", "0.5", "0.5", "0.5", "1e-6", "128", "200"]
    # args = [float(a) for a in sys.argv[1:]]
    args = [float(a) for a in param[1:]]
    # print(args)
    p_W, p_U, p_dense, p_emb, weight_decay, batch_size, maxlen = args
    batch_size = int(batch_size)
    maxlen = int(maxlen)

    testing_month = testing_duration[0][:testing_duration[0].index('/')]
    folder = root_path + "model/%s/%s/" % (local, city)
    filename = ("sa_DropoutLSTM_pW_%.2f_pU_%.2f_pDense_%.2f_pEmb_%.2f_reg_%f_batch_size_%d_cutoff_%d_epochs_%s_%sm_%sh"
                % (p_W, p_U, p_dense, p_emb, weight_decay, batch_size, maxlen,
                   target_site, testing_month, interval_hours))
    print(filename)

    if is_training:
        # reading data
        print('Reading data for %s .. ' % target_site)
        start_time = time.time()

        print('preparing training set for %s ..' % target_site)
        X_train = read_data_sets(sites=site_list + [target_site],
                                 date_range=np.atleast_1d(training_year),
                                 beginning=training_duration[0], finish=training_duration[-1],
                                 feature_selection=pollution_kind, update=data_update)
        X_train = missing_check(X_train)
        Y_train = np.array(X_train)[:, -len(pollution_kind):]
        Y_train = Y_train[:, pollution_kind.index(target_kind)]
        X_train = np.array(X_train)[:, :-len(pollution_kind)]

        print('preparing testing set for %s ..' % target_site)
        X_test = read_data_sets(sites=site_list + [target_site],
                                date_range=np.atleast_1d(testing_year),
                                beginning=testing_duration[0], finish=testing_duration[-1],
                                feature_selection=pollution_kind, update=data_update)
        Y_test = np.array(X_test)[:, -len(pollution_kind):]
        Y_test = Y_test[:, pollution_kind.index(target_kind)]
        X_test = missing_check(np.array(X_test)[:, :-len(pollution_kind)])

        final_time = time.time()
        print('Reading data for %s .. ok, ' % target_site, end='')
        time_spent_printer(start_time, final_time)

        print(len(X_train), 'train sequences')
        print(len(X_test), 'test sequences')
        if (len(X_train) < time_steps) or (len(X_test) < time_steps):
            input('time_steps(%d) too long.' % time_steps)

        # normalize
        print('Normalize for %s ..' % target_site)
        mean_X_train = np.mean(X_train, axis=0)
        std_X_train = np.std(X_train, axis=0)
        if 0 in std_X_train:
            input("Denominator can't be 0.(%s)" % target_site)
        X_train = np.array([(x_train - mean_X_train) / std_X_train for x_train in X_train])
        X_test = np.array([(x_test - mean_X_train) / std_X_train for x_test in X_test])

        mean_y_train = np.mean(Y_train)
        std_y_train = np.std(Y_train)
        if not std_y_train:
            input("Denominator can't be 0.(%s)" % target_site)
        Y_train = [(y - mean_y_train) / std_y_train for y in Y_train]
        print('mean_y_train: %f std_y_train: %f (%s)' % (mean_y_train, std_y_train, target_site))

        fw = open(folder + filename + ".pickle", 'wb')
        cPickle.dump(str(mean_X_train) + ',' + str(std_X_train) + ',' +
                     str(mean_y_train) + ',' + str(std_y_train), fw)
        fw.close()

        # feature process
        if 'WIND_DIREC' in pollution_kind:
            index_of_kind = pollution_kind.index('WIND_DIREC')
            length_of_kind_list = len(pollution_kind)
            len_of_sites_list = len(site_list)
            X_train = X_train.tolist()
            X_test = X_test.tolist()
            for i in range(len(X_train)):
                for j in range(len_of_sites_list):
                    specific_index = index_of_kind + j * length_of_kind_list
                    coordin = data_coordinate_angle(
                        (X_train[i].pop(specific_index + j)) * std_X_train[specific_index]
                        + mean_X_train[specific_index])
                    X_train[i].insert(specific_index, coordin[1])
                    X_train[i].insert(specific_index, coordin[0])
                    if i < len(X_test):
                        coordin = data_coordinate_angle(
                            (X_test[i].pop(specific_index + j)) * std_X_train[specific_index]
                            + mean_X_train[specific_index])
                        X_test[i].insert(specific_index, coordin[1])
                        X_test[i].insert(specific_index, coordin[0])
            X_train = np.array(X_train)
            X_test = np.array(X_test)

        Y_test = np.array(Y_test, dtype=np.float)

        # --
        print('Constructing time series data set for %s ..' % target_site)
        X_train = construct_time_steps(X_train[:-1], time_steps)
        Y_train = Y_train[time_steps:]

        reserve_hours = interval_hours - 1
        deadline = 0
        for i in range(len(Y_train)):
            # check whether the reserved data is enough or not
            if (len(Y_train) - i - 1) < reserve_hours:
                deadline = i
                break  # not enough
            for j in range(reserve_hours):
                Y_train[i] += Y_train[i + j + 1]
            Y_train[i] /= interval_hours
        if deadline:
            X_train = X_train[:deadline]
            Y_train = Y_train[:deadline]

        X_test = construct_time_steps(X_test[:-1], time_steps)
        Y_test = Y_test[time_steps:]

        deadline = 0
        for i in range(len(Y_test)):
            # check whether the reserved data is enough or not
            if (len(Y_test) - i - 1) < reserve_hours:
                deadline = i
                break  # not enough
            for j in range(reserve_hours):
                Y_test[i] += Y_test[i + j + 1]
            Y_test[i] /= interval_hours
        if deadline:
            X_test = X_test[:deadline]
            Y_test = Y_test[:deadline]

        # delete data which have missing values
        i = 0
        while i < len(Y_test):
            # if Y_test[i] is missing (NaN), the comparison below is False and the check fires
            if not (Y_test[i] > -10000):
                Y_test = np.delete(Y_test, i, 0)
                X_test = np.delete(X_test, i, 0)
                i = -1
            i += 1
        Y_test = np.array(Y_test, dtype=np.float)

        # --
        X_train = np.array(X_train)
        Y_train = np.array(Y_train)
        X_test = np.array(X_test)

        np.random.seed(seed)
        np.random.shuffle(X_train)
        np.random.seed(seed)
        np.random.shuffle(Y_train)

    # ------------------------------------
    else:
        fr = open(folder + filename + ".pickle", 'rb')
        [mean_X_train, std_X_train, mean_y_train, std_y_train] = (cPickle.load(fr)).split(',')
        mean_X_train = mean_X_train.replace('[', '').replace(']', '').replace('\n', '').split(' ')
        while '' in mean_X_train:
            mean_X_train.pop(mean_X_train.index(''))
        mean_X_train = np.array(mean_X_train, dtype=np.float)
        std_X_train = std_X_train.replace('[', '').replace(']', '').replace('\n', '').split(' ')
        while '' in std_X_train:
            std_X_train.pop(std_X_train.index(''))
        std_X_train = np.array(std_X_train, dtype=np.float)
        mean_y_train = float(mean_y_train)
        std_y_train = float(std_y_train)
        fr.close()

        # input data
        X_test = data

        # normalize
        print('Normalize for %s ..' % target_site)
        X_test = np.array([(x_test - mean_X_train) / std_X_train for x_test in X_test])

        # feature process
        if 'WIND_DIREC' in pollution_kind:
            index_of_kind = pollution_kind.index('WIND_DIREC')
            length_of_kind_list = len(pollution_kind)
            len_of_sites_list = len(site_list)
            X_test = X_test.tolist()
            for i in range(len(X_test)):
                for j in range(len_of_sites_list):
                    specific_index = index_of_kind + j * length_of_kind_list
                    coordin = data_coordinate_angle(
                        (X_test[i].pop(specific_index + j)) * std_X_train[specific_index]
                        + mean_X_train[specific_index])
                    X_test[i].insert(specific_index, coordin[1])
                    X_test[i].insert(specific_index, coordin[0])
            X_test = np.array([X_test])

    print('Build model for %s ..' % target_site)
    start_time = time.time()

    model = Sequential()
    model.add(
        DropoutLSTM(input_size, hidden_size, truncate_gradient=maxlen,
                    W_regularizer=l2(weight_decay), U_regularizer=l2(weight_decay),
                    b_regularizer=l2(weight_decay), p_W=p_W, p_U=p_U))
    model.add(Dropout(p_dense))
    model.add(
        Dense(hidden_size, output_size,
              W_regularizer=l2(weight_decay), b_regularizer=l2(weight_decay)))
    # optimiser = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=False)
    optimiser = 'adam'
    model.compile(loss='mean_squared_error', optimizer=optimiser)

    final_time = time.time()
    time_spent_printer(start_time, final_time)

    # --
    if is_training:
        print("Train for %s .." % target_site)
        start_time = time.time()

        checkpointer = ModelCheckpoint(filepath=folder + filename + ".hdf5", verbose=1,
                                       append_epoch_name=False, save_every_X_epochs=50)
        modeltest_1 = ModelTest(X_train[:100],
                                mean_y_train + std_y_train * np.atleast_2d(Y_train[:100]).T,
                                test_every_X_epochs=1, verbose=0, loss='euclidean',
                                mean_y_train=mean_y_train, std_y_train=std_y_train, tau=0.1)
        modeltest_2 = ModelTest(X_test, np.atleast_2d(Y_test).T,
                                test_every_X_epochs=1, verbose=0, loss='euclidean',
                                mean_y_train=mean_y_train, std_y_train=std_y_train, tau=0.1)

        model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=251,
                  callbacks=[checkpointer, modeltest_1, modeltest_2])

        # score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, show_accuracy=True)
        # print('Test score:', score)
        # print('Test accuracy:', acc)
        # model.save_weights(folder + filename + "_250.hdf5", overwrite=True)

        final_time = time.time()
        time_spent_printer(start_time, final_time)

        # --
        print("Test for %s .." % target_site)
        standard_prob = model.predict(X_train, batch_size=500, verbose=1)
        print(np.mean(((mean_y_train + std_y_train * np.atleast_2d(Y_train).T)
                       - (mean_y_train + std_y_train * standard_prob))**2, 0)**0.5)

        # --
        standard_prob = model.predict(X_test, batch_size=500, verbose=1)
        T = 50
        prob = np.array([model.predict_stochastic(X_test, batch_size=500, verbose=0)
                         for _ in xrange(T)])
        prob_mean = np.mean(prob, 0)

        print(np.mean((np.atleast_2d(Y_test).T - (mean_y_train + std_y_train * standard_prob))**2, 0)**0.5)
        print(np.mean((np.atleast_2d(Y_test).T - (mean_y_train + std_y_train * prob_mean))**2, 0)**0.5)

        standard_prob_pred = np.zeros(len(standard_prob))
        prob_mean_pred = np.zeros(len(prob_mean))
        real_target = np.zeros(len(Y_test))

        standard_prob_true = 0.
        standard_prob_false = 0.
        prob_mean_true = 0.
        prob_mean_false = 0.
        # calculate the accuracy of ten levels
        for i in range(len(prob_mean)):
            standard_prob_pred[i] = target_level(mean_y_train + std_y_train * standard_prob[i])
            prob_mean_pred[i] = target_level(mean_y_train + std_y_train * prob_mean[i])
            real_target[i] = target_level(Y_test[i])

            if real_target[i] == standard_prob_pred[i]:
                standard_prob_true += 1
            else:
                standard_prob_false += 1

            if real_target[i] == prob_mean_pred[i]:
                prob_mean_true += 1
            else:
                prob_mean_false += 1

        print('standard_prob_accuracy(%s): %.5f'
              % (target_site, standard_prob_true / (standard_prob_true + standard_prob_false)))
        print('prob_mean_accuracy(%s): %.5f'
              % (target_site, prob_mean_true / (prob_mean_true + prob_mean_false)))
        print('--')

        ha = 0.0  # observation high, predict high
        hb = 0.0  # observation low, predict high
        hc = 0.0  # observation high, predict low
        hd = 0.0  # observation low, predict low

        vha = 0.0  # observation very high, predict very high
        vhb = 0.0
        vhc = 0.0
        vhd = 0.0

        two_label_true = 0.0
        two_label_false = 0.0

        # statistics of the prediction status, by forecast & observation
        for each_label in np.arange(len(real_target)):
            if real_target[each_label] >= 7:  # observation high
                if prob_mean_pred[each_label] >= 7:
                    ha += 1
                    two_label_true += 1
                else:
                    hc += 1
                    two_label_false += 1
            else:  # observation low
                if prob_mean_pred[each_label] >= 7:
                    hb += 1
                    two_label_false += 1
                else:
                    hd += 1
                    two_label_true += 1

            if real_target[each_label] >= 10:  # observation very high
                if prob_mean_pred[each_label] >= 10:
                    vha += 1
                else:
                    vhc += 1
            else:  # observation low
                if prob_mean_pred[each_label] >= 10:
                    vhb += 1
                else:
                    vhd += 1

        print('Two level accuracy of %s : %f'
              % (target_site, (two_label_true / (two_label_true + two_label_false))))
        print('high label of %s: (%d, %d, %d, %d)' % (target_site, ha, hb, hc, hd))
        print('very high label of %s: (%d, %d, %d, %d)' % (target_site, vha, vhb, vhc, vhd))

        # plot the real trend and the trend of the prediction
        prediction = mean_y_train + std_y_train * prob_mean
        plt.plot(np.arange(len(prediction)), Y_test[:len(prediction)], c='gray')
        plt.plot(np.arange(len(prediction)), prediction, color='pink')
        plt.xticks(np.arange(0, len(prediction), 24))
        plt.yticks(np.arange(0, max(Y_test), 10))
        plt.grid(True)
        plt.rc('axes', labelsize=4)

    else:
        print('loading model for %s ..' % target_site)
        model.load_weights(folder + filename + ".hdf5")

        standard_prob = model.predict(X_test, batch_size=1, verbose=1)
        T = 50
        prob = np.array([model.predict_stochastic(X_test, batch_size=1, verbose=0)
                         for _ in xrange(T)])
        prob_mean = np.mean(prob, 0)

    return mean_y_train + std_y_train * prob_mean
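# Example of how the `rnn` entry point above might be invoked for a single site. The
# region, city, and site strings and the date ranges below are illustrative values only
# (they must match keys of `pollution_site_map`), and `data` is whatever feature matrix
# the caller has prepared for the non-training (prediction) path.
if __name__ == '__main__':
    example_pollution_kind = ['PM2.5', 'O3', 'AMB_TEMP', 'RH', 'WIND_SPEED', 'WIND_DIREC']
    predictions = rnn(example_pollution_kind, '北部', '台北', '古亭',
                      training_year='2014-2015', testing_year='2016-2016',
                      training_duration='1/1-12/31', testing_duration='1/1-12/31',
                      interval_hours='1', data=None, is_training=True)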