def generate_X_test_set(city="bj",
                        station_list=None,
                        X_aq_list=None,
                        X_meo_list=None,
                        pre_days=5,
                        gap=0,
                        use_norm_data=True,
                        use_day_model=True,
                        generate_mean=False,
                        generate_range=False):
    '''
    Build a single input sample from the tail of the dev data.

    Args:
        city : city code substituted into the CSV file names.
        station_list : stations passed to get_stations_filters.
        X_aq_list : aq features passed to get_X_feature_filters.
        X_meo_list : meo features passed to get_X_feature_filters.
        pre_days : number of days of history; the window spans
            pre_days * 24 - gap index labels ending at the last row.
        gap : number of hours by which the window is shortened.
        use_norm_data : read from "norm_data" when True, otherwise from
            "original_data".
        use_day_model : when True, apply day_mean / day_range according to
            generate_mean / generate_range.
        generate_mean : request the day_mean transform.
        generate_range : request the day_range transform.

    Returns:
        An array with a leading axis of length 1. With use_day_model and
        both flags set, day_mean and day_range outputs concatenated on
        axis 2; with one flag set, that single transform; otherwise the
        untransformed window.
    '''
    if use_norm_data:
        aq_path = "preprocessed_data/after_split/norm_data/%s_aq_dev_data.csv" % (city)
        meo_path = "preprocessed_data/after_split/norm_data/%s_meo_dev_data.csv" % (city)
    else:
        aq_path = "preprocessed_data/after_split/original_data/%s_aq_dev_data.csv" % (city)
        meo_path = "preprocessed_data/after_split/original_data/%s_meo_dev_data.csv" % (city)

    combined = pd.concat([pd.read_csv(aq_path), pd.read_csv(meo_path)], axis=1)

    # step 1 : keep all features about the requested stations
    station_columns = get_stations_filters(combined, station_list)

    # step 2 : narrow down to the requested X features
    input_columns = get_X_feature_filters(X_aq_list, X_meo_list,
                                          station_columns)
    features = combined[input_columns]

    # step 3 : use pre_days and gap to pick the X window out of the
    # preprocessed data. `.loc` slices by label and includes both endpoints.
    # NOTE(review): assumes the default integer index produced by read_csv.
    tail_offset = 0
    last_label = features.shape[0] - 1 - tail_offset
    first_label = last_label - pre_days * 24 + gap + 1
    window = np.array(features.loc[first_label:last_label])
    window = np.expand_dims(window, axis=0)

    if not use_day_model:
        return window

    if generate_mean and generate_range:
        daily_mean = day_mean(window)
        daily_range = day_range(window)
        return np.concatenate([daily_mean, daily_range], axis=2)
    if generate_mean:
        return day_mean(window)
    if generate_range:
        return day_range(window)
    # Neither flag requested: hand back the untransformed window.
    return window
def generate_training_batch(X_array_list,
                            y_array_list,
                            batch_size=128,
                            use_day_model=True,
                            generate_mean=False,
                            generate_range=False):
    '''
    Draw a random training batch from paired lists of sample arrays.

    Args:
        X_array_list : sequence of input arrays, concatenated on axis 0.
        y_array_list : sequence of target arrays, index-aligned with
            X_array_list.
        batch_size : number of independent draws; the same list entry may
            be drawn more than once within a batch.
        use_day_model : when True, transform the batch with day_mean /
            day_range according to the two flags below.
        generate_mean : request the day_mean transform.
        generate_range : request the day_range transform.

    Returns:
        use_day_model False: (X_batch, y_batch).
        use_day_model True with at least one flag:
            (X_transformed, (y_transformed, y_batch)); with both flags the
            mean and range outputs are concatenated on axis 2.
        use_day_model True with no flag: prints "wrong mode !" and
            returns None.
    '''
    X_list = []
    y_list = []
    population = len(X_array_list)
    for _ in range(batch_size):
        # np.random.choice(..., 1) returns a 1-element array. Take element
        # [0] explicitly: calling int() on an array with ndim > 0 is
        # deprecated in recent NumPy releases. The draw itself is unchanged.
        selected_index = int(
            np.random.choice(population, 1, replace=False)[0])
        X_list.append(X_array_list[selected_index])
        y_list.append(y_array_list[selected_index])

    X_train_batch = np.concatenate(X_list, axis=0)
    y_train_batch = np.concatenate(y_list, axis=0)

    if not use_day_model:
        return X_train_batch, y_train_batch

    if generate_mean and generate_range:
        X_train_batch_mean = day_mean(X_train_batch)
        y_train_batch_mean = day_mean(y_train_batch)
        X_train_batch_range = day_range(X_train_batch)
        y_train_batch_range = day_range(y_train_batch)
        # concatenate mean and range features along the feature axis
        X_train_batch_all = np.concatenate(
            [X_train_batch_mean, X_train_batch_range], axis=2)
        y_train_batch_all = np.concatenate(
            [y_train_batch_mean, y_train_batch_range], axis=2)
        return X_train_batch_all, (y_train_batch_all, y_train_batch)
    elif generate_mean:
        X_train_batch_mean = day_mean(X_train_batch)
        y_train_batch_mean = day_mean(y_train_batch)
        return X_train_batch_mean, (y_train_batch_mean, y_train_batch)
    elif generate_range:
        X_train_batch_range = day_range(X_train_batch)
        y_train_batch_range = day_range(y_train_batch)
        return X_train_batch_range, (y_train_batch_range, y_train_batch)

    # Day model requested without any transform flag: kept as a printed
    # message plus an implicit None, as callers may rely on it not raising.
    print("wrong mode !")
    return None
def generate_training_set(city="bj",
                          station_list=None,
                          X_aq_list=None,
                          y_aq_list=None,
                          X_meo_list=None,
                          use_day=True,
                          pre_days=5,
                          gap=0,
                          use_day_model=True,
                          generate_mean=False,
                          generate_range=False):
    '''
    Build the full training set as sliding (X, y) windows over the train data.

    Args:
        city : city code substituted into the CSV file names.
        station_list : a list of used stations.
        X_aq_list : a list of used aq features as input.
        y_aq_list : a list of used aq features as output.
        X_meo_list : a list of used meo features.
        use_day : step window starts by 24 rows when True, by 1 row otherwise.
        pre_days : days of history in each X window.
        gap : number of hours dropped from the end of each X window.
        use_day_model : when True, apply day_mean / day_range per the flags.
        generate_mean : return day_mean of X and y. Takes precedence over
            generate_range when both are set.
        generate_range : return day_range of X and y.

    Returns:
        (X, y). Transformed by day_mean or day_range when use_day_model and
        the matching flag are set; otherwise the stacked raw windows.

    Raises:
        ValueError : from np.concatenate when no NaN-free window exists.
    '''
    aq_train = pd.read_csv(
        "preprocessed_data/after_split/original_data/%s_aq_train_data.csv" % (city))
    meo_train = pd.read_csv(
        "preprocessed_data/after_split/original_data/%s_meo_train_data.csv" % (city))
    train_df = pd.concat([aq_train, meo_train], axis=1)

    station_filters = get_stations_filters(train_df, station_list)
    X_feature_filters = get_X_feature_filters(X_aq_list, X_meo_list,
                                              station_filters)
    y_feature_filters = get_y_feature_filters(y_aq_list, station_filters)
    X_df = train_df[X_feature_filters]
    y_df = train_df[y_feature_filters]

    # step 4 : generate training windows
    X_df_list = []
    y_df_list = []
    max_start_points = X_df.shape[0] - (pre_days + 2) * 24 - gap
    stride = 24 if use_day else 1

    for X_start_index in range(0, max_start_points, stride):
        X_end_index = X_start_index + pre_days * 24 - 1 - gap
        y_start_index = X_start_index + pre_days * 24
        y_end_index = y_start_index + 47  # 48 target rows (.loc is inclusive)

        X = X_df.loc[X_start_index:X_end_index]
        y = y_df.loc[y_start_index:y_end_index]

        # skip any window that contains NaN in either X or y
        if pd.isnull(X).any().any() or pd.isnull(y).any().any():
            continue

        X_df_list.append(np.expand_dims(np.array(X), axis=0))
        y_df_list.append(np.expand_dims(np.array(y), axis=0))

    X_train = np.concatenate(X_df_list, axis=0)
    y_train = np.concatenate(y_df_list, axis=0)

    if use_day_model:
        if generate_mean:
            return day_mean(X_train), day_mean(y_train)
        if generate_range:
            return day_range(X_train), day_range(y_train)

    # Previously this returned the undefined names X_train_batch /
    # y_train_batch and raised NameError; the raw windows are what is
    # available here.
    return X_train, y_train
def generate_dev_set(city="bj",
                     station_list=None,
                     X_aq_list=None,
                     y_aq_list=None,
                     X_meo_list=None,
                     pre_days=5,
                     gap=0,
                     use_day_model=True,
                     generate_mean=False,
                     generate_range=False):
    '''
    Build day-stepped (X, y) windows from the original (un-normalised) dev data.

    NOTE: a second function named generate_dev_set is defined further down
    in this file; at import time that later definition replaces this one.

    Args:
        city : city code substituted into the CSV file names.
        station_list : a list of used stations.
        X_aq_list : a list of used aq features as input.
        y_aq_list : a list of used aq features as output.
        X_meo_list : a list of used meo features.
        pre_days : days of history in each X window.
            NOTE(review): windows start at row 7 * 24, so values above 7
            look unsupported - confirm.
        gap : number of hours dropped from the end of each X window.
        use_day_model : which model to use, True for day_based model,
            false for old model.
        generate_mean : return flattened day_mean features (takes
            precedence over generate_range).
        generate_range : return flattened day_range features.

    Returns:
        use_day_model False: (X_dev, y_dev).
        use_day_model True and generate_mean: (X_mean, y_mean, y_dev),
            the first two reshaped to (m, days * features).
        use_day_model True and generate_range: (X_range, y_range, y_dev),
            reshaped the same way.
        use_day_model True with neither flag: None.
    '''
    aq_dev = pd.read_csv(
        "preprocessed_data/after_split/original_data/%s_aq_dev_data.csv" % (city))
    meo_dev = pd.read_csv(
        "preprocessed_data/after_split/original_data/%s_meo_dev_data.csv" % (city))
    dev_df = pd.concat([aq_dev, meo_dev], axis=1)

    station_filters = get_stations_filters(dev_df, station_list)
    X_feature_filters = get_X_feature_filters(X_aq_list, X_meo_list,
                                              station_filters)
    y_feature_filters = get_y_feature_filters(y_aq_list, station_filters)
    X_df = dev_df[X_feature_filters]
    y_df = dev_df[y_feature_filters]

    # step 4 : generate the data day by day
    min_y_start_index = 7 * 24  # 7 is the current longest pre_days
    max_y_start_index = X_df.shape[0] - 2 * 24  # leave room for 48 target rows

    X_df_list = []
    y_df_list = []
    for y_start_index in range(min_y_start_index, max_y_start_index, 24):
        X_start_index = y_start_index - pre_days * 24
        X_end_index = y_start_index - 1 - gap
        y_end_index = y_start_index + 47

        X = X_df.loc[X_start_index:X_end_index]
        y = y_df.loc[y_start_index:y_end_index]

        X_df_list.append(np.expand_dims(np.array(X), axis=0))
        y_df_list.append(np.expand_dims(np.array(y), axis=0))

    X_dev = np.concatenate(X_df_list, axis=0)
    y_dev = np.concatenate(y_df_list, axis=0)

    if not use_day_model:
        return X_dev, y_dev

    if generate_mean:
        X_dev_mean = day_mean(X_dev)
        y_dev_mean = day_mean(y_dev)
        [m, input_day, input_features] = X_dev_mean.shape
        [m, output_day, output_features] = y_dev_mean.shape
        # flatten to (m, days*features)
        X_dev_mean = X_dev_mean.reshape([m, input_day * input_features])
        y_dev_mean = y_dev_mean.reshape([m, output_day * output_features])
        return X_dev_mean, y_dev_mean, y_dev
    elif generate_range:
        # Use day_range here; this branch previously called day_mean and so
        # returned mean features under the "range" name.
        X_dev_range = day_range(X_dev)
        y_dev_range = day_range(y_dev)
        [m, input_day, input_features] = X_dev_range.shape
        [m, output_day, output_features] = y_dev_range.shape
        # flatten to (m, days*features)
        X_dev_range = X_dev_range.reshape([m, input_day * input_features])
        y_dev_range = y_dev_range.reshape([m, output_day * output_features])
        return X_dev_range, y_dev_range, y_dev

    # Day model requested without a transform flag: nothing to return.
    return None
def generate_dev_set(city="bj",
                     station_list=None,
                     X_aq_list=None,
                     y_aq_list=None,
                     X_meo_list=None,
                     pre_days=5,
                     gap=0,
                     use_day_model=True,
                     use_norm_data=True,
                     generate_mean=False,
                     generate_range=False,
                     for_aggr=False,
                     aggr_start_time="2018-5-22 0:00",
                     predict_one_day=False):
    '''
    Build day-stepped (X, y) windows from the dev data.

    Args:
        city : city code substituted into the CSV file names.
        station_list : a list of used stations.
        X_aq_list : a list of used aq features as input.
        y_aq_list : a list of used aq features as output.
        X_meo_list : a list of used meo features.
        pre_days : days of history in each X window.
            NOTE(review): windows start at row 7 * 24, so values above 7
            look unsupported - confirm.
        gap : number of hours dropped from the end of each X window.
        use_day_model : which model to use, True for day_based model,
            false for old model.
        use_norm_data : read from "norm_data" when True, otherwise from
            "original_data".
        generate_mean : request the day_mean transform (day model only).
        generate_range : request the day_range transform (day model only).
        for_aggr : when True, only produce windows whose targets start
            before the row whose 'time' equals aggr_start_time.
        aggr_start_time : timestamp compared against the 'time' column.
        predict_one_day : use 24 target rows per window instead of 48, and
            also collect the X features for that target day.

    Returns:
        use_day_model True with at least one flag:
            (X_transformed, (y_transformed, y_dev)); with both flags the
            mean and range outputs are concatenated on axis 2.
        use_day_model True with no flag: prints "wrong mode !", returns None.
        use_day_model False and predict_one_day:
            (X_dev, y_dev, x_dev_day_one, X_feature_filters,
            y_feature_filters).
        otherwise: (X_dev, y_dev).

    Raises:
        IndexError : when for_aggr is set and no row matches aggr_start_time.
    '''
    if use_norm_data:
        aq_dev = pd.read_csv(
            "preprocessed_data/after_split/norm_data/%s_aq_dev_data.csv" % (city))
        meo_dev = pd.read_csv(
            "preprocessed_data/after_split/norm_data/%s_meo_dev_data.csv" % (city))
    else:
        aq_dev = pd.read_csv(
            "preprocessed_data/after_split/original_data/%s_aq_dev_data.csv" % (city))
        meo_dev = pd.read_csv(
            "preprocessed_data/after_split/original_data/%s_meo_dev_data.csv" % (city))
    dev_df = pd.concat([aq_dev, meo_dev], axis=1)

    station_filters = get_stations_filters(dev_df, station_list)
    X_feature_filters = get_X_feature_filters(X_aq_list, X_meo_list,
                                              station_filters)
    y_feature_filters = get_y_feature_filters(y_aq_list, station_filters)
    X_df = dev_df[X_feature_filters]
    y_df = dev_df[y_feature_filters]

    # step 4 : generate the data day by day
    min_y_start_index = 7 * 24  # 7 is the current longest pre_days

    # .loc slices include both endpoints, so 23 -> 24 rows and 47 -> 48 rows
    y_hours = 23 if predict_one_day else 47

    if for_aggr:
        # remove duplicated columns
        dev_df = dev_df.loc[:, ~dev_df.columns.duplicated()]
        dev_df['time'] = pd.to_datetime(dev_df['time'])
        # get the index of the aggr_start_time
        max_y_start_index = dev_df.loc[
            dev_df['time'] == aggr_start_time].index.values[0] - y_hours
    else:
        # if not for aggr, use all data from the dev set
        max_y_start_index = X_df.shape[0] - 2 * 24

    X_df_list = []
    y_df_list = []
    y_day_one_df_list = []

    for y_start_index in range(min_y_start_index, max_y_start_index, 24):
        X_start_index = y_start_index - pre_days * 24
        X_end_index = y_start_index - 1 - gap
        y_end_index = y_start_index + y_hours

        X = X_df.loc[X_start_index:X_end_index]
        y = y_df.loc[y_start_index:y_end_index]

        X_df_list.append(np.expand_dims(np.array(X), axis=0))
        y_df_list.append(np.expand_dims(np.array(y), axis=0))

        if predict_one_day:
            # X features for the first (and only) target day. In this mode
            # y_end_index is already y_start_index + 23; the former
            # `y_end_index - 24` upper bound lay before the start label and
            # always produced an empty slice.
            y_day_one = X_df.loc[y_start_index:y_end_index]
            y_day_one_df_list.append(
                np.expand_dims(np.array(y_day_one), axis=0))

    X_dev = np.concatenate(X_df_list, axis=0)
    y_dev = np.concatenate(y_df_list, axis=0)

    if predict_one_day:
        x_dev_day_one = np.concatenate(y_day_one_df_list, axis=0)

    if use_day_model:
        if generate_mean and generate_range:
            X_dev_mean = day_mean(X_dev)
            y_dev_mean = day_mean(y_dev)
            X_dev_range = day_range(X_dev)
            y_dev_range = day_range(y_dev)
            # concatenate mean and range features
            X_dev_all = np.concatenate([X_dev_mean, X_dev_range], axis=2)
            y_dev_all = np.concatenate([y_dev_mean, y_dev_range], axis=2)
            # (m, days_in, 2*input_features), (m, days_out, 2*output_features)
            return X_dev_all, (y_dev_all, y_dev)
        elif generate_mean:
            X_dev_mean = day_mean(X_dev)
            y_dev_mean = day_mean(y_dev)
            return X_dev_mean, (y_dev_mean, y_dev)
        elif generate_range:
            X_dev_range = day_range(X_dev)
            y_dev_range = day_range(y_dev)
            return X_dev_range, (y_dev_range, y_dev)
        else:
            # Day model without a transform flag; kept as a message plus an
            # implicit None.
            print("wrong mode !")
            return None
    elif predict_one_day:
        return X_dev, y_dev, x_dev_day_one, X_feature_filters, y_feature_filters
    else:
        return X_dev, y_dev
def generate_aggr_set(city="bj",
                      station_list=None,
                      X_aq_list=None,
                      y_aq_list=None,
                      X_meo_list=None,
                      pre_days=5,
                      gap=0,
                      use_day_model=True,
                      use_norm_data=True,
                      generate_mean=False,
                      generate_range=False,
                      aggr_start_time="2018-5-22 0:00",
                      predict_one_day=False):
    '''
    Build day-stepped (X, y) windows from the dev data, starting at
    aggr_start_time.

    Args:
        city : city code substituted into the CSV file names.
        station_list : a list of used stations.
        X_aq_list : a list of used aq features as input.
        y_aq_list : a list of used aq features as output.
        X_meo_list : a list of used meo features.
        pre_days : days of history in each X window.
        gap : number of hours dropped from the end of each X window.
        use_day_model : when True, apply day_mean / day_range per the flags.
        use_norm_data : read from "norm_data" when True, otherwise from
            "original_data".
        generate_mean : request the day_mean transform (day model only).
        generate_range : request the day_range transform (day model only).
        aggr_start_time : set aggr y start time.
        predict_one_day : use 24 target rows per window instead of 48, and
            also collect the X features for that target day.

    Returns:
        use_day_model True with at least one flag:
            (X_transformed, (y_transformed, y_aggr)); with both flags the
            mean and range outputs are concatenated on axis 2.
        use_day_model True with no flag: prints "wrong mode !", returns None.
        use_day_model False and predict_one_day:
            (X_aggr, y_aggr, x_aggr_day_one).
        otherwise: (X_aggr, y_aggr).

    Raises:
        IndexError : when no row's 'time' equals aggr_start_time.
    '''
    if use_norm_data:
        aq_aggr = pd.read_csv(
            "preprocessed_data/after_split/norm_data/%s_aq_dev_data.csv" % (city))
        meo_aggr = pd.read_csv(
            "preprocessed_data/after_split/norm_data/%s_meo_dev_data.csv" % (city))
    else:
        aq_aggr = pd.read_csv(
            "preprocessed_data/after_split/original_data/%s_aq_dev_data.csv" % (city))
        meo_aggr = pd.read_csv(
            "preprocessed_data/after_split/original_data/%s_meo_dev_data.csv" % (city))
    aggr_df = pd.concat([aq_aggr, meo_aggr], axis=1)

    # remove duplicated columns
    aggr_df = aggr_df.loc[:, ~aggr_df.columns.duplicated()]
    aggr_df['time'] = pd.to_datetime(aggr_df['time'])
    # get the index of the aggr_start_time
    total_y_start_index = aggr_df.loc[
        aggr_df['time'] == aggr_start_time].index.values[0]

    station_filters = get_stations_filters(aggr_df, station_list)
    X_feature_filters = get_X_feature_filters(X_aq_list, X_meo_list,
                                              station_filters)
    y_feature_filters = get_y_feature_filters(y_aq_list, station_filters)
    X_df = aggr_df[X_feature_filters]
    y_df = aggr_df[y_feature_filters]

    # step 2 : generate aggr batch
    X_df_list = []
    y_df_list = []
    y_day_one_df_list = []

    # .loc slices include both endpoints, so 23 -> 24 rows and 47 -> 48 rows
    y_hours = 23 if predict_one_day else 47

    for y_start_index in range(total_y_start_index,
                               y_df.shape[0] - y_hours, 24):
        y_end_index = y_start_index + y_hours
        X_start_index = y_start_index - pre_days * 24
        X_end_index = y_start_index - 1 - gap

        y = y_df.loc[y_start_index:y_end_index]
        X = X_df.loc[X_start_index:X_end_index]

        X_df_list.append(np.expand_dims(np.array(X), axis=0))
        y_df_list.append(np.expand_dims(np.array(y), axis=0))

        if predict_one_day:
            # X features for the first (and only) target day. In this mode
            # y_end_index is already y_start_index + 23; the former
            # `y_end_index - 24` upper bound lay before the start label and
            # always produced an empty slice.
            y_day_one = X_df.loc[y_start_index:y_end_index]
            y_day_one_df_list.append(
                np.expand_dims(np.array(y_day_one), axis=0))

    X_aggr = np.concatenate(X_df_list, axis=0)
    y_aggr = np.concatenate(y_df_list, axis=0)

    if predict_one_day:
        x_aggr_day_one = np.concatenate(y_day_one_df_list, axis=0)

    if use_day_model:
        if generate_mean and generate_range:
            X_aggr_mean = day_mean(X_aggr)
            y_aggr_mean = day_mean(y_aggr)
            X_aggr_range = day_range(X_aggr)
            y_aggr_range = day_range(y_aggr)
            # concatenate mean and range features
            X_aggr_all = np.concatenate([X_aggr_mean, X_aggr_range], axis=2)
            y_aggr_all = np.concatenate([y_aggr_mean, y_aggr_range], axis=2)
            return X_aggr_all, (y_aggr_all, y_aggr)
        elif generate_mean:
            X_aggr_mean = day_mean(X_aggr)
            y_aggr_mean = day_mean(y_aggr)
            return X_aggr_mean, (y_aggr_mean, y_aggr)
        elif generate_range:
            X_aggr_range = day_range(X_aggr)
            y_aggr_range = day_range(y_aggr)
            return X_aggr_range, (y_aggr_range, y_aggr)
        else:
            # Day model without a transform flag; kept as a message plus an
            # implicit None.
            print("wrong mode !")
            return None
    elif predict_one_day:
        return X_aggr, y_aggr, x_aggr_day_one
    else:
        return X_aggr, y_aggr