def generate(self):
    """Build the per-station feature matrix from observed pollutant time series.

    Reads the stations and observations CSVs named in ``self.config``, then for
    every station flagged for prediction emits rows of the form
    ``[station_id, time, v0..v{time_steps-1}, l0..l47]`` where the ``v`` columns
    are the input window and the ``l`` columns are the next 48 hourly values to
    be predicted. The result is stored in ``self.features``.

    :return: ``self`` (fluent interface).
    """
    # load_model data
    stations = pd.read_csv(self.config[const.STATIONS], sep=";", low_memory=False)
    ts = pd.read_csv(self.config[const.OBSERVED], sep=";", low_memory=False)
    data = reform.group_by_station(ts=ts, stations=stations)
    stations = stations.to_dict(orient='index')
    features = list()
    start_time = time.time()
    for _, s_info in stations.items():
        if s_info[const.PREDICT] != 1:
            continue  # only build features for stations marked for prediction
        station_id = s_info[const.ID]
        s_data = data[station_id]
        s_time = pd.to_datetime(s_data[const.TIME], format=const.T_FORMAT, utc=True).tolist()
        # index of the first sample that has a full input window behind it,
        # and the last one that still has 48 future hours ahead of it
        first_x = self.time_steps - 1
        last_x = len(s_data) - 1 - 48
        sid = [station_id] * (len(s_time) - self.time_steps)
        s_value = s_data[self.config[const.POLLUTANT]].tolist()
        t, value = reform.split(time=s_time, value=s_value, step=self.time_steps)
        label = times.split(time=s_time, value=s_value, group_hours=1, step=48,
                            region=(first_x + 1, last_x + 1))  # values to be predicted
        # one row per sample: [station_id, time, v0..v{n-1}, l0..l47]
        feature_set = [[row_sid] + [row_t] + row_v + row_l
                       for row_sid, row_t, row_v, row_l in zip(sid, t, value, label)]
        features.extend(feature_set)
    # set name for columns
    columns = [const.ID, const.TIME]
    columns.extend(['v' + str(i) for i in range(0, self.time_steps)])
    columns.extend(['l' + str(i) for i in range(0, 48)])
    self.features = pd.DataFrame(data=features, columns=columns)
    print(len(self.features.index), 'feature vectors generated in',
          time.time() - start_time, 'secs')
    return self
def test_group_by_station():
    """group_by_station must return one DataFrame per predictable station, keyed by id.

    Station 2 has PREDICT == 0, so it is expected to be absent (or ignorable)
    in the grouped output; stations 1 and 3 must match their input rows.
    """
    data = pd.DataFrame(data={const.ID: [1, 2, 3], 'value': [5, 6, 7]})
    stations = pd.DataFrame(data={const.ID: [1, 2, 3], const.PREDICT: [1, 0, 1]})
    grouped = reform.group_by_station(ts=data, stations=stations)
    expected = {
        1: pd.DataFrame(data={const.ID: [1], 'value': [5]}),
        3: pd.DataFrame(data={const.ID: [3], 'value': [7]}),
    }
    pd_test.assert_frame_equal(expected[1], grouped[1])
    pd_test.assert_frame_equal(expected[3], grouped[3])
def load(self):
    """Load stations and observed time series from the configured CSV paths.

    Populates ``self.stations`` (raw stations DataFrame) and ``self.data``
    (observations grouped per station by ``reform.group_by_station``).

    :return: ``self`` (fluent interface).
    """
    # load_model data
    self.stations = pd.read_csv(self.config[const.STATIONS], sep=";", low_memory=False)
    ts = pd.read_csv(self.config[const.OBSERVED], sep=";", low_memory=False)
    self.data = reform.group_by_station(ts=ts, stations=self.stations)
    return self
def generate(self, ts=None, stations=None, verbose=True, save=True):
    """
    Create a basic feature set from pollutant time series, per hour

    x: (time, longitude, latitude, pollutant values of t:t+n)
    y: (pollutant values of t+n:t+n+m)

    Features are built station by station (via ``generate_per_station``) and,
    when ``save`` is set, flushed to disk in ``self.chunk_count`` chunks to
    bound memory use; each chunk is NaN-dropped and sub-sampled by
    ``config[const.FRACTION]`` before saving.

    :param ts: observed time series; read from ``config[const.OBSERVED]`` when None
    :param stations: stations DataFrame; read from ``config[const.STATIONS]`` when None
    :param verbose: print per-station progress
    :param save: write chunks to disk; when False, keep all features in memory
    :return: ``self`` (fluent interface)
    """
    # load_model data
    if ts is None:
        ts = pd.read_csv(self.config[const.OBSERVED], sep=";", low_memory=False)
    if stations is None:
        self._stations = pd.read_csv(self.config[const.STATIONS], sep=";", low_memory=False)
    else:
        self._stations = stations
    self.data = reform.group_by_station(ts=ts, stations=self._stations)
    features = list()
    start_time = time.time()
    stations = self._stations.to_dict(orient='index')
    # chunk boundaries over station indices; crossing one triggers a flush
    chunk_index = np.linspace(start=0, stop=len(stations) - 1, num=self.chunk_count + 1)
    station_count = self._stations[const.PREDICT].sum()
    processed_stations = 0
    next_chunk = 1
    total_data_points = 0
    for s_index, s_info in stations.items():
        if s_info[const.PREDICT] != 1:
            continue  # only build features for stations marked for prediction
        station_id = s_info[const.ID]
        if verbose:
            print(' Features of {sid} ({index} of {len})..'.format(
                sid=station_id, index=s_index + 1, len=len(stations)))
        s_data = self.data[station_id]
        s_time = pd.to_datetime(s_data[const.TIME], format=const.T_FORMAT).tolist()
        # first index with a complete air-quality history window behind it
        first_x = self.air_group * self.air_steps - 1
        station_features = self.generate_per_station(
            station_id, s_data, s_time, first_x)
        # aggregate all features per row
        features.extend(station_features)
        processed_stations += 1
        # save current chunk and go to next
        if save and (s_index >= chunk_index[next_chunk]
                     or processed_stations == station_count):
            # set and save the chunk of features
            self.features = pd.DataFrame(data=features, columns=self.get_all_columns())
            before_drop = len(self.features)
            self.dropna()
            after_drop = len(self.features)
            print(' %d feature vectors dropped having NaN' % (before_drop - after_drop))
            self.features = self.features.sample(
                frac=self.config[const.FRACTION])
            self.save_features(chunk_id=next_chunk)
            total_data_points += len(self.features)
            # go to next chunk
            features = list()
            self.features = pd.DataFrame()
            next_chunk += 1
    if not save:
        # keep everything in memory instead of chunked files
        self.features = pd.DataFrame(data=features, columns=self.get_all_columns())
        total_data_points = len(self.features)
    print(total_data_points, 'feature vectors generated in',
          time.time() - start_time, 'secs')
    return self
import pandas as pd

import const
import settings
from src import util
from src.preprocess import reform, times

# access default configurations
config = settings.config[const.DEFAULT]

# load_model data: Beijing stations and one month of observations (Jan 2018)
stations = pd.read_csv(config[const.BJ_STATIONS], sep=";", low_memory=False)
data = pd.read_csv(config[const.BJ_OBSERVED], sep=";", low_memory=False)
data = times.select(df=data, time_key=const.TIME,
                    from_time='18-01-01 00', to_time='18-01-31 23')
data_grouped = reform.group_by_station(ts=data, stations=stations)

pollutants = ['PM2.5']  # ['PM2.5', 'PM10', 'O3']
columns = ['forecast', 'actual', 'station', 'pollutant']
predictions = pd.DataFrame(data={}, columns=columns)

for station in data_grouped:
    station_data = data_grouped[station]
    station_time = pd.to_datetime(station_data[const.TIME],
                                  format=const.T_FORMAT, utc=True)
    for pollutant in pollutants:
        # split each station's series into 24h of input and 48h of target
        t, x, y = reform.split_dual(time=station_time,
                                    value=station_data[pollutant],
                                    unit_x=24, unit_y=48)
        # use day d values as forecast of days d+1 and d+2
}, 'LD': { 'PM2.5': feature_dir + const.LD_PM25 + suffix, # 'PM10': feature_dir + const.LD_PM10 + suffix, } } smape_columns = ['city', const.ID, const.LONG, const.LAT, 'pollutant', 'SMAPE', 'count'] smapes = pd.DataFrame(columns=smape_columns) for city in paths: station_path = config[const.BJ_STATIONS] if city == 'BJ' else config[const.LD_STATIONS] stations = pd.read_csv(station_path, sep=";", low_memory=False) stations_dict = stations.to_dict(orient='index') for pollutant, path in paths[city].items(): ts = pd.read_csv(path, sep=";", low_memory=False) station_data = reform.group_by_station(ts=ts, stations=stations) local_smapes = pd.DataFrame(data=[], columns=smape_columns) for _, station in stations_dict.items(): data = station_data[station[const.ID]] if station[const.PREDICT] == 1 else pd.DataFrame() if len(data.index) == 0: continue # no prediction for this station actual = data[[pollutant + '__' + str(i) for i in range(1, 49)]].as_matrix() forecast = data[['f' + str(i) for i in range(0, 48)]].as_matrix() station['SMAPE'] = util.SMAPE(actual=actual, forecast=forecast) smape = pd.DataFrame( data=[[city, station[const.ID], station[const.LONG], station[const.LAT], pollutant, station['SMAPE'], actual.size]], columns=smape_columns) local_smapes = local_smapes.append(other=smape, ignore_index=True) smapes = smapes.append(other=smape, ignore_index=True)