def load_other_bj_taxi_external(timeslots):
    """Build calendar features for a sequence of TaxiBJ timeslots.

    Args:
        timeslots: iterable of strings. The last two characters of each
            string are parsed as an integer slot number and the remaining
            prefix is parsed as a ``%Y%m%d`` date.

    Returns:
        A tuple ``(vacation, hour, dayOfWeek)`` of numpy arrays holding one
        entry per timeslot:

        * ``vacation``: 1 if the slot's date is listed in
          ``TaxiBJ/BJ_Holiday.txt``, otherwise 0.
        * ``hour``: the slot number floor-divided by 2.
        * ``dayOfWeek``: ``datetime.weekday()`` of the date, 0 (Monday)
          to 6 (Sunday).
    """
    path = os.path.join(utils.get_data_path(), 'TaxiBJ', 'BJ_Holiday.txt')
    with open(path, 'r') as f:
        # A set gives O(1) membership tests and, unlike counting matches,
        # still flags a date that appears more than once in the file.
        holidays = {line.strip() for line in f}

    date = [slot[:-2] for slot in timeslots]
    # NOTE(review): the original docstring promised hours 0-23; if slot
    # numbers are 1-based (01..48) this expression yields 0..24 - confirm
    # against the data before changing it.
    hour = [int(slot[-2:]) // 2 for slot in timeslots]
    dayOfWeek = [datetime.strptime(day, "%Y%m%d").weekday() for day in date]
    vacation = [1 if day in holidays else 0 for day in date]

    return np.asarray(vacation), np.asarray(hour), np.asarray(dayOfWeek)
def load_bj_taxi_flow():
    """Load, clean and normalise the TaxiBJ flow data.

    Reads ``BJ{year}_M32x32_T30_InOut.h5`` for every year in
    ``range(16, 17)`` (i.e. only year 16), passes the data through
    ``utils.remove_incomplete_day``, clamps non-positive values to zero,
    fits a ``MinMaxNormalization`` on the result and applies its
    ``transform`` to every record. The fitted normaliser is also pickled to
    ``bj_taxi_preprocessing.pkl`` under ``utils.get_pkl_path()``.

    Returns:
        A tuple ``(complete_date, complete_data, mmn, index)``:

        * ``complete_date``: numpy array of timeslot strings (decoded from
          UTF-8 bytes).
        * ``complete_data``: numpy array of the transformed records.
        * ``mmn``: the fitted ``MinMaxNormalization`` instance.
        * ``index``: dict mapping each timeslot string to its position in
          ``complete_date`` / ``complete_data``.
    """
    path_list = [
        os.path.join(utils.get_data_path(), 'TaxiBJ',
                     'BJ{}_M32x32_T30_InOut.h5'.format(year))
        for year in range(16, 17)
    ]

    complete_date = []
    complete_data = []
    for path in path_list:
        # The file is only read, so open it read-only: 'r+' needs write
        # permission and fails on read-only data directories.
        with h5py.File(path, 'r') as f:
            date = np.array(f['date'][()])
            data = np.array(f['data'][()])
        # 1. Missing-value handling: drop the incomplete days.
        date, data = utils.remove_incomplete_day(date, data)
        # 2. Outlier handling: replace flow values that are not positive
        #    with 0.
        data = np.where(data > 0, data, 0)
        complete_date.extend(date)
        complete_data.extend(data)
    complete_data = np.asarray(complete_data)

    mmn = MinMaxNormalization()
    mmn.fit(complete_data)
    complete_data = np.asarray([mmn.transform(d) for d in complete_data])
    print(mmn._max, mmn._min)

    # Store the fitted normaliser in a pkl file; the context manager closes
    # the handle even if pickling raises.
    path = os.path.join(utils.get_pkl_path(), 'bj_taxi_preprocessing.pkl')
    with open(path, 'wb') as fpkl:
        pickle.dump(mmn, fpkl)

    complete_date = np.array([time.decode('utf-8') for time in complete_date])

    # Lookup table: date -> index -> data.
    index = {date: i for i, date in enumerate(complete_date)}
    return complete_date, complete_data, mmn, index
def load_ny_bike_flow():
    """Load, clean and normalise the BikeNYC flow data.

    Reads ``BikeNYC/NYC14_M16x8_T60_NewEnd.h5``, passes the data through
    ``utils.remove_incomplete_day`` with ``unit_len=24``, clamps
    non-positive values to zero, fits a ``MinMaxNormalization`` on the
    result and applies its ``transform`` to every record. The fitted
    normaliser is also pickled to ``ny_bike_preprocessing.pkl`` under
    ``utils.get_pkl_path()``.

    Returns:
        A tuple ``(complete_date, complete_data, mmn, index)``:

        * ``complete_date``: numpy array of timeslot strings (decoded from
          UTF-8 bytes).
        * ``complete_data``: numpy array of the transformed records.
        * ``mmn``: the fitted ``MinMaxNormalization`` instance.
        * ``index``: dict mapping each timeslot string to its position in
          ``complete_date`` / ``complete_data``.
    """
    path = os.path.join(utils.get_data_path(), 'BikeNYC',
                        'NYC14_M16x8_T60_NewEnd.h5')

    complete_date = []
    complete_data = []
    # The file is only read, so open it read-only: 'r+' needs write
    # permission and fails on read-only data directories.
    with h5py.File(path, 'r') as f:
        date = np.array(f['date'][()])
        data = np.array(f['data'][()])
    # 1. Missing-value handling: drop the incomplete days.
    date, data = utils.remove_incomplete_day(date, data, unit_len=24)
    # 2. Outlier handling: replace flow values that are not positive with 0.
    data = np.where(data > 0, data, 0)
    complete_date.extend(date)
    complete_data.extend(data)
    complete_data = np.asarray(complete_data)
    print(complete_data.shape)

    mmn = MinMaxNormalization()
    mmn.fit(complete_data)
    complete_data = np.asarray([mmn.transform(d) for d in complete_data])

    # Store the fitted normaliser in a pkl file; the context manager closes
    # the handle even if pickling raises.
    path = os.path.join(utils.get_pkl_path(), 'ny_bike_preprocessing.pkl')
    with open(path, 'wb') as fpkl:
        pickle.dump(mmn, fpkl)

    complete_date = np.array([time.decode('utf-8') for time in complete_date])

    # Lookup table: date -> index -> data.
    index = {date: i for i, date in enumerate(complete_date)}
    return complete_date, complete_data, mmn, index
def load_bj_taxi_meteorology(timeslots=None):
    """Return the ``Weather`` record preceding each requested timeslot.

    NOTE(review): a second function with this same name is defined further
    down in this file; at import time that later definition replaces this
    one, so this version is effectively unreachable by name.

    Args:
        timeslots: iterable of timeslot strings that must all be present in
            the ``date`` dataset of ``TaxiBJ/BJ_Meteorology.h5``.
            NOTE(review): the default ``None`` is not iterable and raises
            ``TypeError`` - callers have to pass a value.

    Returns:
        numpy array with one ``Weather`` row per timeslot, taken from the
        position immediately before the timeslot's own position.

    Raises:
        IndexError: if a timeslot is not found in the file.
    """
    path = os.path.join(utils.get_data_path(), 'TaxiBJ', 'BJ_Meteorology.h5')
    # The file is only read, so open it read-only: 'r+' needs write
    # permission and fails on read-only data directories.
    with h5py.File(path, 'r') as f:
        Weather = f['Weather'][()]
        date = f['date'][()]

    # bytes -> str
    date = np.array([time.decode('utf-8') for time in date])

    # Build the index: position of the first match of each timeslot.
    index = [np.where(date == time)[0][0] for time in timeslots]

    # NOTE(review): for idx == 0 the expression idx - 1 is -1 and selects
    # the LAST record of the file - confirm whether that is intended.
    weather = [Weather[idx - 1] for idx in index]
    return np.asarray(weather)
def load_bj_taxi_holiday(timeslots):
    """Flag which timeslots fall on a date listed in the holiday file.

    Args:
        timeslots: iterable of strings; each string with its last two
            characters removed is compared against the stripped lines of
            ``TaxiBJ/BJ_Holiday.txt``.

    Returns:
        numpy array with one entry per timeslot: 1 if the date is listed
        as a holiday, otherwise 0.
    """
    path = os.path.join(utils.get_data_path(), 'TaxiBJ', 'BJ_Holiday.txt')
    with open(path, 'r') as f:
        # Build the set once so every membership test below is O(1)
        # instead of a scan over the whole holiday list.
        holiday = {line.strip() for line in f}

    # Set the dates to predict that are holidays to 1.
    feature_holiday = [1 if time[:-2] in holiday else 0 for time in timeslots]
    return np.asarray(feature_holiday)
def load_bj_taxi_meteorology(timeslots=None):
    """Return weather, temperature and wind speed preceding each timeslot.

    For every requested timeslot the record immediately before its position
    in ``TaxiBJ/BJ_Meteorology.h5`` is used as the external feature.

    Args:
        timeslots: iterable of timeslot strings that must all be present in
            the ``date`` dataset of the file.
            NOTE(review): the default ``None`` is not iterable and raises
            ``TypeError`` - callers have to pass a value.

    Returns:
        A tuple ``(weather, temperature, windSpeed)`` - in that order - of
        numpy arrays with one entry per timeslot:

        * ``weather``: the squeezed ``np.argwhere`` result for the entries
          of the ``Weather`` row that equal 1 (presumably a one-hot row,
          giving a single category index for an embedding - confirm).
        * ``temperature``: ``Temperature`` values passed through
          ``MinMaxScale``.
        * ``windSpeed``: ``WindSpeed`` values passed through
          ``MinMaxScale``.

    Raises:
        IndexError: if a timeslot is not found in the file.
    """
    path = os.path.join(utils.get_data_path(), 'TaxiBJ', 'BJ_Meteorology.h5')
    # The file is only read, so open it read-only: 'r+' needs write
    # permission and fails on read-only data directories.
    with h5py.File(path, 'r') as f:
        Temperature = f['Temperature'][()]
        Weather = f['Weather'][()]
        WindSpeed = f['WindSpeed'][()]
        date = f['date'][()]

    # Convert the numpy bytes to str.
    date = np.array([time.decode('utf-8') for time in date])

    # Build the index: position of the first match of each timeslot.
    index = [np.where(date == time)[0][0] for time in timeslots]

    temperature = []
    windSpeed = []
    weather = []
    # The previous time slot is used as the external data.
    # NOTE(review): for idx == 0 the expression idx - 1 is -1 and selects
    # the LAST record of the file - confirm whether that is intended.
    for idx in index:
        prev = idx - 1
        temperature.append(Temperature[prev])
        # Just one index, for the embedding.
        weather.append(np.argwhere(Weather[prev] == 1).squeeze())
        windSpeed.append(WindSpeed[prev])

    temperature = np.asarray(temperature)
    weather = np.asarray(weather)
    windSpeed = np.asarray(windSpeed)

    # Min-max scale the wind speed and the temperature.
    temperature = MinMaxScale(temperature)
    windSpeed = MinMaxScale(windSpeed)

    return weather, temperature, windSpeed