def disp_gap_bytraffic(self):
    df = self.gapdf
    data_dir = g_singletonDataFilePath.getTrainDir()
    dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_prevtraffic.df.pickle'
    # Cache-or-compute pattern used throughout: load the derived features
    # from the pickle cache if present, otherwise compute and dump them.
    dumpload = DumpLoad(dumpfile_path)
    if dumpload.isExisiting():
        temp_df = dumpload.load()
    else:
        traffic_dict = self.get_traffic_dict(data_dir)
        temp_df = self.X_y_Df[['start_district_id', 'time_slotid']].apply(
            self.find_prev_traffic, axis=1, traffic_dict=traffic_dict, pre_num=3)
        dumpload.dump(temp_df)
    df = pd.concat([df, temp_df], axis=1)

    # Mean gap per traffic1 level.
    by_traffic = df.groupby('traffic1')
    x = []
    y = []
    for name, group in by_traffic:
        x.append(name)
        y.append(group['gap'].mean())
    plt.scatter(x, y)
    return
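DumpLoad itself is not shown in these snippets; from the call sites it behaves like a small pickle-backed cache. A minimal sketch consistent with that interface follows (an assumption, not the project's actual implementation; the isExisiting spelling is kept to match every caller):

import os
import pickle

class DumpLoad(object):
    """Minimal pickle-backed cache keyed by a file path (interface
    inferred from the call sites; the real class may differ)."""

    def __init__(self, filename):
        self.filename = filename

    def isExisiting(self):
        # Spelling kept as-is to match the callers in this code base.
        return os.path.exists(self.filename)

    def dump(self, obj):
        with open(self.filename, 'wb') as f:
            pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load(self):
        with open(self.filename, 'rb') as f:
            return pickle.load(f)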
def disp_gap_by_district_type(self):
    df = self.gapdf
    data_dir = g_singletonDataFilePath.getTrainDir()
    dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_poi.df.pickle'
    dumpload = DumpLoad(dumpfile_path)
    if dumpload.isExisiting():
        temp_df = dumpload.load()
    else:
        poi_dict = self.get_district_type_dict()
        temp_df = self.X_y_Df[['start_district_id']].apply(
            self.find_poi, axis=1, poi_dict=poi_dict)
        dumpload.dump(temp_df)
    df = pd.concat([df, temp_df], axis=1)

    # One scatter panel per district (POI) type, on a fixed 7x4 grid with
    # shared axes. (An alternative sized the grid as ceil(sqrt(size)).)
    dt_list = self.get_district_type_list()
    size = len(dt_list)
    col_len = 4
    row_len = 7
    _, axarr = plt.subplots(row_len, col_len, sharex=True, sharey=True)
    for row in range(row_len):
        for col in range(col_len):
            index = row * col_len + col
            if index >= size:
                break
            item = dt_list[index]
            axarr[row, col].scatter(df[item], df['gap'])
            axarr[row, col].set_ylabel('Gap')
            axarr[row, col].set_xlabel(item)
    return
def get_history_data_dict(self):
    """
    Index for quick lookup.
    key   = ('start_district_id', 'time_id')
    value = gap
    The data includes train, test1, and test2.
    """
    t0 = time()
    filename = '../data_preprocessed/' + 'traintest_history_data.dict.pickle'
    dumpload = DumpLoad(filename)
    if dumpload.isExisiting():
        return dumpload.load()
    test1data_df = ExploreOrder().load_gapdf(g_singletonDataFilePath.getTest1Dir())
    test2data_df = ExploreOrder().load_gapdf(g_singletonDataFilePath.getTest2Dir())
    traindata_df = ExploreOrder().load_gapdf(g_singletonDataFilePath.getTrainDir())
    df = pd.concat([traindata_df, test1data_df, test2data_df], axis=0)
    self.__fileter_earlier_date(df)
    res_dict = self.__generate_dict(df)
    dumpload.dump(res_dict)
    print "dump history data dict:", round(time() - t0, 3), "s"
    return res_dict
def save_model(self):
    if not self.save_final_model:
        return
    dumpload = DumpLoad('logs/' + self.application_start_time + '_estimator.pickle')
    dumpload.dump(self)
    self.predictTestSet(g_singletonDataFilePath.getTest2Dir())
    return
def __normalize(self):
    self.X_train = self.X_train.astype(np.float32)
    self.X_val = self.X_val.astype(np.float32)
    self.X_test = self.X_test.astype(np.float32)
    # Standardize all splits with the training set's per-pixel statistics.
    mean_image = np.mean(self.X_train, axis=0)
    std_image = np.std(self.X_train, axis=0)
    self.X_train = (self.X_train - mean_image) / std_image
    self.X_val = (self.X_val - mean_image) / std_image
    self.X_test = (self.X_test - mean_image) / std_image
    # Cache the statistics so inference can apply the same transform.
    dumpload = DumpLoad('../data/meanstdimage.pickle')
    if not dumpload.isExisiting():
        dumpload.dump((mean_image, std_image))
    # Alternatives tried: (x - 128) / 128.0 scaling and MinMaxScaler.
    return
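At inference time the dumped statistics would be reloaded and applied to new inputs. A hedged sketch of that counterpart step (the pickle path is the one used above; the helper name and everything else is an assumption):

import numpy as np

def normalize_for_inference(X):
    # Hypothetical helper: reuse the training-set mean/std dumped by
    # __normalize so new samples get exactly the same transform.
    dumpload = DumpLoad('../data/meanstdimage.pickle')
    mean_image, std_image = dumpload.load()
    return (X.astype(np.float32) - mean_image) / std_image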
def add_pre_gaps(self, data_dir):
    dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_prevgap.df.pickle'
    dumpload = DumpLoad(dumpfile_path)
    if dumpload.isExisiting():
        df = dumpload.load()
    else:
        gap_dict = self.get_gap_dict(data_dir)
        df = self.X_y_Df[['start_district_id', 'time_slotid']].apply(
            self.find_prev_gap, axis=1, pre_num=3, gap_dict=gap_dict)
        dumpload.dump(df)
    self.X_y_Df = pd.concat([self.X_y_Df, df], axis=1)
    return
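find_prev_gap is defined elsewhere; judging by its arguments it looks up the gaps of the pre_num preceding time slots. A hypothetical sketch of that lookup, assuming a 'YYYY-MM-DD-<slot>' time_slotid layout (the helper name, the slot format, and the zero default are all assumptions):

import pandas as pd

def find_prev_gap_sketch(row, pre_num, gap_dict):
    # Hypothetical stand-in for self.find_prev_gap: fetch the gap of the
    # pre_num slots before this row's slot, defaulting to 0 when a
    # (district, slot) key is absent. Day boundaries ignored for brevity.
    district = row['start_district_id']
    date, slot_no = row['time_slotid'].rsplit('-', 1)
    res = {}
    for i in range(1, pre_num + 1):
        prev_slot = '%s-%s' % (date, int(slot_no) - i)
        res['gap%d' % i] = gap_dict.get((district, prev_slot), 0)
    return pd.Series(res)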
def add_history_data(self, data_dir):
    dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_history_data.df.pickle'
    dumpload = DumpLoad(dumpfile_path)
    if dumpload.isExisiting():
        df = dumpload.load()
    else:
        temp_dict = self.get_history_data_dict()
        df = self.X_y_Df[['start_district_id', 'time_id']].apply(
            self.find_history_data, axis=1, history_dict=temp_dict)
        dumpload.dump(df)
    self.X_y_Df = pd.concat([self.X_y_Df, df], axis=1)
    return
def add_district_gap_sum(self):
    dumpfile_path = '../data_preprocessed/' + 'training_data_district_gap_sum.dict.pickle'
    dumpload = DumpLoad(dumpfile_path)
    if dumpload.isExisiting():
        district_gap_sum_dict = dumpload.load()
    else:
        district_gap_sum_dict = self.X_y_Df.groupby('start_district_id')['gap'].sum().to_dict()
        dumpload.dump(district_gap_sum_dict)
    self.X_y_Df["district_gap_sum"] = self.X_y_Df["start_district_id"].map(district_gap_sum_dict)
    return
def __add_poi(self, data_dir):
    dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_poi.df.pickle'
    dumpload = DumpLoad(dumpfile_path)
    if dumpload.isExisiting():
        df = dumpload.load()
    else:
        poi_dict = self.get_district_type_dict()
        df = self.X_y_Df[['start_district_id']].apply(self.find_poi, axis=1, poi_dict=poi_dict)
        dumpload.dump(df)
    self.X_y_Df = pd.concat([self.X_y_Df, df], axis=1)
    return
def add_prev_weather(self, data_dir):
    dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_prevweather.df.pickle'
    dumpload = DumpLoad(dumpfile_path)
    if dumpload.isExisiting():
        df = dumpload.load()
    else:
        weather_dict = self.get_weather_dict(data_dir)
        df = self.X_y_Df['time_slotid'].apply(
            self.find_prev_weather_mode, weather_dict=weather_dict)
        dumpload.dump(df)
    self.X_y_Df = pd.concat([self.X_y_Df, df], axis=1)
    self.add_rain_check()
    return
def get_traffic_dict(self, data_dir):
    t0 = time()
    filename = '../data_raw/' + data_dir.split('/')[-2] + '_traffic.csv.dict.pickle'
    dumpload = DumpLoad(filename)
    if dumpload.isExisiting():
        return dumpload.load()
    resDict = {}
    df = self.load_trafficdf(data_dir)
    # key = (start_district_id, time_slotid), value = traffic
    for _, row in df.iterrows():
        resDict[tuple(row[['start_district_id', 'time_slotid']].tolist())] = row['traffic']
    dumpload.dump(resDict)
    print "dump traffic dict:", round(time() - t0, 3), "s"
    return resDict
def disp_names(self, sorted_inds, probabilities, include_background=True):
    dump_load = DumpLoad("../../data/imagenet/imagenet_labels_dict.pickle")
    if dump_load.isExisiting():
        names = dump_load.load()
    else:
        names = imagenet.create_readable_names_for_imagenet_labels()
        dump_load.dump(names)
    # Show the top-5 predictions; shift the label index by one when the
    # model's outputs exclude the background class.
    for i in range(5):
        index = sorted_inds[i]
        if include_background:
            print('Probability %0.2f%% => [%s]' % (probabilities[index], names[index]))
        else:
            print('Probability %0.2f%% => [%s]' % (probabilities[index], names[index + 1]))
    return
def get_gap_meanmedian_dict(self):
    data_dir = g_singletonDataFilePath.getTrainDir()
    filename = data_dir + 'order_data/temp/gap_meanmedian.dict.pickle'
    dumpload = DumpLoad(filename)
    if dumpload.isExisiting():
        return dumpload.load()
    resDict = {}
    df = self.load_gapdf(data_dir)
    grps = df.groupby(['start_district_id', 'time_id'])
    # key = (start_district_id, time_id), value = list of gaps over all days.
    # (An alternative kept only the non-zero gaps.)
    for name, grp in grps:
        resDict[name] = grp['gap'].tolist()
    dumpload.dump(resDict)
    return resDict
def get_weather_dict(self, data_dir):
    t0 = time()
    filename = '../data_raw/' + data_dir.split('/')[-2] + '_weather.csv.dict.pickle'
    dumpload = DumpLoad(filename)
    if dumpload.isExisiting():
        return dumpload.load()
    resDict = {}
    df = self.load_weatherdf(data_dir)
    # An exact entry per time slot...
    for index, row in df.iterrows():
        resDict[row['time_slotid']] = (index, row['weather'], row['temparature'], row['pm25'])
    # ...plus a per-day fallback entry holding that day's modal values.
    for name, group in df.groupby('time_date'):
        resDict[name] = (-1, mode(group['weather'])[0][0],
                         mode(group['temparature'])[0][0], mode(group['pm25'])[0][0])
    dumpload.dump(resDict)
    print "dump weather dict:", round(time() - t0, 3), "s"
    return resDict
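The dict above mixes two key kinds: an exact per-slot entry and a per-day fallback keyed by the date alone. A hedged sketch of how a consumer such as find_prev_weather_mode might exploit the fallback (the function name and the 'YYYY-MM-DD-<slot>' slot-id layout are assumptions):

def lookup_weather_sketch(weather_dict, time_slotid):
    # Hypothetical lookup: prefer the exact slot's reading, otherwise
    # fall back to the day-level mode entry keyed by the date alone.
    if time_slotid in weather_dict:
        return weather_dict[time_slotid]
    time_date = time_slotid.rsplit('-', 1)[0]
    return weather_dict.get(time_date)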
def get_gap_dict(self, data_dir):
    """
    Index for quick lookup.
    key   = ('start_district_id', 'time_slotid')
    value = gap
    """
    t0 = time()
    filename = '../data_preprocessed/' + data_dir.split('/')[-2] + '_gap.csv.dict.pickle'
    dumpload = DumpLoad(filename)
    if dumpload.isExisiting():
        return dumpload.load()
    resDict = {}
    df = self.load_gapdf(data_dir)
    for _, row in df.iterrows():
        resDict[tuple(row[['start_district_id', 'time_slotid']].tolist())] = row['gap']
    dumpload.dump(resDict)
    print "dump gap dict:", round(time() - t0, 3), "s"
    return resDict
def disp_gap_byweather(self):
    df = self.gapdf
    data_dir = g_singletonDataFilePath.getTrainDir()
    dumpfile_path = '../data_preprocessed/' + data_dir.split('/')[-2] + '_prevweather.df.pickle'
    dumpload = DumpLoad(dumpfile_path)
    if dumpload.isExisiting():
        temp_df = dumpload.load()
    else:
        weather_dict = self.get_weather_dict(data_dir)
        temp_df = self.X_y_Df['time_slotid'].apply(
            self.find_prev_weather_mode, weather_dict=weather_dict)
        dumpload.dump(temp_df)
    df = pd.concat([df, temp_df], axis=1)

    gaps_mean = df.groupby('preweather')['gap'].mean()
    gaps_mean.plot(kind='bar')
    plt.ylabel('Mean of gap')
    plt.xlabel('Weather')
    plt.title('Weather/Gap Correlation')
    return
import collections
import numpy as np

vocabulary_size = 50000  # assumed; defined earlier in the full script

def build_dataset(words):
    # The head of this function is assumed from the standard TensorFlow
    # word2vec tutorial that this snippet follows; only the body from
    # `unk_count = 0` onward appears in the original extract.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

# Build the dataset once and cache it with DumpLoad, mirroring the
# cache-or-compute pattern used throughout this code base.
rawdata = DumpLoad('./data/rawdata.pickle')
if not rawdata.isExisiting():
    rawdata.dump(build_dataset(words))
data, count, dictionary, reverse_dictionary = rawdata.load()

print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
del words  # Hint to reduce memory.

data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)