def process():
    """Convert op-go.csv's geohash column into separate lat/lon columns and
    write the resulting rows (values only) to converted.csv.

    Relies on module-level `fields` (the CSV column names) and the
    `Geohash` and `csv` imports.
    """
    update_list = []
    with open('op-go.csv') as csv_file:
        reader = csv.DictReader(csv_file, fieldnames=fields)
        line_count = 0
        for row in reader:
            if line_count == 0:
                # Header row: replace the geohash column with literal
                # 'lat'/'lon' header labels.
                row.pop("geohash")
                row.update({'lat': 'lat', 'lon': 'lon'})
                line_count += 1
                update_list.append(row)
                continue
            # Fix: decode once per row (the original called Geohash.decode
            # twice for every record).
            lat, lon = Geohash.decode(row.pop("geohash"))
            row.update({'lat': lat, 'lon': lon})
            update_list.append(row)
    # Fix: newline='' is the documented way to open a file for csv.writer
    # (prevents blank interleaved rows on Windows).
    with open('converted.csv', 'w', newline='') as myfile:
        wr = csv.writer(myfile)
        for row in update_list:
            wr.writerow(dict(row).values())
def decode_geohash(data):
    """Return a copy of *data* with the geohashed start (and, when present,
    end) location columns decoded into separate lat/lon columns.

    The input frame is not modified.
    """
    out = data.copy()
    start = np.array([Geohash.decode(code)
                      for code in out['geohashed_start_loc']])
    out['start_loc_lat'] = start[:, 0]
    out['start_loc_lon'] = start[:, 1]
    # The test set carries no end location; decode it only when present.
    if 'geohashed_end_loc' in data.columns:
        end = np.array([Geohash.decode(code)
                        for code in out['geohashed_end_loc']])
        out['end_loc_lat'] = end[:, 0]
        out['end_loc_lon'] = end[:, 1]
    return out
def get_loc_matrix():
    """Build candidate (start, end) geohash pairs whose decoded coordinates
    lie within a small lat/lon window of each other, cached as HDF.

    NOTE(review): relies on module-level cache_path/train_path/test_path —
    confirm they are defined where this function is used.
    """
    result_path = cache_path + 'loc_matrix.hdf'
    if os.path.exists(result_path):
        result = pd.read_hdf(result_path, 'w')
    else:
        train = pd.read_csv(train_path)
        test = pd.read_csv(test_path)
        end_loc = pd.DataFrame(
            {'geohashed_end_loc': list(train['geohashed_end_loc'].unique())})
        # Decode every unique end geohash (two decode calls per hash).
        end_loc['end_loc_lat'] = end_loc['geohashed_end_loc'].apply(
            lambda x: Geohash.decode(x)[0])
        end_loc['end_loc_lon'] = end_loc['geohashed_end_loc'].apply(
            lambda x: Geohash.decode(x)[1])
        # Bucket decoded coordinates into coarse grid boxes.
        end_loc['end_loc_lat_box'] = end_loc['end_loc_lat'].apply(
            lambda x: x // 0.003)
        end_loc['end_loc_lon_box'] = end_loc['end_loc_lon'].apply(
            lambda x: x // 0.00375)
        count_of_loc = train.groupby('geohashed_end_loc', as_index=False)['geohashed_end_loc'].agg(
            {'count_of_loc': 'count'})
        end_loc = pd.merge(end_loc, count_of_loc, on='geohashed_end_loc', how='left')
        # Keep only the most frequent end location per grid box.
        max_index = end_loc.groupby(
            ['end_loc_lat_box', 'end_loc_lon_box']).apply(lambda x: x['count_of_loc'].argmax())
        end_loc = end_loc.loc[
            max_index.tolist(), ['geohashed_end_loc', 'end_loc_lat', 'end_loc_lon']]
        end_loc.sort_values('end_loc_lat', inplace=True)
        end_loc = end_loc.values
        start_loc = pd.DataFrame({
            'geohashed_start_loc':
            list(pd.concat([train, test])['geohashed_start_loc'].unique())
        })
        start_loc['start_loc_lat'] = start_loc['geohashed_start_loc'].apply(
            lambda x: Geohash.decode(x)[0])
        start_loc['start_loc_lon'] = start_loc['geohashed_start_loc'].apply(
            lambda x: Geohash.decode(x)[1])
        start_loc = start_loc.values
        start_end_loc_arr = []
        # Pair every start with every nearby candidate end — O(n*m) scan.
        for i in start_loc:
            for j in end_loc:
                if (np.abs(i[1] - j[1]) < 0.012) & (np.abs(i[2] - j[2]) < 0.015):
                    start_end_loc_arr.append([i[0], j[0]])
        result = pd.DataFrame(
            start_end_loc_arr, columns=['geohashed_start_loc', 'geohashed_end_loc'])
        result.to_hdf(result_path, 'w', complib='blosc', complevel=5)
    return result
def get_distance(result):
    """Attach 'distance' and 'mht_distance' columns computed from decoded
    start/end geohashes, with an HDF cache keyed by the row count.

    NOTE(review): depends on module-level `flag` and `cache_path`, and on
    helpers `cal_distance` / `cal_mht_distance` defined elsewhere.
    """
    result_path = cache_path + 'distance_feat_%d.hdf' % (result.shape[0])
    # `&` between two Python booleans works, though `and` would be idiomatic.
    if os.path.exists(result_path) & flag:
        temp = pd.read_hdf(result_path, 'w')
        result = pd.merge(result, temp, on=['orderid', 'geohashed_end_loc'], how='left')
    else:
        locs = list(set(result['geohashed_start_loc']) | set(result['geohashed_end_loc']))
        if np.nan in locs:
            locs.remove(np.nan)
        # Decode each distinct geohash once; loc_dict maps hash -> (lat, lon).
        deloc = []
        for loc in locs:
            deloc.append(Geohash.decode(loc))
        loc_dict = dict(zip(locs, deloc))
        geohashed_loc = result[['geohashed_start_loc', 'geohashed_end_loc']].values
        distance = []
        mht_distance = []
        # NOTE(review): unlike the NaN-aware variant of this function,
        # a NaN start/end here raises KeyError — confirm inputs are clean.
        for i in geohashed_loc:
            lat1, lon1 = loc_dict[i[0]]
            lat2, lon2 = loc_dict[i[1]]
            distance.append(cal_distance(lat1, lon1, lat2, lon2))
            mht_distance.append(cal_mht_distance(lat1, lon1, lat2, lon2))
        result['distance'] = distance
        result['mht_distance'] = mht_distance
        result[['orderid', 'geohashed_end_loc', 'distance', 'mht_distance']].to_hdf(
            result_path, 'w', complib='blosc', complevel=5)
    return result
def get_distance(result):
    """Append 'distance' and 'manhattan' columns computed from the decoded
    start/end geohashes; rows with a NaN location get NaN distances."""
    all_locs = list(
        set(result['geohashed_start_loc']) | set(result['geohashed_end_loc']))
    if np.nan in all_locs:
        all_locs.remove(np.nan)
    # Decode each distinct geohash exactly once.
    decoded = {loc: Geohash.decode(loc) for loc in all_locs}
    pairs = result[['geohashed_start_loc', 'geohashed_end_loc']].values
    euclid = []
    mht = []
    for start, end in pairs:
        # Guard (De Morgan of the original condition): skip NaN endpoints.
        if start is np.nan or end is np.nan:
            euclid.append(np.nan)
            mht.append(np.nan)
            continue
        lat_a, lon_a = decoded[start]
        lat_b, lon_b = decoded[end]
        euclid.append(
            cal_distance(float(lat_a), float(lon_a), float(lat_b), float(lon_b)))
        mht.append(
            manhattan(float(lat_a), float(lon_a), float(lat_b), float(lon_b)))
    result.loc[:, 'distance'] = euclid
    result.loc[:, 'manhattan'] = mht
    return result
def put(self):
    """Flask-RESTful PUT handler: predict pickup density for every known
    geohash at the requested hour/date/weather, then return the top
    hotspots with reverse-geocoded addresses."""
    hour = int(request.form['hour'])
    date = request.form['date']
    # Weather inputs are rescaled — presumably to match the units the model
    # was trained on (TODO confirm against the training pipeline).
    prcp = float(request.form['prcp']) * 100
    snow = float(request.form['snow']) * 10
    tmax = float(request.form['tmax']) * 10
    tmin = float(request.form['tmin']) * 10
    date = pd.to_datetime(date)
    with open(os.path.join(APP_STATIC, 'uniquegeohash.pkl'), 'rb') as f:
        uniquegeohash = dill.load(f)
    with open(os.path.join(APP_STATIC, 'predict_pickup_density.pkl'), 'rb') as f:
        model = dill.load(f)
    # One feature row per known geohash.
    x_dict = [{"pickup_geohash": geostr, "hour": hour, "dayofweek": date.dayofweek,
               'month': date.month, 'PRCP': prcp, 'SNOW': snow, 'TMAX': tmax,
               'TMIN': tmin} for geostr in uniquegeohash]
    x_df = pd.DataFrame(x_dict)
    y = model.predict(x_df)
    geodecode = [Geohash.decode(geocode) for geocode in uniquegeohash]
    yzipgeo = zip(y, geodecode)
    sortedlist = sorted(yzipgeo, key=lambda x: -x[0])
    top10address = []
    top10dict = {}
    # Aggregate predicted density of the 50 best geohashes per decoded point.
    for y, geodecode in sortedlist[0:50]:
        key = ",".join(geodecode)
        top10dict[key] = top10dict.get(key, 0) + y
    top10res = []
    for key in top10dict:
        temptuple = (float(key.split(",")[0]), float(key.split(",")[1]))
        top10res.append([top10dict[key], temptuple])
    top10res = sorted(top10res, key=lambda x: -x[0])
    top10res = top10res[0:10] if len(top10res) > 10 else top10res
    # Reverse-geocode each hotspot (network call per point).
    for u, geodecode in top10res:
        g = geocoder.google([geodecode[0], geodecode[1]], method='reverse').address
        top10address.append(g)
    return {"top10": top10res, "top10address": top10address}
def test_basic(self):
    """Round-trip: encoding the family's lat/lon at precision 20 and
    decoding again must reproduce the original coordinates exactly."""
    encoded = Geohash.encode(self.family[0], self.family[1], precision=20)
    decoded_lat, decoded_lon = Geohash.decode(encoded)
    assert float(decoded_lat) == self.family[0]
    assert float(decoded_lon) == self.family[1]
def geohash_decode(file, test=False):
    """Decode the geohashed location column(s) of *file* into x/y columns.

    For test data only the start location exists; the result is written to
    test_1.csv. Training data has both start and end locations and is
    written to train_1.csv. (Originally noted: runs in about half an hour.)

    :param file: path of the CSV file to read
    :param test: True for the test set (start location only)
    """
    data = pd.read_csv(file)
    if test:
        x, y = _decode_column(data["geohashed_start_loc"])
        data = data.drop(labels="geohashed_start_loc", axis=1)
        data.insert(data.shape[1], "start_loc_x", pd.Series(x))
        data.insert(data.shape[1], "start_loc_y", pd.Series(y))
        data.to_csv("test_1.csv", index=False)
    else:
        # Fix: the two copy-pasted decode loops are factored into one helper.
        x1, y1 = _decode_column(data["geohashed_start_loc"])
        x2, y2 = _decode_column(data["geohashed_end_loc"])
        data = data.drop(labels="geohashed_start_loc", axis=1)
        data = data.drop(labels="geohashed_end_loc", axis=1)
        data.insert(data.shape[1], "start_loc_x", pd.Series(x1))
        data.insert(data.shape[1], "start_loc_y", pd.Series(y1))
        data.insert(data.shape[1], "end_loc_x", pd.Series(x2))
        data.insert(data.shape[1], "end_loc_y", pd.Series(y2))
        data.to_csv("train_1.csv", index=False)


def _decode_column(geohashes):
    """Decode a Series of geohashes, printing (index, total) progress.

    Returns two parallel lists: latitudes and longitudes.
    """
    total = len(geohashes)
    xs = []
    ys = []
    for i in range(total):
        print(i, total)
        decoded = gh.decode(geohashes[i])
        xs.append(decoded[0])
        ys.append(decoded[1])
    return xs, ys
def show_station_coordinfo(self):
    """Log each channel's stored coordinates, its geohash, and the
    geohash decoded back to (lat, lon) for a visual sanity check."""
    for channel, info in self.station_coordinfo.items():
        logger.debug("%s (%f, %f, %f) %s | %s",
                     channel,
                     info['latitude'],
                     info['longitude'],
                     info['elevation'],
                     info['geohash'],
                     Geohash.decode(info['geohash']))
def geohash_decoding(geohash):
    """Decode *geohash* to WGS-84 and convert to Mars (GCJ-02) coordinates.

    :param geohash: geohash string to convert
    :return: (GCJ-02 latitude, GCJ-02 longitude)
    """
    latitude, longitude = Geohash.decode(geohash)
    return wgs2gcj(latitude, longitude)
def gettile():
    """Render the tile-selection page; on POST, plot the vehicles of the
    chosen tile on a map centred near the tile's decoded coordinates."""
    vehicle_map = init_vehicle_map
    tilesresult = getdata.tiles()
    # getdata.tiles() returns HTML "<option>" markup on success and an
    # error string otherwise.
    if tilesresult[:8] != "<option>":
        app.logger.debug('Connection error : %s', tilesresult)
        return render_template('gettile.html', vehicle_map=vehicle_map, error=ErrorMessage)
    else:
        alltiles = Markup(tilesresult)
    if request.method == 'POST':
        if request.form['tile']:
            tile = request.form['tile']
            markers_map = getdata.getvehicles_fortile(tile)
            # A non-list return signals a backend failure.
            if not isinstance(markers_map, list):
                app.logger.debug('Connection error : %s', markers_map)
                return render_template('gettile.html', vehicle_map=vehicle_map, error=ErrorMessage)
            nbmvts = len(markers_map)
            # Centre the map slightly south-west of the tile centre
            # (0.2 degree offset).
            mappos = Geohash.decode(tile)
            vehicle_map = Map(
                identifier="view-side",
                lat=str(float(mappos[0]) - 0.2),
                lng=str(float(mappos[1]) - 0.2),
                style="height:700px;width:700px;margin:10;",
                zoom=9,
                markers=markers_map
            )
            return render_template('gettile.html', alltiles=alltiles, nbmvts=nbmvts,
                                   tile=tile, vehicle_map=vehicle_map)
    return render_template('gettile.html', alltiles=alltiles, vehicle_map=vehicle_map)
def get_loc_dict():
    """Return {geohash: decoded (lat, lon)} for every location seen in the
    train/test data, caching the mapping as a pickle at cache_path.

    Relies on module-level cache_path/train_path/test_path.
    """
    dump_path = cache_path + 'loc_dict.pkl'
    if os.path.exists(dump_path):
        # Fix: close the cache file — the original leaked the handle from
        # pickle.load(open(...)).
        with open(dump_path, 'rb+') as f:
            loc_dict = pickle.load(f)
    else:
        train = pd.read_csv(train_path)
        test = pd.read_csv(test_path)
        locs = list(
            set(train['geohashed_start_loc'])
            | set(train['geohashed_end_loc'])
            | set(test['geohashed_start_loc']))
        # Decode each distinct geohash once.
        loc_dict = {loc: Geohash.decode(loc) for loc in locs}
        # Fix: likewise close the file written by pickle.dump.
        with open(dump_path, 'wb+') as f:
            pickle.dump(loc_dict, f)
    return loc_dict
def get_distance(result):
    """Attach a 'distance' column computed with cal_distance from the
    decoded start/end geohash of every row."""
    locations = list(
        set(result['geohashed_start_loc']) | set(result['geohashed_end_loc']))
    if np.nan in locations:
        locations.remove(np.nan)
    # Decode each distinct geohash exactly once.
    decoded = {loc: Geohash.decode(loc) for loc in locations}
    dists = []
    for start, end in result[['geohashed_start_loc', 'geohashed_end_loc']].values:
        lat1, lon1 = decoded[start]
        lat2, lon2 = decoded[end]
        dists.append(cal_distance(lat1, lon1, lat2, lon2))
    result.loc[:, 'distance'] = dists
    return result
def get(self, point, buffer_size=0, multiple=False):
    """
    lookup state and county based on geohash of coordinates from tweet

    Results are memoized per (geohash, buffer_size, multiple), so nearby
    points that fall in the same geohash cell share one lookup.
    """
    lon, lat = point
    geohash = Geohash.encode(lat, lon, precision=self.precision)
    key = (geohash, buffer_size, multiple)
    if key in self.geohash_cache:
        # cache hit on geohash
        self.hit += 1
        return self.geohash_cache[key]
    self.miss += 1  # cache miss on geohash
    # Decode the (precision-truncated) geohash back to coordinates and
    # project the point to ESRI:102005 before the spatial lookup.
    lat, lon = Geohash.decode(geohash)
    proj_point = project([float(lon), float(lat)])
    args = dict(buffer_size=buffer_size, multiple=multiple)
    payload = self.get_object(proj_point, **args)
    self.geohash_cache[key] = payload
    return payload
def gettile():
    """Render the tile-selection page; on POST, plot the vehicles of the
    chosen tile on a map centred near the tile's decoded coordinates.

    NOTE(review): near-duplicate of the formatted gettile() elsewhere in
    this collection.
    """
    vehicle_map=init_vehicle_map
    tilesresult=getdata.tiles()
    # getdata.tiles() returns HTML "<option>" markup on success and an
    # error string otherwise.
    if tilesresult[:8]!="<option>":
        app.logger.debug('Connection error : %s',tilesresult)
        return render_template('gettile.html',vehicle_map=vehicle_map,error=ErrorMessage)
    else:
        alltiles=Markup(tilesresult)
    if request.method == 'POST':
        if request.form['tile']:
            tile=request.form['tile']
            markers_map=getdata.getvehicles_fortile(tile)
            # A non-list return signals a backend failure.
            if not isinstance(markers_map,list):
                app.logger.debug('Connection error : %s',markers_map)
                return render_template('gettile.html',vehicle_map=vehicle_map,error=ErrorMessage)
            nbmvts=len(markers_map)
            # Centre the map slightly south-west of the tile centre.
            mappos=Geohash.decode(tile)
            vehicle_map = Map(
                identifier="view-side",
                lat=str(float(mappos[0])-0.2),
                lng=str(float(mappos[1])-0.2),
                style="height:700px;width:700px;margin:10;",
                zoom=9,
                markers=markers_map
            )
            return render_template('gettile.html', alltiles=alltiles,nbmvts=nbmvts,tile=tile,vehicle_map=vehicle_map)
    return render_template('gettile.html',alltiles=alltiles,vehicle_map=vehicle_map)
def read_data(self):
    """Read training.csv and build one 25x5 demand grid ("image") per
    (day, timestamp) period, returning the list of images in time order.

    Fixes vs. original: `images` was never initialised (NameError on the
    first append unless a global happened to exist), and the final
    period's image was never appended.
    """
    df = pd.read_csv(filepath_or_buffer="training.csv")
    print("Processing data ......")
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%H:%M')
    df.sort_values(["day", "timestamp"], axis=0,
                   ascending=[True, True], inplace=True)
    images = []  # fix: initialise the accumulator
    image = np.zeros(shape=(25, 5)).tolist()
    i = -1
    j = 0
    for index, row in df.iterrows():
        coordinates = Geohash.decode(row['geohash6'])
        i = i + 1
        # Start a new image whenever the (day, timestamp) period changes.
        if i == 0 or day != row['day'] or timestamp != row['timestamp']:
            if i != 0:
                images.append(image)
                image = np.zeros(shape=(25, 5)).tolist()
                j = j + 1
            day = row['day']
            timestamp = row['timestamp']
        # Map decoded (lat, lon) into a grid cell; the offsets assume the
        # data's bounding box (~lat -5.24.., lon 90.6..) — TODO confirm.
        image[int((-(float(coordinates[0]) + 5.24)) * 100)][int(
            (float(coordinates[1]) - 90.6) * 10)] = row['demand']
    if i >= 0:
        images.append(image)  # fix: keep the last period's image
    return images
def geohash_decode(geohash):
    """Thin wrapper over Geohash.decode, returning the (lat, lon) pair."""
    decoded = Geohash.decode(geohash)
    return decoded
#coding:utf8 import Geohash #longitude : 经度 #latitude : 纬度 lng = 116.37439 lat = 39.94758 h = Geohash.encode(lat, lng) print h print Geohash.decode(h)
def get_latlng(self, waypoint_id):
    """Return the (lat, lon) decoded from the stored waypoint's geohash."""
    waypoint = self.__waypoints[waypoint_id]
    return Geohash.decode(waypoint["geohash"])
# At the moment, this data is not used, since the location has been nicely
# clustered in a grid-like fashion thanks to the geohash precision reduction.
# However, it is necessary to cluster the locations when the locations are
# chaotically scattered on the map.
print("Pre-process geo-location data")
num_geo = tr_df['geohash6'].unique().shape[0]
geo_df = pd.DataFrame(data=np.transpose([['' for i in range(num_geo)],
                                         np.zeros(num_geo),
                                         np.zeros(num_geo)]),
                      columns=['geohash6', 'latitude', 'longitude'])

# Obtain the location information for each unique geohash6.
# Fix: use single-step .loc[row, col] assignment. The original chained
# geo_df.loc[i]['geohash6'] = ... writes to a temporary copy and is not
# guaranteed to modify geo_df (pandas chained-indexing / SettingWithCopy),
# and ['latitude', 'longitude'] was passed as a tuple key to a Series.
for i, code in enumerate(tr_df['geohash6'].unique()):
    geo_df.loc[i, 'geohash6'] = code
    geo_df.loc[i, ['latitude', 'longitude']] = gh.decode(code)

# Resulting dataframe of the geo-location data:
'''
     geohash6    latitude    longitude
0    qp03wc      -5.353088   90.653687
1    qp03pn      -5.413513   90.664673
2    qp09sw      -5.325623   90.906372
3    qp0991      -5.353088   90.752563
...
1326 qp03yn      -5.281677   90.620728
1327 qp09v9      -5.309143   90.950317
1328 qp0d45      -5.254211   90.796509
'''

# Save the geo-location data into csv file for easy access when necessary.
geo_df.to_csv(cwd + '/Traffic data/geo_set.csv', sep=',')
print("Loading data ......")
df=pd.read_csv(filepath_or_buffer = "training.csv")
print("Processing data ......")
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%H:%M').dt.time
df.sort_values(["day","timestamp"], axis = 0, ascending = [True,True], inplace = True)
print (df)
# One 25x5 demand grid ("image") is built per (day, timestamp) period.
image= np.zeros(shape=(25,5)).tolist()
i=-1
j=0
# NOTE(review): `images` and `timing` are appended to below but are not
# initialised in this chunk — presumably defined earlier in the script;
# confirm.
for index, row in df.iterrows():
    coordinates=Geohash.decode(row['geohash6'])
    i=i+1
    # A change of (day, timestamp) closes the current period's image.
    if i==0 or day!=row['day'] or timestamp!=row['timestamp']:
        if i!=0:
            # Record the period index in 15-minute slots, then flush.
            timing.append((timestamp.hour * 60 + timestamp.minute)/15)
            images.append(image)
            image= np.zeros(shape=(25,5)).tolist()
            j=j+1
        day=row['day']
        timestamp=row['timestamp']
        # Map decoded (lat, lon) into a grid cell; offsets assume the
        # data's bounding box (~lat -5.24.., lon 90.6..) — TODO confirm.
        image[int((-(float(coordinates[0])+5.24))*100)][int((float(coordinates[1])-90.6)*10)]=row['demand']
    else:
        image[int((-(float(coordinates[0])+5.24))*100)][int((float(coordinates[1])-90.6)*10)]=row['demand']
def geohash_decoding(geohash):
    """Decode *geohash* (WGS-84) and shift the point into GCJ-02
    ("Mars") coordinates via wgs2gcj."""
    lat_wgs, lng_wgs = Geohash.decode(geohash)
    return wgs2gcj(lat_wgs, lng_wgs)
def change_corr(cor, precision=5):
    """Snap a (lat, lon) pair to the centre of its geohash cell.

    Encodes at *precision*, decodes back, and returns [lat, lon] floats.
    """
    lat, lon = cor
    snapped = Geohash.decode(Geohash.encode(lat, lon, precision=precision))
    return [float(snapped[0]), float(snapped[1])]
#coding:utf8 import Geohash #longitude : 经度 #latitude : 纬度 lng = 116.37439 lat = 39.94758 h = Geohash.encode(lat,lng) print h print Geohash.decode(h)
def hash2latlon(geohash):
    """Decode *geohash* into a [lat, lon] list of floats."""
    pair = Geohash.decode(geohash)
    return [float(pair[0]), float(pair[1])]
target_dir = r'D:\lu_work\python\data'
from_file = 'train.csv'
to_file = 'train_lat.csv'

# Convert each order's start/end geohash columns into coordinate pairs,
# keeping only the valuable columns; failed rows are recorded in log.txt.
# Fix: use context managers — the original never closed the input file —
# and avoid shadowing the builtin `input`.
with open(target_dir + os.sep + from_file, 'r') as src, \
        open(target_dir + os.sep + to_file, 'w+') as output, \
        open(target_dir + os.sep + 'log.txt', 'w+') as log:
    i = 0
    # Convert the start/end coordinates line by line.
    for line in src:
        items = line.split(',')
        if i == 0:
            # Header row: pass the selected column names through.
            output.write(items[0] + items[1] + ',' + items[4] + ',' +
                         items[5] + ',' + items[6] + '\n')
        else:
            try:
                # NOTE(review): items[6] is decoded as `start` and items[5]
                # as `end`, the reverse of the header's column order —
                # confirm against the layout of train.csv.
                st_pri = items[6].replace('\n', '')
                en_pri = items[5].replace('\n', '')
                start = Geohash.decode(st_pri)
                end = Geohash.decode(en_pri)
                output.write(items[0] + items[1] + ',' + items[4] + ',' +
                             start[0] + ' ' + start[1] + ',' +
                             end[0] + ' ' + end[1] + '\n')
            except Exception:
                # Fix: narrow the bare except so Ctrl-C still interrupts.
                log.write(items[0] + '订单号转换坐标有错误\n')
        i = i + 1
        print(i)
# Enumerate region ids for the three grid granularities (S/M/B), then map
# every geohash to its region at each granularity.
# NOTE(review): all_regions_*/n_intervals_*/min_lat/lat_size_* etc. are
# defined outside this chunk; the chunk appears truncated mid-loop.
to_region_B = {}
for i in range(n_intervals_S):
    for j in range(n_intervals_S):
        temp_region = i * n_intervals_S + j
        all_regions_S.append(temp_region)
for i in range(n_intervals_M):
    for j in range(n_intervals_M):
        temp_region = i * n_intervals_M + j
        all_regions_M.append(temp_region)
for i in range(n_intervals_B):
    for j in range(n_intervals_B):
        temp_region = i * n_intervals_B + j
        all_regions_B.append(temp_region)
for ghash in all_geo_hashes:
    gps = geo.decode(ghash)
    lat, lng = gps
    lat = float(lat)
    lng = float(lng)
    # Big grid: compute row/column indices, clamping to the grid edges.
    lat_region_B = int((lat - min_lat) / lat_size_B)
    if lat_region_B < 0:
        lat_region_B = 0
    elif lat_region_B >= n_intervals_B:
        lat_region_B = n_intervals_B - 1
    lng_region_B = int((lng - min_lng) / lng_size_B)
    if lng_region_B < 0:
        lng_region_B = 0
    elif lng_region_B >= n_intervals_B:
        lng_region_B = n_intervals_B - 1
    region_B = lat_region_B * n_intervals_B + lng_region_B
    to_region_B[ghash] = region_B
    # Medium grid (visible chunk ends here; presumably continues with the
    # same clamping pattern beyond this view).
    lat_region_M = int((lat - min_lat) / lat_size_M)
    if lat_region_M < 0:
        lat_region_M = 0
def geohashed_loc(loc):
    """Decode *loc* and project it onto an integer grid of metre-scale cells.

    Returns (Lx, Ly) grid indices relative to the reference point
    (lat 39.3492…N, lon 115.9835…E); cell size ≈ 116.97 m x 76.35 m.
    57.2958 ≈ degrees per radian, 6371004 m ≈ Earth radius.
    """
    lat, lon = Geohash.decode(loc)
    # Geohash.decode returns strings in some package versions — convert.
    lat = float(lat)
    lon = float(lon)
    # Fix: east-west metre conversion must scale by cos(latitude); the
    # original used np.cos(lon / 57.2958) (cosine of the longitude).
    Lx = int(6371004.0 * ((lon - 115.98350244732475) / 57.2958)
             * np.cos(lat / 57.2958) / 116.97)
    Ly = int(6371004.0 * ((lat - 39.3492529600276) / 57.2958) / 76.35)
    return (Lx, Ly)
def feature_gen(data_file, file_type):
    """Build per-record demand features at point and S/M/B region level.

    Pass 1 walks the time-sorted records and accumulates, for every
    geohash point and every region, a per-period historical demand list
    (0 filled in for periods without a record). Pass 2 walks the records
    again and emits 55 columns per record: the target demand plus
    attribute features (ids, day-of-week, period) and short/long-term
    historical features at point, S, M and B granularity.

    NOTE(review): Python 2 code (print statements, `Set` from the `sets`
    module); depends on module-level to_region_*/to_id/all_geo_hashes/
    all_regions_*/geo. `file_type` is 'testing' or anything else
    (training).
    """
    data_sort = data_file.sort_values(by=['day', 'timestamp'])
    min_day = min(data_file['day'].unique())
    if file_type == 'testing':
        max_day = max(data_file['day'].unique())
        max_day_period = max(
            data_file[data_file['day'] == max_day]['timestamp'].unique())
    no_tuples, no_dimensions = data_file.shape
    prev_period = -1
    period = -1
    fine_dd = {}        # geohash -> list of per-period demands
    coarse_dd_B = {}    # region -> list of per-period aggregated demands
    coarse_dd_M = {}
    coarse_dd_S = {}
    period_list = []
    temp_dd = {}        # demands accumulated within the current period
    recorded_points = Set()
    temp_coarse_B = {}
    temp_coarse_M = {}
    temp_coarse_S = {}
    # Manually increment an index instead of using the dataframe index,
    # because sorting rearranged the original indices.
    idx = -1
    # Pass 1: build the historical demand lists; points/regions without a
    # record in a period get 0 for that period.
    for row in data_sort.itertuples():
        idx += 1
        geohash = str(row[1])
        day = int(row[2])
        period = int(row[3])
        dd = float(row[4])
        region_B = to_region_B[geohash]
        region_M = to_region_M[geohash]
        region_S = to_region_S[geohash]
        if idx == 0:
            # First record: open the first period's accumulators.
            temp_dd[geohash] = dd
            temp_coarse_B[region_B] = dd
            temp_coarse_M[region_M] = dd
            temp_coarse_S[region_S] = dd
        elif prev_period != period:
            # Period changed: flush the previous period's accumulators.
            # Detect wholly-empty gaps between periods (96 periods/day,
            # so 95 -> 0 is a normal day rollover, not a gap).
            if period - prev_period != 1 and not (period == 0 and prev_period == 95):
                empty_periods = period - prev_period - 1
            else:
                empty_periods = 0
            period_list.append(prev_period)
            for ghash in all_geo_hashes:
                if ghash in temp_dd:
                    if ghash in fine_dd:
                        fine_dd[ghash].append(temp_dd[ghash])
                    else:
                        fine_dd[ghash] = [temp_dd[ghash]]
                if ghash not in temp_dd:
                    if ghash in fine_dd:
                        fine_dd[ghash].append(0.0)
                    else:
                        fine_dd[ghash] = [0.0]
            for reg in all_regions_B:
                if reg in temp_coarse_B:
                    if reg in coarse_dd_B:
                        coarse_dd_B[reg].append(temp_coarse_B[reg])
                    else:
                        coarse_dd_B[reg] = [temp_coarse_B[reg]]
                else:
                    if reg in coarse_dd_B:
                        coarse_dd_B[reg].append(0)
                    else:
                        coarse_dd_B[reg] = [0]
            for reg in all_regions_M:
                if reg in temp_coarse_M:
                    if reg in coarse_dd_M:
                        coarse_dd_M[reg].append(temp_coarse_M[reg])
                    else:
                        coarse_dd_M[reg] = [temp_coarse_M[reg]]
                else:
                    if reg in coarse_dd_M:
                        coarse_dd_M[reg].append(0)
                    else:
                        coarse_dd_M[reg] = [0]
            for reg in all_regions_S:
                if reg in temp_coarse_S:
                    if reg in coarse_dd_S:
                        coarse_dd_S[reg].append(temp_coarse_S[reg])
                    else:
                        coarse_dd_S[reg] = [temp_coarse_S[reg]]
                if reg not in temp_coarse_S:
                    if reg in coarse_dd_S:
                        coarse_dd_S[reg].append(0)
                    else:
                        coarse_dd_S[reg] = [0]
            # Fill wholly-empty intermediate periods with zeros everywhere.
            if empty_periods > 0:
                for i in range(empty_periods):
                    for ghash in all_geo_hashes:
                        fine_dd[ghash].append(0)
                    for reg in coarse_dd_B:
                        coarse_dd_B[reg].append(0)
                    for reg in coarse_dd_M:
                        coarse_dd_M[reg].append(0)
                    for reg in coarse_dd_S:
                        coarse_dd_S[reg].append(0)
                    period_list.append(prev_period + i)
            # Re-open accumulators with the current record's demand.
            temp_dd = {}
            temp_dd[geohash] = dd
            temp_coarse_B = {}
            temp_coarse_B[region_B] = dd
            temp_coarse_M = {}
            temp_coarse_M[region_M] = dd
            temp_coarse_S = {}
            temp_coarse_S[region_S] = dd
        elif idx == no_tuples - 1:
            # Last record of the dataset: absorb it, then flush the final
            # period the same way as above.
            period_list.append(prev_period)
            temp_dd[geohash] = dd
            if region_B in temp_coarse_B:
                temp_coarse_B[region_B] += dd
            else:
                temp_coarse_B[region_B] = dd
            if region_M in temp_coarse_M:
                temp_coarse_M[region_M] += dd
            else:
                temp_coarse_M[region_M] = dd
            if region_S in temp_coarse_S:
                temp_coarse_S[region_S] += dd
            else:
                temp_coarse_S[region_S] = dd
            for ghash in all_geo_hashes:
                if ghash in temp_dd:
                    if ghash in fine_dd:
                        fine_dd[ghash].append(temp_dd[ghash])
                    else:
                        fine_dd[ghash] = [temp_dd[ghash]]
                if ghash not in temp_dd:
                    if ghash in fine_dd:
                        fine_dd[ghash].append(0)
                    else:
                        fine_dd[ghash] = [0]
            for reg in all_regions_B:
                if reg in temp_coarse_B:
                    if reg in coarse_dd_B:
                        coarse_dd_B[reg].append(temp_coarse_B[reg])
                    else:
                        coarse_dd_B[reg] = [temp_coarse_B[reg]]
                else:
                    if reg in coarse_dd_B:
                        coarse_dd_B[reg].append(0)
                    else:
                        coarse_dd_B[reg] = [0]
            for reg in all_regions_M:
                if reg in temp_coarse_M:
                    if reg in coarse_dd_M:
                        coarse_dd_M[reg].append(temp_coarse_M[reg])
                    else:
                        coarse_dd_M[reg] = [temp_coarse_M[reg]]
                else:
                    if reg in coarse_dd_M:
                        coarse_dd_M[reg].append(0)
                    else:
                        coarse_dd_M[reg] = [0]
            for reg in all_regions_S:
                if reg in temp_coarse_S:
                    if reg in coarse_dd_S:
                        coarse_dd_S[reg].append(temp_coarse_S[reg])
                    else:
                        coarse_dd_S[reg] = [temp_coarse_S[reg]]
                else:
                    if reg in coarse_dd_S:
                        coarse_dd_S[reg].append(0)
                    else:
                        coarse_dd_S[reg] = [0]
        else:
            # Same period as the previous record: keep accumulating.
            temp_dd[geohash] = dd
            if region_B in temp_coarse_B:
                temp_coarse_B[region_B] += dd
            else:
                temp_coarse_B[region_B] = dd
            if region_M in temp_coarse_M:
                temp_coarse_M[region_M] += dd
            else:
                temp_coarse_M[region_M] = dd
            if region_S in temp_coarse_S:
                temp_coarse_S[region_S] += dd
            else:
                temp_coarse_S[region_S] = dd
        prev_period = period
        prev_day = day
        prev_row = row
    ## Pass 2: feature generation from the historical lists built above.
    # 54 features + 1 target column. Features: attributes (hash id,
    # region S/M/B, day-of-week, period) plus, at each of the four
    # granularities, demand over each of the past 6 periods, sums over the
    # past 2/4/6 periods, and demand at this period 1 and 2 weeks back
    # plus their average.
    columns_count = 55
    # Extra column 0 holds the target variable; it is split off later
    # during training/testing.
    engineered_data = np.zeros([no_tuples, columns_count])
    data_idx = 0
    idx = -1
    # For testing data, note where the usable history stops.
    if file_type == 'testing':
        total_periods = (max_day - min_day) * 96 + max_day_period + 1
    for row in data_sort.itertuples():
        idx += 1
        feature_list = []
        day = int(row[2])
        day_of_week = day % 7
        period = int(row[3])
        # Skip the first two weeks — not enough history for the
        # week-lagged features.
        if day <= min_day + 13:
            continue
        geohash = str(row[1])
        hash_id = to_id[geohash]
        (lat, lng) = geo.decode(geohash)
        lat = float(lat)
        lng = float(lng)
        dd = float(row[4])
        region_B = int(to_region_B[geohash])
        region_M = int(to_region_M[geohash])
        region_S = int(to_region_S[geohash])
        # Index of this record's period within the historical lists.
        idx_in_list = (day - min_day) * 96 + period
        # Demand over the six preceding periods at each granularity.
        dd_list = fine_dd[geohash][idx_in_list - 6:idx_in_list]
        dd_S_list = coarse_dd_S[region_S][idx_in_list - 6:idx_in_list]
        dd_M_list = coarse_dd_M[region_M][idx_in_list - 6:idx_in_list]
        dd_B_list = coarse_dd_B[region_B][idx_in_list - 6:idx_in_list]
        if file_type == 'testing':
            # Near the end of the test horizon some of the six lags are
            # unavailable; extrapolate them from the closest recorded
            # period.
            if idx_in_list + 1 > total_periods - 4:
                periods_diff = idx_in_list + 1 - total_periods + 4
                for i in range(periods_diff):
                    j = 6 - 4 + i
                    dd_list[j] = dd_list[6 - periods_diff - 1]
                    dd_S_list[j] = dd_S_list[6 - periods_diff - 1]
                    dd_M_list[j] = dd_M_list[6 - periods_diff - 1]
                    dd_B_list[j] = dd_B_list[6 - periods_diff - 1]
            dd_1 = dd_list[5]
            dd_2 = dd_list[4]
            dd_3 = dd_list[3]
            dd_4 = dd_list[2]
            dd_5 = dd_list[1]
            dd_6 = dd_list[0]
            dd_S_1 = dd_S_list[5]
            dd_S_2 = dd_S_list[4]
            dd_S_3 = dd_S_list[3]
            dd_S_4 = dd_S_list[2]
            dd_S_5 = dd_S_list[1]
            dd_S_6 = dd_S_list[0]
            dd_M_1 = dd_M_list[5]
            dd_M_2 = dd_M_list[4]
            dd_M_3 = dd_M_list[3]
            dd_M_4 = dd_M_list[2]
            dd_M_5 = dd_M_list[1]
            dd_M_6 = dd_M_list[0]
            dd_B_1 = dd_B_list[5]
            dd_B_2 = dd_B_list[4]
            dd_B_3 = dd_B_list[3]
            dd_B_4 = dd_B_list[2]
            dd_B_5 = dd_B_list[1]
            dd_B_6 = dd_B_list[0]
        else:
            dd_1 = fine_dd[geohash][idx_in_list - 1]
            dd_2 = fine_dd[geohash][idx_in_list - 2]
            dd_3 = fine_dd[geohash][idx_in_list - 3]
            dd_4 = fine_dd[geohash][idx_in_list - 4]
            dd_5 = fine_dd[geohash][idx_in_list - 5]
            dd_6 = fine_dd[geohash][idx_in_list - 6]
            dd_S_1 = coarse_dd_S[region_S][idx_in_list - 1]
            dd_S_2 = coarse_dd_S[region_S][idx_in_list - 2]
            dd_S_3 = coarse_dd_S[region_S][idx_in_list - 3]
            dd_S_4 = coarse_dd_S[region_S][idx_in_list - 4]
            dd_S_5 = coarse_dd_S[region_S][idx_in_list - 5]
            dd_S_6 = coarse_dd_S[region_S][idx_in_list - 6]
            dd_M_1 = coarse_dd_M[region_M][idx_in_list - 1]
            dd_M_2 = coarse_dd_M[region_M][idx_in_list - 2]
            dd_M_3 = coarse_dd_M[region_M][idx_in_list - 3]
            dd_M_4 = coarse_dd_M[region_M][idx_in_list - 4]
            dd_M_5 = coarse_dd_M[region_M][idx_in_list - 5]
            dd_M_6 = coarse_dd_M[region_M][idx_in_list - 6]
            dd_B_1 = coarse_dd_B[region_B][idx_in_list - 1]
            dd_B_2 = coarse_dd_B[region_B][idx_in_list - 2]
            dd_B_3 = coarse_dd_B[region_B][idx_in_list - 3]
            dd_B_4 = coarse_dd_B[region_B][idx_in_list - 4]
            dd_B_5 = coarse_dd_B[region_B][idx_in_list - 5]
            dd_B_6 = coarse_dd_B[region_B][idx_in_list - 6]
        # Short-term sums and week-lagged demands at point granularity.
        sum6 = dd_1 + dd_2 + dd_3 + dd_4 + dd_5 + dd_6
        sum4 = dd_1 + dd_2 + dd_3 + dd_4
        sum2 = dd_1 + dd_2
        dd_2week = fine_dd[geohash][idx_in_list - 2 * (96 * 7)]
        dd_1week = fine_dd[geohash][idx_in_list - (96 * 7)]
        dd_avg = (dd_2week + dd_1week) / 2.0
        sum_S_6 = dd_S_1 + dd_S_2 + dd_S_3 + dd_S_4 + dd_S_5 + dd_S_6
        sum_S_4 = dd_S_1 + dd_S_2 + dd_S_3 + dd_S_4
        sum_S_2 = dd_S_1 + dd_S_2
        dd_S_2week = coarse_dd_S[region_S][idx_in_list - 2 * (96 * 7)]
        dd_S_1week = coarse_dd_S[region_S][idx_in_list - (96 * 7)]
        dd_S_avg = (dd_S_2week + dd_S_1week) / 2.0
        sum_M_6 = dd_M_1 + dd_M_2 + dd_M_3 + dd_M_4 + dd_M_5 + dd_M_6
        sum_M_4 = dd_M_1 + dd_M_2 + dd_M_3 + dd_M_4
        sum_M_2 = dd_M_1 + dd_M_2
        dd_M_2week = coarse_dd_M[region_M][idx_in_list - 2 * (96 * 7)]
        dd_M_1week = coarse_dd_M[region_M][idx_in_list - (96 * 7)]
        dd_M_avg = (dd_M_2week + dd_M_1week) / 2.0
        sum_B_6 = dd_B_1 + dd_B_2 + dd_B_3 + dd_B_4 + dd_B_5 + dd_B_6
        sum_B_4 = dd_B_1 + dd_B_2 + dd_B_3 + dd_B_4
        sum_B_2 = dd_B_1 + dd_B_2
        dd_B_2week = coarse_dd_B[region_B][idx_in_list - 2 * (96 * 7)]
        dd_B_1week = coarse_dd_B[region_B][idx_in_list - (96 * 7)]
        dd_B_avg = (dd_B_2week + dd_B_1week) / 2.0
        # Emit the tuple: target first, then attributes, then historical
        # features at point/S/M/B granularity.
        feature_list.append(dd)
        feature_list.append(int(hash_id))
        feature_list.append(int(region_S))
        feature_list.append(int(region_M))
        feature_list.append(int(region_B))
        feature_list.append(int(day_of_week))
        feature_list.append(int(period))
        feature_list.append(dd_1)
        feature_list.append(dd_2)
        feature_list.append(dd_3)
        feature_list.append(dd_4)
        feature_list.append(dd_5)
        feature_list.append(dd_6)
        feature_list.append(sum6)
        feature_list.append(sum4)
        feature_list.append(sum2)
        feature_list.append(dd_2week)
        feature_list.append(dd_1week)
        feature_list.append(dd_avg)
        feature_list.append(dd_S_1)
        feature_list.append(dd_S_2)
        feature_list.append(dd_S_3)
        feature_list.append(dd_S_4)
        feature_list.append(dd_S_5)
        feature_list.append(dd_S_6)
        feature_list.append(sum_S_6)
        feature_list.append(sum_S_4)
        feature_list.append(sum_S_2)
        feature_list.append(dd_S_2week)
        feature_list.append(dd_S_1week)
        feature_list.append(dd_S_avg)
        feature_list.append(dd_M_1)
        feature_list.append(dd_M_2)
        feature_list.append(dd_M_3)
        feature_list.append(dd_M_4)
        feature_list.append(dd_M_5)
        feature_list.append(dd_M_6)
        feature_list.append(sum_M_6)
        feature_list.append(sum_M_4)
        feature_list.append(sum_M_2)
        feature_list.append(dd_M_2week)
        feature_list.append(dd_M_1week)
        feature_list.append(dd_M_avg)
        feature_list.append(dd_B_1)
        feature_list.append(dd_B_2)
        feature_list.append(dd_B_3)
        feature_list.append(dd_B_4)
        feature_list.append(dd_B_5)
        feature_list.append(dd_B_6)
        feature_list.append(sum_B_6)
        feature_list.append(sum_B_4)
        feature_list.append(sum_B_2)
        feature_list.append(dd_B_2week)
        feature_list.append(dd_B_1week)
        feature_list.append(dd_B_avg)
        engineered_data[data_idx, :] = feature_list
        data_idx += 1
        sys.stdout.write('\r Progress {:.2f}%'.format(
            (idx + 1) * 100.0 / no_tuples))
        sys.stdout.flush()
    print '\n'
    print 'Saving File \n'
    # Trim the preallocated matrix down to the rows actually filled.
    engineered_data.resize((data_idx, columns_count))
    engineered_data = pd.DataFrame(engineered_data)
    return engineered_data
import Geohash

# Encode a central-Beijing coordinate at precision 5.
encoded = Geohash.encode(39.92324, 116.3906, 5)
print(encoded)

# Decode the cell back to a coordinate pair.
decoded = Geohash.decode('wx4g0')
print(decoded)

# Decode with the cell's error margins included.
detailed = Geohash.decode_exactly('wx4g0')
print(detailed)
#!/usr/bin/env python import sys sys.path.insert(0, "~/.local/lib/python2.6/site-packages/") import codecs sys.stdout = codecs.getwriter('utf8')(sys.stdout) sys.stderr = codecs.getwriter('utf8')(sys.stderr) import Geohash import requests lat, lng = Geohash.decode(sys.argv[1]) lat = float(lat) lng = float(lng) #print '%f, %f' % (lat, lng) url = 'http://nominatim.openstreetmap.org/reverse?format=json&addressdetails=1&lat=%f&lon=%f' % ( lat, lng) r = requests.get(url) json = r.json() addr = json['address'] city = None if 'city' in addr: city = addr['city'] state = addr['state'] country = addr['country'] # // country_code ... .upper() if not city: print '%s, %s' % (state, country) else:
def hash2tag(geohash):
    """Strip trailing '0' padding from *geohash* and decode the remainder."""
    trimmed = geohash.rstrip("0")
    return Geohash.decode(trimmed)