def get_latlon(result, end=True):
    """Attach decoded latitude/longitude columns to ``result``.

    Always decodes the start geohash into 'sloc_lat'/'sloc_lon'.  When
    ``end`` is true, also decodes the end geohash into 'eloc_lat'/'eloc_lon'
    and adds the end-minus-start deltas.  Returns the mutated DataFrame.
    """
    if end:
        decoded_end = result['geohashed_end_loc'].apply(
            lambda code: geohash.decode_exactly(code))
        result['eloc_lat'] = decoded_end.apply(lambda t: float(t[0]))
        result['eloc_lon'] = decoded_end.apply(lambda t: float(t[1]))
    decoded_start = result['geohashed_start_loc'].apply(
        lambda code: geohash.decode_exactly(code))
    result['sloc_lat'] = decoded_start.apply(lambda t: float(t[0]))
    result['sloc_lon'] = decoded_start.apply(lambda t: float(t[1]))
    if end:
        result['eloc_sloc_lat_sub'] = result['eloc_lat'] - result['sloc_lat']
        result['eloc_sloc_lon_sub'] = result['eloc_lon'] - result['sloc_lon']
    return result
def get_latlon(result):
    """Decode start and end geohashes into lat/lon columns plus deltas.

    Adds 'eloc_lat'/'eloc_lon', 'sloc_lat'/'sloc_lon' and the
    'eloc_sloc_*_sub' difference columns; returns the mutated DataFrame.
    """
    for prefix, column in (('eloc', 'geohashed_end_loc'),
                           ('sloc', 'geohashed_start_loc')):
        decoded = result[column].apply(lambda code: Geohash.decode_exactly(code))
        result[prefix + '_lat'] = decoded.apply(lambda t: float(t[0]))
        result[prefix + '_lon'] = decoded.apply(lambda t: float(t[1]))
    result['eloc_sloc_lat_sub'] = result['eloc_lat'] - result['sloc_lat']
    result['eloc_sloc_lon_sub'] = result['eloc_lon'] - result['sloc_lon']
    return result
def get_distance(result):
    """Add 'distance' and 'manhattan' columns for each start/end geohash pair.

    Decodes every distinct geohash once, then computes the two distances per
    row; rows with a missing start or end location get NaN in both columns.
    """
    unique_locs = list(set(result['geohashed_start_loc'])
                       | set(result['geohashed_end_loc']))
    if np.nan in unique_locs:
        unique_locs.remove(np.nan)
    # Decode each distinct geohash exactly once.
    loc_dict = {code: geohash.decode_exactly(code) for code in unique_locs}
    straight = []
    taxi = []
    for start_code, end_code in result[['geohashed_start_loc',
                                        'geohashed_end_loc']].values:
        if start_code is np.nan or end_code is np.nan:
            straight.append(np.nan)
            taxi.append(np.nan)
        else:
            lat_a, lon_a, _, _ = loc_dict[start_code]
            lat_b, lon_b, _, _ = loc_dict[end_code]
            straight.append(
                cal_distance(float(lat_a), float(lon_a),
                             float(lat_b), float(lon_b)))
            taxi.append(
                manhattan(float(lat_a), float(lon_a),
                          float(lat_b), float(lon_b)))
    result.loc[:, 'distance'] = straight
    result.loc[:, 'manhattan'] = taxi
    return result
def indexEvents(x):
    # Index one geohash bucket's events into Elasticsearch.
    # ``x`` looks like {'geohash': <str>, 'data': [<event dict>, ...]} --
    # TODO(review): confirm against the caller.
    geoh = x['geohash']
    # decode_exactly returns (lat, lon, lat_err, lon_err); only the center
    # point of the cell is used.
    lat = Geohash.decode_exactly(geoh)[0]
    lon = Geohash.decode_exactly(geoh)[1]
    for e in x['data']:
        rec = {}
        rec['geoloc'] = {'lat':lat, 'lon': lon}
        rec['geohash'] = geoh
        rec['tags'] = []
        rec['count'] = e['count']
        rec['datetime'] = e['event']
        rec['images'] = e['likes']
        # Python 2 print statement (this module is Python 2).
        print e
        for tag in e['tags'].keys():
            rec['tags'].append({"name":tag, "count":e['tags'][tag]})
        # One document per event; index/doc_type are hard-coded.
        es.index(index='instagram_events_j_final',doc_type='dc',body=rec)
def hotspots(self, family, name):
    """Return (lat, lon) centers of the three busiest geohash cells.

    Queries the ``family`` index (optionally filtered by ``name``) over the
    lookback window and aggregates hits into precision-7 geohash buckets.
    """
    search = Search(using=self.client, index=family)
    if name:
        search = search.filter('term', name=name)
    search = search.filter('range', timestamp={'gte': self.lookback})
    search.aggs.bucket('hotspot', 'geohash_grid', field='location', precision=7)
    top_buckets = search[0].execute().aggregations['hotspot']['buckets'][:3]
    # decode_exactly -> (lat, lon, lat_err, lon_err); keep the center pair.
    return [Geohash.decode_exactly(bucket['key'])[:2] for bucket in top_buckets]
def geogrid(self, query='*', region='*', min_published_on=None, max_published_on=None, weight=False):
    """Run a geohash-grid aggregation and decorate buckets with lat/lon.

    Builds an Elasticsearch payload filtered by publication-date range,
    optionally adds a 'weight' sum sub-aggregation, applies the theme/region
    helpers, executes via ``self.request``, and writes 'lat'/'lon' onto each
    returned bucket (decoded from the bucket's geohash key).
    """
    payload = {
        "size": 1000,
        "query": {
            "bool": {
                "must": {
                    "query_string": {
                        "fields": ["title", "description"],
                        # NOTE(review): the query string is hard-coded to "*";
                        # the ``query`` parameter only feeds _add_theme below.
                        "query": "*",
                        "analyze_wildcard": True
                    }
                },
                "filter": {
                    "bool": {
                        "must": [
                            {
                                "range": {
                                    "published_on": {
                                        "format": "epoch_millis",
                                        # Fall back to instance defaults when
                                        # no explicit bounds are given.
                                        "gte": min_published_on or self._default_min_published_on,
                                        "lte": max_published_on or self._default_max_published_on
                                    }
                                }
                            }
                        ]
                    }
                }
            }
        },
        "aggs": {
            "geogrid": {
                "geohash_grid": {
                    "field": "geo",
                    "precision": 6
                }
            }
        }
    }
    if weight:
        # Sub-aggregation: sum of 'geoweight' per geohash cell.
        payload['aggs']['geogrid']['aggs'] = {
            "weight": {
                "sum": {
                    "field": "geoweight"
                }
            }
        }
    self._add_theme(payload, query)
    self._add_region(payload, region)
    geogrid = self.request(payload)
    for row in geogrid['aggregations']['geogrid']['buckets']:
        # decode_exactly -> (lat, lon, lat_err, lon_err); errors discarded.
        lat, lon, _, _ = Geohash.decode_exactly(row['key'])
        row['lat'] = lat
        row['lon'] = lon
    return geogrid
def get_geohash():
    """Decode every distinct geohash in train/test and inspect coordinate gaps.

    Collects all start/end geohashes from train.csv and test.csv, decodes each
    distinct code with high precision, sorts the coordinates, and prints the
    pairwise gaps (used later to pick an encoding interval around predictions).
    """
    x = []
    y = []
    data1 = pd.read_csv("train.csv")
    data2 = pd.read_csv("test.csv")
    hash1 = list(data1["geohashed_start_loc"])
    hash2 = list(data1["geohashed_end_loc"])
    hash3 = list(data2["geohashed_start_loc"])
    #print(hash1[:100],"\n",hash2[:100],"\n",hash3[:100],"\n",(len(hash1)+len(hash2)+len(hash3)))
    _hash = []
    _hash.extend(hash1)
    _hash.extend(hash2)
    _hash.extend(hash3)
    print(_hash.__len__())
    _hash = list(set(_hash))
    print("hash:", len(_hash))
    # Problem encountered here: one lat/lon pair surprisingly maps to multiple
    # geohash values -- keying a dict by lat/lon gives only ~5890 entries,
    # while keying by geohash gives ~110k entries.
    # The cause is that the decoded precision was too low!
    # In other words, the initial data preprocessing should have used the
    # higher-precision decode_exactly from the start!
    # No need to change it now, though; improving precision further would
    # require reprocessing the data.
    # TODO: reprocess the dataset with gh.decode_exactly for higher precision;
    # the raw data cannot be changed because Bayesian classification follows.
    # TODO: so the recommended approach is to convert only start_loc and
    # end_loc to higher-precision values for training.
    for i in _hash:
        t = gh.decode_exactly(i)
        x.append(t[0])
        y.append(t[1])
    x.sort()
    y.sort()
    print(x[0:100])
    print(y[0:100])
    between_x = []
    between_y = []
    # Take pairwise differences of the sorted high-precision coordinates to
    # find the smallest gap; codes are later sampled within that gap around
    # the predicted value.
    for i in range(len(x) - 1):
        between_x.append(x[i] - x[i + 1])
        between_y.append(y[i] - y[i + 1])
    print(between_x[0:100])
    print(between_y[0:100])
    print(np.mean(np.array(between_x)))
    print(np.mean(np.array(between_y)))
def get_loc_dict():
    """Build (and cache on disk) a mapping geohash -> (lat, lon).

    On a cache hit, loads the pickled dict; otherwise decodes every distinct
    start/end geohash found in the train and test CSVs and pickles the result.

    Returns:
        dict: geohash string -> (lat, lon) decoded cell-center coordinates.
    """
    dump_path = cache_path + 'loc_dict.pkl'
    if os.path.exists(dump_path):
        # Bug fix: the original passed an anonymous open(...) handle straight
        # to pickle.load/pickle.dump and never closed it; use context
        # managers so the file is always released.
        with open(dump_path, 'rb') as f:
            loc_dict = pickle.load(f)
    else:
        train = pd.read_csv(train_path)
        test = pd.read_csv(test_path)
        locs = list(set(train['geohashed_start_loc'])
                    | set(train['geohashed_end_loc'])
                    | set(test['geohashed_start_loc']))
        # decode_exactly -> (lat, lon, lat_err, lon_err); keep the center.
        deloc = [Geohash.decode_exactly(loc)[:2] for loc in locs]
        loc_dict = dict(zip(locs, deloc))
        with open(dump_path, 'wb') as f:
            pickle.dump(loc_dict, f)
    return loc_dict
def get_hash(filename, col_name):
    """Decode every geohash in column ``col_name`` of CSV ``filename``.

    Returns two lists: the decoded latitudes and longitudes, in row order
    (also printed, as in the original).
    """
    frame = pd.read_csv(filename)
    codes = frame[col_name]
    lats = []
    lons = []
    # tqdm.trange gives a progress bar over the row indices.
    for row in tqdm.trange(len(codes)):
        decoded = gh.decode_exactly(codes[row])
        lats.append(decoded[0])
        lons.append(decoded[1])
    print(lats, lons)
    return lats, lons
def get_distance(result):
    """Compute per-row great-circle and manhattan distances between geohashes.

    Each distinct geohash is decoded once; rows where either endpoint is
    missing receive NaN.  Writes 'distance' and 'manhattan' columns and
    returns the mutated DataFrame.
    """
    codes = set(result['geohashed_start_loc'])
    codes |= set(result['geohashed_end_loc'])
    codes = list(codes)
    if np.nan in codes:
        codes.remove(np.nan)
    decoded = [geohash.decode_exactly(code) for code in codes]
    loc_dict = dict(zip(codes, decoded))
    straight = []
    taxi = []
    for pair in result[['geohashed_start_loc', 'geohashed_end_loc']].values:
        if pair[0] is not np.nan and pair[1] is not np.nan:
            lat_a, lon_a = loc_dict[pair[0]][0], loc_dict[pair[0]][1]
            lat_b, lon_b = loc_dict[pair[1]][0], loc_dict[pair[1]][1]
            straight.append(cal_distance(float(lat_a), float(lon_a),
                                         float(lat_b), float(lon_b)))
            taxi.append(manhattan(float(lat_a), float(lon_a),
                                  float(lat_b), float(lon_b)))
        else:
            straight.append(np.nan)
            taxi.append(np.nan)
    result.loc[:, 'distance'] = straight
    result.loc[:, 'manhattan'] = taxi
    return result
def get_eloc_latlon(result):
    """Add 'eloc_lat'/'eloc_lon' columns decoded from the end geohash."""
    coords = result['geohashed_end_loc'].apply(
        lambda code: geohash.decode_exactly(code)[:2])
    result['eloc_lat'] = coords.apply(lambda latlon: float(latlon[0]))
    result['eloc_lon'] = coords.apply(lambda latlon: float(latlon[1]))
    return result
def get_x(pos):
    """Return the first decoded coordinate (latitude) of geohash ``pos``."""
    # decode_exactly returns a 4-tuple; unpack to assert that shape.
    lat, lon, lat_err, lon_err = geo.decode_exactly(str(pos))
    return lat
# First, convert the start time into datetime-derived features.
import datetime

train.loc[:, 'starttime'] = pd.to_datetime(train.starttime)
train['weekday_time'] = train.starttime.dt.weekday
train['hour_time'] = train.starttime.dt.hour
train['minute_time'] = train.starttime.dt.minute

# In[25]:

# Decode the geohashed locations into latitude/longitude (plus error bounds).
import Geohash

start = list(train.geohashed_start_loc)
start_jw = list(map(lambda x: Geohash.decode_exactly(x), start))
end = list(train.geohashed_end_loc)
end_jw = list(map(lambda x: Geohash.decode_exactly(x), end))
train[['start_lat', 'start_lon', 'start_lat_exactly', 'start_lon_exactly']] = pd.DataFrame(
    start_jw,
    columns=['start_lat', 'start_lon', 'start_lat_exactly', 'start_lon_exactly'])
# Bug fix: the end-location frame was built with columns
# ['end_lat', 'end_lon', 'exactly_lat', 'exactly_lon'], which do not match
# the target labels being assigned; pandas aligns DataFrame-to-DataFrame
# assignment by column label, so the two mismatched columns came back NaN.
train[['end_lat', 'end_lon', 'end_lat_exactly', 'end_lon_exactly']] = pd.DataFrame(
    end_jw,
    columns=['end_lat', 'end_lon', 'end_lat_exactly', 'end_lon_exactly'])

# In[26]:
import Geohash

# Encode a coordinate pair into a 5-character geohash.
geo_code = Geohash.encode(39.92324, 116.3906, 5)
print(geo_code)

# Decode back to the approximate cell center.
cell_center = Geohash.decode('wx4g0')
print(cell_center)

# Full-precision decode: (lat, lon, lat_err, lon_err).
full_point = Geohash.decode_exactly('wx4g0')
print(full_point)
def get_eloc_latlon(result):
    """Decode the end geohash and store its lat/lon as float columns."""
    end_pairs = result['geohashed_end_loc'].map(
        lambda g: geohash.decode_exactly(g)[:2])
    result['eloc_lat'] = end_pairs.map(lambda p: float(p[0]))
    result['eloc_lon'] = end_pairs.map(lambda p: float(p[1]))
    return result
# NOTE(review): this chunk is truncated -- the query-body dict opened before
# this point; the braces below close it.
        }
    }
}
}

# 'count' search_type returns only aggregation results (no hits).
res = es.search(index=fromIndex, search_type="count", body=body, request_timeout=3600)
buckets = res["aggregations"]["mygrid"]["buckets"]
# prettyPrint(buckets)
bulkActions = []
for geobucket in buckets:
    # decode_exactly -> (lat, lon, lat_err, lon_err)
    geo = Geohash.decode_exactly(geobucket['key'])
    # Python 2 print statement (this module is Python 2).
    print geobucket['key'], geo[0], geo[1]
    for bucket in geobucket['ingridhist']['buckets']:
        m = moment.unix(bucket['key']).date  ##.add(hours=4).date
        doc = {
            "@timestamp": bucket['key_as_string'],
            # [lon, lat] ordering, as Elasticsearch geo_point arrays expect.
            "startLocation": [geo[1], geo[0]]
        }
        if ("the_count" in bucket):
            doc["the_count"] = bucket["the_count"]["value"]
        if ("prediction" in bucket):
            doc["prediction"] = bucket["prediction"]["value"]
        if ('prediction' in doc and 'the_count' in doc):
            # Surprise score: scaled positive excess of count over prediction.
            doc['surprise'] = max(
                0, 10.0 * (doc["the_count"] - doc["prediction"]) / doc["prediction"])
# NOTE(review): this chunk is truncated at both ends -- the query-body dict
# opened before this point (closed by the braces below), and the trailing
# 'action' dict continues past the end of this chunk.
            }
        }
    }
}
}
}
}

# 'count' search_type returns only aggregation results (no hits).
res = es.search(index=fromIndex, search_type="count",body=body, request_timeout=3600)
buckets = res["aggregations"]["mygrid"]["buckets"]
# prettyPrint(buckets)
bulkActions = []
for geobucket in buckets:
    # decode_exactly -> (lat, lon, lat_err, lon_err)
    geo = Geohash.decode_exactly(geobucket['key'])
    # Python 2 print statement (this module is Python 2).
    print geobucket['key'], geo[0], geo[1]
    for bucket in geobucket['ingridhist']['buckets']:
        m = moment.unix(bucket['key']).date  ##.add(hours=4).date
        doc = {
            "@timestamp": bucket['key_as_string'],
            # [lon, lat] ordering, as Elasticsearch geo_point arrays expect.
            "startLocation": [geo[1], geo[0]]
        }
        if("the_count" in bucket):
            doc["the_count"] = bucket["the_count"]["value"]
        if("prediction" in bucket):
            doc["prediction"] = bucket["prediction"]["value"]
        if('prediction' in doc and 'the_count' in doc):
            # Surprise score: scaled positive excess of count over prediction.
            doc['surprise'] = max(0, 10.0 * (doc["the_count"] - doc["prediction"]) / doc["prediction"])
        action = {
            "_index": toIndex,
def get_sloc_latlon(result):
    """Add 'sloc_lat'/'sloc_lon' columns decoded from the start geohash."""
    start_coords = result['geohashed_start_loc'].apply(
        lambda code: geohash.decode_exactly(code)[:2])
    result['sloc_lat'] = start_coords.apply(lambda ll: float(ll[0]))
    result['sloc_lon'] = start_coords.apply(lambda ll: float(ll[1]))
    return result
def get_sloc_latlon(result):
    """Decode the start geohash and store its lat/lon as float columns."""
    def _center(code):
        # decode_exactly -> (lat, lon, lat_err, lon_err); keep the center.
        return geohash.decode_exactly(code)[:2]

    pairs = result['geohashed_start_loc'].apply(_center)
    result['sloc_lat'] = pairs.apply(lambda p: float(p[0]))
    result['sloc_lon'] = pairs.apply(lambda p: float(p[1]))
    return result
import Geohash

# works fine
# print(Geohash.encode(45.0602750,7.6548340))

decoded = Geohash.decode_exactly("u0j2q4yp4s1f")
# NOTE(review): decode_exactly returns (lat, lon, lat_err, lon_err), so
# indices 1 and 2 are longitude and the latitude error bound -- confirm
# that printing these two (rather than lat/lon) is intended.
print(decoded[1], " ", decoded[2])