Example 1
def process():

    update_list = []
    with open('op-go.csv') as csv_file:
        reader = csv.DictReader(csv_file, fieldnames=fields)
        line_count = 0

        for row in reader:
            if line_count == 0:
                row.pop("geohash")
                row.update({'lat': 'lat'})
                row.update({'lon': 'lon'})
                line_count += 1
                update_list.append(row)
                continue
            else:
                hashed = row["geohash"]
                lat = Geohash.decode(hashed)[0]
                lon = Geohash.decode(hashed)[1]
                row.pop("geohash")
                row.update({'lat': lat})
                row.update({'lon': lon})

            update_list.append(row)

    with open('converted.csv', 'w') as myfile:
        wr = csv.writer(myfile)
        for row in update_list:
            wr.writerow(dict(row).values())
Example 2
def decode_geohash(data):
    result = data.copy()
    start_loc = np.array(result['geohashed_start_loc'].apply(
        lambda x: Geohash.decode(x)).tolist())
    result['start_loc_lat'] = start_loc[:, 0]
    result['start_loc_lon'] = start_loc[:, 1]
    if 'geohashed_end_loc' in data.columns:
        end_loc = np.array(result['geohashed_end_loc'].apply(
            lambda x: Geohash.decode(x)).tolist())
        result['end_loc_lat'] = end_loc[:, 0]
        result['end_loc_lon'] = end_loc[:, 1]
    return result
Example 3
def get_loc_matrix():
    result_path = cache_path + 'loc_matrix.hdf'
    if os.path.exists(result_path):
        result = pd.read_hdf(result_path, 'w')
    else:
        train = pd.read_csv(train_path)
        test = pd.read_csv(test_path)
        end_loc = pd.DataFrame(
            {'geohashed_end_loc': list(train['geohashed_end_loc'].unique())})
        end_loc['end_loc_lat'] = end_loc['geohashed_end_loc'].apply(
            lambda x: Geohash.decode(x)[0])
        end_loc['end_loc_lon'] = end_loc['geohashed_end_loc'].apply(
            lambda x: Geohash.decode(x)[1])
        end_loc['end_loc_lat_box'] = end_loc['end_loc_lat'].apply(
            lambda x: x // 0.003)
        end_loc['end_loc_lon_box'] = end_loc['end_loc_lon'].apply(
            lambda x: x // 0.00375)
        count_of_loc = train.groupby('geohashed_end_loc',
                                     as_index=False)['geohashed_end_loc'].agg(
                                         {'count_of_loc': 'count'})
        end_loc = pd.merge(end_loc,
                           count_of_loc,
                           on='geohashed_end_loc',
                           how='left')
        max_index = end_loc.groupby(
            ['end_loc_lat_box',
             'end_loc_lon_box']).apply(lambda x: x['count_of_loc'].argmax())
        end_loc = end_loc.loc[
            max_index.tolist(),
            ['geohashed_end_loc', 'end_loc_lat', 'end_loc_lon']]
        end_loc.sort_values('end_loc_lat', inplace=True)
        end_loc = end_loc.values
        start_loc = pd.DataFrame({
            'geohashed_start_loc':
            list(pd.concat([train, test])['geohashed_start_loc'].unique())
        })
        start_loc['start_loc_lat'] = start_loc['geohashed_start_loc'].apply(
            lambda x: Geohash.decode(x)[0])
        start_loc['start_loc_lon'] = start_loc['geohashed_start_loc'].apply(
            lambda x: Geohash.decode(x)[1])
        start_loc = start_loc.values
        start_end_loc_arr = []
        for i in start_loc:
            for j in end_loc:
                if (np.abs(i[1] - j[1]) < 0.012) & (np.abs(i[2] - j[2]) <
                                                    0.015):
                    start_end_loc_arr.append([i[0], j[0]])
        result = pd.DataFrame(
            start_end_loc_arr,
            columns=['geohashed_start_loc', 'geohashed_end_loc'])
        result.to_hdf(result_path, 'w', complib='blosc', complevel=5)
    return result
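A note on the thresholds used above: 0.012 degrees of latitude is roughly 1.3 km, and at a latitude of about 40° N, 0.015 degrees of longitude is also roughly 1.3 km, so the nested loop keeps only start/end pairs that fall inside a box of about 1.3 km per side. A small sketch of that arithmetic (the 111.32 km-per-degree figure and the 40° reference latitude are assumptions for illustration, not values from the original code):

import math

KM_PER_DEG_LAT = 111.32                  # approximate kilometres per degree of latitude
lat_thresh, lon_thresh = 0.012, 0.015    # thresholds from get_loc_matrix()
ref_lat = 40.0                           # assumed reference latitude (Beijing area)

lat_km = lat_thresh * KM_PER_DEG_LAT
lon_km = lon_thresh * KM_PER_DEG_LAT * math.cos(math.radians(ref_lat))
print(lat_km, lon_km)                    # roughly 1.34 km and 1.28 km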
Example 4
def get_distance(result):
    result_path = cache_path + 'distance_feat_%d.hdf' % (result.shape[0])
    if os.path.exists(result_path) & flag:
        temp = pd.read_hdf(result_path, 'w')
        result = pd.merge(result, temp, on=['orderid', 'geohashed_end_loc'], how='left')
    else:
        locs = list(set(result['geohashed_start_loc']) | set(result['geohashed_end_loc']))
        if np.nan in locs:
            locs.remove(np.nan)
        deloc = []
        for loc in locs:
            deloc.append(Geohash.decode(loc))
        loc_dict = dict(zip(locs,deloc))
        geohashed_loc = result[['geohashed_start_loc','geohashed_end_loc']].values
        distance = []
        mht_distance = []
        for i in geohashed_loc:
            lat1, lon1 = loc_dict[i[0]]
            lat2, lon2 = loc_dict[i[1]]
            distance.append(cal_distance(lat1,lon1,lat2,lon2))
            mht_distance.append(cal_mht_distance(lat1,lon1,lat2,lon2))
        result['distance'] = distance
        result['mht_distance'] = mht_distance
        result[['orderid','geohashed_end_loc','distance','mht_distance']].to_hdf(result_path, 'w', complib='blosc', complevel=5)
    return result
Example 5
def get_distance(result):
    locs = list(
        set(result['geohashed_start_loc']) | set(result['geohashed_end_loc']))
    if np.nan in locs:
        locs.remove(np.nan)
    deloc = []
    for loc in locs:
        deloc.append(Geohash.decode(loc))
    loc_dict = dict(zip(locs, deloc))
    geohashed_loc = result[['geohashed_start_loc', 'geohashed_end_loc']].values
    distance = []
    manhattan_distance = []
    for i in geohashed_loc:
        if i[0] is not np.nan and i[1] is not np.nan:
            lat1, lon1 = loc_dict[i[0]]
            lat2, lon2 = loc_dict[i[1]]
            distance.append(
                cal_distance(float(lat1), float(lon1), float(lat2),
                             float(lon2)))
            manhattan_distance.append(
                manhattan(float(lat1), float(lon1), float(lat2), float(lon2)))
        else:
            distance.append(np.nan)
            manhattan_distance.append(np.nan)
    result.loc[:, 'distance'] = distance
    result.loc[:, 'manhattan'] = manhattan_distance
    return result
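The distance examples above call helpers such as cal_distance, cal_mht_distance and manhattan that are defined elsewhere in their source repositories. A minimal sketch of what such helpers commonly look like, assuming a haversine great-circle distance in metres and a Manhattan distance built from its north-south and east-west legs (the names and formulation are assumptions, not the original implementations):

import math

def cal_distance(lat1, lon1, lat2, lon2):
    # Haversine great-circle distance in metres (sketch).
    r = 6371000.0
    p1, p2 = math.radians(lat1), math.radians(lat2)
    dp, dl = math.radians(lat2 - lat1), math.radians(lon2 - lon1)
    a = math.sin(dp / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dl / 2) ** 2
    return 2 * r * math.asin(math.sqrt(a))

def manhattan(lat1, lon1, lat2, lon2):
    # "Manhattan" distance as the sum of the north-south and east-west legs (sketch).
    return (cal_distance(lat1, lon1, lat2, lon1) +
            cal_distance(lat2, lon1, lat2, lon2))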
Example 6
 def put(self):
     hour = int(request.form['hour'])
     date = request.form['date']
     prcp = float(request.form['prcp'])*100
     snow = float(request.form['snow']) * 10
     tmax = float(request.form['tmax']) * 10
     tmin = float(request.form['tmin']) * 10
     date = pd.to_datetime(date)
     with open(os.path.join(APP_STATIC, 'uniquegeohash.pkl'), 'rb') as f:
         uniquegeohash = dill.load(f)
     with open(os.path.join(APP_STATIC, 'predict_pickup_density.pkl'), 'rb') as f:
         model = dill.load(f)
     x_dict = [{"pickup_geohash": geostr, "hour": hour, "dayofweek": date.dayofweek, 'month': date.month,'PRCP':prcp,'SNOW':snow,'TMAX':tmax,'TMIN':tmin} for geostr in uniquegeohash]
     x_df = pd.DataFrame(x_dict)
     y = model.predict(x_df)
     geodecode = [Geohash.decode(geocode) for geocode in uniquegeohash]
     yzipgeo = zip(y, geodecode)
     sortedlist = sorted(yzipgeo, key=lambda x: -x[0])
     top10address = []
     top10dict = {}
     for y, geodecode in sortedlist[0:50]:
         key = ",".join(geodecode)
         top10dict[key] = top10dict.get(key,0) + y
     top10res = []
     for key in top10dict:
         temptuple = (float(key.split(",")[0]),float(key.split(",")[1]))
         top10res.append([top10dict[key],temptuple])
     top10res = sorted(top10res,key=lambda x:-x[0])
     top10res = top10res[0:10] if len(top10res) > 10 else top10res
     for u,geodecode in top10res:
         g = geocoder.google([geodecode[0], geodecode[1]], method='reverse').address
         top10address.append(g)
     return {"top10": top10res,"top10address":top10address}
    def test_basic(self):

        hash = Geohash.encode(self.family[0], self.family[1], precision=20)

        (lats, lons) = Geohash.decode(hash)

        assert float(lats) == self.family[0]
        assert float(lons) == self.family[1]
Example 8
def geohash_decode(file, test=False):  # runs in roughly half an hour
    data = pd.read_csv(file)
    if test:
        x = []
        y = []
        geohash = data["geohashed_start_loc"]
        b = len(geohash)
        for i in range(len(geohash)):

            print(i, b)
            a = gh.decode(geohash[i])
            x.append(a[0])
            y.append(a[1])
        data = data.drop(labels="geohashed_start_loc", axis=1)
        data.insert(data.shape[1], "start_loc_x", pd.Series(x))
        data.insert(data.shape[1], "start_loc_y", pd.Series(y))
        data.to_csv("test_1.csv", index=False)
    else:
        x1 = []
        y1 = []
        x2 = []
        y2 = []
        geohash1 = data["geohashed_start_loc"]
        geohash2 = data["geohashed_end_loc"]
        b = len(geohash1)
        for i in range(len(geohash1)):
            print(i, b)
            a = gh.decode(geohash1[i])
            x1.append(a[0])
            y1.append(a[1])
        b = len(geohash2)
        for i in range(len(geohash2)):
            print(i, b)
            a = gh.decode(geohash2[i])
            x2.append(a[0])
            y2.append(a[1])
        data = data.drop(labels="geohashed_start_loc", axis=1)
        data = data.drop(labels="geohashed_end_loc", axis=1)
        data.insert(data.shape[1], "start_loc_x", pd.Series(x1))
        data.insert(data.shape[1], "start_loc_y", pd.Series(y1))
        data.insert(data.shape[1], "end_loc_x", pd.Series(x2))
        data.insert(data.shape[1], "end_loc_y", pd.Series(y2))
        data.to_csv("train_1.csv", index=False)
Example 9
 def show_station_coordinfo(self):
     for channel in self.station_coordinfo.keys():
         info = self.station_coordinfo[channel]
         logger.debug("%s (%f, %f, %f) %s | %s",
                      channel,
                      info['latitude'],
                      info['longitude'],
                      info['elevation'],
                      info['geohash'],
                      Geohash.decode(info['geohash']))
Example 10
def geohash_decoding(geohash):
    '''
    Convert a geohash back to a pair of GCJ-02 ("Mars") coordinates

    :param geohash: the geohash to convert
    :return: (GCJ-02 latitude, GCJ-02 longitude)
    '''

    wgsLat, wgsLng = Geohash.decode(geohash)
    return wgs2gcj(wgsLat, wgsLng)
Example 11
def gettile():

    vehicle_map = init_vehicle_map

    tilesresult = getdata.tiles()

    if tilesresult[:8] != "<option>":
        app.logger.debug('Connection error : %s', tilesresult)
        return render_template('gettile.html',
                               vehicle_map=vehicle_map,
                               error=ErrorMessage)
    else:
        alltiles = Markup(tilesresult)

    if request.method == 'POST':

        if request.form['tile']:

            tile = request.form['tile']

            markers_map = getdata.getvehicles_fortile(tile)

            #app.logger.debug('Debugging KILLRTAXI : %s',markers_map)

            if not isinstance(markers_map, list):
                app.logger.debug('Connection error : %s', markers_map)
                return render_template('gettile.html',
                                       vehicle_map=vehicle_map,
                                       error=ErrorMessage)

            nbmvts = len(markers_map)

            mappos = Geohash.decode(tile)

            vehicle_map = Map(
                identifier="view-side",
                lat=str(float(mappos[0]) - 0.2),
                lng=str(float(mappos[1]) - 0.2),
                style="height:700px;width:700px;margin:10;",
                zoom=9,
                markers=markers_map
                #markers=[(54.96848201388808, 0.39963558097359564),(54.968382013888075, -0.39953558097359565)]
            )

            return render_template('gettile.html',
                                   alltiles=alltiles,
                                   nbmvts=nbmvts,
                                   tile=tile,
                                   vehicle_map=vehicle_map)

    return render_template('gettile.html',
                           alltiles=alltiles,
                           vehicle_map=vehicle_map)
Example 12
def get_loc_dict():
    dump_path = cache_path + 'loc_dict.pkl'
    if os.path.exists(dump_path):
        loc_dict = pickle.load(open(dump_path, 'rb+'))
    else:
        train = pd.read_csv(train_path)
        test = pd.read_csv(test_path)
        locs = list(
            set(train['geohashed_start_loc']) | set(train['geohashed_end_loc'])
            | set(test['geohashed_start_loc']))
        deloc = []
        for loc in locs:
            deloc.append(Geohash.decode(loc))
        loc_dict = dict(zip(locs, deloc))
        pickle.dump(loc_dict, open(dump_path, 'wb+'))
    return loc_dict
Example 13
def get_distance(result):
    locs = list(
        set(result['geohashed_start_loc']) | set(result['geohashed_end_loc']))
    if np.nan in locs:
        locs.remove(np.nan)
    deloc = []
    for loc in locs:
        deloc.append(Geohash.decode(loc))
    loc_dict = dict(zip(locs, deloc))
    geohashed_loc = result[['geohashed_start_loc', 'geohashed_end_loc']].values
    distance = []
    for i in geohashed_loc:
        lat1, lon1 = loc_dict[i[0]]
        lat2, lon2 = loc_dict[i[1]]
        distance.append(cal_distance(lat1, lon1, lat2, lon2))
    result.loc[:, 'distance'] = distance
    return result
Example 14
 def get(self, point, buffer_size=0, multiple=False):
     """ lookup state and county based on geohash of coordinates from tweet """
     lon, lat = point
     geohash = Geohash.encode(lat, lon, precision=self.precision)
     key = (geohash, buffer_size, multiple)
     if key in self.geohash_cache:
         # cache hit on geohash
         self.hit += 1
         #print self.hit, self.miss
         return self.geohash_cache[key]
     self.miss += 1
     # cache miss on geohash
     # project point to ESRI:102005
     lat, lon = Geohash.decode(geohash)
     proj_point = project([float(lon), float(lat)])
     args = dict(buffer_size=buffer_size, multiple=multiple)
     payload = self.get_object(proj_point, **args)
     self.geohash_cache[key] = payload
     return payload
Example 16
def gettile():

    vehicle_map=init_vehicle_map

    tilesresult=getdata.tiles()

    if tilesresult[:8]!="<option>":
        app.logger.debug('Connection error : %s',tilesresult)
        return render_template('gettile.html',vehicle_map=vehicle_map,error=ErrorMessage)
    else:
        alltiles=Markup(tilesresult)

    if request.method == 'POST':

        if request.form['tile']:

            tile=request.form['tile']

            markers_map=getdata.getvehicles_fortile(tile)

            #app.logger.debug('Debugging KILLRTAXI : %s',markers_map)

            if not isinstance(markers_map,list):
                app.logger.debug('Connection error : %s',markers_map)
                return render_template('gettile.html',vehicle_map=vehicle_map,error=ErrorMessage)

            nbmvts=len(markers_map)

            mappos=Geohash.decode(tile)

            vehicle_map = Map(
                identifier="view-side",
                lat=str(float(mappos[0])-0.2),
                lng=str(float(mappos[1])-0.2),
                style="height:700px;width:700px;margin:10;",
                zoom=9,
                markers=markers_map
                #markers=[(54.96848201388808, 0.39963558097359564),(54.968382013888075, -0.39953558097359565)]
            )

            return render_template('gettile.html', alltiles=alltiles,nbmvts=nbmvts,tile=tile,vehicle_map=vehicle_map)

    return render_template('gettile.html',alltiles=alltiles,vehicle_map=vehicle_map)
Example 17
    def read_data(self):

        df = pd.read_csv(filepath_or_buffer="training.csv")
        print("Processing data ......")
        df['timestamp'] = pd.to_datetime(df['timestamp'], format='%H:%M')
        df.sort_values(["day", "timestamp"],
                       axis=0,
                       ascending=[True, True],
                       inplace=True)
        #print (df)

        images = []
        image = np.zeros(shape=(25, 5)).tolist()
        #print(image)
        i = -1
        j = 0
        for index, row in df.iterrows():
            #print(row["day"],row["timestamp"])
            coordinates = Geohash.decode(row['geohash6'])
            i = i + 1
            if i == 0 or day != row['day'] or timestamp != row['timestamp']:
                if i != 0:
                    images.append(image)
                    image = np.zeros(shape=(25, 5)).tolist()
                    j = j + 1
                day = row['day']
                timestamp = row['timestamp']
                image[int((-(float(coordinates[0]) + 5.24)) * 100)][int(
                    (float(coordinates[1]) - 90.6) * 10)] = row['demand']

            else:
                image[int((-(float(coordinates[0]) + 5.24)) * 100)][int(
                    (float(coordinates[1]) - 90.6) * 10)] = row['demand']
                #print(image)

        #print(images)
        #print (np.asarray(images).shape)
        images.append(image)  # keep the final frame accumulated by the loop
        return images
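The row/column arithmetic in read_data maps each decoded point onto a 25x5 grid: latitudes in roughly (-5.49, -5.24] become rows via int((-(lat + 5.24)) * 100) and longitudes in roughly [90.6, 91.1) become columns via int((lon - 90.6) * 10). A small helper that makes the mapping explicit (the clamping is an added safeguard, not part of the original):

def grid_cell(lat, lon, n_rows=25, n_cols=5):
    # Mirror the index formulas used in read_data and clamp out-of-range points.
    row = int((-(float(lat) + 5.24)) * 100)
    col = int((float(lon) - 90.6) * 10)
    row = min(max(row, 0), n_rows - 1)
    col = min(max(col, 0), n_cols - 1)
    return row, col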
Example 18
def geohash_decode(geohash):
    return Geohash.decode(geohash)
Example 19
#coding:utf8
import Geohash
# lng : longitude
# lat : latitude

lng = 116.37439
lat = 39.94758

h = Geohash.encode(lat, lng)
print h
print Geohash.decode(h)
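A detail that explains the float() calls scattered through these examples: with the Geohash package used here, decode() tends to return the latitude and longitude as strings trimmed to the precision implied by the hash, while decode_exactly() (see Example 32) returns floats together with error margins. A minimal defensive pattern, assuming that behaviour:

import Geohash

lat, lon = Geohash.decode('wx4g0')
lat, lon = float(lat), float(lon)   # convert before doing any arithmetic
offset = abs(lat - 39.92324)        # now safe to use numerically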
Example 20
 def get_latlng(self, waypoint_id):
     return Geohash.decode(self.__waypoints[waypoint_id]["geohash"])
Example 21
# At the moment, this data is not used, since the location has been nicely
# clustered in a grid-like fashion thanks to the geohash precision reduction.
# However, it is necessary to cluster the locations when the locations are
# chaotically scattered on the map.
print("Pre-process geo-location data")
num_geo = tr_df['geohash6'].unique().shape[0]
geo_df = pd.DataFrame(data=np.transpose([['' for i in range(num_geo)],
                                         np.zeros(num_geo),
                                         np.zeros(num_geo)]),
                      columns=['geohash6', 'latitude', 'longitude'])

# Obtain the location information for each unique geohash6.
i = 0
for x in tr_df['geohash6'].unique():
    geo_df.loc[i, 'geohash6'] = x
    geo_df.loc[i, ['latitude', 'longitude']] = gh.decode(x)
    i += 1
# Resulting dataframe of the geo-location data:
'''
     geohash6  latitude  longitude
0      qp03wc -5.353088  90.653687
1      qp03pn -5.413513  90.664673
2      qp09sw -5.325623  90.906372
3      qp0991 -5.353088  90.752563
...
1326   qp03yn -5.281677  90.620728
1327   qp09v9 -5.309143  90.950317
1328   qp0d45 -5.254211  90.796509
'''
# Save the geo-location data into csv file for easy access when necessary.
geo_df.to_csv(cwd + '/Traffic data/geo_set.csv', sep=',')
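The comment at the top of this example points out that explicit clustering would be needed if the locations were scattered rather than already snapped to a geohash grid. A minimal sketch of that fallback using scikit-learn's KMeans (the library choice and the number of clusters are assumptions for illustration; geo_df is reused from above):

from sklearn.cluster import KMeans

coords = geo_df[['latitude', 'longitude']].astype(float).values
kmeans = KMeans(n_clusters=50, random_state=0).fit(coords)  # 50 clusters is an arbitrary choice
geo_df['cluster'] = kmeans.labels_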
Example 22
print("Loading data ......")
df=pd.read_csv(filepath_or_buffer = "training.csv")
print("Processing data ......")
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%H:%M').dt.time
df.sort_values(["day","timestamp"], axis = 0, ascending = [True,True], inplace = True) 

print (df)

images = []
timing = []
image= np.zeros(shape=(25,5)).tolist()
#print(image)
i=-1
j=0
for index, row in df.iterrows():
	#print(row["day"],row["timestamp"])
	coordinates=Geohash.decode(row['geohash6'])
	i=i+1
	if i==0 or day!=row['day'] or timestamp!=row['timestamp']:
		
		if i!=0:
			timing.append((timestamp.hour * 60 + timestamp.minute)/15)
			images.append(image)
			image= np.zeros(shape=(25,5)).tolist()
			j=j+1
			#print (timing)
		day=row['day']
		timestamp=row['timestamp']
		image[int((-(float(coordinates[0])+5.24))*100)][int((float(coordinates[1])-90.6)*10)]=row['demand']

	else:
		image[int((-(float(coordinates[0])+5.24))*100)][int((float(coordinates[1])-90.6)*10)]=row['demand']
Example 23
 def show_station_coordinfo(self):
     for channel in self.station_coordinfo.keys():
         info = self.station_coordinfo[channel]
         logger.debug("%s (%f, %f, %f) %s | %s", channel, info['latitude'],
                      info['longitude'], info['elevation'], info['geohash'],
                      Geohash.decode(info['geohash']))
Example 24
def geohash_decoding(geohash):
    wgsLat, wgsLng = Geohash.decode(geohash)
    return wgs2gcj(wgsLat, wgsLng)
Example 25
def change_corr(cor, precision=5):
    x, y = cor
    geohash = Geohash.encode(x, y, precision=precision)
    x, y = Geohash.decode(geohash)
    return [float(x), float(y)]
Example 27
def hash2latlon(geohash):
    g = Geohash.decode(geohash)
    return [float(g[0]), float(g[1])]
Example 28
target_dir = r'D:\lu_work\python\data'
from_file = 'train.csv'
to_file = 'train_lat.csv'
input = open(target_dir + os.sep + from_file, 'r')
output = open(target_dir + os.sep + to_file, 'w+')
log = open(target_dir + os.sep + 'log.txt', 'w+')
all_train_lines = input.readlines()
i = 0
# convert the start and end coordinates line by line
for line in all_train_lines:
    #首行 & 筛选有价值列
    items = line.split(',')
    if i == 0:
        output.write(items[0] + items[1] + ',' + items[4] + ',' + items[5] +
                     ',' + items[6] + '\n')
    else:
        try:
            st_pri = items[6].replace('\n', '')
            en_pri = items[5].replace('\n', '')
            start = Geohash.decode(st_pri)
            end = Geohash.decode(en_pri)
            output.write(items[0] + items[1] + ',' + items[4] + ',' +
                         start[0] + ' ' + start[1] + ',' + end[0] + ' ' +
                         end[1] + '\n')
        except:
            log.write(items[0] + ' order: error converting coordinates\n')
    i = i + 1
    print(i)
output.close()
log.close()
Example 29
    to_region_B = {}

    for i in range(n_intervals_S):
        for j in range(n_intervals_S):
            temp_region = i * n_intervals_S + j
            all_regions_S.append(temp_region)
    for i in range(n_intervals_M):
        for j in range(n_intervals_M):
            temp_region = i * n_intervals_M + j
            all_regions_M.append(temp_region)
    for i in range(n_intervals_B):
        for j in range(n_intervals_B):
            temp_region = i * n_intervals_B + j
            all_regions_B.append(temp_region)
    for ghash in all_geo_hashes:
        gps = geo.decode(ghash)
        lat, lng = gps
        lat = float(lat)
        lng = float(lng)

        lat_region_B = int((lat - min_lat) / lat_size_B)
        if lat_region_B < 0: lat_region_B = 0
        elif lat_region_B >= n_intervals_B: lat_region_B = n_intervals_B - 1
        lng_region_B = int((lng - min_lng) / lng_size_B)
        if lng_region_B < 0: lng_region_B = 0
        elif lng_region_B >= n_intervals_B: lng_region_B = n_intervals_B - 1
        region_B = lat_region_B * n_intervals_B + lng_region_B
        to_region_B[ghash] = region_B

        lat_region_M = int((lat - min_lat) / lat_size_M)
        if lat_region_M < 0: lat_region_M = 0
Example 30
def geohashed_loc(loc):
    lat, lon = Geohash.decode(loc)
    Lx = int(6371004.0 * ((lon - 115.98350244732475) / 57.2958) *
             np.cos(lat / 57.2958) / 116.97)
    Ly = int(6371004.0 * ((lat - 39.3492529600276) / 57.2958) / 76.35)
    return (Lx, Ly)
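In geohashed_loc the constants have a simple reading: 6371004 is an Earth radius in metres, dividing degrees by 57.2958 converts them to radians, the cosine of the latitude scales the east-west degree-to-metre conversion, and 116.97 / 76.35 appear to be the grid cell width and height in metres relative to the reference point (115.983..., 39.349...). A small worked check under those assumptions (the sample point is illustrative):

import numpy as np

ref_lat, ref_lon = 39.3492529600276, 115.98350244732475
lat, lon = 39.98, 116.31                       # an illustrative point north-east of the reference
east_m = 6371004.0 * ((lon - ref_lon) / 57.2958) * np.cos(lat / 57.2958)
north_m = 6371004.0 * ((lat - ref_lat) / 57.2958)
print(int(east_m / 116.97), int(north_m / 76.35))   # the (Lx, Ly) cell such a point falls into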
Example 31
def feature_gen(data_file, file_type):
    data_sort = data_file.sort_values(by=['day', 'timestamp'])
    min_day = min(data_file['day'].unique())
    if file_type == 'testing':
        max_day = max(data_file['day'].unique())
        max_day_period = max(
            data_file[data_file['day'] == max_day]['timestamp'].unique())

    no_tuples, no_dimensions = data_file.shape
    prev_period = -1
    period = -1
    fine_dd = {}
    coarse_dd_B = {}
    coarse_dd_M = {}
    coarse_dd_S = {}
    period_list = []
    temp_dd = {}
    recorded_points = Set()
    temp_coarse_B = {}
    temp_coarse_M = {}
    temp_coarse_S = {}
    idx = -1  #Manually increment index instead of using index present in the dataframe due to rearrangement of dataframe objects (and their corresponding indices) earlier during sorting.

    # Create a list of historical aggregated demands for every point and every region. If there is no record at a point/region during a period, we assign a 0 for that point/region.
    for row in data_sort.itertuples():
        idx += 1
        geohash = str(row[1])
        day = int(row[2])
        period = int(row[3])
        dd = float(row[4])
        region_B = to_region_B[geohash]
        region_M = to_region_M[geohash]
        region_S = to_region_S[geohash]

        #for part in sgParts.itertuples():
        #        poly = part.geometry
        #        if p.within(poly):
        #print part.Name
        #                region = part.Name

        if idx == 0:
            temp_dd[geohash] = dd
            temp_coarse_B[region_B] = dd
            temp_coarse_M[region_M] = dd
            temp_coarse_S[region_S] = dd

        # When the period of the current record is different from the period of the previous record, we append all the recorded demands of the previous period to the historical demand lists, while adding 0 to the historical demand lists that belong to points/regions that do not have any records during the previous period
        elif prev_period != period:
            if period - prev_period != 1 and not (period == 0
                                                  and prev_period == 95):
                empty_periods = period - prev_period - 1
            else:
                empty_periods = 0
            period_list.append(prev_period)
            for ghash in all_geo_hashes:
                if ghash in temp_dd:
                    if ghash in fine_dd:
                        fine_dd[ghash].append(temp_dd[ghash])
                    else:
                        fine_dd[ghash] = [temp_dd[ghash]]
                if ghash not in temp_dd:
                    if ghash in fine_dd:
                        fine_dd[ghash].append(0.0)
                    else:
                        fine_dd[ghash] = [0.0]

            for reg in all_regions_B:
                if reg in temp_coarse_B:
                    if reg in coarse_dd_B:
                        coarse_dd_B[reg].append(temp_coarse_B[reg])
                    else:
                        coarse_dd_B[reg] = [temp_coarse_B[reg]]
                else:
                    if reg in coarse_dd_B:
                        coarse_dd_B[reg].append(0)
                    else:
                        coarse_dd_B[reg] = [0]

            for reg in all_regions_M:
                if reg in temp_coarse_M:
                    if reg in coarse_dd_M:
                        coarse_dd_M[reg].append(temp_coarse_M[reg])
                    else:
                        coarse_dd_M[reg] = [temp_coarse_M[reg]]
                else:
                    if reg in coarse_dd_M:
                        coarse_dd_M[reg].append(0)
                    else:
                        coarse_dd_M[reg] = [0]

            for reg in all_regions_S:
                if reg in temp_coarse_S:
                    if reg in coarse_dd_S:
                        coarse_dd_S[reg].append(temp_coarse_S[reg])
                    else:
                        coarse_dd_S[reg] = [temp_coarse_S[reg]]
                if reg not in temp_coarse_S:
                    if reg in coarse_dd_S:
                        coarse_dd_S[reg].append(0)
                    else:
                        coarse_dd_S[reg] = [0]

            # There are two instances where there are consecutive periods without any records at all, across all points and regions. Here we fill in 0 for these periods.
            if empty_periods > 0:
                for i in range(empty_periods):
                    for ghash in all_geo_hashes:
                        fine_dd[ghash].append(0)
                    for reg in coarse_dd_B:
                        coarse_dd_B[reg].append(0)
                    for reg in coarse_dd_M:
                        coarse_dd_M[reg].append(0)
                    for reg in coarse_dd_S:
                        coarse_dd_S[reg].append(0)
                    period_list.append(prev_period + i)

            temp_dd = {}
            temp_dd[geohash] = dd
            temp_coarse_B = {}
            temp_coarse_B[region_B] = dd
            temp_coarse_M = {}
            temp_coarse_M[region_M] = dd
            temp_coarse_S = {}
            temp_coarse_S[region_S] = dd

        # When the program reaches the end of the dataset, append all the recorded demands to the historical demand lists, while adding 0 to the historical demand lists that belong to points/regions that do not have any records during the this period

        elif idx == no_tuples - 1:
            period_list.append(prev_period)
            temp_dd[geohash] = dd
            if region_B in temp_coarse_B:
                temp_coarse_B[region_B] += dd
            else:
                temp_coarse_B[region_B] = dd

            if region_M in temp_coarse_M:
                temp_coarse_M[region_M] += dd
            else:
                temp_coarse_M[region_M] = dd

            if region_S in temp_coarse_S:
                temp_coarse_S[region_S] += dd
            else:
                temp_coarse_S[region_S] = dd
            for ghash in all_geo_hashes:
                if ghash in temp_dd:
                    if ghash in fine_dd:
                        fine_dd[ghash].append(temp_dd[ghash])
                    else:
                        fine_dd[ghash] = [temp_dd[ghash]]
                if ghash not in temp_dd:
                    if ghash in fine_dd:
                        fine_dd[ghash].append(0)
                    else:
                        fine_dd[ghash] = [0]

            for reg in all_regions_B:
                if reg in temp_coarse_B:
                    if reg in coarse_dd_B:
                        coarse_dd_B[reg].append(temp_coarse_B[reg])
                    else:
                        coarse_dd_B[reg] = [temp_coarse_B[reg]]
                else:
                    if reg in coarse_dd_B:
                        coarse_dd_B[reg].append(0)
                    else:
                        coarse_dd_B[reg] = [0]
            for reg in all_regions_M:
                if reg in temp_coarse_M:
                    if reg in coarse_dd_M:
                        coarse_dd_M[reg].append(temp_coarse_M[reg])
                    else:
                        coarse_dd_M[reg] = [temp_coarse_M[reg]]
                else:
                    if reg in coarse_dd_M:
                        coarse_dd_M[reg].append(0)
                    else:
                        coarse_dd_M[reg] = [0]

            for reg in all_regions_S:
                if reg in temp_coarse_S:
                    if reg in coarse_dd_S:
                        coarse_dd_S[reg].append(temp_coarse_S[reg])
                    else:
                        coarse_dd_S[reg] = [temp_coarse_S[reg]]
                else:
                    if reg in coarse_dd_S:
                        coarse_dd_S[reg].append(0)
                    else:
                        coarse_dd_S[reg] = [0]

        else:
            temp_dd[geohash] = dd
            if region_B in temp_coarse_B:
                temp_coarse_B[region_B] += dd
            else:
                temp_coarse_B[region_B] = dd

            if region_M in temp_coarse_M:
                temp_coarse_M[region_M] += dd
            else:
                temp_coarse_M[region_M] = dd

            if region_S in temp_coarse_S:
                temp_coarse_S[region_S] += dd
            else:
                temp_coarse_S[region_S] = dd
        prev_period = period
        prev_day = day
        prev_row = row

    ## Feature Generation. Based on the historical information built earlier, this section generates additional historical features for each training sample
    # Generate additional historical/attribute features w.r.t. each tuple. Total features include:
    # A) Attributional features: 1. geohash ID, 2. Region(Small), 3. Region(Medium), 4. Region(Big),
    #    5. Day-of-week, 6. Period.
    # B) Short-term historical features: 1. demand at this point over each of the past 6 periods
    #    (6 features), 2. sum of demand at this point over the past 2, 4 and 6 periods (3 features).
    # C) Long-term historical features: demand at this point during the current period over the past
    #    1 and 2 weeks and their average (3 features).
    # In total, at point granularity there are 12 historical (short- and long-term) features.
    # The same set is repeated for Region(Small), Region(Medium) and Region(Big).

    # Altogether there are 54 features + 1 target variable
    columns_count = 55
    # Build an empty matrix for filling in of feature tuples generated later. While there are 54 features, an extra empty column is created to include the target variable (the thing we want to predict), for conciseness. This target variable column will be separated later during training/testing.
    engineered_data = np.zeros([no_tuples, columns_count])
    data_idx = 0
    idx = -1
    # If the data is testing data, get the maximum number of periods available in the testing dataset. This is useful later to determine when does historical data stop, i.e. 5 periods before dataset end.
    if file_type == 'testing':
        total_periods = (max_day - min_day) * 96 + max_day_period + 1
    # For each record in the dataset, append the engineered features.
    for row in data_sort.itertuples():
        idx += 1
        feature_list = []
        day = int(row[2])
        day_of_week = day % 7
        period = int(row[3])
        if day <= min_day + 13: continue
        #if day == 61 and period >= 91: continue
        geohash = str(row[1])
        hash_id = to_id[geohash]
        (lat, lng) = geo.decode(geohash)
        lat = float(lat)
        lng = float(lng)
        dd = float(row[4])
        region_B = int(to_region_B[geohash])
        region_M = int(to_region_M[geohash])
        region_S = int(to_region_S[geohash])
        idx_in_list = (day - min_day) * 96 + period
        # Extract temporary demand sub-lists from historical demand lists for generation of short-term historical features.
        #dd = fine_dd[geohash][idx_in_list]
        dd_list = fine_dd[geohash][idx_in_list - 6:idx_in_list]
        dd_S_list = coarse_dd_S[region_S][idx_in_list - 6:idx_in_list]
        dd_M_list = coarse_dd_M[region_M][idx_in_list - 6:idx_in_list]
        dd_B_list = coarse_dd_B[region_B][idx_in_list - 6:idx_in_list]

        # If the dataset is a testing dataset, certain short-term historical demands may not be
        # available for all points/regions. E.g. if the current record is recorded at T+5 (we are
        # only allowed to generate features up to T), the only short-term historical demands (over
        # the past six periods) available are those during T-1 and T. In these cases, extrapolation
        # is done by filling the missing demands with the closest available recorded demand. In the
        # above example, T2-T4 are filled with the demand at T.
        if file_type == 'testing':
            if idx_in_list + 1 > total_periods - 4:
                periods_diff = idx_in_list + 1 - total_periods + 4
                for i in range(periods_diff):
                    j = 6 - 4 + i
                    dd_list[j] = dd_list[6 - periods_diff - 1]
                    dd_S_list[j] = dd_S_list[6 - periods_diff - 1]
                    dd_M_list[j] = dd_M_list[6 - periods_diff - 1]
                    dd_B_list[j] = dd_B_list[6 - periods_diff - 1]

            dd_1 = dd_list[5]
            dd_2 = dd_list[4]
            dd_3 = dd_list[3]
            dd_4 = dd_list[2]
            dd_5 = dd_list[1]
            dd_6 = dd_list[0]
            dd_S_1 = dd_S_list[5]
            dd_S_2 = dd_S_list[4]
            dd_S_3 = dd_S_list[3]
            dd_S_4 = dd_S_list[2]
            dd_S_5 = dd_S_list[1]
            dd_S_6 = dd_S_list[0]
            dd_M_1 = dd_M_list[5]
            dd_M_2 = dd_M_list[4]
            dd_M_3 = dd_M_list[3]
            dd_M_4 = dd_M_list[2]
            dd_M_5 = dd_M_list[1]
            dd_M_6 = dd_M_list[0]
            dd_B_1 = dd_B_list[5]
            dd_B_2 = dd_B_list[4]
            dd_B_3 = dd_B_list[3]
            dd_B_4 = dd_B_list[2]
            dd_B_5 = dd_B_list[1]
            dd_B_6 = dd_B_list[0]

        else:
            dd_1 = fine_dd[geohash][idx_in_list - 1]
            dd_2 = fine_dd[geohash][idx_in_list - 2]
            dd_3 = fine_dd[geohash][idx_in_list - 3]
            dd_4 = fine_dd[geohash][idx_in_list - 4]
            dd_5 = fine_dd[geohash][idx_in_list - 5]
            dd_6 = fine_dd[geohash][idx_in_list - 6]

            dd_S_1 = coarse_dd_S[region_S][idx_in_list - 1]
            dd_S_2 = coarse_dd_S[region_S][idx_in_list - 2]
            dd_S_3 = coarse_dd_S[region_S][idx_in_list - 3]
            dd_S_4 = coarse_dd_S[region_S][idx_in_list - 4]
            dd_S_5 = coarse_dd_S[region_S][idx_in_list - 5]
            dd_S_6 = coarse_dd_S[region_S][idx_in_list - 6]

            dd_M_1 = coarse_dd_M[region_M][idx_in_list - 1]
            dd_M_2 = coarse_dd_M[region_M][idx_in_list - 2]
            dd_M_3 = coarse_dd_M[region_M][idx_in_list - 3]
            dd_M_4 = coarse_dd_M[region_M][idx_in_list - 4]
            dd_M_5 = coarse_dd_M[region_M][idx_in_list - 5]
            dd_M_6 = coarse_dd_M[region_M][idx_in_list - 6]

            dd_B_1 = coarse_dd_B[region_B][idx_in_list - 1]
            dd_B_2 = coarse_dd_B[region_B][idx_in_list - 2]
            dd_B_3 = coarse_dd_B[region_B][idx_in_list - 3]
            dd_B_4 = coarse_dd_B[region_B][idx_in_list - 4]
            dd_B_5 = coarse_dd_B[region_B][idx_in_list - 5]
            dd_B_6 = coarse_dd_B[region_B][idx_in_list - 6]

        sum6 = dd_1 + dd_2 + dd_3 + dd_4 + dd_5 + dd_6
        sum4 = dd_1 + dd_2 + dd_3 + dd_4
        sum2 = dd_1 + dd_2
        dd_2week = fine_dd[geohash][idx_in_list - 2 * (96 * 7)]
        dd_1week = fine_dd[geohash][idx_in_list - (96 * 7)]
        dd_avg = (dd_2week + dd_1week) / 2.0

        sum_S_6 = dd_S_1 + dd_S_2 + dd_S_3 + dd_S_4 + dd_S_5 + dd_S_6
        sum_S_4 = dd_S_1 + dd_S_2 + dd_S_3 + dd_S_4
        sum_S_2 = dd_S_1 + dd_S_2
        dd_S_2week = coarse_dd_S[region_S][idx_in_list - 2 * (96 * 7)]
        dd_S_1week = coarse_dd_S[region_S][idx_in_list - (96 * 7)]
        dd_S_avg = (dd_S_2week + dd_S_1week) / 2.0

        sum_M_6 = dd_M_1 + dd_M_2 + dd_M_3 + dd_M_4 + dd_M_5 + dd_M_6
        sum_M_4 = dd_M_1 + dd_M_2 + dd_M_3 + dd_M_4
        sum_M_2 = dd_M_1 + dd_M_2
        dd_M_2week = coarse_dd_M[region_M][idx_in_list - 2 * (96 * 7)]
        dd_M_1week = coarse_dd_M[region_M][idx_in_list - (96 * 7)]
        dd_M_avg = (dd_M_2week + dd_M_1week) / 2.0

        sum_B_6 = dd_B_1 + dd_B_2 + dd_B_3 + dd_B_4 + dd_B_5 + dd_B_6
        sum_B_4 = dd_B_1 + dd_B_2 + dd_B_3 + dd_B_4
        sum_B_2 = dd_B_1 + dd_B_2
        dd_B_2week = coarse_dd_B[region_B][idx_in_list - 2 * (96 * 7)]
        dd_B_1week = coarse_dd_B[region_B][idx_in_list - (96 * 7)]
        dd_B_avg = (dd_B_2week + dd_B_1week) / 2.0

        # Form a new tuple with these features and adding it to the engineered dataset.
        feature_list.append(dd)
        feature_list.append(int(hash_id))
        feature_list.append(int(region_S))
        feature_list.append(int(region_M))
        feature_list.append(int(region_B))
        feature_list.append(int(day_of_week))
        feature_list.append(int(period))
        feature_list.append(dd_1)
        feature_list.append(dd_2)
        feature_list.append(dd_3)
        feature_list.append(dd_4)
        feature_list.append(dd_5)
        feature_list.append(dd_6)
        feature_list.append(sum6)
        feature_list.append(sum4)
        feature_list.append(sum2)
        feature_list.append(dd_2week)
        feature_list.append(dd_1week)
        feature_list.append(dd_avg)
        feature_list.append(dd_S_1)
        feature_list.append(dd_S_2)
        feature_list.append(dd_S_3)
        feature_list.append(dd_S_4)
        feature_list.append(dd_S_5)
        feature_list.append(dd_S_6)
        feature_list.append(sum_S_6)
        feature_list.append(sum_S_4)
        feature_list.append(sum_S_2)
        feature_list.append(dd_S_2week)
        feature_list.append(dd_S_1week)
        feature_list.append(dd_S_avg)
        feature_list.append(dd_M_1)
        feature_list.append(dd_M_2)
        feature_list.append(dd_M_3)
        feature_list.append(dd_M_4)
        feature_list.append(dd_M_5)
        feature_list.append(dd_M_6)
        feature_list.append(sum_M_6)
        feature_list.append(sum_M_4)
        feature_list.append(sum_M_2)
        feature_list.append(dd_M_2week)
        feature_list.append(dd_M_1week)
        feature_list.append(dd_M_avg)
        feature_list.append(dd_B_1)
        feature_list.append(dd_B_2)
        feature_list.append(dd_B_3)
        feature_list.append(dd_B_4)
        feature_list.append(dd_B_5)
        feature_list.append(dd_B_6)
        feature_list.append(sum_B_6)
        feature_list.append(sum_B_4)
        feature_list.append(sum_B_2)
        feature_list.append(dd_B_2week)
        feature_list.append(dd_B_1week)
        feature_list.append(dd_B_avg)

        engineered_data[data_idx, :] = feature_list
        data_idx += 1
        sys.stdout.write('\r Progress {:.2f}%'.format(
            (idx + 1) * 100.0 / no_tuples))
        sys.stdout.flush()
    print '\n'
    print 'Saving File \n'
    engineered_data.resize((data_idx, columns_count))
    engineered_data = pd.DataFrame(engineered_data)
    return engineered_data
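The short-term block described in the comments of feature_gen (demand over each of the past six periods plus the 2-, 4- and 6-period sums) is just a slice of the per-point demand history. A compact sketch of that slice, assuming history is a chronologically ordered demand list such as fine_dd[geohash]:

def short_term_features(history, idx_in_list):
    # history: demands ordered by period for one geohash (or one region)
    # returns [dd_1, ..., dd_6, sum6, sum4, sum2] for the period at idx_in_list
    window = history[idx_in_list - 6:idx_in_list]       # the past six periods, oldest first
    dd_1, dd_2, dd_3, dd_4, dd_5, dd_6 = window[::-1]   # dd_1 is the most recent period
    return [dd_1, dd_2, dd_3, dd_4, dd_5, dd_6,
            sum(window), sum(window[-4:]), sum(window[-2:])]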
Example 32
import Geohash

geo_hash_str = Geohash.encode(39.92324, 116.3906, 5)
print(geo_hash_str)

# looking for the corner

corne_list = Geohash.decode('wx4g0')

print(corne_list)

# exact point information
exactly_point = Geohash.decode_exactly('wx4g0')
print(exactly_point)
Example 33
#!/usr/bin/env python

import sys
sys.path.insert(0, "~/.local/lib/python2.6/site-packages/")

import codecs
sys.stdout = codecs.getwriter('utf8')(sys.stdout)
sys.stderr = codecs.getwriter('utf8')(sys.stderr)

import Geohash
import requests

lat, lng = Geohash.decode(sys.argv[1])
lat = float(lat)
lng = float(lng)
#print '%f, %f' % (lat, lng)
url = 'http://nominatim.openstreetmap.org/reverse?format=json&addressdetails=1&lat=%f&lon=%f' % (
    lat, lng)
r = requests.get(url)
json = r.json()
addr = json['address']

city = None
if 'city' in addr:
    city = addr['city']
state = addr['state']
country = addr['country']  # // country_code ... .upper()

if not city:
    print '%s, %s' % (state, country)
else:
Example 34
def hash2tag(geohash):
    return Geohash.decode(geohash.rstrip("0"))