def test_timezone_conversion(self):
    """Check the first timestamp after a London -> Berlin timezone conversion.

    A TrajDataFrame is built from ``self.default_data_df``, converted from
    'Europe/London' to 'Europe/Berlin', and its first datetime value is
    compared against 2008-10-23 14:53:05.
    """
    trajectories = skmob.TrajDataFrame(
        self.default_data_df,
        latitude='latitude',
        datetime='hour',
        user_id='user',
    )
    trajectories.timezone_conversion(
        from_timezone='Europe/London',
        to_timezone='Europe/Berlin',
    )

    first_timestamp = trajectories[DATETIME][0]
    expected_timestamp = pd.Timestamp('2008-10-23 14:53:05')
    assert first_timestamp == expected_timestamp
def test_tdf_from_list(self):
    """Build a TrajDataFrame from ``self.default_data_list`` and validate it.

    Columns are identified by their position inside each row of the list.
    """
    column_positions = {
        'latitude': 1,
        'longitude': 2,
        'datetime': 3,
        'user_id': 0,
    }
    trajectories = skmob.TrajDataFrame(self.default_data_list,
                                       **column_positions)
    self.perform_default_asserts(trajectories)

    # Original author's note on this print:
    # raised TypeError: 'BlockManager' object is not iterable
    print(trajectories.head())
def GetDistance(df_input):
    """Return the first straight-line distance computed for ``df_input``.

    The input is first passed through ``RemoveZeroLatLon`` (presumably
    dropping rows with zero coordinates -- confirm against that helper).
    If nothing is left, 0 is returned. Otherwise the remaining rows are
    wrapped in a TrajDataFrame using the 'UserLat', 'UserLong',
    'DateTimeStamp' and 'EmployeeId' columns, and the first value of the
    'distance_straight_line' column of the result is returned.
    """
    located_rows = RemoveZeroLatLon(df_input)

    # Guard clause: no usable rows means no distance travelled.
    if len(located_rows) == 0:
        return 0

    trajectories = skmob.TrajDataFrame(
        located_rows,
        latitude='UserLat',
        longitude='UserLong',
        datetime='DateTimeStamp',
        user_id='EmployeeId',
    )
    distances = distance_straight_line(trajectories)
    return distances.head()['distance_straight_line'].values[0]
def _closest_label(dist_matrix, labels):
    """Return, for every row of ``dist_matrix``, the label of its minimum.

    ``dist_matrix`` is a 2-D array with one column per entry of ``labels``.
    NaN distances are ignored; ties resolve to the first column. Unlike a
    truthiness-based mask, labels such as ``0`` or ``''`` are selected
    correctly.

    Raises:
        ValueError: if a row is entirely NaN or the matrix has no columns.
    """
    label_array = np.asarray(labels)
    return label_array[np.nanargmin(dist_matrix, axis=1)]


def process_for_county(curfips):
    """Detect stop points for one county and tag them with the nearest POI
    and nearest residence.

    Reads the module-level ``info_df``, ``part_df``, ``whole_shape_gdf``,
    ``updated_col_names`` and ``mylogger``.

    Args:
        curfips: county FIPS code. It is concatenated directly into file
            names, so it must be a string.

    Returns:
        * an empty DataFrame with a single 'uid' column when the POI or
          residence file is missing, when no pings fall in the county's
          bounding box, or when stop detection fails;
        * the stop/POI table without residence columns when there are fewer
          than two residences or residence matching fails;
        * otherwise the stop/POI table with 'Res_dist', 'Res_lon',
          'Res_lat' and 'urban_rural' added.
    """
    # POI sanity check
    POI_location = '/project/biocomplexity/data/XMode/evaluation/POI_directory/'
    if not os.path.isfile(POI_location + curfips + '_POIS.csv'):
        empty_df = pd.DataFrame(columns=['uid'])
        mylogger.info("no nearby POI will be found in this part for " +
                      str(curfips))
        return empty_df

    # Res sanity check
    Res_location = '/project/biocomplexity/data/XMode/evaluation/Residence_data/'
    if not os.path.isfile(Res_location + curfips + '_Res.csv'):
        empty_df = pd.DataFrame(columns=['uid'])
        mylogger.info("no nearby Res will be found in this part for " +
                      str(curfips))
        return empty_df

    func_start = datetime.now()

    fips_df = info_df[info_df.fips == curfips]
    bounds = fips_df.iloc[0]
    cur_min_lat, cur_max_lat = bounds['min_lat'], bounds['max_lat']
    cur_min_lng, cur_max_lng = bounds['min_lng'], bounds['max_lng']

    # initial filtering based on bounding box, works very fast
    county_df = part_df[part_df['latitude'].ge(cur_min_lat)
                        & part_df['latitude'].le(cur_max_lat)
                        & part_df['longitude'].ge(cur_min_lng)
                        & part_df['longitude'].le(cur_max_lng)]

    padded_fips = str(curfips).zfill(5)
    county_code = padded_fips[2:]
    county_prefix = padded_fips[0:2]
    state_shape_gdf = whole_shape_gdf[
        whole_shape_gdf['STATEFP'] == county_prefix]
    county_shape_gdf = state_shape_gdf[
        state_shape_gdf['COUNTYFP'] == county_code]

    if len(county_df) == 0:
        empty_df = pd.DataFrame(columns=['uid'])
        mylogger.info("no nearby pings found in this part for " +
                      str(curfips))
        return empty_df

    # joining with shape file to get more accurate measurement, works slower,
    # that is why initial filtering is done.
    county_df = gpd.GeoDataFrame(
        county_df,
        geometry=gpd.points_from_xy(county_df.longitude, county_df.latitude))
    county_df.crs = county_shape_gdf.crs
    county_gdf = gpd.sjoin(county_df, county_shape_gdf, how='inner')
    county_gdf = county_gdf[updated_col_names].copy(deep=True)
    spatial_join_time = datetime.now()
    mylogger.info("county pings separated for " + str(curfips) +
                  ", time taken: " + str(spatial_join_time - func_start))

    # stop point detection
    county_gdf['location_at'] = pd.to_datetime(county_gdf['location_at'],
                                               unit='s')
    try:
        traj_df = skmob.TrajDataFrame(county_gdf,
                                      latitude='latitude',
                                      longitude='longitude',
                                      user_id='advertiser_id',
                                      datetime='location_at')
        county_stop_df = detection.stops(traj_df,
                                         stop_radius_factor=0.05,
                                         minutes_for_a_stop=5.0,
                                         spatial_radius_km=0.2,
                                         leaving_time=True)
        county_stop_df['county_fips'] = curfips
        stop_detect_time = datetime.now()
        mylogger.info("stop point detection complete for " + str(curfips) +
                      ", time taken: " +
                      str(stop_detect_time - spatial_join_time))
    except Exception as e:
        # Deliberately best-effort, but no longer a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate; the cause is logged.
        empty_df = pd.DataFrame(columns=['uid'])
        mylogger.info('stop detection failed for ' + str(curfips))
        mylogger.info(e)
        return empty_df

    # associate each point with its closest POI information
    # load POI data for corresponding county
    POI_df = pd.read_csv(POI_location + curfips + '_POIS.csv')
    POI_df = POI_df[[
        'source_id', 'poi_name', 'st_name', 'st_num', 'designation', 'lat',
        'lon', 'fac_name'
    ]]
    POI_df = POI_df.rename(columns={'lat': 'POI_lat', 'lon': 'POI_lon'})

    # find the closest POI for each stop (Euclidean distance on raw degrees
    # is only used for ranking; the real distance is computed below)
    POI_dist_np = distance.cdist(county_stop_df[['lat', 'lng']],
                                 POI_df[['POI_lat', 'POI_lon']],
                                 metric='euclidean')
    county_stop_df['POI_id'] = _closest_label(POI_dist_np,
                                              POI_df['source_id'])

    # join with POI data based on previously found id, also calculate actual
    # distance
    county_POI_df = county_stop_df.merge(POI_df,
                                         how='inner',
                                         left_on='POI_id',
                                         right_on='source_id')
    county_POI_df = county_POI_df.drop(columns=['source_id'])
    county_POI_df["POI_dist"] = haversine(county_POI_df["lng"],
                                          county_POI_df["lat"],
                                          county_POI_df["POI_lon"],
                                          county_POI_df["POI_lat"])
    POI_time = datetime.now()
    mylogger.info("pings matched with nearby POIs " + str(curfips) +
                  ", time taken: " + str(POI_time - stop_detect_time))

    try:
        Res_df = pd.read_csv(Res_location + curfips + '_Res.csv')
        # hack: cap the residence table so the distance matrix stays bounded
        if Res_df.shape[0] > 500000:
            Res_df = Res_df.sample(500000)
        if Res_df.shape[0] < 2:
            return county_POI_df
        Res_df = Res_df[['blockgroup_id', 'urban_rural', 'lat', 'lon']]
        Res_df = Res_df.rename(columns={'lat': 'Res_lat', 'lon': 'Res_lon'})

        # find the closest residence for each stop
        Res_dist_np = distance.cdist(county_stop_df[['lat', 'lng']],
                                     Res_df[['Res_lat', 'Res_lon']],
                                     metric='euclidean')
        county_stop_df['Res_id'] = _closest_label(Res_dist_np,
                                                  Res_df['blockgroup_id'])

        # join with residence data based on previously found id, also
        # calculate actual distance
        county_Res_df = county_stop_df.merge(Res_df,
                                             how='inner',
                                             left_on='Res_id',
                                             right_on='blockgroup_id')
        county_Res_df = county_Res_df.drop(columns=['blockgroup_id'])
        county_Res_df["Res_dist"] = haversine(county_Res_df["lng"],
                                              county_Res_df["lat"],
                                              county_Res_df["Res_lon"],
                                              county_Res_df["Res_lat"])
        Res_time = datetime.now()
        # NOTE(review): elapsed time is measured from stop detection, so it
        # also includes the POI matching above -- confirm this is intended.
        mylogger.info("pings matched with nearby Residences " + str(curfips) +
                      ", time taken: " + str(Res_time - stop_detect_time))
    except Exception as e:
        mylogger.info('something wrong happened while matching with ' +
                      str(curfips) + ' Residences')
        mylogger.info(e)
        return county_POI_df

    # NOTE(review): these assignments align on the row index of the two
    # merged frames; that presumably assumes both merges produced rows in the
    # same order (e.g. unique 'source_id' / 'blockgroup_id') -- verify.
    county_POI_df['Res_dist'] = county_Res_df['Res_dist']
    county_POI_df['Res_lon'] = county_Res_df['Res_lon']
    county_POI_df['Res_lat'] = county_Res_df['Res_lat']
    county_POI_df['urban_rural'] = county_Res_df['urban_rural']
    return county_POI_df
def test_slicing_a_tdf_returns_a_tdf(self):
    """Check that filtering then slicing a TrajDataFrame keeps its type."""
    trajectories = skmob.TrajDataFrame(
        self.default_data_df,
        latitude='latitude',
        datetime='hour',
        user_id='user',
    )

    # Boolean-mask on user id 1, then take the first row of the result.
    rows_for_user = trajectories[trajectories[UID] == 1]
    first_row = rows_for_user[:1]

    assert type(trajectories) == type(first_row)
def test_tdf_from_dict(self):
    """Build a TrajDataFrame from ``self.default_data_dict`` and validate it."""
    column_names = {
        'latitude': 'latitude',
        'datetime': 'hour',
        'user_id': 'user',
    }
    trajectories = skmob.TrajDataFrame(self.default_data_dict,
                                       **column_names)
    self.perform_default_asserts(trajectories)
# Walk-through: three ways of building a skmob TrajDataFrame.
import skmob

# --- 1. From a plain list of rows -------------------------------------------
# Each row is [user, latitude, longitude, timestamp].
data_list = [
    [1, 39.984094, 116.319236, '2008-10-23 13:53:05'],
    [1, 39.984198, 116.319322, '2008-10-23 13:53:06'],
    [1, 39.984224, 116.319402, '2008-10-23 13:53:11'],
    [1, 39.984211, 116.319389, '2008-10-23 13:53:16'],
]
# With a list, the columns are identified by their position in each row.
tdf = skmob.TrajDataFrame(data_list, latitude=1, longitude=2, datetime=3)

# Show the first rows and the resulting type.
print(tdf.head())
print(type(tdf))

# --- 2. From a pandas DataFrame ---------------------------------------------
import pandas as pd

# Wrap the same rows in a DataFrame with named columns and show its type.
data_df = pd.DataFrame(
    data_list,
    columns=['user', 'latitude', 'lng', 'hour'],
)
print(type(data_df))

# With a DataFrame, the columns are identified by name.
tdf = skmob.TrajDataFrame(
    data_df,
    latitude='latitude',
    datetime='hour',
    user_id='user',
)

# Show the resulting type and the first rows.
print(type(tdf))
print(tdf.head())

# --- 3. From a file ---------------------------------------------------------
# Download the file from
# https://raw.githubusercontent.com/scikit-mobility/scikit-mobility/master/examples/geolife_sample.txt.gz
# then read the trajectory data (GeoLife, Beijing, China).
tdf = skmob.TrajDataFrame.from_file(
    'examples/geolife_sample.txt.gz',
    latitude='lat',
    longitude='lon',
    user_id='user',
    datetime='datetime',
)