def generate_trip_stats_aggregation(feed):
    """
    Build the per-trip aggregation function for a merged stop_times table.

    The stop geometry lookup is computed once here (via
    ``gtfstk.build_geometry_by_stop(feed, use_utm=True)``) and captured by
    the returned closure, so it is not rebuilt for every trip.

    Parameters
    ----------
    feed : feed object accepted by ``gtfstk.build_geometry_by_stop``

    Returns
    -------
    function
        ``trip_stats_aggregation(group)``: takes the rows of one trip as a
        DataFrame and returns a ``pd.Series``.  "First"/"last" values are
        taken positionally (``.iat[0]`` / ``.iat[-1]``), so the rows are
        presumably expected to be sorted by stop sequence already --
        confirm at the call site.

        The Series holds, in this order:

        - the route/agency/line/cluster attributes, copied from the first
          row
        - ``num_stops``: number of rows in the group
        - ``start_time`` / ``end_time``: first / last ``departure_time``
        - ``start_<key>`` / ``end_<key>`` for the stop id, code, name,
          desc, lat and lon
        - ``start_stop_city`` / ``end_stop_city``: first / last
          ``stop_desc_city``
        - ``start_zone`` / ``end_zone``: first / last ``zone_name``
        - ``num_zones``: number of distinct ``zone_name`` values
        - ``num_zones_missing``: number of rows with a null ``zone_name``
        - ``is_loop``: 1 if the first and last stop geometries are less
          than 400 apart, else 0 (presumably metres, given
          ``use_utm=True`` -- confirm)
        - ``duration``: ``(end_time - start_time) / 3600``
        - ``all_stop_latlon``: ``lat,lon`` pairs joined with ``;``
        - ``all_<key>`` for the stop code, id, city and name: the values
          of every row joined with ``;``; a missing value contributes an
          empty string so positions stay aligned across the ``all_*``
          fields
    """
    # get geometry by stop for distance measurement
    geometry_by_stop = gtfstk.build_geometry_by_stop(feed, use_utm=True)

    # Columns that are constant within a trip; only the first row is read.
    first_row_keys = (
        'route_id',
        'route_short_name',
        'route_long_name',
        'route_mkt',
        'route_direction',
        'route_alternative',
        'agency_id',
        'agency_name',
        'route_type',
        'direction_id',
        'shape_id',
        'line_type',
        'line_type_desc',
        'cluster_id',
        'cluster_name',
        'cluster_sub_desc',
    )
    keys_for_start_and_end = (
        'stop_id',
        'stop_code',
        'stop_name',
        'stop_desc',
        'stop_lat',
        'stop_lon',
    )
    keys_for_all = (
        'stop_code',
        'stop_id',
        'stop_desc_city',
        'stop_name',
    )

    def _join_values(values):
        # A plain ';'.join(values) raises TypeError on any non-str item,
        # e.g. the NaN left behind when a stop description could not be
        # parsed.  Missing values become '' and everything else goes
        # through str(); for all-string input the result is unchanged.
        return ';'.join('' if pd.isnull(v) else str(v) for v in values)

    def trip_stats_aggregation(group):
        d = OrderedDict()

        for key in first_row_keys:
            d[key] = group[key].iat[0]

        d['num_stops'] = group.shape[0]
        d['start_time'] = group['departure_time'].iat[0]
        d['end_time'] = group['departure_time'].iat[-1]

        for key in keys_for_start_and_end:
            d[f'start_{key}'] = group[key].iat[0]
            d[f'end_{key}'] = group[key].iat[-1]

        d['start_stop_city'] = group['stop_desc_city'].iat[0]
        d['end_stop_city'] = group['stop_desc_city'].iat[-1]
        d['start_zone'] = group['zone_name'].iat[0]
        d['end_zone'] = group['zone_name'].iat[-1]
        d['num_zones'] = group.zone_name.nunique()
        d['num_zones_missing'] = group.zone_name.isnull().sum()

        # A trip whose end stop is (almost) at its start stop is a loop.
        dist = geometry_by_stop[d['start_stop_id']].distance(
            geometry_by_stop[d['end_stop_id']])
        d['is_loop'] = int(dist < 400)

        d['duration'] = (d['end_time'] - d['start_time']) / 3600

        d['all_stop_latlon'] = ';'.join(
            str(x) + ',' + str(y)
            for x, y in zip(group['stop_lat'].tolist(),
                            group['stop_lon'].tolist()))

        for key in keys_for_all:
            d[f'all_{key}'] = _join_values(group[key].tolist())

        return pd.Series(d)

    return trip_stats_aggregation
def compute_trip_stats_partridge(feed, zones):
    """
    Compute one row of statistics per trip.

    NOTE(review): another function with this same name is defined later in
    this file; if both live in one module the later one replaces this one
    at import time -- confirm which is intended.

    Parameters
    ----------
    feed : partridge feed
    zones: DataFrame with stop_code to zone_name mapping

    Returns
    -------
    DataFrame with the following columns:

    - ``'trip_id'``
    - ``'route_id'``
    - ``'route_short_name'``
    - ``'route_long_name'``
    - ``'route_mkt'``, ``'route_direction'``, ``'route_alternative'``:
      the parts of ``route_desc`` split on ``'-'``
    - ``'agency_id'``
    - ``'agency_name'``
    - ``'route_type'``
    - ``'direction_id'``
    - ``'shape_id'``
    - ``'num_stops'``: number of stop_times rows of the trip
    - ``'start_time'``: first departure time of the trip
    - ``'end_time'``: last departure time of the trip
    - ``'start_stop_id'`` / ``'end_stop_id'``
    - ``'start_stop_code'`` / ``'end_stop_code'``
    - ``'start_stop_name'`` / ``'end_stop_name'``
    - ``'start_stop_desc'`` / ``'end_stop_desc'``
    - ``'start_stop_lat'``, ``'start_stop_lon'``, ``'end_stop_lat'``,
      ``'end_stop_lon'``
    - ``'start_zone'`` / ``'end_zone'``: zone name of the first / last stop
    - ``'num_zones'``: number of distinct zone names along the trip
    - ``'num_zones_missing'``: number of stops of the trip with a null
      zone name
    - ``'is_loop'``: 1 if the start and end stop are less than 400m apart
      and 0 otherwise
    - ``'distance'``: maximum ``shape_dist_traveled`` over the trip's
      stop_times rows
    - ``'duration'``: ``(end_time - start_time) / 3600``
    - ``'speed'``: ``distance / duration / 1000``

    Notes
    -----
    - Reads ``feed.trips``, ``feed.routes``, ``feed.agency``,
      ``feed.stop_times`` and ``feed.stops``, plus whatever
      ``gtfstk.build_geometry_by_stop`` needs.
    - Distances are never computed from shapes here; only the
      ``shape_dist_traveled`` column of ``feed.stop_times`` is used.
    """
    trip_cols = ['route_id', 'trip_id', 'direction_id', 'shape_id']
    route_cols = ['route_id', 'route_short_name', 'route_long_name',
                  'route_type', 'agency_id', 'route_desc']
    stop_cols = ['stop_id', 'stop_name', 'stop_lat', 'stop_lon',
                 'stop_code', 'stop_desc']

    # One row per (trip, stop), carrying route, agency, stop and zone
    # attributes.  departure_time is used as stored; a conversion through
    # hp.timestr_to_seconds existed here once but was left disabled.
    merged = feed.trips[trip_cols].merge(feed.routes[route_cols])
    merged = merged.merge(feed.agency[['agency_id', 'agency_name']],
                          how='left', on='agency_id')
    merged = merged.merge(feed.stop_times)
    merged = merged.merge(feed.stops[stop_cols])
    merged = merged.merge(zones, how='left')
    f = merged.sort_values(['trip_id', 'stop_sequence'])

    # route_desc is expected to hold three '-'-separated parts.
    route_desc_parts = f['route_desc'].str.split('-', expand=True)
    f[['route_mkt', 'route_direction', 'route_alternative']] = route_desc_parts
    f = f.drop('route_desc', axis=1)

    geometry_by_stop = gtfstk.build_geometry_by_stop(feed, use_utm=True)

    g = f.groupby('trip_id')

    # Attributes that are taken from the first row of each trip.
    first_row_keys = (
        'route_id', 'route_short_name', 'route_long_name',
        'route_mkt', 'route_direction', 'route_alternative',
        'agency_id', 'agency_name', 'route_type',
        'direction_id', 'shape_id',
    )
    # Stop attributes reported as start_<key> / end_<key> pairs.
    paired_stop_keys = ('stop_id', 'stop_code', 'stop_name', 'stop_desc')

    def my_agg(group):
        d = OrderedDict()
        for key in first_row_keys:
            d[key] = group[key].iat[0]

        departures = group['departure_time']
        d['num_stops'] = group.shape[0]
        d['start_time'] = departures.iat[0]
        d['end_time'] = departures.iat[-1]

        for key in paired_stop_keys:
            d['start_' + key] = group[key].iat[0]
            d['end_' + key] = group[key].iat[-1]

        # Coordinates are grouped by end point (start lat/lon, then end
        # lat/lon) rather than interleaved, to keep the column order.
        for prefix, pos in (('start_', 0), ('end_', -1)):
            d[prefix + 'stop_lat'] = group['stop_lat'].iat[pos]
            d[prefix + 'stop_lon'] = group['stop_lon'].iat[pos]

        zone_names = group['zone_name']
        d['start_zone'] = zone_names.iat[0]
        d['end_zone'] = zone_names.iat[-1]
        d['num_zones'] = zone_names.nunique()
        d['num_zones_missing'] = zone_names.isnull().sum()

        first_geom = geometry_by_stop[d['start_stop_id']]
        last_geom = geometry_by_stop[d['end_stop_id']]
        dist = first_geom.distance(last_geom)
        d['is_loop'] = int(dist < 400)

        d['duration'] = (d['end_time'] - d['start_time']) / 3600
        return pd.Series(d)

    h = g.apply(my_agg)
    h['distance'] = g['shape_dist_traveled'].max()

    # Reset index and compute final stats
    h = h.reset_index()
    h['speed'] = h['distance'] / h['duration'] / 1000

    def to_timestr(x):
        return gtfstk.helpers.timestr_to_seconds(x, inverse=True)

    time_cols = ['start_time', 'end_time']
    h[time_cols] = h[time_cols].applymap(lambda x: to_timestr(x))

    return h
def compute_trip_stats_partridge(feed, zones):
    """
    Compute one row of statistics per trip, including parsed stop
    descriptions and the full stop list of each trip.

    NOTE(review): a function with this same name is also defined earlier in
    this file; if both live in one module this later definition is the one
    that takes effect -- confirm the earlier one can be removed.

    Parameters
    ----------
    feed : partridge feed
    zones: DataFrame with stop_code to zone_name mapping

    Returns
    -------
    DataFrame with the following columns:

    - ``'trip_id'``
    - ``'route_id'``
    - ``'route_short_name'``
    - ``'route_long_name'``
    - ``'route_mkt'``, ``'route_direction'``, ``'route_alternative'``:
      the parts of ``route_desc`` split on ``'-'``
    - ``'agency_id'``
    - ``'agency_name'``
    - ``'route_type'``
    - ``'direction_id'``
    - ``'shape_id'``
    - ``'num_stops'``: number of stop_times rows of the trip
    - ``'start_time'``: first departure time of the trip
    - ``'end_time'``: last departure time of the trip
    - ``'start_stop_id'`` / ``'end_stop_id'``
    - ``'start_stop_code'`` / ``'end_stop_code'``
    - ``'start_stop_name'`` / ``'end_stop_name'``
    - ``'start_stop_desc'`` / ``'end_stop_desc'``
    - ``'start_stop_lat'``, ``'start_stop_lon'``, ``'end_stop_lat'``,
      ``'end_stop_lon'``
    - ``'start_stop_city'`` / ``'end_stop_city'``: city parsed out of
      ``stop_desc`` for the first / last stop
    - ``'start_zone'`` / ``'end_zone'``: zone name of the first / last stop
    - ``'num_zones'``: number of distinct zone names along the trip
    - ``'num_zones_missing'``: number of stops of the trip with a null
      zone name
    - ``'is_loop'``: 1 if the start and end stop are less than 400m apart
      and 0 otherwise
    - ``'duration'``: ``(end_time - start_time) / 3600``
    - ``'all_stop_latlon'``: ``lat,lon`` of every stop, joined with ``;``
    - ``'all_stop_code'``, ``'all_stop_id'``, ``'all_stop_desc_city'``:
      the value of every stop joined with ``;``; a missing value
      contributes an empty string so positions stay aligned
    - ``'distance'``: maximum ``shape_dist_traveled`` over the trip's
      stop_times rows
    - ``'speed'``: ``distance / duration / 1000``

    Notes
    -----
    - Reads ``feed.trips``, ``feed.routes``, ``feed.agency``,
      ``feed.stop_times`` and ``feed.stops``, plus whatever
      ``gtfstk.build_geometry_by_stop`` needs.
    - Distances are never computed from shapes here; only the
      ``shape_dist_traveled`` column of ``feed.stop_times`` is used.
    """
    # One row per (trip, stop), carrying route, agency, stop and zone
    # attributes.  departure_time is used as stored; a conversion through
    # hp.timestr_to_seconds existed here once but was left disabled.
    f = feed.trips
    f = (f[['route_id', 'trip_id', 'direction_id', 'shape_id']]
         .merge(feed.routes[['route_id', 'route_short_name',
                             'route_long_name', 'route_type',
                             'agency_id', 'route_desc']])
         .merge(feed.agency[['agency_id', 'agency_name']],
                how='left', on='agency_id')
         .merge(feed.stop_times)
         .merge(feed.stops[['stop_id', 'stop_name', 'stop_lat', 'stop_lon',
                            'stop_code', 'stop_desc']])
         .merge(zones, how='left')
         .sort_values(['trip_id', 'stop_sequence'])
         )

    # parse route_desc
    f[['route_mkt', 'route_direction', 'route_alternative']] = (
        f['route_desc'].str.split('-', expand=True))
    f = f.drop('route_desc', axis=1)

    # parse stop_desc: one named group per labelled field, in dict order.
    stop_desc_fields = {'street': 'רחוב',
                        'city': 'עיר',
                        'platform': 'רציף',
                        'floor': 'קומה'}
    stop_desc_prefix = 'stop_desc_'
    STOP_DESC_RE = ''.join(
        fld + f':(?P<{stop_desc_prefix + n}>.*)'
        for n, fld in stop_desc_fields.items())

    # Rows whose stop_desc does not match the pattern get NaN in every
    # stop_desc_* column.
    sd = f.stop_desc.str.extract(STOP_DESC_RE).apply(lambda x: x.str.strip())
    f = pd.concat([f, sd], axis=1)

    # get geometry by stop for distance measurement
    geometry_by_stop = gtfstk.build_geometry_by_stop(feed, use_utm=True)

    g = f.groupby('trip_id')

    # Attributes that are taken from the first row of each trip.
    first_row_keys = (
        'route_id', 'route_short_name', 'route_long_name',
        'route_mkt', 'route_direction', 'route_alternative',
        'agency_id', 'agency_name', 'route_type',
        'direction_id', 'shape_id',
    )
    # Stop attributes reported as start_<key> / end_<key> pairs.
    paired_stop_keys = ('stop_id', 'stop_code', 'stop_name', 'stop_desc')
    # Stop attributes reported for every stop of the trip as all_<key>.
    all_stop_keys = ('stop_code', 'stop_id', 'stop_desc_city')

    def _join_values(values):
        # A plain ';'.join(values) raises TypeError on any non-str item,
        # e.g. the NaN of an unparsed stop description.  Missing values
        # become '' and everything else goes through str(); for all-string
        # input the result is unchanged.
        return ';'.join('' if pd.isnull(v) else str(v) for v in values)

    def my_agg(group):
        d = OrderedDict()
        for key in first_row_keys:
            d[key] = group[key].iat[0]

        d['num_stops'] = group.shape[0]
        d['start_time'] = group['departure_time'].iat[0]
        d['end_time'] = group['departure_time'].iat[-1]

        for key in paired_stop_keys:
            d[f'start_{key}'] = group[key].iat[0]
            d[f'end_{key}'] = group[key].iat[-1]

        # Coordinates are grouped by end point (start lat/lon, then end
        # lat/lon) rather than interleaved, to keep the column order.
        d['start_stop_lat'] = group['stop_lat'].iat[0]
        d['start_stop_lon'] = group['stop_lon'].iat[0]
        d['end_stop_lat'] = group['stop_lat'].iat[-1]
        d['end_stop_lon'] = group['stop_lon'].iat[-1]

        d['start_stop_city'] = group['stop_desc_city'].iat[0]
        d['end_stop_city'] = group['stop_desc_city'].iat[-1]
        d['start_zone'] = group['zone_name'].iat[0]
        d['end_zone'] = group['zone_name'].iat[-1]
        d['num_zones'] = group.zone_name.nunique()
        d['num_zones_missing'] = group.zone_name.isnull().sum()

        # A trip whose end stop is (almost) at its start stop is a loop.
        dist = geometry_by_stop[d['start_stop_id']].distance(
            geometry_by_stop[d['end_stop_id']])
        d['is_loop'] = int(dist < 400)

        d['duration'] = (d['end_time'] - d['start_time']) / 3600

        d['all_stop_latlon'] = ';'.join(
            str(x) + ',' + str(y)
            for x, y in zip(group['stop_lat'].tolist(),
                            group['stop_lon'].tolist()))
        for key in all_stop_keys:
            d[f'all_{key}'] = _join_values(group[key].tolist())

        return pd.Series(d)

    h = g.apply(my_agg)
    h['distance'] = g.shape_dist_traveled.max()

    # Reset index and compute final stats
    h = h.reset_index()
    h['speed'] = h['distance'] / h['duration'] / 1000
    h[['start_time', 'end_time']] = (
        h[['start_time', 'end_time']].applymap(
            lambda x: gtfstk.helpers.timestr_to_seconds(x, inverse=True))
    )

    return h