Example #1
0
def generate_trip_stats_aggregation(feed):
    """Return a function that condenses one trip's stop rows into a stats Series.

    The stop geometries are built once here, through
    ``gtfstk.build_geometry_by_stop(feed, use_utm=True)``, and captured by the
    returned closure so they are not rebuilt for every trip.

    Parameters
    ----------
    feed : feed object accepted by ``gtfstk.build_geometry_by_stop``

    Returns
    -------
    function
        ``trip_stats_aggregation(group)``.  ``group`` is a DataFrame with the
        rows of a single trip; the first and last rows are read positionally,
        so the rows are expected to be ordered from first to last stop.  The
        function returns a ``pd.Series`` holding, in this order:

        - the route/trip attributes listed in ``keys`` (taken from the first row)
        - ``num_stops``: number of rows in the group
        - ``start_time`` / ``end_time``: first and last ``departure_time``
        - ``start_<key>`` / ``end_<key>`` for ``stop_id``, ``stop_code``,
          ``stop_name``, ``stop_desc``, ``stop_lat`` and ``stop_lon``
        - ``start_stop_city`` / ``end_stop_city`` (from ``stop_desc_city``)
        - ``start_zone`` / ``end_zone`` (from ``zone_name``)
        - ``num_zones``: number of distinct zone names
        - ``num_zones_missing``: number of rows with a null zone name
        - ``is_loop``: 1 if the first and last stop geometries are less than
          400 apart (presumably metres, given ``use_utm=True``), else 0
        - ``duration``: ``(end_time - start_time) / 3600`` (hours, assuming
          ``departure_time`` is in seconds)
        - ``all_stop_latlon``: ``lat,lon`` pairs joined with ``;``
        - ``all_stop_code``, ``all_stop_id``, ``all_stop_desc_city``,
          ``all_stop_name``: per-stop values joined with ``;``; a missing
          value contributes an empty token.
    """
    # get geometry by stop for distance measurement
    geometry_by_stop = gtfstk.build_geometry_by_stop(feed, use_utm=True)

    def _join_values(values):
        # str.join raises TypeError on NaN/None entries (e.g. a stop whose
        # description yielded no city).  Emit an empty token instead so the
        # positions stay aligned across all the all_* columns.
        return ';'.join('' if pd.isnull(v) else str(v) for v in values)

    def trip_stats_aggregation(group):
        d = OrderedDict()

        # Attributes that are constant along a trip: read them off the first row.
        keys = [
            'route_id', 'route_short_name', 'route_long_name', 'route_mkt', 'route_direction',
            'route_alternative', 'agency_id', 'agency_name', 'route_type', 'direction_id',
            'shape_id', 'line_type', 'line_type_desc', 'cluster_id',
            'cluster_name', 'cluster_sub_desc',
        ]
        for key in keys:
            d[key] = group[key].iat[0]

        d['num_stops'] = group.shape[0]

        d['start_time'] = group['departure_time'].iat[0]
        d['end_time'] = group['departure_time'].iat[-1]

        keys_for_start_and_end = [
            'stop_id', 'stop_code', 'stop_name', 'stop_desc', 'stop_lat', 'stop_lon',
        ]

        keys_for_all = [
            'stop_code', 'stop_id', 'stop_desc_city', 'stop_name',
        ]

        for key in keys_for_start_and_end:
            d[f'start_{key}'] = group[key].iat[0]
            d[f'end_{key}'] = group[key].iat[-1]

        d['start_stop_city'] = group['stop_desc_city'].iat[0]
        d['end_stop_city'] = group['stop_desc_city'].iat[-1]
        d['start_zone'] = group['zone_name'].iat[0]
        d['end_zone'] = group['zone_name'].iat[-1]

        d['num_zones'] = group.zone_name.nunique()
        d['num_zones_missing'] = group.zone_name.isnull().sum()

        # A trip counts as a loop when its two end stops are (almost) the same place.
        dist = geometry_by_stop[d['start_stop_id']].distance(
            geometry_by_stop[d['end_stop_id']])
        d['is_loop'] = int(dist < 400)
        d['duration'] = (d['end_time'] - d['start_time']) / 3600

        d['all_stop_latlon'] = ';'.join(str(x) + ',' + str(y) for x, y in
                                        zip(group['stop_lat'].tolist(), group['stop_lon'].tolist()))

        for key in keys_for_all:
            d[f'all_{key}'] = _join_values(group[key].tolist())

        return pd.Series(d)

    return trip_stats_aggregation
Example #2
0
def compute_trip_stats_partridge(feed, zones):
    """
    Compute one row of statistics per trip.

    Parameters
    ----------
    feed : partridge feed
        ``feed.trips``, ``feed.routes``, ``feed.agency``, ``feed.stop_times``
        and ``feed.stops`` are read; the feed is also handed to
        ``gtfstk.build_geometry_by_stop``.
    zones : DataFrame with stop_code to zone_name mapping
        Left-merged onto the stop rows on the columns it shares with them;
        must provide a ``zone_name`` column.

    Returns
    -------
    DataFrame with the following columns:

    - ``'trip_id'``
    - ``'route_id'``
    - ``'route_short_name'``
    - ``'route_long_name'``
    - ``'route_mkt'``, ``'route_direction'``, ``'route_alternative'``: the
      three ``'-'``-separated parts of ``route_desc``
    - ``'agency_id'``
    - ``'agency_name'``
    - ``'route_type'``
    - ``'direction_id'``
    - ``'shape_id'``
    - ``'num_stops'``: number of stops on trip
    - ``'start_time'``: first departure time of the trip
    - ``'end_time'``: last departure time of the trip
    - ``'start_stop_id'`` / ``'end_stop_id'``: stop ID of the first / last stop
    - ``'start_stop_code'`` / ``'end_stop_code'``: stop code of the first /
      last stop
    - ``'start_stop_name'`` / ``'end_stop_name'``: stop name of the first /
      last stop
    - ``'start_stop_desc'`` / ``'end_stop_desc'``: stop description of the
      first / last stop
    - ``'start_stop_lat'``, ``'start_stop_lon'``: position of the first stop
    - ``'end_stop_lat'``, ``'end_stop_lon'``: position of the last stop
    - ``'start_zone'`` / ``'end_zone'``: zone name of the first / last stop
    - ``'num_zones'``: number of distinct zone names along the trip
    - ``'num_zones_missing'``: number of stops without a zone name
    - ``'is_loop'``: 1 if the start and end stop are less than 400m apart and
      0 otherwise
    - ``'duration'``: ``(end_time - start_time) / 3600``, i.e. hours when
      departure times are in seconds
    - ``'distance'``: largest ``shape_dist_traveled`` value of the trip
    - ``'speed'``: ``distance / duration / 1000``

    Notes
    -----
    - The distance is taken from ``shape_dist_traveled`` only; it is never
      computed from ``feed.shapes``.
    - ``start_time`` and ``end_time`` are passed through
      ``gtfstk.helpers.timestr_to_seconds(..., inverse=True)`` before being
      returned (presumably turning seconds into time strings).
    - ``route_desc`` must split into exactly three ``'-'``-separated parts,
      since the result is assigned to three columns.
    """
    trip_columns = ['route_id', 'trip_id', 'direction_id', 'shape_id']
    route_columns = ['route_id', 'route_short_name', 'route_long_name',
                     'route_type', 'agency_id', 'route_desc']
    agency_columns = ['agency_id', 'agency_name']
    stop_columns = ['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'stop_code', 'stop_desc']

    # One row per (trip, stop), enriched step by step and ordered along each trip.
    # departure_time is used numerically further down; no conversion is applied
    # here (presumably partridge already delivers it as seconds).
    rows = feed.trips[trip_columns].merge(feed.routes[route_columns])
    rows = rows.merge(feed.agency[agency_columns], how='left', on='agency_id')
    rows = rows.merge(feed.stop_times)
    rows = rows.merge(feed.stops[stop_columns])
    rows = rows.merge(zones, how='left')
    rows = rows.sort_values(['trip_id', 'stop_sequence'])

    # Break route_desc into its three components, then discard the raw text.
    route_desc_parts = rows['route_desc'].str.split('-', expand=True)
    rows[['route_mkt', 'route_direction', 'route_alternative']] = route_desc_parts
    rows = rows.drop(columns='route_desc')

    geometry_by_stop = gtfstk.build_geometry_by_stop(feed, use_utm=True)

    # Constant along a trip: taken from the first row only.
    first_row_fields = (
        'route_id', 'route_short_name', 'route_long_name', 'route_mkt',
        'route_direction', 'route_alternative', 'agency_id', 'agency_name',
        'route_type', 'direction_id', 'shape_id',
    )
    # Reported for both the first and the last stop, as start_* / end_*.
    endpoint_fields = ('stop_id', 'stop_code', 'stop_name', 'stop_desc')

    def summarise_trip(trip_rows):
        """Reduce the ordered stop rows of one trip to a Series of stats."""
        stats = OrderedDict(
            (name, trip_rows[name].iat[0]) for name in first_row_fields
        )
        stats['num_stops'] = len(trip_rows)

        departures = trip_rows['departure_time']
        stats['start_time'] = departures.iat[0]
        stats['end_time'] = departures.iat[-1]

        for name in endpoint_fields:
            column = trip_rows[name]
            stats['start_' + name] = column.iat[0]
            stats['end_' + name] = column.iat[-1]

        # Coordinates are grouped per end point (lat, lon), not per field.
        for prefix, position in (('start', 0), ('end', -1)):
            stats[prefix + '_stop_lat'] = trip_rows['stop_lat'].iat[position]
            stats[prefix + '_stop_lon'] = trip_rows['stop_lon'].iat[position]

        zone_names = trip_rows['zone_name']
        stats['start_zone'] = zone_names.iat[0]
        stats['end_zone'] = zone_names.iat[-1]
        stats['num_zones'] = zone_names.nunique()
        stats['num_zones_missing'] = zone_names.isnull().sum()

        first_geometry = geometry_by_stop[stats['start_stop_id']]
        last_geometry = geometry_by_stop[stats['end_stop_id']]
        stats['is_loop'] = int(first_geometry.distance(last_geometry) < 400)
        stats['duration'] = (stats['end_time'] - stats['start_time']) / 3600
        return pd.Series(stats)

    trips = rows.groupby('trip_id')
    result = trips.apply(summarise_trip)
    result['distance'] = trips['shape_dist_traveled'].max()

    # Reset index and compute final stats
    result = result.reset_index()
    result['speed'] = result['distance'] / result['duration'] / 1000

    def convert_time(value):
        return gtfstk.helpers.timestr_to_seconds(value, inverse=True)

    time_columns = ['start_time', 'end_time']
    result[time_columns] = result[time_columns].applymap(convert_time)
    return result
Example #3
0
def compute_trip_stats_partridge(feed, zones):
    """
    Compute one row of statistics per trip, including parsed stop descriptions.

    Parameters
    ----------
    feed : partridge feed
        ``feed.trips``, ``feed.routes``, ``feed.agency``, ``feed.stop_times``
        and ``feed.stops`` are read; the feed is also handed to
        ``gtfstk.build_geometry_by_stop``.
    zones : DataFrame with stop_code to zone_name mapping
        Left-merged onto the stop rows on the columns it shares with them;
        must provide a ``zone_name`` column.

    Returns
    -------
    DataFrame with the following columns:

    - ``'trip_id'``
    - ``'route_id'``
    - ``'route_short_name'``
    - ``'route_long_name'``
    - ``'route_mkt'``, ``'route_direction'``, ``'route_alternative'``: the
      three ``'-'``-separated parts of ``route_desc``
    - ``'agency_id'``
    - ``'agency_name'``
    - ``'route_type'``
    - ``'direction_id'``
    - ``'shape_id'``
    - ``'num_stops'``: number of stops on trip
    - ``'start_time'``: first departure time of the trip
    - ``'end_time'``: last departure time of the trip
    - ``'start_stop_id'`` / ``'end_stop_id'``: stop ID of the first / last stop
    - ``'start_stop_code'`` / ``'end_stop_code'``: stop code of the first /
      last stop
    - ``'start_stop_name'`` / ``'end_stop_name'``: stop name of the first /
      last stop
    - ``'start_stop_desc'`` / ``'end_stop_desc'``: stop description of the
      first / last stop
    - ``'start_stop_lat'``, ``'start_stop_lon'``: position of the first stop
    - ``'end_stop_lat'``, ``'end_stop_lon'``: position of the last stop
    - ``'start_stop_city'`` / ``'end_stop_city'``: city parsed from the
      description of the first / last stop
    - ``'start_zone'`` / ``'end_zone'``: zone name of the first / last stop
    - ``'num_zones'``: number of distinct zone names along the trip
    - ``'num_zones_missing'``: number of stops without a zone name
    - ``'is_loop'``: 1 if the start and end stop are less than 400m apart and
      0 otherwise
    - ``'duration'``: ``(end_time - start_time) / 3600``, i.e. hours when
      departure times are in seconds
    - ``'all_stop_latlon'``: ``lat,lon`` of every stop, joined with ``;``
    - ``'all_stop_code'``, ``'all_stop_id'``, ``'all_stop_desc_city'``:
      per-stop values joined with ``;``; a missing value (for example a stop
      whose description does not match the expected pattern) contributes an
      empty token
    - ``'distance'``: largest ``shape_dist_traveled`` value of the trip
    - ``'speed'``: ``distance / duration / 1000``

    Notes
    -----
    - The distance is taken from ``shape_dist_traveled`` only; it is never
      computed from ``feed.shapes``.
    - ``start_time`` and ``end_time`` are passed through
      ``gtfstk.helpers.timestr_to_seconds(..., inverse=True)`` before being
      returned (presumably turning seconds into time strings).
    - ``route_desc`` must split into exactly three ``'-'``-separated parts,
      since the result is assigned to three columns.
    """
    f = feed.trips
    f = (f[['route_id', 'trip_id', 'direction_id', 'shape_id']]
         .merge(feed.routes[['route_id', 'route_short_name', 'route_long_name',
                             'route_type', 'agency_id', 'route_desc']])
         .merge(feed.agency[['agency_id', 'agency_name']], how='left', on='agency_id')
         .merge(feed.stop_times)
         .merge(feed.stops[['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'stop_code', 'stop_desc']])
         .merge(zones, how='left')
         .sort_values(['trip_id', 'stop_sequence'])
         )

    # parse route_desc
    f[['route_mkt', 'route_direction', 'route_alternative']] = f['route_desc'].str.split('-', expand=True)
    f = f.drop('route_desc', axis=1)

    # parse stop_desc
    # The labels must appear in the pattern in exactly this order, so keep them
    # in an explicitly ordered sequence rather than relying on dict ordering.
    stop_desc_fields = (('street', 'רחוב'),
                        ('city', 'עיר'),
                        ('platform', 'רציף'),
                        ('floor', 'קומה'))

    stop_desc_prefix = 'stop_desc_'

    # One named group per field: '<label>:(?P<stop_desc_<name>>.*)', concatenated.
    stop_desc_re = ''.join(
        f'{fld}:(?P<{stop_desc_prefix + n}>.*)' for n, fld in stop_desc_fields
    )

    # Rows whose stop_desc does not match come back as NaN in every new column.
    sd = f.stop_desc.str.extract(stop_desc_re).apply(lambda x: x.str.strip())
    f = pd.concat([f, sd], axis=1)

    # get geometry by stop for distance measurement
    geometry_by_stop = gtfstk.build_geometry_by_stop(feed, use_utm=True)

    g = f.groupby('trip_id')

    def _join_values(values):
        # str.join raises TypeError on NaN/None entries, which stop_desc_city
        # contains for every unparsable description.  Emit an empty token
        # instead so positions stay aligned across the all_* columns.
        return ';'.join('' if pd.isnull(v) else str(v) for v in values)

    def my_agg(group):
        d = OrderedDict()
        d['route_id'] = group['route_id'].iat[0]
        d['route_short_name'] = group['route_short_name'].iat[0]
        d['route_long_name'] = group['route_long_name'].iat[0]
        d['route_mkt'] = group['route_mkt'].iat[0]
        d['route_direction'] = group['route_direction'].iat[0]
        d['route_alternative'] = group['route_alternative'].iat[0]
        d['agency_id'] = group['agency_id'].iat[0]
        d['agency_name'] = group['agency_name'].iat[0]
        d['route_type'] = group['route_type'].iat[0]
        d['direction_id'] = group['direction_id'].iat[0]
        d['shape_id'] = group['shape_id'].iat[0]
        d['num_stops'] = group.shape[0]
        d['start_time'] = group['departure_time'].iat[0]
        d['end_time'] = group['departure_time'].iat[-1]
        d['start_stop_id'] = group['stop_id'].iat[0]
        d['end_stop_id'] = group['stop_id'].iat[-1]
        d['start_stop_code'] = group['stop_code'].iat[0]
        d['end_stop_code'] = group['stop_code'].iat[-1]
        d['start_stop_name'] = group['stop_name'].iat[0]
        d['end_stop_name'] = group['stop_name'].iat[-1]
        d['start_stop_desc'] = group['stop_desc'].iat[0]
        d['end_stop_desc'] = group['stop_desc'].iat[-1]
        d['start_stop_lat'] = group['stop_lat'].iat[0]
        d['start_stop_lon'] = group['stop_lon'].iat[0]
        d['end_stop_lat'] = group['stop_lat'].iat[-1]
        d['end_stop_lon'] = group['stop_lon'].iat[-1]
        d['start_stop_city'] = group['stop_desc_city'].iat[0]
        d['end_stop_city'] = group['stop_desc_city'].iat[-1]
        d['start_zone'] = group['zone_name'].iat[0]
        d['end_zone'] = group['zone_name'].iat[-1]
        d['num_zones'] = group.zone_name.nunique()
        d['num_zones_missing'] = group.zone_name.isnull().sum()

        # A trip counts as a loop when its two end stops are (almost) the same place.
        dist = geometry_by_stop[d['start_stop_id']].distance(
            geometry_by_stop[d['end_stop_id']])
        d['is_loop'] = int(dist < 400)
        d['duration'] = (d['end_time'] - d['start_time']) / 3600

        d['all_stop_latlon'] = ';'.join(str(x) + ',' + str(y) for x, y in
                                        zip(group['stop_lat'].tolist(), group['stop_lon'].tolist()))

        d['all_stop_code'] = _join_values(group['stop_code'].tolist())
        d['all_stop_id'] = _join_values(group['stop_id'].tolist())
        d['all_stop_desc_city'] = _join_values(group['stop_desc_city'].tolist())

        return pd.Series(d)

    h = g.apply(my_agg)
    # Both sides are indexed by trip_id here, so the assignment aligns per trip.
    h['distance'] = g.shape_dist_traveled.max()

    # Reset index and compute final stats
    h = h.reset_index()
    h['speed'] = h['distance'] / h['duration'] / 1000
    h[['start_time', 'end_time']] = (
        h[['start_time', 'end_time']].applymap(
            lambda x: gtfstk.helpers.timestr_to_seconds(x, inverse=True))
    )
    return h