def _linearly_interpolate_infill_times(
        stop_times_orig_df: pd.DataFrame,
        use_multiprocessing: bool) -> pd.DataFrame:
    """Infill missing stop times for every trip and return a new DataFrame.

    Each unique trip id in the stop times table is handed to a
    TripTimesInterpolator (via a manager proxy and process pool when
    multiprocessing is requested, serially otherwise). The per-trip
    results are then stacked back into a single DataFrame whose columns
    match the input, with ``trip_id`` moved to the final position.
    """
    # Never mutate the caller's DataFrame
    stops_times_df = stop_times_orig_df.copy()

    # Every distinct trip id that has stop time rows attached
    target_trip_ids = stops_times_df['trip_id'].unique().tolist()

    # Track elapsed runtime for the log report below
    start_time = time.time()

    if use_multiprocessing is True:
        cpu_count = mp.cpu_count()
        log('Running parallelized trip times interpolation on '
            '{} processes'.format(cpu_count))

        # The manager proxy shares one interpolator across worker processes
        manager = make_trip_time_interpolator_manager()
        trip_times_interpolator = manager.TripTimesInterpolator(stops_times_df)

        with mp.Pool(processes=cpu_count) as pool:
            results = pool.starmap(
                _trip_times_interpolator_pool_map,
                [(trip_times_interpolator, trip_id)
                 for trip_id in target_trip_ids])
    else:
        log('Running serialized trip times interpolation '
            '(no parallelization)')
        trip_times_interpolator = TripTimesInterpolator(stops_times_df)
        results = [trip_times_interpolator.generate_infilled_times(trip_id)
                   for trip_id in target_trip_ids]

    elapsed = round(time.time() - start_time, 2)
    log('Trip times interpolation complete. Execution time: {}s'.format(
        elapsed))

    # Flatten each per-trip frame into plain row lists; cheaper than
    # repeated DataFrame-to-DataFrame append operations
    stacked_rows = []
    for trip_frame in results:
        stacked_rows.extend(trip_frame.values.tolist())

    # Rebuild the column order with trip_id shifted to the end, matching
    # the row layout produced by the interpolator
    reordered_cols = [c for c in stops_times_df.columns.values.tolist()
                      if c != 'trip_id'] + ['trip_id']

    # Convert the stacked row matrix back into a pandas DataFrame
    infilled_df = pd.DataFrame(stacked_rows, columns=reordered_cols)
    return infilled_df.reset_index(drop=True)
def _generate_route_processing_results(
        target_route_ids: List,
        target_time_start: int,
        target_time_end: int,
        ftrips: pd.DataFrame,
        stop_times: pd.DataFrame,
        feed_stops: pd.DataFrame,
        stop_cost_method: Any,
        use_multiprocessing: bool) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Run route costing for each route id and stack the results.

    A RouteProcessor is built over the supplied trips/stop times/stops
    (through a manager proxy and process pool when multiprocessing is
    requested, serially otherwise) and applied to every route id. The
    per-route (wait times, edge costs) pairs are flattened into two
    DataFrames.

    Parameters
    ----------
    target_route_ids : List
        Route ids to cost
    target_time_start : int
        Start of the analysis window (seconds)
    target_time_end : int
        End of the analysis window (seconds)
    ftrips : pd.DataFrame
        Feed trips subset used by the processor
    stop_times : pd.DataFrame
        Stop times used by the processor
    feed_stops : pd.DataFrame
        Feed stops used by the processor
    stop_cost_method : Any
        Stop cost strategy forwarded to RouteProcessor
    use_multiprocessing : bool
        Whether to parallelize over all available CPUs

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        (edge costs DataFrame, wait times DataFrame)
    """
    # Track the runtime of this method
    start_time = time.time()

    if use_multiprocessing is True:
        cpu_count = mp.cpu_count()
        log('Running parallelized route costing on '
            '{} processes'.format(cpu_count))

        manager = make_route_processor_manager()
        route_analyzer = manager.RouteProcessor(
            target_time_start,
            target_time_end,
            ftrips,
            stop_times,
            feed_stops,
            stop_cost_method)

        with mp.Pool(processes=cpu_count) as pool:
            results = pool.starmap(
                _route_analyzer_pool_map,
                [(route_analyzer, route_id)
                 for route_id in target_route_ids])
    else:
        log('Running serialized route costing (no parallelization)')
        route_analyzer = RouteProcessor(
            target_time_start,
            target_time_end,
            ftrips,
            stop_times,
            feed_stops,
            stop_cost_method)
        results = [route_analyzer.generate_route_costs(rid)
                   for rid in target_route_ids]

    elapsed = round(time.time() - start_time, 2)
    log('Route costing complete. Execution time: {}s'.format(elapsed))

    # First, create a 2-dimensional matrix for each of the output series
    all_edge_costs = []
    all_wait_times = []
    for tst_sub, edge_costs in results:
        # Skip empty or invalid results. Fix: use isinstance() instead of
        # `type(...) is pd.DataFrame` — idiomatic, and also accepts
        # DataFrame subclasses
        if isinstance(edge_costs, pd.DataFrame) and not edge_costs.empty:
            # Resume the expected adding of each list result to the matrices
            all_edge_costs.extend(edge_costs.values.tolist())

        # And again, for the other dataframe
        if isinstance(tst_sub, pd.DataFrame) and not tst_sub.empty:
            all_wait_times.extend(tst_sub.values.tolist())

    # Convert matrices to a pandas DataFrame again
    all_edge_costs_columns = ['edge_cost', 'from_stop_id', 'to_stop_id']
    all_edge_costs_new_df = pd.DataFrame(all_edge_costs,
                                         columns=all_edge_costs_columns)

    all_wait_times_columns = ['stop_id', 'wait_dir_0', 'wait_dir_1']
    all_wait_times_new_df = pd.DataFrame(all_wait_times,
                                         columns=all_wait_times_columns)

    return (all_edge_costs_new_df, all_wait_times_new_df)
def generate_summary_wait_times(
        df: pd.DataFrame,
        fallback_stop_cost: float) -> pd.DataFrame:
    """
    Calculate the average wait time at each stop, given all observed waits.

    Parameters
    ----------
    df : pd.DataFrame
        A DataFrame of wait times in both directions (0 and 1) for a \
        specific stop id
    fallback_stop_cost : float
        A fallback wait time (in seconds) if there is not enough information \
        (e.g. discrete arrival times scheduled) to calculate a headway for a \
        specific transit stop

    Returns
    -------
    summary_wait_times : pd.DataFrame
        Returns a DataFrame of the estimated wait time (boarding cost) for \
        each stop given the wait times associated with that stop in the \
        schedule timeframe
    """
    df_sub = df[['stop_id', 'wait_dir_0', 'wait_dir_1']].reset_index(drop=True)
    # Remember the full stop id set so unresolved stops can be
    # back-filled with the fallback cost at the end
    init_of_stop_ids = df_sub['stop_id'].unique()

    # Default values for average waits with not enough data should be
    # NaN types, but let's make sure all null types are NaNs to be safe
    for col in ['wait_dir_0', 'wait_dir_1']:
        mask = df_sub[col].isnull()
        df_sub.loc[mask, col] = np.nan

        # Convert anything that is 0 or less seconds to a NaN as well
        # to remove negative or 0 second waits in the system
        over_zero_mask = df_sub[col] > 0
        df_sub.loc[~over_zero_mask, col] = np.nan

        # With all null types converted to NaN, we can cast col as float
        df_sub[col] = df_sub[col].astype(float)

    # Clean out the None values
    dir_0_mask = ~np.isnan(df_sub.wait_dir_0)
    dir_1_mask = ~np.isnan(df_sub.wait_dir_1)

    # We can't include values where both directions
    # have NaNs at same time; keep any stop that has at least one
    # non-NaN direction (union of the two id sets)
    d0_ids = df_sub[dir_0_mask].stop_id.unique()
    d1_ids = df_sub[dir_1_mask].stop_id.unique()
    keep_ids = list(d0_ids) + list(d1_ids)
    df_sub_clean = df_sub[df_sub.stop_id.isin(keep_ids)]

    orig_len = len(df_sub)
    new_len = len(df_sub_clean)
    if not new_len == orig_len:
        log(('Cleaned out bi-directional NaN values from '
             'stop IDs. From {} to {}.'.format(orig_len, new_len)))

    # And now replace df_sub
    df_sub = df_sub_clean

    # Recheck all for NaNs; where one direction is missing, borrow the
    # wait from the opposite direction
    dir_0_mask_2 = np.isnan(df_sub.wait_dir_0)
    dir_1_mask_2 = np.isnan(df_sub.wait_dir_1)
    df_sub.loc[dir_0_mask_2, 'wait_dir_0'] = df_sub.wait_dir_1
    df_sub.loc[dir_1_mask_2, 'wait_dir_1'] = df_sub.wait_dir_0

    # TODO: All this pruning is a mess, needs to be
    #       organized much better

    # One more time to drop out the subset that are NaN
    # from a given stop id
    dir_0_mask_3 = ~np.isnan(df_sub.wait_dir_0)
    df_sub = df_sub[dir_0_mask_3]
    dir_1_mask_3 = ~np.isnan(df_sub.wait_dir_1)
    df_sub = df_sub[dir_1_mask_3]

    # Make sure that there are no None values left
    dir_0_check_2 = df_sub[np.isnan(df_sub.wait_dir_0)]
    dir_1_check_2 = df_sub[np.isnan(df_sub.wait_dir_1)]

    dir_0_trigger = len(dir_0_check_2) > 0
    dir_1_trigger = len(dir_1_check_2) > 0
    if dir_0_trigger or dir_1_trigger:
        raise InvalidParsedWaitTimes(
            'NaN values for both directions on some stop IDs.')

    # At this point, we should make sure that there are still values
    # in the DataFrame - otherwise we are in a situation where there are
    # no valid times to evaluate. This is okay; we just need to skip straight
    # to the application of the fallback value
    if df_sub.empty:
        # So just make a fallback empty dataframe for now
        summed_reset = pd.DataFrame({'stop_id': [], 'avg_cost': []})

    # Only attempt this group by summary if at least one row to group on
    else:
        grouped = df_sub.groupby('stop_id')
        summarized = grouped.apply(summarize_waits_at_one_stop)

        # Clean up summary results, reformat pandas DataFrame result
        summed_reset = _format_summarized_outputs(summarized)

    end_of_stop_ids = summed_reset.stop_id.unique()
    log('Original stop id count: {}'.format(len(init_of_stop_ids)))
    log('After cleaning stop id count: {}'.format(len(end_of_stop_ids)))

    # Check for the presence of any unresolved stop ids and
    # assign them some value boarding cost
    if len(init_of_stop_ids) > len(end_of_stop_ids):
        a = set(list(init_of_stop_ids))
        b = set(list(end_of_stop_ids))
        unresolved_ids = list(a - b)
        log('Some unaccounted for stop ids. '
            'Resolving {}...'.format(len(unresolved_ids)))

        # TODO: Perhaps these are start/end stops and should adopt
        #       a cost that is "average" for that route?
        #       I should think of how to actually do this
        #       because we do not have enough data, for now let's
        #       just assign some default high cost connection value
        #       to these stops
        sids = list(summed_reset.stop_id)
        acst = list(summed_reset.avg_cost)
        for i in unresolved_ids:
            sids.append(i)
            acst.append(fallback_stop_cost)

        # Rebuild the dataframe
        summed_reset = pd.DataFrame({'stop_id': sids, 'avg_cost': acst})

    return summed_reset
def generate_route_costs(self, route_id: str):
    """Compute per-stop wait times and per-edge costs for one route.

    Subsets the processor's trips and stop times to this route and the
    configured time window, merges in stop attributes, derives wait
    times per direction, and returns a (wait times, edge costs) pair.
    """
    # All trips attached to this route
    route_trips = self.trips.loc[route_id].copy()

    # A single matching trip comes back from pandas as a Series;
    # coerce it to a one-row DataFrame so downstream merges work
    if isinstance(route_trips, pd.Series):
        route_trips = route_trips.to_frame().T

    # Stop times limited to this route's trips...
    trip_membership = self.stop_times.trip_id.isin(route_trips.trip_id)
    route_stimes = self.stop_times[trip_membership].copy()

    # ...and then further limited to the analysis time window
    in_window = ((route_stimes.arrival_time >= self.target_time_start) &
                 (route_stimes.arrival_time <= self.target_time_end))
    windowed_stimes = route_stimes[in_window]

    # Report on progress if requested
    before_count = len(route_stimes.trip_id.unique())
    after_count = len(windowed_stimes.trip_id.unique())
    log('\tReduced selected trips on route {} from {} to {}.'.format(
        route_id, before_count, after_count))

    # Join trips to their stop times, then attach stop attributes,
    # and order chronologically within each stop sequence
    joined = pd.merge(route_trips, windowed_stimes,
                      how='inner', on='trip_id')
    joined = pd.merge(joined, self.all_stops.copy(),
                      how='inner', on='stop_id')
    joined = joined.sort_values(
        ['stop_sequence', 'arrival_time', 'departure_time'])

    # Check direction_id column value before using the joined frame to
    # generate wait and edge costs. Handling this at the route level lets
    # peartree keep direction_id for a route with full coverage even when
    # another route's rows are incomplete.
    if 'direction_id' in joined:
        # Drop the column entirely if any of its values are NaN
        missing_direction = joined['direction_id'].isnull()
        if len(joined[missing_direction]) > 0:
            joined.drop('direction_id', axis=1, inplace=True)

    wait_times = generate_wait_times(joined)

    # Look up the computed wait for each stop, one column per direction
    joined['wait_dir_0'] = joined['stop_id'].apply(
        lambda sid: wait_times[0][sid])
    joined['wait_dir_1'] = joined['stop_id'].apply(
        lambda sid: wait_times[1][sid])

    tst_sub = joined[['stop_id', 'wait_dir_0', 'wait_dir_1']]

    # Get all edge costs for this route and add to the running total
    edge_costs = generate_all_observed_edge_costs(joined)

    return (tst_sub, edge_costs)
def test_log():
    """Smoke test: log() should accept a plain string without raising."""
    message = 'foo'
    log(message)
def generate_summary_wait_times(
        df: pd.DataFrame,
        fallback_stop_cost: float) -> pd.DataFrame:
    """Calculate the average wait time (boarding cost) for each stop.

    Cleans the per-direction wait observations (nulls and non-positive
    waits become NaN, one-sided NaNs borrow the opposite direction),
    averages them per stop via summarize_waits_at_one_stop, and assigns
    ``fallback_stop_cost`` to any stop id left without a usable value.

    Parameters
    ----------
    df : pd.DataFrame
        Wait times in both directions (0 and 1) per stop id
    fallback_stop_cost : float
        Wait time (seconds) used when a headway cannot be derived

    Returns
    -------
    summed_reset : pd.DataFrame
        Columns ``stop_id`` and ``avg_cost``
    """
    df_sub = df[['stop_id', 'wait_dir_0', 'wait_dir_1']].reset_index(drop=True)
    # Remember the full stop id set so unresolved stops can be
    # back-filled with the fallback cost at the end
    init_of_stop_ids = df_sub.stop_id.unique()

    # Default values for average waits with not enough data should be
    # NaN types, but let's make sure all null types are NaNs to be safe
    for col in ['wait_dir_0', 'wait_dir_1']:
        mask = df_sub[col].isnull()
        df_sub.loc[mask, col] = np.nan

        # Convert anything that is 0 or less seconds to a NaN as well
        # to remove negative or 0 second waits in the system
        df_sub.loc[~(df_sub[col] > 0), col] = np.nan

        # With all null types converted to NaN, we can cast col as float
        df_sub[col] = df_sub[col].astype(float)

    # Clean out the None values
    dir_0_mask = ~np.isnan(df_sub.wait_dir_0)
    dir_1_mask = ~np.isnan(df_sub.wait_dir_1)

    # We can't include values where both directions
    # have NaNs at same time; keep any stop that has at least one
    # non-NaN direction (union of the two id sets)
    d0_ids = df_sub[dir_0_mask].stop_id.unique()
    d1_ids = df_sub[dir_1_mask].stop_id.unique()
    keep_ids = list(d0_ids) + list(d1_ids)
    df_sub_clean = df_sub[df_sub.stop_id.isin(keep_ids)]

    orig_len = len(df_sub)
    new_len = len(df_sub_clean)
    if not new_len == orig_len:
        log(('Cleaned out bi-directional NaN values from '
             'stop IDs. From {} to {}.'.format(orig_len, new_len)))

    # And now replace df_sub
    df_sub = df_sub_clean

    # Recheck all for NaNs; where one direction is missing, borrow the
    # wait from the opposite direction
    dir_0_mask_2 = np.isnan(df_sub.wait_dir_0)
    dir_1_mask_2 = np.isnan(df_sub.wait_dir_1)
    df_sub.loc[dir_0_mask_2, 'wait_dir_0'] = df_sub.wait_dir_1
    df_sub.loc[dir_1_mask_2, 'wait_dir_1'] = df_sub.wait_dir_0

    # TODO: All this pruning is a mess, needs to be
    #       organized much better

    # One more time to drop out the subset that are NaN
    # from a given stop id
    dir_0_mask_3 = ~np.isnan(df_sub.wait_dir_0)
    df_sub = df_sub[dir_0_mask_3]
    dir_1_mask_3 = ~np.isnan(df_sub.wait_dir_1)
    df_sub = df_sub[dir_1_mask_3]

    # Make sure that there are no None values left
    dir_0_check_2 = df_sub[np.isnan(df_sub.wait_dir_0)]
    dir_1_check_2 = df_sub[np.isnan(df_sub.wait_dir_1)]

    if (len(dir_0_check_2) > 0) or (len(dir_1_check_2) > 0):
        raise Exception('NaN values for both directions on some stop IDs.')

    # Average the observed waits per stop and reshape the result
    # into a two-column (stop_id, avg_cost) frame
    grouped = df_sub.groupby('stop_id')
    summarized = grouped.apply(summarize_waits_at_one_stop)

    summed_reset = summarized.reset_index(drop=False)
    summed_reset.columns = ['stop_id', 'avg_cost']

    end_of_stop_ids = summed_reset.stop_id.unique()
    log('Original stop id count: {}'.format(len(init_of_stop_ids)))
    log('After cleaning stop id count: {}'.format(len(end_of_stop_ids)))

    # Check for the presence of any unresolved stop ids and
    # assign them some value boarding cost
    if len(init_of_stop_ids) > len(end_of_stop_ids):
        a = set(list(init_of_stop_ids))
        b = set(list(end_of_stop_ids))
        unresolved_ids = list(a - b)
        log('Some unaccounted for stop ids. '
            'Resolving {}...'.format(len(unresolved_ids)))

        # TODO: Perhaps these are start/end stops and should adopt
        #       a cost that is "average" for that route?
        #       I should think of how to actually do this
        #       because we do not have enough data, for now let's
        #       just assign some default high cost connection value
        #       to these stops
        sids = list(summed_reset.stop_id)
        acst = list(summed_reset.avg_cost)
        for i in unresolved_ids:
            sids.append(i)
            acst.append(fallback_stop_cost)

        # Rebuild the dataframe
        summed_reset = pd.DataFrame({'stop_id': sids, 'avg_cost': acst})

    return summed_reset
def generate_edge_and_wait_values(
        feed: ptg.gtfs.feed,
        target_time_start: int,
        target_time_end: int,
        interpolate_times: bool,
        use_multiprocessing: bool) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Produce edge costs and stop wait times for every route in a feed.

    Parameters
    ----------
    feed : ptg.gtfs.feed
        Partridge feed whose trips, stop times, stops and routes are read
    target_time_start : int
        Start of the analysis window (seconds)
    target_time_end : int
        End of the analysis window (seconds)
    interpolate_times : bool
        Whether to infill NaN stop times via linear interpolation first
    use_multiprocessing : bool
        Whether to parallelize route costing over all available CPUs

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        (all edge costs, all wait times); each is None when there were
        no route results at all
    """
    # Initialize the trips dataframe to be worked with
    ftrips = feed.trips.copy()
    ftrips = ftrips[~ftrips['route_id'].isnull()]

    # Flags whether we interpolate intermediary stops or not
    if interpolate_times:
        # Prepare the stops times dataframe by also infilling
        # all stop times that are NaN with their linearly interpolated
        # values based on their nearest numerically valid neighbors
        stop_times = linearly_interpolate_infill_times(
            feed.stop_times, use_multiprocessing)
    else:
        stop_times = feed.stop_times.copy()

    # TODO: Just like linearly_interpolate_infill_times contains all these
    #       operations neatly in an abstracted method, do the same for the
    #       running of the parallelized route processing
    start_time = time.time()

    target_route_ids = feed.routes.route_id

    if use_multiprocessing is True:
        cpu_count = mp.cpu_count()
        log('Running parallelized route costing on '
            '{} processes'.format(cpu_count))

        manager = make_route_processor_manager()
        route_analyzer = manager.RouteProcessor(
            target_time_start,
            target_time_end,
            ftrips,
            stop_times,
            feed.stops.copy())

        with mp.Pool(processes=cpu_count) as pool:
            results = pool.starmap(
                _route_analyzer_pool_map,
                [(route_analyzer, route_id)
                 for route_id in target_route_ids])
    else:
        log('Running serialized route costing (no parallelization)')
        route_analyzer = RouteProcessor(
            target_time_start,
            target_time_end,
            ftrips,
            stop_times,
            feed.stops.copy())
        results = [route_analyzer.generate_route_costs(rid)
                   for rid in target_route_ids]

    elapsed = round(time.time() - start_time, 2)
    log('Route costing complete. Execution time: {}s'.format(elapsed))

    # Stack all per-route results in a single pass with pd.concat.
    # Fix: DataFrame.append was deprecated in pandas 1.4 and removed in
    # pandas 2.0, and repeated appends were quadratic anyway. Preserve
    # the original behavior of returning None when there are no results.
    wait_frames = [tst_sub for tst_sub, _ in results]
    cost_frames = [edge_costs for _, edge_costs in results]

    all_wait_times = pd.concat(wait_frames) if wait_frames else None
    all_edge_costs = pd.concat(cost_frames) if cost_frames else None

    return (all_edge_costs, all_wait_times)