Esempio n. 1
0
def _stops_in_edge_table_selector(input_stops_df, input_stop_times_df):
    """
    Subset the stops table to the stops referenced by the stop times table

    A ``unique_stop_id`` column (``stop_id`` and ``unique_agency_id`` joined
    with an underscore) is written onto ``input_stops_df`` in place before
    the comparison is made.

    Parameters
    ----------
    input_stops_df : pandas.DataFrame
        stops DataFrame with ``stop_id`` and ``unique_agency_id`` columns
    input_stop_times_df : pandas.DataFrame
        stop_times DataFrame with a ``unique_stop_id`` column

    Returns
    -------
    selected_stops_df : pandas.DataFrame
        rows of ``input_stops_df`` whose ``unique_stop_id`` occurs in
        ``input_stop_times_df``

    """
    start_time = time.time()

    # build the agency-qualified stop id that serves as the comparison key
    agency_suffix = input_stops_df['unique_agency_id'].astype('str')
    input_stops_df['unique_stop_id'] = input_stops_df['stop_id'].str.cat(
        agency_suffix, sep='_')

    # keep only the stops whose id occurs in the (already subset) stop times
    is_active = input_stops_df['unique_stop_id'].isin(
        input_stop_times_df['unique_stop_id'])
    selected_stops_df = input_stops_df.loc[is_active]

    elapsed = time.time() - start_time
    log('{:,} of {:,} records selected from stops. Took {:,'
        '.2f} seconds'.format(len(selected_stops_df), len(input_stops_df),
                              elapsed))

    return selected_stops_df
Esempio n. 2
0
def _adjust_outliers(df_ref, col, treshhold):
    # proactively prevent mutation outside of function scope
    df = df_ref.copy()

    # first, handle the positive columns
    pos_thresh = abs(treshhold)
    neg_thresh = ((-1) * pos_thresh)

    mask_pos = df[col] > pos_thresh
    if len(df[mask_pos]):
        log(('{} rows in transit stops dataset exceeded positive threshold '
             'of {} for {} column.').format(len(df[mask_pos]), pos_thresh,
                                            col))

    df.loc[mask_pos, col] = pos_thresh - 0.0001

    # now handle the negative columns
    mask_neg = df[col] < neg_thresh
    if len(df[mask_neg]):
        log(('{} rows in transit stops dataset exceeded negative threshold '
             'of {} for {} column.').format(len(df[mask_neg]), neg_thresh,
                                            col))

    df.loc[mask_neg, col] = neg_thresh + 0.0001

    return df
Esempio n. 3
0
def _read_gtfs_calendar_dates(textfile_path, textfile):
    """
    Read gtfs calendar_dates.txt as a pandas dataframe

    Parameters
    ----------
    textfile_path : str
        director of text file
    textfile : str
        name of text file

    Returns
    -------
    df : pandas.DataFrame
    """
    if textfile != 'calendar_dates.txt':
        raise ValueError('{} is not a proper GTFS file name'.format(textfile))

    df = pd.read_csv(os.path.join(textfile_path, textfile),
                     dtype={'service_id': object}, low_memory=False)
    if len(df) == 0:
        warning_msg = ('{} has no records. This could indicate that this feed '
                       'is using calendar.txt for service_ids.')
        log(warning_msg.format(os.path.join(
            textfile_path, textfile)), level=lg.WARNING)

    # remove any extra whitespace in column names
    df.rename(columns=lambda x: x.strip(), inplace=True)
    return df
Esempio n. 4
0
def _format_transit_net_edge(stop_times_df):
    """
    Format transit network data table to match the format required for edges
    in Pandana graph networks edges

    Each trip is turned into a chain of edges between consecutive stops
    (ordered by ``stop_sequence``). ``stop_times_df`` is sorted in place by
    ``unique_trip_id`` and ``stop_sequence`` as a side effect.

    Parameters
    ----------
    stop_times_df : pandas.DataFrame
        interpolated stop times with travel time between stops for the subset
        time and day; must hold the columns ``unique_trip_id``,
        ``stop_sequence``, ``unique_stop_id``, ``timediff`` and
        ``unique_agency_id``

    Returns
    -------
    merged_edge_df : pandas.DataFrame
        one row per edge with columns ``node_id_from``, ``node_id_to``,
        ``weight``, ``unique_agency_id``, ``unique_trip_id``, ``sequence``
        and ``id`` (``unique_trip_id`` and ``sequence`` joined by '_')

    """
    start_time = time.time()

    log('Starting transformation process for {:,} '
        'total trips...'.format(len(stop_times_df['unique_trip_id'].unique())))

    # per-trip edge tables, concatenated once at the end
    merged_edge = []

    stop_times_df.sort_values(by=['unique_trip_id', 'stop_sequence'],
                              inplace=True)

    # group by the column name rather than a one-item list: with a list
    # grouper newer pandas yields 1-tuples as keys, which would put a tuple
    # (not the trip id string) into the 'unique_trip_id' column
    for trip, tmp_trip_df in stop_times_df.groupby('unique_trip_id'):
        edge_df = pd.DataFrame({
            # stop i -> stop i+1
            "node_id_from":
            tmp_trip_df['unique_stop_id'].iloc[:-1].values,
            "node_id_to":
            tmp_trip_df['unique_stop_id'].iloc[1:].values,
            # attributes are taken from the arrival (second) stop of the pair
            "weight":
            tmp_trip_df['timediff'].iloc[1:].values,
            "unique_agency_id":
            tmp_trip_df['unique_agency_id'].iloc[1:].values,
            # set unique trip id without edge order to join other data later
            "unique_trip_id":
            trip
        })

        # 1-based position of the edge within its trip
        edge_df['sequence'] = (edge_df.index + 1).astype(int)

        # append completed formatted edge table to master edge table
        merged_edge.append(edge_df)

    merged_edge_df = pd.concat(merged_edge, ignore_index=True)
    merged_edge_df['sequence'] = merged_edge_df['sequence'].astype(int)
    # edge id is the trip id with the edge order appended
    merged_edge_df['id'] = (merged_edge_df['unique_trip_id'].str.cat(
        merged_edge_df['sequence'].astype('str'), sep='_'))

    log('stop time table transformation to '
        'Pandana format edge table completed. '
        'Took {:,.2f} seconds'.format(time.time() - start_time))

    return merged_edge_df
Esempio n. 5
0
def _convert_imp_time_units(df, time_col='weight', convert_to='minutes'):
    """
    Convert the travel time impedance units

    Parameters
    ----------
    df : pandas.DataFrame
        edge DataFrame with weight column
    time_col : str
        name of column that holds the travel impedance
    convert_to : {'seconds', 'minutes'}
        unit to convert travel time to. should always be set to 'minutes'

    Returns
    -------
    df : pandas.DataFrame

    """
    valid_convert_to = ['seconds', 'minutes']
    if convert_to not in valid_convert_to or not isinstance(convert_to, str):
        raise ValueError(
            '{} not a valid value or not a string'.format(convert_to))

    if convert_to == 'seconds':
        df[time_col] = df[time_col].astype('float')
        df[time_col] = df[time_col] * 60
        log('Time conversion completed: minutes converted to seconds.')

    if convert_to == 'minutes':
        df[time_col] = df[time_col].astype('float')
        df[time_col] = df[time_col] / 60.0
        log('Time conversion completed: seconds converted to minutes.')

    return df
Esempio n. 6
0
    def remove_feed(self, del_key=None, remove_all=False):
        """
        Remove GTFS feeds from the existing urbanaccess_gtfsfeeds instance

        Parameters
        ----------
        del_key : str or list
            dict keys as a single string or list of
            strings to remove from existing
        remove_all : bool
            if true, remove all keys from existing
            urbanaccess_gtfsfeeds instance; only valid when ``del_key``
            is None

        Raises
        ------
        AssertionError
            if ``remove_all`` is not a bool, if ``del_key`` is neither a
            string nor a list, if ``del_key`` is combined with
            ``remove_all=True``, or if a key is not present in gtfs_feeds
        """

        assert isinstance(remove_all, bool)

        if del_key is None and remove_all:
            self.gtfs_feeds = {}
            log('Removed all feeds from gtfs_feeds')
            return

        assert isinstance(del_key, (list, str)), \
            'del_key must be a string or list of strings'
        assert not remove_all, \
            'remove_all must be False in order to ' \
            'remove individual records: {}'.format(del_key)

        # only a single string needs wrapping; wrapping a list again would
        # make the list itself the (unhashable) key to look up
        keys_to_remove = [del_key] if isinstance(del_key, str) else del_key

        for key in keys_to_remove:
            assert key in self.gtfs_feeds, \
                ('{} key to delete was not found in gtfs_feeds').format(key)
            del self.gtfs_feeds[key]
            log('Removed {} feed from gtfs_feeds'.format(key))
Esempio n. 7
0
def _time_difference(stop_times_df=None):
    """
    Add the per-trip difference between consecutive departure times

    A ``timediff`` column is written onto ``stop_times_df`` in place. It
    holds, within each ``unique_trip_id`` group, the difference between a
    row's ``departure_time_sec_interpolate`` and that of the previous row
    (the first row of each trip has no previous row and gets NaN).

    Parameters
    ----------
    stop_times_df : pandas.DataFrame
        interpolated stop times dataframe

    Returns
    -------
    stop_times_df : pandas.DataFrame
        the same object that was passed in, with ``timediff`` added

    """
    start_time = time.time()

    # difference between consecutive records, restarted for every trip
    departures_by_trip = stop_times_df.groupby('unique_trip_id')[
        'departure_time_sec_interpolate']
    stop_times_df['timediff'] = departures_by_trip.diff()

    elapsed = time.time() - start_time
    log(
        'Difference between stop times has been successfully calculated. '
        'Took {:,.2f} seconds'.format(elapsed))

    return stop_times_df
Esempio n. 8
0
def _time_selector(df=None, starttime=None, endtime=None):
    """
    Select stop times that fall within a specified time range

    Parameters
    ----------
    df : pandas.DataFrame
        interpolated stop times dataframe
    starttime : str
        24 hour clock formatted time 1
    endtime : str
        24 hour clock formatted time 2
    Returns
    -------
    selected_stop_timesdf : pandas.DataFrame

    """
    assert len(df) > 0

    # TODO: Deprecated, should not be referenced anymore
    start_time = time.time()

    selected_stop_timesdf = time_selector(df, starttime, endtime)

    log('Stop times from {} to {} successfully selected {:,} records out of '
        '{:,} total records ({:.2f} percent of total). Took {:,'
        '.2f} seconds'.format(starttime, endtime, len(selected_stop_timesdf),
                              len(df),
                              (len(selected_stop_timesdf) / len(df)) * 100,
                              time.time() - start_time))

    return selected_stop_timesdf
Esempio n. 9
0
    def add_feed(self, add_dict, replace=False):
        """
        Add a dictionary to the urbanaccess_gtfsfeeds instance.

        Parameters
        ----------
        add_dict : dict
            Dictionary to add to existing urbanaccess_gtfsfeeds with the name
            of the transit service or agency GTFS feed as the key and the
            GTFS feed URL as the value to pass to the GTFS downloader
            as:
            {unique name of GTFS feed or transit service/agency : URL of feed}
        replace : bool, optional
            If key of dict is already in the UrbanAccess replace
            the existing dict value with the value passed

        Returns
        -------
        gtfs_feeds : dict
            the instance's gtfs_feeds dictionary after the update

        Raises
        ------
        ValueError
            if ``add_dict`` is not a dict, ``replace`` is not a bool, a key
            or value is not a string, or (with ``replace=False``) a key or
            a feed URL already exists in gtfs_feeds
        """

        if not isinstance(add_dict, dict):
            raise ValueError('add_dict is not a dict')
        if not isinstance(replace, bool):
            raise ValueError('replace is not bool')

        for key, value in add_dict.items():
            if key in self.gtfs_feeds:
                if not replace:
                    raise ValueError(
                        '{} passed in add_dict already exists in gtfs_feeds. '
                        'Only unique keys are allowed to be added.'.format(
                            key))
                log('{} passed in add_dict will replace existing {} feed '
                    'in gtfs_feeds.'.format(key, key))
            if not isinstance(key, str):
                raise ValueError('{} must be a string'.format(key))
            # check the value itself; iterating it would only inspect the
            # individual characters of a URL string
            if not isinstance(value, str):
                raise ValueError('{} must be a string'.format(value))

        if not replace:
            existing_urls = list(self.gtfs_feeds.values())
            for value in add_dict.values():
                if value in existing_urls:
                    raise ValueError('duplicate values were found when the '
                                     'passed add_dict dictionary was added to '
                                     'the existing dictionary. Feed URL '
                                     'values must be unique.')

        # dict.update() returns None, so the updated dictionary is returned
        # explicitly below
        self.gtfs_feeds.update(add_dict)

        log('Added {} feeds to gtfs_feeds: {}'.format(len(add_dict), add_dict))

        return self.gtfs_feeds
Esempio n. 10
0
def _stops_in_edge_table_selector(input_stops_df=None,
                                  input_stop_times_df=None):
    """
    Select stops that are active during the day and time period specified

    A ``unique_stop_id`` column (``stop_id`` and ``unique_agency_id`` joined
    with an underscore) is written onto ``input_stops_df`` in place before
    the comparison is made.

    Parameters
    ----------
    input_stops_df : pandas.DataFrame
        stops dataframe with ``stop_id`` and ``unique_agency_id`` columns
    input_stop_times_df : pandas.DataFrame
        stop_times dataframe with a ``unique_stop_id`` column

    Returns
    -------
    selected_stops_df : pandas.DataFrame
        rows of ``input_stops_df`` whose ``unique_stop_id`` occurs in
        ``input_stop_times_df``

    """
    start_time = time.time()

    # add unique stop id; built column-wise instead of with a row-wise
    # apply so it does not rely on positional lookups into a label-indexed
    # row and also works on an empty table
    input_stops_df['unique_stop_id'] = (
        input_stops_df['stop_id'].astype(str).str.cat(
            input_stops_df['unique_agency_id'].astype(str), sep='_'))

    # Select stop ids that match stop ids in the subset stop time data that
    # match day and time selection
    selected_stops_df = input_stops_df.loc[
        input_stops_df['unique_stop_id'].isin(
            input_stop_times_df['unique_stop_id'])]

    log('{:,} of {:,} records selected from stops. Took {:,'
        '.2f} seconds'.format(len(selected_stops_df), len(input_stops_df),
                              time.time() - start_time))

    return selected_stops_df
Esempio n. 11
0
def _convert_imp_time_units(df=None, time_col='weight', convert_to='minutes'):
    """
    Convert the travel time impedance units

    Parameters
    ----------
    df : pandas.DataFrame
        edge dataframe with weight column
    time_col : str
        name of column that holds the travel impedance
    convert_to : {'seconds','minutes'}
        unit to convert travel time to. should always be set to 'minutes'

    Returns
    -------
    df : pandas.DataFrame

    """
    valid_convert_to = ['seconds', 'minutes']
    assert convert_to in valid_convert_to and isinstance(convert_to, str)

    if convert_to == 'seconds':
        df[time_col] = df[time_col].astype('float')
        df[time_col] = df[time_col] * 60
        log('Time conversion completed: minutes converted to seconds.')

    if convert_to == 'minutes':
        df[time_col] = df[time_col].astype('float')
        df[time_col] = df[time_col] / 60.0
        log('Time conversion completed: seconds converted to minutes.')

    return df
Esempio n. 12
0
def _add_txt_definitions(stops_df, routes_df, stop_times_df,
                         trips_df):
    """
    Append GTFS definitions to stops, routes, stop times, and trips dataframes

    Each table is passed through its dedicated ``_*_definitions`` helper, in
    the order stops, routes, stop times, trips.

    Parameters
    ----------
    stops_df : pandas.DataFrame
        stops dataframe
    routes_df : pandas.DataFrame
        routes dataframe
    stop_times_df : pandas.DataFrame
        stop times dataframe
    trips_df : pandas.DataFrame
        trip dataframe

    Returns
    -------
    stops_df, routes_df, stop_times_df, trips_df : pandas.DataFrame
        the results of the four helpers, as a tuple in that order
    """
    tables_with_definitions = (
        _stops_definitions(df=stops_df),
        _routes_definitions(df=routes_df),
        _stop_times_definitions(df=stop_times_df),
        _trips_definitions(df=trips_df),
    )

    log(
        'Added descriptive definitions to stops, routes, stop_times, '
        'and trips tables')

    return tables_with_definitions
Esempio n. 13
0
def _connector_edges(osm_nodes, transit_nodes, travel_speed_mph=3):
    """
    Build the edges that connect every transit node to its nearest osm node,
    weighted by travel time

    For every transit node two edges are produced, one in each direction,
    both carrying the same weight. A ``nearest_osm_node`` column is written
    onto ``transit_nodes`` in place.

    Parameters
    ----------
    osm_nodes : pandas.DataFrame
        osm nodes DataFrame with ``x`` and ``y`` columns, indexed by the
        node id that ``_nearest_neighbor`` reports
    transit_nodes : pandas.DataFrame
        transit nodes DataFrame with ``x`` and ``y`` columns
    travel_speed_mph : int, optional
        travel speed to use to calculate travel time across a
        distance on a edge. units are in miles per hour (MPH)
        for pedestrian travel this is assumed to be 3 MPH

    Returns
    -------
    net_connector_edges : pandas.DataFrame
        columns ``from``, ``to``, ``weight`` and ``net_type``; weight is
        the ``.miles`` distance divided by ``travel_speed_mph`` times 60

    """
    start_time = time.time()

    transit_nodes['nearest_osm_node'] = _nearest_neighbor(
        osm_nodes[['x', 'y']], transit_nodes[['x', 'y']])

    edge_records = []

    for transit_node_id, transit_row in transit_nodes.iterrows():
        # look up the osm (pedestrian) node paired with this transit node
        osm_node_id = int(transit_row['nearest_osm_node'])
        osm_row = osm_nodes.loc[osm_node_id]

        miles_apart = dist_calc((transit_row['y'], transit_row['x']),
                                (osm_row['y'], osm_row['x'])).miles
        # the travel time is the same in both directions
        travel_time = miles_apart / travel_speed_mph * 60

        # one edge per direction so the connection is bi-directional
        edge_records.extend([
            (transit_node_id, osm_node_id, travel_time, 'transit to osm'),
            (osm_node_id, transit_node_id, travel_time, 'osm to transit'),
        ])

    net_connector_edges = pd.DataFrame(
        edge_records, columns=["from", "to", "weight", "net_type"])

    elapsed = time.time() - start_time
    log(
        'Connector edges between the OSM and transit network nodes '
        'successfully completed. Took {:,.2f} seconds'.format(elapsed))

    return net_connector_edges
Esempio n. 14
0
def tripschedualselector(input_trips_df=None,
                         input_calendar_df=None,
                         day=None):
    """
    Select trips that run on a specific day

    A ``unique_service_id`` column (``service_id`` and ``unique_agency_id``
    joined with an underscore) is written onto both input tables in place;
    it is not part of the returned table.

    Parameters
    ----------
    input_trips_df : pandas.DataFrame
        trips dataframe
    input_calendar_df : pandas.DataFrame
        calendar dataframe
    day : {'friday','monday','saturday','sunday','thursday','tuesday','wednesday'}
        day of the week to extract transit schedule from that corresponds to
        the day in the GTFS calendar

    Returns
    -------
    calendar_selected_trips_df : pandas.DataFrame
        trips whose service runs on ``day``, sorted by route_id, trip_id and
        (when present) direction_id, with a fresh 0-based index

    Raises
    ------
    AssertionError
        if ``day`` is not one of the accepted lowercase day names

    """
    start_time = time.time()

    valid_days = [
        'friday', 'monday', 'saturday', 'sunday', 'thursday', 'tuesday',
        'wednesday'
    ]
    assert day in valid_days and isinstance(day, str), (
        'Incorrect day specified. Must be lowercase string: '
        'friday, monday, saturday, sunday, thursday, tuesday, wednesday.')

    # create unique service ids; built column-wise so no positional lookup
    # into a label-indexed row is needed
    for table in (input_trips_df, input_calendar_df):
        table['unique_service_id'] = table['service_id'].astype(str).str.cat(
            table['unique_agency_id'].astype(str), sep='_')

    # service ids where the requested day is flagged with 1 = service runs
    # on that day
    active_service_ids = input_calendar_df.loc[
        input_calendar_df[day] == 1, 'unique_service_id']

    sort_columns = ['route_id', 'trip_id', 'direction_id']
    if 'direction_id' not in input_trips_df.columns:
        sort_columns.remove('direction_id')

    # chained, non in-place operations return new objects, so the selection
    # is never modified through a slice of input_trips_df
    runs_on_day = input_trips_df['unique_service_id'].isin(active_service_ids)
    calendar_selected_trips_df = (
        input_trips_df.loc[runs_on_day]
        .sort_values(by=sort_columns)
        .reset_index(drop=True)
        .drop('unique_service_id', axis=1))

    log('{:,} of {:,} total trips were extracted representing calendar day: '
        '{}. Took {:,.2f} seconds'
        .format(len(calendar_selected_trips_df), len(input_trips_df), day,
                time.time() - start_time))

    return calendar_selected_trips_df
Esempio n. 15
0
def _calc_headways_by_route_stop(df):
    """
    Calculate headways by route stop

    A ``unique_stop_route`` column (``unique_stop_id`` and
    ``unique_route_id`` joined with a comma) is written onto ``df`` in place
    and used as the grouping key.

    Parameters
    ----------
    df : pandas.DataFrame
        interpolated stop times dataframe for stop times within the time
        range with appended trip and route information

    Returns
    -------
    dataframe : pandas.DataFrame
        dataframe of statistics of route stop headways in units of minutes;
        one row per ``unique_stop_route`` with the ``Series.describe()``
        statistics as columns
    """

    # TODO: Optimize for speed

    start_time = time.time()

    delimiter = ','
    # built column-wise so no positional lookup into a label-indexed row
    # is needed
    df['unique_stop_route'] = df['unique_stop_id'].astype(str).str.cat(
        df['unique_route_id'].astype(str), sep=delimiter)
    stop_route_groups = df.groupby('unique_stop_route')
    log('Starting route stop headway calculation for {:,} route stops...'
        .format(len(stop_route_groups)))

    results = {}
    for unique_stop_route, stop_route_group in stop_route_groups:
        # DataFrame.sort() no longer exists in pandas; only the ordered
        # departure times are needed, so sort that column on its own
        departures = stop_route_group[
            'departure_time_sec_interpolate'].sort_values().values
        # gap between consecutive departures, seconds -> minutes
        stop_route_group_headways = (departures[1:] - departures[:-1]) / 60.0
        results[unique_stop_route] = pd.Series(
            stop_route_group_headways).describe()

    log('Route stop headway calculation complete. Took {:,.2f} seconds'
        .format(time.time() - start_time))

    return pd.DataFrame(results).T
Esempio n. 16
0
    def from_yaml(cls,
                  gtfsfeeddir=os.path.join(config.settings.data_folder,
                                           'gtfsfeeds'),
                  yamlname='gtfsfeeds.yaml'):
        """
        Create a urbanaccess_gtfsfeeds instance from a saved YAML.

        Parameters
        ----------
        gtfsfeeddir : str, optional
            Directory to load a YAML file.
        yamlname : str or file like, optional
            File name from which to load a YAML file.
        Returns
        -------
        urbanaccess_gtfsfeeds

        Raises
        ------
        ValueError
            if an argument has the wrong type, the directory does not exist,
            the YAML content is not a dict with a ``gtfs_feeds`` mapping of
            string names to string URLs, or a URL occurs more than once
        """

        if not isinstance(gtfsfeeddir, str):
            raise ValueError('gtfsfeeddir must be a string')
        if not os.path.exists(gtfsfeeddir):
            raise ValueError(
                '{} does not exist or was not found'.format(gtfsfeeddir))
        if not isinstance(yamlname, str):
            raise ValueError('yaml must be a string')

        yaml_file = os.path.join(gtfsfeeddir, yamlname)

        # safe_load only builds plain Python objects; a bare yaml.load(f)
        # can construct arbitrary objects and is rejected by newer PyYAML
        # releases that require an explicit Loader
        with open(yaml_file, 'r') as f:
            yaml_config = yaml.safe_load(f)

        if not isinstance(yaml_config, dict):
            raise ValueError('{} yamlname is not a dict'.format(yamlname))

        validkey = 'gtfs_feeds'
        if validkey not in yaml_config:
            raise ValueError('key gtfs_feeds was not found in YAML file')

        feeds = yaml_config[validkey]
        if not isinstance(feeds, dict):
            raise ValueError('gtfs_feeds in YAML file is not a dict')

        for key, value in feeds.items():
            if not isinstance(key, str):
                raise ValueError('{} must be a string'.format(key))
            # check the value itself; iterating it would only inspect the
            # individual characters of a URL string
            if not isinstance(value, str):
                raise ValueError('{} must be a string'.format(value))

        # every feed URL may occur only once (also holds for zero feeds)
        if len(set(feeds.values())) != len(feeds):
            raise ValueError(
                'duplicate values were found when the passed add_dict '
                'dictionary was added to the existing dictionary. Feed URL '
                'values must be unique.')

        gtfsfeeds = cls(gtfs_feeds=feeds)
        log('{} YAML successfully loaded with {} feeds.'.format(
            yaml_file, len(feeds)))

        return gtfsfeeds
Esempio n. 17
0
def headways(gtfsfeeds_df, headway_timerange):
    """
    Calculate headways by route stop for a specific time range

    Parameters
    ----------
    gtfsfeeds_df : object
        gtfsfeeds_dfs object with all processed GTFS data tables; its
        ``stop_times_int``, ``trips`` and ``routes`` tables must not be empty
    headway_timerange : list
        time range for which to calculate headways between as a list of
        time 1 and time 2
        where times are 24 hour clock strings such as:
        ['07:00:00','10:00:00']

    Returns
    -------
    gtfsfeeds_dfs.headways : pandas.DataFrame
        the ``gtfsfeeds_df`` object that was passed in, with its ``headways``
        attribute set to the result of ``_headway_handler``

    Raises
    ------
    AssertionError
        if ``headway_timerange`` is not a list of two 8 character strings
        with the first smaller than the second, or ``gtfsfeeds_df`` is None
    ValueError
        if one of the required tables is empty
    """

    # TODO: Change Assertion to errors/exceptions
    time_error_statement = (
        '{} starttime and endtime are not in the correct format. '
        'Format should be 24 hour clock in following format: 08:00:00 or 17:00:00'
        .format(headway_timerange))

    is_pair = isinstance(headway_timerange, list) and len(
        headway_timerange) == 2
    assert is_pair, time_error_statement

    range_start, range_end = headway_timerange
    assert range_start < range_end, time_error_statement

    for t in headway_timerange:
        assert isinstance(t, str), time_error_statement
        assert len(t) == 8, time_error_statement

    # whole-hour span of the range, taken from the leading HH of each time
    end_hour = int(str(range_end[0:2]))
    start_hour = int(str(range_start[0:2]))
    if end_hour - start_hour > 3:
        log('WARNING: Time range passed: {} is a {} hour period. Long periods over 3 hours may take a '
            'significant amount of time to process.'.format(
                headway_timerange, end_hour - start_hour),
            level=lg.WARNING)

    assert gtfsfeeds_df is not None
    required_tables = (gtfsfeeds_df.stop_times_int, gtfsfeeds_df.trips,
                       gtfsfeeds_df.routes)
    if any(table.empty for table in required_tables):
        raise ValueError(
            'one of the gtfsfeeds_dfs objects: stop_times_int, trips, or routes were found to be empty.'
        )

    gtfsfeeds_df.headways = _headway_handler(
        interpolated_stop_times_df=gtfsfeeds_df.stop_times_int,
        trips_df=gtfsfeeds_df.trips,
        routes_df=gtfsfeeds_df.routes,
        headway_timerange=headway_timerange)

    return gtfsfeeds_df
Esempio n. 18
0
def _format_pandana_edges_nodes(edge_df, node_df):
    """
    Give nodes and edges the integer ids that Pandana requires.

    Nodes are numbered 1..n in their current row order (``id_int``, which
    becomes the node index). The edge table gets ``from_int`` and ``to_int``
    columns holding the integer ids of the nodes named in its ``from`` and
    ``to`` columns. Both inputs are modified in place along the way:
    ``edge_df`` has its ``id`` column renamed to ``edge_id`` and ``node_df``
    is re-indexed and returned.

    Parameters
    ----------
    edge_df : pandas.DataFrame
        integrated transit and osm edge DataFrame
    node_df : pandas.DataFrame
        integrated transit and osm node DataFrame

    Returns
    -------
    edge_df_wnumericid, node_df : pandas.DataFrame

    """
    start_time = time.time()

    # pandana requires ids that are integer: for nodes - make it the index,
    # for edges make it the from and to columns
    node_df['id_int'] = range(1, len(node_df) + 1)

    edge_df.rename(columns={'id': 'edge_id'}, inplace=True)

    # look up the integer id of each edge endpoint, 'from' first, then 'to'
    edge_df_wnumericid = edge_df
    for endpoint in ('from', 'to'):
        edge_df_wnumericid = pd.merge(
            edge_df_wnumericid, node_df[['id', 'id_int']],
            left_on=endpoint, right_on='id', sort=False, copy=False,
            how='left')
        edge_df_wnumericid[endpoint + '_int'] = edge_df_wnumericid['id_int']
        # the merge helper columns are no longer needed
        edge_df_wnumericid.drop(['id_int', 'id'], axis=1, inplace=True)

    # turn mixed dtype cols into all same format
    object_cols = edge_df_wnumericid.select_dtypes(include=['object']).columns
    for col in object_cols:
        try:
            edge_df_wnumericid[col] = edge_df_wnumericid[col].astype(str)
        # deal with edge cases where typically the name of a street is not
        # in a uniform string encoding such as names with accents
        except UnicodeEncodeError:
            log('Fixed unicode error in {} column'.format(col))
            edge_df_wnumericid[col] = edge_df_wnumericid[col].str.encode(
                'utf-8')

    node_df.set_index('id_int', drop=True, inplace=True)
    # turn mixed dtype col into all same format
    node_df['id'] = node_df['id'].astype(str)
    if 'nearest_osm_node' in node_df.columns:
        node_df.drop(['nearest_osm_node'], axis=1, inplace=True)

    elapsed = time.time() - start_time
    log('Edge and node tables formatted for Pandana with integer node ids: '
        'id_int, to_int, and from_int. Took {:,.2f} seconds'.format(elapsed))
    return edge_df_wnumericid, node_df
Esempio n. 19
0
    def from_yaml(cls, gtfsfeeddir=os.path.join(config.settings.data_folder,
                                                'gtfsfeeds'),
                  yamlname='gtfsfeeds.yaml'):
        """
        Create a urbanaccess_gtfsfeeds instance from a saved YAML.

        Parameters
        ----------
        gtfsfeeddir : str, optional
            Directory to load a YAML file.
        yamlname : str or file like, optional
            File name from which to load a YAML file.
        Returns
        -------
        urbanaccess_gtfsfeeds

        Raises
        ------
        AssertionError
            if an argument has the wrong type, the directory does not exist,
            the file name lacks '.yaml', the YAML content is not a dict with
            a ``gtfs_feeds`` mapping of string names to string URLs, or a
            URL occurs more than once
        """

        assert isinstance(gtfsfeeddir, str), 'gtfsfeeddir must be a string'
        assert os.path.exists(gtfsfeeddir), \
            ('{} does not exist or was not found').format(gtfsfeeddir)
        assert isinstance(yamlname, str) and '.yaml' in yamlname, \
            'yaml must be a string and have file extension .yaml'

        yaml_file = os.path.join(gtfsfeeddir, yamlname)

        # safe_load only builds plain Python objects; a bare yaml.load(f)
        # can construct arbitrary objects and is rejected by newer PyYAML
        # releases that require an explicit Loader
        with open(yaml_file, 'r') as f:
            yaml_config = yaml.safe_load(f)

        # the placeholder makes the message name the offending file
        assert isinstance(yaml_config, dict), \
            '{} yamlname is not a dict'.format(yamlname)

        validkey = 'gtfs_feeds'
        assert validkey in yaml_config, \
            'key gtfs_feeds was not found in YAML file'

        feeds = yaml_config[validkey]
        for key, value in feeds.items():
            assert isinstance(key, str), ('{} must be a string').format(key)
            # check the value itself; iterating it would only inspect the
            # individual characters of a URL string
            assert isinstance(value, str), \
                ('{} must be a string').format(value)

        # make sure every feed URL coming in from the yaml file occurs once
        all_feeds_equal_one = len(set(feeds.values())) == len(feeds)
        assert all_feeds_equal_one, ('Duplicate values were found '
                                     'when the passed add_dict '
                                     'dictionary was added to '
                                     'the existing dictionary. '
                                     'Feed URL values '
                                     'must be unique.')

        gtfsfeeds = cls(gtfs_feeds=feeds)
        yaml_len = len(feeds)
        log('{} YAML successfully loaded with {} feeds.'.format(yaml_file,
                                                                yaml_len))

        return gtfsfeeds
Esempio n. 20
0
def create_osm_net(osm_edges,
                   osm_nodes,
                   travel_speed_mph=3,
                   network_type='walk'):
    """
    Build the OSM part of the urbanaccess network, weighting every edge by
    its travel time in minutes

    Both input dataframes are modified in place (new 'weight' and
    'net_type' columns) and are then attached to the module level
    ``ua_network`` object.

    Parameters
    ----------
    osm_edges : pandas.DataFrame
        osm edge dataframe; must have a 'distance' column (the 1609.34
        divisor implies meters)
    osm_nodes : pandas.DataFrame
        osm node dataframe
    travel_speed_mph : int, optional
        travel speed, in miles per hour (MPH), used to turn the distance
        of an edge into a travel time. Must be greater than zero.
        Pedestrian travel is assumed to be 3 MPH
    network_type : str, optional
        label written to the 'net_type' column of the nodes and edges.
        Only 'walk' (the default) is accepted

    Returns
    -------
    ua_network : object
        urbanaccess_network object with osm_edges and osm_nodes dataframes
    ua_network.osm_edges : pandas.DataFrame
    ua_network.osm_nodes : pandas.DataFrame

    """
    started = time.time()

    assert network_type == 'walk'
    # the speed is used as a divisor below, so it has to be positive
    assert travel_speed_mph > 0

    # distance -> miles -> hours -> minutes
    osm_edges['weight'] = (
        osm_edges['distance'] / 1609.34 / travel_speed_mph * 60)

    # label edges first, then nodes, with the network type
    for frame in (osm_edges, osm_nodes):
        frame['net_type'] = network_type

    ua_network.osm_nodes = osm_nodes
    ua_network.osm_edges = osm_edges

    elapsed = time.time() - started
    log('Created OSM network with travel time impedance using a travel '
        'speed of {} MPH. Took {:,.2f} seconds'.format(travel_speed_mph,
                                                       elapsed))

    return ua_network
Esempio n. 21
0
def _add_unique_gtfsfeed_id(stops_df, routes_df, trips_df,
                            stop_times_df, calendar_df, calendar_dates_df,
                            feed_folder, feed_number):
    """
    Tag every GTFS dataframe of a feed with a 'unique_feed_id' column so
    that records can be traced back to the feed they came from

    The id is the standardized feed folder name and the feed number joined
    by an underscore. The dataframes are modified in place. An empty
    calendar_dates_df is left untouched.

    Parameters
    ----------
    stops_df : pandas:DataFrame
        stops dataframe
    routes_df : pandas:DataFrame
        routes dataframe
    trips_df : pandas:DataFrame
        trips dataframe
    stop_times_df : pandas:DataFrame
        stop times dataframe
    calendar_df : pandas:DataFrame
        calendar dataframe
    calendar_dates_df : pandas:DataFrame
        calendar dates dataframe
    feed_folder : str
        name of gtfs feed folder
    feed_number : int
        current number iteration of gtfs feed being read in root directory

    Returns
    -------
    list of pandas.DataFrame
        [stops_df, routes_df, trips_df, stop_times_df, calendar_df,
        calendar_dates_df], in that order
    """
    started = time.time()

    tables = [stops_df, routes_df, trips_df, stop_times_df, calendar_df]
    # calendar dates only take part in the tagging when they hold records
    if not calendar_dates_df.empty:
        tables.append(calendar_dates_df)

    # standardized folder name + feed number make up the id
    unique_feed_id = '_'.join(
        [_generate_unique_feed_id(feed_folder), str(feed_number)])

    for table in tables:
        table['unique_feed_id'] = unique_feed_id

    # an empty calendar dates table was skipped above; hand it back as is
    # so that the result always has six entries
    if calendar_dates_df.empty:
        tables.append(calendar_dates_df)

    elapsed = time.time() - started
    log('Unique GTFS feed id operation complete. Took {:,.2f} seconds'.format(
        elapsed))
    return tables
Esempio n. 22
0
def _txt_header_whitespace_check(gtfsfiles_to_use,
                                 csv_rootpath=os.path.join(
                                     config.settings.data_folder,
                                     'gtfsfeed_text')):
    """
    Standardize all text files inside a GTFS feed to remove whitespace
    in headers

    Every immediate subfolder of csv_rootpath is scanned; when there are no
    subfolders, csv_rootpath itself is scanned. Each matching file is
    rewritten in place with all whitespace removed from its first line.
    Empty files are skipped since they have no header.

    Parameters
    ----------
    gtfsfiles_to_use : list
        list of gtfs feed txt files to utilize
    csv_rootpath : str, optional
        root path where all gtfs feeds that make up a contiguous metropolitan
        area are stored

    Returns
    -------
    None
    """
    start_time = time.time()

    # keep full paths so that the fallback to the root folder also works
    # when csv_rootpath is a relative path
    folder_paths = [os.path.join(csv_rootpath, foldername)
                    for foldername in os.listdir(csv_rootpath)
                    if os.path.isdir(os.path.join(csv_rootpath, foldername))]
    if not folder_paths:
        folder_paths = [csv_rootpath]

    for folder_path in folder_paths:
        for textfile in os.listdir(folder_path):
            if not textfile.endswith('.txt'):
                continue
            if textfile not in gtfsfiles_to_use:
                continue

            file_path = os.path.join(folder_path, textfile)

            # Read from file
            with open(file_path) as f:
                lines = f.readlines()

            # an empty file has no header line to clean
            if not lines:
                continue

            lines[0] = re.sub(r'\s+', '', lines[0]) + '\n'

            # Write to file; a failure is logged and the remaining files
            # are still processed
            try:
                with open(file_path, 'w') as f:
                    f.writelines(lines)
            except Exception:
                log('Unable to write {}. Check that file is not currently '
                    'being read or is not already in memory as this is '
                    'likely the cause of the error.'
                    ''.format(file_path))

    log(
        'GTFS text file header whitespace check completed. Took {:,'
        '.2f} seconds'.format(
            time.time() - start_time))
Esempio n. 23
0
def _append_route_type(stops_df, stop_times_df, routes_df,
                       trips_df, info_to_append):
    """
    Append GTFS route type definitions to stops and stop times dataframes

    Parameters
    ----------
    stops_df : pandas:DataFrame
        stops dataframe
    stop_times_df : pandas:DataFrame
        stop times dataframe
    routes_df : pandas:DataFrame
        routes dataframe
    trips_df : pandas:DataFrame
        trip dataframe
    info_to_append : {'route_type_to_stops', 'route_type_to_stop_times'}
        the type of information to append

    Returns
    -------
    stops_df or stop_times_df : pandas.DataFrame
    """
    valid_info_to_append = ['route_type_to_stops', 'route_type_to_stop_times']
    if info_to_append not in valid_info_to_append:
        raise ValueError('{} is not a valid parameter'.format(info_to_append))

    if info_to_append == 'route_type_to_stops':
        tmp1 = pd.merge(trips_df, routes_df, how='left', on='route_id',
                        sort=False, copy=False)
        merged_df = pd.merge(stop_times_df, tmp1, how='left', on='trip_id',
                             sort=False, copy=False)
        merged_df.drop_duplicates(subset='stop_id', keep='first', inplace=True)

        stops_df = pd.merge(stops_df, merged_df[['route_type', 'stop_id']],
                            how='left', on='stop_id', sort=False, copy=False)

        log('Appended route type to stops')

        return stops_df

    if info_to_append == 'route_type_to_stop_times':
        merged_df = pd.merge(trips_df, routes_df, how='left', on='route_id',
                             sort=False, copy=False)
        merged_df.drop_duplicates(subset='trip_id', keep='first', inplace=True)

        stop_times_df = pd.merge(stop_times_df,
                                 merged_df[['route_type', 'trip_id']],
                                 how='left', on='trip_id', sort=False,
                                 copy=False)

        log('Appended route type to stop_times')

        return stop_times_df
def _txt_encoder_check(gtfsfiles_to_use,
                       csv_rootpath=os.path.join(config.settings.data_folder,
                                                 'gtfsfeed_text')):
    """
    Standardize all text files inside a GTFS feed for encoding problems
    by removing a leading UTF-8 byte order mark (BOM).

    Files are read and written as bytes so the check behaves the same on
    Python 2 and Python 3. Only files that start with the BOM are
    rewritten; everything after the BOM is kept byte for byte.

    Every immediate subfolder of csv_rootpath is scanned; when there are no
    subfolders, csv_rootpath itself is scanned.

    Parameters
    ----------
    gtfsfiles_to_use : list
        list of gtfs feed txt files to utilize
    csv_rootpath : str, optional
        root path where all gtfs feeds that make up a contiguous metropolitan
        area are stored

    Returns
    -------
    None
    """
    start_time = time.time()

    # keep full paths so that the fallback to the root folder also works
    # when csv_rootpath is a relative path
    folder_paths = [
        os.path.join(csv_rootpath, foldername)
        for foldername in os.listdir(csv_rootpath)
        if os.path.isdir(os.path.join(csv_rootpath, foldername))
    ]
    if not folder_paths:
        folder_paths = [csv_rootpath]

    bom = codecs.BOM_UTF8

    for folder_path in folder_paths:
        for textfile in os.listdir(folder_path):
            if not textfile.endswith('.txt'):
                continue
            if textfile not in gtfsfiles_to_use:
                continue

            file_path = os.path.join(folder_path, textfile)

            # Read from file as bytes: codecs.BOM_UTF8 is a byte string and
            # can only be compared against bytes
            with open(file_path, 'rb') as f:
                raw = f.read()

            if raw.startswith(bom):
                # Write to file without the leading BOM
                with open(file_path, 'wb') as f:
                    f.write(raw[len(bom):])

    log('GTFS text file encoding check completed. Took {:,.2f} seconds'.format(
        time.time() - start_time))
Esempio n. 25
0
def create_osm_net(osm_edges, osm_nodes,
                   travel_speed_mph=3, network_type='walk'):
    """
    Create a travel time weight network graph in units of minutes from
    openstreetmap nodes and edges

    Both input dataframes are modified in place (new 'weight' and
    'net_type' columns) and are then attached to the module level
    ``ua_network`` object.

    Parameters
    ----------
    osm_edges : pandas.DataFrame
        osm edge dataframe; must have a 'distance' column (the 1609.34
        divisor implies meters)
    osm_nodes : pandas.DataFrame
        osm node dataframe
    travel_speed_mph : int, optional
        travel speed to use to calculate travel time across a
        distance on a edge. units are in miles per hour (MPH)
        for pedestrian travel this is assumed to be 3 MPH.
        Must be greater than zero
    network_type : str, optional
        default is 'walk' for the osm pedestrian network.
        this string is used to label the osm network once it is
        integrated with the transit network

    Returns
    -------
    ua_network : object
        urbanaccess_network object with osm_edges and osm_nodes dataframes
    ua_network.osm_edges : pandas.DataFrame
    ua_network.osm_nodes : pandas.DataFrame

    Raises
    ------
    ValueError
        if network_type is not a string or travel_speed_mph is not
        greater than zero

    """
    start_time = time.time()

    # None is not a str, so this single check also rejects None
    if not isinstance(network_type, str):
        raise ValueError('{!s} network_type passed is either not a '
                         'string or is None'.format(network_type))

    # the speed is a divisor below: zero would give infinite weights and a
    # negative speed negative weights, so reject both up front
    if not travel_speed_mph > 0:
        raise ValueError('travel_speed_mph must be greater than 0, got '
                         '{!s}'.format(travel_speed_mph))

    # assign impedance to OSM edges, measured in minutes
    dist_in_miles = osm_edges['distance'] / 1609.34
    dist_in_hours = dist_in_miles / travel_speed_mph
    osm_edges['weight'] = dist_in_hours * 60

    # assign node and edge net type
    osm_edges['net_type'] = network_type
    osm_nodes['net_type'] = network_type

    ua_network.osm_nodes = osm_nodes
    ua_network.osm_edges = osm_edges

    log(
        'Created OSM network with travel time impedance using a travel speed '
        'of {} MPH. Took {:,.2f} seconds'.format(
            travel_speed_mph, time.time() - start_time))

    return ua_network
Esempio n. 26
0
def _time_selector(df, starttime, endtime):
    """
    Select stop times that fall within a specified time range

    Both bounds are exclusive: a record is kept only when its
    'departure_time_sec_interpolate' value is strictly greater than
    starttime and strictly less than endtime.

    Parameters
    ----------
    df : pandas.DataFrame
        interpolated stop times DataFrame; must have a
        'departure_time_sec_interpolate' column
    starttime : str
        24 hour clock formatted time 1, laid out as 'HH:MM:SS'
    endtime : str
        24 hour clock formatted time 2, laid out as 'HH:MM:SS'
    Returns
    -------
    selected_stop_timesdf : pandas.DataFrame

    """
    start_time = time.time()

    def _seconds_past_midnight(clock_time):
        # fixed positions: hours at 0-1, minutes at 3-4, seconds at 6-7
        hours = int(str(clock_time[0:2]))
        minutes = int(str(clock_time[3:5]))
        seconds = int(str(clock_time[6:8]))
        return (hours * 60 * 60) + (minutes * 60) + seconds

    # the range is compared in seconds past midnight so that times that
    # may be after midnight can be selected
    starttime_sec = _seconds_past_midnight(starttime)
    endtime_sec = _seconds_past_midnight(endtime)

    # create df of stops times that are within the requested range
    departures = df["departure_time_sec_interpolate"]
    selected_stop_timesdf = df[
        (starttime_sec < departures) & (departures < endtime_sec)]

    selected_ct = len(selected_stop_timesdf)
    total_ct = len(df)
    # guard against an empty input table and force true division so the
    # share is not truncated to zero by integer division
    if total_ct:
        pct_selected = (float(selected_ct) / total_ct) * 100
    else:
        pct_selected = 0.0

    log(
        'Stop times from {} to {} successfully selected {:,} records out of '
        '{:,} total records ({:.2f} percent of total). Took {:,'
        '.2f} seconds'.format(
            starttime, endtime, selected_ct, total_ct,
            pct_selected,
            time.time() - start_time))

    return selected_stop_timesdf
Esempio n. 27
0
def _checkcoordinates(df=None, feed_folder=None):
    """
    Check and print the hemisphere that stop coordinates are in

    Parameters
    ----------
    df : pandas.DataFrame
        stops dataframe
    feed_folder : str
        name of originating gtfs feed folder

    Returns
    -------
    None

    """
    if (df['stop_lat'] > 0).values.any() & (df['stop_lon'] < 0).values.any():
        log('{} GTFS feed stops: coordinates are in northwest hemisphere. '
            'Latitude = North (90); Longitude = West (-90).'.format(
                os.path.split(feed_folder)[1]))

    if (df['stop_lat'] < 0).values.any() & (df['stop_lon'] < 0).values.any():
        log('{} GTFS feed stops: coordinates are in southwest hemisphere. '
            'Latitude = South (-90); Longitude = West (-90).'.format(
                os.path.split(feed_folder)[1]))

    if (df['stop_lat'] > 0).values.any() & (df['stop_lon'] > 0).values.any():
        log('{} GTFS feed stops: coordinates are in northeast hemisphere. '
            'Latitude = North (90); Longitude = East (90).'.format(
                os.path.split(feed_folder)[1]))

    if (df['stop_lat'] < 0).values.any() & (df['stop_lon'] > 0).values.any():
        log('{} GTFS feed stops: coordinates are in southeast hemisphere. '
            'Latitude = South (-90); Longitude = East (90).'.format(
                os.path.split(feed_folder)[1]))
Esempio n. 28
0
def _nearest_neighbor(df1, df2, use_4326_constraints=True):
    """
    For each row of df2 find the index label of the nearest row of df1

    Neither input is modified; all cleaning happens on copies.

    Parameters
    ----------
    df1 : pandas.DataFrame
        osm nodes with 'x' and 'y' columns. Rows whose x or y is null or
        not finite are dropped before the search. All columns are fed to
        the KDTree, so this presumably holds only x and y - verify against
        the caller
    df2 : pandas.DataFrame
        transit nodes with 'x' and 'y' columns. Rows with null x or y are
        only reported, not removed
    use_4326_constraints : bool, optional
        when True, x is treated as longitude and y as latitude and values
        beyond +/-180 and +/-90 respectively are adjusted by
        _adjust_outliers

    Returns
    -------
    numpy.ndarray
        index labels of the nearest osm node for every row of df2, in the
        shape returned by KDTree.query with k=1
    """
    df1_new = df1.copy()  # osm nodes
    df2_new = df2.copy()  # transit nodes

    # be aggressive about ensuring float limits (no vals over than 4 decimals)
    for col in ['x', 'y']:
        df1_new[col] = np.around(df1_new[col], decimals=4)
        df2_new[col] = np.around(df2_new[col], decimals=4)

    # drop out any invalid x, y columns from the left
    invalid_osm_rows = (df1_new['x'].isnull() | df1_new['y'].isnull() |
                        (~np.isfinite(df1_new['x'])) |
                        (~np.isfinite(df1_new['y'])))
    orig_df1_len = len(df1_new)
    df1_new = df1_new[~invalid_osm_rows]
    cleaned_df1_len = len(df1_new)

    # log if any rows were removed from osm nodes dataset
    df1_cleaned_diff = orig_df1_len - cleaned_df1_len
    if df1_cleaned_diff > 0:
        log(('{} OSM node rows ommitted during nearest neighbor calculations '
             'due to being invalid numeric values.').format(df1_cleaned_diff))

    # let's make sure that latitudes and longitudes are not in excess of
    # their geographic limits
    if use_4326_constraints:
        # x is longitude -180 - +180
        # y is latitude -90 - +90
        df1_new = _adjust_outliers(df1_new, 'x', 180)
        df2_new = _adjust_outliers(df2_new, 'x', 180)
        df1_new = _adjust_outliers(df1_new, 'y', 90)
        df2_new = _adjust_outliers(df2_new, 'y', 90)

    # identify problem rows on the right
    invalid_trans_rows = (df2_new['x'].isnull() | df2_new['y'].isnull())
    invalid_trans_rows_ct = len(df2_new[invalid_trans_rows])
    if invalid_trans_rows_ct > 0:
        log(('{} out of {} invalid rows identified for the transit nodes '
             'dataframe, but not removed. These may cause operation to '
             'fail.').format(invalid_trans_rows_ct, len(df2_new)))

    # for xy coordinates df find the nearest in a subsequent dataframe
    # (.values and the builtin float work across pandas and numpy versions)
    kdt = KDTree(df1_new.values.astype(float))
    df2_mtx = df2_new.values.astype(float)

    indexes = kdt.query(df2_mtx, k=1, return_distance=False)

    # the positions returned by the tree refer to the cleaned osm nodes, so
    # the labels have to come from that same dataframe; using the original
    # index would shift the result whenever invalid rows were dropped
    # NOTE(review): relies on _adjust_outliers keeping the row index intact
    return df1_new.index.values[indexes]
Esempio n. 29
0
def _add_headway_impedance(ped_to_transit_edges_df,
                           headways_df,
                           headway_statistic='mean'):
    """
    Add route stop level headways to the osm to transit connector
    travel time weight column

    Half of the selected headway statistic is added to the existing weight
    of each connector. Connectors without a matching headway keep their
    original weight.

    Parameters
    ----------
    ped_to_transit_edges_df : pandas.DataFrame
        DataFrame of the osm to transit connectors; must have 'to' and
        'weight' columns
    headways_df : pandas.DataFrame
        headways DataFrame; must have a 'node_id_route' column and a column
        named after headway_statistic
    headway_statistic : {'mean', 'std', 'min', 'max'}, optional
        required if headways is true; route stop headway statistic to apply
        to the osm to transit connector edges:
        mean, std, min, max. Default is mean.

    Returns
    -------
    osm_to_transit_wheadway : pandas.DataFrame
        the connectors left joined with the headway columns; the updated
        'weight' column is placed last

    """

    start_time = time.time()

    log('{} route stop headway will be used for pedestrian to transit edge '
        'impedance.'.format(headway_statistic))

    osm_to_transit_wheadway = pd.merge(
        ped_to_transit_edges_df,
        headways_df[[headway_statistic, 'node_id_route']],
        how='left',
        left_on=['to'],
        right_on=['node_id_route'],
        sort=False,
        copy=False)

    # half of the headway is added to the connector weight
    weight_with_headway = osm_to_transit_wheadway['weight'] + (
        osm_to_transit_wheadway[headway_statistic] / 2.0)

    # connectors without a headway fall back to their original weight. The
    # result is assigned back explicitly: an in-place fillna on a selected
    # column is not guaranteed to update the parent dataframe
    osm_to_transit_wheadway['weight_tmp'] = weight_with_headway.fillna(
        osm_to_transit_wheadway['weight'])

    # swap the temporary column in for the old weight
    osm_to_transit_wheadway = osm_to_transit_wheadway.drop('weight', axis=1)
    osm_to_transit_wheadway = osm_to_transit_wheadway.rename(
        columns={'weight_tmp': 'weight'})

    log('Headway impedance calculation completed. Took {:,.2f} seconds'.format(
        time.time() - start_time))

    return osm_to_transit_wheadway
Esempio n. 30
0
def _calc_headways_by_route_stop(df):
    """
    Calculate headways by route stop

    A 'unique_stop_route' column (unique stop id and unique route id joined
    by a comma) is added to df in place and used as the grouping key. For
    each group the departure times are sorted and the gaps between
    consecutive departures, converted from seconds to minutes, are
    summarized with pandas ``describe``.

    Parameters
    ----------
    df : pandas.DataFrame
        interpolated stop times dataframe for stop times within the time
        range with appended trip and route information; must have
        'unique_stop_id', 'unique_route_id' and
        'departure_time_sec_interpolate' columns

    Returns
    -------
    dataframe : pandas.DataFrame
        dataframe of statistics of route stop headways in units of minutes,
        one row per unique_stop_route
    """

    # TODO: Optimize for speed

    start_time = time.time()

    # NOTE: the new column is written to the caller's dataframe
    df['unique_stop_route'] = (df['unique_stop_id'].str.cat(
        df['unique_route_id'].astype('str'), sep=','))

    stop_route_groups = df.groupby('unique_stop_route')
    log('Starting route stop headway calculation for {:,} route '
        'stops...'.format(len(stop_route_groups)))

    results = {}

    # suppress RuntimeWarning: Mean of empty slice. for this code block
    with warnings.catch_warnings():
        # the category has to be the warning class itself; a string here
        # makes the filter raise TypeError as soon as a warning is emitted
        warnings.simplefilter("ignore", category=RuntimeWarning)

        for unique_stop_route, stop_route_group in stop_route_groups:
            # sort a copy of the departure times instead of sorting the
            # group in place, which would write to a slice of df
            departures = (stop_route_group['departure_time_sec_interpolate']
                          .sort_values(ascending=True).values)
            # gap between each departure and the one before it, in minutes
            stop_route_group_headways = (
                departures[1:] - departures[:-1]) / 60.0
            results[unique_stop_route] = (
                pd.Series(stop_route_group_headways).describe())

    log('Route stop headway calculation complete. Took {:,.2f} seconds'.format(
        time.time() - start_time))

    return pd.DataFrame(results).T