Example #1
def merge_osm_fwaze(
        join_table,
        waze_features,
        osm,
        city,
        features_name,
        load_cache=True):

    path = Path(__file__).resolve().parents[2] / 'data' / 'processed' / ('waze+osm_' + city + '_' + features_name + '.csv')

    if os.path.exists(path) and load_cache:

        with utils.Logger('Fetching data from cache'):
            wo = pd.read_csv(path)

    else: 
        
        with utils.Logger('Merging Waze with OSM'):
            
            ww = join_table.merge(waze_features, right_index=True, left_on='waze_id')
            wo = ww.merge(osm['edges'], right_on='geometry_id', left_on='osm_id', how='right')

            wo = wo.drop(columns=['geometry_x', 'osm_id', 'id_segment'])

            wo = wo.rename(columns={'geometry_segment': 'waze_geometry',
                        'geometry_y': 'osm_geometry',
                        'geometry_id': 'osm_id'})
            

        with utils.Logger('Saving csv to {}'.format(path)):

            wo.to_csv(path, index=False)

    return wo
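
The function above links each Waze segment to OSM edges in two merges: the join table pulls in the Waze features by index, and the result is then merged onto the OSM edge table. A minimal sketch of that pattern on invented toy frames (all column values here are illustrative, not from the project):

import pandas as pd

# Hypothetical miniature inputs, mirroring the shapes merge_osm_fwaze expects.
join_table = pd.DataFrame({'waze_id': [10, 11], 'osm_id': ['e1', 'e2']})
waze_features = pd.DataFrame({'speed_kmh_avg': [42.0, 55.0]}, index=[10, 11])
osm_edges = pd.DataFrame({'geometry_id': ['e1', 'e2', 'e3'],
                          'highway': ['primary', 'residential', 'service']})

# Join table -> Waze features (by index) -> OSM edges (right join keeps all edges).
ww = join_table.merge(waze_features, right_index=True, left_on='waze_id')
wo = ww.merge(osm_edges, right_on='geometry_id', left_on='osm_id', how='right')
print(wo)
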
Example #2
def get_priority_G(osm, city, force=False):

    path = '../processed_data/join/priority_G_{}.p'.format(city)

    if os.path.exists(path) and not force:
        logging.debug('Loading cached priority G')
        priority_G = pickle.load(open(path, 'rb'))

    else:

        highway_priority = get_highway_priority()

        priorities = {'high': [1],
                      'medium': [1, 2],
                      'low': [1, 2, 3],
                      'all': [1, 2, 3, 4, 5]}

        priority_G = {}

        for priority, values in priorities.items():
            with utils.Logger('Subgraphing {}'.format(priority), 'INFO') as t:
                if priority == 'all':
                    priority_G[priority] = osm['G']
                else:
                    allowed_highways = list(
                        highway_priority[highway_priority['priority'].isin(values)]['highway'])
                    priority_G[priority] = nx.MultiDiGraph(
                        (u, v, d)
                        for u, v, k, d in osm['G'].edges(keys=True, data=True)
                        if d['highway'] in allowed_highways)

        with utils.Logger('Saving to {}'.format(path), 'INFO') as t:
            pickle.dump(priority_G, open(path, 'wb'))

    return priority_G
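
The heart of get_priority_G is rebuilding an nx.MultiDiGraph from only those edges whose highway tag falls in an allowed set. A self-contained sketch of that filtering step, with an invented toy graph:

import networkx as nx

G = nx.MultiDiGraph()
G.add_edge(1, 2, highway='motorway')
G.add_edge(2, 3, highway='residential')
G.add_edge(3, 1, highway='primary')

allowed = {'motorway', 'primary'}

# Keep only edges whose 'highway' attribute is in the allowed set.
subgraph = nx.MultiDiGraph(
    (u, v, d)
    for u, v, k, d in G.edges(keys=True, data=True)
    if d['highway'] in allowed)

print(subgraph.number_of_edges())  # 2
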
Example #3
def join_osm_waze(
        osm,
        features,
        city,
        n_threads=10,
        load_cache=True):

    path = Path(__file__).resolve().parents[2] / 'data' / 'processed' / ('waze+osm_' + city + '.csv')

    if os.path.exists(path) and load_cache:

        with utils.Logger('Fetching data from cache'):
            merge = pd.read_csv(path)

    else: 
        with utils.Logger('Boxing Waze by OSM', 'INFO'):
            osm_box = dt.box_around_city(osm)
            features = features[features.apply(lambda x: 
                                        x['geometry_segment'].intersects(osm_box),
                                        axis=1)]

        with utils.Logger('Joining osm and waze with {} threads'.format(n_threads),
                        'INFO'):

            centroids = get_centroids_tuple(osm)

            final = []
            i = 0
            for idx, row in features.iterrows():
                i = i + 1

                osm_segments = dask.delayed(get_similar_edges)(
                                row['geometry_segment'], osm, centroids,
                                row['id_segment'])

                final.append(osm_segments)

            with ProgressBar():
                final = dask.compute(final)[0]

            # To dataframe
            merge = pd.concat(final)
            merge.columns = ['osm_id', 'geometry', 'waze_id']

        with utils.Logger('Saving csv to {}'.format(path)):

            merge.to_csv(path, index=False)

    return merge
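
join_osm_waze wraps the per-segment matching in dask.delayed and materialises everything with a single dask.compute call under a ProgressBar. A minimal sketch of the same fan-out pattern, with a stand-in function in place of get_similar_edges:

import dask
from dask.diagnostics import ProgressBar

def slow_match(x):
    # Stand-in for the real per-row work done by get_similar_edges.
    return x * x

tasks = [dask.delayed(slow_match)(i) for i in range(10)]

with ProgressBar():
    results = dask.compute(tasks)[0]

print(results)
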
Example #4
def treat_osm_data(df):

    with utils.Logger('Fixing Types'):

        columns = ['highway', 'lanes', 'maxspeed', 'name', 'osmid']

        for column in list(columns):
            utils.Logger('Treating column: {}'.format(column)).print_message()
            df = split_data_frame_list(df, column, output_type=str)

        #print(df['maxspeed'])
        #df['maxspeed'] = df['maxspeed'].apply(clean_speed_number)

        df['oneway'] = df['oneway'].apply(clean_oneway)

    return df
Example #5
def calculate_shortest_path(begin_node, end_node, priority_G, priority=True, i=1):

    utils.Logger(
        'Calculating shortest path {}'.format(i),
        'INFO').print_message()

    for prio in ['high', 'medium', 'low', 'all']:
        if priority and prio != 'low':
            continue

        try:
            return nx.shortest_path(
                priority_G[prio],
                source=begin_node,
                target=end_node)
        except Exception:  # ideally (nx.NetworkXNoPath, nx.NodeNotFound)
            pass
        try:
            return nx.shortest_path(
                priority_G[prio],
                source=end_node,
                target=begin_node)
        except Exception as e:
            continue
    return None
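
Because the priority subgraphs are directed, calculate_shortest_path attempts each one in both directions before moving on. A small sketch of that fallback on a toy directed graph:

import networkx as nx

G = nx.DiGraph()
G.add_edge('a', 'b')
G.add_edge('b', 'c')  # only a -> b -> c exists, not c -> a

def path_either_direction(G, u, v):
    # Try u -> v first, then v -> u, mirroring calculate_shortest_path.
    for source, target in ((u, v), (v, u)):
        try:
            return nx.shortest_path(G, source=source, target=target)
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            continue
    return None

print(path_either_direction(G, 'c', 'a'))  # ['a', 'b', 'c']
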
Example #6
def segment_length(waze, features):

    with utils.Logger('Calculating segment length', 'INFO'):  
        features = null_feature(waze, columns=['geometry_segment'])
        features['segment_length'] = features['geometry_segment'].apply(
                                                    lambda x: x.length)

        return features
Example #7
def to_geojson(df, filename, path=False):

    with utils.Logger('Preprocessing Data'):
        for col in get_df_columns_types(df, 'numpy'):

            df = treat_numpy_to_geojson(df, col)

        for col in get_df_columns_types(df, list):

            df = split_data_frame_list(df, col, str)

        for col in get_df_columns_types(df, 'timestamp'):

            df = split_data_frame_list(df, col, str)

    geo_columns = get_shapely_columns(df)
    for column in geo_columns:

        with utils.Logger(
                'Writing GeoJson with {} as geometry column'.format(column)):

            path = Path(__file__).resolve(
            ).parents[2] / 'data' / 'processed' / 'geojson' / (
                filename + '_' + column + '.geojson')

            if not os.path.exists(path.parent):
                os.makedirs(path.parent)

            if isinstance(df, pd.DataFrame):
                df_geo = df.dropna(subset=[column])
                df_geo = gpd.GeoDataFrame(df_geo, geometry=column)
            else:
                df_geo = df.set_geometry(column)

            # Drop other geometry columns
            df_geo = df_geo.drop(
                columns=[x for x in geo_columns if x != column])

            try:
                os.remove(path)
            except OSError:
                pass
            df_geo.to_file(path, driver='GeoJSON')

            print('Saved at ', path)
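
The essential step inside to_geojson is promoting the frame to a GeoDataFrame on one chosen geometry column, dropping the other geometry columns, and writing with the GeoJSON driver. A stand-alone sketch of that step with invented data (the output filename is illustrative):

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

df = pd.DataFrame({
    'osm_geometry': [Point(0, 0), Point(1, 1)],
    'waze_geometry': [Point(0.1, 0.1), Point(1.1, 1.1)],
    'name': ['a', 'b'],
})

column = 'osm_geometry'
df_geo = gpd.GeoDataFrame(df.dropna(subset=[column]), geometry=column)
# Drop the other geometry column so the GeoJSON has a single geometry.
df_geo = df_geo.drop(columns=['waze_geometry'])
df_geo.to_file('example_osm_geometry.geojson', driver='GeoJSON')
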
Example #8
def basic_info(waze, features):

    with utils.Logger('Extracting unique values', 'INFO'):

        unique_columns = ['id_segment', 'start_node', 'end_node', 'road_type',
                          'street', 'city', 'country', 'line', 'line_geo',
                          'type', 'geometry']

        return waze.drop_duplicates(subset=['id_segment'])[unique_columns].set_index('id_segment')
Example #9
def perform_join(osm, path, i=1):

    utils.Logger('Joining aadt+osm {}'.format(i), 'INFO').print_message()
    idx = []
    for first, last in zip(path[:-1], path[1:]):
        idx.append(osm['edges'][((osm['edges']['u'] == first) & (osm['edges']['v'] == last)) | (
            (osm['edges']['v'] == first) & (osm['edges']['u'] == last))].index[0])

    return idx
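
perform_join walks the node path pairwise and looks each (u, v) pair up in the edge table in either direction. The pairwise step itself is just a zip of the path against itself shifted by one:

path = [101, 102, 103, 104]

# Consecutive node pairs along the path, exactly as iterated in perform_join.
pairs = list(zip(path[:-1], path[1:]))
print(pairs)  # [(101, 102), (102, 103), (103, 104)]
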
Example #10
def simple_statistics(waze, features):

    with utils.Logger('Calculating Simple Statistics', 'INFO'):

        if 'speed_kmh' not in waze.columns:
            waze['speed_kmh'] = waze['speedKMH']

        aggregations = aggregations_generator(['speed_kmh', 'delay'])
        features = waze.groupby('id_segment').agg(aggregations)
        features.columns = features.columns.get_level_values(1)

    return features
Example #11
def get_aadt(city_name):
    """
    info_path: path to AADT info
    shape_path: path to shape file
    return: GeoPandas DataFrame
    """

    year = 2016

    # TODO by city

    info_path_major = '../raw_data/AADT/AADF-data-major-roads.csv'
    shape_path_major = "../raw_data/AADT/shape/major-roads-link-network2016.shp"

    with utils.Logger('Loading data'):
        aadt_shape = gpd.read_file(shape_path_major)
        aadt = pd.read_csv(info_path_major)

    logging.debug('Filtering by city and year: {} {}'.format(city_name, year))

    aadt = aadt[(aadt['ONS LA Name'].str.contains(city_name, case=False))
                & (aadt['AADFYear'] == year)]

    with utils.Logger('Merging: {}, {}'.format(city_name, year)):
        aadt = aadt_shape.merge(aadt, right_on='CP', left_on='CP_Number')

    project = partial(
        pyproj.transform,
        pyproj.Proj(init='epsg:27700'),  # source coordinate system
        pyproj.Proj(init='epsg:4326'))  # destination coordinate system

    with utils.Logger('Transform coord system'):
        aadt['geometry'] = aadt['geometry'].apply(
            lambda x: transform(project, x))
        aadt['begin'] = aadt['geometry'].apply(
            lambda line: list(map(Point, line.coords)))
        aadt[['begin', 'end']] = pd.DataFrame(aadt['begin'].values.tolist(),
                                              index=aadt.index)

    return gpd.GeoDataFrame(aadt)
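
get_aadt reprojects the geometries from British National Grid (EPSG:27700) to WGS84 (EPSG:4326) with the legacy pyproj.Proj(init=...) interface. On pyproj 2 and later the same transform is usually expressed with a Transformer; a hedged sketch of that alternative (coordinates invented):

import pyproj
from shapely.geometry import LineString
from shapely.ops import transform

# always_xy=True keeps (easting, northing) / (lon, lat) axis order.
transformer = pyproj.Transformer.from_crs('EPSG:27700', 'EPSG:4326', always_xy=True)

line_bng = LineString([(530000, 180000), (531000, 181000)])
line_wgs84 = transform(transformer.transform, line_bng)
print(line_wgs84)
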
Example #12
def create_boxes(osm, threshold, max_distance=2):

    with utils.Logger('Creating boxes', 'INFO'):

        initial_geometry = box_around_city(osm)

        df = fishnet(initial_geometry, osm, threshold=threshold)

        for distance in range(1, max_distance + 1):
            df['osmids_neigh_{}'.format(distance)] = df.apply(
                enhance_osm_box_neighbours, args=(df, distance), axis=1)

    return df
Example #13
def fake_feature(waze, features):
    """A feature prototype.

    How to use:
    - copy this
    - change the name
    - change the log content
    - write some feature code.
    - add the function name to the desired_features parameter on
    build_features

    Rules:
    1. If this feature depends on other features, solve this
    inside this function. Do not expect that someone will know
    which feature to ask for.
    2. Name your features properly. We like to understand them and
    make sure that no other feature already has this name.
    
    Parameters
    ----------
    waze : pd.DataFrame
        Raw waze data
    features : pd.DataFrame
        Unique segment data
    
    Returns
    -------
    pd.DataFrame
        Features -- Unique segment with specific data. The index has to be
        the segment_id, the columns your features.
    """

    with utils.Logger('This is doing something...', 'INFO'):

        # Null feature - it just constructs a DataFrame with the index as
        # id_segment and a set of columns of your choice.
        features = null_feature(waze, columns=[])
    
        # Some code ...

        return features
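
Following the prototype, a concrete feature could look like the sketch below. It assumes the same utils.Logger and null_feature helpers used throughout this module and a waze frame with an id_segment column; the feature name n_observations is invented for illustration.

def observation_count(waze, features):
    """Number of raw Waze rows observed for each segment."""

    with utils.Logger('Counting observations per segment', 'INFO'):

        features = null_feature(waze, columns=[])

        features['n_observations'] = waze.groupby('id_segment').size()

        return features
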
Example #14
def fishnet(geometry, osm, threshold):

    utils.Logger('Calculating Fishnet with threshold of {}'.format(threshold),
                 'INFO').print_message()

    bounds = geometry.bounds
    xmin = int(bounds[0] // threshold)
    xmax = int(bounds[2] // threshold)
    ymin = int(bounds[1] // threshold)
    ymax = int(bounds[3] // threshold)
    ncols = int(xmax - xmin + 1)
    nrows = int(ymax - ymin + 1)

    result = []
    for i in range(xmin, xmax + 1):
        for j in range(ymin, ymax + 1):

            b = box(i * threshold, j * threshold, (i + 1) * threshold,
                    (j + 1) * threshold)

            g = geometry.intersection(b)

            if g.is_empty:
                continue

            result.append({
                'box': g,
                'i': i,
                'j': j,
                'centroid': g.centroid,
                'osmids': list(
                    osm['edges'][osm['edges']['geometry'].apply(
                        lambda x: x.intersects(g))]['geometry_id'])
            })

    return pd.DataFrame(result)
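
The grid indices in fishnet come from integer-dividing the geometry bounds by the cell size; each (i, j) pair then covers the square [i * threshold, (i + 1) * threshold] x [j * threshold, (j + 1) * threshold]. A small worked sketch of that arithmetic on an arbitrary bounding box:

from shapely.geometry import box

threshold = 0.01
geometry = box(-0.025, 51.493, -0.004, 51.512)  # arbitrary bounding box

xmin_idx = int(geometry.bounds[0] // threshold)  # -3
xmax_idx = int(geometry.bounds[2] // threshold)  # -1
ymin_idx = int(geometry.bounds[1] // threshold)  # 5149
ymax_idx = int(geometry.bounds[3] // threshold)  # 5151

ncols = xmax_idx - xmin_idx + 1  # 3 columns of cells
nrows = ymax_idx - ymin_idx + 1  # 3 rows of cells

# The cell containing the lower-left corner of the geometry:
cell = box(xmin_idx * threshold, ymin_idx * threshold,
           (xmin_idx + 1) * threshold, (ymin_idx + 1) * threshold)
print(ncols, nrows, cell.bounds)
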
Example #15
def free_flow_speed_estimation(waze, features):
    """Estimates the maximum and minimum free flow speed
    values based on the fact that the value level is
    a proportion of the speed.  
    
    Returns
    -------
    'free_flow_speed_min', 'free_flow_speed_max'
        The bounds of the estimation
    """


    with utils.Logger('Estimating Free Flow Speed', 'INFO'):  

        waze[['free_flow_speed_min', 'free_flow_speed_max']] = (
            waze.apply(level_to_free_flow, axis=1).apply(pd.Series))

        features = null_feature(waze)

        features[['free_flow_speed_min', 'free_flow_speed_max']] = (
            waze[['id_segment', 'free_flow_speed_min', 'free_flow_speed_max']]
            .groupby('id_segment').apply(improve_free_flow_estimate).apply(pd.Series))

        return features
Example #16
def get_osm(city, G=False, load_cache=True, fishnet=False, threshold=0.01):
    """Process the graph generated by Open Stree Maps (OSM) to nodes, edges DataFrames. 
    
    It also maps the edges and nodes of the graph to smaller quadrants.
    # TODO improve id system

    Parameters
    ----------
    city : string
        Name of the city that you are working on. It works as a unique id.
    G : nx.DiGraph, optional
        A road network graph generated by OSMnx
        (the default is False, in which case cached data is looked up by the city name)
    load_cache : bool, optional
        Whether to load cached data (the default is True, which loads cache)
    fishnet: bool, optional
        Whether to divide the city in regions as quadrants using the threshold
    threshold : float, optional
        Size of the smaller quadrants (the default is 0.01)
    
    Returns
    -------
    dictionary

        A dictionary with:
        
        - 'G': OSMNX nx.DiGraph
        - 'edges': geopandas.GeoDataFrame with the features of all edges
        - 'nodes': geopandas.GeoDataFrame with the features of all nodes
        - 'boxes': geopandas.GeoDataFrame mapping edges to quadrants (optional)
    """

    path = Path(__file__).resolve().parents[2] / 'data' / 'processed' / (
        'osm_' + city + '.p')

    if os.path.exists(path) and load_cache:

        with utils.Logger('Fetching data from cache'):
            osm = pickle.load(open(path, 'rb'))

    else:

        if not G:
            raise ValueError(
                'There is no cached data at {}. Please provide a graph.'.format(path))

        with utils.Logger('Treating data'):
            osm = {
                'nodes': gpd.GeoDataFrame(nodes_to_pd(G)),
                'edges': edges_to_pd(G),
                'G': G
            }
            osm['edges'] = gpd.GeoDataFrame(treat_osm_data(osm['edges']))
            osm['nodes']['point'] = osm['nodes'].apply(
                lambda node: Point([node['x'], node['y']]), axis=1)
            osm['edges']['geometry_id'] = osm['edges']['geometry'].apply(str)

        with utils.Logger('Calculating fishnet'):
            if fishnet:
                osm['boxes'] = create_boxes(osm, threshold=threshold)

        with utils.Logger('Saving OSM data to {}'.format(path)):
            pickle.dump(osm, open(path, 'wb'))

    return osm
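
A typical first call passes a freshly downloaded OSMnx graph and lets subsequent calls hit the pickle cache; a hedged usage sketch (the place name and city id are illustrative):

import osmnx as ox

# First run: download the drive network and build the processed dictionary.
G = ox.graph_from_place('London, UK', network_type='drive')
osm = get_osm('london', G=G, fishnet=True, threshold=0.01)

# Later runs can rely on the cached pickle alone.
osm = get_osm('london')
print(len(osm['edges']), len(osm['nodes']))
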
Example #17
def treat_waze_jams(city, waze, load_cache=True, columns={}):
    """Get Waze Jam Data and splits the segments to their smallest units, 'segment atom'.
    It is important because a 'segment atom' are indepent and can be in different jams
    depending on the day, time, weather and other conditions.

    This connects directly to the database and queries it.
    
    Parameters
    ----------
    city : string   
        Name of the city that you are working on. It works as a unique id.
    table : string, optional
        Table name (the default is False, which searches cached data)
    schema : bool, optional
        Schema name (the default is False, which searches cached data)
    con : bool, optional
        [description] (the default is False, which searches cached data)
    load_cache : bool, optional
        Whether to load data from cache (the default is True, which [default_description])
    columns : dict, optional
        Point to these columns:
            - id
            - pub_utc_date
            - line
        as dict ==> {'uuid': 'id', 'time': 'pub_utc_date'}
    
    Returns
    -------
    gpd.GeoPandasDataFrame
        'Atomized' table
    """

    path = Path(__file__).resolve().parents[2] / 'data' / 'processed' / (
        'waze_' + city + '.p')

    if os.path.exists(path) and load_cache:

        with utils.Logger('Fetching data from cache'):
            waze = pickle.load(open(path, 'rb'))

    else:

        with utils.Logger('Converting line to shape'):

            waze = waze.rename(columns=columns)

            try:
                waze['geometry'] = waze['line'].apply(
                    lambda x: LineString(ast.literal_eval(x)))
            except Exception:
                if isinstance(waze['line'][0], str):
                    waze['line'] = waze['line'].apply(ast.literal_eval)

                waze['geometry'] = waze['line'].apply(
                    lambda x: LineString([(i['x'], i['y']) for i in x]))

        with utils.Logger('Converting to datetime'):
            try:
                waze['startTime'] = pd.to_datetime(waze['startTime'])
                waze['endTime'] = pd.to_datetime(waze['endTime'])
            except Exception:
                waze['pub_utc_date'] = pd.to_datetime(waze['pub_utc_date'])

        with utils.Logger('Get waze segments', 'INFO'):

            if '_id' not in waze.columns:
                waze['_id'] = waze['id']

            dask.set_options(pool=ThreadPool(40))

            segments = []
            for i, row in waze.iterrows():
                segments.append(dask.delayed(get_segments)(row))

            segments = dask.delayed(pd.concat)(segments)

            segments_waze = dask.delayed(segments).merge(waze, on='_id')

            with ProgressBar():
                waze = dask.compute(segments_waze)[0]

        with utils.Logger('Saving csv to {}'.format(path)):

            pickle.dump(gpd.GeoDataFrame(waze), open(path, 'wb'))

    return gpd.GeoDataFrame(waze)
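
The columns argument maps whatever the raw jams table calls its fields onto the names this function expects (id, pub_utc_date, line), as in the docstring's dict example. A hedged usage sketch with an invented two-row frame, assuming treat_waze_jams is importable from this module:

import pandas as pd

raw_jams = pd.DataFrame({
    'uuid': [1, 2],
    'time': ['2018-06-01 08:00:00', '2018-06-01 08:02:00'],
    'line': ["[{'x': -0.12, 'y': 51.50}, {'x': -0.11, 'y': 51.51}]"] * 2,
})

waze = treat_waze_jams(
    'london', raw_jams,
    load_cache=False,
    columns={'uuid': 'id', 'time': 'pub_utc_date'})
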
Example #18
def join_osm_aadt(
        osm,
        aadt,
        city,
        n_threads=10,
        load_cache=False,
        force_computation=False):

    dask.set_options(pool=ThreadPool(n_threads))

    save_path = '../processed_data/join/{}_'.format(city)
    if os.path.exists(save_path) and not load_cache:

        with utils.Logger('Fetching data from cache', 'INFO') as t:

            utils.Logger('Loading AADT').print_message()
            aadt = pickle.load(open(save_path + 'aadt.csv', 'rb'))
            utils.Logger('Loading OSM').print_message()
            osm = pickle.load(open(save_path + 'osm.csv', 'rb'))
    else:

        with utils.Logger('Calculate priority subgraphs', 'INFO') as t:
            priority_G = get_priority_G(osm, city, force=force_computation)

        with utils.Logger('Check OSM and AADT', 'INFO') as t:

            if 'CP_Number' in osm['edges']:
                osm = dt.get_osm(None, city)
                aadt = dt.get_aadt(city)

        with utils.Logger('Join aadt and osm', 'INFO') as t:

            final = []
            for i, row in aadt.iterrows():
                begin_aadt = dask.delayed(
                    get_closest_osm_node)(row['begin'], osm, i)
                end_aadt = dask.delayed(
                    get_closest_osm_node)(row['end'], osm, i)
                path = dask.delayed(calculate_shortest_path)(
                    begin_aadt, end_aadt, priority_G, i=i)
                osm_idx = dask.delayed(perform_join)(osm, path, i)
                final.append({'index': i,
                              'begin_aadt': begin_aadt,
                              'end_aadt': end_aadt,
                              'path': path,
                              'osm_idx': osm_idx})

            final = dask.compute(final)[0]

            for f in final:
                osm['edges'].loc[f['osm_idx'],
                                 'CP_Number'] = aadt.loc[f['index'], 'CP_Number']

            utils.Logger('OSM length {}'.format(
                len(osm['edges']))).print_message()
            aadt = aadt.merge(
                pd.DataFrame(final),
                right_on='index',
                left_index=True,
                how='left')
            utils.Logger('OSM length {}'.format(
                len(osm['edges']))).print_message()
            osm['edges'] = osm['edges'].merge(aadt, on='CP_Number', how='left')
            utils.Logger('OSM length {}'.format(
                len(osm['edges']))).print_message()

        with utils.Logger('Saving osm and aadt processed data to {}'.format(save_path), 'INFO') as t:
            pickle.dump(osm, open(save_path + 'osm.csv', 'wb'))
            pickle.dump(aadt, open(save_path + 'aadt.csv', 'wb'))

    return osm, aadt
Example #19
def get_closest_osm_node(node, osm, line=1, radius=0.0059, priority=True):

    utils.Logger(
        'Searching closest node {}'.format(line),
        'INFO').print_message()

    if not isinstance(node, shapely.geometry.point.Point):
        node = shapely.geometry.Point(node)

    # TODO: pass this as args 
    highway_priority = get_highway_priority()

    while True:
        radius = radius + 0.001

        utils.Logger(
            'Incrementing radius to  {}, {}'.format(
                radius, line)).print_message()

        # Get distance from node
        node_box = bbox_from_point(node, radius)
        osm_intersection = osm['nodes'][osm['nodes'][
            'point'].apply(lambda x: x.intersects(node_box))]
        utils.Logger('Intersection Length {}, {}'.format(
            len(osm_intersection), line)).print_message()
        if len(osm_intersection) == 0:
            utils.Logger('Passing {}'.format(line)).print_message()
            continue
        distance = [(row['n'], node.distance(row['point']))
                    for i, row in osm_intersection.iterrows()]

        # Merge with node info
        distance = osm['nodes'].merge(
            pd.DataFrame(distance, columns=['n', 'distance']),
            right_on='n',
            left_on='n').sort_values(by='distance')

        # Get node that connect high priority highway edges
        for i, row in distance.iterrows():

            # Get edges info
            a = get_edges_from_node(row['n'], osm)

            # If there is a priority edge, return node
            if priority:
                if radius > 0.0009 + 0.001 * 10:
                    if a['highway'].apply(
                            lambda x: highway_priority[highway_priority['highway'] == x].values[0][1]).min() < 3:
                        utils.Logger(
                            'Returning low priority  <3, {}'.format(line)).print_message()
                        return row['n']
                else:
                    if a['highway'].apply(
                            lambda x: highway_priority[highway_priority['highway'] == x].values[0][1]).min() < 2:
                        utils.Logger('Returning high priority  <2, {}'.format(line)).print_message()
                        return row['n']
            else:
                if a['highway'].apply(
                        lambda x: highway_priority[highway_priority['highway'] == x].values[0][1]).min():
                    utils.Logger('Returning no priority, {}'.format(line)).print_message()
                    return row['n']