def merge_osm_fwaze(
        join_table, waze_features, osm, city, features_name, load_cache=True):
    """Attach Waze features to the OSM edges and cache the result as CSV.

    Parameters
    ----------
    join_table : pd.DataFrame
        Table with (at least) the columns ``waze_id`` and ``osm_id``.
    waze_features : pd.DataFrame
        Waze features; joined on its index against ``waze_id``.
    osm : dict
        Must contain ``osm['edges']`` with a ``geometry_id`` column.
    city : str
        Used to build the cache file name.
    features_name : str
        Used to build the cache file name.
    load_cache : bool, optional
        When True and the cache file exists, the CSV is read and returned
        instead of recomputing the merge.

    Returns
    -------
    pd.DataFrame
        OSM edges (right join, so every edge is kept) with the Waze
        features, using the columns ``waze_geometry``, ``osm_geometry``
        and ``osm_id``.
    """
    processed_dir = Path(__file__).resolve().parents[2] / 'data' / 'processed'
    path = processed_dir / (
        'waze+osm_' + city + '_' + features_name + '.csv')

    # Guard clause: serve the cached CSV when allowed and available.
    if os.path.exists(path) and load_cache:
        with utils.Logger('Fetching data from cache'):
            wo = pd.read_csv(path)
        return wo

    with utils.Logger('Merging Waze with OSM'):
        waze_joined = join_table.merge(
            waze_features, right_index=True, left_on='waze_id')
        merged = waze_joined.merge(
            osm['edges'],
            right_on='geometry_id',
            left_on='osm_id',
            how='right')
        redundant = ['geometry_x', 'osm_id', 'id_segment']
        new_names = {
            'geometry_segment': 'waze_geometry',
            'geometry_y': 'osm_geometry',
            'geometry_id': 'osm_id',
        }
        wo = merged.drop(columns=redundant).rename(columns=new_names)

    with utils.Logger('Saving csv to {}'.format(path)):
        wo.to_csv(path, index=False)
    return wo
def get_priority_G(osm, city, force=False):
    """Build (or load) one road sub-graph per highway priority level.

    Parameters
    ----------
    osm : dict
        Must contain ``osm['G']``, a graph whose edges carry a
        ``'highway'`` attribute.
    city : str
        Used to build the pickle cache file name.
    force : bool, optional
        When True the cache is ignored and the sub-graphs are rebuilt.

    Returns
    -------
    dict
        Keys ``'high'``, ``'medium'``, ``'low'`` and ``'all'``. ``'all'``
        is ``osm['G']`` itself; the others are ``nx.MultiDiGraph`` objects
        built from the edges whose highway type has one of the priorities
        listed for that level.
    """
    path = '../processed_data/join/priority_G_{}.p'.format(city)

    if os.path.exists(path) and not force:
        logging.debug('Loading cached priority G')
        # NOTE: pickle can execute arbitrary code; only load trusted caches.
        with open(path, 'rb') as cache_file:
            return pickle.load(cache_file)

    highway_priority = get_highway_priority()
    priorities = {
        'high': [1],
        'medium': [1, 2],
        'low': [1, 2, 3],
        'all': [1, 2, 3, 4, 5],
    }

    priority_G = {}
    for priority, values in priorities.items():
        with utils.Logger('Subgraphing {}'.format(priority), 'INFO'):
            if priority == 'all':
                priority_G[priority] = osm['G']
            else:
                # Computed once per level instead of once per edge. Kept as
                # a list (not a set) because an edge's 'highway' value may
                # be unhashable, e.g. a list.
                allowed = list(
                    highway_priority[
                        highway_priority['priority'].isin(values)
                    ]['highway'])
                priority_G[priority] = nx.MultiDiGraph(
                    (u, v, d)
                    for u, v, k, d in osm['G'].edges(keys=True, data=True)
                    if d['highway'] in allowed)

    # The original message had no placeholder, so the path was never shown.
    with utils.Logger('Saving to {}'.format(path), 'INFO'):
        with open(path, 'wb') as cache_file:
            pickle.dump(priority_G, cache_file)
    return priority_G
def join_osm_waze(
        osm, features, city, n_threads=10, load_cache=True):
    """Match every Waze segment with its similar OSM edges.

    Parameters
    ----------
    osm : dict
        OSM data, forwarded to ``dt.box_around_city``,
        ``get_centroids_tuple`` and ``get_similar_edges``.
    features : pd.DataFrame
        Waze segments with the columns ``geometry_segment`` and
        ``id_segment``.
    city : str
        Used to build the cache file name.
    n_threads : int, optional
        Only reported in the log message inside this function.
    load_cache : bool, optional
        When True and the cache file exists, the CSV is read and returned.

    Returns
    -------
    pd.DataFrame
        Join table with the columns ``osm_id``, ``geometry`` and
        ``waze_id`` (or the cached CSV contents).
    """
    path = Path(__file__).resolve().parents[2] / 'data' / 'processed' / (
        'waze+osm_' + city + '.csv')

    if os.path.exists(path) and load_cache:
        with utils.Logger('Fetching data from cache'):
            # The original stored this in `wo` but returned `merge`, so a
            # cache hit never handed the data back to the caller.
            return pd.read_csv(path)

    with utils.Logger('Boxing Waze by OSM', 'INFO'):
        osm_box = dt.box_around_city(osm)
        inside_city = features.apply(
            lambda x: x['geometry_segment'].intersects(osm_box), axis=1)
        features = features[inside_city]

    with utils.Logger(
            'Joining osm and waze with {} threads'.format(n_threads), 'INFO'):
        centroids = get_centroids_tuple(osm)
        # One lazy task per Waze segment; nothing runs until dask.compute.
        tasks = [
            dask.delayed(get_similar_edges)(
                row['geometry_segment'], osm, centroids, row['id_segment'])
            for _, row in features.iterrows()
        ]
        with ProgressBar():
            final = dask.compute(tasks)[0]

        # To dataframe
        merge = pd.concat(final)
        merge.columns = ['osm_id', 'geometry', 'waze_id']

    with utils.Logger('Saving csv to {}'.format(path)):
        merge.to_csv(path, index=False)
    return merge
def treat_osm_data(df):
    """Clean up the raw OSM edge attributes.

    The columns ``highway``, ``lanes``, ``maxspeed``, ``name`` and
    ``osmid`` are each passed through ``split_data_frame_list`` with
    ``output_type=str``; ``oneway`` is then mapped with ``clean_oneway``.

    Parameters
    ----------
    df : pd.DataFrame
        OSM edges.

    Returns
    -------
    pd.DataFrame
        The treated edges.
    """
    columns_to_split = ('highway', 'lanes', 'maxspeed', 'name', 'osmid')

    with utils.Logger('Fixing Types'):
        for column in columns_to_split:
            message = 'Treating column: {}'.format(column)
            utils.Logger(message).print_message()
            df = split_data_frame_list(df, column, output_type=str)

        # 'maxspeed' cleaning with clean_speed_number is currently disabled.
        cleaned_oneway = df['oneway'].apply(clean_oneway)
        df['oneway'] = cleaned_oneway
    return df
def calculate_shortest_path(begin_node, end_node, priority_G,
                            priority=True, i=1):
    """Find a shortest path between two nodes in the priority sub-graphs.

    Each candidate graph is tried from ``begin_node`` to ``end_node`` and,
    if that fails, from ``end_node`` to ``begin_node``. Any ``Exception``
    raised by an attempt is treated as "no path here".

    Parameters
    ----------
    begin_node, end_node
        Node identifiers.
    priority_G : dict
        Graphs keyed by ``'high'``, ``'medium'``, ``'low'`` and ``'all'``.
    priority : bool, optional
        When truthy only the ``'low'`` graph is searched; otherwise all
        four graphs are tried in the order high, medium, low, all.
    i : optional
        Only used in the log message.

    Returns
    -------
    list or None
        The first path found, or None when every attempt failed.
    """
    utils.Logger(
        'Calculating shortest path {}'.format(i), 'INFO').print_message()

    if priority:
        graph_names = ['low']
    else:
        graph_names = ['high', 'medium', 'low', 'all']

    # Forward first, then the reversed direction as a fallback.
    directions = ((begin_node, end_node), (end_node, begin_node))

    for name in graph_names:
        for source, target in directions:
            try:
                return nx.shortest_path(
                    priority_G[name], source=source, target=target)
            except Exception:
                # Best effort: move on to the next direction / graph.
                pass
    return None
def segment_length(waze, features):
    """Feature: length of each segment geometry.

    Parameters
    ----------
    waze : pd.DataFrame
        Raw waze data, handed to ``null_feature``.
    features : pd.DataFrame
        Not read; the returned frame is built from ``waze``.

    Returns
    -------
    pd.DataFrame
        Result of ``null_feature(waze, columns=['geometry_segment'])`` plus
        a ``segment_length`` column holding ``.length`` of each geometry.
    """
    with utils.Logger('Calculating segment length', 'INFO'):
        segments = null_feature(waze, columns=['geometry_segment'])
        lengths = segments['geometry_segment'].apply(
            lambda geom: geom.length)
        segments['segment_length'] = lengths
    return segments
def to_geojson(df, filename, path=False):
    """Write one GeoJSON file per geometry column of ``df``.

    Columns are first preprocessed: those reported by
    ``get_df_columns_types`` as ``'numpy'`` go through
    ``treat_numpy_to_geojson``; those reported for ``list`` and
    ``'timestamp'`` go through ``split_data_frame_list`` with ``str``.

    Files are written to
    ``<project>/data/processed/geojson/<filename>_<column>.geojson`` and
    any existing file with the same name is replaced.

    Parameters
    ----------
    df : pd.DataFrame or gpd.GeoDataFrame
        Data to export.
    filename : str
        Prefix of the output file names.
    path : optional
        Kept for backward compatibility; it is not used. The output
        location is always derived from ``filename``.
    """
    with utils.Logger('Preprocessing Data'):
        for col in get_df_columns_types(df, 'numpy'):
            df = treat_numpy_to_geojson(df, col)
        for col in get_df_columns_types(df, list):
            df = split_data_frame_list(df, col, str)
        for col in get_df_columns_types(df, 'timestamp'):
            df = split_data_frame_list(df, col, str)

    geo_columns = get_shapely_columns(df)
    output_dir = (Path(__file__).resolve().parents[2]
                  / 'data' / 'processed' / 'geojson')

    for column in geo_columns:
        with utils.Logger(
                'Writing GeoJson with {} as geometry column'.format(column)):
            # Local name, so the `path` argument is no longer overwritten.
            out_path = output_dir / (filename + '_' + column + '.geojson')
            os.makedirs(out_path.parent, exist_ok=True)

            if isinstance(df, pd.DataFrame):
                df_geo = gpd.GeoDataFrame(
                    df.dropna(subset=[column]), geometry=column)
            else:
                # The original called set_geometry on the not-yet-defined
                # `df_geo` and discarded the result.
                df_geo = df.set_geometry(column)

            # Drop other geometry columns
            df_geo = df_geo.drop(
                columns=[x for x in geo_columns if x != column])

            # Remove a stale file from a previous run; it is fine if there
            # is none.
            try:
                os.remove(out_path)
            except OSError:
                pass

            df_geo.to_file(out_path, driver='GeoJSON')
            print('Saved at ', out_path)
def basic_info(waze, features):
    """Feature: static attributes of each segment.

    Parameters
    ----------
    waze : pd.DataFrame
        Raw waze data.
    features : pd.DataFrame
        Not read by this function.

    Returns
    -------
    pd.DataFrame
        One row per ``id_segment`` (the first occurrence in ``waze``),
        indexed by ``id_segment``, with the node, road, location and
        geometry columns listed in the body.
    """
    with utils.Logger('Extracting unique values', 'INFO'):
        unique_columns = [
            'id_segment', 'start_node', 'end_node', 'road_type', 'street',
            'city', 'country', 'line', 'line_geo', 'type', 'geometry',
        ]
        one_row_per_segment = waze.drop_duplicates(subset=['id_segment'])
        selected = one_row_per_segment[unique_columns]
        return selected.set_index('id_segment')
def perform_join(osm, path, i=1):
    """Translate a node path into the index labels of the OSM edges used.

    Parameters
    ----------
    osm : dict
        Must contain ``osm['edges']`` with the columns ``u`` and ``v``.
    path : sequence or None
        Consecutive node ids. ``None`` means no path was found and yields
        an empty result.
    i : optional
        Only used in the log message.

    Returns
    -------
    list
        For each consecutive node pair, the index label of the first edge
        connecting them in either direction.

    Raises
    ------
    IndexError
        If a consecutive node pair has no matching edge.
    """
    utils.Logger('Joining aadt+osm {}'.format(i), 'INFO').print_message()

    # calculate_shortest_path returns None when nothing was found; slicing
    # None used to raise TypeError here.
    if path is None:
        return []

    edges = osm['edges']
    u, v = edges['u'], edges['v']

    idx = []
    for first, last in zip(path[:-1], path[1:]):
        connects = ((u == first) & (v == last)) | (
            (v == first) & (u == last))
        idx.append(edges[connects].index[0])
    return idx
def simple_statistics(waze, features):
    """Feature: per-segment aggregates of ``speed_kmh`` and ``delay``.

    Side effect: when ``waze`` has no ``speedKMH`` column, one is added as
    a copy of ``speed_kmh``.

    Parameters
    ----------
    waze : pd.DataFrame
        Raw waze data with ``id_segment``, ``speed_kmh`` and ``delay``.
    features : pd.DataFrame
        Not read; replaced by the aggregation result.

    Returns
    -------
    pd.DataFrame
        Indexed by ``id_segment``; the columns are the second level of the
        aggregated column index.
    """
    with utils.Logger('Calculating Simple Statistics', 'INFO'):
        has_camel_case_speed = 'speedKMH' in waze.columns
        if not has_camel_case_speed:
            waze['speedKMH'] = waze['speed_kmh']

        aggregations = aggregations_generator(['speed_kmh', 'delay'])
        grouped = waze.groupby('id_segment')
        stats = grouped.agg(aggregations)
        # Keep only the second level of the (column, aggregation) index.
        stats.columns = stats.columns.get_level_values(1)
    return stats
def get_aadt(city_name):
    """Load the AADT major-roads data for one city.

    The shapefile and the CSV are read from hard-coded paths under
    ``../raw_data/AADT``. The CSV is filtered to rows whose ``ONS LA Name``
    contains ``city_name`` (case-insensitive) and whose ``AADFYear`` is
    2016, then merged with the shapes on ``CP_Number`` / ``CP``.

    Parameters
    ----------
    city_name : str
        Matched against the ``ONS LA Name`` column.

    Returns
    -------
    gpd.GeoDataFrame
        Merged data with geometries transformed from EPSG:27700 to
        EPSG:4326, plus ``begin`` and ``end`` columns holding the first and
        last point of each geometry.
    """
    year = 2016  # TODO by city
    info_path_major = '../raw_data/AADT/AADF-data-major-roads.csv'
    shape_path_major = (
        "../raw_data/AADT/shape/major-roads-link-network2016.shp")

    with utils.Logger('Loading data'):
        aadt_shape = gpd.read_file(shape_path_major)
        aadt = pd.read_csv(info_path_major)

    logging.debug('Filtering by city and year: {} {}'.format(city_name, year))
    in_city = aadt['ONS LA Name'].str.contains(city_name, case=False)
    in_year = aadt['AADFYear'] == year
    aadt = aadt[in_city & in_year]

    with utils.Logger('Merging: {}, {}'.format(city_name, year)):
        aadt = aadt_shape.merge(aadt, right_on='CP', left_on='CP_Number')

    project = partial(
        pyproj.transform,
        pyproj.Proj(init='epsg:27700'),  # source coordinate system
        pyproj.Proj(init='epsg:4326'))  # destination coordinate system

    with utils.Logger('Transform coord system'):
        aadt['geometry'] = aadt['geometry'].apply(
            lambda x: transform(project, x))

    # Take the endpoints explicitly. The original expanded every coordinate
    # into a column and assigned them to ['begin', 'end'], which only works
    # when every line has exactly two coordinates.
    aadt['begin'] = aadt['geometry'].apply(
        lambda line: Point(line.coords[0]))
    aadt['end'] = aadt['geometry'].apply(
        lambda line: Point(line.coords[-1]))
    return gpd.GeoDataFrame(aadt)
def create_boxes(osm, threshold, max_distance=2):
    """Split the city into a fishnet and annotate each cell's neighbours.

    Parameters
    ----------
    osm : dict
        OSM data, forwarded to ``box_around_city`` and ``fishnet``.
    threshold : float
        Cell size handed to ``fishnet``.
    max_distance : int, optional
        One ``osmids_neigh_<d>`` column is added for every ``d`` from 1 to
        ``max_distance``.

    Returns
    -------
    pd.DataFrame
        The fishnet frame with the neighbour columns added.
    """
    with utils.Logger('Creating boxes', 'INFO'):
        city_outline = box_around_city(osm)
        df = fishnet(city_outline, osm, threshold=threshold)

        distances = range(1, max_distance + 1)
        for distance in distances:
            column_name = 'osmids_neigh_{}'.format(distance)
            neighbours = df.apply(
                enhance_osm_box_neighbours, args=(df, distance), axis=1)
            df[column_name] = neighbours
    return df
def fake_feature(waze, features):
    """A feature prototype.

    How to use:
        - copy this function
        - change its name
        - change the log content
        - write the feature code
        - add the function name to the desired_features parameter of
          build_features

    Rules:
        1. If this feature depends on other features, resolve that inside
           this function. Do not expect callers to know which features to
           request first.
        2. Name your features properly, so they are easy to understand and
           do not clash with the name of an existing feature.

    Parameters
    ----------
    waze : pd.DataFrame
        Raw waze data
    features : pd.DataFrame
        Unique segment data

    Returns
    -------
    pd.DataFrame
        Features -- unique segments with specific data. The index has to
        be id_segment, the columns your features.
    """
    with utils.Logger('This is doing something...', 'INFO'):
        # null_feature builds a DataFrame indexed by id_segment with the
        # set of columns you ask for (none here).
        no_columns = []
        features = null_feature(waze, columns=no_columns)

        # Feature code goes here ...

    return features
def fishnet(geometry, osm, threshold):
    """Cut ``geometry`` into square cells and list the OSM edges in each.

    The grid is aligned to multiples of ``threshold``. Cells whose
    intersection with ``geometry`` is empty are skipped.

    Parameters
    ----------
    geometry : shapely geometry
        Area to cover; ``bounds``, ``intersection`` and the resulting
        ``is_empty`` / ``centroid`` are used.
    osm : dict
        Must contain ``osm['edges']`` with the columns ``geometry`` and
        ``geometry_id``.
    threshold : float
        Side length of a cell.

    Returns
    -------
    pd.DataFrame
        One row per non-empty cell with the columns ``box`` (the clipped
        cell), ``i``, ``j`` (grid coordinates), ``centroid`` and ``osmids``
        (``geometry_id`` of every edge intersecting the cell). The columns
        are present even when no cell was produced.
    """
    utils.Logger('Calculating Fishnet with threshold of {}'.format(threshold),
                 'INFO').print_message()

    bounds = geometry.bounds
    xmin = int(bounds[0] // threshold)
    xmax = int(bounds[2] // threshold)
    ymin = int(bounds[1] // threshold)
    ymax = int(bounds[3] // threshold)

    edges = osm['edges']  # loop-invariant lookup, done once

    result = []
    for i in range(xmin, xmax + 1):
        for j in range(ymin, ymax + 1):
            b = box(i * threshold, j * threshold,
                    (i + 1) * threshold, (j + 1) * threshold)
            g = geometry.intersection(b)
            if g.is_empty:
                continue
            touches_cell = edges['geometry'].apply(
                lambda x: x.intersects(g))
            result.append({
                'box': g,
                'i': i,
                'j': j,
                'centroid': g.centroid,
                'osmids': list(edges[touches_cell]['geometry_id']),
            })

    # Explicit columns keep the schema stable when `result` is empty.
    return pd.DataFrame(
        result, columns=['box', 'i', 'j', 'centroid', 'osmids'])
def free_flow_speed_estimation(waze, features):
    """Estimate lower and upper bounds of the free flow speed per segment.

    Row-level bounds come from ``level_to_free_flow`` and are written back
    into ``waze`` as ``free_flow_speed_min`` / ``free_flow_speed_max``
    (side effect). They are then reduced per ``id_segment`` with
    ``improve_free_flow_estimate``.

    Parameters
    ----------
    waze : pd.DataFrame
        Raw waze data; modified in place as described above.
    features : pd.DataFrame
        Not read; replaced by ``null_feature(waze)``.

    Returns
    -------
    pd.DataFrame
        Segment frame with ``free_flow_speed_min`` and
        ``free_flow_speed_max``, the bounds of the estimation.
    """
    bound_columns = ['free_flow_speed_min', 'free_flow_speed_max']

    with utils.Logger('Estimating Free Flow Speed', 'INFO'):
        row_bounds = waze.apply(level_to_free_flow, axis=1)
        waze[bound_columns] = row_bounds.apply(pd.Series)

        features = null_feature(waze)

        by_segment = waze[['id_segment'] + bound_columns].groupby(
            'id_segment')
        segment_bounds = by_segment.apply(improve_free_flow_estimate)
        features[bound_columns] = segment_bounds.apply(pd.Series)
    return features
def get_osm(city, G=False, load_cache=True, fishnet=False, threshold=0.01):
    """Process the graph generated by Open Street Maps (OSM) into nodes and
    edges DataFrames. Optionally maps the edges to smaller quadrants.

    # TODO improve id system

    Parameters
    ----------
    city : string
        Name of the city that you are working on. It works as a unique id.
    G : nx.DiGraph, optional
        A road network graph generated by OSMNX (the default is False,
        which requires cached data stored under the name of the city)
    load_cache : bool, optional
        Whether to load cached data (the default is True, which loads cache)
    fishnet : bool, optional
        Whether to divide the city in regions as quadrants using the
        threshold
    threshold : float, optional
        Size of the smaller quadrants (the default is 0.01)

    Returns
    -------
    dictionary
        A dictionary with:
            - 'G': OSMNX nx.DiGraph
            - 'edges': geopandas.GeoDataFrame with the features of all edges
            - 'nodes': geopandas.GeoDataFrame with the features of all nodes
            - 'boxes': result of create_boxes (only when fishnet is True)

    Raises
    ------
    ValueError
        If the cache is not used (missing file or ``load_cache=False``)
        and no graph was given.
    """
    path = Path(__file__).resolve().parents[2] / 'data' / 'processed' / (
        'osm_' + city + '.p')

    if os.path.exists(path) and load_cache:
        with utils.Logger('Fetching data from cache'):
            # NOTE: pickle can execute arbitrary code; only load trusted
            # caches.
            with open(path, 'rb') as cache_file:
                return pickle.load(cache_file)

    if not G:
        # The original raised a plain string, which produces
        # "TypeError: exceptions must derive from BaseException".
        raise ValueError(
            'There is no cached data for {}. Please, insert a graph'.
            format(path))

    with utils.Logger('Treating data'):
        osm = {
            'nodes': gpd.GeoDataFrame(nodes_to_pd(G)),
            'edges': edges_to_pd(G),
            'G': G
        }
        osm['edges'] = gpd.GeoDataFrame(treat_osm_data(osm['edges']))
        osm['nodes']['point'] = osm['nodes'].apply(
            lambda node: Point([node['x'], node['y']]), axis=1)
        # The string form of the geometry is used as the edge id.
        osm['edges']['geometry_id'] = osm['edges']['geometry'].apply(str)

    with utils.Logger('Calculating fishnet'):
        if fishnet:
            osm['boxes'] = create_boxes(osm, threshold=threshold)

    with utils.Logger('Saving OSM data to {}'.format(path)):
        with open(path, 'wb') as cache_file:
            pickle.dump(osm, cache_file)
    return osm
def treat_waze_jams(city, waze, load_cache=True, columns=None):
    """Take Waze jam data and split the segments into their smallest
    units, the 'segment atoms'.

    Parameters
    ----------
    city : string
        Name of the city that you are working on. It works as a unique id
        and names the cache file.
    waze : pd.DataFrame
        Raw jams. Needs a ``line`` column (after renaming) and either
        ``startTime``/``endTime`` or ``pub_utc_date``; ``_id`` is copied
        from ``id`` when missing. Ignored when the cache is loaded.
    load_cache : bool, optional
        Whether to load data from cache (the default is True)
    columns : dict, optional
        Rename mapping applied to ``waze`` before processing, pointing to
        these columns:
            - id
            - pub_utc_date
            - line
        as dict ==> {'uuid': 'id', 'time': 'pub_utc_date'}

    Returns
    -------
    gpd.GeoDataFrame
        'Atomized' table: the output of ``get_segments`` for every row,
        merged back with ``waze`` on ``_id``.
    """
    # None instead of a shared mutable default; behaves like the old `{}`.
    columns = {} if columns is None else columns

    path = Path(__file__).resolve().parents[2] / 'data' / 'processed' / (
        'waze_' + city + '.p')

    if os.path.exists(path) and load_cache:
        with utils.Logger('Fetching data from cache'):
            # NOTE: pickle can execute arbitrary code; only load trusted
            # caches.
            with open(path, 'rb') as cache_file:
                waze = pickle.load(cache_file)
    else:
        with utils.Logger('Converting line to shape'):
            waze = waze.rename(columns=columns)
            try:
                # 'line' given as a string literal of coordinate pairs.
                waze['geometry'] = waze['line'].apply(
                    lambda x: LineString(ast.literal_eval(x)))
            except Exception:
                # Fallback: 'line' is a sequence of {'x': ..., 'y': ...}
                # dicts, possibly still serialized as a string. iloc avoids
                # depending on an index label 0.
                if isinstance(waze['line'].iloc[0], str):
                    waze['line'] = waze['line'].apply(ast.literal_eval)
                waze['geometry'] = waze['line'].apply(
                    lambda x: LineString([(i['x'], i['y']) for i in x]))

        with utils.Logger('Converting to datetime'):
            try:
                waze['startTime'] = pd.to_datetime(waze['startTime'])
                waze['endTime'] = pd.to_datetime(waze['endTime'])
            except Exception:
                # The original stored the result only under the misspelled
                # 'put_utc_date', leaving 'pub_utc_date' unconverted.
                waze['pub_utc_date'] = pd.to_datetime(waze['pub_utc_date'])
                # Misspelled alias kept for backward compatibility.
                waze['put_utc_date'] = waze['pub_utc_date']

        with utils.Logger('Get waze segments', 'INFO'):
            if '_id' not in waze.columns:
                waze['_id'] = waze['id']

            dask.set_options(pool=ThreadPool(40))
            # One lazy get_segments task per jam; nothing runs until
            # dask.compute below.
            segments = [
                dask.delayed(get_segments)(row)
                for _, row in waze.iterrows()
            ]
            segments = dask.delayed(pd.concat)(segments)
            segments_waze = dask.delayed(segments).merge(waze, on='_id')

            with ProgressBar():
                waze = dask.compute(segments_waze)[0]

        with utils.Logger('Saving csv to {}'.format(path)):
            with open(path, 'wb') as cache_file:
                pickle.dump(gpd.GeoDataFrame(waze), cache_file)

    return gpd.GeoDataFrame(waze)
def join_osm_aadt(
        osm, aadt, city, n_threads=10, load_cache=False,
        force_computation=False):
    """Assign AADT count points to the OSM edges along their road links.

    For every AADT row the closest OSM nodes to its ``begin`` and ``end``
    points are found, a shortest path between them is computed, and the
    edges on that path receive the row's ``CP_Number``. Both tables are
    then merged and pickled under ``../processed_data/join/<city>_*``.

    Parameters
    ----------
    osm : dict
        OSM data with ``'edges'``, ``'nodes'`` and ``'G'``.
    aadt : pd.DataFrame
        AADT rows with ``begin``, ``end`` and ``CP_Number``.
    city : str
        Used for cache file names and to reload data.
    n_threads : int, optional
        Size of the thread pool handed to dask.
    load_cache : bool, optional
        When True and both cache files exist, they are loaded and
        returned instead of recomputing.
    force_computation : bool, optional
        Forwarded to ``get_priority_G`` as ``force``.

    Returns
    -------
    tuple
        ``(osm, aadt)`` after the join.
    """
    dask.set_options(pool=ThreadPool(n_threads))

    save_path = '../processed_data/join/{}_'.format(city)
    osm_cache = save_path + 'osm.csv'
    aadt_cache = save_path + 'aadt.csv'

    # The original tested the bare prefix (never an existing file) and
    # negated `load_cache`, so the cache could not be used as intended.
    cache_exists = os.path.exists(osm_cache) and os.path.exists(aadt_cache)
    if load_cache and cache_exists:
        with utils.Logger('Fetching data from cache', 'INFO'):
            # NOTE: pickle can execute arbitrary code; only load trusted
            # caches.
            utils.Logger('Loading AADT').print_message()
            with open(aadt_cache, 'rb') as cache_file:
                aadt = pickle.load(cache_file)
            utils.Logger('Loading OSM').print_message()
            with open(osm_cache, 'rb') as cache_file:
                osm = pickle.load(cache_file)
        return osm, aadt

    with utils.Logger('Calculate priority subgraphs', 'INFO'):
        priority_G = get_priority_G(osm, city, force=force_computation)

    with utils.Logger('Check OSM and AADT', 'INFO'):
        # Inputs that already carry CP_Number were joined before; start
        # again from freshly loaded data.
        if 'CP_Number' in osm['edges']:
            # NOTE(review): was dt.get_osm(None, city); the get_osm in this
            # code base takes the city first - confirm dt.get_osm matches.
            osm = dt.get_osm(city)
            aadt = dt.get_aadt(city)

    with utils.Logger('Join aadt and osm', 'INFO'):
        final = []
        for i, row in aadt.iterrows():
            begin_aadt = dask.delayed(
                get_closest_osm_node)(row['begin'], osm, i)
            end_aadt = dask.delayed(
                get_closest_osm_node)(row['end'], osm, i)
            # `i` is only a label for the logs; passed positionally it
            # landed in the `priority` parameter.
            path = dask.delayed(calculate_shortest_path)(
                begin_aadt, end_aadt, priority_G, i=i)
            osm_idx = dask.delayed(perform_join)(osm, path, i)
            final.append({'index': i,
                          'begin_aadt': begin_aadt,
                          'end_aadt': end_aadt,
                          'path': path,
                          'osm_idx': osm_idx})

        final = dask.compute(final)[0]

        for f in final:
            osm['edges'].loc[f['osm_idx'], 'CP_Number'] = (
                aadt.loc[f['index'], 'CP_Number'])

        utils.Logger('OSM length {}'.format(
            len(osm['edges']))).print_message()
        aadt = aadt.merge(
            pd.DataFrame(final), right_on='index', left_index=True,
            how='left')
        utils.Logger('OSM length {}'.format(
            len(osm['edges']))).print_message()
        osm['edges'] = osm['edges'].merge(aadt, on='CP_Number', how='left')
        utils.Logger('OSM length {}'.format(
            len(osm['edges']))).print_message()

    # The original formatted the loop variable `path` (a dask Delayed, or
    # undefined for an empty `aadt`) instead of the save location.
    with utils.Logger(
            'Saving osm and aadt processed data to {}'.format(save_path),
            'INFO'):
        with open(osm_cache, 'wb') as cache_file:
            pickle.dump(osm, cache_file)
        with open(aadt_cache, 'wb') as cache_file:
            pickle.dump(aadt, cache_file)

    return osm, aadt
def get_closest_osm_node(node, osm, line=1, radius=0.0059, priority=True):
    """Find the nearest OSM node attached to a sufficiently important road.

    The search box around ``node`` grows by 0.001 per iteration (the first
    search already uses ``radius + 0.001``). Candidate nodes inside the
    box are visited from nearest to farthest and the first one that meets
    the priority rule is returned. The loop has no upper bound on the
    radius: it runs until a node qualifies.

    Priority rule, based on the minimum value found in the second column
    of ``get_highway_priority()`` over the node's edges:
        - ``priority`` truthy, radius <= 0.0109: minimum must be < 2
        - ``priority`` truthy, radius > 0.0109: minimum must be < 3
        - ``priority`` falsy: minimum must be truthy

    Parameters
    ----------
    node : shapely Point or coordinates
        Anything else than a Point is passed to ``shapely.geometry.Point``.
    osm : dict
        Must contain ``osm['nodes']`` with the columns ``n`` and ``point``.
    line : optional
        Only used in log messages.
    radius : float, optional
        Starting radius (before the first increment).
    priority : bool, optional
        Selects the rule described above.

    Returns
    -------
    The ``n`` value of the selected node.
    """
    utils.Logger(
        'Searching closest node {}'.format(line), 'INFO').print_message()

    if not isinstance(node, shapely.geometry.point.Point):
        node = shapely.geometry.Point(node)

    # TODO: pass this as args
    highway_priority = get_highway_priority()

    def highway_rank(highway):
        # Second column of the first row matching this highway type.
        return highway_priority[
            highway_priority['highway'] == highway].values[0][1]

    # Past this radius the acceptance threshold is relaxed from <2 to <3.
    relaxed_radius = 0.0009 + 0.001 * 10

    while True:
        radius = radius + 0.001
        utils.Logger(
            'Incrementing radius to {}, {}'.format(
                radius, line)).print_message()

        # Nodes inside the current search box
        node_box = bbox_from_point(node, radius)
        osm_intersection = osm['nodes'][osm['nodes'][
            'point'].apply(lambda x: x.intersects(node_box))]
        utils.Logger('Intersection Lenght {}, {}'.format(
            len(osm_intersection), line)).print_message()

        if len(osm_intersection) == 0:
            # The original message had no placeholder for `line`.
            utils.Logger('Passing, {}'.format(line)).print_message()
            continue

        distance = [(row['n'], node.distance(row['point']))
                    for i, row in osm_intersection.iterrows()]

        # Merge with node info, nearest candidates first
        distance = osm['nodes'].merge(
            pd.DataFrame(distance, columns=['n', 'distance']),
            right_on='n', left_on='n').sort_values(by='distance')

        for i, row in distance.iterrows():
            # Edges attached to this candidate and their best priority
            a = get_edges_from_node(row['n'], osm)
            best_rank = a['highway'].apply(highway_rank).min()

            if priority:
                if radius > relaxed_radius:
                    if best_rank < 3:
                        utils.Logger(
                            'Returning low priority <3, {}'.format(
                                line)).print_message()
                        return row['n']
                elif best_rank < 2:
                    utils.Logger(
                        'Returning high priority <2, {}'.format(
                            line)).print_message()
                    return row['n']
            elif best_rank:
                utils.Logger(
                    'Returning no priority, {}'.format(
                        line)).print_message()
                return row['n']