# Standard-library and third-party imports required by the functions below.
# (Package-local helpers — e.g. confirmed, save_pickle, save_json, cd_dat,
# cd_dat_bbbike, download, and the various fetch_*/get_*/regulate_* functions —
# are assumed to be imported elsewhere in this package.)
import copy
import gc
import math
import os
import re
import time
import urllib.parse
import urllib.request

import bs4
import more_itertools
import pandas as pd
import rapidjson
import requests
import sqlalchemy.engine.reflection
from osgeo import ogr


def collect_bbbike_subregion_catalogue(confirmation_required=True, verbose=False):
    """
    :param confirmation_required: [bool] (default: True)
    :param verbose: [bool] (default: False)

    Testing e.g.
        confirmation_required = True
        verbose = True

        collect_bbbike_subregion_catalogue(confirmation_required, verbose)
    """
    if confirmed("To collect BBBike subregion catalogue? ", confirmation_required=confirmation_required):
        try:
            home_url = 'http://download.bbbike.org/osm/bbbike/'
            bbbike_subregion_catalogue = pd.read_html(home_url, header=0, parse_dates=['Last Modified'])[0].drop(0)
            bbbike_subregion_catalogue.Name = bbbike_subregion_catalogue.Name.map(lambda x: x.strip('/'))

            save_pickle(bbbike_subregion_catalogue, cd_dat("BBBike-subregion-catalogue.pickle"), verbose=verbose)

            bbbike_subregion_names = bbbike_subregion_catalogue.Name.tolist()
            save_pickle(bbbike_subregion_names, cd_dat("BBBike-subregion-name-list.pickle"), verbose=verbose)

        except Exception as e:
            print("Failed to get the required information ... {}.".format(e))
    else:
        print("The information collection process was not activated. The existing local copy will be loaded instead.")
def download_bbbike_subregion_osm_all_files(subregion_name, download_dir=None, download_confirmation_required=True):
    """
    :param subregion_name: [str]
    :param download_dir: [str or None]
    :param download_confirmation_required: [bool]
    """
    subregion_name_ = regulate_bbbike_input_subregion_name(subregion_name)
    bbbike_download_dictionary = fetch_bbbike_download_catalogue("BBBike-download-catalogue")
    sub_download_catalogue = bbbike_download_dictionary[subregion_name_]

    data_dir = cd_dat_bbbike(subregion_name_) if not download_dir else regulate_input_data_dir(download_dir)

    if confirmed("Confirm to download all available BBBike data for \"{}\"?".format(subregion_name_),
                 confirmation_required=download_confirmation_required):
        print("\nStart to download all available OSM data for \"{}\" ... \n".format(subregion_name_))

        for download_url, osm_filename in zip(sub_download_catalogue.URL, sub_download_catalogue.Filename):
            print("\n\n\"{}\" (below): ".format(osm_filename))
            try:
                path_to_file = os.path.join(data_dir, subregion_name_, osm_filename)
                download(download_url, path_to_file)
                # if os.path.getsize(path_to_file) / (1024 ** 2) <= 5:
                #     time.sleep(5)
            except Exception as e:
                print("\nFailed to download \"{}\". {}.".format(osm_filename, e))

        print("\nCheck out the downloaded OSM data for \"{}\" at \"{}\".".format(
            subregion_name_, os.path.join(data_dir, subregion_name_)))
    else:
        print("The downloading process was not activated.")
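# A minimal usage sketch for the function above ('Birmingham' is an illustrative
# BBBike city name, not taken from the original code):
#
#   download_bbbike_subregion_osm_all_files('Birmingham', download_dir=None,
#                                           download_confirmation_required=True)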
def collect_bbbike_download_catalogue(confirmation_required=True):
    """
    :param confirmation_required: [bool]
    """
    if confirmed("To collect BBBike download dictionary? ", confirmation_required=confirmation_required):
        try:
            bbbike_subregion_names = fetch_bbbike_subregion_catalogue("BBBike-subregion-name-list", update=True)
            download_catalogue = [
                fetch_bbbike_subregion_download_catalogue(subregion_name, update=True, confirmation_required=False)
                for subregion_name in bbbike_subregion_names]

            subregion_name, subregion_download_catalogue = bbbike_subregion_names[0], download_catalogue[0]

            # Available file formats
            file_fmt = [re.sub('{}|CHECKSUM'.format(subregion_name), '', f)
                        for f in subregion_download_catalogue.Filename]
            save_pickle(file_fmt[:-2], cd_dat("BBBike-osm-file-formats.pickle"))

            # Available data types
            data_typ = subregion_download_catalogue.DataType.tolist()
            save_pickle(data_typ[:-2], cd_dat("BBBike-osm-data-types.pickle"))

            # available_file_formats = dict(zip(file_fmt, file_ext))

            downloads_dictionary = dict(zip(bbbike_subregion_names, download_catalogue))
            save_pickle(downloads_dictionary, cd_dat("BBBike-download-catalogue.pickle"))
        except Exception as e:
            print("Failed to collect BBBike download dictionary. {}".format(e))
    else:
        print("The information collection process was not activated. The existing local copy will be loaded instead.")
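# Usage sketch: this collector is typically run once to (re)build the local pickle
# files, after which fetch_bbbike_download_catalogue() (used elsewhere in this
# module) reads them back:
#
#   collect_bbbike_download_catalogue(confirmation_required=True)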
def download_bbbike_subregion_osm(*subregion_name, osm_file_format, download_dir=None, update=False,
                                  download_confirmation_required=True):
    """
    :param subregion_name: [str]
    :param osm_file_format: [str]
    :param download_dir: [str or None]
    :param update: [bool]
    :param download_confirmation_required: [bool]
    """
    for sub_reg_name in subregion_name:
        subregion_name_, osm_filename, download_url, path_to_file = validate_bbbike_download_info(
            sub_reg_name, osm_file_format, download_dir)

        if os.path.isfile(path_to_file) and not update:
            print("\"{}\" is already available for \"{}\" at: \n\"{}\".\n".format(
                osm_filename, subregion_name_, path_to_file))
        else:
            if confirmed("\nTo download {} data for {}".format(osm_file_format, subregion_name_),
                         confirmation_required=download_confirmation_required):
                try:
                    download(download_url, path_to_file)
                    print("\n\"{}\" has been downloaded for \"{}\", which is now available at \n\"{}\".\n".format(
                        osm_filename, subregion_name_, path_to_file))

                    if os.path.getsize(path_to_file) / (1024 ** 2) <= 5:
                        time.sleep(5)
                except Exception as e:
                    print("\nFailed to download \"{}\". {}.".format(osm_filename, e))
            else:
                print("The downloading process was not activated.")
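# Usage sketch (hedged: the city names and the ".osm.pbf" format string are
# illustrative; valid values come from the collected BBBike download catalogue):
#
#   download_bbbike_subregion_osm('Birmingham', 'Leeds', osm_file_format=".osm.pbf",
#                                 download_dir=None, update=False,
#                                 download_confirmation_required=True)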
def collect_continents_subregion_tables(confirmation_required=True):
    """
    :param confirmation_required: [bool] whether to ask for confirmation before starting to collect the information
    """
    if confirmed("To collect information about subregions of each continent? ",
                 confirmation_required=confirmation_required):
        try:
            home_link = 'https://download.geofabrik.de/'
            source = requests.get(home_link)
            soup = bs4.BeautifulSoup(source.text, 'lxml').find_all('td', {'class': 'subregion'})
            source.close()

            continent_names = [td.a.text for td in soup]
            continent_links = [urllib.parse.urljoin(home_link, td.a['href']) for td in soup]
            subregion_tables = dict(zip(continent_names, [get_subregion_table(url) for url in continent_links]))

            save_pickle(subregion_tables, cd_dat("GeoFabrik-continents-subregion-tables.pickle"))
        except Exception as e:
            print("Failed to collect the required information ... {}.".format(e))
    else:
        print("The information collection process was not activated. The existing local copy will be loaded instead.")
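# Usage sketch: one call populates "GeoFabrik-continents-subregion-tables.pickle",
# which fetch_continents_subregion_tables() (used below) then loads:
#
#   collect_continents_subregion_tables(confirmation_required=True)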
def drop(self, database_name=None):
    """
    :param database_name: [str] name of the database to drop, or None (default) to drop the currently-connected one
    """
    db_name = self.database_name if database_name is None else database_name
    if confirmed("Confirmed to drop the database \"{}\"?".format(db_name)):
        self.disconnect(db_name)
        self.engine.execute('DROP DATABASE IF EXISTS "{}"'.format(db_name))
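# Usage sketch (hedged: assumes this method lives on the same database wrapper class
# instantiated as OSM() elsewhere in this module; 'osm_test' is illustrative):
#
#   osmdb = OSM()
#   osmdb.connect_db(database_name='osm_test')
#   osmdb.drop()  # asks for confirmation, then drops 'osm_test'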
def download_subregion_osm_file(*subregion_name, osm_file_format, download_dir=None, update=False,
                                download_confirmation_required=True, verbose=True):
    """
    :param subregion_name: [str] case-insensitive, e.g. 'greater London', 'london'
    :param osm_file_format: [str] ".osm.pbf", ".shp.zip", or ".osm.bz2"
    :param download_dir: [str or None] directory to save the downloaded file(s); None (using default directory)
    :param update: [bool] whether to update (i.e. re-download) data
    :param download_confirmation_required: [bool] whether to confirm before downloading
    :param verbose: [bool]
    """
    for sub_reg_name in subregion_name:
        # Get download URL
        subregion_name_, download_url = get_subregion_download_url(sub_reg_name, osm_file_format, update=False)

        if not download_dir:
            # Download the requested OSM file to the default directory
            osm_filename, path_to_file = get_default_path_to_osm_file(subregion_name_, osm_file_format, mkdir=True)
        else:
            regulated_dir = regulate_input_data_dir(download_dir)
            osm_filename = get_default_osm_filename(subregion_name_, osm_file_format=osm_file_format)
            path_to_file = os.path.join(regulated_dir, osm_filename)

        if os.path.isfile(path_to_file) and not update:
            if verbose:
                print("\n\"{}\" is already available for \"{}\" at: \n\"{}\".\n".format(
                    osm_filename, subregion_name_, path_to_file))
        else:
            if confirmed("\nTo download {} data for {}".format(osm_file_format, subregion_name_),
                         confirmation_required=download_confirmation_required):
                op = "Updating" if os.path.isfile(path_to_file) else "Downloading"
                try:
                    download(download_url, path_to_file)
                    print("\n{} \"{}\" for \"{}\" ... Done.".format(op, osm_filename, subregion_name_))
                    print("Check out: \"{}\".".format(path_to_file))
                except Exception as e:
                    print("\nFailed to download \"{}\". {}.".format(osm_filename, e))
            else:
                print("The downloading process was not activated.")
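# Usage sketch, mirroring the "Testing e.g." examples used elsewhere in this module:
#
#   download_subregion_osm_file('rutland', osm_file_format=".osm.pbf",
#                               download_dir=None, update=False,
#                               download_confirmation_required=True, verbose=True)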
def drop_schema(self, *schema_names):
    """
    :param schema_names: [str] name of one schema, or names of multiple schemas
    """
    if schema_names:
        schemas = tuple(schema_name for schema_name in schema_names)
    else:
        schemas = tuple(
            x for x in sqlalchemy.engine.reflection.Inspector.from_engine(self.engine).get_schema_names()
            if x != 'public' and x != 'information_schema')

    if confirmed("Confirmed to drop the schema(s): {}".format(schemas)):
        self.engine.execute(('DROP SCHEMA IF EXISTS ' + '%s, ' * (len(schemas) - 1) + '%s CASCADE;') % schemas)
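# Usage sketch (schema names are illustrative): with arguments, the named schemas are
# dropped; with no arguments, every schema except 'public' and 'information_schema'
# is dropped:
#
#   osmdb.drop_schema('points', 'lines')
#   osmdb.drop_schema()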
def download_sub_subregion_osm_file(*subregion_name, osm_file_format, download_dir=None, update=False,
                                    download_confirmation_required=True, interval_sec=5):
    """
    :param subregion_name: [str] case-insensitive, e.g. 'greater London', 'london'
    :param osm_file_format: [str] ".osm.pbf", ".shp.zip", or ".osm.bz2"
    :param download_dir: [str; None (default)] directory to save the downloaded file(s); None (using default directory)
    :param update: [bool] (default: False) whether to update (i.e. re-download) data
    :param download_confirmation_required: [bool] (default: True) whether to confirm before downloading
    :param interval_sec: [int; None] (default: 5) interval (in sec) between downloading two subregions

    Testing e.g.
        subregion_name_1 = 'bedfordshire'
        subregion_name_2 = 'rutland'
        osm_file_format = ".osm.pbf"
        download_dir = None
        update = False
        download_confirmation_required = True
        interval_sec = 5

        download_sub_subregion_osm_file(subregion_name_1, subregion_name_2, osm_file_format=osm_file_format,
                                        download_dir=download_dir, update=update,
                                        download_confirmation_required=download_confirmation_required,
                                        interval_sec=interval_sec)
    """
    subregions = retrieve_names_of_subregions_of(*subregion_name)

    if confirmed("\nTo download {} data for all the following subregions: \n{}?\n".format(
            osm_file_format, ", ".join(subregions)), confirmation_required=download_confirmation_required):
        download_subregion_osm_file(*subregions, osm_file_format=osm_file_format, download_dir=download_dir,
                                    update=update, download_confirmation_required=False)
        if interval_sec:
            time.sleep(interval_sec)
def collect_bbbike_subregion_download_catalogue(subregion_name, confirmation_required=True):
    """
    :param subregion_name: [str]
    :param confirmation_required: [bool] (default: True)
    """

    def parse_dlc(dlc):
        dlc_href = dlc.get('href')  # URL
        filename, download_url = dlc_href.strip('./'), urllib.parse.urljoin(url, dlc_href)
        if not dlc.has_attr('title'):
            file_format, file_size, last_update = 'Poly', None, None
        else:
            if len(dlc.contents) < 3:
                file_format, file_size = 'Txt', None
            else:
                file_format, file_size, _ = dlc.contents  # File type and size
                file_format, file_size = file_format.strip(), file_size.text
            last_update = pd.to_datetime(dlc.get('title'))  # Date and time
        parsed_dat = [filename, download_url, file_format, file_size, last_update]
        return parsed_dat

    subregion_name_ = regulate_bbbike_input_subregion_name(subregion_name)

    if confirmed("To collect BBBike download catalogue for \"{}\"? ".format(subregion_name_),
                 confirmation_required=confirmation_required):
        try:
            url = 'https://download.bbbike.org/osm/bbbike/{}/'.format(subregion_name_)
            source = urllib.request.urlopen(url)
            source_soup = bs4.BeautifulSoup(source, 'lxml')
            download_links_class = source_soup.find_all(name='a', attrs={'class': ['download_link', 'small']})

            subregion_downloads_catalogue = pd.DataFrame(parse_dlc(x) for x in download_links_class)
            subregion_downloads_catalogue.columns = ['Filename', 'URL', 'DataType', 'Size', 'LastUpdate']

            path_to_file = cd_dat_bbbike(subregion_name_, subregion_name_ + "-download-catalogue.pickle")
            save_pickle(subregion_downloads_catalogue, path_to_file)
        except Exception as e:
            print("Failed to collect download catalogue for \"{}\". {}".format(subregion_name_, e))
    else:
        print("The information collection process was not activated. The existing local copy will be loaded instead.")
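# Usage sketch ('Leeds' is an illustrative BBBike city name):
#
#   collect_bbbike_subregion_download_catalogue('Leeds', confirmation_required=True)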
def collect_region_subregion_tier(confirmation_required=True, verbose=False):
    """
    :param confirmation_required: [bool] (default: True) whether to confirm before collecting region-subregion tier
    :param verbose: [bool] (default: False)

    Testing e.g.
        confirmation_required = True
        verbose = True

        collect_region_subregion_tier(confirmation_required, verbose)
    """

    # Find out all regions and their subregions
    def compile_region_subregion_tier(sub_reg_tbls):
        """
        :param sub_reg_tbls: [dict] {region name: table}, obtained from fetch_continents_subregion_tables()
        :return: ([dict], [list]) a dictionary of region-subregion, and a list of (sub)regions without subregions
        """
        having_subregions = copy.deepcopy(sub_reg_tbls)
        region_subregion_tiers = copy.deepcopy(sub_reg_tbls)

        non_subregions_list = []
        for k, v in sub_reg_tbls.items():
            if v is not None and isinstance(v, pd.DataFrame):
                region_subregion_tiers = update_nested_dict(sub_reg_tbls, {k: set(v.Subregion)})
            else:
                non_subregions_list.append(k)

        for x in non_subregions_list:
            having_subregions.pop(x)

        having_subregions_temp = copy.deepcopy(having_subregions)

        while having_subregions_temp:
            for region_name, subregion_table in having_subregions.items():
                subregion_names, subregion_links = subregion_table.Subregion, subregion_table.SubregionURL
                sub_subregion_tables = dict(
                    zip(subregion_names, [get_subregion_table(link) for link in subregion_links]))

                subregion_index, without_subregion_ = compile_region_subregion_tier(sub_subregion_tables)
                non_subregions_list += without_subregion_

                region_subregion_tiers.update({region_name: subregion_index})

                having_subregions_temp.pop(region_name)

        # 'Russian Federation' appears on both the Asia and the Europe pages, so there are duplicates in
        # non_subregions_list
        non_subregions_list = list(more_itertools.unique_everseen(non_subregions_list))
        return region_subregion_tiers, non_subregions_list

    if confirmed("To compile a region-subregion tier? (Note that it may take a few minutes.) ",
                 confirmation_required=confirmation_required):
        try:
            subregion_tables = fetch_continents_subregion_tables(update=True)
            region_subregion_tier, non_subregions = compile_region_subregion_tier(subregion_tables)

            save_pickle(region_subregion_tier, cd_dat("GeoFabrik-region-subregion-tier.pickle"), verbose=verbose)
            save_json(region_subregion_tier, cd_dat("GeoFabrik-region-subregion-tier.json"), verbose=verbose)
            save_pickle(non_subregions, cd_dat("GeoFabrik-non-subregion-list.pickle"), verbose=verbose)
        except Exception as e:
            print("Failed to get the required information ... {}.".format(e))
    else:
        print("The information collection process was not activated.")
def collect_subregion_info_catalogue(confirmation_required=True, verbose=False):
    """
    :param confirmation_required: [bool] (default: True) whether to confirm before starting to collect information
    :param verbose: [bool] (default: False)

    Testing e.g.
        confirmation_required = True
        verbose = True

        collect_subregion_info_catalogue(confirmation_required, verbose)
    """
    if confirmed("To collect all available subregion links? (Note that it may take a few minutes.) ",
                 confirmation_required=confirmation_required):
        home_url = 'http://download.geofabrik.de/'
        try:
            source = requests.get(home_url)
            soup = bs4.BeautifulSoup(source.text, 'lxml')
            source.close()

            avail_subregions = [td.a.text for td in soup.find_all('td', {'class': 'subregion'})]
            avail_subregion_urls = [
                urllib.parse.urljoin(home_url, td.a['href']) for td in soup.find_all('td', {'class': 'subregion'})]
            avail_subregion_url_tables = [get_subregion_table(sub_url, verbose) for sub_url in avail_subregion_urls]
            avail_subregion_url_tables = [tbl for tbl in avail_subregion_url_tables if tbl is not None]

            subregion_url_tables = list(avail_subregion_url_tables)

            while subregion_url_tables:
                subregion_url_tables_ = []

                for subregion_url_table in subregion_url_tables:
                    subregions = list(subregion_url_table.Subregion)
                    subregion_urls = list(subregion_url_table.SubregionURL)
                    subregion_url_tables_0 = [
                        get_subregion_table(subregion_url, verbose) for subregion_url in subregion_urls]
                    subregion_url_tables_ += [tbl for tbl in subregion_url_tables_0 if tbl is not None]

                    # (Note that 'Russian Federation' data is available in both 'Asia' and 'Europe')
                    avail_subregions += subregions
                    avail_subregion_urls += subregion_urls
                    avail_subregion_url_tables += subregion_url_tables_

                subregion_url_tables = list(subregion_url_tables_)

            # Save a list of available subregions locally
            save_pickle(avail_subregions, cd_dat("GeoFabrik-subregion-name-list.pickle"), verbose=verbose)

            # Subregion index - {Subregion: URL}
            subregion_url_index = dict(zip(avail_subregions, avail_subregion_urls))

            # Save subregion_url_index to local disk
            save_pickle(subregion_url_index, cd_dat("GeoFabrik-subregion-name-url-dictionary.pickle"), verbose=verbose)
            save_json(subregion_url_index, cd_dat("GeoFabrik-subregion-name-url-dictionary.json"), verbose=verbose)

            # All available URLs for downloading
            home_subregion_url_table = get_subregion_table(home_url)
            avail_subregion_url_tables.append(home_subregion_url_table)
            subregion_downloads_index = pd.DataFrame(pd.concat(avail_subregion_url_tables, ignore_index=True))
            subregion_downloads_index.drop_duplicates(inplace=True)
            subregion_downloads_index_json = subregion_downloads_index.set_index('Subregion').to_json()

            # Save subregion_downloads_index to local disk
            save_pickle(subregion_downloads_index, cd_dat("GeoFabrik-subregion-downloads-catalogue.pickle"),
                        verbose=verbose)
            save_json(subregion_downloads_index_json, cd_dat("GeoFabrik-subregion-downloads-catalogue.json"),
                      verbose=verbose)
        except Exception as e:
            print("Failed to get the required information ... {}.".format(e))
    else:
        print("The information collection process was not activated.")
def psql_osm_pbf_data_extracts(*subregion_name, database_name='OSM_Geofabrik', data_dir=None, update_osm_pbf=False,
                               if_table_exists='replace', file_size_limit=50, parsed=True, fmt_other_tags=True,
                               fmt_single_geom=True, fmt_multi_geom=True, rm_raw_file=False, verbose=False):
    """
    Import data of selected or all (sub)regions, which do not have (sub-)subregions, into a PostgreSQL server

    :param subregion_name: [str]
    :param database_name: [str] (default: 'OSM_Geofabrik')
    :param data_dir: [str; None (default)]
    :param update_osm_pbf: [bool] (default: False)
    :param if_table_exists: [str] 'replace' (default); 'append'; or 'fail'
    :param file_size_limit: [int] (default: 50)
    :param parsed: [bool] (default: True)
    :param fmt_other_tags: [bool] (default: True)
    :param fmt_single_geom: [bool] (default: True)
    :param fmt_multi_geom: [bool] (default: True)
    :param rm_raw_file: [bool] (default: False)
    :param verbose: [bool] (default: False)
    """
    if not subregion_name:
        subregion_names = fetch_region_subregion_tier("GeoFabrik-non-subregion-list")
        confirm_msg = "To dump GeoFabrik OSM data extracts of all subregions to PostgreSQL? "
    else:
        subregion_names = retrieve_names_of_subregions_of(*subregion_name)
        confirm_msg = "To dump GeoFabrik OSM data extracts of the following subregions to PostgreSQL? \n{}?\n".format(
            ", ".join(subregion_names))

    if confirmed(confirm_msg):
        # Connect to PostgreSQL server
        osmdb = OSM()
        osmdb.connect_db(database_name=database_name)

        err_subregion_names = []
        for subregion_name_ in subregion_names:
            default_pbf_filename, default_path_to_pbf = get_default_path_to_osm_file(subregion_name_, ".osm.pbf")
            if not data_dir:  # Go to the default file path
                path_to_osm_pbf = default_path_to_pbf
            else:
                osm_pbf_dir = regulate_input_data_dir(data_dir)
                path_to_osm_pbf = os.path.join(osm_pbf_dir, default_pbf_filename)

            download_subregion_osm_file(subregion_name_, osm_file_format=".osm.pbf", download_dir=data_dir,
                                        update=update_osm_pbf, download_confirmation_required=False, verbose=verbose)

            file_size_in_mb = round(os.path.getsize(path_to_osm_pbf) / (1024 ** 2), 1)

            try:
                if file_size_in_mb <= file_size_limit:
                    subregion_osm_pbf = read_osm_pbf(subregion_name_, data_dir, parsed, file_size_limit,
                                                     fmt_other_tags, fmt_single_geom, fmt_multi_geom,
                                                     update=False, download_confirmation_required=False,
                                                     pickle_it=False, rm_osm_pbf=rm_raw_file)
                    if subregion_osm_pbf is not None:
                        osmdb.dump_osm_pbf_data(subregion_osm_pbf, table_name=subregion_name_,
                                                if_exists=if_table_exists)
                        del subregion_osm_pbf
                        gc.collect()
                else:
                    print("\nParsing and importing \"{}\" feature-wisely to PostgreSQL ... ".format(subregion_name_))
                    # Reference: https://gdal.org/python/osgeo.ogr.Feature-class.html
                    raw_osm_pbf = ogr.Open(path_to_osm_pbf)
                    layer_count = raw_osm_pbf.GetLayerCount()
                    for i in range(layer_count):
                        lyr = raw_osm_pbf.GetLayerByIndex(i)  # Hold the i-th layer
                        lyr_name = lyr.GetName()
                        print("    {} ... ".format(lyr_name), end="")
                        try:
                            lyr_feats = [feat for _, feat in enumerate(lyr)]
                            feats_no, chunks_no = len(lyr_feats), math.ceil(file_size_in_mb / file_size_limit)
                            chunked_lyr_feats = split_list(lyr_feats, chunks_no)

                            del lyr_feats
                            gc.collect()

                            if osmdb.subregion_table_exists(lyr_name, subregion_name_) and \
                                    if_table_exists == 'replace':
                                osmdb.drop_subregion_data_by_layer(subregion_name_, lyr_name)

                            # Loop through all available features
                            for lyr_chunk in chunked_lyr_feats:
                                lyr_chunk_dat = pd.DataFrame(rapidjson.loads(f.ExportToJson()) for f in lyr_chunk)
                                lyr_chunk_dat = parse_layer_data(lyr_chunk_dat, lyr_name, fmt_other_tags,
                                                                fmt_single_geom, fmt_multi_geom)
                                if_exists_ = if_table_exists if if_table_exists == 'fail' else 'append'
                                osmdb.dump_osm_pbf_data_by_layer(lyr_chunk_dat, if_exists=if_exists_,
                                                                 schema_name=lyr_name, table_name=subregion_name_)
                                del lyr_chunk_dat
                                gc.collect()

                            print("Done. Total amount of features: {}".format(feats_no))

                        except Exception as e:
                            print("Failed. {}".format(e))

                    raw_osm_pbf.Release()
                    del raw_osm_pbf
                    gc.collect()

                if rm_raw_file:
                    remove_subregion_osm_file(path_to_osm_pbf, verbose=verbose)

            except Exception as e:
                print(e)
                err_subregion_names.append(subregion_name_)

            if subregion_name_ != subregion_names[-1]:
                time.sleep(60)

        if len(err_subregion_names) == 0:
            print("\nMission accomplished.\n")
        else:
            print("\nErrors occurred when parsing data of the following subregion(s):")
            print(*err_subregion_names, sep=", ")

        osmdb.disconnect()
        del osmdb