def zip_update_tables(path=None, csv=True):
    """Check local json, update zips of outdated tables.

    Grabs the json files in path, checks them against the metadata on
    StatsCan and grabs updated tables where there have been changes.
    There isn't actually a "last modified date" part to the metadata;
    what this does is compare the latest reference period. Almost all
    data changes will at least include incremental releases, so this
    should capture the changed tables.

    Parameters
    ----------
    path: str, default: None
        where to look for tables to update
    csv: boolean, default: True
        Downloads updates in CSV form by default, SDMX if false

    Returns
    -------
    update_table_list: list
        list of the tables that were updated
    """
    local_jsons = list_zipped_tables(path=path)
    tables = [j["productId"] for j in local_jsons]
    remote_jsons = get_cube_metadata(tables)
    # Match local and remote metadata by productId instead of zipping the two
    # lists, so the comparison is correct even if get_cube_metadata does not
    # return results in request order. A table missing from the remote
    # response compares unequal and is therefore re-downloaded.
    remote_by_id = {r["productId"]: r for r in remote_jsons}
    update_table_list = [
        local["productId"]
        for local in local_jsons
        if local["cubeEndDate"]
        != remote_by_id.get(local["productId"], {}).get("cubeEndDate")
    ]
    download_tables(update_table_list, path, csv=csv)
    return update_table_list
def h5_update_tables(h5file="stats_can.h5", path=None, tables=None):
    """Update any stats_can tables contained in an h5 file.

    Parameters
    ----------
    h5file: str, default stats_can.h5
        name of the h5file to store the tables in
    path: str or path, default = current working directory
        path to the h5file
    tables: str or list of str, optional, default None
        If included will only update the subset of tables already in the file
        and in the tables parameter

    Returns
    -------
    update_table_list: list
        list of the tables that were updated
    """
    if tables:
        local_jsons = metadata_from_h5(tables, h5file=h5file, path=path)
    else:
        h5 = os.path.join(path, h5file) if path else h5file
        # Open read-only: this branch only reads the stored json metadata.
        # An explicit mode avoids h5py's deprecated/version-dependent default.
        with h5py.File(h5, "r") as f:
            keys = [key for key in f.keys() if key.startswith("json")]
            local_jsons = [json.loads(f[key][()]) for key in keys]
    tables = [j["productId"] for j in local_jsons]
    remote_jsons = get_cube_metadata(tables)
    # Match by productId rather than relying on get_cube_metadata preserving
    # request order; a table absent from the remote response compares unequal
    # and is re-fetched.
    remote_by_id = {r["productId"]: r for r in remote_jsons}
    update_table_list = [
        local["productId"]
        for local in local_jsons
        if local["cubeEndDate"]
        != remote_by_id.get(local["productId"], {}).get("cubeEndDate")
    ]
    tables_to_h5(update_table_list, h5file=h5file, path=path)
    return update_table_list
def download_tables(tables, path=None, csv=True):
    """Download a json file and zip of data for a list of tables to path.

    Parameters
    ----------
    tables: list of str
        tables to be downloaded
    path: str or path object, default: None (will do current directory)
        Where to download the table and json
    csv: boolean, default True
        download in CSV format, if not download SDMX

    Returns
    -------
    downloaded: list
        list of tables that were downloaded
    """
    path = pathlib.Path(path) if path else pathlib.Path()
    metas = get_cube_metadata(tables)
    for meta in metas:
        product_id = meta["productId"]
        zip_url = get_full_table_download(product_id, csv=csv)
        zip_file_name = product_id + ("-eng.zip" if csv else ".zip")
        json_file_name = product_id + ".json"
        zip_file = path / zip_file_name
        json_file = path / json_file_name
        # Thanks http://evanhahn.com/python-requests-library-useragent/
        # user-agent None suppresses requests' default User-Agent header.
        # The with-block ensures the streamed connection is released even on
        # error; raise_for_status fails fast instead of saving an error page.
        with requests.get(
            zip_url, stream=True, headers={"user-agent": None}
        ) as response:
            response.raise_for_status()
            progress_bar = tqdm(
                desc=zip_file_name,
                total=int(response.headers.get("content-length", 0)),
                unit="B",
                unit_scale=True,
            )
            try:
                # Write metadata first so a json always accompanies the zip.
                with open(json_file, "w") as outfile:
                    json.dump(meta, outfile)
                # Thanks https://bit.ly/2sPYPYw
                with open(zip_file, "wb") as handle:
                    for chunk in response.iter_content(chunk_size=512):
                        if chunk:  # filter out keep-alive new chunks
                            handle.write(chunk)
                            progress_bar.update(len(chunk))
            finally:
                # Close the bar even if the download raises mid-stream.
                progress_bar.close()
    return [meta["productId"] for meta in metas]
# NOTE(review): this is a second definition of download_tables in the same
# file (a pathlib/tqdm variant exists above); whichever is defined later
# silently shadows the other. Confirm which one is intended and remove the
# duplicate.
def download_tables(tables, path=None, csv=True):
    """Download a json file and zip of data for a list of tables to path.

    Parameters
    ----------
    tables: list of str
        tables to be downloaded
    path: str, default: None (will do current directory)
        Where to download the table and json
    csv: boolean, default True
        download in CSV format, if not download SDMX

    Returns
    -------
    downloaded: list
        list of tables that were downloaded
    """
    metas = get_cube_metadata(tables)
    for meta in metas:
        product_id = meta["productId"]
        zip_url = get_full_table_download(product_id, csv=csv)
        zip_file = product_id + ("-eng.zip" if csv else ".zip")
        json_file = product_id + ".json"
        if path:
            zip_file = os.path.join(path, zip_file)
            json_file = os.path.join(path, json_file)
        # Thanks http://evanhahn.com/python-requests-library-useragent/
        # user-agent None suppresses requests' default User-Agent header.
        # The with-block releases the streamed connection even on error;
        # raise_for_status fails fast instead of saving an HTTP error page.
        with requests.get(
            zip_url, stream=True, headers={"user-agent": None}
        ) as response:
            response.raise_for_status()
            with open(json_file, "w") as outfile:
                json.dump(meta, outfile)
            # Thanks https://bit.ly/2sPYPYw
            with open(zip_file, "wb") as handle:
                for chunk in response.iter_content(chunk_size=512):
                    if chunk:  # filter out keep-alive new chunks
                        handle.write(chunk)
    downloaded = [meta["productId"] for meta in metas]
    return downloaded