def table_from_h5(table, h5file="stats_can.h5", path=None):
    """Read a table from an h5 file into a dataframe

    Parameters
    ----------
    table: str
        name of the table to read
    h5file: str, default stats_can.h5
        name of the h5file to retrieve the table from
    path: str or path, default = current working directory
        path to the h5file

    Returns
    -------
    df: pd.DataFrame
        table in dataframe format
    """
    key = "table_" + parse_tables(table)[0]
    h5 = os.path.join(path, h5file) if path else h5file
    try:
        with pd.HDFStore(h5, "r") as store:
            df = pd.read_hdf(store, key=key)
    except (KeyError, OSError):
        # Missing key or missing/unreadable file: fetch the table, then retry
        print("Downloading and loading " + key)
        tables_to_h5(tables=key, h5file=h5file, path=path)
        with pd.HDFStore(h5, "r") as store:
            df = pd.read_hdf(store, key=key)
    return df
def metadata_from_h5(tables, h5file="stats_can.h5", path=None):
    """Read table metadata from an h5 file

    Parameters
    ----------
    tables: str or list of str
        name of the tables to read
    h5file: str, default stats_can.h5
        name of the h5file to retrieve the table from
    path: str or path, default = current working directory
        path to the h5file

    Returns
    -------
    list of local table metadata
    """
    if path:
        h5file = os.path.join(path, h5file)
    keys = ["json_" + tbl for tbl in parse_tables(tables)]
    metadata = []
    try:
        with h5py.File(h5file, "r") as f:
            for key in keys:
                try:
                    metadata.append(json.loads(f[key][()]))
                except KeyError:
                    # Skip tables not present in the file, but tell the user
                    print("Couldn't find table " + key)
    except OSError:
        print(f"{h5file} does not exist")
    return metadata
def get_full_table_download(table, csv=True):
    """https://www.statcan.gc.ca/eng/developers/wds/user-guide#a12-6
    https://www.statcan.gc.ca/eng/developers/wds/user-guide#a12-7

    Take a table name and return a url to a zipped file of that table

    Parameters
    ----------
    table: str
        table name to download
    csv: boolean, default True
        download in CSV format, if not download SDMX

    Returns
    -------
    str:
        path to the file download
    """
    table = parse_tables(table)[0]
    # CSV and SDMX downloads use different endpoints; CSV is language-specific
    if csv:
        endpoint = "getFullTableDownloadCSV/" + table + "/en"
    else:
        endpoint = "getFullTableDownloadSDMX/" + table
    response = requests.get(SC_URL + endpoint)
    response = check_status(response)
    return response["object"]
def delete_tables(tables, path=None, h5file="stats_can.h5", csv=True):
    """Delete downloaded tables

    Parameters
    ----------
    tables: list
        list of tables to delete
    path: str or os path object, default None
        where to look for the tables to delete
    h5file: str default stats_can.h5
        h5file to remove from, set to None to remove zips
    csv: boolean, default True
        if h5file is None this specifies whether to delete zipped csv or SDMX

    Returns
    -------
    to_delete: list
        list of deleted tables
    """
    requested = parse_tables(tables)
    downloaded_jsons = list_downloaded_tables(path=path, h5file=h5file)
    downloaded = [j["productId"] for j in downloaded_jsons]
    # Only delete tables that are actually present locally
    to_delete = [t for t in requested if t in downloaded]
    if h5file:
        # Remove both the metadata and data keys for each table
        keys = []
        for tbl in to_delete:
            keys.append("json_" + tbl)
            keys.append("table_" + tbl)
        target = os.path.join(path, h5file) if path else h5file
        with h5py.File(target, "a") as f:
            for key in keys:
                del f[key]
    else:
        # No h5: remove the raw zip + json files instead
        file_names = []
        for tbl in to_delete:
            file_names.append(tbl + ("-eng.zip" if csv else ".zip"))
            file_names.append(tbl + ".json")
        if path:
            file_names = [os.path.join(path, f) for f in file_names]
        for file_name in file_names:
            if os.path.exists(file_name):
                os.remove(file_name)
    return to_delete
def delete_tables(tables, path=None, h5file="stats_can.h5", csv=True):
    """Delete downloaded tables.

    Parameters
    ----------
    tables: list
        list of tables to delete
    path: str or path object, default None
        where to look for the tables to delete
    h5file: str default stats_can.h5
        h5file to remove from, set to None to remove zips
    csv: boolean, default True
        if h5file is None this specifies whether to delete zipped csv or SDMX

    Returns
    -------
    to_delete: list
        list of deleted tables
    """
    base = pathlib.Path(path) if path else pathlib.Path()
    requested = parse_tables(tables)
    downloaded_jsons = list_downloaded_tables(path=base, h5file=h5file)
    downloaded = [j["productId"] for j in downloaded_jsons]
    # Only delete tables that are actually present locally
    to_delete = [t for t in requested if t in downloaded]
    if h5file:
        # Remove both the metadata and data keys for each table
        with h5py.File(base / h5file, "a") as f:
            for tbl in to_delete:
                del f["json_" + tbl]
                del f["table_" + tbl]
    else:
        # No h5: remove the raw zip + json files instead
        for tbl in to_delete:
            zip_name = tbl + ("-eng.zip" if csv else ".zip")
            for file_name in (zip_name, tbl + ".json"):
                target = base / file_name
                if target.is_file():
                    target.unlink()
    return to_delete
def list_h5_tables(path=None, h5file="stats_can.h5"):
    """Return a list of metadata for StatsCan tables from an hdf5 file

    Parameters
    ----------
    path: str or path, default = current working directory
        path to the h5 file
    h5file: str, default stats_can.h5
        name of the h5file to read table data from

    Returns
    -------
    jsons: list
        list of available tables json data
    """
    all_keys = h5_included_keys(h5file=h5file, path=path)
    # Metadata entries are stored under "json_"-prefixed keys
    json_keys = [k for k in all_keys if k.startswith("json_")]
    return metadata_from_h5(parse_tables(json_keys), h5file=h5file, path=path)
def tables_to_h5(tables, h5file="stats_can.h5", path=None):
    """Take a table and its metadata and put it in an hdf5 file.

    Parameters
    ----------
    tables: list of str
        tables to add to the h5file
    h5file: str, default stats_can.h5
        name of the h5file to store the tables in
    path: str or path, default = current working directory
        path to the h5file

    Returns
    -------
    tables: list
        list of tables loaded into the file
    """
    base = pathlib.Path(path) if path else pathlib.Path()
    h5_path = base / h5file
    clean_tables = parse_tables(tables)
    # Zip/json files live next to the h5 file
    base = h5_path.parent
    for tbl in clean_tables:
        zip_path = base / (tbl + "-eng.zip")
        json_path = base / (tbl + ".json")
        if not json_path.is_file():
            download_tables([tbl], base)
        df = zip_table_to_dataframe(tbl, path=base)
        with open(json_path) as src:
            meta = json.load(src)
        with pd.HDFStore(h5_path, "a") as store:
            store.put(key="table_" + tbl, value=df, format="table", complevel=1)
        with h5py.File(h5_path, "a") as hfile:
            jkey = "json_" + tbl
            # Overwrite any stale metadata for this table
            if jkey in hfile.keys():
                del hfile[jkey]
            hfile.create_dataset(jkey, data=json.dumps(meta))
        # Source files are no longer needed once loaded into the h5
        zip_path.unlink()
        json_path.unlink()
    return clean_tables
def tables_to_h5(tables, h5file="stats_can.h5", path=None):
    """Take a table and its metadata and put it in an hdf5 file

    Parameters
    ----------
    tables: list of str
        tables to add to the h5file
    h5file: str, default stats_can.h5
        name of the h5file to store the tables in
    path: str or path, default = current working directory
        path to the h5file

    Returns
    -------
    tables: list
        list of tables loaded into the file
    """
    if path:
        h5file = os.path.join(path, h5file)
    clean_tables = parse_tables(tables)
    for tbl in clean_tables:
        zip_name = tbl + "-eng.zip"
        json_name = tbl + ".json"
        if path:
            zip_name = os.path.join(path, zip_name)
            json_name = os.path.join(path, json_name)
        if not os.path.isfile(json_name):
            download_tables([tbl], path)
        df = zip_table_to_dataframe(tbl, path=path)
        with open(json_name) as src:
            meta = json.load(src)
        with pd.HDFStore(h5file, "a") as store:
            df.to_hdf(store, key="table_" + tbl, format="table", complevel=1)
        with h5py.File(h5file, "a") as hfile:
            jkey = "json_" + tbl
            # Overwrite any stale metadata for this table
            if jkey in hfile.keys():
                del hfile[jkey]
            hfile.create_dataset(jkey, data=json.dumps(meta))
        # Source files are no longer needed once loaded into the h5
        os.remove(zip_name)
        os.remove(json_name)
    return clean_tables
def get_cube_metadata(tables):
    """https://www.statcan.gc.ca/eng/developers/wds/user-guide#a11-1

    Take a list of tables and return a list of dictionaries with their
    metadata

    Parameters
    ----------
    tables : str or list of str
        IDs of tables to get metadata for

    Returns
    -------
    list of dicts
        one for each table with its metadata
    """
    # The endpoint expects a JSON list of {"productId": ...} objects
    payload = [{"productId": t} for t in parse_tables(tables)]
    response = requests.post(SC_URL + "getCubeMetadata", json=payload)
    response = check_status(response)
    return [r["object"] for r in response]
def zip_table_to_dataframe(table, path=None):
    """Reads a StatsCan table into a pandas DataFrame

    If a zip file of the table does not exist in path, downloads it

    Parameters
    ----------
    table: str
        the table to load to dataframe from zipped csv
    path: str, default: current working directory when module is loaded
        where to download the tables or load them

    Returns:
    df: pandas.DataFrame
        the table as a dataframe
    """
    # Parse tables returns a list, can only do one table at a time here though
    table = parse_tables(table)[0]
    table_zip = table + "-eng.zip"
    if path:
        table_zip = os.path.join(path, table_zip)
    if not os.path.isfile(table_zip):
        download_tables([table], path)
    csv_file = table + ".csv"
    with zipfile.ZipFile(table_zip) as myzip:
        with myzip.open(csv_file) as myfile:
            col_names = pd.read_csv(myfile, nrows=0).columns
        # reopen the file or it misses the first row
        with myzip.open(csv_file) as myfile:
            # VALUE is numeric; read everything else as str to avoid
            # pandas guessing mixed dtypes on large files
            types_dict = {"VALUE": float}
            types_dict.update({col: str for col in col_names if col not in types_dict})
            df = pd.read_csv(myfile, dtype=types_dict)
    # Columns with a small set of repeated values are stored as categoricals
    # to cut memory use
    possible_cats = [
        "GEO",
        "DGUID",
        "STATUS",
        "SYMBOL",
        "TERMINATED",
        "DECIMALS",
        "UOM",
        "UOM_ID",
        "SCALAR_FACTOR",
        "SCALAR_ID",
        "VECTOR",
        "COORDINATE",
        "Wages",
        "National Occupational Classification for Statistics (NOC-S)",
        "Supplementary unemployment rates",
        "Sex",
        "Age group",
        "Labour force characteristics",
        "Statistics",
        "Data type",
        "Job permanency",
        "Union coverage",
        "Educational attainment",
    ]
    actual_cats = [col for col in possible_cats if col in col_names]
    df[actual_cats] = df[actual_cats].astype("category")
    try:
        df["REF_DATE"] = pd.to_datetime(df["REF_DATE"], format="%Y-%m")
    except (TypeError, ValueError):
        # BUG FIX: a REF_DATE that doesn't match "%Y-%m" (e.g. daily or
        # annual tables) makes pd.to_datetime raise ValueError, which the
        # original `except TypeError` did not catch, crashing instead of
        # falling back to pandas' flexible parser.
        df["REF_DATE"] = pd.to_datetime(df["REF_DATE"])
    return df