def fetch(
    self,
    output_folder: Path,
    cache: Dict[str, str],
    fetch_opts: List[Dict[str, Any]],
    skip_existing: bool = False,
) -> List[str]:
    # The link to the spreadsheet changes daily, so we parse the HTML to find the link every
    # time and download the latest version
    buffer = BytesIO()
    src_opts = fetch_opts[0]
    download(src_opts["url"], buffer)
    page = BeautifulSoup(buffer.getvalue().decode("utf8"), "lxml")
    for link in page.findAll("a"):
        if "href" in link.attrs and link.attrs.get("href").endswith("xlsx"):
            href = link.attrs.get("href")
            # Turn root-relative links into absolute URLs on the same host
            if href.startswith("/"):
                href = "https://" + src_opts["url"].split("//")[1].split("/")[0] + href
            # Default to an empty options dict so the unpacking below cannot fail
            return [download_snapshot(href, output_folder, **src_opts.get("opts", {}))]
    raise RuntimeError("No link to XLSX file found in page")
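# The href resolution above rebuilds the absolute URL by splitting the source URL by hand.
# As an illustration only (not part of the original source), the standard library's
# urllib.parse.urljoin performs the same resolution for both absolute and root-relative links:
from urllib.parse import urljoin

def resolve_href(page_url: str, href: str) -> str:
    # Hypothetical helper: urljoin keeps absolute hrefs as-is and resolves "/..." against the host
    return urljoin(page_url, href)

# Example: resolve_href("https://example.com/reports/latest", "/files/data.xlsx")
# returns "https://example.com/files/data.xlsx"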
def _pull_source(cache_source: Dict[str, str]):
    url = cache_source.pop("url")
    output = cache_source.pop("output")
    buffer = BytesIO()
    try:
        download(url, buffer)
        # output_folder is defined outside this helper
        with (output_folder / output).open("wb") as fd:
            fd.write(buffer.getvalue())
    except Exception:
        # Log the failure but keep going: a single failed pull should not abort the cache sync
        print(f"Cache pull failed for {url}")
        traceback.print_exc()
def _pull_source(cache_source: Dict[str, str]):
    url = cache_source.pop("url")
    output = cache_source.pop("output")
    logger.log_info(f"Downloading {url} into {output}")
    buffer = BytesIO()
    try:
        download(url, buffer)
        with (output_folder / output).open("wb") as fd:
            fd.write(buffer.getvalue())
        logger.log_info(f"Downloaded {output} successfully")
    except Exception:
        logger.log_error(f"Cache pull failed for {url}.", traceback=traceback.format_exc())
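# Hypothetical usage sketch, not part of the original source: both _pull_source variants above
# expect a mapping with "url" and "output" entries and write into an output_folder taken from
# the enclosing scope. Since each pull is network bound, the cache sources could be fetched in
# parallel with a thread pool, for example:
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List

def pull_all_sources(cache_sources: List[Dict[str, str]], max_workers: int = 8) -> None:
    # Copy each entry so the pop() calls inside _pull_source do not mutate the caller's data
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        list(executor.map(_pull_source, [dict(src) for src in cache_sources]))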
def main(output_folder: Path, only: List[str] = None, exclude: List[str] = None):
    # Perform a dry-run to update the data using the current configuration
    update_data(output_folder, only=only, exclude=exclude)

    # Download all the tables from the prod server to local storage
    output_tables = list((output_folder / "tables").glob("*.csv"))
    tables_summary = {table_path.stem: {} for table_path in output_tables}
    for table_path in output_tables:
        table_name = table_path.stem
        table_path_str = str(table_path)
        tables_summary[table_name]["local_curr"] = table_path_str
        local_prod = Path(table_path_str.replace(".csv", ".prod.csv"))
        with local_prod.open(mode="wb") as fd:
            try:
                download(f"{URL_OUTPUTS_PROD}/{table_path.name}", fd)
                tables_summary[table_name]["local_prod"] = str(local_prod)
            except Exception:
                # The table may not exist in prod yet, e.g. when a new pipeline is added
                tables_summary[table_name]["local_prod"] = None

    # Compare the new vs prod data
    for table_name, table_data in tables_summary.items():

        # Read both tables into memory
        curr_df = read_file(table_data["local_curr"])
        if table_data["local_prod"] is None:
            prod_df = DataFrame(columns=curr_df.columns)
        else:
            prod_df = read_file(table_data["local_prod"])

        # Compare the number of records
        table_data["records"] = f"{len(curr_df) - len(prod_df):+d}"

        # Compare the columns
        table_data["columns"] = compare_sets(set(curr_df.columns), set(prod_df.columns))

        # Compare the keys
        if "key" in curr_df.columns:
            table_data["keys"] = compare_sets(set(curr_df.key.unique()), set(prod_df.key.unique()))

        # Compare the dates
        if "date" in curr_df.columns:
            table_data["dates"] = compare_sets(set(curr_df.date.unique()), set(prod_df.date.unique()))

    # Return the summary of changes
    return tables_summary
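# compare_sets() is used above but not defined in this excerpt. A minimal sketch of one plausible
# implementation, assuming it only needs to summarize how many elements were added and removed
# between the current and prod tables (the real helper may report more detail):
def compare_sets(curr: set, prod: set) -> str:
    added, removed = curr - prod, prod - curr
    return f"+{len(added)} -{len(removed)}"

# Example: compare_sets({"US", "FR", "ES"}, {"US", "FR"}) returns "+1 -0"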
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts):

    buffer = BytesIO()
    download(sources[0], buffer, progress=True)

    data = None
    with zipfile.ZipFile(buffer) as zipped:
        data = zipped.read("WDIData.csv")
        data = read_csv(BytesIO(data))
    assert data is not None

    data = data.rename(
        columns={
            "Country Code": "3166-1-alpha-3",
            "Indicator Name": "indicator_name",
            "Indicator Code": "indicator_code",
        }
    )

    data = data.merge(aux["worldbank_indicators"]).merge(aux["country_codes"])
    data = data.drop(columns=["Country Name", "3166-1-alpha-2", "3166-1-alpha-3"])

    indicators = parse_opts.get("indicators", {code: code for code in data.indicator_code.values})
    min_year = int(parse_opts.get("min_year", 2015))
    data = data[data.indicator_code.isin(indicators.values())]

    # Index data by indicator code for performance optimization
    keys = data.key.unique()
    indexed = {key: data[data.key == key].set_index("indicator_code") for key in keys}

    # There is probably a fancy pandas function to do this more efficiently, but this works for now
    map_func = partial(WorldbankDataSource._process_record, indexed, indicators, min_year)
    records = thread_map(map_func, keys, desc="WorldBank Indicators")

    # Some countries are better described as subregions
    data = DataFrame.from_records(records)
    data.loc[data.key == "MF", "key"] = "FR_MF"

    # Return all records in DataFrame form
    return data
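# WorldbankDataSource._process_record is referenced above but not shown. A rough, hypothetical
# sketch of what such a record builder could look like, assuming the WDI year columns ("2015",
# "2016", ...) are still present and that the goal is to keep the most recent non-null value per
# indicator since min_year; the actual implementation may differ:
def _process_record(indexed: Dict[str, DataFrame], indicators: Dict[str, str], min_year: int, key: str):
    record = {"key": key}
    subset = indexed[key]
    # Year columns newest first, so the first non-null value found is the most recent one
    year_columns = sorted(
        (col for col in subset.columns if col.isdigit() and int(col) >= min_year), reverse=True
    )
    for name, code in indicators.items():
        if code not in subset.index:
            continue
        values = subset.loc[code, year_columns].dropna()
        record[name] = values.iloc[0] if len(values) else None
    return record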
def perform_update(suite, paths):
    """
    Performs an incremental update and merge of a given suite
    """
    info('Checking for updates in %s' % suite)
    # print(paths)
    globalvars.suite = suite

    needsmerge = {}
    needsmerge['downloads'] = []  # all files that have to be downloaded
    regenrelease = False
    c = 0
    for i in repo_order:
        # i = repository name
        needsmerge[i] = {}
        needsmerge[i]['mergelist'] = []

        if paths[c]:
            info('Working on %s repo' % i)
            remote_path = paths[c].replace(spooldir, repos[i]['host'])

            try:
                remote_rel = requests.get(join(remote_path, 'Release'))
            except requests.exceptions.ConnectionError as err:
                warn('Caught exception: "%s". Retrying...' % err)
                return perform_update(suite, paths)

            local_rel_text = open(join(paths[c], 'Release')).read()

            diffs = {}
            if remote_is_newer(remote_rel.text, local_rel_text):
                download((join(remote_path, 'Release'), join(paths[c], 'Release')))
                regenrelease = True

                diffs = compare_dict(parse_release(remote_rel.text),
                                     parse_release(local_rel_text))
            if diffs:
                for k in diffs:
                    if k.endswith('Packages.gz') or k.endswith('Sources.gz'):
                        needsmerge[i]['mergelist'].append(k)
                        rmt = join(paths[c].replace(spooldir, repos[i]['host']), k)
                        loc = join(paths[c], k)
                        dlf = (rmt, loc)
                        needsmerge['downloads'].append(dlf)

        c += 1
        # break

    # download what needs to be downloaded
    if needsmerge['downloads']:
        info('Downloading updates...')
        dlpool = Pool(cpunm)
        dlpool.map(download, needsmerge['downloads'])

    # create union of our Packages.gz and Sources.gz files we will merge
    uni = []
    for i in repo_order:
        uni.append(needsmerge[i]['mergelist'])
    updpkg_list = set().union(*uni)

    # make a list of package lists to feed into merge()
    merge_list = []
    for i in updpkg_list:
        pkgs = []
        for j in repo_order:
            sui = suite
            # append valid aliases
            if repos[j]['aliases']:
                if suite in aliases[repos[j]['name']]:
                    sui = aliases[repos[j]['name']][suite]
                elif repos[j]['skipmissing']:
                    sui = None
                skips = ['jessie-security', 'ascii-security']  # hack
                if j == 'debian' and suite in skips:
                    sui = None

            if sui:
                pkgs.append(join(spooldir, repos[j]['dists'], sui, i))
            else:
                pkgs.append(None)

        merge_list.append(pkgs)

    # perform the actual merge
    if merge_list:
        info('Merging files...')
        mrgpool = Pool(cpunm)
        mrgpool.map(merge, merge_list)

    # generate Release files if we got any new files
    if needsmerge['downloads'] or regenrelease:
        info('Generating Release...')
        gen_release(suite)
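# remote_is_newer() is referenced above but not included in this excerpt. A hypothetical sketch
# of the idea, assuming it only compares the "Date:" headers of the remote and local Release
# files (the real helper may parse the files differently):
from email.utils import parsedate_to_datetime

def remote_is_newer(remote_release, local_release):
    def release_date(text):
        # Debian-style Release files carry an RFC 2822 date, e.g. "Date: Sat, 06 Jun 2020 10:35:12 UTC"
        for line in text.splitlines():
            if line.startswith('Date:'):
                return parsedate_to_datetime(line.split(':', 1)[1].strip())
        raise ValueError('no Date header found in Release file')

    return release_date(remote_release) > release_date(local_release)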
def parse_dataframes(self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts):

    # Get all the weather stations with data up until last month from inventory
    today = datetime.date.today()
    min_date = (today - datetime.timedelta(days=30)).strftime("%Y%m%d")
    stations = read_csv(_INVENTORY_URL).rename(
        columns={"LAT": "lat", "LON": "lon", "ELEV(M)": "elevation"}
    )
    stations = stations[stations.END > int(min_date)]
    stations["id"] = stations["USAF"] + stations["WBAN"].apply(lambda x: f"{x:05d}")

    # Download all the station data as a compressed file
    buffer = BytesIO()
    records_url = "https://www.ncei.noaa.gov/data/global-summary-of-the-day/archive/2020.tar.gz"
    download(records_url, buffer, progress=True)

    buffer.seek(0)
    with tarfile.open(fileobj=buffer, mode="r:gz") as stations_tar:

        # Build the station cache by uncompressing all files in memory
        station_cache = {}
        for member in tqdm(stations_tar.getmembers(), desc="Decompressing"):
            if not member.name.endswith(".csv"):
                continue

            # Read the records from the provided station
            data = read_csv(stations_tar.extractfile(member)).rename(columns=_COLUMN_MAPPING)

            # Fix data types
            data.noaa_station = data.noaa_station.astype(str)
            data.rainfall = data.rainfall.apply(NoaaGsodDataSource.conv_dist)
            data.snowfall = data.snowfall.apply(NoaaGsodDataSource.conv_dist)
            for temp_type in ("average", "minimum", "maximum"):
                col = f"{temp_type}_temperature"
                data[col] = data[col].apply(NoaaGsodDataSource.conv_temp)

            station_cache[member.name.replace(".csv", "")] = data

    # Get all the POI from metadata and go through each key
    metadata = dataframes[0][["key", "latitude", "longitude"]].dropna()

    # Convert all coordinates to radians
    stations["lat"] = stations.lat.apply(math.radians)
    stations["lon"] = stations.lon.apply(math.radians)
    metadata["lat"] = metadata.latitude.apply(math.radians)
    metadata["lon"] = metadata.longitude.apply(math.radians)

    # Make sure the stations and the cache are sent to each function call
    map_func = partial(NoaaGsodDataSource.process_location, station_cache, stations)

    # We don't care about the index while iterating over each metadata item
    map_iter = [record for _, record in metadata.iterrows()]

    # Shuffle the iterables to try to make better use of the caching
    shuffle(map_iter)

    # Bottleneck is network so we can use lots of threads in parallel
    records = concurrent.thread_map(map_func, map_iter, total=len(metadata))

    return concat(records)
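# NoaaGsodDataSource.conv_temp and conv_dist are referenced above but not defined in this
# excerpt. A plausible sketch, assuming GSOD's imperial units and sentinel values for missing
# data (temperatures in Fahrenheit with 9999.9 as "missing", precipitation in inches with 99.99
# as "missing"); the actual static methods may handle edge cases differently:
def conv_temp(value: float):
    # Convert Fahrenheit to Celsius, dropping the GSOD missing-value sentinel
    return None if value == 9999.9 else (value - 32) * 5 / 9

def conv_dist(value: float):
    # Convert inches to millimetres, dropping the GSOD missing-value sentinel
    return None if value == 99.99 else value * 25.4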
def main():

    # Create the folder which will be published
    public_folder = SRC / ".." / "output" / "public"
    public_folder.mkdir(exist_ok=True, parents=True)

    # Create the v1 data.csv file
    main_table = read_file(f"{URL_OUTPUTS_PROD}/main.csv", low_memory=False)
    data = main_table[main_table.aggregation_level < 2]
    rename_columns = {
        "date": "Date",
        "key": "Key",
        "country_code": "CountryCode",
        "country_name": "CountryName",
        "subregion1_code": "RegionCode",
        "subregion1_name": "RegionName",
        "total_confirmed": "Confirmed",
        "total_deceased": "Deaths",
        "latitude": "Latitude",
        "longitude": "Longitude",
        "population": "Population",
    }
    data = data[rename_columns.keys()].rename(columns=rename_columns)
    data = data.dropna(subset=["Confirmed", "Deaths"], how="all")
    data = data.sort_values(["Date", "Key"])
    export_csv(data, public_folder / "data.csv")

    # Create the v1 data_minimal.csv file
    export_csv(data[["Date", "Key", "Confirmed", "Deaths"]], public_folder / "data_minimal.csv")

    # Create the v1 data_latest.csv file
    latest = main_table[main_table.aggregation_level < 2]
    latest = latest.sort_values("date").groupby("key").last().reset_index()
    latest = latest[rename_columns.keys()].rename(columns=rename_columns)
    latest = latest.dropna(subset=["Confirmed", "Deaths"], how="all")
    latest = latest.sort_values(["Key", "Date"])
    export_csv(latest, public_folder / "data_latest.csv")

    # Create the v1 weather.csv file
    weather = read_file(f"{URL_OUTPUTS_PROD}/weather.csv")
    weather = weather[weather.key.apply(lambda x: len(x.split("_")) < 3)]
    weather = weather.rename(columns={"noaa_distance": "distance", "noaa_station": "station"})
    rename_columns = {col: snake_to_camel_case(col) for col in weather.columns}
    export_csv(weather.rename(columns=rename_columns), public_folder / "weather.csv")

    # Create the v1 mobility.csv file
    mobility = read_file(f"{URL_OUTPUTS_PROD}/mobility.csv")
    mobility = mobility[mobility.key.apply(lambda x: len(x.split("_")) < 3)]
    mobility = drop_na_records(mobility, ["date", "key"])
    rename_columns = {
        col: snake_to_camel_case(col).replace("Mobility", "") for col in mobility.columns
    }
    export_csv(mobility.rename(columns=rename_columns), public_folder / "mobility.csv")

    # Create the v1 CSV files which only require simple column mapping
    v1_v2_name_map = {"response": "oxford-government-response"}
    for v1_name, v2_name in v1_v2_name_map.items():
        data = read_file(f"{URL_OUTPUTS_PROD}/{v2_name}.csv")
        rename_columns = {col: snake_to_camel_case(col) for col in data.columns}
        export_csv(data.rename(columns=rename_columns), public_folder / f"{v1_name}.csv")

    # Create the v1 forecast.csv file
    export_csv(
        build_forecast(read_file(public_folder / "data_minimal.csv")),
        public_folder / "data_forecast.csv",
    )

    # Convert all v1 CSV files to JSON using record format
    for csv_file in pbar([*(public_folder).glob("*.csv")], desc="V1 JSON conversion"):
        data = read_file(csv_file, low_memory=False)
        json_path = str(csv_file).replace("csv", "json")
        data.to_json(json_path, orient="records")

    # Create the v2 folder
    v2_folder = public_folder / "v2"
    v2_folder.mkdir(exist_ok=True, parents=True)

    # Download the v2 tables which can fit under 100MB
    for table_name in pbar(
        (
            "by-age",
            "by-sex",
            "demographics",
            "economy",
            "epidemiology",
            "geography",
            "health",
            "hospitalizations",
            "index",
            "mobility",
            "oxford-government-response",
            "weather",
            "worldbank",
            "worldpop",
        ),
        desc="V2 download",
    ):
        for ext in ("csv", "json"):
            with tempfile.NamedTemporaryFile() as tmp:
                tmp_path = Path(tmp.name)
                download(f"{URL_OUTPUTS_PROD}/{table_name}.{ext}", tmp)
                # Flush buffered writes so the size check below sees the complete file
                tmp.flush()
                # Check that the output is less than 100 MB before copying it to the output folder
                if tmp_path.stat().st_size < 100 * 1000 * 1000:
                    shutil.copyfile(tmp_path, v2_folder / f"{table_name}.{ext}")
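# snake_to_camel_case() is used for the v1 column names above but is not defined in this file.
# A minimal sketch of the expected behavior, assuming it produces UpperCamelCase to match the
# rename_columns mapping (e.g. "total_confirmed" -> "TotalConfirmed"):
def snake_to_camel_case(txt: str) -> str:
    return "".join(word.capitalize() for word in txt.split("_"))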