def test_date_range(self):
    start = "2020-01-01"
    end = "2020-01-09"
    expected = [
        "2020-01-01",
        "2020-01-02",
        "2020-01-03",
        "2020-01-04",
        "2020-01-05",
        "2020-01-06",
        "2020-01-07",
        "2020-01-08",
        "2020-01-09",
    ]

    # Test normal case
    self.assertListEqual(list(date_range(start, end)), expected)

    # Test start > end
    with self.assertRaises(AssertionError):
        # pylint: disable=arguments-out-of-order
        list(date_range(end, start))

    # Test start == end
    self.assertListEqual(list(date_range(start, start)), [expected[0]])
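# Illustrative sketch (not part of the original source): the test above pins down the
# contract of `date_range`: it yields ISO-formatted date strings inclusive of both
# endpoints and raises an AssertionError when start > end. A minimal generator consistent
# with that contract (assuming `import datetime` and `from typing import Iterator`) could
# look like this:
def date_range_sketch(start: str, end: str) -> Iterator[str]:
    """Yield every ISO date string from <start> to <end>, inclusive."""
    start_date = datetime.date.fromisoformat(start)
    end_date = datetime.date.fromisoformat(end)
    assert start_date <= end_date, f"Start date {start} cannot be later than end date {end}"
    for offset in range((end_date - start_date).days + 1):
        yield (start_date + datetime.timedelta(days=offset)).isoformat()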
def fetch(
    self,
    output_folder: Path,
    cache: Dict[str, str],
    fetch_opts: List[Dict[str, Any]],
    skip_existing: bool = False,
) -> Dict[str, str]:
    # Data is published as snapshots, so we guess the URL based on the date
    opts = dict(fetch_opts[0])
    url_tpl = opts.pop("url")

    # Keep trying URLs in reverse chronological order starting today until one works
    url = None
    date_start = "2021-08-31"
    date_end = date_today(offset=1)
    for date in reversed(list(date_range(date_start, date_end))):
        url = url_tpl.format(date=date.replace("-", ""))
        res = requests.head(url)
        if res.status_code == 200 and int(res.headers.get("Content-Length", "0")) > 0:
            # Pass the actual URL down to fetch it
            url_opts = dict(url=url, **opts)
            return super().fetch(output_folder, cache, [url_opts], skip_existing=skip_existing)

    # Make sure we do not fall through silently when no snapshot URL works
    raise RuntimeError(f"No working snapshot URL found, last tried: {url}")
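# Illustrative sketch (assumption, the real helper is not shown here): `date_today(offset=1)`
# is used above to get tomorrow's date, and other functions in this codebase compute the
# same value as (datetime.datetime.now() + datetime.timedelta(days=1)).date().isoformat(),
# so a helper consistent with that usage could look like this:
def date_today_sketch(offset: int = 0) -> str:
    """Return today's date plus <offset> days as an ISO-formatted string."""
    return (datetime.datetime.now() + datetime.timedelta(days=offset)).date().isoformat()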
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Data can only be retrieved one day at a time, and it starts on 2020-01-22
    first = "2020-01-22"
    map_iter = list(date_range(first, date_today()))

    # Fetch the records for each date concurrently and flatten them into a single list
    records = sum(thread_map(_get_daily_records, map_iter), [])
    return DataFrame.from_records(records)
def _make_location_key_and_date_table(index_table: Path, output_path: Path) -> None:
    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Make sure that there is an index table present
        assert index_table.exists(), "Index table not found"

        # Index table will determine if we use "key" or "location_key" as column name
        index_columns = get_table_columns(index_table)
        location_key = "location_key" if "location_key" in index_columns else "key"

        # Create a single-column table with only the keys
        keys_table_path = workdir / "location_keys.csv"
        with open(keys_table_path, "w") as fd:
            fd.write(f"{location_key}\n")
            fd.writelines(f"{value}\n" for value in table_read_column(index_table, location_key))

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() + datetime.timedelta(days=1)).date().isoformat()
        date_table_path = workdir / "dates.csv"
        with open(date_table_path, "w") as fd:
            fd.write("date\n")
            fd.writelines(f"{value}\n" for value in date_range("2020-01-01", max_date))

        # Output all combinations of <key x date>
        table_cross_product(keys_table_path, date_table_path, output_path)
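# Illustrative sketch (assumption, not the project's actual implementation): the cross
# product above pairs every location key with every date. The same output could be
# produced in pure Python (assuming `import csv` and `import itertools`) like this:
def cross_product_sketch(keys_csv: Path, dates_csv: Path, output_csv: Path) -> None:
    """Write every <location_key, date> combination of the two single-column inputs."""
    with open(keys_csv) as fd:
        key_header, *keys = [line.strip() for line in fd if line.strip()]
    with open(dates_csv) as fd:
        date_header, *dates = [line.strip() for line in fd if line.strip()]
    with open(output_csv, "w", newline="") as fd:
        writer = csv.writer(fd)
        writer.writerow([key_header, date_header])
        writer.writerows(itertools.product(keys, dates))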
def fetch(
    self, output_folder: Path, cache: Dict[str, str], fetch_opts: List[Dict[str, Any]]
) -> Dict[str, str]:
    # Data is published as GitHub Releases, so we guess the URL based on the date
    opts = fetch_opts[0]
    url_tpl = opts["url"]

    # Go from <today + 1> back until the last known date for which data is reported
    # NOTE: at the time of writing, the last known date is October 20
    working_url = None
    last_known_date = "2020-10-20"
    latest_date = (datetime.datetime.today() + datetime.timedelta(days=1)).date().isoformat()
    for date in reversed(list(date_range(last_known_date, latest_date))):
        try:
            url_test = url_tpl.format(date=date.replace("-", "."))
            self.log_debug(f"Trying {url_test}")
            res = requests.get(url_test)
            if res.ok:
                working_url = url_test
                break
        except requests.exceptions.RequestException:
            continue

    # Make sure that we found a working URL
    assert working_url is not None, "No working URL found for DXY data source"

    # Pass the actual URL down to fetch it
    return super().fetch(output_folder, cache, [{**opts, "url": working_url}])
def fetch(
    self,
    output_folder: Path,
    cache: Dict[str, str],
    fetch_opts: List[Dict[str, Any]],
    skip_existing: bool = False,
) -> Dict[str, str]:
    # Data is published as GitHub Releases, so we guess the URL based on the date
    opts = dict(fetch_opts[0])
    url_tpl = opts.pop("url")

    urls = []
    date_start = "2021-01-11"
    date_end = date_today(offset=1)
    for date in date_range(date_start, date_end):
        urls.append(dict(name=date, url=url_tpl.format(date=date.replace("-", "")), **opts))

    # Pass the actual URLs down to fetch them
    return super().fetch(output_folder, cache, urls, skip_existing=skip_existing)
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Parse the data as a list of records
    records = []
    for _, row in dataframes[0].iterrows():
        row = row.to_dict()

        # Parse the start and end dates
        date_start = row["Effective Date"][:10]
        date_end = row["Valid Through Date"][:10]

        # Convert column name and delete unnecessary columns
        row["subregion1_name"] = row["Jurisdictions"]
        del row["Jurisdictions"]
        del row["Effective Date"]
        del row["Valid Through Date"]

        # Insert a record for each date in the range
        for date in date_range(date_start, date_end):
            record = {}
            record["date"] = date
            non_numeric_columns = ("date", "subregion1_name")
            for col, val in row.items():
                if col in non_numeric_columns:
                    record[col] = val
                else:
                    record[_convert_column_name(col)] = safe_int_cast(val)
            records.append(record)

    # Convert to DataFrame and add metadata for matching
    data = DataFrame.from_records(records)
    data["country_code"] = "US"
    data["subregion2_code"] = None
    data["locality_code"] = None
    return data
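# Illustrative sketch (assumption, the real helper is not shown here): `safe_int_cast` is
# applied to free-form spreadsheet values above, so a cast that returns None instead of
# raising on bad input is consistent with how it is used (assuming
# `from typing import Any, Optional`):
def safe_int_cast_sketch(value: Any) -> Optional[int]:
    """Cast <value> to int, returning None when the cast is not possible."""
    try:
        return int(float(value))
    except (TypeError, ValueError):
        return None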
def fetch(
    self,
    output_folder: Path,
    cache: Dict[str, str],
    fetch_opts: List[Dict[str, Any]],
    skip_existing: bool = False,
) -> Dict[str, str]:
    # Data is published as daily snapshots, so we guess the URL based on the date
    opts = dict(fetch_opts[0])
    url_tpl = opts.pop("url")

    urls = []
    date_start = "2020-05-06"
    date_end = date_today(offset=1)
    for date in date_range(date_start, date_end):
        # Reverse the ISO date components, so e.g. "2020-05-06" becomes "06052020"
        datestr = "".join(reversed(date.split("-")))
        urls.append(dict(name=date, url=url_tpl.format(date=datestr), **opts))

    # Pass the actual URLs down to fetch them
    return super().fetch(output_folder, cache, urls, skip_existing=skip_existing)
def make_main_table(
    tables_folder: Path, output_path: Path, logger: ErrorLogger = ErrorLogger()
) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
        output_path: Output path where the combined main table is written.
        logger: Logger instance used to report progress.
    """
    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Merge all output files into a single table
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        logger.log_info("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() + datetime.timedelta(days=1)).date().isoformat()
        date_list = date_range("2020-01-01", max_date)
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        logger.log_info("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        logger.log_info("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(
            temp_file_path, tables_folder / "index.csv", ["key"], main_table_path, how="outer"
        )
        logger.log_info("Joined with table index")

        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")], desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path, table_file_path, join_on, temp_file_path, how="outer")
                shutil.move(temp_file_path, main_table_path)
                logger.log_info(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted and write to output location
        table_sort(main_table_path, output_path)
        logger.log_info("Sorted main table")
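# Illustrative sketch (assumption, not the project's file-based implementation): the joins
# above operate on CSV files to keep memory usage low, but the equivalent logic for a few
# small in-memory tables would be a cross product of <key x date> followed by successive
# left joins (assuming pandas >= 1.2 for how="cross" and `from typing import List`):
def make_main_table_sketch(
    index: DataFrame, tables: List[DataFrame], dates: List[str]
) -> DataFrame:
    """Cross product of <key x date>, then iterative left joins against each table."""
    main = index[["key"]].merge(DataFrame({"date": dates}), how="cross")
    main = main.merge(index, on="key", how="left")
    for table in tables:
        join_on = ["key", "date"] if "date" in table.columns else ["key"]
        main = main.merge(table, on=join_on, how="left")
    return main.sort_values(["key", "date"])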