Code example #1
    def test_date_range(self):
        start = "2020-01-01"
        end = "2020-01-09"

        expected = [
            "2020-01-01",
            "2020-01-02",
            "2020-01-03",
            "2020-01-04",
            "2020-01-05",
            "2020-01-06",
            "2020-01-07",
            "2020-01-08",
            "2020-01-09",
        ]

        # Test normal case
        self.assertListEqual(list(date_range(start, end)), expected)

        # Test start > end
        with self.assertRaises(AssertionError):
            # pylint: disable=arguments-out-of-order
            list(date_range(end, start))

        # Test start == end
        self.assertListEqual(list(date_range(start, start)), [expected[0]])
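The test above pins down the contract for date_range: it yields ISO-formatted date strings from start to end, both inclusive, and raises an AssertionError when start comes after end. A minimal sketch consistent with that contract (a hypothetical implementation; the project's own version may differ, e.g. it may return a list rather than a generator):

import datetime
from typing import Iterator


# Sketch of the date_range helper implied by the test above (assumption)
def date_range(start: str, end: str) -> Iterator[str]:
    """Yield ISO-formatted dates from `start` to `end`, both inclusive."""
    start_date = datetime.date.fromisoformat(start)
    end_date = datetime.date.fromisoformat(end)
    # The test expects an AssertionError when start > end
    assert start_date <= end_date, "start date must not come after end date"
    current = start_date
    while current <= end_date:
        yield current.isoformat()
        current += datetime.timedelta(days=1)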
Code example #2
    def fetch(
        self,
        output_folder: Path,
        cache: Dict[str, str],
        fetch_opts: List[Dict[str, Any]],
        skip_existing: bool = False,
    ) -> Dict[str, str]:
        # Data is published as snapshots, so we guess the URL based on the date
        opts = dict(fetch_opts[0])
        url_tpl = opts.pop("url")

        # Keep trying URLs in reverse chronological order starting today until one works
        url = None
        date_start = "2021-08-31"
        date_end = date_today(offset=1)
        for date in reversed(list(date_range(date_start, date_end))):
            url = url_tpl.format(date=date.replace("-", ""))
            res = requests.head(url)
            if res.status_code == 200 and int(
                    res.headers.get("Content-Length", "0")) > 0:
                # Pass the actual URL down to fetch it
                url_opts = dict(url=url, **opts)
                return super().fetch(output_folder,
                                     cache, [url_opts],
                                     skip_existing=skip_existing)

        # If no snapshot URL works, fail loudly instead of implicitly returning None
        raise RuntimeError(f"No working URL found for template {url_tpl}")
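date_today(offset=1) is not defined in these snippets; examples #4, #5 and #9 compute the same value inline as (datetime.datetime.now() + datetime.timedelta(days=1)).date().isoformat(), which suggests the helper simply returns today's date shifted by `offset` days as an ISO string. A hedged sketch under that assumption:

import datetime


# Assumed behaviour of the date_today helper used in examples #2, #6 and #8
def date_today(offset: int = 0) -> str:
    """Return today's date plus `offset` days as an ISO-formatted string."""
    return (datetime.datetime.now().date() +
            datetime.timedelta(days=offset)).isoformat()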
Code example #3
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:
        # Data can only be retrieved one day at a time, and it starts on 2020-01-22
        first = "2020-01-22"
        map_iter = list(date_range(first, date_today()))
        records = sum(thread_map(_get_daily_records, map_iter), [])
        return DataFrame.from_records(records)
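Here _get_daily_records is assumed to return the list of records for a single date; thread_map runs it over every date concurrently, and sum(..., []) concatenates the per-date lists into one flat list. For illustration with dummy data (not from the source):

daily = [[{"date": "2020-01-22", "total": 1}],
         [{"date": "2020-01-23", "total": 2}]]
flat = sum(daily, [])
# flat == [{"date": "2020-01-22", "total": 1}, {"date": "2020-01-23", "total": 2}]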
Code example #4
def _make_location_key_and_date_table(index_table: Path,
                                      output_path: Path) -> None:
    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Make sure that there is an index table present
        assert index_table.exists(), "Index table not found"

        # Index table will determine if we use "key" or "location_key" as column name
        index_columns = get_table_columns(index_table)
        location_key = "location_key" if "location_key" in index_columns else "key"

        # Create a single-column table with only the keys
        keys_table_path = workdir / "location_keys.csv"
        with open(keys_table_path, "w") as fd:
            fd.write(f"{location_key}\n")
            fd.writelines(
                f"{value}\n"
                for value in table_read_column(index_table, location_key))

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() +
                    datetime.timedelta(days=1)).date().isoformat()
        date_table_path = workdir / "dates.csv"
        with open(date_table_path, "w") as fd:
            fd.write("date\n")
            fd.writelines(f"{value}\n"
                          for value in date_range("2020-01-01", max_date))

        # Output all combinations of <key x date>
        table_cross_product(keys_table_path, date_table_path, output_path)
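The comment above makes table_cross_product's job clear: write every <key x date> combination of the two single-column inputs. A minimal sketch of that behaviour (a hypothetical stand-in; the real helper presumably streams large CSV files more carefully):

import csv
from itertools import product
from pathlib import Path


# Hypothetical stand-in for the project's table_cross_product helper
def table_cross_product(left_path: Path, right_path: Path, output_path: Path) -> None:
    """Write every combination of the data rows of two single-column CSV tables."""
    with open(left_path) as left, open(right_path) as right:
        left_header, *left_rows = (row[0] for row in csv.reader(left))
        right_header, *right_rows = (row[0] for row in csv.reader(right))
    with open(output_path, "w", newline="") as output:
        writer = csv.writer(output)
        writer.writerow([left_header, right_header])
        writer.writerows(product(left_rows, right_rows))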
Code example #5
    def fetch(
        self, output_folder: Path, cache: Dict[str, str], fetch_opts: List[Dict[str, Any]]
    ) -> Dict[str, str]:
        # Data is published as GitHub Releases, so we guess the URL based on the date
        opts = fetch_opts[0]
        url_tpl = opts["url"]

        # Go from <today + 1> until the last known date for which data is reported
        # NOTE: at the time of writing, last known date is October 20
        working_url = None
        last_known_date = "2020-10-20"
        latest_date = (datetime.datetime.today() + datetime.timedelta(days=1)).date().isoformat()
        for date in reversed(list(date_range(last_known_date, latest_date))):
            try:
                url_test = url_tpl.format(date=date.replace("-", "."))
                self.log_debug(f"Trying {url_test}")
                res = requests.get(url_test)
                if res.ok:
                    working_url = url_test
                    break
            except requests.RequestException:
                continue

        # Make sure that we found a working URL
        assert working_url is not None, "No working URL found for DXY data source"

        # Pass the actual URL down to fetch it
        return super().fetch(output_folder, cache, [{**opts, "url": working_url}])
Code example #6
    def fetch(
        self,
        output_folder: Path,
        cache: Dict[str, str],
        fetch_opts: List[Dict[str, Any]],
        skip_existing: bool = False,
    ) -> Dict[str, str]:
        # Data is published as GitHub Releases, so we guess the URL based on the date
        opts = dict(fetch_opts[0])
        url_tpl = opts.pop("url")

        urls = []
        date_start = "2021-01-11"
        date_end = date_today(offset=1)
        for date in date_range(date_start, date_end):
            urls.append(
                dict(name=date,
                     url=url_tpl.format(date=date.replace("-", "")),
                     **opts))

        # Pass the actual URLs down to fetch it
        return super().fetch(output_folder,
                             cache,
                             urls,
                             skip_existing=skip_existing)
Code example #7
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        # Parse the data as a list of records
        records = []
        for _, row in dataframes[0].iterrows():
            row = row.to_dict()

            # Parse the start and end dates
            date_start = row["Effective Date"][:10]
            date_end = row["Valid Through Date"][:10]

            # Convert column name and delete unnecessary columns
            row["subregion1_name"] = row["Jurisdictions"]
            del row["Jurisdictions"]
            del row["Effective Date"]
            del row["Valid Through Date"]

            # Insert a record for each date in the range
            for date in date_range(date_start, date_end):
                record = {}
                record["date"] = date
                non_numeric_columns = ("date", "subregion1_name")
                for col, val in row.items():
                    if col in non_numeric_columns:
                        record[col] = val
                    else:
                        record[_convert_column_name(col)] = safe_int_cast(val)
                records.append(record)

        # Convert to DataFrame and add metadata for matching
        data = DataFrame.from_records(records)
        data["country_code"] = "US"
        data["subregion2_code"] = None
        data["locality_code"] = None

        return data
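safe_int_cast is not defined in these snippets; its use on free-form spreadsheet values suggests a cast that returns None instead of raising on bad input. A possible sketch under that assumption:

from typing import Any, Optional


# Assumed behaviour of the project's safe_int_cast helper
def safe_int_cast(value: Any) -> Optional[int]:
    """Convert `value` to int, returning None when the conversion fails."""
    try:
        return int(float(value))
    except (TypeError, ValueError):
        return None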
Code example #8
    def fetch(
        self,
        output_folder: Path,
        cache: Dict[str, str],
        fetch_opts: List[Dict[str, Any]],
        skip_existing: bool = False,
    ) -> Dict[str, str]:
        # Data is published as daily snapshots, so we guess the URL based on the date
        opts = dict(fetch_opts[0])
        url_tpl = opts.pop("url")

        urls = []
        date_start = "2020-05-06"
        date_end = date_today(offset=1)
        for date in date_range(date_start, date_end):
            datestr = "".join(reversed(date.split("-")))
            urls.append(
                dict(name=date, url=url_tpl.format(date=datestr), **opts))

        # Pass the actual URLs down to fetch it
        return super().fetch(output_folder,
                             cache,
                             urls,
                             skip_existing=skip_existing)
Code example #9
def make_main_table(
    tables_folder: Path,
    output_path: Path,
    logger: ErrorLogger = ErrorLogger()) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
    Returns:
        DataFrame: Flat table with all data combined.
    """

    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Merge all output files into a single table
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        logger.log_info("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() +
                    datetime.timedelta(days=1)).date().isoformat()
        date_list = date_range("2020-01-01", max_date)
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        logger.log_info("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        logger.log_info("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(temp_file_path,
                   tables_folder / "index.csv", ["key"],
                   main_table_path,
                   how="outer")
        logger.log_info("Joined with table index")

        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")],
                                    desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path,
                           table_file_path,
                           join_on,
                           temp_file_path,
                           how="outer")
                shutil.move(temp_file_path, main_table_path)
                logger.log_info(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted and write to the output location
        table_sort(main_table_path, output_path)
        logger.log_info("Sorted main table")