Code Example #1
File: publish.py Project: yekeren/covid-19-open-data
def convert_tables_to_json(csv_folder: Path, output_folder: Path, **tqdm_kwargs) -> Iterable[Path]:

    # Convert all CSV files to JSON using values format
    map_iter = list(csv_folder.glob("**/*.csv"))
    map_opts = dict(total=len(map_iter), desc="Converting to JSON", **tqdm_kwargs)
    map_func = partial(_try_json_covert, get_schema(), csv_folder, output_folder)
    return list(pbar(map(map_func, map_iter), **map_opts))
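This example follows a pattern used throughout the project: collect the inputs into a list, bind the fixed arguments with `functools.partial`, and wrap the lazy `map` in a progress bar before materializing it with `list`. A minimal, self-contained sketch of the same pattern, using `tqdm` directly in place of the project's `pbar` helper (the conversion function here is hypothetical):

from functools import partial
from pathlib import Path
from typing import List

from tqdm import tqdm  # stand-in for the project's `pbar` wrapper


def convert_one(output_folder: Path, csv_path: Path) -> Path:
    # Hypothetical per-file step: compute (and pretend to write) the JSON path.
    return output_folder / csv_path.with_suffix(".json").name


def convert_all(csv_folder: Path, output_folder: Path) -> List[Path]:
    map_iter = list(csv_folder.glob("**/*.csv"))
    map_func = partial(convert_one, output_folder)
    # `map` is lazy; wrapping it in the progress bar and calling list() drives
    # the work while reporting progress.
    return list(tqdm(map(map_func, map_iter), total=len(map_iter), desc="Converting to JSON"))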
Code Example #2
def publish_location_breakouts(
        tables_folder: Path,
        output_folder: Path,
        use_table_names: List[str] = None) -> List[Path]:
    """
    Breaks out each of the tables in `tables_folder` based on location key, and writes them into
    subdirectories of `output_folder`.

    Arguments:
        tables_folder: Directory containing input CSV files.
        output_folder: Output path for the resulting location data.
        use_table_names: Optional subset of table names to break out; defaults to `V2_TABLE_LIST`.
    """
    # Default to a known list of tables to use when none is given
    map_iter = _get_tables_in_folder(tables_folder, use_table_names
                                     or V2_TABLE_LIST)

    # Break out each table into separate folders based on the location key
    _logger.log_info(f"Breaking out tables {[x.stem for x in map_iter]}")
    map_func = partial(table_breakout,
                       output_folder=output_folder,
                       breakout_column="location_key")
    return list(
        pbar(map(map_func, map_iter),
             desc="Breaking out tables",
             total=len(map_iter)))
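`table_breakout` is the project's own helper; as a rough mental model (not the actual implementation), breaking a table out by a column can be sketched with pandas as follows, writing one subdirectory per distinct `location_key`:

from pathlib import Path

import pandas as pd


def breakout_by_column(csv_path: Path, output_folder: Path, breakout_column: str) -> None:
    # Write one subdirectory per distinct value of `breakout_column`, each
    # containing only the rows for that value.
    table = pd.read_csv(csv_path)
    for value, subset in table.groupby(breakout_column):
        subdir = output_folder / str(value)
        subdir.mkdir(parents=True, exist_ok=True)
        subset.to_csv(subdir / csv_path.name, index=False)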
Code Example #3
    def _test_join_all(self, how_mem: str, how_pandas: str):

        # Create a custom function used to read tables casting to the expected schema
        read_table_ = partial(read_table, schema=SCHEMA, low_memory=False)

        # Test joining the index table with every other table
        left = SRC / "test" / "data" / "index.csv"
        for right in pbar([*(SRC / "test" / "data").glob("*.csv")],
                          leave=False):
            if left.name == right.name:
                continue

            left_columns = get_table_columns(left)
            right_columns = get_table_columns(right)

            if not "date" in right_columns:
                self._test_join_pair(read_table_, SCHEMA, left, right, ["key"],
                                     how_mem, how_pandas)

            if "date" in left_columns and not "date" in right_columns:
                self._test_join_pair(read_table_, SCHEMA, left, right, ["key"],
                                     how_mem, how_pandas)

            if "date" in left_columns and "date" in right_columns:
                self._test_join_pair(read_table_, SCHEMA, left, right,
                                     ["key", "date"], how_mem, how_pandas)
Code Example #4
def publish_location_aggregates(
    breakout_folder: Path,
    output_folder: Path,
    location_keys: Iterable[str],
    use_table_names: List[str] = None,
    **tqdm_kwargs,
) -> List[Path]:
    """
    This method joins *all* the tables for each location into a main.csv table.

    Arguments:
        breakout_folder: Directory containing the per-location breakout tables.
        output_folder: Output path for the resulting location data.
        location_keys: List of location keys to do aggregation for.
        use_table_names: Optional subset of table names to aggregate.
    """

    # Create a main.csv file for each of the locations in parallel
    map_iter = list(location_keys)
    _logger.log_info(f"Aggregating outputs for {len(map_iter)} location keys")
    map_opts = dict(total=len(map_iter),
                    desc="Creating location subsets",
                    **tqdm_kwargs)
    map_func = partial(
        _aggregate_location_breakouts,
        breakout_folder,
        output_folder,
        use_table_names=use_table_names,
    )
    return list(pbar(map(map_func, map_iter), **map_opts))
Code Example #5
def _process_state(data: DataFrame) -> DataFrame:
    data["date"] = data["date"].apply(lambda x: str(x)[:10])
    data["subregion2_code"] = data["fips_code"].apply(
        lambda x: numeric_code_as_string(x, 5))
    data["key"] = "US_" + data["state"] + "_" + data["subregion2_code"]
    data.drop(
        columns=[
            "subregion2_code",
            "state",
            "fips_code",
            "county",
            "report_date_window_end",
            "report_date_window_start",
        ],
        inplace=True,
    )

    # Make sure the data is properly sorted, since we need to compute diffs
    data.sort_values(["key", "date"], inplace=True)

    # Get a mapping between rolling average column names and their daily counterparts
    col_prefixes = (
        "new_cases",
        "new_deaths",
        "new_test_results_reported",
        "admissions_covid_confirmed",
    )
    rolling_suffix = "_7_day_rolling_average"
    rolling_columns_map = {
        col + rolling_suffix: col.replace(rolling_suffix, "")
        for col in col_prefixes
    }

    # Seed the daily versions of the columns with empty values
    for name in rolling_columns_map.values():
        data[name] = None

    # Convert the rolling average columns to daily values one key at a time
    # This could probably be done with some clever grouping function instead, but the iterative
    # approach is fast enough and works reliably.
    for key in pbar(data["key"].unique(),
                    desc="Computing daily values from rolling means"):
        mask = data["key"] == key
        for col, name in rolling_columns_map.items():
            subset = data.loc[mask, col].dropna()
            data.loc[subset.index, name] = recover_from_rolling_mean(subset, 7)

    # Get rid of unnecessary columns now that we have the daily values
    data.drop(columns=rolling_columns_map.keys(), inplace=True)

    return table_rename(
        data,
        {
            "new_cases": "new_confirmed",
            "new_deaths": "new_deceased",
            "new_test_results_reported": "new_tested",
            "admissions_covid_confirmed": "new_hospitalized",
        },
    )
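`recover_from_rolling_mean` is a project helper; the underlying idea is to invert a trailing 7-day rolling average by subtracting the previously recovered values in the window. A toy, self-contained approximation (not the project's implementation):

import pandas as pd


def invert_rolling_mean(rolling: pd.Series, window: int = 7) -> pd.Series:
    # Each rolling value is the mean of the last `window` daily values, so the
    # daily value is the window sum minus the previously recovered values.
    daily = []
    for i, value in enumerate(rolling):
        prior = sum(daily[max(0, i - window + 1):i])
        daily.append(value * min(i + 1, window) - prior)
    return pd.Series(daily, index=rolling.index)


daily = pd.Series(range(1, 11), dtype=float)
rolling = daily.rolling(7, min_periods=1).mean()
assert invert_rolling_mean(rolling).equals(daily)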
Code Example #6
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        # Get all the weather stations with data up until last month from inventory
        year = int(parse_opts.get("year")) if "year" in parse_opts else None
        cur_date = datetime.date(year, 12,
                                 31) if year else datetime.date.today()
        min_date = (cur_date - datetime.timedelta(days=30)).strftime("%Y%m%d")
        stations = read_file(
            sources["inventory"]).rename(columns={
                "LAT": "lat",
                "LON": "lon",
                "ELEV(M)": "elevation"
            })
        stations = stations[stations.END > int(min_date)]
        stations["id"] = stations["USAF"] + stations["WBAN"].apply(
            lambda x: f"{x:05d}")

        # Open the station data as a compressed file
        station_cache = dict()
        with tarfile.open(sources["gsod"], mode="r:gz") as stations_tar:

            # Build the station cache by decompressing all files in memory
            map_iter = stations_tar.getmembers()
            map_func = partial(_extract_station, stations_tar)
            map_opts = dict(desc="Decompressing", total=len(map_iter))
            for station_item in pbar(map(map_func, map_iter), **map_opts):
                station_cache.update(station_item)

        # Get all the POI from metadata and go through each key
        keep_columns = ["key", "latitude", "longitude"]
        metadata = read_file(sources["geography"])[keep_columns].dropna()

        # Only use keys present in the metadata table
        metadata = metadata.merge(aux["metadata"])[keep_columns]

        # Convert all coordinates to radians
        stations["lat"] = stations["lat"].apply(math.radians)
        stations["lon"] = stations["lon"].apply(math.radians)
        metadata["lat"] = metadata["latitude"].apply(math.radians)
        metadata["lon"] = metadata["longitude"].apply(math.radians)

        # Use a manager to handle memory accessed across processes
        manager = Manager()
        station_cache = manager.dict(station_cache)

        # Make sure the stations and the cache are sent to each function call
        map_func = partial(_process_location, station_cache, stations)

        # We don't care about the index while iterating over each metadata item
        map_iter = (record for _, record in metadata.iterrows())

        # Bottleneck is network so we can use lots of threads in parallel
        records = process_map(map_func, map_iter, total=len(metadata))

        return concat(records)
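`Manager().dict()` gives worker processes a proxy to the in-memory station cache, and `process_map` is the project's parallel-map helper. A self-contained sketch of the same sharing pattern using only the standard library:

from functools import partial
from multiprocessing import Manager, Pool


def lookup(cache, key):
    # Workers read from the proxy dict, which forwards to the parent process.
    return key, cache.get(key, "missing")


if __name__ == "__main__":
    manager = Manager()
    cache = manager.dict({"a": 1, "b": 2})
    map_func = partial(lookup, cache)
    with Pool(processes=2) as pool:
        print(pool.map(map_func, ["a", "b", "c"]))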
Code Example #7
File: publish.py Project: yekeren/covid-19-open-data
def copy_tables(tables_folder: Path, public_folder: Path) -> None:
    """
    Copy tables as-is from the tables folder into the public folder.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
        public_folder: Output folder where the CSV files will be copied to.
    """
    for output_file in pbar([*tables_folder.glob("*.csv")], desc="Copy tables"):
        shutil.copy(output_file, public_folder / output_file.name)
Code Example #8
def publish_subset_latest(tables_folder: Path,
                          output_folder: Path,
                          key: str = "location_key",
                          **tqdm_kwargs) -> List[Path]:
    """
    This method outputs the latest record by date per location key for each of the input tables.

    Arguments:
        tables_folder: Directory containing input CSV files.
        output_folder: Output path for the resulting data.
        key: Column name to group by.
    """
    agg_table_name = "aggregated"

    # Create a latest subset version for each of the tables in parallel
    map_iter = [
        table for table in tables_folder.glob("*.csv")
        if table.stem != agg_table_name
    ]
    _logger.log_info(f"Computing latest subset for {len(map_iter)} tables")
    map_opts = dict(total=len(map_iter),
                    desc="Creating latest subsets",
                    **tqdm_kwargs)
    map_func = partial(_grouped_subset_latest, output_folder, group_column=key)
    for table in pbar(map(map_func, map_iter), **map_opts):
        yield table

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        latest_dates_table = workdir / "dates.csv"
        latest_dates_map = _latest_date_by_group(output_folder, group_by=key)
        with open(latest_dates_table, "w") as fh:
            fh.write("location_key,date\n")
            for location_key, date in latest_dates_map.items():
                fh.write(f"{location_key},{date}\n")

        join_table_paths = [latest_dates_table]
        tables_in = (table for table in output_folder.glob("*.csv")
                     if table.stem in V3_TABLE_LIST)
        for table_file in tables_in:
            table_columns = get_table_columns(table_file)
            if "date" not in table_columns:
                join_table_paths.append(table_file)
            else:
                tmp_file = workdir / table_file.name
                table_rename(table_file, tmp_file, {"date": None})
                join_table_paths.append(tmp_file)

        # Join them all into a single file for the aggregate version
        output_agg = output_folder / f"{agg_table_name}.csv"
        table_merge(join_table_paths, output_agg, on=[key], how="OUTER")
        yield output_agg
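`_grouped_subset_latest` and `_latest_date_by_group` are project helpers; conceptually, "latest record per location key" is the usual sort-then-take-last pattern. A small pandas sketch of that idea (not the project's streaming implementation):

import pandas as pd


def latest_per_group(table: pd.DataFrame, key: str = "location_key") -> pd.DataFrame:
    # Keep only the most recent dated row for each value of `key`.
    return table.sort_values("date").groupby(key, as_index=False).last()


df = pd.DataFrame(
    {
        "location_key": ["FR", "US", "US"],
        "date": ["2020-01-02", "2020-01-01", "2020-01-02"],
        "new_confirmed": [3, 1, 2],
    }
)
print(latest_per_group(df))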
Code Example #9
File: publish.py Project: yekeren/covid-19-open-data
def create_table_subsets(main_table_path: Path, output_path: Path) -> Iterable[Path]:

    # Create subsets with each known key
    yield from _subset_grouped_key(main_table_path, output_path, desc="Grouped key subsets")

    # Create a subfolder to store the latest row for each key
    latest_path = output_path / "latest"
    latest_path.mkdir(parents=True, exist_ok=True)

    # Create a subset with the latest known day of data for each key
    map_opts = dict(desc="Latest subset")
    map_func = partial(_grouped_subset_latest, latest_path)
    map_iter = list(output_path.glob("*.csv")) + [main_table_path]
    yield from pbar(map(map_func, map_iter), **map_opts)
Code Example #10
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = table_rename(
            dataframes[0],
            {
                "Country Name": "country_name",
                "Country Code": "3166-1-alpha-3",
                "Indicator Name": "indicator_name",
                "Indicator Code": "indicator_code",
            },
            drop=False,
        )

        # Ensure all columns are of str type
        data.columns = list(map(str, data.columns))

        data = data.merge(aux["worldbank_indicators"]).merge(
            aux["country_codes"])
        data = data.drop(
            columns=["country_name", "3166-1-alpha-2", "3166-1-alpha-3"])

        indicators = parse_opts.get(
            "indicators", {code: code
                           for code in data.indicator_code.values})
        min_year = int(parse_opts.get("min_year", 2015))
        data = data[data.indicator_code.isin(indicators.values())]

        # Index data by indicator code for performance optimization
        keys = data.key.unique()
        indexed = {
            key: data[data.key == key].set_index("indicator_code")
            for key in keys
        }

        # There is probably a fancy pandas function to do this more efficiently, but this works for now
        map_func = partial(self._process_record, indexed, indicators, min_year)
        records = list(
            pbar(map(map_func, keys),
                 desc="WorldBank Indicators",
                 total=len(keys)))

        # Some countries are better described as subregions
        data = DataFrame.from_records(records)
        data.loc[data.key == "MF", "key"] = "FR_MF"

        # Return all records in DataFrame form
        return data
Code Example #11
    def parse(self, sources: List[str], aux: Dict[str, DataFrame],
              **parse_opts):

        data = None
        with zipfile.ZipFile(sources[0]) as zipped:
            data = zipped.read("WDIData.csv")
            data = read_csv(BytesIO(data))
        assert data is not None

        data = data.rename(
            columns={
                "Country Code": "3166-1-alpha-3",
                "Indicator Name": "indicator_name",
                "Indicator Code": "indicator_code",
            })

        data = data.merge(aux["worldbank_indicators"]).merge(
            aux["country_codes"])
        data = data.drop(
            columns=["Country Name", "3166-1-alpha-2", "3166-1-alpha-3"])

        indicators = parse_opts.get(
            "indicators", {code: code
                           for code in data.indicator_code.values})
        min_year = int(parse_opts.get("min_year", 2015))
        data = data[data.indicator_code.isin(indicators.values())]

        # Index data by indicator code for performance optimization
        keys = data.key.unique()
        indexed = {
            key: data[data.key == key].set_index("indicator_code")
            for key in keys
        }

        # There is probably a fancy pandas function to do this more efficiently, but this works for now
        map_func = partial(self._process_record, indexed, indicators, min_year)
        records = list(
            pbar(map(map_func, keys),
                 desc="WorldBank Indicators",
                 total=len(keys)))

        # Some countries are better described as subregions
        data = DataFrame.from_records(records)
        data.loc[data.key == "MF", "key"] = "FR_MF"

        # Return all records in DataFrame form
        return data
Code Example #12
def merge_location_breakout_tables(tables_folder: Path,
                                   output_path: Path) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>. This function
    requires all of the location breakout tables to be present under `tables_folder`.

    Arguments:
        tables_folder: Input directory where all CSV files exist.
        output_path: Output path for the resulting main.csv file.
    """
    # Use only the aggregated main tables
    table_paths = list(sorted(tables_folder.glob("**/*.csv")))

    # Concatenate all the individual breakout tables together
    _logger.log_info(
        f"Concatenating {len(table_paths)} location breakout tables")
    table_concat(pbar(table_paths, desc="Concatenating tables"), output_path)
Code Example #13
def _subset_grouped_key(main_table_path: Path,
                        output_folder: Path,
                        desc: str = None) -> Iterable[Path]:
    """ Outputs a subsets of the table with only records with a particular key """

    # Read the header of the main file to get the columns
    with open(main_table_path, "r") as fd:
        header = next(fd)

    # Do a first sweep to get the number of keys so we can accurately report progress
    key_set = set()
    for line in read_lines(main_table_path, skip_empty=True):
        key, data = line.split(",", 1)
        key_set.add(key)

    # We make use of the main table being sorted by <key, date> and do a linear sweep of the file
    # assuming that once the key changes we won't see it again in future lines
    key_folder: Path = None
    current_key: str = None
    file_handle: TextIO = None
    progress_bar = pbar(total=len(key_set), desc=desc)
    for idx, line in enumerate(read_lines(main_table_path, skip_empty=True)):
        key, data = line.split(",", 1)

        # Skip the header line
        if idx == 0:
            continue

        # When the key changes, close the previous file handle and open a new one
        if current_key != key:
            if file_handle:
                file_handle.close()
            if key_folder:
                yield key_folder / "main.csv"
            current_key = key
            key_folder = output_folder / key
            key_folder.mkdir(exist_ok=True)
            file_handle = (key_folder / "main.csv").open("w")
            file_handle.write(f"{header}")
            progress_bar.update(1)

        file_handle.write(f"{key},{data}")

    # Close the last file handle and we are done
    file_handle.close()
    progress_bar.close()
Code Example #14
    def test_convert_csv_to_json_records(self):
        for json_convert_method in (
                _convert_csv_to_json_records_fast,
                _convert_csv_to_json_records_slow,
        ):
            with temporary_directory() as workdir:

                for csv_file in pbar([*(SRC / "test" / "data").glob("*.csv")],
                                     leave=False):
                    json_output = workdir / csv_file.name.replace(
                        "csv", "json")
                    json_convert_method(SCHEMA, csv_file, json_output)

                    with json_output.open("r") as fd:
                        json_obj = json.load(fd)
                        json_df = DataFrame(data=json_obj["data"],
                                            columns=json_obj["columns"])

                    csv_test_file = workdir / json_output.name.replace(
                        "json", "csv")
                    export_csv(json_df, csv_test_file, schema=SCHEMA)

                    _compare_tables_equal(self, csv_file, csv_test_file)
Code Example #15
        data = read_file(f"{URL_OUTPUTS_PROD}/{v2_name}.csv")
        rename_columns = {
            col: snake_to_camel_case(col)
            for col in data.columns
        }
        export_csv(data.rename(columns=rename_columns),
                   public_folder / f"{v1_name}.csv")

    # Create the v1 forecast.csv file
    export_csv(
        build_forecast(read_file(public_folder / "data_minimal.csv")),
        public_folder / "data_forecast.csv",
    )

    # Convert all v1 CSV files to JSON using record format
    for csv_file in pbar([*(public_folder).glob("*.csv")],
                         desc="V1 JSON conversion"):
        data = read_file(csv_file, low_memory=False)
        json_path = str(csv_file).replace("csv", "json")
        data.to_json(json_path, orient="records")

    # Create the v2 folder
    v2_folder = public_folder / "v2"
    v2_folder.mkdir(exist_ok=True, parents=True)

    # Download the v2 tables which can fit under 100MB
    for table_name in pbar(
        (
            "by-age",
            "by-sex",
            "demographics",
            "economy",
Code Example #16
def wikidata_property(
    prop: str,
    entities: List[str],
    query_template: str = _WD_QUERY,
    logger: ErrorLogger = ErrorLogger(),
    offset: int = 0,
    **tqdm_kwargs,
) -> Any:
    """
    Query a single property from Wikidata, and return all entities which are part of the provided
    list which contain that property.

    Arguments:
        prop: Wikidata property, for example P1082 for population.
        entities: List of Wikidata identifiers to query the desired property.
        query_template: [Optional] SPARQL query template used to retrieve `prop`.
        logger: [Optional] ErrorLogger instance to use for logging.
        offset: [Optional] Number of items to skip in the result set.
    Returns:
        Iterable[Tuple[str, Any]]: Iterable of <Wikidata ID, property value>
    """
    # Time to wait before retry in case of failure
    wait_time = _INIT_WAIT_TIME

    # Build the query from template
    tpl = query_template + " LIMIT {limit} OFFSET {offset}"
    query = tpl.format(prop=prop, limit=_LIMIT, offset=offset)

    # Keep retrying the request until it succeeds or _MAX_RETRIES is reached
    for i in range(_MAX_RETRIES):
        response = None

        try:
            start_time = time.monotonic()
            params = {"query": query, "format": "json"}
            req_opts = dict(headers=_REQUEST_HEADER,
                            params=params,
                            timeout=_WD_TIMEOUT)
            response = requests.get(_WD_URL, **req_opts)
            elapsed_time = time.monotonic() - start_time
            log_opts = dict(status=response.status_code,
                            url=_WD_URL,
                            time=elapsed_time,
                            **params)
            logger.log_info(f"Wikidata SPARQL server response", **log_opts)
            data = response.json()

            # Return the first binding available (there should be only one)
            for item in pbar(data["results"]["bindings"], **tqdm_kwargs):
                pid = item["pid"]["value"].split("/")[-1]
                if pid in entities:
                    yield pid, item["prop"]["value"]

            # If we got a full page of `_LIMIT` results, keep adding offset until we run out of results
            if len(data["results"]["bindings"]) == _LIMIT:
                yield from wikidata_property(
                    prop,
                    entities,
                    query_template=query_template,
                    logger=logger,
                    offset=offset + _LIMIT,
                    **tqdm_kwargs,
                )

            # If no exceptions were thrown, we have reached the end
            logger.log_info(f"Wikidata SPARQL results end reached")
            return

        except Exception as exc:

            # If we have reached the error limit, log and re-raise the error
            if i + 1 >= _MAX_RETRIES:
                msg = response.text if response is not None else "Unknown error"
                logger.log_error(msg,
                                 exc=exc,
                                 traceback=traceback.format_exc())
                raise exc

            # Use exponential backoff in case of error
            logger.log_info(
                f"({i + 1}) Request error. Retry in {wait_time} seconds...",
                exc=exc)
            time.sleep(wait_time)
            wait_time *= 2
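The retry loop above doubles the wait time after every failed request; stripped of the Wikidata specifics, the exponential-backoff pattern looks roughly like this (a generic sketch with hypothetical constants, not the project's code):

import time

import requests

MAX_RETRIES = 8
INIT_WAIT_TIME = 2


def get_with_backoff(url: str, **params) -> requests.Response:
    wait_time = INIT_WAIT_TIME
    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(url, params=params, timeout=60)
            response.raise_for_status()
            return response
        except Exception:
            # Once the retry budget is exhausted, give up and re-raise.
            if attempt + 1 >= MAX_RETRIES:
                raise
            # Otherwise wait and double the delay before the next attempt.
            time.sleep(wait_time)
            wait_time *= 2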
Code Example #17
def main():

    # Create the folder which will be published
    public_folder = SRC / ".." / "output" / "public"
    public_folder.mkdir(exist_ok=True, parents=True)

    # Create the v1 data.csv file
    main_table = read_file(f"{URL_OUTPUTS_PROD}/main.csv", low_memory=False)
    data = main_table[main_table.aggregation_level < 2]
    rename_columns = {
        "date": "Date",
        "key": "Key",
        "country_code": "CountryCode",
        "country_name": "CountryName",
        "subregion1_code": "RegionCode",
        "subregion1_name": "RegionName",
        "total_confirmed": "Confirmed",
        "total_deceased": "Deaths",
        "latitude": "Latitude",
        "longitude": "Longitude",
        "population": "Population",
    }
    data = data[rename_columns.keys()].rename(columns=rename_columns)
    data = data.dropna(subset=["Confirmed", "Deaths"], how="all")
    data = data.sort_values(["Date", "Key"])
    export_csv(data, public_folder / "data.csv")

    # Create the v1 data_minimal.csv file
    export_csv(data[["Date", "Key", "Confirmed", "Deaths"]], public_folder / "data_minimal.csv")

    # Create the v1 data_latest.csv file
    latest = main_table[main_table.aggregation_level < 2]
    latest = latest.sort_values("date").groupby("key").last().reset_index()
    latest = latest[rename_columns.keys()].rename(columns=rename_columns)
    latest = latest.dropna(subset=["Confirmed", "Deaths"], how="all")
    latest = latest.sort_values(["Key", "Date"])
    export_csv(latest, public_folder / "data_latest.csv")

    # Create the v1 weather.csv file
    weather = read_file(f"{URL_OUTPUTS_PROD}/weather.csv")
    weather = weather[weather.key.apply(lambda x: len(x.split("_")) < 3)]
    weather = weather.rename(columns={"noaa_distance": "distance", "noaa_station": "station"})
    rename_columns = {col: snake_to_camel_case(col) for col in weather.columns}
    export_csv(weather.rename(columns=rename_columns), public_folder / "weather.csv")

    # Create the v1 mobility.csv file
    mobility = read_file(f"{URL_OUTPUTS_PROD}/mobility.csv")
    mobility = mobility[mobility.key.apply(lambda x: len(x.split("_")) < 3)]
    mobility = drop_na_records(mobility, ["date", "key"])
    rename_columns = {
        col: snake_to_camel_case(col).replace("Mobility", "") for col in mobility.columns
    }
    export_csv(mobility.rename(columns=rename_columns), public_folder / "mobility.csv")

    # Create the v1 CSV files which only require simple column mapping
    v1_v2_name_map = {"response": "oxford-government-response"}
    for v1_name, v2_name in v1_v2_name_map.items():
        data = read_file(f"{URL_OUTPUTS_PROD}/{v2_name}.csv")
        rename_columns = {col: snake_to_camel_case(col) for col in data.columns}
        export_csv(data.rename(columns=rename_columns), public_folder / f"{v1_name}.csv")

    # Create the v1 forecast.csv file
    export_csv(
        build_forecast(read_file(public_folder / "data_minimal.csv")),
        public_folder / "data_forecast.csv",
    )

    # Convert all v1 CSV files to JSON using record format
    for csv_file in pbar([*(public_folder).glob("*.csv")], desc="V1 JSON conversion"):
        data = read_file(csv_file, low_memory=False)
        json_path = str(csv_file).replace("csv", "json")
        data.to_json(json_path, orient="records")

    # Create the v2 folder
    v2_folder = public_folder / "v2"
    v2_folder.mkdir(exist_ok=True, parents=True)

    # Download the v2 tables which can fit under 100MB
    for table_name in pbar(
        (
            "by-age",
            "by-sex",
            "demographics",
            "economy",
            "epidemiology",
            "geography",
            "health",
            "hospitalizations",
            "index",
            "mobility",
            "oxford-government-response",
            "weather",
            "worldbank",
            "worldpop",
        ),
        desc="V2 download",
    ):
        for ext in ("csv", "json"):
            with tempfile.NamedTemporaryFile() as tmp:
                tmp_path = Path(tmp.name)
                download(f"{URL_OUTPUTS_PROD}/{table_name}.{ext}", tmp)
                # Check that the output is less than 100 MB before copying it to the output folder
                if tmp_path.stat().st_size < 100 * 1000 * 1000:
                    shutil.copyfile(tmp_path, v2_folder / f"{table_name}.{ext}")
Code Example #18
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        # Use a manager to handle memory accessed across processes
        manager = Manager()

        # Get all the weather stations with data up until last month from inventory
        today = datetime.date.today()
        min_date = (today - datetime.timedelta(days=30)).strftime("%Y%m%d")
        stations = read_file(
            sources["inventory"]).rename(columns={
                "LAT": "lat",
                "LON": "lon",
                "ELEV(M)": "elevation"
            })
        stations = stations[stations.END > int(min_date)]
        stations["id"] = stations["USAF"] + stations["WBAN"].apply(
            lambda x: f"{x:05d}")

        # Open the station data as a compressed file
        with tarfile.open(sources["gsod"], mode="r:gz") as stations_tar:

            # Build the station cache by decompressing all files in memory
            station_cache = manager.dict()
            for member in pbar(stations_tar.getmembers(),
                               desc="Decompressing"):

                if not member.name.endswith(".csv"):
                    continue

                # Read the records from the provided station
                data = read_file(
                    stations_tar.extractfile(member),
                    file_type="csv",
                    usecols=_COLUMN_MAPPING.keys(),
                ).rename(columns=_COLUMN_MAPPING)

                # Fix data types
                data["noaa_station"] = data["noaa_station"].astype(str)
                data["rainfall"] = data["rainfall"].apply(conv_dist)
                data["snowfall"] = data["snowfall"].apply(conv_dist)
                data["dew_point"] = data["dew_point"].apply(conv_temp)
                for temp_type in ("average", "minimum", "maximum"):
                    col = f"{temp_type}_temperature"
                    data[col] = data[col].apply(conv_temp)

                # Compute the relative humidity from the dew point and average temperature
                data["relative_humidity"] = data.apply(
                    lambda x: relative_humidity(x["average_temperature"], x[
                        "dew_point"]),
                    axis=1)

                station_cache[member.name.replace(".csv", "")] = data

        # Get all the POI from metadata and go through each key
        keep_columns = ["key", "latitude", "longitude"]
        metadata = read_file(sources["geography"])[keep_columns].dropna()

        # Only use keys present in the metadata table
        metadata = metadata.merge(aux["metadata"])[keep_columns]

        # Convert all coordinates to radians
        stations["lat"] = stations["lat"].apply(math.radians)
        stations["lon"] = stations["lon"].apply(math.radians)
        metadata["lat"] = metadata["latitude"].apply(math.radians)
        metadata["lon"] = metadata["longitude"].apply(math.radians)

        # Make sure the stations and the cache are sent to each function call
        map_func = partial(_process_location, station_cache, stations)

        # We don't care about the index while iterating over each metadata item
        map_iter = (record for _, record in metadata.iterrows())

        # Bottleneck is network so we can use lots of threads in parallel
        records = process_map(map_func, map_iter, total=len(metadata))

        return concat(records)
Code Example #19
def make_main_table(tables_folder: Path, output_path: Path) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
        output_path: Output path for the resulting main table.
    """

    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Merge all output files into a single table
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        print("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() +
                    datetime.timedelta(days=1)).date().isoformat()
        date_list = [
            date.date().isoformat()
            for date in date_range("2020-01-01", max_date)
        ]
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        print("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        print("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(temp_file_path, tables_folder / "index.csv", ["key"],
                   main_table_path)
        print("Joined with table index")

        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")],
                                    desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path, table_file_path, join_on,
                           temp_file_path)
                shutil.move(temp_file_path, main_table_path)
                print(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted and write it to the output location
        table_sort(main_table_path, output_path)
        print("Sorted main table")