Example #1
    def parse(self, sources: Dict[Any, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        url_tpl = sources[0]
        metadata = aux["metadata"]
        metadata = metadata[metadata["country_code"] == "FR"]

        fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
        fr_iso_map = {iso: code for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])}
        fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
        regions_iter = fr_codes["subregion1_code"].unique()
        deps_iter = [record for _, record in fr_codes.iterrows()]

        column_adapter = {
            "key": "key",
            "date": "date",
            "testsRealisesDetails": "_breakdown_tested",
            "testsPositifsDetails": "_breakdown_confirmed",
        }

        # Get country level data
        country = _get_country(url_tpl, column_adapter)

        # Get region level data
        get_region_func = partial(_get_region, url_tpl, column_adapter, fr_iso_map)
        regions = concat(list(thread_map(get_region_func, regions_iter)))

        # Get department level data
        get_department_func = partial(_get_department, url_tpl, column_adapter)
        departments = concat(list(thread_map(get_department_func, deps_iter)))

        data = concat([country, regions, departments])
        data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))

        data["_breakdown_tested"].fillna("", inplace=True)
        data["_breakdown_confirmed"].fillna("", inplace=True)

        records: Dict[str, List] = {"confirmed": [], "tested": []}
        for key, row in data.set_index("key").iterrows():
            for statistic in records.keys():
                if row[f"_breakdown_{statistic}"] != "":
                    for item in row[f"_breakdown_{statistic}"]:
                        records[statistic].append(
                            {
                                "key": key,
                                "date": row["date"],
                                "age": item["age"],
                                "sex": item.get("sexe"),
                                f"new_{statistic}": item["value"],
                            }
                        )

        df1 = DataFrame.from_records(records["tested"])
        df2 = DataFrame.from_records(records["confirmed"])
        data = df1.merge(df2, how="outer")

        data = data[~data["age"].isin(["0", "A", "B", "C", "D", "E"])]
        data["age"] = data["age"].apply(lambda x: age_group(safe_int_cast(x)))

        sex_adapter = lambda x: {"h": "male", "f": "female"}.get(x, "sex_unknown")
        data["sex"] = data["sex"].apply(sex_adapter)
        return data
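These snippets all follow the same concurrency pattern: bind the fixed arguments with functools.partial, then fan the per-item work out across a thread pool with thread_map from tqdm.contrib.concurrent, which also draws a progress bar. A minimal, self-contained sketch of that pattern (the URL template and helper function are illustrative placeholders, not part of the project):

from functools import partial

from tqdm.contrib.concurrent import thread_map


def _fetch_record(url_tpl: str, key: str) -> dict:
    # Stand-in for an I/O-bound task such as downloading and parsing one key
    return {"key": key, "url": url_tpl.format(key)}


# Bind the constant argument, then map the per-key work onto a thread pool
map_func = partial(_fetch_record, "https://example.com/data/{}.csv")
records = thread_map(map_func, ["US_CA", "US_NY", "FR"], max_workers=4, desc="Fetching")

Since thread_map returns a list, its result can be passed directly to DataFrame.from_records or concat, as the examples in this listing do.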
Example #2
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        url_tpl = sources[0]
        metadata = aux["metadata"]
        metadata = metadata[metadata["country_code"] == "FR"]

        fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
        fr_iso_map = {
            iso: code
            for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])
        }
        fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
        regions_iter = fr_codes["subregion1_code"].unique()
        deps_iter = (record for _, record in fr_codes.iterrows())

        if parse_opts.get("country"):
            data = _get_country(url_tpl)

        else:
            get_region_func = partial(_get_region, url_tpl, fr_iso_map)
            regions = concat(list(thread_map(get_region_func, regions_iter)))

            get_department_func = partial(_get_department, url_tpl)
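            # deps_iter is a generator with no len(), so total= is passed explicitly for the progress bar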
            departments = concat(
                list(
                    thread_map(get_department_func,
                               deps_iter,
                               total=len(fr_codes))))

            data = concat([regions, departments])

        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
        return data
Example #3
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts):
        url_tpl = sources[0]["url"]

        # Some states cannot be found in the dataset
        states_banlist = [
            "American Samoa",
            "District of Columbia",
            "Guam",
            "Northern Mariana Islands",
            "Puerto Rico",
            "Virgin Islands",
        ]

        states = aux["metadata"]
        states = states.loc[states["country_code"] == "US", "subregion1_name"].dropna().unique()
        states = [state for state in states if state not in states_banlist]
        states_url = [
            url_tpl.format(
                subregion1_name_path=state_name.replace(" ", "%20"),
                subregion1_name_file=state_name.replace(" ", "_"),
            )
            for state_name in states
        ]
        dataframes = {idx: df for idx, df in enumerate(thread_map(read_file, states_url))}
        return self.parse_dataframes(dataframes, aux, **parse_opts)
Example #4
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        # Data can only be retrieved one day at a time, and it starts on 2020-01-22
        first = "2020-01-22"
        today = datetime.now().date().isoformat()
        map_iter = [str(date)[:10] for date in date_range(first, today)]
        records = sum(thread_map(_get_daily_records, map_iter), [])
        return DataFrame.from_records(records)
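The sum(thread_map(_get_daily_records, map_iter), []) call flattens the per-day lists of records into a single list before building the DataFrame. An equivalent form of that line, reusing the example's _get_daily_records and map_iter and shown only to make the flattening explicit:

from itertools import chain

records = list(chain.from_iterable(thread_map(_get_daily_records, map_iter)))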
Example #5
def _download_open_data(
    logger: ErrorLogger,
    url_tpl: str,
    output_folder: Path,
    ibge_code: str,
    max_volumes: int = 12,
    **download_opts,
) -> Dict[str, str]:
    logger.log_debug(f"Downloading Brazil data for {ibge_code}...")

    # Since we are guessing the URL, we forgive errors in the download
    output = {}
    download_opts = dict(download_opts, ignore_failure=True)
    map_func = partial(download_snapshot, output_folder=output_folder, **download_opts)
    map_iter = [url_tpl.format(f"{ibge_code}-{idx + 1}") for idx in range(max_volumes)]
    for idx, file_path in enumerate(thread_map(map_func, map_iter)):
        if file_path is not None:
            output[f"{ibge_code}-{idx + 1}"] = file_path

    # Filter out empty files, which can happen if download fails in an unexpected way
    output = {name: path for name, path in output.items() if Path(path).stat().st_size > 0}

    # If the output is not split into volumes, fall back to single file URL
    if output:
        return output
    else:
        url = url_tpl.format(ibge_code)
        return {ibge_code: download_snapshot(url, output_folder, **download_opts)}
Example #6
def download_folder(
    bucket_name: str,
    remote_path: str,
    local_folder: Path,
    filter_func: Callable[[Path], bool] = None,
) -> None:
    bucket = get_storage_bucket(bucket_name)

    def _download_blob(local_folder: Path, blob: Blob) -> None:
        # Remove the prefix from the remote path
        rel_path = blob.name.split(f"{remote_path}/", 1)[-1]
        if filter_func is None or filter_func(Path(rel_path)):
            print(f"Downloading {rel_path} to {local_folder}/")
            file_path = local_folder / rel_path
            file_path.parent.mkdir(parents=True, exist_ok=True)
            for i in range(BLOB_OP_MAX_RETRIES):
                try:
                    return blob.download_to_filename(str(file_path))
                except Exception as exc:
                    traceback.print_exc()
                    # Exponential back-off
                    time.sleep(2**i)
            raise IOError(f"Error downloading {rel_path}")

    map_func = partial(_download_blob, local_folder)
    _ = thread_map(map_func,
                   bucket.list_blobs(prefix=remote_path),
                   total=None,
                   disable=True)
    list(_)  # consume the results
Example #7
def upload_folder(
    bucket_name: str,
    remote_path: str,
    local_folder: Path,
    filter_func: Callable[[Path], bool] = None,
) -> None:
    bucket = get_storage_bucket(bucket_name)

    def _upload_file(remote_path: str, file_path: Path):
        target_path = file_path.relative_to(local_folder)
        if filter_func is None or filter_func(target_path):
            print(f"Uploading {target_path} to {remote_path}/")
            blob = bucket.blob(os.path.join(remote_path, target_path))
            for i in range(BLOB_OP_MAX_RETRIES):
                try:
                    return blob.upload_from_filename(str(file_path))
                except Exception as exc:
                    traceback.print_exc()
                    # Exponential back-off
                    time.sleep(2**i)
            raise IOError(f"Error uploading {target_path}")

    map_func = partial(_upload_file, remote_path)
    _ = thread_map(map_func,
                   local_folder.glob("**/*.*"),
                   total=None,
                   disable=True)
    list(_)  # consume the results
Example #8
def cache_pull() -> str:
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)
        now = datetime.datetime.utcnow()
        output_folder = workdir / now.strftime("%Y-%m-%d-%H")
        output_folder.mkdir(parents=True, exist_ok=True)

        def _pull_source(cache_source: Dict[str, str]):
            url = cache_source.pop("url")
            output = cache_source.pop("output")
            buffer = BytesIO()
            try:
                download(url, buffer)
                with (output_folder / output).open("wb") as fd:
                    fd.write(buffer.getvalue())
            except Exception:
                print(f"Cache pull failed for {url}")
                traceback.print_exc()

        # Pull each of the sources from the cache config
        with (SRC / "cache" / "config.json").open("r") as fd:
            cache_list = json.load(fd)
        list(thread_map(_pull_source, cache_list))

        # Upload all cached data to the bucket
        upload_folder(GCS_BUCKET_PROD, "cache", workdir)

        # Build the sitemap for all cached files
        print("Building sitemap")
        sitemap = cache_build_map()
        bucket = get_storage_bucket(GCS_BUCKET_PROD)
        blob = bucket.blob("cache/sitemap.json")
        blob.upload_from_string(json.dumps(sitemap))

    return "OK"
Example #9
def upload_folder(
    bucket_name: str,
    remote_path: str,
    local_folder: Path,
    filter_func: Callable[[Path], bool] = None,
) -> None:
    bucket = get_storage_bucket(bucket_name)

    def _upload_file(remote_path: str, file_path: Path):
        target_path = file_path.relative_to(local_folder)
        if filter_func is None or filter_func(target_path):
            logger.log_debug(f"Uploading {target_path} to {remote_path}/")
            blob = bucket.blob(os.path.join(remote_path, target_path))
            for i in range(BLOB_OP_MAX_RETRIES):
                try:
                    return blob.upload_from_filename(str(file_path))
                except Exception as exc:
                    log_message = f"Error uploading {target_path}."
                    logger.log_warning(log_message,
                                       traceback=traceback.format_exc())
                    # Exponential back-off
                    time.sleep(2**i)

            # If error persists, there must be something wrong with the network so we are better
            # off crashing the appengine server.
            error_message = f"Error uploading {target_path}"
            logger.log_error(error_message)
            raise IOError(error_message)

    map_func = partial(_upload_file, remote_path)
    map_iter = local_folder.glob("**/*.*")
    list(
        thread_map(map_func, map_iter, total=None, disable=True,
                   max_workers=8))
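The retry loop above implements exponential back-off: the i-th failed attempt sleeps 2**i seconds before retrying, and the error is only escalated after BLOB_OP_MAX_RETRIES attempts. A generic, self-contained sketch of the same technique (the helper name and default retry count are illustrative, not taken from the project):

import time
import traceback
from typing import Callable, TypeVar

T = TypeVar("T")


def retry_with_backoff(operation: Callable[[], T], max_retries: int = 3) -> T:
    # Retry an operation, sleeping 2 ** i seconds after the i-th failure
    for i in range(max_retries):
        try:
            return operation()
        except Exception:
            traceback.print_exc()
            time.sleep(2 ** i)
    raise IOError("Operation failed after all retries")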
Example #10
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:
        # Data can only be retrieved one day at a time, and it starts on 2020-01-22
        first = "2020-01-22"
        map_iter = list(date_range(first, date_today()))
        records = sum(thread_map(_get_daily_records, map_iter), [])
        return DataFrame.from_records(records)
Example #11
def wikidata_property(prop: str,
                      entities: List[str],
                      query: str = _default_query,
                      error_logger: ErrorLogger = None,
                      **tqdm_kwargs) -> Iterable[Tuple[str, Any]]:
    """
    Query a single property from Wikidata and return the value of that property for each entity
    in the provided list that has it.

    Arguments:
        prop: Wikidata property, for example P1082 for population.
        entities: List of Wikidata identifiers to query the desired property.
        query: [Optional] SPARQL query used to retrieve `prop`.
        error_logger: [Optional] ErrorLogger instance to use for logging.
    Returns:
        Iterable[Tuple[str, Any]]: Iterable of <Wikidata ID, property value>
    """
    # Limit parallelization to avoid hitting rate limits
    tqdm_kwargs["max_workers"] = 6
    map_func = partial(_query_property,
                       prop,
                       query=query,
                       error_logger=error_logger)
    for entity, value in zip(entities,
                             thread_map(map_func, entities, **tqdm_kwargs)):
        yield entity, value
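A hypothetical usage sketch for the function above; P1082 is Wikidata's population property, and the entity identifiers are examples only (Q90 is Paris, Q142 is France):

# Collect the population value for each entity that has the property
population = {}
for entity, value in wikidata_property("P1082", ["Q90", "Q142"], desc="Population"):
    if value is not None:
        population[entity] = value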
Example #12
def convert_tables_to_json(csv_folder: Path,
                           output_folder: Path) -> Iterable[Path]:
    def try_json_covert(schema: Dict[str, str], csv_file: Path) -> Path:
        # JSON output defaults to same as the CSV file but with extension swapped
        json_output = output_folder / str(
            csv_file.relative_to(csv_folder)).replace(".csv", ".json")
        json_output.parent.mkdir(parents=True, exist_ok=True)

        # Converting to JSON is not critical and it may fail in some corner cases
        # As long as the "important" JSON files are created, this should be OK
        try:
            print(f"Converting {csv_file} to JSON")
            convert_csv_to_json_records(schema, csv_file, json_output)
            return json_output
        except Exception as exc:
            print(f"Unable to convert CSV file {csv_file} to JSON: ${exc}",
                  file=sys.stderr)
            traceback.print_exc()
            return None

    # Convert all CSV files to JSON using values format
    map_iter = list(csv_folder.glob("**/*.csv"))
    map_func = partial(try_json_covert, get_schema())
    for json_output in thread_map(map_func,
                                  map_iter,
                                  max_workers=2,
                                  desc="JSON conversion"):
        if json_output is not None:
            yield json_output
Example #13
def download_folder(
    bucket_name: str,
    remote_path: str,
    local_folder: Path,
    filter_func: Callable[[Path], bool] = None,
) -> None:
    bucket = get_storage_bucket(bucket_name)

    def _download_blob(local_folder: Path, blob: Blob) -> None:
        # Remove the prefix from the remote path
        rel_path = blob.name.split(f"{remote_path}/", 1)[-1]
        if filter_func is None or filter_func(Path(rel_path)):
            logger.log_debug(f"Downloading {rel_path} to {local_folder}/")
            file_path = local_folder / rel_path
            file_path.parent.mkdir(parents=True, exist_ok=True)
            for i in range(BLOB_OP_MAX_RETRIES):
                try:
                    return blob.download_to_filename(str(file_path))
                except Exception as exc:
                    log_message = f"Error downloading {rel_path}."
                    logger.log_warning(log_message, traceback=traceback.format_exc())
                    # Exponential back-off
                    time.sleep(2 ** i)

            # If error persists, there must be something wrong with the network so we are better
            # off crashing the appengine server.
            error_message = f"Error downloading {rel_path}"
            logger.log_error(error_message)
            raise IOError(error_message)

    map_func = partial(_download_blob, local_folder)
    map_iter = bucket.list_blobs(prefix=remote_path)
    list(thread_map(map_func, map_iter, total=None, disable=True, max_workers=8))
Example #14
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        url_tpl = sources[0]
        metadata = aux["metadata"]
        metadata = metadata[metadata["country_code"] == "FR"]

        fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
        fr_iso_map = {
            iso: code
            for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])
        }
        fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
        regions_iter = fr_codes["subregion1_code"].unique()
        deps_iter = [record for _, record in fr_codes.iterrows()]

        # For country level, there is no need to estimate confirmed from tests
        column_adapter_country = dict(_column_adapter)
        column_adapter_country.pop("testsPositifs")

        # Get country level data
        country = _get_country(url_tpl, column_adapter_country)

        # Country level data has totals instead of diffs, so we compute the diffs by hand
        country.sort_values("date", inplace=True)
        country["new_confirmed"] = country["total_confirmed"].diff()
        country.drop(columns=["total_confirmed"], inplace=True)

        # For region level, we can only estimate confirmed from tests
        column_adapter_region = dict(_column_adapter)
        column_adapter_region.pop("casConfirmes")

        # Get region level data
        get_region_func = partial(_get_region, url_tpl, column_adapter_region,
                                  fr_iso_map)
        regions = concat(list(thread_map(get_region_func, regions_iter)))

        # Get department level data
        get_department_func = partial(_get_department, url_tpl,
                                      column_adapter_region)
        departments = concat(list(thread_map(get_department_func, deps_iter)))

        data = concat([country, regions, departments])
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
        return data.sort_values("date")
Example #15
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:
        google_keys = aux["google_key_map"].set_index("google_location_key")["key"].to_dict()
        data = concat(thread_map(_process_chunk, dataframes.values(), total=len(dataframes)))
        data[["key"]].drop_duplicates().to_csv("google_keys.csv", index=False)
        data["key"] = data["key"].apply(lambda x: google_keys.get(x, x))
        return data.dropna(subset=["key"])
Example #16
    def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Ignore sources, we use an API for this data source
        keys = aux["metadata"].query('country_code == "RU"').key
        keys = [key for key in keys.values if len(key.split("_")) == 2]

        data = DataFrame.from_records(sum(thread_map(_get_province_records, keys), []))
        data = data[["key", "date", "total_confirmed", "total_deceased", "total_recovered"]]
        return data
Example #17
    def parse(self, sources: Dict[Any, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        # Get a list of all keys to query the API with
        keys = aux["metadata"].query('country_code == "RU"').key
        keys = [key for key in keys.values if len(key.split("_")) == 2]

        map_func = partial(_get_province_records, sources[0])
        data = DataFrame.from_records(sum(thread_map(map_func, keys), []))
        data = data[["key", "date", "total_confirmed", "total_deceased", "total_recovered"]]
        return data
Example #18
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        url_tpl = sources[0]
        metadata = aux["metadata"]
        metadata = metadata[metadata["country_code"] == "FR"]

        fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
        fr_iso_map = {
            iso: code
            for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])
        }
        fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
        regions_iter = fr_codes["subregion1_code"].unique()
        deps_iter = (record for _, record in fr_codes.iterrows())

        if parse_opts.get("country"):
            # For country level, there is no need to estimate confirmed from tests
            _column_adapter_2 = dict(_column_adapter)
            _column_adapter_2.pop("testsPositifs")
            data = _get_country(url_tpl, _column_adapter_2)

        else:
            # For region level, we can only estimate confirmed from tests
            _column_adapter_2 = dict(_column_adapter)
            _column_adapter_2.pop("casConfirmes")

            get_region_func = partial(_get_region, url_tpl, _column_adapter_2,
                                      fr_iso_map)
            regions = concat(list(thread_map(get_region_func, regions_iter)))

            get_department_func = partial(_get_department, url_tpl,
                                          _column_adapter_2)
            departments = concat(
                list(
                    thread_map(get_department_func,
                               deps_iter,
                               total=len(fr_codes))))

            data = concat([regions, departments])

        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
        return data
Example #19
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Ignore sources, we use an API for this data source
        url_tpl = sources[0]
        keys = aux["metadata"].query('(country_code == "ID") & subregion1_code.notna()')["key"]
        keys = [key for key in keys.values if len(key.split("_")) == 2 and len(key) == 5]

        map_func = partial(_get_province_records, url_tpl)
        data = DataFrame.from_records(sum(thread_map(map_func, keys), []))
        return data
Example #20
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        data = aux["knowledge_graph"].merge(aux["metadata"])[["key", "wikidata"]]

        # Load wikidata using parallel processing
        map_func = partial(self._process_item, parse_opts)
        map_iter = data.dropna().set_index("key")["wikidata"].iteritems()
        records = thread_map(map_func, list(map_iter), desc="Wikidata Properties")

        # Return all records in DataFrame form
        return DataFrame.from_records(records)
Example #21
def get_source_outputs(
        data_pipelines: Iterable[DataPipeline]) -> Iterable[Dict]:
    """Map a list of pipeline names to their source configs."""

    for data_pipeline in tqdm(list(data_pipelines)):
        # print(f"Processing {data_pipeline.name}")
        map_iter = data_pipeline.data_sources
        map_func = partial(read_source_output, data_pipeline)
        map_opts = dict(desc="Downloading data tables", leave=False)
        yield from thread_map(map_func, map_iter, **map_opts)
Example #22
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:
        # Keep only dataframes which have data available in metadata
        keys = aux["metadata"]["key"]
        has_state = lambda state: keys.apply(lambda x: x.startswith(f"US_{state}")).any()
        dataframes = {state: df for state, df in dataframes.items() if has_state(state)}

        # Parallelize the work and process each state in a separate thread to speed things up
        map_opts = dict(total=len(dataframes), desc="Processing states")
        return concat(thread_map(_process_state, dataframes.values(), **map_opts))
Example #23
def _get_data(url_tpl: str, subregion_code_col: str, subregion_code_to_api_id_map: Dict[str, int],
              subregions: DataFrame) -> DataFrame:
    subregion_codes = subregions[subregion_code_col].values
    map_func = partial(_get_records, url_tpl, subregion_code_to_api_id_map)
    data = DataFrame.from_records(sum(thread_map(map_func, subregion_codes), []))
    data['date'] = data.apply(lambda r: _indonesian_date_to_isoformat(r.tgl), axis=1)
    # add location keys
    data = table_merge(
        [data, subregions],
        left_on="subregion_code", right_on=subregion_code_col, how="left")
    data = table_rename(data, _col_name_map, drop=True)
    return data
Example #24
def _test_data_pipeline(pipeline_name: str, random_seed: int = 0):

    # Load the data pipeline to get the number of data sources
    data_pipeline = DataPipeline.load(pipeline_name)

    # Load the data pipeline, iterate over each data source and run it to get its output
    pipeline_count = len(data_pipeline.data_sources)
    map_func = partial(_test_data_source, pipeline_name, random_seed=random_seed)
    _ = thread_map(map_func, range(pipeline_count), total=pipeline_count, max_workers=4)

    # Consume the results
    list(_)
Example #25
    def parse_dataframes(self, dataframes: List[DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts):

        # Get all the weather stations with data up until 2020
        stations = read_csv(
            _INVENTORY_URL,
            sep=r"\s+",
            names=("id", "lat", "lon", "measurement", "year_start",
                   "year_end"),
        )
        stations = stations[stations.year_end == 2020][[
            "id", "lat", "lon", "measurement"
        ]]

        # Filter stations that at least provide max and min temps
        measurements = ["TMIN", "TMAX"]
        stations = stations.groupby(["id", "lat",
                                     "lon"]).agg(lambda x: "|".join(x))
        stations = stations[stations.measurement.apply(
            lambda x: all(m in x for m in measurements))]
        stations = stations.reset_index()

        # Get all the POI from metadata and go through each key
        keep_columns = ["key", "latitude", "longitude"]
        metadata = dataframes[0][keep_columns].dropna()

        # Only use keys present in the metadata table
        metadata = metadata.merge(aux["metadata"])[keep_columns]

        # Convert all coordinates to radians
        stations["lat"] = stations.lat.apply(math.radians)
        stations["lon"] = stations.lon.apply(math.radians)
        metadata["lat"] = metadata.latitude.apply(math.radians)
        metadata["lon"] = metadata.longitude.apply(math.radians)

        # Use a cache to avoid having to query the same station multiple times
        station_cache: Dict[str, DataFrame] = {}

        # Make sure the stations and the cache are sent to each function call
        map_func = partial(NoaaGhcnDataSource.station_records, station_cache,
                           stations)

        # We don't care about the index while iterating over each metadata item
        map_iter = [record for _, record in metadata.iterrows()]

        # Shuffle the iterables to try to make better use of the caching
        shuffle(map_iter)

        # Bottleneck is network so we can use lots of threads in parallel
        records = thread_map(map_func, map_iter, total=len(metadata))

        return concat(records)
Example #26
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        data = aux["knowledge_graph"].merge(aux["metadata"])[["key", "wikidata"]]
        entities = data.dropna().set_index("wikidata")

        # Load wikidata using parallel processing
        wikidata_props = {v: k for k, v in parse_opts.items()}
        map_func = partial(self._process_item, entities.index)
        for _, values in thread_map(map_func, wikidata_props.keys(), desc="Wikidata Properties"):
            values = ((x[0], *(x[1].split(",", 2) if x[1] else (None, None))) for x in values)
            df = DataFrame.from_records(values, columns=["wikidata", "latitude", "longitude"])
            entities = entities.join(df.set_index("wikidata"), how="outer")

        # Return all records in DataFrame form
        return entities
Example #27
    def parse(self, sources: List[str], aux: Dict[str, DataFrame],
              **parse_opts):

        buffer = BytesIO()
        download(sources[0], buffer, progress=True)

        data = None
        with zipfile.ZipFile(buffer) as zipped:
            data = zipped.read("WDIData.csv")
            data = read_csv(BytesIO(data))
        assert data is not None

        data = data.rename(
            columns={
                "Country Code": "3166-1-alpha-3",
                "Indicator Name": "indicator_name",
                "Indicator Code": "indicator_code",
            })

        data = data.merge(aux["worldbank_indicators"]).merge(
            aux["country_codes"])
        data = data.drop(
            columns=["Country Name", "3166-1-alpha-2", "3166-1-alpha-3"])

        indicators = parse_opts.get(
            "indicators", {code: code
                           for code in data.indicator_code.values})
        min_year = int(parse_opts.get("min_year", 2015))
        data = data[data.indicator_code.isin(indicators.values())]

        # Index data by indicator code for performance optimization
        keys = data.key.unique()
        indexed = {
            key: data[data.key == key].set_index("indicator_code")
            for key in keys
        }

        # There is probably a fancy pandas function to do this more efficiently, but this works for now
        map_func = partial(WorldbankDataSource._process_record, indexed,
                           indicators, min_year)
        records = thread_map(map_func, keys, desc="WorldBank Indicators")

        # Some countries are better described as subregions
        data = DataFrame.from_records(records)
        data.loc[data.key == "MF", "key"] = "FR_MF"

        # Return all records in DataFrame form
        return data
Example #28
def upload_folder(
    bucket_name: str,
    remote_path: str,
    local_folder: Path,
    filter_func: Callable[[Path], bool] = None,
) -> None:
    bucket = get_storage_bucket(bucket_name)

    def _upload_file(remote_path: str, file_path: Path):
        target_path = file_path.relative_to(local_folder)
        if filter_func is None or filter_func(target_path):
            logger.log_debug(f"Uploading {target_path} to {remote_path}/")
            blob = bucket.blob(os.path.join(remote_path, target_path))
            for i in range(BLOB_OP_MAX_RETRIES):
                try:
                    name, suffix = file_path.name, file_path.suffix

                    # If it's an extension we should compress, upload compressed file
                    if suffix[1:] in COMPRESS_EXTENSIONS:
                        with temporary_directory() as workdir:
                            gzipped_file = workdir / name
                            gzip_file(file_path, gzipped_file)
                            blob.content_encoding = "gzip"
                            return blob.upload_from_filename(gzipped_file)

                    # Otherwise upload the file as-is
                    else:
                        return blob.upload_from_filename(file_path)

                except Exception as exc:
                    log_message = f"Error uploading {target_path}."
                    logger.log_warning(log_message,
                                       traceback=traceback.format_exc())
                    # Exponential back-off
                    time.sleep(2**i)

            # If error persists, there must be something wrong with the network so we are better
            # off crashing the appengine server.
            error_message = f"Error uploading {target_path}"
            logger.log_error(error_message)
            raise IOError(error_message)

    map_func = partial(_upload_file, remote_path)
    map_iter = local_folder.glob("**/*.*")
    list(
        thread_map(map_func, map_iter, total=None, disable=True,
                   max_workers=8))
Example #29
    def fetch(
        self,
        output_folder: Path,
        cache: Dict[str, str],
        fetch_opts: List[Dict[str, Any]],
        skip_existing: bool = False,
    ) -> Dict[str, str]:

        output = {}
        download_options = dict(fetch_opts[0], skip_existing=skip_existing)
        url_tpl = download_options.pop("url")
        map_opts = dict(desc="Downloading Brazil Open Data")
        map_iter = [code.lower() for code in _IBGE_STATES.values()]
        map_func = partial(_download_open_data, self, url_tpl, output_folder, **download_options)
        for partial_output in thread_map(map_func, map_iter, **map_opts):
            output.update(partial_output)

        return output
Example #30
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:
        data = aux["knowledge_graph"].merge(
            aux["metadata"])[["key", "wikidata"]]
        entities = data.dropna().set_index("wikidata")

        # Load wikidata using parallel processing
        wikidata_props = {v: k for k, v in parse_opts.items()}
        map_func = partial(self._process_item, entities.index)
        map_opts = dict(desc="Wikidata Properties", total=len(wikidata_props))
        for _, values in thread_map(map_func, wikidata_props.keys(),
                                    **map_opts):
            values = _extract_coordinates(values)
            df = DataFrame.from_records(
                values, columns=["wikidata", "latitude", "longitude"])
            entities = entities.join(df.set_index("wikidata"), how="outer")

        # Return all records in DataFrame form
        return entities