import pandas as pd
import yaml
from shapely import wkt

# NOTE: project helpers used below (get_data_from_athena, insert_into,
# create_table, simplify, _save_local, _get_timezone, _bigger_polygon,
# _reescale, _wkt_to_geojson, _get_cell, _all_dates, _add_date_slug,
# _prepare_to_partition, _get_length, _write_sheets_table) are assumed to be
# imported from the surrounding package.


def _get_metadata_table(config):
    # Prefer the analysis variation table; fall back to the metadata-ready
    # table when the former does not exist yet.
    try:
        return get_data_from_athena(
            "select * from "
            f"{config['athena_database']}.{config['slug']}_analysis_metadata_variation ",
            config,
        )
    except Exception:
        return get_data_from_athena(
            "select * from "
            f"{config['athena_database']}.{config['slug']}_metadata_metadata_ready ",
            config,
        )

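# --- Illustrative sketch (not part of the pipeline) -------------------------
# `get_data_from_athena` is a project helper. A minimal version might look
# like the sketch below, assuming awswrangler (suggested by the
# `wrangler=True` flags used elsewhere in this module). The name
# `_get_data_from_athena_sketch` is hypothetical and only avoids shadowing
# the real helper, which may handle credentials, retries, and the optional
# `config` argument differently.
def _get_data_from_athena_sketch(query, config=None):
    import awswrangler as wr  # local import: illustration only

    database = config["athena_database"] if config else "default"
    # Runs the SQL in Athena and returns the result as a pandas DataFrame.
    return wr.athena.read_sql_query(query, database=database)

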
def check_existence(config):
    # True if the raw table for this job already exists in Athena.
    res = get_data_from_athena(
        f"show tables in {config['athena_database']} "
        f"'{config['slug']}_{config['raw_table']}_{config['name']}'")
    return len(res) > 0

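# Usage example (all values hypothetical): with this config,
# `check_existence` looks for a table named `dev_waze_analysis_daily`
# in the `mobility` database.
# check_existence({
#     "athena_database": "mobility",
#     "slug": "dev",
#     "raw_table": "waze",
#     "name": "analysis_daily",
# })

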
def country_coarse(config):
    # One record per (region_slug, group) pair at the coarse resolution (3).
    # "group" is quoted because it is a reserved word in Presto/Athena.
    insert_groups = get_data_from_athena(
        'select distinct region_slug, "group" from '
        f"{config['athena_database']}.{config['slug']}_cities_country_resolutions "
        "where resolution = 3",
        config,
    ).to_dict("records")
    insert_into.start(config, insert_groups)

def metadata_prepare(config):
    # Enrich the region metadata with a timezone derived from each region's
    # WKT shape.
    metadata = get_data_from_athena(
        "select * from "
        f"{config['athena_database']}.{config['slug']}_metadata_regions_metadata ",
        config,
    )
    metadata["timezone"] = metadata["region_shapefile_wkt"].apply(
        lambda x: _get_timezone(x, config))
    _save_local(metadata, config, wrangler=True)

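# --- Illustrative sketch (not part of the pipeline) -------------------------
# `_get_timezone` is a project helper. One plausible implementation, assuming
# the `timezonefinder` package, resolves the timezone at the shape's
# centroid; the real helper may differ.
def _get_timezone_sketch(region_wkt, config=None):
    from shapely import wkt as _wkt
    from timezonefinder import TimezoneFinder

    centroid = _wkt.loads(region_wkt).centroid
    # timezone_at returns an IANA name, e.g. "America/Sao_Paulo".
    return TimezoneFinder().timezone_at(lng=centroid.x, lat=centroid.y)

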
def country_cities(config):
    regions = get_data_from_athena(
        "select distinct region_slug "
        f"from {config['athena_database']}.{config['slug']}_{config['raw_table']}_{config['from_table']}",
        config,
    )["region_slug"].tolist()
    return [{
        "p_name": r,
        "p_path": f"region_slug={r}",
        "partition": r,
        "region_slug": r,
    } for r in regions]

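# Example of one job record (hypothetical slug), as consumed downstream;
# `p_path` looks like a Hive-style partition prefix, which is an inference
# from its shape rather than something stated here:
# {"p_name": "br_saopaulo", "p_path": "region_slug=br_saopaulo",
#  "partition": "br_saopaulo", "region_slug": "br_saopaulo"}

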
def should_create_table(config):
    # "$path" is Athena's pseudo-column with the S3 location of each row's
    # file; segment 7 (Presto arrays are 1-indexed) is expected to hold the
    # epoch-milliseconds folder the table was written to.
    try:
        current_millis = get_data_from_athena(
            f"""
            select split("$path", '/')[7] current_millis
            from {config['athena_database']}.{config['slug']}_{config['raw_table']}_{config['name']}
            limit 1""",
        )
    except Exception:
        # Table does not exist yet.
        current_millis = []
    if len(current_millis):
        return current_millis["current_millis"][0] != config.get(
            "current_millis")
    else:
        return True

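# Worked example (hypothetical S3 layout): Presto's split() is 1-indexed,
# so for a row stored at
#     s3://bucket/prefix/database/table/1596222668000/part-0.parquet
# split("$path", '/') yields
#     ['s3:', '', 'bucket', 'prefix', 'database', 'table', '1596222668000', ...]
# and element [7] is the epoch-milliseconds folder compared against
# config["current_millis"]. The exact segment index depends on how deep the
# pipeline writes its tables.

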
def grid(config):
    # Regions flagged for gridding, plus any explicitly selected regions.
    regions = list(
        get_data_from_athena(
            "select distinct region_slug from "
            f"{config['athena_database']}.{config['slug']}_metadata_metadata_prepare "
            "where grid = 'TRUE' "
            f"""or region_slug in ('{"','".join(config['selected_regions'])}')""",
            config,
        )["region_slug"])
    return [{
        "p_name": r,
        "p_path": f"region_slug={r}",
        "partition": r,
        "region_slug": r,
    } for r in regions]

def _get_remaining_dates(data, config):
    # Dates already materialised in Athena for each region. year/month/day
    # come in as separate columns, so they are parsed back into a date.
    existing_dates = get_data_from_athena(
        "select distinct region_slug, "
        "date_parse(concat(cast(year as varchar), '-', cast(month as varchar), "
        "'-', cast(day as varchar)), '%Y-%m-%d') date "
        f"from {config['athena_database']}.{config['slug']}_{config['raw_table']}_{config['name']} ",
        config,
    ).assign(date=lambda df: df["date"].apply(lambda x: x.date()))
    dates = _all_dates(config)
    # Cross every region with every expected date ...
    all_possibilities = (data.groupby(["region_slug"])
                         .apply(lambda x: dates[["date"]])
                         .reset_index()
                         .drop(columns="level_1"))
    # ... and keep only the (region, date) pairs not yet in Athena.
    return all_possibilities.merge(
        existing_dates,
        on=["region_slug", "date"],
        how="left",
        indicator=True,
    ).query('_merge == "left_only"')[["region_slug", "date"]]

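# --- Illustrative sketch (not part of the pipeline) -------------------------
# `_all_dates` is a project helper expected to return a DataFrame with a
# `date` column covering the analysis window. A plausible version, assuming
# hypothetical `start_date`/`end_date` config keys; it returns `date` objects
# to match the comparison in `_get_remaining_dates`.
def _all_dates_sketch(config):
    import pandas as pd

    return pd.DataFrame({
        "date": [
            d.date()
            for d in pd.date_range(config["start_date"], config["end_date"])
        ]
    })

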
def country_resolutions(config):
    metadata = get_data_from_athena(
        "select region_slug, region_shapefile_wkt from "
        f"{config['athena_database']}.{config['slug']}_metadata_metadata_prepare "
        f"""where region_slug in ('{"','".join([j['region_slug'] for j in config['jobs']])}')""",
        config,
    )
    # Coarse country-level grid: keep each region's biggest polygon, rescale
    # it, convert to GeoJSON and cover it with cells at the coarse resolution.
    country_grid = (
        metadata.set_index("region_slug")["region_shapefile_wkt"]
        .apply(_bigger_polygon)
        .apply(_reescale)
        .apply(_wkt_to_geojson)
        .reset_index()
        .groupby("region_slug")["region_shapefile_wkt"]
        .apply(lambda x: _get_cell(x, config["coarse_resolutions"],
                                   config["coarse_resolutions"]))
        .reset_index()
        .drop(columns="level_1"))
    # Fine city-level grid (resolution 10): cells over each city's convex
    # hull, renamed so the cell id becomes `id_parent` and the city name
    # becomes `id`.
    city_grid = pd.concat([
        (pd.read_csv("data/support_tables/all_cities/" + job["file"], sep="|")
         .set_index("region_name")["region_shapefile_wkt"]
         .apply(lambda x: wkt.loads(x).convex_hull.wkt)
         .apply(_wkt_to_geojson)
         .reset_index()
         .groupby("region_name")["region_shapefile_wkt"]
         .apply(lambda x: _get_cell(
             x,
             config["coarse_resolutions"],
             config["coarse_resolutions"],
             keep_wkt=True,
         ))
         .reset_index()
         .drop(columns=["level_1", "id_parent"])
         .rename(columns={"region_name": "id", "id": "id_parent"})
         .assign(region_slug=job["region_slug"], resolution=10))
        for job in config["jobs"]
    ])
    grid = pd.concat([city_grid, country_grid])
    grid["id_parent"] = grid["id_parent"].astype(str)
    grid["id"] = grid["id"].astype(str)
    create_table.from_local(grid, config, wrangler=True)

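# --- Illustrative sketch (not part of the pipeline) -------------------------
# `_wkt_to_geojson` and `_get_cell` are project helpers. The grid code looks
# H3-based (integer resolutions, parent/child cell ids), so a plausible shape
# for them, assuming the h3-py 3.x API, is sketched below. The real helpers
# also handle rescaling, multiple resolutions, and the `keep_wkt` flag.
def _wkt_to_geojson_sketch(region_wkt):
    from shapely import wkt as _wkt
    from shapely.geometry import mapping

    # GeoJSON-like dict, e.g. {"type": "Polygon", "coordinates": [...]}.
    return mapping(_wkt.loads(region_wkt))


def _get_cell_sketch(geojson, resolution):
    import h3
    import pandas as pd

    # All H3 cells whose centroids fall inside the polygon.
    cells = h3.polyfill(geojson, resolution, geo_json_conformant=True)
    return pd.DataFrame({
        "id": list(cells),
        "id_parent": [h3.h3_to_parent(c, resolution - 1) for c in cells],
    })

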
def _region_slug_partition(config):
    data = _get_metadata_table(config)
    if config.get("selected_regions"):
        data = data[data["region_slug"].isin(config.get("selected_regions"))]
    elif config.get("if_exists") == "append":
        # Skip shapes already fully processed; the variation table may not
        # exist yet.
        try:
            skip = get_data_from_athena(
                "select distinct region_shapefile_wkt from "
                f"{config['athena_database']}.{config['slug']}_analysis_metadata_variation "
                "where n_days is not null",
                config,
            )
        except Exception:
            skip = pd.DataFrame([], columns=["region_shapefile_wkt"])
        data = data[~data["region_shapefile_wkt"]
                    .isin(skip["region_shapefile_wkt"])]
    if config["name"] == "analysis_daily":
        data = data[~data["region_slug"].isin(config["cv_exception"])]
    if config.get("mode") == "incremental":
        remaining_dates = _get_remaining_dates(data, config)
        if len(remaining_dates) == 0:
            return None
        data = _add_date_slug(data, remaining_dates, config)
    return _prepare_to_partition(data, config)

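# Usage example (all values hypothetical): an incremental run returns only
# the partition jobs whose (region, date) pairs are missing from Athena,
# or None when nothing is pending.
# _region_slug_partition({
#     "athena_database": "mobility",
#     "slug": "dev",
#     "raw_table": "waze",
#     "name": "analysis_daily",
#     "mode": "incremental",
#     "if_exists": "append",
#     "cv_exception": [],
# })

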
def write_index(config):
    for table in config["to_write"]:
        df = get_data_from_athena(
            "select * from "
            f"{config['athena_database']}.{config['slug']}_{table['table']}")
        # Shrink WKT shapes before writing them to a spreadsheet cell.
        if "region_shapefile_wkt" in df.columns:
            df["region_shapefile_wkt"] = df["region_shapefile_wkt"].apply(
                lambda x: str(simplify(wkt.loads(x))))
        if table.get("overall_drop"):
            df = df.drop(columns=table["overall_drop"])
        with open("configs/drive-config.yaml", "r") as f:
            drive_config = yaml.safe_load(f)
        if config["slug"] == "dev":
            _write_sheets_table(
                df,
                table["worksheet"],
                config,
                drive_config[config["name"]][config["slug"]],
            )
        elif config["slug"] == "prod":
            _write_sheets_table(
                df,
                table["worksheet"],
                config,
                drive_config[config["name"]][config["slug"]],
            )
            # The public copy additionally drops the non-public columns.
            df = df.drop(columns=table["public_drop"])
            _write_sheets_table(df, table["worksheet"], config,
                                drive_config[config["name"]]["public"])

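# --- Illustrative sketch (not part of the pipeline) -------------------------
# `_write_sheets_table` is a project helper. A plausible version, assuming
# gspread plus gspread_dataframe and that each `drive_config` entry carries a
# spreadsheet key (both assumptions, not taken from this module):
def _write_sheets_table_sketch(df, worksheet, config, drive_config):
    import gspread
    from gspread_dataframe import set_with_dataframe

    client = gspread.service_account()  # credentials discovery is assumed
    sheet = client.open_by_key(drive_config["spreadsheet_key"])
    # Replace the worksheet's contents with the DataFrame.
    ws = sheet.worksheet(worksheet)
    ws.clear()
    set_with_dataframe(ws, df)

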
def metadata_osm_length(config):
    # Fetch the regions' configuration
    metadata = get_data_from_athena(
        "select region_slug, region_shapefile_wkt, rerun from "
        f"{config['athena_database']}.{config['slug']}_metadata_metadata_prepare "
        "order by region_slug",
        config,
    )
    # Regions flagged for a forced rerun
    rerun = metadata[metadata["rerun"] == "TRUE"]
    # Restrict to the selected regions if any were given, else update all
    if config.get("selected_regions"):
        metadata = metadata[metadata["region_slug"].isin(
            config.get("selected_regions"))]
    # Get the current state of the table; it may not exist yet
    try:
        current = get_data_from_athena(
            "select region_slug, region_shapefile_wkt, osm_length from "
            f"{config['athena_database']}.{config['slug']}_metadata_metadata_osm_length "
            "order by region_slug",
            config,
        )
    except Exception:
        current = pd.DataFrame(
            [], columns=["region_slug", "region_shapefile_wkt", "osm_length"])
    # Update only regions whose shapes changed or that do not exist yet
    if config["mode"] == "overwrite_partitions":
        try:
            skip = current[current["osm_length"] != ""][[
                "region_shapefile_wkt"
            ]]
        except Exception:
            skip = pd.DataFrame([], columns=["region_shapefile_wkt"])
        selected = metadata[~metadata["region_shapefile_wkt"].isin(
            skip["region_shapefile_wkt"])][["region_slug",
                                            "region_shapefile_wkt"]]
        selected = pd.concat([selected, rerun]).drop_duplicates()
        # Calculate the OSM length for the selected regions
        selected["osm_length"] = selected["region_shapefile_wkt"].apply(
            lambda x: _get_length(x, config))
        selected = selected.sort_values(by="region_slug")
        # If the current table is empty, initialize it with the selection;
        # otherwise patch it in place (DataFrame.update aligns on index)
        if len(current):
            current.update(
                selected[["region_slug", "region_shapefile_wkt",
                          "osm_length"]])
        else:
            current = selected[[
                "region_slug", "region_shapefile_wkt", "osm_length"
            ]]
    if config["verbose"]:
        print(current)
        print(list(current["region_slug"]))
    if len(metadata):
        _save_local(current, config, wrangler=True)
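

# --- Illustrative sketch (not part of the pipeline) -------------------------
# `_get_length` is a project helper that turns a region's WKT shape into a
# road-network length. One plausible implementation, assuming osmnx (an
# assumption; the real helper may query Overpass or another source directly):
def _get_length_sketch(region_wkt, config=None):
    import osmnx as ox
    from shapely import wkt as _wkt

    # Download the drivable network inside the polygon and sum the edge
    # lengths (osmnx stores them in metres).
    graph = ox.graph_from_polygon(_wkt.loads(region_wkt),
                                  network_type="drive")
    return sum(d["length"] for _, _, d in graph.edges(data=True))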