import pandas as pd
import yaml
from shapely import wkt

# NOTE: project helpers used below (get_data_from_athena, insert_into,
# create_table, simplify, _save_local, _get_timezone, _bigger_polygon,
# _reescale, _wkt_to_geojson, _get_cell, _all_dates, _add_date_slug,
# _prepare_to_partition, _get_length, _write_sheets_table) are assumed to be
# imported from the surrounding package.


def _get_metadata_table(config):
    # Prefer the analysis variation table; fall back to the metadata-ready
    # table when the former does not exist yet.
    try:
        return get_data_from_athena(
            "select * from "
            f"{config['athena_database']}.{config['slug']}_analysis_metadata_variation ",
            config,
        )
    except Exception:
        return get_data_from_athena(
            "select * from "
            f"{config['athena_database']}.{config['slug']}_metadata_metadata_ready ",
            config,
        )

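# --- Illustrative sketch (not part of the pipeline) -------------------------
# `get_data_from_athena` is a project helper. A minimal version might look
# like the sketch below, assuming awswrangler (suggested by the
# `wrangler=True` flags used elsewhere in this module). The name
# `_get_data_from_athena_sketch` is hypothetical and only avoids shadowing
# the real helper, which may handle credentials, retries, and the optional
# `config` argument differently.
def _get_data_from_athena_sketch(query, config=None):
    import awswrangler as wr  # local import: illustration only

    database = config["athena_database"] if config else "default"
    # Runs the SQL in Athena and returns the result as a pandas DataFrame.
    return wr.athena.read_sql_query(query, database=database)

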
def check_existence(config):
    # True if the raw table for this job already exists in Athena.
    res = get_data_from_athena(
        f"show tables in {config['athena_database']} "
        f"'{config['slug']}_{config['raw_table']}_{config['name']}'")
    return len(res) > 0

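# Usage example (all values hypothetical): with this config,
# `check_existence` looks for a table named `dev_waze_analysis_daily`
# in the `mobility` database.
# check_existence({
#     "athena_database": "mobility",
#     "slug": "dev",
#     "raw_table": "waze",
#     "name": "analysis_daily",
# })

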
def country_coarse(config):
    # One record per (region_slug, group) pair at the coarse resolution (3).
    # "group" is quoted because it is a reserved word in Presto/Athena.
    insert_groups = get_data_from_athena(
        'select distinct region_slug, "group" from '
        f"{config['athena_database']}.{config['slug']}_cities_country_resolutions "
        "where resolution = 3",
        config,
    ).to_dict("records")
    insert_into.start(config, insert_groups)

def metadata_prepare(config):
    # Enrich the region metadata with a timezone derived from each region's
    # WKT shape.
    metadata = get_data_from_athena(
        "select * from "
        f"{config['athena_database']}.{config['slug']}_metadata_regions_metadata ",
        config,
    )
    metadata["timezone"] = metadata["region_shapefile_wkt"].apply(
        lambda x: _get_timezone(x, config))
    _save_local(metadata, config, wrangler=True)

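# --- Illustrative sketch (not part of the pipeline) -------------------------
# `_get_timezone` is a project helper. One plausible implementation, assuming
# the `timezonefinder` package, resolves the timezone at the shape's
# centroid; the real helper may differ.
def _get_timezone_sketch(region_wkt, config=None):
    from shapely import wkt as _wkt
    from timezonefinder import TimezoneFinder

    centroid = _wkt.loads(region_wkt).centroid
    # timezone_at returns an IANA name, e.g. "America/Sao_Paulo".
    return TimezoneFinder().timezone_at(lng=centroid.x, lat=centroid.y)

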
def country_cities(config):
    regions = get_data_from_athena(
        "select distinct region_slug "
        f"from {config['athena_database']}.{config['slug']}_{config['raw_table']}_{config['from_table']}",
        config,
    )["region_slug"].tolist()
    return [{
        "p_name": r,
        "p_path": f"region_slug={r}",
        "partition": r,
        "region_slug": r,
    } for r in regions]

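# Example of one job record (hypothetical slug), as consumed downstream;
# `p_path` looks like a Hive-style partition prefix, which is an inference
# from its shape rather than something stated here:
# {"p_name": "br_saopaulo", "p_path": "region_slug=br_saopaulo",
#  "partition": "br_saopaulo", "region_slug": "br_saopaulo"}

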
def should_create_table(config):
    # "$path" is Athena's pseudo-column with the S3 location of each row's
    # file; segment 7 (Presto arrays are 1-indexed) is expected to hold the
    # epoch-milliseconds folder the table was written to.
    try:
        current_millis = get_data_from_athena(
            f"""
            select split("$path", '/')[7] current_millis
            from {config['athena_database']}.{config['slug']}_{config['raw_table']}_{config['name']}
            limit 1""",
        )
    except Exception:
        # Table does not exist yet.
        current_millis = []
    if len(current_millis):
        return current_millis["current_millis"][0] != config.get(
            "current_millis")
    else:
        return True

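# Worked example (hypothetical S3 layout): Presto's split() is 1-indexed,
# so for a row stored at
#     s3://bucket/prefix/database/table/1596222668000/part-0.parquet
# split("$path", '/') yields
#     ['s3:', '', 'bucket', 'prefix', 'database', 'table', '1596222668000', ...]
# and element [7] is the epoch-milliseconds folder compared against
# config["current_millis"]. The exact segment index depends on how deep the
# pipeline writes its tables.

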
def grid(config):
    # Regions flagged for gridding, plus any explicitly selected regions.
    regions = list(
        get_data_from_athena(
            "select distinct region_slug from "
            f"{config['athena_database']}.{config['slug']}_metadata_metadata_prepare "
            "where grid = 'TRUE' "
            f"""or region_slug in ('{"','".join(config['selected_regions'])}')""",
            config,
        )["region_slug"])
    return [{
        "p_name": r,
        "p_path": f"region_slug={r}",
        "partition": r,
        "region_slug": r,
    } for r in regions]

def _get_remaining_dates(data, config):
    # Dates already materialised in Athena for each region. year/month/day
    # come in as separate columns, so they are parsed back into a date.
    existing_dates = get_data_from_athena(
        "select distinct region_slug, "
        "date_parse(concat(cast(year as varchar), '-', cast(month as varchar), "
        "'-', cast(day as varchar)), '%Y-%m-%d') date "
        f"from {config['athena_database']}.{config['slug']}_{config['raw_table']}_{config['name']} ",
        config,
    ).assign(date=lambda df: df["date"].apply(lambda x: x.date()))
    dates = _all_dates(config)
    # Cross every region with every expected date ...
    all_possibilities = (data.groupby(["region_slug"])
                         .apply(lambda x: dates[["date"]])
                         .reset_index()
                         .drop(columns="level_1"))
    # ... and keep only the (region, date) pairs not yet in Athena.
    return all_possibilities.merge(
        existing_dates,
        on=["region_slug", "date"],
        how="left",
        indicator=True,
    ).query('_merge == "left_only"')[["region_slug", "date"]]

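# --- Illustrative sketch (not part of the pipeline) -------------------------
# `_all_dates` is a project helper expected to return a DataFrame with a
# `date` column covering the analysis window. A plausible version, assuming
# hypothetical `start_date`/`end_date` config keys; it returns `date` objects
# to match the comparison in `_get_remaining_dates`.
def _all_dates_sketch(config):
    import pandas as pd

    return pd.DataFrame({
        "date": [
            d.date()
            for d in pd.date_range(config["start_date"], config["end_date"])
        ]
    })

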
def country_resolutions(config):
    metadata = get_data_from_athena(
        "select region_slug, region_shapefile_wkt from "
        f"{config['athena_database']}.{config['slug']}_metadata_metadata_prepare "
        f"""where region_slug in ('{"','".join([j['region_slug'] for j in config['jobs']])}')""",
        config,
    )
    # Coarse country-level grid: keep each region's biggest polygon, rescale
    # it, convert to GeoJSON and cover it with cells at the coarse resolution.
    country_grid = (
        metadata.set_index("region_slug")["region_shapefile_wkt"]
        .apply(_bigger_polygon)
        .apply(_reescale)
        .apply(_wkt_to_geojson)
        .reset_index()
        .groupby("region_slug")["region_shapefile_wkt"]
        .apply(lambda x: _get_cell(x, config["coarse_resolutions"],
                                   config["coarse_resolutions"]))
        .reset_index()
        .drop(columns="level_1"))
    # Fine city-level grid (resolution 10): cells over each city's convex
    # hull, renamed so the cell id becomes `id_parent` and the city name
    # becomes `id`.
    city_grid = pd.concat([
        (pd.read_csv("data/support_tables/all_cities/" + job["file"], sep="|")
         .set_index("region_name")["region_shapefile_wkt"]
         .apply(lambda x: wkt.loads(x).convex_hull.wkt)
         .apply(_wkt_to_geojson)
         .reset_index()
         .groupby("region_name")["region_shapefile_wkt"]
         .apply(lambda x: _get_cell(
             x,
             config["coarse_resolutions"],
             config["coarse_resolutions"],
             keep_wkt=True,
         ))
         .reset_index()
         .drop(columns=["level_1", "id_parent"])
         .rename(columns={"region_name": "id", "id": "id_parent"})
         .assign(region_slug=job["region_slug"], resolution=10))
        for job in config["jobs"]
    ])
    grid = pd.concat([city_grid, country_grid])
    grid["id_parent"] = grid["id_parent"].astype(str)
    grid["id"] = grid["id"].astype(str)
    create_table.from_local(grid, config, wrangler=True)

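# --- Illustrative sketch (not part of the pipeline) -------------------------
# `_wkt_to_geojson` and `_get_cell` are project helpers. The grid code looks
# H3-based (integer resolutions, parent/child cell ids), so a plausible shape
# for them, assuming the h3-py 3.x API, is sketched below. The real helpers
# also handle rescaling, multiple resolutions, and the `keep_wkt` flag.
def _wkt_to_geojson_sketch(region_wkt):
    from shapely import wkt as _wkt
    from shapely.geometry import mapping

    # GeoJSON-like dict, e.g. {"type": "Polygon", "coordinates": [...]}.
    return mapping(_wkt.loads(region_wkt))


def _get_cell_sketch(geojson, resolution):
    import h3
    import pandas as pd

    # All H3 cells whose centroids fall inside the polygon.
    cells = h3.polyfill(geojson, resolution, geo_json_conformant=True)
    return pd.DataFrame({
        "id": list(cells),
        "id_parent": [h3.h3_to_parent(c, resolution - 1) for c in cells],
    })

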
def _region_slug_partition(config):
    data = _get_metadata_table(config)
    if config.get("selected_regions"):
        data = data[data["region_slug"].isin(config.get("selected_regions"))]
    elif config.get("if_exists") == "append":
        # Skip shapes already fully processed; the variation table may not
        # exist yet.
        try:
            skip = get_data_from_athena(
                "select distinct region_shapefile_wkt from "
                f"{config['athena_database']}.{config['slug']}_analysis_metadata_variation "
                "where n_days is not null",
                config,
            )
        except Exception:
            skip = pd.DataFrame([], columns=["region_shapefile_wkt"])
        data = data[~data["region_shapefile_wkt"]
                    .isin(skip["region_shapefile_wkt"])]
    if config["name"] == "analysis_daily":
        data = data[~data["region_slug"].isin(config["cv_exception"])]
    if config.get("mode") == "incremental":
        remaining_dates = _get_remaining_dates(data, config)
        if len(remaining_dates) == 0:
            return None
        data = _add_date_slug(data, remaining_dates, config)
    return _prepare_to_partition(data, config)

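# Usage example (all values hypothetical): an incremental run returns only
# the partition jobs whose (region, date) pairs are missing from Athena,
# or None when nothing is pending.
# _region_slug_partition({
#     "athena_database": "mobility",
#     "slug": "dev",
#     "raw_table": "waze",
#     "name": "analysis_daily",
#     "mode": "incremental",
#     "if_exists": "append",
#     "cv_exception": [],
# })

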
def write_index(config):
    for table in config["to_write"]:
        df = get_data_from_athena(
            "select * from "
            f"{config['athena_database']}.{config['slug']}_{table['table']}")
        # Shrink WKT shapes before writing them to a spreadsheet cell.
        if "region_shapefile_wkt" in df.columns:
            df["region_shapefile_wkt"] = df["region_shapefile_wkt"].apply(
                lambda x: str(simplify(wkt.loads(x))))
        if table.get("overall_drop"):
            df = df.drop(columns=table["overall_drop"])
        with open("configs/drive-config.yaml", "r") as f:
            drive_config = yaml.safe_load(f)
        if config["slug"] == "dev":
            _write_sheets_table(
                df,
                table["worksheet"],
                config,
                drive_config[config["name"]][config["slug"]],
            )
        elif config["slug"] == "prod":
            _write_sheets_table(
                df,
                table["worksheet"],
                config,
                drive_config[config["name"]][config["slug"]],
            )
            # The public copy additionally drops the non-public columns.
            df = df.drop(columns=table["public_drop"])
            _write_sheets_table(df, table["worksheet"], config,
                                drive_config[config["name"]]["public"])

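# --- Illustrative sketch (not part of the pipeline) -------------------------
# `_write_sheets_table` is a project helper. A plausible version, assuming
# gspread plus gspread_dataframe and that each `drive_config` entry carries a
# spreadsheet key (both assumptions, not taken from this module):
def _write_sheets_table_sketch(df, worksheet, config, drive_config):
    import gspread
    from gspread_dataframe import set_with_dataframe

    client = gspread.service_account()  # credentials discovery is assumed
    sheet = client.open_by_key(drive_config["spreadsheet_key"])
    # Replace the worksheet's contents with the DataFrame.
    ws = sheet.worksheet(worksheet)
    ws.clear()
    set_with_dataframe(ws, df)

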
def metadata_osm_length(config):
    # Fetch the regions' configuration
    metadata = get_data_from_athena(
        "select region_slug, region_shapefile_wkt, rerun from "
        f"{config['athena_database']}.{config['slug']}_metadata_metadata_prepare "
        "order by region_slug",
        config,
    )
    # Regions flagged for a forced rerun
    rerun = metadata[metadata["rerun"] == "TRUE"]
    # Restrict to the selected regions if any were given, else update all
    if config.get("selected_regions"):
        metadata = metadata[metadata["region_slug"].isin(
            config.get("selected_regions"))]
    # Get the current state of the table; it may not exist yet
    try:
        current = get_data_from_athena(
            "select region_slug, region_shapefile_wkt, osm_length from "
            f"{config['athena_database']}.{config['slug']}_metadata_metadata_osm_length "
            "order by region_slug",
            config,
        )
    except Exception:
        current = pd.DataFrame(
            [], columns=["region_slug", "region_shapefile_wkt", "osm_length"])
    # Update only regions whose shapes changed or that do not exist yet
    if config["mode"] == "overwrite_partitions":
        try:
            skip = current[current["osm_length"] != ""][[
                "region_shapefile_wkt"
            ]]
        except Exception:
            skip = pd.DataFrame([], columns=["region_shapefile_wkt"])
        selected = metadata[~metadata["region_shapefile_wkt"].isin(
            skip["region_shapefile_wkt"])][["region_slug",
                                            "region_shapefile_wkt"]]
        selected = pd.concat([selected, rerun]).drop_duplicates()
        # Calculate the OSM length for the selected regions
        selected["osm_length"] = selected["region_shapefile_wkt"].apply(
            lambda x: _get_length(x, config))
        selected = selected.sort_values(by="region_slug")
        # If the current table is empty, initialize it with the selection;
        # otherwise patch it in place (DataFrame.update aligns on index)
        if len(current):
            current.update(
                selected[["region_slug", "region_shapefile_wkt",
                          "osm_length"]])
        else:
            current = selected[[
                "region_slug", "region_shapefile_wkt", "osm_length"
            ]]
    if config["verbose"]:
        print(current)
        print(list(current["region_slug"]))
    if len(metadata):
        _save_local(current, config, wrangler=True)
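

# --- Illustrative sketch (not part of the pipeline) -------------------------
# `_get_length` is a project helper that turns a region's WKT shape into a
# road-network length. One plausible implementation, assuming osmnx (an
# assumption; the real helper may query Overpass or another source directly):
def _get_length_sketch(region_wkt, config=None):
    import osmnx as ox
    from shapely import wkt as _wkt

    # Download the drivable network inside the polygon and sum the edge
    # lengths (osmnx stores them in metres).
    graph = ox.graph_from_polygon(_wkt.loads(region_wkt),
                                  network_type="drive")
    return sum(d["length"] for _, _, d in graph.edges(data=True))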