def metadata_osm_length(config):
    """Compute total accepted-OSM road length per region and persist it.

    Reads the regions-metadata table, optionally skips regions whose
    length is already computed (overwrite_partitions mode), re-adds
    regions flagged rerun = 'TRUE', computes the road length inside each
    region's WKT shape, and writes the result as a partitioned parquet
    dataset registered in Athena.

    :param config: pipeline configuration dict (athena_database, slug,
        mode, accepted_osm_keys, verbose, bucket/prefix/raw_table/name...).
    """
    metadata = get_data_from_athena(
        "select * from "
        f"{config['athena_database']}.{config['slug']}_metadata_regions_metadata ",
        config,
    )
    # Regions explicitly flagged for reprocessing are always kept.
    rerun = metadata[metadata["rerun"] == "TRUE"]

    if config["mode"] == "overwrite_partitions":
        # Skip regions that already have a computed osm_length. The results
        # table may not exist on the first run, hence the fallback.
        try:
            skip = get_data_from_athena(
                "select distinct region_shapefile_wkt from "
                f"{config['athena_database']}.{config['slug']}_metadata_metadata_osm_length "
                "where osm_length is not null",
                config,
            )
        except Exception:  # was a bare `except:`; don't swallow KeyboardInterrupt/SystemExit
            skip = pd.DataFrame([], columns=["region_shapefile_wkt"])
        metadata = metadata[
            ~metadata["region_shapefile_wkt"].isin(skip["region_shapefile_wkt"])
        ]

    def _get_length(x):
        # Total length of roads within the shape, restricted to the
        # configured OSM highway keys.
        lengths = osm_road_length.get(wkt.loads(x))
        lengths = lengths[lengths.index.isin(config["accepted_osm_keys"])]
        return lengths["length"].sum()

    metadata = pd.concat([metadata, rerun]).drop_duplicates()
    if config["verbose"]:
        print(list(metadata["region_slug"]))

    metadata["osm_length"] = metadata["region_shapefile_wkt"].apply(_get_length)

    if len(metadata):
        # Write only when there is something to write; the result of
        # to_parquet was previously bound to an unused local.
        wr.s3.to_parquet(
            df=metadata,
            path="s3://{bucket}/{prefix}/{slug}/{raw_table}/{name}".format(**config),
            dataset=True,
            database=config["athena_database"],
            table="{slug}_{raw_table}_{name}".format(**config),
            mode=config["mode"],
            partition_cols=["region_slug"],
            boto3_session=boto3.Session(region_name="us-east-1"),
        )
def check_existence(config):
    """Return True iff the target Athena table already exists."""
    table_name = f"{config['slug']}_{config['raw_table']}_{config['name']}"
    listing = get_data_from_athena(
        f"show tables in {config['athena_database']} '{table_name}'"
    )
    return len(listing) > 0
def coarse(config):
    """Start insert-into jobs for every (region_slug, group) at resolution 7."""
    query = (
        'select distinct region_slug, "group" from '
        f"{config['athena_database']}.{config['slug']}_grid_resolutions "
        "where resolution = 7"
    )
    groups = get_data_from_athena(query, config).to_dict("records")
    insert_into.start(config, groups)
def _region_slug_partition(config):
    """Fetch ready-region records, tagging each with its partition slug."""
    rows = get_data_from_athena(
        "select * from "
        f"{config['athena_database']}.{config['slug']}_metadata_metadata_ready "
    ).to_dict("records")
    # Each record's partition is simply its own region_slug.
    return [{**row, "partition": row["region_slug"]} for row in rows]
def resolutions(config):
    """Build and register the H3 cell grid for regions flagged for gridding."""
    selected = "','".join(config["selected_regions"])
    metadata = get_data_from_athena(
        "select region_slug, region_shapefile_wkt from "
        f"{config['athena_database']}.{config['slug']}_metadata_metadata_prepare "
        "where grid = 'TRUE' "
        f"or region_slug in ('{selected}')",
        config,
    )
    metadata["wkt"] = metadata["region_shapefile_wkt"].apply(_reescale)

    # One row per region: the set of H3 ids/wkts covering its shape.
    grid = (
        metadata.groupby("region_slug")["wkt"]
        .apply(lambda shapes: get_cells(shapes, config))
        .reset_index()
    )
    create_table.from_local(grid, config, wrangler=True)
def write_index(config):
    """Push each configured Athena table to its Google Sheets worksheet.

    For every entry in config["to_write"]: load the table, simplify WKT
    geometries (to keep cells small), drop "overall_drop" columns, and
    write to the environment-specific spreadsheet. In prod, additionally
    write a public copy with "public_drop" columns removed.
    """
    # Load the drive config once (was reloaded on every loop iteration) and
    # close the file handle. safe_load avoids arbitrary-object construction;
    # plain yaml.load without a Loader is rejected by PyYAML >= 6.
    with open("configs/drive-config.yaml", "r") as fh:
        drive_config = yaml.safe_load(fh)

    for table in config["to_write"]:
        df = get_data_from_athena(
            "select * from "
            f"{config['athena_database']}.{config['slug']}_{table['table']}"
        )

        if "region_shapefile_wkt" in df.columns:
            # Simplify geometries so the serialized WKT fits in a sheet cell.
            df["region_shapefile_wkt"] = df["region_shapefile_wkt"].apply(
                lambda x: str(simplify(wkt.loads(x)))
            )

        if table.get("overall_drop"):
            # `df.drop(cols, 1)` used the positional axis argument, which
            # was removed in pandas 2.0.
            df = df.drop(columns=table["overall_drop"])

        # dev and prod previously had duplicated, identical branches; prod
        # additionally publishes a public copy with sensitive columns removed.
        if config["slug"] in ("dev", "prod"):
            _write_sheets_table(
                df,
                table["worksheet"],
                config,
                drive_config[config["name"]][config["slug"]],
            )
            if config["slug"] == "prod":
                df = df.drop(columns=table["public_drop"])
                _write_sheets_table(
                    df,
                    table["worksheet"],
                    config,
                    drive_config[config["name"]]["public"],
                )
def write_index(config):
    """Write the raw-table index to Google Sheets for the current environment.

    In prod, also write a public copy with observed/expected columns dropped.
    """
    df = get_data_from_athena(
        "select * from "
        f"{config['athena_database']}.{config['slug']}_"
        f"{config['raw_table']}_index"
    )

    # safe_load avoids arbitrary-object construction; plain yaml.load without
    # a Loader is rejected by PyYAML >= 6. The `with` block also closes the
    # previously-leaked file handle.
    with open("configs/drive-config.yaml", "r") as fh:
        drive_config = yaml.safe_load(fh)

    # dev and prod previously had duplicated, identical branches; prod
    # additionally publishes a public copy.
    if config["slug"] in ("dev", "prod"):
        _write_sheets_table(
            df, config, drive_config[config["name"]][config["slug"]]
        )
        if config["slug"] == "prod":
            public_drop = [
                "observed",
                "expected_2019",
                "expected_2020",
                "dashboard",
                "ratio_19",
            ]
            # `df.drop(rows, 1)` used the positional axis argument, which was
            # removed in pandas 2.0; these are columns, not rows.
            df = df.drop(columns=public_drop)
            _write_sheets_table(df, config, drive_config[config["name"]]["public"])
def _region_slug_partition(config):
    """Build the list of region records to process, with partition paths.

    Reads the ready-regions metadata, optionally filters out regions that
    were already processed (append mode) or excluded by cross-validation,
    optionally keeps only coefficient-approved regions, re-adds regions
    flagged rerun = 'TRUE', then annotates each record with `partition`,
    `p_path`, and (for sampled daily-analysis regions) `dates`.

    :param config: pipeline configuration dict.
    :return: list of record dicts ready for partitioned processing.
    """
    data = get_data_from_athena(
        "select * from "
        f"{config['athena_database']}.{config['slug']}_metadata_metadata_ready ",
        config,
    )
    # Regions explicitly flagged for reprocessing are always kept.
    rerun = data[data["rerun"] == "TRUE"]

    if config.get("if_exists") == "append":
        # Skip regions already present in the variation table; it may not
        # exist yet on a first run, hence the fallback.
        try:
            skip = get_data_from_athena(
                "select distinct region_shapefile_wkt from "
                f"{config['athena_database']}.{config['slug']}_analysis_metadata_variation "
                "where n_days is not null",
                config,
            )
        except Exception:  # was a bare `except:`; don't swallow KeyboardInterrupt/SystemExit
            skip = pd.DataFrame([], columns=["region_shapefile_wkt"])
        data = data[
            ~data["region_shapefile_wkt"].isin(skip["region_shapefile_wkt"])
        ]

    if config["name"] == "analysis_daily":
        data = data[~data["region_slug"].isin(config["cv_exception"])]

    if config.get("filter_by_coef"):
        # Keep only regions approved by the CV coefficient check, plus the
        # configured exceptions.
        exceptions = "','".join(config["cv_exception"])
        skip = get_data_from_athena(
            "select region_slug from "
            f"{config['athena_database']}.{config['slug']}_analysis_metadata_variation "
            "where (weekly_approved = true or daily_approved = true) "
            f"or (region_slug in ('{exceptions}')) ",
            config,
        )
        data = data[data["region_slug"].isin(skip["region_slug"])]

    if config.get("sample_cities"):
        data = data[: config["sample_cities"]]

    data = pd.concat([data, rerun]).drop_duplicates()
    data = data.to_dict("records")
    for d in data:
        d["partition"] = d["region_slug"]
        if config["name"] == "analysis_daily":
            if d["region_slug"] in config["sampled"]:
                d["dates"] = sample_query_weeks(
                    config["full_2019_interval"]["start"],
                    config["full_2019_interval"]["end"],
                )
            # deepcopy removed: str.format already returns a fresh string.
            d["p_path"] = "country_iso={country_iso}/{partition}".format(**d)
        else:
            d["p_path"] = "region_slug={partition}".format(**d)
    return data