def _get_id_dfs() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """ Get dataframes with ids from database """

    # Build the pg_id -> ug_id mapping table before reading it back
    db.execute_query(query=io.read_file(
        path=os.path.join(os.path.dirname(__file__), "pg_ug.sql")))
    df_pg_ug = db.db_to_df(fqtable="spei_v2.pg_ug", ids=["pg_id"])

    # (year, month) -> month_id
    df_m = (db.db_to_df(fqtable="staging.month",
                        cols=["id"],
                        ids=["year_id", "month"])
            .reset_index()
            .rename(columns={"year_id": "year", "id": "month_id"})
            .set_index(["year", "month"]))

    # (ug_id, month_id) -> (pg_id, priogrid_month_id)
    df_ug_pgm = (db.db_to_df(
        fqtable="staging.priogrid_month",
        cols=["id"],
        ids=["priogrid_gid", "month_id"],
    ).reset_index()
     .rename(columns={"id": "priogrid_month_id", "priogrid_gid": "pg_id"})
     .set_index(["pg_id", "month_id"])
     .join(df_pg_ug)
     .reset_index()
     .set_index(["ug_id", "month_id"])[["pg_id", "priogrid_month_id"]])

    return df_pg_ug, df_m, df_ug_pgm
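
# Illustrative sketch, not part of the module: the chained
# reset_index/rename/set_index calls above exist so that later joins align
# on matching index levels. A minimal self-contained version of the same
# pattern with synthetic data (all names below are hypothetical):
def _demo_index_alignment() -> None:
    import pandas as pd

    # staging.month look-alike: map (year, month) -> month_id
    df_m = (pd.DataFrame({
        "year_id": [2020, 2020],
        "month": [1, 2],
        "id": [481, 482],
    }).rename(columns={"year_id": "year", "id": "month_id"})
      .set_index(["year", "month"]))

    # Any frame indexed by (year, month) now joins losslessly on the index
    df_obs = pd.DataFrame(
        {"value": [0.1, 0.2]},
        index=pd.MultiIndex.from_tuples([(2020, 1), (2020, 2)],
                                        names=["year", "month"]))
    joined = df_obs.join(df_m)
    assert list(joined["month_id"]) == [481, 482]
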
def finalise_pgdata(
        df_static: pd.DataFrame,
        df_yearly: pd.DataFrame,
        df_core: pd.DataFrame,
) -> pd.DataFrame:
    """ Join all pgdata to PGY level """

    spec = io.load_yaml(os.path.join(os.path.dirname(__file__), "spec.yaml"))
    df_skeleton = db.db_to_df(
        fqtable="skeleton.pgy_global",
        ids=["year", "pg_id"],
        cols=["year", "pg_id"],
    )
    df = df_skeleton.join(df_static).join(df_core).join(df_yearly).sort_index()

    # Check that the join keeps exactly the skeleton rows
    if not len(df) == len(df_skeleton):
        raise RuntimeError("Join was supposed to have all skeleton rows.")

    # Compute the _s, _y transformations
    df = compute_greatests(df)

    # Subset to wanted columns
    df = df[spec["cols_data"]]

    df = missing.extrapolate(df)
    df = df.add_prefix("pgd_")

    return df
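
# Illustrative sketch, not part of the pipeline: the length guard above
# catches a join that silently inflates the row count (e.g. a duplicated
# key on the right side of a left join). With synthetic frames:
def _demo_skeleton_join_guard() -> None:
    import pandas as pd

    idx = pd.MultiIndex.from_product([[2019, 2020], [1, 2]],
                                     names=["year", "pg_id"])
    df_skeleton = pd.DataFrame(index=idx)
    df_features = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0]}, index=idx)

    df = df_skeleton.join(df_features)  # DataFrame.join defaults to how="left"
    if not len(df) == len(df_skeleton):
        raise RuntimeError("Join was supposed to have all skeleton rows.")
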
def load_fvp():
    """ Load FVP data """
    log.info("Started loading FVP")

    with tempfile.TemporaryDirectory() as tempdir:
        _ = common.get_files_latest_fetch(name="fvp", tempdir=tempdir)
        df = io.csv_to_df(path=os.path.join(tempdir, "MasterData.csv"))

    df = df.drop(columns=["Conflict"])
    df = df.rename(columns=lambda col: col.lower())
    df = df.set_index(["year", "gwno"])

    spec = io.load_yaml(
        path=os.path.join(os.path.dirname(__file__), "spec.yaml"))
    df = df[spec["cols"]]

    log.debug("Fetching df_keys")
    # Keep the latest country per gwno (highest country_id)
    query = "SELECT id AS country_id, gwcode AS gwno FROM staging.country;"
    df = df.join(
        db.query_to_df(query=query)
        .sort_values(by="country_id", ascending=False)
        .drop_duplicates(subset=["gwno"])
        .set_index(["gwno"]))

    log.debug("Joining to skeleton")
    df = db.db_to_df(
        fqtable="skeleton.cy_global",
        ids=["year", "country_id"],
        cols=["year", "country_id"],
    ).join(df.reset_index().set_index(["year", "country_id"]), how="left")
    df = df.drop(columns=["gwno"])

    # Add consistent fvp_ prefix
    df = df.rename(
        columns=lambda col: col if col.startswith("fvp_") else f"fvp_{col}")
    df = df.sort_index(axis=1).sort_index(axis=0)

    # Push raw
    db.create_schema("fvp_v2")
    db.df_to_db(fqtable="fvp_v2.cy_unimp", df=df)

    # Extrapolate before imputing
    df = missing.extrapolate(df)

    # Impute and push
    for i, df_imp in enumerate(
            missing.impute_mice_generator(
                df=df,
                n_imp=10,
                estimator=DecisionTreeRegressor(max_features="sqrt"),
                parallel=True,
            )):
        db.df_to_db(df=df_imp, fqtable=f"fvp_v2.cy_imp_sklearn_{i}")

    log.info("Finished loading FVP")
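
# Hedged sketch: `missing.impute_mice_generator` is a project-internal
# helper whose exact behavior is not shown here. A rough stand-in for the
# multiple-imputation loop, built on scikit-learn's IterativeImputer with
# one imputed copy per random seed (all names illustrative):
def _demo_multiple_imputation() -> None:
    import numpy as np
    import pandas as pd
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer
    from sklearn.tree import DecisionTreeRegressor

    df = pd.DataFrame({"a": [1.0, np.nan, 3.0, 4.0],
                       "b": [2.0, 2.5, np.nan, 5.0]})
    for i in range(3):
        imputer = IterativeImputer(
            estimator=DecisionTreeRegressor(max_features="sqrt"),
            random_state=i)
        df_imp = pd.DataFrame(imputer.fit_transform(df),
                              index=df.index, columns=df.columns)
        # each df_imp would then be pushed, e.g. to a cy_imp_sklearn_{i} table
        assert not df_imp.isnull().any().any()
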
def load_cdum() -> None:
    """ Load country dummies """

    df = db.db_to_df(fqtable="staging.country", cols=["id"], ids=["id"])
    df = df.reset_index().rename(columns={"id": "country_id"})
    # Dummy out the country_id itself, keeping it as the index
    df["to_dummy"] = df["country_id"]
    df = df.set_index(["country_id"])
    df = pd.get_dummies(df.to_dummy, prefix="cdum")

    db.drop_schema("cdum")
    db.create_schema("cdum")
    db.df_to_db(fqtable="cdum.c", df=df)
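
# Illustrative sketch, not part of the module: the dummy construction above
# reduces to pd.get_dummies over the id column itself, e.g.:
def _demo_country_dummies() -> None:
    import pandas as pd

    df = pd.DataFrame({"country_id": [1, 2, 3]})
    df["to_dummy"] = df["country_id"]
    df = df.set_index(["country_id"])
    dummies = pd.get_dummies(df.to_dummy, prefix="cdum")
    # One indicator column per id, indexed by country_id
    assert list(dummies.columns) == ["cdum_1", "cdum_2", "cdum_3"]
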
def _load_and_stage_wdi() -> pd.DataFrame:
    log.debug("Reading raw fetch.")
    with tempfile.TemporaryDirectory() as tempdir:
        paths = common.get_files_latest_fetch(name="wdi", tempdir=tempdir)
        path_zip = [
            path for path in paths
            if os.path.basename(path) == "WDI_csv.zip"
        ].pop()
        io.unpack_zipfile(path_zip, destination=tempdir)
        df = io.csv_to_df(path=os.path.join(tempdir, "WDIData.csv"))
        # TODO: Build codebook from this
        _ = io.csv_to_df(path=os.path.join(tempdir, "WDISeries.csv"))

    log.debug("Preparing WDI.")
    df = _flip_wdi(df=df)

    # Get country_id isoab matching
    log.debug("Fetching df_keys")
    df_keys = db.query_to_df(query="""
        SELECT id AS country_id, isoab AS countrycode
        FROM staging.country;
        """)
    # Drop duplicates, Soviet Union, Yugoslavia etc.
    # Keep those with highest country_id, i.e. latest.
    df_keys = (df_keys.sort_values(by="country_id", ascending=False)
               .drop_duplicates(subset=["countrycode"])
               .set_index(["countrycode"]))

    # Join in keys
    log.debug("Joining in df_keys")
    df = df.join(df_keys)
    df = (df.reset_index()
          .dropna(subset=["country_id"])
          .set_index(["year", "country_id"])
          .add_prefix("wdi_")
          .drop(columns=["wdi_countrycode"]))

    # Stage to CY skeleton
    log.debug("Fetching skeleton")
    df_skeleton = db.db_to_df(
        fqtable="skeleton.cy_global",
        cols=["year", "country_id"]).set_index(["year", "country_id"])
    df = df_skeleton.join(df, how="left")

    # Drop cols that have no values at all
    cols_completely_missing = missing.list_totally_missing(df)
    df = df.drop(columns=cols_completely_missing)
    log.debug(
        f"Dropped cols {cols_completely_missing} because they had no values")

    # Order columns and rows
    df = df.sort_index(axis=1).sort_index(axis=0)

    return df
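
# Illustrative sketch, not part of the module: the keep-latest-country trick
# above relies on drop_duplicates keeping the first row after a descending
# sort, so historical codes (Soviet Union, Yugoslavia, ...) lose to the
# country with the highest id. With synthetic keys:
def _demo_keep_latest_key() -> None:
    import pandas as pd

    df_keys = pd.DataFrame({
        "country_id": [10, 245, 40],
        "countrycode": ["RUS", "RUS", "SWE"],
    })
    df_keys = (df_keys.sort_values(by="country_id", ascending=False)
               .drop_duplicates(subset=["countrycode"])
               .set_index(["countrycode"]))
    assert df_keys.loc["RUS", "country_id"] == 245
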
def load_reign() -> None:
    """ Load reign """
    log.info("Started loading reign.")
    spec = io.load_yaml(os.path.join(os.path.dirname(__file__), "spec.yaml"))

    with tempfile.TemporaryDirectory() as tempdir:
        paths = common.get_files_latest_fetch(name="reign", tempdir=tempdir)
        path_csv = [path for path in paths if path.endswith(".csv")].pop()
        df = io.csv_to_df(path=path_csv)

    df = fix_ccodes(df, spec)
    df = encode_govt_dummies(df)
    df = df.set_index(["year", "month", "ccode"])

    # Join in country_id and month_id keys
    df = df.join(
        db.query_to_df(query="""
            SELECT id AS country_id, gwcode AS ccode
            FROM staging.country
            WHERE gweyear=2016;
            """).set_index(["ccode"]))
    df = df.join(
        db.query_to_df(query="""
            SELECT id AS month_id, year_id AS year, month
            FROM staging.month;
            """).set_index(["year", "month"]))
    df = df.reset_index().set_index(["month_id", "country_id"])
    df = df.drop(
        columns=["year", "month", "ccode", "country", "government", "leader"])

    df_skeleton = db.db_to_df(
        fqtable="skeleton.cm_global",
        cols=["month_id", "country_id"],
        ids=["month_id", "country_id"],
    )
    len_skel = len(df_skeleton)
    df = df_skeleton.join(df, how="left")
    if not len(df) == len_skel:
        raise RuntimeError(f"Join not correct, {len_skel} != {len(df)}")

    df = df.add_prefix("reign_")

    db.drop_schema("reign_v2")
    db.create_schema("reign_v2")
    db.df_to_db(df=df, fqtable="reign_v2.cm_unimp")
    db.df_to_db(
        df=missing.fill_groups_with_time_means(missing.extrapolate(df)),
        fqtable="reign_v2.cm_extrapolated",
    )
    log.info("Finished loading reign.")
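
# Hedged sketch: `missing.fill_groups_with_time_means` is project-internal
# and its exact behavior is not shown here. A plausible minimal equivalent
# fills each group's gaps with that group's mean over time (illustrative,
# assuming a (month_id, country_id) index):
def _demo_fill_group_means() -> None:
    import numpy as np
    import pandas as pd

    idx = pd.MultiIndex.from_product([[100, 101], [1, 2]],
                                     names=["month_id", "country_id"])
    df = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]}, index=idx)
    filled = df.groupby(level="country_id").transform(
        lambda s: s.fillna(s.mean()))
    assert filled.loc[(101, 1), "x"] == 1.0  # country 1 mean over time
    assert filled.loc[(100, 2), "x"] == 4.0  # country 2 mean over time
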
def _load_and_stage_vdem() -> pd.DataFrame:
    """ Load and stage VDEM """

    log.debug("Loading raw fetch data for VDEM.")
    with tempfile.TemporaryDirectory() as tempdir:
        _ = common.get_files_latest_fetch(name="vdem_v10", tempdir=tempdir)
        _ = io.unpack_zipfile(
            path_zip=os.path.join(
                tempdir, "Country_Year_V-Dem_Full+others_CSV_v10.zip"),
            destination=tempdir,
        )
        path_df = os.path.join(
            tempdir,
            "Country_Year_V-Dem_Full+others_CSV_v10",
            "V-Dem-CY-Full+Others-v10.csv",
        )
        df = (io.csv_to_df(path=path_df)
              .add_prefix("vdem_")
              .rename(columns={"vdem_year": "year"})
              .set_index(["year", "vdem_country_text_id"]))

    # Keep the latest country per isoab (highest country_id)
    df_keys = (db.query_to_df(query="""
        SELECT id AS country_id, isoab AS vdem_country_text_id
        FROM staging.country;
        """)
        .sort_values(by="country_id", ascending=False)
        .drop_duplicates(subset=["vdem_country_text_id"])
        .set_index(["vdem_country_text_id"]))
    df = df.join(df_keys)

    # Drop where join failed
    df = df.dropna(subset=["country_id"])
    df = df.reset_index().set_index(["year", "country_id"]).sort_index()
    log.debug(f"Share of missing values: {df.isnull().mean().mean():.3f}")

    # Stage to CY skeleton
    log.debug("Fetching skeleton")
    df_skeleton = db.db_to_df(
        fqtable="skeleton.cy_global",
        cols=["year", "country_id"]).set_index(["year", "country_id"])
    df = df_skeleton.join(df, how="left")

    cols_completely_missing = missing.list_totally_missing(df)
    df = df.drop(columns=cols_completely_missing)
    log.debug(
        f"Dropped cols {cols_completely_missing} because they had no values")

    # Order columns and rows, keeping only point-estimate columns
    df = df.rename(columns=lambda col: col.lower())
    cols = sorted(list(df.columns))
    cols = [col for col in cols if not col.endswith("_codehigh")]
    cols = [col for col in cols if not col.endswith("_codelow")]
    cols = [col for col in cols if not col.endswith("_ord")]
    cols = [col for col in cols if not col.endswith("_sd")]
    cols = [col for col in cols if not col.endswith("_mean")]
    cols = [col for col in cols if not col.endswith("_nr")]
    cols = [col for col in cols if not col.endswith("_osp")]
    df = df[cols]
    df = df.sort_index(axis=1).sort_index(axis=0)

    return df
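
# Illustrative sketch, not part of the module: the repeated list
# comprehensions above drop the VDEM variants (_codelow, _sd, ...) and keep
# only point estimates. str.endswith also accepts a tuple, so a tighter
# equivalent is:
def _demo_suffix_filter() -> None:
    cols = ["vdem_polyarchy", "vdem_polyarchy_codelow", "vdem_polyarchy_sd"]
    drop_suffixes = ("_codehigh", "_codelow", "_ord", "_sd",
                     "_mean", "_nr", "_osp")
    kept = sorted(col for col in cols if not col.endswith(drop_suffixes))
    assert kept == ["vdem_polyarchy"]
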
def load_icgcw() -> None:
    """ Load ICGCW """
    log.info("Starting ICGCW import")
    spec = io.load_yaml(
        path=os.path.join(os.path.dirname(__file__), "spec.yaml"))

    # Get all the entries as list of dicts
    entries = load_and_parse_entries(parallel=True)
    entries = drop_unnamed_entries(entries)
    # Some renames depend on splits so we do splits twice
    entries = split_multi_country_entries(entries)
    entries = do_renames(entries, spec)
    entries = split_multi_country_entries(entries)
    entries = drop_drops(entries, drops=spec["drops"])
    debug_log_unique_names(entries)

    df = pd.DataFrame(entries)
    df = set_dates(df)
    df = drop_duplicate_country_months(df)
    df = df.set_index(["year", "month", "name"])
    cols = [
        "alerts",
        "opportunities",
        "deteriorated",
        "improved",
        "unobserved",
    ]
    df = df[cols]

    df_c = (db.query_to_df(
        "SELECT id AS country_id, name FROM staging.country;")
        .sort_values(by=["country_id"], ascending=False)
        .drop_duplicates(subset=["name"])
        .set_index(["name"]))
    df_m = (db.query_to_df("""
        SELECT id AS month_id, year_id AS year, month
        FROM staging.month;
        """).set_index(["year", "month"]).sort_index())
    df_skeleton = db.db_to_df(
        fqtable="skeleton.cm_global",
        cols=["month_id", "country_id"],
        ids=["month_id", "country_id"],
    )

    df = (df.join(df_c)
          .join(df_m)
          .dropna(subset=["country_id"])
          .set_index(["month_id", "country_id"])
          .sort_index())
    df = df_skeleton.join(df, how="left")
    df = missing.extrapolate(df)
    # Months without an entry are unobserved; remaining gaps are zero counts
    df["unobserved"] = df["unobserved"].fillna(1)
    df = df.fillna(0)

    # Order cols and rows
    df = df.sort_index(axis=1).sort_index()
    df = df.add_prefix("icgcw_")

    # @TODO: Change this to icgcw without v2 once we drop flat
    schema = "icgcw_v2"
    db.drop_schema(schema)
    db.create_schema(schema)
    db.df_to_db(fqtable=f"{schema}.cm", df=df)
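
# Illustrative sketch, not part of the module: the fill order above matters.
# Rows missing after the skeleton join are first flagged unobserved=1, and
# only then are the remaining count columns zero-filled:
def _demo_unobserved_fill() -> None:
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "alerts": [1.0, np.nan],
        "unobserved": [0.0, np.nan],
    })
    df["unobserved"] = df["unobserved"].fillna(1)
    df = df.fillna(0)
    assert df.loc[1, "unobserved"] == 1.0 and df.loc[1, "alerts"] == 0.0
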
def refresh(self) -> pd.DataFrame:
    """ Refetch table from database and update cache """
    log.info(f"Refreshing {self.fqtable}")
    df = db.db_to_df(fqtable=self.fqtable, ids=self.ids)
    io.df_to_parquet(df=df, path=self.path)
    return df
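
# Hedged sketch: `io.df_to_parquet` is a project wrapper around pandas'
# parquet IO. A minimal stand-in for the cache round-trip (requires a
# parquet engine such as pyarrow; all names illustrative):
def _demo_parquet_cache() -> None:
    import os
    import tempfile

    import pandas as pd

    df = pd.DataFrame({"id": [1, 2], "x": [0.1, 0.2]}).set_index("id")
    with tempfile.TemporaryDirectory() as tempdir:
        path = os.path.join(tempdir, "cache.parquet")
        df.to_parquet(path)                # write the cache file
        df_back = pd.read_parquet(path)    # later reads hit the cache
        assert df.equals(df_back)
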