Example #1
def _get_id_dfs() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """ Get dataframes with ids from database """
    db.execute_query(query=io.read_file(
        path=os.path.join(os.path.dirname(__file__), "pg_ug.sql")))
    df_pg_ug = db.db_to_df(fqtable="spei_v2.pg_ug", ids=["pg_id"])

    df_m = (
        db.db_to_df(fqtable="staging.month", cols=["id"], ids=["year_id", "month"])
        .reset_index()
        .rename(columns={"year_id": "year", "id": "month_id"})
        .set_index(["year", "month"])
    )

    df_ug_pgm = (
        db.db_to_df(
            fqtable="staging.priogrid_month",
            cols=["id"],
            ids=["priogrid_gid", "month_id"],
        )
        .reset_index()
        .rename(columns={"id": "priogrid_month_id", "priogrid_gid": "pg_id"})
        .set_index(["pg_id", "month_id"])
        .join(df_pg_ug)
        .reset_index()
        .set_index(["ug_id", "month_id"])[["pg_id", "priogrid_month_id"]]
    )

    return df_pg_ug, df_m, df_ug_pgm
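
The id lookups above use the re-keying idiom that recurs throughout these loaders: pull the index back into columns, rename them, and set a new index. A minimal, self-contained sketch of the same idiom on invented toy data:

import pandas as pd

# Toy lookup keyed on (year_id, month), standing in for staging.month.
df = pd.DataFrame(
    {"year_id": [2019, 2019, 2020], "month": [11, 12, 1], "id": [467, 468, 469]}
).set_index(["year_id", "month"])

# Re-key: index -> columns, rename, set the new index.
df = (
    df.reset_index()
    .rename(columns={"year_id": "year", "id": "month_id"})
    .set_index(["year", "month"])
)
print(df)  # month_id now indexed by (year, month)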
Example #2
def finalise_pgdata(
    df_static: pd.DataFrame, df_yearly: pd.DataFrame, df_core: pd.DataFrame,
) -> pd.DataFrame:
    """ Join all pgdata to PGY level """
    spec = io.load_yaml(os.path.join(os.path.dirname(__file__), "spec.yaml"))
    df_skeleton = db.db_to_df(
        fqtable="skeleton.pgy_global",
        ids=["year", "pg_id"],
        cols=["year", "pg_id"],
    )
    df = df_skeleton.join(df_static).join(df_core).join(df_yearly).sort_index()

    # Check that the left join onto the skeleton keeps exactly the skeleton rows
    if not len(df) == len(df_skeleton):
        raise RuntimeError("Join was supposed to have all skeleton rows.")

    # Compute the _s, _y transformations
    df = compute_greatests(df)

    # Subset to wanted columns
    df = df[spec["cols_data"]]

    df = missing.extrapolate(df)

    df = df.add_prefix("pgd_")

    return df
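
The length check above is the skeleton-join pattern used by most of these loaders: left-join feature tables onto a skeleton index and verify that the row count is unchanged. A minimal sketch on invented toy data; DataFrame.join is a left join on the index, so missing feature rows become NaN and only duplicated keys could change the length:

import pandas as pd

# Toy skeleton of all (year, pg_id) rows the result must contain.
df_skeleton = pd.DataFrame(
    {"year": [2019, 2019, 2020, 2020], "pg_id": [1, 2, 1, 2]}
).set_index(["year", "pg_id"])

# Toy feature table that is missing one skeleton row.
df_feat = pd.DataFrame(
    {"year": [2019, 2019, 2020], "pg_id": [1, 2, 1], "rain": [0.3, 0.1, 0.7]}
).set_index(["year", "pg_id"])

df = df_skeleton.join(df_feat)  # left join on the index
if len(df) != len(df_skeleton):
    raise RuntimeError("Join was supposed to have all skeleton rows.")
print(df)  # the missing feature row appears as NaN, not dropped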
Example #3
def load_fvp():
    """ Load FVP data """
    log.info("Started loading FVP")
    with tempfile.TemporaryDirectory() as tempdir:
        _ = common.get_files_latest_fetch(name="fvp", tempdir=tempdir)
        df = io.csv_to_df(path=os.path.join(tempdir, "MasterData.csv"))

    df = df.drop(columns=["Conflict"])
    df = df.rename(columns=lambda col: col.lower())
    df = df.set_index(["year", "gwno"])

    spec = io.load_yaml(
        path=os.path.join(os.path.dirname(__file__), "spec.yaml"))
    df = df[spec["cols"]]

    log.debug("Fetching df_keys")
    query = "SELECT id AS country_id, gwcode AS gwno FROM staging.country;"
    df = df.join(
        db.query_to_df(query=query)
        .sort_values(by="country_id", ascending=False)
        .drop_duplicates(subset=["gwno"])
        .set_index(["gwno"])
    )

    log.debug("Joining to skeleton")
    df = db.db_to_df(
        fqtable="skeleton.cy_global",
        ids=["year", "country_id"],
        cols=["year", "country_id"],
    ).join(df.reset_index().set_index(["year", "country_id"]), how="left")

    df = df.drop(columns=["gwno"])

    # Add consistent fvp_ prefix
    df = df.rename(
        columns=lambda col: col if col.startswith("fvp_") else f"fvp_{col}")
    df = df.sort_index(axis=1).sort_index(axis=0)

    # Push raw
    db.create_schema("fvp_v2")
    db.df_to_db(fqtable="fvp_v2.cy_unimp", df=df)

    # Extrapolate before imputing
    df = missing.extrapolate(df)

    # Impute and push
    for i, df_imp in enumerate(
            missing.impute_mice_generator(
                df=df,
                n_imp=10,
                estimator=DecisionTreeRegressor(max_features="sqrt"),
                parallel=True,
            )):
        db.df_to_db(df=df_imp, fqtable=f"fvp_v2.cy_imp_sklearn_{i}")

    log.info("Finished loading FVP")
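
load_fvp maps gwno codes to country_id and resolves duplicate codes by keeping the row with the highest country_id: sort descending, then drop_duplicates keeps the first occurrence. A toy illustration of that idiom with invented values:

import pandas as pd

# Two country rows share gwno 345 (e.g. a predecessor/successor state).
df_keys = pd.DataFrame({"country_id": [10, 57, 112], "gwno": [345, 345, 700]})
df_keys = (
    df_keys.sort_values(by="country_id", ascending=False)
    .drop_duplicates(subset=["gwno"])  # keeps the first row, i.e. highest id
    .set_index(["gwno"])
)
print(df_keys)  # gwno 345 -> country_id 57, gwno 700 -> country_id 112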
Example #4
def load_cdum() -> None:
    """ Load country dummies """

    df = db.db_to_df(fqtable="staging.country", cols=["id"], ids=["id"])
    df = df.reset_index().rename(columns={"id": "country_id"})
    df["to_dummy"] = df["country_id"]
    df = df.set_index(["country_id"])
    df = pd.get_dummies(df.to_dummy, prefix="cdum")
    db.drop_schema("cdum")
    db.create_schema("cdum")
    db.df_to_db(fqtable="cdum.c", df=df)
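
The country-dummy table is plain pd.get_dummies applied to the country_id itself; a toy run with invented ids shows the resulting cdum_* columns:

import pandas as pd

s = pd.Series([1, 2, 4], index=pd.Index([1, 2, 4], name="country_id"), name="to_dummy")
df = pd.get_dummies(s, prefix="cdum")
print(df.columns.tolist())  # ['cdum_1', 'cdum_2', 'cdum_4']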
Example #5
def _load_and_stage_wdi() -> pd.DataFrame:

    log.debug("Reading raw fetch.")
    with tempfile.TemporaryDirectory() as tempdir:
        paths = common.get_files_latest_fetch(name="wdi", tempdir=tempdir)
        path_zip = [
            path for path in paths if os.path.basename(path) == "WDI_csv.zip"
        ].pop()
        io.unpack_zipfile(path_zip, destination=tempdir)
        df = io.csv_to_df(path=os.path.join(tempdir, "WDIData.csv"))
        # TODO: Build codebook from this
        _ = io.csv_to_df(path=os.path.join(tempdir, "WDISeries.csv"))

    log.debug("Preparing WDI.")
    df = _flip_wdi(df=df)
    # Get country_id isoab matching
    log.debug("Fetching df_keys")
    df_keys = db.query_to_df(query="""
        SELECT id AS country_id, isoab AS countrycode FROM staging.country;
        """)

    # Drop duplicates, Soviet Union, Yugoslavia etc
    # Keep those with highest country_id, i.e. latest.
    df_keys = (
        df_keys.sort_values(by="country_id", ascending=False)
        .drop_duplicates(subset=["countrycode"])
        .set_index(["countrycode"])
    )

    # Join in keys
    log.debug("Joining in df_keys")
    df = df.join(df_keys)
    df = (
        df.reset_index()
        .dropna(subset=["country_id"])
        .set_index(["year", "country_id"])
        .add_prefix("wdi_")
        .drop(columns=["wdi_countrycode"])
    )

    # Stage to CY skeleton
    log.debug("Fetching skeleton")
    df_skeleton = db.db_to_df(
        fqtable="skeleton.cy_global", cols=["year", "country_id"]
    ).set_index(["year", "country_id"])
    df = df_skeleton.join(df, how="left")

    # Drop cols that have no values at all
    cols_completely_missing = missing.list_totally_missing(df)
    df = df.drop(columns=cols_completely_missing)
    log.debug(
        f"Dropped cols {cols_completely_missing} because they had no values")

    # order columns and rows
    df = df.sort_index(axis=1).sort_index(axis=0)

    return df
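
missing.list_totally_missing is a project helper; with plain pandas the same "drop columns that have no values at all" step could look like this sketch on invented data:

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {"wdi_gdp": [1.0, np.nan, 2.0], "wdi_empty": [np.nan, np.nan, np.nan]}
)
cols_completely_missing = [col for col in df.columns if df[col].isna().all()]
df = df.drop(columns=cols_completely_missing)
print(cols_completely_missing)  # ['wdi_empty']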
Example #6
def load_reign() -> None:
    """ Load reign """
    log.info("Started loading reign.")

    spec = io.load_yaml(os.path.join(os.path.dirname(__file__), "spec.yaml"))
    with tempfile.TemporaryDirectory() as tempdir:
        paths = common.get_files_latest_fetch(name="reign", tempdir=tempdir)
        path_csv = [path for path in paths if path.endswith(".csv")].pop()
        df = io.csv_to_df(path=path_csv)

    df = fix_ccodes(df, spec)
    df = encode_govt_dummies(df)

    df = df.set_index(["year", "month", "ccode"])
    df = df.join(
        db.query_to_df(query="""
                SELECT id AS country_id, gwcode AS ccode
                FROM staging.country WHERE gweyear=2016;
                """).set_index(["ccode"]))
    df = df.join(
        db.query_to_df(query="""
            SELECT id AS month_id, year_id AS year, month FROM staging.month;
            """).set_index(["year", "month"]))
    df = df.reset_index().set_index(["month_id", "country_id"])
    df = df.drop(
        columns=["year", "month", "ccode", "country", "government", "leader"])

    df_skeleton = db.db_to_df(
        fqtable="skeleton.cm_global",
        cols=["month_id", "country_id"],
        ids=["month_id", "country_id"],
    )
    len_skel = len(df_skeleton)
    df = df_skeleton.join(df, how="left")
    if not len(df) == len_skel:
        raise RuntimeError(f"Join not correct, {len_skel} != {len(df)}")

    df = df.add_prefix("reign_")

    db.drop_schema("reign_v2")
    db.create_schema("reign_v2")
    db.df_to_db(df=df, fqtable="reign_v2.cm_unimp")

    db.df_to_db(
        df=missing.fill_groups_with_time_means(missing.extrapolate(df)),
        fqtable="reign_v2.cm_extrapolated",
    )

    log.info("Finished loading reign.")
Example #7
def _load_and_stage_vdem() -> pd.DataFrame:
    """ Load and stage VDEM """
    log.debug("Loading raw fetch data for VDEM.")
    with tempfile.TemporaryDirectory() as tempdir:
        _ = common.get_files_latest_fetch(name="vdem_v10", tempdir=tempdir)

        _ = io.unpack_zipfile(
            path_zip=os.path.join(
                tempdir, "Country_Year_V-Dem_Full+others_CSV_v10.zip"
            ),
            destination=tempdir,
        )
        path_df = os.path.join(
            tempdir,
            "Country_Year_V-Dem_Full+others_CSV_v10",
            "V-Dem-CY-Full+Others-v10.csv",
        )
        df = (
            io.csv_to_df(path=path_df)
            .add_prefix("vdem_")
            .rename(columns={"vdem_year": "year"})
            .set_index(["year", "vdem_country_text_id"])
        )

    df_keys = (
        db.query_to_df(
            query="""
            SELECT id AS country_id, isoab AS vdem_country_text_id
            FROM staging.country;
            """
        )
        .sort_values(by="country_id", ascending=False)
        .drop_duplicates(subset=["vdem_country_text_id"])
        .set_index(["vdem_country_text_id"])
    )
    df = df.join(df_keys)

    # Drop rows where the key join failed
    df = df.dropna(subset=["country_id"])
    df = df.reset_index().set_index(["year", "country_id"]).sort_index()
    log.debug(f"Share of missing values: {df.isnull().mean().mean():.3f}")

    # Stage to CY skeleton
    log.debug("Fetching skeleton")
    df_skeleton = db.db_to_df(
        fqtable="skeleton.cy_global", cols=["year", "country_id"]
    ).set_index(["year", "country_id"])
    df = df_skeleton.join(df, how="left")

    cols_completely_missing = missing.list_totally_missing(df)
    df = df.drop(columns=cols_completely_missing)
    log.debug(
        f"Dropped cols {cols_completely_missing} because they had no values"
    )

    # order columns and rows
    df = df.rename(columns=lambda col: col.lower())

    cols = sorted(list(df.columns))
    cols = [col for col in cols if not col.endswith("_codehigh")]
    cols = [col for col in cols if not col.endswith("_codelow")]
    cols = [col for col in cols if not col.endswith("_ord")]
    cols = [col for col in cols if not col.endswith("_sd")]
    cols = [col for col in cols if not col.endswith("_mean")]
    cols = [col for col in cols if not col.endswith("_nr")]
    cols = [col for col in cols if not col.endswith("_osp")]
    df = df[cols]

    df = df.sort_index(axis=1).sort_index(axis=0)

    return df
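
The repeated endswith filters above keep only the V-Dem point estimates. str.endswith also accepts a tuple, so the same selection can be written in one pass; a sketch with invented column names:

import pandas as pd

df = pd.DataFrame(
    columns=["vdem_v2x_example", "vdem_v2x_example_codelow",
             "vdem_v2x_example_codehigh", "vdem_v2x_example_sd"]
)
drop_suffixes = ("_codehigh", "_codelow", "_ord", "_sd", "_mean", "_nr", "_osp")
cols = [col for col in sorted(df.columns) if not col.endswith(drop_suffixes)]
print(cols)  # ['vdem_v2x_example']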
Example #8
def load_icgcw() -> None:
    """ Load ICGCW """
    log.info("Starting ICGCW import")

    spec = io.load_yaml(
        path=os.path.join(os.path.dirname(__file__), "spec.yaml"))
    # Get all the entries as list of dicts
    entries = load_and_parse_entries(parallel=True)
    entries = drop_unnamed_entries(entries)

    # Some renames depend on splits so we do splits twice
    entries = split_multi_country_entries(entries)
    entries = do_renames(entries, spec)
    entries = split_multi_country_entries(entries)

    entries = drop_drops(entries, drops=spec["drops"])

    debug_log_unique_names(entries)

    df = pd.DataFrame(entries)
    df = set_dates(df)
    df = drop_duplicate_country_months(df)
    df = df.set_index(["year", "month", "name"])
    cols = [
        "alerts",
        "opportunities",
        "deteriorated",
        "improved",
        "unobserved",
    ]
    df = df[cols]

    df_c = (
        db.query_to_df("SELECT id AS country_id, name FROM staging.country;")
        .sort_values(by=["country_id"], ascending=False)
        .drop_duplicates(subset=["name"])
        .set_index(["name"])
    )
    df_m = (
        db.query_to_df(
            "SELECT id AS month_id, year_id AS year, month FROM staging.month;"
        )
        .set_index(["year", "month"])
        .sort_index()
    )
    df_skeleton = db.db_to_df(
        fqtable="skeleton.cm_global",
        cols=["month_id", "country_id"],
        ids=["month_id", "country_id"],
    )

    df = (
        df.join(df_c)
        .join(df_m)
        .dropna(subset=["country_id"])
        .set_index(["month_id", "country_id"])
        .sort_index()
    )
    df = df_skeleton.join(df, how="left")

    df = missing.extrapolate(df)
    df["unobserved"] = df["unobserved"].fillna(1)
    df = df.fillna(0)

    # order cols and rows
    df = df.sort_index(axis=1).sort_index()
    df = df.add_prefix("icgcw_")

    # @TODO: Change this to icgcw without v2 once we drop flat
    schema = "icgcw_v2"
    db.drop_schema(schema)
    db.create_schema(schema)
    db.df_to_db(fqtable=f"{schema}.cm", df=df)
Example #9
def refresh(self) -> pd.DataFrame:
    """ Refetch table from database and update cache """
    log.info(f"Refreshing {self.fqtable}")
    df = db.db_to_df(fqtable=self.fqtable, ids=self.ids)
    io.df_to_parquet(df=df, path=self.path)
    return df
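
refresh writes the refetched table to a local parquet cache through the project's io helper. A minimal, self-contained version of that cache step with plain pandas (path and data invented; requires a parquet engine such as pyarrow):

import pandas as pd

# Toy stand-in for the refreshed table.
df = pd.DataFrame({"month_id": [489, 490], "value": [0.1, 0.2]}).set_index("month_id")

path = "/tmp/cache_example.parquet"  # hypothetical cache location
df.to_parquet(path)                  # write the cache
df_cached = pd.read_parquet(path)    # later reads can hit the cache instead of the DB
print(df_cached.equals(df))          # True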