Example #1
def load_and_parse_entries(parallel: bool = False) -> List[Dict[Any, Any]]:
    """ Parse all entries in path_tar """

    with tempfile.TemporaryDirectory() as tempdir:
        common.get_files_latest_fetch(name="icgcw", tempdir=tempdir)

        paths = []
        for root, _, files in os.walk(tempdir):
            for fname in files:
                path = os.path.join(root, fname)
                if fname.endswith(".html"):
                    paths.append(path)
                else:
                    msg = (f"Found a file that wasn't .html. "
                           f"something broke for: {path}")
                    raise RuntimeError(msg)

        log.info(f"Found {len(paths)} files to parse")

        if parallel:
            log.info(f"Parsing {len(paths)} files in parallel")
            # Sort here too so both branches yield the same ordering
            parsed_pages = joblib.Parallel(n_jobs=-1, verbose=10)(
                joblib.delayed(parse_page)(path) for path in sorted(paths))
        else:
            log.info(f"Parsing {len(paths)} files in sequence")
            parsed_pages = list(map(parse_page, sorted(paths)))

    parsed_entries = []
    for parsed_page in parsed_pages:
        for entry in parsed_page:
            parsed_entries.append(entry)

    return parsed_entries
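The parallel branch uses joblib's standard fan-out idiom: Parallel is called with a generator of delayed calls and returns results in input order. A minimal self-contained sketch of that idiom, with a hypothetical stand-in for parse_page:

import joblib

def parse(path):
    # Stand-in for parse_page: one parsed record per path.
    return {"path": path}

paths = ["a.html", "b.html", "c.html"]
# n_jobs=-1 uses all available cores; results preserve input order.
results = joblib.Parallel(n_jobs=-1)(
    joblib.delayed(parse)(path) for path in paths)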
Example #2
def load_spei(parallel: bool = True) -> None:
    """ Load SPEI """
    log.info("Started loading SPEI.")
    db.drop_schema("spei_v2")
    db.create_schema("spei_v2")
    df_pg_ug, df_m, df_ug_pgm = _get_id_dfs()

    with tempfile.TemporaryDirectory() as tempdir:
        paths = common.get_files_latest_fetch(name="spei", tempdir=tempdir)

        if parallel:
            with mp.Pool(processes=mp.cpu_count()) as pool:
                results = [
                    pool.apply_async(
                        func=_load_spei_from_path,
                        args=(path, df_pg_ug, df_m, df_ug_pgm),
                    ) for path in paths
                ]
                _ = [result.get() for result in results]
        else:
            for path in paths:
                _load_spei_from_path(path, df_pg_ug, df_m, df_ug_pgm)

    _stage_spei()
    log.info("Finished loading SPEI.")
Example #3
def load_fvp():
    """ Load FVP data """
    log.info("Started loading FVP")
    with tempfile.TemporaryDirectory() as tempdir:
        _ = common.get_files_latest_fetch(name="fvp", tempdir=tempdir)
        df = io.csv_to_df(path=os.path.join(tempdir, "MasterData.csv"))

    df = df.drop(columns=["Conflict"])
    df = df.rename(columns=lambda col: col.lower())
    df = df.set_index(["year", "gwno"])

    spec = io.load_yaml(
        path=os.path.join(os.path.dirname(__file__), "spec.yaml"))
    df = df[spec["cols"]]

    log.debug("Fetching df_keys")
    query = "SELECT id AS country_id, gwcode AS gwno FROM staging.country;"
    # Keep the row with the highest country_id per gwno, i.e. the latest
    df = df.join(
        db.query_to_df(query=query)
        .sort_values(by="country_id", ascending=False)
        .drop_duplicates(subset=["gwno"])
        .set_index(["gwno"]))

    log.debug("Joining to skeleton")
    df = db.db_to_df(
        fqtable="skeleton.cy_global",
        ids=["year", "country_id"],
        cols=["year", "country_id"],
    ).join(df.reset_index().set_index(["year", "country_id"]), how="left")

    df = df.drop(columns=["gwno"])

    # Add consistent fvp_ prefix
    df = df.rename(
        columns=lambda col: col if col.startswith("fvp_") else f"fvp_{col}")
    df = df.sort_index(axis=1).sort_index(axis=0)

    # Push raw
    db.create_schema("fvp_v2")
    db.df_to_db(fqtable="fvp_v2.cy_unimp", df=df)

    # Extrapolate before imputing
    df = missing.extrapolate(df)

    # Impute and push
    for i, df_imp in enumerate(
            missing.impute_mice_generator(
                df=df,
                n_imp=10,
                estimator=DecisionTreeRegressor(max_features="sqrt"),
                parallel=True,
            )):
        db.df_to_db(df=df_imp, fqtable=f"fvp_v2.cy_imp_sklearn_{i}")

    log.info("Fininshed loading FVP")
Example #4
def _load_and_stage_wdi() -> pd.DataFrame:

    log.debug("Reading raw fetch.")
    with tempfile.TemporaryDirectory() as tempdir:
        paths = common.get_files_latest_fetch(name="wdi", tempdir=tempdir)
        path_zip = [
            path for path in paths if os.path.basename(path) == "WDI_csv.zip"
        ].pop()
        io.unpack_zipfile(path_zip, destination=tempdir)
        df = io.csv_to_df(path=os.path.join(tempdir, "WDIData.csv"))
        # TODO: Build codebook from this
        _ = io.csv_to_df(path=os.path.join(tempdir, "WDISeries.csv"))

    log.debug("Preparing WDI.")
    df = _flip_wdi(df=df)
    # Get country_id isoab matching
    log.debug("Fetching df_keys")
    df_keys = db.query_to_df(query="""
        SELECT id AS country_id, isoab AS countrycode FROM staging.country;
        """)

    # Drop duplicates, Soviet Union, Yugoslavia etc
    # Keep those with highest country_id, i.e. latest.
    df_keys = (
        df_keys.sort_values(by="country_id", ascending=False)
        .drop_duplicates(subset=["countrycode"])
        .set_index(["countrycode"])
    )

    # Join in keys
    log.debug("Joining in df_keys")
    df = df.join(df_keys)
    df = (
        df.reset_index()
        .dropna(subset=["country_id"])
        .set_index(["year", "country_id"])
        .add_prefix("wdi_")
        .drop(columns=["wdi_countrycode"])
    )

    # Stage to CY skeleton
    log.debug("Fetching skeleton")
    df_skeleton = db.db_to_df(
        fqtable="skeleton.cy_global", cols=["year", "country_id"]
    ).set_index(["year", "country_id"])
    df = df_skeleton.join(df, how="left")

    # Drop cols that have no values at all
    cols_completely_missing = missing.list_totally_missing(df)
    df = df.drop(columns=cols_completely_missing)
    log.debug(
        f"Dropped cols {cols_completely_missing} because they had no values")

    # order columns and rows
    df = df.sort_index(axis=1).sort_index(axis=0)

    return df
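The sort-then-drop_duplicates step is what implements "keep the latest": drop_duplicates keeps the first row per key, so sorting country_id descending first means the highest id survives. Sketched on toy data (ids and codes invented for illustration):

import pandas as pd

df_keys = pd.DataFrame(
    {"country_id": [10, 250, 251], "countrycode": ["YUG", "RUS", "RUS"]})
# Descending sort + drop_duplicates keeps the highest country_id per code.
df_keys = (df_keys.sort_values(by="country_id", ascending=False)
           .drop_duplicates(subset=["countrycode"])
           .set_index(["countrycode"]))
print(df_keys.loc["RUS", "country_id"])  # 251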
Example #5
def load_reign() -> None:
    """ Load reign """
    log.info("Started loading reign.")

    spec = io.load_yaml(os.path.join(os.path.dirname(__file__), "spec.yaml"))
    with tempfile.TemporaryDirectory() as tempdir:
        paths = common.get_files_latest_fetch(name="reign", tempdir=tempdir)
        path_csv = [path for path in paths if path.endswith(".csv")].pop()
        df = io.csv_to_df(path=path_csv)

    df = fix_ccodes(df, spec)
    df = encode_govt_dummies(df)

    df = df.set_index(["year", "month", "ccode"])
    df = df.join(
        db.query_to_df(query="""
                SELECT id AS country_id, gwcode AS ccode
                FROM staging.country WHERE gweyear=2016;
                """).set_index(["ccode"]))
    df = df.join(
        db.query_to_df(query="""
            SELECT id AS month_id, year_id AS year, month FROM staging.month;
            """).set_index(["year", "month"]))
    df = df.reset_index().set_index(["month_id", "country_id"])
    df = df.drop(
        columns=["year", "month", "ccode", "country", "government", "leader"])

    df_skeleton = db.db_to_df(
        fqtable="skeleton.cm_global",
        cols=["month_id", "country_id"],
        ids=["month_id", "country_id"],
    )
    len_skel = len(df_skeleton)
    df = df_skeleton.join(df, how="left")
    if len(df) != len_skel:
        raise RuntimeError(f"Join not correct, {len_skel} != {len(df)}")

    df = df.add_prefix("reign_")

    db.drop_schema("reign_v2")
    db.create_schema("reign_v2")
    db.df_to_db(df=df, fqtable="reign_v2.cm_unimp")

    db.df_to_db(
        df=missing.fill_groups_with_time_means(missing.extrapolate(df)),
        fqtable="reign_v2.cm_extrapolated",
    )

    log.info("Finished loading reign.")
Example #6
def _load_and_stage_vdem() -> pd.DataFrame:
    """ Load and stage VDEM """
    log.debug("Loading raw fetch data for VDEM.")
    with tempfile.TemporaryDirectory() as tempdir:
        _ = common.get_files_latest_fetch(name="vdem_v10", tempdir=tempdir)

        _ = io.unpack_zipfile(
            path_zip=os.path.join(
                tempdir, "Country_Year_V-Dem_Full+others_CSV_v10.zip"
            ),
            destination=tempdir,
        )
        path_df = os.path.join(
            tempdir,
            "Country_Year_V-Dem_Full+others_CSV_v10",
            "V-Dem-CY-Full+Others-v10.csv",
        )
        df = (
            io.csv_to_df(path=path_df)
            .add_prefix("vdem_")
            .rename(columns={"vdem_year": "year"})
            .set_index(["year", "vdem_country_text_id"])
        )

    df_keys = (
        db.query_to_df(
            query="""
            SELECT id AS country_id, isoab AS vdem_country_text_id
            FROM staging.country;
            """
        )
        .sort_values(by="country_id", ascending=False)
        .drop_duplicates(subset=["vdem_country_text_id"])
        .set_index(["vdem_country_text_id"])
    )
    df = df.join(df_keys)

    # Drop rows where the join failed; dropna returns a copy, so rebind
    df = df.dropna(subset=["country_id"])
    df = df.reset_index().set_index(["year", "country_id"]).sort_index()
    log.debug(f"Share of missing values: {df.isnull().mean().mean():.3f}")

    # Stage to CY skeleton
    log.debug("Fetching skeleton")
    df_skeleton = db.db_to_df(
        fqtable="skeleton.cy_global", cols=["year", "country_id"]
    ).set_index(["year", "country_id"])
    df = df_skeleton.join(df, how="left")

    cols_completely_missing = missing.list_totally_missing(df)
    df = df.drop(columns=cols_completely_missing)
    log.debug(
        f"Dropped cols {cols_completely_missing} because they had no values"
    )

    # Lowercase column names
    df = df.rename(columns=lambda col: col.lower())

    # Keep only the base columns; drop the suffixed variants
    drop_suffixes = ("_codehigh", "_codelow", "_ord", "_sd",
                     "_mean", "_nr", "_osp")
    cols = [col for col in sorted(df.columns)
            if not col.endswith(drop_suffixes)]
    df = df[cols]

    df = df.sort_index(axis=1).sort_index(axis=0)

    return df
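Note the dropna fix above: pandas dropna returns a new frame rather than mutating in place, so the result must be rebound. A two-line sketch of the gotcha:

import pandas as pd

df = pd.DataFrame({"country_id": [1.0, None], "v": [0.1, 0.2]})
df.dropna(subset=["country_id"])       # no-op: the result is discarded
df = df.dropna(subset=["country_id"])  # correct: rebind the result
print(len(df))  # 1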
Example #7
def load_initial_pgdata() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """ Load pgdata into three dataframes: static, yearly and core """
    spec = io.load_yaml(os.path.join(os.path.dirname(__file__), "spec.yaml"))
    with tempfile.TemporaryDirectory() as tempdir:
        _ = common.get_files_latest_fetch(name="pgdata", tempdir=tempdir)

        varinfos = io.load_json(os.path.join(tempdir, "varinfos.json"))
        basegrid = io.load_json(os.path.join(tempdir, "basegrid.json"))

        varinfos_static = [vi for vi in varinfos if vi["type"] == "static"]
        varinfos_yearly = [vi for vi in varinfos if vi["type"] == "yearly"]
        varinfos_core = [vi for vi in varinfos if vi["type"] == "core"]
        varinfos_core = list(
            filter(
                lambda x: x["name"] not in spec["excludes_core"], varinfos_core
            )
        )

        # Build the indices for the dfs
        y_start = min([vinf["startYear"] for vinf in varinfos_yearly])
        y_end = max([vinf["endYear"] for vinf in varinfos_yearly])
        years = list(range(y_start, y_end + 1))
        gids = [cell["gid"] for cell in basegrid]

        df_static = _inserts_vinfs_data_to_df(
            tempdir=tempdir,
            df=pd.DataFrame(index=pd.Index(gids, name="gid")),
            vinfs=varinfos_static,
            ids=["gid"],
            drops=["year"],
        )
        df_static = _prepare(df_static, spec)

        df_yearly = _inserts_vinfs_data_to_df(
            tempdir=tempdir,
            df=pd.DataFrame(
                index=pd.MultiIndex.from_product(
                    [gids, years], names=["gid", "year"]
                )
            ),
            vinfs=varinfos_yearly,
            ids=["gid", "year"],
            drops=None,
        )
        df_yearly = _prepare(df_yearly, spec)

        df_core = _inserts_vinfs_data_to_df(
            tempdir=tempdir,
            df=pd.DataFrame(index=pd.Index(gids, name="gid")),
            vinfs=varinfos_core,
            ids=["gid"],
            drops=["year"],
        )
        df_core = _prepare(df_core, spec)

    # Set indices
    df_static = (
        df_static.reset_index()
        .rename(columns={"gid": "pg_id"})
        .set_index(["pg_id"])
        .sort_index()
    )
    df_yearly = (
        df_yearly.reset_index()
        .rename(columns={"gid": "pg_id"})
        .set_index(["year", "pg_id"])
        .sort_index()
    )
    df_core = (
        df_core.reset_index()
        .rename(columns={"gid": "pg_id"})
        .set_index(["pg_id"])
        .sort_index()
    )

    return df_static, df_yearly, df_core
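The yearly frame is pre-allocated from the cartesian product of grid ids and years, so every (gid, year) pair exists before any variable is inserted. A minimal sketch of that index construction (values invented):

import pandas as pd

gids = [1, 2, 3]
years = [1990, 1991]
idx = pd.MultiIndex.from_product([gids, years], names=["gid", "year"])
df_yearly = pd.DataFrame(index=idx)
print(len(df_yearly))  # 3 gids x 2 years = 6 rows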