Example #1
def rollmax(s: pd.Series, window: int) -> pd.Series:
    """ Rolling max """
    check_has_multiindex(s)
    # See https://github.com/pandas-dev/pandas/issues/14013
    y = s.groupby(
        level=1).apply(lambda x: x.rolling(window=window, min_periods=0).max())

    return y
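A minimal usage sketch on toy data (the (time, group) MultiIndex, the values, and the assumption that check_has_multiindex merely asserts a MultiIndex is set are all illustrative):

import pandas as pd

idx = pd.MultiIndex.from_product(
    [[2000, 2001, 2002], ["A", "B"]], names=["time", "group"])
s = pd.Series([1.0, 5.0, 3.0, 2.0, 2.0, 4.0], index=idx)

# Rolling max within each group over a window of 2:
# group A sees 1, 3, 2 -> 1, 3, 3 and group B sees 5, 2, 4 -> 5, 5, 4.
print(rollmax(s, window=2))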
Example #2
def rollmax(s: pd.Series, window: int) -> pd.Series:
    """ Rolling max """
    check_has_multiindex(s)
    # See https://github.com/pandas-dev/pandas/issues/14013
    y = (s.groupby(level=1).rolling(
        window=window, min_periods=0).max().reset_index(level=0, drop=True))

    return y
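This variant computes the same values as the apply-based workaround above: groupby(level=1).rolling(...) prepends the group key to the result's index, and reset_index(level=0, drop=True) strips it again, leaving the original (time, group) levels. Skipping the Python-level apply generally makes this the faster of the two.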
Example #3
def tlead(s: pd.Series, time: int) -> pd.Series:
    """ Time lead """
    check_has_multiindex(s)
    if time < 1:
        msg = f"Time below 1 passed to tlead: {time} \n"
        msg += "Call tlag() instead \n"
        raise RuntimeError(msg)

    return s.groupby(level=1).shift(-time)
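A short sketch with a single toy group (index names and values are illustrative):

import pandas as pd

idx = pd.MultiIndex.from_product(
    [[2000, 2001, 2002], ["A"]], names=["time", "group"])
s = pd.Series([1.0, 2.0, 3.0], index=idx)

# A lead of 1 pulls each group's next value back one period,
# so group A becomes 2.0, 3.0, NaN.
print(tlead(s, time=1))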
Example #4
def moving_average(s: pd.Series, time: int) -> pd.Series:
    """ Moving average """
    check_has_multiindex(s)
    if time < 1:
        msg = f"Time below 1 passed to ma: {time} \n"
        raise RuntimeError(msg)

    # Groupby groupvar
    y = s.groupby(level=1)
    # Divide into rolling time windows of size time.
    # min_periods=0 lets the window grow with the available data
    # and prevents the function from inducing missingness.
    y = y.rolling(time, min_periods=0)
    # Compute the mean
    y = y.mean()
    # groupby and rolling prepend the group key to the index;
    # drop it and restore the original (time, group) sort order
    y = y.reset_index(level=0, drop=True).sort_index()
    return y
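A sketch of the growing window on toy data (names and values are illustrative):

import pandas as pd

idx = pd.MultiIndex.from_product(
    [[2000, 2001, 2002], ["A"]], names=["time", "group"])
s = pd.Series([1.0, 2.0, 4.0], index=idx)

# With time=2 and min_periods=0 the window grows from size 1:
# 1.0, then (1 + 2) / 2 = 1.5, then (2 + 4) / 2 = 3.0.
print(moving_average(s, time=2))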
Example #5
def fill_groups_with_time_means(df: pd.DataFrame) -> pd.DataFrame:
    """ Fill completely missing groups with time means """

    log.debug("Filling completely missing groups with time means.")
    data.check_has_multiindex(df)

    # TODO: Handle properly
    if not (df.dtypes == np.float64).all():
        log.warning("Not all cols are float64, this might break.")

    # Only fill numeric cols
    cols = list(df.select_dtypes(include=[np.number]).columns.values)
    for g_i, g_df in df.groupby(level=1):
        # If missing everything from a group
        if g_df.isnull().all().all():
            log.debug(
                f"All missing for groupvar {g_i}, filling with time mean")
            # Get the times for this group
            times_group = g_df.index.get_level_values(0)
            # Fill all columns with the time mean
            df.loc[g_df.index,
                   cols] = (df.loc[times_group,
                                   cols].groupby(level=0).mean().values)
    return df
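A sketch of the intended behavior on toy data (assuming data.check_has_multiindex only asserts the index shape and log is a standard logger):

import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product(
    [[2000, 2001], ["A", "B"]], names=["time", "group"])
df = pd.DataFrame({"x": [1.0, np.nan, 3.0, np.nan]}, index=idx)

# Group B is missing everything, so it is filled with each time's
# cross-sectional mean, here group A's values 1.0 and 3.0.
print(fill_groups_with_time_means(df))

Note that the df.loc assignment also fills the caller's frame in place, so pass a copy if the original should stay untouched.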
Example #6
def delta(s: pd.Series, time: int = 1) -> pd.Series:
    """ Return the time-delta of s """

    check_has_multiindex(s)
    return s - tlag(s, time=time)
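Assuming tlag is the group-wise lag counterpart of tlead above, a one-period delta is the first difference within each group:

import pandas as pd

idx = pd.MultiIndex.from_product(
    [[2000, 2001, 2002], ["A"]], names=["time", "group"])
s = pd.Series([1.0, 4.0, 9.0], index=idx)

# First difference per group: NaN, 3.0, 5.0.
print(delta(s, time=1))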
Example #7
def demean(s: pd.Series) -> pd.Series:
    """ demean, s = s - mean_group(s) """
    check_has_multiindex(s)
    s_mean = s.groupby(level=1).transform("mean")
    return s - s_mean
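A sketch on toy data (names and values are illustrative):

import pandas as pd

idx = pd.MultiIndex.from_product(
    [[2000, 2001], ["A", "B"]], names=["time", "group"])
s = pd.Series([1.0, 10.0, 3.0, 20.0], index=idx)

# Group A has mean 2.0 and group B has mean 15.0,
# so the result is -1.0, -5.0, 1.0, 5.0.
print(demean(s))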
Example #8
def cweq(s: pd.Series, value: float, seed=None) -> pd.Series:
    """ Count while s equals value

    @TODO: Seed from series (series of seeds per groupvar?)

    """
    check_has_multiindex(s)

    def set_seed(count, s, seed, mask):
        """ Set count=seed in first time if mask was True there

        Example: We want time since conflict, which is time in peace.
        So we want count_while(conflict == 0).
        If our conflict series starts at 0 we might assume some longer
        previous history of peace.
        As the time count is summed cumulatively we can "seed" this
        counting sum with a starting value.

        This seed is therefore inserted into the first time period
        of the count IF the country is in peace at that time.
        Being in peace means the mask is True, i.e. the count is 1,
        as we already cast the mask's True/False to counter 1/0.

        """
        ix_timevar = s.index.get_level_values(0)
        first_time = ix_timevar == min(ix_timevar)
        mask_true = mask == 1
        first_time_where_mask_true = first_time & mask_true
        count.loc[first_time_where_mask_true] = seed
        return count

    # Drop NaN's
    s = s.dropna()

    # Boolean mask of where our condition (s==value) is True
    mask = s == value

    # This is a tricky one, print it out if it's confusing.
    # Values of block_grouper are incremented when mask is NOT true.
    # This creates values that are constant (not incrementing) through a
    # consecutive spell of mask being True.
    # Grouping by this var thus lets the count.cumsum() restart for
    # each group of consecutive rows where mask is True and stay at
    # zero for the rows where block_grouper keeps incrementing,
    # which are the rows where mask is not met.
    # Note that mask is True when the criterion is fulfilled.
    # Basically lets us assign a grouping id to each consecutive
    # spell of our condition being True.
    block_grouper = (~mask).groupby(level=1).cumsum()

    # Our mask becomes the basis for the count by casting it to int
    count = mask.astype(int)

    if seed is not None:
        count = set_seed(count, s, seed, mask)

    # Get the groupvar-level index to group by
    ix_groupvar = s.index.get_level_values(1)

    # The time elapsed while condition is true
    y = count.groupby([block_grouper, ix_groupvar]).cumsum()
    y = y.astype(int)

    return y
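A sketch of the docstring's time-since-conflict example on a single toy group:

import pandas as pd

idx = pd.MultiIndex.from_product(
    [[1, 2, 3, 4, 5, 6], ["A"]], names=["time", "group"])
conflict = pd.Series([0, 0, 1, 0, 0, 0], index=idx)

# Count consecutive periods of peace (conflict == 0); the count is
# zeroed at time 3 where conflict occurs: 1, 2, 0, 1, 2, 3.
print(cweq(conflict, value=0))

# Seeding assumes a longer prior history of peace: with seed=10 the
# first spell starts at 10 instead of 1, giving 10, 11, 0, 1, 2, 3.
print(cweq(conflict, value=0, seed=10))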
Example #9
    R docs at:
    https://cran.r-project.org/web/packages/EBMAforecast/EBMAforecast.pdf

    Ensure df_calib, df_test and s_calib_actual have multiindex set.

    """

    # Copy the data so we don't mess with the caller's data
    df_calib = df_calib.copy()
    df_test = df_test.copy()
    s_calib_actual = s_calib_actual.copy()
    s_calib_actual.name = "actual"

    # Make sure we're all indexed as expected
    datautils.check_has_multiindex(df_calib)
    datautils.check_has_multiindex(df_test)
    datautils.check_has_multiindex(s_calib_actual)

    if len(s_calib_actual) != len(df_calib):
        msg = "Number of rows in df_calib and s_calib_actual don't match"
        raise RuntimeError(msg)

    offset = 1e-10
    upper = 1 - offset
    lower = 0 + offset

    # Sort indexes so they're aligned
    # Clip predictions
    df_calib = df_calib.sort_index().clip(lower, upper)
    df_test = df_test.sort_index().clip(lower, upper)
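The clipping nudges exact 0s and 1s just inside the open interval (0, 1), presumably so that downstream transforms of the predicted probabilities (EBMA works on a logit-like scale) stay finite. A tiny illustration:

import pandas as pd

offset = 1e-10
df = pd.DataFrame({"p": [0.0, 0.5, 1.0]})
# 0.0 becomes 1e-10 and 1.0 becomes 1 - 1e-10; 0.5 is untouched.
print(df.clip(0 + offset, 1 - offset))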
Example #10
def extrapolate(df: pd.DataFrame) -> pd.DataFrame:
    """ Interpolate and extrapolate """
    data.check_has_multiindex(df)
    return (df.sort_index().groupby(
        level=1).apply(lambda group: group.interpolate(limit_direction="both"))
            )
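A sketch on toy data (assuming data.check_has_multiindex only asserts the index shape); note that limit_direction="both" fills edge gaps from the nearest valid value rather than extrapolating a trend:

import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product(
    [[2000, 2001, 2002, 2003], ["A"]], names=["time", "group"])
df = pd.DataFrame({"x": [np.nan, 1.0, np.nan, 3.0]}, index=idx)

# The interior gap is linearly interpolated and the leading gap is
# filled from the nearest valid value: 1.0, 1.0, 2.0, 3.0.
print(extrapolate(df))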
Example #11
def impute_amelia(df: pd.DataFrame, n_imp: int) -> List[pd.DataFrame]:
    """ Wrapper for calling Amelia in an R subprocess

    Args:
        df: Dataframe with MultiIndex set
        n_imp: Number of imputations to perform
    Return:
        dfs: List of imputed dataframes
    """
    def read_template():
        this_dir = os.path.dirname(os.path.abspath(__file__))
        path_template = os.path.join(this_dir, "amelia_template.R")
        with open(path_template, "r") as f:
            template_str = f.read()

        template = string.Template(template_str)

        return template

    log.info("Started impute_amelia()")

    data.check_has_multiindex(df)
    timevar, groupvar = df.index.names

    log.debug(f"n_imp: {n_imp}")
    log.debug(f"timevar: {timevar}")
    log.debug(f"groupvar: {groupvar}")
    log.debug(f"df shape: {df.shape}")
    log.debug(f"Share missing: {df.isnull().mean().mean()}")

    with tempfile.TemporaryDirectory() as tempdir:

        path_csv_in = os.path.join(tempdir, "input.csv")
        path_rscript = os.path.join(tempdir, "impute_script.R")
        path_out_stem = os.path.join(tempdir, "imputed_")

        values = {
            "PATH_CSV_INPUT": path_csv_in,
            "PATH_CSV_OUTPUT_STEM": path_out_stem,
            "TIMEVAR": timevar,
            "GROUPVAR": groupvar,
            "N_IMP": n_imp,
            "N_CPUS": mp.cpu_count(),
        }

        template = read_template()
        rscript = template.substitute(values)

        df.to_csv(path_csv_in, index=True)
        log.info(f"Wrote {path_csv_in}")

        with open(path_rscript, "w") as f:
            f.write(rscript)
        log.info(f"Wrote {path_rscript}")
        log.debug(rscript)

        cmd = ["Rscript", path_rscript]
        run_subproc(cmd)

        dfs = []
        for i in range(n_imp):
            path_imputed = f"{path_out_stem}{i+1}.csv"
            df_imp = pd.read_csv(path_imputed)
            df_imp = df_imp.drop(columns=["Unnamed: 0"])
            df_imp = df_imp.set_index([timevar, groupvar])
            dfs.append(df_imp)
            log.info(f"Read {path_imputed}")

    log.info("Finished impute_amelia()")
    return dfs
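A hypothetical end-to-end call, assuming Rscript is on the PATH, the R package Amelia is installed, and the bundled amelia_template.R renders the placeholders above (the index names and columns here are invented for illustration):

import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product(
    [range(2000, 2010), ["A", "B", "C"]], names=["year", "country"])
df = pd.DataFrame(
    {"x": np.random.randn(30), "y": np.random.randn(30)}, index=idx)
df.loc[df.sample(frac=0.2, random_state=0).index, "y"] = np.nan

dfs_imputed = impute_amelia(df, n_imp=5)
for i, df_imp in enumerate(dfs_imputed, start=1):
    # Each imputed frame shares df's (year, country) index and
    # should contain no missing values.
    print(i, df_imp.shape, df_imp.isnull().sum().sum())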