Ejemplo n.º 1
0
def compress_df(df: XDataFrame, verbose=False) -> XDataFrame:
    """Reduce memory usage by downcasting numeric columns.

    Columns of dtype int16/int32/int64 are cast to the narrowest of
    int8/int16/int32 whose range contains the column's min and max.
    float64 columns are cast to float32 when their min and max both lie
    within the finite float32 range. Columns are never widened: a float32
    column is left untouched even when it holds ``inf`` or only NaN.

    For compatibility with feather, float16 is not used.

    Note:
        Converted columns are assigned back into ``df``, so the frame that
        was passed in is modified. Casting float64 to float32 may lose
        precision.

    Args:
        df: Data frame whose numeric columns should be downcast.
        verbose: If True, log the memory usage (in MB) before and after.

    Returns:
        The reduced data frame (the same object as ``df``).
    """
    int_dtypes = ("int16", "int32", "int64")
    # Candidate targets, narrowest first. int64 is omitted on purpose:
    # casting an int64 column to int64 would only copy it.
    int_targets = (np.int8, np.int16, np.int32)
    float32_limits = np.finfo(np.float32)
    start_mem_usage = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype
        if col_type in int_dtypes:
            min_val, max_val = df[col].min(), df[col].max()
            for target in int_targets:
                if col_type == target:
                    # Reached the column's own width: nothing narrower fits.
                    break
                limits = np.iinfo(target)
                if min_val >= limits.min and max_val <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        elif col_type == "float64":
            # NOTE: half float is not supported in feather, so float32 is
            # the narrowest float target. float32 columns are skipped
            # entirely so they can never be widened back to float64.
            min_val, max_val = df[col].min(), df[col].max()
            # A NaN or infinite min/max fails this check, which leaves the
            # column as float64.
            if (min_val >= float32_limits.min
                    and max_val <= float32_limits.max):
                df[col] = df[col].astype(np.float32)

    end_mem_usage = df.memory_usage().sum() / 1024**2
    if verbose:
        logger.warning("Memory reduced from {:.2f} MB to {:.2f} MB".format(
            start_mem_usage,
            end_mem_usage,
        ))

    return df
Ejemplo n.º 2
0
def reduce_mem_usage(df: XDataFrame,
                     verbose: bool = True,
                     debug: bool = True) -> XDataFrame:
    """Compress ``df`` with ``compress_df`` and report the memory saved.

    Memory usage is measured before and after compression and summarised
    in a single message giving the final size in MB and the relative
    reduction in percent.

    Args:
        df: Data frame to compress.
        verbose: If True, print the summary message.
        debug: If True, emit the summary message via ``logging.debug``.

    Returns:
        The data frame returned by ``compress_df``.
    """
    bytes_per_mb = 1024**2
    start_mem = df.memory_usage().sum() / bytes_per_mb

    df = compress_df(df)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    # A frame that reports zero bytes would make the ratio undefined
    # (nan or ZeroDivisionError); report no reduction in that case.
    reduction = (start_mem - end_mem) / start_mem if start_mem else 0.0

    msg = (f"Mem. usage decreased to {end_mem:5.2f} MB" +
           f" ({reduction * 100:.1f} % reduction)")
    if verbose:
        print(msg)

    if debug:
        logging.debug(msg)

    return df