Example #1
def save_result_for_ensemble(
    sub_folder,
    file_name,
    **kwargs,
):
    """"
    name = '{score}_name'
    """
    import os
    folder = f'./output/1level/{sub_folder}'
    os.makedirs(folder, exist_ok=True)

    file = f'{folder}/{file_name}.h5'
    from file_cache.utils.other import replace_invalid_filename_char
    file = replace_invalid_filename_char(file)
    store = pd.HDFStore(file)

    # **kwargs is always a dict (never None) and its keys are never None,
    # so the meaningful guards are emptiness and a non-None value.
    if kwargs:
        for key, value in kwargs.items():
            if value is not None:
                store[f'{key}'] = value
                logger.debug(
                    f'Store {key} to file#{file}, size:{value.shape}')

    store.close()
    logger.debug(f"Ensamble file save to file: {file}")
    return file
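A minimal usage sketch; pd, np, and a configured logger are assumed at module level as in the snippet, and the folder and file names here are hypothetical:

import numpy as np
import pandas as pd

# Hypothetical out-of-fold predictions to stash for a later ensembling step.
oof = pd.DataFrame({'id': np.arange(5), 'pred': np.random.rand(5)})

# Each keyword argument becomes one key inside the resulting HDF5 store.
path = save_result_for_ensemble('lgb', '0.8123_lgb_baseline', oof=oof)
# -> ./output/1level/lgb/0.8123_lgb_baseline.h5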
Example #2
def is_support_cache(*args, **kwargs):
    for arg in args:
        if not is_mini_args(arg) and arg is not None:
            logger.debug(f'There is {type(arg).__name__} in the args')
            return False
    for _, arg in kwargs.items():
        if not is_mini_args(arg) and arg is not None:
            logger.debug(f'There is {type(arg).__name__} in the kwargs')
            return False
    return True
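is_mini_args is defined elsewhere in file_cache; the intent is to permit caching only when every argument is small enough to form part of a cache key. A sketch with a hypothetical stand-in:

import pandas as pd

def is_mini_args(arg):
    # Hypothetical stand-in for file_cache's real check: treat plain
    # scalars and strings as cache-key friendly.
    return isinstance(arg, (int, float, str, bool))

print(is_support_cache(1, name='x'))     # True
print(is_support_cache(pd.DataFrame()))  # False: a DataFrame in the args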
Example #3
def _reduce_mem_usage(df, verbose=True):
    if isinstance(df, pd.Series):
        return df
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    mem = df.memory_usage()
    mem = mem if isinstance(mem, (int, float)) else mem.sum()
    start_mem = mem / 1024**2
    for sn, col in enumerate(df.columns):
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(
                        np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(
                        np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(
                        np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(
                        np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(
                        np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                        np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
            if col_type == df[col].dtypes:
                logger.info(
                    f'No need to change type for:{col}#{col_type}, {sn}/{len(df.columns)}'
                )
            else:
                logger.info(
                    f'Change {col}#{col_type} => {df[col].dtypes}, {sn}/{len(df.columns)}'
                )
    mem = df.memory_usage()
    mem = mem if isinstance(mem, (int, float)) else mem.sum()
    end_mem = mem / 1024**2
    if verbose:
        logger.debug(
            'Mem. usage decreased from {:7.2f} to {:7.2f} Mb ({:.1f}% reduction)'
            .format(start_mem, end_mem,
                    100 * (start_mem - end_mem) / start_mem))
    return df
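A quick sketch of the downcasting on a toy frame; pd, np, and logger are assumed in scope, and note the float16 step trades precision for memory:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'small_int': np.arange(100, dtype=np.int64),             # fits int8
    'unit_float': np.linspace(0, 1, 100, dtype=np.float64),  # fits float16
})
df = _reduce_mem_usage(df)
print(df.dtypes)  # small_int -> int8, unit_float -> float16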
Example #4
def check_exception(df, index=None):
    df = df.copy(deep=True)
    if index is not None and index in df:
        df.set_index(index, inplace=True)
    # Keep only numeric columns.
    df = df.select_dtypes(exclude=['object', 'datetime64[ns]'])
    try:
        x, y = np.where(np.isinf(df.values) | np.isnan(df.values))
    except Exception as error:
        logger.debug(df.dtypes.sort_values())
        raise error
    if len(x) > 0:
        #print(x.min(), x.max()+1, y.min(), y.max()+1)
        df = df.iloc[x.min():(x.max() + 3), y.min():(y.max() + 3)]
        error_part = df.iloc[:3, :4]
        logger.debug(f'check_exception:\n{error_part}')
        return error_part
    else:
        return pd.DataFrame()
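A sketch that plants an inf to show what check_exception returns; the column names are arbitrary:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [4.0, np.inf, 6.0]})
bad = check_exception(df)
print(bad.empty)  # False: a small window around the offending cell is returned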
Example #5
    def writeFile(self, key, val, file_type):
        if not self.enable:
            logger.debug('Cache is disabled')
            return None

        if val is None or len(val) == 0:
            logger.debug('Return value is None or empty')
            return val
        elif isinstance(val, tuple):
            val_tuple = val
        else:
            val_tuple = (val,)

        if all(isinstance(item, (pd.DataFrame, pd.Series)) for item in val_tuple):
            path = self.get_path(key, file_type)
            if file_type == 'h5':
                for index, df in enumerate(val_tuple):
                    hdf_key = f'{self.df_key}_{index}'  # do not shadow the cache key argument
                    logger.debug(f"====Write {len(df)} records to File#{path}, with:{hdf_key}")
                    df.to_hdf(path, key=hdf_key)
            elif file_type == 'pickle':
                pd.to_pickle(val, path)
            return val
        else:
            logger.warning(f'The return value is not a DataFrame/Series or is None: {[isinstance(item, pd.DataFrame) for item in val_tuple]}')
            return val
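writeFile persists either a single frame or a tuple of frames, one HDF5 key per element, named from self.df_key. A standalone sketch of that layout (the path and key prefix here are hypothetical):

import pandas as pd

path = './demo_cache.h5'
frames = (pd.DataFrame({'a': [1]}), pd.DataFrame({'b': [2]}))
for index, df in enumerate(frames):
    df.to_hdf(path, key=f'df_{index}')  # keys: df_0, df_1

with pd.HDFStore(path, mode='r') as store:
    print(store.keys())  # ['/df_0', '/df_1']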
Example #6
    def readFile(self, key, file_type):
        if self.enable:
            path = self.get_path(key, file_type)
            if os.path.exists(path):

                if file_type == 'h5':
                    with pd.HDFStore(path, mode='r') as store:
                        key_list = store.keys()
                    logger.debug(f"Read cache from file:{path},key:{key_list}")
                    if len(key_list) == 0:
                        return None
                    elif len(key_list) == 1:
                        return pd.read_hdf(path, key_list[0])
                    else:
                        return tuple(pd.read_hdf(path, k) for k in key_list)
                elif file_type == 'pickle':
                    return pd.read_pickle(path)

            else:
                logger.debug(f"Can not find cache from file:{path}")
                return None
        else:
            logger.debug("Cache is disabled")
            return None
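readFile mirrors that layout: zero keys yields None, one key a single frame, several keys a tuple. A sketch of the dispatch against the file written in the previous sketch:

import pandas as pd

path = './demo_cache.h5'
with pd.HDFStore(path, mode='r') as store:
    key_list = store.keys()

if len(key_list) == 0:
    result = None
elif len(key_list) == 1:
    result = pd.read_hdf(path, key_list[0])
else:
    result = tuple(pd.read_hdf(path, k) for k in key_list)

print(type(result))  # tuple: two frames were stored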


if __name__ == '__main__':
    from file_cache.cache import file_cache
    #@file_cache()
    @reduce_mem()  # assumed to be the decorator form of _reduce_mem_usage above
    def test_df(test):
        import pandas as pd
        from sklearn import datasets
        # load_boston() was removed in scikit-learn 1.2; the California housing
        # set is a drop-in replacement for this dtype demo.
        data = datasets.fetch_california_housing()
        df = pd.DataFrame(data.data, columns=data.feature_names)
        print(df.dtypes)
        return df  #, df.copy(), df.copy()

    logger.debug(len(test_df('xx')))