def save_result_for_ensemble(sub_folder, file_name, **kwargs):
    """Persist named results to an HDF5 file for later ensembling.

    Each keyword argument is written to
    ``./output/1level/{sub_folder}/{file_name}.h5`` under its own key
    (invalid filename characters are replaced first).

    Args:
        sub_folder: sub-directory under ./output/1level to write into.
        file_name: base name of the .h5 file (without extension).
        **kwargs: name -> pandas object pairs to store; each value must be
            storable by HDFStore and expose ``.shape`` (used for logging).

    Returns:
        The path of the file that was written.
    """
    import os
    folder = f'./output/1level/{sub_folder}'
    # exist_ok=True avoids the race between the old exists() check and makedirs().
    os.makedirs(folder, exist_ok=True)
    file = f'./output/1level/{sub_folder}/{file_name}.h5'
    from file_cache.utils.other import replace_invalid_filename_char
    file = replace_invalid_filename_char(file)
    # Context manager guarantees the store is closed even if a write raises
    # (the original leaked the handle on exception).
    with pd.HDFStore(file) as store:
        # NOTE: `kwargs` is always a dict and its keys are always non-None
        # strings, so the old `is not None` guards were dead code.
        for key, value in kwargs.items():
            store[f'{key}'] = value
            logger.debug(f'Store {key} to file#{file} , size:{value.shape}')
    logger.debug(f"Ensemble file save to file: {file}")
    return file
def is_support_cache(*args, **kwargs):
    """Return True when every positional and keyword argument is cacheable.

    An argument counts as cacheable when it is None or when
    ``is_mini_args(arg)`` is truthy; the first non-cacheable argument
    short-circuits with False (and is logged at debug level).
    """
    for arg in args:
        # Cheap None test first; is_mini_args(None) was called needlessly
        # before (safe to reorder: is_mini_args is a pure predicate here —
        # TODO confirm it has no side effects).
        if arg is not None and not is_mini_args(arg):
            logger.debug(f'There is {type(arg).__name__} in the args')
            return False
    # Iterate values directly instead of .items() with a discarded key.
    for arg in kwargs.values():
        if arg is not None and not is_mini_args(arg):
            logger.debug(f'There is {type(arg).__name__} in the kwargs')
            return False
    return True
def _reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest int/float dtype
    that holds each column's observed min/max, and log the memory saved.

    Mutates ``df`` column-by-column in place and also returns it.

    NOTE(review): bounds use strict > / < comparisons, so a column whose
    min/max sits exactly on a dtype boundary is NOT downcast to that dtype.
    NOTE(review): float16 downcasting can silently lose precision for
    large or high-precision values — confirm callers tolerate this.
    """
    # A Series has no .columns to iterate over; return it untouched.
    if isinstance(df, pd.Series):
        return df
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # memory_usage() returns a per-column Series for DataFrames; fold to a
    # scalar total before converting bytes -> MiB.
    mem = df.memory_usage()
    mem = mem if isinstance(mem, (int, float)) else mem.sum()
    start_mem = mem / 1024**2
    for sn, col in enumerate(df.columns):
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            # Integer columns: try int8 -> int16 -> int32 -> int64.
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(
                        np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(
                        np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(
                        np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(
                        np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Float columns: try float16 -> float32, else keep float64.
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(
                        np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                        np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
            # Log whether the astype above actually changed the dtype.
            if col_type == df[col].dtypes:
                logger.info(
                    f'No need to change type for:{col}#{col_type}, {sn}/{len(df.columns)}'
                )
            else:
                logger.info(
                    f'Change {col}#{col_type} => {df[col].dtypes}, {sn}/{len(df.columns)}'
                )
    mem = df.memory_usage()
    mem = mem if isinstance(mem, (int, float)) else mem.sum()
    end_mem = mem / 1024**2
    if verbose:
        logger.debug(
            'Mem. usage decreased from {:7.2f} to {:7.2f} Mb ({:.1f}% reduction)'
            .format(start_mem, end_mem,
                    100 * (start_mem - end_mem) / start_mem))
    return df
def check_exception(df, index=None):
    """Locate inf/NaN cells in the numeric part of *df*.

    Works on a deep copy, optionally using *index* (when it is a column of
    the frame) as the index. Non-numeric columns (object, datetime64[ns])
    are dropped before scanning. When invalid values are found, returns a
    small (up to 3x4) corner of the offending region and logs it; otherwise
    returns an empty DataFrame.
    """
    frame = df.copy(deep=True)
    if index is not None and index in frame:
        frame.set_index(index, inplace=True)
    frame = frame.select_dtypes(exclude=['object', 'datetime64[ns]'])
    try:
        rows, cols = np.where(np.isinf(frame.values) | np.isnan(frame.values))
    except Exception as error:
        # Dump dtypes to help diagnose which column broke the scan.
        logger.debug(frame.dtypes.sort_values())
        raise error
    if len(rows) == 0:
        return pd.DataFrame()
    # Narrow to a window around the bad cells, then show only its corner.
    frame = frame.iloc[rows.min():(rows.max() + 3), cols.min():(cols.max() + 3)]
    error_part = frame.iloc[:3, :4]
    logger.debug(f'check_exception:\n{error_part}')
    return error_part
def writeFile(self, key, val, file_type):
    """Persist a cached return value to disk.

    Accepts a single DataFrame/Series or a tuple of them. Writes each item
    to its own HDF key ('h5') or pickles the whole value ('pickle'). Skips
    writing when caching is disabled, when the value is None/empty, or when
    any item is not a DataFrame/Series (logged as a warning).

    Returns the value unchanged (None when caching is disabled).
    """
    if not self.enable:
        logger.debug('Cache is disable')
        return None
    if val is None or len(val) == 0:
        logger.debug('Return value is None or empty')
        return val
    val_tuple = val if isinstance(val, tuple) else (val,)
    if not all(isinstance(item, (pd.DataFrame, pd.Series)) for item in val_tuple):
        logger.warning(f'The return is not DataFrame or it is None:{[ isinstance(item, pd.DataFrame) for item in val_tuple]}')
        return val
    path = self.get_path(key, file_type)
    if file_type == 'h5':
        for index, df in enumerate(val_tuple):
            # NOTE: reuses `key` as the per-item HDF key, as the original did.
            key = f'{self.df_key}_{index}'
            logger.debug(f"====Write {len(df)} records to File#{path}, with:{key}")
            df.to_hdf(path, key)
    elif file_type == 'pickle':
        pd.to_pickle(val, path)
    return val
def readFile(self, key, file_type):
    """Load a previously cached value for *key* from disk.

    Returns None when caching is disabled, the cache file is missing, or an
    'h5' file holds no keys. An 'h5' file with one key yields a single
    object; with several keys, a tuple of objects in key order. A 'pickle'
    file yields the unpickled value.
    """
    if not self.enable:
        logger.debug( "disable cache")
        return None
    path = self.get_path(key, file_type)
    if not os.path.exists(path):
        logger.debug(f"Can not find cache from file:{path}")
        return None
    if file_type == 'h5':
        with pd.HDFStore(path, mode='r') as store:
            key_list = store.keys()
        logger.debug(f"Read cache from file:{path},key:{key_list}")
        if not key_list:
            return None
        if len(key_list) == 1:
            return pd.read_hdf(path, key_list[0])
        return tuple(pd.read_hdf(path, k) for k in key_list)
    elif file_type == 'pickle':
        return pd.read_pickle(path)
# NOTE(review): this collapsed line fuses two unrelated pieces: (a) an
# orphaned duplicate of the tail of _reduce_mem_usage (it begins with a bare
# ')' — its enclosing definition is not visible from this chunk, so it is
# left untouched rather than guessed at), and (b) the script's __main__
# self-test, which decorates a Boston-housing DataFrame builder with
# @reduce_mem() and logs the resulting length. Flagged for manual
# de-duplication; code below is byte-identical to the original.
) else: logger.info( f'Change {col}#{col_type} => {df[col].dtypes}, {sn}/{len(df.columns)}' ) mem = df.memory_usage() mem = mem if isinstance(mem, (int, float)) else mem.sum() end_mem = mem / 1024**2 if verbose: logger.debug( 'Mem. usage decreased from {:7.2f} to {:7.2f} Mb ({:.1f}% reduction)' .format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem)) return df if __name__ == '__main__': from file_cache.cache import file_cache #@file_cache() @reduce_mem() def test_df(test): from sklearn import datasets import pandas as pd import numpy as np data = datasets.load_boston() df = pd.DataFrame(data.data, columns=data.feature_names) print(df.dtypes) return df #, df.copy(), df.copy() logger.debug(len(test_df('xx')))