def run(self):
    """Register the configured tasks, then poll and dispatch them forever.

    When ``self.params.config_file`` is set, tasks are first loaded from it.
    Every task in ``self.config`` then gets its next trigger time computed via
    ``self._reset_time``. After that the method loops without end: tasks whose
    ``time_next`` has been reached are passed to ``self._start``, and the
    config file is re-read after each pass so new or edited tasks are picked
    up. Errors raised while re-reading the file are reported, not propagated.

    This method never returns normally.
    """

    def report(text):
        # Send a message to the log file and/or the console, as configured.
        if self.params.logger.params.log_file != '':
            self.params.logger.write(text)
        if self.params.verbose:
            self.params.logger.info(text)

    def runner_for(task_name):
        # The tuple handed to ``self._start`` for one task.
        entry = self.config[task_name]
        return (entry['function'], entry['args'], entry['kwargs'], task_name)

    started_at = datetime.datetime.now()

    # Initial load of the task table from the config file, if any.
    if self.params.config_file is not None:
        loaded = Config(file_py=self.params.config_file)
        for task_name in loaded.config:
            self.config[task_name] = loaded.config[task_name]
            self.config[task_name]['execute_num'] = 0
            self.config[task_name]['runner'] = runner_for(task_name)
            self.config[task_name]['time_init'] = started_at

    for task_name in self.config:
        self._reset_time(task_name, started_at)
        report(f'New task {task_name} has been added.')

    while True:
        now = datetime.datetime.now()

        # Dispatch every task that is due; just stamp the others.
        for task_name in self.config:
            if self.config[task_name]['time_next'] > now:
                self.config[task_name]['time_record'] = now
                continue
            self._start(self.config[task_name]['runner'])
            self._reset_time(task_name, now)
            self.config[task_name]['execute_num'] += 1

        # Hot-reload the config file; failures here must not stop the loop.
        try:
            if self.params.config_file is not None:
                reloaded = Config(file_py=self.params.config_file)
                for task_name in reloaded.config:
                    if task_name not in self.config:
                        self.config[task_name] = reloaded.config[task_name]
                        self.config[task_name]['execute_num'] = 0
                        self.config[task_name]['time_init'] = now
                        self._reset_time(task_name, now)
                        report(f'New task {task_name} has been added.')
                    # Copy every (possibly edited) field over the live entry.
                    for field, value in reloaded.config[task_name].items():
                        self.config[task_name][field] = value
                    self.config[task_name]['runner'] = runner_for(task_name)
        except Exception as err:
            report(str(err))
def _params_init(self):
    """Create ``self._params`` and populate it with the default settings."""
    defaults = (
        ('batch', 0),
        ('batch_size', 1),
        ('skip_size', None),
        ('take_size', -1),
        ('shuffle_size', 1),
        ('prefetch_size', 1),
        ('sample', 0),
        ('step', 1),
        ('tensor_mode', 'numpy'),
        # Per-operation record of the options applied, keyed by operation name.
        ('options', defaultdict(dict)),
    )
    self._params = Config()
    for key, value in defaults:
        setattr(self._params, key, value)
def _params_init(self):
    """Create ``self._params`` and populate it with the default state.

    The attributes are assigned in a fixed order; ``index_data`` and ``batch``
    each receive their initial entry right after being attached.
    """
    self._params = Config()
    state = self._params

    for key, value in (
        ('step', 1),
        ('tensor', 'numpy'),
        ('mode', 'total'),
        ('mode1', 'total'),
        ('index_data', defaultdict()),
    ):
        setattr(state, key, value)
    state.index_data['total'] = 'total'

    for key, value in (
        ('data_from', 'tensor'),
        ('data', defaultdict()),
        ('index', defaultdict(list)),
        ('map', defaultdict(list)),
        ('batch', defaultdict(list)),
    ):
        setattr(state, key, value)
    # Batch entry for the current mode, initialised to [0, False, 0].
    state.batch[state.mode] = [0, False, 0]

    state.enumerate = defaultdict(int)
    state.options = defaultdict(dict)
def __init__(self, nrows, ncols, left=None, bottom=None, right=None, top=None,
             wspace=None, hspace=None, width_ratios=None, height_ratios=None):
    """A grid layout to place subplots within a figure.

    Args:
        nrows, ncols : int
            The number of rows and columns of the grid.
        left, right, top, bottom : float, optional
            Extent of the subplots as a fraction of figure width or height.
            Left cannot be larger than right, and bottom cannot be larger
            than top. If not given, the values will be inferred from a
            figure or rcParams at draw time.
        wspace : float, optional
            The amount of width reserved for space between subplots,
            expressed as a fraction of the average axis width. If not
            given, the values will be inferred from a figure or rcParams
            when necessary.
        hspace : float, optional
            The amount of height reserved for space between subplots,
            expressed as a fraction of the average axis height. If not
            given, the values will be inferred from a figure or rcParams
            when necessary.
        width_ratios : array-like of length *ncols*, optional
            Defines the relative widths of the columns. Each column gets a
            relative width of ``width_ratios[i] / sum(width_ratios)``.
            If not given, all columns will have the same width.
        height_ratios : array-like of length *nrows*, optional
            Defines the relative heights of the rows. Each row gets a
            relative height of ``height_ratios[i] / sum(height_ratios)``.
            If not given, all rows will have the same height.
    """
    layout = dict(
        nrows=nrows,
        ncols=ncols,
        left=left,
        bottom=bottom,
        right=right,
        top=top,
        wspace=wspace,
        hspace=hspace,
        width_ratios=width_ratios,
        height_ratios=height_ratios,
    )
    self._grid = Config()
    self._grid.figure = dict(figsize=(10, 6))
    self._grid.grid = layout
    # Starts empty; nothing in this constructor fills it.
    self._grid.grid_id = {}
def __init__(self, model, hp=None, name=None, method='random'):
    """Set up a hyperparameter search for ``model``.

    Args:
        model: one of the built-in model names ('XGBClassifier',
            'XGBRegressor', 'LGBMClassifier', 'LGBMRegressor'), or any other
            object, which is stored unchanged as the model factory.
        hp: hyperparameter space. For a built-in model name it is merged
            into the default space through ``from_HyperParameters``;
            otherwise it is used as-is.
        name: search name; defaults to 'RS' for ``method='random'`` and
            'GS' for ``method='grid'``.
        method: search strategy, 'random' or 'grid'.

    Raises:
        AssertionError: the installed xgboost/lightgbm version compares
            lower than the required one.
    """
    self.params = Config()
    if name is not None:
        self.params.name = name
    elif method == 'random':
        self.params.name = 'RS'
    elif method == 'grid':
        self.params.name = 'GS'
    self.params.model_init = model
    self.params.model_name = ''
    self.params.method = method

    if model in ('XGBClassifier', 'XGBRegressor', 'LGBMClassifier', 'LGBMRegressor'):
        self.hp = model_hp(model=model, method=method)
        if hp is not None:
            self.hp.from_HyperParameters(hp)
        self.params.model_name = model
        # Resolve the estimator class by name from its library. Looking it up
        # with getattr keeps the name and the class in lockstep, which the
        # previous copy-pasted branches did not ('XGBRegressor' was never
        # resolved and 'LGBMClassifier' was mapped to xgb.XGBRegressor).
        if model in ('XGBClassifier', 'XGBRegressor'):
            import xgboost as xgb
            # NOTE(review): this is a lexicographic string comparison, so
            # e.g. '1.10.0' compares lower than '1.9.0' — confirm acceptable.
            assert xgb.__version__ >= __xgboost_version__, f'xgboost version should be >={__xgboost_version__}.'
            self.params.model_init = getattr(xgb, model)
        else:
            import lightgbm as lgb
            # NOTE(review): lexicographic version comparison, see above.
            assert lgb.__version__ >= __lightgbm_version__, f'lightgbm version should be >={__lightgbm_version__}.'
            self.params.model_init = getattr(lgb, model)
    else:
        self.hp = hp
    self.best_params = dict()
    self.best_params_history = dict()
def __init__(self, target, width=25, verbose=1, unit_name='step'):
    """Initialise the progress-bar state.

    Args:
        target: Total number of steps expected, None if unknown.
        width: Progress bar width on screen.
        verbose: Verbosity mode, 0 (silent), 1 (verbose)
        unit_name: Display name for step counts (usually "step" or "sample").
    """
    self.param = Config()
    initial_state = (
        ('width', width),
        ('target', target),
        ('time', time.time()),  # wall-clock time at construction
        ('n', 0),
        ('unit_name', unit_name),
        ('verbose', verbose),
        ('current', 0),
    )
    for key, value in initial_state:
        setattr(self.param, key, value)
    # A logger is only created when output is requested.
    if verbose:
        self.param.logger = Logger()
def __init__(self, logger=None, verbose=0, config_file=None):
    """Create an empty scheduler.

    Args:
        logger: Logger object, linora.utils.Logger() class.
            A new ``Logger()`` is created when omitted.
        verbose: Verbosity mode, 0 (silent), 1 (verbose).
        config_file: job task config file, if .py file.
            example: .py file name is schedulers_config.py, contain a dict,
            config = {'hhh':{'mode':'every_minute', 'time':50, 'function':function, 'args':[], 'kwargs':{}}}
    """
    self.config = {}

    settings = Config()
    self.params = settings
    settings.verbose = verbose
    settings.logger = Logger() if logger is None else logger
    settings.config_file = config_file

    # Dict proxy created by a multiprocessing manager.
    shared_state = multiprocessing.Manager()
    settings.tracker_dict = shared_state.dict()
    settings.runner_dict = defaultdict()
def __init__(self):
    """Initialise the plot state with empty data and default styling."""
    axis_defaults = dict(
        axis=None,
        xinvert=False,
        yinvert=False,
        xtick={},
        ytick={},
        xlabel=None,
        ylabel=None,
        xtickposition=None,
        ytickposition=None,
    )
    label_defaults = {axis: {axis: None} for axis in ('xlabel', 'ylabel')}
    # One independent dict per spine property.
    spine_defaults = {
        prop: {} for prop in ('show', 'color', 'width', 'style', 'position')
    }

    self._params = Config()
    self._params.ydata = defaultdict(defaultdict)
    self._params.theme = 'ggplot'
    self._params.figure = dict(figsize=(10, 6))
    self._params.axis = axis_defaults
    self._params.label = label_defaults
    self._params.legend = dict(loc=None)
    self._params.spine = spine_defaults
    self._params.title = dict(label=None)
    self._params.set_label = True
    self._params.colorbar = set()
class BaseSearch():
    """Random / grid hyperparameter search driven by cross-validation."""

    def __init__(self, model, hp=None, name=None, method='random'):
        """Set up a hyperparameter search for ``model``.

        Args:
            model: one of the built-in model names ('XGBClassifier',
                'XGBRegressor', 'LGBMClassifier', 'LGBMRegressor'), or any
                other object, which is stored unchanged as the model factory.
            hp: hyperparameter space. For a built-in model name it is merged
                into the default space through ``from_HyperParameters``;
                otherwise it is used as-is.
            name: search name; defaults to 'RS' for ``method='random'`` and
                'GS' for ``method='grid'``.
            method: search strategy, 'random' or 'grid'.

        Raises:
            AssertionError: the installed xgboost/lightgbm version compares
                lower than the required one.
        """
        self.params = Config()
        if name is not None:
            self.params.name = name
        elif method == 'random':
            self.params.name = 'RS'
        elif method == 'grid':
            self.params.name = 'GS'
        self.params.model_init = model
        self.params.model_name = ''
        self.params.method = method

        if model in ('XGBClassifier', 'XGBRegressor', 'LGBMClassifier', 'LGBMRegressor'):
            self.hp = model_hp(model=model, method=method)
            if hp is not None:
                self.hp.from_HyperParameters(hp)
            self.params.model_name = model
            # Resolve the estimator class by name. The previous copy-pasted
            # branches never resolved 'XGBRegressor' and mapped
            # 'LGBMClassifier' to xgb.XGBRegressor.
            if model in ('XGBClassifier', 'XGBRegressor'):
                import xgboost as xgb
                # NOTE(review): lexicographic string comparison, so '1.10.0'
                # compares lower than '1.9.0' — confirm acceptable.
                assert xgb.__version__ >= __xgboost_version__, f'xgboost version should be >={__xgboost_version__}.'
                self.params.model_init = getattr(xgb, model)
            else:
                import lightgbm as lgb
                # NOTE(review): lexicographic version comparison, see above.
                assert lgb.__version__ >= __lightgbm_version__, f'lightgbm version should be >={__lightgbm_version__}.'
                self.params.model_init = getattr(lgb, model)
        else:
            self.hp = hp
        self.best_params = dict()
        self.best_params_history = dict()

    def search(self, train_data, metrics, valid_data=None, iter_num=None, cv=3, metrics_min=True,
               speedy=True, speedy_param=(20000, 0.3), save_model_dir=None, save_model_name=None):
        """model params search method.

        Args:
            train_data: A list of (X, y, sample_weight) tuple pairs to use as train sets.
            metrics: model metrics function.
            valid_data: A list of (X, y, sample_weight) tuple pairs to use as validation sets.
            iter_num: search count.
            cv: cross validation fold.
            metrics_min: metrics value whether the smaller the better.
            speedy: whether use speedy method.
            speedy_param: if use speedy method, test_size will be set,
                test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2).
            save_model_dir: str, save model folder, only work with model='XGBClassifier' or 'XGBRegressor'.
            save_model_name: str, save model name prefix, only work with model='XGBClassifier' or 'XGBRegressor'.
        Returns:
            a best model params dict.
        Raises:
            params error.
        """
        logger = Logger(name=self.params.name)
        logger.info(f"Start hyperparameter {self.params.method} search.")
        import warnings
        warnings.filterwarnings("ignore")

        minimize = bool(metrics_min)

        def at_least_as_good(candidate, reference):
            # True when `candidate` is no worse than `reference` for the metric direction.
            return candidate <= reference if minimize else candidate >= reference

        if speedy:
            test_size = 1-round(min(speedy_param[0], len(train_data[1])*speedy_param[1])/len(train_data[1]), 2)
        if self.params.model_name == 'XGBClassifier':
            self._xgb_weight(train_data[1])
        if valid_data is not None:
            cv_score_list = []

        if self.params.method == 'grid':
            if iter_num is None:
                iter_num = self.hp.cardinality()
            else:
                iter_num = min(iter_num, self.hp.cardinality())
        if iter_num is None:
            iter_num = 100

        for i in range(1, iter_num+1):
            self.hp.update(self.best_params)
            self.params.model = self.params.model_init(**self.hp.params)

            score = []
            if speedy:
                for _ in range(cv):
                    index = train_test_split(train_data[0], train_data[1], test_size,
                                             seed=np.random.choice(range(100), 1)[0])
                    score.append(self._model_fit_predict(train_data, metrics, index, mode=1))
            else:
                index_list = kfold(train_data[0], train_data[1], n_splits=cv,
                                   seed=np.random.choice(range(100), 1)[0])
                for index in index_list:
                    score.append(self._model_fit_predict(train_data, metrics, index, mode=1))
            cv_score = np.mean(score)

            if valid_data is not None:
                # Only candidates whose CV score is at or beyond the 20% / 80%
                # position of the scores seen so far are scored on the
                # validation set; the others are skipped.
                cv_score_list.append(cv_score)
                cv_score_list.sort()
                threshold = cv_score_list[int(len(cv_score_list)*(0.2 if minimize else 0.8))]
                if at_least_as_good(cv_score, threshold):
                    cv_score = self._model_fit_predict(valid_data, metrics, index=None, mode=0)
                else:
                    logger.info(f"Model {self.params.method} search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}",
                                enter=False if i < iter_num else True)
                    continue

            if i == 1:
                scoring = cv_score
            if at_least_as_good(cv_score, scoring):
                scoring = cv_score
                self.best_params = self.hp.params.copy()
                self.best_params_history[i] = {'score': scoring, 'best_params': self.best_params.copy()}
                if self.params.model_name in ['XGBClassifier', 'XGBRegressor'] and save_model_dir is not None:
                    if save_model_name is None:
                        save_model_name = self.params.name
                    # Persist the current model and the winning parameters
                    # (these previously referenced undefined names `model`
                    # and `best_params`).
                    self.params.model.save_model(os.path.join(save_model_dir, f"{save_model_name}_model.json"))
                    with open(os.path.join(save_model_dir, f"{save_model_name}_params.json"), 'w') as f:
                        json.dump(self.best_params, f)
            logger.info(f"Model {self.params.method} search progress: {i/iter_num*100:.1f}%, best score: {scoring:.4f}",
                        enter=False if i < iter_num else True)
        logger.info(f"Model {self.params.method} search best score: {scoring:.4f}", close=True, time_mode=1)
        return self.best_params

    def _model_fit_predict(self, data, metrics, index=None, mode=1):
        """Optionally fit ``self.params.model`` and score its predictions.

        Args:
            data: (X, y) or (X, y, sample_weight).
            metrics: callable ``metrics(y_true, y_pred[, sample_weight=...])``.
            index: pair ``(train_labels, test_labels)`` used to select rows;
                None scores on the whole of ``data`` (only valid with mode=0).
            mode: truthy to fit on the training rows first, falsy to reuse
                the model as already fitted.
        Returns:
            the value returned by ``metrics``.
        """
        weighted = len(data) != 2
        if mode:
            if weighted:
                self.params.model.fit(data[0].loc[index[0]], data[1][index[0]], sample_weight=data[2][index[0]])
            else:
                self.params.model.fit(data[0].loc[index[0]], data[1][index[0]])

        if index is None:
            target = data[1]
            cv_pred = pd.Series(self.params.model.predict(data[0]), index=target.index)
            if weighted:
                return metrics(target, cv_pred, sample_weight=data[2])
            return metrics(target, cv_pred)

        target = data[1][index[1]]
        cv_pred = pd.Series(self.params.model.predict(data[0].loc[index[1]]), index=target.index)
        if weighted:
            return metrics(target, cv_pred, sample_weight=data[2][index[1]])
        return metrics(target, cv_pred)

    def _xgb_weight(self, label):
        """Register a ``scale_pos_weight`` choice derived from the label distribution."""
        weight_dict = Counter(label)
        if len(weight_dict) == 2:
            # count(smaller label) / count(larger label), rounded up.
            weight = int(np.ceil(weight_dict[min(weight_dict)]/weight_dict[max(weight_dict)]))
        else:
            # NOTE(review): after inverting to {count: label} this divides the
            # *label* of the most frequent class by the *label* of the least
            # frequent one, not the counts — confirm intent. Kept unchanged.
            weight_dict = {j: i for i, j in weight_dict.items()}
            weight = int(np.ceil(weight_dict[max(weight_dict)]/weight_dict[min(weight_dict)]))
        if self.params.method == 'grid':
            self.hp.Choice('scale_pos_weight', [1, weight], weight, rank=6)
        else:
            self.hp.Choice('scale_pos_weight', [1, weight])
class DataSet():
    """Chainable pipeline that streams batches of rows from delimited text files.

    The pipeline methods (``batch``, ``shuffle``, ``take`` ...) only record
    settings; rows are read lazily, chunk by chunk, while iterating.

    ``self._params.data`` (list of file paths), ``self._params.sep`` and
    ``self._params.header`` are read here but never assigned — presumably a
    subclass sets them; verify against the concrete dataset class.
    """

    def __init__(self):
        self._params_init()

    def _params_init(self):
        """Create ``self._params`` and populate it with the default settings."""
        self._params = Config()
        self._params.batch = 0
        self._params.batch_size = 1
        self._params.drop_remainder = False
        self._params.skip_size = None
        self._params.take_size = -1
        self._params.shuffle_size = 1
        self._params.shuffle_seed = None
        self._params.prefetch_size = 1
        self._params.sample = 0
        self._params.step = 1
        self._params.tensor_mode = 'numpy'
        self._params.options = defaultdict(dict)

    def batch(self, batch_size, drop_remainder=False):
        """Combines consecutive elements of this dataset into batches.

        Args:
            batch_size: representing the number of consecutive elements of this dataset to combine in a single batch.
            drop_remainder: representing whether the last batch should be dropped in the case it has fewer than
                            batch_size elements; the default behavior is not to drop the smaller batch.
        """
        assert 'batch' not in self._params.options, '`batch` already exists.'
        assert isinstance(batch_size, int) and batch_size>0, '`batch_size` type should be int and greater than 0.'
        self._params.batch_size = batch_size
        self._params.drop_remainder = drop_remainder
        self._params.options['batch'].update({self._params.step: {'batch_size':batch_size, 'drop_remainder':drop_remainder}})
        self._params.step += 1
        return self

    def concatenate(self, dataset):
        """Creates a Dataset by concatenating the given dataset with this dataset.

        Args:
            dataset: la.data.TextLineDataset object, Dataset to be concatenated.
        """
        # extend, not append: the file list must stay flat because each entry
        # is passed to pd.read_csv as a single path.
        self._params.data.extend(dataset._params.data)
        self._params.options['concatenate'].update({self._params.step: None})
        self._params.step += 1
        return self

    def enumerate(self, start=0):
        """Enumerates the elements of this dataset.

        Args:
            start: int, representing the start value for enumeration.
        """
        assert 'enumerate' not in self._params.options, '`enumerate` already exists.'
        self._params.enumerate = start
        self._params.options['enumerate'].update({self._params.step: {'start':start}})
        self._params.step += 1
        return self

    # NOTE: `filter`, `reduce`, `shard` and `take_while` are not implemented
    # for this dataset type; the 'take_while' guards below are kept so the
    # ordering rule is already enforced if it is added.

    def map(self, map_func):
        """Maps map_func across the elements of this dataset.

        Args:
            map_func: A function mapping a dataset element (one row) to another dataset element.
        """
        assert 'map' not in self._params.options, '`map` already exists.'
        self._params.map_func = map_func
        self._params.options['map'].update({self._params.step: {'map_func':map_func}})
        self._params.step += 1
        return self

    def options(self):
        """Returns the options for this dataset and its inputs."""
        return self._params.options

    def prefetch(self, prefetch_size):
        """Creates a Dataset that prefetches elements from this dataset.

        Args:
            prefetch_size: representing the maximum number of elements that will be buffered when prefetching.
        """
        assert 'prefetch' not in self._params.options, '`prefetch` already exists.'
        assert 'take_while' not in self._params.options, '`prefetch` must be placed in `take_while` front.'
        assert isinstance(prefetch_size, int) and prefetch_size>0, '`prefetch_size` type should be int and greater than 0.'
        self._params.prefetch_size = prefetch_size
        self._params.options['prefetch'].update({self._params.step: {'prefetch_size':prefetch_size}})
        self._params.step += 1
        return self

    def repeat(self, repeat_size):
        """Repeats this dataset by replicating its file list.

        Args:
            repeat_size: number of additional passes; the file list becomes
                ``repeat_size + 1`` copies of itself.
        """
        assert 'take_while' not in self._params.options, '`repeat` must be placed in `take_while` front.'
        assert isinstance(repeat_size, int) and repeat_size>0, '`repeat_size` type should be int and greater than 0.'
        self._params.data = self._params.data*(repeat_size+1)
        self._params.options['repeat'].update({self._params.step: {'repeat_size':repeat_size}})
        self._params.step += 1
        return self

    def shuffle(self, shuffle_size, seed=None):
        """Randomly shuffles the elements of this dataset.

        Rows are shuffled within each chunk read from disk.

        Args:
            shuffle_size: representing the number of elements from this dataset from which the new dataset will sample.
            seed: representing the random seed that will be used to create the distribution.
        """
        assert 'shuffle' not in self._params.options, '`shuffle` already exists.'
        assert 'take_while' not in self._params.options, '`shuffle` must be placed in `take_while` front.'
        assert isinstance(shuffle_size, int) and shuffle_size>-2 and shuffle_size!=0, '`shuffle_size` type should be int and greater than 0 or equal to -1.'
        self._params.shuffle_size = shuffle_size
        # Iteration reads `shuffle_seed`; store it so the seed is honoured.
        self._params.shuffle_seed = seed
        self._params.options['shuffle'].update({self._params.step: {'shuffle_size':shuffle_size, 'seed':seed}})
        self._params.step += 1
        return self

    def skip(self, skip_size):
        """Creates a Dataset that skips count elements from this dataset.

        Skip all data for the first file at most.

        Args:
            skip_size: representing the number of elements of this dataset that should be skipped to form the new dataset.
                       If count is greater than the size of this dataset, the new dataset will contain no elements.
        """
        assert 'skip' not in self._params.options, '`skip` already exists.'
        assert 'take_while' not in self._params.options, '`skip` must be placed in `take_while` front.'
        assert isinstance(skip_size, int) and skip_size>0, '`skip_size` type should be int and greater than 0.'
        self._params.skip_size = skip_size
        self._params.options['skip'].update({self._params.step: {'skip_size':skip_size}})
        self._params.step += 1
        return self

    def take(self, take_size):
        """Creates a Dataset with at most count elements from this dataset.

        Args:
            take_size: representing the number of elements of this dataset that should be taken to form the new dataset.
                       If count is -1, or if count is greater than the size of this dataset,
                       the new dataset will contain all elements of this dataset.
        """
        assert 'take' not in self._params.options, '`take` already exists.'
        assert 'take_while' not in self._params.options, '`take` must be placed in `take_while` front.'
        assert isinstance(take_size, int) and take_size>-2 and take_size!=0, '`take_size` type should be int and greater than 0 or equal to -1.'
        self._params.take_size = take_size
        self._params.options['take'].update({self._params.step: {'take_size':take_size}})
        self._params.step += 1
        return self

    def to_tensor(self, mode='tf'):
        """Transform data from numpy array to tensor.

        Args:
            mode: Deep learning framework name, one of ['tf', 'pytorch', 'paddle', 'mxnet', 'mindspore'].
        Raises:
            ValueError: ``mode`` is not a supported framework name.
        """
        assert 'to_tensor' not in self._params.options, '`to_tensor` already exists.'
        assert 'take_while' not in self._params.options, '`to_tensor` must be placed in `take_while` front.'
        # Frameworks are imported lazily so only the requested one is needed.
        if mode in ['tf', 'tensorflow']:
            from tensorflow import convert_to_tensor
            self._params.framework = convert_to_tensor
        elif mode in ['pytorch', 'torch']:
            from torch import as_tensor
            self._params.framework = as_tensor
        elif mode in ['paddle', 'paddlepaddle']:
            from paddle import to_tensor
            self._params.framework = to_tensor
        elif mode in ['mx', 'mxnet']:
            from mxnet.ndarray import array
            self._params.framework = array
        elif mode in ['mindspore']:
            from mindspore.numpy import array
            self._params.framework = array
        else:
            raise ValueError('`mode` value error.')
        self._params.tensor_mode = mode
        self._params.options['to_tensor'].update({self._params.step: {'mode':mode}})
        self._params.step += 1
        return self

    def _to_tensor(self, data):
        """Convert a numpy batch with the selected framework ('numpy' is a no-op)."""
        if self._params.tensor_mode=='numpy':
            return data
        return self._params.framework(data)

    def _load_chunk(self):
        """Read the next chunk of rows into ``self._params.values``.

        Moves on to the next file when the current reader is exhausted.

        Raises:
            StopIteration: every file has been read completely.
        """
        params = self._params
        while True:
            try:
                chunk = params.df.get_chunk(params.shuffle_size)
                break
            except StopIteration:
                if params.batch_file >= len(params.data):
                    params.exhausted = True
                    raise
                # `skiprows` is deliberately not applied to follow-up files.
                params.df = pd.read_csv(params.data[params.batch_file], sep=params.sep,
                                        header=params.header, iterator=True)
                params.batch_file += 1
        if 'shuffle' in params.options:
            chunk = chunk.sample(frac=1, random_state=params.shuffle_seed)
        # Positional slicing in __next__ relies on a fresh 0..n-1 index.
        params.values = chunk.reset_index(drop=True)

    def __iter__(self):
        """Start (or restart) iteration from the first file."""
        params = self._params
        # The chunk size is the shuffle/prefetch window rounded up to a whole
        # number of batches, so only the last chunk of a file can end with a
        # partial batch.
        window = max(params.shuffle_size, params.prefetch_size, 1)
        params.shuffle_size = int(np.ceil(window/params.batch_size))*params.batch_size
        # Reset the cursors so the dataset can be iterated more than once.
        params.batch_file = 0
        params.batch_index = 0
        params.sample = 0
        params.exhausted = False
        params.df = pd.read_csv(params.data[params.batch_file], sep=params.sep, iterator=True,
                                header=params.header, skiprows=params.skip_size)
        params.batch_file += 1
        try:
            self._load_chunk()
        except StopIteration:
            # No rows at all: the first __next__ call ends the iteration.
            params.values = pd.DataFrame()
        return self

    def __next__(self):
        """Return the next batch as an array (or framework tensor).

        A trailing batch shorter than ``batch_size`` (at the end of a file) is
        returned unless ``drop_remainder`` was requested.

        Raises:
            StopIteration: the data is exhausted or ``take_size`` rows were returned.
        """
        params = self._params
        if params.exhausted:
            raise StopIteration
        size = params.batch_size
        while True:
            start = size*params.batch_index
            # iloc is end-exclusive; the former `.loc[a:b]` was end-inclusive
            # and returned batch_size + 1 rows overlapping the next batch.
            values = params.values.iloc[start:start+size]
            if len(values) == size:
                break
            if len(values) > 0 and not params.drop_remainder:
                break
            self._load_chunk()
            params.batch_index = 0
        params.batch += 1
        params.batch_index += 1
        if params.take_size > 0:
            remaining = params.take_size-params.sample
            if remaining <= 0:
                params.exhausted = True
                raise StopIteration
            # Trim the final batch so no more than `take_size` rows are returned.
            values = values.iloc[:remaining]
            params.sample += len(values)
        if 'map' in params.options:
            return self._to_tensor(values.apply(params.map_func, axis=1).values)
        return self._to_tensor(values.values)
import matplotlib.pyplot as plt from linora.utils._config import Config __all__ = ['Options'] Options = Config() Options.cmap = Config(**{'viridis':'viridis', 'jet':'jet'}) Options.dash_capstyle = Config(**{'butt': 'butt', 'projecting': 'projecting', 'round': 'round'}) Options.dash_joinstyle = Config(**{'miter': 'miter', 'round': 'round', 'bevel': 'bevel'}) Options.linelink = Config(**{'steps':'steps', 'steps_pre':'steps-pre', 'steps_mid':'steps-mid', 'steps_post':'steps-post'}) Options.linestyle = Config(**{'solid':'-', 'dashed':'--', 'dashdot':'-.', 'dotted':':'}) Options.fontsize = Config(**{ 'small_xx':'xx_small', 'small_x':'x-small', 'small':'small', 'medium':'medium', 'large':'large', 'large_x':'x-large', 'large_xx':'xx-large'}) Options.fontweight = Config(**{ 'book': 'book', 'normal': 'normal', 'bold': 'bold', 'demi': 'demi', 'semibold': 'semibold', 'roman': 'roman', 'black': 'black', 'extra bold': 'extra bold', 'light': 'light', 'regular': 'regular',
class DataSet():
    """Chainable, index-based dataset pipeline.

    All state lives on ``self._params``:

    * ``data``       maps a storage key to the raw data (an array, or a list
                     of equally long arrays).
    * ``index``      maps a dataset name to the list of row positions that
                     the dataset currently exposes.
    * ``index_data`` maps a dataset name to the storage key it reads from.
    * ``mode`` / ``mode1`` are the active dataset name and its storage key.
    * ``batch``      maps a dataset name to ``[batch_size, drop_remainder,
                     batch_counter]``.
    * ``options``    records every transformation, keyed by a running step.

    Transformations only rewrite ``index``; the stored data is left alone
    except by ``concatenate`` and ``unbatch``.

    NOTE(review): ``__iter__`` dispatches to ``_batch``, ``_batch_map``,
    ``_batch_list`` and ``_batch_list_map``, and several methods read
    ``self._params.data_mode``; none of these are set up by ``__init__`` in
    this class -- presumably subclasses provide them. Confirm.
    """

    def __init__(self):
        self._params_init()

    def _params_init(self):
        """Create the parameter container with its default values."""
        self._params = Config()
        self._params.step = 1
        self._params.tensor = 'numpy'
        self._params.mode = 'total'
        self._params.mode1 = 'total'
        self._params.index_data = defaultdict()
        self._params.index_data['total'] = 'total'
        self._params.data_from = 'tensor'
        self._params.data = defaultdict()
        self._params.index = defaultdict(list)
        self._params.map = defaultdict(list)
        self._params.batch = defaultdict(list)
        # [batch_size, drop_remainder, batch_counter]; a batch_size of 0 means
        # "not batched" (see __next__).
        self._params.batch[self._params.mode] = [0, False, 0]
        self._params.enumerate = defaultdict(int)
        self._params.options = defaultdict(dict)

    def batch(self, batch_size, drop_remainder=False):
        """Combines consecutive elements of this dataset into batches.

        Args:
            batch_size: representing the number of consecutive elements of
                this dataset to combine in a single batch.
            drop_remainder: representing whether the last batch should be
                dropped in the case it has fewer than batch_size elements;
                the default behavior is not to drop the smaller batch.

        Returns:
            self, to allow chaining.
        """
        assert isinstance(
            batch_size, int
        ) and batch_size > 0, '`batch_size` type should be int and greater than 0.'
        self._params.batch[self._params.mode][0] = batch_size
        self._params.batch[self._params.mode][1] = drop_remainder
        self._params.options['batch'].update({
            self._params.step: {
                'batch_size': batch_size,
                'drop_remainder': drop_remainder
            }
        })
        self._params.step += 1
        return self

    def cardinality(self):
        """Returns the number of index entries of the active dataset."""
        return len(self._params.index[self._params.mode])

    def concatenate(self, datasets):
        """Creates a Dataset by concatenating the given dataset with this dataset.

        Args:
            datasets: la.data.Dataset or list of la.data.Dataset to be
                concatenated.

        Returns:
            self, to allow chaining.
        """
        assert 'take_while' not in self._params.options, '`concatenate` must be placed in `take_while` front.'
        if not isinstance(datasets, list):
            self._concatenate(datasets)
        else:
            # Validate every dataset first so a mismatch leaves self untouched.
            for dataset in datasets:
                assert self._params.data_mode == dataset._params.data_mode, 'The data types of the two data sets are inconsistent.'
            for dataset in datasets:
                self._concatenate(dataset)
        self._params.options['concatenate'].update({self._params.step: None})
        self._params.step += 1
        return self

    def _concatenate(self, dataset):
        """Append the active data and index of ``dataset`` to this dataset."""
        own = self._params.data[self._params.mode1]
        # The appended indices below are positions in the other dataset's own
        # active storage, so that is the data which must be appended.
        other = dataset._params.data[dataset._params.mode1]
        is_list = 'list' in self._params.data_mode
        # Offset for the appended indices: current number of stored rows.
        offset = len(own[0]) if is_list else len(own)
        if is_list:
            assert len(own) == len(
                other), 'Width needs to be consistent between data.'
            self._params.data[self._params.mode1] = [
                np.concatenate([own[i], other[i]]) for i in range(len(own))
            ]
        else:
            self._params.data[self._params.mode1] = np.concatenate(
                [own, other])
        self._params.index[self._params.mode] += [
            i + offset for i in dataset._params.index[dataset._params.mode]
        ]

    def drop(self, names):
        """Drop datasets by name.

        Args:
            names: str or list, drop dataset name.

        Returns:
            self, to allow chaining.
        """
        if isinstance(names, str):
            names = [names]
        for name in names:
            assert name != 'total', "`name` can't be 'total'."
        for name in names:
            if name in self._params.index_data:
                self._params.index_data.pop(name)
            if name in self._params.data:
                if name in [j for i, j in self._params.index_data.items()]:
                    # Other datasets still read this storage: keep it under a
                    # timestamp-derived key and re-point them to it.
                    name1 = str(time.time()).split('.')[0]
                    self._params.data[name1] = self._params.data.pop(name)
                    for i, j in self._params.index_data.items():
                        if name == j:
                            self._params.index_data[i] = name1
                else:
                    self._params.data.pop(name)
            if name in self._params.index:
                self._params.index.pop(name)
            if name in self._params.map:
                self._params.map.pop(name)
            if name in self._params.batch:
                self._params.batch.pop(name)
            if name in self._params.enumerate:
                self._params.enumerate.pop(name)
            if self._params.mode == name:
                self._params.mode = 'total'
                self._params.mode1 = 'total'
        # Remove storage that no dataset references any more.
        for i in list(self._params.data):
            if i not in [j for k, j in self._params.index_data.items()]:
                self._params.data.pop(i)
        return self

    def enumerate(self, start=0):
        """Enumerates the elements of this dataset.

        Args:
            start: int, representing the start value for enumeration.

        Returns:
            self, to allow chaining.
        """
        self._params.enumerate[self._params.mode] = start
        self._params.options['enumerate'].update(
            {self._params.step: {
                'start': start
            }})
        self._params.step += 1
        return self

    def filter(self, filter_func):
        """A transformation that filter dataset based on a filter_func.

        Args:
            filter_func: A function that return True or False, datasets that
                are kept as True. For list-style data it receives one row as
                a list holding the i-th element of every stored array.

        Returns:
            self, to allow chaining.
        """
        data = self._params.data[self._params.mode1]
        # data_mode is 'list_array' / 'list_image' for list-style data, so
        # test by substring like the rest of the class does.
        if 'list' in self._params.data_mode:
            filter_list = [
                i for i in range(len(data[0]))
                if filter_func([j[i] for j in data])
            ]
        else:
            filter_list = [r for r, i in enumerate(data) if filter_func(i)]
        # NOTE(review): when no element passes, the index is left unchanged
        # rather than emptied; kept as-is because callers may rely on it.
        if filter_list:
            keep = set(filter_list)
            self._params.index[self._params.mode] = [
                i for i in self._params.index[self._params.mode] if i in keep
            ]
        self._params.options['filter'].update(
            {self._params.step: {
                'filter_func': filter_func
            }})
        self._params.step += 1
        return self

    def get(self, name):
        """Select current dataset.

        Args:
            name: split dataset name.

        Returns:
            self, to allow chaining.
        """
        assert name in self._params.index, '`name` not in split dataset.'
        # A counter of -1 is the marker __next__ leaves after serving an
        # unbatched dataset in one piece; reset it so it can be served again.
        if self._params.batch[name][2] == -1:
            self._params.batch[name][0] = 0
            self._params.batch[name][2] = 0
        self._params.mode = name
        self._params.mode1 = self._params.index_data[name]
        # Iterate over a snapshot of the keys: popping from the dict while
        # iterating it directly raises RuntimeError.
        for i in list(self._params.data):
            if i not in [j for k, j in self._params.index_data.items()]:
                self._params.data.pop(i)
        return self

    def join(self, join_dict, drop_exist_dataset=True):
        """Join Dataset.

        Args:
            join_dict: dict, {name: Dataset},
                eg.{'train':la.data.Dataset.from_tensor()}.
            drop_exist_dataset: bool, If the name of the dataset is repeated,
                drop self exist dataset. When False, an already existing
                name is left untouched.

        Returns:
            self, to allow chaining.
        """
        for name in join_dict:
            assert name != 'total', "`name` can't be 'total'."
        for name in join_dict:
            if name in self._params.index:
                if drop_exist_dataset:
                    self.drop(name)
                    self._join(name, join_dict)
            else:
                self._join(name, join_dict)
        return self

    def _join(self, name, join_dict):
        """Copy the active dataset of ``join_dict[name]`` into self as ``name``."""
        source = join_dict[name]._params
        self._params.data[name] = source.data[source.mode1].copy()
        self._params.index[name] = source.index[source.mode].copy()
        # `map` is a defaultdict: reading a missing key would register an
        # empty map on both sides and make __iter__ take the map code path,
        # so only copy a map that actually exists.
        if source.mode in source.map:
            self._params.map[name] = source.map[source.mode].copy()
        self._params.batch[name] = source.batch[source.mode].copy()
        if source.mode in source.enumerate:
            # The enumerate counter is a plain int; it has no .copy().
            self._params.enumerate[name] = source.enumerate[source.mode]
        self._params.index_data[name] = name

    def list_names(self):
        """list datasets name."""
        return [i for i in self._params.index]

    def map(self, map_func, map_size=8):
        """Maps map_func across the elements of this dataset.

        Args:
            map_func: A function mapping a dataset element to another
                dataset element.
            map_size: representing the number elements to process
                asynchronously in parallel.

        Returns:
            self, to allow chaining.
        """
        assert isinstance(
            map_size, int
        ) and map_size > 0, '`map_size` type should be int and greater than 0.'
        self._params.map[self._params.mode] = [map_func, map_size]
        self._params.options['map'].update(
            {self._params.step: {
                'map_func': map_func,
                'map_size': map_size
            }})
        self._params.step += 1
        return self

    def options(self):
        """Returns the options for this dataset and its inputs."""
        return self._params.options

    def prefetch(self, prefetch_size):
        """Creates a Dataset that prefetches elements from this dataset.

        Only validates and records the option; no other state is changed
        here.

        Args:
            prefetch_size: representing the maximum number of elements that
                will be buffered when prefetching.

        Returns:
            self, to allow chaining.
        """
        assert 'take_while' not in self._params.options, '`prefetch` must be placed in `take_while` front.'
        assert isinstance(
            prefetch_size, int
        ) and prefetch_size > 0, '`prefetch_size` type should be int and greater than 0.'
        self._params.options['prefetch'].update(
            {self._params.step: {
                'prefetch_size': prefetch_size
            }})
        self._params.step += 1
        return self

    def reduce(self, reduce_func):
        """Reduces the input dataset to a single element.

        Args:
            reduce_func: A function that maps to new_state. It must take two
                arguments and return a new element

        Returns:
            The reduced value, or for list-style data a list holding one
            reduced value per stored array.
        """
        data = self._params.data[self._params.mode1]
        index = self._params.index[self._params.mode]
        # Substring test: list-style data_mode values are 'list_array' and
        # 'list_image'.
        if 'list' in self._params.data_mode:
            return [functools.reduce(reduce_func, i[index]) for i in data]
        return functools.reduce(reduce_func, data[index])

    def rename(self, name_dict):
        """Rename current dataset.

        Args:
            name_dict: rename dataset name dict, eg.{'train':'train_set'}.

        Returns:
            self, to allow chaining.
        """
        for name in name_dict:
            assert name != 'total', "`name` can't be 'total'."
            assert name_dict[name] != 'total', "`name` can't be 'total'."
            assert name in self._params.index, "name not exist."
            assert name_dict[
                name] not in self._params.index, "name already exist."
        for name in name_dict:
            new_name = name_dict[name]
            if name in self._params.data:
                self._params.data[new_name] = self._params.data.pop(name)
            if name in self._params.index:
                self._params.index[new_name] = self._params.index.pop(name)
            if name in self._params.index_data:
                self._params.index_data[
                    new_name] = self._params.index_data.pop(name)
            # Re-point every dataset that read from the renamed storage.
            # The value must stay a string: it is used as a dict key.
            for i, j in self._params.index_data.items():
                if name == j:
                    self._params.index_data[i] = new_name
            if name in self._params.map:
                self._params.map[new_name] = self._params.map.pop(name)
            if name in self._params.batch:
                self._params.batch[new_name] = self._params.batch.pop(name)
            if name in self._params.enumerate:
                self._params.enumerate[
                    new_name] = self._params.enumerate.pop(name)
            if self._params.mode == name:
                self._params.mode = new_name
                self._params.mode1 = self._params.index_data[
                    self._params.mode]
        return self

    def repeat(self, repeat_size):
        """Repeats this dataset.

        The index is replicated ``repeat_size + 1`` times, i.e. the original
        pass plus ``repeat_size`` additional passes.

        Args:
            repeat_size: representing the number of times the dataset should
                be repeated.

        Returns:
            self, to allow chaining.
        """
        assert 'take_while' not in self._params.options, '`repeat` must be placed in `take_while` front.'
        assert isinstance(
            repeat_size, int
        ) and repeat_size > 0, '`repeat_size` type should be int and greater than 0.'
        self._params.index[self._params.mode] = self._params.index[
            self._params.mode] * (repeat_size + 1)
        self._params.options['repeat'].update(
            {self._params.step: {
                'repeat_size': repeat_size
            }})
        self._params.step += 1
        return self

    def shard(self, shard_size, shard_index):
        """Creates a Dataset that includes only 1/num_shards of this dataset.

        Keeps every ``shard_size``-th index entry, starting at position
        ``shard_index``.

        Args:
            shard_size: representing the number of shards operating in
                parallel.
            shard_index: representing the worker index.

        Returns:
            self, to allow chaining.
        """
        assert 'take_while' not in self._params.options, '`shard` must be placed in `take_while` front.'
        assert isinstance(
            shard_size, int
        ) and shard_size > 0, '`shard_size` type should be int and greater than 0.'
        assert isinstance(
            shard_index, int
        ) and shard_index >= 0, '`shard_index` type should be int and greater than or equal to 0.'
        self._params.index[self._params.mode] = [
            self._params.index[self._params.mode][i] for i in range(
                shard_index, len(self._params.index[self._params.mode]),
                shard_size)
        ]
        self._params.options['shard'].update({
            self._params.step: {
                'shard_size': shard_size,
                'shard_index': shard_index
            }
        })
        self._params.step += 1
        return self

    def shuffle(self, shuffle_size, seed=None):
        """Randomly shuffles the elements of this dataset.

        Args:
            shuffle_size: representing the number of elements from this
                dataset from which the new dataset will sample. A positive
                value shuffles within consecutive windows of that size; -1
                shuffles the whole index at once.
            seed: representing the random seed that will be used to create
                the distribution.

        Returns:
            self, to allow chaining.
        """
        assert 'take_while' not in self._params.options, '`shuffle` must be placed in `take_while` front.'
        assert isinstance(
            shuffle_size, int
        ) and shuffle_size > -2 and shuffle_size != 0, '`shuffle_size` type should be int and greater than 0 or equal to -1.'
        if isinstance(self._params.index[self._params.mode], list):
            self._params.index[self._params.mode] = pd.Series(
                index=self._params.index[self._params.mode], data=1).index
        if shuffle_size > 0:
            index = self._params.index[self._params.mode]
            chunks = [
                index[shuffle_size * i:shuffle_size * (i + 1)].to_list()
                for i in range(len(index) // shuffle_size + 1)
            ]

            def rand():
                # Same value source the code used to pass to random.shuffle.
                base = seed if seed is not None else random.randint(1, 99)
                return (base +
                        self._params.batch[self._params.mode][2]) % 10 / 10

            # random.shuffle() lost its `random=` argument in Python 3.11.
            # This is the swap loop it ran for a custom source, so results
            # are unchanged on interpreters that still accepted it.
            for chunk in chunks:
                for pos in reversed(range(1, len(chunk))):
                    swap = int(rand() * (pos + 1))
                    chunk[pos], chunk[swap] = chunk[swap], chunk[pos]
            self._params.index[self._params.mode] = list(
                itertools.chain.from_iterable(chunks))
        else:
            self._params.index[self._params.mode] = self._params.index[
                self._params.mode].to_series().sample(
                    frac=1, random_state=seed).tolist()
        self._params.options['shuffle'].update(
            {self._params.step: {
                'shuffle_size': shuffle_size,
                'seed': seed
            }})
        self._params.step += 1
        return self

    def skip(self, skip_size):
        """Creates a Dataset that skips count elements from this dataset.

        Args:
            skip_size: representing the number of elements of this dataset
                that should be skipped to form the new dataset. If count is
                greater than the size of this dataset, the new dataset will
                contain no elements.

        Returns:
            self, to allow chaining.
        """
        assert 'take_while' not in self._params.options, '`skip` must be placed in `take_while` front.'
        assert isinstance(
            skip_size, int
        ) and skip_size > 0, '`skip_size` type should be int and greater than 0.'
        self._params.index[self._params.mode] = self._params.index[
            self._params.mode][skip_size:]
        self._params.options['skip'].update(
            {self._params.step: {
                'skip_size': skip_size
            }})
        self._params.step += 1
        return self

    def split(self, split_dict, shuffle=True, seed=None):
        """Split Dataset.

        Args:
            split_dict: dict, {data_name:data_rate},
                eg.{'train':0.7, 'test':0.3}. Rates are normalised by their
                sum.
            shuffle: whether randomly shuffles the elements of this dataset.
            seed: random seed.

        Returns:
            self, to allow chaining.
        """
        for i in split_dict:
            assert i not in self._params.index, f"`{i}` has exist."
            assert i != 'total', "`split_dict` key can't be 'total'."
        t = sum(split_dict[i] for i in split_dict)
        t = {i: split_dict[i] / t for i in split_dict}
        if self._params.data_from in ['from_folder', 'from_class_folder']:
            if isinstance(self._params.data[self._params.mode1], list):
                # Split separately per distinct value of the second stored
                # array (presumably the class label -- confirm), so every
                # part receives its share of each value.
                label = self._params.data[self._params.mode1][1][
                    self._params.index[self._params.mode]]
                index = np.array(self._params.index[self._params.mode])
                for i in np.unique(label):
                    index1 = index[label == i].tolist()
                    n = 0
                    for j in t:
                        self._params.index[j] += index1[n:n + int(t[j] * len(index1))]
                        n += int(t[j] * len(index1))
                if shuffle:
                    for i in t:
                        self._params.index[i] = pd.Series(
                            self._params.index[i]).sample(
                                frac=1, random_state=seed).tolist()
            else:
                self._split(t, shuffle, seed)
        else:
            self._split(t, shuffle, seed)
        for i in split_dict:
            self._params.batch[i] = [0, False, 0]
            # The new parts share the storage of the dataset they came from.
            self._params.index_data[i] = self._params.mode1
        return self

    def _split(self, t, shuffle, seed):
        """Distribute the active index over the names in ``t`` by their rates."""
        if shuffle:
            index = pd.Series(self._params.index[self._params.mode]).sample(
                frac=1, random_state=seed).tolist()
        else:
            index = self._params.index[self._params.mode]
        n = 0
        for i in t:
            self._params.index[i] += index[n:n + int(t[i] * len(index))]
            n += int(t[i] * len(index))

    def take(self, take_size):
        """Creates a Dataset with at most count elements from this dataset.

        Args:
            take_size: representing the number of elements of this dataset
                that should be taken to form the new dataset. If count is
                -1, or if count is greater than the size of this dataset,
                the new dataset will contain all elements of this dataset.

        Returns:
            self, to allow chaining.
        """
        assert 'take_while' not in self._params.options, '`take` must be placed in `take_while` front.'
        assert isinstance(
            take_size, int
        ) and take_size > -2 and take_size != 0, '`take_size` type should be int and greater than 0 or equal to -1.'
        if take_size != -1:
            self._params.index[self._params.mode] = self._params.index[
                self._params.mode][:take_size]
        self._params.options['take'].update(
            {self._params.step: {
                'take_size': take_size
            }})
        self._params.step += 1
        return self

    def take_while(self, take_func):
        """A transformation that stops dataset iteration based on a take_func.

        The index is cut just before the first element for which
        ``take_func`` returns a true value.

        Args:
            take_func: A function that return True or False

        Returns:
            self, to allow chaining.
        """
        temp = set()
        # Only scan up to the last first-occurrence position, so repeated
        # index entries are not evaluated again.
        # NOTE(review): the bound is len(data[mode1]), which for list-style
        # data is the number of stored arrays rather than of rows, and
        # `.index(i)` raises ValueError when a row was already removed from
        # the index. Left unchanged pending confirmation of the intent.
        index = self._params.index[self._params.mode][:max([
            self._params.index[self._params.mode].index(i)
            for i in range(len(self._params.data[self._params.mode1]))
        ]) + 1]
        for r, i in enumerate(index):
            if i in temp:
                continue
            temp.add(i)
            if 'list' in self._params.data_mode:
                if take_func(
                    [j[i] for j in self._params.data[self._params.mode1]]):
                    self._params.index[self._params.mode] = self._params.index[
                        self._params.mode][:r]
                    break
            else:
                if take_func(self._params.data[self._params.mode1][i]):
                    self._params.index[self._params.mode] = self._params.index[
                        self._params.mode][:r]
                    break
        self._params.options['take_while'].update(
            {self._params.step: {
                'take_func': take_func
            }})
        self._params.step += 1
        return self

    def to_tensor(self, mode='tf'):
        """Transform data from numpy array to tensor.

        Args:
            mode: Deep learning framework name, one of
                ['tf', 'pytorch', 'paddle', 'mxnet', 'mindspore'].

        Returns:
            self, to allow chaining.

        Raises:
            ValueError: if ``mode`` is not a recognised framework name.
        """
        assert 'to_tensor' not in self._params.options, '`to_tensor` already exists.'
        assert 'take_while' not in self._params.options, '`to_tensor` must be placed in `take_while` front.'
        # Frameworks are imported lazily so only the selected one is needed.
        if mode in ['tf', 'tensorflow']:
            from tensorflow import convert_to_tensor
            self._params.framework = convert_to_tensor
        elif mode in ['pytorch', 'torch']:
            from torch import as_tensor
            self._params.framework = as_tensor
        elif mode in ['paddle', 'paddlepaddle']:
            from paddle import to_tensor
            self._params.framework = to_tensor
        elif mode in ['mx', 'mxnet']:
            from mxnet.ndarray import array
            self._params.framework = array
        elif mode in ['mindspore']:
            from mindspore.numpy import array
            self._params.framework = array
        else:
            raise ValueError('`mode` value error.')
        self._params.tensor = mode
        self._params.options['to_tensor'].update(
            {self._params.step: {
                'mode': mode
            }})
        self._params.step += 1
        return self

    def unbatch(self):
        """Splits elements of a dataset into multiple elements."""
        assert not isinstance(self._params.data[self._params.mode1],
                              list), 'Input data cannot be a tuple.'
        assert self._params.mode == 'total', f'{self._params.mode} dataset not supported.'
        self._params.data[self._params.mode1] = np.array(
            list(
                itertools.chain.from_iterable(
                    self._params.data[self._params.mode1])))
        # The flattened data gets a fresh 0..n-1 index.
        self._params.index[self._params.mode] = list(
            range(len(self._params.data[self._params.mode1])))
        return self

    def unique(self):
        """Return the distinct values among the active dataset's elements.

        Returns:
            An array of unique values, or for list-style data a tuple with
            one such array per stored array.
        """
        data = self._params.data[self._params.mode1]
        index = self._params.index[self._params.mode]
        if isinstance(data, list):
            # Select the active rows from each array; the enclosing Python
            # list cannot itself be indexed with a list of positions.
            return tuple([np.unique(i[index]) for i in data])
        else:
            return np.unique(data[index])

    def _to_tensor(self, data):
        """Convert ``data`` with the selected framework, or pass it through."""
        if self._params.tensor == 'numpy':
            return data
        return self._params.framework(data)

    def _data_mode(self):
        """Classify the active data as array/image, single or list-style."""
        self._params.data_mode = 'list_array' if isinstance(
            self._params.data[self._params.mode1], list) else 'array'
        # Inspect the first element (of every array for list-style data).
        if isinstance(self._params.data[self._params.mode1], list):
            t = [i[0] for i in self._params.data[self._params.mode1]]
        else:
            t = self._params.data[self._params.mode1][0]
        if isinstance(t, str):
            if isfile(t):
                if t.split('.')[-1] in [
                        'png', 'jpg', 'jpeg', 'bmp', 'rgb', 'tif', 'tiff',
                        'webp'
                ]:
                    self._params.data_mode = 'image'
        elif isinstance(t, list):
            for i in t:
                if isinstance(i, str):
                    if isfile(i):
                        if i.split('.')[-1] in [
                                'png', 'jpg', 'jpeg', 'bmp', 'rgb', 'tif',
                                'tiff', 'webp'
                        ]:
                            self._params.data_mode = 'list_image'

    def __iter__(self):
        """Select the batch builder matching the data layout and map state."""
        if 'list' in self._params.data_mode:
            if self._params.mode in self._params.map:
                self._batch_func = self._batch_list_map
            else:
                self._batch_func = self._batch_list
        elif self._params.mode in self._params.map:
            self._batch_func = self._batch_map
        else:
            self._batch_func = self._batch
        return self

    def __next__(self):
        """Return the next batch, prefixed by a counter when enumerating.

        Raises:
            StopIteration: when the index is exhausted, or when the last
                batch is short and ``drop_remainder`` is set.
        """
        mode = self._params.mode
        state = self._params.batch[mode]
        if state[0] == 0:
            # Not batched: serve the whole index once. Setting size 1 and
            # counter -1 makes the next call slice [-1:0], which is empty
            # and therefore ends the iteration.
            state[0] = 1
            state[2] = -1
            if mode in self._params.enumerate:
                self._params.enumerate[mode] += 1
                return (self._params.enumerate[mode] - 1,
                        self._to_tensor(
                            self._batch_func(self._params.index[mode])))
            return self._to_tensor(self._batch_func(self._params.index[mode]))
        loc = self._params.index[mode][state[0] * state[2]:state[0] *
                                       (state[2] + 1)]
        if len(loc) == 0:
            raise StopIteration
        elif len(loc) < state[0]:
            if state[1]:
                raise StopIteration
        state[2] += 1
        if mode in self._params.enumerate:
            self._params.enumerate[mode] += 1
            return (self._params.enumerate[mode] - 1,
                    self._to_tensor(self._batch_func(loc)))
        return self._to_tensor(self._batch_func(loc))