class MetaDatasetDS(MetaTaskDataset):
    def __init__(
        self,
        *,
        task_tpl: Union[dict, list],
        step: int,
        trunc_days: int = None,
        rolling_ext_days: int = 0,
        exp_name: Union[str, InternalData],
        segments: Union[Dict[Text, Tuple], float],
        hist_step_n: int = 10,
        task_mode: str = MetaTask.PROC_MODE_FULL,
        fill_method: str = "max",
    ):
        """
        A dataset for meta models.

        Parameters
        ----------
        task_tpl : Union[dict, list]
            Decides which tasks are used.

            - dict : a task template; the prepared tasks are generated from it with `step`, `trunc_days` and `RollingGen`
            - list : a list of tasks used directly; it is supposed to be sorted along the timeline
        step : int
            the rolling step
        trunc_days : int
            number of days to be truncated based on the test start
        rolling_ext_days : int
            sometimes users want to train meta models for a longer test period but with a smaller rolling step to get more task samples.
            The total length of the test periods will be `step + rolling_ext_days`.
        exp_name : Union[str, InternalData]
            Decides which meta_info is used for prediction.

            - str : the name of the experiment that stores the performance of the data
            - InternalData : a prepared InternalData instance
        segments : Union[Dict[Text, Tuple], float]
            the segments to divide the data; both ends are included.
            If segments is a float, it represents the fraction of data used for training.
        hist_step_n : int
            length of historical steps for the meta information
        task_mode : str
            Please refer to the docs of MetaTask
        """
        super().__init__(segments=segments)
        if isinstance(exp_name, InternalData):
            self.internal_data = exp_name
        else:
            self.internal_data = InternalData(task_tpl, step=step, exp_name=exp_name)
            self.internal_data.setup()
        self.task_tpl = deepcopy(task_tpl)  # FIXME: if the handler is shared, how to avoid the explosion of the memory?
        self.trunc_days = trunc_days
        self.hist_step_n = hist_step_n
        self.step = step

        if isinstance(task_tpl, dict):
            rg = RollingGen(
                step=step, trunc_days=trunc_days, task_copy_func=deepcopy_basic_type
            )  # NOTE: trunc_days is very important!
            task_iter = rg(task_tpl)
            if rolling_ext_days > 0:
                self.ta = TimeAdjuster(future=True)
                for t in task_iter:
                    t["dataset"]["kwargs"]["segments"]["test"] = self.ta.shift(
                        t["dataset"]["kwargs"]["segments"]["test"],
                        step=rolling_ext_days,
                        rtype=RollingGen.ROLL_EX,
                    )
            if task_mode == MetaTask.PROC_MODE_FULL:
                # Only pre-initialize the tasks when the full task is required:
                # initialize the handler once and share it.
                init_task_handler(task_tpl)
        else:
            assert isinstance(task_tpl, list)
            task_iter = task_tpl

        self.task_list = []
        self.meta_task_l = []
        logger = get_module_logger("MetaDatasetDS")
        logger.info(f"Example task for training meta model: {task_iter[0]}")
        for t in tqdm(task_iter, desc="creating meta tasks"):
            try:
                self.meta_task_l.append(
                    MetaTaskDS(t, meta_info=self._prepare_meta_ipt(t), mode=task_mode, fill_method=fill_method)
                )
                self.task_list.append(t)
            except ValueError as e:
                logger.warning(f"ValueError: {e}")
        assert len(self.meta_task_l) > 0, "No meta tasks found. Please check the data and settings."

    def _prepare_meta_ipt(self, task):
        ic_df = self.internal_data.data_ic_df

        segs = task["dataset"]["kwargs"]["segments"]
        end = max([segs[k][1] for k in ("train", "valid") if k in segs])
        ic_df_avail = ic_df.loc[:end, pd.IndexSlice[:, :end]]

        # the meta dataset focuses on the **information** instead of the preprocessing
        # 1) filter out the future information
        def mask_future(s):
            """mask future information"""
            # from qlib.utils import get_date_by_shift
            start, end = s.name
            end = get_date_by_shift(trading_date=end, shift=self.trunc_days - 1, future=True)
            return s.mask((s.index >= start) & (s.index <= end))

        ic_df_avail = ic_df_avail.apply(mask_future)  # apply to each column

        # 2) truncate overly long history: keep only the last `step * hist_step_n` rows
        total_len = self.step * self.hist_step_n
        if ic_df_avail.shape[0] >= total_len:
            return ic_df_avail.iloc[-total_len:]
        else:
            raise ValueError("the history of the distribution data is not long enough.")

    def _prepare_seg(self, segment: Text) -> List[MetaTask]:
        if isinstance(self.segments, float):
            train_task_n = int(len(self.meta_task_l) * self.segments)
            if segment == "train":
                return self.meta_task_l[:train_task_n]
            elif segment == "test":
                return self.meta_task_l[train_task_n:]
            else:
                raise NotImplementedError(f"segment {segment!r} is not supported")
        else:
            raise NotImplementedError(f"segments of type {type(self.segments)} is not supported")
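
# A minimal construction sketch for MetaDatasetDS (illustration only, not library
# code). The experiment name, rolling parameters, and split ratio below are
# hypothetical placeholders; a real `task_tpl` would be a standard Qlib task
# config (model + dataset with train/valid/test segments).
#
#     meta_dataset = MetaDatasetDS(
#         task_tpl=task_tpl,        # a single rolling template (dict) or a pre-sorted task list
#         step=20,                  # roll the test segment forward every 20 trading days
#         trunc_days=3,             # truncate 3 days of meta info around each test start to avoid leakage
#         exp_name="internal_exp",  # hypothetical experiment name holding the data performance
#         segments=0.6,             # float: first 60% of meta tasks for training, the rest for testing
#         hist_step_n=30,           # each meta task sees 30 historical steps of IC information
#     )
#     train_meta_tasks = meta_dataset.prepare_tasks("train")  # -> List[MetaTask]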
class RollingStrategy(OnlineStrategy):
    """
    This example strategy always uses the latest rolling models as online models.
    """

    def __init__(
        self,
        name_id: str,
        task_template: Union[dict, List[dict]],
        rolling_gen: RollingGen,
    ):
        """
        Init RollingStrategy.

        Assumption: the str of name_id, the experiment name, and the trainer's experiment name are the same.

        Args:
            name_id (str): a unique name or id; it will also be the name of the Experiment.
            task_template (Union[dict, List[dict]]): a list of task templates or a single template,
                which will be used to generate many tasks using rolling_gen.
            rolling_gen (RollingGen): an instance of RollingGen
        """
        super().__init__(name_id=name_id)
        self.exp_name = self.name_id
        if not isinstance(task_template, list):
            task_template = [task_template]
        self.task_template = task_template
        self.rg = rolling_gen
        assert issubclass(self.rg.__class__, RollingGen), "The rolling strategy relies on the features of RollingGen"
        self.tool = OnlineToolR(self.exp_name)
        self.ta = TimeAdjuster()

    def get_collector(self, process_list=[RollingGroup()], rec_key_func=None, rec_filter_func=None, artifacts_key=None):
        """
        Get the instance of `Collector <../advanced/task_management.html#Task Collecting>`_ to collect results.
        The returned collector must be able to distinguish the results of different models.

        Assumption: the models can be distinguished based on the model name and the rolling test segments.
        If you do not want this assumption, please implement your own method or use another rec_key_func.

        Args:
            rec_key_func (Callable): a function to get the key of a recorder. If None, the recorder id is used.
            rec_filter_func (Callable, optional): filter the recorders by returning True or False. Defaults to None.
            artifacts_key (List[str], optional): the artifact keys you want to collect. If None, all artifacts are collected.
        """

        def rec_key(recorder):
            task_config = recorder.load_object("task")
            model_key = task_config["model"]["class"]
            rolling_key = task_config["dataset"]["kwargs"]["segments"]["test"]
            return model_key, rolling_key

        if rec_key_func is None:
            rec_key_func = rec_key

        artifacts_collector = RecorderCollector(
            experiment=self.exp_name,
            process_list=process_list,
            rec_key_func=rec_key_func,
            rec_filter_func=rec_filter_func,
            artifacts_key=artifacts_key,
        )

        return artifacts_collector

    def first_tasks(self) -> List[dict]:
        """
        Use rolling_gen to generate different tasks based on task_template.

        Returns:
            List[dict]: a list of tasks
        """
        return task_generator(
            tasks=self.task_template,
            generators=self.rg,  # generate different date segments
        )

    def prepare_tasks(self, cur_time) -> List[dict]:
        """
        Prepare new tasks based on cur_time (None for the latest).

        You can find the latest online models with OnlineToolR.online_models.

        Returns:
            List[dict]: a list of new tasks.
        """
        latest_records, max_test = self._list_latest(self.tool.online_models())
        if max_test is None:
            self.logger.warning("No latest online recorders, no new tasks.")
            return []
        calendar_latest = D.calendar(end_time=cur_time)[-1] if cur_time is None else cur_time
        self.logger.info(
            f"The interval between the current time {calendar_latest} and the last rolling test begin time {max_test[0]} "
            f"is {self.ta.cal_interval(calendar_latest, max_test[0])}; the rolling step is {self.rg.step}"
        )
        if self.ta.cal_interval(calendar_latest, max_test[0]) >= self.rg.step:
            old_tasks = []
            tasks_tmp = []
            for rec in latest_records:
                task = rec.load_object("task")
                old_tasks.append(deepcopy(task))
                test_begin = task["dataset"]["kwargs"]["segments"]["test"][0]
                # modify the test segment to generate new tasks
                task["dataset"]["kwargs"]["segments"]["test"] = (test_begin, calendar_latest)
                tasks_tmp.append(task)
            new_tasks_tmp = task_generator(tasks_tmp, self.rg)
            new_tasks = [task for task in new_tasks_tmp if task not in old_tasks]
            return new_tasks
        return []

    def _list_latest(self, rec_list: List[Recorder]):
        """
        List the latest recorders from rec_list.

        Args:
            rec_list (List[Recorder]): a list of Recorders

        Returns:
            List[Recorder], tuple: the latest recorders and their (start, end) test segment
        """
        if len(rec_list) == 0:
            return rec_list, None
        max_test = max(rec.load_object("task")["dataset"]["kwargs"]["segments"]["test"] for rec in rec_list)
        latest_rec = []
        for rec in rec_list:
            if rec.load_object("task")["dataset"]["kwargs"]["segments"]["test"] == max_test:
                latest_rec.append(rec)
        return latest_rec, max_test