Example #1
    def __init__(
        self,
        name_id: str,
        task_template: Union[dict, List[dict]],
        rolling_gen: RollingGen,
    ):
        """
        Init RollingStrategy.

        Assumption: name_id, the experiment name, and the trainer's experiment name are the same.

        Args:
            name_id (str): a unique name or id; it will also be the name of the Experiment.
            task_template (Union[dict, List[dict]]): a list of task_template or a single template, which will be used to generate many tasks using rolling_gen.
            rolling_gen (RollingGen): an instance of RollingGen
        """
        super().__init__(name_id=name_id)
        self.exp_name = self.name_id
        if not isinstance(task_template, list):
            task_template = [task_template]
        self.task_template = task_template
        self.rg = rolling_gen
        assert isinstance(self.rg, RollingGen), "The rolling strategy relies on the features of RollingGen"
        self.tool = OnlineToolR(self.exp_name)
        self.ta = TimeAdjuster()
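
A minimal usage sketch for the constructor above, assuming qlib's module layout; the task template contents are illustrative placeholders, not taken from the source:

# Hypothetical usage sketch -- the template values below are assumptions.
from qlib.workflow.online.strategy import RollingStrategy
from qlib.workflow.task.gen import RollingGen

task_template = {
    "model": {"class": "LGBModel", "module_path": "qlib.contrib.model.gbdt"},
    "dataset": {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "segments": {
                "train": ("2008-01-01", "2014-12-31"),
                "valid": ("2015-01-01", "2016-12-31"),
                "test": ("2017-01-01", "2020-08-01"),
            }
        },
    },
}

strategy = RollingStrategy(
    name_id="rolling_exp",            # also becomes the experiment name
    task_template=task_template,      # a single dict is wrapped into a list
    rolling_gen=RollingGen(step=40),  # roll the date segments forward 40 trading days at a time
)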
Example #2
class MetaDatasetDS(MetaTaskDataset):
    def __init__(
        self,
        *,
        task_tpl: Union[dict, list],
        step: int,
        trunc_days: int = None,
        rolling_ext_days: int = 0,
        exp_name: Union[str, InternalData],
        segments: Union[Dict[Text, Tuple], float],
        hist_step_n: int = 10,
        task_mode: str = MetaTask.PROC_MODE_FULL,
        fill_method: str = "max",
    ):
        """
        A dataset for meta model.

        Parameters
        ----------
        task_tpl : Union[dict, list]
            Decide what tasks are used.
            - dict : the task template, the prepared task is generated with `step`, `trunc_days` and `RollingGen`
            - list : when a list is given, use the list of tasks directly;
                     the list is supposed to be sorted according to the timeline
        step : int
            the rolling step
        trunc_days: int
            the number of days to be truncated based on the test start (to avoid leaking future information)
        rolling_ext_days: int
            Sometimes users want to train meta models for a longer test period but with smaller rolling steps to get more task samples.
            The total length of the test periods will be `step + rolling_ext_days`.

        exp_name : Union[str, InternalData]
            Decide what meta_info are used for prediction.
            - str: the name of the experiment that stores the performance data
            - InternalData: a prepared internal data
        segments: Union[Dict[Text, Tuple], float]
            the segments used to divide the data
            (both the left and right boundaries are included)
            if segments is a float:
                the float represents the percentage of data for training
        hist_step_n: int
            the length of historical steps for the meta information
        task_mode : str
            Please refer to the docs of MetaTask
        """
        super().__init__(segments=segments)
        if isinstance(exp_name, InternalData):
            self.internal_data = exp_name
        else:
            self.internal_data = InternalData(task_tpl,
                                              step=step,
                                              exp_name=exp_name)
            self.internal_data.setup()
        self.task_tpl = deepcopy(
            task_tpl
        )  # FIXME: if the handler is shared, how to avoid the explosion of the memory?
        self.trunc_days = trunc_days
        self.hist_step_n = hist_step_n
        self.step = step

        if isinstance(task_tpl, dict):
            rg = RollingGen(step=step,
                            trunc_days=trunc_days,
                            task_copy_func=deepcopy_basic_type
                            )  # NOTE: trunc_days is very important !!!!
            task_iter = rg(task_tpl)
            if rolling_ext_days > 0:
                self.ta = TimeAdjuster(future=True)
                for t in task_iter:
                    t["dataset"]["kwargs"]["segments"]["test"] = self.ta.shift(
                        t["dataset"]["kwargs"]["segments"]["test"],
                        step=rolling_ext_days,
                        rtype=RollingGen.ROLL_EX)
            if task_mode == MetaTask.PROC_MODE_FULL:
                # Only pre-initialize the task when the full task is required:
                # initialize the handler and share it.
                init_task_handler(task_tpl)
        else:
            assert isinstance(task_tpl, list)
            task_iter = task_tpl

        self.task_list = []
        self.meta_task_l = []
        logger = get_module_logger("MetaDatasetDS")
        logger.info(f"Example task for training meta model: {task_iter[0]}")
        for t in tqdm(task_iter, desc="creating meta tasks"):
            try:
                self.meta_task_l.append(
                    MetaTaskDS(t,
                               meta_info=self._prepare_meta_ipt(t),
                               mode=task_mode,
                               fill_method=fill_method))
                self.task_list.append(t)
            except ValueError as e:
                logger.warning(f"ValueError: {e}")
        assert len(self.meta_task_l) > 0, "No meta tasks found. Please check the data and settings."

    def _prepare_meta_ipt(self, task):
        ic_df = self.internal_data.data_ic_df

        segs = task["dataset"]["kwargs"]["segments"]
        end = max([segs[k][1] for k in ("train", "valid") if k in segs])
        ic_df_avail = ic_df.loc[:end, pd.IndexSlice[:, :end]]

        # the meta dataset focuses on the **information** instead of preprocessing
        # 1) filter the future info
        def mask_future(s):
            """mask future information"""
            # from qlib.utils import get_date_by_shift
            start, end = s.name
            end = get_date_by_shift(trading_date=end,
                                    shift=self.trunc_days - 1,
                                    future=True)
            return s.mask((s.index >= start) & (s.index <= end))

        ic_df_avail = ic_df_avail.apply(mask_future)  # apply to each col

        # 2) truncate the history to the most recent total_len steps
        total_len = self.step * self.hist_step_n
        if ic_df_avail.shape[0] >= total_len:
            return ic_df_avail.iloc[-total_len:]
        else:
            raise ValueError(
                "the history of distribution data is not long enough.")

    def _prepare_seg(self, segment: Text) -> List[MetaTask]:
        if isinstance(self.segments, float):
            train_task_n = int(len(self.meta_task_l) * self.segments)
            if segment == "train":
                return self.meta_task_l[:train_task_n]
            elif segment == "test":
                return self.meta_task_l[train_task_n:]
            else:
                raise NotImplementedError(f"Unknown segment: {segment}")
        else:
            raise NotImplementedError(f"Unsupported type of segments: {type(self.segments)}")
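
A hedged construction sketch for this dataset, reusing the illustrative task_template dict from the sketch under Example #1; all parameter values are assumptions, not taken from the source:

# Illustrative sketch -- parameter values are assumptions.
md = MetaDatasetDS(
    task_tpl=task_template,        # a dict template; RollingGen expands it into rolling tasks
    step=20,                       # rolling step, also used to size the history window
    trunc_days=3,                  # drop information this close to the test start to avoid leakage
    exp_name="internal_data_exp",  # experiment holding the precomputed internal data
    segments=0.62,                 # a float: the fraction of meta tasks used for training
    hist_step_n=30,                # each meta task sees step * hist_step_n rows of history
)
# prepare_tasks is inherited from the MetaTaskDataset base class
meta_train, meta_test = md.prepare_tasks(["train", "test"])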
Example #3
class RollingStrategy(OnlineStrategy):

    """
    This example strategy always uses the latest rolling models as online models.
    """

    def __init__(
        self,
        name_id: str,
        task_template: Union[dict, List[dict]],
        rolling_gen: RollingGen,
    ):
        """
        Init RollingStrategy.

        Assumption: the str of name_id, the experiment name, and the trainer's experiment name are the same.

        Args:
            name_id (str): a unique name or id. Will be also the name of the Experiment.
            task_template (Union[dict, List[dict]]): a list of task_template or a single template, which will be used to generate many tasks using rolling_gen.
            rolling_gen (RollingGen): an instance of RollingGen
        """
        super().__init__(name_id=name_id)
        self.exp_name = self.name_id
        if not isinstance(task_template, list):
            task_template = [task_template]
        self.task_template = task_template
        self.rg = rolling_gen
        self.tool = OnlineToolR(self.exp_name)
        self.ta = TimeAdjuster()

    def get_collector(self, process_list=[RollingGroup()], rec_key_func=None, rec_filter_func=None, artifacts_key=None):
        """
        Get the instance of `Collector <../advanced/task_management.html#Task Collecting>`_ to collect results. The returned collector must distinguish results in different models.

        Assumption: the models can be distinguished based on the model name and rolling test segments.
        If you do not want this assumption, please implement your own method or pass a different rec_key_func.

        Args:
            process_list (list): the processes (or a single process instance) used to handle the collected results, e.g., grouping rolling results with RollingGroup. Defaults to [RollingGroup()].
            rec_key_func (Callable): a function to get the key of a recorder. If None, use the recorder id.
            rec_filter_func (Callable, optional): filter the recorders by returning True or False. Defaults to None.
            artifacts_key (List[str], optional): the artifacts key you want to get. If None, get all artifacts.
        """

        def rec_key(recorder):
            task_config = recorder.load_object("task")
            model_key = task_config["model"]["class"]
            rolling_key = task_config["dataset"]["kwargs"]["segments"]["test"]
            return model_key, rolling_key

        if rec_key_func is None:
            rec_key_func = rec_key

        artifacts_collector = RecorderCollector(
            experiment=self.exp_name,
            process_list=process_list,
            rec_key_func=rec_key_func,
            rec_filter_func=rec_filter_func,
            artifacts_key=artifacts_key,
        )

        return artifacts_collector

    def first_tasks(self) -> List[dict]:
        """
        Use rolling_gen to generate different tasks based on task_template.

        Returns:
            List[dict]: a list of tasks
        """
        return task_generator(
            tasks=self.task_template,
            generators=self.rg,  # generate different date segments
        )

    def prepare_tasks(self, cur_time) -> List[dict]:
        """
        Prepare new tasks based on cur_time (None for the latest).

        You can find the last online models via OnlineToolR.online_models.

        Returns:
            List[dict]: a list of new tasks.
        """
        latest_records, max_test = self._list_latest(self.tool.online_models())
        if max_test is None:
            self.logger.warning("No latest online recorders, no new tasks.")
            return []
        calendar_latest = D.calendar(end_time=cur_time)[-1] if cur_time is None else cur_time
        self.logger.info(
            f"The interval between current time {calendar_latest} and last rolling test begin time {max_test[0]} is {self.ta.cal_interval(calendar_latest, max_test[0])}, the rolling step is {self.rg.step}"
        )
        if self.ta.cal_interval(calendar_latest, max_test[0]) >= self.rg.step:
            old_tasks = []
            tasks_tmp = []
            for rec in latest_records:
                task = rec.load_object("task")
                old_tasks.append(deepcopy(task))
                test_begin = task["dataset"]["kwargs"]["segments"]["test"][0]
                # modify the test segment to generate new tasks
                task["dataset"]["kwargs"]["segments"]["test"] = (test_begin, calendar_latest)
                tasks_tmp.append(task)
            new_tasks_tmp = task_generator(tasks_tmp, self.rg)
            new_tasks = [task for task in new_tasks_tmp if task not in old_tasks]
            return new_tasks
        return []

    def _list_latest(self, rec_list: List[Recorder]):
        """
        List the latest recorders from rec_list.

        Args:
            rec_list (List[Recorder]): a list of Recorder

        Returns:
            List[Recorder], tuple: the latest recorders and their test segment (begin, end)
        """
        if len(rec_list) == 0:
            return rec_list, None
        max_test = max(rec.load_object("task")["dataset"]["kwargs"]["segments"]["test"] for rec in rec_list)
        latest_rec = []
        for rec in rec_list:
            if rec.load_object("task")["dataset"]["kwargs"]["segments"]["test"] == max_test:
                latest_rec.append(rec)
        return latest_rec, max_test
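
The methods above fit together in an online rolling loop: first_tasks trains one model per rolling segment, and prepare_tasks later extends the test segments and keeps only the genuinely new tasks. A hedged end-to-end sketch, assuming qlib's TrainerR and OnlineToolR.reset_online_tag (exact plumbing may differ):

# Hedged sketch -- follows the qlib online-serving pattern; details are assumptions.
from qlib.model.trainer import TrainerR
from qlib.workflow.task.gen import RollingGen

strategy = RollingStrategy("rolling_exp", task_template, RollingGen(step=40))
trainer = TrainerR(experiment_name=strategy.exp_name)

# Initial round: generate and train one task per rolling segment.
recs = trainer.train(strategy.first_tasks())
strategy.tool.reset_online_tag(recs)  # mark the newest models as online

# At each routine update: train only the tasks that are actually new.
new_tasks = strategy.prepare_tasks(cur_time=None)  # None -> latest calendar date
if new_tasks:
    new_recs = trainer.train(new_tasks)
    strategy.tool.reset_online_tag(new_recs)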