def prepare_tasks(self, cur_time) -> List[dict]:
    """
    Prepare new tasks based on cur_time (None for the latest).

    You can find the last online models by OnlineToolR.online_models.

    Parameters
    ----------
    cur_time :
        calendar time used as the new test-segment end; ``None`` means
        the latest date in the trading calendar.

    Returns
    -------
    List[dict]: a list of new tasks.
    """
    latest_records, max_test = self._list_latest(self.tool.online_models())
    if max_test is None:
        # nothing is online yet, so there is no baseline to roll forward from
        self.logger.warning("No latest online recorders, no new tasks.")
        return []
    calendar_latest = D.calendar(end_time=cur_time)[-1] if cur_time is None else cur_time
    # hoisted: the interval is needed both for logging and for the decision below
    interval = self.ta.cal_interval(calendar_latest, max_test[0])
    self.logger.info(
        f"The interval between current time {calendar_latest} and last rolling test begin time {max_test[0]} is {interval}, the rolling step is {self.rg.step}"
    )
    if interval < self.rg.step:
        # not enough trading days have passed since the last rolling test began
        return []
    old_tasks = []
    tasks_tmp = []
    for rec in latest_records:
        task = rec.load_object("task")
        # keep an untouched copy so freshly generated duplicates can be filtered out
        old_tasks.append(deepcopy(task))
        test_begin = task["dataset"]["kwargs"]["segments"]["test"][0]
        # modify the test segment to generate new tasks
        task["dataset"]["kwargs"]["segments"]["test"] = (test_begin, calendar_latest)
        tasks_tmp.append(task)
    new_tasks_tmp = task_generator(tasks_tmp, self.rg)
    # only keep tasks that have not already been trained
    return [task for task in new_tasks_tmp if task not in old_tasks]
def task_generating(self):
    """
    Expand the configured task templates into concrete tasks.

    Applies ``self.rolling_gen`` to ``self.task_config`` so each template
    is replicated over different date segments; the result is printed for
    inspection and returned.
    """
    print("========== task_generating ==========")
    generated = task_generator(
        tasks=self.task_config,
        generators=self.rolling_gen,  # generate different date segments
    )
    pprint(generated)
    return generated
def first_tasks(self) -> List[dict]:
    """
    Use rolling_gen to generate different tasks based on task_template.

    Returns
    -------
    List[dict]: a list of tasks
    """
    # delegate the date-segment expansion to the rolling generator
    tasks = task_generator(tasks=self.task_template, generators=self.rg)
    return tasks
def setup(self, trainer=TrainerR, trainer_kwargs=None):
    """
    after running this function `self.data_ic_df` will become set.
    Each col represents a data.
    Each row represents the Timestamp of performance of that data.
    For example,

    .. code-block:: python

                    2021-06-21 2021-06-04 2021-05-21 2021-05-07 2021-04-20 2021-04-06 2021-03-22 2021-03-08  ...
                    2021-07-02 2021-06-18 2021-06-03 2021-05-20 2021-05-06 2021-04-19 2021-04-02 2021-03-19  ...
        datetime                                                                                             ...
        2018-01-02    0.079782   0.115975   0.070866   0.028849  -0.081170   0.140380   0.063864   0.110987  ...
        2018-01-03    0.123386   0.107789   0.071037   0.045278  -0.060782   0.167446   0.089779   0.124476  ...
        2018-01-04    0.140775   0.097206   0.063702   0.042415  -0.078164   0.173218   0.098914   0.114389  ...
        2018-01-05    0.030320  -0.037209  -0.044536  -0.047267  -0.081888   0.045648   0.059947   0.047652  ...
        2018-01-08    0.107201   0.009219  -0.015995  -0.036594  -0.086633   0.108965   0.122164   0.108508  ...
        ...                ...        ...        ...        ...        ...        ...        ...        ...   ...

    Parameters
    ----------
    trainer :
        the trainer class used to train the proxy models (default ``TrainerR``).
    trainer_kwargs : dict, optional
        extra keyword arguments forwarded to the trainer constructor.
        ``None`` (the default) is treated as an empty dict; this replaces the
        original mutable default argument (``{}``).
    """
    if trainer_kwargs is None:
        trainer_kwargs = {}

    # 1) prepare the prediction of proxy models
    perf_task_tpl = deepcopy(self.task_tpl)  # this task is supposed to contains no complicated objects
    trainer = auto_filter_kwargs(trainer)(experiment_name=self.exp_name, **trainer_kwargs)
    # NOTE:
    # The handler is initialized for only once.
    if not trainer.has_worker():
        self.dh = init_task_handler(perf_task_tpl)
    else:
        self.dh = init_instance_by_config(perf_task_tpl["dataset"]["kwargs"]["handler"])
    seg = perf_task_tpl["dataset"]["kwargs"]["segments"]
    # We want to split the training time period into small segments.
    perf_task_tpl["dataset"]["kwargs"]["segments"] = {
        "train": (DatasetH.get_min_time(seg), DatasetH.get_max_time(seg)),
        "test": (None, None),
    }
    # NOTE:
    # we play a trick here
    # treat the training segments as test to create the rolling tasks
    rg = RollingGen(step=self.step, test_key="train", train_key=None, task_copy_func=deepcopy_basic_type)
    gen_task = task_generator(perf_task_tpl, [rg])

    recorders = R.list_recorders(experiment_name=self.exp_name)
    if len(gen_task) == len(recorders):
        get_module_logger("Internal Data").info("the data has been initialized")
    else:
        # train new models
        assert 0 == len(recorders), "An empty experiment is required for setup `InternalData``"
        trainer.train(gen_task)

    # 2) extract the similarity matrix
    label_df = self.dh.fetch(col_set="label")
    recorders = R.list_recorders(experiment_name=self.exp_name)

    # compute per-recorder performance against the label, in parallel via joblib
    key_l = []
    ic_l = []
    for _, rec in tqdm(recorders.items(), desc="calc"):
        pred = rec.load_object("pred.pkl")
        task = rec.load_object("task")
        # the train segment identifies which data slice this model was fit on
        data_key = task["dataset"]["kwargs"]["segments"]["train"]
        key_l.append(data_key)
        ic_l.append(delayed(self._calc_perf)(pred.iloc[:, 0], label_df.iloc[:, 0]))
    ic_l = Parallel(n_jobs=-1)(ic_l)

    # columns = data slices, rows = timestamps; sort both axes for stable layout
    self.data_ic_df = pd.DataFrame(dict(zip(key_l, ic_l)))
    self.data_ic_df = self.data_ic_df.sort_index().sort_index(axis=1)

    del self.dh  # handler is not useful now
def create_rolling_tasks(self):
    """
    Build rolling tasks from the basic task template.

    Returns
    -------
    list: tasks whose date segments have been shifted by the rolling generator.
    """
    # trunc_days shortens segments to avoid information leakage
    # (presumably the last `horizon + 1` days — confirm against RollingGen semantics)
    rolling = RollingGen(step=self.step, trunc_days=self.horizon + 1)
    return task_generator(self.basic_task(), rolling)