def train_with_sigana():
    """Fit the model from ``task`` and generate SigAnaRecord / SignalMseRecord.

    Returns
    -------
    pred_score: pandas.DataFrame
        predict scores, loaded from the ``pred.pkl`` artifact
    performance: dict
        model performance, ``{"ic": ..., "ric": ...}``
    uri_path:
        the value of ``R.get_uri()`` read while the experiment is running
    """
    fitted_model = init_instance_by_config(task["model"])
    data = init_instance_by_config(task["dataset"])

    # everything below runs inside the "workflow_with_sigana" experiment
    with R.start(experiment_name="workflow_with_sigana"):
        R.log_params(**flatten_dict(task))
        fitted_model.fit(data)

        # signal analysis: generate the artifacts, then read ic / ric / pred back
        rec = R.get_recorder()
        analysis = SigAnaRecord(rec, model=fitted_model, dataset=data)
        analysis.generate()
        ic = analysis.load(analysis.get_path("ic.pkl"))
        ric = analysis.load(analysis.get_path("ric.pkl"))
        pred_score = analysis.load("pred.pkl")

        # MSE record is generated for its side effect only
        mse_record = SignalMseRecord(rec)
        mse_record.generate()

        uri_path = R.get_uri()
    performance = {"ic": ic, "ric": ric}
    return pred_score, performance, uri_path
def backtest(self):
    """Train the task model and run a portfolio-analysis backtest with it.

    The model and dataset are built from ``self.task`` and trained through
    ``self._train_model``. A ``TopkDropoutStrategy`` config that uses
    ``(model, dataset)`` as its signal is then combined with
    ``self.port_analysis_config`` and handed to ``PortAnaRecord`` inside the
    "backtest" experiment.

    The portfolio-analysis config is deep-copied before it is modified, so the
    shared ``self.port_analysis_config`` is not polluted with the trained
    model/dataset objects or the benchmark of this particular run.
    """
    # local import keeps this method self-contained
    from copy import deepcopy

    self._init_qlib()
    model = init_instance_by_config(self.task["model"])
    dataset = init_instance_by_config(self.task["dataset"])
    self._train_model(model, dataset)

    strategy_config = {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            "signal": (model, dataset),
            "topk": 50,
            "n_drop": 5,
        },
    }

    # work on a private copy instead of mutating the shared config
    pa_conf = deepcopy(self.port_analysis_config)
    pa_conf["strategy"] = strategy_config
    pa_conf["backtest"]["benchmark"] = self.benchmark

    with R.start(experiment_name="backtest"):
        recorder = R.get_recorder()
        par = PortAnaRecord(
            recorder,
            pa_conf,
            risk_analysis_freq=["day", "30min", "5min"],
            indicator_analysis_freq=["day", "30min", "5min"],
            indicator_analysis_method="value_weighted",
        )
        par.generate()
def train_with_sigana(uri_path: str = None):
    """Fit the CSI300 GBDT task, then generate SignalRecord and SigAnaRecord.

    Parameters
    ----------
    uri_path: str
        forwarded to ``R.start`` as ``uri``; defaults to None

    Returns
    -------
    pred_score: pandas.DataFrame
        predict scores, loaded from ``pred.pkl``
    performance: dict
        model performance, ``{"ic": ..., "ric": ...}``
    uri_path:
        the value of ``R.get_uri()`` read while the experiment is running
    """
    gbdt = init_instance_by_config(CSI300_GBDT_TASK["model"])
    data = init_instance_by_config(CSI300_GBDT_TASK["dataset"])

    with R.start(experiment_name="workflow_with_sigana", uri=uri_path):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        gbdt.fit(data)
        rec = R.get_recorder()

        # prediction
        signal_record = SignalRecord(gbdt, data, rec)
        signal_record.generate()
        pred_score = signal_record.load("pred.pkl")

        # ic / ric analysis on top of the saved prediction
        analysis_record = SigAnaRecord(rec)
        analysis_record.generate()
        performance = {
            "ic": analysis_record.load("ic.pkl"),
            "ric": analysis_record.load("ric.pkl"),
        }

        uri_path = R.get_uri()
    return pred_score, performance, uri_path
def end_task_train(rec: Recorder, experiment_name: str) -> Recorder:
    """
    Resume a recorder, fit the model of its saved task and generate its records.

    Args:
        rec (Recorder): the recorder to resume (looked up through ``rec.info["id"]``)
        experiment_name (str): the name of experiment

    Returns:
        Recorder: the same ``rec`` that was passed in
    """
    with R.start(experiment_name=experiment_name, recorder_id=rec.info["id"], resume=True):
        task_config = R.load_object("task")

        # build model & dataset from the stored task config
        model: Model = init_instance_by_config(task_config["model"])
        dataset: Dataset = init_instance_by_config(task_config["dataset"])

        # train and persist the model
        model.fit(dataset)
        R.save_objects(**{"params.pkl": model})

        # this dataset is saved for online inference, so the concrete data should not be dumped
        dataset.config(dump_all=False, recursive=True)
        R.save_objects(**{"dataset": dataset})

        # substitute the <MODEL> / <DATASET> placeholders with the live objects
        placeholder_values = {"<MODEL>": model, "<DATASET>": dataset}
        task_config = fill_placeholder(task_config, placeholder_values)

        # generate records: prediction, backtest, and analysis
        record_configs = task_config.get("record", [])
        if isinstance(record_configs, dict):
            # a single record config may be given as a bare dict
            record_configs = [record_configs]
        for record_config in record_configs:
            init_instance_by_config(record_config, recorder=rec).generate()
    return rec
def train_meta_model(self):
    """
    Train a meta model on top of a simplified proxy forecasting task.

    Steps: build a dataset-only proxy task template, build a ``MetaDatasetDS``
    from it and from the pickled internal data, then fit a ``MetaModelDS`` and
    save it under ``self.meta_exp_name``.
    """
    # 1) proxy task template: only the dataset part is provided here.
    benchmark = RollingBenchmark(model_type=self.sim_task_model)
    sim_task = benchmark.basic_task()
    handler_uri = f"file://{(DIRNAME / 'handler_proxy.pkl').absolute()}"
    # the test segment ends where the simulated task's test segment ends
    test_end = sim_task["dataset"]["kwargs"]["segments"]["test"][1]
    proxy_forecast_model_task = {
        "dataset": {
            "class": "qlib.data.dataset.DatasetH",
            "kwargs": {
                "handler": handler_uri,
                "segments": {
                    "train": ("2008-01-01", "2010-12-31"),
                    "test": ("2011-01-01", test_end),
                },
            },
        },
    }

    # 2) meta dataset arguments; the template above is used to create the meta tasks
    kwargs = {
        "task_tpl": proxy_forecast_model_task,
        "step": self.step,
        "segments": 0.62,  # keep test period consistent with the dataset yaml
        "trunc_days": 1 + self.horizon,
        "hist_step_n": 30,
        "fill_method": "max",
        "rolling_ext_days": 0,
    }

    # NOTE (from the original author): the internal data is shared between the
    # proxy model and the final forecasting model although their test segments
    # are not aligned; this was reported to work in earlier experiments.
    # NOTE: pickle.load must only be used on trusted files.
    with self._internal_data_path.open("rb") as data_file:
        internal_data = pickle.load(data_file)
    meta_dataset = MetaDatasetDS(exp_name=internal_data, **kwargs)

    # 3) train the meta model and log it
    with R.start(experiment_name=self.meta_exp_name):
        R.log_params(**kwargs)
        meta_model = MetaModelDS(
            step=self.step,
            hist_step_n=kwargs["hist_step_n"],
            lr=0.001,
            max_epoch=200,
            seed=43,
        )
        meta_model.fit(meta_dataset)
        R.save_objects(model=meta_model)
def _train_model(self, model, dataset):
    """Fit ``model`` on ``dataset`` inside the "train" experiment.

    Logs the flattened ``self.task`` as params, saves the fitted model as
    ``params.pkl`` and generates a ``SignalRecord`` for it.
    """
    with R.start(experiment_name="train"):
        R.log_params(**flatten_dict(self.task))
        model.fit(dataset)
        R.save_objects(**{"params.pkl": model})

        # prediction
        signal_record = SignalRecord(model, dataset, R.get_recorder())
        signal_record.generate()
def train_mse():
    """Fit the CSI300 GBDT task and generate a ``SignalMseRecord``.

    Returns
    -------
    uri:
        the value of ``R.get_uri()`` read while the experiment is running
    """
    gbdt = init_instance_by_config(CSI300_GBDT_TASK["model"])
    data = init_instance_by_config(CSI300_GBDT_TASK["dataset"])

    with R.start(experiment_name="workflow"):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        gbdt.fit(data)

        rec = R.get_recorder()
        mse_record = SignalMseRecord(rec, model=gbdt, dataset=data)
        mse_record.generate()

        uri = R.get_uri()
    return uri
def run_exp(task_config, dataset, experiment_name, recorder_name, uri):
    """Train the model described by ``task_config`` and generate its records.

    Parameters
    ----------
    task_config : dict
        must provide ``"model"`` (an instance config) and ``"record"`` (an
        iterable of record configs, each with a ``"class"`` entry)
    dataset :
        passed to ``model.fit`` and to every ``SignalRecord``
    experiment_name, recorder_name, uri :
        forwarded to ``R.start`` (called with ``resume=True``)

    Notes
    -----
    ``task_config`` is left untouched: the runtime objects (model, dataset,
    recorder) are merged into a fresh kwargs dict per record. The previous
    shallow ``record.copy()`` still shared the nested ``kwargs`` dict, so these
    objects leaked into the caller's config. A record without ``"kwargs"`` is
    treated as having empty kwargs.
    """
    model = init_instance_by_config(task_config["model"])
    model_fit_kwargs = dict(dataset=dataset)

    # Let's start the experiment.
    with R.start(
        experiment_name=experiment_name,
        recorder_name=recorder_name,
        uri=uri,
        resume=True,
    ):
        # Setup log: one log file per experiment inside the recorder directory
        recorder_root_dir = R.get_recorder().get_local_dir()
        log_file = os.path.join(recorder_root_dir, "{:}.log".format(experiment_name))
        set_log_basic_config(log_file)
        logger = get_module_logger("q.run_exp")
        logger.info("task_config::\n{:}".format(pprint.pformat(task_config, indent=2)))
        logger.info("[{:}] - [{:}]: {:}".format(experiment_name, recorder_name, uri))
        logger.info("dataset={:}".format(dataset))

        # Train model; pass a checkpoint location only if ``fit`` accepts one
        R.log_params(**flatten_dict(task_config))
        fit_arg_names = inspect.getfullargspec(model.fit).args
        if "save_path" in fit_arg_names:
            model_fit_kwargs["save_path"] = os.path.join(recorder_root_dir, "model.ckp")
        elif "save_dir" in fit_arg_names:
            model_fit_kwargs["save_dir"] = os.path.join(recorder_root_dir, "model-ckps")
        model.fit(**model_fit_kwargs)

        # Get the recorder
        recorder = R.get_recorder()
        R.save_objects(**{"model.pkl": model})

        # Generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            if record["class"] == "SignalRecord":
                runtime_kwargs = {"model": model, "dataset": dataset, "recorder": recorder}
            else:
                runtime_kwargs = {"recorder": recorder}
            # new outer dict AND new kwargs dict -> the caller's config is not mutated
            record_conf = {**record, "kwargs": {**(record.get("kwargs") or {}), **runtime_kwargs}}
            init_instance_by_config(record_conf).generate()
def train_multiseg():
    """Fit the CSI300 GBDT task and generate a ``MultiSegRecord``.

    The record is generated for the ``valid`` and ``test`` segments.

    Returns
    -------
    uri:
        the value of ``R.get_uri()`` read while the experiment is running
    """
    gbdt = init_instance_by_config(CSI300_GBDT_TASK["model"])
    data = init_instance_by_config(CSI300_GBDT_TASK["dataset"])

    with R.start(experiment_name="workflow"):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        gbdt.fit(data)

        rec = R.get_recorder()
        segments = {"valid": "valid", "test": "test"}
        multi_seg_record = MultiSegRecord(gbdt, data, rec)
        multi_seg_record.generate(segments, True)

        uri = R.get_uri()
    return uri
def begin_task_train(task_config: dict, experiment_name: str, recorder_name: str = None) -> Recorder:
    """
    Start a recorder and log the task info, without training anything.

    Args:
        task_config (dict): the config of a task
        experiment_name (str): the name of experiment
        recorder_name (str): the given name will be the recorder name. None for using rid.

    Returns:
        Recorder: the recorder that was active inside the started experiment
    """
    with R.start(experiment_name=experiment_name, recorder_name=recorder_name):
        _log_task_info(task_config)
        # fetch the recorder while the experiment is still active
        recorder = R.get_recorder()
        return recorder
def end_task_train(rec: Recorder, experiment_name: str) -> Recorder:
    """
    Resume a recorder and execute the task that was stored in it.

    Args:
        rec (Recorder): the recorder to resume (looked up through ``rec.info["id"]``)
        experiment_name (str): the name of experiment

    Returns:
        Recorder: the same ``rec`` that was passed in
    """
    recorder_id = rec.info["id"]
    with R.start(experiment_name=experiment_name, recorder_id=recorder_id, resume=True):
        # the task config is read back from the "task" object of the recorder
        stored_task = R.load_object("task")
        _exe_task(stored_task)
    return rec
def ens_rolling(self):
    """Collect ``pred``/``label`` artifacts of the rolling experiment and save the ensemble.

    The artifacts of ``self.rolling_exp`` are collected with a
    ``RollingEnsemble`` process and stored as ``pred.pkl`` / ``label.pkl`` in
    the ``self.COMB_EXP`` experiment.
    """
    artifact_files = {"pred": "pred.pkl", "label": "label.pkl"}
    collector = RecorderCollector(
        experiment=self.rolling_exp,
        artifacts_key=["pred", "label"],
        process_list=[RollingEnsemble()],
        artifacts_path=artifact_files,
    )
    collected = collector()

    with R.start(experiment_name=self.COMB_EXP):
        R.log_params(exp_name=self.rolling_exp)
        ensemble_objects = {
            "pred.pkl": collected["pred"],
            "label.pkl": collected["label"],
        }
        R.save_objects(**ensemble_objects)
def fake_experiment():
    """A fake experiment workflow to test uri handling of ``R``.

    Returns
    -------
    pass_or_not_for_default_uri: bool
        True if ``R.get_uri()`` is the same before and after the experiment
    pass_or_not_for_current_uri: bool
        True if ``R.get_uri()`` inside the experiment equals the uri given to ``R.start``
    temporary_exp_dir: str
        the uri that was given to ``R.start``
    """
    uri_before = R.get_uri()
    current_uri = "file:./temp-test-exp-mag"

    with R.start(experiment_name="fake_workflow_for_expm", uri=current_uri):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        uri_inside = R.get_uri()

    # queried again once the experiment has been closed
    uri_after = R.get_uri()

    default_uri_ok = uri_before == uri_after
    current_uri_ok = current_uri == uri_inside
    return default_uri_ok, current_uri_ok, current_uri
def run_exp(task_config, dataset, experiment_name, recorder_name, uri):
    """Train the model described by ``task_config`` and generate its records.

    Parameters
    ----------
    task_config : dict
        must provide ``"model"`` (an instance config) and ``"record"`` (an
        iterable of record configs, each with a ``"class"`` entry)
    dataset :
        passed to ``model.fit`` and to every ``SignalRecord``
    experiment_name, recorder_name, uri :
        forwarded to ``R.start``

    Notes
    -----
    ``task_config`` is left untouched: the runtime objects (model, dataset,
    recorder) are merged into a fresh kwargs dict per record. The previous
    shallow ``record.copy()`` still shared the nested ``kwargs`` dict, so these
    objects leaked into the caller's config. A record without ``"kwargs"`` is
    treated as having empty kwargs.
    """
    # model initiation
    print("")
    print("[{:}] - [{:}]: {:}".format(experiment_name, recorder_name, uri))
    print("dataset={:}".format(dataset))
    model = init_instance_by_config(task_config["model"])

    # start exp
    with R.start(experiment_name=experiment_name, recorder_name=recorder_name, uri=uri):
        # one log file per experiment, located under the recorder's root uri
        log_file = R.get_recorder().root_uri / "{:}.log".format(experiment_name)
        set_log_basic_config(log_file)

        # train model
        R.log_params(**flatten_dict(task_config))
        model.fit(dataset)
        recorder = R.get_recorder()
        R.save_objects(**{"model.pkl": model})

        # generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            if record["class"] == "SignalRecord":
                runtime_kwargs = {"model": model, "dataset": dataset, "recorder": recorder}
            else:
                runtime_kwargs = {"recorder": recorder}
            # new outer dict AND new kwargs dict -> the caller's config is not mutated
            record_conf = {**record, "kwargs": {**(record.get("kwargs") or {}), **runtime_kwargs}}
            init_instance_by_config(record_conf).generate()
def train(uri_path: str = None):
    """Fit the CSI300 GBDT task and compute its signal analysis.

    Parameters
    ----------
    uri_path: str
        forwarded to ``R.start`` as ``uri``; defaults to None

    Returns
    -------
    pred_score: pandas.DataFrame
        predict scores, loaded from ``pred.pkl``
    performance: dict
        model performance, ``{"ic": ..., "ric": ...}``
    rid:
        the ``id`` attribute of the recorder used for the run
    """
    # model initiation
    gbdt = init_instance_by_config(CSI300_GBDT_TASK["model"])
    data = init_instance_by_config(CSI300_GBDT_TASK["dataset"])

    # the prints below exist to exercise __repr__
    print(data)
    print(R)

    with R.start(experiment_name="workflow", uri=uri_path):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        gbdt.fit(data)
        R.save_objects(trained_model=gbdt)

        # prediction
        rec = R.get_recorder()
        print(rec)  # exercises __repr__
        print(rec.get_local_dir())  # exercises get_local_dir
        rid = rec.id
        signal_record = SignalRecord(gbdt, data, rec)
        signal_record.generate()
        pred_score = signal_record.load("pred.pkl")

        # calculate ic and ric
        analysis_record = SigAnaRecord(rec)
        analysis_record.generate()
        performance = {
            "ic": analysis_record.load("ic.pkl"),
            "ric": analysis_record.load("ric.pkl"),
        }
    return pred_score, performance, rid
def task_train(task_config: dict, experiment_name: str, recorder_name: str = None) -> Recorder:
    """
    Task based training: log the task info and execute the task in one experiment.

    Parameters
    ----------
    task_config : dict
        The config of a task.
    experiment_name: str
        The name of experiment
    recorder_name: str
        The name of recorder

    Returns
    ----------
    Recorder: The instance of the recorder
    """
    with R.start(experiment_name=experiment_name, recorder_name=recorder_name):
        # step 1: persist the task description; step 2: run it
        _log_task_info(task_config)
        _exe_task(task_config)
        # fetch the recorder while the experiment is still active
        recorder = R.get_recorder()
        return recorder
def begin_task_train(task_config: dict, experiment_name: str, recorder_name: str = None) -> Recorder:
    """
    Start a recorder and store the task config in it, without training anything.

    Args:
        task_config (dict): the config of a task
        experiment_name (str): the name of experiment
        recorder_name (str): the given name will be the recorder name. None for using rid.

    Returns:
        Recorder: the recorder that was active inside the started experiment
    """
    with R.start(experiment_name=experiment_name, recorder_name=recorder_name):
        flat_params = flatten_dict(task_config)
        R.log_params(**flat_params)
        # the config is also saved as an object to keep the original format and datatype
        R.save_objects(**{"task": task_config})
        # tag the run with the name of the machine that started it
        host_tag = {"hostname": socket.gethostname()}
        R.set_tags(**host_tag)
        active_recorder: Recorder = R.get_recorder()
    return active_recorder
def end_task_train(rec: Recorder, experiment_name: str) -> Recorder:
    """
    Resume a recorder, fit the model of its saved task and generate its records.

    Args:
        rec (Recorder): the recorder to resume (looked up through ``rec.info["id"]``)
        experiment_name (str): the name of experiment

    Returns:
        Recorder: the same ``rec`` that was passed in
    """
    with R.start(experiment_name=experiment_name, recorder_id=rec.info["id"], resume=True):
        task_config = R.load_object("task")

        # build model & dataset from the stored task config
        model: Model = init_instance_by_config(task_config["model"])
        dataset: Dataset = init_instance_by_config(task_config["dataset"])

        # train and persist the model
        model.fit(dataset)
        R.save_objects(**{"params.pkl": model})

        # this dataset is saved for online inference, so the concrete data should not be dumped
        dataset.config(dump_all=False, recursive=True)
        R.save_objects(**{"dataset": dataset})

        # generate records: prediction, backtest, and analysis
        record_configs = task_config.get("record", [])
        if isinstance(record_configs, dict):
            # a single record config may be given as a bare dict
            record_configs = [record_configs]
        for record_config in record_configs:
            record_cls, record_kwargs = get_cls_kwargs(
                record_config, default_module="qlib.workflow.record_temp"
            )
            # only SignalRecord receives the model and dataset; the rest get the recorder only
            extra_kwargs = {"recorder": rec}
            if record_cls is SignalRecord:
                extra_kwargs = {"model": model, "dataset": dataset, "recorder": rec}
            record_cls(**record_kwargs, **extra_kwargs).generate()
    return rec
def train():
    """Fit the model from ``task`` and compute its signal analysis.

    Returns
    -------
    pred_score: pandas.DataFrame
        predict scores, as returned by ``SignalRecord.load()``
    performance: dict
        model performance, ``{"ic": ..., "ric": ...}``
    rid:
        the ``id`` attribute of the recorder used for the run
    """
    # model initiation
    fitted_model = init_instance_by_config(task["model"])
    data = init_instance_by_config(task["dataset"])

    # the prints below exist to exercise __repr__
    print(data)
    print(R)

    with R.start(experiment_name="workflow"):
        R.log_params(**flatten_dict(task))
        fitted_model.fit(data)

        # prediction
        rec = R.get_recorder()
        print(rec)  # exercises __repr__
        rid = rec.id
        signal_record = SignalRecord(fitted_model, data, rec)
        signal_record.generate()
        pred_score = signal_record.load()

        # calculate ic and ric
        analysis_record = SigAnaRecord(rec)
        analysis_record.generate()
        ic = analysis_record.load(analysis_record.get_path("ic.pkl"))
        ric = analysis_record.load(analysis_record.get_path("ric.pkl"))

    performance = {"ic": ic, "ric": ric}
    return pred_score, performance, rid
def task_train(task_config: dict, experiment_name):
    """
    Task based training: fit the model and generate every configured record.

    Parameters
    ----------
    task_config : dict
        A dict describes a task setting. It must provide ``"model"``,
        ``"dataset"`` and ``"record"`` (an iterable of record configs, each
        with a ``"class"`` entry).
    experiment_name :
        forwarded to ``R.start``

    Notes
    -----
    ``task_config`` is left untouched: the runtime objects (model, dataset,
    recorder) are merged into a fresh kwargs dict per record instead of being
    written into ``record["kwargs"]`` of the caller's config. A record without
    ``"kwargs"`` is treated as having empty kwargs.
    """
    # model initiation
    model = init_instance_by_config(task_config["model"])
    dataset = init_instance_by_config(task_config["dataset"])

    # start exp
    with R.start(experiment_name=experiment_name):
        # train model
        R.log_params(**flatten_dict(task_config))
        model.fit(dataset)
        recorder = R.get_recorder()
        R.save_objects(**{"params.pkl": model})

        # generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            if record["class"] == SignalRecord.__name__:
                runtime_kwargs = {"model": model, "dataset": dataset, "recorder": recorder}
            else:
                runtime_kwargs = {"recorder": recorder}
            # new outer dict AND new kwargs dict -> the caller's config is not mutated
            record_conf = {**record, "kwargs": {**(record.get("kwargs") or {}), **runtime_kwargs}}
            init_instance_by_config(record_conf).generate()
def get_feature_importance(self):
    """Fit a "gbdt" benchmark model and return its feature importance by column name.

    Returns
    -------
    pd.Series
        importance values indexed by the feature column names of the dataset
    """
    # this must be lightGBM, because it needs to get the feature importance
    benchmark = RollingBenchmark(model_type="gbdt")
    task = benchmark.basic_task()

    with R.start(experiment_name="feature_importance"):
        model = init_instance_by_config(task["model"])
        dataset = init_instance_by_config(task["dataset"])
        model.fit(dataset)

    raw_importance = model.get_feature_importance()

    # The model is trained on numpy arrays instead of a dataframe, so the
    # importance keys carry a column position ("<prefix>_<index>") rather than
    # a name; map each position back to the dataset's feature column.
    feature_df = dataset.prepare(segments=slice(None), col_set="feature", data_key=DataHandlerLP.DK_R)
    columns = feature_df.columns
    named_importance = {}
    for key, importance in raw_importance.to_dict().items():
        position = int(key.split("_")[1])
        named_importance[columns[position]] = importance

    return pd.Series(named_importance)
def run_exp(
    task_config,
    dataset,
    experiment_name,
    recorder_name,
    uri,
    model_obj_name="model.pkl",
):
    """Train (or reload) the model of ``task_config`` and generate its records.

    Parameters
    ----------
    task_config : dict
        must provide ``"model"`` (an instance config) and ``"record"`` (an
        iterable of record configs, each with ``"class"``; ``MultiSegRecord``
        entries also need ``"generate_kwargs"``, the others ``"kwargs"``)
    dataset :
        passed to ``model.fit`` and to the signal / multi-segment records
    experiment_name, recorder_name, uri :
        forwarded to ``R.start`` (called with ``resume=True``)
    model_obj_name : str
        name of the recorder object the model is loaded from / saved to

    Raises
    ------
    ValueError
        if loading the existing model fails with anything other than
        ``OSError``; the original exception is attached as ``__cause__``.
    """
    model = init_instance_by_config(task_config["model"])
    model_fit_kwargs = dict(dataset=dataset)

    # Let's start the experiment.
    with R.start(
        experiment_name=experiment_name,
        recorder_name=recorder_name,
        uri=uri,
        resume=True,
    ):
        # Setup log: one log file per experiment inside the recorder directory
        recorder_root_dir = R.get_recorder().get_local_dir()
        log_file = os.path.join(recorder_root_dir, "{:}.log".format(experiment_name))
        set_log_basic_config(log_file)
        logger = get_module_logger("q.run_exp")
        logger.info("task_config::\n{:}".format(pprint.pformat(task_config, indent=2)))
        logger.info("[{:}] - [{:}]: {:}".format(experiment_name, recorder_name, uri))
        logger.info("dataset={:}".format(dataset))

        # Train model, unless a previously saved one can be loaded from the recorder
        try:
            if hasattr(model, "to"):
                # Recoverable model: put the loaded model on the device of the fresh one
                ori_device = model.device
                model = R.load_object(model_obj_name)
                model.to(ori_device)
            else:
                model = R.load_object(model_obj_name)
            logger.info("[Find existing object from {:}]".format(model_obj_name))
        except OSError:
            # loading raised OSError -> train from scratch
            R.log_params(**flatten_dict(update_gpu(task_config, None)))
            # pass a checkpoint location only if ``fit`` accepts one
            fit_arg_names = inspect.getfullargspec(model.fit).args
            if "save_path" in fit_arg_names:
                model_fit_kwargs["save_path"] = os.path.join(recorder_root_dir, "model.ckp")
            elif "save_dir" in fit_arg_names:
                model_fit_kwargs["save_dir"] = os.path.join(recorder_root_dir, "model-ckps")
            model.fit(**model_fit_kwargs)
            # move the model to CPU for saving, then back to its device
            if hasattr(model, "to"):
                old_device = model.device
                model.to("cpu")
                R.save_objects(**{model_obj_name: model})
                model.to(old_device)
            else:
                R.save_objects(**{model_obj_name: model})
        except Exception as e:
            # explicit chaining keeps the root cause attached to the ValueError
            raise ValueError("Something wrong: {:}".format(e)) from e

        # Get the recorder
        recorder = R.get_recorder()

        # Generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            # deep copy so the runtime objects never end up in ``task_config``
            record = deepcopy(record)
            if record["class"] == "MultiSegRecord":
                record["kwargs"] = dict(model=model, dataset=dataset, recorder=recorder)
                sr = init_instance_by_config(record)
                sr.generate(**record["generate_kwargs"])
            elif record["class"] == "SignalRecord":
                srconf = {"model": model, "dataset": dataset, "recorder": recorder}
                record["kwargs"].update(srconf)
                sr = init_instance_by_config(record)
                sr.generate()
            else:
                rconf = {"recorder": recorder}
                record["kwargs"].update(rconf)
                ar = init_instance_by_config(record)
                ar.generate()
"freq": "day", "limit_threshold": 0.095, "deal_price": "close", "open_cost": 0.0005, "close_cost": 0.0015, "min_cost": 5, }, } # NOTE: This line is optional # It demonstrates that the dataset can be used standalone. example_df = dataset.prepare("train") print(example_df.head()) # start exp with R.start(experiment_name="上证50"): R.log_params(**flatten_dict(CSI300_GBDT_TASK)) model.fit(dataset) R.save_objects(**{"params.pkl": model}) # prediction recorder = R.get_recorder() sr = SignalRecord(model, dataset, recorder) sr.generate() # Signal Analysis sar = SigAnaRecord(recorder) sar.generate() # backtest. If users want to use backtest based on their own prediction, # please refer to https://qlib.readthedocs.io/en/latest/component/recorder.html#record-template.
"close_cost": 0.0015, "min_cost": 5, "return_order": True, }, } # model initialization model = init_instance_by_config(task["model"]) dataset = init_instance_by_config(task["dataset"]) # NOTE: This line is optional # It demonstrates that the dataset can be used standalone. example_df = dataset.prepare("train") print(example_df.head()) # start exp with R.start(experiment_name="workflow"): R.log_params(**flatten_dict(task)) model.fit(dataset) R.save_objects(**{"params.pkl": model}) # prediction recorder = R.get_recorder() sr = SignalRecord(model, dataset, recorder) sr.generate() # backtest. If users want to use backtest based on their own prediction, # please refer to https://qlib.readthedocs.io/en/latest/component/recorder.html#record-template. par = PortAnaRecord(recorder, port_analysis_config) par.generate()
def backtest_only_daily(self):
    """
    Backtest with a single daily ``SimulatorExecutor``.

    This backtest is used for comparing the nested execution and single layer execution.
    Due to the low quality daily-level and minute-level data, they are hardly comparable.
    So it is used for detecting serious bugs which make the results different greatly.

    Sample output:

    .. code-block:: shell

        [1724971:MainThread](2021-12-07 16:24:31,156) INFO - qlib.workflow - [record_temp.py:441] - Portfolio analysis record 'port_analysis_1day.pkl' has been saved as the artifact of the Experiment 2
        'The following are analysis results of benchmark return(1day).'
                               risk
        mean               0.000651
        std                0.012472
        annualized_return  0.154967
        information_ratio  0.805422
        max_drawdown      -0.160445
        'The following are analysis results of the excess return without cost(1day).'
                               risk
        mean               0.001375
        std                0.006103
        annualized_return  0.327204
        information_ratio  3.475016
        max_drawdown      -0.024927
        'The following are analysis results of the excess return with cost(1day).'
                               risk
        mean               0.001184
        std                0.006091
        annualized_return  0.281801
        information_ratio  2.998749
        max_drawdown      -0.029568
        [1724971:MainThread](2021-12-07 16:24:31,170) INFO - qlib.workflow - [record_temp.py:466] - Indicator analysis record 'indicator_analysis_1day.pkl' has been saved as the artifact of the Experiment 2
        'The following are analysis results of indicators(1day).'
             value
        ffr    1.0
        pa     0.0
        pos    0.0
        [1724971:MainThread](2021-12-07 16:24:31,188) INFO - qlib.timer - [log.py:113] - Time cost: 0.007s | waiting `async_log` Done
    """
    self._init_qlib()
    model = init_instance_by_config(self.task["model"])
    dataset = init_instance_by_config(self.task["dataset"])
    self._train_model(model, dataset)

    # strategy driven by the freshly trained (model, dataset) signal
    topk_strategy = {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            "signal": (model, dataset),
            "topk": 50,
            "n_drop": 5,
        },
    }
    # single-layer executor stepping one day at a time
    daily_executor = {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",
            "generate_portfolio_metrics": True,
            "verbose": True,
        },
    }

    # operate on a copy so the shared config keeps its original content
    pa_conf = deepcopy(self.port_analysis_config)
    pa_conf["strategy"] = topk_strategy
    pa_conf["executor"] = daily_executor
    pa_conf["backtest"]["benchmark"] = self.benchmark

    with R.start(experiment_name="backtest"):
        port_record = PortAnaRecord(R.get_recorder(), pa_conf)
        port_record.generate()
def main(xargs):
    """Train a ``QuantTransformer`` on Alpha360 features and run the portfolio analysis.

    Parameters
    ----------
    xargs :
        parsed arguments; only ``xargs.market`` is read (used as the
        ``instruments`` of the data handler)

    Notes
    -----
    ``port_analysis_config`` is not defined in this function; it is resolved
    from the enclosing scope.
    """
    # processors applied by the data handler
    infer_processors = [
        {
            "class": "RobustZScoreNorm",
            "kwargs": {"fields_group": "feature", "clip_outlier": True},
        },
        {"class": "Fillna", "kwargs": {"fields_group": "feature"}},
    ]
    learn_processors = [
        {"class": "DropnaLabel"},
        {"class": "CSRankNorm", "kwargs": {"fields_group": "label"}},
    ]
    handler_config = {
        "class": "Alpha360",
        "module_path": "qlib.contrib.data.handler",
        "kwargs": {
            "start_time": "2008-01-01",
            "end_time": "2020-08-01",
            "fit_start_time": "2008-01-01",
            "fit_end_time": "2014-12-31",
            "instruments": xargs.market,
            "infer_processors": infer_processors,
            "learn_processors": learn_processors,
            "label": ["Ref($close, -2) / Ref($close, -1) - 1"],
        },
    }
    dataset_config = {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": handler_config,
            "segments": {
                "train": ("2008-01-01", "2014-12-31"),
                "valid": ("2015-01-01", "2016-12-31"),
                "test": ("2017-01-01", "2020-08-01"),
            },
        },
    }
    model_config = {
        "class": "QuantTransformer",
        "module_path": "trade_models",
        "kwargs": {
            "loss": "mse",
            "GPU": "0",
            "metric": "loss",
        },
    }

    task = {"model": model_config, "dataset": dataset_config}
    model = init_instance_by_config(model_config)
    dataset = init_instance_by_config(dataset_config)

    # start exp to train model
    with R.start(experiment_name="train_tt_model"):
        R.log_params(**flatten_dict(task))
        model.fit(dataset)
        R.save_objects(trained_model=model)

        # prediction
        recorder = R.get_recorder()
        print(recorder)
        signal_record = SignalRecord(model, dataset, recorder)
        signal_record.generate()

        # backtest. If users want to use backtest based on their own prediction,
        # please refer to https://qlib.readthedocs.io/en/latest/component/recorder.html#record-template.
        port_record = PortAnaRecord(recorder, port_analysis_config)
        port_record.generate()