def run_exp(task_config, dataset, experiment_name, recorder_name, uri):
    """Train the configured model and generate every configured record.

    Parameters
    ----------
    task_config : dict
        Task description. ``task_config["model"]`` is instantiated through
        ``init_instance_by_config`` and ``task_config["record"]`` is iterated
        to build the record generators.
    dataset
        Dataset handed to ``model.fit`` and to ``SignalRecord``.
    experiment_name : str
        Experiment name forwarded to ``R.start``; also used to name the log file.
    recorder_name : str
        Recorder name forwarded to ``R.start``.
    uri : str
        Tracking URI forwarded to ``R.start``.

    Notes
    -----
    Each record config gets a fresh ``kwargs`` dict before the runtime
    objects are injected, so the caller's ``task_config`` is never mutated.
    """
    model = init_instance_by_config(task_config["model"])
    model_fit_kwargs = dict(dataset=dataset)

    # Let's start the experiment.
    with R.start(
        experiment_name=experiment_name,
        recorder_name=recorder_name,
        uri=uri,
        resume=True,
    ):
        # Setup log: the log file lives in the recorder's local directory.
        recorder_root_dir = R.get_recorder().get_local_dir()
        log_file = os.path.join(recorder_root_dir, "{:}.log".format(experiment_name))
        set_log_basic_config(log_file)
        logger = get_module_logger("q.run_exp")
        logger.info("task_config::\n{:}".format(pprint.pformat(task_config, indent=2)))
        logger.info("[{:}] - [{:}]: {:}".format(experiment_name, recorder_name, uri))
        logger.info("dataset={:}".format(dataset))

        # Train model
        R.log_params(**flatten_dict(task_config))
        # Only pass a checkpoint location when ``model.fit`` declares one.
        fit_args = inspect.getfullargspec(model.fit).args
        if "save_path" in fit_args:
            model_fit_kwargs["save_path"] = os.path.join(recorder_root_dir, "model.ckp")
        elif "save_dir" in fit_args:
            model_fit_kwargs["save_dir"] = os.path.join(recorder_root_dir, "model-ckps")
        model.fit(**model_fit_kwargs)

        # Get the recorder
        recorder = R.get_recorder()
        R.save_objects(**{"model.pkl": model})

        # Generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            if record["class"] == "SignalRecord":
                injected = {"model": model, "dataset": dataset, "recorder": recorder}
            else:
                injected = {"recorder": recorder}
            # A shallow ``record.copy()`` would still share the nested
            # ``kwargs`` dict with ``task_config``; build a new one instead.
            record = dict(record, kwargs={**record["kwargs"], **injected})
            init_instance_by_config(record).generate()
def get_all_results(folders) -> dict:
    """Gather the metrics of all finished recorders, grouped by experiment.

    Parameters
    ----------
    folders : iterable
        Experiment names; each one is looked up with ``R.get_exp``.

    Returns
    -------
    dict
        ``{experiment_name: {result_key: [value per finished recorder]}}``.
    """
    # (result key, metric name) pairs, in collection order.
    metric_sources = (
        ("annualized_return_with_cost", "excess_return_with_cost.annualized_return"),
        ("information_ratio_with_cost", "excess_return_with_cost.information_ratio"),
        ("max_drawdown_with_cost", "excess_return_with_cost.max_drawdown"),
        ("ic", "IC"),
        ("icir", "ICIR"),
        ("rank_ic", "Rank IC"),
        ("rank_icir", "Rank ICIR"),
    )
    collected = {}
    for fn in folders:
        exp = R.get_exp(experiment_name=fn, create=False)
        recorders = exp.list_recorders()
        per_experiment = {key: [] for key, _ in metric_sources}
        for recorder_id in recorders:
            # Unfinished recorders contribute nothing.
            if recorders[recorder_id].status != "FINISHED":
                continue
            finished = R.get_recorder(recorder_id=recorder_id, experiment_name=fn)
            metrics = finished.list_metrics()
            for key, metric_name in metric_sources:
                per_experiment[key].append(metrics[metric_name])
        collected[fn] = per_experiment
    return collected
def train_with_sigana():
    """Train the model, then run SigAnaRecord and SignalMseRecord.

    Returns
    -------
    pred_score: pandas.DataFrame
        predict scores
    performance: dict
        model performance, with the keys ``"ic"`` and ``"ric"``
    uri_path: str
        the value of ``R.get_uri()`` taken inside the experiment
    """
    model = init_instance_by_config(task["model"])
    dataset = init_instance_by_config(task["dataset"])

    # start exp
    with R.start(experiment_name="workflow_with_sigana"):
        R.log_params(**flatten_dict(task))
        model.fit(dataset)

        # predict and calculate ic and ric
        recorder = R.get_recorder()
        sig_ana = SigAnaRecord(recorder, model=model, dataset=dataset)
        sig_ana.generate()
        ic = sig_ana.load(sig_ana.get_path("ic.pkl"))
        ric = sig_ana.load(sig_ana.get_path("ric.pkl"))
        pred_score = sig_ana.load("pred.pkl")

        sig_mse = SignalMseRecord(recorder)
        sig_mse.generate()

        uri_path = R.get_uri()

    performance = {"ic": ic, "ric": ric}
    return pred_score, performance, uri_path
def backtest(self):
    """Train the model and run a portfolio backtest analysed at day/30min/5min.

    ``self.port_analysis_config`` is updated in place with the strategy and
    the benchmark before it is handed to ``PortAnaRecord``.
    """
    self._init_qlib()
    model = init_instance_by_config(self.task["model"])
    dataset = init_instance_by_config(self.task["dataset"])
    self._train_model(model, dataset)

    self.port_analysis_config["strategy"] = {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            "signal": (model, dataset),
            "topk": 50,
            "n_drop": 5,
        },
    }
    self.port_analysis_config["backtest"]["benchmark"] = self.benchmark

    analysis_freqs = ("day", "30min", "5min")
    with R.start(experiment_name="backtest"):
        port_record = PortAnaRecord(
            R.get_recorder(),
            self.port_analysis_config,
            # Each argument receives its own list, as separate literals would.
            risk_analysis_freq=list(analysis_freqs),
            indicator_analysis_freq=list(analysis_freqs),
            indicator_analysis_method="value_weighted",
        )
        port_record.generate()
def train_with_sigana(uri_path: str = None):
    """Train the model, then generate SignalRecord and SigAnaRecord.

    Parameters
    ----------
    uri_path : str
        tracking URI forwarded to ``R.start``

    Returns
    -------
    pred_score: pandas.DataFrame
        predict scores
    performance: dict
        model performance, with the keys ``"ic"`` and ``"ric"``
    uri_path: str
        the value of ``R.get_uri()`` taken inside the experiment
    """
    model = init_instance_by_config(CSI300_GBDT_TASK["model"])
    dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])

    # start exp
    with R.start(experiment_name="workflow_with_sigana", uri=uri_path):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        model.fit(dataset)
        recorder = R.get_recorder()

        signal_record = SignalRecord(model, dataset, recorder)
        signal_record.generate()
        pred_score = signal_record.load("pred.pkl")

        # predict and calculate ic and ric
        sig_ana = SigAnaRecord(recorder)
        sig_ana.generate()
        performance = {
            "ic": sig_ana.load("ic.pkl"),
            "ric": sig_ana.load("ric.pkl"),
        }

        uri_path = R.get_uri()

    return pred_score, performance, uri_path
def _exe_task(task_config: dict):
    """Fit the task's model, save it with the dataset, and generate its records.

    Parameters
    ----------
    task_config : dict
        Task description with the keys ``"model"`` and ``"dataset"`` and an
        optional ``"record"`` entry (a single dict or a list of dicts).
    """
    active_recorder = R.get_recorder()

    # model & dataset initiation
    model: Model = init_instance_by_config(task_config["model"])
    dataset: Dataset = init_instance_by_config(task_config["dataset"])

    # FIXME: resume reweighter after merging data selection
    # reweighter: Reweighter = task_config.get("reweighter", None)
    # model training
    # auto_filter_kwargs(model.fit)(dataset, reweighter=reweighter)
    model.fit(dataset)
    R.save_objects(**{"params.pkl": model})

    # this dataset is saved for online inference. So the concrete data should not be dumped
    dataset.config(dump_all=False, recursive=True)
    R.save_objects(**{"dataset": dataset})

    # fill placeholder
    task_config = fill_placeholder(task_config, {"<MODEL>": model, "<DATASET>": dataset})

    # generate records: prediction, backtest, and analysis
    record_configs = task_config.get("record", [])
    if isinstance(record_configs, dict):
        # a lone dict stands for a single record
        record_configs = [record_configs]

    for record_config in record_configs:
        # Some records require the parameters `model` and `dataset`.
        # They are offered through `try_kwargs` so that task definitions
        # do not have to spell them out.
        init_instance_by_config(
            record_config,
            recorder=active_recorder,
            default_module="qlib.workflow.record_temp",
            try_kwargs={"model": model, "dataset": dataset},
        ).generate()
def _train_model(self, model, dataset):
    """Fit ``model`` on ``dataset`` in the "train" experiment and record its signal.

    The fitted model is saved as ``params.pkl`` and a ``SignalRecord`` is
    generated on the active recorder.
    """
    with R.start(experiment_name="train"):
        R.log_params(**flatten_dict(self.task))
        model.fit(dataset)
        R.save_objects(**{"params.pkl": model})

        # prediction
        SignalRecord(model, dataset, R.get_recorder()).generate()
def train_mse():
    """Train the CSI300 GBDT task and generate a SignalMseRecord.

    Returns
    -------
    uri
        the value of ``R.get_uri()`` taken inside the experiment
    """
    model = init_instance_by_config(CSI300_GBDT_TASK["model"])
    dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])

    with R.start(experiment_name="workflow"):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        model.fit(dataset)

        mse_record = SignalMseRecord(R.get_recorder(), model=model, dataset=dataset)
        mse_record.generate()

        uri = R.get_uri()
    return uri
def train_multiseg():
    """Train the CSI300 GBDT task and generate a MultiSegRecord for valid/test.

    Returns
    -------
    uri
        the value of ``R.get_uri()`` taken inside the experiment
    """
    model = init_instance_by_config(CSI300_GBDT_TASK["model"])
    dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])

    with R.start(experiment_name="workflow"):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        model.fit(dataset)

        multiseg_record = MultiSegRecord(model, dataset, R.get_recorder())
        segments = {"valid": "valid", "test": "test"}
        multiseg_record.generate(segments, True)

        uri = R.get_uri()
    return uri
def run_exp(task_config, dataset, experiment_name, recorder_name, uri):
    """Train the configured model and generate every configured record.

    Parameters
    ----------
    task_config : dict
        Task description. ``task_config["model"]`` is instantiated through
        ``init_instance_by_config`` and ``task_config["record"]`` is iterated
        to build the record generators.
    dataset
        Dataset handed to ``model.fit`` and to ``SignalRecord``.
    experiment_name : str
        Experiment name forwarded to ``R.start``; also used to name the log file.
    recorder_name : str
        Recorder name forwarded to ``R.start``.
    uri : str
        Tracking URI forwarded to ``R.start``.

    Notes
    -----
    Each record config gets a fresh ``kwargs`` dict before the runtime
    objects are injected, so the caller's ``task_config`` is never mutated.
    """
    # model initiation
    print("")
    print("[{:}] - [{:}]: {:}".format(experiment_name, recorder_name, uri))
    print("dataset={:}".format(dataset))

    model = init_instance_by_config(task_config["model"])

    # start exp
    with R.start(experiment_name=experiment_name, recorder_name=recorder_name, uri=uri):
        log_file = R.get_recorder().root_uri / "{:}.log".format(experiment_name)
        set_log_basic_config(log_file)

        # train model
        R.log_params(**flatten_dict(task_config))
        model.fit(dataset)
        recorder = R.get_recorder()
        R.save_objects(**{"model.pkl": model})

        # generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            if record["class"] == "SignalRecord":
                injected = {"model": model, "dataset": dataset, "recorder": recorder}
            else:
                injected = {"recorder": recorder}
            # A shallow ``record.copy()`` would still share the nested
            # ``kwargs`` dict with ``task_config``; build a new one instead.
            record = dict(record, kwargs={**record["kwargs"], **injected})
            init_instance_by_config(record).generate()
def __init__(self, recorder_id, experiment_id, provider_uri=r"E:\TDX\cjzq_tdx\vipdoc", region=REG_CN):
    """Initialise qlib, fetch the recorder and derive its artifact directories.

    Parameters
    ----------
    recorder_id
        id of the recorder to load; also the last component of ``expr_dir``
    experiment_id
        id of the experiment owning the recorder
    provider_uri
        data location forwarded to ``qlib.init``
    region
        region forwarded to ``qlib.init``
    """
    self.record_id = recorder_id
    self.experiment_id = experiment_id

    qlib.init(provider_uri=provider_uri, region=region)
    R.set_uri('file:D:\\Code\\my_qlib\\examples\\mlruns')
    self.recorder = R.get_recorder(recorder_id=recorder_id, experiment_id=experiment_id)

    # The first five characters of the recorder URI are dropped —
    # presumably the "file:" scheme prefix set above; confirm if the URI changes.
    tracking_root = Path(self.recorder.uri[5:])
    self.expr_dir = tracking_root / experiment_id / recorder_id
    self.artifacts_dir = self.expr_dir / 'artifacts'
    self.portfolio_dir = self.artifacts_dir / 'portfolio_analysis'
    self.sig_dir = self.artifacts_dir / 'sig_analysis'
def begin_task_train(task_config: dict, experiment_name: str, recorder_name: str = None) -> Recorder:
    """
    Start a recorder, log the task info to it and hand the recorder back.

    Args:
        task_config (dict): the config of a task
        experiment_name (str): the name of experiment
        recorder_name (str): the given name will be the recorder name. None for using rid.

    Returns:
        Recorder: the model recorder
    """
    start_kwargs = {"experiment_name": experiment_name, "recorder_name": recorder_name}
    with R.start(**start_kwargs):
        _log_task_info(task_config)
        return R.get_recorder()
def train(uri_path: str = None):
    """Train the CSI300 GBDT model and compute its signal analysis.

    Parameters
    ----------
    uri_path : str
        tracking URI forwarded to ``R.start``

    Returns
    -------
    pred_score: pandas.DataFrame
        predict scores
    performance: dict
        model performance, with the keys ``"ic"`` and ``"ric"``
    rid
        id of the recorder used for the run
    """
    # model initiation
    model = init_instance_by_config(CSI300_GBDT_TASK["model"])
    dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])

    # To test __repr__
    print(dataset)
    print(R)

    # start exp
    with R.start(experiment_name="workflow", uri=uri_path):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        model.fit(dataset)
        R.save_objects(trained_model=model)

        # prediction
        recorder = R.get_recorder()
        # To test __repr__
        print(recorder)
        # To test get_local_dir
        print(recorder.get_local_dir())
        rid = recorder.id

        signal_record = SignalRecord(model, dataset, recorder)
        signal_record.generate()
        pred_score = signal_record.load("pred.pkl")

        # calculate ic and ric
        sig_ana = SigAnaRecord(recorder)
        sig_ana.generate()
        performance = {
            "ic": sig_ana.load("ic.pkl"),
            "ric": sig_ana.load("ric.pkl"),
        }

    return pred_score, performance, rid
def task_train(task_config: dict, experiment_name: str, recorder_name: str = None) -> Recorder:
    """
    Task based training: log the task info, then execute the task.

    Parameters
    ----------
    task_config : dict
        The config of a task.
    experiment_name: str
        The name of experiment
    recorder_name: str
        The name of recorder

    Returns
    ----------
    Recorder: The instance of the recorder
    """
    start_kwargs = {"experiment_name": experiment_name, "recorder_name": recorder_name}
    with R.start(**start_kwargs):
        # step 1: store the task description; step 2: run the task itself
        for step in (_log_task_info, _exe_task):
            step(task_config)
        return R.get_recorder()
def begin_task_train(task_config: dict, experiment_name: str, recorder_name: str = None) -> Recorder:
    """
    Start a recorder and store the task config, params and host name in it.

    Args:
        task_config (dict): the config of a task
        experiment_name (str): the name of experiment
        recorder_name (str): the given name will be the recorder name. None for using rid.

    Returns:
        Recorder: the model recorder
    """
    with R.start(experiment_name=experiment_name, recorder_name=recorder_name):
        flat_params = flatten_dict(task_config)
        R.log_params(**flat_params)
        # keep the original format and datatype
        R.save_objects(task=task_config)
        R.set_tags(hostname=socket.gethostname())
        recorder: Recorder = R.get_recorder()
    return recorder
def train():
    """Train the model described by ``task`` and compute its signal analysis.

    Returns
    -------
    pred_score: pandas.DataFrame
        predict scores
    performance: dict
        model performance, with the keys ``"ic"`` and ``"ric"``
    rid
        id of the recorder used for the run
    """
    # model initiation
    model = init_instance_by_config(task["model"])
    dataset = init_instance_by_config(task["dataset"])

    # To test __repr__
    print(dataset)
    print(R)

    # start exp
    with R.start(experiment_name="workflow"):
        R.log_params(**flatten_dict(task))
        model.fit(dataset)

        # prediction
        recorder = R.get_recorder()
        # To test __repr__
        print(recorder)
        rid = recorder.id

        signal_record = SignalRecord(model, dataset, recorder)
        signal_record.generate()
        pred_score = signal_record.load()

        # calculate ic and ric
        sig_ana = SigAnaRecord(recorder)
        sig_ana.generate()
        ic = sig_ana.load(sig_ana.get_path("ic.pkl"))
        ric = sig_ana.load(sig_ana.get_path("ric.pkl"))

    performance = {"ic": ic, "ric": ric}
    return pred_score, performance, rid
def task_train(task_config: dict, experiment_name):
    """
    task based training

    Parameters
    ----------
    task_config : dict
        A dict describes a task setting. ``task_config["model"]`` and
        ``task_config["dataset"]`` are instantiated, and every entry of
        ``task_config["record"]`` is turned into a record generator.
    experiment_name : str
        Experiment name forwarded to ``R.start``.

    Notes
    -----
    Each record config gets a fresh ``kwargs`` dict before the runtime
    objects are injected, so the caller's ``task_config`` is never mutated.
    """
    # model initiation
    model = init_instance_by_config(task_config["model"])
    dataset = init_instance_by_config(task_config["dataset"])

    # start exp
    with R.start(experiment_name=experiment_name):
        # train model
        R.log_params(**flatten_dict(task_config))
        model.fit(dataset)
        recorder = R.get_recorder()
        R.save_objects(**{"params.pkl": model})

        # generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            if record["class"] == SignalRecord.__name__:
                injected = {"model": model, "dataset": dataset, "recorder": recorder}
            else:
                injected = {"recorder": recorder}
            # Updating ``record["kwargs"]`` in place would leave live
            # objects inside ``task_config``; work on a copy instead.
            record = dict(record, kwargs={**record["kwargs"], **injected})
            init_instance_by_config(record).generate()
def backtest_analysis(pred, rid):
    """Run the portfolio backtest on an existing recorder and load the analysis.

    Parameters
    ----------
    pred : pandas.DataFrame
        predict scores (not read by this function)
    rid : str
        the id of the recorder to be used in this function

    Returns
    -------
    analysis : pandas.DataFrame
        the analysis result
    """
    recorder = R.get_recorder(experiment_name="workflow", recorder_id=rid)

    # backtest
    port_record = PortAnaRecord(recorder, port_analysis_config)
    port_record.generate()

    analysis_path = port_record.get_path("port_analysis.pkl")
    analysis_df = port_record.load(analysis_path)
    print(analysis_df)
    return analysis_df
def get_all_results(folders) -> dict:
    """Gather the metrics of all finished recorders, grouped by experiment.

    Experiments for which ``R.get_exp`` raises ``ValueError`` are skipped.
    A finished recorder is skipped (with a printed notice) when any metric
    read here is missing, so the per-metric lists always stay aligned.

    Parameters
    ----------
    folders : iterable
        Experiment names; each one is looked up with ``R.get_exp``.

    Returns
    -------
    dict
        ``{experiment_name: {result_key: [value per complete recorder]}}``.
    """
    # (result key, metric name) pairs, in collection order.
    metric_sources = (
        ("annualized_return_with_cost", "1day.excess_return_with_cost.annualized_return"),
        ("information_ratio_with_cost", "1day.excess_return_with_cost.information_ratio"),
        ("max_drawdown_with_cost", "1day.excess_return_with_cost.max_drawdown"),
        ("ic", "IC"),
        ("icir", "ICIR"),
        ("rank_ic", "Rank IC"),
        ("rank_icir", "Rank ICIR"),
    )
    results = {}
    for fn in folders:
        try:
            exp = R.get_exp(experiment_name=fn, create=False)
        except ValueError:
            # No experiment results
            continue
        recorders = exp.list_recorders()
        result = {key: [] for key, _ in metric_sources}
        for recorder_id in recorders:
            if recorders[recorder_id].status != "FINISHED":
                continue
            recorder = R.get_recorder(recorder_id=recorder_id, experiment_name=fn)
            metrics = recorder.list_metrics()
            # Check every metric up front: testing only one key would let a
            # later missing key raise KeyError after some lists had grown.
            if any(name not in metrics for _, name in metric_sources):
                print(f"{recorder_id} is skipped due to incomplete result")
                continue
            for key, name in metric_sources:
                result[key].append(metrics[name])
        results[fn] = result
    return results
def main(xargs):
    """Train a QuantTransformer on Alpha360 features, then predict and backtest.

    Parameters
    ----------
    xargs
        Parsed arguments; only ``xargs.market`` is read, as the handler's
        ``instruments``.
    """
    infer_processors = [
        {
            "class": "RobustZScoreNorm",
            "kwargs": {"fields_group": "feature", "clip_outlier": True},
        },
        {"class": "Fillna", "kwargs": {"fields_group": "feature"}},
    ]
    learn_processors = [
        {"class": "DropnaLabel"},
        {"class": "CSRankNorm", "kwargs": {"fields_group": "label"}},
    ]
    handler_config = {
        "class": "Alpha360",
        "module_path": "qlib.contrib.data.handler",
        "kwargs": {
            "start_time": "2008-01-01",
            "end_time": "2020-08-01",
            "fit_start_time": "2008-01-01",
            "fit_end_time": "2014-12-31",
            "instruments": xargs.market,
            "infer_processors": infer_processors,
            "learn_processors": learn_processors,
            "label": ["Ref($close, -2) / Ref($close, -1) - 1"],
        },
    }
    dataset_config = {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": handler_config,
            "segments": {
                "train": ("2008-01-01", "2014-12-31"),
                "valid": ("2015-01-01", "2016-12-31"),
                "test": ("2017-01-01", "2020-08-01"),
            },
        },
    }

    model_config = {
        "class": "QuantTransformer",
        "module_path": "trade_models",
        "kwargs": {
            "loss": "mse",
            "GPU": "0",
            "metric": "loss",
        },
    }

    task = {"model": model_config, "dataset": dataset_config}

    model = init_instance_by_config(model_config)
    dataset = init_instance_by_config(dataset_config)

    # start exp to train model
    with R.start(experiment_name="train_tt_model"):
        R.log_params(**flatten_dict(task))
        model.fit(dataset)
        R.save_objects(trained_model=model)

        # prediction
        recorder = R.get_recorder()
        print(recorder)
        SignalRecord(model, dataset, recorder).generate()

        # backtest. If users want to use backtest based on their own prediction,
        # please refer to https://qlib.readthedocs.io/en/latest/component/recorder.html#record-template.
        PortAnaRecord(recorder, port_analysis_config).generate()
def backtest_analysis(pred, rid, uri_path: str = None):
    """Backtest the stored model on an existing recorder and load the analysis.

    Parameters
    ----------
    pred : pandas.DataFrame
        predict scores (not read by this function)
    rid : str
        the id of the recorder to be used in this function
    uri_path: str
        mlflow uri path

    Returns
    -------
    analysis : pandas.DataFrame
        the analysis result
    """
    with R.uri_context(uri=uri_path):
        recorder = R.get_recorder(experiment_name="workflow", recorder_id=rid)

    dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
    model = recorder.load_object("trained_model")

    executor_config = {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",
            "generate_portfolio_metrics": True,
        },
    }
    strategy_config = {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            "signal": (model, dataset),
            "topk": 50,
            "n_drop": 5,
        },
    }
    backtest_config = {
        "start_time": "2017-01-01",
        "end_time": "2020-08-01",
        "account": 100000000,
        "benchmark": CSI300_BENCH,
        "exchange_kwargs": {
            "freq": "day",
            "limit_threshold": 0.095,
            "deal_price": "close",
            "open_cost": 0.0005,
            "close_cost": 0.0015,
            "min_cost": 5,
        },
    }
    port_analysis_config = {
        "executor": executor_config,
        "strategy": strategy_config,
        "backtest": backtest_config,
    }

    # backtest
    port_record = PortAnaRecord(recorder, port_analysis_config, risk_analysis_freq="day")
    port_record.generate()
    analysis_df = port_record.load("port_analysis_1day.pkl")
    print(analysis_df)
    return analysis_df
"close_cost": 0.0015, "min_cost": 5, "return_order": True, }, } # model initialization model = init_instance_by_config(task["model"]) dataset = init_instance_by_config(task["dataset"]) # NOTE: This line is optional # It demonstrates that the dataset can be used standalone. example_df = dataset.prepare("train") print(example_df.head()) # start exp with R.start(experiment_name="workflow"): R.log_params(**flatten_dict(task)) model.fit(dataset) R.save_objects(**{"params.pkl": model}) # prediction recorder = R.get_recorder() sr = SignalRecord(model, dataset, recorder) sr.generate() # backtest. If users want to use backtest based on their own prediction, # please refer to https://qlib.readthedocs.io/en/latest/component/recorder.html#record-template. par = PortAnaRecord(recorder, port_analysis_config) par.generate()
def backtest_only_daily(self):
    """
    This backtest is used for comparing the nested execution and single layer execution.
    Due to the low quality of the daily-level and minute-level data, they are hardly comparable.
    So it is used for detecting serious bugs which make the results greatly different.

    .. code-block:: shell

        [1724971:MainThread](2021-12-07 16:24:31,156) INFO - qlib.workflow - [record_temp.py:441] - Portfolio analysis record 'port_analysis_1day.pkl'
        has been saved as the artifact of the Experiment 2
        'The following are analysis results of benchmark return(1day).'
                               risk
        mean               0.000651
        std                0.012472
        annualized_return  0.154967
        information_ratio  0.805422
        max_drawdown      -0.160445
        'The following are analysis results of the excess return without cost(1day).'
                               risk
        mean               0.001375
        std                0.006103
        annualized_return  0.327204
        information_ratio  3.475016
        max_drawdown      -0.024927
        'The following are analysis results of the excess return with cost(1day).'
                               risk
        mean               0.001184
        std                0.006091
        annualized_return  0.281801
        information_ratio  2.998749
        max_drawdown      -0.029568
        [1724971:MainThread](2021-12-07 16:24:31,170) INFO - qlib.workflow - [record_temp.py:466] - Indicator analysis record 'indicator_analysis_1day.pkl'
        has been saved as the artifact of the Experiment 2
        'The following are analysis results of indicators(1day).'
             value
        ffr    1.0
        pa     0.0
        pos    0.0
        [1724971:MainThread](2021-12-07 16:24:31,188) INFO - qlib.timer - [log.py:113] - Time cost: 0.007s | waiting `async_log` Done

    """
    self._init_qlib()
    model = init_instance_by_config(self.task["model"])
    dataset = init_instance_by_config(self.task["dataset"])
    self._train_model(model, dataset)

    # Work on a private copy so the shared config keeps its original content.
    pa_conf = deepcopy(self.port_analysis_config)
    pa_conf["strategy"] = {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            "signal": (model, dataset),
            "topk": 50,
            "n_drop": 5,
        },
    }
    # Single-layer, day-level executor.
    pa_conf["executor"] = {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",
            "generate_portfolio_metrics": True,
            "verbose": True,
        },
    }
    pa_conf["backtest"]["benchmark"] = self.benchmark

    with R.start(experiment_name="backtest"):
        PortAnaRecord(R.get_recorder(), pa_conf).generate()
def run_exp(
    task_config,
    dataset,
    experiment_name,
    recorder_name,
    uri,
    model_obj_name="model.pkl",
):
    """Load or train the configured model, then generate every configured record.

    The model is first looked up in the recorder under ``model_obj_name``.
    When that raises ``OSError`` the model is trained and saved under that
    name. Any other exception from the lookup is re-raised as ``ValueError``.

    Parameters
    ----------
    task_config : dict
        Task description with the keys ``"model"`` and ``"record"``.
    dataset
        Dataset handed to ``model.fit`` and to the records.
    experiment_name : str
        Experiment name forwarded to ``R.start``; also used to name the log file.
    recorder_name : str
        Recorder name forwarded to ``R.start``.
    uri : str
        Tracking URI forwarded to ``R.start``.
    model_obj_name : str
        Object name under which the model is loaded from / saved to the recorder.
    """
    model = init_instance_by_config(task_config["model"])
    model_fit_kwargs = dict(dataset=dataset)

    # Let's start the experiment.
    with R.start(
        experiment_name=experiment_name,
        recorder_name=recorder_name,
        uri=uri,
        resume=True,
    ):
        # Setup log
        recorder_root_dir = R.get_recorder().get_local_dir()
        log_file = os.path.join(recorder_root_dir, "{:}.log".format(experiment_name))
        set_log_basic_config(log_file)
        logger = get_module_logger("q.run_exp")
        logger.info("task_config::\n{:}".format(pprint.pformat(task_config, indent=2)))
        logger.info("[{:}] - [{:}]: {:}".format(experiment_name, recorder_name, uri))
        logger.info("dataset={:}".format(dataset))

        # Train model
        try:
            # A model exposing ``to`` is put back on the device the freshly
            # built instance reported before it was replaced.
            movable = hasattr(model, "to")
            ori_device = model.device if movable else None
            model = R.load_object(model_obj_name)
            if movable:
                model.to(ori_device)
            logger.info("[Find existing object from {:}]".format(model_obj_name))
        except OSError:
            R.log_params(**flatten_dict(update_gpu(task_config, None)))
            fit_args = inspect.getfullargspec(model.fit).args
            if "save_path" in fit_args:
                model_fit_kwargs["save_path"] = os.path.join(recorder_root_dir, "model.ckp")
            elif "save_dir" in fit_args:
                model_fit_kwargs["save_dir"] = os.path.join(recorder_root_dir, "model-ckps")
            model.fit(**model_fit_kwargs)
            # move the model to CPU for saving, then back to where it was
            movable = hasattr(model, "to")
            if movable:
                old_device = model.device
                model.to("cpu")
            R.save_objects(**{model_obj_name: model})
            if movable:
                model.to(old_device)
        except Exception as e:
            raise ValueError("Something wrong: {:}".format(e))

        # Get the recorder
        recorder = R.get_recorder()

        # Generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            record_conf = deepcopy(record)
            record_class = record_conf["class"]
            if record_class == "MultiSegRecord":
                # This record's kwargs are replaced wholesale and its
                # ``generate`` call takes extra arguments from the config.
                record_conf["kwargs"] = dict(model=model, dataset=dataset, recorder=recorder)
                init_instance_by_config(record_conf).generate(**record_conf["generate_kwargs"])
                continue
            if record_class == "SignalRecord":
                injected = {"model": model, "dataset": dataset, "recorder": recorder}
            else:
                injected = {"recorder": recorder}
            record_conf["kwargs"].update(injected)
            init_instance_by_config(record_conf).generate()