def train_with_sigana(uri_path: str = None):
    """train model followed by SigAnaRecord

    Returns
    -------
    pred_score: pandas.DataFrame
        predict scores
    performance: dict
        model performance
    """
    # Build model and dataset instances from the shared task template.
    model = init_instance_by_config(CSI300_GBDT_TASK["model"])
    dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])

    # Run the full workflow inside a single experiment run.
    with R.start(experiment_name="workflow_with_sigana", uri=uri_path):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        model.fit(dataset)
        active_recorder = R.get_recorder()

        signal_rec = SignalRecord(model, dataset, active_recorder)
        signal_rec.generate()
        pred_score = signal_rec.load("pred.pkl")

        # predict and calculate ic and ric
        analysis_rec = SigAnaRecord(active_recorder)
        analysis_rec.generate()
        ic = analysis_rec.load("ic.pkl")
        ric = analysis_rec.load("ric.pkl")

        uri_path = R.get_uri()
    return pred_score, {"ic": ic, "ric": ric}, uri_path
def train_with_sigana():
    """train model followed by SigAnaRecord

    Returns
    -------
    pred_score: pandas.DataFrame
        predict scores
    performance: dict
        model performance
    """
    forecaster = init_instance_by_config(task["model"])
    ds = init_instance_by_config(task["dataset"])

    # start exp
    with R.start(experiment_name="workflow_with_sigana"):
        R.log_params(**flatten_dict(task))
        forecaster.fit(ds)

        # predict and calculate ic and ric
        rec = R.get_recorder()
        sig_ana = SigAnaRecord(rec, model=forecaster, dataset=ds)
        sig_ana.generate()
        ic = sig_ana.load(sig_ana.get_path("ic.pkl"))
        ric = sig_ana.load(sig_ana.get_path("ric.pkl"))
        pred_score = sig_ana.load("pred.pkl")

        mse_rec = SignalMseRecord(rec)
        mse_rec.generate()
        uri_path = R.get_uri()
    return pred_score, {"ic": ic, "ric": ric}, uri_path
def train_meta_model(self): """ training a meta model based on a simplified linear proxy model; """ # 1) leverage the simplified proxy forecasting model to train meta model. # - Only the dataset part is important, in current version of meta model will integrate the rb = RollingBenchmark(model_type=self.sim_task_model) sim_task = rb.basic_task() proxy_forecast_model_task = { # "model": "qlib.contrib.model.linear.LinearModel", "dataset": { "class": "qlib.data.dataset.DatasetH", "kwargs": { "handler": f"file://{(DIRNAME / 'handler_proxy.pkl').absolute()}", "segments": { "train": ("2008-01-01", "2010-12-31"), "test": ("2011-01-01", sim_task["dataset"]["kwargs"]["segments"]["test"][1]), }, }, }, # "record": ["qlib.workflow.record_temp.SignalRecord"] } # the proxy_forecast_model_task will be used to create meta tasks. # The test date of first task will be 2011-01-01. Each test segment will be about 20days # The tasks include all training tasks and test tasks. # 2) preparing meta dataset kwargs = dict( task_tpl=proxy_forecast_model_task, step=self.step, segments=0.62, # keep test period consistent with the dataset yaml trunc_days=1 + self.horizon, hist_step_n=30, fill_method="max", rolling_ext_days=0, ) # NOTE: # the input of meta model (internal data) are shared between proxy model and final forecasting model # but their task test segment are not aligned! It worked in my previous experiment. # So the misalignment will not affect the effectiveness of the method. with self._internal_data_path.open("rb") as f: internal_data = pickle.load(f) md = MetaDatasetDS(exp_name=internal_data, **kwargs) # 3) train and logging meta model with R.start(experiment_name=self.meta_exp_name): R.log_params(**kwargs) mm = MetaModelDS(step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=200, seed=43) mm.fit(md) R.save_objects(model=mm)
def _train_model(self, model, dataset):
    """Fit ``model`` on ``dataset`` inside a fresh "train" run, saving the
    fitted model and generating its prediction record."""
    with R.start(experiment_name="train"):
        R.log_params(**flatten_dict(self.task))
        model.fit(dataset)
        R.save_objects(**{"params.pkl": model})

        # prediction
        rec = R.get_recorder()
        signal_record = SignalRecord(model, dataset, rec)
        signal_record.generate()
def train_mse():
    """Train the GBDT task and generate an MSE signal record.

    Returns the experiment uri of the run.
    """
    gbdt_model = init_instance_by_config(CSI300_GBDT_TASK["model"])
    gbdt_dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
    with R.start(experiment_name="workflow"):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        gbdt_model.fit(gbdt_dataset)
        rec = R.get_recorder()
        mse_record = SignalMseRecord(rec, model=gbdt_model, dataset=gbdt_dataset)
        mse_record.generate()
        uri = R.get_uri()
    return uri
def run_exp(task_config, dataset, experiment_name, recorder_name, uri):
    """Train the model described by ``task_config`` on ``dataset`` and
    generate its records (prediction / backtest / analysis).

    Parameters
    ----------
    task_config : dict
        Task description containing at least "model" and "record" sections.
    dataset :
        A prepared dataset object passed straight to ``model.fit``.
    experiment_name, recorder_name, uri :
        Experiment bookkeeping handed to ``R.start`` (resume=True).
    """
    model = init_instance_by_config(task_config["model"])
    model_fit_kwargs = dict(dataset=dataset)
    # Let's start the experiment.
    with R.start(
        experiment_name=experiment_name,
        recorder_name=recorder_name,
        uri=uri,
        resume=True,
    ):
        # Setup log
        recorder_root_dir = R.get_recorder().get_local_dir()
        log_file = os.path.join(recorder_root_dir, "{:}.log".format(experiment_name))
        set_log_basic_config(log_file)
        logger = get_module_logger("q.run_exp")
        logger.info("task_config::\n{:}".format(pprint.pformat(task_config, indent=2)))
        logger.info("[{:}] - [{:}]: {:}".format(experiment_name, recorder_name, uri))
        logger.info("dataset={:}".format(dataset))

        # Train model; forward a checkpoint location only if fit() accepts one.
        R.log_params(**flatten_dict(task_config))
        if "save_path" in inspect.getfullargspec(model.fit).args:
            model_fit_kwargs["save_path"] = os.path.join(recorder_root_dir, "model.ckp")
        elif "save_dir" in inspect.getfullargspec(model.fit).args:
            model_fit_kwargs["save_dir"] = os.path.join(recorder_root_dir, "model-ckps")
        model.fit(**model_fit_kwargs)

        # Get the recorder
        recorder = R.get_recorder()
        R.save_objects(**{"model.pkl": model})

        # Generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            # BUG FIX: ``record.copy()`` was shallow, so updating the nested
            # "kwargs" dict mutated the caller's ``task_config``. Copy the
            # nested dict too before injecting runtime objects.
            record = dict(record)
            record["kwargs"] = dict(record.get("kwargs", {}))
            if record["class"] == "SignalRecord":
                srconf = {"model": model, "dataset": dataset, "recorder": recorder}
                record["kwargs"].update(srconf)
                sr = init_instance_by_config(record)
                sr.generate()
            else:
                rconf = {"recorder": recorder}
                record["kwargs"].update(rconf)
                ar = init_instance_by_config(record)
                ar.generate()
def train_multiseg():
    """Train the GBDT task and generate a multi-segment record over the
    valid and test segments.

    Returns the experiment uri of the run.
    """
    mdl = init_instance_by_config(CSI300_GBDT_TASK["model"])
    ds = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
    with R.start(experiment_name="workflow"):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        mdl.fit(ds)
        rec = R.get_recorder()
        multi_seg = MultiSegRecord(mdl, ds, rec)
        multi_seg.generate(dict(valid="valid", test="test"), True)
        uri = R.get_uri()
    return uri
def ens_rolling(self):
    """Collect pred/label artifacts from the rolling experiment, ensemble
    them, and save the merged artifacts into the combined experiment."""
    collector = RecorderCollector(
        experiment=self.rolling_exp,
        artifacts_key=["pred", "label"],
        process_list=[RollingEnsemble()],
        # rec_key_func=lambda rec: (self.COMB_EXP, rec.info["id"]),
        artifacts_path={"pred": "pred.pkl", "label": "label.pkl"},
    )
    merged = collector()
    with R.start(experiment_name=self.COMB_EXP):
        R.log_params(exp_name=self.rolling_exp)
        R.save_objects(**{"pred.pkl": merged["pred"], "label.pkl": merged["label"]})
def fake_experiment(): """A fake experiment workflow to test uri Returns ------- pass_or_not_for_default_uri: bool pass_or_not_for_current_uri: bool temporary_exp_dir: str """ # start exp default_uri = R.get_uri() current_uri = "file:./temp-test-exp-mag" with R.start(experiment_name="fake_workflow_for_expm", uri=current_uri): R.log_params(**flatten_dict(CSI300_GBDT_TASK)) current_uri_to_check = R.get_uri() default_uri_to_check = R.get_uri() return default_uri == default_uri_to_check, current_uri == current_uri_to_check, current_uri
def run_exp(task_config, dataset, experiment_name, recorder_name, uri):
    """Train the model described by ``task_config`` on ``dataset`` and
    generate its records (prediction / backtest / analysis).

    Parameters
    ----------
    task_config : dict
        Task description containing at least "model" and "record" sections.
    dataset :
        A prepared dataset object passed straight to ``model.fit``.
    experiment_name, recorder_name, uri :
        Experiment bookkeeping handed to ``R.start``.
    """
    # model initiaiton
    print("")
    print("[{:}] - [{:}]: {:}".format(experiment_name, recorder_name, uri))
    print("dataset={:}".format(dataset))
    model = init_instance_by_config(task_config["model"])

    # start exp
    with R.start(experiment_name=experiment_name, recorder_name=recorder_name, uri=uri):
        log_file = R.get_recorder().root_uri / "{:}.log".format(experiment_name)
        set_log_basic_config(log_file)

        # train model
        R.log_params(**flatten_dict(task_config))
        model.fit(dataset)
        recorder = R.get_recorder()
        R.save_objects(**{"model.pkl": model})

        # generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            # BUG FIX: ``record.copy()`` was shallow, so updating the nested
            # "kwargs" dict mutated the caller's ``task_config``. Copy the
            # nested dict too before injecting runtime objects.
            record = dict(record)
            record["kwargs"] = dict(record.get("kwargs", {}))
            if record["class"] == "SignalRecord":
                srconf = {"model": model, "dataset": dataset, "recorder": recorder}
                record["kwargs"].update(srconf)
                sr = init_instance_by_config(record)
                sr.generate()
            else:
                rconf = {"recorder": recorder}
                record["kwargs"].update(rconf)
                ar = init_instance_by_config(record)
                ar.generate()
def train(uri_path: str = None):
    """train model

    Returns
    -------
    pred_score: pandas.DataFrame
        predict scores
    performance: dict
        model performance
    """
    # model initiaiton
    gbdt_model = init_instance_by_config(CSI300_GBDT_TASK["model"])
    gbdt_dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
    # To test __repr__
    print(gbdt_dataset)
    print(R)

    # start exp
    with R.start(experiment_name="workflow", uri=uri_path):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        gbdt_model.fit(gbdt_dataset)
        R.save_objects(trained_model=gbdt_model)

        # prediction
        rec = R.get_recorder()
        # To test __repr__
        print(rec)
        # To test get_local_dir
        print(rec.get_local_dir())
        rid = rec.id

        signal_rec = SignalRecord(gbdt_model, gbdt_dataset, rec)
        signal_rec.generate()
        pred_score = signal_rec.load("pred.pkl")

        # calculate ic and ric
        analysis_rec = SigAnaRecord(rec)
        analysis_rec.generate()
        ic = analysis_rec.load("ic.pkl")
        ric = analysis_rec.load("ric.pkl")
    return pred_score, {"ic": ic, "ric": ric}, rid
def begin_task_train(task_config: dict, experiment_name: str, recorder_name: str = None) -> Recorder:
    """
    Begin task training to start a recorder and save the task config.

    Args:
        task_config (dict): the config of a task
        experiment_name (str): the name of experiment
        recorder_name (str): the given name will be the recorder name. None for using rid.

    Returns:
        Recorder: the model recorder
    """
    with R.start(experiment_name=experiment_name, recorder_name=recorder_name):
        R.log_params(**flatten_dict(task_config))
        # keep the original format and datatype
        R.save_objects(**{"task": task_config})
        R.set_tags(**{"hostname": socket.gethostname()})
        rec: Recorder = R.get_recorder()
    return rec
def train():
    """train model

    Returns
    -------
    pred_score: pandas.DataFrame
        predict scores
    performance: dict
        model performance
    """
    # model initiaiton
    mdl = init_instance_by_config(task["model"])
    ds = init_instance_by_config(task["dataset"])
    # To test __repr__
    print(ds)
    print(R)

    # start exp
    with R.start(experiment_name="workflow"):
        R.log_params(**flatten_dict(task))
        mdl.fit(ds)

        # prediction
        rec = R.get_recorder()
        # To test __repr__
        print(rec)
        rid = rec.id

        signal_rec = SignalRecord(mdl, ds, rec)
        signal_rec.generate()
        pred_score = signal_rec.load()

        # calculate ic and ric
        analysis_rec = SigAnaRecord(rec)
        analysis_rec.generate()
        ic = analysis_rec.load(analysis_rec.get_path("ic.pkl"))
        ric = analysis_rec.load(analysis_rec.get_path("ric.pkl"))
    return pred_score, {"ic": ic, "ric": ric}, rid
def task_train(task_config: dict, experiment_name):
    """
    task based training

    Parameters
    ----------
    task_config : dict
        A dict describes a task setting.
    experiment_name :
        Name of the experiment the run is recorded under.
    """
    # model initiaiton
    model = init_instance_by_config(task_config["model"])
    dataset = init_instance_by_config(task_config["dataset"])

    # start exp
    with R.start(experiment_name=experiment_name):
        # train model
        R.log_params(**flatten_dict(task_config))
        model.fit(dataset)
        recorder = R.get_recorder()
        R.save_objects(**{"params.pkl": model})

        # generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            # BUG FIX: the record dicts were mutated in place (``kwargs``
            # updated with runtime objects), corrupting the caller's
            # ``task_config``. Work on copies instead.
            record = dict(record)
            record["kwargs"] = dict(record.get("kwargs", {}))
            if record["class"] == SignalRecord.__name__:
                srconf = {"model": model, "dataset": dataset, "recorder": recorder}
                record["kwargs"].update(srconf)
                sr = init_instance_by_config(record)
                sr.generate()
            else:
                rconf = {"recorder": recorder}
                record["kwargs"].update(rconf)
                ar = init_instance_by_config(record)
                ar.generate()
def main(xargs):
    # Dataset config: Alpha360 features on the market given by the CLI args,
    # normalized/cleaned for inference and rank-normalized labels for learning.
    dataset_config = {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": {
                "class": "Alpha360",
                "module_path": "qlib.contrib.data.handler",
                "kwargs": {
                    "start_time": "2008-01-01",
                    "end_time": "2020-08-01",
                    "fit_start_time": "2008-01-01",
                    "fit_end_time": "2014-12-31",
                    "instruments": xargs.market,
                    "infer_processors": [
                        {"class": "RobustZScoreNorm", "kwargs": {"fields_group": "feature", "clip_outlier": True}},
                        {"class": "Fillna", "kwargs": {"fields_group": "feature"}},
                    ],
                    "learn_processors": [
                        {"class": "DropnaLabel"},
                        {"class": "CSRankNorm", "kwargs": {"fields_group": "label"}},
                    ],
                    # 2-day-ahead return used as the prediction label
                    "label": ["Ref($close, -2) / Ref($close, -1) - 1"],
                },
            },
            "segments": {
                "train": ("2008-01-01", "2014-12-31"),
                "valid": ("2015-01-01", "2016-12-31"),
                "test": ("2017-01-01", "2020-08-01"),
            },
        },
    }

    model_config = {
        "class": "QuantTransformer",
        "module_path": "trade_models",
        "kwargs": {
            "loss": "mse",
            "GPU": "0",
            "metric": "loss",
        },
    }

    task = {"model": model_config, "dataset": dataset_config}

    model = init_instance_by_config(model_config)
    dataset = init_instance_by_config(dataset_config)

    # start exp to train model
    with R.start(experiment_name="train_tt_model"):
        R.log_params(**flatten_dict(task))
        model.fit(dataset)
        R.save_objects(trained_model=model)

        # prediction
        recorder = R.get_recorder()
        print(recorder)
        sr = SignalRecord(model, dataset, recorder)
        sr.generate()

        # backtest. If users want to use backtest based on their own prediction,
        # please refer to https://qlib.readthedocs.io/en/latest/component/recorder.html#record-template.
        # NOTE(review): port_analysis_config is defined elsewhere in this file/module.
        par = PortAnaRecord(recorder, port_analysis_config)
        par.generate()
"close_cost": 0.0015, "min_cost": 5, "return_order": True, }, } # model initialization model = init_instance_by_config(task["model"]) dataset = init_instance_by_config(task["dataset"]) # NOTE: This line is optional # It demonstrates that the dataset can be used standalone. example_df = dataset.prepare("train") print(example_df.head()) # start exp with R.start(experiment_name="workflow"): R.log_params(**flatten_dict(task)) model.fit(dataset) R.save_objects(**{"params.pkl": model}) # prediction recorder = R.get_recorder() sr = SignalRecord(model, dataset, recorder) sr.generate() # backtest. If users want to use backtest based on their own prediction, # please refer to https://qlib.readthedocs.io/en/latest/component/recorder.html#record-template. par = PortAnaRecord(recorder, port_analysis_config) par.generate()
"deal_price": "close", "open_cost": 0.0005, "close_cost": 0.0015, "min_cost": 5, }, }, } # NOTE: This line is optional # It demonstrates that the dataset can be used standalone. example_df = dataset.prepare("train") print(example_df.head()) # start exp with R.start(experiment_name="workflow"): R.log_params(**flatten_dict(CSI300_GBDT_TASK)) model.fit(dataset) R.save_objects(**{"params.pkl": model}) # prediction recorder = R.get_recorder() sr = SignalRecord(model, dataset, recorder) sr.generate() # Signal Analysis sar = SigAnaRecord(recorder) sar.generate() # backtest. If users want to use backtest based on their own prediction, # please refer to https://qlib.readthedocs.io/en/latest/component/recorder.html#record-template. par = PortAnaRecord(recorder, port_analysis_config, "day")
def run_exp(
    task_config,
    dataset,
    experiment_name,
    recorder_name,
    uri,
    model_obj_name="model.pkl",
):
    # Resumable experiment runner: reloads a previously-saved model if one
    # exists in the recorder, otherwise trains and saves it, then generates
    # all records declared in task_config["record"].
    model = init_instance_by_config(task_config["model"])
    model_fit_kwargs = dict(dataset=dataset)
    # Let's start the experiment.
    with R.start(
        experiment_name=experiment_name,
        recorder_name=recorder_name,
        uri=uri,
        resume=True,
    ):
        # Setup log
        recorder_root_dir = R.get_recorder().get_local_dir()
        log_file = os.path.join(recorder_root_dir, "{:}.log".format(experiment_name))
        set_log_basic_config(log_file)
        logger = get_module_logger("q.run_exp")
        logger.info("task_config::\n{:}".format(pprint.pformat(task_config, indent=2)))
        logger.info("[{:}] - [{:}]: {:}".format(experiment_name, recorder_name, uri))
        logger.info("dataset={:}".format(dataset))

        # Train model
        try:
            # First try to resume: load a previously-saved model object.
            if hasattr(model, "to"):  # Recoverable model
                # Remember the configured device so the reloaded model can
                # be moved back onto it.
                ori_device = model.device
                model = R.load_object(model_obj_name)
                model.to(ori_device)
            else:
                model = R.load_object(model_obj_name)
            logger.info("[Find existing object from {:}]".format(model_obj_name))
        except OSError:
            # No saved object yet: train from scratch and persist the result.
            R.log_params(**flatten_dict(update_gpu(task_config, None)))
            # Forward a checkpoint location only if fit() accepts one.
            if "save_path" in inspect.getfullargspec(model.fit).args:
                model_fit_kwargs["save_path"] = os.path.join(recorder_root_dir, "model.ckp")
            elif "save_dir" in inspect.getfullargspec(model.fit).args:
                model_fit_kwargs["save_dir"] = os.path.join(recorder_root_dir, "model-ckps")
            model.fit(**model_fit_kwargs)
            # remove model to CPU for saving
            if hasattr(model, "to"):
                old_device = model.device
                model.to("cpu")
                R.save_objects(**{model_obj_name: model})
                model.to(old_device)
            else:
                R.save_objects(**{model_obj_name: model})
        except Exception as e:
            raise ValueError("Something wrong: {:}".format(e))

        # Get the recorder
        recorder = R.get_recorder()

        # Generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            # deepcopy so injecting runtime objects does not mutate task_config
            record = deepcopy(record)
            if record["class"] == "MultiSegRecord":
                record["kwargs"] = dict(model=model, dataset=dataset, recorder=recorder)
                sr = init_instance_by_config(record)
                sr.generate(**record["generate_kwargs"])
            elif record["class"] == "SignalRecord":
                srconf = {"model": model, "dataset": dataset, "recorder": recorder}
                record["kwargs"].update(srconf)
                sr = init_instance_by_config(record)
                sr.generate()
            else:
                rconf = {"recorder": recorder}
                record["kwargs"].update(rconf)
                ar = init_instance_by_config(record)
                ar.generate()
def _log_task_info(task_config: dict):
    """Record the task config on the active run: flattened params, the raw
    task object, and the current hostname as a tag."""
    R.log_params(**flatten_dict(task_config))
    # keep the original format and datatype
    R.save_objects(**{"task": task_config})
    R.set_tags(**{"hostname": socket.gethostname()})