Example #1
def main(seed, config_file="configs/config_alstm.yaml"):

    # load config
    with open(config_file) as f:
        config = yaml.safe_load(f)

    # set the random seed and the per-seed log directory
    # seed_suffix = "/seed1000" if "init" in config_file else f"/seed{seed}"
    seed_suffix = ""
    config["task"]["model"]["kwargs"].update(
        {
            "seed": seed,
            "logdir": config["task"]["model"]["kwargs"]["logdir"] + seed_suffix,
        }
    )

    # initialize workflow
    qlib.init(
        provider_uri=config["qlib_init"]["provider_uri"],
        region=config["qlib_init"]["region"],
    )
    dataset = init_instance_by_config(config["task"]["dataset"])
    model = init_instance_by_config(config["task"]["model"])

    # train model
    model.fit(dataset)
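The function above loads a YAML config, patches the model seed and log directory, initializes qlib, and trains once. A minimal sketch of how it might be driven for several seeds (the __main__ guard and the seed range are assumptions, not part of the original script):

if __name__ == "__main__":
    # run the workflow for a few seeds; adjust the range and config path as needed
    for s in range(3):
        main(s, config_file="configs/config_alstm.yaml")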
Example #2
def _exe_task(task_config: dict):
    rec = R.get_recorder()
    # model & dataset initialization
    model: Model = init_instance_by_config(task_config["model"])
    dataset: Dataset = init_instance_by_config(task_config["dataset"])
    # FIXME: resume reweighter after merging data selection
    # reweighter: Reweighter = task_config.get("reweighter", None)
    # model training
    # auto_filter_kwargs(model.fit)(dataset, reweighter=reweighter)
    model.fit(dataset)
    R.save_objects(**{"params.pkl": model})
    # this dataset is saved for online inference. So the concrete data should not be dumped
    dataset.config(dump_all=False, recursive=True)
    R.save_objects(**{"dataset": dataset})
    # fill placeholders
    placeholder_value = {"<MODEL>": model, "<DATASET>": dataset}
    task_config = fill_placeholder(task_config, placeholder_value)
    # generate records: prediction, backtest, and analysis
    records = task_config.get("record", [])
    if isinstance(records, dict):  # handle the case of a single record dict
        records = [records]
    for record in records:
        # Some recorders require the parameters `model` and `dataset`.
        # Try to pass them to the initialization function automatically
        # to make defining the task easier.
        r = init_instance_by_config(
            record,
            recorder=rec,
            default_module="qlib.workflow.record_temp",
            try_kwargs={
                "model": model,
                "dataset": dataset
            },
        )
        r.generate()
Example #3
def train_with_sigana(uri_path: str = None):
    """train model followed by SigAnaRecord

    Returns
    -------
        pred_score: pandas.DataFrame
            predict scores
        performance: dict
            model performance
    """
    model = init_instance_by_config(CSI300_GBDT_TASK["model"])
    dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
    # start exp
    with R.start(experiment_name="workflow_with_sigana", uri=uri_path):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        model.fit(dataset)
        recorder = R.get_recorder()

        sr = SignalRecord(model, dataset, recorder)
        sr.generate()
        pred_score = sr.load("pred.pkl")

        # predict and calculate ic and ric
        sar = SigAnaRecord(recorder)
        sar.generate()
        ic = sar.load("ic.pkl")
        ric = sar.load("ric.pkl")

        uri_path = R.get_uri()
    return pred_score, {"ic": ic, "ric": ric}, uri_path
Example #4
def train_with_sigana():
    """train model followed by SigAnaRecord

    Returns
    -------
        pred_score: pandas.DataFrame
            predict scores
        performance: dict
            model performance
    """
    model = init_instance_by_config(task["model"])
    dataset = init_instance_by_config(task["dataset"])

    # start exp
    with R.start(experiment_name="workflow_with_sigana"):
        R.log_params(**flatten_dict(task))
        model.fit(dataset)

        # predict and calculate ic and ric
        recorder = R.get_recorder()
        sar = SigAnaRecord(recorder, model=model, dataset=dataset)
        sar.generate()
        ic = sar.load(sar.get_path("ic.pkl"))
        ric = sar.load(sar.get_path("ric.pkl"))
        pred_score = sar.load("pred.pkl")

        smr = SignalMseRecord(recorder)
        smr.generate()
        uri_path = R.get_uri()
    return pred_score, {"ic": ic, "ric": ric}, uri_path
Example #5
    def dump_and_load_dataset(self):
        """dump and load dataset state on disk"""
        self._init_qlib()
        self._prepare_calender_cache()
        dataset = init_instance_by_config(self.task["dataset"])
        dataset_backtest = init_instance_by_config(self.task["dataset_backtest"])

        ##=============dump dataset=============
        dataset.to_pickle(path="dataset.pkl")
        dataset_backtest.to_pickle(path="dataset_backtest.pkl")

        del dataset, dataset_backtest
        ##=============reload dataset=============
        with open("dataset.pkl", "rb") as file_dataset:
            dataset = pickle.load(file_dataset)

        with open("dataset_backtest.pkl", "rb") as file_dataset_backtest:
            dataset_backtest = pickle.load(file_dataset_backtest)

        self._prepare_calender_cache()
        ##=============reinit dataset=============
        dataset.init(init_type=DataHandlerLP.IT_LS)
        dataset_backtest.init()

        ##=============get data=============
        xtrain, xtest = dataset.prepare(["train", "test"])
        backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"])

        print(xtrain, xtest)
        print(backtest_train, backtest_test)
        del xtrain, xtest
        del backtest_train, backtest_test
Example #6
    def backtest(self):
        self._init_qlib()
        model = init_instance_by_config(self.task["model"])
        dataset = init_instance_by_config(self.task["dataset"])
        self._train_model(model, dataset)
        strategy_config = {
            "class": "TopkDropoutStrategy",
            "module_path": "qlib.contrib.strategy.signal_strategy",
            "kwargs": {
                "signal": (model, dataset),
                "topk": 50,
                "n_drop": 5,
            },
        }
        self.port_analysis_config["strategy"] = strategy_config
        self.port_analysis_config["backtest"]["benchmark"] = self.benchmark

        with R.start(experiment_name="backtest"):

            recorder = R.get_recorder()
            par = PortAnaRecord(
                recorder,
                self.port_analysis_config,
                risk_analysis_freq=["day", "30min", "5min"],
                indicator_analysis_freq=["day", "30min", "5min"],
                indicator_analysis_method="value_weighted",
            )
            par.generate()
Example #7
def end_task_train(rec: Recorder, experiment_name: str) -> Recorder:
    """
    Finish task training with real model fitting and saving.

    Args:
        rec (Recorder): the recorder that will be resumed
        experiment_name (str): the name of experiment

    Returns:
        Recorder: the model recorder
    """
    with R.start(experiment_name=experiment_name,
                 recorder_id=rec.info["id"],
                 resume=True):
        task_config = R.load_object("task")
        # model & dataset initialization
        model: Model = init_instance_by_config(task_config["model"])
        dataset: Dataset = init_instance_by_config(task_config["dataset"])
        # model training
        model.fit(dataset)
        R.save_objects(**{"params.pkl": model})
        # this dataset is saved for online inference. So the concrete data should not be dumped
        dataset.config(dump_all=False, recursive=True)
        R.save_objects(**{"dataset": dataset})
        # fill placeholders
        placeholder_value = {"<MODEL>": model, "<DATASET>": dataset}
        task_config = fill_placeholder(task_config, placeholder_value)
        # generate records: prediction, backtest, and analysis
        records = task_config.get("record", [])
        if isinstance(records, dict):  # handle the case of a single record dict
            records = [records]
        for record in records:
            r = init_instance_by_config(record, recorder=rec)
            r.generate()
    return rec
Example #8
    def dump_and_load_dataset(self):
        """dump and load dataset state on disk"""
        self._init_qlib()
        self._prepare_calender_cache()
        dataset = init_instance_by_config(self.task["dataset"])
        dataset_backtest = init_instance_by_config(self.task["dataset_backtest"])

        ##=============dump dataset=============
        dataset.to_pickle(path="dataset.pkl")
        dataset_backtest.to_pickle(path="dataset_backtest.pkl")

        del dataset, dataset_backtest
        ##=============reload dataset=============
        with open("dataset.pkl", "rb") as file_dataset:
            dataset = pickle.load(file_dataset)

        with open("dataset_backtest.pkl", "rb") as file_dataset_backtest:
            dataset_backtest = pickle.load(file_dataset_backtest)

        self._prepare_calender_cache()
        ##=============reinit dataset=============
        dataset.config(
            handler_kwargs={
                "start_time": "2021-01-19 00:00:00",
                "end_time": "2021-01-25 16:00:00",
            },
            segments={
                "test": (
                    "2021-01-19 00:00:00",
                    "2021-01-25 16:00:00",
                ),
            },
        )
        dataset.setup_data(
            handler_kwargs={
                "init_type": DataHandlerLP.IT_LS,
            },
        )
        dataset_backtest.config(
            handler_kwargs={
                "start_time": "2021-01-19 00:00:00",
                "end_time": "2021-01-25 16:00:00",
            },
            segments={
                "test": (
                    "2021-01-19 00:00:00",
                    "2021-01-25 16:00:00",
                ),
            },
        )
        dataset_backtest.setup_data(handler_kwargs={})

        ##=============get data=============
        xtest = dataset.prepare("test")
        backtest_test = dataset_backtest.prepare("test")

        print(xtest, backtest_test)
        return
Example #9
def train_multiseg():
    model = init_instance_by_config(CSI300_GBDT_TASK["model"])
    dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
    with R.start(experiment_name="workflow"):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        model.fit(dataset)
        recorder = R.get_recorder()
        sr = MultiSegRecord(model, dataset, recorder)
        sr.generate(dict(valid="valid", test="test"), True)
        uri = R.get_uri()
    return uri
Example #10
def run_exp(task_config, dataset, experiment_name, recorder_name, uri):

    model = init_instance_by_config(task_config["model"])
    model_fit_kwargs = dict(dataset=dataset)

    # Let's start the experiment.
    with R.start(
            experiment_name=experiment_name,
            recorder_name=recorder_name,
            uri=uri,
            resume=True,
    ):
        # Set up logging
        recorder_root_dir = R.get_recorder().get_local_dir()
        log_file = os.path.join(recorder_root_dir,
                                "{:}.log".format(experiment_name))
        set_log_basic_config(log_file)
        logger = get_module_logger("q.run_exp")
        logger.info("task_config::\n{:}".format(
            pprint.pformat(task_config, indent=2)))
        logger.info("[{:}] - [{:}]: {:}".format(experiment_name, recorder_name,
                                                uri))
        logger.info("dataset={:}".format(dataset))

        # Train model
        R.log_params(**flatten_dict(task_config))
        if "save_path" in inspect.getfullargspec(model.fit).args:
            model_fit_kwargs["save_path"] = os.path.join(
                recorder_root_dir, "model.ckp")
        elif "save_dir" in inspect.getfullargspec(model.fit).args:
            model_fit_kwargs["save_dir"] = os.path.join(
                recorder_root_dir, "model-ckps")
        model.fit(**model_fit_kwargs)
        # Get the recorder
        recorder = R.get_recorder()
        R.save_objects(**{"model.pkl": model})

        # Generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            record = record.copy()
            if record["class"] == "SignalRecord":
                srconf = {
                    "model": model,
                    "dataset": dataset,
                    "recorder": recorder
                }
                record["kwargs"].update(srconf)
                sr = init_instance_by_config(record)
                sr.generate()
            else:
                rconf = {"recorder": recorder}
                record["kwargs"].update(rconf)
                ar = init_instance_by_config(record)
                ar.generate()
Example #11
def train_mse():
    model = init_instance_by_config(CSI300_GBDT_TASK["model"])
    dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
    with R.start(experiment_name="workflow"):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        model.fit(dataset)
        recorder = R.get_recorder()
        sr = SignalMseRecord(recorder, model=model, dataset=dataset)
        sr.generate()
        uri = R.get_uri()
    return uri
Example #12
    def get_data(self):
        """use dataset to get highreq data"""
        self._init_qlib()
        self._prepare_calender_cache()

        dataset = init_instance_by_config(self.task["dataset"])
        xtrain, xtest = dataset.prepare(["train", "test"])
        print(xtrain, xtest)

        dataset_backtest = init_instance_by_config(self.task["dataset_backtest"])
        backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"])
        print(backtest_train, backtest_test)

        return
Example #13
def objective(trial):
    task = {
        "model": {
            "class": "LGBModel",
            "module_path": "qlib.contrib.model.gbdt",
            "kwargs": {
                "loss": "mse",
                "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1),
                "learning_rate": trial.suggest_uniform("learning_rate", 0, 1),
                "subsample": trial.suggest_uniform("subsample", 0, 1),
                "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 1e4),
                "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 1e4),
                "max_depth": 10,
                "num_leaves": trial.suggest_int("num_leaves", 1, 1024),
                "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
                "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
                "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
                "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 50),
                "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
            },
        },
    }

    evals_result = dict()
    model = init_instance_by_config(task["model"])
    model.fit(dataset, evals_result=evals_result)
    return min(evals_result["valid"])
Example #14
    def __init__(self,
                 config: Tuple[list, tuple, dict],
                 filter_pipe=None,
                 swap_level=True,
                 freq="day"):
        """
        Parameters
        ----------
        config : Tuple[list, tuple, dict]
            Please refer to the doc of DLWParser
        filter_pipe :
            Filter pipe for the instruments
        swap_level :
            Whether to swap the levels of the MultiIndex
        freq : str
            Frequency of the data
        """
        if filter_pipe is not None:
            assert isinstance(filter_pipe,
                              list), "The type of `filter_pipe` must be list."
            filter_pipe = [
                init_instance_by_config(
                    fp,
                    None if "module_path" in fp else filter_module,
                    accept_types=BaseDFilter) for fp in filter_pipe
            ]

        self.filter_pipe = filter_pipe
        self.swap_level = swap_level
        self.freq = freq
        super().__init__(config)
Example #15
    def _gen_data(self, config, datasets=["train", "valid", "test"]):
        try:
            path = config.pop("path")
        except KeyError as e:
            raise ValueError("Must specify the path to save the dataset.") from e
        if os.path.isfile(path):
            start = time.time()
            print_log("Dataset exists, load from disk.", __name__)

            # res = dataset.prepare(['train', 'valid', 'test'])
            with open(path, "rb") as f:
                data = pkl.load(f)
            if isinstance(data, dict):
                res = [data[i] for i in datasets]
            else:
                res = data.prepare(datasets)
            print_log(f"Data loaded, time cost: {time.time() - start:.2f}", __name__)
        else:
            if not os.path.exists(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path))
            print_log("Generating dataset", __name__)
            start_time = time.time()
            self._prepare_calender_cache()
            dataset = init_instance_by_config(config)
            dataset.config(dump_all=True, recursive=True)
            dataset.to_pickle(path)
            res = dataset.prepare(datasets)
            print_log(f"Data generated, time cost: {(time.time() - start_time):.2f}", __name__)
        return res
Example #16
    def _gen_dataset(self, config):
        try:
            path = config.pop("path")
        except KeyError as e:
            raise ValueError("Must specify the path to save the dataset.") from e
        if os.path.isfile(path):
            start = time.time()
            print_log("Dataset exists, load from disk.", __name__)

            with open(path, "rb") as f:
                dataset = pkl.load(f)
            print_log(f"Data loaded, time cost: {time.time() - start:.2f}", __name__)
        else:
            start = time.time()
            if not os.path.exists(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path))
            print_log("Generating dataset", __name__)
            self._prepare_calender_cache()
            dataset = init_instance_by_config(config)
            print_log(f"Dataset init, time cost: {time.time() - start:.2f}", __name__)
            dataset.prepare(["train", "valid", "test"])
            print_log(f"Dataset prepared, time cost: {time.time() - start:.2f}", __name__)
            dataset.config(dump_all=True, recursive=True)
            dataset.to_pickle(path)
        return dataset
Example #17
    def basic_task(self):
        """For fast training rolling"""
        if self.model_type == "gbdt":
            conf_path = DIRNAME.parent.parent / "benchmarks" / "LightGBM" / "workflow_config_lightgbm_Alpha158.yaml"
            # dump the processed data to disk so later runs can load it and skip reprocessing
            h_path = DIRNAME / "lightgbm_alpha158_handler_horizon{}.pkl".format(
                self.horizon)
        elif self.model_type == "linear":
            conf_path = DIRNAME.parent.parent / "benchmarks" / "Linear" / "workflow_config_linear_Alpha158.yaml"
            h_path = DIRNAME / "linear_alpha158_handler_horizon{}.pkl".format(
                self.horizon)
        else:
            raise AssertionError("Model type is not supported!")
        with conf_path.open("r") as f:
            conf = yaml.safe_load(f)

        # modify dataset horizon
        conf["task"]["dataset"]["kwargs"]["handler"]["kwargs"]["label"] = [
            "Ref($close, -{}) / Ref($close, -1) - 1".format(self.horizon + 1)
        ]

        task = conf["task"]

        if not h_path.exists():
            h_conf = task["dataset"]["kwargs"]["handler"]
            h = init_instance_by_config(h_conf)
            h.to_pickle(h_path, dump_all=True)

        task["dataset"]["kwargs"]["handler"] = f"file://{h_path}"
        task["record"] = ["qlib.workflow.record_temp.SignalRecord"]
        return task
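The dict returned by basic_task() has the same shape as the task configs consumed by the task_train-style helpers shown in Example #24. A minimal sketch, assuming qlib is already initialized, `rb` exposes basic_task() as above, and the experiment name is arbitrary (the import path is an assumption):

from qlib.model.trainer import task_train

task = rb.basic_task()
task_train(task, experiment_name="rolling_basic")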
Example #18
def train(uri_path: str = None):
    """train model

    Returns
    -------
        pred_score: pandas.DataFrame
            predict scores
        performance: dict
            model performance
    """

    # model initialization
    model = init_instance_by_config(CSI300_GBDT_TASK["model"])
    dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
    # To test __repr__
    print(dataset)
    print(R)

    # start exp
    with R.start(experiment_name="workflow", uri=uri_path):
        R.log_params(**flatten_dict(CSI300_GBDT_TASK))
        model.fit(dataset)
        R.save_objects(trained_model=model)
        # prediction
        recorder = R.get_recorder()
        # To test __repr__
        print(recorder)
        # To test get_local_dir
        print(recorder.get_local_dir())
        rid = recorder.id
        sr = SignalRecord(model, dataset, recorder)
        sr.generate()
        pred_score = sr.load("pred.pkl")

        # calculate ic and ric
        sar = SigAnaRecord(recorder)
        sar.generate()
        ic = sar.load("ic.pkl")
        ric = sar.load("ric.pkl")

    return pred_score, {"ic": ic, "ric": ric}, rid
Example #19
def run_exp(task_config, dataset, experiment_name, recorder_name, uri):

    print("")
    print("[{:}] - [{:}]: {:}".format(experiment_name, recorder_name, uri))
    print("dataset={:}".format(dataset))

    # model initialization
    model = init_instance_by_config(task_config["model"])

    # start exp
    with R.start(experiment_name=experiment_name,
                 recorder_name=recorder_name,
                 uri=uri):

        log_file = R.get_recorder().root_uri / "{:}.log".format(
            experiment_name)
        set_log_basic_config(log_file)

        # train model
        R.log_params(**flatten_dict(task_config))
        model.fit(dataset)
        recorder = R.get_recorder()
        R.save_objects(**{"model.pkl": model})

        # generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            record = record.copy()
            if record["class"] == "SignalRecord":
                srconf = {
                    "model": model,
                    "dataset": dataset,
                    "recorder": recorder
                }
                record["kwargs"].update(srconf)
                sr = init_instance_by_config(record)
                sr.generate()
            else:
                rconf = {"recorder": recorder}
                record["kwargs"].update(rconf)
                ar = init_instance_by_config(record)
                ar.generate()
Example #20
    def __init__(self,
                 handler_config: dict,
                 fetch_kwargs: dict = {},
                 is_group=False):
        """
        Parameters
        ----------
        handler_config : dict
            handler_config will be used to describe the handlers

            .. code-block::

                <handler_config> := {
                    "group_name1": <handler>
                    "group_name2": <handler>
                }
                or
                <handler_config> := <handler>
                <handler> := DataHandler Instance | DataHandler Config

        fetch_kwargs : dict
            extra keyword arguments passed to the handlers' fetch method, such as col_set, squeeze, data_key, etc.

        is_group : bool
            whether the keys of `handler_config` are group names

        """
        from qlib.data.dataset.handler import DataHandler

        if is_group:
            self.handlers = {
                grp: init_instance_by_config(config, accept_types=DataHandler)
                for grp, config in handler_config.items()
            }
        else:
            self.handlers = init_instance_by_config(handler_config,
                                                    accept_types=DataHandler)

        self.is_group = is_group
        self.fetch_kwargs = {"col_set": DataHandler.CS_RAW}
        self.fetch_kwargs.update(fetch_kwargs)
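For reference, a grouped handler_config as described by the grammar above might look like the following (the group name and the handler kwargs are assumptions, modeled on the Alpha158 config in Example #26):

# hypothetical grouped config; with is_group=True, each value is turned into a
# DataHandler instance via init_instance_by_config
handler_config = {
    "feature_group": {
        "class": "Alpha158",
        "module_path": "qlib.contrib.data.handler",
        "kwargs": {
            "instruments": "csi300",
            "start_time": "2019-01-01",
            "end_time": "2020-12-31",
        },
    },
}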
Example #21
    def collect_data(self):
        self._init_qlib()
        model = init_instance_by_config(self.task["model"])
        dataset = init_instance_by_config(self.task["dataset"])
        self._train_model(model, dataset)
        executor_config = self.port_analysis_config["executor"]
        backtest_config = self.port_analysis_config["backtest"]
        backtest_config["benchmark"] = self.benchmark
        strategy_config = {
            "class": "TopkDropoutStrategy",
            "module_path": "qlib.contrib.strategy.signal_strategy",
            "kwargs": {
                "signal": (model, dataset),
                "topk": 50,
                "n_drop": 5,
            },
        }
        data_generator = collect_data(
            executor=executor_config, strategy=strategy_config, **backtest_config
        )
        for trade_decision in data_generator:
            print(trade_decision)
Example #22
def train():
    """train model

    Returns
    -------
        pred_score: pandas.DataFrame
            predict scores
        performance: dict
            model performance
    """

    # model initialization
    model = init_instance_by_config(task["model"])
    dataset = init_instance_by_config(task["dataset"])
    # To test __repr__
    print(dataset)
    print(R)

    # start exp
    with R.start(experiment_name="workflow"):
        R.log_params(**flatten_dict(task))
        model.fit(dataset)

        # prediction
        recorder = R.get_recorder()
        # To test __repr__
        print(recorder)
        rid = recorder.id
        sr = SignalRecord(model, dataset, recorder)
        sr.generate()
        pred_score = sr.load()

        # calculate ic and ric
        sar = SigAnaRecord(recorder)
        sar.generate()
        ic = sar.load(sar.get_path("ic.pkl"))
        ric = sar.load(sar.get_path("ric.pkl"))

    return pred_score, {"ic": ic, "ric": ric}, rid
Example #23
def end_task_train(rec: Recorder, experiment_name: str) -> Recorder:
    """
    Finish task training with real model fitting and saving.

    Args:
        rec (Recorder): the recorder that will be resumed
        experiment_name (str): the name of experiment

    Returns:
        Recorder: the model recorder
    """
    with R.start(experiment_name=experiment_name,
                 recorder_id=rec.info["id"],
                 resume=True):
        task_config = R.load_object("task")
        # model & dataset initialization
        model: Model = init_instance_by_config(task_config["model"])
        dataset: Dataset = init_instance_by_config(task_config["dataset"])
        # model training
        model.fit(dataset)
        R.save_objects(**{"params.pkl": model})
        # this dataset is saved for online inference. So the concrete data should not be dumped
        dataset.config(dump_all=False, recursive=True)
        R.save_objects(**{"dataset": dataset})
        # generate records: prediction, backtest, and analysis
        records = task_config.get("record", [])
        if isinstance(records, dict):  # prevent only one dict
            records = [records]
        for record in records:
            cls, kwargs = get_cls_kwargs(
                record, default_module="qlib.workflow.record_temp")
            if cls is SignalRecord:
                rconf = {"model": model, "dataset": dataset, "recorder": rec}
            else:
                rconf = {"recorder": rec}
            r = cls(**kwargs, **rconf)
            r.generate()

    return rec
Example #24
def task_train(task_config: dict, experiment_name):
    """
    task based training

    Parameters
    ----------
    task_config : dict
        A dict that describes a task setting.
    experiment_name : str
        the name of the experiment
    """

    # model initialization
    model = init_instance_by_config(task_config["model"])
    dataset = init_instance_by_config(task_config["dataset"])

    # start exp
    with R.start(experiment_name=experiment_name):
        # train model
        R.log_params(**flatten_dict(task_config))
        model.fit(dataset)
        recorder = R.get_recorder()
        R.save_objects(**{"params.pkl": model})

        # generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            if record["class"] == SignalRecord.__name__:
                srconf = {
                    "model": model,
                    "dataset": dataset,
                    "recorder": recorder
                }
                record["kwargs"].update(srconf)
                sr = init_instance_by_config(record)
                sr.generate()
            else:
                rconf = {"recorder": recorder}
                record["kwargs"].update(rconf)
                ar = init_instance_by_config(record)
                ar.generate()
Example #25
    def get_feature_importance(self):
        # the model must be LightGBM, because we need to get the feature importance from it
        rb = RollingBenchmark(model_type="gbdt")
        task = rb.basic_task()

        model = init_instance_by_config(task["model"])
        dataset = init_instance_by_config(task["dataset"])
        model.fit(dataset)

        fi = model.get_feature_importance()

        # Because the model uses numpy arrays instead of a dataframe when training LightGBM,
        # we must take the following extra steps to map the importances back to feature names
        df = dataset.prepare(segments=slice(None),
                             col_set="feature",
                             data_key=DataHandlerLP.DK_R)
        cols = df.columns
        fi_named = {
            cols[int(k.split("_")[1])]: imp
            for k, imp in fi.to_dict().items()
        }

        return pd.Series(fi_named)
Example #26
    def _dump_pre_handler(self, path):
        handler_config = {
            "class": "Alpha158",
            "module_path": "qlib.contrib.data.handler",
            "kwargs": {
                "start_time": self.start_time,
                "end_time": self.end_time,
                "instruments": self.MARKET,
                "infer_processors": [],
                "learn_processors": [],
            },
        }
        pre_handler = init_instance_by_config(handler_config)
        pre_handler.config(dump_all=True)
        pre_handler.to_pickle(path)
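Once dumped, the pickled handler can be plugged back into a task config through a file:// URI, the same trick Example #17 uses for its handler cache. A minimal sketch, as it might appear in another method of the same class (the path and the surrounding task dict are assumptions):

handler_path = "pre_handler_alpha158.pkl"  # hypothetical output path
self._dump_pre_handler(handler_path)
task["dataset"]["kwargs"]["handler"] = f"file://{handler_path}"  # reuse the dumped handler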
Example #27
    def init_vars(self, init_cash, position_dict, freq: str, benchmark_config: dict):
        self.init_cash = init_cash
        self.current_position: BasePosition = init_instance_by_config(
            {
                "class": self._pos_type,
                "kwargs": {
                    "cash": init_cash,
                    "position_dict": position_dict,
                },
                "module_path": "qlib.backtest.position",
            }
        )
        self.portfolio_metrics = None
        self.hist_positions = {}
        self.reset(freq=freq, benchmark_config=benchmark_config)
Example #28
def create_signal_from(
    obj: Union[Signal, Tuple[BaseModel, Dataset], List, Dict, Text, pd.Series,
               pd.DataFrame]
) -> Signal:
    """
    Create a signal from diverse information.
    This method chooses the right way to build a signal based on the type of `obj`.
    Please refer to the code below.
    """
    if isinstance(obj, Signal):
        return obj
    elif isinstance(obj, (tuple, list)):
        return ModelSignal(*obj)
    elif isinstance(obj, (dict, str)):
        return init_instance_by_config(obj)
    elif isinstance(obj, (pd.DataFrame, pd.Series)):
        return SignalWCache(signal=obj)
    else:
        raise NotImplementedError(f"This type of signal is not supported")
Example #29
def main(xargs, exp_yaml):
    assert Path(exp_yaml).exists(), "{:} does not exist.".format(exp_yaml)

    with open(exp_yaml) as fp:
        config = yaml.safe_load(fp)
    config = update_gpu(config, xargs.gpu)
    # config = update_market(config, 'csi300')

    qlib.init(**config.get("qlib_init"))
    dataset_config = config.get("task").get("dataset")
    dataset = init_instance_by_config(dataset_config)
    pprint("args: {:}".format(xargs))
    pprint(dataset_config)
    pprint(dataset)

    for irun in range(xargs.times):
        run_exp(config.get("task"), dataset, xargs.alg,
                "recorder-{:02d}-{:02d}".format(irun,
                                                xargs.times), xargs.save_dir)
Example #30
    def dump_data_for_proxy_model(self):
        """
        Dump data for training meta model.
        The meta model will be trained upon the proxy forecasting model.
        This dataset is for the proxy forecasting model.
        """
        topk = 30
        fi = self.get_feature_importance()
        col_selected = fi.nlargest(topk)

        rb = RollingBenchmark(model_type=self.sim_task_model)
        task = rb.basic_task()
        dataset = init_instance_by_config(task["dataset"])
        prep_ds = dataset.prepare(slice(None),
                                  col_set=["feature", "label"],
                                  data_key=DataHandlerLP.DK_L)

        feature_df = prep_ds["feature"]
        label_df = prep_ds["label"]

        feature_selected = feature_df.loc[:, col_selected.index]

        feature_selected = feature_selected.groupby("datetime").apply(
            lambda df: (df - df.mean()).div(df.std()))
        feature_selected = feature_selected.fillna(0.0)

        df_all = {
            "label": label_df.reindex(feature_selected.index),
            "feature": feature_selected,
        }
        df_all = pd.concat(df_all, axis=1)
        df_all.to_pickle(DIRNAME / "fea_label_df.pkl")

        # dump the data in handler format to align with the DataHandlerLP interface
        handler = DataHandlerLP(
            data_loader={
                "class": "qlib.data.dataset.loader.StaticDataLoader",
                "kwargs": {
                    "config": DIRNAME / "fea_label_df.pkl"
                },
            })
        handler.to_pickle(DIRNAME / "handler_proxy.pkl", dump_all=True)