Ejemplo n.º 1
0
    def train(self, tasks: list, train_func: Callable = None, experiment_name: str = None, **kwargs) -> List[Recorder]:
        """
        Train every task in `tasks` and return the resulting recorders in the same order.

        Args:
            tasks (list): a list of definitions based on `task` dict. A single `task` dict is also accepted and is
                treated as a one-element list.
            train_func (Callable): the training method which needs at least `tasks` and `experiment_name`.
                None for the default training method (`self.train_func`).
            experiment_name (str): the experiment name, None for use default name (`self.experiment_name`).
            kwargs: the params for train_func.

        Returns:
            List[Recorder]: one Recorder per task, each tagged with `{self.STATUS_KEY: self.STATUS_BEGIN}`.
            An empty list is returned when `tasks` is empty.
        """
        if isinstance(tasks, dict):
            tasks = [tasks]
        if len(tasks) == 0:
            return []
        if train_func is None:
            train_func = self.train_func
        if experiment_name is None:
            experiment_name = self.experiment_name
        if self._call_in_subproc:
            # Wrap exactly once, before the loop. Wrapping inside the loop would re-wrap the already
            # wrapped callable on every iteration, stacking one more `call_in_subproc` layer per task.
            get_module_logger("TrainerR").info("running models in sub process (for forcing release memroy).")
            train_func = call_in_subproc(train_func, C)
        recs = []
        for task in tqdm(tasks, desc="train tasks"):
            rec = train_func(task, experiment_name, recorder_name=self.default_rec_name, **kwargs)
            rec.set_tags(**{self.STATUS_KEY: self.STATUS_BEGIN})
            recs.append(rec)
        return recs
Ejemplo n.º 2
0
    def _load_internal(self, instrument, start_index, end_index, freq):
        """
        Load one value per calendar position in the closed range [start_index, end_index].

        For every position the feature is loaded up to the calendar time at that position and the last
        element of the loaded data is kept (NaN when nothing was loaded).

        Returns
        -------
        pd.Series
            float32 series named `str(self)` and indexed by `RangeIndex(start_index, end_index + 1)`.
            An empty float32 series is returned as soon as a `FileNotFoundError` is hit.

        Raises
        ------
        ValueError
            If the feature's extended window reaches into the future (right window size > 0).
        """
        calendar = Cal.calendar(freq=freq)
        values = np.empty(end_index - start_index + 1, dtype="float32")

        for offset, cur_index in enumerate(range(start_index, end_index + 1)):
            cur_time = calendar[cur_index]
            # Extra history is needed so that the expression can be evaluated accurately.
            left_ws, right_ws = self.feature.get_extended_window_size()
            if right_ws > 0:
                raise ValueError(
                    "PIT database does not support referring to future period (e.g. expressions like `Ref('$$roewa_q', -1)` are not supported"
                )

            # The wanted value is always the last element, hence the end offset of zero.
            try:
                loaded = self._load_feature(instrument, -left_ws, 0, cur_time)
                if len(loaded) > 0:
                    values[offset] = loaded.iloc[-1]
                else:
                    values[offset] = np.nan
            except FileNotFoundError:
                missing_logger = get_module_logger("base")
                missing_logger.warning(f"WARN: period data not found for {str(self)}")
                return pd.Series(dtype="float32", name=str(self))

        positions = pd.RangeIndex(start_index, end_index + 1)
        return pd.Series(values, index=positions, dtype="float32", name=str(self))
Ejemplo n.º 3
0
def get_mongodb() -> Database:
    """
    Return the MongoDB database described by `C["mongo"]`.

    The address and the database name have to be declared before this is called, either

        through qlib.init():

            mongo_conf = {
                "task_url": task_url,  # your MongoDB url
                "task_db_name": task_db_name,  # database name
            }
            qlib.init(..., mongo=mongo_conf)

        or after qlib.init():

            C["mongo"] = {
                "task_url" : "mongodb://localhost:27017/",
                "task_db_name" : "rolling_db"
            }

    Returns:
        Database: the Database instance

    Raises:
        KeyError: re-raised (after logging an error) when `C["mongo"]` is not configured.
    """
    try:
        mongo_conf = C["mongo"]
    except KeyError:
        task_logger = get_module_logger("task")
        task_logger.error("Please configure `C['mongo']` before using TaskManager")
        raise
    else:
        mongo_client = MongoClient(mongo_conf["task_url"])
        return mongo_client.get_database(name=mongo_conf["task_db_name"])
Ejemplo n.º 4
0
    def __call__(self, ensemble_dict: dict) -> pd.DataFrame:
        """Average the members of `ensemble_dict` after per-datetime standardization.

        Usage sample:
        from qlib.model.ens.ensemble import AverageEnsemble
        pred_res['new_key_name'] = AverageEnsemble()(predict_dict)

        Parameters
        ----------
        ensemble_dict : dict
            Dictionary you want to ensemble; nested dicts are flattened first.

        Returns
        -------
        pd.DataFrame
            The ensembling result, sorted by index.
            NOTE: the row-wise mean collapses the columns, so the underlying data
            changes from pd.DataFrame to pd.Series.
        """
        # nested dicts are flattened before anything else
        flat = flatten_dict(ensemble_dict, sep=FLATTEN_TUPLE)
        group_keys = list(flat.keys())
        get_module_logger("AverageEnsemble").info(f"keys in group: {group_keys}")

        side_by_side = pd.concat(list(flat.values()), axis=1)

        def _standardize(df):
            # z-score inside one datetime group
            return (df - df.mean()) / df.std()

        standardized = side_by_side.groupby("datetime").apply(_standardize)
        return standardized.mean(axis=1).sort_index()
Ejemplo n.º 5
0
 def __call__(self, ensemble_dict: dict) -> pd.DataFrame:
     """
     Concatenate the artifacts of `ensemble_dict`, ordered by their earliest "datetime" value.

     When an index entry appears more than once, only the last occurrence after that ordering
     is kept. The result is sorted by index.
     """
     get_module_logger("RollingEnsemble").info(f"keys in group: {list(ensemble_dict.keys())}")
     ordered = sorted(
         ensemble_dict.values(),
         key=lambda art: art.index.get_level_values("datetime").min(),
     )
     stacked = pd.concat(ordered)
     # For duplicated predictions, the latest one wins.
     superseded = stacked.index.duplicated(keep="last")
     return stacked[~superseded].sort_index()
Ejemplo n.º 6
0
    def load(self, instruments=None, start_time=None, end_time=None) -> pd.DataFrame:
        """
        Fetch data between `start_time` and `end_time` (on the "datetime" level) from the handler(s).

        `instruments` is not used; a warning is logged when it is given. When `self.is_group` is true,
        `self.handlers` is iterated as a mapping and the per-group results are concatenated along the
        columns, keyed by group name. Otherwise `self.handlers` is used as a single handler.
        """
        if instruments is not None:
            get_module_logger(self.__class__.__name__).warning(f"instruments[{instruments}] is ignored")

        time_range = slice(start_time, end_time)
        if not self.is_group:
            return self.handlers.fetch(selector=time_range, level="datetime", **self.fetch_kwargs)

        per_group = {}
        for grp, dh in self.handlers.items():
            per_group[grp] = dh.fetch(selector=time_range, level="datetime", **self.fetch_kwargs)
        return pd.concat(per_group, axis=1)
Ejemplo n.º 7
0
    def __init__(
        self,
        *,
        riskmodel_root,
        market="csi500",
        turn_limit=None,
        name_mapping=None,
        optimizer_kwargs=None,
        verbose=False,
        **kwargs,
    ):
        """
        Parameters
        ----------
        riskmodel_root :
            stored as `self.riskmodel_root`
        market : str
            stored as `self.market`, "csi500" by default
        turn_limit :
            stored as `self.turn_limit`
        name_mapping : dict, optional
            overrides for the risk data names. Recognized keys are "factor_exp", "factor_cov",
            "specific_risk" and "blacklist"; a missing key falls back to the matching class constant.
            None (the default) means no overrides.
        optimizer_kwargs : dict, optional
            keyword arguments forwarded to `EnhancedIndexingOptimizer`. None (the default) means no
            extra arguments.
        verbose : bool
            stored as `self.verbose`
        **kwargs :
            forwarded to the parent class initializer
        """
        super().__init__(**kwargs)

        # `None` replaces the former shared `{}` defaults and is also accepted when passed explicitly
        # (which used to fail on `.get` / `**`).
        name_mapping = {} if name_mapping is None else name_mapping
        optimizer_kwargs = {} if optimizer_kwargs is None else optimizer_kwargs

        self.logger = get_module_logger("EnhancedIndexingStrategy")

        self.riskmodel_root = riskmodel_root
        self.market = market
        self.turn_limit = turn_limit

        self.factor_exp_path = name_mapping.get("factor_exp",
                                                self.FACTOR_EXP_NAME)
        self.factor_cov_path = name_mapping.get("factor_cov",
                                                self.FACTOR_COV_NAME)
        self.specific_risk_path = name_mapping.get("specific_risk",
                                                   self.SPECIFIC_RISK_NAME)
        self.blacklist_path = name_mapping.get("blacklist",
                                               self.BLACKLIST_NAME)

        self.optimizer = EnhancedIndexingOptimizer(**optimizer_kwargs)

        self.verbose = verbose

        self._riskdata_cache = {}
Ejemplo n.º 8
0
    def get_range_limit(self, **kwargs: Any) -> Tuple[int, int]:
        """
        Return the expected step range that limits when the decision may be executed.
        Both ends are **closed**.

        When no trade range is available, `default_value` is returned (if it was given).

        It is only used in `NestedExecutor`
        - The outmost strategy will not follow any range limit (but it may give range_limit)
        - The inner most strategy's range_limit will be useless due to atomic executors don't have such
          features.

        **NOTE**:
        1) This function must be called after `self.update` in following cases(ensured by NestedExecutor):
        - user relies on the auto-clip feature of `self.update`

        2) This function will be called after _init_sub_trading in NestedExecutor.

        Parameters
        ----------
        **kwargs:
            {
                "default_value": <default_value>, # a dict is used to tell "not provided" apart from "None provided"
                "inner_calendar": <trade calendar of inner strategy>
                # the range limit controls the step range of the inner strategy, so the inner calendar is an
                # important parameter when trade_range is callable
            }

        Returns
        -------
        Tuple[int, int]:
            the (start, end) step indices; clipped into [0, total_step - 1] when `self.total_step` is set.

        Raises
        ------
        NotImplementedError:
            If the following criteria meet
            1) the decision can't provide a unified start and end
            2) default_value is not provided
        """
        try:
            lower, upper = self._get_range_limit(**kwargs)
        except NotImplementedError as e:
            if "default_value" not in kwargs:
                # Nothing to fall back to.
                raise NotImplementedError("The decision didn't provide an index range") from e
            return kwargs["default_value"]

        if getattr(self, "total_step", None) is None:
            # `self.update` has not provided a total step count, so there is nothing to clip against.
            return lower, upper

        assert self.total_step is not None
        exceeds = lower < 0 or upper >= self.total_step
        if exceeds:
            get_module_logger("decision").warning(
                f"[{lower},{upper}] go beyond the total_step({self.total_step}), it will be clipped.",
            )
            lower = max(0, lower)
            upper = min(self.total_step - 1, upper)
        return lower, upper
Ejemplo n.º 9
0
    def __init__(self,
                 net_config=None,
                 opt_config=None,
                 metric="",
                 GPU=0,
                 seed=None,
                 **kwargs):
        """
        Build the transformer model and its optimizer.

        Parameters
        ----------
        net_config : dict, optional
            network configuration passed to `get_transformer`; `DEFAULT_NET_CONFIG` when falsy.
        opt_config : dict, optional
            optimizer configuration; `DEFAULT_OPT_CONFIG` when falsy. The keys read here are
            "optimizer" ("adam" or "sgd") and "lr".
        metric : str
            stored as `self.metric`.
        GPU : int
            CUDA device index; the CPU is used when CUDA is unavailable or `GPU` is negative.
        seed : int, optional
            when not None, seeds `random`, `numpy` and `torch`.
        **kwargs :
            accepted but not used here.

        Raises
        ------
        NotImplementedError
            If `opt_config["optimizer"]` is neither "adam" nor "sgd".
        """
        # Set logger.
        self.logger = get_module_logger("QuantTransformer")
        self.logger.info("QuantTransformer PyTorch version...")

        # set hyper-parameters.
        self.net_config = net_config or DEFAULT_NET_CONFIG
        self.opt_config = opt_config or DEFAULT_OPT_CONFIG
        self.metric = metric
        self.device = torch.device("cuda:{:}".format(
            GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu")
        self.seed = seed

        self.logger.info("Transformer parameters setting:"
                         "\nnet_config : {:}"
                         "\nopt_config : {:}"
                         "\nmetric     : {:}"
                         "\ndevice     : {:}"
                         "\nseed       : {:}".format(
                             self.net_config,
                             self.opt_config,
                             self.metric,
                             self.device,
                             self.seed,
                         ))

        if self.seed is not None:
            random.seed(self.seed)
            np.random.seed(self.seed)
            torch.manual_seed(self.seed)
            # NOTE(review): `use_gpu` is not set in this method; it is assumed to be defined
            # elsewhere on the class - confirm.
            if self.use_gpu:
                torch.cuda.manual_seed(self.seed)
                torch.cuda.manual_seed_all(self.seed)

        self.model = get_transformer(self.net_config)
        self.model.set_super_run_type(super_core.SuperRunMode.FullModel)
        self.logger.info("model: {:}".format(self.model))
        self.logger.info("model size: {:.3f} MB".format(
            count_parameters(self.model)))

        # The second branch used to test "adam" again, which made SGD unreachable, and the error
        # message formatted an undefined name (NameError instead of NotImplementedError).
        optimizer_name = self.opt_config["optimizer"]
        if optimizer_name == "adam":
            self.train_optimizer = optim.Adam(self.model.parameters(),
                                              lr=self.opt_config["lr"])
        elif optimizer_name == "sgd":
            self.train_optimizer = optim.SGD(self.model.parameters(),
                                             lr=self.opt_config["lr"])
        else:
            raise NotImplementedError(
                "optimizer {:} is not supported!".format(optimizer_name))

        self.fitted = False
        self.model.to(self.device)
Ejemplo n.º 10
0
    def collect(self,
                artifacts_key=None,
                rec_filter_func=None,
                only_exist=True) -> dict:
        """
        Gather artifacts from the experiment's recorders that pass the filter.

        Args:
            artifacts_key (str or List, optional): the artifacts key(s) to collect. If None, `self.artifacts_key` is used.
            rec_filter_func (Callable, optional): keeps a recorder when it returns True. If None, `self.rec_filter_func`
                is used; if that is None as well, every recorder is kept.
            only_exist (bool, optional): whether to collect an artifact only when the recorder really has it.
                If True, an artifact whose loading raises is skipped. If False, the exception is raised.

        Returns:
            dict: the collected artifacts, shaped like {artifact: {rec_key: object}}
        """
        artifacts_key = self.artifacts_key if artifacts_key is None else artifacts_key
        rec_filter_func = self.rec_filter_func if rec_filter_func is None else rec_filter_func
        if isinstance(artifacts_key, str):
            artifacts_key = [artifacts_key]

        collected = {}
        # keep only the recorders accepted by the filter
        all_recs = self.experiment.list_recorders(**self.list_kwargs)
        kept_recs = [rec for rec in all_recs.values() if rec_filter_func is None or rec_filter_func(rec)]

        logger = get_module_logger("RecorderCollector")
        for rec in kept_recs:
            rec_key = self.rec_key_func(rec)
            for key in artifacts_key:
                if self.ART_KEY_RAW == key:
                    # the raw key stands for the recorder itself
                    artifact = rec
                else:
                    try:
                        artifact = rec.load_object(self.artifacts_path[key])
                    except Exception as err:
                        if not only_exist:
                            raise err
                        # only existing artifacts are collected
                        continue
                bucket = collected.setdefault(key, {})
                # warn the user when an earlier value is about to be replaced
                if rec_key in bucket:
                    logger.warning(
                        f"key '{rec_key}' is duplicated. Previous value will be overrides. Please check you `rec_key_func`"
                    )
                bucket[rec_key] = artifact

        return collected
Ejemplo n.º 11
0
def run_exp(task_config, dataset, experiment_name, recorder_name, uri):
    """
    Train the model described by `task_config` and generate its records inside one recorder.

    Parameters
    ----------
    task_config : dict
        must provide "model" (an instance config) and "record" (a list of record configs, each with
        a "class" entry and optional "kwargs"). It is not modified by this function.
    dataset :
        passed to `model.fit` and to records whose class is "SignalRecord".
    experiment_name, recorder_name, uri :
        forwarded to `R.start`; `experiment_name` also names the log file in the recorder directory.
    """
    model = init_instance_by_config(task_config["model"])
    model_fit_kwargs = dict(dataset=dataset)

    # Let's start the experiment.
    with R.start(
            experiment_name=experiment_name,
            recorder_name=recorder_name,
            uri=uri,
            resume=True,
    ):
        # Setup log
        recorder_root_dir = R.get_recorder().get_local_dir()
        log_file = os.path.join(recorder_root_dir,
                                "{:}.log".format(experiment_name))
        set_log_basic_config(log_file)
        logger = get_module_logger("q.run_exp")
        logger.info("task_config::\n{:}".format(
            pprint.pformat(task_config, indent=2)))
        logger.info("[{:}] - [{:}]: {:}".format(experiment_name, recorder_name,
                                                uri))
        logger.info("dataset={:}".format(dataset))

        # Train model
        R.log_params(**flatten_dict(task_config))
        # Inspect the signature once; "save_path" takes precedence over "save_dir".
        fit_arg_names = inspect.getfullargspec(model.fit).args
        if "save_path" in fit_arg_names:
            model_fit_kwargs["save_path"] = os.path.join(
                recorder_root_dir, "model.ckp")
        elif "save_dir" in fit_arg_names:
            model_fit_kwargs["save_dir"] = os.path.join(
                recorder_root_dir, "model-ckps")
        model.fit(**model_fit_kwargs)
        # Get the recorder
        recorder = R.get_recorder()
        R.save_objects(**{"model.pkl": model})

        # Generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            if record["class"] == "SignalRecord":
                injected = {
                    "model": model,
                    "dataset": dataset,
                    "recorder": recorder
                }
            else:
                injected = {"recorder": recorder}
            # `dict.copy` is shallow: updating `record["kwargs"]` in place would write the model,
            # dataset and recorder back into the caller's `task_config`. Build a new kwargs dict
            # instead; a record without "kwargs" is treated as having none.
            record = record.copy()
            record["kwargs"] = {**(record.get("kwargs") or {}), **injected}
            init_instance_by_config(record).generate()
Ejemplo n.º 12
0
    def __init__(self, name_id: str):
        """
        Create an OnlineStrategy.
        This module **MUST** use `Trainer <../reference/api.html#Trainer>`_ to finishing model training.

        Besides storing `name_id`, this sets up a logger named after the concrete class and an
        `OnlineTool` instance (`self.tool`).

        Args:
            name_id (str): a unique name or id.
        """
        self.name_id = name_id
        logger_name = self.__class__.__name__
        self.logger = get_module_logger(logger_name)
        self.tool = OnlineTool()
Ejemplo n.º 13
0
    def __init__(self, socketio, app):
        """
        Keep the given `socketio` / `app` objects, open the RabbitMQ channel with the task and
        message queues declared as durable, and set up the logger and the redis connection.
        """
        super(RequestListener, self).__init__()

        # objects handed in by the caller
        self.socketio = socketio
        self.app = app

        # server-side connections
        channel = init_rabbitmq_channel(C.queue_host, C.queue_user, C.queue_pwd)
        self.channel = channel
        channel.queue_declare(queue=C.task_queue, durable=True)
        channel.queue_declare(queue=C.message_queue, durable=True)

        logger_name = self.__class__.__name__
        self.logger = get_module_logger(logger_name)
        self.redis_t = get_redis_connection()
    def __init__(self, net_config=None, opt_config=None, metric="", GPU=0, seed=None, **kwargs):
        """
        Build the transformer model and its optimizer.

        Parameters
        ----------
        net_config : dict, optional
            network configuration; `default_net_config` when falsy. The keys read here are
            "d_feat", "hidden_size", "depth" and "pos_drop".
        opt_config : dict, optional
            optimizer configuration; `default_opt_config` when falsy. The keys read here are
            "optimizer" ("adam" or "sgd") and "lr".
        metric : str
            stored as `self.metric`.
        GPU : int
            CUDA device index; the CPU is used when CUDA is unavailable or `GPU` is negative.
        seed : int, optional
            when not None, seeds `numpy` and `torch`.
        **kwargs :
            accepted but not used here.

        Raises
        ------
        NotImplementedError
            If `opt_config["optimizer"]` is neither "adam" nor "sgd".
        """
        # Set logger.
        self.logger = get_module_logger("QuantTransformer")
        self.logger.info("QuantTransformer pytorch version...")

        # set hyper-parameters.
        self.net_config = net_config or default_net_config
        self.opt_config = opt_config or default_opt_config
        self.metric = metric
        self.device = torch.device("cuda:{:}".format(GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu")
        self.seed = seed

        self.logger.info(
            "Transformer parameters setting:"
            "\nnet_config : {:}"
            "\nopt_config : {:}"
            "\nmetric     : {:}"
            "\ndevice     : {:}"
            "\nseed       : {:}".format(
                self.net_config,
                self.opt_config,
                self.metric,
                self.device,
                self.seed,
            )
        )

        if self.seed is not None:
            np.random.seed(self.seed)
            torch.manual_seed(self.seed)

        self.model = TransformerModel(
            d_feat=self.net_config["d_feat"],
            embed_dim=self.net_config["hidden_size"],
            depth=self.net_config["depth"],
            pos_drop=self.net_config["pos_drop"],
        )
        self.logger.info("model: {:}".format(self.model))
        self.logger.info("model size: {:.3f} MB".format(count_parameters(self.model)))

        # The second branch used to test "adam" again, which made SGD unreachable, and the error
        # message formatted an undefined name (NameError instead of NotImplementedError).
        optimizer_name = self.opt_config["optimizer"]
        if optimizer_name == "adam":
            self.train_optimizer = optim.Adam(self.model.parameters(), lr=self.opt_config["lr"])
        elif optimizer_name == "sgd":
            self.train_optimizer = optim.SGD(self.model.parameters(), lr=self.opt_config["lr"])
        else:
            raise NotImplementedError("optimizer {:} is not supported!".format(optimizer_name))

        self.fitted = False
        self.model.to(self.device)
Ejemplo n.º 15
0
    def __init__(self, is_interface=False, update_interval=24, max_workers=20):
        """
        Store the updater settings and set up a logger named after the concrete class.

        Parameters
        ----------
        is_interface : bool
            whether this class needs to run or simply provides interface for a queue to call
        update_interval : int
            the hourly interval to update the cache
        max_workers: int
            multi-process count
        """
        super(DataUpdater, self).__init__()
        logger_name = self.__class__.__name__
        self.logger = get_module_logger(logger_name)

        # plain settings, kept as given
        self.is_interface = is_interface
        self.update_interval = update_interval
        self.max_workers = max_workers
Ejemplo n.º 16
0
def main():
    """
    Start the server modules named in `ARGS.module` and wait for all of them to finish.

    A `RequestHandler` is created for "request_handler" and a `DataProcessor` for
    "data_processor"; every created object is started first and joined afterwards.
    """
    server_logger = get_module_logger(__file__)

    from qlib_server.request_handler import RequestHandler
    from qlib_server.data_processor import DataProcessor

    server_logger.info("QLibServer starting...")

    available = (
        ("request_handler", RequestHandler),
        ("data_processor", DataProcessor),
    )
    workers = []
    for module_name, worker_cls in available:
        if module_name in ARGS.module:
            workers.append(worker_cls())

    # start everything before blocking on any join
    for worker in workers:
        worker.start()

    for worker in workers:
        worker.join()
Ejemplo n.º 17
0
    def __init__(self, d_feat=6, seed=None, **kwargs):
        """
        Store the hyper-parameters, log them and seed `random` / `numpy` when a seed is given.

        Parameters
        ----------
        d_feat : int
            stored as `self.d_feat`
        seed : int, optional
            when not None, used to seed `random` and `np.random`
        **kwargs :
            accepted but not used here
        """
        # logger first, so that the setup below can be reported
        log = get_module_logger("NAIVE")
        self.logger = log
        log.info("NAIVE version...")

        # hyper-parameters
        self.d_feat = d_feat
        self.seed = seed

        settings_msg = "NAIVE parameters setting: d_feat={:}, seed={:}".format(self.d_feat, self.seed)
        log.info(settings_msg)

        if self.seed is not None:
            for seed_fn in (random.seed, np.random.seed):
                seed_fn(self.seed)

        self.fitted = False
Ejemplo n.º 18
0
    def __init__(
        self,
        log_every_n_episode: int = 20,
        total_episodes: int | None = None,
        float_format: str = ":.4f",
        counter_format: str = ":4d",
        loglevel: int | LogLevel = LogLevel.PERIODIC,
    ):
        """
        Store the console logging settings.

        `loglevel` is handed to the parent initializer; the remaining arguments are kept on the
        instance under the same names. `self.prefix` starts out empty and the console logger is
        created at INFO level.
        """
        super().__init__(loglevel)

        # formatting of the logged numbers
        self.float_format = float_format
        self.counter_format = counter_format
        self.prefix = ""

        # TODO: support log_every_n_step
        self.total_episodes = total_episodes
        self.log_every_n_episode = log_every_n_episode

        self.console_logger = get_module_logger(__name__, level=logging.INFO)
Ejemplo n.º 19
0
    def __init__(
        self,
        time_per_step: str,
        start_time: Union[str, pd.Timestamp] = None,
        end_time: Union[str, pd.Timestamp] = None,
        indicator_config: dict = None,
        generate_portfolio_metrics: bool = False,
        verbose: bool = False,
        track_data: bool = False,
        trade_exchange: Exchange = None,
        common_infra: CommonInfrastructure = None,
        settle_type=BasePosition.ST_NO,
        **kwargs,
    ):
        """
        Parameters
        ----------
        time_per_step : str
            trade time per trading step, used for generate the trade calendar
        start_time : Union[str, pd.Timestamp], optional
            forwarded to `self.reset`
        end_time : Union[str, pd.Timestamp], optional
            forwarded to `self.reset`
        indicator_config: dict, optional
            config for calculating trade indicator, None (the default) means an empty config.
            It includes the following fields:
            - 'show_indicator': whether to show indicators, optional, default by False. The indicators includes
                - 'pa', the price advantage
                - 'pos', the positive rate
                - 'ffr', the fulfill rate
            - 'pa_config': config for calculating price advantage(pa), optional
                - 'base_price': the based price than which the trading price is advanced, Optional, default by 'twap'
                    - If 'base_price' is 'twap', the based price is the time weighted average price
                    - If 'base_price' is 'vwap', the based price is the volume weighted average price
                - 'weight_method': weighted method when calculating total trading pa by different orders' pa in each step, optional, default by 'mean'
                    - If 'weight_method' is 'mean', calculating mean value of different orders' pa
                    - If 'weight_method' is 'amount_weighted', calculating amount weighted average value of different orders' pa
                    - If 'weight_method' is 'value_weighted', calculating value weighted average value of different orders' pa
            - 'ffr_config': config for calculating fulfill rate(ffr), optional
                - 'weight_method': weighted method when calculating total trading ffr by different orders' ffr in each step, optional, default by 'mean'
                    - If 'weight_method' is 'mean', calculating mean value of different orders' ffr
                    - If 'weight_method' is 'amount_weighted', calculating amount weighted average value of different orders' ffr
                    - If 'weight_method' is 'value_weighted', calculating value weighted average value of different orders' ffr
            Example:
                {
                    'show_indicator': True,
                    'pa_config': {
                        "agg": "twap",  # "vwap"
                        "price": "$close", # default to use deal price of the exchange
                    },
                    'ffr_config':{
                        'weight_method': 'value_weighted',
                    }
                }
            NOTE(review): the example uses 'agg' / 'price' under 'pa_config' while the field list above
            names 'base_price' / 'weight_method' - confirm which keys are actually read.
        generate_portfolio_metrics : bool, optional
            whether to generate portfolio_metrics, by default False
        verbose : bool, optional
            whether to print trading info, by default False
        track_data : bool, optional
            whether to generate trade_decision, will be used when training rl agent
            - If `self.track_data` is true, when making data for training, the input `trade_decision` of `execute` will be generated by `collect_data`
            - Else,  `trade_decision` will not be generated

        trade_exchange : Exchange
            exchange that provides market info, used to generate portfolio_metrics
            - If generate_portfolio_metrics is None, trade_exchange will be ignored
            - Else If `trade_exchange` is None, self.trade_exchange will be set with common_infra

        common_infra : CommonInfrastructure, optional:
            common infrastructure for backtesting, may including:
            - trade_account : Account, optional
                trade account for trading
            - trade_exchange : Exchange, optional
                exchange that provides market info
            A warning is logged when it is None.

        settle_type : str
            Please refer to the docs of BasePosition.settle_start

        **kwargs :
            accepted but not used here
        """
        self.time_per_step = time_per_step
        # The former `{}` default was a single dict shared by every instance created without an
        # explicit config; each such instance now gets its own dict.
        self.indicator_config = {} if indicator_config is None else indicator_config
        self.generate_portfolio_metrics = generate_portfolio_metrics
        self.verbose = verbose
        self.track_data = track_data
        self._trade_exchange = trade_exchange
        self.level_infra = LevelInfrastructure()
        self.level_infra.reset_infra(common_infra=common_infra)
        self._settle_type = settle_type
        self.reset(start_time=start_time,
                   end_time=end_time,
                   common_infra=common_infra)
        if common_infra is None:
            get_module_logger("BaseExecutor").warning(
                f"`common_infra` is not set for {self}")

        # record deal order amount in one day
        self.dealt_order_amount = defaultdict(float)
        self.deal_day = None
Ejemplo n.º 20
0
 def __init__(self):
     """
     Set up the OnlineTool with a logger named after the concrete class.
     """
     logger_name = self.__class__.__name__
     self.logger = get_module_logger(logger_name)
Ejemplo n.º 21
0
    def run_epoch(self,
                  phase,
                  task_list,
                  epoch,
                  opt,
                  loss_l,
                  ignore_weight=False):
        """
        Run `self.tn` over every task once and log the averaged loss and the IC for `phase`.

        Parameters
        ----------
        phase : str
            "train" puts the network in training mode, enables gradients and runs an optimizer
            step per task; any other value puts it in eval mode and disables gradients.
        task_list :
            iterable of tasks; each must provide `get_meta_input()`.
        epoch :
            logged as "step" next to the metrics.
        opt :
            optimizer, only used when `phase` is "train".
        loss_l : dict
            the averaged loss is appended to `loss_l[phase]`.
        ignore_weight : bool
            forwarded to `self.tn`.

        Raises
        ------
        ValueError
            If `self.criterion` is neither "mse" nor "ic_loss".
        """
        # Fail early and explicitly; an unknown criterion used to surface as an unbound `loss`.
        if self.criterion not in ("mse", "ic_loss"):
            raise ValueError(f"Unsupported criterion: {self.criterion!r}")

        if phase == "train":
            self.tn.train()
            torch.set_grad_enabled(True)
        else:
            self.tn.eval()
            torch.set_grad_enabled(False)
        running_loss = 0.0
        pred_y_all = []
        for task in tqdm(task_list, desc=f"{phase} Task", leave=False):
            meta_input = task.get_meta_input()
            pred, weights = self.tn(
                meta_input["X"],
                meta_input["y"],
                meta_input["time_perf"],
                meta_input["time_belong"],
                meta_input["X_test"],
                ignore_weight=ignore_weight,
            )
            if self.criterion == "mse":
                criterion = nn.MSELoss()
                loss = criterion(pred, meta_input["y_test"])
            else:  # "ic_loss", guaranteed by the check at the top
                criterion = ICLoss()
                try:
                    loss = criterion(pred,
                                     meta_input["y_test"],
                                     meta_input["test_idx"],
                                     skip_size=50)
                except ValueError as e:
                    # The task is skipped: no optimizer step and no prediction is recorded.
                    get_module_logger("MetaModelDS").warning(
                        f"Exception `{e}` when calculating IC loss")
                    continue

            assert not np.isnan(loss.detach().item()), "NaN loss!"

            if phase == "train":
                opt.zero_grad()
                loss.backward()
                opt.step()

            pred_y_all.append(
                pd.DataFrame({
                    "pred":
                    pd.Series(pred.detach().cpu().numpy(),
                              index=meta_input["test_idx"]),
                    "label":
                    pd.Series(meta_input["y_test"].detach().cpu().numpy(),
                              index=meta_input["test_idx"]),
                }))
            running_loss += loss.detach().item()
        # NOTE: skipped tasks still count in the denominator.
        running_loss = running_loss / len(task_list)
        loss_l.setdefault(phase, []).append(running_loss)

        pred_y_all = pd.concat(pred_y_all)
        # mean of the per-datetime Spearman correlation between prediction and label
        ic = pred_y_all.groupby("datetime").apply(
            lambda df: df["pred"].corr(df["label"], method="spearman")).mean()

        R.log_metrics(**{f"loss/{phase}": running_loss, "step": epoch})
        R.log_metrics(**{f"ic/{phase}": ic, "step": epoch})
Ejemplo n.º 22
0
import copy
from typing import Union, List

from ....data.dataset.weight import Reweighter
from ....model.meta.dataset import MetaTaskDataset
from ....model.meta.model import MetaTaskModel
from ....workflow import R
from .utils import ICLoss
from .dataset import MetaDatasetDS

from qlib.log import get_module_logger
from qlib.data.dataset.weight import Reweighter
from qlib.model.meta.task import MetaTask
from qlib.contrib.meta.data_selection.net import PredNet

logger = get_module_logger("data selection")


class TimeReweighter(Reweighter):
    """Reweighter that assigns one weight per index range taken from `time_weight`."""

    def __init__(self, time_weight: pd.Series):
        # Each index entry of `time_weight` is later unpacked into `slice(*key)`
        # (presumably a (start, end) pair - verify against the caller).
        self.time_weight = time_weight

    def reweight(self, data: Union[pd.DataFrame, pd.Series]):
        """
        Return a weight series aligned with `data.index`.

        Every entry starts at 1.0; the entries selected by each key's slice are then set to the
        matching weight, in the iteration order of `self.time_weight`.
        """
        # TODO: handling TSDataSampler
        weights = pd.Series(1.0, index=data.index)
        for range_key, weight in self.time_weight.items():
            selected = slice(*range_key)
            weights.loc[selected] = weight
        logger.info(f"Reweighting result: {weights}")
        return weights

Ejemplo n.º 23
0
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import struct
from pathlib import Path
from typing import Iterable, Union, Dict, Mapping, Tuple, List

import numpy as np
import pandas as pd

from qlib.log import get_module_logger
from qlib.data.storage import CalendarStorage, InstrumentStorage, FeatureStorage, CalVT, InstKT, InstVT

logger = get_module_logger("file_storage")


class FileStorageMixin:
    @property
    def uri(self) -> Path:
        _provider_uri = self.kwargs.get("provider_uri", None)
        if _provider_uri is None:
            raise ValueError(
                f"The `provider_uri` parameter is not found in {self.__class__.__name__}, "
                f'please specify `provider_uri` in the "provider\'s backend"')
        return Path(_provider_uri).expanduser().joinpath(
            f"{self.storage_name}s", self.file_name)

    def check(self):
        """check self.uri

        Raises
Ejemplo n.º 24
0
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import numpy as np
import cvxpy as cp

from typing import Union, Optional, Dict, Any, List

from qlib.log import get_module_logger
from .base import BaseOptimizer

logger = get_module_logger("EnhancedIndexingOptimizer")


class EnhancedIndexingOptimizer(BaseOptimizer):
    """
    Portfolio Optimizer for Enhanced Indexing

    Notations:
        w0: current holding weights
        wb: benchmark weight
        r: expected return
        F: factor exposure
        cov_b: factor covariance
        var_u: residual variance (diagonal)
        lamb: risk aversion parameter
        delta: total turnover limit
        b_dev: benchmark deviation limit
        f_dev: factor deviation limit

    Also denote:
Ejemplo n.º 25
0
    def __init__(self,
                 d_feat=6,
                 hidden_size=64,
                 num_layers=2,
                 dropout=0.0,
                 n_epochs=200,
                 pre_epoch=40,
                 dw=0.5,
                 loss_type="cosine",
                 len_seq=60,
                 len_win=0,
                 lr=0.001,
                 metric="mse",
                 batch_size=2000,
                 early_stop=20,
                 loss="mse",
                 optimizer="adam",
                 n_splits=2,
                 GPU=0,
                 seed=None,
                 **kwargs):
        """Configure the ADARNN wrapper, build the network and its optimizer.

        Parameters
        ----------
        d_feat : int
            passed to ``AdaRNN`` as ``n_input``.
        hidden_size : int
            width of every hidden layer (``n_hiddens`` is this value repeated
            ``num_layers`` times).
        num_layers : int
            number of entries in ``n_hiddens``.
        dropout : float
            passed to ``AdaRNN`` as ``dropout``.
        n_epochs, pre_epoch, dw, len_win, metric, batch_size, early_stop, loss, n_splits
            stored on the instance unchanged; they are not used further in
            this constructor.
        loss_type : str
            passed to ``AdaRNN`` as ``trans_loss``.
        len_seq : int
            passed to ``AdaRNN`` as ``len_seq``.
        lr : float
            learning rate of the optimizer.
        optimizer : str
            ``"adam"`` or ``"gd"`` (case-insensitive).
        GPU : int
            CUDA device index; a negative value or an unavailable CUDA
            runtime selects the CPU.
        seed : int, optional
            when not None, seeds both numpy and torch.
        kwargs
            accepted but not read by this constructor.

        Raises
        ------
        NotImplementedError
            if ``optimizer`` is neither ``"adam"`` nor ``"gd"``.
        """
        # Set logger.
        self.logger = get_module_logger("ADARNN")
        self.logger.info("ADARNN pytorch version...")
        # NOTE(review): this restricts the visible CUDA devices to `GPU` while
        # the device string below still uses the same index - confirm that
        # this is intended for GPU > 0.
        os.environ["CUDA_VISIBLE_DEVICES"] = str(GPU)

        # set hyper-parameters.
        self.d_feat = d_feat
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.n_epochs = n_epochs
        self.pre_epoch = pre_epoch
        self.dw = dw
        self.loss_type = loss_type
        self.len_seq = len_seq
        self.len_win = len_win
        self.lr = lr
        self.metric = metric
        self.batch_size = batch_size
        self.early_stop = early_stop
        self.optimizer = optimizer.lower()
        self.loss = loss
        self.n_splits = n_splits
        self.device = torch.device(
            "cuda:%d" %
            (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu")
        self.seed = seed

        # `self.use_gpu` is not defined in this method; it is expected to be
        # provided by the class (presumably derived from `self.device`).
        self.logger.info("ADARNN parameters setting:"
                         "\nd_feat : {}"
                         "\nhidden_size : {}"
                         "\nnum_layers : {}"
                         "\ndropout : {}"
                         "\nn_epochs : {}"
                         "\nlr : {}"
                         "\nmetric : {}"
                         "\nbatch_size : {}"
                         "\nearly_stop : {}"
                         "\noptimizer : {}"
                         "\nloss : {}"
                         "\nloss_type : {}"
                         "\nvisible_GPU : {}"
                         "\nuse_GPU : {}"
                         "\nseed : {}".format(
                             d_feat,
                             hidden_size,
                             num_layers,
                             dropout,
                             n_epochs,
                             lr,
                             metric,
                             batch_size,
                             early_stop,
                             optimizer.lower(),
                             loss,
                             loss_type,
                             GPU,
                             self.use_gpu,
                             seed,
                         ))

        if self.seed is not None:
            np.random.seed(self.seed)
            torch.manual_seed(self.seed)

        n_hiddens = [hidden_size for _ in range(num_layers)]
        self.model = AdaRNN(
            use_bottleneck=False,
            bottleneck_width=64,
            n_input=d_feat,
            n_hiddens=n_hiddens,
            n_output=1,
            dropout=dropout,
            model_type="AdaRNN",
            len_seq=len_seq,
            trans_loss=loss_type,
        )
        self.logger.info("model:\n{:}".format(self.model))
        self.logger.info("model size: {:.4f} MB".format(
            count_parameters(self.model)))

        if optimizer.lower() == "adam":
            self.train_optimizer = optim.Adam(self.model.parameters(),
                                              lr=self.lr)
        elif optimizer.lower() == "gd":
            self.train_optimizer = optim.SGD(self.model.parameters(),
                                             lr=self.lr)
        else:
            raise NotImplementedError(
                "optimizer {} is not supported!".format(optimizer))

        self.fitted = False
        # Move the model to the device selected above instead of calling
        # `.cuda()` unconditionally, which fails on CPU-only hosts and
        # ignores a negative `GPU`.
        self.model.to(self.device)
Ejemplo n.º 26
0
    def __init__(
        self,
        model_config,
        tra_config,
        model_type="LSTM",
        lr=1e-3,
        n_epochs=500,
        early_stop=50,
        smooth_steps=5,
        max_steps_per_epoch=None,
        freeze_model=False,
        model_init_state=None,
        lamb=0.0,
        rho=0.99,
        seed=None,
        logdir=None,
        eval_train=True,
        eval_test=False,
        avg_params=True,
        **kwargs,
    ):
        """Build the backbone model, the TRA module and a joint Adam optimizer.

        Parameters
        ----------
        model_config : dict
            keyword arguments used to instantiate the backbone model.
        tra_config : dict
            keyword arguments forwarded to ``TRA`` (after the backbone's
            ``output_size``).
        model_type : str
            name of the backbone class; resolved with ``eval`` in this
            module's namespace.
        lr : float
            learning rate of the Adam optimizer.
        model_init_state : str, optional
            path of a checkpoint; its ``"model"`` entry is loaded into the
            backbone when given.
        freeze_model : bool
            when True, gradients are disabled for all backbone parameters.
        seed : int, optional
            when not None, seeds both numpy and torch.
        logdir : str, optional
            directory created (if missing) for outputs; a warning is logged
            when it already exists.
        n_epochs, early_stop, smooth_steps, max_steps_per_epoch, lamb, rho,
        eval_train, eval_test, avg_params
            stored on the instance unchanged; they are not used further in
            this constructor, except that ``eval_train`` is checked for the
            warning below.
        kwargs
            accepted but not read by this constructor.
        """

        # `torch.manual_seed` requires an integer, so only seed when a seed
        # was actually supplied (the default is None).
        if seed is not None:
            np.random.seed(seed)
            torch.manual_seed(seed)

        self.logger = get_module_logger("TRA")
        self.logger.info("TRA Model...")

        # SECURITY NOTE: `eval` executes `model_type` as Python code; it must
        # only ever come from trusted configuration.
        self.model = eval(model_type)(**model_config).to(device)
        if model_init_state:
            self.model.load_state_dict(torch.load(model_init_state, map_location="cpu")["model"])
        if freeze_model:
            for param in self.model.parameters():
                param.requires_grad_(False)
        else:
            self.logger.info("# model params: %d" % sum([p.numel() for p in self.model.parameters()]))

        self.tra = TRA(self.model.output_size, **tra_config).to(device)
        self.logger.info("# tra params: %d" % sum([p.numel() for p in self.tra.parameters()]))

        # A single optimizer updates the backbone and the TRA module together.
        self.optimizer = optim.Adam(list(self.model.parameters()) + list(self.tra.parameters()), lr=lr)

        self.model_config = model_config
        self.tra_config = tra_config
        self.lr = lr
        self.n_epochs = n_epochs
        self.early_stop = early_stop
        self.smooth_steps = smooth_steps
        self.max_steps_per_epoch = max_steps_per_epoch
        self.lamb = lamb
        self.rho = rho
        self.seed = seed
        self.logdir = logdir
        self.eval_train = eval_train
        self.eval_test = eval_test
        self.avg_params = avg_params

        if self.tra.num_states > 1 and not self.eval_train:
            self.logger.warning("`eval_train` will be ignored when using TRA")

        if self.logdir is not None:
            if os.path.exists(self.logdir):
                self.logger.warning(f"logdir {self.logdir} is not empty")
            os.makedirs(self.logdir, exist_ok=True)

        self.fitted = False
        self.global_step = -1
Ejemplo n.º 27
0
    def __init__(
        self,
        *,
        task_tpl: Union[dict, list],
        step: int,
        trunc_days: int = None,
        rolling_ext_days: int = 0,
        exp_name: Union[str, InternalData],
        segments: Union[Dict[Text, Tuple], float],
        hist_step_n: int = 10,
        task_mode: str = MetaTask.PROC_MODE_FULL,
        fill_method: str = "max",
    ):
        """
        A dataset for meta model.

        Parameters
        ----------
        task_tpl : Union[dict, list]
            Decide what tasks are used.
            - dict : the task template, the prepared task is generated with `step`, `trunc_days` and `RollingGen`
            - list : when list, use the list of tasks directly
                     the list is supposed to be sorted according to the timeline
        step : int
            the rolling step
        trunc_days: int
            days to be truncated based on the test start
        rolling_ext_days: int
            sometimes users want to train meta models for a longer test period but with smaller rolling steps for more task samples.
            the total length of test periods will be `step + rolling_ext_days`

        exp_name : Union[str, InternalData]
            Decide what meta_info are used for prediction.
            - str: the name of the experiment to store the performance of data
            - InternalData: a prepared internal data
        segments: Union[Dict[Text, Tuple], float]
            the segments to divide data
            both left and right
            if segments is a float:
                the float represents the percentage of data for training
        hist_step_n: int
            length of historical steps for the meta information
        task_mode : str
            Please refer to the docs of MetaTask
        fill_method : str
            forwarded unchanged to every `MetaTaskDS` as `fill_method`;
            please refer to the docs of MetaTaskDS for the accepted values

        Raises
        ------
        AssertionError
            if `task_tpl` is neither a dict nor a list, or if no meta task
            could be created.
        """
        super().__init__(segments=segments)
        if isinstance(exp_name, InternalData):
            self.internal_data = exp_name
        else:
            self.internal_data = InternalData(task_tpl,
                                              step=step,
                                              exp_name=exp_name)
            self.internal_data.setup()
        self.task_tpl = deepcopy(
            task_tpl
        )  # FIXME: if the handler is shared, how to avoid the explosion of the memory.
        self.trunc_days = trunc_days
        self.hist_step_n = hist_step_n
        self.step = step

        if isinstance(task_tpl, dict):
            rg = RollingGen(step=step,
                            trunc_days=trunc_days,
                            task_copy_func=deepcopy_basic_type
                            )  # NOTE: trunc_days is very important !!!!
            task_iter = rg(task_tpl)
            if rolling_ext_days > 0:
                # Extend the test segment of every generated task.
                self.ta = TimeAdjuster(future=True)
                for t in task_iter:
                    t["dataset"]["kwargs"]["segments"]["test"] = self.ta.shift(
                        t["dataset"]["kwargs"]["segments"]["test"],
                        step=rolling_ext_days,
                        rtype=RollingGen.ROLL_EX)
            if task_mode == MetaTask.PROC_MODE_FULL:
                # Only pre initializing the task when full task is req
                # initializing handler and share it.
                init_task_handler(task_tpl)
        else:
            assert isinstance(task_tpl, list)
            task_iter = task_tpl

        self.task_list = []
        self.meta_task_l = []
        logger = get_module_logger("MetaDatasetDS")
        # Guard the example log so an empty task list reaches the informative
        # assertion below instead of failing with a bare IndexError here.
        if len(task_iter) > 0:
            logger.info(f"Example task for training meta model: {task_iter[0]}")
        for t in tqdm(task_iter, desc="creating meta tasks"):
            try:
                self.meta_task_l.append(
                    MetaTaskDS(t,
                               meta_info=self._prepare_meta_ipt(t),
                               mode=task_mode,
                               fill_method=fill_method))
                self.task_list.append(t)
            except ValueError as e:
                # Tasks that cannot be turned into a meta task are skipped.
                logger.warning(f"ValueError: {e}")
        assert len(
            self.meta_task_l
        ) > 0, "No meta tasks found. Please check the data and setting"
Ejemplo n.º 28
0
    def setup(self, trainer=TrainerR, trainer_kwargs=None):
        """
        after running this function `self.data_ic_df` will become set.
        Each col represents a data.
        Each row represents the Timestamp of performance of that data.
        For example,

        .. code-block:: python

                       2021-06-21 2021-06-04 2021-05-21 2021-05-07 2021-04-20 2021-04-06 2021-03-22 2021-03-08  ...
                       2021-07-02 2021-06-18 2021-06-03 2021-05-20 2021-05-06 2021-04-19 2021-04-02 2021-03-19  ...
            datetime                                                                                            ...
            2018-01-02   0.079782   0.115975   0.070866   0.028849  -0.081170   0.140380   0.063864   0.110987  ...
            2018-01-03   0.123386   0.107789   0.071037   0.045278  -0.060782   0.167446   0.089779   0.124476  ...
            2018-01-04   0.140775   0.097206   0.063702   0.042415  -0.078164   0.173218   0.098914   0.114389  ...
            2018-01-05   0.030320  -0.037209  -0.044536  -0.047267  -0.081888   0.045648   0.059947   0.047652  ...
            2018-01-08   0.107201   0.009219  -0.015995  -0.036594  -0.086633   0.108965   0.122164   0.108508  ...
            ...               ...        ...        ...        ...        ...        ...        ...        ...  ...

        Parameters
        ----------
        trainer :
            the trainer class (or factory); it is instantiated with
            `experiment_name=self.exp_name` plus `trainer_kwargs`.
        trainer_kwargs : dict, optional
            extra keyword arguments for the trainer; None means no extra
            arguments.

        Raises
        ------
        AssertionError
            if the experiment already holds recorders but their number
            differs from the number of generated tasks.
        """
        if trainer_kwargs is None:
            trainer_kwargs = {}

        # 1) prepare the prediction of proxy models
        perf_task_tpl = deepcopy(
            self.task_tpl
        )  # this task is supposed to contains no complicated objects

        trainer = auto_filter_kwargs(trainer)(experiment_name=self.exp_name,
                                              **trainer_kwargs)
        # NOTE:
        # The handler is initialized for only once.
        if not trainer.has_worker():
            self.dh = init_task_handler(perf_task_tpl)
        else:
            self.dh = init_instance_by_config(
                perf_task_tpl["dataset"]["kwargs"]["handler"])

        seg = perf_task_tpl["dataset"]["kwargs"]["segments"]

        # We want to split the training time period into small segments.
        perf_task_tpl["dataset"]["kwargs"]["segments"] = {
            "train": (DatasetH.get_min_time(seg), DatasetH.get_max_time(seg)),
            "test": (None, None),
        }

        # NOTE:
        # we play a trick here
        # treat the training segments as test to create the rolling tasks
        rg = RollingGen(step=self.step,
                        test_key="train",
                        train_key=None,
                        task_copy_func=deepcopy_basic_type)
        gen_task = task_generator(perf_task_tpl, [rg])

        recorders = R.list_recorders(experiment_name=self.exp_name)
        if len(gen_task) == len(recorders):
            get_module_logger("Internal Data").info(
                "the data has been initialized")
        else:
            # train new models
            assert 0 == len(
                recorders
            ), "An empty experiment is required for setup `InternalData``"
            trainer.train(gen_task)

        # 2) extract the similarity matrix
        label_df = self.dh.fetch(col_set="label")
        # Reload the recorders: training above may have created new ones.
        recorders = R.list_recorders(experiment_name=self.exp_name)

        key_l = []
        ic_l = []
        for rec in tqdm(recorders.values(), desc="calc"):
            pred = rec.load_object("pred.pkl")
            task = rec.load_object("task")
            data_key = task["dataset"]["kwargs"]["segments"]["train"]
            key_l.append(data_key)
            # Only the first column of the prediction and of the label is used.
            ic_l.append(
                delayed(self._calc_perf)(pred.iloc[:, 0], label_df.iloc[:, 0]))

        ic_l = Parallel(n_jobs=-1)(ic_l)
        self.data_ic_df = pd.DataFrame(dict(zip(key_l, ic_l)))
        self.data_ic_df = self.data_ic_df.sort_index().sort_index(axis=1)

        del self.dh  # handler is not useful now
Ejemplo n.º 29
0
    def __init__(
        self,
        model_config,
        tra_config,
        model_type="RNN",
        lr=1e-3,
        n_epochs=500,
        early_stop=50,
        update_freq=1,
        max_steps_per_epoch=None,
        lamb=0.0,
        rho=0.99,
        alpha=1.0,
        seed=None,
        logdir=None,
        eval_train=False,
        eval_test=False,
        pretrain=False,
        init_state=None,
        reset_router=False,
        freeze_model=False,
        freeze_predictors=False,
        transport_method="none",
        memory_mode="sample",
    ):
        """Validate the configuration, store it and initialize the model.

        The constructor checks ``memory_mode`` / ``transport_method`` against
        ``tra_config``, optionally seeds numpy and torch, copies every
        argument onto the instance, prepares the optional summary writer for
        ``logdir`` and finally calls ``self._init_model()``.

        Raises
        ------
        AssertionError
            if ``memory_mode`` or ``transport_method`` is not one of the
            supported values, if a transport method is requested while
            ``tra_config["num_states"]`` is not greater than 1, or if
            ``memory_mode == "daily"`` while ``tra_config["src_info"]`` is
            not ``"TPE"``.
        """

        self.logger = get_module_logger("TRA")

        # --- argument validation -------------------------------------------
        assert memory_mode in ("sample", "daily"), "invalid memory mode"
        assert transport_method in ("none", "router", "oracle"), f"invalid transport method {transport_method}"
        uses_transport = transport_method != "none"
        assert not uses_transport or tra_config["num_states"] > 1, "optimal transport requires `num_states` > 1"
        daily_memory = memory_mode == "daily"
        assert (
            not daily_memory or tra_config["src_info"] == "TPE"
        ), "daily transport can only support TPE as `src_info`"

        if not eval_train and transport_method == "router":
            self.logger.warning("`eval_train` will be ignored when using TRA.router")

        # --- reproducibility -----------------------------------------------
        if seed is not None:
            np.random.seed(seed)
            torch.manual_seed(seed)

        # --- model definition ----------------------------------------------
        self.model_config = model_config
        self.tra_config = tra_config
        self.model_type = model_type

        # --- optimization schedule -----------------------------------------
        self.lr = lr
        self.n_epochs = n_epochs
        self.early_stop = early_stop
        self.update_freq = update_freq
        self.max_steps_per_epoch = max_steps_per_epoch

        # --- loss / transport hyper-parameters -----------------------------
        self.lamb = lamb
        self.rho = rho
        self.alpha = alpha

        # --- bookkeeping and evaluation switches ---------------------------
        self.seed = seed
        self.logdir = logdir
        self.eval_train = eval_train
        self.eval_test = eval_test

        # --- initialization / freezing options -----------------------------
        self.pretrain = pretrain
        self.init_state = init_state
        self.reset_router = reset_router
        self.freeze_model = freeze_model
        self.freeze_predictors = freeze_predictors

        # --- transport configuration ---------------------------------------
        self.transport_method = transport_method
        self.use_daily_transport = daily_memory
        if self.use_daily_transport:
            self.transport_fn = transport_daily
        else:
            self.transport_fn = transport_sample

        # --- optional summary writer ---------------------------------------
        self._writer = None
        if self.logdir is not None:
            already_there = os.path.exists(self.logdir)
            if already_there:
                self.logger.warning(f"logdir {self.logdir} is not empty")
            os.makedirs(self.logdir, exist_ok=True)
            # SummaryWriter may be None (checked here), in which case no
            # writer is created.
            if SummaryWriter is not None:
                self._writer = SummaryWriter(log_dir=self.logdir)

        self._init_model()
Ejemplo n.º 30
0
import re
from typing import Iterable, overload, Tuple, List, Text, Union, Dict

import numpy as np
import pandas as pd
from qlib.log import get_module_logger

# Calendar value type: a single calendar entry is represented as a string.
CalVT = str

# Instrument value type: a list of (CalVT, CalVT) pairs
# (presumably the (start, end) spans of an instrument - TODO confirm).
InstVT = List[Tuple[CalVT, CalVT]]
# Instrument key type: a text identifier.
InstKT = Text

# Module-level logger registered under the name "storage".
logger = get_module_logger("storage")

"""
If the user is only using it in `qlib`, you can customize Storage to implement only the following methods:

class UserCalendarStorage(CalendarStorage):

    @property
    def data(self) -> Iterable[CalVT]:
        '''get all data

        Raises
        ------
        ValueError
            If the data(storage) does not exist, raise ValueError
        '''