def train(self, tasks: list, train_func: Callable = None, experiment_name: str = None, **kwargs) -> List[Recorder]:
    """
    Given a list of `task`s, return a list of trained Recorders. The order is guaranteed.

    Args:
        tasks (list): a list of definitions based on the `task` dict
        train_func (Callable): the training method, which needs at least `tasks` and `experiment_name`.
            None for the default training method.
        experiment_name (str): the experiment name, None for the default name.
        kwargs: the params for train_func.

    Returns:
        List[Recorder]: a list of Recorders
    """
    if isinstance(tasks, dict):
        tasks = [tasks]
    if len(tasks) == 0:
        return []
    if train_func is None:
        train_func = self.train_func
    if experiment_name is None:
        experiment_name = self.experiment_name
    if self._call_in_subproc:
        # wrap once, outside the loop; wrapping inside the loop would re-wrap the
        # already-wrapped function on every iteration
        get_module_logger("TrainerR").info("running models in sub process (for forcing release memory).")
        train_func = call_in_subproc(train_func, C)
    recs = []
    for task in tqdm(tasks, desc="train tasks"):
        rec = train_func(task, experiment_name, recorder_name=self.default_rec_name, **kwargs)
        rec.set_tags(**{self.STATUS_KEY: self.STATUS_BEGIN})
        recs.append(rec)
    return recs
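# Hedged usage sketch for `train`: `my_task` stands in for a standard qlib task config
# dict (with "model"/"dataset"/"record" sections), and "demo_exp" is an illustrative
# experiment name -- both are assumptions, not taken from this file.
def _example_train_usage(my_task: dict):
    from qlib.model.trainer import TrainerR

    trainer = TrainerR(experiment_name="demo_exp")
    recorders = trainer.train([my_task])  # one Recorder per task, order preserved
    return recorders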
def _load_internal(self, instrument, start_index, end_index, freq):
    _calendar = Cal.calendar(freq=freq)
    resample_data = np.empty(end_index - start_index + 1, dtype="float32")

    for cur_index in range(start_index, end_index + 1):
        cur_time = _calendar[cur_index]
        # To load the expression accurately, more historical data is required.
        start_ws, end_ws = self.feature.get_extended_window_size()
        if end_ws > 0:
            raise ValueError(
                "PIT database does not support referring to future periods (e.g. expressions like `Ref('$$roewa_q', -1)` are not supported)"
            )

        # The calculated value will always be the last element, so the end offset is zero.
        try:
            s = self._load_feature(instrument, -start_ws, 0, cur_time)
            resample_data[cur_index - start_index] = s.iloc[-1] if len(s) > 0 else np.nan
        except FileNotFoundError:
            get_module_logger("base").warning(f"WARN: period data not found for {str(self)}")
            return pd.Series(dtype="float32", name=str(self))

    resample_series = pd.Series(
        resample_data, index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self)
    )
    return resample_series
def get_mongodb() -> Database:
    """
    Get the MongoDB database instance. The address and the name of the database
    must be declared beforehand. For example:

        Using qlib.init():

            mongo_conf = {
                "task_url": task_url,  # your MongoDB url
                "task_db_name": task_db_name,  # database name
            }
            qlib.init(..., mongo=mongo_conf)

        After qlib.init():

            C["mongo"] = {
                "task_url": "mongodb://localhost:27017/",
                "task_db_name": "rolling_db",
            }

    Returns:
        Database: the Database instance
    """
    try:
        cfg = C["mongo"]
    except KeyError:
        get_module_logger("task").error("Please configure `C['mongo']` before using TaskManager")
        raise

    client = MongoClient(cfg["task_url"])
    return client.get_database(name=cfg["task_db_name"])
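# Hedged usage sketch: fetching a task collection after qlib has been initialized with
# a mongo config; the "my_tasks" collection name is an illustrative assumption.
def _example_mongodb_usage():
    db = get_mongodb()
    task_collection = db["my_tasks"]  # collections are created lazily by MongoDB
    return task_collection.count_documents({})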
def __call__(self, ensemble_dict: dict) -> pd.DataFrame:
    """
    Usage example:

        from qlib.model.ens.ensemble import AverageEnsemble
        pred_res['new_key_name'] = AverageEnsemble()(predict_dict)

    Parameters
    ----------
    ensemble_dict : dict
        Dictionary you want to ensemble

    Returns
    -------
    pd.DataFrame
        The dictionary including the ensembling result
    """
    # need to flatten the nested dict
    ensemble_dict = flatten_dict(ensemble_dict, sep=FLATTEN_TUPLE)
    get_module_logger("AverageEnsemble").info(f"keys in group: {list(ensemble_dict.keys())}")
    values = list(ensemble_dict.values())
    # NOTE: this may change the style of the underlying data!!!!
    # from pd.DataFrame to pd.Series
    results = pd.concat(values, axis=1)
    results = results.groupby("datetime").apply(lambda df: (df - df.mean()) / df.std())
    results = results.mean(axis=1)
    results = results.sort_index()
    return results
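# Minimal sketch of what AverageEnsemble computes: per-datetime z-score of each
# prediction, then a cross-model mean. The toy index and values below are made up
# for illustration only.
def _example_average_ensemble():
    import pandas as pd

    idx = pd.MultiIndex.from_product(
        [["2020-01-01", "2020-01-02"], ["SH600000", "SH600519"]], names=["datetime", "instrument"]
    )
    preds = {
        "model_a": pd.Series([0.1, 0.3, 0.2, 0.4], index=idx),
        "model_b": pd.Series([0.2, 0.1, 0.5, 0.3], index=idx),
    }
    return AverageEnsemble()(preds)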
def __call__(self, ensemble_dict: dict) -> pd.DataFrame:
    get_module_logger("RollingEnsemble").info(f"keys in group: {list(ensemble_dict.keys())}")
    artifact_list = list(ensemble_dict.values())
    artifact_list.sort(key=lambda x: x.index.get_level_values("datetime").min())
    artifact = pd.concat(artifact_list)
    # If there are duplicated predictions, use the latest prediction
    artifact = artifact[~artifact.index.duplicated(keep="last")]
    artifact = artifact.sort_index()
    return artifact
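# Sketch of the deduplication rule above: when two rolling artifacts overlap,
# `keep="last"` retains the row from the artifact concatenated later, i.e. the one
# whose window starts later. `early` and `late` are illustrative placeholders.
def _example_rolling_dedup(early: pd.DataFrame, late: pd.DataFrame) -> pd.DataFrame:
    merged = pd.concat([early, late])
    return merged[~merged.index.duplicated(keep="last")].sort_index()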
def load(self, instruments=None, start_time=None, end_time=None) -> pd.DataFrame:
    if instruments is not None:
        get_module_logger(self.__class__.__name__).warning(f"instruments[{instruments}] is ignored")

    if self.is_group:
        df = pd.concat(
            {
                grp: dh.fetch(selector=slice(start_time, end_time), level="datetime", **self.fetch_kwargs)
                for grp, dh in self.handlers.items()
            },
            axis=1,
        )
    else:
        df = self.handlers.fetch(selector=slice(start_time, end_time), level="datetime", **self.fetch_kwargs)
    return df
def __init__(
    self,
    *,
    riskmodel_root,
    market="csi500",
    turn_limit=None,
    name_mapping=None,  # avoid a mutable default argument
    optimizer_kwargs=None,  # avoid a mutable default argument
    verbose=False,
    **kwargs,
):
    super().__init__(**kwargs)

    self.logger = get_module_logger("EnhancedIndexingStrategy")

    self.riskmodel_root = riskmodel_root
    self.market = market
    self.turn_limit = turn_limit

    name_mapping = name_mapping or {}
    self.factor_exp_path = name_mapping.get("factor_exp", self.FACTOR_EXP_NAME)
    self.factor_cov_path = name_mapping.get("factor_cov", self.FACTOR_COV_NAME)
    self.specific_risk_path = name_mapping.get("specific_risk", self.SPECIFIC_RISK_NAME)
    self.blacklist_path = name_mapping.get("blacklist", self.BLACKLIST_NAME)

    self.optimizer = EnhancedIndexingOptimizer(**(optimizer_kwargs or {}))

    self.verbose = verbose

    self._riskdata_cache = {}
def get_range_limit(self, **kwargs: Any) -> Tuple[int, int]:
    """
    Return the expected step range for limiting the decision execution time.
    Both left and right are **closed**.

    If there is no available trade_range, `default_value` will be returned.

    It is only used in `NestedExecutor`:
    - The outermost strategy will not follow any range limit (but it may give a range_limit).
    - The innermost strategy's range_limit is useless because atomic executors do not support it.

    **NOTE**:
    1) This function must be called after `self.update` in the following cases (ensured by NestedExecutor):
       - the user relies on the auto-clip feature of `self.update`
    2) This function will be called after `_init_sub_trading` in NestedExecutor.

    Parameters
    ----------
    **kwargs:
        {
            "default_value": <default_value>,  # a dict is used to distinguish "no value provided" from "None provided"
            "inner_calendar": <trade calendar of inner strategy>
            # because the range limit controls the step range of the inner strategy, the inner calendar
            # is an important parameter when trade_range is callable
        }

    Returns
    -------
    Tuple[int, int]:

    Raises
    ------
    NotImplementedError:
        If the following criteria are met
        1) the decision can't provide a unified start and end
        2) default_value is not provided
    """
    try:
        _start_idx, _end_idx = self._get_range_limit(**kwargs)
    except NotImplementedError as e:
        if "default_value" in kwargs:
            return kwargs["default_value"]
        else:
            # Default to get full index
            raise NotImplementedError("The decision didn't provide an index range") from e

    # clip index
    if getattr(self, "total_step", None) is not None:
        # if `self.update` is called, then _start_idx and _end_idx should be clipped
        assert self.total_step is not None
        if _start_idx < 0 or _end_idx >= self.total_step:
            logger = get_module_logger("decision")
            logger.warning(
                f"[{_start_idx},{_end_idx}] go beyond the total_step({self.total_step}), it will be clipped.",
            )
            _start_idx, _end_idx = max(0, _start_idx), min(self.total_step - 1, _end_idx)
    return _start_idx, _end_idx
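# Hedged caller sketch for the kwargs contract documented above: a nested executor
# might pass `default_value` so decisions without a range fall back to the full
# calendar. `decision` and `inner_calendar` are assumed objects, not defined here.
def _example_range_limit(decision, inner_calendar):
    full_range = (0, len(inner_calendar) - 1)
    start, end = decision.get_range_limit(default_value=full_range, inner_calendar=inner_calendar)
    return start, end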
def __init__(self, net_config=None, opt_config=None, metric="", GPU=0, seed=None, **kwargs):
    # Set logger.
    self.logger = get_module_logger("QuantTransformer")
    self.logger.info("QuantTransformer PyTorch version...")

    # set hyper-parameters.
    self.net_config = net_config or DEFAULT_NET_CONFIG
    self.opt_config = opt_config or DEFAULT_OPT_CONFIG
    self.metric = metric
    self.device = torch.device("cuda:{:}".format(GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu")
    self.seed = seed

    self.logger.info(
        "Transformer parameters setting:"
        "\nnet_config : {:}"
        "\nopt_config : {:}"
        "\nmetric : {:}"
        "\ndevice : {:}"
        "\nseed : {:}".format(
            self.net_config,
            self.opt_config,
            self.metric,
            self.device,
            self.seed,
        )
    )

    if self.seed is not None:
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        if self.use_gpu:
            torch.cuda.manual_seed(self.seed)
            torch.cuda.manual_seed_all(self.seed)

    self.model = get_transformer(self.net_config)
    self.model.set_super_run_type(super_core.SuperRunMode.FullModel)
    self.logger.info("model: {:}".format(self.model))
    self.logger.info("model size: {:.3f} MB".format(count_parameters(self.model)))

    if self.opt_config["optimizer"] == "adam":
        self.train_optimizer = optim.Adam(self.model.parameters(), lr=self.opt_config["lr"])
    elif self.opt_config["optimizer"] == "sgd":  # fixed: this branch previously repeated "adam"
        self.train_optimizer = optim.SGD(self.model.parameters(), lr=self.opt_config["lr"])
    else:
        raise NotImplementedError("optimizer {:} is not supported!".format(self.opt_config["optimizer"]))
    self.fitted = False
    self.model.to(self.device)
def collect(self, artifacts_key=None, rec_filter_func=None, only_exist=True) -> dict:
    """
    Collect different artifacts based on recorders after filtering.

    Args:
        artifacts_key (str or List, optional): the artifacts key you want to get. If None, use the default.
        rec_filter_func (Callable, optional): filter the recorder by returning True or False. If None, use the default.
        only_exist (bool, optional): whether to only collect artifacts that a recorder actually has.
            If True, recorders that raise an exception when loading will be skipped.
            If False, the exception will be raised.

    Returns:
        dict: the collected dict, like {artifact: {rec_key: object}}
    """
    if artifacts_key is None:
        artifacts_key = self.artifacts_key
    if rec_filter_func is None:
        rec_filter_func = self.rec_filter_func
    if isinstance(artifacts_key, str):
        artifacts_key = [artifacts_key]

    collect_dict = {}
    # filter records
    recs = self.experiment.list_recorders(**self.list_kwargs)
    recs_flt = {}
    for rid, rec in recs.items():
        if rec_filter_func is None or rec_filter_func(rec):
            recs_flt[rid] = rec

    logger = get_module_logger("RecorderCollector")
    for _, rec in recs_flt.items():
        rec_key = self.rec_key_func(rec)
        for key in artifacts_key:
            if self.ART_KEY_RAW == key:
                artifact = rec
            else:
                try:
                    artifact = rec.load_object(self.artifacts_path[key])
                except Exception as e:
                    if only_exist:
                        # only collect existing artifacts
                        continue
                    raise e
            # warn the user if a value is overridden
            cdd = collect_dict.setdefault(key, {})
            if rec_key in cdd:
                logger.warning(
                    f"key '{rec_key}' is duplicated. The previous value will be overridden. Please check your `rec_key_func`."
                )
            cdd[rec_key] = artifact

    return collect_dict
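# Hedged usage sketch: collecting saved predictions from an experiment. The experiment
# name and the "pred.pkl" artifact path mirror common qlib conventions, but both are
# assumptions here, not facts from this file.
def _example_collect_usage():
    from qlib.workflow.task.collect import RecorderCollector

    collector = RecorderCollector(experiment="demo_exp", artifacts_path={"pred": "pred.pkl"})
    collected = collector.collect(artifacts_key="pred")
    return collected  # {"pred": {<rec_key>: <prediction object>, ...}}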
def run_exp(task_config, dataset, experiment_name, recorder_name, uri):
    model = init_instance_by_config(task_config["model"])
    model_fit_kwargs = dict(dataset=dataset)

    # Let's start the experiment.
    with R.start(
        experiment_name=experiment_name,
        recorder_name=recorder_name,
        uri=uri,
        resume=True,
    ):
        # Setup log
        recorder_root_dir = R.get_recorder().get_local_dir()
        log_file = os.path.join(recorder_root_dir, "{:}.log".format(experiment_name))
        set_log_basic_config(log_file)
        logger = get_module_logger("q.run_exp")
        logger.info("task_config::\n{:}".format(pprint.pformat(task_config, indent=2)))
        logger.info("[{:}] - [{:}]: {:}".format(experiment_name, recorder_name, uri))
        logger.info("dataset={:}".format(dataset))

        # Train model
        R.log_params(**flatten_dict(task_config))
        if "save_path" in inspect.getfullargspec(model.fit).args:
            model_fit_kwargs["save_path"] = os.path.join(recorder_root_dir, "model.ckp")
        elif "save_dir" in inspect.getfullargspec(model.fit).args:
            model_fit_kwargs["save_dir"] = os.path.join(recorder_root_dir, "model-ckps")
        model.fit(**model_fit_kwargs)

        # Get the recorder
        recorder = R.get_recorder()
        R.save_objects(**{"model.pkl": model})

        # Generate records: prediction, backtest, and analysis
        for record in task_config["record"]:
            record = record.copy()
            if record["class"] == "SignalRecord":
                srconf = {"model": model, "dataset": dataset, "recorder": recorder}
                record["kwargs"].update(srconf)
                sr = init_instance_by_config(record)
                sr.generate()
            else:
                rconf = {"recorder": recorder}
                record["kwargs"].update(rconf)
                ar = init_instance_by_config(record)
                ar.generate()
def __init__(self, name_id: str):
    """
    Init OnlineStrategy.
    This module **MUST** use `Trainer <../reference/api.html#Trainer>`_ to finish model training.

    Args:
        name_id (str): a unique name or id.
    """
    self.name_id = name_id
    self.logger = get_module_logger(self.__class__.__name__)
    self.tool = OnlineTool()
def __init__(self, socketio, app):
    super(RequestListener, self).__init__()
    # define flask app instances
    self.socketio = socketio
    self.app = app
    # define server instances
    self.channel = init_rabbitmq_channel(C.queue_host, C.queue_user, C.queue_pwd)
    self.channel.queue_declare(queue=C.task_queue, durable=True)
    self.channel.queue_declare(queue=C.message_queue, durable=True)
    self.logger = get_module_logger(self.__class__.__name__)
    self.redis_t = get_redis_connection()
def __init__(self, net_config=None, opt_config=None, metric="", GPU=0, seed=None, **kwargs):
    # Set logger.
    self.logger = get_module_logger("QuantTransformer")
    self.logger.info("QuantTransformer PyTorch version...")

    # set hyper-parameters.
    self.net_config = net_config or default_net_config
    self.opt_config = opt_config or default_opt_config
    self.metric = metric
    self.device = torch.device("cuda:{:}".format(GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu")
    self.seed = seed

    self.logger.info(
        "Transformer parameters setting:"
        "\nnet_config : {:}"
        "\nopt_config : {:}"
        "\nmetric : {:}"
        "\ndevice : {:}"
        "\nseed : {:}".format(
            self.net_config,
            self.opt_config,
            self.metric,
            self.device,
            self.seed,
        )
    )

    if self.seed is not None:
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)

    self.model = TransformerModel(
        d_feat=self.net_config["d_feat"],
        embed_dim=self.net_config["hidden_size"],
        depth=self.net_config["depth"],
        pos_drop=self.net_config["pos_drop"],
    )
    self.logger.info("model: {:}".format(self.model))
    self.logger.info("model size: {:.3f} MB".format(count_parameters(self.model)))

    if self.opt_config["optimizer"] == "adam":
        self.train_optimizer = optim.Adam(self.model.parameters(), lr=self.opt_config["lr"])
    elif self.opt_config["optimizer"] == "sgd":  # fixed: this branch previously repeated "adam"
        self.train_optimizer = optim.SGD(self.model.parameters(), lr=self.opt_config["lr"])
    else:
        raise NotImplementedError("optimizer {:} is not supported!".format(self.opt_config["optimizer"]))
    self.fitted = False
    self.model.to(self.device)
def __init__(self, is_interface=False, update_interval=24, max_workers=20):
    """
    Parameters
    ----------
    is_interface : bool
        whether this class needs to run, or simply provides an interface for a queue to call
    update_interval : int
        the hourly interval to update the cache
    max_workers : int
        multi-process count
    """
    super(DataUpdater, self).__init__()
    self.logger = get_module_logger(self.__class__.__name__)
    self.is_interface = is_interface
    self.update_interval = update_interval
    self.max_workers = max_workers
def main():
    LOG = get_module_logger(__file__)
    from qlib_server.request_handler import RequestHandler
    from qlib_server.data_processor import DataProcessor

    LOG.info("QLibServer starting...")
    threads = []
    if "request_handler" in ARGS.module:
        threads.append(RequestHandler())
    if "data_processor" in ARGS.module:
        threads.append(DataProcessor())
    for t in threads:
        t.start()
    for t in threads:
        t.join()
def __init__(self, d_feat=6, seed=None, **kwargs):
    # Set logger.
    self.logger = get_module_logger("NAIVE")
    self.logger.info("NAIVE version...")

    # set hyper-parameters.
    self.d_feat = d_feat
    self.seed = seed
    self.logger.info("NAIVE parameters setting: d_feat={:}, seed={:}".format(self.d_feat, self.seed))

    if self.seed is not None:
        random.seed(self.seed)
        np.random.seed(self.seed)
    self.fitted = False
def __init__(
    self,
    log_every_n_episode: int = 20,
    total_episodes: int | None = None,
    float_format: str = ":.4f",
    counter_format: str = ":4d",
    loglevel: int | LogLevel = LogLevel.PERIODIC,
):
    super().__init__(loglevel)
    # TODO: support log_every_n_step
    self.log_every_n_episode = log_every_n_episode
    self.total_episodes = total_episodes
    self.counter_format = counter_format
    self.float_format = float_format
    self.prefix = ""
    self.console_logger = get_module_logger(__name__, level=logging.INFO)
def __init__(
    self,
    time_per_step: str,
    start_time: Union[str, pd.Timestamp] = None,
    end_time: Union[str, pd.Timestamp] = None,
    indicator_config: dict = {},
    generate_portfolio_metrics: bool = False,
    verbose: bool = False,
    track_data: bool = False,
    trade_exchange: Exchange = None,
    common_infra: CommonInfrastructure = None,
    settle_type=BasePosition.ST_NO,
    **kwargs,
):
    """
    Parameters
    ----------
    time_per_step : str
        trade time per trading step, used to generate the trade calendar
    indicator_config : dict, optional
        config for calculating trade indicators, including the following fields:
        - 'show_indicator': whether to show indicators, optional, default by False. The indicators include
            - 'pa', the price advantage
            - 'pos', the positive rate
            - 'ffr', the fulfill rate
        - 'pa_config': config for calculating price advantage (pa), optional
            - 'base_price': the base price against which the trading price is compared, optional, default by 'twap'
                - If 'base_price' is 'twap', the base price is the time-weighted average price
                - If 'base_price' is 'vwap', the base price is the volume-weighted average price
            - 'weight_method': weighting method when calculating each step's total pa from different orders' pa,
              optional, default by 'mean'
                - If 'weight_method' is 'mean', calculate the mean value of different orders' pa
                - If 'weight_method' is 'amount_weighted', calculate the amount-weighted average of different orders' pa
                - If 'weight_method' is 'value_weighted', calculate the value-weighted average of different orders' pa
        - 'ffr_config': config for calculating fulfill rate (ffr), optional
            - 'weight_method': weighting method when calculating each step's total ffr from different orders' ffr,
              optional, default by 'mean'
                - If 'weight_method' is 'mean', calculate the mean value of different orders' ffr
                - If 'weight_method' is 'amount_weighted', calculate the amount-weighted average of different orders' ffr
                - If 'weight_method' is 'value_weighted', calculate the value-weighted average of different orders' ffr
        Example:
            {
                'show_indicator': True,
                'pa_config': {
                    "agg": "twap",  # "vwap"
                    "price": "$close",  # default to use the deal price of the exchange
                },
                'ffr_config': {
                    'weight_method': 'value_weighted',
                },
            }
    generate_portfolio_metrics : bool, optional
        whether to generate portfolio_metrics, by default False
    verbose : bool, optional
        whether to print trading info, by default False
    track_data : bool, optional
        whether to generate trade_decision; used when training an RL agent
        - If `self.track_data` is true, when generating training data, the input `trade_decision` of `execute`
          will be generated by `collect_data`
        - Else, `trade_decision` will not be generated
    trade_exchange : Exchange
        exchange that provides market info, used to generate portfolio_metrics
        - If generate_portfolio_metrics is None, trade_exchange will be ignored
        - Else if `trade_exchange` is None, self.trade_exchange will be set from common_infra
    common_infra : CommonInfrastructure, optional
        common infrastructure for backtesting, which may include:
        - trade_account : Account, optional
            trade account for trading
        - trade_exchange : Exchange, optional
            exchange that provides market info
    settle_type : str
        Please refer to the docs of BasePosition.settle_start
    """
    self.time_per_step = time_per_step
    self.indicator_config = indicator_config
    self.generate_portfolio_metrics = generate_portfolio_metrics
    self.verbose = verbose
    self.track_data = track_data
    self._trade_exchange = trade_exchange
    self.level_infra = LevelInfrastructure()
    self.level_infra.reset_infra(common_infra=common_infra)
    self._settle_type = settle_type
    self.reset(start_time=start_time, end_time=end_time, common_infra=common_infra)
    if common_infra is None:
        get_module_logger("BaseExecutor").warning(f"`common_infra` is not set for {self}")

    # record the dealt order amount in one day
    self.dealt_order_amount = defaultdict(float)
    self.deal_day = None
def __init__(self):
    """
    Init OnlineTool.
    """
    self.logger = get_module_logger(self.__class__.__name__)
def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False):
    if phase == "train":
        self.tn.train()
        torch.set_grad_enabled(True)
    else:
        self.tn.eval()
        torch.set_grad_enabled(False)
    running_loss = 0.0
    pred_y_all = []
    for task in tqdm(task_list, desc=f"{phase} Task", leave=False):
        meta_input = task.get_meta_input()
        pred, weights = self.tn(
            meta_input["X"],
            meta_input["y"],
            meta_input["time_perf"],
            meta_input["time_belong"],
            meta_input["X_test"],
            ignore_weight=ignore_weight,
        )
        if self.criterion == "mse":
            criterion = nn.MSELoss()
            loss = criterion(pred, meta_input["y_test"])
        elif self.criterion == "ic_loss":
            criterion = ICLoss()
            try:
                loss = criterion(pred, meta_input["y_test"], meta_input["test_idx"], skip_size=50)
            except ValueError as e:
                get_module_logger("MetaModelDS").warning(f"Exception `{e}` when calculating IC loss")
                continue

        assert not np.isnan(loss.detach().item()), "NaN loss!"

        if phase == "train":
            opt.zero_grad()
            loss.backward()
            opt.step()

        pred_y_all.append(
            pd.DataFrame(
                {
                    "pred": pd.Series(pred.detach().cpu().numpy(), index=meta_input["test_idx"]),
                    "label": pd.Series(meta_input["y_test"].detach().cpu().numpy(), index=meta_input["test_idx"]),
                }
            )
        )
        running_loss += loss.detach().item()
    running_loss = running_loss / len(task_list)
    loss_l.setdefault(phase, []).append(running_loss)

    pred_y_all = pd.concat(pred_y_all)
    ic = pred_y_all.groupby("datetime").apply(lambda df: df["pred"].corr(df["label"], method="spearman")).mean()

    R.log_metrics(**{f"loss/{phase}": running_loss, "step": epoch})
    R.log_metrics(**{f"ic/{phase}": ic, "step": epoch})
import copy
from typing import Union, List

import pandas as pd  # required by TimeReweighter below

from qlib.log import get_module_logger
from qlib.model.meta.task import MetaTask
from qlib.contrib.meta.data_selection.net import PredNet

from ....data.dataset.weight import Reweighter
from ....model.meta.dataset import MetaTaskDataset
from ....model.meta.model import MetaTaskModel
from ....workflow import R
from .utils import ICLoss
from .dataset import MetaDatasetDS

logger = get_module_logger("data selection")


class TimeReweighter(Reweighter):
    def __init__(self, time_weight: pd.Series):
        self.time_weight = time_weight

    def reweight(self, data: Union[pd.DataFrame, pd.Series]):
        # TODO: handle TSDataSampler
        w_s = pd.Series(1.0, index=data.index)
        for k, w in self.time_weight.items():
            w_s.loc[slice(*k)] = w
        logger.info(f"Reweighting result: {w_s}")
        return w_s
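# Hedged usage sketch for TimeReweighter: `time_weight` maps (start, end) segment
# tuples to weights, which are broadcast over the matching index slices of `data`.
# The dates and weights below are illustrative assumptions, not taken from this file.
def _example_time_reweighter(data: pd.Series) -> pd.Series:
    time_weight = pd.Series({("2019-01-01", "2019-06-30"): 0.5, ("2019-07-01", "2019-12-31"): 2.0})
    return TimeReweighter(time_weight).reweight(data)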
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import struct
from pathlib import Path
from typing import Iterable, Union, Dict, Mapping, Tuple, List

import numpy as np
import pandas as pd

from qlib.log import get_module_logger
from qlib.data.storage import CalendarStorage, InstrumentStorage, FeatureStorage, CalVT, InstKT, InstVT

logger = get_module_logger("file_storage")


class FileStorageMixin:
    @property
    def uri(self) -> Path:
        _provider_uri = self.kwargs.get("provider_uri", None)
        if _provider_uri is None:
            raise ValueError(
                f"The `provider_uri` parameter is not found in {self.__class__.__name__}, "
                f'please specify `provider_uri` in the "provider\'s backend"'
            )
        return Path(_provider_uri).expanduser().joinpath(f"{self.storage_name}s", self.file_name)

    def check(self):
        """check self.uri

        Raises
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import numpy as np
import cvxpy as cp
from typing import Union, Optional, Dict, Any, List

from qlib.log import get_module_logger
from .base import BaseOptimizer

logger = get_module_logger("EnhancedIndexingOptimizer")


class EnhancedIndexingOptimizer(BaseOptimizer):
    """
    Portfolio Optimizer for Enhanced Indexing

    Notations:
        w0: current holding weights
        wb: benchmark weights
        r: expected returns
        F: factor exposure
        cov_b: factor covariance
        var_u: residual variance (diagonal)
        lamb: risk aversion parameter
        delta: total turnover limit
        b_dev: benchmark deviation limit
        f_dev: factor deviation limit

    Also denote:
def __init__(
    self,
    d_feat=6,
    hidden_size=64,
    num_layers=2,
    dropout=0.0,
    n_epochs=200,
    pre_epoch=40,
    dw=0.5,
    loss_type="cosine",
    len_seq=60,
    len_win=0,
    lr=0.001,
    metric="mse",
    batch_size=2000,
    early_stop=20,
    loss="mse",
    optimizer="adam",
    n_splits=2,
    GPU=0,
    seed=None,
    **kwargs,
):
    # Set logger.
    self.logger = get_module_logger("ADARNN")
    self.logger.info("ADARNN PyTorch version...")
    os.environ["CUDA_VISIBLE_DEVICES"] = str(GPU)

    # set hyper-parameters.
    self.d_feat = d_feat
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.dropout = dropout
    self.n_epochs = n_epochs
    self.pre_epoch = pre_epoch
    self.dw = dw
    self.loss_type = loss_type
    self.len_seq = len_seq
    self.len_win = len_win
    self.lr = lr
    self.metric = metric
    self.batch_size = batch_size
    self.early_stop = early_stop
    self.optimizer = optimizer.lower()
    self.loss = loss
    self.n_splits = n_splits
    self.device = torch.device("cuda:%d" % GPU if torch.cuda.is_available() and GPU >= 0 else "cpu")
    self.seed = seed

    self.logger.info(
        "ADARNN parameters setting:"
        "\nd_feat : {}"
        "\nhidden_size : {}"
        "\nnum_layers : {}"
        "\ndropout : {}"
        "\nn_epochs : {}"
        "\nlr : {}"
        "\nmetric : {}"
        "\nbatch_size : {}"
        "\nearly_stop : {}"
        "\noptimizer : {}"
        "\nloss : {}"  # relabeled from "loss_type": the value logged here is `loss`, not `loss_type`
        "\nvisible_GPU : {}"
        "\nuse_GPU : {}"
        "\nseed : {}".format(
            d_feat,
            hidden_size,
            num_layers,
            dropout,
            n_epochs,
            lr,
            metric,
            batch_size,
            early_stop,
            optimizer.lower(),
            loss,
            GPU,
            self.use_gpu,
            seed,
        )
    )

    if self.seed is not None:
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)

    n_hiddens = [hidden_size for _ in range(num_layers)]
    self.model = AdaRNN(
        use_bottleneck=False,
        bottleneck_width=64,
        n_input=d_feat,
        n_hiddens=n_hiddens,
        n_output=1,
        dropout=dropout,
        model_type="AdaRNN",
        len_seq=len_seq,
        trans_loss=loss_type,
    )
    self.logger.info("model:\n{:}".format(self.model))
    self.logger.info("model size: {:.4f} MB".format(count_parameters(self.model)))

    if optimizer.lower() == "adam":
        self.train_optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
    elif optimizer.lower() == "gd":
        self.train_optimizer = optim.SGD(self.model.parameters(), lr=self.lr)
    else:
        raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
    self.fitted = False
    # move to the resolved device instead of unconditionally calling .cuda(),
    # which would crash on CPU-only machines
    self.model.to(self.device)
def __init__(
    self,
    model_config,
    tra_config,
    model_type="LSTM",
    lr=1e-3,
    n_epochs=500,
    early_stop=50,
    smooth_steps=5,
    max_steps_per_epoch=None,
    freeze_model=False,
    model_init_state=None,
    lamb=0.0,
    rho=0.99,
    seed=None,
    logdir=None,
    eval_train=True,
    eval_test=False,
    avg_params=True,
    **kwargs,
):
    np.random.seed(seed)
    torch.manual_seed(seed)

    self.logger = get_module_logger("TRA")
    self.logger.info("TRA Model...")

    self.model = eval(model_type)(**model_config).to(device)
    if model_init_state:
        self.model.load_state_dict(torch.load(model_init_state, map_location="cpu")["model"])
    if freeze_model:
        for param in self.model.parameters():
            param.requires_grad_(False)
    else:
        self.logger.info("# model params: %d" % sum(p.numel() for p in self.model.parameters()))

    self.tra = TRA(self.model.output_size, **tra_config).to(device)
    self.logger.info("# tra params: %d" % sum(p.numel() for p in self.tra.parameters()))

    self.optimizer = optim.Adam(list(self.model.parameters()) + list(self.tra.parameters()), lr=lr)

    self.model_config = model_config
    self.tra_config = tra_config
    self.lr = lr
    self.n_epochs = n_epochs
    self.early_stop = early_stop
    self.smooth_steps = smooth_steps
    self.max_steps_per_epoch = max_steps_per_epoch
    self.lamb = lamb
    self.rho = rho
    self.seed = seed
    self.logdir = logdir
    self.eval_train = eval_train
    self.eval_test = eval_test
    self.avg_params = avg_params

    if self.tra.num_states > 1 and not self.eval_train:
        # logger.warn is deprecated; use logger.warning
        self.logger.warning("`eval_train` will be ignored when using TRA")

    if self.logdir is not None:
        if os.path.exists(self.logdir):
            self.logger.warning(f"logdir {self.logdir} is not empty")
        os.makedirs(self.logdir, exist_ok=True)

    self.fitted = False
    self.global_step = -1
def __init__(
    self,
    *,
    task_tpl: Union[dict, list],
    step: int,
    trunc_days: int = None,
    rolling_ext_days: int = 0,
    exp_name: Union[str, InternalData],
    segments: Union[Dict[Text, Tuple], float],
    hist_step_n: int = 10,
    task_mode: str = MetaTask.PROC_MODE_FULL,
    fill_method: str = "max",
):
    """
    A dataset for meta models.

    Parameters
    ----------
    task_tpl : Union[dict, list]
        Decide what tasks are used.
        - dict : the task template; the prepared tasks are generated with `step`, `trunc_days` and `RollingGen`
        - list : when it is a list, use the list of tasks directly;
          the list is supposed to be sorted according to the timeline
    step : int
        the rolling step
    trunc_days : int
        days to be truncated based on the test start
    rolling_ext_days : int
        sometimes users want to train meta models for a longer test period but with smaller rolling steps
        for more task samples.
        The total length of the test periods will be `step + rolling_ext_days`.
    exp_name : Union[str, InternalData]
        Decide what meta_info is used for prediction.
        - str: the name of the experiment storing the performance of the data
        - InternalData: a prepared internal data object
    segments : Union[Dict[Text, Tuple], float]
        the segments to divide the data; both ends are closed
        if segments is a float, it represents the percentage of data used for training
    hist_step_n : int
        length of historical steps for the meta information
    task_mode : str
        Please refer to the docs of MetaTask
    """
    super().__init__(segments=segments)
    if isinstance(exp_name, InternalData):
        self.internal_data = exp_name
    else:
        self.internal_data = InternalData(task_tpl, step=step, exp_name=exp_name)
        self.internal_data.setup()
    self.task_tpl = deepcopy(task_tpl)  # FIXME: if the handler is shared, how to avoid the explosion of memory?
    self.trunc_days = trunc_days
    self.hist_step_n = hist_step_n
    self.step = step

    if isinstance(task_tpl, dict):
        rg = RollingGen(
            step=step, trunc_days=trunc_days, task_copy_func=deepcopy_basic_type
        )  # NOTE: trunc_days is very important !!!!
        task_iter = rg(task_tpl)
        if rolling_ext_days > 0:
            self.ta = TimeAdjuster(future=True)
            for t in task_iter:
                t["dataset"]["kwargs"]["segments"]["test"] = self.ta.shift(
                    t["dataset"]["kwargs"]["segments"]["test"], step=rolling_ext_days, rtype=RollingGen.ROLL_EX
                )
        if task_mode == MetaTask.PROC_MODE_FULL:
            # Only pre-initialize the task when the full task is required;
            # initialize the handler and share it.
            init_task_handler(task_tpl)
    else:
        assert isinstance(task_tpl, list)
        task_iter = task_tpl

    self.task_list = []
    self.meta_task_l = []
    logger = get_module_logger("MetaDatasetDS")
    logger.info(f"Example task for training meta model: {task_iter[0]}")
    for t in tqdm(task_iter, desc="creating meta tasks"):
        try:
            self.meta_task_l.append(
                MetaTaskDS(t, meta_info=self._prepare_meta_ipt(t), mode=task_mode, fill_method=fill_method)
            )
            self.task_list.append(t)
        except ValueError as e:
            logger.warning(f"ValueError: {e}")
    assert len(self.meta_task_l) > 0, "No meta tasks found. Please check the data and settings."
def setup(self, trainer=TrainerR, trainer_kwargs={}):
    """
    After running this function, `self.data_ic_df` will be set.
    Each column represents a data segment; each row represents the timestamped performance of that segment.

    For example,

    .. code-block:: python

                       2021-06-21 2021-06-04 2021-05-21 2021-05-07 2021-04-20 2021-04-06 2021-03-22 2021-03-08  ...
                       2021-07-02 2021-06-18 2021-06-03 2021-05-20 2021-05-06 2021-04-19 2021-04-02 2021-03-19  ...
        datetime                                                                                                ...
        2018-01-02       0.079782   0.115975   0.070866   0.028849  -0.081170   0.140380   0.063864   0.110987  ...
        2018-01-03       0.123386   0.107789   0.071037   0.045278  -0.060782   0.167446   0.089779   0.124476  ...
        2018-01-04       0.140775   0.097206   0.063702   0.042415  -0.078164   0.173218   0.098914   0.114389  ...
        2018-01-05       0.030320  -0.037209  -0.044536  -0.047267  -0.081888   0.045648   0.059947   0.047652  ...
        2018-01-08       0.107201   0.009219  -0.015995  -0.036594  -0.086633   0.108965   0.122164   0.108508  ...
        ...                   ...        ...        ...        ...        ...        ...        ...        ...  ...
    """

    # 1) prepare the prediction of proxy models
    perf_task_tpl = deepcopy(self.task_tpl)  # this task is supposed to contain no complicated objects
    trainer = auto_filter_kwargs(trainer)(experiment_name=self.exp_name, **trainer_kwargs)
    # NOTE: the handler is initialized only once.
    if not trainer.has_worker():
        self.dh = init_task_handler(perf_task_tpl)
    else:
        self.dh = init_instance_by_config(perf_task_tpl["dataset"]["kwargs"]["handler"])

    seg = perf_task_tpl["dataset"]["kwargs"]["segments"]

    # We want to split the training time period into small segments.
    perf_task_tpl["dataset"]["kwargs"]["segments"] = {
        "train": (DatasetH.get_min_time(seg), DatasetH.get_max_time(seg)),
        "test": (None, None),
    }

    # NOTE:
    # we play a trick here:
    # treat the training segments as test segments to create the rolling tasks
    rg = RollingGen(step=self.step, test_key="train", train_key=None, task_copy_func=deepcopy_basic_type)
    gen_task = task_generator(perf_task_tpl, [rg])

    recorders = R.list_recorders(experiment_name=self.exp_name)
    if len(gen_task) == len(recorders):
        get_module_logger("Internal Data").info("the data has been initialized")
    else:
        # train new models
        assert 0 == len(recorders), "An empty experiment is required for setting up `InternalData`"
        trainer.train(gen_task)

    # 2) extract the similarity matrix
    label_df = self.dh.fetch(col_set="label")
    recorders = R.list_recorders(experiment_name=self.exp_name)

    key_l = []
    ic_l = []
    for _, rec in tqdm(recorders.items(), desc="calc"):
        pred = rec.load_object("pred.pkl")
        task = rec.load_object("task")
        data_key = task["dataset"]["kwargs"]["segments"]["train"]
        key_l.append(data_key)
        ic_l.append(delayed(self._calc_perf)(pred.iloc[:, 0], label_df.iloc[:, 0]))

    ic_l = Parallel(n_jobs=-1)(ic_l)
    self.data_ic_df = pd.DataFrame(dict(zip(key_l, ic_l)))
    self.data_ic_df = self.data_ic_df.sort_index().sort_index(axis=1)

    del self.dh  # the handler is not useful now
def __init__( self, model_config, tra_config, model_type="RNN", lr=1e-3, n_epochs=500, early_stop=50, update_freq=1, max_steps_per_epoch=None, lamb=0.0, rho=0.99, alpha=1.0, seed=None, logdir=None, eval_train=False, eval_test=False, pretrain=False, init_state=None, reset_router=False, freeze_model=False, freeze_predictors=False, transport_method="none", memory_mode="sample", ): self.logger = get_module_logger("TRA") assert memory_mode in ["sample", "daily"], "invalid memory mode" assert transport_method in ["none", "router", "oracle"], f"invalid transport method {transport_method}" assert transport_method == "none" or tra_config["num_states"] > 1, "optimal transport requires `num_states` > 1" assert ( memory_mode != "daily" or tra_config["src_info"] == "TPE" ), "daily transport can only support TPE as `src_info`" if transport_method == "router" and not eval_train: self.logger.warning("`eval_train` will be ignored when using TRA.router") if seed is not None: np.random.seed(seed) torch.manual_seed(seed) self.model_config = model_config self.tra_config = tra_config self.model_type = model_type self.lr = lr self.n_epochs = n_epochs self.early_stop = early_stop self.update_freq = update_freq self.max_steps_per_epoch = max_steps_per_epoch self.lamb = lamb self.rho = rho self.alpha = alpha self.seed = seed self.logdir = logdir self.eval_train = eval_train self.eval_test = eval_test self.pretrain = pretrain self.init_state = init_state self.reset_router = reset_router self.freeze_model = freeze_model self.freeze_predictors = freeze_predictors self.transport_method = transport_method self.use_daily_transport = memory_mode == "daily" self.transport_fn = transport_daily if self.use_daily_transport else transport_sample self._writer = None if self.logdir is not None: if os.path.exists(self.logdir): self.logger.warning(f"logdir {self.logdir} is not empty") os.makedirs(self.logdir, exist_ok=True) if SummaryWriter is not None: self._writer = SummaryWriter(log_dir=self.logdir) self._init_model()
import re
from typing import Iterable, overload, Tuple, List, Text, Union, Dict

import numpy as np
import pandas as pd

from qlib.log import get_module_logger

# calendar value type
CalVT = str

# instrument value
InstVT = List[Tuple[CalVT, CalVT]]
# instrument key
InstKT = Text

logger = get_module_logger("storage")

"""
If the user is only using it in `qlib`, you can customize Storage to implement only the following methods:

    class UserCalendarStorage(CalendarStorage):

        @property
        def data(self) -> Iterable[CalVT]:
            '''get all data

            Raises
            ------
            ValueError
                If the data (storage) does not exist, raise ValueError
            '''