Example #1
    def fit(
        self,
        dataset: DatasetH,
        evals_result=dict(),
        save_path=None,
    ):
        label_train, label_valid = dataset.prepare(
            ["train", "valid"],
            col_set=["label"],
            data_key=DataHandlerLP.DK_R,
        )
        self.fit_thresh(label_train)
        df_train, df_valid = dataset.prepare(
            ["train", "valid"],
            col_set=["feature", "label"],
            data_key=DataHandlerLP.DK_L,
        )
        df_train = self.gen_market_label(df_train, label_train)
        df_valid = self.gen_market_label(df_valid, label_valid)

        x_train, y_train, m_train = df_train["feature"], df_train["label"], df_train["market_return"]
        x_valid, y_valid, m_valid = df_valid["feature"], df_valid["label"], df_valid["market_return"]

        evals_result["train"] = []
        evals_result["valid"] = []
        # load pretrained base_model

        if self.base_model == "LSTM":
            pretrained_model = LSTMModel()
        elif self.base_model == "GRU":
            pretrained_model = GRUModel()
        else:
            raise ValueError("unknown base model name `%s`" % self.base_model)

        if self.model_path is not None:
            self.logger.info("Loading pretrained model...")
            pretrained_model.load_state_dict(torch.load(self.model_path, map_location=self.device))

            model_dict = self.ADD_model.enc_excess.state_dict()
            pretrained_dict = {k: v for k, v in pretrained_model.rnn.state_dict().items() if k in model_dict}
            model_dict.update(pretrained_dict)
            self.ADD_model.enc_excess.load_state_dict(model_dict)
            model_dict = self.ADD_model.enc_market.state_dict()
            pretrained_dict = {k: v for k, v in pretrained_model.rnn.state_dict().items() if k in model_dict}
            model_dict.update(pretrained_dict)
            self.ADD_model.enc_market.load_state_dict(model_dict)
            self.logger.info("Loading pretrained model Done...")

        self.bootstrap_fit(x_train, y_train, m_train, x_valid, y_valid, m_valid)

        best_param = copy.deepcopy(self.ADD_model.state_dict())
        save_path = get_or_create_path(save_path)
        torch.save(best_param, save_path)
        if self.use_gpu:
            torch.cuda.empty_cache()
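The pretrained-weight transfer above follows a common PyTorch pattern: keep only the entries of the source state_dict whose keys also exist in the target module, merge them into the target's state_dict, and reload it. A minimal, self-contained sketch of that pattern (the function name and the shape check are additions for illustration, not part of the example):

import torch.nn as nn

def transfer_matching_weights(source: nn.Module, target: nn.Module) -> None:
    """Copy parameters from `source` into `target` wherever names and shapes match."""
    target_dict = target.state_dict()
    matching = {
        k: v for k, v in source.state_dict().items()
        if k in target_dict and v.shape == target_dict[k].shape
    }
    target_dict.update(matching)  # overwrite only the matching entries
    target.load_state_dict(target_dict)

# e.g. transfer_matching_weights(pretrained_model.rnn, self.ADD_model.enc_excess)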
Example #2
    def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
        if not self.fitted:
            raise ValueError("model is not fitted yet!")
        x_test = dataset.prepare(segment,
                                 col_set="feature",
                                 data_key=DataHandlerLP.DK_I)
        return self.infer(x_test)
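For context, a hypothetical call site for fit/predict with a DatasetH could look like the sketch below; the handler configuration, the date ranges, and the `model` object are assumptions, not part of the example above.

from qlib.utils import init_instance_by_config

# `handler_config` is an assumed Qlib data-handler configuration (e.g. Alpha158).
dataset = init_instance_by_config({
    "class": "DatasetH",
    "module_path": "qlib.data.dataset",
    "kwargs": {
        "handler": handler_config,
        "segments": {
            "train": ("2008-01-01", "2014-12-31"),
            "valid": ("2015-01-01", "2016-12-31"),
            "test": ("2017-01-01", "2020-08-01"),
        },
    },
})

model.fit(dataset)
pred = model.predict(dataset, segment="test")  # typically a pd.Series aligned with the test index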
Example #3
    def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
        if not self.fitted:
            raise ValueError("The model is not fitted yet!")
        x_test = dataset.prepare(segment,
                                 col_set="feature",
                                 data_key=DataHandlerLP.DK_I)
        index = x_test.index

        with torch.no_grad():
            self.model.eval()
            x_values = x_test.values
            sample_num, batch_size = x_values.shape[0], self.opt_config[
                "batch_size"]
            preds = []
            for begin in range(0, sample_num, batch_size):
                end = min(begin + batch_size, sample_num)
                x_batch = torch.from_numpy(x_values[begin:end]).float().to(
                    self.device)
                # gradients are already disabled by the outer torch.no_grad()
                pred = self.model(x_batch).detach().cpu().numpy()
                preds.append(pred)
        return pd.Series(np.concatenate(preds), index=index)
Example #4
    def _prepare_data(self, dataset: DatasetH):
        df_train, df_valid = dataset.prepare(["train", "valid"],
                                             col_set=["feature", "label"],
                                             data_key=DataHandlerLP.DK_L)

        x_train, y_train = df_train["feature"], df_train["label"]
        x_valid, y_valid = df_valid["feature"], df_valid["label"]
        if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
            l_name = df_train["label"].columns[0]
            # Convert label into alpha
            df_train["label"][l_name] = df_train["label"][l_name] - df_train[
                "label"][l_name].mean(level=0)
            df_valid["label"][l_name] = df_valid["label"][l_name] - df_valid[
                "label"][l_name].mean(level=0)
            mapping_fn = lambda x: 0 if x < 0 else 1
            df_train["label_c"] = df_train["label"][l_name].apply(mapping_fn)
            df_valid["label_c"] = df_valid["label"][l_name].apply(mapping_fn)
            x_train, y_train = df_train["feature"], df_train["label_c"].values
            x_valid, y_valid = df_valid["feature"], df_valid["label_c"].values
        else:
            raise ValueError("LightGBM doesn't support multi-label training")

        dtrain = lgb.Dataset(x_train.values, label=y_train)
        dvalid = lgb.Dataset(x_valid.values, label=y_valid)
        return dtrain, dvalid
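A note on the label transformation above: Series.mean(level=0) has been deprecated and removed in recent pandas releases. An equivalent per-datetime demeaning (the first index level) can be written with groupby/transform; `label` below is a hypothetical Series with a (datetime, instrument) MultiIndex.

# Cross-sectional demeaning: subtract each day's mean so the label becomes an alpha.
alpha = label - label.groupby(level=0).transform("mean")
# Binary classification target, mirroring mapping_fn above (1 if alpha >= 0, else 0).
label_c = (alpha >= 0).astype(int)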
Example #5
    def hf_signal_test(self, dataset: DatasetH, threshold=0.2):
        """
        Test the signal on the high-frequency test set.
        """
        if self.model is None:
            raise ValueError("Model hasn't been trained yet")
        df_test = dataset.prepare("test",
                                  col_set=["feature", "label"],
                                  data_key=DataHandlerLP.DK_I)
        df_test.dropna(inplace=True)
        x_test, y_test = df_test["feature"], df_test["label"]
        # Convert label into alpha
        y_test[y_test.columns[0]] = y_test[y_test.columns[0]] - y_test[
            y_test.columns[0]].mean(level=0)

        res = pd.Series(self.model.predict(x_test.values), index=x_test.index)
        y_test["pred"] = res

        up_p, down_p, up_a, down_a = self._cal_signal_metrics(
            y_test, threshold, 1 - threshold)
        print("===============================")
        print("High frequency signal test")
        print("===============================")
        print("Test set precision: ")
        print("Positive precision: {}, Negative precision: {}".format(
            up_p, down_p))
        print("Test Alpha Average in test set: ")
        print("Positive average alpha: {}, Negative average alpha: {}".format(
            up_a, down_a))
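A hedged guess at the kind of computation `self._cal_signal_metrics` performs: per day, take the top and bottom prediction buckets defined by the quantile thresholds and measure how often their realized alpha has the expected sign, plus the average alpha in each bucket. The helper below is an illustration under that assumption, not the method's actual definition.

import pandas as pd

def signal_metrics(day: pd.DataFrame, up_q: float, down_q: float):
    """Precision and mean alpha of the top/bottom prediction buckets for one day."""
    alpha_col = day.columns[0]  # realized alpha (the demeaned label)
    up = day[day["pred"] >= day["pred"].quantile(up_q)]
    down = day[day["pred"] <= day["pred"].quantile(down_q)]
    up_precision = (up[alpha_col] > 0).mean()
    down_precision = (down[alpha_col] < 0).mean()
    return up_precision, down_precision, up[alpha_col].mean(), down[alpha_col].mean()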
Example #6
    def fit(self, dataset: DatasetH):
        def _prepare_dataset(df_data):
            features = df_data["feature"].values
            features = self.process_data(features)
            labels = df_data["label"].values.squeeze()
            return dict(features=features, labels=labels)

        df_train, df_valid, df_test = dataset.prepare(
            ["train", "valid", "test"],
            col_set=["feature", "label"],
            data_key=DataHandlerLP.DK_L,
        )
        train_dataset, valid_dataset, test_dataset = (
            _prepare_dataset(df_train),
            _prepare_dataset(df_valid),
            _prepare_dataset(df_test),
        )
        # df_train['feature']['CLOSE1'].values
        # train_dataset['features'][:, -1]
        train_mse_loss = self.mse(self.model(train_dataset["features"]),
                                  train_dataset["labels"])
        valid_mse_loss = self.mse(self.model(valid_dataset["features"]),
                                  valid_dataset["labels"])
        self.logger.info("Training MSE loss: {:}".format(train_mse_loss))
        self.logger.info("Validation MSE loss: {:}".format(valid_mse_loss))
        self.fitted = True
Example #7
    def fit(
            self,
            dataset: DatasetH,
            evals_result=dict(),
            verbose=True,
            save_path=None,
    ):

        df_train, df_valid, df_test = dataset.prepare(
            ["train", "valid", "test"],
            col_set=["feature", "label"],
            data_key=DataHandlerLP.DK_L,
        )

        x_train, y_train = df_train["feature"], df_train["label"]
        x_valid, y_valid = df_valid["feature"], df_valid["label"]

        if save_path is None:
            save_path = create_save_path(save_path)
        stop_steps = 0
        train_loss = 0
        best_score = -np.inf
        best_epoch = 0
        evals_result["train"] = []
        evals_result["valid"] = []

        # train
        self.logger.info("training...")
        self.fitted = True

        for step in range(self.n_epochs):
            self.logger.info("Epoch%d:", step)
            self.logger.info("training...")
            self.train_epoch(x_train, y_train)
            self.logger.info("evaluating...")
            train_loss, train_score = self.test_epoch(x_train, y_train)
            val_loss, val_score = self.test_epoch(x_valid, y_valid)
            self.logger.info("train %.6f, valid %.6f" %
                             (train_score, val_score))
            evals_result["train"].append(train_score)
            evals_result["valid"].append(val_score)

            if val_score > best_score:
                best_score = val_score
                stop_steps = 0
                best_epoch = step
                best_param = copy.deepcopy(self.model.state_dict())
            else:
                stop_steps += 1
                if stop_steps >= self.early_stop:
                    self.logger.info("early stop")
                    break

        self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
        self.model.load_state_dict(best_param)
        torch.save(best_param, save_path)

        if self.use_gpu:
            torch.cuda.empty_cache()
Example #8
    def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
        x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
        index = x_test.index
        self.ADD_model.eval()
        x_values = x_test.values
        preds = []

        daily_index, daily_count = self.get_daily_inter(x_test, shuffle=False)

        for idx, count in zip(daily_index, daily_count):
            batch = slice(idx, idx + count)
            x_batch = torch.from_numpy(x_values[batch]).float().to(self.device)

            with torch.no_grad():
                pred = self.ADD_model(x_batch)
                pred = pred["excess"].detach().cpu().numpy()

            preds.append(pred)

        r = pd.Series(np.concatenate(preds), index=index)
        return r
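get_daily_inter is used here to batch by trading day, so each forward pass sees the full cross-section of one date. The sketch below shows what such a helper plausibly computes from a date-sorted (datetime, instrument) MultiIndex; it is an assumption about the helper, not its actual definition.

import numpy as np

def daily_index_and_count(df):
    """Start offset and row count of each trading day in a date-sorted frame."""
    daily_count = df.groupby(level=0).size().values  # rows per day
    daily_index = np.roll(np.cumsum(daily_count), 1)  # cumulative start offsets
    daily_index[0] = 0
    return daily_index, daily_count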
Example #9
    def _prepare_data(self, dataset: DatasetH):
        df_train, df_valid = dataset.prepare(
            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
        )
        return transform_df(df_train), transform_df(df_valid)
Example #10
    def fit(
            self,
            dataset: DatasetH,
            evals_result=dict(),
            save_path=None,
    ):

        df_train, df_valid = dataset.prepare(
            ["train", "valid"],
            col_set=["feature", "label"],
            data_key=DataHandlerLP.DK_L,
        )
        #  splits = ['2011-06-30']
        days = df_train.index.get_level_values(level=0).unique()
        train_splits = np.array_split(days, self.n_splits)
        train_splits = [df_train[s[0]:s[-1]] for s in train_splits]
        train_loader_list = [
            get_stock_loader(df, self.batch_size) for df in train_splits
        ]

        save_path = get_or_create_path(save_path)
        stop_steps = 0
        best_score = -np.inf
        best_epoch = 0
        evals_result["train"] = []
        evals_result["valid"] = []

        # train
        self.logger.info("training...")
        self.fitted = True
        weight_mat, dist_mat = None, None

        for step in range(self.n_epochs):
            self.logger.info("Epoch%d:", step)
            self.logger.info("training...")
            weight_mat, dist_mat = self.train_AdaRNN(train_loader_list, step,
                                                     dist_mat, weight_mat)
            self.logger.info("evaluating...")
            train_metrics = self.test_epoch(df_train)
            valid_metrics = self.test_epoch(df_valid)
            self.log_metrics("train: ", train_metrics)
            self.log_metrics("valid: ", valid_metrics)

            valid_score = valid_metrics[self.metric]
            train_score = train_metrics[self.metric]
            evals_result["train"].append(train_score)
            evals_result["valid"].append(valid_score)
            if valid_score > best_score:
                best_score = valid_score
                stop_steps = 0
                best_epoch = step
                best_param = copy.deepcopy(self.model.state_dict())
            else:
                stop_steps += 1
                if stop_steps >= self.early_stop:
                    self.logger.info("early stop")
                    break

        self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
        self.model.load_state_dict(best_param)
        torch.save(best_param, save_path)

        if self.use_gpu:
            torch.cuda.empty_cache()
        return best_score
Example #11
    def fit(
        self,
        dataset: DatasetH,
        save_dir: Optional[Text] = None,
    ):
        def _prepare_dataset(df_data):
            return th_data.TensorDataset(
                torch.from_numpy(df_data["feature"].values).float(),
                torch.from_numpy(df_data["label"].values).squeeze().float(),
            )

        def _prepare_loader(dataset, shuffle):
            return th_data.DataLoader(
                dataset,
                batch_size=self.opt_config["batch_size"],
                drop_last=False,
                pin_memory=True,
                num_workers=self.opt_config["num_workers"],
                shuffle=shuffle,
            )

        df_train, df_valid, df_test = dataset.prepare(
            ["train", "valid", "test"],
            col_set=["feature", "label"],
            data_key=DataHandlerLP.DK_L,
        )
        train_dataset, valid_dataset, test_dataset = (
            _prepare_dataset(df_train),
            _prepare_dataset(df_valid),
            _prepare_dataset(df_test),
        )
        train_loader, valid_loader, test_loader = (
            _prepare_loader(train_dataset, True),
            _prepare_loader(valid_dataset, False),
            _prepare_loader(test_dataset, False),
        )

        save_dir = get_or_create_path(save_dir, return_dir=True)
        self.logger.info("Fit procedure for [{:}] with save path={:}".format(
            self.__class__.__name__, save_dir))

        def _internal_test(ckp_epoch=None, results_dict=None):
            with torch.no_grad():
                train_loss, train_score = self.train_or_test_epoch(
                    train_loader, self.model, self.loss_fn, self.metric_fn,
                    False, None)
                valid_loss, valid_score = self.train_or_test_epoch(
                    valid_loader, self.model, self.loss_fn, self.metric_fn,
                    False, None)
                test_loss, test_score = self.train_or_test_epoch(
                    test_loader, self.model, self.loss_fn, self.metric_fn,
                    False, None)
                xstr = (
                    "train-score={:.6f}, valid-score={:.6f}, test-score={:.6f}"
                    .format(train_score, valid_score, test_score))
                if ckp_epoch is not None and isinstance(results_dict, dict):
                    results_dict["train"][ckp_epoch] = train_score
                    results_dict["valid"][ckp_epoch] = valid_score
                    results_dict["test"][ckp_epoch] = test_score
                return dict(train=train_score,
                            valid=valid_score,
                            test=test_score), xstr

        # Pre-fetch the potential checkpoints
        ckp_path = os.path.join(save_dir,
                                "{:}.pth".format(self.__class__.__name__))
        if os.path.exists(ckp_path):
            ckp_data = torch.load(ckp_path, map_location=self.device)
            stop_steps, best_score, best_epoch = (
                ckp_data["stop_steps"],
                ckp_data["best_score"],
                ckp_data["best_epoch"],
            )
            start_epoch, best_param = ckp_data["start_epoch"], ckp_data[
                "best_param"]
            results_dict = ckp_data["results_dict"]
            self.model.load_state_dict(ckp_data["net_state_dict"])
            self.train_optimizer.load_state_dict(ckp_data["opt_state_dict"])
            self.logger.info(
                "Resume from existing checkpoint: {:}".format(ckp_path))
        else:
            stop_steps, best_score, best_epoch = 0, -np.inf, -1
            start_epoch, best_param = 0, None
            results_dict = dict(train=OrderedDict(),
                                valid=OrderedDict(),
                                test=OrderedDict())
            _, eval_str = _internal_test(-1, results_dict)
            self.logger.info(
                "Training from scratch, metrics@start: {:}".format(eval_str))

        for iepoch in range(start_epoch, self.opt_config["epochs"]):
            self.logger.info(
                "Epoch={:03d}/{:03d} ::==>> Best valid @{:03d} ({:.6f})".
                format(iepoch, self.opt_config["epochs"], best_epoch,
                       best_score))
            train_loss, train_score = self.train_or_test_epoch(
                train_loader,
                self.model,
                self.loss_fn,
                self.metric_fn,
                True,
                self.train_optimizer,
            )
            self.logger.info("Training :: loss={:.6f}, score={:.6f}".format(
                train_loss, train_score))

            current_eval_scores, eval_str = _internal_test(
                iepoch, results_dict)
            self.logger.info("Evaluating :: {:}".format(eval_str))

            if current_eval_scores["valid"] > best_score:
                stop_steps, best_epoch, best_score = (
                    0,
                    iepoch,
                    current_eval_scores["valid"],
                )
                best_param = copy.deepcopy(self.model.state_dict())
            else:
                stop_steps += 1
                if stop_steps >= self.opt_config["early_stop"]:
                    self.logger.info(
                        "early stop at {:}-th epoch, where the best is @{:}".
                        format(iepoch, best_epoch))
                    break
            save_info = dict(
                net_config=self.net_config,
                opt_config=self.opt_config,
                net_state_dict=self.model.state_dict(),
                opt_state_dict=self.train_optimizer.state_dict(),
                best_param=best_param,
                stop_steps=stop_steps,
                best_score=best_score,
                best_epoch=best_epoch,
                results_dict=results_dict,
                start_epoch=iepoch + 1,
            )
            torch.save(save_info, ckp_path)
        self.logger.info("The best score: {:.6f} @ {:02d}-th epoch".format(
            best_score, best_epoch))
        self.model.load_state_dict(best_param)
        _, eval_str = _internal_test("final", results_dict)
        self.logger.info("Reload the best parameter :: {:}".format(eval_str))

        if self.use_gpu:
            torch.cuda.empty_cache()
        self.fitted = True
Example #12
    def setup(self, trainer=TrainerR, trainer_kwargs={}):
        """
        After running this function, `self.data_ic_df` will be set.
        Each column represents a data segment.
        Each row represents the timestamp at which that segment's performance is evaluated.
        For example,

        .. code-block:: python

                       2021-06-21 2021-06-04 2021-05-21 2021-05-07 2021-04-20 2021-04-06 2021-03-22 2021-03-08  ...
                       2021-07-02 2021-06-18 2021-06-03 2021-05-20 2021-05-06 2021-04-19 2021-04-02 2021-03-19  ...
            datetime                                                                                            ...
            2018-01-02   0.079782   0.115975   0.070866   0.028849  -0.081170   0.140380   0.063864   0.110987  ...
            2018-01-03   0.123386   0.107789   0.071037   0.045278  -0.060782   0.167446   0.089779   0.124476  ...
            2018-01-04   0.140775   0.097206   0.063702   0.042415  -0.078164   0.173218   0.098914   0.114389  ...
            2018-01-05   0.030320  -0.037209  -0.044536  -0.047267  -0.081888   0.045648   0.059947   0.047652  ...
            2018-01-08   0.107201   0.009219  -0.015995  -0.036594  -0.086633   0.108965   0.122164   0.108508  ...
            ...               ...        ...        ...        ...        ...        ...        ...        ...  ...

        """

        # 1) prepare the prediction of proxy models
        perf_task_tpl = deepcopy(
            self.task_tpl
        )  # this task is supposed to contain no complicated objects

        trainer = auto_filter_kwargs(trainer)(experiment_name=self.exp_name,
                                              **trainer_kwargs)
        # NOTE:
        # The handler is initialized only once.
        if not trainer.has_worker():
            self.dh = init_task_handler(perf_task_tpl)
        else:
            self.dh = init_instance_by_config(
                perf_task_tpl["dataset"]["kwargs"]["handler"])

        seg = perf_task_tpl["dataset"]["kwargs"]["segments"]

        # We want to split the training time period into small segments.
        perf_task_tpl["dataset"]["kwargs"]["segments"] = {
            "train": (DatasetH.get_min_time(seg), DatasetH.get_max_time(seg)),
            "test": (None, None),
        }

        # NOTE:
        # we play a trick here
        # treat the training segments as test to create the rolling tasks
        rg = RollingGen(step=self.step,
                        test_key="train",
                        train_key=None,
                        task_copy_func=deepcopy_basic_type)
        gen_task = task_generator(perf_task_tpl, [rg])

        recorders = R.list_recorders(experiment_name=self.exp_name)
        if len(gen_task) == len(recorders):
            get_module_logger("Internal Data").info(
                "the data has been initialized")
        else:
            # train new models
            assert 0 == len(
                recorders
            ), "An empty experiment is required to set up `InternalData`"
            trainer.train(gen_task)

        # 2) extract the similarity matrix
        label_df = self.dh.fetch(col_set="label")
        recorders = R.list_recorders(experiment_name=self.exp_name)

        key_l = []
        ic_l = []
        for _, rec in tqdm(recorders.items(), desc="calc"):
            pred = rec.load_object("pred.pkl")
            task = rec.load_object("task")
            data_key = task["dataset"]["kwargs"]["segments"]["train"]
            key_l.append(data_key)
            ic_l.append(
                delayed(self._calc_perf)(pred.iloc[:, 0], label_df.iloc[:, 0]))

        ic_l = Parallel(n_jobs=-1)(ic_l)
        self.data_ic_df = pd.DataFrame(dict(zip(key_l, ic_l)))
        self.data_ic_df = self.data_ic_df.sort_index().sort_index(axis=1)

        del self.dh  # handler is not useful now
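The per-segment performance collected into `data_ic_df` is an IC-style metric. A hedged guess at what `self._calc_perf` could compute is the daily rank correlation between a recorder's predictions and the labels; the function below is an illustration under that assumption, not the method's actual definition.

import pandas as pd

def calc_daily_ic(pred: pd.Series, label: pd.Series) -> pd.Series:
    """Spearman correlation between predictions and labels, grouped by the datetime level."""
    df = pd.DataFrame({"pred": pred, "label": label}).dropna()
    return df.groupby(level="datetime").apply(
        lambda day: day["pred"].corr(day["label"], method="spearman")
    )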