Example #1
    def fit(
        self,
        dataset: DatasetH,
        evals_result=dict(),
        save_path=None,
    ):
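        # Fit the market-return thresholds on raw (DK_R) training labels
        # before loading the processed (DK_L) features.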
        label_train, label_valid = dataset.prepare(
            ["train", "valid"],
            col_set=["label"],
            data_key=DataHandlerLP.DK_R,
        )
        self.fit_thresh(label_train)
        df_train, df_valid = dataset.prepare(
            ["train", "valid"],
            col_set=["feature", "label"],
            data_key=DataHandlerLP.DK_L,
        )
        df_train = self.gen_market_label(df_train, label_train)
        df_valid = self.gen_market_label(df_valid, label_valid)

        x_train, y_train, m_train = df_train["feature"], df_train["label"], df_train["market_return"]
        x_valid, y_valid, m_valid = df_valid["feature"], df_valid["label"], df_valid["market_return"]

        evals_result["train"] = []
        evals_result["valid"] = []
        # Load the pretrained base model

        if self.base_model == "LSTM":
            pretrained_model = LSTMModel()
        elif self.base_model == "GRU":
            pretrained_model = GRUModel()
        else:
            raise ValueError("unknown base model name `%s`" % self.base_model)

        if self.model_path is not None:
            self.logger.info("Loading pretrained model...")
            pretrained_model.load_state_dict(torch.load(self.model_path, map_location=self.device))

            # Initialize both the excess-return and market-return encoders from
            # the pretrained RNN's weights (only matching keys are copied).
            model_dict = self.ADD_model.enc_excess.state_dict()
            pretrained_dict = {k: v for k, v in pretrained_model.rnn.state_dict().items() if k in model_dict}
            model_dict.update(pretrained_dict)
            self.ADD_model.enc_excess.load_state_dict(model_dict)
            model_dict = self.ADD_model.enc_market.state_dict()
            pretrained_dict = {k: v for k, v in pretrained_model.rnn.state_dict().items() if k in model_dict}
            model_dict.update(pretrained_dict)
            self.ADD_model.enc_market.load_state_dict(model_dict)
            self.logger.info("Loading pretrained model Done...")

        self.bootstrap_fit(x_train, y_train, m_train, x_valid, y_valid, m_valid)

        best_param = copy.deepcopy(self.ADD_model.state_dict())
        save_path = get_or_create_path(save_path)
        torch.save(best_param, save_path)
        if self.use_gpu:
            torch.cuda.empty_cache()
Example #2
 def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
     if not self.fitted:
         raise ValueError("model is not fitted yet!")
     x_test = dataset.prepare(segment,
                              col_set="feature",
                              data_key=DataHandlerLP.DK_I)
     return self.infer(x_test)
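All of these fit/predict methods are driven the same way from a Qlib workflow. Below is a minimal, hedged sketch of that driver code; the provider URI, the Alpha158 handler, the instrument universe, and the segment dates are illustrative assumptions rather than values taken from the examples above.

    import qlib
    from qlib.utils import init_instance_by_config

    qlib.init(provider_uri="~/.qlib/qlib_data/cn_data")  # assumed data location
    dataset = init_instance_by_config({
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": {
                "class": "Alpha158",  # assumed handler
                "module_path": "qlib.contrib.data.handler",
                "kwargs": {"instruments": "csi300"},
            },
            "segments": {  # illustrative date ranges
                "train": ("2008-01-01", "2014-12-31"),
                "valid": ("2015-01-01", "2016-12-31"),
                "test": ("2017-01-01", "2020-08-01"),
            },
        },
    })
    # `model` stands for any of the model objects whose methods are excerpted here.
    model.fit(dataset)
    pred = model.predict(dataset, "test")  # pd.Series indexed by (datetime, instrument)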
Example #3
    def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
        if not self.fitted:
            raise ValueError("The model is not fitted yet!")
        x_test = dataset.prepare(segment,
                                 col_set="feature",
                                 data_key=DataHandlerLP.DK_I)
        index = x_test.index

        with torch.no_grad():
            self.model.eval()
            x_values = x_test.values
            sample_num = x_values.shape[0]
            batch_size = self.opt_config["batch_size"]
            preds = []
            # Predict in mini-batches; the final batch may be smaller.
            for begin in range(0, sample_num, batch_size):
                end = min(begin + batch_size, sample_num)
                x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device)
                pred = self.model(x_batch).detach().cpu().numpy()
                preds.append(pred)
        return pd.Series(np.concatenate(preds), index=index)
Example #4
    def _prepare_data(self, dataset: DatasetH):
        df_train, df_valid = dataset.prepare(["train", "valid"],
                                             col_set=["feature", "label"],
                                             data_key=DataHandlerLP.DK_L)

        x_train, y_train = df_train["feature"], df_train["label"]
        x_valid, y_valid = df_valid["feature"], df_valid["label"]
        if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
            l_name = df_train["label"].columns[0]
            # Convert label into alpha: subtract the daily cross-sectional
            # mean (index level 0 is the datetime).
            df_train["label"][l_name] = df_train["label"][l_name] - df_train[
                "label"][l_name].mean(level=0)
            df_valid["label"][l_name] = df_valid["label"][l_name] - df_valid[
                "label"][l_name].mean(level=0)
            # Binarize the alpha: 1 at or above the daily mean, 0 below it.
            mapping_fn = lambda x: 0 if x < 0 else 1
            df_train["label_c"] = df_train["label"][l_name].apply(mapping_fn)
            df_valid["label_c"] = df_valid["label"][l_name].apply(mapping_fn)
            x_train, y_train = df_train["feature"], df_train["label_c"].values
            x_valid, y_valid = df_valid["feature"], df_valid["label_c"].values
        else:
            raise ValueError("LightGBM doesn't support multi-label training")

        dtrain = lgb.Dataset(x_train.values, label=y_train)
        dvalid = lgb.Dataset(x_valid.values, label=y_valid)
        return dtrain, dvalid
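The `(dtrain, dvalid)` pair returned here plugs straight into LightGBM's training API. A short sketch of that downstream call, written as if inside the same model's fit(); the params dict and `num_boost_round` are illustrative assumptions, not taken from this example.

    import lightgbm as lgb

    dtrain, dvalid = self._prepare_data(dataset)
    booster = lgb.train(
        {"objective": "binary"},  # illustrative; the labels built above are 0/1
        dtrain,
        num_boost_round=100,
        valid_sets=[dtrain, dvalid],
        valid_names=["train", "valid"],
    )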
Example #5
    def hf_signal_test(self, dataset: DatasetH, threshold=0.2):
        """
        Test the signal on the high-frequency test set.
        """
        if self.model is None:
            raise ValueError("Model hasn't been trained yet")
        df_test = dataset.prepare("test",
                                  col_set=["feature", "label"],
                                  data_key=DataHandlerLP.DK_I)
        df_test.dropna(inplace=True)
        x_test, y_test = df_test["feature"], df_test["label"]
        # Convert label into alpha
        y_test[y_test.columns[0]] = y_test[y_test.columns[0]] - y_test[
            y_test.columns[0]].mean(level=0)

        res = pd.Series(self.model.predict(x_test.values), index=x_test.index)
        y_test["pred"] = res

        up_p, down_p, up_a, down_a = self._cal_signal_metrics(
            y_test, threshold, 1 - threshold)
        print("===============================")
        print("High frequency signal test")
        print("===============================")
        print("Test set precision: ")
        print("Positive precision: {}, Negative precision: {}".format(
            up_p, down_p))
        print("Test Alpha Average in test set: ")
        print("Positive average alpha: {}, Negative average alpha: {}".format(
            up_a, down_a))
Example #6
    def fit(self, dataset: DatasetH):
        def _prepare_dataset(df_data):
            features = df_data["feature"].values
            features = self.process_data(features)
            labels = df_data["label"].values.squeeze()
            return dict(features=features, labels=labels)

        df_train, df_valid, df_test = dataset.prepare(
            ["train", "valid", "test"],
            col_set=["feature", "label"],
            data_key=DataHandlerLP.DK_L,
        )
        train_dataset, valid_dataset, test_dataset = (
            _prepare_dataset(df_train),
            _prepare_dataset(df_valid),
            _prepare_dataset(df_test),
        )
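        # Report the MSE of the current model on the train and validation splits.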
        train_mse_loss = self.mse(self.model(train_dataset["features"]),
                                  train_dataset["labels"])
        valid_mse_loss = self.mse(self.model(valid_dataset["features"]),
                                  valid_dataset["labels"])
        self.logger.info("Training MSE loss: {:}".format(train_mse_loss))
        self.logger.info("Validation MSE loss: {:}".format(valid_mse_loss))
        self.fitted = True
Example #7
    def fit(
            self,
            dataset: DatasetH,
            evals_result=dict(),
            verbose=True,
            save_path=None,
    ):

        df_train, df_valid, df_test = dataset.prepare(
            ["train", "valid", "test"],
            col_set=["feature", "label"],
            data_key=DataHandlerLP.DK_L,
        )

        x_train, y_train = df_train["feature"], df_train["label"]
        x_valid, y_valid = df_valid["feature"], df_valid["label"]

        if save_path is None:
            save_path = create_save_path(save_path)
        stop_steps = 0
        train_loss = 0
        best_score = -np.inf
        best_epoch = 0
        evals_result["train"] = []
        evals_result["valid"] = []

        # train
        self.logger.info("training...")
        self.fitted = True

        for step in range(self.n_epochs):
            self.logger.info("Epoch%d:", step)
            self.logger.info("training...")
            self.train_epoch(x_train, y_train)
            self.logger.info("evaluating...")
            train_loss, train_score = self.test_epoch(x_train, y_train)
            val_loss, val_score = self.test_epoch(x_valid, y_valid)
            self.logger.info("train %.6f, valid %.6f" %
                             (train_score, val_score))
            evals_result["train"].append(train_score)
            evals_result["valid"].append(val_score)

            if val_score > best_score:
                best_score = val_score
                stop_steps = 0
                best_epoch = step
                best_param = copy.deepcopy(self.model.state_dict())
            else:
                stop_steps += 1
                if stop_steps >= self.early_stop:
                    self.logger.info("early stop")
                    break

        self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
        self.model.load_state_dict(best_param)
        torch.save(best_param, save_path)

        if self.use_gpu:
            torch.cuda.empty_cache()
Example #8
    def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
        x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
        index = x_test.index
        self.ADD_model.eval()
        x_values = x_test.values
        preds = []

        # Iterate over trading days so each batch contains one day's stocks.
        daily_index, daily_count = self.get_daily_inter(x_test, shuffle=False)

        for idx, count in zip(daily_index, daily_count):
            batch = slice(idx, idx + count)
            x_batch = torch.from_numpy(x_values[batch]).float().to(self.device)

            with torch.no_grad():
                # ADD returns a dict of outputs; the "excess" head is the
                # stock-specific (excess-return) prediction.
                pred = self.ADD_model(x_batch)
                pred = pred["excess"].detach().cpu().numpy()

            preds.append(pred)

        r = pd.Series(np.concatenate(preds), index=index)
        return r
Example #9
 def _prepare_data(self, dataset: DatasetH):
     df_train, df_valid = dataset.prepare(
         ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
     )
     return transform_df(df_train), transform_df(df_valid)
Example #10
    def fit(
            self,
            dataset: DatasetH,
            evals_result=dict(),
            save_path=None,
    ):

        df_train, df_valid = dataset.prepare(
            ["train", "valid"],
            col_set=["feature", "label"],
            data_key=DataHandlerLP.DK_L,
        )
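        # Split training days into n_splits contiguous temporal segments;
        # AdaRNN trains on one loader per segment.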
        days = df_train.index.get_level_values(level=0).unique()
        train_splits = np.array_split(days, self.n_splits)
        train_splits = [df_train[s[0]:s[-1]] for s in train_splits]
        train_loader_list = [
            get_stock_loader(df, self.batch_size) for df in train_splits
        ]

        save_path = get_or_create_path(save_path)
        stop_steps = 0
        best_score = -np.inf
        best_epoch = 0
        evals_result["train"] = []
        evals_result["valid"] = []

        # train
        self.logger.info("training...")
        self.fitted = True
        # weight_mat / dist_mat carry AdaRNN's distribution-matching state
        # across epochs.
        weight_mat, dist_mat = None, None

        for step in range(self.n_epochs):
            self.logger.info("Epoch%d:", step)
            self.logger.info("training...")
            weight_mat, dist_mat = self.train_AdaRNN(train_loader_list, step,
                                                     dist_mat, weight_mat)
            self.logger.info("evaluating...")
            train_metrics = self.test_epoch(df_train)
            valid_metrics = self.test_epoch(df_valid)
            self.log_metrics("train: ", train_metrics)
            self.log_metrics("valid: ", valid_metrics)

            valid_score = valid_metrics[self.metric]
            train_score = train_metrics[self.metric]
            evals_result["train"].append(train_score)
            evals_result["valid"].append(valid_score)
            if valid_score > best_score:
                best_score = valid_score
                stop_steps = 0
                best_epoch = step
                best_param = copy.deepcopy(self.model.state_dict())
            else:
                stop_steps += 1
                if stop_steps >= self.early_stop:
                    self.logger.info("early stop")
                    break

        self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
        self.model.load_state_dict(best_param)
        torch.save(best_param, save_path)

        if self.use_gpu:
            torch.cuda.empty_cache()
        return best_score
Example #11
    def fit(
        self,
        dataset: DatasetH,
        save_dir: Optional[Text] = None,
    ):
        def _prepare_dataset(df_data):
            return th_data.TensorDataset(
                torch.from_numpy(df_data["feature"].values).float(),
                torch.from_numpy(df_data["label"].values).squeeze().float(),
            )

        def _prepare_loader(dataset, shuffle):
            return th_data.DataLoader(
                dataset,
                batch_size=self.opt_config["batch_size"],
                drop_last=False,
                pin_memory=True,
                num_workers=self.opt_config["num_workers"],
                shuffle=shuffle,
            )

        df_train, df_valid, df_test = dataset.prepare(
            ["train", "valid", "test"],
            col_set=["feature", "label"],
            data_key=DataHandlerLP.DK_L,
        )
        train_dataset, valid_dataset, test_dataset = (
            _prepare_dataset(df_train),
            _prepare_dataset(df_valid),
            _prepare_dataset(df_test),
        )
        train_loader, valid_loader, test_loader = (
            _prepare_loader(train_dataset, True),
            _prepare_loader(valid_dataset, False),
            _prepare_loader(test_dataset, False),
        )

        save_dir = get_or_create_path(save_dir, return_dir=True)
        self.logger.info("Fit procedure for [{:}] with save path={:}".format(
            self.__class__.__name__, save_dir))

        # Evaluate the current weights on all three splits without gradients,
        # optionally recording the scores under the given checkpoint epoch.
        def _internal_test(ckp_epoch=None, results_dict=None):
            with torch.no_grad():
                train_loss, train_score = self.train_or_test_epoch(
                    train_loader, self.model, self.loss_fn, self.metric_fn,
                    False, None)
                valid_loss, valid_score = self.train_or_test_epoch(
                    valid_loader, self.model, self.loss_fn, self.metric_fn,
                    False, None)
                test_loss, test_score = self.train_or_test_epoch(
                    test_loader, self.model, self.loss_fn, self.metric_fn,
                    False, None)
                xstr = (
                    "train-score={:.6f}, valid-score={:.6f}, test-score={:.6f}"
                    .format(train_score, valid_score, test_score))
                if ckp_epoch is not None and isinstance(results_dict, dict):
                    results_dict["train"][ckp_epoch] = train_score
                    results_dict["valid"][ckp_epoch] = valid_score
                    results_dict["test"][ckp_epoch] = test_score
                return dict(train=train_score,
                            valid=valid_score,
                            test=test_score), xstr

        # Pre-fetch the potential checkpoints
        ckp_path = os.path.join(save_dir,
                                "{:}.pth".format(self.__class__.__name__))
        if os.path.exists(ckp_path):
            ckp_data = torch.load(ckp_path, map_location=self.device)
            stop_steps, best_score, best_epoch = (
                ckp_data["stop_steps"],
                ckp_data["best_score"],
                ckp_data["best_epoch"],
            )
            start_epoch, best_param = ckp_data["start_epoch"], ckp_data[
                "best_param"]
            results_dict = ckp_data["results_dict"]
            self.model.load_state_dict(ckp_data["net_state_dict"])
            self.train_optimizer.load_state_dict(ckp_data["opt_state_dict"])
            self.logger.info(
                "Resume from existing checkpoint: {:}".format(ckp_path))
        else:
            stop_steps, best_score, best_epoch = 0, -np.inf, -1
            start_epoch, best_param = 0, None
            results_dict = dict(train=OrderedDict(),
                                valid=OrderedDict(),
                                test=OrderedDict())
            _, eval_str = _internal_test(-1, results_dict)
            self.logger.info(
                "Training from scratch, metrics@start: {:}".format(eval_str))

        # Main training loop; start_epoch > 0 when resuming from a checkpoint.
        for iepoch in range(start_epoch, self.opt_config["epochs"]):
            self.logger.info(
                "Epoch={:03d}/{:03d} ::==>> Best valid @{:03d} ({:.6f})".
                format(iepoch, self.opt_config["epochs"], best_epoch,
                       best_score))
            train_loss, train_score = self.train_or_test_epoch(
                train_loader,
                self.model,
                self.loss_fn,
                self.metric_fn,
                True,
                self.train_optimizer,
            )
            self.logger.info("Training :: loss={:.6f}, score={:.6f}".format(
                train_loss, train_score))

            current_eval_scores, eval_str = _internal_test(
                iepoch, results_dict)
            self.logger.info("Evaluating :: {:}".format(eval_str))

            if current_eval_scores["valid"] > best_score:
                stop_steps, best_epoch, best_score = (
                    0,
                    iepoch,
                    current_eval_scores["valid"],
                )
                best_param = copy.deepcopy(self.model.state_dict())
            else:
                stop_steps += 1
                if stop_steps >= self.opt_config["early_stop"]:
                    self.logger.info(
                        "early stop at {:}-th epoch, where the best is @{:}".
                        format(iepoch, best_epoch))
                    break
            # Persist a fully resumable checkpoint (weights, optimizer state,
            # and early-stopping bookkeeping) at the end of every epoch.
            save_info = dict(
                net_config=self.net_config,
                opt_config=self.opt_config,
                net_state_dict=self.model.state_dict(),
                opt_state_dict=self.train_optimizer.state_dict(),
                best_param=best_param,
                stop_steps=stop_steps,
                best_score=best_score,
                best_epoch=best_epoch,
                results_dict=results_dict,
                start_epoch=iepoch + 1,
            )
            torch.save(save_info, ckp_path)
        self.logger.info("The best score: {:.6f} @ {:02d}-th epoch".format(
            best_score, best_epoch))
        self.model.load_state_dict(best_param)
        _, eval_str = _internal_test("final", results_dict)
        self.logger.info("Reload the best parameter :: {:}".format(eval_str))

        if self.use_gpu:
            torch.cuda.empty_cache()
        self.fitted = True