def fit(
    self,
    dataset: DatasetH,
    evals_result=dict(),
    save_path=None,
):
    label_train, label_valid = dataset.prepare(
        ["train", "valid"],
        col_set=["label"],
        data_key=DataHandlerLP.DK_R,
    )
    self.fit_thresh(label_train)

    df_train, df_valid = dataset.prepare(
        ["train", "valid"],
        col_set=["feature", "label"],
        data_key=DataHandlerLP.DK_L,
    )
    df_train = self.gen_market_label(df_train, label_train)
    df_valid = self.gen_market_label(df_valid, label_valid)

    x_train, y_train, m_train = df_train["feature"], df_train["label"], df_train["market_return"]
    x_valid, y_valid, m_valid = df_valid["feature"], df_valid["label"], df_valid["market_return"]

    evals_result["train"] = []
    evals_result["valid"] = []

    # load pretrained base_model
    if self.base_model == "LSTM":
        pretrained_model = LSTMModel()
    elif self.base_model == "GRU":
        pretrained_model = GRUModel()
    else:
        raise ValueError("unknown base model name `%s`" % self.base_model)

    if self.model_path is not None:
        self.logger.info("Loading pretrained model...")
        pretrained_model.load_state_dict(torch.load(self.model_path, map_location=self.device))

        # copy the pretrained RNN weights into both encoders, keeping only
        # the parameters whose names match the target state_dict
        model_dict = self.ADD_model.enc_excess.state_dict()
        pretrained_dict = {k: v for k, v in pretrained_model.rnn.state_dict().items() if k in model_dict}
        model_dict.update(pretrained_dict)
        self.ADD_model.enc_excess.load_state_dict(model_dict)

        model_dict = self.ADD_model.enc_market.state_dict()
        pretrained_dict = {k: v for k, v in pretrained_model.rnn.state_dict().items() if k in model_dict}
        model_dict.update(pretrained_dict)
        self.ADD_model.enc_market.load_state_dict(model_dict)
        self.logger.info("Loading pretrained model Done...")

    self.bootstrap_fit(x_train, y_train, m_train, x_valid, y_valid, m_valid)

    best_param = copy.deepcopy(self.ADD_model.state_dict())
    save_path = get_or_create_path(save_path)
    torch.save(best_param, save_path)
    if self.use_gpu:
        torch.cuda.empty_cache()
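# A minimal, self-contained sketch of the partial weight-transfer pattern used
# above: load only the tensors whose names also appear in the target module's
# state_dict, leaving everything else at its fresh initialization. The two GRU
# modules are hypothetical stand-ins for `pretrained_model.rnn` and
# `self.ADD_model.enc_excess`.
import torch.nn as nn

pretrained_rnn = nn.GRU(input_size=6, hidden_size=64, num_layers=2, batch_first=True)
target_encoder = nn.GRU(input_size=6, hidden_size=64, num_layers=2, batch_first=True)

model_dict = target_encoder.state_dict()
pretrained_dict = {k: v for k, v in pretrained_rnn.state_dict().items() if k in model_dict}
model_dict.update(pretrained_dict)          # overwrite matching entries only
target_encoder.load_state_dict(model_dict)  # non-matching keys keep their init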
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) return self.infer(x_test)
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("The model is not fitted yet!") x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) index = x_test.index with torch.no_grad(): self.model.eval() x_values = x_test.values sample_num, batch_size = x_values.shape[0], self.opt_config[ "batch_size"] preds = [] for begin in range(sample_num)[::batch_size]: if sample_num - begin < batch_size: end = sample_num else: end = begin + batch_size x_batch = torch.from_numpy(x_values[begin:end]).float().to( self.device) with torch.no_grad(): pred = self.model(x_batch).detach().cpu().numpy() preds.append(pred) return pd.Series(np.concatenate(preds), index=index)
def _prepare_data(self, dataset: DatasetH): df_train, df_valid = dataset.prepare(["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) x_train, y_train = df_train["feature"], df_train["label"] x_valid, y_valid = df_train["feature"], df_valid["label"] if y_train.values.ndim == 2 and y_train.values.shape[1] == 1: l_name = df_train["label"].columns[0] # Convert label into alpha df_train["label"][l_name] = df_train["label"][l_name] - df_train[ "label"][l_name].mean(level=0) df_valid["label"][l_name] = df_valid["label"][l_name] - df_valid[ "label"][l_name].mean(level=0) mapping_fn = lambda x: 0 if x < 0 else 1 df_train["label_c"] = df_train["label"][l_name].apply(mapping_fn) df_valid["label_c"] = df_valid["label"][l_name].apply(mapping_fn) x_train, y_train = df_train["feature"], df_train["label_c"].values x_valid, y_valid = df_valid["feature"], df_valid["label_c"].values else: raise ValueError("LightGBM doesn't support multi-label training") dtrain = lgb.Dataset(x_train.values, label=y_train) dvalid = lgb.Dataset(x_valid.values, label=y_valid) return dtrain, dvalid
def hf_signal_test(self, dataset: DatasetH, threshold=0.2):
    """
    Test the signal on the high frequency test set
    """
    if self.model is None:
        raise ValueError("Model hasn't been trained yet")
    df_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
    df_test.dropna(inplace=True)
    x_test, y_test = df_test["feature"], df_test["label"]
    # Convert label into alpha
    y_test[y_test.columns[0]] = y_test[y_test.columns[0]] - y_test[y_test.columns[0]].mean(level=0)
    res = pd.Series(self.model.predict(x_test.values), index=x_test.index)
    y_test["pred"] = res
    up_p, down_p, up_a, down_a = self._cal_signal_metrics(y_test, threshold, 1 - threshold)
    print("===============================")
    print("High frequency signal test")
    print("===============================")
    print("Test set precision: ")
    print("Positive precision: {}, Negative precision: {}".format(up_p, down_p))
    print("Test Alpha Average in test set: ")
    print("Positive average alpha: {}, Negative average alpha: {}".format(up_a, down_a))
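# `_cal_signal_metrics` is not shown in this file; the sketch below is an
# assumed reconstruction of what the caller above needs from it: per-day
# precision and average realized alpha for the bottom (short) and top (long)
# prediction quantiles, averaged over all test days. The name and the exact
# cut handling are hypothetical.
import numpy as np

def _cal_signal_metrics_sketch(y_test, l_cut, r_cut):
    label_col = y_test.columns[0]
    up_p, down_p, up_a, down_a = [], [], [], []
    for date in y_test.index.get_level_values(0).unique():
        df_day = y_test.loc[date].sort_values("pred")
        n = len(df_day)
        down, up = df_day.iloc[: int(n * l_cut)], df_day.iloc[int(n * r_cut):]
        up_p.append((up[label_col] > 0).mean())      # positive precision
        down_p.append((down[label_col] < 0).mean())  # negative precision
        up_a.append(up[label_col].mean())
        down_a.append(down[label_col].mean())
    return np.mean(up_p), np.mean(down_p), np.mean(up_a), np.mean(down_a)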
def fit(self, dataset: DatasetH):
    def _prepare_dataset(df_data):
        features = df_data["feature"].values
        features = self.process_data(features)
        labels = df_data["label"].values.squeeze()
        return dict(features=features, labels=labels)

    df_train, df_valid, df_test = dataset.prepare(
        ["train", "valid", "test"],
        col_set=["feature", "label"],
        data_key=DataHandlerLP.DK_L,
    )
    train_dataset, valid_dataset, test_dataset = (
        _prepare_dataset(df_train),
        _prepare_dataset(df_valid),
        _prepare_dataset(df_test),
    )
    # e.g., df_train["feature"]["CLOSE1"].values corresponds to train_dataset["features"][:, -1]
    train_mse_loss = self.mse(self.model(train_dataset["features"]), train_dataset["labels"])
    valid_mse_loss = self.mse(self.model(valid_dataset["features"]), valid_dataset["labels"])
    self.logger.info("Training MSE loss: {:}".format(train_mse_loss))
    self.logger.info("Validation MSE loss: {:}".format(valid_mse_loss))
    self.fitted = True
def fit(
    self,
    dataset: DatasetH,
    evals_result=dict(),
    verbose=True,
    save_path=None,
):
    df_train, df_valid, df_test = dataset.prepare(
        ["train", "valid", "test"],
        col_set=["feature", "label"],
        data_key=DataHandlerLP.DK_L,
    )
    x_train, y_train = df_train["feature"], df_train["label"]
    x_valid, y_valid = df_valid["feature"], df_valid["label"]

    if save_path is None:
        save_path = create_save_path(save_path)

    stop_steps = 0
    train_loss = 0
    best_score = -np.inf
    best_epoch = 0
    evals_result["train"] = []
    evals_result["valid"] = []

    # train
    self.logger.info("training...")
    self.fitted = True

    for step in range(self.n_epochs):
        self.logger.info("Epoch%d:", step)
        self.logger.info("training...")
        self.train_epoch(x_train, y_train)
        self.logger.info("evaluating...")
        train_loss, train_score = self.test_epoch(x_train, y_train)
        val_loss, val_score = self.test_epoch(x_valid, y_valid)
        self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
        evals_result["train"].append(train_score)
        evals_result["valid"].append(val_score)

        if val_score > best_score:
            best_score = val_score
            stop_steps = 0
            best_epoch = step
            best_param = copy.deepcopy(self.model.state_dict())
        else:
            stop_steps += 1
            if stop_steps >= self.early_stop:
                self.logger.info("early stop")
                break

    self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
    self.model.load_state_dict(best_param)
    torch.save(best_param, save_path)

    if self.use_gpu:
        torch.cuda.empty_cache()
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) index = x_test.index self.ADD_model.eval() x_values = x_test.values preds = [] daily_index, daily_count = self.get_daily_inter(x_test, shuffle=False) for idx, count in zip(daily_index, daily_count): batch = slice(idx, idx + count) x_batch = torch.from_numpy(x_values[batch]).float().to(self.device) with torch.no_grad(): pred = self.ADD_model(x_batch) pred = pred["excess"].detach().cpu().numpy() preds.append(pred) r = pd.Series(np.concatenate(preds), index=index) return r
def _prepare_data(self, dataset: DatasetH): df_train, df_valid = dataset.prepare( ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L ) return transform_df(df_train), transform_df(df_valid)
def fit(
    self,
    dataset: DatasetH,
    evals_result=dict(),
    save_path=None,
):
    df_train, df_valid = dataset.prepare(
        ["train", "valid"],
        col_set=["feature", "label"],
        data_key=DataHandlerLP.DK_L,
    )

    # split the training days into n_splits consecutive segments,
    # one data loader per segment
    days = df_train.index.get_level_values(level=0).unique()
    train_splits = np.array_split(days, self.n_splits)
    train_splits = [df_train[s[0]:s[-1]] for s in train_splits]
    train_loader_list = [get_stock_loader(df, self.batch_size) for df in train_splits]

    save_path = get_or_create_path(save_path)
    stop_steps = 0
    best_score = -np.inf
    best_epoch = 0
    evals_result["train"] = []
    evals_result["valid"] = []

    # train
    self.logger.info("training...")
    self.fitted = True
    weight_mat, dist_mat = None, None

    for step in range(self.n_epochs):
        self.logger.info("Epoch%d:", step)
        self.logger.info("training...")
        weight_mat, dist_mat = self.train_AdaRNN(train_loader_list, step, dist_mat, weight_mat)
        self.logger.info("evaluating...")
        train_metrics = self.test_epoch(df_train)
        valid_metrics = self.test_epoch(df_valid)
        self.log_metrics("train: ", train_metrics)
        self.log_metrics("valid: ", valid_metrics)

        valid_score = valid_metrics[self.metric]
        train_score = train_metrics[self.metric]
        evals_result["train"].append(train_score)
        evals_result["valid"].append(valid_score)
        if valid_score > best_score:
            best_score = valid_score
            stop_steps = 0
            best_epoch = step
            best_param = copy.deepcopy(self.model.state_dict())
        else:
            stop_steps += 1
            if stop_steps >= self.early_stop:
                self.logger.info("early stop")
                break

    self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
    self.model.load_state_dict(best_param)
    torch.save(best_param, save_path)

    if self.use_gpu:
        torch.cuda.empty_cache()
    return best_score
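# Side note on `df_train[s[0]:s[-1]]` above: with a lexsorted
# (datetime, instrument) MultiIndex, slicing by a date range keeps every
# instrument row for the days in that segment. A toy illustration (the index
# and values are made up):
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product(
    [pd.date_range("2020-01-01", periods=6), ["A", "B"]],
    names=["datetime", "instrument"],
)
df = pd.DataFrame({"feature": np.arange(12)}, index=idx)

days = df.index.get_level_values(0).unique()
splits = np.array_split(days, 3)            # three consecutive day segments
parts = [df[s[0]:s[-1]] for s in splits]    # each part keeps all instruments
print([len(p) for p in parts])              # [4, 4, 4]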
def fit(
    self,
    dataset: DatasetH,
    save_dir: Optional[Text] = None,
):
    def _prepare_dataset(df_data):
        return th_data.TensorDataset(
            torch.from_numpy(df_data["feature"].values).float(),
            torch.from_numpy(df_data["label"].values).squeeze().float(),
        )

    def _prepare_loader(dataset, shuffle):
        return th_data.DataLoader(
            dataset,
            batch_size=self.opt_config["batch_size"],
            drop_last=False,
            pin_memory=True,
            num_workers=self.opt_config["num_workers"],
            shuffle=shuffle,
        )

    df_train, df_valid, df_test = dataset.prepare(
        ["train", "valid", "test"],
        col_set=["feature", "label"],
        data_key=DataHandlerLP.DK_L,
    )
    train_dataset, valid_dataset, test_dataset = (
        _prepare_dataset(df_train),
        _prepare_dataset(df_valid),
        _prepare_dataset(df_test),
    )
    train_loader, valid_loader, test_loader = (
        _prepare_loader(train_dataset, True),
        _prepare_loader(valid_dataset, False),
        _prepare_loader(test_dataset, False),
    )

    save_dir = get_or_create_path(save_dir, return_dir=True)
    self.logger.info("Fit procedure for [{:}] with save path={:}".format(self.__class__.__name__, save_dir))

    def _internal_test(ckp_epoch=None, results_dict=None):
        with torch.no_grad():
            train_loss, train_score = self.train_or_test_epoch(
                train_loader, self.model, self.loss_fn, self.metric_fn, False, None
            )
            valid_loss, valid_score = self.train_or_test_epoch(
                valid_loader, self.model, self.loss_fn, self.metric_fn, False, None
            )
            test_loss, test_score = self.train_or_test_epoch(
                test_loader, self.model, self.loss_fn, self.metric_fn, False, None
            )
            xstr = "train-score={:.6f}, valid-score={:.6f}, test-score={:.6f}".format(
                train_score, valid_score, test_score
            )
            if ckp_epoch is not None and isinstance(results_dict, dict):
                results_dict["train"][ckp_epoch] = train_score
                results_dict["valid"][ckp_epoch] = valid_score
                results_dict["test"][ckp_epoch] = test_score
            return dict(train=train_score, valid=valid_score, test=test_score), xstr

    # Pre-fetch the potential checkpoints
    ckp_path = os.path.join(save_dir, "{:}.pth".format(self.__class__.__name__))
    if os.path.exists(ckp_path):
        ckp_data = torch.load(ckp_path, map_location=self.device)
        stop_steps, best_score, best_epoch = (
            ckp_data["stop_steps"],
            ckp_data["best_score"],
            ckp_data["best_epoch"],
        )
        start_epoch, best_param = ckp_data["start_epoch"], ckp_data["best_param"]
        results_dict = ckp_data["results_dict"]
        self.model.load_state_dict(ckp_data["net_state_dict"])
        self.train_optimizer.load_state_dict(ckp_data["opt_state_dict"])
        self.logger.info("Resume from existing checkpoint: {:}".format(ckp_path))
    else:
        stop_steps, best_score, best_epoch = 0, -np.inf, -1
        start_epoch, best_param = 0, None
        results_dict = dict(train=OrderedDict(), valid=OrderedDict(), test=OrderedDict())
        _, eval_str = _internal_test(-1, results_dict)
        self.logger.info("Training from scratch, metrics@start: {:}".format(eval_str))

    for iepoch in range(start_epoch, self.opt_config["epochs"]):
        self.logger.info(
            "Epoch={:03d}/{:03d} ::==>> Best valid @{:03d} ({:.6f})".format(
                iepoch, self.opt_config["epochs"], best_epoch, best_score
            )
        )
        train_loss, train_score = self.train_or_test_epoch(
            train_loader, self.model, self.loss_fn, self.metric_fn, True, self.train_optimizer
        )
        self.logger.info("Training :: loss={:.6f}, score={:.6f}".format(train_loss, train_score))

        current_eval_scores, eval_str = _internal_test(iepoch, results_dict)
        self.logger.info("Evaluating :: {:}".format(eval_str))

        if current_eval_scores["valid"] > best_score:
            stop_steps, best_epoch, best_score = (
                0,
                iepoch,
                current_eval_scores["valid"],
            )
            best_param = copy.deepcopy(self.model.state_dict())
        else:
            stop_steps += 1
            if stop_steps >= self.opt_config["early_stop"]:
                self.logger.info(
                    "early stop at {:}-th epoch, where the best is @{:}".format(iepoch, best_epoch)
                )
                break

        save_info = dict(
            net_config=self.net_config,
            opt_config=self.opt_config,
            net_state_dict=self.model.state_dict(),
            opt_state_dict=self.train_optimizer.state_dict(),
            best_param=best_param,
            stop_steps=stop_steps,
            best_score=best_score,
            best_epoch=best_epoch,
            results_dict=results_dict,
            start_epoch=iepoch + 1,
        )
        torch.save(save_info, ckp_path)

    self.logger.info("The best score: {:.6f} @ {:02d}-th epoch".format(best_score, best_epoch))
    self.model.load_state_dict(best_param)
    _, eval_str = _internal_test("final", results_dict)
    self.logger.info("Reload the best parameter :: {:}".format(eval_str))
    if self.use_gpu:
        torch.cuda.empty_cache()
    self.fitted = True
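# A hedged end-to-end sketch of how `fit`/`predict` methods like the ones above
# are typically driven in a Qlib workflow. The provider_uri, the Alpha158
# handler, the segment dates, and the LGBModel stand-in are illustrative
# choices, not values taken from this file.
import qlib
from qlib.utils import init_instance_by_config

qlib.init(provider_uri="~/.qlib/qlib_data/cn_data")

dataset = init_instance_by_config({
    "class": "DatasetH",
    "module_path": "qlib.data.dataset",
    "kwargs": {
        "handler": {
            "class": "Alpha158",
            "module_path": "qlib.contrib.data.handler",
            "kwargs": {
                "start_time": "2008-01-01",
                "end_time": "2020-08-01",
                "fit_start_time": "2008-01-01",
                "fit_end_time": "2014-12-31",
                "instruments": "csi300",
            },
        },
        "segments": {
            "train": ("2008-01-01", "2014-12-31"),
            "valid": ("2015-01-01", "2016-12-31"),
            "test": ("2017-01-01", "2020-08-01"),
        },
    },
})
model = init_instance_by_config({"class": "LGBModel", "module_path": "qlib.contrib.model.gbdt"})
model.fit(dataset)
pred = model.predict(dataset)  # pd.Series indexed by (datetime, instrument)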