def forecast(self) -> Forecast:
    """Build an annualized forecast of mean return and standard deviation.

    A forecast-mode data loader is created for the organism's tickers and
    end date, the model obtained from ``self.get_model`` is run over it
    without gradients, and the two model outputs are concatenated into
    per-ticker series.

    The first output is scaled by ``YEAR_IN_TRADING_DAYS / forecast_days``
    and the second by the square root of that ratio.

    Returns:
        A ``Forecast`` holding the tickers, the end date, the history and
        forecast horizons taken from the phenotype, and the two scaled
        ``pandas.Series`` indexed by ticker.
    """
    data_genes = self._phenotype["data"]
    loader = data_loader.DescribedDataLoader(
        self._tickers,
        self._end,
        data_genes,
        data_params.ForecastParams,
    )
    # NOTE(review): the meaning of the second positional argument (False)
    # is defined by get_model, which is not visible here.
    model = self.get_model(loader, False)

    mean_chunks, std_chunks = [], []
    with torch.no_grad():
        model.eval()
        for batch in loader:
            batch_mean, batch_std = model(batch)
            mean_chunks.append(batch_mean)
            std_chunks.append(batch_std)

    flat_means = torch.cat(mean_chunks, dim=0).numpy().flatten()
    flat_stds = torch.cat(std_chunks, dim=0).numpy().flatten()

    forecast_days = data_genes["forecast_days"]
    history_days = data_genes["history_days"]
    annual_factor = YEAR_IN_TRADING_DAYS / forecast_days

    ticker_index = list(self._tickers)
    annual_mean = pd.Series(flat_means, index=ticker_index).mul(annual_factor)
    annual_std = pd.Series(flat_stds, index=ticker_index).mul(annual_factor ** 0.5)

    return Forecast(
        tickers=self._tickers,
        date=self._end,
        history_days=history_days,
        forecast_days=forecast_days,
        mean=annual_mean,
        std=annual_std,
    )
def make_data_loader():
    """Create a ``DescribedDataLoader`` fixture configured with ``TrainParams``.

    Uses the fixed ticker pair ``("MTSS", "BANE")``, the end date
    2020-03-20 and the module-level ``DATA_PARAMS``.
    """
    tickers = ("MTSS", "BANE")
    end_date = pd.Timestamp("2020-03-20")
    return data_loader.DescribedDataLoader(
        tickers,
        end_date,
        DATA_PARAMS,
        data_params.TrainParams,
    )
def _validate(self, model: nn.Module) -> None:
    """Run the model over the validation set and display its log-likelihood.

    A validation-mode data loader is built for the organism's tickers and
    end date. When the dataset holds fewer examples than there are tickers
    the validation is skipped with a console message.

    Otherwise the model is switched to eval mode and evaluated without
    gradients; the running ratio ``sum(-loss) / sum(weight)`` is shown in
    the progress bar. Nothing is returned - the result is only displayed.

    Args:
        model: The network to validate.
    """
    loader = data_loader.DescribedDataLoader(
        self._tickers, self._end, self._phenotype["data"], data_params.ValParams
    )
    if len(loader.dataset) // len(self._tickers) == 0:
        print("~~> Valid: skipped...")
        return

    loss_fn = normal_llh
    llh_sum = 0.0
    weight_sum = 0.0

    print(f"Val size - {len(loader.dataset)}")
    with torch.no_grad():
        model.eval()
        bar = tqdm.tqdm(loader, file=sys.stdout, desc="~~> Valid")
        for batch in bar:
            output = model(batch)
            loss, weight = loss_fn(output, batch)
            # The loss is a negative log-likelihood, so its sign is flipped.
            llh_sum += -loss.item()
            weight_sum += weight
            bar.set_postfix_str(f"{llh_sum / weight_sum:.5f}")
def _train_model(self) -> nn.Module:
    """Train a fresh model with AdamW and a one-cycle learning-rate schedule.

    The optimizer and scheduler settings come from the phenotype. The
    ``epochs`` entry of the scheduler settings is converted into a total
    number of steps (``1 + int(len(loader) * epochs)``), and the training
    loader is cycled until that many batches have been processed.

    The progress bar shows the log-likelihood averaged over a sliding
    window of the last ``int(total_steps ** 0.5)`` steps. After training
    the model is passed to ``self._validate``.

    Returns:
        The trained model.

    Raises:
        GradientsError: If the windowed log-likelihood is not greater than
            ``LOW_LLH`` (this also catches NaN).
    """
    genes = self._phenotype
    train_loader = data_loader.DescribedDataLoader(
        self._tickers, self._end, genes["data"], data_params.TrainParams
    )

    model = self._make_untrained_model(train_loader)
    optimizer = optim.AdamW(model.parameters(), **genes["optimizer"])

    # Copy the settings so that popping "epochs" leaves the phenotype intact.
    scheduler_kwargs = dict(genes["scheduler"])
    epochs = scheduler_kwargs.pop("epochs")
    total_steps = 1 + int(len(train_loader) * epochs)
    scheduler_kwargs["total_steps"] = total_steps
    scheduler = lr_scheduler.OneCycleLR(optimizer, **scheduler_kwargs)

    print(f"Epochs - {epochs:.2f}")
    print(f"Train size - {len(train_loader.dataset)}")

    # Sliding-window accumulators: the deques start with a single zero so
    # that nothing is subtracted until the window is full.
    window = int(total_steps ** 0.5)
    llh_sum = 0.0
    llh_window = collections.deque([0], maxlen=window)
    weight_sum = 0.0
    weight_window = collections.deque([0], maxlen=window)

    # Endless pass over the loader, cut off after exactly total_steps batches.
    endless_batches = itertools.chain.from_iterable(itertools.repeat(train_loader))
    batches = itertools.islice(endless_batches, total_steps)

    model.train()
    bar = tqdm.tqdm(batches, file=sys.stdout, total=total_steps, desc="~~> Train")
    for batch in bar:
        optimizer.zero_grad()
        loss, weight = normal_llh(model(batch), batch)

        step_llh = -loss.item()
        llh_sum += step_llh - llh_window[0]
        llh_window.append(step_llh)
        weight_sum += weight - weight_window[0]
        weight_window.append(weight)

        loss.backward()
        optimizer.step()
        scheduler.step()

        llh = llh_sum / weight_sum
        bar.set_postfix_str(f"{llh:.5f}")

        # Written as a negated ">" so that NaN, which compares False,
        # is rejected as well.
        if not (llh > LOW_LLH):
            raise GradientsError(llh)

    self._validate(model)
    return model
def _eval_llh(self) -> tuple[float, float]:
    """Compute the test log-likelihood and the portfolio metric.

    A test-mode data loader is built and the model from
    ``self.prepare_model`` is moved to ``DEVICE`` and evaluated without
    gradients using ``log_normal_llh_mix``. The reported log-likelihood is
    ``sum(-loss) / number of examples`` plus the constant
    ``log(FORECAST_DAYS) / 2``.

    The collected means, variances and labels are then handed to
    ``_opt_port`` together with the tickers, end date and phenotype.

    NOTE(review): per the original docstring the model is loaded from saved
    weights when available and trained from scratch otherwise; that logic
    lives in ``prepare_model`` and is not visible here.

    Returns:
        A pair ``(llh, ir)`` where ``ir`` is the value returned by
        ``_opt_port``.

    Raises:
        TooLongHistoryError: If the number of test examples is not a
            multiple of the number of tickers.
    """
    data_genes = self._phenotype["data"]
    loader = data_loader.DescribedDataLoader(
        self._tickers,
        self._end,
        data_genes,
        data_params.TestParams,
    )

    _, remainder = divmod(len(loader.dataset), len(self._tickers))
    if remainder:
        history = int(data_genes["history_days"])
        raise TooLongHistoryError(
            f"Слишком большая длинна истории - {history}")

    model = self.prepare_model(loader)
    model.to(DEVICE)

    llh_adj = np.log(data_params.FORECAST_DAYS) / 2
    llh_sum = 0
    weight_sum = 0
    mean_chunks, var_chunks, label_chunks = [], [], []

    with torch.no_grad():
        model.eval()
        bars = tqdm.tqdm(loader, file=sys.stdout, desc="~~> Test")
        for batch in bars:
            loss, mean, var = log_normal_llh_mix(model, batch)
            llh_sum -= loss.item()
            weight_sum += mean.shape[0]

            mean_chunks.append(mean)
            var_chunks.append(var)
            label_chunks.append(batch["Label"])

            bars.set_postfix_str(f"{llh_sum / weight_sum + llh_adj:.5f}")

    flat_means = torch.cat(mean_chunks).cpu().numpy().flatten()
    flat_vars = torch.cat(var_chunks).cpu().numpy().flatten()
    flat_labels = torch.cat(label_chunks).cpu().numpy().flatten()

    llh = llh_sum / weight_sum + llh_adj
    ir = _opt_port(
        flat_means,
        flat_vars,
        flat_labels,
        self._tickers,
        self._end,
        self._phenotype,
    )
    return llh, ir
def _eval_llh(self) -> float:
    """Compute the average log-likelihood on the test set.

    A test-mode data loader is built and the model from
    ``self.prepare_model`` is moved to ``DEVICE`` and evaluated without
    gradients using ``log_normal_llh``. The number of test days and
    examples, and the standard error of the per-example values returned by
    the loss function, are printed to the console.

    NOTE(review): per the original docstring the model is loaded from saved
    weights when available and trained from scratch otherwise; that logic
    lives in ``prepare_model`` and is not visible here.

    Returns:
        ``sum(-loss) / sum(weight)`` accumulated over all test batches.

    Raises:
        TooLongHistoryError: If the number of test examples is not a
            multiple of the number of tickers.
    """
    loader = data_loader.DescribedDataLoader(
        self._tickers,
        self._end,
        self._phenotype["data"],
        data_params.TestParams,
    )

    n_examples = len(loader.dataset)
    days, remainder = divmod(n_examples, len(self._tickers))
    if remainder:
        raise TooLongHistoryError

    model = self.prepare_model(loader)
    model.to(DEVICE)

    llh_sum = 0
    weight_sum = 0
    llh_chunks = []

    print(f"Тестовых дней: {days}")
    print(f"Тестовых примеров: {n_examples}")

    with torch.no_grad():
        model.eval()
        bars = tqdm.tqdm(loader, file=sys.stdout, desc="~~> Test")
        for batch in bars:
            mean, std = model(batch)
            loss, weight, llh = log_normal_llh((mean, std), batch)
            llh_sum -= loss.item()
            weight_sum += weight
            llh_chunks.append(llh)
            bars.set_postfix_str(f"{llh_sum / weight_sum:.5f}")

    llh_all = torch.cat(llh_chunks)
    # Standard error of the mean: sample std divided by sqrt(n).
    std_error = llh_all.std(unbiased=True).item() / len(llh_all) ** 0.5
    print(
        f"STD: {std_error:.4f}"
    )
    return llh_sum / weight_sum
def forecast(self) -> Forecast:
    """Build an annualized forecast of mean return and standard deviation.

    A forecast-mode data loader is created, the model from
    ``self.prepare_model`` is moved to ``DEVICE`` and queried without
    gradients for a distribution per batch (``model.dist``). The
    distribution mean minus one and the square root of its variance are
    collected for every ticker.

    The means are scaled by ``YEAR_IN_TRADING_DAYS / FORECAST_DAYS`` and
    the standard deviations by the square root of that ratio.

    Returns:
        A ``Forecast`` with the tickers, end date, history length, the two
        per-ticker ``pandas.Series`` and the ``risk_aversion`` /
        ``error_tolerance`` values from the phenotype's ``utility`` section.
    """
    loader = data_loader.DescribedDataLoader(
        self._tickers,
        self._end,
        self._phenotype["data"],
        data_params.ForecastParams,
    )
    model = self.prepare_model(loader)
    model.to(DEVICE)

    mean_chunks, std_chunks = [], []
    with torch.no_grad():
        model.eval()
        one = torch.tensor(1.0)
        for batch in loader:
            dist = model.dist(batch)
            # NOTE(review): subtracting one presumably turns a gross return
            # into a net return - confirm against model.dist.
            mean_chunks.append(dist.mean - one)
            std_chunks.append(dist.variance**0.5)

    flat_means = torch.cat(mean_chunks, dim=0).cpu().numpy().flatten()
    flat_stds = torch.cat(std_chunks, dim=0).cpu().numpy().flatten()

    ticker_index = list(self._tickers)
    means = pd.Series(flat_means, index=ticker_index)
    means = means.mul(YEAR_IN_TRADING_DAYS / data_params.FORECAST_DAYS)
    stds = pd.Series(flat_stds, index=ticker_index)
    stds = stds.mul(
        (YEAR_IN_TRADING_DAYS / data_params.FORECAST_DAYS)**0.5)

    utility = self._phenotype["utility"]
    return Forecast(
        tickers=self._tickers,
        date=self._end,
        history_days=self._phenotype["data"]["history_days"],
        mean=means,
        std=stds,
        risk_aversion=utility["risk_aversion"],
        error_tolerance=utility["error_tolerance"],
    )
def make_data_loader():
    """Create a ``DescribedDataLoader`` fixture configured with ``ForecastParams``.

    Uses the module-level ``TICKERS``, ``DATE`` and ``PARAMS`` values.
    """
    loader_args = (TICKERS, DATE, PARAMS, data_params.ForecastParams)
    return data_loader.DescribedDataLoader(*loader_args)
def _train_model(self) -> nn.Module:
    """Train a fresh model with AdamW and a one-cycle learning-rate schedule.

    The optimizer and scheduler settings come from the phenotype. The
    ``epochs`` entry of the scheduler settings is converted into a total
    number of steps (``1 + int(len(loader) * epochs)``), and the training
    loader is cycled until that many batches have been processed.

    The progress bar shows the log-likelihood averaged over a sliding
    window of one epoch's worth of steps, shifted by the constant
    ``log(FORECAST_DAYS) / 2``.

    Returns:
        The trained model, located on ``DEVICE``.

    Raises:
        TooLongHistoryError: If building the training loader raises
            ``ValueError``; the original error is attached as the cause.
        DegeneratedModelError: If the loader describes exactly one feature,
            or if the projected total training time exceeds
            ``DAY_IN_SECONDS``.
        TooLargeModelError: If the size estimate derived from the parameter
            count and batch size exceeds ``MAX_BATCH_SIZE``.
        GradientsError: If the windowed log-likelihood is not greater than
            its first value minus ``LLH_DRAW_DOWN`` (this also catches NaN).
    """
    phenotype = self._phenotype

    try:
        loader = data_loader.DescribedDataLoader(
            self._tickers,
            self._end,
            phenotype["data"],
            data_params.TrainParams,
        )
    except ValueError as err:
        history = int(phenotype["data"]["history_days"])
        # Chain the cause so the underlying loader error stays in the traceback.
        raise TooLongHistoryError(f"Слишком большая длина истории: {history}") from err

    if len(loader.features_description) == 1:
        raise DegeneratedModelError("Отсутствуют активные признаки в генотипе")

    model = self._make_untrained_model(loader)
    model.to(DEVICE)
    optimizer = optim.AdamW(model.parameters(), **phenotype["optimizer"])

    steps_per_epoch = len(loader)
    # Copy the settings so that popping "epochs" leaves the phenotype intact.
    scheduler_params = dict(phenotype["scheduler"])
    epochs = scheduler_params.pop("epochs")
    total_steps = 1 + int(steps_per_epoch * epochs)
    scheduler_params["total_steps"] = total_steps
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, **scheduler_params)

    LOGGER.info(f"Epochs - {epochs:.2f} / Train size - {len(loader.dataset)}")

    modules = sum(1 for _ in model.modules())
    model_params = sum(tensor.numel() for tensor in model.parameters())
    LOGGER.info(f"Количество слоев / параметров - {modules} / {model_params}")

    # Size estimate in units of 2**30 bytes.
    # NOTE(review): the factor 4 is presumably bytes per float32 parameter.
    batch_size = (model_params * 4) * phenotype["data"]["batch_size"] / (2**10) ** 3
    if batch_size > MAX_BATCH_SIZE:
        raise TooLargeModelError(f"Размер батча {batch_size:.0f} > {MAX_BATCH_SIZE}Gb")

    # Sliding-window accumulators: the deques start with a single zero so
    # that nothing is subtracted until the window is full.
    llh_sum = 0
    llh_deque = collections.deque([0], maxlen=steps_per_epoch)
    weight_sum = 0
    weight_deque = collections.deque([0], maxlen=steps_per_epoch)

    loss_fn = log_normal_llh_mix

    # Endless pass over the loader, cut off after exactly total_steps batches.
    batches = itertools.repeat(loader)
    batches = itertools.chain.from_iterable(batches)
    batches = itertools.islice(batches, total_steps)

    model.train()
    bars = tqdm.tqdm(batches, file=sys.stdout, total=total_steps, desc="~~> Train")

    llh_min = None
    llh_adj = np.log(data_params.FORECAST_DAYS) / 2
    for batch in bars:
        optimizer.zero_grad()
        loss, means, _ = loss_fn(model, batch)

        # Read the scalar once; each .item() call copies it to the host.
        step_llh = -loss.item()
        step_weight = means.shape[0]
        llh_sum += step_llh - llh_deque[0]
        llh_deque.append(step_llh)
        weight_sum += step_weight - weight_deque[0]
        weight_deque.append(step_weight)

        loss.backward()
        optimizer.step()
        scheduler.step()

        llh = llh_sum / weight_sum + llh_adj
        bars.set_postfix_str(f"{llh:.5f}")

        # The first observed value defines the allowed draw-down floor.
        if llh_min is None:
            llh_min = llh - LLH_DRAW_DOWN

        # Extrapolate the elapsed time to the full number of steps.
        progress = bars.format_dict
        total_time = progress["total"] / (1 + progress["n"]) * progress["elapsed"]
        if total_time > DAY_IN_SECONDS:
            raise DegeneratedModelError(
                f"Большое время тренировки: {total_time:.0f} >" f" {DAY_IN_SECONDS}"
            )

        # Written as a negated ">" so that NaN, which compares False,
        # is rejected as well.
        if not (llh > llh_min):
            raise GradientsError(f"LLH снизилось - начальное: {llh_min + LLH_DRAW_DOWN:0.5f}")

    return model
def _eval_llh(self) -> float:
    """Compute the test log-likelihood and print a simple portfolio backtest.

    A test-mode data loader is built and the model from ``self.get_model``
    is evaluated without gradients. Before the loss is computed the first
    model output is divided by ``forecast_days`` and the second by its
    square root, i.e. the forecasts are rescaled to a one-day horizon.

    The raw forecasts and labels are then used for a day-by-day simulation
    that starts from equal weights and, each day, moves up to 0.01 of
    weight from the position with the lowest gradient to the one with the
    highest. Summary statistics of this portfolio and of the equal-weight
    average are printed; they do not affect the return value.

    NOTE(review): per the original docstring the model is loaded from saved
    weights when available and trained from scratch otherwise; that logic
    lives in ``get_model`` and is not visible here.

    Returns:
        ``sum(-loss) / sum(weight)`` accumulated over all test batches.

    Raises:
        TooLongHistoryError: If the number of test examples is not a
            multiple of the number of tickers.
    """
    loader = data_loader.DescribedDataLoader(
        self._tickers, self._end, self._phenotype["data"], data_params.TestParams
    )

    n_tickers = len(self._tickers)
    n_examples = len(loader.dataset)
    days, remainder = divmod(n_examples, n_tickers)
    if remainder:
        raise TooLongHistoryError

    model = self.get_model(loader)
    forecast_days = torch.tensor(self._phenotype["data"]["forecast_days"], dtype=torch.float)

    llh_sum = 0.0
    weight_sum = 0.0
    mean_chunks, std_chunks, label_chunks = [], [], []

    print(f"Тестовых дней: {days}")
    print(f"Тестовых примеров: {n_examples}")

    with torch.no_grad():
        model.eval()
        bar = tqdm.tqdm(loader, file=sys.stdout, desc="~~> Test")
        for batch in bar:
            batch_mean, batch_std = model(batch)
            mean_chunks.append(batch_mean)
            std_chunks.append(batch_std)
            label_chunks.append(batch["Label"])

            daily_forecast = (batch_mean / forecast_days, batch_std / forecast_days ** 0.5)
            loss, weight = normal_llh(daily_forecast, batch)
            llh_sum -= loss.item()
            weight_sum += weight
            bar.set_postfix_str(f"{llh_sum / weight_sum:.5f}")

    m_all = torch.cat(mean_chunks).flatten().numpy()
    s_all = torch.cat(std_chunks).flatten().numpy()
    r_all = torch.cat(label_chunks).flatten().numpy()

    port_returns = []
    simple_returns = []

    # Start from an equal-weight portfolio.
    weights = np.full(n_tickers, 1)
    weights = weights / weights.sum()

    for day in range(days):
        # NOTE(review): assumes examples are ordered so that a stride of
        # `days` picks one value per ticker for the given day - confirm
        # against the loader.
        day_slice = slice(day, None, days)
        m = m_all[day_slice]
        s = s_all[day_slice]
        r = r_all[day_slice]

        exp_return = (m * weights).sum()
        var_sum = ((s * weights) ** 2).sum()
        beta = (s ** 2 * weights) / var_sum
        grad = (m - exp_return) - (beta - 1) * var_sum

        buy = np.argmax(grad)
        # Positions with zero weight cannot be sold.
        grad[weights == 0] = np.inf
        sell = np.argmin(grad)

        sell_q = min(0.01, weights[sell])
        weights[buy] = weights[buy] + sell_q
        weights[sell] = weights[sell] - sell_q

        port_returns.append((r * weights).sum())
        simple_returns.append(r.mean())

        # Let the weights drift with the realized returns and renormalize.
        weights = weights * (1 + r)
        weights = weights / weights.sum()

    port = np.array(port_returns)
    simple = np.array(simple_returns)

    port_avg = port.mean()
    port_vol = port.std()
    simple_avg = simple.mean()
    simple_vol = simple.std()

    print(f"Количество акций в портфеле: {1 / (weights * weights).sum():.1f}")
    print(f"Port: {port_avg * 252:.2%} - {port_vol * 252 ** 0.5:.2%}")
    print(f"Simple: {simple_avg * 252:.2%} - {simple_vol * 252 ** 0.5:.2%}")
    print(
        f"Diff: {(port_avg - 0.5 * port_vol ** 2) * 252:.2%} - "
        f"{(simple_avg - 0.5 * simple_vol ** 2) * 252:.2%} = "
        f"{((port_avg - simple_avg) - 0.5 * (port_vol ** 2 - simple_vol ** 2)) * 252:.2%}"
    )

    return llh_sum / weight_sum