import os
import json
from dataclasses import dataclass
from glob import glob
from typing import Dict, List, Optional, Union

import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, display_markdown
from joblib import Parallel, delayed
from tabulate import tabulate

from common_utils_dev import to_abs_path

# NOTE: paramset (grid definitions), backtester (BacktesterV1), grid (grid expansion)
# and ft (table formatting) are project-local modules referenced below; their import
# paths are not shown in this snippet.


@dataclass
class ReviewerV1:
    dataset_dir: str = to_abs_path(__file__, "../../storage/dataset/dataset/v001/")
    exp_dir: str = to_abs_path(__file__, "../../storage/experiments/v001/")
    reviewer_prefix: str = "v001"
    grid_params: Union[str, Dict[str, List]] = "V1_SET1"
    backtester_type: str = "BacktesterV1"
    exec_start: int = 0
    exec_end: Optional[int] = None
    n_jobs: int = 16

    def __post_init__(self):
        # Resolve a named parameter set into the actual grid definition.
        if isinstance(self.grid_params, str):
            self.grid_params = getattr(paramset, self.grid_params)

        self.grid_params["dataset_dir"] = self.dataset_dir
        self.grid_params["exp_dir"] = self.exp_dir

        self._build_backtesters()

    def _load_data_dict(self):
        data_dict = {}
        for key in ("labels", "predictions", "probabilities"):
            data_dict[key] = pd.read_parquet(
                os.path.join(self.exp_dir, f"generated_output/{key}.parquet.zstd")
            )
        return data_dict

    def _display_timeseries(self, data_dict):
        columns = data_dict["predictions"].columns
        _, ax = plt.subplots(len(columns), 1, figsize=(24, 2.5 * len(columns)))

        for idx, column in enumerate(columns):
            data_dict["labels"][column].rename("label").plot(ax=ax[idx], alpha=0.5)
            data_dict["predictions"][column].rename("prediction").plot(ax=ax[idx])
            ax[idx].legend()
            ax[idx].set_title(column)

        plt.tight_layout()
        plt.show()

    def _build_levels(self, data):
        # Bucket each column into deciles (0-9) for level-wise evaluation.
        levels = {}
        for column in data.columns:
            levels[column] = pd.qcut(data[column], 10, labels=False, retbins=False)
        return pd.concat(levels, axis=1)

    def _build_total_performance(self, data_dict):
        # A prediction counts as correct when its sign matches the label's sign.
        total_performance = (data_dict["labels"] * data_dict["predictions"] >= 0).mean()
        total_performance["mean"] = total_performance.mean()
        return total_performance

    def _build_performance_on_levels(self, data_dict, levels):
        performance = data_dict["labels"] * data_dict["predictions"] >= 0

        performance_on_levels = []
        for column in performance.columns:
            performance_on_levels.append(
                performance[column].groupby(levels[column]).mean()
            )

        performance_on_levels = pd.concat(performance_on_levels, axis=1)
        performance_on_levels["mean"] = performance_on_levels.mean(axis=1)
        return performance_on_levels

    def display_performance(self):
        data_dict = self._load_data_dict()

        display_markdown("#### Timeseries", raw=True)
        self._display_timeseries(data_dict=data_dict)

        display_markdown("#### Total Performance", raw=True)
        total_performance = self._build_total_performance(data_dict=data_dict)
        display(ft.display(total_performance.rename("bin_acc").to_frame().T, axis=1))

        # Build levels
        label_levels = self._build_levels(data=data_dict["labels"])
        prediction_levels = self._build_levels(data=data_dict["predictions"])
        abs_prediction_levels = self._build_levels(data=data_dict["predictions"].abs())
        probability_levels = self._build_levels(data=data_dict["probabilities"])

        display_markdown("#### Performance on label levels", raw=True)
        display(
            ft.display(
                self._build_performance_on_levels(
                    data_dict=data_dict, levels=label_levels
                )
            )
        )

        display_markdown("#### Performance on prediction levels", raw=True)
        display(
            ft.display(
                self._build_performance_on_levels(
                    data_dict=data_dict, levels=prediction_levels
                )
            )
        )

        display_markdown("#### Performance on abs(prediction) levels", raw=True)
        display(
            ft.display(
                self._build_performance_on_levels(
                    data_dict=data_dict, levels=abs_prediction_levels
                )
            )
        )

        display_markdown("#### Performance on probability levels", raw=True)
        display(
            ft.display(
                self._build_performance_on_levels(
                    data_dict=data_dict, levels=probability_levels
                )
            )
        )

    def _exists_artifact(self, index):
        exists = []
        for artifact_type in ["metrics", "report", "params"]:
            file_path = os.path.join(
                self.grid_params["exp_dir"],
                f"reports/{artifact_type}_{self.reviewer_prefix}_{index}_{self.grid_params['base_currency']}.parquet.zstd",
            )

            # Params are stored as JSON rather than parquet.
            if artifact_type == "params":
                exists.append(os.path.exists(file_path.replace(".parquet.zstd", ".json")))
                continue

            exists.append(os.path.exists(file_path))

        exists = all(exists)
        if exists is True:
            print(f"[!] Found backtests already done: {index}")

        return exists

    def _build_backtesters(self):
        def _is_valid_params(param):
            if param["adjust_prediction"] is True:
                if isinstance(param["exit_threshold"], (int, float)):
                    return False

                if param["max_n_updated"] is None:
                    return False

            if param["exit_threshold"] != "auto":
                if param["achieve_ratio"] != 1:
                    return False

            return True

        grid_params = list(grid(self.grid_params))

        # Filter grid_params
        grid_params = [
            grid_param
            for grid_param in grid_params
            if _is_valid_params(param=grid_param) is True
        ]

        # Build backtesters
        self.backtesters = [
            getattr(backtester, self.backtester_type)(
                report_prefix=f"{self.reviewer_prefix}_{idx}", **params
            )
            for idx, params in enumerate(grid_params)
        ][self.exec_start : self.exec_end]

        # Skip grid points whose artifacts already exist on disk.
        self.backtesters = [
            backtester
            for backtester in self.backtesters
            if self._exists_artifact(index=backtester.report_prefix.split("_")[-1])
            is not True
        ]

    def _load_artifact(self, artifact_type, index):
        assert artifact_type in ("metrics", "report", "params")

        file_path = os.path.join(
            self.grid_params["exp_dir"],
            f"reports/{artifact_type}_{self.reviewer_prefix}_{index}_{self.grid_params['base_currency']}.parquet.zstd",
        )

        if artifact_type in ("metrics", "report"):
            artifact = pd.read_parquet(file_path)
        else:
            with open(file_path.replace(".parquet.zstd", ".json"), "r") as f:
                artifact = json.load(f)

        return artifact

    def _load_artifacts(self, artifact_type, with_index=False):
        assert artifact_type in ("metrics", "report")

        file_paths = glob(
            os.path.join(
                self.grid_params["exp_dir"],
                f"reports/{artifact_type}_{self.reviewer_prefix}_*_{self.grid_params['base_currency']}.parquet.zstd",
            )
        )
        # Sort by the numeric grid index embedded in the file name.
        file_paths = sorted(
            file_paths,
            key=lambda x: int(
                x.split(f"{self.reviewer_prefix}_")[-1].split(
                    f'_{self.grid_params["base_currency"]}'
                )[0]
            ),
        )

        artifacts = [pd.read_parquet(file_path) for file_path in file_paths]
        index = pd.Index(
            [
                int(
                    file_path.split(f"{artifact_type}_{self.reviewer_prefix}_")[-1].split(
                        f"_{self.grid_params['base_currency']}.parquet.zstd"
                    )[0]
                )
                for file_path in file_paths
            ]
        )

        if with_index is True:
            return artifacts, index

        return artifacts

    def _build_metrics(self):
        artifacts, index = self._load_artifacts(artifact_type="metrics", with_index=True)
        metrics = pd.concat(artifacts)
        metrics.index = index
        return metrics

    def display_params(self, index, in_shell=False):
        display_markdown(f"#### Params: {index}", raw=True)

        params = (
            pd.Series(self._load_artifact(artifact_type="params", index=index))
            .rename("params")
            .to_frame()
        )

        if in_shell is True:
            print(tabulate(params, headers="keys", tablefmt="psql"))
        else:
            display(params)

    def display_report(self, index, in_shell=False):
        report = self._load_artifact(artifact_type="report", index=index)
        display_markdown(f"#### Report: {index}", raw=True)

        _, ax = plt.subplots(4, 1, figsize=(12, 12), sharex=True)
        for idx, column in enumerate(["capital", "cache", "return", "trade_return"]):
            if column == "trade_return":
                report[column].dropna().apply(lambda x: sum(x)).plot(ax=ax[idx])
            else:
                report[column].plot(ax=ax[idx])

            ax[idx].set_title(f"historical {column}")

        plt.tight_layout()

        if in_shell is True:
            plt.show(block=True)
        else:
            plt.show()

    def display_metrics(self, in_shell=False):
        metrics = self._build_metrics()

        if in_shell is True:
            print(tabulate(metrics, headers="keys", tablefmt="psql"))
        else:
            display(metrics)

    def display(self, in_shell=False):
        self.display_metrics(in_shell=in_shell)

        metrics = self._build_metrics()
        best_index = metrics["total_return"].sort_values(ascending=False).index[0]

        display_markdown(f"### [+] Best index: {best_index}", raw=True)
        display(metrics.loc[best_index])

        self.display_params(index=best_index, in_shell=in_shell)
        self.display_report(index=best_index, in_shell=in_shell)

    def run(self, in_shell=False, display_performance=False):
        if in_shell is False:
            if display_performance is True:
                self.display_performance()

        print(f"[+] Found backtests to start: {len(self.backtesters)}")
        Parallel(n_jobs=self.n_jobs, verbose=1)(
            [delayed(backtester.run)(display=False) for backtester in self.backtesters]
        )

        self.display(in_shell=in_shell)
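# --------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of the original module).
# Assumes the storage layout configured by the class defaults above already
# contains the experiment outputs.
if __name__ == "__main__":
    reviewer = ReviewerV1(
        reviewer_prefix="v001",
        grid_params="V1_SET1",  # name of a parameter set resolved via paramset
        exec_start=0,
        exec_end=None,  # run the whole grid
        n_jobs=16,
    )
    # Runs the pending backtests in parallel, then displays metrics,
    # the best parameter set, and its report.
    reviewer.run(in_shell=True, display_performance=False)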
import os

import pandas as pd
from glob import glob
from tqdm import tqdm

from common_utils_dev import (
    make_dirs,
    load_text,
    get_filename_by_path,
    to_parquet,
    to_abs_path,
)

CONFIG = {
    "raw_spot_rawdata_dir": to_abs_path(__file__, "../../storage/dataset/rawdata/raw/spot/"),
    "raw_future_rawdata_dir": to_abs_path(__file__, "../../storage/dataset/rawdata/raw/future/"),
    "cleaned_rawdata_store_dir": to_abs_path(__file__, "../../storage/dataset/rawdata/cleaned/"),
    "candidate_assets_path": to_abs_path(__file__, "./candidate_assets.txt"),
    "query_min_start_dt": "2018-01-01",
    "boundary_dt_must_have_data": "2019-09-01",
}


def build_rawdata(
    raw_spot_rawdata_dir=CONFIG["raw_spot_rawdata_dir"],
from copy import copy
from contextlib import contextmanager
from abc import abstractmethod

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from common_utils_dev import load_text, load_json, to_abs_path, get_parent_dir
from .utils import save_model, load_model, weights_init
from .criterions import CRITERIONS
from ..datasets.dataset import Dataset
from trainer.models import backbones

COMMON_CONFIG = {
    "data_dir": to_abs_path(__file__, "../../../storage/dataset/dataset/v001/train"),
    "exp_dir": to_abs_path(__file__, "../../../storage/experiments/v001"),
    "test_data_dir": to_abs_path(__file__, "../../../storage/dataset/dataset/v001/test"),
}

DATA_CONFIG = {
    "checkpoint_dir": "./check_point",
    "generate_output_dir": "./generated_output",
    "base_feature_assets": ["BTC-USDT"],
}

MODEL_CONFIG = {
    "lookback_window": 120,
    "batch_size": 512,
import os
from typing import Union, Optional, List, Dict

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

from .basic_predictor import BasicPredictor
from .utils import inverse_preprocess_data
from common_utils_dev import to_parquet, to_abs_path

COMMON_CONFIG = {
    "data_dir": to_abs_path(__file__, "../../../storage/dataset/dataset/v001/train"),
    "exp_dir": to_abs_path(__file__, "../../../storage/experiments/v001"),
    "test_data_dir": to_abs_path(__file__, "../../../storage/dataset/dataset/v001/test"),
}

DATA_CONFIG = {
    "checkpoint_dir": "./check_point",
    "generate_output_dir": "./generated_output",
    "base_feature_assets": ["BTC-USDT"],
}

MODEL_CONFIG = {
    "lookback_window": 120,
    "batch_size": 512,
    "lr": 0.0001,
    "epochs": 15,
import os

import kaggle

from common_utils_dev import to_abs_path, make_dirs

SPOT_CONFIG = {
    "dataset_name": "jorijnsmit/binance-full-history",
    "store_dir": to_abs_path(__file__, "../../storage/dataset/rawdata/raw/spot/"),
}
FUTURE_CONFIG = {
    "dataset_name": "nicolaes/binance-futures",
    "store_dir": to_abs_path(__file__, "../../storage/dataset/rawdata/raw/future/"),
}


def download(username, key):
    for config in [SPOT_CONFIG, FUTURE_CONFIG]:
        dataset_name = config["dataset_name"]
        store_dir = config["store_dir"]
        make_dirs([store_dir])

        # Set env to authenticate
        os.environ["KAGGLE_USERNAME"] = username
        os.environ["KAGGLE_KEY"] = key
        kaggle.api.authenticate()

        # Download
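# --------------------------------------------------------------------------
# Usage sketch (illustrative; not part of the original module, and the
# download call that follows the "# Download" comment above is elided here).
# Credentials are placeholders for your own Kaggle API token:
#
#     download(username="<kaggle-username>", key="<kaggle-api-key>")
#
# Both the spot and future datasets are then stored under the configured
# store_dir paths above.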
from dataclasses import dataclass
from functools import partial
from glob import glob
from itertools import combinations
from typing import Optional, List

import joblib
import numpy as np
import pandas as pd
import torch
from pandarallel import pandarallel
from sklearn import preprocessing
from tqdm import tqdm

from common_utils_dev import make_dirs, to_parquet, to_abs_path, get_filename_by_path

CONFIG = {
    "rawdata_dir": to_abs_path(__file__, "../../storage/dataset/rawdata/cleaned/"),
    "data_store_dir": to_abs_path(__file__, "../../storage/dataset/dataset/v001/"),
    "lookahead_window": 30,
    "train_ratio": 0.80,
    "scaler_type": "StandardScaler",
    "winsorize_threshold": 6,
    "query_min_start_dt": "2018-06-01",
}

OHLC = ["open", "high", "low", "close"]
OHLC_COMBINATIONS = list(combinations(OHLC, 2))
# Map hour of day (0-23) to one of eight 3-hour buckets.
HOUR_TO_8CLASS = {idx: idx // 3 for idx in range(24)}


@dataclass
class DatasetBuilder: