class ReviewerV1:
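    """Review helper for grid-searched backtests.

    Builds one backtester per parameter combination in ``grid_params``,
    skips combinations whose report artifacts already exist, runs the
    remaining ones in parallel, and renders prediction and backtest
    summaries.
    """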
    dataset_dir: str = to_abs_path(__file__,
                                   "../../storage/dataset/dataset/v001/")
    exp_dir: str = to_abs_path(__file__, "../../storage/experiments/v001/")
    reviewer_prefix: str = "v001"
    grid_params: Union[str, Dict[str, List]] = "V1_SET1"
    backtester_type: str = "BacktesterV1"
    exec_start: int = 0
    exec_end: Optional[int] = None
    n_jobs: int = 16

    def __post_init__(self):
        if isinstance(self.grid_params, str):
            self.grid_params = getattr(paramset, self.grid_params)

        self.grid_params["dataset_dir"] = self.dataset_dir
        self.grid_params["exp_dir"] = self.exp_dir

        self._build_backtesters()

    def _load_data_dict(self):
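        # Load the generated model outputs (labels, predictions,
        # probabilities), one parquet file each, from the experiment dir.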
        data_dict = {}
        for key in ("labels", "predictions", "probabilities"):
            data_dict[key] = pd.read_parquet(
                os.path.join(self.exp_dir,
                             f"generated_output/{key}.parquet.zstd"))

        return data_dict

    def _display_timeseries(self, data_dict):
        columns = data_dict["predictions"].columns
        _, ax = plt.subplots(len(columns), 1, figsize=(24, 2.5 * len(columns)))

        for idx, column in enumerate(columns):
            data_dict["labels"][column].rename("label").plot(ax=ax[idx],
                                                             alpha=0.5)
            data_dict["predictions"][column].rename("prediction").plot(
                ax=ax[idx])
            ax[idx].legend()
            ax[idx].set_title(column)

        plt.tight_layout()
        plt.show()

    def _build_levels(self, data):
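        # Bucket every column into 10 equal-frequency bins (deciles);
        # labels=False yields integer level codes 0-9 per row.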
        levels = {}
        for column in data.columns:
            levels[column] = pd.qcut(data[column],
                                     10,
                                     labels=False,
                                     retbins=False)

        return pd.concat(levels, axis=1)

    def _build_total_performance(self, data_dict):
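        # Hit rate per column: share of rows where label and prediction
        # do not disagree in sign (their product is >= 0).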
        total_performance = (data_dict["labels"] * data_dict["predictions"] >=
                             0).mean()
        total_performance["mean"] = total_performance.mean()

        return total_performance

    def _build_performance_on_levels(self, data_dict, levels):
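        # Same sign-agreement hit rate as above, but grouped by the
        # supplied level (decile) assignment of each column.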
        performance = data_dict["labels"] * data_dict["predictions"] >= 0

        performance_on_levels = []
        for column in performance.columns:
            performance_on_levels.append(performance[column].groupby(
                levels[column]).mean())

        performance_on_levels = pd.concat(performance_on_levels, axis=1)
        performance_on_levels["mean"] = performance_on_levels.mean(axis=1)

        return performance_on_levels

    def display_performance(self):
        data_dict = self._load_data_dict()

        display_markdown("#### Timeseries", raw=True)
        self._display_timeseries(data_dict=data_dict)

        display_markdown("#### Total Performance", raw=True)
        total_performance = self._build_total_performance(data_dict=data_dict)
        display(
            ft.display(total_performance.rename("bin_acc").to_frame().T,
                       axis=1))

        # Build levels
        label_levels = self._build_levels(data=data_dict["labels"])
        prediction_levels = self._build_levels(data=data_dict["predictions"])
        abs_prediction_levels = self._build_levels(
            data=data_dict["predictions"].abs())
        probability_levels = self._build_levels(
            data=data_dict["probabilities"])

        display_markdown("#### Performance on label levels", raw=True)
        display(
            ft.display(
                self._build_performance_on_levels(data_dict=data_dict,
                                                  levels=label_levels)))

        display_markdown("#### Performance on prediction levels", raw=True)
        display(
            ft.display(
                self._build_performance_on_levels(data_dict=data_dict,
                                                  levels=prediction_levels)))

        display_markdown("#### Performance on abs(prediction) levels",
                         raw=True)
        display(
            ft.display(
                self._build_performance_on_levels(
                    data_dict=data_dict, levels=abs_prediction_levels)))

        display_markdown("#### Performance on probability levels", raw=True)
        display(
            ft.display(
                self._build_performance_on_levels(data_dict=data_dict,
                                                  levels=probability_levels)))

    def _exists_artifact(self, index):
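        # A backtest counts as already done when its metrics, report and
        # params artifacts are all present on disk for the given index.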
        exists = []
        for artifact_type in ["metrics", "report", "params"]:
            file_path = os.path.join(
                self.grid_params["exp_dir"],
                f"reports/{artifact_type}_{self.reviewer_prefix}_{index}_{self.grid_params['base_currency']}.parquet.zstd",
            )

            if artifact_type == "params":
                exists.append(
                    os.path.exists(file_path.replace(".parquet.zstd",
                                                     ".json")))
                continue

            exists.append(os.path.exists(file_path))

        exists = all(exists)

        if exists is True:
            print(f"[!] Found backtests already done: {index}")

        return exists

    def _build_backtesters(self):
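        # Expand the parameter grid, drop inconsistent combinations,
        # build one backtester per remaining combination (restricted to
        # the exec_start:exec_end slice), and skip those whose report
        # artifacts already exist.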
        def _is_valid_params(param):
            if param["adjust_prediction"] is True:
                if isinstance(param["exit_threshold"], (int, float)):
                    return False

                if param["max_n_updated"] is None:
                    return False

            if param["exit_threshold"] != "auto":
                if param["achieve_ratio"] != 1:
                    return False

            return True

        grid_params = list(grid(self.grid_params))

        # Filter grid_params
        grid_params = [
            grid_param for grid_param in grid_params
            if _is_valid_params(param=grid_param) is True
        ]

        # Build backtesters
        self.backtesters = [
            getattr(backtester, self.backtester_type)(
                report_prefix=f"{self.reviewer_prefix}_{idx}", **params)
            for idx, params in enumerate(grid_params)
        ][self.exec_start:self.exec_end]
        self.backtesters = [
            backtester for backtester in self.backtesters
            if self._exists_artifact(
                index=backtester.report_prefix.split("_")[-1]) is not True
        ]

    def _load_artifact(self, artifact_type, index):
        assert artifact_type in ("metrics", "report", "params")

        file_path = os.path.join(
            self.grid_params["exp_dir"],
            f"reports/{artifact_type}_{self.reviewer_prefix}_{index}_{self.grid_params['base_currency']}.parquet.zstd",
        )

        if artifact_type in ("metrics", "report"):
            artifact = pd.read_parquet(file_path)
        else:
            artifact = json.load(
                open(file_path.replace(".parquet.zstd", ".json"), "r"))

        return artifact

    def _load_artifacts(self, artifact_type, with_index=False):
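        # Glob every stored artifact for this reviewer prefix and sort
        # the files by the numeric index embedded in the file name.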
        assert artifact_type in ("metrics", "report")

        file_paths = glob(
            os.path.join(
                self.grid_params["exp_dir"],
                f"reports/{artifact_type}_{self.reviewer_prefix}_*_{self.grid_params['base_currency']}.parquet.zstd",
            ))
        file_paths = sorted(
            file_paths,
            key=lambda x: int(
                x.split(f"{self.reviewer_prefix}_")[-1].split(
                    f'_{self.grid_params["base_currency"]}')[0]),
        )

        artifacts = [pd.read_parquet(file_path) for file_path in file_paths]
        index = pd.Index([
            int(
                file_path.split(f"{artifact_type}_{self.reviewer_prefix}_")
                [-1].split(
                    f"_{self.grid_params['base_currency']}.parquet.zstd")[0])
            for file_path in file_paths
        ])

        if with_index is True:
            return artifacts, index

        return artifacts

    def _build_metrics(self):
        artifacts, index = self._load_artifacts(artifact_type="metrics",
                                                with_index=True)
        metrics = pd.concat(artifacts)
        metrics.index = index

        return metrics

    def display_params(self, index, in_shell=False):
        display_markdown(f"#### Params: {index}", raw=True)

        params = (pd.Series(
            self._load_artifact(artifact_type="params",
                                index=index)).rename("params").to_frame())

        if in_shell is True:
            print(tabulate(params, headers="keys", tablefmt="psql"))
        else:
            display(params)

    def display_report(self, index, in_shell=False):
        report = self._load_artifact(artifact_type="report", index=index)

        display_markdown(f"#### Report: {index}", raw=True)
        _, ax = plt.subplots(4, 1, figsize=(12, 12), sharex=True)

        for idx, column in enumerate(
            ["capital", "cache", "return", "trade_return"]):
            if column == "trade_return":
                report[column].dropna().apply(lambda x: sum(x)).plot(
                    ax=ax[idx])

            else:
                report[column].plot(ax=ax[idx])

            ax[idx].set_title(f"historical {column}")

        plt.tight_layout()

        if in_shell is True:
            plt.show(block=True)
        else:
            plt.show()

    def display_metrics(self, in_shell=False):
        metrics = self._build_metrics()

        if in_shell is True:
            print(tabulate(metrics, headers="keys", tablefmt="psql"))
        else:
            display(metrics)

    def display(self, in_shell=False):
        self.display_metrics(in_shell=in_shell)

        metrics = self._build_metrics()
        best_index = metrics["total_return"].sort_values(
            ascending=False).index[0]

        display_markdown(f"### [+] Best index: {best_index}", raw=True)

        display(metrics.loc[best_index])
        self.display_params(index=best_index, in_shell=in_shell)
        self.display_report(index=best_index, in_shell=in_shell)

    def run(self, in_shell=False, display_performance=False):
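        # Optionally render the prediction performance summary, then run
        # the pending backtesters in parallel via joblib and display the
        # aggregated metrics, best parameters and best report.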
        if in_shell is False:
            if display_performance is True:
                self.display_performance()

        print(f"[+] Found backtests to start: {len(self.backtesters)}")

        Parallel(n_jobs=self.n_jobs, verbose=1)([
            delayed(backtester.run)(display=False)
            for backtester in self.backtesters
        ])

        self.display(in_shell=in_shell)
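
# Minimal, hypothetical usage sketch (not part of the original snippet):
# assumes ReviewerV1 is declared as a dataclass and that paramset,
# backtester and the notebook display helpers used above are importable.
#
# if __name__ == "__main__":
#     reviewer = ReviewerV1(reviewer_prefix="v001", n_jobs=8)
#     reviewer.run(in_shell=True, display_performance=False)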
Example #2
import os
import pandas as pd
from glob import glob
from tqdm import tqdm
from common_utils_dev import (
    make_dirs,
    load_text,
    get_filename_by_path,
    to_parquet,
    to_abs_path,
)

CONFIG = {
    "raw_spot_rawdata_dir":
    to_abs_path(__file__, "../../storage/dataset/rawdata/raw/spot/"),
    "raw_future_rawdata_dir":
    to_abs_path(__file__, "../../storage/dataset/rawdata/raw/future/"),
    "cleaned_rawdata_store_dir":
    to_abs_path(__file__, "../../storage/dataset/rawdata/cleaned/"),
    "candidate_assets_path":
    to_abs_path(__file__, "./candidate_assets.txt"),
    "query_min_start_dt":
    "2018-01-01",
    "boundary_dt_must_have_data":
    "2019-09-01",
}


def build_rawdata(
    raw_spot_rawdata_dir=CONFIG["raw_spot_rawdata_dir"],
Example #3

from copy import copy
from contextlib import contextmanager
from abc import abstractmethod

import torch
import torch.nn as nn
from common_utils_dev import load_text, load_json, to_abs_path, get_parent_dir
from .utils import save_model, load_model, weights_init
from .criterions import CRITERIONS
from ..datasets.dataset import Dataset
from torch.utils.data import DataLoader
from trainer.models import backbones

COMMON_CONFIG = {
    "data_dir":
    to_abs_path(__file__, "../../../storage/dataset/dataset/v001/train"),
    "exp_dir":
    to_abs_path(__file__, "../../../storage/experiments/v001"),
    "test_data_dir":
    to_abs_path(__file__, "../../../storage/dataset/dataset/v001/test"),
}

DATA_CONFIG = {
    "checkpoint_dir": "./check_point",
    "generate_output_dir": "./generated_output",
    "base_feature_assets": ["BTC-USDT"],
}

MODEL_CONFIG = {
    "lookback_window": 120,
    "batch_size": 512,
Example #4

import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Union, Optional, List, Dict
from tqdm import tqdm
from .basic_predictor import BasicPredictor
from .utils import inverse_preprocess_data
from common_utils_dev import to_parquet, to_abs_path

COMMON_CONFIG = {
    "data_dir": to_abs_path(__file__, "../../../storage/dataset/dataset/v001/train"),
    "exp_dir": to_abs_path(__file__, "../../../storage/experiments/v001"),
    "test_data_dir": to_abs_path(
        __file__, "../../../storage/dataset/dataset/v001/test"
    ),
}

DATA_CONFIG = {
    "checkpoint_dir": "./check_point",
    "generate_output_dir": "./generated_output",
    "base_feature_assets": ["BTC-USDT"],
}

MODEL_CONFIG = {
    "lookback_window": 120,
    "batch_size": 512,
    "lr": 0.0001,
    "epochs": 15,
Example #5
import os
import kaggle
from common_utils_dev import to_abs_path, make_dirs

SPOT_CONFIG = {
    "dataset_name": "jorijnsmit/binance-full-history",
    "store_dir": to_abs_path(__file__,
                             "../../storage/dataset/rawdata/raw/spot/"),
}

FUTURE_CONFIG = {
    "dataset_name":
    "nicolaes/binance-futures",
    "store_dir":
    to_abs_path(__file__, "../../storage/dataset/rawdata/raw/future/"),
}


def download(username, key):
    for config in [SPOT_CONFIG, FUTURE_CONFIG]:
        dataset_name = config["dataset_name"]
        store_dir = config["store_dir"]

        make_dirs([store_dir])

        # Set env to authenticate
        os.environ["KAGGLE_USERNAME"] = username
        os.environ["KAGGLE_KEY"] = key
        kaggle.api.authenticate()

        # Download
Example #6

import torch
from glob import glob
from typing import Optional, List
import pandas as pd
import numpy as np
from tqdm import tqdm
from functools import partial
from itertools import combinations
from sklearn import preprocessing
import joblib
from common_utils_dev import make_dirs, to_parquet, to_abs_path, get_filename_by_path
from pandarallel import pandarallel
from dataclasses import dataclass

CONFIG = {
    "rawdata_dir": to_abs_path(__file__,
                               "../../storage/dataset/rawdata/cleaned/"),
    "data_store_dir": to_abs_path(__file__,
                                  "../../storage/dataset/dataset/v001/"),
    "lookahead_window": 30,
    "train_ratio": 0.80,
    "scaler_type": "StandardScaler",
    "winsorize_threshold": 6,
    "query_min_start_dt": "2018-06-01",
}
OHLC = ["open", "high", "low", "close"]
OHLC_COMBINATIONS = list(combinations(OHLC, 2))
HOUR_TO_8CLASS = {idx: idx // 3 for idx in range(24)}


@dataclass
class DatasetBuilder:
Example #7
import pandas as pd
from glob import glob
from tqdm import tqdm
from common_utils_dev import (
    make_dirs,
    load_text,
    get_filename_by_path,
    to_parquet,
    to_abs_path,
)


CONFIG = {
    "raw_spot_rawdata_dir": to_abs_path(
        __file__, "../../storage/dataset/rawdata/raw/spot/"
    ),
    "raw_future_rawdata_dir": to_abs_path(
        __file__, "../../storage/dataset/rawdata/raw/future/"
    ),
    "cleaned_rawdata_store_dir": to_abs_path(
        __file__, "../../storage/dataset/rawdata/cleaned/"
    ),
    "candidate_assets_path": to_abs_path(__file__, "./candidate_assets.txt"),
    "query_min_start_dt": "2018-01-01",
    "boundary_dt_must_have_data": "2019-09-01",
}


def build_rawdata(
    raw_spot_rawdata_dir=CONFIG["raw_spot_rawdata_dir"],