    def store_report(self, report):
        # Persist the metrics summary, the full report and the run parameters
        # under report_store_dir, keyed by report_prefix and base_currency.
        metrics = self.build_metrics().to_frame().T
        to_parquet(
            df=metrics.astype("float32"),
            path=os.path.join(
                self.report_store_dir,
                f"metrics_{self.report_prefix}_{self.base_currency}.parquet.zstd",
            ),
        )

        to_parquet(
            df=report,
            path=os.path.join(
                self.report_store_dir,
                f"report_{self.report_prefix}_{self.base_currency}.parquet.zstd",
            ),
        )

        params = {
            "base_currency": self.base_currency,
            "position_side": self.position_side,
            "entry_ratio": self.entry_ratio,
            "commission": self.commission,
            "min_holding_minutes": self.min_holding_minutes,
            "max_holding_minutes": self.max_holding_minutes,
            "compound_interest": self.compound_interest,
            "order_criterion": self.order_criterion,
            "possible_in_debt": self.possible_in_debt,
            "achieved_with_commission": self.achieved_with_commission,
            "max_n_updated": self.max_n_updated,
            "tradable_coins": tuple(self.tradable_coins.tolist()),
            "exit_if_achieved": self.exit_if_achieved,
            "achieve_ratio": self.achieve_ratio,
            "positive_entry_threshold": self.positive_entry_threshold,
            "negative_entry_threshold": self.negative_entry_threshold,
            "exit_threshold": self.exit_threshold,
            "positive_probability_threshold": self.positive_probability_threshold,
            "negative_probability_threshold": self.negative_probability_threshold,
            "adjust_prediction": self.adjust_prediction,
        }
        with open(
            os.path.join(
                self.report_store_dir,
                f"params_{self.report_prefix}_{self.base_currency}.json",
            ),
            "w",
        ) as f:
            json.dump(params, f)

        print(f"[+] Report is stored: {self.report_prefix}_{self.base_currency}")
    def store_artifacts(
        self,
        features,
        labels,
        pricing,
        feature_scaler,
        label_scaler,
        train_ratio,
        params,
        data_store_dir,
    ):
        # Make dirs
        train_data_store_dir = os.path.join(data_store_dir, "train")
        test_data_store_dir = os.path.join(data_store_dir, "test")
        make_dirs([train_data_store_dir, test_data_store_dir])

        # Store scalers and dataset params
        joblib.dump(feature_scaler,
                    os.path.join(data_store_dir, "feature_scaler.pkl"))
        joblib.dump(label_scaler,
                    os.path.join(data_store_dir, "label_scaler.pkl"))

        with open(os.path.join(data_store_dir, "dataset_params.json"),
                  "w") as f:
            json.dump(params, f)

        print(f"[+] Metadata is stored")

        # Store dataset: split chronologically into train/test at train_ratio
        boundary_index = int(len(features.index) * train_ratio)

        for file_name, data in [
            ("X.parquet.zstd", features),
            ("Y.parquet.zstd", labels),
            ("pricing.parquet.zstd", pricing),
        ]:
            to_parquet(
                df=data.iloc[:boundary_index],
                path=os.path.join(train_data_store_dir, file_name),
            )

            to_parquet(
                df=data.iloc[boundary_index:],
                path=os.path.join(test_data_store_dir, file_name),
            )

        print(f"[+] Dataset is stored")
def build_rawdata(
    raw_spot_rawdata_dir=CONFIG["raw_spot_rawdata_dir"],
    raw_future_rawdata_dir=CONFIG["raw_future_rawdata_dir"],
    cleaned_rawdata_store_dir=CONFIG["cleaned_rawdata_store_dir"],
    candidate_assets_path=CONFIG["candidate_assets_path"],
    query_min_start_dt=CONFIG["query_min_start_dt"],
    boundary_dt_must_have_data=CONFIG["boundary_dt_must_have_data"],
):
    make_dirs([cleaned_rawdata_store_dir])
    candidate_assets = load_text(path=candidate_assets_path)

    count_files = 0
    for candidate_asset in tqdm(candidate_assets):
        spot_file_path = os.path.join(raw_spot_rawdata_dir,
                                      f"{candidate_asset}.parquet")
        future_file_path = os.path.join(raw_future_rawdata_dir,
                                        f"{candidate_asset}.parquet")

        # Load OHLC candles for spot and future markets, sorted by timestamp
        spot_df = pd.read_parquet(spot_file_path)[[
            "open", "high", "low", "close"
        ]].sort_index()
        future_df = pd.read_parquet(future_file_path)[[
            "open", "high", "low", "close"
        ]].sort_index()

        # Use spot prices up to the first future timestamp, then switch to future
        # prices, and forward-fill onto a 1-minute grid
        df = pd.concat(
            [spot_df[spot_df.index < future_df.index[0]], future_df])
        df = df.resample("1T").ffill()

        df = df[query_min_start_dt:]
        if df.index[0] > pd.Timestamp(boundary_dt_must_have_data):
            print(f"[!] Skiped: {candidate_asset}")
            continue

        # Sanity checks: no missing values, no duplicated timestamps
        assert not df.isnull().any().any()
        assert len(df.index.unique()) == len(df.index)

        store_filename = candidate_asset + ".parquet.zstd"
        df.index = df.index.tz_localize("utc")
        to_parquet(df=df,
                   path=os.path.join(cleaned_rawdata_store_dir,
                                     store_filename))
        count_files += 1

    print(f"[+] Built rawdata: {count_files}")
    def generate(self, save_dir=None):
        assert self.mode == "test"
        self.model.eval()

        if save_dir is None:
            save_dir = self.data_config["generate_output_dir"]

        # Shift the index forward by 1 minute to match the execution logic (entry: open, exit: open)
        index = self.test_data_loader.dataset.index
        index = index.set_levels(index.levels[0] + pd.Timedelta(minutes=1), level=0)

        predictions = []
        labels = []
        probabilities = []
        for _ in tqdm(range(len(self.test_data_loader))):
            test_data_dict = self._generate_test_data_dict()

            pred_abs_factor, pred_sign_factor = self.model(
                x=test_data_dict["X"], id=test_data_dict["ID"]
            )
            preds = self._invert_to_prediction(
                pred_abs_factor=pred_abs_factor, pred_sign_factor=pred_sign_factor
            )

            predictions += preds.view(-1).cpu().tolist()
            labels += test_data_dict["Y"].view(-1).cpu().tolist()
            probabilities += (
                self._build_probabilities(pred_sign_factor=pred_sign_factor)
                .view(-1)
                .cpu()
                .tolist()
            )

        predictions = (
            pd.Series(predictions, index=index)
            .sort_index()
            .unstack()[self.dataset_params["labels_columns"]]
        )
        labels = (
            pd.Series(labels, index=index)
            .sort_index()
            .unstack()[self.dataset_params["labels_columns"]]
        )
        probabilities = (
            pd.Series(probabilities, index=index)
            .sort_index()
            .unstack()[self.dataset_params["labels_columns"]]
        )

        # Rescale
        predictions = inverse_preprocess_data(
            data=predictions * self.dataset_params["winsorize_threshold"],
            scaler=self.label_scaler,
        )
        labels = inverse_preprocess_data(
            data=labels * self.dataset_params["winsorize_threshold"],
            scaler=self.label_scaler,
        )

        prediction_abs_bins = self._build_abs_bins(df=predictions)
        probability_bins = self._build_abs_bins(df=probabilities)

        # Store signals
        for data_type, data in [
            ("predictions", predictions),
            ("labels", labels),
            ("probabilities", probabilities),
            ("prediction_abs_bins", prediction_abs_bins),
            ("probability_bins", probability_bins),
        ]:
            to_parquet(
                df=data, path=os.path.join(save_dir, f"{data_type}.parquet.zstd"),
            )
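
        # Hedged usage sketch (to be run outside this class): reading the stored
        # signals back. Assumes pandas is imported as pd and that save_dir points
        # at data_config["generate_output_dir"]; variable names are illustrative.
        #
        #   predictions = pd.read_parquet(
        #       os.path.join(save_dir, "predictions.parquet.zstd"))
        #   probabilities = pd.read_parquet(
        #       os.path.join(save_dir, "probabilities.parquet.zstd"))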