Example #1
def download(username, key):
    for config in [SPOT_CONFIG, FUTURE_CONFIG]:
        dataset_name = config["dataset_name"]
        store_dir = config["store_dir"]

        make_dirs([store_dir])

        # Set env to authenticate
        os.environ["KAGGLE_USERNAME"] = username
        os.environ["KAGGLE_KEY"] = key
        kaggle.api.authenticate()

        # Download
        kaggle.api.dataset_download_files(dataset_name,
                                          path=store_dir,
                                          unzip=True)
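
A minimal call sketch, assuming SPOT_CONFIG, FUTURE_CONFIG and make_dirs come from the surrounding project and that credentials live in the standard Kaggle credential file (~/.kaggle/kaggle.json):

import json
import os

# Hypothetical caller: read the credentials the Kaggle CLI stores in
# ~/.kaggle/kaggle.json and forward them to download().
with open(os.path.expanduser("~/.kaggle/kaggle.json")) as f:
    creds = json.load(f)

download(username=creds["username"], key=creds["key"])
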
Example #2
    def store_artifacts(
        self,
        features,
        labels,
        pricing,
        feature_scaler,
        label_scaler,
        train_ratio,
        params,
        data_store_dir,
    ):
        # Make dirs
        train_data_store_dir = os.path.join(data_store_dir, "train")
        test_data_store_dir = os.path.join(data_store_dir, "test")
        make_dirs([train_data_store_dir, test_data_store_dir])

        # Store params
        joblib.dump(feature_scaler,
                    os.path.join(data_store_dir, "feature_scaler.pkl"))
        joblib.dump(label_scaler,
                    os.path.join(data_store_dir, "label_scaler.pkl"))

        with open(os.path.join(data_store_dir, "dataset_params.json"),
                  "w") as f:
            json.dump(params, f)

        print(f"[+] Metadata is stored")

        # Store dataset
        boundary_index = int(len(features.index) * train_ratio)

        for file_name, data in [
            ("X.parquet.zstd", features),
            ("Y.parquet.zstd", labels),
            ("pricing.parquet.zstd", pricing),
        ]:
            to_parquet(
                df=data.iloc[:boundary_index],
                path=os.path.join(train_data_store_dir, file_name),
            )

            to_parquet(
                df=data.iloc[boundary_index:],
                path=os.path.join(test_data_store_dir, file_name),
            )

        print(f"[+] Dataset is stored")
Example #3
def build_rawdata(
    raw_spot_rawdata_dir=CONFIG["raw_spot_rawdata_dir"],
    raw_future_rawdata_dir=CONFIG["raw_future_rawdata_dir"],
    cleaned_rawdata_store_dir=CONFIG["cleaned_rawdata_store_dir"],
    candidate_assets_path=CONFIG["candidate_assets_path"],
    query_min_start_dt=CONFIG["query_min_start_dt"],
    boundary_dt_must_have_data=CONFIG["boundary_dt_must_have_data"],
):
    make_dirs([cleaned_rawdata_store_dir])
    candidate_assets = load_text(path=candidate_assets_path)

    count_files = 0
    for candidate_asset in tqdm(candidate_assets):
        spot_file_path = os.path.join(raw_spot_rawdata_dir,
                                      f"{candidate_asset}.parquet")
        future_file_path = os.path.join(raw_future_rawdata_dir,
                                        f"{candidate_asset}.parquet")

        spot_df = pd.read_parquet(spot_file_path)[[
            "open", "high", "low", "close"
        ]].sort_index()
        future_df = pd.read_parquet(future_file_path)[[
            "open", "high", "low", "close"
        ]].sort_index()

        # Use spot history up to the first future bar, then switch to the
        # future series; resample onto a regular 1-minute grid and forward-fill.
        df = pd.concat(
            [spot_df[spot_df.index < future_df.index[0]], future_df])
        df = df.resample("1min").ffill()

        df = df[query_min_start_dt:]
        if df.index[0] > pd.Timestamp(boundary_dt_must_have_data):
            print(f"[!] Skiped: {candidate_asset}")
            continue

        assert not df.isnull().any().any()
        assert len(df.index.unique()) == len(df.index)

        store_filename = candidate_asset + ".parquet.zstd"
        df.index = df.index.tz_localize("utc")
        to_parquet(df=df,
                   path=os.path.join(cleaned_rawdata_store_dir,
                                     store_filename))
        count_files += 1

    print(f"[+] Built rawdata: {count_files}")
Example #4
    def build(self):
        self.report_store_dir = os.path.join(self.exp_dir, "reports/")
        make_dirs([self.report_store_dir])

        self.historical_data_dict = self._build_historical_data_dict(
            base_currency=self.base_currency,
            historical_data_path_dict={
                "pricing": os.path.join(self.dataset_dir, "test/pricing.parquet.zstd"),
                "predictions": os.path.join(
                    self.exp_dir, "generated_output/predictions.parquet.zstd"
                ),
                "probabilities": os.path.join(
                    self.exp_dir, "generated_output/probabilities.parquet.zstd"
                ),
                "labels": os.path.join(
                    self.exp_dir, "generated_output/labels.parquet.zstd"
                ),
            },
        )
        self.tradable_coins = self.historical_data_dict["predictions"].columns
        self.index = (
            self.historical_data_dict["predictions"]
            .index.intersection(self.historical_data_dict["pricing"].index)
            .sort_values()
        )
        for key in self.historical_data_dict.keys():
            self.historical_data_dict[key] = self.historical_data_dict[key].reindex(
                self.index
            )

        prediction_abs_bins = self._load_prediction_abs_bins()
        probability_bins = self._load_probability_bins()
        self._set_bins(
            prediction_abs_bins=prediction_abs_bins,
            probability_bins=probability_bins,
            index=self.tradable_coins,
        )
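
The alignment step keeps only timestamps present in both the prediction and pricing indexes, then reindexes every frame onto that common index; a toy illustration with made-up indexes:

import pandas as pd

idx_predictions = pd.date_range("2021-01-01 00:00", periods=5, freq="1min")
idx_pricing = pd.date_range("2021-01-01 00:02", periods=5, freq="1min")
common = idx_predictions.intersection(idx_pricing).sort_values()
aligned = pd.DataFrame({"p": range(5)}, index=idx_predictions).reindex(common)
# aligned now covers only 00:02, 00:03 and 00:04
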
Example #5
    def build(
        self,
        rawdata_dir=CONFIG["rawdata_dir"],
        data_store_dir=CONFIG["data_store_dir"],
        lookahead_window=CONFIG["lookahead_window"],
        train_ratio=CONFIG["train_ratio"],
        scaler_type=CONFIG["scaler_type"],
        winsorize_threshold=CONFIG["winsorize_threshold"],
        query_min_start_dt=CONFIG["query_min_start_dt"],
    ):
        assert scaler_type in ("RobustScaler", "StandardScaler")
        pandarallel.initialize()

        # Make dirs
        make_dirs([data_store_dir])

        # Set file_names
        file_names = sorted(glob(os.path.join(rawdata_dir, "*")))
        assert len(file_names) != 0

        # Build rawdata
        rawdata = self.build_rawdata(file_names=file_names,
                                     query_min_start_dt=query_min_start_dt)
        gc.collect()

        # Build features
        features, class_features = self.build_features(rawdata=rawdata)
        self.feature_scaler = self.build_scaler(data=features,
                                                scaler_type=scaler_type)
        features = self.preprocess_features(
            features=features, winsorize_threshold=winsorize_threshold)
        features = pd.concat([features, class_features],
                             axis=1)[self.features_columns].sort_index()
        gc.collect()

        # Build labels
        labels = self.build_labels(rawdata=rawdata,
                                   lookahead_window=lookahead_window)
        self.label_scaler = self.build_scaler(data=labels,
                                              scaler_type=scaler_type)
        labels = self.preprocess_labels(
            labels=labels, winsorize_threshold=winsorize_threshold)
        gc.collect()

        # Masking with common index
        common_index = features.index.intersection(labels.index).sort_values()
        features = features.reindex(common_index)
        labels = labels.reindex(common_index)
        pricing = rawdata.reindex(common_index)

        params = {
            "lookahead_window": lookahead_window,
            "train_ratio": train_ratio,
            "scaler_type": scaler_type,
            "features_columns": features.columns.tolist(),
            "labels_columns": labels.columns.tolist(),
            "tradable_coins": self.tradable_coins,
            "winsorize_threshold": winsorize_threshold,
            "query_min_start_dt": query_min_start_dt,
        }

        # Store artifacts
        self.store_artifacts(
            features=features,
            labels=labels,
            pricing=pricing,
            feature_scaler=self.feature_scaler,
            label_scaler=self.label_scaler,
            train_ratio=train_ratio,
            params=params,
            data_store_dir=data_store_dir,
        )
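
build_scaler and the preprocess_* helpers are not shown in these examples; purely as a sketch, and assuming the scaler_type strings map onto the scikit-learn classes of the same name, a minimal build_scaler could look like this:

from sklearn.preprocessing import RobustScaler, StandardScaler

# Hypothetical helper: pick the scaler class named by scaler_type and fit it.
def build_scaler_sketch(data, scaler_type):
    scaler_cls = {"RobustScaler": RobustScaler,
                  "StandardScaler": StandardScaler}[scaler_type]
    return scaler_cls().fit(data)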