def store_report(self, report):
    # Store summary metrics and the full report as zstd-compressed parquet
    metrics = self.build_metrics().to_frame().T
    to_parquet(
        df=metrics.astype("float32"),
        path=os.path.join(
            self.report_store_dir,
            f"metrics_{self.report_prefix}_{self.base_currency}.parquet.zstd",
        ),
    )
    to_parquet(
        df=report,
        path=os.path.join(
            self.report_store_dir,
            f"report_{self.report_prefix}_{self.base_currency}.parquet.zstd",
        ),
    )

    # Persist the backtest parameters alongside the report
    params = {
        "base_currency": self.base_currency,
        "position_side": self.position_side,
        "entry_ratio": self.entry_ratio,
        "commission": self.commission,
        "min_holding_minutes": self.min_holding_minutes,
        "max_holding_minutes": self.max_holding_minutes,
        "compound_interest": self.compound_interest,
        "order_criterion": self.order_criterion,
        "possible_in_debt": self.possible_in_debt,
        "achieved_with_commission": self.achieved_with_commission,
        "max_n_updated": self.max_n_updated,
        "tradable_coins": tuple(self.tradable_coins.tolist()),
        "exit_if_achieved": self.exit_if_achieved,
        "achieve_ratio": self.achieve_ratio,
        "positive_entry_threshold": self.positive_entry_threshold,
        "negative_entry_threshold": self.negative_entry_threshold,
        "exit_threshold": self.exit_threshold,
        "positive_probability_threshold": self.positive_probability_threshold,
        "negative_probability_threshold": self.negative_probability_threshold,
        "adjust_prediction": self.adjust_prediction,
    }
    with open(
        os.path.join(
            self.report_store_dir,
            f"params_{self.report_prefix}_{self.base_currency}.json",
        ),
        "w",
    ) as f:
        json.dump(params, f)

    print(f"[+] Report is stored: {self.report_prefix}_{self.base_currency}")
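# Illustrative sketch (not part of the original code): reading back the artifacts
# written by store_report. The directory, prefix, and currency below are hypothetical
# placeholders; substitute whatever values the backtester was configured with.
import json
import os

import pandas as pd

report_store_dir = "./reports"  # assumption: example report_store_dir
report_prefix = "v1"            # assumption: example report_prefix
base_currency = "USDT"          # assumption: example base_currency

metrics = pd.read_parquet(
    os.path.join(report_store_dir, f"metrics_{report_prefix}_{base_currency}.parquet.zstd")
)
report = pd.read_parquet(
    os.path.join(report_store_dir, f"report_{report_prefix}_{base_currency}.parquet.zstd")
)
with open(os.path.join(report_store_dir, f"params_{report_prefix}_{base_currency}.json")) as f:
    params = json.load(f)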
def store_artifacts(
    self,
    features,
    labels,
    pricing,
    feature_scaler,
    label_scaler,
    train_ratio,
    params,
    data_store_dir,
):
    # Make dirs
    train_data_store_dir = os.path.join(data_store_dir, "train")
    test_data_store_dir = os.path.join(data_store_dir, "test")
    make_dirs([train_data_store_dir, test_data_store_dir])

    # Store params
    joblib.dump(feature_scaler, os.path.join(data_store_dir, "feature_scaler.pkl"))
    joblib.dump(label_scaler, os.path.join(data_store_dir, "label_scaler.pkl"))

    with open(os.path.join(data_store_dir, "dataset_params.json"), "w") as f:
        json.dump(params, f)

    print("[+] Metadata is stored")

    # Store dataset
    boundary_index = int(len(features.index) * train_ratio)

    for file_name, data in [
        ("X.parquet.zstd", features),
        ("Y.parquet.zstd", labels),
        ("pricing.parquet.zstd", pricing),
    ]:
        to_parquet(
            df=data.iloc[:boundary_index],
            path=os.path.join(train_data_store_dir, file_name),
        )
        to_parquet(
            df=data.iloc[boundary_index:],
            path=os.path.join(test_data_store_dir, file_name),
        )

    print("[+] Dataset is stored")
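# Illustrative sketch (not part of the original code): loading the train/test split
# written by store_artifacts. "data_store_dir" is a hypothetical example path.
import json
import os

import joblib
import pandas as pd

data_store_dir = "./data/dataset_v1"  # assumption: example data_store_dir

X_train = pd.read_parquet(os.path.join(data_store_dir, "train", "X.parquet.zstd"))
Y_train = pd.read_parquet(os.path.join(data_store_dir, "train", "Y.parquet.zstd"))
X_test = pd.read_parquet(os.path.join(data_store_dir, "test", "X.parquet.zstd"))

feature_scaler = joblib.load(os.path.join(data_store_dir, "feature_scaler.pkl"))
with open(os.path.join(data_store_dir, "dataset_params.json")) as f:
    dataset_params = json.load(f)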
def build_rawdata(
    raw_spot_rawdata_dir=CONFIG["raw_spot_rawdata_dir"],
    raw_future_rawdata_dir=CONFIG["raw_future_rawdata_dir"],
    cleaned_rawdata_store_dir=CONFIG["cleaned_rawdata_store_dir"],
    candidate_assets_path=CONFIG["candidate_assets_path"],
    query_min_start_dt=CONFIG["query_min_start_dt"],
    boundary_dt_must_have_data=CONFIG["boundary_dt_must_have_data"],
):
    make_dirs([cleaned_rawdata_store_dir])
    candidate_assets = load_text(path=candidate_assets_path)

    count_files = 0
    for candidate_asset in tqdm(candidate_assets):
        spot_file_path = os.path.join(
            raw_spot_rawdata_dir, f"{candidate_asset}.parquet"
        )
        future_file_path = os.path.join(
            raw_future_rawdata_dir, f"{candidate_asset}.parquet"
        )

        spot_df = pd.read_parquet(spot_file_path)[
            ["open", "high", "low", "close"]
        ].sort_index()
        future_df = pd.read_parquet(future_file_path)[
            ["open", "high", "low", "close"]
        ].sort_index()

        # Use spot history only up to the first future bar, then switch to futures,
        # and forward-fill onto a regular 1-minute grid.
        df = pd.concat([spot_df[spot_df.index < future_df.index[0]], future_df])
        df = df.resample("1T").ffill()
        df = df[query_min_start_dt:]

        # Skip assets whose history starts too late to cover the required boundary.
        if df.index[0] > pd.Timestamp(boundary_dt_must_have_data):
            print(f"[!] Skipped: {candidate_asset}")
            continue

        assert not df.isnull().any().any()
        assert len(df.index.unique()) == len(df.index)

        store_filename = candidate_asset + ".parquet.zstd"
        df.index = df.index.tz_localize("utc")
        to_parquet(
            df=df, path=os.path.join(cleaned_rawdata_store_dir, store_filename)
        )
        count_files += 1

    print(f"[+] Built rawdata: {count_files}")
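# Illustrative sketch on synthetic data (not from the repo): the core stitching step
# of build_rawdata in isolation. Spot rows that pre-date the first future bar are
# kept, the future series is appended, and the result is forward-filled to 1-minute bars.
import pandas as pd

spot_idx = pd.date_range("2021-01-01 00:00", periods=5, freq="1T")
future_idx = pd.date_range("2021-01-01 00:03", periods=5, freq="2T")  # sparser, later start

spot_df = pd.DataFrame({"open": 1.0, "high": 1.0, "low": 1.0, "close": 1.0}, index=spot_idx)
future_df = pd.DataFrame({"open": 2.0, "high": 2.0, "low": 2.0, "close": 2.0}, index=future_idx)

# Keep only the spot history that ends before the first future timestamp,
# append the future bars, then fill gaps on a 1-minute grid.
df = pd.concat([spot_df[spot_df.index < future_df.index[0]], future_df])
df = df.resample("1T").ffill()
print(df)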
def generate(self, save_dir=None):
    assert self.mode in ("test",)
    self.model.eval()

    if save_dir is None:
        save_dir = self.data_config["generate_output_dir"]

    # Shift the index forward by 1 minute so signals align with the execution bar
    # (entry: open, exit: open)
    index = self.test_data_loader.dataset.index
    index = index.set_levels(index.levels[0] + pd.Timedelta(minutes=1), level=0)

    predictions = []
    labels = []
    probabilities = []

    for _ in tqdm(range(len(self.test_data_loader))):
        test_data_dict = self._generate_test_data_dict()

        pred_abs_factor, pred_sign_factor = self.model(
            x=test_data_dict["X"], id=test_data_dict["ID"]
        )
        preds = self._invert_to_prediction(
            pred_abs_factor=pred_abs_factor, pred_sign_factor=pred_sign_factor
        )

        predictions += preds.view(-1).cpu().tolist()
        labels += test_data_dict["Y"].view(-1).cpu().tolist()
        probabilities += (
            self._build_probabilities(pred_sign_factor=pred_sign_factor)
            .view(-1)
            .cpu()
            .tolist()
        )

    predictions = (
        pd.Series(predictions, index=index)
        .sort_index()
        .unstack()[self.dataset_params["labels_columns"]]
    )
    labels = (
        pd.Series(labels, index=index)
        .sort_index()
        .unstack()[self.dataset_params["labels_columns"]]
    )
    probabilities = (
        pd.Series(probabilities, index=index)
        .sort_index()
        .unstack()[self.dataset_params["labels_columns"]]
    )

    # Rescale back to the original label space
    predictions = inverse_preprocess_data(
        data=predictions * self.dataset_params["winsorize_threshold"],
        scaler=self.label_scaler,
    )
    labels = inverse_preprocess_data(
        data=labels * self.dataset_params["winsorize_threshold"],
        scaler=self.label_scaler,
    )

    prediction_abs_bins = self._build_abs_bins(df=predictions)
    probability_bins = self._build_abs_bins(df=probabilities)

    # Store signals
    for data_type, data in [
        ("predictions", predictions),
        ("labels", labels),
        ("probabilities", probabilities),
        ("prediction_abs_bins", prediction_abs_bins),
        ("probability_bins", probability_bins),
    ]:
        to_parquet(
            df=data,
            path=os.path.join(save_dir, f"{data_type}.parquet.zstd"),
        )
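# Illustrative sketch (not part of the original code): reading the signal files
# written by generate(). "save_dir" is a hypothetical output directory; the real
# default comes from data_config["generate_output_dir"].
import os

import pandas as pd

save_dir = "./generated"  # assumption: example output directory

signals = {
    name: pd.read_parquet(os.path.join(save_dir, f"{name}.parquet.zstd"))
    for name in (
        "predictions",
        "labels",
        "probabilities",
        "prediction_abs_bins",
        "probability_bins",
    )
}

# Each frame is indexed by execution timestamp (shifted +1 minute in generate)
# with one column per label column.
print(signals["predictions"].head())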