EXAMPLE_PREDS_COL ) # download all the things napi = NumerAPI() current_round = napi.get_current_round() # Tournament data changes every week so we specify the round in their name. Training # and validation data only change periodically, so no need to download them every time. print('Downloading dataset files...') Path("./v4").mkdir(parents=False, exist_ok=True) napi.download_dataset("v4/train.parquet") napi.download_dataset("v4/validation.parquet") napi.download_dataset("v4/live.parquet", f"v4/live_{current_round}.parquet") napi.download_dataset("v4/validation_example_preds.parquet") napi.download_dataset("v4/features.json") print('Reading minimal training data') # read the feature metadata and get a feature set (or all the features) with open("v4/features.json", "r") as f: feature_metadata = json.load(f) # features = list(feature_metadata["feature_stats"].keys()) # get all the features # features = feature_metadata["feature_sets"]["small"] # get the small feature set features = feature_metadata["feature_sets"]["medium"] # get the medium feature set # read in just those features along with era and target columns read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]
# a value of 10 means use every 10th row downsample_cross_val = 20 downsample_full_train = 2 # if model_selection_loop=True get OOS performance for training_data # and use that to select best model # if model_selection_loop=False, just predict on tournament data using existing models and model config model_selection_loop = True model_config_name = "advanced_example_model" napi = NumerAPI() current_round = napi.get_current_round() Path("./v4").mkdir(parents=False, exist_ok=True) napi.download_dataset("v4/train.parquet") napi.download_dataset("v4/features.json") print("Entering model selection loop. This may take awhile.") if model_selection_loop: model_config = {} print('reading training_data') training_data = pd.read_parquet('v4/train.parquet') # keep track of some prediction columns ensemble_cols = set() pred_cols = set() # pick some targets to use possible_targets = [ c for c in training_data.columns if c.startswith("target_")