def apply_test_data():
    """Rebuild the TabNet model configuration and load ten saved checkpoints.

    The configs mirror a TabNet classification setup with a batch size of
    ``2500 * 3 * 2 * 2`` and a learning rate scaled by ``batch_size / 1024``.
    Checkpoints are read from ``Analysis/basic_tabnet_rep{0..9}``.
    """
    # All four values are unpacked so the shape of data_load()'s result is
    # still checked, even though only the column names are used below.
    data, test_data, cat_col_names, num_col_names = data_load()

    batch_size = 2500 * 3 * 2 * 2

    data_cfg = DataConfig(
        target=["target"],
        continuous_cols=num_col_names,
        categorical_cols=cat_col_names,
        num_workers=4,
    )
    trainer_cfg = TrainerConfig(
        auto_lr_find=True,
        batch_size=batch_size,
        max_epochs=100,
        gpus=1,
    )
    optimizer_cfg = OptimizerConfig()
    model_cfg = TabNetModelConfig(
        task="classification",
        learning_rate=1e-3 * batch_size / 1024,
        n_d=24,
        n_a=24,
        n_steps=5,
        gamma=1.3,
    )
    tabular_mode = TabularModel(
        data_config=data_cfg,
        optimizer_config=optimizer_cfg,
        model_config=model_cfg,
        trainer_config=trainer_cfg,
    )

    # NOTE(review): the value returned by load_from_checkpoint is discarded;
    # confirm whether the loaded models were meant to be kept and used.
    for fold in range(10):
        tabular_mode.load_from_checkpoint(dir=f"Analysis/basic_tabnet_rep{fold}")
def test_classification(
    classification_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    num_gaussian,
):
    """Expect an AssertionError when an MDN config is used for classification.

    Parameter combinations that provide no feature columns at all are
    treated as trivially passing.
    """
    train, test, target = classification_data

    if len(continuous_cols) + len(categorical_cols) == 0:
        return  # nothing to model

    data_cfg = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_kwargs = {"task": "classification"}
    model_kwargs['mdn_config'] = MixtureDensityHeadConfig(num_gaussian=num_gaussian)
    model_cfg = CategoryEmbeddingMDNConfig(**model_kwargs)
    trainer_cfg = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=0,
        fast_dev_run=True,
    )
    optimizer_cfg = OptimizerConfig()

    # Both construction and fitting sit inside the raises-context, so the
    # assertion may come from either step.
    with pytest.raises(AssertionError):
        tabular_model = TabularModel(
            data_config=data_cfg,
            model_config=model_cfg,
            optimizer_config=optimizer_cfg,
            trainer_config=trainer_cfg,
        )
        tabular_model.fit(train=train, test=test)
def test_feature_extractor(
    regression_data,
    model_config_class,
    continuous_cols,
    categorical_cols,
):
    """Fit a regression model and check DeepFeatureExtractor output columns.

    The transformed test frame must contain at least one column whose name
    includes ``"backbone"``.
    """
    train, test, target = regression_data

    data_cfg = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
    )
    model_cfg = model_config_class(**{"task": "regression"})
    trainer_cfg = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=0,
        fast_dev_run=True,
    )
    optimizer_cfg = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )
    tabular_model.fit(train=train, test=test)

    extractor = DeepFeatureExtractor(tabular_model)
    enc_df = extractor.fit_transform(test)

    backbone_cols = [name for name in enc_df.columns if "backbone" in name]
    assert any(backbone_cols)
def test_embedding_transformer(regression_data):
    """Fit a TabTransformer and validate CategoricalEmbeddingTransformer.

    Checks that the learned mapping for ``HouseAgeBin`` has one more key than
    the number of unique training values, and that every embedding vector is
    as long as the number of generated ``HouseAgeBin_embed_dim`` columns.
    """
    train, test, target = regression_data

    numeric_features = [
        "AveRooms",
        "AveBedrms",
        "Population",
        "AveOccup",
        "Latitude",
        "Longitude",
    ]
    data_cfg = DataConfig(
        target=target,
        continuous_cols=numeric_features,
        categorical_cols=["HouseAgeBin"],
    )
    model_cfg = TabTransformerConfig(
        task="regression",
        input_embed_dim=8,
        num_attn_blocks=1,
        num_heads=2,
    )
    trainer_cfg = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_cfg = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )
    tabular_model.fit(train=train, test=test)

    transformer = CategoricalEmbeddingTransformer(tabular_model)
    train_transform = transformer.fit_transform(train)

    embed_cols = [
        name for name in train_transform.columns if "HouseAgeBin_embed_dim" in name
    ]
    n_unique = len(train["HouseAgeBin"].unique())
    house_age_mapping = transformer._mapping["HouseAgeBin"]

    # The mapping is expected to hold one entry more than the unique
    # categories observed in the training data.
    assert n_unique + 1 == len(house_age_mapping.keys())

    length_matches = [
        vec.shape[0] == len(embed_cols) for vec in house_age_mapping.values()
    ]
    assert all(length_matches)
def test_ssl(
    classification_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    ssl_task,
    aug_task,
):
    """Fit a TabTransformer on the ``ssl`` task and check the reported metric.

    Parameter combinations with no feature columns pass trivially.
    """
    train, test, target = classification_data

    if len(continuous_cols) + len(categorical_cols) == 0:
        return  # nothing to model

    data_cfg = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_cfg = TabTransformerConfig(
        task="ssl",
        input_embed_dim=8,
        num_attn_blocks=1,
        num_heads=2,
        ssl_task=ssl_task,
        aug_task=aug_task,
    )
    trainer_cfg = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_cfg = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_mean_squared_error" in result[0].keys()
def __init__(self, model_pars=None, data_pars=None, compute_pars=None):
    """Store the parameter dicts and build a TabularModel when configured.

    Args:
        model_pars: dict whose ``'model_pars'`` entry holds the keyword
            arguments for ``CategoryEmbeddingModelConfig``. When ``None``,
            no model is built and ``self.model`` is set to ``None``.
        data_pars: dict whose ``'cols_model_group_custom'`` entry provides
            the ``'coly'``, ``'colnum'`` and ``'colcat'`` column lists.
            Required when ``model_pars`` is given.
        compute_pars: optional dict whose ``'compute_pars'`` entry holds the
            keyword arguments for ``TrainerConfig``. ``None`` is treated as
            an empty dict.
    """
    self.model_pars, self.compute_pars, self.data_pars = model_pars, compute_pars, data_pars

    if model_pars is None:
        self.model = None
        return

    ###############################################################
    dm = data_pars['cols_model_group_custom']

    data_config = DataConfig(
        # target should always be a list. Multi-targets are only supported
        # for regression. Multi-Task Classification is not implemented.
        target=dm['coly'],
        continuous_cols=dm['colnum'],
        categorical_cols=dm['colcat'],
    )
    model_config = CategoryEmbeddingModelConfig(**model_pars['model_pars'])

    # compute_pars defaults to None in the signature; fall back to an empty
    # dict so the trainer is built with its defaults instead of raising
    # AttributeError on None.get.
    trainer_kwargs = (compute_pars or {}).get('compute_pars', {})
    trainer_config = TrainerConfig(**trainer_kwargs)
    optimizer_config = OptimizerConfig()

    self.config_pars = {
        'data_config': data_config,
        'model_config': model_config,
        'optimizer_config': optimizer_config,
        'trainer_config': trainer_config,
    }
    self.model = TabularModel(**self.config_pars)
    self.guide = None
    self.pred_summary = None  ### All MC summary

    if VERBOSE:
        log(self.guide, self.model)
def test_ssl(
    regression_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    deep_layers,
    batch_norm_continuous_input,
    attention_pooling,
    ssl_task,
    aug_task,
):
    """Fit an AutoInt model on the ``ssl`` task and check the reported metric.

    Parameter combinations with no feature columns pass trivially.
    """
    train, test, target = regression_data

    if len(continuous_cols) + len(categorical_cols) == 0:
        return  # nothing to model

    data_cfg = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_kwargs = {
        "task": "ssl",
        "ssl_task": ssl_task,
        "aug_task": aug_task,
        "deep_layers": deep_layers,
        "batch_norm_continuous_input": batch_norm_continuous_input,
        "attention_pooling": attention_pooling,
    }
    model_cfg = AutoIntConfig(**model_kwargs)
    trainer_cfg = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_cfg = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_mean_squared_error" in result[0].keys()
def test_ssl(regression_data, continuous_cols, categorical_cols,
             continuous_feature_transform, normalize_continuous_features,
             target_range, ssl_task, aug_task):
    """Fit an FTTransformer on the ``ssl`` task and check the reported metric.

    When ``target_range`` is truthy, the (min, max) of each target column in
    the training data is passed to the model config. Parameter combinations
    with no feature columns pass trivially.
    """
    train, test, target = regression_data

    if len(continuous_cols) + len(categorical_cols) == 0:
        return  # nothing to model

    data_cfg = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_kwargs = {
        "task": "ssl",
        "input_embed_dim": 8,
        "num_attn_blocks": 1,
        "num_heads": 2,
        "ssl_task": ssl_task,
        "aug_task": aug_task,
    }
    if target_range:
        model_kwargs["target_range"] = [
            (float(train[col].min()), float(train[col].max()))
            for col in data_cfg.target
        ]
    model_cfg = FTTransformerConfig(**model_kwargs)
    trainer_cfg = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_cfg = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_mean_squared_error" in result[0].keys()
def __init__(self, model_pars=None, data_pars=None, compute_pars=None):
    """Store the parameter dicts and build a TabularModel when configured.

    Args:
        model_pars: dict with an optional ``'model_class'`` entry (the part
            after the last ``"::"`` must be a key of ``MODEL_DICT``;
            defaults to ``"CategoryEmbeddingModelConfig"``) and a
            ``'model_pars'`` entry holding the config keyword arguments.
            When ``None``, no model is built and ``self.model`` is ``None``.
        data_pars: dict whose ``'cols_model_type2'`` entry provides the
            ``'coly'``, ``'colcontinuous'`` and ``'colsparse'`` column lists.
            Required when ``model_pars`` is given.
        compute_pars: optional dict with ``'compute_pars'`` (TrainerConfig
            kwargs) and ``'optimizer_pars'`` (OptimizerConfig kwargs).
            ``None`` is treated as an empty dict.

    Note:
        ``model_pars['model_pars']`` is modified in place: ``num_gaussian``
        is replaced by an ``mdn_config`` for the MDN model, and both keys
        are removed for every other model class.
    """
    self.model_pars, self.compute_pars, self.data_pars = model_pars, compute_pars, data_pars

    if model_pars is None:
        self.model = None
        return

    ###############################################################
    dm = data_pars['cols_model_type2']

    data_config = DataConfig(
        # target should always be a list. Multi-targets are only supported
        # for regression. Multi-Task Classification is not implemented.
        target=dm['coly'],
        continuous_cols=dm['colcontinuous'],
        categorical_cols=dm['colsparse'],
    )

    class_name = model_pars.get(
        'model_class', "CategoryEmbeddingModelConfig").split("::")[-1]
    assert class_name in MODEL_DICT, "ModelConfig not available"

    # Pick the needed ModelConfig ####################################
    model_class = MODEL_DICT[class_name]
    model_kwargs = self.model_pars['model_pars']

    if class_name == "CategoryEmbeddingMDNConfig":
        ### Mixture Density Model
        ## Check https://github.com/manujosephv/pytorch_tabular/blob/main/tests/test_mdn.py#L99
        model_kwargs['mdn_config'] = MixtureDensityHeadConfig(
            num_gaussian=model_kwargs['num_gaussian'])
        # num_gaussian is consumed by the head config and is not passed to
        # the model config itself.
        del model_kwargs['num_gaussian']
    else:
        # These keys cause errors in other ModelConfigs. pop() with a
        # default removes them if present without a bare except that would
        # hide unrelated failures.
        for key in ('num_gaussian', 'mdn_config'):
            model_kwargs.pop(key, None)

    model_config = model_class(**model_pars['model_pars'])

    # compute_pars defaults to None in the signature; fall back to an empty
    # dict so trainer and optimizer are built with their defaults instead
    # of raising AttributeError on None.get.
    compute = compute_pars or {}
    trainer_config = TrainerConfig(**compute.get('compute_pars', {}))
    optimizer_config = OptimizerConfig(**compute.get('optimizer_pars', {}))

    self.config_pars = {
        'data_config': data_config,
        'model_config': model_config,
        'optimizer_config': optimizer_config,
        'trainer_config': trainer_config,
    }
    self.model = TabularModel(**self.config_pars)
    self.guide = None
    self.pred_summary = None  ### All MC summary

    log(self.guide, self.model)
def test_ssl(
    classification_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    ssl_task,
    aug_task,
):
    """Expect an AssertionError when TabNet is configured for the ``ssl`` task.

    Parameter combinations with no feature columns pass trivially.
    """
    train, test, target = classification_data

    if len(continuous_cols) + len(categorical_cols) == 0:
        return  # nothing to model

    data_cfg = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_cfg = TabNetModelConfig(
        task="ssl",
        ssl_task=ssl_task,
        aug_task=aug_task,
    )
    trainer_cfg = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_cfg = OptimizerConfig()

    # Both construction and fitting sit inside the raises-context, so the
    # assertion may come from either step.
    with pytest.raises(AssertionError):
        tabular_model = TabularModel(
            data_config=data_cfg,
            model_config=model_cfg,
            optimizer_config=optimizer_cfg,
            trainer_config=trainer_cfg,
        )
        tabular_model.fit(train=train, test=test)
def test_save_load(
    regression_data,
    model_config_class,
    continuous_cols,
    categorical_cols,
    custom_metrics,
    custom_loss,
    custom_optimizer,
    tmpdir,
):
    """Round-trip a fitted model through save_model / load_from_checkpoint.

    The first test metric reported by the reloaded model must equal the one
    reported by the original model on the same data.
    """
    train, test, target = regression_data

    data_cfg = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
    )

    # The fixture supplies a (config class, kwargs dict) pair; the kwargs
    # dict is updated in place, as in the original test.
    config_cls, config_kwargs = model_config_class
    config_kwargs["task"] = "regression"
    model_cfg = config_cls(**config_kwargs)

    trainer_cfg = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_cfg = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )
    tabular_model.fit(
        train=train,
        test=test,
        metrics=custom_metrics,
        loss=custom_loss,
        optimizer=custom_optimizer,
        optimizer_params={},
    )
    result_1 = tabular_model.evaluate(test)

    sv_dir = tmpdir.mkdir("saved_model")
    tabular_model.save_model(str(sv_dir))
    new_mdl = TabularModel.load_from_checkpoint(str(sv_dir))
    result_2 = new_mdl.evaluate(test)

    metric_before = result_1[0][f"test_{tabular_model.model.hparams.metrics[0]}"]
    metric_after = result_2[0][f"test_{new_mdl.model.hparams.metrics[0]}"]
    assert metric_before == metric_after
def test_regression(
    regression_data,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
    deep_layers,
    batch_norm_continuous_input,
    attention_pooling,
):
    """Fit an AutoInt regressor and check evaluation and prediction output.

    With ``multi_target`` the ``"MedInc"`` column is added as a second
    target. With ``target_range`` the training (min, max) of each target is
    passed to the model config. Parameter combinations with no feature
    columns pass trivially.
    """
    train, test, target = regression_data

    if len(continuous_cols) + len(categorical_cols) == 0:
        return  # nothing to model

    if multi_target:
        targets = target + ["MedInc"]
    else:
        targets = target

    data_cfg = DataConfig(
        target=targets,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )

    model_kwargs = {"task": "regression"}
    if target_range:
        model_kwargs["target_range"] = [
            (float(train[col].min()), float(train[col].max()))
            for col in data_cfg.target
        ]
    model_kwargs["deep_layers"] = deep_layers
    model_kwargs["batch_norm_continuous_input"] = batch_norm_continuous_input
    model_kwargs["attention_pooling"] = attention_pooling
    model_cfg = AutoIntConfig(**model_kwargs)

    trainer_cfg = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_cfg = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_mean_squared_error" in result[0].keys()

    # One prediction row is expected per input row.
    pred_df = tabular_model.predict(test)
    assert pred_df.shape[0] == test.shape[0]
def test_date_encoding(timeseries_data, freq):
    """Check the date-derived columns TabularDatamodule creates per frequency.

    For ``"H"``, ``"D"`` and ``"T"`` the corresponding ``_Hour``,
    ``_Dayofyear`` or ``_Minute`` column must be present after setup. For
    ``"S"`` the setup call itself must raise ``RuntimeError``.
    """
    train, test, target = timeseries_data
    train, valid = train_test_split(train, random_state=42)

    sensor_cols = ["Temperature", "Humidity", "Light", "CO2", "HumidityRatio"]
    data_cfg = DataConfig(
        target=target + ["Occupancy"],
        continuous_cols=sensor_cols,
        categorical_cols=[],
        date_columns=[("date", freq)],
        encode_date_columns=True,
    )
    model_cfg = CategoryEmbeddingModelConfig(task="regression")
    trainer_cfg = TrainerConfig(max_epochs=1, checkpoints=None, early_stopping=None)
    optimizer_cfg = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )
    config = tabular_model.config

    datamodule = TabularDatamodule(
        train=train,
        validation=valid,
        config=config,
        test=test,
    )
    datamodule.prepare_data()

    if freq == "S":
        # Setup must fail for this frequency. Reaching the `assert False`
        # raises AssertionError, which the handler below does not catch,
        # so the test fails.
        try:
            datamodule.setup("fit")
            assert False
        except RuntimeError:
            assert True
        return

    datamodule.setup("fit")
    config = datamodule.config
    train_columns = datamodule.train.columns
    if freq == "H":
        assert "_Hour" in train_columns
    elif freq == "D":
        assert "_Dayofyear" in train_columns
    elif freq == "T":
        assert "_Minute" in train_columns
def test_regression(
    regression_data,
    multi_target,
    embed_categorical,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
):
    """Fit a NODE regressor and check evaluation and prediction output.

    With ``multi_target`` the ``"MedInc"`` column is added as a second
    target. With ``target_range`` the training (min, max) of each target is
    passed to the model config. Parameter combinations with no feature
    columns pass trivially.
    """
    train, test, target = regression_data

    if len(continuous_cols) + len(categorical_cols) == 0:
        return  # nothing to model

    if multi_target:
        targets = target + ["MedInc"]
    else:
        targets = target

    data_cfg = DataConfig(
        target=targets,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )

    model_kwargs = {
        "task": "regression",
        "depth": 2,
        "num_trees": 50,
        "embed_categorical": embed_categorical,
    }
    if target_range:
        model_kwargs["target_range"] = [
            (float(train[col].min()), float(train[col].max()))
            for col in data_cfg.target
        ]
    model_cfg = NodeConfig(**model_kwargs)

    trainer_cfg = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_cfg = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_mean_squared_error" in result[0].keys()

    # One prediction row is expected per input row.
    pred_df = tabular_model.predict(test)
    assert pred_df.shape[0] == test.shape[0]
def load_model(path=""):
    """Restore a saved Model wrapper and its TabularModel checkpoint.

    Reads the pickled wrapper from ``<path>/model/model.pkl`` to recover the
    three parameter dicts, then loads the network itself from
    ``<path>/model/torch_checkpoint``.

    Args:
        path: directory that contains the ``model`` sub-folder.

    Returns:
        A ``(model, session)`` tuple; ``session`` is always ``None``. Both
        are also stored in the module-level globals of the same name.
    """
    global model, session
    import cloudpickle as pickle

    # SECURITY: unpickling executes arbitrary code embedded in the file.
    # Only call this on model directories from a trusted source.
    # The context manager closes the file handle, which the bare
    # open(...) call used previously never did.
    with open(path + '/model/model.pkl', mode='rb') as fp:
        model0 = pickle.load(fp)

    model = Model()  # Empty model
    model.model_pars = model0.model_pars
    model.compute_pars = model0.compute_pars
    model.data_pars = model0.data_pars

    ### Custom part: the network weights live in a separate checkpoint dir.
    model.model = TabularModel.load_from_checkpoint(path + "/model/torch_checkpoint")

    session = None
    return model, session
def __init__(self, model_pars=None, data_pars=None, compute_pars=None):
    """Store the parameter dicts and build a TabularModel when configured.

    Args:
        model_pars: dict with an optional ``'model_class'`` entry (the part
            after the last ``"::"`` must be a key of ``MODEL_DICT``;
            defaults to ``"CategoryEmbeddingModelConfig"``) and an optional
            ``'model_pars'`` entry holding the config keyword arguments.
            When ``None``, no model is built and ``self.model`` is ``None``.
        data_pars: dict whose ``'cols_model_group_custom'`` entry provides
            the ``'coly'``, ``'colnum'`` and ``'colcat'`` column lists.
            Required when ``model_pars`` is given.
        compute_pars: optional dict with ``'compute_pars'`` (TrainerConfig
            kwargs) and ``'optimizer_pars'`` (OptimizerConfig kwargs).
            ``None`` is treated as an empty dict.
    """
    self.model_pars, self.compute_pars, self.data_pars = model_pars, compute_pars, data_pars

    if model_pars is None:
        self.model = None
        return

    ###############################################################
    dm = data_pars['cols_model_group_custom']

    data_config = DataConfig(
        # target should always be a list. Multi-targets are only supported
        # for regression. Multi-Task Classification is not implemented.
        target=dm['coly'],
        continuous_cols=dm['colnum'],
        categorical_cols=dm['colcat'],
    )

    class_name = model_pars.get('model_class', "CategoryEmbeddingModelConfig").split("::")[-1]
    assert class_name in MODEL_DICT, "ModelConfig not available"
    log2(class_name)

    # Pick the needed ModelConfig.
    # NOTE(review): any accepted class name other than the two handled
    # explicitly falls back to NodeConfig; confirm this is intended rather
    # than a lookup in MODEL_DICT.
    if class_name == "CategoryEmbeddingModelConfig":
        model_class = CategoryEmbeddingModelConfig
    elif class_name == "TabNetModelConfig":
        model_class = TabNetModelConfig
    else:
        model_class = NodeConfig

    model_config = model_class(**self.model_pars.get('model_pars', {}))

    # compute_pars defaults to None in the signature; fall back to an empty
    # dict so trainer and optimizer are built with their defaults instead
    # of raising AttributeError on None.get.
    compute = compute_pars or {}
    trainer_config = TrainerConfig(**compute.get('compute_pars', {}))
    optimizer_config = OptimizerConfig(**compute.get('optimizer_pars', {}))

    self.config_pars = {
        'data_config': data_config,
        'model_config': model_config,
        'optimizer_config': optimizer_config,
        'trainer_config': trainer_config,
    }
    self.model = TabularModel(**self.config_pars)
    self.guide = None
    self.pred_summary = None  ### All MC summary

    if VERBOSE:
        log(self.model)
def test2(nrows=10000):
    """Smoke-test a CategoryEmbedding classifier on the covtype test dataset.

    Usage: python source/models/torch_tabular.py test

    Fits with ``fast_dev_run=True``, evaluates on the validation split and
    logs predictions for the first 100 validation rows.
    """
    global model, session

    # NOTE(review): `nrows` is not used; the dataset size is fixed at 1000.
    # Confirm whether test_dataset_covtype(nrows) was intended.
    df, colcat, colnum, coly = test_dataset_covtype(1000)
    target_name = coly
    df.head()

    train, test = train_test_split(df, random_state=42)
    train, val = train_test_split(train, random_state=42)
    num_classes = len(set(train[target_name].values.ravel()))

    data_cfg = DataConfig(
        target=target_name,
        continuous_cols=colnum,
        categorical_cols=colcat,
        continuous_feature_transform=None,  # "quantile_normal"
        normalize_continuous_features=False,
    )
    model_cfg = CategoryEmbeddingModelConfig(
        task="classification",
        metrics=["f1", "accuracy"],
        metrics_params=[{"num_classes": num_classes}, {}],
    )
    trainer_cfg = TrainerConfig(gpus=None, fast_dev_run=True)

    # Built but deliberately not passed to TabularModel below.
    experiment_config = ExperimentConfig(
        project_name="PyTorch Tabular Example",
        run_name="node_forest_cov",
        exp_watch="gradients",
        log_target="wandb",
        log_logits=True,
    )
    optimizer_cfg = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
        # experiment_config=experiment_config,
    )
    tabular_model.fit(train=train, validation=val)

    result = tabular_model.evaluate(val)
    log(result)

    test.drop(columns=target_name, inplace=True)
    first_val_rows = val.iloc[:100, :]
    pred_df = tabular_model.predict(first_val_rows)
    log(pred_df)
def test_save_for_inference(
    regression_data,
    model_config_class,
    continuous_cols,
    categorical_cols,
    custom_metrics,
    custom_loss,
    custom_optimizer,
    save_type,
    tmpdir,
):
    """Fit a model, export it for inference and check the file exists.

    The export goes to ``model.pt`` when ``save_type`` is ``"pytorch"`` and
    to ``model.onnx`` otherwise, inside a fresh ``saved_model`` directory
    under ``tmpdir``.
    """
    (train, test, target) = regression_data
    data_config = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
    )

    # The fixture supplies a (config class, kwargs dict) pair.
    model_config_class, model_config_params = model_config_class
    model_config_params["task"] = "regression"
    model_config = model_config_class(**model_config_params)

    trainer_config = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    tabular_model.fit(
        train=train,
        test=test,
        metrics=custom_metrics,
        loss=custom_loss,
        optimizer=custom_optimizer,
        optimizer_params={},
    )

    sv_dir = tmpdir.mkdir("saved_model")

    # The file name must follow `save_type`. The previous code compared the
    # builtin `type` to "pytorch", which is always False, so the .onnx name
    # was used for every save type.
    file_name = "model.pt" if save_type == "pytorch" else "model.onnx"
    model_path = sv_dir / file_name

    tabular_model.save_model_for_inference(model_path, kind=save_type)
    assert os.path.exists(model_path)
def main():
    """Train, evaluate and save a CategoryEmbedding classifier on synthetic data.

    Generates a mixed-type classification dataset, splits it into
    train/validation/test, fits the model, prints hold-out metrics and saves
    the model to ``Analysis/basic``.
    """
    # ---- Synthetic data and splits ----
    data, cat_col_names, num_col_names = make_mixed_classification(
        n_samples=10000,
        n_features=20,
        n_categories=4,
    )
    train, test = train_test_split(data, random_state=42)
    train, val = train_test_split(train, random_state=42)

    # ---- Configuration ----
    data_cfg = DataConfig(
        target=["target"],
        continuous_cols=num_col_names,
        categorical_cols=cat_col_names,
    )
    trainer_cfg = TrainerConfig(
        auto_lr_find=True,
        batch_size=1024,
        max_epochs=100,
        gpus=1,
    )
    optimizer_cfg = OptimizerConfig()
    model_cfg = CategoryEmbeddingModelConfig(
        task="classification",
        layers="1024-512-512",
        activation="LeakyReLU",
        learning_rate=1e-3,
    )
    tabular_mode = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )

    # ---- Training ----
    tabular_mode.fit(train=train, validation=val)

    # ---- Loss and metrics on new data ----
    result = tabular_mode.evaluate(test)

    # ---- New predictions as a DataFrame ----
    pred_df = tabular_mode.predict(test)
    pred_df.head()
    print_metrics(test['target'], pred_df["prediction"], tag="Holdout")

    # ---- Persist the trained model ----
    tabular_mode.save_model("Analysis/basic")
def test_regression(
    regression_data,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
):
    """Fit a CategoryEmbedding regressor and check evaluation and prediction.

    With ``multi_target`` the ``"MedInc"`` column is added as a second
    target. With ``target_range`` the training (min, max) of each target is
    passed to the model config. Parameter combinations with no feature
    columns pass trivially.
    """
    train, test, target = regression_data

    if len(continuous_cols) + len(categorical_cols) == 0:
        return  # nothing to model

    if multi_target:
        targets = target + ["MedInc"]
    else:
        targets = target

    data_cfg = DataConfig(
        target=targets,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )

    model_kwargs = {"task": "regression"}
    if target_range:
        # .item() converts the pandas/numpy scalar into a plain Python number.
        model_kwargs["target_range"] = [
            (train[col].min().item(), train[col].max().item())
            for col in data_cfg.target
        ]
    model_cfg = CategoryEmbeddingModelConfig(**model_kwargs)

    trainer_cfg = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=0,
    )
    optimizer_cfg = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "valid_loss" in result[0].keys()

    # One prediction row is expected per input row.
    pred_df = tabular_model.predict(test)
    assert pred_df.shape[0] == test.shape[0]
def test_regression(
    regression_data,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    variant,
    num_gaussian,
):
    """Fit a mixture-density regressor and check evaluation and prediction.

    ``variant`` is the model config class to instantiate; it receives an
    ``mdn_config`` built from ``num_gaussian``. With ``multi_target`` the
    ``"MedInc"`` column is added as a second target. Parameter combinations
    with no feature columns pass trivially.
    """
    train, test, target = regression_data

    if len(continuous_cols) + len(categorical_cols) == 0:
        return  # nothing to model

    if multi_target:
        targets = target + ["MedInc"]
    else:
        targets = target

    data_cfg = DataConfig(
        target=targets,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )

    model_kwargs = {"task": "regression"}
    model_kwargs["mdn_config"] = MixtureDensityHeadConfig(num_gaussian=num_gaussian)
    model_cfg = variant(**model_kwargs)

    trainer_cfg = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_cfg = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_mean_squared_error" in result[0].keys()

    # One prediction row is expected per input row.
    pred_df = tabular_model.predict(test)
    assert pred_df.shape[0] == test.shape[0]
def test_classification(
    classification_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    deep_layers,
    batch_norm_continuous_input,
):
    """Fit an AutoInt classifier and check evaluation and prediction output.

    Parameter combinations with no feature columns pass trivially.
    """
    train, test, target = classification_data

    if len(continuous_cols) + len(categorical_cols) == 0:
        return  # nothing to model

    data_cfg = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_cfg = AutoIntConfig(
        task="classification",
        deep_layers=deep_layers,
        batch_norm_continuous_input=batch_norm_continuous_input,
    )
    trainer_cfg = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_cfg = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_accuracy" in result[0].keys()

    # One prediction row is expected per input row.
    pred_df = tabular_model.predict(test)
    assert pred_df.shape[0] == test.shape[0]
def test_classification(
    classification_data,
    continuous_cols,
    categorical_cols,
    embed_categorical,
    continuous_feature_transform,
    normalize_continuous_features,
):
    """Fit a NODE classifier and check evaluation and prediction output.

    Parameter combinations with no feature columns pass trivially.
    """
    train, test, target = classification_data

    if len(continuous_cols) + len(categorical_cols) == 0:
        return  # nothing to model

    data_cfg = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_cfg = NodeConfig(
        task="classification",
        depth=2,
        num_trees=50,
        embed_categorical=embed_categorical,
    )
    trainer_cfg = TrainerConfig(
        max_epochs=1,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_cfg = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "test_accuracy" in result[0].keys()

    # One prediction row is expected per input row.
    pred_df = tabular_model.predict(test)
    assert pred_df.shape[0] == test.shape[0]
def test_classification(
    classification_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
):
    """Fit a CategoryEmbedding classifier and check evaluation and prediction.

    Parameter combinations with no feature columns pass trivially.
    """
    train, test, target = classification_data

    if len(continuous_cols) + len(categorical_cols) == 0:
        return  # nothing to model

    data_cfg = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )
    model_cfg = CategoryEmbeddingModelConfig(task="classification")
    trainer_cfg = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=0,
    )
    optimizer_cfg = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )
    tabular_model.fit(train=train, test=test)

    result = tabular_model.evaluate(test)
    assert "valid_loss" in result[0].keys()

    # One prediction row is expected per input row.
    pred_df = tabular_model.predict(test)
    assert pred_df.shape[0] == test.shape[0]
def main_64():
    """Cross-validate a TabNet (n_d = n_a = 64) classifier and write a submission.

    Runs a 10-fold stratified split over the training data. Each fold fits a
    fresh model with a class-weighted cross-entropy loss, stores the
    out-of-fold predictions, saves the model to
    ``Analysis/basic_tabnet_rep{fold}`` and predicts the external test set.
    The test-set ``0_probability`` values are averaged over the folds,
    turned into labels and written to ``Analysis/submission_2.csv``.
    Hold-out metrics and a confusion matrix are printed at the end.
    """
    # `train` is published as a module-level global, as before.
    global train

    data, test_data, cat_col_names, num_col_names = data_load()
    bsize = 2500 * 3 * 2 * 2

    # ########## Define the Configs ############
    data_config = DataConfig(
        target=["target"],
        continuous_cols=num_col_names,
        categorical_cols=cat_col_names,
        num_workers=4,
    )
    trainer_config = TrainerConfig(
        auto_lr_find=True,
        batch_size=bsize,
        max_epochs=100,
        gpus=1,
    )
    optimizer_config = OptimizerConfig()
    model_config = TabNetModelConfig(
        task="classification",
        learning_rate=1e-3 * bsize / 1024,  # scaled with the batch size
        n_d=64,
        n_a=64,
        n_steps=5,
        gamma=1.3,
    )

    cv = StratifiedKFold(n_splits=10, shuffle=True)
    res_pred = []  # out-of-fold predictions on the training data
    res_test = []  # per-fold predictions on the external test data

    for i, (train_idx, test_idx) in enumerate(cv.split(X=data, y=data.target.values)):
        train, test = data.iloc[train_idx], data.iloc[test_idx]
        train, val = train_test_split(train, random_state=42)

        tabular_mode = TabularModel(
            data_config=data_config,
            optimizer_config=optimizer_config,
            model_config=model_config,
            trainer_config=trainer_config,
        )
        weighted_loss = get_class_weighted_cross_entropy(
            train["target"].values.ravel(), mu=0.1)

        # Training the Model
        tabular_mode.fit(train=train, validation=val, max_epochs=100, loss=weighted_loss)

        pred_df = tabular_mode.predict(test).loc[:, ["prediction"]]
        res_pred.append(pred_df)

        tabular_mode.save_model(f"Analysis/basic_tabnet_rep{i}")

        pred = tabular_mode.predict(test_data)
        res_test.append(pred)

    # Out-of-fold predictions, restored to the original row order.
    pred_tot = pd.concat(res_pred).sort_index()
    print_metrics(data['target'], pred_tot["prediction"], tag="Holdout")

    # Average the class-0 probability over all folds.
    pred_df = pd.concat(
        [res_testi.loc[:, ["0_probability"]] for res_testi in res_test],
        axis=1,
    ).apply(np.mean, axis=1)

    # "0_probability" is the probability of class 0, so a value above 0.5
    # means label 0. The earlier mapping (1 if x > 0.5 else 0) produced
    # inverted labels.
    pred_df2 = pred_df.map(lambda x: 0 if x > 0.5 else 1)

    sample_submisson = pd.read_csv("Data/sample_submission.csv")
    sample_submisson["target"] = pred_df2.values
    sample_submisson.to_csv("Analysis/submission_2.csv", index=False)

    print(confusion_matrix(data['target'], pred_tot["prediction"]))
def test_ssl(
    regression_data,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
    target_transform,
    custom_metrics,
    custom_loss,
    custom_optimizer,
    ssl_task,
    aug_task,
):
    """Fit a CategoryEmbedding model on the ``ssl`` task with custom options.

    The expected metric key is ``test_mean_squared_error`` when no custom
    metrics are given and ``test_fake_metric`` otherwise. Parameter
    combinations with no feature columns pass trivially.
    """
    train, test, target = regression_data

    if len(continuous_cols) + len(categorical_cols) == 0:
        return  # nothing to model

    data_cfg = DataConfig(
        target=target,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )

    model_kwargs = {"task": "ssl", "ssl_task": ssl_task, "aug_task": aug_task}
    if target_range:
        model_kwargs["target_range"] = [
            (float(train[col].min()), float(train[col].max()))
            for col in data_cfg.target
        ]
    model_cfg = CategoryEmbeddingModelConfig(**model_kwargs)

    trainer_cfg = TrainerConfig(
        max_epochs=3,
        checkpoints=None,
        early_stopping=None,
        gpus=None,
        fast_dev_run=True,
    )
    optimizer_cfg = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_cfg,
        model_config=model_cfg,
        optimizer_config=optimizer_cfg,
        trainer_config=trainer_cfg,
    )
    tabular_model.fit(
        train=train,
        test=test,
        metrics=custom_metrics,
        target_transform=target_transform,
        loss=custom_loss,
        optimizer=custom_optimizer,
        optimizer_params={},
    )

    result = tabular_model.evaluate(test)
    if custom_metrics is None:
        expected_key = "test_mean_squared_error"
    else:
        expected_key = "test_fake_metric"
    assert expected_key in result[0].keys()
def main():
    """Cross-validate a NODE classifier and write a submission file.

    Data comes from ``data_load()``.  For each of 10 stratified folds a fresh
    ``TabularModel`` with a ``NodeConfig`` is trained with a class-weighted
    cross-entropy loss, the held-out fold is predicted (and its ROC AUC
    printed), and ``test_data`` is predicted in chunks of 20000 rows.

    The per-fold ``"0_probability"`` columns for ``test_data`` are averaged and
    thresholded at 0.5 to produce ``Analysis/submission_2_node.csv``.
    Out-of-fold predictions are scored with ``print_metrics`` and
    ``confusion_matrix``.
    """
    data, test_data, cat_col_names, num_col_names = data_load()
    bsize = 2500 * 2

    # ---- configs ----
    data_config = DataConfig(
        target=["target"],
        continuous_cols=num_col_names,
        categorical_cols=cat_col_names,
        num_workers=4,
    )
    trainer_config = TrainerConfig(
        auto_lr_find=True,
        batch_size=bsize,
        max_epochs=100,
        gpus=1,
    )
    optimizer_config = OptimizerConfig()
    model_config = NodeConfig(
        task="classification",
        num_layers=2,  # Number of Dense Layers
        num_trees=1024,  # Number of Trees in each layer
        depth=3,  # Depth of each Tree
        # If True, will use a learned embedding, else it will use
        # LeaveOneOutEncoding for categorical columns
        embed_categorical=True,
        learning_rate=1e-3,
        additional_tree_output_dim=5,
    )

    cv = StratifiedKFold(n_splits=10, shuffle=True)
    res_pred = []  # out-of-fold "prediction" frames, one per fold
    res_test = []  # full prediction frames on test_data, one per fold
    ns = 20000  # rows per prediction call on test_data
    for i, (train_idx, test_idx) in enumerate(cv.split(X=data, y=data.target.values)):
        train, test = data.iloc[train_idx], data.iloc[test_idx]
        train, val = train_test_split(train, random_state=42)

        tabular_mode = TabularModel(
            data_config=data_config,
            optimizer_config=optimizer_config,
            model_config=model_config,
            trainer_config=trainer_config,
        )
        weighted_loss = get_class_weighted_cross_entropy(
            train["target"].values.ravel(), mu=0.1
        )

        # Training the Model
        tabular_mode.fit(train=train, validation=val, max_epochs=100, loss=weighted_loss)

        pred_df = tabular_mode.predict(test).loc[:, ["prediction"]]
        res_pred.append(pred_df)
        print(
            f"Fold {i} AUC score: {roc_auc_score(test.target.values, pred_df.prediction.values)}"
        )

        # Predict test_data chunk by chunk.  Stepping by start offset (rather
        # than counting int(rows / ns) full chunks) keeps the final partial
        # chunk, so every test row gets a prediction, and frames shorter than
        # ``ns`` still yield one chunk.  It also no longer reuses the fold
        # index ``i`` as the chunk counter.
        chunk_preds = [
            tabular_mode.predict(test_data.iloc[start:start + ns])
            for start in range(0, test_data.shape[0], ns)
        ]
        res_test.append(pd.concat(chunk_preds))

    # Average the class-0 probability over all folds; above 0.5 means label 0.
    mean_p0 = pd.concat(
        [fold_pred.loc[:, ["0_probability"]] for fold_pred in res_test],
        axis=1,
    ).mean(axis=1)
    pred_df2 = mean_p0.map(lambda x: 0 if x > 0.5 else 1)
    sample_submisson = pd.read_csv("Data/sample_submission.csv")
    sample_submisson["target"] = pred_df2.values

    # Re-assemble the out-of-fold predictions in the original row order.
    pred_tot = pd.concat(res_pred).sort_index()
    print_metrics(data['target'], pred_tot["prediction"], tag="Holdout")

    sample_submisson.to_csv("Analysis/submission_2_node.csv", index=False)
    print(confusion_matrix(data['target'], pred_tot["prediction"]))
def test_dataloader(
    regression_data,
    validation_split,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_transform,
    embedding_dims,
):
    """Build a ``TabularDatamodule`` and check its derived config and loaders.

    Nothing is exercised when both feature lists are empty.  Otherwise the
    test asserts the first categorical cardinality and embedding dimension,
    the rounded mean/std of the first continuous column when normalisation is
    on, and that the validation loader and an inference loader built from the
    same frame produce an identical first ``"continuous"`` batch.
    """
    (train, test, target) = regression_data
    train, valid = train_test_split(train, random_state=42)

    # No features at all: there is nothing to build a datamodule from.
    if len(continuous_cols) + len(categorical_cols) == 0:
        return

    if multi_target:
        targets = target + ["MedInc"]
    else:
        targets = target
    data_config = DataConfig(
        target=targets,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
        validation_split=validation_split,
    )
    model_config = CategoryEmbeddingModelConfig(
        **{"task": "regression", "embedding_dims": embedding_dims}
    )
    trainer_config = TrainerConfig(max_epochs=1, checkpoints=None, early_stopping=None)
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )

    datamodule = TabularDatamodule(
        train=train,
        validation=valid,
        config=tabular_model.config,
        test=test,
        target_transform=target_transform,
    )
    datamodule.prepare_data()
    datamodule.setup("fit")
    # Inspect the config held by the datamodule after setup.
    config = datamodule.config

    if len(categorical_cols) > 0:
        assert config.categorical_cardinality[0] == 5
        if embedding_dims is None:
            expected_dim = 3
        else:
            expected_dim = embedding_dims[0][-1]
        assert config.embedding_dims[0][-1] == expected_dim

    if normalize_continuous_features and len(continuous_cols) > 0:
        first_cont = config.continuous_cols[0]
        assert round(datamodule.train[first_cont].mean()) == 0
        assert round(datamodule.train[first_cont].std()) == 1

    # The regular validation loader and an inference loader over the same
    # frame must yield the same continuous features in their first batch.
    fit_batch = next(iter(datamodule.val_dataloader()))["continuous"]
    infer_batch = next(iter(datamodule.prepare_inference_dataloader(valid)))["continuous"]
    assert np.not_equal(fit_batch, infer_batch).sum().item() == 0
'target' ], #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented continuous_cols=num_col_names, categorical_cols=cat_col_names, ) trainer_config = TrainerConfig( auto_lr_find= True, # Runs the LRFinder to automatically derive a learning rate batch_size=1024, max_epochs=100, gpus=1, #index of the GPU to use. 0, means CPU ) optimizer_config = OptimizerConfig() model_config = CategoryEmbeddingModelConfig( task="classification", layers="1024-512-512", # Number of nodes in each layer activation="LeakyReLU", # Activation between each layers learning_rate=1e-3) tabular_model = TabularModel( data_config=data_config, model_config=model_config, optimizer_config=optimizer_config, trainer_config=trainer_config, ) tabular_model.fit(train=train, validation=val) result = tabular_model.evaluate(test) pred_df = tabular_model.predict(test) tabular_model.save_model("Analysis/basic") loaded_model = TabularModel.load_from_checkpoint("Analysis/basic")
def test2(nrows=10000):
    """Smoke-run a CategoryEmbedding classifier on the UCI covertype data.

    Usage: python source/models/torch_tabular.py test

    Downloads ``covtype.data.gz`` into ``~/data/input/covtype/`` if it is not
    already there, reads the first ``nrows`` rows, fits a ``TabularModel`` in
    ``fast_dev_run`` mode, and logs the evaluation result and the predictions
    for the first 100 validation rows.
    """
    global model, session

    data_dir = Path.home() / 'data/input/covtype/'
    datafile = data_dir / 'covtype.data.gz'
    datafile.parent.mkdir(parents=True, exist_ok=True)
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
    if not datafile.exists():
        wget.download(url, datafile.as_posix())

    target_name = ["Covertype"]
    # Wilderness_Area1..4 followed by Soil_Type1..40.
    colcat = [f"Wilderness_Area{k}" for k in range(1, 5)]
    colcat += [f"Soil_Type{k}" for k in range(1, 41)]
    colnum = [
        "Elevation",
        "Aspect",
        "Slope",
        "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am",
        "Hillshade_Noon",
        "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
    ]
    # The file has no header row; names are given as numeric, categorical, target.
    feature_columns = colnum + colcat + target_name

    df = pd.read_csv(datafile, header=None, names=feature_columns, nrows=nrows)
    df.head()

    train, test = train_test_split(df, random_state=42)
    train, val = train_test_split(train, random_state=42)
    labels_seen = set(train[target_name].values.ravel())
    num_classes = len(labels_seen)

    data_config = DataConfig(
        target=target_name,
        continuous_cols=colnum,
        categorical_cols=colcat,
        continuous_feature_transform=None,  # "quantile_normal",
        normalize_continuous_features=False,
    )
    model_config = CategoryEmbeddingModelConfig(
        task="classification",
        metrics=["f1", "accuracy"],
        # One params dict per metric, in the same order as ``metrics``.
        metrics_params=[{"num_classes": num_classes}, {}],
    )
    trainer_config = TrainerConfig(gpus=None, fast_dev_run=True)
    # Built but not passed to TabularModel below.
    experiment_config = ExperimentConfig(
        project_name="PyTorch Tabular Example",
        run_name="node_forest_cov",
        exp_watch="gradients",
        log_target="wandb",
        log_logits=True,
    )
    optimizer_config = OptimizerConfig()

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
        # experiment_config=experiment_config,
    )
    tabular_model.fit(train=train, validation=val)

    result = tabular_model.evaluate(val)
    log(result)

    test.drop(columns=target_name, inplace=True)
    pred_df = tabular_model.predict(val.iloc[:100, :])
    log(pred_df)