def _set_default_params(self):
    try:
        from autogluon.text import ag_text_presets
    except ImportError:
        raise ImportError(AG_TEXT_IMPORT_ERROR)
    super()._set_default_params()
    self.params = ag_text_presets.create('default')
def electra_base_all_text_e10_no_decay():
    cfg = ag_text_presets.create('electra_base_all_text_e10')
    cfg['models']['MultimodalTextModel']['search_space'][
        'optimization.num_train_epochs'] = 10
    cfg['models']['MultimodalTextModel']['search_space'][
        'optimization.layerwise_lr_decay'] = 1.0
    return cfg
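# A minimal usage sketch for the preset above (names other than the preset
# function are hypothetical): the returned dict plugs into TextPredictor.fit()
# through its `hyperparameters` argument.
def _demo_electra_no_decay_fit(train_df):
    cfg = electra_base_all_text_e10_no_decay()
    # layerwise_lr_decay == 1.0 turns layer-wise decay off, so every
    # transformer layer trains at the same learning rate for all 10 epochs.
    predictor = TextPredictor(label='label')
    predictor.fit(train_data=train_df, hyperparameters=cfg)
    return predictor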
def extract_pretrained_embedding(dataset):
    # Freeze the backbone (no trainable layers, lr=0, one epoch) so that
    # extract_embedding() returns the pretrained, untuned text embeddings.
    hyperparameters = ag_text_presets.create('default')
    search_space = hyperparameters['models']['MultimodalTextModel']['search_space']
    search_space['model.num_trainable_layers'] = 0
    search_space['model._disable_update'] = True
    search_space['optimization.num_train_epochs'] = 1
    search_space['preprocessing.categorical.convert_to_text'] = True
    search_space['optimization.lr'] = 0.
    seed = 123
    train_dataset = dataset_registry.create(dataset, 'train')
    test_dataset = dataset_registry.create(dataset, 'test')
    train_data1, tuning_data1 = sklearn.model_selection.train_test_split(
        train_dataset.data,
        test_size=0.05,
        random_state=np.random.RandomState(seed))
    column_types, inferred_problem_type = infer_column_problem_types(
        train_data1, tuning_data1,
        label_columns=train_dataset.label_columns,
        problem_type=train_dataset.problem_type)
    text_feature_columns = [
        col_name for col_name in train_dataset.feature_columns
        if column_types[col_name] == 'text'
    ]
    train_text_only_data = train_dataset.data[text_feature_columns + train_dataset.label_columns]
    test_text_only_data = test_dataset.data[text_feature_columns + test_dataset.label_columns]
    # Fit on a tiny sample: with the backbone frozen, fit() only needs to
    # build the network before embeddings can be extracted.
    sampled_train_data = train_text_only_data.sample(10)
    predictor = TextPredictor(label=train_dataset.label_columns[0])
    predictor.fit(train_data=sampled_train_data,
                  column_types=column_types,
                  hyperparameters=hyperparameters)
    train_features = predictor.extract_embedding(train_text_only_data)
    test_features = predictor.extract_embedding(test_text_only_data)
    save_base_dir = f'embeddings/{dataset}/pretrain_text_embedding'
    os.makedirs(save_base_dir, exist_ok=True)
    np.save(os.path.join(save_base_dir, 'train.npy'), train_features)
    np.save(os.path.join(save_base_dir, 'test.npy'), test_features)
    with open(os.path.join(save_base_dir, 'text_columns.json'), 'w') as out_f:
        json.dump(text_feature_columns, out_f)
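# Read-back sketch for the artifacts extract_pretrained_embedding() writes;
# the paths mirror the save layout above, `dataset` is a hypothetical name,
# and np/os/json are assumed imported as in the surrounding module.
def _demo_load_pretrained_embedding(dataset):
    base = f'embeddings/{dataset}/pretrain_text_embedding'
    train_features = np.load(os.path.join(base, 'train.npy'))
    test_features = np.load(os.path.join(base, 'test.npy'))
    with open(os.path.join(base, 'text_columns.json')) as f:
        text_columns = json.load(f)
    return train_features, test_features, text_columns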
def get_test_hyperparameters():
    config = ag_text_presets.create('default')
    search_space = config['models']['MultimodalTextModel']['search_space']
    search_space['optimization.num_train_epochs'] = 1
    search_space['model.backbone.name'] = 'google_electra_small'
    return config
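# The preset is a plain nested dict, so a unit test can assert on it directly
# (sketch; the keys match the ones set above).
def _test_get_test_hyperparameters():
    config = get_test_hyperparameters()
    search_space = config['models']['MultimodalTextModel']['search_space']
    assert search_space['optimization.num_train_epochs'] == 1
    assert search_space['model.backbone.name'] == 'google_electra_small'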
def get_preset_models(path,
                      problem_type,
                      eval_metric,
                      hyperparameters,
                      feature_metadata=None,
                      num_classes=None,
                      quantile_levels=None,
                      level: int = 1,
                      ensemble_type=StackerEnsembleModel,
                      ensemble_kwargs: dict = None,
                      ag_args_fit=None,
                      ag_args=None,
                      ag_args_ensemble=None,
                      name_suffix: str = None,
                      default_priorities=None,
                      invalid_model_names: list = None,
                      excluded_model_types: list = None,
                      hyperparameter_preprocess_func=None,
                      hyperparameter_preprocess_kwargs=None,
                      silent=True):
    hyperparameters = process_hyperparameters(hyperparameters)
    if hyperparameter_preprocess_func is not None:
        if hyperparameter_preprocess_kwargs is None:
            hyperparameter_preprocess_kwargs = dict()
        hyperparameters = hyperparameter_preprocess_func(
            hyperparameters, **hyperparameter_preprocess_kwargs)
    if problem_type not in [BINARY, MULTICLASS, REGRESSION, SOFTCLASS, QUANTILE]:
        raise NotImplementedError
    invalid_name_set = set()
    if invalid_model_names is not None:
        invalid_name_set.update(invalid_model_names)
    invalid_type_set = set()
    if excluded_model_types is not None:
        logger.log(20, f'Excluded Model Types: {excluded_model_types}')
        invalid_type_set.update(excluded_model_types)
    if default_priorities is None:
        default_priorities = copy.deepcopy(DEFAULT_MODEL_PRIORITY)
        if problem_type in PROBLEM_TYPE_MODEL_PRIORITY:
            default_priorities.update(PROBLEM_TYPE_MODEL_PRIORITY[problem_type])
    # Fall back to the 'default' level when no per-level config is given.
    level_key = level if level in hyperparameters.keys() else 'default'
    if level_key not in hyperparameters.keys() and level_key == 'default':
        hyperparameters = {'default': hyperparameters}
    hp_level = hyperparameters[level_key]
    model_cfg_priority_dict = defaultdict(list)
    for model_type in hp_level:
        if problem_type == QUANTILE and model_type not in DEFAULT_QUANTILE_MODEL:
            logger.warning(f"Model type '{model_type}' does not support `problem_type='{QUANTILE}'` yet. This model will be ignored.")
            continue
        models_of_type = hp_level[model_type]
        if not isinstance(models_of_type, list):
            models_of_type = [models_of_type]
        model_cfgs_to_process = []
        for model_cfg in models_of_type:
            if model_type in invalid_type_set:
                logger.log(20, f"\tFound '{model_type}' model in hyperparameters, but '{model_type}' is present in `excluded_model_types` and will be removed.")
                continue  # Don't include excluded models
            if isinstance(model_cfg, str):
                # String configs are named presets: AG_TEXT_NN presets come
                # from autogluon.text, everything else from get_preset_custom.
                if model_type == 'AG_TEXT_NN':
                    AG_TEXT_IMPORT_ERROR = ('autogluon.text has not been installed. '
                                            'You may try to install "autogluon.text" first by running '
                                            '`python3 -m pip install autogluon.text`')
                    try:
                        from autogluon.text import ag_text_presets
                    except ImportError:
                        raise ImportError(AG_TEXT_IMPORT_ERROR)
                    model_cfgs_to_process.append(ag_text_presets.create(model_cfg))
                else:
                    model_cfgs_to_process += get_preset_custom(name=model_cfg,
                                                               problem_type=problem_type,
                                                               num_classes=num_classes)
            else:
                model_cfgs_to_process.append(model_cfg)
        for model_cfg in model_cfgs_to_process:
            model_cfg = clean_model_cfg(model_cfg=model_cfg,
                                        model_type=model_type,
                                        ag_args=ag_args,
                                        ag_args_ensemble=ag_args_ensemble,
                                        ag_args_fit=ag_args_fit)
            model_cfg[AG_ARGS]['priority'] = model_cfg[AG_ARGS].get(
                'priority', default_priorities.get(model_type, DEFAULT_CUSTOM_MODEL_PRIORITY))
            model_priority = model_cfg[AG_ARGS]['priority']
            # Check if model_cfg is valid
            is_valid = is_model_cfg_valid(model_cfg, level=level, problem_type=problem_type)
            if AG_ARGS_FIT in model_cfg and not model_cfg[AG_ARGS_FIT]:
                model_cfg.pop(AG_ARGS_FIT)
            if is_valid:
                model_cfg_priority_dict[model_priority].append(model_cfg)
    # Flatten the priority buckets: higher-priority configs train first.
    model_cfg_priority_list = [model
                               for priority in sorted(model_cfg_priority_dict.keys(), reverse=True)
                               for model in model_cfg_priority_dict[priority]]
    if not silent:
        logger.log(20, 'Model configs that will be trained (in order):')
    models = []
    model_args_fit = {}
    for model_cfg in model_cfg_priority_list:
        model = model_factory(model_cfg,
                              path=path,
                              problem_type=problem_type,
                              eval_metric=eval_metric,
                              num_classes=num_classes,
                              quantile_levels=quantile_levels,
                              name_suffix=name_suffix,
                              ensemble_type=ensemble_type,
                              ensemble_kwargs=ensemble_kwargs,
                              invalid_name_set=invalid_name_set,
                              level=level,
                              feature_metadata=feature_metadata)
        invalid_name_set.add(model.name)
        if 'hyperparameter_tune_kwargs' in model_cfg[AG_ARGS]:
            model_args_fit[model.name] = {'hyperparameter_tune_kwargs': model_cfg[AG_ARGS]['hyperparameter_tune_kwargs']}
        if not silent:
            logger.log(20, f'\t{model.name}: \t{model_cfg}')
        models.append(model)
    return models, model_args_fit
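# Illustrative call with hypothetical argument values: build the level-1
# model configs for a binary task. 'GBM' and 'CAT' are standard AutoGluon
# model-type keys; string values such as 'GBMLarge' would instead resolve
# through get_preset_custom() as shown in the function body.
models, model_args_fit = get_preset_models(path='ag_models/',
                                           problem_type=BINARY,
                                           eval_metric='roc_auc',
                                           hyperparameters={'GBM': {}, 'CAT': {}},
                                           silent=False)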
def _set_default_params(self):
    super()._set_default_params()
    try_import_autogluon_text()
    from autogluon.text import ag_text_presets
    self.params = ag_text_presets.create('default')
def train_model(dataset_name,
                text_presets,
                save_dir,
                model,
                tabular_presets,
                num_gpus=None,
                get_competition_results=False,
                seed=123):
    set_seed(seed)
    train_dataset = dataset_registry.create(dataset_name, 'train')
    if get_competition_results:
        test_dataset = dataset_registry.create(dataset_name, 'competition')
    else:
        test_dataset = dataset_registry.create(dataset_name, 'test')
    feature_columns = train_dataset.feature_columns
    label_columns = train_dataset.label_columns
    metric = train_dataset.metric
    problem_type = train_dataset.problem_type
    # Hold out 5% of the training data purely for column/problem-type inference.
    train_data1, tuning_data1 = sklearn.model_selection.train_test_split(
        train_dataset.data,
        test_size=0.05,
        random_state=np.random.RandomState(seed))
    train_data = train_dataset.data
    test_data = test_dataset.data
    column_types, inferred_problem_type = infer_column_problem_types(
        train_data1, tuning_data1,
        label_columns=label_columns,
        problem_type=problem_type)
    train_data = train_data[feature_columns + label_columns]
    # tuning_data = tuning_data[feature_columns + label_columns]
    if not get_competition_results:
        test_data = test_data[feature_columns + label_columns]
    train_tic = time.time()
    if model == 'ag_tabular_quick':
        MAX_NGRAM = 300
        feature_generator = AutoMLPipelineFeatureGenerator(
            vectorizer=CountVectorizer(min_df=30,
                                       ngram_range=(1, 3),
                                       max_features=MAX_NGRAM,
                                       dtype=np.uint8))
        predictor = TabularPredictor(label=label_columns[0],
                                     path=save_dir,
                                     problem_type=problem_type)
        predictor.fit(train_data, time_limit=30,
                      feature_generator=feature_generator)
    elif model == 'ag_tabular_without_text':
        no_text_feature_columns = []
        for col_name in feature_columns:
            if column_types[col_name] != _TEXT:
                no_text_feature_columns.append(col_name)
        train_data = train_data[no_text_feature_columns + label_columns]
        # tuning_data = tuning_data[no_text_feature_columns + label_columns]
        test_data = test_data[no_text_feature_columns + label_columns]
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          presets=tabular_presets)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          num_bag_folds=5,
                          num_stack_levels=1)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_tabular_old':
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_text_only':
        text_feature_columns = [
            col_name for col_name in feature_columns
            if column_types[col_name] == _TEXT
        ]
        train_data = train_data[text_feature_columns + label_columns]
        test_data = test_data[text_feature_columns + label_columns]
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        # Cap training at 3 epochs on very large datasets.
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model == 'ag_text_multimodal':
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model in ('pre_embedding', 'tune_embedding_multimodal', 'tune_embedding_text'):
        feature_generator = AutoMLPipelineFeatureGenerator(
            enable_text_special_features=False,
            enable_text_ngram_features=False)
        pre_embedding_folder = os.path.join(_CURR_DIR, 'pre_computed_embeddings')
        if model == 'pre_embedding':
            train_features = np.load(os.path.join(pre_embedding_folder, dataset_name,
                                                  'pretrain_text_embedding', 'train.npy'))
            test_features = np.load(os.path.join(pre_embedding_folder, dataset_name,
                                                 'pretrain_text_embedding', 'test.npy'))
        elif model == 'tune_embedding_multimodal':
            train_features = np.load(os.path.join(pre_embedding_folder, dataset_name,
                                                  'multimodal_embedding', 'train.npy'))
            test_features = np.load(os.path.join(pre_embedding_folder, dataset_name,
                                                 'multimodal_embedding', 'test.npy'))
        elif model == 'tune_embedding_text':
            train_features = np.load(os.path.join(pre_embedding_folder, dataset_name,
                                                  'tuned_text_embedding', 'train.npy'))
            test_features = np.load(os.path.join(pre_embedding_folder, dataset_name,
                                                 'tuned_text_embedding', 'test.npy'))
        else:
            raise NotImplementedError
        # Join the pre-computed embeddings as extra numeric feature columns.
        train_data = train_data.join(
            pd.DataFrame(train_features,
                         columns=[f'pre_feat{i}' for i in range(train_features.shape[1])]))
        train_data.reset_index(drop=True, inplace=True)
        test_data = test_data.join(
            pd.DataFrame(test_features,
                         columns=[f'pre_feat{i}' for i in range(test_features.shape[1])]))
        test_data.reset_index(drop=True, inplace=True)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model in ('tabular_multimodal', 'tabular_multimodal_just_table'):
        MAX_NGRAM = 300
        if model == 'tabular_multimodal':
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30,
                                           ngram_range=(1, 3),
                                           max_features=MAX_NGRAM,
                                           dtype=np.uint8),
                enable_raw_text_features=True)
            hyperparameters = get_multimodal_tabular_hparam_just_gbm(
                text_presets=text_presets)
        else:
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30,
                                           ngram_range=(1, 3),
                                           max_features=MAX_NGRAM,
                                           dtype=np.uint8),
                enable_raw_text_features=True,
                enable_text_special_features=False,
                enable_text_ngram_features=False)
            hyperparameters = multimodal_tabular_just_table_hparam(
                text_presets=text_presets)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '3fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=3,
                          num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    train_toc = time.time()
    inference_tic = time.time()
    predictions = predictor.predict(test_data, as_pandas=True)
    predictor.save()
    inference_toc = time.time()
    if problem_type == MULTICLASS or problem_type == BINARY:
        prediction_prob = predictor.predict_proba(test_data, as_pandas=True)
        prediction_prob.to_csv(os.path.join(save_dir, 'test_prediction_prob.csv'))
    predictions.to_csv(os.path.join(save_dir, 'test_prediction.csv'))
    gt = test_data[label_columns[0]]
    gt.to_csv(os.path.join(save_dir, 'ground_truth.csv'))
    if not get_competition_results:
        score = predictor.evaluate(test_data)
        with open(os.path.join(save_dir, 'test_score.json'), 'w') as of:
            json.dump({metric: score}, of)
    with open(os.path.join(save_dir, 'speed_stats.json'), 'w') as of:
        json.dump({'train_time': train_toc - train_tic,
                   'inference_time': inference_toc - inference_tic,
                   'cpuinfo': cpuinfo.get_cpu_info()}, of)
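# Example invocation with hypothetical dataset/preset names: benchmark the
# text-only predictor on one registered dataset, timing stats included.
train_model(dataset_name='product_sentiment',
            text_presets='default',
            save_dir='results/product_sentiment/ag_text_only',
            model='ag_text_only',
            tabular_presets='no',
            num_gpus=1,
            seed=123)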
def get_preset_models(path,
                      problem_type,
                      eval_metric,
                      hyperparameters,
                      level: int = 1,
                      ensemble_type=StackerEnsembleModel,
                      ensemble_kwargs: dict = None,
                      ag_args_fit=None,
                      ag_args=None,
                      ag_args_ensemble=None,
                      name_suffix: str = None,
                      default_priorities=None,
                      invalid_model_names: list = None,
                      excluded_model_types: list = None,
                      hyperparameter_preprocess_func=None,
                      hyperparameter_preprocess_kwargs=None,
                      silent=True):
    hyperparameters = process_hyperparameters(hyperparameters)
    if hyperparameter_preprocess_func is not None:
        if hyperparameter_preprocess_kwargs is None:
            hyperparameter_preprocess_kwargs = dict()
        hyperparameters = hyperparameter_preprocess_func(hyperparameters, **hyperparameter_preprocess_kwargs)
    if problem_type not in [BINARY, MULTICLASS, REGRESSION, SOFTCLASS, QUANTILE]:
        raise NotImplementedError
    invalid_name_set = set()
    if invalid_model_names is not None:
        invalid_name_set.update(invalid_model_names)
    invalid_type_set = set()
    if excluded_model_types is not None:
        logger.log(20, f'Excluded Model Types: {excluded_model_types}')
        invalid_type_set.update(excluded_model_types)
    if default_priorities is None:
        default_priorities = copy.deepcopy(DEFAULT_MODEL_PRIORITY)
        if problem_type in PROBLEM_TYPE_MODEL_PRIORITY:
            default_priorities.update(PROBLEM_TYPE_MODEL_PRIORITY[problem_type])
    # Fall back to the 'default' level when no per-level config is given.
    level_key = level if level in hyperparameters.keys() else 'default'
    if level_key not in hyperparameters.keys() and level_key == 'default':
        hyperparameters = {'default': hyperparameters}
    hp_level = hyperparameters[level_key]
    model_cfg_priority_dict = defaultdict(list)
    model_type_list = list(hp_level.keys())
    if 'NN' in model_type_list:
        raise ValueError("'NN' model has been deprecated. Please specify 'NN_MXNET' or 'NN_TORCH' in its place (Tabular Neural Networks implemented in different backend frameworks).")
    for model_type in model_type_list:
        if problem_type == QUANTILE:
            if model_type not in DEFAULT_QUANTILE_MODEL:
                if model_type == 'NN_MXNET' and 'NN_TORCH' in DEFAULT_QUANTILE_MODEL and 'NN_TORCH' not in model_type_list:
                    model_type = 'NN_TORCH'
                    hp_level['NN_TORCH'] = hp_level.pop('NN_MXNET')
                    logger.log(15, "Quantile regression must use NN_TORCH instead of NN_MXNET, switching NN_MXNET -> NN_TORCH.")
                else:
                    continue
        models_of_type = hp_level[model_type]
        if not isinstance(models_of_type, list):
            models_of_type = [models_of_type]
        model_cfgs_to_process = []
        for model_cfg in models_of_type:
            if model_type in invalid_type_set:
                logger.log(20, f"\tFound '{model_type}' model in hyperparameters, but '{model_type}' is present in `excluded_model_types` and will be removed.")
                continue  # Don't include excluded models
            if isinstance(model_cfg, str):
                # String configs are named presets: AG_TEXT_NN presets come
                # from autogluon.text, everything else from get_preset_custom.
                if model_type == 'AG_TEXT_NN':
                    AG_TEXT_IMPORT_ERROR = ('autogluon.text has not been installed. '
                                            'You may try to install "autogluon.text" first by running '
                                            '`python3 -m pip install autogluon.text`')
                    try:
                        from autogluon.text import ag_text_presets
                    except ImportError:
                        raise ImportError(AG_TEXT_IMPORT_ERROR)
                    model_cfgs_to_process.append(ag_text_presets.create(model_cfg))
                else:
                    model_cfgs_to_process += get_preset_custom(name=model_cfg, problem_type=problem_type)
            else:
                model_cfgs_to_process.append(model_cfg)
        for model_cfg in model_cfgs_to_process:
            model_cfg = clean_model_cfg(model_cfg=model_cfg,
                                        model_type=model_type,
                                        ag_args=ag_args,
                                        ag_args_ensemble=ag_args_ensemble,
                                        ag_args_fit=ag_args_fit,
                                        problem_type=problem_type)
            model_cfg[AG_ARGS]['priority'] = model_cfg[AG_ARGS].get(
                'priority', default_priorities.get(model_type, DEFAULT_CUSTOM_MODEL_PRIORITY))
            model_priority = model_cfg[AG_ARGS]['priority']
            # Check if model_cfg is valid
            is_valid = is_model_cfg_valid(model_cfg, level=level, problem_type=problem_type)
            if AG_ARGS_FIT in model_cfg and not model_cfg[AG_ARGS_FIT]:
                model_cfg.pop(AG_ARGS_FIT)
            if is_valid:
                model_cfg_priority_dict[model_priority].append(model_cfg)
    # Flatten the priority buckets: higher-priority configs train first.
    model_cfg_priority_list = [model
                               for priority in sorted(model_cfg_priority_dict.keys(), reverse=True)
                               for model in model_cfg_priority_dict[priority]]
    if not silent:
        logger.log(20, 'Model configs that will be trained (in order):')
    models = []
    model_args_fit = {}
    for model_cfg in model_cfg_priority_list:
        model = model_factory(model_cfg,
                              path=path,
                              problem_type=problem_type,
                              eval_metric=eval_metric,
                              name_suffix=name_suffix,
                              ensemble_type=ensemble_type,
                              ensemble_kwargs=ensemble_kwargs,
                              invalid_name_set=invalid_name_set,
                              level=level)
        invalid_name_set.add(model.name)
        if 'hyperparameter_tune_kwargs' in model_cfg[AG_ARGS]:
            model_args_fit[model.name] = {'hyperparameter_tune_kwargs': model_cfg[AG_ARGS]['hyperparameter_tune_kwargs']}
        if 'ag_args_ensemble' in model_cfg and not model_cfg['ag_args_ensemble']:
            model_cfg.pop('ag_args_ensemble')
        if not silent:
            logger.log(20, f'\t{model.name}: \t{model_cfg}')
        models.append(model)
    return models, model_args_fit
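# Sketch of the deprecation guard above (hypothetical inputs): a plain 'NN'
# key now raises, while quantile tasks silently remap NN_MXNET to NN_TORCH
# when no torch config is present.
try:
    get_preset_models(path='ag_models/',
                      problem_type=REGRESSION,
                      eval_metric='rmse',
                      hyperparameters={'NN': {}})
except ValueError as err:
    print(err)  # "'NN' model has been deprecated. ..."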