def train_model(df_train: pd.DataFrame, df_test: pd.DataFrame, label: str, verbosity: int = 0, random_state: int = 0) -> TabularPredictor:
    """Train an AutoGluon model on df_train/df_test, predicting the ``label`` column.

    Trained predictors are cached on disk: the hash of ``df_train`` (plus label
    and random_state) names the model directory, so a repeated call with the
    same data loads the existing predictor instead of re-fitting.

    ``verbosity`` controls how chatty AutoGluon is during training.
    Returns the (loaded or freshly fitted) predictor object.

    TODO: Optimize this bad boy for experiments. Would be k-fold
    cross-validation instead of train-test split and a AG-preset that opts for
    highest quality model. Also no or very high time_limit.
    """
    logger = logging.getLogger('pfd')
    cache_dir = 'agModels'  # root folder under which trained models are stored
    checksum = calculate_model_hash(df_train, label, random_state)
    model_path = f'{cache_dir}/{checksum}'
    logger.info(f'Calculated a checksum of {checksum}.')
    try:
        # Cache hit: a predictor trained on identical data already exists.
        return TabularPredictor.load(model_path)
    except FileNotFoundError:
        logger.info("Didn't find a model to load from the cache.")
        fresh = TabularPredictor(label=label, path=model_path)
        return fresh.fit(
            train_data=df_train,
            tuning_data=df_test,
            time_limit=20,
            verbosity=verbosity,
            presets='medium_quality_faster_train',
        )
def train(self, train_data, eval_metric=EVAL_METRIC, quality=QUALITY, time_limit=TIME_LIMIT, verbosity=VERBOSITY):
    """Train prospective models.

    Returns the fitted TabularPredictor, which gives default access to the
    *best* model trained on the task (otherwise we are just wrapping
    AutoGluon).
    """
    # Force AutoGluon to consume our features exactly as provided: disable
    # every automatic feature-engineering step of the default pipeline.
    passthrough_generator = AutoMLPipelineFeatureGenerator(
        enable_categorical_features=False,
        enable_datetime_features=False,
        enable_text_special_features=False,
        enable_text_ngram_features=False,
    )
    # We know every feature is an int, so declare the metadata ourselves.
    # The label column (last column of train_data) is skipped.
    metadata = FeatureMetadata(dict.fromkeys(train_data.columns[:-1], 'int'))
    predictor = TabularPredictor(
        label='label',
        eval_metric=eval_metric,
        path=self.outpath,
        verbosity=verbosity,
    )
    return predictor.fit(
        train_data=train_data,
        time_limit=time_limit,
        presets=self.QUALITY_PRESETS[quality],
        feature_generator=passthrough_generator,
        feature_metadata=metadata,
    )
def _fit(self, X: List[Config[ModelConfig]], y: npt.NDArray[np.float32]) -> None:
    """Fit one tabular regression predictor per output column of ``y``.

    The configs in ``X`` are converted to a numeric matrix via the config
    transformer; each predictor is trained on that matrix plus one target
    column, and all predictors are collected in ``self.predictors``.
    """
    features = self.config_transformer.fit_transform(X)
    self.predictors = []
    for col in range(y.shape[1]):
        # Build a frame of [features | target]. With default integer column
        # names, the target column is named frame.shape[1] - 1, which is what
        # we hand to TabularPredictor as the label.
        frame = pd.DataFrame(np.concatenate([features, y[:, col:col + 1]], axis=-1))
        model = TabularPredictor(
            frame.shape[1] - 1,
            problem_type="regression",
            eval_metric="root_mean_squared_error",
        )
        model.fit(frame, time_limit=self.time_limit, verbosity=0)
        self.predictors.append(model)
def train(args):
    """Train an AutoGluon predictor for a GLUE-style text task and dump
    dev/test predictions plus the dev metric score into ``args.exp_dir``."""
    set_seed(args.seed)
    if args.task is None:
        raise NotImplementedError
    feature_columns, label_column, eval_metric, all_metrics = TASKS[args.task]
    if args.exp_dir is None:
        args.exp_dir = 'autogluon_text_{}'.format(args.task)
    train_df = load_pd.load(args.train_file)[feature_columns + [label_column]]
    dev_df = load_pd.load(args.dev_file)[feature_columns + [label_column]]
    test_df = load_pd.load(args.test_file)[feature_columns]
    real_dev_df = dev_df
    if args.task in ('mrpc', 'sts'):
        # These tasks are symmetric in their two text columns: augment the
        # un-ordered set manually by appending a column-swapped copy.
        swapped = pd.DataFrame({
            feature_columns[0]: train_df[feature_columns[1]],
            feature_columns[1]: train_df[feature_columns[0]],
            label_column: train_df[label_column],
        })
        real_train_df = pd.concat([train_df, swapped])
    else:
        real_train_df = train_df
    if args.mode == 'stacking':
        predictor = TabularPredictor(label=label_column, eval_metric=eval_metric, path=args.exp_dir)
        predictor.fit(
            train_data=real_train_df,
            tuning_data=real_dev_df,
            hyperparameters='multimodal',
            num_bag_folds=5,
            num_stack_levels=1,
        )
    elif args.mode == 'weighted':
        predictor = TabularPredictor(label=label_column, eval_metric=eval_metric, path=args.exp_dir)
        predictor.fit(
            train_data=real_train_df,
            tuning_data=real_dev_df,
            hyperparameters='multimodal',
        )
    elif args.mode == 'single':
        # When no embedding is used, we will just use TextPredictor that will
        # train a single model internally.
        predictor = TextPredictor(label=label_column, eval_metric=eval_metric, path=args.exp_dir)
        predictor.fit(train_data=real_train_df, tuning_data=real_dev_df, seed=args.seed)
    else:
        raise NotImplementedError
    dev_metric_score = predictor.evaluate(dev_df)
    dev_predictions = predictor.predict(dev_df, as_pandas=True)
    test_predictions = predictor.predict(test_df, as_pandas=True)
    dev_predictions.to_csv(os.path.join(args.exp_dir, 'dev_prediction.csv'))
    test_predictions.to_csv(os.path.join(args.exp_dir, 'test_prediction.csv'))
    with open(os.path.join(args.exp_dir, 'final_model_scores.json'), 'w') as of:
        json.dump({f'valid_{eval_metric}': dev_metric_score}, of)
def run(args):
    """Train a predictor for one benchmark task and write a submission file
    in the format that task's competition expects."""
    loaders = {
        'product_sentiment': load_machine_hack_product_sentiment,
        'mercari_price': load_mercari_price_prediction,
        'price_of_books': load_price_of_books,
        'data_scientist_salary': load_data_scientist_salary,
    }
    if args.task not in loaders:
        raise NotImplementedError
    train_df, test_df, label_column = loaders[args.task](args.train_file, args.test_file)
    hyperparameters = get_hyperparameter_config('multimodal')
    if args.preset is not None and args.mode in ['stacking', 'weighted']:
        hyperparameters['AG_TEXT_NN']['presets'] = args.preset
    if args.mode == 'stacking':
        predictor = TabularPredictor(label=label_column, eval_metric=args.eval_metric, path=args.exp_dir)
        predictor.fit(
            train_data=train_df,
            hyperparameters=hyperparameters,
            num_bag_folds=5,
            num_stack_levels=1,
        )
    elif args.mode == 'weighted':
        predictor = TabularPredictor(label=label_column, eval_metric=args.eval_metric, path=args.exp_dir)
        predictor.fit(train_data=train_df, hyperparameters=hyperparameters)
    elif args.mode == 'single':
        # When no embedding is used, we will just use TextPredictor that will
        # train a single model internally.
        predictor = TextPredictor(label=label_column, eval_metric=args.eval_metric, path=args.exp_dir)
        predictor.fit(train_data=train_df, presets=args.preset, seed=args.seed)
    else:
        raise NotImplementedError
    # Each task ships its predictions in a different submission format; the
    # price tasks were trained on log-transformed targets, so invert here.
    if args.task == 'product_sentiment':
        test_probabilities = predictor.predict_proba(test_df, as_pandas=True, as_multiclass=True)
        test_probabilities.to_csv(os.path.join(args.exp_dir, 'submission.csv'), index=False)
    elif args.task == 'data_scientist_salary':
        predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_excel(args.sample_submission, engine='openpyxl')
        submission.loc[:, label_column] = predictions
        submission.to_excel(os.path.join(args.exp_dir, 'submission.xlsx'))
    elif args.task == 'price_of_books':
        predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_excel(args.sample_submission, engine='openpyxl')
        submission.loc[:, label_column] = np.power(10, predictions) - 1
        submission.to_excel(os.path.join(args.exp_dir, 'submission.xlsx'))
    elif args.task == 'mercari_price':
        test_predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_csv(args.sample_submission)
        submission.loc[:, label_column] = np.exp(test_predictions) - 1
        submission.to_csv(os.path.join(args.exp_dir, 'submission.csv'), index=False)
    else:
        raise NotImplementedError
# NOTE(review): this chunk starts mid-`if/elif` chain (the 'base' branch is
# outside this view) and ends mid-way through a TextPredictor.fit(...) call.
# Only comments are added here; the code is left untouched.
elif args.model_type == 'large':
    tabular_hparams = get_tabular_hparams(
        electra_large_late_fusion_concate_e10_avg3())
else:
    raise NotImplementedError
# Timestamped subdirectory so repeated runs never clobber each other.
time_str = strftime("%Y-%m-%d_%H-%M-%S", gmtime())
if args.ensemble_type == 'weighted' or args.ensemble_type == 'stack':
    predictor = TabularPredictor(path=os.path.join(args.save_dir, args.model_type, time_str),
                                 problem_type=train_dataset.problem_type,
                                 eval_metric=train_dataset.metric,
                                 label=label_columns[0])
    if args.ensemble_type == 'weighted':
        # Weighted ensemble: plain fit, no bagging/stacking.
        predictor.fit(concat_df[feature_columns + [label_columns[0]]],
                      feature_generator=feature_generator,
                      hyperparameters=tabular_hparams)
    else:
        # Stacked ensemble: 5-fold bagging with one stacking level.
        predictor.fit(concat_df[feature_columns + [label_columns[0]]],
                      feature_generator=feature_generator,
                      num_bag_folds=5,
                      num_stack_levels=1,
                      hyperparameters=tabular_hparams)
    predictor.save()
else:
    # Any other ensemble_type: single text model via TextPredictor.
    predictor = TextPredictor(path=os.path.join(args.save_dir, args.model_type, time_str),
                              problem_type=train_dataset.problem_type,
                              eval_metric=train_dataset.metric,
                              label=label_columns[0])
    # NOTE(review): call truncated here in the visible chunk.
    predictor.fit(concat_df[feature_columns + [label_columns[0]]],
# NOTE(review): flat script fragment; `all_train_data`, `keep_ind`, `directory`,
# `output_subdir`, `train_name`, `test_name`, `seed`, `label` and `problem_type`
# are defined outside this view. Only comments added.
all_train_data = all_train_data[keep_ind]
# 80/20 split with a seeded RandomState for reproducibility.
train_data, test_data = train_test_split(all_train_data, test_size=0.2,
                                         random_state=np.random.RandomState(seed))
train_data.to_csv(os.path.join(directory, output_subdir, train_name), index=False)
test_data.to_csv(os.path.join(directory, output_subdir, test_name), index=False)
print(f'#Train={len(train_data)}, #Dev={len(test_data)}')

# Test run autogluon:
from autogluon.tabular import TabularDataset, TabularPredictor
# NOTE(review): legacy import below (`TabularPrediction`) is unused here and
# only exists in old autogluon versions — verify it is still needed.
from autogluon import TabularPrediction as task
from sklearn.feature_extraction.text import CountVectorizer
from autogluon.features.generators import AutoMLPipelineFeatureGenerator

MAX_NGRAM = 300  # cap on n-gram vocabulary size for the text vectorizer
time_limit = 300
feature_generator = AutoMLPipelineFeatureGenerator(vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3), max_features=MAX_NGRAM, dtype=np.uint8))
predictor = TabularPredictor(label=label, path=directory+output_subdir, problem_type=problem_type)
# Smoke-test fit restricted to LightGBM only.
predictor.fit(train_data, time_limit=time_limit, feature_generator=feature_generator, hyperparameters={'GBM':{}})
predictor.evaluate(test_data)

# Compute checksum:
from auto_mm_bench.utils import sha1sum
print("Train hash:\n", sha1sum(os.path.join(directory, output_subdir, train_name)))
print("Test hash:\n", sha1sum(os.path.join(directory, output_subdir, test_name)))
def inner_test_tabular(testname):
    """Run one named regression test: rebuild its synthetic dataset, verify the
    dataset hash, fit a TabularPredictor per parameter set, and assert every
    model's leaderboard score lands in the expected range.

    On failure the assertion message includes a ready-to-paste replacement
    config so a developer can intentionally accept new scores.
    """
    # Find the named test
    test = None
    for t in tests:
        if t['name'] == testname:
            test = t
    assert test is not None, f"Could not find test {testname}"
    # Build the dataset
    (dftrain, dftest) = make_dataset(request=test, seed=0)
    # Check the synthetic dataset itself hasn't changed. We round it to 3dp
    # otherwise tiny floating point differences between platforms can give a
    # different hash that still yields same prediction scores. Ultimately it
    # doesn't matter how we do this as long as the same dataset gives the same
    # hash function on different python versions and architectures.
    current_hash = hashlib.sha256(
        dftrain.round(decimals=3).values.tobytes()).hexdigest()[0:10]
    proposedconfig = "Proposed new config:\n"
    proposedconfig += f"'dataset_hash' : '{current_hash}',"
    assert current_hash == test[
        'dataset_hash'], f"Test '{testname}' input dataset has changed. All scores will change.\n" + proposedconfig
    # Now run the Predictor 1 or more times with various parameters, and make
    # sure we get back the expected results. Params can either omitted, or a
    # single run, or a list of runs.
    if 'params' not in test:
        test['params'] = {'predict': {}, 'fit': {}}
    if not isinstance(test['params'], list):
        test['params'] = [test['params']]
    for params in test['params']:
        # Run this model and set of params
        predictor = TabularPredictor(label='label', **params['predict'])
        predictor.fit(dftrain, **params['fit'])
        leaderboard = predictor.leaderboard(dftest, silent=True)
        leaderboard = leaderboard.sort_values(
            by='model'
        )  # So we can pre-generate sample config in alphabetical order
        # Store proposed new config based on the current run, in case the
        # developer wants to keep thee results (just cut and paste).
        proposedconfig = "Proposed new config:\n"
        proposedconfig += "'expected_score_range' : {\n"
        for model in leaderboard['model']:
            # Map the model name back to its leaderboard row index.
            midx_in_leaderboard = leaderboard.index.values[leaderboard['model'] == model][0]
            if np.isnan(leaderboard['score_test'][midx_in_leaderboard]):
                values = "np.nan, np.nan"
            else:
                # Reuse the precision (range width) already configured for this
                # model when present; default to 0.01 otherwise.
                if model in test['expected_score_range'] and not np.isnan(
                        test['expected_score_range'][model][1]):
                    currentprecision = test['expected_score_range'][model][1]
                else:
                    currentprecision = 0.01
                values = "{}, {}".format(
                    myfloor(leaderboard['score_test'][midx_in_leaderboard],
                            currentprecision), currentprecision)
            proposedconfig += f"    '{model}': ({values}),\n"
        proposedconfig += "},\n"
        # First validate the model list was as expected.
        assert set(leaderboard['model']) == set(
            test['expected_score_range'].keys()
        ), (f"Test '{testname}' params {params} got unexpected model list.\n" +
            proposedconfig)
        # Now validate the scores for each model were as expected.
        all_assertions_met = True
        currentconfig = "Existing config:\n"
        currentconfig += "'expected_score_range' : {\n"
        for model in sorted(test['expected_score_range']):
            midx_in_leaderboard = leaderboard.index.values[leaderboard['model'] == model][0]
            assert leaderboard['model'][midx_in_leaderboard] == model
            # expected range is stored as (min, width): acceptable scores fall
            # in [min, min + width].
            expectedrange = test['expected_score_range'][model][1]
            expectedmin = test['expected_score_range'][model][0]
            expectedmax = expectedmin + expectedrange
            if np.isnan(expectedmin):
                values = "np.nan, np.nan"
            else:
                values = "{}, {}".format(expectedmin, expectedrange)
            if (((leaderboard['score_test'][midx_in_leaderboard] >= expectedmin)
                 and
                 (leaderboard['score_test'][midx_in_leaderboard] <= expectedmax))
                    or (np.isnan(leaderboard['score_test'][midx_in_leaderboard])
                        and np.isnan(expectedmin))):
                currentconfig += f"    '{model}': ({values}),\n"
            else:
                # Annotate the out-of-range model so the failure message shows
                # exactly which score missed its window.
                currentconfig += f"    '{model}': ({values}), # <--- not met, got {leaderboard['score_test'][midx_in_leaderboard]} \n"
                all_assertions_met = False
        currentconfig += "},\n"
        assert all_assertions_met, f"Test '{testname}', params {params} had unexpected scores:\n" + currentconfig + proposedconfig
        # Clean up this model created with specific params.
        predictor.delete_models(models_to_keep=[], dry_run=False)
def train_model(dataset_name, text_presets, save_dir, model, tabular_presets,
                num_gpus=None, get_competition_results=False, seed=123):
    """Train one benchmark configuration and write predictions/scores to disk.

    ``model`` selects the training strategy (plain tabular, text-only,
    multimodal, precomputed-embedding variants, ...); ``tabular_presets``
    selects the AutoGluon fit configuration for the tabular strategies.
    When ``get_competition_results`` is True the held-out competition split is
    used for inference and no test score is computed (labels unavailable).

    Outputs written to ``save_dir``: test_prediction.csv, ground_truth.csv,
    optionally test_prediction_prob.csv and test_score.json, plus
    speed_stats.json with train/inference wall time and CPU info.
    """
    set_seed(seed)
    if get_competition_results:
        train_dataset = dataset_registry.create(dataset_name, 'train')
        test_dataset = dataset_registry.create(dataset_name, 'competition')
    else:
        train_dataset = dataset_registry.create(dataset_name, 'train')
        test_dataset = dataset_registry.create(dataset_name, 'test')
    feature_columns = train_dataset.feature_columns
    label_columns = train_dataset.label_columns
    metric = train_dataset.metric
    problem_type = train_dataset.problem_type
    # A small 5% split is only used to infer per-column types; the full
    # training data is still what gets fitted below.
    train_data1, tuning_data1 = sklearn.model_selection.train_test_split(
        train_dataset.data,
        test_size=0.05,
        random_state=np.random.RandomState(seed))
    train_data = train_dataset.data
    test_data = test_dataset.data
    column_types, inferred_problem_type = infer_column_problem_types(
        train_data1, tuning_data1, label_columns=label_columns,
        problem_type=problem_type)
    train_data = train_data[feature_columns + label_columns]
    # tuning_data = tuning_data[feature_columns + label_columns]
    if not get_competition_results:
        # The competition split has no labels, so only restrict columns here.
        test_data = test_data[feature_columns + label_columns]
    train_tic = time.time()
    if model == 'ag_tabular_quick':
        # Quick smoke-run: n-gram features + 30s budget.
        MAX_NGRAM = 300
        feature_generator = AutoMLPipelineFeatureGenerator(
            vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3),
                                       max_features=MAX_NGRAM,
                                       dtype=np.uint8))
        predictor = TabularPredictor(label=label_columns[0],
                                     path=save_dir,
                                     problem_type=problem_type)
        predictor.fit(train_data, time_limit=30,
                      feature_generator=feature_generator)
    elif model == 'ag_tabular_without_text':
        # Drop every text column, keep the rest.
        no_text_feature_columns = []
        for col_name in feature_columns:
            if column_types[col_name] != _TEXT:
                no_text_feature_columns.append(col_name)
        train_data = train_data[no_text_feature_columns + label_columns]
        # tuning_data = tuning_data[no_text_feature_columns + label_columns]
        test_data = test_data[no_text_feature_columns + label_columns]
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets in ['best_quality']:
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          presets=tabular_presets)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          num_bag_folds=5, num_stack_levels=1)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_tabular_old':
        # Same as above but without removing text columns.
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5, num_stack_levels=1,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_text_only':
        # Keep only text columns and train a TextPredictor on them.
        text_feature_columns = [
            col_name for col_name in feature_columns
            if column_types[col_name] == _TEXT
        ]
        train_data = train_data[text_feature_columns + label_columns]
        test_data = test_data[text_feature_columns + label_columns]
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            # Very large datasets: cap epochs (see set_epoch3) to bound cost.
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model == 'ag_text_multimodal':
        # TextPredictor over all columns (text + tabular handled internally).
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model == 'pre_embedding' or model == 'tune_embedding_multimodal' or model == 'tune_embedding_text':
        # Join precomputed text embeddings (as pre_feat* columns) onto the
        # tabular data, then fit a TabularPredictor on the combined frame.
        feature_generator = AutoMLPipelineFeatureGenerator(
            enable_text_special_features=False,
            enable_text_ngram_features=False)
        pre_embedding_folder = os.path.join(_CURR_DIR, 'pre_computed_embeddings')
        if model == 'pre_embedding':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'pretrain_text_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'pretrain_text_embedding', 'test.npy'))
        elif model == 'tune_embedding_multimodal':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'multimodal_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'multimodal_embedding', 'test.npy'))
        elif model == 'tune_embedding_text':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'tuned_text_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'tuned_text_embedding', 'test.npy'))
        else:
            raise NotImplementedError
        train_data = train_data.join(
            pd.DataFrame(train_features,
                         columns=[
                             f'pre_feat{i}'
                             for i in range(train_features.shape[1])
                         ]))
        train_data.reset_index(drop=True, inplace=True)
        test_data = test_data.join(
            pd.DataFrame(test_features,
                         columns=[
                             f'pre_feat{i}'
                             for i in range(test_features.shape[1])
                         ]))
        test_data.reset_index(drop=True, inplace=True)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5, num_stack_levels=1,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'tabular_multimodal' or model == 'tabular_multimodal_just_table':
        # Multimodal tabular: raw text features kept and fed to text models;
        # the *_just_table variant disables text special/ngram features.
        if model == 'tabular_multimodal':
            MAX_NGRAM = 300
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3),
                                           max_features=MAX_NGRAM,
                                           dtype=np.uint8),
                enable_raw_text_features=True)
            hyperparameters = get_multimodal_tabular_hparam_just_gbm(
                text_presets=text_presets)
        else:
            MAX_NGRAM = 300
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3),
                                           max_features=MAX_NGRAM,
                                           dtype=np.uint8),
                enable_raw_text_features=True,
                enable_text_special_features=False,
                enable_text_ngram_features=False)
            hyperparameters = multimodal_tabular_just_table_hparam(
                text_presets=text_presets)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5, num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '3fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=3, num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    train_toc = time.time()
    inference_tic = time.time()
    predictions = predictor.predict(test_data, as_pandas=True)
    predictor.save()
    inference_toc = time.time()
    if problem_type == MULTICLASS or problem_type == BINARY:
        # Classification: also persist class probabilities.
        prediction_prob = predictor.predict_proba(test_data, as_pandas=True)
        prediction_prob.to_csv(
            os.path.join(save_dir, 'test_prediction_prob.csv'))
    predictions.to_csv(os.path.join(save_dir, 'test_prediction.csv'))
    gt = test_data[label_columns[0]]
    gt.to_csv(os.path.join(save_dir, 'ground_truth.csv'))
    if not get_competition_results:
        score = predictor.evaluate(test_data)
        with open(os.path.join(save_dir, 'test_score.json'), 'w') as of:
            json.dump({metric: score}, of)
    with open(os.path.join(save_dir, 'speed_stats.json'), 'w') as of:
        json.dump(
            {
                'train_time': train_toc - train_tic,
                'inference_time': inference_toc - inference_tic,
                'cpuinfo': cpuinfo.get_cpu_info()
            }, of)
# NOTE(review): flat fragment; `train_dataset`, `test_dataset`,
# `competition_dataset`, `feature_columns`, `feature_generator` and `args` are
# defined outside this view. Only comments added.
label_columns = train_dataset.label_columns
train_data = train_dataset.data
test_data = test_dataset.data
# Train on train+test combined, then predict on the unlabeled competition set.
concat_df = pd.concat([train_data, test_data])
concat_df.reset_index(drop=True, inplace=True)
competition_df = competition_dataset.data[feature_columns]
if args.model_type == 'base':
    tabular_hparams = get_tabular_hparams(electra_base_late_fusion_concate_e10_avg3())
elif args.model_type == 'large':
    tabular_hparams = get_tabular_hparams(electra_large_late_fusion_concate_e10_avg3())
else:
    raise NotImplementedError
# Timestamped output directory to avoid overwriting earlier runs.
time_str = strftime("%Y-%m-%d_%H-%M-%S", gmtime())
predictor = TabularPredictor(
    path=os.path.join(args.save_dir, args.model_type, time_str),
    problem_type=train_dataset.problem_type,
    eval_metric='log_loss',
    label=label_columns[0])
# 5-fold bagging with one stacking level.
predictor.fit(concat_df[feature_columns + [label_columns[0]]],
              feature_generator=feature_generator,
              num_bag_folds=5,
              num_stack_levels=1,
              hyperparameters=tabular_hparams)
predictor.save()
predictions = predictor.predict_proba(competition_df, as_pandas=True)
predictions.to_csv(os.path.join(args.save_dir, args.model_type, time_str, 'pred_probabilities.csv'))
""" Example script for quantile regression with tabular data, demonstrating simple use-case """ import numpy as np from autogluon.tabular import TabularDataset, TabularPredictor # Training time: train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv') # can be local CSV file as well, returns Pandas DataFrame train_data = train_data.head(1000) # subsample for faster demo print(train_data.head()) label = 'age' # which column we want to predict save_path = 'ag_models/' # where to save trained models quantile_levels = [0.1, 0.5, 0.9] # which quantiles of numeric label-variable we want to predict predictor = TabularPredictor(label=label, path=save_path, problem_type='quantile', quantile_levels=quantile_levels) predictor.fit(train_data, calibrate=True, num_bag_folds=5) # here we fit with 5-fold bagging and calibrate quantile estimates via conformal method # Inference time: test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv') # another Pandas DataFrame test_data = test_data.head(1000) # subsample for faster demo predictor = TabularPredictor.load(save_path) # unnecessary here, we just to demonstrate how to load previously-trained predictor from file y_pred = predictor.predict(test_data) print(y_pred) # each column contains estimates of a particular quantile-level of the label variable # Check coverage of prediction intervals (ie. how often they contain the observed Y value): num_quantiles = len(quantile_levels) y_pred = y_pred.to_numpy() y_target = test_data[label].to_numpy() for i in range(num_quantiles // 2): low_idx = i high_idx = num_quantiles - i - 1 low_quantile = quantile_levels[low_idx] # which quantile to use for lower end of prediction interval
def fit_static(X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    """Fit an AutoGluon TabularPredictor from a Driverless-AI-style recipe.

    Assembles features, target (and optional sample weights) into a single
    pandas frame, folds any eval_set rows into the training data, and trains
    with a hyperparameter/preset configuration derived from the ``accuracy``
    and ``interpretability`` dials in ``kwargs``. Returns the fitted model.
    """
    from autogluon.tabular import TabularDataset, TabularPredictor
    from autogluon.tabular.models.knn.knn_rapids_model import KNNRapidsModel
    from autogluon.tabular.models.lr.lr_rapids_model import LinearRapidsModel
    num_classes = kwargs['num_classes']
    # verbose=True -> AutoGluon verbosity 2; anything else -> silent.
    if kwargs['verbose'] is not None and kwargs['verbose'] is True:
        verbosity = 2
    else:
        verbosity = 0
    labels = kwargs['labels']
    num_gpus = kwargs['n_gpus']
    accuracy = kwargs.get('accuracy', 10)
    interpretability = kwargs.get('interpretability', 1)
    is_acceptance = kwargs.get('IS_ACCEPTANCE', False)
    is_backend_tuning = kwargs.get('IS_BACKEND_TUNING', False)
    lb = None
    if num_classes >= 2:
        # Classification: encode string labels to integers for AutoGluon.
        from sklearn.preprocessing import LabelEncoder
        lb = LabelEncoder()
        lb.fit(labels)
        y = lb.transform(y)
    label = '____TARGET_____'  # synthetic column name, unlikely to collide
    import datatable as dt
    y_dt = dt.Frame(y, names=[label])
    if eval_set is not None:
        # Merge the first eval set into the training data (AutoGluon does its
        # own internal validation splitting).
        valid_X = eval_set[0][0]
        valid_y = eval_set[0][1]
        if num_classes >= 2:
            valid_y = lb.transform(valid_y)
        valid_y_dt = dt.Frame(valid_y, names=[label])
        assert X.shape[1] == valid_X.shape[1], "Bad shape to rbind: %s %s : %s %s" % (
            X.shape, X.names, valid_X.shape, valid_X.names)
        X = dt.rbind([X, valid_X])
        y_dt = dt.rbind([y_dt, valid_y_dt])
    sw = None
    if sample_weight is not None:
        # Sample weights travel as an extra column named below; AutoGluon is
        # told about it via the `sample_weight` constructor argument.
        sw = '____SAMPLE_WEIGHT_____'
        sw_dt = dt.Frame(sample_weight, names=[sw])
        if sample_weight_eval_set is not None:
            swes_dt = dt.Frame(sample_weight_eval_set[0], names=[sw])
            sw_dt = dt.rbind([sw_dt, swes_dt])
        X = dt.cbind([X, y_dt, sw_dt])
    else:
        X = dt.cbind([X, y_dt])
    X = X.to_pandas()  # AutoGluon needs pandas, not numpy
    eval_metric = AutoGluonModel.get_eval_metric(**kwargs)
    time_limit = AutoGluonModel.get_time_limit(accuracy)
    presets = AutoGluonModel.get_presets(accuracy, interpretability, is_acceptance, is_backend_tuning)
    model = TabularPredictor(
        label=label,
        sample_weight=sw,
        eval_metric=eval_metric,
        verbosity=verbosity,
        # learner_kwargs={'ignored_columns': ['id']}
    )
    # Model zoo: RAPIDS KNN/linear plus the standard tree/NN families; GPU
    # counts are passed through per-model via ag_args_fit.
    hyperparameters = {
        KNNRapidsModel: {},
        LinearRapidsModel: {},
        'RF': {},
        'XGB': {'ag_args_fit': {'num_gpus': num_gpus}},
        'CAT': {'ag_args_fit': {'num_gpus': num_gpus}},
        'GBM': [{}, {'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, 'GBMLarge'],
        'NN': {'ag_args_fit': {'num_gpus': num_gpus}},
        'FASTAI': {'ag_args_fit': {'num_gpus': num_gpus}},
    }
    kwargs_fit = dict(hyperparameters=hyperparameters)
    if accuracy >= 5:
        # Higher accuracy dial: enable presets and the time budget.
        kwargs_fit.update(dict(presets=presets, time_limit=time_limit))
    model.fit(X, **kwargs_fit)
    print(model.leaderboard(silent=True))
    return model
# NOTE(review): fragment starting mid-call — the opening
# `train_data = TabularDataset(` line is outside this view. Only comments added.
)  # can be local CSV file as well, returns Pandas DataFrame
train_data = train_data.head(500)  # subsample for faster demo
print(train_data.head())
label = 'age'  # specifies which column do we want to predict
save_path = 'ag_models/'  # where to save trained models
quantiles_topredict = [
    0.1, 0.5, 0.9
]  # which quantiles of numeric label-variable we want to predict
predictor = TabularPredictor(label=label,
                             path=save_path,
                             problem_type='quantile',
                             quantile_levels=quantiles_topredict)
predictor.fit(
    train_data, time_limit=30
)  # time_limit is optional, you should increase it for real applications
# Inference time:
test_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv'
)  # another Pandas DataFrame
predictor = TabularPredictor.load(
    save_path
)  # Unnecessary, we reload predictor just to demonstrate how to load previously-trained predictor from file
y_pred = predictor.predict(test_data)
print(y_pred)  # each column contains estimates for one target quantile-level
ldr = predictor.leaderboard(
    test_data)  # evaluate performance of every trained model
print(f"Quantile-regression evaluated using metric = {predictor.eval_metric}")
# NOTE(review): flat script fragment; `all_train_data`, `directory`,
# `output_subdir`, `train_name`, `test_name`, `seed`, `label` and
# `problem_type` come from outside this view. Only comments added.
# 80/20 split, stratified on the label, seeded for reproducibility.
train_data, test_data = train_test_split(all_train_data,
                                         test_size=0.2,
                                         stratify=all_train_data[label],
                                         random_state=np.random.RandomState(seed))
train_data.to_csv(os.path.join(directory, output_subdir, train_name), index=False)
test_data.to_csv(os.path.join(directory, output_subdir, test_name), index=False)
print(f'#Train={len(train_data)}, #Dev={len(test_data)}')

# Test run autogluon:
from autogluon.tabular import TabularDataset, TabularPredictor
# NOTE(review): legacy import below (`TabularPrediction`) is unused here and
# only exists in old autogluon versions — verify it is still needed.
from autogluon import TabularPrediction as task
from sklearn.feature_extraction.text import CountVectorizer
from autogluon.features.generators import AutoMLPipelineFeatureGenerator

MAX_NGRAM = 300  # cap on n-gram vocabulary size for the text vectorizer
time_limit = 30
feature_generator = AutoMLPipelineFeatureGenerator(vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3), max_features=MAX_NGRAM, dtype=np.uint8))
predictor = TabularPredictor(label=label, path=directory+output_subdir, problem_type=problem_type)
predictor.fit(train_data, time_limit=time_limit, feature_generator=feature_generator)
predictor.evaluate(test_data)

# Compute checksum:
from auto_mm_bench.utils import sha1sum
print("Train hash:\n", sha1sum(os.path.join(directory, output_subdir, train_name)))
print("Test hash:\n", sha1sum(os.path.join(directory, output_subdir, test_name)))