def test_sample_weight():
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/toyRegression.zip',
               'name': 'toyRegression',
               'problem_type': REGRESSION,
               'label': 'y',
               'performance_val': 0.183}
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix,
                                      train_file=train_file,
                                      test_file=test_file,
                                      name=dataset['name'],
                                      url=dataset['url'])
    print(f"Evaluating Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    sample_weight = 'sample_weights'
    weights = np.abs(np.random.rand(len(train_data),))
    test_weights = np.abs(np.random.rand(len(test_data),))
    train_data[sample_weight] = weights
    test_data_weighted = test_data.copy()
    test_data_weighted[sample_weight] = test_weights
    fit_args = {'time_limit': 20}
    predictor = TabularPredictor(label=dataset['label'], path=savedir,
                                 problem_type=dataset['problem_type'],
                                 sample_weight=sample_weight).fit(train_data, **fit_args)
    ldr = predictor.leaderboard(test_data)
    perf = predictor.evaluate(test_data)
    # Run again with weight_evaluation:
    predictor = TabularPredictor(label=dataset['label'], path=savedir,
                                 problem_type=dataset['problem_type'],
                                 sample_weight=sample_weight,
                                 weight_evaluation=True).fit(train_data, **fit_args)
    perf = predictor.evaluate(test_data_weighted)
    predictor.distill(time_limit=10)
    ldr = predictor.leaderboard(test_data_weighted)
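
# Hedged sanity check (not part of the test above): with weight_evaluation=True,
# regression error is weight-averaged over samples. A minimal sketch of the
# corresponding manual computation, assuming RMSE scoring; `weighted_rmse` is an
# illustrative helper, not part of the AutoGluon API.
import numpy as np

def weighted_rmse(y_true, y_pred, weights):
    # Weighted mean of squared errors, normalized by the total weight.
    se = (np.asarray(y_true) - np.asarray(y_pred)) ** 2
    return float(np.sqrt(np.sum(weights * se) / np.sum(weights)))

# Usage against the objects from test_sample_weight:
#   weighted_rmse(test_data_weighted['y'],
#                 predictor.predict(test_data_weighted),
#                 test_data_weighted['sample_weights'])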
def test_quantile():
    quantile_levels = [0.01, 0.02, 0.05, 0.98, 0.99]
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/toyRegression.zip',
               'name': 'toyRegression',
               'problem_type': QUANTILE,
               'label': 'y'}
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix,
                                      train_file=train_file,
                                      test_file=test_file,
                                      name=dataset['name'],
                                      url=dataset['url'])
    print(f"Evaluating Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    fit_args = {'time_limit': 20}
    predictor = TabularPredictor(label=dataset['label'], path=savedir,
                                 problem_type=dataset['problem_type'],
                                 quantile_levels=quantile_levels).fit(train_data, **fit_args)
    ldr = predictor.leaderboard(test_data)
    perf = predictor.evaluate(test_data)
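
# Hedged illustration: for problem_type=QUANTILE, TabularPredictor.predict
# returns a DataFrame with one column per requested quantile level. A minimal
# sketch of inspecting those columns, assuming the objects from test_quantile;
# `summarize_quantile_predictions` is an illustrative helper, not test code.
def summarize_quantile_predictions(predictor, test_data, quantile_levels):
    preds = predictor.predict(test_data, as_pandas=True)
    for q in quantile_levels:
        # Column labels match the quantile levels passed at construction time.
        print(f"quantile {q}: mean prediction = {preds[q].mean():.4f}")
    return preds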
def train(args):
    set_seed(args.seed)
    if args.task is not None:
        feature_columns, label_column, eval_metric, all_metrics = TASKS[args.task]
    else:
        raise NotImplementedError
    if args.exp_dir is None:
        args.exp_dir = 'autogluon_text_{}'.format(args.task)
    train_df = load_pd.load(args.train_file)
    dev_df = load_pd.load(args.dev_file)
    test_df = load_pd.load(args.test_file)
    train_df = train_df[feature_columns + [label_column]]
    dev_df = dev_df[feature_columns + [label_column]]
    test_df = test_df[feature_columns]
    if args.task == 'mrpc' or args.task == 'sts':
        # The sentence pairs are unordered, so augment the training set by
        # swapping the two text columns.
        train_df_other_part = pd.DataFrame({feature_columns[0]: train_df[feature_columns[1]],
                                            feature_columns[1]: train_df[feature_columns[0]],
                                            label_column: train_df[label_column]})
        real_train_df = pd.concat([train_df, train_df_other_part])
        real_dev_df = dev_df
    else:
        real_train_df = train_df
        real_dev_df = dev_df
    if args.mode == 'stacking':
        predictor = TabularPredictor(label=label_column, eval_metric=eval_metric, path=args.exp_dir)
        predictor.fit(train_data=real_train_df, tuning_data=real_dev_df,
                      hyperparameters='multimodal', num_bag_folds=5, num_stack_levels=1)
    elif args.mode == 'weighted':
        predictor = TabularPredictor(label=label_column, eval_metric=eval_metric, path=args.exp_dir)
        predictor.fit(train_data=real_train_df, tuning_data=real_dev_df,
                      hyperparameters='multimodal')
    elif args.mode == 'single':
        # No tabular ensembling: TextPredictor trains a single model internally.
        predictor = TextPredictor(label=label_column, eval_metric=eval_metric, path=args.exp_dir)
        predictor.fit(train_data=real_train_df, tuning_data=real_dev_df, seed=args.seed)
    else:
        raise NotImplementedError
    dev_metric_score = predictor.evaluate(dev_df)
    dev_predictions = predictor.predict(dev_df, as_pandas=True)
    test_predictions = predictor.predict(test_df, as_pandas=True)
    dev_predictions.to_csv(os.path.join(args.exp_dir, 'dev_prediction.csv'))
    test_predictions.to_csv(os.path.join(args.exp_dir, 'test_prediction.csv'))
    with open(os.path.join(args.exp_dir, 'final_model_scores.json'), 'w') as of:
        json.dump({f'valid_{eval_metric}': dev_metric_score}, of)
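
# A minimal sketch of the CLI that train() above assumes. The flag names simply
# mirror the attributes read from `args` and are otherwise hypothetical; the
# actual repo may define them elsewhere.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(
        description='Train an AutoGluon text/tabular predictor on a GLUE-style task.')
    parser.add_argument('--task', type=str, required=True,
                        help="Task key into TASKS, e.g. 'mrpc' or 'sts'.")
    parser.add_argument('--train_file', type=str, required=True)
    parser.add_argument('--dev_file', type=str, required=True)
    parser.add_argument('--test_file', type=str, required=True)
    parser.add_argument('--mode', type=str, default='single',
                        choices=['stacking', 'weighted', 'single'])
    parser.add_argument('--exp_dir', type=str, default=None)
    parser.add_argument('--seed', type=int, default=123)
    return parser.parse_args()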
all_train_data = all_train_data[keep_ind]
train_data, test_data = train_test_split(all_train_data, test_size=0.2,
                                         random_state=np.random.RandomState(seed))
train_data.to_csv(os.path.join(directory, output_subdir, train_name), index=False)
test_data.to_csv(os.path.join(directory, output_subdir, test_name), index=False)
print(f'#Train={len(train_data)}, #Test={len(test_data)}')

# Test run AutoGluon:
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.feature_extraction.text import CountVectorizer
from autogluon.features.generators import AutoMLPipelineFeatureGenerator

MAX_NGRAM = 300
time_limit = 300
feature_generator = AutoMLPipelineFeatureGenerator(
    vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3),
                               max_features=MAX_NGRAM, dtype=np.uint8))
predictor = TabularPredictor(label=label, path=directory + output_subdir,
                             problem_type=problem_type)
predictor.fit(train_data, time_limit=time_limit,
              feature_generator=feature_generator, hyperparameters={'GBM': {}})
predictor.evaluate(test_data)

# Compute checksums:
from auto_mm_bench.utils import sha1sum
print("Train hash:\n", sha1sum(os.path.join(directory, output_subdir, train_name)))
print("Test hash:\n", sha1sum(os.path.join(directory, output_subdir, test_name)))
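
# `sha1sum` above comes from auto_mm_bench.utils. A minimal sketch of an
# equivalent helper in case that package is unavailable, assuming it hashes the
# file contents and returns the hex digest; `sha1sum_local` is illustrative.
import hashlib

def sha1sum_local(path, chunk_size=1 << 20):
    h = hashlib.sha1()
    with open(path, 'rb') as f:
        # Read in chunks so large CSVs do not need to fit in memory.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()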
hyperparameters = {
    'XGB': {
        'n_estimators': 1000,
        'learning_rate': ag.Real(0.01, 0.1, log=True),
    },
}
predictor = TabularPredictor(label=label, path=save_path).fit(
    train_data,
    hyperparameters=hyperparameters,
    hyperparameter_tune_kwargs='auto',
    time_limit=60)
results = predictor.fit_summary()  # display detailed summary of fit() process
print(results)

# Inference time:
test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')  # another Pandas DataFrame
print(test_data.head())

perf = predictor.evaluate(test_data)  # shorthand way to evaluate our predictor if test labels are available
# Otherwise we make predictions and can evaluate them later:
y_pred = predictor.predict_proba(test_data)
perf = predictor.evaluate_predictions(y_true=test_data[label], y_pred=y_pred,
                                      auxiliary_metrics=True)
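
# Hedged follow-up: fit() persists the predictor under `save_path`, so it can
# be reloaded later without refitting; TabularPredictor.load is the standard
# AutoGluon API for this. A minimal sketch:
from autogluon.tabular import TabularPredictor

loaded_predictor = TabularPredictor.load(save_path)  # same path used in fit() above
y_pred_reloaded = loaded_predictor.predict(test_data)
print(loaded_predictor.leaderboard(test_data))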
ag_predictor_args["path"] = args.model_dir ag_fit_args = config["ag_fit_args"] predictor = TabularPredictor(**ag_predictor_args).fit( train_data, **ag_fit_args) logger.info("Best model: %s", predictor.get_model_best()) # Leaderboard lb = predictor.leaderboard() lb.to_csv(f'{args.output_data_dir}/leaderboard.csv', index=False) logger.info("Saved leaderboard to output.") # Feature importance feature_importance = predictor.feature_importance(test_data) feature_importance.to_csv(f'{args.output_data_dir}/feature_importance.csv') logger.info("Saved feature importance to output.") # Evaluation evaluation = predictor.evaluate(test_data) with open(f'{args.output_data_dir}/evaluation.json', 'w') as f: json.dump(evaluation, f) logger.info("Saved evaluation to output.") predictor.save_space() # ---------------------------- Inference ----------------------------------- test_data_nolabel = test_data.drop(labels=ag_predictor_args['label'], axis=1) y_pred = predictor.predict(test_data_nolabel) y_pred.to_csv(f'{args.output_data_dir}/predictions.csv', index=False)
def train_model(dataset_name, text_presets, save_dir, model, tabular_presets,
                num_gpus=None, get_competition_results=False, seed=123):
    set_seed(seed)
    train_dataset = dataset_registry.create(dataset_name, 'train')
    if get_competition_results:
        test_dataset = dataset_registry.create(dataset_name, 'competition')
    else:
        test_dataset = dataset_registry.create(dataset_name, 'test')
    feature_columns = train_dataset.feature_columns
    label_columns = train_dataset.label_columns
    metric = train_dataset.metric
    problem_type = train_dataset.problem_type
    train_data1, tuning_data1 = sklearn.model_selection.train_test_split(
        train_dataset.data, test_size=0.05,
        random_state=np.random.RandomState(seed))
    train_data = train_dataset.data
    test_data = test_dataset.data
    column_types, inferred_problem_type = infer_column_problem_types(
        train_data1, tuning_data1,
        label_columns=label_columns,
        problem_type=problem_type)
    train_data = train_data[feature_columns + label_columns]
    # tuning_data = tuning_data[feature_columns + label_columns]
    if not get_competition_results:
        test_data = test_data[feature_columns + label_columns]
    train_tic = time.time()
    if model == 'ag_tabular_quick':
        MAX_NGRAM = 300
        feature_generator = AutoMLPipelineFeatureGenerator(
            vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3),
                                       max_features=MAX_NGRAM, dtype=np.uint8))
        predictor = TabularPredictor(label=label_columns[0], path=save_dir,
                                     problem_type=problem_type)
        predictor.fit(train_data, time_limit=30,
                      feature_generator=feature_generator)
    elif model == 'ag_tabular_without_text':
        no_text_feature_columns = [col_name for col_name in feature_columns
                                   if column_types[col_name] != _TEXT]
        train_data = train_data[no_text_feature_columns + label_columns]
        # tuning_data = tuning_data[no_text_feature_columns + label_columns]
        test_data = test_data[no_text_feature_columns + label_columns]
        predictor = TabularPredictor(path=save_dir, label=label_columns[0],
                                     problem_type=problem_type, eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          presets=tabular_presets)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          num_bag_folds=5, num_stack_levels=1)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_tabular_old':
        predictor = TabularPredictor(path=save_dir, label=label_columns[0],
                                     problem_type=problem_type, eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data, presets=tabular_presets,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data, num_bag_folds=5, num_stack_levels=1,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_text_only':
        text_feature_columns = [col_name for col_name in feature_columns
                                if column_types[col_name] == _TEXT]
        train_data = train_data[text_feature_columns + label_columns]
        test_data = test_data[text_feature_columns + label_columns]
        predictor = TextPredictor(path=save_dir, label=label_columns[0],
                                  problem_type=problem_type, eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data, hyperparameters=hparams,
                      num_gpus=num_gpus, seed=seed)
    elif model == 'ag_text_multimodal':
        predictor = TextPredictor(path=save_dir, label=label_columns[0],
                                  problem_type=problem_type, eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data, hyperparameters=hparams,
                      num_gpus=num_gpus, seed=seed)
    elif model in ('pre_embedding', 'tune_embedding_multimodal', 'tune_embedding_text'):
        feature_generator = AutoMLPipelineFeatureGenerator(
            enable_text_special_features=False,
            enable_text_ngram_features=False)
        pre_embedding_folder = os.path.join(_CURR_DIR, 'pre_computed_embeddings')
        if model == 'pre_embedding':
            train_features = np.load(os.path.join(pre_embedding_folder, dataset_name,
                                                  'pretrain_text_embedding', 'train.npy'))
            test_features = np.load(os.path.join(pre_embedding_folder, dataset_name,
                                                 'pretrain_text_embedding', 'test.npy'))
        elif model == 'tune_embedding_multimodal':
            train_features = np.load(os.path.join(pre_embedding_folder, dataset_name,
                                                  'multimodal_embedding', 'train.npy'))
            test_features = np.load(os.path.join(pre_embedding_folder, dataset_name,
                                                 'multimodal_embedding', 'test.npy'))
        elif model == 'tune_embedding_text':
            train_features = np.load(os.path.join(pre_embedding_folder, dataset_name,
                                                  'tuned_text_embedding', 'train.npy'))
            test_features = np.load(os.path.join(pre_embedding_folder, dataset_name,
                                                 'tuned_text_embedding', 'test.npy'))
        else:
            raise NotImplementedError
        train_data = train_data.join(
            pd.DataFrame(train_features,
                         columns=[f'pre_feat{i}' for i in range(train_features.shape[1])]))
        train_data.reset_index(drop=True, inplace=True)
        test_data = test_data.join(
            pd.DataFrame(test_features,
                         columns=[f'pre_feat{i}' for i in range(test_features.shape[1])]))
        test_data.reset_index(drop=True, inplace=True)
        predictor = TabularPredictor(path=save_dir, label=label_columns[0],
                                     problem_type=problem_type, eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data, presets=tabular_presets,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data, num_bag_folds=5, num_stack_levels=1,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model in ('tabular_multimodal', 'tabular_multimodal_just_table'):
        MAX_NGRAM = 300
        if model == 'tabular_multimodal':
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3),
                                           max_features=MAX_NGRAM, dtype=np.uint8),
                enable_raw_text_features=True)
            hyperparameters = get_multimodal_tabular_hparam_just_gbm(text_presets=text_presets)
        else:
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3),
                                           max_features=MAX_NGRAM, dtype=np.uint8),
                enable_raw_text_features=True,
                enable_text_special_features=False,
                enable_text_ngram_features=False)
            hyperparameters = multimodal_tabular_just_table_hparam(text_presets=text_presets)
        predictor = TabularPredictor(path=save_dir, label=label_columns[0],
                                     problem_type=problem_type, eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data, presets=tabular_presets,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data, num_bag_folds=5, num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '3fold_1stack':
            predictor.fit(train_data=train_data, num_bag_folds=3, num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    train_toc = time.time()
    inference_tic = time.time()
    predictions = predictor.predict(test_data, as_pandas=True)
    predictor.save()
    inference_toc = time.time()
    if problem_type == MULTICLASS or problem_type == BINARY:
        prediction_prob = predictor.predict_proba(test_data, as_pandas=True)
        prediction_prob.to_csv(os.path.join(save_dir, 'test_prediction_prob.csv'))
    predictions.to_csv(os.path.join(save_dir, 'test_prediction.csv'))
    gt = test_data[label_columns[0]]
    gt.to_csv(os.path.join(save_dir, 'ground_truth.csv'))
    if not get_competition_results:
        score = predictor.evaluate(test_data)
        with open(os.path.join(save_dir, 'test_score.json'), 'w') as of:
            json.dump({metric: score}, of)
    with open(os.path.join(save_dir, 'speed_stats.json'), 'w') as of:
        json.dump({'train_time': train_toc - train_tic,
                   'inference_time': inference_toc - inference_tic,
                   'cpuinfo': cpuinfo.get_cpu_info()}, of)
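
# A minimal usage sketch for train_model() above. The dataset key and preset
# names are placeholders for illustration, not values confirmed by this repo.
if __name__ == '__main__':
    train_model(dataset_name='product_sentiment',  # hypothetical dataset_registry key
                text_presets='default',            # hypothetical ag_text_presets name
                save_dir='./output',
                model='tabular_multimodal',
                tabular_presets='5fold_1stack',
                num_gpus=1,
                seed=123)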