def run_example(self):
    """Train on a sample of the churn data and score the model on the test file.

    Reads ``./data/churn-train.csv`` (first 500 rows only), fits a predictor
    for the ``churn_probability`` column with ``mean_absolute_error`` as the
    evaluation metric, then predicts on ``./data/churn-test.csv`` with the
    label column removed.

    Returns:
        The value returned by ``predictor.evaluate_predictions`` (called with
        ``auxiliary_metrics=True``).
    """
    label_column = 'churn_probability'
    model_dir = 'agModels-predictClass'  # folder where trained models are stored

    train_data = task.Dataset(file_path='./data/churn-train.csv')
    train_data = train_data.head(500)  # subsample 500 data points for faster demo
    print(train_data.head())
    print("Summary of class variable: \n", train_data[label_column].describe())

    # The model folder is passed explicitly so the predictor can later be
    # reloaded from it with task.load(model_dir).
    predictor = task.fit(train_data=train_data,
                         label=label_column,
                         output_directory=model_dir,
                         eval_metric="mean_absolute_error")

    test_data = task.Dataset(file_path='./data/churn-test.csv')
    y_test = test_data[label_column]  # values to predict
    # Delete the label column to prove we're not cheating.
    test_data_nolab = test_data.drop(labels=[label_column], axis=1)
    print(test_data_nolab.head())

    y_pred = predictor.predict(test_data_nolab)
    print("Predictions: ", y_pred)
    perf = predictor.evaluate_predictions(y_true=y_test,
                                          y_pred=y_pred,
                                          auxiliary_metrics=True)
    # perf is not a str, so it is formatted instead of concatenated
    # (concatenation raises TypeError).
    print(f"MAE: {perf}")
    return perf
def frc_AutoGluon(df_train, df_test, categoricalVars, experiment_label='grocery',
                  responseVar='wk1_sales_all_stores'):
    """Fit AutoGluon on ``df_train`` and forecast ``df_test`` with every model.

    Args:
        df_train: Training DataFrame; must contain ``responseVar``.
        df_test: DataFrame to forecast.
        categoricalVars: Column names cast to ``str`` in both frames before
            fitting. The caller's frames are left untouched.
        experiment_label: Sub-folder of ``auto_gluon/`` used as the output
            directory.
        responseVar: Name of the label column.

    Returns:
        dict with keys ``'autoGluon_frc'`` (forecast of the default model),
        ``'autoGluon_model'`` (the fitted predictor) and ``'individual_frc'``
        (``{'AG_<model name>': forecast}`` for each name in
        ``model.model_names``).
    """
    from autogluon import TabularPrediction as task

    # astype with a column->dtype mapping returns new frames, so the caller's
    # DataFrames keep their original dtypes.
    str_dtypes = {varName: str for varName in categoricalVars}
    df_train = df_train.astype(str_dtypes)
    df_test = df_test.astype(str_dtypes)

    # AutoGluon format
    train_data = task.Dataset(df=df_train)
    test_data = task.Dataset(df=df_test)

    model = task.fit(train_data=train_data,
                     output_directory="auto_gluon/" + experiment_label,
                     label=responseVar,
                     hyperparameter_tune=False)

    # Forecast with the best model
    autogluon_frc = model.predict(test_data)

    # Forecast with all the models
    individual_frc = {
        'AG_' + model_to_use: model.predict(test_data, model=model_to_use)
        for model_to_use in model.model_names
    }

    return {
        'autoGluon_frc': autogluon_frc,
        'autoGluon_model': model,
        'individual_frc': individual_frc,
    }
def train(self, train_data, val_data, params):
    """Fit a TabularPrediction model, with hyperparameter tuning when requested.

    Args:
        train_data: Data wrapped into the training ``TabularPrediction.Dataset``.
        val_data: Data wrapped into the tuning ``TabularPrediction.Dataset``.
        params: Mapping read for ``"hp_tune"``, ``"autogluon"`` and
            ``"label"``. When ``params["hp_tune"] is True`` the tuning
            settings come from ``params["autogluon"]["hyperparameters"]``.

    Side effects:
        Stores the label in ``self._label_column`` and the fitted model in
        ``self._model``, dumps ``params`` next to the model output and calls
        ``fit_summary()`` on the model.
    """
    train_dataset = TabularPrediction.Dataset(train_data)
    val_dataset = TabularPrediction.Dataset(val_data)

    # One timestamped folder per run under the first output folder.
    base_folder = self.get_output_folders()[0]
    output_dir = os.path.join(base_folder, dt.now().strftime('%Y%m%d%H%M%S'))

    hp_tune = params["hp_tune"]
    ag_params = params["autogluon"]
    self._label_column = params["label"]

    # Arguments shared by the tuned and the untuned fit.
    fit_kwargs = {
        "train_data": train_dataset,
        "tuning_data": val_dataset,
        "label": self._label_column,
        "output_directory": output_dir,
    }

    # Only the literal True enables tuning; other truthy values do not.
    if hp_tune is True:
        hp_params = ag_params["hyperparameters"]
        time_limits = hp_params["time_limits"]
        num_trials = hp_params["num_trials"]
        hyperparameters = self.__create_hp_params(hp_params)
        search_strategy = hp_params["search_strategy"]
        fit_kwargs.update(
            time_limits=time_limits,
            num_trials=num_trials,
            hyperparameter_tune=hp_tune,
            hyperparameters=hyperparameters,
            search_strategy=search_strategy,
        )

    self._model = TabularPrediction.fit(**fit_kwargs)

    self.__dump_params(output_dir, params)
    self._model.fit_summary()
def run(self, train_path, test_path, target, task):
    """Fit on ``train_path`` and evaluate the predictor on ``test_path``.

    Args:
        train_path: Path handed to ``task.Dataset`` for the training data.
        test_path: Path handed to ``task.Dataset`` for the test data.
        target: Name of the label column, used for fitting and for scoring.
        task: Object exposing ``Dataset(file_path=...)`` and ``fit(...)``
            (presumably AutoGluon's ``TabularPrediction``; confirm with caller).

    Returns:
        The result of ``predictor.evaluate_predictions`` called with
        ``auxiliary_metrics=True``.
    """
    train_data = task.Dataset(file_path=train_path)
    # The label column is the caller-supplied ``target``.
    predictor = task.fit(train_data=train_data,
                         label=target,
                         eval_metric="f1_macro",
                         num_bagging_folds=5)

    test_data = task.Dataset(file_path=test_path)
    y_test = test_data[target]
    y_pred = predictor.predict(test_data)
    return predictor.evaluate_predictions(y_true=y_test.to_numpy(),
                                          y_pred=y_pred,
                                          auxiliary_metrics=True)
def convert_gluon(X_train, y_train):
    """Pack a feature matrix and its labels into a ``task.Dataset``.

    Args:
        X_train: Indexable sequence of rows; every row must provide at least
            as many values as the first row. May be empty.
        y_train: Labels, stored unchanged in the ``'class'`` column.

    Returns:
        ``task.Dataset`` with columns ``feature_0 .. feature_{k-1}`` followed
        by ``class``, where ``k`` is the length of the first row.

    Raises:
        IndexError: If a row is shorter than the first row.
    """
    # The column count comes from the first row; an empty input has none.
    num_features = len(X_train[0]) if len(X_train) else 0

    # Build each column in a single pass over the rows.
    data = {
        'feature_' + str(j): [row[j] for row in X_train]
        for j in range(num_features)
    }
    data['class'] = y_train

    frame = pd.DataFrame(data, columns=list(data))
    return task.Dataset(frame)
def train(args):
    """Fit an AutoGluon predictor on the training file named by ``args``.

    Args:
        args: Namespace providing ``model_dir``, ``train_dir``, ``filename``,
            ``target``, ``debug``, ``eval_metric`` and ``presets``.

    Returns:
        The predictor returned by ``task.fit``; models are written to
        ``args.model_dir``.
    """
    # SageMaker passes num_cpus, num_gpus and other args we could use to tailor
    # training to the container environment; they are not used here, so they
    # are not read (reading SM_NUM_GPUS would fail outside SageMaker).
    train_dir = args.train_dir
    logging.info(train_dir)

    train_data = task.Dataset(file_path=os.path.join(train_dir, args.filename))
    if args.debug:
        # Subsample for a faster demo; never ask for more rows than exist,
        # because sampling without replacement would raise.
        subsample_size = min(500, len(train_data))
        train_data = train_data.sample(n=subsample_size, random_state=0)

    predictor = task.fit(train_data=train_data,
                         label=args.target,
                         output_directory=args.model_dir,
                         eval_metric=args.eval_metric,
                         presets=args.presets)
    return predictor
def load_data(directory_prefix, train_file, test_file, name, url=None):
    """Load a train/test dataset pair, downloading it first when missing.

    The files are expected at ``<directory_prefix>/<name>/<train_file>`` and
    ``<directory_prefix>/<name>/<test_file>``. If either is missing, the zip
    archive at ``url`` is downloaded into ``directory_prefix``, unpacked there
    and the archive is deleted.

    Args:
        directory_prefix: Root folder for datasets; created if absent.
        train_file: File name of the training data.
        test_file: File name of the test data.
        name: Dataset name, used as sub-folder and in the log message.
        url: Location of the zip archive; only needed when files are missing.

    Returns:
        Tuple ``(train_data, test_data)`` of ``task.Dataset`` objects.

    Raises:
        ValueError: If the files are missing locally and no ``url`` is given.
    """
    os.makedirs(directory_prefix, exist_ok=True)

    directory = os.path.join(directory_prefix, name)
    train_file_path = os.path.join(directory, train_file)
    test_file_path = os.path.join(directory, test_file)

    if not (os.path.exists(train_file_path) and os.path.exists(test_file_path)):
        if url is None:
            raise ValueError(
                "%s data not found locally under %s and no url was given"
                % (name, directory))
        # fetch files from s3:
        print("%s data not found locally, so fetching from %s" % (name, url))
        zip_name = ag.download(url, directory_prefix)
        ag.unzip(zip_name, directory_prefix)
        os.remove(zip_name)

    train_data = task.Dataset(file_path=train_file_path)
    test_data = task.Dataset(file_path=test_file_path)
    return train_data, test_data
def predict(cls, prediction_input: DataFrame):
    """Predict with the class-level model on the given input.

    Args:
        prediction_input (a pandas dataframe): The rows to predict on. There
            will be one prediction per row in the dataframe.

    Returns:
        Whatever ``cls.model.predict`` returns for the wrapped input.
    """
    dataset = task.Dataset(df=prediction_input)

    # Echo the head of the input so each request is visible in the logs.
    print("Prediction Data: ")
    preview = dataset.head()
    print(preview)

    model = cls.model
    return model.predict(dataset)
def train(self, data, params):
    """Fit an AutoGluon predictor on ``data.unscaled_df``.

    Args:
        data: Object whose ``unscaled_df`` attribute holds the training table.
        params: Accepted but not read by this method.

    Side effects:
        Sets ``self.data``, ``self.train_data``, ``self.predictor`` and
        finally ``self.state`` (to ``"TRAINED"``).
    """
    self.data = data
    self.train_data = task.Dataset(data.unscaled_df)

    # A fresh UUID sub-folder per call keeps each run's models separate.
    autogluon_dir = f'agModels-predictClass/{uuid.uuid4()}'

    fit_options = dict(
        train_data=self.train_data,
        # The first entry of the "output" metadata is the label column.
        label=self.metadata.get("output")[0],
        output_directory=autogluon_dir,
    )
    self.predictor = task.fit(**fit_options)
    self.state = "TRAINED"
def frc_AutoGluon(df_train, df_test, categoricalVars, responseVar = 'wk1_sales_all_stores'):
    """Fit AutoGluon on ``df_train`` and forecast ``df_test``.

    Args:
        df_train: Training DataFrame; must contain ``responseVar``.
        df_test: DataFrame to forecast.
        categoricalVars: Column names cast to ``str`` in both frames before
            fitting. The caller's frames are left untouched.
        responseVar: Name of the label column.

    Returns:
        dict with keys ``'autoGluon_frc'`` (the forecast) and
        ``'autoGluon_model'`` (the fitted predictor). Models are written to
        the ``auto_gluon`` folder.
    """
    from autogluon import TabularPrediction as task

    # astype with a column->dtype mapping returns new frames, so the caller's
    # DataFrames keep their original dtypes.
    str_dtypes = {varName: str for varName in categoricalVars}
    df_train = df_train.astype(str_dtypes)
    df_test = df_test.astype(str_dtypes)

    # AutoGluon format
    train_data = task.Dataset(df=df_train)
    test_data = task.Dataset(df=df_test)

    model = task.fit(train_data=train_data,
                     output_directory="auto_gluon",
                     label=responseVar,
                     hyperparameter_tune=False)

    # Forecast with the best model
    autogluon_frc = model.predict(test_data)

    return {'autoGluon_frc': autogluon_frc, 'autoGluon_model': model}
def __load_input_data(path: str) -> TabularDataset:
    """
    Load every file in a directory as CSV and concatenate them into one dataset.

    :param path: directory whose entries are all read with ``pd.read_csv``
    :return: dataset built from the concatenated frames, or ``None`` when the
        files cannot be read or there is nothing to concatenate
    :raises OSError: if ``path`` itself cannot be listed
    """
    # Sorted so the row order of the result does not depend on the file system.
    input_data_files = sorted(os.listdir(path))
    try:
        input_dfs = [pd.read_csv(os.path.join(path, data_file))
                     for data_file in input_data_files]
        return task.Dataset(df=pd.concat(input_dfs))
    except Exception as err:
        # Best effort: report and return None, but let KeyboardInterrupt and
        # SystemExit propagate and keep the underlying cause visible.
        print(f'No csv data in {path}!')
        print(f'Reason: {err!r}')
        return None
def evaluate(predictor, args):
    """Score ``predictor`` on the test file and upload the reports to S3.

    The test file name is the training file name with the first ``'train'``
    replaced by ``'test'``. Four files are written to the current working
    directory and uploaded: test predictions, leaderboard, model performance
    and fit summary, each prefixed with the dataset name (the part of the
    training file name before the first ``'_'``).

    Args:
        predictor: Fitted predictor offering ``predict``, ``leaderboard``,
            ``evaluate_predictions`` and ``fit_summary``.
        args: Namespace providing ``train_dir``, ``filename``, ``target``,
            ``training_job_name`` and ``s3_output`` (an S3 URL).
    """
    train_dir = args.train_dir
    train_file = args.filename
    test_file = train_file.replace('train', 'test', 1)
    target = args.target
    training_job_name = args.training_job_name
    s3_output = args.s3_output

    dataset_name = train_file.split('_')[0]
    logging.info(dataset_name)

    test_data = task.Dataset(file_path=os.path.join(train_dir, test_file))

    # Split the output URL into bucket and key prefix.
    u = urlparse(s3_output, allow_fragments=False)
    bucket = u.netloc
    logging.info(bucket)
    prefix = u.path.strip('/')
    logging.info(prefix)
    s3 = boto3.client('s3')

    y_test = test_data[target]
    test_data_nolab = test_data.drop(labels=[target], axis=1)

    y_pred = predictor.predict(test_data_nolab)
    y_pred_df = pd.DataFrame.from_dict({'True': y_test, 'Predicted': y_pred})
    pred_file = f'{dataset_name}_test_predictions.csv'
    y_pred_df.to_csv(pred_file, index=False, header=True)

    leaderboard = predictor.leaderboard()
    lead_file = f'{dataset_name}_leaderboard.csv'
    leaderboard.to_csv(lead_file)

    perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred,
                                          auxiliary_metrics=True)
    # Dropped before dumping to JSON (presumably not serialisable; confirm).
    # pop() tolerates results that carry no confusion matrix at all.
    perf.pop('confusion_matrix', None)
    perf_file = f'{dataset_name}_model_performance.txt'
    with open(perf_file, 'w') as f:
        print(json.dumps(perf, indent=4), file=f)

    summary = predictor.fit_summary()
    summ_file = f'{dataset_name}_fit_summary.txt'
    with open(summ_file, 'w') as f:
        print(summary, file=f)

    job_folder = training_job_name.replace('mxnet-training', 'autogluon', 1)
    files_to_upload = [pred_file, lead_file, perf_file, summ_file]
    for file in files_to_upload:
        # S3 keys always use '/', whatever the local path separator is;
        # empty parts (e.g. an empty prefix) are skipped.
        key = '/'.join(part for part in (prefix, job_folder, file) if part)
        s3.upload_file(file, bucket, key)
def train(args):
    """Fit an AutoGluon predictor on a random subsample of the training file.

    Args:
        args: Namespace providing ``model_dir``, ``label_column``, ``train``,
            ``train_filename`` and ``train_rows`` (number of rows to sample).

    Returns:
        The predictor returned by ``task.fit``; models are written to
        ``args.model_dir``.
    """
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor
    # training to the current container environment, but here we just use
    # simple cpu context.
    model_dir = args.model_dir
    target = args.label_column

    train_file_path = get_file_path(args.train, args.train_filename)
    train_data = task.Dataset(file_path=train_file_path)

    # Subsample for a faster demo. The request is capped at the number of
    # available rows, because sampling without replacement would raise.
    subsample_size = min(int(args.train_rows), len(train_data))
    train_data = train_data.sample(n=subsample_size, random_state=0)

    predictor = task.fit(train_data=train_data, label=target,
                         output_directory=model_dir)
    return predictor
def transform_fn(net, data, input_content_type, output_content_type):
    """
    Transform a request using the Gluon model. Called once per request.

    :param net: The Gluon model.
    :param data: The request payload.
    :param input_content_type: The request content type. ('text/csv')
    :param output_content_type: The (desired) response content type. ('text/csv')
    :return: response payload and content type.
    :raises NotImplementedError: if ``input_content_type`` is not 'text/csv'.
    """
    start = timer()

    if input_content_type != 'text/csv':
        raise NotImplementedError("content_type must be 'text/csv'")

    # Load dataset
    df = pd.read_csv(StringIO(data))
    ds = task.Dataset(df=df)

    # Predict
    predictions = net.predict(ds)
    prediction_list = predictions.tolist()
    print(f'Prediction counts: {Counter(prediction_list)}')

    # Form response
    output = StringIO()
    pd.DataFrame(predictions).to_csv(output, header=False, index=False)
    response_body = output.getvalue()

    # If target column passed, evaluate predictions performance
    target = net.label_column
    if target in ds:
        print(f'Label column ({target}) found in input data. '
              'Therefore, evaluating prediction performance...')
        try:
            performance = net.evaluate_predictions(y_true=ds[target],
                                                   y_pred=prediction_list,
                                                   auxiliary_metrics=True)
            print(json.dumps(performance, indent=4))
        except Exception as e:
            # Evaluation is a diagnostic extra: report the problem but still
            # return the predictions that were already computed.
            print(f'Exception: {e}')

    elapsed_time = round(timer() - start, 3)
    print(f'Elapsed time: {elapsed_time} seconds')

    return response_body, output_content_type
def Load_GLUON(dataDownstream, dataFeaturized):
    """Return the ``feature_type`` values found in ``AutoGluon_predictions.csv``.

    The function first overwrites ``AutoGluon_predictions.csv`` with a
    header-only file (columns ``column`` and ``feature_type``), then runs an
    AutoGluon fit on a copy of ``dataDownstream`` with a constant dummy label,
    and finally reads the CSV back.

    NOTE(review): nothing in this function writes rows to the CSV, so the fit
    call is presumably expected to fill it as a side effect (e.g. through an
    instrumented AutoGluon build). Confirm against the environment.

    Args:
        dataDownstream: Table whose columns are analysed; it is deep-copied,
            so the caller's object is not modified.
        dataFeaturized: Accepted but not read by this function.

    Returns:
        list: The ``feature_type`` column of the CSV, in file order.
    """
    predictions_file = 'AutoGluon_predictions.csv'

    # Reset the exchange file so stale rows from an earlier run cannot leak in.
    header_only = pd.DataFrame(columns=['column', 'feature_type'])
    header_only.to_csv(predictions_file, index=False)

    train = copy.deepcopy(dataDownstream)
    label_column = 'label_target'
    train[label_column] = 1  # constant dummy label, needed only to call fit
    train_data = task.Dataset(df=train)

    try:
        task.fit(train_data=train_data, label=label_column)
    except Exception:
        # Deliberate best effort: a failing fit must not stop the CSV from
        # being read below. KeyboardInterrupt/SystemExit still propagate.
        pass

    agl_predictions = pd.read_csv(predictions_file)
    predictions = agl_predictions['feature_type'].values.tolist()
    return predictions
def train(args):
    """Fit an AutoGluon predictor on the training file named by ``args``.

    Args:
        args: Namespace providing ``model_dir``, ``target``, ``train``
            (directory of the training data) and ``filename``.

    Returns:
        The predictor returned by ``task.fit``; models are written to
        ``args.model_dir``.
    """
    # SageMaker passes num_cpus, num_gpus and other args we could use to tailor
    # training to the container environment; they are not used here, so they
    # are not read (reading SM_NUM_GPUS would fail outside SageMaker).
    model_dir = args.model_dir
    target = args.target

    # load training data
    training_dir = args.train
    logging.info(training_dir)
    train_data = task.Dataset(file_path=os.path.join(training_dir, args.filename))

    predictor = task.fit(train_data=train_data, label=target,
                         output_directory=model_dir)
    return predictor
def train_regression_autogluon(args, train_df, test_df):
    """Fit an AutoGluon regressor for ``'thrpt'`` and save its test predictions.

    Writes ``pred_result.csv`` (columns ``gt`` and ``pred``, stored as
    float32) into ``args.out_dir`` and calls ``plot_save_figure`` with the
    same ground truth and predictions.

    Args:
        args: Namespace providing ``out_dir`` (model and report folder).
        train_df: Training table containing the ``'thrpt'`` label column.
        test_df: Test table containing the ``'thrpt'`` column.
    """
    # NOTE(review): reset_np()/set_np() presumably switch MXNet's
    # NumPy-compatible mode off for the AutoGluon run and back on afterwards.
    # Confirm against the MXNet version in use.
    mx.npx.reset_np()
    try:
        from autogluon import TabularPrediction as task
        predictor = task.fit(train_data=task.Dataset(df=train_df),
                             output_directory=args.out_dir,
                             label='thrpt',
                             eval_metric='mean_absolute_error')
        test_prediction = predictor.predict(test_df)
        gt_thrpt = test_df['thrpt'].to_numpy()

        # Column 0: ground truth, column 1: prediction.
        ret = np.zeros((len(test_prediction), 2), dtype=np.float32)
        for i, (lhs, rhs) in enumerate(zip(gt_thrpt, test_prediction)):
            ret[i][0] = lhs
            ret[i][1] = rhs

        df_result = pd.DataFrame(ret, columns=['gt', 'pred'])
        df_result.to_csv(os.path.join(args.out_dir, 'pred_result.csv'))
        plot_save_figure(gt_thrpt=gt_thrpt,
                         pred_thrpt=test_prediction,
                         save_dir=args.out_dir)
    finally:
        # Always switch the mode back, even when fitting or reporting fails.
        mx.npx.set_np()
def train(args):
    """Fit an AutoGluon predictor with a fixed model selection and preset.

    Args:
        args: Namespace providing ``model_dir``, ``target``, ``train``
            (directory of the training data) and ``filename``.

    Returns:
        The predictor returned by ``task.fit``; models are written to
        ``args.model_dir``.
    """
    # SageMaker passes num_cpus, num_gpus and other args we could use to tailor
    # training to the container environment; they are not used here, so they
    # are not read (reading SM_NUM_GPUS would fail outside SageMaker).
    model_dir = args.model_dir
    target = args.target

    # load training data
    training_dir = args.train
    logging.info(training_dir)

    # Model families handed to AutoGluon; 'GBM' is listed twice, once with
    # default options and once with extra_trees enabled and the 'XT' suffix.
    hyperparameters = {
        'GBM': [
            {},
            {
                'extra_trees': True,
                'AG_args': {
                    'name_suffix': 'XT'
                }
            },
        ],
        'RF': {},
        'XT': {},
        'KNN': {},
        'custom': ['GBM']
    }
    presets = 'medium_quality_faster_train'

    train_data = task.Dataset(file_path=os.path.join(training_dir, args.filename))
    predictor = task.fit(train_data=train_data,
                         label=target,
                         output_directory=model_dir,
                         presets=presets,
                         hyperparameters=hyperparameters)
    return predictor
os.environ['SM_TRAINING_ENV'])['job_name']) return parser.parse_args() if __name__ == '__main__': args = parse_args() predictor = train(args) training_dir = args.train train_file = args.filename test_file = train_file.replace('train', 'test', 1) dataset_name = train_file.split('_')[0] print(dataset_name) test_data = task.Dataset(file_path=os.path.join(training_dir, test_file)) u = urlparse(args.s3_output, allow_fragments=False) bucket = u.netloc print(bucket) prefix = u.path.strip('/') print(prefix) s3 = boto3.client('s3') try: y_test = test_data[args.target] # values to predict # delete label column to prove we're not cheating test_data_nolab = test_data.drop(labels=[args.target], axis=1) y_pred = predictor.predict(test_data_nolab) y_pred_df = pd.DataFrame.from_dict({
def processData(data,
                label_column=None,
                output_directory=None,
                ag_predictor=None,
                problem_type=None,
                eval_metric=None):
    """ Converts pandas Dataframe to matrix of entirely numerical values (stored in DataFrame).
        Performs same data preprocessing as used for AutoGluon's tabular neural network model,
        to deal with issues such as: missing value imputation, one-hot encoding of categoricals,
        handling of high-cardinality categoricals, handling unknown categorical feature-levels at test-time, etc.

        If ag_predictor is not None, uses existing autogluon predictor object to process data
        (must have tabularNN as first model).
        To process training data, ag_predictor should = None. For test data, should != None.

        Args:
            data: pandas DataFrame to convert.
            label_column: Name of the label column. Required (and must be present
                in ``data``) when ``ag_predictor`` is None.
            output_directory: Accepted but not read inside this function.
            ag_predictor: Previously returned predictor, or None to fit a new one.
            problem_type: Passed through to ``task.fit`` when fitting.
            eval_metric: Passed through to ``task.fit`` when fitting.

        Returns:
            Tuple (X, y, ag_predictor) where y may be None if labels are not present in test data.

        Raises:
            ValueError: If the label column is missing for training data, if the
                first model is not the neural network, or if the processed data
                does not consist of exactly one array.
    """
    # fit dummy neural network model just to preprocess data. Here we ensure no embedding layers are used.
    if ag_predictor is None:
        if label_column is None:
            raise ValueError(
                "when processing training data, label_column cannot be None")
        elif not label_column in data.columns:
            raise ValueError(
                "label_column cannot be missing from training data")
        # num_epochs=0 means the network is set up but not trained; the same
        # frame serves as training and tuning data.
        ag_predictor = task.fit(train_data=task.Dataset(data),
                                tuning_data=task.Dataset(data),
                                label=label_column,
                                hyperparameter_tune=False,
                                problem_type=problem_type,
                                eval_metric=eval_metric,
                                hyperparameters={
                                    'NN': {
                                        'num_epochs': 0,
                                        'proc.embed_min_categories': np.inf
                                    }
                                },
                                num_bagging_folds=0,
                                stack_ensemble_levels=0,
                                label_count_threshold=1,
                                verbosity=2,
                                feature_generator_kwargs={
                                    'enable_nlp_vectorizer_features': False,
                                    'enable_nlp_ratio_features': False
                                })
    # NOTE(review): relies on private predictor attributes (_trainer, _learner);
    # verify against the installed AutoGluon version.
    model = ag_predictor._trainer.load_model(
        ag_predictor._trainer.get_model_names_all()
        [0])  # This must be the neural net model which contains data processor
    if 'NeuralNet' not in model.name:
        raise ValueError(
            "Data preprocessing error. This model should be the NeuralNet, not the: %s"
            % model.name)
    bad_inds = []  # row-indices to remove from dataset
    if label_column is not None and label_column in data.columns:
        label_cleaner = ag_predictor._learner.label_cleaner
        y = data[label_column].values
        data = data.drop([label_column], axis=1, inplace=False)
        # The code below uses y.isna()/y.index/y.apply, so transform() is
        # presumably expected to return a pandas Series -- TODO confirm.
        y = label_cleaner.transform(y)
        if np.sum(y.isna()) > 0:
            bad_inds = y.index[y.apply(np.isnan)].tolist(
            )  # remove these inds as label is NaN (due to very rare classes)
            warnings.warn(
                "Dropped these rows from data in preprocessing, due to missing labels: "
                + str(bad_inds))
    else:
        y = None
    data_initial_processed = ag_predictor._learner.transform_features(
        data)  # general autogluon data processing.
    # data_fg = ag_predictor._learner.general_data_processing(X=data, X_test=data, holdout_frac=0.0, num_bagging_folds=0)
    tabNN_data = model.process_data(
        data_initial_processed, is_test=True
    )  # neural net-specific autogluon data processing required to turn tabular data into numerical matrix.
    numeric_data = tabNN_data.dataset._data  # list of mxnet.NDArrays
    # Exactly one array is expected; anything else is treated as a failure.
    if len(numeric_data) != 1:
        raise ValueError("Data Preprocessing failed.")
    numpy_data = numeric_data[0].asnumpy()  # 2D Numpy array
    X = pd.DataFrame(numpy_data)
    X.columns = ['feature' + str(i) for i in range(X.shape[1])]
    # Drop the rows whose label could not be transformed from both y and X.
    if len(bad_inds) > 0:
        y.drop(index=bad_inds, inplace=True)
        X.drop(index=bad_inds, axis=0, inplace=True)
    return (X, y, ag_predictor)
dataset = regression_dataset directory = dataset['name'] + "/" train_file = 'train_data.csv' test_file = 'test_data.csv' train_file_path = directory + train_file test_file_path = directory + test_file if (not os.path.exists(train_file_path)) or ( not os.path.exists(test_file_path)): # fetch files from s3: print("%s data not found locally, so fetching from %s" % (dataset['name'], dataset['url'])) os.system("wget " + dataset['url'] + " -O temp.zip && unzip -o temp.zip && rm temp.zip") train_data = task.Dataset(file_path=train_file_path) test_data = task.Dataset(file_path=test_file_path) train_data = train_data.head(subsample_size) # subsample for faster demo test_data = test_data.head(subsample_size) # subsample for faster run label_column = dataset['label_column'] # Fit model ensemble: predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, cache_data=True, auto_stack=True, time_limits=time_limits, eval_metric='mean_absolute_error') # Distill ensemble-predictor into single model:
def transform_fn(models, data, input_content_type, output_content_type):
    """
    Transform a request using the Gluon model. Called once per request.

    :param models: The Gluon model (index 0) and the column info (index 1,
        read for its 'columns' entry).
    :param data: The request payload.
    :param input_content_type: The request content type. ('text/csv')
    :param output_content_type: The (desired) response content type. ('text/csv')
    :return: response payload and content type. When prediction fails even
        after filling NaN's, the payload is the text of the error.
    :raises NotImplementedError: if ``input_content_type`` is not 'text/csv'.
    """
    start = timer()
    net = models[0]
    column_dict = models[1]

    if input_content_type != 'text/csv':
        raise NotImplementedError("content_type must be 'text/csv'")

    # Load dataset (the payload carries no header row)
    columns = column_dict['columns']
    df = pd.read_csv(StringIO(data), header=None)
    df_preprosessed = preprocess(df, columns, net.label_column)
    ds = task.Dataset(df=df_preprosessed)

    try:
        predictions = net.predict(ds)
    except Exception:
        # Second attempt with missing values replaced by 0.0.
        try:
            predictions = net.predict(ds.fillna(0.0))
            warnings.warn('Filled NaN\'s with 0.0 in order to predict.')
        except Exception as e:
            # Return the error as text so the payload stays serialisable.
            return str(e), output_content_type

    # Print prediction counts, limit in case of regression problem
    pred_counts = Counter(predictions.tolist())
    n_display_items = 30
    if len(pred_counts) > n_display_items:
        print(f'Top {n_display_items} prediction counts: '
              f'{dict(take(n_display_items, pred_counts.items()))}')
    else:
        print(f'Prediction counts: {pred_counts}')

    # Form response
    output = StringIO()
    pd.DataFrame(predictions).to_csv(output, header=False, index=False)
    response_body = output.getvalue()

    # If target column passed, evaluate predictions performance
    target = net.label_column
    if target in ds:
        print(f'Label column ({target}) found in input data. '
              'Therefore, evaluating prediction performance...')
        try:
            performance = net.evaluate_predictions(y_true=ds[target],
                                                   y_pred=predictions,
                                                   auxiliary_metrics=True)
            print(json.dumps(performance, indent=4))
            time.sleep(0.1)
        except Exception as e:
            # Print exceptions on evaluate, continue to return predictions
            print(f'Exception: {e}')

    elapsed_time = round(timer() - start, 3)
    print(f'Elapsed time: {elapsed_time} seconds')

    return response_body, output_content_type
import pandas as pd
import autogluon.core as ag
from autogluon import TabularPrediction as task
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Train an AutoGluon predictor for the 'test' column and print the feature
# importances it reports.
label_column = 'test'
model_dir = 'agModels-predictClass_jiagnwei'  # folder where trained models are stored

train_data = task.Dataset(file_path="/dataset/jiangweitrai.csv")
# TODO: this is the same file as the training data; point it at a held-out
# file once one is available.
test_data = task.Dataset(file_path="/dataset/jiangweitrai.csv")

predictor = task.fit(train_data=train_data,
                     label=label_column,
                     output_directory=model_dir,
                     auto_stack=True,
                     time_limits=1800)
results = predictor.fit_summary()
print(predictor.feature_importance(dataset=test_data, subsample_size=None))

# To reuse a previously trained predictor instead of fitting again:
# predictor = task.load(model_dir)
# print(predictor.info())
# print(predictor.feature_importance(dataset=train_data))
# Run Auto-WEKA: (num_models_trained, num_models_ensemble, fit_time, y_pred, y_prob, predict_time, class_order) = autoweka_fit_predict(train_data=train_data, test_data=test_data, label_column=label_column, problem_type=problem_type, output_directory=output_directory, autoweka_path=autoweka_path, eval_metric=eval_metric, runtime_sec=runtime_sec, random_state=random_state, num_cores=num_cores) # Can use autogluon.tabular.Predictor to evaluate predictions (assuming metric correctly specified): ag_predictor = task.fit(task.Dataset(df=train_data), label=label_column, problem_type=problem_type, eval_metric=eval_metric, hyperparameters={'GBM': { 'num_boost_round': 2 }}) if eval_metric == 'roc_auc': preds_toevaluate = y_prob[:, 1] elif eval_metric == 'log_loss': preds_toevaluate = y_prob else: preds_toevaluate = y_pred perf = ag_predictor.evaluate_predictions( test_data[label_column], preds_toevaluate
if not args.evaluate: if args.walltime <= 120: excluded_model_types = ["KNN"] else: excluded_model_types = [] # Create output directory pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True) (X_train, y_train), (X_valid, y_valid) = load_data(use_test=False) df_train = convert_to_dataframe(X_train, y_train) df_valid = convert_to_dataframe(X_valid, y_valid) predictor = task.fit( train_data=task.Dataset(df=df_train), tuning_data=task.Dataset(df=df_valid), label="label", output_directory=output_dir, time_limits=args.walltime, hyperparameter_tune=True, auto_stack=True, excluded_model_types=excluded_model_types, ) else: _, (X_test, y_test) = load_data(use_test=True) print("Convert arrays to DataFrame...") df_test = convert_to_dataframe(X_test, y_test) print("Loading models...")
def test_tabularHPO():
    """Benchmark AutoGluon fits on every entry of ``datasets``.

    For each dataset the data is fetched if missing, a predictor is fitted,
    reloaded from disk and evaluated on the test file. The resulting error
    (1 - accuracy, or 1 - r2 for regression) is compared with the stored
    ``performance_val``; regressions are reported through warnings rather than
    assertions.

    Reads these names from the enclosing module: ``datasets``, ``train_file``,
    ``test_file``, ``seed_val``, ``fast_benchmark``, ``subsample_size``,
    ``hyperparameter_tune``, ``hyperparameters``, ``time_limits``,
    ``num_trials``, ``verbosity``, ``perf_threshold``, ``EPS`` and
    ``REGRESSION``.
    """
    # Aggregate performance summaries obtained in previous benchmark run:
    prev_perf_vals = [dataset['performance_val'] for dataset in datasets]
    previous_avg_performance = np.mean(prev_perf_vals)
    previous_median_performance = np.median(prev_perf_vals)
    previous_worst_performance = np.max(prev_perf_vals)
    # Run benchmark:
    performance_vals = [0.0] * len(
        datasets)  # performance obtained in this run
    # Warnings are recorded so they can be listed again at the very end.
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx in range(len(datasets)):
            # Re-seed all random number generators for every dataset.
            seed(seed_val)
            np.random.seed(seed_val)
            mx.random.seed(seed_val)
            dataset = datasets[idx]
            print("Evaluating Benchmark Dataset %s (%d of %d)" %
                  (dataset['name'], idx + 1, len(datasets)))
            directory = dataset['name'] + "/"
            train_file_path = directory + train_file
            test_file_path = directory + test_file
            if (not os.path.exists(train_file_path)) or (
                    not os.path.exists(test_file_path)):
                # fetch files from s3:
                print("%s data not found locally, so fetching from %s" %
                      (dataset['name'], dataset['url']))
                # NOTE(review): the URL is spliced into a shell command; safe
                # only while dataset['url'] comes from trusted configuration.
                os.system("wget " + dataset['url'] +
                          " -O temp.zip && unzip -o temp.zip && rm temp.zip")
            savedir = directory + 'AutogluonOutput/'
            shutil.rmtree(
                savedir, ignore_errors=True
            )  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
            label_column = dataset['label_column']
            train_data = task.Dataset(file_path=train_file_path)
            test_data = task.Dataset(file_path=test_file_path)
            y_test = test_data[label_column]
            test_data = test_data.drop(labels=[label_column], axis=1)
            if fast_benchmark:
                train_data = train_data.head(
                    subsample_size)  # subsample for fast_benchmark
            predictor = None  # reset from last Dataset
            if fast_benchmark:
                predictor = task.fit(train_data=train_data,
                                     label=label_column,
                                     output_directory=savedir,
                                     hyperparameter_tune=hyperparameter_tune,
                                     hyperparameters=hyperparameters,
                                     time_limits=time_limits,
                                     num_trials=num_trials,
                                     verbosity=verbosity)
            else:
                predictor = task.fit(train_data=train_data,
                                     label=label_column,
                                     output_directory=savedir,
                                     hyperparameter_tune=hyperparameter_tune,
                                     verbosity=verbosity)
            results = predictor.fit_summary(verbosity=0)
            if predictor.problem_type != dataset['problem_type']:
                warnings.warn(
                    "For dataset %s: Autogluon inferred problem_type = %s, but should = %s"
                    % (dataset['name'], predictor.problem_type,
                       dataset['problem_type']))
            predictor = None  # We delete predictor here to test loading previously-trained predictor from file
            predictor = task.load(savedir)
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test,
                                                       y_pred=y_pred,
                                                       auxiliary_metrics=True)
            # Both branches yield an error value, so lower is better.
            if dataset['problem_type'] != REGRESSION:
                perf = 1.0 - perf_dict[
                    'accuracy_score']  # convert accuracy to error-rate
            else:
                perf = 1.0 - perf_dict[
                    'r2_score']  # unexplained variance score.
            performance_vals[idx] = perf
            print("Performance on dataset %s: %s   (previous perf=%s)" %
                  (dataset['name'], performance_vals[idx],
                   dataset['performance_val']))
            if (not fast_benchmark) and (
                    performance_vals[idx] >
                    dataset['performance_val'] * perf_threshold):
                warnings.warn(
                    "Performance on dataset %s is %s times worse than previous performance."
                    % (dataset['name'], performance_vals[idx] /
                       (EPS + dataset['performance_val'])))
        # Summarize:
        avg_perf = np.mean(performance_vals)
        median_perf = np.median(performance_vals)
        worst_perf = np.max(performance_vals)
        for idx in range(len(datasets)):
            print("Performance on dataset %s: %s   (previous perf=%s)" %
                  (datasets[idx]['name'], performance_vals[idx],
                   datasets[idx]['performance_val']))
        print("Average performance: %s" % avg_perf)
        print("Median performance: %s" % median_perf)
        print("Worst performance: %s" % worst_perf)
        # Aggregate comparisons are skipped in fast mode.
        if not fast_benchmark:
            if avg_perf > previous_avg_performance * perf_threshold:
                warnings.warn(
                    "Average Performance is %s times worse than previously." %
                    (avg_perf / (EPS + previous_avg_performance)))
            if median_perf > previous_median_performance * perf_threshold:
                warnings.warn(
                    "Median Performance is %s times worse than previously." %
                    (median_perf / (EPS + previous_median_performance)))
            if worst_perf > previous_worst_performance * perf_threshold:
                warnings.warn(
                    "Worst Performance is %s times worse than previously." %
                    (worst_perf / (EPS + previous_worst_performance)))
    # List all warnings again to make sure they are seen:
    print("\n\n WARNINGS:")
    for w in caught_warnings:
        warnings.warn(w.message)
""" wide and deep test, follow code from autogluon autogluon's NN architecture is based on wide and deep network """ from autogluon import TabularPrediction as task from data_config.data_config import load_data, data_config if __name__ == '__main__': res = {} for data_name in data_config.keys(): ylabel = data_config[data_name]['ylabel'] X_train, X_valid = load_data(data_name, combine_y=True) train_data = task.Dataset(df=X_train) test_data = task.Dataset(df=X_valid) savedir = f'{data_name}/' # where to save trained models predictor = task.fit( train_data=train_data, label=ylabel, output_directory=savedir, eval_metric='roc_auc', verbosity=2, visualizer='tensorboard', random_seed=0, save_space=True, keep_only_best=True, ) auc = predictor.evaluate(X_valid) res[data_name] = auc print(res)
# nn_options = { # specifies non-default hyperparameter values for neural network models # "num_epochs": 100, # number of training epochs (controls training time of NN models) # "learning_rate": ag.space.Real( # 0.001, 0.1, default=0.01, log=True # ), # learning rate used in training (real-valued hyperparameter searched on log-scale) # "activation": ag.space.Categorical( # None, swish, "relu", "tanh", "sigmoid" # ), # activation function used in NN (categorical hyperparameter, default = first entry) # "layers": ag.space.Categorical(*(nunits for _ in range(10))), # # Each choice for categorical hyperparameter 'layers' corresponds to list of sizes for each NN layer to use # "dropout_prob": 0.0, # } # hyperparameters = {"NN": nn_options} predictor = task.fit( train_data=task.Dataset(df=df_train), # tuning_data=task.Dataset(df=df_valid), label="label", output_directory=output_dir, time_limits=args.walltime, hyperparameter_tune=False, auto_stack=True, excluded_model_types=excluded_model_types, dist_ip_addrs=ips, ) else: _, (X_test, y_test) = load_data(use_test=True) print("Convert arrays to DataFrame...") df_test = convert_to_dataframe(X_test, y_test)
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2020/10/23 8:32 # @Author : iszhang # @Email : # @File : ag_main.py # @software: PyCharm from autogluon import TabularPrediction as task import sys sys.path.append('C:/Users/ThinkPad/PycharmProjects/TabNet&AutoGluon/utils') import utils.data_utils as data_utils # pd.set_option('display.max_columns', None) train_data = task.Dataset(file_path='../Data/5.Haberman/haberman.csv') label_column = 'status' dir = 'agModels-predictClass' # specifies folder where to store trained models # print(train_data.head(10)) # print(train_data.info()) # print(train_data.describe()) if __name__ == '__main__': # predictor = task.fit(train_data=train_data, label=label_column, output_directory=dir, time_limits=100) # results = predictor.fit_summary() # print("AutoGluon infers problem type is: ", predictor.problem_type) # print("AutoGluon identified the following types of features:") # print(predictor.feature_metadata) # # predictor.leaderboard(train_data, silent=True) # # print(results)
""" Example script for predicting columns of tables, demonstrating simple use-case """ from autogluon import TabularPrediction as task # Training time: train_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv') # can be local CSV file as well, returns Pandas DataFrame train_data = train_data.head(500) # subsample for faster demo print(train_data.head()) label_column = 'class' # specifies which column do we want to predict savedir = 'ag_models/' # where to save trained models predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir) # NOTE: Default settings above are intended to ensure reasonable runtime at the cost of accuracy. To maximize predictive accuracy, do this instead: predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, presets='best_quality', eval_metric=YOUR_METRIC_NAME) results = predictor.fit_summary() # Inference time: test_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv') # another Pandas DataFrame y_test = test_data[label_column] test_data = test_data.drop(labels=[label_column],axis=1) # delete labels from test data since we wouldn't have them in practice print(test_data.head()) predictor = task.load(savedir) # Unnecessary, we reload predictor just to demonstrate how to load previously-trained predictor from file y_pred = predictor.predict(test_data) perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)