def run(dataset: Dataset, config: TaskConfig): log.info("\n**** Constant predictor (sklearn dummy) ****\n") save_metadata(config, version=sklearn.__version__) is_classification = config.type == 'classification' predictor = DummyClassifier( strategy='prior') if is_classification else DummyRegressor( strategy='median') encode = config.framework_params[ 'encode'] if 'encode' in config.framework_params else False X_train = dataset.train.X_enc if encode else dataset.train.X y_train = dataset.train.y_enc if encode else dataset.train.y X_test = dataset.test.X_enc if encode else dataset.test.X y_test = dataset.test.y_enc if encode else dataset.test.y with Timer() as training: predictor.fit(X_train, y_train) with Timer() as predict: predictions = predictor.predict(X_test) probabilities = predictor.predict_proba( X_test) if is_classification else None save_predictions(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=encode) return dict(models_count=1, training_duration=training.duration, predict_duration=predict.duration)
def run(dataset: Dataset, config: TaskConfig): log.info(f"\n**** Decision Tree [sklearn v{sklearn.__version__}] ****\n") is_classification = config.type == 'classification' X_train, X_test = impute_array(*unsparsify(dataset.train.X_enc, dataset.test.X_enc, fmt='array')) y_train, y_test = unsparsify(dataset.train.y_enc, dataset.test.y_enc, fmt='array') estimator = DecisionTreeClassifier if is_classification else DecisionTreeRegressor predictor = estimator(random_state=config.seed, **config.framework_params) with Timer() as training: predictor.fit(X_train, y_train) with Timer() as predict: predictions = predictor.predict(X_test) probabilities = predictor.predict_proba(X_test) if is_classification else None save_predictions(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=is_classification) return dict( models_count=1, training_duration=training.duration, predict_duration=predict.duration )
def run(dataset: Dataset, config: TaskConfig):
    # TODO: use rpy2 instead? not necessary here though, as the call is very simple
    log.info("\n**** Autoxgboost (R) ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        raise ValueError('Regression is not supported.')

    here = dir_of(__file__)
    with Timer() as training:
        run_cmd(r"""Rscript --vanilla -e "source('{script}'); run('{train}', '{test}', target.index = {target_index}, '{output}', {cores}, time.budget = {time_budget})" """.format(
            script=os.path.join(here, 'exec.R'),
            train=dataset.train.path,
            test=dataset.test.path,
            target_index=dataset.target.index + 1,
            output=config.output_predictions_file,
            cores=config.cores,
            time_budget=config.max_runtime_seconds
        ), _live_output_=True)

    log.info("Predictions saved to %s", config.output_predictions_file)

    return dict(training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig): log.info( f"\n**** Gradient Boosting [sklearn v{sklearn.__version__}] ****\n") save_metadata(config, version=sklearn.__version__) is_classification = config.type == 'classification' X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc) y_train, y_test = dataset.train.y, dataset.test.y estimator = GradientBoostingClassifier if is_classification else GradientBoostingRegressor predictor = estimator(random_state=config.seed, **config.framework_params) with Timer() as training: predictor.fit(X_train, y_train) predictions = predictor.predict(X_test) probabilities = predictor.predict_proba( X_test) if is_classification else None save_predictions(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test) return dict(models_count=1, training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** dabl AnyClassifier ****\n") is_classification = config.type == 'classification' print(dataset.train.X) X_train, X_test = pd.DataFrame(dataset.train.X).astype(str), pd.DataFrame(dataset.test.X).astype(str) y_train, y_test = pd.Series(dataset.train.y), pd.Series(dataset.test.y) estimator = AnyClassifier if is_classification else None predictor = estimator(**config.framework_params) with Timer() as training: predictor.fit(X_train, y_train) predictions = predictor.predict(X_test) probabilities = predictor.predict_proba(X_test) if is_classification else None save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test) return dict( models_count=1, training_duration=training.duration )
def run(dataset: Dataset, config: TaskConfig): log.info("****TabNet****") save_metadata(config) is_classification = config.type == 'classification' X_train, X_test = dataset.train.X, dataset.test.X X_train, X_test = impute(X_train, X_test) X = np.concatenate((X_train, X_test), axis=0) enc = OrdinalEncoder() enc.fit(X) X_train = enc.transform(X_train) X_test = enc.transform(X_test) y_train, y_test = dataset.train.y, dataset.test.y estimator = TabNetClassifier if is_classification else TabNetRegressor predictor = estimator() # you can change hyperparameters if not is_classification: y_train = np.reshape(y_train.astype(np.float32), (-1, 1)) y_test = np.reshape(y_test.astype(np.float32), (-1, 1)) with Timer() as training: predictor.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)]) with Timer() as predict: predictions = predictor.predict(X_test) probabilities = predictor.predict_proba( X_test) if is_classification else None save_predictions(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test) return dict(models_count=1, training_duration=training.duration, predict_duration=predict.duration)
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** Oboe ****\n") is_classification = config.type == 'classification' if not is_classification: # regression currently fails (as of 26.02.2019: still under development state by oboe team) raise ValueError('Regression is not yet supported (under development).') X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc) y_train, y_test = dataset.train.y_enc, dataset.test.y_enc training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')} n_cores = config.framework_params.get('_n_cores', config.cores) log.info('Running oboe with a maximum time of {}s on {} cores.'.format(config.max_runtime_seconds, n_cores)) log.warning('We completely ignore the advice to optimize towards metric: {}.'.format(config.metric)) aml = AutoLearner(p_type='classification' if is_classification else 'regression', n_cores=n_cores, runtime_limit=config.max_runtime_seconds, **training_params) aml_models = lambda: [aml.ensemble, *aml.ensemble.base_learners] if len(aml.ensemble.base_learners) > 0 else [] with Timer() as training: try: aml.fit(X_train, y_train) except IndexError as e: if len(aml_models()) == 0: # incorrect handling of some IndexError in oboe if ensemble is empty raise NoResultError("Oboe could not produce any model in the requested time.") from e raise e predictions = aml.predict(X_test).reshape(len(X_test)) if is_classification: target_values_enc = dataset.target.label_encoder.transform(dataset.target.values) probabilities = Encoder('one-hot', target=False, encoded_type=float).fit(target_values_enc).transform(predictions) else: probabilities = None save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=True) return dict( models_count=len(aml_models()), training_duration=training.duration )
def run_random_forest(dataset, config, tuner, log):
    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X, dataset.test.X)
    y_train, y_test = dataset.train.y, dataset.test.y

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    best_score, best_params, best_model = None, None, None
    score_higher_better = True

    tuner.update_search_space(SEARCH_SPACE)

    start_time = time.time()
    while True:
        try:
            param_idx, cur_params = tuner.generate_parameters()
            cur_model = estimator(random_state=config.seed, **cur_params)
            # Here score is the output of score() from the estimator
            cur_score = cross_val_score(cur_model, X_train, y_train)
            cur_score = sum(cur_score) / float(len(cur_score))
            if best_score is None or (score_higher_better and cur_score > best_score) \
                    or (not score_higher_better and cur_score < best_score):
                best_score, best_params, best_model = cur_score, cur_params, cur_model

            log.info("Trial {}: \n{}\nScore: {}\n".format(param_idx, cur_params, cur_score))
            tuner.receive_trial_result(param_idx, cur_params, cur_score)

            current_time = time.time()
            elapsed_time = current_time - start_time
            if elapsed_time > config.max_runtime_seconds:
                break
        except Exception:
            break

    # This line is required to fully terminate some advisors
    tuner.handle_terminate()
    log.info("Tuning done, the best parameters are:\n{}\n".format(best_params))

    # retrain on the whole dataset
    with Timer() as training:
        best_model.fit(X_train, y_train)
    predictions = best_model.predict(X_test)
    probabilities = best_model.predict_proba(X_test) if is_classification else None

    return probabilities, predictions, training, y_test
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** Random Forest (sklearn) ****\n") is_classification = config.type == 'classification' X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc) y_train, y_test = dataset.train.y_enc, dataset.test.y_enc training_params = { k: v for k, v in config.framework_params.items() if not k.startswith('_') } n_jobs = config.framework_params.get( '_n_jobs', config.cores ) # useful to disable multicore, regardless of the dataset config log.info( "Running RandomForest with a maximum time of {}s on {} cores.".format( config.max_runtime_seconds, n_jobs)) log.warning( "We completely ignore the requirement to stay within the time limit.") log.warning( "We completely ignore the advice to optimize towards metric: {}.". format(config.metric)) estimator = RandomForestClassifier if is_classification else RandomForestRegressor rf = estimator(n_jobs=n_jobs, random_state=config.seed, **training_params) with Timer() as training: rf.fit(X_train, y_train) predictions = rf.predict(X_test) probabilities = rf.predict_proba(X_test) if is_classification else None save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=True) return dict(models_count=len(rf), training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** Decision Tree (sklearn) ****\n") is_classification = config.type == 'classification' X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc) y_train, y_test = dataset.train.y, dataset.test.y estimator = DecisionTreeClassifier if is_classification else DecisionTreeRegressor predictor = estimator(random_state=config.seed, **config.framework_params) with Timer() as training: predictor.fit(X_train, y_train) predictions = predictor.predict(X_test) probabilities = predictor.predict_proba( X_test) if is_classification else None save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test) return dict(models_count=1, training_duration=training.duration)
def run_in_venv(caller_file, script_file: str, *args,
                input_data: Union[dict, ns], dataset: Dataset, config: TaskConfig,
                process_results=None, python_exec=None):

    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(venv_bin_path, 'python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TemporaryDirectory() as tmpdir:

        def make_path(k, v, parents=None):
            # serialize numpy arrays to .npy files and pass only their paths to the subprocess
            if isinstance(v, np.ndarray):
                path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
                if vector_keys.match(k):
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()  # release dataset resources before handing work to the subprocess

        config.result_dir = tmpdir
        config.result_file = mktemp(dir=tmpdir)

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(cmd, *args,
                                  _input_str_=params,
                                  _live_output_=True,
                                  _error_level_=logging.DEBUG,
                                  _env_=dict(
                                      PATH=os.pathsep.join([venv_bin_path, os.environ['PATH']]),
                                      PYTHONPATH=os.pathsep.join([rconfig().root_dir]),
                                      AMLB_PATH=os.path.join(rconfig().root_dir, "amlb")
                                  ))

        res = ns(lambda: None)
        if os.path.exists(config.result_file):
            res = json_load(config.result_file, as_namespace=True)

        log.debug("Result from subprocess:\n%s", res)
        if not res:
            raise NoResultError(f"Process crashed:\n{err}")
        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = np.load(res[name], allow_pickle=True) if res[name] is not None else None

        if callable(process_results):
            res = process_results(res)

        if res.output_file:
            save_predictions(dataset=dataset,
                             output_file=res.output_file,
                             predictions=res.predictions.reshape(-1) if res.predictions is not None else None,
                             truth=res.truth.reshape(-1) if res.truth is not None else None,
                             probabilities=res.probabilities,
                             probabilities_labels=res.probabilities_labels,
                             target_is_encoded=res.target_is_encoded)

        return dict(
            models_count=res.models_count if res.models_count is not None else 1,
            training_duration=res.training_duration if res.training_duration is not None else proc_timer.duration,
            predict_duration=res.predict_duration,
            **res.others.__dict__
        )
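
# --- Illustrative sketch (not part of the benchmark runner) ---
# The make_path/np.load pair above hands arrays to the subprocess by path rather than by value:
# each ndarray is serialized to a .npy file, and only the file path travels through the JSON
# payload. The round trip below uses only numpy and a temp dir; the names are made up.
def _npy_roundtrip_demo():
    import os
    import tempfile
    import numpy as np

    with tempfile.TemporaryDirectory() as tmpdir:
        predictions = np.array(['yes', 'no', 'yes'], dtype=object)  # object arrays need allow_pickle
        path = os.path.join(tmpdir, 'predictions.npy')
        np.save(path, predictions.reshape(-1, 1), allow_pickle=True)   # parent process side
        restored = np.load(path, allow_pickle=True).reshape(-1)        # subprocess side
        assert (restored == predictions).all()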
def run(dataset: Dataset, config: TaskConfig): log.info(f"\n**** AutoWEKA [v{config.framework_version}]****\n") save_metadata(config) is_classification = config.type == 'classification' if not is_classification: raise ValueError('Regression is not supported.') # Mapping of benchmark metrics to Weka metrics metrics_mapping = dict(acc='errorRate', auc='areaUnderROC', logloss='kBInformation') metric = metrics_mapping[ config.metric] if config.metric in metrics_mapping else None if metric is None: raise ValueError("Performance metric {} not supported.".format( config.metric)) train_file = dataset.train.path test_file = dataset.test.path # Weka to requires target as the last attribute if dataset.target.index != len(dataset.predictors): train_file = reorder_dataset(dataset.train.path, target_src=dataset.target.index) test_file = reorder_dataset(dataset.test.path, target_src=dataset.target.index) training_params = { k: v for k, v in config.framework_params.items() if not k.startswith('_') } parallelRuns = config.framework_params.get('_parallelRuns', config.cores) memLimit = config.framework_params.get('_memLimit', 'auto') if memLimit == 'auto': memLimit = max( min(config.max_mem_size_mb, math.ceil(config.max_mem_size_mb / parallelRuns)), 1024) # AutoWEKA default memLimit log.info("Using %sMB memory per run on %s parallel runs.", memLimit, parallelRuns) f = split_path(config.output_predictions_file) f.extension = '.weka_pred.csv' weka_file = path_from_split(f) cmd_root = "java -cp {here}/lib/autoweka/autoweka.jar weka.classifiers.meta.AutoWEKAClassifier ".format( here=dir_of(__file__)) cmd_params = dict( t='"{}"'.format(train_file), T='"{}"'.format(test_file), memLimit=memLimit, classifications= '"weka.classifiers.evaluation.output.prediction.CSV -distribution -file \\\"{}\\\""' .format(weka_file), timeLimit=int(config.max_runtime_seconds / 60), parallelRuns=parallelRuns, metric=metric, seed=config.seed % (1 << 16), # weka accepts only int16 as seeds **training_params) cmd = cmd_root + ' '.join( ["-{} {}".format(k, v) for k, v in cmd_params.items()]) with Timer() as training: run_cmd(cmd, _live_output_=True) # if target values are not sorted alphabetically in the ARFF file, then class probabilities are returned in the original order # interestingly, other frameworks seem to always sort the target values first # that's why we need to specify the probabilities labels here: sorting+formatting is done in saving function probabilities_labels = dataset.target.values if not os.path.exists(weka_file): raise NoResultError("AutoWEKA failed producing any prediction.") with open(weka_file, 'r') as weka_file: probabilities = [] predictions = [] truth = [] for line in weka_file.readlines()[1:-1]: inst, actual, predicted, error, *distribution = line.split(',') pred_probabilities = [ pred_probability.replace('*', '').replace('\n', '') for pred_probability in distribution ] _, pred = predicted.split(':') _, tru = actual.split(':') probabilities.append(pred_probabilities) predictions.append(pred) truth.append(tru) save_predictions(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=truth, probabilities_labels=probabilities_labels) return dict(training_duration=training.duration)
def run_in_venv(caller_file, script_file: str, *args,
                input_data: Union[dict, ns], dataset: Dataset, config: TaskConfig,
                process_results=None, python_exec=None):

    here = dir_of(caller_file)
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(here, 'venv/bin/python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TmpDir() as tmpdir:

        def make_path(k, v, parents=None):
            if isinstance(v, np.ndarray):
                path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
                if vector_keys.match(k):
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()

        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(cmd, *args,
                                  _input_str_=params,
                                  _live_output_=True,
                                  _env_=dict(PYTHONPATH=os.pathsep.join([
                                      rconfig().root_dir,
                                      os.path.join(rconfig().root_dir, "amlb"),
                                  ])))

        out = io.StringIO(output)
        res = ns()
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = np.load(res[name], allow_pickle=True) if res[name] is not None else None

        log.debug("Result from subprocess:\n%s", res)

        if callable(process_results):
            res = process_results(res)

        save_predictions_to_file(dataset=dataset,
                                 output_file=res.output_file,
                                 predictions=res.predictions.reshape(-1) if res.predictions is not None else None,
                                 truth=res.truth.reshape(-1) if res.truth is not None else None,
                                 probabilities=res.probabilities,
                                 target_is_encoded=res.target_is_encoded)

        return dict(
            models_count=res.models_count if res.models_count is not None else 1,
            training_duration=res.training_duration if res.training_duration is not None else proc_timer.duration
        )
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** H2O AutoML ****\n") # Mapping of benchmark metrics to H2O metrics metrics_mapping = dict(acc='mean_per_class_error', auc='AUC', logloss='logloss', mae='mae', mse='mse', rmse='rmse', rmsle='rmsle') sort_metric = metrics_mapping[ config.metric] if config.metric in metrics_mapping else None if sort_metric is None: # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric) try: training_params = { k: v for k, v in config.framework_params.items() if not k.startswith('_') } nthreads = config.framework_params.get('_nthreads', config.cores) log.info("Starting H2O cluster with %s cores, %sMB memory.", nthreads, config.max_mem_size_mb) h2o.init(nthreads=nthreads, min_mem_size=str(config.max_mem_size_mb) + "M", max_mem_size=str(config.max_mem_size_mb) + "M", log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold))) # Load train as an H2O Frame, but test as a Pandas DataFrame log.debug("Loading train data from %s.", dataset.train.path) train = h2o.import_file(dataset.train.path) # train.impute(method='mean') log.debug("Loading test data from %s.", dataset.test.path) test = h2o.import_file(dataset.test.path) # test.impute(method='mean') log.info("Running model on task %s, fold %s.", config.name, config.fold) log.debug( "Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.", config.max_runtime_seconds, config.cores, sort_metric) aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds, sort_metric=sort_metric, seed=config.seed, **training_params) with Timer() as training: aml.train(y=dataset.target.index, training_frame=train) if not aml.leader: raise NoResultError( "H2O could not produce any model in the requested time.") lb = aml.leaderboard.as_data_frame() log.debug("Leaderboard:\n%s", lb.to_string()) lbf = split_path(config.output_predictions_file) lbf.extension = '.leaderboard.csv' lbf = path_from_split(lbf) write_csv(lb, lbf) h2o_preds = aml.predict(test).as_data_frame(use_pandas=False) preds = to_data_frame(h2o_preds[1:], columns=h2o_preds[0]) y_pred = preds.iloc[:, 0] h2o_truth = test[:, dataset.target.index].as_data_frame(use_pandas=False, header=False) y_truth = to_data_frame(h2o_truth) predictions = y_pred.values probabilities = preds.iloc[:, 1:].values truth = y_truth.values save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=truth) return dict(models_count=len(aml.leaderboard), training_duration=training.duration) finally: if h2o.connection(): h2o.remove_all() h2o.connection().close() if h2o.connection().local_server: h2o.connection().local_server.shutdown()
def run_random_forest(dataset, config, tuner, log): """ Using the given tuner, tune a random forest within the given time constraint. This function uses cross validation score as the feedback score to the tuner. The search space on which tuners search on is defined above empirically as a global variable. """ limit_type, trial_limit = config.framework_params['limit_type'], None if limit_type == 'ntrials': trial_limit = int(config.framework_params['trial_limit']) X_train, X_test = preprocess_random_forest(dataset, log) y_train, y_test = dataset.train.y, dataset.test.y is_classification = config.type == 'classification' estimator = RandomForestClassifier if is_classification else RandomForestRegressor best_score, best_params, best_model = None, None, None score_higher_better = True tuner.update_search_space(SEARCH_SPACE) start_time = time.time() trial_count = 0 intermediate_scores = [] intermediate_best_scores = [] # should be monotonically increasing while True: try: trial_count += 1 param_idx, cur_params = tuner.generate_parameters() train_params = cur_params.copy() if 'TRIAL_BUDGET' in cur_params: train_params.pop('TRIAL_BUDGET') if cur_params['max_leaf_nodes'] == 0: train_params.pop('max_leaf_nodes') if cur_params['max_depth'] == 0: train_params.pop('max_depth') log.info("Trial {}: \n{}\n".format(param_idx, cur_params)) cur_model = estimator(random_state=config.seed, **train_params) # Here score is the output of score() from the estimator cur_score = cross_val_score(cur_model, X_train, y_train) cur_score = sum(cur_score) / float(len(cur_score)) if np.isnan(cur_score): cur_score = 0 log.info("Score: {}\n".format(cur_score)) if best_score is None or (score_higher_better and cur_score > best_score) or (not score_higher_better and cur_score < best_score): best_score, best_params, best_model = cur_score, cur_params, cur_model intermediate_scores.append(cur_score) intermediate_best_scores.append(best_score) tuner.receive_trial_result(param_idx, cur_params, cur_score) if limit_type == 'time': current_time = time.time() elapsed_time = current_time - start_time if elapsed_time >= config.max_runtime_seconds: break elif limit_type == 'ntrials': if trial_count >= trial_limit: break except: break # This line is required to fully terminate some advisors tuner.handle_terminate() log.info("Tuning done, the best parameters are:\n{}\n".format(best_params)) # retrain on the whole dataset with Timer() as training: best_model.fit(X_train, y_train) predictions = best_model.predict(X_test) probabilities = best_model.predict_proba(X_test) if is_classification else None return probabilities, predictions, training, y_test, intermediate_scores, intermediate_best_scores
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** H2O AutoML ****\n") # Mapping of benchmark metrics to H2O metrics metrics_mapping = dict(acc='mean_per_class_error', auc='AUC', logloss='logloss', mae='mae', mse='mse', r2='r2', rmse='rmse', rmsle='rmsle') sort_metric = metrics_mapping[ config.metric] if config.metric in metrics_mapping else None if sort_metric is None: # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric) try: training_params = { k: v for k, v in config.framework_params.items() if not k.startswith('_') } nthreads = config.framework_params.get('_nthreads', config.cores) jvm_memory = str( round(config.max_mem_size_mb * 2 / 3)) + "M" # leaving 1/3rd of available memory for XGBoost log.info("Starting H2O cluster with %s cores, %s memory.", nthreads, jvm_memory) max_port_range = 49151 min_port_range = 1024 rnd_port = os.getpid() % (max_port_range - min_port_range) + min_port_range port = config.framework_params.get('_port', rnd_port) h2o.init( nthreads=nthreads, port=port, min_mem_size=jvm_memory, max_mem_size=jvm_memory, strict_version_check=config.framework_params.get( '_strict_version_check', True) # log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold)) ) # Load train as an H2O Frame, but test as a Pandas DataFrame log.debug("Loading train data from %s.", dataset.train.path) train = h2o.import_file(dataset.train.path, destination_frame=frame_name('train', config)) # train.impute(method='mean') log.debug("Loading test data from %s.", dataset.test.path) test = h2o.import_file(dataset.test.path, destination_frame=frame_name('test', config)) # test.impute(method='mean') log.info("Running model on task %s, fold %s.", config.name, config.fold) log.debug( "Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.", config.max_runtime_seconds, config.cores, sort_metric) aml = H2OAutoML( max_runtime_secs=config.max_runtime_seconds, max_runtime_secs_per_model=round( config.max_runtime_seconds / 2), # to prevent timeout on ensembles sort_metric=sort_metric, seed=config.seed, **training_params) monitor = ( BackendMemoryMonitoring( frequency_seconds=rconfig().monitoring.frequency_seconds, check_on_exit=True, verbosity=rconfig().monitoring.verbosity) if config.framework_params.get('_monitor_backend', False) # else contextlib.nullcontext # Py 3.7+ only else contextlib.contextmanager(iter)([0])) with Timer() as training: with monitor: aml.train(y=dataset.target.index, training_frame=train) if not aml.leader: raise NoResultError( "H2O could not produce any model in the requested time.") save_predictions(aml, test, dataset=dataset, config=config) save_artifacts(aml, dataset=dataset, config=config) return dict(models_count=len(aml.leaderboard), training_duration=training.duration) finally: if h2o.connection(): # h2o.remove_all() h2o.connection().close() if h2o.connection().local_server: h2o.connection().local_server.shutdown()
def run(dataset: Dataset, config: TaskConfig): log.info( "\n**** Random Forest (sklearn) Tuned with NNI EvolutionTuner ****\n") is_classification = config.type == 'classification' X_train, X_test = impute(dataset.train.X, dataset.test.X) y_train, y_test = dataset.train.y, dataset.test.y estimator = RandomForestClassifier if is_classification else RandomForestRegressor # model = estimator(random_state=config.seed, **config.framework_params) best_score, best_params, best_model = None, None, None score_higher_better = True log.info( "Tuning hyperparameters with NNI EvolutionTuner with a maximum time of {}s\n" .format(config.max_runtime_seconds)) tuner = EvolutionTuner() tuner.update_search_space(SEARCH_SPACE) start_time = time.time() param_idx = 0 while True: try: cur_params = tuner.generate_parameters(param_idx) cur_model = estimator(random_state=config.seed, **cur_params, **config.framework_params) # Here score is the output of score() from the estimator cur_score = cross_val_score(cur_model, X_train, y_train) cur_score = sum(cur_score) / float(len(cur_score)) if best_score is None or (score_higher_better and cur_score > best_score) or ( not score_higher_better and cur_score < best_score): best_score, best_params, best_model = cur_score, cur_params, cur_model log.info("Trial {}: \n{}\nScore: {}\n".format( param_idx, cur_params, cur_score)) tuner.receive_trial_result(param_idx, cur_params, cur_score) param_idx += 1 current_time = time.time() elapsed_time = current_time - start_time if elapsed_time > config.max_runtime_seconds: break except: break log.info("Tuning done, the best parameters are:\n{}\n".format(best_params)) # retrain on the whole dataset with Timer() as training: best_model.fit(X_train, y_train) predictions = best_model.predict(X_test) probabilities = best_model.predict_proba( X_test) if is_classification else None save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test) return dict(models_count=1, training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** Hyperopt-sklearn ****\n") is_classification = config.type == 'classification' default = lambda: 0 metrics_to_loss_mapping = dict( acc=(default, False), # lambda y, pred: 1.0 - accuracy_score(y, pred) auc=(lambda y, pred: 1.0 - roc_auc_score(y, pred), False), f1=(lambda y, pred: 1.0 - f1_score(y, pred), False), # logloss=(log_loss, True), mae=(mean_absolute_error, False), mse=(mean_squared_error, False), msle=(mean_squared_log_error, False), r2=(default, False), # lambda y, pred: 1.0 - r2_score(y, pred) ) loss_fn, continuous_loss_fn = metrics_to_loss_mapping[ config.metric] if config.metric in metrics_to_loss_mapping else (None, False) if loss_fn is None: log.warning("Performance metric %s not supported: defaulting to %s.", config.metric, 'accuracy' if is_classification else 'r2') if loss_fn is default: loss_fn = None training_params = { k: v for k, v in config.framework_params.items() if not k.startswith('_') } log.warning("Ignoring cores constraint of %s cores.", config.cores) log.info( "Running hyperopt-sklearn with a maximum time of %ss on %s cores, optimizing %s.", config.max_runtime_seconds, 'all', config.metric) X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc) y_train, y_test = dataset.train.y_enc, dataset.test.y_enc if is_classification: classifier = any_classifier('clf') regressor = None else: classifier = None regressor = any_regressor('rgr') estimator = HyperoptEstimator(classifier=classifier, regressor=regressor, algo=tpe.suggest, loss_fn=loss_fn, continuous_loss_fn=continuous_loss_fn, trial_timeout=config.max_runtime_seconds, seed=config.seed, **training_params) with InterruptTimeout(config.max_runtime_seconds * 4 / 3, sig=signal.SIGQUIT): with InterruptTimeout(config.max_runtime_seconds, before_interrupt=ft.partial( kill_proc_tree, timeout=5, include_parent=False)): with Timer() as training: estimator.fit(X_train, y_train) predictions = estimator.predict(X_test) if is_classification: target_values_enc = dataset.target.label_encoder.transform( dataset.target.values) probabilities = Encoder( 'one-hot', target=False, encoded_type=float).fit(target_values_enc).transform(predictions) else: probabilities = None save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=True) return dict(models_count=len(estimator.trials), training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** TPOT ****\n") is_classification = config.type == 'classification' # Mapping of benchmark metrics to TPOT metrics metrics_mapping = dict(acc='accuracy', auc='roc_auc', f1='f1', logloss='neg_log_loss', mae='neg_mean_absolute_error', mse='neg_mean_squared_error', msle='neg_mean_squared_log_error', r2='r2') scoring_metric = metrics_mapping[ config.metric] if config.metric in metrics_mapping else None if scoring_metric is None: raise ValueError("Performance metric {} not supported.".format( config.metric)) X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc) y_train, y_test = dataset.train.y_enc, dataset.test.y_enc training_params = { k: v for k, v in config.framework_params.items() if not k.startswith('_') } n_jobs = config.framework_params.get( '_n_jobs', config.cores ) # useful to disable multicore, regardless of the dataset config log.info( 'Running TPOT with a maximum time of %ss on %s cores, optimizing %s.', config.max_runtime_seconds, n_jobs, scoring_metric) runtime_min = (config.max_runtime_seconds / 60) estimator = TPOTClassifier if is_classification else TPOTRegressor tpot = estimator(n_jobs=n_jobs, max_time_mins=runtime_min, scoring=scoring_metric, random_state=config.seed, **training_params) with Timer() as training: tpot.fit(X_train, y_train) log.info('Predicting on the test set.') predictions = tpot.predict(X_test) try: probabilities = tpot.predict_proba( X_test) if is_classification else None except RuntimeError: # TPOT throws a RuntimeError if the optimized pipeline does not support `predict_proba`. target_values_enc = dataset.target.label_encoder.transform( dataset.target.values) probabilities = Encoder( 'one-hot', target=False, encoded_type=float).fit(target_values_enc).transform(predictions) save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=is_classification) save_artifacts(tpot, config) return dict(models_count=len(tpot.evaluated_individuals_), training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** H2O AutoML ****\n") # Mapping of benchmark metrics to H2O metrics metrics_mapping = dict(acc='mean_per_class_error', auc='AUC', logloss='logloss', mae='mae', mse='mse', rmse='rmse', rmsle='rmsle') sort_metric = metrics_mapping[ config.metric] if config.metric in metrics_mapping else None if sort_metric is None: # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric) try: training_params = { k: v for k, v in config.framework_params.items() if not k.startswith('_') } nthreads = config.framework_params.get('_nthreads', config.cores) log.info("Starting H2O cluster with %s cores, %sMB memory.", nthreads, config.max_mem_size_mb) h2o.init( nthreads=nthreads, min_mem_size=str(config.max_mem_size_mb) + "M", max_mem_size=str(config.max_mem_size_mb) + "M", # log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold)) ) # Load train as an H2O Frame, but test as a Pandas DataFrame log.debug("Loading train data from %s.", dataset.train.path) train = h2o.import_file(dataset.train.path, destination_frame=frame_name('train', config)) # train.impute(method='mean') log.debug("Loading test data from %s.", dataset.test.path) test = h2o.import_file(dataset.test.path, destination_frame=frame_name('test', config)) # test.impute(method='mean') log.info("Running model on task %s, fold %s.", config.name, config.fold) log.debug( "Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.", config.max_runtime_seconds, config.cores, sort_metric) aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds, sort_metric=sort_metric, seed=config.seed, **training_params) with Timer() as training: aml.train(y=dataset.target.index, training_frame=train) if not aml.leader: raise NoResultError( "H2O could not produce any model in the requested time.") save_predictions(aml, test, dataset=dataset, config=config) save_artifacts(aml, dataset=dataset, config=config) return dict(models_count=len(aml.leaderboard), training_duration=training.duration) finally: if h2o.connection(): h2o.remove_all() h2o.connection().close() if h2o.connection().local_server: h2o.connection().local_server.shutdown()
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** AutoSklearn ****\n") warnings.simplefilter(action='ignore', category=FutureWarning) warnings.simplefilter(action='ignore', category=DeprecationWarning) is_classification = config.type == 'classification' # Mapping of benchmark metrics to autosklearn metrics metrics_mapping = dict(acc=metrics.accuracy, auc=metrics.roc_auc, f1=metrics.f1, logloss=metrics.log_loss, mae=metrics.mean_absolute_error, mse=metrics.mean_squared_error, r2=metrics.r2) perf_metric = metrics_mapping[ config.metric] if config.metric in metrics_mapping else None if perf_metric is None: # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping log.warning("Performance metric %s not supported.", config.metric) # Set resources based on datasize log.info( "Running auto-sklearn with a maximum time of %ss on %s cores with %sMB, optimizing %s.", config.max_runtime_seconds, config.cores, config.max_mem_size_mb, perf_metric) log.info("Environment: %s", os.environ) X_train = dataset.train.X_enc y_train = dataset.train.y_enc # log.info("finite=%s", np.isfinite(X_train)) predictors_type = [ 'Categorical' if p.is_categorical() else 'Numerical' for p in dataset.predictors ] training_params = { k: v for k, v in config.framework_params.items() if not k.startswith('_') } n_jobs = config.framework_params.get('_n_jobs', config.cores) ml_memory_limit = config.framework_params.get('_ml_memory_limit', 'auto') ensemble_memory_limit = config.framework_params.get( '_ensemble_memory_limit', 'auto') # when memory is large enough, we should have: # (cores - 1) * ml_memory_limit_mb + ensemble_memory_limit_mb = config.max_mem_size_mb total_memory_mb = system_memory_mb().total if ml_memory_limit == 'auto': ml_memory_limit = max( min(config.max_mem_size_mb, math.ceil(total_memory_mb / n_jobs)), 3072) # 3072 is autosklearn defaults if ensemble_memory_limit == 'auto': ensemble_memory_limit = max( math.ceil(ml_memory_limit - (total_memory_mb - config.max_mem_size_mb)), math.ceil(ml_memory_limit / 3), # default proportions 1024) # 1024 is autosklearn defaults log.info( "Using %sMB memory per ML job and %sMB for ensemble job on a total of %s jobs.", ml_memory_limit, ensemble_memory_limit, n_jobs) log.warning( "Using meta-learned initialization, which might be bad (leakage).") # TODO: do we need to set per_run_time_limit too? estimator = AutoSklearnClassifier if is_classification else AutoSklearnRegressor auto_sklearn = estimator( time_left_for_this_task=config.max_runtime_seconds, n_jobs=n_jobs, ml_memory_limit=ml_memory_limit, ensemble_memory_limit=ensemble_memory_limit, seed=config.seed, **training_params) with Timer() as training: auto_sklearn.fit(X_train, y_train, metric=perf_metric, feat_type=predictors_type) # Convert output to strings for classification log.info("Predicting on the test set.") X_test = dataset.test.X_enc y_test = dataset.test.y_enc predictions = auto_sklearn.predict(X_test) probabilities = auto_sklearn.predict_proba( X_test) if is_classification else None save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=True) save_artifacts(auto_sklearn, config) return dict(models_count=len(auto_sklearn.get_models_with_weights()), training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig): log.info(f"\n**** MLNet [v{config.framework_version}] ****\n") avaible_task_list = ['classification', 'regression'] if config.type not in avaible_task_list: raise ValueError(f'{config.type} is not supported.') dir_path = os.path.dirname(os.path.realpath(__file__)) DOTNET_INSTALL_DIR = os.path.join(dir_path, 'lib') os.environ['DOTNET_ROOT'] = DOTNET_INSTALL_DIR os.environ['MLNetCLIEnablePredict'] = 'True' os.environ['MLNET_MAX_THREAD'] = str(config.cores) mlnet = os.path.join(DOTNET_INSTALL_DIR, 'mlnet') train_time_in_seconds = config.max_runtime_seconds sub_command = config.type # set up MODELBUILDER_AUTOML MODELBUILDER_AUTOML = config.framework_params.get('automl_type', 'NNI') os.environ['MODELBUILDER_AUTOML'] = MODELBUILDER_AUTOML artifacts = config.framework_params.get('_save_artifacts', []) tmpdir = tempfile.mkdtemp() tmp_output_folder = os.path.join(tmpdir, str(config.fold)) output_dir = output_subdir( 'models', config=config) if 'models' in artifacts else tmp_output_folder log_dir = output_subdir( 'logs', config=config) if 'logs' in artifacts else tmp_output_folder log_path = os.path.join(log_dir, 'log.txt') try: label = dataset.target.index train_dataset_path = dataset.train.data_path('csv') test_dataset_path = dataset.test.data_path('csv') log.info(f'train dataset: {train_dataset_path}') log.info(f'test dataset: {test_dataset_path}') cmd = ( f"{mlnet} {sub_command}" f" --dataset {train_dataset_path} --test-dataset {test_dataset_path} --train-time {train_time_in_seconds}" f" --label-col {label} --output {os.path.dirname(output_dir)} --name {config.fold}" f" --verbosity q --log-file-path {log_path}") with Timer() as training: run_cmd(cmd) train_result_json = os.path.join(output_dir, '{}.mbconfig'.format(config.fold)) if not os.path.exists(train_result_json): raise NoResultError("MLNet failed producing any prediction.") with open(train_result_json, 'r') as f: json_str = f.read() mb_config = json.loads(json_str) model_path = os.path.join(output_dir, f"{config.fold}.zip") output_prediction_path = os.path.join( log_dir, "prediction.txt" ) # keeping this in log dir as it contains useful error when prediction fails models_count = len(mb_config['RunHistory']['Trials']) # predict predict_cmd = ( f"{mlnet} predict --task-type {config.type}" f" --model {model_path} --dataset {test_dataset_path} --label-col {dataset.target.name} > {output_prediction_path}" ) with Timer() as prediction: run_cmd(predict_cmd) if config.type == 'classification': prediction_df = pd.read_csv(output_prediction_path, dtype={'PredictedLabel': 'object'}) save_predictions( dataset=dataset, output_file=config.output_predictions_file, predictions=prediction_df['PredictedLabel'].values, truth=dataset.test.y, probabilities=prediction_df.values[:, :-1], probabilities_labels=list( prediction_df.columns.values[:-1]), ) if config.type == 'regression': prediction_df = pd.read_csv(output_prediction_path) save_predictions( dataset=dataset, output_file=config.output_predictions_file, predictions=prediction_df['Score'].values, truth=dataset.test.y, ) return dict( models_count=models_count, training_duration=training.duration, predict_duration=prediction.duration, ) finally: if 'logs' in artifacts: logs_zip = os.path.join(log_dir, "logs.zip") zip_path(log_dir, logs_zip) clean_dir(log_dir, filter_=lambda p: p != logs_zip) if 'models' in artifacts: models_zip = os.path.join(output_dir, "models.zip") zip_path(output_dir, models_zip) clean_dir(output_dir, filter_=lambda p: p != models_zip) 
shutil.rmtree(tmpdir, ignore_errors=True)
def run(dataset: Dataset, config: TaskConfig): log.info("\n**** Tuned Random Forest (sklearn) ****\n") is_classification = config.type == 'classification' training_params = { k: v for k, v in config.framework_params.items() if not k.startswith('_') } tuning_params = config.framework_params.get('_tuning', training_params) n_jobs = config.framework_params.get( '_n_jobs', config.cores ) # useful to disable multicore, regardless of the dataset config # Impute any missing data (can test using -t 146606) X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc) y_train, y_test = dataset.train.y_enc, dataset.test.y_enc log.info( "Running RandomForest with a maximum time of {}s on {} cores.".format( config.max_runtime_seconds, n_jobs)) estimator = RandomForestClassifier if is_classification else RandomForestRegressor metric = dict(auc='roc_auc', logloss='neg_log_loss', acc='accuracy')[config.metric] n_features = X_train.shape[1] default_value = max(1, int(math.sqrt(n_features))) below_default = pick_values_uniform(start=1, end=default_value, length=5 + 1)[:-1] # 5 below above_default = pick_values_uniform(start=default_value, end=n_features, length=10 + 1 - len(below_default))[1:] # 5 above # Mix up the order of `max_features` to try, so that a fair range is tried even if we have too little time # to try all possible values. Order: [sqrt(p), 1, p, random order for remaining values] # max_features_to_try = below_default[1:] + above_default[:-1] # max_features_values = ([default_value, 1, n_features] # + random.sample(max_features_to_try, k=len(max_features_to_try))) max_features_values = [default_value] + below_default + above_default # Define up to how much of total time we spend 'optimizing' `max_features`. # (the remainder if used for fitting the final model). 
safety_factor = 0.85 with stopit.ThreadingTimeout(seconds=int(config.max_runtime_seconds * safety_factor)): log.info("Evaluating multiple values for `max_features`: %s.", max_features_values) max_feature_scores = [] tuning_durations = [] for i, max_features_value in enumerate(max_features_values): log.info("[{:2d}/{:2d}] Evaluating max_features={}".format( i + 1, len(max_features_values), max_features_value)) imputation = Imputer() random_forest = estimator(n_jobs=n_jobs, random_state=config.seed, max_features=max_features_value, **tuning_params) pipeline = Pipeline(steps=[('preprocessing', imputation), ('learning', random_forest)]) with Timer() as cv_scoring: try: scores = cross_val_score(estimator=pipeline, X=dataset.train.X_enc, y=dataset.train.y_enc, scoring=metric, cv=5) max_feature_scores.append( (statistics.mean(scores), max_features_value)) except stopit.utils.TimeoutException as toe: log.error( "Failed CV scoring for max_features=%s : Timeout", max_features_value) tuning_durations.append( (max_features_value, cv_scoring.duration)) raise toe except Exception as e: log.error("Failed CV scoring for max_features=%s :\n%s", max_features_value, e) log.debug("Exception:", exc_info=True) tuning_durations.append((max_features_value, cv_scoring.duration)) log.info("Tuning scores:\n%s", sorted(max_feature_scores)) log.info("Tuning durations:\n%s", sorted(tuning_durations)) _, best_max_features_value = max( max_feature_scores) if len(max_feature_scores) > 0 else (math.nan, 'auto') log.info("Training final model with `max_features={}`.".format( best_max_features_value)) rf = estimator(n_jobs=n_jobs, random_state=config.seed, max_features=best_max_features_value, **training_params) with Timer() as training: rf.fit(X_train, y_train) predictions = rf.predict(X_test) probabilities = rf.predict_proba(X_test) if is_classification else None save_predictions_to_file(dataset=dataset, output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, target_is_encoded=True) return dict(models_count=len(rf), training_duration=training.duration + sum(map(lambda t: t[1], tuning_durations)))
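
# --- Illustrative sketch (not part of the benchmark runner) ---
# `pick_values_uniform` is a benchmark helper whose implementation is not shown above. A plausible
# stand-in, inferred only from how it is called (inclusive endpoints, de-duplicated integer values,
# at most `length` of them), could look like this hypothetical function:
def _pick_values_uniform_sketch(start: int, end: int, length: int):
    import numpy as np
    values = np.linspace(start, end, num=length)       # evenly spaced, endpoints included
    return sorted(set(int(round(v)) for v in values))  # unique integers, ascending

# e.g. _pick_values_uniform_sketch(start=1, end=10, length=6) -> [1, 3, 5, 6, 8, 10]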