def save_predictions(model, test, dataset, config, predictions_file=None, preview=True):
    h2o_preds = model.predict(test).as_data_frame(use_pandas=False)
    preds = to_data_frame(h2o_preds[1:], columns=h2o_preds[0])
    y_pred = preds.iloc[:, 0]

    h2o_truth = test[:, dataset.target.index].as_data_frame(use_pandas=False, header=False)
    y_truth = to_data_frame(h2o_truth)

    predictions = y_pred.values
    probabilities = preds.iloc[:, 1:].values
    prob_labels = h2o_preds[0][1:]
    truth = y_truth.values

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file if predictions_file is None else predictions_file,
                             probabilities=probabilities,
                             probabilities_labels=prob_labels,
                             predictions=predictions,
                             truth=truth,
                             preview=preview)
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** dabl AnyClassifier ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        # dabl's AnyClassifier has no regression counterpart here
        raise ValueError('Regression is not supported.')

    X_train, X_test = pd.DataFrame(dataset.train.X).astype(str), pd.DataFrame(dataset.test.X).astype(str)
    y_train, y_test = pd.Series(dataset.train.y), pd.Series(dataset.test.y)

    predictor = AnyClassifier(**config.framework_params)

    with Timer() as training:
        predictor.fit(X_train, y_train)

    predictions = predictor.predict(X_test)
    probabilities = predictor.predict_proba(X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test)

    return dict(
        models_count=1,
        training_duration=training.duration
    )
def save_predictions(model, test, dataset, config, predictions_file=None, preview=True):
    h2o_preds = model.predict(test).as_data_frame(use_pandas=False)
    preds = to_data_frame(h2o_preds[1:], columns=h2o_preds[0])
    y_pred = preds.iloc[:, 0]

    h2o_truth = test[:, dataset.target.index].as_data_frame(use_pandas=False, header=False)
    y_truth = to_data_frame(h2o_truth)

    predictions = y_pred.values
    probabilities = preds.iloc[:, 1:].values
    prob_labels = h2o_preds[0][1:]
    if all([re.fullmatch(r"p\d+", p) for p in prob_labels]):
        # for categories represented as numerical values, h2o prefixes the probability columns with 'p';
        # in this case, we let the app set the labels to avoid a mismatch
        prob_labels = None
    truth = y_truth.values

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file if predictions_file is None else predictions_file,
                             probabilities=probabilities,
                             probabilities_labels=prob_labels,
                             predictions=predictions,
                             truth=truth,
                             preview=preview)
def run(dataset: Dataset, config: TaskConfig):
    validate_config(config)
    tuner = NNITuner(config)
    if config.framework_params['limit_type'] == 'time':
        log.info("Tuning {} with NNI {} with a maximum time of {}s\n".format(
            config.framework_params['arch_type'], tuner.description, config.max_runtime_seconds))
    elif config.framework_params['limit_type'] == 'ntrials':
        log.info("Tuning {} with NNI {} with a maximum number of trials of {}\n".format(
            config.framework_params['arch_type'], tuner.description, config.framework_params['trial_limit']))
        log.info("Note: any time constraints are ignored.")

    probabilities, predictions, train_timer, y_test, intermediate_scores, intermediate_best_scores = run_experiment(
        dataset, config, tuner, log)

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test)

    scores_file = '/'.join(config.output_predictions_file.split('/')[:-3]) \
                  + '/scorelogs/' + config.output_predictions_file.split('/')[-1]
    assert len(intermediate_scores) == len(intermediate_best_scores)
    save_scores_to_file(intermediate_scores, intermediate_best_scores, scores_file)

    return dict(models_count=1, training_duration=train_timer.duration)
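# `save_scores_to_file` is called above but not defined in this excerpt. A minimal
# sketch of what such a helper could look like, assuming it simply writes each trial's
# score next to the best score seen so far as CSV rows (the real helper may differ):
def save_scores_to_file(intermediate_scores, intermediate_best_scores, scores_file):
    import csv
    import os
    os.makedirs(os.path.dirname(scores_file), exist_ok=True)
    with open(scores_file, 'w', newline='') as fh:
        writer = csv.writer(fh)
        writer.writerow(['trial', 'score', 'best_score'])
        for i, (score, best) in enumerate(zip(intermediate_scores, intermediate_best_scores)):
            writer.writerow([i, score, best])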
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Constant predictor (sklearn dummy) ****\n")

    is_classification = config.type == 'classification'
    predictor = DummyClassifier(strategy='prior') if is_classification else DummyRegressor(strategy='median')

    encode = config.framework_params['encode'] if 'encode' in config.framework_params else False

    X_train = dataset.train.X_enc if encode else dataset.train.X
    y_train = dataset.train.y_enc if encode else dataset.train.y
    X_test = dataset.test.X_enc if encode else dataset.test.X
    y_test = dataset.test.y_enc if encode else dataset.test.y

    with Timer() as training:
        predictor.fit(X_train, y_train)

    predictions = predictor.predict(X_test)
    probabilities = predictor.predict_proba(X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=encode)

    return dict(models_count=1, training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Gradient Boosting (sklearn %s) ****\n", sklearn.__version__)

    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y, dataset.test.y

    estimator = GradientBoostingClassifier if is_classification else GradientBoostingRegressor
    predictor = estimator(random_state=config.seed, **config.framework_params)

    with Timer() as training:
        predictor.fit(X_train, y_train)

    predictions = predictor.predict(X_test)
    probabilities = predictor.predict_proba(X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test)

    return dict(models_count=1, training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Oboe ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        # regression currently fails (as of 2019-02-26, still under development by the oboe team)
        raise ValueError('Regression is not yet supported (under development).')

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    n_cores = config.framework_params.get('_n_cores', config.cores)

    log.info('Running oboe with a maximum time of {}s on {} cores.'.format(config.max_runtime_seconds, n_cores))
    log.warning('We completely ignore the advice to optimize towards metric: {}.'.format(config.metric))

    aml = AutoLearner(p_type='classification' if is_classification else 'regression',
                      n_cores=n_cores,
                      runtime_limit=config.max_runtime_seconds,
                      **training_params)

    aml_models = lambda: [aml.ensemble, *aml.ensemble.base_learners] if len(aml.ensemble.base_learners) > 0 else []

    with Timer() as training:
        try:
            aml.fit(X_train, y_train)
        except IndexError as e:
            if len(aml_models()) == 0:  # incorrect handling of some IndexError in oboe if ensemble is empty
                raise NoResultError("Oboe could not produce any model in the requested time.") from e
            raise e

    predictions = aml.predict(X_test).reshape(len(X_test))

    if is_classification:
        target_values_enc = dataset.target.label_encoder.transform(dataset.target.values)
        probabilities = Encoder('one-hot', target=False, encoded_type=float).fit(target_values_enc).transform(predictions)
    else:
        probabilities = None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(
        models_count=len(aml_models()),
        training_duration=training.duration
    )
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Random Forest (sklearn) ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    n_jobs = config.framework_params.get('_n_jobs', config.cores)  # useful to disable multicore, regardless of the dataset config

    log.info("Running RandomForest with a maximum time of {}s on {} cores.".format(config.max_runtime_seconds, n_jobs))
    log.warning("We completely ignore the requirement to stay within the time limit.")
    log.warning("We completely ignore the advice to optimize towards metric: {}.".format(config.metric))

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    rf = estimator(n_jobs=n_jobs, random_state=config.seed, **training_params)

    with Timer() as training:
        rf.fit(X_train, y_train)

    predictions = rf.predict(X_test)
    probabilities = rf.predict_proba(X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(models_count=len(rf), training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig):
    if 'tuner_type' not in config.framework_params:
        raise RuntimeError('framework.yaml does not have a "tuner_type" field.')

    tuner = NNITuner(config)
    log.info("Tuning {} with NNI {} with a maximum time of {}s\n".format(
        config.framework_params['arch_type'], tuner.description, config.max_runtime_seconds))

    probabilities, predictions, train_timer, y_test = run_experiment(dataset, config, tuner, log)

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test)

    return dict(models_count=1, training_duration=train_timer.duration)
def run(dataset: Dataset, config: TaskConfig):
    with TmpDir() as tmpdir:
        ds = ns(
            train=ns(
                X_enc=os.path.join(tmpdir, 'train.X_enc'),
                y=os.path.join(tmpdir, 'train.y')
            ),
            test=ns(
                X_enc=os.path.join(tmpdir, 'test.X_enc'),
                y=os.path.join(tmpdir, 'test.y')
            )
        )
        write_csv(dataset.train.X_enc, ds.train.X_enc)
        write_csv(dataset.train.y.reshape(-1, 1), ds.train.y)
        write_csv(dataset.test.X_enc, ds.test.X_enc)
        write_csv(dataset.test.y.reshape(-1, 1), ds.test.y)

        dataset.release()
        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir
        params = json_dumps(dict(dataset=ds, config=config), style='compact')

        output, err = run_cmd('{python} {here}/exec_proc.py'.format(python=PYTHON, here=dir_of(__file__)),
                              _input_str_=params)

        out = io.StringIO(output)
        res = ns()
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        def load_data(path):
            return read_csv(path, as_data_frame=False, header=False)

        log.debug("Result from subprocess:\n%s", res)
        save_predictions_to_file(dataset=dataset,
                                 output_file=res.output_file,
                                 probabilities=load_data(res.probabilities) if res.probabilities is not None else None,
                                 predictions=load_data(res.predictions).squeeze(),
                                 truth=load_data(res.truth).squeeze(),
                                 target_is_encoded=res.target_is_encoded)
def run_in_venv(caller_file, script_file: str, *args,
                input_data: Union[dict, ns], dataset: Dataset, config: TaskConfig,
                process_results=None, python_exec=None):

    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(venv_bin_path, 'python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TmpDir() as tmpdir:

        def make_path(k, v, parents=None):
            if isinstance(v, np.ndarray):
                path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
                if vector_keys.match(k):
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()

        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir
        params = json_dumps(dict(dataset=ds, config=config), style='compact')

        with Timer() as proc_timer:
            output, err = run_cmd(cmd, *args,
                                  _input_str_=params,
                                  _live_output_=True,
                                  _error_level_=logging.DEBUG,
                                  _env_=dict(PATH=os.pathsep.join([venv_bin_path, os.environ['PATH']]),
                                             PYTHONPATH=os.pathsep.join([rconfig().root_dir, ]),
                                             AMLB_PATH=os.path.join(rconfig().root_dir, "amlb")),
                                  )

        out = io.StringIO(output)
        res = ns()
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = np.load(res[name], allow_pickle=True) if res[name] is not None else None

        log.debug("Result from subprocess:\n%s", res)
        if callable(process_results):
            res = process_results(res)

        if res.output_file:
            save_predictions_to_file(dataset=dataset,
                                     output_file=res.output_file,
                                     predictions=res.predictions.reshape(-1) if res.predictions is not None else None,
                                     truth=res.truth.reshape(-1) if res.truth is not None else None,
                                     probabilities=res.probabilities,
                                     probabilities_labels=res.probabilities_labels,
                                     target_is_encoded=res.target_is_encoded)

        return dict(
            models_count=res.models_count if res.models_count is not None else 1,
            training_duration=res.training_duration if res.training_duration is not None else proc_timer.duration,
            **res.others.__dict__
        )
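# Illustrative caller of `run_in_venv` from a framework wrapper; the payload keys,
# the "exec.py" script name, and the data fields below are assumptions for the sake
# of the example, not a verbatim excerpt.
def run(dataset: Dataset, config: TaskConfig):
    data = dict(
        train=dict(X=dataset.train.X, y=dataset.train.y),
        test=dict(X=dataset.test.X, y=dataset.test.y),
    )
    # numpy arrays in `data` are serialized to .npy files by `make_path` above,
    # then reloaded on the child-process side
    return run_in_venv(__file__, "exec.py",
                       input_data=data, dataset=dataset, config=config)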
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Hyperopt-sklearn ****\n")

    is_classification = config.type == 'classification'

    default = lambda: 0  # sentinel: fall back to hyperopt-sklearn's default loss for this metric
    metrics_to_loss_mapping = dict(
        acc=(default, False),  # lambda y, pred: 1.0 - accuracy_score(y, pred)
        auc=(lambda y, pred: 1.0 - roc_auc_score(y, pred), False),
        f1=(lambda y, pred: 1.0 - f1_score(y, pred), False),
        # logloss=(log_loss, True),
        mae=(mean_absolute_error, False),
        mse=(mean_squared_error, False),
        msle=(mean_squared_log_error, False),
        r2=(default, False),  # lambda y, pred: 1.0 - r2_score(y, pred)
    )
    loss_fn, continuous_loss_fn = metrics_to_loss_mapping[config.metric] \
        if config.metric in metrics_to_loss_mapping else (None, False)
    if loss_fn is None:
        log.warning("Performance metric %s not supported: defaulting to %s.",
                    config.metric, 'accuracy' if is_classification else 'r2')
    if loss_fn is default:
        loss_fn = None

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}

    log.warning("Ignoring cores constraint of %s cores.", config.cores)
    log.info("Running hyperopt-sklearn with a maximum time of %ss on %s cores, optimizing %s.",
             config.max_runtime_seconds, 'all', config.metric)

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    if is_classification:
        classifier = any_classifier('clf')
        regressor = None
    else:
        classifier = None
        regressor = any_regressor('rgr')

    estimator = HyperoptEstimator(classifier=classifier,
                                  regressor=regressor,
                                  algo=tpe.suggest,
                                  loss_fn=loss_fn,
                                  continuous_loss_fn=continuous_loss_fn,
                                  trial_timeout=config.max_runtime_seconds,
                                  seed=config.seed,
                                  **training_params)

    with InterruptTimeout(config.max_runtime_seconds * 4 / 3, sig=signal.SIGQUIT):
        with InterruptTimeout(config.max_runtime_seconds,
                              before_interrupt=ft.partial(kill_proc_tree, timeout=5, include_parent=False)):
            with Timer() as training:
                estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)

    if is_classification:
        target_values_enc = dataset.target.label_encoder.transform(dataset.target.values)
        probabilities = Encoder('one-hot', target=False, encoded_type=float).fit(target_values_enc).transform(predictions)
    else:
        probabilities = None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(models_count=len(estimator.trials), training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** TPOT ****\n")

    is_classification = config.type == 'classification'
    # Mapping of benchmark metrics to TPOT metrics
    metrics_mapping = dict(acc='accuracy',
                           auc='roc_auc',
                           f1='f1',
                           logloss='neg_log_loss',
                           mae='neg_mean_absolute_error',
                           mse='neg_mean_squared_error',
                           msle='neg_mean_squared_log_error',
                           r2='r2')
    scoring_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if scoring_metric is None:
        raise ValueError("Performance metric {} not supported.".format(config.metric))

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    n_jobs = config.framework_params.get('_n_jobs', config.cores)  # useful to disable multicore, regardless of the dataset config

    log.info('Running TPOT with a maximum time of %ss on %s cores, optimizing %s.',
             config.max_runtime_seconds, n_jobs, scoring_metric)
    runtime_min = (config.max_runtime_seconds / 60)

    estimator = TPOTClassifier if is_classification else TPOTRegressor
    tpot = estimator(n_jobs=n_jobs,
                     max_time_mins=runtime_min,
                     scoring=scoring_metric,
                     random_state=config.seed,
                     **training_params)

    with Timer() as training:
        tpot.fit(X_train, y_train)

    log.info('Predicting on the test set.')
    predictions = tpot.predict(X_test)
    try:
        probabilities = tpot.predict_proba(X_test) if is_classification else None
    except RuntimeError:
        # TPOT throws a RuntimeError if the optimized pipeline does not support `predict_proba`.
        target_values_enc = dataset.target.label_encoder.transform(dataset.target.values)
        probabilities = Encoder('one-hot', target=False, encoded_type=float).fit(target_values_enc).transform(predictions)

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=is_classification)

    save_artifacts(tpot, config)

    return dict(models_count=len(tpot.evaluated_individuals_), training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Tuned Random Forest (sklearn) ****\n")

    is_classification = config.type == 'classification'

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    tuning_params = config.framework_params.get('_tuning', training_params)
    n_jobs = config.framework_params.get('_n_jobs', config.cores)  # useful to disable multicore, regardless of the dataset config

    # Impute any missing data (can test using -t 146606)
    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    log.info("Running RandomForest with a maximum time of {}s on {} cores.".format(config.max_runtime_seconds, n_jobs))

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    metric = dict(auc='roc_auc', logloss='neg_log_loss', acc='accuracy')[config.metric]

    n_features = X_train.shape[1]
    default_value = max(1, int(math.sqrt(n_features)))
    below_default = pick_values_uniform(start=1, end=default_value, length=5 + 1)[:-1]  # 5 below
    above_default = pick_values_uniform(start=default_value, end=n_features, length=10 + 1 - len(below_default))[1:]  # 5 above

    # Mix up the order of `max_features` to try, so that a fair range is tried even if we have too little time
    # to try all possible values. Order: [sqrt(p), 1, p, random order for remaining values]
    # max_features_to_try = below_default[1:] + above_default[:-1]
    # max_features_values = ([default_value, 1, n_features]
    #                        + random.sample(max_features_to_try, k=len(max_features_to_try)))
    max_features_values = [default_value] + below_default + above_default

    # Define how much of the total time we spend 'optimizing' `max_features`
    # (the remainder is used for fitting the final model).
    safety_factor = 0.85
    with stopit.ThreadingTimeout(seconds=int(config.max_runtime_seconds * safety_factor)):
        log.info("Evaluating multiple values for `max_features`: %s.", max_features_values)
        max_feature_scores = []
        tuning_durations = []
        for i, max_features_value in enumerate(max_features_values):
            log.info("[{:2d}/{:2d}] Evaluating max_features={}".format(
                i + 1, len(max_features_values), max_features_value))
            imputation = Imputer()
            random_forest = estimator(n_jobs=n_jobs,
                                      random_state=config.seed,
                                      max_features=max_features_value,
                                      **tuning_params)
            pipeline = Pipeline(steps=[('preprocessing', imputation), ('learning', random_forest)])
            with Timer() as cv_scoring:
                try:
                    scores = cross_val_score(estimator=pipeline,
                                             X=dataset.train.X_enc,
                                             y=dataset.train.y_enc,
                                             scoring=metric,
                                             cv=5)
                    max_feature_scores.append((statistics.mean(scores), max_features_value))
                except stopit.utils.TimeoutException as toe:
                    log.error("Failed CV scoring for max_features=%s : Timeout", max_features_value)
                    tuning_durations.append((max_features_value, cv_scoring.duration))
                    raise toe
                except Exception as e:
                    log.error("Failed CV scoring for max_features=%s :\n%s", max_features_value, e)
                    log.debug("Exception:", exc_info=True)
            tuning_durations.append((max_features_value, cv_scoring.duration))

    log.info("Tuning scores:\n%s", sorted(max_feature_scores))
    log.info("Tuning durations:\n%s", sorted(tuning_durations))
    _, best_max_features_value = max(max_feature_scores) if len(max_feature_scores) > 0 else (math.nan, 'auto')
    log.info("Training final model with `max_features={}`.".format(best_max_features_value))
    rf = estimator(n_jobs=n_jobs,
                   random_state=config.seed,
                   max_features=best_max_features_value,
                   **training_params)
    with Timer() as training:
        rf.fit(X_train, y_train)

    predictions = rf.predict(X_test)
    probabilities = rf.predict_proba(X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(models_count=len(rf),
                training_duration=training.duration + sum(map(lambda t: t[1], tuning_durations)))
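# `pick_values_uniform` (used above to build the `max_features` candidate grid) is not
# defined in this excerpt. A plausible sketch, assuming it returns evenly spaced unique
# integers over [start, end]:
def pick_values_uniform(start: int, end: int, length: int):
    import numpy as np
    values = np.linspace(start, end, num=length)    # evenly spaced floats, endpoints included
    return sorted({int(round(v)) for v in values})  # de-duplicated, ascending integers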
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error',
                           auc='AUC',
                           logloss='logloss',
                           mae='mae',
                           mse='mse',
                           rmse='rmse',
                           rmsle='rmsle')
    sort_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric)

    try:
        training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
        nthreads = config.framework_params.get('_nthreads', config.cores)

        log.info("Starting H2O cluster with %s cores, %sMB memory.", nthreads, config.max_mem_size_mb)
        h2o.init(nthreads=nthreads,
                 min_mem_size=str(config.max_mem_size_mb) + "M",
                 max_mem_size=str(config.max_mem_size_mb) + "M",
                 log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold)))

        # Load train as an H2O Frame, but test as a Pandas DataFrame
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path)
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path)
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name, config.fold)
        log.debug("Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
                  config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        with Timer() as training:
            aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError("H2O could not produce any model in the requested time.")

        lb = aml.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())
        lbf = split_path(config.output_predictions_file)
        lbf.extension = '.leaderboard.csv'
        lbf = path_from_split(lbf)
        write_csv(lb, lbf)

        h2o_preds = aml.predict(test).as_data_frame(use_pandas=False)
        preds = to_data_frame(h2o_preds[1:], columns=h2o_preds[0])
        y_pred = preds.iloc[:, 0]

        h2o_truth = test[:, dataset.target.index].as_data_frame(use_pandas=False, header=False)
        y_truth = to_data_frame(h2o_truth)

        predictions = y_pred.values
        probabilities = preds.iloc[:, 1:].values
        truth = y_truth.values

        save_predictions_to_file(dataset=dataset,
                                 output_file=config.output_predictions_file,
                                 probabilities=probabilities,
                                 predictions=predictions,
                                 truth=truth)

        return dict(models_count=len(aml.leaderboard), training_duration=training.duration)

    finally:
        if h2o.connection():
            h2o.remove_all()
            h2o.connection().close()
            if h2o.connection().local_server:
                h2o.connection().local_server.shutdown()
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Random Forest (sklearn) Tuned with NNI EvolutionTuner ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X, dataset.test.X)
    y_train, y_test = dataset.train.y, dataset.test.y

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    # model = estimator(random_state=config.seed, **config.framework_params)

    best_score, best_params, best_model = None, None, None
    score_higher_better = True

    log.info("Tuning hyperparameters with NNI EvolutionTuner with a maximum time of {}s\n".format(
        config.max_runtime_seconds))

    tuner = EvolutionTuner()
    tuner.update_search_space(SEARCH_SPACE)

    start_time = time.time()
    param_idx = 0
    while True:
        try:
            cur_params = tuner.generate_parameters(param_idx)
            cur_model = estimator(random_state=config.seed,
                                  **cur_params,
                                  **config.framework_params)
            # Here score is the output of score() from the estimator
            cur_score = cross_val_score(cur_model, X_train, y_train)
            cur_score = sum(cur_score) / float(len(cur_score))
            if best_score is None or (score_higher_better and cur_score > best_score) or (
                    not score_higher_better and cur_score < best_score):
                best_score, best_params, best_model = cur_score, cur_params, cur_model

            log.info("Trial {}: \n{}\nScore: {}\n".format(param_idx, cur_params, cur_score))
            tuner.receive_trial_result(param_idx, cur_params, cur_score)
            param_idx += 1

            elapsed_time = time.time() - start_time
            if elapsed_time > config.max_runtime_seconds:
                break
        except Exception:
            break

    log.info("Tuning done, the best parameters are:\n{}\n".format(best_params))

    # retrain on the whole dataset
    with Timer() as training:
        best_model.fit(X_train, y_train)

    predictions = best_model.predict(X_test)
    probabilities = best_model.predict_proba(X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test)

    return dict(models_count=1, training_duration=training.duration)
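# `SEARCH_SPACE` above is the NNI search-space definition handed to the EvolutionTuner.
# An illustrative example for a random forest, written in NNI's standard
# `_type`/`_value` format (the actual space used by the benchmark may differ):
SEARCH_SPACE = {
    "n_estimators": {"_type": "randint", "_value": [8, 512]},
    "max_depth": {"_type": "choice", "_value": [4, 8, 16, 32]},
    "min_samples_leaf": {"_type": "choice", "_value": [1, 2, 4, 8]},
    "max_features": {"_type": "uniform", "_value": [0.1, 1.0]},
}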
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** AutoSklearn ****\n")
    warnings.simplefilter(action='ignore', category=FutureWarning)
    warnings.simplefilter(action='ignore', category=DeprecationWarning)

    is_classification = config.type == 'classification'

    # Mapping of benchmark metrics to autosklearn metrics
    metrics_mapping = dict(acc=metrics.accuracy,
                           auc=metrics.roc_auc,
                           f1=metrics.f1,
                           logloss=metrics.log_loss,
                           mae=metrics.mean_absolute_error,
                           mse=metrics.mean_squared_error,
                           r2=metrics.r2)
    perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    # Set resources based on datasize
    log.info("Running auto-sklearn with a maximum time of %ss on %s cores with %sMB, optimizing %s.",
             config.max_runtime_seconds, config.cores, config.max_mem_size_mb, perf_metric)
    log.info("Environment: %s", os.environ)

    X_train = dataset.train.X_enc
    y_train = dataset.train.y_enc
    # log.info("finite=%s", np.isfinite(X_train))
    predictors_type = ['Categorical' if p.is_categorical() else 'Numerical' for p in dataset.predictors]

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    n_jobs = config.framework_params.get('_n_jobs', config.cores)
    ml_memory_limit = config.framework_params.get('_ml_memory_limit', 'auto')
    ensemble_memory_limit = config.framework_params.get('_ensemble_memory_limit', 'auto')

    # when memory is large enough, we should have:
    # (cores - 1) * ml_memory_limit_mb + ensemble_memory_limit_mb = config.max_mem_size_mb
    total_memory_mb = system_memory_mb().total
    if ml_memory_limit == 'auto':
        ml_memory_limit = max(min(config.max_mem_size_mb, math.ceil(total_memory_mb / n_jobs)),
                              3072)  # 3072 MB is the autosklearn default
    if ensemble_memory_limit == 'auto':
        ensemble_memory_limit = max(math.ceil(ml_memory_limit - (total_memory_mb - config.max_mem_size_mb)),
                                    math.ceil(ml_memory_limit / 3),  # default proportions
                                    1024)  # 1024 MB is the autosklearn default
    log.info("Using %sMB memory per ML job and %sMB for ensemble job on a total of %s jobs.",
             ml_memory_limit, ensemble_memory_limit, n_jobs)

    log.warning("Using meta-learned initialization, which might be bad (leakage).")
    # TODO: do we need to set per_run_time_limit too?
    estimator = AutoSklearnClassifier if is_classification else AutoSklearnRegressor
    auto_sklearn = estimator(time_left_for_this_task=config.max_runtime_seconds,
                             n_jobs=n_jobs,
                             ml_memory_limit=ml_memory_limit,
                             ensemble_memory_limit=ensemble_memory_limit,
                             seed=config.seed,
                             **training_params)
    with Timer() as training:
        auto_sklearn.fit(X_train, y_train, metric=perf_metric, feat_type=predictors_type)

    # Convert output to strings for classification
    log.info("Predicting on the test set.")
    X_test = dataset.test.X_enc
    y_test = dataset.test.y_enc
    predictions = auto_sklearn.predict(X_test)
    probabilities = auto_sklearn.predict_proba(X_test) if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    save_artifacts(auto_sklearn, config)

    return dict(models_count=len(auto_sklearn.get_models_with_weights()),
                training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** AutoWEKA ****\n")

    is_classification = config.type == 'classification'
    if not is_classification:
        raise ValueError('Regression is not supported.')

    # Mapping of benchmark metrics to Weka metrics
    metrics_mapping = dict(acc='errorRate',
                           auc='areaUnderROC',
                           logloss='kBInformation')
    metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if metric is None:
        raise ValueError("Performance metric {} not supported.".format(config.metric))

    train_file = dataset.train.path
    test_file = dataset.test.path
    # Weka requires the target to be the last attribute
    if dataset.target.index != len(dataset.predictors):
        train_file = reorder_dataset(dataset.train.path, target_src=dataset.target.index)
        test_file = reorder_dataset(dataset.test.path, target_src=dataset.target.index)

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
    parallelRuns = config.framework_params.get('_parallelRuns', config.cores)

    memLimit = config.framework_params.get('_memLimit', 'auto')
    if memLimit == 'auto':
        memLimit = max(min(config.max_mem_size_mb, math.ceil(config.max_mem_size_mb / parallelRuns)),
                       1024)  # AutoWEKA default memLimit
    log.info("Using %sMB memory per run on %s parallel runs.", memLimit, parallelRuns)

    f = split_path(config.output_predictions_file)
    f.extension = '.weka_pred.csv'
    weka_file = path_from_split(f)

    cmd_root = "java -cp {here}/lib/autoweka/autoweka.jar weka.classifiers.meta.AutoWEKAClassifier ".format(
        here=dir_of(__file__))
    cmd_params = dict(
        t='"{}"'.format(train_file),
        T='"{}"'.format(test_file),
        memLimit=memLimit,
        classifications='"weka.classifiers.evaluation.output.prediction.CSV -distribution -file \\\"{}\\\""'.format(weka_file),
        timeLimit=int(config.max_runtime_seconds / 60),
        parallelRuns=parallelRuns,
        metric=metric,
        seed=config.seed % (1 << 16),  # weka accepts only int16 as seeds
        **training_params)
    cmd = cmd_root + ' '.join(["-{} {}".format(k, v) for k, v in cmd_params.items()])

    with Timer() as training:
        run_cmd(cmd, _live_output_=True)

    # if target values are not sorted alphabetically in the ARFF file, then class probabilities are returned in the original order;
    # interestingly, other frameworks seem to always sort the target values first.
    # That's why we need to specify the probabilities labels here: sorting+formatting is done in the saving function.
    probabilities_labels = dataset.target.values
    if not os.path.exists(weka_file):
        raise NoResultError("AutoWEKA failed to produce any predictions.")
    with open(weka_file, 'r') as weka_preds:
        probabilities = []
        predictions = []
        truth = []
        for line in weka_preds.readlines()[1:-1]:
            inst, actual, predicted, error, *distribution = line.split(',')
            pred_probabilities = [pred_probability.replace('*', '').replace('\n', '')
                                  for pred_probability in distribution]
            _, pred = predicted.split(':')
            _, tru = actual.split(':')
            probabilities.append(pred_probabilities)
            predictions.append(pred)
            truth.append(tru)

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=truth,
                             probabilities_labels=probabilities_labels)

    return dict(training_duration=training.duration)
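# For reference, the parsing loop above expects Weka's CSV prediction output: a header
# row, a trailing blank line, and rows where `actual`/`predicted` are "index:label" pairs
# and the predicted class's probability is marked with '*'. The sample row below is an
# assumed illustration of that format, not taken from a real run:
sample_row = "1,2:Iris-versicolor,2:Iris-versicolor,,0.02,*0.95,0.03\n"
inst, actual, predicted, error, *distribution = sample_row.split(',')
assert predicted.split(':')[1] == "Iris-versicolor"
assert [p.replace('*', '').replace('\n', '') for p in distribution] == ["0.02", "0.95", "0.03"]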