def run(dataset: Dataset, config: TaskConfig):
    """Fit a scikit-learn gradient boosting model and save its test predictions.

    A ``GradientBoostingClassifier`` is used when ``config.type`` is
    ``'classification'``, a ``GradientBoostingRegressor`` otherwise. The
    estimator is seeded with ``config.seed`` and receives every entry of
    ``config.framework_params`` as a keyword argument.

    Args:
        dataset: provides the encoded train/test features (``X_enc``) and the
            targets (``y``).
        config: task configuration (type, seed, framework params, output file).

    Returns:
        dict with ``models_count``, ``training_duration`` and
        ``predict_duration``.
    """
    log.info(
        f"\n**** Gradient Boosting [sklearn v{sklearn.__version__}] ****\n")
    save_metadata(config, version=sklearn.__version__)

    is_classification = config.type == 'classification'

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y, dataset.test.y

    estimator = GradientBoostingClassifier if is_classification else GradientBoostingRegressor
    predictor = estimator(random_state=config.seed, **config.framework_params)

    with Timer() as training:
        predictor.fit(X_train, y_train)

    # Prediction is timed separately so that the result also reports
    # `predict_duration`, like the other scikit-learn based runners do.
    with Timer() as predict:
        predictions = predictor.predict(X_test)
        probabilities = predictor.predict_proba(X_test) if is_classification else None

    save_predictions(dataset=dataset,
                     output_file=config.output_predictions_file,
                     probabilities=probabilities,
                     predictions=predictions,
                     truth=y_test)

    return dict(models_count=1,
                training_duration=training.duration,
                predict_duration=predict.duration)
def run(dataset: Dataset, config: TaskConfig):
    """Fit a scikit-learn dummy (constant) predictor and save its predictions.

    Classification uses ``DummyClassifier(strategy='prior')``, anything else
    uses ``DummyRegressor(strategy='median')``. When the ``encode`` framework
    parameter is truthy, the encoded features and targets are used and the
    saved predictions are flagged as encoded.

    Returns:
        dict with ``models_count``, ``training_duration`` and
        ``predict_duration``.
    """
    log.info("\n**** Constant predictor (sklearn dummy) ****\n")
    save_metadata(config, version=sklearn.__version__)

    is_classification = config.type == 'classification'
    if is_classification:
        predictor = DummyClassifier(strategy='prior')
    else:
        predictor = DummyRegressor(strategy='median')

    # `encode` is optional and defaults to False when not configured.
    framework_params = config.framework_params
    encode = False
    if 'encode' in framework_params:
        encode = framework_params['encode']

    if encode:
        X_train = dataset.train.X_enc
        y_train = dataset.train.y_enc
        X_test = dataset.test.X_enc
        y_test = dataset.test.y_enc
    else:
        X_train = dataset.train.X
        y_train = dataset.train.y
        X_test = dataset.test.X
        y_test = dataset.test.y

    with Timer() as training:
        predictor.fit(X_train, y_train)

    with Timer() as predict:
        predictions = predictor.predict(X_test)

    # Probabilities are only produced for classification tasks.
    probabilities = None
    if is_classification:
        probabilities = predictor.predict_proba(X_test)

    save_predictions(dataset=dataset,
                     output_file=config.output_predictions_file,
                     probabilities=probabilities,
                     predictions=predictions,
                     truth=y_test,
                     target_is_encoded=encode)

    return dict(models_count=1,
                training_duration=training.duration,
                predict_duration=predict.duration)
def run(dataset: Dataset, config: TaskConfig):
    """Fit a scikit-learn decision tree and save its test predictions.

    A ``DecisionTreeClassifier`` is used for classification tasks and a
    ``DecisionTreeRegressor`` otherwise; the estimator is seeded with
    ``config.seed`` and configured with ``config.framework_params``.

    Returns:
        dict with ``models_count``, ``training_duration`` and
        ``predict_duration``.
    """
    log.info(f"\n**** Decision Tree [sklearn v{sklearn.__version__}] ****\n")

    is_classification = config.type == 'classification'

    # Encoded features are converted with `unsparsify(..., fmt='array')`
    # before being passed through `impute_array`.
    dense_features = unsparsify(dataset.train.X_enc, dataset.test.X_enc, fmt='array')
    X_train, X_test = impute_array(*dense_features)
    y_train, y_test = unsparsify(dataset.train.y_enc, dataset.test.y_enc, fmt='array')

    if is_classification:
        estimator = DecisionTreeClassifier
    else:
        estimator = DecisionTreeRegressor
    predictor = estimator(random_state=config.seed, **config.framework_params)

    with Timer() as training:
        predictor.fit(X_train, y_train)

    with Timer() as predict:
        predictions = predictor.predict(X_test)

    # Probabilities are only produced for classification tasks.
    probabilities = None
    if is_classification:
        probabilities = predictor.predict_proba(X_test)

    # The encoded target is used as truth, hence `target_is_encoded` follows
    # the task type exactly as in the call below.
    save_predictions(dataset=dataset,
                     output_file=config.output_predictions_file,
                     probabilities=probabilities,
                     predictions=predictions,
                     truth=y_test,
                     target_is_encoded=is_classification)

    return dict(models_count=1,
                training_duration=training.duration,
                predict_duration=predict.duration)
def run(dataset: Dataset, config: TaskConfig):
    """Fit a TabNet model and save its test predictions.

    Features are imputed, then ordinal-encoded with an encoder fitted on the
    concatenation of the train and test features, so every value seen in the
    test split has a code. A ``TabNetClassifier`` is used for classification
    tasks and a ``TabNetRegressor`` otherwise; for regression the targets are
    cast to ``float32`` and reshaped to a single column.

    Args:
        dataset: provides the train/test features (``X``) and targets (``y``).
        config: task configuration (type, output predictions file).

    Returns:
        dict with ``models_count``, ``training_duration`` and
        ``predict_duration``.
    """
    log.info("****TabNet****")
    save_metadata(config)

    is_classification = config.type == 'classification'

    X_train, X_test = dataset.train.X, dataset.test.X
    X_train, X_test = impute(X_train, X_test)

    # The encoder only learns the set of feature values (no targets), so it is
    # fitted on both splits.
    X = np.concatenate((X_train, X_test), axis=0)
    enc = OrdinalEncoder()
    enc.fit(X)
    X_train = enc.transform(X_train)
    X_test = enc.transform(X_test)

    y_train, y_test = dataset.train.y, dataset.test.y

    estimator = TabNetClassifier if is_classification else TabNetRegressor
    predictor = estimator()  # you can change hyperparameters

    if not is_classification:
        y_train = np.reshape(y_train.astype(np.float32), (-1, 1))
        y_test = np.reshape(y_test.astype(np.float32), (-1, 1))

    with Timer() as training:
        # The test split must NOT be part of `eval_set`: pytorch-tabnet uses
        # the last evaluation set for early stopping and best-weights
        # selection, which would leak the held-out data into training.
        predictor.fit(X_train, y_train, eval_set=[(X_train, y_train)])

    with Timer() as predict:
        predictions = predictor.predict(X_test)
    probabilities = predictor.predict_proba(X_test) if is_classification else None

    save_predictions(dataset=dataset,
                     output_file=config.output_predictions_file,
                     probabilities=probabilities,
                     predictions=predictions,
                     truth=y_test)

    return dict(models_count=1,
                training_duration=training.duration,
                predict_duration=predict.duration)
def run_in_venv(caller_file, script_file: str, *args,
                input_data: Union[dict, ns], dataset: Dataset, config: TaskConfig,
                process_results=None, python_exec=None):
    """Run ``script_file`` in a subprocess using the framework's virtual env.

    Numpy arrays found in ``input_data`` are saved as ``.npy`` files in a
    temporary directory and replaced by their path; the resulting structure
    and ``config`` are sent as JSON on the subprocess' standard input. The
    subprocess result is read back from ``config.result_file``.

    Args:
        caller_file: file whose directory contains ``script_file`` and the
            ``venv`` folder.
        script_file: script path, relative to the directory of ``caller_file``.
        *args: extra arguments forwarded to ``run_cmd``.
        input_data: data to hand over to the subprocess.
        dataset: released once the input has been written; also passed to
            ``save_predictions``.
        config: task configuration; ``result_dir`` and ``result_file`` are
            overwritten to point into the temporary directory.
        process_results: optional callable applied to the loaded result.
        python_exec: interpreter command; defaults to the local venv python.

    Returns:
        dict with ``models_count``, ``training_duration``,
        ``predict_duration`` plus every attribute of ``res.others``.

    Raises:
        NoResultError: if no result could be loaded or if the result carries
            an ``error_message``.
    """
    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(venv_bin_path, 'python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TemporaryDirectory() as tmpdir:

        def make_path(k, v, parents=None):
            # Only numpy arrays are written to disk; other values pass through.
            if not isinstance(v, np.ndarray):
                return k, v
            path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
            if vector_keys.match(k):
                v = v.reshape(-1, 1)
            np.save(path, v, allow_pickle=True)
            return k, path

        ds = ns.walk(input_data, make_path)
        dataset.release()

        config.result_dir = tmpdir
        config.result_file = mktemp(dir=tmpdir)

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        root_dir = rconfig().root_dir
        proc_env = dict(
            PATH=os.pathsep.join([venv_bin_path, os.environ['PATH']]),
            PYTHONPATH=os.pathsep.join([root_dir]),
            AMLB_PATH=os.path.join(rconfig().root_dir, "amlb"),
        )
        with Timer() as proc_timer:
            output, err = run_cmd(cmd, *args,
                                  _input_str_=params,
                                  _live_output_=True,
                                  _error_level_=logging.DEBUG,
                                  _env_=proc_env)

        # Namespace answering None for unknown attributes; replaced by the
        # subprocess result when the result file exists.
        res = ns(lambda: None)
        if os.path.exists(config.result_file):
            res = json_load(config.result_file, as_namespace=True)

        log.debug("Result from subprocess:\n%s", res)

        if not res:
            raise NoResultError(f"Process crashed:\n{err}")
        if res.error_message is not None:
            raise NoResultError(res.error_message)

        # These entries hold paths to saved arrays (or None): load them back.
        for name in ['predictions', 'truth', 'probabilities']:
            stored = res[name]
            res[name] = None if stored is None else np.load(stored, allow_pickle=True)

        if callable(process_results):
            res = process_results(res)

        if res.output_file:
            flat_predictions = None
            if res.predictions is not None:
                flat_predictions = res.predictions.reshape(-1)
            flat_truth = None
            if res.truth is not None:
                flat_truth = res.truth.reshape(-1)
            save_predictions(dataset=dataset,
                             output_file=res.output_file,
                             predictions=flat_predictions,
                             truth=flat_truth,
                             probabilities=res.probabilities,
                             probabilities_labels=res.probabilities_labels,
                             target_is_encoded=res.target_is_encoded)

        # Fall back to defaults when the subprocess did not report a value.
        models_count = 1 if res.models_count is None else res.models_count
        if res.training_duration is None:
            training_duration = proc_timer.duration
        else:
            training_duration = res.training_duration

        return dict(models_count=models_count,
                    training_duration=training_duration,
                    predict_duration=res.predict_duration,
                    **res.others.__dict__)
def run(dataset: Dataset, config: TaskConfig):
    """Run AutoWEKA through its Java command line and save its predictions.

    Only classification tasks with one of the metrics ``acc``, ``auc`` or
    ``logloss`` are accepted. Framework parameters whose name starts with
    ``_`` are handled by this function; the others are forwarded to the
    AutoWEKA command line as ``-name value`` options.

    Returns:
        dict with ``training_duration``.

    Raises:
        ValueError: for non-classification tasks or unsupported metrics.
        NoResultError: if AutoWEKA did not write its prediction file.
    """
    log.info(f"\n**** AutoWEKA [v{config.framework_version}]****\n")
    save_metadata(config)

    if config.type != 'classification':
        raise ValueError('Regression is not supported.')

    # Mapping of benchmark metrics to Weka metrics
    weka_metrics = dict(acc='errorRate',
                        auc='areaUnderROC',
                        logloss='kBInformation')
    metric = weka_metrics.get(config.metric)
    if metric is None:
        raise ValueError(f"Performance metric {config.metric} not supported.")

    train_file = dataset.train.path
    test_file = dataset.test.path
    # Weka requires the target to be the last attribute
    if dataset.target.index != len(dataset.predictors):
        train_file = reorder_dataset(dataset.train.path, target_src=dataset.target.index)
        test_file = reorder_dataset(dataset.test.path, target_src=dataset.target.index)

    # Parameters prefixed with '_' are not forwarded to AutoWEKA.
    training_params = {}
    for key, value in config.framework_params.items():
        if not key.startswith('_'):
            training_params[key] = value

    parallelRuns = config.framework_params.get('_parallelRuns', config.cores)
    memLimit = config.framework_params.get('_memLimit', 'auto')
    if memLimit == 'auto':
        mem_per_run = math.ceil(config.max_mem_size_mb / parallelRuns)
        # never go below 1024: AutoWEKA default memLimit
        memLimit = max(min(config.max_mem_size_mb, mem_per_run), 1024)
    log.info("Using %sMB memory per run on %s parallel runs.", memLimit, parallelRuns)

    f = split_path(config.output_predictions_file)
    f.extension = '.weka_pred.csv'
    weka_file = path_from_split(f)

    cmd_root = "java -cp {here}/lib/autoweka/autoweka.jar weka.classifiers.meta.AutoWEKAClassifier ".format(
        here=dir_of(__file__))
    classifications = (
        '"weka.classifiers.evaluation.output.prediction.CSV -distribution -file \\\"{}\\\""'
        .format(weka_file))
    cmd_params = dict(
        t=f'"{train_file}"',
        T=f'"{test_file}"',
        memLimit=memLimit,
        classifications=classifications,
        timeLimit=int(config.max_runtime_seconds / 60),
        parallelRuns=parallelRuns,
        metric=metric,
        seed=config.seed % (1 << 16),  # weka accepts only int16 as seeds
        **training_params)
    cmd = cmd_root + ' '.join(f"-{k} {v}" for k, v in cmd_params.items())

    with Timer() as training:
        run_cmd(cmd, _live_output_=True)

    # if target values are not sorted alphabetically in the ARFF file, then class probabilities are returned in the original order
    # interestingly, other frameworks seem to always sort the target values first
    # that's why we need to specify the probabilities labels here: sorting+formatting is done in saving function
    probabilities_labels = dataset.target.values
    if not os.path.exists(weka_file):
        raise NoResultError("AutoWEKA failed producing any prediction.")

    probabilities = []
    predictions = []
    truth = []
    with open(weka_file, 'r') as pred_stream:
        # The first and the last line of the file are skipped.
        rows = pred_stream.readlines()[1:-1]
        for row in rows:
            inst, actual, predicted, error, *distribution = row.split(',')
            # Probability fields may carry a '*' marker and a trailing newline.
            pred_probabilities = []
            for pred_probability in distribution:
                pred_probabilities.append(
                    pred_probability.replace('*', '').replace('\n', ''))
            # `predicted` and `actual` are '<prefix>:<label>' pairs.
            _, pred = predicted.split(':')
            _, tru = actual.split(':')
            probabilities.append(pred_probabilities)
            predictions.append(pred)
            truth.append(tru)

    save_predictions(dataset=dataset,
                     output_file=config.output_predictions_file,
                     probabilities=probabilities,
                     predictions=predictions,
                     truth=truth,
                     probabilities_labels=probabilities_labels)

    return dict(training_duration=training.duration)
def run(dataset: Dataset, config: TaskConfig):
    """Train and evaluate a model with the ML.NET command line tool.

    The ``mlnet`` executable located in the ``lib`` folder next to this file
    is invoked twice: once to train on the CSV train split (evaluating on the
    CSV test split), once to predict on the test split. Predictions are read
    from the file the predict command's output is redirected to.

    Side effects: sets the ``DOTNET_ROOT``, ``MLNetCLIEnablePredict``,
    ``MLNET_MAX_THREAD`` and ``MODELBUILDER_AUTOML`` environment variables.

    Returns:
        dict with ``models_count``, ``training_duration`` and
        ``predict_duration``.

    Raises:
        ValueError: if the task type is neither classification nor regression.
        NoResultError: if training did not produce the ``.mbconfig`` file.
    """
    log.info(f"\n**** MLNet [v{config.framework_version}] ****\n")

    supported_tasks = ['classification', 'regression']
    if config.type not in supported_tasks:
        raise ValueError(f'{config.type} is not supported.')

    dir_path = os.path.dirname(os.path.realpath(__file__))
    DOTNET_INSTALL_DIR = os.path.join(dir_path, 'lib')
    os.environ['DOTNET_ROOT'] = DOTNET_INSTALL_DIR
    os.environ['MLNetCLIEnablePredict'] = 'True'
    os.environ['MLNET_MAX_THREAD'] = str(config.cores)
    mlnet = os.path.join(DOTNET_INSTALL_DIR, 'mlnet')
    train_time_in_seconds = config.max_runtime_seconds
    sub_command = config.type

    # set up MODELBUILDER_AUTOML
    MODELBUILDER_AUTOML = config.framework_params.get('automl_type', 'NNI')
    os.environ['MODELBUILDER_AUTOML'] = MODELBUILDER_AUTOML

    # Models and logs go to the benchmark output folders only when requested
    # as artifacts; otherwise they live in a throw-away temporary folder.
    artifacts = config.framework_params.get('_save_artifacts', [])
    tmpdir = tempfile.mkdtemp()
    tmp_output_folder = os.path.join(tmpdir, str(config.fold))
    if 'models' in artifacts:
        output_dir = output_subdir('models', config=config)
    else:
        output_dir = tmp_output_folder
    if 'logs' in artifacts:
        log_dir = output_subdir('logs', config=config)
    else:
        log_dir = tmp_output_folder
    log_path = os.path.join(log_dir, 'log.txt')

    try:
        label = dataset.target.index
        train_dataset_path = dataset.train.data_path('csv')
        test_dataset_path = dataset.test.data_path('csv')

        log.info(f'train dataset: {train_dataset_path}')
        log.info(f'test dataset: {test_dataset_path}')

        cmd = (
            f"{mlnet} {sub_command}"
            f" --dataset {train_dataset_path} --test-dataset {test_dataset_path} --train-time {train_time_in_seconds}"
            f" --label-col {label} --output {os.path.dirname(output_dir)} --name {config.fold}"
            f" --verbosity q --log-file-path {log_path}")

        with Timer() as training:
            run_cmd(cmd)

        train_result_json = os.path.join(output_dir, f'{config.fold}.mbconfig')
        if not os.path.exists(train_result_json):
            raise NoResultError("MLNet failed producing any prediction.")

        with open(train_result_json, 'r') as mbconfig_file:
            mb_config = json.load(mbconfig_file)

        model_path = os.path.join(output_dir, f"{config.fold}.zip")
        # keeping this in log dir as it contains useful error when prediction fails
        output_prediction_path = os.path.join(log_dir, "prediction.txt")
        models_count = len(mb_config['RunHistory']['Trials'])

        # predict
        predict_cmd = (
            f"{mlnet} predict --task-type {config.type}"
            f" --model {model_path} --dataset {test_dataset_path} --label-col {dataset.target.name} > {output_prediction_path}"
        )
        with Timer() as prediction:
            run_cmd(predict_cmd)

        if config.type == 'classification':
            # Labels are read as objects so they are not converted to numbers.
            prediction_df = pd.read_csv(output_prediction_path,
                                        dtype={'PredictedLabel': 'object'})
            # Every column but the last one is passed as class probabilities,
            # using the column names as probability labels.
            class_columns = prediction_df.columns.values[:-1]
            save_predictions(
                dataset=dataset,
                output_file=config.output_predictions_file,
                predictions=prediction_df['PredictedLabel'].values,
                truth=dataset.test.y,
                probabilities=prediction_df.values[:, :-1],
                probabilities_labels=list(class_columns),
            )
        elif config.type == 'regression':
            prediction_df = pd.read_csv(output_prediction_path)
            save_predictions(
                dataset=dataset,
                output_file=config.output_predictions_file,
                predictions=prediction_df['Score'].values,
                truth=dataset.test.y,
            )

        return dict(
            models_count=models_count,
            training_duration=training.duration,
            predict_duration=prediction.duration,
        )
    finally:
        # Requested artifacts are zipped in place and the zipped originals
        # removed; the temporary folder is always deleted.
        if 'logs' in artifacts:
            logs_zip = os.path.join(log_dir, "logs.zip")
            zip_path(log_dir, logs_zip)
            clean_dir(log_dir, filter_=lambda p: p != logs_zip)
        if 'models' in artifacts:
            models_zip = os.path.join(output_dir, "models.zip")
            zip_path(output_dir, models_zip)
            clean_dir(output_dir, filter_=lambda p: p != models_zip)

        shutil.rmtree(tmpdir, ignore_errors=True)
def run_in_venv(caller_file, script_file: str, *args,
                input_data: Union[dict, ns], dataset: Dataset, config: TaskConfig,
                options: Union[None, dict, ns] = None,
                process_results=None,
                python_exec=None):
    """Run ``script_file`` in a subprocess using the framework's virtual env.

    The input data is prepared by ``_make_input_dataset`` inside a temporary
    directory; the prepared dataset, ``config`` and ``options`` are sent as
    JSON on the subprocess' standard input. The subprocess result is read back
    from ``config.result_file``.

    Args:
        caller_file: file whose directory contains ``script_file`` and the
            virtual env.
        script_file: script path, relative to the directory of ``caller_file``.
        *args: extra arguments forwarded to ``run_cmd``.
        input_data: data to hand over to the subprocess.
        dataset: passed to ``_make_input_dataset`` and ``save_predictions``;
            also provides the truth values when the result has none.
        config: task configuration; ``result_dir`` and ``result_file`` are
            overwritten to point into the temporary directory.
        options: optional settings; its ``serialization`` entry configures
            data (de)serialization and its ``env`` entry supplies extra
            environment variables for the subprocess.
        process_results: optional callable applied to the loaded result.
        python_exec: interpreter command; defaults to the local venv python.

    Returns:
        dict with ``models_count``, ``training_duration``,
        ``predict_duration`` plus every attribute of ``res.others``.

    Raises:
        NoResultError: if no result could be loaded or if the result carries
            an ``error_message``.
    """
    here = dir_of(caller_file)
    if python_exec is None:  # use local virtual env by default
        python_exec = venv_python_exec(here)
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    options = ns.from_dict(options) if options else ns()
    ser_config = options['serialization']
    extra_env = options['env'] or ns()

    with TemporaryDirectory() as tmpdir:
        ds = _make_input_dataset(input_data, dataset, tmpdir, serialization=ser_config)

        config.result_dir = tmpdir
        config.result_file = mktemp(dir=tmpdir)

        params = json_dumps(dict(dataset=ds, config=config, options=options),
                            style='compact')
        log.debug("Params passed to subprocess:\n%s", params)

        # Subprocess monitoring is only enabled when 'sub_proc_memory' is
        # listed in the monitoring statistics.
        cmon = rconfig().monitoring
        monitor = None
        if 'sub_proc_memory' in cmon.statistics:
            monitor = dict(interval_seconds=cmon.interval_seconds,
                           verbosity=cmon.verbosity)

        trace_level = logging.TRACE if hasattr(logging, 'TRACE') else ''
        proc_env = dict(
            PATH=os.pathsep.join([venv_bin(here), os.environ['PATH']]),
            PYTHONPATH=os.pathsep.join([rconfig().root_dir]),
            AMLB_PATH=os.path.join(rconfig().root_dir, "amlb"),
            AMLB_LOG_TRACE=str(trace_level),
            # user-provided variables, all converted to strings
            **{k: str(v) for k, v in extra_env})

        with Timer() as proc_timer:
            output, err = run_cmd(cmd, *args,
                                  _input_str_=params,
                                  _live_output_=True,
                                  _error_level_=logging.DEBUG,
                                  _env_=proc_env,
                                  _monitor_=monitor)

        # Namespace answering None for unknown attributes; replaced by the
        # subprocess result when the result file exists.
        res = ns(lambda: None)
        if os.path.exists(config.result_file):
            res = json_load(config.result_file, as_namespace=True)

        log.debug("Result from subprocess:\n%s", res)

        if not res:
            raise NoResultError(f"Process crashed:\n{err}")
        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for name in ['predictions', 'truth', 'probabilities']:
            stored = res[name]
            res[name] = None if stored is None else deserialize_data(stored, config=ser_config)

        if callable(process_results):
            res = process_results(res)

        if res.output_file:
            predictions = as_vec(res.predictions)
            # Without truth in the result, fall back to the test target of the
            # dataset, encoded or not depending on `target_is_encoded`.
            if res.truth is not None:
                truth = as_vec(res.truth)
            elif res.target_is_encoded:
                truth = dataset.test.y_enc
            else:
                truth = dataset.test.y
            save_predictions(dataset=dataset,
                             output_file=res.output_file,
                             predictions=predictions,
                             truth=truth,
                             probabilities=res.probabilities,
                             probabilities_labels=res.probabilities_labels,
                             target_is_encoded=res.target_is_encoded)

        # Fall back to defaults when the subprocess did not report a value.
        models_count = 1 if res.models_count is None else res.models_count
        if res.training_duration is None:
            training_duration = proc_timer.duration
        else:
            training_duration = res.training_duration

        return dict(models_count=models_count,
                    training_duration=training_duration,
                    predict_duration=res.predict_duration,
                    **res.others.__dict__)