def run(dataset: Dataset, config: TaskConfig):
    """Hand the encoded train/test data to ``exec.py`` through ``run_in_venv``.

    The encoded feature matrices are converted with ``unsparsify`` (array
    format) and passed through ``impute_array``; the encoded targets are
    converted with ``unsparsify`` only. The resulting arrays are bundled
    into a nested ``train``/``test`` dict and given to ``run_in_venv``
    together with a hook that post-processes the returned results.

    Returns whatever ``run_in_venv`` returns.
    """
    from frameworks.shared.caller import run_in_venv

    dense_features = unsparsify(dataset.train.X_enc, dataset.test.X_enc, fmt='array')
    X_train, X_test = impute_array(*dense_features)
    y_train, y_test = unsparsify(dataset.train.y_enc, dataset.test.y_enc, fmt='array')

    data = {
        'train': {'X': X_train, 'y': y_train},
        'test': {'X': X_test, 'y': y_test},
    }

    def process_results(results):
        """Replace a probabilities format marker with one-hot encoded predictions."""
        probabilities = results.probabilities
        # numpy load always returns an array, so a format marker shows up as
        # an array with an empty shape; anything else is left untouched.
        if probabilities is None or probabilities.shape:
            return results

        prob_format = probabilities.item()
        if prob_format != "predictions":
            raise ValueError(f"Unknown probabilities format: {prob_format}")

        # Fit the one-hot encoder on the label-encoded target values, then
        # derive the probabilities from the predictions.
        target_values_enc = dataset.target.label_encoder.transform(dataset.target.values)
        one_hot = Encoder('one-hot', target=False, encoded_type=float)
        results.probabilities = one_hot.fit(target_values_enc).transform(results.predictions)
        return results

    return run_in_venv(
        __file__,
        "exec.py",
        input_data=data,
        dataset=dataset,
        config=config,
        process_results=process_results,
    )
def run(dataset: Dataset, config: TaskConfig):
    """Fit a scikit-learn decision tree, save its predictions and report timings.

    A ``DecisionTreeClassifier`` is used when ``config.type`` equals
    ``'classification'``, a ``DecisionTreeRegressor`` otherwise. The model
    is built with ``config.seed`` as ``random_state`` plus every entry of
    ``config.framework_params``. Class probabilities are only computed for
    classification.

    Returns a dict with ``models_count``, ``training_duration`` and
    ``predict_duration``.
    """
    log.info(f"\n**** Decision Tree [sklearn v{sklearn.__version__}] ****\n")

    is_classification = config.type == 'classification'

    dense_features = unsparsify(dataset.train.X_enc, dataset.test.X_enc, fmt='array')
    X_train, X_test = impute_array(*dense_features)
    y_train, y_test = unsparsify(dataset.train.y_enc, dataset.test.y_enc, fmt='array')

    if is_classification:
        estimator = DecisionTreeClassifier
    else:
        estimator = DecisionTreeRegressor
    predictor = estimator(random_state=config.seed, **config.framework_params)

    with Timer() as training:
        predictor.fit(X_train, y_train)

    with Timer() as predict:
        predictions = predictor.predict(X_test)

    probabilities = None
    if is_classification:
        probabilities = predictor.predict_proba(X_test)

    save_predictions(
        dataset=dataset,
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=y_test,
        target_is_encoded=is_classification,
    )

    return {
        'models_count': 1,
        'training_duration': training.duration,
        'predict_duration': predict.duration,
    }
def run(dataset: Dataset, config: TaskConfig):
    """Fit a scikit-learn dummy estimator, save its predictions and report timings.

    Classification tasks use ``DummyClassifier(strategy='prior')``; any
    other task type uses ``DummyRegressor(strategy='median')``. The
    ``_encode`` entry of ``config.framework_params`` (default ``False``)
    selects the encoded variants of the data (``X_enc``/``y_enc``) instead
    of the plain ones, and is also forwarded to ``save_predictions`` as
    ``target_is_encoded``.

    Returns a dict with ``models_count``, ``training_duration`` and
    ``predict_duration``.
    """
    log.info("\n**** Constant predictor (sklearn dummy) ****\n")

    is_classification = config.type == 'classification'
    if is_classification:
        predictor = DummyClassifier(strategy='prior')
    else:
        predictor = DummyRegressor(strategy='median')

    encode = config.framework_params.get('_encode', False)
    if encode:
        selected = (
            dataset.train.X_enc,
            dataset.train.y_enc,
            dataset.test.X_enc,
            dataset.test.y_enc,
        )
    else:
        selected = (
            dataset.train.X,
            dataset.train.y,
            dataset.test.X,
            dataset.test.y,
        )
    # Each part is converted on its own, in the order train X, train y,
    # test X, test y.
    X_train, y_train, X_test, y_test = [
        unsparsify(part, fmt='array') for part in selected
    ]

    with Timer() as training:
        predictor.fit(X_train, y_train)

    with Timer() as predict:
        predictions = predictor.predict(X_test)

    probabilities = None
    if is_classification:
        probabilities = predictor.predict_proba(X_test)

    save_predictions(
        dataset=dataset,
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=y_test,
        target_is_encoded=encode,
    )

    return {
        'models_count': 1,
        'training_duration': training.duration,
        'predict_duration': predict.duration,
    }
def run(dataset: Dataset, config: TaskConfig):
    """Hand the encoded data and predictor type labels to ``exec.py`` via ``run_in_venv``.

    The encoded feature matrices are forwarded as they are; only the
    encoded targets go through ``unsparsify``. Each entry of
    ``dataset.predictors`` is labelled ``'Numerical'`` when its
    ``is_numerical()`` is truthy and ``'Categorical'`` otherwise.

    Returns whatever ``run_in_venv`` returns.
    """
    from frameworks.shared.caller import run_in_venv

    def type_label(predictor):
        """Return the type label passed along for one predictor."""
        if predictor.is_numerical():
            return 'Numerical'
        return 'Categorical'

    X_train = dataset.train.X_enc
    X_test = dataset.test.X_enc
    y_train, y_test = unsparsify(dataset.train.y_enc, dataset.test.y_enc)

    data = {
        'train': {'X': X_train, 'y': y_train},
        'test': {'X': X_test, 'y': y_test},
    }
    data['predictors_type'] = [type_label(p) for p in dataset.predictors]

    return run_in_venv(
        __file__,
        "exec.py",
        input_data=data,
        dataset=dataset,
        config=config,
    )