def fit(self, X, Y, sample_weight=None):
    if sample_weight is not None:
        Xt, Xtest, Yt, Ytest, sample_weightt, sample_weighttest = \
            model_selection.train_test_split(X, Y, sample_weight, test_size=0.1, random_state=0)
    else:
        Xt, Xtest, Yt, Ytest = model_selection.train_test_split(X, Y, test_size=0.2, random_state=0)
        sample_weightt = None
        sample_weighttest = None

    self.clf.fit(Xt, Yt, sample_weightt)
    #Y1 = self.clf.predict(Xtest)
    logging.info("Doing scoring Xtest=%s Ytest=%s" % (str(Xtest.shape), str(Ytest.shape)))
    score1 = self.scorer(self.clf, Xtest, Ytest)

    should_stop_count = 0
    improvement_buffer = []
    for i in xrange(0, 1000):
        logging.info("IML training iteration %d (should_stop=%d)" % (i, should_stop_count))
        t1 = unix_time_millis()
        clf2 = self.model(self.params)
        t2 = unix_time_millis()
        clf2.fit(Xt, Yt, sample_weightt)
        t3 = unix_time_millis()
        self.merge(clf2)
        t4 = unix_time_millis()
        #Y2 = self.clf.predict(Xtest)
        score2 = self.scorer(self.clf, Xtest, Ytest)
        t5 = unix_time_millis()

        # Keep a sliding window of the last `improvement_buffer_size` score ratios
        self.last_increase = score2 / score1
        if len(improvement_buffer) >= improvement_buffer_size:
            improvement_buffer = improvement_buffer[1:]
        improvement_buffer.append(self.last_increase)
        cum_improvement = reduce(lambda cum, x: cum * x, improvement_buffer)

        logging.info("IML run done, score: %f -> %f last_inc=%.3f imp_buf=%s cum_imp=%.3f" %
                     (score1, score2, self.last_increase, improvement_buffer, cum_improvement))
        logging.info("  IML run timing: create=%f fit=%f merge=%f score=%f total=%f" %
                     (t2 - t1, t3 - t2, t4 - t3, t5 - t4, t5 - t1))

        #if not self.last_increase > step_improvement_min:
        #    should_stop_count = should_stop_count + 1
        #else:
        #    should_stop_count = 0
        #if i > nb_trees_per_steps and should_stop_count >= no_improvement_steps_threshold:
        #    break

        # Stop once a whole window of iterations no longer improves the score
        if i > min_steps and cum_improvement <= min_improvement_over_buffer:
            break

        #Y1 = Y2
        score1 = score2
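# --- Illustrative sketch (not part of the original module) -------------------
# The loop above stops once the product of the last `improvement_buffer_size`
# score ratios (each iteration's score divided by the previous one) falls to
# `min_improvement_over_buffer`, i.e. once a whole window of merges no longer
# improves the held-out score. A minimal, self-contained version of that
# sliding-window criterion (all names and constants below are hypothetical):

from collections import deque
from functools import reduce

def should_stop(scores, buffer_size=5, min_improvement=1.001):
    """scores: list of test scores, one per merge iteration."""
    buf = deque(maxlen=buffer_size)  # keeps only the last `buffer_size` ratios
    for prev, cur in zip(scores, scores[1:]):
        buf.append(cur / prev)
        cum_improvement = reduce(lambda cum, x: cum * x, buf)
        # A full window with no cumulative gain means further merges are not helping
        # (the full-window check plays the role of `min_steps` above)
        if len(buf) == buffer_size and cum_improvement <= min_improvement:
            return True
    return False

# should_stop([0.80, 0.84, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85])  -> True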
def main(exec_folder, output_dataset, keptInputColumns):
    start = unix_time_millis()
    listener = ProgressListener()

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    preprocessing_params = json.load(open(osp.join(exec_folder, "rpreprocessing_params.json")))
    modeling_params = json.load(open(osp.join(exec_folder, "rmodeling_params.json")))

    with listener.push_state(constants.STATE_LOADING_SRC):
        input_df = df_from_split_desc_no_normalization(split_desc, "full", preprocessing_params["per_feature"])
        logging.info("Loaded full df: shape=(%d,%d)" % input_df.shape)
        input_df_orig = input_df.copy()
        input_df = utils.normalize_dataframe(input_df, preprocessing_params["per_feature"])

    with listener.push_state("Collecting preprocessing data"):
        collector = ClusteringPreprocessingDataCollector(input_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params, exec_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with listener.push_state("Preprocessing data"):
        transformed_train = pipeline.fit_and_process(input_df)

    start_train = unix_time_millis()
    (clf, actual_params, cluster_labels, additional_columns) = clustering_fit(modeling_params, transformed_train)

    # if model has custom labels, use them
    try:
        cluster_names = clf.get_cluster_labels()
    except AttributeError:
        cluster_names = ["cluster_%s" % i for i in range(len(np.unique(cluster_labels)))]

    cl = pd.Series(data=cluster_labels, name="cluster_labels").map(lambda i: cluster_names[i])
    cl.index = transformed_train["TRAIN"].index

    final_df = pd.concat([input_df_orig.join(cl, how='left'), additional_columns], axis=1)
    if keptInputColumns is not None:
        final_df = final_df[keptInputColumns + ['cluster_labels']]
    if preprocessing_params["outliers"]["method"] == "CLUSTER":
        final_df['cluster_labels'].fillna(constants.CLUSTER_OUTLIERS, inplace=True)

    dataiku.Dataset(output_dataset).write_from_dataframe(final_df)

    end = unix_time_millis()
    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
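# --- Illustrative sketch (not part of the original module) -------------------
# The post-processing above maps integer cluster indices to display names with
# pandas, left-joins them back onto the original rows, and marks rows that the
# pipeline dropped (e.g. outliers) via fillna. A minimal standalone version of
# that pattern (the data and the "outliers" label are made up):

import pandas as pd

demo_input_df = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0]})
demo_cluster_labels = [0, 1, 0]                  # only 3 rows survived preprocessing
demo_cluster_names = ["cluster_0", "cluster_1"]

demo_cl = pd.Series(data=demo_cluster_labels, name="cluster_labels").map(lambda i: demo_cluster_names[i])
demo_cl.index = [0, 1, 3]                        # index of the rows kept by the pipeline

demo_final_df = demo_input_df.join(demo_cl, how='left')   # row 2 gets NaN
demo_final_df['cluster_labels'].fillna("outliers", inplace=True)
print(demo_final_df)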
def main(exec_folder):
    start = unix_time_millis()
    listener = ProgressListener()

    def update_fn():
        utils.write_running_traininfo(exec_folder, start, listener)

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    preprocessing_params = json.load(open(osp.join(exec_folder, "rpreprocessing_params.json")))
    modeling_params = json.load(open(osp.join(exec_folder, "rmodeling_params.json")))

    with listener.push_state(constants.STATE_LOADING_SRC):
        update_fn()
        train_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"])
        logging.info("Loaded full df: shape=(%d,%d)" % train_df.shape)

    with listener.push_state("Collecting preprocessing data"):
        update_fn()
        collector = ClusteringPreprocessingDataCollector(train_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params, exec_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with listener.push_state("Preprocessing data"):
        orig_index = train_df.index.copy()
        transformed_train = pipeline.fit_and_process(train_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    start_train = unix_time_millis()
    clustering_train_score_save(transformed_train, orig_index, preprocessing_params, modeling_params,
                                exec_folder, listener, update_fn, pipeline)

    end = unix_time_millis()
    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
def __init__(self, run_folder, modeling_params, validation_sequence, prediction_type, test_df_index,
             target_map, use_multi_gpus, base_model):
    self.run_folder = run_folder
    self.modeling_params = modeling_params
    self.validation_sequence = validation_sequence
    self.prediction_type = prediction_type
    self.test_df_index = test_df_index
    self.target_map = target_map
    self.use_multi_gpus = use_multi_gpus
    self.base_model = base_model
    self.epoch_start = None
    self.all_scorers = get_grid_scorers(self.modeling_params, self.prediction_type, self.target_map,
                                        custom_make_scorer=self._scorer_func)
    self.model_best_score = None

    # Share the name of the metric used to optimize the model.
    # The user can then retrieve it, for example to write his own callback.
    self.evaluation_metric = self.modeling_params['metrics']['evaluationMetric']
    set_variable("DKU_MODEL_METRIC", "Test {}".format(METRICS_NAMES[self.evaluation_metric]))
    set_variable("DKU_MODEL_METRIC_GREATER_IS_BETTER",
                 greater_is_better(self.evaluation_metric,
                                   self.modeling_params["metrics"].get("customEvaluationMetricGIB", True)))

    # Initialize model info
    self.model_training_info = {
        "startedAt": unix_time_millis(),
        "epochs": [],
        "metric": modeling_params["metrics"]["evaluationMetric"],
    }

    # We want to compute the metrics on the training data as well. To do it in a Keras way,
    # we retrieve, after each batch, the values of y and y_pred for this batch for the model at this
    # stage of the training, accumulate them, and then compute the score on all the values retrieved
    # during the epoch. This means that the result does not correspond exactly to the score of a
    # fixed model on the training data at the end of an epoch, but to the score of an evolving model.
    # Those values are stored in TensorFlow Variables in the model, so we need to tell TensorFlow
    # that we want to retrieve them.

    # Accumulators for the values of y and y_pred after each batch
    self.y_list = None
    self.y_pred_list = None

    # TensorFlow Variables that are placeholders for the values of y and y_pred
    self.var_y = tf.Variable(0., validate_shape=False)
    self.var_y_pred = tf.Variable(0., validate_shape=False)
def on_epoch_begin(self, epoch, logs=None):
    self.epoch_start = unix_time_millis()
    self.model_training_info["currentNumStepsTraining"] = 0
    self.model_training_info["currentNumStepsScoring"] = 0
    self.model_training_info["currentEpoch"] = epoch
    self._update_model_info()
    # Reinitialize the accumulators of y and y_pred at the beginning of each epoch
    self.y_list = []
    self.y_pred_list = []
def _update_epoch_graph(self, train_score, test_score, epoch):
    epoch_finish_time = unix_time_millis()
    new_point = {
        "time": epoch_finish_time - self.epoch_start,
        "index": epoch + 1,
        "trainScore": train_score,
        "testScore": test_score,
        "epoch": epoch
    }
    self.model_training_info["epochs"].append(new_point)
    self._update_model_info()
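# --- Illustrative sketch (not part of the original module) -------------------
# The callback above accumulates per-batch (y, y_pred) pairs and scores the
# whole epoch at once, so the "train score" reflects an evolving model rather
# than a fixed snapshot. A minimal, framework-free version of that pattern
# using plain numpy + scikit-learn (all names below are hypothetical):

import numpy as np
from sklearn.metrics import roc_auc_score

class EpochScoreAccumulator(object):
    def __init__(self):
        self.y_list = []
        self.y_pred_list = []

    def on_batch_end(self, y_batch, y_pred_batch):
        # Called once per batch: stash the labels and the current model's predictions
        self.y_list.append(np.asarray(y_batch))
        self.y_pred_list.append(np.asarray(y_pred_batch))

    def epoch_score(self):
        # Score once per epoch on everything accumulated so far; note that early
        # batches were predicted by an older version of the weights
        y = np.concatenate(self.y_list)
        y_pred = np.concatenate(self.y_pred_list)
        return roc_auc_score(y, y_pred)

# Usage: call on_batch_end(...) after each batch, epoch_score() at epoch end,
# then reset the two lists (as on_epoch_begin does above).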
def __init__(self, parallel, m_folder=None, n_splits=None, n_candidates=None, timeout=None,
             n_jobs=None, evaluation_metric=None, metric_sign=1):
    self.parallel = parallel
    self.m_folder = m_folder
    self.n_splits = n_splits
    self.n_candidates = n_candidates
    self._watching = self.m_folder is not None
    self.grid_search_summary = []
    # timeout is expressed in minutes
    self.end_time = time.time() + timeout * 60 if timeout is not None else None
    self.initial_grid_points = []
    self.initial_grid_point_ids = []
    self.n_jobs = n_jobs
    self.evaluation_metric = evaluation_metric
    self.metric_sign = metric_sign
    self.start_time = unix_time_millis()
    self.is_interrupted = False
    if self._watching:
        self.grid_folder = os.path.join(self.m_folder, 'grid')
        self.grid_tmp_folder = os.path.join(self.m_folder, 'grid.tmp')
        interrupt_optimization.set_interrupt_folder(self.m_folder)
        self.grid_search_file = os.path.join(self.m_folder, 'grid_search_done_py.json')
        self.grid_search_summary = dkujson.load_from_filepath(self.grid_search_file) \
            if os.path.exists(self.grid_search_file) else []
        self.initial_grid_point_ids = [x['grid_point_id'] for x in self.grid_search_summary]
        self.initial_grid_points = self.grid_search_summary[:]
        for grid_point_id in self.initial_grid_point_ids:
            logging.info("Using precomputed score for Grid point {}".format(grid_point_id))
    super(CVInterruptWatcherThread, self).__init__()
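# --- Illustrative sketch (not part of the original module) -------------------
# The watcher converts a timeout given in minutes into an absolute wall-clock
# deadline at construction time, so later checks only compare time.time()
# against it. A standalone version of that pattern (names are hypothetical):

import time

def make_deadline(timeout_minutes=None):
    """Return an absolute epoch-seconds deadline, or None for 'no timeout'."""
    return time.time() + timeout_minutes * 60 if timeout_minutes is not None else None

def deadline_reached(deadline):
    return deadline is not None and time.time() >= deadline

# deadline = make_deadline(30)          # interrupt the grid search after 30 minutes
# if deadline_reached(deadline): ...    # e.g. set is_interrupted = True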
def _dku_fit_and_score(estimator, X, y, scorer, train, test, verbose, is_interruptible, parameters,
                       cvwatcher, fit_params, error_score='raise', m_folder=None, split_id=None,
                       parameter_id=None, sample_weight=None, algo_supports_weight=True):
    if cvwatcher.is_interrupted and is_interruptible:
        return None

    current_thread = threading.current_thread()
    current_thread.name = "GS-%s" % current_thread.ident

    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        logging.info("Fit p=%s s=%s: %s %s" % (parameter_id, split_id, msg, (64 - len(msg)) * '.'))

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = unix_time_millis()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    fit_params = fit_params if fit_params is not None else {}

    # XGBoost early stopping
    if fit_params.get("early_stopping_rounds") is not None:
        if fit_params.get("eval_set") is None:
            # Log the train and test objectives, but optimize on the test set
            # (the last tuple of eval_set is the one used for early stopping)
            fit_params["eval_set"] = [(X_train, y_train), (X_test, y_test)]
        else:
            pass  # still keep the possibility to use a fixed eval_set

    if sample_weight is not None:
        w_train, _ = _safe_split(estimator, sample_weight, y, train)
        w_test, _ = _safe_split(estimator, sample_weight, y, test)
        if algo_supports_weight:
            # Fit with sample weights whenever they are enabled AND the algorithm supports them
            fit_params["sample_weight"] = np.array(w_train)

    # Adjust the length of the fit parameters to the training fold
    fit_params = dict([(k, _dku_index_param_value(X, v, train)) for k, v in fit_params.items()])

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)
    except Exception as e:
        # Note fit time as time until error
        fit_time = unix_time_millis() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        fit_time = unix_time_millis() - start_time
        if sample_weight is not None:
            # Score with sample weights whenever they are enabled, regardless of the support by the algorithm
            test_score = _dku_score(estimator, X_test, y_test, scorer, sample_weight=w_test, indices=test)
            train_score = _dku_score(estimator, X_train, y_train, scorer, sample_weight=w_train, indices=train)
        else:
            test_score = _dku_score(estimator, X_test, y_test, scorer, indices=test)
            train_score = _dku_score(estimator, X_train, y_train, scorer, indices=train)
        score_time = unix_time_millis() - start_time - fit_time

    if verbose > 1:
        end_msg = "%s (ft=%.1fs st=%.1fs sc=%s)" % (msg, fit_time / 1000, score_time / 1000, test_score)
        logging.info("Done p=%s s=%s: %s" % (parameter_id, split_id, end_msg))

    num_samples = _num_samples(X_test)
    best_iteration = getattr(estimator, 'best_iteration', None)

    ret = {
        "train_score": train_score,
        "test_score": test_score,
        "num_samples": num_samples,
        "fit_time": fit_time,
        "score_time": score_time,
        "time": fit_time + score_time,
        "parameters": parameters,
        "parameter_id": parameter_id,
        "grid_point_id": get_grid_point_id(parameters, split_id),
        "best_iteration": best_iteration,
        "done_at": unix_time_millis()
    }

    if m_folder is not None:
        tmp_file = os.path.join(m_folder, 'grid.tmp/grid_search_{}.{}.gridpoint'.format(parameter_id, split_id))
        dest_file = os.path.join(m_folder, 'grid/grid_search_{}.{}.gridpoint'.format(parameter_id, split_id))
        dkujson.dump_to_filepath(tmp_file, ret)
        os.rename(tmp_file, dest_file)

    return ret
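# --- Illustrative sketch (not part of the original module) -------------------
# Each grid point result above is first dumped into grid.tmp/ and then moved
# into grid/ with os.rename(). On POSIX, a rename within the same filesystem
# is atomic, so a process watching grid/ only ever sees complete JSON files.
# A minimal standalone version of that write pattern (paths are hypothetical,
# and both subfolders are assumed to exist):

import json
import os

def write_result_atomically(folder, name, payload):
    tmp_file = os.path.join(folder, "grid.tmp", name)
    dest_file = os.path.join(folder, "grid", name)
    with open(tmp_file, "w") as f:
        json.dump(payload, f)
    os.rename(tmp_file, dest_file)  # atomic on the same filesystem

# write_result_atomically("/tmp/model", "grid_search_0.0.gridpoint",
#                         {"train_score": 0.91, "test_score": 0.88})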
def main(exec_folder, selection_state_folder, operation_mode):
    """The whole execution of the saved model train takes place in a single folder?"""
    start = unix_time_millis()
    start_train = start
    listener = ProgressListener()

    def update_fn():
        utils.write_running_traininfo(exec_folder, start, listener)

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    core_params = json.load(open(osp.join(exec_folder, "core_params.json")))
    preprocessing_params = json.load(open(osp.join(exec_folder, "rpreprocessing_params.json")))

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    calibrate_proba = core_params.get("calibration", {}).get("calibrationMethod", None) in ["SIGMOID", "ISOTONIC"]

    modeling_params = json.load(open(osp.join(exec_folder, "rmodeling_params.json")))

    # For the KERAS backend, special features must be tagged, because they are only handled
    # by the process function, not by fit_and_process
    if modeling_params["algorithm"] == "KERAS_CODE":
        tag_special_features(preprocessing_params['per_feature'])

    def do_full_fit_and_save():
        """Fit on 100% of the data and save the clf and the out params"""
        with listener.push_state(constants.STATE_LOADING_TRAIN):
            update_fn()
            full_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                         core_params["prediction_type"])
            logging.info("Loaded FULL df: shape=(%d,%d)" % full_df.shape)

        with listener.push_state("Collecting preprocessing data"):
            update_fn()
            collector = ClusteringPreprocessingDataCollector(full_df, preprocessing_params)
            collector_data = collector.build()

        pipeline, preproc_handler = build_pipeline_and_handler(
            collector_data, core_params, exec_folder, preprocessing_params,
            selection_state_folder=selection_state_folder,
            allow_empty_mf=modeling_params["algorithm"] == "KERAS_CODE")  # TODO

        if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
            target_map = preproc_handler.target_map
        else:
            target_map = None

        with listener.push_state("Preprocessing full set"):
            preprocessor_fit_full_df = full_df
            # For the KERAS backend, we may need to subsample the input df to prevent memory errors
            if modeling_params["algorithm"] == "KERAS_CODE":
                need_subsampling = preprocessing_params["preprocessingFitSampleRatio"] < 1
                full_df_orig = full_df.copy()
                if need_subsampling:
                    preprocessor_fit_full_df = preprocessor_fit_full_df.sample(
                        frac=preprocessing_params["preprocessingFitSampleRatio"],
                        random_state=preprocessing_params["preprocessingFitSampleSeed"])
            transformed_full = pipeline.fit_and_process(preprocessor_fit_full_df)
            if with_sample_weight:
                assert transformed_full["weight"].values.min() > 0, "Sample weights must be positive"

        preproc_handler.save_data()
        preproc_handler.report(pipeline)

        if modeling_params["algorithm"] == "KERAS_CODE":
            modeling_set = {"run_folder": exec_folder, "listener": listener}

            def update_modeling_state():
                status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                      modeling_set["listener"])
                utils.write_model_status(modeling_set, status)

            empty_df = pd.DataFrame()
            return prediction_train_model_keras(transformed_full, full_df_orig, empty_df, pipeline,
                                                modeling_params, core_params,
                                                preprocessing_params["per_feature"], exec_folder,
                                                listener, update_modeling_state,
                                                preproc_handler.target_map,
                                                pipeline.generated_features_mapping)
        else:
            return fit_score_save(pipeline, target_map, transformed_full)
    def fit_score_save(pipeline, target_map, transformed_full):
        with listener.push_state(constants.STATE_FITTING):
            update_fn()
            if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
                (clf, out_params, prepared_X, iipd) = classification_fit(
                    modeling_params, split_desc, transformed_full, core_params["prediction_type"],
                    exec_folder, target_map=target_map, with_sample_weight=with_sample_weight,
                    with_class_weight=with_class_weight)
                if calibrate_proba:
                    method = core_params.get("calibration", {}).get("calibrationMethod").lower()
                    calibrated_clf = CalibratedClassifierCV(clf, cv="prefit", method=method)
                    test_X = transformed_full["TRAIN"]
                    test_X, is_sparse = prepare_multiframe(test_X, modeling_params)
                    test_y = transformed_full["target"].astype(int)
                    if with_sample_weight:
                        test_weight = transformed_full["weight"].astype(float)
                        calibrated_clf.fit(test_X, test_y, sample_weight=test_weight)
                    else:
                        calibrated_clf.fit(test_X, test_y)
                    clf = calibrated_clf
            else:
                (clf, out_params, prepared_X, iipd) = regression_fit_single(
                    modeling_params, split_desc, transformed_full, exec_folder,
                    with_sample_weight=with_sample_weight)

        with listener.push_state(constants.STATE_SAVING):
            save_prediction_model(clf, out_params, listener, update_fn, exec_folder)

        with listener.push_state(constants.STATE_SCORING):
            train_X = transformed_full["TRAIN"]
            train_y = transformed_full["target"]
            if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
                ClassificationModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                   exec_folder, prepared_X, iipd, calibrate_proba).score()
                BinaryModelSerializer(train_X.columns(), clf, modeling_params, exec_folder,
                                      target_map, calibrate_proba).serialize()
            elif core_params["prediction_type"] == constants.MULTICLASS:
                ClassificationModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                   exec_folder, prepared_X, iipd, calibrate_proba).score()
                MulticlassModelSerializer(train_X.columns(), clf, modeling_params, exec_folder,
                                          target_map, calibrate_proba).serialize()
            else:
                RegressionModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                               exec_folder, prepared_X, iipd).score()
                RegressionModelSerializer(train_X.columns(), clf, modeling_params, exec_folder).serialize()

        return out_params

    if operation_mode == "TRAIN_SPLITTED_ONLY":
        with listener.push_state(constants.STATE_LOADING_TRAIN):
            update_fn()
            train_df = df_from_split_desc(split_desc, "train", preprocessing_params["per_feature"],
                                          core_params["prediction_type"])
            logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

        with listener.push_state(constants.STATE_LOADING_TEST):
            update_fn()
            test_df = df_from_split_desc(split_desc, "test", preprocessing_params["per_feature"],
                                         core_params["prediction_type"])
            logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

        with listener.push_state("Collecting preprocessing data"):
            update_fn()
            collector = PredictionPreprocessingDataCollector(train_df, preprocessing_params)
            collector_data = collector.build()

        pipeline, preproc_handler = build_pipeline_and_handler(
            collector_data, core_params, exec_folder, preprocessing_params,
            selection_state_folder=selection_state_folder,
            allow_empty_mf=modeling_params["algorithm"] == "KERAS_CODE")  # TODO

        if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
            target_map = preproc_handler.target_map
        else:
            target_map = None

        with listener.push_state("Preprocessing train set"):
            preprocessor_fit_df = train_df
            # For the KERAS backend, we may need to subsample the input df to prevent memory errors
            if modeling_params["algorithm"] == "KERAS_CODE":
                train_df_orig = train_df.copy()
                need_subsampling = preprocessing_params["preprocessingFitSampleRatio"] < 1
                if need_subsampling:
                    preprocessor_fit_df = preprocessor_fit_df.sample(
                        frac=preprocessing_params["preprocessingFitSampleRatio"],
                        random_state=preprocessing_params["preprocessingFitSampleSeed"])
            transformed_train = pipeline.fit_and_process(preprocessor_fit_df)
            if with_sample_weight:
                assert transformed_train["weight"].values.min() > 0, "Sample weights must be positive"

        preproc_handler.save_data()
        preproc_handler.report(pipeline)

        # For the KERAS backend, the test set cannot be processed directly, because we may have
        # special features that may not hold in memory
        if modeling_params["algorithm"] != "KERAS_CODE":
            with listener.push_state("Preprocessing test set"):
                test_df_index = test_df.index.copy()
                transformed_test = pipeline.process(test_df)
                if with_sample_weight:
                    assert transformed_test["weight"].values.min() > 0, "Sample weights must be positive"

        if modeling_params["algorithm"] == "PYTHON_ENSEMBLE":
            prediction_train_score_save_ensemble(train_df, test_df, core_params, split_desc,
                                                 modeling_params, exec_folder, listener, target_map,
                                                 update_fn, pipeline, with_sample_weight)
        elif modeling_params["algorithm"] == "KERAS_CODE":
            modeling_set = {"run_folder": exec_folder, "listener": listener}

            def update_modeling_state():
                status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                      modeling_set["listener"])
                utils.write_model_status(modeling_set, status)

            prediction_train_model_keras(transformed_train, train_df_orig, test_df, pipeline,
                                         modeling_params, core_params,
                                         preprocessing_params["per_feature"], exec_folder, listener,
                                         update_modeling_state, preproc_handler.target_map,
                                         pipeline.generated_features_mapping)
        else:
            prediction_train_score_save(transformed_train, transformed_test, test_df_index,
                                        core_params, split_desc, modeling_params, exec_folder,
                                        listener, target_map, update_fn, pipeline, exec_folder)

    elif operation_mode == "TRAIN_FULL_ONLY":
        # Not yet functional ...
        do_full_fit_and_save()

    elif operation_mode == "TRAIN_KFOLD":
        out_params = do_full_fit_and_save()
        full_df_clean = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                           core_params["prediction_type"])
        optimized_params = out_params["resolved"]
        logging.info("Regridifying post-train params: %s" % json.dumps(optimized_params))
        # Regridify the optimized params to a unary grid
        optimized_params_grid = intercom.backend_json_call("ml/prediction/regridify-to-pretrain", {
            "preTrain": json.dumps(modeling_params),
            "postTrain": json.dumps(optimized_params)
        })
        logging.info("Using unary grid params: %s" % json.dumps(optimized_params_grid))
        prediction_train_model_kfold(full_df_clean, core_params, split_desc, preprocessing_params,
                                     optimized_params_grid, exec_folder, exec_folder, listener,
                                     update_fn, with_sample_weight, with_class_weight,
                                     calibrate_proba)

    else:
        do_full_fit_and_save()

        # Do the split and scoring but don't save data
        with listener.push_state(constants.STATE_LOADING_TRAIN):
            update_fn()
            train_df = df_from_split_desc(split_desc, "train", preprocessing_params["per_feature"],
                                          core_params["prediction_type"])
            logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

        with listener.push_state(constants.STATE_LOADING_TEST):
            update_fn()
            test_df = df_from_split_desc(split_desc, "test", preprocessing_params["per_feature"],
                                         core_params["prediction_type"])
            logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

        with listener.push_state("Collecting preprocessing data"):
            update_fn()
            collector = PredictionPreprocessingDataCollector(train_df, preprocessing_params)
            collector_data = collector.build()

        pipeline, preproc_handler = build_pipeline_and_handler(
            collector_data, core_params, exec_folder, preprocessing_params,
            selection_state_folder=selection_state_folder,
            allow_empty_mf=modeling_params["algorithm"] == "KERAS_CODE")  # TODO

        if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
            target_map = preproc_handler.target_map
        else:
            target_map = None

        with listener.push_state("Preprocessing train set"):
            preprocessor_fit_df = train_df
            # For the KERAS backend, we may need to subsample the input df to prevent memory errors
            if modeling_params["algorithm"] == "KERAS_CODE":
                need_subsampling = preprocessing_params["preprocessingFitSampleRatio"] < 1
                train_df_orig = train_df.copy()
                if need_subsampling:
                    preprocessor_fit_df = preprocessor_fit_df.sample(
                        frac=preprocessing_params["preprocessingFitSampleRatio"],
                        random_state=preprocessing_params["preprocessingFitSampleSeed"])
            transformed_train = pipeline.fit_and_process(preprocessor_fit_df)

        # For the KERAS backend, the test set cannot be processed directly, because we may have
        # special features that may not hold in memory
        if modeling_params["algorithm"] != "KERAS_CODE":
            with listener.push_state("Preprocessing test set"):
                test_df_index = test_df.index.copy()
                transformed_test = pipeline.process(test_df)

        if modeling_params["algorithm"] == "KERAS_CODE":
            modeling_set = {"run_folder": exec_folder, "listener": listener}

            def update_modeling_state():
                status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                      modeling_set["listener"])
                utils.write_model_status(modeling_set, status)

            prediction_train_model_keras(transformed_train, train_df_orig, test_df, pipeline,
                                         modeling_params, core_params,
                                         preprocessing_params["per_feature"], exec_folder, listener,
                                         update_modeling_state, preproc_handler.target_map,
                                         pipeline.generated_features_mapping, save_model=False)
        else:
            with listener.push_state(constants.STATE_FITTING):
                update_fn()
                if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION,
                                                      constants.MULTICLASS):
                    (clf, out_params, prepared_X, iipd) = classification_fit(
                        modeling_params, split_desc, transformed_train,
                        core_params["prediction_type"], target_map=target_map,
                        with_sample_weight=with_sample_weight, with_class_weight=with_class_weight)
                else:
                    (clf, out_params, prepared_X, iipd) = regression_fit_single(
                        modeling_params, split_desc, transformed_train, exec_folder,
                        with_sample_weight=with_sample_weight)

            with listener.push_state(constants.STATE_SCORING):
                train_X = transformed_train["TRAIN"]
                train_y = transformed_train["target"]
                if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
                    ClassificationModelIntrinsicScorer(modeling_params, clf, train_X, train_y,
                                                       pipeline, exec_folder, prepared_X, iipd,
                                                       calibrate_proba).score()
                    BinaryModelSerializer(train_X.columns(), clf, modeling_params, exec_folder,
                                          target_map).serialize()
                    binary_classification_scorer_with_valid(
                        modeling_params, clf, transformed_test, exec_folder, test_df_index,
                        target_map=target_map, with_sample_weight=with_sample_weight).score()
                elif core_params["prediction_type"] == constants.MULTICLASS:
                    ClassificationModelIntrinsicScorer(modeling_params, clf, train_X, train_y,
                                                       pipeline, exec_folder, prepared_X, iipd,
                                                       calibrate_proba).score()
                    MulticlassModelSerializer(train_X.columns(), clf, modeling_params, exec_folder,
                                              target_map).serialize()
                    multiclass_scorer_with_valid(
                        modeling_params, clf, transformed_test, exec_folder, test_df_index,
                        target_map=target_map, with_sample_weight=with_sample_weight).score()
                else:
                    RegressionModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                   exec_folder, prepared_X, iipd).score()
                    RegressionModelSerializer(train_X.columns(), clf, modeling_params,
                                              exec_folder).serialize()
                    regression_scorer_with_valid(modeling_params, clf, transformed_test, exec_folder,
                                                 test_df_index, with_sample_weight).score()

    end = unix_time_millis()
    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
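# --- Illustrative sketch (not part of the original module) -------------------
# The calibration branch in fit_score_save above wraps an already-fitted
# classifier in scikit-learn's CalibratedClassifierCV with cv="prefit", so only
# the probability mapping (sigmoid or isotonic) is fitted on the held-out data,
# not the classifier itself. A minimal standalone demonstration (the dataset
# and split are made up):

from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, random_state=0)
X_train, X_calib, y_train, y_calib = train_test_split(X, y, test_size=0.3, random_state=0)

base_clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# cv="prefit": do not refit base_clf, only learn the calibration on X_calib
calibrated = CalibratedClassifierCV(base_clf, cv="prefit", method="sigmoid")
calibrated.fit(X_calib, y_calib)
probas = calibrated.predict_proba(X_calib)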
def train_prediction_kfold(core_params, preprocessing_set, split_desc):
    start = unix_time_millis()
    preprocessing_params = preprocessing_set['preprocessing_params']
    modeling_sets = preprocessing_set["modelingSets"]
    logging.info("PPS is %s" % preprocessing_params)

    preprocessing_listener = ProgressListener()
    preprocessing_listener.add_future_steps(constants.PRED_KFOLD_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.PRED_KFOLD_TRAIN_STATES)
        modeling_set["listener"] = listener

    def update_one_preprocessing_state(modeling_set):
        status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                              (preprocessing_listener, modeling_set["listener"]))
        utils.write_model_status(modeling_set, status)

    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            update_one_preprocessing_state(modeling_set)

    with preprocessing_listener.push_state(constants.STATE_LOADING_SRC):
        update_preprocessing_state()
        full_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                     core_params["prediction_type"])
        logging.info("Loaded full df: shape=(%d,%d)" % full_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(full_df, preprocessing_params)
        collector_data = collector.build()

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params,
                                                           preprocessing_set['run_folder'],
                                                           preprocessing_params)

    with preprocessing_listener.push_state(constants.KFOLDSTATE_PREPROCESS_GLOBAL):
        update_preprocessing_state()
        transformed_full = pipeline.fit_and_process(full_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    train_X = transformed_full["TRAIN"]
    train_y = transformed_full["target"]

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    calibrate_proba = core_params.get("calibration", {}).get("calibrationMethod", None) in ["SIGMOID", "ISOTONIC"]

    if with_sample_weight:
        assert transformed_full["weight"].values.min() > 0, "Sample weights must be positive"

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()
        update_fn = lambda: update_one_preprocessing_state(modeling_set)

        if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_FITTING_GLOBAL):
                # No out-of-fold data is available, so calibrate through classification_fit
                # on a random split
                if calibrate_proba:
                    calibration_method = core_params.get("calibration", {}).get("calibrationMethod").lower()
                else:
                    calibration_method = None
                update_one_preprocessing_state(modeling_set)
                (clf, out_params, prepared_X, iipd) = classification_fit(
                    modeling_set['modelingParams'], split_desc, transformed_full,
                    core_params["prediction_type"], modeling_set['run_folder'],
                    target_map=preproc_handler.target_map,
                    with_sample_weight=with_sample_weight,
                    with_class_weight=with_class_weight,
                    calibration=calibration_method)
            save_prediction_model(clf, out_params, modeling_set["listener"], update_fn,
                                  modeling_set['run_folder'])
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_SCORING_GLOBAL):
                update_one_preprocessing_state(modeling_set)
                ClassificationModelIntrinsicScorer(modeling_set['modelingParams'], clf, train_X,
                                                   train_y, pipeline, modeling_set['run_folder'],
                                                   prepared_X, iipd, calibrate_proba).score()
                if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
                    BinaryModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                          modeling_set['run_folder'], preproc_handler.target_map,
                                          calibrate_proba).serialize()
                else:
                    MulticlassModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                              modeling_set['run_folder'], preproc_handler.target_map,
                                              calibrate_proba).serialize()
        else:
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_FITTING_GLOBAL):
                update_one_preprocessing_state(modeling_set)
                (clf, out_params, prepared_X, iipd) = regression_fit_single(
                    modeling_set['modelingParams'], split_desc, transformed_full,
                    modeling_set["run_folder"], with_sample_weight=with_sample_weight)
            save_prediction_model(clf, out_params, modeling_set["listener"], update_fn,
                                  modeling_set['run_folder'])
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_SCORING_GLOBAL):
                update_fn()
                RegressionModelIntrinsicScorer(modeling_set['modelingParams'], clf, train_X, train_y,
                                               pipeline, modeling_set['run_folder'], prepared_X,
                                               iipd).score()
                # Serialize the model if possible
                RegressionModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                          modeling_set['run_folder']).serialize()

        full_df_clean = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                           core_params["prediction_type"])
        optimized_params = out_params["resolved"]
        logging.info("Regridifying post-train params: %s" % json.dumps(optimized_params))
        # Regridify the optimized params to a unary grid
        optimized_params_grid = intercom.backend_json_call("ml/prediction/regridify-to-pretrain", {
            "preTrain": json.dumps(modeling_set["modelingParams"]),
            "postTrain": json.dumps(optimized_params)
        })
        logging.info("Using unary grid params: %s" % json.dumps(optimized_params_grid))

        prediction_train_model_kfold(full_df_clean, core_params, split_desc, preprocessing_params,
                                     optimized_params_grid, preprocessing_set['run_folder'],
                                     modeling_set['run_folder'], modeling_set["listener"], update_fn,
                                     with_sample_weight, with_class_weight, calibrate_proba)

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set['run_folder'], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"
def create_ensemble(split_desc, core_params, model_folder, preprocessing_folder, model_folders,
                    preprocessing_folders):
    listener = ProgressListener()
    listener.add_future_steps(constants.ENSEMBLE_STATES)
    start = unix_time_millis()

    def update_preprocessing_state():
        utils.write_running_traininfo(model_folder, start, listener)

    split_desc = dkujson.loads(split_desc)
    core_params = dkujson.loads(core_params)

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    # TODO: update downstream
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}

    preprocessing_folders = dkujson.loads(preprocessing_folders)
    model_folders = dkujson.loads(model_folders)
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "rmodeling_params.json"))
    ensemble_params = modeling_params["ensemble_params"]

    logging.info("Creating ensemble")
    with listener.push_state(constants.STATE_ENSEMBLING):
        update_preprocessing_state()
        from dataiku.doctor.prediction.ensembles import ensemble_from_fitted
        train = df_from_split_desc(split_desc, "train",
                                   ensemble_params["preprocessing_params"][0]["per_feature"],
                                   core_params["prediction_type"])
        iperf = {
            "modelInputNRows": train.shape[0],  # TODO: not the right count, as rows may have been dropped
            "modelInputNCols": -1,  # makes no sense for an ensemble, as the models may have different preprocessings
            "modelInputIsSparse": False
        }
        dkujson.dump_to_filepath(osp.join(model_folder, "iperf.json"), iperf)
        clf = ensemble_from_fitted(core_params, ensemble_params, preprocessing_folders, model_folders,
                                   train, with_sample_weight, with_class_weight)

    logging.info("Saving model")
    with listener.push_state(constants.STATE_SAVING):
        update_preprocessing_state()
        with open(osp.join(model_folder, "clf.pkl"), dku_write_mode_for_pickling()) as f:
            pickle.dump(clf, f, 2)

    logging.info("Scoring model")
    with listener.push_state(constants.STATE_SCORING):
        update_preprocessing_state()
        test = df_from_split_desc(split_desc, "test",
                                  ensemble_params["preprocessing_params"][0]["per_feature"],
                                  core_params["prediction_type"])
        # This is annoying, but we have to use one of the previous preprocessings in order
        # to get the target
        prep_folder = preprocessing_folders[0]
        rppp = dkujson.load_from_filepath(osp.join(prep_folder, "rpreprocessing_params.json"))
        collector_data = dkujson.load_from_filepath(osp.join(prep_folder, "collector_data.json"))
        preprocessing_handler = PreprocessingHandler.build(core_params, rppp, prep_folder)
        preprocessing_handler.collector_data = collector_data
        pipe = preprocessing_handler.build_preprocessing_pipeline(with_target=True)
        transformed = pipe.process(test)
        y = transformed["target"]
        if with_sample_weight:
            sample_weight = transformed["weight"]
        else:
            sample_weight = None

        # Now that the clf with scorable pipelines has been saved, set it in
        # "pipelines with target" mode to be able to compute metrics
        clf.set_with_target_pipelines_mode(True)

        pred = clf.predict(test)
        probas = None if core_params["prediction_type"] == "REGRESSION" else clf.predict_proba(test)
        target_map = None if core_params["prediction_type"] == "REGRESSION" else \
            {t["sourceValue"]: t["mappedValue"]
             for t in ensemble_params["preprocessing_params"][0]["target_remapping"]}

        prediction_type = core_params["prediction_type"]
        if prediction_type == "REGRESSION":
            RegressionModelScorer(modeling_params, clf, pred, y, model_folder, transformed,
                                  test.index.copy(), sample_weight).score()
        elif prediction_type == "BINARY_CLASSIFICATION":
            BinaryClassificationModelScorer(modeling_params, clf, model_folder, pred, probas, y,
                                            target_map, transformed, test.index.copy(),
                                            sample_weight).score()
        else:
            MulticlassModelScorer(modeling_params, clf, model_folder, pred, probas, y.astype(int),
                                  target_map, transformed, test.index.copy(), sample_weight).score()

    update_preprocessing_state()
    end = unix_time_millis()
    dkujson.dump_to_filepath(osp.join(model_folder, "actual_params.json"), {"resolved": modeling_params})
    dkujson.dump_to_filepath(osp.join(preprocessing_folder, "preprocessing_report.json"), {})
    utils.write_done_traininfo(model_folder, start, end, end, listener, end_preprocessing_time=start)

    return "ok"
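# --- Illustrative sketch (not part of the original module) -------------------
# The ensemble above is persisted with pickle protocol 2, the highest protocol
# understood by both Python 2 and Python 3. A minimal round-trip of a fitted
# scikit-learn model using the same pickle.dump(clf, f, 2) call (the path and
# model are made up):

import pickle
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

X_demo, y_demo = make_regression(n_samples=50, n_features=3, random_state=0)
demo_clf = LinearRegression().fit(X_demo, y_demo)

with open("/tmp/clf.pkl", "wb") as f:
    pickle.dump(demo_clf, f, 2)   # protocol 2, as in create_ensemble above

with open("/tmp/clf.pkl", "rb") as f:
    demo_clf2 = pickle.load(f)
assert (demo_clf2.predict(X_demo) == demo_clf.predict(X_demo)).all()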
def train_clustering_models_nosave(split_desc, preprocessing_set):
    """Regular (mode 1) train:
    - Non-streamed single split + fit preprocessing on train + preprocess test
    - Fit N models sequentially:
        - Fit
        - Save clf
        - Compute and save clf performance
        - Score, save scored test set + scored performance
    """
    start = unix_time_millis()
    preprocessing_listener = ProgressListener()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]

    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.CLUSTERING_REGULAR_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.ALL_CLUSTERING_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = {
                "modelId": modeling_set["modelId"],
                "state": "RUNNING",
                "startTime": start,
                "progress": merge_listeners(preprocessing_listener, modeling_set["listener"])
            }
            utils.write_model_status(modeling_set, status)

    logging.info("START TRAIN: " + preprocessing_set["description"])

    with preprocessing_listener.push_state(constants.STATE_LOADING_SRC):
        update_preprocessing_state()
        source_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"])
        logging.info("Loaded source df: shape=(%d,%d)" % source_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = ClusteringPreprocessingDataCollector(source_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_set["preprocessing_params"],
                                                     preprocessing_set["run_folder"])
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_SRC):
        update_preprocessing_state()
        source_df_index = source_df.index.copy()
        # TODO: fit_and_process should take an update_fn argument
        transformed_source = pipeline.fit_and_process(source_df)

    # Save fitted resources and collector data
    preproc_handler.save_data()

    # Report on the work done
    report = {}
    pipeline.report_fit(report, {})
    utils.write_preproc_file(preprocessing_set["run_folder"], "preprocessing_report.json", report)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        clustering_train_score_save(transformed_source, source_df_index,
                                    preprocessing_set["preprocessing_params"],
                                    modeling_set["modelingParams"], modeling_set["run_folder"],
                                    modeling_set["listener"], update_modeling_state, pipeline)

        model_end = end = unix_time_millis()

        # Write the final model training info
        status = {
            "modelId": modeling_set["modelId"],
            "state": "DONE",
            "startTime": start,
            "endTime": end,
            "preprocessingTime": preprocessing_end - start,
            "trainingTime": model_end - model_start,
            "progress": merge_listeners(preprocessing_listener, modeling_set["listener"])
        }
        utils.write_model_status(modeling_set, status)

    return "ok"
def train_prediction_keras(core_params, preprocessing_set, split_desc):
    start = unix_time_millis()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]
    run_folder = preprocessing_set["run_folder"]
    logging.info("PPS is %s" % preprocessing_params)

    preprocessing_listener = ProgressListener()
    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.PRED_KERAS_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.PRED_KERAS_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

    with preprocessing_listener.push_state(constants.STATE_LOADING_TRAIN):
        update_preprocessing_state()
        train_df = df_from_split_desc(split_desc, "train", preprocessing_params['per_feature'],
                                      core_params["prediction_type"])
        train_df_orig = train_df.copy()
        logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

    # Not implemented in the UI so far, so preprocessor_fit_df will always be train_df
    preprocessor_fit_df = train_df
    need_subsampling = preprocessing_params["preprocessingFitSampleRatio"] < 1
    if need_subsampling:
        preprocessor_fit_df = preprocessor_fit_df.sample(
            frac=preprocessing_params["preprocessingFitSampleRatio"],
            random_state=preprocessing_params["preprocessingFitSampleSeed"])

    with preprocessing_listener.push_state(constants.STATE_LOADING_TEST):
        update_preprocessing_state()
        test_df = df_from_split_desc(split_desc, "test", preprocessing_params['per_feature'],
                                     core_params["prediction_type"])
        logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(preprocessor_fit_df, preprocessing_params)
        collector_data = collector.build()

    # Tag special features so that they are taken into account only in the
    # special_preproc_handler/special_pipeline
    per_feature = preprocessing_params["per_feature"]
    tag_special_features(per_feature)

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params, run_folder,
                                                           preprocessing_params, allow_empty_mf=True)

    with preprocessing_listener.push_state(constants.KERASSTATE_FIT_NORMAL_PREPROCESSING):
        update_preprocessing_state()
        # Retrieve the transformed values to get the shape of all regular inputs, even though they
        # won't actually be used, as each batch of data will be processed again
        transformed_normal = pipeline.fit_and_process(preprocessor_fit_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    # TODO: REVIEW STATES OF TRAINING
    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TRAIN):
        update_preprocessing_state()
    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TEST):
        update_preprocessing_state()

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        # Set env variables that may be accessed in user-defined code
        remoterun.set_dku_env_var_and_sys_env_var(constants.DKU_CURRENT_ANALYSIS_ID,
                                                  modeling_set["fullId"]["taskLoc"]["analysisId"])
        remoterun.set_dku_env_var_and_sys_env_var(constants.DKU_CURRENT_MLTASK_ID,
                                                  modeling_set["fullId"]["taskLoc"]["mlTaskId"])

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        prediction_train_model_keras(transformed_normal, train_df_orig, test_df, pipeline,
                                     modeling_set["modelingParams"], core_params, per_feature,
                                     modeling_set["run_folder"], modeling_set["listener"],
                                     update_modeling_state, preproc_handler.target_map,
                                     pipeline.generated_features_mapping)

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set["run_folder"], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"
def train_prediction_models_nosave(core_params, preprocessing_set, split_desc):
    """Regular (mode 1) train:
    - Non-streamed single split + fit preprocessing on train + preprocess test
    - Fit N models sequentially:
        - Fit
        - Save clf
        - Compute and save clf performance
        - Score, save scored test set + scored performance
    """
    start = unix_time_millis()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]
    logging.info("PPS is %s" % preprocessing_params)

    preprocessing_listener = ProgressListener()
    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.PRED_REGULAR_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        if modeling_set.get('modelingParams', {}).get('gridLength', 1) > 1:
            listener.add_future_step(constants.STATE_GRIDSEARCHING)
        listener.add_future_steps(constants.PRED_REGULAR_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

    with preprocessing_listener.push_state(constants.STATE_LOADING_TRAIN):
        update_preprocessing_state()
        train_df = df_from_split_desc(split_desc, "train", preprocessing_params['per_feature'],
                                      core_params["prediction_type"])
        logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)
        for col in train_df:
            logging.info("Train col: %s (%s)" % (col, train_df[col].dtype))

    with preprocessing_listener.push_state(constants.STATE_LOADING_TEST):
        update_preprocessing_state()
        test_df = df_from_split_desc(split_desc, "test", preprocessing_params['per_feature'],
                                     core_params["prediction_type"])
        logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(train_df, preprocessing_params)
        collector_data = collector.build()

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params,
                                                           preprocessing_set['run_folder'],
                                                           preprocessing_params)

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TRAIN):
        update_preprocessing_state()
        # TODO: fit_and_process should take an update_fn argument
        transformed_train = pipeline.fit_and_process(train_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TEST):
        update_preprocessing_state()
        test_df_index = test_df.index.copy()
        transformed_test = pipeline.process(test_df)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        # Since ensembles are never fitted through the doctor, no need to distinguish here
        prediction_train_score_save(transformed_train, transformed_test, test_df_index, core_params,
                                    split_desc, modeling_set["modelingParams"],
                                    modeling_set["run_folder"], modeling_set["listener"],
                                    preproc_handler.target_map, update_modeling_state, pipeline,
                                    modeling_set["run_folder"])

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set["run_folder"], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"