def clustering_rescore(split_desc, preprocessing_folder, model_folder):
    preprocessing_params = dkujson.load_from_filepath(osp.join(preprocessing_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "rmodeling_params.json"))
    user_meta = dkujson.load_from_filepath(osp.join(model_folder, "user_meta.json"))
    split_desc = dkujson.loads(split_desc)

    source_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"])
    logging.info("Loaded source df: shape=(%d,%d)" % source_df.shape)

    collector_data = dkujson.load_from_filepath(osp.join(preprocessing_folder, "collector_data.json"))
    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params, "")  # we're not saving the data
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    source_df_index = source_df.index.copy()
    transformed_source = pipeline.fit_and_process(source_df)

    logging.info("Loading the clustering model")
    with open(osp.join(model_folder, "clusterer.pkl"), "rb") as f:
        clf = pickle.load(f)

    try:
        logging.info("Post-processing the model")
        clf.post_process(user_meta)
    except AttributeError:
        # not all clusterers implement post_process
        pass

    train_np, is_sparse = prepare_multiframe(transformed_source["TRAIN"], modeling_params)
    cluster_labels = clf.predict(train_np)

    logging.info("Rescoring the clustering model")
    ClusteringModelScorer(clf, transformed_source, source_df_index, cluster_labels,
                          preprocessing_params, modeling_params, pipeline, model_folder).score()
    return "ok"
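
# Illustrative sketch (not part of the doctor codebase): the rescore flow above boils down
# to "unpickle a fitted clusterer, re-run predict on freshly preprocessed rows". A minimal
# standalone equivalent, assuming the pickle holds any scikit-learn-style clusterer and X
# is an already-preprocessed 2D numpy array:
def _sketch_clusterer_rescore(pkl_path, X):
    """Reload a pickled clusterer and recompute cluster labels for X."""
    import pickle
    with open(pkl_path, "rb") as f:
        clusterer = pickle.load(f)
    # Mirror the hedged post_process call above: tolerate models without the hook
    try:
        clusterer.post_process({})  # {} stands in for user_meta in this sketch
    except AttributeError:
        pass
    return clusterer.predict(X)
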
def _get_df(self, split):
    return df_from_split_desc(self._split_desc, split, self._preprocessing_params['per_feature'],
                              self._core_params["prediction_type"])
def train_prediction_kfold(core_params, preprocessing_set, split_desc):
    start = unix_time_millis()
    preprocessing_params = preprocessing_set['preprocessing_params']
    modeling_sets = preprocessing_set["modelingSets"]
    logging.info("PPS is %s" % preprocessing_params)

    preprocessing_listener = ProgressListener()
    preprocessing_listener.add_future_steps(constants.PRED_KFOLD_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.PRED_KFOLD_TRAIN_STATES)
        modeling_set["listener"] = listener

    def update_one_preprocessing_state(modeling_set):
        status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                              (preprocessing_listener, modeling_set["listener"]))
        utils.write_model_status(modeling_set, status)

    def update_preprocessing_state():
        # explicit loop: under Python 3 a bare map() is lazy, so statuses would never be written
        for modeling_set in modeling_sets:
            update_one_preprocessing_state(modeling_set)

    with preprocessing_listener.push_state(constants.STATE_LOADING_SRC):
        update_preprocessing_state()
        full_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                     core_params["prediction_type"])
        logging.info("Loaded full df: shape=(%d,%d)" % full_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(full_df, preprocessing_params)
        collector_data = collector.build()

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params,
                                                           preprocessing_set['run_folder'],
                                                           preprocessing_params)

    with preprocessing_listener.push_state(constants.KFOLDSTATE_PREPROCESS_GLOBAL):
        update_preprocessing_state()
        transformed_full = pipeline.fit_and_process(full_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    train_X = transformed_full["TRAIN"]
    train_y = transformed_full["target"]

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    calibrate_proba = core_params.get("calibration", {}).get("calibrationMethod", None) in ["SIGMOID", "ISOTONIC"]

    if with_sample_weight:
        assert transformed_full["weight"].values.min() > 0, "Sample weights must be positive"

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()
        update_fn = lambda: update_one_preprocessing_state(modeling_set)

        if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_FITTING_GLOBAL):
                # no out-fold available, so calibrate through classification_fit on a random split
                if calibrate_proba:
                    calibration_method = core_params.get("calibration", {}).get("calibrationMethod").lower()
                else:
                    calibration_method = None
                update_one_preprocessing_state(modeling_set)
                (clf, out_params, prepared_X, iipd) = classification_fit(
                    modeling_set['modelingParams'], split_desc, transformed_full,
                    core_params["prediction_type"], modeling_set['run_folder'],
                    target_map=preproc_handler.target_map,
                    with_sample_weight=with_sample_weight,
                    with_class_weight=with_class_weight,
                    calibration=calibration_method)
            save_prediction_model(clf, out_params, modeling_set["listener"], update_fn,
                                  modeling_set['run_folder'])
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_SCORING_GLOBAL):
                update_one_preprocessing_state(modeling_set)
                ClassificationModelIntrinsicScorer(modeling_set['modelingParams'], clf, train_X,
                                                   train_y, pipeline, modeling_set['run_folder'],
                                                   prepared_X, iipd, calibrate_proba).score()
                if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
                    BinaryModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                          modeling_set['run_folder'], preproc_handler.target_map,
                                          calibrate_proba).serialize()
                else:
                    MulticlassModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                              modeling_set['run_folder'], preproc_handler.target_map,
                                              calibrate_proba).serialize()
        else:
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_FITTING_GLOBAL):
                update_one_preprocessing_state(modeling_set)
                (clf, out_params, prepared_X, iipd) = regression_fit_single(
                    modeling_set['modelingParams'], split_desc, transformed_full,
                    modeling_set["run_folder"], with_sample_weight=with_sample_weight)
            save_prediction_model(clf, out_params, modeling_set["listener"], update_fn,
                                  modeling_set['run_folder'])
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_SCORING_GLOBAL):
                update_fn()
                RegressionModelIntrinsicScorer(modeling_set['modelingParams'], clf, train_X, train_y,
                                               pipeline, modeling_set['run_folder'], prepared_X,
                                               iipd).score()
                # serialize the model if possible
                RegressionModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                          modeling_set['run_folder']).serialize()

        full_df_clean = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                           core_params["prediction_type"])

        optimized_params = out_params["resolved"]
        logging.info("Regridifying post-train params: %s" % json.dumps(optimized_params))
        # Regridify the optimized params to a unary grid
        optimized_params_grid = intercom.backend_json_call("ml/prediction/regridify-to-pretrain", {
            "preTrain": json.dumps(modeling_set["modelingParams"]),
            "postTrain": json.dumps(optimized_params)
        })
        logging.info("Using unary grid params: %s" % json.dumps(optimized_params_grid))

        prediction_train_model_kfold(full_df_clean, core_params, split_desc, preprocessing_params,
                                     optimized_params_grid, preprocessing_set['run_folder'],
                                     modeling_set['run_folder'], modeling_set["listener"], update_fn,
                                     with_sample_weight, with_class_weight, calibrate_proba)

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set['run_folder'], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"
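
# Illustrative sketch (assumption: this is not the doctor's classification_fit). The SIGMOID
# and ISOTONIC calibration methods handled above correspond to what scikit-learn exposes via
# CalibratedClassifierCV, fitted on held-out folds since no out-fold is available here:
def _sketch_probability_calibration(X, y, method="sigmoid"):
    """Fit a classifier and calibrate its probabilities; method is 'sigmoid' or 'isotonic'."""
    from sklearn.linear_model import LogisticRegression
    from sklearn.calibration import CalibratedClassifierCV
    base = LogisticRegression()
    calibrated = CalibratedClassifierCV(base, method=method, cv=3)
    calibrated.fit(X, y)
    return calibrated  # predict_proba now returns calibrated probabilities
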
def create_ensemble(split_desc, core_params, model_folder, preprocessing_folder, model_folders,
                    preprocessing_folders):
    listener = ProgressListener()
    listener.add_future_steps(constants.ENSEMBLE_STATES)
    start = unix_time_millis()

    def update_preprocessing_state():
        utils.write_running_traininfo(model_folder, start, listener)

    split_desc = dkujson.loads(split_desc)
    core_params = dkujson.loads(core_params)

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}  # TODO: update downstream
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}

    preprocessing_folders = dkujson.loads(preprocessing_folders)
    model_folders = dkujson.loads(model_folders)
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "rmodeling_params.json"))
    ensemble_params = modeling_params["ensemble_params"]

    logging.info("creating ensemble")
    with listener.push_state(constants.STATE_ENSEMBLING):
        update_preprocessing_state()
        from dataiku.doctor.prediction.ensembles import ensemble_from_fitted
        train = df_from_split_desc(split_desc, "train",
                                   ensemble_params["preprocessing_params"][0]["per_feature"],
                                   core_params["prediction_type"])
        iperf = {
            "modelInputNRows": train.shape[0],  # TODO: not the right count, as rows may have been dropped
            "modelInputNCols": -1,  # makes no sense for an ensemble, as members may have different preprocessings
            "modelInputIsSparse": False
        }
        dkujson.dump_to_filepath(osp.join(model_folder, "iperf.json"), iperf)
        clf = ensemble_from_fitted(core_params, ensemble_params, preprocessing_folders,
                                   model_folders, train, with_sample_weight, with_class_weight)

    logging.info("saving model")
    with listener.push_state(constants.STATE_SAVING):
        update_preprocessing_state()
        with open(osp.join(model_folder, "clf.pkl"), dku_write_mode_for_pickling()) as f:
            pickle.dump(clf, f, 2)

    logging.info("scoring model")
    with listener.push_state(constants.STATE_SCORING):
        update_preprocessing_state()
        test = df_from_split_desc(split_desc, "test",
                                  ensemble_params["preprocessing_params"][0]["per_feature"],
                                  core_params["prediction_type"])
        # this is annoying, but we have to use one of the previous preprocessings in order to get the target
        prep_folder = preprocessing_folders[0]
        rppp = dkujson.load_from_filepath(osp.join(prep_folder, "rpreprocessing_params.json"))
        collector_data = dkujson.load_from_filepath(osp.join(prep_folder, "collector_data.json"))
        preprocessing_handler = PreprocessingHandler.build(core_params, rppp, prep_folder)
        preprocessing_handler.collector_data = collector_data
        pipe = preprocessing_handler.build_preprocessing_pipeline(with_target=True)
        transformed = pipe.process(test)
        y = transformed["target"]
        if with_sample_weight:
            sample_weight = transformed["weight"]
        else:
            sample_weight = None

        # Now that the CLF with scorable pipelines has been saved, set it in
        # "pipelines with target" mode to be able to compute metrics
        clf.set_with_target_pipelines_mode(True)

        pred = clf.predict(test)
        probas = None if core_params["prediction_type"] == "REGRESSION" else clf.predict_proba(test)
        target_map = None if core_params["prediction_type"] == "REGRESSION" else \
            {t["sourceValue"]: t["mappedValue"]
             for t in ensemble_params["preprocessing_params"][0]["target_remapping"]}
        prediction_type = core_params["prediction_type"]
        if prediction_type == "REGRESSION":
            RegressionModelScorer(modeling_params, clf, pred, y, model_folder, transformed,
                                  test.index.copy(), sample_weight).score()
        elif prediction_type == "BINARY_CLASSIFICATION":
            BinaryClassificationModelScorer(modeling_params, clf, model_folder, pred, probas, y,
                                            target_map, transformed, test.index.copy(),
                                            sample_weight).score()
        else:
            MulticlassModelScorer(modeling_params, clf, model_folder, pred, probas, y.astype(int),
                                  target_map, transformed, test.index.copy(), sample_weight).score()

    update_preprocessing_state()
    end = unix_time_millis()
    dkujson.dump_to_filepath(osp.join(model_folder, "actual_params.json"), {"resolved": modeling_params})
    dkujson.dump_to_filepath(osp.join(preprocessing_folder, "preprocessing_report.json"), {})
    utils.write_done_traininfo(model_folder, start, end, end, listener, end_preprocessing_time=start)

    return "ok"
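
# Illustrative sketch (assumption: ensemble_from_fitted's internals are not shown here).
# A naive probability-averaging ensemble over already-fitted members, assuming all members
# share the same class ordering in predict_proba, could look like:
def _sketch_proba_averaging_ensemble(members, X):
    """Average predict_proba over fitted classifiers and derive hard predictions."""
    import numpy as np
    probas = np.mean([m.predict_proba(X) for m in members], axis=0)
    return probas.argmax(axis=1), probas  # (hard class indices, averaged probabilities)
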
def train_clustering_models_nosave(split_desc, preprocessing_set):
    """Regular (mode 1) train:
        - Non-streamed single split + fit preprocess on train + preprocess test
        - Fit N models sequentially
            - Fit
            - Save clf
            - Compute and save clf performance
            - Score, save scored test set + scored performance
    """
    start = unix_time_millis()
    preprocessing_listener = ProgressListener()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]

    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.CLUSTERING_REGULAR_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.ALL_CLUSTERING_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = {
                "modelId": modeling_set["modelId"],
                "state": "RUNNING",
                "startTime": start,
                "progress": merge_listeners(preprocessing_listener, modeling_set["listener"])
            }
            utils.write_model_status(modeling_set, status)

    logging.info("START TRAIN :" + preprocessing_set["description"])

    with preprocessing_listener.push_state(constants.STATE_LOADING_SRC):
        update_preprocessing_state()
        source_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"])
        logging.info("Loaded source df: shape=(%d,%d)" % source_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = ClusteringPreprocessingDataCollector(source_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params,
                                                     preprocessing_set["run_folder"])
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_SRC):
        update_preprocessing_state()
        source_df_index = source_df.index.copy()
        # TODO: fit_and_process should take an update_fn argument
        transformed_source = pipeline.fit_and_process(source_df)

    # Saves fitted resources and collector data
    preproc_handler.save_data()

    # Report on work
    report = {}
    pipeline.report_fit(report, {})
    utils.write_preproc_file(preprocessing_set["run_folder"], "preprocessing_report.json", report)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        # use this model's own listener, not the stale variable left over from the setup loop
        clustering_train_score_save(transformed_source, source_df_index, preprocessing_params,
                                    modeling_set["modelingParams"], modeling_set["run_folder"],
                                    modeling_set["listener"], update_modeling_state, pipeline)

        model_end = end = unix_time_millis()

        # Write the final model training info
        status = {
            "modelId": modeling_set["modelId"],
            "state": "DONE",
            "startTime": start,
            "endTime": end,
            "preprocessingTime": preprocessing_end - start,
            "trainingTime": model_end - model_start,
            "progress": merge_listeners(preprocessing_listener, modeling_set["listener"])
        }
        utils.write_model_status(modeling_set, status)

    return "ok"
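
# Illustrative sketch (hypothetical, not the real ProgressListener): the push_state pattern
# used throughout these trainers is just a context manager that marks a step as running on
# entry and done on normal exit, so the persisted progress stays truthful:
import contextlib

class _SketchProgressListener:
    """Hypothetical minimal stand-in for ProgressListener, for illustration only."""

    def __init__(self):
        self.states = []  # (step_name, status) pairs, oldest first

    @contextlib.contextmanager
    def push_state(self, name):
        self.states.append((name, "RUNNING"))
        yield
        # a step that raises stays RUNNING, which is what the status file should show
        self.states[-1] = (name, "DONE")

# Usage mirrors the trainers above:
#     listener = _SketchProgressListener()
#     with listener.push_state("LOADING_SRC"):
#         ...  # load data; listener.states now ends with ("LOADING_SRC", "DONE")
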
def train_prediction_keras(core_params, preprocessing_set, split_desc):
    start = unix_time_millis()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]
    run_folder = preprocessing_set["run_folder"]
    logging.info("PPS is %s" % preprocessing_params)

    preprocessing_listener = ProgressListener()
    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.PRED_KERAS_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.PRED_KERAS_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

    with preprocessing_listener.push_state(constants.STATE_LOADING_TRAIN):
        update_preprocessing_state()
        train_df = df_from_split_desc(split_desc, "train", preprocessing_params['per_feature'],
                                      core_params["prediction_type"])
        train_df_orig = train_df.copy()
        logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

    # Not implemented in the UI so far, so preprocessor_fit_df will always be train_df
    preprocessor_fit_df = train_df
    need_subsampling = preprocessing_params["preprocessingFitSampleRatio"] < 1
    if need_subsampling:
        preprocessor_fit_df = preprocessor_fit_df.sample(
            frac=preprocessing_params["preprocessingFitSampleRatio"],
            random_state=preprocessing_params["preprocessingFitSampleSeed"])

    with preprocessing_listener.push_state(constants.STATE_LOADING_TEST):
        update_preprocessing_state()
        test_df = df_from_split_desc(split_desc, "test", preprocessing_params['per_feature'],
                                     core_params["prediction_type"])
        logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(preprocessor_fit_df, preprocessing_params)
        collector_data = collector.build()

    # Tagging special features to take them into account only in special_preproc_handler/special_pipeline
    per_feature = preprocessing_params["per_feature"]
    tag_special_features(per_feature)

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params, run_folder,
                                                           preprocessing_params, allow_empty_mf=True)

    with preprocessing_listener.push_state(constants.KERASSTATE_FIT_NORMAL_PREPROCESSING):
        update_preprocessing_state()
        # Retrieve transformed values to get the shape of all regular inputs, even though they
        # won't actually be used: each batch of data will be processed again
        transformed_normal = pipeline.fit_and_process(preprocessor_fit_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    # TODO: REVIEW STATES OF TRAINING
    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TRAIN):
        update_preprocessing_state()
    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TEST):
        update_preprocessing_state()

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        # Set env variables that may be accessed in user-defined code
        remoterun.set_dku_env_var_and_sys_env_var(constants.DKU_CURRENT_ANALYSIS_ID,
                                                  modeling_set["fullId"]["taskLoc"]["analysisId"])
        remoterun.set_dku_env_var_and_sys_env_var(constants.DKU_CURRENT_MLTASK_ID,
                                                  modeling_set["fullId"]["taskLoc"]["mlTaskId"])

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        prediction_train_model_keras(transformed_normal, train_df_orig, test_df, pipeline,
                                     modeling_set["modelingParams"], core_params, per_feature,
                                     modeling_set["run_folder"], modeling_set["listener"],
                                     update_modeling_state, preproc_handler.target_map,
                                     pipeline.generated_features_mapping)

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set["run_folder"], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"
def train_prediction_models_nosave(core_params, preprocessing_set, split_desc):
    """Regular (mode 1) train:
        - Non-streamed single split + fit preprocess on train + preprocess test
        - Fit N models sequentially
            - Fit
            - Save clf
            - Compute and save clf performance
            - Score, save scored test set + scored performance
    """
    start = unix_time_millis()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]
    logging.info("PPS is %s" % preprocessing_params)

    preprocessing_listener = ProgressListener()
    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.PRED_REGULAR_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        if modeling_set.get('modelingParams', {}).get('gridLength', 1) > 1:
            listener.add_future_step(constants.STATE_GRIDSEARCHING)
        listener.add_future_steps(constants.PRED_REGULAR_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

    with preprocessing_listener.push_state(constants.STATE_LOADING_TRAIN):
        update_preprocessing_state()
        train_df = df_from_split_desc(split_desc, "train", preprocessing_params['per_feature'],
                                      core_params["prediction_type"])
        logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)
        for col in train_df:
            logging.info("Train col : %s (%s)" % (col, train_df[col].dtype))

    with preprocessing_listener.push_state(constants.STATE_LOADING_TEST):
        update_preprocessing_state()
        test_df = df_from_split_desc(split_desc, "test", preprocessing_params['per_feature'],
                                     core_params["prediction_type"])
        logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(train_df, preprocessing_params)
        collector_data = collector.build()

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params,
                                                           preprocessing_set['run_folder'],
                                                           preprocessing_params)

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TRAIN):
        update_preprocessing_state()
        # TODO: fit_and_process should take an update_fn argument
        transformed_train = pipeline.fit_and_process(train_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TEST):
        update_preprocessing_state()
        test_df_index = test_df.index.copy()
        transformed_test = pipeline.process(test_df)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        # since ensembles are never fitted through the doctor, no need to distinguish here
        prediction_train_score_save(transformed_train, transformed_test, test_df_index, core_params,
                                    split_desc, modeling_set["modelingParams"],
                                    modeling_set["run_folder"], modeling_set["listener"],
                                    preproc_handler.target_map, update_modeling_state, pipeline,
                                    modeling_set["run_folder"])

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set["run_folder"], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"
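
# Illustrative sketch (assumption: pipeline.fit_and_process / pipeline.process mirror
# scikit-learn's fit_transform / transform split). Fitting the preprocessing on train data
# only, then merely applying the frozen transform to test data, is what keeps the test
# metrics leak-free:
def _sketch_train_test_preprocessing(train_X, test_X):
    """Fit a scaler on train data only, then apply the frozen transform to test data."""
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    transformed_train = scaler.fit_transform(train_X)  # like pipeline.fit_and_process(train_df)
    transformed_test = scaler.transform(test_X)        # like pipeline.process(test_df)
    return transformed_train, transformed_test
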