def main(exec_folder, output_dataset, keptInputColumns):
    """Fit a clustering model on the "full" split and write labelled rows out.

    Reads ``_esplit.json``, ``rpreprocessing_params.json`` and
    ``rmodeling_params.json`` from ``exec_folder``, preprocesses the "full"
    split, fits the clustering model, attaches a ``cluster_labels`` column to
    the un-normalized input rows and writes the result to ``output_dataset``.

    Args:
        exec_folder: Folder holding the JSON parameter files; also handed to
            the preprocessing handler and to the final train-info writer.
        output_dataset: Name passed to ``dataiku.Dataset`` for the output.
        keptInputColumns: Optional list of input column names to keep next to
            ``cluster_labels``; ``None`` keeps every column of the joined
            frame.
    """
    start = unix_time_millis()
    listener = ProgressListener()

    def load_json(filename):
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(osp.join(exec_folder, filename)) as f:
            return json.load(f)

    split_desc = load_json("_esplit.json")
    preprocessing_params = load_json("rpreprocessing_params.json")
    modeling_params = load_json("rmodeling_params.json")

    with listener.push_state(constants.STATE_LOADING_SRC):
        input_df = df_from_split_desc_no_normalization(split_desc, "full", preprocessing_params["per_feature"])
        logging.info("Loaded full df: shape=(%d,%d)" % input_df.shape)
        # Keep the raw values: the output dataset is built from the
        # un-normalized rows, only the model sees the normalized frame.
        input_df_orig = input_df.copy()
        input_df = utils.normalize_dataframe(input_df, preprocessing_params["per_feature"])

    with listener.push_state("Collecting preprocessing data"):
        collector = ClusteringPreprocessingDataCollector(input_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params, exec_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with listener.push_state("Preprocessing data"):
        transformed_train = pipeline.fit_and_process(input_df)

    start_train = unix_time_millis()

    (clf, actual_params, cluster_labels, additional_columns) = clustering_fit(modeling_params, transformed_train)

    # if model has custom labels, use them
    try:
        cluster_names = clf.get_cluster_labels()
    except AttributeError:
        cluster_names = ["cluster_%s" % i for i in range(len(np.unique(cluster_labels)))]
    cl = pd.Series(data=cluster_labels, name="cluster_labels").map(lambda i: cluster_names[i])
    # Labels come back positionally; re-index them on the preprocessed rows so
    # the left join below lines up with the original frame.
    # NOTE(review): presumably rows dropped by preprocessing end up with a NaN
    # label after the join -- confirm against the pipeline.
    cl.index = transformed_train["TRAIN"].index

    final_df = pd.concat([input_df_orig.join(cl, how='left'), additional_columns], axis=1)

    if keptInputColumns is not None:
        final_df = final_df[keptInputColumns + ['cluster_labels']]

    if preprocessing_params["outliers"]["method"] == "CLUSTER":
        # Assign the filled column back instead of calling
        # ``fillna(inplace=True)`` on a column selection: the chained in-place
        # form does not update ``final_df`` under pandas Copy-on-Write.
        final_df = final_df.assign(
            cluster_labels=final_df['cluster_labels'].fillna(constants.CLUSTER_OUTLIERS))

    dataiku.Dataset(output_dataset).write_from_dataframe(final_df)

    end = unix_time_millis()

    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
def main(exec_folder):
    """Train, score and save a clustering model from an execution folder.

    Reads ``_esplit.json``, ``rpreprocessing_params.json`` and
    ``rmodeling_params.json`` from ``exec_folder``, fits the preprocessing
    pipeline on the "full" split, then delegates to
    ``clustering_train_score_save``. Running and final train-info are written
    through ``utils``.

    Args:
        exec_folder: Folder holding the JSON parameter files; also used as the
            destination passed to the preprocessing handler and the trainer.
    """
    start = unix_time_millis()
    listener = ProgressListener()

    def update_fn():
        utils.write_running_traininfo(exec_folder, start, listener)

    def load_json(filename):
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(osp.join(exec_folder, filename)) as f:
            return json.load(f)

    split_desc = load_json("_esplit.json")
    preprocessing_params = load_json("rpreprocessing_params.json")
    modeling_params = load_json("rmodeling_params.json")

    with listener.push_state(constants.STATE_LOADING_SRC):
        update_fn()
        train_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"])
        logging.info("Loaded full df: shape=(%d,%d)" % train_df.shape)

    with listener.push_state("Collecting preprocessing data"):
        update_fn()
        collector = ClusteringPreprocessingDataCollector(train_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params, exec_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with listener.push_state("Preprocessing data"):
        # Snapshot the index before the pipeline runs; it is handed to the
        # trainer alongside the transformed data.
        orig_index = train_df.index.copy()
        transformed_train = pipeline.fit_and_process(train_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    start_train = unix_time_millis()

    clustering_train_score_save(transformed_train, orig_index, preprocessing_params, modeling_params,
                                exec_folder, listener, update_fn, pipeline)

    end = unix_time_millis()

    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
def main(exec_folder, selection_state_folder, operation_mode):
    """Run a saved-model prediction training inside a single execution folder.

    Reads ``_esplit.json``, ``core_params.json``,
    ``rpreprocessing_params.json`` and ``rmodeling_params.json`` from
    ``exec_folder`` and then, depending on ``operation_mode``:

    * ``"TRAIN_SPLITTED_ONLY"``: fit preprocessing on the train split and
      train/score/save against the test split.
    * ``"TRAIN_FULL_ONLY"``: fit and save on the full data only.
    * ``"TRAIN_KFOLD"``: fit and save on the full data, then run the k-fold
      training with the resolved parameters regridified to a unary grid.
    * anything else: fit and save on the full data, then fit on the train
      split and score on the test split.

    Args:
        exec_folder: Folder holding the JSON parameter files and receiving
            every output of the run.
        selection_state_folder: Forwarded to ``build_pipeline_and_handler``.
        operation_mode: One of the modes listed above.
    """
    start = unix_time_millis()
    start_train = start
    listener = ProgressListener()

    def update_fn():
        utils.write_running_traininfo(exec_folder, start, listener)

    def load_json(filename):
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(osp.join(exec_folder, filename)) as f:
            return json.load(f)

    split_desc = load_json("_esplit.json")
    core_params = load_json("core_params.json")
    preprocessing_params = load_json("rpreprocessing_params.json")

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    calibrate_proba = core_params.get("calibration", {}).get("calibrationMethod", None) in ["SIGMOID", "ISOTONIC"]

    modeling_params = load_json("rmodeling_params.json")

    prediction_type = core_params["prediction_type"]
    is_classification = prediction_type in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS)
    is_keras = modeling_params["algorithm"] == "KERAS_CODE"

    # For KERAS backend, need to tag special features, because they are only processed with process function,
    # not fit_and_process
    if is_keras:
        tag_special_features(preprocessing_params['per_feature'])

    def load_split(split_name, state, label):
        """Load one split under the given progress state and log its shape."""
        with listener.push_state(state):
            update_fn()
            df = df_from_split_desc(split_desc, split_name, preprocessing_params["per_feature"],
                                    prediction_type)
            n_rows, n_cols = df.shape
            logging.info("Loaded %s df: shape=(%d,%d)" % (label, n_rows, n_cols))
        return df

    def collect_and_build(fit_df):
        """Collect preprocessing data on fit_df and build pipeline, handler and target map."""
        with listener.push_state("Collecting preprocessing data"):
            update_fn()
            collector = PredictionPreprocessingDataCollector(fit_df, preprocessing_params)
            collector_data = collector.build()
            pipeline, preproc_handler = build_pipeline_and_handler(
                collector_data, core_params, exec_folder, preprocessing_params,
                selection_state_folder=selection_state_folder,
                allow_empty_mf=is_keras)
        # TODO
        target_map = preproc_handler.target_map if is_classification else None
        return pipeline, preproc_handler, target_map

    def preprocessor_fit_frame(df):
        """Return the frame the preprocessing is fitted on.

        For KERAS backend, we might need to take a subsample of the input to
        prevent memory errors; every other backend fits on df itself.
        """
        if is_keras and preprocessing_params["preprocessingFitSampleRatio"] < 1:
            return df.sample(frac=preprocessing_params["preprocessingFitSampleRatio"],
                             random_state=preprocessing_params["preprocessingFitSampleSeed"])
        return df

    def check_sample_weights(transformed):
        """Fail early when sample weights are enabled but not strictly positive."""
        if with_sample_weight:
            assert transformed["weight"].values.min() > 0, "Sample weights must be positive"

    def train_keras(transformed, df_orig, test_df, pipeline, preproc_handler, **kwargs):
        """Delegate to the Keras trainer with a status callback bound to exec_folder."""
        modeling_set = {"run_folder": exec_folder, "listener": listener}

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start, modeling_set["listener"])
            utils.write_model_status(modeling_set, status)

        return prediction_train_model_keras(transformed, df_orig, test_df, pipeline, modeling_params,
                                            core_params, preprocessing_params["per_feature"], exec_folder,
                                            listener, update_modeling_state, preproc_handler.target_map,
                                            pipeline.generated_features_mapping, **kwargs)

    def do_full_fit_and_save():
        """Fit on 100% and save the clf and out params"""
        full_df = load_split("full", constants.STATE_LOADING_TRAIN, "FULL")

        # The prediction collector is used here, like in every other
        # prediction path of this function (the original used the clustering
        # collector for this one branch only).
        pipeline, preproc_handler, target_map = collect_and_build(full_df)

        with listener.push_state("Preprocessing full set"):
            # Only the Keras trainer needs the untouched frame.
            full_df_orig = full_df.copy() if is_keras else None
            transformed_full = pipeline.fit_and_process(preprocessor_fit_frame(full_df))
            check_sample_weights(transformed_full)
            preproc_handler.save_data()
            preproc_handler.report(pipeline)

        if is_keras:
            empty_df = pd.DataFrame()
            return train_keras(transformed_full, full_df_orig, empty_df, pipeline, preproc_handler)
        return fit_score_save(pipeline, target_map, transformed_full)

    def fit_score_save(pipeline, target_map, transformed_full):
        """Fit on the transformed full set, save the model, score and serialize it."""
        with listener.push_state(constants.STATE_FITTING):
            update_fn()
            if is_classification:
                (clf, out_params, prepared_X, iipd) = classification_fit(
                    modeling_params, split_desc, transformed_full, prediction_type, exec_folder,
                    target_map=target_map, with_sample_weight=with_sample_weight,
                    with_class_weight=with_class_weight)
                if calibrate_proba:
                    method = core_params.get("calibration", {}).get("calibrationMethod").lower()
                    calibrated_clf = CalibratedClassifierCV(clf, cv="prefit", method=method)
                    test_X = transformed_full["TRAIN"]
                    # Fixed: this used to read modeling_set['modelingParams'],
                    # a name that is not bound with that key anywhere in this
                    # function, so calibration always raised.
                    test_X, _ = prepare_multiframe(test_X, modeling_params)
                    test_y = transformed_full["target"].astype(int)
                    if with_sample_weight:
                        test_weight = transformed_full["weight"].astype(float)
                        calibrated_clf.fit(test_X, test_y, sample_weight=test_weight)
                    else:
                        calibrated_clf.fit(test_X, test_y)
                    clf = calibrated_clf
            else:
                (clf, out_params, prepared_X, iipd) = regression_fit_single(
                    modeling_params, split_desc, transformed_full, exec_folder,
                    with_sample_weight=with_sample_weight)

        with listener.push_state(constants.STATE_SAVING):
            save_prediction_model(clf, out_params, listener, update_fn, exec_folder)

        with listener.push_state(constants.STATE_SCORING):
            train_X = transformed_full["TRAIN"]
            train_y = transformed_full["target"]
            if prediction_type == constants.BINARY_CLASSIFICATION:
                ClassificationModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                   exec_folder, prepared_X, iipd, calibrate_proba).score()
                BinaryModelSerializer(train_X.columns(), clf, modeling_params, exec_folder, target_map,
                                      calibrate_proba).serialize()
            elif prediction_type == constants.MULTICLASS:
                ClassificationModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                   exec_folder, prepared_X, iipd, calibrate_proba).score()
                MulticlassModelSerializer(train_X.columns(), clf, modeling_params, exec_folder, target_map,
                                          calibrate_proba).serialize()
            else:
                RegressionModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                               exec_folder, prepared_X, iipd).score()
                RegressionModelSerializer(train_X.columns(), clf, modeling_params, exec_folder).serialize()
        return out_params

    if operation_mode == "TRAIN_SPLITTED_ONLY":
        train_df = load_split("train", constants.STATE_LOADING_TRAIN, "train")
        test_df = load_split("test", constants.STATE_LOADING_TEST, "test")
        pipeline, preproc_handler, target_map = collect_and_build(train_df)

        with listener.push_state("Preprocessing train set"):
            train_df_orig = train_df.copy() if is_keras else None
            transformed_train = pipeline.fit_and_process(preprocessor_fit_frame(train_df))
            check_sample_weights(transformed_train)
            preproc_handler.save_data()
            preproc_handler.report(pipeline)

        # For KERAS backend, cannot process test directly, because it may have special features that may not
        # hold in memory
        if not is_keras:
            with listener.push_state("Preprocessing test set"):
                test_df_index = test_df.index.copy()
                transformed_test = pipeline.process(test_df)
                check_sample_weights(transformed_test)

        if modeling_params["algorithm"] == "PYTHON_ENSEMBLE":
            prediction_train_score_save_ensemble(train_df, test_df, core_params, split_desc, modeling_params,
                                                 exec_folder, listener, target_map, update_fn, pipeline,
                                                 with_sample_weight)
        elif is_keras:
            train_keras(transformed_train, train_df_orig, test_df, pipeline, preproc_handler)
        else:
            prediction_train_score_save(transformed_train, transformed_test, test_df_index, core_params,
                                        split_desc, modeling_params, exec_folder, listener, target_map,
                                        update_fn, pipeline, exec_folder)

    elif operation_mode == "TRAIN_FULL_ONLY":
        # Not yet functional ...
        do_full_fit_and_save()

    elif operation_mode == "TRAIN_KFOLD":
        out_params = do_full_fit_and_save()

        full_df_clean = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                           prediction_type)

        optimized_params = out_params["resolved"]
        logging.info("Regridifying post-train params: %s" % json.dumps(optimized_params))

        # Regridify to a unary grid the optimized params
        optimized_params_grid = intercom.backend_json_call("ml/prediction/regridify-to-pretrain", {
            "preTrain": json.dumps(modeling_params),
            "postTrain": json.dumps(optimized_params)
        })
        logging.info("Using unary grid params: %s" % json.dumps(optimized_params_grid))

        prediction_train_model_kfold(full_df_clean, core_params, split_desc, preprocessing_params,
                                     optimized_params_grid, exec_folder, exec_folder, listener, update_fn,
                                     with_sample_weight, with_class_weight, calibrate_proba)

    else:
        do_full_fit_and_save()

        # Do the split and scoring but don't save data
        train_df = load_split("train", constants.STATE_LOADING_TRAIN, "train")
        test_df = load_split("test", constants.STATE_LOADING_TEST, "test")
        pipeline, preproc_handler, target_map = collect_and_build(train_df)

        with listener.push_state("Preprocessing train set"):
            train_df_orig = train_df.copy() if is_keras else None
            transformed_train = pipeline.fit_and_process(preprocessor_fit_frame(train_df))

        # For KERAS backend, cannot process test directly, because it may have special features that may not
        # hold in memory
        if not is_keras:
            with listener.push_state("Preprocessing test set"):
                test_df_index = test_df.index.copy()
                transformed_test = pipeline.process(test_df)

        if is_keras:
            train_keras(transformed_train, train_df_orig, test_df, pipeline, preproc_handler,
                        save_model=False)
        else:
            with listener.push_state(constants.STATE_FITTING):
                update_fn()
                if is_classification:
                    # NOTE(review): unlike the other classification_fit calls
                    # in this file, no run folder is passed here -- confirm
                    # that this is intended for the "don't save" path.
                    (clf, out_params, prepared_X, iipd) = classification_fit(
                        modeling_params, split_desc, transformed_train, prediction_type,
                        target_map=target_map, with_sample_weight=with_sample_weight,
                        with_class_weight=with_class_weight)
                else:
                    (clf, out_params, prepared_X, iipd) = regression_fit_single(
                        modeling_params, split_desc, transformed_train, exec_folder,
                        with_sample_weight=with_sample_weight)

            with listener.push_state(constants.STATE_SCORING):
                train_X = transformed_train["TRAIN"]
                train_y = transformed_train["target"]
                if prediction_type == constants.BINARY_CLASSIFICATION:
                    ClassificationModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                       exec_folder, prepared_X, iipd, calibrate_proba).score()
                    BinaryModelSerializer(train_X.columns(), clf, modeling_params, exec_folder,
                                          target_map).serialize()
                    binary_classification_scorer_with_valid(modeling_params, clf, transformed_test,
                                                            exec_folder, test_df_index,
                                                            target_map=target_map,
                                                            with_sample_weight=with_sample_weight).score()
                elif prediction_type == constants.MULTICLASS:
                    ClassificationModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                       exec_folder, prepared_X, iipd, calibrate_proba).score()
                    MulticlassModelSerializer(train_X.columns(), clf, modeling_params, exec_folder,
                                              target_map).serialize()
                    multiclass_scorer_with_valid(modeling_params, clf, transformed_test, exec_folder,
                                                 test_df_index, target_map=target_map,
                                                 with_sample_weight=with_sample_weight).score()
                else:
                    RegressionModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                   exec_folder, prepared_X, iipd).score()
                    RegressionModelSerializer(train_X.columns(), clf, modeling_params,
                                              exec_folder).serialize()
                    regression_scorer_with_valid(modeling_params, clf, transformed_test, exec_folder,
                                                 test_df_index, with_sample_weight).score()

    end = unix_time_millis()
    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
def train_prediction_kfold(core_params, preprocessing_set, split_desc):
    """Train every model of a preprocessing set in k-fold mode.

    The preprocessing pipeline is fitted once on the "full" split. Then, for
    each modeling set, a global model is fitted on the full data, saved,
    scored and serialized, and its resolved parameters are regridified to a
    unary grid that is handed to ``prediction_train_model_kfold``.

    Args:
        core_params: Core task parameters; ``prediction_type``, ``weight`` and
            ``calibration`` are read here.
        preprocessing_set: Dict providing ``preprocessing_params``,
            ``modelingSets`` and ``run_folder``. Each modeling set gets a
            ``listener`` entry added.
        split_desc: Split description forwarded to ``df_from_split_desc``.

    Returns:
        The string ``"ok"``.
    """
    start = unix_time_millis()
    preprocessing_params = preprocessing_set['preprocessing_params']
    modeling_sets = preprocessing_set["modelingSets"]

    logging.info("PPS is %s" % preprocessing_params)

    preprocessing_listener = ProgressListener()
    preprocessing_listener.add_future_steps(constants.PRED_KFOLD_PREPROCESSING_STATES)

    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.PRED_KFOLD_TRAIN_STATES)
        modeling_set["listener"] = listener

    def update_one_preprocessing_state(modeling_set):
        status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                              (preprocessing_listener, modeling_set["listener"]))
        utils.write_model_status(modeling_set, status)

    def update_preprocessing_state():
        # Explicit loop: ``map`` is lazy on Python 3, so the previous
        # ``map(update_one_preprocessing_state, modeling_sets)`` never wrote
        # any status there.
        for modeling_set in modeling_sets:
            update_one_preprocessing_state(modeling_set)

    with preprocessing_listener.push_state(constants.STATE_LOADING_SRC):
        update_preprocessing_state()
        full_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                     core_params["prediction_type"])
        logging.info("Loaded full_df df: shape=(%d,%d)" % full_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(full_df, preprocessing_params)
        collector_data = collector.build()

        pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params,
                                                               preprocessing_set['run_folder'],
                                                               preprocessing_params)

    with preprocessing_listener.push_state(constants.KFOLDSTATE_PREPROCESS_GLOBAL):
        update_preprocessing_state()
        transformed_full = pipeline.fit_and_process(full_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    train_X = transformed_full["TRAIN"]
    train_y = transformed_full["target"]

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    calibrate_proba = core_params.get("calibration", {}).get("calibrationMethod", None) in ["SIGMOID", "ISOTONIC"]

    if with_sample_weight:
        assert transformed_full["weight"].values.min() > 0, "Sample weights must be positive"

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        # Bind the current modeling set as a default so the callback keeps
        # pointing at this model even if it is invoked after the loop moved on.
        def update_fn(modeling_set=modeling_set):
            update_one_preprocessing_state(modeling_set)

        if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_FITTING_GLOBAL):
                # no out-fold available, so calibrate through classification_fit on a random split
                if calibrate_proba:
                    calibration_method = core_params.get("calibration", {}).get("calibrationMethod").lower()
                else:
                    calibration_method = None
                update_one_preprocessing_state(modeling_set)
                (clf, out_params, prepared_X, iipd) = classification_fit(
                    modeling_set['modelingParams'], split_desc, transformed_full,
                    core_params["prediction_type"], modeling_set['run_folder'],
                    target_map=preproc_handler.target_map,
                    with_sample_weight=with_sample_weight,
                    with_class_weight=with_class_weight,
                    calibration=calibration_method)
                save_prediction_model(clf, out_params, modeling_set["listener"], update_fn,
                                      modeling_set['run_folder'])

            with modeling_set["listener"].push_state(constants.KFOLDSTATE_SCORING_GLOBAL):
                update_one_preprocessing_state(modeling_set)
                ClassificationModelIntrinsicScorer(modeling_set['modelingParams'], clf, train_X, train_y,
                                                   pipeline, modeling_set['run_folder'], prepared_X, iipd,
                                                   calibrate_proba).score()
                if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
                    BinaryModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                          modeling_set['run_folder'], preproc_handler.target_map,
                                          calibrate_proba).serialize()
                else:
                    MulticlassModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                              modeling_set['run_folder'], preproc_handler.target_map,
                                              calibrate_proba).serialize()
        else:
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_FITTING_GLOBAL):
                update_one_preprocessing_state(modeling_set)
                (clf, out_params, prepared_X, iipd) = regression_fit_single(
                    modeling_set['modelingParams'], split_desc, transformed_full,
                    modeling_set["run_folder"], with_sample_weight=with_sample_weight)
                save_prediction_model(clf, out_params, modeling_set["listener"], update_fn,
                                      modeling_set['run_folder'])

            with modeling_set["listener"].push_state(constants.KFOLDSTATE_SCORING_GLOBAL):
                update_fn()
                RegressionModelIntrinsicScorer(modeling_set['modelingParams'], clf, train_X, train_y,
                                               pipeline, modeling_set['run_folder'], prepared_X,
                                               iipd).score()
                # serialize the model if possible
                RegressionModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                          modeling_set['run_folder']).serialize()

        # The k-fold trainer gets a freshly loaded copy of the full split.
        full_df_clean = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                           core_params["prediction_type"])

        optimized_params = out_params["resolved"]
        logging.info("Regridifying post-train params: %s" % json.dumps(optimized_params))

        # Regridify to a unary grid the optimized params
        optimized_params_grid = intercom.backend_json_call("ml/prediction/regridify-to-pretrain", {
            "preTrain" : json.dumps(modeling_set["modelingParams"]),
            "postTrain" : json.dumps(optimized_params)
        })
        logging.info("Using unary grid params: %s" % json.dumps(optimized_params_grid))

        prediction_train_model_kfold(full_df_clean, core_params, split_desc, preprocessing_params,
                                     optimized_params_grid, preprocessing_set['run_folder'],
                                     modeling_set['run_folder'], modeling_set["listener"], update_fn,
                                     with_sample_weight, with_class_weight, calibrate_proba)

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set['run_folder'], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"
def create_ensemble(split_desc, core_params, model_folder, preprocessing_folder, model_folders, preprocessing_folders):
    """Assemble an ensemble from already fitted models, pickle it and score it.

    ``split_desc``, ``core_params``, ``model_folders`` and
    ``preprocessing_folders`` arrive as JSON strings and are decoded with
    ``dkujson.loads``. The ensemble is written to ``clf.pkl`` in
    ``model_folder`` and scored on the "test" split; the first preprocessing
    folder is reused to recover the target (and weights) of the test rows.

    Returns:
        The string ``"ok"``.
    """
    listener = ProgressListener()
    listener.add_future_steps(constants.ENSEMBLE_STATES)
    start = unix_time_millis()

    def publish_progress():
        utils.write_running_traininfo(model_folder, start, listener)

    split_desc = dkujson.loads(split_desc)
    core_params = dkujson.loads(core_params)

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    # TODO: update downstream
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}

    preprocessing_folders = dkujson.loads(preprocessing_folders)
    model_folders = dkujson.loads(model_folders)

    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "rmodeling_params.json"))
    ensemble_params = modeling_params["ensemble_params"]

    logging.info("creating ensemble")
    with listener.push_state(constants.STATE_ENSEMBLING):
        publish_progress()
        from dataiku.doctor.prediction.ensembles import ensemble_from_fitted
        train = df_from_split_desc(split_desc, "train",
                                   ensemble_params["preprocessing_params"][0]["per_feature"],
                                   core_params["prediction_type"])
        iperf = {}
        iperf["modelInputNRows"] = train.shape[0]  # todo : not the right count as may have dropped ...
        iperf["modelInputNCols"] = -1  # makes no sense for an ensemble as may have different preprocessings
        iperf["modelInputIsSparse"] = False
        dkujson.dump_to_filepath(osp.join(model_folder, "iperf.json"), iperf)
        clf = ensemble_from_fitted(core_params, ensemble_params, preprocessing_folders, model_folders,
                                   train, with_sample_weight, with_class_weight)

    logging.info("saving model")
    with listener.push_state(constants.STATE_SAVING):
        publish_progress()
        pickle_path = osp.join(model_folder, "clf.pkl")
        with open(pickle_path, dku_write_mode_for_pickling()) as f:
            pickle.dump(clf, f, 2)

    logging.info("scoring model")
    with listener.push_state(constants.STATE_SCORING):
        publish_progress()
        test = df_from_split_desc(split_desc, "test",
                                  ensemble_params["preprocessing_params"][0]["per_feature"],
                                  core_params["prediction_type"])

        # Annoying, but one of the previous preprocessings has to be replayed
        # in order to get the target of the test rows.
        prep_folder = preprocessing_folders[0]
        rppp = dkujson.load_from_filepath(osp.join(prep_folder, "rpreprocessing_params.json"))
        collector_data = dkujson.load_from_filepath(osp.join(prep_folder, "collector_data.json"))
        preprocessing_handler = PreprocessingHandler.build(core_params, rppp, prep_folder)
        preprocessing_handler.collector_data = collector_data
        pipe = preprocessing_handler.build_preprocessing_pipeline(with_target=True)
        transformed = pipe.process(test)
        y = transformed["target"]
        sample_weight = transformed["weight"] if with_sample_weight else None

        # The CLF with scorable pipelines is saved by now: switch it to
        # "pipelines with target" mode to be able to compute metrics.
        clf.set_with_target_pipelines_mode(True)

        pred = clf.predict(test)
        if core_params["prediction_type"] == "REGRESSION":
            probas = None
        else:
            probas = clf.predict_proba(test)

        if core_params["prediction_type"] == "REGRESSION":
            target_map = None
        else:
            target_map = {}
            for remapping in ensemble_params["preprocessing_params"][0]["target_remapping"]:
                target_map[remapping["sourceValue"]] = remapping["mappedValue"]

        prediction_type = core_params["prediction_type"]
        if prediction_type == "REGRESSION":
            scorer = RegressionModelScorer(modeling_params, clf, pred, y, model_folder, transformed,
                                           test.index.copy(), sample_weight)
        elif prediction_type == "BINARY_CLASSIFICATION":
            scorer = BinaryClassificationModelScorer(modeling_params, clf, model_folder, pred, probas, y,
                                                     target_map, transformed, test.index.copy(),
                                                     sample_weight)
        else:
            scorer = MulticlassModelScorer(modeling_params, clf, model_folder, pred, probas,
                                           y.astype(int), target_map, transformed, test.index.copy(),
                                           sample_weight)
        scorer.score()

    publish_progress()

    end = unix_time_millis()

    dkujson.dump_to_filepath(osp.join(model_folder, "actual_params.json"), {"resolved": modeling_params})
    dkujson.dump_to_filepath(osp.join(preprocessing_folder, "preprocessing_report.json"), {})
    utils.write_done_traininfo(model_folder, start, end, end, listener, end_preprocessing_time=start)

    return "ok"
def train_prediction_keras(core_params, preprocessing_set, split_desc):
    """Fit the shared preprocessing once, then train each Keras modeling set.

    The train and test splits are loaded, the regular (non-special) pipeline
    is fitted on the train frame (or on a subsample of it when
    ``preprocessingFitSampleRatio`` is below 1), and every modeling set is
    then handed to ``prediction_train_model_keras``.

    Args:
        core_params: Core task parameters; ``prediction_type`` is read here.
        preprocessing_set: Dict providing ``preprocessing_params``,
            ``modelingSets`` and ``run_folder``. Each modeling set gets a
            ``listener`` entry added.
        split_desc: Split description forwarded to ``df_from_split_desc``.

    Returns:
        The string ``"ok"``.
    """
    start = unix_time_millis()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]
    run_folder = preprocessing_set["run_folder"]

    logging.info("PPS is %s" % preprocessing_params)

    # Register all future steps up front so progress data is right from the start.
    preprocessing_listener = ProgressListener()
    preprocessing_listener.add_future_steps(constants.PRED_KERAS_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        model_listener = ProgressListener()
        model_listener.add_future_steps(constants.PRED_KERAS_TRAIN_STATES)
        modeling_set["listener"] = model_listener

    def update_preprocessing_state():
        # Invoked along the preprocessing steps: recompute the status of
        # every model and dump it to disk.
        for one_set in modeling_sets:
            listeners = (preprocessing_listener, one_set["listener"])
            status = utils.make_running_traininfo(one_set["run_folder"], start, listeners)
            utils.write_model_status(one_set, status)

    with preprocessing_listener.push_state(constants.STATE_LOADING_TRAIN):
        update_preprocessing_state()
        train_df = df_from_split_desc(split_desc, "train", preprocessing_params['per_feature'],
                                      core_params["prediction_type"])
        train_df_orig = train_df.copy()
        logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

        # Not implemented in the UI so far, so preprocessor_fit_df will always be train_df
        if preprocessing_params["preprocessingFitSampleRatio"] < 1:
            preprocessor_fit_df = train_df.sample(
                frac=preprocessing_params["preprocessingFitSampleRatio"],
                random_state=preprocessing_params["preprocessingFitSampleSeed"])
        else:
            preprocessor_fit_df = train_df

    with preprocessing_listener.push_state(constants.STATE_LOADING_TEST):
        update_preprocessing_state()
        test_df = df_from_split_desc(split_desc, "test", preprocessing_params['per_feature'],
                                     core_params["prediction_type"])
        logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector_data = PredictionPreprocessingDataCollector(preprocessor_fit_df,
                                                              preprocessing_params).build()

        # Tag special features so that they are only taken into account in
        # special_preproc_handler/special_pipeline.
        per_feature = preprocessing_params["per_feature"]
        tag_special_features(per_feature)

        pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params, run_folder,
                                                               preprocessing_params, allow_empty_mf=True)

    with preprocessing_listener.push_state(constants.KERASSTATE_FIT_NORMAL_PREPROCESSING):
        update_preprocessing_state()
        # The transformed values are retrieved to get the shape of all regular
        # inputs, even though they won't actually be used: each batch of data
        # is processed again.
        transformed_normal = pipeline.fit_and_process(preprocessor_fit_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    # TODO: REVIEW STATES OF TRAINING
    for placeholder_state in (constants.STATE_PREPROCESS_TRAIN, constants.STATE_PREPROCESS_TEST):
        with preprocessing_listener.push_state(placeholder_state):
            update_preprocessing_state()

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        # Env variables that may be accessed in user defined code
        task_loc = modeling_set["fullId"]["taskLoc"]
        remoterun.set_dku_env_var_and_sys_env_var(constants.DKU_CURRENT_ANALYSIS_ID, task_loc["analysisId"])
        remoterun.set_dku_env_var_and_sys_env_var(constants.DKU_CURRENT_MLTASK_ID, task_loc["mlTaskId"])

        def update_modeling_state():
            listeners = (preprocessing_listener, modeling_set["listener"])
            status = utils.make_running_traininfo(modeling_set["run_folder"], start, listeners)
            utils.write_model_status(modeling_set, status)

        prediction_train_model_keras(transformed_normal, train_df_orig, test_df, pipeline,
                                     modeling_set["modelingParams"], core_params, per_feature,
                                     modeling_set["run_folder"], modeling_set["listener"],
                                     update_modeling_state, preproc_handler.target_map,
                                     pipeline.generated_features_mapping)

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set["run_folder"], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"
def train_prediction_models_nosave(core_params, preprocessing_set, split_desc):
    """Regular (mode 1) training: one shared preprocessing, then N models in sequence.

    Steps performed here:
      - Load the "train" and "test" parts of the split (non streamed)
      - Collect preprocessing data on the train set and build the pipeline
      - Fit the preprocessing on train, then apply it to test
      - For every modeling set, in order, delegate to
        prediction_train_score_save (fit, save clf, compute and save clf
        performance, score and save the scored test set, per the original
        notes; that function is defined elsewhere) and then write the
        final train info of that model

    One ProgressListener tracks the shared preprocessing and one more is
    created per modeling set (stored under modeling_set["listener"]). The
    running train info of each model is rewritten whenever the preprocessing
    enters a new state.

    core_params: dict of core parameters; only "prediction_type" is read
        directly here, the dict is otherwise passed through.
    preprocessing_set: dict providing "preprocessing_params", "modelingSets"
        and 'run_folder'.
    split_desc: split description handed to df_from_split_desc.

    Returns the string "ok".
    """
    run_start = unix_time_millis()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]

    logging.info("PPS is %s" % preprocessing_params)

    # Register every expected step up front so that the progress data is
    # meaningful from the very first status dump.
    preprocessing_listener = ProgressListener()
    preprocessing_listener.add_future_steps(constants.PRED_REGULAR_PREPROCESSING_STATES)
    for model_set in modeling_sets:
        model_listener = ProgressListener()
        grid_length = model_set.get('modelingParams', {}).get('gridLength', 1)
        if grid_length > 1:
            model_listener.add_future_step(constants.STATE_GRIDSEARCHING)
        model_listener.add_future_steps(constants.PRED_REGULAR_TRAIN_STATES)
        model_set["listener"] = model_listener

    def _dump_running_status(target_set):
        # Build the "running" train info of one model and write it out.
        run_folder = target_set["run_folder"]
        listeners = (preprocessing_listener, target_set["listener"])
        running_info = utils.make_running_traininfo(run_folder, run_start, listeners)
        utils.write_model_status(target_set, running_info)

    # Invoked at each preprocessing state change: the preprocessing is shared,
    # so the status of every model has to be refreshed.
    def update_preprocessing_state():
        for each_set in modeling_sets:
            _dump_running_status(each_set)

    with preprocessing_listener.push_state(constants.STATE_LOADING_TRAIN):
        update_preprocessing_state()
        train_df = df_from_split_desc(
            split_desc,
            "train",
            preprocessing_params['per_feature'],
            core_params["prediction_type"])
        logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)
        for column_name in train_df:
            logging.info("Train col : %s (%s)" % (column_name, train_df[column_name].dtype))

    with preprocessing_listener.push_state(constants.STATE_LOADING_TEST):
        update_preprocessing_state()
        test_df = df_from_split_desc(
            split_desc,
            "test",
            preprocessing_params['per_feature'],
            core_params["prediction_type"])
        logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector_data = PredictionPreprocessingDataCollector(train_df, preprocessing_params).build()
        pipeline, preproc_handler = build_pipeline_and_handler(
            collector_data,
            core_params,
            preprocessing_set['run_folder'],
            preprocessing_params)

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TRAIN):
        update_preprocessing_state()
        # TODO: fit_and_process should take an update_fn argument
        transformed_train = pipeline.fit_and_process(train_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TEST):
        update_preprocessing_state()
        # Copy the test index before the test frame goes through the pipeline.
        test_df_index = test_df.index.copy()
        transformed_test = pipeline.process(test_df)

    # One last dump once all the preprocessing states have been left.
    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for model_set in modeling_sets:
        model_start = unix_time_millis()

        # Progress callback for the model currently being trained.
        def update_modeling_state():
            _dump_running_status(model_set)

        # since ensembles are never fitted through the doctor, no need to distinguish here
        prediction_train_score_save(
            transformed_train,
            transformed_test,
            test_df_index,
            core_params,
            split_desc,
            model_set["modelingParams"],
            model_set["run_folder"],
            model_set["listener"],
            preproc_handler.target_map,
            update_modeling_state,
            pipeline,
            model_set["run_folder"])

        model_end = unix_time_millis()
        utils.write_done_traininfo(
            model_set["run_folder"],
            run_start,
            model_start,
            model_end,
            (preprocessing_listener, model_set["listener"]),
            end_preprocessing_time=preprocessing_end)

    return "ok"