def main(exec_folder, output_dataset, keptInputColumns):
    start = unix_time_millis()
    listener = ProgressListener()

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    preprocessing_params = json.load(open(osp.join(exec_folder, "rpreprocessing_params.json")))
    modeling_params = json.load(open(osp.join(exec_folder, "rmodeling_params.json")))

    with listener.push_state(constants.STATE_LOADING_SRC):
        input_df = df_from_split_desc_no_normalization(split_desc, "full", preprocessing_params["per_feature"])
        logging.info("Loaded full df: shape=(%d,%d)" % input_df.shape)
        input_df_orig = input_df.copy()
        input_df = utils.normalize_dataframe(input_df, preprocessing_params["per_feature"])

    with listener.push_state("Collecting preprocessing data"):
        collector = ClusteringPreprocessingDataCollector(input_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params, exec_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with listener.push_state("Preprocessing data"):
        transformed_train = pipeline.fit_and_process(input_df)

    start_train = unix_time_millis()

    (clf, actual_params, cluster_labels, additional_columns) = clustering_fit(modeling_params, transformed_train)

    # If the model has custom labels, use them
    try:
        cluster_names = clf.get_cluster_labels()
    except AttributeError:
        cluster_names = ["cluster_%s" % i for i in range(len(np.unique(cluster_labels)))]

    cl = pd.Series(data=cluster_labels, name="cluster_labels").map(lambda i: cluster_names[i])
    cl.index = transformed_train["TRAIN"].index

    final_df = pd.concat([input_df_orig.join(cl, how='left'), additional_columns], axis=1)

    if keptInputColumns is not None:
        final_df = final_df[keptInputColumns + ['cluster_labels']]

    if preprocessing_params["outliers"]["method"] == "CLUSTER":
        final_df['cluster_labels'].fillna(constants.CLUSTER_OUTLIERS, inplace=True)

    dataiku.Dataset(output_dataset).write_from_dataframe(final_df)

    end = unix_time_millis()
    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
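
# Illustrative usage sketch for the clustering-recipe entry point above (not part of the original
# script; the folder and dataset names below are hypothetical). The exec_folder is assumed to be
# laid out by the backend and to contain _esplit.json, rpreprocessing_params.json and
# rmodeling_params.json:
#
#   main("/path/to/exec_folder", "clustered_output", keptInputColumns=None)
#
# With keptInputColumns=None, every input column is kept next to the generated "cluster_labels"
# column (plus any model-specific additional columns); passing a list restricts the output to
# those columns plus "cluster_labels".
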
def main(exec_folder):
    start = unix_time_millis()
    listener = ProgressListener()

    def update_fn():
        utils.write_running_traininfo(exec_folder, start, listener)

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    preprocessing_params = json.load(open(osp.join(exec_folder, "rpreprocessing_params.json")))
    modeling_params = json.load(open(osp.join(exec_folder, "rmodeling_params.json")))

    with listener.push_state(constants.STATE_LOADING_SRC):
        update_fn()
        train_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"])
        logging.info("Loaded full df: shape=(%d,%d)" % train_df.shape)

    with listener.push_state("Collecting preprocessing data"):
        update_fn()
        collector = ClusteringPreprocessingDataCollector(train_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params, exec_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with listener.push_state("Preprocessing data"):
        orig_index = train_df.index.copy()
        transformed_train = pipeline.fit_and_process(train_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    start_train = unix_time_millis()

    clustering_train_score_save(transformed_train, orig_index, preprocessing_params, modeling_params,
                                exec_folder, listener, update_fn, pipeline)

    end = unix_time_millis()
    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
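
# The ProgressListener / update_fn pattern used above (and in the entry points below) works roughly
# as follows: push_state is used as a context manager that records the step currently running, and
# update_fn persists the listener state to disk (via utils.write_running_traininfo or
# utils.write_model_status) so the backend can poll progress. A minimal sketch:
#
#   listener = ProgressListener()
#   with listener.push_state("Preprocessing data"):
#       update_fn()   # write the "running" train info for this step
#       ...           # do the actual work
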
def main(exec_folder, selection_state_folder, operation_mode):
    """The whole execution of the saved model train takes place in a single folder?"""
    start = unix_time_millis()
    start_train = start
    listener = ProgressListener()

    def update_fn():
        utils.write_running_traininfo(exec_folder, start, listener)

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    core_params = json.load(open(osp.join(exec_folder, "core_params.json")))
    preprocessing_params = json.load(open(osp.join(exec_folder, "rpreprocessing_params.json")))

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    calibrate_proba = core_params.get("calibration", {}).get("calibrationMethod", None) in ["SIGMOID", "ISOTONIC"]

    modeling_params = json.load(open(osp.join(exec_folder, "rmodeling_params.json")))

    # For the KERAS backend, special features must be tagged, because they are only handled by the
    # process function, not by fit_and_process
    if modeling_params["algorithm"] == "KERAS_CODE":
        tag_special_features(preprocessing_params['per_feature'])

    def do_full_fit_and_save():
        """Fit on 100% of the data and save the clf and out params"""
        with listener.push_state(constants.STATE_LOADING_TRAIN):
            update_fn()
            full_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                         core_params["prediction_type"])
            logging.info("Loaded FULL df: shape=(%d,%d)" % full_df.shape)

        with listener.push_state("Collecting preprocessing data"):
            update_fn()
            collector = ClusteringPreprocessingDataCollector(full_df, preprocessing_params)
            collector_data = collector.build()

        pipeline, preproc_handler = build_pipeline_and_handler(
            collector_data, core_params, exec_folder, preprocessing_params,
            selection_state_folder=selection_state_folder,
            allow_empty_mf=modeling_params["algorithm"] == "KERAS_CODE")  # TODO

        if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
            target_map = preproc_handler.target_map
        else:
            target_map = None

        with listener.push_state("Preprocessing full set"):
            preprocessor_fit_full_df = full_df
            # For the KERAS backend, we might need to take a subsample of the input_df to prevent memory errors
            if modeling_params["algorithm"] == "KERAS_CODE":
                need_subsampling = preprocessing_params["preprocessingFitSampleRatio"] < 1
                full_df_orig = full_df.copy()
                if need_subsampling:
                    preprocessor_fit_full_df = preprocessor_fit_full_df.sample(
                        frac=preprocessing_params["preprocessingFitSampleRatio"],
                        random_state=preprocessing_params["preprocessingFitSampleSeed"])
            transformed_full = pipeline.fit_and_process(preprocessor_fit_full_df)
            if with_sample_weight:
                assert transformed_full["weight"].values.min() > 0, "Sample weights must be positive"

        preproc_handler.save_data()
        preproc_handler.report(pipeline)

        if modeling_params["algorithm"] == "KERAS_CODE":
            modeling_set = {"run_folder": exec_folder, "listener": listener}

            def update_modeling_state():
                status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                      modeling_set["listener"])
                utils.write_model_status(modeling_set, status)

            empty_df = pd.DataFrame()
            return prediction_train_model_keras(transformed_full, full_df_orig, empty_df, pipeline,
                                                modeling_params, core_params,
                                                preprocessing_params["per_feature"], exec_folder,
                                                listener, update_modeling_state,
                                                preproc_handler.target_map,
                                                pipeline.generated_features_mapping)
        else:
            return fit_score_save(pipeline, target_map, transformed_full)

    def fit_score_save(pipeline, target_map, transformed_full):
        with listener.push_state(constants.STATE_FITTING):
            update_fn()
            if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
                (clf, out_params, prepared_X, iipd) = classification_fit(
                    modeling_params, split_desc, transformed_full, core_params["prediction_type"],
                    exec_folder, target_map=target_map,
                    with_sample_weight=with_sample_weight, with_class_weight=with_class_weight)

                if calibrate_proba:
                    method = core_params.get("calibration", {}).get("calibrationMethod").lower()
                    calibrated_clf = CalibratedClassifierCV(clf, cv="prefit", method=method)
                    test_X = transformed_full["TRAIN"]
                    test_X, is_sparse = prepare_multiframe(test_X, modeling_params)
                    test_y = transformed_full["target"].astype(int)
                    if with_sample_weight:
                        test_weight = transformed_full["weight"].astype(float)
                        calibrated_clf.fit(test_X, test_y, sample_weight=test_weight)
                    else:
                        calibrated_clf.fit(test_X, test_y)
                    clf = calibrated_clf
            else:
                (clf, out_params, prepared_X, iipd) = regression_fit_single(
                    modeling_params, split_desc, transformed_full, exec_folder,
                    with_sample_weight=with_sample_weight)

        with listener.push_state(constants.STATE_SAVING):
            save_prediction_model(clf, out_params, listener, update_fn, exec_folder)

        with listener.push_state(constants.STATE_SCORING):
            train_X = transformed_full["TRAIN"]
            train_y = transformed_full["target"]
            if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
                ClassificationModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                   exec_folder, prepared_X, iipd, calibrate_proba).score()
                BinaryModelSerializer(train_X.columns(), clf, modeling_params, exec_folder,
                                      target_map, calibrate_proba).serialize()
            elif core_params["prediction_type"] == constants.MULTICLASS:
                ClassificationModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                   exec_folder, prepared_X, iipd, calibrate_proba).score()
                MulticlassModelSerializer(train_X.columns(), clf, modeling_params, exec_folder,
                                          target_map, calibrate_proba).serialize()
            else:
                RegressionModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                               exec_folder, prepared_X, iipd).score()
                RegressionModelSerializer(train_X.columns(), clf, modeling_params, exec_folder).serialize()

        return out_params

    if operation_mode == "TRAIN_SPLITTED_ONLY":
        with listener.push_state(constants.STATE_LOADING_TRAIN):
            update_fn()
            train_df = df_from_split_desc(split_desc, "train", preprocessing_params["per_feature"],
                                          core_params["prediction_type"])
            logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

        with listener.push_state(constants.STATE_LOADING_TEST):
            update_fn()
            test_df = df_from_split_desc(split_desc, "test", preprocessing_params["per_feature"],
                                         core_params["prediction_type"])
            logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

        with listener.push_state("Collecting preprocessing data"):
            update_fn()
            collector = PredictionPreprocessingDataCollector(train_df, preprocessing_params)
            collector_data = collector.build()

        pipeline, preproc_handler = build_pipeline_and_handler(
            collector_data, core_params, exec_folder, preprocessing_params,
            selection_state_folder=selection_state_folder,
            allow_empty_mf=modeling_params["algorithm"] == "KERAS_CODE")  # TODO

        if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
            target_map = preproc_handler.target_map
        else:
            target_map = None

        with listener.push_state("Preprocessing train set"):
            preprocessor_fit_df = train_df
            # For the KERAS backend, we might need to take a subsample of the input_df to prevent memory errors
            if modeling_params["algorithm"] == "KERAS_CODE":
                train_df_orig = train_df.copy()
                need_subsampling = preprocessing_params["preprocessingFitSampleRatio"] < 1
                if need_subsampling:
                    preprocessor_fit_df = preprocessor_fit_df.sample(
                        frac=preprocessing_params["preprocessingFitSampleRatio"],
                        random_state=preprocessing_params["preprocessingFitSampleSeed"])
            transformed_train = pipeline.fit_and_process(preprocessor_fit_df)
            if with_sample_weight:
                assert transformed_train["weight"].values.min() > 0, "Sample weights must be positive"
            preproc_handler.save_data()
            preproc_handler.report(pipeline)

        # For the KERAS backend, the test set cannot be processed directly, because it may contain
        # special features that may not hold in memory
        if modeling_params["algorithm"] != "KERAS_CODE":
            with listener.push_state("Preprocessing test set"):
                test_df_index = test_df.index.copy()
                transformed_test = pipeline.process(test_df)
                if with_sample_weight:
                    assert transformed_test["weight"].values.min() > 0, "Sample weights must be positive"

        if modeling_params["algorithm"] == "PYTHON_ENSEMBLE":
            prediction_train_score_save_ensemble(train_df, test_df, core_params, split_desc, modeling_params,
                                                 exec_folder, listener, target_map, update_fn, pipeline,
                                                 with_sample_weight)
        elif modeling_params["algorithm"] == "KERAS_CODE":
            modeling_set = {"run_folder": exec_folder, "listener": listener}

            def update_modeling_state():
                status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                      modeling_set["listener"])
                utils.write_model_status(modeling_set, status)

            prediction_train_model_keras(transformed_train, train_df_orig, test_df, pipeline,
                                         modeling_params, core_params, preprocessing_params["per_feature"],
                                         exec_folder, listener, update_modeling_state,
                                         preproc_handler.target_map, pipeline.generated_features_mapping)
        else:
            prediction_train_score_save(transformed_train, transformed_test, test_df_index, core_params,
                                        split_desc, modeling_params, exec_folder, listener, target_map,
                                        update_fn, pipeline, exec_folder)

    elif operation_mode == "TRAIN_FULL_ONLY":
        # Not yet functional ...
        do_full_fit_and_save()

    elif operation_mode == "TRAIN_KFOLD":
        out_params = do_full_fit_and_save()
        full_df_clean = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                           core_params["prediction_type"])
        optimized_params = out_params["resolved"]
        logging.info("Regridifying post-train params: %s" % json.dumps(optimized_params))
        # Regridify the optimized params to a unary grid
        optimized_params_grid = intercom.backend_json_call("ml/prediction/regridify-to-pretrain", {
            "preTrain": json.dumps(modeling_params),
            "postTrain": json.dumps(optimized_params)
        })
        logging.info("Using unary grid params: %s" % json.dumps(optimized_params_grid))
        prediction_train_model_kfold(full_df_clean, core_params, split_desc, preprocessing_params,
                                     optimized_params_grid, exec_folder, exec_folder, listener, update_fn,
                                     with_sample_weight, with_class_weight, calibrate_proba)

    else:
        do_full_fit_and_save()

        # Do the split and the scoring, but don't save the data
        with listener.push_state(constants.STATE_LOADING_TRAIN):
            update_fn()
            train_df = df_from_split_desc(split_desc, "train", preprocessing_params["per_feature"],
                                          core_params["prediction_type"])
            logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

        with listener.push_state(constants.STATE_LOADING_TEST):
            update_fn()
            test_df = df_from_split_desc(split_desc, "test", preprocessing_params["per_feature"],
                                         core_params["prediction_type"])
            logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

        with listener.push_state("Collecting preprocessing data"):
            update_fn()
            collector = PredictionPreprocessingDataCollector(train_df, preprocessing_params)
            collector_data = collector.build()

        pipeline, preproc_handler = build_pipeline_and_handler(
            collector_data, core_params, exec_folder, preprocessing_params,
            selection_state_folder=selection_state_folder,
            allow_empty_mf=modeling_params["algorithm"] == "KERAS_CODE")  # TODO

        if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
            target_map = preproc_handler.target_map
        else:
            target_map = None

        with listener.push_state("Preprocessing train set"):
            preprocessor_fit_df = train_df
            # For the KERAS backend, we might need to take a subsample of the input_df to prevent memory errors
            if modeling_params["algorithm"] == "KERAS_CODE":
                need_subsampling = preprocessing_params["preprocessingFitSampleRatio"] < 1
                train_df_orig = train_df.copy()
                if need_subsampling:
                    preprocessor_fit_df = preprocessor_fit_df.sample(
                        frac=preprocessing_params["preprocessingFitSampleRatio"],
                        random_state=preprocessing_params["preprocessingFitSampleSeed"])
            transformed_train = pipeline.fit_and_process(preprocessor_fit_df)

        # For the KERAS backend, the test set cannot be processed directly, because it may contain
        # special features that may not hold in memory
        if modeling_params["algorithm"] != "KERAS_CODE":
            with listener.push_state("Preprocessing test set"):
                test_df_index = test_df.index.copy()
                transformed_test = pipeline.process(test_df)

        if modeling_params["algorithm"] == "KERAS_CODE":
            modeling_set = {"run_folder": exec_folder, "listener": listener}

            def update_modeling_state():
                status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                      modeling_set["listener"])
                utils.write_model_status(modeling_set, status)

            prediction_train_model_keras(transformed_train, train_df_orig, test_df, pipeline,
                                         modeling_params, core_params, preprocessing_params["per_feature"],
                                         exec_folder, listener, update_modeling_state,
                                         preproc_handler.target_map, pipeline.generated_features_mapping,
                                         save_model=False)
        else:
            with listener.push_state(constants.STATE_FITTING):
                update_fn()
                if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
                    (clf, out_params, prepared_X, iipd) = classification_fit(
                        modeling_params, split_desc, transformed_train, core_params["prediction_type"],
                        target_map=target_map, with_sample_weight=with_sample_weight,
                        with_class_weight=with_class_weight)
                else:
                    (clf, out_params, prepared_X, iipd) = regression_fit_single(
                        modeling_params, split_desc, transformed_train, exec_folder,
                        with_sample_weight=with_sample_weight)

            with listener.push_state(constants.STATE_SCORING):
                train_X = transformed_train["TRAIN"]
                train_y = transformed_train["target"]
                if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
                    ClassificationModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                       exec_folder, prepared_X, iipd, calibrate_proba).score()
                    BinaryModelSerializer(train_X.columns(), clf, modeling_params, exec_folder,
                                          target_map).serialize()
                    binary_classification_scorer_with_valid(
                        modeling_params, clf, transformed_test, exec_folder, test_df_index,
                        target_map=target_map, with_sample_weight=with_sample_weight).score()
                elif core_params["prediction_type"] == constants.MULTICLASS:
                    ClassificationModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                       exec_folder, prepared_X, iipd, calibrate_proba).score()
                    MulticlassModelSerializer(train_X.columns(), clf, modeling_params, exec_folder,
                                              target_map).serialize()
                    multiclass_scorer_with_valid(
                        modeling_params, clf, transformed_test, exec_folder, test_df_index,
                        target_map=target_map, with_sample_weight=with_sample_weight).score()
                else:
                    RegressionModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                   exec_folder, prepared_X, iipd).score()
                    RegressionModelSerializer(train_X.columns(), clf, modeling_params,
                                              exec_folder).serialize()
                    regression_scorer_with_valid(modeling_params, clf, transformed_test, exec_folder,
                                                 test_df_index, with_sample_weight).score()

    end = unix_time_millis()
    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
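
# Dispatch summary for the prediction-train entry point above, as implemented by its branches:
#   "TRAIN_SPLITTED_ONLY" -> fit on the train split, preprocess and score the test split, save the model
#   "TRAIN_FULL_ONLY"     -> do_full_fit_and_save() only (marked "not yet functional" above)
#   "TRAIN_KFOLD"         -> fit and save on 100% of the data, then re-train per fold via
#                            prediction_train_model_kfold using the regridified optimal parameters
#   any other mode        -> fit and save on 100% of the data, then redo a train/test split and score
#                            it without saving the split models
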
def main(model_folder, input_dataset_smartname, output_dataset_smartname, recipe_desc, script,
         preparation_output_schema, cond_outputs=None):
    # Obtain a streamed result of the preparation
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" % preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"], preparation_output_schema)

    listener = ProgressListener()

    core_params = dkujson.load_from_filepath(osp.join(model_folder, "core_params.json"))
    preprocessing_params = dkujson.load_from_filepath(osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "actual_params.json"))["resolved"]
    collector_data = dkujson.load_from_filepath(osp.join(model_folder, "collector_data.json"))

    preprocessing_handler = PreprocessingHandler.build(core_params, preprocessing_params, model_folder)
    preprocessing_handler.collector_data = collector_data
    pipeline = preprocessing_handler.build_preprocessing_pipeline()

    batch_size = recipe_desc.get("pythonBatchSize", 100000)
    logging.info("Scoring with batch size: {}".format(batch_size))

    with open(osp.join(model_folder, "clf.pkl"), "rb") as f:
        clf = pickle.load(f)

    def output_generator():
        logging.info("Start output generator ...")
        (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
            preparation_output_schema["columns"], parse_dates=True, infer_with_pandas=False)
        logging.info("Reading with INITIAL dtypes: %s" % dtypes)
        dtypes = utils.ml_dtypes_from_dss_schema(preparation_output_schema,
                                                 preprocessing_params["per_feature"],
                                                 prediction_type=core_params["prediction_type"])
        logging.info("Reading with dtypes: %s" % dtypes)

        for i in xrange(0, len(names)):
            logging.info("Column %s = %s (dtype=%s)" % (i, names[i], dtypes.get(names[i], None)))

        for input_df in input_dataset.iter_dataframes_forced_types(names, dtypes, parse_date_columns,
                                                                   chunksize=batch_size,
                                                                   float_precision="round_trip"):
            input_df.index = range(input_df.shape[0])
            input_df_orig = input_df.copy()
            logging.info("Got a dataframe: %s" % str(input_df.shape))
            normalize_dataframe(input_df, preprocessing_params['per_feature'])

            for col in input_df:
                logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

            logging.info("Processing it")
            logging.info("Predicting it")

            if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
                pred_df = binary_classification_predict(
                    clf, pipeline, modeling_params, preprocessing_params,
                    preprocessing_handler.target_map,
                    recipe_desc["forcedClassifierThreshold"], input_df,
                    output_probas=recipe_desc["outputProbabilities"])
                # Probability percentile & conditional outputs
                pred_df = binary_classif_scoring_add_percentile_and_cond_outputs(
                    pred_df, recipe_desc, model_folder, cond_outputs, preprocessing_handler.target_map)
            elif core_params["prediction_type"] == constants.MULTICLASS:
                pred_df = multiclass_predict(clf, pipeline, modeling_params, preprocessing_params,
                                             preprocessing_handler.target_map, input_df,
                                             output_probas=recipe_desc["outputProbabilities"])
            elif core_params["prediction_type"] == constants.REGRESSION:
                pred_df = regression_predict(clf, pipeline, modeling_params, input_df)
            else:
                raise ValueError("bad prediction type %s" % core_params["prediction_type"])

            logging.info("pred df debug :")
            logging.info(pred_df)
            logging.info("Done predicting it")

            if recipe_desc.get("filterInputColumns", False):
                clean_kept_columns = [c for c in recipe_desc["keptInputColumns"] if c not in pred_df.columns]
            else:
                clean_kept_columns = [c for c in input_df_orig.columns if c not in pred_df.columns]

            yield pd.concat([input_df_orig[clean_kept_columns], pred_df], axis=1)

    output_dataset = dataiku.Dataset(output_dataset_smartname)
    logging.info("Starting writer")
    with output_dataset.get_writer() as writer:
        i = 0
        logging.info("Starting to iterate")
        for output_df in output_generator():
            logging.info("Generator generated a df %s" % str(output_df.shape))
            # if i == 0:
            #     output_dataset.write_schema_from_dataframe(output_df)
            i = i + 1
            writer.write_dataframe(output_df)
            logging.info("Output df written")
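
# Minimal sketch of the chunked scoring loop above, assuming the same dataiku.Dataset API
# (score_chunk is a hypothetical stand-in for the per-chunk predict + concat logic):
#
#   with output_dataset.get_writer() as writer:
#       for chunk in input_dataset.iter_dataframes_forced_types(names, dtypes, parse_date_columns,
#                                                               chunksize=batch_size):
#           writer.write_dataframe(score_chunk(chunk))
#
# Resetting each chunk's index (input_df.index = range(input_df.shape[0])) is what keeps
# input_df_orig and pred_df aligned when they are concatenated with pd.concat(..., axis=1).
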
def main(model_folder, input_dataset_smartname, output_dataset_smartname, recipe_desc, script,
         preparation_output_schema):
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" % preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"], preparation_output_schema)

    listener = ProgressListener()

    preprocessing_params = dkujson.load_from_filepath(osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "actual_params.json"))["resolved"]
    collector_data = dkujson.load_from_filepath(osp.join(model_folder, "collector_data.json"))

    # Name remapping
    user_meta = dkujson.load_from_filepath(osp.join(model_folder, "user_meta.json"))
    cluster_name_map = {}
    if "clusterMetas" in user_meta:
        logging.info("Cluster metas: %s" % user_meta["clusterMetas"])
        for (cluster_id, cluster_data) in user_meta["clusterMetas"].items():
            cluster_name_map[cluster_id] = cluster_data["name"]

    preprocessing_handler = ClusteringPreprocessingHandler({}, preprocessing_params, model_folder)
    preprocessing_handler.collector_data = collector_data
    pipeline = preprocessing_handler.build_preprocessing_pipeline()

    with open(osp.join(model_folder, "clusterer.pkl"), "rb") as f:
        clf = pickle.load(f)

    try:
        logging.info("Post-processing model")
        clf.post_process(user_meta)
    except AttributeError:
        # The method does not exist if the model cannot be post-processed, so just pass
        pass

    try:
        custom_labels = clf.get_cluster_labels()

        def map_fun_custom(i):
            name = custom_labels[i]
            return cluster_name_map.get(name, name)

        naming = map_fun_custom
    except AttributeError:
        def map_fun(i):
            name = "cluster_%i" % i
            return cluster_name_map.get(name, name)

        naming = map_fun

    def output_generator():
        logging.info("Start output generator ...")
        (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
            preparation_output_schema["columns"], parse_dates=True, infer_with_pandas=False)
        logging.info("Reading with INITIAL dtypes: %s" % dtypes)
        dtypes = utils.ml_dtypes_from_dss_schema(preparation_output_schema,
                                                 preprocessing_params["per_feature"])
        logging.info("Reading with dtypes: %s" % dtypes)

        for input_df in input_dataset.iter_dataframes_forced_types(names, dtypes, parse_date_columns,
                                                                   chunksize=100000):
            input_df.index = range(input_df.shape[0])
            input_df_orig = input_df.copy()
            if recipe_desc.get("filterInputColumns", False):
                input_df_orig = input_df_orig[recipe_desc["keptInputColumns"]]
            logging.info("Got a dataframe: %s" % str(input_df.shape))
            normalize_dataframe(input_df, preprocessing_params['per_feature'])

            for col in input_df:
                logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

            logging.info("Processing it")
            transformed = pipeline.process(input_df)

            logging.info("Applying it")
            (labels_arr, additional_columns) = clustering_predict(modeling_params, clf, transformed)
            cluster_labels = pd.Series(labels_arr, name="cluster_labels").map(naming)
            cluster_labels.index = transformed["TRAIN"].index

            final_df = pd.concat([input_df_orig.join(cluster_labels, how='left'), additional_columns], axis=1)

            if preprocessing_params["outliers"]["method"] == "CLUSTER":
                outliers_cluster_name = cluster_name_map.get(constants.CLUSTER_OUTLIERS,
                                                             constants.CLUSTER_OUTLIERS)
                final_df['cluster_labels'].fillna(outliers_cluster_name, inplace=True)

            logging.info("Done predicting it")
            yield final_df

    output_dataset = dataiku.Dataset(output_dataset_smartname)
    logging.info("Starting writer")
    with output_dataset.get_writer() as writer:
        i = 0
        logging.info("Starting to iterate")
        for output_df in output_generator():
            logging.info("Generator generated a df %s" % str(output_df.shape))
            # if i == 0:
            #     output_dataset.write_schema_from_dataframe(output_df)
            i = i + 1
            writer.write_dataframe(output_df)
            logging.info("Output df written")
def train_prediction_kfold(core_params, preprocessing_set, split_desc):
    start = unix_time_millis()
    preprocessing_params = preprocessing_set['preprocessing_params']
    modeling_sets = preprocessing_set["modelingSets"]
    logging.info("PPS is %s" % preprocessing_params)

    preprocessing_listener = ProgressListener()
    preprocessing_listener.add_future_steps(constants.PRED_KFOLD_PREPROCESSING_STATES)

    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.PRED_KFOLD_TRAIN_STATES)
        modeling_set["listener"] = listener

    def update_one_preprocessing_state(modeling_set):
        status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                              (preprocessing_listener, modeling_set["listener"]))
        utils.write_model_status(modeling_set, status)

    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            update_one_preprocessing_state(modeling_set)

    with preprocessing_listener.push_state(constants.STATE_LOADING_SRC):
        update_preprocessing_state()
        full_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                     core_params["prediction_type"])
        logging.info("Loaded full_df df: shape=(%d,%d)" % full_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(full_df, preprocessing_params)
        collector_data = collector.build()

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params,
                                                           preprocessing_set['run_folder'],
                                                           preprocessing_params)

    with preprocessing_listener.push_state(constants.KFOLDSTATE_PREPROCESS_GLOBAL):
        update_preprocessing_state()
        transformed_full = pipeline.fit_and_process(full_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    train_X = transformed_full["TRAIN"]
    train_y = transformed_full["target"]

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    calibrate_proba = core_params.get("calibration", {}).get("calibrationMethod", None) in ["SIGMOID", "ISOTONIC"]

    if with_sample_weight:
        assert transformed_full["weight"].values.min() > 0, "Sample weights must be positive"

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()
        update_fn = lambda: update_one_preprocessing_state(modeling_set)

        if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_FITTING_GLOBAL):
                # No out-fold is available, so calibrate through classification_fit on a random split
                if calibrate_proba:
                    calibration_method = core_params.get("calibration", {}).get("calibrationMethod").lower()
                else:
                    calibration_method = None
                update_one_preprocessing_state(modeling_set)
                (clf, out_params, prepared_X, iipd) = classification_fit(
                    modeling_set['modelingParams'], split_desc, transformed_full,
                    core_params["prediction_type"], modeling_set['run_folder'],
                    target_map=preproc_handler.target_map,
                    with_sample_weight=with_sample_weight,
                    with_class_weight=with_class_weight,
                    calibration=calibration_method)
            save_prediction_model(clf, out_params, modeling_set["listener"], update_fn,
                                  modeling_set['run_folder'])
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_SCORING_GLOBAL):
                update_one_preprocessing_state(modeling_set)
                ClassificationModelIntrinsicScorer(modeling_set['modelingParams'], clf, train_X, train_y,
                                                   pipeline, modeling_set['run_folder'], prepared_X, iipd,
                                                   calibrate_proba).score()
                if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
                    BinaryModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                          modeling_set['run_folder'], preproc_handler.target_map,
                                          calibrate_proba).serialize()
                else:
                    MulticlassModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                              modeling_set['run_folder'], preproc_handler.target_map,
                                              calibrate_proba).serialize()
        else:
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_FITTING_GLOBAL):
                update_one_preprocessing_state(modeling_set)
                (clf, out_params, prepared_X, iipd) = regression_fit_single(
                    modeling_set['modelingParams'], split_desc, transformed_full,
                    modeling_set["run_folder"], with_sample_weight=with_sample_weight)
            save_prediction_model(clf, out_params, modeling_set["listener"], update_fn,
                                  modeling_set['run_folder'])
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_SCORING_GLOBAL):
                update_fn()
                RegressionModelIntrinsicScorer(modeling_set['modelingParams'], clf, train_X, train_y,
                                               pipeline, modeling_set['run_folder'], prepared_X,
                                               iipd).score()
                # Serialize the model if possible
                RegressionModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                          modeling_set['run_folder']).serialize()

        full_df_clean = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                           core_params["prediction_type"])
        optimized_params = out_params["resolved"]
        logging.info("Regridifying post-train params: %s" % json.dumps(optimized_params))
        # Regridify the optimized params to a unary grid
        optimized_params_grid = intercom.backend_json_call("ml/prediction/regridify-to-pretrain", {
            "preTrain": json.dumps(modeling_set["modelingParams"]),
            "postTrain": json.dumps(optimized_params)
        })
        logging.info("Using unary grid params: %s" % json.dumps(optimized_params_grid))
        prediction_train_model_kfold(full_df_clean, core_params, split_desc, preprocessing_params,
                                     optimized_params_grid, preprocessing_set['run_folder'],
                                     modeling_set['run_folder'], modeling_set["listener"], update_fn,
                                     with_sample_weight, with_class_weight, calibrate_proba)

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set['run_folder'], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"
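
# The "regridify-to-pretrain" backend call used above collapses the pre-train hyperparameter grid
# onto the single point actually selected by the search (a "unary grid"), so that the k-fold
# re-training fits each fold with the already-optimized parameters rather than re-running the
# search. The payload is exactly the one built above:
#
#   optimized_params_grid = intercom.backend_json_call("ml/prediction/regridify-to-pretrain", {
#       "preTrain": json.dumps(modeling_set["modelingParams"]),
#       "postTrain": json.dumps(out_params["resolved"])
#   })
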
def create_ensemble(split_desc, core_params, model_folder, preprocessing_folder, model_folders,
                    preprocessing_folders):
    listener = ProgressListener()
    listener.add_future_steps(constants.ENSEMBLE_STATES)
    start = unix_time_millis()

    def update_preprocessing_state():
        utils.write_running_traininfo(model_folder, start, listener)

    split_desc = dkujson.loads(split_desc)
    core_params = dkujson.loads(core_params)

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    # TODO: update downstream
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}

    preprocessing_folders = dkujson.loads(preprocessing_folders)
    model_folders = dkujson.loads(model_folders)
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "rmodeling_params.json"))
    ensemble_params = modeling_params["ensemble_params"]

    logging.info("creating ensemble")
    with listener.push_state(constants.STATE_ENSEMBLING):
        update_preprocessing_state()
        from dataiku.doctor.prediction.ensembles import ensemble_from_fitted
        train = df_from_split_desc(split_desc, "train",
                                   ensemble_params["preprocessing_params"][0]["per_feature"],
                                   core_params["prediction_type"])
        iperf = {
            # TODO: not the right count, as some rows may have been dropped ...
            "modelInputNRows": train.shape[0],
            # Makes no sense for an ensemble, as members may have different preprocessings
            "modelInputNCols": -1,
            "modelInputIsSparse": False
        }
        dkujson.dump_to_filepath(osp.join(model_folder, "iperf.json"), iperf)
        clf = ensemble_from_fitted(core_params, ensemble_params, preprocessing_folders, model_folders,
                                   train, with_sample_weight, with_class_weight)

    logging.info("saving model")
    with listener.push_state(constants.STATE_SAVING):
        update_preprocessing_state()
        with open(osp.join(model_folder, "clf.pkl"), dku_write_mode_for_pickling()) as f:
            pickle.dump(clf, f, 2)

    logging.info("scoring model")
    with listener.push_state(constants.STATE_SCORING):
        update_preprocessing_state()
        test = df_from_split_desc(split_desc, "test",
                                  ensemble_params["preprocessing_params"][0]["per_feature"],
                                  core_params["prediction_type"])
        # This is annoying, but we have to use one of the previous preprocessings in order to get the target
        prep_folder = preprocessing_folders[0]
        rppp = dkujson.load_from_filepath(osp.join(prep_folder, "rpreprocessing_params.json"))
        collector_data = dkujson.load_from_filepath(osp.join(prep_folder, "collector_data.json"))
        preprocessing_handler = PreprocessingHandler.build(core_params, rppp, prep_folder)
        preprocessing_handler.collector_data = collector_data
        pipe = preprocessing_handler.build_preprocessing_pipeline(with_target=True)
        transformed = pipe.process(test)
        y = transformed["target"]
        if with_sample_weight:
            sample_weight = transformed["weight"]
        else:
            sample_weight = None

        # Now that the CLF with scorable pipelines has been saved, set it in "pipelines with target" mode
        # to be able to compute metrics
        clf.set_with_target_pipelines_mode(True)

        pred = clf.predict(test)
        probas = None if core_params["prediction_type"] == "REGRESSION" else clf.predict_proba(test)
        target_map = None if core_params["prediction_type"] == "REGRESSION" else \
            {t["sourceValue"]: t["mappedValue"]
             for t in ensemble_params["preprocessing_params"][0]["target_remapping"]}
        prediction_type = core_params["prediction_type"]
        if prediction_type == "REGRESSION":
            RegressionModelScorer(modeling_params, clf, pred, y, model_folder, transformed,
                                  test.index.copy(), sample_weight).score()
        elif prediction_type == "BINARY_CLASSIFICATION":
            BinaryClassificationModelScorer(modeling_params, clf, model_folder, pred, probas, y, target_map,
                                            transformed, test.index.copy(), sample_weight).score()
        else:
            MulticlassModelScorer(modeling_params, clf, model_folder, pred, probas, y.astype(int),
                                  target_map, transformed, test.index.copy(), sample_weight).score()

    update_preprocessing_state()
    end = unix_time_millis()
    dkujson.dump_to_filepath(osp.join(model_folder, "actual_params.json"), {"resolved": modeling_params})
    dkujson.dump_to_filepath(osp.join(preprocessing_folder, "preprocessing_report.json"), {})
    utils.write_done_traininfo(model_folder, start, end, end, listener, end_preprocessing_time=start)
    return "ok"
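
# Note on the ensemble scoring path above: the ensemble clf is pickled first while its member
# pipelines are in "scorable" mode, and only then switched to "pipelines with target" mode so that
# predict()/predict_proba() can be evaluated against the target extracted by the first member's
# preprocessing. The target_map handed to the classification scorers is rebuilt from that member's
# target_remapping, e.g. (illustrative values) [{"sourceValue": "yes", "mappedValue": 1}]
# becomes {"yes": 1}.
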
def train_clustering_models_nosave(split_desc, preprocessing_set):
    """Regular (mode 1) train:
     - Non-streamed single split + fit preprocess on train + preprocess test
     - Fit N models sequentially
       - Fit
       - Save clf
       - Compute and save clf performance
       - Score, save scored test set + scored performance
    """
    start = unix_time_millis()
    preprocessing_listener = ProgressListener()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]

    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.CLUSTERING_REGULAR_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.ALL_CLUSTERING_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = {
                "modelId": modeling_set["modelId"],
                "state": "RUNNING",
                "startTime": start,
                "progress": merge_listeners(preprocessing_listener, modeling_set["listener"])
            }
            utils.write_model_status(modeling_set, status)

    logging.info("START TRAIN :" + preprocessing_set["description"])
    preprocessing_params = preprocessing_set["preprocessing_params"]

    with preprocessing_listener.push_state(constants.STATE_LOADING_SRC):
        update_preprocessing_state()
        source_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"])
        logging.info("Loaded source df: shape=(%d,%d)" % source_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = ClusteringPreprocessingDataCollector(source_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_set["preprocessing_params"],
                                                     preprocessing_set["run_folder"])
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_SRC):
        update_preprocessing_state()
        source_df_index = source_df.index.copy()
        # TODO: fit_and_process should take an update_fn argument
        transformed_source = pipeline.fit_and_process(source_df)

    # Save fitted resources and collector data
    preproc_handler.save_data()

    # Report on work
    report = {}
    pipeline.report_fit(report, {})
    utils.write_preproc_file(preprocessing_set["run_folder"], "preprocessing_report.json", report)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        clustering_train_score_save(transformed_source, source_df_index,
                                    preprocessing_set["preprocessing_params"],
                                    modeling_set["modelingParams"], modeling_set["run_folder"],
                                    listener, update_modeling_state, pipeline)

        model_end = end = unix_time_millis()
        end = unix_time_millis()

        # Write the final model training info
        status = {
            "modelId": modeling_set["modelId"],
            "state": "DONE",
            "startTime": start,
            "endTime": end,
            "preprocessingTime": preprocessing_end - start,
            "trainingTime": model_end - model_start,
            "progress": merge_listeners(preprocessing_listener, modeling_set["listener"])
        }
        utils.write_model_status(modeling_set, status)

    return "ok"
def train_prediction_keras(core_params, preprocessing_set, split_desc):
    start = unix_time_millis()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]
    run_folder = preprocessing_set["run_folder"]
    logging.info("PPS is %s" % preprocessing_params)

    preprocessing_listener = ProgressListener()

    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.PRED_KERAS_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.PRED_KERAS_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

    with preprocessing_listener.push_state(constants.STATE_LOADING_TRAIN):
        update_preprocessing_state()
        train_df = df_from_split_desc(split_desc, "train", preprocessing_params['per_feature'],
                                      core_params["prediction_type"])
        train_df_orig = train_df.copy()
        logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

    # Not implemented in the UI so far, so preprocessor_fit_df will always be train_df
    preprocessor_fit_df = train_df
    need_subsampling = preprocessing_params["preprocessingFitSampleRatio"] < 1
    if need_subsampling:
        preprocessor_fit_df = preprocessor_fit_df.sample(
            frac=preprocessing_params["preprocessingFitSampleRatio"],
            random_state=preprocessing_params["preprocessingFitSampleSeed"])

    with preprocessing_listener.push_state(constants.STATE_LOADING_TEST):
        update_preprocessing_state()
        test_df = df_from_split_desc(split_desc, "test", preprocessing_params['per_feature'],
                                     core_params["prediction_type"])
        logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(preprocessor_fit_df, preprocessing_params)
        collector_data = collector.build()

    # Tag special features so they are taken into account only in special_preproc_handler/special_pipeline
    per_feature = preprocessing_params["per_feature"]
    tag_special_features(per_feature)

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params, run_folder,
                                                           preprocessing_params, allow_empty_mf=True)

    with preprocessing_listener.push_state(constants.KERASSTATE_FIT_NORMAL_PREPROCESSING):
        update_preprocessing_state()
        # Retrieve the transformed values to get the shape of all regular inputs, even though they won't
        # actually be used, as each batch of data will be processed again
        transformed_normal = pipeline.fit_and_process(preprocessor_fit_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    # TODO: REVIEW STATES OF TRAINING
    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TRAIN):
        update_preprocessing_state()

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TEST):
        update_preprocessing_state()

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        # Set environment variables that may be accessed in user-defined code
        remoterun.set_dku_env_var_and_sys_env_var(constants.DKU_CURRENT_ANALYSIS_ID,
                                                  modeling_set["fullId"]["taskLoc"]["analysisId"])
        remoterun.set_dku_env_var_and_sys_env_var(constants.DKU_CURRENT_MLTASK_ID,
                                                  modeling_set["fullId"]["taskLoc"]["mlTaskId"])

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        prediction_train_model_keras(transformed_normal, train_df_orig, test_df, pipeline,
                                     modeling_set["modelingParams"], core_params, per_feature,
                                     modeling_set["run_folder"], modeling_set["listener"],
                                     update_modeling_state, preproc_handler.target_map,
                                     pipeline.generated_features_mapping)

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set["run_folder"], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"
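
# The preprocessing-fit subsampling above only affects what the pipeline is *fitted* on: when
# preprocessingFitSampleRatio < 1, fit_and_process sees a seeded random sample of train_df, while
# prediction_train_model_keras still receives the untouched train_df_orig and, as noted above,
# re-processes each batch of data during training.
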
def train_prediction_models_nosave(core_params, preprocessing_set, split_desc):
    """Regular (mode 1) train:
     - Non-streamed single split + fit preprocess on train + preprocess test
     - Fit N models sequentially
       - Fit
       - Save clf
       - Compute and save clf performance
       - Score, save scored test set + scored performance
    """
    start = unix_time_millis()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]
    logging.info("PPS is %s" % preprocessing_params)

    preprocessing_listener = ProgressListener()

    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.PRED_REGULAR_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        if modeling_set.get('modelingParams', {}).get('gridLength', 1) > 1:
            listener.add_future_step(constants.STATE_GRIDSEARCHING)
        listener.add_future_steps(constants.PRED_REGULAR_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

    with preprocessing_listener.push_state(constants.STATE_LOADING_TRAIN):
        update_preprocessing_state()
        train_df = df_from_split_desc(split_desc, "train", preprocessing_params['per_feature'],
                                      core_params["prediction_type"])
        logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)
        for col in train_df:
            logging.info("Train col : %s (%s)" % (col, train_df[col].dtype))

    with preprocessing_listener.push_state(constants.STATE_LOADING_TEST):
        update_preprocessing_state()
        test_df = df_from_split_desc(split_desc, "test", preprocessing_params['per_feature'],
                                     core_params["prediction_type"])
        logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(train_df, preprocessing_params)
        collector_data = collector.build()

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params,
                                                           preprocessing_set['run_folder'],
                                                           preprocessing_params)

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TRAIN):
        update_preprocessing_state()
        # TODO: fit_and_process should take an update_fn argument
        transformed_train = pipeline.fit_and_process(train_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TEST):
        update_preprocessing_state()
        test_df_index = test_df.index.copy()
        transformed_test = pipeline.process(test_df)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        # Since ensembles are never fitted through the doctor, there is no need to distinguish here
        prediction_train_score_save(transformed_train, transformed_test, test_df_index, core_params,
                                    split_desc, modeling_set["modelingParams"], modeling_set["run_folder"],
                                    modeling_set["listener"], preproc_handler.target_map,
                                    update_modeling_state, pipeline, modeling_set["run_folder"])

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set["run_folder"], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"
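
# In the "nosave" trainers above, each model's progress is reported as the pair
# (preprocessing_listener, modeling_set["listener"]): the shared preprocessing steps are counted
# once, while the per-model listener tracks that model's own fitting and scoring. The resulting
# status payload has the shape written explicitly in the clustering variant, roughly:
#
#   {"modelId": ..., "state": "RUNNING", "startTime": start,
#    "progress": merge_listeners(preprocessing_listener, modeling_set["listener"])}
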
from dataiku.doctor.prediction.classification_scoring import BinaryClassificationModelScorer
from dataiku.doctor.clustering.clustering_scorer import ClusteringModelScorer
from dataiku.doctor.prediction.regression_scoring import RegressionModelIntrinsicScorer, RegressionModelScorer
from dataiku.doctor.prediction_entrypoints import prediction_train_score_save, prediction_train_model_kfold, \
    prediction_train_model_keras
from dataiku.doctor.clustering_entrypoints import clustering_train_score_save
from dataiku.doctor.utils import ProgressListener, unix_time_millis, dku_write_mode_for_pickling
from dataiku.doctor.notebook_builder import PredictionNotebookBuilder, ClusteringNotebookBuilder
from dataiku.core import dkujson, intercom
from dataiku.doctor.deep_learning.keras_utils import tag_special_features
from dataiku.doctor.utils.split import df_from_split_desc
from dataiku.doctor.utils import merge_listeners
from dataiku.doctor import utils
from dataiku.doctor.posttraining import partial_depency as pdp, subpopulation as subpopulation

preprocessing_listener = ProgressListener()
global_modeling_sets = []


def _list_commands():
    current_module = sys.modules[__name__]
    return [
        (func_name, func)
        for (func_name, func) in current_module.__dict__.items()
        if not func_name.startswith("_") and inspect.isfunction(func) and inspect.getmodule(func) == current_module
    ]


def create_prediction_notebook(model_name, model_date, dataset_smartname, script, preparation_output_schema,
                               split_stuff,