def save(self, pd_result):
    iperf = dkujson.load_from_filepath(os.path.join(self.folder, "iperf.json"))
    if "partialDependencies" not in iperf:
        iperf["partialDependencies"] = []
    for partial_dep in iperf["partialDependencies"]:
        if partial_dep.get('feature') == pd_result.feature.name:
            iperf["partialDependencies"].remove(partial_dep)
            break
    new_partial_dependence = {
        "data": list(pd_result.partial_dependence),
        "feature": pd_result.feature.name,
        "distribution": pd_result.distribution,
        "computedPostTraining": True,
        "isDate": self.dtypes[pd_result.feature.name] == "date",
        "unrepresentedModalities": pd_result.unrepresented_modalities,
    }
    if pd_result.indices_to_drop is not None:
        new_partial_dependence["indicesToDrop"] = pd_result.indices_to_drop
    if pd_result.feature.type == 'CATEGORY':
        new_partial_dependence["categories"] = list(pd_result.scale)
    elif pd_result.feature.type == 'NUMERIC':
        new_partial_dependence["featureBins"] = list(pd_result.scale)
    iperf["partialDependencies"].append(new_partial_dependence)
    dkujson.dump_to_filepath(os.path.join(self.folder, "iperf.json"), iperf)
    return iperf
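For reference, a sketch of the entry that save() appends under "partialDependencies" in iperf.json. The keys mirror the dict built above; all values are illustrative, not taken from any real run.

# Illustrative shape only; values are made up, keys mirror the code above.
example_entry = {
    "feature": "age",
    "data": [0.12, 0.08, -0.03],
    "distribution": [10, 25, 7],
    "computedPostTraining": True,
    "isDate": False,
    "unrepresentedModalities": [],
    "featureBins": [18.0, 35.0, 60.0],   # a NUMERIC feature; a CATEGORY feature gets "categories" instead
}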
def command(job_id, split_desc, core_params, preprocessing_folder, model_folder, computation_parameters):
    # LOADING INFO #
    model_handler = PredictionModelInformationHandler(split_desc, core_params, preprocessing_folder, model_folder)
    test_df = model_handler.get_test_df()

    # COMPUTING SUBPOPULATION #
    col_name = get_computation_parameter("column", computation_parameters)
    col_type = get_type_of_column(col_name, model_handler)
    if col_type == "CATEGORY":
        value = get_computation_parameter("value", computation_parameters)
        subpop_df = test_df[test_df[col_name] == value]
    else:
        raise NotImplementedError("Not implemented yet :-(")

    # COMPUTING NEW METRICS ON SUBPOP #
    prediction_type = model_handler.get_prediction_type()
    if prediction_type == constants.BINARY_CLASSIFICATION:
        results = compute_binary_subpopulation_metrics(subpop_df, model_handler)
    else:
        raise NotImplementedError("Not implemented yet :-(")

    dkujson.dump_to_filepath(osp.join(model_folder, "subpop.json"), results)
    return "ok"
def report(self, pipeline):
    report = {}
    if hasattr(self, "core_params"):
        pipeline.report_fit(report, self.core_params)
    else:
        pipeline.report_fit(report, {})
    dkujson.dump_to_filepath(osp.join(self.data_path, "preprocessing_report.json"), report)
def __exit__(self, typ, val, tb):
    self._watching = False
    self.join()
    if self.m_folder is not None:
        self.aggregate_grid_dir()
        dkujson.dump_to_filepath(self.grid_search_file, self.grid_search_summary)
    self.cleanup()
def write_running_traininfo(folder, start_time, listener):
    status_filepath = osp.join(folder, "train_info.json")
    if osp.exists(status_filepath):
        status = dkujson.load_from_filepath(status_filepath)
    else:
        status = {}
    status["state"] = "RUNNING"
    status["startTime"] = start_time
    status["progress"] = listener.to_jsonifiable()
    dkujson.dump_to_filepath(status_filepath, status)
def score(self):
    logging.info("Intrinsic scoring of clustering model")
    if self.modeling_params['algorithm'] in ['PY_TWO_STEP']:
        dkujson.dump_to_filepath(self.pk_path('hierarchy.json'),
                                 self.clf.to_json(self.train_X, self._extract_rescalers()))
    # anomaly detection
    if self.modeling_params['algorithm'] in ['PY_ISOLATION_FOREST']:
        columns_to_keep = [s for s in list(set(self.profiling_df.columns)
                                           - (set(self.train_X.columns) | set(["cluster_labels"])))
                           if s[:6] != "dummy:"]
        extra_columns_df = self.profiling_df[columns_to_keep]
        # if there are actually two clusters (regular and anomaly)
        if self.profiling_df["cluster_labels"].nunique() > 1:
            dkujson.dump_to_filepath(self.pk_path('anomalies.json'),
                                     self.clf.get_top_outliers(self.train_X, self._extract_rescalers(),
                                                               extra_columns_df))
def _serialize_pipeline_meta(self, name):
    meta = {
        "backend": "KERAS" if self.modeling_params.get("algorithm") == "KERAS_CODE" else "PY_MEMORY",
        "algorithm_name": name,
        "columns": self.columns
    }
    if self.target_mapping is not None:
        # Because scikit-learn does its own class mapping, we have to remap here. So the final classes
        # may differ from the target_mapping if some classes were missing from the training set.
        inv_mapping = {x[1]: x[0] for x in self.target_mapping.items()}
        meta["classes"] = [inv_mapping[i] for i in self.clf.classes_]
    dkujson.dump_to_filepath(osp.join(self.model_folder, "dss_pipeline_meta.json"), meta)
def write_done_traininfo(folder, start_time, start_training_time, end_time, listener, end_preprocessing_time=None):
    status_filepath = osp.join(folder, "train_info.json")
    if osp.exists(status_filepath):
        status = dkujson.load_from_filepath(status_filepath)
    else:
        status = {}
    status["state"] = "DONE"
    status["startTime"] = start_time
    status["endTime"] = end_time
    status["preprocessingTime"] = (end_preprocessing_time or start_training_time) - start_time
    status["trainingTime"] = end_time - start_training_time
    if isinstance(listener, ProgressListener):
        status["progress"] = listener.to_jsonifiable()
    else:
        status["progress"] = reduce(merge_listeners, listener)
    dkujson.dump_to_filepath(status_filepath, status)
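A minimal usage sketch (not part of the source above) of how the two train_info.json helpers are typically chained around a training run; the folder path is illustrative, and ProgressListener / unix_time_millis are assumed to be the surrounding doctor utilities.

# Hedged sketch only: illustrative wiring of the status helpers above.
listener = ProgressListener()                      # assumed doctor progress listener
folder = "/path/to/model_folder"                   # illustrative path
start = unix_time_millis()                         # assumed time helper used elsewhere in this module
write_running_traininfo(folder, start, listener)   # marks state RUNNING in train_info.json

start_training = unix_time_millis()
# ... preprocessing and model fitting happen here ...
end = unix_time_millis()

# marks state DONE and records preprocessing/training durations
write_done_traininfo(folder, start, start_training, end, listener)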
def save_prediction_model(clf, out_params, listener, update_fn, folder):
    import dataiku.doctor.constants as constants
    from dataiku.core import dkujson
    import os.path as osp
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    with listener.push_state(constants.STATE_SAVING):
        update_fn()
        # UGLY
        if hasattr(clf, "scorer"):
            clf.scorer = None
        if "scorer" in clf.params:
            del clf.params["scorer"]
        with open(osp.join(folder, "clf.pkl"), dku_write_mode_for_pickling()) as f:
            pickle.dump(clf, f, 2)
        dkujson.dump_to_filepath(osp.join(folder, "actual_params.json"), out_params)
def score(self):
    logging.info("Computing regression performance on %s\n", self.preds)
    self.ret["regression_performance"] = self.get_regression_performance(self.valid_Y, self.preds, self.sample_weight)

    # Scatter plot
    both = pd.DataFrame({
        "predicted": self.preds,
        "actual": self.valid_Y
    })
    nb_records = len(both.index)
    if nb_records < 1000:
        proba = 1
    else:
        proba = 1000.0 / nb_records
    s, m = pdu.split_train_valid(both, prop=proba, seed=42)
    self.ret["scatterPlotData"] = {"x": [], "y": []}
    for record in s.itertuples():
        self.ret["scatterPlotData"]["x"].append(float(record[1]))
        self.ret["scatterPlotData"]["y"].append(float("%.4f" % record[2]))

    # Metrics
    self.ret["metrics"] = compute_metrics(self.valid_Y, self.preds, self.sample_weight)
    if self.modeling_params["metrics"]["evaluationMetric"] == "CUSTOM":
        custom_scorefunc = get_custom_scorefunc(self.modeling_params, self.valid_unprocessed)
        self.ret["metrics"]["customScore"] = custom_scorefunc(self.valid_Y, self.preds,
                                                              sample_weight=self.sample_weight)

    # Dump the predicted set
    if self.valid_X_index is not None:
        self.compute_predicted_data(self.preds, self.valid_X_index)

    # Dump the perf
    dkujson.dump_to_filepath(osp.join(self.out_folder, "perf.json"), self.ret)
    self.perf_data = self.ret
    return self.ret
def clustering_train_score_save(transformed_src, src_index, preprocessing_params, modeling_params,
                                run_folder, listener, update_fn, pipeline):
    """Trains one model and saves results to run_folder"""
    with listener.push_state(constants.STATE_FITTING):
        update_fn()
        (clf, out_params, cluster_labels, additional_columns) = clustering_fit(modeling_params, transformed_src)
    with listener.push_state(constants.STATE_SAVING):
        update_fn()
        with open(osp.join(run_folder, "clusterer.pkl"), dku_write_mode_for_pickling()) as f:
            pickle.dump(clf, f, 2)
        dkujson.dump_to_filepath(osp.join(run_folder, "actual_params.json"), out_params)
    with listener.push_state(constants.STATE_SCORING):
        update_fn()
        ClusteringModelScorer(clf, transformed_src, src_index, cluster_labels, preprocessing_params,
                              modeling_params, pipeline, run_folder).score()
def score(self):
    ret = self.iipd
    logging.info("Intrinsic scoring")

    if self.calibrate_proba:
        uncalibrated_clf = self.clf.base_estimator
    else:
        uncalibrated_clf = self.clf

    if self.modeling_params['algorithm'] in ['XGBOOST_CLASSIFICATION']:
        max_iterations = self.modeling_params['xgboost_grid']['n_estimators']
        best_iteration = uncalibrated_clf._Booster.best_iteration
        early_stopping_rounds = self.modeling_params['xgboost_grid']['early_stopping_rounds']
        ret["nBoostedEstimators"] = min(best_iteration + early_stopping_rounds, max_iterations)

    if 'feature_importances_' in dir(uncalibrated_clf):
        self.get_rf_raw_importance(uncalibrated_clf, ret)
        if self.modeling_params['algorithm'] in ['SCIKIT_MODEL']:
            # Make sure variable importances are normalized
            ri = ret["rawImportance"]
            weights_sum = sum(ri["importances"])
            if weights_sum != 0:
                ri["importances"] = np.array(ri["importances"]) / float(weights_sum)

    # Regression coefficients for logit (binary only and only if not too many non-zero coef TODO @analysis)
    if self.modeling_params['algorithm'] in ['LOGISTIC_REGRESSION', 'SGD_CLASSIFICATION', 'LARS'] \
            and uncalibrated_clf.coef_.shape[0] == 1:
        ret["lmCoefficients"] = _compute_coefs(uncalibrated_clf, self.train_X, self.prepared_X,
                                               self.train_y, self._extract_rescalers())

    # Decision tree summary, not dumped in iperf json but in a separate file
    if self.modeling_params['algorithm'] in ['DECISION_TREE_CLASSIFICATION']:
        if not self.modeling_params.get("skipExpensiveReports"):
            logging.info("Creating decision tree summary")
            tree_summary = TreeSummaryBuilder(uncalibrated_clf, self.train_X.columns(),
                                              self._extract_rescalers(), False).build()
            dkujson.dump_to_filepath(osp.join(self.out_folder, "tree.json"), tree_summary)

    if self.modeling_params['algorithm'] == 'GBT_CLASSIFICATION':
        rescalers = self._extract_rescalers()
        # Create tree summaries
        if not self.modeling_params.get("skipExpensiveReports"):
            logging.info("Creating gradient boosting trees summary")
            summary = GradientBoostingSummaryBuilder(uncalibrated_clf, self.train_X.columns(), rescalers, False,
                                                     self.modeling_params["max_ensemble_nodes_serialized"]).build()
            dkujson.dump_to_filepath(osp.join(self.out_folder, "trees.json"), summary)
        # Compute partial dependencies
        ret["partialDependencies"] = PartialDependencyPlotBuilder(uncalibrated_clf, self.train_X,
                                                                  self.train_y, rescalers).build()

    if self.modeling_params['algorithm'] == 'RANDOM_FOREST_CLASSIFICATION':
        if not self.modeling_params.get("skipExpensiveReports"):
            logging.info("Creating random forest trees summary")
            summary = RandomForestSummaryBuilder(uncalibrated_clf, self.train_X.columns(),
                                                 self._extract_rescalers(), False,
                                                 self.modeling_params["max_ensemble_nodes_serialized"]).build()
            dkujson.dump_to_filepath(osp.join(self.out_folder, "trees.json"), summary)

    if self.modeling_params['algorithm'] == 'LARS':
        dkujson.dump_to_filepath(osp.join(self.out_folder, "coef_path.json"), {
            "path": [[[t for t in x] for x in c] for c in uncalibrated_clf.coef_path_],
            "features": self.train_X.columns(),
            "currentIndex": uncalibrated_clf.current_index
        })

    # Learning curve if requested
    if self.modeling_params["computeLearningCurves"]:
        logging.info("Computing learning curves")
        train_X, is_sparse = prepare_multiframe(self.train_X, self.modeling_params)
        train_nbsamples = train_X.shape[0]
        train_y = self.train_y.astype(int)
        train_sizes, train_scores, valid_scores = learning_curve(uncalibrated_clf, train_X, train_y)
        ret["learningCurve"] = {
            "samples": train_sizes,
            "trainScoreMean": np.mean(train_scores, axis=1),
            "trainScoreStd": np.std(train_scores, axis=1),
            "cvScoreMean": np.mean(valid_scores, axis=1),
            "cvScoreStd": np.std(valid_scores, axis=1)
        }

    ret["probaAware"] = is_proba_aware(self.modeling_params['algorithm'], uncalibrated_clf)

    # Dump the perf
    dkujson.dump_to_filepath(osp.join(self.out_folder, "iperf.json"), ret)
def _dku_fit_and_score(estimator, X, y, scorer, train, test, verbose, is_interruptible, parameters,
                       cvwatcher, fit_params, error_score='raise', m_folder=None, split_id=None,
                       parameter_id=None, sample_weight=None, algo_supports_weight=True):
    if cvwatcher.is_interrupted and is_interruptible:
        return None

    current_thread = threading.current_thread()
    current_thread.name = "GS-%s" % (current_thread.ident)

    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in parameters.items()))
        logging.info("Fit p=%s s=%s: %s %s" % (parameter_id, split_id, msg, (64 - len(msg)) * '.'))

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = unix_time_millis()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}

    # XGBoost early stopping
    if fit_params.get("early_stopping_rounds") is not None:
        if fit_params.get("eval_set") is None:
            # Log the train and test objective but optimize on the test
            # (the last tuple of eval_set is used for early stopping evaluation)
            fit_params["eval_set"] = [(X_train, y_train), (X_test, y_test)]
        else:
            pass  # still keep the possibility to use a fixed eval_set

    if sample_weight is not None:
        w_train, _ = _safe_split(estimator, sample_weight, y, train)
        w_test, _ = _safe_split(estimator, sample_weight, y, test)
        if algo_supports_weight:
            # Fit with sample weights whenever they are enabled AND the algorithm supports them
            fit_params["sample_weight"] = np.array(w_train)

    fit_params = dict([(k, _dku_index_param_value(X, v, train)) for k, v in fit_params.items()])

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)
    except Exception as e:
        # Note fit time as time until error
        fit_time = unix_time_millis() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        fit_time = unix_time_millis() - start_time
        if sample_weight is not None:
            # Score with sample weights whenever they are enabled, regardless of the support by the algorithm
            test_score = _dku_score(estimator, X_test, y_test, scorer, sample_weight=w_test, indices=test)
            train_score = _dku_score(estimator, X_train, y_train, scorer, sample_weight=w_train, indices=train)
        else:
            test_score = _dku_score(estimator, X_test, y_test, scorer, indices=test)
            train_score = _dku_score(estimator, X_train, y_train, scorer, indices=train)
        score_time = unix_time_millis() - start_time - fit_time

    if verbose > 1:
        end_msg = "%s (ft=%.1fs st=%.1fs sc=%s)" % (msg, fit_time / 1000, score_time / 1000, test_score)
        logging.info("Done p=%s s=%s: %s" % (parameter_id, split_id, end_msg))

    num_samples = _num_samples(X_test)
    best_iteration = getattr(estimator, 'best_iteration', None)

    ret = {
        "train_score": train_score,
        "test_score": test_score,
        "num_samples": num_samples,
        "fit_time": fit_time,
        "score_time": score_time,
        "time": fit_time + score_time,
        "parameters": parameters,
        "parameter_id": parameter_id,
        "grid_point_id": get_grid_point_id(parameters, split_id),
        "best_iteration": best_iteration,
        "done_at": unix_time_millis()
    }

    if m_folder is not None:
        tmp_file = os.path.join(m_folder, 'grid.tmp/grid_search_{}.{}.gridpoint'.format(parameter_id, split_id))
        dest_file = os.path.join(m_folder, 'grid/grid_search_{}.{}.gridpoint'.format(parameter_id, split_id))
        dkujson.dump_to_filepath(tmp_file, ret)
        os.rename(tmp_file, dest_file)

    return ret
def create_ensemble(split_desc, core_params, model_folder, preprocessing_folder, model_folders, preprocessing_folders):
    listener = ProgressListener()
    listener.add_future_steps(constants.ENSEMBLE_STATES)
    start = unix_time_millis()

    def update_preprocessing_state():
        utils.write_running_traininfo(model_folder, start, listener)

    split_desc = dkujson.loads(split_desc)
    core_params = dkujson.loads(core_params)
    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    # TODO: update downstream
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    preprocessing_folders = dkujson.loads(preprocessing_folders)
    model_folders = dkujson.loads(model_folders)
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "rmodeling_params.json"))
    ensemble_params = modeling_params["ensemble_params"]

    logging.info("creating ensemble")
    with listener.push_state(constants.STATE_ENSEMBLING):
        update_preprocessing_state()
        from dataiku.doctor.prediction.ensembles import ensemble_from_fitted
        train = df_from_split_desc(split_desc, "train",
                                   ensemble_params["preprocessing_params"][0]["per_feature"],
                                   core_params["prediction_type"])
        iperf = {
            "modelInputNRows": train.shape[0],  # todo: not the right count as rows may have been dropped
            "modelInputNCols": -1,  # makes no sense for an ensemble as members may have different preprocessings
            "modelInputIsSparse": False
        }
        dkujson.dump_to_filepath(osp.join(model_folder, "iperf.json"), iperf)
        clf = ensemble_from_fitted(core_params, ensemble_params, preprocessing_folders, model_folders,
                                   train, with_sample_weight, with_class_weight)

    logging.info("saving model")
    with listener.push_state(constants.STATE_SAVING):
        update_preprocessing_state()
        with open(osp.join(model_folder, "clf.pkl"), dku_write_mode_for_pickling()) as f:
            pickle.dump(clf, f, 2)

    logging.info("scoring model")
    with listener.push_state(constants.STATE_SCORING):
        update_preprocessing_state()
        test = df_from_split_desc(split_desc, "test",
                                  ensemble_params["preprocessing_params"][0]["per_feature"],
                                  core_params["prediction_type"])
        # This is annoying, but we have to use one of the previous preprocessings in order to get the target
        prep_folder = preprocessing_folders[0]
        rppp = dkujson.load_from_filepath(osp.join(prep_folder, "rpreprocessing_params.json"))
        collector_data = dkujson.load_from_filepath(osp.join(prep_folder, "collector_data.json"))
        preprocessing_handler = PreprocessingHandler.build(core_params, rppp, prep_folder)
        preprocessing_handler.collector_data = collector_data
        pipe = preprocessing_handler.build_preprocessing_pipeline(with_target=True)
        transformed = pipe.process(test)
        y = transformed["target"]
        if with_sample_weight:
            sample_weight = transformed["weight"]
        else:
            sample_weight = None

        # Now that the CLF with scorable pipelines has been saved, set it in "pipelines with target" mode
        # to be able to compute metrics
        clf.set_with_target_pipelines_mode(True)

        pred = clf.predict(test)
        probas = None if core_params["prediction_type"] == "REGRESSION" else clf.predict_proba(test)
        target_map = None if core_params["prediction_type"] == "REGRESSION" else \
            {t["sourceValue"]: t["mappedValue"]
             for t in ensemble_params["preprocessing_params"][0]["target_remapping"]}
        prediction_type = core_params["prediction_type"]
        if prediction_type == "REGRESSION":
            RegressionModelScorer(modeling_params, clf, pred, y, model_folder, transformed,
                                  test.index.copy(), sample_weight).score()
        elif prediction_type == "BINARY_CLASSIFICATION":
            BinaryClassificationModelScorer(modeling_params, clf, model_folder, pred, probas, y, target_map,
                                            transformed, test.index.copy(), sample_weight).score()
        else:
            MulticlassModelScorer(modeling_params, clf, model_folder, pred, probas, y.astype(int), target_map,
                                  transformed, test.index.copy(), sample_weight).score()

    update_preprocessing_state()
    end = unix_time_millis()
    dkujson.dump_to_filepath(osp.join(model_folder, "actual_params.json"), {"resolved": modeling_params})
    dkujson.dump_to_filepath(osp.join(preprocessing_folder, "preprocessing_report.json"), {})
    utils.write_done_traininfo(model_folder, start, end, end, listener, end_preprocessing_time=start)

    return "ok"
def write_preproc_file(run_folder, filename, obj):
    dkujson.dump_to_filepath(osp.join(run_folder, filename), obj)
def write_model_status(modeling_set, status):
    status_filepath = osp.join(modeling_set["run_folder"], "train_info.json")
    dkujson.dump_to_filepath(status_filepath, status)
def score(self):
    ret = self.iipd
    logging.info("Intrinsic scoring")

    if self.modeling_params['algorithm'] == 'DECISION_TREE_REGRESSION':
        if not self.modeling_params.get("skipExpensiveReports"):
            logging.info("Creating decision tree summary")
            tree_summary = TreeSummaryBuilder(self.clf, self.train_X.columns(),
                                              self._extract_rescalers(), True).build()
            dkujson.dump_to_filepath(osp.join(self.out_folder, "tree.json"), tree_summary)
        rescalers = self._extract_rescalers()
        ret["partialDependencies"] = _dt_pdp(self.clf, self.train_X, self.train_y, rescalers)

    if self.modeling_params['algorithm'] == 'GBT_REGRESSION':
        rescalers = self._extract_rescalers()
        # Create decision tree summary
        if not self.modeling_params.get("skipExpensiveReports"):
            logging.info("Creating gradient boosting trees summary")
            summary = GradientBoostingSummaryBuilder(self.clf, self.train_X.columns(), rescalers, True,
                                                     self.modeling_params["max_ensemble_nodes_serialized"]).build()
            dkujson.dump_to_filepath(osp.join(self.out_folder, "trees.json"), summary)
        # Compute partial dependencies
        ret["partialDependencies"] = PartialDependencyPlotBuilder(self.clf, self.train_X,
                                                                  self.train_y, rescalers).build()

    if self.modeling_params['algorithm'] == 'RANDOM_FOREST_REGRESSION':
        rescalers = self._extract_rescalers()
        ret["partialDependencies"] = _rf_pdp(self.clf, self.train_X, self.train_y, rescalers)
        if not self.modeling_params.get("skipExpensiveReports"):
            logging.info("Creating random forest trees summary")
            summary = RandomForestSummaryBuilder(self.clf, self.train_X.columns(), self._extract_rescalers(), True,
                                                 self.modeling_params["max_ensemble_nodes_serialized"]).build()
            dkujson.dump_to_filepath(osp.join(self.out_folder, "trees.json"), summary)

    if self.modeling_params['algorithm'] in ['XGBOOST_REGRESSION']:
        max_iterations = self.modeling_params['xgboost_grid']['n_estimators']
        best_iteration = self.clf._Booster.best_iteration
        early_stopping_rounds = self.modeling_params['xgboost_grid']['early_stopping_rounds']
        ret["nBoostedEstimators"] = min(best_iteration + early_stopping_rounds, max_iterations)

    if self.modeling_params['algorithm'] == 'LARS':
        dkujson.dump_to_filepath(osp.join(self.out_folder, "coef_path.json"), {
            "path": [[[t] for t in x] for x in self.clf.coef_path_],
            "features": self.train_X.columns(),
            "currentIndex": self.clf.current_index
        })

    if 'feature_importances_' in dir(self.clf):
        self.get_rf_raw_importance(self.clf, ret)
        if self.modeling_params['algorithm'] in ['SCIKIT_MODEL']:
            # Make sure variable importances are normalized
            ri = ret["rawImportance"]
            weights_sum = sum(ri["importances"])
            if weights_sum != 0:
                ri["importances"] = np.array(ri["importances"]) / float(weights_sum)

    # Compute coefs if the model has any, except for SVM and XGBoost where coef_ can be missing
    if 'coef_' in dir(self.clf) and self.modeling_params['algorithm'] not in {"SVM_REGRESSION", "XGBOOST_REGRESSION"}:
        ret["lmCoefficients"] = _compute_coefs(self.clf, self.train_X, self.prepared_X, self.train_y,
                                               self._extract_rescalers())

    dkujson.dump_to_filepath(osp.join(self.out_folder, "iperf.json"), ret)
def update_gridsearch_info(folder, grid_search_scores):
    status_filepath_tmp = osp.join(folder, "grid_search_scores.json.tmp")
    status_filepath = osp.join(folder, "grid_search_scores.json")
    # Write to a temporary sibling first, then rename over the destination so that
    # readers never observe a partially written grid_search_scores.json.
    dkujson.dump_to_filepath(status_filepath_tmp, grid_search_scores)
    os.rename(status_filepath_tmp, status_filepath)
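update_gridsearch_info above (like update_deep_learning_model_info further down, and the per-grid-point dump in _dku_fit_and_score) publishes JSON by writing a temporary sibling and then renaming it over the destination, so a concurrent reader never sees a half-written file. A minimal standalone sketch of that pattern; the helper name is illustrative and plain json stands in for dkujson to keep it self-contained.

import json
import os

def atomic_dump_json(path, obj):
    # Illustrative helper (not part of the source): write-then-rename publish pattern.
    tmp_path = path + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump(obj, f)
    # On POSIX, renaming over an existing file within one filesystem is atomic,
    # so readers of `path` see either the old or the new complete document.
    os.rename(tmp_path, path)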
def score(self, optimize_threshold=False):
    self.use_probas = is_proba_aware(self.modeling_params["algorithm"], self.clf)
    check_test_set_ok_for_classification(self.valid_y)

    # Not clear whether this is good or not ...
    # all_classes_in_test_set = np.unique(self.valid_y)
    # all_classes_in_pred = np.unique(self.preds)
    # logging.info(" IN TEST: %s" % all_classes_in_test_set)
    # logging.info(" IN PRED: %s" % all_classes_in_pred)
    # for cls in all_classes_in_pred:
    #     if not cls in all_classes_in_test_set:
    #         raise Exception("One of the classes predicted by the model (%s) is not in the test set. Cannot proceed." % (cls))

    # Compute unmapped preds
    if self.target_map:
        self.mapped_preds = np.zeros(self.preds.shape, np.object)
        for k, v in self.target_map.items():
            self.mapped_preds[self.preds == v] = k
    else:
        self.mapped_preds = self.preds

    # Confusion matrix
    self.ret["classes"] = self.classes
    self.ret["confusion"] = self.get_multiclass_confusion_matrix()
    logging.info("Calculated confusion matrix")

    # 1-vs-all ROC for proba-aware classifiers
    if self.use_probas:
        self.ret["oneVsAllRocAUC"] = {}
        self.ret["oneVsAllRocCurves"] = {}
        self.ret["oneVsAllCalibrationCurves"] = {}
        self.ret["oneVsAllCalibrationLoss"] = {}
        for class_selected in self.classes:
            class_selected_id = int(self.target_map[class_selected])
            logging.info("Make ROC, valid_y=%s" % self.valid_y)
            logging.info("Make ROC, probas=%s" % self.probas[:, class_selected_id])
            try:
                false_positive_rates, true_positive_rates, thresholds = \
                    roc_curve(self.valid_y, self.probas[:, class_selected_id], class_selected_id, self.sample_weight)
                roc_data = zip(false_positive_rates, true_positive_rates, thresholds)
                logging.info("AUC %s %s" % (false_positive_rates, true_positive_rates))
                self.ret["oneVsAllRocCurves"][class_selected] = \
                    [{"x": x, "y": y, "p": p} for (x, y, p) in trim_curve(roc_data)]
                self.ret["oneVsAllRocAUC"][class_selected] = auc(false_positive_rates, true_positive_rates)
            except Exception as e:
                logging.error(e)
                continue
            finally:
                try:
                    y_bin = (self.valid_y.values == int(class_selected_id)).astype(int)
                    freqs, avg_preds, weights = dku_calibration_curve(y_bin, self.probas[:, int(class_selected_id)],
                                                                      n_bins=10, sample_weight=self.sample_weight)
                    zipped = [(t, p, n) for (t, p, n) in zip(freqs, avg_preds, weights) if not np.isnan(t + p + n)]
                    curve = [{"y": 0, "x": 0, "n": 0}] \
                        + [{"y": t, "x": p, "n": n} for (t, p, n) in zipped] \
                        + [{"y": 1, "x": 1, "n": 0}]
                    self.ret["oneVsAllCalibrationCurves"][class_selected] = curve
                    self.ret["oneVsAllCalibrationLoss"][class_selected] = dku_calibration_loss(
                        [x[0] for x in zipped], [x[1] for x in zipped], [x[2] for x in zipped])
                except Exception as e:
                    logging.error(e)

    self.ret["densityData"] = format_all_proba_density(self.classes, self.target_map, self.probas,
                                                       self.valid_y, self.sample_weight)

    self.ret["metrics"] = {}
    if self.use_probas:
        self.ret["metrics"]["mrocAUC"] = mroc_auc_score(self.valid_y, self.probas, self.sample_weight)
        self.ret["metrics"]["mcalibrationLoss"] = \
            sum(self.ret["oneVsAllCalibrationLoss"].values()) / len(self.classes)
    if self.modeling_params["metrics"]["evaluationMetric"] == "CUSTOM":
        custom_scorefunc = get_custom_scorefunc(self.modeling_params, self.valid_unprocessed)
        if self.modeling_params["metrics"]["customEvaluationMetricNeedsProba"]:
            self.ret["metrics"]["customScore"] = custom_scorefunc(self.valid_y, self.probas,
                                                                  sample_weight=self.sample_weight)
        else:
            self.ret["metrics"]["customScore"] = custom_scorefunc(self.valid_y, self.preds,
                                                                  sample_weight=self.sample_weight)

    self.ret["metrics"]["precision"] = precision_score(self.valid_y, self.preds, average='macro',
                                                       pos_label=None, sample_weight=self.sample_weight)
    self.ret["metrics"]["recall"] = recall_score(self.valid_y, self.preds, average='macro',
                                                 pos_label=None, sample_weight=self.sample_weight)
    self.ret["metrics"]["f1"] = f1_score(self.valid_y, self.preds, average='macro',
                                         pos_label=None, sample_weight=self.sample_weight)
    self.ret["metrics"]["accuracy"] = accuracy_score(self.valid_y, self.preds, sample_weight=self.sample_weight)
    self.ret["metrics"]["hammingLoss"] = hamming_loss(self.valid_y, self.preds, sample_weight=self.sample_weight)
    try:
        self.ret["metrics"]["logLoss"] = log_loss(self.valid_y, self.probas, sample_weight=self.sample_weight)
    except Exception:
        # log loss is only possible if all classes are found, which is not always the case ...
        pass

    # Dump the predicted set
    if self.valid_X_index is not None:
        if self.use_probas:
            proba_df = pd.DataFrame(self.probas, columns=["proba_%s" % x for x in self.classes])
            pred_df = pd.DataFrame({"prediction": self.mapped_preds})
            out_df = pd.concat([proba_df, pred_df], axis=1)
            # Realign
            out_df.index = self.valid_X_index
            full = pd.DataFrame(index=self.test_df_index)
            out_df = full.join(out_df, how="left")
            out_df.to_csv(self.out_folder + "/predicted.csv", sep="\t", header=True, index=False, encoding='utf-8')
            self.predicted_df = out_df
        else:
            pred_df = pd.DataFrame({"prediction": self.mapped_preds})
            # Realign
            pred_df.index = self.valid_X_index
            full = pd.DataFrame(index=self.test_df_index)
            pred_df = full.join(pred_df, how="left")
            pred_df.to_csv(self.out_folder + "/predicted.csv", sep="\t", header=True, index=False, encoding='utf-8')
            self.predicted_df = pred_df

    # Dump the perf
    self.ret = remove_all_nan(self.ret)
    self.perf_data = self.ret
    dkujson.dump_to_filepath(osp.join(self.out_folder, "perf.json"), self.ret)
    return self.ret
def score(self):
    if self.use_probas:
        optimize_threshold = self.modeling_params["autoOptimizeThreshold"]
        forced_threshold = self.modeling_params["forcedClassifierThreshold"]

        # Compute probas on classifier and create cut data
        (nb_rows, nb_present_classes) = self.probas.shape
        logging.info("Probas raw shape %s/%s target_map=%s", nb_rows, nb_present_classes, len(self.target_map))
        new_probas = np.zeros((nb_rows, len(self.target_map)))
        if not self.ignore_num_classes:
            for j in range(nb_present_classes):
                actual_class_id = self.clf.classes_[j]
                new_probas[:, actual_class_id] = self.probas[:, j]
            self.probas = new_probas

        # Compute all per-cut data
        probas_one = pd.Series(data=self.probas[:, 1], name='predicted')
        pcd = {"cut": [], "tp": [], "tn": [], "fp": [], "fn": [], "precision": [], "recall": [],
               "accuracy": [], "f1": [], "mcc": [], "hammingLoss": []}

        # np.sort shouldn't be necessary but works around a microbug leading to non-monotonous percentiles.
        # See https://github.com/numpy/numpy/issues/10373
        # Percentiles could include [..., a, b, a, ...] with b < a at the 15th or 16th decimal place,
        # which could lead to different probaPercentile results at prediction time.
        self.ret["probaPercentiles"] = np.sort(
            probas_one.quantile([float(x + 1) / 100 for x in range(99)]).values)

        custom_scorefunc = None
        if self.modeling_params["metrics"]["evaluationMetric"] == "CUSTOM":
            pcd["customScore"] = []
            custom_scorefunc = get_custom_scorefunc(self.modeling_params, self.valid_unprocessed)
            custom_needsproba = self.modeling_params["metrics"]["customEvaluationMetricNeedsProba"]

        for cut in np.arange(0.0, 1.0, 0.025):
            decision = probas_one > cut
            pcd["cut"].append(cut)
            conf = confusion_matrix(self.valid_y, decision, sample_weight=self.sample_weight)
            pcd["tp"].append(conf[1, 1])
            pcd["tn"].append(conf[0, 0])
            pcd["fp"].append(conf[0, 1])
            pcd["fn"].append(conf[1, 0])
            pcd["precision"].append(1.0 if conf[1, 1] == 0 and conf[0, 1] == 0
                                    else precision_score(self.valid_y, decision, sample_weight=self.sample_weight))
            pcd["recall"].append(recall_score(self.valid_y, decision, sample_weight=self.sample_weight))
            pcd["f1"].append(f1_score(self.valid_y, decision, sample_weight=self.sample_weight))
            pcd["accuracy"].append(accuracy_score(self.valid_y, decision, sample_weight=self.sample_weight))
            pcd["mcc"].append(matthews_corrcoef(self.valid_y, decision, sample_weight=self.sample_weight))
            pcd["hammingLoss"].append(hamming_loss(self.valid_y, decision, sample_weight=self.sample_weight))
            if custom_scorefunc is not None and not custom_needsproba:
                decision_with_valid_index = decision.copy()
                decision_with_valid_index.index = self.valid_y.index
                ret = custom_scorefunc(self.valid_y, decision_with_valid_index, sample_weight=self.sample_weight)
                if ret is None:
                    pcd["customScore"].append(0)
                else:
                    pcd["customScore"].append(ret)
        self.ret["perCutData"] = pcd

        if optimize_threshold:
            best_cut = compute_otimized_threshold(self.valid_y, self.probas, self.modeling_params,
                                                  self.sample_weight)
            self.ret["optimalThreshold"] = best_cut
            used_threshold = best_cut
        else:
            used_threshold = forced_threshold
        self.ret["usedThreshold"] = used_threshold

        # Compute predictions based on the threshold
        probas_one = pd.Series(data=self.probas[:, 1], name='predicted')
        self.preds = (probas_one > used_threshold).astype(np.int)
    else:
        pass  # todo: remove branching if we don't need the pandas series cast
        # No probas on clf, compute predictions directly
        # self.preds = pd.Series(self.clf.predict(self.valid_X).astype(np.int))

    if self.target_map:
        self.mapped_preds = np.zeros(self.preds.shape, np.object)
        logging.info("preds %s" % self.preds)
        logging.info("MAPPED SHAPE %s" % self.mapped_preds.shape)
        for k, v in self.target_map.items():
            v = int(v)
            logging.info("k=%s v=%s" % (k, v))
            mask = self.preds == v
            logging.info("Mask data %s", mask.values)
            logging.info("mapped pred %s" % self.mapped_preds.__class__)
            self.mapped_preds[mask.values] = k
    else:
        self.mapped_preds = self.preds
    logging.info("MAPPED PREDS %s" % self.mapped_preds)

    if self.use_probas:
        # Threshold-independent metrics
        self.ret["tiMetrics"] = {}
        self.ret["tiMetrics"]["auc"] = mroc_auc_score(self.valid_y, self.probas, sample_weight=self.sample_weight)
        self.ret["tiMetrics"]["logLoss"] = log_loss(self.valid_y, self.probas, sample_weight=self.sample_weight)
        self.ret["tiMetrics"]["lift"] = make_lift_score(self.modeling_params["metrics"])(
            self.valid_y, self.probas, sample_weight=self.sample_weight)
        if custom_scorefunc is not None and custom_needsproba:
            ret = custom_scorefunc(self.valid_y, self.probas, sample_weight=self.sample_weight)
            if ret is None:
                ret = 0
            self.ret["tiMetrics"]["customScore"] = ret

        # ROC and Lift for proba-aware classifiers
        false_positive_rates, true_positive_rates, thresholds = roc_curve(self.valid_y, self.probas[:, 1],
                                                                          sample_weight=self.sample_weight)
        # Full ROC curve data
        roc_data = zip(false_positive_rates, true_positive_rates, thresholds)
        # Trim the data as we don't need all points for visualization,
        # in a single-element array for k-fold compatibility
        self.ret["rocVizData"] = [[{"x": x, "y": y, "p": p} for (x, y, p) in trim_curve(roc_data)]]

        predicted = pd.Series(data=self.probas[:, 1], name='predicted')
        with_weight = self.sample_weight is not None
        if with_weight:
            results = pd.DataFrame({"__target__": self.valid_y,
                                    "sample_weight": self.sample_weight}).join(predicted)
        else:
            results = pd.DataFrame({"__target__": self.valid_y}).join(predicted)
        lb = LiftBuilder(results, '__target__', 'predicted', with_weight)
        try:
            self.ret["liftVizData"] = lb.build()
        except Exception:
            logging.exception("Cannot compute Lift curve")

        # Probability density per actual class
        self.ret["densityData"] = format_all_proba_density(self.classes, self.target_map, self.probas,
                                                           self.valid_y, self.sample_weight)

        freqs, avg_preds, weights = dku_calibration_curve(self.valid_y.values, self.probas[:, 1],
                                                          sample_weight=self.sample_weight, n_bins=10)
        zipped = [(t, p, n) for (t, p, n) in zip(freqs, avg_preds, weights) if not np.isnan(t + p + n)]
        self.ret["calibrationData"] = [{"y": 0, "x": 0, "n": 0}] \
            + [{"y": t, "x": p, "n": n} for (t, p, n) in zipped] \
            + [{"y": 1, "x": 1, "n": 0}]
        self.ret["tiMetrics"]["calibrationLoss"] = dku_nonan(dku_calibration_loss(
            [x[0] for x in zipped], [x[1] for x in zipped], [x[2] for x in zipped]))

    # if self.probas is not None:
    #     self.add_metric("ROC - AUC Score", mroc_auc_score(self.valid_Y, self.probas), "From 0.5 (random model) to 1 (perfect model).")
    # if not self.multiclass and self.probas is not None:
    #     self.add_metric('Average Precision Score', average_precision_score(self.valid_Y, self.probas[:, 1]), "Average precision for all classes")
    # self.add_metric('Accuracy Score', accuracy_score(self.valid_Y, self.preds), "Proportion of correct predictions (positive and negative) in the sample")
    # self.add_metric('F1 Score', f1_score(self.valid_Y, self.preds), "Harmonic mean of Precision and Recall")
    # self.add_metric('Precision Score', precision_score(self.valid_Y, self.preds), "Proportion of correct 'positive' predictions in the sample")
    # self.add_metric('Recall Score', recall_score(self.valid_Y, self.preds), "Proportion of caught 'positive' actual records in the predictions")
    # # self.add_metric('Hinge Loss', hinge_loss(self.valid_Y, self.preds))
    # if not self.multiclass:
    #     self.add_metric('Matthews Correlation Coefficient', matthews_corrcoef(self.valid_Y, self.preds), "The MCC is a correlation coefficient between actual and predicted classifications; +1 is perfect, -1 means no correlation")
    #     self.add_metric('Hamming Loss', hamming_loss(self.valid_Y, self.preds), "The Hamming loss is the fraction of labels that are incorrectly predicted. (The lower the better)")
    #     # self.add_metric('Jaccard Similarity Score', jaccard_similarity_score(self.valid_Y, self.preds))
    #     # self.add_metric('Zero One Loss', zero_one_loss(self.valid_Y, self.preds))
    # if self.probas is not None:
    #     self.add_metric('Log Loss', log_loss(self.valid_Y.values, self.probas), "Error metric that takes into account the predicted probabilities")

    # Dump the predicted set
    if self.valid_X_index is not None:
        if self.use_probas:
            proba_df = pd.DataFrame(self.probas, columns=["proba_%s" % x for x in self.classes])
            # Realign
            proba_df.index = self.valid_X_index
            full = pd.DataFrame(index=self.test_df_index)
            proba_df = full.join(proba_df, how="left")
            proba_df.to_csv(self.out_folder + "/predicted.csv", sep="\t", header=True, index=False, encoding='utf-8')
            self.predicted_df = proba_df
        else:
            preds_remapped = np.zeros(self.preds.shape, dtype="object")
            for (mapped_value, original_value) in self.inv_map.items():
                idx = (self.preds.values == mapped_value)
                preds_remapped[idx] = original_value
            pred_df = pd.DataFrame({"prediction": preds_remapped})
            # Realign
            pred_df.index = self.valid_X_index
            full = pd.DataFrame(index=self.test_df_index)
            pred_df = full.join(pred_df, how="left")
            pred_df.to_csv(self.out_folder + "/predicted.csv", sep="\t", header=True, index=False, encoding='utf-8')
            self.predicted_df = pred_df

    # Dump the perf
    self.ret = remove_all_nan(self.ret)
    dkujson.dump_to_filepath(osp.join(self.out_folder, "perf.json"), self.ret)
    self.perf_data = self.ret
    return self.ret
def score(self):
    logging.info("Clustering scoring: Starting work")
    nb_clusters = len(np.unique(self.cluster_labels))

    # Metrics
    if hasattr(self.cluster_model, "inertia_"):
        self.ret["metrics"]["inertia"] = dku_nonan(self.cluster_model.inertia_)
    if nb_clusters > 1:
        self.ret["metrics"]["silhouette"] = self.silhouette_score()
    self.ret["metrics"]["nbClusters"] = dku_nonan(nb_clusters)

    # Importance
    self.ret["variables_importance"] = self.variables_importance()

    # Build profiling_df
    logging.info("Clustering scoring: building final profiling_df")
    cluster_labels = self.cluster_labels.map(lambda x: self.cluster_names[x])
    # Keep only cluster_names that actually appear in cluster_labels
    self.cluster_names = [cn for cn in self.cluster_names if cn in cluster_labels.unique()]
    self.profiling_df = self.profiling_df.join(cluster_labels)

    if set(self.train.columns).intersection(self.profiling_df.columns):
        # There was no PCA, so we append all columns from train to profiling to get the dummies
        self.ret["reduce_vars"] = []
        train_with_suffixed = self.train.copy(False)
        train_with_suffixed.columns = [u"%s__fromtrain" % x for x in train_with_suffixed.columns]
        self.profiling_df = self.profiling_df.join(train_with_suffixed)
    else:
        # There was a PCA, so train only contains the PCA columns.
        self.ret["reduce_vars"] = list(self.train.columns)
        # We append train to get the factors in the scatter plot
        self.profiling_df = self.profiling_df.join(self.train)
        # We append the pre-PCA data to profiling for the dummies
        train_with_suffixed = self.train_prepca.copy(False)
        train_with_suffixed.columns = [u"%s__fromtrain" % x for x in train_with_suffixed.columns]
        self.profiling_df = self.profiling_df.join(train_with_suffixed)

    # Dedup ...
    # I find it very stupid to have to do that while I just wanted to add some columns ...
    self.profiling_df = self.profiling_df[
        list(filter(lambda x: not x.endswith("__fromtrain"), self.profiling_df.columns))]

    self.nfact = self.profiling_df.columns
    nb_outliers = self.profiling_df.shape[0] - self.train.shape[0]
    self.fact = ['cluster_labels']
    logging.info("shape of train : %i,%i" % self.train.shape)
    logging.info("shape of global dataframe : %i,%i" % self.profiling_df.shape)

    add_cluster_outliers_label = False
    if self.preprocessing_params["outliers"]["method"] == "DROP":
        pass
        # self.profiling_df['cluster_labels'].dropna(inplace=True)
    elif self.preprocessing_params["outliers"]["method"] == "CLUSTER" \
            and self.profiling_df['cluster_labels'].isnull().sum() > 0:
        self.profiling_df['cluster_labels'].fillna(constants.CLUSTER_OUTLIERS, inplace=True)
        add_cluster_outliers_label = True

    self.ret.update({
        "train_nb_records": self.train.shape[0],
        "train_nb_features": self.train.shape[1],
        "train_nb_outliers": nb_outliers
    })

    logging.info("Clustering scorer: final profiling_df %s" % str(self.profiling_df.shape))

    labels_df = pd.DataFrame({"cluster_labels": self.profiling_df["cluster_labels"]})
    # logging.info("Clustering scorer: labels_df: %s" % labels_df)
    # Realign
    # labels_df.index = self.transformed_source.index
    full = pd.DataFrame(index=self.source_index)
    labels_df = full.join(labels_df, how="left")

    # If the model has additional scoring columns, fetch them
    if hasattr(self.cluster_model, "get_additional_scoring_columns"):
        additional_scoring_columns = self.cluster_model.get_additional_scoring_columns(self.train)
        labels_df = labels_df.join(additional_scoring_columns, how="left")

    labels_df.to_csv(self.results_path + "/clustered.csv", sep="\t", header=True, index=False, encoding='utf-8')

    self.cluster_labels = self.cluster_names  # this was and remains awful
    if add_cluster_outliers_label:
        self.cluster_labels.append(constants.CLUSTER_OUTLIERS)

    self.cluster_description()
    self.cluster_profiling()
    self.cluster_summary()
    logging.info("Done cluster desc/profiling/summary")
    self.build_scatter()
    self.build_numerical_cluster_stats()
    # If there is only one cluster, the heatmap is irrelevant
    if len(self.cluster_names) > 1:
        self.build_heatmap()
    self.build_facts()

    dkujson.dump_to_filepath(self.pk_path('results.json'), self.ret)

    # Intrinsic scoring
    IntrinsicClusteringModelScorer(self.modeling_params, self.cluster_model, self.train, self.pipeline,
                                   self.results_path, self.profiling_df).score()
def update_deep_learning_model_info(folder, model_info):
    status_filepath_tmp = osp.join(folder, "keras_model_training_info.json.tmp")
    status_filepath = osp.join(folder, "keras_model_training_info.json")
    dkujson.dump_to_filepath(status_filepath_tmp, model_info)
    os.rename(status_filepath_tmp, status_filepath)
def cluster_profiling(self):
    cluster_profiling = []
    # aggs = [np.min, np.max, np.median, percentile(25), percentile(75)]

    def profile_numerical(vals, scale):
        vals = np.array(vals)
        vals_no_nan = vals[~np.isnan(vals)]
        nb_rows = vals_no_nan.shape[0]
        if nb_rows < 2:
            return {
                "min": None,
                "max": None,
                "median": None,
                "percentile25": None,
                "percentile75": None,
                "percentile9": None,
                "percentile91": None,
                "std": None,
                "distribution": None,
                "total_no_nan": nb_rows,
                "max_ratio": 0.0,
                "total": vals.shape[0]
            }
        else:
            percentile = make_percentile(vals_no_nan)
            distribution = np.histogram(vals_no_nan, scale)[0]
            max_ratio = distribution.max() / float(nb_rows)
            # TODO use the interpolation option in numpy 1.9
            return {
                "min": np.min(vals_no_nan),
                "max": np.max(vals_no_nan),
                "median": float(percentile(50)),
                "percentile25": float(percentile(25)),
                "percentile75": float(percentile(75)),
                "percentile9": float(percentile(9)),
                "percentile91": float(percentile(91)),
                "std": np.std(vals_no_nan),
                "distribution": distribution,
                "max_ratio": max_ratio,
                "total_no_nan": nb_rows,
                "total": vals.shape[0]
            }

    def profile_categorical(vals, categories):
        nb_rows = vals.shape[0]
        if nb_rows == 0:
            return {
                "distribution": None,
                "max_ratio": 0.0,
                "total_no_nan": nb_rows,
                "total": nb_rows
            }
        else:
            counts = value_counts(vals, n_most_common=30)
            distribution = [
                {
                    "label": category,
                    "total_no_nan": counts.get(category, 0),
                    "ratio": counts.get(category, 0) / float(nb_rows)
                }
                for category in categories
            ]
            max_ratio = max(counts.values()) / float(nb_rows)
            return {
                "distribution": distribution,
                "max_ratio": max_ratio,
                "total": nb_rows,
                "total_no_nan": nb_rows
            }

    # Add source variables
    if len(self.nfact) >= 2:  # because 'cluster_labels' is in it anyway
        profiling_df = self.profiling_df[self.nfact]
        cluster_labels = profiling_df["cluster_labels"]
        cluster_names = self.cluster_labels  # sorted(np.unique(cluster_labels))
        for col in profiling_df.columns:
            logging.info("Study profiling column: %s dtype=%s" % (col, profiling_df[col].dtype))
            if col == "cluster_labels":
                continue
            if col.startswith("factor_"):
                continue
            if col.startswith("dummy:"):
                continue
            col_profiling = {"variable": col}
            per_cluster = []
            col_profiling["per_cluster"] = per_cluster
            if float in profiling_df[col].dtype.type.mro() or int in profiling_df[col].dtype.type.mro():
                logging.info("  It's a float")
                col_profiling["type"] = "numerical"
                cluster_profiling.append(col_profiling)
                col_vals = profiling_df[col]
                col_vals_no_na = no_nan(col_vals)
                percentile = make_percentile(col_vals_no_na)
                scale_start = percentile(0)
                scale_stop = percentile(100)
                max_ratio = 0.01
                col_profiling["scale"] = {
                    "min": scale_start,
                    "max": scale_stop,
                }
                if scale_stop - scale_start == 0:
                    logging.info("This variable has no variance")
                    col_profiling["no_variance"] = True
                    continue
                scale = np.linspace(scale_start, scale_stop, num=61)
                col_profiling["global"] = profile_numerical(col_vals, scale)
                max_ratio = max(max_ratio, col_profiling["global"]["max_ratio"])
                for cluster_label in cluster_names:
                    filtered_col_vals = np.array(col_vals[cluster_labels == cluster_label])
                    cluster_profile = profile_numerical(filtered_col_vals, scale)
                    max_ratio = max(max_ratio, cluster_profile["max_ratio"])
                    cluster_profile["cluster_name"] = cluster_label
                    per_cluster.append(cluster_profile)
                col_profiling["scale"]["max_ratio"] = max_ratio
            else:
                col_profiling["type"] = "categorical"
                logging.info("  It's a cat")
                # Categorical stuff
                col_vals = profiling_df[col]
                global_counts = value_counts(col_vals, n_most_common=30)
                # global_counts contains the counts for the category values we break down on
                mask = col_vals.isin(global_counts.keys())
                if None in global_counts:
                    mask |= col_vals.isnull()
                col_vals = col_vals[mask]
                cluster_profiling.append(col_profiling)
                col_profiling["global"] = profile_categorical(col_vals, global_counts.keys())
                max_ratio = 0.0
                for cluster_label in cluster_names:
                    filtered_col_vals = col_vals[cluster_labels == cluster_label]
                    cluster_profile = profile_categorical(filtered_col_vals, global_counts.keys())
                    cluster_profile["cluster_name"] = cluster_label
                    max_ratio = max(max_ratio, cluster_profile["max_ratio"])
                    per_cluster.append(cluster_profile)
                scale = {"max_ratio": max_ratio}
                col_profiling["scale"] = scale
                scale["categories"] = list(global_counts.keys())

    dkujson.dump_to_filepath(self.pk_path('profiling.json'), cluster_profiling)
    logging.info("DONE cluster profiling")