def main(exec_folder, output_dataset, keptInputColumns):
    start = unix_time_millis()
    listener = ProgressListener()

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    preprocessing_params = json.load(open(osp.join(exec_folder, "rpreprocessing_params.json")))
    modeling_params = json.load(open(osp.join(exec_folder, "rmodeling_params.json")))

    with listener.push_state(constants.STATE_LOADING_SRC):
        input_df = df_from_split_desc_no_normalization(split_desc, "full", preprocessing_params["per_feature"])
        logging.info("Loaded full df: shape=(%d,%d)" % input_df.shape)
        input_df_orig = input_df.copy()
        input_df = utils.normalize_dataframe(input_df, preprocessing_params["per_feature"])

    with listener.push_state("Collecting preprocessing data"):
        collector = ClusteringPreprocessingDataCollector(input_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params, exec_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with listener.push_state("Preprocessing data"):
        transformed_train = pipeline.fit_and_process(input_df)

    start_train = unix_time_millis()

    (clf, actual_params, cluster_labels, additional_columns) = clustering_fit(modeling_params, transformed_train)

    # If the model has custom labels, use them
    try:
        cluster_names = clf.get_cluster_labels()
    except AttributeError:
        cluster_names = ["cluster_%s" % i for i in range(len(np.unique(cluster_labels)))]

    cl = pd.Series(data=cluster_labels, name="cluster_labels").map(lambda i: cluster_names[i])
    cl.index = transformed_train["TRAIN"].index

    final_df = pd.concat([input_df_orig.join(cl, how='left'), additional_columns], axis=1)

    if keptInputColumns is not None:
        final_df = final_df[keptInputColumns + ['cluster_labels']]

    if preprocessing_params["outliers"]["method"] == "CLUSTER":
        final_df['cluster_labels'].fillna(constants.CLUSTER_OUTLIERS, inplace=True)

    dataiku.Dataset(output_dataset).write_from_dataframe(final_df)

    end = unix_time_millis()
    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
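
# Illustrative usage sketch for the clustering-recipe entry point above (not part of the original
# script; the folder and dataset names below are hypothetical). The exec_folder is assumed to be
# laid out by the backend and to contain _esplit.json, rpreprocessing_params.json and
# rmodeling_params.json:
#
#   main("/path/to/exec_folder", "clustered_output", keptInputColumns=None)
#
# With keptInputColumns=None, every input column is kept next to the generated "cluster_labels"
# column (plus any model-specific additional columns); passing a list restricts the output to
# those columns plus "cluster_labels".
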
def main(exec_folder):
    start = unix_time_millis()
    listener = ProgressListener()

    def update_fn():
        utils.write_running_traininfo(exec_folder, start, listener)

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    preprocessing_params = json.load(open(osp.join(exec_folder, "rpreprocessing_params.json")))
    modeling_params = json.load(open(osp.join(exec_folder, "rmodeling_params.json")))

    with listener.push_state(constants.STATE_LOADING_SRC):
        update_fn()
        train_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"])
        logging.info("Loaded full df: shape=(%d,%d)" % train_df.shape)

    with listener.push_state("Collecting preprocessing data"):
        update_fn()
        collector = ClusteringPreprocessingDataCollector(train_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params, exec_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with listener.push_state("Preprocessing data"):
        orig_index = train_df.index.copy()
        transformed_train = pipeline.fit_and_process(train_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    start_train = unix_time_millis()

    clustering_train_score_save(transformed_train, orig_index, preprocessing_params, modeling_params,
                                exec_folder, listener, update_fn, pipeline)

    end = unix_time_millis()
    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
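
# The ProgressListener / update_fn pattern used above (and in the entry points below) works roughly
# as follows: push_state is used as a context manager that records the step currently running, and
# update_fn persists the listener state to disk (via utils.write_running_traininfo or
# utils.write_model_status) so the backend can poll progress. A minimal sketch:
#
#   listener = ProgressListener()
#   with listener.push_state("Preprocessing data"):
#       update_fn()   # write the "running" train info for this step
#       ...           # do the actual work
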
def main(exec_folder, selection_state_folder, operation_mode):
    """The whole execution of the saved model train takes place in a single folder?"""
    start = unix_time_millis()
    start_train = start
    listener = ProgressListener()

    def update_fn():
        utils.write_running_traininfo(exec_folder, start, listener)

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    core_params = json.load(open(osp.join(exec_folder, "core_params.json")))
    preprocessing_params = json.load(open(osp.join(exec_folder, "rpreprocessing_params.json")))

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    calibrate_proba = core_params.get("calibration", {}).get("calibrationMethod", None) in ["SIGMOID", "ISOTONIC"]

    modeling_params = json.load(open(osp.join(exec_folder, "rmodeling_params.json")))

    # For the KERAS backend, special features must be tagged, because they are only handled by the
    # process function, not by fit_and_process
    if modeling_params["algorithm"] == "KERAS_CODE":
        tag_special_features(preprocessing_params['per_feature'])

    def do_full_fit_and_save():
        """Fit on 100% of the data and save the clf and out params"""
        with listener.push_state(constants.STATE_LOADING_TRAIN):
            update_fn()
            full_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                         core_params["prediction_type"])
            logging.info("Loaded FULL df: shape=(%d,%d)" % full_df.shape)

        with listener.push_state("Collecting preprocessing data"):
            update_fn()
            collector = ClusteringPreprocessingDataCollector(full_df, preprocessing_params)
            collector_data = collector.build()

        pipeline, preproc_handler = build_pipeline_and_handler(
            collector_data, core_params, exec_folder, preprocessing_params,
            selection_state_folder=selection_state_folder,
            allow_empty_mf=modeling_params["algorithm"] == "KERAS_CODE")  # TODO

        if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
            target_map = preproc_handler.target_map
        else:
            target_map = None

        with listener.push_state("Preprocessing full set"):
            preprocessor_fit_full_df = full_df
            # For the KERAS backend, we might need to take a subsample of the input_df to prevent memory errors
            if modeling_params["algorithm"] == "KERAS_CODE":
                need_subsampling = preprocessing_params["preprocessingFitSampleRatio"] < 1
                full_df_orig = full_df.copy()
                if need_subsampling:
                    preprocessor_fit_full_df = preprocessor_fit_full_df.sample(
                        frac=preprocessing_params["preprocessingFitSampleRatio"],
                        random_state=preprocessing_params["preprocessingFitSampleSeed"])
            transformed_full = pipeline.fit_and_process(preprocessor_fit_full_df)
            if with_sample_weight:
                assert transformed_full["weight"].values.min() > 0, "Sample weights must be positive"

        preproc_handler.save_data()
        preproc_handler.report(pipeline)

        if modeling_params["algorithm"] == "KERAS_CODE":
            modeling_set = {"run_folder": exec_folder, "listener": listener}

            def update_modeling_state():
                status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                      modeling_set["listener"])
                utils.write_model_status(modeling_set, status)

            empty_df = pd.DataFrame()
            return prediction_train_model_keras(transformed_full, full_df_orig, empty_df, pipeline,
                                                modeling_params, core_params,
                                                preprocessing_params["per_feature"], exec_folder,
                                                listener, update_modeling_state,
                                                preproc_handler.target_map,
                                                pipeline.generated_features_mapping)
        else:
            return fit_score_save(pipeline, target_map, transformed_full)

    def fit_score_save(pipeline, target_map, transformed_full):
        with listener.push_state(constants.STATE_FITTING):
            update_fn()
            if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
                (clf, out_params, prepared_X, iipd) = classification_fit(
                    modeling_params, split_desc, transformed_full, core_params["prediction_type"],
                    exec_folder, target_map=target_map,
                    with_sample_weight=with_sample_weight, with_class_weight=with_class_weight)

                if calibrate_proba:
                    method = core_params.get("calibration", {}).get("calibrationMethod").lower()
                    calibrated_clf = CalibratedClassifierCV(clf, cv="prefit", method=method)
                    test_X = transformed_full["TRAIN"]
                    test_X, is_sparse = prepare_multiframe(test_X, modeling_params)
                    test_y = transformed_full["target"].astype(int)
                    if with_sample_weight:
                        test_weight = transformed_full["weight"].astype(float)
                        calibrated_clf.fit(test_X, test_y, sample_weight=test_weight)
                    else:
                        calibrated_clf.fit(test_X, test_y)
                    clf = calibrated_clf
            else:
                (clf, out_params, prepared_X, iipd) = regression_fit_single(
                    modeling_params, split_desc, transformed_full, exec_folder,
                    with_sample_weight=with_sample_weight)

        with listener.push_state(constants.STATE_SAVING):
            save_prediction_model(clf, out_params, listener, update_fn, exec_folder)

        with listener.push_state(constants.STATE_SCORING):
            train_X = transformed_full["TRAIN"]
            train_y = transformed_full["target"]
            if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
                ClassificationModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                   exec_folder, prepared_X, iipd, calibrate_proba).score()
                BinaryModelSerializer(train_X.columns(), clf, modeling_params, exec_folder,
                                      target_map, calibrate_proba).serialize()
            elif core_params["prediction_type"] == constants.MULTICLASS:
                ClassificationModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                   exec_folder, prepared_X, iipd, calibrate_proba).score()
                MulticlassModelSerializer(train_X.columns(), clf, modeling_params, exec_folder,
                                          target_map, calibrate_proba).serialize()
            else:
                RegressionModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                               exec_folder, prepared_X, iipd).score()
                RegressionModelSerializer(train_X.columns(), clf, modeling_params, exec_folder).serialize()

        return out_params

    if operation_mode == "TRAIN_SPLITTED_ONLY":
        with listener.push_state(constants.STATE_LOADING_TRAIN):
            update_fn()
            train_df = df_from_split_desc(split_desc, "train", preprocessing_params["per_feature"],
                                          core_params["prediction_type"])
            logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

        with listener.push_state(constants.STATE_LOADING_TEST):
            update_fn()
            test_df = df_from_split_desc(split_desc, "test", preprocessing_params["per_feature"],
                                         core_params["prediction_type"])
            logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

        with listener.push_state("Collecting preprocessing data"):
            update_fn()
            collector = PredictionPreprocessingDataCollector(train_df, preprocessing_params)
            collector_data = collector.build()

        pipeline, preproc_handler = build_pipeline_and_handler(
            collector_data, core_params, exec_folder, preprocessing_params,
            selection_state_folder=selection_state_folder,
            allow_empty_mf=modeling_params["algorithm"] == "KERAS_CODE")  # TODO

        if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
            target_map = preproc_handler.target_map
        else:
            target_map = None

        with listener.push_state("Preprocessing train set"):
            preprocessor_fit_df = train_df
            # For the KERAS backend, we might need to take a subsample of the input_df to prevent memory errors
            if modeling_params["algorithm"] == "KERAS_CODE":
                train_df_orig = train_df.copy()
                need_subsampling = preprocessing_params["preprocessingFitSampleRatio"] < 1
                if need_subsampling:
                    preprocessor_fit_df = preprocessor_fit_df.sample(
                        frac=preprocessing_params["preprocessingFitSampleRatio"],
                        random_state=preprocessing_params["preprocessingFitSampleSeed"])
            transformed_train = pipeline.fit_and_process(preprocessor_fit_df)
            if with_sample_weight:
                assert transformed_train["weight"].values.min() > 0, "Sample weights must be positive"
            preproc_handler.save_data()
            preproc_handler.report(pipeline)

        # For the KERAS backend, the test set cannot be processed directly, because it may contain
        # special features that may not hold in memory
        if modeling_params["algorithm"] != "KERAS_CODE":
            with listener.push_state("Preprocessing test set"):
                test_df_index = test_df.index.copy()
                transformed_test = pipeline.process(test_df)
                if with_sample_weight:
                    assert transformed_test["weight"].values.min() > 0, "Sample weights must be positive"

        if modeling_params["algorithm"] == "PYTHON_ENSEMBLE":
            prediction_train_score_save_ensemble(train_df, test_df, core_params, split_desc, modeling_params,
                                                 exec_folder, listener, target_map, update_fn, pipeline,
                                                 with_sample_weight)
        elif modeling_params["algorithm"] == "KERAS_CODE":
            modeling_set = {"run_folder": exec_folder, "listener": listener}

            def update_modeling_state():
                status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                      modeling_set["listener"])
                utils.write_model_status(modeling_set, status)

            prediction_train_model_keras(transformed_train, train_df_orig, test_df, pipeline,
                                         modeling_params, core_params, preprocessing_params["per_feature"],
                                         exec_folder, listener, update_modeling_state,
                                         preproc_handler.target_map, pipeline.generated_features_mapping)
        else:
            prediction_train_score_save(transformed_train, transformed_test, test_df_index, core_params,
                                        split_desc, modeling_params, exec_folder, listener, target_map,
                                        update_fn, pipeline, exec_folder)

    elif operation_mode == "TRAIN_FULL_ONLY":
        # Not yet functional ...
        do_full_fit_and_save()

    elif operation_mode == "TRAIN_KFOLD":
        out_params = do_full_fit_and_save()
        full_df_clean = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                           core_params["prediction_type"])
        optimized_params = out_params["resolved"]
        logging.info("Regridifying post-train params: %s" % json.dumps(optimized_params))
        # Regridify the optimized params to a unary grid
        optimized_params_grid = intercom.backend_json_call("ml/prediction/regridify-to-pretrain", {
            "preTrain": json.dumps(modeling_params),
            "postTrain": json.dumps(optimized_params)
        })
        logging.info("Using unary grid params: %s" % json.dumps(optimized_params_grid))
        prediction_train_model_kfold(full_df_clean, core_params, split_desc, preprocessing_params,
                                     optimized_params_grid, exec_folder, exec_folder, listener, update_fn,
                                     with_sample_weight, with_class_weight, calibrate_proba)

    else:
        do_full_fit_and_save()

        # Do the split and the scoring, but don't save the data
        with listener.push_state(constants.STATE_LOADING_TRAIN):
            update_fn()
            train_df = df_from_split_desc(split_desc, "train", preprocessing_params["per_feature"],
                                          core_params["prediction_type"])
            logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

        with listener.push_state(constants.STATE_LOADING_TEST):
            update_fn()
            test_df = df_from_split_desc(split_desc, "test", preprocessing_params["per_feature"],
                                         core_params["prediction_type"])
            logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

        with listener.push_state("Collecting preprocessing data"):
            update_fn()
            collector = PredictionPreprocessingDataCollector(train_df, preprocessing_params)
            collector_data = collector.build()

        pipeline, preproc_handler = build_pipeline_and_handler(
            collector_data, core_params, exec_folder, preprocessing_params,
            selection_state_folder=selection_state_folder,
            allow_empty_mf=modeling_params["algorithm"] == "KERAS_CODE")  # TODO

        if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
            target_map = preproc_handler.target_map
        else:
            target_map = None

        with listener.push_state("Preprocessing train set"):
            preprocessor_fit_df = train_df
            # For the KERAS backend, we might need to take a subsample of the input_df to prevent memory errors
            if modeling_params["algorithm"] == "KERAS_CODE":
                need_subsampling = preprocessing_params["preprocessingFitSampleRatio"] < 1
                train_df_orig = train_df.copy()
                if need_subsampling:
                    preprocessor_fit_df = preprocessor_fit_df.sample(
                        frac=preprocessing_params["preprocessingFitSampleRatio"],
                        random_state=preprocessing_params["preprocessingFitSampleSeed"])
            transformed_train = pipeline.fit_and_process(preprocessor_fit_df)

        # For the KERAS backend, the test set cannot be processed directly, because it may contain
        # special features that may not hold in memory
        if modeling_params["algorithm"] != "KERAS_CODE":
            with listener.push_state("Preprocessing test set"):
                test_df_index = test_df.index.copy()
                transformed_test = pipeline.process(test_df)

        if modeling_params["algorithm"] == "KERAS_CODE":
            modeling_set = {"run_folder": exec_folder, "listener": listener}

            def update_modeling_state():
                status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                      modeling_set["listener"])
                utils.write_model_status(modeling_set, status)

            prediction_train_model_keras(transformed_train, train_df_orig, test_df, pipeline,
                                         modeling_params, core_params, preprocessing_params["per_feature"],
                                         exec_folder, listener, update_modeling_state,
                                         preproc_handler.target_map, pipeline.generated_features_mapping,
                                         save_model=False)
        else:
            with listener.push_state(constants.STATE_FITTING):
                update_fn()
                if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
                    (clf, out_params, prepared_X, iipd) = classification_fit(
                        modeling_params, split_desc, transformed_train, core_params["prediction_type"],
                        target_map=target_map, with_sample_weight=with_sample_weight,
                        with_class_weight=with_class_weight)
                else:
                    (clf, out_params, prepared_X, iipd) = regression_fit_single(
                        modeling_params, split_desc, transformed_train, exec_folder,
                        with_sample_weight=with_sample_weight)

            with listener.push_state(constants.STATE_SCORING):
                train_X = transformed_train["TRAIN"]
                train_y = transformed_train["target"]
                if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
                    ClassificationModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                       exec_folder, prepared_X, iipd, calibrate_proba).score()
                    BinaryModelSerializer(train_X.columns(), clf, modeling_params, exec_folder,
                                          target_map).serialize()
                    binary_classification_scorer_with_valid(
                        modeling_params, clf, transformed_test, exec_folder, test_df_index,
                        target_map=target_map, with_sample_weight=with_sample_weight).score()
                elif core_params["prediction_type"] == constants.MULTICLASS:
                    ClassificationModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                       exec_folder, prepared_X, iipd, calibrate_proba).score()
                    MulticlassModelSerializer(train_X.columns(), clf, modeling_params, exec_folder,
                                              target_map).serialize()
                    multiclass_scorer_with_valid(
                        modeling_params, clf, transformed_test, exec_folder, test_df_index,
                        target_map=target_map, with_sample_weight=with_sample_weight).score()
                else:
                    RegressionModelIntrinsicScorer(modeling_params, clf, train_X, train_y, pipeline,
                                                   exec_folder, prepared_X, iipd).score()
                    RegressionModelSerializer(train_X.columns(), clf, modeling_params,
                                              exec_folder).serialize()
                    regression_scorer_with_valid(modeling_params, clf, transformed_test, exec_folder,
                                                 test_df_index, with_sample_weight).score()

    end = unix_time_millis()
    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
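
# Dispatch summary for the prediction-train entry point above, as implemented by its branches:
#   "TRAIN_SPLITTED_ONLY" -> fit on the train split, preprocess and score the test split, save the model
#   "TRAIN_FULL_ONLY"     -> do_full_fit_and_save() only (marked "not yet functional" above)
#   "TRAIN_KFOLD"         -> fit and save on 100% of the data, then re-train per fold via
#                            prediction_train_model_kfold using the regridified optimal parameters
#   any other mode        -> fit and save on 100% of the data, then redo a train/test split and score
#                            it without saving the split models
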
def main(model_folder, input_dataset_smartname, output_dataset_smartname, recipe_desc, script,
         preparation_output_schema, cond_outputs=None):
    # Obtain a streamed result of the preparation
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" % preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"], preparation_output_schema)

    listener = ProgressListener()

    core_params = dkujson.load_from_filepath(osp.join(model_folder, "core_params.json"))
    preprocessing_params = dkujson.load_from_filepath(osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "actual_params.json"))["resolved"]
    collector_data = dkujson.load_from_filepath(osp.join(model_folder, "collector_data.json"))

    preprocessing_handler = PreprocessingHandler.build(core_params, preprocessing_params, model_folder)
    preprocessing_handler.collector_data = collector_data
    pipeline = preprocessing_handler.build_preprocessing_pipeline()

    batch_size = recipe_desc.get("pythonBatchSize", 100000)
    logging.info("Scoring with batch size: {}".format(batch_size))

    with open(osp.join(model_folder, "clf.pkl"), "rb") as f:
        clf = pickle.load(f)

    def output_generator():
        logging.info("Start output generator ...")
        (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
            preparation_output_schema["columns"], parse_dates=True, infer_with_pandas=False)
        logging.info("Reading with INITIAL dtypes: %s" % dtypes)
        dtypes = utils.ml_dtypes_from_dss_schema(preparation_output_schema,
                                                 preprocessing_params["per_feature"],
                                                 prediction_type=core_params["prediction_type"])
        logging.info("Reading with dtypes: %s" % dtypes)

        for i in xrange(0, len(names)):
            logging.info("Column %s = %s (dtype=%s)" % (i, names[i], dtypes.get(names[i], None)))

        for input_df in input_dataset.iter_dataframes_forced_types(names, dtypes, parse_date_columns,
                                                                   chunksize=batch_size,
                                                                   float_precision="round_trip"):
            input_df.index = range(input_df.shape[0])
            input_df_orig = input_df.copy()
            logging.info("Got a dataframe: %s" % str(input_df.shape))
            normalize_dataframe(input_df, preprocessing_params['per_feature'])

            for col in input_df:
                logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

            logging.info("Processing it")
            logging.info("Predicting it")

            if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
                pred_df = binary_classification_predict(
                    clf, pipeline, modeling_params, preprocessing_params,
                    preprocessing_handler.target_map,
                    recipe_desc["forcedClassifierThreshold"], input_df,
                    output_probas=recipe_desc["outputProbabilities"])
                # Probability percentile & conditional outputs
                pred_df = binary_classif_scoring_add_percentile_and_cond_outputs(
                    pred_df, recipe_desc, model_folder, cond_outputs, preprocessing_handler.target_map)
            elif core_params["prediction_type"] == constants.MULTICLASS:
                pred_df = multiclass_predict(clf, pipeline, modeling_params, preprocessing_params,
                                             preprocessing_handler.target_map, input_df,
                                             output_probas=recipe_desc["outputProbabilities"])
            elif core_params["prediction_type"] == constants.REGRESSION:
                pred_df = regression_predict(clf, pipeline, modeling_params, input_df)
            else:
                raise ValueError("bad prediction type %s" % core_params["prediction_type"])

            logging.info("pred df debug :")
            logging.info(pred_df)
            logging.info("Done predicting it")

            if recipe_desc.get("filterInputColumns", False):
                clean_kept_columns = [c for c in recipe_desc["keptInputColumns"] if c not in pred_df.columns]
            else:
                clean_kept_columns = [c for c in input_df_orig.columns if c not in pred_df.columns]

            yield pd.concat([input_df_orig[clean_kept_columns], pred_df], axis=1)

    output_dataset = dataiku.Dataset(output_dataset_smartname)
    logging.info("Starting writer")
    with output_dataset.get_writer() as writer:
        i = 0
        logging.info("Starting to iterate")
        for output_df in output_generator():
            logging.info("Generator generated a df %s" % str(output_df.shape))
            # if i == 0:
            #     output_dataset.write_schema_from_dataframe(output_df)
            i = i + 1
            writer.write_dataframe(output_df)
            logging.info("Output df written")
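
# Minimal sketch of the chunked scoring loop above, assuming the same dataiku.Dataset API
# (score_chunk is a hypothetical stand-in for the per-chunk predict + concat logic):
#
#   with output_dataset.get_writer() as writer:
#       for chunk in input_dataset.iter_dataframes_forced_types(names, dtypes, parse_date_columns,
#                                                               chunksize=batch_size):
#           writer.write_dataframe(score_chunk(chunk))
#
# Resetting each chunk's index (input_df.index = range(input_df.shape[0])) is what keeps
# input_df_orig and pred_df aligned when they are concatenated with pd.concat(..., axis=1).
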
def main(model_folder, input_dataset_smartname, output_dataset_smartname, recipe_desc, script,
         preparation_output_schema):
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" % preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"], preparation_output_schema)

    listener = ProgressListener()

    preprocessing_params = dkujson.load_from_filepath(osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "actual_params.json"))["resolved"]
    collector_data = dkujson.load_from_filepath(osp.join(model_folder, "collector_data.json"))

    # Name remapping
    user_meta = dkujson.load_from_filepath(osp.join(model_folder, "user_meta.json"))
    cluster_name_map = {}
    if "clusterMetas" in user_meta:
        logging.info("Cluster metas: %s" % user_meta["clusterMetas"])
        for (cluster_id, cluster_data) in user_meta["clusterMetas"].items():
            cluster_name_map[cluster_id] = cluster_data["name"]

    preprocessing_handler = ClusteringPreprocessingHandler({}, preprocessing_params, model_folder)
    preprocessing_handler.collector_data = collector_data
    pipeline = preprocessing_handler.build_preprocessing_pipeline()

    with open(osp.join(model_folder, "clusterer.pkl"), "rb") as f:
        clf = pickle.load(f)

    try:
        logging.info("Post-processing model")
        clf.post_process(user_meta)
    except AttributeError:
        # The method does not exist if the model cannot be post-processed, so just pass
        pass

    try:
        custom_labels = clf.get_cluster_labels()

        def map_fun_custom(i):
            name = custom_labels[i]
            return cluster_name_map.get(name, name)

        naming = map_fun_custom
    except AttributeError:
        def map_fun(i):
            name = "cluster_%i" % i
            return cluster_name_map.get(name, name)

        naming = map_fun

    def output_generator():
        logging.info("Start output generator ...")
        (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
            preparation_output_schema["columns"], parse_dates=True, infer_with_pandas=False)
        logging.info("Reading with INITIAL dtypes: %s" % dtypes)
        dtypes = utils.ml_dtypes_from_dss_schema(preparation_output_schema,
                                                 preprocessing_params["per_feature"])
        logging.info("Reading with dtypes: %s" % dtypes)

        for input_df in input_dataset.iter_dataframes_forced_types(names, dtypes, parse_date_columns,
                                                                   chunksize=100000):
            input_df.index = range(input_df.shape[0])
            input_df_orig = input_df.copy()
            if recipe_desc.get("filterInputColumns", False):
                input_df_orig = input_df_orig[recipe_desc["keptInputColumns"]]
            logging.info("Got a dataframe: %s" % str(input_df.shape))
            normalize_dataframe(input_df, preprocessing_params['per_feature'])

            for col in input_df:
                logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

            logging.info("Processing it")
            transformed = pipeline.process(input_df)

            logging.info("Applying it")
            (labels_arr, additional_columns) = clustering_predict(modeling_params, clf, transformed)
            cluster_labels = pd.Series(labels_arr, name="cluster_labels").map(naming)
            cluster_labels.index = transformed["TRAIN"].index

            final_df = pd.concat([input_df_orig.join(cluster_labels, how='left'), additional_columns], axis=1)

            if preprocessing_params["outliers"]["method"] == "CLUSTER":
                outliers_cluster_name = cluster_name_map.get(constants.CLUSTER_OUTLIERS,
                                                             constants.CLUSTER_OUTLIERS)
                final_df['cluster_labels'].fillna(outliers_cluster_name, inplace=True)

            logging.info("Done predicting it")
            yield final_df

    output_dataset = dataiku.Dataset(output_dataset_smartname)
    logging.info("Starting writer")
    with output_dataset.get_writer() as writer:
        i = 0
        logging.info("Starting to iterate")
        for output_df in output_generator():
            logging.info("Generator generated a df %s" % str(output_df.shape))
            # if i == 0:
            #     output_dataset.write_schema_from_dataframe(output_df)
            i = i + 1
            writer.write_dataframe(output_df)
            logging.info("Output df written")
def train_prediction_kfold(core_params, preprocessing_set, split_desc):
    start = unix_time_millis()
    preprocessing_params = preprocessing_set['preprocessing_params']
    modeling_sets = preprocessing_set["modelingSets"]
    logging.info("PPS is %s" % preprocessing_params)

    preprocessing_listener = ProgressListener()
    preprocessing_listener.add_future_steps(constants.PRED_KFOLD_PREPROCESSING_STATES)

    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.PRED_KFOLD_TRAIN_STATES)
        modeling_set["listener"] = listener

    def update_one_preprocessing_state(modeling_set):
        status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                              (preprocessing_listener, modeling_set["listener"]))
        utils.write_model_status(modeling_set, status)

    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            update_one_preprocessing_state(modeling_set)

    with preprocessing_listener.push_state(constants.STATE_LOADING_SRC):
        update_preprocessing_state()
        full_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                     core_params["prediction_type"])
        logging.info("Loaded full_df df: shape=(%d,%d)" % full_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(full_df, preprocessing_params)
        collector_data = collector.build()

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params,
                                                           preprocessing_set['run_folder'],
                                                           preprocessing_params)

    with preprocessing_listener.push_state(constants.KFOLDSTATE_PREPROCESS_GLOBAL):
        update_preprocessing_state()
        transformed_full = pipeline.fit_and_process(full_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    train_X = transformed_full["TRAIN"]
    train_y = transformed_full["target"]

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    calibrate_proba = core_params.get("calibration", {}).get("calibrationMethod", None) in ["SIGMOID", "ISOTONIC"]

    if with_sample_weight:
        assert transformed_full["weight"].values.min() > 0, "Sample weights must be positive"

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()
        update_fn = lambda: update_one_preprocessing_state(modeling_set)

        if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_FITTING_GLOBAL):
                # No out-fold is available, so calibrate through classification_fit on a random split
                if calibrate_proba:
                    calibration_method = core_params.get("calibration", {}).get("calibrationMethod").lower()
                else:
                    calibration_method = None
                update_one_preprocessing_state(modeling_set)
                (clf, out_params, prepared_X, iipd) = classification_fit(
                    modeling_set['modelingParams'], split_desc, transformed_full,
                    core_params["prediction_type"], modeling_set['run_folder'],
                    target_map=preproc_handler.target_map,
                    with_sample_weight=with_sample_weight,
                    with_class_weight=with_class_weight,
                    calibration=calibration_method)
            save_prediction_model(clf, out_params, modeling_set["listener"], update_fn,
                                  modeling_set['run_folder'])
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_SCORING_GLOBAL):
                update_one_preprocessing_state(modeling_set)
                ClassificationModelIntrinsicScorer(modeling_set['modelingParams'], clf, train_X, train_y,
                                                   pipeline, modeling_set['run_folder'], prepared_X, iipd,
                                                   calibrate_proba).score()
                if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
                    BinaryModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                          modeling_set['run_folder'], preproc_handler.target_map,
                                          calibrate_proba).serialize()
                else:
                    MulticlassModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                              modeling_set['run_folder'], preproc_handler.target_map,
                                              calibrate_proba).serialize()
        else:
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_FITTING_GLOBAL):
                update_one_preprocessing_state(modeling_set)
                (clf, out_params, prepared_X, iipd) = regression_fit_single(
                    modeling_set['modelingParams'], split_desc, transformed_full,
                    modeling_set["run_folder"], with_sample_weight=with_sample_weight)
            save_prediction_model(clf, out_params, modeling_set["listener"], update_fn,
                                  modeling_set['run_folder'])
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_SCORING_GLOBAL):
                update_fn()
                RegressionModelIntrinsicScorer(modeling_set['modelingParams'], clf, train_X, train_y,
                                               pipeline, modeling_set['run_folder'], prepared_X,
                                               iipd).score()
                # Serialize the model if possible
                RegressionModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                          modeling_set['run_folder']).serialize()

        full_df_clean = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                           core_params["prediction_type"])
        optimized_params = out_params["resolved"]
        logging.info("Regridifying post-train params: %s" % json.dumps(optimized_params))
        # Regridify the optimized params to a unary grid
        optimized_params_grid = intercom.backend_json_call("ml/prediction/regridify-to-pretrain", {
            "preTrain": json.dumps(modeling_set["modelingParams"]),
            "postTrain": json.dumps(optimized_params)
        })
        logging.info("Using unary grid params: %s" % json.dumps(optimized_params_grid))
        prediction_train_model_kfold(full_df_clean, core_params, split_desc, preprocessing_params,
                                     optimized_params_grid, preprocessing_set['run_folder'],
                                     modeling_set['run_folder'], modeling_set["listener"], update_fn,
                                     with_sample_weight, with_class_weight, calibrate_proba)

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set['run_folder'], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"
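
# The "regridify-to-pretrain" backend call used above collapses the pre-train hyperparameter grid
# onto the single point actually selected by the search (a "unary grid"), so that the k-fold
# re-training fits each fold with the already-optimized parameters rather than re-running the
# search. The payload is exactly the one built above:
#
#   optimized_params_grid = intercom.backend_json_call("ml/prediction/regridify-to-pretrain", {
#       "preTrain": json.dumps(modeling_set["modelingParams"]),
#       "postTrain": json.dumps(out_params["resolved"])
#   })
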
def create_ensemble(split_desc, core_params, model_folder, preprocessing_folder, model_folders,
                    preprocessing_folders):
    listener = ProgressListener()
    listener.add_future_steps(constants.ENSEMBLE_STATES)
    start = unix_time_millis()

    def update_preprocessing_state():
        utils.write_running_traininfo(model_folder, start, listener)

    split_desc = dkujson.loads(split_desc)
    core_params = dkujson.loads(core_params)

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    # TODO: update downstream
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}

    preprocessing_folders = dkujson.loads(preprocessing_folders)
    model_folders = dkujson.loads(model_folders)
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "rmodeling_params.json"))
    ensemble_params = modeling_params["ensemble_params"]

    logging.info("creating ensemble")
    with listener.push_state(constants.STATE_ENSEMBLING):
        update_preprocessing_state()
        from dataiku.doctor.prediction.ensembles import ensemble_from_fitted
        train = df_from_split_desc(split_desc, "train",
                                   ensemble_params["preprocessing_params"][0]["per_feature"],
                                   core_params["prediction_type"])
        iperf = {
            # TODO: not the right count, as some rows may have been dropped ...
            "modelInputNRows": train.shape[0],
            # Makes no sense for an ensemble, as members may have different preprocessings
            "modelInputNCols": -1,
            "modelInputIsSparse": False
        }
        dkujson.dump_to_filepath(osp.join(model_folder, "iperf.json"), iperf)
        clf = ensemble_from_fitted(core_params, ensemble_params, preprocessing_folders, model_folders,
                                   train, with_sample_weight, with_class_weight)

    logging.info("saving model")
    with listener.push_state(constants.STATE_SAVING):
        update_preprocessing_state()
        with open(osp.join(model_folder, "clf.pkl"), dku_write_mode_for_pickling()) as f:
            pickle.dump(clf, f, 2)

    logging.info("scoring model")
    with listener.push_state(constants.STATE_SCORING):
        update_preprocessing_state()
        test = df_from_split_desc(split_desc, "test",
                                  ensemble_params["preprocessing_params"][0]["per_feature"],
                                  core_params["prediction_type"])
        # This is annoying, but we have to use one of the previous preprocessings in order to get the target
        prep_folder = preprocessing_folders[0]
        rppp = dkujson.load_from_filepath(osp.join(prep_folder, "rpreprocessing_params.json"))
        collector_data = dkujson.load_from_filepath(osp.join(prep_folder, "collector_data.json"))
        preprocessing_handler = PreprocessingHandler.build(core_params, rppp, prep_folder)
        preprocessing_handler.collector_data = collector_data
        pipe = preprocessing_handler.build_preprocessing_pipeline(with_target=True)
        transformed = pipe.process(test)
        y = transformed["target"]
        if with_sample_weight:
            sample_weight = transformed["weight"]
        else:
            sample_weight = None

        # Now that the CLF with scorable pipelines has been saved, set it in "pipelines with target" mode
        # to be able to compute metrics
        clf.set_with_target_pipelines_mode(True)

        pred = clf.predict(test)
        probas = None if core_params["prediction_type"] == "REGRESSION" else clf.predict_proba(test)
        target_map = None if core_params["prediction_type"] == "REGRESSION" else \
            {t["sourceValue"]: t["mappedValue"]
             for t in ensemble_params["preprocessing_params"][0]["target_remapping"]}
        prediction_type = core_params["prediction_type"]
        if prediction_type == "REGRESSION":
            RegressionModelScorer(modeling_params, clf, pred, y, model_folder, transformed,
                                  test.index.copy(), sample_weight).score()
        elif prediction_type == "BINARY_CLASSIFICATION":
            BinaryClassificationModelScorer(modeling_params, clf, model_folder, pred, probas, y, target_map,
                                            transformed, test.index.copy(), sample_weight).score()
        else:
            MulticlassModelScorer(modeling_params, clf, model_folder, pred, probas, y.astype(int),
                                  target_map, transformed, test.index.copy(), sample_weight).score()

    update_preprocessing_state()
    end = unix_time_millis()
    dkujson.dump_to_filepath(osp.join(model_folder, "actual_params.json"), {"resolved": modeling_params})
    dkujson.dump_to_filepath(osp.join(preprocessing_folder, "preprocessing_report.json"), {})
    utils.write_done_traininfo(model_folder, start, end, end, listener, end_preprocessing_time=start)
    return "ok"
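
# Note on the ensemble scoring path above: the ensemble clf is pickled first while its member
# pipelines are in "scorable" mode, and only then switched to "pipelines with target" mode so that
# predict()/predict_proba() can be evaluated against the target extracted by the first member's
# preprocessing. The target_map handed to the classification scorers is rebuilt from that member's
# target_remapping, e.g. (illustrative values) [{"sourceValue": "yes", "mappedValue": 1}]
# becomes {"yes": 1}.
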
def train_clustering_models_nosave(split_desc, preprocessing_set):
    """Regular (mode 1) train:
     - Non-streamed single split + fit preprocess on train + preprocess test
     - Fit N models sequentially
       - Fit
       - Save clf
       - Compute and save clf performance
       - Score, save scored test set + scored performance
    """
    start = unix_time_millis()
    preprocessing_listener = ProgressListener()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]

    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.CLUSTERING_REGULAR_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.ALL_CLUSTERING_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = {
                "modelId": modeling_set["modelId"],
                "state": "RUNNING",
                "startTime": start,
                "progress": merge_listeners(preprocessing_listener, modeling_set["listener"])
            }
            utils.write_model_status(modeling_set, status)

    logging.info("START TRAIN :" + preprocessing_set["description"])
    preprocessing_params = preprocessing_set["preprocessing_params"]

    with preprocessing_listener.push_state(constants.STATE_LOADING_SRC):
        update_preprocessing_state()
        source_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"])
        logging.info("Loaded source df: shape=(%d,%d)" % source_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = ClusteringPreprocessingDataCollector(source_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_set["preprocessing_params"],
                                                     preprocessing_set["run_folder"])
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_SRC):
        update_preprocessing_state()
        source_df_index = source_df.index.copy()
        # TODO: fit_and_process should take an update_fn argument
        transformed_source = pipeline.fit_and_process(source_df)

    # Save fitted resources and collector data
    preproc_handler.save_data()

    # Report on work
    report = {}
    pipeline.report_fit(report, {})
    utils.write_preproc_file(preprocessing_set["run_folder"], "preprocessing_report.json", report)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        clustering_train_score_save(transformed_source, source_df_index,
                                    preprocessing_set["preprocessing_params"],
                                    modeling_set["modelingParams"], modeling_set["run_folder"],
                                    listener, update_modeling_state, pipeline)

        model_end = end = unix_time_millis()
        end = unix_time_millis()

        # Write the final model training info
        status = {
            "modelId": modeling_set["modelId"],
            "state": "DONE",
            "startTime": start,
            "endTime": end,
            "preprocessingTime": preprocessing_end - start,
            "trainingTime": model_end - model_start,
            "progress": merge_listeners(preprocessing_listener, modeling_set["listener"])
        }
        utils.write_model_status(modeling_set, status)

    return "ok"
def train_prediction_keras(core_params, preprocessing_set, split_desc):
    start = unix_time_millis()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]
    run_folder = preprocessing_set["run_folder"]
    logging.info("PPS is %s" % preprocessing_params)

    preprocessing_listener = ProgressListener()

    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.PRED_KERAS_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.PRED_KERAS_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

    with preprocessing_listener.push_state(constants.STATE_LOADING_TRAIN):
        update_preprocessing_state()
        train_df = df_from_split_desc(split_desc, "train", preprocessing_params['per_feature'],
                                      core_params["prediction_type"])
        train_df_orig = train_df.copy()
        logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

    # Not implemented in the UI so far, so preprocessor_fit_df will always be train_df
    preprocessor_fit_df = train_df
    need_subsampling = preprocessing_params["preprocessingFitSampleRatio"] < 1
    if need_subsampling:
        preprocessor_fit_df = preprocessor_fit_df.sample(
            frac=preprocessing_params["preprocessingFitSampleRatio"],
            random_state=preprocessing_params["preprocessingFitSampleSeed"])

    with preprocessing_listener.push_state(constants.STATE_LOADING_TEST):
        update_preprocessing_state()
        test_df = df_from_split_desc(split_desc, "test", preprocessing_params['per_feature'],
                                     core_params["prediction_type"])
        logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(preprocessor_fit_df, preprocessing_params)
        collector_data = collector.build()

    # Tag special features so they are taken into account only in special_preproc_handler/special_pipeline
    per_feature = preprocessing_params["per_feature"]
    tag_special_features(per_feature)

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params, run_folder,
                                                           preprocessing_params, allow_empty_mf=True)

    with preprocessing_listener.push_state(constants.KERASSTATE_FIT_NORMAL_PREPROCESSING):
        update_preprocessing_state()
        # Retrieve the transformed values to get the shape of all regular inputs, even though they won't
        # actually be used, as each batch of data will be processed again
        transformed_normal = pipeline.fit_and_process(preprocessor_fit_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    # TODO: REVIEW STATES OF TRAINING
    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TRAIN):
        update_preprocessing_state()

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TEST):
        update_preprocessing_state()

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        # Set environment variables that may be accessed in user-defined code
        remoterun.set_dku_env_var_and_sys_env_var(constants.DKU_CURRENT_ANALYSIS_ID,
                                                  modeling_set["fullId"]["taskLoc"]["analysisId"])
        remoterun.set_dku_env_var_and_sys_env_var(constants.DKU_CURRENT_MLTASK_ID,
                                                  modeling_set["fullId"]["taskLoc"]["mlTaskId"])

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        prediction_train_model_keras(transformed_normal, train_df_orig, test_df, pipeline,
                                     modeling_set["modelingParams"], core_params, per_feature,
                                     modeling_set["run_folder"], modeling_set["listener"],
                                     update_modeling_state, preproc_handler.target_map,
                                     pipeline.generated_features_mapping)

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set["run_folder"], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"
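
# The preprocessing-fit subsampling above only affects what the pipeline is *fitted* on: when
# preprocessingFitSampleRatio < 1, fit_and_process sees a seeded random sample of train_df, while
# prediction_train_model_keras still receives the untouched train_df_orig and, as noted above,
# re-processes each batch of data during training.
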
def train_prediction_models_nosave(core_params, preprocessing_set, split_desc):
    """Regular (mode 1) train:
     - Non-streamed single split + fit preprocess on train + preprocess test
     - Fit N models sequentially
       - Fit
       - Save clf
       - Compute and save clf performance
       - Score, save scored test set + scored performance
    """
    start = unix_time_millis()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]
    logging.info("PPS is %s" % preprocessing_params)

    preprocessing_listener = ProgressListener()

    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.PRED_REGULAR_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        if modeling_set.get('modelingParams', {}).get('gridLength', 1) > 1:
            listener.add_future_step(constants.STATE_GRIDSEARCHING)
        listener.add_future_steps(constants.PRED_REGULAR_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

    with preprocessing_listener.push_state(constants.STATE_LOADING_TRAIN):
        update_preprocessing_state()
        train_df = df_from_split_desc(split_desc, "train", preprocessing_params['per_feature'],
                                      core_params["prediction_type"])
        logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)
        for col in train_df:
            logging.info("Train col : %s (%s)" % (col, train_df[col].dtype))

    with preprocessing_listener.push_state(constants.STATE_LOADING_TEST):
        update_preprocessing_state()
        test_df = df_from_split_desc(split_desc, "test", preprocessing_params['per_feature'],
                                     core_params["prediction_type"])
        logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(train_df, preprocessing_params)
        collector_data = collector.build()

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params,
                                                           preprocessing_set['run_folder'],
                                                           preprocessing_params)

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TRAIN):
        update_preprocessing_state()
        # TODO: fit_and_process should take an update_fn argument
        transformed_train = pipeline.fit_and_process(train_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TEST):
        update_preprocessing_state()
        test_df_index = test_df.index.copy()
        transformed_test = pipeline.process(test_df)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        # Since ensembles are never fitted through the doctor, there is no need to distinguish here
        prediction_train_score_save(transformed_train, transformed_test, test_df_index, core_params,
                                    split_desc, modeling_set["modelingParams"], modeling_set["run_folder"],
                                    modeling_set["listener"], preproc_handler.target_map,
                                    update_modeling_state, pipeline, modeling_set["run_folder"])

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set["run_folder"], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"
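
# In the "nosave" trainers above, each model's progress is reported as the pair
# (preprocessing_listener, modeling_set["listener"]): the shared preprocessing steps are counted
# once, while the per-model listener tracks that model's own fitting and scoring. The resulting
# status payload has the shape written explicitly in the clustering variant, roughly:
#
#   {"modelId": ..., "state": "RUNNING", "startTime": start,
#    "progress": merge_listeners(preprocessing_listener, modeling_set["listener"])}
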
from dataiku.doctor.prediction.classification_scoring import BinaryClassificationModelScorer
from dataiku.doctor.clustering.clustering_scorer import ClusteringModelScorer
from dataiku.doctor.prediction.regression_scoring import RegressionModelIntrinsicScorer, RegressionModelScorer
from dataiku.doctor.prediction_entrypoints import prediction_train_score_save, prediction_train_model_kfold, \
    prediction_train_model_keras
from dataiku.doctor.clustering_entrypoints import clustering_train_score_save
from dataiku.doctor.utils import ProgressListener, unix_time_millis, dku_write_mode_for_pickling
from dataiku.doctor.notebook_builder import PredictionNotebookBuilder, ClusteringNotebookBuilder
from dataiku.core import dkujson, intercom
from dataiku.doctor.deep_learning.keras_utils import tag_special_features
from dataiku.doctor.utils.split import df_from_split_desc
from dataiku.doctor.utils import merge_listeners
from dataiku.doctor import utils
from dataiku.doctor.posttraining import partial_depency as pdp, subpopulation as subpopulation

preprocessing_listener = ProgressListener()
global_modeling_sets = []


def _list_commands():
    current_module = sys.modules[__name__]
    return [
        (func_name, func)
        for (func_name, func) in current_module.__dict__.items()
        if not func_name.startswith("_") and inspect.isfunction(func) and inspect.getmodule(func) == current_module
    ]


def create_prediction_notebook(model_name, model_date, dataset_smartname, script, preparation_output_schema,
                               split_stuff,