Example #1
0
def main(exec_folder, output_dataset, keptInputColumns):
    start = unix_time_millis()
    listener = ProgressListener()

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    preprocessing_params = json.load(open(osp.join(exec_folder, "rpreprocessing_params.json")))
    modeling_params = json.load(open(osp.join(exec_folder, "rmodeling_params.json")))

    with listener.push_state(constants.STATE_LOADING_SRC):
        input_df = df_from_split_desc_no_normalization(split_desc, "full", preprocessing_params["per_feature"])
        logging.info("Loaded full df: shape=(%d,%d)" % input_df.shape)
        input_df_orig = input_df.copy()
        input_df = utils.normalize_dataframe(input_df, preprocessing_params["per_feature"])        

    with listener.push_state("Collecting preprocessing data"):
        collector = ClusteringPreprocessingDataCollector(input_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params, exec_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with listener.push_state("Preprocessing data"):
        transformed_train = pipeline.fit_and_process(input_df)

    start_train = unix_time_millis()

    (clf, actual_params, cluster_labels, additional_columns) = clustering_fit(modeling_params, transformed_train)

    # if model has custom labels, use them
    try:
        cluster_names = clf.get_cluster_labels()
    except AttributeError:
        cluster_names = ["cluster_%s" % i for i in range(len(np.unique(cluster_labels)))]
    cl = pd.Series(data=cluster_labels, name="cluster_labels").map(lambda i: cluster_names[i])
    cl.index = transformed_train["TRAIN"].index

    final_df = pd.concat([input_df_orig.join(cl, how='left'), additional_columns], axis=1)

    if keptInputColumns is not None:
        final_df = final_df[keptInputColumns + ['cluster_labels']]

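    # Rows dropped by the CLUSTER outliers method have no label after the left join;
    # tag them explicitly as the outliers cluster.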
    if preprocessing_params["outliers"]["method"] == "CLUSTER":
        final_df['cluster_labels'].fillna(constants.CLUSTER_OUTLIERS, inplace=True)

    dataiku.Dataset(output_dataset).write_from_dataframe(final_df)

    end = unix_time_millis()

    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
Example #2
0
def main(exec_folder):
    start = unix_time_millis()
    listener = ProgressListener()

    def update_fn():
        utils.write_running_traininfo(exec_folder, start, listener)

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    preprocessing_params = json.load(
        open(osp.join(exec_folder, "rpreprocessing_params.json")))
    modeling_params = json.load(
        open(osp.join(exec_folder, "rmodeling_params.json")))

    with listener.push_state(constants.STATE_LOADING_SRC):
        update_fn()
        train_df = df_from_split_desc(split_desc, "full",
                                      preprocessing_params["per_feature"])
        logging.info("Loaded full df: shape=(%d,%d)" % train_df.shape)

    with listener.push_state("Collecting preprocessing data"):
        update_fn()
        collector = ClusteringPreprocessingDataCollector(
            train_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params,
                                                     exec_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with listener.push_state("Preprocessing data"):
        orig_index = train_df.index.copy()
        transformed_train = pipeline.fit_and_process(train_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    start_train = unix_time_millis()

    clustering_train_score_save(transformed_train, orig_index,
                                preprocessing_params, modeling_params,
                                exec_folder, listener, update_fn, pipeline)

    end = unix_time_millis()

    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
Example #3
0
def main(exec_folder, selection_state_folder, operation_mode):
    """The whole execution of the saved model train takes place in a single folder ?"""
    start = unix_time_millis()
    start_train = start
    listener = ProgressListener()

    def update_fn():
        utils.write_running_traininfo(exec_folder, start, listener)

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    core_params = json.load(open(osp.join(exec_folder, "core_params.json")))
    preprocessing_params = json.load(
        open(osp.join(exec_folder, "rpreprocessing_params.json")))
    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {
        "SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"
    }
    with_class_weight = weight_method in {
        "CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"
    }
    calibrate_proba = core_params.get("calibration",
                                      {}).get("calibrationMethod",
                                              None) in ["SIGMOID", "ISOTONIC"]
    modeling_params = json.load(
        open(osp.join(exec_folder, "rmodeling_params.json")))

    # For the KERAS backend, special features must be tagged, because they are only
    # handled by the process function, not by fit_and_process
    if modeling_params["algorithm"] == "KERAS_CODE":
        tag_special_features(preprocessing_params['per_feature'])

    def do_full_fit_and_save():
        """Fit on 100% and save the clf and out params"""
        with listener.push_state(constants.STATE_LOADING_TRAIN):
            update_fn()
            full_df = df_from_split_desc(split_desc, "full",
                                         preprocessing_params["per_feature"],
                                         core_params["prediction_type"])
            logging.info("Loaded FULL df: shape=(%d,%d)" % full_df.shape)

        with listener.push_state("Collecting preprocessing data"):
            update_fn()
            collector = ClusteringPreprocessingDataCollector(
                full_df, preprocessing_params)
            collector_data = collector.build()

            pipeline, preproc_handler = build_pipeline_and_handler(
                collector_data,
                core_params,
                exec_folder,
                preprocessing_params,
                selection_state_folder=selection_state_folder,
                allow_empty_mf=modeling_params["algorithm"] == "KERAS_CODE")

            # TODO
            if core_params["prediction_type"] in (
                    constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
                target_map = preproc_handler.target_map
            else:
                target_map = None

        with listener.push_state("Preprocessing full set"):
            preprocessor_fit_full_df = full_df

            # For the KERAS backend, we may need to subsample the input_df to prevent memory errors
            if modeling_params["algorithm"] == "KERAS_CODE":
                need_subsampling = preprocessing_params[
                    "preprocessingFitSampleRatio"] < 1
                full_df_orig = full_df.copy()
                if need_subsampling:
                    preprocessor_fit_full_df = preprocessor_fit_full_df.sample(
                        frac=preprocessing_params[
                            "preprocessingFitSampleRatio"],
                        random_state=preprocessing_params[
                            "preprocessingFitSampleSeed"])

            transformed_full = pipeline.fit_and_process(
                preprocessor_fit_full_df)

            if with_sample_weight:
                assert transformed_full["weight"].values.min(
                ) > 0, "Sample weights must be positive"

            preproc_handler.save_data()
            preproc_handler.report(pipeline)

        if modeling_params["algorithm"] == "KERAS_CODE":

            modeling_set = {"run_folder": exec_folder, "listener": listener}

            def update_modeling_state():

                status = utils.make_running_traininfo(
                    modeling_set["run_folder"], start,
                    modeling_set["listener"])
                utils.write_model_status(modeling_set, status)

            empty_df = pd.DataFrame()

            return prediction_train_model_keras(
                transformed_full, full_df_orig, empty_df, pipeline,
                modeling_params, core_params,
                preprocessing_params["per_feature"], exec_folder, listener,
                update_modeling_state, preproc_handler.target_map,
                pipeline.generated_features_mapping)

        else:
            return fit_score_save(pipeline, target_map, transformed_full)

    def fit_score_save(pipeline, target_map, transformed_full):
        with listener.push_state(constants.STATE_FITTING):
            update_fn()
            if core_params["prediction_type"] in (
                    constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
                (clf, out_params, prepared_X, iipd) = classification_fit(
                    modeling_params,
                    split_desc,
                    transformed_full,
                    core_params["prediction_type"],
                    exec_folder,
                    target_map=target_map,
                    with_sample_weight=with_sample_weight,
                    with_class_weight=with_class_weight)
                if calibrate_proba:
                    method = core_params.get(
                        "calibration", {}).get("calibrationMethod").lower()
                    calibrated_clf = CalibratedClassifierCV(clf,
                                                            cv="prefit",
                                                            method=method)
                    test_X = transformed_full["TRAIN"]
                    test_X, is_sparse = prepare_multiframe(
                        test_X, modeling_params)
                    test_y = transformed_full["target"].astype(int)
                    if with_sample_weight:
                        test_weight = transformed_full["weight"].astype(float)
                        calibrated_clf.fit(test_X,
                                           test_y,
                                           sample_weight=test_weight)
                    else:
                        calibrated_clf.fit(test_X, test_y)
                    clf = calibrated_clf
            else:
                (clf, out_params, prepared_X, iipd) = regression_fit_single(
                    modeling_params,
                    split_desc,
                    transformed_full,
                    exec_folder,
                    with_sample_weight=with_sample_weight)

        with listener.push_state(constants.STATE_SAVING):
            save_prediction_model(clf, out_params, listener, update_fn,
                                  exec_folder)
        with listener.push_state(constants.STATE_SCORING):
            train_X = transformed_full["TRAIN"]
            train_y = transformed_full["target"]
            if core_params[
                    "prediction_type"] == constants.BINARY_CLASSIFICATION:
                ClassificationModelIntrinsicScorer(modeling_params, clf,
                                                   train_X, train_y, pipeline,
                                                   exec_folder, prepared_X,
                                                   iipd,
                                                   calibrate_proba).score()
                BinaryModelSerializer(train_X.columns(), clf, modeling_params,
                                      exec_folder, target_map,
                                      calibrate_proba).serialize()
            elif core_params["prediction_type"] == constants.MULTICLASS:
                ClassificationModelIntrinsicScorer(modeling_params, clf,
                                                   train_X, train_y, pipeline,
                                                   exec_folder, prepared_X,
                                                   iipd,
                                                   calibrate_proba).score()
                MulticlassModelSerializer(train_X.columns(), clf,
                                          modeling_params, exec_folder,
                                          target_map,
                                          calibrate_proba).serialize()
            else:
                RegressionModelIntrinsicScorer(modeling_params, clf, train_X,
                                               train_y, pipeline, exec_folder,
                                               prepared_X, iipd).score()
                RegressionModelSerializer(train_X.columns(), clf,
                                          modeling_params,
                                          exec_folder).serialize()
        return out_params

    if operation_mode == "TRAIN_SPLITTED_ONLY":

        with listener.push_state(constants.STATE_LOADING_TRAIN):
            update_fn()
            train_df = df_from_split_desc(split_desc, "train",
                                          preprocessing_params["per_feature"],
                                          core_params["prediction_type"])
            logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

        with listener.push_state(constants.STATE_LOADING_TEST):
            update_fn()
            test_df = df_from_split_desc(split_desc, "test",
                                         preprocessing_params["per_feature"],
                                         core_params["prediction_type"])
            logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

        with listener.push_state("Collecting preprocessing data"):
            update_fn()
            collector = PredictionPreprocessingDataCollector(
                train_df, preprocessing_params)
            collector_data = collector.build()
            pipeline, preproc_handler = build_pipeline_and_handler(
                collector_data,
                core_params,
                exec_folder,
                preprocessing_params,
                selection_state_folder=selection_state_folder,
                allow_empty_mf=modeling_params["algorithm"] == "KERAS_CODE")

            # TODO
            if core_params["prediction_type"] in (
                    constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
                target_map = preproc_handler.target_map
            else:
                target_map = None

        with listener.push_state("Preprocessing train set"):
            preprocessor_fit_df = train_df

            # For the KERAS backend, we may need to subsample the input_df to prevent memory errors
            if modeling_params["algorithm"] == "KERAS_CODE":
                train_df_orig = train_df.copy()
                need_subsampling = preprocessing_params[
                    "preprocessingFitSampleRatio"] < 1
                if need_subsampling:
                    preprocessor_fit_df = preprocessor_fit_df.sample(
                        frac=preprocessing_params[
                            "preprocessingFitSampleRatio"],
                        random_state=preprocessing_params[
                            "preprocessingFitSampleSeed"])

            transformed_train = pipeline.fit_and_process(preprocessor_fit_df)
            if with_sample_weight:
                assert transformed_train["weight"].values.min(
                ) > 0, "Sample weights must be positive"

            preproc_handler.save_data()
            preproc_handler.report(pipeline)

        # For the KERAS backend, the test set cannot be processed directly, because it may
        # have special features that do not fit in memory
        if modeling_params["algorithm"] != "KERAS_CODE":
            with listener.push_state("Preprocessing test set"):
                test_df_index = test_df.index.copy()
                transformed_test = pipeline.process(test_df)
                if with_sample_weight:
                    assert transformed_test["weight"].values.min(
                    ) > 0, "Sample weights must be positive"

        if modeling_params["algorithm"] == "PYTHON_ENSEMBLE":
            prediction_train_score_save_ensemble(train_df, test_df,
                                                 core_params, split_desc,
                                                 modeling_params, exec_folder,
                                                 listener, target_map,
                                                 update_fn, pipeline,
                                                 with_sample_weight)
        elif modeling_params["algorithm"] == "KERAS_CODE":
            modeling_set = {"run_folder": exec_folder, "listener": listener}

            def update_modeling_state():
                status = utils.make_running_traininfo(
                    modeling_set["run_folder"], start,
                    modeling_set["listener"])
                utils.write_model_status(modeling_set, status)

            prediction_train_model_keras(
                transformed_train, train_df_orig, test_df, pipeline,
                modeling_params, core_params,
                preprocessing_params["per_feature"], exec_folder, listener,
                update_modeling_state, preproc_handler.target_map,
                pipeline.generated_features_mapping)
        else:
            prediction_train_score_save(transformed_train, transformed_test,
                                        test_df_index, core_params, split_desc,
                                        modeling_params, exec_folder, listener,
                                        target_map, update_fn, pipeline,
                                        exec_folder)

    elif operation_mode == "TRAIN_FULL_ONLY":
        # Not yet functional ...
        do_full_fit_and_save()

    elif operation_mode == "TRAIN_KFOLD":
        out_params = do_full_fit_and_save()

        full_df_clean = df_from_split_desc(split_desc, "full",
                                           preprocessing_params["per_feature"],
                                           core_params["prediction_type"])

        optimized_params = out_params["resolved"]

        logging.info("Regridifying post-train params: %s" %
                     json.dumps(optimized_params))

        # Regridify to a unary grid the optimized params
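        # (i.e. ask the backend to turn the resolved hyperparameters back into a
        # single-point grid so the k-fold retrain below reuses them as-is)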
        optimized_params_grid = intercom.backend_json_call(
            "ml/prediction/regridify-to-pretrain", {
                "preTrain": json.dumps(modeling_params),
                "postTrain": json.dumps(optimized_params)
            })
        logging.info("Using unary grid params: %s" %
                     json.dumps(optimized_params_grid))

        prediction_train_model_kfold(full_df_clean, core_params, split_desc,
                                     preprocessing_params,
                                     optimized_params_grid, exec_folder,
                                     exec_folder, listener, update_fn,
                                     with_sample_weight, with_class_weight,
                                     calibrate_proba)

    else:
        do_full_fit_and_save()
        # Do the split and scoring but don't save data
        with listener.push_state(constants.STATE_LOADING_TRAIN):
            update_fn()
            train_df = df_from_split_desc(split_desc, "train",
                                          preprocessing_params["per_feature"],
                                          core_params["prediction_type"])
            logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

        with listener.push_state(constants.STATE_LOADING_TEST):
            update_fn()
            test_df = df_from_split_desc(split_desc, "test",
                                         preprocessing_params["per_feature"],
                                         core_params["prediction_type"])
            logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

        with listener.push_state("Collecting preprocessing data"):
            update_fn()
            collector = PredictionPreprocessingDataCollector(
                train_df, preprocessing_params)
            collector_data = collector.build()

            pipeline, preproc_handler = build_pipeline_and_handler(
                collector_data,
                core_params,
                exec_folder,
                preprocessing_params,
                selection_state_folder=selection_state_folder,
                allow_empty_mf=modeling_params["algorithm"] == "KERAS_CODE")

            # TODO
            if core_params["prediction_type"] in (
                    constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
                target_map = preproc_handler.target_map
            else:
                target_map = None

        with listener.push_state("Preprocessing train set"):
            preprocessor_fit_df = train_df

            # For the KERAS backend, we may need to subsample the input_df to prevent memory errors
            if modeling_params["algorithm"] == "KERAS_CODE":
                need_subsampling = preprocessing_params[
                    "preprocessingFitSampleRatio"] < 1
                train_df_orig = train_df.copy()
                if need_subsampling:
                    preprocessor_fit_df = preprocessor_fit_df.sample(
                        frac=preprocessing_params[
                            "preprocessingFitSampleRatio"],
                        random_state=preprocessing_params[
                            "preprocessingFitSampleSeed"])

            transformed_train = pipeline.fit_and_process(preprocessor_fit_df)

        # For the KERAS backend, the test set cannot be processed directly, because it may
        # have special features that do not fit in memory
        if modeling_params["algorithm"] != "KERAS_CODE":
            with listener.push_state("Preprocessing test set"):
                test_df_index = test_df.index.copy()
                transformed_test = pipeline.process(test_df)

        if modeling_params["algorithm"] == "KERAS_CODE":
            modeling_set = {"run_folder": exec_folder, "listener": listener}

            def update_modeling_state():
                status = utils.make_running_traininfo(
                    modeling_set["run_folder"], start,
                    modeling_set["listener"])
                utils.write_model_status(modeling_set, status)

            prediction_train_model_keras(transformed_train,
                                         train_df_orig,
                                         test_df,
                                         pipeline,
                                         modeling_params,
                                         core_params,
                                         preprocessing_params["per_feature"],
                                         exec_folder,
                                         listener,
                                         update_modeling_state,
                                         preproc_handler.target_map,
                                         pipeline.generated_features_mapping,
                                         save_model=False)
        else:
            with listener.push_state(constants.STATE_FITTING):
                update_fn()
                if core_params["prediction_type"] in (
                        constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
                    (clf, out_params, prepared_X, iipd) = classification_fit(
                        modeling_params,
                        split_desc,
                        transformed_train,
                        core_params["prediction_type"],
                        target_map=target_map,
                        with_sample_weight=with_sample_weight,
                        with_class_weight=with_class_weight)
                else:
                    (clf, out_params, prepared_X,
                     iipd) = regression_fit_single(
                         modeling_params,
                         split_desc,
                         transformed_train,
                         exec_folder,
                         with_sample_weight=with_sample_weight)
            with listener.push_state(constants.STATE_SCORING):
                train_X = transformed_train["TRAIN"]
                train_y = transformed_train["target"]
                if core_params[
                        "prediction_type"] == constants.BINARY_CLASSIFICATION:
                    ClassificationModelIntrinsicScorer(
                        modeling_params, clf, train_X, train_y, pipeline,
                        exec_folder, prepared_X, iipd,
                        calibrate_proba).score()
                    BinaryModelSerializer(train_X.columns(), clf,
                                          modeling_params, exec_folder,
                                          target_map).serialize()
                    binary_classification_scorer_with_valid(
                        modeling_params,
                        clf,
                        transformed_test,
                        exec_folder,
                        test_df_index,
                        target_map=target_map,
                        with_sample_weight=with_sample_weight).score()
                elif core_params["prediction_type"] == constants.MULTICLASS:
                    ClassificationModelIntrinsicScorer(
                        modeling_params, clf, train_X, train_y, pipeline,
                        exec_folder, prepared_X, iipd,
                        calibrate_proba).score()
                    MulticlassModelSerializer(train_X.columns(), clf,
                                              modeling_params, exec_folder,
                                              target_map).serialize()
                    multiclass_scorer_with_valid(
                        modeling_params,
                        clf,
                        transformed_test,
                        exec_folder,
                        test_df_index,
                        target_map=target_map,
                        with_sample_weight=with_sample_weight).score()
                else:
                    RegressionModelIntrinsicScorer(modeling_params, clf,
                                                   train_X, train_y, pipeline,
                                                   exec_folder, prepared_X,
                                                   iipd).score()
                    RegressionModelSerializer(train_X.columns(), clf,
                                              modeling_params,
                                              exec_folder).serialize()
                    regression_scorer_with_valid(modeling_params, clf,
                                                 transformed_test, exec_folder,
                                                 test_df_index,
                                                 with_sample_weight).score()

    end = unix_time_millis()

    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
Example #4
0
def main(model_folder,
         input_dataset_smartname,
         output_dataset_smartname,
         recipe_desc,
         script,
         preparation_output_schema,
         cond_outputs=None):

    # Obtain a streamed result of the preparation
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" %
                 preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"],
                                        preparation_output_schema)

    listener = ProgressListener()

    core_params = dkujson.load_from_filepath(
        osp.join(model_folder, "core_params.json"))
    preprocessing_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "actual_params.json"))["resolved"]
    collector_data = dkujson.load_from_filepath(
        osp.join(model_folder, "collector_data.json"))

    preprocessing_handler = PreprocessingHandler.build(core_params,
                                                       preprocessing_params,
                                                       model_folder)
    preprocessing_handler.collector_data = collector_data

    pipeline = preprocessing_handler.build_preprocessing_pipeline()

    batch_size = recipe_desc.get("pythonBatchSize", 100000)
    logging.info("Scoring with batch size: {}".format(batch_size))

    with open(osp.join(model_folder, "clf.pkl"), "rb") as f:
        clf = pickle.load(f)

    def output_generator():
        logging.info("Start output generator ...")

        (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
            preparation_output_schema["columns"],
            parse_dates=True,
            infer_with_pandas=False)
        logging.info("Reading with INITIAL dtypes: %s" % dtypes)
        dtypes = utils.ml_dtypes_from_dss_schema(
            preparation_output_schema,
            preprocessing_params["per_feature"],
            prediction_type=core_params["prediction_type"])
        logging.info("Reading with dtypes: %s" % dtypes)

        for i in xrange(0, len(names)):
            logging.info("Column %s = %s (dtype=%s)" %
                         (i, names[i], dtypes.get(names[i], None)))

        for input_df in input_dataset.iter_dataframes_forced_types(
                names,
                dtypes,
                parse_date_columns,
                chunksize=batch_size,
                float_precision="round_trip"):
            input_df.index = range(input_df.shape[0])
            input_df_orig = input_df.copy()
            logging.info("Got a dataframe : %s" % str(input_df.shape))
            normalize_dataframe(input_df, preprocessing_params['per_feature'])

            for col in input_df:
                logging.info("NORMALIZED: %s -> %s" %
                             (col, input_df[col].dtype))

            logging.info("Processing it")
            logging.info("Predicting it")

            if core_params[
                    "prediction_type"] == constants.BINARY_CLASSIFICATION:
                pred_df = binary_classification_predict(
                    clf,
                    pipeline,
                    modeling_params,
                    preprocessing_params,
                    preprocessing_handler.target_map,
                    recipe_desc["forcedClassifierThreshold"],
                    input_df,
                    output_probas=recipe_desc["outputProbabilities"])
                # Probability percentile & Conditional outputs
                pred_df = binary_classif_scoring_add_percentile_and_cond_outputs(
                    pred_df, recipe_desc, model_folder, cond_outputs,
                    preprocessing_handler.target_map)

            elif core_params["prediction_type"] == constants.MULTICLASS:
                pred_df = multiclass_predict(
                    clf,
                    pipeline,
                    modeling_params,
                    preprocessing_params,
                    preprocessing_handler.target_map,
                    input_df,
                    output_probas=recipe_desc["outputProbabilities"])

            elif core_params["prediction_type"] == constants.REGRESSION:
                pred_df = regression_predict(clf, pipeline, modeling_params,
                                             input_df)

            else:
                raise ValueError("bad prediction type %s" %
                                 core_params["prediction_type"])

            logging.info("pred df debug :")
            logging.info(pred_df)

            logging.info("Done predicting it")
            if recipe_desc.get("filterInputColumns", False):
                clean_kept_columns = [
                    c for c in recipe_desc["keptInputColumns"]
                    if c not in pred_df.columns
                ]
            else:
                clean_kept_columns = [
                    c for c in input_df_orig.columns
                    if c not in pred_df.columns
                ]
            yield pd.concat([input_df_orig[clean_kept_columns], pred_df],
                            axis=1)

    output_dataset = dataiku.Dataset(output_dataset_smartname)
    logging.info("Starting writer")
    with output_dataset.get_writer() as writer:
        i = 0
        logging.info("Starting to iterate")
        for output_df in output_generator():
            logging.info("Generator generated a df %s" % str(output_df.shape))
            #if i == 0:
            #    output_dataset.write_schema_from_dataframe(output_df)
            i = i + 1
            writer.write_dataframe(output_df)
            logging.info("Output df written")
Example #5
0
def main(model_folder, input_dataset_smartname, output_dataset_smartname,
         recipe_desc, script, preparation_output_schema):
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" %
                 preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"],
                                        preparation_output_schema)

    listener = ProgressListener()

    preprocessing_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "actual_params.json"))["resolved"]
    collector_data = dkujson.load_from_filepath(
        osp.join(model_folder, "collector_data.json"))

    # Name remapping
    user_meta = dkujson.load_from_filepath(
        osp.join(model_folder, "user_meta.json"))
    cluster_name_map = {}
    if "clusterMetas" in user_meta:
        logging.info("Cluster metas: %s" % user_meta["clusterMetas"])
        for (cluster_id, cluster_data) in user_meta["clusterMetas"].items():
            cluster_name_map[cluster_id] = cluster_data["name"]

    preprocessing_handler = ClusteringPreprocessingHandler(
        {}, preprocessing_params, model_folder)
    preprocessing_handler.collector_data = collector_data
    pipeline = preprocessing_handler.build_preprocessing_pipeline()

    with open(osp.join(model_folder, "clusterer.pkl"), "rb") as f:
        clf = pickle.load(f)

    try:
        logging.info("Post-processing model")
        clf.post_process(user_meta)
    except AttributeError:
        # method does not exist if model cannot be post-processed, just pass
        pass

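    # Prefer labels provided by the clusterer when it exposes get_cluster_labels();
    # otherwise fall back to generic "cluster_<i>" names. User renames from
    # user_meta are applied in both cases.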
    try:
        custom_labels = clf.get_cluster_labels()

        def map_fun_custom(i):
            name = custom_labels[i]
            return cluster_name_map.get(name, name)

        naming = map_fun_custom
    except AttributeError:

        def map_fun(i):
            name = "cluster_%i" % i
            return cluster_name_map.get(name, name)

        naming = map_fun

    def output_generator():
        logging.info("Start output generator ...")

        (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
            preparation_output_schema["columns"],
            parse_dates=True,
            infer_with_pandas=False)
        logging.info("Reading with INITIAL dtypes: %s" % dtypes)
        dtypes = utils.ml_dtypes_from_dss_schema(
            preparation_output_schema, preprocessing_params["per_feature"])
        logging.info("Reading with dtypes: %s" % dtypes)

        for input_df in input_dataset.iter_dataframes_forced_types(
                names, dtypes, parse_date_columns, chunksize=100000):
            input_df.index = range(input_df.shape[0])
            input_df_orig = input_df.copy()
            if recipe_desc.get("filterInputColumns", False):
                input_df_orig = input_df_orig[recipe_desc["keptInputColumns"]]

            logging.info("Got a dataframe : %s" % str(input_df.shape))
            normalize_dataframe(input_df, preprocessing_params['per_feature'])

            for col in input_df:
                logging.info("NORMALIZED: %s -> %s" %
                             (col, input_df[col].dtype))

            logging.info("Processing it")
            transformed = pipeline.process(input_df)
            logging.info("Applying it")

            (labels_arr,
             additional_columns) = clustering_predict(modeling_params, clf,
                                                      transformed)
            cluster_labels = pd.Series(labels_arr,
                                       name="cluster_labels").map(naming)
            cluster_labels.index = transformed["TRAIN"].index

            final_df = pd.concat([
                input_df_orig.join(cluster_labels, how='left'),
                additional_columns
            ],
                                 axis=1)

            if preprocessing_params["outliers"]["method"] == "CLUSTER":
                outliers_cluster_name = cluster_name_map.get(
                    constants.CLUSTER_OUTLIERS, constants.CLUSTER_OUTLIERS)
                final_df['cluster_labels'].fillna(outliers_cluster_name,
                                                  inplace=True)

            logging.info("Done predicting it")

            yield final_df

    output_dataset = dataiku.Dataset(output_dataset_smartname)
    logging.info("Starting writer")
    with output_dataset.get_writer() as writer:
        i = 0
        logging.info("Starting to iterate")
        for output_df in output_generator():
            logging.info("Generator generated a df %s" % str(output_df.shape))
            #if i == 0:
            #    output_dataset.write_schema_from_dataframe(output_df)
            i = i + 1
            writer.write_dataframe(output_df)
            logging.info("Output df written")
Example #6
0
def train_prediction_kfold(core_params, preprocessing_set, split_desc):

    start = unix_time_millis()
    preprocessing_params = preprocessing_set['preprocessing_params']
    modeling_sets = preprocessing_set["modelingSets"]

    logging.info("PPS is %s" % preprocessing_params)
    preprocessing_listener = ProgressListener()
    preprocessing_listener.add_future_steps(constants.PRED_KFOLD_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.PRED_KFOLD_TRAIN_STATES)
        modeling_set["listener"] = listener

    def update_one_preprocessing_state(modeling_set):
        status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                              (preprocessing_listener, modeling_set["listener"]))
        utils.write_model_status(modeling_set, status)

    def update_preprocessing_state():
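        # NB: map() is used here for its side effects; this assumes Python 2's eager
        # map (under Python 3 this would need an explicit loop or list()).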
        map(update_one_preprocessing_state, modeling_sets)


    with preprocessing_listener.push_state(constants.STATE_LOADING_SRC):
        update_preprocessing_state()
        full_df = df_from_split_desc(split_desc,
                                     "full",
                                     preprocessing_params["per_feature"],
                                     core_params["prediction_type"])
        logging.info("Loaded full_df df: shape=(%d,%d)" % full_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(full_df, preprocessing_params)
        collector_data = collector.build()

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params, preprocessing_set['run_folder'],
                                                           preprocessing_params)

    with preprocessing_listener.push_state(constants.KFOLDSTATE_PREPROCESS_GLOBAL):
        update_preprocessing_state()
        transformed_full = pipeline.fit_and_process(full_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    train_X = transformed_full["TRAIN"]
    train_y = transformed_full["target"]

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    calibrate_proba = core_params.get("calibration", {}).get("calibrationMethod", None) in ["SIGMOID", "ISOTONIC"]

    if with_sample_weight:
        assert transformed_full["weight"].values.min() > 0, "Sample weights must be positive"

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        update_fn = lambda: update_one_preprocessing_state(modeling_set)
        if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION, constants.MULTICLASS):
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_FITTING_GLOBAL):
                # no out-fold available, so calibrate through classification_fit on a random split
                if calibrate_proba:
                    calibration_method = core_params.get("calibration", {}).get("calibrationMethod").lower()
                else:
                    calibration_method = None
                update_one_preprocessing_state(modeling_set)
                (clf, out_params, prepared_X, iipd) = classification_fit(modeling_set['modelingParams'], split_desc,
                                                                         transformed_full,
                                                                         core_params["prediction_type"],
                                                                         modeling_set['run_folder'],
                                                                         target_map=preproc_handler.target_map,
                                                                         with_sample_weight=with_sample_weight,
                                                                         with_class_weight=with_class_weight,
                                                                         calibration=calibration_method)
            save_prediction_model(clf, out_params, modeling_set["listener"], update_fn,
                                  modeling_set['run_folder'])

            with modeling_set["listener"].push_state(constants.KFOLDSTATE_SCORING_GLOBAL):
                update_one_preprocessing_state(modeling_set)
                ClassificationModelIntrinsicScorer(modeling_set['modelingParams'], clf,
                         train_X, train_y, pipeline, modeling_set['run_folder'], prepared_X, iipd, calibrate_proba).score()
                if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
                    BinaryModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                          modeling_set['run_folder'], preproc_handler.target_map, calibrate_proba).serialize()
                else:
                    MulticlassModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                              modeling_set['run_folder'], preproc_handler.target_map, calibrate_proba).serialize()
        else:
            with modeling_set["listener"].push_state(constants.KFOLDSTATE_FITTING_GLOBAL):
                update_one_preprocessing_state(modeling_set)
                (clf, out_params, prepared_X, iipd) = regression_fit_single(modeling_set['modelingParams'],
                                                                            split_desc, transformed_full, modeling_set["run_folder"],
                                                                            with_sample_weight=with_sample_weight)
            save_prediction_model(clf, out_params, modeling_set["listener"], update_fn,
                                  modeling_set['run_folder'])

            with modeling_set["listener"].push_state(constants.KFOLDSTATE_SCORING_GLOBAL):
                update_fn()
                RegressionModelIntrinsicScorer(modeling_set['modelingParams'], clf, train_X, train_y, pipeline,
                                               modeling_set['run_folder'], prepared_X, iipd).score()
                # serialize the model if possible
                RegressionModelSerializer(train_X.columns(), clf, modeling_set['modelingParams'],
                                          modeling_set['run_folder']).serialize()

        full_df_clean = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"],
                                           core_params["prediction_type"])
        optimized_params = out_params["resolved"]

        logging.info("Regridifying post-train params: %s" % json.dumps(optimized_params))

        # Regridify to a unary grid the optimized params
        optimized_params_grid = intercom.backend_json_call("ml/prediction/regridify-to-pretrain", {
            "preTrain" : json.dumps(modeling_set["modelingParams"]),
            "postTrain" : json.dumps(optimized_params)
        })
        logging.info("Using unary grid params: %s" % json.dumps(optimized_params_grid))

        prediction_train_model_kfold(full_df_clean, core_params, split_desc, preprocessing_params, optimized_params_grid,
                                     preprocessing_set['run_folder'], modeling_set['run_folder'],
                                     modeling_set["listener"], update_fn, with_sample_weight, with_class_weight, calibrate_proba)

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set['run_folder'], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

        return "ok"
Example #7
0
def create_ensemble(split_desc, core_params, model_folder, preprocessing_folder, model_folders, preprocessing_folders):
    listener = ProgressListener()
    listener.add_future_steps(constants.ENSEMBLE_STATES)
    start = unix_time_millis()

    def update_preprocessing_state():
        utils.write_running_traininfo(model_folder, start, listener)

    split_desc = dkujson.loads(split_desc)
    core_params = dkujson.loads(core_params)
    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    # TODO: update downstream
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    preprocessing_folders = dkujson.loads(preprocessing_folders)
    model_folders = dkujson.loads(model_folders)
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "rmodeling_params.json"))
    ensemble_params = modeling_params["ensemble_params"]
    logging.info("creating ensemble")
    with listener.push_state(constants.STATE_ENSEMBLING):
        update_preprocessing_state()
        from dataiku.doctor.prediction.ensembles import ensemble_from_fitted
        train = df_from_split_desc(split_desc, "train", ensemble_params["preprocessing_params"][0]["per_feature"],
                                   core_params["prediction_type"])
        iperf = {
            "modelInputNRows" : train.shape[0], #todo : not the right count as may have dropped ...
            "modelInputNCols" : -1, # makes no sense for an ensemble as may have different preprocessings
            "modelInputIsSparse" : False
        }
        dkujson.dump_to_filepath(osp.join(model_folder, "iperf.json"), iperf)
        clf = ensemble_from_fitted(core_params, ensemble_params, preprocessing_folders, model_folders, train, with_sample_weight, with_class_weight)

    logging.info("saving model")
    with listener.push_state(constants.STATE_SAVING):
        update_preprocessing_state()
        with open(osp.join(model_folder, "clf.pkl"), dku_write_mode_for_pickling()) as f:
            pickle.dump(clf, f, 2)

    logging.info("scoring model")
    with listener.push_state(constants.STATE_SCORING):
        update_preprocessing_state()
        test = df_from_split_desc(split_desc, "test", ensemble_params["preprocessing_params"][0]["per_feature"],
                                   core_params["prediction_type"])
        # this is annoying, but we have to use one of the previous preprocessings in order to get the target
        prep_folder = preprocessing_folders[0]
        rppp = dkujson.load_from_filepath(osp.join(prep_folder, "rpreprocessing_params.json"))
        collector_data = dkujson.load_from_filepath(osp.join(prep_folder, "collector_data.json"))
        preprocessing_handler = PreprocessingHandler.build(core_params, rppp, prep_folder)
        preprocessing_handler.collector_data = collector_data
        pipe = preprocessing_handler.build_preprocessing_pipeline(with_target=True)
        transformed = pipe.process(test)
        y = transformed["target"]

        if with_sample_weight:
            sample_weight = transformed["weight"]
        else:
            sample_weight = None

        # Now that the CLF with scorable pipelines has been saved, set it in "pipelines with target" mode
        # to be able to compute metrics
        clf.set_with_target_pipelines_mode(True)

        pred = clf.predict(test)
        probas = None if core_params["prediction_type"] == "REGRESSION" else clf.predict_proba(test)
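        # target_map maps each source class value to its integer index, as expected by the scorers below.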
        target_map = None if core_params["prediction_type"] == "REGRESSION" else \
            {t["sourceValue"]: t["mappedValue"] for t in ensemble_params["preprocessing_params"][0]["target_remapping"]}
        prediction_type = core_params["prediction_type"]
        if prediction_type == "REGRESSION":
            RegressionModelScorer(modeling_params, clf, pred, y, model_folder, transformed, test.index.copy(), sample_weight).score()
        elif prediction_type == "BINARY_CLASSIFICATION":
            BinaryClassificationModelScorer(modeling_params, clf, model_folder, pred, probas, y, target_map, transformed, test.index.copy(), sample_weight).score()
        else:
            MulticlassModelScorer(modeling_params, clf, model_folder, pred, probas, y.astype(int), target_map, transformed, test.index.copy(), sample_weight).score()

    update_preprocessing_state()
    end = unix_time_millis()
    dkujson.dump_to_filepath(osp.join(model_folder, "actual_params.json"), {"resolved": modeling_params})
    dkujson.dump_to_filepath(osp.join(preprocessing_folder, "preprocessing_report.json"), {})
    utils.write_done_traininfo(model_folder, start, end, end, listener, end_preprocessing_time=start)

    return "ok"
Example #8
0
def train_clustering_models_nosave(
                            split_desc,
                            preprocessing_set):
    """Regular (mode 1) train:
      - Non streamed single split + fit preprocess on train + preprocess test
      - Fit N models sequentially
         - Fit
         - Save clf
         - Compute and save clf performance
         - Score, save scored test set + scored performance
    """

    start = unix_time_millis()
    preprocessing_listener = ProgressListener()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]

    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.CLUSTERING_REGULAR_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.ALL_CLUSTERING_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = {
                "modelId" : modeling_set["modelId"],
                "state": "RUNNING",
                "startTime": start,
                "progress" : merge_listeners(preprocessing_listener, modeling_set["listener"])
            }
            utils.write_model_status(modeling_set, status)

    logging.info("START TRAIN :" + preprocessing_set["description"])
    preprocessing_params = preprocessing_set["preprocessing_params"]

    with preprocessing_listener.push_state(constants.STATE_LOADING_SRC):
        update_preprocessing_state()
        source_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"])

        logging.info("Loaded source df: shape=(%d,%d)" % source_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = ClusteringPreprocessingDataCollector(source_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({},
                        preprocessing_set["preprocessing_params"],
                        preprocessing_set["run_folder"])

    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_SRC):
        update_preprocessing_state()
        source_df_index = source_df.index.copy()
        # TODO: fit_and_process should take an update_fn argument
        transformed_source = pipeline.fit_and_process(source_df)
        # Saves fitted resources and collector data
        preproc_handler.save_data()
        # Report on work
        report = {}
        pipeline.report_fit(report, {})
        utils.write_preproc_file(preprocessing_set["run_folder"], "preprocessing_report.json", report)

    update_preprocessing_state()

    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()
        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)
        clustering_train_score_save(transformed_source, source_df_index,
                                    preprocessing_set["preprocessing_params"],
                                    modeling_set["modelingParams"],
                                    modeling_set["run_folder"],
                                    modeling_set["listener"],
                                    update_modeling_state,
                                    pipeline)

        model_end = end = unix_time_millis()

        # Write the final model training info
        status = {
            "modelId": modeling_set["modelId"],
            "state": "DONE",
            "startTime": start,
            "endTime": end,
            "preprocessingTime": preprocessing_end - start,
            "trainingTime": model_end - model_start,
            "progress": merge_listeners(preprocessing_listener, modeling_set["listener"])
        }
        utils.write_model_status(modeling_set, status)

    return "ok"
Example #9
0
def train_prediction_keras(core_params, preprocessing_set, split_desc):

    start = unix_time_millis()

    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]
    run_folder = preprocessing_set["run_folder"]

    logging.info("PPS is %s" % preprocessing_params)
    preprocessing_listener = ProgressListener()
    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.PRED_KERAS_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.PRED_KERAS_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

    with preprocessing_listener.push_state(constants.STATE_LOADING_TRAIN):
        update_preprocessing_state()
        train_df = df_from_split_desc(split_desc, "train", preprocessing_params['per_feature'], core_params["prediction_type"])
        train_df_orig = train_df.copy()
        logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)
        
        # Not implemented in the UI so far, so preprocessor_fit_df will always be train_df
        preprocessor_fit_df = train_df
        need_subsampling = preprocessing_params["preprocessingFitSampleRatio"] < 1
        if need_subsampling:
            preprocessor_fit_df = preprocessor_fit_df.sample(frac=preprocessing_params["preprocessingFitSampleRatio"],
                                                             random_state=preprocessing_params["preprocessingFitSampleSeed"])

    with preprocessing_listener.push_state(constants.STATE_LOADING_TEST):
        update_preprocessing_state()
        test_df = df_from_split_desc(split_desc, "test", preprocessing_params['per_feature'], core_params["prediction_type"])
        logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(preprocessor_fit_df, preprocessing_params)
        collector_data = collector.build()

    # Tag special features so that they are only taken into account in special_preproc_handler/special_pipeline
    per_feature = preprocessing_params["per_feature"]
    tag_special_features(per_feature)

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params, run_folder,
                                                           preprocessing_params, allow_empty_mf=True)

    with preprocessing_listener.push_state(constants.KERASSTATE_FIT_NORMAL_PREPROCESSING):
        update_preprocessing_state()

        # Retrieve the transformed values to get the shape of all regular inputs, even though they
        # won't actually be used, as each batch of data will be preprocessed again
        transformed_normal = pipeline.fit_and_process(preprocessor_fit_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    # TODO: REVIEW STATES OF TRAINING
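    # These blocks are currently empty: for Keras models each batch is preprocessed again during fit
    # (see above), so the states are only pushed here to advance the progress reporting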
    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TRAIN):
        update_preprocessing_state()

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TEST):
        update_preprocessing_state()

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        # Set environment variables that may be accessed in user-defined code
        remoterun.set_dku_env_var_and_sys_env_var(constants.DKU_CURRENT_ANALYSIS_ID, modeling_set["fullId"]["taskLoc"]["analysisId"])
        remoterun.set_dku_env_var_and_sys_env_var(constants.DKU_CURRENT_MLTASK_ID, modeling_set["fullId"]["taskLoc"]["mlTaskId"])

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        prediction_train_model_keras(transformed_normal, train_df_orig, test_df, pipeline, modeling_set["modelingParams"],
                                     core_params, per_feature, modeling_set["run_folder"], modeling_set["listener"],
                                     update_modeling_state, preproc_handler.target_map,
                                     pipeline.generated_features_mapping)

        end = unix_time_millis()
        utils.write_done_traininfo(modeling_set["run_folder"], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"
Example #10
def train_prediction_models_nosave(core_params, preprocessing_set, split_desc):
    """Regular (mode 1) train:
      - Non streamed single split + fit preprocess on train + preprocess test
      - Fit N models sequentially
         - Fit
         - Save clf
         - Compute and save clf performance
         - Score, save scored test set + scored performnace
    """

    start = unix_time_millis()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]

    logging.info("PPS is %s" % preprocessing_params)
    preprocessing_listener = ProgressListener()
    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.PRED_REGULAR_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
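        # Only announce a grid-search step when more than one hyperparameter combination will be evaluated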
        if modeling_set.get('modelingParams', {}).get('gridLength', 1) > 1:
            listener.add_future_step(constants.STATE_GRIDSEARCHING)
        listener.add_future_steps(constants.PRED_REGULAR_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

    with preprocessing_listener.push_state(constants.STATE_LOADING_TRAIN):
        update_preprocessing_state()
        train_df = df_from_split_desc(split_desc, "train", preprocessing_params['per_feature'], core_params["prediction_type"])
        logging.info("Loaded train df: shape=(%d,%d)" % train_df.shape)

        for col in train_df:
            logging.info("Train col : %s (%s)" % (col, train_df[col].dtype))

    with preprocessing_listener.push_state(constants.STATE_LOADING_TEST):
        update_preprocessing_state()
        test_df = df_from_split_desc(split_desc, "test", preprocessing_params['per_feature'], core_params["prediction_type"])
        logging.info("Loaded test df: shape=(%d,%d)" % test_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = PredictionPreprocessingDataCollector(train_df, preprocessing_params)
        collector_data = collector.build()

    pipeline, preproc_handler = build_pipeline_and_handler(collector_data, core_params, preprocessing_set['run_folder'],
                                                           preprocessing_params)

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TRAIN):
        update_preprocessing_state()
        # TODO: fit_and_process should take an update_fn argument
        transformed_train = pipeline.fit_and_process(train_df)
        preproc_handler.save_data()
        preproc_handler.report(pipeline)

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_TEST):
        update_preprocessing_state()
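        # Keep the test index so the scored test set can be re-aligned with the original test rows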
        test_df_index = test_df.index.copy()
        transformed_test = pipeline.process(test_df)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        # since ensembles are never fitted through the doctor, no need to distinguish here
        prediction_train_score_save(transformed_train,
                                    transformed_test, test_df_index,
                                    core_params, split_desc,
                                    modeling_set["modelingParams"],
                                    modeling_set["run_folder"],
                                    modeling_set["listener"],
                                    preproc_handler.target_map,
                                    update_modeling_state,
                                    pipeline,
                                    modeling_set["run_folder"])

        end = unix_time_millis()

        utils.write_done_traininfo(modeling_set["run_folder"], start, model_start, end,
                                   (preprocessing_listener, modeling_set["listener"]),
                                   end_preprocessing_time=preprocessing_end)

    return "ok"
Example #11
    BinaryClassificationModelScorer
from dataiku.doctor.clustering.clustering_scorer import ClusteringModelScorer
from dataiku.doctor.prediction.regression_scoring import RegressionModelIntrinsicScorer, RegressionModelScorer
from dataiku.doctor.prediction_entrypoints import prediction_train_score_save, prediction_train_model_kfold, \
    prediction_train_model_keras
from dataiku.doctor.clustering_entrypoints import clustering_train_score_save
from dataiku.doctor.utils import ProgressListener, unix_time_millis, dku_write_mode_for_pickling
from dataiku.doctor.notebook_builder import PredictionNotebookBuilder, ClusteringNotebookBuilder
from dataiku.core import dkujson, intercom
from dataiku.doctor.deep_learning.keras_utils import tag_special_features
from dataiku.doctor.utils.split import df_from_split_desc
from dataiku.doctor.utils import merge_listeners
from dataiku.doctor import utils
from dataiku.doctor.posttraining import partial_depency as pdp, subpopulation as subpopulation

preprocessing_listener = ProgressListener()
global_modeling_sets = []


def _list_commands():
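    """Enumerate the functions defined in this module that can be invoked as commands
    (private helpers and functions imported from other modules are filtered out)."""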
    current_module = sys.modules[__name__]
    return [
        (func_name, func)
        for (func_name, func) in current_module.__dict__.items()
        if not func_name.startswith("_") and inspect.isfunction(func) and inspect.getmodule(func) == current_module
    ]
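
# Illustrative sketch (not part of the original source): the (name, function) pairs returned
# by _list_commands() can be turned into a simple dispatch table, e.g.:
#
#     _COMMANDS = dict(_list_commands())
#
#     def _run_command(name, *args, **kwargs):
#         return _COMMANDS[name](*args, **kwargs)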


def create_prediction_notebook(model_name, model_date, dataset_smartname,
                               script, preparation_output_schema,
                               split_stuff,