Python load_from_filepath Beispiele, dataiku.core.dkujson.load_from_filepath Python Beispiele

Beispiel #1

0

Datei anzeigen

    def __init__(self, split_desc, core_params, preprocessing_folder,
                 model_folder):
        """Keep the inputs, read the saved JSON parameters and build a predictor.

        ``rpreprocessing_params.json`` is read from ``preprocessing_folder``;
        ``user_meta.json`` and ``rmodeling_params.json`` are read from
        ``model_folder``. The predictor is built with an empty list of
        conditional outputs.
        """
        # Raw constructor arguments.
        self._split_desc = split_desc
        self._core_params = core_params
        self._preprocessing_folder = preprocessing_folder
        self._model_folder = model_folder

        # JSON parameter files, loaded eagerly (preprocessing first, then model).
        rpreprocessing_path = osp.join(preprocessing_folder,
                                       "rpreprocessing_params.json")
        self._preprocessing_params = dkujson.load_from_filepath(
            rpreprocessing_path)

        user_meta_path = osp.join(model_folder, "user_meta.json")
        self._user_meta = dkujson.load_from_filepath(user_meta_path)

        rmodeling_path = osp.join(model_folder, "rmodeling_params.json")
        self._modeling_params = dkujson.load_from_filepath(rmodeling_path)

        self._keras_scoring_batches = 100

        no_conditional_outputs = []  # not needed in this case
        self._predictor = build_predictor("PREDICTION",
                                          self._model_folder,
                                          self._preprocessing_folder,
                                          no_conditional_outputs,
                                          self._core_params,
                                          self._split_desc)

        # Attributes that start out empty.
        self._collector_data = self._preproc_handler = None
        self._pipeline = self._clf = None
        self._train_df = self._test_df = self._full_df = None

Beispiel #2

0

Datei anzeigen

def build_predictor_for_saved_model(model_folder, model_type,
                                    conditional_outputs):
    """Build a predictor from the files found in a saved model folder.

    ``core_params.json`` is only read when ``is_model_prediction(model_type)``
    is true; otherwise ``None`` is passed on as the core params.
    ``split/split.json`` is always read. ``model_folder`` is handed to
    ``build_predictor`` twice, as its second and third argument.
    """
    core_params = None
    if is_model_prediction(model_type):
        core_params_path = osp.join(model_folder, "core_params.json")
        core_params = dkujson.load_from_filepath(core_params_path)

    split_path = osp.join(model_folder, "split", "split.json")
    split_desc = dkujson.load_from_filepath(split_path)

    return build_predictor(model_type, model_folder, model_folder,
                           conditional_outputs, core_params, split_desc)

Beispiel #3

0

Datei anzeigen

    def save(self, pd_result):
        """Store a partial dependence result in ``iperf.json`` and return it.

        An existing entry for the same feature name (the first one found) is
        replaced by the new entry, which is appended at the end of the
        ``partialDependencies`` list. The updated content is written back to
        ``<self.folder>/iperf.json`` and returned.
        """
        iperf_path = os.path.join(self.folder, "iperf.json")
        iperf = dkujson.load_from_filepath(iperf_path)

        feature = pd_result.feature
        feature_name = feature.name

        known_entries = iperf.setdefault("partialDependencies", [])

        # Drop the first previously stored entry for this feature, if any.
        for position, entry in enumerate(known_entries):
            if entry.get('feature') == feature_name:
                del known_entries[position]
                break

        new_partial_dependence = {
            "data": list(pd_result.partial_dependence),
            "feature": feature_name,
            "distribution": pd_result.distribution,
            "computedPostTraining": True,
            "isDate": self.dtypes[feature_name] == "date",
            "unrepresentedModalities": pd_result.unrepresented_modalities,
        }

        indices_to_drop = pd_result.indices_to_drop
        if indices_to_drop is not None:
            new_partial_dependence["indicesToDrop"] = indices_to_drop

        # The scale is stored under a key that depends on the feature type.
        feature_type = feature.type
        if feature_type == 'CATEGORY':
            new_partial_dependence["categories"] = list(pd_result.scale)
        elif feature_type == 'NUMERIC':
            new_partial_dependence["featureBins"] = list(pd_result.scale)

        known_entries.append(new_partial_dependence)
        dkujson.dump_to_filepath(iperf_path, iperf)

        return iperf

Beispiel #4

0

Datei anzeigen

def load_relfilepath(basepath, relative_filepath):
    """Load the JSON file at ``basepath/relative_filepath``.

    Returns None if the file does not exist.
    """
    filepath = osp.join(basepath, relative_filepath)
    if not osp.exists(filepath):
        return None
    return dkujson.load_from_filepath(filepath)

Beispiel #5

0

Datei anzeigen

def write_running_traininfo(folder, start_time, listener):
    """Mark ``<folder>/train_info.json`` as RUNNING and write it back.

    The existing file content is reused when the file exists; otherwise a
    fresh dict is started. The progress comes from
    ``listener.to_jsonifiable()``.
    """
    status_filepath = osp.join(folder, "train_info.json")

    status = {}
    if osp.exists(status_filepath):
        status = dkujson.load_from_filepath(status_filepath)

    status["state"] = "RUNNING"
    status["startTime"] = start_time
    progress = listener.to_jsonifiable()
    status["progress"] = progress

    dkujson.dump_to_filepath(status_filepath, status)

Beispiel #6

0

Datei anzeigen

def clustering_rescore(
        split_desc,
        preprocessing_folder,
        model_folder):
    """Re-score a saved clustering model on the "full" split.

    ``split_desc`` is a JSON string and is decoded with ``dkujson.loads``.
    The preprocessing pipeline is rebuilt from the saved parameters and
    collector data, the pickled clusterer is loaded from
    ``<model_folder>/clusterer.pkl``, and a ``ClusteringModelScorer`` is run.
    Always returns the string "ok".
    """
    rpreprocessing_path = osp.join(preprocessing_folder, "rpreprocessing_params.json")
    preprocessing_params = dkujson.load_from_filepath(rpreprocessing_path)
    rmodeling_path = osp.join(model_folder, "rmodeling_params.json")
    modeling_params = dkujson.load_from_filepath(rmodeling_path)
    user_meta_path = osp.join(model_folder, "user_meta.json")
    user_meta = dkujson.load_from_filepath(user_meta_path)

    split = dkujson.loads(split_desc)
    per_feature = preprocessing_params["per_feature"]
    source_df = df_from_split_desc(split, "full", per_feature)
    logging.info("Loaded source df: shape=(%d,%d)" % source_df.shape)

    collector_path = osp.join(preprocessing_folder, "collector_data.json")
    collector_data = dkujson.load_from_filepath(collector_path)

    # Empty output folder: we're not saving the data.
    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params, "")
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    # Copy the index before the pipeline runs on the dataframe.
    source_df_index = source_df.index.copy()
    transformed_source = pipeline.fit_and_process(source_df)

    logging.info("Loading the clustering model")

    clusterer_path = osp.join(model_folder, "clusterer.pkl")
    with open(clusterer_path, "rb") as clusterer_file:
        clf = pickle.load(clusterer_file)

    try:
        logging.info("Post-processing the model")
        clf.post_process(user_meta)
    except AttributeError:
        pass

    train_np, _ = prepare_multiframe(transformed_source["TRAIN"], modeling_params)
    cluster_labels = clf.predict(train_np)

    logging.info("Rescoring the clustering model")
    scorer = ClusteringModelScorer(clf, transformed_source, source_df_index,
                                   cluster_labels, preprocessing_params,
                                   modeling_params, pipeline, model_folder)
    scorer.score()

    return "ok"

Beispiel #7

0

Datei anzeigen

def make_running_traininfo(folder, start_time, listener):
    """Return the RUNNING train-info status without writing it to disk.

    Starts from ``<folder>/train_info.json`` when that file exists, otherwise
    from an empty dict. ``listener`` is either a ``ProgressListener`` or an
    iterable that is folded with ``reduce(merge_listeners, ...)``.
    """
    status_filepath = osp.join(folder, "train_info.json")

    status = {}
    if osp.exists(status_filepath):
        status = dkujson.load_from_filepath(status_filepath)

    status["state"] = "RUNNING"
    status["startTime"] = start_time
    status["progress"] = (listener.to_jsonifiable()
                          if isinstance(listener, ProgressListener)
                          else reduce(merge_listeners, listener))
    return status

Beispiel #8

0

Datei anzeigen

def write_done_traininfo(folder, start_time, start_training_time, end_time, listener, end_preprocessing_time=None):
    """Mark ``<folder>/train_info.json`` as DONE, with timings, and write it.

    The preprocessing time is measured up to ``end_preprocessing_time`` when
    that value is truthy, otherwise up to ``start_training_time``. ``listener``
    is either a ``ProgressListener`` or an iterable that is folded with
    ``reduce(merge_listeners, ...)``.
    """
    status_filepath = osp.join(folder, "train_info.json")

    status = {}
    if osp.exists(status_filepath):
        status = dkujson.load_from_filepath(status_filepath)

    status["state"] = "DONE"
    status["startTime"] = start_time
    status["endTime"] = end_time

    preprocessing_end = end_preprocessing_time or start_training_time
    status["preprocessingTime"] = preprocessing_end - start_time
    status["trainingTime"] = end_time - start_training_time

    status["progress"] = (listener.to_jsonifiable()
                          if isinstance(listener, ProgressListener)
                          else reduce(merge_listeners, listener))

    dkujson.dump_to_filepath(status_filepath, status)

Beispiel #9

0

Datei anzeigen

def binary_classif_scoring_add_percentile_and_cond_outputs(pred_df, recipe_desc, model_folder, cond_outputs, target_map):
    """Add the probability percentile and conditional output columns to ``pred_df``.

    ``pred_df`` is modified in place and also returned.

    :param pred_df: pandas DataFrame of predictions; it must contain every
        column used as a conditional output ``"input"`` (and the positive
        class probability column when percentiles are needed)
    :param recipe_desc: dict read for ``"outputProbabilities"`` and
        ``"outputProbaPercentiles"``
    :param model_folder: folder holding ``perf.json``; only read when
        percentiles are needed
    :param cond_outputs: list of conditional output dicts (``"input"``,
        ``"name"``, ``"rules"``, optional ``"defaultOutput"``), or a falsy value
    :param target_map: mapping from class label to class id
    :raises Exception: if percentiles are needed but missing from ``perf.json``
    :raises ValueError: if a rule uses an operation other than GT/GE/LT/LE
    """
    inv_map = {
        int(class_id): label
        for label, class_id in target_map.items()
    }
    classes = [class_label for (_, class_label) in sorted(inv_map.items())]
    proba_cols = ["proba_{}".format(c) for c in classes]

    # Probabilities / percentiles are needed either because the recipe outputs
    # them or because at least one conditional output reads them.
    has_probas = recipe_desc["outputProbabilities"] or any(
        co["input"] in proba_cols for co in (cond_outputs or []))
    has_percentiles = recipe_desc["outputProbaPercentiles"] or any(
        co["input"] == "proba_percentile" for co in (cond_outputs or []))

    if has_percentiles:
        model_perf = dkujson.load_from_filepath(osp.join(model_folder, "perf.json"))
        # dict.has_key() does not exist on Python 3; .get() covers both the
        # "missing" and the "present but empty" cases.
        proba_percentiles = model_perf.get("probaPercentiles")
        if not proba_percentiles:
            raise Exception("Probability percentiles are missing from model.")
        percentile = pd.Series(proba_percentiles)
        proba_1 = "proba_" + str(inv_map[1])
        pred_df["proba_percentile"] = pred_df[proba_1].apply(
            lambda p: percentile.where(percentile <= p).count() + 1)

    if cond_outputs:
        for co in cond_outputs:
            inp = pred_df[co["input"]]
            acc = inp.notnull()  # rows not yet matched by any rule
            for r in co["rules"]:
                operation = r["operation"]
                if operation == 'GT':
                    cond = inp > r["operand"]
                elif operation == 'GE':
                    cond = inp >= r["operand"]
                elif operation == 'LT':
                    cond = inp < r["operand"]
                elif operation == 'LE':
                    cond = inp <= r["operand"]
                else:
                    # Previously this left `cond` unbound (first rule) or
                    # silently reused the mask of the previous rule.
                    raise ValueError(
                        "Unsupported conditional output operation: %s" % operation)
                pred_df.loc[acc & cond, co["name"]] = r["output"]
                acc = acc & (~cond)
            # Rows with a non-null input that matched no rule get the default.
            pred_df.loc[acc, co["name"]] = co.get("defaultOutput", "")

    if has_percentiles and not recipe_desc["outputProbaPercentiles"]:  # was only for conditional outputs
        pred_df.drop("proba_percentile", axis=1, inplace=True)
    if has_probas and not recipe_desc["outputProbabilities"]:  # was only for conditional outputs
        pred_df.drop(proba_cols, axis=1, inplace=True)

    return pred_df

Beispiel #10

0

Datei anzeigen

    def __init__(self,
                 parallel,
                 m_folder=None,
                 n_splits=None,
                 n_candidates=None,
                 timeout=None,
                 n_jobs=None,
                 evaluation_metric=None,
                 metric_sign=1):
        """Set up the watcher state, reloading earlier grid results if present.

        When ``m_folder`` is given, the watcher runs in "watching" mode: the
        grid folders are derived from it, the interrupt folder is registered,
        and grid points already stored in ``grid_search_done_py.json`` are
        loaded and logged. ``timeout`` is expressed in minutes; ``None`` means
        no deadline.
        """
        self.parallel = parallel
        self.m_folder = m_folder
        self.n_splits = n_splits
        self.n_candidates = n_candidates
        self._watching = self.m_folder is not None
        self.grid_search_summary = []

        if timeout is None:
            self.end_time = None
        else:
            self.end_time = time.time() + timeout * 60  # timeout in minutes

        self.initial_grid_points = []
        self.initial_grid_point_ids = []
        self.n_jobs = n_jobs
        self.evaluation_metric = evaluation_metric
        self.metric_sign = metric_sign
        self.start_time = unix_time_millis()
        self.is_interrupted = False

        if self._watching:
            self.grid_folder = os.path.join(self.m_folder, 'grid')
            self.grid_tmp_folder = os.path.join(self.m_folder, 'grid.tmp')
            interrupt_optimization.set_interrupt_folder(self.m_folder)
            self.grid_search_file = os.path.join(self.m_folder,
                                                 'grid_search_done_py.json')

            # Reuse the grid points that were already computed, if any.
            if os.path.exists(self.grid_search_file):
                self.grid_search_summary = dkujson.load_from_filepath(
                    self.grid_search_file)
            else:
                self.grid_search_summary = []

            self.initial_grid_point_ids = [
                point['grid_point_id'] for point in self.grid_search_summary
            ]
            self.initial_grid_points = self.grid_search_summary[:]

            for known_id in self.initial_grid_point_ids:
                logging.info(
                    "Using precomputed score for Grid point {}".format(
                        known_id))

        super(CVInterruptWatcherThread, self).__init__()

Beispiel #11

0

Datei anzeigen

def main(model_folder,
         input_dataset_smartname,
         output_dataset_smartname,
         recipe_desc,
         script,
         preparation_output_schema,
         cond_outputs=None):
    """Score the input dataset batch by batch and write the result.

    The input dataset is read through the preparation ``script``, in chunks of
    ``recipe_desc["pythonBatchSize"]`` rows (100000 by default). Each chunk is
    normalized, predicted with the pickled model from
    ``<model_folder>/clf.pkl`` and concatenated with the kept input columns
    before being written to the output dataset.

    :param model_folder: folder holding the saved parameters, the collector
        data and ``clf.pkl``
    :param input_dataset_smartname: name passed to ``dataiku.Dataset`` for reading
    :param output_dataset_smartname: name passed to ``dataiku.Dataset`` for writing
    :param recipe_desc: dict of recipe options
    :param script: dict whose ``"steps"`` are the preparation steps
    :param preparation_output_schema: schema dict; its ``"columns"`` entry is
        used to derive names and dtypes
    :param cond_outputs: conditional outputs, only used for binary classification
    :raises ValueError: if the prediction type in the core params is unknown
    """
    # Obtain a streamed result of the preparation
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" %
                 preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"],
                                        preparation_output_schema)

    # NOTE(review): not used below; kept in case construction has side effects.
    listener = ProgressListener()

    core_params = dkujson.load_from_filepath(
        osp.join(model_folder, "core_params.json"))
    preprocessing_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "actual_params.json"))["resolved"]
    collector_data = dkujson.load_from_filepath(
        osp.join(model_folder, "collector_data.json"))

    preprocessing_handler = PreprocessingHandler.build(core_params,
                                                       preprocessing_params,
                                                       model_folder)
    preprocessing_handler.collector_data = collector_data

    pipeline = preprocessing_handler.build_preprocessing_pipeline()

    batch_size = recipe_desc.get("pythonBatchSize", 100000)
    logging.info("Scoring with batch size: {}".format(batch_size))

    # NOTE(review): pickle.load executes arbitrary code from the file; the
    # model folder must be trusted.
    with open(osp.join(model_folder, "clf.pkl"), "rb") as f:
        clf = pickle.load(f)

    def output_generator():
        """Yield one output dataframe per input chunk."""
        logging.info("Start output generator ...")

        (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
            preparation_output_schema["columns"],
            parse_dates=True,
            infer_with_pandas=False)
        logging.info("Reading with INITIAL dtypes: %s" % dtypes)
        # The dtypes derived from the schema alone are replaced by the ones
        # computed from the per-feature preprocessing parameters.
        dtypes = utils.ml_dtypes_from_dss_schema(
            preparation_output_schema,
            preprocessing_params["per_feature"],
            prediction_type=core_params["prediction_type"])
        logging.info("Reading with dtypes: %s" % dtypes)

        # enumerate() instead of xrange(): works on both Python 2 and 3.
        for i, name in enumerate(names):
            logging.info("Column %s = %s (dtype=%s)" %
                         (i, name, dtypes.get(name, None)))

        for input_df in input_dataset.iter_dataframes_forced_types(
                names,
                dtypes,
                parse_date_columns,
                chunksize=batch_size,
                float_precision="round_trip"):
            # Reset the index to 0..n-1 for each chunk.
            input_df.index = range(input_df.shape[0])
            # Untouched copy, used for the kept input columns in the output.
            input_df_orig = input_df.copy()
            logging.info("Got a dataframe : %s" % str(input_df.shape))
            normalize_dataframe(input_df, preprocessing_params['per_feature'])

            for col in input_df:
                logging.info("NORMALIZED: %s -> %s" %
                             (col, input_df[col].dtype))

            logging.info("Processing it")
            logging.info("Predicting it")

            if core_params[
                    "prediction_type"] == constants.BINARY_CLASSIFICATION:
                pred_df = binary_classification_predict(
                    clf,
                    pipeline,
                    modeling_params,
                    preprocessing_params,
                    preprocessing_handler.target_map,
                    recipe_desc["forcedClassifierThreshold"],
                    input_df,
                    output_probas=recipe_desc["outputProbabilities"])
                # Probability percentile & Conditional outputs
                pred_df = binary_classif_scoring_add_percentile_and_cond_outputs(
                    pred_df, recipe_desc, model_folder, cond_outputs,
                    preprocessing_handler.target_map)

            elif core_params["prediction_type"] == constants.MULTICLASS:
                pred_df = multiclass_predict(
                    clf,
                    pipeline,
                    modeling_params,
                    preprocessing_params,
                    preprocessing_handler.target_map,
                    input_df,
                    output_probas=recipe_desc["outputProbabilities"])

            elif core_params["prediction_type"] == constants.REGRESSION:
                pred_df = regression_predict(clf, pipeline, modeling_params,
                                             input_df)

            else:
                raise ValueError("bad prediction type %s" %
                                 core_params["prediction_type"])

            logging.info("pred df debug :")
            logging.info(pred_df)

            logging.info("Done predicting it")
            # Input columns that clash with a prediction column are dropped so
            # that the concatenation has no duplicate names.
            if recipe_desc.get("filterInputColumns", False):
                clean_kept_columns = [
                    c for c in recipe_desc["keptInputColumns"]
                    if c not in pred_df.columns
                ]
            else:
                clean_kept_columns = [
                    c for c in input_df_orig.columns
                    if c not in pred_df.columns
                ]
            yield pd.concat([input_df_orig[clean_kept_columns], pred_df],
                            axis=1)

    output_dataset = dataiku.Dataset(output_dataset_smartname)
    logging.info("Starting writer")
    with output_dataset.get_writer() as writer:
        logging.info("Starting to iterate")
        for output_df in output_generator():
            logging.info("Generator generated a df %s" % str(output_df.shape))
            writer.write_dataframe(output_df)
            logging.info("Output df written")

Beispiel #12

0

Datei anzeigen

 def get_collector_data(self):
     """Return the collector data, loading it from disk on first use.

     The data is read from ``collector_data.json`` in the preprocessing
     folder and kept on the instance.
     """
     cached = self._collector_data
     if cached is not None:
         return cached
     collector_path = osp.join(self._preprocessing_folder,
                               "collector_data.json")
     self._collector_data = dkujson.load_from_filepath(collector_path)
     return self._collector_data

Beispiel #13

0

Datei anzeigen

def create_ensemble(split_desc, core_params, model_folder, preprocessing_folder, model_folders, preprocessing_folders):
    """Build, save and score an ensemble from already fitted models.

    ``split_desc``, ``core_params``, ``preprocessing_folders`` and
    ``model_folders`` are JSON strings and are decoded with ``dkujson.loads``.
    The function goes through three listener states (ensembling, saving,
    scoring), refreshing the running train info at the start of each, and
    writes ``iperf.json``, ``clf.pkl``, ``actual_params.json``, an empty
    ``preprocessing_report.json`` and the final train info.

    Always returns the string "ok".
    """
    listener = ProgressListener()
    listener.add_future_steps(constants.ENSEMBLE_STATES)
    start = unix_time_millis()

    def update_preprocessing_state():
        """Write the current listener state to the model folder's train info."""
        utils.write_running_traininfo(model_folder, start, listener)

    split_desc = dkujson.loads(split_desc)
    core_params = dkujson.loads(core_params)
    # Sample and class weighting are both derived from the one weight method.
    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    # TODO: update downstream
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    preprocessing_folders = dkujson.loads(preprocessing_folders)
    model_folders = dkujson.loads(model_folders)
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "rmodeling_params.json"))
    ensemble_params = modeling_params["ensemble_params"]
    logging.info("creating ensemble")
    with listener.push_state(constants.STATE_ENSEMBLING):
        update_preprocessing_state()
        from dataiku.doctor.prediction.ensembles import ensemble_from_fitted
        # The train set is read with the per-feature params of the first preprocessing.
        train = df_from_split_desc(split_desc, "train", ensemble_params["preprocessing_params"][0]["per_feature"],
                                   core_params["prediction_type"])
        iperf = {
            "modelInputNRows" : train.shape[0], #todo : not the right count as may have dropped ...
            "modelInputNCols" : -1, # makes no sense for an ensemble as may have different preprocessings
            "modelInputIsSparse" : False
        }
        dkujson.dump_to_filepath(osp.join(model_folder, "iperf.json"), iperf)
        clf = ensemble_from_fitted(core_params, ensemble_params, preprocessing_folders, model_folders, train, with_sample_weight, with_class_weight)

    logging.info("saving model")
    with listener.push_state(constants.STATE_SAVING):
        update_preprocessing_state()
        with open(osp.join(model_folder, "clf.pkl"), dku_write_mode_for_pickling()) as f:
            # Third argument is the pickle protocol (2).
            pickle.dump(clf, f, 2)

    logging.info("scoring model")
    with listener.push_state(constants.STATE_SCORING):
        update_preprocessing_state()
        test = df_from_split_desc(split_desc, "test", ensemble_params["preprocessing_params"][0]["per_feature"],
                                   core_params["prediction_type"])
        # this is annoying, but we have to use one of the previous preprocessings in order to get the target
        prep_folder = preprocessing_folders[0]
        rppp = dkujson.load_from_filepath(osp.join(prep_folder, "rpreprocessing_params.json"))
        collector_data = dkujson.load_from_filepath(osp.join(prep_folder, "collector_data.json"))
        preprocessing_handler = PreprocessingHandler.build(core_params, rppp, prep_folder)
        preprocessing_handler.collector_data = collector_data
        pipe = preprocessing_handler.build_preprocessing_pipeline(with_target=True)
        transformed = pipe.process(test)
        y = transformed["target"]

        if with_sample_weight:
            sample_weight = transformed["weight"]
        else:
            sample_weight = None

        # Now that the CLF with scorable pipelines has been saved, set it in "pipelines with target" mode
        # to be able to compute metrics
        clf.set_with_target_pipelines_mode(True)

        pred = clf.predict(test)
        # Probabilities and the target map only exist for classification.
        probas = None if core_params["prediction_type"] == "REGRESSION" else clf.predict_proba(test)
        target_map = None if core_params["prediction_type"] == "REGRESSION" else \
            {t["sourceValue"]: t["mappedValue"] for t in ensemble_params["preprocessing_params"][0]["target_remapping"]}
        prediction_type = core_params["prediction_type"]
        # Any prediction type other than regression / binary falls through to multiclass.
        if prediction_type == "REGRESSION":
            RegressionModelScorer(modeling_params, clf, pred, y, model_folder, transformed, test.index.copy(), sample_weight).score()
        elif prediction_type == "BINARY_CLASSIFICATION":
            BinaryClassificationModelScorer(modeling_params, clf, model_folder, pred, probas, y, target_map, transformed, test.index.copy(), sample_weight).score()
        else:
            MulticlassModelScorer(modeling_params, clf, model_folder, pred, probas, y.astype(int), target_map, transformed, test.index.copy(), sample_weight).score()

    update_preprocessing_state()
    end = unix_time_millis()
    dkujson.dump_to_filepath(osp.join(model_folder, "actual_params.json"), {"resolved": modeling_params})
    dkujson.dump_to_filepath(osp.join(preprocessing_folder, "preprocessing_report.json"), {})
    # `end` is passed as both the training start and the end time, and `start` as the
    # preprocessing end, so both recorded durations come out as zero.
    utils.write_done_traininfo(model_folder, start, end, end, listener, end_preprocessing_time=start)

    return "ok"

Beispiel #14

0

Datei anzeigen

def get_deep_learning_model_info(folder):
    """Return the content of ``<folder>/keras_model_training_info.json``."""
    return dkujson.load_from_filepath(
        osp.join(folder, "keras_model_training_info.json"))

Beispiel #15

0

Datei anzeigen

def build_predictor(model_type, model_folder, preprocessing_folder,
                    conditional_outputs, core_params, split_desc):
    """Load a saved model and its preprocessing, and wrap them in a predictor.

    Depending on ``modeling_params["algorithm"]`` the result is an
    ``EnsemblePredictor`` ("PYTHON_ENSEMBLE"), a ``KerasPredictor``
    ("KERAS_CODE") or a plain ``Predictor`` (anything else).

    :param model_type: passed to ``is_model_prediction`` to tell prediction
        models from clustering models
    :param model_folder: folder holding ``actual_params.json``,
        ``user_meta.json``, the model file and optionally ``perf.json``
    :param preprocessing_folder: folder holding ``rpreprocessing_params.json``
        and ``collector_data.json``
    :param conditional_outputs: forwarded to ``ModelParams``
    :param core_params: forwarded to the preprocessing handler and ``ModelParams``
    :param split_desc: dict; only its ``"schema"`` entry is used here
    :raises NotImplementedError: if the model file cannot be opened (IOError)
    """
    is_prediction = is_model_prediction(model_type)

    # import various parameters
    preprocessing_params = dkujson.load_from_filepath(
        osp.join(preprocessing_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "actual_params.json"))["resolved"]
    collector_data = dkujson.load_from_filepath(
        osp.join(preprocessing_folder, "collector_data.json"))
    user_meta = dkujson.load_from_filepath(
        osp.join(model_folder, "user_meta.json"))
    schema = split_desc["schema"]

    is_keras_backend = modeling_params["algorithm"] == "KERAS_CODE"

    # load model
    if is_keras_backend:
        try:

            # If model was trained on GPU, the prediction will always use GPU as well
            # In order for one model not to take all the GPU capabilities, we force TensorFlow
            # to "allow_growth" on each GPU, i.e. it will take only the required resources
            from dataiku.doctor.deep_learning import gpu
            gpu.load_gpu_options_only_allow_growth()

            from keras.models import load_model
            model_path = osp.join(model_folder, "keras_model.h5")
            model = load_model(model_path)
        except IOError:
            raise NotImplementedError(
                "Using saved models in python recipes is limited to models trained using the Keras engine"
            )
    else:
        try:
            # Prediction models are pickled as clf.pkl, clustering models as clusterer.pkl.
            pkl_path = osp.join(
                model_folder, "clf.pkl" if is_prediction else "clusterer.pkl")
            # NOTE(review): pickle.load executes arbitrary code from the file;
            # the model folder must be trusted.
            with open(pkl_path, "rb") as f:
                clf = pickle.load(f)
                # NOTE(review): this also hides an AttributeError raised
                # inside post_process itself, not only a missing method.
                try:
                    logging.info("Post-processing model")
                    clf.post_process(user_meta)
                except AttributeError:
                    pass
                    # method does not exist if model cannot be post-processed, just pass
        except IOError:
            raise NotImplementedError(
                "Using saved models in python recipes is limited to models trained using the python engine"
            )

    # Only prediction has perf.json
    if osp.isfile(osp.join(model_folder, "perf.json")):
        model_perf = dkujson.load_from_filepath(
            osp.join(model_folder, "perf.json"))
    else:
        model_perf = {}

    # Clustering models get a cluster id -> display name map from the user meta.
    if is_prediction:
        cluster_name_map = None
    else:
        cluster_name_map = {}
        if "clusterMetas" in user_meta:
            for cluster_id, cluster_data in user_meta["clusterMetas"].items():
                cluster_name_map[cluster_id] = cluster_data["name"]

    # create preprocessing
    from dataiku.doctor.preprocessing_handler import PreprocessingHandler
    from dataiku.doctor.preprocessing_handler import ClusteringPreprocessingHandler
    if is_prediction:
        preprocessing_handler = PreprocessingHandler.build(
            core_params, preprocessing_params, preprocessing_folder)
    else:
        preprocessing_handler = ClusteringPreprocessingHandler(
            {}, preprocessing_params, preprocessing_folder)

    preprocessing_handler.collector_data = collector_data

    params = ModelParams(model_type, modeling_params, preprocessing_params,
                         core_params, schema, user_meta, model_perf,
                         conditional_outputs, cluster_name_map)

    # The ensemble predictor is built without a preprocessing pipeline here.
    if modeling_params["algorithm"] == "PYTHON_ENSEMBLE":
        return EnsemblePredictor(params, clf)
    else:
        pipeline = preprocessing_handler.build_preprocessing_pipeline()

        if is_keras_backend:
            from dataiku.doctor.deep_learning.keras_utils import tag_special_features
            per_feature = preprocessing_params["per_feature"]
            tag_special_features(per_feature)
            preprocessing = KerasPreprocessing(pipeline, modeling_params,
                                               per_feature)
            return KerasPredictor(params,
                                  preprocessing,
                                  model,
                                  modeling_params,
                                  batch_size=100)
        else:
            preprocessing = Preprocessing(pipeline, modeling_params)
            features = _generate_features(collector_data, pipeline)
            return Predictor(params, preprocessing, features, clf)

Beispiel #16

0

Datei anzeigen

def ensemble_from_fitted(core_params,
                         ensemble_params,
                         prep_folders,
                         model_folders,
                         train,
                         with_sample_weight=False,
                         with_class_weight=False):
    """Fit an ensembler on the predictions of already fitted models.

    For each model id in ``ensemble_params["model_ids"]`` the pickled model is
    loaded from the matching entry of ``model_folders``, its preprocessing
    pipelines are built (or reused when another model shares the same
    preprocessing hash), and its predictions on ``train`` are collected.
    An ensembler is then fitted on those predictions.

    :param core_params: dict; ``"prediction_type"`` selects the prediction
        and ensembler functions
    :param ensemble_params: dict describing the ensemble (model ids,
        preprocessing hashes and params, modeling params, ``proba_inputs``,
        optional ``thresholds``)
    :param prep_folders: preprocessing folders, indexed like the model ids
    :param model_folders: model folders, indexed like the model ids
    :param train: training data given to the pipelines and prediction functions
    :param with_sample_weight: when true, sample weights are read from the
        transformed data and passed to the ensembler
    :param with_class_weight: forwarded to the classification ensemblers
    :return: an ``EnsembleModel``
    """
    logging.debug("creating ensemble for doctor")
    model_ids = ensemble_params["model_ids"]
    prep_hashes = ensemble_params["preprocessing_hashes"]
    # Preprocessing params keyed by their hash.
    rppp_map = {
        h: prep
        for h, prep in zip(ensemble_params["ordered_hashes"],
                           ensemble_params["preprocessing_params"])
    }
    # Cache of built pipelines, keyed by preprocessing hash.
    pipe_map = {}
    preds = []
    clfs = []
    y = None
    sample_weight = None
    target_map = None if "target_remapping" not in ensemble_params["preprocessing_params"][0] else \
        {x["sourceValue"]: x["mappedValue"] for x in ensemble_params["preprocessing_params"][0]["target_remapping"]}
    proba_inputs = ensemble_params["proba_inputs"]
    for i in range(len(model_ids)):
        fmi = model_ids[i]
        # NOTE(review): `hash` shadows the builtin of the same name inside this function.
        hash = prep_hashes[fmi]
        prep = rppp_map[hash]
        if hash in pipe_map:
            # prep pipeline was already cached
            pipe_with_target = pipe_map[hash]["with_target"]
        else:
            # load the preparation pipeline
            from dataiku.doctor.preprocessing_handler import PredictionPreprocessingHandler
            prep_folder = prep_folders[i]
            collector_data = dkujson.load_from_filepath(
                osp.join(prep_folder, "collector_data.json"))

            # Build a pipe with target for fitting the ensemble
            preprocessing_handler = PredictionPreprocessingHandler.build(
                core_params, prep, prep_folder)
            preprocessing_handler.collector_data = collector_data
            pipe_with_target = preprocessing_handler.build_preprocessing_pipeline(
                with_target=True)

            # Also build a pipe without target for scoring
            preprocessing_handler = PredictionPreprocessingHandler.build(
                core_params, prep, prep_folder)
            preprocessing_handler.collector_data = collector_data
            scorable_pipe = preprocessing_handler.build_preprocessing_pipeline(
                with_target=False)

            pipe_map[hash] = {
                "with_target": pipe_with_target,
                "scorable": scorable_pipe
            }
        # NOTE(review): pickle.load executes arbitrary code from the file; the
        # model folders must be trusted.
        with open(osp.join(model_folders[i], "clf.pkl"), "rb") as clf_file:
            clf = pickle.load(clf_file)
        clfs.append(clf)
        # Target and weights are taken once, from the first model's pipeline.
        if y is None:
            # because some rows might be dropped, we have to recover the target here
            transformed = pipe_with_target.process(train)
            y = transformed["target"]
            # because some rows might be dropped, we have to recover the sample weights here
            if with_sample_weight:
                sample_weight = transformed["weight"]
        # todo : group this to avoid multiple preprocessings.
        modeling_params = ensemble_params["modeling_params"][i]
        if core_params["prediction_type"] == "REGRESSION":
            p = regression_predict(clf, pipe_with_target, modeling_params,
                                   train)["prediction"]
        elif core_params["prediction_type"] == "BINARY_CLASSIFICATION":
            # Threshold defaults to 0.5 when the ensemble has no per-model thresholds.
            threshold = 0.5 if "thresholds" not in ensemble_params else ensemble_params[
                "thresholds"][i]
            from dataiku.doctor.prediction import binary_classification_predict
            p_df = binary_classification_predict(clf, pipe_with_target,
                                                 modeling_params, prep,
                                                 target_map, threshold, train)
            if proba_inputs:
                p = extract_probas(p_df, prep["target_remapping"])
            else:
                p = p_df["prediction"]
        else:
            # Any other prediction type is handled as multiclass.
            from dataiku.doctor.prediction import multiclass_predict
            p_df = multiclass_predict(clf, pipe_with_target, modeling_params,
                                      prep, target_map, train)
            if proba_inputs:
                p = extract_probas(p_df, prep["target_remapping"])
            else:
                p = p_df["prediction"]
        preds.append(p)

    # fit the ensemble
    # NOTE(review): `prep` below is the value left over from the last loop
    # iteration; it is unbound if `model_ids` is empty.
    if core_params["prediction_type"] == "REGRESSION":
        ensembler = get_regression_ensembler(ensemble_params, preds, y,
                                             sample_weight)
    elif proba_inputs:
        ensembler = get_probabilistic_ensembler(len(prep["target_remapping"]),
                                                ensemble_params, preds, y,
                                                sample_weight,
                                                with_class_weight)
    else:
        ensembler = get_classifier_ensembler(len(prep["target_remapping"]),
                                             ensemble_params, preds, y,
                                             sample_weight, with_class_weight)

    # NOTE(review): these lookups raise KeyError for any ordered hash that no
    # model in `model_ids` uses, since pipe_map is only filled in the loop above.
    scorable_pipes = [
        pipe_map[h]["scorable"] for h in ensemble_params["ordered_hashes"]
    ]
    pipes_with_target = [
        pipe_map[h]["with_target"] for h in ensemble_params["ordered_hashes"]
    ]
    return EnsembleModel(core_params, ensemble_params, scorable_pipes,
                         pipes_with_target, clfs, ensembler)

Beispiel #17

0

Datei anzeigen

if __name__ == "__main__":
    # Script entry point: set up the environment, then dispatch on the
    # execution type to the matching scoring recipe.
    setup_log()
    read_dku_env_and_set()
    execution = read_execution()
    execution_id = execution['id']

    with ErrorMonitoringWrapper():
        load_libs()
        logging.info("Launching doctor main")

        is_python_scoring = execution['type'] == 'RECIPE_PREDICTION_SCORE_PYTHON'
        is_keras_scoring = (not is_python_scoring and
                            execution['type'] == 'RECIPE_PREDICTION_SCORE_KERAS')

        # Any other execution type is ignored.
        if is_python_scoring or is_keras_scoring:
            if is_python_scoring:
                from dataiku.doctor.prediction.reg_scoring_recipe import main
            else:
                from dataiku.doctor.prediction.keras_scoring_recipe import main

            names = json.loads(execution['payload'])
            input_name = names['inputDatasetSmartName']
            output_name = names['outputDatasetSmartName']

            # Both recipes take the same work files, loaded in the same order.
            recipe_desc = dkujson.load_from_filepath('work/desc.json')
            script = dkujson.load_from_filepath('work/script.json')
            preparation_output_schema = dkujson.load_from_filepath(
                'work/preparation_output_schema.json')
            conditional_outputs = dkujson.load_from_filepath(
                'work/conditional_outputs.json')

            main('model', input_name, output_name, recipe_desc, script,
                 preparation_output_schema, conditional_outputs)

Beispiel #18

0

Datei anzeigen

def main(model_folder, input_dataset_smartname, output_dataset_smartname,
         recipe_desc, script, preparation_output_schema):
    """Score a dataset with a saved clustering model and write the result.

    The prepared input is streamed in chunks of 100000 rows. Each chunk is
    normalized, run through the preprocessing pipeline, assigned cluster
    labels and written to the output dataset.

    Args:
        model_folder: Folder holding ``rpreprocessing_params.json``,
            ``actual_params.json``, ``collector_data.json``,
            ``user_meta.json`` and ``clusterer.pkl``.
        input_dataset_smartname: Name of the dataset to score.
        output_dataset_smartname: Name of the dataset receiving the result.
        recipe_desc: Recipe settings; the keys read here are
            ``filterInputColumns`` and ``keptInputColumns``.
        script: Preparation script; its ``"steps"`` are applied to the input.
        preparation_output_schema: Schema of the prepared data; its
            ``"columns"`` entry drives the dtypes used for reading.
    """
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" %
                 preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"],
                                        preparation_output_schema)

    # NOTE(review): never used below; kept in case construction matters.
    listener = ProgressListener()

    preprocessing_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "actual_params.json"))["resolved"]
    collector_data = dkujson.load_from_filepath(
        osp.join(model_folder, "collector_data.json"))

    # Name remapping: user-defined display names override the default ones
    user_meta = dkujson.load_from_filepath(
        osp.join(model_folder, "user_meta.json"))
    cluster_name_map = {}
    if "clusterMetas" in user_meta:
        logging.info("Cluster metas: %s" % user_meta["clusterMetas"])
        for (cluster_id, cluster_data) in user_meta["clusterMetas"].items():
            cluster_name_map[cluster_id] = cluster_data["name"]

    preprocessing_handler = ClusteringPreprocessingHandler(
        {}, preprocessing_params, model_folder)
    preprocessing_handler.collector_data = collector_data
    pipeline = preprocessing_handler.build_preprocessing_pipeline()

    # NOTE(review): unpickling executes arbitrary code; model_folder must
    # come from a trusted location.
    with open(osp.join(model_folder, "clusterer.pkl"), "rb") as f:
        clf = pickle.load(f)

    try:
        logging.info("Post-processing model")
        clf.post_process(user_meta)
    except AttributeError:
        # method does not exist if model cannot be post-processed, just pass
        # NOTE(review): an AttributeError raised inside post_process itself
        # is swallowed here as well.
        pass

    try:
        custom_labels = clf.get_cluster_labels()

        def map_fun_custom(i):
            name = custom_labels[i]
            return cluster_name_map.get(name, name)

        naming = map_fun_custom
    except AttributeError:
        # The model exposes no custom labels: use "cluster_<index>" names

        def map_fun(i):
            name = "cluster_%i" % i
            return cluster_name_map.get(name, name)

        naming = map_fun

    def output_generator():
        """Yield one scored dataframe per chunk of the input dataset."""
        logging.info("Start output generator ...")

        (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
            preparation_output_schema["columns"],
            parse_dates=True,
            infer_with_pandas=False)
        logging.info("Reading with INITIAL dtypes: %s" % dtypes)
        dtypes = utils.ml_dtypes_from_dss_schema(
            preparation_output_schema, preprocessing_params["per_feature"])
        logging.info("Reading with dtypes: %s" % dtypes)

        for input_df in input_dataset.iter_dataframes_forced_types(
                names, dtypes, parse_date_columns, chunksize=100000):
            input_df.index = range(input_df.shape[0])
            # Keep an untouched copy: normalization below works in place
            input_df_orig = input_df.copy()
            if recipe_desc.get("filterInputColumns", False):
                input_df_orig = input_df_orig[recipe_desc["keptInputColumns"]]

            logging.info("Got a dataframe : %s" % str(input_df.shape))
            normalize_dataframe(input_df, preprocessing_params['per_feature'])

            for col in input_df:
                logging.info("NORMALIZED: %s -> %s" %
                             (col, input_df[col].dtype))

            logging.info("Processing it")
            transformed = pipeline.process(input_df)
            logging.info("Applying it")

            (labels_arr,
             additional_columns) = clustering_predict(modeling_params, clf,
                                                      transformed)
            cluster_labels = pd.Series(labels_arr,
                                       name="cluster_labels").map(naming)
            # Re-align labels on the rows that went through the pipeline
            cluster_labels.index = transformed["TRAIN"].index

            # Left join: input rows without a label get NaN
            labelled_df = input_df_orig.join(cluster_labels, how='left')
            final_df = pd.concat([labelled_df, additional_columns], axis=1)

            if preprocessing_params["outliers"]["method"] == "CLUSTER":
                outliers_cluster_name = cluster_name_map.get(
                    constants.CLUSTER_OUTLIERS, constants.CLUSTER_OUTLIERS)
                # Assign the result back instead of calling fillna with
                # inplace=True on the selected column: the chained in-place
                # form does not update final_df under pandas copy-on-write.
                final_df['cluster_labels'] = final_df[
                    'cluster_labels'].fillna(outliers_cluster_name)

            logging.info("Done predicting it")

            yield final_df

    output_dataset = dataiku.Dataset(output_dataset_smartname)
    logging.info("Starting writer")
    with output_dataset.get_writer() as writer:
        logging.info("Starting to iterate")
        for output_df in output_generator():
            logging.info("Generator generated a df %s" % str(output_df.shape))
            writer.write_dataframe(output_df)
            logging.info("Output df written")

Beispiel #19

0

Datei anzeigen

def main(model_folder,
         input_dataset_smartname,
         output_dataset_smartname,
         metrics_dataset_smartname,
         recipe_desc,
         script,
         preparation_output_schema,
         cond_outputs=None):
    """Score a dataset with a saved prediction model and evaluate the result.

    The whole prepared input is read into one dataframe, run through the
    target-aware preprocessing pipeline and scored with the pickled model
    found in ``model_folder``. The scored rows are written to the output
    dataset and, when a metrics dataset name is given, the metrics computed
    by ``compute_metrics_df`` are written there.

    Args:
        model_folder: Folder holding ``core_params.json``,
            ``rpreprocessing_params.json``, ``rmodeling_params.json``,
            ``collector_data.json``, ``clf.pkl`` and, when percentiles are
            needed, ``perf.json``.
        input_dataset_smartname: Name of the dataset to score.
        output_dataset_smartname: Name of the dataset receiving scored rows.
        metrics_dataset_smartname: Name of the dataset receiving metrics;
            nothing is written when it is falsy.
        recipe_desc: Recipe settings; the keys read here are
            ``forcedClassifierThreshold``, ``outputProbabilities``,
            ``outputProbaPercentiles``, ``filterInputColumns`` and
            ``keptInputColumns``.
        script: Preparation script; its ``"steps"`` are applied to the input.
        preparation_output_schema: Schema of the prepared data.
        cond_outputs: Optional list of conditional-output definitions, each
            with ``input``, ``name``, ``rules`` and optionally
            ``defaultOutput``. Only used for binary classification.

    Raises:
        ValueError: If the prediction type is not supported, or if a
            conditional-output rule uses an unknown operation.
        Exception: If probability percentiles are required but missing from
            ``perf.json``.
    """
    # Obtain a streamed result of the preparation
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" %
                 preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"],
                                        preparation_output_schema)

    core_params = dkujson.load_from_filepath(
        osp.join(model_folder, "core_params.json"))
    preprocessing_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rmodeling_params.json"))
    collector_data = dkujson.load_from_filepath(
        osp.join(model_folder, "collector_data.json"))

    preprocessing_handler = PreprocessingHandler.build(core_params,
                                                       preprocessing_params,
                                                       model_folder)
    preprocessing_handler.collector_data = collector_data

    pipeline = preprocessing_handler.build_preprocessing_pipeline(
        with_target=True)

    # NOTE(review): unpickling executes arbitrary code; model_folder must
    # come from a trusted location.
    with open(osp.join(model_folder, "clf.pkl"), "rb") as f:
        clf = pickle.load(f)

    logging.info("Scoring data")

    (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
        preparation_output_schema["columns"],
        parse_dates=True,
        infer_with_pandas=False)
    logging.info("Reading with INITIAL dtypes: %s" % dtypes)
    dtypes = utils.ml_dtypes_from_dss_schema(
        preparation_output_schema,
        preprocessing_params["per_feature"],
        prediction_type=core_params["prediction_type"])
    logging.info("Reading with dtypes: %s" % dtypes)

    # enumerate works on both Python 2 and 3, unlike xrange
    for (col_idx, col_name) in enumerate(names):
        logging.info("Column %s = %s (dtype=%s)" %
                     (col_idx, col_name, dtypes.get(col_name, None)))

    with input_dataset._stream(infer_with_pandas=True,
                               sampling='head',
                               sampling_column=None,
                               limit=None,
                               ratio=None,
                               columns=names) as stream:
        input_df = pd.read_table(stream,
                                 names=names,
                                 dtype=dtypes,
                                 header=None,
                                 sep='\t',
                                 doublequote=True,
                                 quotechar='"',
                                 parse_dates=parse_date_columns,
                                 float_precision="round_trip")

    # Keep an untouched copy: normalization below works in place
    input_df_orig = input_df.copy()
    logging.info("Got a dataframe : %s" % str(input_df.shape))
    normalize_dataframe(input_df, preprocessing_params['per_feature'])

    for col in input_df:
        logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

    logging.info("Processing it")
    transformed = pipeline.process(input_df)
    logging.info("Predicting it")

    if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
        pred_df = binary_classification_predict(
            clf,
            pipeline,
            modeling_params,
            preprocessing_params,
            preprocessing_handler.target_map,
            recipe_desc["forcedClassifierThreshold"],
            input_df,
            output_probas=recipe_desc["outputProbabilities"],
            # For ensemble model, we need to indicate that we have target, so that a target-aware pipeline is
            # selected. See 0c87605 for more information
            ensemble_has_target=True)

        # Probability percentile & Conditional outputs
        has_cond_output = recipe_desc["outputProbabilities"] and cond_outputs
        has_percentiles = recipe_desc["outputProbaPercentiles"] or (
            has_cond_output and any(co["input"] == "proba_percentile"
                                    for co in cond_outputs))
        if has_percentiles:
            model_perf = dkujson.load_from_filepath(
                osp.join(model_folder, "perf.json"))
            # dict.get replaces the Python 2-only dict.has_key
            proba_percentiles = model_perf.get("probaPercentiles")
            if proba_percentiles:
                percentile = pd.Series(proba_percentiles)
                # Label of the class mapped to 1; the next() builtin
                # replaces the Python 2-only generator .next() method
                positive_label = next(
                    k for k, v in preprocessing_handler.target_map.items()
                    if v == 1)
                proba_1 = "proba_" + str(positive_label)
                pred_df["proba_percentile"] = pred_df[proba_1].apply(
                    lambda p: percentile.where(percentile <= p).count() + 1)
            else:
                raise Exception(
                    "Probability percentiles are missing from model.")
        if has_cond_output:
            for co in cond_outputs:
                inp = pred_df[co["input"]]
                acc = inp.notnull()  # condition accumulator
                # Rules apply in order: a row takes the output of the first
                # rule it matches, remaining rows get the default output
                for r in co["rules"]:
                    if r["operation"] == 'GT':
                        cond = inp > r["operand"]
                    elif r["operation"] == 'GE':
                        cond = inp >= r["operand"]
                    elif r["operation"] == 'LT':
                        cond = inp < r["operand"]
                    elif r["operation"] == 'LE':
                        cond = inp <= r["operand"]
                    else:
                        # Fail loudly rather than reuse the previous rule's
                        # condition (or hit an unbound local)
                        raise ValueError(
                            "bad conditional output operation %s" %
                            r["operation"])
                    pred_df.loc[acc & cond, co["name"]] = r["output"]
                    acc = acc & (~cond)
                pred_df.loc[acc, co["name"]] = co.get("defaultOutput", "")
        if has_percentiles and not recipe_desc[
                "outputProbaPercentiles"]:  # was only for conditional outputs
            pred_df.drop("proba_percentile", axis=1, inplace=True)

    elif core_params["prediction_type"] == constants.MULTICLASS:
        pred_df = multiclass_predict(
            clf,
            pipeline,
            modeling_params,
            preprocessing_params,
            preprocessing_handler.target_map,
            input_df,
            output_probas=recipe_desc["outputProbabilities"],
            # For ensemble model, we need to indicate that we have target, so that a target-aware pipeline is
            # selected. See 0c87605 for more information
            ensemble_has_target=True)

    elif core_params["prediction_type"] == constants.REGRESSION:
        pred_df = regression_predict(
            clf,
            pipeline,
            modeling_params,
            input_df,
            # For ensemble model, we need to indicate that we have target, so that a target-aware pipeline is
            # selected. See 0c87605 for more information
            ensemble_has_target=True)
    else:
        raise ValueError("bad prediction type %s" %
                         core_params["prediction_type"])

    # add error information to pred_df
    y = transformed["target"]
    target_mapping = {}
    if core_params["prediction_type"] in [
            constants.BINARY_CLASSIFICATION, constants.MULTICLASS
    ]:
        target_mapping = {
            label: int(class_id)
            for label, class_id in preprocessing_handler.target_map.items()
        }

    pred_df = add_evaluation_columns(core_params["prediction_type"], pred_df,
                                     y, target_mapping)

    logging.info("Done predicting it")
    # Input columns that share a name with a prediction column are dropped
    # so that the prediction column wins in the concatenated output
    if recipe_desc.get("filterInputColumns", False):
        clean_kept_columns = [
            c for c in recipe_desc["keptInputColumns"]
            if c not in pred_df.columns
        ]
    else:
        clean_kept_columns = [
            c for c in input_df_orig.columns if c not in pred_df.columns
        ]
    output_df = pd.concat([input_df_orig[clean_kept_columns], pred_df], axis=1)

    # write scored data
    output_dataset = dataiku.Dataset(output_dataset_smartname)
    #logging.info("writing scored schema")
    #output_dataset.write_schema_from_dataframe(output_df)  # backend should do this
    logging.info("writing scored data")
    output_dataset.write_from_dataframe(output_df)

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {
        "SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"
    }
    if with_sample_weight:
        sample_weight = transformed["weight"]
    else:
        sample_weight = None

    metrics_df = compute_metrics_df(core_params["prediction_type"],
                                    target_mapping, modeling_params, output_df,
                                    recipe_desc, y, transformed["UNPROCESSED"],
                                    sample_weight)

    # write metrics dataset
    if metrics_dataset_smartname:
        metrics_dataset = dataiku.Dataset(metrics_dataset_smartname)
        #logging.info("writing metrics schema")
        #metrics_dataset.write_schema_from_dataframe(metrics_df)  # backend should maybe do this ?
        logging.info("writing metrics data")
        metrics_dataset.write_from_dataframe(metrics_df)

Beispiel #20

0

Datei anzeigen

    output_df = pd.concat(output_list)
    input_df = pd.concat(input_df_list)

    logging.info("writing scored data")
    output_dataset = dataiku.Dataset(output_dataset_smartname)
    output_dataset.write_from_dataframe(output_df)

    # Compute and write Metrics Dataset
    # Don't need to provide sample weight because not supported by KERAS backend
    metrics_df = compute_metrics_df(prediction_type,
                                    target_mapping,
                                    modeling_params,
                                    output_df,
                                    recipe_desc,
                                    y,
                                    unprocessed=input_df,
                                    sample_weight=None)

    logging.info("writing metrics data")
    metrics_dataset = dataiku.Dataset(metrics_dataset_smartname)
    metrics_dataset.write_from_dataframe(metrics_df)


if __name__ == "__main__":
    # Command line: argv[1..4] are passed through as plain strings,
    # argv[5..8] are paths to JSON files loaded before the call.
    read_dku_env_and_set()

    cli = sys.argv
    main(cli[1],
         cli[2],
         cli[3],
         cli[4],
         dkujson.load_from_filepath(cli[5]),
         dkujson.load_from_filepath(cli[6]),
         dkujson.load_from_filepath(cli[7]),
         dkujson.load_from_filepath(cli[8]))

Beispiel #21

0

Datei anzeigen

def main(model_folder,
         input_dataset_smartname,
         output_dataset_smartname,
         metrics_dataset_smartname,
         recipe_desc,
         script,
         preparation_output_schema,
         cond_outputs=None):
    """Score a dataset with a saved Keras model and write scores and metrics.

    Chunks produced by ``scored_dataset_generator`` are collected, then
    concatenated. The scored rows go to the output dataset and the result of
    ``compute_metrics_df`` goes to the metrics dataset.
    """

    def _load_model_json(filename):
        # Every model description file lives directly in model_folder
        return dkujson.load_from_filepath(osp.join(model_folder, filename))

    # Fetching information about the model
    core_params = _load_model_json("core_params.json")
    preprocessing_params = _load_model_json("rpreprocessing_params.json")
    modeling_params = _load_model_json("actual_params.json")["resolved"]
    collector_data = _load_model_json("collector_data.json")

    prediction_type = core_params["prediction_type"]
    preprocessing_handler = PreprocessingHandler.build(
        core_params, preprocessing_params, model_folder)
    preprocessing_handler.collector_data = collector_data

    # Only classification tasks carry a label -> class id mapping
    if core_params["prediction_type"] in (constants.BINARY_CLASSIFICATION,
                                          constants.MULTICLASS):
        target_mapping = dict(
            (label, int(class_id))
            for (label, class_id) in preprocessing_handler.target_map.items())
    else:
        target_mapping = {}

    # Retrieving scored data with generator (in order to prevent from out of memory errors with
    # big preprocessing)
    scored_chunks = scored_dataset_generator(
        model_folder,
        input_dataset_smartname,
        recipe_desc,
        script,
        preparation_output_schema,
        cond_outputs,
        output_y=True,
        output_input_df=True,
        should_add_evaluation_columns=True)

    logging.info("Starting to iterate")
    targets, scored_frames, raw_frames = [], [], []
    for chunk in scored_chunks:
        scored_frame = chunk["scored"]
        scored_frames.append(scored_frame)
        targets.append(chunk["y"])
        raw_frames.append(chunk["input_df"])
        logging.info("Generator generated a df {}".format(
            str(scored_frame.shape)))

    y = pd.concat(targets)
    output_df = pd.concat(scored_frames)
    input_df = pd.concat(raw_frames)

    logging.info("writing scored data")
    dataiku.Dataset(output_dataset_smartname).write_from_dataframe(output_df)

    # Compute and write Metrics Dataset
    # Don't need to provide sample weight because not supported by KERAS backend
    metrics_df = compute_metrics_df(prediction_type,
                                    target_mapping,
                                    modeling_params,
                                    output_df,
                                    recipe_desc,
                                    y,
                                    unprocessed=input_df,
                                    sample_weight=None)

    logging.info("writing metrics data")
    dataiku.Dataset(metrics_dataset_smartname).write_from_dataframe(metrics_df)

Beispiel #22

0

Datei anzeigen

 def aggregate_grid_dir(self):
     """Pass each file of the grid folder to process_line, then delete it."""
     for entry_name in os.listdir(self.grid_folder):
         entry_path = os.path.join(self.grid_folder, entry_name)
         grid_point = dkujson.load_from_filepath(entry_path)
         self.process_line(grid_point)
         # The file is only removed after it has been processed
         os.remove(entry_path)

Beispiel #23

0

Datei anzeigen

def scored_dataset_generator(model_folder,
                             input_dataset_smartname,
                             recipe_desc,
                             script,
                             preparation_output_schema,
                             cond_outputs,
                             output_y=False,
                             output_input_df=False,
                             should_add_evaluation_columns=False):
    """Yield the input dataset scored chunk by chunk with a saved Keras model.

    Args:
        model_folder: Folder holding ``core_params.json``,
            ``rpreprocessing_params.json``, ``collector_data.json``,
            ``actual_params.json`` and the Keras model file.
        input_dataset_smartname: Name of the dataset to score.
        recipe_desc: Recipe settings; the keys read here are ``useGPU``,
            ``gpuList``, ``gpuAllowGrowth``, ``perGPUMemoryFraction``,
            ``batchSize`` (default 100), ``forcedClassifierThreshold``,
            ``outputProbabilities``, ``filterInputColumns`` and
            ``keptInputColumns``.
        script: Preparation script; its ``"steps"`` are applied to the input.
        preparation_output_schema: Schema of the prepared data.
        cond_outputs: Conditional-output definitions (may be empty or None).
        output_y: When true, build the pipeline with the target and add the
            transformed target to each yielded dict under ``"y"``.
        output_input_df: When true, add the unmodified input chunk under
            ``"input_df"``.
        should_add_evaluation_columns: When true, call
            ``add_evaluation_columns`` on each chunk; requires ``output_y``.

    Yields:
        One dict per chunk with key ``"scored"`` (kept input columns plus
        prediction columns) and, depending on the flags, ``"y"`` and
        ``"input_df"``.

    Raises:
        ValueError: If evaluation columns are requested without ``output_y``,
            or if the model's prediction type is not supported. Both are
            raised while processing the first chunk.
    """
    from keras.models import load_model
    from dataiku.doctor.deep_learning import gpu
    from dataiku.doctor.deep_learning.keras_utils import tag_special_features, split_train_per_input

    # Load GPU Options
    if recipe_desc["useGPU"]:
        gpu.load_gpu_options(recipe_desc["gpuList"],
                             allow_growth=recipe_desc["gpuAllowGrowth"],
                             per_process_gpu_memory_fraction=float(
                                 recipe_desc["perGPUMemoryFraction"]))
    else:
        gpu.deactivate_gpu()

    batch_size = recipe_desc.get("batchSize", 100)

    # Obtain a streamed result of the preparation
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" %
                 preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"],
                                        preparation_output_schema)

    core_params = dkujson.load_from_filepath(
        osp.join(model_folder, "core_params.json"))
    preprocessing_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rpreprocessing_params.json"))
    collector_data = dkujson.load_from_filepath(
        osp.join(model_folder, "collector_data.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "actual_params.json"))["resolved"]

    prediction_type = core_params["prediction_type"]

    # Tagging special features to take them into account only in special_preproc_handler/special_pipeline
    per_feature = preprocessing_params["per_feature"]
    tag_special_features(per_feature)

    preproc_handler = PreprocessingHandler.build(core_params,
                                                 preprocessing_params,
                                                 model_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline(
        with_target=output_y)
    target_map = preproc_handler.target_map

    logging.info("Loading model")
    model = load_model(osp.join(model_folder, constants.KERAS_MODEL_FILENAME))

    logging.info("Start output generator")

    (names, dtypes,
     parse_date_columns) = dataiku.Dataset.get_dataframe_schema_st(
         preparation_output_schema["columns"],
         parse_dates=True,
         infer_with_pandas=False)
    logging.info("Reading with INITIAL dtypes: %s" % dtypes)
    dtypes = utils.ml_dtypes_from_dss_schema(
        preparation_output_schema,
        preprocessing_params["per_feature"],
        prediction_type=prediction_type)
    logging.info("Reading with dtypes: %s" % dtypes)

    # enumerate works on both Python 2 and 3, unlike xrange
    for (col_idx, col_name) in enumerate(names):
        logging.info("Column %s = %s (dtype=%s)" %
                     (col_idx, col_name, dtypes.get(col_name, None)))

    for input_df in input_dataset.iter_dataframes_forced_types(
            names, dtypes, parse_date_columns, chunksize=batch_size):

        input_df.index = range(input_df.shape[0])
        # Keep an untouched copy: normalization below works in place
        input_df_orig = input_df.copy()
        logging.info("Got a dataframe chunk : %s" % str(input_df.shape))
        normalize_dataframe(input_df, preprocessing_params['per_feature'])

        for col in input_df:
            logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

        logging.info("Processing chunk")

        transformed = pipeline.process(input_df)
        train_features = transformed["TRAIN"]

        inputs_dict = split_train_per_input(
            train_features, per_feature, pipeline.generated_features_mapping)

        if prediction_type in [
                constants.MULTICLASS, constants.BINARY_CLASSIFICATION
        ]:

            # class id -> original label, and labels ordered by class id
            inv_map = {
                int(class_id): label
                for label, class_id in target_map.items()
            }
            classes = [
                class_label for (_, class_label) in sorted(inv_map.items())
            ]

            if prediction_type == constants.MULTICLASS:
                probas_raw = model.predict(inputs_dict)
                preds = np.argmax(probas_raw, axis=1)

            if prediction_type == constants.BINARY_CLASSIFICATION:
                if modeling_params["keras"]["oneDimensionalOutput"]:
                    # The model outputs a single column used as the class-1
                    # probability: rebuild the two-column matrix
                    probas_one = np.squeeze(model.predict(inputs_dict), axis=1)
                    probas_raw = np.zeros((probas_one.shape[0], 2))
                    probas_raw[:, 1] = probas_one
                    probas_raw[:, 0] = 1 - probas_one
                else:
                    probas_raw = model.predict(inputs_dict)
                    probas_one = probas_raw[:, 1]

                threshold = recipe_desc["forcedClassifierThreshold"]
                # Builtin int instead of the np.int alias, which was removed
                # in NumPy 1.24
                preds = (probas_one > threshold).astype(int)

            (nb_rows, nb_present_classes) = probas_raw.shape
            logging.info("Probas raw shape %s/%s target_map=%s", nb_rows,
                         nb_present_classes, len(target_map))

            # Translate predicted class ids back to the original labels
            preds_remapped = np.zeros(preds.shape, dtype="object")
            for (mapped_value, original_value) in inv_map.items():
                idx = (preds == mapped_value)
                preds_remapped[idx] = original_value
            pred_df = pd.DataFrame({"prediction": preds_remapped})
            pred_df.index = train_features.index

            proba_cols = ["proba_{}".format(c) for c in classes]
            # For Binary Classification: Must compute probas if conditional there are outputs that use them
            # Will be deleted afterwards (if outputProbabilities if False)
            # in binary_classif_scoring_add_percentile_and_cond_outputs
            probas_in_cond_outputs = (cond_outputs and len(
                [co for co in cond_outputs if co["input"] in proba_cols]) > 0)
            use_probas = recipe_desc[
                "outputProbabilities"] or probas_in_cond_outputs
            if use_probas:
                proba_df = pd.DataFrame(probas_raw, columns=proba_cols)
                proba_df.index = train_features.index
                pred_df = pd.concat([proba_df, pred_df], axis=1)

            if prediction_type == constants.BINARY_CLASSIFICATION:
                pred_df = binary_classif_scoring_add_percentile_and_cond_outputs(
                    pred_df, recipe_desc, model_folder, cond_outputs,
                    target_map)

        elif prediction_type == constants.REGRESSION:
            preds = model.predict(inputs_dict)
            pred_df = pd.DataFrame({"prediction": np.squeeze(preds, axis=1)})
            pred_df.index = train_features.index

        else:
            # Without this branch pred_df would be unbound further down
            raise ValueError("bad prediction type %s" % prediction_type)

        if should_add_evaluation_columns:
            if not output_y:
                raise ValueError(
                    "Cannot add evaluation columns if not outputing Y")
            else:
                target_mapping = {}
                if core_params["prediction_type"] in [
                        constants.BINARY_CLASSIFICATION, constants.MULTICLASS
                ]:
                    target_mapping = {
                        label: int(class_id)
                        for label, class_id in
                        preproc_handler.target_map.items()
                    }
                # NOTE(review): the return value is ignored, so this relies
                # on add_evaluation_columns modifying pred_df in place.
                # Confirm against its implementation.
                add_evaluation_columns(prediction_type, pred_df,
                                       transformed["target"], target_mapping)

        logging.info("Done predicting it")
        # Input columns that share a name with a prediction column are
        # dropped so that the prediction column wins in the output
        if recipe_desc.get("filterInputColumns", False):
            clean_kept_columns = [
                c for c in recipe_desc["keptInputColumns"]
                if c not in pred_df.columns
            ]
        else:
            clean_kept_columns = [
                c for c in input_df_orig.columns if c not in pred_df.columns
            ]

        res = {
            "scored":
            pd.concat([input_df_orig[clean_kept_columns], pred_df], axis=1)
        }

        if output_y:
            res["y"] = transformed["target"]

        if output_input_df:
            res["input_df"] = input_df_orig

        yield res