Example #1
    def output_generator():
        logging.info("Start output generator ...")

        (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
            preparation_output_schema["columns"],
            parse_dates=True,
            infer_with_pandas=False)
        logging.info("Reading with INITIAL dtypes: %s" % dtypes)
        dtypes = utils.ml_dtypes_from_dss_schema(
            preparation_output_schema, preprocessing_params["per_feature"])
        logging.info("Reading with dtypes: %s" % dtypes)

        for input_df in input_dataset.iter_dataframes_forced_types(
                names, dtypes, parse_date_columns, chunksize=100000):
            input_df.index = range(input_df.shape[0])
            input_df_orig = input_df.copy()
            if recipe_desc.get("filterInputColumns", False):
                input_df_orig = input_df_orig[recipe_desc["keptInputColumns"]]

            logging.info("Got a dataframe : %s" % str(input_df.shape))
            normalize_dataframe(input_df, preprocessing_params['per_feature'])

            for col in input_df:
                logging.info("NORMALIZED: %s -> %s" %
                             (col, input_df[col].dtype))

            logging.info("Processing it")
            transformed = pipeline.process(input_df)
            logging.info("Applying it")

            (labels_arr,
             additional_columns) = clustering_predict(modeling_params, clf,
                                                      transformed)
            cluster_labels = pd.Series(labels_arr,
                                       name="cluster_labels").map(naming)
            cluster_labels.index = transformed["TRAIN"].index

            final_df = pd.concat([
                input_df_orig.join(cluster_labels, how='left'),
                additional_columns
            ],
                                 axis=1)

            if preprocessing_params["outliers"]["method"] == "CLUSTER":
                outliers_cluster_name = cluster_name_map.get(
                    constants.CLUSTER_OUTLIERS, constants.CLUSTER_OUTLIERS)
                final_df['cluster_labels'].fillna(outliers_cluster_name,
                                                  inplace=True)

            logging.info("Done predicting it")

            yield final_df
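
The generator above yields one scored DataFrame per 100,000-row chunk. The snippet below is a minimal consumption sketch, not part of the original recipe: it assumes output_generator and a hypothetical output_dataset_smartname are in scope, and uses the standard dataiku streaming-writer calls (get_writer / write_dataframe) to push each scored chunk to an output dataset.

import dataiku

# Hedged sketch: stream each scored chunk to the output dataset.
# output_dataset_smartname is a hypothetical variable naming an existing dataset;
# the dataset's schema is assumed to have been set already (e.g. by the backend).
output_dataset = dataiku.Dataset(output_dataset_smartname)
with output_dataset.get_writer() as writer:
    for chunk_df in output_generator():
        writer.write_dataframe(chunk_df)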
Example #2
def df_from_split_desc(split_desc,
                       split,
                       feature_params,
                       prediction_type=None):
    df = df_from_split_desc_no_normalization(split_desc, split, feature_params,
                                             prediction_type)
    return utils.normalize_dataframe(df, feature_params)
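
df_from_split_desc simply chains the raw split loader with utils.normalize_dataframe. A hedged usage sketch follows, mirroring how Example #3 loads its descriptors; the file names, folder path, and argument shapes are assumptions taken from that example.

import json
import os.path as osp

exec_folder = "/path/to/exec_folder"  # hypothetical execution folder
split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
preprocessing_params = json.load(open(osp.join(exec_folder, "rpreprocessing_params.json")))

# Build the normalized dataframe for the "full" split.
full_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"])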
Example #3
def main(exec_folder, output_dataset, keptInputColumns):
    start = unix_time_millis()
    listener = ProgressListener()

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    preprocessing_params = json.load(open(osp.join(exec_folder, "rpreprocessing_params.json")))
    modeling_params = json.load(open(osp.join(exec_folder, "rmodeling_params.json")))

    with listener.push_state(constants.STATE_LOADING_SRC):
        input_df = df_from_split_desc_no_normalization(split_desc, "full", preprocessing_params["per_feature"])
        logging.info("Loaded full df: shape=(%d,%d)" % input_df.shape)
        input_df_orig = input_df.copy()
        input_df = utils.normalize_dataframe(input_df, preprocessing_params["per_feature"])        

    with listener.push_state("Collecting preprocessing data"):
        collector = ClusteringPreprocessingDataCollector(input_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params, exec_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with listener.push_state("Preprocessing data"):
        transformed_train = pipeline.fit_and_process(input_df)

    start_train = unix_time_millis()

    (clf, actual_params, cluster_labels, additional_columns) = clustering_fit(modeling_params, transformed_train)

    # if model has custom labels, use them
    try:
        cluster_names = clf.get_cluster_labels()
    except AttributeError:
        cluster_names = ["cluster_%s" % i for i in range(len(np.unique(cluster_labels)))]
    cl = pd.Series(data=cluster_labels, name="cluster_labels").map(lambda i: cluster_names[i])
    cl.index = transformed_train["TRAIN"].index

    final_df = pd.concat([input_df_orig.join(cl, how='left'), additional_columns], axis=1)

    if keptInputColumns is not None:
        final_df = final_df[keptInputColumns + ['cluster_labels']]

    if preprocessing_params["outliers"]["method"] == "CLUSTER":
        final_df['cluster_labels'].fillna(constants.CLUSTER_OUTLIERS, inplace=True)

    dataiku.Dataset(output_dataset).write_from_dataframe(final_df)

    end = unix_time_millis()

    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
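
A hedged invocation sketch for the entry point above (all values are placeholders): exec_folder is the DSS execution folder holding the *.json descriptors loaded at the top of main, output_dataset names the dataset that receives the clustered rows, and keptInputColumns optionally restricts which input columns are carried over.

# Hypothetical invocation; the path and dataset name are placeholders.
main("/path/to/exec_folder", "clustering_scored_output", keptInputColumns=None)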
Example #4
def _renormalize_dates(df, schema, prep):
    from dataiku.doctor.utils import normalize_dataframe
    df = df.copy(deep=False)
    (names, dtypes,
     parse_dates) = Dataset.get_dataframe_schema_st(schema["columns"],
                                                    infer_with_pandas=False,
                                                    bool_as_str=True)
    # For columns for which preparation output schema says date, parse it,
    # because the Pandas CSV parser does not do it
    if parse_dates is not False:
        for col_idx in parse_dates:
            col = schema["columns"][col_idx]["name"]
            if col in df:
                df[col] = pd.to_datetime(df[col])
    return normalize_dataframe(df, prep["per_feature"])
Example #5
    def output_generator():
        logging.info("Start output generator ...")

        (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
            preparation_output_schema["columns"],
            parse_dates=True,
            infer_with_pandas=False)
        logging.info("Reading with INITIAL dtypes: %s" % dtypes)
        dtypes = utils.ml_dtypes_from_dss_schema(
            preparation_output_schema,
            preprocessing_params["per_feature"],
            prediction_type=core_params["prediction_type"])
        logging.info("Reading with dtypes: %s" % dtypes)

        for i in xrange(0, len(names)):
            logging.info("Column %s = %s (dtype=%s)" %
                         (i, names[i], dtypes.get(names[i], None)))

        for input_df in input_dataset.iter_dataframes_forced_types(
                names,
                dtypes,
                parse_date_columns,
                chunksize=batch_size,
                float_precision="round_trip"):
            input_df.index = range(input_df.shape[0])
            input_df_orig = input_df.copy()
            logging.info("Got a dataframe : %s" % str(input_df.shape))
            normalize_dataframe(input_df, preprocessing_params['per_feature'])

            for col in input_df:
                logging.info("NORMALIZED: %s -> %s" %
                             (col, input_df[col].dtype))

            logging.info("Processing it")
            logging.info("Predicting it")

            if core_params[
                    "prediction_type"] == constants.BINARY_CLASSIFICATION:
                pred_df = binary_classification_predict(
                    clf,
                    pipeline,
                    modeling_params,
                    preprocessing_params,
                    preprocessing_handler.target_map,
                    recipe_desc["forcedClassifierThreshold"],
                    input_df,
                    output_probas=recipe_desc["outputProbabilities"])
                # Probability percentile & Conditional outputs
                pred_df = binary_classif_scoring_add_percentile_and_cond_outputs(
                    pred_df, recipe_desc, model_folder, cond_outputs,
                    preprocessing_handler.target_map)

            elif core_params["prediction_type"] == constants.MULTICLASS:
                pred_df = multiclass_predict(
                    clf,
                    pipeline,
                    modeling_params,
                    preprocessing_params,
                    preprocessing_handler.target_map,
                    input_df,
                    output_probas=recipe_desc["outputProbabilities"])

            elif core_params["prediction_type"] == constants.REGRESSION:
                pred_df = regression_predict(clf, pipeline, modeling_params,
                                             input_df)

            else:
                raise ValueError("bad prediction type %s" %
                                 core_params["prediction_type"])

            logging.info("pred df debug :")
            logging.info(pred_df)

            logging.info("Done predicting it")
            if recipe_desc.get("filterInputColumns", False):
                clean_kept_columns = [
                    c for c in recipe_desc["keptInputColumns"]
                    if c not in pred_df.columns
                ]
            else:
                clean_kept_columns = [
                    c for c in input_df_orig.columns
                    if c not in pred_df.columns
                ]
            yield pd.concat([input_df_orig[clean_kept_columns], pred_df],
                            axis=1)
Example #6
def main(model_folder,
         input_dataset_smartname,
         output_dataset_smartname,
         metrics_dataset_smartname,
         recipe_desc,
         script,
         preparation_output_schema,
         cond_outputs=None):
    # Obtain a streamed result of the preparation
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" %
                 preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"],
                                        preparation_output_schema)

    core_params = dkujson.load_from_filepath(
        osp.join(model_folder, "core_params.json"))
    preprocessing_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rmodeling_params.json"))
    collector_data = dkujson.load_from_filepath(
        osp.join(model_folder, "collector_data.json"))

    preprocessing_handler = PreprocessingHandler.build(core_params,
                                                       preprocessing_params,
                                                       model_folder)
    preprocessing_handler.collector_data = collector_data

    pipeline = preprocessing_handler.build_preprocessing_pipeline(
        with_target=True)

    with open(osp.join(model_folder, "clf.pkl"), "rb") as f:
        clf = pickle.load(f)

    logging.info("Scoring data")

    (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
        preparation_output_schema["columns"],
        parse_dates=True,
        infer_with_pandas=False)
    logging.info("Reading with INITIAL dtypes: %s" % dtypes)
    dtypes = utils.ml_dtypes_from_dss_schema(
        preparation_output_schema,
        preprocessing_params["per_feature"],
        prediction_type=core_params["prediction_type"])
    logging.info("Reading with dtypes: %s" % dtypes)

    for i in xrange(0, len(names)):
        logging.info("Column %s = %s (dtype=%s)" %
                     (i, names[i], dtypes.get(names[i], None)))

    with input_dataset._stream(infer_with_pandas=True,
                               sampling='head',
                               sampling_column=None,
                               limit=None,
                               ratio=None,
                               columns=names) as stream:
        input_df = pd.read_table(stream,
                                 names=names,
                                 dtype=dtypes,
                                 header=None,
                                 sep='\t',
                                 doublequote=True,
                                 quotechar='"',
                                 parse_dates=parse_date_columns,
                                 float_precision="round_trip")

    input_df_orig = input_df.copy()
    logging.info("Got a dataframe : %s" % str(input_df.shape))
    normalize_dataframe(input_df, preprocessing_params['per_feature'])

    for col in input_df:
        logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

    logging.info("Processing it")
    transformed = pipeline.process(input_df)
    logging.info("Predicting it")

    if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
        pred_df = binary_classification_predict(
            clf,
            pipeline,
            modeling_params,
            preprocessing_params,
            preprocessing_handler.target_map,
            recipe_desc["forcedClassifierThreshold"],
            input_df,
            output_probas=recipe_desc["outputProbabilities"],
            # For ensemble model, we need to indicate that we have target, so that a target-aware pipeline is
            # selected. See 0c87605 for more information
            ensemble_has_target=True)

        # Probability percentile & Conditional outputs
        has_cond_output = recipe_desc["outputProbabilities"] and cond_outputs
        has_percentiles = recipe_desc["outputProbaPercentiles"] or (
            has_cond_output and len([
                co for co in cond_outputs if co["input"] == "proba_percentile"
            ]))
        if has_percentiles:
            model_perf = dkujson.load_from_filepath(
                osp.join(model_folder, "perf.json"))
            if model_perf.has_key(
                    "probaPercentiles") and model_perf["probaPercentiles"]:
                percentile = pd.Series(model_perf["probaPercentiles"])
                proba_1 = "proba_" + str(
                    (k for k, v in preprocessing_handler.target_map.items()
                     if v == 1).next())
                pred_df["proba_percentile"] = pred_df[proba_1].apply(
                    lambda p: percentile.where(percentile <= p).count() + 1)
            else:
                raise Exception(
                    "Probability percentiles are missing from model.")
        if has_cond_output:
            for co in cond_outputs:
                inp = pred_df[co["input"]]
                acc = inp.notnull()  # condition accumulator
                for r in co["rules"]:
                    if r["operation"] == 'GT':
                        cond = inp > r["operand"]
                    elif r["operation"] == 'GE':
                        cond = inp >= r["operand"]
                    elif r["operation"] == 'LT':
                        cond = inp < r["operand"]
                    elif r["operation"] == 'LE':
                        cond = inp <= r["operand"]
                    pred_df.loc[acc & cond, co["name"]] = r["output"]
                    acc = acc & (~cond)
                pred_df.loc[acc, co["name"]] = co.get("defaultOutput", "")
        if has_percentiles and not recipe_desc[
                "outputProbaPercentiles"]:  # was only for conditional outputs
            pred_df.drop("proba_percentile", axis=1, inplace=True)

    elif core_params["prediction_type"] == constants.MULTICLASS:
        pred_df = multiclass_predict(
            clf,
            pipeline,
            modeling_params,
            preprocessing_params,
            preprocessing_handler.target_map,
            input_df,
            output_probas=recipe_desc["outputProbabilities"],
            # For ensemble model, we need to indicate that we have target, so that a target-aware pipeline is
            # selected. See 0c87605 for more information
            ensemble_has_target=True)

    elif core_params["prediction_type"] == constants.REGRESSION:
        pred_df = regression_predict(
            clf,
            pipeline,
            modeling_params,
            input_df,
            # For ensemble model, we need to indicate that we have target, so that a target-aware pipeline is
            # selected. See 0c87605 for more information
            ensemble_has_target=True)
    else:
        raise ValueError("bad prediction type %s" %
                         core_params["prediction_type"])

    # add error information to pred_df
    y = transformed["target"]
    target_mapping = {}
    if core_params["prediction_type"] in [
            constants.BINARY_CLASSIFICATION, constants.MULTICLASS
    ]:
        target_mapping = {
            label: int(class_id)
            for label, class_id in preprocessing_handler.target_map.items()
        }

    pred_df = add_evaluation_columns(core_params["prediction_type"], pred_df,
                                     y, target_mapping)

    logging.info("Done predicting it")
    if recipe_desc.get("filterInputColumns", False):
        clean_kept_columns = [
            c for c in recipe_desc["keptInputColumns"]
            if c not in pred_df.columns
        ]
    else:
        clean_kept_columns = [
            c for c in input_df_orig.columns if c not in pred_df.columns
        ]
    output_df = pd.concat([input_df_orig[clean_kept_columns], pred_df], axis=1)

    # write scored data
    output_dataset = dataiku.Dataset(output_dataset_smartname)
    #logging.info("writing scored schema")
    #output_dataset.write_schema_from_dataframe(output_df)  # backend should do this
    logging.info("writing scored data")
    output_dataset.write_from_dataframe(output_df)

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {
        "SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"
    }
    if with_sample_weight:
        sample_weight = transformed["weight"]
    else:
        sample_weight = None

    metrics_df = compute_metrics_df(core_params["prediction_type"],
                                    target_mapping, modeling_params, output_df,
                                    recipe_desc, y, transformed["UNPROCESSED"],
                                    sample_weight)

    # write metrics dataset
    if metrics_dataset_smartname:
        metrics_dataset = dataiku.Dataset(metrics_dataset_smartname)
        #logging.info("writing metrics schema")
        #metrics_dataset.write_schema_from_dataframe(metrics_df)  # backend should maybe do this ?
        logging.info("writing metrics data")
        metrics_dataset.write_from_dataframe(metrics_df)
Example #7
def scored_dataset_generator(model_folder,
                             input_dataset_smartname,
                             recipe_desc,
                             script,
                             preparation_output_schema,
                             cond_outputs,
                             output_y=False,
                             output_input_df=False,
                             should_add_evaluation_columns=False):
    from keras.models import load_model
    from dataiku.doctor.deep_learning import gpu
    from dataiku.doctor.deep_learning.keras_utils import tag_special_features, split_train_per_input

    # Load GPU Options
    if recipe_desc["useGPU"]:
        from dataiku.doctor.deep_learning import gpu
        gpu.load_gpu_options(recipe_desc["gpuList"],
                             allow_growth=recipe_desc["gpuAllowGrowth"],
                             per_process_gpu_memory_fraction=float(
                                 recipe_desc["perGPUMemoryFraction"]))
    else:
        gpu.deactivate_gpu()

    batch_size = recipe_desc.get("batchSize", 100)

    # Obtain a streamed result of the preparation
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" %
                 preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"],
                                        preparation_output_schema)

    core_params = dkujson.load_from_filepath(
        osp.join(model_folder, "core_params.json"))
    preprocessing_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rpreprocessing_params.json"))
    collector_data = dkujson.load_from_filepath(
        osp.join(model_folder, "collector_data.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "actual_params.json"))["resolved"]

    prediction_type = core_params["prediction_type"]

    # Tagging special features to take them into account only in special_preproc_handler/special_pipeline
    per_feature = preprocessing_params["per_feature"]
    tag_special_features(per_feature)

    preproc_handler = PreprocessingHandler.build(core_params,
                                                 preprocessing_params,
                                                 model_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline(
        with_target=output_y)
    target_map = preproc_handler.target_map

    logging.info("Loading model")
    model = load_model(osp.join(model_folder, constants.KERAS_MODEL_FILENAME))

    logging.info("Start output generator")

    (names, dtypes,
     parse_date_columns) = dataiku.Dataset.get_dataframe_schema_st(
         preparation_output_schema["columns"],
         parse_dates=True,
         infer_with_pandas=False)
    logging.info("Reading with INITIAL dtypes: %s" % dtypes)
    dtypes = utils.ml_dtypes_from_dss_schema(
        preparation_output_schema,
        preprocessing_params["per_feature"],
        prediction_type=prediction_type)
    logging.info("Reading with dtypes: %s" % dtypes)

    for i in xrange(0, len(names)):
        logging.info("Column %s = %s (dtype=%s)" %
                     (i, names[i], dtypes.get(names[i], None)))

    for input_df in input_dataset.iter_dataframes_forced_types(
            names, dtypes, parse_date_columns, chunksize=batch_size):

        input_df.index = range(input_df.shape[0])
        input_df_orig = input_df.copy()
        logging.info("Got a dataframe chunk : %s" % str(input_df.shape))
        normalize_dataframe(input_df, preprocessing_params['per_feature'])

        for col in input_df:
            logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

        logging.info("Processing chunk")

        transformed = pipeline.process(input_df)
        features_X_orig = transformed["TRAIN"]
        transformed_X_mf = transformed["TRAIN"]

        inputs_dict = split_train_per_input(
            transformed_X_mf, per_feature, pipeline.generated_features_mapping)

        if prediction_type in [
                constants.MULTICLASS, constants.BINARY_CLASSIFICATION
        ]:

            inv_map = {
                int(class_id): label
                for label, class_id in target_map.items()
            }
            classes = [
                class_label for (_, class_label) in sorted(inv_map.items())
            ]

            if prediction_type == constants.MULTICLASS:
                probas_raw = model.predict(inputs_dict)
                preds = np.argmax(probas_raw, axis=1)

            if prediction_type == constants.BINARY_CLASSIFICATION:
                if modeling_params["keras"]["oneDimensionalOutput"]:
                    probas_one = np.squeeze(model.predict(inputs_dict), axis=1)
                    probas_raw = np.zeros((probas_one.shape[0], 2))
                    probas_raw[:, 1] = probas_one
                    probas_raw[:, 0] = 1 - probas_one
                else:
                    probas_raw = model.predict(inputs_dict)
                    probas_one = probas_raw[:, 1]

                threshold = recipe_desc["forcedClassifierThreshold"]
                preds = (probas_one > threshold).astype(np.int)

            (nb_rows, nb_present_classes) = probas_raw.shape
            logging.info("Probas raw shape %s/%s target_map=%s", nb_rows,
                         nb_present_classes, len(target_map))

            preds_remapped = np.zeros(preds.shape, dtype="object")
            for (mapped_value, original_value) in inv_map.items():
                idx = (preds == mapped_value)
                preds_remapped[idx] = original_value
            pred_df = pd.DataFrame({"prediction": preds_remapped})
            pred_df.index = features_X_orig.index

            proba_cols = ["proba_{}".format(c) for c in classes]
            # For binary classification: probas must be computed if there are conditional outputs that use them.
            # They will be deleted afterwards (if outputProbabilities is False)
            # in binary_classif_scoring_add_percentile_and_cond_outputs
            probas_in_cond_outputs = (cond_outputs and len(
                [co for co in cond_outputs if co["input"] in proba_cols]) > 0)
            use_probas = recipe_desc[
                "outputProbabilities"] or probas_in_cond_outputs
            if use_probas:
                proba_df = pd.DataFrame(
                    probas_raw,
                    columns=["proba_{}".format(c) for c in classes])
                proba_df.index = features_X_orig.index
                pred_df = pd.concat([proba_df, pred_df], axis=1)

            if prediction_type == constants.BINARY_CLASSIFICATION:
                pred_df = binary_classif_scoring_add_percentile_and_cond_outputs(
                    pred_df, recipe_desc, model_folder, cond_outputs,
                    target_map)

        elif prediction_type == constants.REGRESSION:
            preds = model.predict(inputs_dict)
            pred_df = pd.DataFrame({"prediction": np.squeeze(preds, axis=1)})
            pred_df.index = features_X_orig.index

        if should_add_evaluation_columns:
            if not output_y:
                raise ValueError(
                    "Cannot add evaluation columns if not outputing Y")
            else:
                target_mapping = {}
                if core_params["prediction_type"] in [
                        constants.BINARY_CLASSIFICATION, constants.MULTICLASS
                ]:
                    target_mapping = {
                        label: int(class_id)
                        for label, class_id in
                        preproc_handler.target_map.items()
                    }
                add_evaluation_columns(prediction_type, pred_df,
                                       transformed["target"], target_mapping)

        logging.info("Done predicting it")
        if recipe_desc.get("filterInputColumns", False):
            clean_kept_columns = [
                c for c in recipe_desc["keptInputColumns"]
                if c not in pred_df.columns
            ]
        else:
            clean_kept_columns = [
                c for c in input_df_orig.columns if c not in pred_df.columns
            ]

        res = {
            "scored":
            pd.concat([input_df_orig[clean_kept_columns], pred_df], axis=1)
        }

        if output_y:
            res["y"] = transformed["target"]

        if output_input_df:
            res["input_df"] = input_df_orig

        yield res
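
scored_dataset_generator yields one dict per chunk, with the scored frame under "scored" and, depending on the output_y and output_input_df flags, the target under "y" and the original input under "input_df". The snippet below is an assumption-based consumption sketch, not part of the original module; the model folder, dataset name, and the recipe_desc / script / preparation_output_schema objects are placeholders.

import pandas as pd

# Hedged sketch: accumulate every scored chunk into a single DataFrame.
scored_chunks = []
for res in scored_dataset_generator(model_folder="/path/to/model_folder",
                                    input_dataset_smartname="input_dataset",
                                    recipe_desc=recipe_desc,
                                    script=script,
                                    preparation_output_schema=preparation_output_schema,
                                    cond_outputs=None):
    scored_chunks.append(res["scored"])

# Each chunk was re-indexed from 0, so ignore_index avoids duplicate indices.
scored_df = pd.concat(scored_chunks, axis=0, ignore_index=True)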