def add_scoring_columns(tree, df, output_probabilities, is_evaluation=False, check_prediction=False):
    """Write the tree's scoring results into ``df`` in place, leaf by leaf.

    For each leaf that has a prediction, the rows selected by
    ``tree.get_filtered_df`` receive a "prediction" value, optionally one
    "proba_<class>" column per target class and a "prediction_correct"
    flag, plus the leaf's "label". Leaves without a prediction only
    contribute their label, and only when that label is set.

    Args:
        tree: object exposing ``leaves``, ``get_node``, ``get_filtered_df``,
            ``target`` and ``target_values``.
        df: dataframe to annotate; it is modified in place.
        output_probabilities: when true, write the "proba_*" columns.
        is_evaluation: when true, prediction-related columns are only set on
            rows whose target value belongs to ``tree.target_values``; the
            label is still set on every row of the leaf.
        check_prediction: when true, write "prediction_correct".
    """
    for leaf_id in tree.leaves:
        leaf = tree.get_node(leaf_id)

        if leaf.prediction is None:
            # Nothing to score for this leaf: only propagate its label, if any.
            if leaf.label is not None:
                leaf_rows = tree.get_filtered_df(leaf, df)
                df.loc[leaf_rows.index, "label"] = leaf.label
            continue

        leaf_rows = tree.get_filtered_df(leaf, df)
        # Remember every row of the leaf before the evaluation filter narrows
        # the selection: the label applies to all of them.
        all_leaf_index = leaf_rows.index
        if is_evaluation:
            has_known_target = leaf_rows[tree.target].isin(tree.target_values)
            leaf_rows = leaf_rows[has_known_target]
        scored_index = leaf_rows.index

        if output_probabilities:
            classes_without_proba = set(tree.target_values)
            for class_name, class_proba in leaf.probabilities:
                df.loc[scored_index, "proba_"+safe_str(class_name)] = class_proba
                classes_without_proba.remove(class_name)
            # Classes the leaf has no probability for are scored 0.
            for class_name in classes_without_proba:
                df.loc[scored_index, "proba_"+safe_str(class_name)] = 0

        df.loc[scored_index, "prediction"] = leaf.prediction
        if check_prediction:
            is_correct = leaf_rows[tree.target] == leaf.prediction
            df.loc[scored_index, "prediction_correct"] = is_correct
        df.loc[all_leaf_index, "label"] = leaf.label
Beispiel #2
0
    def parse_nodes(self, nodes):
        """Rebuild ``self.nodes`` and ``self.leaves`` from serialized nodes.

        The root is read from ``nodes["0"]``; the remaining nodes are visited
        breadth-first by following each node's ``children_ids``. A node whose
        dictionary has a non-None "values" entry becomes a ``CategoricalNode``,
        any other node a ``NumericalNode``. Nodes without children are
        recorded in ``self.leaves``.

        Args:
            nodes: mapping from node id (as a string) to that node's
                dictionary. It is left unmodified.
        """
        self.nodes, ids = {}, deque()
        root_dict = nodes["0"]
        node = Node(0, -1, set(root_dict["treated_as_numerical"]))
        node.rebuild(root_dict["children_ids"], root_dict["prediction"],
                     root_dict["samples"], root_dict["probabilities"],
                     root_dict["label"])
        self.nodes[0] = node

        ids += node.children_ids
        if not ids:
            self.leaves.add(node.id)
        while ids:
            # Work on a shallow copy: the constructor arguments are popped out
            # so that only the rebuild() keywords remain, and popping from the
            # caller's dictionary would make `nodes` unusable for a second
            # parse.
            dict_node = dict(nodes[safe_str(ids.popleft())])
            if dict_node.get("values") is not None:
                node = CategoricalNode(
                    dict_node.pop("id"),
                    dict_node.pop("parent_id"),
                    set(dict_node.pop("treated_as_numerical")),
                    dict_node.pop("feature"),
                    dict_node.pop("values"),
                    others=dict_node.pop("others"))
            else:
                node = NumericalNode(
                    dict_node.pop("id"),
                    dict_node.pop("parent_id"),
                    set(dict_node.pop("treated_as_numerical")),
                    dict_node.pop("feature"),
                    beginning=dict_node.pop("beginning", None),
                    end=dict_node.pop("end", None))
            # Whatever keys are left are forwarded as rebuild() keywords.
            node.rebuild(**dict_node)
            if not node.children_ids:
                self.leaves.add(node.id)
            self.nodes[node.id] = node
            ids += node.children_ids
def get_scored_df_schema(tree, schema, columns, output_probabilities, is_evaluation=False, check_prediction=False):
    """Return the output schema of a scored dataframe.

    The input schema is first validated with ``check_input_schema``. When
    ``columns`` is given, the schema is passed through
    ``update_input_schema`` and every scoring column name is also appended to
    ``columns``. The scoring columns are, in order: one "proba_<class>"
    double per target value (if ``output_probabilities``), "prediction",
    "prediction_correct" (if ``check_prediction``) and "label".

    Args:
        tree: object exposing ``target_values``; also handed to
            ``check_input_schema``.
        schema: list of ``{"name": ..., "type": ...}`` column descriptions.
        columns: list of column names to extend in place, or None.
        output_probabilities: whether to add the probability columns.
        is_evaluation: forwarded to ``check_input_schema``.
        check_prediction: whether to add "prediction_correct".

    Returns:
        The schema list with the scoring columns appended.
    """
    input_column_names = {column["name"] for column in schema}
    check_input_schema(tree, input_column_names, is_evaluation)

    track_columns = columns is not None
    if track_columns:
        schema = update_input_schema(schema, columns)

    # Collect (type, name) pairs first, then append them in one pass.
    scoring_columns = []
    if output_probabilities:
        for value in tree.target_values:
            scoring_columns.append(('double', "proba_" + safe_str(value)))
    scoring_columns.append(('string', 'prediction'))
    if check_prediction:
        scoring_columns.append(('boolean', 'prediction_correct'))
    scoring_columns.append(('string', 'label'))

    for column_type, column_name in scoring_columns:
        schema.append({'type': column_type, 'name': column_name})
        if track_columns:
            columns.append(column_name)
    return schema
def score_chunk(tree, df, check_prediction):
    """Score one dataframe chunk and return one scored frame per leaf.

    For every leaf of ``tree``, the rows selected by
    ``tree.get_filtered_df`` are copied and given a "proba_<class>" column
    for each entry of ``leaf.probabilities``, a "prediction" column, an
    optional "prediction_correct" column and a "label" column.

    Args:
        tree: object exposing ``leaves``, ``get_node``, ``get_filtered_df``
            and ``target``.
        df: chunk to score; it is left unmodified.
        check_prediction: when true, add "prediction_correct" for leaves
            that have a prediction.

    Returns:
        A list with one dataframe per leaf, in ``tree.leaves`` order.
    """
    filtered_dfs = []
    for leaf_id in tree.leaves:
        leaf = tree.get_node(leaf_id)
        # Copy before adding columns: the filtered frame may be a slice of
        # `df` (or `df` itself), and writing onto it would either trigger
        # pandas' chained-assignment warning or leak the scoring columns
        # into the caller's chunk.
        filtered_df = tree.get_filtered_df(leaf, df).copy()
        for class_name, class_proba in leaf.probabilities:
            filtered_df["proba_" + safe_str(class_name)] = class_proba
        filtered_df["prediction"] = leaf.prediction
        if check_prediction and leaf.prediction is not None:
            filtered_df["prediction_correct"] = filtered_df[
                tree.target] == leaf.prediction
        filtered_df["label"] = leaf.label
        filtered_dfs.append(filtered_df)
    return filtered_dfs
    def get_stats_numerical_node(self, column, target_column, mean):
        """Compute histogram statistics of a numerical column for one node.

        Args:
            column: the feature values of the rows in the node.
            target_column: the target values of the same rows.
            mean: value substituted for missing entries of ``column`` before
                binning.

        Returns:
            ``{"no_values": True}`` when ``column`` is empty. Otherwise a
            dict with the column's "mean", "max" and "min" and a "bins" list;
            each bin holds its interval as a string ("value"), the interval
            midpoint ("mid"), the number of target values in it ("count") and
            their distribution ("target_distrib", empty when count is 0).
        """
        if column.empty:
            return {"no_values": True}

        stats = {"bins": [], "mean": column.mean(), "max": column.max(), "min": column.min()}
        # nunique() ignores NaN, so a column holding only missing values would
        # ask pd.cut for 0 bins, which it rejects with a ValueError. Always
        # request at least one bin.
        n_bins = max(1, min(10, column.nunique()))
        bins = pd.cut(column.fillna(mean), bins=n_bins, include_lowest=True, right=False)
        target_grouped = target_column.groupby(bins)
        target_distrib = target_grouped.apply(lambda x: x.value_counts())
        col_distrib = target_grouped.count()
        for interval, count in col_distrib.items():
            stats["bins"].append({"value": safe_str(interval),
                                    "target_distrib": target_distrib[interval].to_dict() if count > 0 else {},
                                    "mid": interval.mid,
                                    "count": count})
        return stats
def score(tree, input_dataset, chunk_size_param, check_prediction):
    """Score a whole dataset chunk by chunk and return one dataframe.

    The dataset is read with ``iter_dataframes``; ``check`` is run on the
    first chunk only, and every chunk is scored with ``score_chunk``. The
    per-leaf results are concatenated and sorted by index, and missing
    probabilities are replaced by 0.

    Args:
        tree: object exposing ``target_values``; also handed to ``check``
            and ``score_chunk``.
        input_dataset: object exposing ``iter_dataframes(chunksize=...)``.
        chunk_size_param: chunk size forwarded to ``iter_dataframes``.
        check_prediction: forwarded to ``check`` and ``score_chunk``.

    Returns:
        The scored dataframe, with one "proba_<class>" column per value of
        ``tree.target_values``.

    Raises:
        ValueError: from ``pd.concat`` when no scored frame was produced
            (for instance when the dataset yields no chunk).
    """
    dfs = []
    first_chunk = True
    for df in input_dataset.iter_dataframes(chunksize=chunk_size_param):
        if first_chunk:
            check(df, tree, check_prediction)
            first_chunk = False
        dfs += score_chunk(tree, df, check_prediction)
    full_df = pd.concat(dfs).sort_index()
    proba_columns = [
        "proba_" + safe_str(target_value)
        for target_value in tree.target_values
    ]
    # A target value that no leaf has a probability for never got its column;
    # selecting it below would raise a KeyError, so create it first.
    for proba_column in proba_columns:
        if proba_column not in full_df.columns:
            full_df[proba_column] = 0.0
    full_df[proba_columns] = full_df[proba_columns].fillna(0)
    return full_df
def write_with_schema(tree, input_dataset, scored_dataset, scored_df,
                      output_probabilities, check_prediction):
    """Write the scored dataframe to ``scored_dataset`` with its schema.

    The output schema is the input dataset's schema followed by the scoring
    columns: one "proba_<class>" double per target value (if
    ``output_probabilities``), "prediction", "prediction_correct" (if
    ``check_prediction``) and "label".

    Args:
        tree: object exposing ``target_values``; only read when
            ``output_probabilities`` is true.
        input_dataset: object exposing ``read_schema()``.
        scored_dataset: object exposing ``write_schema()`` and
            ``get_writer()``.
        scored_df: dataframe handed to the writer.
        output_probabilities: whether the probability columns are declared.
        check_prediction: whether "prediction_correct" is declared.
    """
    scoring_columns = []
    if output_probabilities:
        scoring_columns += [
            ('double', "proba_" + safe_str(value))
            for value in tree.target_values
        ]
    scoring_columns.append(('string', 'prediction'))
    if check_prediction:
        scoring_columns.append(('boolean', 'prediction_correct'))
    scoring_columns.append(('string', 'label'))

    schema = input_dataset.read_schema()
    for column_type, column_name in scoring_columns:
        schema.append({'type': column_type, 'name': column_name})
    scored_dataset.write_schema(schema)

    with scored_dataset.get_writer() as writer:
        writer.write_dataframe(scored_df)
 def get_stats_categorical_node(self, column, target_column, unfiltered_col):
     """Compute per-value statistics of a categorical column for one node.

     Args:
         column: the feature values of the rows in the node.
         target_column: the target values of the same rows.
         unfiltered_col: the feature values used to list every known value;
             values absent from the node are reported with a count of 0.

     Returns:
         A dict with a "bins" list (one entry per value, most frequent
         first, followed by the zero-count values), plus either
         "no_values" when ``column`` is empty or "same_target_distrib" when
         every value of the node has the same target distribution.
     """
     bins = []
     stats = {"bins": bins}
     absent_values = set(unfiltered_col.unique())

     if column.empty:
         stats["no_values"] = True
     else:
         # Missing feature values are grouped under the "No values" key.
         group_keys = column.fillna("No values").apply(safe_str)
         grouped_target = target_column.groupby(group_keys)
         distrib_by_value = grouped_target.value_counts(dropna=False)
         count_by_value = grouped_target.count().sort_values(ascending=False)
         absent_values -= set(count_by_value.index)

         all_identical = True
         for value in count_by_value.index:
             value_distrib = distrib_by_value[value].to_dict()
             bins.append({"value": value,
                          "target_distrib": value_distrib,
                          "count": count_by_value[value]})
             # Compare each new bin against the first one until a mismatch
             # is found.
             if all_identical and bins[0]["target_distrib"] != value_distrib:
                 all_identical = False
         if all_identical:
             stats["same_target_distrib"] = True

     for value in absent_values:
         bins.append({"value": safe_str(value), "count": 0})
     return stats
    def parse_nodes(self, nodes, rebuild_nodes=False, numerical_features=None):
        """Recreate the tree's nodes from their serialized dictionaries.

        The root is read from ``nodes["0"]`` and the other nodes are visited
        breadth-first through each dictionary's "children_ids". Every node is
        registered with ``self.add_node``.

        Args:
            nodes: mapping from node id (as a string) to that node's
                dictionary.
            rebuild_nodes: when true, each non-root node is also given its
                saved prediction, samples and probabilities via ``rebuild``.
            numerical_features: optional collection; when given, each node's
                "treated_as_numerical" set is restricted to its members.
        """
        pending_ids = deque()
        self.nodes = {}

        raw_root = nodes["0"]
        root_numerical = set(raw_root["treated_as_numerical"])
        if numerical_features is not None:
            root_numerical.intersection_update(numerical_features)
        root = Node(0, -1, root_numerical)
        root.label = raw_root["label"]
        self.add_node(root)

        pending_ids.extend(raw_root["children_ids"])

        while pending_ids:
            raw_node = nodes[safe_str(pending_ids.popleft())]
            numerical = set(raw_node["treated_as_numerical"])
            split_feature = raw_node["feature"]
            if numerical_features is not None:
                numerical.intersection_update(numerical_features)

            # A node with a "values" entry splits on categories; any other
            # node splits on a numerical range.
            if raw_node.get("values") is None:
                node = NumericalNode(raw_node["id"],
                                     raw_node["parent_id"],
                                     numerical,
                                     split_feature,
                                     beginning=raw_node.get("beginning"),
                                     end=raw_node.get("end"))
            else:
                node = CategoricalNode(raw_node["id"],
                                       raw_node["parent_id"],
                                       numerical,
                                       split_feature,
                                       raw_node["values"],
                                       others=raw_node["others"])
            node.label = raw_node["label"]
            self.add_node(node)
            if rebuild_nodes:
                node.rebuild(raw_node["prediction"],
                             raw_node["samples"],
                             raw_node["probabilities"])
            pending_ids.extend(raw_node["children_ids"])
Beispiel #10
0
    get_output_names_for_role("metrics_dataset")[0])
# NOTE(review): this fragment is cut off at both ends; `input_dataset` and
# the statement ending just above are defined outside the visible lines.

# Folder taken from the first name of the recipe's "folder" input role.
folder = dataiku.Folder(get_input_names_for_role("folder")[0])
chunk_size_param = get_recipe_config()["chunk_size"]

# A ValueError raised while reading the tree file is reported as a missing
# tree file.
try:
    tree = folder.read_json(get_recipe_config()["tree_file"])
except ValueError:
    raise Exception("No tree file named " + get_recipe_config()["tree_file"])

# The loaded JSON object is expanded into Tree's keyword arguments, with the
# whole input dataframe added under the "df" key.
tree["df"] = input_dataset.get_dataframe()
tree = Tree(**tree)

# The last positional argument is True; presumably score()'s
# check_prediction flag, to be confirmed against the score() in use here.
scored_df = score(tree, input_dataset, chunk_size_param, True)

# Map each target value (as a string) to its position in tree.target_values,
# so that actual and predicted classes can be turned into integer codes.
target_mapping = {
    safe_str(label): index
    for index, label in enumerate(tree.target_values)
}
# Rows without a prediction are left out of the metric inputs.
scored_df_nona = scored_df.dropna(subset=["prediction"])
y_actual, y_pred = scored_df_nona[tree.target], scored_df_nona.prediction
y_actual = y_actual.map(lambda t: int(target_mapping[safe_str(t)]))
y_pred = y_pred.map(lambda t: int(target_mapping[safe_str(t)]))

# More than two target values selects the multiclass metrics; otherwise the
# binary classification metrics are used.
if len(tree.target_values) > 2:
    compute_metrics = compute_multiclass_metrics
    metrics = [
        "precision", "recall", "accuracy", "mrocAUC", "logLoss", "hammingLoss",
        "mcalibrationLoss"
    ]
else:
    compute_metrics = compute_binary_classification_metrics
Beispiel #11
0
    def get_stats(self, i, col):
        """Compute the statistics of feature ``col`` over the rows of node ``i``.

        The rows are selected with ``self.get_filtered_df`` on ``self.df``.
        When ``col`` is in the node's ``treated_as_numerical`` set the result
        is a histogram; otherwise it is a per-value breakdown.

        Args:
            i: id of the node, resolved with ``self.get_node``.
            col: name of the feature column.

        Returns:
            For a numerical feature: ``{"no_values": True}`` when the node
            has no rows, else "mean", "max", "min" and a "bins" list whose
            entries hold "value" (interval as a string), "target_distrib",
            "mid" and "count".
            For any other feature: a "bins" list with one entry per value
            present in the node (most frequent first) followed by zero-count
            entries for values only present elsewhere in ``self.df``, plus
            "no_values" when the node has no rows or "same_target_distrib"
            when all values of the node share one target distribution.
        """
        node = self.get_node(i)
        filtered_df = self.get_filtered_df(node, self.df)
        column = filtered_df[col]
        target_column = filtered_df[self.target]
        stats = {}

        if col in node.treated_as_numerical:
            if column.empty:
                stats["no_values"] = True
                return stats

            stats.update({
                "mean": column.mean(),
                "max": column.max(),
                "min": column.min()
            })
            # nunique() ignores NaN, so a column holding only missing values
            # would ask pd.cut for 0 bins, which it rejects with a
            # ValueError. Always request at least one bin.
            n_bins = max(1, min(10, column.nunique()))
            target_grouped = target_column.groupby(
                pd.cut(column.fillna(self.features[col]["mean"]),
                       bins=n_bins,
                       include_lowest=True,
                       right=False))
            target_distrib = target_grouped.apply(
                lambda x: x.value_counts())
            col_distrib = target_grouped.count()
            stats["bins"] = []
            for interval, count in col_distrib.items():
                stats["bins"].append({
                    "value": safe_str(interval),
                    "target_distrib": target_distrib[interval].to_dict()
                    if count > 0 else {},
                    "mid": interval.mid,
                    "count": count
                })
            return stats

        stats["bins"] = []
        # Every value seen in the whole dataframe; those present in the node
        # are removed below, the rest are reported with a count of 0.
        empty_values = set(self.df[col].dropna().apply(safe_str).unique())
        if not column.empty:
            # Missing feature values are grouped under the "No values" key.
            target_grouped = target_column.groupby(
                column.fillna("No values").apply(safe_str))
            target_distrib = target_grouped.value_counts(dropna=False)
            col_distrib = target_grouped.count().sort_values(ascending=False)
            empty_values -= set(col_distrib.index)
            stats["same_target_distrib"] = True
            for value in col_distrib.index:
                stats["bins"].append({
                    "value": value,
                    "target_distrib": target_distrib[value].to_dict(),
                    "count": col_distrib[value]
                })
                # Drop the flag as soon as one bin differs from the first.
                if (stats.get("same_target_distrib")
                        and stats["bins"][0]["target_distrib"]
                        != stats["bins"][-1]["target_distrib"]):
                    del stats["same_target_distrib"]
        else:
            stats["no_values"] = True
        for value in empty_values:
            stats["bins"].append({"value": safe_str(value), "count": 0})
        return stats