Beispiel #1
0
def create():
    """Build a new tree from the JSON request body and register it.

    The request body is parsed as JSON. Its "name" entry selects the Dataiku
    dataset to read; the optional "sample_method" (default "head") and
    "sample_size" entries are passed to ``get_dataframe`` as ``sampling`` and
    ``limit``. The whole payload is also forwarded to ``Tree`` as keyword
    arguments, and the resulting tree is stored through
    ``factory.set_tree`` under ``folder_name``.

    Returns:
        The jsonified tree on success, or a ``(traceback text, 500)`` pair
        when any step raises.
    """
    try:
        data = json.loads(request.data)
        df = dataiku.Dataset(data["name"]).get_dataframe(
            sampling=data.get("sample_method", "head"),
            limit=data.get("sample_size"))
        tree = Tree(df, **data)
        factory.set_tree(folder_name, tree)
        return jsonify(tree.jsonify())
    except Exception:
        # Catch Exception instead of using a bare except so that SystemExit
        # and KeyboardInterrupt are not swallowed and turned into a 500.
        # Format the traceback once so the log entry and the response body
        # are guaranteed to carry the same text.
        # NOTE(review): the full traceback is sent back to the caller; confirm
        # that exposing it is intended.
        error_trace = traceback.format_exc()
        logger.error(error_trace)
        return error_trace, 500
def test_score():
    """Check that score_chunk yields the three expected chunks.

    A tree is rebuilt from ``tree_dict`` on top of an empty dataframe that
    only has a "target" column, the fixture ``df`` is scored with it, and each
    returned chunk is compared element-wise with its expected counterpart.
    """
    empty_frame = pd.DataFrame(columns=["target"])
    scoring_tree = Tree(empty_frame, **tree_dict)
    first, second, third = score_chunk(scoring_tree, df, True)

    # Re-index the first expected chunk before comparing it.
    expected_chunk_1.index = [0, 3, 4]

    comparisons = (
        (first, expected_chunk_1),
        (second, expected_chunk_2),
        (third, expected_chunk_3),
    )
    for actual, expected in comparisons:
        assert actual.combine(expected, check_equal).all(axis=None)
Beispiel #3
0
def load():
    """Reload a saved tree from the folder and register it.

    The request body is parsed as JSON. Its "filename" entry selects the JSON
    file read from ``folder``; the optional "sample_method" (default "head")
    and "sample_size" entries control how the dataset named in the saved tree
    is sampled. The saved description is updated with the sampling settings
    actually used before the ``Tree`` is rebuilt and stored through
    ``factory.set_tree`` under ``folder_name``.

    Returns:
        The jsonified tree on success, or a ``(traceback text, 500)`` pair
        when any step raises.
    """
    try:
        data = json.loads(request.data)
        jsonified_tree = folder.read_json(data["filename"])

        # Sampling counts as new when it is random, or when the requested
        # method or size differs from what the saved tree recorded.
        # NOTE(review): the comparison uses the raw requested method (None
        # when omitted) while the dataset below is read with the "head"
        # default - confirm this asymmetry is intended.
        new_sampling = data.get("sample_method") == "random" \
            or data.get("sample_method") != jsonified_tree.get("sample_method") \
            or data.get("sample_size") != jsonified_tree.get("sample_size")

        sample_method = data.get("sample_method", "head")
        sample_size = data.get("sample_size")
        df = dataiku.Dataset(jsonified_tree["name"]).get_dataframe(
            sampling=sample_method,
            limit=sample_size)

        # Record the sampling settings that were effectively applied.
        jsonified_tree["sample_method"] = sample_method
        jsonified_tree["sample_size"] = sample_size
        tree = Tree(df, new_sampling=new_sampling, **jsonified_tree)
        factory.set_tree(folder_name, tree)
        return jsonify(tree.jsonify())
    except Exception:
        # Catch Exception instead of using a bare except so that SystemExit
        # and KeyboardInterrupt are not swallowed and turned into a 500.
        # Format the traceback once so the log entry and the response body
        # are guaranteed to carry the same text.
        # NOTE(review): the full traceback is sent back to the caller; confirm
        # that exposing it is intended.
        error_trace = traceback.format_exc()
        logger.error(error_trace)
        return error_trace, 500
Beispiel #4
0
# Resolve the recipe's inputs and outputs: the first name registered for each
# role is wrapped in the matching dataiku handle.
input_dataset = dataiku.Dataset(get_input_names_for_role("input_dataset")[0])
scored_dataset = dataiku.Dataset(
    get_output_names_for_role("scored_dataset")[0])
metrics_dataset = dataiku.Dataset(
    get_output_names_for_role("metrics_dataset")[0])
folder = dataiku.Folder(get_input_names_for_role("folder")[0])
chunk_size_param = get_recipe_config()["chunk_size"]

# Read the saved tree description named by the "tree_file" recipe setting.
# A ValueError from read_json is re-raised as a generic Exception naming the
# file (presumably read_json raises ValueError when the file is missing or
# not valid JSON - TODO confirm).
try:
    tree = folder.read_json(get_recipe_config()["tree_file"])
except ValueError:
    raise Exception("No tree file named " + get_recipe_config()["tree_file"])

# Attach the input dataframe to the description, then rebuild the Tree from
# it; from here on `tree` is a Tree instance rather than the loaded JSON.
tree["df"] = input_dataset.get_dataframe()
tree = Tree(**tree)

scored_df = score(tree, input_dataset, chunk_size_param, True)

# Map each target label (through safe_str) to its position in
# tree.target_values.
target_mapping = {
    safe_str(label): index
    for index, label in enumerate(tree.target_values)
}
# Keep only rows that received a prediction, then encode the actual and
# predicted labels as their integer positions from target_mapping.
scored_df_nona = scored_df.dropna(subset=["prediction"])
y_actual, y_pred = scored_df_nona[tree.target], scored_df_nona.prediction
y_actual = y_actual.map(lambda t: int(target_mapping[safe_str(t)]))
y_pred = y_pred.map(lambda t: int(target_mapping[safe_str(t)]))

if len(tree.target_values) > 2:
    compute_metrics = compute_multiclass_metrics
    metrics = [
Beispiel #5
0
def test_on_categorical_splits():
    """Run the add, update and delete categorical-node checks on one tree.

    The same Tree instance is handed to each helper in turn, so the helpers
    run in a fixed order against shared state.
    """
    shared_tree = Tree(df, None, "target")
    ordered_checks = (
        _test_add_categorical_nodes,
        _test_update_categorical_nodes,
        _test_delete_categorical_node,
    )
    for run_check in ordered_checks:
        run_check(shared_tree)
Beispiel #6
0
def test_get_stats():
    """Check the statistics Tree.get_stats reports for node 0.

    Covers the numerical column "num_1" (mean, max, min and every bin), the
    categorical column "cat_1" (bins and the absence of
    "same_target_distrib"), and finally "cat_2" on a tree built from the
    first four rows, where "same_target_distrib" must be truthy.
    """
    full_tree = Tree(df, None, "target")

    # --- numerical column -------------------------------------------------
    numeric_stats = full_tree.get_stats(0, "num_1")
    assert numeric_stats["mean"] == 6
    assert numeric_stats["max"] == 11
    assert numeric_stats["min"] == 1

    numeric_bins = numeric_stats["bins"]

    # The last bin is removed from the list before the others are checked.
    trailing_bin = numeric_bins.pop()
    expected_trailing_bin = {
        "count": 2,
        "target_distrib": {"B": 2},
        "mid": (10 + 11.01) / 2.0,
        "value": "[10.0, 11.01)",
    }
    assert trailing_bin == expected_trailing_bin

    expected_sixth_bin = {
        "count": 2,
        "target_distrib": {"A": 1, "B": 1},
        "mid": (6 + 7) / 2.0,
        "value": "[6.0, 7.0)",
    }
    assert numeric_bins[5] == expected_sixth_bin

    # Every remaining bin holds exactly one row and spans [k, k + 1).
    for position, bucket in enumerate(numeric_bins):
        if position == 5:
            continue
        lower = position + 1
        upper = position + 2
        sole_target = df[df.num_1 == lower].target.values[0]
        assert bucket == {
            "count": 1,
            "target_distrib": {sole_target: 1},
            "mid": (lower + upper) / 2.0,
            "value": "[{0}, {1})".format(float(lower), float(upper)),
        }

    # --- categorical column -----------------------------------------------
    categorical_stats = full_tree.get_stats(0, "cat_1")
    expected_categorical_bins = [
        {"value": "z", "count": 5, "target_distrib": {"A": 1, "B": 3, "C": 1}},
        {"value": "x", "count": 4, "target_distrib": {"A": 2, "B": 2}},
        {"value": "y", "count": 3, "target_distrib": {"A": 1, "B": 1, "C": 1}},
    ]
    assert categorical_stats["bins"] == expected_categorical_bins
    assert categorical_stats.get("same_target_distrib") is None

    # --- tree restricted to the first four rows ---------------------------
    small_tree = Tree(df.head(4), None, "target")
    small_stats = small_tree.get_stats(0, "cat_2")
    assert small_stats["same_target_distrib"]