Example #1
0
def mp_rf_optimizer_func(fn_tuple):
    """Build, evaluate, and persist one random forest (process-pool worker).

    Args:
        fn_tuple: 3-tuple ``(fn, flags, file_suffix)``:
            * ``fn`` — path to the training-table file understood by ``load``.
            * ``flags`` — dict with keys ``n_trees``, ``is_regressor``,
              ``sample_size``, ``n_features`` and ``max_depth``.
            * ``file_suffix`` — extension for the generated source file
              (``"cc"`` selects C++ code generation); falsy means ``"none"``.

    Side effects:
        Writes the generated forest code next to ``fn`` and pickles the
        fitted model to ``*.rb.bin``; prints progress/score diagnostics.

    Returns:
        Path of the pickled random-forest model file.
    """
    fn, flags, file_suffix = fn_tuple

    n_trees = flags["n_trees"]
    is_regressor = flags["is_regressor"]
    sample_size = flags["sample_size"]
    n_features = flags["n_features"]
    max_depth = flags["max_depth"]

    if not file_suffix:
        file_suffix = "none"

    # Derive output paths from the input path: strip the last two extension
    # components and re-suffix.
    path_split = fn.split("/")
    path = "/".join(path_split[:-1]) + "/"
    fn_split = path_split[-1].split(".")
    cv_file = path + ".".join(fn_split[0:-2] + [file_suffix])
    rfb_file = path + ".".join(fn_split[0:-2] + ["rb", "bin"])

    train = load(fn)

    # ``max_features="auto"`` was deprecated in scikit-learn 1.1 and removed
    # in 1.3. Use the explicit equivalents of the old behavior: all features
    # for regression (fraction 1.0), sqrt(n_features) for classification.
    if n_features:
        n_features = float(n_features)
    else:
        n_features = 1.0 if is_regressor else "sqrt"

    if max_depth:
        max_depth = int(max_depth)

    print("... creating random forrest for " + os.path.basename(fn) +
          " with " + str(sample_size) + " samples")

    # Regressor and classifier share every hyper-parameter; only the
    # estimator class differs.
    forest_cls = RandomForestRegressor if is_regressor else RandomForestClassifier
    rf = forest_cls(
        n_estimators=n_trees,
        max_depth=max_depth,
        max_features=n_features,
        bootstrap=True,
        random_state=42,  # fixed seed => reproducible forests
        n_jobs=1)         # parallelism is handled by the enclosing pool

    if sample_size and train.shape[0] >= 10000:
        sample_size = int(sample_size)
        np.random.seed(42)
        # Shuffle all row indices once: the first ``sample_size`` rows become
        # the held-out test set, the remainder the training set.
        idx = np.random.choice(train.shape[0], train.shape[0], replace=False)

        x = train[idx[sample_size:], 0:-1]
        y = train[idx[sample_size:], -1]

        x_test = train[idx[0:sample_size], 0:-1]
        y_test = train[idx[0:sample_size], -1]
    else:
        # Table is small or no sample size requested: train and "test" on the
        # full table (the reported score is then an in-sample score).
        x = train[:, 0:-1]
        y = train[:, -1]

        x_test = x
        y_test = y

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        rf.fit(x, y)

    func_name = fn_split[0]

    # Per-column bit widths (and sign flags) needed to represent the input
    # and output value ranges in the generated fixed-width code.
    bits = np.ceil(np.log2(np.abs(np.amax(x, axis=0) - np.amin(x, axis=0) +
                                  1))).astype(np.int32)
    is_neg = (np.amin(x, axis=0) < 0).astype(np.int8)

    o_bits = np.ceil(
        np.log2(np.abs(np.amax(y, axis=0) - np.amin(y, axis=0) + 1))).astype(
            np.int32)
    o_is_neg = (np.amin(y, axis=0) < 0).astype(np.int8)

    # Stash the metadata on the model so it survives pickling alongside it.
    rf.bits = bits
    rf.is_neg = is_neg
    rf.o_bits = o_bits
    rf.o_is_neg = o_is_neg

    code = gen_random_forest(rf,
                             func_name,
                             bits,
                             is_neg,
                             o_bits,
                             o_is_neg,
                             is_regressor=is_regressor,
                             is_top_level=False,
                             is_cc=file_suffix == "cc")

    # Close the handle deterministically instead of leaking it.
    with open(cv_file, "w") as code_f:
        code_f.write("\n".join(code))

    # Round predictions so classification-style exact-match scoring works.
    p = 1.0 * np.round(rf.predict(x_test))

    # Output value range, used to normalize the mean absolute error.
    dy = np.max(train[:, -1]) - np.min(train[:, -1])

    error = np.sum(np.abs(y_test - p)) / (1.0 * p.shape[0] * dy)
    score = np.sum(y_test == p) / p.shape[0]

    print("y:", np.max(y_test), y_test[0:30].astype(np.int32))
    print("p:", np.max(p), p[0:30].astype(np.int32))

    print("... model {} with score of {:.2f}% and error of {:.2f}%".format(
        func_name, 100.0 * score, 100.0 * error))

    print("... saving model in {}".format(rfb_file))
    with open(rfb_file, "wb") as model_f:
        pickle.dump(rf, model_f)
    return rfb_file