Example #1
0
    def fast_train(self,
                   param,
                   dset,
                   enable=(0, 0, 1, 0, 1, 1, 1, 1),
                   cc=False):
        """Fit one fresh classifier per enabled model slot and return them.

        Model slots, by position in ``enable``:

        0. ``KNeighborsClassifier``
        1. ``svm.SVC``
        2. ``svm.NuSVC``
        3. ``xgb.XGBClassifier`` (``objective='binary:logistic'``)
        4. ``MLPClassifier``
        5. ``RandomForestClassifier``
        6. ``rerfClassifier`` with ``projection_matrix='RerF'``
        7. ``rerfClassifier`` with ``projection_matrix='MT-MORF'``

        Args:
            param: sequence of estimator-like objects exposing
                ``get_params()``. It is indexed by the number of models
                trained so far, so it must hold one entry per *enabled*
                slot, in slot order. Each entry's parameters are copied onto
                the freshly built classifier before fitting.
            dset: key/index used to pick the training data out of
                ``self.train_X`` / ``self.train_y`` (or the ``C``-prefixed
                containers when ``cc`` is true).
            enable: per-slot flags; a slot is trained when its flag ``== 1``.
            cc: when true, train on ``self.Ctrain_X[dset]`` /
                ``self.Ctrain_y[dset]`` instead of the plain containers.

        Returns:
            list of fitted classifiers, ordered by slot.

        Raises:
            ValueError: if a flag is enabled at a position that has no model
                (previously this silently re-fitted the last model or failed
                with an unbound local).
        """
        if cc:
            train_X, train_y = self.Ctrain_X[dset], self.Ctrain_y[dset]
        else:
            train_X, train_y = self.train_X[dset], self.train_y[dset]

        # Lambdas keep construction (and name lookup) lazy, so a library
        # backing a disabled slot is never touched.
        builders = (
            lambda: KNeighborsClassifier(),
            lambda: svm.SVC(),
            lambda: svm.NuSVC(),
            lambda: xgb.XGBClassifier(objective='binary:logistic'),
            lambda: MLPClassifier(),
            lambda: RandomForestClassifier(n_jobs=-1),
            lambda: rerfClassifier(projection_matrix='RerF', n_jobs=-1),
            lambda: rerfClassifier(projection_matrix='MT-MORF', n_jobs=-1),
        )

        post = []
        for slot, flag in enumerate(enable):
            if flag != 1:
                continue
            if slot >= len(builders):
                raise ValueError(
                    "enable[%d] is set but only %d model slots are defined" %
                    (slot, len(builders)))

            model = builders[slot]()
            # len(post) is the number of models trained so far, i.e. the
            # position of this model's template inside `param`.
            model = model.set_params(**param[len(post)].get_params())
            model.fit(train_X, train_y)
            post.append(model)

        return post
Example #2
0
    def fit(self, images, labels):
        """Fit an S-RerF forest on a batch of images and return its posteriors.

        Only ``self.type == 'native'`` is handled. For any other type nothing
        is fitted and a placeholder ``np.zeros(5)`` is returned.

        Args:
            images: array whose shape unpacks into four axes
                ``(batch, height, width, last)``. The batch is flattened to
                ``(batch, height * width)``, which only succeeds when the
                last axis has size 1.
            labels: training targets passed straight to the forest's ``fit``.

        Returns:
            ``self.forest.predict_proba`` evaluated on the flattened training
            images (native mode), otherwise ``np.zeros(5)``.
        """
        if self.type != 'native':
            return np.zeros(5)

        batch_size, length, width, _ = images.shape
        reshaped_images = images.reshape(batch_size, length * width)

        # The maximum patch width used to be fed from `patch_height_min`,
        # which looks like a copy-paste slip. Prefer the instance's own
        # `patch_width_max`; keep the old value as a fallback in case the
        # attribute is not defined on this object.
        patch_width_max = getattr(self, 'patch_width_max',
                                  self.patch_height_min)

        self.forest = rerfClassifier(
            projection_matrix="S-RerF",
            n_estimators=self.num_trees,
            n_jobs=cpu_count() - 1,
            image_height=length,
            image_width=width,
            patch_height_min=self.patch_height_min,
            patch_width_min=self.patch_width_min,
            patch_height_max=self.patch_height_max,
            patch_width_max=patch_width_max)
        self.forest.fit(reshaped_images, labels)

        return self.forest.predict_proba(reshaped_images)
Example #3
0
def train_random_forest(X,
                        y,
                        train_idx,
                        test_idx,
                        projection_matrices=("RerF", "S-RerF",
                                             "Graph-Node-RerF",
                                             "Graph-Edge-RerF"),
                        n_trees=1000,
                        sporf_mtry=None,
                        morf_mtry=None,
                        patch_min=None,
                        patch_max=None,
                        random_state=None,
                        return_prob=False):
    """Train one ``rerfClassifier`` per projection matrix and score it.

    Args:
        X: sample array; axis 0 indexes samples and axis 1 is taken as the
            image height. When a third axis exists it is used as the image
            width; otherwise the width defaults to the height.
        y: labels, indexable by ``train_idx`` / ``test_idx``.
        train_idx: indices of the training samples.
        test_idx: indices of the held-out samples.
        projection_matrices: iterable of ``projection_matrix`` names; one
            forest is trained per entry.
        n_trees: number of estimators per forest.
        sporf_mtry: ``max_features`` used when the projection is ``"RerF"``.
        morf_mtry: ``max_features`` used for every other projection.
        patch_min: minimum patch height and width.
        patch_max: maximum patch height and width.
        random_state: seed forwarded to each forest.
        return_prob: when true, collect ``predict_proba`` outputs instead of
            error rates.

    Returns:
        list with one entry per projection matrix, in order: the test
        misclassification rate, or the predicted probabilities for the test
        set when ``return_prob`` is true.
    """
    x_train = X[train_idx]
    x_test = X[test_idx]
    y_train = y[train_idx]
    y_test = y[test_idx]

    # Image geometry inferred from the data. Falling back to the height keeps
    # the previous behaviour for inputs without a separate width axis.
    img_height = X.shape[1]
    img_width = X.shape[2] if len(X.shape) > 2 else img_height

    # The classifier expects one flat feature vector per sample.
    x_train = x_train.reshape(x_train.shape[0], -1)
    x_test = x_test.reshape(x_test.shape[0], -1)

    results = []
    for projection_matrix in projection_matrices:
        mtry = sporf_mtry if projection_matrix == "RerF" else morf_mtry

        cls = rerfClassifier(
            projection_matrix=projection_matrix,
            max_features=mtry,
            n_jobs=-1,
            n_estimators=n_trees,
            oob_score=False,
            random_state=random_state,
            image_height=img_height,
            image_width=img_width,
            patch_height_max=patch_max,
            patch_height_min=patch_min,
            patch_width_max=patch_max,
            patch_width_min=patch_min,
        )
        cls.fit(x_train, y_train)

        if return_prob:
            results.append(cls.predict_proba(x_test))
        else:
            results.append(np.mean(cls.predict(x_test) != y_test))

    return results
Example #4
0
    def compute(self, config, budget, **kwargs):
        """Train the configured forest and report its accuracies.

        Args:
            config: dictionary containing the configuration sampled by the
                optimizer. ``config['clf']`` selects the model: ``"skrf"``
                (``RandomForestClassifier``, reads ``max_features_sk`` and
                ``max_depth``) or ``"sporf"`` (``rerfClassifier`` with
                ``projection_matrix="RerF"``, reads ``max_features_sporf``,
                ``max_depth`` and ``sporf_fc``).
            budget: number of trees the model is allowed to use in training;
                converted with ``int()``.
            **kwargs: accepted but not used here.

        Returns:
            dictionary with mandatory fields:
                'loss' (scalar): ``1 - validation accuracy``
                'info' (dict): test loss plus test, validation and training
                    accuracy.

        Raises:
            ValueError: if ``config['clf']`` names an unknown model.
        """
        model_kind = config['clf']

        if model_kind == "skrf":
            clf = RandomForestClassifier(
                n_estimators=int(budget),
                n_jobs=self.n_jobs,
                max_features=config['max_features_sk'],
                max_depth=config['max_depth'])
        elif model_kind == "sporf":
            clf = rerfClassifier(
                n_estimators=int(budget),
                max_features=config['max_features_sporf'],
                max_depth=config['max_depth'],
                feature_combinations=config['sporf_fc'],
                n_jobs=self.n_jobs,
                projection_matrix="RerF")
        else:
            # Fail with a clear message instead of an unbound-local error at
            # the fit() call below.
            raise ValueError(
                "unknown classifier %r in config['clf']; expected 'skrf' or "
                "'sporf'" % (model_kind,))

        clf.fit(self.X, self.y)

        train_pred = clf.predict(self.X)
        train_accuracy = metrics.accuracy_score(self.y, train_pred)

        y_val_hat = clf.predict(self.X_val)
        val_accuracy = metrics.accuracy_score(self.y_val, y_val_hat)

        y_test_hat = clf.predict(self.X_test)
        test_accuracy = metrics.accuracy_score(self.y_test, y_test_hat)

        return ({
            # this is a mandatory field to run hyperband
            'loss': float(1 - val_accuracy),
            # can be used for any user-defined information - also mandatory
            'info': {
                "test_loss": float(1 - test_accuracy),
                "test_accuracy": test_accuracy,
                "val_accuracy": val_accuracy,
                "train_accuracy": train_accuracy,
            }
        })
Example #5
0
def random_forest_chunks(headers, feature_length, csv_file_location, file_name):
    """Select features from a CSV file and fit a small RerF forest on them.

    Steps:
        1. Read the whole CSV, shuffle its rows and treat every column except
           the last one as a candidate feature.
        2. Drop each feature whose correlation with any earlier-positioned
           feature is ``>= 0.5``.
        3. Run ``backwardElimination`` (significance level 0.3) over the
           remaining features.
        4. Fit the forest on the surviving columns against ``chunk['label']``.

    Args:
        headers: not used by this function.
        feature_length: not used by this function.
        csv_file_location: path (or anything ``pd.read_csv`` accepts) of the
            input table; it must contain a ``label`` column.
        file_name: not used by this function.

    Returns:
        tuple ``(clf, selected_columns)``: the fitted classifier and the
        column names returned by ``backwardElimination``.
    """
    clf = rerfClassifier(projection_matrix="RerF", n_estimators=10,
                         max_depth=10, max_features=None, oob_score=True,
                         n_jobs=-1, random_state=42, feature_combinations=1)

    chunk = shuffle(pd.read_csv(csv_file_location))
    data = chunk.iloc[:, 0:-1]

    # Keep column j only if no earlier column i < j correlates with it at
    # >= 0.5. The strict upper triangle holds exactly the (i, j) pairs with
    # i < j, so a column-wise any() replaces the nested Python loop.
    corr = data.corr()
    too_correlated = np.triu(corr.values, k=1) >= 0.5
    columns = ~too_correlated.any(axis=0)
    selected_columns = data.columns[columns]
    data = data[selected_columns]

    # NOTE(review): the names drop the FIRST kept column while the feature
    # matrix below drops the LAST one, and that last kept feature column is
    # passed as the second argument. Names and values therefore look shifted
    # by one - presumably inherited from a layout where the target was
    # stored inside `data`; confirm against backwardElimination.
    selected_columns = selected_columns[1:].values
    SL = 0.3
    data_modeled, selected_columns = backwardElimination(
        data.iloc[:, 0:-1].values, data.iloc[:, -1].values, SL,
        selected_columns)
    data = pd.DataFrame(data=data_modeled, columns=selected_columns)

    clf.fit(data.values, chunk['label'])
    return clf, selected_columns
# NOTE(review): the OpenML API key is hard-coded in source; consider loading
# it from the environment or the OpenML config file instead.
openml.config.apikey = '204cdba18d110fd68ad24b131ea92030'
benchmark_suite = openml.study.get_suite('OpenML100')

# The 92:93 slice restricts the run to a single task of the suite.
for task_id in benchmark_suite.tasks[92:93]:
    # Fetch the task and its data.
    task = openml.tasks.get_task(task_id)
    X, y = task.get_X_and_y()
    n_samples = np.shape(X)[0]
    n_features = np.shape(X)[1]

    print(task_id)
    print('Data set: %s: ' % (task.get_dataset().name))

    # Classifier whose hyper-parameters are described below.
    rerf = rerfClassifier()

    # max_depth candidates: up to 10 distinct integers spread evenly between
    # 2 and n_samples, plus None.
    depth_grid = np.round(np.linspace(2, n_samples, 10))
    max_depth_array_rerf = np.unique(depth_grid).astype(int)
    max_depth_range_rerf = np.append(max_depth_array_rerf, None)

    # min_samples_split candidates: distinct rounded values on a grid from 1
    # up to log(n_samples). Currently not used in the dictionary below.
    log_n_samples = math.log(n_samples)
    split_grid = np.round(
        np.arange(1, log_n_samples, (log_n_samples - 2) / 10))
    min_sample_splits_range_rerf = np.unique(split_grid).astype(int)

    # Parameters and the values to sample them from.
    rerf_param_dict = {
        "n_estimators": np.arange(50, 550, 50),
        "max_depth": max_depth_range_rerf,
        # "min_samples_split": min_sample_splits_range_rerf,
        "feature_combinations": [1, 2, 3, 4, 5],
        "max_features": ["sqrt", "log2", None, n_features**2],
    }
Example #7
0
def print_pred_summ(acc_list):
    """Print a two-line summary of a collection of accuracies.

    The first line is the number of entries that are numerically equal to 1
    (per ``math.isclose``); the second is the five smallest entries in
    ascending order.
    """
    perfect_runs = sum(1 for acc in acc_list if math.isclose(acc, 1))
    print(perfect_runs)

    lowest_five = sorted(acc_list)[:5]
    print(lowest_five)


def two_sided_mannwhitneyu(x, y):
    """Run a two-sided Mann-Whitney U test without continuity correction.

    The two-sided alternative is requested explicitly instead of doubling
    the p-value of a default call. On SciPy >= 1.7 the default alternative
    is already two-sided, so doubling could report "probabilities" above 1;
    on older releases the default (one-sided) mode was deprecated.

    Args:
        x: first sample.
        y: second sample.

    Returns:
        tuple ``(u, prob)``: the U statistic reported by SciPy and the
        two-sided p-value.
    """
    u, prob = mannwhitneyu(x, y, use_continuity=False,
                           alternative="two-sided")

    return u, prob


# Run the same iris accuracy check for three forests, in this order:
#   "RerF"    - RerF classifier
#   "RF"      - Random Forest classifier (the one of Neurodata)
#   "sklearn" - Random Forest classifier (the one of sklearn)
# Each classifier is built right before it is evaluated.
_benchmarks = (
    ("RerF", lambda: rerfClassifier(n_estimators=100,
                                    projection_matrix="RerF")),
    ("RF", lambda: rerfClassifier(n_estimators=100,
                                  projection_matrix="Base")),
    ("sklearn", lambda: RandomForestClassifier(n_estimators=100)),
)

_acc_by_label = {}
for _label, _build in _benchmarks:
    clf = _build()
    _acc_by_label[_label] = iris_pred_acc(clf, 10000)
    print(_label)
    print_pred_summ(_acc_by_label[_label])

# Expose the per-classifier results under their usual module-level names.
rerf_acc = _acc_by_label["RerF"]
rf_acc = _acc_by_label["RF"]
sklearn_acc = _acc_by_label["sklearn"]
Example #8
0
        "petal width": iris.data[:, 3],
        "species": iris.target,
    }
)

# Show the first 5 rows of the table.
print(data.head())

# Features: the four measurement columns. Selecting with a list of column
# names keeps the result a table.
feature_names = ["sepal length", "sepal width", "petal length", "petal width"]
X = data[feature_names]
# Labels: the single "species" column (selected with a plain name, not a
# list). Print X and y to see the difference.
y = data["species"]

# Hold out 30% of the rows for testing; train on the remaining 70%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Create a RerF classifier with 100 trees and train it on the training set.
clf = rerfClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# Predict the species of every held-out sample.
y_pred = clf.predict(X_test)

# Model accuracy: how often is the classifier correct on the test set?
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Example #9
0
        matplotlib.use('QT5Agg')
        import matplotlib.pyplot as plt
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        plt.subplots(figsize=(10, 10))
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        # Put the result into a color plot
        Z = np.array(Z)
        Z = Z.reshape(xx.shape)
        plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
        # Plot also the training points
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm, edgecolor='k')
        plt.xlabel('Dimension 1')
        plt.ylabel('Dimension 2')
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xticks(())
        plt.yticks(())
        plt.show()

    # Build the pipeline from two named steps - PCA down to 4 components,
    # then a small RerF forest (10 trees, depth 2) - and register plotRerF as
    # its plot method.
    MASEP = MASEPipeline(
        [('pca', PCA(n_components=4)),
         ('rerf', rerfClassifier(n_estimators=10, max_depth=2))],
        plot_method=plotRerF)
    # Configure the step addressed as "MASE" (not among the steps listed
    # above - presumably added by MASEPipeline itself; TODO confirm).
    MASEP.set_params(MASE__n_components=6, MASE__algorithm='full')
    MASEP.fit(undirected_sbms, labels_sbm)
    # Show which type predict() returns for the training graphs.
    print(type(MASEP.predict(undirected_sbms)))
    # Only the first element of the cross-validation result is kept and printed.
    cvs, _ = MASEP.cross_val_score(undirected_sbms, labels_sbm)
    print(cvs)
    MASEP.plot(undirected_sbms, labels_sbm)
###############################################################################
# Building classifiers and specifying parameter ranges to sample from
# -------------------------------------------------------------------
#

# Fetch the car dataset (OpenML data_id 40975) as a frame and integer-encode
# the target and every feature column with pd.factorize.
X, y = fetch_openml(data_id=40975, return_X_y=True, as_frame=True)
y = pd.factorize(y)[0]
X = X.apply(lambda column: pd.factorize(column)[0])
n_samples = np.shape(X)[0]
n_features = np.shape(X)[1]

# Classifier whose hyper-parameters are searched.
rerf = rerfClassifier()

# max_depth candidates: up to 10 distinct integers spread evenly between 2
# and n_samples, plus None.
depth_grid = np.round(np.linspace(2, n_samples, 10))
max_depth_array_rerf = np.unique(depth_grid).astype(int)
max_depth_range_rerf = np.append(max_depth_array_rerf, None)

# min_samples_split candidates: distinct rounded values on a grid from 1 up
# to log(n_samples).
log_n_samples = math.log(n_samples)
split_grid = np.round(np.arange(1, log_n_samples, (log_n_samples - 2) / 10))
min_sample_splits_range_rerf = np.unique(split_grid).astype(int)

# specify parameters and distributions to sample from
rerf_param_dict = {
    "n_estimators": np.arange(50, 550, 50),
    "max_depth": max_depth_range_rerf,
    "min_samples_split": min_sample_splits_range_rerf,