def fast_train(self, param, dset, enable=(0, 0, 1, 0, 1, 1, 1, 1), cc=False):
    """Fit a fresh copy of every enabled classifier on one dataset.

    Model slots, by position in ``enable``:
    0 KNeighborsClassifier, 1 svm.SVC, 2 svm.NuSVC, 3 xgb.XGBClassifier,
    4 MLPClassifier, 5 RandomForestClassifier, 6 rerfClassifier("RerF"),
    7 rerfClassifier("MT-MORF").

    Args:
        param: Sequence of configured estimators, one per *enabled* slot and
            in slot order. Each new model copies its hyper-parameters from
            ``param[k].get_params()``.
        dset: Key/index used to select the dataset from the containers on
            ``self``.
        enable: Sequence of flags; a slot is trained when its flag equals 1.
        cc: If true, train on ``self.Ctrain_X[dset]`` / ``self.Ctrain_y[dset]``
            instead of ``self.train_X[dset]`` / ``self.train_y[dset]``.

    Returns:
        List of fitted estimators, in slot order.

    Raises:
        ValueError: If a flag is set for a slot that has no model attached.
    """
    # Lambdas keep construction (and name lookup) lazy, so only the models
    # that are actually enabled are ever built.
    factories = (
        lambda: KNeighborsClassifier(),
        lambda: svm.SVC(),
        lambda: svm.NuSVC(),
        lambda: xgb.XGBClassifier(objective='binary:logistic'),
        lambda: MLPClassifier(),
        lambda: RandomForestClassifier(n_jobs=-1),
        lambda: rerfClassifier(projection_matrix='RerF', n_jobs=-1),
        lambda: rerfClassifier(projection_matrix='MT-MORF', n_jobs=-1),
    )

    if cc:
        train_X, train_y = self.Ctrain_X[dset], self.Ctrain_y[dset]
    else:
        train_X, train_y = self.train_X[dset], self.train_y[dset]

    post = []
    for slot, flag in enumerate(enable):
        if flag != 1:
            continue
        if slot >= len(factories):
            # Previously this either crashed on an unbound name or silently
            # re-fitted and re-appended the previous slot's model.
            raise ValueError(
                "enable[%d] is set but only %d model slots are defined"
                % (slot, len(factories)))
        model = factories[slot]()
        # len(post) counts the models trained so far, i.e. the position of
        # this slot among the enabled ones.
        model = model.set_params(**param[len(post)].get_params())
        model.fit(train_X, train_y)
        post.append(model)
    return post
def fit(self, images, labels):
    """Fit the structured forest on a batch of images and return its posteriors.

    Only ``self.type == 'native'`` is implemented. For any other type nothing
    is trained and a zero vector of length 5 is returned as a placeholder.

    Args:
        images: Array whose ``shape`` unpacks into four axes
            ``(batch, length, width, channels)``. The batch is flattened to
            ``(batch, length * width)``, which only succeeds when the last
            axis has size 1.
        labels: Training labels, passed straight to the forest's ``fit``.

    Returns:
        ``self.forest.predict_proba`` evaluated on the flattened training
        images, or ``np.zeros(5)`` when ``self.type`` is not ``'native'``.
    """
    if self.type != 'native':
        return np.zeros(5)

    batch_size, length, width, _ = images.shape
    reshaped_images = images.reshape(batch_size, length * width)

    # The maximum patch width used to be fed from ``patch_height_min``
    # (copy/paste slip). Prefer the dedicated attribute; fall back to the
    # old value if the instance does not define one, so existing objects
    # keep working.
    patch_width_max = getattr(self, "patch_width_max", self.patch_height_min)

    self.forest = rerfClassifier(
        projection_matrix="S-RerF",
        n_estimators=self.num_trees,
        n_jobs=cpu_count() - 1,
        image_height=length,
        image_width=width,
        patch_height_min=self.patch_height_min,
        patch_width_min=self.patch_width_min,
        patch_height_max=self.patch_height_max,
        patch_width_max=patch_width_max)
    self.forest.fit(reshaped_images, labels)
    return self.forest.predict_proba(reshaped_images)
def train_random_forest(X,
                        y,
                        train_idx,
                        test_idx,
                        projection_matrices=(
                            "RerF", "S-RerF", "Graph-Node-RerF",
                            "Graph-Edge-RerF"
                        ),
                        n_trees=1000,
                        sporf_mtry=None,
                        morf_mtry=None,
                        patch_min=None,
                        patch_max=None,
                        random_state=None,
                        return_prob=False):
    """Train one forest per projection type and score it on a held-out split.

    Args:
        X: Array of samples indexed along axis 0; axis 1 is taken as the
            image height and, when present, axis 2 as the image width.
            Samples are flattened before fitting.
        y: Array of labels aligned with ``X``.
        train_idx: Indices of the training samples.
        test_idx: Indices of the test samples.
        projection_matrices: Iterable of ``projection_matrix`` values to try.
        n_trees: Number of trees per forest.
        sporf_mtry: ``max_features`` used when the projection is ``"RerF"``.
        morf_mtry: ``max_features`` used for every other projection.
        patch_min: Minimum patch height and width.
        patch_max: Maximum patch height and width.
        random_state: Seed forwarded to each forest.
        return_prob: If true, collect ``predict_proba`` outputs instead of
            error rates.

    Returns:
        A list with one entry per projection, in input order: the test
        misclassification rate, or the predicted class probabilities when
        ``return_prob`` is true.
    """
    XTRAIN = X[train_idx]
    XTEST = X[test_idx]
    YTRAIN = y[train_idx]
    YTEST = y[test_idx]

    # Image geometry inferred from the data. Width falls back to the height
    # only when X carries no separate width axis (previous behaviour).
    img_height = X.shape[1]
    img_width = X.shape[2] if X.ndim > 2 else img_height

    # Vectorize so that inputs work with the classifier.
    XTRAIN = XTRAIN.reshape(XTRAIN.shape[0], -1)
    XTEST = XTEST.reshape(XTEST.shape[0], -1)

    errors = []
    for projection_matrix in projection_matrices:
        mtry = sporf_mtry if projection_matrix == "RerF" else morf_mtry

        cls = rerfClassifier(
            projection_matrix=projection_matrix,
            max_features=mtry,
            n_jobs=-1,
            n_estimators=n_trees,
            oob_score=False,
            random_state=random_state,
            image_height=img_height,
            image_width=img_width,
            patch_height_max=patch_max,
            patch_height_min=patch_min,
            patch_width_max=patch_max,
            patch_width_min=patch_min,
        )
        cls.fit(XTRAIN, YTRAIN)

        if return_prob:
            errors.append(cls.predict_proba(XTEST))
        else:
            preds = cls.predict(XTEST)
            errors.append(np.mean(preds != YTEST))
    return errors
def compute(self, config, budget, **kwargs):
    """Train the sampled configuration and report its validation loss.

    Args:
        config: dictionary containing the sampled configurations by the
            optimizer. Reads ``'clf'`` (``"skrf"`` or ``"sporf"``),
            ``'max_depth'`` and, depending on the classifier,
            ``'max_features_sk'`` or ``'max_features_sporf'`` plus
            ``'sporf_fc'``.
        budget: (int) number of trees the model is allowed to use in
            training.
        **kwargs: accepted and ignored.

    Returns:
        dictionary with mandatory fields:
            'loss' (scalar): 1 - validation accuracy
            'info' (dict): test loss plus test/validation/train accuracy

    Raises:
        ValueError: if ``config['clf']`` names an unknown classifier.
    """
    clf_name = config['clf']
    if clf_name == "skrf":
        clf = RandomForestClassifier(
            n_estimators=int(budget),
            n_jobs=self.n_jobs,
            max_features=config['max_features_sk'],
            max_depth=config['max_depth'])
    elif clf_name == "sporf":
        clf = rerfClassifier(
            n_estimators=int(budget),
            max_features=config['max_features_sporf'],
            max_depth=config['max_depth'],
            feature_combinations=config['sporf_fc'],
            n_jobs=self.n_jobs,
            projection_matrix="RerF",
        )
    else:
        # Fail with a clear message instead of an unbound-variable error
        # further down.
        raise ValueError(
            "unknown classifier %r; expected 'skrf' or 'sporf'" % (clf_name,))

    clf.fit(self.X, self.y)

    train_accuracy = metrics.accuracy_score(self.y, clf.predict(self.X))
    val_accuracy = metrics.accuracy_score(self.y_val, clf.predict(self.X_val))
    test_accuracy = metrics.accuracy_score(self.y_test,
                                           clf.predict(self.X_test))

    return ({
        # this is a mandatory field to run hyperband
        'loss': float(1 - val_accuracy),
        # can be used for any user-defined information - also mandatory
        'info': {
            "test_loss": float(1 - test_accuracy),
            "test_accuracy": test_accuracy,
            "val_accuracy": val_accuracy,
            "train_accuracy": train_accuracy,
        }
    })
def random_forest_chunks(headers, feature_length, csv_file_location, file_name):
    """Fit a small RerF forest on a CSV after two feature-pruning passes.

    The CSV is read in full and passed through ``shuffle``. Every column
    except the last one is treated as a candidate feature, and candidates are
    pruned in two steps:

    1. Correlation filter: a column is dropped when its correlation with any
       column to its left is >= 0.5.
    2. ``backwardElimination`` with significance level 0.3.

    The forest is then fitted on the surviving columns against
    ``chunk['label']``.

    Args:
        headers: Unused; kept so existing callers keep working.
        feature_length: Unused; kept so existing callers keep working.
        csv_file_location: Path (or buffer) handed to ``pd.read_csv``.
        file_name: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(clf, selected_columns)``: the fitted classifier and the
        column names returned by ``backwardElimination``.
    """
    clf = rerfClassifier(projection_matrix="RerF",
                         n_estimators=10,
                         max_depth=10,
                         max_features=None,
                         oob_score=True,
                         n_jobs=-1,
                         random_state=42,
                         feature_combinations=1)

    chunk = pd.read_csv(csv_file_location)
    chunk = shuffle(chunk)
    data = chunk.iloc[:, 0:-1]

    # Entry (i, j) of the strict upper triangle pairs column j with an
    # earlier column i, so a column is discarded as soon as any column to
    # its left correlates with it at >= 0.5. NaN correlations compare False.
    corr = data.corr()
    too_correlated = np.triu(corr.values >= 0.5, k=1).any(axis=0)
    columns = ~too_correlated

    selected_columns = data.columns[columns]
    data = data[selected_columns]

    # NOTE(review): the first surviving column name is dropped here and the
    # last surviving column is handed to backwardElimination as the response
    # while the model is later fitted against chunk['label']. This mirrors
    # the original code; confirm it is the intended layout.
    selected_columns = selected_columns[1:].values
    SL = 0.3
    data_modeled, selected_columns = backwardElimination(
        data.iloc[:, 0:-1].values, data.iloc[:, -1].values, SL,
        selected_columns)
    data = pd.DataFrame(data=data_modeled, columns=selected_columns)

    clf.fit(data.values, chunk['label'])
    return clf, selected_columns
import os

# NOTE(security): an API key is committed in this file. The key can now be
# supplied through the OPENML_APIKEY environment variable; the literal is
# kept only as a fallback so existing runs keep working. It should be
# rotated and removed from source control.
openml.config.apikey = os.environ.get('OPENML_APIKEY',
                                      '204cdba18d110fd68ad24b131ea92030')

benchmark_suite = openml.study.get_suite('OpenML100')
# Only the task at position 92 of the suite is processed.
for task_id in benchmark_suite.tasks[92:93]:
    # get some data
    task = openml.tasks.get_task(task_id)
    X, y = task.get_X_and_y()
    n_features = np.shape(X)[1]
    n_samples = np.shape(X)[0]
    print(task_id)
    print('Data set: %s: ' % (task.get_dataset().name))

    # build a classifier
    rerf = rerfClassifier()

    # Candidate max_depth values: up to 10 distinct integers spread evenly
    # over [2, n_samples], plus None as an extra candidate.
    max_depth_array_rerf = (np.unique(np.round((np.linspace(2, n_samples, 10))))).astype(int)
    max_depth_range_rerf = np.append(max_depth_array_rerf, None)
    # Candidate min_samples_split values: distinct rounded integers from 1
    # up to (but excluding) log(n_samples).
    min_sample_splits_range_rerf = (np.unique(np.round((np.arange(1, math.log(n_samples), (math.log(n_samples) - 2) / 10))))).astype(int)

    # specify parameters and distributions to sample from
    rerf_param_dict = {"n_estimators": np.arange(50, 550, 50),
                       "max_depth": max_depth_range_rerf,
                       # "min_samples_split": min_sample_splits_range_rerf,
                       "feature_combinations": [1, 2, 3, 4, 5],
                       "max_features": ["sqrt", "log2", None, n_features**2]}
def print_pred_summ(acc_list):
    """Print how many accuracies are (numerically) 1 and the five lowest ones.

    Args:
        acc_list: Iterable of accuracy values.
    """
    print(sum(math.isclose(yt, 1) for yt in acc_list))
    print(sorted(acc_list)[0:5])


def two_sided_mannwhitneyu(x, y):
    """Run a two-sided Mann-Whitney U test without continuity correction.

    The previous implementation doubled the p-value returned by
    ``mannwhitneyu``. That is only right for legacy SciPy, whose default was
    a one-sided test; current SciPy defaults to a two-sided test, so the
    doubling inflated the result (possibly above 1). The alternative is now
    requested explicitly, which gives the same p-value on every version.

    Args:
        x: First sample (sized sequence).
        y: Second sample (sized sequence).

    Returns:
        Tuple ``(u, prob)``: ``u`` is the smaller of the two U statistics
        (the legacy convention), ``prob`` the two-sided p-value.
    """
    u_stat, prob = mannwhitneyu(x, y, use_continuity=False,
                                alternative='two-sided')
    # U1 + U2 == len(x) * len(y); which one SciPy reports differs between
    # versions, so normalise to the smaller one.
    u = min(u_stat, len(x) * len(y) - u_stat)
    return u, prob


# RerF classifier test
clf = rerfClassifier(n_estimators=100, projection_matrix="RerF")
rerf_acc = iris_pred_acc(clf, 10000)
print("RerF")
print_pred_summ(rerf_acc)

# Random Forest classifier test (the Neurodata implementation)
clf = rerfClassifier(n_estimators=100, projection_matrix="Base")
rf_acc = iris_pred_acc(clf, 10000)
print("RF")
print_pred_summ(rf_acc)

# Random Forest classifier test (the sklearn implementation)
clf = RandomForestClassifier(n_estimators=100)
sklearn_acc = iris_pred_acc(clf, 10000)
print("sklearn")
print_pred_summ(sklearn_acc)
"petal width": iris.data[:, 3],
"species": iris.target,
}
)
# (The lines above close a statement that starts outside this excerpt.)

# Print the first 5 rows of the data
print(data.head())

# Features: the four measurement columns (sepal length/width, petal
# length/width). The double square brackets select a list of columns.
X = data[["sepal length", "sepal width", "petal length", "petal width"]]  # Features
# Labels: the single "species" column. Single brackets are used because only
# one column is selected. Print y and X to compare the two results.
y = data["species"]  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3
)  # 70% training and 30% test

# Create a RerF Classifier
clf = rerfClassifier(n_estimators=100)

# Train the model using the training sets
clf.fit(X_train, y_train)

# Predict the species of every sample in the held-out test set
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# NOTE(review): the plotting statements below use clf, xx, yy, X and y, none
# of which are defined in this excerpt; they look like the body of a plotting
# helper whose header is outside the view - confirm before moving them.

# Select the Qt5 backend before pyplot is imported.
matplotlib.use('QT5Agg')
import matplotlib.pyplot as plt

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
plt.subplots(figsize=(10, 10))
# One prediction per mesh point: the two flattened coordinate grids are
# stacked column-wise into (x, y) rows.
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot (reshaped back onto the mesh grid)
Z = np.array(Z)
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)

# Plot also the training points, colored by label
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm, edgecolor='k')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
# Hide the axis ticks.
plt.xticks(())
plt.yticks(())
plt.show()

# Pipeline with a PCA step followed by a small RerF classifier; plotRerF is
# passed as the pipeline's plotting callback.
MASEP = MASEPipeline(
    [('pca', PCA(n_components=4)), ('rerf', rerfClassifier(n_estimators=10, max_depth=2))],
    plot_method=plotRerF)
# NOTE(review): these parameters target a step named "MASE", which is not in
# the step list above - presumably MASEPipeline adds it itself; confirm.
MASEP.set_params(MASE__n_components=6, MASE__algorithm='full')
MASEP.fit(undirected_sbms, labels_sbm)
# Show the type of object returned by predict.
print(type(MASEP.predict(undirected_sbms)))
# Only the first element returned by cross_val_score is kept and printed.
cvs, _ = MASEP.cross_val_score(undirected_sbms, labels_sbm)
print(cvs)
MASEP.plot(undirected_sbms, labels_sbm)
############################################################################### # Building classifiers and specifying parameter ranges to sample from # ---------------------------------------------------------- # # get some data X, y = fetch_openml(data_id=40975, return_X_y=True, as_frame=True) #car dataset y = pd.factorize(y)[0] X = X.apply(lambda x: pd.factorize(x)[0]) n_features = np.shape(X)[1] n_samples = np.shape(X)[0] # build a classifier rerf = rerfClassifier() # specify max_depth and min_sample_splits ranges max_depth_array_rerf = (np.unique(np.round((np.linspace(2, n_samples, 10))))).astype(int) max_depth_range_rerf = np.append(max_depth_array_rerf, None) min_sample_splits_range_rerf = (np.unique( np.round((np.arange(1, math.log(n_samples), (math.log(n_samples) - 2) / 10))))).astype(int) # specify parameters and distributions to sample from rerf_param_dict = { "n_estimators": np.arange(50, 550, 50), "max_depth": max_depth_range_rerf, "min_samples_split": min_sample_splits_range_rerf,