def main():
    print('--- Adaboost ---')
    data = datasets.load_digits()
    X, y = data.data, data.target

    # Reduce to a binary problem: digits 1 vs. 8, relabeled to {1, -1}
    digit1 = 1
    digit2 = 8
    idx = np.append(np.where(y == digit1)[0], np.where(y == digit2)[0])
    y = data.target[idx]
    y[y == digit1] = 1
    y[y == digit2] = -1
    X = data.data[idx]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    clf = Adaboost(n_estimators=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_pred, y_test)

    clf_tree = ClassificationTree()
    clf_tree.fit(X_train, y_train)
    y_pred_tree = clf_tree.predict(X_test)
    # Bug fix: score the tree on its own predictions, not on Adaboost's
    acc_tree = accuracy_score(y_pred_tree, y_test)

    print("Adaboost_Accuracy:", acc)
    print("Tree_Accuracy:", acc_tree)
def __init__(self, df, dep_var, cont_inputs, int_inputs, test_size, seed=None):
    """
    Generates train/test splits and array/tensor versions of the data.
    Input data is raw; after init, the data is scaled and transformed.
    :param df: Original raw DataFrame
    :param dep_var: Name of the dependent variable
    :param cont_inputs: List of names of continuous features
    :param int_inputs: List of names of integer features
    :param test_size: Size of the test set (number of rows)
    :param seed: Random seed for reproducibility
    """
    self.dep_var = dep_var
    self.cont_inputs = cont_inputs
    self.int_inputs = int_inputs
    self.labels_list = list(df[dep_var].unique())
    self.df_dtypes = df.dtypes
    self.df_cols = df.columns

    # Reorganize data set
    df = uu.reorder_cols(df=df, dep_var=dep_var, cont_inputs=self.cont_inputs)
    self.cat_inputs, self.cat_mask = uu.define_cat_inputs(df=df, dep_var=dep_var,
                                                          cont_inputs=cont_inputs)

    # Split data into train/test
    x_train_arr, x_test_arr, y_train_arr, y_test_arr = uu.train_test_split(
        df.drop(columns=dep_var), df[dep_var], test_size=test_size,
        stratify=df[dep_var], random_state=seed)

    # Convert all categorical variables to dummies, and save the two-way transformation
    self.le_dict, self.ohe, x_train_arr, x_test_arr = uu.encode_categoricals_custom(
        df=df, x_train=x_train_arr, x_test=x_test_arr,
        cat_inputs=self.cat_inputs, cat_mask=self.cat_mask)
    self.preprocessed_cat_mask = uu.create_preprocessed_cat_mask(
        le_dict=self.le_dict, x_train=x_train_arr)

    # Scale continuous inputs (fit the scaler on train only, reuse it on test)
    if len(self.cont_inputs) == 0:
        self.scaler = None
    else:
        x_train_arr, self.scaler = uu.scale_cont_inputs(
            arr=x_train_arr, preprocessed_cat_mask=self.preprocessed_cat_mask)
        x_test_arr, _ = uu.scale_cont_inputs(
            arr=x_test_arr, preprocessed_cat_mask=self.preprocessed_cat_mask,
            scaler=self.scaler)

    # Convert to tensor-friendly format
    self.x_train, self.x_test, self.y_train, self.y_test = self.preprocess_data(
        x_train_arr=x_train_arr, y_train_arr=y_train_arr,
        x_test_arr=x_test_arr, y_test_arr=y_test_arr)
    self.out_dim = self.x_train.shape[1]
    self.eval_stratify = list(self.y_train.mean(0).detach().cpu().numpy())

    # Set current device
    self.device = self.get_dev()
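# The uu.scale_cont_inputs helper is not shown here. Judging from its call
# signature above (array in, boolean categorical mask, optional fitted scaler;
# scaled array and scaler out), a minimal sketch could look like the following.
# The StandardScaler choice is an assumption, not confirmed by the source.
import numpy as np
from sklearn.preprocessing import StandardScaler

def scale_cont_inputs(arr, preprocessed_cat_mask, scaler=None):
    """Sketch: scale only the continuous (non-categorical) columns."""
    arr = np.asarray(arr, dtype=float).copy()
    cont_mask = ~np.asarray(preprocessed_cat_mask)  # continuous = not categorical
    if scaler is None:
        # Fit on the training split...
        scaler = StandardScaler()
        arr[:, cont_mask] = scaler.fit_transform(arr[:, cont_mask])
    else:
        # ...and reuse the fitted scaler on the test split to avoid leakage
        arr[:, cont_mask] = scaler.transform(arr[:, cont_mask])
    return arr, scaler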
def main():
    data = datasets.load_digits()
    X = data.data
    y = data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, seed=2)

    clf = RandomForest(n_estimators=10)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
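# The RandomForest class used above is not shown. The two ingredients it needs
# beyond a single decision tree are bootstrap sampling and majority voting; the
# sketch below illustrates both, assuming non-negative integer class labels
# (true for load_digits). Function names are illustrative only.
import numpy as np

def bootstrap_sample(X, y, rng=None):
    """Draw len(X) samples with replacement -- the bagging step of a forest."""
    rng = np.random.default_rng(rng)
    idx = rng.integers(0, len(X), size=len(X))
    return X[idx], y[idx]

def majority_vote(tree_preds):
    """Aggregate per-tree predictions (shape: n_trees x n_samples) by majority."""
    return np.array([np.bincount(sample_preds).argmax()
                     for sample_preds in tree_preds.T.astype(int)])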
def main(): print ("-- XGBoost --") data = datasets.load_iris() X = data.data y = data.target X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, seed=3) clf = XGBoost(n_estimators=20) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) accuracy = accuracy_score(y_test, y_pred) print ("Accuracy:", accuracy)
def main(): print("-- Classification Tree --") data = datasets.load_iris() X = data.data y = data.target X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) clf = ClassificationTree() clf.fit(X_train, y_train) y_pred = clf.predict(X_test) accuracy = accuracy_score(y_test, y_pred) print("Accuracy:", accuracy)
def main_classifier():
    print("-- Gradient Boosting Classification --")
    data = datasets.load_iris()
    X = data.data
    y = data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    print(y_train.shape)  # debug output: inspect the training-label shape

    clf = GradientBoostingClassifier(n_estimators=10)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
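# GradientBoostingClassifier above is a from-scratch class that is not shown.
# The core idea is to fit each new tree to the negative gradient of the loss
# (the pseudo-residuals) of the current ensemble. A minimal sketch assuming
# squared error, where the pseudo-residuals reduce to y - prediction; sklearn's
# DecisionTreeRegressor stands in for the repo's own trees:
import numpy as np
from sklearn.tree import DecisionTreeRegressor

def fit_gradient_boosting(X, y, n_estimators=10, learning_rate=0.1):
    base = y.mean()                    # initial constant prediction
    pred = np.full(len(y), base)
    trees = []
    for _ in range(n_estimators):
        residuals = y - pred           # negative gradient of 0.5 * (y - pred)**2
        tree = DecisionTreeRegressor(max_depth=3)
        tree.fit(X, residuals)
        pred += learning_rate * tree.predict(X)
        trees.append(tree)
    return base, trees

def predict_gradient_boosting(X, base, trees, learning_rate=0.1):
    return base + learning_rate * sum(tree.predict(X) for tree in trees)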
def main():
    data = datasets.load_digits()
    X = data.data
    y = data.target

    digit1 = 1
    digit2 = 8
    idx = np.append(np.where(y == digit1)[0], np.where(y == digit2)[0])
    y = data.target[idx]
    # Change labels to {-1, 1}
    y[y == digit1] = -1
    y[y == digit2] = 1
    X = data.data[idx]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    # Adaboost classification with 5 weak classifiers
    clf = Adaboost_1(n_clfs=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
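# Neither Adaboost class is shown, but the reason both demos above relabel the
# targets to {-1, 1} is that the classic AdaBoost update multiplies sample
# weights by exp(-alpha * y * h(x)), which only works with signed labels. A
# sketch of one boosting round (the function name is illustrative):
import numpy as np

def adaboost_round(w, y, h_pred):
    """One AdaBoost round: weigh the weak learner, then re-weight the samples.

    w: current sample weights summing to 1; y, h_pred: values in {-1, 1}.
    """
    err = np.sum(w[h_pred != y])                      # weighted error
    alpha = 0.5 * np.log((1 - err) / (err + 1e-10))   # learner weight (eps avoids /0)
    w = w * np.exp(-alpha * y * h_pred)               # up-weight misclassified samples
    return w / w.sum(), alpha                         # renormalize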
from format_datasets.format_dataset import format_mnist_from_labels
from utils.utils import train_test_split
import networkx as nx
from itertools import combinations
from random import random as rand
import numpy as np

if __name__ == "__main__":
    print("RANDOM EMBEDDING")
    idx_constraints, reverse_cache, (x, y), class_distr = format_mnist_from_labels()
    G = nx.DiGraph()
    train, test = train_test_split(idx_constraints, test_percentage=0.3)

    # For each training triplet (i, j, k), add the (i, j) and (i, k) pairs as edges
    for constraint in train:
        for u, v in list(combinations(constraint, 2))[:-1]:
            G.add_edge(u, v)

    # Baseline: embed every node at a uniform-random position in [0, 1)
    random_embedding = {}
    missing = 0
    for node in G.nodes:
        random_embedding[node] = rand()

    error_rate = 0
    for test_constraint in test:
        i, j, k = test_constraint
        new_cost = 0
        f_i = random_embedding.get(i)
        f_j = random_embedding.get(j)
        f_k = random_embedding.get(k)
        if (f_i is not None) and (f_j is not None) and (f_k is not None):
            # Assumed completion -- the original snippet is cut off here. A
            # triplet (i, j, k) is read as "i should land closer to j than to
            # k", so the constraint counts as violated when that ordering fails.
            if abs(f_i - f_j) >= abs(f_i - f_k):
                error_rate += 1
        else:
            missing += 1
def main(dataset_name):
    best_embedding = OrderedDict()
    min_cost = float("inf")

    if USE_MULTIPROCESS:
        cpu_count = multiprocessing.cpu_count()
    else:
        cpu_count = 1
    process_pool = Pool(cpu_count)

    # Choosing the dataset to test
    if USE_MNIST:
        constraints, num_points = format_mnist_from_distances()
    elif USE_RANDOM:
        constraints, num_points = create_random_dataset()
    elif USE_SINE:
        constraints, num_points = create_sine_dataset()
    elif USE_DD_SQUARES:
        constraints, num_points = create_double_density_squares()
    elif USE_CLUSTERS:
        constraints, num_points = create_n_density_squares()

    train_constraints, test_constraints = train_test_split(
        constraints, test_percentage=TRAIN_TEST_SPLIT_RATE)

    process_pool_arguments = format_arguments(train_constraints, num_points, cpu_count)
    responses = process_pool.starmap(lloc, process_pool_arguments)
    best_embedding = reduce_embedding(best_embedding, min_cost, responses)
    best_violation_count = count_raw_violated_constraints(
        best_embedding, train_constraints)

    predict(best_embedding, dataset_name, test_constraints, train_constraints,
            best_violation_count, embedding_dim=1)

    if SECOND_DIM:
        # SECOND DIMENSION: re-run LLOC on the constraints the 1-D embedding violates
        new_train_set, _ = get_violated_constraints(best_embedding, train_constraints)
        new_num_points = get_num_points(new_train_set)
        process_pool_args = format_arguments(new_train_set, new_num_points, cpu_count)
        responses = process_pool.starmap(lloc, process_pool_args)
        new_best_embedding = reduce_embedding(OrderedDict(), float("inf"), responses)

        projected_best_embedding = create_nd_embedding(best_embedding, n_dim=2)
        best_violation_count = count_raw_violated_constraints(
            projected_best_embedding, train_constraints)

        new_embedding = projected_best_embedding.copy()
        new_violation_count = best_violation_count
        print(f"Original Violates {new_violation_count} constraints", file=sys.stderr)

        new_violation_count, best_embedding = merge_embeddings(
            new_best_embedding, new_violation_count, new_embedding, train_constraints)
        print(f"New Violates {new_violation_count} constraints", file=sys.stderr)

        process_pool.close()
        predict(best_embedding, dataset_name, test_constraints, train_constraints,
                new_violation_count, embedding_dim=2)

    exit(0)
print('Processed: {}, with data shape: {}'.format(cluster, values.shape))

# transform and scale the data for training:
X = values[:, 2:]
Y = values[:, 1:2]

# apply transformation to the data
y = y_scaler.fit_transform(Y).squeeze()
x = preprocessor.fit_transform(X, y)
y = y.reshape(-1, 1)

# split data for validation and testing; passing three arrays yields a
# train/test pair per array (transformed x, raw X, and y) -- the raw training
# split is discarded and the raw test split is kept for plotting:
x_train, x_test, _, x_plot, y_train, y_test = train_test_split(
    x, X, y,
    test_size=parameters['test_split'],
    shuffle=parameters['test_shuffle'])

# print some info to stdout:
print('--------------------------------------------------------')
print('inputs: float, tensor of shape (samples, predictors)')
print('inputs_train shape:', x_train.shape)
print('inputs_test shape:', x_test.shape)
print('--------------------------------------------------------')
print('outputs: float, tensor of shape (samples, 1)')
print('outputs_train shape:', y_train.shape)
print('outputs_test shape:', y_test.shape)
print('--------------------------------------------------------')

# Create neural-network:
import matplotlib.pyplot as plt
import numpy as np

from utils.utils import (X_train_matrix_0, X_train_matrix_1, X_train_matrix_2,
                         X_train_0, X_train_1, X_train_2,
                         Y_train_0, Y_train_1, Y_train_2,
                         X_test_0, X_test_1, X_test_2,
                         X_test_matrix_0, X_test_matrix_1, X_test_matrix_2,
                         accuracy_score, train_test_split)
from K_means.model import K_Means

colors = 10 * ["g", "r", "c", "b", "k"]

X_train_full = np.concatenate((X_train_matrix_0, X_train_matrix_1, X_train_matrix_2))
Y_train_full = np.concatenate((Y_train_0, Y_train_1, Y_train_2)).reshape(-1)
X_test_full = np.concatenate((X_test_matrix_0, X_test_matrix_1, X_test_matrix_2))

X_train, X_val, y_train, y_val = train_test_split(X_train_full, Y_train_full,
                                                  test_size=0.1)

clf = K_Means()
clf.fit(X_train)

y_pred_val = []
y_pred = []
correct = 0
for i in range(len(X_val)):
    predict_me = np.array(X_val[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = clf.predict(predict_me)
    y_pred_val.append(prediction)
    # Note: this comparison assumes K-Means cluster indices happen to match
    # the class labels, which is not guaranteed in general.
    if prediction == y_val[i]:
        correct += 1  # assumed completion: the original snippet is cut off here
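# As the comment above notes, raw K-Means cluster indices need not line up with
# the class labels, so comparing them directly can understate accuracy. A common
# fix, not part of the original script, is to relabel each cluster by the
# majority class among its members (assumes non-negative integer labels):
import numpy as np

def map_clusters_to_labels(cluster_assignments, true_labels):
    """Map each cluster index to the most frequent true label among its members."""
    mapping = {}
    for cluster in np.unique(cluster_assignments):
        members = true_labels[cluster_assignments == cluster]
        mapping[int(cluster)] = int(np.bincount(members.astype(int)).argmax())
    return mapping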
                    self.parameters[c][feature_i]['var'], feature_value)
                    posterior *= likelihood
                else:
                    posterior *= self.parameters[c][feature_i]
            # store the probability that x belongs to class c
            posteriors.append(posterior)
        # return the class with the highest probability
        return self.classes[np.argmax(posteriors)]

    def predict(self, X_test):
        y_pred = [self.get_label(x) for x in X_test]
        return y_pred


if __name__ == '__main__':
    print("-- Naive Bayes --")
    data = datasets.load_iris()
    X = data.data
    y = data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    clf = Navie_Bayes()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
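# The likelihood call above is cut off in this fragment; for Gaussian naive
# Bayes it is typically the normal density evaluated at the feature value. A
# sketch of such a helper (the name _calculate_likelihood, its argument order,
# and the eps smoothing are assumptions):
import numpy as np

def _calculate_likelihood(mean, var, x):
    """Gaussian likelihood of x under N(mean, var); eps guards against var == 0."""
    eps = 1e-4
    coeff = 1.0 / np.sqrt(2.0 * np.pi * var + eps)
    exponent = np.exp(-((x - mean) ** 2) / (2.0 * var + eps))
    return coeff * exponent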
""" Set tunables """ tunables_sequences = [ subsampling_factor, stride, look_back, target_steps_ahead ] tunables_network = [ epochs, depth_conv, depth_dense, filters, kernel_size, reg_n, dropout, dilation_rate ] # ----------------------------------------------------------------------------- # DATA PREPROCESSING # ----------------------------------------------------------------------------- """ Import dataset """ X, y, dataset, seizure = load_data() """ Select training set and test set """ X_train_fold, y_train_fold, X_test_fold, y_test_fold = train_test_split( X, y, cross_val=cross_val) n_folds = len(X_train_fold) """ Iterate through fold-sets """ for fold in range(n_folds): fold_set = fold if cross_val else '/' if cross_val: print(f"Fold set: {fold_set}") X_train = X_train_fold[fold] y_train = y_train_fold[fold] X_test = X_test_fold[fold] y_test = y_test_fold[fold] class_weight = compute_class_weight(y_train) """ Standardize data """ X_train, X_test = data_standardization(X_train, X_test)