def main3():
    """Train a REBEL booster on a two-blob Gaussian toy problem and plot its curves.

    Two Gaussian-quantile blobs (200 and 300 points) are merged, the labels
    of the second blob are flipped, and the result is split 70/30. The
    training set and the predictions obtained from the booster are drawn
    with ``graph``; finally columns 1-3 of ``get_resultats()`` are plotted
    against column 0.
    """
    inner_x, inner_y = make_gaussian_quantiles(
        cov=2., n_samples=200, n_features=2, n_classes=2, random_state=1)
    outer_x, outer_y = make_gaussian_quantiles(
        mean=(3, 3), cov=1.5, n_samples=300, n_features=2, n_classes=2,
        random_state=1)
    features = np.concatenate((inner_x, outer_x))
    # Flip the labels of the shifted blob so the classes interleave.
    labels = np.concatenate((inner_y, 1 - outer_y))

    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=43)
    graph(x_train[:, 0], x_train[:, 1], y_train, 'gaussian_train')

    booster = REBEL(max_iteration=301)
    booster.fit(x_train, y_train, x_test, y_test)
    # NOTE(review): presumably the predictions for x_test - confirm in REBEL.
    predictions = booster.get_prediction()
    print(predictions)
    graph(x_test[:, 0], x_test[:, 1], predictions, 'gaussian_test')

    history = np.array(booster.get_resultats())
    x_axis = history[:, [0]]
    curves = ((1, 'Loss'), (2, 'erreur_train'), (3, 'erreur_test'))
    for column, curve_label in curves:
        plt.plot(x_axis, history[:, [column]], label=curve_label)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()
def main():
    """Run ``bsda_active`` on one pair of 3-class Gaussian-quantile datasets.

    Other dataset pairs that were tried before (moons, hastie, MNIST vs.
    rotated MNIST) and the ``baseline_active`` comparison are currently
    disabled.
    """
    # Only consumed by the disabled baseline_active(...) run.
    query_strat = 'RandomSampling'
    experiments = ['gauus']

    first_set = make_gaussian_quantiles(n_samples=500, n_features=5,
                                        n_classes=3)
    second_set = make_gaussian_quantiles(n_samples=500, n_features=5,
                                         n_classes=3)
    datasets = [(first_set, second_set)]

    bsda_active(datasets=datasets)
def create_data(show_scatter=False):
    """Build a 2-D, two-class dataset from two Gaussian-quantile blobs.

    :param show_scatter: when truthy, draw both blobs side by side before
        returning.
    :return: ``(X, y)`` - the 500 stacked samples and their labels; the
        labels of the second blob are flipped (0 <-> 1).
    """
    # Blob 1: mean (0, 0), cov=2, 200 samples, 2 features, 2 quantile classes.
    X1, y1 = make_gaussian_quantiles(mean=(0, 0), cov=2., n_samples=200,
                                     n_features=2, n_classes=2,
                                     random_state=1)
    # Blob 2: mean (3, 3), cov=1.5, 300 samples, 2 features, 2 quantile classes.
    X2, y2 = make_gaussian_quantiles(mean=(3, 3), cov=1.5, n_samples=300,
                                     n_features=2, n_classes=2,
                                     random_state=1)
    # Merge both blobs.
    X = np.concatenate((X1, X2))
    y = np.concatenate((y1, -y2 + 1))

    # Optionally visualise the two blobs (each coloured by its own,
    # un-flipped labels).
    if show_scatter:
        fig, axs = plt.subplots(1, 2)
        axs[0].scatter(X1[:, 0], X1[:, 1], c=y1)
        axs[0].set_title('类别0的分布', fontproperties=myfont)
        axs[1].scatter(X2[:, 0], X2[:, 1], c=y2)
        axs[1].set_title('类别1的分布', fontproperties=myfont)
        fig.show()
    return X, y
def sklearn_test():
    """Fit the AdaBoost classifier on the two-Gaussian toy set and plot the result.

    Dataset construction and the decision-region plot are taken from the
    scikit-learn example
    https://scikit-learn.org/stable/auto_examples/ensemble/plot_adaboost_twoclass.html#sphx-glr-auto-examples-ensemble-plot-adaboost-twoclass-py

    The left panel shows the exponential loss and the 0-1 loss
    (``1 - score``) per boosting iteration; the right panel shows the
    predicted regions with the samples on top.
    """
    def make_stump():
        # Factory handed to the booster: a fresh depth-1 tree per call.
        return DecisionTreeClassifier(max_depth=1)

    # Construct dataset
    n_inner, n_outer = 200, 300
    inner_x, inner_y = make_gaussian_quantiles(
        cov=2., n_samples=n_inner, n_features=2, n_classes=2, random_state=1)
    outer_x, outer_y = make_gaussian_quantiles(
        mean=(3, 3), cov=1.5, n_samples=n_outer, n_features=2, n_classes=2,
        random_state=1)
    X = np.concatenate((inner_x, outer_x))
    # Flip the second blob's labels, then map {0, 1} to {-1, +1}.
    y = 2 * np.concatenate((inner_y, 1 - outer_y)) - 1

    # Create and fit an AdaBoosted decision tree; fit() is unpacked into
    # per-iteration scores and exponential losses.
    bdt = AdaBoostClassifier(make_stump, n_estimators=200)
    scores, exp_losses = bdt.fit(X, y)
    print('Final Accuracy', scores[-1])

    _, (loss_ax, boundary_ax) = plt.subplots(1, 2, figsize=(12, 10))
    loss_ax.plot(exp_losses, 'b--', label='exp loss')
    loss_ax.plot(1.0 - np.array(scores), 'm--', label='0-1 Loss')
    loss_ax.legend(fontsize=15)
    loss_ax.set_title('Loss Per Iteration for AdaBoost', fontsize=20)

    # Plot the decision boundaries on a dense grid around the data.
    plot_step = 0.02
    x_lo, x_hi = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_lo, y_hi = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_lo, x_hi, plot_step),
                         np.arange(y_lo, y_hi, plot_step))
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = bdt.predict(grid_points).reshape(xx.shape)
    boundary_ax.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    boundary_ax.scatter(X[:, 0], X[:, 1], c=y, s=20, cmap=plt.cm.Paired,
                        edgecolor='k')
    boundary_ax.set_title('AdaBoost Decision Boundary', fontsize=20)
    plt.show()
def generate_evil_2d_set():
    """Return ``(X, y)`` for a 2-D, two-class set made of two Gaussian-quantile blobs.

    The second blob (300 points around (3, 3)) has its labels flipped
    before being appended to the first one (200 points around the origin).
    """
    common = dict(n_features=2, n_classes=2, random_state=1)
    inner_x, inner_y = datasets.make_gaussian_quantiles(
        cov=2., n_samples=200, **common)
    outer_x, outer_y = datasets.make_gaussian_quantiles(
        mean=(3, 3), cov=1.5, n_samples=300, **common)
    X = np.concatenate((inner_x, outer_x))
    y = np.concatenate((inner_y, 1 - outer_y))
    return X, y
def creat_data(self):
    """Fill ``self.x_data`` / ``self.y_data`` with a two-blob Gaussian-quantile set.

    Both blobs have 500 samples, 2 features and 2 quantile classes; the
    second one is centred on (3, 3) and its labels are flipped before the
    two are concatenated.
    """
    # Blob around the default mean.
    centred = make_gaussian_quantiles(n_samples=500, n_features=2,
                                      n_classes=2)
    # Blob whose two feature means are both 3.
    shifted = make_gaussian_quantiles(mean=(3, 3), n_samples=500,
                                      n_features=2, n_classes=2)
    # Merge the two blobs into one dataset.
    self.x_data = np.concatenate((centred[0], shifted[0]))
    self.y_data = np.concatenate((centred[1], 1 - shifted[1]))
def create_data():
    """Create the two-blob Gaussian-quantile dataset and return ``(X, y)``.

    200 points (cov=2) around the default mean and 300 points (cov=1.5)
    around (3, 3) are stacked; the labels of the second blob are flipped.
    """
    blob_a = make_gaussian_quantiles(cov=2., n_samples=200, n_features=2,
                                     n_classes=2, random_state=1)
    blob_b = make_gaussian_quantiles(mean=(3, 3), cov=1.5, n_samples=300,
                                     n_features=2, n_classes=2,
                                     random_state=1)
    X = np.concatenate((blob_a[0], blob_b[0]))
    y = np.concatenate((blob_a[1], 1 - blob_b[1]))
    return X, y
def gauss_easy(n_samples_bsm, n_samples_sm):
    """Build a weighted two-hypothesis dataset from two Gaussian blobs.

    A "BSM" blob (mean (2, 0)) and an "SM" blob (mean (-2, 0)) are stacked
    and the stack is duplicated: the first copy is labelled 0, the second
    copy 1. Each copy carries its own per-sample weights.

    :param n_samples_bsm: number of samples in the BSM blob.
    :param n_samples_sm: number of samples in the SM blob.
    :return: ``(X, y, w, w_min)`` - features, labels, weights and the
        minimal-weight constant ``min(2*eps/n_sm, 2*eps/n_bsm)``.
    """
    EPSILON = 0.01

    # Dataset creation (the generator's own labels are not used).
    bsm_points, _ = make_gaussian_quantiles(
        mean=(2, 0), cov=1.5, n_samples=n_samples_bsm, n_features=2,
        n_classes=1, random_state=1)
    sm_points, _ = make_gaussian_quantiles(
        mean=(-2, 0), cov=1.5, n_samples=n_samples_sm, n_features=2,
        n_classes=1, random_state=1)
    events = np.concatenate((bsm_points, sm_points))
    X = np.concatenate((events, events))

    # Labels: first copy -> 0, second copy -> 1.
    n_events = len(events)
    y = np.concatenate((np.zeros(n_events), np.ones(n_events)))

    n_bsm = len(bsm_points)
    n_sm = len(sm_points)

    # y = 0 (SM): SM events share weight 1, BSM events share EPSILON.
    w_label0 = np.concatenate((np.full(n_bsm, EPSILON / n_bsm),
                               np.ones(n_sm) / n_sm))
    # y = 1 (BSM): BSM events share weight 1, SM events share EPSILON.
    w_label1 = np.concatenate((np.ones(n_bsm) / n_bsm,
                               np.full(n_sm, EPSILON / n_sm)))
    w = np.concatenate((w_label0, w_label1))

    # Minimal weight, used by callers to avoid division by zero.
    w_min = min((2.0 * EPSILON) / n_sm, (2.0 * EPSILON) / n_bsm)
    return X, y, w, w_min
def gauss_quantiles_dataset(samples_amount: int, features_amount: int,
                            classes_amount: int, full_shuffle=True,
                            **kwargs):
    """
    Generate a random n-class classification dataset from the quantiles of
    a multi-dimensional gaussian distribution (scikit-learn API).

    :param samples_amount: Total amount of samples in the resulting dataset.
    :param features_amount: Total amount of features per sample.
    :param classes_amount: The amount of classes in the dataset.
    :param full_shuffle: forwarded as ``shuffle`` to scikit-learn.
    :param kwargs: Optional params: \
        - 'gauss_params': ``(mean, cov)`` of the distribution; defaults to
          ``(None, 1.)`` when absent.
    :return: features and target as numpy-arrays.
    """
    mean, cov = kwargs.get('gauss_params', (None, 1.))
    features, target = datasets.make_gaussian_quantiles(
        mean=mean,
        cov=cov,
        n_samples=samples_amount,
        n_features=features_amount,
        n_classes=classes_amount,
        shuffle=full_shuffle)
    return features, target
def iterate_data():
    """Yield one batch dictionary per epoch.

    For each of ``NUM_EPOCHS`` epochs, ``NUM_EXAMPLES`` examples are built
    by drawing a random number of candidate actions (a Gaussian-quantile
    sample) and passing them through ``get_biased_data``.
    """
    for epoch in range(NUM_EPOCHS):
        examples = []
        for _ in range(NUM_EXAMPLES):
            n_actions = numpy.random.choice(
                a=range(MIN_NUM_ACTIONS, MAX_NUM_ACTIONS))
            candidates = datasets.make_gaussian_quantiles(
                n_samples=n_actions,
                n_features=NUM_FEATURES,
                n_classes=2,
            )
            examples.append(get_biased_data(
                possible_actions_and_rewards=candidates,
                epoch=epoch,
            ))

        yield {
            # Shape = (num_examples, num_actions_possible, num_features)
            'possible_actions': [ex['possible_actions'] for ex in examples],
            # Shape = (num_examples, num_features)
            'chosen_actions': [ex['chosen_action'] for ex in examples],
            # Shape = (num_examples, 1)
            'rewards': [ex['reward'] for ex in examples],
        }
def main():
    """Print test accuracy and support-vector count of SVMs on random data.

    For the linear and the RBF kernel, ten independent Gaussian-quantile
    datasets are generated, split 70/30, and an SVC is trained on each.
    One bracketed row ``['kernel',round,score,vectors], `` is printed per
    run.
    """
    plt.figure(figsize=(8, 8))
    print("'kernel', 'round', 'score', 'vetores'")
    for kernel in ('linear', 'rbf'):
        for trial in range(10):
            # Fresh random dataset for every round.
            X, y = make_gaussian_quantiles(n_samples=300, n_features=2,
                                           n_classes=2)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.3)

            # Create the SVM and train it on the training split.
            clf = svm.SVC(kernel=kernel)
            clf.fit(X_train, y_train)

            accuracy = clf.score(X_test, y_test)
            n_vectors = clf.n_support_[0] + clf.n_support_[1]
            print("['{}',{},{},{}], ".format(kernel, trial, accuracy,
                                             n_vectors))
def gen_data():
    """Generate a Gaussian-quantile dataset and split it 90/10 into train/test.

    Reads the module-level ``n`` (sample count) and ``num_classes``.
    Targets are one-hot encoded with ``to_one_hot_vect``.

    :return: ``(train, test)`` - two lists of ``(sample, one_hot_target)``
        pairs; ``train`` holds the first 90 % of the samples and ``test``
        the remaining 10 %.
    """
    # Alternative generators tried earlier: make_classification (N clusters),
    # make_circles and make_moons.
    data, targets = datasets.make_gaussian_quantiles(
        mean=(0, 0), cov=1, n_samples=n, n_classes=num_classes)
    targets = [to_one_hot_vect(target, num_classes) for target in targets]

    # The np.float alias was removed from NumPy; the builtin is equivalent.
    data = np.asarray(data, dtype=float)
    targets = np.asarray(targets, dtype=float)

    # Integer division: slice bounds must be ints on Python 3.
    split = n * 9 // 10
    # The test set starts where the train set ends. It previously started
    # at n / 10 and therefore overlapped most of the training data.
    train = list(zip(data[:split], targets[:split]))
    test = list(zip(data[split:], targets[split:]))
    return train, test
def make_dataset(dataset, n_rows, n_cols, n_classes=2):
    """Generate a synthetic classification dataset and split it into train/test.

    :param dataset: one of ``'classification1'``, ``'classification2'``,
        ``'gaussian'`` or ``'blobs'``.
    :param n_rows: requested number of rows (adjusted below when small).
    :param n_cols: number of features.
    :param n_classes: number of classes (or blob centres).
    :return: ``X_train, X_test, y_train, y_test``.
    :raises ValueError: if ``dataset`` is not one of the known names.
    """
    np.random.seed(137)
    if n_rows * 0.25 < 4000:
        # Use at least 4000 test samples
        n_test = 4000
        if n_rows > 1000:
            # To avoid a large increase in test time (which is between
            # O(n_rows^2) and O(n_rows^3)).
            n_rows = int(n_rows * 0.75)
        # Generate the test samples on top of the training rows.
        n_rows += n_test
    else:
        # Must be an int: train_test_split treats a float test_size as a
        # fraction and rejects values >= 1.
        n_test = int(n_rows * 0.25)

    if dataset == 'classification1':
        X, y = make_classification(
            n_rows, n_cols, n_informative=2, n_redundant=0,
            n_classes=n_classes, n_clusters_per_class=1)
    elif dataset == 'classification2':
        X, y = make_classification(
            n_rows, n_cols, n_informative=2, n_redundant=0,
            n_classes=n_classes, n_clusters_per_class=2)
    elif dataset == 'gaussian':
        X, y = make_gaussian_quantiles(n_samples=n_rows, n_features=n_cols,
                                       n_classes=n_classes)
    elif dataset == 'blobs':
        X, y = make_blobs(n_samples=n_rows, n_features=n_cols,
                          centers=n_classes)
    else:
        raise ValueError("Unknown dataset: {!r}".format(dataset))

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=n_test)

    # correct case when not all classes made it into the training set:
    # overwrite the first n_classes labels so every class occurs once.
    if np.unique(y_train).size < n_classes:
        for i in range(n_classes):
            y_train[i] = i
    return X_train, X_test, y_train, y_test
def make_multiview_gaussian_quantiles(n_classes=3, n_views=3, n_features=3,
                                      n_samples='auto', rotate=True,
                                      shuffle=True, seed=None):
    """Create several views of one Gaussian-quantile dataset.

    The first view is the generated data itself. Every further view is the
    first one translated by a random offset (std ``10 * std(X_0)``),
    perturbed with small noise (std ``std(X_0) / 100``) and, when
    ``rotate`` is set, multiplied by a random orthogonal matrix.

    :param n_samples: number of samples, or ``'auto'`` for
        ``20 * n_classes``.
    :param shuffle: apply one common random permutation to labels and views.
    :param seed: seeds ``np.random`` and the dataset generator.
    :return: ``(views, y)`` - a list of float tensors (a single tensor when
        ``n_views`` is not greater than 1) and the label tensor.
    """
    np.random.seed(seed)
    if n_samples == 'auto':
        n_samples = n_classes * 20
    base_view, y = make_gaussian_quantiles(n_features=n_features,
                                           n_samples=n_samples,
                                           n_classes=n_classes,
                                           random_state=seed)
    spread = np.std(base_view)

    Xs = [base_view]
    for _ in range(n_views - 1):
        # One random offset per view, repeated for every sample.
        offset = np.random.normal(loc=0.0, scale=10 * spread,
                                  size=n_features)
        view = base_view + np.tile(offset, (len(base_view), 1))
        view += np.random.normal(loc=0.0, scale=spread / 100,
                                 size=view.shape)
        if rotate:
            view = view @ ortho_group.rvs(n_features)
        Xs.append(view)

    if shuffle:
        order = np.random.permutation(np.arange(len(y)))
        y = y[order]
        for view_idx in range(n_views):
            Xs[view_idx] = Xs[view_idx][order, :]

    labels = torch.from_numpy(y)
    if n_views > 1:
        return [torch.tensor(view).float() for view in Xs], labels
    return torch.tensor(Xs).squeeze(0).float(), labels
def backward_test():
    """Push Gaussian samples through ``model.backward`` and plot input vs. output.

    Uses the module-level ``model``, ``batch_size`` and ``kwargs``. The
    sampled input is drawn in subplot (2, 2, 4), the transformed points in
    subplot (2, 2, 3), and the figure is saved to ``result.png``.
    """
    latent = datasets.make_gaussian_quantiles(n_samples=1000)[0]
    sampled_z = latent.astype(np.float32)
    loader = torch.utils.data.DataLoader(sampled_z,
                                         batch_size=batch_size,
                                         shuffle=True,
                                         **kwargs)

    plt.subplot(2, 2, 4)
    plt.scatter(sampled_z[:, 0], sampled_z[:, 1], c='b', s=10)
    plt.title("INPUT: z ~ p(z)")

    model.eval()
    # Start from an empty (0, 2) array so the result is well defined even
    # when the loader yields nothing.
    pieces = [np.empty((0, 2))]
    with torch.no_grad():
        for batch in loader:
            pieces.append(model.backward(batch).numpy())
    z_all = np.concatenate(pieces)

    plt.subplot(2, 2, 3)
    plt.scatter(z_all[:, 0], z_all[:, 1], c='b', s=10)
    plt.title("OUTPUT: x = f^(-1)(z)")
    plt.savefig("result.png")
def gaussian_data_generator(dim=2, cls=5, objs_size=None, cov=None):
    """Generate ``cls`` Gaussian clusters and return standardized points with labels.

    :param dim: number of features per point.
    :param cls: number of clusters.
    :param objs_size: per-cluster sample counts; random when ``None``
        (each in 100..450, step 50).
    :param cov: per-cluster covariance scalars; random when ``None``
        (each in 100..400, step 100).
    :return: ``(points, labels)`` - the output of ``standardize_data`` on
        the stacked points, and the cluster index of each point.
    """
    if cov is None:
        cov = [random.randrange(100, 500, 100) for _ in range(cls)]
    if objs_size is None:
        # Random size for each cluster.
        objs_size = [random.randrange(100, 500, 50) for _ in range(cls)]
    # Random centre for each cluster: every coordinate in 100..180, step 20.
    means = [[random.randrange(100, 200, 20) for __ in range(dim)]
             for _ in range(cls)]

    point = []
    label = []
    for i in range(cls):
        tmp_point, tmp_label = make_gaussian_quantiles(
            mean=means[i], cov=cov[i], n_features=dim, n_classes=1,
            n_samples=objs_size[i])
        point.extend(tmp_point)
        # n_classes=1 yields all-zero labels, so adding i tags the cluster.
        label.extend(tmp_label + i)
    return standardize_data(np.array(point)), np.array(label)
def dataset(n, random_seed, classes):
    """Return a 2-feature Gaussian-quantile dataset with labels mapped by ``2*y - 1``.

    :param n: number of samples.
    :param random_seed: seed for ``np.random``; ``None`` leaves the global
        random state untouched. A seed of 0 is honoured.
    :param classes: number of quantile classes.
    :return: ``(X, y)`` where, for two classes, ``y`` is in {-1, +1}.
    """
    # Compare against None: a plain truthiness test would skip seed 0.
    if random_seed is not None:
        np.random.seed(random_seed)
    X, y = make_gaussian_quantiles(n_samples=n, n_features=2,
                                   n_classes=classes)
    return X, y * 2 - 1
def main():
    """Run two AdaBoost demos and print their accuracy.

    Example 1 fits and evaluates on a five-point hand-made set.
    Example 2 trains on the first 300 of 1300 Gaussian-quantile samples
    and evaluates on the remaining 1000.
    """
    # Example 1: tiny hand-made dataset, evaluated on the training points.
    X = np.array([[1.0, 2.1],
                  [2.0, 1.1],
                  [1.3, 1.0],
                  [1.0, 1.0],
                  [2.0, 1.0]])
    y = np.array([1.0, 1.0, -1.0, -1.0, 1.0])

    model = AdaBoostClassifier(n_estimators=5)
    model.fit(X, y)
    y_pred = model.predict(X)
    print(y_pred)
    print("Accuracy Score: {:.2%}".format(calculate_accuracy_score(y, y_pred)))

    # Example 2: Gaussian quantiles with a fixed head/tail split.
    X, y = make_gaussian_quantiles(n_samples=1300, n_features=10,
                                   n_classes=2)
    n_split = 300
    X_train, y_train = X[:n_split], y[:n_split]
    X_test, y_test = X[n_split:], y[n_split:]

    model = AdaBoostClassifier(n_estimators=100)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Accuracy Score: {:.2%}".format(
        calculate_accuracy_score(y_test, y_pred)))
def gaussian_data_generator(dim=2, cls=5, objs_size=None, cov=None):
    """Generate ``cls`` Gaussian clusters and return standardized points with labels.

    :param dim: number of features per point.
    :param cls: number of clusters.
    :param objs_size: per-cluster sample counts; random when ``None``
        (each in 100..450, step 50).
    :param cov: per-cluster covariance scalars; random when ``None``
        (each in 100..400, step 100).
    :return: ``(points, labels)`` - the output of ``standardize_data`` on
        the stacked points, and the cluster index of each point.
    """
    if cov is None:
        cov = [random.randrange(100, 500, 100) for _ in range(cls)]
    if objs_size is None:
        # Random size for each cluster.
        objs_size = [random.randrange(100, 500, 50) for _ in range(cls)]
    # Random centre for each cluster: every coordinate in 100..180, step 20.
    means = [[random.randrange(100, 200, 20) for __ in range(dim)]
             for _ in range(cls)]

    point = []
    label = []
    for i in range(cls):
        tmp_point, tmp_label = make_gaussian_quantiles(
            mean=means[i], cov=cov[i], n_features=dim, n_classes=1,
            n_samples=objs_size[i])
        point.extend(tmp_point)
        # n_classes=1 yields all-zero labels, so adding i tags the cluster.
        label.extend(tmp_label + i)
    return standardize_data(np.array(point)), np.array(label)
def gaussian_dataset(n_classes=3, n_views=3, n_features=3, n_samples='auto',
                     rotate=True, shuffle=True, seed=154):
    """Create several views of one Gaussian-quantile dataset.

    The first view is the generated data. Every further view is the first
    one shifted by a random per-feature offset, jittered per sample and,
    when ``rotate`` is set, multiplied by ``rvs(n_features, seed)``.

    :param n_features: feature count of every view (now also passed to the
        generator; it was previously fixed to 3 there, which broke any
        other value).
    :param n_samples: number of samples, or ``'auto'`` for
        ``20 * n_classes``.
    :param shuffle: apply one common random permutation to labels and views.
    :param seed: seeds ``np.random`` and is forwarded to ``rvs``.
    :return: ``(views, y)`` - a list of float tensors and the numpy labels.
    """
    np.random.seed(seed)
    n_samples = n_classes * 20 if n_samples == 'auto' else n_samples
    X_ori, y = make_gaussian_quantiles(cov=4.5,
                                       n_features=n_features,
                                       n_samples=n_samples,
                                       n_classes=n_classes,
                                       random_state=156)
    Xs = [X_ori]
    for _ in range(n_views - 1):
        # Global shift: random direction scaled by a random integer in [7, 30).
        X_new_view = X_ori + np.random.randn(n_features) * np.random.randint(
            7, 30)
        # Per-sample jitter: each row is 1-D, so len(x.shape) == 1 and one
        # random scalar is added to the whole row.
        X_new_view = np.array([
            x + np.random.rand(len(x.shape)) * np.random.randint(1, 3)
            for x in X_new_view
        ])
        if rotate:
            # NOTE(review): rvs presumably returns an n_features x n_features
            # rotation matrix - confirm against its definition.
            X_new_view = X_new_view @ rvs(n_features, seed)
        Xs.append(X_new_view)

    if shuffle:
        indexes = np.random.permutation(np.arange(len(y)))
        y = y[indexes]
        for view_idx in range(n_views):
            Xs[view_idx] = Xs[view_idx][indexes, :]
    return [torch.tensor(X).float() for X in Xs], y
def load_non_linearly_separable_data():
    """
    Generate non-linearly separable data (two Gaussian-quantile classes in
    2-D, fixed seed) and return the samples and class labels.

    :return: ``(x, y)`` with every label equal to 0 or 1.
    """
    samples, labels = make_gaussian_quantiles(n_features=2, n_classes=2,
                                              random_state=1)
    # Sanity check on the generator output: labels are binary.
    assert ((labels == 0) | (labels == 1)).all()
    return samples, labels
def generate_data():
    """Generate a reproducible 2-feature, 3-class Gaussian-quantile dataset.

    The global NumPy random state is seeded with 0 first.

    :return: X: input data, y: given labels
    """
    np.random.seed(0)
    samples, labels = datasets.make_gaussian_quantiles(n_features=2,
                                                       n_classes=3)
    return samples, labels
def main():
    """Fit a tree-based AdaBoost on a two-blob Gaussian set and plot its regions.

    Shows the raw data first, prints the training accuracy, then shows the
    predicted regions with the samples drawn on top.
    """
    # 2-D normal data split by quantile into two classes:
    # 500 samples, 2 features, covariance coefficient 2.
    X1, y1 = make_gaussian_quantiles(cov=2.0, n_samples=500, n_features=2,
                                     n_classes=2, random_state=1)
    # Same, but 400 samples with both feature means at 3 and covariance
    # coefficient 1.5.
    X2, y2 = make_gaussian_quantiles(mean=(3, 3), cov=1.5, n_samples=400,
                                     n_features=2, n_classes=2,
                                     random_state=1)
    # Merge both groups; the labels of the second group are flipped.
    X = np.concatenate((X1, X2))
    y = np.concatenate((y1, -y2 + 1))

    # Show the data.
    plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
    plt.show()

    # Fit AdaBoost with a decision tree as weak learner.
    dt = DecisionTreeClassifier(max_depth=2, min_samples_split=20,
                                min_samples_leaf=5)
    # The weak learner is passed positionally: the keyword was renamed from
    # ``base_estimator`` to ``estimator`` in scikit-learn 1.2 and the old
    # spelling was removed in 1.4; the first positional slot works on both.
    adb = AdaBoostClassifier(dt, n_estimators=300, learning_rate=0.8,
                             algorithm="SAMME")
    adb.fit(X, y)
    print("Score:", adb.score(X, y))

    # Evaluate the model on a dense grid to draw the decision regions.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))
    Z = adb.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
    plt.show()
def make_toy_dataset(n: int = 100, random_seed: int = None):
    """Generate a toy dataset for evaluating AdaBoost classifiers.

    :param n: number of samples.
    :param random_seed: seed for ``np.random``; ``None`` (default) leaves
        the global random state untouched. A seed of 0 is honoured.
    :return: ``(x, y)`` - 2-feature samples and labels in {-1, +1}.
    """
    # Compare against None: a plain truthiness test would skip seed 0.
    if random_seed is not None:
        np.random.seed(random_seed)
    x, y = make_gaussian_quantiles(n_samples=n, n_features=2, n_classes=2)
    # Map {0, 1} to {-1, +1}.
    return x, y * 2 - 1
def Makedata():
    """Return ``(X, Y)`` for the two-blob Gaussian-quantile toy problem.

    200 points (cov=2) around the default mean are stacked on 300 points
    (cov=1.5) around (3, 3); the labels of the second blob are flipped.
    """
    shared = dict(n_features=2, n_classes=2, shuffle=True, random_state=1)
    inner_x, inner_y = make_gaussian_quantiles(cov=2., n_samples=200,
                                               **shared)
    outer_x, outer_y = make_gaussian_quantiles(mean=(3, 3), cov=1.5,
                                               n_samples=300, **shared)
    X = np.concatenate((inner_x, outer_x))
    Y = np.concatenate((inner_y, 1 - outer_y))
    return X, Y
def get_nolinear_separable_dataset(self, random_state = 10, ext = True):
    """Return a 50-sample, 2-feature, 2-class Gaussian-quantile dataset.

    :param random_state: seed forwarded to the generator.
    :param ext: when true, a leading column of ones is prepended to X.
    :return: ``(X, Y)`` with Y reshaped into a single column.
    """
    from sklearn.datasets import make_gaussian_quantiles

    X, Y = make_gaussian_quantiles(n_samples=50, n_features=2, n_classes=2,
                                   random_state=random_state)
    if ext:
        ones_column = np.ones((X.shape[0], 1))
        X = np.hstack((ones_column, X))
    Y = Y.reshape(-1, 1)
    return (X, Y)
def data_creation():
    """Build four 100-point Gaussian blobs, plot them and return the points.

    The blobs are generated in this order: default mean with cov=3,
    mean (4, 4) with cov=1, mean (-6, -1) with cov=3 and mean (3, -2) with
    cov=3. Only the features are returned; the labels produced by the
    generator are not part of the result.

    :return: a 400 x 2 ``pandas.DataFrame`` (columns 0 and 1), blobs
        stacked in generation order.
    """
    blob_specs = (
        (None, 3.),
        ((4, 4), 1),
        ((-6, -1), 3.),
        ((3, -2), 3.),
    )
    blobs = [
        make_gaussian_quantiles(mean=mean, cov=cov, n_samples=100,
                                n_features=2, n_classes=1)[0]
        for mean, cov in blob_specs
    ]

    # Combine the gaussians
    X = pd.DataFrame(np.concatenate(blobs))

    # One colour per blob (rows 0-99, 100-199, 200-299, 300-399).
    plt.figure()
    for start, fmt in zip(range(0, 400, 100), ('ro', 'yo', 'go', 'o')):
        stop = start + 100
        plt.plot(X[0][start:stop], X[1][start:stop], fmt)
    plt.show()
    return X
def generate_nonlin_data(num_features, num_samples):
    """Generate a two-class Gaussian-quantile dataset, plot and save it.

    The first two features are scattered, the figure is written to
    ``non_lin_data`` and then shown.

    :param num_features: number of features per sample.
    :param num_samples: number of samples.
    :return: ``(X1, Y1)`` - samples and labels.
    """
    plt.figure()
    X1, Y1 = make_gaussian_quantiles(mean=(1, 1), cov=5,
                                     n_samples=num_samples,
                                     n_features=num_features, n_classes=2)
    plt.scatter(X1[:, 0], X1[:, 1], marker='.', c=Y1, cmap=plt.cm.Paired)
    plt.savefig('non_lin_data', bbox_inches='tight', pad_inches=0.1)
    plt.show()
    return X1, Y1
def load_extra_datasets():
    """Return a 200-sample, 2-feature, 2-class Gaussian-quantile dataset.

    The value is whatever ``make_gaussian_quantiles`` returns (samples and
    labels), generated with cov=0.7, shuffling on and no fixed seed.
    """
    sample_count = 200
    return datasets.make_gaussian_quantiles(
        mean=None,
        cov=0.7,
        n_samples=sample_count,
        n_features=2,
        n_classes=2,
        shuffle=True,
        random_state=None,
    )
def generate_data():
    """Return a 700-sample, 3-feature, 2-class Gaussian-quantile dataset.

    The value is whatever ``make_gaussian_quantiles`` returns (samples and
    labels), generated with cov=0.7, shuffling on and no fixed seed.
    """
    sample_count = 700
    return skd.make_gaussian_quantiles(
        mean=None,
        cov=0.7,
        n_samples=sample_count,
        n_features=3,
        n_classes=2,
        shuffle=True,
        random_state=None,
    )
def _generate(self, random_state):
    """Generate a Gaussian-quantile dataset sized by the request payload.

    Reads ``rows``, ``features`` and ``classes`` from ``api.payload``.

    :param random_state: seed forwarded to the generator.
    :return: the result of ``make_gaussian_quantiles`` (cov fixed to 1.0).
    """
    payload = api.payload
    return make_gaussian_quantiles(
        n_samples=payload['rows'],
        n_features=payload['features'],
        n_classes=payload['classes'],
        cov=1.0,
        random_state=random_state,
    )
def make_gaussian_data(self):
    """Build a noisy, scaled two-blob Gaussian-quantile dataset.

    Two blobs of ``self.N // 2`` samples each are stacked (second blob
    around (3, 3), labels flipped). The labels of a random sixth of the
    samples are then permuted among themselves, and the features are
    multiplied by ``self.scaling``.

    :return: ``(X, y)``.
    """
    # Floor division: the generator needs an integer sample count, and
    # ``self.N / 2`` is a float on Python 3.
    half = int(self.N // 2)
    X1, y1 = make_gaussian_quantiles(cov=0.75, n_samples=half,
                                     n_features=2, n_classes=2)
    X2, y2 = make_gaussian_quantiles(mean=(3, 3), cov=0.75, n_samples=half,
                                     n_features=2, n_classes=2)
    X = np.concatenate((X1, X2))
    y = np.concatenate((y1, - y2 + 1))

    # Label noise: pick 1/6 of the indices and shuffle their labels.
    yinds = np.random.choice(range(len(y)), size=int(round(len(y)/6)),
                             replace=False)
    yshuff = np.random.choice(y[yinds], size=len(yinds), replace=False)
    y[yinds] = yshuff

    X = X*self.scaling
    return X, y
def test_binary_classification_with_classification_pipeline():
    """Cross-validate an RBF-SVC pipeline on a synthetic binary problem.

    Passes when ``ClassificationPipeline.cross_validation`` returns results
    that are not ``None`` for 100 samples with 20 features.
    """
    # generate the dataset
    x, y = datasets.make_gaussian_quantiles(mean=None,
                                            cov=1.0,
                                            n_samples=100,
                                            n_features=20,
                                            n_classes=2,
                                            shuffle=True,
                                            random_state=1)

    # -- test with darwin ('linsvm' is the other classifier name used
    # with this pipeline)
    pipe = ClassificationPipeline(clfmethod='RBFSVC', cvmethod='10')
    results, _ = pipe.cross_validation(x, y)
    assert results is not None
from sklearn.datasets import make_blobs, make_classification, make_gaussian_quantiles
import matplotlib.pyplot as plt
import numpy as np
import sklearn.preprocessing as p


def saveFile(name, dataset):
    """Save *dataset* with ``numpy.save`` under the file name *name*."""
    np.save(name, dataset)


# Alternative generator: make_blobs(n_samples=100, n_features=2, centers=2)
X1, Y1 = make_gaussian_quantiles(n_samples=500, n_features=2, n_classes=2)

# Left panel: raw data.
plt.figure()
plt.subplot(1, 2, 1)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)
# print() call instead of the Python 2 print statement, which is a syntax
# error on Python 3.
print('X.shape is ', X1.shape, ' y is ', Y1)

# Right panel: the same data after standardisation.
plt.subplot(1, 2, 2)
scalar = p.StandardScaler()
X1 = scalar.fit_transform(X1)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)
plt.show()

# 80/20 head/tail split; only the training features are written to disk.
sp = int(0.8 * X1.shape[0])
X1train = X1[0:sp]
Y1train = Y1[0:sp]
X1test = X1[sp:]
Y1test = Y1[sp:]
saveFile("tempXtrain2", X1train)
import numpy as np
try:
    from sklearn.model_selection import train_test_split, cross_val_score
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.datasets import load_breast_cancer, load_iris, make_moons, make_gaussian_quantiles
try:
    from sklearn.datasets import load_boston
except ImportError:  # load_boston was removed in scikit-learn 1.2
    load_boston = None
from sklearn.metrics import mean_squared_error
try:
    from mlxtend.plotting import plot_decision_regions
except ImportError:  # older mlxtend exposes it under mlxtend.evaluate
    from mlxtend.evaluate import plot_decision_regions
import matplotlib.pyplot as plt

from pines.estimators import DecisionTreeRegressor, DecisionTreeClassifier
from pines.tree_builders import TreeType

if __name__ == '__main__':
    # Fit an oblivious decision tree on a 4-class Gaussian-quantile set,
    # print the fitted tree and save a plot of its decision regions.
    model = DecisionTreeClassifier(max_n_splits=3, max_depth=10,
                                   tree_type=TreeType.OBLIVIOUS)
    X, y = make_gaussian_quantiles(n_samples=10000, n_classes=4)

    model.fit(X, y)
    print(model.tree_)
    plot_decision_regions(X, y, clf=model, res=0.02, legend=2)
    plt.savefig('decision_boundary.png')
global imgx, imgy temppath = tempimage() plt.savefig(temppath, dpi=dpi) dx,dy = imagesize(temppath) w = min(W,dx) image(temppath,imgx,imgy,width=w) imgy = imgy + dy + 20 os.remove(temppath) size(W, HEIGHT+dy+40) else: def pltshow(mplpyplot): mplpyplot.show() # nodebox section end X, y = make_gaussian_quantiles(n_samples=13000, n_features=10, n_classes=3, random_state=1) n_split = 3000 X_train, X_test = X[:n_split], X[n_split:] y_train, y_test = y[:n_split], y[n_split:] bdt_real = AdaBoostClassifier( DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1) bdt_discrete = AdaBoostClassifier( DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1.5,
__author__ = 'darya'

import numpy as np
from sklearn import svm, datasets
from darwin.pipeline import ClassificationPipeline

# Synthetic binary problem: 100 samples with 20 features each.
# (datasets.make_hastie_10_2(n_samples=10, random_state=1) is another way
# to generate the data.)
n_samples = 100
n_features = 20
x, y = datasets.make_gaussian_quantiles(
    mean=None,
    cov=1.0,
    n_samples=n_samples,
    n_features=n_features,
    n_classes=2,
    shuffle=True,
    random_state=1,
)

# Cross-validate a linear SVM through the darwin pipeline.
classifier_name = 'linsvm'
cvmethod = '10'
n_feats = x.shape[1]

pipe = ClassificationPipeline(
    n_feats=n_feats,
    clfmethod=classifier_name,
    cvmethod=cvmethod,
)
results, metrics = pipe.cross_validation(x, y)
import numpy as np
import matplotlib.pyplot as plt
# The generators are imported from sklearn.datasets directly: the
# sklearn.datasets.samples_generator module no longer exists in current
# scikit-learn, while the public names below work on every version.
from sklearn.datasets import make_regression
from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs
from sklearn.datasets import make_gaussian_quantiles

## sklearn: make regression
# X: sample features, y: sample targets, coef: regression coefficients;
# 1000 samples with 1 feature each.
X, y, coef = make_regression(n_samples=1000, n_features=1, noise=10,
                             coef=True)
# Plot the samples and the true regression line.
plt.scatter(X, y, color='black')
plt.plot(X, X*coef, color='blue', linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()

## sklearn: make classification
# X1: sample features, Y1: class labels; 400 samples with 2 features,
# 3 classes, no redundant features, one cluster per class.
X1, Y1 = make_classification(n_samples=400, n_features=2, n_redundant=0,
                             n_clusters_per_class=1, n_classes=3)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)
plt.show()

## sklearn: make blobs
# X: sample features, y: cluster labels; 1000 samples with 2 features in
# 3 clusters centred at [-1,-1], [1,1], [2,2] with standard deviations
# [0.4, 0.5, 0.2].
X, y = make_blobs(n_samples=1000, n_features=2,
                  centers=[[-1, -1], [1, 1], [2, 2]],
                  cluster_std=[0.4, 0.5, 0.2])
plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
plt.show()

## sklearn: make gaussian quantiles
# 2-D normal distribution split by quantile into 3 groups; 1000 samples,
# feature means 1 and 2, covariance coefficient 2.
X1, Y1 = make_gaussian_quantiles(n_samples=1000, n_features=2, n_classes=3,
                                 mean=[1, 2], cov=2)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_gaussian_quantiles

if __name__ == '__main__':
    # First figure: an unlabeled Gaussian cloud.
    fig = plt.figure(figsize=(9,7))
    ax = fig.add_subplot(111)

    X, y = make_gaussian_quantiles(n_features=2, n_classes=1)
    ax.scatter(X[:, 0], X[:, 1], marker='o', c='k', alpha=0.6)

    ax.set_xlabel('$x_1$')
    ax.set_ylabel('$x_0$')
    ax.set_xticks([])
    ax.set_yticks([])

    plt.title("Cluster Analysis")
    plt.savefig('/Users/benjamin/Desktop/cluster.png')

    # Second figure: the same plot with five random points on top.
    kx = np.random.uniform(-3, 3, 5)
    ky = np.random.uniform(-3, 3, 5)
    # One colour per point as an explicit list: current Matplotlib no
    # longer accepts a string of single-letter colours as a sequence.
    plt.scatter(kx, ky, c=list('rbgyc'), s=50)
    plt.savefig('/Users/benjamin/Desktop/kpoints.png')
    plt.show()
s=25, edgecolor='k') plt.subplot(323) plt.title("Two informative features, two clusters per class", fontsize='small') X2, Y2 = make_classification(n_features=2, n_redundant=0, n_informative=2) plt.scatter(X2[:, 0], X2[:, 1], marker='o', c=Y2, s=25, edgecolor='k') plt.subplot(324) plt.title("Multi-class, two informative features, one cluster", fontsize='small') X1, Y1 = make_classification(n_features=2, n_redundant=0, n_informative=2, n_clusters_per_class=1, n_classes=3) plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1, s=25, edgecolor='k') plt.subplot(325) plt.title("Three blobs", fontsize='small') X1, Y1 = make_blobs(n_features=2, centers=3) plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1, s=25, edgecolor='k') plt.subplot(326) plt.title("Gaussian divided into three quantiles", fontsize='small') X1, Y1 = make_gaussian_quantiles(n_features=2, n_classes=3) plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1, s=25, edgecolor='k') plt.show()
from sklearn.datasets import make_gaussian_quantiles
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import GradientBoostingClassifier as GBC
# sklearn.cross_validation was removed in scikit-learn 0.20; its
# replacement (available since 0.18) is bound to the same alias.
from sklearn import model_selection as CV
import matplotlib.pyplot as plt

X, Y = make_gaussian_quantiles(n_features=2, n_samples=2000, n_classes=2)

plt.scatter(X[:, 0], X[:, 1], marker='o', c=Y)
plt.show()

# Two stratified folds. The splits are materialised into a list so that
# ``skf`` can still be iterated directly (and more than once), as the old
# StratifiedKFold(Y, n_folds=2) object could.
skf = list(CV.StratifiedKFold(n_splits=2).split(X, Y))

for train_index, test_index in skf:
    print("TRAIN:", len(train_index), "TEST:", len(test_index))
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

# NOTE(review): the plots are placed after the loop, so they show the split
# of the last fold - confirm this matches the intended layout.
plt.scatter(X_train[:, 0], X_train[:, 1], marker='o', c=Y_train)
plt.show()

plt.scatter(X_test[:, 0], X_test[:, 1], marker='o', c=Y_test)
plt.show()

# Previously tried classifier:
# GBC(n_estimators=25, learning_rate=0.18, min_samples_leaf=6,
#     max_features=0.8, subsample=0.9, verbose=2, max_depth=10)
trainscore = list()
testscore = list()
# Author: Noel Dawe <*****@*****.**>
#
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_gaussian_quantiles


# Dataset: two Gaussian-quantile blobs; the labels of the second (shifted)
# blob are flipped before both are stacked.
X1, y1 = make_gaussian_quantiles(
    cov=2., n_samples=200, n_features=2, n_classes=2, random_state=1)
X2, y2 = make_gaussian_quantiles(
    mean=(3, 3), cov=1.5, n_samples=300, n_features=2, n_classes=2,
    random_state=1)
X = np.vstack((X1, X2))
y = np.hstack((y1, 1 - y2))

# Model: 200 boosted decision stumps, discrete SAMME algorithm.
bdt = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    algorithm="SAMME",
    n_estimators=200,
)
bdt.fit(X, y)

plot_colors = "br"
# Top panel: 2000 samples, 8 informative features, 4 clusters per class
# (first two features shown).
plt.subplot(311)
plt.title("One informative feature, one cluster per class", fontsize='small')
X, Y = make_classification(n_samples=2000, n_features=8, n_redundant=0,
                           n_informative=8, n_clusters_per_class=4,
                           random_state=13)
plt.scatter(X[:, 0], X[:, 1], marker='o', c=Y)

# Middle panel: 300 samples, 3 informative features, 2 clusters per class.
plt.subplot(312)
plt.title("Two informative features, one cluster per class", fontsize='small')
X, Y = make_classification(n_samples=300, n_features=3, n_redundant=0,
                           n_informative=3, n_clusters_per_class=2,
                           random_state=13)
plt.scatter(X[:, 0], X[:, 1], marker='o', c=Y, cmap=plt.cm.Paired)

# Bottom panel: 500 Gaussian samples split into 2 quantile classes.
plt.subplot(313)
plt.title("Gaussian divided into three quantiles", fontsize='small')
X, Y = make_gaussian_quantiles(n_samples=500, n_features=2, n_classes=2,
                               mean=None, cov=1.0, random_state=13)
plt.scatter(X[:, 0], X[:, 1], marker='o', c=Y)

# ========================== import real data ===========================
# NOTE: the sonar data loaded second overwrites the breast data.
data = scipy.io.loadmat('breastdata.mat')
X = data['X']
Y = data['Y']
data = scipy.io.loadmat('sonar.mat')
X = data['X']
Y = data['Y']

# ========================== standardize data ===========================
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Use the instance created above; the capitalised name is undefined.
X = scaler.fit_transform(X)
X_mean = np.mean(X, axis=0)