def test_figure_k_acurracy(self):
    """Plot the mean cross-validated accuracy obtained for a range of ``k``.

    The corpus under ``./test_file2`` is loaded, every document is segmented
    with ``jieba.cut`` and re-joined with single spaces, and a
    ``bayesClassifier(MultinomialNB, k=k)`` pipeline is scored for each
    candidate ``k`` with a 3-split ``ShuffleSplit`` (30% held out,
    ``random_state=0``).  The per-``k`` mean score is shown as a scatter plot.

    Note that the points are drawn at their position in ``ks`` (0, 1, 2, ...),
    not at the numeric value of ``k``, even though the axis is labelled 'k'.

    NOTE(review): ``k`` is presumably the number of features kept by
    ``bayesClassifier`` ('all' meaning no limit) -- confirm against that class.
    """
    # Load the data.
    dataset = load_files('./test_file2')

    # Segment every document into space-separated tokens.
    datasets = [' '.join(jieba.cut(document)) for document in dataset.data]

    # Train and score one model per candidate k.
    cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
    ks = [
        1, 2, 3, 5, 10, 20, 30, 50, 100, 200, 300, 500, 1000, 2000, 3000,
        5000, 10000, 20000, 'all'
    ]
    accuracys = []
    for k in ks:
        classifier = bayesClassifier(MultinomialNB, k=k)
        clf = make_pipeline(classifier)
        fold_scores = cross_val_score(clf, datasets, dataset.target, cv=cv)
        accuracys.append(average(fold_scores))

    fig, ax = plt.subplots()
    ax.scatter(range(len(ks)), accuracys)
    ax.set_xlabel('k')
    ax.set_ylabel('accuracy')
    plt.show()

    # Parenthesised single-argument print works on both Python 2 and 3.
    print('test_figure_k_acurracy done!')
    print('-' * 70)
def processData(data):
    """Separate each row into head/tail and cut the rows 90/10.

    For every row, ``row[0]`` is collected into one list and ``row[1:]`` into
    another.  The first ``int(len(data) * 0.9)`` entries of each list form the
    leading partition and the rest form the trailing partition.

    Returns:
        A 4-tuple ``(leading_tails, leading_heads, trailing_tails,
        trailing_heads)``.
    """
    cut = int(len(data) * 0.9)
    tails = [record[1:] for record in data]
    heads = [record[0] for record in data]
    return tails[:cut], heads[:cut], tails[cut:], heads[cut:]
def processData(data):
    """Split rows into (rest-of-row, first-element) lists, 90% / 10%.

    Each row contributes ``row[1:]`` to the first output family and ``row[0]``
    to the second.  The boundary between the two partitions is
    ``int(len(data) * 0.9)``.

    Returns:
        ``(rest[:b], first[:b], rest[b:], first[b:])`` where ``b`` is the
        boundary index described above.
    """
    boundary = int(len(data) * 0.9)
    pairs = [(entry[1:], entry[0]) for entry in data]
    rest = [pair[0] for pair in pairs]
    first = [pair[1] for pair in pairs]
    leading = (rest[:boundary], first[:boundary])
    trailing = (rest[boundary:], first[boundary:])
    return leading + trailing
def test_pipline_cross_val_score(self):
    """Print the cross-validation scores of a ``bayesClassifier`` pipeline.

    The corpus under ``./test_file2`` is loaded, every document is segmented
    with ``jieba.cut`` and re-joined with single spaces, and a
    ``bayesClassifier(MultinomialNB, k=1000)`` pipeline is scored with a
    3-split ``ShuffleSplit`` (30% held out, ``random_state=0``).
    """
    # Load the data.
    dataset = load_files('./test_file2')

    # Segment every document into space-separated tokens.
    datasets = [' '.join(jieba.cut(document)) for document in dataset.data]

    # Train and score the model.
    classifier = bayesClassifier(MultinomialNB, k=1000)
    cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
    clf = make_pipeline(classifier)

    # Parenthesised single-argument print works on both Python 2 and 3.
    print(cross_val_score(clf, datasets, dataset.target, cv=cv))
    print('test_pipline_cross_val_score done!')
    print('-' * 70)
def make_train_test_split(X, Y, im_files, explain_files, class_names, explain_interp):
    """Split five parallel inputs 80/20 and bundle each side into a dict.

    ``X``, ``Y``, ``im_files``, ``explain_files`` and ``explain_interp`` are
    passed together to ``train_test_split`` with ``test_size=0.2`` and
    ``random_state=0``.  ``class_names`` is not split; the same object is
    attached to both bundles.

    Returns:
        ``(first, second)`` -- two dicts with keys ``'X'``, ``'Y'``,
        ``'im_files'``, ``'explain_files'``, ``'explain_interp'`` and
        ``'class_names'``.  Assuming ``train_test_split`` returns its outputs
        as ``[a_train, a_test, b_train, b_test, ...]``, ``first`` is the
        training side and ``second`` the test side.
    """
    pieces = train_test_split(X, Y, im_files, explain_files, explain_interp,
                              test_size=0.2, random_state=0)
    # Order matters: field number p lives at pieces[2 * p + side].
    field_names = ('X', 'Y', 'im_files', 'explain_files', 'explain_interp')

    def _bundle(side):
        """Collect every field for one side (0 or 1) of the split."""
        bundle = {name: pieces[side + 2 * pos]
                  for pos, name in enumerate(field_names)}
        bundle['class_names'] = class_names
        return bundle

    return _bundle(0), _bundle(1)
def load_adult(*, valid_split=0.25, path="adult.npz", device=None):
    """Load the ``.npz`` archive at ``path`` into train/valid/test datasets.

    The archive must contain ``x_<s>``, ``y_<s>`` and ``attr_<s>`` arrays for
    ``<s>`` in ``("train", "test")``.  For each suffix a ``TensorDataset`` of
    ``(X, y, is_protected)`` is built, where ``X`` is ``x_<s>`` with column
    index 13 removed, ``y`` is column 0 of ``y_<s>`` and ``is_protected`` is
    1 where column 0 of ``attr_<s>`` is positive and 0 elsewhere.

    Args:
        valid_split: Each training row is moved to the validation set when an
            independent ``torch.rand`` draw is below this value, so the
            validation size is random rather than an exact fraction.
        path: Location of the ``.npz`` archive.
        device: Passed to ``Tensor.to`` for every tensor.

    Returns:
        ``(train, valid, test)`` as three ``TensorDataset`` objects.
    """
    datasets = []
    # np.load on an .npz returns a lazily-read archive that holds the file
    # open; the context manager releases it once the arrays are extracted.
    with np.load(path) as data:
        for suffix in ("train", "test"):
            # Original author's note: the 13th column is always zero.
            X = torch.FloatTensor(np.delete(data[f"x_{suffix}"], 13, 1))
            y = torch.FloatTensor(data[f"y_{suffix}"][:, 0])
            # np.cast was removed in NumPy 2.0; astype(int) is equivalent.
            protected_flags = (data[f"attr_{suffix}"][:, 0] > 0).astype(int)
            is_protected = torch.ByteTensor(protected_flags)
            datasets.append(TensorDataset(X.to(device), y.to(device),
                                          is_protected.to(device)))

    full_train, test = datasets
    in_valid = torch.rand(len(full_train)) < valid_split
    train = TensorDataset(*full_train[~in_valid])
    valid = TensorDataset(*full_train[in_valid])
    return train, valid, test
# Command-line options: training file, test file, algorithm type and an
# optional model name.
ap.add_argument("-ds", "--dataset", required=True, help="copy.txt")
ap.add_argument("-ts", "--testset", required=True, help="textcopy.txt")
ap.add_argument("-at", "--algotype", required=True,
                help="dtc/dtr/svm/gnb/erfc/bagc/model")
ap.add_argument("-m", "--model", required=False,
                help="dtc/dtr/svm/gnb/erfc/bagc")
args = vars(ap.parse_args())

# Each line of the input files is a single-space-separated row of integers.
# The files are read inside `with` so the handles are closed promptly.
with open(args["dataset"], 'r') as train_file:
    lines_train = train_file.readlines()
datasets = [list(map(int, n.split(' '))) for n in lines_train]

with open(args["testset"], 'r') as test_file:
    lines_test = test_file.readlines()
testset = [list(map(int, m.split(' '))) for m in lines_test]

# The last row of each file is the target; every row before it is data.
X = datasets[:-1]
target = datasets[-1]
test = testset[:-1]
test_target = testset[-1]
# X = np.c_[(0, 0, 0, 5, 5, 0, 0, 0, 2, 7, 5, 0, 1, 3, 4, 1, 0, 0, 0, 0),
# (0, 0, 0, 5, 3, 0, 0, 0, 1, 13, 3, 0, 0, 2, 5, 1, 0, 0, 0, 0),
# (0, 0, 0, 4, 5, 1, 0, 0, 3, 8, 3, 0, 0, 3, 3, 2, 0, 0, 1, 0),
# (0, 1, 1, 0, 4, 0, 0, 0, 2, 11, 5, 0, 0, 4, 3, 1, 0, 1, 0, 0),
def test_correlation_examples(N=500):
    '''
    Score and plot the classic "correlation examples" point clouds.

    Python code of correlation examples from beaucronin :
    https://gist.github.com/beaucronin/2509755
    Python translation of examples
    http://en.wikipedia.org/wiki/File:Correlation_examples2.svg
    Title: An example of the correlation of x and y for various
    distributions of (x,y) pairs
    Author: Denis Boigelot

    Twenty-one 2-D samples are generated: seven bivariate gaussians with
    decreasing correlation, seven rotated degenerate gaussians, and seven
    non-linear shapes.  Each sample is scored with ``compute_normMI``; the
    score is printed and used as the title of the sample's panel in a
    3 x 7 grid of scatter plots.

    Parameters
    ----------
    N : the number of samples

    Returns
    -------
    None.  The figure is created on the current matplotlib backend but
    ``plt.show()`` is not called here.
    '''
    from numpy.random import (
        uniform as runif,
        multivariate_normal as rmvn,
        normal as rnorm,
    )
    from numpy import linspace, array, vstack
    from math import pi, cos, sin, pow, sqrt
    import matplotlib.pyplot as plt

    datasets = []
    MI_scores = []

    def _record(sample, template, *leading):
        """Score ``sample``, print ``template % (leading + (mi,))``, store both."""
        # NOTE(review): this first result is overwritten on the next line; the
        # call is kept only in case MI_RenyiCC_Multi has side effects -- confirm
        # and drop it if not.
        mi = MI_RenyiCC_Multi(sample, type="c")
        mi = compute_normMI(sample, type="c")
        print(template % (leading + (mi,)))
        MI_scores.append(mi)
        datasets.append(sample)

    nonlinear_msg = "MI score of two continuous non-linear correlated variables : %f"

    # Row 1: bivariate gaussians with decreasing correlation.
    for corr in [1., .8, .4, 0., -.4, -.8, -1.]:
        x = rmvn([0., 0.], [[1., corr], [corr, 1.]], N)
        _record(
            x,
            "MI score of two continuous linear correlated (correlation degree of %f) gaussian variables : %f",
            corr,
        )

    # Row 2: a perfectly correlated gaussian rotated to different slopes.
    for phi in [0., pi/12., pi/6., pi/4., pi/2. - pi/6., pi/2. - pi/12, pi/2]:
        x = rmvn([0., 0.], [[1., 1.], [1., 1.]], N)
        x = rotate(phi, x)
        _record(
            x,
            "MI score of two continuous linear correlated (correlation slope of %f) gaussian variables : %f",
            phi,
        )

    # Row 3: non-linear relationships.
    a = linspace(-1, 1, N)

    # Draw a scalar noise term: a size-1 array here would make the rows
    # ragged, which current NumPy refuses to turn into an array.
    x = array([(x0, 4. * pow(x0 * x0 - .5, 2.) + runif(-1./3., 1./3.))
               for x0 in a])
    _record(
        x,
        "MI score of two continuous non-linear correlated (y=4*(x0^2-0.5)^2+c) variables : %f",
    )

    x = rotate(-pi/8., array([(x0, runif(-1., 1.)) for x0 in a]))
    _record(x, nonlinear_msg)

    # Same cloud rotated by a further -pi/8.
    x = rotate(-pi/8, x)
    _record(x, nonlinear_msg)

    x = array([(x0, x0 * x0 + runif(-.5, .5)) for x0 in a])
    _record(x, nonlinear_msg)

    signs = [1. if runif() < .5 else -1. for _ in range(N)]
    x = array([(x0, (x0 * x0 + runif(0., .5)) * sign)
               for x0, sign in zip(a, signs)])
    _record(x, nonlinear_msg)

    x = array([(sin(x0 * pi) + rnorm(0., .125),
                cos(x0 * pi) + rnorm(0., .125)) for x0 in a])
    _record(x, nonlinear_msg)

    # Four unit-variance blobs; the sample size must be an int on every
    # Python version, hence the explicit conversion.
    quarter = int(round(N / 4.))
    identity = [[1., 0.], [0., 1.]]
    centres = ([3., 3], [-3., 3], [-3., -3], [3., -3])
    x = vstack([rmvn(centre, identity, quarter) for centre in centres])
    _record(x, nonlinear_msg)

    # Plot the datasets, mimicking the original plot from Wikipedia.
    plt.figure()
    print("MI_scores:", MI_scores)

    # Axis limits for the non-linear panels; every other panel uses +/-4.
    z = sqrt(2. + sqrt(2.)) / sqrt(2.)
    panel_limits = {
        14: ([-1, 1], [-1./3., 1.+1./3.]),
        15: ([-z, z], [-z, z]),
        16: ([-sqrt(2.), sqrt(2.)], [-sqrt(2.), sqrt(2.)]),
        17: ([-1, 1], [-.5, 1.5]),
        18: ([-1.5, 1.5], [-1.5, 1.5]),
        19: ([-1.5, 1.5], [-1.5, 1.5]),
        20: ([-7, 7], [-7, 7]),
    }
    default_limits = ([-4, 4], [-4, 4])

    for i, (points, score) in enumerate(zip(datasets, MI_scores)):
        plt.subplot(3, 7, i+1)
        xs = [point[0] for point in points]
        ys = [point[1] for point in points]
        plt.plot(xs, ys, '.', markersize=1.)
        plt.title("%.4f" % score)
        plt.xticks([])
        plt.yticks([])
        ax = plt.gca()
        ax.set_axis_off()
        x_limits, y_limits = panel_limits.get(i, default_limits)
        plt.xlim(x_limits)
        plt.ylim(y_limits)
        ax.set_aspect('equal', adjustable='datalim')
from data.load import df1pos #dfs, dfp, dfmlle as isomap, spectral1, pca, mlle from numpy import random from numpy.random import randint as rndint datalist = [dfi, dfs, dfp, dfmlle] datasets = [] for j in datalist: names = j.values[:, 0] k = (j.values[:, 1:]).astype(float) rowz = (k.shape)[0] y = (noisy_circles[1])[0:rowz] z = (k, y) datasets.append(z) #--------------------------------------------------------- colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk']) colors = np.hstack([colors] * 20) clustering_names = [ 'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift', 'SpectralClustering', 'Ward', 'AgglomerativeClustering', 'DBSCAN', 'Birch' ] plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5)) plt.subplots_adjust(left=.02, right=.98, bottom=.001,
scores['value'].append(metrics[i]) scores['dataset'].append(dataset) scores_['sensitivity'].append(sens) scores_['specificity'].append(spec) scores_['accuracy'].append(acc) scores_['dataset'].append(dataset) scores_['algorithm'].append('MLP') return scores, scores_ if __name__ == "__main__": datasets = [('../data/features_1.npy', '../data/target_1.npy'), ('../data/mean_imputation_features_2.npy', '../data/target_1.npy' )] datasets.append(('../data/regression_imputation_features_2.npy', '../data/target_1.npy' )) resultnames = [ 'dataset1', 'dataset2', 'dataset3' ] for i, (dataset, resultname) in enumerate(zip(datasets, resultnames)): features, targets = dataset X = np.load(features) Y = np.load(targets) n_classes = len(np.unique(Y)) print(n_classes) ### Stratify DATA ### skf = StratifiedKFold(n_splits=4)