def test_classifier(self):
    train_df = datasets.load("titanic")[["Name", "Survived"]]
    y = np.array(train_df.pop("Survived"))
    X_train, X_test, y_train, y_test = train_test_split(
        train_df, y, test_size=0.2, random_state=0)
    X_train = DataFrameContainer(
        "TrainSet", dataset_instance=X_train,
        resource_manager=self.mock_resource_manager)
    X_test = DataFrameContainer(
        "TestSet", dataset_instance=X_test,
        resource_manager=self.mock_resource_manager)
    y_train = NdArrayContainer(
        "TrainLabel", dataset_instance=y_train,
        resource_manager=self.mock_resource_manager)
    y_test = NdArrayContainer(
        "TestLabel", dataset_instance=y_test,
        resource_manager=self.mock_resource_manager)
    X_train.set_feature_groups(["text"])
    X_test.set_feature_groups(["text"])
    est_cls_list = [
        TsvdTransformer,
        NmfTransformer,
        LsiTransformer,
        LdaTransformer,
        RpTransformer,
    ]
    for cls in est_cls_list:
        print("=========================")
        print(cls.__name__)
        print("=========================")
        tokenizer = SimpleTokenlizer(**get_default_hp_of_cls(SimpleTokenlizer))
        tokenizer.in_feature_groups = "text"
        tokenizer.out_feature_groups = "token"
        transformer = cls(**get_default_hp_of_cls(cls))
        transformer.in_feature_groups = "token"
        transformer.out_feature_groups = "num"
        classifier = RandomForestClassifier(
            **get_default_hp_of_cls(RandomForestClassifier))
        pipeline = ML_Workflow([
            ("tokenizer", tokenizer),
            ("transformer", transformer),
            ("classifier", classifier),
        ], resource_manager=self.mock_resource_manager)
        start = time()
        pipeline.fit(X_train, y_train, X_test, y_test)
        y_pred = pipeline.predict(X_test)
        score = accuracy_score(y_test.data, y_pred)
        end = time()
        print("score:", score)
        print("time:", end - start)
        self.assertGreater(score, 0.6)
        print('\n' * 2)
from matplotlib.pyplot import scatter, show, hist
from numpy import dot, exp, sqrt, pi
from numpy.linalg import inv, det


def gaussian_anomaly_detection(data):
    """Score each row by its multivariate Gaussian density."""
    rows, cols = data.shape
    mu = data.mean(axis=0)
    diff = data - mu
    cov = dot(diff.T, diff) / rows
    # Only the diagonal of diff @ inv(cov) @ diff.T is needed (one
    # Mahalanobis term per point), so compute it row-wise instead of
    # forming the full rows-by-rows matrix and summing over it.
    mahalanobis = (dot(diff, inv(cov)) * diff).sum(axis=1)
    a = exp(-0.5 * mahalanobis)
    b = sqrt(pow(2 * pi, cols) * det(cov))
    return a / b


from sklearn.datasets import load_wine as load
from sklearn.decomposition import PCA

data = PCA(2).fit_transform(load().data)
res = gaussian_anomaly_detection(data)
colors = []
for x in res:
    if x < res.mean() - 2 * res.std() or x > res.mean() + 2 * res.std():
        colors.append('red')
    else:
        colors.append('green')
scatter(data[:, 0], data[:, 1], c=colors)
show()
hist(res, bins=100)
show()
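# As a sanity check on the hand-rolled density above, the same scores can be
# reproduced with scipy.stats.multivariate_normal. A minimal sketch, assuming
# SciPy is available; the `rv` name and the assert are ours, not the original:
import numpy as np
from scipy.stats import multivariate_normal

mu = data.mean(axis=0)
cov = np.dot((data - mu).T, data - mu) / len(data)
rv = multivariate_normal(mean=mu, cov=cov)
assert np.allclose(rv.pdf(data), gaussian_anomaly_detection(data))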
# `knn`, `all_data`, `all_labels`, `x_train`, `y_train` and the integer split
# index `train_test_split` are defined earlier in this file; that name is
# later shadowed by scikit-learn's train_test_split import.
x_test, y_test = all_data[train_test_split:], all_labels[train_test_split:]
predictions = knn(5, x_train, y_train, x_test)
correct = 0
for i in range(len(predictions)):
    if predictions[i] == y_test[i]:
        correct += 1
print('Accuracy: ', correct / len(predictions))

#####################################
from sklearn.datasets import load_iris as load
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

x_train, x_test, y_train, y_test = train_test_split(
    load().data, load().target, test_size=0.2, shuffle=True)
predictions = knn(5, x_train, y_train, x_test)
print('My accuracy:', accuracy_score(y_test, predictions))

##########################################################################################
model = KNeighborsClassifier(n_neighbors=5, algorithm='brute', weights='uniform')
model.fit(x_train, y_train)
predictions = model.predict(x_test)
print('Scikit accuracy:', accuracy_score(y_test, predictions))
##########################################################################################
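# The custom knn() helper is not shown in this snippet. A hypothetical
# stand-in consistent with the call knn(5, x_train, y_train, x_test),
# assuming a plain Euclidean-distance majority vote (this implementation
# is ours, not the original file's):
import numpy as np
from collections import Counter


def knn(k, x_train, y_train, x_test):
    predictions = []
    for point in x_test:
        # Distance from this test point to every training point
        distances = np.linalg.norm(np.asarray(x_train) - point, axis=1)
        nearest = np.argsort(distances)[:k]
        # Majority vote among the k nearest labels
        votes = Counter(np.asarray(y_train)[nearest])
        predictions.append(votes.most_common(1)[0][0])
    return predictions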
'''
This is a snippet of code showing how to train a multiclass MKL algorithm

Author: Ivano Lauriola, [email protected]
'''

# load data
print('loading \'iris\' dataset...', end='')
from sklearn.datasets import load_iris as load
ds = load()
X, Y = ds.data, ds.target
print('done')

'''
WARNING: be sure that your matrix is not sparse! EXAMPLE:
from sklearn.datasets import load_svmlight_file
X,Y = load_svmlight_file(...)
X = X.toarray()
'''

# preprocess data
print('preprocessing data...', end='')
from MKLpy.preprocessing import normalization, rescale_01
X = rescale_01(X)     # feature scaling in [0,1]
X = normalization(X)  # ||X_i||_2^2 = 1

# train/test split
from sklearn.model_selection import train_test_split
Xtr, Xte, Ytr, Yte = train_test_split(
    X, Y, test_size=.25, random_state=42)  # split arguments assumed; the original snippet is truncated here
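# The snippet stops before the step its docstring promises: the actual MKL
# training. A hedged continuation, assuming MKLpy's EasyMKL solver over a
# family of homogeneous polynomial kernels is the intended tool (the kernel
# degrees and the lam value are our choices, not the author's):
from MKLpy.metrics import pairwise
from MKLpy.algorithms import EasyMKL
from sklearn.metrics import accuracy_score

KLtr = [pairwise.homogeneous_polynomial_kernel(Xtr, degree=d) for d in range(1, 5)]
KLte = [pairwise.homogeneous_polynomial_kernel(Xte, Xtr, degree=d) for d in range(1, 5)]

clf = EasyMKL(lam=0.1).fit(KLtr, Ytr)  # lam in [0,1] trades margin against variance
y_pred = clf.predict(KLte)
print('EasyMKL accuracy:', accuracy_score(Yte, y_pred))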
from sklearn.datasets import load_digits as load
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20
from DecisionTree import *
from sklearn.metrics import accuracy_score
from RF import RandomForestClassifier
# from sklearn.ensemble import RandomForestClassifier

data = load()
X = data.data
y = data.target
print(X.shape)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)

dtree = RandomForestClassifier(n_estimators=10)
dtree.fit(Xtrain, ytrain)
y_pred = dtree.predict(Xtest)
print(accuracy_score(ytest, y_pred))
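# The commented-out import suggests the local RF implementation is meant to
# be compared against scikit-learn's version. A minimal sketch of that
# comparison (the SkRF alias is ours):
from sklearn.ensemble import RandomForestClassifier as SkRF

sk_model = SkRF(n_estimators=10)
sk_model.fit(Xtrain, ytrain)
print(accuracy_score(ytest, sk_model.predict(Xtest)))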
# Importing Modules
from sklearn import datasets
import matplotlib.pyplot as plt

# Loading dataset (sklearn ships iris as a bundled loader, not a CSV path;
# the attributes used below are those of the returned Bunch object)
iris_df = datasets.load_iris()

# Available methods on dataset
print(dir(iris_df))

# Features
print(iris_df.feature_names)

# Targets
print(iris_df.target)

# Target Names
print(iris_df.target_names)
label = {0: 'red', 1: 'blue', 2: 'green'}

# Dataset Slicing
x_axis = iris_df.data[:, 0]  # Sepal Length
y_axis = iris_df.data[:, 2]  # Petal Length (column 2, not sepal width)

# Plotting
plt.scatter(x_axis, y_axis, c=iris_df.target)
plt.show()
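# The `label` colour map above is defined but never used. A small hedged
# variant that plots each class separately so the map feeds a legend (this
# rewrite is ours, not part of the original snippet):
for target, colour in label.items():
    mask = iris_df.target == target
    plt.scatter(iris_df.data[mask, 0], iris_df.data[mask, 2],
                color=colour, label=iris_df.target_names[target])
plt.legend()
plt.show()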
from sklearn.datasets import load_boston as load  # load_boston requires scikit-learn < 1.2
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from numpy import mean
from ACA import ACATransformer
from sklearn import cluster
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(*load(return_X_y=True))


def score(estimator, new_X, new_y):
    # Refit on the supplied data, then report negated MSE on the held-out
    # test split (negated so that larger is better)
    estimator.fit(new_X, new_y)
    predictions = estimator.predict(X_test)
    error = mean((predictions - y_test) ** 2)
    return -error


pipeline = Pipeline([("estimator", SVR())])
params = {
    "estimator__C": [.5, 1, 2, 5, 10],
    "estimator__epsilon": [.5, 1, 1.5, 2, 2.5, 3],
    "estimator__kernel": ["rbf"],
}

# compare to random forest model
print(score(RandomForestRegressor(), X_train, y_train))
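# The pipeline and parameter grid are set up but never searched in this
# snippet; presumably they feed GridSearchCV with the custom scorer, roughly
# as below (our completion, not the original code). A callable with the
# signature (estimator, X, y) is a valid `scoring` argument.
search = GridSearchCV(pipeline, params, scoring=score)
search.fit(X_train, y_train)
print(search.best_params_)
print(score(clone(search.best_estimator_), X_train, y_train))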
from RPCA import RPCA
from sklearn.datasets import load_diabetes as load
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

data = load().data

q = 0.9  # q is in [0,1]: the bigger q, the more positive noise; the smaller q, the more negative noise
p = 0.9  # p is in [0,1]: the bigger p, the less noise; the smaller p, the more noise
noise = np.random.random(data.shape)

# What is the sign of each noise entry?
sign = np.sign(np.random.random(data.shape) - q)

pick = np.random.random(data.shape)
pick = np.where(pick > p, 1, 0)
noise = 20 * noise * pick * sign

# Which entries of the matrix carry noise?
sns.heatmap(pick)

ans = RPCA(data + noise, w=0.1, tol=1e-6, itermax=1000, p=1.2, u=1e-3, umax=1e10)

plt.figure()
sns.heatmap(data, vmax=0.2, vmin=0)
# `fit` (the normal-equation coefficient solver) is defined earlier in this
# file; the imports below are needed for the snippet to run on its own, and
# the load_diabetes alias is an assumption, since the original import is not shown.
from numpy import ones, append, matmul
from sklearn.datasets import load_diabetes as load
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures


def predict(x_test, coefficients):
    # Prepend a column of ones so the first coefficient acts as the intercept
    intercept = ones(shape=(len(x_test), 1))
    x_test = append(intercept, x_test, axis=1)
    return matmul(x_test, coefficients)


def mse(y_test, predictions):
    sum_ = 0
    for i in range(len(y_test)):
        sum_ += (y_test[i] - predictions[i]) ** 2
    return sum_ / len(y_test)


data = load().data
x_train, x_test, y_train, y_test = train_test_split(data, load().target, test_size=0.3)
coefficients = fit(x_train, y_train)
predictions = predict(x_test, coefficients)
print(mean_absolute_error(y_test, predictions))

reg = LinearRegression(fit_intercept=True)  # `normalize=` was removed in scikit-learn 1.2
reg.fit(x_train, y_train)
predictions = reg.predict(x_test)
print(mean_absolute_error(y_test, predictions))

poly = PolynomialFeatures(2)
x_train = poly.fit_transform(x_train)
x_test = poly.transform(x_test)  # transform only; the expansion was already fitted on x_train
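# fit() itself is not shown in the snippet. A hypothetical stand-in
# consistent with predict() above: ordinary least squares via the normal
# equation, with the same prepended intercept column (ours, since the
# original definition is cut off):
from numpy.linalg import pinv


def fit(x_train, y_train):
    intercept = ones(shape=(len(x_train), 1))
    x_train = append(intercept, x_train, axis=1)
    # coefficients = (X^T X)^{-1} X^T y, using the pseudo-inverse for stability
    return matmul(matmul(pinv(matmul(x_train.T, x_train)), x_train.T), y_train)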
from sklearn.datasets import load_boston as load  # requires scikit-learn < 1.2
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
from ACA import ACATransformer
from sklearn.cluster import KMeans as c
import numpy as np

X, y = load(return_X_y=True)

transformer = ACATransformer(clusterer=c(n_clusters=14), return_old=False)
averaged_x, averaged_y = transformer.fit_transform(X, y)

# Embed original and cluster-averaged points together so they share one t-SNE space
all_x = np.concatenate([X, averaged_x])
reduced = TSNE(n_components=2).fit_transform(all_x)
unclustered_x = reduced[0:len(X)]
clustered_x = reduced[len(X):]

print(len(X), len(unclustered_x))
print(len(averaged_x), len(clustered_x))

plot_x = [point[0] for point in unclustered_x]
plot_y = [point[1] for point in unclustered_x]
plt.scatter(plot_x, plot_y, c=y, cmap="autumn")

plot_x = [point[0] for point in clustered_x]
plot_y = [point[1] for point in clustered_x]
plt.scatter(plot_x, plot_y, c=averaged_y, cmap="winter")
plt.show()