def test_classifier(self):
        # predict survival from the passenger name, a pure text feature
        train_df = datasets.load("titanic")[["Name", "Survived"]]
        y = np.array(train_df.pop("Survived"))

        X_train, X_test, y_train, y_test = train_test_split(train_df,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=0)
        X_train = DataFrameContainer(
            "TrainSet",
            dataset_instance=X_train,
            resource_manager=self.mock_resource_manager)
        X_test = DataFrameContainer(
            "TestSet",
            dataset_instance=X_test,
            resource_manager=self.mock_resource_manager)
        y_train = NdArrayContainer("TrainLabel",
                                   dataset_instance=y_train,
                                   resource_manager=self.mock_resource_manager)
        y_test = NdArrayContainer("TestLabel",
                                  dataset_instance=y_test,
                                  resource_manager=self.mock_resource_manager)
        X_train.set_feature_groups(["text"])
        X_test.set_feature_groups(["text"])
        # text-decomposition transformers under test
        est_cls_list = [
            TsvdTransformer,
            NmfTransformer,
            LsiTransformer,
            LdaTransformer,
            RpTransformer,
        ]
        for cls in est_cls_list:
            print("=========================")
            print(cls.__name__)
            print("=========================")
            # route text -> token -> numeric features through the workflow
            tokenizer = SimpleTokenlizer(
                **get_default_hp_of_cls(SimpleTokenlizer))
            tokenizer.in_feature_groups = "text"
            tokenizer.out_feature_groups = "token"
            transformer = cls(**get_default_hp_of_cls(cls))
            transformer.in_feature_groups = "token"
            transformer.out_feature_groups = "num"
            classifier = RandomForestClassifier(
                **get_default_hp_of_cls(RandomForestClassifier))
            pipeline = ML_Workflow([
                ("tokenizer", tokenizer),
                ("transformer", transformer),
                ("classifier", classifier),
            ],
                                   resource_manager=self.mock_resource_manager)
            start = time()
            pipeline.fit(X_train, y_train, X_test, y_test)
            y_pred = pipeline.predict(X_test)
            score = accuracy_score(y_test.data, y_pred)
            end = time()
            print("score:", score)
            print("time:", end - start)
            self.assertGreater(score, 0.6)
            print('\n' * 2)
from numpy import dot, exp, sqrt, pi
from numpy.linalg import inv, det
from matplotlib.pyplot import scatter, show, hist


def gaussian_anomaly_detection(data):
    # score each row by its multivariate Gaussian density under the sample
    # mean and covariance; low densities mark likely anomalies
    rows, cols = data.shape
    mu = data.mean(axis=0)
    diff = data - mu
    cov = dot(diff.T, diff) / rows
    # per-sample quadratic form (x - mu)^T cov^{-1} (x - mu): the diagonal of
    # diff @ inv(cov) @ diff.T
    quad = dot(dot(diff, inv(cov)), diff.T).diagonal()
    a = exp(-0.5 * quad)
    b = sqrt(pow(2 * pi, cols) * det(cov))
    return a / b
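# For reference, each sample's score is the multivariate normal density
#   p(x) = exp(-0.5 * (x - mu)^T Sigma^{-1} (x - mu)) / sqrt((2*pi)^d * det(Sigma)),
# so points falling in low-density regions receive small scores.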


from sklearn.datasets import load_wine as load
from sklearn.decomposition import PCA


data = PCA(2).fit_transform(load().data)
res = gaussian_anomaly_detection(data)
# flag points whose density is more than two standard deviations from the mean
threshold_lo = res.mean() - 2 * res.std()
threshold_hi = res.mean() + 2 * res.std()
colors = ['red' if x < threshold_lo or x > threshold_hi else 'green' for x in res]
scatter(data[:, 0], data[:, 1], c=colors)
show()
hist(res, bins=100)
show()
Example No. 3
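# The fragment below calls a `knn` helper that isn't shown; a minimal sketch
# consistent with the call signature knn(k, x_train, y_train, x_test), assuming
# Euclidean distance and a majority vote among the neighbours:
import numpy as np
from collections import Counter


def knn(k, x_train, y_train, x_test):
    predictions = []
    for x in x_test:
        # indices of the k nearest training points by Euclidean distance
        distances = np.linalg.norm(np.asarray(x_train) - np.asarray(x), axis=1)
        nearest = np.argsort(distances)[:k]
        # majority vote among the neighbours' labels
        votes = Counter(np.asarray(y_train)[nearest])
        predictions.append(votes.most_common(1)[0][0])
    return predictions
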
# here `train_test_split` is an integer index marking the train/test boundary,
# not the sklearn function of the same name
x_test, y_test = all_data[train_test_split:], all_labels[train_test_split:]

predictions = knn(5, x_train, y_train, x_test)
correct = 0
for i in range(len(predictions)):
    if predictions[i] == y_test[i]:
        correct += 1
print('Accuracy: ', correct / len(predictions))

#####################################
from sklearn.datasets import load_iris as load
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

x_train, x_test, y_train, y_test = train_test_split(load().data,
                                                    load().target,
                                                    test_size=0.2,
                                                    shuffle=True)
predictions = knn(5, x_train, y_train, x_test)
print('My accuracy:', accuracy_score(y_test, predictions))

##########################################################################################
model = KNeighborsClassifier(n_neighbors=5,
                             algorithm='brute',
                             weights='uniform')
model.fit(x_train, y_train)
predictions = model.predict(x_test)
print('Scikit accuracy:', accuracy_score(y_test, predictions))
##########################################################################################
Example No. 4
'''

This is a snippet of code showing how to train a multiclass MKL algorithm

Author: Ivano Lauriola, [email protected]

'''

#load data
print('loading \'iris\' dataset...', end='')
from sklearn.datasets import load_iris as load
ds = load()
X, Y = ds.data, ds.target
print('done')
'''
WARNING: be sure that your matrix is not sparse! EXAMPLE:
from sklearn.datasets import load_svmlight_file
X,Y = load_svmlight_file(...)
X = X.toarray()
'''

#preprocess data
print('preprocessing data...', end='')
from MKLpy.preprocessing import normalization, rescale_01
X = rescale_01(X)  #feature scaling in [0,1]
X = normalization(X)  #||X_i||_2^2 = 1

#train/test split
from sklearn.model_selection import train_test_split
Xtr, Xte, Ytr, Yte = train_test_split(X,
                                      Y,
                                      test_size=.25,
                                      random_state=42)  # assumed split values; the snippet ends mid-call
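
# The docstring promises multiclass MKL training, but the snippet stops at the
# split; a hedged sketch of how it might continue with MKLpy's EasyMKL over a
# few homogeneous polynomial kernels (API names assumed from MKLpy):
from MKLpy.metrics import pairwise
from MKLpy.algorithms import EasyMKL
from sklearn.metrics import accuracy_score

KLtr = [pairwise.homogeneous_polynomial_kernel(Xtr, degree=d) for d in range(1, 4)]
KLte = [pairwise.homogeneous_polynomial_kernel(Xte, Xtr, degree=d) for d in range(1, 4)]

clf = EasyMKL(lam=0.1).fit(KLtr, Ytr)  # EasyMKL falls back to one-vs-all for multiclass
y_pred = clf.predict(KLte)
print('accuracy:', accuracy_score(Yte, y_pred))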
Example No. 5
from sklearn.datasets import load_digits as load
from sklearn.model_selection import train_test_split
from DecisionTree import *
from sklearn.metrics import accuracy_score
from RF import RandomForestClassifier

# from sklearn.ensemble import RandomForestClassifier
data = load()

X = data.data
y = data.target

print(X.shape)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)

dtree = RandomForestClassifier(n_estimators=10)

dtree.fit(Xtrain, ytrain)

y_pred = dtree.predict(Xtest)

print(accuracy_score(ytest, y_pred))
Example No. 6
# Importing Modules
from sklearn import datasets
import matplotlib.pyplot as plt

# Loading dataset
iris_df = datasets.load_iris()

# Available methods on dataset
print(dir(iris_df))

# Features
print(iris_df.feature_names)

# Targets
print(iris_df.target)

# Target Names
print(iris_df.target_names)
label = {0: 'red', 1: 'blue', 2: 'green'}

# Dataset Slicing
x_axis = iris_df.data[:, 0]  # Sepal Length
y_axis = iris_df.data[:, 2]  # Petal Length

# Plotting
plt.scatter(x_axis, y_axis, c=[label[t] for t in iris_df.target])
plt.show()
Example No. 7
from sklearn.datasets import load_boston as load
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from numpy import mean
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(*load(return_X_y=True))


def score(estimator, new_X, new_y):
    # refit on the given data, then evaluate on the global held-out split;
    # MSE is negated so that higher scores are better
    estimator.fit(new_X, new_y)
    predictions = estimator.predict(X_test)
    error = mean((predictions - y_test)**2)
    return -error


pipeline = Pipeline([("estimator", SVR())])

params = {
    "estimator__C": [.5, 1, 2, 5, 10],
    "estimator__epsilon": [.5, 1, 1.5, 2, 2.5, 3],
    "estimator__kernel": ["rbf"],
}

# baseline: compare against a random forest model
print(score(RandomForestRegressor(), X_train, y_train))
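
# The grid search itself is missing from the snippet; one plausible completion,
# using the custom `score` callable (it already matches sklearn's
# scorer(estimator, X, y) signature):
search = GridSearchCV(pipeline, params, scoring=score, cv=3)
search.fit(X_train, y_train)
print(search.best_params_)
print(score(search.best_estimator_, X_train, y_train))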
Example No. 9
from RPCA import RPCA
from sklearn.datasets import load_diabetes as load
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

data = load().data

q = 0.9  # q in [0, 1]: each noise entry is negative with probability q, positive otherwise
p = 0.9  # p in [0, 1]: an entry is corrupted with probability 1 - p, so larger p means less noise

noise = np.random.random(data.shape)
# random sign for each noise entry
sign = np.sign(np.random.random(data.shape) - q)

pick = np.random.random(data.shape)
pick = np.where(pick > p, 1, 0)
noise = 20 * noise * pick * sign

# which entries of the matrix are corrupted?
sns.heatmap(pick)

ans = RPCA(data + noise,
           w=0.1,
           tol=1e-6,
           itermax=1000,
           p=1.2,
           u=1e-3,
           umax=1e10)
plt.figure()
sns.heatmap(data, vmax=0.2, vmin=0)  # the clean matrix, for comparison with the corrupted input
plt.show()
Example No. 10

from numpy import ones, append, matmul
from sklearn.datasets import load_diabetes as load  # assumed dataset; the original `load` import isn't shown
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures


def predict(x_test, coefficients):
    # prepend a column of ones so the first coefficient acts as the intercept
    intercept = ones(shape=(len(x_test), 1))
    x_test = append(intercept, x_test, axis=1)
    return matmul(x_test, coefficients)


def mse(y_test, predictions):
    sum_ = 0
    for i in range(len(y_test)):
        sum_ += (y_test[i] - predictions[i])**2
    return sum_ / len(y_test)
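

# `fit` is not defined in the snippet; a minimal least-squares helper consistent
# with predict() above (hypothetical, via the pseudo-inverse / normal equations):
from numpy.linalg import pinv


def fit(x_train, y_train):
    intercept = ones(shape=(len(x_train), 1))
    x_train = append(intercept, x_train, axis=1)
    return matmul(pinv(x_train), y_train)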


data = load().data
x_train, x_test, y_train, y_test = train_test_split(data,
                                                    load().target,
                                                    test_size=0.3)
coefficients = fit(x_train, y_train)
predictions = predict(x_test, coefficients)
print(mean_absolute_error(y_test, predictions))

reg = LinearRegression(fit_intercept=True)
reg.fit(x_train, y_train)
predictions = reg.predict(x_test)
print(mean_absolute_error(y_test, predictions))

poly = PolynomialFeatures(2)
x_train = poly.fit_transform(x_train)
x_test = poly.transform(x_test)  # reuse the expansion fitted on the training split
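# The snippet stops after the polynomial expansion; a plausible continuation
# refits the linear model on the degree-2 features and reports its error:
reg = LinearRegression(fit_intercept=True)
reg.fit(x_train, y_train)
print(mean_absolute_error(y_test, reg.predict(x_test)))
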
from sklearn.datasets import load_boston as load
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
from ACA import ACATransformer
from sklearn.cluster import KMeans as c
import numpy as np

X, y = load(return_X_y=True)

transformer = ACATransformer(clusterer=c(n_clusters=14), return_old=False)
averaged_x, averaged_y = transformer.fit_transform(X, y)

all_x = np.concatenate([X, averaged_x])

reduced = TSNE(n_components=2).fit_transform(all_x)

unclustered_x = reduced[0:len(X)]
clustered_x = reduced[len(X):]

print(len(X), len(unclustered_x))
print(len(averaged_x), len(clustered_x))

plt.scatter(unclustered_x[:, 0], unclustered_x[:, 1], c=y, cmap="autumn")

plt.scatter(clustered_x[:, 0], clustered_x[:, 1], c=averaged_y, cmap="winter")

plt.show()