Example #1
def data_to_kernels(tr_data, te_data):
    scaler = Scaler(copy=False)
    scaler.fit_transform(tr_data)
    #tr_data, mu, sigma = standardize(tr_data)
    tr_data = power_normalize(tr_data, 0.5)
    tr_data = L2_normalize(tr_data)

    #te_data, _, _ = standardize(te_data, mu, sigma)
    scaler.transform(te_data)
    te_data = power_normalize(te_data, 0.5)
    te_data = L2_normalize(te_data)

    tr_kernel = np.dot(tr_data, tr_data.T)
    te_kernel = np.dot(te_data, tr_data.T)

    return tr_kernel, te_kernel
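A minimal usage sketch, assuming tr_data and te_data are NumPy feature matrices and tr_labels is an array of training labels (not produced by this function): the two Gram matrices returned above can be fed to an SVM with a precomputed kernel.

from sklearn.svm import SVC

tr_kernel, te_kernel = data_to_kernels(tr_data, te_data)
svc = SVC(kernel='precomputed')
svc.fit(tr_kernel, tr_labels)            # tr_labels: hypothetical training labels
te_predictions = svc.predict(te_kernel)  # one prediction per test sample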
def run_svm(svc,X):
    X = X.copy()
    scaler  = Scaler()
    X  = scaler.fit_transform(X)
    y_predict = svc.predict(X)
    
    return y_predict
Example #3
    def process_data(self):
        test = pandas.read_csv("test.csv")
        testMat = test.as_matrix()

        train = pandas.read_csv("train.csv")
        trainMat = train.as_matrix()
        trainResult = trainMat[:, 0]
        trainMat = trainMat[:, 1:]

        # trainInd = np.where(trainResult == 0)[0]
        # how_many = (trainResult == 1).sum() - len(trainInd)
        # np.random.shuffle(trainInd)
        # addedResult = trainResult[trainInd[:how_many],:]
        # addedData = trainMat[trainInd[:how_many],:]
        # trainResult = np.append(trainResult,addedResult)
        # trainMat = np.vstack((trainMat,addedData))

        cv = StratifiedKFold(trainResult, 2)
        # cv = KFold(n=trainResult.shape[0],k=2)
        reduceFeatures = ExtraTreesClassifier(
            compute_importances=True, random_state=1234, n_jobs=self.cpus, n_estimators=1000, criterion="gini"
        )
        reduceFeatures.fit(trainMat, trainResult)
        trainScaler = Scaler()

        self.cv_data = []
        self.cv_data_nonreduced = []
        for train_idx, test_idx in cv:
            X_train, X_test, Y_train, Y_test = (
                trainMat[train_idx, :],
                trainMat[test_idx, :],
                trainResult[train_idx],
                trainResult[test_idx],
            )
            X_train = trainScaler.fit_transform(X_train)
            X_test = trainScaler.transform(X_test)
            self.cv_data_nonreduced.append((X_train, X_test, Y_train, Y_test))
            X_train = reduceFeatures.transform(X_train)
            X_test = reduceFeatures.transform(X_test)
            self.cv_data.append((X_train, X_test, Y_train, Y_test))
        testMat = trainScaler.transform(testMat)
        self.testMat_nonreduced = testMat
        self.testMat = reduceFeatures.transform(testMat)
        allData = self.testMat, self.cv_data, self.testMat_nonreduced, self.cv_data_nonreduced
        data_handle = open("allData.pkl", "wb")
        pickle.dump(allData, data_handle)
        data_handle.close()
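A small companion sketch, assuming the same working directory and that pickle is imported as above: the saved tuple can be read back later with pickle.load.

data_handle = open("allData.pkl", "rb")
testMat, cv_data, testMat_nonreduced, cv_data_nonreduced = pickle.load(data_handle)
data_handle.close()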
def get_sl_test_data(fileEvents,fileLabels,includedChannels,useMeans=False,parentIndices=None):
    ## declare variables
    X = fileEvents[:,includedChannels].copy()
    scaler  = Scaler()
    X = scaler.fit_transform(X)

    #if parentIndices != None:
    #    X = X[parentIndices,:]
    
    #X = (X - X.mean(axis=0)) / X.std(axis=0)

    if useMeans:
        clusterIds,X = get_mean_matrix(X,fileLabels)
        #X = (X - X.mean(axis=0)) / X.std(axis=0)
        return clusterIds,X
    
    return X
def run_svm_validation(X1,y1,X2,y2,gammaRange=[0.5],cRange=[0.005],useLinear=False):
    #X_train,y_train,X_test,y_test = split_train_test(X1,y1,X2,y2)

    X = np.vstack((X1, X2))
    Y = np.hstack((y1, y2))

    scaler = Scaler()
    X = scaler.fit_transform(X)

    #if useLinear == True:
    #    svc = svm.SVC(kernel='linear')#class_weight={1: 10
    #    # svc = svm.SVC(kernel='poly', degree=3, C=1.0)
    #    svc.fit(X, Y)
    #    return svc

    C_range = 10.0 ** np.arange(-2, 9)
    gamma_range = 10.0 ** np.arange(-5, 4)
    param_grid = dict(gamma=gamma_range, C=C_range)

    grid = GridSearchCV(SVC(class_weight={1: 100}), param_grid=param_grid, cv=StratifiedKFold(y=Y,k=2))
    grid.fit(X, Y)

    print("The best classifier is: ", grid.best_estimator_)
    return grid.best_estimator_
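A brief usage note: the Scaler is fitted inside run_svm_validation and not returned, so new data has to be standardized the same way before calling the returned estimator. A minimal sketch, assuming X_new is a hypothetical matrix of unseen samples with the same columns as X1 and X2:

best_svc = run_svm_validation(X1, y1, X2, y2)

scaler = Scaler()
scaler.fit(np.vstack((X1, X2)))                     # reproduce the training-time scaling
y_new = best_svc.predict(scaler.transform(X_new))   # X_new: hypothetical unseen data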
Example #6
import numpy as np

from sklearn.datasets import load_iris
from sklearn.preprocessing import Scaler
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV

iris = load_iris()
X = iris.data
Y = iris.target

# dataset for decision function visualization
X_2d = X[:, :2]
X_2d = X_2d[Y > 0]
Y_2d = Y[Y > 0]
Y_2d -= 1

# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the training set and
# just applying it on the test set.

scaler = Scaler()

X = scaler.fit_transform(X)
X_2d = scaler.fit_transform(X_2d)

##############################################################################
# Train classifier
#
# For an initial search, a logarithmic grid with basis
# 10 is often helpful. Using a basis of 2, a finer
# tuning can be achieved but at a much higher cost.

C_range = 10.0**np.arange(-2, 9)
gamma_range = 10.0**np.arange(-5, 4)
param_grid = dict(gamma=gamma_range, C=C_range)

grid = GridSearchCV(SVC(), param_grid=param_grid, cv=StratifiedKFold(y=Y, k=3))
grid.fit(X, Y)
import numpy as np

from sklearn.datasets import load_iris
from sklearn.preprocessing import Scaler
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV

iris_dataset = load_iris()

X, Y = iris_dataset.data, iris_dataset.target

# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

scaler = Scaler()

X = scaler.fit_transform(X)

# For an initial search, a logarithmic grid with basis
# 10 is often helpful. Using a basis of 2, a finer
# tuning can be achieved but at a much higher cost.

C_range = 10. ** np.arange(-5, 5)
gamma_range = 10. ** np.arange(-5, 5)

param_grid = dict(gamma=gamma_range, C=C_range)

grid = GridSearchCV(SVC(), param_grid=param_grid, cv=StratifiedKFold(y=Y, k=5))

grid.fit(X, Y)

print("The best classifier is: ", grid.best_estimator_)
Example #8
import numpy as np

from sklearn.datasets import load_iris
from sklearn.preprocessing import Scaler
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV

iris = load_iris()
X = iris.data
Y = iris.target

# dataset for decision function visualization
X_2d = X[:, :2]
X_2d = X_2d[Y > 0]
Y_2d = Y[Y > 0]
Y_2d -= 1

# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the training set and
# just applying it on the test set.

scaler = Scaler()

X = scaler.fit_transform(X)
X_2d = scaler.fit_transform(X_2d)

##############################################################################
# Train classifier
#
# For an initial search, a logarithmic grid with basis
# 10 is often helpful. Using a basis of 2, a finer
# tuning can be achieved but at a much higher cost.

C_range = 10.0 ** np.arange(-2, 9)
gamma_range = 10.0 ** np.arange(-5, 4)
param_grid = dict(gamma=gamma_range, C=C_range)

grid = GridSearchCV(SVC(), param_grid=param_grid, cv=StratifiedKFold(y=Y, k=3))
grid.fit(X, Y)
from datetime import datetime

from errorcurves import ErrorCurves
import numpy as np
from sklearn import mixture
from sklearn.preprocessing import Scaler
import pandas

df = pandas.read_csv('TrainingDataset.csv')
df_test = pandas.read_csv('TestDataset.csv')
ids = df_test.pop('id')

outcomes = list()
train_sets = list()

quants = [i for i in df.columns if 'Q' in i]
df_quants = df[quants]
scaler = Scaler()
scaled = scaler.fit_transform(df_quants.fillna(0))
dpgmm = mixture.DPGMM(n_components = 75)
dpgmm.fit(scaled)
clusters = dpgmm.predict(scaled)
df['clusters'] = clusters

# Parse dates
jan1 = datetime(2000,1,1)

# Drop all rows where response variable == NaN
for i in range(1,13):
	df_i = df[df['Outcome_M'+str(i)]>0]
	outcomes.append(df_i.pop('Outcome_M'+str(i)))
	[df_i.pop(col) for col in df_i.columns if 'Out' in col]

	#drop nas first
Example #10
def load_kernels(
    dataset, tr_norms=['std', 'sqrt', 'L2'], te_norms=['std', 'sqrt', 'L2'],
    analytical_fim=False, pi_derivatives=False, sqrt_nr_descs=False,
    only_train=False, verbose=0, do_plot=False, outfile=None):

    tr_outfile = outfile % "train" if outfile is not None else outfile

    # Load sufficient statistics.
    samples, _ = dataset.get_data('train')
    tr_data, tr_counts, tr_labels = load_video_data(
        dataset, samples, outfile=tr_outfile, analytical_fim=analytical_fim,
        pi_derivatives=pi_derivatives, sqrt_nr_descs=sqrt_nr_descs, verbose=verbose)

    if verbose > 0:
        print "Train data: %dx%d" % tr_data.shape

    if do_plot:
        plot_fisher_vector(tr_data[0], 'before')

    scalers = []
    for norm in tr_norms:
        if norm == 'std':
            scaler = Scaler()
            tr_data = scaler.fit_transform(tr_data)
            scalers.append(scaler)
        elif norm == 'sqrt':
            tr_data = power_normalize(tr_data, 0.5)
        elif norm == 'sqrt_cnt':
            tr_data = approximate_signed_sqrt(
                tr_data, tr_counts, pi_derivatives=pi_derivatives)
        elif norm == 'L2':
            tr_data = L2_normalize(tr_data)
        if do_plot:
            plot_fisher_vector(tr_data[0], 'after_%s' % norm)

    tr_kernel = np.dot(tr_data, tr_data.T)

    if only_train:
        return tr_kernel, tr_labels, scalers, tr_data

    te_outfile = outfile % "test" if outfile is not None else outfile

    # Load sufficient statistics.
    samples, _ = dataset.get_data('test')
    te_data, te_counts, te_labels = load_video_data(
        dataset, samples, outfile=te_outfile, analytical_fim=analytical_fim,
        pi_derivatives=pi_derivatives, sqrt_nr_descs=sqrt_nr_descs, verbose=verbose)

    if verbose > 0:
        print "Test data: %dx%d" % te_data.shape

    ii = 0
    for norm in te_norms:
        if norm == 'std':
            te_data = scalers[ii].transform(te_data)
            ii += 1
        elif norm == 'sqrt':
            te_data = power_normalize(te_data, 0.5)
        elif norm == 'sqrt_cnt':
            te_data = approximate_signed_sqrt(
                te_data, te_counts, pi_derivatives=pi_derivatives)
        elif norm == 'L2':
            te_data = L2_normalize(te_data)

    te_kernel = np.dot(te_data, tr_data.T)

    return tr_kernel, tr_labels, te_kernel, te_labels
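A short usage sketch, assuming dataset is whatever object load_video_data expects: the four return values plug directly into an SVM with a precomputed kernel.

from sklearn.svm import SVC

tr_kernel, tr_labels, te_kernel, te_labels = load_kernels(dataset, verbose=1)
svc = SVC(kernel='precomputed')
svc.fit(tr_kernel, tr_labels)
accuracy = np.mean(svc.predict(te_kernel) == te_labels)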
Example #11
def load_kernels(dataset,
                 tr_norms=['std', 'sqrt', 'L2'],
                 te_norms=['std', 'sqrt', 'L2'],
                 analytical_fim=False,
                 pi_derivatives=False,
                 sqrt_nr_descs=False,
                 only_train=False,
                 verbose=0,
                 do_plot=False,
                 outfile=None):

    tr_outfile = outfile % "train" if outfile is not None else outfile

    # Load sufficient statistics.
    samples, _ = dataset.get_data('train')
    tr_data, tr_counts, tr_labels = load_video_data(
        dataset,
        samples,
        outfile=tr_outfile,
        analytical_fim=analytical_fim,
        pi_derivatives=pi_derivatives,
        sqrt_nr_descs=sqrt_nr_descs,
        verbose=verbose)

    if verbose > 0:
        print "Train data: %dx%d" % tr_data.shape

    if do_plot:
        plot_fisher_vector(tr_data[0], 'before')

    scalers = []
    for norm in tr_norms:
        if norm == 'std':
            scaler = Scaler()
            tr_data = scaler.fit_transform(tr_data)
            scalers.append(scaler)
        elif norm == 'sqrt':
            tr_data = power_normalize(tr_data, 0.5)
        elif norm == 'sqrt_cnt':
            tr_data = approximate_signed_sqrt(tr_data,
                                              tr_counts,
                                              pi_derivatives=pi_derivatives)
        elif norm == 'L2':
            tr_data = L2_normalize(tr_data)
        if do_plot:
            plot_fisher_vector(tr_data[0], 'after_%s' % norm)

    tr_kernel = np.dot(tr_data, tr_data.T)

    if only_train:
        return tr_kernel, tr_labels, scalers, tr_data

    te_outfile = outfile % "test" if outfile is not None else outfile

    # Load sufficient statistics.
    samples, _ = dataset.get_data('test')
    te_data, te_counts, te_labels = load_video_data(
        dataset,
        samples,
        outfile=te_outfile,
        analytical_fim=analytical_fim,
        pi_derivatives=pi_derivatives,
        sqrt_nr_descs=sqrt_nr_descs,
        verbose=verbose)

    if verbose > 0:
        print "Test data: %dx%d" % te_data.shape

    ii = 0
    for norm in te_norms:
        if norm == 'std':
            te_data = scalers[ii].transform(te_data)
            ii += 1
        elif norm == 'sqrt':
            te_data = power_normalize(te_data, 0.5)
        elif norm == 'sqrt_cnt':
            te_data = approximate_signed_sqrt(te_data,
                                              te_counts,
                                              pi_derivatives=pi_derivatives)
        elif norm == 'L2':
            te_data = L2_normalize(te_data)

    te_kernel = np.dot(te_data, tr_data.T)

    return tr_kernel, tr_labels, te_kernel, te_labels
import numpy as np

from sklearn.datasets import load_iris
from sklearn.preprocessing import Scaler
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV

iris_dataset = load_iris()

X, Y = iris_dataset.data, iris_dataset.target

# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

scaler = Scaler()

X = scaler.fit_transform(X)

# For an initial search, a logarithmic grid with basis
# 10 is often helpful. Using a basis of 2, a finer
# tuning can be achieved but at a much higher cost.

C_range = 10. ** np.arange(-5, 5)
gamma_range = 10. ** np.arange(-5, 5)

param_grid = dict(gamma=gamma_range, C=C_range)

grid = GridSearchCV(SVC(), param_grid=param_grid, cv=StratifiedKFold(y=Y, k=5))

grid.fit(X, Y)

print("The best classifier is: ", grid.best_estimator_)
Example #13
class KMPBase(BaseEstimator):

    def __init__(self,
                 n_nonzero_coefs=0.3,
                 loss=None,
                 # components (basis functions)
                 init_components=None,
                 n_components=None,
                 check_duplicates=False,
                 scale=False,
                 scale_y=False,
                 # back-fitting
                 n_refit=5,
                 estimator=None,
                 # metric
                 metric="linear", gamma=0.1, coef0=1, degree=4,
                 # validation
                 X_val=None, y_val=None,
                 n_validate=1,
                 epsilon=0,
                 score_func=None,
                 # misc
                 random_state=None, verbose=0, n_jobs=1):
        if n_nonzero_coefs < 0:
            raise AttributeError("n_nonzero_coefs should be > 0.")

        self.n_nonzero_coefs = n_nonzero_coefs
        self.loss = loss
        self.init_components = init_components
        self.n_components = n_components
        self.check_duplicates = check_duplicates
        self.scale = scale
        self.scale_y = scale_y
        self.n_refit = n_refit
        self.estimator = estimator
        self.metric = metric
        self.gamma = gamma
        self.coef0 = coef0
        self.degree = degree
        self.X_val = X_val
        self.y_val = y_val
        self.n_validate = n_validate
        self.epsilon = epsilon
        self.score_func = score_func
        self.random_state = random_state
        self.verbose = verbose
        self.n_jobs = n_jobs

    def _kernel_params(self):
        return {"gamma" : self.gamma,
                "degree" : self.degree,
                "coef0" : self.coef0}

    def _get_estimator(self):
        if self.estimator is None:
            estimator = LinearRegression()
        else:
            estimator = clone(self.estimator)
        estimator.fit_intercept = False
        return estimator

    def _get_loss(self):
        if self.loss == "squared":
            return SquaredLoss()
        else:
            return None

    def _pre_fit(self, X, y):
        random_state = check_random_state(self.random_state)

        if self.scale_y:
            self.y_scaler_ = Scaler(copy=True).fit(y)
            y = self.y_scaler_.transform(y)

        if self.metric == "precomputed":
            self.components_ = None
            n_components = X.shape[1]
        else:
            if self.init_components is None:
                if self.verbose: print "Selecting components..."
                self.components_ = select_components(X, y,
                                                     self.n_components,
                                                     random_state=random_state)
            else:
                self.components_ = self.init_components

            n_components = self.components_.shape[0]


        n_nonzero_coefs = self.n_nonzero_coefs
        if 0 < n_nonzero_coefs <= 1:
            n_nonzero_coefs = int(n_nonzero_coefs * n_components)
        n_nonzero_coefs = int(n_nonzero_coefs)

        if n_nonzero_coefs > n_components:
            raise AttributeError("n_nonzero_coefs cannot be bigger than "
                                 "n_components.")

        if self.verbose: print "Computing dictionary..."
        start = time.time()
        K = pairwise_kernels(X, self.components_, metric=self.metric,
                             filter_params=True, n_jobs=self.n_jobs,
                             **self._kernel_params())
        if self.verbose: print "Done in", time.time() - start, "seconds"

        if self.scale:
            if self.verbose: print "Scaling dictionary"
            start = time.time()
            copy = True if self.metric == "precomputed" else False
            self.scaler_ = Scaler(copy=copy)
            K = self.scaler_.fit_transform(K)
            if self.verbose: print "Done in", time.time() - start, "seconds"

        # FIXME: this allocates a lot of intermediary memory
        norms = np.sqrt(np.sum(K ** 2, axis=0))

        return n_nonzero_coefs, K, y, norms

    def _fit_multi(self, K, y, Y, n_nonzero_coefs, norms):
        if self.verbose: print "Starting training..."
        start = time.time()
        coef = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                delayed(_run_iterator)(self._get_estimator(),
                                       self._get_loss(),
                                       K, Y[:, i], n_nonzero_coefs, norms,
                                       self.n_refit, self.check_duplicates)
                for i in xrange(Y.shape[1]))
        self.coef_ = np.array(coef)
        if self.verbose: print "Done in", time.time() - start, "seconds"

    def _score(self, y_true, y_pred):
        if self.score_func == "auc":
            return auc(y_true, y_pred)
        if hasattr(self, "lb_"):
            y_pred = self.lb_.inverse_transform(y_pred, threshold=0.5)
            if self.score_func is None:
                return np.mean(y_true == y_pred)
            else:
                return self.score_func(y_true, y_pred)
        else:
            # FIXME: no need to ravel y_pred if y_true is 2d!
            return -np.mean((y_true - y_pred.ravel()) ** 2)

    def _fit_multi_with_validation(self, K, y, Y, n_nonzero_coefs, norms):
        iterators = [FitIterator(self._get_estimator(), self._get_loss(),
                                 K, Y[:, i], n_nonzero_coefs, norms,
                                 self.n_refit, self.check_duplicates,
                                 self.verbose)
                     for i in xrange(Y.shape[1])]

        if self.verbose: print "Computing validation dictionary..."
        start = time.time()
        K_val = pairwise_kernels(self.X_val, self.components_,
                                 metric=self.metric,
                                 filter_params=True,
                                 n_jobs=self.n_jobs,
                                 **self._kernel_params())
        if self.verbose: print "Done in", time.time() - start, "seconds"
        if self.scale:
            K_val = self.scaler_.transform(K_val)

        y_val = self.y_val
        if self.scale_y:
            y_val = self.y_scaler_.transform(y_val)


        if self.verbose: print "Starting training..."
        start = time.time()
        best_score = -np.inf
        validation_scores = []
        training_scores = []
        iterations = []

        for i in xrange(1, n_nonzero_coefs + 1):
            iterators = [it.next() for it in iterators]
            #iterators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                    #delayed(_run_iterator)(it) for it in iterators)
            coef = np.array([it.coef_ for it in iterators])
            y_train_pred = np.array([it.y_train_ for it in iterators]).T

            if i % self.n_validate == 0:
                if self.verbose >= 2:
                    print "Validating %d/%d..." % (i, n_nonzero_coefs)

                y_val_pred = np.dot(K_val, coef.T)

                validation_score = self._score(y_val, y_val_pred)
                training_score = self._score(y, y_train_pred)

                if validation_score > best_score:
                    self.coef_ = coef.copy()
                    best_score = np.abs(validation_score)

                validation_scores.append(np.abs(validation_score))
                training_scores.append(np.abs(training_score))
                iterations.append(i)

                if len(iterations) > 2 and self.epsilon > 0:
                    diff = (validation_scores[-1] - validation_scores[-2])
                    diff /= validation_scores[0]
                    if abs(diff) < self.epsilon:
                        if self.verbose:
                            print "Converged at iteration", i
                        break

        self.validation_scores_ = np.array(validation_scores)
        self.training_scores_ = np.array(training_scores)
        self.iterations_ = np.array(iterations)
        self.best_score_ = best_score

        if self.verbose: print "Done in", time.time() - start, "seconds"

    def _fit(self, K, y, Y, n_nonzero_coefs, norms):
        if self.X_val is not None and self.y_val is not None:
            meth = self._fit_multi_with_validation
        else:
            meth = self._fit_multi
        meth(K, y, Y, n_nonzero_coefs, norms)

    def _post_fit(self):
        if self.metric != "precomputed":
            used_basis = np.sum(self.coef_ != 0, axis=0, dtype=bool)
            self.coef_ = self.coef_[:, used_basis]
            self.components_ = self.components_[used_basis]

    def decision_function(self, X):
        K = pairwise_kernels(X, self.components_, metric=self.metric,
                             filter_params=True, n_jobs=self.n_jobs,
                             **self._kernel_params())
        if self.scale:
            K = self.scaler_.transform(K)

        pred = np.dot(K, self.coef_.T)

        if self.scale_y:
            pred = self.y_scaler_.inverse_transform(pred)

        return pred
Example #14
 def normalize(self, data, n=N_COMPONENTS):
     X = np.array(data, dtype='float')
     #X=np.array(X[:,np.std(X,0)!=0.0], dtype='float')
     scaler = Scaler()
     Xnorm = scaler.fit_transform(X)
     return Xnorm
Example #15
 def normalize(self, data, n=N_COMPONENTS):
    X = np.array(data, dtype='float')
    #X = np.array(X[:, np.std(X, 0) != 0.0], dtype='float')
    scaler = Scaler()
    Xnorm = scaler.fit_transform(X)
    return Xnorm