Python Scaler.fitの例、sklearn.preprocessing.Scaler.fit Pythonの例

コード例 #1

1

ファイルを表示

def test_scaler_without_centering():
    """Check scaling with ``with_mean=False``.

    The same expectations are verified for the ``Scaler`` estimator and for
    the ``scale`` function: no NaN values, the expected per-column means and
    standard deviations, a result object distinct from the input, and a
    successful round trip through ``Scaler.inverse_transform``.
    """
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # the first feature is constant (all zeros)

    scaler = Scaler(with_mean=False)

    def check_scaled(scaled):
        # The constant column must not turn into NaN.
        assert not np.any(np.isnan(scaled))

        # Column means are not expected to be zero here (with_mean=False);
        # the standard deviations are 1 except for the constant column.
        expected_means = [0., -0.01,  2.24, -0.35, -0.78]
        assert_array_almost_equal(scaled.mean(axis=0), expected_means, 2)
        expected_stds = [0., 1., 1., 1., 1.]
        assert_array_almost_equal(scaled.std(axis=0), expected_stds)

        # The scaled data must be a different object from the input X.
        assert scaled is not X

        # Undoing the scaling yields a third object whose values match X.
        restored = scaler.inverse_transform(scaled)
        assert restored is not X
        assert restored is not scaled
        assert_array_almost_equal(restored, X)

    # Estimator API first, then the function API; the scaler fitted by the
    # first call is reused for the inverse transform in both checks.
    check_scaled(scaler.fit(X).transform(X, copy=True))
    check_scaled(scale(X, with_mean=False))

コード例 #2

0

ファイルを表示

ファイル: test_preprocessing.py プロジェクト: AlexLerman/scikit-learn

def test_scaler_1d():
    """Test scaling of dataset along single axis"""

    def assert_standardized(values):
        # Scaled data is expected to have zero mean and unit std along axis 0.
        assert_array_almost_equal(values.mean(axis=0), 0.0)
        assert_array_almost_equal(values.std(axis=0), 1.0)

    rng = np.random.RandomState(0)
    X = rng.randn(5)
    # Keep the original values: the transform below is called with copy=False.
    X_orig_copy = X.copy()

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_standardized(X_scaled)

    # The inverse transform must recover the original values.
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Same checks with a plain 1D list, via the estimator and via scale().
    X = [0., 1., 2, 0.4, 1.]
    scaler = Scaler()
    assert_standardized(scaler.fit(X).transform(X, copy=False))
    assert_standardized(scale(X))

コード例 #3

0

ファイルを表示

def test_scaler_1d():
    """Test scaling of dataset along single axis"""

    def check_zero_mean_unit_std(scaled):
        # Both statistics are taken along axis 0.
        assert_array_almost_equal(scaled.mean(axis=0), 0.0)
        assert_array_almost_equal(scaled.std(axis=0), 1.0)

    rng = np.random.RandomState(0)
    X = rng.randn(5)
    # Saved up front because the transform below is requested with copy=False.
    X_orig_copy = X.copy()

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    check_zero_mean_unit_std(X_scaled)

    # Round trip: inverse_transform must give back the original values.
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Repeat with a 1D Python list, using a fresh estimator and then scale().
    X = [0., 1., 2, 0.4, 1.]
    scaler = Scaler()
    check_zero_mean_unit_std(scaler.fit(X).transform(X, copy=False))
    check_zero_mean_unit_std(scale(X))

コード例 #4

0

ファイルを表示

ファイル: answer_classifier.py プロジェクト: zzpanda/Quora

def SVM_fit(X_in, y_in, X_out, gamma, C):
    """Fit an RBF-kernel SVC on ``X_in``/``y_in`` and predict ``X_out``.

    A subset of the input (requested size: 10% of the samples) is set aside
    through ``split_indices``; the ``Scaler`` is fit on that subset and then
    applied to the training data, the subset itself and ``X_out``. Features
    whose standard deviation on the scaled subset is not above 1e-7 are
    removed from all three sets before the classifier is trained.

    Parameters: ``X_in``/``y_in`` are the labelled samples, ``X_out`` the
    samples to predict, ``gamma`` and ``C`` the SVC hyper-parameters.

    Returns the labels predicted by the fitted classifier for ``X_out``.
    """
    n_features = len(X_in[0])
    seed(time())

    # Set aside roughly a tenth of the samples.
    holdout_size = int(round(0.1 * len(X_in)))
    test_indices, train_indices = split_indices(len(X_in), holdout_size)

    # NOTE(review): the return value of this call is ignored; unless
    # `shuffle` reorders both sequences in place it has no useful effect
    # -- TODO confirm which `shuffle` is imported.
    shuffle(X_in, y_in)

    X_test = [X_in[idx] for idx in test_indices]
    y_test = [y_in[idx] for idx in test_indices]
    X_in = [X_in[idx] for idx in train_indices]
    y_in = [y_in[idx] for idx in train_indices]

    # The scaler is fit on the held-out subset only (not on the training
    # samples) and the same fitted transformation is applied everywhere.
    scaler = Scaler(copy=False)  # copy=False requests in-place scaling
    scaler.fit(X_test, y_test)
    X_in = scaler.transform(X_in)
    X_test = scaler.transform(X_test)
    X_out = scaler.transform(X_out)

    # Keep only the features that still vary on the scaled held-out subset.
    holdout_std = X_test.std(axis=0)
    kept = [col for col in range(n_features) if holdout_std[col] > 1e-7]

    def keep_columns(rows):
        return [[row[col] for col in kept] for row in rows]

    X_in = keep_columns(X_in)
    X_test = keep_columns(X_test)
    X_out = keep_columns(X_out)

    classifier = svm.SVC(kernel='rbf', C=C, gamma=gamma, verbose=False,
                         cache_size=4092, tol=1e-5)
    classifier.fit(X_in, y_in)
    return classifier.predict(X_out)

コード例 #5

0

ファイルを表示

def test_scaler_2d_arrays():
    """Test scaling of 2d array along first axis"""
    zero_means = 5 * [0.0]
    # The first column is constant in every dataset below, so its std is 0.
    expected_stds = [0., 1., 1., 1., 1.]

    def check_column_stats(scaled):
        assert_false(np.any(np.isnan(scaled)))
        assert_array_almost_equal(scaled.mean(axis=0), zero_means)
        assert_array_almost_equal(scaled.std(axis=0), expected_stds)

    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # constant, all-zero first feature

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    check_column_stats(X_scaled)
    # With copy=True the result must be a different object from X.
    assert_true(X_scaled is not X)

    # The inverse transform returns yet another object, equal in value to X.
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    # scale() along axis 1: centering only, then centering plus scaling.
    X_scaled = scale(X, axis=1, with_std=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    X_scaled = scale(X, axis=1, with_std=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # scale() must hand back a new object rather than X itself.
    assert_true(X_scaled is not X)

    # With copy=False the very same array object is returned.
    X_scaled = scaler.fit(X).transform(X, copy=False)
    check_column_stats(X_scaled)
    assert_true(X_scaled is X)

    # Constant but non-zero first feature, scaled with a fresh estimator.
    X = rng.randn(4, 5)
    X[:, 0] = 1.0
    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    check_column_stats(X_scaled)
    assert_true(X_scaled is not X)

コード例 #6

0

ファイルを表示

ファイル: test_preprocessing.py プロジェクト: AlexLerman/scikit-learn

def test_scaler_2d_arrays():
    """Test scaling of 2d array along first axis"""

    def assert_columns_standardized(data):
        # No NaN, zero column means, unit std except for the constant column.
        assert_false(np.any(np.isnan(data)))
        assert_array_almost_equal(data.mean(axis=0), 5 * [0.0])
        assert_array_almost_equal(data.std(axis=0), [0., 1., 1., 1., 1.])

    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # the first feature is identically zero

    # Estimator API with an explicit copy: the output is a new object.
    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_columns_standardized(X_scaled)
    assert_true(X_scaled is not X)

    # Round trip through inverse_transform.
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    # Function API along axis 1, first without and then with std scaling.
    X_scaled = scale(X, axis=1, with_std=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    X_scaled = scale(X, axis=1, with_std=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # The result of scale() is not the input object.
    assert_true(X_scaled is not X)

    # Estimator API with copy=False: the input object itself is returned.
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_columns_standardized(X_scaled)
    assert_true(X_scaled is X)

    # New data whose first feature is a non-zero constant.
    X = rng.randn(4, 5)
    X[:, 0] = 1.0
    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_columns_standardized(X_scaled)
    assert_true(X_scaled is not X)

コード例 #7

0

ファイルを表示

ファイル: answer_classifier.py プロジェクト: atul2512/Quora

def SVM_fit(X_in, y_in, X_out, gamma, C):
    """Train an RBF-kernel SVC on the given samples and label ``X_out``.

    Part of the input (requested size: 10% of the samples) is separated via
    ``split_indices`` and used to fit the ``Scaler``; the fitted scaler is
    then applied to the training samples, to that subset and to ``X_out``.
    Features whose standard deviation over the scaled subset does not exceed
    1e-7 are discarded before training.

    Parameters: ``X_in``/``y_in`` are the labelled samples, ``X_out`` the
    samples to classify, ``gamma`` and ``C`` the SVC hyper-parameters.

    Returns the predictions of the trained classifier for ``X_out``.
    """
    feature_count = len(X_in[0])
    seed(time())

    # Ask for a held-out subset of about one tenth of the samples.
    n_held_out = int(round(0.1*len(X_in)))
    test_indices, train_indices = split_indices(len(X_in), n_held_out)

    # NOTE(review): nothing is done with the return value of this call;
    # whether it reorders the two sequences in place depends on which
    # `shuffle` is in scope -- TODO confirm.
    shuffle(X_in, y_in)

    X_test = [X_in[k] for k in test_indices]
    y_test = [y_in[k] for k in test_indices]
    X_in = [X_in[k] for k in train_indices]
    y_in = [y_in[k] for k in train_indices]

    # Scaling statistics come from the held-out subset only; every data set
    # below goes through that same fitted transformation.
    scaler = Scaler(copy=False)  # copy=False requests in-place scaling
    scaler.fit(X_test, y_test)
    X_in = scaler.transform(X_in)
    X_test = scaler.transform(X_test)
    X_out = scaler.transform(X_out)

    # Indices of the features that are not (numerically) constant.
    std_test = X_test.std(axis=0)
    f_indices = [j for j in range(feature_count) if std_test[j] > 1e-7]

    def project(samples):
        return [[sample[j] for j in f_indices] for sample in samples]

    X_in = project(X_in)
    X_test = project(X_test)
    X_out = project(X_out)

    model = svm.SVC(kernel='rbf', C=C, gamma=gamma, verbose=False,
                    cache_size=4092, tol=1e-5)
    model.fit(X_in, y_in)
    return model.predict(X_out)

コード例 #8

0

ファイルを表示

def test_center_kernel():
    """Test that KernelCenterer is equivalent to Scaler in feature space"""

    def gram(left, right):
        # Linear kernel (matrix of dot products) between two sample sets.
        return np.dot(left, right.T)

    X_fit = np.random.random((5, 4))
    scaler = Scaler(with_std=False)  # centering only, no rescaling
    X_fit_centered = scaler.fit(X_fit).transform(X_fit)
    K_fit = gram(X_fit, X_fit)

    # Fit time: the kernel of the centered data must equal the centered
    # kernel of the raw data.
    centerer = KernelCenterer()
    expected_fit = gram(X_fit_centered, X_fit_centered)
    actual_fit = centerer.fit_transform(K_fit)
    assert_array_almost_equal(expected_fit, actual_fit)

    # Predict time: same comparison for new samples against the fit samples.
    X_pred = np.random.random((2, 4))
    K_pred = gram(X_pred, X_fit)
    X_pred_centered = scaler.transform(X_pred)
    expected_pred = gram(X_pred_centered, X_fit_centered)
    actual_pred = centerer.transform(K_pred)
    assert_array_almost_equal(expected_pred, actual_pred)

コード例 #9

0

ファイルを表示

ファイル: l2_approx.py プロジェクト: pgsrv/approx_norm_fv

def run_real_data_experiments(nr_samples,
                              delta,
                              verbose=0,
                              do_scatter_plot=False):
    """Compare true and approximate L2 norms on 'hollywood2' test samples.

    A ``Scaler`` is fit on the data loaded for the 'train' split. Each of the
    first ``nr_samples`` test samples is then loaded, scaled with it and
    passed to ``L2_approx``; the two values it returns are collected.

    Parameters:
        nr_samples: number of test samples to process; limited below to at
            most ``len(samples)`` and at least 1.
        delta: integer formatted into the dataset suffix
            ('.per_slice.delta_%d').
        verbose: any truthy value prints a summary through ``print_info``;
            values above 2 also print loading progress.
        do_scatter_plot: when true, ``scatter_plot`` is called with the
            collected true and approximate values.

    Nothing is returned; results are only printed and/or plotted.
    """

    dataset = Dataset('hollywood2',
                      suffix='.per_slice.delta_%d' % delta,
                      nr_clusters=256)
    samples, _ = dataset.get_data('test')
    # Clamp the requested sample count to the range [1, len(samples)].
    # NOTE(review): if `samples` is empty this still yields 1, so the loop
    # below would index samples[0] -- confirm that cannot happen.
    nr_samples = np.minimum(len(samples), nr_samples)
    nr_samples = np.maximum(1, nr_samples)

    if verbose > 2:
        print "Loading train data."
    tr_data, _, _ = load_sample_data(dataset, 'train', pi_derivatives=True)
    # Scaling parameters are computed once, from the train data only.
    scaler = Scaler()
    scaler.fit(tr_data)

    true_values, approx_values = [], []
    for ii in xrange(nr_samples):
        if verbose > 2:
            # "\r" returns to the start of the line, so successive progress
            # messages overwrite each other.
            sys.stdout.write("%s\r" % samples[ii].movie)
        data, _, _ = load_sample_data(dataset,
                                      str(samples[ii]),
                                      pi_derivatives=True)
        # Apply the scaling that was fit on the train data.
        data = scaler.transform(data)
        L2_norm_true, L2_norm_approx = L2_approx(data)
        true_values.append(L2_norm_true)
        approx_values.append(L2_norm_approx)

    if verbose:
        # The bare `print` statements emit blank lines around the summary.
        print
        print_info(true_values, approx_values, verbose)
        print

    if do_scatter_plot:
        scatter_plot(true_values, approx_values)

コード例 #10

0

ファイルを表示

ファイル: net.py プロジェクト: hendrik-p/neural_net

    records = data[:,1:]
    labels = data[:,0]
    n_train = 35000
    #n_val = n - n_train
    n_val = 7000
    trainset = records[:n_train,:]
    trainlabels = labels[:n_train]
    #valset = records[n_train:,:]
    #vallabels = labels[n_train:,:]
    valset = records[n_train:n_train+n_val,:]
    vallabels = labels[n_train:n_train+n_val]
    n,dim = trainset.shape

    # mean centering, stdev normalization and whitening
    scaler = Scaler()
    scaler.fit(trainset)
    trainset = scaler.transform(trainset)
    valset = scaler.transform(valset)
    pca = PCA(n_components=dim,whiten=True)
    pca.fit(trainset)
    trainset = pca.transform(trainset)
    valset = pca.transform(valset)

    config = Train_config()
    config.iterations = 10
    config.nonlinearity = 'tanh'
    config.batchsize = 50
    config.learning_rate = 0.2
    config.momentum = 0.7
    log = open('log.txt','w')
    nn = Net([dim,300,10],log_file=log)

コード例 #11

0

ファイルを表示

def test_scaler():
    """Test scaling of dataset along all axis"""

    def check_1d(scaled):
        # 1D case: zero mean and unit standard deviation.
        assert_array_almost_equal(scaled.mean(axis=0), 0.0)
        assert_array_almost_equal(scaled.std(axis=0), 1.0)

    def check_2d(scaled):
        # 2D case: no NaN, zero column means, unit std except for the
        # constant first column.
        assert not np.any(np.isnan(scaled))
        assert_array_almost_equal(scaled.mean(axis=0), 5 * [0.0])
        assert_array_almost_equal(scaled.std(axis=0), [0., 1., 1., 1., 1.])

    # ---- 1D array ----
    X = np.random.randn(5)
    # Saved beforehand because the transform is called with copy=False.
    X_orig_copy = X.copy()

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    check_1d(X_scaled)

    # The inverse transform must recover the original values.
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # ---- 1D list, through the estimator and through scale() ----
    X = [0., 1., 2, 0.4, 1.]
    scaler = Scaler()
    check_1d(scaler.fit(X).transform(X, copy=False))
    check_1d(scale(X))

    # ---- 2D array with an all-zero first feature ----
    X = np.random.randn(4, 5)
    X[:, 0] = 0.0

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    check_2d(X_scaled)
    # With copy=True the output is a different object from X.
    assert X_scaled is not X

    # Round trip: a third object whose values match X.
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert X_scaled_back is not X
    assert X_scaled_back is not X_scaled
    assert_array_almost_equal(X_scaled_back, X)

    # scale() along axis 1: centering only, then centering plus scaling.
    X_scaled = scale(X, axis=1, with_std=False)
    assert not np.any(np.isnan(X_scaled))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    X_scaled = scale(X, axis=1, with_std=True)
    assert not np.any(np.isnan(X_scaled))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # scale() must return a new object rather than X itself.
    assert X_scaled is not X

    # With copy=False the input object itself comes back.
    X_scaled = scaler.fit(X).transform(X, copy=False)
    check_2d(X_scaled)
    assert X_scaled is X

    # ---- 2D array with a constant, non-zero first feature ----
    X = np.random.randn(4, 5)
    X[:, 0] = 1.0
    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    check_2d(X_scaled)
    assert X_scaled is not X

コード例 #12

0

ファイルを表示

ファイル: answer_classifier.py プロジェクト: zzpanda/Quora

def Logistic_train(X_in, y_in, X_out, cs, file_log=None):
    """Select ``C`` for a logistic regression by cross-validation and predict.

    A subset of the input (requested size: 10% of the samples) is set aside
    through ``split_indices``. The ``Scaler`` is fit on that subset, and the
    subset is also scored at the end as the "Etest" figure. Features whose
    standard deviation on the scaled subset is not above 1e-7 are removed.
    Each value in ``cs`` is evaluated with a stratified 10-fold
    cross-validation on the remaining samples; the value with the highest
    mean validation accuracy is used to train the final model.

    Parameters:
        X_in, y_in: labelled samples.
        X_out: samples to predict.
        cs: iterable of candidate values for ``C``.
        file_log: optional object whose ``writelines`` receives the progress
            and result messages; nothing is logged when it is falsy.

    Returns the labels predicted by the final model for ``X_out``.
    """
    if file_log:
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(
            len(X_in), len(X_in[0])))
    n_features = len(X_in[0])
    seed(time())

    # Separate the subset used for scaling (and for the final "Etest").
    holdout_size = int(round(0.1 * len(X_in)))
    test_indices, train_indices = split_indices(len(X_in), holdout_size)

    X_scaler = [X_in[idx] for idx in test_indices]
    y_scaler = [y_in[idx] for idx in test_indices]
    X_in = [X_in[idx] for idx in train_indices]
    y_in = [y_in[idx] for idx in train_indices]

    # Scaling statistics come from the separate subset only; the same fitted
    # transformation is applied to every data set.
    scaler = Scaler(copy=False)  # copy=False requests in-place scaling
    scaler.fit(X_scaler, y_scaler)
    X_scaler = scaler.transform(X_scaler)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out)

    # Drop the features that are (numerically) constant on the subset.
    subset_std = X_scaler.std(axis=0)
    kept = [col for col in range(n_features) if subset_std[col] > 1e-7]

    def keep_columns(rows):
        return [[row[col] for col in kept] for row in rows]

    X_in = keep_columns(X_in)
    X_scaler = keep_columns(X_scaler)
    X_out = keep_columns(X_out)

    # Stratified 10-fold cross-validation over the candidate C values.
    best_cv_accuracy = 0.
    best_c = 0.

    for candidate in cs:
        kfold = cross_validation.StratifiedKFold(y_in, k=10)
        model = LogisticRegression(C=candidate, tol=1e-5)

        train_scores = []
        validation_scores = []
        for fit_idx, val_idx in kfold:
            X_fit = array([X_in[i] for i in fit_idx])
            y_fit = [y_in[i] for i in fit_idx]
            X_val = array([X_in[i] for i in val_idx])
            y_val = [y_in[i] for i in val_idx]

            model.fit(X_fit, y_fit)
            train_scores.append(model.score(X_fit, y_fit))
            validation_scores.append(model.score(X_val, y_val))

        # Mean accuracy over the folds.
        in_accuracy = sum(train_scores, 0.) / kfold.k
        cv_accuracy = sum(validation_scores, 0.) / kfold.k

        if file_log:
            file_log.writelines('C: {}\n'.format(candidate))
            file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
            file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))

        # Strict comparison: the first best candidate wins ties.
        if cv_accuracy > best_cv_accuracy:
            best_cv_accuracy = cv_accuracy
            best_c = candidate

    if file_log:
        file_log.writelines('\nBEST result: E_cv={}, C={}\n'.format(
            1. - best_cv_accuracy, best_c))

    # Final model on all training samples with the selected C.
    final_model = LogisticRegression(C=best_c, tol=1e-5)
    final_model.fit(X_in, y_in)
    if file_log:
        file_log.writelines(
            'Ein= {}\n'.format(1. - final_model.score(X_in, y_in)))
        file_log.writelines(
            'Etest= {}\n'.format(1. - final_model.score(X_scaler, y_scaler)))

    return final_model.predict(X_out)

コード例 #13

0

ファイルを表示

ファイル: answer_classifier.py プロジェクト: zzpanda/Quora

def SVM_train(X_in, y_in, X_out, gammas, cs, file_log=None):
    """Grid-search ``C`` and ``gamma`` for an RBF SVC, then predict ``X_out``.

    A subset of the input (requested size: 10% of the samples) is set aside
    through ``split_indices``. The ``Scaler`` is fit on that subset, and the
    subset is also scored at the end as the "Etest" figure. Features whose
    standard deviation on the scaled subset is not above 1e-7 are removed.
    Every (C, gamma) pair is evaluated with a stratified 10-fold
    cross-validation on the remaining samples, and the pair with the highest
    mean validation accuracy is used for the final model.

    Parameters:
        X_in, y_in: labelled samples.
        X_out: samples to predict.
        gammas: iterable of candidate ``gamma`` values.
        cs: iterable of candidate ``C`` values.
        file_log: optional object whose ``writelines`` receives the progress
            and result messages; nothing is logged when it is falsy.

    Returns the labels predicted by the final model for ``X_out``.
    """
    if file_log:
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(
            len(X_in), len(X_in[0])))
    initial_feature_count = len(X_in[0])
    seed(time())

    # Separate the subset used for scaling (and for the final "Etest").
    subset_size = int(round(0.1 * len(X_in)))
    scale_set_indices, train_indices = split_indices(len(X_in), subset_size)

    X_scale = [X_in[idx] for idx in scale_set_indices]
    y_scale = [y_in[idx] for idx in scale_set_indices]
    X_in = [X_in[idx] for idx in train_indices]
    y_in = [y_in[idx] for idx in train_indices]

    # Scaling statistics come from the separate subset only; the same fitted
    # transformation is applied to every data set.
    scaler = Scaler(copy=False)  # WARNING: copy=False => in place modification
    scaler.fit(X_scale, y_scale)
    X_scale = scaler.transform(X_scale)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out)

    # Drop the features that are (numerically) constant on the subset.
    subset_std = X_scale.std(axis=0)
    kept = [col for col in range(initial_feature_count)
            if subset_std[col] > 1e-7]

    def keep_columns(rows):
        return [[row[col] for col in kept] for row in rows]

    X_in = keep_columns(X_in)
    X_scale = keep_columns(X_scale)
    X_out = keep_columns(X_out)

    if file_log:
        file_log.writelines('Initial features :{}, Features used: {}\n'.format(
            initial_feature_count, len(X_in[0])))

    best_cv_accuracy = 0.
    best_gamma = 0.
    best_c = 0.

    # Exhaustive search over every (C, gamma) combination.
    for c_value in cs:
        for gamma_value in gammas:
            # Stratified folds keep the class ratio as constant as possible
            # across the k folds.
            kfold = cross_validation.StratifiedKFold(y_in, k=10)
            model = svm.SVC(kernel='rbf', C=c_value, gamma=gamma_value,
                            verbose=False, cache_size=4092, tol=1e-5)

            train_scores = []
            validation_scores = []
            for fit_idx, val_idx in kfold:
                X_fit = array([X_in[i] for i in fit_idx])
                y_fit = [y_in[i] for i in fit_idx]
                X_val = array([X_in[i] for i in val_idx])
                y_val = [y_in[i] for i in val_idx]

                model.fit(X_fit, y_fit)
                train_scores.append(model.score(X_fit, y_fit))
                validation_scores.append(model.score(X_val, y_val))

            # Mean accuracy over the folds.
            in_accuracy = sum(train_scores, 0.) / kfold.k
            cv_accuracy = sum(validation_scores, 0.) / kfold.k
            if file_log:
                file_log.writelines(
                    'C:{}, gamma:{}\n'.format(c_value, gamma_value))
                file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
                file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))

            # Strict comparison: the first best combination wins ties.
            if cv_accuracy > best_cv_accuracy:
                best_cv_accuracy = cv_accuracy
                best_c = c_value
                best_gamma = gamma_value

    if file_log:
        file_log.writelines('\nBEST result: E_cv={}, C={}, gamma={}\n'.format(
            1. - best_cv_accuracy, best_c, best_gamma))

    # Final model on all training samples with the selected parameters.
    final_model = svm.SVC(kernel='rbf', C=best_c, gamma=best_gamma,
                          verbose=False, cache_size=4092, tol=1e-5)
    final_model.fit(X_in, y_in)
    if file_log:
        file_log.writelines(
            'Ein= {}\n'.format(1. - final_model.score(X_in, y_in)))
        file_log.writelines(
            'Etest= {}\n'.format(1. - final_model.score(X_scale, y_scale)))

    return final_model.predict(X_out)

コード例 #14

0

ファイルを表示

ファイル: answer_classifier.py プロジェクト: atul2512/Quora

def tree_train(X_in, y_in, X_out, min_meaningful_features_ratio=1., file_log=None):
    """Tune ``max_features`` of an ExtraTreesClassifier and predict ``X_out``.

    A subset of the input (requested size: 10% of the samples) is set aside
    through ``split_indices``. The ``Scaler`` is fit on that subset, and the
    subset is also scored at the end as the "Etest" figure. Features whose
    standard deviation on the scaled subset is not above 1e-7 are removed.
    Candidate ``max_features`` values from
    ``floor(M * min_meaningful_features_ratio)`` up to ``M`` (the number of
    remaining features) are each evaluated with a stratified 10-fold
    cross-validation, and the final model is trained with the best one.

    Parameters:
        X_in, y_in: labelled samples.
        X_out: samples to predict.
        min_meaningful_features_ratio: fraction of the remaining features
            used as the smallest ``max_features`` candidate.
        file_log: optional object whose ``writelines`` receives the progress
            and result messages; nothing is logged when it is falsy.

    Returns the labels predicted by the final model for ``X_out``.
    """
    if file_log:
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(len(X_in), len(X_in[0])))

    M = len(X_in[0])   # Number of features
    seed(time())

    # Separate the subset used for scaling (and for the final "Etest").
    test_indices, train_indices = split_indices(len(X_in), int(round(0.1*len(X_in))))

    X_scaler = [X_in[i] for i in test_indices]
    y_scaler = [y_in[i] for i in test_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]

    # Scaling statistics come from the separate subset only; the same fitted
    # transformation is applied to every data set.
    scaler = Scaler(copy=False)  # copy=False requests in-place scaling
    scaler.fit(X_scaler, y_scaler)
    X_scaler = scaler.transform(X_scaler)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out)

    std_test = X_scaler.std(axis=0)
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]

    # Remove the features that are (numerically) constant on the subset.
    X_in = [[row[j] for j in f_indices] for row in X_in]
    X_scaler = [[row[j] for j in f_indices] for row in X_scaler]
    X_out = [[row[j] for j in f_indices] for row in X_out]

    M = len(f_indices)
    # Stratified 10-fold cross-validation over the candidate max_features.
    best_cv_accuracy = 0.
    best_features_number = M

    for features_number in range(int(floor(M * min_meaningful_features_ratio)), M + 1):
        kfold = cross_validation.StratifiedKFold(y_in, k=10)
        svc = ExtraTreesClassifier(criterion='entropy', max_features=features_number)

        in_accuracy = 0.
        cv_accuracy = 0.
        for t_indices, cv_indices in kfold:
            # Every row already holds exactly the M retained features.
            X_train = array([X_in[i] for i in t_indices])
            y_train = [y_in[i] for i in t_indices]
            X_cv = array([X_in[i] for i in cv_indices])
            y_cv = [y_in[i] for i in cv_indices]

            svc.fit(X_train, y_train)
            in_accuracy += svc.score(X_train, y_train)
            cv_accuracy += svc.score(X_cv, y_cv)

        in_accuracy /= kfold.k
        cv_accuracy /= kfold.k
        if file_log:
            # Log the candidate under evaluation. The training matrix width
            # logged before is the same for every candidate, so it did not
            # identify the iteration.
            file_log.writelines('# of features: {}\n'.format(features_number))
            file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
            file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))

        if cv_accuracy > best_cv_accuracy:
            best_features_number = features_number
            best_cv_accuracy = cv_accuracy

    if file_log:
        file_log.writelines('\nBEST result: E_cv={}, t={}\n'.format(1. - best_cv_accuracy, best_features_number))

    # Train the final model with the cross-validated max_features. This used
    # to pass the leftover loop variable as n_estimators, which ignored the
    # selection above and raised NameError when the candidate range was empty.
    svc = ExtraTreesClassifier(criterion='entropy', max_features=best_features_number)
    svc.fit(X_in, y_in)
    if file_log:
        file_log.writelines('Ein= {}\n'.format(1. - svc.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - svc.score(X_scaler, y_scaler)))

    y_out = svc.predict(X_out)
    return y_out

コード例 #15

0

ファイルを表示

ファイル: answer_classifier.py プロジェクト: atul2512/Quora

def Logistic_train(X_in, y_in, X_out, cs, file_log=None):
    """Cross-validate ``C`` for a logistic regression and label ``X_out``.

    Part of the input (requested size: 10% of the samples) is separated via
    ``split_indices``. The ``Scaler`` is fit on that part, which is also
    scored at the end as the "Etest" figure. Features whose standard
    deviation over the scaled part does not exceed 1e-7 are discarded. Each
    candidate in ``cs`` is scored with a stratified 10-fold cross-validation
    on the remaining samples, and the one with the highest mean validation
    accuracy is used for the final fit.

    Parameters:
        X_in, y_in: labelled samples.
        X_out: samples to classify.
        cs: iterable of candidate values for ``C``.
        file_log: optional object whose ``writelines`` receives the progress
            and result messages; nothing is logged when it is falsy.

    Returns the predictions of the final model for ``X_out``.
    """
    if file_log:
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(len(X_in), len(X_in[0])))
    feature_count = len(X_in[0])
    seed(time())

    # Ask for a separate subset of about one tenth of the samples.
    n_held_out = int(round(0.1*len(X_in)))
    test_indices, train_indices = split_indices(len(X_in), n_held_out)

    X_scaler = [X_in[k] for k in test_indices]
    y_scaler = [y_in[k] for k in test_indices]
    X_in = [X_in[k] for k in train_indices]
    y_in = [y_in[k] for k in train_indices]

    # The scaler is fit on the separate subset only and then applied, with
    # the same fitted parameters, to all three data sets.
    scaler = Scaler(copy=False)  # copy=False requests in-place scaling
    scaler.fit(X_scaler, y_scaler)
    X_scaler = scaler.transform(X_scaler)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out)

    # Indices of the features that are not (numerically) constant.
    std_test = X_scaler.std(axis=0)
    f_indices = [j for j in range(feature_count) if std_test[j] > 1e-7]

    def project(samples):
        return [[sample[j] for j in f_indices] for sample in samples]

    X_in = project(X_in)
    X_scaler = project(X_scaler)
    X_out = project(X_out)

    # Stratified 10-fold cross-validation for every candidate C.
    best_cv_accuracy = 0.
    best_c = 0.

    for c_value in cs:
        kfold = cross_validation.StratifiedKFold(y_in, k=10)
        lrc = LogisticRegression(C=c_value, tol=1e-5)

        fold_train_acc = []
        fold_cv_acc = []
        for t_indices, cv_indices in kfold:
            X_train = array([X_in[i] for i in t_indices])
            y_train = [y_in[i] for i in t_indices]
            X_cv = array([X_in[i] for i in cv_indices])
            y_cv = [y_in[i] for i in cv_indices]

            lrc.fit(X_train, y_train)
            fold_train_acc.append(lrc.score(X_train, y_train))
            fold_cv_acc.append(lrc.score(X_cv, y_cv))

        # Average the per-fold accuracies.
        in_accuracy = sum(fold_train_acc, 0.) / kfold.k
        cv_accuracy = sum(fold_cv_acc, 0.) / kfold.k

        if file_log:
            file_log.writelines('C: {}\n'.format(c_value))
            file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
            file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))

        # Only a strictly better accuracy replaces the current best.
        if cv_accuracy > best_cv_accuracy:
            best_cv_accuracy = cv_accuracy
            best_c = c_value

    if file_log:
        file_log.writelines('\nBEST result: E_cv={}, C={}\n'.format(1. - best_cv_accuracy, best_c))

    # Refit on all training samples with the selected C.
    lrc = LogisticRegression(C=best_c, tol=1e-5)
    lrc.fit(X_in, y_in)
    if file_log:
        file_log.writelines('Ein= {}\n'.format(1. - lrc.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - lrc.score(X_scaler, y_scaler)))

    return lrc.predict(X_out)

コード例 #16

0

ファイルを表示

ファイル: answer_classifier.py プロジェクト: atul2512/Quora

def SVM_train(X_in, y_in, X_out, gammas, cs, file_log=None):
    """Grid-search an RBF-kernel SVC over ``cs`` x ``gammas`` and label ``X_out``.

    Steps, in order:

    1. ``split_indices`` selects ``int(round(0.1 * len(X_in)))`` samples that
       are set aside; the rest are used for training.
       NOTE(review): assumes ``split_indices`` returns
       ``(set-aside indices, remaining indices)`` -- confirm against its
       definition.
    2. A ``Scaler`` is fitted on the set-aside samples only and then applied
       to the set-aside samples, the training samples and ``X_out``.
    3. Features whose standard deviation on the scaled set-aside samples is
       not above 1e-7 are removed from all three sets.
    4. Every ``(C, gamma)`` pair is scored by stratified 10-fold cross
       validation on the training samples; the pair with the highest mean
       validation accuracy wins.
    5. An SVC with the winning pair is fitted on all training samples and
       used to predict ``X_out``.

    Parameters:
        X_in: indexable collection of feature rows (all of the same length).
        y_in: labels, aligned with ``X_in``.
        X_out: feature rows to predict. The scaler is built with
            ``copy=False``, so it may be modified in place.
        gammas: candidate values for the SVC ``gamma`` parameter.
        cs: candidate values for the SVC ``C`` parameter.
        file_log: optional object with a ``writelines`` method; when truthy,
            progress and error rates are written to it.

    Returns:
        The result of ``predict(X_out)`` from the final classifier.

    Side effects:
        Re-seeds the random generator with the current time.
    """
    n_features = len(X_in[0])
    if file_log:
        file_log.writelines(
            '# of Samples: {}, # of Features: {}\n'.format(len(X_in), n_features))
    seed(time())

    # To prevent data snooping, part of the input is set aside and used both
    # to fit the scaler and, at the very end, to report the test error.
    holdout_size = int(round(0.1*len(X_in)))
    scale_set_indices, train_indices = split_indices(len(X_in), holdout_size)

    X_scale = [X_in[i] for i in scale_set_indices]
    y_scale = [y_in[i] for i in scale_set_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]

    # WARNING: copy=False => in place modification.
    scaler = Scaler(copy=False)
    # The normalization parameters are computed on the set-aside subset only
    # and then reused, unchanged, for every other data set.
    scaler.fit(X_scale, y_scale)
    X_scale = scaler.transform(X_scale)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out)

    # Keep only the features that actually vary on the set-aside subset.
    std_test = X_scale.std(axis=0)
    f_indices = [j for j in range(n_features) if std_test[j] > 1e-7]

    def keep_columns(rows):
        # Project every row onto the retained feature columns.
        return [[row[j] for j in f_indices] for row in rows]

    X_in = keep_columns(X_in)
    X_scale = keep_columns(X_scale)
    X_out = keep_columns(X_out)

    if file_log:
        file_log.writelines(
            'Initial features :{}, Features used: {}\n'.format(n_features, len(X_in[0])))

    best_cv_accuracy = 0.
    best_c = 0.
    best_gamma = 0.

    # Ten-fold cross validation over the (C, gamma) grid.
    for c in cs:
        for g in gammas:
            # Stratified folds keep the class ratio as constant as possible
            # across the k folds.
            folds = cross_validation.StratifiedKFold(y_in, k=10)
            candidate = svm.SVC(kernel='rbf', C=c, gamma=g, verbose=False,
                                cache_size=4092, tol=1e-5)

            in_accuracy = 0.
            cv_accuracy = 0.
            for fit_rows, held_rows in folds:
                X_fit = array([X_in[i] for i in fit_rows])
                y_fit = [y_in[i] for i in fit_rows]
                X_held = array([X_in[i] for i in held_rows])
                y_held = [y_in[i] for i in held_rows]

                candidate.fit(X_fit, y_fit)
                in_accuracy += candidate.score(X_fit, y_fit)
                cv_accuracy += candidate.score(X_held, y_held)

            in_accuracy /= folds.k
            cv_accuracy /= folds.k

            if file_log:
                file_log.writelines('C:{}, gamma:{}\n'.format(c, g))
                file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
                file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))

            if cv_accuracy > best_cv_accuracy:
                best_cv_accuracy = cv_accuracy
                best_c = c
                best_gamma = g

    if file_log:
        file_log.writelines(
            '\nBEST result: E_cv={}, C={}, gamma={}\n'.format(
                1. - best_cv_accuracy, best_c, best_gamma))

    # Refit on the whole training set with the winning hyper-parameters.
    final_model = svm.SVC(kernel='rbf', C=best_c, gamma=best_gamma, verbose=False,
                          cache_size=4092, tol=1e-5)
    final_model.fit(X_in, y_in)

    if file_log:
        file_log.writelines('Ein= {}\n'.format(1. - final_model.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - final_model.score(X_scale, y_scale)))

    return final_model.predict(X_out)

コード例 #17

0

ファイルを表示

ファイル: answer_classifier.py プロジェクト: zzpanda/Quora

def tree_train(X_in,
               y_in,
               X_out,
               min_meaningful_features_ratio=1.,
               file_log=None):
    """Fit an ExtraTreesClassifier with a cross-validated ``max_features``.

    Steps, in order:

    1. ``split_indices`` selects ``int(round(0.1 * len(X_in)))`` samples that
       are set aside; the rest are used for training.
       NOTE(review): assumes ``split_indices`` returns
       ``(set-aside indices, remaining indices)`` -- confirm against its
       definition.
    2. A ``Scaler`` is fitted on the set-aside samples only and then applied
       to the set-aside samples, the training samples and ``X_out``.
    3. Features whose standard deviation on the scaled set-aside samples is
       not above 1e-7 are removed from all three sets, leaving ``M`` features.
    4. Every ``max_features`` value from
       ``floor(M * min_meaningful_features_ratio)`` (but at least 1) up to
       ``M`` is scored by stratified 10-fold cross validation on the training
       samples; the value with the highest mean validation accuracy wins.
    5. A classifier using the winning ``max_features`` is fitted on all
       training samples and used to predict ``X_out``.

    Parameters:
        X_in: indexable collection of feature rows (all of the same length).
        y_in: labels, aligned with ``X_in``.
        X_out: feature rows to predict. The scaler is built with
            ``copy=False``, so it may be modified in place.
        min_meaningful_features_ratio: fraction of ``M`` at which the
            ``max_features`` search starts. With the default of 1 only
            ``max_features == M`` is evaluated.
        file_log: optional object with a ``writelines`` method; when truthy,
            progress and error rates are written to it.

    Returns:
        The result of ``predict(X_out)`` from the final classifier.

    Side effects:
        Re-seeds the random generator with the current time.
    """
    if file_log:
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(
            len(X_in), len(X_in[0])))

    M = len(X_in[0])  # Number of features
    seed(time())

    # To prevent data snooping, part of the input is set aside and used both
    # to fit the scaler and, at the very end, to report the test error.
    test_indices, train_indices = split_indices(len(X_in),
                                                int(round(0.1 * len(X_in))))

    X_scaler = [X_in[i] for i in test_indices]
    y_scaler = [y_in[i] for i in test_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]

    # copy=False => in place modification.
    scaler = Scaler(copy=False)
    # The normalization parameters are computed on the set-aside subset only
    # (not on the training rows) and then reused for every other data set.
    scaler.fit(X_scaler, y_scaler)
    X_scaler = scaler.transform(X_scaler)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out)

    # Remove features with (numerically) null variance on the set-aside subset.
    std_test = X_scaler.std(axis=0)
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]

    X_in = [[row[j] for j in f_indices] for row in X_in]
    X_scaler = [[row[j] for j in f_indices] for row in X_scaler]
    X_out = [[row[j] for j in f_indices] for row in X_out]

    M = len(f_indices)
    best_cv_accuracy = 0.
    # Fallback used when the search range below is empty or no candidate
    # scores above zero.
    best_features_number = M

    # max_features must be strictly positive, so never start the search at 0.
    first_candidate = max(1, int(floor(M * min_meaningful_features_ratio)))

    # Ten-fold cross validation over the number of features considered at
    # each split.
    for features_number in range(first_candidate, M + 1):
        kfold = cross_validation.StratifiedKFold(y_in, k=10)
        forest = ExtraTreesClassifier(criterion='entropy',
                                      max_features=features_number)

        in_accuracy = 0.
        cv_accuracy = 0.
        for t_indices, cv_indices in kfold:
            X_train = array([X_in[i] for i in t_indices])
            y_train = [y_in[i] for i in t_indices]
            X_cv = array([X_in[i] for i in cv_indices])
            y_cv = [y_in[i] for i in cv_indices]

            forest.fit(X_train, y_train)
            in_accuracy += forest.score(X_train, y_train)
            cv_accuracy += forest.score(X_cv, y_cv)

        in_accuracy /= kfold.k
        cv_accuracy /= kfold.k
        if file_log:
            # Report the hyper-parameter under test; the training matrix
            # width is always M and would not identify the candidate.
            file_log.writelines('# of features: {}\n'.format(features_number))
            file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
            file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))

        if cv_accuracy > best_cv_accuracy:
            best_features_number = features_number
            best_cv_accuracy = cv_accuracy

    if file_log:
        file_log.writelines('\nBEST result: E_cv={}, t={}\n'.format(
            1. - best_cv_accuracy, best_features_number))

    # Refit on the whole training set with the cross-validated value of the
    # hyper-parameter that was actually tuned above (max_features).
    forest = ExtraTreesClassifier(criterion='entropy',
                                  max_features=best_features_number)
    forest.fit(X_in, y_in)

    # Now test the out-of-sample error on the set-aside subset.
    if file_log:
        file_log.writelines('Ein= {}\n'.format(1. - forest.score(X_in, y_in)))
        file_log.writelines(
            'Etest= {}\n'.format(1. - forest.score(X_scaler, y_scaler)))

    y_out = forest.predict(X_out)
    return y_out

コード例 #18

0

ファイルを表示

ファイル: skl_svm.py プロジェクト: kingjr/natmeg_arhus

 all_folds[split, fold, train] = 1
 all_folds[split, fold, test] = 0
 for d in range(0, dims.shape[0]):
     Xtrain = Xm_shfl[train, :, dims[d]]
     ytrain = y_shfl[train]
     sw_train = sw_shfl[train]
     # (deal with NaN in training)
     ytrain = ytrain[~np.isnan(np.nansum(Xtrain, axis=1))]
     sw_train = sw_train[~np.isnan(np.nansum(Xtrain, axis=1))]
     Xtrain = Xtrain[~np.isnan(np.nansum(Xtrain, axis=1)), :]
     if np.unique(ytrain).shape[0] > 1:
         # feature selection (find the 50% most discriminative channels)
         fs.fit(Xtrain, ytrain)         # find
         Xtrain = fs.transform(Xtrain)  # remove unnecessary channels
         # normalization
         scaler.fit(Xtrain)            # find
         Xtrain = scaler.transform(Xtrain)  # apply zscore
         # SVM fit
         clf.fit(Xtrain, ytrain, sample_weight=sw_train)
         # retrieve hyperplan feature identification
         coef[split, fold, dims[d], :, :] = 0  # initialize
         #--- univariate
         uni_features = fs.pvalues_ <= stats.scoreatpercentile(fs.pvalues_, fs.percentile)
         #--- multivariate
         coef[split, fold, dims[d], :, uni_features] = clf.coef_.T
         # predict cross val (deal with NaN in testing)
         Xtest = Xm_shfl[test, :, dims[d]]
         test_nan = np.isnan(np.nansum(Xtest, axis=1))
         Xtest = fs.transform(Xtest)
         Xtest = scaler.transform(Xtest)
         if (Xtest.shape[0] - np.sum(test_nan)) > 0: