def randproj(tx, ty, rx, ry):
    print "randproj"
    compressor = RandomProjection(tx[1].size)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    # compressor = RandomProjection(tx[1].size)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wRPtr")
    km(newtx, ty, newrx, ry, add="wRPtr")
    nn(newtx, ty, newrx, ry, add="wRPtr")
    print "randproj done"
Example #2
 def rp_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     
     ks = []
     for i in range(1000):
         ##
         ## Random Projection
         ##
         rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
         rp.fit(X_train_scl)
         X_train_rp = rp.transform(X_train_scl)
         
         ks.append(kurtosis(X_train_rp))
         
     mean_k = np.mean(ks, 0)
         
     ##
     ## Plots
     ##
     ph = plot_helper()
     
     title = 'Kurtosis (Randomized Projection) for ' + data_set_name
     name = data_set_name.lower() + '_rp_kurt'
     filename = './' + self.out_dir + '/' + name + '.png'
     
     ph.plot_simple_bar(np.arange(1, len(mean_k)+1, 1),
                        mean_k,
                        np.arange(1, len(mean_k)+1, 1).astype('str'),
                        'Feature Index',
                        'Kurtosis',
                        title,
                        filename)
Example #3
 def best_rp_nba(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_nba_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
     X_train_transformed = rp.fit_transform(X_train_scl, y_train)
     X_test_transformed = rp.transform(X_test_scl)
     
     ## top 2
     kurt = kurtosis(X_train_transformed)
     i = kurt.argsort()[::-1]
     X_train_transformed_sorted = X_train_transformed[:, i]
     X_train_transformed = X_train_transformed_sorted[:,0:2]
     
     kurt = kurtosis(X_test_transformed)
     i = kurt.argsort()[::-1]
     X_test_transformed_sorted = X_test_transformed[:, i]
     X_test_transformed = X_test_transformed_sorted[:,0:2]
     
     # save
     filename = './' + self.save_dir + '/nba_rp_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #4
def randproj(tx, ty, rx, ry):
    compressor = RandomProjection(tx[1].size)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wRPtr", times=10)
    km(newtx, ty, newrx, ry, add="wRPtr", times=10)
    nn(newtx, ty, newrx, ry, add="wRPtr")
Example #5
def find_best_rp(train_data, test_data, start_components, max_features):
    # find random proj with lowest reconstruction error
    best_c = 0
    #scores = []
    #best_k = 0
    #best = 0
    best_rs = 0
    best_r = 20000

    # go to max - 1 since it doesn't make sense to randomly project to the same dimension
    r = range(start_components, max_features)
    print(r)
    # center data for reconstruction
    scalar = StandardScaler(with_mean=True, with_std=False)
    centered = scalar.fit_transform(test_data)
    for c in r:
        print('C=%d' % c)
        for rs in range(1, 501):
            rp = GaussianRandomProjection(n_components=c, random_state=rs).fit(train_data)
            fit = rp.transform(centered)
            recon = extmath.safe_sparse_dot(fit, rp.components_) + scalar.mean_
            err = linalg.norm(test_data - recon)
            if err < best_r:
                best_r = err
                best_c = c
                best_rs = rs
    print('best reconstruction error=%.4f' % best_r)
    print('>>best rs=%d,c=%d' % (best_rs, best_c))

    # for the best, track the variation
    v_max = 0
    errsum = 0
    for rs in range(1, 501):
        rp = GaussianRandomProjection(n_components=best_c, random_state=rs).fit(train_data)
        fit = rp.transform(centered)
        recon = extmath.safe_sparse_dot(fit, rp.components_) + scalar.mean_
        err = linalg.norm(test_data - recon)
        errsum += err
        if err > v_max:
            v_max = err

    print('RP max:%.3f, avg:%.3f' % (v_max, errsum/500))

    return best_c, best_rs
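
# A minimal usage sketch for find_best_rp on synthetic data, assuming the
# imports the function relies on (StandardScaler, GaussianRandomProjection,
# sklearn.utils.extmath as extmath, and a linalg module providing norm) are
# already in scope.
import numpy as np

rng = np.random.RandomState(0)
data = rng.normal(size=(300, 10))
train_demo, test_demo = data[:200], data[200:]

# search from 2 components up to (but not including) the original dimensionality
best_c, best_rs = find_best_rp(train_demo, test_demo, 2, train_demo.shape[1])
print('chosen n_components=%d with random_state=%d' % (best_c, best_rs))
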
class ProjClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, n_components=1, knn=100):
        self.n_components = n_components
        self.knn = knn

    def fit(self, X, y, sample_weight=None):
        self.classes_ = numpy.array([0, 1])
        self.proj = GaussianRandomProjection(n_components=self.n_components)
        # self.knner = KNeighborsClassifier(n_neighbors=self.knn)
        self.knner = Knn1dClassifier(self.knn)
        self.proj.fit(X)
        X_new = self.proj.transform(X)
        # TODO sample weight!!
        self.knner.fit(X_new, y, sample_weight=sample_weight)
        print('ok')
        return self

    def predict_proba(self, X):
        X_new = self.proj.transform(X)
        return self.knner.predict_proba(X_new)

    def predict(self, X):
        return numpy.argmax(self.predict_proba(X), axis=1)
#variances.
pc = pca.fit_transform(x)
plotgraph('PCA',pc)


#Nonlinear kernelPCA
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components = 2, kernel = 'rbf')
#kernel=linear,rbf(radial basis function),sigmoid,cosine
xkpca = kpca.fit_transform(x)
plotgraph('kernel pca',xkpca)

##Isomap 
#It is a nonlinear dimensionality reduction method based on spectral theory that
#attempts to preserve geodesic distances in the lower-dimensional embedding.
from sklearn.manifold import Isomap
isomap = Isomap(n_components=2)#n_jobs = 4
isomap.fit(x)
X_isomap = isomap.transform(x)
plotgraph('Isomap',X_isomap)


##Gaussian Random Projection
#Data with a very large dimension (d) are projected into a lower-dimensional
#space (k << d, here k = 2) with a random matrix.
from sklearn.random_projection import GaussianRandomProjection
GRP = GaussianRandomProjection(n_components=2, random_state=20)
#random_state seeds the random projection matrix
GRP.fit(x)
X_grd = GRP.transform(x)
plotgraph('GRP',X_grd)
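
# A small illustrative aside: the distance-preservation guarantee behind such a
# random projection is the Johnson-Lindenstrauss lemma, and scikit-learn exposes
# the corresponding bound as johnson_lindenstrauss_min_dim.
from sklearn.random_projection import johnson_lindenstrauss_min_dim

for eps_demo in (0.1, 0.3, 0.5):
    k_min = johnson_lindenstrauss_min_dim(n_samples=1000, eps=eps_demo)
    print('eps=%.1f -> minimum safe n_components=%d' % (eps_demo, k_min))
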
Example #8
# In[44]:

ids = test.reset_index()['ID']

# In[45]:

from sklearn.decomposition import FactorAnalysis
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection

X_fa = fa.transform(test)

X_srp = srp.transform(test)

X_grp = grp.transform(test)

X_added = pd.concat([
    pd.DataFrame(X_fa),
    pd.DataFrame(X_srp),
    pd.DataFrame(X_grp),
],
                    axis=1)

y_pred = gbm.predict(X_added)
y_pred

# In[46]:

y_pred = np.exp(y_pred) - 1
Example #9
labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
plt.legend(lines, labels)
plt.title('visualization of data in 2D (rp)-> digits dataset')
plt.show()

r = np.array([7, 17, 37, 57, 77])
plt.figure()
for m in range(5):

    x1 = []
    # e1=[]
    r1 = []
    for i in range(4, 64, 10):
        rp1 = GaussianRandomProjection(n_components=i, random_state=r[m])
        rp1.fit(X)
        X1 = rp1.transform(X)
        print(X1.shape, rp1.components_.shape)
        X2 = np.dot(X1, (rp1.components_))
        rmse = np.sqrt(mean_squared_error(X, X2))
        x1.append(i)
        r1.append(rmse)

    r1 = np.array(r1)
    x1 = np.array(x1)

    print(r1)

    if m == 1:
        plt.plot(x1, r1, color='r', label=1)
    if m == 2:
        plt.plot(x1, r1, color='g', label=2)
Example #10
tsvd_results_test = tsvd.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    train['pca_' + str(i)] = pca2_results_train[:,i-1]
    test['pca_' + str(i)] = pca2_results_test[:, i-1]

    train['ica_' + str(i)] = ica2_results_train[:,i-1]
    test['ica_' + str(i)] = ica2_results_test[:, i-1]

    train['tsvd_' + str(i)] = tsvd_results_train[:,i-1]
Example #11
df_ica = pd.DataFrame(ica.fit_transform(train), columns=columns)
df_test_ica = pd.DataFrame(ica.transform(test), columns=columns)

# Truncated SVD
columns = ['TSVD_{}'.format(i) for i in range(n_components)]
tsvd = TruncatedSVD(n_components=n_components, random_state=420)
df_tsvd = pd.DataFrame(tsvd.fit_transform(train), columns=columns)
df_test_tsvd = pd.DataFrame(tsvd.transform(test), columns=columns)

# GRP
columns = ['GRP_{}'.format(i) for i in range(n_components)]
grp = GaussianRandomProjection(n_components=n_components,
                               eps=0.1,
                               random_state=420)
df_grp = pd.DataFrame(grp.fit_transform(train), columns=columns)
df_test_grp = pd.DataFrame(grp.transform(test), columns=columns)

# SRP
columns = ['SRP_{}'.format(i) for i in range(n_components)]
srp = SparseRandomProjection(n_components=n_components,
                             dense_output=True,
                             random_state=420)
df_srp = pd.DataFrame(srp.fit_transform(train), columns=columns)
df_test_srp = pd.DataFrame(srp.transform(test), columns=columns)

train = pd.concat([train, df_pca, df_ica, df_tsvd, df_grp, df_srp], axis=1)
test = pd.concat(
    [test, df_test_pca, df_test_ica, df_test_tsvd, df_test_grp, df_test_srp],
    axis=1)

### Grid Search
Example #12
start_time = time.time()

# Load the data
from income_data import X_train, X_test, y_train, y_test

# Scale the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_toTransform = X_train

# Reduce Dimensionality
projection = ProjectionAlgorithm(n_components=22)
X_transformed = projection.fit_transform(X_train)
X_testTransformed = projection.transform(X_test)

# Run em clustering with 2 clusters and plot
cluster = GaussianMixture(random_state=0, n_components=99).fit(X_transformed)
cluster_labels = cluster.predict(X_transformed)
X_transformed = np.dot(X_transformed, np.transpose(cluster.means_))
X_testTransformed = np.dot(X_testTransformed, np.transpose(cluster.means_))

# Define the classifier
nn = MLPClassifier(solver='lbfgs',
                   random_state=1,
                   alpha=0.005,
                   hidden_layer_sizes=3)
grid_params = {'alpha': [0.005], 'hidden_layer_sizes': [3]}
clf = GridSearchCV(nn, param_grid=grid_params, cv=3)
def DecomposedFeatures(train,
                       test,
                       val,
                       total,
                       addtrain,
                       addtest,
                       use_pca=0.0,
                       use_tsvd=0.0,
                       use_ica=0.0,
                       use_fa=0.0,
                       use_grp=0.0,
                       use_srp=0.0,
                       use_KPCA=0.0,
                       kernal="rbf"):
    print("\nStart decomposition process...")
    train_decomposed = []
    test_decomposed = []
    val_decomposed = []

    if addtrain is not None:
        train_decomposed = [addtrain]
        val_decomposed = [addtrain]
    if addtest is not None:
        test_decomposed = [addtest]

    if use_pca > 0.0:
        print("PCA")
        N_COMP = int(use_pca * train.shape[1]) + 1
        pca = PCA(n_components=N_COMP,
                  whiten=True,
                  svd_solver="full",
                  random_state=42)
        pca_results = pca.fit(total)
        pca_results_train = pca.transform(train)
        pca_results_test = pca.transform(test)
        pca_results_val = pca.transform(val)
        train_decomposed.append(pca_results_train)
        test_decomposed.append(pca_results_test)
        val_decomposed.append(pca_results_val)

    if use_tsvd > 0.0:
        print("tSVD")
        N_COMP = int(use_tsvd * train.shape[1]) + 1
        tsvd = TruncatedSVD(n_components=N_COMP, random_state=42)
        tsvd_results = tsvd.fit(total)
        tsvd_results_train = tsvd.transform(train)
        tsvd_results_test = tsvd.transform(test)
        tsvd_results_val = tsvd.transform(val)

        train_decomposed.append(tsvd_results_train)
        test_decomposed.append(tsvd_results_test)
        val_decomposed.append(tsvd_results_val)

    if use_ica > 0.0:
        print("ICA")
        N_COMP = int(use_ica * train.shape[1]) + 1
        ica = FastICA(n_components=N_COMP, random_state=42)
        ica_results = ica.fit(total)
        ica_results_train = ica.transform(train)
        ica_results_test = ica.transform(test)
        ica_results_val = ica.transform(val)

        train_decomposed.append(ica_results_train)
        test_decomposed.append(ica_results_test)
        val_decomposed.append(ica_results_val)

    if use_fa > 0.0:
        print("FA")
        N_COMP = int(use_fa * train.shape[1]) + 1
        fa = FactorAnalysis(n_components=N_COMP, random_state=42)
        fa_results = fa.fit(total)
        fa_results_train = fa.transform(train)
        fa_results_test = fa.transform(test)
        fa_results_val = fa.transform(val)

        train_decomposed.append(fa_results_train)
        test_decomposed.append(fa_results_test)
        val_decomposed.append(fa_results_val)

    if use_grp > 0.0 or use_grp < 0.0:
        print("GRP")
        if use_grp > 0.0:
            N_COMP = int(use_grp * train.shape[1]) + 1
            eps = 10
        if use_grp < 0.0:
            N_COMP = "auto"
            eps = abs(use_grp)
        grp = GaussianRandomProjection(n_components=N_COMP,
                                       eps=eps,
                                       random_state=42)
        grp_results = grp.fit(total)
        grp_results_train = grp.transform(train)
        grp_results_test = grp.transform(test)
        grp_results_val = grp.transform(val)

        train_decomposed.append(grp_results_train)
        test_decomposed.append(grp_results_test)
        val_decomposed.append(grp_results_val)

    if use_srp > 0.0:
        print("SRP")
        N_COMP = int(use_srp * train.shape[1]) + 1
        srp = SparseRandomProjection(n_components=N_COMP,
                                     dense_output=True,
                                     random_state=42)
        srp_results = srp.fit(total)
        srp_results_train = srp.transform(train)
        srp_results_test = srp.transform(test)
        srp_results_val = srp.transform(val)

        train_decomposed.append(srp_results_train)
        test_decomposed.append(srp_results_test)
        val_decomposed.append(srp_results_val)

    if use_KPCA > 0.0:
        print("KPCA")
        N_COMP = int(use_KPCA * train.shape[1]) + 1
        #N_COMP = None
        pls = KernelPCA(n_components=N_COMP, kernel=kernal)
        pls_results = pls.fit(total)
        pls_results_train = pls.transform(train)
        pls_results_test = pls.transform(test)
        pls_results_val = pls.transform(val)
        train_decomposed.append(pls_results_train)
        test_decomposed.append(pls_results_test)
        val_decomposed.append(pls_results_val)
        gc.collect()

    print("Append decomposition components together...")

    train_decomposed = np.concatenate(train_decomposed, axis=1)
    test_decomposed = np.concatenate(test_decomposed, axis=1)
    val_decomposed = np.concatenate(val_decomposed, axis=1)

    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)
    val_with_only_decomposed_features = pd.DataFrame(val_decomposed)

    #for agg_col in ['sum', 'var', 'mean', 'median', 'std', 'weight_count', 'count_non_0', 'num_different', 'max', 'min']:
    #    train_with_only_decomposed_features[col] = train[col]
    #    test_with_only_decomposed_features[col] = test[col]

    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(
        0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(
        0)
    val_with_only_decomposed_features = val_with_only_decomposed_features.fillna(
        0)
    return train_with_only_decomposed_features, test_with_only_decomposed_features, val_with_only_decomposed_features
Example #14
pca_results_test = pca.transform(test[flist])

print("tSVD")
tsvd = TruncatedSVD(n_components=N_COMP, random_state=random_state)
tsvd_results_train = tsvd.fit_transform(train[flist])
tsvd_results_test = tsvd.transform(test[flist])

print("ICA")
ica = FastICA(n_components=N_COMP, random_state=random_state)
ica_results_train = ica.fit_transform(train[flist])
ica_results_test = ica.transform(test[flist])

print("GRP")
grp = GaussianRandomProjection(n_components=N_COMP, eps=0.1, random_state=random_state)
grp_results_train = grp.fit_transform(train[flist])
grp_results_test = grp.transform(test[flist])

print("SRP")
srp = SparseRandomProjection(n_components=N_COMP, dense_output=True, random_state=random_state)
srp_results_train = srp.fit_transform(train[flist])
srp_results_test = srp.transform(test[flist])

print("FA")
fa = FactorAnalysis(n_components=N_COMP, random_state=random_state)
fa_results_train = fa.fit_transform(train[flist])
fa_results_test = fa.transform(test[flist])


print("Append decomposition components to datasets...")
for i in range(1, N_COMP + 1):
    train['pca_' + str(i)] = pca_results_train[:, i - 1]
def run_grp(n_c, X_train, X_test, y_train, y_test):
    from sklearn.random_projection import GaussianRandomProjection
    grp = GaussianRandomProjection(n_components = n_c, eps = 0.1)
    X_train = grp.fit_transform(X_train, y_train)
    X_test = grp.transform(X_test)
    return [X_train, X_test]
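
# A minimal usage sketch for run_grp on synthetic data. Note that
# GaussianRandomProjection ignores the y passed to fit_transform; the
# projection matrix is drawn independently of the data.
import numpy as np

rng = np.random.RandomState(42)
X_tr_demo, X_te_demo = rng.rand(100, 30), rng.rand(50, 30)
y_tr_demo, y_te_demo = rng.randint(0, 2, 100), rng.randint(0, 2, 50)

X_tr_rp, X_te_rp = run_grp(8, X_tr_demo, X_te_demo, y_tr_demo, y_te_demo)
print(X_tr_rp.shape, X_te_rp.shape)  # expected: (100, 8) (50, 8)
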
Example #16
class PISFPA_GRP:
    def __init__(self,
                 num_input_nodes,
                 num_hidden_units,
                 num_output_units,
                 activation='sigmoid',
                 loss='mse',
                 beta_init=None,
                 w_init=None,
                 bias_init=None):
        self._num_input_nodes = num_input_nodes
        self._num_hidden_units = num_hidden_units
        self._num_output_units = num_output_units

        self._activation = getActivation(activation)
        self._loss = getLoss(loss)
        self._beta = None
        self._w = None
        self.GRP = GaussianRandomProjection(n_components=num_hidden_units)
        #weight_out
        self._bias = np.random.uniform(-1.,
                                       1.,
                                       size=(self._num_hidden_units, ))

    def fit(self, X, Y, itrs, lam, display_time=False):

        self._w = self.GRP.fit_transform(X)
        H = self._activation(self._w + self._bias)
        if display_time:
            start = time.time()

        L = 1. / np.max(np.linalg.eigvals(np.dot(H.T, H))).real
        m = H.shape[1]
        n = Y.shape[1]
        x0 = np.zeros((m, n))
        x1 = np.zeros((m, n))
        L1 = 2 * L * np.dot(H.T, H)
        L2 = 2 * L * np.dot(H.T, Y)

        for i in range(1, itrs + 1):
            cn = ((2e-6 * i) / (2 * i + 1)) * lam * L
            beta = 0.9 * i / (i + 1)
            alpha = 0.9 * i / (i + 1)

            y = x1 + thetan(x0, x1, i) * (x1 - x0)
            z = (1 - beta) * x1 + beta * T(x1, L1, L2, cn)

            Ty = T(y, L1, L2, cn)
            Tz = T(z, L1, L2, cn)
            x = (1 - alpha) * Ty + alpha * Tz

            x0, x1 = x1, x

        if display_time:
            stop = time.time()
            print(f'Train time: {stop-start}')

        self._beta = x

    def __call__(self, X):
        w = self.GRP.transform(X)
        H = self._activation(w + self._bias)
        return H.dot(self._beta)

    def evaluate(self, X, Y):
        pred = self(X)

        loss = self._loss(Y, pred)

        acc = np.sum(
            np.argmax(pred, axis=-1) == np.argmax(Y, axis=-1)) / len(Y)

        return loss, acc
def applyGRP(data, nc, new_data):
    grp = GaussianRandomProjection(n_components=nc, random_state=79)
    grp.fit(data) 
    return grp.transform(new_data)
Example #18
def run(_train, _test):
    for c in categorical_columns:
        train = _train.copy()
        test = _test.copy()

        enc = LabelBinarizer()
        enc.fit(list(train[c].values) + list(test[c].values))
        encoded = pd.DataFrame(enc.transform(list(train[c].values)))
        train = pd.concat([train, encoded], axis=1)
        encoded = pd.DataFrame(enc.transform(list(test[c].values)))
        test = pd.concat([test, encoded], axis=1)
        train = train.drop(c, axis=1)
        test = test.drop(c, axis=1)

        ##Add decomposed components: PCA / ICA etc.
        from sklearn.decomposition import PCA, FastICA
        from sklearn.decomposition import TruncatedSVD

        n_comp = 12

        # tSVD
        tsvd = TruncatedSVD(n_components=n_comp, random_state=400)
        tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
        tsvd_results_test = tsvd.transform(test)

        # PCA
        pca = PCA(n_components=n_comp)# random_state=400)
        pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
        pca2_results_test = pca.transform(test)

        # ICA
        ica = FastICA(n_components=n_comp)#, random_state=400)
        ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
        ica2_results_test = ica.transform(test)

        # GRP
        grp = GaussianRandomProjection(n_components=n_comp, eps=0.007, random_state=400)
        grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
        grp_results_test = grp.transform(test)

        # SRP
        srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=400)
        srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
        srp_results_test = srp.transform(test)

        # Append decomposition components to datasets
        for i in range(1, n_comp + 1):
            train['pca_' + str(i)] = pca2_results_train[:, i - 1]
            test['pca_' + str(i)] = pca2_results_test[:, i - 1]

            train['ica_' + str(i)] = ica2_results_train[:, i - 1]
            test['ica_' + str(i)] = ica2_results_test[:, i - 1]

            train['grp_' + str(i)] = grp_results_train[:, i - 1]
            test['grp_' + str(i)] = grp_results_test[:, i - 1]

            train['srp_' + str(i)] = srp_results_train[:, i - 1]
            test['srp_' + str(i)] = srp_results_test[:, i - 1]

            train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
            test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        y_train = train["y"]
        train = train.drop(["y"], axis=1)

        print(c)
        cv(train, y_train)
mse_df = pd.DataFrame(columns=[
    "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
    "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26",
    "27", "28", "29", "30"
])
transformed_df = pd.DataFrame(columns=[
    "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
    "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26",
    "27", "28", "29", "30"
])

for i in range(500):
    rp = GaussianRandomProjection(n_components=28)
    rp.fit(df)
    transformed_instances = rp.transform(df)
    inverse_components = np.linalg.pinv(rp.components_)
    reconstructed_instances = utils.extmath.safe_sparse_dot(
        transformed_instances, inverse_components.T)
    print(reconstructed_instances)
    break
    #mse = ((df - reconstructed_instances) ** 2).mean()
    #mse_df.loc[i] = mse
    new_df = pd.DataFrame(reconstructed_instances,
                          columns=[
                              "1", "2", "3", "4", "5", "6", "7", "8", "9",
                              "10", "11", "12", "13", "14", "15", "16", "17",
                              "18", "19", "20", "21", "22", "23", "24", "25",
                              "26", "27", "28", "29", "30"
                          ])
    transformed_df = pd.concat([transformed_df, new_df])
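
# A minimal sketch of the per-feature reconstruction error that the
# commented-out mse lines above appear to intend, using a synthetic stand-in
# for df.
import numpy as np
from sklearn.random_projection import GaussianRandomProjection

rng = np.random.RandomState(0)
df_demo = rng.normal(size=(200, 30))

rp_demo = GaussianRandomProjection(n_components=28, random_state=0)
projected = rp_demo.fit_transform(df_demo)

# reconstruct through the Moore-Penrose pseudo-inverse of the projection matrix
reconstructed = projected @ np.linalg.pinv(rp_demo.components_).T
mse_per_feature = ((df_demo - reconstructed) ** 2).mean(axis=0)
print(mse_per_feature.round(4))
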
def select_features_GaussianRandomProjections(train_X, train_y, test_X, k):
    selector = GaussianRandomProjection(n_components=k, random_state=42)
    selector.fit(train_X)
    train_X = selector.transform(train_X)
    test_X = selector.transform(test_X)
    return train_X, test_X
tit_cols = list(ten_features.columns)
print(tit_cols)

# RCA

sil_em = []
rca_models = []
em_models = []

ks = range(2, 20)
for dim in range(1, len(tit_cols)):
    sil_em.append([])
    rca = GaussianRandomProjection(n_components=dim)
    rca.fit(X_train)
    rca_models.append(rca)
    rca_X_train = rca.transform(X_train)
    em_models.append([])
    for k in ks:
        em = GaussianMixture(n_components=k)
        em.fit(rca_X_train)
        em_models[-1].append(em)
        sil_em[-1].append(
            silhouette_score(rca_X_train, em.predict(rca_X_train)))

np.savetxt('rca_sil_em_tennis', sil_em)

if False:
    for j in range(len(tit_cols) - 1):
        plt.plot(ks, sil_em[j], linestyle='-', marker='o', label=str(j + 1))
    plt.title("RCA EM silhouette curves - Tennis")
    plt.xlabel("number of clusters")
					champion_games[key['champ']] [(key['patch'], region, tier)] += build['value']

items_json = []
for champ in champion_builds.keys():

	# perform GaussianRandomProjection
	all_builds = []
	for key in champion_builds[champ]:
		all_builds += champion_builds[champ][key]

	grp = GaussianRandomProjection(2, random_state = 0)
	grp.fit(all_builds)

	for key in champion_builds[champ]:
		builds = champion_builds[champ][key]
		reduction = grp.transform(builds)

		# get top 100 builds
		zipped = zip(list(reduction), build_games[champ][key], build_objects[champ][key])
		sorted_zipped = sorted(zipped, key=lambda x: x[1], reverse=True)
		top_builds = sorted_zipped[0:100]

		builds_json = []
		for i in top_builds:
			x = list(i[0])[0]
			y = list(i[0])[1]
			builds_json.append( {
				"champ":champ,
				"patch":key[0],
				"region":key[1],
				"tier":key[2],
Example #23
# We select only sci.crypt category
# Other categories include
# 'sci.med', 'sci.space' ,'soc.religion.christian'
cat = ['sci.crypt']
data = fetch_20newsgroups(categories=cat)

# Create a term document matrix, with term frequencies as the values
# from the above dataset.
vectorizer = TfidfVectorizer(use_idf=False)
vector = vectorizer.fit_transform(data.data)

# Perform the projection. In this case we reduce the dimension to 1000
gauss_proj = GaussianRandomProjection(n_components=1000)
gauss_proj.fit(vector)
# Transform the original data to the new space
vector_t = gauss_proj.transform(vector)

# print transformed vector shape
print(vector.shape)
print(vector_t.shape)

# To validate if the transformation has preserved the distance, we calculate the old and the new distance between the points
org_dist = euclidean_distances(vector)
red_dist = euclidean_distances(vector_t)

diff_dist = abs(org_dist - red_dist)

# We take the difference between these points and plot them
# as a heatmap (only the first 100 documents).
plt.figure()
plt.pcolor(diff_dist[0:100, 0:100])
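
# A short follow-up sketch that summarizes the distortion numerically rather
# than only plotting it, reusing org_dist and red_dist computed above.
import numpy as np

nonzero = org_dist > 0                         # skip zero self-distances
ratios = red_dist[nonzero] / org_dist[nonzero]
print('mean projected/original distance ratio: %.3f' % ratios.mean())
print('std of the ratio: %.3f' % ratios.std())
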
Example #24
def main():
    """
    Main method for the script.
    """
    dataset = Dataset(file_path=DATASET_PATHS[CONFIG.dataset])
    df = dataset.get_dataframe()

    # remove columns that are constant values
    metric_headers = dataset.get_metric_headers()
    constant_headers = []
    variable_headers = []
    for header in metric_headers:
        if np.unique(df[header].values).size > 1:
            variable_headers.append(header)
        else:
            constant_headers.append(header)

    metric_headers = variable_headers
    dataset = Dataset(dataframe=df.drop(constant_headers, axis=1))
    raw_metrics = dataset.get_metrics()
    metrics = raw_metrics.T

    # factor analysis
    LOG.info('Starting factor analysis with %s factors...', CONFIG.num_factors)
    start = time()
    # model = FactorAnalysis(n_components=CONFIG.num_factors)
    # factors = model.fit_transform(metrics)  # num_metrics * num_factors
    rng = np.random.RandomState(74)
    model = GaussianRandomProjection(eps=0.999, random_state=rng)
    factors = model.fit_transform(metrics)
    LOG.debug('Dimension before factor analysis: %s', metrics.shape)
    LOG.debug('Dimension after factor analysis: %s', factors.shape)
    LOG.info('Finished factor analysis in %s seconds.', round(time() - start))

    # clustering
    if CONFIG.model == 'kmeans':
        model = build_k_means(factors)
    elif CONFIG.model == 'kmedoids':
        model = build_k_medoids(factors)
    else:
        raise ValueError('Unrecognized model: %s' % CONFIG.model)

    # find cluster center
    labels = model.labels_
    # each dimension in transformed_data is the distance to the cluster
    # centers.
    transformed_data = model.transform(factors)
    leftover_metrics = []
    for i in np.unique(labels):
        # index of the points for the ith cluster
        cluster_member_idx = np.argwhere(labels == i).squeeze(1)
        cluster_members = transformed_data[cluster_member_idx]
        # find the index of the minimum-distance point to the center
        closest_member = cluster_member_idx[np.argmin(cluster_members[:, i])]
        leftover_metrics.append(metric_headers[closest_member])

    # latency needs to be in the metrics
    if 'latency' not in leftover_metrics:
        leftover_metrics += ['latency']

    with open(CONFIG.output_path, 'w') as file:
        file.writelines('\n'.join(leftover_metrics))
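
# A standalone sketch of the closest-member selection above: KMeans.transform
# returns each sample's distance to every cluster center, so an argmin down the
# corresponding column picks one representative per cluster (shown here on
# synthetic data).
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
points = rng.normal(size=(60, 4))

km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(points)
distances = km.transform(points)              # shape (n_samples, n_clusters)

for cluster_id in np.unique(km.labels_):
    members = np.argwhere(km.labels_ == cluster_id).squeeze(1)
    representative = members[np.argmin(distances[members, cluster_id])]
    print('cluster %d representative: sample %d' % (cluster_id, representative))
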
Example #25
from sklearn.decomposition import TruncatedSVD as TruncSVD
tsvd = TruncSVD(n_components=num_components,  algorithm='randomized', random_state=0)
tsvd_transformed_data_train = tsvd.fit_transform(sparse_trainData)
tsvd_transformed_data_valid = tsvd.transform(sparse_validData)

# Perform Randomized Principal Components Analysis (PCA)
from sklearn.decomposition import PCA
rpca = PCA(n_components=num_components, svd_solver='randomized')
rpca_transformed_data_train = rpca.fit_transform(dense_trainData)
rpca_transformed_data_valid = rpca.transform(dense_validData)

# Perform Gaussian Random Projection
from sklearn.random_projection import GaussianRandomProjection as GaussRan
grp = GaussRan(n_components=num_components)
grp_transformed_data_train = grp.fit_transform(dense_trainData)
grp_transformed_data_valid = grp.transform(dense_validData)

# Perform Sparse Random Projection
from sklearn.random_projection import SparseRandomProjection as SparseRan
srp = SparseRan(n_components=num_components, random_state=0)
srp_transformed_data_train = srp.fit_transform(dense_trainData)
srp_transformed_data_valid = srp.transform(dense_validData)

# Perform classification using 1-Nearest Neighbor Classifier
from sklearn.neighbors import KNeighborsClassifier

# Create a subset grid to plot performance against numbers of components
tsvd_max = tsvd_transformed_data_train.shape[1]
plot_subset = []
length_of_plot_subset = len(plot_subset)
if tsvd_max < 101:
Example #26
rtest_data.append(all_reduced[reserve:, ])
rmethod.append("pca")

# ------------------------------------------------------------------------
ica = FastICA(max_iter=500, n_components=comps, random_state=1)
ica.fit(train_data)
all_reduced = ica.transform(train_data)
rdata.append(all_reduced[0:data_n - reserve, ])
rtest_data.append(all_reduced[reserve:, ])
rmethod.append("ica")

# ------------------------------------------------------------------------
grp = GaussianRandomProjection(n_components=comps, eps=0.1,
                               random_state=1)  # reduce data to n components
grp.fit(train_data)
all_reduced = grp.transform(train_data)
rdata.append(all_reduced[0:data_n - reserve, ])
rtest_data.append(all_reduced[reserve:, ])
rmethod.append("grp")

# ------ build the classifier, then run the various data through the neural net -----------------------------------
print_header()

for si in range(len(solvers)):
    solver = solvers[si]

    cfier = MLPClassifier(validation_fraction=.30,
                          max_iter=max_iters,
                          learning_rate=opts["learn_rate"],
                          solver=solver)
    ts_start = time()
Example #27
y2 = list(dataset2["y"])
scaler = StandardScaler()
scaler.fit(x2)
x2_n = scaler.transform(x2)
#<---------------------DATASET2

#RANDOMIZED PROJECTION ITER

for name, data, y in [['student set', x1, y1],
                      ['student set normalized', x1_n, y1],
                      ['bank set', x2, y2], ['bank set normalized', x2_n, y2]]:
    data = np.array(data)
    varians = []
    for i in range(200):
        rp = GRP(n_components=8, random_state=None).fit(data)
        newdata = rp.transform(data)
        variance = np.var(newdata)
        varians.append(variance)
        data = newdata
    percentvars = np.array(varians) / varians[0]
    pyplot.plot(percentvars, linewidth=1.5, label=name)
pyplot.plot(np.tile(1, 200), 'k--', linewidth=1, label=('start variance'))
pyplot.title('Variance in RP self iteration \n (ratio to the first run)')
pyplot.xlabel('rp iterations')
pyplot.ylabel("variance ratio")
pyplot.legend()
pyplot.show()

#RANDOMIZED PROJECTION KM AND EM PERFORMANCE
"""
for name, data,y in [['student set',x1,y1],['student set normalized',x1_n,y1],['bank set',x2,y2],['bank set normalized',x2_n,y2]]:
n_components = 'auto'
eps = 0.5
random_state = 2018

# Create the estimator instance
GRP = GaussianRandomProjection(n_components=n_components, eps=eps,
                               random_state=random_state)

# Inspect the estimator's parameters
vars(GRP)

# Fit the estimator to the training data
GRP.fit(X_train)

# Apply the fitted estimator (transform)
X_train_GRP = GRP.transform(X_train)

# Convert the result to a DataFrame
X_train_GRP = pd.DataFrame(data=X_train_GRP, index=train_index)

# Show the scatter plot
scatterPlot(X_train_GRP, y_train, "Gaussian Random Projection")


# 2 Sparse Random Projection -------------------------------------------------------------------

# Set the parameters
n_components = 'auto'
density = 'auto'
eps = 0.5
dense_output = False
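
# A minimal sketch completing the sparse random projection step in the same
# pattern as the GRP block above, assuming X_train, train_index, y_train,
# scatterPlot and random_state exist exactly as in the surrounding code.
from sklearn.random_projection import SparseRandomProjection

SRP = SparseRandomProjection(n_components=n_components, density=density,
                             eps=eps, dense_output=dense_output,
                             random_state=random_state)

SRP.fit(X_train)
X_train_SRP = SRP.transform(X_train)
X_train_SRP = pd.DataFrame(data=X_train_SRP, index=train_index)

scatterPlot(X_train_SRP, y_train, "Sparse Random Projection")
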
Example #29
def train_NN_RP(filename,
                X_train,
                X_test,
                y_train,
                y_test,
                debug=False,
                numFolds=10,
                njobs=-1,
                scalar=1,
                make_graphs=False,
                pNN={},
                nolegend=False,
                random_seed=1,
                num_dim=4):
    np.random.seed(random_seed)
    algo = 'RP' + str(num_dim)

    start = time.time()
    rp = GRP(n_components=num_dim, random_state=random_seed)
    rp.fit(X_train)
    X_train = rp.transform(X_train)
    X_test = rp.transform(X_test)

    param_grid = [{
        'hidden_layer_sizes': [(512, 512, 512, 512)],
        'activation': ['relu'],  # 'identity',
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
        'batch_size': ['auto'],
        'learning_rate_init': [0.001, 0.01],
        'max_iter': [10000],
        'warm_start': [True],
        'early_stopping': [True],
        'random_state': [1]
    }]

    nn_classifier = MLPClassifier()

    grid_search = GridSearchCV(nn_classifier,
                               param_grid,
                               cv=numFolds,
                               scoring='roc_auc_ovr_weighted',
                               return_train_score=True,
                               n_jobs=njobs,
                               verbose=debug)
    grid_search.fit(X_train, y_train)
    cvres = grid_search.cv_results_

    util.save_gridsearch_to_csv(cvres, algo,
                                filename[:-4] + '-' + str(num_dim), scalar, '')

    start = time.time()
    nn_classifier.fit(X_train, y_train)
    print('NN Fit Time: ', time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train,
                                y_prob,
                                multi_class="ovr",
                                average="weighted")
    print('NN Train Score Time: ', train_score, time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test,
                               y_prob,
                               multi_class="ovr",
                               average="weighted")
    print('NN Test Score Time: ', test_score, time.time() - start)

    test_class = MLPClassifier()
    test_class.set_params(**pNN)

    if make_graphs:
        # computer Model Complexity/Validation curves
        util.plot_learning_curve(nn_classifier,
                                 algo,
                                 filename[:-4],
                                 X_train,
                                 y_train,
                                 ylim=(0.0, 1.05),
                                 cv=10,
                                 n_jobs=njobs,
                                 debug=debug)

        # util.compute_vc(algo, 'alpha',
        #               [0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500,
        #                1000, 5000, 10000, 100000, 1000000], X_train, y_train, X_test, y_test, nn_classifier,
        #               filename[:-4], test_class, pNN, log=True, njobs=njobs, debug=debug)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
plt.legend(loc='best')
plt.show()

#KNN baseline
knn = KNeighborsClassifier(n_neighbors=3)  
knn.fit(X_train, y_train)
ori_train_score = knn.score(X_train, y_train)
ori_test_score = knn.score(X_test, y_test)

#RP
train_scores=[]
test_scores=[]
for component in range(1, len(X_train[0])+1):
    grp = GaussianRandomProjection(n_components=component, random_state=1)
    X_train_reduced = grp.fit_transform(X_train)
    X_test_reduced = grp.transform(X_test)

    knn = KNeighborsClassifier(n_neighbors=3)  
    knn.fit(X_train_reduced, y_train)
    train_scores.append(knn.score(X_train_reduced, y_train))
    test_scores.append(knn.score(X_test_reduced, y_test))
if dataset_name=='spam':
    drawMultiple([train_scores, test_scores], 'KNN Accuracy over Randomized Projected Components (Spam dataset)', 'Number of Projected Components', 'Accuracy', ['projected train score','projected test score'], range(1, len(X_train[0])+1))
elif dataset_name=='letter':
    drawMultiple([train_scores, test_scores], 'KNN Accuracy over Randomized Projected Components (Letter Recognition dataset)', 'Number of Projected Components', 'Accuracy', ['projected train score','projected test score'], range(1, len(X_train[0])+1))

#FA
train_scores=[]
test_scores=[]
for component in range(1, len(X_train[0])+1):
    fa = FeatureAgglomeration(n_clusters=component)
Example #31
                                                        test_size=0.3,
                                                        random_state=1,
                                                        stratify=yT)
datasetsRCA['Titanic'] = {
    'X_train': XT_train.copy(),
    'y_train': yT_train.copy(),
    'X_test': XT_test.copy(),
    'y_test': yT_test.copy(),
    'X_full': XT,
    'y_full': yT
}
rp = W4.best_estimator_.steps[0][1]
datasetsRCA['Wilt'] = {
    'X_train': rp.fit_transform(XW_train),
    'y_train': yW_train.copy(),
    'X_test': rp.transform(XW_test),
    'y_test': yW_test.copy()
}

#%% Clustering after DR
clusters = [2, 3, 4, 5, 6, 8, 10, 12, 15, 20, 25, 30, 35, 40, 50]
scoresRCA = hlp.explore_clustering(datasetsRCA, clusters)
pd.DataFrame(scores)

hlp.plot_CE([scores], clusters, 'DR using RCA (RP)')
hlp.plot_CE([scoresRCA, scores], clusters, 'DR using RCA (RP) vs baseline')
plt.suptitle(
    'ICA analysis - (solid) 4 components [Titanic] or 2 [Wilt]  (dotted) 7 [Titanic] or 4 [Wilt]'
)
# Plot Silhouettes
from silhouette import plot_silhouettes
''' Decomposition '''
pca = PCA(12, random_state=0)
pca_train = pca.fit_transform(train, y_train)
pca_test = pca.transform(test)

ica = FastICA(12, random_state=0)
ica_train = ica.fit_transform(train, y_train)
ica_test = ica.transform(test)

tsvd = TruncatedSVD(12, random_state=0)
tsvd_train = tsvd.fit_transform(train, y_train)
tsvd_test = tsvd.transform(test)

grp = GaussianRandomProjection(12, eps=0.1, random_state=0)
grp_train = grp.fit_transform(train, y_train)
grp_test = grp.transform(test)

srp = SparseRandomProjection(12, dense_output=True, random_state=0)
srp_train = srp.fit_transform(train, y_train)
srp_test = srp.transform(test)

for i in range(12):
    train['pca_' + str(i)] = pca_train.T[i]
    test['pca_' + str(i)] = pca_test.T[i]

    train['ica_' + str(i)] = ica_train.T[i]
    test['ica_' + str(i)] = ica_test.T[i]

    train['tsvd_' + str(i)] = tsvd_train.T[i]
    test['tsvd_' + str(i)] = tsvd_test.T[i]
Example #33
def kMeans():
    # citation: https://realpython.com/k-means-clustering-python/
    digits = load_digits()

    # features
    digits_features = digits.data[:, 0:-1]
    # label
    label = digits.data[:, -1]

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(digits_features)

    # citation: hands on machine learning
    gm = GaussianMixture(covariance_type='spherical',
                         n_components=8,
                         n_init=10)
    gm.fit(scaled_features)
    print("GM Converged", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)

    gm.predict(scaled_features)
    gm.predict_proba(scaled_features)
    gm.score_samples(scaled_features)

    aic = []
    bic = []

    for i in range(21):
        gm = GaussianMixture(covariance_type='spherical',
                             n_components=i + 1,
                             n_init=10)
        gm.fit(scaled_features)
        aic.append(gm.aic(scaled_features))
        bic.append(gm.bic(scaled_features))

    plt.plot(aic, label="AIC")
    plt.plot(bic, label="BIC")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Clusters")
    plt.ylabel("Information Criterion")
    plt.legend()
    plt.show()

    # x_centered = digits_features - digits_features.mean(axis=0)
    # U, s, Vt = np.linalg.svd(x_centered)
    # c1 = Vt.T[:, 0]
    # c2 = Vt.T[:, 1]

    # W2 = Vt.T[:, :2]
    # X2D = x_centered.dot(W2)

    # pca = PCA()
    # pca.fit(scaled_features)
    # cumsum = np.cumsum(pca.explained_variance_ratio_)
    # d = np.argmax(cumsum >= 0.95) + 1

    # pca = PCA(n_components=0.95)
    # X_reduced = pca.fit_transform(scaled_features)

    explained_variance = []
    for i in range(63):
        pca = PCA(n_components=i)
        pca.fit(scaled_features)
        cumsum = np.cumsum(pca.explained_variance_ratio_)

    plt.plot(cumsum, label="Explained Variance Ratio")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Dimensions")
    plt.ylabel("Explained Variance Ratio")
    plt.legend()
    plt.show()

    digits_trainingX, digits_testingX, digits_trainingY, digits_testingY = train_test_split(
        digits_features, label)

    # ica
    # citation: https://stackoverflow.com/questions/36566844/pca-projection-and-reconstruction-in-scikit-learn

    error = []

    for i in range(1, 50):
        pca = PCA(n_components=i)
        pca.fit(digits_trainingX)
        U, S, VT = np.linalg.svd(digits_trainingX - digits_trainingX.mean(0))
        x_train_pca = pca.transform(digits_trainingX)
        x_train_pca2 = (digits_trainingX - pca.mean_).dot(pca.components_.T)
        x_projected = pca.inverse_transform(x_train_pca)
        x_projected2 = x_train_pca.dot(pca.components_) + pca.mean_
        loss = ((digits_trainingX - x_projected)**2).mean()
        error.append(loss)

    plt.clf()
    plt.figure(figsize=(15, 15))
    plt.title("reconstruction error")
    plt.plot(error, 'r')
    plt.xticks(range(len(error)), range(1, 50), rotation='vertical')
    plt.xlim([-1, len(error)])
    plt.show()

    clf = MLPClassifier(alpha=0.001,
                        hidden_layer_sizes=(8, ),
                        random_state=1,
                        solver='lbfgs')
    clf.fit(digits_trainingX, digits_trainingY)
    y_pred = clf.predict(digits_testingX)
    print("Accuracy Score Normal", accuracy_score(digits_testingY, y_pred))

    k_acc = []
    k_gm = []
    time_arr = []
    for k in range(1, 15):
        kmeans = KMeans(n_clusters=k)
        X_train = kmeans.fit_transform(digits_trainingX)
        X_test = kmeans.transform(digits_testingX)
        start_time = time.time()
        clf = MLPClassifier(alpha=0.001,
                            hidden_layer_sizes=(8, ),
                            random_state=1,
                            solver='lbfgs')
        clf.fit(X_train, digits_trainingY)
        total_time = time.time() - start_time
        y_pred = clf.predict(X_test)
        score = accuracy_score(digits_testingY, y_pred)
        k_acc.append(score)
        time_arr.append(total_time)

    plt.plot(k_acc, label="K-Means")
    plt.plot(time_arr, label="Computation Time")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("k # of clusters")
    plt.ylabel("NN Accuracy")
    plt.legend()
    plt.show()

    acc = []
    acc_ica = []
    acc_rca = []
    for i in range(1, 40):
        pca = PCA(n_components=i)
        X_train = pca.fit_transform(digits_trainingX)
        X_test = pca.transform(digits_testingX)
        clf = MLPClassifier(alpha=0.001,
                            hidden_layer_sizes=(8, ),
                            random_state=1,
                            solver='lbfgs')
        clf.fit(X_train, digits_trainingY)
        y_pred = clf.predict(X_test)
        score = accuracy_score(digits_testingY, y_pred)
        acc.append(score)

        ica = FastICA(n_components=i)
        x_train_i = ica.fit_transform(digits_trainingX)
        x_test_i = ica.transform(digits_testingX)
        clf.fit(x_train_i, digits_trainingY)
        y_pred_i = clf.predict(x_test_i)
        score_i = accuracy_score(digits_testingY, y_pred_i)
        acc_ica.append(score_i)

        rca = GaussianRandomProjection(n_components=i)
        x_train_r = rca.fit_transform(digits_trainingX)
        x_test_r = rca.transform(digits_testingX)
        clf.fit(x_train_r, digits_trainingY)
        y_pred_r = clf.predict(x_test_r)
        score_r = accuracy_score(digits_testingY, y_pred_r)
        acc_rca.append(score_r)

    plt.plot(acc, label="PCA")
    plt.plot(acc_ica, label="ICA")
    plt.plot(acc_rca, label="RCA")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Components")
    plt.ylabel("NN Accuracy")
    plt.legend()
    plt.show()
    # cumsum = np.cumsum(pca.explained_variance_ratio_)
    # d = np.argmax(cumsum >= 0.95) + 1

    # randomized projections
    rnd_pca = PCA(n_components=50, svd_solver="randomized")
    X_reduced_rand = rnd_pca.fit_transform(scaled_features)

    # citation: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html#sphx-glr-auto-examples-feature-selection-plot-feature-selection-py
    # k best
    scaler = MinMaxScaler()
    digits_indices = np.arange(digits_features.shape[-1])
    scaled_features_norm = scaler.fit_transform(scaled_features)
    k_selected = SelectKBest(f_classif, k=50)
    k_selected.fit(scaled_features_norm, label)
    scores = -np.log10(k_selected.pvalues_)
    plt.bar(digits_indices - .45,
            scores,
            width=.2,
            label=r'Univariate score ($-Log(p_{value})$)')
    plt.xlabel("Features")
    plt.ylabel("F-Score")
    plt.show()

    gm = GaussianMixture(covariance_type='spherical',
                         n_components=8,
                         n_init=10)
    gm.fit(X_reduced_inc)
    print("GM Converged - PCA Inc", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)

    gm.predict(X_reduced_inc)
    gm.predict_proba(X_reduced_inc)
    gm.score_samples(X_reduced_inc)

    kmeans = KMeans(init="random",
                    n_clusters=63,
                    n_init=10,
                    max_iter=300,
                    random_state=42)
    kmeans.fit(scaled_features)

    # the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)

    # final locations of the centroid
    print("KMeans Cluster Centers", kmeans.cluster_centers_)

    # num of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)

    # labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,
    }

    sse = []
    for k in range(1, 63):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)

    kl = KneeLocator(range(1, 63), sse, curve="convex", direction="decreasing")

    # optimal k (number of clusters) for this dataset
    print("Elbow", kl.elbow)

    clf = MLPClassifier(alpha=0.001,
                        hidden_layer_sizes=(8, ),
                        random_state=1,
                        solver='lbfgs')
    clf.fit(digits_trainingX, digits_trainingY)
    y_pred = clf.predict(digits_testingX)

    model = KMeans(n_clusters=5)
    kmeans.fit(scaled_features)
    labels = kmeans.fit_predict(digits_testingX)

    print("Accuracy Score Normal", accuracy_score(digits_testingY, y_pred))
    print("Accuracy Score K-Means", accuracy_score(digits_testingY, labels))

    elbow_visualizer = KElbowVisualizer(model, k=(2, 63))
    elbow_visualizer.fit(digits_features)
    elbow_visualizer.show()

    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(digits_features)
    silhouette_visualizer.show()

    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(digits_features)
    ic_visualizer.show()

    # gmm = GaussianMixture(n_components=7).fit(digits_features)
    # labels = gmm.predict(digits_features)
    # plt.scatter(digits_features[:, 0], digits_features[:, 1], c=labels, s=40, cmap='viridis')
    # plt.show()

    # digits_features_pd = pd.DataFrame(data=digits_features[1:, 1:],
    # index=digits_features[1:,0],
    # columns=digits_features[0,1:])

    # pd.plotting.scatter_matrix(digits_features_pd)

    # probs = GaussianMixture.predict_proba(digits_features)
    # print(probs[:5].round(3))

    kmeans = KMeans(init="random",
                    n_clusters=18,
                    n_init=10,
                    max_iter=300,
                    random_state=42)
    kmeans.fit(X_reduced_inc)

    # the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)

    # final locations of the centroid
    print("KMeans Cluster Centers", kmeans.cluster_centers_)

    # num of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)

    # labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,
    }

    sse = []
    for k in range(1, 18):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)

    kl = KneeLocator(range(1, 18), sse, curve="convex", direction="decreasing")

    # optimal k (number of clusters) for this dataset
    print("Elbow", kl.elbow)

    model = KMeans()
    elbow_visualizer = KElbowVisualizer(model, k=(2, 18))
    elbow_visualizer.fit(X_reduced_inc)
    elbow_visualizer.show()

    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(X_reduced_inc)
    silhouette_visualizer.show()

    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(X_reduced_inc)
    ic_visualizer.show()
def nn2(xs, ys, xs_test, ys_test, n_components, clf_constructor):
    ks = [0 for _ in range(10)]
    cataccs = [0 for _ in range(10)]

    ys = [to_categorical(ys[0]), to_categorical(ys[1])]
    ys_test = [to_categorical(ys_test[0]), to_categorical(ys_test[1])]

    for i in range(2):
        shape = np.shape(xs[i])[1]
        n_components[i] = shape
        model = utils.create_adult_model(
            shape, 2) if i == 0 else utils.create_wine_model(shape, 5)
        model.fit(xs[i][:10000],
                  ys[i][:10000],
                  batch_size=50,
                  epochs=10,
                  verbose=False)
        cataccs[i] = model.evaluate(xs_test[i], ys_test[i],
                                    verbose=False)[1] * 100

    for k in range(2, 11):
        try:
            clf = clf_constructor(n_clusters=k)
        except:
            clf = clf_constructor(n_components=k)
        for i in range(2):
            pca = PCA(n_components=n_components[2 + i])
            transformed = pca.fit_transform(xs[i])
            transformed_test = pca.transform(xs_test[i])
            predict = to_categorical(clf.fit_predict(transformed[:10000]))
            predict_test = to_categorical(clf.predict(
                transformed_test[:10000]))
            input_dims = [n_components[2 + i], k]
            model = utils.create_mi_adult_model(
                input_dims, 2) if i == 0 else utils.create_mi_wine_model(
                    input_dims, 5)
            model.fit([transformed[:10000], predict],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            catacc = model.evaluate([transformed_test, predict_test],
                                    ys_test[i],
                                    verbose=False)[1] * 100
            if catacc > cataccs[2 + i]:
                ks[2 + i] = k
                cataccs[2 + i] = catacc

            ica = FastICA(n_components=n_components[4 + i])
            transformed = ica.fit_transform(xs[i])
            transformed_test = ica.transform(xs_test[i])
            predict = to_categorical(clf.fit_predict(transformed[:10000]))
            predict_test = to_categorical(clf.predict(
                transformed_test[:10000]))
            input_dims = [n_components[4 + i], k]
            model = utils.create_mi_adult_model(
                input_dims, 2) if i == 0 else utils.create_mi_wine_model(
                    input_dims, 5)
            model.fit([transformed[:10000], predict],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            catacc = model.evaluate([transformed_test, predict_test],
                                    ys_test[i],
                                    verbose=False)[1] * 100
            if catacc > cataccs[4 + i]:
                ks[4 + i] = k
                cataccs[4 + i] = catacc

            if i == 1:
                rp = GaussianRandomProjection(eps=0.95)
                transformed = rp.fit_transform(xs[i])
                transformed_test = rp.transform(xs_test[i])
                predict = to_categorical(clf.fit_predict(transformed[:10000]))
                predict_test = to_categorical(
                    clf.predict(transformed_test[:10000]))
                input_dims = [np.shape(transformed)[1], k]
                model = utils.create_mi_wine_model(input_dims, 5)
                model.fit([transformed[:10000], predict],
                          ys[i][:10000],
                          batch_size=50,
                          epochs=10,
                          verbose=False)
                catacc = model.evaluate([transformed_test, predict_test],
                                        ys_test[i],
                                        verbose=False)[1] * 100
                if catacc > cataccs[6 + i]:
                    ks[6 + i] = k
                    cataccs[6 + i] = catacc

            encoder, vae = utils.create_vae(
                np.shape(xs[i])[1], n_components[8 + i])
            vae.fit(xs[i], batch_size=50, epochs=10, verbose=False)
            transformed = encoder.predict(xs[i], verbose=False)
            transformed_test = encoder.predict(xs_test[i], verbose=False)
            predict = to_categorical(clf.fit_predict(transformed[:10000]))
            predict_test = to_categorical(clf.predict(
                transformed_test[:10000]))
            input_dims = [n_components[8 + i], k]
            model = utils.create_mi_adult_model(
                input_dims, 2) if i == 0 else utils.create_mi_wine_model(
                    input_dims, 5)
            model.fit([transformed[:10000], predict],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            catacc = model.evaluate([transformed_test, predict_test],
                                    ys_test[i],
                                    verbose=False)[1] * 100
            if catacc > cataccs[8 + i]:
                ks[8 + i] = k
                cataccs[8 + i] = catacc

    plot.style.use('seaborn-darkgrid')
    plot.title('Influence of feature transformation on the NN accuracy')
    color = []
    for _ in range(5):
        color.append('tab:blue')
        color.append('tab:orange')
    x = []
    count = 1
    for _ in range(5):
        x.append(count)
        count += 0.5
        x.append(count)
        count += 1
    plot.bar(x, cataccs, color=color, width=0.75)
    x = []
    count = 1.25
    for _ in range(5):
        x.append(count)
        count += 1.5
    plot.xticks(x, ['None', 'PCA', 'ICA', 'RP', 'VAE'])
    plot.xlabel('Feature transformation method')
    plot.ylabel('Categorical accuracy (%)')
    plot.show()
Beispiel #35
0
def gen_feature(train, test):
    train = pd.DataFrame(train)
    test = pd.DataFrame(test)

    n_comp = 15
    drop_list = []
    test_drop_list = []

    print(train.drop(drop_list, axis=1).shape, test.drop(test_drop_list, axis=1).shape)
    print('tSVD', datetime.now() - start)
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1))
    tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1))

    print('PCA', datetime.now() - start)
    # PCA
    pca = PCA(n_components=n_comp)
    pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1))
    pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1))

    print('ICA', datetime.now() - start)
    # ICA
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1))
    ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1))

    print('GRP', datetime.now() - start)
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1))
    grp_results_test = grp.transform(test.drop(test_drop_list, axis=1))

    print('SRP', datetime.now() - start)
    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1))
    srp_results_test = srp.transform(test.drop(test_drop_list, axis=1))

    # MCA
    # res_mca = MCA(train, ncp=n_comp, graph = FALSE)

    # save columns list before adding the decomposition components

    usable_columns = list(set(train.columns) - set(drop_list))

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    return train, test
def perform_feature_engineering(train, test, config):

    for c in train.columns:
        if (len(train[c].value_counts()) == 2):
            if (train[c].mean() < config['SparseThreshold']):
                del train[c]
                del test[c]

    col = list(test.columns)
    if config['ID'] != True:
        col.remove('ID')

    # tSVD
    if (config['tSVD'] == True):
        tsvd = TruncatedSVD(n_components=config['n_comp'])
        tsvd_results_train = tsvd.fit_transform(train[col])
        tsvd_results_test = tsvd.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
            test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
    # PCA
    if (config['PCA'] == True):
        pca = PCA(n_components=config['n_comp'])
        pca2_results_train = pca.fit_transform(train[col])
        pca2_results_test = pca.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['pca_' + str(i)] = pca2_results_train[:, i - 1]
            test['pca_' + str(i)] = pca2_results_test[:, i - 1]
    # ICA
    if (config['ICA'] == True):
        ica = FastICA(n_components=config['n_comp'])
        ica2_results_train = ica.fit_transform(train[col])
        ica2_results_test = ica.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['ica_' + str(i)] = ica2_results_train[:, i - 1]
            test['ica_' + str(i)] = ica2_results_test[:, i - 1]

    # GRP
    if (config['GRP'] == True):
        grp = GaussianRandomProjection(n_components=config['n_comp'], eps=0.1)
        grp_results_train = grp.fit_transform(train[col])
        grp_results_test = grp.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['grp_' + str(i)] = grp_results_train[:, i - 1]
            test['grp_' + str(i)] = grp_results_test[:, i - 1]

    # SRP
    if (config['SRP'] == True):
        srp = SparseRandomProjection(n_components=config['n_comp'],
                                     dense_output=True,
                                     random_state=420)
        srp_results_train = srp.fit_transform(train[col])
        srp_results_test = srp.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['srp_' + str(i)] = srp_results_train[:, i - 1]
            test['srp_' + str(i)] = srp_results_test[:, i - 1]

    if config['magic'] == True:
        magic_mat = train[['ID', 'X0', 'y']]
        magic_mat = magic_mat.groupby(['X0'])['y'].mean()
        magic_mat = pd.DataFrame({
            'X0': magic_mat.index,
            'magic': list(magic_mat)
        })
        mean_magic = magic_mat['magic'].mean()
        train = train.merge(magic_mat, on='X0', how='left')
        test = test.merge(magic_mat, on='X0', how='left')
        test['magic'] = test['magic'].fillna(mean_magic)
    return train, test
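# A minimal usage sketch for perform_feature_engineering, assuming only numeric feature
# columns plus an 'ID' column; the frames and config values below are made up purely for
# illustration ('magic' is left off because it needs the categorical 'X0' and target 'y'
# columns of the original dataset).
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
demo_train = pd.DataFrame(rng.rand(100, 20), columns=['f%d' % j for j in range(20)])
demo_test = pd.DataFrame(rng.rand(50, 20), columns=['f%d' % j for j in range(20)])
demo_train['ID'] = range(100)
demo_test['ID'] = range(50)
demo_train['y'] = rng.rand(100)

demo_config = {
    'SparseThreshold': 0.01,   # binary columns with a lower mean get dropped
    'ID': False,               # False -> exclude 'ID' from the decompositions
    'n_comp': 5,
    'tSVD': True, 'PCA': True, 'ICA': True, 'GRP': True, 'SRP': True,
    'magic': False,
}
demo_train, demo_test = perform_feature_engineering(demo_train, demo_test, demo_config)
print(demo_train.shape, demo_test.shape)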
Beispiel #37
0
def gen_features(train, val, test):
    train = pd.DataFrame(train)
    val = pd.DataFrame(val)
    test = pd.DataFrame(test)
    # cat_cols = ['city', 'bd', 'gender', 'registered_via', 'registration_init_year',
    #              'registration_init_month', 'registration_init_date', 'payment_method_id', 'payment_plan_days',
    #              'plan_list_price', 'actual_amount_paid', 'is_auto_renew', 'is_cancel',
    #              'transaction_date_year', 'transaction_date_month', 'transaction_date_date',
    #              'membership_expire_date_year',
    #              'membership_expire_date_month', 'membership_expire_date_date', 'membership_transaction_gap',
    #              'cancel_times',
    #              'auto_renew_count', 'plan_net_worth', 'user_date_year', 'user_date_month',
    #              'user_date_date']
    # con_cols = [x for x in train.columns if x not in cat_cols and x not in ['msno', 'is_churn']]
    # train[cat_cols] = train[cat_cols].astype('object')
    # test[cat_cols] = test[cat_cols].astype('object')
    # val[cat_cols] = val[cat_cols].astype('object')
    #
    # for col in cat_cols:
    #     train[col].fillna(value=train[col].mode()[0], inplace=True)
    #     test[col].fillna(value=test[col].mode()[0], inplace=True)
    #     val[col].fillna(value=val[col].mode()[0], inplace=True)
    # for col in con_cols:
    #     train[col].fillna(value=train[col].mean(), inplace=True)
    #     test[col].fillna(value=test[col].mean(), inplace=True)
    #     val[col].fillna(value=val[col].mean(), inplace=True)
    #
    # for c in train.columns:
    #     if train[c].dtype == 'object':
    #         lbl = LabelEncoder()
    #         lbl.fit(list(train[c].values) + list(test[c].values))
    #         train[c] = lbl.transform(list(train[c].values))
    #         test[c] = lbl.transform(list(test[c].values))

    n_comp = 15

    drop_list = []
    test_drop_list = []

    print(train.drop(drop_list, axis=1).shape, test.drop(test_drop_list, axis=1).shape)
    print('tSVD', datetime.now() - start)
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1))
    tsvd_results_val = tsvd.transform(val.drop(test_drop_list, axis=1))
    tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1))

    print('PCA', datetime.now() - start)
    # PCA
    pca = PCA(n_components=n_comp)
    pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1))
    pca2_results_val = pca.transform(val.drop(test_drop_list, axis=1))
    pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1))

    print('ICA', datetime.now() - start)
    # ICA
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1))
    ica2_results_val = ica.transform(val.drop(test_drop_list, axis=1))
    ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1))

    print('GRP', datetime.now() - start)
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1))
    grp_results_val = grp.transform(val.drop(test_drop_list, axis=1))
    grp_results_test = grp.transform(test.drop(test_drop_list, axis=1))

    print('SRP', datetime.now() - start)
    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1))
    srp_results_val = srp.transform(val.drop(test_drop_list, axis=1))
    srp_results_test = srp.transform(test.drop(test_drop_list, axis=1))

    # MCA
    # res_mca = MCA(train, ncp=n_comp, graph = FALSE)

    # save columns list before adding the decomposition components

    usable_columns = list(set(train.columns) - set(drop_list))

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        val['pca_' + str(i)] = pca2_results_val[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        val['ica_' + str(i)] = ica2_results_val[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        val['tsvd_' + str(i)] = tsvd_results_val[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        val['grp_' + str(i)] = grp_results_val[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        val['srp_' + str(i)] = srp_results_val[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    return train, val, test
def run_rp(dataset, min_components, max_components):

    X, y = load_dataset(dataset)
    data = X

    n_samples, n_features = data.shape
    n_labels = len(np.unique(y))
    labels = y

    results = []

    for n_components in range(min_components, max_components):
        print('n_components: ', n_components)

        for max_iters in [10, 40, 100, 500]:

            scores = []
            times = []
            components = []
            kurtoses = []
            losses = []

            for iters in range(0, max_iters):

                data = X.copy()
                rp = GaussianRandomProjection(n_components=n_components)

                t0 = time()
                rp.fit(X)
                times.append(time() - t0)
                components.append(rp.n_components_)
                kurtoses.append(kurtosis(rp.components_, axis=None))

                # project the data, then map it back using the transpose of the
                # Gaussian projection matrix as an approximate inverse; the MSE
                # against the original data is the reconstruction error
                matrix = rp.components_
                new_data = rp.transform(data)
                new_data_inv = np.dot(new_data, matrix)
                loss = metrics.mean_squared_error(data, new_data_inv)
                losses.append(loss)

            scores.append(n_components)
            scores.append(max_iters)
            scores.append(np.mean(np.array(times)))
            scores.append(np.mean(np.array(components)))
            scores.append(np.mean(np.array(kurtoses)))
            scores.append(np.std(np.array(kurtoses)))
            scores.append(np.mean(np.array(losses)))
            scores.append(np.std(np.array(losses)))

            results.append(scores)

    # N-Components vs Loss
    plot_results(np.array(results),
                 trends_index=1,
                 x_axis_index=0,
                 x_axis_label='K-Components',
                 y_axis_index=[6],
                 y_axis_label='Reconstruction Error',
                 title=dataset.title() + ': RP',
                 filename='-'.join(['rp', dataset, 'loss']))

    # N-Components vs Loss (STD)
    plot_results(np.array(results),
                 trends_index=1,
                 x_axis_index=0,
                 x_axis_label='K-Components',
                 y_axis_index=[7],
                 y_axis_label='Reconstruction Error (STD)',
                 title=dataset.title() + ': RP',
                 filename='-'.join(['rp', dataset, 'lossstd']))

    # N-Components vs Kurtosis
    plot_results(np.array(results),
                 trends_index=1,
                 x_axis_index=0,
                 x_axis_label='K-Components',
                 y_axis_index=[4],
                 y_axis_label='Kurtosis',
                 title=dataset.title() + ': RP',
                 filename='-'.join(['rp', dataset, 'kurtosis']))

    # N-Components vs Kurtosis (STD)
    plot_results(np.array(results),
                 trends_index=1,
                 x_axis_index=0,
                 x_axis_label='K-Components',
                 y_axis_index=[5],
                 y_axis_label='Kurtosis (STD)',
                 title=dataset.title() + ': RP',
                 filename='-'.join(['rp', dataset, 'kurtstd']))

    # N-Components vs Components
    plot_results(np.array(results),
                 trends_index=1,
                 x_axis_index=0,
                 x_axis_label='K-Components',
                 y_axis_index=[3],
                 y_axis_label='Components',
                 title=dataset.title() + ': RP',
                 filename='-'.join(['rp', dataset, 'comp']))

    # N-Components vs Time
    plot_results(np.array(results),
                 trends_index=1,
                 x_axis_index=0,
                 x_axis_label='K-Components',
                 y_axis_index=[2],
                 y_axis_label='Time',
                 title=dataset.title() + ': RP',
                 filename='-'.join(['rp', dataset, 'time']))

    results = np.array(results)
    np.savetxt('output-csv/' + ('-'.join([dataset, 'rp.csv'])),
               results,
               delimiter=",",
               fmt="%s")
tsvd_results_test = tsvd.transform(testingSet)

# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(trainingSet.drop(["y"], axis=1))
pca2_results_test = pca.transform(testingSet)

# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(trainingSet.drop(["y"], axis=1))
ica2_results_test = ica.transform(testingSet)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(trainingSet.drop(["y"], axis=1))
grp_results_test = grp.transform(testingSet)

# SRP
srp = SparseRandomProjection(n_components=n_comp,
                             dense_output=True,
                             random_state=420)
srp_results_train = srp.fit_transform(trainingSet.drop(["y"], axis=1))
srp_results_test = srp.transform(testingSet)

# Append decomposition components to datasets
for i in range(1, n_comp + 1):
    trainingSet['pca_' + str(i)] = pca2_results_train[:, i - 1]
    testingSet['pca_' + str(i)] = pca2_results_test[:, i - 1]

    trainingSet['ica_' + str(i)] = ica2_results_train[:, i - 1]
    testingSet['ica_' + str(i)] = ica2_results_test[:, i - 1]
# Selec tonly sci.crypt category
# Other categories include 
# sci.med sci.space soc.religion.christian
cat = ['sci.crypt']
data = fetch_20newsgroups(categories=cat)

# Create a term-document matrix with term frequencies as the values from the
# above dataset
vectorizer = TfidfVectorizer(use_idf=False)
vector = vectorizer.fit_transform(data.data)

# Perform the projection. In this case we reduce the dimension to 1000
gauss_proj = GaussianRandomProjection(n_components=1000)
gauss_proj.fit(vector)
# Transform the original data to the new space
vector_t = gauss_proj.transform(vector)

# Print the original and transformed matrix shapes
print(vector.shape)
print(vector_t.shape)

# To validate if the transformation has preserved the distance, we calculate the
# old and the new distance between the points
org_dist = euclidean_distances(vector)
red_dist = euclidean_distances(vector_t)
diff_dist = abs(org_dist - red_dist)

# We take the element-wise difference between the two distance matrices and plot it
# as a heatmap (only the first 100 documents).
plt.figure()
plt.pcolor(diff_dist[0:100, 0:100])
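# A small follow-up sketch using the variables above: summarize the relative distortion
# of the pairwise distances numerically instead of only eyeballing the heatmap.
mask = org_dist > 0                      # skip the zero diagonal (and exact duplicates)
rel_err = diff_dist[mask] / org_dist[mask]
print("mean relative distortion:", rel_err.mean())
print("max relative distortion:", rel_err.max())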
Beispiel #41
0
def scatterPlot(xDF, yDF, algoName):
    tempDF = pd.DataFrame(data=xDF.loc[:, 0:1], index=xDF.index)
    tempDF = pd.concat((tempDF, yDF), axis=1, join="inner")
    tempDF.columns = ["First Vector", "Second Vector", "Label"]
    sns.lmplot(x="First Vector", y="Second Vector", hue="Label", \
               data=tempDF, fit_reg=False)
    ax = plt.gca()
    ax.set_title("Separation of Observations using " + algoName)


#----------------------------------------------------------------------------------------------------

# Gaussian Random Projection
from sklearn.random_projection import GaussianRandomProjection

n_components = 'auto'
eps = 0.5
random_state = 2018
# eps: when n_components is set to 'auto', this parameter controls the quality of the
#   embedding according to the Johnson-Lindenstrauss lemma. Smaller values give a better
#   embedding but increase the dimensionality of the target projection space (n_components).
GRP = GaussianRandomProjection(n_components=n_components, eps=eps, \
                               random_state=random_state)

X_train_GRP = GRP.fit_transform(X_train)
X_train_GRP = pd.DataFrame(data=X_train_GRP, index=train_index)

X_validation_GRP = GRP.transform(X_validation)
X_validation_GRP = pd.DataFrame(data=X_validation_GRP, index=validation_index)

scatterPlot(X_train_GRP, y_train, "Gaussian Random Projection")
plt.show()
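# As the eps note above says: with n_components='auto', scikit-learn picks the target
# dimension from the Johnson-Lindenstrauss bound, so a smaller eps (less allowed distance
# distortion) means more output dimensions. A standalone sketch of that trade-off; the
# sample count 10000 is an arbitrary illustration, not tied to X_train above.
from sklearn.random_projection import johnson_lindenstrauss_min_dim

for example_eps in (0.1, 0.3, 0.5):
    print(example_eps, johnson_lindenstrauss_min_dim(n_samples=10000, eps=example_eps))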
Beispiel #42
0
tsvd_results_test = tsvd.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    train['pca_' + str(i)] = pca2_results_train[:,i-1]
    test['pca_' + str(i)] = pca2_results_test[:, i-1]

    train['ica_' + str(i)] = ica2_results_train[:,i-1]
    test['ica_' + str(i)] = ica2_results_test[:, i-1]

    train['tsvd_' + str(i)] = tsvd_results_train[:,i-1]
        
        train_decomposed.append(fa_results_train)
        test_decomposed.append(fa_results_test)
        val_decomposed.append(fa_results_val)

    if use_grp != 0.0:
        print("GRP")
        if use_grp > 0.0:
            N_COMP = int(use_grp * train.shape[1]) + 1
            eps = 10
        else:
            N_COMP = "auto"
            eps = abs(use_grp)
        grp = GaussianRandomProjection(n_components=N_COMP, eps=eps, random_state=42)
        grp_results = grp.fit(total)
        grp_results_train = grp.transform(train)
        grp_results_test = grp.transform(test)
        grp_results_val = grp.transform(val)
      
        train_decomposed.append(grp_results_train)
        test_decomposed.append(grp_results_test)
        val_decomposed.append(grp_results_val)
        

    if use_srp>0.0:
        print("SRP")
        N_COMP = int(use_srp  * train.shape[1]) +1
        srp = SparseRandomProjection(n_components = N_COMP, dense_output=True, random_state=42)
        srp_results = srp.fit(total)
        srp_results_train = srp.transform(train)
        srp_results_test = srp.transform(test)
Beispiel #44
0
ap_region_data = { k:v for (k,v) in region_data.items() if ap_champs[k[2]]}
ap_tier_data = { k:v for (k,v) in tier_data.items() if ap_champs[k[2]]}
ap_cross_data = { k:v for (k,v) in cross_data.items() if ap_champs[k[3]]}
ap_patch_data = { k:v for (k,v) in patch_data.items() if ap_champs[k[1]]}

all_ap_data = (list(ap_region_data.values()) + list(ap_tier_data.values()) +
               list(ap_cross_data.values()) + list(ap_patch_data.values()))

#print(ap_champs)

#pca = PCA(n_components=2)
#reduction = pca.fit_transform(ap_champs.values())
#print(pca.explained_variance_ratio_)

grp = GaussianRandomProjection(2, random_state = 0)
grp.fit(all_ap_data)
region_reduction = grp.transform(list(ap_region_data.values()))
tier_reduction = grp.transform(list(ap_tier_data.values()))
cross_reduction = grp.transform(list(ap_cross_data.values()))
patch_reduction = grp.transform(list(ap_patch_data.values()))

region_json_data = []
for i, key in enumerate(ap_region_data.keys()):
	data = list(region_reduction[i])
	num_games = region_games[key]
	region_json_data.append( {
		"patch":key[0],
		"region":key[1],
		#"tier":key[2],
		"champion":key[2],
		"coordinate":{
Beispiel #45
0
class RCAReducer():

    def __init__(self, dataset, dataset_name, num_components=10):
        self.dataset = dataset
        self.dataset_name = dataset_name
        self.labels = dataset.target
        self.scaler = MinMaxScaler()
        self.data = self.scaler.fit_transform(dataset.data)
        self.n_samples, self.n_features = self.data.shape

        self.reducer = GaussianRandomProjection(n_components=num_components)

    def reduce(self):
        self.reducer.fit(self.data)
        self.reduced = self.scaler.fit_transform(self.reducer.transform(self.data))
        return self.reduced

    def benchmark(self, estimator, name, data):
        t0 = time()
        sample_size = 300
        labels = self.labels

        estimator.fit(data)
        print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
              % (name, (time() - t0), estimator.inertia_,
                 metrics.homogeneity_score(labels, estimator.labels_),
                 metrics.completeness_score(labels, estimator.labels_),
                 metrics.v_measure_score(labels, estimator.labels_),
                 metrics.adjusted_rand_score(labels, estimator.labels_),
                 metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
                 metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=sample_size)))

    def display_reduced_digits(self):
        sys.stdout = open('out/RCAReduceDigitsOutput.txt', 'w')
        print("RCA Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print("Length of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print("\nProjection axes:\n")
        for i,axis in enumerate(self.reducer.components_.tolist()):
            print("Axis %d:\n" % i, axis)
        self.compute_plane_variance()

    def compute_plane_variance(self):
        points_along_dimension = self.reduced.T
        for i,points in enumerate(points_along_dimension):
            print("\nVariance of dimension %d:" % i)
            print(np.var(points), "\n")

    def display_reduced_iris(self):
        sys.stdout = open('out/RCAReduceIrisOutput.txt', 'w')
        print("RCA Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print("Length of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print("\nProjection axes:\n")
        for i,axis in enumerate(self.reducer.components_.tolist()):
            print("Axis %d:\n" % i, axis)
        self.compute_plane_variance()

    def reduce_crossvalidation_set(self, X_train, X_test):
        # fit the projection on the training fold only, then apply it to both folds
        self.reducer.fit(X_train)
        reduced_X_train = self.reducer.transform(X_train)
        reduced_X_test = self.reducer.transform(X_test)
        return reduced_X_train, reduced_X_test
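# A minimal usage sketch for the RCAReducer class above, assuming sklearn's digits
# dataset as the input (it exposes the .data and .target attributes the class expects).
from sklearn.datasets import load_digits

digits_reducer = RCAReducer(load_digits(), 'digits', num_components=10)
reduced_digits = digits_reducer.reduce()
print(reduced_digits.shape)  # expected: (1797, 10)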