Example #1
def randproj(tx, ty, rx, ry):
    compressor = RandomProjection(tx[1].size)
    newtx = compressor.fit_transform(tx)
    compressor = RandomProjection(tx[1].size)
    newrx = compressor.fit_transform(rx)
    em(newtx, ty, newrx, ry, add="wRPtr", times=10)
    km(newtx, ty, newrx, ry, add="wRPtr", times=10)
    nn(newtx, ty, newrx, ry, add="wRPtr")
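Note that randproj() fits one projection on the training set and a second, independent projection on the test set, so the two land in different random subspaces. A minimal sketch of the fit-once variant (assuming RandomProjection follows the scikit-learn fit/transform API, as GaussianRandomProjection does):

def randproj_shared(tx, ty, rx, ry):
    compressor = RandomProjection(tx[1].size)
    newtx = compressor.fit_transform(tx)  # learn the random matrix on the training set
    newrx = compressor.transform(rx)      # reuse the same matrix for the test set
    em(newtx, ty, newrx, ry, add="wRPtr", times=10)
    km(newtx, ty, newrx, ry, add="wRPtr", times=10)
    nn(newtx, ty, newrx, ry, add="wRPtr")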
Example #2
def best_rp_nba(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_nba_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
    X_train_transformed = rp.fit_transform(X_train_scl, y_train)
    X_test_transformed = rp.transform(X_test_scl)

    ## top 2
    kurt = kurtosis(X_train_transformed)
    i = kurt.argsort()[::-1]
    X_train_transformed_sorted = X_train_transformed[:, i]
    X_train_transformed = X_train_transformed_sorted[:, 0:2]

    kurt = kurtosis(X_test_transformed)
    i = kurt.argsort()[::-1]
    X_test_transformed_sorted = X_test_transformed[:, i]
    X_test_transformed = X_test_transformed_sorted[:, 0:2]

    # save
    filename = './' + self.save_dir + '/nba_rp_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_rp_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_rp_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_rp_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
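One subtlety in best_rp_nba(): the test columns are re-ranked by the test set's own kurtosis, so the two components kept for the test split need not match the two kept for the training split. A minimal sketch (same names as above, applied before the slicing) that reuses the training ranking for both:

i = kurtosis(X_train_transformed).argsort()[::-1]  # ranking from the training projection only
X_train_top2 = X_train_transformed[:, i[:2]]
X_test_top2 = X_test_transformed[:, i[:2]]  # same two columns for the test split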
def project_points(points, dim=None):
    if dim is None:
        dim = 5
        #dim = min(max(int(np.log(len(points))), 2), 15)

    proj = GaussianRandomProjection(n_components=dim)
    return proj.fit_transform(points)
def test_fixed_state_transformer():

    random_state = check_random_state(0)
    X = random_state.rand(500, 100)

    # Check that setting the random_seed is equivalent to setting the
    # random_state
    transf = GaussianRandomProjection(n_components=5, random_state=0)
    fixed_transf = FixedStateTransformer(GaussianRandomProjection(n_components=5), random_seed=0)
    assert_array_almost_equal(fixed_transf.fit_transform(X), transf.fit_transform(X))

    # Check that set_params doesn't modify the results
    fixed_transf = FixedStateTransformer(GaussianRandomProjection(n_components=5, random_state=None))

    fixed_transf2 = FixedStateTransformer(GaussianRandomProjection(random_state=1, n_components=5))

    assert_array_almost_equal(fixed_transf.fit_transform(X), fixed_transf2.fit_transform(X))

    # Check that it works when there is no random_state
    fixed_transf = FixedStateTransformer(IdentityProjection())
    assert_array_almost_equal(fixed_transf.fit_transform(X), X)
Example #5
X.reset_index(inplace=True, drop=True)
Y.reset_index(inplace=True, drop=True)

# Handling outliers
# Y[Y > 150] = Y.quantile(0.99)

pca = PCA(n_components=5)
ica = FastICA(n_components=5, max_iter=1000)
tsvd = TruncatedSVD(n_components=5)
gp = GaussianRandomProjection(n_components=5)
sp = SparseRandomProjection(n_components=5, dense_output=True)

x_pca = pd.DataFrame(pca.fit_transform(X))
x_ica = pd.DataFrame(ica.fit_transform(X))
x_tsvd = pd.DataFrame(tsvd.fit_transform(X))
x_gp = pd.DataFrame(gp.fit_transform(X))
x_sp = pd.DataFrame(sp.fit_transform(X))

x_pca.columns = ["pca_{}".format(i) for i in x_pca.columns]
x_ica.columns = ["ica_{}".format(i) for i in x_ica.columns]
x_tsvd.columns = ["tsvd_{}".format(i) for i in x_tsvd.columns]
x_gp.columns = ["gp_{}".format(i) for i in x_gp.columns]
x_sp.columns = ["sp_{}".format(i) for i in x_sp.columns]

X = pd.concat((X, x_pca), axis=1)
X = pd.concat((X, x_ica), axis=1)
X = pd.concat((X, x_tsvd), axis=1)
X = pd.concat((X, x_gp), axis=1)
X = pd.concat((X, x_sp), axis=1)

x_test_pca = pd.DataFrame(pca.transform(X_Test))
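The snippet cuts off after the PCA transform of X_Test; presumably the remaining fitted transformers are applied to the test set the same way (a hedged completion mirroring the train-side pattern above):

x_test_ica = pd.DataFrame(ica.transform(X_Test))
x_test_tsvd = pd.DataFrame(tsvd.transform(X_Test))
x_test_gp = pd.DataFrame(gp.transform(X_Test))
x_test_sp = pd.DataFrame(sp.transform(X_Test))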
Example #6
def em_rp(X, y, n, dataset):

    print("---- EM + RP ----")

    rp = GaussianRandomProjection(n_components=n)
    X_new = rp.fit_transform(X)

    em_silhouette = []
    vmeasure_score = []
    adjusted_rand = []
    mutual_in_score = []
    homogenity = []
    completeness = []

    list_k = list(range(2, 15))

    start = time.time()

    for i in list_k:
        i_start = time.time()
        print("CLUSTER :", i)

        em = GaussianMixture(n_components=i,
                             n_init=10,
                             max_iter=500,
                             random_state=0).fit(X_new)
        preds = em.predict(X_new)

        silhouette = silhouette_score(X_new, preds)
        em_silhouette.append(silhouette)
        print("Silhouette score : {}".format(silhouette))

        ad_rand = adjusted_rand_score(y, preds)
        adjusted_rand.append(ad_rand)
        print("Adjusted random score : {}".format(ad_rand))

        mutual_info = mutual_info_score(y, preds)
        mutual_in_score.append(mutual_info)
        print("Adjusted mutual info score : {}".format(mutual_info))

        homo = homogeneity_score(y, preds)
        homogenity.append(homo)
        print("Homogeneity score: {}".format(homo))

        comp = completeness_score(y, preds)
        completeness.append(comp)
        print("Completeness score : {}".format(comp))

        v_measure = v_measure_score(y, preds)
        vmeasure_score.append(v_measure)
        print("V-measure score : {}".format(v_measure))

        print("BIC : {}".format(em.bic(X_new)))
        print("Log-likelihood score : {}".format(em.score(X_new)))

        i_end = time.time()
        print("Time for this iteration :", (i_end - i_start))

        print("-" * 100)

    end = time.time()

    print("TOTAL TIME", (end - start))

    plt.style.use('seaborn')
    plt.title('EM Clustering on RP', fontsize=16, y=1.03)
    plt.plot(list_k, em_silhouette, '-o', label='Silhouette score')
    plt.plot(list_k, adjusted_rand, '-o', label='Adjusted Rand score')
    plt.plot(list_k, mutual_in_score, '-o', label='Mutual Info score')
    plt.plot(list_k, homogenity, '-o', label='Homogeneity score')
    plt.plot(list_k, completeness, '-o', label='Completeness score')
    plt.plot(list_k, vmeasure_score, '-o', label='V-measure score')
    plt.xlabel('Number of clusters')
    plt.ylabel('Metrics score')
    plt.legend()
    filename = 'EM_RP_' + dataset + '.png'
    plt.savefig(filename)
    plt.clf()
Example #7
    file_2.write("ICA_kurt2")
    for i in range(0, len(kurt2)):
        file_2.write(";")
        file_2.write("%1.9f" % kurt2[i])
    file_2.write("\n")

    ############################## RP ##############################

    grp = GaussianRandomProjection(random_state=5)
    error_rate_1 = np.zeros(np.shape(data1_X)[1])
    for i in range(0, np.shape(data1_X)[1]):
        grp.set_params(n_components=i + 1)
        DT1 = tree.DecisionTreeClassifier(criterion='gini',
                                          min_samples_leaf=0.005)
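        # note: despite its name, error_rate_1 records training accuracy,
        # so the argmax below selects the best-performing dimensionality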
        error_rate_1[i] = sum(
            DT1.fit(grp.fit_transform(data1_X), data1_Y).predict(
                grp.fit_transform(data1_X)) == data1_Y) * 1.0 / n1
        print(i + 1)
    i1 = np.argmax(error_rate_1) + 1
    grp.set_params(n_components=i1)
    recon1 = range(0,
                   2)  #pairwiseDistCorr(grp.fit_transform(data1_X), data1_X)

    error_rate_2 = np.zeros(np.shape(data2_X)[1])
    for i in range(0, np.shape(data2_X)[1]):
        grp.set_params(n_components=i + 1)
        DT2 = tree.DecisionTreeClassifier(criterion='gini',
                                          min_samples_leaf=0.005)
        error_rate_2[i] = sum(
            DT2.fit(grp.fit_transform(data2_X), data2_Y).predict(
                grp.fit_transform(data2_X)) == data2_Y) * 1.0 / n2
import numpy as np
from sklearn.random_projection import GaussianRandomProjection
from sklearn import datasets, svm

random_state = np.random.RandomState(0)
X = random_state.randn(10, 10000)
print(X.dtype)

X = np.array(X, dtype='float32')
print(X.dtype)

transformer = GaussianRandomProjection()
X_new = transformer.fit_transform(X)
print(X_new.dtype)
print(X.shape, X_new.shape)

iris = datasets.load_iris()
clf = svm.SVC(gamma='auto')

clf.fit(iris.data, iris.target)
print(clf.predict(iris.data[:3]))

clf.fit(iris.data, iris.target_names[iris.target])
print(clf.predict(iris.data[:3]))
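With no arguments, GaussianRandomProjection defaults to n_components='auto' and eps=0.1, deriving the target dimension from the Johnson-Lindenstrauss lemma based only on the number of samples. A short check of what 'auto' resolves to for the 10-sample matrix above:

from sklearn.random_projection import johnson_lindenstrauss_min_dim

# 1973 for n_samples=10 at eps=0.1, which is why the (10, 10000) matrix
# above comes out as (10, 1973)
print(johnson_lindenstrauss_min_dim(n_samples=10, eps=0.1))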
Example #9
             markersize=3)

plt.xticks(())
plt.yticks(())

plt.title('Labels mapped on the ICA-reduced 2D graph')
fig.savefig('figures/gender_em_ICA_rankings.png')

plt.close(fig)

# # ##########################################################################

print("RP - kmeans")
for n in nrange:
    transformer = GaussianRandomProjection(n_components=n)
    reduced_data = transformer.fit_transform(df_x)

    print("N:", n)

    kmeans = KMeans(init='k-means++', n_clusters=10, n_init=10)
    kmeans.fit(reduced_data)

    correct = 0
    for i in range(10):
        d = defaultdict(int)
        for index, row in df.iterrows():
            if row[label] == float(i):
                lab = kmeans.predict([reduced_data[index]])
                d[lab[0]] += 1
        if d: correct += max(d.values())
def random(X, K):
    grp = GaussianRandomProjection(n_components=K)
    X_red = grp.fit_transform(X)
    X_red = normalizer.fit_transform(X_red)
    return X_red
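random() references a module-level normalizer that the excerpt does not show; a minimal assumption that makes the snippet self-contained:

from sklearn.preprocessing import Normalizer
normalizer = Normalizer()  # assumed: any transformer with fit_transform under this name would fit the call above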
Example #11
alpha = 20.
epsilon1 = alpha / (noise_std * noise_std) + np.log(1 / delta) / (alpha - 1)
print(epsilon1)
gaussian_graph_noise = np.random.normal(0, noise_std,
                                        citeseer_graph_copy2.shape)
citeseer_graph_copy2 += gaussian_graph_noise
graph_proj_2 = clf_proj.fit_transform(citeseer_graph_copy2)
##### baseline 4: gradient descent + gradient perturbation

##### the proposed method: random projection + add Gaussian noise to the graph adjacency matrix directly
#d = 10
d = 30
fraction = 8.

rand_proj = GaussianRandomProjection(n_components=d)
graph_randn_proj = rand_proj.fit_transform(citeseer_graph)
noise_std = fraction * np.sqrt(1 / d)
graph_randn_proj += np.random.normal(0.0,
                                     noise_std,
                                     size=graph_randn_proj.shape)
quad_base, r = qr(graph_randn_proj)
#graph_randn_svd = quad_base[:,:3*no_labels]
graph_randn_svd = graph_randn_proj

#graph_randn_svd = singlepass_evd(citeseer_graph,d)
#alpha = 2.5
#epsilon_renyi = np.max([2*(d/2*np.log((3+fraction)/(2+fraction)) + d/(2*(alpha-1))*np.log((3+fraction)/(alpha*(3+fraction) - (alpha-1)*(2+fraction)))),2*(d/2*np.log((2+fraction)/(3+fraction)) + d/(2*(alpha-1))*np.log((2+fraction)/(alpha*(2+fraction) - (alpha-1)*(3+fraction))))])
#epsilon1 = epsilon_renyi + np.log(1/delta)/(alpha-1)
#print(epsilon1)
d = 10
X_ica = X_r

plt.figure()
colors = ["b", "g", "r", "c", "m", "y", "k"]
lw = 2

for color, i in zip(colors, [4, 8]):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], color=color, alpha=.8, lw=lw, label=i)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('ICA of Wine Quality dataset')

#################################################
# Random Projection feature transformation

rca = GaussianRandomProjection(n_components=11, random_state=10)
X_r = rca.fit_transform(X)
X_rca = X_r

plt.figure()
colors = ["b", "g", "r", "c", "m", "y", "k"]
lw = 2

for color, i in zip(colors, [4, 8]):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], color=color, alpha=.8, lw=lw, label=i)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('Random Projection of Wine Quality dataset')

#################################################
# Univariate feature selection (K best)

from sklearn.feature_selection import chi2
Example #13
def get_rp(data, tdata, num_classes):
    rp = GaussianRandomProjection(n_components=num_classes)
    pdata = rp.fit_transform(data)
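    # note: the second fit_transform below draws a fresh random matrix, so tdata
    # lands in a different subspace than data; rp.transform(tdata) would reuse
    # the matrix fitted on data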
    ptdata = rp.fit_transform(tdata)
    return pdata, ptdata
Example #14
def run_random_projection(data, num_components):
    clf = GaussianRandomProjection(n_components=num_components,
                                   random_state=seed)
    return clf.fit_transform(data)
Example #15
      ((maxAccuracy * 100), treesNo))

# # Random Projection

# In[6]:

from sklearn.ensemble import RandomForestClassifier

n_comp = 12

# GaussianRandomProjection

from sklearn.random_projection import GaussianRandomProjection

grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(df_train)
grp_results_test = grp.transform(testData)

treesNo = 141
model = RandomForestClassifier(n_estimators=treesNo)
model.fit(grp_results_train, y_train)
score = model.score(grp_results_test, y_test)
print("GaussianRandomProjection with random forest accuracy = %.2f%%" %
      (score * 100))

# SparseRandomProjection

from sklearn.random_projection import SparseRandomProjection

srp = SparseRandomProjection(n_components=n_comp,
                             dense_output=True,
def nn2(xs, ys, xs_test, ys_test, n_components, clf_constructor):
    ks = [0 for _ in range(10)]
    cataccs = [0 for _ in range(10)]

    ys = [to_categorical(ys[0]), to_categorical(ys[1])]
    ys_test = [to_categorical(ys_test[0]), to_categorical(ys_test[1])]

    for i in range(2):
        shape = np.shape(xs[i])[1]
        n_components[i] = shape
        model = utils.create_adult_model(
            shape, 2) if i == 0 else utils.create_wine_model(shape, 5)
        model.fit(xs[i][:10000],
                  ys[i][:10000],
                  batch_size=50,
                  epochs=10,
                  verbose=False)
        cataccs[i] = model.evaluate(xs_test[i], ys_test[i],
                                    verbose=False)[1] * 100

    for k in range(2, 11):
        try:
            clf = clf_constructor(n_clusters=k)
        except TypeError:  # the constructor takes n_components instead of n_clusters
            clf = clf_constructor(n_components=k)
        for i in range(2):
            pca = PCA(n_components=n_components[2 + i])
            transformed = pca.fit_transform(xs[i])
            transformed_test = pca.transform(xs_test[i])
            predict = to_categorical(clf.fit_predict(transformed[:10000]))
            predict_test = to_categorical(clf.predict(
                transformed_test[:10000]))
            input_dims = [n_components[2 + i], k]
            model = utils.create_mi_adult_model(
                input_dims, 2) if i == 0 else utils.create_mi_wine_model(
                    input_dims, 5)
            model.fit([transformed[:10000], predict],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            catacc = model.evaluate([transformed_test, predict_test],
                                    ys_test[i],
                                    verbose=False)[1] * 100
            if catacc > cataccs[2 + i]:
                ks[2 + i] = k
                cataccs[2 + i] = catacc

            ica = FastICA(n_components=n_components[4 + i])
            transformed = ica.fit_transform(xs[i])
            transformed_test = ica.transform(xs_test[i])
            predict = to_categorical(clf.fit_predict(transformed[:10000]))
            predict_test = to_categorical(clf.predict(
                transformed_test[:10000]))
            input_dims = [n_components[4 + i], k]
            model = utils.create_mi_adult_model(
                input_dims, 2) if i == 0 else utils.create_mi_wine_model(
                    input_dims, 5)
            model.fit([transformed[:10000], predict],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            catacc = model.evaluate([transformed_test, predict_test],
                                    ys_test[i],
                                    verbose=False)[1] * 100
            if catacc > cataccs[4 + i]:
                ks[4 + i] = k
                cataccs[4 + i] = catacc

            if i == 1:
                rp = GaussianRandomProjection(eps=0.95)
                transformed = rp.fit_transform(xs[i])
                transformed_test = rp.transform(xs_test[i])
                predict = to_categorical(clf.fit_predict(transformed[:10000]))
                predict_test = to_categorical(
                    clf.predict(transformed_test[:10000]))
                input_dims = [np.shape(transformed)[1], k]
                model = utils.create_mi_wine_model(input_dims, 5)
                model.fit([transformed[:10000], predict],
                          ys[i][:10000],
                          batch_size=50,
                          epochs=10,
                          verbose=False)
                catacc = model.evaluate([transformed_test, predict_test],
                                        ys_test[i],
                                        verbose=False)[1] * 100
                if catacc > cataccs[6 + i]:
                    ks[6 + i] = k
                    cataccs[6 + i] = catacc

            encoder, vae = utils.create_vae(
                np.shape(xs[i])[1], n_components[8 + i])
            vae.fit(xs[i], batch_size=50, epochs=10, verbose=False)
            transformed = encoder.predict(xs[i], verbose=False)
            transformed_test = encoder.predict(xs_test[i], verbose=False)
            predict = to_categorical(clf.fit_predict(transformed[:10000]))
            predict_test = to_categorical(clf.predict(
                transformed_test[:10000]))
            input_dims = [n_components[8 + i], k]
            model = utils.create_mi_adult_model(
                input_dims, 2) if i == 0 else utils.create_mi_wine_model(
                    input_dims, 5)
            model.fit([transformed[:10000], predict],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            catacc = model.evaluate([transformed_test, predict_test],
                                    ys_test[i],
                                    verbose=False)[1] * 100
            if catacc > cataccs[8 + i]:
                ks[8 + i] = k
                cataccs[8 + i] = catacc

    plot.style.use('seaborn-darkgrid')
    plot.title(f'Influence of feature transformation on the NN accuracy')
    color = []
    for _ in range(5):
        color.append('tab:blue')
        color.append('tab:orange')
    x = []
    count = 1
    for _ in range(5):
        x.append(count)
        count += 0.5
        x.append(count)
        count += 1
    plot.bar(x, cataccs, color=color, width=0.75)
    x = []
    count = 1.25
    for _ in range(5):
        x.append(count)
        count += 1.5
    plot.xticks(x, ['None', 'PCA', 'ICA', 'RP', 'VAE'])
    plot.xlabel('Feature transformation method')
    plot.ylabel('Categorical accuracy (%)')
    plot.show()
def nn_benchmark(xs, ys, n_components):
    ys = [to_categorical(ys[0]), to_categorical(ys[1])]

    none_samples = [[], []]
    pca_samples = [[], []]
    ica_samples = [[], []]
    rp_samples = [[], []]
    vae_samples = [[], []]

    trials = 7
    for _ in range(trials):

        for i in range(2):
            shape = np.shape(xs[i])[1]
            n_components[i] = shape
            model = utils.create_adult_model(
                shape, 2) if i == 0 else utils.create_wine_model(shape, 5)
            start = time.time()
            model.fit(xs[i][:10000],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            none_samples[i].append(time.time() - start)

        for i in range(2):
            dim = n_components[2 + i]
            pca = PCA(n_components=dim)
            transformed = pca.fit_transform(xs[i])
            model = utils.create_adult_model(
                dim, 2) if i == 0 else utils.create_wine_model(dim, 5)
            start = time.time()
            model.fit(transformed[:10000],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            pca_samples[i].append(time.time() - start)

            dim = n_components[4 + i]
            ica = FastICA(n_components=dim)
            transformed = ica.fit_transform(xs[i])
            model = utils.create_adult_model(
                dim, 2) if i == 0 else utils.create_wine_model(dim, 5)
            start = time.time()
            model.fit(transformed[:10000],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            ica_samples[i].append(time.time() - start)

            if i == 1:
                rp = GaussianRandomProjection(eps=0.95)
                transformed = rp.fit_transform(xs[i])
                dim = np.shape(transformed)[1]
                model = utils.create_wine_model(dim, 5)
                start = time.time()
                model.fit(transformed[:10000],
                          ys[i][:10000],
                          batch_size=50,
                          epochs=10,
                          verbose=False)
                rp_samples[i].append(time.time() - start)

            dim = n_components[8 + i]
            encoder, vae = utils.create_vae(np.shape(xs[i])[1], dim)
            vae.fit(xs[i], batch_size=50, epochs=10, verbose=False)
            transformed = encoder.predict(xs[i], verbose=False)
            model = utils.create_adult_model(
                dim, 2) if i == 0 else utils.create_wine_model(dim, 5)
            start = time.time()
            model.fit(transformed[:10000],
                      ys[i][:10000],
                      batch_size=50,
                      epochs=10,
                      verbose=False)
            vae_samples[i].append(time.time() - start)

    times = [
        np.mean(none_samples[0]),
        np.mean(none_samples[1]),
        np.mean(pca_samples[0]),
        np.mean(pca_samples[1]),
        np.mean(ica_samples[0]),
        np.mean(ica_samples[1]), 0,
        np.mean(rp_samples[1]),
        np.mean(vae_samples[0]),
        np.mean(vae_samples[1])
    ]

    times_err = [
        np.std(none_samples[0]) / 2,
        np.std(none_samples[1]) / 2,
        np.std(pca_samples[0]) / 2,
        np.std(pca_samples[1]) / 2,
        np.std(ica_samples[0]) / 2,
        np.std(ica_samples[1]) / 2, 0,
        np.std(rp_samples[1]) / 2,
        np.std(vae_samples[0]) / 2,
        np.std(vae_samples[1]) / 2
    ]

    plot.style.use('seaborn-darkgrid')
    plot.title(f'Influence of feature transformation on the NN training time')
    color = []
    for _ in range(5):
        color.append('tab:blue')
        color.append('tab:orange')
    x = []
    count = 1
    for _ in range(5):
        x.append(count)
        count += 0.5
        x.append(count)
        count += 1
    plot.bar(x, times, color=color, width=0.75, yerr=times_err)
    x = []
    count = 1.25
    for _ in range(5):
        x.append(count)
        count += 1.5
    plot.xticks(x, ['None', 'PCA', 'ICA', 'RP', 'VAE'])
    plot.xlabel('Feature transformation method')
    plot.ylabel('Average training time (s)')
    plot.show()
Example #18
def gen_features(train, val, test):
    train = pd.DataFrame(train)
    val = pd.DataFrame(val)
    test = pd.DataFrame(test)
    # cat_cols = ['city', 'bd', 'gender', 'registered_via', 'registration_init_year',
    #              'registration_init_month', 'registration_init_date', 'payment_method_id', 'payment_plan_days',
    #              'plan_list_price', 'actual_amount_paid', 'is_auto_renew', 'is_cancel',
    #              'transaction_date_year', 'transaction_date_month', 'transaction_date_date',
    #              'membership_expire_date_year',
    #              'membership_expire_date_month', 'membership_expire_date_date', 'membership_transaction_gap',
    #              'cancel_times',
    #              'auto_renew_count', 'plan_net_worth', 'user_date_year', 'user_date_month',
    #              'user_date_date']
    # con_cols = [x for x in train.columns if x not in cat_cols and x not in ['msno', 'is_churn']]
    # train[cat_cols] = train[cat_cols].astype('object')
    # test[cat_cols] = test[cat_cols].astype('object')
    # val[cat_cols] = val[cat_cols].astype('object')
    #
    # for col in cat_cols:
    #     train[col].fillna(value=train[col].mode()[0], inplace=True)
    #     test[col].fillna(value=test[col].mode()[0], inplace=True)
    #     val[col].fillna(value=val[col].mode()[0], inplace=True)
    # for col in con_cols:
    #     train[col].fillna(value=train[col].mean(), inplace=True)
    #     test[col].fillna(value=test[col].mean(), inplace=True)
    #     val[col].fillna(value=val[col].mean(), inplace=True)
    #
    # for c in train.columns:
    #     if train[c].dtype == 'object':
    #         lbl = LabelEncoder()
    #         lbl.fit(list(train[c].values) + list(test[c].values))
    #         train[c] = lbl.transform(list(train[c].values))
    #         test[c] = lbl.transform(list(test[c].values))

    n_comp = 15

    drop_list = []
    test_drop_list = []

    print(train.drop(drop_list, axis=1).shape, test.drop(test_drop_list, axis=1).shape)
    print('tSVD', datetime.now() - start)
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1))
    tsvd_results_val= tsvd.transform(val.drop(test_drop_list, axis=1))
    tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1))

    print('PCA', datetime.now() - start)
    # PCA
    pca = PCA(n_components=n_comp)
    pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1))
    pca2_results_val = pca.transform(val.drop(test_drop_list, axis=1))
    pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1))

    print('ICA', datetime.now() - start)
    # ICA
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1))
    ica2_results_val = ica.transform(val.drop(test_drop_list, axis=1))
    ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1))

    print('GRP', datetime.now() - start)
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1))
    grp_results_val = grp.transform(val.drop(test_drop_list, axis=1))
    grp_results_test = grp.transform(test.drop(test_drop_list, axis=1))

    print('SRP', datetime.now() - start)
    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1))
    srp_results_val = srp.transform(val.drop(test_drop_list, axis=1))
    srp_results_test = srp.transform(test.drop(test_drop_list, axis=1))

    # MCA
    # res_mca = MCA(train, ncp=n_comp, graph = FALSE)

    # save columns list before adding the decomposition components

    usable_columns = list(set(train.columns) - set(drop_list))

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        val['pca_' + str(i)] = pca2_results_val[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        val['ica_' + str(i)] = ica2_results_val[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        val['tsvd_' + str(i)] = tsvd_results_val[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        val['grp_' + str(i)] = grp_results_val[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        val['srp_' + str(i)] = srp_results_val[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    return train, val, test
Example #19
import numpy as np
from sklearn.random_projection import GaussianRandomProjection
import pandas as pd
from sklearn import metrics

balance_data = pd.read_csv('seeds.csv', sep=',', header=None)
X = balance_data.values[:, 0:6]
Y = balance_data.values[:, 7]
for i in range(1, 7):
    gaussian = GaussianRandomProjection(n_components=6)
    new_X = gaussian.fit_transform(X)
    np.savetxt('GAUSSIAN' + str(i) + '.csv', new_X)
def gaussianRP(data, orig_dimension, new_dimension):
    rp = GaussianRandomProjection(n_components=new_dimension)
    return rp.fit_transform(data)
Example #21
# PCA
pca = PCA(n_components=n_comp)  # random_state=400)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp)  #, random_state=400)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp,
                               eps=0.007,
                               random_state=400)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp,
                             dense_output=True,
                             random_state=400)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]

    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
Example #22
            s = node.label() + ' ' + node.parent().label() + ' ' + node.parent(
            ).parent().label()
            col.append(mapo[s])
    fm = sparse.csr_matrix(([1] * len(col), ([0] * len(col), col)),
                           shape=(1, len(mapo)))
    O[node.label()].append(fm)


for tree in tqdm(config.train, desc='NAACL collect'):
    for node in tree.postorder():
        collect(node)

rp = GaussianRandomProjection(n_components=500, random_state=42)
newI, newO = dict(), dict()
for k, v in tqdm(I.items(), desc='PCA/RP inside'):
    newI[config.nonterminal_map[k]] = rp.fit_transform(sparse.vstack(v))
for k, v in tqdm(O.items(), desc='PCA/RP outside'):
    newO[config.nonterminal_map[k]] = rp.fit_transform(sparse.vstack(v))

config.I = newI
config.O = newO

del M, counti, counto, mapi, mapo, I, O
transform_trees(config.train)

cnt = Counter()
for tree in config.train:
    for node in tree.postorder():
        Inode[node] = config.I[node.label()][cnt[node.label()]]
        Onode[node] = config.O[node.label()][cnt[node.label()]]
        cnt[node.label()] += 1
Example #23
def gen_feature(train, test):
    train = pd.DataFrame(train)
    test = pd.DataFrame(test)

    n_comp = 15
    drop_list = []
    test_drop_list = []

    print(train.drop(drop_list, axis=1).shape, test.drop(test_drop_list, axis=1).shape)
    print('tSVD', datetime.now() - start)
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1))
    tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1))

    print('PCA', datetime.now() - start)
    # PCA
    pca = PCA(n_components=n_comp)
    pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1))
    pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1))

    print('ICA', datetime.now() - start)
    # ICA
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1))
    ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1))

    print('GRP', datetime.now() - start)
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1))
    grp_results_test = grp.transform(test.drop(test_drop_list, axis=1))

    print('SRP', datetime.now() - start)
    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1))
    srp_results_test = srp.transform(test.drop(test_drop_list, axis=1))

    # MCA
    # res_mca = MCA(train, ncp=n_comp, graph = FALSE)

    # save columns list before adding the decomposition components

    usable_columns = list(set(train.columns) - set(drop_list))

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    return train, test
def perform_feature_engineering(train, test, config):

    for c in train.columns:
        if (len(train[c].value_counts()) == 2):
            if (train[c].mean() < config['SparseThreshold']):
                del train[c]
                del test[c]

    col = list(test.columns)
    if config['ID'] != True:
        col.remove('ID')

    # tSVD
    if (config['tSVD'] == True):
        tsvd = TruncatedSVD(n_components=config['n_comp'])
        tsvd_results_train = tsvd.fit_transform(train[col])
        tsvd_results_test = tsvd.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
            test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
    # PCA
    if (config['PCA'] == True):
        pca = PCA(n_components=config['n_comp'])
        pca2_results_train = pca.fit_transform(train[col])
        pca2_results_test = pca.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['pca_' + str(i)] = pca2_results_train[:, i - 1]
            test['pca_' + str(i)] = pca2_results_test[:, i - 1]
    # ICA
    if (config['ICA'] == True):
        ica = FastICA(n_components=config['n_comp'])
        ica2_results_train = ica.fit_transform(train[col])
        ica2_results_test = ica.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['ica_' + str(i)] = ica2_results_train[:, i - 1]
            test['ica_' + str(i)] = ica2_results_test[:, i - 1]

    # GRP
    if (config['GRP'] == True):
        grp = GaussianRandomProjection(n_components=config['n_comp'], eps=0.1)
        grp_results_train = grp.fit_transform(train[col])
        grp_results_test = grp.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['grp_' + str(i)] = grp_results_train[:, i - 1]
            test['grp_' + str(i)] = grp_results_test[:, i - 1]

    # SRP
    if (config['SRP'] == True):
        srp = SparseRandomProjection(n_components=config['n_comp'],
                                     dense_output=True,
                                     random_state=420)
        srp_results_train = srp.fit_transform(train[col])
        srp_results_test = srp.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['srp_' + str(i)] = srp_results_train[:, i - 1]
            test['srp_' + str(i)] = srp_results_test[:, i - 1]

    if config['magic'] == True:
        magic_mat = train[['ID', 'X0', 'y']]
        magic_mat = magic_mat.groupby(['X0'])['y'].mean()
        magic_mat = pd.DataFrame({
            'X0': magic_mat.index,
            'magic': list(magic_mat)
        })
        mean_magic = magic_mat['magic'].mean()
        train = train.merge(magic_mat, on='X0', how='left')
        test = test.merge(magic_mat, on='X0', how='left')
        test['magic'] = test['magic'].fillna(mean_magic)
    return train, test
Example #25
import sys
from sklearn.random_projection import SparseRandomProjection, GaussianRandomProjection
import numpy as np

in_dim = int(sys.argv[1])
out_dim = int(sys.argv[2])
out_file = sys.argv[3]

# dummy data
X = np.zeros((2, in_dim), dtype=float)

g = GaussianRandomProjection(out_dim)
g.fit_transform(X)

# random mat, transpose() from (out_d, int_d) to (in_d, out_d)
random_mat = g.components_.transpose()

random_mat.dump(out_file)
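A hedged usage sketch for the dumped matrix: ndarray.dump() pickles the array, so it can be reloaded with np.load(..., allow_pickle=True) and applied as a plain matrix product (scikit-learn computes the projection as X @ components_.T, which is exactly X @ random_mat here):

W = np.load(out_file, allow_pickle=True)  # shape (in_dim, out_dim)
X_proj = X @ W                            # same result as g.transform(X)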
Example #26
def retdata():
    # load data into dataframe
    window_size = 7
    df = pd.read_csv('ds.csv')
    df = df.drop('Unnamed: 0', axis=1)

    # add data part to dataframe
    df['datepart'] = pd.DatetimeIndex(pd.to_datetime(df['time'])).date

    # split dataframe up into the variable where we want the sum and where we want the mean
    means = ['mood', 'circumplex.valence', 'circumplex.arousal']
    df2 = df[df['variable'].isin(means)]
    df = df[~df['variable'].isin(means)]

    # create 2 different dataframes with different aggfunc and merge them
    pt1 = pd.pivot_table(df,
                         values='value',
                         index=['id', 'datepart'],
                         aggfunc='sum',
                         columns='variable').reset_index()  # 1973 rows
    pt2 = pd.pivot_table(df2,
                         values='value',
                         index=['id', 'datepart'],
                         aggfunc='mean',
                         columns='variable').reset_index()  # 1973 rows
    pt = pd.merge(pt1,
                  pt2,
                  how='left',
                  left_on=['id', 'datepart'],
                  right_on=['id', 'datepart'])

    # remove rows with no mood, valence, and arousal value--1268 rows
    ptcl1 = pt[np.isfinite(pt.mood)]  # 1268 rows
    ptcl2 = ptcl1[np.isfinite(ptcl1['circumplex.valence'])]  # 1266 rows
    ptcl3 = ptcl2[np.isfinite(ptcl2['circumplex.arousal'])]  # 1266 rows
    ptcl3['weekday'] = pd.to_datetime(ptcl3['datepart']).dt.day_name()
    le = sklearn.preprocessing.LabelEncoder()
    ptcl3['weekday'] = le.fit_transform(ptcl3['weekday'])
    ptcl3 = ptcl3.sort_values(by=['id', 'datepart'])

    new_feature_list = ['id', 'datepart', 'weekday', 'mood_']
    for feature_name in ptcl3.columns:
        if feature_name not in ['id', 'datepart', 'sms', 'call', 'weekday']:
            # new_feature_list.append((feature_name + 'PrevDay'))
            new_feature_list.append((feature_name + 'MeanPrevDays'))
            new_feature_list.append((feature_name + 'Gradient'))
            new_feature_list.append((feature_name + 'Log'))
        elif feature_name not in ['id', 'datepart', 'weekday']:
            # new_feature_list.append((feature_name + 'PrevDay'))
            new_feature_list.append((feature_name + 'SumPrevDays'))
            new_feature_list.append((feature_name + 'Gradient'))

    the_df = pd.DataFrame()
    #    the_df.columns = the_df.loc[0,:]
    #    the_df = the_df.drop(0)
    #    the_df = the_df.fillna(0)
    ptcl3 = ptcl3.fillna(0)

    # add previous day's mood

    id_set = list(OrderedDict.fromkeys(ptcl3['id']))
    for person in id_set:
        persondf = ptcl3[ptcl3['id'] == person]

        for feature_name in persondf.columns:
            if feature_name == 'mood':
                persondf['mood_'] = persondf[
                    'mood']  # all original feature names will be removed, hence the new name
            if feature_name not in [
                    'id', 'datepart', 'call', 'sms', 'weekday'
            ]:
                # persondf[str(feature_name)+'PrevDay'] = persondf[feature_name].shift(1)
                # persondf[str(feature_name)+'PrevDay'] = persondf[str(feature_name)+'PrevDay'].fillna(0)
                persondf[str(feature_name) + 'MeanPrevDays'] = persondf[str(
                    feature_name)].rolling(window_size).mean()
                persondf[str(feature_name) + 'Gradient'] = np.gradient(
                    persondf[str(feature_name)].rolling(window_size).mean())
                persondf[str(feature_name) + 'Log'] = np.log(
                    persondf[str(feature_name)])
                persondf[str(feature_name) + 'Log'][np.isneginf(
                    persondf[str(feature_name) + 'Log'])] = 0
                persondf = persondf.drop(feature_name, axis=1)
            elif feature_name not in [
                    'id', 'datepart', 'weekday'
            ]:  # looking at the sum instead of the mean of the previous days for sms and call
                # persondf[str(feature_name)+'PrevDay'] = persondf[feature_name].shift(1)
                # persondf[str(feature_name)+'PrevDay'] = persondf[str(feature_name)+'PrevDay'].fillna(0)
                persondf[str(feature_name) + 'SumPrevDays'] = persondf[str(
                    feature_name)].rolling(window_size).sum()
                persondf[str(feature_name) + 'Gradient'] = np.gradient(
                    persondf[str(feature_name)].rolling(window_size).mean())
                persondf = persondf.drop(feature_name, axis=1)

        persondf = persondf[persondf['activityGradient'].notnull()]  # arbitrary feature to remove the first 6 days
        persondf = persondf.fillna(0)

        pca = PCA(n_components=5)
        tsvd = TruncatedSVD(n_components=5)
        gp = GaussianRandomProjection(n_components=5)
        sp = SparseRandomProjection(n_components=5, dense_output=True)

        x_pca = pd.DataFrame(
            pca.fit_transform(
                persondf.drop(['mood_', 'id', 'datepart', 'weekday'], axis=1)))
        x_tsvd = pd.DataFrame(
            tsvd.fit_transform(
                persondf.drop(['mood_', 'id', 'datepart', 'weekday'], axis=1)))
        x_gp = pd.DataFrame(
            gp.fit_transform(
                persondf.drop(['mood_', 'id', 'datepart', 'weekday'], axis=1)))
        x_sp = pd.DataFrame(
            sp.fit_transform(
                persondf.drop(['mood_', 'id', 'datepart', 'weekday'], axis=1)))
        x_pca.columns = ["pca_{}".format(i) for i in x_pca.columns]
        x_tsvd.columns = ["tsvd_{}".format(i) for i in x_tsvd.columns]
        x_gp.columns = ["gp_{}".format(i) for i in x_gp.columns]
        x_sp.columns = ["sp_{}".format(i) for i in x_sp.columns]
        x_pca = x_pca.reset_index()
        x_tsvd = x_tsvd.reset_index()
        x_gp = x_gp.reset_index()
        x_sp = x_sp.reset_index()
        persondf = persondf.reset_index()
        persondf = pd.concat((persondf, x_pca), axis=1)
        persondf = pd.concat((persondf, x_tsvd), axis=1)
        persondf = pd.concat((persondf, x_gp), axis=1)
        persondf = pd.concat((persondf, x_sp), axis=1)

        the_df = pd.concat([the_df, persondf])
        the_df = the_df.fillna(0)

    # replace null with 0 and reindex
    cleandata = the_df.fillna(0)
    cleandata.index = range(len(cleandata.values))

    # clean up
    del ptcl1, ptcl2, ptcl3, pt, pt1, pt2, df, df2, means
    # get normalized datasets
    cleandata = cleandata.drop(cleandata.columns[[60, 66, 72, 78]], axis=1)
    normalizedwholeds = normalize(cleandata)
    normalizedperuser = normalizeperuser(cleandata)

    return normalizedwholeds, normalizedperuser, cleandata
n_outputs = 500
X = 3 + 5 * random_state.normal(size=(n_samples, n_outputs))

# Let's compute the sum of the variance in the original output space
var_origin = np.var(X, axis=0).sum()

# Let's compute the variance on a random subspace
all_n_components = np.array([1, 50, 100, 200, 400, 500])
n_repetitions = 10
distortion = np.empty((len(all_n_components), n_repetitions))

for i, n_components in enumerate(all_n_components):
    for j in range(n_repetitions):
        transformer = GaussianRandomProjection(n_components=n_components,
                                               random_state=random_state)
        X_subspace = transformer.fit_transform(X)
        distortion[i, j] = np.var(X_subspace, axis=0).sum() / var_origin

# Let's plot the distortion as a function of the compression ratio
distortion_mean = distortion.mean(axis=1)
distortion_std = distortion.std(axis=1)

plt.figure()
plt.plot(all_n_components / n_outputs, distortion_mean, "o-", color="g")
plt.plot(all_n_components / n_outputs, np.ones_like(distortion_mean),
         "--", color="r")
plt.fill_between(all_n_components / n_outputs,
                 distortion_mean - distortion_std,
                 distortion_mean + distortion_std, alpha=0.25, color="g")
plt.xlabel("n_components / n_outputs")
plt.ylabel('Distortion of the variance on a Gaussian subspace')
def ft_red_select(x,
                  y,
                  choice,
                  no_normalize,
                  dis_kept_features,
                  num_features=30):
    """
    :param 'full_file_name', which is the full path name to the file in question that we wish to do dimensionality
    reduction on
    :return: the new reduced 'x' and 'y' components of the file to be later written to a new file
    """

    #Normalize the data
    if not no_normalize:
        x = normalize(x)

    #Given the argument choice of feature selection/reduction, creates the relevant object, fits the 'x' data to it,
    #and reduces/transforms it to a lower dimensionality
    new_x = []
    print("Original 'x' shape:", np.shape(x))
    if choice == "pca":
        pca = PCA(n_components=num_features)
        new_x = pca.fit_transform(x)
        print("Explained variance = " +
              str(round(sum(pca.explained_variance_) * 100, 2)) + "%")
    elif choice == "grp":
        grp = GaussianRandomProjection(n_components=num_features)
        new_x = grp.fit_transform(x)
    elif choice == "agglom":
        agg = FeatureAgglomeration(n_clusters=num_features)
        new_x = agg.fit_transform(x)
    elif choice == "thresh":
        #Below threshold gives ~26 components upon application
        vt = VarianceThreshold(threshold=0.00015)
        new_x = vt.fit_transform(x)
        print("Explained variance = " +
              str(round(sum(vt.variances_) * 100, 2)) + "%")
        kept_features = list(vt.get_support(indices=True))
        if dis_kept_features:
            print("Kept features: ")
            for i in kept_features:
                print(col_names[i])
    elif choice == "rf":
        y_labels = [1 if s == "D" else 0 for s in y[:, 1]]
        clf = RandomForestClassifier(n_estimators=10000,
                                     random_state=0,
                                     n_jobs=-1)
        print("Fitting RF model....")
        clf.fit(x, y_labels)
        sfm = SelectFromModel(clf,
                              threshold=-np.inf,
                              max_features=num_features)
        print("Selecting best features from model...")
        sfm.fit(x, y_labels)
        kept_features = list(sfm.get_support(indices=True))
        if dis_kept_features:
            print("Kept features: ")
            for i in kept_features:
                print(col_names[i])
        new_x = x[:, kept_features]

    print("Reduced 'x' shape:", np.shape(new_x))
    return new_x, y
# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results = tsvd.fit_transform(train_test_p)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca_results = pca.fit_transform(train_test_p)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica_results = ica.fit_transform(train_test_p)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results = grp.fit_transform(train_test_p)

# SRP
srp = SparseRandomProjection(n_components=n_comp,
                             dense_output=True,
                             random_state=420)
srp_results = srp.fit_transform(train_test_p)

# save columns list before adding the decomposition components
usable_columns = train_test_p.columns

print(train_test_p.shape)
print(tsvd_results.shape, type(tsvd_results))

# # Append decomposition components to datasets
for i in range(1, n_comp + 1):
Example #30
    plt.close()

Cancer_X, Cancer_y = ReadData.ReadCancerData()
Cancer_X = Cancer_X.iloc[:, :-1].values
Cancer_X = preprocessing.scale(Cancer_X)


Cancer_X_train, Cancer_X_test, Cancer_y_train, Cancer_y_test = train_test_split(Cancer_X, Cancer_y,random_state= 7641, test_size=0.2)


dims = range(2,30)
aa = defaultdict(dict)

for i,dim in product(range(10),dims):
    rp = GaussianRandomProjection(random_state=i, n_components=dim)
    aa[dim][i] = DistCorr(rp.fit_transform(Cancer_X_train), Cancer_X_train)
    
aa = pd.DataFrame(aa).T
mean_recon = aa.mean(axis=1).tolist()

fig, ax1 = plt.subplots()
ax1.plot(dims,mean_recon)
ax1.set_xlabel('Components')
ax1.set_ylabel('Pair Wise DisCorr')
plt.grid(linestyle='-', linewidth=1, axis = "x")
plt.title("Random Components Pair Wise DisCorr Cancer")
plt.savefig('Cancer_RP.png')
plt.show()
plt.close()

Cancer_RP = GaussianRandomProjection(n_components=13,random_state=7641).fit_transform(Cancer_X)
Example #31
X_agg.head()

# In[14]:

from sklearn.decomposition import FactorAnalysis

fa = FactorAnalysis(n_components=50, random_state=42)
X_fa = fa.fit_transform(X)

# In[15]:

from sklearn.random_projection import GaussianRandomProjection

grp = GaussianRandomProjection(n_components=50, random_state=42, eps=0.1)
X_grp = grp.fit_transform(X)

# In[16]:

# from sklearn.decomposition import PCA

# pca = PCA(n_components=100, random_state=42)
# X_pca = pca.fit_transform(X)

# In[17]:

# from sklearn.decomposition import FastICA

# ica = FastICA(n_components=15, random_state=42)
# X_ica = ica.fit_transform(X)
Example #32
def get_dc_feature(df_train,
                   df_test,
                   n_comp=12,
                   id_column=None,
                   label_column=None):
    """
    构造分解特征
    """
    train = df_train.copy()
    test = df_test.copy()

    if id_column:
        train_id = train[id_column]
        test_id = test[id_column]
        train = drop_columns(train, [id_column])
        test = drop_columns(test, [id_column])
    if label_column:
        train_y = train[label_column]
        train = drop_columns(train, [label_column])

    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
    tsvd_results_train = tsvd.fit_transform(train)
    tsvd_results_test = tsvd.transform(test)

    # PCA
    pca = PCA(n_components=n_comp, random_state=420)
    pca2_results_train = pca.fit_transform(train)
    pca2_results_test = pca.transform(test)

    # ICA
    ica = FastICA(n_components=n_comp, random_state=420)
    ica2_results_train = ica.fit_transform(train)
    ica2_results_test = ica.transform(test)

    # GRP
    grp = GaussianRandomProjection(n_components=n_comp,
                                   eps=0.1,
                                   random_state=420)
    grp_results_train = grp.fit_transform(train)
    grp_results_test = grp.transform(test)

    # SRP
    srp = SparseRandomProjection(n_components=n_comp,
                                 dense_output=True,
                                 random_state=420)
    srp_results_train = srp.fit_transform(train)
    srp_results_test = srp.transform(test)

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]

    if id_column:
        train[id_column] = train_id
        test[id_column] = test_id
    if label_column:
        train[label_column] = train_y

    return train, test
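A hypothetical call, assuming df_train and df_test are numeric DataFrames with an 'ID' column and label column 'y' (names assumed for illustration):

train_fe, test_fe = get_dc_feature(df_train, df_test, n_comp=12,
                                   id_column='ID', label_column='y')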
Example #33
# In[155]:


n_random_comp = 7


# In[156]:


random_proj = GaussianRandomProjection(n_components=n_random_comp)


# In[157]:


X_random_proj = random_proj.fit_transform(X_scaled)


# In[158]:


df_random_proj = pd.DataFrame(
    data=X_random_proj,
    columns=['Random_projection' + str(i) for i in range(1, n_random_comp + 1)])


# ### Running k-means on random projections

# In[159]:


km_sse= []
km_silhouette = []
tsvd_results_train = tsvd.fit_transform(train_df)
tsvd_results_test = tsvd.transform(test_df)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train_df)
pca2_results_test = pca.transform(test_df)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train_df)
ica2_results_test = ica.transform(test_df)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train_df)
grp_results_test = grp.transform(test_df)

# SRP
srp = SparseRandomProjection(n_components=n_comp,
                             dense_output=True,
                             random_state=420)
srp_results_train = srp.fit_transform(train_df)
srp_results_test = srp.transform(test_df)

############
# write output
#############
print("Writing output...")
outputTrain = pd.DataFrame()
outputTest = pd.DataFrame()
# power_t : double, optional, default 0.5
# max_iter : int, optional, default 200
# shuffle : bool, optional, default True
# random_state : int, RandomState instance or None, optional, default None
# tol : float, optional, default 1e-4
# early_stopping : bool, default False
# validation_fraction : float, optional, default 0.1; only used if early_stopping True

clf = GaussianRandomProjection(
    random_state=0,
    n_components=20,
)

print(clf)

X_train = clf.fit_transform(X_train)
X_test = clf.fit_transform(X_test)
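# note: fit_transform on X_test above redraws the random matrix;
# clf.transform(X_test) would project it with the matrix fitted on X_train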

train_results = []
test_results = []

clf = MLPClassifier(
    random_state=0,
    hidden_layer_sizes=(100),
    activation='relu',
    solver='adam',
    batch_size=100,
    early_stopping=True,
    beta_1=.001,
    beta_2=.999,
)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    train['pca_' + str(i)] = pca2_results_train[:,i-1]
    test['pca_' + str(i)] = pca2_results_test[:, i-1]

    train['ica_' + str(i)] = ica2_results_train[:,i-1]
    test['ica_' + str(i)] = ica2_results_test[:, i-1]
start_time = time.time()

# Load the data
from income_data import X, y, X_train, X_test, y_train, y_test

# Scale the data
scaler = StandardScaler()
scaler.fit(X)
X_train_std = scaler.transform(X)
X_test_std = scaler.transform(X)
X_toCluster = X_train_std
y_inputs = y

# Reduce Dimensionality (Randomized Projections)
projection = ProjectionAlgorithm(n_components=22)
X_toCluster = projection.fit_transform(X_toCluster)

######
# Run k-means clustering with 1:n clusters determine scores for each
######
scores = []
silhouette_avg = []
BIC = []
maxClusters = 100
minClusters = 1
for i in range(minClusters, maxClusters):
    kmeans = KMeans(n_clusters=i + 1, random_state=0)
    cluster_labels = kmeans.fit_predict(X_toCluster)
    scores.append(kmeans.score(X_toCluster))
    silhouette_avg.append(silhouette_score(X, cluster_labels))
    BIC.append(compute_bic(kmeans, X_toCluster))
from load_mydata import LoadData
import math

mushroom = LoadData("mushroom")
data = scale(mushroom.data)
labels = np.array(mushroom.labels)

n_samples, n_features = data.shape
n_digits = len(np.unique(labels))
n_iter = 1000

print("n_digits: %d, \t n_samples %d, \t n_features %d"
      % (n_digits, n_samples, n_features))
t0 = time()
rp = GaussianRandomProjection(n_components=20)
reduced_data = rp.fit_transform(data)
print("time spent: %0.3fs" % (time()-t0))
#reduced_data = data

# Plot the data
fig=plt.figure()
#plt.clf()
n_plots=9
h = 0.02
x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max()
y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()
for index in range(1,n_plots+1):
   vert = math.floor(math.sqrt(n_plots))
   hori = math.ceil(n_plots / vert)  # integer grid dimensions for add_subplot
   fig.add_subplot(vert, hori, index)
   i,j = 2*index-2, 2*index-1
Example #39
# Perform Truncated Singular Value Decomposition (SVD)
from sklearn.decomposition import TruncatedSVD as TruncSVD
tsvd = TruncSVD(n_components=num_components,  algorithm='randomized', random_state=0)
tsvd_transformed_data_train = tsvd.fit_transform(sparse_trainData)
tsvd_transformed_data_valid = tsvd.transform(sparse_validData)

# Perform Randomized Principal Components Analysis (PCA)
from sklearn.decomposition import RandomizedPCA as RPCA
rpca = RPCA(n_components=num_components)
rpca_transformed_data_train = rpca.fit_transform(dense_trainData)
rpca_transformed_data_valid = rpca.transform(dense_validData)

# Perform Gaussian Random Projection
from sklearn.random_projection import GaussianRandomProjection as GaussRan
grp = GaussRan(n_components=num_components)
grp_transformed_data_train = grp.fit_transform(dense_trainData)
grp_transformed_data_valid = grp.transform(dense_validData)

# Perform Sparse Random Projection
from sklearn.random_projection import SparseRandomProjection as SparseRan
srp = SparseRan(n_components=num_components, random_state=0)
srp_transformed_data_train = srp.fit_transform(dense_trainData)
srp_transformed_data_valid = srp.transform(dense_validData)

# Perform classification using 1-Nearest Neighbor Classifier
from sklearn.neighbors import KNeighborsClassifier

# Create a subset grid to plot performance against numbers of components
tsvd_max = tsvd_transformed_data_train.shape[1]
plot_subset = []
length_of_plot_subset = len(plot_subset)
Example #40
import itertools
from scipy import linalg
import matplotlib as mpl
from sklearn import mixture

df = pd.read_csv('tic-tac-toe.data', sep=",", skiprows=0)

df = np.array(df)
print(df.shape, df.dtype)
dat = df[:, 0:9]
tar1 = df[:, 9]
X = dat
y = tar1

rp1 = GaussianRandomProjection(n_components=2)
X1 = rp1.fit_transform(X)
plt.figure()
for i in range(len(y)):
    if y[i] == 0:
        plt.scatter(X1[i, 0], X1[i, 1], color='r')
    elif y[i] == 1:
        plt.scatter(X1[i, 0], X1[i, 1], color='b')
plt.title('visualization of data in 2D (RP) -> tic-tac-toe dataset')
plt.show()

r = np.array([7, 17, 37, 57, 77])
plt.figure()
for m in range(5):

    x1 = []
    # e1=[]
Example #41
    X = data.iloc[:, :41]
    y = data.iloc[:, 41]
    scaler = MinMaxScaler(feature_range=[0, 100])
    from sklearn.preprocessing import StandardScaler
    X_norm = StandardScaler().fit_transform(X)
    ###
    pca = PCA(n_components=10, random_state=10)
    X_r = pca.fit(X).transform(X)
    X_pca = X_r
    ####
    ica = FastICA(n_components=10, random_state=10)
    X_r = ica.fit(X).transform(X)
    X_ica = X_r
    ####
    rca = GaussianRandomProjection(n_components=10, random_state=10)
    X_r = rca.fit_transform(X_norm)
    X_rca = X_r
    ####
    svd = SVD(n_components=2)
    X_r = svd.fit_transform(X_norm)
    X_svd = X_r

    clf = MLPClassifier(hidden_layer_sizes=(82, 82, 82),
                        alpha=0.316227766,
                        learning_rate_init=0.016,
                        random_state=0,
                        solver="lbfgs")

    clusterer = KMeans(n_clusters=10, random_state=10).fit(X_pca)
    y_kmeans = clusterer.labels_
    X_df = pd.DataFrame(X_pca)
Example #42
def Random_Projection(M, new_dim, prng):
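    # note: the prng argument is unused below; random_state=prng was presumably intended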
    proj = GaussianRandomProjection(n_components=new_dim, eps=0.1, random_state=None)
    return proj.fit_transform(M)
Example #43
int10 = int10 / max(int10)

df_non_obj_feats['binSum'] = df_non_obj_feats.apply(sum, 1)
df_non_obj_feats['binDec'] = int10

all_data_proc = pd.concat((df_obj_feats_freq, df_non_obj_feats), axis=1)

#%%
from sklearn.decomposition import PCA, FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
n_comp = 12

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results = grp.fit_transform(all_data_proc)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results = srp.fit_transform(all_data_proc)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca_results = pca.fit_transform(all_data_proc)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica_results = ica.fit_transform(all_data_proc)
for i in range(1, n_comp+1):
    all_data_proc['pca_' + str(i)] = pca_results[:,i-1]
    all_data_proc['ica_' + str(i)] = ica_results[:, i-1]
def gaussianRP(data):
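    # note: relies on a module-level new_dimension; the earlier
    # gaussianRP(data, orig_dimension, new_dimension) takes it as a parameter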
    rp = GaussianRandomProjection(n_components=new_dimension)
    return rp.fit_transform(data)