import matplotlib.pyplot as plt
from sklearn.random_projection import GaussianRandomProjection


def randomized_projection(X, y, dataset_name):
    rand = GaussianRandomProjection(n_components=2)

    X_transformed = rand.fit_transform(X)
    plt.figure()
    plt.title(
        '{} data after Gaussian Random Projection into 2 components'.format(
            dataset_name))
    plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=y)
    plt.show()

    rand = GaussianRandomProjection(n_components=3)
    X_transformed = rand.fit_transform(X)
    # Visualize transformed data in 3D
    fig = plt.figure(figsize=(4, 3))
    ax = fig.add_subplot(projection='3d')
    ax.view_init(elev=48, azim=134)
    plt.title(
        '{} data after Gaussian Random Projection into 3 components'.format(
            dataset_name))
    ax.scatter(X_transformed[:, 0],
               X_transformed[:, 1],
               X_transformed[:, 2],
               c=y)
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.set_zticklabels([])
    plt.show()
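
A minimal driver for the helper above (a sketch; the iris loader and its
import are assumptions, since the snippet does not show its caller):

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
randomized_projection(X, y, 'Iris')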
Example #2
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        eps = self.hyperparams['eps']
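        # johnson_lindenstrauss_min_dim gives the smallest dimensionality
        # that preserves pairwise distances within a factor of (1 +/- eps)
        # with high probability (Johnson-Lindenstrauss lemma)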
        n_components = johnson_lindenstrauss_min_dim(n_samples=self._x_dim,
                                                     eps=eps)
        _logger.info("[INFO] n_components is " + str(n_components))
        if n_components > self._y_dim:
            # Default n_components == 'auto' fails. Need to explicitly assign n_components
            self._model = GaussianRandomProjection(
                n_components=self._y_dim, random_state=self.random_seed)
        else:
            try:
                self._model = GaussianRandomProjection(
                    eps=eps, random_state=self.random_seed)
                self._model.fit(self._training_data)
            except Exception:
                _logger.info(
                    "[Warning] Using given eps value failed, will use default conditions."
                )
                self._model = GaussianRandomProjection()

        self._model.fit(self._training_data)

        self._fitted = True
        return CallResult(None, has_finished=True)
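
For intuition on the eps/n_components trade-off that fit() relies on,
a standalone sketch (the sample count is an arbitrary illustration):

from sklearn.random_projection import johnson_lindenstrauss_min_dim

# A tight distortion bound needs many components...
print(johnson_lindenstrauss_min_dim(n_samples=1000, eps=0.1))  # ~5920
# ...while a loose bound needs far fewer
print(johnson_lindenstrauss_min_dim(n_samples=1000, eps=0.5))  # ~331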
def test_output_transformer():
    X, y = datasets.make_multilabel_classification(return_indicator=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    # Check that the random_state values differ across estimators
    transformer = GaussianRandomProjection(n_components=5, random_state=None)
    for name, ForestEstimator in FOREST_ESTIMATORS.items():
        est = ForestEstimator(random_state=5, output_transformer=transformer)
        est.fit(X_train, y_train)
        y_pred = est.predict(X_test)
        assert_equal(y_pred.shape, y_test.shape)

        random_state = [
            sub.output_transformer_.random_state for sub in est.estimators_
        ]

        assert_equal(len(set(random_state)), est.n_estimators)

    # Check that the random_state values are all equal
    transformer = FixedStateTransformer(
        GaussianRandomProjection(n_components=5), random_seed=0)
    for name, ForestEstimator in FOREST_ESTIMATORS.items():
        est = ForestEstimator(random_state=5, output_transformer=transformer)
        est.fit(X_train, y_train)
        y_pred = est.predict(X_test)
        assert_equal(y_pred.shape, y_test.shape)

        random_state = [
            sub.output_transformer_.random_state for sub in est.estimators_
        ]

        assert_equal(len(set(random_state)), 1)
        assert_equal(random_state[0], 0)
def test_fixed_state_transformer():

    random_state = check_random_state(0)
    X = random_state.rand(500, 100)

    # Check that setting the random_seed is equivalent to setting
    # the random_state
    transf = GaussianRandomProjection(n_components=5, random_state=0)
    fixed_transf = FixedStateTransformer(
        GaussianRandomProjection(n_components=5), random_seed=0)
    assert_array_almost_equal(fixed_transf.fit_transform(X),
                              transf.fit_transform(X))

    # Check that set_params doesn't modify the results
    fixed_transf = FixedStateTransformer(
        GaussianRandomProjection(n_components=5, random_state=None))

    fixed_transf2 = FixedStateTransformer(
        GaussianRandomProjection(random_state=1, n_components=5))

    assert_array_almost_equal(fixed_transf.fit_transform(X),
                              fixed_transf2.fit_transform(X))

    # Check that it works when there is no random_state
    fixed_transf = FixedStateTransformer(IdentityProjection())
    assert_array_almost_equal(fixed_transf.fit_transform(X), X)
Example #5
def rp_dim_red(x_train_scaled, dataset_name, features_num=19):
    print(x_train_scaled.shape[1])
    rp_feature_losses = []
    rp_feature_stds = []
    z = 0
    for k in range(1, x_train_scaled.shape[1]+1):
        losses = []
        for m in range(5):
            rp = GaussianRandomProjection(k)
            rp_result = rp.fit_transform(x_train_scaled)
            # inverse_transform
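            # (GaussianRandomProjection has no inverse_transform; the
            # Moore-Penrose pseudo-inverse of the projection matrix is the
            # usual stand-in for mapping back to the original space)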
            inv = np.linalg.pinv(rp.components_.T)
            x_projected_rp = rp_result.dot(inv)
            loss = ((x_train_scaled - x_projected_rp) ** 2).mean()
            losses.append(loss)
        rp_feature_stds.append(np.std(losses))   
        rp_feature_losses.append(np.mean(losses))

    np_feature_losses_percent = np.multiply(100, rp_feature_losses/np.sum(rp_feature_losses))
    print("std")
    print(rp_feature_stds)
    print('loss')
    print(rp_feature_losses)
    print('sum')
    print(np.sum(rp_feature_losses))
    print('%')
    print(np_feature_losses_percent)
    print('num of components covering > 90% of total loss')
    for i in range(len(np_feature_losses_percent)):
        z=z+np_feature_losses_percent[i]
        if z>90:
            print(i+1)
            break    
    plt.bar(list(range(1,len(np_feature_losses_percent)+1)), np_feature_losses_percent)
    plt.title("Random Projection Losses % ("+str(dataset_name)+")")
    plt.ylabel("Mean Squared Error (% of Total)")
    plt.xlabel("Features")
    plt.savefig((str(dataset_name))+' rp analysis % loss.png')
    plt.show()
    
    plt.bar(list(range(1,len(rp_feature_losses)+1)), rp_feature_losses)
    plt.title("Random Projection Losses ("+str(dataset_name)+")")
    plt.ylabel("Mean Squared Error")
    plt.xlabel("Features")
    # plt.subplots_adjust(bottom=.15, left=.15)
    plt.savefig((str(dataset_name))+' rp analysis.png')
    plt.show()
    
    plt.bar(list(range(1,len(rp_feature_stds)+1)), rp_feature_stds)
    plt.title("Random Projection STDs ("+str(dataset_name)+")")
    plt.ylabel("STD")
    plt.xlabel("Features")
    plt.savefig((str(dataset_name))+' rp std analysis.png')
    plt.show()
    
    rp = GaussianRandomProjection(features_num, random_state=random_state)
    rp_result = rp.fit_transform(x_train_scaled)
    inv = np.linalg.pinv(rp.components_.T)
    x_projected_rp = rp_result.dot(inv)
    return rp_result, x_projected_rp
Example #6
def run_rp(dataset_name, X, y, verbose=False):
    # attempt RP for various dimensionality levels
    n_components_vals = np.arange(1, len(X.columns))
    iterations = np.arange(1, 15)
    recon_losses = []

    for n_components in n_components_vals:
        # see how reconstruction loss changes across iterations
        tmp_recon_losses = []
        for i in iterations:
            rp = GaussianRandomProjection(n_components=n_components, random_state=i)
            X_rp = rp.fit_transform(X)

            # calculate reconstruction error
            X_comp_pinv = np.linalg.pinv(rp.components_.T)
            X_projection = np.dot(X_rp, X_comp_pinv)
            recon_loss = ((X - X_projection) ** 2).mean()
            # if verbose: print(recon_loss.shape)
            tmp_recon_losses.append(np.sum(recon_loss))

        tmp_avg_recon_loss = np.mean(np.array(tmp_recon_losses))
        recon_losses.append(tmp_avg_recon_loss)

    if dataset_name == 'abalone':
        n_components = 3
    else:
        n_components = 25

    # plot reconstruction losses
    # if verbose: print(recon_losses[0])
    recon_losses = np.array(recon_losses)
    plot_title = "RP for " + dataset_name + ": Reconstruction loss\n"
    plotting.plot_recon_loss(
        recon_losses, n_components_vals, title=plot_title)
    plt.savefig('graphs/rp_' + dataset_name + '_recon_loss.png')
    plt.clf()

    # calculate reconstruction error
    grp = GaussianRandomProjection(n_components=n_components, random_state=RANDOM_SEED)
    X_rp = grp.fit_transform(X)

    X_comp_pinv = np.linalg.pinv(grp.components_.T)
    X_projection = np.dot(X_rp, X_comp_pinv)
    recon_loss = ((X - X_projection) ** 2).mean()

    print(dataset_name, ": RP reconstruction loss for k =", n_components, ":", np.sum(recon_loss), '\n')
    X_rp = pd.DataFrame(X_rp)

    # run K-means
    clustering.run_k_means(dataset_name, X_rp, y, dim_reduction='rp', verbose=verbose)

    # run EM
    clustering.run_expect_max(dataset_name, X_rp, y, dim_reduction='rp', verbose=verbose)

    return X_rp
Example #7
def property_plot(model_name, n_comp, n_cluster, data, label):
    if model_name == 'PCA':
        train_PCA = PCA(n_components=n_comp).fit(data)
        reduced = PCA(n_components=n_comp).fit_transform(data)
        estimator = KMeans(init=train_PCA.components_,
                           n_clusters=n_cluster,
                           max_iter=2000,
                           n_init=1)
    elif model_name == 'ICA':
        train_ICA = FastICA(n_components=n_comp).fit(data)
        reduced = FastICA(n_components=n_comp).fit_transform(data)
        estimator = KMeans(init=train_ICA.components_,
                           n_clusters=n_cluster,
                           max_iter=2000,
                           n_init=1)
    elif model_name == 'RP':
        train_RP = GaussianRandomProjection(n_components=n_comp).fit(data)
        reduced = GaussianRandomProjection(
            n_components=n_comp).fit_transform(data)
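        # NOTE: an array passed to KMeans as init must have shape
        # (n_clusters, n_features), so this branch assumes n_cluster == n_comp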
        estimator = KMeans(init=train_RP.components_,
                           n_clusters=n_cluster,
                           max_iter=2000,
                           n_init=1)
    elif model_name == 'TSVD':
        train_SVD = TruncatedSVD(n_components=n_comp).fit(data)
        reduced = TruncatedSVD(n_components=n_comp).fit_transform(data)
        estimator = KMeans(init=train_SVD.components_,
                           n_clusters=n_cluster,
                           max_iter=2000,
                           n_init=1)
    elif model_name == 'k-means':
        reduced = data
        estimator = KMeans(init='k-means++',
                           n_clusters=n_cluster,
                           max_iter=2000)

    np.random.seed(99)
    t0 = time()
    estimator.fit(data)
    runtime = time() - t0
    dist = estimator.inertia_
    homog = metrics.homogeneity_score(label, estimator.labels_)
    compl = metrics.completeness_score(label, estimator.labels_)
    est2 = KMeans(init='k-means++', n_clusters=n_cluster,
                  max_iter=2000).fit(reduced)
    newlabels = est2.predict(reduced)
    correct = 1.0 * sum(label == newlabels) / len(label)

    print(
        '% 9s      %3i         %3i      %.3f        %i      %.3f        %.3f    %.3f'
        % (model_name, n_cluster, n_comp, runtime, dist, homog, compl, correct))

    return (model_name, n_cluster, n_comp, runtime, dist, homog, compl,
            estimator.labels_)
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        eps = self.hyperparams['eps']
        n_components = johnson_lindenstrauss_min_dim(n_samples=self._x_dim,
                                                     eps=eps)
        if n_components > self._x_dim:
            self._model = GaussianRandomProjection(n_components=self._x_dim)
        else:
            self._model = GaussianRandomProjection(eps=eps)
        self._model.fit(self._training_data)
Example #9
def reduce_then_cluster(X,y, dataset_name):
    # First, PCA n=2
    pca = PCA(n_components=2)
    X_transformed = pca.fit_transform(X)

    kmeans(X_transformed, y, dataset_name + ' - After PCA (n_components=2)')
    expectation_maximization(X_transformed, y, dataset_name + ' - After PCA (n_components=2)')

    # Then, PCA n=3
    pca = PCA(n_components=3)
    X_transformed = pca.fit_transform(X)

    kmeans(X_transformed, y, dataset_name + ' - After PCA (n_components=3)')
    expectation_maximization(X_transformed, y, dataset_name + ' - After PCA (n_components=3)')

    # ICA, n=2
    ica = FastICA(n_components=2)
    X_transformed = ica.fit_transform(X)

    kmeans(X_transformed, y, dataset_name + ' - After ICA (n_components=2)')
    expectation_maximization(X_transformed, y, dataset_name + ' - After ICA (n_components=2)')

    # ICA, n=3
    ica = FastICA(n_components=3)
    X_transformed = ica.fit_transform(X)

    kmeans(X_transformed, y, dataset_name + ' - After ICA (n_components=3)')
    expectation_maximization(X_transformed, y, dataset_name + ' - After ICA (n_components=3)')

    # Random Projections, n=2
    rand = GaussianRandomProjection(n_components=2, random_state=65)
    X_transformed = rand.fit_transform(X)
    kmeans(X_transformed, y, dataset_name + ' - After Gaussian Random Projection (n_components=2)')
    expectation_maximization(X_transformed, y, dataset_name + ' - After Gaussian Random Projection (n_components=2)')

    # Random Projections, n=3
    rand = GaussianRandomProjection(n_components=3, random_state=65)
    X_transformed = rand.fit_transform(X)
    kmeans(X_transformed, y, dataset_name + ' - After Gaussian Random Projection (n_components=3)')
    expectation_maximization(X_transformed, y, dataset_name + ' - After Gaussian Random Projection (n_components=3)')

    # Select K best, k=2
    select = SelectKBest(f_classif, k=2)
    X_transformed = select.fit_transform(X,y)
    kmeans(X_transformed, y, dataset_name + ' - After 2 Best Features Selected')
    expectation_maximization(X_transformed, y, dataset_name + ' - After 2 Best Features Selected')

    # Select K best, k=3
    select = SelectKBest(f_classif, k=3)
    X_transformed = select.fit_transform(X,y)
    kmeans(X_transformed, y, dataset_name + ' - After 3 Best Features Selected')
    expectation_maximization(X_transformed, y, dataset_name + ' - After 3 Best Features Selected')
    def rand_guas(self, n_comp, data=None):
        if data is None:
            data = self.train
        else:
            data = pd.DataFrame(data)

        rand_guas = GaussianRandomProjection(n_components=n_comp)
        rand_guas.fit(data)
        self.rand_guas_train_data = rand_guas.transform(data)
        self.RAND_GUAS = rand_guas

        # Reuse the projection fitted on the training data so the test
        # set is mapped with the same random matrix
        self.rand_guas_test_data = rand_guas.transform(self.test)
Example #11
def compute_neural_network(a, b):
    ### PCA
    ### ICA
    ### RP
    ### Feature Importance

    # dataset a
    clf = PCA(n_components=3)
    temp = clf.fit_transform(a[0])
    print(f'Admissions Dataset: PCA {run_nn(temp, a[1])}')

    clf = FastICA(n_components=7, random_state=seed)
    temp = clf.fit_transform(a[0])
    print(f'Admissions Dataset: ICA {run_nn(temp, a[1])}')

    clf = GaussianRandomProjection(n_components=2, random_state=seed)
    temp = clf.fit_transform(a[0])
    print(f'Admissions Dataset: RP {run_nn(temp, a[1])}')

    important_features = ['CGPA', 'GRE Score', 'TOEFL Score']
    temp_data = dict()
    for feature in important_features:
        temp_data[feature] = a[0][feature]

    temp = pd.DataFrame(temp_data)

    print(f'Admissions Dataset: Feature Importance {run_nn(temp, a[1])}')

    # dataset b
    ## 5 PCA, 7 ICA
    clf = PCA(n_components=5)
    temp = clf.fit_transform(b[0])
    print(f'Income Dataset: PCA {run_nn(temp, b[1])}')

    clf = FastICA(n_components=7, random_state=seed)
    temp = clf.fit_transform(b[0])
    print(f'Income Dataset: ICA {run_nn(temp, b[1])}')

    clf = GaussianRandomProjection(n_components=2, random_state=seed)
    temp = clf.fit_transform(b[0])
    print(f'Income Dataset: RP {run_nn(temp, b[1])}')

    important_features = ['fnlwgt', 'age', 'education-num']
    temp_data = dict()
    for feature in important_features:
        temp_data[feature] = b[0][feature]

    temp = pd.DataFrame(temp_data)

    print(f'Income Dataset: Feature Importance {run_nn(temp, b[1])}')
Example #12
def myRCA(data, act_labels, output_folder, experiment_name):
    # Let's start by exploring the random projections we get for different number of components
    num_features = data.shape[1]
    values = list(range(1, num_features))
    rn = np.random.RandomState(13)
    random_seeds = list(rn.randint(1, 1000000, 20))
    errors = []
    for r in random_seeds:
        mses = []
        for k in values:
            rca = GaussianRandomProjection(n_components=k,
                                           random_state=r).fit(data)
            trans_data = rca.transform(data)
            inv_data = np.linalg.pinv(rca.components_.T)
            rec_data = trans_data.dot(inv_data)
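            # MSE is assumed to be sklearn.metrics.mean_squared_error
            # imported under an alias elsewhere in the module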
            mse = MSE(rec_data, data.values)
            mses.append(mse)
        errors.append(mses)
    avg_errors = np.mean(np.array(errors), axis=0)
    std_errors = np.std(np.array(errors), axis=0)

    # Graph the reconstruction error per component
    plt.errorbar(values, avg_errors, std_errors)
    plt.xticks(ticks=values, labels=values)
    plt.xlabel('# Components')
    plt.ylabel('Reconstruction Error')
    plt.title(
        'Average Reconstruction Error for K Components Over 20 Random Seeds')
    plt.savefig(output_folder + '/' + experiment_name +
                '_rca_component_reconstruction_error.png')
    plt.close()
    plt.figure()

    # Create a final rca to return
    k = np.argmin(avg_errors) + 1  # add 1 to account for 0 indexing
    thresh = 0.2
    for i in range(len(avg_errors)):
        if avg_errors[i] <= thresh:
            k = i + 1  # Add 1 to account for 0 indexing
            break

    start_time = time.time()
    rca = GaussianRandomProjection(n_components=k, random_state=13).fit(data)
    end_time = time.time()
    final_time = end_time - start_time

    return rca, final_time
def comp1(K):
    Sum_of_squared_distances = []
    k = []
    accuracy_train = []
    accuracy_test = []
    score = []
    for i in range(1, K):
        print(i)
        rp = GaussianRandomProjection(n_components=10, eps=0.6)
        rp.fit(X)
        X_reduced = rp.transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X_reduced,
                                                            y,
                                                            test_size=0.20)
        km = MLPClassifier(solver='lbfgs',
                           alpha=1e-5,
                           hidden_layer_sizes=[8, 8, 8, 8, 8],
                           random_state=1)
        # Fit on the training split only; refitting on the test split
        # would leak the test data into the classifier
        km.fit(X_train, y_train)
        label_train = km.predict(X_train)
        label_test = km.predict(X_test)
        accu_train = km.score(X_train, y_train)
        accu_test = km.score(X_test, y_test)
        k.append(i)
        accuracy_train.append(accu_train)
        accuracy_test.append(accu_test)
        #score.append(score_train1)
        #print(accuracy)
    k = np.array(k)
    Sum_of_squared_distances = np.array(Sum_of_squared_distances)
    score = np.array(score)
    accuracy_train = np.array(accuracy_train)
    accuracy_test = np.asarray(accuracy_test)
    #line1,=plt.plot(k, Sum_of_squared_distances, 'bx-',marker='o')
    #line2,=plt.plot(k,score,color='g',marker='o')
    line3, = plt.plot(k,
                      accuracy_train,
                      color='r',
                      marker='o',
                      label='train_accuracy')
    line4, = plt.plot(k,
                      accuracy_test,
                      color='g',
                      marker='o',
                      label='test_accuracy')
    #plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
    plt.xlabel('k')
    plt.legend()
    plt.ylabel('accuracy')
    #plt.ylim(0,1)
    plt.show()
    return None
Example #14
    def __init__(self, nComp):
        self._N_COMP = nComp
        self._pca = PCA(n_components=self._N_COMP, random_state=17)
        self._tsvd = TruncatedSVD(n_components=self._N_COMP, random_state=17)
        self._ica = FastICA(n_components=self._N_COMP, random_state=17)
        self._grp = GaussianRandomProjection(n_components=self._N_COMP, eps=0.1, random_state=17)
        self._srp = SparseRandomProjection(n_components=self._N_COMP, dense_output=True, random_state=17)
def eps():
    Sum_of_squared_distances = []
    k = []
    score = []
    eps = [0.8, 0.6, 0.4, 0.2, 0.05, 0.01]
    for i in eps:
        transformer = GaussianRandomProjection(n_components=4, eps=i)
        X_new = transformer.fit_transform(X)
        #label=transformer.predict(X)
        km = KMeans(n_clusters=2, random_state=0, max_iter=10000,
                    tol=1e-9).fit(X_new)
        #label=km.predict(X_new)
        #score_train1=metrics.silhouette_score(X_new,label, metric='euclidean')
        Sum_of_squared_distances.append(km.inertia_)
        k.append(i)
        #score.append(score_train1)
        print(Sum_of_squared_distances)
    k = np.array(k)
    Sum_of_squared_distances = np.array(Sum_of_squared_distances)
    score = np.array(score)
    line1, = plt.plot(k, Sum_of_squared_distances, 'bx-', marker='o')
    #line2,=plt.plot(k,score,color='g',marker='o')
    plt.xlabel('eps')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow curve Optimal eps')
    plt.show()
    return None
def otherScikitImpl(data, orig_dimension, new_dimension):
    rp = GaussianRandomProjection(n_components=new_dimension)
    # _make_random_matrix is a private sklearn helper; it returns the
    # (n_components, n_features) projection matrix
    m = rp._make_random_matrix(new_dimension, orig_dimension)
    # np.mat is deprecated; plain matrix multiplication is equivalent
    reduced = np.asarray(data) @ np.asarray(m).T
    return reduced
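
A quick shape check for the helper above (a sketch; the random input data
is an assumption):

import numpy as np

data = np.random.rand(100, 50)
reduced = otherScikitImpl(data, orig_dimension=50, new_dimension=10)
print(reduced.shape)  # (100, 10)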
def RP_exp(X, y, title):
    ncomp= [i+1 for i in range(X.shape[1]-1)]
    stdev=[]
    mean=[]
    for n in ncomp:
        repeats = []
        for i in range(5):
            rp = GaussianRandomProjection(n_components=n)
            temp = rp.fit_transform(X)
            repeats.append(temp)

        diffs = []
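        # all 10 unordered pairs of the 5 repeated projections,
        # i.e. itertools.combinations(range(5), 2)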
        for (i, j) in [(0, 1), (0, 2), (0, 3), (0, 4), (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]:
            diffs.append(repeats[i] - repeats[j])
        stdev.append(np.std(diffs))
        mean.append(np.mean(diffs))

    comp_arr=np.array(ncomp)
    mean_arr=np.array(mean)
    stdev_arr=np.array(stdev)

    plt.fill_between(comp_arr, mean_arr-stdev_arr,
                    mean_arr + stdev_arr, alpha=0.1,
                         color="b", label="Stdev")
    plt.plot(ncomp, mean, 'o-', color="b", label="Mean")
    plt.title("Mean pairwise difference of RP: "+ title)
    plt.legend(loc='best')
    plt.xlabel("n_components")
    plt.ylabel("Pairwise difference")
    plt.savefig("RP "+title)
    plt.show()
Example #18
def rand_proj_reconstruction_error(train_x, n):
    '''Average RP reconstruction error over 10 random projections per component count.'''

    results = []

    for i in range(1, n, 10):

        error = 0

        for j in range(1, 11):

            rand_proj = GaussianRandomProjection(n_components=i)
            reduced_df = rand_proj.fit_transform(train_x)

            pseudo_inverse = np.linalg.pinv(rand_proj.components_.T)
            reconstructed = reduced_df.dot(pseudo_inverse)

            error += metrics.mean_squared_error(train_x, reconstructed)
            # alternative error definitions left from experimentation:
            # error = (np.linalg.norm(train_x - reconstructed) ** 2) / len(train_x)
            # error = np.sum(np.square(train_x - reconstructed))
            # error = np.mean((train_x - reconstructed)**2)
            # error = ((train_x - reconstructed) ** 2).sum(1).mean()

        results.append({"n_components": i, "reconstruction_error": error / 10})

    return results
Example #19
def rand_proj(train_x, n):
    '''Project train_x onto n Gaussian random components.'''

    rp = GaussianRandomProjection(n_components=n)
    reduced_df = rp.fit_transform(train_x)

    return reduced_df
def plot_data(method, X, y, title, filename):

    fig, (ax1) = plt.subplots(1, 1)

    n_labels = len(y)

    if method == 'pca':
        t = decomposition.PCA(n_components=2)
        X = t.fit_transform(X)
    elif method == 'ica':
        t = decomposition.FastICA(n_components=2, whiten=True)
        X = t.fit_transform(X)
    elif method == 'rp':
        t = GaussianRandomProjection(n_components=2)
        X = t.fit_transform(X)

    np.random.seed(20)
    for label in np.unique(y):
        ax1.scatter(X[y == label, 0],
                    X[y == label, 1],
                    color=np.random.rand(3),
                    linewidths=1)

    ax1.set_title(title)
    ax1.grid()
    plt.tight_layout()

    plt.savefig('/'.join(['output', filename]))
    plt.close("all")
def rp(name, x, y):
    plot.style.use('seaborn-v0_8-darkgrid')

    for i in range(6):
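        # with the default n_components='auto', the output dimensionality
        # comes from the Johnson-Lindenstrauss bound; eps=0.95 is very
        # loose, so far fewer components are kept than a tight eps would give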
        rp = GaussianRandomProjection(eps=0.95, random_state=i)
        transformed = rp.fit_transform(x)

        axes = [0, 0]
        axes_std = [0, 0]

        for axis in range(np.shape(transformed)[1]):
            std = np.std(transformed[:, axis])
            if std > axes_std[0]:
                # demote the previous best axis to second place
                axes[1] = axes[0]
                axes_std[1] = axes_std[0]
                axes[0] = axis
                axes_std[0] = std
            elif std > axes_std[1]:
                axes[1] = axis
                axes_std[1] = std

        plot.subplot(2, 3, i + 1)
        plot.title(f'Random seed = {i}')
        plot.xlabel(f'Dimension {axes[0]}')
        plot.ylabel(f'Dimension {axes[1]}')
        plot.scatter(transformed[:, axes[0]],
                     transformed[:, axes[1]],
                     c=y,
                     cmap='viridis')

    plot.show()
def components(K):
    Sum_of_squared_distances = []
    k = []
    accuracy = []
    score = []
    for i in range(1, K):
        transformer = GaussianRandomProjection(n_components=i, eps=0.1)
        #transformer1 = GaussianRandomProjection(n_components=i,eps=0.5)
        #transformer2 = GaussianRandomProjection(n_components=i,eps=0.6)
        X_new = transformer.fit_transform(X)
        #label=transformer.predict(X)
        km = KMeans(n_clusters=2, random_state=0, max_iter=10000,
                    tol=1e-9).fit(X_new)
        label = km.predict(X_new)
        accu = matchfn(y, label)
        #score_train1=metrics.silhouette_score(X_new,label, metric='euclidean')
        Sum_of_squared_distances.append(km.inertia_)
        k.append(i)
        accuracy.append(accu)
        #score.append(score_train1)
        #print(Sum_of_squared_distances)
    k = np.array(k)
    Sum_of_squared_distances = np.array(Sum_of_squared_distances)
    score = np.array(score)
    accuracy = np.array(accuracy)
    #line1,=plt.plot(k, Sum_of_squared_distances, 'bx-',marker='o')
    #line2,=plt.plot(k,score,color='g',marker='o')
    line3, = plt.plot(k, accuracy, color='r', marker='o')
    plt.xlabel('k')
    plt.ylabel('accuracy')
    #plt.title('Elbow curve Optimal k')
    #plt.ylim(0,1)
    plt.show()
    return None
def dimensionality_reduction():
    ica_best_components = 5
    pca_best_components = 6
    rp_chosen_components = 3
    variance_threshold = 0.02
    pca = PCA(n_components=pca_best_components)
    pca_x_train = pca.fit_transform(x_train)
    base_experiment.plot_eigen_values("{}-{}".format(plot_name, "PCA"),
                                      pca.explained_variance_)
    base_experiment.plot_points_3d("{}-{}".format(plot_name, "PCA"),
                                   pca_x_train)
    ica = FastICA(n_components=ica_best_components)
    ica_x_train = ica.fit_transform(x_train)
    base_experiment.plot_points_3d("{}-{}".format(plot_name, "ICA"),
                                   ica_x_train)
    rp = GaussianRandomProjection(n_components=rp_chosen_components)
    rp_x_train = rp.fit_transform(x_train)
    base_experiment.plot_points_3d(
        "{}-{}".format(plot_name, "Random Projection"), rp_x_train)
    variance_x_train = VarianceThreshold(
        threshold=variance_threshold).fit_transform(
            min_max_scaler.transform(features_data))
    variance_x_train = preprocessing.scale(variance_x_train)
    find_best_k_for_reduced_features(ica_x_train, pca_x_train, rp_x_train,
                                     variance_x_train)
    clustering_after_reduction(pca_x_train, ica_x_train, rp_x_train,
                               variance_x_train)
    run_ann_with_only_dimensionality_reduction(pca_x_train, ica_x_train,
                                               rp_x_train, variance_x_train)
Example #24
def run_k_means_on_random_projections_cardiovascular_data(path):
    data_set = 'cardio'
    x_train, y_train = load_data(path + 'data/' + data_set + '/train/')
    # X, y = load_data(path + 'data/' + data_set + '/train/')

    rp = GaussianRandomProjection(n_components=5)
    rp_x_train = rp.fit_transform(x_train)

    f = open("cardiovascular_random_projections_stats.txt", "w+")

    for n_clusters in range(1, 16):
        bench_k_means(str(n_clusters), rp_x_train, y_train, n_clusters, f, 1)
    f.close()
def rp(X, y, n_components='auto', eps=0.1, random_state=None, plot=1, dataset='german'):
    rp_model = GaussianRandomProjection(n_components=n_components, eps=eps, random_state=random_state)
    rp_model.fit(X)
    X_new = rp_model.transform(X)
    if plot and dataset in ('german', 'australian'):
        plt.scatter(X_new[y == 1, 0], X_new[y == 1, 1], c='red', label='Samples with label 1')
        plt.scatter(X_new[y == 0, 0], X_new[y == 0, 1], c='green', label='Samples with label 0')
        plt.title("{} dataset after Randomized Projection".format(dataset.title()))
        plt.legend()
        plt.xlabel("Component 1")
        plt.ylabel("Component 2")
        plt.savefig("{}-after-Random-Projection.png".format(dataset))
        plt.close()
    return X_new
def save_new_data(dataset, n_components, iteration):
    X, y = load_dataset(dataset)
    data = X
    rp = GaussianRandomProjection(n_components=n_components)
    rp.fit(data)

    matrix = rp.components_
    new_data = rp.transform(data)

    plot_data('rp',
              new_data,
              y,
              dataset.title() + ': RP',
              filename='-'.join(
                  ['rp', dataset,
                   str(iteration), 'data', 'trans']))

    results = np.array(new_data)
    np.savetxt('data/' + ('-'.join(
        [dataset, str(n_components),
         str(iteration) + 'rp.csv'])),
               results,
               delimiter=",")

    new_data_inv = np.dot(new_data, matrix)
    loss = metrics.mean_squared_error(data, new_data_inv)
    print(loss)
def train_reduc(data,
                reduc_type='pca',
                kernel='rbf',
                n_c=8,
                eps=0.01,
                random_state=2020):
    if reduc_type == 'pca':
        reduc = PCA(n_components=n_c)
    elif reduc_type == 'spca':
        reduc = SparsePCA(n_components=n_c)
    elif reduc_type == 'kpca':
        reduc = KernelPCA(n_components=n_c, kernel=kernel)
    elif reduc_type == 'ica':
        reduc = FastICA(n_components=n_c)
    elif reduc_type == 'grp':
        reduc = GaussianRandomProjection(n_components=n_c,
                                         eps=eps,
                                         random_state=random_state)
    elif reduc_type == 'srp':
        reduc = SparseRandomProjection(n_components=n_c,
                                       density='auto',
                                       eps=eps,
                                       dense_output=True,
                                       random_state=random_state)
    else:
        raise ValueError("Unknown reduc_type: '{}'".format(reduc_type))

    reduced = reduc.fit_transform(data)
    print('Reduc Complete')
    return reduced, reduc
Example #28
    def get_transform(algorithm):
        """
        Defines and returns a feature selection transform object of the designated type.

        Parameters
        ----------
        algorithm : {'pca', 'kpca', 'grp', 'fa', 'k_best'}
            The transform algorithm to instantiate.

        Returns
        ----------
        transform : object
            Instantiated transform object.
        """
        if algorithm == 'pca':
            transform = PCA()
        elif algorithm == 'kpca':
            transform = KernelPCA()
        elif algorithm == 'grp':
            transform = GaussianRandomProjection()
        elif algorithm == 'fa':
            transform = FeatureAgglomeration()
        elif algorithm == 'k_best':
            transform = SelectKBest(mutual_info_regression)
        else:
            raise ValueError(
                'No selection algorithm defined for {0}'.format(algorithm))

        return transform
    def fit(self, X, y=None):
        r"""
        Fit to the singleview data.

        Parameters
        ----------
        X : array of shape (n_samples, n_total_features)
            Input dataset

        y : Ignored

        Returns
        -------
        self : object
            The Transformer instance
        """
        # set function level random state
        np.random.seed(self.random_state)
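        # each per-view projection below is created with random_state=None,
        # so it draws a distinct matrix from the globally seeded NumPy RNG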
        self.GaussianRandomProjections_ = [
            GaussianRandomProjection(n_components=self.n_components,
                                     eps=self.eps).fit(X)
            for _ in range(self.n_views)
        ]

        return self
def create_random_guassian_projections(params, x_data):
    components = params['components']
    grps = [GaussianRandomProjection(n_components=components) for _ in range(params['num_retry'])]
    x_data_news = []
    x_data_recons = []
    x_data_projection_losses = []
    for i in range(0, params['num_retry']):
        print(str(i))
        # project data from high dim to low dim
        x_data_news.append(grps[i].fit_transform(x_data))

        # now reconstruct the data by projecting it back into higher dimensions
        x_data_recons.append(np.dot(x_data_news[i], grps[i].components_))

        # calculate the projection error for this reconstruction
        x_projection_loss = ((x_data - x_data_recons[i]) ** 2).mean()
        x_data_projection_losses.append(x_projection_loss)

    if params['projection_loss_graph'] is not None:
        plt.figure()
        plt.plot(x_data_projection_losses)
        plt.ylabel("Mean Squared Error")
        plt.xlabel("Random Model")
        plt.title(params['projection_loss_graph'])
        plt.savefig(params['projection_loss_graph'] + '.png')

    i = np.argmin(x_data_projection_losses)

    return x_data_news[i]
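
A usage sketch for the helper above (the params keys follow the function
body; the data shape and values are assumptions):

import numpy as np

params = {'components': 10, 'num_retry': 5, 'projection_loss_graph': None}
x = np.random.rand(200, 40)
best = create_random_guassian_projections(params, x)
print(best.shape)  # (200, 10)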