Example 1
def part2():
    tmp = defaultdict(dict)
    for i, dim in product(range(10), range(1, 31)):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(cancer_x), cancer_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'cancer part2.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims_big):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(housing_x), housing_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'housing part2.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), range(1, 31)):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(cancer_x)
        tmp[dim][i] = reconstructionError(rp, cancer_x)
    tmp = pd.DataFrame(tmp).T
    # distinct filename so the distance-correlation results above are not overwritten
    tmp.to_csv(out + 'cancer part2 reconstruction.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims_big):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(housing_x)
        tmp[dim][i] = reconstructionError(rp, housing_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'housing part2 reconstruction.csv')
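
Several examples below call pairwiseDistCorr / reconstructionError (or snake_case variants) without defining them. A minimal sketch of what they compute, assuming the CS-7641 assignment-3 helpers that Example 10 cites:

import numpy as np
import scipy.sparse as sps
from sklearn.metrics.pairwise import pairwise_distances

def pairwiseDistCorr(X1, X2):
    # correlation between the flattened pairwise-distance matrices
    # of the projected and the original data
    assert X1.shape[0] == X2.shape[0]
    d1 = pairwise_distances(X1)
    d2 = pairwise_distances(X2)
    return np.corrcoef(d1.ravel(), d2.ravel())[0, 1]

def reconstructionError(projection, X):
    # "unproject" through the pseudo-inverse of the projection
    # matrix and report the mean squared residual
    W = projection.components_
    if sps.issparse(W):
        W = W.todense()
    p = np.linalg.pinv(W)
    reconstructed = ((p @ W) @ X.T).T
    return np.nanmean(np.square(X - reconstructed))
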
Example 2
def rp(X_train, X_test, y_train, y_test):
        num_components = johnson_lindenstrauss_min_dim(n_samples=X_train.shape[0], eps=0.1)
        print(num_components)
        print("# features: ", X_train.shape[1], " JL min dim:", num_components)
        print("JL number > #features so cant make any JL guarentees")
        # Of course not! It simply means that we can’t make any assumptions regarding the preservation of pairwise distances between data points.

        accuracies = []
        components = np.int32(np.linspace(1, 19, 19))

        model = LinearSVC()
        model.fit(X_train, y_train)
        baseline = metrics.accuracy_score(model.predict(X_test), y_test)

        # loop over the projection sizes
        for comp in components:
            # create the random projection
            sp = SparseRandomProjection(n_components=comp)
            X = sp.fit_transform(X_train)

            # train a classifier on the sparse random projection
            # TODO this is wrong.. needs to be KMeans
            model = LinearSVC(max_iter=1000)
            model.fit(X, y_train)

            # evaluate the model and update the list of accuracies
            test = sp.transform(X_test)
            accuracies.append(metrics.accuracy_score(model.predict(test), y_test))

        # create the figure
        plt.figure()
        plt.title("Accuracy of Sparse Random Projection on Churn")
        plt.xlabel("# of Components")
        plt.ylabel("Accuracy")
        plt.xlim([1, 20])
        plt.ylim([0, 1.0])

        # plot the baseline and random projection accuracies
        plt.plot(components, [baseline] * len(accuracies), color="r")
        plt.plot(components, accuracies)

        plt.show()
        # average looks to be around 5 components in RP to best the baseline
        sp = SparseRandomProjection(n_components = 5)
        X_transformed = sp.fit_transform(X_train)

        km = KMeans(n_clusters=2,
                    init='k-means++',
                    n_init=10,
                    max_iter=300,
                    random_state=RAND)
        plot_silhouette(km, X_transformed, title="SRP(5) KM(2)")

        km = KMeans(n_clusters=3,
                    init='k-means++',
                    n_init=10,
                    max_iter=300,
                    random_state=RAND)
        plot_silhouette(km, X_transformed, title="SRP(5) KM(3)")
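
A side note on the JL check above: johnson_lindenstrauss_min_dim depends only on n_samples and eps (roughly 4*ln(n) / (eps^2/2 - eps^3/3)), not on the original feature count, which is why it can exceed the number of features on small datasets. A minimal illustration:

from sklearn.random_projection import johnson_lindenstrauss_min_dim

for eps in (0.1, 0.3, 0.5):
    print(eps, johnson_lindenstrauss_min_dim(n_samples=1000, eps=eps))
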
Example 3
def part3():
    dim = 5
    rp = SparseRandomProjection(n_components=dim, random_state=5)
    cancer_x2 = rp.fit_transform(cancer_x)

    dim = 9
    rp = SparseRandomProjection(n_components=dim, random_state=5)
    housing_x2 = rp.fit_transform(housing_x)

    run_clustering(out, cancer_x2, cancer_y, housing_x2, housing_y)
Example 4
 def generate(self, train, val, test, n_comps):
     decomposer = SparseRandomProjection(n_components=n_comps, random_state=1234)
     results_train = decomposer.fit_transform(train)
     results_val = decomposer.transform(val)  # reuse the projection fitted on train; re-fitting would draw a different random matrix
     results_test = decomposer.transform(test)
     for i in range(1, n_comps + 1):
         train[self.featurename(i)] = results_train[:, i - 1]
         val[self.featurename(i)] = results_val[:, i - 1]
         test[self.featurename(i)] = results_test[:, i - 1]
     return (train, val, test)
Example 5
def rp(train, test, y_train, y_test):
    model = LinearSVC()
    model.fit(train, y_train)
    baseline = metrics.accuracy_score(model.predict(test), y_test)

    accuracies = []
    components = np.int32(np.linspace(2, 60, 20))

    # loop over the projection sizes
    for comp in components:
        # create the random projection
        sp = SparseRandomProjection(n_components=comp)
        X = sp.fit_transform(train)

        # train a classifier on the sparse random projection
        model = LinearSVC()
        model.fit(X, y_train)

        # evaluate the model and update the list of accuracies
        proj_test = sp.transform(test)
        accuracies.append(metrics.accuracy_score(model.predict(proj_test), y_test))

    # create the figure
    plt.figure()
    plt.title("Accuracy of Sparse Rand Projection on Sonar (EM, GMM)")
    plt.xlabel("# of Components")
    plt.ylabel("Accuracy")
    plt.xlim([2, 64])
    plt.ylim([0, 1.0])

    # plot the baseline and random projection accuracies
    plt.plot(components, [baseline] * len(accuracies), color="r")
    plt.plot(components, accuracies)
    plt.show()

    #random pick 30 as the best number of Random components
    sp = SparseRandomProjection(n_components=30)
    X_train = sp.fit_transform(train)

    gmm = mixture.GaussianMixture(2, covariance_type='full', random_state=RAND)
    gmm.fit(X_train)
    plot_silhouette(gmm, X_train, title="RP(30), GMM(2)")

    gmm = mixture.GaussianMixture(3, covariance_type='full', random_state=RAND)
    gmm.fit(X_train)
    plot_silhouette(gmm, X_train, title="RP(30), GMM(3)")

    gmm = mixture.GaussianMixture(4, covariance_type='full', random_state=RAND)
    gmm.fit(X_train)
    plot_silhouette(gmm, X_train, title="RP(30), GMM(4)")
Example 6
def run_RP(X, y, title):
    from itertools import product

    dims = list(np.arange(2, (X.shape[1] - 1), 3))
    dims.append(X.shape[1])
    tmp = defaultdict(dict)
    for i, dim in product(range(5), dims):
        rp = RP(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(X), X)
    tmp = pd.DataFrame(tmp).T
    mean_recon = tmp.mean(axis=1).tolist()
    std_recon = tmp.std(axis=1).tolist()

    fig, ax1 = plt.subplots()
    ax1.plot(dims, mean_recon, 'b-')
    ax1.set_xlabel('Random Components')
    # Make the y-axis label, ticks and tick labels match the line color.
    ax1.set_ylabel('Mean Reconstruction Correlation', color='b')
    ax1.tick_params('y', colors='b')
    plt.grid(False)

    ax2 = ax1.twinx()
    ax2.plot(dims, std_recon, 'm-')
    ax2.set_ylabel('STD Reconstruction Correlation', color='m')
    ax2.tick_params('y', colors='m')
    plt.grid(False)

    plt.title("Random Components for 5 Restarts: " + title)
    fig.tight_layout()
    d = plotsdir + "/" + title
    if not os.path.exists(d):
        os.makedirs(d)
    plt.savefig(d + "/Random Components for 5 Restarts.png")
Example 7
def run_rp(n_projections, dataset, data):
    file = './results/srp_' + dataset + '.csv'
    with open(file, 'w') as f:
        f.write('{},{},{},{}\n'.format("n_components",
                                       "reconstruction_error_mean",
                                       "reconstruction_error_sigma",
                                       "runtime"))
    for n in n_projections:
        errors = []
        for i in range(1, 5):
            start = time.time()
            srp = SparseRandomProjection(n_components=n)
            # srp = GaussianRandomProjection(n_components=n)
            trans_x = srp.fit_transform(data)
            end = time.time()
            elapsed = end - start
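            # note: only the final repeat's elapsed time is written to the CSV below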
            error = reconstruction_error(srp, data)
            errors.append(error)
            # print("For SRP n_components of: ", n, " on ", dataset, " data set got reconstruction error of: ", error, " in time: ", elapsed)
        error_mean = np.mean(errors)
        error_sigma = np.std(errors)
        print("For SRP n_components of: ", n, " on ", dataset,
              " data set got mean reconstruction error of: ", error_mean,
              " with sigma of: ", error_sigma)
        with open(file, 'a') as f:
            f.write('{},{},{},{}\n'.format(n, error_mean, error_sigma,
                                           elapsed))
    return
Example 8
def rp(train, test, y_train, y_test):
    sp = SparseRandomProjection(n_components=12)
    X_train = sp.fit_transform(train)
    X_test = sp.transform(test)

    clf = MLPClassifier(solver='sgd',
                        hidden_layer_sizes=(70, ),
                        random_state=23,
                        shuffle=True,
                        activation='relu',
                        learning_rate_init=0.15,
                        alpha=0.45)
    run_analysis(
        X_train, y_train, clf,
        "NN with lrate=0.15, 70 units in hidden layer, alpha 0.45, RP(12)")

    clf = MLPClassifier(solver='sgd',
                        hidden_layer_sizes=(70, ),
                        random_state=23,
                        shuffle=True,
                        activation='relu',
                        learning_rate_init=0.15,
                        alpha=0.45)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    plot_confusion_matrix(y_test, y_pred, title="NN (70,) lrate=0.15, RP(12)")
Example 9
def randomProjections(data, n_components):
    pWDC = {}
    rError = {}
    
    for iterN in range(1, n_components):
        rp = SparseRandomProjection(n_components = iterN, random_state= seed)
        
        pWDC[iterN] = pairwiseDistCorr(rp.fit_transform(data), data)
        # fit_transform above already fits rp; no alias or refit is needed
        rError[iterN] = reconstructionError(rp, data)
    
    
    plt.subplot(2, 1, 1)
    plt.plot(list(pWDC.keys()), list(pWDC.values()))
    plt.xlabel("Number of Components")
    plt.ylabel("Pair-wise Distance Correlation")     
    
    plt.subplot(2, 1, 2)
    plt.plot(list(rError.keys()), list(rError.values()))
    plt.xlabel("Number of Components")
    plt.ylabel("Reconstruction Error")     

    return plt, pWDC, rError, rp
Example 10
    def perform(self):
        # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/RP.py
        self.log("Performing {}".format(self.experiment_name()))

        # TODO: Use a diff random state? Might be ok as-is
        # %% Data for 1
        tmp = defaultdict(dict)
        for i, dim in product(range(10), self._dims):
            rp = SparseRandomProjection(random_state=i, n_components=dim)
            tmp[dim][i] = pairwise_dist_corr(rp.fit_transform(self._details.ds.training_x), self._details.ds.training_x)
        tmp = pd.DataFrame(tmp).T
        tmp.to_csv(self._out.format('{}_scree1.csv'.format(self._details.ds_name)))

        tmp = defaultdict(dict)
        for i, dim in product(range(10), self._dims):
            rp = SparseRandomProjection(random_state=i, n_components=dim)
            rp.fit(self._details.ds.training_x)
            tmp[dim][i] = reconstruction_error(rp, self._details.ds.training_x)
        tmp = pd.DataFrame(tmp).T
        tmp.to_csv(self._out.format('{}_scree2.csv'.format(self._details.ds_name)))

        # %% Data for 2
        grid = {'rp__n_components': self._dims, 'NN__alpha': self._nn_reg, 'NN__hidden_layer_sizes': self._nn_arch}
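        # grid keys follow sklearn's '<step>__<parameter>' convention for Pipeline params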
        rp = SparseRandomProjection(random_state=self._details.seed)
        mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=self._details.seed)
        pipe = Pipeline([('rp', rp), ('NN', mlp)], memory=experiments.pipeline_memory)
        gs, final_estimator = self.gs_with_best_estimator(pipe, grid)
        self.log("Grid search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(self._out.format('{}_dim_red.csv'.format(self._details.ds_name)))
        self.log("Done")
Example 11
def train(X_train, y_train, project=False, rnd=42, **kwargs):
    if project is not False:
        if project == 'rproj':
            proj = SparseRandomProjection(n_components=X_train.shape[1], random_state=rnd)
        elif project == 'std':
            proj = StandardScaler()
        elif project == 'pca':
            proj = PCA(n_components='mle', whiten=True, random_state=rnd)
        elif project == 'rpca':
            proj = RandomizedPCA(whiten=True, random_state=rnd)
        elif project == 'rbf':
            proj = RBFSampler(n_components=max(X_train.shape[1], 50), random_state=rnd)
        else:
            raise ValueError('Projection {} not available'.format(project))

        X_train = proj.fit_transform(X_train)

    kwargs.setdefault('random_state', rnd)
    clf = RandomForestClassifier(**kwargs)
    clf.fit(X_train, y_train)

    if project is not False:
        return clf, proj

    return clf
Example 12
def _train_classifier(clf, X_train, y_train, rnd=42, project=None):

    if project is not None and project != 'None':
        log.info('+ Projecting features')
        if project == 'random_projection':
            log.info('  * Sparse Random Projection')
            proj = SparseRandomProjection(n_components=X_train.shape[1],
                                          random_state=rnd)
        elif project == 'standard':
            log.info('  * Standard Projection')
            proj = StandardScaler()
        elif project == 'pca':
            log.info('  * Principal Component Analysis')
            proj = IncrementalPCA(batch_size=100)
        elif project == 'random_pca':
            log.info('  * Randomized Principal Component Analysis')
            proj = PCA(n_components=X_train.shape[1],
                       svd_solver='randomized',
                       whiten=True,
                       random_state=rnd)
        else:
            log.error('Projection {} not available'.format(project))
            return

        X_train = proj.fit_transform(X_train)

    log.info('+ Training classifier')
    clf.fit(X_train, y_train)
Example 13
    def rp_dim_reduction(self, x, y, title, dim_max):
        print('Random Projections')
        rp_error = []
        scaler = StandardScaler()
        scaler.fit(x)
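        # note: the scaler is fitted here but x is used unscaled below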
        n_min_reconstruction_error = -1
        components_min_error = None
        min_error = float("inf")
        for n in range(2, dim_max + 1):
            rp = SparseRandomProjection(n_components=n, random_state=123)
            rp_result = rp.fit_transform(x)
            reconstruction_error = self.reconstructionError(rp, x)
            rp_error.append(reconstruction_error)

            if reconstruction_error < min_error:
                n_min_reconstruction_error = n
                min_error = reconstruction_error

        plt.figure()
        plt.plot(range(2, dim_max + 1), rp_error, 'bx-')
        plt.xlabel('n_components')
        plt.ylabel('Reconstructed Error')
        plt.savefig('image/' + title + '/rp_train.png')
        plt.close()

        return n_min_reconstruction_error
Example 14
def run_RCA(X, title):
    dims = list(np.arange(2, (X.shape[1] - 1), 3))
    dims.append(X.shape[1])
    tmp = defaultdict(dict)

    for i, dim in product(range(10), dims):
        rp = RCA(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(X), X)
    tmp = pd.DataFrame(tmp).T
    mean_recon = tmp.mean(axis=1).tolist()
    std_recon = tmp.std(axis=1).tolist()

    fig, ax1 = plt.subplots()
    ax1.plot(dims, mean_recon, 'b-')
    ax1.set_xlabel('Random Components')
    ax1.set_ylabel('Mean Reconstruction Correlation', color='b')
    ax1.tick_params('y', colors='b')
    plt.grid(False)

    ax2 = ax1.twinx()
    ax2.plot(dims, std_recon, 'm-')
    ax2.set_ylabel('STD Reconstruction Correlation', color='m')
    ax2.tick_params('y', colors='m')
    plt.grid(False)

    plt.title("Random Components for 5 Restarts: " + title)
    fig.tight_layout()
    plt.show()
Example 15
    def sample_proj_mat(self, sample_inds):
        """
        Gets the projection matrix and it fits the transform to the samples of interest.

        Parameters
        ----------
        sample_inds : array of shape [n_samples]
            The data we are transforming.

        Returns
        -------
        proj_mat : {ndarray, sparse matrix} of shape (n_samples, n_features)
            The generated sparse random matrix.
        proj_mat : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Projected matrix.
        """

        proj_mat = SparseRandomProjection(
            density=self.density,
            n_components=self.proj_dims,
            random_state=self.random_state,
        )

        proj_X = proj_mat.fit_transform(self.X[sample_inds, :])
        return proj_X, proj_mat
Example 16
    def PairwiseDistribution(self, data):
        n_samples, n_features = data.shape
        print(
            "Embedding %d samples with dim %d using various random projections"
            % (n_samples, n_features))

        n_components = 4
        dists = euclidean_distances(data, squared=True).ravel()

        # select only non-identical samples pairs
        nonzero = dists != 0
        dists = dists[nonzero]

        rp = SparseRandomProjection(n_components=n_components)
        projected_data = rp.fit_transform(data)
        projected_dists = euclidean_distances(projected_data,
                                              squared=True).ravel()[nonzero]
        plt.figure()
        plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu)
        plt.xlim([0, 150])
        plt.ylim([0, 150])
        plt.xlabel("Pairwise squared distances in original space")
        plt.ylabel("Pairwise squared distances in projected space")
        plt.title("Pairwise distances distribution for n_components= 4")
        cb = plt.colorbar()
        cb.set_label('Sample pairs counts')
        #cb.ax.set_yticklabels(['0','250', '500', '750', '1000', '1250'])
        plt.savefig("Plots/RandomProjection/pairwisedist2.png")
Example 17
class SRP:
    def __init__(self, rfe_cv, *args, **kwargs):
        self.rfe = None
        self.rfe_cv = rfe_cv
        self.model = SparseRandomProjection(*args, **kwargs)

    def fit(self, X, y):
        pass

    def predict(self, X):
        Z = numpy.concatenate([X], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_ = X[nan_mask, :]
        if X_.shape[0] != X.shape[0]:
            print(
                'PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            predicted = self.model.fit_transform(X_)
            Z = numpy.full(shape=(X.shape[0], predicted.shape[1]),
                           fill_value=numpy.nan,
                           dtype=numpy.float64)
            Z[nan_mask, :] = predicted
        return Z
Example 18
def rpFluctuation(dims, ds, X):
    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims):
        print(i, dim)
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(X), X)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv('{}/{}_comparison.csv'.format(OUT, ds))
Example 19
def project(file_name, dimensions):
    data = load_svmlight_file(file_name)
    projector = SparseRandomProjection(dimensions, 1/3.0,
                                       dense_output=True)
    projected = projector.fit_transform(data[0])
    
    new_file_name = file_name[:-4] + '-' + str(dimensions) + '.mat'
    with open(new_file_name, 'wb') as new_file:
        dump_svmlight_file(projected, data[1], new_file)
Example 20
def Random(X, labels, imgs, **kwargs):
    # Random 2D projection using a sparse random matrix
    print("Computing random projection")
    t = time()
    rp = SparseRandomProjection(
        n_components=2, random_state=0)
    X_projected = rp.fit_transform(X)
    plot_embedding(X_projected, labels, imgs, "Random Projection of the dataset (time %.2fs)" %
                   (time() - t), **kwargs)
Example 21
    def reduce_data(self, data):

        scaler = preprocessing.Normalizer()
        normalized_x = scaler.fit_transform(data)

        randomized_projection = SparseRandomProjection(n_components=3)
        dim_reduced_x = randomized_projection.fit_transform(normalized_x)

        return dim_reduced_x
Example 22
def engineer2(train, test):
    
    myfeats = [f for f in train.columns if f not in ['UCIC_ID','Responders']]
    
    scaler = StandardScaler()
    
    slr = scaler.fit(train[myfeats])
    
    dim_train = slr.transform(train[myfeats])
    dim_test = slr.transform(test[myfeats])
    
    n_comp = 10
    
    print('Starting decomposition.........\n')
    
    tsvd = TruncatedSVD(n_components=n_comp, random_state=42)
    tsvd_train = tsvd.fit_transform(dim_train)
    tsvd_test = tsvd.transform(dim_test)

    pca = PCA(n_components=n_comp, random_state=420)
    pca_train = pca.fit_transform(dim_train)
    pca_test = pca.transform(dim_test)
    
    ica = FastICA(n_components=n_comp, random_state=2030)
    ica_train = ica.fit_transform(dim_train)
    ica_test = ica.transform(dim_test)
    
    grp = GaussianRandomProjection(n_components=n_comp, random_state=42)
    grp_train = grp.fit_transform(dim_train)
    grp_test = grp.transform(dim_test)    
    
    srp = SparseRandomProjection(n_components=n_comp, random_state=42)
    srp_train = srp.fit_transform(dim_train)
    srp_test = srp.transform(dim_test)    
    
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca_train[:,i-1]
        test['pca_' + str(i)] = pca_test[:,i-1]
        
        train['tsvd_' + str(i)] = tsvd_train[:,i-1]
        test['tsvd_' + str(i)] = tsvd_test[:,i-1]
        
        train['ica_' + str(i)] = ica_train[:,i-1]
        test['ica_' + str(i)] = ica_test[:,i-1]
        
        train['grp_' + str(i)] = grp_train[:,i-1]
        test['grp_' + str(i)] = grp_test[:,i-1]
        
        train['srp_' + str(i)] = srp_train[:,i-1]
        test['srp_' + str(i)] = srp_test[:,i-1]
        
    
    del dim_train, dim_test
    
    return train, test
    
Example 23
def getRCAData(X, dataType):
    if dataType == 'Adult':
        components = 40
    else:
        components = 25

 
    transformer = SparseRandomProjection(n_components=components)
    transformed = transformer.fit_transform(X)
    return transformed
Example 24
def flastVectorization(dataPoints, reduceDim=True, dim=0, eps=0.33):
    countVec = CountVectorizer()
    Z_full = countVec.fit_transform(dataPoints)
    if reduceDim:
        if dim <= 0:
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
        return Z
    else:
        return Z_full
Example 25
def apply_SRP(table, features, label, n_components):
    from sklearn.random_projection import SparseRandomProjection
    from paje import feature_file_processor

    x, y = feature_file_processor.split_features_target(table, features, label)

    rp = SparseRandomProjection(n_components=n_components,
                                dense_output=True, random_state=420)
    pc = rp.fit_transform(x)

    return feature_file_processor.generate_data_frame(pc, table[[label]])
Example 26
def getDR(dt_all, n_comp=12):
    # cols
    cols_encode_label = dt_all.filter(
        regex="Encode_Label").columns.values.tolist()
    cols_cat = dt_all.drop(
        "ID", axis=1).select_dtypes(include=["object"]).columns.tolist()

    # standardize
    dt_all_norm = MinMaxScaler().fit_transform(
        dt_all.drop(["y", "Fold"] + cols_cat + cols_encode_label, axis=1))

    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
    tsvd_results = tsvd.fit_transform(dt_all_norm)

    # PCA
    pca = PCA(n_components=n_comp, random_state=420)
    pca_results = pca.fit_transform(dt_all_norm)

    # ICA
    ica = FastICA(n_components=n_comp, max_iter=5000, random_state=420)
    ica_results = ica.fit_transform(dt_all_norm)

    # GRP
    grp = GaussianRandomProjection(n_components=n_comp,
                                   eps=0.1,
                                   random_state=420)
    grp_results = grp.fit_transform(dt_all_norm)

    # SRP
    srp = SparseRandomProjection(n_components=n_comp,
                                 dense_output=True,
                                 random_state=420)
    srp_results = srp.fit_transform(dt_all_norm)

    # NMF
    nmf = NMF(n_components=n_comp, init='nndsvdar', random_state=420)
    nmf_results = nmf.fit_transform(dt_all_norm)

    # F*G
    fag = FeatureAgglomeration(n_clusters=n_comp, linkage='ward')
    fag_results = fag.fit_transform(dt_all_norm)

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        dt_all['DR_TSVD_' + str(i)] = tsvd_results[:, i - 1]
        dt_all['DR_PCA_' + str(i)] = pca_results[:, i - 1]
        dt_all['DR_ICA_' + str(i)] = ica_results[:, i - 1]
        dt_all['DR_GRP_' + str(i)] = grp_results[:, i - 1]
        dt_all['DR_SRP_' + str(i)] = srp_results[:, i - 1]
        dt_all['DR_NMF_' + str(i)] = nmf_results[:, i - 1]
        dt_all['DR_FAG_' + str(i)] = fag_results[:, i - 1]

    return (dt_all)
Example 27
def SSPCA_V(X, k):
    # overall O(M * k + c_3 * p * k^2)
    transformer = SparseRandomProjection(n_components=k, random_state=0)
    #O(M * k)
    Y = transformer.fit_transform(X)
    #O(M * k)
    B = safe_sparse_dot(Y.T, X).toarray()
    #O(p * k^2)
    U, S, V = np.linalg.svd(B)
    V = V[:k]
    return V.T
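
A hypothetical usage sketch for SSPCA_V above (the function assumes safe_sparse_dot from sklearn.utils.extmath and a sparse X, since it calls .toarray() on an intermediate product; the data here is purely illustrative):

from scipy.sparse import random as sparse_random

X = sparse_random(1000, 200, density=0.05, format='csr', random_state=0)
V = SSPCA_V(X, 10)   # (200, 10): approximate top-10 right singular vectors
X_embedded = X @ V   # (1000, 10) embedding
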
Example 28
def preprocess(X, y):
    X = np.array([x.flatten() for x in X])
    y = np.array([one_hot(y_item) for y_item in y])
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = scaler.fit(X)
    X = scaler.transform(X)
    #print(X.shape)  #(173, 3840000)
    # reduce dimensionality to improve performance

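    # project the ~3.84M flattened pixel features down to 5792 dimensions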
    sp = SparseRandomProjection(n_components=5792)
    X = sp.fit_transform(X)
    return np.array(X), y
Example 29
def rp(X_train, y_train, X_test, y_test):
    accuracies = []
    components = np.int32(np.linspace(2, 64, 20))

    model = LinearSVC()
    model.fit(X_train, y_train)
    baseline = metrics.accuracy_score(model.predict(X_test), y_test)

    # loop over the projection sizes
    # for comp in components:
    #     # create the random projection
    #     sp = SparseRandomProjection(n_components=comp, random_state=RAND)
    #     X = sp.fit_transform(X_train)
    #
    #     # train a classifier on the sparse random projection
    #     model = LinearSVC()
    #     model.fit(X, y_train)
    #
    #     # evaluate the model and update the list of accuracies
    #     test = sp.transform(X_test)
    #     accuracies.append(metrics.accuracy_score(model.predict(test), y_test))
    #
    # # create the figure
    # plt.figure()
    # plt.title("Accuracy of Sparse Projection on Churn")
    # plt.xlabel("# of Components")
    # plt.ylabel("Accuracy")
    # plt.xlim([2, 64])
    # plt.ylim([0, 1.0])
    #
    # # plot the baseline and random projection accuracies
    # plt.plot(components, [baseline] * len(accuracies), color="r")
    # plt.plot(components, accuracies)
    #
    # print("Average of 4 runs, first better than baseline ave 12 components")
    #
    # plt.show()

    sp = SparseRandomProjection(n_components=12, random_state=RAND)
    X = sp.fit_transform(X_train)

    em = mixture.GaussianMixture(2, covariance_type='full', random_state=RAND)

    plot_silhouette(em, X, title="RP, K=2, 12 RC")

    em = mixture.GaussianMixture(3, covariance_type='full', random_state=RAND)

    plot_silhouette(em, X, title="RP, K=3, 12 RC")

    em = mixture.GaussianMixture(4, covariance_type='full', random_state=RAND)

    plot_silhouette(em, X, title="RP, K=4, 12 RC")
def get_additional_features(train, test, magic=False, ID=False):
    col = list(test.columns)
    if ID != True:
        col.remove('ID')
    n_comp = 12
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
    tsvd_results_train = tsvd.fit_transform(train[col])
    tsvd_results_test = tsvd.transform(test[col])
    # PCA
    pca = PCA(n_components=n_comp, random_state=420)
    pca2_results_train = pca.fit_transform(train[col])
    pca2_results_test = pca.transform(test[col])
    # ICA
    ica = FastICA(n_components=n_comp, random_state=420)
    ica2_results_train = ica.fit_transform(train[col])
    ica2_results_test = ica.transform(test[col])
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp,
                                   eps=0.1,
                                   random_state=420)
    grp_results_train = grp.fit_transform(train[col])
    grp_results_test = grp.transform(test[col])
    # SRP
    srp = SparseRandomProjection(n_components=n_comp,
                                 dense_output=True,
                                 random_state=420)
    srp_results_train = srp.fit_transform(train[col])
    srp_results_test = srp.transform(test[col])
    for i in range(1, n_comp + 1):
        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]
        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]
        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]
        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    if magic == True:
        magic_mat = train[['ID', 'X0', 'y']]
        magic_mat = magic_mat.groupby(['X0'])['y'].mean()
        magic_mat = pd.DataFrame({
            'X0': magic_mat.index,
            'magic': list(magic_mat)
        })
        mean_magic = magic_mat['magic'].mean()
        train = train.merge(magic_mat, on='X0', how='left')
        test = test.merge(magic_mat, on='X0', how='left')
        test['magic'] = test['magic'].fillna(mean_magic)
    return train, test
Example 31
def ML_SRP(X_train, n_components, density, eps, dense_output, random_state):
    from sklearn.random_projection import SparseRandomProjection
    import pandas as pd
    srp = SparseRandomProjection(
        n_components=n_components,
        density=density,
        eps=eps,
        dense_output=dense_output,
        random_state=random_state,
    )
    X_train_SRP = srp.fit_transform(X_train)
    X_train_SRP = pd.DataFrame(data=X_train_SRP)
    return X_train_SRP
Example 32
def plotProjection(data, n_samples, n_features): 

    n_components_range = np.array([300, 1000, 10000])
    dists = euclidean_distances(data, squared=True).ravel()

    # select only non-identical samples pairs
    nonzero = dists != 0
    dists = dists[nonzero]

    for n_components in n_components_range:

        t0 = time()

        rp = SparseRandomProjection(n_components=n_components)
        projected_data = rp.fit_transform(data)

        print("Projected %d samples from %d to %d in %.3fs" \
                % (n_samples, \
                   n_features, \
                   n_components, \
                   time() - t0))

        if hasattr(rp, 'components_'):
            n_bytes = rp.components_.data.nbytes
            n_bytes += rp.components_.indices.nbytes
            print("Random matrix with size: %.3fMB" % (n_bytes / 1e6))


        projected_dists = euclidean_distances(projected_data, squared=True)
        projected_dists = projected_dists.ravel()[nonzero]

        rates = projected_dists / dists
        print("Mean distances rate: %.2f (%.2f)" \
                % (np.mean(rates), \
                   np.std(rates)))

        plotHexbin(dists, projected_dists, n_components)
        plotHist(rates, n_components)
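
A side note, per scikit-learn's documented default: with density='auto', SparseRandomProjection uses the minimum density recommended by Ping Li et al., 1/sqrt(n_features), which is what keeps the projection matrices whose sizes are printed above so small:

import numpy as np

for n_features in (400, 10000, 1000000):
    print(n_features, "->", 1 / np.sqrt(n_features))
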
Example 33
def create_sector_subset(sample_n, X_output_path, Y_output_path):
    X_path = "/cs/puls/Experiments/hxiao-test/feature-data.mat"
    Y_path = "/cs/puls/Experiments/hxiao-test/label-data.mat"

    X = loadmat(X_path)["featureData"]
    Y = loadmat(Y_path)["labelData"]

    print "Applying random projection to reduce dimension"
    print "Shape before: %r" % (X.shape,)

    transformer = SparseRandomProjection(random_state=0)
    X = transformer.fit_transform(X)
    print "Shape after: %r" % (X.shape,)
    print "Random projection: OFF"

    rng = np.random.RandomState(0)
    print "Sample size: %d" % sample_n
    rows = rng.permutation(X.shape[0])[:sample_n]
    X = X[rows, :]
    Y = Y[rows, :]

    dump(X, open(X_output_path, "w"))
    dump(Y, open(Y_output_path, "w"))
Example 34
cPickle.dump(articles, f, protocol=-1)

print "saving done"

print len(articles)

vec = TfidfVectorizer(max_df=0.8, sublinear_tf=True)

X = vec.fit_transform(articles)


print X.shape

proj = SparseRandomProjection()

X = proj.fit_transform(X)

print X.shape

sparse_save(X,"../data/tfidf.h5")

# f = open('X_data.p', 'wb')
# cPickle.dump(X.data, f, protocol=-1)
# f = open('X_indices.p', 'wb')
# cPickle.dump(X.indices, f, protocol=-1)
# f = open('X_indptr.p', 'wb')
# cPickle.dump(X.indptr, f, protocol=-1)

#X = normalize(X)

# compute the inverse of l2 norm of non-zero elements
Example 35
faces_data = fetch_olivetti_faces().data
n_samples, n_features = faces_data.shape
print "Embedding %d faces with dim %d using various random projections" % (
    n_samples, n_features)

n_components_range = np.array([50, 200, 1000])
dists = euclidean_distances(faces_data, squared=True).ravel()

# select only non-identical samples pairs
nonzero = dists != 0
dists = dists[nonzero]

for n_components in n_components_range:
    rp = SparseRandomProjection(n_components=n_components)
    projected_data = rp.fit_transform(faces_data)
    projected_dists = euclidean_distances(
        projected_data, squared=True).ravel()[nonzero]

    pl.figure()
    pl.hexbin(dists, projected_dists, gridsize=100)
    pl.xlabel("Pairwise squared distances in original space")
    pl.ylabel("Pairwise squared distances in projected space")
    pl.title("Pairwise distances distribution for n_components=%d" %
             n_components)
    cb = pl.colorbar()
    cb.set_label('Sample pairs counts')

    rates = projected_dists / dists

    pl.figure()
Example 36
def sparseRP(data):
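    # 'new_dimension' is assumed to be defined at module scope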
    rp = SparseRandomProjection(n_components=new_dimension)
    return rp.fit_transform(data)
Example 37
all_data_proc = pd.concat((df_obj_feats_freq, df_non_obj_feats), axis=1)

#%%
from sklearn.decomposition import PCA, FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
n_comp = 12

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results = grp.fit_transform(all_data_proc)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results = srp.fit_transform(all_data_proc)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca_results = pca.fit_transform(all_data_proc)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica_results = ica.fit_transform(all_data_proc)
for i in range(1, n_comp+1):
    all_data_proc['pca_' + str(i)] = pca_results[:,i-1]
    all_data_proc['ica_' + str(i)] = ica_results[:, i-1]
    all_data_proc['grp_' + str(i)] = grp_results[:,i-1]
    all_data_proc['srp_' + str(i)] = srp_results[:, i-1]

df_X_train = all_data_proc[:train_len]
Example 38
n_samples, n_features = data.shape
print("Embedding %d samples with dim %d using various random projections"
      % (n_samples, n_features))

n_components_range = np.array([300, 1000, 10000])
dists = euclidean_distances(data, squared=True).ravel()

# select only non-identical samples pairs
nonzero = dists != 0
dists = dists[nonzero]

for n_components in n_components_range:
    t0 = time()
    rp = SparseRandomProjection(n_components=n_components)
    projected_data = rp.fit_transform(data)
    print("Projected %d samples from %d to %d in %0.3fs"
          % (n_samples, n_features, n_components, time() - t0))
    if hasattr(rp, 'components_'):
        n_bytes = rp.components_.data.nbytes
        n_bytes += rp.components_.indices.nbytes
        print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6))

    projected_dists = euclidean_distances(
        projected_data, squared=True).ravel()[nonzero]

    plt.figure()
    plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu)
    plt.xlabel("Pairwise squared distances in original space")
    plt.ylabel("Pairwise squared distances in projected space")
    plt.title("Pairwise distances distribution for n_components=%d" %
Example 39
# Perform Randomized Principal Components Analysis (PCA)
from sklearn.decomposition import RandomizedPCA as RPCA
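# (RandomizedPCA was removed from newer scikit-learn; PCA(svd_solver='randomized') is the modern equivalent)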
rpca = RPCA(n_components=num_components)
rpca_transformed_data_train = rpca.fit_transform(dense_trainData)
rpca_transformed_data_valid = rpca.transform(dense_validData)

# Perform Gaussian Random Projection
from sklearn.random_projection import GaussianRandomProjection as GaussRan
grp = GaussRan(n_components=num_components)
grp_transformed_data_train = grp.fit_transform(dense_trainData)
grp_transformed_data_valid = grp.transform(dense_validData)

# Perform Sparse Random Projection
from sklearn.random_projection import SparseRandomProjection as SparseRan
srp = SparseRan(n_components=num_components, random_state=0)
srp_transformed_data_train = srp.fit_transform(dense_trainData)
srp_transformed_data_valid = srp.transform(dense_validData)

# Perform classification using 1-Nearest Neighbor Classifier
from sklearn.neighbors import KNeighborsClassifier

# Create a subset grid to plot performance against numbers of components
tsvd_max = tsvd_transformed_data_train.shape[1]
plot_subset = []
length_of_plot_subset = len(plot_subset)
if tsvd_max < 101:
    spacing = super_fine_spacing
    plot_subset = []
    for j in arange(1, spacing - 1):
        plot_subset.append(j)
    quotient = tsvd_max / spacing
Example 40
def gen_feature(train, test):
    train = pd.DataFrame(train)
    test = pd.DataFrame(test)

    n_comp = 15
    drop_list = []
    test_drop_list = []

    print(train.drop(drop_list, axis=1).shape, test.drop(test_drop_list, axis=1).shape)
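    # 'start' is assumed to be a module-level timestamp (e.g. datetime.now()) taken before this function runs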
    print('tSVD', datetime.now() - start)
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1))
    tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1))

    print('PCA', datetime.now() - start)
    # PCA
    pca = PCA(n_components=n_comp)
    pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1))
    pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1))

    print('ICA', datetime.now() - start)
    # ICA
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1))
    ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1))

    print('GRP', datetime.now() - start)
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1))
    grp_results_test = grp.transform(test.drop(test_drop_list, axis=1))

    print('SRP', datetime.now() - start)
    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1))
    srp_results_test = srp.transform(test.drop(test_drop_list, axis=1))

    # MCA
    # res_mca = MCA(train, ncp=n_comp, graph = FALSE)

    # save columns list before adding the decomposition components

    usable_columns = list(set(train.columns) - set(drop_list))

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    return train, test
Example 41
def gen_features(train, val, test):
    train = pd.DataFrame(train)
    val = pd.DataFrame(val)
    test = pd.DataFrame(test)
    # cat_cols = ['city', 'bd', 'gender', 'registered_via', 'registration_init_year',
    #              'registration_init_month', 'registration_init_date', 'payment_method_id', 'payment_plan_days',
    #              'plan_list_price', 'actual_amount_paid', 'is_auto_renew', 'is_cancel',
    #              'transaction_date_year', 'transaction_date_month', 'transaction_date_date',
    #              'membership_expire_date_year',
    #              'membership_expire_date_month', 'membership_expire_date_date', 'membership_transaction_gap',
    #              'cancel_times',
    #              'auto_renew_count', 'plan_net_worth', 'user_date_year', 'user_date_month',
    #              'user_date_date']
    # con_cols = [x for x in train.columns if x not in cat_cols and x not in ['msno', 'is_churn']]
    # train[cat_cols] = train[cat_cols].astype('object')
    # test[cat_cols] = test[cat_cols].astype('object')
    # val[cat_cols] = val[cat_cols].astype('object')
    #
    # for col in cat_cols:
    #     train[col].fillna(value=train[col].mode()[0], inplace=True)
    #     test[col].fillna(value=test[col].mode()[0], inplace=True)
    #     val[col].fillna(value=val[col].mode()[0], inplace=True)
    # for col in con_cols:
    #     train[col].fillna(value=train[col].mean(), inplace=True)
    #     test[col].fillna(value=test[col].mean(), inplace=True)
    #     val[col].fillna(value=val[col].mean(), inplace=True)
    #
    # for c in train.columns:
    #     if train[c].dtype == 'object':
    #         lbl = LabelEncoder()
    #         lbl.fit(list(train[c].values) + list(test[c].values))
    #         train[c] = lbl.transform(list(train[c].values))
    #         test[c] = lbl.transform(list(test[c].values))

    n_comp = 15

    drop_list = []
    test_drop_list = []

    print(train.drop(drop_list, axis=1).shape, test.drop(test_drop_list, axis=1).shape)
    print('tSVD', datetime.now() - start)
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1))
    tsvd_results_val= tsvd.transform(val.drop(test_drop_list, axis=1))
    tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1))

    print('PCA', datetime.now() - start)
    # PCA
    pca = PCA(n_components=n_comp)
    pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1))
    pca2_results_val = pca.transform(val.drop(test_drop_list, axis=1))
    pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1))

    print('ICA', datetime.now() - start)
    # ICA
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1))
    ica2_results_val = ica.transform(val.drop(test_drop_list, axis=1))
    ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1))

    print('GRP', datetime.now() - start)
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1))
    grp_results_val = grp.transform(val.drop(test_drop_list, axis=1))
    grp_results_test = grp.transform(test.drop(test_drop_list, axis=1))

    print('SRP', datetime.now() - start)
    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1))
    srp_results_val = srp.transform(val.drop(test_drop_list, axis=1))
    srp_results_test = srp.transform(test.drop(test_drop_list, axis=1))

    # MCA
    # res_mca = MCA(train, ncp=n_comp, graph = FALSE)

    # save columns list before adding the decomposition components

    usable_columns = list(set(train.columns) - set(drop_list))

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        val['pca_' + str(i)] = pca2_results_val[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        val['ica_' + str(i)] = ica2_results_val[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        val['tsvd_' + str(i)] = tsvd_results_val[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        val['grp_' + str(i)] = grp_results_val[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        val['srp_' + str(i)] = srp_results_val[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    return train, val, test
Example 42
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    train['pca_' + str(i)] = pca2_results_train[:,i-1]
    test['pca_' + str(i)] = pca2_results_test[:, i-1]

    train['ica_' + str(i)] = ica2_results_train[:,i-1]
    test['ica_' + str(i)] = ica2_results_test[:, i-1]

    train['tsvd_' + str(i)] = tsvd_results_train[:,i-1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i-1]

    train['grp_' + str(i)] = grp_results_train[:,i-1]
    test['grp_' + str(i)] = grp_results_test[:, i-1]
Example 43
X_path = '/cs/puls/Experiments/hxiao-test/feature-data.mat'
Y_path = '/cs/puls/Experiments/hxiao-test/label-data.mat'

X = loadmat(X_path)['featureData']
y = loadmat(Y_path)['labelData']

RANDOM_PROJECTION_FLAG = True

if RANDOM_PROJECTION_FLAG:
    from sklearn.random_projection import SparseRandomProjection

    print "Applying random projection to reduce dimension"
    print "Shape before: %r" % (X.shape, )

    transformer = SparseRandomProjection()
    X = transformer.fit_transform(X)
    print "Shape after: %r" % (X.shape, )


# sample subset of all the data
rng = np.random.RandomState(0)
sample_n = 10000
rows = rng.permutation(X.shape[0])[:sample_n]
X = X[rows, :]
y = y[rows, :]

# sample train and test
train_ratio = 0.8
train_n = int(sample_n*train_ratio)

rows = rng.permutation(sample_n)