Example #1
def test_input_size_jl_min_dim():
    with pytest.raises(ValueError):
        johnson_lindenstrauss_min_dim(3 * [100], eps=2 * [0.9])

    johnson_lindenstrauss_min_dim(
        np.random.randint(1, 10, size=(10, 10)), eps=np.full((10, 10), 0.5)
    )
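
For reference, the bound these examples query can be reproduced by hand. The sketch below (a made-up helper, assuming only NumPy and scikit-learn) evaluates the formula scikit-learn documents for johnson_lindenstrauss_min_dim, n_components >= 4 * log(n_samples) / (eps**2 / 2 - eps**3 / 3); the library additionally rounds the result to an integer.

import numpy as np
from sklearn.random_projection import johnson_lindenstrauss_min_dim


def jl_min_dim_by_hand(n_samples, eps):
    """Johnson-Lindenstrauss bound as documented by scikit-learn (kept as a float here)."""
    denominator = (eps ** 2 / 2.0) - (eps ** 3 / 3.0)
    return 4.0 * np.log(n_samples) / denominator


print(jl_min_dim_by_hand(1000, eps=0.1))             # ~5920.9
print(johnson_lindenstrauss_min_dim(1000, eps=0.1))  # same magnitude; exact rounding depends on the version
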
def test_input_size_jl_min_dim():
    assert_raises(ValueError, johnson_lindenstrauss_min_dim,
                  3 * [100], 2 * [0.9])

    johnson_lindenstrauss_min_dim(np.random.randint(1, 10, size=(10, 10)),
                                  np.full((10, 10), 0.5))
def test_input_size_jl_min_dim():
    assert_raises(ValueError, johnson_lindenstrauss_min_dim, 3 * [100],
                  2 * [0.9])

    johnson_lindenstrauss_min_dim(np.random.randint(1, 10, size=(10, 10)),
                                  0.5 * np.ones((10, 10)))
def plotDependencyEPS():

    """Plot thoretical dependency between n_components and eps"""
    
    # range of admissible distortions
    eps_range = np.linspace(0.01, 0.99, 100)

    # range of number of samples to embed
    n_samples_range = np.logspace(2, 6, 5)
    colors = pl.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range)))

    pl.figure()

    for n_samples, color in zip(n_samples_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples, \
                                                         eps=eps_range)
        pl.semilogy(eps_range, min_n_components, color=color)

    pl.legend(["n_samples = %d" % n for n in n_samples_range], \
              loc="upper right")

    pl.xlabel("Distortion eps")
    pl.ylabel("Minimum number of dimensions")
    pl.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
    pl.show()
Example #5
def plot_jl_bounds(label, X):
    """
    http://scikit-learn.org/stable/auto_examples/plot_johnson_lindenstrauss_bound.html#sphx-glr-auto-examples-plot-johnson-lindenstrauss-bound-py
    """
    print("calculating jl bounds")
    eps_ranges = []
    eps_ranges.append(np.linspace(0.2, 0.99, 5))

    # range of number of samples (observation) to embed
    n_samples_range = np.linspace(100, 6000, 5)

    for i, eps_range in enumerate(eps_ranges):
        colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))
        plt.figure()
        for eps, color in zip(eps_range, colors):
            min_n_components = johnson_lindenstrauss_min_dim(n_samples_range,
                                                             eps=eps)
            plt.plot(n_samples_range, min_n_components, color=color)

        plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="best")
        plt.xlabel("Number of observations to eps-embed")
        plt.ylabel("Minimum number of dimensions")
        plt.title("Johnson-Lindenstrauss bounds:\n%s Data" % (label))
        plt.axhline(y=X.shape[1], color='r', linestyle='--', alpha=0.3)
        plt.axvline(x=X.shape[0], color='r', linestyle='--', alpha=0.3)
        plt.savefig('%s-jlbounds.png' % (label.replace(" ", "-")))
        plt.close()
Example #6
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        eps = self.hyperparams['eps']
        n_components = johnson_lindenstrauss_min_dim(n_samples=self._x_dim,
                                                     eps=eps)
        _logger.info("[INFO] n_components is " + str(n_components))
        if n_components > self._y_dim:
            # Default n_components == 'auto' fails. Need to explicitly assign n_components
            self._model = GaussianRandomProjection(
                n_components=self._y_dim, random_state=self.random_seed)
        else:
            try:
                self._model = GaussianRandomProjection(
                    eps=eps, random_state=self.random_seed)
                self._model.fit(self._training_data)
            except Exception:
                _logger.info(
                    "[Warning] Using given eps value failed, will use default conditions."
                )
                self._model = GaussianRandomProjection()

        self._model.fit(self._training_data)

        self._fitted = True
        return CallResult(None, has_finished=True)
Example #7
def reduce_dimensions(data, random_state, target_dim=2):
    """
    Reduces the dimensionality of the data using UMAP for lower dimensions, PCA for higher dimensions,
    and possibly a random projection first if the number of dimensions is over the limit given by the
    Johnson–Lindenstrauss lemma. Works for NumPy arrays.

    Args:
        data: The input data.
        random_state: Random state to generate reproducible results.
        target_dim: The targeted dimension.

    Returns:
        Lower dimension representation of the data.
    """
    jl_limit = johnson_lindenstrauss_min_dim(n_samples=data.shape[0], eps=.3)
    pca_limit = 30

    if data.shape[1] > jl_limit and data.shape[1] > pca_limit:
        data = SparseRandomProjection(
            n_components=jl_limit,
            random_state=random_state).fit_transform(data)

    if data.shape[1] > pca_limit:
        data = PCA(n_components=pca_limit,
                   random_state=random_state).fit_transform(data)

    return UMAP(n_components=target_dim,
                n_neighbors=30,
                min_dist=0.0,
                random_state=random_state).fit_transform(data)
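
A hypothetical call to reduce_dimensions, assuming the imports the snippet relies on (NumPy, scikit-learn's PCA and SparseRandomProjection, umap-learn's UMAP) are in scope; the array shape and seed are made up for illustration.

import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(size=(500, 1000))  # 500 samples, 1000 features of arbitrary values

# With 500 samples and eps=0.3 the JL limit is below 1000, so the random
# projection and PCA stages both run before the final UMAP embedding.
embedding = reduce_dimensions(data, random_state=42, target_dim=2)
print(embedding.shape)  # (500, 2)
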
def plotDependencyComponents():

    """Plot thoretical dependency between n_samples and n_components"""

    # range of admissible distortions
    eps_range = np.linspace(0.1, 0.99, 5)
    colors = pl.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

    # range of number of samples to embed
    n_samples_range = np.logspace(1, 9, 9)

    
    pl.figure()

    for eps, color in zip(eps_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, \
                                                         eps=eps)
        pl.loglog(n_samples_range, min_n_components, color=color)

    pl.legend(["eps = %.1f" % eps for eps in eps_range], \
              loc="lower right")

    pl.xlabel("Number of observations to eps-embed")
    pl.ylabel("Minimum number of dimensions")
    pl.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
    pl.show()
Example #9
def rp(X_train, X_test):
        num_components = johnson_lindenstrauss_min_dim(n_samples=X_train.shape[0], eps=0.1)
        print(num_components)
        print("# features: ", X_train.shape[1], " JL min dim:", num_components)
        print("JL number > #features so cant make any JL guarentees")
        # Of course not! It simply means that we can’t make any assumptions regarding the preservation of pairwise distances between data points.

        accuracies = []
        components = np.int32(np.linspace(1, 19, 19))

        model = LinearSVC()
        model.fit(X_train, y_train)
        baseline = metrics.accuracy_score(model.predict(X_test), y_test)

        # loop over the projection sizes
        for comp in components:
            # create the random projection
            sp = SparseRandomProjection(n_components=comp)
            X = sp.fit_transform(X_train)

            # train a classifier on the sparse random projection
            # TODO this is wrong.. needs to be KMeans
            model = LinearSVC(max_iter=1000)
            model.fit(X, y_train)

            # evaluate the model and update the list of accuracies
            test = sp.transform(X_test)
            accuracies.append(metrics.accuracy_score(model.predict(test), y_test))

        # create the figure
        plt.figure()
        plt.title("Accuracy of Sparse Random Projection on Churn")
        plt.xlabel("# of Components")
        plt.ylabel("Accuracy")
        plt.xlim([1, 20])
        plt.ylim([0, 1.0])

        # plot the baseline and random projection accuracies
        plt.plot(components, [baseline] * len(accuracies), color="r")
        plt.plot(components, accuracies)

        plt.show()
        # average looks to be around 5 components in RP to best the baseline
        sp = SparseRandomProjection(n_components = 5)
        X_transformed = sp.fit_transform(X_train)

        km = KMeans(n_clusters=2,
                    init='k-means++',
                    n_init=10,
                    max_iter=300,
                    random_state=RAND)
        plot_silhouette(km, X_transformed, title="SRP(5) KM(2)")

        km = KMeans(n_clusters=3,
                    init='k-means++',
                    n_init=10,
                    max_iter=300,
                    random_state=RAND)
        plot_silhouette(km, X_transformed, title="SRP(5) KM(3)")
 def gaussianRP(self, parameters):
     # default parameters
     n_components = parameters["n_components"] if "n_components" in parameters else "auto"
     eps = parameters["eps"] if "eps" in parameters else 1e-1
     if('johnsonRP' in parameters):
         n_components = johnson_lindenstrauss_min_dim(parameters['johnsonRP']['n_samples'], eps=parameters['johnsonRP']['eps'])
     
     #algo Object
     return GaussianRandomProjection(n_components=n_components, eps=eps)
Example #11
def r_projection(input_data, no_components=None, e=0.1):
    if no_components is None:
        no_components = johnson_lindenstrauss_min_dim(
            n_samples=input_data.shape[0], eps=e)

    projected_data = random_projection.GaussianRandomProjection(
        n_components=no_components).fit_transform(input_data)

    return projected_data
Example #13
File: jl.py Project: menisadi/pydp
def test():
    s = 50
    d = 1000
    miu = 0.3
    k = johnson_lindenstrauss_min_dim(s, eps=miu)
    if k > d:
        raise ValueError("can't embed into smaller dimension")
    # TODO check the result guarantee of jl and change the 'print' to 'assure'
    print(__test_transform__(s, d, k, miu, 100))
def reduction(eps, input_x, out_dir):
    print('JL bound:', random_projection.johnson_lindenstrauss_min_dim(len(input_x[0]), eps=eps), '(eps={})'.format(eps))
    transformer = random_projection.GaussianRandomProjection(n_components=50, eps=eps)
    data_reduced = transformer.fit_transform(input_x)
    with open('{}/projection'.format(out_dir), "w") as op:
        for component in data_reduced:
            line = ', '.join(str(round(e, 3)) for e in component)
            op.write(line + '\n')
    return data_reduced
Example #15
 def _get_eps(self, n_samples, n_dims, n_slice=int(1e4)):
   new_dim = n_dims * self.keep_rate
   for i in range(1, n_slice):
     eps = i / n_slice
     jl_dim = johnson_lindenstrauss_min_dim(n_samples=n_samples, eps=eps)
     if jl_dim <= new_dim:
       print("rate %.3f, n_dims %d, new_dim %d, dims error rate: %.4f" % (self.keep_rate, n_dims, jl_dim, ((new_dim-jl_dim) / new_dim)) )
       return eps
   return -1
def determine_min_dim(params, x_data):
    eps = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    min_dim = johnson_lindenstrauss_min_dim(x_data.shape[0], eps=eps)

    plt.figure()
    plt.plot(eps, min_dim)
    plt.ylabel("Minimum Number of Dimensions")
    plt.xlabel("Distortion EPS")
    plt.title(params['min_dim_graph'])
    plt.savefig(params['min_dim_graph'] + ".png")
 def sparseRP(self, parameters):
     # default parameters
     n_components = parameters["n_components"] if "n_components" in parameters else "auto"
     density = parameters["density"] if "density" in parameters else 'auto'
     eps = parameters["eps"] if "eps" in parameters else 1e-1
     if('johnsonRP' in parameters):
         n_components = johnson_lindenstrauss_min_dim(parameters['johnsonRP']['n_samples'], eps=parameters['johnsonRP']['eps'])
     
     #algo Object
     return SparseRandomProjection(n_components=n_components, eps=eps, density=density)
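
Spelled out without the wrapper class, the 'johnsonRP' branch of the gaussianRP/sparseRP factories above amounts to the following; the sample count and eps are made-up illustration values.

from sklearn.random_projection import (GaussianRandomProjection,
                                       SparseRandomProjection,
                                       johnson_lindenstrauss_min_dim)

# Derive n_components from the JL bound instead of passing it explicitly.
n_components = johnson_lindenstrauss_min_dim(10000, eps=0.25)
srp = SparseRandomProjection(n_components=n_components, eps=0.25, density="auto")
grp = GaussianRandomProjection(n_components=n_components, eps=0.25)
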
Example #18
def dim_reduce(
    points,
    technique='tsne',
    random_state=2021,
    n_components=2,
):
    if technique == 'tsne':
        tsne = TSNE(
            n_components=n_components,
            perplexity=30.0,
            early_exaggeration=12.0,
            learning_rate=200.0,
            n_iter=1000,
            n_iter_without_progress=300,
            min_grad_norm=1e-07,
            metric='euclidean',
            init='random',
            verbose=100,
            random_state=random_state,
            method='barnes_hut',
            angle=0.5,
            n_jobs=None,
        )
        transformed_points = tsne.fit_transform(points)
    elif technique == 'jlt':
        print(
            'Minimum JL components (eps = .99): ',
            johnson_lindenstrauss_min_dim(len(points), eps=0.99))
        grp = GaussianRandomProjection(n_components=n_components, eps=.99)
        transformed_points = grp.fit_transform(points)
    elif technique == 'pca':
        pca = PCA(
            n_components=n_components,
            copy=True,
            whiten=False,
            svd_solver='auto',
            tol=0.0,
            iterated_power='auto',
            random_state=random_state,
        )
        transformed_points = pca.fit_transform(points)

    elif technique == 'lle':
        lle = LocallyLinearEmbedding(
            n_components=n_components,
            random_state=random_state,
        )
        return lle.fit_transform(points)

    else:
        raise ValueError('Invalid technique.')

    return transformed_points
Example #19
def flastVectorization(dataPoints, reduceDim=True, dim=0, eps=0.33):
    countVec = CountVectorizer()
    Z_full = countVec.fit_transform(dataPoints)
    if reduceDim:
        if dim <= 0:
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
        return Z
    else:
        return Z_full
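
A hypothetical use of flastVectorization on a tiny corpus (assuming the snippet's imports: CountVectorizer, SparseRandomProjection, and johnson_lindenstrauss_min_dim from scikit-learn). With so few documents the JL bound exceeds the vocabulary size, so scikit-learn only warns that the dimensionality is not actually reduced.

docs = [
    "def test_login(): assert login(user, password)",
    "def test_logout(): assert logout(user)",
    "def test_login_twice(): assert login(user, password) and login(user, password)",
]
Z = flastVectorization(docs, reduceDim=True, eps=0.33)
print(Z.shape)  # (3, johnson_lindenstrauss_min_dim(3, eps=0.33))
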
 def fit(self,
         *,
         timeout: float = None,
         iterations: int = None) -> CallResult[None]:
     eps = self.hyperparams['eps']
     n_components = johnson_lindenstrauss_min_dim(n_samples=self._x_dim,
                                                  eps=eps)
     if n_components > self._x_dim:
         self._model = GaussianRandomProjection(n_components=self._x_dim)
     else:
         self._model = GaussianRandomProjection(eps=eps)
     self._model.fit(self._training_data)
Example #21
def preprocess(X, y):
    min_frame = min(X, key=lambda x: x.shape[2]).shape[2]
    X = np.array([x[:,:,:min_frame].flatten() for x in X])
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = scaler.fit(X)
    X = scaler.transform(X)
    
    # reduce principal components to improve performance
    reduced_pc = 2000
    recommended_pc = johnson_lindenstrauss_min_dim(861,eps=0.1)
    min_pc = recommended_pc - reduced_pc
    sp = SparseRandomProjection(n_components = int(min_pc))
    X = sp.fit_transform(X)
    return np.array(X), np.array(y)
def jlmd_search(ubs, names):
    epsilons = np.linspace(0.2, 0.999, 1000)
    y = []

    for eps in epsilons:
        y.append(johnson_lindenstrauss_min_dim(40000, eps=eps))

    plot.style.use('seaborn-darkgrid')
    ax = plot.subplots()[1]
    plot.title('Influence of epsilon on the minimum number of dimensions')
    plot.semilogy(epsilons, y)
    for ub in ubs:
        plot.semilogy([0, 1], [ub, ub])
    plot.legend(['Minimum number of dimensions', *names], loc='upper right')
    plot.show()
Example #23
 def fit(self, X):
     if self.n_components == -1:
         super().set_params(
             n_components=random_projection.johnson_lindenstrauss_min_dim(
                 n_samples=X.shape[0], eps=self.eps))
     try:
         return super().fit(X)
     except ValueError as e:
         if self.n_components >= X.shape[1]:
             raise RuntimeError("eps={} and n_samples={} lead to a target "
                                "dimension of {} which is larger than the "
                                "original space with n_features={}".format(
                                    self.eps, X.shape[0], self.n_components,
                                    X.shape[1]))
         else:
             raise
Example #24
def data_JL_proj_data_diff(data):
    n_row = len(data)
    # finding the minimum dimension reduction possible using the JL lemma, while preserving pairwise distances up to a given eps value.
    min_dim = random_projection.johnson_lindenstrauss_min_dim(n_row, eps=0.1)
    print("min dim suggested by JL lemma with eps = 0.1 is " + str(min_dim))
    # creating the transformer matrix used to project the input data to the target data: if O = I * R, the transformer is R here.
    transformer = random_projection.SparseRandomProjection()
    #transforming given "data"(input) to "projected_data"(output) by using "transformer" as random matrix R.
    projected_data = transformer.fit_transform(data)
    print(
        "new data dimensions after random projection: "
        + str(np.shape(projected_data)))
    #printing pdist() of projected data
    #print("pdist of points in JL projected data")
    #print(sp.pdist(projected_data))
    print("\n\n")
    return sp.pdist(projected_data)
def train_drfs(train_x, train_y, eps=0.5, threshold="median"):
    n_samples, n_features, n_classes = \
            get_counts_tt(train_x, train_y)

    # pick number of components
    min_comp = random_projection.johnson_lindenstrauss_min_dim( \
            n_samples=n_samples, eps=eps)
    min_comp = min(min_comp, n_features)

    # scale and agglomerate to min_comp
    #scaler = preprocessing.StandardScaler()
    scaler = preprocessing.QuantileTransformer()
    feat_agg = cluster.FeatureAgglomeration( \
            n_clusters=min_comp)
    xtc = ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
    scaler2 = preprocessing.RobustScaler()
    #poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True)

    # train the model pipeline
    dr_pipe = pipeline.Pipeline([('scaler', scaler), \
            ('feat_agg', feat_agg), ('scaler2', scaler2)])

    dr_pipe.fit(train_x)

    # transform train_x to train xtc
    train_x = dr_pipe.transform(train_x)
    # train the xtc
    xtc.fit(train_x, train_y)

    print("Feature importances:")
    print("\tMax:", max(xtc.feature_importances_))
    print("\tMin:", min(xtc.feature_importances_))
    #print(xtc.feature_importances_)

    # create the feature selection model from the xtc
    feat_sel = feature_selection.SelectFromModel( \
            xtc, prefit=True, threshold=threshold)

    # create the pipeline to reduce dim then feature select
    drfs_pipe = pipeline.Pipeline(\
            [('dr_pipe', dr_pipe), ('feat_sel', feat_sel)])

    return drfs_pipe
 def plot_JL_curve(self):
     '''
     Plot the Johnson-Lindenstrauss minimum dimensions curve against the maximum distortion rate for Random Projection.
     The plot is also saved to a local jpg file.
     '''
     fig = plt.figure(figsize=(6, 4))
     eps_range = np.linspace(0.01, 0.99, 100)
     min_n_components = johnson_lindenstrauss_min_dim(n_samples=len(self.X),
                                                      eps=eps_range)
     plt.plot(eps_range, min_n_components)
     plt.xlabel('maximum distortion rate', fontsize=16)
      plt.ylabel('minimum dimensions to keep', fontsize=16)
     plt.ylim(0, 20000)
     plt.title(
         'johnson_lindenstrauss_min_dim vs max_distortion_rate \nsample size = '
         + str(len(self.X)),
         fontsize=16)
     plt.show()
     return plt2base64(plt)
Example #27
def rp(X_train, X_test):
    num_components = johnson_lindenstrauss_min_dim(n_samples=X_train.shape[0],
                                                   eps=0.1)
    print(num_components)
    print("# features: ", X_train.shape[1], " JL min dim:", num_components)
    print("JL number > #features so cant make any JL guarentees")
    # Of course not! It simply means that we can’t make any assumptions regarding the preservation of pairwise distances between data points.

    accuracies = []
    components = np.int32(np.linspace(2, 64, 20))

    model = LinearSVC()
    model.fit(X_train, y_train)
    baseline = metrics.accuracy_score(model.predict(X_test), y_test)

    # loop over the projection sizes
    for comp in components:
        # create the random projection
        sp = SparseRandomProjection(n_components=comp)
        X = sp.fit_transform(X_train)

        # train a classifier on the sparse random projection
        model = LinearSVC()
        model.fit(X, y_train)

        # evaluate the model and update the list of accuracies
        test = sp.transform(X_test)
        accuracies.append(metrics.accuracy_score(model.predict(test), y_test))

    # create the figure
    plt.figure()
    plt.title("Accuracy of Sparse Projection on Sonar")
    plt.xlabel("# of Components")
    plt.ylabel("Accuracy")
    plt.xlim([2, 64])
    plt.ylim([0, 1.0])

    # plot the baseline and random projection accuracies
    plt.plot(components, [baseline] * len(accuracies), color="r")
    plt.plot(components, accuracies)

    plt.show()
Example #28
def preparation(inputFile, dim=0):
    vectorizer = HashingVectorizer()  # compute "TF"
    testCases = [line.rstrip("\n") for line in open(inputFile)]
    testSuite = vectorizer.fit_transform(testCases)

    # dimensionality reduction
    if dim <= 0:
        e = 0.5  # epsilon in jl lemma
        dim = johnson_lindenstrauss_min_dim(len(testCases), eps=e)
    srp = SparseRandomProjection(n_components=dim)
    projectedTestSuite = srp.fit_transform(testSuite)

    # map sparse matrix to dict
    TS = []
    for i in range(len(testCases)):
        tc = {}
        for j in projectedTestSuite[i].nonzero()[1]:
            tc[j] = projectedTestSuite[i, j]
        TS.append(tc)

    return TS
def checkOptimaldimensionality(s):
    # range of distortions
    eps_range = np.linspace(0.1, 0.99, 10)
    colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

    # range of number of samples (observation) to embed
    n_samples_range = np.logspace(1, 4, s)

    plt.figure()
    for eps, color in zip(eps_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples_range,
                                                         eps=eps)
        plt.loglog(n_samples_range, min_n_components, color=color)
    plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
    plt.xlabel("Number of observations to eps-embed")
    plt.ylabel("Minimum number of dimensions")
    plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
    plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
    plt.xlabel("Number of observations to eps-embed")
    plt.ylabel("Minimum number of dimensions")
    plt.title(
        "Johnson-Lindenstrauss bounds:\nn_samples vs n_components w.r.t eps")
        sys.exit(1)
    opts.n_components = type_auto_or_int(opts.n_components)
    opts.density = type_auto_or_float(opts.density)
    selected_transformers = opts.selected_transformers.split(",")

    ###########################################################################
    # Generate dataset
    ###########################################################################
    n_nonzeros = int(opts.ratio_nonzeros * opts.n_features)

    print("Dataset statics")
    print("===========================")
    print("n_samples \t= %s" % opts.n_samples)
    print("n_features \t= %s" % opts.n_features)
    if opts.n_components == "auto":
        print("n_components \t= %s (auto)" % johnson_lindenstrauss_min_dim(n_samples=opts.n_samples, eps=opts.eps))
    else:
        print("n_components \t= %s" % opts.n_components)
    print("n_elements \t= %s" % (opts.n_features * opts.n_samples))
    print("n_nonzeros \t= %s per feature" % n_nonzeros)
    print("ratio_nonzeros \t= %s" % opts.ratio_nonzeros)
    print("")

    ###########################################################################
    # Set transformer input
    ###########################################################################
    transformers = {}

    ###########################################################################
    # Set GaussianRandomProjection input
    gaussian_matrix_params = {"n_components": opts.n_components, "random_state": opts.random_seed}
Example #31
File: a3.py Project: rkaufholz3/a3
def rp_analysis(X, y, dataset, plot, X_test):

    if plot:
        # Project in 2D for visualization
        rp = GaussianRandomProjection(n_components=2)
        projected = rp.fit_transform(X)
        plot_2d(projected, y)

        # Project in 3D for visualization
        rp = GaussianRandomProjection(n_components=3)
        projected = rp.fit_transform(X)
        plot_3d(projected, y)

    # # Plot eps vs. n components
    # eps_range = [0.4, 0.6, 0.8, 0.99]  # For Fashion MNIST eps 0.4 to 0.999 (must be < 1)
    # num_components = []
    # for eps in eps_range:
    #     rp = GaussianRandomProjection(n_components='auto', eps=eps)
    #     projected = rp.fit_transform(X)
    #     num_components.append(projected.shape)
    # print(num_components)

    # Determine min components for varying eps
    min_dims = []
    eps_range = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]
    for e in eps_range:
        min_dims.append(johnson_lindenstrauss_min_dim(n_samples=X.shape[0], eps=e))
    print('\nmin dims', min_dims)
    print('\nX shape:', X.shape)

    # Measure variation across multiple runs
    means_list = []
    stdev_list = []
    kurtosis_list = []
    iterations = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    for i in iterations:
        rp3 = GaussianRandomProjection(n_components=10)  # 10 components to help visualize the variation
        projected3 = rp3.fit_transform(X)
        means_list.append(np.mean(projected3))
        stdev_list.append(np.std(projected3))
        kurtosis_list.append(np.mean(kurtosis(projected3)))
        projected_df = pd.DataFrame(projected3)
        projected_df.to_csv('projected.csv')
        print(plot_kurtosis(projected3))

    # http://kitchingroup.cheme.cmu.edu/blog/2013/09/13/Plotting-two-datasets-with-very-different-scales/
    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    ax1.plot(iterations, means_list, label='Mean', color='red')
    ax1.plot(iterations, stdev_list, label='Std Deviation', color='blue')
    ax1.set_xlabel('Iteration', fontsize=18)
    ax1.legend()
    ax2 = ax1.twinx()
    ax2.plot(iterations, kurtosis_list, label='Kurtosis', color='green')
    plt.legend()
    plt.show()

    # print('\ncomponents_ shape:', rp3.components_.shape)

    # Project on to an 'optimal' number of components
    rp2 = GaussianRandomProjection(n_components=331)
    projected2_train = rp2.fit_transform(X)
    projected2_test = rp2.transform(X_test)
    print('\nRP projected X_train:', projected2_train.shape)

    return projected2_train, projected2_test
Example #32
from sklearn import  datasets, metrics, decomposition, random_projection
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn.model_selection import train_test_split, validation_curve, learning_curve, ShuffleSplit,GridSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import LinearSVC
from functions import *

data = datasets.load_digits()
X = data.data
y = data.target

johnson_lindenstrauss_min_dim(1797,eps=0.1)

accuracies = []
components = range(2,X.shape[1])

split = train_test_split(X, y, test_size = 0.33,
    random_state = 42)
#digits = datasets.load_digits()
#split = train_test_split(digits.data, digits.target, test_size = 0.3,
#    random_state = 42)
(trainData, testData, trainTarget, testTarget) = split

model = LinearSVC()
model.fit(trainData, trainTarget)
baseline = metrics.accuracy_score(model.predict(testData), testTarget)
Example #33
colnames = [
    "make", "address", "all", "3d", "our", "over", "remove", "internet",
    "order", "mail", "receive", "will", "people", "report", "addresses",
    "free", "business", "email", "you", "credit", "your", "font", "000",
    "money", "hp", "hpl", "george", "650", "lab", "labs", "telnet", "857",
    "data", "415", "85", "technology", "1999", "parts", "pm", "direct", "cs",
    "meeting", "original", "project", "re", "edu", "table", "conference", ";",
    "(", "[", "!", "$", "#", "average", "longest", "total", "class"
]

data.columns = colnames
X, y = data.iloc[:, :-1], data.iloc[:, -1]
X.columns = colnames[:len(colnames) - 1]

print(johnson_lindenstrauss_min_dim(4601, eps=0.1))

split = train_test_split(X, y, test_size=0.3, random_state=42)
(trainData, testData, trainTarget, testTarget) = split
accuracies = []
components = np.int32(np.linspace(2, 56, 14))
model = LinearSVC()
model.fit(trainData, trainTarget)
baseline = metrics.accuracy_score(model.predict(testData), testTarget)
# loop over the projection sizes
for comp in components:
    # create the random projection
    sp = SparseRandomProjection(n_components=comp)
    X = sp.fit_transform(trainData)

    # train a classifier on the sparse random projection
def johnson_lindenstrauss(data, data_name):
    # `normed` is being deprecated in favor of `density` in histograms
    if LooseVersion(matplotlib.__version__) >= '2.1':
        density_param = {'density': True}
    else:
        density_param = {'normed': True}

    # Part 1: plot the theoretical dependency between n_components_min and
    # n_samples

    # range of admissible distortions
    eps_range = np.linspace(0.1, 0.99, 5)
    colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

    # range of number of samples (observation) to embed
    n_samples_range = np.logspace(1, 9, 9)

    plt.figure()
    for eps, color in zip(eps_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
        plt.loglog(n_samples_range, min_n_components, color=color)

    plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
    plt.xlabel("Number of observations to eps-embed")
    plt.ylabel("Minimum number of dimensions")
    plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
    plt.savefig('Figs/02b_rp_comp_samples')

    # range of admissible distortions
    eps_range = np.linspace(0.01, 0.99, 100)

    n_samples_range = np.logspace(2, 6, 5)
    colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range)))

    plt.figure()
    for n_samples, color in zip(n_samples_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)
        plt.semilogy(eps_range, min_n_components, color=color)

    plt.legend(["n_samples = %d" % n for n in n_samples_range], loc="upper right")
    plt.xlabel("Distortion eps")
    plt.ylabel("Minimum number of dimensions")
    plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
    plt.savefig('Figs/02b_rp_comp_eps')

    # Part 2: perform sparse random projection of some digits images which are
    # quite low dimensional and dense or documents of the 20 newsgroups dataset
    # which is both high dimensional and sparse

    n_samples, n_features = data.shape
    print("Embedding %d samples with dim %d using various random projections"
        % (n_samples, n_features))

    n_components_range = np.array([1,10,100,1000])
    dists = euclidean_distances(data, squared=True).ravel()

    # select only non-identical samples pairs
    nonzero = dists != 0
    dists = dists[nonzero]

    for n_components in n_components_range:
        t0 = time()
        rp = SparseRandomProjection(n_components=n_components)
        projected_data = rp.fit_transform(data)
        print("Projected %d samples from %d to %d in %0.3fs"
            % (n_samples, n_features, n_components, time() - t0))
        if hasattr(rp, 'components_'):
            n_bytes = rp.components_.data.nbytes
            n_bytes += rp.components_.indices.nbytes
            print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6))

        projected_dists = euclidean_distances(
            projected_data, squared=True).ravel()[nonzero]

        plt.figure()
        plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu)
        plt.xlabel("Pairwise squared distances in original space")
        plt.ylabel("Pairwise squared distances in projected space")
        plt.title("Pairwise distances distribution for n_components=%d" %
                n_components)
        cb = plt.colorbar()
        cb.set_label('Sample pairs counts')

        rates = projected_dists / dists
        print("Mean distances rate: %0.2f (%0.2f)"
            % (np.mean(rates), np.std(rates)))
        plt.savefig('Figs/02b_rp_pwdist_{}_{}'.format(data_name, n_components))

        plt.figure()
        plt.hist(rates, bins=50, range=(0., 2.), edgecolor='k', **density_param)
        plt.xlabel("Squared distances rate: projected / original")
        plt.ylabel("Distribution of samples pairs")
        plt.title("Histogram of pairwise distance rates for n_components=%d" %
                n_components)
        plt.savefig('Figs/02b_rp_histogram_{}_{}'.format(data_name, n_components))
        plt.clf()
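
A hypothetical invocation of the johnson_lindenstrauss function above on the digits dataset, assuming the matplotlib/scikit-learn imports from the snippet and an existing Figs/ directory for the savefig calls.

from sklearn.datasets import load_digits

digits = load_digits()  # ~1800 samples, 64 dense features
johnson_lindenstrauss(digits.data, "digits")
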
Example #35
def generate_real_dataset_rp(data_path, sparse=False, eps=0.1):
    ################################################ LOADING AND CLEANING THE DATA #########################################
    samples = open(os.path.join(data_path, 'samples.txt'))
    labels = open(os.path.join(data_path, 'labels.txt'))
    annotations = open(os.path.join(data_path, 'annotations.txt'))
    out_f = open(os.path.join(data_path,'rp_out'),'w')

    bad_samples = []
    real_labels = []
    qpoint_lists = []
    label_list = []
    annotation_list = []
    label_count = np.zeros((1,13))

    for data in samples:
        qpoint_lists = data.split(';')
    for data in labels:
        label_list = data.split(';')
    for data in annotations:
        annotation_list = data.split(';')

    out_s = 'found %i qpoint lists.\n' % len(qpoint_lists) + 'found %i labels.\n' % len(label_list) + 'found %i annotations.\n\n' % len(annotation_list)
    print out_s
    out_f.write(out_s)
    out_f.close()

    for list_ind in np.arange(len(qpoint_lists)):
        bad = False

        ################# PROCESS THE LABELS
        if annotation_list[list_ind][0:2] != 'vo' and annotation_list[list_ind][0:2] != 'fl' and annotation_list[list_ind][0:2] != 'mi' and annotation_list[list_ind][0:2] != 'ja':
            real_labels.append(0)
            label_count[0][0] += 1
        else:
            position = label_list[list_ind].split(',')
            if float(position[0]) == -2000 or float(position[0]) == -1000:
                real_labels.append(-1)
                bad = True
            else:
                lab = determine_label((float(position[0]),float(position[1]),float(position[2])))
                real_labels.append(lab)
                label_count[0][lab] += 1

        ################# PROCESS THE Q-POINTS
        qpoint_lists[list_ind] = qpoint_lists[list_ind].split(':')
        for point_ind in np.arange(len(qpoint_lists[list_ind])):
            qpoint_lists[list_ind][point_ind] = qpoint_lists[list_ind][point_ind].split(',')
            if len(qpoint_lists[list_ind][point_ind]) != 7:
                bad = True

        if bad:
            bad_samples.append(list_ind)

    print 'need to remove %i bad samples.' %len(bad_samples)
    ################# REMOVE BAD SAMPLES
    ind = 0
    for bad_ind in bad_samples:
        real_ind = bad_ind - ind
        qpoint_lists.pop(real_ind)
        real_labels.pop(real_ind)
        annotation_list.pop(real_ind)
        ind += 1

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = str(len(qpoint_lists)) + ' samples remain after purging.\n' + str(len(real_labels)) + ' labels remain after purging.\n'\
            + str(len(annotation_list)) + ' annotations remain after purging.\n' + 'percentages of the labels are %s\n\n' %str(label_count/len(qpoint_lists))
    print out_s
    out_f.write(out_s)
    out_f.close()

    samples.close()
    labels.close()
    annotations.close()

    ################################################## PROJECTING THE DATA INTO A GRID #####################################
    pcol = 0
    ps = 0

    # ASSUMPTION: relevant area is never less than 0.7 meters and more than 4.4 meters on the x-axis, 2.5 meters to both sides on the y-axis
    # and 2 meters on the z-axis away from the sensors
    bin_cm = 3
    max_x_cm = 440
    min_x_cm = 70
    max_y_cm = 250
    max_z_cm = 200

    x_range = max_x_cm / bin_cm - min_x_cm / bin_cm
    y_range = max_y_cm * 2 / bin_cm
    z_range = max_z_cm / bin_cm

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = 'length of data in original space: %d\n\n' %(x_range*y_range*z_range)
    print out_s
    out_f.write(out_s)
    out_f.close()

    # compute a conservative estimate of the number of latent dimensions required to guarantee the given epsilons
    n_dims = johnson_lindenstrauss_min_dim(len(qpoint_lists),eps)

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = 'number of latent dimensions needed to guarantee %f epsilon is %f\n\n' %(eps, n_dims)
    print out_s
    out_f.write(out_s)
    out_f.close()

    f_path = os.path.join(data_path,'rp_real_sparse.hdf5') if sparse else os.path.join(data_path,'rp_real_gauss.hdf5')
    print f_path
    f = h5.File(f_path, "w")
    f.create_dataset('data_set/data_set', (len(qpoint_lists), n_dims), dtype='f')
    f.create_dataset('labels/real_labels', (len(real_labels),), dtype='i')
    dt = h5.special_dtype(vlen=unicode)
    f.create_dataset('annotations/annotations', (len(annotation_list),), dtype=dt)

    transformer = random_projection.SparseRandomProjection(n_components=n_dims) if sparse else random_projection.GaussianRandomProjection(n_components=n_dims)
    if sparse:
        print 'performing projection with sparse matrix'
    else:
        print 'performing projection with gaussian matrix'

    # this is not the way it's supposed to be done BUT the proper training set doesn't fit into the memory
    transformer.components_ = transformer._make_random_matrix(n_dims, x_range*y_range*z_range)
    last_per = -1

    for ind, qpoint_list in enumerate(qpoint_lists):
        grid = np.zeros((x_range, y_range, z_range))

        for qpoint in qpoint_list:
            x = int(float(qpoint[0])*100) / bin_cm
            y = (int(float(qpoint[1])*100) + max_y_cm) / bin_cm
            z = int(float(qpoint[2])*100) / bin_cm
            if x - min_x_cm/bin_cm < 0 or x - min_x_cm/bin_cm > x_range-1 or y > y_range-1 or y < 0 or z > z_range-1 or z < 0:
                continue
            pow = float(qpoint[4])
            if grid[x-min_x_cm/bin_cm][y][z] != 0:
                pcol += 1
                if grid[x-min_x_cm/bin_cm][y][z] < pow:
                    grid[x-min_x_cm/bin_cm][y][z] = pow
            else:
                grid[x-min_x_cm/bin_cm][y][z] = pow
            ps += 1

        f['data_set/data_set'][ind] = transformer.transform(np.reshape(grid,(1,-1)))
        f['labels/real_labels'][ind] = real_labels[ind]
        f['annotations/annotations'][ind] = annotation_list[ind]
        curr_percent = int(float(ind) / len(qpoint_lists) * 100)
        if last_per != curr_percent:
            last_per = curr_percent
            out_f = open(os.path.join(data_path,'rp_out'),'a')
            out_s = 'have now looked at %i%% of the data.\n' % int(float(ind) / len(qpoint_lists) * 100)
            print out_s
            out_f.write(out_s)
            out_f.close()

    print 'done with projecting onto the grid (without binning)'
    print 'percentage of point collision: ' + str(float(pcol)/ps)
    print 'number of samples: ' +str(len(f['data_set/data_set']))
    print 'dimensionality of the samples: ' +str(len(f['data_set/data_set'][0]))
    print 'number of labels: ' +str(len(f['labels/real_labels']))
    print 'number of annotations: ' +str(len(f['annotations/annotations']))

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = 'projection done, new dimension is %d\n\n' %len(f['data_set/data_set'][0])
    print out_s
    out_f.write(out_s)
    out_f.close()

    f.close()

    if sparse:
        generate_train_val_test_set(os.path.join(data_path,"rp_real_sparse.hdf5"), os.path.join(data_path,"train_val_test_rp_real_sparse.hdf5"))
    else:
        generate_train_val_test_set(os.path.join(data_path,"rp_real_gauss.hdf5"), os.path.join(data_path,"train_val_test_rp_real_gauss.hdf5"))
from sklearn.datasets import load_digits
from sklearn.metrics.pairwise import euclidean_distances

# Part 1: plot the theoretical dependency between n_components_min and
# n_samples

# range of admissible distortions
eps_range = np.linspace(0.1, 0.99, 5)
colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

# range of number of samples (observation) to embed
n_samples_range = np.logspace(1, 9, 9)

plt.figure()
for eps, color in zip(eps_range, colors):
    min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
    plt.loglog(n_samples_range, min_n_components, color=color)

plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
plt.xlabel("Number of observations to eps-embed")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
plt.show()

# range of admissible distortions
eps_range = np.linspace(0.01, 0.99, 100)

# range of number of samples (observation) to embed
n_samples_range = np.logspace(2, 6, 5)
colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range)))
Example #38
File: hw3.py Project: jezlax/python
def min_features(scaled_data):
    print(johnson_lindenstrauss_min_dim(len(scaled_data), eps=0.1))
Example #39
        sys.exit(1)
    opts.n_components = type_auto_or_int(opts.n_components)
    opts.density = type_auto_or_float(opts.density)
    selected_transformers = opts.selected_transformers.split(',')

    ###########################################################################
    # Generate dataset
    ###########################################################################
    n_nonzeros = int(opts.ratio_nonzeros * opts.n_features)

    print('Dataset statistics')
    print("===========================")
    print('n_samples \t= %s' % opts.n_samples)
    print('n_features \t= %s' % opts.n_features)
    if opts.n_components == "auto":
        print('n_components \t= %s (auto)' % johnson_lindenstrauss_min_dim(
            n_samples=opts.n_samples, eps=opts.eps))
    else:
        print('n_components \t= %s' % opts.n_components)
    print('n_elements \t= %s' % (opts.n_features * opts.n_samples))
    print('n_nonzeros \t= %s per feature' % n_nonzeros)
    print('ratio_nonzeros \t= %s' % opts.ratio_nonzeros)
    print('')

    ###########################################################################
    # Set transformer input
    ###########################################################################
    transformers = {}

    ###########################################################################
    # Set GaussianRandomProjection input
    gaussian_matrix_params = {