Example #1
def test_input_size_jl_min_dim():
    with pytest.raises(ValueError):
        johnson_lindenstrauss_min_dim(3 * [100], eps=2 * [0.9])

    johnson_lindenstrauss_min_dim(
        np.random.randint(1, 10, size=(10, 10)), eps=np.full((10, 10), 0.5)
    )
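
For reference, the bound these examples query can be reproduced by hand. The sketch below (a made-up helper, assuming only NumPy and scikit-learn) evaluates the formula scikit-learn documents for johnson_lindenstrauss_min_dim, n_components >= 4 * log(n_samples) / (eps**2 / 2 - eps**3 / 3); the library additionally rounds the result to an integer.

import numpy as np
from sklearn.random_projection import johnson_lindenstrauss_min_dim


def jl_min_dim_by_hand(n_samples, eps):
    """Johnson-Lindenstrauss bound as documented by scikit-learn (kept as a float here)."""
    denominator = (eps ** 2 / 2.0) - (eps ** 3 / 3.0)
    return 4.0 * np.log(n_samples) / denominator


print(jl_min_dim_by_hand(1000, eps=0.1))             # ~5920.9
print(johnson_lindenstrauss_min_dim(1000, eps=0.1))  # same magnitude; exact rounding depends on the version
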
def test_input_size_jl_min_dim():
    assert_raises(ValueError, johnson_lindenstrauss_min_dim,
                  3 * [100], 2 * [0.9])

    johnson_lindenstrauss_min_dim(np.random.randint(1, 10, size=(10, 10)),
                                  np.full((10, 10), 0.5))
def test_input_size_jl_min_dim():
    assert_raises(ValueError, johnson_lindenstrauss_min_dim, 3 * [100],
                  2 * [0.9])

    johnson_lindenstrauss_min_dim(np.random.randint(1, 10, size=(10, 10)),
                                  0.5 * np.ones((10, 10)))
def plotDependencyEPS():

    """Plot thoretical dependency between n_components and eps"""
    
    # range of admissible distortions
    eps_range = np.linspace(0.01, 0.99, 100)

    # range of number of samples to embed
    n_samples_range = np.logspace(2, 6, 5)
    colors = pl.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range)))

    pl.figure()

    for n_samples, color in zip(n_samples_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples, \
                                                         eps=eps_range)
        pl.semilogy(eps_range, min_n_components, color=color)

    pl.legend(["n_samples = %d" % n for n in n_samples_range], \
              loc="upper right")

    pl.xlabel("Distortion eps")
    pl.ylabel("Minimum number of dimensions")
    pl.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
    pl.show()
Example #5
def plot_jl_bounds(label, X):
    """
    http://scikit-learn.org/stable/auto_examples/plot_johnson_lindenstrauss_bound.html#sphx-glr-auto-examples-plot-johnson-lindenstrauss-bound-py
    """
    print("calculating jl bounds")
    eps_ranges = []
    eps_ranges.append(np.linspace(0.2, 0.99, 5))

    # range of number of samples (observation) to embed
    n_samples_range = np.linspace(100, 6000, 5)

    for i, eps_range in enumerate(eps_ranges):
        colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))
        plt.figure()
        for eps, color in zip(eps_range, colors):
            min_n_components = johnson_lindenstrauss_min_dim(n_samples_range,
                                                             eps=eps)
            plt.plot(n_samples_range, min_n_components, color=color)

        plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="best")
        plt.xlabel("Number of observations to eps-embed")
        plt.ylabel("Minimum number of dimensions")
        plt.title("Johnson-Lindenstrauss bounds:\n%s Data" % (label))
        plt.axhline(y=X.shape[1], color='r', linestyle='--', alpha=0.3)
        plt.axvline(x=X.shape[0], color='r', linestyle='--', alpha=0.3)
        plt.savefig('%s-jlbounds.png' % (label.replace(" ", "-")))
        plt.close()
Example #6
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        eps = self.hyperparams['eps']
        n_components = johnson_lindenstrauss_min_dim(n_samples=self._x_dim,
                                                     eps=eps)
        _logger.info("[INFO] n_components is " + str(n_components))
        if n_components > self._y_dim:
            # Default n_components == 'auto' fails. Need to explicitly assign n_components
            self._model = GaussianRandomProjection(
                n_components=self._y_dim, random_state=self.random_seed)
        else:
            try:
                self._model = GaussianRandomProjection(
                    eps=eps, random_state=self.random_seed)
                self._model.fit(self._training_data)
            except Exception:
                _logger.info(
                    "[Warning] Using given eps value failed, will use default conditions."
                )
                self._model = GaussianRandomProjection()

        self._model.fit(self._training_data)

        self._fitted = True
        return CallResult(None, has_finished=True)
Example #7
def reduce_dimensions(data, random_state, target_dim=2):
    """
    Reduces the dimensionality of the data using UMAP for lower dimensions, PCA for higher dimensions,
    and possibly a random projection first if the number of dimensions is over the limit given by the
    Johnson–Lindenstrauss lemma. Works for NumPy arrays.

    Args:
        data: The input data.
        random_state: Random state to generate reproducible results.
        target_dim: The targeted dimension.

    Returns:
        Lower dimension representation of the data.
    """
    jl_limit = johnson_lindenstrauss_min_dim(n_samples=data.shape[0], eps=.3)
    pca_limit = 30

    if data.shape[1] > jl_limit and data.shape[1] > pca_limit:
        data = SparseRandomProjection(
            n_components=jl_limit,
            random_state=random_state).fit_transform(data)

    if data.shape[1] > pca_limit:
        data = PCA(n_components=pca_limit,
                   random_state=random_state).fit_transform(data)

    return UMAP(n_components=target_dim,
                n_neighbors=30,
                min_dist=0.0,
                random_state=random_state).fit_transform(data)
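
A hypothetical call to reduce_dimensions, assuming the imports the snippet relies on (NumPy, scikit-learn's PCA and SparseRandomProjection, umap-learn's UMAP) are in scope; the array shape and seed are made up for illustration.

import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(size=(500, 1000))  # 500 samples, 1000 features of arbitrary values

# With 500 samples and eps=0.3 the JL limit is below 1000, so the random
# projection and PCA stages both run before the final UMAP embedding.
embedding = reduce_dimensions(data, random_state=42, target_dim=2)
print(embedding.shape)  # (500, 2)
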
def plotDependencyComponents():

    """Plot thoretical dependency between n_samples and n_components"""

    # range of admissible distortions
    eps_range = np.linspace(0.1, 0.99, 5)
    colors = pl.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

    # range of number of samples to embed
    n_samples_range = np.logspace(1, 9, 9)

    
    pl.figure()

    for eps, color in zip(eps_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, \
                                                         eps=eps)
        pl.loglog(n_samples_range, min_n_components, color=color)

    pl.legend(["eps = %.1f" % eps for eps in eps_range], \
              loc="lower right")

    pl.xlabel("Number of observations to eps-embed")
    pl.ylabel("Minimum number of dimensions")
    pl.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
    pl.show()
Example #9
def rp(X_train, X_test):
        num_components = johnson_lindenstrauss_min_dim(n_samples=X_train.shape[0], eps=0.1)
        print(num_components)
        print("# features: ", X_train.shape[1], " JL min dim:", num_components)
        print("JL number > #features so cant make any JL guarentees")
        # Of course not! It simply means that we can’t make any assumptions regarding the preservation of pairwise distances between data points.

        accuracies = []
        components = np.int32(np.linspace(1, 19, 19))

        model = LinearSVC()
        model.fit(X_train, y_train)
        baseline = metrics.accuracy_score(model.predict(X_test), y_test)

        # loop over the projection sizes
        for comp in components:
            # create the random projection
            sp = SparseRandomProjection(n_components=comp)
            X = sp.fit_transform(X_train)

            # train a classifier on the sparse random projection
            # TODO this is wrong.. needs to be KMeans
            model = LinearSVC(max_iter=1000)
            model.fit(X, y_train)

            # evaluate the model and update the list of accuracies
            test = sp.transform(X_test)
            accuracies.append(metrics.accuracy_score(model.predict(test), y_test))

        # create the figure
        plt.figure()
        plt.title("Accuracy of Sparse Random Projection on Churn")
        plt.xlabel("# of Components")
        plt.ylabel("Accuracy")
        plt.xlim([1, 20])
        plt.ylim([0, 1.0])

        # plot the baseline and random projection accuracies
        plt.plot(components, [baseline] * len(accuracies), color="r")
        plt.plot(components, accuracies)

        plt.show()
        # average looks to be around 5 components in RP to best the baseline
        sp = SparseRandomProjection(n_components = 5)
        X_transformed = sp.fit_transform(X_train)

        km = KMeans(n_clusters=2,
                    init='k-means++',
                    n_init=10,
                    max_iter=300,
                    random_state=RAND)
        plot_silhouette(km, X_transformed, title="SRP(5) KM(2)")

        km = KMeans(n_clusters=3,
                    init='k-means++',
                    n_init=10,
                    max_iter=300,
                    random_state=RAND)
        plot_silhouette(km, X_transformed, title="SRP(5) KM(3)")
 def gaussianRP(self, parameters):
     # default parameters
     n_components = parameters["n_components"] if "n_components" in parameters else "auto"
     eps = parameters["eps"] if "eps" in parameters else 1e-1
     if('johnsonRP' in parameters):
         n_components = johnson_lindenstrauss_min_dim(parameters['johnsonRP']['n_samples'], eps=parameters['johnsonRP']['eps'])
     
     #algo Object
     return GaussianRandomProjection(n_components=n_components, eps=eps)
Example #11
def r_projection(input_data, no_components=None, e=0.1):
    if no_components is None:
        no_components = johnson_lindenstrauss_min_dim(
            n_samples=input_data.shape[0], eps=e)

    projected_data = random_projection.GaussianRandomProjection(
        n_components=no_components).fit_transform(input_data)

    return projected_data
Example #13
File: jl.py Project: menisadi/pydp
def test():
    s = 50
    d = 1000
    miu = 0.3
    k = johnson_lindenstrauss_min_dim(s, eps=miu)
    if k > d:
        raise ValueError("can't embed into smaller dimension")
    # TODO check the result guarantee of jl and change the 'print' to 'assure'
    print(__test_transform__(s, d, k, miu, 100))
def reduction(eps, input_x, out_dir):
    print('JL bound:', random_projection.johnson_lindenstrauss_min_dim(len(input_x[0]), eps=eps), '(eps={})'.format(eps))
    transformer = random_projection.GaussianRandomProjection(n_components=50, eps=eps)
    data_reduced = transformer.fit_transform(input_x)
    with open('{}/projection'.format(out_dir), "w") as op:
        for component in data_reduced:
            line = ', '.join(str(round(e, 3)) for e in component)
            op.write(line + '\n')
    return data_reduced
Example #15
 def _get_eps(self, n_samples, n_dims, n_slice=int(1e4)):
   new_dim = n_dims * self.keep_rate
   for i in range(1, n_slice):
     eps = i / n_slice
     jl_dim = johnson_lindenstrauss_min_dim(n_samples=n_samples, eps=eps)
     if jl_dim <= new_dim:
       print("rate %.3f, n_dims %d, new_dim %d, dims error rate: %.4f" % (self.keep_rate, n_dims, jl_dim, ((new_dim-jl_dim) / new_dim)) )
       return eps
   return -1
def determine_min_dim(params, x_data):
    eps = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    min_dim = johnson_lindenstrauss_min_dim(x_data.shape[0], eps=eps)

    plt.figure()
    plt.plot(eps, min_dim)
    plt.ylabel("Minimum Number of Dimensions")
    plt.xlabel("Distortion EPS")
    plt.title(params['min_dim_graph'])
    plt.savefig(params['min_dim_graph'] + ".png")
 def sparseRP(self, parameters):
     # default parameters
     n_components = parameters["n_components"] if "n_components" in parameters else "auto"
     density = parameters["density"] if "density" in parameters else 'auto'
     eps = parameters["eps"] if "eps" in parameters else 1e-1
     if('johnsonRP' in parameters):
         n_components = johnson_lindenstrauss_min_dim(parameters['johnsonRP']['n_samples'], eps=parameters['johnsonRP']['eps'])
     
     #algo Object
     return SparseRandomProjection(n_components=n_components, eps=eps, density=density)
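
Spelled out without the wrapper class, the 'johnsonRP' branch of the gaussianRP/sparseRP factories above amounts to the following; the sample count and eps are made-up illustration values.

from sklearn.random_projection import (GaussianRandomProjection,
                                       SparseRandomProjection,
                                       johnson_lindenstrauss_min_dim)

# Derive n_components from the JL bound instead of passing it explicitly.
n_components = johnson_lindenstrauss_min_dim(10000, eps=0.25)
srp = SparseRandomProjection(n_components=n_components, eps=0.25, density="auto")
grp = GaussianRandomProjection(n_components=n_components, eps=0.25)
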
Example #18
def dim_reduce(
    points,
    technique='tsne',
    random_state=2021,
    n_components=2,
):
    if technique == 'tsne':
        tsne = TSNE(
            n_components=n_components,
            perplexity=30.0,
            early_exaggeration=12.0,
            learning_rate=200.0,
            n_iter=1000,
            n_iter_without_progress=300,
            min_grad_norm=1e-07,
            metric='euclidean',
            init='random',
            verbose=100,
            random_state=random_state,
            method='barnes_hut',
            angle=0.5,
            n_jobs=None,
        )
        transformed_points = tsne.fit_transform(points)
    elif technique == 'jlt':
        print(
            'Minimum JL components (eps = .99): ',
            johnson_lindenstrauss_min_dim(len(points), eps=0.99))
        grp = GaussianRandomProjection(n_components=n_components, eps=.99)
        transformed_points = grp.fit_transform(points)
    elif technique == 'pca':
        pca = PCA(
            n_components=n_components,
            copy=True,
            whiten=False,
            svd_solver='auto',
            tol=0.0,
            iterated_power='auto',
            random_state=random_state,
        )
        transformed_points = pca.fit_transform(points)

    elif technique == 'lle':
        lle = LocallyLinearEmbedding(
            n_components=n_components,
            random_state=random_state,
        )
        return lle.fit_transform(points)

    else:
        raise ValueError('Invalid technique.')

    return transformed_points
Example #19
def flastVectorization(dataPoints, reduceDim=True, dim=0, eps=0.33):
    countVec = CountVectorizer()
    Z_full = countVec.fit_transform(dataPoints)
    if reduceDim:
        if dim <= 0:
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
        return Z
    else:
        return Z_full
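
A hypothetical use of flastVectorization on a tiny corpus (assuming the snippet's imports: CountVectorizer, SparseRandomProjection, and johnson_lindenstrauss_min_dim from scikit-learn). With so few documents the JL bound exceeds the vocabulary size, so scikit-learn only warns that the dimensionality is not actually reduced.

docs = [
    "def test_login(): assert login(user, password)",
    "def test_logout(): assert logout(user)",
    "def test_login_twice(): assert login(user, password) and login(user, password)",
]
Z = flastVectorization(docs, reduceDim=True, eps=0.33)
print(Z.shape)  # (3, johnson_lindenstrauss_min_dim(3, eps=0.33))
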
 def fit(self,
         *,
         timeout: float = None,
         iterations: int = None) -> CallResult[None]:
     eps = self.hyperparams['eps']
     n_components = johnson_lindenstrauss_min_dim(n_samples=self._x_dim,
                                                  eps=eps)
     if n_components > self._x_dim:
         self._model = GaussianRandomProjection(n_components=self._x_dim)
     else:
         self._model = GaussianRandomProjection(eps=eps)
     self._model.fit(self._training_data)
Example #21
def preprocess(X, y):
    min_frame = min(X, key=lambda x: x.shape[2]).shape[2]
    X = np.array([x[:,:,:min_frame].flatten() for x in X])
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = scaler.fit(X)
    X = scaler.transform(X)
    
    # reduce principal components to improve performance
    reduced_pc = 2000
    recommended_pc = johnson_lindenstrauss_min_dim(861,eps=0.1)
    min_pc = recommended_pc - reduced_pc
    sp = SparseRandomProjection(n_components = int(min_pc))
    X = sp.fit_transform(X)
    return np.array(X), np.array(y)
def jlmd_search(ubs, names):
    epsilons = np.linspace(0.2, 0.999, 1000)
    y = []

    for eps in epsilons:
        y.append(johnson_lindenstrauss_min_dim(40000, eps=eps))

    plot.style.use('seaborn-darkgrid')
    ax = plot.subplots()[1]
    plot.title('Influence of epsilon on the minimum number of dimensions')
    plot.semilogy(epsilons, y)
    for ub in ubs:
        plot.semilogy([0, 1], [ub, ub])
    plot.legend(['Minimum number of dimensions', *names], loc='upper right')
    plot.show()
Example #23
 def fit(self, X):
     if self.n_components == -1:
         super().set_params(
             n_components=random_projection.johnson_lindenstrauss_min_dim(
                 n_samples=X.shape[0], eps=self.eps))
     try:
         return super().fit(X)
     except ValueError as e:
         if self.n_components >= X.shape[1]:
             raise RuntimeError("eps={} and n_samples={} lead to a target "
                                "dimension of {} which is larger than the "
                                "original space with n_features={}".format(
                                    self.eps, X.shape[0], self.n_components,
                                    X.shape[1]))
         else:
             raise
Example #24
def data_JL_proj_data_diff(data):
    n_row = len(data)
    # finding the minimum dimension reduction possible using the JL lemma, while preserving pairwise distances up to a given eps value.
    min_dim = random_projection.johnson_lindenstrauss_min_dim(n_row, eps=0.1)
    print("min dim suggested by JL lemma with eps = 0.1 is " + str(min_dim))
    # creating the transformer matrix used to project the input data to the target data: if O = I * R, the transformer is R here.
    transformer = random_projection.SparseRandomProjection()
    #transforming given "data"(input) to "projected_data"(output) by using "transformer" as random matrix R.
    projected_data = transformer.fit_transform(data)
    print(
        "new data dimensions after random projection: "
        + str(np.shape(projected_data)))
    #printing pdist() of projected data
    #print("pdist of points in JL projected data")
    #print(sp.pdist(projected_data))
    print("\n\n")
    return sp.pdist(projected_data)
def train_drfs(train_x, train_y, eps=0.5, threshold="median"):
    n_samples, n_features, n_classes = \
            get_counts_tt(train_x, train_y)

    # pick number of components
    min_comp = random_projection.johnson_lindenstrauss_min_dim( \
            n_samples=n_samples, eps=eps)
    min_comp = min(min_comp, n_features)

    # scale and agglomerate to min_comp
    #scaler = preprocessing.StandardScaler()
    scaler = preprocessing.QuantileTransformer()
    feat_agg = cluster.FeatureAgglomeration( \
            n_clusters=min_comp)
    xtc = ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
    scaler2 = preprocessing.RobustScaler()
    #poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True)

    # train the model pipeline
    dr_pipe = pipeline.Pipeline([('scaler', scaler), \
            ('feat_agg', feat_agg), ('scaler2', scaler2)])

    dr_pipe.fit(train_x)

    # transform train_x to train xtc
    train_x = dr_pipe.transform(train_x)
    # train the xtc
    xtc.fit(train_x, train_y)

    print("Feature importances:")
    print("\tMax:", max(xtc.feature_importances_))
    print("\tMin:", min(xtc.feature_importances_))
    #print(xtc.feature_importances_)

    # create the feature selection model from the xtc
    feat_sel = feature_selection.SelectFromModel( \
            xtc, prefit=True, threshold=threshold)

    # create the pipeline to reduce dim then feature select
    drfs_pipe = pipeline.Pipeline(\
            [('dr_pipe', dr_pipe), ('feat_sel', feat_sel)])

    return drfs_pipe
 def plot_JL_curve(self):
     '''
     Plot the Johnson-Lindenstrauss minimum dimensions curve against the maximum distortion rate for Random Projection.
     The plot is also saved to a local jpg file.
     '''
     fig = plt.figure(figsize=(6, 4))
     eps_range = np.linspace(0.01, 0.99, 100)
     min_n_components = johnson_lindenstrauss_min_dim(n_samples=len(self.X),
                                                      eps=eps_range)
     plt.plot(eps_range, min_n_components)
     plt.xlabel('maximum distortion rate', fontsize=16)
      plt.ylabel('minimum dimensions to keep', fontsize=16)
     plt.ylim(0, 20000)
     plt.title(
         'johnson_lindenstrauss_min_dim vs max_distortion_rate \nsample size = '
         + str(len(self.X)),
         fontsize=16)
     plt.show()
     return plt2base64(plt)
Example #27
def rp(X_train, X_test):
    num_components = johnson_lindenstrauss_min_dim(n_samples=X_train.shape[0],
                                                   eps=0.1)
    print(num_components)
    print("# features: ", X_train.shape[1], " JL min dim:", num_components)
    print("JL number > #features so cant make any JL guarentees")
    # Of course not! It simply means that we can’t make any assumptions regarding the preservation of pairwise distances between data points.

    accuracies = []
    components = np.int32(np.linspace(2, 64, 20))

    model = LinearSVC()
    model.fit(X_train, y_train)
    baseline = metrics.accuracy_score(model.predict(X_test), y_test)

    # loop over the projection sizes
    for comp in components:
        # create the random projection
        sp = SparseRandomProjection(n_components=comp)
        X = sp.fit_transform(X_train)

        # train a classifier on the sparse random projection
        model = LinearSVC()
        model.fit(X, y_train)

        # evaluate the model and update the list of accuracies
        test = sp.transform(X_test)
        accuracies.append(metrics.accuracy_score(model.predict(test), y_test))

    # create the figure
    plt.figure()
    plt.title("Accuracy of Sparse Projection on Sonar")
    plt.xlabel("# of Components")
    plt.ylabel("Accuracy")
    plt.xlim([2, 64])
    plt.ylim([0, 1.0])

    # plot the baseline and random projection accuracies
    plt.plot(components, [baseline] * len(accuracies), color="r")
    plt.plot(components, accuracies)

    plt.show()
Example #28
def preparation(inputFile, dim=0):
    vectorizer = HashingVectorizer()  # compute "TF"
    testCases = [line.rstrip("\n") for line in open(inputFile)]
    testSuite = vectorizer.fit_transform(testCases)

    # dimensionality reduction
    if dim <= 0:
        e = 0.5  # epsilon in jl lemma
        dim = johnson_lindenstrauss_min_dim(len(testCases), eps=e)
    srp = SparseRandomProjection(n_components=dim)
    projectedTestSuite = srp.fit_transform(testSuite)

    # map sparse matrix to dict
    TS = []
    for i in range(len(testCases)):
        tc = {}
        for j in projectedTestSuite[i].nonzero()[1]:
            tc[j] = projectedTestSuite[i, j]
        TS.append(tc)

    return TS
def checkOptimaldimensionality(s):
    # range of distortions
    eps_range = np.linspace(0.1, 0.99, 10)
    colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

    # range of number of samples (observation) to embed
    n_samples_range = np.logspace(1, 4, s)

    plt.figure()
    for eps, color in zip(eps_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples_range,
                                                         eps=eps)
        plt.loglog(n_samples_range, min_n_components, color=color)
    plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
    plt.xlabel("Number of observations to eps-embed")
    plt.ylabel("Minimum number of dimensions")
    plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
    plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
    plt.xlabel("Number of observations to eps-embed")
    plt.ylabel("Minimum number of dimensions")
    plt.title(
        "Johnson-Lindenstrauss bounds:\nn_samples vs n_components w.r.t eps")
        sys.exit(1)
    opts.n_components = type_auto_or_int(opts.n_components)
    opts.density = type_auto_or_float(opts.density)
    selected_transformers = opts.selected_transformers.split(",")

    ###########################################################################
    # Generate dataset
    ###########################################################################
    n_nonzeros = int(opts.ratio_nonzeros * opts.n_features)

    print("Dataset statics")
    print("===========================")
    print("n_samples \t= %s" % opts.n_samples)
    print("n_features \t= %s" % opts.n_features)
    if opts.n_components == "auto":
        print("n_components \t= %s (auto)" % johnson_lindenstrauss_min_dim(n_samples=opts.n_samples, eps=opts.eps))
    else:
        print("n_components \t= %s" % opts.n_components)
    print("n_elements \t= %s" % (opts.n_features * opts.n_samples))
    print("n_nonzeros \t= %s per feature" % n_nonzeros)
    print("ratio_nonzeros \t= %s" % opts.ratio_nonzeros)
    print("")

    ###########################################################################
    # Set transformer input
    ###########################################################################
    transformers = {}

    ###########################################################################
    # Set GaussianRandomProjection input
    gaussian_matrix_params = {"n_components": opts.n_components, "random_state": opts.random_seed}
Example #31
File: a3.py Project: rkaufholz3/a3
def rp_analysis(X, y, dataset, plot, X_test):

    if plot:
        # Project in 2D for visualization
        rp = GaussianRandomProjection(n_components=2)
        projected = rp.fit_transform(X)
        plot_2d(projected, y)

        # Project in 3D for visualization
        rp = GaussianRandomProjection(n_components=3)
        projected = rp.fit_transform(X)
        plot_3d(projected, y)

    # # Plot eps vs. n components
    # eps_range = [0.4, 0.6, 0.8, 0.99]  # For Fashion MNIST eps 0.4 to 0.999 (must be < 1)
    # num_components = []
    # for eps in eps_range:
    #     rp = GaussianRandomProjection(n_components='auto', eps=eps)
    #     projected = rp.fit_transform(X)
    #     num_components.append(projected.shape)
    # print(num_components)

    # Determine min components for varying eps
    min_dims = []
    eps_range = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]
    for e in eps_range:
        min_dims.append(johnson_lindenstrauss_min_dim(n_samples=X.shape[0], eps=e))
    print('\nmin dims', min_dims)
    print('\nX shape:', X.shape)

    # Measure variation across multiple runs
    means_list = []
    stdev_list = []
    kurtosis_list = []
    iterations = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    for i in iterations:
        rp3 = GaussianRandomProjection(n_components=10)  # 10 components to help visualize the variation
        projected3 = rp3.fit_transform(X)
        means_list.append(np.mean(projected3))
        stdev_list.append(np.std(projected3))
        kurtosis_list.append(np.mean(kurtosis(projected3)))
        projected_df = pd.DataFrame(projected3)
        projected_df.to_csv('projected.csv')
        print(plot_kurtosis(projected3))

    # http://kitchingroup.cheme.cmu.edu/blog/2013/09/13/Plotting-two-datasets-with-very-different-scales/
    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    ax1.plot(iterations, means_list, label='Mean', color='red')
    ax1.plot(iterations, stdev_list, label='Std Deviation', color='blue')
    ax1.set_xlabel('Iteration', fontsize=18)
    ax1.legend()
    ax2 = ax1.twinx()
    ax2.plot(iterations, kurtosis_list, label='Kurtosis', color='green')
    plt.legend()
    plt.show()

    # print('\ncomponents_ shape:', rp3.components_.shape)

    # Project on to an 'optimal' number of components
    rp2 = GaussianRandomProjection(n_components=331)
    projected2_train = rp2.fit_transform(X)
    projected2_test = rp2.transform(X_test)
    print('\nRP projected X_train:', projected2_train.shape)

    return projected2_train, projected2_test
Example #32
from sklearn import  datasets, metrics, decomposition, random_projection
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn.model_selection import train_test_split, validation_curve, learning_curve, ShuffleSplit,GridSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import LinearSVC
from functions import *

data = datasets.load_digits()
X = data.data
y = data.target

johnson_lindenstrauss_min_dim(1797,eps=0.1)

accuracies = []
components = range(2,X.shape[1])

split = train_test_split(X, y, test_size = 0.33,
    random_state = 42)
#digits = datasets.load_digits()
#split = train_test_split(digits.data, digits.target, test_size = 0.3,
#    random_state = 42)
(trainData, testData, trainTarget, testTarget) = split

model = LinearSVC()
model.fit(trainData, trainTarget)
baseline = metrics.accuracy_score(model.predict(testData), testTarget)
Example #33
colnames = [
    "make", "address", "all", "3d", "our", "over", "remove", "internet",
    "order", "mail", "receive", "will", "people", "report", "addresses",
    "free", "business", "email", "you", "credit", "your", "font", "000",
    "money", "hp", "hpl", "george", "650", "lab", "labs", "telnet", "857",
    "data", "415", "85", "technology", "1999", "parts", "pm", "direct", "cs",
    "meeting", "original", "project", "re", "edu", "table", "conference", ";",
    "(", "[", "!", "$", "#", "average", "longest", "total", "class"
]

data.columns = colnames
X, y = data.iloc[:, :-1], data.iloc[:, -1]
X.columns = colnames[:len(colnames) - 1]

print(johnson_lindenstrauss_min_dim(4601, eps=0.1))

split = train_test_split(X, y, test_size=0.3, random_state=42)
(trainData, testData, trainTarget, testTarget) = split
accuracies = []
components = np.int32(np.linspace(2, 56, 14))
model = LinearSVC()
model.fit(trainData, trainTarget)
baseline = metrics.accuracy_score(model.predict(testData), testTarget)
# loop over the projection sizes
for comp in components:
    # create the random projection
    sp = SparseRandomProjection(n_components=comp)
    X = sp.fit_transform(trainData)

    # train a classifier on the sparse random projection
def johnson_lindenstrauss(data, data_name):
    # `normed` is being deprecated in favor of `density` in histograms
    if LooseVersion(matplotlib.__version__) >= '2.1':
        density_param = {'density': True}
    else:
        density_param = {'normed': True}

    # Part 1: plot the theoretical dependency between n_components_min and
    # n_samples

    # range of admissible distortions
    eps_range = np.linspace(0.1, 0.99, 5)
    colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

    # range of number of samples (observation) to embed
    n_samples_range = np.logspace(1, 9, 9)

    plt.figure()
    for eps, color in zip(eps_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
        plt.loglog(n_samples_range, min_n_components, color=color)

    plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
    plt.xlabel("Number of observations to eps-embed")
    plt.ylabel("Minimum number of dimensions")
    plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
    plt.savefig('Figs/02b_rp_comp_samples')

    # range of admissible distortions
    eps_range = np.linspace(0.01, 0.99, 100)

    n_samples_range = np.logspace(2, 6, 5)
    colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range)))

    plt.figure()
    for n_samples, color in zip(n_samples_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)
        plt.semilogy(eps_range, min_n_components, color=color)

    plt.legend(["n_samples = %d" % n for n in n_samples_range], loc="upper right")
    plt.xlabel("Distortion eps")
    plt.ylabel("Minimum number of dimensions")
    plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
    plt.savefig('Figs/02b_rp_comp_eps')

    # Part 2: perform sparse random projection of some digits images which are
    # quite low dimensional and dense or documents of the 20 newsgroups dataset
    # which is both high dimensional and sparse

    n_samples, n_features = data.shape
    print("Embedding %d samples with dim %d using various random projections"
        % (n_samples, n_features))

    n_components_range = np.array([1,10,100,1000])
    dists = euclidean_distances(data, squared=True).ravel()

    # select only non-identical samples pairs
    nonzero = dists != 0
    dists = dists[nonzero]

    for n_components in n_components_range:
        t0 = time()
        rp = SparseRandomProjection(n_components=n_components)
        projected_data = rp.fit_transform(data)
        print("Projected %d samples from %d to %d in %0.3fs"
            % (n_samples, n_features, n_components, time() - t0))
        if hasattr(rp, 'components_'):
            n_bytes = rp.components_.data.nbytes
            n_bytes += rp.components_.indices.nbytes
            print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6))

        projected_dists = euclidean_distances(
            projected_data, squared=True).ravel()[nonzero]

        plt.figure()
        plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu)
        plt.xlabel("Pairwise squared distances in original space")
        plt.ylabel("Pairwise squared distances in projected space")
        plt.title("Pairwise distances distribution for n_components=%d" %
                n_components)
        cb = plt.colorbar()
        cb.set_label('Sample pairs counts')

        rates = projected_dists / dists
        print("Mean distances rate: %0.2f (%0.2f)"
            % (np.mean(rates), np.std(rates)))
        plt.savefig('Figs/02b_rp_pwdist_{}_{}'.format(data_name, n_components))

        plt.figure()
        plt.hist(rates, bins=50, range=(0., 2.), edgecolor='k', **density_param)
        plt.xlabel("Squared distances rate: projected / original")
        plt.ylabel("Distribution of samples pairs")
        plt.title("Histogram of pairwise distance rates for n_components=%d" %
                n_components)
        plt.savefig('Figs/02b_rp_histogram_{}_{}'.format(data_name, n_components))
        plt.clf()
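
A hypothetical invocation of the johnson_lindenstrauss function above on the digits dataset, assuming the matplotlib/scikit-learn imports from the snippet and an existing Figs/ directory for the savefig calls.

from sklearn.datasets import load_digits

digits = load_digits()  # ~1800 samples, 64 dense features
johnson_lindenstrauss(digits.data, "digits")
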
Example #35
def generate_real_dataset_rp(data_path, sparse=False, eps=0.1):
    ################################################ LOADING AND CLEANING THE DATA #########################################
    samples = open(os.path.join(data_path, 'samples.txt'))
    labels = open(os.path.join(data_path, 'labels.txt'))
    annotations = open(os.path.join(data_path, 'annotations.txt'))
    out_f = open(os.path.join(data_path,'rp_out'),'w')

    bad_samples = []
    real_labels = []
    qpoint_lists = []
    label_list = []
    annotation_list = []
    label_count = np.zeros((1,13))

    for data in samples:
        qpoint_lists = data.split(';')
    for data in labels:
        label_list = data.split(';')
    for data in annotations:
        annotation_list = data.split(';')

    out_s = 'found %i qpoint lists.\n' % len(qpoint_lists) + 'found %i labels.\n' % len(label_list) + 'found %i annotations.\n\n' % len(annotation_list)
    print out_s
    out_f.write(out_s)
    out_f.close()

    for list_ind in np.arange(len(qpoint_lists)):
        bad = False

        ################# PROCESS THE LABELS
        if annotation_list[list_ind][0:2] != 'vo' and annotation_list[list_ind][0:2] != 'fl' and annotation_list[list_ind][0:2] != 'mi' and annotation_list[list_ind][0:2] != 'ja':
            real_labels.append(0)
            label_count[0][0] += 1
        else:
            position = label_list[list_ind].split(',')
            if float(position[0]) == -2000 or float(position[0]) == -1000:
                real_labels.append(-1)
                bad = True
            else:
                lab = determine_label((float(position[0]),float(position[1]),float(position[2])))
                real_labels.append(lab)
                label_count[0][lab] += 1

        ################# PROCESS THE Q-POINTS
        qpoint_lists[list_ind] = qpoint_lists[list_ind].split(':')
        for point_ind in np.arange(len(qpoint_lists[list_ind])):
            qpoint_lists[list_ind][point_ind] = qpoint_lists[list_ind][point_ind].split(',')
            if len(qpoint_lists[list_ind][point_ind]) != 7:
                bad = True

        if bad:
            bad_samples.append(list_ind)

    print 'need to remove %i bad samples.' %len(bad_samples)
    ################# REMOVE BAD SAMPLES
    ind = 0
    for bad_ind in bad_samples:
        real_ind = bad_ind - ind
        qpoint_lists.pop(real_ind)
        real_labels.pop(real_ind)
        annotation_list.pop(real_ind)
        ind += 1

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = str(len(qpoint_lists)) + ' samples remain after purging.\n' + str(len(real_labels)) + ' labels remain after purging.\n'\
            + str(len(annotation_list)) + ' annotations remain after purging.\n' + 'percentages of the labels are %s\n\n' %str(label_count/len(qpoint_lists))
    print out_s
    out_f.write(out_s)
    out_f.close()

    samples.close()
    labels.close()
    annotations.close()

    ################################################## PROJECTING THE DATA INTO A GRID #####################################
    pcol = 0
    ps = 0

    # ASSUMPTION: relevant area is never less than 0.7 meters and more than 4.4 meters on the x-axis, 2.5 meters to both sides on the y-axis
    # and 2 meters on the z-axis away from the sensors
    bin_cm = 3
    max_x_cm = 440
    min_x_cm = 70
    max_y_cm = 250
    max_z_cm = 200

    x_range = max_x_cm / bin_cm - min_x_cm / bin_cm
    y_range = max_y_cm * 2 / bin_cm
    z_range = max_z_cm / bin_cm

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = 'length of data in original space: %d\n\n' %(x_range*y_range*z_range)
    print out_s
    out_f.write(out_s)
    out_f.close()

    # compute a conservative estimate of the number of latent dimensions required to guarantee the given epsilons
    n_dims = johnson_lindenstrauss_min_dim(len(qpoint_lists),eps)

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = 'number of latent dimensions needed to guarantee %f epsilon is %f\n\n' %(eps, n_dims)
    print out_s
    out_f.write(out_s)
    out_f.close()

    f_path = os.path.join(data_path,'rp_real_sparse.hdf5') if sparse else os.path.join(data_path,'rp_real_gauss.hdf5')
    print f_path
    f = h5.File(f_path, "w")
    f.create_dataset('data_set/data_set', (len(qpoint_lists), n_dims), dtype='f')
    f.create_dataset('labels/real_labels', (len(real_labels),), dtype='i')
    dt = h5.special_dtype(vlen=unicode)
    f.create_dataset('annotations/annotations', (len(annotation_list),), dtype=dt)

    transformer = random_projection.SparseRandomProjection(n_components=n_dims) if sparse else random_projection.GaussianRandomProjection(n_components=n_dims)
    if sparse:
        print 'performing projection with sparse matrix'
    else:
        print 'performing projection with gaussian matrix'

    # this is not the way it's supposed to be done BUT the proper training set doesn't fit into the memory
    transformer.components_ = transformer._make_random_matrix(n_dims, x_range*y_range*z_range)
    last_per = -1

    for ind, qpoint_list in enumerate(qpoint_lists):
        grid = np.zeros((x_range, y_range, z_range))

        for qpoint in qpoint_list:
            x = int(float(qpoint[0])*100) / bin_cm
            y = (int(float(qpoint[1])*100) + max_y_cm) / bin_cm
            z = int(float(qpoint[2])*100) / bin_cm
            if x - min_x_cm/bin_cm < 0 or x - min_x_cm/bin_cm > x_range-1 or y > y_range-1 or y < 0 or z > z_range-1 or z < 0:
                continue
            pow = float(qpoint[4])
            if grid[x-min_x_cm/bin_cm][y][z] != 0:
                pcol += 1
                if grid[x-min_x_cm/bin_cm][y][z] < pow:
                    grid[x-min_x_cm/bin_cm][y][z] = pow
            else:
                grid[x-min_x_cm/bin_cm][y][z] = pow
            ps += 1

        f['data_set/data_set'][ind] = transformer.transform(np.reshape(grid,(1,-1)))
        f['labels/real_labels'][ind] = real_labels[ind]
        f['annotations/annotations'][ind] = annotation_list[ind]
        curr_percent = int(float(ind) / len(qpoint_lists) * 100)
        if last_per != curr_percent:
            last_per = curr_percent
            out_f = open(os.path.join(data_path,'rp_out'),'a')
            out_s = 'have now looked at %i%% of the data.\n' % int(float(ind) / len(qpoint_lists) * 100)
            print out_s
            out_f.write(out_s)
            out_f.close()

    print 'done with projecting onto the grid (without binning)'
    print 'percentage of point collision: ' + str(float(pcol)/ps)
    print 'number of samples: ' +str(len(f['data_set/data_set']))
    print 'dimensionality of the samples: ' +str(len(f['data_set/data_set'][0]))
    print 'number of labels: ' +str(len(f['labels/real_labels']))
    print 'number of annotations: ' +str(len(f['annotations/annotations']))

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = 'projection done, new dimension is %d\n\n' %len(f['data_set/data_set'][0])
    print out_s
    out_f.write(out_s)
    out_f.close()

    f.close()

    if sparse:
        generate_train_val_test_set(os.path.join(data_path,"rp_real_sparse.hdf5"), os.path.join(data_path,"train_val_test_rp_real_sparse.hdf5"))
    else:
        generate_train_val_test_set(os.path.join(data_path,"rp_real_gauss.hdf5"), os.path.join(data_path,"train_val_test_rp_real_gauss.hdf5"))
from sklearn.datasets import load_digits
from sklearn.metrics.pairwise import euclidean_distances

# Part 1: plot the theoretical dependency between n_components_min and
# n_samples

# range of admissible distortions
eps_range = np.linspace(0.1, 0.99, 5)
colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

# range of number of samples (observation) to embed
n_samples_range = np.logspace(1, 9, 9)

plt.figure()
for eps, color in zip(eps_range, colors):
    min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
    plt.loglog(n_samples_range, min_n_components, color=color)

plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
plt.xlabel("Number of observations to eps-embed")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
plt.show()

# range of admissible distortions
eps_range = np.linspace(0.01, 0.99, 100)

# range of number of samples (observation) to embed
n_samples_range = np.logspace(2, 6, 5)
colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range)))
Example #38
File: hw3.py Project: jezlax/python
def min_features(scaled_data):
    print(johnson_lindenstrauss_min_dim(len(scaled_data), eps=0.1))
Example #39
        sys.exit(1)
    opts.n_components = type_auto_or_int(opts.n_components)
    opts.density = type_auto_or_float(opts.density)
    selected_transformers = opts.selected_transformers.split(',')

    ###########################################################################
    # Generate dataset
    ###########################################################################
    n_nonzeros = int(opts.ratio_nonzeros * opts.n_features)

    print('Dataset statistics')
    print("===========================")
    print('n_samples \t= %s' % opts.n_samples)
    print('n_features \t= %s' % opts.n_features)
    if opts.n_components == "auto":
        print('n_components \t= %s (auto)' % johnson_lindenstrauss_min_dim(
            n_samples=opts.n_samples, eps=opts.eps))
    else:
        print('n_components \t= %s' % opts.n_components)
    print('n_elements \t= %s' % (opts.n_features * opts.n_samples))
    print('n_nonzeros \t= %s per feature' % n_nonzeros)
    print('ratio_nonzeros \t= %s' % opts.ratio_nonzeros)
    print('')

    ###########################################################################
    # Set transformer input
    ###########################################################################
    transformers = {}

    ###########################################################################
    # Set GaussianRandomProjection input
    gaussian_matrix_params = {