Python johnson_lindenstrauss_min_dimの例、sklearn.random_projection.johnson_lindenstrauss_min_dim Pythonの例

コード例 #1

0

ファイルを表示

def test_input_size_jl_min_dim():
    """Check how johnson_lindenstrauss_min_dim handles array-like arguments.

    A length-3 ``n_samples`` paired with a length-2 ``eps`` is expected to
    raise ``ValueError``; two arguments sharing the shape (10, 10) are
    expected to be accepted without error.
    """
    mismatched_n_samples = 3 * [100]
    mismatched_eps = 2 * [0.9]
    with pytest.raises(ValueError):
        johnson_lindenstrauss_min_dim(mismatched_n_samples, eps=mismatched_eps)

    n_samples_grid = np.random.randint(1, 10, size=(10, 10))
    eps_grid = np.full((10, 10), 0.5)
    johnson_lindenstrauss_min_dim(n_samples_grid, eps=eps_grid)

コード例 #2

0

ファイルを表示

ファイル: test_random_projection.py プロジェクト: allefpablo/scikit-learn

def test_input_size_jl_min_dim():
    """Check how johnson_lindenstrauss_min_dim handles array-like arguments.

    A length-3 ``n_samples`` paired with a length-2 ``eps`` must raise
    ``ValueError``; two arguments sharing the shape (10, 10) must be
    accepted without error.
    """
    # This exact assertion used to be written twice; once is sufficient.
    assert_raises(ValueError, johnson_lindenstrauss_min_dim,
                  3 * [100], 2 * [0.9])

    johnson_lindenstrauss_min_dim(np.random.randint(1, 10, size=(10, 10)),
                                  np.full((10, 10), 0.5))

コード例 #3

0

ファイルを表示

ファイル: test_random_projection.py プロジェクト: zinc-40/scikit-learn

def test_input_size_jl_min_dim():
    """Check how johnson_lindenstrauss_min_dim handles array-like arguments.

    A length-3 ``n_samples`` paired with a length-2 ``eps`` must raise
    ``ValueError``; two arguments sharing the shape (10, 10) must be
    accepted without error.
    """
    # This exact assertion used to be written twice; once is sufficient.
    assert_raises(ValueError, johnson_lindenstrauss_min_dim, 3 * [100],
                  2 * [0.9])

    johnson_lindenstrauss_min_dim(np.random.randint(1, 10, size=(10, 10)),
                                  0.5 * np.ones((10, 10)))

コード例 #4

0

ファイルを表示

ファイル: the_Johnson-Lindenstrauss_bound_for_embedding_with_random_projections.py プロジェクト: AkiraKane/Python

def plotDependencyEPS():
    """Plot the theoretical dependency between n_components and eps.

    One curve is drawn per sample count: the distortion eps runs along the
    x-axis and the Johnson-Lindenstrauss minimum number of components is
    shown on a logarithmic y-axis. The figure is displayed with ``pl.show()``.
    """
    # admissible distortions (x-axis values)
    distortions = np.linspace(0.01, 0.99, 100)

    # sample counts to embed: one curve, and one colour, per count
    sample_counts = np.logspace(2, 6, 5)
    palette = pl.cm.Blues(np.linspace(0.3, 1.0, len(sample_counts)))

    pl.figure()

    for count, shade in zip(sample_counts, palette):
        bound = johnson_lindenstrauss_min_dim(count, eps=distortions)
        pl.semilogy(distortions, bound, color=shade)

    legend_labels = ["n_samples = %d" % count for count in sample_counts]
    pl.legend(legend_labels, loc="upper right")

    pl.xlabel("Distortion eps")
    pl.ylabel("Minimum number of dimensions")
    pl.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
    pl.show()

コード例 #5

0

ファイルを表示

ファイル: q2rp.py プロジェクト: auimendoza/cs7641-omscs-a3

def plot_jl_bounds(label, X):
    """Save a plot of Johnson-Lindenstrauss bounds for a labelled dataset.

    For each distortion value the minimum number of dimensions is plotted
    against a range of sample counts. Dashed red guide lines mark
    ``X.shape[1]`` (horizontal) and ``X.shape[0]`` (vertical). The figure is
    saved as ``'<label>-jlbounds.png'`` (spaces in the label replaced by
    dashes) and then closed.

    Reference:
    http://scikit-learn.org/stable/auto_examples/plot_johnson_lindenstrauss_bound.html#sphx-glr-auto-examples-plot-johnson-lindenstrauss-bound-py
    """
    print("calculating jl bounds")
    distortion_sets = [np.linspace(0.2, 0.99, 5)]

    # sample counts (observations) to embed
    sample_counts = np.linspace(100, 6000, 5)

    for distortions in distortion_sets:
        palette = plt.cm.Blues(np.linspace(0.3, 1.0, len(distortions)))
        plt.figure()
        for distortion, shade in zip(distortions, palette):
            bound = johnson_lindenstrauss_min_dim(sample_counts,
                                                  eps=distortion)
            plt.plot(sample_counts, bound, color=shade)

        legend_labels = ["eps = %0.1f" % distortion
                         for distortion in distortions]
        plt.legend(legend_labels, loc="best")
        plt.xlabel("Number of observations to eps-embed")
        plt.ylabel("Minimum number of dimensions")
        plt.title("Johnson-Lindenstrauss bounds:\n%s Data" % (label))
        plt.axhline(y=X.shape[1], color='r', linestyle='--', alpha=0.3)
        plt.axvline(x=X.shape[0], color='r', linestyle='--', alpha=0.3)
        plt.gcf()
        output_name = '%s-jlbounds.png' % (label.replace(" ", "-"))
        plt.savefig(output_name)
        plt.close()
コード例 #6

0

ファイルを表示

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """Fit a Gaussian random projection on the stored training data.

        The Johnson-Lindenstrauss minimum dimension is computed from
        ``self._x_dim`` and the ``eps`` hyperparameter. When that bound
        exceeds ``self._y_dim`` the projection is built with an explicit
        ``n_components=self._y_dim``; otherwise the projection is built from
        ``eps`` and, should that fail, from the estimator's defaults.

        Args:
            timeout: Accepted but not used by this method.
            iterations: Accepted but not used by this method.

        Returns:
            ``CallResult(None, has_finished=True)``.
        """
        eps = self.hyperparams['eps']
        n_components = johnson_lindenstrauss_min_dim(n_samples=self._x_dim,
                                                     eps=eps)
        _logger.info("[INFO] n_components is " + str(n_components))
        if n_components > self._y_dim:
            # Default n_components == 'auto' fails. Need to explicitly assign n_components
            self._model = GaussianRandomProjection(
                n_components=self._y_dim, random_state=self.random_seed)
            self._model.fit(self._training_data)
        else:
            try:
                self._model = GaussianRandomProjection(
                    eps=eps, random_state=self.random_seed)
                self._model.fit(self._training_data)
            except Exception:
                # Catch Exception rather than everything so that
                # KeyboardInterrupt / SystemExit still propagate.
                _logger.info(
                    "[Warning] Using given eps value failed, will use default conditions."
                )
                self._model = GaussianRandomProjection()
                self._model.fit(self._training_data)

        # Each branch above fits exactly once; the model is no longer
        # refitted after a successful eps-based fit.
        self._fitted = True
        return CallResult(None, has_finished=True)

コード例 #7

0

ファイルを表示

ファイル: ml_utils.py プロジェクト: SIholin/dpEmu

def reduce_dimensions(data, random_state, target_dim=2):
    """
    Project data down to ``target_dim`` dimensions in up to three stages.

    A sparse random projection is applied first when the feature count exceeds
    both the Johnson–Lindenstrauss bound (computed with eps=0.3) and the PCA
    limit of 30. PCA then reduces to 30 features if more than 30 remain, and
    UMAP produces the final embedding. Works for NumPy arrays.

    Args:
        data: The input data.
        random_state: Random state to generate reproducible results.
        target_dim: The targeted dimension.

    Returns:
        Lower dimension representation of the data.
    """
    pca_limit = 30
    jl_limit = johnson_lindenstrauss_min_dim(n_samples=data.shape[0], eps=.3)

    # Stage 1: random projection, only for very wide inputs.
    n_features = data.shape[1]
    if n_features > jl_limit and n_features > pca_limit:
        projector = SparseRandomProjection(n_components=jl_limit,
                                           random_state=random_state)
        data = projector.fit_transform(data)

    # Stage 2: PCA down to the fixed limit.
    if data.shape[1] > pca_limit:
        pca = PCA(n_components=pca_limit, random_state=random_state)
        data = pca.fit_transform(data)

    # Stage 3: UMAP to the requested dimensionality.
    embedder = UMAP(n_components=target_dim,
                    n_neighbors=30,
                    min_dist=0.0,
                    random_state=random_state)
    return embedder.fit_transform(data)

コード例 #8

0

ファイルを表示

ファイル: the_Johnson-Lindenstrauss_bound_for_embedding_with_random_projections.py プロジェクト: AkiraKane/Python

def plotDependencyComponents():
    """Plot the theoretical dependency between n_samples and n_components.

    One curve is drawn per distortion value on log-log axes: the number of
    samples runs along the x-axis and the Johnson-Lindenstrauss minimum
    number of components along the y-axis. The figure is displayed with
    ``pl.show()``.
    """
    # admissible distortions: one curve, and one colour, per value
    distortions = np.linspace(0.1, 0.99, 5)
    palette = pl.cm.Blues(np.linspace(0.3, 1.0, len(distortions)))

    # sample counts to embed (x-axis values)
    sample_counts = np.logspace(1, 9, 9)

    pl.figure()

    for distortion, shade in zip(distortions, palette):
        bound = johnson_lindenstrauss_min_dim(sample_counts, eps=distortion)
        pl.loglog(sample_counts, bound, color=shade)

    legend_labels = ["eps = %.1f" % distortion for distortion in distortions]
    pl.legend(legend_labels, loc="lower right")

    pl.xlabel("Number of observations to eps-embed")
    pl.ylabel("Minimum number of dimensions")
    pl.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
    pl.show()

コード例 #9

0

ファイルを表示

def rp(X_train, X_test):
        """Compare sparse random projections of several sizes to a baseline.

        Prints the Johnson-Lindenstrauss minimum dimension for the training
        set (eps=0.1), trains a LinearSVC on the unprojected features as a
        baseline, then for each projection size fits a SparseRandomProjection
        plus a LinearSVC and records the test accuracy. The accuracies are
        plotted against the baseline, after which a 5-component projection is
        passed to ``plot_silhouette`` with 2- and 3-cluster KMeans models.

        NOTE(review): ``y_train``, ``y_test``, ``RAND`` and ``plot_silhouette``
        are not parameters or locals; they must exist in an enclosing scope.

        Args:
            X_train: Training feature matrix (``.shape`` is read).
            X_test: Test feature matrix.

        Returns:
            None; the function only prints and plots.
        """
        num_components = johnson_lindenstrauss_min_dim(n_samples=X_train.shape[0], eps=0.1)
        print(num_components)
        print("# features: ", X_train.shape[1], " JL min dim:", num_components)
        # This message is printed unconditionally; no comparison is made here.
        print("JL number > #features so cant make any JL guarentees")
        # Exceeding the JL bound only means no assumption can be made about the
        # preservation of pairwise distances between data points.

        accuracies = []
        # 19 evenly spaced values from 1 to 19, cast to int32
        components = np.int32(np.linspace(1, 19, 19))

        # baseline: classifier trained on the unprojected features
        model = LinearSVC()
        model.fit(X_train, y_train)
        baseline = metrics.accuracy_score(model.predict(X_test), y_test)

        # loop over the projection sizes
        for comp in components:
            # create the random projection (no random_state is passed)
            sp = SparseRandomProjection(n_components=comp)
            X = sp.fit_transform(X_train)

            # train a classifier on the sparse random projection
            # TODO this is wrong.. needs to be KMeans
            model = LinearSVC(max_iter=1000)
            model.fit(X, y_train)

            # evaluate the model and update the list of accuracies
            test = sp.transform(X_test)
            accuracies.append(metrics.accuracy_score(model.predict(test), y_test))

        # create the figure
        plt.figure()
        plt.title("Accuracy of Sparse Random Projection on Churn")
        plt.xlabel("# of Components")
        plt.ylabel("Accuracy")
        plt.xlim([1, 20])
        plt.ylim([0, 1.0])

        # plot the baseline (flat red line) and random projection accuracies
        plt.plot(components, [baseline] * len(accuracies), color="r")
        plt.plot(components, accuracies)

        plt.show()
        # average looks to be around 5 components in RP to best the baseline
        sp = SparseRandomProjection(n_components = 5)
        X_transformed = sp.fit_transform(X_train)

        # silhouette plot for 2 clusters on the 5-component projection
        km = KMeans(n_clusters=2,
                    init='k-means++',
                    n_init=10,
                    max_iter=300,
                    random_state=RAND)
        plot_silhouette(km, X_transformed, title="SRP(5) KM(2)")

        # silhouette plot for 3 clusters on the same projection
        km = KMeans(n_clusters=3,
                    init='k-means++',
                    n_init=10,
                    max_iter=300,
                    random_state=RAND)
        plot_silhouette(km, X_transformed, title="SRP(5) KM(3)")

コード例 #10

0

ファイルを表示

ファイル: utils.py プロジェクト: jean3108/Research-Data-Science-Methodology

 def gaussianRP(self, parameters):
     """Build a GaussianRandomProjection from a parameter mapping.

     ``n_components`` defaults to ``"auto"`` and ``eps`` to ``1e-1`` when the
     corresponding key is absent. If a ``'johnsonRP'`` entry is present, its
     ``'n_samples'`` and ``'eps'`` values are used to compute ``n_components``
     through johnson_lindenstrauss_min_dim, overriding any other value.
     """
     # start from the defaults, then apply whatever the caller supplied
     n_components = "auto"
     if "n_components" in parameters:
         n_components = parameters["n_components"]

     eps = 1e-1
     if "eps" in parameters:
         eps = parameters["eps"]

     if 'johnsonRP' in parameters:
         jl_args = parameters['johnsonRP']
         n_components = johnson_lindenstrauss_min_dim(jl_args['n_samples'],
                                                      eps=jl_args['eps'])

     # estimator object
     return GaussianRandomProjection(n_components=n_components, eps=eps)

コード例 #11

0

ファイルを表示

def r_projection(input_data, no_components=None, e=0.1):
    """Project ``input_data`` with a Gaussian random projection.

    Args:
        input_data: Data to project; ``input_data.shape[0]`` is used as the
            sample count when the number of components must be derived.
        no_components: Target number of components. When ``None`` it is
            computed with johnson_lindenstrauss_min_dim.
        e: Distortion ``eps`` used for that computation.

    Returns:
        The result of ``fit_transform`` on ``input_data``.
    """
    # Identity test: ``== None`` would call ``__eq__`` on the argument.
    if no_components is None:
        no_components = johnson_lindenstrauss_min_dim(
            n_samples=input_data.shape[0], eps=e)

    projected_data = random_projection.GaussianRandomProjection(
        n_components=no_components).fit_transform(input_data)

    return projected_data

コード例 #12

0

ファイルを表示

def test():
    """Run ``__test_transform__`` once with fixed parameters and print it.

    The target dimension ``k`` is the Johnson-Lindenstrauss minimum for
    ``s = 50`` samples at distortion ``miu = 0.3``.

    Raises:
        ValueError: If ``k`` exceeds the source dimension ``d = 1000``.
    """
    s = 50
    d = 1000
    miu = 0.3
    k = johnson_lindenstrauss_min_dim(s, miu)
    if k > d:
        raise ValueError("can't embed into smaller dimension")
    # TODO check the result guarantee of jl and change the 'print' to 'assure'
    # Single-argument call form: valid on Python 3 and prints the same text
    # as the former Python 2 print statement.
    print(__test_transform__(s, d, k, miu, 100))

コード例 #13

0

ファイルを表示

ファイル: jl.py プロジェクト: menisadi/pydp

def test():
    """Run ``__test_transform__`` once with fixed parameters and print it.

    The target dimension ``k`` is the Johnson-Lindenstrauss minimum for
    ``s = 50`` samples at distortion ``miu = 0.3``.

    Raises:
        ValueError: If ``k`` exceeds the source dimension ``d = 1000``.
    """
    s = 50
    d = 1000
    miu = 0.3
    k = johnson_lindenstrauss_min_dim(s, miu)
    if k > d:
        raise ValueError("can't embed into smaller dimension")
    # TODO check the result guarantee of jl and change the 'print' to 'assure'
    # Single-argument call form: valid on Python 3 and prints the same text
    # as the former Python 2 print statement.
    print(__test_transform__(s, d, k, miu, 100))

コード例 #14

0

ファイルを表示

ファイル: NoDimReduction.py プロジェクト: paramoecium/dim_reduction_via_sparse_coding

def reduction(eps, input_x, out_dir):
    """Apply a 50-component Gaussian random projection and dump the result.

    Prints the Johnson-Lindenstrauss bound computed from ``len(input_x[0])``
    and ``eps``, projects the data, and writes one comma-separated line per
    projected row (values rounded to 3 decimals) to ``<out_dir>/projection``.

    Args:
        eps: Distortion passed to the bound and to the projection.
        input_x: Sequence whose first element's length feeds the bound.
        out_dir: Directory that receives the ``projection`` file.

    Returns:
        The projected data.
    """
    jl_bound = random_projection.johnson_lindenstrauss_min_dim(
        len(input_x[0]), eps)
    # One pre-formatted string prints identically on Python 2 and Python 3.
    print('JL bound: {} (eps={})'.format(jl_bound, eps))
    # Keyword arguments: newer scikit-learn releases make ``eps`` keyword-only.
    transformer = random_projection.GaussianRandomProjection(
        n_components=50, eps=eps)
    # NOTE(review): `code` is neither a parameter nor a local of this
    # function; it must come from module scope. Confirm whether `input_x`
    # was intended here.
    data_reduced = transformer.fit_transform(code)
    with open('{}/projection'.format(out_dir), "w") as op:
        for component in data_reduced:
            line = ', '.join(str(round(e, 3)) for e in component)
            op.write(line + '\n')
    return data_reduced

コード例 #15

0

ファイルを表示

 def _get_eps(self, n_samples, n_dims, n_slice=int(1e4)):
     """Return the first eps whose JL bound fits within the kept dimensions.

     Candidate values ``i / n_slice`` are tried for ``i`` from 1 up to
     ``n_slice - 1``. The first one whose Johnson-Lindenstrauss minimum
     dimension is at most ``n_dims * self.keep_rate`` is reported with a
     print and returned. ``-1`` is returned when no candidate qualifies.
     """
     dim_budget = n_dims * self.keep_rate
     for step in range(1, n_slice):
         candidate = step / n_slice
         bound = johnson_lindenstrauss_min_dim(n_samples=n_samples,
                                               eps=candidate)
         if bound > dim_budget:
             continue
         slack = (dim_budget - bound) / dim_budget
         print("rate %.3f, n_dims %d, new_dim %d, dims error rate: %.4f" % (self.keep_rate, n_dims, bound, slack))
         return candidate
     return -1

コード例 #16

0

ファイルを表示

ファイル: random_projections.py プロジェクト: shayanmukhtar/CS_7641_Assgnt3

def determine_min_dim(params, x_data):
    """Plot the JL minimum dimension against eps and save it as a PNG.

    The bound is computed for ``x_data.shape[0]`` samples at distortions
    0.1 through 0.9. ``params['min_dim_graph']`` supplies both the plot
    title and the output file name (with ``".png"`` appended).
    """
    distortions = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    bounds = johnson_lindenstrauss_min_dim(x_data.shape[0], distortions)

    plt.figure()
    plt.plot(distortions, bounds)
    plt.ylabel("Minimum Number of Dimensions")
    plt.xlabel("Distortion EPS")
    graph_name = params['min_dim_graph']
    plt.title(graph_name)
    plt.savefig(graph_name + ".png")

コード例 #17

0

ファイルを表示

ファイル: utils.py プロジェクト: jean3108/Research-Data-Science-Methodology

 def sparseRP(self, parameters):
     """Build a SparseRandomProjection from a parameter mapping.

     ``n_components`` and ``density`` default to ``"auto"`` / ``'auto'`` and
     ``eps`` to ``1e-1`` when the corresponding key is absent. If a
     ``'johnsonRP'`` entry is present, its ``'n_samples'`` and ``'eps'``
     values are used to compute ``n_components`` through
     johnson_lindenstrauss_min_dim, overriding any other value.
     """
     # start from the defaults, then apply whatever the caller supplied
     n_components = "auto"
     if "n_components" in parameters:
         n_components = parameters["n_components"]

     density = 'auto'
     if "density" in parameters:
         density = parameters["density"]

     eps = 1e-1
     if "eps" in parameters:
         eps = parameters["eps"]

     if 'johnsonRP' in parameters:
         jl_args = parameters['johnsonRP']
         n_components = johnson_lindenstrauss_min_dim(jl_args['n_samples'],
                                                      eps=jl_args['eps'])

     # estimator object
     return SparseRandomProjection(n_components=n_components, eps=eps, density=density)

コード例 #18

0

ファイルを表示

ファイル: reduction.py プロジェクト: emanuele-albini/emutils

def dim_reduce(
    points,
    technique='tsne',
    random_state=2021,
    n_components=2,
):
    """Reduce ``points`` to ``n_components`` dimensions.

    Args:
        points: Data to transform.
        technique: One of ``'tsne'``, ``'jlt'`` (Gaussian random projection),
            ``'pca'`` or ``'lle'``.
        random_state: Seed passed to the t-SNE, PCA and LLE estimators; the
            ``'jlt'`` branch does not use it.
        n_components: Target dimensionality.

    Returns:
        The transformed points.

    Raises:
        ValueError: If ``technique`` is not one of the supported names.
    """
    if technique == 'tsne':
        reducer = TSNE(
            n_components=n_components,
            perplexity=30.0,
            early_exaggeration=12.0,
            learning_rate=200.0,
            n_iter=1000,
            n_iter_without_progress=300,
            min_grad_norm=1e-07,
            metric='euclidean',
            init='random',
            verbose=100,
            random_state=random_state,
            method='barnes_hut',
            angle=0.5,
            n_jobs=None,
        )
        return reducer.fit_transform(points)

    if technique == 'jlt':
        # NOTE(review): the label says eps = .99 while the bound is computed
        # with eps = 1 - machine epsilon; confirm which value is intended.
        jl_bound = johnson_lindenstrauss_min_dim(
            len(points), eps=1 - np.finfo(float).eps)
        print('Minimum JL components (eps = .99): ', jl_bound)
        reducer = GaussianRandomProjection(n_components=n_components, eps=.99)
        return reducer.fit_transform(points)

    if technique == 'pca':
        reducer = PCA(
            n_components=n_components,
            copy=True,
            whiten=False,
            svd_solver='auto',
            tol=0.0,
            iterated_power='auto',
            random_state=random_state,
        )
        return reducer.fit_transform(points)

    if technique == 'lle':
        reducer = LocallyLinearEmbedding(
            n_components=n_components,
            random_state=random_state,
        )
        return reducer.fit_transform(points)

    raise ValueError('Invalid technique.')

コード例 #19

0

ファイルを表示

def flastVectorization(dataPoints, reduceDim=True, dim=0, eps=0.33):
    """Vectorize ``dataPoints`` with CountVectorizer, optionally projected.

    Args:
        dataPoints: Input passed to ``CountVectorizer.fit_transform``.
        reduceDim: When false, the full count matrix is returned as is.
        dim: Number of projection components; a value ``<= 0`` means it is
            derived with johnson_lindenstrauss_min_dim from the row count.
        eps: Distortion used for that derivation.

    Returns:
        The count matrix, or its sparse random projection.
    """
    Z_full = CountVectorizer().fit_transform(dataPoints)
    if not reduceDim:
        return Z_full

    if dim <= 0:
        dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
    projector = SparseRandomProjection(n_components=dim)
    return projector.fit_transform(Z_full)

コード例 #20

0

ファイルを表示

ファイル: timeseries_feature.py プロジェクト: liangmuxin/dsbox-ta2

 def fit(self,
         *,
         timeout: float = None,
         iterations: int = None) -> CallResult[None]:
     """Fit a Gaussian random projection on ``self._training_data``.

     The Johnson-Lindenstrauss minimum dimension is computed from
     ``self._x_dim`` and the ``eps`` hyperparameter. When that bound exceeds
     ``self._x_dim`` the projection is created with
     ``n_components=self._x_dim``; otherwise it is created from ``eps``.

     ``timeout`` and ``iterations`` are not used in the lines shown here.

     NOTE(review): no ``return`` statement is visible although the signature
     is annotated ``CallResult[None]``; this excerpt may be truncated.
     """
     eps = self.hyperparams['eps']
     n_components = johnson_lindenstrauss_min_dim(n_samples=self._x_dim,
                                                  eps=eps)
     # NOTE(review): the bound is compared with the same `_x_dim` that was
     # passed as n_samples; confirm this is the intended dimension.
     if n_components > self._x_dim:
         self._model = GaussianRandomProjection(n_components=self._x_dim)
     else:
         self._model = GaussianRandomProjection(eps=eps)
     self._model.fit(self._training_data)

コード例 #21

0

ファイルを表示

def preprocess(X, y):
    """Flatten, scale and randomly project a collection of 3-D arrays.

    Every element of ``X`` is cropped along its third axis to the shortest
    length found, flattened, scaled to [-1, 1] with MinMaxScaler, and then
    reduced with a SparseRandomProjection. The number of components is the
    Johnson-Lindenstrauss minimum for 861 samples at eps=0.1, minus 2000.

    Returns:
        A tuple ``(X, y)`` with both items wrapped in ``np.array``.
    """
    shortest = min(item.shape[2] for item in X)
    flattened = [item[:, :, :shortest].flatten() for item in X]
    X = np.array(flattened)

    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(X)
    X = scaler.transform(X)

    # reduce principal components to improve performance
    recommended_pc = johnson_lindenstrauss_min_dim(861, eps=0.1)
    reduced_pc = 2000
    target_pc = int(recommended_pc - reduced_pc)
    projector = SparseRandomProjection(n_components=target_pc)
    X = projector.fit_transform(X)
    return np.array(X), np.array(y)

コード例 #22

0

ファイルを表示

ファイル: main.py プロジェクト: thomas-schillaci/cs7641assignment3

def jlmd_search(ubs, names):
    """Plot the JL minimum dimension for 40000 samples as eps varies.

    The curve is drawn on a logarithmic y-axis for 1000 eps values between
    0.2 and 0.999. Each value in ``ubs`` is added as a horizontal line from
    x=0 to x=1, and ``names`` supplies the legend entries for those lines.
    """
    epsilons = np.linspace(0.2, 0.999, 1000)
    y = [johnson_lindenstrauss_min_dim(40000, eps) for eps in epsilons]

    plot.style.use('seaborn-darkgrid')
    ax = plot.subplots()[1]
    plot.title('Influence of epsilon on the minimum number of dimensions')
    plot.semilogy(epsilons, y)
    for upper_bound in ubs:
        plot.semilogy([0, 1], [upper_bound, upper_bound])
    legend_entries = ['Minimum number of dimensions'] + list(names)
    plot.legend(legend_entries, loc='upper right')
    plot.show()

コード例 #23

0

ファイルを表示

ファイル: reduce.py プロジェクト: anmol6536/scprep

 def fit(self, X):
     """Fit the parent estimator, resolving ``n_components == -1`` first.

     When ``self.n_components`` is ``-1`` it is replaced, through the
     parent's ``set_params``, by the Johnson-Lindenstrauss minimum dimension
     for ``X.shape[0]`` samples at ``self.eps``.

     Args:
         X: Data to fit; ``X.shape`` is read.

     Returns:
         Whatever the parent class's ``fit`` returns.

     Raises:
         RuntimeError: If the parent ``fit`` raises ``ValueError`` and the
             target dimension is at least ``X.shape[1]``. The original
             ``ValueError`` is attached as the cause.
         ValueError: Any other ``ValueError`` from the parent ``fit``.
     """
     if self.n_components == -1:
         super().set_params(
             n_components=random_projection.johnson_lindenstrauss_min_dim(
                 n_samples=X.shape[0], eps=self.eps))
     try:
         return super().fit(X)
     except ValueError as e:
         if self.n_components >= X.shape[1]:
             # Chain explicitly so the underlying error stays visible.
             raise RuntimeError("eps={} and n_samples={} lead to a target "
                                "dimension of {} which is larger than the "
                                "original space with n_features={}".format(
                                    self.eps, X.shape[0], self.n_components,
                                    X.shape[1])) from e
         raise

コード例 #24

0

ファイルを表示

def data_JL_proj_data_diff(data):
    """Project ``data`` with a sparse random projection and return its pdist.

    The Johnson-Lindenstrauss minimum dimension for ``len(data)`` rows at
    eps=0.1 is printed for information. The projection itself is built with
    SparseRandomProjection's default arguments, i.e. the transformer acts as
    the random matrix R in ``output = input * R``.

    Returns:
        ``sp.pdist`` of the projected data.
    """
    row_count = len(data)
    # smallest target dimension the JL lemma suggests for eps = 0.1
    suggested_dim = random_projection.johnson_lindenstrauss_min_dim(
        row_count, eps=0.1)
    print("min dim suggested by JL lemma with eps = 0.1 is " + str(suggested_dim))

    # fit the random matrix and project the input in one step
    projected = random_projection.SparseRandomProjection().fit_transform(data)
    shape_text = str(np.shape(projected))
    print(
        "new data dimensions after projection according to user provided target data dimension: "
        + shape_text)
    print("\n\n")
    return sp.pdist(projected)

コード例 #25

0

ファイルを表示

ファイル: main.py プロジェクト: GavinNishizawa/crohme-experimentation

def train_drfs(train_x, train_y, eps=0.5, threshold="median"):
    """Train a dimensionality-reduction plus feature-selection pipeline.

    The reduction stage (quantile scaling, feature agglomeration, robust
    scaling) is fitted on ``train_x``. An ExtraTreesClassifier is then fitted
    on the reduced data, and its importances drive a prefit SelectFromModel.

    Args:
        train_x: Training features.
        train_y: Training labels.
        eps: Distortion for the Johnson-Lindenstrauss bound that sets the
            number of agglomerated features (capped at the feature count).
        threshold: Threshold passed to SelectFromModel.

    Returns:
        A Pipeline of the fitted reduction stage followed by the selector.
    """
    n_samples, n_features, _n_classes = get_counts_tt(train_x, train_y)

    # number of components: JL bound, never more than the features available
    jl_bound = random_projection.johnson_lindenstrauss_min_dim(
        n_samples=n_samples, eps=eps)
    n_clusters = min(jl_bound, n_features)

    # scale, agglomerate down to n_clusters, then scale again
    reduction_steps = [
        ('scaler', preprocessing.QuantileTransformer()),
        ('feat_agg', cluster.FeatureAgglomeration(n_clusters=n_clusters)),
        ('scaler2', preprocessing.RobustScaler()),
    ]
    dr_pipe = pipeline.Pipeline(reduction_steps)
    dr_pipe.fit(train_x)

    # the tree ensemble is trained on the reduced representation
    reduced_x = dr_pipe.transform(train_x)
    xtc = ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
    xtc.fit(reduced_x, train_y)

    importances = xtc.feature_importances_
    print("Feature importances:")
    print("\tMax:", max(importances))
    print("\tMin:", min(importances))

    # feature selection driven by the already-fitted ensemble
    feat_sel = feature_selection.SelectFromModel(
        xtc, prefit=True, threshold=threshold)

    # reduce dimensionality first, then select features
    return pipeline.Pipeline([('dr_pipe', dr_pipe), ('feat_sel', feat_sel)])

コード例 #26

0

ファイルを表示

ファイル: pyDRMetrics.py プロジェクト: mtsai101/Multi-level-Feature-Driven-Storage-Server

 def plot_JL_curve(self):
     '''
     Plot the Johnson-Lindenstrauss minimum dimension against the maximum
     distortion rate for ``len(self.X)`` samples.

     The figure is shown with ``plt.show()`` and the value of
     ``plt2base64(plt)`` is returned.
     '''
     fig = plt.figure(figsize=(6, 4))
     distortions = np.linspace(0.01, 0.99, 100)
     sample_size = len(self.X)
     bounds = johnson_lindenstrauss_min_dim(n_samples=sample_size,
                                            eps=distortions)
     plt.plot(distortions, bounds)
     plt.xlabel('maximum distortion rate', fontsize=16)
     plt.ylabel('mimimum dimensions to keep', fontsize=16)
     plt.ylim(0, 20000)
     heading = (
         'johnson_lindenstrauss_min_dim vs max_distortion_rate \nsample size = '
         + str(sample_size))
     plt.title(heading, fontsize=16)
     plt.show()
     return plt2base64(plt)

コード例 #27

0

ファイルを表示

def rp(X_train, X_test):
    """Compare sparse random projections of several sizes to a baseline.

    Prints the Johnson-Lindenstrauss minimum dimension for the training set
    (eps=0.1), trains a LinearSVC on the unprojected features as a baseline,
    then for each projection size fits a SparseRandomProjection plus a
    LinearSVC and records the test accuracy. The accuracies are plotted
    against the baseline.

    NOTE(review): ``y_train`` and ``y_test`` are not parameters or locals;
    they must exist in an enclosing scope.

    Args:
        X_train: Training feature matrix (``.shape`` is read).
        X_test: Test feature matrix.

    Returns:
        None; the function only prints and plots.
    """
    num_components = johnson_lindenstrauss_min_dim(n_samples=X_train.shape[0],
                                                   eps=0.1)
    print(num_components)
    print("# features: ", X_train.shape[1], " JL min dim:", num_components)
    # This message is printed unconditionally; no comparison is made here.
    print("JL number > #features so cant make any JL guarentees")
    # Exceeding the JL bound only means no assumption can be made about the
    # preservation of pairwise distances between data points.

    accuracies = []
    # 20 evenly spaced values from 2 to 64, cast to int32
    components = np.int32(np.linspace(2, 64, 20))

    # baseline: classifier trained on the unprojected features
    model = LinearSVC()
    model.fit(X_train, y_train)
    baseline = metrics.accuracy_score(model.predict(X_test), y_test)

    # loop over the projection sizes
    for comp in components:
        # create the random projection (no random_state is passed)
        sp = SparseRandomProjection(n_components=comp)
        X = sp.fit_transform(X_train)

        # train a classifier on the sparse random projection
        model = LinearSVC()
        model.fit(X, y_train)

        # evaluate the model and update the list of accuracies
        test = sp.transform(X_test)
        accuracies.append(metrics.accuracy_score(model.predict(test), y_test))

    # create the figure
    plt.figure()
    plt.title("Accuracy of Sparse Projection on Sonar")
    plt.xlabel("# of Components")
    plt.ylabel("Accuracy")
    plt.xlim([2, 64])
    plt.ylim([0, 1.0])

    # plot the baseline (flat red line) and random projection accuracies
    plt.plot(components, [baseline] * len(accuracies), color="r")
    plt.plot(components, accuracies)

    plt.show()

コード例 #28

0

ファイルを表示

def preparation(inputFile, dim=0):
    """Vectorize a file of test cases and project it to a lower dimension.

    Each line of ``inputFile`` (trailing newline stripped) is one test case.
    The lines are hashed with HashingVectorizer and reduced with a
    SparseRandomProjection.

    Args:
        inputFile: Path of the text file to read.
        dim: Number of projection components; a value ``<= 0`` means it is
            derived with johnson_lindenstrauss_min_dim (eps=0.5) from the
            number of test cases.

    Returns:
        A list with one dict per test case mapping each non-zero column
        index of the projected row to its value.
    """
    vectorizer = HashingVectorizer()  # compute "TF"
    # Context manager so the file handle is closed deterministically.
    with open(inputFile) as handle:
        testCases = [line.rstrip("\n") for line in handle]
    testSuite = vectorizer.fit_transform(testCases)

    # dimensionality reduction
    if dim <= 0:
        e = 0.5  # epsilon in jl lemma
        dim = johnson_lindenstrauss_min_dim(len(testCases), eps=e)
    srp = SparseRandomProjection(n_components=dim)
    projectedTestSuite = srp.fit_transform(testSuite)

    # map sparse matrix to dict
    TS = []
    for i in range(len(testCases)):
        tc = {}
        for j in projectedTestSuite[i].nonzero()[1]:
            tc[j] = projectedTestSuite[i, j]
        TS.append(tc)

    return TS

コード例 #29

0

ファイルを表示

ファイル: ClusterDimRedFeatTransf.py プロジェクト: manimalakumar/ML-Unrevealed

def checkOptimaldimensionality(s):
    """Plot Johnson-Lindenstrauss bounds against the number of samples.

    One log-log curve is drawn per distortion value (10 values between 0.1
    and 0.99).

    Args:
        s: Number of sample counts generated between 10**1 and 10**4.
    """
    # range of distortions
    eps_range = np.linspace(0.1, 0.99, 10)
    colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

    # range of number of samples (observation) to embed
    n_samples_range = np.logspace(1, 4, s)

    plt.figure()
    for eps, color in zip(eps_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples_range,
                                                         eps=eps)
        plt.loglog(n_samples_range, min_n_components, color=color)

    # The legend, labels and title used to be set twice in a row; only the
    # second, effective set is kept.
    plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
    plt.xlabel("Number of observations to eps-embed")
    plt.ylabel("Minimum number of dimensions")
    plt.title(
        "Johnson-Lindenstrauss bounds:\nn_samples vs n_components w.r.t eps")

コード例 #30

0

ファイルを表示

ファイル: bench_random_projections.py プロジェクト: turian/scikit-learn

        sys.exit(1)
    opts.n_components = type_auto_or_int(opts.n_components)
    opts.density = type_auto_or_float(opts.density)
    selected_transformers = opts.selected_transformers.split(",")

    ###########################################################################
    # Generate dataset
    ###########################################################################
    n_nonzeros = int(opts.ratio_nonzeros * opts.n_features)

    print("Dataset statics")
    print("===========================")
    print("n_samples \t= %s" % opts.n_samples)
    print("n_features \t= %s" % opts.n_features)
    if opts.n_components == "auto":
        print("n_components \t= %s (auto)" % johnson_lindenstrauss_min_dim(n_samples=opts.n_samples, eps=opts.eps))
    else:
        print("n_components \t= %s" % opts.n_components)
    print("n_elements \t= %s" % (opts.n_features * opts.n_samples))
    print("n_nonzeros \t= %s per feature" % n_nonzeros)
    print("ratio_nonzeros \t= %s" % opts.ratio_nonzeros)
    print("")

    ###########################################################################
    # Set transformer input
    ###########################################################################
    transformers = {}

    ###########################################################################
    # Set GaussianRandomProjection input
    gaussian_matrix_params = {"n_components": opts.n_components, "random_state": opts.random_seed}

コード例 #31

0

ファイルを表示

ファイル: a3.py プロジェクト: rkaufholz3/a3

def rp_analysis(X, y, dataset, plot, X_test):
    """Explore Gaussian random projections of ``X`` and project train/test.

    Steps, in order:
      1. If ``plot`` is truthy, project to 2 and 3 components and hand the
         results to ``plot_2d`` / ``plot_3d`` together with ``y``.
      2. Print the Johnson-Lindenstrauss minimum dimension for several eps.
      3. Repeat a 10-component projection 10 times, recording the mean,
         standard deviation and mean kurtosis of each run, then plot them.
      4. Project ``X`` and ``X_test`` onto 331 components.

    Args:
        X: Training feature matrix.
        y: Labels; only used by the optional 2D/3D plots.
        dataset: Not used in this function body.
        plot: Whether to draw the 2D/3D projection plots.
        X_test: Test feature matrix, transformed with the final projection.

    Returns:
        Tuple ``(projected2_train, projected2_test)``.
    """

    if plot:
        # Project in 2D for visualization
        rp = GaussianRandomProjection(n_components=2)
        projected = rp.fit_transform(X)
        plot_2d(projected, y)

        # Project in 3D for visualization
        rp = GaussianRandomProjection(n_components=3)
        projected = rp.fit_transform(X)
        plot_3d(projected, y)

    # # Plot eps vs. n components
    # eps_range = [0.4, 0.6, 0.8, 0.99]  # For Fashion MNIST eps 0.4 to 0.999 (must be < 1)
    # num_components = []
    # for eps in eps_range:
    #     rp = GaussianRandomProjection(n_components='auto', eps=eps)
    #     projected = rp.fit_transform(X)
    #     num_components.append(projected.shape)
    # print(num_components)

    # Determine min components for varying eps
    min_dims = []
    eps_range = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]
    for e in eps_range:
        min_dims.append(johnson_lindenstrauss_min_dim(n_samples=X.shape[0], eps=e))
    print('\nmin dims', min_dims)
    print('\nX shape:', X.shape)

    # Measure variation across multiple runs (no random_state is passed, so
    # each run draws a new projection)
    means_list = []
    stdev_list = []
    kurtosis_list = []
    iterations = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    for i in iterations:
        rp3 = GaussianRandomProjection(n_components=10)  # 10 components to help visualize the variation
        projected3 = rp3.fit_transform(X)
        means_list.append(np.mean(projected3))
        stdev_list.append(np.std(projected3))
        # presumably scipy.stats.kurtosis (per column), averaged - TODO confirm
        kurtosis_list.append(np.mean(kurtosis(projected3)))
        projected_df = pd.DataFrame(projected3)
        # same path on every iteration: the file is overwritten each time
        projected_df.to_csv('projected.csv')
        print(plot_kurtosis(projected3))

    # Two y-axes because kurtosis is plotted separately from mean/std; see
    # http://kitchingroup.cheme.cmu.edu/blog/2013/09/13/Plotting-two-datasets-with-very-different-scales/
    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    ax1.plot(iterations, means_list, label='Mean', color='red')
    ax1.plot(iterations, stdev_list, label='Std Deviation', color='blue')
    ax1.set_xlabel('Iteration', fontsize=18)
    ax1.legend()
    ax2 = ax1.twinx()
    ax2.plot(iterations, kurtosis_list, label='Kurtosis', color='green')
    plt.legend()
    plt.show()

    # print('\ncomponents_ shape:', rp3.components_.shape)

    # Project on to an 'optimal' number of components (hard-coded to 331)
    rp2 = GaussianRandomProjection(n_components=331)
    projected2_train = rp2.fit_transform(X)
    projected2_test = rp2.transform(X_test)
    print('\nRP projected X_train:', projected2_train.shape)

    return projected2_train, projected2_test

コード例 #32

0

ファイルを表示

from sklearn import  datasets, metrics, decomposition, random_projection
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn.model_selection import train_test_split, validation_curve, learning_curve, ShuffleSplit,GridSearchCV
from sklearn.cross_validation import StratifiedKFold, cross_val_score
from functions import *

# Load the digits dataset and split it into features and labels.
data = datasets.load_digits()
X = data.data
y = data.target

# JL minimum dimension for 1797 samples at eps=0.1; the result is discarded
# here (it is only displayed when run interactively).
johnson_lindenstrauss_min_dim(1797,eps=0.1)

accuracies = []
# candidate projection sizes: 2 up to (but excluding) the feature count
components = range(2,X.shape[1])

# hold out 33% of the data for testing, with a fixed seed
split = train_test_split(X, y, test_size = 0.33,
    random_state = 42)
#digits = datasets.load_digits()
#split = train_test_split(digits.data, digits.target, test_size = 0.3,
#    random_state = 42)
(trainData, testData, trainTarget, testTarget) = split

# Baseline accuracy of a linear SVM on the unprojected features.
# NOTE(review): LinearSVC is not among the explicit imports shown above;
# presumably it comes from `from functions import *` - confirm.
model = LinearSVC()
model.fit(trainData, trainTarget)
baseline = metrics.accuracy_score(model.predict(testData), testTarget)

コード例 #33

0

ファイルを表示

colnames = [
    "make", "address", "all", "3d", "our", "over", "remove", "internet",
    "order", "mail", "receive", "will", "people", "report", "addresses",
    "free", "business", "email", "you", "credit", "your", "font", "000",
    "money", "hp", "hpl", "george", "650", "lab", "labs", "telnet", "857",
    "data", "415", "85", "technology", "1999", "parts", "pm", "direct", "cs",
    "meeting", "original", "project", "re", "edu", "table", "conference", ";",
    "(", "[", "!", "$", "#", "average", "longest", "total", "class"
]

data.columns = colnames
X, y = data.iloc[:, :-1], data.iloc[:, -1]
X.columns = colnames[:len(colnames) - 1]

print johnson_lindenstrauss_min_dim(4601, eps=0.1)

split = train_test_split(X, y, test_size=0.3, random_state=42)
(trainData, testData, trainTarget, testTarget) = split
accuracies = []
components = np.int32(np.linspace(2, 56, 14))
model = LinearSVC()
model.fit(trainData, trainTarget)
baseline = metrics.accuracy_score(model.predict(testData), testTarget)
# loop over the projection sizes
for comp in components:
    # create the random projection
    sp = SparseRandomProjection(n_components=comp)
    X = sp.fit_transform(trainData)

    # train a classifier on the sparse random projection

コード例 #34

0

ファイルを表示

ファイル: 02b_reduction_rand_proj.py プロジェクト: jagrusy/UnsupervisedLearning

def johnson_lindenstrauss(data, data_name):
    """Plot Johnson-Lindenstrauss bounds and measure projection distortion.

    Part 1 draws two theoretical plots from ``johnson_lindenstrauss_min_dim``
    (minimum dimensions vs. number of samples, and vs. distortion ``eps``).
    Part 2 projects ``data`` with ``SparseRandomProjection`` for several
    target dimensions and, for each one, plots how the pairwise squared
    distances change.  Every figure is saved under ``Figs/`` and then closed,
    so repeated calls do not accumulate open matplotlib figures.

    Parameters
    ----------
    data : array-like with a two-entry ``shape``
        Samples to project; passed to ``euclidean_distances`` and to
        ``SparseRandomProjection.fit_transform``.
    data_name : str
        Label inserted into the file names of the per-dataset figures (any
        value accepted by ``str.format`` works).

    Returns
    -------
    None
    """
    # `normed` is being deprecated in favor of `density` in histograms
    if LooseVersion(matplotlib.__version__) >= '2.1':
        density_param = {'density': True}
    else:
        density_param = {'normed': True}

    # Part 1: plot the theoretical dependency between n_components_min and
    # n_samples

    # range of admissible distortions
    eps_range = np.linspace(0.1, 0.99, 5)
    colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

    # range of number of samples (observation) to embed
    n_samples_range = np.logspace(1, 9, 9)

    fig = plt.figure()
    for eps, color in zip(eps_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
        plt.loglog(n_samples_range, min_n_components, color=color)

    plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
    plt.xlabel("Number of observations to eps-embed")
    plt.ylabel("Minimum number of dimensions")
    plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
    plt.savefig('Figs/02b_rp_comp_samples')
    # The figure is on disk now; release it instead of leaving it open.
    plt.close(fig)

    # range of admissible distortions
    eps_range = np.linspace(0.01, 0.99, 100)

    n_samples_range = np.logspace(2, 6, 5)
    colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range)))

    fig = plt.figure()
    for n_samples, color in zip(n_samples_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)
        plt.semilogy(eps_range, min_n_components, color=color)

    plt.legend(["n_samples = %d" % n for n in n_samples_range], loc="upper right")
    plt.xlabel("Distortion eps")
    plt.ylabel("Minimum number of dimensions")
    plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
    plt.savefig('Figs/02b_rp_comp_eps')
    plt.close(fig)

    # Part 2: perform sparse random projection of some digits images which are
    # quite low dimensional and dense or documents of the 20 newsgroups dataset
    # which is both high dimensional and sparse

    n_samples, n_features = data.shape
    print("Embedding %d samples with dim %d using various random projections"
        % (n_samples, n_features))

    n_components_range = np.array([1,10,100,1000])
    dists = euclidean_distances(data, squared=True).ravel()

    # select only non-identical samples pairs
    nonzero = dists != 0
    dists = dists[nonzero]

    for n_components in n_components_range:
        t0 = time()
        rp = SparseRandomProjection(n_components=n_components)
        projected_data = rp.fit_transform(data)
        print("Projected %d samples from %d to %d in %0.3fs"
            % (n_samples, n_features, n_components, time() - t0))
        if hasattr(rp, 'components_'):
            n_bytes = rp.components_.data.nbytes
            n_bytes += rp.components_.indices.nbytes
            print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6))

        # Reuse the mask computed on the original distances so both arrays
        # describe the same sample pairs.
        projected_dists = euclidean_distances(
            projected_data, squared=True).ravel()[nonzero]

        fig = plt.figure()
        plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu)
        plt.xlabel("Pairwise squared distances in original space")
        plt.ylabel("Pairwise squared distances in projected space")
        plt.title("Pairwise distances distribution for n_components=%d" %
                n_components)
        cb = plt.colorbar()
        cb.set_label('Sample pairs counts')

        rates = projected_dists / dists
        print("Mean distances rate: %0.2f (%0.2f)"
            % (np.mean(rates), np.std(rates)))
        plt.savefig('Figs/02b_rp_pwdist_{}_{}'.format(data_name, n_components))
        plt.close(fig)

        fig = plt.figure()
        plt.hist(rates, bins=50, range=(0., 2.), edgecolor='k', **density_param)
        plt.xlabel("Squared distances rate: projected / original")
        plt.ylabel("Distribution of samples pairs")
        plt.title("Histogram of pairwise distance rates for n_components=%d" %
                n_components)
        plt.savefig('Figs/02b_rp_histogram_{}_{}'.format(data_name, n_components))
        # Closing (rather than only clearing) frees the figure entirely.
        plt.close(fig)

コード例 #35

0

ファイルを表示

ファイル: real_rand_proj_data.py プロジェクト: m0r17z/thesis

def generate_real_dataset_rp(data_path, sparse=False, eps=0.1):
    """Build a random-projected HDF5 dataset from the raw q-point recordings.

    Reads ``samples.txt``, ``labels.txt`` and ``annotations.txt`` from
    ``data_path``, drops malformed samples, rasterises every remaining sample
    into a 3-D grid, projects the flattened grid with a random projection and
    stores the result (plus labels and annotations) in an HDF5 file.  Progress
    messages are printed and also appended to ``<data_path>/rp_out``.  Finally
    ``generate_train_val_test_set`` is called on the file that was written.

    Parameters
    ----------
    data_path : str
        Directory holding the three input text files; all outputs are written
        there as well.
    sparse : bool, optional
        If true, use ``SparseRandomProjection`` and write
        ``rp_real_sparse.hdf5``; otherwise use ``GaussianRandomProjection``
        and write ``rp_real_gauss.hdf5``.
    eps : float, optional
        Distortion passed to ``johnson_lindenstrauss_min_dim`` to choose the
        number of projected dimensions.

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): this is Python 2 code (``print`` statements, ``unicode``)
    and the grid arithmetic appears to rely on ``/`` being integer division
    -- confirm before running it under Python 3.
    """
    ################################################ LOADING AND CLEANING THE DATA #########################################
    samples = open(os.path.join(data_path, 'samples.txt'))
    labels = open(os.path.join(data_path, 'labels.txt'))
    annotations = open(os.path.join(data_path, 'annotations.txt'))
    # 'w' truncates any previous log; later writes reopen it in append mode.
    out_f = open(os.path.join(data_path,'rp_out'),'w')

    bad_samples = []
    real_labels = []
    qpoint_lists = []
    label_list = []
    annotation_list = []
    # One counter per label value; column 0 counts the label-0 samples.
    label_count = np.zeros((1,13))

    # Each loop keeps only the split of the LAST line it reads; presumably
    # every input file holds a single ';'-separated line -- TODO confirm.
    for data in samples:
        qpoint_lists = data.split(';')
    for data in labels:
        label_list = data.split(';')
    for data in annotations:
        annotation_list = data.split(';')

    out_s = 'found %i qpoint lists.\n' % len(qpoint_lists) + 'found %i labels.\n' % len(label_list) + 'found %i annotations.\n\n' % len(annotation_list)
    print out_s
    out_f.write(out_s)
    out_f.close()

    for list_ind in np.arange(len(qpoint_lists)):
        bad = False

        ################# PROCESS THE LABELS
        # Annotations whose first two characters are none of 'vo', 'fl', 'mi',
        # 'ja' get label 0; the others are labelled from their position.
        if annotation_list[list_ind][0:2] != 'vo' and annotation_list[list_ind][0:2] != 'fl' and annotation_list[list_ind][0:2] != 'mi' and annotation_list[list_ind][0:2] != 'ja':
            real_labels.append(0)
            label_count[0][0] += 1
        else:
            position = label_list[list_ind].split(',')
            # A first coordinate of -2000 or -1000 marks the sample as bad;
            # presumably these are sentinel values for a missing position --
            # TODO confirm.
            if float(position[0]) == -2000 or float(position[0]) == -1000:
                real_labels.append(-1)
                bad = True
            else:
                # determine_label is defined outside this function; its
                # result is used directly as a column index into label_count.
                lab = determine_label((float(position[0]),float(position[1]),float(position[2])))
                real_labels.append(lab)
                label_count[0][lab] += 1

        ################# PROCESS THE Q-POINTS
        # Split the sample into points (':') and each point into fields (',').
        qpoint_lists[list_ind] = qpoint_lists[list_ind].split(':')
        for point_ind in np.arange(len(qpoint_lists[list_ind])):
            qpoint_lists[list_ind][point_ind] = qpoint_lists[list_ind][point_ind].split(',')
            # Any point without exactly 7 fields makes the whole sample bad.
            if len(qpoint_lists[list_ind][point_ind]) != 7:
                bad = True

        if bad:
            bad_samples.append(list_ind)

    print 'need to remove %i bad samples.' %len(bad_samples)
    ################# REMOVE BAD SAMPLES
    # bad_samples is in ascending order (it was filled while iterating the
    # indices upward), so every earlier pop shifts the later indices down by
    # one; `ind` tracks that shift.
    # NOTE(review): label_list is not purged here; it is not read again below.
    ind = 0
    for bad_ind in bad_samples:
        real_ind = bad_ind - ind
        qpoint_lists.pop(real_ind)
        real_labels.pop(real_ind)
        annotation_list.pop(real_ind)
        ind += 1

    # NOTE(review): label_count was incremented before the q-point check, so a
    # sample rejected only for malformed points is still counted in the
    # "percentages" below -- confirm whether that is intended.
    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = str(len(qpoint_lists)) + ' samples remain after purging.\n' + str(len(real_labels)) + ' labels remain after purging.\n'\
            + str(len(annotation_list)) + ' annotations remain after purging.\n' + 'percentages of the labels are %s\n\n' %str(label_count/len(qpoint_lists))
    print out_s
    out_f.write(out_s)
    out_f.close()

    samples.close()
    labels.close()
    annotations.close()

    ################################################## PROJECTING THE DATA INTO A GRID #####################################
    # pcol counts points that landed in an already occupied cell, ps counts
    # all points that landed inside the grid.
    pcol = 0
    ps = 0

    # ASSUMPTION: relevant area is never less than 0.7 meters and more than 4.4 meters on the x-axis, 2.5 meters to both sides on the y-axis
    # and 2 meters on the z-axis away from the sensors
    bin_cm = 3
    max_x_cm = 440
    min_x_cm = 70
    max_y_cm = 250
    max_z_cm = 200

    # Number of bins along each axis. These values are later used as array
    # dimensions, so `/` must yield integers here (Python 2 semantics).
    x_range = max_x_cm / bin_cm - min_x_cm / bin_cm
    y_range = max_y_cm * 2 / bin_cm
    z_range = max_z_cm / bin_cm

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = 'length of data in original space: %d\n\n' %(x_range*y_range*z_range)
    print out_s
    out_f.write(out_s)
    out_f.close()

    # compute a conservative estimate of the number of latent dimensions required to guarantee the given epsilon
    n_dims = johnson_lindenstrauss_min_dim(len(qpoint_lists),eps)

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = 'number of latent dimensions needed to guarantee %f epsilon is %f\n\n' %(eps, n_dims)
    print out_s
    out_f.write(out_s)
    out_f.close()

    # Create the output file with one dataset each for the projected samples,
    # the labels and the variable-length annotation strings.
    f_path = os.path.join(data_path,'rp_real_sparse.hdf5') if sparse else os.path.join(data_path,'rp_real_gauss.hdf5')
    print f_path
    f = h5.File(f_path, "w")
    f.create_dataset('data_set/data_set', (len(qpoint_lists), n_dims), dtype='f')
    f.create_dataset('labels/real_labels', (len(real_labels),), dtype='i')
    dt = h5.special_dtype(vlen=unicode)
    f.create_dataset('annotations/annotations', (len(annotation_list),), dtype=dt)

    transformer = random_projection.SparseRandomProjection(n_components=n_dims) if sparse else random_projection.GaussianRandomProjection(n_components=n_dims)
    if sparse:
        print 'performing projection with sparse matrix'
    else:
        print 'performing projection with gaussian matrix'

    # this is not the way it's supposed to be done BUT the proper training set doesn't fit into the memory
    # The projection matrix is built directly through a private method instead
    # of fit(), so the samples can be transformed one at a time below.
    transformer.components_ = transformer._make_random_matrix(n_dims, x_range*y_range*z_range)
    last_per = -1

    for ind, qpoint_list in enumerate(qpoint_lists):
        # A fresh, empty grid for every sample.
        grid = np.zeros((x_range, y_range, z_range))

        for qpoint in qpoint_list:
            # Fields 0-2 are scaled by 100 and binned; presumably they are
            # coordinates in metres (converted to cm) -- TODO confirm. The y
            # value is shifted by max_y_cm before binning.
            x = int(float(qpoint[0])*100) / bin_cm
            y = (int(float(qpoint[1])*100) + max_y_cm) / bin_cm
            z = int(float(qpoint[2])*100) / bin_cm
            # Skip points that fall outside the grid.
            if x - min_x_cm/bin_cm < 0 or x - min_x_cm/bin_cm > x_range-1 or y > y_range-1 or y < 0 or z > z_range-1 or z < 0:
                continue
            # Field 4 is the value stored in the cell; presumably a signal
            # power -- TODO confirm. NOTE: `pow` shadows the builtin here.
            pow = float(qpoint[4])
            if grid[x-min_x_cm/bin_cm][y][z] != 0:
                # Cell already occupied: count the collision and keep the
                # larger of the two values.
                pcol += 1
                if grid[x-min_x_cm/bin_cm][y][z] < pow:
                    grid[x-min_x_cm/bin_cm][y][z] = pow
            else:
                grid[x-min_x_cm/bin_cm][y][z] = pow
            ps += 1

        # Flatten the grid to a single row, project it and store the result
        # together with the matching label and annotation.
        f['data_set/data_set'][ind] = transformer.transform(np.reshape(grid,(1,-1)))
        f['labels/real_labels'][ind] = real_labels[ind]
        f['annotations/annotations'][ind] = annotation_list[ind]
        # Log progress only when the integer percentage changes.
        curr_percent = int(float(ind) / len(qpoint_lists) * 100)
        if last_per != curr_percent:
            last_per = curr_percent
            out_f = open(os.path.join(data_path,'rp_out'),'a')
            out_s = 'have now looked at %i%% of the data.\n' % int(float(ind) / len(qpoint_lists) * 100)
            print out_s
            out_f.write(out_s)
            out_f.close()

    print 'done with projecting onto the grid (without binning)'
    # NOTE(review): this division raises ZeroDivisionError when no point fell
    # inside the grid (ps == 0).
    print 'percentage of point collision: ' + str(float(pcol)/ps)
    print 'number of samples: ' +str(len(f['data_set/data_set']))
    print 'dimensionality of the samples: ' +str(len(f['data_set/data_set'][0]))
    print 'number of labels: ' +str(len(f['labels/real_labels']))
    print 'number of annotations: ' +str(len(f['annotations/annotations']))

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = 'projection done, new dimension is %d\n\n' %len(f['data_set/data_set'][0])
    print out_s
    out_f.write(out_s)
    out_f.close()

    f.close()

    # Hand the finished file to generate_train_val_test_set (defined outside
    # this function) together with the path of the file it should produce.
    if sparse:
        generate_train_val_test_set(os.path.join(data_path,"rp_real_sparse.hdf5"), os.path.join(data_path,"train_val_test_rp_real_sparse.hdf5"))
    else:
        generate_train_val_test_set(os.path.join(data_path,"rp_real_gauss.hdf5"), os.path.join(data_path,"train_val_test_rp_real_gauss.hdf5"))

コード例 #36

0

ファイルを表示

ファイル: plot_johnson_lindenstrauss_bound.py プロジェクト: Sapphirine/TV-Analytics

from sklearn.datasets import load_digits
from sklearn.metrics.pairwise import euclidean_distances

# Part 1: plot the theoretical dependency between n_components_min and
# n_samples

# range of admissible distortions: 5 values evenly spaced in [0.1, 0.99]
eps_range = np.linspace(0.1, 0.99, 5)
# one shade of the Blues colormap per distortion value
colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

# range of number of samples (observation) to embed: 10**1 ... 10**9
n_samples_range = np.logspace(1, 9, 9)

# One log-log curve per eps: minimum dimension as a function of n_samples.
plt.figure()
for eps, color in zip(eps_range, colors):
    min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
    plt.loglog(n_samples_range, min_n_components, color=color)

# Legend entries are listed in the same order the curves were drawn.
plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
plt.xlabel("Number of observations to eps-embed")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
plt.show()

# Set-up for the next plot: eps_range, n_samples_range and colors are rebound.

# range of admissible distortions: 100 values evenly spaced in [0.01, 0.99]
eps_range = np.linspace(0.01, 0.99, 100)

# range of number of samples (observation) to embed: 10**2 ... 10**6
n_samples_range = np.logspace(2, 6, 5)
# one shade of the Blues colormap per sample count
colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range)))

コード例 #37

0

ファイルを表示

ファイル: plot_johnson_lindenstrauss_bound.py プロジェクト: AlexanderFabisch/scikit-learn

from sklearn.datasets import load_digits
from sklearn.metrics.pairwise import euclidean_distances

# Part 1: plot the theoretical dependency between n_components_min and
# n_samples

# range of admissible distortions: 5 values evenly spaced in [0.1, 0.99]
eps_range = np.linspace(0.1, 0.99, 5)
# one shade of the Blues colormap per distortion value
colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

# range of number of samples (observation) to embed: 10**1 ... 10**9
n_samples_range = np.logspace(1, 9, 9)

# One log-log curve per eps: minimum dimension as a function of n_samples.
plt.figure()
for eps, color in zip(eps_range, colors):
    min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
    plt.loglog(n_samples_range, min_n_components, color=color)

# Legend entries are listed in the same order the curves were drawn.
plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
plt.xlabel("Number of observations to eps-embed")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")

# Set-up for the next plot: eps_range, n_samples_range and colors are rebound.

# range of admissible distortions: 100 values evenly spaced in [0.01, 0.99]
eps_range = np.linspace(0.01, 0.99, 100)

# range of number of samples (observation) to embed: 10**2 ... 10**6
n_samples_range = np.logspace(2, 6, 5)
# one shade of the Blues colormap per sample count
colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range)))

# Start a new figure for the plot that follows.
plt.figure()

コード例 #38

0

ファイルを表示

ファイル: hw3.py プロジェクト: jezlax/python

def min_features(scaled_data):
    """Print the Johnson-Lindenstrauss minimum dimension for ``scaled_data``.

    Uses ``len(scaled_data)`` as the number of samples and a fixed distortion
    of ``eps=0.1``.  The value is printed; nothing is returned.
    """
    # The parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(johnson_lindenstrauss_min_dim(len(scaled_data), eps=0.1))

コード例 #39

0

ファイルを表示

        sys.exit(1)
    opts.n_components = type_auto_or_int(opts.n_components)
    opts.density = type_auto_or_float(opts.density)
    selected_transformers = opts.selected_transformers.split(',')

    ###########################################################################
    # Generate dataset
    ###########################################################################
    n_nonzeros = int(opts.ratio_nonzeros * opts.n_features)

    print('Dataset statics')
    print("===========================")
    print('n_samples \t= %s' % opts.n_samples)
    print('n_features \t= %s' % opts.n_features)
    if opts.n_components == "auto":
        print('n_components \t= %s (auto)' % johnson_lindenstrauss_min_dim(
            n_samples=opts.n_samples, eps=opts.eps))
    else:
        print('n_components \t= %s' % opts.n_components)
    print('n_elements \t= %s' % (opts.n_features * opts.n_samples))
    print('n_nonzeros \t= %s per feature' % n_nonzeros)
    print('ratio_nonzeros \t= %s' % opts.ratio_nonzeros)
    print('')

    ###########################################################################
    # Set transformer input
    ###########################################################################
    transformers = {}

    ###########################################################################
    # Set GaussianRandomProjection input
    gaussian_matrix_params = {

コード例 #40

0

ファイルを表示

ファイル: bench_random_projections.py プロジェクト: 0664j35t3r/scikit-learn

    opts.n_components = type_auto_or_int(opts.n_components)
    opts.density = type_auto_or_float(opts.density)
    selected_transformers = opts.selected_transformers.split(',')

    ###########################################################################
    # Generate dataset
    ###########################################################################
    n_nonzeros = int(opts.ratio_nonzeros * opts.n_features)

    print('Dataset statics')
    print("===========================")
    print('n_samples \t= %s' % opts.n_samples)
    print('n_features \t= %s' % opts.n_features)
    if opts.n_components == "auto":
        print('n_components \t= %s (auto)' %
              johnson_lindenstrauss_min_dim(n_samples=opts.n_samples,
                                            eps=opts.eps))
    else:
        print('n_components \t= %s' % opts.n_components)
    print('n_elements \t= %s' % (opts.n_features * opts.n_samples))
    print('n_nonzeros \t= %s per feature' % n_nonzeros)
    print('ratio_nonzeros \t= %s' % opts.ratio_nonzeros)
    print('')

    ###########################################################################
    # Set transformer input
    ###########################################################################
    transformers = {}

    ###########################################################################
    # Set GaussianRandomProjection input
    gaussian_matrix_params = {