def test_input_size_jl_min_dim():
    """Exercise ``johnson_lindenstrauss_min_dim`` with array-like arguments.

    A length mismatch between ``n_samples`` and ``eps`` is expected to raise
    ``ValueError``; two arrays of the same 2-D shape are expected to pass.
    """
    three_samples = 3 * [100]
    two_eps = 2 * [0.9]
    with pytest.raises(ValueError):
        johnson_lindenstrauss_min_dim(three_samples, eps=two_eps)

    grid_shape = (10, 10)
    n_samples_grid = np.random.randint(1, 10, size=grid_shape)
    eps_grid = np.full(grid_shape, 0.5)
    johnson_lindenstrauss_min_dim(n_samples_grid, eps=eps_grid)
def test_input_size_jl_min_dim():
    """Check input-size validation of ``johnson_lindenstrauss_min_dim``."""
    mismatched_args = (3 * [100], 2 * [0.9])
    # The mismatch is asserted twice in a row, exactly as the test always did.
    for _ in range(2):
        assert_raises(ValueError, johnson_lindenstrauss_min_dim,
                      *mismatched_args)

    # Same-shaped 2-D inputs must go through without raising.
    n_samples_grid = np.random.randint(1, 10, size=(10, 10))
    eps_grid = np.full((10, 10), 0.5)
    johnson_lindenstrauss_min_dim(n_samples_grid, eps_grid)
def test_input_size_jl_min_dim():
    """Check input-size validation of ``johnson_lindenstrauss_min_dim``."""
    n_samples_list = 3 * [100]
    eps_list = 2 * [0.9]
    # The mismatch is asserted twice in a row, exactly as the test always did.
    for _ in range(2):
        assert_raises(ValueError, johnson_lindenstrauss_min_dim,
                      n_samples_list, eps_list)

    # Same-shaped 2-D inputs must go through without raising.
    n_samples_grid = np.random.randint(1, 10, size=(10, 10))
    eps_grid = 0.5 * np.ones((10, 10))
    johnson_lindenstrauss_min_dim(n_samples_grid, eps_grid)
def plotDependencyEPS():
    """Plot the theoretical dependency between n_components and eps.

    One semilog-y curve is drawn per sample count, then the figure is shown.
    """
    distortions = np.linspace(0.01, 0.99, 100)   # admissible distortions
    sample_counts = np.logspace(2, 6, 5)         # numbers of samples to embed
    palette = pl.cm.Blues(np.linspace(0.3, 1.0, len(sample_counts)))

    pl.figure()
    for count, shade in zip(sample_counts, palette):
        bound = johnson_lindenstrauss_min_dim(count, eps=distortions)
        pl.semilogy(distortions, bound, color=shade)

    legend_labels = ["n_samples = %d" % count for count in sample_counts]
    pl.legend(legend_labels, loc="upper right")
    pl.xlabel("Distortion eps")
    pl.ylabel("Minimum number of dimensions")
    pl.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
    pl.show()
def plot_jl_bounds(label, X):
    """Plot Johnson-Lindenstrauss bounds and mark the shape of ``X`` on them.

    Adapted from
    http://scikit-learn.org/stable/auto_examples/plot_johnson_lindenstrauss_bound.html#sphx-glr-auto-examples-plot-johnson-lindenstrauss-bound-py

    Args:
        label: Dataset name; used in the title and in the output file name
            (spaces are replaced by dashes).
        X: Array whose ``shape`` supplies the dashed reference lines.
    """
    print("calculating jl bounds")

    # Only one range of distortions is plotted, so no outer loop is needed.
    distortions = np.linspace(0.2, 0.99, 5)
    # range of number of samples (observation) to embed
    n_samples_range = np.linspace(100, 6000, 5)
    palette = plt.cm.Blues(np.linspace(0.3, 1.0, len(distortions)))

    plt.figure()
    for distortion, shade in zip(distortions, palette):
        bound = johnson_lindenstrauss_min_dim(n_samples_range, eps=distortion)
        plt.plot(n_samples_range, bound, color=shade)

    plt.legend(["eps = %0.1f" % value for value in distortions], loc="best")
    plt.xlabel("Number of observations to eps-embed")
    plt.ylabel("Minimum number of dimensions")
    plt.title("Johnson-Lindenstrauss bounds:\n%s Data" % (label))

    # Dashed guides at the data's own feature count and sample count.
    guide_style = dict(color='r', linestyle='--', alpha=0.3)
    plt.axhline(y=X.shape[1], **guide_style)
    plt.axvline(x=X.shape[0], **guide_style)

    plt.gcf()
    out_name = '%s-jlbounds.png' % (label.replace(" ", "-"))
    plt.savefig(out_name)
    plt.close()
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    """Fit a Gaussian random projection on ``self._training_data``.

    The Johnson-Lindenstrauss bound for ``self._x_dim`` samples and the
    ``eps`` hyper-parameter is computed first. If that bound exceeds
    ``self._y_dim`` the projection is built with ``n_components=self._y_dim``
    explicitly; otherwise the projection is built from ``eps``, falling back
    to the estimator's default settings when fitting with ``eps`` fails.

    Args:
        timeout: Accepted for interface compatibility; not used here.
        iterations: Accepted for interface compatibility; not used here.

    Returns:
        ``CallResult(None, has_finished=True)`` once the model is fitted.
    """
    eps = self.hyperparams['eps']
    n_components = johnson_lindenstrauss_min_dim(n_samples=self._x_dim, eps=eps)
    _logger.info("[INFO] n_components is " + str(n_components))

    if n_components > self._y_dim:
        # Default n_components == 'auto' fails. Need to explicitly assign n_components
        self._model = GaussianRandomProjection(
            n_components=self._y_dim, random_state=self.random_seed)
        self._model.fit(self._training_data)
    else:
        self._model = GaussianRandomProjection(
            eps=eps, random_state=self.random_seed)
        try:
            self._model.fit(self._training_data)
        except Exception:
            # Catch Exception rather than everything, so KeyboardInterrupt
            # and SystemExit still propagate; log the cause for diagnosis.
            _logger.info(
                "[Warning] Using given eps value failed, will use default conditions.",
                exc_info=True,
            )
            # Keep the seed on the fallback so results stay reproducible.
            self._model = GaussianRandomProjection(
                random_state=self.random_seed)
            self._model.fit(self._training_data)

    self._fitted = True
    return CallResult(None, has_finished=True)
def reduce_dimensions(data, random_state, target_dim=2):
    """
    Project the data down to ``target_dim`` dimensions in up to three stages.

    A sparse random projection is applied first when the feature count is
    above both the Johnson–Lindenstrauss limit (eps=0.3) and the PCA limit,
    then PCA when more than 30 features remain, and finally UMAP.
    Works for NumPy arrays.

    Args:
        data: The input data.
        random_state: Random state to generate reproducible results.
        target_dim: The targeted dimension.

    Returns:
        Lower dimension representation of the data.
    """
    pca_limit = 30
    jl_limit = johnson_lindenstrauss_min_dim(n_samples=data.shape[0], eps=.3)

    # Stage 1: random projection, only when it actually shrinks the data
    # and PCA would still be needed afterwards.
    if data.shape[1] > max(jl_limit, pca_limit):
        projector = SparseRandomProjection(n_components=jl_limit,
                                           random_state=random_state)
        data = projector.fit_transform(data)

    # Stage 2: PCA down to the fixed limit.
    if data.shape[1] > pca_limit:
        pca = PCA(n_components=pca_limit, random_state=random_state)
        data = pca.fit_transform(data)

    # Stage 3: UMAP to the requested dimensionality.
    embedder = UMAP(n_components=target_dim,
                    n_neighbors=30,
                    min_dist=0.0,
                    random_state=random_state)
    return embedder.fit_transform(data)
def plotDependencyComponents():
    """Plot the theoretical dependency between n_samples and n_components.

    One log-log curve is drawn per distortion value, then the figure is shown.
    """
    distortions = np.linspace(0.1, 0.99, 5)      # admissible distortions
    palette = pl.cm.Blues(np.linspace(0.3, 1.0, len(distortions)))
    sample_counts = np.logspace(1, 9, 9)         # numbers of samples to embed

    pl.figure()
    for distortion, shade in zip(distortions, palette):
        bound = johnson_lindenstrauss_min_dim(sample_counts, eps=distortion)
        pl.loglog(sample_counts, bound, color=shade)

    legend_labels = ["eps = %.1f" % distortion for distortion in distortions]
    pl.legend(legend_labels, loc="lower right")
    pl.xlabel("Number of observations to eps-embed")
    pl.ylabel("Minimum number of dimensions")
    pl.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
    pl.show()
def rp(X_train, X_test):
    """Compare sparse random projections of 1..19 components with a baseline.

    Prints the Johnson-Lindenstrauss bound for eps=0.1, plots LinearSVC
    accuracy per projection size against the un-projected baseline, then
    draws silhouette plots for KMeans (2 and 3 clusters) on a 5-component
    projection. Labels come from the module-level ``y_train`` / ``y_test``.
    """
    num_components = johnson_lindenstrauss_min_dim(n_samples=X_train.shape[0], eps=0.1)
    print(num_components)
    print("# features: ", X_train.shape[1], " JL min dim:", num_components)
    # Without the JL guarantee nothing can be assumed about how well pairwise
    # distances between data points are preserved.
    print("JL number > #features so cant make any JL guarentees")

    components = np.int32(np.linspace(1, 19, 19))

    # Baseline: classifier trained on the original feature space.
    baseline_clf = LinearSVC()
    baseline_clf.fit(X_train, y_train)
    baseline = metrics.accuracy_score(baseline_clf.predict(X_test), y_test)

    def projected_accuracy(n_comp):
        """Return test accuracy of a LinearSVC trained on an n_comp projection."""
        projector = SparseRandomProjection(n_components=n_comp)
        train_proj = projector.fit_transform(X_train)
        # TODO this is wrong.. needs to be KMeans
        clf = LinearSVC(max_iter=1000)
        clf.fit(train_proj, y_train)
        test_proj = projector.transform(X_test)
        return metrics.accuracy_score(clf.predict(test_proj), y_test)

    accuracies = [projected_accuracy(comp) for comp in components]

    plt.figure()
    plt.title("Accuracy of Sparse Random Projection on Churn")
    plt.xlabel("# of Components")
    plt.ylabel("Accuracy")
    plt.xlim([1, 20])
    plt.ylim([0, 1.0])
    # baseline first (red), then the per-size accuracies
    plt.plot(components, [baseline] * len(accuracies), color="r")
    plt.plot(components, accuracies)
    plt.show()

    # average looks to be around 5 components in RP to best the baseline
    X_transformed = SparseRandomProjection(n_components=5).fit_transform(X_train)
    for n_clusters, title in ((2, "SRP(5) KM(2)"), (3, "SRP(5) KM(3)")):
        km = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10,
                    max_iter=300, random_state=RAND)
        plot_silhouette(km, X_transformed, title=title)
def gaussianRP(self, parameters):
    """Build a ``GaussianRandomProjection`` from a parameter mapping.

    ``n_components`` defaults to ``"auto"`` and ``eps`` to ``1e-1``. When a
    ``'johnsonRP'`` entry is present, ``n_components`` is replaced by the
    Johnson-Lindenstrauss bound computed from its ``'n_samples'`` and
    ``'eps'`` values.
    """
    n_components = parameters.get("n_components", "auto")
    eps = parameters.get("eps", 1e-1)

    if 'johnsonRP' in parameters:
        jl_settings = parameters['johnsonRP']
        n_components = johnson_lindenstrauss_min_dim(
            jl_settings['n_samples'], eps=jl_settings['eps'])

    return GaussianRandomProjection(n_components=n_components, eps=eps)
def r_projection(input_data, no_components=None, e=0.1):
    """Project ``input_data`` with a Gaussian random projection.

    Args:
        input_data: Data to project; ``input_data.shape[0]`` is used as the
            number of samples when the target dimension is derived.
        no_components: Target dimension. When ``None`` it is set to the
            Johnson-Lindenstrauss bound for the sample count and ``e``.
        e: Distortion passed as ``eps`` to the Johnson-Lindenstrauss bound.

    Returns:
        The result of ``fit_transform`` on ``input_data``.
    """
    # Identity test: `== None` can be hijacked by types that overload `==`.
    if no_components is None:
        no_components = johnson_lindenstrauss_min_dim(
            n_samples=input_data.shape[0], eps=e)
    projector = random_projection.GaussianRandomProjection(
        n_components=no_components)
    return projector.fit_transform(input_data)
def test():
    """Run ``__test_transform__`` at the Johnson-Lindenstrauss target size.

    Raises:
        ValueError: If the JL bound for the chosen sample count and
            distortion is larger than the source dimension.
    """
    s = 50        # number of samples
    d = 1000      # source dimension
    miu = 0.3     # distortion, passed as eps
    # eps is passed by keyword: it is keyword-only in current scikit-learn,
    # and the keyword form also works on older releases.
    k = johnson_lindenstrauss_min_dim(s, eps=miu)
    if k > d:
        raise ValueError("can't embed into smaller dimension")
    # TODO check the result guarantee of jl and change the 'print' to 'assure'
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(__test_transform__(s, d, k, miu, 100))
def reduction(eps, input_x, out_dir):
    """Project ``input_x`` to 50 dimensions and dump the result to disk.

    Args:
        eps: Distortion used for the printed Johnson-Lindenstrauss bound and
            passed to the projection.
        input_x: Sequence of samples (rows) to project.
        out_dir: Directory in which the ``projection`` file is written; each
            line holds one projected sample, values rounded to 3 decimals
            and joined by ``', '``.

    Returns:
        The projected data returned by ``fit_transform``.
    """
    # The JL bound is a function of the number of samples (rows), not of the
    # length of a single sample.
    jl_bound = random_projection.johnson_lindenstrauss_min_dim(
        len(input_x), eps=eps)
    # One pre-formatted string prints identically on Python 2 and Python 3.
    print('JL bound: {} (eps={})'.format(jl_bound, eps))

    # Keyword arguments: eps is keyword-only in current scikit-learn.
    transformer = random_projection.GaussianRandomProjection(
        n_components=50, eps=eps)
    # Transform the argument; the previous code referenced the undefined
    # name `code` here.
    data_reduced = transformer.fit_transform(input_x)

    with open('{}/projection'.format(out_dir), "w") as op:
        for component in data_reduced:
            line = ', '.join(str(round(value, 3)) for value in component)
            op.write(line + '\n')
    return data_reduced
def _get_eps(self, n_samples, n_dims, n_slice=int(1e4)):
    """Find the smallest grid eps whose JL bound fits the kept dimensions.

    Candidates ``i / n_slice`` for ``i = 1 .. n_slice - 1`` are scanned in
    increasing order; the first one whose Johnson-Lindenstrauss bound is at
    most ``n_dims * self.keep_rate`` is reported and returned.

    Returns:
        The matching eps, or ``-1`` when no candidate qualifies.
    """
    new_dim = n_dims * self.keep_rate
    candidates = (step / n_slice for step in range(1, n_slice))
    for eps in candidates:
        jl_dim = johnson_lindenstrauss_min_dim(n_samples=n_samples, eps=eps)
        if jl_dim > new_dim:
            continue
        shortfall = (new_dim - jl_dim) / new_dim
        print("rate %.3f, n_dims %d, new_dim %d, dims error rate: %.4f"
              % (self.keep_rate, n_dims, jl_dim, shortfall))
        return eps
    return -1
def determine_min_dim(params, x_data):
    """Plot the Johnson-Lindenstrauss bound against eps and save the figure.

    Args:
        params: Mapping whose ``'min_dim_graph'`` entry is used both as the
            plot title and, with ``".png"`` appended, as the output path.
        x_data: Array whose ``shape[0]`` is taken as the number of samples.
    """
    eps = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    # eps is passed by keyword: it is keyword-only in current scikit-learn,
    # and the keyword form also works on older releases.
    min_dim = johnson_lindenstrauss_min_dim(x_data.shape[0], eps=eps)

    plt.figure()
    plt.plot(eps, min_dim)
    plt.ylabel("Minimum Number of Dimensions")
    plt.xlabel("Distortion EPS")
    plt.title(params['min_dim_graph'])
    plt.savefig(params['min_dim_graph'] + ".png")
def sparseRP(self, parameters):
    """Build a ``SparseRandomProjection`` from a parameter mapping.

    ``n_components`` and ``density`` default to ``"auto"`` / ``'auto'`` and
    ``eps`` to ``1e-1``. When a ``'johnsonRP'`` entry is present,
    ``n_components`` is replaced by the Johnson-Lindenstrauss bound computed
    from its ``'n_samples'`` and ``'eps'`` values.
    """
    n_components = parameters.get("n_components", "auto")
    density = parameters.get("density", 'auto')
    eps = parameters.get("eps", 1e-1)

    if 'johnsonRP' in parameters:
        jl_settings = parameters['johnsonRP']
        n_components = johnson_lindenstrauss_min_dim(
            jl_settings['n_samples'], eps=jl_settings['eps'])

    return SparseRandomProjection(n_components=n_components, eps=eps,
                                  density=density)
def dim_reduce(
    points,
    technique='tsne',
    random_state=2021,
    n_components=2,
):
    """Reduce ``points`` to ``n_components`` dimensions.

    Args:
        points: Data to embed.
        technique: One of ``'tsne'``, ``'jlt'`` (Gaussian random projection),
            ``'pca'`` or ``'lle'``.
        random_state: Seed handed to t-SNE, PCA and LLE.
        n_components: Target dimensionality.

    Returns:
        The transformed points.

    Raises:
        ValueError: If ``technique`` is not one of the supported names.
    """
    if technique == 'tsne':
        tsne_options = dict(
            n_components=n_components,
            perplexity=30.0,
            early_exaggeration=12.0,
            learning_rate=200.0,
            n_iter=1000,
            n_iter_without_progress=300,
            min_grad_norm=1e-07,
            metric='euclidean',
            init='random',
            verbose=100,
            random_state=random_state,
            method='barnes_hut',
            angle=0.5,
            n_jobs=None,
        )
        return TSNE(**tsne_options).fit_transform(points)

    if technique == 'jlt':
        # The reported bound uses the largest eps strictly below 1.
        jl_bound = johnson_lindenstrauss_min_dim(
            len(points), eps=1 - np.finfo(float).eps)
        print('Minimum JL components (eps = .99): ', jl_bound)
        projector = GaussianRandomProjection(n_components=n_components,
                                             eps=.99)
        return projector.fit_transform(points)

    if technique == 'pca':
        pca_options = dict(
            n_components=n_components,
            copy=True,
            whiten=False,
            svd_solver='auto',
            tol=0.0,
            iterated_power='auto',
            random_state=random_state,
        )
        return PCA(**pca_options).fit_transform(points)

    if technique == 'lle':
        embedder = LocallyLinearEmbedding(
            n_components=n_components,
            random_state=random_state,
        )
        return embedder.fit_transform(points)

    raise ValueError('Invalid technique.')
def flastVectorization(dataPoints, reduceDim=True, dim=0, eps=0.33):
    """Vectorize ``dataPoints`` by token counts, optionally projected down.

    Args:
        dataPoints: Documents handed to ``CountVectorizer``.
        reduceDim: When false, the raw count matrix is returned.
        dim: Target dimension; a value <= 0 selects the
            Johnson-Lindenstrauss bound for the number of rows and ``eps``.
        eps: Distortion used when ``dim`` is derived automatically.

    Returns:
        The count matrix, or its sparse random projection.
    """
    counts = CountVectorizer().fit_transform(dataPoints)
    if not reduceDim:
        return counts

    if dim <= 0:
        dim = johnson_lindenstrauss_min_dim(counts.shape[0], eps=eps)
    projector = SparseRandomProjection(n_components=dim)
    return projector.fit_transform(counts)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    """Build and fit a Gaussian random projection on ``self._training_data``.

    When the Johnson-Lindenstrauss bound for ``self._x_dim`` samples exceeds
    ``self._x_dim`` the projection uses ``n_components=self._x_dim``;
    otherwise it is configured from the ``eps`` hyper-parameter.

    NOTE(review): nothing is returned although the annotation promises a
    ``CallResult`` — confirm whether a return statement is missing.
    """
    eps = self.hyperparams['eps']
    jl_bound = johnson_lindenstrauss_min_dim(n_samples=self._x_dim, eps=eps)

    if jl_bound <= self._x_dim:
        projection = GaussianRandomProjection(eps=eps)
    else:
        projection = GaussianRandomProjection(n_components=self._x_dim)

    self._model = projection
    self._model.fit(self._training_data)
def preprocess(X, y):
    """Crop, flatten, scale and randomly project the samples in ``X``.

    Every sample is cut along its third axis to the shortest length found,
    flattened, scaled to [-1, 1] and pushed through a sparse random
    projection.

    Returns:
        ``(X, y)`` as NumPy arrays.
    """
    shortest = min(sample.shape[2] for sample in X)
    flattened = np.array([sample[:, :, :shortest].flatten() for sample in X])

    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaled = scaler.fit(flattened).transform(flattened)

    # reduce principle components to improve performance
    reduced_pc = 2000
    recommended_pc = johnson_lindenstrauss_min_dim(861, eps=0.1)
    target_pc = int(recommended_pc - reduced_pc)

    projector = SparseRandomProjection(n_components=target_pc)
    projected = projector.fit_transform(scaled)
    return np.array(projected), np.array(y)
def jlmd_search(ubs, names):
    """Plot the Johnson-Lindenstrauss bound for 40000 samples against eps.

    Args:
        ubs: Values drawn as horizontal reference lines across the plot.
        names: Legend labels for those lines, in the same order as ``ubs``.
    """
    epsilons = np.linspace(0.2, 0.999, 1000)
    # eps is passed by keyword: it is keyword-only in current scikit-learn,
    # and the keyword form also works on older releases.
    y = [johnson_lindenstrauss_min_dim(40000, eps=eps) for eps in epsilons]

    plot.style.use('seaborn-darkgrid')
    # Called for its side effect of opening a new figure; the returned axes
    # were never used.
    plot.subplots()
    plot.title('Influence of epsilon on the minimum number of dimensions')
    plot.semilogy(epsilons, y)
    for ub in ubs:
        plot.semilogy([0, 1], [ub, ub])
    plot.legend(['Minimum number of dimensions', *names], loc='upper right')
    plot.show()
def fit(self, X):
    """Fit the projection, resolving ``n_components == -1`` via the JL bound.

    Raises:
        RuntimeError: When the parent ``fit`` raises ``ValueError`` and the
            target dimension is not smaller than ``X.shape[1]``.
        ValueError: Any other ``ValueError`` from the parent ``fit`` is
            re-raised unchanged.
    """
    if self.n_components == -1:
        jl_dim = random_projection.johnson_lindenstrauss_min_dim(
            n_samples=X.shape[0], eps=self.eps)
        super().set_params(n_components=jl_dim)

    try:
        return super().fit(X)
    except ValueError:
        # Unrelated failure: let the original error through.
        if self.n_components < X.shape[1]:
            raise
        message = ("eps={} and n_samples={} lead to a target "
                   "dimension of {} which is larger than the "
                   "original space with n_features={}").format(
                       self.eps, X.shape[0], self.n_components, X.shape[1])
        raise RuntimeError(message)
def data_JL_proj_data_diff(data):
    """Randomly project ``data`` and return ``sp.pdist`` of the projection.

    The Johnson-Lindenstrauss bound for eps = 0.1 is printed for reference
    only; the projection itself uses ``SparseRandomProjection`` defaults.
    """
    # Minimum dimension suggested by the JL lemma for this many rows.
    suggested_dim = random_projection.johnson_lindenstrauss_min_dim(
        len(data), eps=0.1)
    print("min dim suggested by JL lemma with eps = 0.1 is " + str(suggested_dim))

    # Output = Input * R, where R is the random matrix held by the projector.
    projector = random_projection.SparseRandomProjection()
    projected = projector.fit_transform(data)
    print(
        "new data dimensions after projection according to user provided target data dimension: "
        + str(np.shape(projected)))

    print("\n\n")
    return sp.pdist(projected)
def train_drfs(train_x, train_y, eps=0.5, threshold="median"):
    """Train a dimensionality-reduction + feature-selection pipeline.

    Features are quantile-transformed, agglomerated into at most the
    Johnson-Lindenstrauss number of clusters, robust-scaled, and then
    filtered by the importances of an extra-trees classifier.

    Args:
        train_x: Training features.
        train_y: Training labels.
        eps: Distortion used for the Johnson-Lindenstrauss bound.
        threshold: Threshold passed to ``SelectFromModel``.

    Returns:
        A fitted pipeline: reduction steps followed by feature selection.
    """
    n_samples, n_features, _ = get_counts_tt(train_x, train_y)

    # pick number of components, capped at the available feature count
    jl_bound = random_projection.johnson_lindenstrauss_min_dim(
        n_samples=n_samples, eps=eps)
    n_clusters = min(jl_bound, n_features)

    # scale, agglomerate to n_clusters, scale again
    dr_pipe = pipeline.Pipeline([
        ('scaler', preprocessing.QuantileTransformer()),
        ('feat_agg', cluster.FeatureAgglomeration(n_clusters=n_clusters)),
        ('scaler2', preprocessing.RobustScaler()),
    ])
    dr_pipe.fit(train_x)
    reduced_x = dr_pipe.transform(train_x)

    # train the tree ensemble on the reduced features
    xtc = ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
    xtc.fit(reduced_x, train_y)

    print("Feature importances:")
    print("\tMax:", max(xtc.feature_importances_))
    print("\tMin:", min(xtc.feature_importances_))

    # feature selection driven by the already-fitted ensemble
    feat_sel = feature_selection.SelectFromModel(
        xtc, prefit=True, threshold=threshold)

    return pipeline.Pipeline([('dr_pipe', dr_pipe), ('feat_sel', feat_sel)])
def plot_JL_curve(self):
    '''
    Plot the Johnson-Lindenstrauss minimum dimension against the maximum
    distortion rate for the number of samples in ``self.X``, show the
    figure, and return it encoded through ``plt2base64``.
    '''
    plt.figure(figsize=(6, 4))

    sample_size = len(self.X)
    distortion_grid = np.linspace(0.01, 0.99, 100)
    jl_bounds = johnson_lindenstrauss_min_dim(n_samples=sample_size,
                                              eps=distortion_grid)

    plt.plot(distortion_grid, jl_bounds)
    plt.xlabel('maximum distortion rate', fontsize=16)
    plt.ylabel('mimimum dimensions to keep', fontsize=16)
    plt.ylim(0, 20000)
    heading = ('johnson_lindenstrauss_min_dim vs max_distortion_rate \nsample size = '
               + str(sample_size))
    plt.title(heading, fontsize=16)
    plt.show()
    return plt2base64(plt)
def rp(X_train, X_test):
    """Plot LinearSVC accuracy on sparse random projections vs. a baseline.

    Prints the Johnson-Lindenstrauss bound for eps=0.1, then compares 20
    projection sizes between 2 and 64 components with a classifier trained
    on the original features. Labels come from the module-level
    ``y_train`` / ``y_test``.
    """
    num_components = johnson_lindenstrauss_min_dim(n_samples=X_train.shape[0], eps=0.1)
    print(num_components)
    print("# features: ", X_train.shape[1], " JL min dim:", num_components)
    # Without the JL guarantee nothing can be assumed about how well pairwise
    # distances between data points are preserved.
    print("JL number > #features so cant make any JL guarentees")

    components = np.int32(np.linspace(2, 64, 20))

    # Baseline: classifier trained on the original feature space.
    baseline_clf = LinearSVC()
    baseline_clf.fit(X_train, y_train)
    baseline = metrics.accuracy_score(baseline_clf.predict(X_test), y_test)

    def projected_accuracy(n_comp):
        """Return test accuracy of a LinearSVC trained on an n_comp projection."""
        projector = SparseRandomProjection(n_components=n_comp)
        train_proj = projector.fit_transform(X_train)
        clf = LinearSVC()
        clf.fit(train_proj, y_train)
        test_proj = projector.transform(X_test)
        return metrics.accuracy_score(clf.predict(test_proj), y_test)

    accuracies = [projected_accuracy(comp) for comp in components]

    plt.figure()
    plt.title("Accuracy of Sparse Projection on Sonar")
    plt.xlabel("# of Components")
    plt.ylabel("Accuracy")
    plt.xlim([2, 64])
    plt.ylim([0, 1.0])
    # baseline first (red), then the per-size accuracies
    plt.plot(components, [baseline] * len(accuracies), color="r")
    plt.plot(components, accuracies)
    plt.show()
def preparation(inputFile, dim=0):
    """Vectorize and randomly project the test cases listed in a file.

    Args:
        inputFile: Path of a text file holding one test case per line.
        dim: Target dimension of the projection; a value <= 0 selects the
            Johnson-Lindenstrauss bound for the number of test cases with
            eps = 0.5.

    Returns:
        A list with one dict per test case, mapping each non-zero column
        index of the projected row to its value.
    """
    vectorizer = HashingVectorizer()  # compute "TF"

    # Context manager: the file handle used to be left open.
    with open(inputFile) as handle:
        testCases = [line.rstrip("\n") for line in handle]
    testSuite = vectorizer.fit_transform(testCases)

    # dimensionality reduction
    if dim <= 0:
        e = 0.5  # epsilon in jl lemma
        dim = johnson_lindenstrauss_min_dim(len(testCases), eps=e)
    srp = SparseRandomProjection(n_components=dim)
    projectedTestSuite = srp.fit_transform(testSuite)

    # map sparse matrix to dict, one row at a time
    TS = []
    for i in range(len(testCases)):
        columns = projectedTestSuite[i].nonzero()[1]
        TS.append({j: projectedTestSuite[i, j] for j in columns})
    return TS
def checkOptimaldimensionality(s):
    """Draw Johnson-Lindenstrauss bounds versus sample count on log-log axes.

    One curve is drawn per distortion value. The figure is neither shown
    nor saved here.

    Args:
        s: Number of sample counts taken on a log scale between 10 and 10**4.
    """
    # range of distortions
    eps_range = np.linspace(0.1, 0.99, 10)
    colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

    # range of number of samples (observation) to embed
    n_samples_range = np.logspace(1, 4, s)

    plt.figure()
    for eps, color in zip(eps_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
        plt.loglog(n_samples_range, min_n_components, color=color)

    # Legend, labels and title used to be set twice in a row; only the last
    # values were ever visible, so they are set once here.
    plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
    plt.xlabel("Number of observations to eps-embed")
    plt.ylabel("Minimum number of dimensions")
    plt.title(
        "Johnson-Lindenstrauss bounds:\nn_samples vs n_components w.r.t eps")
sys.exit(1) opts.n_components = type_auto_or_int(opts.n_components) opts.density = type_auto_or_float(opts.density) selected_transformers = opts.selected_transformers.split(",") ########################################################################### # Generate dataset ########################################################################### n_nonzeros = int(opts.ratio_nonzeros * opts.n_features) print("Dataset statics") print("===========================") print("n_samples \t= %s" % opts.n_samples) print("n_features \t= %s" % opts.n_features) if opts.n_components == "auto": print("n_components \t= %s (auto)" % johnson_lindenstrauss_min_dim(n_samples=opts.n_samples, eps=opts.eps)) else: print("n_components \t= %s" % opts.n_components) print("n_elements \t= %s" % (opts.n_features * opts.n_samples)) print("n_nonzeros \t= %s per feature" % n_nonzeros) print("ratio_nonzeros \t= %s" % opts.ratio_nonzeros) print("") ########################################################################### # Set transformer input ########################################################################### transformers = {} ########################################################################### # Set GaussianRandomProjection input gaussian_matrix_params = {"n_components": opts.n_components, "random_state": opts.random_seed}
def rp_analysis(X, y, dataset, plot, X_test):
    """Explore Gaussian random projections of ``X`` and return a projection.

    Optionally draws 2-D and 3-D projections, prints the
    Johnson-Lindenstrauss bound for a range of eps values, plots how mean,
    standard deviation and kurtosis vary over ten independent 10-component
    projections, and finally projects train and test data onto 331
    components.

    Returns:
        ``(projected_train, projected_test)``.
    """
    if plot:
        # Low-dimensional projections purely for visualization.
        plot_2d(GaussianRandomProjection(n_components=2).fit_transform(X), y)
        plot_3d(GaussianRandomProjection(n_components=3).fit_transform(X), y)

    # Determine min components for varying eps
    eps_range = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]
    min_dims = [johnson_lindenstrauss_min_dim(n_samples=X.shape[0], eps=e)
                for e in eps_range]
    print('\nmin dims', min_dims)
    print('\nX shape:', X.shape)

    # Measure variation across multiple runs
    iterations = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    means_list, stdev_list, kurtosis_list = [], [], []
    for _ in iterations:
        # 10 components to help visualize the variation
        projected3 = GaussianRandomProjection(n_components=10).fit_transform(X)
        means_list.append(np.mean(projected3))
        stdev_list.append(np.std(projected3))
        kurtosis_list.append(np.mean(kurtosis(projected3)))

    # Only the projection from the final run is written out and inspected.
    pd.DataFrame(projected3).to_csv('projected.csv')
    print(plot_kurtosis(projected3))

    # Two y-axes, because kurtosis lives on a different scale. See
    # http://kitchingroup.cheme.cmu.edu/blog/2013/09/13/Plotting-two-datasets-with-very-different-scales/
    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    ax1.plot(iterations, means_list, label='Mean', color='red')
    ax1.plot(iterations, stdev_list, label='Std Deviation', color='blue')
    ax1.set_xlabel('Iteration', fontsize=18)
    ax1.legend()
    ax2 = ax1.twinx()
    ax2.plot(iterations, kurtosis_list, label='Kurtosis', color='green')
    plt.legend()
    plt.show()

    # Project on to an 'optimal' number of components
    rp2 = GaussianRandomProjection(n_components=331)
    projected2_train = rp2.fit_transform(X)
    projected2_test = rp2.transform(X_test)
    print('\nRP projected X_train:', projected2_train.shape)
    return projected2_train, projected2_test
from sklearn import datasets, metrics, decomposition, random_projection
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn.model_selection import train_test_split, validation_curve, learning_curve, ShuffleSplit,GridSearchCV
# NOTE(review): `sklearn.cross_validation` is the legacy home of these names
# and is absent from newer scikit-learn releases — confirm the target version
# before relying on this import.
from sklearn.cross_validation import StratifiedKFold, cross_val_score
# NOTE(review): `LinearSVC` is used below but not imported explicitly here;
# presumably it arrives through this star import — verify.
from functions import *

# Load the digits dataset and split it into features and labels.
data = datasets.load_digits()
X = data.data
y = data.target

# The Johnson-Lindenstrauss bound is computed but its result is discarded.
johnson_lindenstrauss_min_dim(1797,eps=0.1)

accuracies = []
# Candidate projection sizes: 2 up to (but excluding) the feature count.
components = range(2,X.shape[1])

split = train_test_split(X, y, test_size = 0.33, random_state = 42)
#digits = datasets.load_digits()
#split = train_test_split(digits.data, digits.target, test_size = 0.3,
#    random_state = 42)
(trainData, testData, trainTarget, testTarget) = split

# Baseline: linear SVM trained on the original features.
model = LinearSVC()
model.fit(trainData, trainTarget)
baseline = metrics.accuracy_score(model.predict(testData), testTarget)
colnames = [ "make", "address", "all", "3d", "our", "over", "remove", "internet", "order", "mail", "receive", "will", "people", "report", "addresses", "free", "business", "email", "you", "credit", "your", "font", "000", "money", "hp", "hpl", "george", "650", "lab", "labs", "telnet", "857", "data", "415", "85", "technology", "1999", "parts", "pm", "direct", "cs", "meeting", "original", "project", "re", "edu", "table", "conference", ";", "(", "[", "!", "$", "#", "average", "longest", "total", "class" ] data.columns = colnames X, y = data.iloc[:, :-1], data.iloc[:, -1] X.columns = colnames[:len(colnames) - 1] print johnson_lindenstrauss_min_dim(4601, eps=0.1) split = train_test_split(X, y, test_size=0.3, random_state=42) (trainData, testData, trainTarget, testTarget) = split accuracies = [] components = np.int32(np.linspace(2, 56, 14)) model = LinearSVC() model.fit(trainData, trainTarget) baseline = metrics.accuracy_score(model.predict(testData), testTarget) # loop over the projection sizes for comp in components: # create the random projection sp = SparseRandomProjection(n_components=comp) X = sp.fit_transform(trainData) # train a classifier on the sparse random projection
def johnson_lindenstrauss(data, data_name):
    """Plot Johnson-Lindenstrauss bounds and distance distortion for ``data``.

    Part 1 saves two theoretical plots (bound vs. number of samples, bound
    vs. eps). Part 2 projects ``data`` with a sparse random projection for
    several target sizes and saves, per size, a hexbin plot of original vs.
    projected squared distances and a histogram of their ratio.

    Args:
        data: 2-D array of samples; its ``shape`` gives
            ``(n_samples, n_features)``.
        data_name: Label inserted into the Part 2 output file names.
    """
    # `normed` is being deprecated in favor of `density` in histograms
    if LooseVersion(matplotlib.__version__) >= '2.1':
        density_param = {'density': True}
    else:
        density_param = {'normed': True}

    # Part 1: plot the theoretical dependency between n_components_min and
    # n_samples

    # range of admissible distortions
    eps_range = np.linspace(0.1, 0.99, 5)
    colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

    # range of number of samples (observation) to embed
    n_samples_range = np.logspace(1, 9, 9)

    plt.figure()
    for eps, color in zip(eps_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
        plt.loglog(n_samples_range, min_n_components, color=color)

    plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
    plt.xlabel("Number of observations to eps-embed")
    plt.ylabel("Minimum number of dimensions")
    plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
    plt.savefig('Figs/02b_rp_comp_samples')

    # range of admissible distortions
    eps_range = np.linspace(0.01, 0.99, 100)
    n_samples_range = np.logspace(2, 6, 5)
    colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range)))

    plt.figure()
    for n_samples, color in zip(n_samples_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)
        plt.semilogy(eps_range, min_n_components, color=color)

    plt.legend(["n_samples = %d" % n for n in n_samples_range], loc="upper right")
    plt.xlabel("Distortion eps")
    plt.ylabel("Minimum number of dimensions")
    plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
    plt.savefig('Figs/02b_rp_comp_eps')

    # Part 2: perform sparse random projection of some digits images which are
    # quite low dimensional and dense or documents of the 20 newsgroups dataset
    # which is both high dimensional and sparse
    n_samples, n_features = data.shape
    print("Embedding %d samples with dim %d using various random projections"
          % (n_samples, n_features))

    n_components_range = np.array([1,10,100,1000])
    # All pairwise squared distances in the original space, flattened.
    dists = euclidean_distances(data, squared=True).ravel()

    # select only non-identical samples pairs
    nonzero = dists != 0
    dists = dists[nonzero]

    for n_components in n_components_range:
        t0 = time()
        rp = SparseRandomProjection(n_components=n_components)
        projected_data = rp.fit_transform(data)
        print("Projected %d samples from %d to %d in %0.3fs"
              % (n_samples, n_features, n_components, time() - t0))
        if hasattr(rp, 'components_'):
            # Size of the projection matrix: value buffer plus index buffer.
            n_bytes = rp.components_.data.nbytes
            n_bytes += rp.components_.indices.nbytes
            print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6))

        # Same pair selection as in the original space, so the two vectors
        # line up element by element.
        projected_dists = euclidean_distances(
            projected_data, squared=True).ravel()[nonzero]

        plt.figure()
        plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu)
        plt.xlabel("Pairwise squared distances in original space")
        plt.ylabel("Pairwise squared distances in projected space")
        plt.title("Pairwise distances distribution for n_components=%d"
                  % n_components)
        cb = plt.colorbar()
        cb.set_label('Sample pairs counts')

        # Ratio of projected to original squared distance, per pair.
        rates = projected_dists / dists
        print("Mean distances rate: %0.2f (%0.2f)"
              % (np.mean(rates), np.std(rates)))
        plt.savefig('Figs/02b_rp_pwdist_{}_{}'.format(data_name, n_components))

        plt.figure()
        plt.hist(rates, bins=50, range=(0., 2.), edgecolor='k', **density_param)
        plt.xlabel("Squared distances rate: projected / original")
        plt.ylabel("Distribution of samples pairs")
        plt.title("Histogram of pairwise distance rates for n_components=%d"
                  % n_components)
        plt.savefig('Figs/02b_rp_histogram_{}_{}'.format(data_name, n_components))
        # NOTE(review): only the current figure is cleared; figures opened
        # above are never closed — confirm this is acceptable for long runs.
        plt.clf()
def generate_real_dataset_rp(data_path, sparse=False, eps=0.1):
    """Build a randomly projected HDF5 data set from raw sample files.

    Reads ``samples.txt``, ``labels.txt`` and ``annotations.txt`` from
    ``data_path``, drops bad samples, rasterizes every sample into a 3-D
    grid, projects the flattened grid with a sparse or Gaussian random
    matrix sized by the Johnson-Lindenstrauss bound, and writes the result
    to ``rp_real_sparse.hdf5`` / ``rp_real_gauss.hdf5``. Progress text is
    printed and appended to ``rp_out``. Finally the train/val/test split is
    generated from the written file.

    This is Python 2 code (print statements, ``unicode``).

    Args:
        data_path: Directory holding the input files and receiving outputs.
        sparse: Use ``SparseRandomProjection`` when true, otherwise
            ``GaussianRandomProjection``.
        eps: Distortion passed to ``johnson_lindenstrauss_min_dim``.
    """
    ################################################ LOADING AND CLEANING THE DATA #########################################
    samples = open(os.path.join(data_path, 'samples.txt'))
    labels = open(os.path.join(data_path, 'labels.txt'))
    annotations = open(os.path.join(data_path, 'annotations.txt'))
    out_f = open(os.path.join(data_path,'rp_out'),'w')

    bad_samples = []
    real_labels = []
    qpoint_lists = []
    label_list = []
    annotation_list = []
    label_count = np.zeros((1,13))

    # Each loop overwrites its list on every line, so only the last line of
    # each file is kept.
    for data in samples:
        qpoint_lists = data.split(';')
    for data in labels:
        label_list = data.split(';')
    for data in annotations:
        annotation_list = data.split(';')

    out_s = 'found %i qpoint lists.\n' % len(qpoint_lists) + 'found %i labels.\n' % len(label_list) + 'found %i annotations.\n\n' % len(annotation_list)
    print out_s
    out_f.write(out_s)
    out_f.close()

    for list_ind in np.arange(len(qpoint_lists)):
        bad = False

        ################# PROCESS THE LABELS
        # Annotations not starting with 'vo', 'fl', 'mi' or 'ja' get label 0.
        if annotation_list[list_ind][0:2] != 'vo' and annotation_list[list_ind][0:2] != 'fl' and annotation_list[list_ind][0:2] != 'mi' and annotation_list[list_ind][0:2] != 'ja':
            real_labels.append(0)
            label_count[0][0] += 1
        else:
            position = label_list[list_ind].split(',')
            # A first coordinate of -2000 or -1000 marks the sample as bad.
            if float(position[0]) == -2000 or float(position[0]) == -1000:
                real_labels.append(-1)
                bad = True
            else:
                lab = determine_label((float(position[0]),float(position[1]),float(position[2])))
                real_labels.append(lab)
                label_count[0][lab] += 1

        ################# PROCESS THE Q-POINTS
        qpoint_lists[list_ind] = qpoint_lists[list_ind].split(':')
        for point_ind in np.arange(len(qpoint_lists[list_ind])):
            qpoint_lists[list_ind][point_ind] = qpoint_lists[list_ind][point_ind].split(',')
            # Every point must have exactly 7 comma-separated fields.
            if len(qpoint_lists[list_ind][point_ind]) != 7:
                bad = True

        if bad:
            bad_samples.append(list_ind)

    print 'need to remove %i bad samples.' %len(bad_samples)

    ################# REMOVE BAD SAMPLES
    # `ind` counts removals so far, shifting later indices left accordingly.
    ind = 0
    for bad_ind in bad_samples:
        real_ind = bad_ind - ind
        qpoint_lists.pop(real_ind)
        real_labels.pop(real_ind)
        annotation_list.pop(real_ind)
        ind += 1

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = str(len(qpoint_lists)) + ' samples remain after purging.\n' + str(len(real_labels)) + ' labels remain after purging.\n'\
            + str(len(annotation_list)) + ' annotations remain after purging.\n' + 'percentages of the labels are %s\n\n' %str(label_count/len(qpoint_lists))
    print out_s
    out_f.write(out_s)
    out_f.close()

    samples.close()
    labels.close()
    annotations.close()

    ################################################## PROJECTING THE DATA INTO A GRID #####################################
    pcol = 0  # number of points that landed in an already occupied cell
    ps = 0    # point counter used as denominator of the collision ratio

    # ASSUMPTION: relevant area is never less than 0.7 meters and more than 4.4 meters on the x-axis, 2.5 meters to both sides on the y-axis
    # and 2 meters on the z-axis away from the sensors
    bin_cm = 3
    max_x_cm = 440
    min_x_cm = 70
    max_y_cm = 250
    max_z_cm = 200

    # NOTE(review): these quotients are used as array dimensions and indices
    # below, which presumably relies on Python 2 integer division — confirm
    # before porting.
    x_range = max_x_cm / bin_cm - min_x_cm / bin_cm
    y_range = max_y_cm * 2 / bin_cm
    z_range = max_z_cm / bin_cm

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = 'length of data in original space: %d\n\n' %(x_range*y_range*z_range)
    print out_s
    out_f.write(out_s)
    out_f.close()

    # compute a conservative estimate of the number of latent dimensions required to guarantee the given epsilons
    n_dims = johnson_lindenstrauss_min_dim(len(qpoint_lists),eps)

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = 'number of latent dimensions needed to guarantee %f epsilon is %f\n\n' %(eps, n_dims)
    print out_s
    out_f.write(out_s)
    out_f.close()

    f_path = os.path.join(data_path,'rp_real_sparse.hdf5') if sparse else os.path.join(data_path,'rp_real_gauss.hdf5')
    print f_path
    f = h5.File(f_path, "w")
    f.create_dataset('data_set/data_set', (len(qpoint_lists), n_dims), dtype='f')
    f.create_dataset('labels/real_labels', (len(real_labels),), dtype='i')
    dt = h5.special_dtype(vlen=unicode)
    f.create_dataset('annotations/annotations', (len(annotation_list),), dtype=dt)

    transformer = random_projection.SparseRandomProjection(n_components=n_dims) if sparse else random_projection.GaussianRandomProjection(n_components=n_dims)
    if sparse:
        print 'performing projection with sparse matrix'
    else:
        print 'performing projection with gaussian matrix'

    # this is not the way it's supposed to be done BUT the proper training set doesn't fit into the memory
    # The random matrix is created directly through a private estimator
    # method instead of calling fit().
    transformer.components_ = transformer._make_random_matrix(n_dims, x_range*y_range*z_range)

    last_per = -1
    for ind, qpoint_list in enumerate(qpoint_lists):
        grid = np.zeros((x_range, y_range, z_range))

        for qpoint in qpoint_list:
            # Coordinates are scaled by 100 and binned into bin_cm-sized
            # cells; y is shifted by max_y_cm first.
            x = int(float(qpoint[0])*100) / bin_cm
            y = (int(float(qpoint[1])*100) + max_y_cm) / bin_cm
            z = int(float(qpoint[2])*100) / bin_cm

            # Skip points that fall outside the grid.
            if x - min_x_cm/bin_cm < 0 or x - min_x_cm/bin_cm > x_range-1 or y > y_range-1 or y < 0 or z > z_range-1 or z < 0:
                continue

            # NOTE(review): `pow` shadows the builtin of the same name.
            pow = float(qpoint[4])
            if grid[x-min_x_cm/bin_cm][y][z] != 0:
                # Occupied cell: count the collision and keep the larger value.
                pcol += 1
                if grid[x-min_x_cm/bin_cm][y][z] < pow:
                    grid[x-min_x_cm/bin_cm][y][z] = pow
            else:
                grid[x-min_x_cm/bin_cm][y][z] = pow
            ps += 1

        # Project the flattened grid as a single row and store it.
        f['data_set/data_set'][ind] = transformer.transform(np.reshape(grid,(1,-1)))
        f['labels/real_labels'][ind] = real_labels[ind]
        f['annotations/annotations'][ind] = annotation_list[ind]

        # Report progress only when the integer percentage changes.
        curr_percent = int(float(ind) / len(qpoint_lists) * 100)
        if last_per != curr_percent:
            last_per = curr_percent
            out_f = open(os.path.join(data_path,'rp_out'),'a')
            out_s = 'have now looked at %i%% of the data.\n' % int(float(ind) / len(qpoint_lists) * 100)
            print out_s
            out_f.write(out_s)
            out_f.close()

    print 'done with projecting onto the grid (without binning)'
    print 'percentage of point collision: ' + str(float(pcol)/ps)
    print 'number of samples: ' +str(len(f['data_set/data_set']))
    print 'dimensionality of the samples: ' +str(len(f['data_set/data_set'][0]))
    print 'number of labels: ' +str(len(f['labels/real_labels']))
    print 'number of annotations: ' +str(len(f['annotations/annotations']))

    out_f = open(os.path.join(data_path,'rp_out'),'a')
    out_s = 'projection done, new dimension is %d\n\n' %len(f['data_set/data_set'][0])
    print out_s
    out_f.write(out_s)
    out_f.close()

    f.close()

    if sparse:
        generate_train_val_test_set(os.path.join(data_path,"rp_real_sparse.hdf5"), os.path.join(data_path,"train_val_test_rp_real_sparse.hdf5"))
    else:
        generate_train_val_test_set(os.path.join(data_path,"rp_real_gauss.hdf5"), os.path.join(data_path,"train_val_test_rp_real_gauss.hdf5"))
from sklearn.datasets import load_digits
from sklearn.metrics.pairwise import euclidean_distances

# Part 1: plot the theoretical dependency between n_components_min and
# n_samples, drawing one curve per admissible distortion value.

# Admissible distortions: five evenly spaced values between 0.1 and 0.99.
eps_range = np.linspace(0.1, 0.99, 5)

# Numbers of samples (observations) to embed: 10**1 through 10**9.
n_samples_range = np.logspace(1, 9, 9)

# One shade of blue per distortion value.
colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

plt.figure()
for eps, color in zip(eps_range, colors):
    # The bound is evaluated for every sample count at once for this eps.
    min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
    plt.loglog(n_samples_range, min_n_components, color=color)

# Legend entries follow the order in which the curves were drawn above.
legend_labels = ["eps = %0.1f" % eps for eps in eps_range]
plt.legend(legend_labels, loc="lower right")
plt.xlabel("Number of observations to eps-embed")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
plt.show()

# Redefine the ranges: a finer grid of admissible distortions ...
eps_range = np.linspace(0.01, 0.99, 100)

# ... fewer sample counts (10**2 through 10**6) ...
n_samples_range = np.logspace(2, 6, 5)

# ... and this time one shade of blue per sample count.
colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range)))
from sklearn.datasets import load_digits
from sklearn.metrics.pairwise import euclidean_distances

# Part 1: plot the theoretical dependency between n_components_min and
# n_samples, drawing one curve per admissible distortion value.

# Admissible distortions: five evenly spaced values between 0.1 and 0.99.
eps_range = np.linspace(0.1, 0.99, 5)

# Numbers of samples (observations) to embed: 10**1 through 10**9.
n_samples_range = np.logspace(1, 9, 9)

# One shade of blue per distortion value.
colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

plt.figure()
for eps, color in zip(eps_range, colors):
    # The bound is evaluated for every sample count at once for this eps.
    min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
    plt.loglog(n_samples_range, min_n_components, color=color)

# Legend entries follow the order in which the curves were drawn above.
legend_labels = ["eps = %0.1f" % eps for eps in eps_range]
plt.legend(legend_labels, loc="lower right")
plt.xlabel("Number of observations to eps-embed")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")

# Redefine the ranges: a finer grid of admissible distortions ...
eps_range = np.linspace(0.01, 0.99, 100)

# ... fewer sample counts (10**2 through 10**6) ...
n_samples_range = np.logspace(2, 6, 5)

# ... and this time one shade of blue per sample count.
colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range)))

# Open a fresh figure; nothing is drawn on it within this snippet.
plt.figure()
def min_features(scaled_data, eps=0.1):
    """Print the Johnson-Lindenstrauss minimum dimension for a dataset.

    Passes the number of samples, ``len(scaled_data)``, and the distortion
    ``eps`` to ``johnson_lindenstrauss_min_dim`` and prints the result.

    Parameters
    ----------
    scaled_data : sized container
        Only its length is used, as the number of samples.
    eps : float, optional
        Distortion forwarded to ``johnson_lindenstrauss_min_dim``.
        Defaults to 0.1, the value that was previously hard-coded.

    Returns
    -------
    None
        The bound is printed, not returned.
    """
    n_samples = len(scaled_data)
    # The parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(johnson_lindenstrauss_min_dim(n_samples, eps=eps))
sys.exit(1) opts.n_components = type_auto_or_int(opts.n_components) opts.density = type_auto_or_float(opts.density) selected_transformers = opts.selected_transformers.split(',') ########################################################################### # Generate dataset ########################################################################### n_nonzeros = int(opts.ratio_nonzeros * opts.n_features) print('Dataset statics') print("===========================") print('n_samples \t= %s' % opts.n_samples) print('n_features \t= %s' % opts.n_features) if opts.n_components == "auto": print('n_components \t= %s (auto)' % johnson_lindenstrauss_min_dim( n_samples=opts.n_samples, eps=opts.eps)) else: print('n_components \t= %s' % opts.n_components) print('n_elements \t= %s' % (opts.n_features * opts.n_samples)) print('n_nonzeros \t= %s per feature' % n_nonzeros) print('ratio_nonzeros \t= %s' % opts.ratio_nonzeros) print('') ########################################################################### # Set transformer input ########################################################################### transformers = {} ########################################################################### # Set GaussianRandomProjection input gaussian_matrix_params = {
opts.n_components = type_auto_or_int(opts.n_components) opts.density = type_auto_or_float(opts.density) selected_transformers = opts.selected_transformers.split(',') ########################################################################### # Generate dataset ########################################################################### n_nonzeros = int(opts.ratio_nonzeros * opts.n_features) print('Dataset statics') print("===========================") print('n_samples \t= %s' % opts.n_samples) print('n_features \t= %s' % opts.n_features) if opts.n_components == "auto": print('n_components \t= %s (auto)' % johnson_lindenstrauss_min_dim(n_samples=opts.n_samples, eps=opts.eps)) else: print('n_components \t= %s' % opts.n_components) print('n_elements \t= %s' % (opts.n_features * opts.n_samples)) print('n_nonzeros \t= %s per feature' % n_nonzeros) print('ratio_nonzeros \t= %s' % opts.ratio_nonzeros) print('') ########################################################################### # Set transformer input ########################################################################### transformers = {} ########################################################################### # Set GaussianRandomProjection input gaussian_matrix_params = {