def e123(fixed_projections=False, fixed_embedding=False,
         visualization_method='mds', smart=False, **kwargs):
    """\
    Runs MPSE on the '123' sample data set and plots the results.

    Parameters
    ----------

    fixed_projections : boolean
    If True, the projections generated with method 'cylinder' are passed to
    MPSE as fixed projections.

    fixed_embedding : boolean
    If True, the array read from 'samples/123/123.csv' is passed to MPSE as
    fixed embedding.

    visualization_method : str
    Forwarded to MPSE.

    smart : boolean
    If True and neither the projections nor the embedding are fixed,
    mv.smart_initialize() is run (and plotted) before gradient descent.

    **kwargs
    Forwarded both to the MPSE constructor and to mv.gd().
    """
    X = np.genfromtxt('samples/123/123.csv', delimiter=',')
    X1 = np.genfromtxt('samples/123/1.csv', delimiter=',')
    X2 = np.genfromtxt('samples/123/2.csv', delimiter=',')
    X3 = np.genfromtxt('samples/123/3.csv', delimiter=',')
    proj = projections.PROJ()
    Q = proj.generate(number=3, method='cylinder')

    # Turn the boolean flags into the values handed to MPSE: the actual
    # projections/embedding when fixed, None otherwise.
    fixed_projections = Q if fixed_projections else None
    fixed_embedding = X if fixed_embedding else None

    mv = MPSE([X1, X2, X3], fixed_embedding=fixed_embedding,
              fixed_projections=fixed_projections, verbose=2,
              sample_colors=X1[:, 0],
              visualization_method=visualization_method, **kwargs)
    mv.plot_embedding(title='initial embedding')

    # The flags were rebound above, so "not fixed" means None (they can no
    # longer be False); comparing against False made this branch unreachable.
    if smart and fixed_projections is None and fixed_embedding is None:
        mv.smart_initialize()
        mv.plot_embedding(title='smart initialize')

    mv.gd(**kwargs)
    mv.plot_computations()
    mv.plot_embedding(title='final embeding')
    mv.plot_images()
    plt.draw()
    plt.pause(0.2)
def time():
    """\
    Times MPSE gradient descent on disk samples of increasing size and shows
    a log-log plot of the average time of the successful runs.

    A run counts as successful when its final cost is below 1e-3.
    """
    print('\n***mpse_test.time()***')
    sizes = [int(10**exponent) for exponent in [1, 1.5, 2, 2.5]]
    repeats = 3
    n_sizes = len(sizes)
    successes = np.zeros(n_sizes)
    ratios = np.zeros(n_sizes)
    elapsed = np.zeros(n_sizes)

    for idx, size in enumerate(sizes):
        for _ in range(repeats):
            X = misc.disk(size, dim=3)
            proj = projections.PROJ()
            Q = proj.generate(number=3, method='standard')
            D = multigraph.multigraph_from_projections(proj, Q, X)
            vis = mpse.MPSE(D, verbose=1)
            vis.gd(min_step=1e-4, verbose=1)
            # only successful runs contribute to the accumulated time
            if vis.cost < 1e-3:
                successes[idx] += 1
                elapsed[idx] += vis.H['time']
        # average over successful runs (left at 0 when none succeeded)
        if successes[idx] != 0:
            elapsed[idx] /= successes[idx]
        ratios[idx] = successes[idx] / repeats

    plt.plot()
    plt.loglog(sizes, elapsed)
    plt.xlabel('number of points')
    plt.ylabel('time')
    plt.title('computation time')
    plt.show()
# NOTE(review): this function was written as `def 123():`, which is not a
# valid Python identifier (names cannot start with a digit) and is a
# SyntaxError for the whole module. `load_123` is a replacement name chosen
# during review -- confirm the intended name.
def load_123():
    """\
    Loads the '123' sample data set.

    Returns
    -------

    images : list
    The three arrays read from 'samples/123/1.csv', '2.csv' and '3.csv'.

    (X, Q) : tuple
    The array read from 'samples/123/123.csv' and the three projections
    generated by projections.PROJ() with method 'cylinder'.
    """
    X = np.genfromtxt('samples/123/123.csv', delimiter=',')
    X1 = np.genfromtxt('samples/123/1.csv', delimiter=',')
    X2 = np.genfromtxt('samples/123/2.csv', delimiter=',')
    X3 = np.genfromtxt('samples/123/3.csv', delimiter=',')
    proj = projections.PROJ()
    Q = proj.generate(number=3, method='cylinder')
    return [X1, X2, X3], (X, Q)
def time(n_samples, n_perspectives, fixed_projections=False, batch_size=20,
         method='random', trials=50, attempts=3, best=40, verbose=0,
         max_iter=500):
    """\
    Estimates the MPSE computation time on random disk data.

    For each trial a new disk sample and set of projections is generated and
    MPSE gradient descent is run `attempts` times. The fastest attempt whose
    final cost is below 1.5e-3 is recorded for that trial (trials without
    such an attempt are dropped). Prints the number of recorded trials and
    the average of the `best` smallest recorded times.

    Parameters
    ----------

    n_samples : int
    Number of samples passed to misc.disk().

    n_perspectives : int
    Number of projections to generate.

    fixed_projections : boolean
    If True, the generating projections are passed to MPSE as fixed
    projections.

    batch_size, max_iter
    Forwarded to mv.gd().

    method : str
    Method used to generate the projections.

    trials : int
    Number of independent data sets.

    attempts : int
    Number of MPSE runs per data set.

    best : int
    Number of smallest times that enter the printed average.

    verbose : int
    If larger than 1, prints trial, attempt, cost and time of every run.
    """
    proj = projections.PROJ()
    times = []
    for k in range(trials):
        X = misc.disk(n_samples, dim=3)
        Q = proj.generate(number=n_perspectives, method=method)
        data = setup.setup_distances_from_multiple_perspectives(
            proj.project(Q, X))
        Q0 = Q if fixed_projections else None

        # np.inf rather than the np.Inf alias, which NumPy 2.0 removed
        best_time = np.inf
        best_cost = np.inf
        for i in range(attempts):
            # keyword was misspelled 'fixed_projectiosn', so the fixed
            # projections never reached MPSE under the right name
            mv = mpse.MPSE(data, fixed_projections=Q0)
            mv.gd(batch_size=batch_size, max_iter=max_iter, min_cost=1e-3,
                  min_grad=1e-8)
            if verbose > 1:
                print(k, i, mv.cost, mv.time)
            if mv.cost < 1.5e-3 and mv.time < best_time:
                best_time = mv.time
                best_cost = mv.cost
        if best_cost < 1.5e-3:
            times.append(best_time)

    # averaging an empty selection only yields nan plus a RuntimeWarning
    if times:
        average_time = np.average(np.sort(times)[0:best])
    else:
        average_time = float('nan')
    print(len(times), average_time)
def disk(N=100, fixed_projections=False, fixed_embedding=False, **kwargs):
    """\
    Runs MPSE on three projections of a random disk sample and plots the
    initial embedding, the computation history, the final embedding and the
    images.

    Parameters
    ----------

    N : int
    Number of samples passed to misc.disk().

    fixed_projections : boolean
    If True, the generating projections are given to MPSE (keyword Q) and
    gradient descent is run with fixed_projections=True. Takes precedence
    over fixed_embedding.

    fixed_embedding : boolean
    If True (and fixed_projections is not), the generating embedding is given
    to MPSE (keyword X) and gradient descent is run with fixed_embedding=True.

    **kwargs
    Forwarded to mv.gd().
    """
    X = misc.disk(N, dim=3)
    proj = projections.PROJ()
    Q = proj.generate(number=3, method='standard')
    X1, X2, X3 = proj.project(Q, X)

    # Choose, once, the extra keywords for the constructor and for gd().
    if fixed_projections:
        init_extra, gd_extra = {'Q': Q}, {'fixed_projections': True}
    elif fixed_embedding:
        init_extra, gd_extra = {'X': X}, {'fixed_embedding': True}
    else:
        init_extra, gd_extra = {}, {}

    mv = MPSE([X1, X2, X3], **init_extra, verbose=2, sample_colors=X1[:, 0])
    mv.plot_embedding(title='initial embedding')
    mv.gd(**gd_extra, **kwargs)
    mv.plot_computations()
    mv.plot_embedding(title='final embeding')
    mv.plot_images()
    plt.draw()
    plt.pause(0.2)
def add_projections(self, attributes=3, d1=3, X=None, Q=None, **kwargs):
    """\
    Adds attributes obtained by projecting node coordinates.

    Can only be called while the object has no attributes yet. If no
    coordinates X are given, they are generated with misc.disk() in dimension
    d1; otherwise d1 is taken from the number of columns of X. If Q is None or
    a string, the projections are generated (Q is passed on as the generation
    method); otherwise Q must contain one projection per attribute. Each
    projected image is added through self.add_feature().

    The keyword arguments are forwarded to projections.PROJ(), to
    proj.generate() and to self.add_feature().
    """
    assert self.attributes == 0

    if X is not None:
        assert isinstance(X, np.ndarray)
        node_number, dim = X.shape
        assert node_number == self.node_number
        d1 = dim
    else:
        X = misc.disk(self.node_number, dim=d1)
    self.X = X

    # default node colors: first coordinate of the nodes
    if self.node_colors is None:
        self.node_colors = X[:, 0]

    proj = projections.PROJ(d1=d1, **kwargs)
    if Q is not None and not isinstance(Q, str):
        assert len(Q) == attributes
    else:
        Q = proj.generate(number=attributes, method=Q, **kwargs)
    self.Q = Q

    for index in range(attributes):
        image = proj.project(Q[index], X)
        self.add_feature(image, **kwargs)
def comparison():
    """\
    Compares MPSE running times with fixed and with variable projections.

    For every combination of sample size and number of perspectives, and for
    each trial, random disk data is generated and MPSE gradient descent is
    run twice from the same initial embedding: once with the generating
    projections fixed and once with free projections. Costs and times are
    printed and the computation history of every run is plotted.
    """
    n_samples = np.array(10**np.arange(1.5, 4.01, .5), dtype=int)
    n_perspectives = [2, 3, 4, 5]
    trials = 2
    N, K = len(n_samples), len(n_perspectives)
    timef = np.empty((N, K, trials))
    timev = np.empty((N, K, trials))
    proj = projections.PROJ()
    # both runs use the same gradient descent settings
    gd_options = dict(batch_size=20, max_iter=500, min_cost=1e-4)

    for i, size in enumerate(n_samples):
        for j, n_persp in enumerate(n_perspectives):
            for k in range(trials):
                X = misc.disk(size, dim=3)
                Q = proj.generate(number=n_persp, method='random')
                data = proj.project(Q, X)
                X0 = misc.disk(size, dim=3)

                # run with fixed projections
                mvf = mpse.MPSE(data, fixed_projections=Q,
                                initial_embedding=X0)
                mvf.gd(**gd_options)
                timef[i, j, k] = mvf.time
                print(i, j, k, mvf.cost, mvf.time)
                mvf.plot_computations()
                plt.show()

                # run with variable projections
                mvv = mpse.MPSE(data, initial_embedding=X0)
                mvv.gd(**gd_options)
                timev[i, j, k] = mvv.time
                print(mvv.cost, mvv.time)
                mvv.plot_computations()
                plt.show()
def mload(dataset, n_samples=100, n_perspectives=2, **kwargs):
    """\
    Loads or generates a sample data set.

    Parameters
    ----------

    dataset : str
    Name of the data set. One of 'equidistant', 'disk', 'clusters2a',
    'clusters', 'clusters2', '123', 'florence', 'credit', 'phishing',
    'mnist'.

    n_samples : int
    Number of samples (used by the generated data sets, and as upper bound
    for 'phishing', where None means all samples).

    n_perspectives : int
    Number of perspectives (used by the generated data sets). For 'clusters',
    it is replaced by len(n_clusters) when n_clusters is not an int.

    **kwargs
    Data set specific options. 'clusters' and 'clusters2' require
    n_clusters (an int, or one value per perspective); for 'clusters' the
    remaining keywords are forwarded to clusters.clusters(). For 'mnist' all
    keywords are forwarded to mnist().

    Returns
    -------

    distances : list
    Distance or feature data, one entry per perspective. Empty if the data
    set name is not recognized (a message is printed in that case).

    data : dict
    Additional information about the data set (colors, labels, true
    embedding/projections, ...); the keys depend on the data set.

    Raises
    ------

    ValueError
    If dataset is 'clusters' or 'clusters2' and n_clusters is not given.
    """
    distances = []
    data = {}

    if dataset == 'equidistant':
        length = n_samples * (n_samples - 1) // 2
        for _ in range(n_perspectives):
            distances.append(np.random.normal(1, 0.1, length))
        data['image_colors'] = n_samples - 1

    elif dataset == 'disk':
        import misc
        import projections
        X = misc.disk(n_samples, dim=3)
        proj = projections.PROJ()
        Q = proj.generate(number=n_perspectives, method='random')
        Y = proj.project(Q, X)
        data['true_images'] = Y
        data['true_embedding'] = X
        data['true_projections'] = Q
        distances = Y
        data['image_colors'] = 0

    elif dataset == 'clusters2a':
        from clusters import createClusters
        # the result used to be bound to an unused name, so this branch
        # always returned an empty list of distances
        distances, data['image_colors'] = \
            createClusters(n_samples, n_perspectives)

    elif dataset == 'clusters':
        from clusters import clusters
        if 'n_clusters' not in kwargs:
            raise ValueError(
                "dataset 'clusters' requires the keyword n_clusters")
        n_clusters = kwargs.pop('n_clusters')
        if isinstance(n_clusters, int):
            n_clusters = [n_clusters] * n_perspectives
        else:
            n_perspectives = len(n_clusters)
        data['image_classes'] = []
        data['image_colors'] = []
        for i in range(n_perspectives):
            # keyword was misspelled 'n_clustesr'
            d, c = clusters(n_samples, n_clusters=n_clusters[i], **kwargs)
            distances.append(d)
            data['image_classes'].append(c)
            data['image_colors'].append(c)

    elif dataset == 'clusters2':
        from clusters import clusters2
        if 'n_clusters' not in kwargs:
            raise ValueError(
                "dataset 'clusters2' requires the keyword n_clusters")
        n_clusters = kwargs['n_clusters']
        if isinstance(n_clusters, int):
            n_clusters = [n_clusters] * n_perspectives
        data['image_colors'] = []
        for persp in range(n_perspectives):
            d, c = clusters2(n_samples, n_clusters[persp])
            distances.append(d)
            data['image_colors'].append(c)

    elif dataset == '123':
        import projections
        X = np.genfromtxt(directory + '/123/123.csv', delimiter=',')
        X1 = np.genfromtxt(directory + '/123/1.csv', delimiter=',')
        X2 = np.genfromtxt(directory + '/123/2.csv', delimiter=',')
        X3 = np.genfromtxt(directory + '/123/3.csv', delimiter=',')
        proj = projections.PROJ()
        Q = proj.generate(number=3, method='cylinder')
        distances = [X1, X2, X3]
        data['true_embedding'] = X
        data['true_projections'] = Q
        data['true_images'] = [X1, X2, X3]
        data['colors'] = True

    elif dataset == 'florence':
        import florence
        distances, dictf = florence.setup()
        data.update(dictf)

    elif dataset == 'credit':
        import csv
        path = directory + '/credit/'
        Y = []
        for ind in ['1', '2', '3']:
            filename = path + 'discredit3_tsne_cluster_1000_' + ind + '.csv'
            # context manager so that the file is closed after reading
            with open(filename, newline='') as filec:
                array = np.array(list(csv.reader(filec)), dtype='float')
            # small random perturbation of the entries
            array += np.random.randn(len(array), len(array)) * 1e-4
            Y.append(array)
        distances = Y

    elif dataset == 'phishing':
        import phishing
        features = phishing.features
        labels = phishing.group_names
        if n_samples is None:
            n_samples = len(features[0])
        Y, perspective_labels = [], []
        for group in [0, 1, 2, 3]:
            Y.append(features[group][0:n_samples])
            perspective_labels.append(labels[group])
        sample_colors = phishing.results[0:n_samples]
        distances = Y
        data['sample_colors'] = sample_colors
        data['perspective_labels'] = perspective_labels

    elif dataset == 'mnist':
        X, data['sample_colors'] = mnist(**kwargs)
        data['features'] = X
        # two perspectives: the first 28*14 columns and the remaining ones
        distances = [X[:, 0:28 * 14], X[:, 28 * 14::]]
        data['sample_classes'] = data['sample_colors']

    else:
        print('***dataset not found***')

    return distances, data
def __init__(self, data, weights=None, data_args=None,
             fixed_embedding=None, fixed_projections=None,
             initial_embedding=None, initial_projections=None,
             visualization_method='mds', visualization_args=None,
             total_cost_function='rms',
             embedding_dimension=3, image_dimension=2,
             projection_family='linear', projection_constraint='orthogonal',
             hidden_samples=None,
             sample_labels=None, perspective_labels=None,
             sample_colors=None, image_colors=None,
             verbose=0, indent='', **kwargs):
    """\
    Initializes MPSE object.

    Parameters
    ----------

    data : list, length (n_perspectives)
    List containing distance/dissimilarity/feature data for each
    perspective. Each array can be of the following forms:
    1) A 1D condensed distance array
    2) A square distance matrix
    3) An array containing features
    ***4) A dictionary describing a graph

    weights : None or string or function or array or list
    If visualization allows for it, weights to be used in computation of
    cost/gradient of each perspective. If a list or array is given, it must
    have length equal to the number of perspectives (one entry per
    perspective); it is copied, not modified. Otherwise, the same weights are
    used for all perspectives. Each entry is processed by
    setup.setup_weights (see there for the accepted forms).

    data_args : dictionary (optional) or list
    Optional arguments passed, together with data, to
    setup.setup_distances_from_multiple_perspectives().

    fixed_embedding : array
    If an array is given, this is assumed to be the true embedding and by
    default optimization is done w.r.t. the projections only.

    fixed_projections : list or str
    If a list is given, this is assumed to be the true projections and by
    default optimization is done w.r.t. the embedding coordinates only. Each
    projection is truncated to shape (image_dimension, embedding_dimension).
    If a string is given, the projections are generated using it as method.

    initial_embedding : array
    If given, this is the initial embedding used (ignored if fixed_embedding
    is given). Must have shape (n_samples, embedding_dimension).

    initial_projections : list or str
    If given, this is the initial projections used (ignored if
    fixed_projections is given). If a string is given, the projections are
    generated using it as method.

    visualization_method : str or list
    Visualization method. Current options are 'mds' and 'tsne'. The
    visualization method can be different for different perspectives, by
    passing a list of visualization methods instead.

    visualization_args : dict or list (optional)
    Dictionary with arguments to pass to each visualization method.
    Different arguments can be passed to different visualization methods by
    passing a list of dictionaries instead.

    total_cost_function : str or callable
    How individual costs are combined into the total cost. If 'rms', the
    root mean square of the individual costs is used. Otherwise, a callable
    taking the array of individual costs.

    embedding_dimension : int
    Dimension of embedding.

    image_dimension : int
    Dimension of image (after projection).

    projection_family : str
    Projection family. Options are 'linear'.

    projection_constraint : str
    Constraints on projection family. Options are None, 'orthogonal',
    'similar'.

    hidden_samples : list (optional)
    If given, must be a list of length n_perspectives. Stored as is.

    sample_labels : list (optional)
    List containing labels of samples (used in plots).

    perspective_labels : list (optional)
    Labels of the perspectives. Defaults to 1,...,n_perspectives.

    sample_colors : array (optional)
    Array containing color value of samples (used in plots).

    image_colors : array-like, shape (n_perspectives, n_samples)
    Colors for each image.

    verbose : int
    If larger than 0, progress information is printed.

    indent : str
    Prefix for printed lines.

    **kwargs
    Forwarded to self.proj.generate() when random initial projections are
    generated, and to self.update().
    """
    self.verbose, self.indent = verbose, indent
    if verbose > 0:
        print(indent+'mview.MPSE():')

    ##set up sets of distances from data
    self.distances = setup.setup_distances_from_multiple_perspectives(
        data, data_args)
    self.n_perspectives = len(self.distances)
    self.n_samples = scipy.spatial.distance.num_obs_y(self.distances[0])

    ##set up weights from data
    if isinstance(weights, (list, np.ndarray)):
        assert len(weights) == self.n_perspectives
        # copy: the entries are replaced below and the caller's container
        # must not be modified
        self.weights = list(weights)
    else:
        self.weights = [weights]*self.n_perspectives
    for i in range(self.n_perspectives):
        self.weights[i] = setup.setup_weights(
            self.distances[i], self.weights[i], min_weight=0)

    ##set up parameters
    self.embedding_dimension = embedding_dimension
    self.image_dimension = image_dimension
    self.projection_family = projection_family
    self.projection_constraint = projection_constraint
    proj = projections.PROJ(embedding_dimension, image_dimension,
                            projection_family, projection_constraint)
    self.proj = proj

    ##set up hidden samples
    if hidden_samples is not None:
        assert isinstance(hidden_samples, list)
        assert len(hidden_samples) == self.n_perspectives
    self.hidden_samples = hidden_samples

    if verbose > 0:
        print(indent+' data details:')
        print(indent+f' number of perspectives : {self.n_perspectives}')
        print(indent+f' number of samples : {self.n_samples}')
        print(indent+' visualization details:')
        print(indent+' embedding dimension :',self.embedding_dimension)
        print(indent+f' image dimension : {self.image_dimension}')
        print(indent+f' visualization type : {visualization_method}')

    #setup sample labels:
    if sample_labels is not None:
        assert len(sample_labels) == self.n_samples
    self.sample_labels = sample_labels

    #setup perspective labels:
    if perspective_labels is None:
        perspective_labels = range(1, self.n_perspectives+1)
    else:
        assert len(perspective_labels) == self.n_perspectives
    self.perspective_labels = perspective_labels

    #setup colors:
    self.sample_colors = sample_colors
    self.image_colors = image_colors

    #setup visualization instances:
    self.visualization_instances = []
    self.visualization_method = visualization_method
    if visualization_args is None:
        # fresh dictionary per call instead of a shared mutable default
        visualization_args = {}
    if isinstance(visualization_method, str):
        visualization_method = [visualization_method]*self.n_perspectives
    if isinstance(visualization_args, dict):
        visualization_args = [visualization_args]*self.n_perspectives
    for i in range(self.n_perspectives):
        assert visualization_method[i] in ['mds', 'tsne']
        if self.verbose > 0:
            print(' setup visualization instance for perspective',
                  self.perspective_labels[i],':')
        if visualization_method[i] == 'mds':
            vis = mds.MDS(self.distances[i],
                          weights=self.weights[i],
                          embedding_dimension=self.image_dimension,
                          verbose=self.verbose, indent=self.indent+' ',
                          **visualization_args[i])
        elif visualization_method[i] == 'tsne':
            vis = tsne.TSNE(self.distances[i],
                            embedding_dimension=self.image_dimension,
                            verbose=self.verbose, indent=self.indent+' ',
                            **visualization_args[i])
        self.visualization_instances.append(vis)
    self.visualization = self.visualization_instances

    #setup objectives:
    if total_cost_function == 'rms':
        self.total_cost_function = lambda individual_costs: \
            np.sqrt(np.sum(individual_costs**2)/self.n_perspectives)
    else:
        assert callable(total_cost_function)
        self.total_cost_function = total_cost_function

    def cost_function(X, Q, Y=None, **kwargs):
        # total cost and individual costs of embedding X with projections Q
        # (images Y are computed from X and Q unless given)
        if Y is None:
            Y = self.proj.project(Q, X)
        individual_costs = np.zeros(self.n_perspectives)
        for k in range(self.n_perspectives):
            individual_costs[k] = \
                self.visualization[k].objective(Y[k], **kwargs)
        cost = self.total_cost_function(individual_costs)
        return cost, individual_costs
    self.cost_function = cost_function

    #setup gradient function:
    if self.projection_family == 'linear':
        def gradient(embedding, projections, batch_size=None, indices=None,
                     return_embedding=True, return_projections=True,
                     return_cost=True, return_individual_costs=False):
            """\
            Returns MPSE gradient(s), along with cost and individual costs
            (optional).

            Parameters
            ----------

            embedding : numpy array
            Current embedding.

            projections : numpy array
            Current projections (as a single array).

            batch_size, indices
            Forwarded to the gradient of each visualization instance.

            return_embedding : boolean
            If True, returns MPSE gradient w.r.t. embedding.

            return_projections : boolean
            If True, returns MPSE gradient w.r.t. projections.

            return_cost : boolean
            Currently unused; the cost is always returned.

            return_individual_costs : boolean
            If True, returns individual embedding costs.

            Returns
            -------

            grad : the gradient w.r.t. the embedding, w.r.t. the projections,
            or a list [dX, dQ] with both, depending on the return flags
            (None if both flags are false).
            cost : total cost.
            individual_costs : only if return_individual_costs is True.
            """
            # slice bounds follow the configured dimensions (they used to
            # be hard-coded to 2 and 3, the default dimensions)
            rows, cols = self.image_dimension, self.embedding_dimension
            if return_embedding:
                dX = np.zeros(embedding.shape)
            if return_projections:
                dQ = []
            individual_costs = np.empty(self.n_perspectives)
            Y = self.proj.project(projections, embedding)
            for k in range(self.n_perspectives):
                dY_k, cost_k = self.visualization[k].gradient(
                    Y[k], batch_size=batch_size, indices=indices)
                individual_costs[k] = cost_k
                if return_embedding:
                    dX += dY_k @ projections[k][:rows, :cols]
                if return_projections:
                    dQ.append(dY_k.T @ embedding)
            if return_embedding:
                dX /= self.n_perspectives
            cost = self.total_cost_function(individual_costs)
            if return_embedding and return_projections:
                grad = [dX, np.array(dQ)]
            elif return_embedding:
                grad = dX
            elif return_projections:
                grad = np.array(dQ)
            else:
                # nothing requested (this case used to raise a NameError)
                grad = None
            if return_individual_costs:
                return grad, cost, individual_costs
            else:
                return grad, cost
        self.gradient = gradient
    else:
        # NOTE(review): this branch calls self.proj.project() and
        # visualization gradient() differently from the linear branch above
        # (argument order, tuple return value) -- verify before relying on
        # a non-linear projection family.
        def gradient_X(X, Q, Y=None):
            pgradient = self.proj.compute_gradient(X[0], params_list=Q)
            if Y is None:
                Y = self.proj.project(X, params_list=Q)
            gradient = np.zeros((self.n_samples, self.embedding_dimension))
            for k in range(self.n_perspectives):
                gradient += self.visualization[k].gradient(Y[k]) \
                    @ pgradient[k]
            return gradient
        self.gradient_X = gradient_X

    #set up initial embedding and projections (fixed optional):
    if verbose > 0:
        print(indent+' initialize:')

    #set fixed and initial embedding:
    if fixed_embedding is not None:
        if verbose > 0:
            print(indent+' fixed embedding : True')
        self.embedding = fixed_embedding
        self.initial_embedding = fixed_embedding
        self.fixed_embedding = True
    else:
        if verbose > 0:
            print(indent+' fixed embedding : False')
        if initial_embedding is None:
            if verbose > 0:
                print(indent+' initial embedding : random')
            self.initial_embedding = misc.initial_embedding(
                self.n_samples, dim=self.embedding_dimension,
                radius=1)
        else:
            assert isinstance(initial_embedding, np.ndarray)
            assert initial_embedding.shape == (
                self.n_samples, self.embedding_dimension)
            if verbose > 0:
                print(indent+' initial embedding : given')
            self.initial_embedding = initial_embedding
        self.embedding = self.initial_embedding
        self.fixed_embedding = False

    #set fixed and initial projections:
    if fixed_projections is not None:
        if isinstance(fixed_projections, str):
            fixed_projections = self.proj.generate(
                number=self.n_perspectives, method=fixed_projections)
        assert all(isinstance(fp, np.ndarray) for fp in fixed_projections)
        # truncate to (image_dimension, embedding_dimension); the bounds
        # used to be hard-coded to 2 and 3, the default dimensions
        fixed_projections = [
            f[:self.image_dimension, :self.embedding_dimension]
            for f in fixed_projections]
        self.projections = fixed_projections
        self.initial_projections = fixed_projections
        self.fixed_projections = True
        if verbose > 0:
            print(indent+' fixed projections : True')
    else:
        if verbose > 0:
            print(indent+' fixed projections : False')
        if initial_projections is None:
            if verbose > 0:
                print(indent+' initial projections : random')
            self.initial_projections = self.proj.generate(
                number=self.n_perspectives, **kwargs)
        else:
            if verbose > 0:
                print(indent+' initial projections : given')
            if isinstance(initial_projections, str):
                initial_projections = self.proj.generate(
                    number=self.n_perspectives,
                    method=initial_projections)
            self.initial_projections = initial_projections
        self.projections = self.initial_projections
        self.fixed_projections = False

    # diagnostic output, only shown when verbose (like every other message)
    if verbose > 0:
        print(indent+' Projection is:')
        print(self.projections)

    self.initial_cost = None
    self.initial_individual_cost = None
    self.computation_history = []
    self.time = 0
    self.update(**kwargs)