from scipy.sparse import csc_matrix


def split_af(_af, _inds):
    """
    Splits the input matrix into diagonal and off-diagonal portions,
    with the split being determined by _inds.

    :param _af: sparse matrix to split
    :param _inds: indices whose rows *and* columns define the separated block
    :return: (_af_non, _af_scc), two matrices whose sum equals _af
    """
    _af = _af.tocoo()
    _r = _af.row
    _c = _af.col
    _d = _af.data
    _d_non = []
    _d_scc = []
    _shape = _af.shape
    for i in range(len(_d)):
        if _r[i] in _inds and _c[i] in _inds:
            _d_non.append(0)
            _d_scc.append(_d[i])
        else:
            _d_non.append(_d[i])
            _d_scc.append(0)
    _af_non = csc_matrix((_d_non, (_r, _c)), shape=_shape)
    _af_scc = csc_matrix((_d_scc, (_r, _c)), shape=_shape)
    assert (_af_non + _af_scc - _af).nnz == 0
    return _af_non, _af_scc
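# A minimal usage sketch for split_af with made-up data (assumes scipy is
# installed); the two returned matrices partition the entries of the input:
from scipy.sparse import csc_matrix

af = csc_matrix([[1.0, 2.0, 0.0],
                 [0.0, 3.0, 4.0],
                 [5.0, 0.0, 6.0]])
non, scc = split_af(af, {0, 1})   # rows/cols 0 and 1 form the separated block
assert (non + scc - af).nnz == 0  # the split is exact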
def test_bfs(self):
    a = igl.adjacency_matrix(self.f1)
    p, d = igl.bfs(a, 0)
    self.assertEqual(p.shape, (self.v1.shape[0],))
    self.assertEqual(d.shape, (self.v1.shape[0],))

    try:
        p, d = igl.bfs(a, -1)
        self.assertTrue(False)
    except IndexError:
        pass

    a = csc.csc_matrix(np.zeros([0, 0], dtype=np.int32))
    try:
        p, d = igl.bfs(a, 0)
        self.assertTrue(False)
    except ValueError:
        pass

    a = csc.csc_matrix(np.zeros([10, 11], dtype=np.int32))
    try:
        p, d = igl.bfs(a, 0)
        self.assertTrue(False)
    except ValueError:
        pass

    a = csc.csc_matrix(np.zeros([10, 10], dtype=np.int32))
    p, d = igl.bfs(a, 0)
    self.assertEqual(p.shape, ())
    self.assertTrue(np.array_equal(d, -np.ones(10)))
    self.assertTrue(p.flags.c_contiguous)
def __init__(self, positions, masses, springs, fixed, method, profiling_rate=0):
    """
    method: 'Newton' | 'FMS' | 'Jacobi' | 'Anderson'
    profiling_rate: the step rate at which the performance will be graphed
    """
    # General state setup
    self.profiling_rate = profiling_rate
    self.method = method
    self.g = -9.8
    self.m = len(positions)
    self.s = len(springs)
    self.q0 = np.array(positions).reshape(self.ndim * self.m, 1)
    self.q = copy(self.q0)

    # Store initial state
    self.state0 = copy(self.q)

    self.fixed = fixed
    self.qFixed = copy(self.q0)
    self.M = kron(diags(masses), np.eye(self.ndim), format='csc')
    self.Minv = kron(diags(list(map(lambda x: 0 if x == 0 else 1.0 / x, masses))),
                     np.eye(self.ndim), format='csc')
    self.springs = np.array(springs, dtype=spring_type)
    self.d = np.empty((self.ndim * self.s, 1))

    # Matrices L and J setup
    self.L = csc_matrix((self.m, self.m))
    self.J = csc_matrix((self.m, self.s))
    for idx, s in enumerate(self.springs):
        if idx in self.fixed:
            Ai = csc_matrix(([1], ([s['p1']], [0])), shape=(self.m, 1))
        else:
            Ai = csc_matrix(([1, -1], ([s['p1'], s['p2']], [0, 0])),
                            shape=(self.m, 1))
        self.L += s['k'] * Ai * Ai.transpose()
        self.J += s['k'] * Ai * csc_matrix(([1.0], ([idx], [0])),
                                           shape=(self.s, 1)).transpose()
    self.L = kron(self.L, np.eye(self.ndim), format='csc')
    self.J = kron(self.J, np.eye(self.ndim), format='csc')

    # Matrix A precomputation (Global step)
    self.A = self.M + self.dt2 * self.L
    self.Ch = cho_factor(self.A.toarray())

    # Implemented methods
    self.implemented = {
        'FMS': self.step_LocalGlobal,
        'Jacobi': self.step_Jacobi,
        'Newton': self.step_Newton,
        'Anderson': self.step_Anderson
    }
def load_dataset(fname):
    z = np.load(open(fname, 'rb'))
    X_train = z['arr_0']
    X_test = z['arr_1']
    y_train = z['arr_2']
    y_test = z['arr_3']
    X_train = csc.csc_matrix(X_train.tolist())
    X_test = csc.csc_matrix(X_test.tolist())
    return X_train, X_test, y_train, y_test
def load_lda_dataset_small(uid, neg_to_pos_rate):
    fname = 'ldads_small%d_%d' % (neg_to_pos_rate, uid)
    fname = join(DATASETS_FOLDER, '%s.npz' % fname)
    z = np.load(open(fname, 'rb'))
    X_train_lda = z['arr_0']
    X_test_lda = z['arr_1']
    y_train = z['arr_2']
    y_test = z['arr_3']
    X_train_lda = csc.csc_matrix(X_train_lda.tolist())
    X_test_lda = csc.csc_matrix(X_test_lda.tolist())
    return X_train_lda, X_test_lda, y_train, y_test
def load_lda_dataset_big(uid, train_size):
    fname = 'ldads_%d' % uid
    if train_size:
        fname += '_tr%d' % train_size
    fname = join(DATASETS_FOLDER, '%s.npz' % fname)
    z = np.load(open(fname, 'rb'))
    X_train_lda = z['arr_0']
    X_test_lda = z['arr_1']
    y_train = z['arr_2']
    y_test = z['arr_3']
    X_train_lda = csc.csc_matrix(X_train_lda.tolist())
    X_test_lda = csc.csc_matrix(X_test_lda.tolist())
    return X_train_lda, X_test_lda, y_train, y_test
def positive_mass_stiffness_smooth(triangles, vertices, nb_iter=1,
                                   diffusion_step=1.0, flow_file=None,
                                   gaussian_threshold=0.2, angle_threshold=1.0):
    vertices_csc = csc_matrix(vertices)
    curvature_normal_mtx = mean_curvature_normal_matrix(
        triangles, vertices, area_weighted=False)
    # mass_mtx = mass_matrix(triangles, vertices_csc)

    # 'long' only exists in Python 2; (int, float) covers the same cases here
    if isinstance(diffusion_step, (int, float)):
        diffusion_step = diffusion_step * np.ones(len(vertices))

    if flow_file is not None:
        mem_map = np.memmap(flow_file, dtype=G_DTYPE, mode='w+',
                            shape=(nb_iter, vertices.shape[0], vertices.shape[1]))

    for i in range(nb_iter):
        stdout.write("\r step %d on %d done" % (i, nb_iter))
        stdout.flush()
        if flow_file is not None:
            mem_map[i] = vertices_csc.todense()

        # third try
        mass_mtx = mass_matrix(triangles, vertices_csc)
        pos_curv = vertices_cotan_curvature(triangles, vertices_csc, False) > -G_ATOL

        if gaussian_threshold is not None:
            # Gaussian threshold: maximum value PI, cube corner = PI/2 = 0.8
            deg_vts = np.abs(vertices_gaussian_curvature(
                triangles, vertices_csc, False)) > gaussian_threshold
            pos_curv = np.logical_or(pos_curv, deg_vts)

        if angle_threshold is not None:
            # angle_threshold: PI, cube corner = PI/2 = 1.7
            deg_seg = edge_triangle_normal_angle(
                triangles, vertices_csc).max(1).toarray().squeeze() > angle_threshold
            pos_curv = np.logical_or(pos_curv, deg_seg)

        positive_diffusion_step = pos_curv * diffusion_step
        # (D - d*L)*y = D*x = b
        A_matrix = mass_mtx - \
            (diags(positive_diffusion_step, 0).dot(curvature_normal_mtx))
        b_matrix = mass_mtx.dot(vertices_csc)
        vertices_csc = spsolve(A_matrix, b_matrix)

    stdout.write("\r step %d on %d done \n" % (nb_iter, nb_iter))
    return vertices_csc.toarray()
def log_det_estimate_shogun(Q):
    logging.debug("Entering")
    op = RealSparseMatrixOperator(csc_matrix(Q))
    engine = SerialComputationEngine()
    linear_solver = CGMShiftedFamilySolver()
    accuracy = 1e-3
    eigen_solver = LanczosEigenSolver(op)
    eigen_solver.set_min_eigenvalue(OzonePosterior.ridge)
    op_func = LogRationalApproximationCGM(op, engine, eigen_solver,
                                          linear_solver, accuracy)

    # limit computation time
    linear_solver.set_iteration_limit(1000)
    eigen_solver.set_max_iteration_limit(1000)

    logging.info("Computing Eigenvalues (only largest)")
    eigen_solver.compute()

    trace_sampler = ProbingSampler(op)
    log_det_estimator = LogDetEstimator(trace_sampler, op_func, engine)
    n_estimates = 1

    logging.info("Sampling log-determinant with probing vectors and "
                 "rational approximation")
    estimates = log_det_estimator.sample(n_estimates)

    logging.debug("Leaving")
    return mean(estimates)
def curvature_normal_smooth(triangles, vertices, nb_iter=1, diffusion_step=1.0,
                            area_weighted=False, backward_step=False,
                            flow_file=None):
    if flow_file is not None:
        mem_map = np.memmap(flow_file, dtype=G_DTYPE, mode='w+',
                            shape=(nb_iter, vertices.shape[0], vertices.shape[1]))

    vertices_csc = csc_matrix(vertices)
    if isinstance(diffusion_step, (int, float)):
        diffusion_step = diffusion_step * np.ones(len(vertices))

    for i in range(nb_iter):
        stdout.write("\r step %d on %d done" % (i, nb_iter))
        stdout.flush()
        if flow_file is not None:
            mem_map[i] = vertices_csc.toarray()

        # get curvature_normal_matrix
        curvature_normal_mtx = mean_curvature_normal_matrix(
            triangles, vertices_csc, area_weighted=area_weighted)
        next_vertices_csc = euler_step(
            curvature_normal_mtx, vertices_csc, diffusion_step, backward_step)
        vertices_csc = next_vertices_csc

    stdout.write("\r step %d on %d done \n" % (nb_iter, nb_iter))
    # return next_vertices_csc
    return vertices_csc.toarray()
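# euler_step is not shown in this section. A plausible sketch under the usual
# explicit/implicit Euler update for mesh flows (x' = x + D*L*x forward,
# (I - D*L) x' = x backward); the name and exact semantics here are
# assumptions, not the original implementation:
from scipy.sparse import diags, identity
from scipy.sparse.linalg import spsolve

def euler_step_sketch(L_mtx, vertices_csc, diffusion_step, backward_step):
    # D holds one (possibly per-vertex) step size on the diagonal
    D = diags(diffusion_step, 0)
    if backward_step:
        # implicit step: solve (I - D*L) x' = x
        A = identity(L_mtx.shape[0]) - D.dot(L_mtx)
        return spsolve(A, vertices_csc)
    # explicit step: x' = x + D*L*x
    return vertices_csc + D.dot(L_mtx.dot(vertices_csc))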
def volume_curvature_normal_smooth(triangles, vertices, nb_iter=1,
                                   diffusion_step=1.0, area_weighted=False,
                                   backward_step=False, flow_file=None):
    if isinstance(diffusion_step, (int, float)):
        diffusion_step = diffusion_step * np.ones(len(vertices))

    if flow_file is not None:
        mem_map = np.memmap(flow_file, dtype=G_DTYPE, mode='w+',
                            shape=(nb_iter, vertices.shape[0], vertices.shape[1]))

    for i in range(nb_iter):
        stdout.write("\r step %d on %d done" % (i, nb_iter))
        stdout.flush()
        if flow_file is not None:
            mem_map[i] = vertices

        # get curvature_normal_matrix
        # todo not optimal, because operation done twice etc
        curvature_normal_mtx = mean_curvature_normal_matrix(
            triangles, vertices, area_weighted=area_weighted)

        # do the first step
        next_vertices = euler_step(curvature_normal_mtx, csc_matrix(vertices),
                                   diffusion_step, backward_step).toarray()

        # test if direction is positive
        direction = next_vertices - vertices
        normal_dir = vertices_cotan_normal(triangles, vertices, normalize=True)
        dotv = dot(normalize_vectors(direction), normal_dir, keepdims=True)
        vertices += direction * np.maximum(0.0, -dotv)

    stdout.write("\r step %d on %d done \n" % (nb_iter, nb_iter))
    return vertices
def dtm_to_gensim_corpus(dtm):
    """
    Convert a (sparse) DTM to a Gensim Corpus object.

    .. seealso:: :func:`~tmtoolkit.bow.dtm.gensim_corpus_to_dtm` for the reverse function or
                 :func:`~tmtoolkit.bow.dtm.dtm_and_vocab_to_gensim_corpus_and_dict` which additionally creates a
                 Gensim :class:`~gensim.corpora.dictionary.Dictionary`.

    :param dtm: (sparse) document-term-matrix of size NxM (N docs, M is vocab size) with raw term counts
    :return: a Gensim :class:`gensim.matutils.Sparse2Corpus` object
    """
    import gensim

    # DTM with documents-to-words sparse matrix in COO format has to be
    # converted to a transposed sparse matrix in CSC format
    dtm_t = dtm.transpose()

    if issparse(dtm_t):
        if dtm_t.format != 'csc':
            dtm_sparse = dtm_t.tocsc()
        else:
            dtm_sparse = dtm_t
    else:
        from scipy.sparse.csc import csc_matrix
        dtm_sparse = csc_matrix(dtm_t)

    return gensim.matutils.Sparse2Corpus(dtm_sparse)
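# A minimal usage sketch for dtm_to_gensim_corpus (assumes gensim and scipy
# are installed); the DTM values are made up:
from scipy.sparse import csc_matrix

dtm = csc_matrix([[2, 0, 1],   # doc 0: term 0 twice, term 2 once
                  [0, 1, 0]])  # doc 1: term 1 once
corpus = dtm_to_gensim_corpus(dtm)
for doc in corpus:
    print(doc)  # one list of (term_id, count) pairs per document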
def solve_sparse_linear_system_shogun(A, b):
    logging.debug("Entering")
    solver = DirectSparseLinearSolver()
    operator = RealSparseMatrixOperator(csc_matrix(A))
    result = solver.solve(operator, b)
    logging.debug("Leaving")
    return result
def test_from_csc1():
    from siconos.numerics import SBM_from_csparse, SBM_get_value
    from scipy.sparse.csc import csc_matrix

    M = csc_matrix([[1, 2, 3],
                    [4, 5, 6],
                    [7, 8, 9]])
    # print(M.indices)
    # print(M.indptr)
    # print(M.data)

    blocksize = 3
    r, SBM = SBM_from_csparse(blocksize, M)
    assert SBM_get_value(SBM, 0, 0) == 1
    assert SBM_get_value(SBM, 0, 1) == 2
    assert SBM_get_value(SBM, 0, 2) == 3
    assert SBM_get_value(SBM, 1, 0) == 4
    assert SBM_get_value(SBM, 1, 1) == 5
    assert SBM_get_value(SBM, 1, 2) == 6
    assert SBM_get_value(SBM, 2, 0) == 7
    assert SBM_get_value(SBM, 2, 1) == 8
    assert SBM_get_value(SBM, 2, 2) == 9
def load_results(f_results):
    """
    Load results from CNMF on various FOVs and merge them after some preprocessing.
    """
    # load data
    i = 0
    A_s = []
    C_s = []
    YrA_s = []
    Cn_s = []
    shape = None
    b_s = []
    f_s = []
    for f_res in f_results:
        print(f_res)
        i += 1
        with np.load(f_res) as ld:
            A_s.append(csc.csc_matrix(ld['A2']))
            C_s.append(ld['C2'])
            YrA_s.append(ld['YrA'])
            Cn_s.append(ld['Cn'])
            b_s.append(ld['b2'])
            f_s.append(ld['f2'])
            if shape is not None:
                shape_new = (ld['d1'], ld['d2'])
                if shape_new != shape:
                    raise Exception('Shapes of FOVs not matching')
                else:
                    shape = shape_new
            else:
                shape = (ld['d1'], ld['d2'])
    return A_s, C_s, YrA_s, Cn_s, b_s, f_s, shape
def mass_stiffness_smooth(triangles, vertices, nb_iter=1, diffusion_step=1.0,
                          flow_file=None):
    vertices_csc = csc_matrix(vertices)
    curvature_normal_mtx = mean_curvature_normal_matrix(
        triangles, vertices_csc, area_weighted=False)
    # mass_mtx = mass_matrix(triangles, vertices_csc).astype(float)

    if isinstance(diffusion_step, (int, float)):
        diffusion_step = diffusion_step * np.ones(len(vertices))

    if flow_file is not None:
        mem_map = np.memmap(flow_file, dtype=G_DTYPE, mode='w+',
                            shape=(nb_iter, vertices.shape[0], vertices.shape[1]))

    for i in range(nb_iter):
        stdout.write("\r step %d on %d done" % (i, nb_iter))
        stdout.flush()
        if flow_file is not None:
            mem_map[i] = vertices_csc.toarray()

        # get curvature_normal_matrix
        # np.float was removed in NumPy 1.24; the builtin float is equivalent
        mass_mtx = mass_matrix(triangles, vertices_csc).astype(float)

        # (D - d*L)*y = D*x = b
        A_matrix = mass_mtx - \
            (diags(diffusion_step, 0).dot(curvature_normal_mtx))
        b_matrix = mass_mtx.dot(vertices_csc)
        vertices_csc = spsolve(A_matrix, b_matrix)

    stdout.write("\r step %d on %d done \n" % (nb_iter, nb_iter))
    # return next_vertices_csc
    return vertices_csc.toarray()
def CSCfromCompactRepresentation(diag_vals, upper_rows, upper_cols, upper_vals):
    n = len(diag_vals)
    rows = np.concatenate((np.arange(n), upper_rows, upper_cols))
    cols = np.concatenate((np.arange(n), upper_cols, upper_rows))
    ij = np.vstack((rows, cols))
    vals = np.concatenate((diag_vals, upper_vals, upper_vals))
    return csc_matrix((vals, ij), shape=(n, n))
def positive_curvature_normal_smooth(triangles, vertices, nb_iter=1,
                                     diffusion_step=1.0, area_weighted=False,
                                     backward_step=False, flow_file=None):
    if flow_file is not None:
        mem_map = np.memmap(flow_file, dtype=G_DTYPE, mode='w+',
                            shape=(nb_iter, vertices.shape[0], vertices.shape[1]))

    if isinstance(diffusion_step, (int, float)):
        diffusion_step = diffusion_step * np.ones(len(vertices))

    curvature_normal_mtx = mean_curvature_normal_matrix(
        triangles, vertices, area_weighted=area_weighted)

    for i in range(nb_iter):
        stdout.write("\r step %d on %d done" % (i, nb_iter))
        stdout.flush()
        if flow_file is not None:
            mem_map[i] = vertices

        # do the first step
        next_vertices = euler_step(curvature_normal_mtx, csc_matrix(vertices),
                                   diffusion_step, backward_step).toarray()

        # test if direction is positive
        direction = next_vertices - vertices
        normal_dir = vertices_normal(triangles, next_vertices, normalize=False)
        pos_curv = dot(direction, normal_dir, keepdims=True) < 0
        vertices += direction * pos_curv

    stdout.write("\r step %d on %d done \n" % (nb_iter, nb_iter))
    return vertices
def volume_mass_stiffness_smooth(triangles, vertices, nb_iter=1,
                                 diffusion_step=1.0, flow_file=None):
    vertices_csc = csc_matrix(vertices)
    curvature_normal_mtx = mean_curvature_normal_matrix(triangles, vertices,
                                                        area_weighted=False)
    if isinstance(diffusion_step, (int, float)):
        diffusion_step = diffusion_step * np.ones(len(vertices))

    if flow_file is not None:
        mem_map = np.memmap(flow_file, dtype=G_DTYPE, mode='w+',
                            shape=(nb_iter, vertices.shape[0], vertices.shape[1]))

    for i in range(nb_iter):
        stdout.write("\r step %d on %d done" % (i, nb_iter))
        stdout.flush()
        if flow_file is not None:
            mem_map[i] = vertices_csc.toarray()

        # get curvature_normal_matrix
        mass_mtx = mass_matrix(triangles, vertices)
        raise NotImplementedError()
        # everything below is unreachable until the implementation is finished
        # (D - d*L)*y = D*x = b
        A_matrix = mass_mtx - \
            diags(diffusion_step, 0).dot(curvature_normal_mtx)
        b_matrix = mass_mtx.dot(csc_matrix(vertices_csc))
        next_vertices = spsolve(A_matrix, b_matrix)

        # test if direction is positive
        direction = next_vertices.toarray() - vertices_csc
        normal_dir = vertices_cotan_normal(triangles, next_vertices,
                                           normalize=True)
        dotv = normalize_vectors(direction).multiply(normal_dir)
        vertices_csc += direction * np.maximum(0.0, -dotv)
        # vertices_csc += direction * sigmoid(-np.arctan(dotv)*np.pi - np.pi)
        # vertices_csc += direction * softplus(-dotv)

    stdout.write("\r step %d on %d done \n" % (nb_iter, nb_iter))
    return vertices_csc.toarray()
def test_tocsc(self):
    cscm = self.basic_m.tocsc()
    m = self.basic_m
    scipym = csc_matrix((m.data, (m.row, m.col)), shape=(4, 4))
    self.assertListEqual(cscm.indices.tolist(), scipym.indices.tolist())
    self.assertListEqual(cscm.indptr.tolist(), scipym.indptr.tolist())
    self.assertListEqual(cscm.data.tolist(), scipym.data.tolist())
    self.assertEqual(cscm.shape, scipym.shape)
    self.assertIsInstance(cscm, SparseBase)
def build_sparse(R, U, rows, cols):
    """
    Returns an equivalent matrix in CSC format.
    """
    n = len(R)
    all_rows = np.concatenate((np.arange(n), rows, cols))
    all_cols = np.concatenate((np.arange(n), cols, rows))
    ij = np.vstack((all_rows, all_cols))
    vals = np.concatenate((R, U, U))
    return csc_matrix((vals, ij), shape=(n, n))
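# CSCfromCompactRepresentation above and build_sparse implement the same
# pattern: diagonal values plus the strict upper triangle, mirrored to form a
# symmetric CSC matrix. A minimal check with made-up values (assumes numpy
# and scipy):
import numpy as np

diag = np.array([1.0, 2.0, 3.0])
rows = np.array([0, 0])          # strict upper-triangle coordinates
cols = np.array([1, 2])
vals = np.array([4.0, 5.0])

S = build_sparse(diag, vals, rows, cols)
assert (S != S.T).nnz == 0  # symmetric by construction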
def subtract_dtm_frequencies(dtm_1, terms_1, dtm_2, terms_2):
    """
    :param dtm_1: DTM to subtract frequencies from
    :param terms_1: Terms (column names) for dtm_1
    :param dtm_2: DTM whose frequencies are subtracted from dtm_1
    :param terms_2: Terms (column names) for dtm_2
    :return: DTM with a single row holding the difference of frequencies: terms_1 minus terms_2
    """
    arr_freq_1 = dtm_1.sum(axis=0).getA1()
    arr_freq_2 = dtm_2.sum(axis=0).getA1()
    return csc_matrix(subtract_term_frequencies(terms_1, arr_freq_1,
                                                terms_2, arr_freq_2))
def transpose(self, axes=None, copy=False):
    if axes is not None:
        raise ValueError("Sparse matrices do not support an 'axes' parameter "
                         "because swapping dimensions is the only logical "
                         "permutation.")

    M, N = self.shape
    from scipy.sparse.csc import csc_matrix
    return csc_matrix((self.data, self.indices, self.indptr),
                      shape=(N, M), copy=copy)
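# The method above (CSR transpose) relies on the fact that a CSR matrix's
# (data, indices, indptr) buffers, reinterpreted as CSC with a swapped shape,
# describe the transposed matrix, so no data is copied or moved. A quick
# check (assumes scipy):
import numpy as np
from scipy.sparse import csr_matrix

a = csr_matrix(np.array([[1, 0, 2],
                         [0, 3, 0]]))
t = a.transpose()
assert t.format == 'csc'
assert np.array_equal(t.toarray(), a.toarray().T)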
def laplacian_smooth(triangles, vertices, nb_iter=1, diffusion_step=1.0,
                     l2_dist_weighted=False, area_weighted=False,
                     backward_step=False, flow_file=None):
    if flow_file is not None:
        mem_map = np.memmap(flow_file, dtype=G_DTYPE, mode='w+',
                            shape=(nb_iter, vertices.shape[0], vertices.shape[1]))

    vertices_csc = csc_matrix(vertices)
    if isinstance(diffusion_step, (int, float)):
        diffusion_step = diffusion_step * np.ones(len(vertices))

    for i in range(nb_iter):
        stdout.write("\r step %d on %d done" % (i, nb_iter))
        stdout.flush()
        if flow_file is not None:
            mem_map[i] = vertices_csc.toarray()

        if l2_dist_weighted:
            # if l2_dist_weighted, we need to compute laplacian_matrix
            # each iteration (because ||e_ij|| change)
            # A_ij_l2_dist_weighted = A_ij / ||e_ij||
            adjacency_matrix = edge_length(triangles, vertices_csc)
            ###################################################################
            # adjacency_matrix.data **= -1
            # laplacian_matrix = laplacian(adjacency_matrix, diag_of_1=False)
            ###################################################################
            adjacency_matrix.data **= 1  # 1
            laplacian_matrix = laplacian(adjacency_matrix, diag_of_1=True)
        else:
            adjacency_matrix = edge_adjacency(triangles, vertices_csc)
            laplacian_matrix = laplacian(adjacency_matrix, diag_of_1=True)

        if area_weighted:
            vts_mix_area = vertices_mix_area(triangles, vertices_csc)
            laplacian_matrix = diags(vts_mix_area**-1, 0).dot(laplacian_matrix)

        next_vertices_csc = euler_step(laplacian_matrix, vertices_csc,
                                       diffusion_step, backward_step)
        vertices_csc = next_vertices_csc

    stdout.write("\r step %d on %d done \n" % (nb_iter, nb_iter))
    # return next_vertices_csc
    return next_vertices_csc.toarray()
def exhaustive_set(G, query_nodes, target_nodes, n_edges, start_dist):
    r"""Exhaustively searches all the combinations of k links between a set of
    query nodes Q and a set of absorbing target nodes C such that
    Q \cap C = \emptyset.

    Parameters
    ----------
    G : Networkx graph
        The graph from which the team will be selected.
    query : list
        The set of nodes from which the random walker starts.
    target : list
        The set of nodes where the random walker ends.
    n_edges : integer
        the number of links to be added
    start_dist : list
        The starting distribution over the query set

    Returns
    -------
    links : list
        The set of links that reduce the absorbing RW centrality
    ac_scores : list
        The set of scores of adding the links
    """
    query_set_size = len(query_nodes)
    map_query_to_org = dict(zip(query_nodes, range(query_set_size)))
    P = csc_matrix(nx.google_matrix(G, alpha=1))
    P_abs = P[list(query_nodes), :][:, list(query_nodes)]
    F = compute_fundamental(P_abs)
    # sum over rows of F, as in the other selection routines below
    row_sums = start_dist.dot(F.sum(axis=1))[0, 0]
    candidates = list(product(query_nodes, target_nodes))
    eligible = [candidates[i] for i in range(len(candidates))
                if G.has_edge(candidates[i][0], candidates[i][1]) == False]
    ac_scores = [row_sums]
    exhaustive_links = []
    for L in range(1, n_edges + 1):
        print('\t Number of edges {}'.format(L))
        round_min = -1
        best_combination = []
        for subset in combinations(eligible, L):
            H = G.copy()
            F_modified = F.copy()
            for links_to_add in subset:
                F_updated = update_fundamental_mat(F_modified, H,
                                                   map_query_to_org,
                                                   links_to_add[0])
                H.add_edge(links_to_add[0], links_to_add[1])
                F_modified = F_updated
            abs_cen = start_dist.dot(F_updated.sum(axis=1))[0, 0]
            if abs_cen < round_min or round_min == -1:
                best_combination = subset
                round_min = abs_cen
        exhaustive_links.append(best_combination)
        ac_scores.append(round_min)
    return exhaustive_links, ac_scores
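# compute_fundamental and update_fundamental_mat are defined elsewhere. For
# context, the fundamental matrix of an absorbing random walk with transient
# transition block P_abs is F = (I - P_abs)^{-1}; a minimal sketch under that
# standard definition (the original implementation may differ):
from scipy.sparse import identity
from scipy.sparse.linalg import inv as sparse_inv

def compute_fundamental_sketch(P_abs):
    # F[i, j] = expected number of visits to transient state j starting from i
    n = P_abs.shape[0]
    return sparse_inv((identity(n, format='csc') - P_abs).tocsc())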
def dtm_to_gensim_corpus(dtm):
    import gensim

    # DTM with documents-to-words sparse matrix in COO format has to be
    # converted to a transposed sparse matrix in CSC format
    dtm_t = dtm.transpose()
    if hasattr(dtm_t, 'tocsc'):
        dtm_sparse = dtm_t.tocsc()
    else:
        from scipy.sparse.csc import csc_matrix
        dtm_sparse = csc_matrix(dtm_t)

    return gensim.matutils.Sparse2Corpus(dtm_sparse)
def test_get_dtm_frequency_diff(self):
    texts_2 = [
        "Más texto con la letra eñe",
        "Este texto también contiene la eñe"
    ]
    dtm_1 = vec.fit_transform(TEXTS)
    terms_1 = np.array(vec.get_feature_names())
    dtm_2 = vec.fit_transform(texts_2)
    terms_2 = np.array(vec.get_feature_names())
    result = dtm.subtract_dtm_frequencies(dtm_1, terms_1, dtm_2, terms_2)
    expected = csc_matrix([0, 2, 1, 1, -1, -1, 0, 1, 2, 1])
    self.assertTrue(dtm.equal(expected, result))
def test():
    """
    Test function run with -t flag
    """
    print("Running 5 node test ....")
    g = csc_matrix([
        [0, 1, 0, 0, 1],
        [1, 0, 1, 1, 0],
        [0, 1, 0, 1, 0],
        [0, 1, 1, 0, 0],
        [1, 0, 0, 0, 1]
    ])
    print("Input csc: \n", g.todense())
    print("Python igraph ...\n", csc_to_igraph(g).get_adjacency())

    from r_utils import r_igraph_get_adjacency
    print("R igraph ...\n", r_igraph_get_adjacency(csc_to_r_igraph(g)))
def tocsc(self, copy=False):
    idx_dtype = get_index_dtype((self.indptr, self.indices),
                                maxval=max(self.nnz, self.shape[0]))
    indptr = np.empty(self.shape[1] + 1, dtype=idx_dtype)
    indices = np.empty(self.nnz, dtype=idx_dtype)
    data = np.empty(self.nnz, dtype=upcast(self.dtype))

    csr_tocsc(self.shape[0], self.shape[1],
              self.indptr.astype(idx_dtype),
              self.indices.astype(idx_dtype),
              self.data,
              indptr, indices, data)

    from scipy.sparse.csc import csc_matrix
    A = csc_matrix((data, indices, indptr), shape=self.shape)
    A.has_sorted_indices = True
    return A
def test_jaccard():
    implicit_matrix = np.array([[1, 1, 1, 1],
                                [1, 1, 0, 0],
                                [0, 0, 1, 0]])
    assert implicit_matrix.shape == (3, 4)
    implicit_matrix = csc_matrix(implicit_matrix)

    n2i = {"huey": 0, "dewey": 1, "louie": 2, "chewy": 3}
    t2i = {"Batman": 0, "Mystery Men": 1, "Taxi Driver": 2}
    i2n = {v: k for k, v in n2i.items()}
    i2t = {v: k for k, v in t2i.items()}

    jrec = JaccardRecommender(implicit_matrix, p2i=None, t2i=t2i, i2t=i2t, i2p=None)

    print(jrec.item_to_item(N=10, title="Batman"))
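# JaccardRecommender is defined elsewhere. For reference, item-to-item Jaccard
# similarity on a binary items x users CSC matrix can be derived from
# co-occurrence counts; a minimal sketch, not the library's implementation:
import numpy as np
from scipy.sparse import csc_matrix

def jaccard_items_sketch(m):
    # m: binary (items x users) sparse matrix
    inter = (m @ m.T).toarray().astype(float)   # pairwise co-occurrence counts
    counts = np.asarray(m.sum(axis=1)).ravel()  # users per item
    union = counts[:, None] + counts[None, :] - inter
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.where(union > 0, inter / union, 0.0)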
def tf_idf(df, voc, idf=None, mode='train'):
    vectorizer_tit = CountVectorizer(token_pattern=r'\w+', vocabulary=voc)
    vectorizer_des = CountVectorizer(token_pattern=r'\w+', vocabulary=voc)
    vectorizer_com = CountVectorizer(token_pattern=r'\w+', vocabulary=voc)

    # fit vectorizer on data (text: string) -> vector of features
    vec_fit_tit = vectorizer_tit.fit_transform(df['title'])
    vec_fit_des = vectorizer_des.fit_transform(df['description'])
    vec_fit_com = vectorizer_com.fit_transform(df['combined'])

    # count each word
    counts_tit = np.array(vec_fit_tit.sum(axis=0)).flatten().tolist()
    counts_des = np.array(vec_fit_des.sum(axis=0)).flatten().tolist()
    counts_com = np.array(vec_fit_com.sum(axis=0)).flatten().tolist()

    # get each unique word in data
    words_tit = vectorizer_tit.get_feature_names()
    words_des = vectorizer_des.get_feature_names()
    words_com = vectorizer_com.get_feature_names()

    # dictionary of word and collection frequency
    df_tit = pd.Series(counts_tit, index=words_tit).to_dict()
    df_des = pd.Series(counts_des, index=words_des).to_dict()
    df_com = pd.Series(counts_com, index=words_com).to_dict()

    if mode == 'train':
        # calculate idf vector based on combined 'title' + 'description'
        N = df.shape[0]
        idf = {}
        for term in df_com.keys():
            idf[term] = np.log(N) - np.log(df_com[term] + 0.000001)
        # convert idf values to a sparse matrix
        idf = csc.csc_matrix(list(idf.values()))

    # calculate tfidf vectors
    tfidf_tit_csc = idf.multiply(vec_fit_tit)
    tfidf_des_csc = idf.multiply(vec_fit_des)
    tfidf_com_csc = idf.multiply(vec_fit_com)

    for index, row in df.iterrows():
        df.at[index, 'title'] = tfidf_tit_csc[index]
        df.at[index, 'description'] = tfidf_des_csc[index]
        df.at[index, 'combined'] = tfidf_com_csc[index]

    return (df, idf)
def test_mul_sparse_matrix(self):
    # from pyomo.contrib.pynumero.sparse.block_matrix import BlockMatrix

    # test unsymmetric times unsymmetric
    m = self.basic_m
    dense_m = m.toarray()
    res = m * m
    dense_res = np.matmul(dense_m, dense_m)
    self.assertFalse(res.is_symmetric)
    self.assertTrue(np.allclose(res.toarray(), dense_res))

    # test symmetric result
    m = self.basic_m
    dense_m = m.toarray()
    res = m.transpose() * m
    dense_res = np.matmul(dense_m.transpose(), dense_m)
    self.assertTrue(res.is_symmetric)
    self.assertTrue(np.allclose(res.toarray(), dense_res))

    # test unsymmetric with rectangular
    m = self.basic_m
    dense_m2 = np.array([[1.0, 2.0],
                         [3.0, 4.0],
                         [5.0, 6.0],
                         [7.0, 8.0]])
    m2 = CSCMatrix(dense_m2)
    res = m * m2
    dense_res = np.matmul(m.toarray(), dense_m2)
    self.assertFalse(res.is_symmetric)
    self.assertTrue(np.allclose(res.toarray(), dense_res))

    # test unsymmetric with rectangular scipy csc
    m = self.basic_m
    dense_m2 = np.array([[1.0, 2.0],
                         [3.0, 4.0],
                         [5.0, 6.0],
                         [7.0, 8.0]])
    m2 = csc_matrix(dense_m2)
    with self.assertRaises(Exception) as context:
        res = m * m2

    # test product with symmetric matrix
    m = self.basic_m
    dense_m = m.todense()
    m2 = self.basic_sym_m
    dense_m2 = m2.todense()
    res = m * m2
    res_dense = np.matmul(dense_m, dense_m2)
    self.assertTrue(np.allclose(res.todense(), res_dense))
def lpDot(mat, arr):
    """
    CSC matrix-vector or CSC matrix-matrix dot product (A x b)
    :param mat: CSC sparse matrix (A)
    :param arr: dense vector or matrix of object type (b)
    :return: vector or matrix result of the product
    """
    n_rows, n_cols = mat.shape

    # check dimensional compatibility
    assert n_cols == arr.shape[0]

    # check that the sparse matrix is indeed of CSC format
    if mat.format == 'csc':
        mat_2 = mat
    else:
        # convert the matrix to CSC sparse
        mat_2 = csc_matrix(mat)

    if len(arr.shape) == 1:
        # uni-dimensional sparse matrix - vector product
        res = np.zeros(n_rows, dtype=arr.dtype)
        for i in range(n_cols):
            for ii in range(mat_2.indptr[i], mat_2.indptr[i + 1]):
                j = mat_2.indices[ii]  # row index
                # mat_2.data[ii] is the entry A[j, i] in CSC layout
                res[j] += mat_2.data[ii] * arr[i]
    else:
        # multi-dimensional sparse matrix - matrix product
        cols_vec = arr.shape[1]
        res = np.zeros((n_rows, cols_vec), dtype=arr.dtype)
        # for each column of the matrix "arr", do the matrix-vector product
        for k in range(cols_vec):
            for i in range(n_cols):
                for ii in range(mat_2.indptr[i], mat_2.indptr[i + 1]):
                    j = mat_2.indices[ii]  # row index
                    res[j, k] += mat_2.data[ii] * arr[i, k]

    return res
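# lpDot's point is that it works on object-dtype operands (e.g. LP modelling
# variables) that scipy's own dot products would reject. A small check with
# plain Python integers in an object array (assumes numpy/scipy):
import numpy as np
from scipy.sparse import csc_matrix

A = csc_matrix(np.array([[2.0, 0.0],
                         [1.0, 3.0]]))
b = np.array([10, 100], dtype=object)  # object dtype, as with LP variables
print(lpDot(A, b))                     # [20, 310], accumulated entry by entry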
def random_links(G, query_nodes, target_nodes, n_edges, start_dist):
    r"""Selects a random set of links between a set of query nodes Q and a set
    of absorbing target nodes C such that Q \cap C = \emptyset.

    Parameters
    ----------
    G : Networkx graph
        The graph from which the team will be selected.
    query : list
        The set of nodes from which the random walker starts.
    target : list
        The set of nodes where the random walker ends.
    n_edges : integer
        the number of links to be added
    start_dist : list
        The starting distribution over the query set

    Returns
    -------
    links : list
        The set of links that reduce the absorbing RW centrality
    ac_scores : list
        The set of scores of adding the links
    """
    query_set_size = len(query_nodes)
    map_query_to_org = dict(zip(query_nodes, range(query_set_size)))
    P = csc_matrix(nx.google_matrix(G, alpha=1))
    P_abs = P[list(query_nodes), :][:, list(query_nodes)]
    F = compute_fundamental(P_abs)
    # sum over rows of F, as in the greedy variants of this routine
    row_sums = start_dist.dot(F.sum(axis=1))[0, 0]
    candidates = list(product(query_nodes, target_nodes))
    eligible = [candidates[i] for i in range(len(candidates))
                if G.has_edge(candidates[i][0], candidates[i][1]) == False]
    links_to_add = sample(eligible, n_edges)

    ac_scores = []
    ac_scores.append(row_sums)
    i = 0
    while i < n_edges:
        F_updated = update_fundamental_mat(F, G, map_query_to_org,
                                           links_to_add[i][0])
        G.add_edge(links_to_add[i][0], links_to_add[i][1])
        abs_cen = start_dist.dot(F_updated.sum(axis=1))[0, 0]
        F = F_updated
        ac_scores.append(abs_cen)
        i += 1
    return links_to_add, ac_scores
def kmeans(X, K, maxiters, M=None, eps=1e-3):
    """Standard k-means.

    _X_ is data, rowwise.
    _K_ is the number of clusters.
    _M_ is the set of centers.

    Implementation tries to save some computation cycles.
    """
    N, d = X.shape
    if M is None:
        tmp = np.random.permutation(N)
        M = X[tmp[:K]].copy()
    costs = []
    last = np.inf
    X_sq_sum = np.sum(X**2)
    for i in range(maxiters):
        # see metric.py, but here: don't need squares from
        # X, because _minimal_ cost over K is independent from it.
        cost = -2 * np.dot(X, M.T) + np.sum(M**2, axis=1)
        idx = np.argmin(cost, axis=1)
        cost = cost[np.arange(N), idx]
        costs.append(X_sq_sum + np.sum(cost))
        if (last - costs[-1]) < eps:
            break
        last = costs[-1]
        # Determine new centers.
        # Sparsification from Jakob Verbeek's kmeans code,
        # http://lear.inrialpes.fr/~verbeek/software.php
        ind = csc.csc_matrix((np.ones(N), (idx, np.arange(N))), shape=(K, N))
        M = ind.dot(X)
        weights = np.array(ind.sum(axis=1))
        # Handle problem: no points assigned to a cluster
        zeros_idx = (weights.ravel() == 0)
        zeros = np.sum(zeros_idx)
        tmp = np.random.permutation(N)
        M[zeros_idx, :] = X[tmp[:zeros]].copy()
        weights[zeros_idx] = 1
        M /= weights
    return M, costs
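# A minimal usage sketch for kmeans above on made-up data (assumes numpy, and
# that kmeans is importable along with its scipy.sparse.csc dependency):
import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(50, 2) + 5.0,   # one blob around (5, 5)
               rng.randn(50, 2) - 5.0])  # one blob around (-5, -5)
M, costs = kmeans(X, K=2, maxiters=100)
print(M)      # two centers, near (5, 5) and (-5, -5)
print(costs)  # decreasing total cost per iteration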
def kmeans_np(X, lmbda, M=None):
    """Non-parametric kmeans.

    _X_ is input data, rowwise.
    _lmbda_ controls the tradeoff between standard kmeans
    and the cluster penalty term.

    See http://www.cs.berkeley.edu/~jordan/papers/kulis-jordan-icml12.pdf
    """
    N, d = X.shape
    if M is None:
        M = np.mean(X, axis=0).reshape(1, d)
    k = M.shape[0] - 1
    X_sq_sum = np.sum(X**2, axis=1)
    # integer labels, so they can be used directly as sparse row indices
    ind = np.zeros(N, dtype=int)
    old_ind = ind.copy()
    tmp = 0
    iters = 1
    while True:
        print("Iteration ", iters)
        iters = iters + 1
        for i in range(N):
            tmp = -2 * np.dot(X[i], M.T) + np.sum(M**2, axis=1)
            idx = np.argmin(tmp)
            if (X_sq_sum[i] + tmp[idx]) > lmbda:
                k = k + 1
                M = np.append(M, X[i].copy().reshape(1, d), axis=0)
                ind[i] = k
                print("Adding cluster for ", i, k)
            else:
                ind[i] = idx
        if np.all(old_ind == ind):
            break
        # see kmeans above
        ind_all = csc.csc_matrix((np.ones(N), (ind, np.arange(N))),
                                 shape=(k + 1, N))
        M = ind_all.dot(X)
        M /= np.array(ind_all.sum(axis=1))
        old_ind = ind
        ind = np.zeros(N, dtype=int)
    return M, np.array(ind_all.sum(axis=1)).ravel()
def writetest(desikan):
    """
    Write test function run with -t flag

    Positional Args:
    ===============
    desikan - use the desikan mapping?
    """
    from scipy.sparse.csc import csc_matrix
    print("Running 5 node test ....")
    g = csc_matrix([
        [0, 1, 0, 0, 5],
        [1, 0, 3, 1, 0],
        [0, 3, 0, 1, 0],
        [0, 1, 1, 0, 0],
        [5, 0, 0, 0, 0]
    ])

    src = csc_to_graphml(g, test=True, desikan=desikan)
    print("Test complete ...")
    print(src)
def igraph_to_csc(g, save=False, fn="csc_matlab"):
    """
    Convert an igraph to a scipy.sparse.csc.csc_matrix

    Positional arguments:
    =====================
    g - the igraph graph

    Optional arguments:
    ===================
    save - save file to disk
    fn - the file name to be used when writing (appendmat = True by default)
    """
    assert isinstance(g, igraph.Graph), "Arg1 'g' must be an igraph graph"
    print("Creating CSC from igraph object ...")
    # Equivalent of calling to_dense, so may cause a MemoryError
    gs = csc_matrix(g.get_adjacency().data)
    print("CSC creation complete ...")

    if save:
        print("Saving to MAT file ...")
        # save as MAT format only. No other options!
        sio.savemat(fn, {"data": gs}, True)
    return gs
def doFeatureEncoding(self, features):
    """ do feature encoding to original features """
    encodedFeatures = None
    whitenedFeatures = whiten(features)

    if self._featureEncodingMethod == 'vector-quantization':
        # Vector quantization: each row is a feature vector
        index, _ = vq(whitenedFeatures, self._codebook)
        row, _ = features.shape
        col = config.codebookSize
        encodedFeatures = np.zeros((row, col))
        for i in range(len(index)):
            encodedFeatures[i, index[i]] = 1

    elif self._featureEncodingMethod == 'sparse-coding':
        # Sparse coding: each column is a feature vector
        X = np.asfortranarray(whitenedFeatures.transpose())
        X = np.asfortranarray(X / np.tile(np.sqrt((X * X).sum(axis=0)),
                                          (X.shape[0], 1)),
                              dtype=X.dtype)
        D = np.asfortranarray(self._codebook.transpose())
        D = np.asfortranarray(D / np.tile(np.sqrt((D * D).sum(axis=0)),
                                          (D.shape[0], 1)),
                              dtype=D.dtype)

        # Parameters of the optimization are chosen
        param = {'lambda1': 0.15, 'numThreads': -1, 'mode': 0}
        alpha = spams.lasso(X, D, **param)  # alpha is a sparse matrix
        alphaShape = (D.shape[1], X.shape[1])
        denseMatrix = csc_matrix(alpha, shape=alphaShape).todense()
        encodedFeatures = np.asarray(denseMatrix).transpose()

    return encodedFeatures
# Marion Neumann, Daniel Marthaler, Shan Huang & Kristian Kersting, 18/02/2014
# ==============================================================================

import numpy as np
from scipy.sparse.csc import csc_matrix
import pyGPs
from pyGPs.Validation import valid
from pyGPs.GraphExtensions import graphUtil, graphKernels

location = 'graphData/'
data = np.load(location + 'MUTAG.npz')

# n = num of nodes
# N = num of graphs
# p = num of labels
A = csc_matrix((data['adj_data'], data['adj_indice'], data['adj_indptr']),
               shape=data['adj_shape'])  # n x n adjacency array (sparse matrix)
gr_id = data['graph_ind']                # n x 1 graph id array
node_label = data['responses']           # n x 1 node label array
graph_label = data['labels']             # N x 1 graph label array
N = graph_label.shape[0]                 # number of graphs

graph_label = np.int8(graph_label)
for i in range(N):
    if graph_label[i, 0] == 0:
        graph_label[i, 0] -= 1

# ==============================================================================
# COMPUTE PROPAGATION KERNELS
# ==============================================================================
num_Iteration = 10
w = 1e-4
def get_approx_boundary(G, query_nodes, target_nodes, n_edges, start_dist):
    """
    Used to calculate an approximation guarantee for the greedy algorithm.
    """
    H = G.copy()  # GET A COPY OF THE GRAPH
    query_set_size = len(query_nodes)
    target_set_size = len(target_nodes)
    map_query_to_org = dict(zip(query_nodes, range(query_set_size)))
    candidates = list(product(query_nodes, target_nodes))
    # ALL minus existing in G
    eligible = [candidates[i] for i in range(len(candidates))
                if H.has_edge(candidates[i][0], candidates[i][1]) == False]

    # CALCULATE MARGINAL GAIN TO EMPTY SET FOR ALL NODES IN STEEPNESS FUNCTION
    P = csc_matrix(nx.google_matrix(H, alpha=1))
    P_abs = P[list(query_nodes), :][:, list(query_nodes)]
    F = compute_fundamental(P_abs)
    row_sums_empty = start_dist.dot(F.sum(axis=1))[0, 0]  # F(\emptyset)
    # candidates = list(product(query_nodes, target_nodes))
    ac_marginal_empty = []
    ac_marginal_full = []
    source_idx_empty = []
    node_processed = -1
    for out_edge in eligible:
        abs_cen = -1
        source_node = out_edge[0]
        if node_processed == source_node:
            # skip updating the matrix because it updates F in the same way
            continue
        node_processed = source_node
        F_updated = update_fundamental_mat(F, H, map_query_to_org, source_node)
        abs_cen = start_dist.dot(F_updated.sum(axis=1))[0, 0]
        ac_marginal_empty.append(abs_cen)
        source_idx_empty.append(source_node)

    sorted_indexes_empty = [i[0] for i in sorted(enumerate(source_idx_empty),
                                                 key=lambda x: x[1])]
    ac_marginal_empty = [ac_marginal_empty[i] for i in sorted_indexes_empty]

    # CALCULATE MARGINAL GAIN FOR FULL SET
    H.add_edges_from(eligible)
    P_all = csc_matrix(nx.google_matrix(H, alpha=1))
    P_abs_all = P_all[list(query_nodes), :][:, list(query_nodes)]
    F_all = compute_fundamental(P_abs_all)
    row_sums_all = start_dist.dot(F_all.sum(axis=1))[0, 0]
    node_processed = -1
    source_idx = []
    for out_edge in eligible:
        abs_cen = -1
        source_node = out_edge[0]
        if node_processed == source_node:
            # skip updating the matrix because it updates F in the same way
            continue
        node_processed = source_node
        F_all_updated = update_rev_fundamental_mat(F_all, H, map_query_to_org,
                                                   source_node)
        abs_cen = start_dist.dot(F_all_updated.sum(axis=1))[0, 0]
        ac_marginal_full.append(abs_cen)
        source_idx.append(source_node)

    sorted_indexes = [i[0] for i in sorted(enumerate(source_idx),
                                           key=lambda x: x[1])]
    ac_marginal_full = [ac_marginal_full[i] for i in sorted_indexes]
    assert sorted_indexes == sorted_indexes_empty, \
        "Something is wrong with the way scores are appended"
    all_steepness = (asarray(ac_marginal_full) - row_sums_all) / \
                    (row_sums_empty - asarray(ac_marginal_empty))
    s = min(all_steepness)
    node_max = argmin(all_steepness)
    return 1 - s, sorted_indexes[node_max]
def greedy_navigation(G, query_nodes, target_nodes, n_edges, start_dist):
    r"""Selects a set of links with a greedy descent algorithm that reduce the
    absorbing RW centrality between a set of query nodes Q and a set of
    absorbing target nodes C such that Q \cap C = \emptyset. The query and
    target set must be a 'viable' partition of the graph.

    Parameters
    ----------
    G : Networkx graph
        The graph from which the team will be selected.
    query : list
        The set of nodes from which the random walker starts.
    target : list
        The set of nodes where the random walker ends.
    n_edges : integer
        the number of links to be added
    start_dist : list
        The starting distribution over the query set
    P : Scipy matrix
        The transition matrix of the graph G
    F : Scipy matrix
        The fundamental matrix for the graph G with the given set of
        absorbing random walk nodes

    Returns
    -------
    links : list
        The set of links that reduce the absorbing RW centrality
    """
    H = G.copy()
    prng = RandomState()
    query_set_size = len(query_nodes)
    target_set_size = len(target_nodes)
    map_query_to_org = dict(zip(query_nodes, range(query_set_size)))
    P = csc_matrix(nx.google_matrix(H, alpha=1))
    P_abs = P[list(query_nodes), :][:, list(query_nodes)]
    F = compute_fundamental(P_abs)
    row_sums = start_dist.dot(F.sum(axis=1))[0, 0]
    best_F = zeros(F.shape)
    optimal_set = []
    ac_scores = []
    ac_scores.append(row_sums)
    while n_edges > 0:
        round_min = -1
        best_node = -1
        for i in query_nodes:
            abs_neighbours = [l for l in H.neighbors(i) if l in target_nodes]
            if len(abs_neighbours) == target_set_size:
                continue
            F_updated = update_fundamental_mat(F, H, map_query_to_org, i)
            abs_cen = start_dist.dot(F_updated.sum(axis=1))[0, 0]
            if abs_cen < round_min or round_min == -1:
                best_node = i
                round_min = abs_cen
                best_F = F_updated
        F = best_F
        ac_scores.append(round_min)
        optimal_candidate_edges = [(best_node, k, round_min)
                                   for k in target_nodes
                                   if H.has_edge(best_node, k) == False]
        try:
            edge_idx = prng.randint(0, len(optimal_candidate_edges))
        except ValueError:
            print(H.neighbors(best_node))
            print([l for l in H.neighbors(best_node) if l in target_nodes])
            print(best_node)
            print(optimal_candidate_edges)
            print(target_nodes)
        H.add_edge(optimal_candidate_edges[edge_idx][0],
                   optimal_candidate_edges[edge_idx][1])
        optimal_set.append(optimal_candidate_edges[edge_idx])
        n_edges -= 1
    return optimal_set, ac_scores
def reverse_greedy(G, query_nodes, target_nodes, n_edges, start_dist):
    r"""Selects a set of links with a reverse greedy descent algorithm that
    reduce the absorbing RW centrality between a set of query nodes Q and a
    set of absorbing target nodes C such that Q \cap C = \emptyset. The query
    and target set must be a 'viable' partition of the graph.

    Parameters
    ----------
    G : Networkx graph
        The graph from which the team will be selected.
    query : list
        The set of nodes from which the random walker starts.
    target : list
        The set of nodes where the random walker ends.
    n_edges : integer
        the number of links to be added
    start_dist : list
        The starting distribution over the query set
    P : Scipy matrix
        The transition matrix of the graph G
    F : Scipy matrix
        The fundamental matrix for the graph G with the given set of
        absorbing random walk nodes

    Returns
    -------
    links : list
        The set of links that reduce the absorbing RW centrality
    """
    H = G.copy()
    query_set_size = len(query_nodes)
    map_query_to_org = dict(zip(query_nodes, range(query_set_size)))
    candidates = list(product(query_nodes, target_nodes))
    eligible = [candidates[i] for i in range(len(candidates))
                if H.has_edge(candidates[i][0], candidates[i][1]) == False]
    H.add_edges_from(eligible)
    P = csc_matrix(nx.google_matrix(H, alpha=1))
    P_abs = P[list(query_nodes), :][:, list(query_nodes)]
    F = compute_fundamental(P_abs)
    row_sums = start_dist.dot(F.sum(axis=1))[0, 0]
    # candidates = list(product(query_nodes, target_nodes))
    worst_F = zeros(F.shape)
    worst_set = []
    optimal_set = []
    ac_scores = []
    # ac_scores.append(row_sums)
    while len(eligible) > 0:
        round_min = -1
        worst_link = (-1, -1)
        node_processed = -1
        for out_edge in eligible:
            source_node = out_edge[0]
            if node_processed == source_node:
                # skip updating the matrix because it updates F in the same way
                continue
            node_processed = source_node
            F_updated = update_rev_fundamental_mat(F, H, map_query_to_org,
                                                   source_node)
            abs_cen = start_dist.dot(F_updated.sum(axis=1))[0, 0]
            if abs_cen < round_min or round_min == -1:
                worst_link = out_edge
                round_min = abs_cen
                worst_F = F_updated
        F = worst_F
        H.remove_edge(*worst_link)
        worst_set.append(worst_link)
        eligible.remove(worst_link)
        if len(eligible) <= n_edges:
            ac_scores.append(round_min)
            optimal_set.append(worst_link)
    return list(reversed(optimal_set)), list(reversed(ac_scores))
def link_prediction(G, query_nodes, target_nodes, n_edges, start_dist, alg="ra"):
    r"""Selects a set of links based on the scores calculated by a standard
    link-prediction algorithm from the networkx library.

    Parameters
    ----------
    G : Networkx graph
        The graph from which the team will be selected.
    query : list
        The set of nodes from which the random walker starts.
    target : list
        The set of nodes where the random walker ends.
    n_edges : integer
        the number of links to be added
    start_dist : list
        The starting distribution over the query set
    alg : string
        A string describing the link-prediction algorithm to be used

    Returns
    -------
    links : list
        The set of links that reduce the absorbing RW centrality
    ac_scores : list
        The set of scores of adding the links
    """
    assert alg in ["ra", "pa", "jaccard", "aa"], \
        "alg must be one of [\"ra\", \"pa\", \"jaccard\", \"aa\"]."
    H = G.copy()
    query_set_size = len(query_nodes)
    map_query_to_org = dict(zip(query_nodes, range(query_set_size)))
    P = csc_matrix(nx.google_matrix(H, alpha=1))
    P_abs = P[list(query_nodes), :][:, list(query_nodes)]
    F = compute_fundamental(P_abs)
    # sum over rows of F, as in the greedy variants of this routine
    row_sums = start_dist.dot(F.sum(axis=1))[0, 0]
    candidates = list(product(query_nodes, target_nodes))
    eligible = [candidates[i] for i in range(len(candidates))
                if H.has_edge(candidates[i][0], candidates[i][1]) == False]
    links_to_add = []
    if alg == 'ra':
        preds = nx.resource_allocation_index(H, eligible)
    elif alg == 'jaccard':
        preds = nx.jaccard_coefficient(H, eligible)
    elif alg == 'aa':
        preds = nx.adamic_adar_index(H, eligible)
    elif alg == 'pa':
        preds = nx.preferential_attachment(H, eligible)
    for u, v, p in preds:
        links_to_add.append((u, v, p))
    links_to_add.sort(key=lambda x: x[2], reverse=True)

    ac_scores = []
    ac_scores.append(row_sums)
    i = 0
    while i < n_edges:
        F_updated = update_fundamental_mat(F, H, map_query_to_org,
                                           links_to_add[i][0])
        H.add_edge(links_to_add[i][0], links_to_add[i][1])
        abs_cen = start_dist.dot(F_updated.sum(axis=1))[0, 0]
        F = F_updated
        ac_scores.append(abs_cen)
        i += 1
    return links_to_add, ac_scores
def threshold_components(A_s, shape, min_size=5, max_size=np.inf, max_perc=.5,
                         remove_unconnected_components=True):
    """
    Threshold components output of a CNMF algorithm (A matrices)

    Parameters:
    ----------
    A_s: list
        list of A matrices output from CNMF

    min_size: int
        min size of the component in pixels

    max_size: int
        max size of the component in pixels

    max_perc: float
        fraction of the maximum of each component used to threshold

    remove_unconnected_components: boolean
        whether to remove components that are fragmented in space

    Returns:
    -------
    B_s: list of the thresholded components

    lab_imgs: image representing the components in ndimage format

    cm_s: center of masses of each component
    """
    B_s = []
    lab_imgs = []
    cm_s = []
    for A_ in A_s:
        print('*')
        max_comps = A_.max(0).todense().T
        tmp = []
        cm = []
        lim = np.zeros(shape)
        for idx, a in enumerate(A_.T):
            # create mask by thresholding to a fraction (max_perc) of the max
            mask = np.reshape(a.todense() > (max_comps[idx] * max_perc), shape)
            label_im, nb_labels = ndimage.label(mask)
            sizes = ndimage.sum(mask, label_im, list(range(nb_labels + 1)))

            if remove_unconnected_components:
                l_largest = (label_im == np.argmax(sizes))
                cm.append(scipy.ndimage.measurements.center_of_mass(l_largest,
                                                                    l_largest))
                lim[l_largest] = (idx + 1)

            # remove connected components that are too small
            mask_size = np.logical_or(sizes < min_size, sizes > max_size)
            if np.sum(mask_size[1:]) > 1:
                print(('removing ' + str(np.sum(mask_size[1:]) - 1) +
                       ' components'))
            remove_pixel = mask_size[label_im]
            label_im[remove_pixel] = 0
            label_im = (label_im > 0) * 1
            tmp.append(label_im.flatten())

        cm_s.append(cm)
        lab_imgs.append(lim)
        B_s.append(csc.csc_matrix(np.array(tmp)).T)
    return B_s, lab_imgs, cm_s
def log_det_shogun_exact(Q):
    logging.debug("Entering")
    logdet = Statistics.log_det(csc_matrix(Q))
    logging.debug("Leaving")
    return logdet
def log_det_shogun_exact_plus_noise(Q):
    logging.debug("Entering")
    logdet = Statistics.log_det(csc_matrix(Q)) + randn()
    logging.debug("Leaving")
    return logdet
def log_det_scikits(Q):
    d = cholesky(csc_matrix(Q)).L().diagonal()
    return 2 * sum(log(d))
    # unreachable as written; presumably meant to be raised from an
    # ImportError guard around scikits.sparse.cholmod
    raise Exception("cholmod not installed")
    fh.seek(pos)
    print("Getting edges ...")
    line = ""
    while True:
        line += fh.readline().replace(" ", "").strip()  # remove if inefficient
        if line.endswith("</edge>"):
            edge = get_edge(line)
            g[edge[0], edge[1]] = edge[2]  # Naive i.e. slow. TODO: Optimize
            line = ""
        elif line.endswith("</graphml>"):
            break

    return csc_matrix(g)  # Convert to CSC first


def get_edge(st):
    """
    Given a string I need to extract src, dest, weight (if available).
    No other edge attributes are representable.

    Positional Args:
    ===============
    st - the string
    """
    global __weight__
    src = int(re.search(r"(?<=source=[\"']n)\d+", st).group())
    dest = int(re.search(r"(?<=target=[\"']n)\d+", st).group())
    if __weight__:
def csc_to_graphml(g, is_weighted=True, desikan=False, is_directed=False,
                   save_fn="default_name.graphml", is_tri=False, test=False):
    """
    Convert a csc graph to graphml format for writing to disk

    Positional arguments:
    ====================
    g - the csc graph

    Optional arguments:
    ===================
    is_weighted - is the graph weighted. Type: boolean.
    desikan - use the desikan mapping to label nodes. Type: boolean
    is_directed - is g symmetric? Type: boolean
    save_fn - file name to use when saving. Type: boolean
    is_tri - is the adjacency mat upper or lower triangular. Type: boolean
    test - are we running a test. Type: boolean
    """
    print("Beginning graphml construction ..")
    if test:
        test_str = ""

    tabs = 2  # How many tabs to affix to the front

    src = """<?xml version="1.0" encoding="UTF-8"?>
<graphml xmlns="http://graphml.graphdrawing.org/xmlns"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns
    http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">
  <!-- Created by script: %s -->\n""" % __file__

    # Do we have desikan labels ?
    if desikan:
        from mrcap import desikan
        # Desikan vertex attr called v_region
        src += " "*2 + "<key id=\"v_region\" for=\"node\" attr.name=\"region\" attr.type=\"string\"/>\n"
        tabs = 3

    # Is our graph weighted ?
    if is_weighted:
        # Edge weight attr called e_weight
        src += " "*2 + "<key id=\"e_weight\" for=\"edge\" attr.name=\"weight\" attr.type=\"double\"/>\n"
        tabs = 3

    # Directed graph ?
    # note: both branches emit edgedefault="undirected", as in the source
    if is_directed:
        src += "\n  <graph id=\"G\" edgedefault=\"undirected\">"
    # Undirected graph ?
    else:  # not directed so just use upper tri
        if not is_tri:
            print("Converting to upper triangular ...")
            from scipy.sparse.csc import csc_matrix
            from scipy.sparse import triu
            g = csc_matrix(triu(g, k=0))
        src += "\n  <graph id=\"G\" edgedefault=\"undirected\">\n"

    NUM_NODES = g.shape[0]

    if not test:
        # text mode; the XML fragments written below are str under Python 3
        f = open(save_fn if os.path.splitext(save_fn)[1] == ".graphml"
                 else save_fn + ".graphml", "w")

    # Can be #pragma for
    for node in range(NUM_NODES):  # Cycle through all nodes
        s = "<node id=\"n%d\">\n" % node
        if desikan:
            s += " "*(tabs+1) + "<data key=\"v_region\">\"%s\"</data>\n" % \
                 (desikan.des_map.get(node, "Undefined"))
        s += " "*tabs + "</node>\n"
        src += " "*tabs + s

        if node % 50000 == 0:
            print("Processing node %d / %d ..." % (node, NUM_NODES))

        if test:
            test_str += src
        else:
            f.write(src)
        src = ""

    del s  # free mem

    print("Adding edges to graph ...")
    # Get all edge data
    nodes_from, nodes_to = g.nonzero()
    data = g.data
    del g  # free some mem

    # Can be #pragma for
    NUM_EDGES = nodes_from.shape[0]
    for idx in range(NUM_EDGES):  # Only the edges that exist
        src += " "*tabs + "<edge source=\"n%d\" target=\"n%d\">\n" % \
               (nodes_from[idx], nodes_to[idx])
        if is_weighted:
            src += " "*(tabs+1) + "<data key=\"e_weight\">%d</data>\n" % data[idx]
        src += " "*tabs + "</edge>\n"

        if idx % 100000 == 0:
            print("Processing edge %d / %d ..." % (idx, NUM_EDGES))

        if test:
            test_str += src
        else:
            f.write(src)
        src = ""

    src += "  </graph>\n</graphml>"

    if test:
        test_str += src
        return test_str

    f.write(src)
    f.close()