def _fit_transform(self, X, decompose_graph):
    """Fit the neighbor graph on ``X`` and embed its shortest-path distances.

    Side effects on ``self``:

    * ``nbrs_`` -- the fitted ``NearestNeighbors`` instance.
    * ``training_data_`` -- the data held by ``nbrs_`` (``_fit_X``).
    * ``kernel_pca_`` -- a ``KernelPCA`` using a precomputed kernel.
    * ``distance_graph`` -- k-neighbors distance graph when ``self.radius``
      is None, radius-neighbors distance graph otherwise.

    When ``decompose_graph`` is truthy, every connected component of the
    distance graph is embedded on its own and the per-component results are
    collected in ``connected_comps``, ``dist_matrices`` and ``embeddings``
    (with ``num_connected_comps`` holding the component count). Otherwise
    the whole graph is processed at once and the results are stored in
    ``dist_matrix_`` and ``embedding_``.

    Returns None.
    """
    X = check_array(X, accept_sparse='csr')

    self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors,
                                  algorithm=self.neighbors_algorithm,
                                  n_jobs=self.n_jobs)
    self.nbrs_.fit(X)
    self.training_data_ = self.nbrs_._fit_X

    self.kernel_pca_ = KernelPCA(n_components=self.n_components,
                                 kernel="precomputed",
                                 eigen_solver=self.eigen_solver,
                                 tol=self.tol,
                                 max_iter=self.max_iter,
                                 n_jobs=self.n_jobs)

    if self.radius is not None:
        self.distance_graph = radius_neighbors_graph(self.nbrs_,
                                                     radius=self.radius,
                                                     mode='distance',
                                                     n_jobs=self.n_jobs)
    else:
        self.distance_graph = kneighbors_graph(self.nbrs_,
                                               self.n_neighbors,
                                               mode='distance',
                                               n_jobs=self.n_jobs)

    def _embed(distances):
        # Kernel handed to KernelPCA is -0.5 * (squared path distances).
        kernel = distances**2
        kernel *= -0.5
        return self.kernel_pca_.fit_transform(kernel)

    if not decompose_graph:
        self.dist_matrix_ = graph_shortest_path(self.distance_graph,
                                                method=self.path_method,
                                                directed=False)
        self.embedding_ = _embed(self.dist_matrix_)
        return

    n_comps, labels = sparse.csgraph.connected_components(
        self.distance_graph)
    self.num_connected_comps = n_comps
    self.connected_comps = []
    self.dist_matrices = []
    self.embeddings = []
    for comp_id in range(n_comps):
        # Restrict the graph to the rows/columns of this component.
        members = labels == comp_id
        subgraph = self.distance_graph[members, :][:, members]
        self.connected_comps.append(subgraph)

        geodesics = graph_shortest_path(subgraph,
                                        method=self.path_method,
                                        directed=False)
        self.dist_matrices.append(geodesics)
        self.embeddings.append(_embed(geodesics))
def _geodesic_distance(self, X):
    """Return shortest-path distances for graph ``X`` with zeros turned to inf.

    Every zero in the result of ``graph_shortest_path`` is replaced by
    ``np.inf``; the diagonal is then reset to 0.
    """
    geodesic = graph_shortest_path(X)
    # Per the original author's note: graph_shortest_path returns a float64
    # array, so storing np.inf does not change the dtype (an int array
    # would arguably be the more natural return type).
    zero_entries = geodesic == 0
    geodesic[zero_entries] = np.inf
    np.fill_diagonal(geodesic, 0)
    return geodesic
def isomap(data, n_components=2, n_neighbors=6):
    """Embed ``data`` by running ``mds`` on -0.5 * squared path distances.

    ``distance_mat`` builds the neighbor distance matrix for ``n_neighbors``,
    ``graph_shortest_path`` (undirected) completes it, and the result of
    ``mds`` for ``n_components`` is returned.
    """
    neighbor_distances = distance_mat(data, n_neighbors)
    geodesic = graph_shortest_path(neighbor_distances, directed=False)
    kernel = -0.5 * (geodesic**2)
    return mds(kernel, n_components)
def redux(X, name):
    """Iteratively place the points of ``X`` in 2-D, starting from random.

    A 10-neighbor graph of ``X`` is turned into shortest-path lengths.
    Starting from random coordinates in [0, 100), 500 update sweeps are run;
    each sweep computes one update vector per point from the current layout
    and then moves all points at once. ``name`` is only used as the progress
    bar description.

    Returns the final (len(X), 2) coordinate array.
    """
    knn_graph = neigh.kneighbors_graph(X, 10)
    path_lengths = gsp.graph_shortest_path(knn_graph)
    n_points = len(X)

    Y = np.random.rand(n_points, 2) * 100
    derivs = np.zeros_like(Y)
    # The step has to stay very small, otherwise the iteration diverges
    # to NaN values.
    step = .00005

    for _ in tqdm(range(500), leave=False, desc=name):
        for i in range(n_points):
            # Offsets from every point to point i.
            offsets = np.subtract(np.ones_like(Y) * Y[i], Y)
            # normalize() maps a (0, 0) row to (0, 0) rather than failing.
            unit_offsets = prep.normalize(offsets)
            targets = path_lengths[i, :]
            derivs[i] = np.sum(offsets, axis=0) - np.matmul(
                targets.T, unit_offsets)
        Y = Y - step * derivs
    return Y
def gen_triplets_from_knn(data, indices, num_neighbors=50):
    """Order candidate triplets by shortest-path distance on a kNN graph.

    A ``num_neighbors``-nearest-neighbor distance graph is built from
    ``data`` and completed with ``graph_shortest_path`` (undirected). For
    every row ``(a, b, c)`` of ``indices`` the output row is
    ``(a, closer, farther)``, where ``closer`` is whichever of ``b``/``c``
    has the smaller graph distance to ``a``.

    When both distances are equal the input order ``(b, c)`` is kept. The
    previous sign-based arithmetic wrote the *average* of the two indices
    into both slots in that case, i.e. an unrelated point.

    :param data: samples passed to ``kneighbors_graph``.
    :param indices: integer array with three columns (anchor and two
        candidates) indexing into ``data``.
    :param num_neighbors: neighbor count for the kNN graph.
    :return: int array of shape ``(len(indices), 3)``.
    """
    print('Generating the knn graph')
    sys.stdout.flush()
    kng = kneighbors_graph(data, num_neighbors, mode='distance', n_jobs=8)

    print('Computing the shortest path metric on the knn graph')
    sys.stdout.flush()
    sp_dist_matrix = graph_shortest_path(kng, method='auto', directed=False)
    del kng  # only the completed distance matrix is needed from here on

    anchors = indices[:, 0]
    first = indices[:, 1]
    second = indices[:, 2]
    first_is_closer = (sp_dist_matrix[anchors, first] <=
                       sp_dist_matrix[anchors, second])

    triplet_set = np.zeros((indices.shape[0], 3), dtype=int)
    triplet_set[:, 0] = anchors
    triplet_set[:, 1] = np.where(first_is_closer, first, second)
    triplet_set[:, 2] = np.where(first_is_closer, second, first)
    return triplet_set
def make_adjacency(data, dist_func="euclidean", eps=1):
    """Build an epsilon-neighborhood graph and return its shortest paths.

    First ISOMAP step: a weighted adjacency matrix is assembled from the
    pairwise distances (SciPy ``cdist``) between the COLUMNS of ``data``;
    two points are linked when their distance is strictly below ``eps``,
    every other entry is set to ``np.inf``. The matrix is then handed to
    ``graph_shortest_path``.

    INPUT
    ------
    data - (ndarray) 2-D dataset; each column is treated as one point
    dist_func - (str) metric name forwarded to ``cdist``
    eps - (int/float) neighborhood radius

    OUTPUT
    ------
    short - result of ``graph_shortest_path`` on the weighted adjacency
        matrix. NOTE(review): how unreachable pairs are encoded depends on
        ``graph_shortest_path`` -- confirm before relying on INF.
    """
    _, n_points = data.shape
    points = data.T
    pairwise = cdist(points, points, metric=dist_func)

    within_eps = pairwise < eps
    weighted_adj = np.zeros((n_points, n_points)) + np.inf
    weighted_adj[within_eps] = pairwise[within_eps]

    return graph_shortest_path(weighted_adj)
def get_rank_high(data, k_neighbours=15, knn_sym=True):
    """Rank the points of ``data`` by shortest-path distance on a kNN graph.

    A ``k_neighbours`` distance graph is built (symmetrised with an
    element-wise maximum when ``knn_sym`` is true) and completed with
    ``graph_shortest_path``. If the graph has more than one connected
    component, every distance between points of different components is
    replaced by ten times the largest shortest-path distance found. The
    matrix is finally passed to ``get_ranking``.

    The number of connected components is printed as a diagnostic.
    """
    KNN = kneighbors_graph(data,
                           k_neighbours,
                           mode='distance',
                           include_self=False).toarray()
    if knn_sym:
        KNN = np.maximum(KNN, KNN.T)

    n_components, labels = csgraph.connected_components(KNN)
    print(n_components)

    D_high = graph_shortest_path(KNN)
    # Only a disconnected graph has cross-component pairs to patch.
    if n_components > 1:
        max_dist = np.max(D_high) * 10
        # True exactly where row and column belong to different components;
        # one masked assignment replaces the former per-pair Python loop.
        cross_component = labels[:, np.newaxis] != labels[np.newaxis, :]
        D_high[cross_component] = max_dist

    return get_ranking(D_high)
def test_FloydWarshall():
    """The 'FW' method must agree with ``FloydWarshallSlow`` on a 20-node graph."""
    graph = generate_graph(20)
    for is_directed in [True, False]:
        fast_result = graph_shortest_path(graph, is_directed, 'FW')
        reference = FloydWarshallSlow(graph.copy(), is_directed)
        assert_array_almost_equal(fast_result, reference)
def test_floyd_warshall():
    """The 'FW' method must agree with ``floyd_warshall_slow`` on a 20-node graph."""
    graph = generate_graph(20)
    for is_directed in [True, False]:
        fast_result = graph_shortest_path(graph, is_directed, 'FW')
        reference = floyd_warshall_slow(graph.copy(), is_directed)
        assert_array_almost_equal(fast_result, reference)
def test_Dijkstra():
    """The 'D' method must agree with ``FloydWarshallSlow`` on a 20-node graph."""
    graph = generate_graph(20)
    for is_directed in [True, False]:
        fast_result = graph_shortest_path(graph, is_directed, 'D')
        reference = FloydWarshallSlow(graph.copy(), is_directed)
        assert_array_almost_equal(fast_result, reference)
def test_dijkstra():
    """The 'D' method must agree with ``floyd_warshall_slow`` on a 20-node graph."""
    graph = generate_graph(20)
    for is_directed in [True, False]:
        fast_result = graph_shortest_path(graph, is_directed, 'D')
        reference = floyd_warshall_slow(graph.copy(), is_directed)
        assert_array_almost_equal(fast_result, reference)
def __compute_geodesics(self, dataset):
    """Return the matrix D of shortest-path distances for ``dataset``.

    ``k_nearest`` builds the neighbor distance matrix using ``self.k``;
    ``graph_shortest_path`` then fills in D, where D_ij is the shortest-path
    distance between x_i and x_j along that neighbor graph.
    """
    neighbor_distances = k_nearest(dataset, self.k)
    geodesics = sg.graph_shortest_path(neighbor_distances)
    return geodesics
def residual_variance(X, X_m, n_neighbors=20):
    """Return 1 - rho**2 between graph distances of ``X`` and distances of ``X_m``.

    Distances for ``X`` are shortest paths (Dijkstra, undirected) on its
    ``n_neighbors`` distance graph; distances for ``X_m`` are plain pairwise
    Euclidean. ``rho`` is the Spearman correlation of the two flattened
    distance matrices.
    """
    knn_distances = kneighbors_graph(X,
                                     n_neighbors=n_neighbors,
                                     mode='distance',
                                     n_jobs=mp.cpu_count()).toarray()
    geodesic = graph_shortest_path(knn_distances, method='D', directed=False)
    embedded = pairwise_distances(X_m, X_m, metric='euclidean')

    rho, _pvalue = spearmanr(geodesic.flatten(), embedded.flatten())
    return 1 - rho**2.0
def isomap(df, p, k):
    """Embed the rows of ``df`` via ``MDS`` on Floyd-Warshall path distances.

    ``p`` is the neighbor count of the kNN graph, ``k`` is forwarded to
    ``MDS``. After embedding, the connectivity graph is checked and a
    message is printed when it is not a single connected component.

    Returns whatever ``MDS`` returns.
    """
    points = df.to_numpy()
    distance_graph = kneighbors_graph(points, p, mode='distance')
    connectivity = kneighbors_graph(points, p, mode='connectivity').toarray()

    geodesics = graph_shortest_path(distance_graph,
                                    directed=False,
                                    method='FW')
    embedding = MDS(geodesics, k, True, False)

    components = Graph(connectivity).connected_components()
    if len(components) != 1:
        print("The graph is disconnected. Therefore we will have",
              len(components), "separated graphs")
    return embedding
def gen_knn_graph_with_sp(data, num_neighbors):
    """Return shortest-path distances on the kNN distance graph of ``data``.

    :param data: samples passed to ``neighbors.kneighbors_graph``.
    :param num_neighbors: neighbor count for the kNN graph.
    :return: result of ``graph_shortest_path`` (``method='auto'``,
        undirected) on that graph.
    """
    knn_graph = neighbors.kneighbors_graph(data,
                                           num_neighbors,
                                           mode='distance',
                                           n_jobs=8)
    return graph_shortest_path(knn_graph, method='auto', directed=False)
def cal_ontology_emb(dim=20, mi=0):
    """Embed the terms of the cell-ontology edge list.

    The ontology file holds one tab-separated pair per line. The pairs are
    turned into a symmetric 0/1 adjacency matrix over the sorted term names,
    from which a similarity/distance matrix ``sp`` and an embedding ``X``
    are derived according to ``mi``:

    * 0 -- Floyd-Warshall shortest paths, embedded with ``svd_emb``
    * 1 -- Floyd-Warshall shortest paths, embedded with ``DCA_vector``
    * 2 -- ``RandomWalkRestart(A, 0.8)``, embedded with ``svd_emb``
    * 3 -- ``RandomWalkRestart(A, 0.8)``, embedded with ``DCA_vector``

    For modes 0 and 1 the returned ``sp`` is negated after embedding.

    Returns ``(X, l2i, i2l, sp)`` where ``l2i``/``i2l`` map term name to
    row index and back.

    Raises ValueError for an unsupported ``mi`` (previously this surfaced
    as an UnboundLocalError at the return statement).
    """
    ontology_path = '/oak/stanford/groups/rbaltman/swang91/Sheng_repo/data/SingleCell/cl.ontology'

    lset = set()
    s2p = {}
    # The context manager closes the file even if a line fails to parse.
    with open(ontology_path) as fin:
        for line in fin:
            s, p = line.strip().split('\t')
            s2p.setdefault(s, set()).add(p)
            lset.add(s)
            lset.add(p)

    lset = np.sort(list(lset))
    nl = len(lset)
    l2i = dict(zip(lset, range(nl)))
    i2l = dict(zip(range(nl), lset))

    A = np.zeros((nl, nl))
    for s in s2p:
        for p in s2p[s]:
            A[l2i[s], l2i[p]] = 1
            A[l2i[p], l2i[s]] = 1

    if mi == 0:
        sp = graph_shortest_path(A, method='FW', directed=False)
        X = svd_emb(sp, dim=dim)
        sp *= -1.
    elif mi == 1:
        sp = graph_shortest_path(A, method='FW', directed=False)
        X = DCA_vector(sp, dim=dim)[0]
        sp *= -1.
    elif mi == 2:
        sp = RandomWalkRestart(A, 0.8)
        X = svd_emb(sp, dim=dim)
    elif mi == 3:
        sp = RandomWalkRestart(A, 0.8)
        X = DCA_vector(sp, dim=dim)[0]
    else:
        raise ValueError('mi must be 0, 1, 2 or 3, got %r' % (mi,))
    return X, l2i, i2l, sp
def isomap(X, k=5):
    """Run ``mds`` on the shortest-path distances of the ``k``-neighbor graph.

    ``isomap_distance`` builds the neighbor graph of ``X``; the completed
    distance matrix from ``graph_shortest_path`` is passed to ``mds``
    unchanged (no squaring or double centering happens in this function).
    """
    neighbor_graph = isomap_distance(X, k)

    from sklearn.utils.graph_shortest_path import graph_shortest_path
    geodesics = graph_shortest_path(neighbor_graph)

    return mds(geodesics)
def isomap(z, n_dim, n_neighbor=None):
    """Embed the rows of ``z`` into ``n_dim`` dimensions.

    Shortest-path distances on ``affinity_mat(z, n_neighbor=n_neighbor)``
    are squared, double-centred and scaled by -0.5; the ``n_dim`` largest
    eigenpairs of that matrix give the embedding (eigenvectors scaled by
    the square root of their eigenvalues).

    Returns an array with one row per sample and ``n_dim`` columns.
    """
    num_samples, _ = z.shape

    geodesics = graph_shortest_path(affinity_mat(z, n_neighbor=n_neighbor))

    # Centering matrix I - (1/n) * 1 1^T.
    centering = np.eye(num_samples) - (1 / num_samples) * np.ones(
        (num_samples, num_samples))
    gram = -0.5 * centering.dot(geodesics**2).dot(centering)

    values, vectors = np.linalg.eigh(gram)
    # eigh returns ascending eigenvalues; keep the n_dim largest.
    descending = values.argsort()[::-1]
    top_values = values[descending][:n_dim]
    top_vectors = vectors[:, descending][:, :n_dim]

    return np.dot(top_vectors, np.diag(top_values**(1 / 2)))
def cal_ontology_emb(dim=20, mi=0, DATA_DIR = '../../OnClass_data/'):
    """Embed the terms of ``DATA_DIR + 'cell_ontology/cl.ontology'``.

    The ontology file holds one tab-separated pair per line. The pairs are
    turned into a symmetric 0/1 adjacency matrix over the sorted term names,
    from which a similarity/distance matrix ``sp`` and an embedding ``X``
    are derived according to ``mi``:

    * 0 -- Floyd-Warshall shortest paths, embedded with ``svd_emb``
    * 1 -- Floyd-Warshall shortest paths, embedded with ``DCA_vector``
    * 2 -- ``RandomWalkRestart(A, 0.8)``, embedded with ``svd_emb``
    * 3 -- ``RandomWalkRestart(A, 0.8)``, embedded with ``DCA_vector``

    For modes 0 and 1 the returned ``sp`` is negated after embedding.

    Returns ``(X, l2i, i2l, sp)`` where ``l2i``/``i2l`` map term name to
    row index and back.

    Raises ValueError for an unsupported ``mi`` (previously this surfaced
    as an UnboundLocalError at the return statement).
    """
    lset = set()
    s2p = {}
    # The context manager closes the file even if a line fails to parse.
    with open(DATA_DIR + 'cell_ontology/cl.ontology') as fin:
        for line in fin:
            s, p = line.strip().split('\t')
            s2p.setdefault(s, set()).add(p)
            lset.add(s)
            lset.add(p)

    lset = np.sort(list(lset))
    nl = len(lset)
    l2i = dict(zip(lset, range(nl)))
    i2l = dict(zip(range(nl), lset))

    A = np.zeros((nl, nl))
    for s in s2p:
        for p in s2p[s]:
            A[l2i[s], l2i[p]] = 1
            A[l2i[p], l2i[s]] = 1

    if mi == 0:
        sp = graph_shortest_path(A, method='FW', directed=False)
        X = svd_emb(sp, dim=dim)
        sp *= -1.
    elif mi == 1:
        sp = graph_shortest_path(A, method='FW', directed=False)
        X = DCA_vector(sp, dim=dim)[0]
        sp *= -1.
    elif mi == 2:
        sp = RandomWalkRestart(A, 0.8)
        X = svd_emb(sp, dim=dim)
    elif mi == 3:
        sp = RandomWalkRestart(A, 0.8)
        X = DCA_vector(sp, dim=dim)[0]
    else:
        raise ValueError('mi must be 0, 1, 2 or 3, got %r' % (mi,))
    return X, l2i, i2l, sp
def prob_to_dist_func(prob):
    """Turn a square probability matrix into a completed distance matrix.

    Distances are ``-log(prob)``, completed with ``graph_shortest_path``.
    Only nodes whose row of ``prob`` sums to more than zero are kept in the
    result.

    Returns ``(dist_func, keep_idxs)``: the distance matrix restricted to
    the kept nodes, and the indices of those nodes.
    """
    n_rows, n_cols = prob.shape
    assert n_rows == n_cols, "a square matrix is required"

    # Nodes with a strictly positive total outgoing probability.
    row_totals = np.sum(prob, 1)
    keep_idxs = np.nonzero(row_totals > 0)[0]

    completed = graph_shortest_path(-np.log(prob))

    return completed[keep_idxs, :][:, keep_idxs], keep_idxs
def get_dist_manifold(data, k_neighbours=20, knn_sym=True):
    """Return shortest-path distances on the kNN distance graph of ``data``.

    The graph uses ``k_neighbours`` neighbors and is symmetrised with an
    element-wise maximum when ``knn_sym`` is true. If it splits into several
    connected components, a message is printed and ``connect_knn`` is called
    with the pairwise Euclidean distances to produce the graph that is
    finally passed to ``graph_shortest_path``.
    """
    knn_graph = kneighbors_graph(data,
                                 k_neighbours,
                                 mode='distance',
                                 include_self=False).toarray()
    if knn_sym:
        knn_graph = np.maximum(knn_graph, knn_graph.T)

    n_components, labels = csgraph.connected_components(knn_graph)
    is_disconnected = n_components > 1
    if is_disconnected:
        print('Connecting', n_components)
        euclidean = pairwise_distances(data, metric='euclidean')
        knn_graph = connect_knn(knn_graph, euclidean, n_components, labels)

    return graph_shortest_path(knn_graph)
def find_geodesic_distance_matrix(self):
    """Compute ``self.geodesic_dist_matrix`` from the kNN graph of ``self.X``.

    ``self.X`` is used column-wise (its transpose is what gets fitted), so
    when ``self.n_neighbors`` is unset it defaults to ``self.X.shape[1]``
    and is stored back on ``self``.
    """
    if self.n_neighbors is None:
        self.n_neighbors = self.X.shape[1]

    # The index is built with one extra neighbor because each point is also
    # counted as its own neighbor.
    neighbor_index = KNN(n_neighbors=self.n_neighbors + 1,
                         algorithm='kd_tree',
                         n_jobs=self.n_jobs)
    neighbor_index.fit(X=self.X.T)

    # Sparse matrix of Euclidean distances between kNN-connected points.
    # Per the scikit-learn docs cited by the original author, the diagonal
    # and all non-neighbor pairs are 0.
    euclidean_graph = neighbor_index.kneighbors_graph(
        X=self.X.T, n_neighbors=self.n_neighbors, mode='distance')

    self.geodesic_dist_matrix = graph_shortest_path(
        dist_matrix=euclidean_graph, method="auto", directed=False)
def creat_graph_and_calc_dist_verb(A):
    """Return undirected shortest-path distances for adjacency matrix ``A``.

    Thin wrapper around ``gp.graph_shortest_path`` with ``method='auto'``.
    """
    shortest = gp.graph_shortest_path(A, method='auto', directed=False)
    return shortest
def cal_ontology_emb(
        dim=20,
        mi=0,
        use_pretrain=None,
        ontology_nlp_file='../../OnClass_data/cell_ontology/cl.ontology.nlp',
        ontology_file='../../OnClass_data/cell_ontology/cl.ontology'):
    """Embed ontology terms, optionally caching the result on disk.

    ``use_pretrain`` is a path prefix. If it is given and both
    ``<prefix>X.npy`` and ``<prefix>sp.npy`` exist, ``X``, ``i2l``, ``l2i``
    and ``sp`` are loaded from ``<prefix>{X,i2l,l2i,sp}.npy``. Otherwise
    they are computed and, when a prefix was given, saved to those files.

    Computation: ``ontology_file`` holds tab-separated lines
    ``term<TAB>term[<TAB>weight]``. A two-column line takes its weight from
    ``ontology_nlp_file`` (upper-cased ``term<TAB>term<TAB>weight`` lines)
    when that pair is listed there, else 1. The weighted symmetric
    adjacency matrix is then processed according to ``mi``:

    * 0 -- Floyd-Warshall shortest paths, embedded with ``svd_emb``
    * 1 -- Floyd-Warshall shortest paths, embedded with ``DCA_vector``
    * 2 -- ``RandomWalkRestart(A, 0.8)``, embedded with ``svd_emb``
    * 3 -- ``RandomWalkRestart(A, 0.8)``, embedded with ``DCA_vector``

    For modes 0 and 1 the returned ``sp`` is negated after embedding.

    Returns ``(X, l2i, i2l, sp)``.

    Raises ValueError for an unsupported ``mi`` when the embedding has to
    be computed (previously an UnboundLocalError further down).
    """
    have_cache = False
    if use_pretrain is not None:
        i2l_file = use_pretrain + 'i2l.npy'
        l2i_file = use_pretrain + 'l2i.npy'
        X_file = use_pretrain + 'X.npy'
        sp_file = use_pretrain + 'sp.npy'
        have_cache = os.path.isfile(X_file) and os.path.isfile(sp_file)

    if have_cache:
        X = np.load(X_file)
        # NOTE: allow_pickle unpickles the stored objects; only point
        # use_pretrain at files you trust.
        i2l = np.load(i2l_file, allow_pickle=True).item()
        l2i = np.load(l2i_file, allow_pickle=True).item()
        sp = np.load(sp_file, allow_pickle=True)
        return X, l2i, i2l, sp

    # Optional pair weights, stored symmetrically.
    cl_nlp = collections.defaultdict(dict)
    if ontology_nlp_file is not None:
        with open(ontology_nlp_file) as fin:
            for line in fin:
                s, p, wt = line.upper().strip().split('\t')
                cl_nlp[s][p] = float(wt)
                cl_nlp[p][s] = float(wt)

    lset = set()
    s2p = {}
    with open(ontology_file) as fin:
        for line in fin:
            w = line.strip().split('\t')
            s = w[0]
            p = w[1]
            if len(w) == 2:
                if p in cl_nlp and s in cl_nlp[p]:
                    wt = cl_nlp[p][s]
                else:
                    wt = 1.
            else:
                wt = float(w[2])
            s2p.setdefault(s, {})[p] = wt
            lset.add(s)
            lset.add(p)

    lset = np.sort(list(lset))
    nl = len(lset)
    l2i = dict(zip(lset, range(nl)))
    i2l = dict(zip(range(nl), lset))

    A = np.zeros((nl, nl))
    for s in s2p:
        for p in s2p[s]:
            A[l2i[s], l2i[p]] = s2p[s][p]
            A[l2i[p], l2i[s]] = s2p[s][p]

    if mi == 0:
        sp = graph_shortest_path(A, method='FW', directed=False)
        X = svd_emb(sp, dim=dim)
        sp *= -1.
    elif mi == 1:
        sp = graph_shortest_path(A, method='FW', directed=False)
        X = DCA_vector(sp, dim=dim)[0]
        sp *= -1.
    elif mi == 2:
        sp = RandomWalkRestart(A, 0.8)
        X = svd_emb(sp, dim=dim)
    elif mi == 3:
        sp = RandomWalkRestart(A, 0.8)
        X = DCA_vector(sp, dim=dim)[0]
    else:
        raise ValueError('mi must be 0, 1, 2 or 3, got %r' % (mi,))

    if use_pretrain is not None:
        np.save(X_file, X)
        np.save(i2l_file, i2l)
        np.save(l2i_file, l2i)
        np.save(sp_file, sp)
    return X, l2i, i2l, sp
def get_shortest_paths(weighted_matrix: np.ndarray,
                       inf: float = 1e6) -> np.ndarray:
    """Return all-pairs shortest paths for a positive weighted graph.

    The weights are first converted with
    ``make_distance_matrix(weighted_matrix, inf)``; the graph may be
    directed or undirected.
    """
    distances = make_distance_matrix(weighted_matrix, inf)
    return graph_shortest_path(distances)
# Exploratory script fragment. It relies on names defined outside this
# excerpt (G, distances, data and the make_*/get_* helpers).

# Add the edges derived from `distances` to the existing graph G and draw it
# twice: once with defaults, once with small nodes and no labels.
edges = get_edges_weights(distances)
G.add_edges_from(edges)
nx.draw(G)
nx.draw_networkx(G, node_size=25, edge_color='white', with_labels=False)

# --- Exporting to CSV --- #
# Builds a 1-based (source, target, weight) table of every matrix entry.
# NOTE(review): only the DataFrame is created here; no CSV is written in
# this excerpt.
Edge = namedtuple('Edge', ['source', 'target', 'weight'])
edges = []
for i in range(distances.shape[0]):
    for j in range(distances.shape[1]):
        edges.append(Edge(i + 1, j + 1, distances[i, j]))
edge_df = pd.DataFrame(edges)

# --- Projecting data into 2 dimensions via PCA--- #
# The shortest-path matrix is standardised, reduced to two principal
# components and shown as a scatter plot.
graph = graph_shortest_path(distances)
sc = StandardScaler()
pc = PCA(2)
projected = pc.fit_transform(sc.fit_transform(graph))
plt.scatter(projected[:, 0], projected[:, 1], s=5, alpha=.5)

# --- Showing numpy array as image --- #
# The first row of `data` is displayed as a 64x64 grayscale image.
img1 = data[0, :].reshape(64, 64)
plt.imshow(img1, cmap='gray')

# --- Full ISOMAP --- #
# NOTE: `distances` and `graph` are rebound here, replacing the values
# used in the sections above.
A = make_affinity_matrix(data, e=22.5)
weighted_A = make_weighted_matrix(A)
distances = make_distance_matrix(weighted_A)
graph = graph_shortest_path(distances)
tau = make_tau_matrix(graph)
# Script fragment: explore a maze with Crawler, then report the largest
# shortest-path length over the maze graph.
crawler = Crawler()
try:
    crawler.breadth_first_search()
except IndexError:
    # NOTE(review): the IndexError is deliberately ignored; presumably it
    # signals that the search has run out of work -- confirm against Crawler.
    pass
maze = plot_maze(crawler.history)
maze2 = condition_maze(maze)
# Each unpacking requires exactly one cell with value 2 and one with value 3
# in `maze`; any other count raises ValueError.
[x_oxy], [y_oxy] = np.where(maze == 2)
[x_start], [y_start] = np.where(maze == 3)
graph = image.grid_to_graph(*maze2.shape, mask=maze2, return_as=np.ndarray)
shortest_paths = graph_shortest_path(graph)
# Prints the maximum entry of the all-pairs shortest-path matrix.
print("Time-to-oxygen = {} minutes".format(np.max(np.unique(shortest_paths))))
# ij_to_g = ij_to_graph_index(maze2)
# print("SHORTEST PATH ISSSSSSSS!")
# print(shortest_paths[ij_to_g[(x_start, y_start)], ij_to_g[(x_oxy, y_oxy)]])
# ic = IntcodeComputer(allow_pausing=True)
# ic.code[0] = 2
# ic.run(0)
# ic.resume(0)
# ic.resume(0)
# while ic.continue_flag:
#     ic.resume(next_input)
def MMfeaturesBoot(Location, filename, summary, slots_offered):
    """Iteratively fit ``beta_coef`` and write the results to two CSV files.

    Starting from a random coefficient vector (first entry fixed to 0),
    ``update_beta`` is applied repeatedly until the coefficients (all but
    the last) change by less than 1e-6 in Euclidean norm or more than 500
    iterations have run. Progress is printed every iteration.

    Before iterating, an item-item graph is built from the observed
    choices; if not every pair of items is linked by a path, a warning is
    written to stderr and synthetic pairwise observations are appended to
    the assortment, choice and feature matrices.

    Parameters
    ----------
    Location, filename : str
        Concatenated to form the prefix of the two output files
        ``...predprobfeatures.csv`` and ``...betafeatures.csv``.
    summary : DataFrame
        Must provide the columns ``'NO_PURCHASE'`` + ``slots_offered`` and
        the same names prefixed with ``'C_'``; NaNs are treated as 0.
    slots_offered : list of str
        Column names of the offered slots.

    Returns
    -------
    The first (only) row of the coefficient table that was written to
    ``betafeatures.csv``.
    """
    # Random start; the leading 0 is the coefficient paired with the first
    # column ('NO_PURCHASE') in the output table.
    beta_coef = np.append([0], np.random.rand(len(slots_offered) + 3))
    features_df, disc_cols, eco_cols, gr_cols = get_active_features(
        summary, slots_offered)
    beta_ext = expand_beta(beta_coef, len(disc_cols), len(eco_cols),
                           len(gr_cols))
    design_df = get_design_matrix(features_df.columns.tolist(), slots_offered)
    assortment_df = summary.loc[:, [
        'C_' + col for col in ['NO_PURCHASE'] + slots_offered
    ]].fillna(0)
    choice_df = summary.loc[:, [col for col in ['NO_PURCHASE'] +
                                slots_offered]].fillna(0)
    design = design_df.values
    features = features_df.values
    assortment = assortment_df.values
    choice = choice_df.values
    # Column index of the entry equal to 1 in each row of `choice`.
    # NOTE(review): assumes exactly one such entry per row -- confirm.
    C = np.where(choice == 1)[1]
    membership = assortment
    nprods = assortment.shape[1]
    ## check if the MM algorithm would converge by testing if the item-item graph
    # is strongly connected
    row = []
    col = []
    data = []
    for i in range(membership.shape[0]):
        assort = list(np.nonzero(membership[i, :])[0])
        try:
            assort.remove(C[i])
        except ValueError:
            # The chosen item is not part of the offered set: report the
            # row and stop building the graph.
            print(i, C[i], assort)
            break
        # One edge from the chosen item to every other offered item.
        row += len(assort) * [C[i]]
        col += assort
        data += len(assort) * [1]
    dist_matrix = csr_matrix((data, (row, col)), shape=(nprods, nprods))
    Z = graph_shortest_path.graph_shortest_path(
        dist_matrix, method='D')  # Dijkstra's algorithm
    I = np.eye(nprods)
    # I + Z has a zero wherever two distinct items have path length 0 in Z.
    if np.count_nonzero(I + Z) < nprods**2:
        # condition for convergence of MM algo not met
        sys.stderr.write(
            'Warning: Convergence condition for MM algorithm not met...adding noise to the data matrix...\n'
        )
        # All unordered pairs of items, excluding item 0.
        pairs = [
            pair for pair in combinations(np.delete(np.arange(nprods), 0), 2)
        ]
        npairs = len(pairs)
        pairs = np.array(pairs)
        # Each pair appears twice so that each of its two items is "chosen"
        # once (see `d` below).
        pairs = np.tile(pairs, (2, 1))
        # NOTE: Z is reused here as the synthetic assortment rows.
        Z = np.zeros((len(pairs), nprods))
        for i, pair in enumerate(pairs):
            Z[i, pair] = 1
        assortment = np.vstack((assortment, Z))
        # First copy of each pair picks its first item, second copy its
        # second item.
        d = np.append(pairs[:npairs, 0], pairs[npairs:, 1])
        choicenew = np.zeros((Z.shape[0], nprods))
        choicenew[np.arange(Z.shape[0]), d] = 1
        choice = np.vstack((choice, choicenew))
        featuresnew = np.zeros((Z.shape[0],
                                features.shape[1]))
        featuresnew[:, np.arange(nprods)] = 1
        features = np.vstack((features, featuresnew))
    i = 0
    while True:
        i += 1
        # Keep copies of the previous iterate for the convergence check.
        beta = np.copy(beta_coef)
        beta_ext_cp = np.copy(beta_ext)
        beta_coef, Q = update_beta(design, features, disc_cols, eco_cols,
                                   gr_cols, assortment, choice, beta_ext_cp,
                                   beta, slots_offered)
        log_likeli = sum(np.log(sum(Q * choice, 1)))
        beta_ext = expand_beta(beta_coef, len(disc_cols), len(eco_cols),
                               len(gr_cols))
        print('Iteration=', i, 'loglikelihood =', log_likeli, 'beta_disc',
              beta_coef[-3], 'beta_eco', beta_coef[-2], 'beta_gr',
              beta_coef[-1])
        # Stop on convergence (last coefficient excluded) or after the
        # iteration cap.
        if np.linalg.norm(beta_coef[:-1] - beta[:-1]) < 10**-6 or i > 500:
            predict_prob_df = pd.DataFrame(Q,
                                           columns=['NO_PURCHASE'] +
                                           slots_offered)
            beta_df = pd.DataFrame([np.array(beta_coef)],
                                   columns=['NO_PURCHASE'] + slots_offered +
                                   ['Discount', 'Eco', 'Gr'])
            predict_prob_df.to_csv(Location + filename +
                                   'predprobfeatures.csv')
            beta_df.to_csv(Location + filename + 'betafeatures.csv')
            # Drops the local references only; beta_df is kept for the
            # return value.
            del summary, predict_prob_df, design_df, features_df, assortment_df, choice_df, design, features, assortment, choice
            break
    return beta_df.iloc[0]
from sklearn.neighbors import NearestNeighbors from sklearn.utils.graph_shortest_path import graph_shortest_path import networkx as nx import pickle feat = 'upc' k = 3 nbrs = NearestNeighbors( n_neighbors=k + 1, metric='cosine', algorithm='brute').fit( x_new_stack_T) # k=(n_neighbors-1) (first neighbour is 'v' itself) #distances, indices = nbrs.kneighbors(x_new_T) # not directly needed, for now knnmatrix = nbrs.kneighbors_graph( x_new_stack_T, mode='distance' ) # sparse matrix(68x68) with nearest KNeighbours for each of the 68 pt knnmatrix.data[np.where(knnmatrix.data < 0)] = 0 sp = graph_shortest_path( knnmatrix, directed=False ) # shortest-path-edge-weight from (v_i to v_j), (doc-https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/graph_shortest_path.pyx) G = nx.Graph(knnmatrix) spl = nx.shortest_path( G, weight='weight' ) # shortest-path dict-array from each v_i to v_j, do len(array) to find path-length ## spl = nx.shortest_path(G) # Without weight (just connections-1/0) pickle.dump(knnmatrix, open('knn_' + feat + '_k_' + str(k) + '.pickle.dump', 'wb')) # used to smooth out features pickle.dump(sp, open('sp_all_' + feat + '_k_' + str(k) + '.pickle.dump', 'wb')) #np.savetxt('sp_all_'+feat+'_k_'+str(k)+'.np.save', sp) pickle.dump(spl, open('spl_all_' + feat + '_k_' + str(k) + '.pickle.dump', 'wb')) ## knnmatrix_all = pickle.load(
if pair != -1: if pair not in thresh_g: thresh_g.node[n]["Pair"] = -1 thresh_g.node[n]["Pair ID"] = -1 n_missing += 1 mg = MetaGraph(thresh_g, weight="max_norm_weight") meta = mg.meta adj = mg.adj.copy() # colsums = np.sum(adj, axis=0) # colsums[colsums == 0] = 1 # adj = adj / colsums[np.newaxis, :] adj = pass_to_ranks(adj) if use_spl: adj = graph_shortest_path(adj) if plus_c: adj += np.min(adj) if embed == "lse": latent = lse(adj, None, ptr=False) elif embed == "ase": latent = ase(adj, None, ptr=False) rot_latent, diff = procrustes_match(latent, meta) rot_latent = latent n_components = latent.shape[1] plot_df = pd.DataFrame(data=rot_latent) plot_df["Class"] = mg["Class 1"] fig, ax = plt.subplots(1, 1, figsize=(10, 10))
l2i = npzfile['l2i'].item() i2l = npzfile['i2l'].item() cls2cls = npzfile['cls2cls'] test_Y = npzfile['test_Y'] ntest = len(test_Y) ncls = nseen + len(unseen_l) seen_l = np.array(range(nseen)) cls2cls = np.zeros((ncls, ncls)) fin = open(DATA_DIR + '/cell_ontology/cl.ontology') for line in fin: w = line.strip().split('\t') #w[1] is parent of w[0] cls2cls[int(l2i[w[0]]), int(l2i[w[1]])] = 1 cls2cls[int(l2i[w[1]]), int(l2i[w[0]])] = 1 fin.close() sp = graph_shortest_path(cls2cls, method='FW', directed=False) pname = translate_paramter([nn_nhidden, keep_prob, KNN]) pred_Y_all = np.load(our_output_dir + '/' + dname + '/' + str(iter) + '/' + str(unseen_ratio) + '/' + pname + 'pred_Y_all.npy') #res = evaluate(pred_Y_all, test_Y, unseen_l, nseen, Y_ind = test_Y_ind, Y_net = onto_net, write_screen = False, metrics = metrics, prefix = str(KNN)) Y_truth_bin_mat = ConvertLabels(test_Y, ncls) class_auc_macro = np.full(ncls, np.nan) class_auprc_macro = np.full(ncls, np.nan) for i in unseen_l: if len(np.unique(Y_truth_bin_mat[:, i])) == 2: class_auc_macro[i] = roc_auc_score(Y_truth_bin_mat[:, i], pred_Y_all[:, i]) for cutoff in cutoffs:
def gen_triplets_from_knn_in_batches(data,
                                     random_triplet_indices,
                                     num_neighbors=50,
                                     batch_size=10000):
    """Order candidate triplets by kNN-graph distance, one batch at a time.

    A ``num_neighbors``-nearest-neighbor distance graph is built from
    ``data`` and completed with ``graph_shortest_path`` (undirected). For
    every row ``(a, b, c)`` of ``random_triplet_indices`` the output row is
    ``(a, closer, farther)``, where ``closer`` is whichever of ``b``/``c``
    has the smaller graph distance to ``a``. Rows are processed
    ``batch_size`` at a time to bound the size of the temporary arrays.

    Changes from the previous implementation:

    * ``np.int`` (removed from NumPy) is no longer used.
    * When both distances are equal the input order ``(b, c)`` is kept;
      the old sign-based arithmetic wrote the average of the two indices
      into both slots.
    * The duplicated last-batch branch is gone; slicing past the end of an
      array already yields the short final batch.

    :param data: samples passed to ``kneighbors_graph``.
    :param random_triplet_indices: integer array with three columns
        (anchor and two candidates) indexing into ``data``.
    :param num_neighbors: neighbor count for the kNN graph.
    :param batch_size: number of triplets handled per iteration.
    :return: int array of shape ``(len(random_triplet_indices), 3)``.
    """
    kng = kneighbors_graph(data, num_neighbors, mode='distance', n_jobs=8)
    sp_dist_matrix = graph_shortest_path(kng, method='auto', directed=False)
    del kng  # only the completed distance matrix is needed from here on

    num_triplets = random_triplet_indices.shape[0]
    triplet_set = np.zeros((num_triplets, 3), dtype=int)

    for start in range(0, num_triplets, batch_size):
        stop = start + batch_size
        batch = random_triplet_indices[start:stop, :]
        anchors = batch[:, 0]
        first = batch[:, 1]
        second = batch[:, 2]
        first_is_closer = (sp_dist_matrix[anchors, first] <=
                           sp_dist_matrix[anchors, second])

        triplet_set[start:stop, 0] = anchors
        triplet_set[start:stop, 1] = np.where(first_is_closer, first, second)
        triplet_set[start:stop, 2] = np.where(first_is_closer, second, first)

    return triplet_set
x_new_te = sparse.lil_matrix(sparse.csr_matrix(XTE)[:,list(range(upcStart-1,nextStart-1))]) x_new_stack_T = vstack([x_new_tr,x_new_te]).T ## see boundry elements- print(sparse.csr_matrix(x_new_stack_T)[0]) ## #x_new = sparse.lil_matrix(sparse.csr_matrix(XD)[:,list(range(47,115))]) #x_new_T = x_new.T ## from sklearn.neighbors import NearestNeighbors from sklearn.utils.graph_shortest_path import graph_shortest_path import networkx as nx import pickle feat='upc' k=3 nbrs = NearestNeighbors(n_neighbors=k+1,metric='cosine',algorithm='brute').fit(x_new_stack_T) # k=(n_neighbors-1) (first neighbour is 'v' itself) #distances, indices = nbrs.kneighbors(x_new_T) # not directly needed, for now knnmatrix = nbrs.kneighbors_graph(x_new_stack_T,mode='distance') # sparse matrix(68x68) with nearest KNeighbours for each of the 68 pt knnmatrix.data[np.where(knnmatrix.data<0)]=0 sp = graph_shortest_path(knnmatrix,directed=False) # shortest-path-edge-weight from (v_i to v_j), (doc-https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/graph_shortest_path.pyx) G = nx.Graph(knnmatrix) spl = nx.shortest_path(G, weight='weight') # shortest-path dict-array from each v_i to v_j, do len(array) to find path-length ## spl = nx.shortest_path(G) # Without weight (just connections-1/0) pickle.dump(knnmatrix,open('knn_'+feat+'_k_'+str(k)+'.pickle.dump','wb')) # used to smooth out features pickle.dump(sp,open('sp_all_'+feat+'_k_'+str(k)+'.pickle.dump','wb')) #np.savetxt('sp_all_'+feat+'_k_'+str(k)+'.np.save', sp) pickle.dump(spl,open('spl_all_'+feat+'_k_'+str(k)+'.pickle.dump','wb')) ## knnmatrix_all = pickle.load(open('knn_'+feat+'_k_'+str(k)+'.pickle.dump','rb')) sp_all = pickle.load(open('sp_all_'+feat+'_k_'+str(k)+'.pickle.dump','rb')) #sp_all = np.loadtxt('sp_all_'+feat+'_k_'+str(k)+'.np.save') spl_all = pickle.load(open('spl_all_'+feat+'_k_'+str(k)+'.txt','rb')) # cosine_dist_all = pairwise_distances(x_new_stack_T, metric="cosine") 
pickle.dump(cosine_dist_all,open('cosine_dist_all_'+feat+'.pickle.dump','wb'))