def compute_emst(embedding: np.ndarray) -> np.ndarray:
    """Compute the Euclidean minimum spanning tree of a set of points.

    Args:
        embedding: Array whose first axis indexes the points. It is handed
            to ``mlp.emst`` unchanged.

    Returns:
        The ``"output"`` entry of the ``mlp.emst`` result. When
        ``embedding`` holds fewer than two points, an empty ``(0, 3)``
        ``float64`` array is returned instead and a warning is logged.
    """
    n_points = embedding.shape[0]
    if n_points <= 1:
        # A spanning tree over fewer than two points has no edges.
        # ``Logger.warning`` is used because ``Logger.warn`` is a deprecated
        # alias that emits a DeprecationWarning.
        logger.warning("can't compute EMST for %d points", n_points)
        return np.zeros((0, 3), dtype=np.float64)
    return mlp.emst(embedding)["output"]
def get_emst(embedding):
    """Return the Euclidean minimum spanning tree edges of ``embedding``.

    Args:
        embedding: Array whose first axis indexes the points. It is handed
            to ``mlp.emst`` unchanged.

    Returns:
        The ``'output'`` entry of the ``mlp.emst`` result, or an empty
        ``(0, 3)`` ``float64`` array (after logging a warning) when there
        are fewer than two points.
    """
    n_points = embedding.shape[0]
    if n_points <= 1:
        # No edges exist for fewer than two points. ``Logger.warn`` is a
        # deprecated alias, so the supported ``warning`` method is used.
        logger.warning("can't compute EMST for %d points", n_points)
        return np.zeros((0, 3), dtype=np.float64)
    return mlp.emst(embedding)['output']
def get_embedding_mst(embedding, alpha, coordinate_scale, offset, candidates):
    """Build a graph from the EMST of candidate voxels in embedding space.

    Each voxel is described by its embedding channels followed by its three
    grid indices multiplied by ``coordinate_scale``. The Euclidean minimum
    spanning tree is computed over the voxels where ``candidates == 1``.

    Args:
        embedding: Array unpacked as ``(channels, depth, height, width)``.
        alpha: Distance threshold. An MST edge is added to the graph only
            when ``alpha > distance``.
        coordinate_scale: Indexable with at least three per-axis factors
            applied to the voxel indices before they are appended to the
            embedding channels.
        offset: Added to every node position.
        candidates: Array with ``depth * height * width`` elements; voxels
            whose value equals 1 take part in the tree.

    Returns:
        An ``nx.DiGraph``. Nodes are row indices into the list of candidate
        points and carry a ``pos`` attribute (voxel index plus ``offset``).
        Both endpoints of every MST edge become nodes, even if the edge
        itself is dropped by the ``alpha`` threshold.
    """
    _, depth, height, width = embedding.shape
    n_voxels = depth * height * width

    # Stop half a step short of ``size * scale``: with a float step,
    # ``np.arange(0, size * scale, scale)`` can round up to ``size + 1``
    # samples, which would break the concatenation below.
    axis_coords = [
        np.arange(0, (size - 0.5) * coordinate_scale[axis], coordinate_scale[axis])
        for axis, size in enumerate((depth, height, width))
    ]
    # ``np.meshgrid`` returns a tuple on NumPy >= 2.0, so build a new list
    # rather than assigning into its result.
    coordinates = [
        grid.astype(np.float32)
        for grid in np.meshgrid(*axis_coords, indexing="ij")
    ]

    # One row per voxel: embedding channels followed by scaled coordinates.
    points = np.concatenate([embedding, coordinates], 0)
    points = np.transpose(points, axes=[1, 2, 3, 0]).reshape(n_voxels, -1)
    points = points[candidates.reshape(n_voxels) == 1, :]

    mst = nx.DiGraph()
    if points.shape[0] <= 1:
        # Fewer than two points means no MST edges, and nodes are only
        # created from edge endpoints, so the graph stays empty.
        return mst

    for u, v, distance in mlp.emst(points)["output"]:
        u, v = int(u), int(v)
        # The last three columns are the scaled coordinates; undo the scale
        # to get back to voxel indices.
        mst.add_node(u, pos=points[u][-3:] / coordinate_scale + offset)
        mst.add_node(v, pos=points[v][-3:] / coordinate_scale + offset)
        if alpha > distance:
            mst.add_edge(u, v)
    return mst
def get_embedding_mst(embedding, coordinate_scale, voxel_size, offset, candidates):
    """Build a graph from the EMST of candidate voxels in embedding space.

    Each voxel is described by its embedding channels followed by its three
    grid indices multiplied by ``coordinate_scale``. The Euclidean minimum
    spanning tree is computed over the voxels where ``candidates == 1`` and
    every tree edge is stored in the returned graph.

    Args:
        embedding: Array unpacked as ``(channels, depth, height, width)``.
        coordinate_scale: Indexable with at least three per-axis factors
            applied to the voxel indices before they are appended to the
            embedding channels.
        voxel_size: Multiplied with the voxel index to obtain a node
            location.
        offset: Added to every node location.
        candidates: Array with ``depth * height * width`` elements; voxels
            whose value equals 1 take part in the tree.

    Returns:
        An ``nx.DiGraph`` whose nodes are row indices into the list of
        candidate points, each with a ``location`` attribute, and whose
        edges carry the MST edge length as attribute ``d``.
    """
    _, depth, height, width = embedding.shape
    n_voxels = depth * height * width

    # Stopping half a step short of ``size * scale`` guarantees exactly
    # ``size`` samples per axis even with a floating-point step.
    axis_coords = [
        np.arange(0, (size - 0.5) * coordinate_scale[axis], coordinate_scale[axis])
        for axis, size in enumerate((depth, height, width))
    ]
    # ``np.meshgrid`` returns a tuple on NumPy >= 2.0, so build a new list
    # rather than assigning into its result.
    coordinates = [
        grid.astype(np.float32)
        for grid in np.meshgrid(*axis_coords, indexing="ij")
    ]

    # One row per voxel: embedding channels followed by scaled coordinates.
    points = np.concatenate([embedding, coordinates], 0)
    points = np.transpose(points, axes=[1, 2, 3, 0]).reshape(n_voxels, -1)
    points = points[candidates.reshape(n_voxels) == 1, :]

    mst = nx.DiGraph()
    if points.shape[0] <= 1:
        # Fewer than two points means no MST edges, and nodes are only
        # created from edge endpoints, so the graph stays empty.
        return mst

    for u, v, distance in mlp.emst(points)["output"]:
        u, v = int(u), int(v)
        for node in (u, v):
            # Last three columns are scaled coordinates: undo the scale to
            # get voxel indices, then convert to locations.
            location = (points[node][-3:] / coordinate_scale) * voxel_size
            mst.add_node(node, location=location + offset)
        mst.add_edge(u, v, d=distance)
    # Every node is created above with a ``location`` attribute, so no
    # separate post-hoc check is required.
    return mst
def process(self, batch, request: BatchRequest):
    """Compute the EMST over masked voxels and emit it as a graph.

    Reads the array stored under ``self.embeddings`` and the array stored
    under ``self.mask`` from ``batch``. Each voxel whose mask value equals 1
    becomes a point made of its embedding channels plus its grid indices
    scaled by ``self.coordinate_scale``. The Euclidean minimum spanning tree
    of those points is written to ``outputs[self.mst]``.

    Args:
        batch: Batch providing ``self.embeddings`` (data unpacked as
            ``(channels, depth, height, width)``) and ``self.mask``.
        request: Request whose entry for ``self.mst`` supplies the graph
            spec. That spec is marked undirected in place.

    Returns:
        A new ``Batch`` containing the MST graph under ``self.mst``. Node
        ids are row indices into the list of masked points; each edge stores
        its length under ``self.distance_attr``.
    """
    outputs = Batch()

    voxel_size = batch[self.embeddings].spec.voxel_size
    offset = batch[self.embeddings].spec.roi.get_begin()
    embeddings = batch[self.embeddings].data
    candidates = batch[self.mask].data
    _, depth, height, width = embeddings.shape
    n_voxels = depth * height * width

    # Stopping half a step short of ``size * scale`` guarantees exactly
    # ``size`` samples per axis even with a floating-point step.
    axis_coords = [
        np.arange(
            0,
            (size - 0.5) * self.coordinate_scale[axis],
            self.coordinate_scale[axis],
        )
        for axis, size in enumerate((depth, height, width))
    ]
    # ``np.meshgrid`` returns a tuple on NumPy >= 2.0, so build a new list
    # rather than assigning into its result.
    coordinates = [
        grid.astype(np.float32)
        for grid in np.meshgrid(*axis_coords, indexing="ij")
    ]

    # One row per voxel: embedding channels followed by scaled coordinates.
    embedding = np.concatenate([embeddings, coordinates], 0)
    embedding = np.transpose(embedding, axes=[1, 2, 3, 0])
    embedding = embedding.reshape(n_voxels, -1)
    embedding = embedding[candidates.reshape(n_voxels) == 1, :]

    # A spanning tree over fewer than two points has no edges, so skip the
    # mlpack call and emit an empty graph in that case.
    if embedding.shape[0] > 1:
        emst = mlp.emst(embedding)["output"]
    else:
        emst = ()

    nodes = set()
    edges = []
    for u, v, distance in emst:
        u, v = int(u), int(v)
        for index in (u, v):
            # Last three columns are scaled coordinates: undo the scale to
            # get voxel indices, then convert to locations.
            location = embedding[index][-3:] / self.coordinate_scale * voxel_size
            nodes.add(Node(index, location=location + offset))
        edges.append(Edge(u, v, attrs={self.distance_attr: distance}))

    graph_spec = request[self.mst]
    graph_spec.directed = False

    outputs[self.mst] = Graph(nodes, edges, graph_spec)
    logger.debug(
        f"OUTPUTS CONTAINS MST WITH {len(list(outputs[self.mst].nodes))} NODES"
    )
    return outputs
def _get_mst_exact(self, X, cur_state):
    """Compute, or reuse from the previous call, the exact MST of ``X``.

    The tree comes from ``mlpack.emst`` when ``cur_state["mlpack_enabled"]``
    is (or resolves to) true and from ``internal.mst_from_distance``
    otherwise. On the latter path, ``M >= 2`` additionally derives core
    distances from nearest neighbours via ``internal.knn_from_distance``
    and ``internal.get_d_core``.

    Results are stored on ``self`` as ``n_samples_``, ``_mst_dist_``,
    ``_mst_ind_``, ``_nn_dist_``, ``_nn_ind_`` and ``_d_core_``.

    Args:
        X: Input data. With ``affinity == "precomputed"`` it must reshape
            to either a single-column distance vector or a square distance
            matrix; otherwise rows are samples.
        cur_state: Settings dict. Keys read here: ``affinity``,
            ``cast_float32``, ``n_features``, ``mlpack_enabled``, ``M``,
            ``X``, ``exact``, ``mlpack_leaf_size``, ``verbose``.
            ``n_features`` (when negative) and ``mlpack_enabled`` (when
            ``"auto"``) are updated in place.

    Returns:
        ``cur_state`` (the same dict, possibly updated).

    Raises:
        ValueError: For a malformed precomputed ``X``, a sparse ``X`` with
            ``cast_float32``, an unusable ``mlpack`` configuration, or an
            ``M`` that is too large for the number of rows.
    """
    if cur_state["affinity"] == "precomputed":
        X = X.reshape(X.shape[0], -1)
        if X.shape[1] not in [1, X.shape[0]]:
            raise ValueError("`X` must be distance vector "
                             "or a square-form distance matrix, "
                             "see `scipy.spatial.distance.pdist` or "
                             "`scipy.spatial.distance.squareform`.")
        if X.shape[1] == 1:
            # A condensed distance vector has n*(n-1)/2 entries; solve that
            # quadratic for n.
            n_samples = int(
                round((math.sqrt(1.0 + 8.0 * X.shape[0]) + 1.0) / 2.0))
            assert n_samples * (n_samples - 1) // 2 == X.shape[0]
        else:
            n_samples = X.shape[0]
    else:
        if cur_state["cast_float32"]:
            if scipy.sparse.isspmatrix(X):
                raise ValueError("Sparse matrices are (currently) only "
                                 "supported when `exact` is False")
            # ``np.array(..., copy=False)`` raises on NumPy >= 2.0 whenever
            # a conversion is needed (e.g. float64 input). ``np.asarray``
            # copies only if required; ``np.atleast_2d`` reproduces
            # ``ndmin=2`` as a view, keeping the C-contiguous layout.
            X = np.atleast_2d(np.asarray(X, dtype=np.float32, order="C"))

        n_samples = X.shape[0]
        if cur_state["n_features"] < 0:
            cur_state["n_features"] = X.shape[1]

    # Resolve "auto": mlpack is used only when it is importable, the metric
    # is Euclidean, the data has at most 6 columns and M == 1.
    if cur_state["mlpack_enabled"] == "auto":
        cur_state["mlpack_enabled"] = mlpack is not None and \
            cur_state["affinity"] == "l2" and \
            X.shape[1] <= 6 and \
            cur_state["M"] == 1

    if cur_state["mlpack_enabled"]:
        if mlpack is None:
            raise ValueError("Package `mlpack` is not available.")
        elif cur_state["affinity"] != "l2":
            raise ValueError(
                "`mlpack` can only be used with `affinity` = 'l2'.")
        elif cur_state["M"] != 1:
            raise ValueError("`mlpack` can only be used with `M` = 1.")

    mst_dist = None
    mst_ind = None
    nn_dist = None
    nn_ind = None
    d_core = None

    # Reuse cached results when the previous call matches on ``X``,
    # ``affinity``, ``exact`` and ``cast_float32``.
    # NOTE(review): ``cur_state["X"]`` is compared with ``==``, so it is
    # presumably an identifier of the input rather than the data itself;
    # confirm against the caller.
    if self._last_state_ is not None and \
            cur_state["X"] == self._last_state_["X"] and \
            cur_state["affinity"] == self._last_state_["affinity"] and \
            cur_state["exact"] == self._last_state_["exact"] and \
            cur_state["cast_float32"] == self._last_state_["cast_float32"]:
        if cur_state["M"] == self._last_state_["M"]:
            mst_dist = self._mst_dist_
            mst_ind = self._mst_ind_
            nn_dist = self._nn_dist_
            nn_ind = self._nn_ind_
        elif cur_state["M"] < self._last_state_["M"]:
            # Only the cached neighbour data is reused for a smaller M; the
            # tree itself is recomputed.
            nn_dist = self._nn_dist_
            nn_ind = self._nn_ind_

    if cur_state["mlpack_enabled"]:
        assert cur_state["M"] == 1
        assert cur_state["affinity"] == "l2"

        if mst_dist is None or mst_ind is None:
            _res = mlpack.emst(
                input=X,
                leaf_size=cur_state["mlpack_leaf_size"],
                naive=False,
                copy_all_inputs=False,
                verbose=cur_state["verbose"])["output"]
            # Column 2 is taken as the edge distance, columns 0-1 as the
            # endpoint indices.
            mst_dist = _res[:, 2].astype(X.dtype, order="C")
            mst_ind = _res[:, :2].astype(np.intp, order="C")
    else:
        if cur_state["M"] >= 2:  # else d_core = None
            # Genie+HDBSCAN --- determine d_core
            # TODO: mlpack for k-nns?
            if cur_state["M"] - 1 >= X.shape[0]:
                raise ValueError("`M` is too large")

            if nn_dist is None or nn_ind is None:
                nn_dist, nn_ind = internal.knn_from_distance(
                    X,  # if not c_contiguous, raises an error
                    k=cur_state["M"] - 1,
                    metric=cur_state["affinity"],  # supports "precomputed"
                    verbose=cur_state["verbose"])

            d_core = internal.get_d_core(nn_dist, nn_ind, cur_state["M"])

        # Use Prim's algorithm to determine the MST
        # w.r.t. the distances computed on the fly
        if mst_dist is None or mst_ind is None:
            mst_dist, mst_ind = internal.mst_from_distance(
                X,  # if not c_contiguous, raises an error
                metric=cur_state["affinity"],
                d_core=d_core,
                verbose=cur_state["verbose"])

    self.n_samples_ = n_samples
    self._mst_dist_ = mst_dist
    self._mst_ind_ = mst_ind
    self._nn_dist_ = nn_dist
    self._nn_ind_ = nn_ind
    self._d_core_ = d_core

    return cur_state
def euclideanMstWeight(X):
    """Return the summed edge weight of the Euclidean MST of ``X``.

    ``X`` is passed to ``mlpack.emst`` (dual-tree mode, leaf size 1, inputs
    copied). The values in column index 2 of the resulting ``'output'``
    array are added up and returned; that column is presumably the edge
    length, per mlpack's EMST output layout (verify against its docs).
    """
    emst_result = mlpack.emst(
        X,
        copy_all_inputs=True,
        leaf_size=1,
        naive=False,
    )
    edge_weights = emst_result['output'][:, 2]
    return edge_weights.sum()
from preparacaoDados import tratamentoDados import pandas as pd import mlpack data, label = tratamentoDados("sem OHE") # Carrega os dados dados = pd.read_csv("dados_stacking.csv", encoding="utf-8") # Carrega os dados do Stacking # Calculando EMST result = mlpack.emst(dados) emst = pd.DataFrame(*result.values()) emst.to_csv('emst.csv', index=False, header=False) # Retira o ponto do codigo do rotulo label['natureza_despesa_cod'] = [ label['natureza_despesa_cod'].iloc[i].replace(".", "") for i in range(data.shape[0]) ] # Convertendo os dados para o formato correto data = dados.astype("str") for i in range(data.shape[0]): for j in range(data.shape[1]): data.iloc[i].iloc[j] = "FEAT" + str(j + 1) + ":" + str( data.iloc[i].iloc[j]) data.reset_index(drop=True, inplace=True) dados_preparados = pd.concat([label, data], axis=1) # Salva os dados dados_preparados.to_csv('dados_preparados.csv', index=False, header=False,
print("raw is open") embedding = daisy.open_ds(args.raw_file, args.emb_dataset) channels = embedding.shape[0] embedding_transp = np.array(embedding.data).transpose(1,2,0) cx = np.arange(embedding_transp.shape[1], dtype=embedding_transp.dtype) cy = np.arange(embedding_transp.shape[0], dtype=embedding_transp.dtype) coords = np.meshgrid(cx, cy, copy=True) coords = np.stack(coords, axis=-1) print(embedding_transp.shape) embedding_transp[..., :2] += coords embedding_transp = embedding_transp.reshape((-1, channels)) mst = mlp.emst(embedding_transp)["output"] # label_image = label(label_image).astype(np.uint32) # for i in range(embedding_transp.shape[0]//10): # u,v,s = mst[i] # uf.union(int(u), int(v)) # from tqdm import tqdm # for i in tqdm(range(embedding_transp.shape[0])): # label_image.flatten()[i] = uf.find(i) print("embedding is open") print(raw.data.shape, embedding.data.shape) for a in [raw, embedding]:
import faiss

# Benchmark: time genieclust's MST-from-distance routine against
# ``mlpack.emst`` on standard-normal samples of varying size ``n`` and
# dimensionality ``d``. One result row is collected per (n, d) pair.
res = []
for n in (10_000, 20_000, 40_000):
    for d in range(2, 10):
        DATASET = "random.norm(n=%d, d=%d)" % (n, d)
        # Fixed seed so every (n, d) configuration is reproducible.
        np.random.seed(123)
        X = np.random.normal(size=(n, d))
        print(DATASET)

        # ``time.perf_counter`` is the clock intended for measuring
        # durations; ``time.time`` is a wall clock that can jump if the
        # system time is adjusted.
        t02 = time.perf_counter()
        tree2 = genieclust.internal.mst_from_distance(X)
        t12 = time.perf_counter()
        print("genieclust.from_distance: %.3f" % (t12 - t02))

        t01 = time.perf_counter()
        tree1 = mlpack.emst(input=X)
        t11 = time.perf_counter()
        print("mlpack.emst: %.3f" % (t11 - t01))

        res.append(
            dict(n=n, d=d, mlpack_emst=t11 - t01,
                 genieclust_from_distance=t12 - t02))

import pandas as pd

res = pd.DataFrame(res)
print(res)
# Reshape to long format: one row per (n, d, method) with its timing.
res = res.set_index(["n", "d"]).stack().reset_index()
res.columns = ["n", "d", "method", "time"]
def process(self, batch, request: BatchRequest):
    """Compute one EMST per connected component of the candidate graph.

    Reads the embedding array stored under ``self.embeddings`` and the
    graph stored under ``self.candidates`` from ``batch``. For every
    connected component of the candidate graph, the component's nodes are
    mapped to voxels and the Euclidean minimum spanning tree is computed
    over those voxels. Each voxel is described by its embedding channels
    plus its grid indices scaled by ``self.coordinate_scale``. All
    per-component trees are merged into one graph under
    ``outputs[self.mst]``.

    Args:
        batch: Batch providing ``self.embeddings`` (data unpacked as
            ``(channels, depth, height, width)``) and ``self.candidates``
            (convertible with ``to_nx_graph()``; nodes carry a
            ``location`` attribute).
        request: Request whose entry for ``self.mst`` supplies the graph
            spec. That spec is marked undirected in place.

    Returns:
        A new ``Batch`` containing the merged MST graph under ``self.mst``.
        Node ids are the candidate node ids; each edge stores its length
        under ``self.distance_attr``.
    """
    outputs = Batch()

    voxel_size = batch[self.embeddings].spec.voxel_size
    offset = batch[self.embeddings].spec.roi.get_begin()
    embeddings = batch[self.embeddings].data
    candidates = batch[self.candidates].to_nx_graph()
    _, depth, height, width = embeddings.shape
    n_voxels = depth * height * width

    # Stopping half a step short of ``size * scale`` guarantees exactly
    # ``size`` samples per axis even with a floating-point step.
    axis_coords = [
        np.arange(
            0,
            (size - 0.5) * self.coordinate_scale[axis],
            self.coordinate_scale[axis],
        )
        for axis, size in enumerate((depth, height, width))
    ]
    # ``np.meshgrid`` returns a tuple on NumPy >= 2.0, so build a new list
    # rather than assigning into its result.
    coordinates = [
        grid.astype(np.float32)
        for grid in np.meshgrid(*axis_coords, indexing="ij")
    ]

    # One row per voxel: embedding channels followed by scaled coordinates.
    embedding = np.concatenate([embeddings, coordinates], 0)
    embedding = np.transpose(embedding, axes=[1, 2, 3, 0])
    embedding = embedding.reshape(n_voxels, -1)

    nodes = set()
    edges = []

    for i, component in enumerate(nx.connected_components(candidates)):
        # Boolean voxel mask for this component, plus the reverse mapping
        # from voxel index to candidate node id.
        candidates_array = np.zeros((depth, height, width), dtype=bool)
        locs_to_ids = {}
        for node in component:
            location = candidates.nodes[node]["location"]
            voxel_location = tuple(
                int(x) for x in ((location - offset) // voxel_size)
            )
            locs_to_ids[voxel_location] = node
            candidates_array[voxel_location] = True
        component_embedding = embedding[candidates_array.reshape(-1), :]

        logger.info(
            f"processing component {i} with "
            f"{len(component)} candidates"
        )

        if component_embedding.shape[0] < 2:
            # A spanning tree over fewer than two points has no edges, so
            # this component contributes nothing to the output.
            continue

        component_emst = mlp.emst(component_embedding)["output"]

        for u, v, distance in component_emst:
            endpoint_ids = []
            for row in (int(u), int(v)):
                # Last three columns are scaled coordinates: undo the scale
                # to get voxel indices, then convert to locations.
                position = (
                    component_embedding[row][-3:]
                    / self.coordinate_scale
                    * voxel_size
                )
                # Round back to the integer voxel index to recover the
                # candidate node id stored for that voxel.
                node_id = locs_to_ids[
                    tuple(int(np.round(x)) for x in (position / voxel_size))
                ]
                nodes.add(Node(node_id, location=position + offset))
                endpoint_ids.append(node_id)
            edges.append(
                Edge(
                    endpoint_ids[0],
                    endpoint_ids[1],
                    attrs={self.distance_attr: distance},
                )
            )

    graph_spec = request[self.mst]
    graph_spec.directed = False

    logger.info(
        f"candidates has {candidates.number_of_nodes()} nodes and "
        f"{candidates.number_of_edges()} edges and "
        f"{len(list(nx.connected_components(candidates)))} components"
    )

    outputs[self.mst] = Graph(nodes, edges, graph_spec)
    output_graph = outputs[self.mst].to_nx_graph()

    logger.info(
        f"output_graph has {output_graph.number_of_nodes()} nodes and "
        f"{output_graph.number_of_edges()} edges and "
        f"{len(list(nx.connected_components(output_graph)))} components"
    )

    logger.debug(
        f"OUTPUTS CONTAINS MST WITH {len(list(outputs[self.mst].nodes))} NODES"
    )
    return outputs