def compute_emst(embedding: np.ndarray) -> np.ndarray:

    if embedding.shape[0] <= 1:
        logger.warning("can't compute EMST for %d points", embedding.shape[0])
        return np.zeros((0, 3), dtype=np.float64)

    return mlp.emst(embedding)["output"]
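For reference, the mlpack emst binding used in these examples returns a dict whose "output" entry is an (n - 1) x 3 array: each row holds the indices of the two endpoints and the edge length, sorted by increasing length. A minimal sketch of that contract, assuming only numpy and mlpack are installed:

import numpy as np
import mlpack

# Hypothetical standalone check of the output format relied on throughout these examples.
points = np.random.default_rng(0).normal(size=(100, 3))
edges = mlpack.emst(points)["output"]
print(edges.shape)        # (99, 3): one [index_u, index_v, distance] row per MST edge
print(edges[:, 2].sum())  # total weight of the Euclidean MST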
Example 2
def get_emst(embedding):

    if embedding.shape[0] <= 1:
        logger.warning("can't compute EMST for %d points", embedding.shape[0])
        return np.zeros((0, 3), dtype=np.float64)

    return mlp.emst(embedding)['output']
Example 3
def get_embedding_mst(embedding, alpha, coordinate_scale, offset, candidates):

    _, depth, height, width = embedding.shape
    coordinates = np.meshgrid(
        np.arange(0, depth * coordinate_scale[0], coordinate_scale[0]),
        np.arange(0, height * coordinate_scale[1], coordinate_scale[1]),
        np.arange(0, width * coordinate_scale[2], coordinate_scale[2]),
        indexing="ij",
    )
    for i in range(len(coordinates)):
        coordinates[i] = coordinates[i].astype(np.float32)
    embedding = np.concatenate([embedding, coordinates], 0)
    embedding = np.transpose(embedding, axes=[1, 2, 3, 0])
    embedding = embedding.reshape(depth * width * height, -1)
    candidates = candidates.reshape(depth * width * height)
    embedding = embedding[candidates == 1, :]

    emst = mlp.emst(embedding)["output"]

    mst = nx.DiGraph()
    for u, v, distance in emst:
        u = int(u)
        pos_u = embedding[u][-3:] / coordinate_scale
        v = int(v)
        pos_v = embedding[v][-3:] / coordinate_scale
        mst.add_node(u, pos=pos_u + offset)
        mst.add_node(v, pos=pos_v + offset)
        if alpha > distance:
            mst.add_edge(u, v)
    return mst
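A hypothetical call of get_embedding_mst on a tiny synthetic volume (the shapes, threshold, and scales below are illustrative, and the module-level np/nx/mlp imports the function relies on are assumed to be in scope). Because edges with distance >= alpha are skipped, the returned DiGraph is a forest whose weakly connected components group nearby candidate voxels:

import numpy as np
import networkx as nx

embedding = np.random.default_rng(0).normal(size=(2, 4, 4, 4)).astype(np.float32)  # 2 channels, 4x4x4 volume
candidates = (np.random.default_rng(1).random((4, 4, 4)) > 0.5).astype(np.uint8)   # random candidate mask

mst = get_embedding_mst(
    embedding,
    alpha=0.5,                         # illustrative distance threshold
    coordinate_scale=(1.0, 1.0, 1.0),
    offset=np.zeros(3),
    candidates=candidates,
)
clusters = list(nx.weakly_connected_components(mst))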
Example 4
def get_embedding_mst(embedding, coordinate_scale, voxel_size, offset,
                      candidates):

    _, depth, height, width = embedding.shape
    coordinates = np.meshgrid(
        np.arange(0, (depth - 0.5) * coordinate_scale[0], coordinate_scale[0]),
        np.arange(0, (height - 0.5) * coordinate_scale[1],
                  coordinate_scale[1]),
        np.arange(0, (width - 0.5) * coordinate_scale[2], coordinate_scale[2]),
        indexing="ij",
    )
    for i in range(len(coordinates)):
        coordinates[i] = coordinates[i].astype(np.float32)

    embedding = np.concatenate([embedding, coordinates], 0)
    embedding = np.transpose(embedding, axes=[1, 2, 3, 0])
    embedding = embedding.reshape(depth * width * height, -1)
    candidates = candidates.reshape(depth * width * height)
    embedding = embedding[candidates == 1, :]

    emst = mlp.emst(embedding)["output"]

    mst = nx.DiGraph()
    for u, v, distance in emst:
        u = int(u)
        pos_u = (embedding[u][-3:] / coordinate_scale) * voxel_size
        v = int(v)
        pos_v = (embedding[v][-3:] / coordinate_scale) * voxel_size
        mst.add_node(u, location=pos_u + offset)
        mst.add_node(v, location=pos_v + offset)
        mst.add_edge(u, v, d=distance)
    for node, attrs in mst.nodes.items():
        assert "location" in attrs
    return mst
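A possible follow-up (not part of the original snippet): since every edge of the graph returned above carries its EMST length in the "d" attribute, a single-linkage-style grouping can be obtained later by cutting long edges, without recomputing the tree. The threshold value here is purely illustrative:

import networkx as nx

threshold = 1.0  # illustrative cut length
long_edges = [(u, v) for u, v, d in mst.edges(data="d") if d > threshold]
mst.remove_edges_from(long_edges)
segments = list(nx.weakly_connected_components(mst))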
Example 5
    def process(self, batch, request: BatchRequest):
        outputs = Batch()

        voxel_size = batch[self.embeddings].spec.voxel_size
        offset = batch[self.embeddings].spec.roi.get_begin()
        embeddings = batch[self.embeddings].data
        candidates = batch[self.mask].data
        _, depth, height, width = embeddings.shape
        coordinates = np.meshgrid(
            np.arange(0, (depth - 0.5) * self.coordinate_scale[0],
                      self.coordinate_scale[0]),
            np.arange(0, (height - 0.5) * self.coordinate_scale[1],
                      self.coordinate_scale[1]),
            np.arange(0, (width - 0.5) * self.coordinate_scale[2],
                      self.coordinate_scale[2]),
            indexing="ij",
        )
        for i in range(len(coordinates)):
            coordinates[i] = coordinates[i].astype(np.float32)

        embedding = np.concatenate([embeddings, coordinates], 0)
        embedding = np.transpose(embedding, axes=[1, 2, 3, 0])
        embedding = embedding.reshape(depth * width * height, -1)
        candidates = candidates.reshape(depth * width * height)
        embedding = embedding[candidates == 1, :]

        emst = mlp.emst(embedding)["output"]

        nodes = set()
        edges = []
        for u, v, distance in emst:
            u = int(u)
            pos_u = embedding[u][-3:] / self.coordinate_scale * voxel_size
            v = int(v)
            pos_v = embedding[v][-3:] / self.coordinate_scale * voxel_size
            nodes.add(Node(u, location=pos_u + offset))
            nodes.add(Node(v, location=pos_v + offset))
            edges.append(Edge(u, v, attrs={self.distance_attr: distance}))

        graph_spec = request[self.mst]
        graph_spec.directed = False

        outputs[self.mst] = Graph(nodes, edges, graph_spec)
        logger.debug(
            f"OUTPUTS CONTAINS MST WITH {len(list(outputs[self.mst].nodes))} NODES"
        )

        return outputs
Example 6
    def _get_mst_exact(self, X, cur_state):
        if cur_state["affinity"] == "precomputed":
            X = X.reshape(X.shape[0], -1)
            if X.shape[1] not in [1, X.shape[0]]:
                raise ValueError("`X` must be distance vector "
                                 "or a square-form distance matrix, "
                                 "see `scipy.spatial.distance.pdist` or "
                                 "`scipy.spatial.distance.squareform`.")
            if X.shape[1] == 1:
                # from a very advanced and sophisticated quadratic equation:
                n_samples = int(
                    round((math.sqrt(1.0 + 8.0 * X.shape[0]) + 1.0) / 2.0))
                assert n_samples * (n_samples - 1) // 2 == X.shape[0]
            else:
                n_samples = X.shape[0]
        else:
            if cur_state["cast_float32"]:
                if scipy.sparse.isspmatrix(X):
                    raise ValueError("Sparse matrices are (currently) only "
                                     "supported when `exact` is False")
                X = np.array(X,
                             dtype=np.float32,
                             order="C",
                             copy=False,
                             ndmin=2)

            n_samples = X.shape[0]
            if cur_state["n_features"] < 0:
                cur_state["n_features"] = X.shape[1]

        if cur_state["mlpack_enabled"] == "auto":
            cur_state["mlpack_enabled"] = mlpack is not None and \
                    cur_state["affinity"] == "l2" and \
                    X.shape[1] <= 6 and \
                    cur_state["M"] == 1

        if cur_state["mlpack_enabled"]:
            if mlpack is None:
                raise ValueError("Package `mlpack` is not available.")
            elif cur_state["affinity"] != "l2":
                raise ValueError(
                    "`mlpack` can only be used with `affinity` = 'l2'.")
            elif cur_state["M"] != 1:
                raise ValueError("`mlpack` can only be used with `M` = 1.")

        mst_dist = None
        mst_ind = None
        nn_dist = None
        nn_ind = None
        d_core = None

        if self._last_state_ is not None and \
                cur_state["X"]            == self._last_state_["X"] and \
                cur_state["affinity"]     == self._last_state_["affinity"] and \
                cur_state["exact"]        == self._last_state_["exact"] and \
                cur_state["cast_float32"] == self._last_state_["cast_float32"]:

            if cur_state["M"] == self._last_state_["M"]:
                mst_dist = self._mst_dist_
                mst_ind = self._mst_ind_
                nn_dist = self._nn_dist_
                nn_ind = self._nn_ind_
            elif cur_state["M"] < self._last_state_["M"]:
                nn_dist = self._nn_dist_
                nn_ind = self._nn_ind_

        if cur_state["mlpack_enabled"]:
            assert cur_state["M"] == 1
            assert cur_state["affinity"] == "l2"

            if mst_dist is None or mst_ind is None:
                _res = mlpack.emst(input=X,
                                   leaf_size=cur_state["mlpack_leaf_size"],
                                   naive=False,
                                   copy_all_inputs=False,
                                   verbose=cur_state["verbose"])["output"]
                mst_dist = _res[:, 2].astype(X.dtype, order="C")
                mst_ind = _res[:, :2].astype(np.intp, order="C")
        else:
            if cur_state["M"] >= 2:  # else d_core   = None
                # Genie+HDBSCAN --- determine d_core
                # TODO: mlpack for k-nns?

                if cur_state["M"] - 1 >= X.shape[0]:
                    raise ValueError("`M` is too large")

                if nn_dist is None or nn_ind is None:
                    nn_dist, nn_ind = internal.knn_from_distance(
                        X,  # if not c_contiguous, raises an error
                        k=cur_state["M"] - 1,
                        metric=cur_state["affinity"],  # supports "precomputed"
                        verbose=cur_state["verbose"])

                d_core = internal.get_d_core(nn_dist, nn_ind, cur_state["M"])

            # Use Prim's algorithm to determine the MST
            # w.r.t. the distances computed on the fly
            if mst_dist is None or mst_ind is None:
                mst_dist, mst_ind = internal.mst_from_distance(
                    X,  # if not c_contiguous, raises an error
                    metric=cur_state["affinity"],
                    d_core=d_core,
                    verbose=cur_state["verbose"])

        self.n_samples_ = n_samples
        self._mst_dist_ = mst_dist
        self._mst_ind_ = mst_ind
        self._nn_dist_ = nn_dist
        self._nn_ind_ = nn_ind
        self._d_core_ = d_core

        return cur_state
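For context, the method above only delegates to mlpack.emst for low-dimensional L2 data with M == 1; otherwise it falls back to a Prim-style MST over distances computed on the fly. A hypothetical end-user view of this, assuming genieclust is installed (with default settings this should go through the exact MST path above):

import numpy as np
import genieclust

X = np.random.default_rng(0).normal(size=(1000, 3))
labels = genieclust.Genie(n_clusters=5).fit_predict(X)  # clusters derived from the MST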
Example 7
def euclideanMstWeight(X):
    mstEdges = mlpack.emst(X, copy_all_inputs=True, leaf_size=1,
                           naive=False)['output']
    mstWeight = mstEdges[:, 2].sum()
    return mstWeight
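A hypothetical call of the helper above on a random 2-D point cloud (numpy and the module-level mlpack import are assumed to be available):

import numpy as np

pts = np.random.default_rng(42).random((500, 2))
print(euclideanMstWeight(pts))  # total length of the Euclidean MST over 500 points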
Example 8
from preparacaoDados import tratamentoDados
import pandas as pd
import mlpack

data, label = tratamentoDados("sem OHE")  # load the data
dados = pd.read_csv("dados_stacking.csv",
                    encoding="utf-8")  # load the stacking data

# Compute the EMST
result = mlpack.emst(dados)
emst = pd.DataFrame(result["output"])
emst.to_csv('emst.csv', index=False, header=False)

# Strip the dot from the label code
label['natureza_despesa_cod'] = [
    label['natureza_despesa_cod'].iloc[i].replace(".", "")
    for i in range(data.shape[0])
]
# Convert the data to the expected format
data = dados.astype("str")
for i in range(data.shape[0]):
    for j in range(data.shape[1]):
        data.iloc[i].iloc[j] = "FEAT" + str(j + 1) + ":" + str(
            data.iloc[i].iloc[j])
data.reset_index(drop=True, inplace=True)
dados_preparados = pd.concat([label, data], axis=1)

# Save the data
dados_preparados.to_csv('dados_preparados.csv',
                        index=False,
                        header=False)
Example 9
    print("raw is open")
    embedding = daisy.open_ds(args.raw_file, args.emb_dataset)
    channels = embedding.shape[0]
    
    embedding_transp = np.array(embedding.data).transpose(1,2,0)

    cx = np.arange(embedding_transp.shape[1], dtype=embedding_transp.dtype)
    cy = np.arange(embedding_transp.shape[0], dtype=embedding_transp.dtype)
    coords = np.meshgrid(cx, cy, copy=True)
    coords = np.stack(coords, axis=-1)

    print(embedding_transp.shape)
    embedding_transp[..., :2] += coords
    embedding_transp = embedding_transp.reshape((-1, channels))

    mst = mlp.emst(embedding_transp)["output"]

    # label_image = label(label_image).astype(np.uint32)
    # for i in range(embedding_transp.shape[0]//10):
    #     u,v,s = mst[i]
    #     uf.union(int(u), int(v))

    # from tqdm import tqdm
    # for i in tqdm(range(embedding_transp.shape[0])):
    #     label_image.flatten()[i] = uf.find(i)

    print("embedding is open")
    
    print(raw.data.shape, embedding.data.shape)

    for a in [raw, embedding]:
Example 10
import time

import faiss
import genieclust.internal
import mlpack
import numpy as np

n = 25_000
res = []
for n in (10_000, 20_000, 40_000):
    for d in range(2, 10):
        DATASET = "random.norm(n=%d, d=%d)" % (n, d)
        np.random.seed(123)
        X = np.random.normal(size=(n, d))
        print(DATASET)
        t02 = time.time()
        tree2 = genieclust.internal.mst_from_distance(X)
        t12 = time.time()
        print("genieclust.from_distance: %.3f" % (t12 - t02))
        t01 = time.time()
        tree1 = mlpack.emst(input=X)
        t11 = time.time()
        print("mlpack.emst: %.3f" % (t11 - t01))
        res.append(
            dict(n=n,
                 d=d,
                 mlpack_emst=t11 - t01,
                 genieclust_from_distance=t12 - t02))

import pandas as pd
res = pd.DataFrame(res)
print(res)

res = res.set_index(["n", "d"]).stack().reset_index()
res.columns = ["n", "d", "method", "time"]
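A possible continuation (not in the original snippet): plotting the long-format timings prepared above, one panel per sample size, assuming seaborn and matplotlib are available:

import matplotlib.pyplot as plt
import seaborn as sns

sns.relplot(data=res, x="d", y="time", hue="method", col="n", kind="line")
plt.show()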
Example 11
    def process(self, batch, request: BatchRequest):
        outputs = Batch()

        voxel_size = batch[self.embeddings].spec.voxel_size
        offset = batch[self.embeddings].spec.roi.get_begin()
        embeddings = batch[self.embeddings].data
        candidates = batch[self.candidates].to_nx_graph()
        _, depth, height, width = embeddings.shape
        coordinates = np.meshgrid(
            np.arange(
                0, (depth - 0.5) * self.coordinate_scale[0], self.coordinate_scale[0]
            ),
            np.arange(
                0, (height - 0.5) * self.coordinate_scale[1], self.coordinate_scale[1]
            ),
            np.arange(
                0, (width - 0.5) * self.coordinate_scale[2], self.coordinate_scale[2]
            ),
            indexing="ij",
        )
        for i in range(len(coordinates)):
            coordinates[i] = coordinates[i].astype(np.float32)

        embedding = np.concatenate([embeddings, coordinates], 0)
        embedding = np.transpose(embedding, axes=[1, 2, 3, 0])
        embedding = embedding.reshape(depth * width * height, -1)

        nodes = set()
        edges = []

        for i, component in enumerate(nx.connected_components(candidates)):
            candidates_array = np.zeros((depth, height, width), dtype=bool)
            locs_to_ids = {}
            for node in component:
                attrs = candidates.nodes[node]
                location = attrs["location"]
                voxel_location = tuple(
                    int(x) for x in ((location - offset) // voxel_size)
                )
                locs_to_ids[voxel_location] = node
                candidates_array[voxel_location] = True
            candidates_array = candidates_array.reshape(-1)
            component_embedding = embedding[candidates_array, :]

            logger.info(
                f"processing component {i} with "
                f"{len(component)} candidates"
            )

            component_emst = mlp.emst(component_embedding)["output"]
            
            for u, v, distance in component_emst:
                u = int(u)
                pos_u = component_embedding[u][-3:] / self.coordinate_scale * voxel_size
                u_index = locs_to_ids[
                    tuple(int(np.round(x)) for x in (pos_u / voxel_size))
                ]
                v = int(v)
                pos_v = component_embedding[v][-3:] / self.coordinate_scale * voxel_size
                v_index = locs_to_ids[
                    tuple(int(np.round(x)) for x in (pos_v / voxel_size))
                ]
                nodes.add(Node(u_index, location=pos_u + offset))
                nodes.add(Node(v_index, location=pos_v + offset))
                edges.append(
                    Edge(u_index, v_index, attrs={self.distance_attr: distance})
                )

        graph_spec = request[self.mst]
        graph_spec.directed = False

        logger.info(
            f"candidates has {candidates.number_of_nodes()} nodes and "
            f"{candidates.number_of_edges()} edges and "
            f"{len(list(nx.connected_components(candidates)))} components"
        )

        outputs[self.mst] = Graph(nodes, edges, graph_spec)
        output_graph = outputs[self.mst].to_nx_graph()

        logger.info(
            f"output_graph has {output_graph.number_of_nodes()} nodes and "
            f"{output_graph.number_of_edges()} edges and "
            f"{len(list(nx.connected_components(output_graph)))} components"
        )

        logger.debug(
            f"OUTPUTS CONTAINS MST WITH {len(list(outputs[self.mst].nodes))} NODES"
        )

        return outputs