import time
import warnings
from heapq import heapify, heappop, heappush

import numpy as np

# Import paths assume a scikit-learn 0.21-era layout; newer releases moved
# these modules to sklearn.cluster._hierarchical_fast,
# sklearn.cluster._agglomerative and sklearn.utils._fast_dict.
from sklearn.cluster import _hierarchical
from sklearn.cluster._hierarchical import average_merge, max_merge
from sklearn.cluster.hierarchical import _fix_connectivity, _single_linkage_tree
from sklearn.metrics.pairwise import paired_distances
from sklearn.utils.fast_dict import IntFloatDict, argmin
from sklearn.utils.fixes import _astype_copy_false


def test_int_float_dict():
    rng = np.random.RandomState(0)
    keys = np.unique(rng.randint(100, size=10).astype(np.intp, copy=False))
    values = rng.rand(len(keys))

    d = IntFloatDict(keys, values)
    for key, value in zip(keys, values):
        assert d[key] == value

    other_keys = np.arange(50, dtype=np.intp)[::2]
    other_values = np.full(50, 0.5)[::2]
    other = IntFloatDict(other_keys, other_values)
    # Complete smoke test
    max_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
    average_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
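
# A minimal sketch of what the smoke test above exercises, assuming the
# scikit-learn Cython semantics of max_merge/average_merge: both merge two
# IntFloatDicts key-wise, keeping only keys where `mask` is non-zero. For a
# key present in both dicts, max_merge stores the larger value (complete
# linkage) while average_merge stores the size-weighted mean, with n_a and
# n_b acting as cluster sizes. The helper name below is hypothetical.
def _merge_semantics_demo():
    a = IntFloatDict(np.array([0, 2], dtype=np.intp),
                     np.array([1.0, 4.0], dtype=np.float64))
    b = IntFloatDict(np.array([2, 4], dtype=np.intp),
                     np.array([2.0, 8.0], dtype=np.float64))
    mask = np.ones(10, dtype=np.intp)

    merged_max = max_merge(a, b, mask=mask, n_a=1, n_b=3)
    assert merged_max[2] == 4.0  # max(4.0, 2.0) for the shared key

    merged_avg = average_merge(a, b, mask=mask, n_a=1, n_b=3)
    assert merged_avg[2] == (1 * 4.0 + 3 * 2.0) / 4  # weighted mean = 2.5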
def _init_inertia_from_scratch(self, global_bbox_id_list):
    pdist = self._computer_pdist(global_bbox_id_list, with_st_const=False)

    n_samples = len(global_bbox_id_list)  # batch size
    n_nodes = 2 * n_samples - 1
    # np.empty returns a new array of the given shape without initializing
    # its entries.
    A = np.empty(n_nodes, dtype=object)
    inertia = list()

    print('Constructing cannot-link constraints for detections in the '
          'same image ...')
    mask = self.estimate_same_image_mask(global_bbox_id_list)

    start_time = time.time()
    for i in range(n_samples):
        row = np.where(mask[i, :] != 0)[0]  # np.where returns a tuple
        data = pdist[i, row]
        # fast dict
        A[i] = IntFloatDict(np.array(row, dtype=np.intp),
                            np.array(data, dtype=np.float64))
        # We keep only the upper triangular for the heap.
        # Generator expressions are faster than arrays on the following.
        inertia.extend(_hierarchical.WeightedEdge(d, i, r)
                       for r, d in zip(row, data) if r > i)
    # heapify transforms the list into a heap in-place, in linear time.
    heapify(inertia)
    print("Initializing A and inertia done in %s seconds ..."
          % (time.time() - start_time))
    print(f'len(inertia) = {len(inertia)} ...')
    return inertia, A, pdist
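
# Hedged sketch of the heap pattern used above and in linkage_tree below:
# _hierarchical.WeightedEdge(weight, a, b) exposes public a, b and weight
# attributes (the merge loop reads edge.a / edge.b / edge.weight) and
# compares on weight, so heapify/heappop always yield the cheapest edge
# first. The demo function name is hypothetical.
def _weighted_edge_heap_demo():
    edges = [_hierarchical.WeightedEdge(w, a, b)
             for w, a, b in [(0.9, 0, 1), (0.1, 1, 2), (0.5, 0, 2)]]
    heapify(edges)          # linear-time, in-place heap construction
    first = heappop(edges)  # the edge with the smallest weight
    assert (first.a, first.b, first.weight) == (1, 2, 0.1)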
# Append/growth behaviour of IntFloatDict (distinct from the merge smoke
# test above, which would otherwise be shadowed by a same-named function).
def test_int_float_dict_append():
    rng = np.random.RandomState(0)
    keys = np.unique(rng.randint(100, size=10).astype(np.intp))
    values = rng.rand(len(keys))

    d = IntFloatDict(keys, values)
    for key, value in zip(keys, values):
        assert d[key] == value
    assert len(d) == len(keys)

    d.append(120, 3.)
    assert d[120] == 3.0
    assert len(d) == len(keys) + 1
    for i in range(2000):
        d.append(i + 1000, 4.0)
    assert d[1100] == 4.0
def test_int_float_dict_argmin():
    # Test the argmin implementation on the IntFloatDict
    keys = np.arange(100, dtype=np.intp)
    values = np.arange(100, dtype=np.float64)
    d = IntFloatDict(keys, values)
    assert argmin(d) == (0, 0)
def _init_inertia_from_clusters(self, parent, children, used_node, pdist,
                                clusters, linkage):
    n_samples = pdist.shape[0]  # batch size
    n_nodes = 2 * n_samples - 1
    # np.empty returns a new array of the given shape without initializing
    # its entries.
    A = np.empty(n_nodes, dtype=object)
    inertia = list()

    start_time = time.time()
    labels = self.get_complete_node_list(parent=parent, n_samples=n_samples,
                                         used_node=used_node)
    num_clusters = len(labels)  # for this batch
    # For each cluster label, accumulate [neighbor_label, distance] pairs.
    cluster_pdist_dict = {x: [] for x in labels}
    for k in range(num_clusters - 1):
        n_k = labels[k]
        rows = self.get_leaves(children=children, n_leaves=n_samples,
                               node_id=n_k)
        for l in range(k + 1, num_clusters):
            n_l = labels[l]
            combined_cluster = clusters[n_k] + clusters[n_l]
            # TODO: do we need to check the image id here?
            image_ids = [x[0] for x in combined_cluster]
            # Detections xi and xj in the same image may be two isolated
            # nodes, so two clusters whose image ids overlap are skipped.
            if not self.is_unique_list(image_ids):
                continue
            cols = self.get_leaves(children=children, n_leaves=n_samples,
                                   node_id=n_l)
            assert len(clusters[n_k]) > 0 and len(clusters[n_l]) > 0
            dist = AhcMetric.cluster_dist_from_pdist_rows_cols(
                pdist=pdist, rows=rows, cols=cols, linkage=linkage)
            # Cosine similarity lies in [-1, 1], so the distance
            # 1 - similarity lies in [0, 2].
            assert dist <= 2.0
            cluster_pdist_dict[n_k].append([n_l, dist])
            cluster_pdist_dict[n_l].append([n_k, dist])
            # We keep only the upper triangular for the heap.
            if n_l < n_k:
                inertia.append(_hierarchical.WeightedEdge(dist, n_l, n_k))
            else:
                inertia.append(_hierarchical.WeightedEdge(dist, n_k, n_l))

    for i in labels:
        if len(cluster_pdist_dict[i]) == 0:
            A[i] = 0
        else:
            # Index 0 holds the neighbor label, index 1 the distance.
            A[i] = IntFloatDict(
                np.array([x[0] for x in cluster_pdist_dict[i]],
                         dtype=np.intp),
                np.array([x[1] for x in cluster_pdist_dict[i]],
                         dtype=np.float64))  # fast dict

    heapify(inertia)
    del cluster_pdist_dict, labels
    print("Initializing A and inertia done in %s seconds ..."
          % (time.time() - start_time))
    return inertia, A
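
# AhcMetric.cluster_dist_from_pdist_rows_cols is defined outside this file.
# A plausible reading, given the linkage choices used throughout, is a
# reduction over the pdist sub-block spanned by the two clusters' leaves.
# The function below is a hypothetical NumPy equivalent, for illustration
# only, not the project's actual implementation.
def _cluster_dist_sketch(pdist, rows, cols, linkage):
    # Pairwise distances between every leaf of one cluster and the other.
    block = pdist[np.ix_(rows, cols)]
    if linkage == 'complete':
        return block.max()   # maximum (complete) linkage
    if linkage == 'average':
        return block.mean()  # UPGMA-style average linkage
    return block.min()       # single linkage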
def linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete',
                 affinity="euclidean", return_distance=False):
    """Linkage agglomerative clustering based on a Feature matrix.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Read more in the :ref:`User Guide <hierarchical_clustering>`.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        Feature matrix representing n_samples samples to be clustered.

    connectivity : sparse matrix, optional
        Connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to be
        symmetric and only the upper triangular half is used. Default is
        None, i.e. the Ward algorithm is unstructured.

    n_clusters : int, optional
        Stop early the construction of the tree at n_clusters. This is
        useful to decrease computation time if the number of clusters is not
        small compared to the number of samples. In this case, the complete
        tree is not computed, thus the 'children' output is of limited use,
        and the 'parents' output should rather be used. This option is valid
        only when specifying a connectivity matrix.

    linkage : {"average", "complete", "single"}, optional, default: "complete"
        Which linkage criterion to use. The linkage criterion determines
        which distance to use between sets of observations.
        - "average" uses the average of the distances of each observation
          of the two sets.
        - "complete" or maximum linkage uses the maximum of the distances
          between all observations of the two sets.
        - "single" uses the minimum of the distances between all
          observations of the two sets.

    affinity : string or callable, optional, default: "euclidean"
        Which metric to use. Can be "euclidean", "manhattan", or any
        distance known to paired_distances (see metrics.pairwise).

    return_distance : bool, default False
        Whether or not to return the distances between the clusters.

    Returns
    -------
    children : 2D array, shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf node
        and has children `children_[i - n_samples]`. Alternatively at the
        i-th iteration, children[i][0] and children[i][1] are merged to
        form node `n_samples + i`.

    n_connected_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree.

    parents : 1D array, shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, otherwise 'None' is returned.

    distances : ndarray, shape (n_nodes-1,)
        Returned when return_distance is set to True. distances[i] refers
        to the distance between children[i][0] and children[i][1] when they
        are merged.
    See also
    --------
    ward_tree : hierarchical clustering with ward linkage
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    linkage_choices = {'complete': _hierarchical.max_merge,
                       'average': _hierarchical.average_merge,
                       'single': None}  # Single linkage is handled differently
    try:
        join_func = linkage_choices[linkage]
    except KeyError:
        raise ValueError('Unknown linkage option, linkage should be one '
                         'of %s, but %s was given'
                         % (linkage_choices.keys(), linkage))

    if affinity == 'cosine' and np.any(~np.any(X, axis=1)):
        raise ValueError(
            'Cosine affinity cannot be used when X contains zero vectors')

    if connectivity is None:
        from scipy.cluster import hierarchy  # imports PIL

        if n_clusters is not None:
            warnings.warn('Partial build of the tree is implemented '
                          'only for structured clustering (i.e. with '
                          'explicit connectivity). The algorithm '
                          'will build the full tree and only '
                          'retain the lower branches required '
                          'for the specified number of clusters',
                          stacklevel=2)

        if affinity == 'precomputed':
            # for the linkage function of hierarchy to work on precomputed
            # data, provide as first argument an ndarray of the shape
            # returned by pdist: it is a flat array containing the upper
            # triangular of the distance matrix.
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        elif affinity == 'l2':
            # Translate to something understood by scipy
            affinity = 'euclidean'
        elif affinity in ('l1', 'manhattan'):
            affinity = 'cityblock'
        elif callable(affinity):
            X = affinity(X)
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        out = hierarchy.linkage(X, method=linkage, metric=affinity)
        children_ = out[:, :2].astype(np.intp, copy=False)

        if return_distance:
            distances = out[:, 2]
            return children_, 1, n_samples, None, distances
        return children_, 1, n_samples, None

    connectivity, n_connected_components = _fix_connectivity(
        X, connectivity, affinity=affinity)
    connectivity = connectivity.tocoo()
    # Put the diagonal to zero
    diag_mask = (connectivity.row != connectivity.col)
    connectivity.row = connectivity.row[diag_mask]
    connectivity.col = connectivity.col[diag_mask]
    connectivity.data = connectivity.data[diag_mask]
    del diag_mask
    if affinity == 'precomputed':
        distances = X[connectivity.row, connectivity.col].astype(
            'float64', **_astype_copy_false(X))
    else:
        # FIXME We compute all the distances, while we could have only
        # computed the "interesting" distances
        distances = paired_distances(X[connectivity.row],
                                     X[connectivity.col],
                                     metric=affinity)
    connectivity.data = distances

    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        assert n_clusters <= n_samples
        n_nodes = 2 * n_samples - n_clusters

    if linkage == 'single':
        return _single_linkage_tree(connectivity, n_samples, n_nodes,
                                    n_clusters, n_connected_components,
                                    return_distance)

    if return_distance:
        distances = np.empty(n_nodes - n_samples)
    # create inertia heap and connection matrix
    A = np.empty(n_nodes, dtype=object)
    inertia = list()

    # LIL seems to be the best format to access the rows quickly,
    # without the numpy overhead of slicing CSR indices and data.
    connectivity = connectivity.tolil()
    # We are storing the graph in a list of IntFloatDict
    for ind, (data, row) in enumerate(zip(connectivity.data,
                                          connectivity.rows)):
        A[ind] = IntFloatDict(np.asarray(row, dtype=np.intp),
                              np.asarray(data, dtype=np.float64))
        # We keep only the upper triangular for the heap
        # Generator expressions are faster than arrays on the following
        inertia.extend(_hierarchical.WeightedEdge(d, ind, r)
                       for r, d in zip(row, data) if r < ind)
    del connectivity

    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=np.intp)
    children = []

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge
        while True:
            edge = heappop(inertia)
            if used_node[edge.a] and used_node[edge.b]:
                break
        i = edge.a
        j = edge.b

        if return_distance:
            # store distances
            distances[k - n_samples] = edge.weight

        parent[i] = parent[j] = k
        children.append((i, j))
        # Keep track of the number of elements per cluster
        n_i = used_node[i]
        n_j = used_node[j]
        used_node[k] = n_i + n_j
        used_node[i] = used_node[j] = False

        # update the structure matrix A and the inertia matrix
        # a clever 'min', or 'max' operation between A[i] and A[j]
        coord_col = join_func(A[i], A[j], used_node, n_i, n_j)
        for l, d in coord_col:
            A[l].append(k, d)
            # Here we use the information from coord_col (containing the
            # distances) to update the heap
            heappush(inertia, _hierarchical.WeightedEdge(d, k, l))
        A[k] = coord_col
        # Clear A[i] and A[j] to save memory
        A[i] = A[j] = 0

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples

    # return numpy array for efficient caching
    children = np.array(children)[:, ::-1]

    if return_distance:
        return children, n_connected_components, n_leaves, parent, distances
    return children, n_connected_components, n_leaves, parent
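
# Minimal usage sketch for linkage_tree, assuming the scikit-learn helpers
# imported at the top of this file are available. A k-NN graph makes the
# clustering "structured" (the merge loop above is used instead of scipy);
# with return_distance=True the per-merge distances come back as well.
if __name__ == '__main__':
    from sklearn.neighbors import kneighbors_graph

    X_demo = np.random.RandomState(0).rand(20, 3)
    conn = kneighbors_graph(X_demo, n_neighbors=5, include_self=False)
    children, n_cc, n_leaves, parents, dists = linkage_tree(
        X_demo, connectivity=conn, linkage='average', return_distance=True)
    print(children.shape)  # (n_nodes - n_samples, 2) merged pairs
    print(dists[:3])       # distances at the first three merges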