def test_int_float_dict():
    rng = np.random.RandomState(0)
    keys = np.unique(rng.randint(100, size=10).astype(np.intp))
    values = rng.rand(len(keys))

    d = IntFloatDict(keys, values)
    for key, value in zip(keys, values):
        assert d[key] == value

    other_keys = np.arange(50).astype(np.intp)[::2]
    other_values = 0.5 * np.ones(50)[::2]
    other = IntFloatDict(other_keys, other_values)
    # Complete smoke test
    max_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
    average_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
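# For context: max_merge and average_merge are the Cython helpers exercised
# by the smoke test above. Based on the average_merge docstring quoted later
# in this file (shared keys take the weighted average) and the linkage docs
# (complete linkage takes the maximum), a rough pure-Python model of their
# behavior looks like the following; the *_model names and plain-dict inputs
# are illustrative stand-ins, not the real Cython API.
def max_merge_model(a, b, mask, n_a, n_b):
    # Complete linkage: keep the larger value when a key is in both dicts.
    # n_a and n_b are unused here but kept for a uniform signature.
    out = {}
    for key, value in list(a.items()) + list(b.items()):
        if mask[key]:
            out[key] = max(out.get(key, value), value)
    return out


def average_merge_model(a, b, mask, n_a, n_b):
    # Average linkage: weighted mean of the two values for shared keys.
    out = {k: v for k, v in a.items() if mask[k]}
    for key, value in b.items():
        if mask[key]:
            out[key] = ((n_a * out[key] + n_b * value) / (n_a + n_b)
                        if key in out else value)
    return out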
def test_int_float_dict():
    rng = np.random.RandomState(0)
    keys = np.unique(rng.randint(100, size=10).astype(np.intp))
    values = rng.rand(len(keys))

    d = IntFloatDict(keys, values)
    for key, value in zip(keys, values):
        assert_equal(d[key], value)
    assert_equal(len(d), len(keys))

    d.append(120, 3.)
    assert_equal(d[120], 3.0)
    assert_equal(len(d), len(keys) + 1)
    for i in range(2000):
        d.append(i + 1000, 4.0)
    assert_equal(d[1100], 4.0)
def test():
    logging.info("Starting")
    a = {}
    a = PreshMap()
    # a = judy.JudyIntObjectMap()
    # db = pickledb.load("pickledb.db", False)
    dict_size = 100000000
    keys = np.random.randint(0, 1000000000000, dict_size)
    values = np.random.randint(0, 1000000000, dict_size).astype(np.float64)
    logging.info("Creating")
    a = IntFloatDict(keys, values)
    logging.info("Done")
    return
    # NOTE: everything below is unreachable because of the early return
    # above; it is kept as an alternative benchmark path.
    # a = dict(zip(np.random.randint(0, 100000000000, dict_size),
    #              np.random.randint(0, 100000000000, dict_size)))
    for i in range(0, 10000):
        if i % 1000000 == 0:
            print(i)
        number = randint(0, 10000000000000)
        a[number] = randint(0, 3000000000)
        # db.set(str(number), randint(0, 3000000000))
    # db.dump()
    logging.info("writing to file")
    with open("testdict.pckl", "wb") as f:
        pickle.dump(a, f, protocol=4)
    logging.info("Wrote to file")
def test_int_float_dict():
    rng = np.random.RandomState(0)
    keys = np.unique(rng.randint(100, size=10).astype(np.intp))
    values = rng.rand(len(keys))

    d = IntFloatDict(keys, values)
    for key, value in zip(keys, values):
        assert d[key] == value
    assert len(d) == len(keys)

    d.append(120, 3.)
    assert d[120] == 3.0
    assert len(d) == len(keys) + 1
    for i in range(2000):
        d.append(i + 1000, 4.0)
    assert d[1100] == 4.0
def linkage_tree(X, connectivity=None, n_components=None,
                 n_clusters=None, linkage='complete', affinity="euclidean",
                 return_distance=False):
    """Linkage agglomerative clustering based on a Feature matrix.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Read more in the :ref:`User Guide <hierarchical_clustering>`.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        Feature matrix representing n_samples samples to be clustered.

    connectivity : sparse matrix, optional
        Connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to be
        symmetric and only the upper triangular half is used.
        Default is None, i.e., the hierarchical clustering algorithm is
        unstructured.

    n_components : int, optional
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.
        NOTE: This parameter is now directly determined from the
        connectivity matrix and will be removed in 0.18.

    n_clusters : int, optional
        Stop early the construction of the tree at n_clusters. This is
        useful to decrease computation time if the number of clusters is not
        small compared to the number of samples. In this case, the complete
        tree is not computed, thus the 'children' output is of limited use,
        and the 'parents' output should rather be used. This option is valid
        only when specifying a connectivity matrix.

    linkage : {"average", "complete"}, optional, default: "complete"
        Which linkage criterion to use. The linkage criterion determines
        which distance to use between sets of observations.
        - "average" uses the average of the distances of each observation of
          the two sets.
        - "complete" or maximum linkage uses the maximum distances between
          all observations of the two sets.

    affinity : string or callable, optional, default: "euclidean"
        Which metric to use. Can be "euclidean", "manhattan", or any
        distance known to paired distances (see metric.pairwise).

    return_distance : bool, default False
        Whether or not to return the distances between the clusters.

    Returns
    -------
    children : 2D array, shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf node
        and has children `children_[i - n_samples]`. Alternatively at the
        i-th iteration, children[i][0] and children[i][1] are merged to form
        node `n_samples + i`.

    n_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree.

    parents : 1D array, shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix is
        specified, elsewhere 'None' is returned.

    distances : ndarray, shape (n_nodes-1,)
        Returned when return_distance is set to True. distances[i] refers to
        the distance between children[i][0] and children[i][1] when they are
        merged.
    See also
    --------
    ward_tree : hierarchical clustering with ward linkage
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    linkage_choices = {'complete': _hierarchical.max_merge,
                       'average': _hierarchical.average_merge}
    try:
        join_func = linkage_choices[linkage]
    except KeyError:
        raise ValueError('Unknown linkage option, linkage should be one '
                         'of %s, but %s was given'
                         % (linkage_choices.keys(), linkage))

    if connectivity is None:
        from scipy.cluster import hierarchy  # imports PIL

        if n_clusters is not None:
            warnings.warn('Partial build of the tree is implemented '
                          'only for structured clustering (i.e. with '
                          'explicit connectivity). The algorithm '
                          'will build the full tree and only '
                          'retain the lower branches required '
                          'for the specified number of clusters',
                          stacklevel=2)

        if affinity == 'precomputed':
            # for the linkage function of hierarchy to work on precomputed
            # data, provide as first argument an ndarray of the shape
            # returned by pdist: it is a flat array containing the upper
            # triangular of the distance matrix.
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        elif affinity == 'l2':
            # Translate to something understood by scipy
            affinity = 'euclidean'
        elif affinity in ('l1', 'manhattan'):
            affinity = 'cityblock'
        elif callable(affinity):
            X = affinity(X)
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        out = hierarchy.linkage(X, method=linkage, metric=affinity)
        children_ = out[:, :2].astype(np.intp)

        if return_distance:
            distances = out[:, 2]
            return children_, 1, n_samples, None, distances
        return children_, 1, n_samples, None

    if n_components is not None:
        warnings.warn(
            "n_components is now directly calculated from the connectivity "
            "matrix and will be removed in 0.18",
            DeprecationWarning)
    connectivity, n_components = _fix_connectivity(X, connectivity)

    connectivity = connectivity.tocoo()
    # Put the diagonal to zero
    diag_mask = (connectivity.row != connectivity.col)
    connectivity.row = connectivity.row[diag_mask]
    connectivity.col = connectivity.col[diag_mask]
    connectivity.data = connectivity.data[diag_mask]
    del diag_mask

    if affinity == 'precomputed':
        distances = X[connectivity.row, connectivity.col]
    else:
        # FIXME We compute all the distances, while we could have only
        # computed the "interesting" distances
        distances = paired_distances(X[connectivity.row],
                                     X[connectivity.col],
                                     metric=affinity)
    connectivity.data = distances

    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        assert n_clusters <= n_samples
        n_nodes = 2 * n_samples - n_clusters

    if return_distance:
        distances = np.empty(n_nodes - n_samples)

    # create inertia heap and connection matrix
    A = np.empty(n_nodes, dtype=object)
    inertia = list()

    # LIL seems to be the best format to access the rows quickly,
    # without the numpy overhead of slicing CSR indices and data.
    connectivity = connectivity.tolil()
    # We are storing the graph in a list of IntFloatDict
    for ind, (data, row) in enumerate(zip(connectivity.data,
                                          connectivity.rows)):
        A[ind] = IntFloatDict(np.asarray(row, dtype=np.intp),
                              np.asarray(data, dtype=np.float64))
        # We keep only the upper triangular for the heap
        # Generator expressions are faster than arrays on the following
        inertia.extend(_hierarchical.WeightedEdge(d, ind, r)
                       for r, d in zip(row, data) if r < ind)
    del connectivity

    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=np.intp)
    children = []

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge
        while True:
            edge = heappop(inertia)
            if used_node[edge.a] and used_node[edge.b]:
                break
        i = edge.a
        j = edge.b

        if return_distance:
            # store distances
            distances[k - n_samples] = edge.weight

        parent[i] = parent[j] = k
        children.append((i, j))
        # Keep track of the number of elements per cluster
        n_i = used_node[i]
        n_j = used_node[j]
        used_node[k] = n_i + n_j
        used_node[i] = used_node[j] = False

        # update the structure matrix A and the inertia matrix
        # a clever 'min', or 'max' operation between A[i] and A[j]
        coord_col = join_func(A[i], A[j], used_node, n_i, n_j)
        for l, d in coord_col:
            A[l].append(k, d)
            # Here we use the information from coord_col (containing the
            # distances) to update the heap
            heappush(inertia, _hierarchical.WeightedEdge(d, k, l))
        A[k] = coord_col
        # Clear A[i] and A[j] to save memory
        A[i] = A[j] = 0

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples

    # return numpy array for efficient caching
    children = np.array(children)[:, ::-1]

    if return_distance:
        return children, n_components, n_leaves, parent, distances
    return children, n_components, n_leaves, parent
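# Example usage of the structured branch of linkage_tree above. This is a
# minimal sketch, assuming scikit-learn is available (kneighbors_graph is
# from sklearn.neighbors); it is illustrative, not part of the original
# module.
import numpy as np
from sklearn.neighbors import kneighbors_graph

X_demo = np.random.RandomState(0).rand(20, 3)
conn_demo = kneighbors_graph(X_demo, n_neighbors=5, include_self=False)

children, n_components, n_leaves, parents, distances = linkage_tree(
    X_demo, connectivity=conn_demo, linkage='complete',
    affinity='euclidean', return_distance=True)
print(children.shape)   # (n_samples - 1, 2): one row per merge
print(distances.shape)  # (n_samples - 1,): merge distance per row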
def test_int_float_dict_argmin():
    # Test the argmin implementation on the IntFloatDict
    keys = np.arange(100, dtype=np.intp)
    values = np.arange(100, dtype=np.float64)
    d = IntFloatDict(keys, values)
    assert argmin(d) == (0, 0)
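# A pure-Python model of what the test above expects from argmin: the
# (key, value) pair with the smallest value. Illustrative only; the real
# argmin is a Cython helper, and iterating an IntFloatDict yields
# (key, value) pairs, as the linkage code above relies on.
def argmin_model(d):
    return min(d, key=lambda kv: kv[1])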
def linkage_tree(X, connectivity=None, n_components=None, n_clusters=None,
                 linkage='complete', affinity="euclidean",
                 return_distance=False, max_size=sys.maxsize,
                 means=None, variances=None):
    """Linkage agglomerative clustering based on a Feature matrix.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Read more in the :ref:`User Guide <hierarchical_clustering>`.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        Feature matrix representing n_samples samples to be clustered.

    connectivity : sparse matrix, optional
        Connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to be
        symmetric and only the upper triangular half is used.
        Default is None, i.e., the hierarchical clustering algorithm is
        unstructured.

    n_components : int, optional
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.
        NOTE: This parameter is now directly determined from the
        connectivity matrix and will be removed in 0.18.

    n_clusters : int, optional
        Stop early the construction of the tree at n_clusters. This is
        useful to decrease computation time if the number of clusters is not
        small compared to the number of samples. In this case, the complete
        tree is not computed, thus the 'children' output is of limited use,
        and the 'parents' output should rather be used. This option is valid
        only when specifying a connectivity matrix.

    linkage : {"average", "complete"}, optional, default: "complete"
        Which linkage criterion to use. The linkage criterion determines
        which distance to use between sets of observations.
        - "average" uses the average of the distances of each observation of
          the two sets.
        - "complete" or maximum linkage uses the maximum distances between
          all observations of the two sets.

    affinity : string or callable, optional, default: "euclidean"
        Which metric to use. Can be "euclidean", "manhattan", or any
        distance known to paired distances (see metric.pairwise).

    return_distance : bool, default False
        Whether or not to return the distances between the clusters.
        Ignored in this fork: the distance bookkeeping is commented out
        below.

    max_size : int, optional, default: sys.maxsize
        Maximum allowed cluster size. Candidate merges that would produce a
        cluster larger than max_size are skipped.

    means : ndarray, optional
        Per-cluster mean vectors. When given together with variances, merge
        distances are recomputed as KL divergences between the merged
        cluster and the remaining clusters instead of using the linkage
        merge.

    variances : ndarray, optional
        Per-cluster variance vectors, used together with means.

    Yields
    ------
    A tuple (children, n_components, n_leaves, parents) after every merge
    (this fork converts the original return into a yield).

    children : 2D array, shape (n_merges, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf node
        and has children `children_[i - n_samples]`. Alternatively at the
        i-th iteration, children[i][0] and children[i][1] are merged to form
        node `n_samples + i`.

    n_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree.

    parents : 1D array, shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix is
        specified, elsewhere 'None' is returned.
    See also
    --------
    ward_tree : hierarchical clustering with ward linkage
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    linkage_choices = {'complete': _hierarchical.max_merge,
                       'average': _hierarchical.average_merge}
    try:
        join_func = linkage_choices[linkage]
    except KeyError:
        raise ValueError('Unknown linkage option, linkage should be one '
                         'of %s, but %s was given'
                         % (linkage_choices.keys(), linkage))

    if connectivity is None:
        from scipy.cluster import hierarchy  # imports PIL

        if n_clusters is not None:
            warnings.warn('Partial build of the tree is implemented '
                          'only for structured clustering (i.e. with '
                          'explicit connectivity). The algorithm '
                          'will build the full tree and only '
                          'retain the lower branches required '
                          'for the specified number of clusters',
                          stacklevel=2)

        if affinity == 'precomputed':
            # for the linkage function of hierarchy to work on precomputed
            # data, provide as first argument an ndarray of the shape
            # returned by pdist: it is a flat array containing the upper
            # triangular of the distance matrix.
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        elif affinity == 'l2':
            # Translate to something understood by scipy
            affinity = 'euclidean'
        elif affinity in ('l1', 'manhattan'):
            affinity = 'cityblock'
        elif callable(affinity):
            X = affinity(X)
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        out = hierarchy.linkage(X, method=linkage, metric=affinity)
        children_ = out[:, :2].astype(np.intp)

        # JESSE: removed return_distance handling and converted the return
        # to a yield
        # if return_distance:
        #     distances = out[:, 2]
        #     return children_, 1, n_samples, None, distances
        yield children_, 1, n_samples, None
        # stop here: without this return the generator would fall through
        # into the structured branch with connectivity=None
        return

    if n_components is not None:
        warnings.warn(
            "n_components is now directly calculated from the connectivity "
            "matrix and will be removed in 0.18",
            DeprecationWarning)
    connectivity, n_components = _fix_connectivity(X, connectivity)

    connectivity = connectivity.tocoo()
    # Put the diagonal to zero
    diag_mask = (connectivity.row != connectivity.col)
    connectivity.row = connectivity.row[diag_mask]
    connectivity.col = connectivity.col[diag_mask]
    connectivity.data = connectivity.data[diag_mask]
    del diag_mask

    # if affinity == 'precomputed':
    #     distances = X[connectivity.row, connectivity.col]
    # else:
    #     # FIXME We compute all the distances, while we could have only
    #     # computed the "interesting" distances
    #     distances = paired_distances(X[connectivity.row],
    #                                  X[connectivity.col],
    #                                  metric=affinity)
    # connectivity.data = distances

    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        assert n_clusters <= n_samples
        n_nodes = 2 * n_samples - n_clusters

    # if return_distance:
    #     distances = np.empty(n_nodes - n_samples)

    # create inertia heap and connection matrix
    A = np.empty(n_nodes, dtype=object)
    inertia = list()

    # LIL seems to be the best format to access the rows quickly,
    # without the numpy overhead of slicing CSR indices and data.
    connectivity = connectivity.tolil()
    # We are storing the graph in a list of IntFloatDict
    for ind, (data, row) in enumerate(zip(connectivity.data,
                                          connectivity.rows)):
        A[ind] = IntFloatDict(np.asarray(row, dtype=np.intp),
                              np.asarray(data, dtype=np.float64))
        # We keep only the upper triangular for the heap
        # Generator expressions are faster than arrays on the following
        inertia.extend(_hierarchical.WeightedEdge(d, ind, r)
                       for r, d in zip(row, data) if r < ind)
    del connectivity

    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=np.intp)
    children = []

    # recursive merge loop
    # print("n_samples: " + str(n_samples))  # DEBUG
    # print("n_nodes: " + str(n_nodes))  # DEBUG
    for k in range(n_samples, n_nodes):
        # print("k: " + str(k))  # DEBUG
        # Jesse: get labels in order to count cluster sizes
        labels = _hierarchical.hc_get_heads(parent, copy=False)
        labels = np.asarray(labels)
        # identify the merge
        while True:
            edge = heappop(inertia)
            if used_node[edge.a] and used_node[edge.b]:
                # Jesse: cancel the merge if it would put us over the max
                # cluster size
                if (len(np.where(labels == labels[edge.a])[0])
                        + len(np.where(labels == labels[edge.b])[0])
                        > max_size):
                    continue
                break
        i = edge.a
        j = edge.b
        print("k: " + str(k) + ", merging (" + str(i) + ", " + str(j) +
              ") with dist " + str(edge.weight))  # DEBUG

        # if return_distance:
        #     # store distances
        #     distances[k - n_samples] = edge.weight

        parent[i] = parent[j] = k
        children.append((i, j))
        # Keep track of the number of elements per cluster
        n_i = used_node[i]
        n_j = used_node[j]
        used_node[k] = n_i + n_j
        used_node[i] = used_node[j] = False

        # update the structure matrix A and the inertia matrix
        # a clever 'min', or 'max' operation between A[i] and A[j]
        # Jesse: adapted from the Cython documentation of this function:
        """Merge two IntFloatDicts with the average strategy: when the same
        key is present in the two dicts, the weighted average of the two
        values is used.

        Parameters
        ==========
        A[i], A[j] : IntFloatDict object
            The IntFloatDicts to merge

        used_node : ndarray array of dtype integer and of dimension 1
            a mask for keys to ignore: if not used_node[key] the
            corresponding key is skipped in the output dictionary

        n_i, n_j : float
            n_i and n_j are weights for i and j for the merge strategy.
            They are used for a weighted mean.
        Returns
        =======
        out : IntFloatDict object
            The IntFloatDict resulting from the merge
        """
        if variances is None:
            coord_col = join_func(A[i], A[j], used_node, n_i, n_j)
        # Jesse: in order to implement KL-divergence-based clustering, we
        # need to override this function with one that computes new
        # distances based on the merged clusters' new mean and variance
        # vectors; no average/maximum weighting is necessary since average
        # mean vectors and average variance vectors are instead used to
        # calculate the new O(n) distances to the remaining cluster
        # centroids
        else:
            # Jesse: update the means and variances structures with the
            # merge
            means = np.vstack([means,
                               np.add(means[i] * n_i, means[j] * n_j)
                               / (n_i + n_j)])
            variances = np.vstack([variances,
                                   np.add(variances[i] * n_i,
                                          variances[j] * n_j)
                                   / (n_i + n_j)])
            # Jesse: calculate the O(n) KL distances between the merged
            # cluster and previous clusters
            # Jesse: testing alternative where all distances are
            # re-calculated on narsil-1 tmux 2
            coord_col = {}
            # need to calculate distances for every key; nothing is
            # preserved in the KL merge
            for key, dist in A[i]:
                if used_node[key]:
                    coord_col[key] = dist
            for key, dist in A[j]:
                if used_node[key]:
                    if key not in coord_col:
                        coord_col[key] = dist
                    else:
                        # compute the new KL distance between entries
                        kld = multivariate_kl_distance(
                            means[k], variances[k],
                            means[key], variances[key])
                        if not np.isfinite(kld):
                            kld = float(sys.maxsize)
                        coord_col[key] = kld
            coord_col = IntFloatDict(
                np.asarray(list(coord_col.keys()), dtype=np.intp),
                np.asarray(list(coord_col.values()), dtype=np.float64))

        for l, d in coord_col:
            A[l].append(k, d)
            # Here we use the information from coord_col (containing the
            # distances) to update the heap
            # print("pushing " + str(k) + ", " + str(l) + " onto heap "
            #       + "with distance " + str(d))  # DEBUG
            heappush(inertia, _hierarchical.WeightedEdge(d, k, l))
        A[k] = coord_col
        # Clear A[i] and A[j] to save memory
        A[i] = A[j] = 0

        # Jesse: yield after every merge
        n_leaves = n_samples
        r_children = np.array(children)[:, ::-1]
        yield r_children, n_components, n_leaves, parent
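# The fork above calls multivariate_kl_distance, which is not defined in
# this file. A plausible stand-in, assuming diagonal-covariance Gaussians
# and a symmetrised KL divergence (the name and exact formula are
# assumptions, not necessarily the fork's actual helper):
import sys
import numpy as np


def multivariate_kl_distance(mean_a, var_a, mean_b, var_b):
    # Symmetrised KL divergence between N(mean_a, diag(var_a)) and
    # N(mean_b, diag(var_b)); clamps variances to avoid division by zero.
    var_a = np.maximum(np.asarray(var_a, dtype=np.float64), 1e-12)
    var_b = np.maximum(np.asarray(var_b, dtype=np.float64), 1e-12)
    diff2 = (np.asarray(mean_a) - np.asarray(mean_b)) ** 2
    kl_ab = 0.5 * np.sum(np.log(var_b / var_a) + (var_a + diff2) / var_b - 1.0)
    kl_ba = 0.5 * np.sum(np.log(var_a / var_b) + (var_b + diff2) / var_a - 1.0)
    return kl_ab + kl_ba


# Because the fork yields after every merge, callers iterate rather than
# unpack a single return value; initial_means and initial_variances below
# are hypothetical (n_samples, n_features) arrays of per-sample statistics.
# for children, n_components, n_leaves, parents in linkage_tree(
#         X, connectivity=connectivity, max_size=50,
#         means=initial_means, variances=initial_variances):
#     pass  # inspect the partial dendrogram after each merge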