def test_int_float_dict():
    rng = np.random.RandomState(0)
    keys = np.unique(rng.randint(100, size=10).astype(np.intp))
    values = rng.rand(len(keys))

    d = IntFloatDict(keys, values)
    for key, value in zip(keys, values):
        assert d[key] == value

    other_keys = np.arange(50).astype(np.intp)[::2]
    other_values = 0.5 * np.ones(50)[::2]
    other = IntFloatDict(other_keys, other_values)
    # Complete smoke test
    max_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
    average_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
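A minimal follow-on sketch of what the two merge functions produce. The import paths are assumptions (IntFloatDict and the merge helpers have moved between scikit-learn versions), and the expected output relies on max_merge keeping the larger value for keys present in both dicts.

import numpy as np
from sklearn.utils._fast_dict import IntFloatDict          # assumed path
from sklearn.cluster._hierarchical_fast import max_merge   # assumed path

a = IntFloatDict(np.array([1, 2, 5], dtype=np.intp),
                 np.array([0.1, 0.2, 0.5]))
b = IntFloatDict(np.array([2, 3], dtype=np.intp),
                 np.array([0.9, 0.3]))
mask = np.ones(10, dtype=np.intp)   # keep every key

merged = max_merge(a, b, mask=mask, n_a=1, n_b=1)
for key, value in merged:           # iteration yields (key, value) pairs
    print(key, value)               # the shared key 2 should keep 0.9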
def test_int_float_dict():
    rng = np.random.RandomState(0)
    keys = np.unique(rng.randint(100, size=10).astype(np.intp))
    values = rng.rand(len(keys))

    d = IntFloatDict(keys, values)
    for key, value in zip(keys, values):
        assert_equal(d[key], value)
    assert_equal(len(d), len(keys))

    d.append(120, 3.)
    assert_equal(d[120], 3.0)
    assert_equal(len(d), len(keys) + 1)
    for i in range(2000):
        d.append(i + 1000, 4.0)
    assert_equal(d[1100], 4.0)
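A small sketch of dumping the dict back into plain numpy arrays; it assumes iteration yields (key, value) pairs for appended keys too, and the import path is an assumption.

import numpy as np
from sklearn.utils._fast_dict import IntFloatDict  # assumed path

d = IntFloatDict(np.array([3, 7], dtype=np.intp), np.array([0.3, 0.7]))
d.append(120, 3.0)

keys = np.array([k for k, _ in d], dtype=np.intp)
values = np.array([v for _, v in d], dtype=np.float64)
print(keys, values)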
Example #3
def test():
    logging.info("Starting")
    a = {}
    a = PreshMap()
    #a = judy.JudyIntObjectMap()
    #db = pickledb.load("pickledb.db", False)

    dict_size = 100000000

    keys = np.random.randint(0, 1000000000000, dict_size)
    values = np.random.randint(0, 1000000000, dict_size).astype(np.float64)
    logging.info("Creating")
    a = IntFloatDict(keys, values)
    logging.info("Done")
    return

    #a = dict(zip(np.random.randint(0, 100000000000, dict_size), np.random.randint(0, 100000000000, dict_size)))
    for i in range(0, 10000):
        if i % 1000000 == 0:
            print(i)

        number = randint(0, 10000000000000)

        a[number] = randint(0, 3000000000)
        #db.set(str(number), randint(0, 3000000000))

    #db.dump()
    logging.info("writing to file")
    with open("testdict.pckl", "wb") as f:
        pickle.dump(a, f, protocol=4)
    logging.info("Wrote to file")
def test_int_float_dict():
    rng = np.random.RandomState(0)
    keys = np.unique(rng.randint(100, size=10).astype(np.intp))
    values = rng.rand(len(keys))

    d = IntFloatDict(keys, values)
    for key, value in zip(keys, values):
        assert d[key] == value
    assert len(d) == len(keys)

    d.append(120, 3.)
    assert d[120] == 3.0
    assert len(d) == len(keys) + 1
    for i in range(2000):
        d.append(i + 1000, 4.0)
    assert d[1100] == 4.0
Example #6
def linkage_tree(X,
                 connectivity=None,
                 n_components=None,
                 n_clusters=None,
                 linkage='complete',
                 affinity="euclidean",
                 return_distance=False):
    """Linkage agglomerative clustering based on a Feature matrix.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, which takes into account some topological
    structure between samples.

    Read more in the :ref:`User Guide <hierarchical_clustering>`.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        feature matrix representing n_samples samples to be clustered

    connectivity : sparse matrix (optional).
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e., the hierarchical clustering algorithm is
        unstructured.

    n_components : int (optional)
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.
        NOTE: This parameter is now directly determined from the connectivity
        matrix and will be removed in 0.18.

    n_clusters : int (optional)
        Stop early the construction of the tree at n_clusters. This is
        useful to decrease computation time if the number of clusters is
        not small compared to the number of samples. In this case, the
        complete tree is not computed, thus the 'children' output is of
        limited use, and the 'parents' output should rather be used.
        This option is valid only when specifying a connectivity matrix.

    linkage : {"average", "complete"}, optional, default: "complete"
        Which linkage criterion to use. The linkage criterion determines which
        distance to use between sets of observations.
            - average uses the average of the distances of each observation of
              the two sets
            - complete or maximum linkage uses the maximum distances between
              all observations of the two sets.

    affinity : string or callable, optional, default: "euclidean".
        Which metric to use. Can be "euclidean", "manhattan", or any
        distance known to paired_distances (see metrics.pairwise).

    return_distance : bool, default False
        Whether or not to return the distances between the clusters.

    Returns
    -------
    children : 2D array, shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`

    n_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree.

    parents : 1D array, shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, otherwise 'None' is returned.

    distances : ndarray, shape (n_nodes-1,)
        Returned when return_distance is set to True.

        distances[i] refers to the distance between children[i][0] and
        children[i][1] when they are merged.

    See also
    --------
    ward_tree : hierarchical clustering with ward linkage
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    linkage_choices = {
        'complete': _hierarchical.max_merge,
        'average': _hierarchical.average_merge
    }
    try:
        join_func = linkage_choices[linkage]
    except KeyError:
        raise ValueError('Unknown linkage option, linkage should be one '
                         'of %s, but %s was given' %
                         (linkage_choices.keys(), linkage))

    if connectivity is None:
        from scipy.cluster import hierarchy  # imports PIL

        if n_clusters is not None:
            warnings.warn(
                'Partial build of the tree is implemented '
                'only for structured clustering (i.e. with '
                'explicit connectivity). The algorithm '
                'will build the full tree and only '
                'retain the lower branches required '
                'for the specified number of clusters',
                stacklevel=2)

        if affinity == 'precomputed':
            # for the linkage function of hierarchy to work on precomputed
            # data, provide as first argument an ndarray of the shape returned
            # by pdist: it is a flat array containing the upper triangular of
            # the distance matrix.
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        elif affinity == 'l2':
            # Translate to something understood by scipy
            affinity = 'euclidean'
        elif affinity in ('l1', 'manhattan'):
            affinity = 'cityblock'
        elif callable(affinity):
            X = affinity(X)
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        out = hierarchy.linkage(X, method=linkage, metric=affinity)
        children_ = out[:, :2].astype(np.intp)

        if return_distance:
            distances = out[:, 2]
            return children_, 1, n_samples, None, distances
        return children_, 1, n_samples, None

    if n_components is not None:
        warnings.warn(
            "n_components is now directly calculated from the connectivity "
            "matrix and will be removed in 0.18", DeprecationWarning)
    connectivity, n_components = _fix_connectivity(X, connectivity)

    connectivity = connectivity.tocoo()
    # Put the diagonal to zero
    diag_mask = (connectivity.row != connectivity.col)
    connectivity.row = connectivity.row[diag_mask]
    connectivity.col = connectivity.col[diag_mask]
    connectivity.data = connectivity.data[diag_mask]
    del diag_mask

    if affinity == 'precomputed':
        distances = X[connectivity.row, connectivity.col]
    else:
        # FIXME We compute all the distances, while we could have only computed
        # the "interesting" distances
        distances = paired_distances(X[connectivity.row],
                                     X[connectivity.col],
                                     metric=affinity)
    connectivity.data = distances

    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        assert n_clusters <= n_samples
        n_nodes = 2 * n_samples - n_clusters

    if return_distance:
        distances = np.empty(n_nodes - n_samples)
    # create inertia heap and connection matrix
    A = np.empty(n_nodes, dtype=object)
    inertia = list()

    # LIL seems to be the best format to access the rows quickly,
    # without the numpy overhead of slicing CSR indices and data.
    connectivity = connectivity.tolil()
    # We are storing the graph in a list of IntFloatDict
    for ind, (data, row) in enumerate(zip(connectivity.data,
                                          connectivity.rows)):
        A[ind] = IntFloatDict(np.asarray(row, dtype=np.intp),
                              np.asarray(data, dtype=np.float64))
        # We keep only the upper triangular for the heap
        # Generator expressions are faster than arrays on the following
        inertia.extend(
            _hierarchical.WeightedEdge(d, ind, r) for r, d in zip(row, data)
            if r < ind)
    del connectivity

    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=np.intp)
    children = []

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge
        while True:
            edge = heappop(inertia)
            if used_node[edge.a] and used_node[edge.b]:
                break
        i = edge.a
        j = edge.b

        if return_distance:
            # store distances
            distances[k - n_samples] = edge.weight

        parent[i] = parent[j] = k
        children.append((i, j))
        # Keep track of the number of elements per cluster
        n_i = used_node[i]
        n_j = used_node[j]
        used_node[k] = n_i + n_j
        used_node[i] = used_node[j] = False

        # update the structure matrix A and the inertia matrix
        # a clever 'min', or 'max' operation between A[i] and A[j]
        coord_col = join_func(A[i], A[j], used_node, n_i, n_j)
        for l, d in coord_col:
            A[l].append(k, d)
            # Here we use the information from coord_col (containing the
            # distances) to update the heap
            heappush(inertia, _hierarchical.WeightedEdge(d, k, l))
        A[k] = coord_col
        # Clear A[i] and A[j] to save memory
        A[i] = A[j] = 0

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples

    # return numpy array for efficient caching
    children = np.array(children)[:, ::-1]

    if return_distance:
        return children, n_components, n_leaves, parent, distances
    return children, n_components, n_leaves, parent
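A short usage sketch for the structured path of linkage_tree above, assuming the function and its module dependencies are importable; the blob data and k-nearest-neighbors connectivity are illustrative choices, not part of the original snippet.

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.neighbors import kneighbors_graph

X, _ = make_blobs(n_samples=30, centers=3, random_state=0)
connectivity = kneighbors_graph(X, n_neighbors=5, include_self=False)

children, n_components, n_leaves, parents, distances = linkage_tree(
    X, connectivity=connectivity, linkage='average', return_distance=True)

print(children.shape)   # (n_samples - 1, 2) when n_clusters is None
print(distances[:5])    # merge distances, in merge order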
def test_int_float_dict_argmin():
    # Test the argmin implementation on the IntFloatDict
    keys = np.arange(100, dtype=np.intp)
    values = np.arange(100, dtype=np.float64)
    d = IntFloatDict(keys, values)
    assert argmin(d) == (0, 0)
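A tiny follow-on sketch for argmin: it returns the (key, value) pair with the smallest value. The import path is an assumption.

import numpy as np
from sklearn.utils._fast_dict import IntFloatDict, argmin  # assumed path

d = IntFloatDict(np.array([4, 9, 2], dtype=np.intp),
                 np.array([0.5, 0.1, 0.7]))
assert argmin(d) == (9, 0.1)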
Example #8
def linkage_tree(X, connectivity=None, n_components=None,
                 n_clusters=None, linkage='complete', affinity="euclidean",
                 return_distance=False, max_size=sys.maxsize,
                 means=None, variances=None):
    """Linkage agglomerative clustering based on a Feature matrix.
    The inertia matrix uses a Heapq-based representation.
    This is the structured version, which takes into account some topological
    structure between samples.
    Read more in the :ref:`User Guide <hierarchical_clustering>`.
    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        feature matrix representing n_samples samples to be clustered
    connectivity : sparse matrix (optional).
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e., the hierarchical clustering algorithm is
        unstructured.
    n_components : int (optional)
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.
        NOTE: This parameter is now directly determined from the connectivity
        matrix and will be removed in 0.18.
    n_clusters : int (optional)
        Stop early the construction of the tree at n_clusters. This is
        useful to decrease computation time if the number of clusters is
        not small compared to the number of samples. In this case, the
        complete tree is not computed, thus the 'children' output is of
        limited use, and the 'parents' output should rather be used.
        This option is valid only when specifying a connectivity matrix.
    linkage : {"average", "complete"}, optional, default: "complete"
        Which linkage criterion to use. The linkage criterion determines which
        distance to use between sets of observations.
            - average uses the average of the distances of each observation of
              the two sets
            - complete or maximum linkage uses the maximum distances between
              all observations of the two sets.
    affinity : string or callable, optional, default: "euclidean".
        Which metric to use. Can be "euclidean", "manhattan", or any
        distance known to paired_distances (see metrics.pairwise).
    return_distance : bool, default False
        Whether or not to return the distances between the clusters.
    max_size : int, optional, default: sys.maxsize
        Maximum allowed size of a merged cluster. Candidate merges whose
        combined cluster size would exceed this value are skipped.
    means : 2D array, shape (n_samples, n_features), optional
        Per-sample mean vectors, used together with `variances` for
        KL-divergence-based merging.
    variances : 2D array, shape (n_samples, n_features), optional
        Per-sample variance vectors. When given, merge distances are
        recomputed with a KL divergence between cluster Gaussians instead of
        the plain average/maximum linkage update.
    Returns
    -------
    children : 2D array, shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`
    n_components : int
        The number of connected components in the graph.
    n_leaves : int
        The number of leaves in the tree.
    parents : 1D array, shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, otherwise 'None' is returned.
    distances : ndarray, shape (n_nodes-1,)
        Returned when return_distance is set to True. distances[i] refers to
        the distance between children[i][0] and children[i][1] when they are
        merged. Note that in this modified version the distance bookkeeping is
        commented out and the function is instead a generator that yields
        (children, n_components, n_leaves, parents) after every merge.
    See also
    --------
    ward_tree : hierarchical clustering with ward linkage
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    linkage_choices = {'complete': _hierarchical.max_merge,
                       'average': _hierarchical.average_merge}
    try:
        join_func = linkage_choices[linkage]
    except KeyError:
        raise ValueError(
            'Unknown linkage option, linkage should be one '
            'of %s, but %s was given' % (linkage_choices.keys(), linkage))

    if connectivity is None:
        from scipy.cluster import hierarchy     # imports PIL

        if n_clusters is not None:
            warnings.warn('Partial build of the tree is implemented '
                          'only for structured clustering (i.e. with '
                          'explicit connectivity). The algorithm '
                          'will build the full tree and only '
                          'retain the lower branches required '
                          'for the specified number of clusters',
                          stacklevel=2)

        if affinity == 'precomputed':
            # for the linkage function of hierarchy to work on precomputed
            # data, provide as first argument an ndarray of the shape returned
            # by pdist: it is a flat array containing the upper triangular of
            # the distance matrix.
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        elif affinity == 'l2':
            # Translate to something understood by scipy
            affinity = 'euclidean'
        elif affinity in ('l1', 'manhattan'):
            affinity = 'cityblock'
        elif callable(affinity):
            X = affinity(X)
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        out = hierarchy.linkage(X, method=linkage, metric=affinity)
        children_ = out[:, :2].astype(np.intp)

        # JESSE: removed return distance and converted return to yield
        # if return_distance:
        #     distances = out[:, 2]
        #     return children_, 1, n_samples, None, distances
        yield children_, 1, n_samples, None
        # Stop here in the unstructured case, mirroring the original `return`.
        return

    if n_components is not None:
        warnings.warn(
            "n_components is now directly calculated from the connectivity "
            "matrix and will be removed in 0.18",
            DeprecationWarning)
    connectivity, n_components = _fix_connectivity(X, connectivity)

    connectivity = connectivity.tocoo()
    # Put the diagonal to zero
    diag_mask = (connectivity.row != connectivity.col)
    connectivity.row = connectivity.row[diag_mask]
    connectivity.col = connectivity.col[diag_mask]
    connectivity.data = connectivity.data[diag_mask]
    del diag_mask

    # if affinity == 'precomputed':
    #     distances = X[connectivity.row, connectivity.col]
    # else:
        # FIXME We compute all the distances, while we could have only computed
        # the "interesting" distances
    #     distances = paired_distances(X[connectivity.row],
    #                                  X[connectivity.col],
    #                                  metric=affinity)
    # connectivity.data = distances

    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        assert n_clusters <= n_samples
        n_nodes = 2 * n_samples - n_clusters

    # if return_distance:
    #     distances = np.empty(n_nodes - n_samples)
    # create inertia heap and connection matrix
    A = np.empty(n_nodes, dtype=object)
    inertia = list()

    # LIL seems to be the best format to access the rows quickly,
    # without the numpy overhead of slicing CSR indices and data.
    connectivity = connectivity.tolil()
    # We are storing the graph in a list of IntFloatDict
    for ind, (data, row) in enumerate(zip(connectivity.data,
                                          connectivity.rows)):
        A[ind] = IntFloatDict(np.asarray(row, dtype=np.intp),
                              np.asarray(data, dtype=np.float64))
        # We keep only the upper triangular for the heap
        # Generator expressions are faster than arrays on the following
        inertia.extend(_hierarchical.WeightedEdge(d, ind, r)
                       for r, d in zip(row, data) if r < ind)
    del connectivity

    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=np.intp)
    children = []

    # recursive merge loop
    # print "n_samples: " + str(n_samples)  # DEBUG
    # print "n_nodes: " + str(n_nodes)  # DEBUG
    for k in range(n_samples, n_nodes):
        # print "k: " + str(k)  # DEBUG

        # Jesse: get labels in order to count cluster sizes
        labels = _hierarchical.hc_get_heads(parent, copy=False)
        labels = np.asarray(labels)

        # identify the merge
        while True:
            edge = heappop(inertia)

            if used_node[edge.a] and used_node[edge.b]:

                # Jesse: cancel merger if it would put us over the dev max cluster size
                if (len(np.where(labels == labels[edge.a])[0]) +
                        len(np.where(labels == labels[edge.b])[0]) > max_size):
                    continue

                break

        i = edge.a
        j = edge.b
        print "k: " + str(k) + ", merging (" + str(i) + ", " + str(j) + ") with dist " + str(edge.weight)  # DEBUG

        # if return_distance:
        #     # store distances
        #     distances[k - n_samples] = edge.weight

        parent[i] = parent[j] = k
        children.append((i, j))
        # Keep track of the number of elements per cluster
        n_i = used_node[i]
        n_j = used_node[j]
        used_node[k] = n_i + n_j
        used_node[i] = used_node[j] = False

        # update the structure matrix A and the inertia matrix
        # a clever 'min', or 'max' operation between A[i] and A[j]
        # Jesse: modified from cython documentation of this function:
        """Merge two IntFloatDicts with the average strategy: when the
        same key is present in the two dicts, the weighted average of the two
        values is used.
        Parameters
        ==========
        A[i], A[j] : IntFloatDict object
            The IntFloatDicts to merge
        used_node : ndarray array of dtype integer and of dimension 1
            a mask for keys to ignore: if not used_node[key] the corresponding key
            is skipped in the output dictionary
        n_i, n_j : float
            n_i and n_j are weights for i and j for the merge strategy.
            They are used for a weighted mean.
        Returns
        =======
        out : IntFloatDict object
            The IntFloatDict resulting from the merge
        """
        if variances is None:
            coord_col = join_func(A[i], A[j], used_node, n_i, n_j)

        # Jesse: in order to implement kl-divergence-based clustering, we need to override
        # Jesse: this function with one that computes new distances based on the merged clusters'
        # Jesse: new means and variance vectors; no average/maximum weighting is necessary since
        # Jesse: average mean vectors and average variance vectors are instead used to calculate
        # Jesse: the new O(n) distances to the remaining cluster centroids
        else:
            # Jesse: update means and variances structures with merge
            means = np.vstack([means, np.add(means[i] * n_i, means[j] * n_j) / (n_i + n_j)])
            variances = np.vstack([variances, np.add(variances[i] * n_i, variances[j] * n_j) / (n_i + n_j)])

            # Jesse: calculate the O(n) kl distances between merged cluster and previous clusters
            # Jesse: testing alternative where all distances are re-calculated on narsil-1 tmux 2
            coord_col = {}  # need to calculate distances for every key; nothing is preserved in kl merge
            for key, dist in A[i]:
                if used_node[key]:
                    coord_col[key] = dist
            for key, dist in A[j]:
                if used_node[key]:
                    if key not in coord_col:
                        coord_col[key] = dist
                    else:
                        # compute new kl distance between entries
                        kld = multivariate_kl_distance(means[k], variances[k], means[key], variances[key])
                        if not np.isfinite(kld):
                            kld = float(sys.maxsize)
                        coord_col[key] = kld
            # dict views must be materialised and cast for the Cython constructor
            coord_col = IntFloatDict(np.asarray(list(coord_col.keys()), dtype=np.intp),
                                     np.asarray(list(coord_col.values()), dtype=np.float64))

        for l, d in coord_col:
            A[l].append(k, d)
            # Here we use the information from coord_col (containing the
            # distances) to update the heap
            # print "pushing " + str(k) + ", " + str(l) + " onto heap with distance " + str(d)  # DEBUG
            heappush(inertia, _hierarchical.WeightedEdge(d, k, l))
        A[k] = coord_col
        # Clear A[i] and A[j] to save memory
        A[i] = A[j] = 0

        # Jesse: yield after every merge
        n_leaves = n_samples
        r_children = np.array(children)[:, ::-1]
        yield r_children, n_components, n_leaves, parent
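The modified linkage_tree above is a generator and relies on an external multivariate_kl_distance helper that is not shown. Below is a plausible stand-in for that helper, assuming diagonal Gaussian clusters and a symmetrised KL divergence, together with a commented consumption loop; both are sketches, not the author's code.

import numpy as np

def multivariate_kl_distance(mean_a, var_a, mean_b, var_b, eps=1e-12):
    """Symmetrised KL divergence between two diagonal Gaussians (assumed form)."""
    var_a = np.maximum(var_a, eps)
    var_b = np.maximum(var_b, eps)
    diff2 = (mean_a - mean_b) ** 2
    kl_ab = 0.5 * np.sum(np.log(var_b / var_a) + (var_a + diff2) / var_b - 1.0)
    kl_ba = 0.5 * np.sum(np.log(var_a / var_b) + (var_b + diff2) / var_a - 1.0)
    return kl_ab + kl_ba

# Consuming the generator: each yield is a snapshot taken after one merge.
# for children, n_components, n_leaves, parents in linkage_tree(
#         X, connectivity=connectivity, linkage='average',
#         max_size=50, means=means, variances=variances):
#     print(children.shape)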