Example #1
def _config_med(x):
    """
    slice the vector up to the last value to exclude grouping column
    and transpose the vector to be compatible with hd.geomedian
    """
    X = x.values[:, :-1]
    return np.array(hd.geomedian(X.T))
            for item in dic.items():
                if item[1] <= mean:
                    # mark entries whose value is at or below the mean
                    need_del.append(item[0])
            print(need_del)
            # drop the marked rows, average what remains, and restore this layer's shape
            one_layer_set = np.delete(one_layer_set, need_del, axis=0)
            selected = one_layer_set.mean(axis=0)
            selected = np.reshape(np.array(selected), shape_cur_layer)
            res.append(selected)

        return res


if __name__ == "__main__":
    # baseline = Baseline()
    a = np.array([[1], [2], [3], [4], [5]]).astype(float)
    b = hdm.geomedian(a, axis=0)
    print(np.array(b))
    # a = np.array([ 1.0,2.0 ])
    # b = np.array( [ [ [1,2], [3,4], [4,5] ], [ [7,8], [9,10], [11,12] ], [ [13,14], [15,16], [17,18] ] ] )
    # c = np.array([ [ [ [1,2], [3,4] ] ], [ [ [5,6], [7,8] ] ] ])
    # # print(a, a.shape)
    # # print(b, b.shape)
    # # print(c, c.shape)
    # # print(b.flatten())
    # test_input = [ [a,b,c], [ a + 1, b + 1, c + 1], [ a + 2, b + 2, c + 2]]
    # print(test_input)
    # for item in baseline.cal_TrimmedMean(test_input):
    #     print(item, item.shape)
    # test_input = np.array([[1,2,3.4,4,6,2],
    #                        [1,2,3.4,4,6,2],
    #                        [1,2,3.4,4,6,2],
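A minimal usage sketch for the _config_med helper defined at the top of this example. It assumes a pandas DataFrame whose last column is a numeric grouping key; the frame and column names below are made up.

import numpy as np
import pandas as pd
import hdmedians as hd

# toy frame: feature columns first, numeric grouping key last
df = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0],
                   'y': [2.0, 1.0, 0.0, 5.0],
                   'group': [0, 0, 1, 1]})

# geometric median of the feature columns for a single group
print(_config_med(df[df['group'] == 0]))

# or per group; note the [:, :-1] slice in _config_med relies on pandas
# passing the grouping column through to the applied function
print(df.groupby('group').apply(_config_med))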
def compute_quantization(samples, binwidth, placement_scheme='on_mode'):
    """
  Calculates the assignment points for uniformly-spaced quantization bins

  The problem we need to solve is: given that we have bins with uniform spacing
  (an therefore fixed width), how should they be aligned? Should we place an
  assignment point directly on the mean of the distribution? On the mode of
  the distribution? On the median? On the value zero? This function calculates
  the assignment points based on one of these choices.

  Parameters
  ----------
  samples : ndarray (d, n) or (d,)
      This is an array of d samples of an n-dimensional random variable
      that we wish to find the uniform quantizer for. If these are scalar random
      variables, we will accept a 1d array as input.
  binwidth : ndarray (n, ) or float
      The width of the quantization bins in each dimension. If the input is
      multivariate (samples.ndim = 2), then we must specify a binwidth for each
      of the dimensions.
  placement_scheme : str, optional
      Determines where we place one of the assignment points. It can be one of
      {'on_mode', 'on_median', 'on_mean', 'on_zero'}.
      'on_mode': estimating the distribution from a histogram, take the mode
        of this estimate and place a point directly on this value.
      'on_median': place a point directly on the median of these values.
      'on_mean': place a point directly on the mean of these values.
      'on_zero': place a point directly on the value 0.0.
      Default 'on_mode'.

  Returns
  -------
  assignment_pts : ndarray (m, n) or (m,)
      The converged assignment points
  cluster_assignments : ndarray (d, )
      For each sample, the index of the codeword to which uniform quantization
      assigns this datapoint. We can compute the actual quantized values outside
      this function by invoking `assignment_pts[cluster_assignments]`
  MSE : float
      The mean squared error (the mean l2-normed-squared to be precise) for the
      returned quantization.
  shannon_entropy : float
      The (empirical) Shannon entropy for this code. We can say that assuming
      we use a lossless binary source code, that our expected codeword length
      is precisely this value.
  """
    if samples.ndim == 2:
        assert type(binwidth) == np.ndarray
        assert len(binwidth) == samples.shape[1]
    if placement_scheme == 'on_mode':
        assert samples.shape[0] > 1000, (
            'Cannot accurately estimate the mode of the ' +
            'distribution with so few samples. Try another placement scheme')

    if placement_scheme == 'on_mode':
        if samples.ndim == 1:
            # numpy's histogramdd() is slow on 1d samples for some reason so we
            # use good old-fashioned histogram()
            counts, hist_bin_edges = np.histogram(samples, 100)
            hist_bin_centers = (hist_bin_edges[:-1] + hist_bin_edges[1:]) / 2
            largest_count = np.argmax(counts)
            anchored_pt = hist_bin_centers[
                largest_count]  # the mode of the dist
        else:
            counts, hist_bin_edges = np.histogramdd(samples, 100)
            hist_bin_centers = [
                (hist_bin_edges[x][:-1] + hist_bin_edges[x][1:]) / 2
                for x in range(len(hist_bin_edges))
            ]
            largest_count = np.unravel_index(np.argmax(counts), counts.shape)
            anchored_pt = np.array([
                hist_bin_centers[coord_idx][largest_count[coord_idx]]
                for coord_idx in range(counts.ndim)
            ])
            #^ the mode of the dist, in n dimensions
    elif placement_scheme == 'on_median':
        if samples.ndim == 1:
            anchored_pt = np.median(samples)
        else:
            # the geometric median is a high-dimensional generalization of the median.
            # It minimizes the sum of distances, NOT the sum of squared distances,
            # which makes it *different from the multivariate mean, or centroid*. You
            # can verify this for yourself on synthetic data.
            anchored_pt = np.array(hdmedians.geomedian(samples, axis=0))
    elif placement_scheme == 'on_mean':
        anchored_pt = np.mean(samples, axis=0)
    elif placement_scheme == 'on_zero':
        if samples.ndim == 1:
            anchored_pt = 0.0
        else:
            anchored_pt = np.zeros((samples.shape[1], ))
    else:
        raise KeyError('Unrecognized placement scheme ' + placement_scheme)

    max_val_each_dim = np.max(samples, axis=0)
    min_val_each_dim = np.min(samples, axis=0)
    assert np.all(anchored_pt < max_val_each_dim)
    assert np.all(anchored_pt >= min_val_each_dim)
    # cast to int so np.linspace below accepts these as point counts
    num_pts_lower = np.floor((anchored_pt - min_val_each_dim) / binwidth).astype(int)
    num_pts_higher = np.floor((max_val_each_dim - anchored_pt) / binwidth).astype(int)
    num_a_pts_each_dim = num_pts_lower + num_pts_higher + 1
    if samples.ndim == 1:
        assignment_pts = np.linspace(anchored_pt - num_pts_lower * binwidth,
                                     anchored_pt + num_pts_higher * binwidth,
                                     num_a_pts_each_dim)
        if placement_scheme == 'on_zero':
            # there seems to be a numerical issue with linspace keeping the
            # anchored point exactly on zero - it can drift to something like 1e-14.
            # Since we clearly want the point exactly on zero, correct this
            # before we return.
            assignment_pts[np.argmin(np.abs(assignment_pts))] = 0.0
    else:
        # careful, this can get huge in high dimensions.
        assignment_pts = np.array(
            list(
                cartesian_product(*[
                    np.linspace(
                        anchored_pt[x] -
                        num_pts_lower[x] * binwidth[x], anchored_pt[x] +
                        num_pts_higher[x] * binwidth[x], num_a_pts_each_dim[x])
                    for x in range(samples.shape[1])
                ])))
        if placement_scheme == 'on_zero':
            # see the comment above regarding this correction
            assignment_pts[np.argmin(np.linalg.norm(assignment_pts, axis=1))] = \
                np.zeros((samples.shape[1], ))

    quantized_code, cluster_assignments = quantize(samples, assignment_pts,
                                                   True)

    if samples.ndim == 1:
        MSE = np.mean(np.square(quantized_code - samples))
    else:
        MSE = np.mean(np.sum(np.square(quantized_code - samples), axis=1))

    cword_probs = calculate_assignment_probabilites(cluster_assignments,
                                                    assignment_pts.shape[0])
    assert np.isclose(np.sum(cword_probs), 1.0)
    nonzero_prob_pts = np.where(cword_probs != 0)  # avoid log2(0)
    shannon_entropy = -1 * np.sum(
        cword_probs[nonzero_prob_pts] * np.log2(cword_probs[nonzero_prob_pts]))

    return assignment_pts, cluster_assignments, MSE, shannon_entropy
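A minimal, illustrative sketch of calling compute_quantization for the scalar (1-d) case. It assumes numpy is imported as np and that the helpers the function relies on (quantize, cartesian_product, calculate_assignment_probabilites) are defined in the same module; the data and binwidth below are made up.

import numpy as np

rng = np.random.default_rng(0)
scalar_samples = rng.normal(loc=0.0, scale=1.0, size=5000)   # (d,) scalar case

# place an assignment point on the sample mean, with bins 0.25 wide
pts, assignments, mse, entropy = compute_quantization(
    scalar_samples, binwidth=0.25, placement_scheme='on_mean')

quantized = pts[assignments]   # the actual quantized values, as noted in the docstring
print(pts.shape, mse, entropy)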
Example #4
 def test_hdmedians(self):
     exp = np.array([2.01956244, 1.53164546, 2.60571752, 0.91424179,
                     1.76214416, 1.69943057])
     obs = np.array(hd.geomedian(self.eq_mat.data))
     npt.assert_almost_equal(obs, exp, decimal=6)
Example #5
import numpy as np
import matplotlib.pyplot as plt
import hdmedians as hd

def pcoord(X, c=None):
    # plot each row of X as one line in a parallel-coordinates plot
    n, p = X.shape
    dims = range(1, p + 1)
    for obs in X:
        plt.plot(dims, obs, c=c)


from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
X1 = X[y == 1]
plt.figure(figsize=(10, 3), dpi=200)
pcoord(X1, c='#aaaaaa')
md = hd.medoid(X1, axis=0)
gm = hd.geomedian(X1, axis=0)
xx = np.arange(X.shape[1]) + 1
plt.plot(xx, md, c='m', ls='--', lw=2, label='Medoid')
plt.plot(xx, gm, c='r', ls='-', lw=2, label='Geometric Median')
plt.xticks(xx, iris.feature_names)
plt.title('Iris data set (' + iris.target_names[1].title() + ' class)')
plt.grid(color='k', ls=':', axis='x')
plt.legend(framealpha=1.0)
plt.savefig('docs/fig1.svg')

# n, p = (40, 20)
# loc1 = np.random.normal(1, 2.0, size=(p,))
# loc2 = loc1 + np.random.normal(1.0, 1.0, size=(p,))
# sd = np.random.uniform(0.1, 0.2, size=(p,))
# X1 = np.random.normal(loc=loc1, scale=sd, size=(n, p))
# X2 = np.random.normal(loc=loc2, scale=sd, size=(n, p))
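The figure code above contrasts the medoid and the geometric median on the iris data. A tiny synthetic check (illustrative data only) of how the geometric median also differs from the component-wise mean when an outlier is present:

import numpy as np
import hdmedians as hd

pts = np.array([[0.0, 0.0],
                [1.0, 0.0],
                [0.0, 1.0],
                [1.0, 1.0],
                [100.0, 100.0]])   # four clustered points plus one gross outlier

print(np.mean(pts, axis=0))                 # centroid, dragged toward the outlier
print(np.array(hd.geomedian(pts, axis=0)))  # geometric median, stays near (0.5, 0.5)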
Example #6
def test_geomedian_two_obs():
    data = np.array([[1.0, 2.0, 1.0],
                     [2.0, 1.0, 1.0]])
    m = hd.geomedian(data, axis=0)
    r = np.array([1.5, 1.5, 1.0])
    assert_array_almost_equal(m, r, decimal=3)
Example #8
def test_geomedian_same_values():
    data = np.ones((4, 2))
    m = hd.geomedian(data, axis=1)
    print(np.sum(m))
    r = np.median(data, axis=1)
    assert_array_almost_equal(m, r, decimal=3)
Example #9
def test_geomedian_1d():
    data = np.ones((1, 3))
    m = hd.geomedian(data, axis=1)
    r = np.median(data, axis=1)
    assert_array_almost_equal(m, r, decimal=3)
Example #10
def test_geomedian_axis_one():
    m = hd.geomedian(DATA1.astype(np.float32), axis=1)
    r = np.array([684.9332, 962.1752, 1247.556, 2340.647, 3473.594, 2584.103])
    assert_array_almost_equal(m, r, decimal=3)
Example #11
def test_geomedian_shape_axis_one():
    a = np.random.normal(1, size=(6, 10))
    m = hd.geomedian(a, axis=1)
    assert_equal(m.shape, (6, ))
Example #12
def test_geomedian_shape_noaxis():
    a = np.random.normal(1, size=(6, 10))
    m = hd.geomedian(a)
    assert_equal(m.shape, (6, ))
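As the two shape tests above suggest, omitting axis behaves like axis=1, i.e. the columns are treated as the observations; a short check (random data, illustrative only):

import numpy as np
import hdmedians as hd

a = np.random.normal(1, size=(6, 10))
print(np.array(hd.geomedian(a)).shape)          # (6,)  -- same as axis=1
print(np.array(hd.geomedian(a, axis=1)).shape)  # (6,)
print(np.array(hd.geomedian(a, axis=0)).shape)  # (10,)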
Example #13
def _config_med(x):
    """
    Slice the vector up to the last value to exclude the grouping column, and transpose it to be compatible with hd.geomedian.
    """
    X = x.values[:, :-1]
    return np.array(hd.geomedian(X.T))
    def __call__(self, params: List[Parameters]) -> Parameters:
        '''
        This aggregator takes n lists of model parameters and returns the
        component-wise geometric median.

        Parameters
        ----------
        params : A list of Parameters objects. These objects support addition
            and scalar multiplication.

        Returns
        -------
        A new Parameters object that is the geometric median of params.
        '''

        models = params

        shapes = []
        b = []
        once = True
        newWeightsList = []
        try:
            for i, model in enumerate(models):
                w2 = model.get()
                # flatten every layer of this model into one long vector c
                c = np.array([])
                for j in range(len(w2)):
                    z = np.array(w2[j])
                    if len(shapes) < len(w2):
                        shapes.append(z.shape)
                    d = np.array(w2[j].flatten()).squeeze()
                    c = np.concatenate([c, d])
                if once:
                    # first model: start a (D, 1) matrix of flattened weights
                    b = c.reshape((-1, 1))
                    once = False
                else:
                    # subsequent models: append their flattened weights as new columns
                    b = np.concatenate([b, c.reshape((-1, 1))], axis=1)
            # component-wise geometric median across models (columns)
            median_val = np.array(hd.geomedian(b))
            sizes = []
            for j in shapes:
                size = 1
                for k in j:
                    size *= k
                sizes.append(size)
            newWeightsList = []

            chunks = []
            count = 0
            for size in sizes:
                chunks.append([median_val[i + count] for i in range(size)])
                count += size
            for chunk, i in zip(chunks, range(len(shapes))):
                newWeightsList.append(np.array(chunk).reshape(shapes[i]))

        except Exception as e:
            print("Error happened! Message is ", e)
        newParams = params[0].getCopy()
        return newParams.set(newWeightsList)
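The try-block above flattens each model, stacks the flattened vectors as columns, takes the geometric median across models, and reshapes the result back into per-layer arrays. A compact restatement of that idea on plain numpy arrays (no Parameters wrapper; the layer shapes in the toy usage are made up):

import numpy as np
import hdmedians as hd

def geomedian_aggregate(models):
    # models: list of lists of ndarrays (one inner list of layers per model)
    shapes = [layer.shape for layer in models[0]]
    # one column per model, rows are the concatenated flattened layers
    stacked = np.stack([np.concatenate([layer.ravel() for layer in m])
                        for m in models], axis=1)
    median_flat = np.array(hd.geomedian(stacked))   # default axis=1: across models
    out, offset = [], 0
    for shape in shapes:
        size = int(np.prod(shape))
        out.append(median_flat[offset:offset + size].reshape(shape))
        offset += size
    return out

# toy usage: three "models", each with a (2, 3) layer and a (4,) layer
models = [[np.random.randn(2, 3), np.random.randn(4)] for _ in range(3)]
for layer in geomedian_aggregate(models):
    print(layer.shape)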