Code example #1
import numpy as np
from scipy.special import digamma


def get_obj(adj_xyz, adj_xz, adj_yz, adj_z, weight):
    # weighted KSG-style objective for conditional MI, computed from
    # precomputed neighbor-index lists in the (x,y,z), (x,z), (y,z) and z spaces
    N = len(weight)
    information_samples = np.zeros(N)
    for i in range(N):
        information_samples[i] += weight[i] * digamma(len(adj_xyz[i]) - 1)
        information_samples[i] += weight[i] * -digamma(len(adj_xz[i]) - 1)
        # the builtin sum is used here: np.sum over a generator is deprecated
        information_samples[i] += weight[i] * -digamma(
            sum(weight[j] for j in adj_yz[i]) - weight[i])
        information_samples[i] += weight[i] * digamma(
            sum(weight[j] for j in adj_z[i]) - weight[i])
    return np.mean(information_samples)
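
This fragment assumes the `adj_*` arguments are per-sample neighbor-index lists. As a sketch of how they could be built with `scipy.spatial.cKDTree`, following the pattern of code example #6 (the arrays `x`, `y`, `z` and the kth-nearest-neighbor radii `knn_dis` are assumed inputs, not part of the original):

import numpy as np
import scipy.spatial as ss

# x, y, z: (N, d) arrays; knn_dis[i]: kth-NN distance of sample i in (x, y, z) space
data_xyz = np.concatenate((x, y, z), axis=1)
data_xz = np.concatenate((x, z), axis=1)
data_yz = np.concatenate((y, z), axis=1)
tree_xyz, tree_xz, tree_yz, tree_z = map(ss.cKDTree, (data_xyz, data_xz, data_yz, z))
adj_xyz = [tree_xyz.query_ball_point(pt, r, p=np.inf) for pt, r in zip(data_xyz, knn_dis)]
adj_xz = [tree_xz.query_ball_point(pt, r, p=np.inf) for pt, r in zip(data_xz, knn_dis)]
adj_yz = [tree_yz.query_ball_point(pt, r, p=np.inf) for pt, r in zip(data_yz, knn_dis)]
adj_z = [tree_z.query_ball_point(pt, r, p=np.inf) for pt, r in zip(z, knn_dis)]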
Code example #2
from copy import deepcopy

import numpy as np
import scipy.spatial as ss
from scipy.special import digamma


def mi(x_orig, y_orig, use_rank_order=False, k=5):
    """Estimates the mutual information of two random variables based on their observed values.

    `mi` takes two random variables :math:`x` and :math:`y` to estimate the mutual information between them using the KSG estimator.
    It relies on the cKDTree class in scipy to query the kNN with the KDTree algorithm.

    Arguments
    ---------
        x_orig: `List`
            One random variable from the time-series data.
        y_orig: `List`
            Another random variable from the time-series data.
        use_rank_order: `bool` (default: False)
            Whether to use rank order instead of actual values for the MI calculation.
        k: `int` (default: 5)
            Number of nearest neighbors used in the KSG estimator.

    Returns
    -------
    A numeric value of the mutual information estimate.
    """
    x = deepcopy(x_orig)
    y = deepcopy(y_orig)

    assert len(x) == len(y), "Lists should have same length"
    N = len(x)

    dx = len(x[0])
    dy = len(y[0])

    # if use_rank_order:
    #    x = rank_order(x)
    #    y = rank_order(y)

    data = np.concatenate((x, y), axis=1)

    tree_xy = ss.cKDTree(data)
    tree_x = ss.cKDTree(x)
    tree_y = ss.cKDTree(y)

    # knn_dis = [tree_xy.query(point, k + 1, p=np.inf)[0][k] for point in data]
    knn_dis = tree_xy.query(data, k + 1, p=np.inf)[0]
    information_samples = [digamma(N) for i in range(N)]

    for i in range(N):
        information_samples[i] += digamma(
            len(tree_xy.query_ball_point(data[i], knn_dis[i][k], p=np.inf)) - 1)
        information_samples[i] += -digamma(
            len(tree_x.query_ball_point(x[i], knn_dis[i][k], p=np.inf)) - 1)
        information_samples[i] += -digamma(
            len(tree_y.query_ball_point(y[i], knn_dis[i][k], p=np.inf)) - 1)

    return np.mean(information_samples)
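
A quick smoke test (my own sketch, not from the source): for a bivariate Gaussian with correlation rho, the true mutual information is -0.5 * log(1 - rho**2), so for rho = 0.5 the estimate should land near 0.14 nats:

rng = np.random.default_rng(0)
rho = 0.5
xy = rng.multivariate_normal([0, 0], [[1, rho], [rho, 1]], size=2000)
x, y = xy[:, [0]].tolist(), xy[:, [1]].tolist()  # lists of 1-D points
print(mi(x, y, k=5))  # analytic value: -0.5 * np.log(1 - rho**2) ~= 0.1438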
Code example #3
from math import log

import numpy as np
import scipy.spatial as ss
from scipy.special import digamma
from sklearn.neighbors import KernelDensity

# `vd` (assumed to return the log-volume of the d-dimensional unit ball) is
# defined elsewhere in the module; see also code example #7


def alternate_umi(x,
                  y,
                  k=5,
                  density_estimation_method="kde",
                  k_density=5,
                  bw=.2):
    assert len(x) == len(y), "Lists should have same length"
    assert k <= len(x) - 1, "Set k smaller than num. samples - 1"
    N = len(x)
    dx = len(x[0])
    dy = len(y[0])
    data = np.concatenate((x, y), axis=1)

    tree_xy = ss.cKDTree(data)
    tree_x = ss.cKDTree(x)
    tree_y = ss.cKDTree(y)

    if density_estimation_method.lower() == "kde":
        kernel = KernelDensity(bandwidth=bw)
        kernel.fit(x)
        kde = np.exp(kernel.score_samples(x))
        weight = (1 / kde) / np.mean(1 / kde)

    elif density_estimation_method.lower() == "knn":
        knn_dis = [
            tree_x.query(point, k_density + 1, p=np.inf)[0][k_density]
            for point in x
        ]
        density_estimate = np.array([
            float(k_density) / N / knn_dis[i]**dx for i in range(len(knn_dis))
        ])
        weight = (1 / density_estimate) / np.mean(1 / density_estimate)

    else:
        raise ValueError("The density estimation method is not recognized")

    knn_dis = [tree_xy.query(point, k + 1, p=2)[0][k] for point in data]
    ans = log(k) + 2 * log(N - 1) - digamma(N) + vd(dx) + vd(dy) - vd(dx + dy)

    # weight_y = np.zeros(N)
    # for i in range(N):
    #     weight_y[i] = np.sum(weight[j] for j in tree_y.query_ball_point(y[i], knn_dis[i], p=2)) - weight[i]
    # weight_y *= N/np.sum(weight_y)

    for i in range(N):
        nx = len(tree_x.query_ball_point(x[i], knn_dis[i], p=2)) - 1
        ny = len(tree_y.query_ball_point(y[i], knn_dis[i], p=2)) - 1
        ans += -weight[i] * log(nx) / N
        ans += -weight[i] * log(ny) / N
        # for j in tree_y.query_ball_point(y[i], knn_dis[i], p=2):
        # ans += -weight[j] * log(weight[j]) /N/ny
        # ans += -weight[i] * log(weight[i]) / N

    return ans
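
Usage mirrors `umi` in code example #7; a sketch (my own, not from the source) on bivariate Gaussian data, with the knn density option to avoid picking a kernel bandwidth (`vd` is assumed to be in scope):

rng = np.random.default_rng(0)
xy = rng.multivariate_normal([0, 0], [[1, 0.5], [0.5, 1]], size=2000)
x, y = xy[:, [0]].tolist(), xy[:, [1]].tolist()
print(alternate_umi(x, y, k=5, density_estimation_method="knn"))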
Code example #4
import numpy as np
import scipy.spatial as ss
from scipy.special import digamma


def entropy(x, k=5):
    """Estimates the entropy of a continuous random variable.

    `entropy` takes a continuous random variable and estimates its entropy using the KSG estimator. It relies on the
    cKDTree class in scipy to query the kNN with the KDTree algorithm.

    Arguments
    ---------
        x: `np.ndarray`
            Data matrix used for calculating the entropy.
        k: `int` (default: 5)
            Number of nearest neighbors used in the entropy calculation.

    Returns
    -------
    A numeric value of the entropy estimate.
    """
    N = len(x)  # the number of observed samples
    # k = int(np.floor(np.sqrt(N)))
    d = len(x[0])  # the dimensionality of the data
    tree = ss.cKDTree(x)  # kd-tree for quick nearest-neighbor lookup
    # distance to the kth nearest neighbor for every point
    knn_dis = [tree.query(point, k + 1, p=np.inf)[0][k] for point in x]
    ans = -digamma(k) + digamma(N)
    # np.mean over a map object fails on Python 3; take the logs with numpy instead
    return ans + d * np.mean(np.log(knn_dis))
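
A sanity check (my own sketch, not from the source): the differential entropy of a 1-D standard Gaussian is 0.5 * log(2 * pi * e) ~= 1.42 nats. Note that the formula above appears to omit the max-norm ball-volume constant (d * log 2 in Kraskov et al.'s convention), so the raw output will sit below the analytic value by roughly that amount:

rng = np.random.default_rng(0)
x = rng.standard_normal((5000, 1))
# analytic entropy: 0.5 * np.log(2 * np.pi * np.e) ~= 1.42;
# expect roughly 1.42 - np.log(2) ~= 0.73 from this estimator
print(entropy(x, k=5))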
Code example #5
from math import log

import matplotlib.pyplot as plt
import numpy as np
import numpy.random as nr
import scipy.spatial as ss
from scipy.special import digamma

# the helpers d_regularizer, d_regularizer_2, d_regularizer_3 and projection
# used below are assumed to be defined elsewhere in the module


def sc(x,
       y,
       k=5,
       bw=0.2,
       init_weight_option=1,
       eta=0.5,
       lamb=100,
       T=10,
       method="grad",
       regularization_type="0",
       th=1e-3):
    def get_obj(adj_x, adj_y, weight):
        N = len(weight)
        ans = 0
        for i in range(N):
            nx = len(adj_x[i]) - 1
            ny = sum(weight[j] for j in adj_y[i]) - weight[i]
            ans += -weight[i] * log(nx) / N
            ans += -weight[i] * log(ny) / N
        return ans

    def get_stoch_grad(adj_x, adj_y, weight, i):
        N = len(weight)
        ans = np.zeros(N)
        nx = len(adj_x[i]) - 1
        ny = sum(weight[j] for j in adj_y[i]) - weight[i]
        for j in adj_y[i]:
            ans[j] += -weight[i] / (ny * N)
        ans[i] += -(log(nx) + log(ny)) / N  #+ weight[i] / (ny * N)
        return ans * np.sqrt(N)

    def get_grad(adj_x, adj_y, weight):
        N = len(weight)
        ans = np.zeros(N)
        for i in range(N):
            nx = len(adj_x[i]) - 1
            ny = sum(weight[j] for j in adj_y[i]) - weight[i]
            for j in adj_y[i]:
                ans[j] += -weight[i] / (ny * N)
            ans[i] += -(log(nx) + log(ny)) / N  #+ weight[i] / (ny * N)
        return ans

    assert len(x) == len(y), "Lists should have same length"
    assert k <= len(x) - 1, "Set k smaller than num. samples - 1"

    # solvers.options['show_progress'] = False

    N = len(x)

    # sort the (x, y) pairs by x
    sorted_xy = sorted(zip(x, y))
    x = [i for i, _ in sorted_xy]
    y = [i for _, i in sorted_xy]
    data = np.concatenate((x, y), axis=1)

    tree_xy = ss.cKDTree(data)
    tree_x = ss.cKDTree(x)
    tree_y = ss.cKDTree(y)

    knn_dis = [tree_xy.query(point, k + 1, p=np.inf)[0][k] for point in data]
    adj_x = []
    adj_y = []
    for i in range(N):
        adj_x.append(tree_x.query_ball_point(x[i], knn_dis[i], p=np.inf))
        adj_y.append(tree_y.query_ball_point(y[i], knn_dis[i], p=np.inf))

    if init_weight_option == 0:
        weight = np.ones(N) + nr.normal(0, 0.1, N)
    else:
        k_density = 5
        dx = len(x[0])
        knn_dis = [
            tree_x.query(point, k_density + 1, p=np.inf)[0][k_density]
            for point in x
        ]
        density_estimate = np.array([
            float(k_density) / N / knn_dis[i]**dx for i in range(len(knn_dis))
        ])
        weight = (1 / density_estimate) / np.mean(1 / density_estimate)
        # kernel = KernelDensity(bandwidth=bw)
        # kernel.fit(x)
        # kde = np.exp(kernel.score_samples(x))
        # weight = (1 / kde).clip(1e-8, np.sqrt(N))
        # weight = weight / np.mean(weight)

    # second-moment constraint sum_i weight[i] * A[i] = b, preserved by the
    # projection step below (the scalar assignment to A[i] assumes 1-D x)
    A = np.zeros(N)
    b = 0
    for i in range(N):
        A[i] = (x[i] - np.mean(x))**2
        b += weight[i] * A[i]

    ans = digamma(k) + 2 * log(N - 1) - digamma(N)  # + vd(dx) + vd(dy) - vd(dx + dy)

    for i in range(T):

        if method == "grad":
            gradient = get_grad(adj_x, adj_y, weight)
        elif method == "stoch_grad":
            ind = nr.randint(N)
            gradient = get_stoch_grad(adj_x, adj_y, weight, ind)
        else:
            raise ValueError("Cannot recognize the method")
        # gradient = gradient/np.linalg.norm(gradient)
        # print np.linalg.norm(gradient)

        weight = weight + eta * gradient
        if regularization_type == "1":
            weight = weight - eta * lamb * d_regularizer(weight)
        elif regularization_type == "2":
            weight = d_regularizer_2(weight)
        elif regularization_type == "3":
            weight = d_regularizer_3(weight)

        weight = projection(weight, A, b)

        print(ans + get_obj(adj_x, adj_y, weight))

    print(weight)
    plt.plot(weight)
    plt.show()
    return ans + get_obj(adj_x, adj_y, weight)
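
The helpers `d_regularizer`, `d_regularizer_2`, `d_regularizer_3` and `projection` are not shown in this snippet. Purely as a hypothetical sketch, consistent with how `projection(weight, A, b)` is called above (a Euclidean projection of the weights back onto the constraint set where np.dot(A, weight) == b, plus a nonnegativity clip); the actual helper may differ:

def projection(weight, A, b):
    # hypothetical sketch: Euclidean projection onto {w : np.dot(A, w) == b},
    # followed by clipping the weights at zero
    w = weight + A * (b - A.dot(weight)) / A.dot(A)
    return w.clip(min=0)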
Code example #6
from copy import deepcopy

import numpy as np
import scipy.spatial as ss
from scipy.special import digamma
from sklearn.neighbors import KernelDensity


def cumi(x_orig,
         y_orig,
         z_orig,
         normalization=False,
         k=5,
         density_estimation_method="kde",
         k_density=5,
         bw=.01):
    """Calculates the uniformed conditional mutual information where the distribution for :math:`x` and :math:`z` is replaced by a uniform distribution.

    `cumi` takes two random variable :math:`x` and :math:`y` and estimated their mutual information conditioned on the
    third random variable :math:`z` using the KSG estimator while :math:`x`, :math:`y` is replaced by a uniform distribution.

    Arguments
    ---------
        x_orig: `List`
            One random variable from the time-series data.
        y_orig: `List`
            Another random variable from the time-series data.
        z_orig: `List`
            The conditioning random variable from the time-series data.
        normalization: `bool` (Default: False)
            Whether to normalize the expression of :math:`x, y, z` by their standard deviation.
        k: `int` (default: 5)
            Number of nearest neighbors used in the KSG estimator.
        density_estimation_method: `str` (default: `kde`)
            Which density estimator to use: `kde` for a kernel density estimator, `knn` for a knn-based estimator.
        k_density: `int` (default: 5)
            The number of nearest neighbors used when estimating the density (only applicable when
            density_estimation_method is set to `knn`).
        bw: `float` (default: 0.01)
            Bandwidth used for the kernel density estimator.

    Returns
    -------
    An estimated conditional mutual information value between the two variables (x, y), conditioned on a third
    variable z, where the joint distribution of (x, z) is replaced by a uniform distribution.
    """
    x = deepcopy(x_orig)
    y = deepcopy(y_orig)
    z = deepcopy(z_orig)

    assert len(x) == len(y), "Lists should have same length"
    assert len(x) == len(z), "Lists should have same length"

    N = len(x)

    dx = len(x[0])
    dy = len(y[0])
    dz = len(z[0])

    if normalization:
        x /= np.std(x)
        y /= np.std(y)
        z /= np.std(z)

    data_xyz = np.concatenate((x, y, z), axis=1)
    data_xz = np.concatenate((x, z), axis=1)
    data_yz = np.concatenate((y, z), axis=1)

    tree_xyz = ss.cKDTree(data_xyz)
    tree_xz = ss.cKDTree(data_xz)
    tree_yz = ss.cKDTree(data_yz)
    tree_z = ss.cKDTree(z)

    if density_estimation_method.lower() == "kde":
        kernel = KernelDensity(bandwidth=bw)
        kernel.fit(data_xz)
        kde = np.exp(kernel.score_samples(data_xz))
        weight = (1 / kde) / np.mean(1 / kde)
    elif density_estimation_method.lower() == "knn":
        knn_dis = [
            tree_xz.query(point, k_density + 1, p=np.inf)[0][k_density]
            for point in data_xz
        ]
        density_estimate = np.array([
            float(k_density) / N / knn_dis[i]**(dx + dz)
            for i in range(len(knn_dis))
        ])
        weight = (1 / density_estimate) / np.mean(1 / density_estimate)
    else:
        raise ValueError("The density estimation method is not recognized")

    knn_dis = [
        tree_xyz.query(point, k + 1, p=np.inf)[0][k] for point in data_xyz
    ]
    information_samples = np.zeros(N)
    for i in range(N):
        information_samples[i] += weight[i] * digamma(
            len(tree_xyz.query_ball_point(data_xyz[i], knn_dis[i], p=np.inf)) - 1)
        information_samples[i] += weight[i] * -digamma(
            len(tree_xz.query_ball_point(data_xz[i], knn_dis[i], p=np.inf)) - 1)
        # the builtin sum is used below: np.sum over a generator is deprecated
        information_samples[i] += weight[i] * -digamma(
            sum(weight[j] for j in tree_yz.query_ball_point(
                data_yz[i], knn_dis[i], p=np.inf)) - weight[i])
        information_samples[i] += weight[i] * digamma(
            sum(weight[j] for j in tree_z.query_ball_point(
                z[i], knn_dis[i], p=np.inf)) - weight[i])
    return np.mean(information_samples)
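
A usage sketch on synthetic data (my own, not from the source), with the knn density option so no bandwidth has to be chosen; when x and y are conditionally independent given z, the estimate should be close to zero:

rng = np.random.default_rng(0)
z = rng.standard_normal((1000, 1))
x = (z + 0.5 * rng.standard_normal((1000, 1))).tolist()
y = (z + 0.5 * rng.standard_normal((1000, 1))).tolist()
print(cumi(x, y, z.tolist(), k=5, density_estimation_method="knn"))  # expect ~0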
Code example #7
from math import log

import numpy as np
import scipy.spatial as ss
from scipy.special import digamma
from sklearn.neighbors import KernelDensity

# `vd` (assumed to return the log-volume of the d-dimensional unit ball) is
# defined elsewhere in the module


def umi(x, y, k=5, density_estimation_method="kde", k_density=5, bw=.01):
    """Calculates the uniformed mutual information where the distribution for :math:`x` is replaced by a uniform distribution.

    `umi` takes two random variable x and y and estimated their mutual using the KSG estimator while x is replaced by a
    uniform distribution.

    Arguments
    ---------
        x: `List`
            One random variable from the time-series data.
        y: `List`
            Another random variable from the time-series data.
        k: `int` (default: 5)
            Number of nearest neighbors used in the KSG estimator.
        density_estimation_method: `str` (default: `kde`)
            Which density estimator to use: `kde` for a kernel density estimator, `knn` for a knn-based estimator.
        k_density: `int` (default: 5)
            The number of nearest neighbors used when estimating the density (only applicable when
            density_estimation_method is set to `knn`).
        bw: `float` (default: 0.01)
            Bandwidth used for the kernel density estimator.

    Returns
    -------
    An estimated uniform mutual information value between the two variables (x, y), where the distribution of x is
    replaced by a uniform distribution.
    """
    assert len(x) == len(y), "Lists should have same length"
    assert k <= len(x) - 1, "Set k smaller than num. samples - 1"
    N = len(x)
    dx = len(x[0])
    dy = len(y[0])
    data = np.concatenate((x, y), axis=1)

    tree_xy = ss.cKDTree(data)
    tree_x = ss.cKDTree(x)
    tree_y = ss.cKDTree(y)

    if density_estimation_method.lower() == "kde":
        kernel = KernelDensity(bandwidth=bw)
        kernel.fit(x)
        kde = np.exp(kernel.score_samples(x))
        weight = (1 / kde) / np.mean(1 / kde)

    elif density_estimation_method.lower() == "knn":
        knn_dis = [
            tree_x.query(point, k_density + 1, p=np.inf)[0][k_density]
            for point in x
        ]
        density_estimate = np.array([
            float(k_density) / N / knn_dis[i]**dx for i in range(len(knn_dis))
        ])
        weight = (1 / density_estimate) / np.mean(1 / density_estimate)

    else:
        raise ValueError("The density estimation method is not recognized")

    knn_dis = [tree_xy.query(point, k + 1, p=2)[0][k] for point in data]
    ans = digamma(k) + 2 * log(N - 1) - digamma(N) + vd(dx) + vd(dy) - vd(dx + dy)

    # normalized y-neighborhood weights (computed but not used below)
    weight_y = np.zeros(N)
    for i in range(N):
        weight_y[i] = sum(weight[j] for j in tree_y.query_ball_point(
            y[i], knn_dis[i], p=2)) - weight[i]
    weight_y *= N / np.sum(weight_y)

    for i in range(N):
        nx = len(tree_x.query_ball_point(x[i], knn_dis[i], p=2)) - 1
        ny = sum(weight[j] for j in tree_y.query_ball_point(
            y[i], knn_dis[i], p=2)) - weight[i]
        ans += -weight[i] * log(nx) / N
        # ans += -ny * log(ny) / N / (len(tree_y.query_ball_point(y[i], knn_dis[i], p=2)) - 1)
        ans += -weight[i] * log(ny) / N
    return ans
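
A usage sketch (my own, not from the source) on bivariate Gaussian data, using the knn density option (`vd` is assumed to be in scope):

rng = np.random.default_rng(1)
xy = rng.multivariate_normal([0, 0], [[1, 0.5], [0.5, 1]], size=2000)
x, y = xy[:, [0]].tolist(), xy[:, [1]].tolist()
print(umi(x, y, k=5, density_estimation_method="knn"))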
Code example #8
from copy import deepcopy

import numpy as np
import scipy.spatial as ss
from scipy.special import digamma


def cmi(x_orig, y_orig, z_orig, normalization=False, k=5):
    """Estimates the CONDITIONAL mutual information of :math:`x` and :math:`y` given :math:`z`.

    `cmi` takes two random variables :math:`x` and :math:`y` and estimates their mutual information conditioned on a
    third random variable :math:`z` using the KSG estimator. It relies on the cKDTree class in scipy to query the kNN
    with the KDTree algorithm.

    Arguments
    ---------
        x_orig: `List`
            One random variable from the time-series data.
        y_orig: `List`
            Another random variable from the time-series data.
        z_orig: `List`
            Condition random variable for variables (:math:`x, y`) from the time-series data.
        normalization: `bool` (default: False)
            Whether to normalize the expression of :math:`x, y, z` by their standard deviation.
        k: `int` (default: 5)
            Number of nearest neighbors used in the KSG estimator.

    Returns
    -------
    A numeric value of the conditional mutual information estimate.
    """
    x = deepcopy(x_orig)
    y = deepcopy(y_orig)
    z = deepcopy(z_orig)

    assert len(x) == len(y), "Lists should have same length"
    assert len(x) == len(z), "Lists should have same length"

    N = len(x)

    dx = len(x[0])
    dy = len(y[0])
    dz = len(z[0])

    if normalization:
        x /= np.std(x)
        y /= np.std(y)
        z /= np.std(z)

    data_xyz = np.concatenate((x, y, z), axis=1)
    data_xz = np.concatenate((x, z), axis=1)
    data_yz = np.concatenate((y, z), axis=1)

    tree_xyz = ss.cKDTree(data_xyz, balanced_tree=False)
    tree_xz = ss.cKDTree(data_xz, balanced_tree=False)
    tree_yz = ss.cKDTree(data_yz, balanced_tree=False)
    tree_z = ss.cKDTree(z, balanced_tree=False)

    # knn_dis = [tree_xyz.query(point, k + 1, p=np.inf)[0][k] for point in data_xyz]
    knn_dis = tree_xyz.query(data_xyz, k + 1, p=np.inf)[0][:, k]
    information_samples = np.zeros(N)
    for i in range(N):
        information_samples[i] += digamma(
            len(tree_xyz.query_ball_point(data_xyz[i], knn_dis[i], p=np.inf)) - 1)
        information_samples[i] += -digamma(
            len(tree_xz.query_ball_point(data_xz[i], knn_dis[i], p=np.inf)) - 1)
        information_samples[i] += -digamma(
            len(tree_yz.query_ball_point(data_yz[i], knn_dis[i], p=np.inf)) - 1)
        information_samples[i] += digamma(
            len(tree_z.query_ball_point(z[i], knn_dis[i], p=np.inf)) - 1)
    return np.mean(information_samples)
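
A quick check (my own sketch, not from the source): when x and y are conditionally independent given z, `cmi` should return a value near zero, while `mi(x, y)` from code example #2 stays clearly positive:

rng = np.random.default_rng(0)
z = rng.standard_normal((1000, 1))
x = (z + 0.5 * rng.standard_normal((1000, 1))).tolist()
y = (z + 0.5 * rng.standard_normal((1000, 1))).tolist()
print(cmi(x, y, z.tolist()))  # expect a value near 0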