def get_obj(adj_xyz, adj_xz, adj_yz, adj_z, weight):
    # Helper that evaluates the weighted KSG objective from precomputed neighbor
    # (adjacency) lists in the joint space (xyz) and the marginal spaces (xz, yz, z).
    N = len(weight)
    information_samples = [0 for cnt in range(N)]
    for i in range(N):
        information_samples[i] += weight[i] * digamma(len(adj_xyz[i]) - 1)
        information_samples[i] += weight[i] * -digamma(len(adj_xz[i]) - 1)
        information_samples[i] += weight[i] * -digamma(
            sum(weight[j] for j in adj_yz[i]) - weight[i])
        information_samples[i] += weight[i] * digamma(
            sum(weight[j] for j in adj_z[i]) - weight[i])
    return np.mean(information_samples)
def mi(x_orig, y_orig, use_rank_order=False, k=5):
    """Estimates the mutual information of two random variables based on their observed values.

    `mi` takes two random variables :math:`x` and :math:`y` and estimates the mutual
    information between them using the KSG estimator. It relies on the cKDTree class
    in scipy to query the k-nearest neighbors with the KDTree algorithm.

    Arguments
    ---------
        x_orig: `List`
            One random variable from the time-series data.
        y_orig: `List`
            Another random variable from the time-series data.
        use_rank_order: `bool` (default: False)
            Whether to use rank order instead of the actual value for the MI calculation.
        k: `int` (Default: 5)
            Number of nearest neighbors used in the entropy calculation.

    Returns
    -------
        A numeric value of the mutual information estimate.
    """
    x = deepcopy(x_orig)
    y = deepcopy(y_orig)
    assert len(x) == len(y), "Lists should have same length"

    N = len(x)
    dx = len(x[0])
    dy = len(y[0])

    # if use_rank_order:
    #     x = rank_order(x)
    #     y = rank_order(y)

    data = np.concatenate((x, y), axis=1)

    tree_xy = ss.cKDTree(data)
    tree_x = ss.cKDTree(x)
    tree_y = ss.cKDTree(y)

    # knn_dis = [tree_xy.query(point, k + 1, p=np.inf)[0][k] for point in data]
    knn_dis = tree_xy.query(data, k + 1, p=np.inf)[0]
    information_samples = [digamma(N) for i in range(N)]

    for i in range(N):
        information_samples[i] += digamma(
            len(tree_xy.query_ball_point(data[i], knn_dis[i][k], p=np.inf)) - 1)
        information_samples[i] += -digamma(
            len(tree_x.query_ball_point(x[i], knn_dis[i][k], p=np.inf)) - 1)
        information_samples[i] += -digamma(
            len(tree_y.query_ball_point(y[i], knn_dis[i][k], p=np.inf)) - 1)

    return np.mean(information_samples)
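# Illustrative usage sketch (not part of the original module): estimates the MI of two
# dependent Gaussian variables with `mi`. The `_demo_mi` helper and its toy data are
# hypothetical; it assumes numpy is available as `np`, as elsewhere in this file.
def _demo_mi(n=500, seed=0):
    rng = np.random.RandomState(seed)  # reproducible toy data
    x = rng.randn(n, 1)                # first variable, one observation per row
    y = x + 0.5 * rng.randn(n, 1)      # y depends on x, so the estimate should be clearly positive
    return mi(x, y, k=5)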
def alternate_umi(x, y, k=5, density_estimation_method="kde", k_density=5, bw=.2):
    # Alternative formulation of the uniformed mutual information estimator: samples are
    # reweighted by the inverse density of x (KDE or kNN based) so that x is effectively
    # uniform, and the KSG-style correction terms are accumulated with those weights.
    assert len(x) == len(y), "Lists should have same length"
    assert k <= len(x) - 1, "Set k smaller than num. samples - 1"
    N = len(x)
    dx = len(x[0])
    dy = len(y[0])

    data = np.concatenate((x, y), axis=1)

    tree_xy = ss.cKDTree(data)
    tree_x = ss.cKDTree(x)
    tree_y = ss.cKDTree(y)

    if density_estimation_method.lower() == "kde":
        kernel = KernelDensity(bandwidth=bw)
        kernel.fit(x)
        kde = np.exp(kernel.score_samples(x))
        weight = (1 / kde) / np.mean(1 / kde)
    elif density_estimation_method.lower() == "knn":
        knn_dis = [
            tree_x.query(point, k_density + 1, p=np.inf)[0][k_density]
            for point in x
        ]
        density_estimate = np.array([
            float(k_density) / N / knn_dis[i]**dx for i in range(len(knn_dis))
        ])
        weight = (1 / density_estimate) / np.mean(1 / density_estimate)
    else:
        raise ValueError("The density estimation method is not recognized")

    knn_dis = [tree_xy.query(point, k + 1, p=2)[0][k] for point in data]

    ans = log(k) + 2 * log(N - 1) - digamma(N) + vd(dx) + vd(dy) - vd(dx + dy)

    # weight_y = np.zeros(N)
    # for i in range(N):
    #     weight_y[i] = np.sum(weight[j] for j in tree_y.query_ball_point(y[i], knn_dis[i], p=2)) - weight[i]
    # weight_y *= N / np.sum(weight_y)

    for i in range(N):
        nx = len(tree_x.query_ball_point(x[i], knn_dis[i], p=2)) - 1
        ny = len(tree_y.query_ball_point(y[i], knn_dis[i], p=2)) - 1
        ans += -weight[i] * log(nx) / N
        ans += -weight[i] * log(ny) / N
        # for j in tree_y.query_ball_point(y[i], knn_dis[i], p=2):
        #     ans += -weight[j] * log(weight[j]) / N / ny
        # ans += -weight[i] * log(weight[i]) / N

    return ans
def entropy(x, k=5):
    """Estimates the entropy of a continuous random variable.

    `entropy` takes a continuous random variable and estimates its entropy using the
    KSG estimator. It relies on the cKDTree class in scipy to query the k-nearest
    neighbors with the KDTree algorithm.

    Arguments
    ---------
        x: `np.ndarray`
            Data matrix used for calculating the entropy.
        k: `int` (Default: 5)
            Number of nearest neighbors used in the entropy calculation.

    Returns
    -------
        A numeric value of the entropy estimate.
    """
    N = len(x)  # the number of observed samples
    # k = int(np.floor(np.sqrt(N)))
    d = len(x[0])  # the number of dimensions of the data

    tree = ss.cKDTree(x)  # kd-tree for quick nearest-neighbor lookup
    knn_dis = [tree.query(point, k + 1, p=np.inf)[0][k]
               for point in x]  # distance to the kth nearest neighbor for all points
    ans = -digamma(k) + digamma(N)

    # wrap map() in a list so np.mean works under Python 3
    return ans + d * np.mean(list(map(log, knn_dis)))
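# Illustrative usage sketch (not part of the original module): the KSG entropy of a
# standard 1-D Gaussian should be close to the analytic value 0.5 * log(2 * pi * e),
# about 1.42 nats. `_demo_entropy` and its toy data are hypothetical.
def _demo_entropy(n=1000, seed=0):
    rng = np.random.RandomState(seed)  # reproducible toy data
    x = rng.randn(n, 1)                # standard Gaussian samples, one per row
    return entropy(x, k=5)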
def sc(x, y, k=5, bw=0.2, init_weight_option=1, eta=0.5, lamb=100, T=10,
       method="grad", regularization_type="0", th=1e-3):
    # Experimental estimator that optimizes the per-sample weights of a weighted KSG
    # objective by (stochastic) gradient ascent, with optional regularization and a
    # projection step that preserves the weighted second moment of x.
    def get_obj(adj_x, adj_y, weight):
        N = len(weight)
        ans = 0
        for i in range(N):
            nx = len(adj_x[i]) - 1
            ny = sum(weight[j] for j in adj_y[i]) - weight[i]
            ans += -weight[i] * log(nx) / N
            ans += -weight[i] * log(ny) / N
        return ans

    def get_stoch_grad(adj_x, adj_y, weight, i):
        N = len(weight)
        ans = np.zeros(N)
        nx = len(adj_x[i]) - 1
        ny = sum(weight[j] for j in adj_y[i]) - weight[i]
        for j in adj_y[i]:
            ans[j] += -weight[i] / (ny * N)
        ans[i] += -(log(nx) + log(ny)) / N  # + weight[i] / (ny * N)
        return ans * np.sqrt(N)

    def get_grad(adj_x, adj_y, weight):
        N = len(weight)
        ans = np.zeros(N)
        for i in range(N):
            nx = len(adj_x[i]) - 1
            ny = sum(weight[j] for j in adj_y[i]) - weight[i]
            for j in adj_y[i]:
                ans[j] += -weight[i] / (ny * N)
            ans[i] += -(log(nx) + log(ny)) / N  # + weight[i] / (ny * N)
        return ans

    assert len(x) == len(y), "Lists should have same length"
    assert k <= len(x) - 1, "Set k smaller than num. samples - 1"
    # solvers.options['show_progress'] = False
    N = len(x)

    # Sorting x and y based on x
    unsorted_xy = sorted(zip(x, y))
    x = [i for i, _ in unsorted_xy]
    y = [i for _, i in unsorted_xy]

    data = np.concatenate((x, y), axis=1)

    tree_xy = ss.cKDTree(data)
    tree_x = ss.cKDTree(x)
    tree_y = ss.cKDTree(y)

    knn_dis = [tree_xy.query(point, k + 1, p=np.inf)[0][k] for point in data]
    adj_x = []
    adj_y = []
    for i in range(N):
        adj_x.append(tree_x.query_ball_point(x[i], knn_dis[i], p=np.inf))
        adj_y.append(tree_y.query_ball_point(y[i], knn_dis[i], p=np.inf))

    if init_weight_option == 0:
        weight = np.ones(N) + nr.normal(0, 0.1, N)
    else:
        k_density = 5
        dx = len(x[0])
        knn_dis = [
            tree_x.query(point, k_density + 1, p=np.inf)[0][k_density]
            for point in x
        ]
        density_estimate = np.array([
            float(k_density) / N / knn_dis[i]**dx for i in range(len(knn_dis))
        ])
        weight = (1 / density_estimate) / np.mean(1 / density_estimate)
        # kernel = KernelDensity(bandwidth=bw)
        # kernel.fit(x)
        # kde = np.exp(kernel.score_samples(x))
        # weight = (1 / kde).clip(1e-8, np.sqrt(N))
        # weight = weight / np.mean(weight)

    A = np.zeros(N)
    b = 0
    for i in range(N):
        A[i] = (x[i] - np.mean(x))**2
        b += weight[i] * A[i]

    ans = digamma(k) + 2 * log(N - 1) - digamma(N)  # + vd(dx) + vd(dy) - vd(dx + dy)

    for i in range(T):
        if method == "grad":
            gradient = get_grad(adj_x, adj_y, weight)
        elif method == "stoch_grad":
            ind = nr.randint(N)
            gradient = get_stoch_grad(adj_x, adj_y, weight, ind)
        else:
            raise ValueError("Cannot recognize the method")
        # gradient = gradient / np.linalg.norm(gradient)
        # print(np.linalg.norm(gradient))
        weight = weight + eta * gradient
        if regularization_type == "1":
            weight = weight - eta * lamb * d_regularizer(weight)
        elif regularization_type == "2":
            weight = d_regularizer_2(weight)
        elif regularization_type == "3":
            weight = d_regularizer_3(weight)
        weight = projection(weight, A, b)
        print(ans + get_obj(adj_x, adj_y, weight))

    print(weight)
    plt.plot(weight)
    plt.show()

    return ans + get_obj(adj_x, adj_y, weight)
def cumi(x_orig, y_orig, z_orig, normalization=False, k=5,
         density_estimation_method="kde", k_density=5, bw=.01):
    """Calculates the uniformed conditional mutual information where the distribution for
    :math:`x` and :math:`z` is replaced by a uniform distribution.

    `cumi` takes two random variables :math:`x` and :math:`y` and estimates their mutual
    information conditioned on the third random variable :math:`z` using the KSG estimator,
    while the joint distribution of :math:`x, z` is replaced by a uniform distribution.

    Arguments
    ---------
        x_orig: `List`
            One random variable from the time-series data.
        y_orig: `List`
            Another random variable from the time-series data.
        z_orig: `List`
            Another random variable from the time-series data.
        normalization: `bool` (Default: False)
            Whether to normalize the expression of :math:`x, y, z` by their standard deviation.
        k: `int` (Default: 5)
            Number of nearest neighbors used in the entropy calculation.
        density_estimation_method: `str` (Default: `kde`)
            Which 2D density estimator to use: `kde` is a kernel density estimator while
            `knn` is a knn based estimator.
        k_density: `int` (Default: 5)
            The number of k nearest neighbors used when calculating the density (only
            applicable when density_estimation_method is set to `knn`).
        bw: `float` (Default: 0.01)
            Bandwidth used for the kernel density estimator.

    Returns
    -------
        An estimated conditional mutual information value between two variables (x, y),
        conditioning on a third variable z, where the distribution for x, z is replaced
        by a uniform distribution.
    """
    x = deepcopy(x_orig)
    y = deepcopy(y_orig)
    z = deepcopy(z_orig)

    assert len(x) == len(y), "Lists should have same length"
    assert len(x) == len(z), "Lists should have same length"

    N = len(x)
    dx = len(x[0])
    dy = len(y[0])
    dz = len(z[0])

    if normalization:
        x /= np.std(x)
        y /= np.std(y)
        z /= np.std(z)

    data_xyz = np.concatenate((x, y, z), axis=1)
    data_xz = np.concatenate((x, z), axis=1)
    data_yz = np.concatenate((y, z), axis=1)

    tree_xyz = ss.cKDTree(data_xyz)
    tree_xz = ss.cKDTree(data_xz)
    tree_yz = ss.cKDTree(data_yz)
    tree_z = ss.cKDTree(z)

    if density_estimation_method.lower() == "kde":
        kernel = KernelDensity(bandwidth=bw)
        kernel.fit(data_xz)
        kde = np.exp(kernel.score_samples(data_xz))
        weight = (1 / kde) / np.mean(1 / kde)
    elif density_estimation_method.lower() == "knn":
        knn_dis = [
            tree_xz.query(point, k_density + 1, p=np.inf)[0][k_density]
            for point in data_xz
        ]
        density_estimate = np.array([
            float(k_density) / N / knn_dis[i]**(dx + dz)
            for i in range(len(knn_dis))
        ])
        weight = (1 / density_estimate) / np.mean(1 / density_estimate)
    else:
        raise ValueError("The density estimation method is not recognized")

    knn_dis = [
        tree_xyz.query(point, k + 1, p=np.inf)[0][k] for point in data_xyz
    ]
    information_samples = [0 for i in range(N)]
    for i in range(N):
        information_samples[i] += weight[i] * digamma(
            len(tree_xyz.query_ball_point(data_xyz[i], knn_dis[i], p=np.inf)) - 1)
        information_samples[i] += weight[i] * -digamma(
            len(tree_xz.query_ball_point(data_xz[i], knn_dis[i], p=np.inf)) - 1)
        information_samples[i] += weight[i] * -digamma(
            sum(weight[j] for j in tree_yz.query_ball_point(
                data_yz[i], knn_dis[i], p=np.inf)) - weight[i])
        information_samples[i] += weight[i] * digamma(
            sum(weight[j]
                for j in tree_z.query_ball_point(z[i], knn_dis[i], p=np.inf)) -
            weight[i])

    return np.mean(information_samples)
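# Illustrative usage sketch (not part of the original module): x drives both y and z,
# so the uniformed conditional MI of (x, y) given z should remain positive. The
# `_demo_cumi` helper and its toy data are hypothetical; the knn density option is
# used here to avoid choosing a kernel bandwidth.
def _demo_cumi(n=500, seed=0):
    rng = np.random.RandomState(seed)  # reproducible toy data
    x = rng.randn(n, 1)                # driver variable
    z = x + 0.5 * rng.randn(n, 1)      # noisy copy of x used as the condition
    y = x + 0.5 * rng.randn(n, 1)      # target variable, directly driven by x
    return cumi(x, y, z, k=5, density_estimation_method="knn")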
def umi(x, y, k=5, density_estimation_method="kde", k_density=5, bw=.01):
    """Calculates the uniformed mutual information where the distribution for :math:`x`
    is replaced by a uniform distribution.

    `umi` takes two random variables x and y and estimates their mutual information
    using the KSG estimator while x is replaced by a uniform distribution.

    Arguments
    ---------
        x: `List`
            One random variable from the time-series data.
        y: `List`
            Another random variable from the time-series data.
        k: `int` (Default: 5)
            Number of nearest neighbors used in the entropy calculation.
        density_estimation_method: `str` (Default: `kde`)
            Which 2D density estimator to use: `kde` is a kernel density estimator while
            `knn` is a knn based estimator.
        k_density: `int` (Default: 5)
            The number of k nearest neighbors used when calculating the density (only
            applicable when density_estimation_method is set to `knn`).
        bw: `float` (Default: 0.01)
            Bandwidth used for the kernel density estimator.

    Returns
    -------
        An estimated uniform mutual information value between two variables (x, y) where
        the distribution for x is replaced by a uniform distribution.
    """
    assert len(x) == len(y), "Lists should have same length"
    assert k <= len(x) - 1, "Set k smaller than num. samples - 1"

    N = len(x)
    dx = len(x[0])
    dy = len(y[0])

    data = np.concatenate((x, y), axis=1)

    tree_xy = ss.cKDTree(data)
    tree_x = ss.cKDTree(x)
    tree_y = ss.cKDTree(y)

    if density_estimation_method.lower() == "kde":
        kernel = KernelDensity(bandwidth=bw)
        kernel.fit(x)
        kde = np.exp(kernel.score_samples(x))
        weight = (1 / kde) / np.mean(1 / kde)
    elif density_estimation_method.lower() == "knn":
        knn_dis = [
            tree_x.query(point, k_density + 1, p=np.inf)[0][k_density]
            for point in x
        ]
        density_estimate = np.array([
            float(k_density) / N / knn_dis[i]**dx for i in range(len(knn_dis))
        ])
        weight = (1 / density_estimate) / np.mean(1 / density_estimate)
    else:
        raise ValueError("The density estimation method is not recognized")

    knn_dis = [tree_xy.query(point, k + 1, p=2)[0][k] for point in data]

    ans = digamma(k) + 2 * log(N - 1) - digamma(N) + vd(dx) + vd(dy) - vd(dx + dy)

    weight_y = np.zeros(N)
    for i in range(N):
        weight_y[i] = sum(weight[j] for j in tree_y.query_ball_point(
            y[i], knn_dis[i], p=2)) - weight[i]
    weight_y *= N / np.sum(weight_y)

    for i in range(N):
        nx = len(tree_x.query_ball_point(x[i], knn_dis[i], p=2)) - 1
        ny = sum(weight[j] for j in tree_y.query_ball_point(
            y[i], knn_dis[i], p=2)) - weight[i]
        ans += -weight[i] * log(nx) / N
        # ans += -ny * log(ny) / N / (len(tree_y.query_ball_point(y[i], knn_dis[i], p=2)) - 1)
        ans += -weight[i] * log(ny) / N

    return ans
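# Illustrative usage sketch (not part of the original module): `umi` reweights samples so
# that x is effectively uniform; here x has a skewed (exponential) marginal. `_demo_umi`
# and its toy data are hypothetical, and the call assumes the module's `vd` helper (used
# inside `umi`) is defined elsewhere in this file.
def _demo_umi(n=500, seed=0):
    rng = np.random.RandomState(seed)  # reproducible toy data
    x = rng.exponential(size=(n, 1))   # skewed marginal for x
    y = x + 0.5 * rng.randn(n, 1)      # y depends on x
    return umi(x, y, k=5, density_estimation_method="knn")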
def cmi(x_orig, y_orig, z_orig, normalization=False, k=5):
    """Estimates the conditional mutual information of :math:`x` and :math:`y` given :math:`z`.

    `cmi` takes two random variables :math:`x` and :math:`y` and estimates their mutual
    information conditioned on the third random variable :math:`z` using the KSG estimator.
    It relies on the cKDTree class in scipy to query the k-nearest neighbors with the
    KDTree algorithm.

    Arguments
    ---------
        x_orig: `List`
            One random variable from the time-series data.
        y_orig: `List`
            Another random variable from the time-series data.
        z_orig: `List`
            Condition random variable for variables (:math:`x, y`) from the time-series data.
        normalization: `bool` (Default: False)
            Whether to normalize the expression of :math:`x, y, z` by their standard deviation.
        k: `int` (Default: 5)
            Number of nearest neighbors used in the entropy calculation.

    Returns
    -------
        A numeric value of the conditional mutual information estimate.
    """
    x = deepcopy(x_orig)
    y = deepcopy(y_orig)
    z = deepcopy(z_orig)

    assert len(x) == len(y), "Lists should have same length"
    assert len(x) == len(z), "Lists should have same length"

    N = len(x)
    dx = len(x[0])
    dy = len(y[0])
    dz = len(z[0])

    if normalization:
        x /= np.std(x)
        y /= np.std(y)
        z /= np.std(z)

    data_xyz = np.concatenate((x, y, z), axis=1)
    data_xz = np.concatenate((x, z), axis=1)
    data_yz = np.concatenate((y, z), axis=1)

    tree_xyz = ss.cKDTree(data_xyz, balanced_tree=False)
    tree_xz = ss.cKDTree(data_xz, balanced_tree=False)
    tree_yz = ss.cKDTree(data_yz, balanced_tree=False)
    tree_z = ss.cKDTree(z, balanced_tree=False)

    # knn_dis = [tree_xyz.query(point, k + 1, p=np.inf)[0][k] for point in data_xyz]
    knn_dis = tree_xyz.query(data_xyz, k + 1, p=np.inf)[0][:, k]
    information_samples = np.zeros(N)

    for i in range(N):
        information_samples[i] += digamma(
            len(tree_xyz.query_ball_point(data_xyz[i], knn_dis[i], p=np.inf)) - 1)
        information_samples[i] += -digamma(
            len(tree_xz.query_ball_point(data_xz[i], knn_dis[i], p=np.inf)) - 1)
        information_samples[i] += -digamma(
            len(tree_yz.query_ball_point(data_yz[i], knn_dis[i], p=np.inf)) - 1)
        information_samples[i] += digamma(
            len(tree_z.query_ball_point(z[i], knn_dis[i], p=np.inf)) - 1)

    return np.mean(information_samples)
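# Illustrative usage sketch (not part of the original module): in the chain x -> z -> y,
# y depends on x only through z, so the conditional MI of (x, y) given z should be close
# to zero. `_demo_cmi` and its toy data are hypothetical.
def _demo_cmi(n=500, seed=0):
    rng = np.random.RandomState(seed)  # reproducible toy data
    x = rng.randn(n, 1)                # source variable
    z = x + 0.5 * rng.randn(n, 1)      # intermediate variable
    y = z + 0.5 * rng.randn(n, 1)      # y depends on x only through z
    return cmi(x, y, z, k=5)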