/
tree_entropy.py
79 lines (67 loc) · 2.3 KB
/
tree_entropy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import numpy as np
import warnings
from scipy.sparse.csgraph import minimum_spanning_tree, connected_components
from sklearn.metrics import euclidean_distances
def tree_information(X, y):
    """Compute the mutual information objective using an MST entropy estimate.

    For every cluster ``c`` that holds more than one sample, the term
    ``n_c * ((n_features - 1) * log(n_c) - n_features * log(L_c))`` is
    accumulated, where ``n_c`` is the cluster size and ``L_c`` is the value
    returned by ``spanning_tree_length`` for the cluster's points. The
    accumulated sum is divided by the total number of samples.

    Parameters
    ----------
    X: numpy array, shape=[n_samples, n_features]
        datapoints
    y: numpy array, shape=[n_samples], dtype=int
        cluster assignments

    Returns
    -------
    float
        The accumulated per-cluster terms divided by ``n_samples``.
        Clusters with a single sample contribute nothing. ``0.0`` is
        returned when ``X`` has no rows.

    Warns
    -----
    UserWarning
        If the spanning tree length of a cluster is zero; ``1e-10`` is then
        substituted so that the logarithm stays finite.
    """
    n_samples, n_features = X.shape
    if n_samples == 0:
        # Nothing to average over; avoids a ZeroDivisionError below.
        return 0.0

    entropy = 0
    for c in np.unique(y):
        X_c = X[y == c]
        n_samples_c = X_c.shape[0]
        if n_samples_c < 2:
            # A single point has no spanning tree; skip it.
            continue
        L = spanning_tree_length(X_c)
        if L == 0:
            warnings.warn("L is zero. This means there are identical points in"
                          " the dataset")
            # Small positive stand-in so np.log(L) does not return -inf.
            L = 1e-10
        entropy += n_samples_c * ((n_features - 1) * np.log(n_samples_c)
                                  - n_features * np.log(L))
    return entropy / n_samples
def spanning_tree_length(X):
    """Compute the length of the euclidean MST of X.

    Duplicate rows are removed before the distance matrix is built. The
    dense matrix handed to ``minimum_spanning_tree`` encodes "no edge" as
    zero, so two identical points would otherwise not be joined by a
    zero-length edge and each copy would be attached through a separate
    positive-length edge, inflating the result. Dropping duplicates leaves
    the true MST length unchanged because the edges between identical
    points have length zero.

    Parameters
    ----------
    X: ndarray, shape=[n_samples, n_features]

    Returns
    -------
    number
        Sum of the edge weights of the minimum spanning tree; ``0`` when
        ``X`` holds fewer than two distinct rows.
    """
    if X.shape[0] < 2:
        return 0

    # Index of the first occurrence of every distinct row.
    _, first_seen = np.unique(X, axis=0, return_index=True)
    if first_seen.shape[0] < X.shape[0]:
        # Keep first occurrences in their original order so that input
        # without duplicates is passed through untouched.
        X = X[np.sort(first_seen)]
        if X.shape[0] < 2:
            return 0

    return minimum_spanning_tree(euclidean_distances(X)).sum()
def tree_information_sparse(forest, n_features):
    """Compute the mutual information objective from a forest.

    The forest is symmetrised and split into connected components. Every
    component with at least two nodes contributes
    ``n_c * ((n_features - 1) * log(n_c) - n_features * log(L_c))``, where
    ``n_c`` is the number of nodes and ``L_c`` is the sum of the entries of
    ``forest`` restricted to the component. Unlike ``tree_information``,
    the result is not divided by the number of samples.

    Parameters
    ----------
    forest: sparse matrix
        graph containing trees representing cluster
    n_features: int
        dimensionality of input space.

    Returns
    -------
    number
        The accumulated objective. If the forest contains a lonely node
        (a node without any edge), ``1e10`` is subtracted once and the
        lonely nodes contribute nothing else.

    Warns
    -----
    UserWarning
        If the edge weights of a component with at least two nodes sum to
        zero; ``1e-10`` is then substituted so the logarithm stays finite.
    """
    entropy = 0
    sym_forest = forest + forest.T
    n_components, components = connected_components(sym_forest)

    # connected_components labels every node with a non-negative component
    # index, so a lonely node shows up as a component of size one rather
    # than as a negative label. The label check is kept as a guard only.
    has_lonely_node = bool(np.any(components < 0))

    for i in range(n_components):
        inds = np.where(components == i)[0]
        n_samples_c = len(inds)
        if n_samples_c < 2:
            has_lonely_node = True
            continue
        # Restrict the forest to the rows and columns of this component.
        subforest = forest[inds[:, np.newaxis], inds]
        L = subforest.sum()
        if L == 0:
            warnings.warn("L is zero. This means there are identical points in"
                          " the dataset")
            # Small positive stand-in so np.log(L) does not return -inf.
            L = 1e-10
        entropy += (n_samples_c * ((n_features - 1) * np.log(n_samples_c)
                                   - n_features * np.log(L)))

    if has_lonely_node:
        # there is a lonely node: apply a large penalty once
        entropy -= 1e10
    return entropy