def v_measures(m):
    """Return the V-measure elements (vm, vh, vc) of a contingency matrix m.

    A lot slower than the cython'd version; here so the code is
    self-sufficient for now.
    """
    np.seterr(all='ignore')  # avoid annoying warnings
    c_size, k_size = m.shape
    sum_m = float(m.sum())
    gold_counts = m.sum(1)
    h_c = entropy(gold_counts) / math.log(2)   # entropy of classes (gold)
    found_counts = m.sum(0)
    h_k = entropy(found_counts) / math.log(2)  # entropy of clusters (estimated)
    h_c_k = 0  # H(C|K)
    h_k_c = 0  # H(K|C)
    for i in range(c_size):
        for j in range(k_size):
            if m[i, j] > 0:
                p_ij = m[i, j] / sum_m
                h_c_k += p_ij * math.log(m[i, j] / found_counts[j], 2)
                h_k_c += p_ij * math.log(m[i, j] / gold_counts[i], 2)
    vh = 1 if (h_c == 0) else 1 - (-h_c_k / h_c)
    vc = 1 if (h_k == 0) else 1 - (-h_k_c / h_k)
    vm = (2 * vh * vc) / (vh + vc)
    return (vm, vh, vc)
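# Minimal usage sketch (not from the original module): builds a tiny contingency
# matrix and calls v_measures() above. It assumes `import numpy as np`, `import math`,
# and `from scipy.stats import entropy` are in scope, as the function requires; the
# helper name and values are illustrative only. Float counts avoid integer division
# inside the inner loop.
def _demo_v_measures():
    # rows = gold classes, columns = found clusters
    m_example = np.array([[5.0, 1.0],
                          [1.0, 5.0]])
    return v_measures(m_example)  # -> (vm, vh, vc)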
def info_loss(confidences, *args):
    """ Return the loss defined by the increase in entropy.

    Parameters
    ----------
    confidences : (N, K) or (K,) ndarray of float
    args : None
        To maintain compatibility with other loss functions, we accept
        additional arguments and ignore them.

    Returns
    -------
    losses : (N,) ndarray of float, or float
    """
    # guard against nan and -0
    if confidences.ndim == 1:
        scores = entropy(confidences) / np.log(confidences.shape[0])
        if np.isnan(scores):
            return 1
    else:
        scores = entropy(confidences.T) / np.log(confidences.shape[1])
        scores[np.isnan(scores)] = 1
    return np.maximum(0, scores)
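# Sanity-check sketch (not from the original module) for info_loss() above, assuming
# `import numpy as np` and `from scipy.stats import entropy` are in scope. The helper
# name and sample rows are illustrative only.
def _demo_info_loss_check():
    # A one-hot row has zero normalized entropy (loss ~0), a uniform row has maximal
    # normalized entropy (loss 1), and an all-zero row is mapped to 1 by the nan guard.
    conf = np.array([[1.0, 0.0, 0.0],
                     [1.0 / 3, 1.0 / 3, 1.0 / 3],
                     [0.0, 0.0, 0.0]])
    return info_loss(conf)  # -> approximately [0., 1., 1.]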
def script2(data=None, K=3, forest_size=6, numForests=5):
    '''
    Script to compute how correlated errors are between different forests (KDT, PF).
    A forest of KD-Trees and a Proximity Forest are constructed that have approximately
    equal predictive power (this happens on SIFT data with 3NN queries at about 6 trees).
    Errors are correlated between successive forests (of the same size) to determine
    whether proximity forests have more diversity than KDT forests.
    @param data: A tuple (g,p) where g is a list of gallery points and p is a list of
    probe points. If set to None, then the Sift10K data will be used.
    @param K: How many neighbors to return for each probe?
    @param forest_size: The forest size to use when testing error correlation between forests.
    @param numForests: How many forests of each type to construct.
    @return: Tuple (Err_KD, Err_PF) where Err_KD/PF are the matrices of errors, n x p,
    where n is the number of query points and p is the number of trees.
    '''
    if data is None:
        (g, p) = loadSift10K()
    else:
        (g, p) = data

    resT, _ = exact_knn(g, p, K=K)

    print "===================================="
    print "Computing errors using KD-Trees"
    print "===================================="
    errM_KD = KDTForest_ErrorCor(g, p, resT, K=K, forest_size=forest_size,
                                 numForests=numForests, binary_score=True)
    print ""
    print "===================================="
    print "Computing errors using Proximity Trees"
    print "===================================="
    errM_PF = ProxForest_ErrorCor(g, p, resT, K=K, forest_size=forest_size,
                                  numForests=numForests, binary_score=True)
    print ""
    print "===================================="
    print "Error correlations: KD-Trees"
    print "===================================="
    rc1 = avg_corr(errM_KD)
    print "===================================="
    print "Error correlations: Proximity Trees"
    print "===================================="
    rc2 = avg_corr(errM_PF)
    print ""
    print "===================================="
    print "Computing error entropy..."
    print "===================================="
    X_KD = sp.sum(errM_KD, axis=1)
    X_PF = sp.sum(errM_PF, axis=1)
    H_KD = entropy(X_KD[X_KD > 0])
    H_PF = entropy(X_PF[X_PF > 0])
    print "Entropy of KD_Tree Errors: %3.3f" % H_KD
    print "Entropy of Prox Tree Errors: %3.3f" % H_PF

    return (errM_KD, errM_PF)
def script1(data=None, K=1, numTrees=20):
    '''
    Script to compute how correlated errors are from different trees of
    (KD-Tree / Proximity) forests.
    @param data: A tuple (g,p) where g is a list of gallery points and p is a list of
    probe points. If set to None, then the Sift10K data will be used.
    @param K: How many neighbors to return for each probe?
    @param numTrees: How many trees to test?
    @return: Tuple (Err_KD, Err_PF) where Err_KD/PF are the matrices of errors, n x p,
    where n is the number of query points and p is the number of trees.
    '''
    if data is None:
        (g, p) = loadSift10K()
    else:
        (g, p) = data

    resT, _ = exact_knn(g, p, K=K)

    print "===================================="
    print "Computing errors using KD-Trees"
    print "===================================="
    errM_KD = KDTree_ErrorCor(g, p, resT, K=K, numTrees=numTrees, binary_score=True)
    print ""
    print "===================================="
    print "Computing errors using Proximity Trees"
    print "===================================="
    errM_PF = ProxTree_ErrorCor(g, p, resT, K=K, numTrees=numTrees, binary_score=True)
    print ""
    print "===================================="
    print "Error correlations: KD-Trees"
    print "===================================="
    rc1 = avg_corr(errM_KD)
    print "===================================="
    print "Error correlations: Proximity Trees"
    print "===================================="
    rc2 = avg_corr(errM_PF)
    print ""
    print "===================================="
    print "Computing error entropy..."
    print "===================================="
    X_KD = sp.sum(errM_KD, axis=1)
    X_PF = sp.sum(errM_PF, axis=1)
    H_KD = entropy(X_KD[X_KD > 0])
    H_PF = entropy(X_PF[X_PF > 0])
    print "Entropy of KD_Tree Errors: %3.3f" % H_KD
    print "Entropy of Prox Tree Errors: %3.3f" % H_PF

    return (errM_KD, errM_PF)
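# Hedged sketch (not from the original module) of the kind of summary avg_corr()
# presumably computes over an n x p binary error matrix in script1/script2 above:
# the mean pairwise correlation between the per-tree error columns. The helper name
# `_demo_avg_corr` and its exact definition are assumptions for illustration only.
def _demo_avg_corr(errM):
    import numpy as np
    C = np.corrcoef(errM, rowvar=False)  # p x p correlation between tree error columns
    iu = np.triu_indices_from(C, k=1)    # upper-triangle, off-diagonal pairs
    return np.nanmean(C[iu])             # average pairwise error correlation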
def _computeEntropy(self):
    '''
    Compute the entropy of the histogram of distances from the pivot element.
    A low entropy score means the distribution is concentrated, and thus the node
    may be a good candidate for splitting. High entropy (at the limit, a uniform
    distribution) may indicate that there is no good separation of the elements
    in this node.
    '''
    assert self.Pivot is not None
    assert self.Ds is not None
    assert len(self.Ds) >= self.Tau
    assert len(self.Ds) == len(self.items)

    # Create a list of distances not including the sample which was selected as the
    # pivot...which will have a distance of zero, within numerical errors.
    Dx = [D for D in self.Ds if D > 0.01]

    # Compute a histogram of the Dx list using 10 bins.
    HistInfo = scipy.histogram(Dx, bins=10)
    pk = scipy.array(HistInfo[0])
    epsilon = 0.000001
    H = entropy(pk + epsilon)  # avoids log(0) warnings
    #print "Histogram: ", HistInfo[0]
    #print "Entropy: %f"%H
    #print "Range: Min(>0)=%f, Max=%f, Mean=%f, Median=%f"%(min(Dx),max(self.Ds),scipy.mean(self.Ds),scipy.median(self.Ds))
    return H
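# Standalone sketch (not from the original class) of the same histogram-entropy
# criterion, assuming `import numpy as np` and `from scipy.stats import entropy`.
# It uses np.histogram in place of the older scipy.histogram alias; the helper
# name and parameters are illustrative only.
def _demo_distance_histogram_entropy(distances, n_bins=10, epsilon=1e-6):
    counts, _ = np.histogram([d for d in distances if d > 0.01], bins=n_bins)
    return entropy(counts + epsilon)  # epsilon avoids log(0) warnings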
def js_div_matrix(a):
    """Return the matrix of pairwise Jensen-Shannon divergences (in nats) between
    the rows of a, which are assumed to be probability distributions."""
    a = array(a)
    W = zeros((a.shape[0], a.shape[0]))
    # e[i] is the negative entropy of row i
    e = -entropy(a.transpose())
    for i in range(a.shape[0]):
        val_range = range(i + 1, a.shape[0])
        sumAB = tile(a[i, :], (a.shape[0] - i - 1, 1)) + a[val_range, :]
        result = 0.5 * (e[i] + e[val_range]
                        - sum(sumAB * nan_to_num(log(sumAB / 2)), 1))
        W[val_range, i] = result
        W[i, val_range] = result
    return W
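# Direct two-distribution check (not from the original module) for js_div_matrix()
# above, assuming `import numpy as np` and `from scipy.stats import entropy`.
# For normalized rows p and q, the entry W[i, j] should match
# H((p + q) / 2) - 0.5 * H(p) - 0.5 * H(q), computed in nats.
def _demo_js_divergence(p, q):
    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
    m = 0.5 * (p + q)
    return entropy(m) - 0.5 * entropy(p) - 0.5 * entropy(q)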
def train(X, y, y_train, labels, graph, output_file, top_n=100):
    indices = np.arange(len(y))
    total_promoted, epoch = 0, 1
    outputs = [['Epoch 0']]

    # Seed the output with the initially labeled nodes of each class.
    for _id, label in enumerate(labels):
        chunks = [label]
        current_output = []
        for i in indices[y_train == _id]:
            node = graph.entities.get_node(i)
            chunks.append(node)
            current_output.append(node)
            total_promoted += 1
        outputs.append([label] + current_output)
        print('\t'.join(chunks))
    print_state(epoch, labels, total_promoted, y_train)

    while total_promoted < 5522:  # 19875: # 5522:
        outputs.append(['Epoch %d' % epoch])
        model = label_propagation.LabelPropagation(kernel='knn', tol=0.01,
                                                   max_iter=2000, n_jobs=16)
        model.fit(X, y_train)
        predictions = model.transduction_
        # Lower entropy of a label distribution means a more confident prediction.
        confidences = entropy(model.label_distributions_.T)
        for _id, label in enumerate(labels):
            mask = np.logical_and(predictions == _id, y_train == -1)
            ii = indices[mask]
            cc = confidences[mask]
            # Promote the top_n most confident unlabeled predictions for this class.
            promoted = ii[np.argsort(cc)][:top_n]
            y_train[promoted] = _id
            chunks = [label]
            current_output = []
            for i in promoted:
                node = graph.entities.get_node(i)
                chunks.append(node)
                current_output.append(node)
            print('\t'.join(chunks))
            total_promoted += len(promoted)
            outputs.append([label] + current_output)
        print_state(epoch, labels, total_promoted, predictions)
        epoch += 1

    with open(output_file, 'w') as f:
        for line in pre_func(outputs):
            f.write(line + '\n')
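# Small sketch (not from the original module) of the confidence ranking used in the
# promotion step above, assuming `import numpy as np` and
# `from scipy.stats import entropy`. The helper name and distributions are
# illustrative only.
def _demo_confidence_ranking():
    # Lower entropy of a sample's label distribution means a more confident
    # prediction, so argsort puts the most confident samples first -- the same
    # ordering train() uses when promoting nodes.
    label_dist = np.array([[0.98, 0.01, 0.01],   # very confident
                           [0.50, 0.30, 0.20],   # less confident
                           [0.34, 0.33, 0.33]])  # nearly uniform
    return np.argsort(entropy(label_dist.T))     # -> array([0, 1, 2])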
def logRowEntropyNormalization(theMatrix):
    print "...Normalizing Matrix Into LSA-style Log Entropy Values"
    numRows = len(theMatrix[:, 0])
    numCols = len(theMatrix[0, :])
    normMatrix = scipy.zeros([numRows, numCols], float)
    rowEntropies = scipy.zeros([numRows], float)
    for i in range(numRows):
        currentVector = theMatrix[i, :]
        rowEntropies[i] = distributions.entropy(
            currentVector[np.nonzero(currentVector)])
        if rowEntropies[i] == 0:
            print "Warning: row %s had entropy of 0: Setting to 1." % i
            rowEntropies[i] = 1
    theMatrix = theMatrix + 1
    normMatrix = np.log10((theMatrix.transpose() / rowEntropies).transpose())
    return normMatrix
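# Minimal usage sketch (not from the original module) for logRowEntropyNormalization()
# above, assuming `import numpy as np`, `import scipy`, and
# `from scipy.stats import distributions` are in scope as the function requires.
# The helper name and counts are illustrative only.
def _demo_log_entropy_usage():
    # Small co-occurrence count matrix; the third row has a single nonzero count,
    # so its entropy is 0 and the warning branch fires.
    counts = np.array([[10.0, 0.0, 2.0],
                       [3.0, 3.0, 3.0],
                       [0.0, 1.0, 0.0]])
    return logRowEntropyNormalization(counts)  # log10((count + 1) / row entropy)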
def calcEntropy(probMat):
    probMat += 0.0001  # all values need to be > 0.0 to compute entropy
    e = np.zeros((probMat.shape[0]))
    for i in np.arange(probMat.shape[0]):
        e[i] = distributions.entropy(probMat[i, :])
    return e
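# Equivalent vectorized sketch (not from the original module): scipy.stats.entropy
# computes column-wise entropies for a 2-D input, so transposing gives the same
# per-row values as the loop in calcEntropy() above, without the in-place mutation
# of probMat. Assumes `from scipy.stats import entropy`; the helper name is
# illustrative only.
def _demo_calc_entropy_vectorized(probMat):
    return entropy((probMat + 0.0001).T)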
def err_entropy(ErrM):
    # Sum the errors per query point, then compute the entropy of the counts.
    X = sp.sum(ErrM, axis=1)
    return entropy(X)
def _get_label_propagation_max_entropy_index(
        lp: BaseLabelPropagation) -> int:
    # The sample whose label distribution has the highest entropy is the one the
    # model is least certain about.
    entropies = entropy(lp.label_distributions_.T)
    return np.argsort(entropies)[-1]
def get_label_entropies(label_distribution):
    return entropy(label_distribution.T)
if maxInterim > maxValue:
    maxValue = maxInterim
minInterim = min(my_data[:, x])
if minInterim < minValue:
    minValue = minInterim

binWidth = (maxValue - minValue) / numBins
newBins = np.arange(minValue, maxValue, binWidth)

# TODO process array only once for speedup?
for x in range(0, numModels):
    frequency = plt.hist(my_data[:, x], bins=newBins, histtype='step',
                         normed=True, label=labels[x])
    b[x, 0] = mean(my_data[:, x])
    b[x, 1] = var(my_data[:, x])
    b[x, 2] = skew(my_data[:, x])
    b[x, 3] = kurtosis(my_data[:, x])
    b[x, 4] = entropy(frequency[0])

plt.title(csvString + " Frequency")
plt.legend()
deg = u'\N{DEGREE SIGN}'
plt.xlabel("Airflow Rate (cfm)")
plt.ylabel("Frequency")

for i in range(0, 5):
    print(b[:, i])
plt.show()
def entropy(v1):
    """ Computes entropy, sum -p_i * log(p_i) """
    return DISTRIBS.entropy(v1.flatten())
def evaluateClustering(centroids, data, assignments, trueLabelMeans,
                       trueLabelHash, histSize=101):
    '''
    Evaluates a clustering algorithm when the true labels of the data have been given.
    Those labels are contained as mapped values in "trueLabelHash". To evaluate the
    clustering algorithm's accuracy, we follow two approaches. Both rest on the
    observation that it is possible to compute the distance of every centroid to the
    mean values of the true labels, and therefore, for every cluster, to find the
    category mean to which it is closest in vector space:

    Approach #1: We associate each centroid with its closest label and compute the
        clustering quality in terms of misclassification error. In this case, the
        predicted labels are the clusters that examples are assigned to.
    Approach #2: For every cluster, we build a histogram which plots the distribution
        of its points over the ***true*** labels. Clusters whose points' majority true
        label coincides with the label whose mean is closest to the centroid are more
        "accurate" than ones for which this condition does not hold.

    @param centroids: K x D ndarray, representing K centroids in D-space.
    @param data: N x D ndarray, representing the training data X.
    @param assignments: N-sized ndarray, mapping each example to its cluster.
        assignments[i] = k means that the ith example in "data" is mapped to the
        cluster represented by the kth centroid.
    @param trueLabelMeans: |labels| x D ndarray, holding the D-dimensional mean value
        of every class.
    @param trueLabelHash: A hash which maps example indices to their true label.
    @param histSize: integer which represents the size of the histogram to pass to
        "getClusterHistogram". By default it is 101, the number of labels in Caltech101.
    @raise LogicalError: For various cases which have to do with argument sanity checking.
    @raise DatasetError: If provided with no data.
    @return: The misclassification error rate (approach #1), the number of "accurate"
        clusters (approach #2), and the mean entropy of the cluster histograms.
    '''
    if centroids is None or assignments is None or trueLabelMeans is None or trueLabelHash is None:
        raise LogicalError, "Method %s: \"None\" argument(s) provided." % (stack()[0][3])
    if data is None or data.shape[0] == 0 or data.shape[1] == 0:
        raise DatasetError, "Method %s: No training data provided." % (stack()[0][3])
    if histSize is None or histSize <= 0:
        raise LogicalError, "Method %s: histSize parameter should be a positive integer (provided: %s)." % (stack()[0][3], str(histSize))
    if len(trueLabelMeans) != 101:
        raise LogicalError, "Method %s: trueLabelMeans array should have 101 dimensions." % (stack()[0][3])

    # For each centroid, find the category mean it is closest to, then associate this
    # cluster with that mean in a hash.
    # I have tried quite a bit to find an efficient solution to this, and have failed.
    # Instead, I will write an inefficient for-loop-based implementation.
    # Careful: the trueLabelMeans 2D ndarray is zero-indexed, whereas the labels are not!
    closestLabel = dict()
    for i in range(len(centroids)):
        closestLabel[i] = np.array(
            [norm(centroids[i] - mean) for mean in trueLabelMeans]).argmin() + 1

    # Approach #1: Assuming that every assigned cluster is a predicted label, compute
    # the clustering accuracy in terms of misclassification error.
    misclassifiedPoints = 0
    for exIndex in range(data.shape[0]):
        if trueLabelHash[exIndex] != closestLabel[assignments[exIndex]]:
            misclassifiedPoints += 1
    errorRate = 100 * (misclassifiedPoints / float(data.shape[0]))

    # Approach #2: Compute true label count histograms and gauge which clusters are
    # "good". "Good" clusters are closest to the mean of the majority-vote label of
    # their points, as reported by the respective histogram.
    goodCentroids = 0
    histogramEntropies = []
    for i in range(len(centroids)):
        # Get the indices of all the points in the cluster.
        pointsInCluster = [j for j in range(len(assignments)) if assignments[j] == i]
        if len(pointsInCluster) > 0:
            clusterHist, majVoteLabel = getClusterHistogram(
                pointsInCluster, trueLabelHash, histSize)
            histogramEntropies.append(
                entropy([val for val in clusterHist if val > 0]))
            if closestLabel[i] is not None and majVoteLabel == closestLabel[i]:
                goodCentroids += 1

    # Return all metrics to the caller.
    return errorRate, goodCentroids, np.mean(histogramEntropies)
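# Vectorized sketch (not from the original module) of the centroid-to-label assignment
# used in Approach #1 above: each centroid is mapped to the (1-indexed) true label
# whose class mean is nearest in Euclidean distance. Assumes `import numpy as np` and
# `from scipy.spatial.distance import cdist`; the helper name is illustrative only.
def _demo_closest_labels(centroids, trueLabelMeans):
    dists = cdist(centroids, trueLabelMeans)  # K x |labels| distance matrix
    return dists.argmin(axis=1) + 1           # +1 because labels are 1-indexed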