Example #1
0
def v_measures(m):
    """Return the V-measure elements (vm, vh, vc) of a contingency matrix m.

    A lot slower than the cython'd version; kept here so the code is
    self-sufficient for now.
    """
    np.seterr(all='ignore')  # avoid annoying warnings

    c_size, k_size = m.shape
    sum_m = float(m.sum())
    gold_counts = m.sum(1)
    h_c = entropy(gold_counts) / math.log(2)  # entropy of classes (gold)
    found_counts = m.sum(0)
    h_k = entropy(found_counts) / math.log(
        2)  # entropy of clusters (estimated)

    h_c_k = 0  # H(C|K)
    h_k_c = 0  # H(K|C)
    for i in range(c_size):
        for j in range(k_size):
            if m[i, j] > 0:
                p_ij = m[i, j] / sum_m
                h_c_k += p_ij * math.log(m[i, j] / found_counts[j], 2)
                h_k_c += p_ij * math.log(m[i, j] / gold_counts[i], 2)

    vh = 1 if (h_c == 0) else 1 - (-h_c_k / h_c)
    vc = 1 if (h_k == 0) else 1 - (-h_k_c / h_k)
    vm = (2 * vh * vc) / (vh + vc)
    return (vm, vh, vc)
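A minimal usage sketch for the function above (the small contingency matrices are invented for illustration; the imports assume `entropy` is `scipy.stats.entropy`, which the snippet appears to use):

import math
import numpy as np
from scipy.stats import entropy

# Perfect agreement between gold classes and found clusters: every score is 1.
m_perfect = np.array([[10, 0],
                      [0, 10]])
print(v_measures(m_perfect))                 # (1.0, 1.0, 1.0)

# A noisier contingency table gives homogeneity/completeness strictly below 1.
m_noisy = np.array([[8, 2],
                    [1, 9]])
vm, vh, vc = v_measures(m_noisy)
print(vm, vh, vc)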
def info_loss(confidences, *args):
    """
    Return the loss defined by increase in entropy.

    Parameters
    ----------
    confidences: (N, K) or (K,) ndarray of float

    args: None
        To maintain compatibility with other loss functions, we accept
        additional arguments and ignore them.

    Returns
    -------
    losses: (N,) ndarray of float or float
    """
    # guard against nan and -0
    if confidences.ndim == 1:
        scores = entropy(confidences) / np.log(confidences.shape[0])
        if np.isnan(scores):
            return 1
    else:
        scores = entropy(confidences.T) / np.log(confidences.shape[1])
        scores[np.isnan(scores)] = 1
    return np.maximum(0, scores)
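A quick usage sketch (the confidence rows are made up; assumes `entropy` is `scipy.stats.entropy` and numpy is imported as `np`):

import numpy as np
from scipy.stats import entropy

# One confident row (loss near 0) and one uniform row (loss of 1).
conf = np.array([[0.98, 0.01, 0.01],
                 [1 / 3, 1 / 3, 1 / 3]])
print(info_loss(conf))                       # approximately [0.10, 1.00]

# A single 1-D confidence vector returns a scalar loss.
print(info_loss(np.array([0.5, 0.5, 0.0])))  # log(2)/log(3), about 0.63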
Example #3
0
def script2(data=None, K=3, forest_size=6, numForests=5):
    '''
    Script to compute how correlated errors are between
    different forests (KDT,PF). A forest of KD-Trees and a
    Proximity Forest are constructed that have approximately
    equal predictive power (this happens on SIFT data with 3NN
    queries at about 6 trees). Errors are correlated between
    successive forests (of the same size) to determine whether
    proximity forests have more diversity than KD-Tree forests.
    @param data: A tuple (g,p) where g is a list of gallery
    points and p is a list of probe points. If set to None,
    then the Sift10K data will be used.
    @param K: How many neighbors to return for each probe?
    @param forest_size: The forest size to use when testing
    error correlation between forests.
    @param numForests: How many forests of each type to build
    when measuring error correlation.
    @return: Tuple (Err_KD, Err_PF) where Err_KD/PF are
    the matrices of errors, nxp, where n is the number of
    query points and p is the number of trees.
    '''
    if data is None:
        (g,p) = loadSift10K()
    else:
        (g,p) = data
        
    resT, _ = exact_knn(g, p, K=K)
    
    print "===================================="
    print "Computing errors using KD-Trees"
    print "===================================="
    errM_KD = KDTForest_ErrorCor(g, p, resT, K=K, forest_size=forest_size, numForests=numForests, binary_score=True)
    print ""
    print "===================================="
    print "Computing errors using Proximity Trees"
    print "===================================="
    errM_PF = ProxForest_ErrorCor(g, p, resT, K=K, forest_size=forest_size, numForests=numForests, binary_score=True)
    print ""
    print "===================================="
    print "Error correlations: KD-Trees"
    print "===================================="
    rc1 = avg_corr(errM_KD)
    print "===================================="
    print "Error correlations: Proximity Trees"
    print "===================================="
    rc2 = avg_corr(errM_PF)
    print ""
    print "===================================="
    print "Computing error entropy..."
    print "===================================="
    X_KD = sp.sum(errM_KD, axis=1)
    X_PF = sp.sum(errM_PF, axis=1)    
    H_KD = entropy(X_KD[X_KD>0])
    H_PF = entropy(X_PF[X_PF>0])
    print "Entropy of KD_Tree Errors: %3.3f"%H_KD
    print "Entropy of Prox Tree Errors: %3.3f"%H_PF
    
    return (errM_KD, errM_PF)
def script1(data=None, K=1, numTrees=20):
    '''
    Script to compute how correlated errors are from
    different trees of (KD-Tree/Proximity) forests.
    @param data: A tuple (g,p) where g is a list of gallery
    points and p is a list of probe points. If set to None,
    then the Sift10K data will be used.
    @param K: How many neighbors to return for each probe?
    @param numTrees: How many trees to test?
    @return: Tuple (Err_KD, Err_PF) where Err_KD/PF are
    the matrices of errors, nxp, where n is the number of
    query points and p is the number of trees.
    '''
    if data is None:
        (g,p) = loadSift10K()
    else:
        (g,p) = data
        
    resT, _ = exact_knn(g, p, K=K)
    
    print "===================================="
    print "Computing errors using KD-Trees"
    print "===================================="
    errM_KD = KDTree_ErrorCor(g, p, resT, K=K, numTrees=numTrees, binary_score=True)
    print ""
    print "===================================="
    print "Computing errors using Proximity Trees"
    print "===================================="
    errM_PF = ProxTree_ErrorCor(g, p, resT, K=K, numTrees=numTrees, binary_score=True)
    print ""
    print "===================================="
    print "Error correlations: KD-Trees"
    print "===================================="
    rc1 = avg_corr(errM_KD)
    print "===================================="
    print "Error correlations: Proximity Trees"
    print "===================================="
    rc2 = avg_corr(errM_PF)
    print ""
    print "===================================="
    print "Computing error entropy..."
    print "===================================="
    X_KD = sp.sum(errM_KD, axis=1)
    X_PF = sp.sum(errM_PF, axis=1)    
    H_KD = entropy(X_KD[X_KD>0])
    H_PF = entropy(X_PF[X_PF>0])
    print "Entropy of KD_Tree Errors: %3.3f"%H_KD
    print "Entropy of Prox Tree Errors: %3.3f"%H_PF
    
    return (errM_KD, errM_PF)
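Both scripts above rely on helpers such as avg_corr from the original repository. As a rough, hypothetical stand-in for the idea (not the original implementation), the average pairwise correlation between per-tree error columns could be computed like this:

import numpy as np

def avg_corr_sketch(errM):
    # Treat each column (one tree or forest) as a variable and correlate the
    # binary error indicators across the n query points.
    C = np.corrcoef(np.asarray(errM, dtype=float), rowvar=False)
    p = C.shape[0]
    return C[~np.eye(p, dtype=bool)].mean()   # mean of the off-diagonal entries

# Independent errors give a value near 0; strongly correlated trees approach 1.
print(avg_corr_sketch(np.random.rand(100, 6) > 0.7))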
Example #6
0
 def _computeEntropy(self):
     ''' Compute the entropy of the histogram of distances from the pivot element. A low entropy
     score means the distribution is concentrated, and thus this node may be a good candidate for
     splitting. A high entropy (at the limit, a uniform distribution) may indicate that there is no
     good separation of the elements in this node.
     '''
     assert self.Pivot is not None
     assert self.Ds is not None
     assert len(self.Ds) >= self.Tau
     assert len(self.Ds) == len(self.items)
         
     #create a list of distances not including the sample which was selected as the pivot
     #...which will have a distance of zero, within numerical errors.
     Dx = [D for D in self.Ds if D>0.01]
     
     #compute histogram using 10 bins of the Dx list
     HistInfo = scipy.histogram(Dx, bins=10)
     pk = scipy.array( HistInfo[0] )
     epsilon = 0.000001
     H = entropy(pk+epsilon)  #avoids log0 warnings
     #print "Histogram: ", HistInfo[0]
     #print "Entropy: %f"%H
     #print "Range: Min(>0)=%f, Max=%f, Mean=%f, Median=%f"%(min(Dx),max(self.Ds),scipy.mean(self.Ds),scipy.median(self.Ds))
     
     return H
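
The same histogram-entropy heuristic can be tried outside the tree class; below is a standalone sketch (the function name, bin count, and synthetic distance samples are illustrative, not part of the original node implementation):

import numpy as np
from scipy.stats import entropy

def histogram_entropy(distances, bins=10, epsilon=1e-6):
    # Drop near-zero distances (the pivot itself), histogram the rest,
    # and score the spread of the distribution by its entropy.
    dx = [d for d in distances if d > 0.01]
    counts, _ = np.histogram(dx, bins=bins)
    return entropy(counts + epsilon)          # epsilon avoids log(0) warnings

rng = np.random.default_rng(0)
print(histogram_entropy(rng.normal(5.0, 0.1, 200)))    # concentrated: lower entropy
print(histogram_entropy(rng.uniform(0.1, 10.0, 200)))  # spread out: near log(10) ≈ 2.3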
     
Example #7
0
def js_div_matrix(a):
    """Pairwise Jensen-Shannon divergence (natural log) between the rows of a.

    Assumes numpy names (array, zeros, tile, nan_to_num, log, sum) are
    star-imported and that each row of a is a probability distribution.
    """
    a = array(a)
    W = zeros((a.shape[0], a.shape[0]))
    # e[i] = sum_j p_ij * log(p_ij); entropy() returns one value per row, so e is 1-D
    e = -entropy(a.transpose())
    for i in range(a.shape[0]):
        val_range = range(i + 1, a.shape[0])
        sumAB = tile(a[i, :], (a.shape[0] - i - 1, 1)) + a[val_range, :]
        result = 0.5 * (e[i] + e[val_range]
                        - sum(sumAB * nan_to_num(log(sumAB / 2)), 1))
        W[val_range, i] = result
        W[i, val_range] = result
    return W
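A quick check on made-up row distributions (same import assumptions as the function above): the result is symmetric with a zero diagonal, and identical rows get a divergence of zero.

from numpy import *                 # js_div_matrix above relies on star-imported numpy names
from scipy.stats import entropy

P = array([[0.7, 0.2, 0.1],
           [0.1, 0.2, 0.7],
           [0.7, 0.2, 0.1]])
W = js_div_matrix(P)
print(W.round(3))                   # symmetric, zero diagonal; W[0, 2] == 0 (identical rows)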
Example #8
0
def train(X, y, y_train, labels, graph, output_file, top_n=100):
    indices = np.arange(len(y))
    total_promoted, epoch = 0, 1
    outputs = [['Epoch 0']]

    for _id, label in enumerate(labels):
        chunks = [label]
        current_output = []
        for i in indices[y_train == _id]:
            node = graph.entities.get_node(i)
            chunks.append(node)
            current_output.append(node)
            total_promoted += 1
        outputs.append([label] + current_output)
        print('\t'.join(chunks))

    print_state(epoch, labels, total_promoted, y_train)
    while total_promoted < 5522:  # 19875:  # 5522:
        outputs.append(['Epoch %d' % epoch])
        model = label_propagation.LabelPropagation(kernel='knn',
                                                   tol=0.01,
                                                   max_iter=2000,
                                                   n_jobs=16)
        model.fit(X, y_train)

        predictions = model.transduction_
        confidences = entropy(model.label_distributions_.T)

        for _id, label in enumerate(labels):
            mask = np.logical_and(predictions == _id, y_train == -1)
            ii = indices[mask]
            cc = confidences[mask]
            promoted = ii[np.argsort(cc)][:top_n]
            y_train[promoted] = _id
            chunks = [label]
            current_output = []
            for i in promoted:
                node = graph.entities.get_node(i)
                chunks.append(node)
                current_output.append(node)
            print('\t'.join(chunks))
            total_promoted += len(promoted)
            outputs.append([label] + current_output)
        print_state(epoch, labels, total_promoted, predictions)
        epoch += 1
    with open(output_file, 'w') as f:
        for line in pre_func(outputs):
            f.write(line + '\n')
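The core of the promotion step above is ranking unlabeled points by the entropy of their label distributions. A self-contained toy sketch of that idea (the data, seed labels, and top_n are invented; this is not the original pipeline):

import numpy as np
from scipy.stats import entropy
from sklearn.semi_supervised import LabelPropagation

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 0.3, (20, 2)), rng.normal(3, 0.3, (20, 2))])
y = np.full(40, -1)                  # -1 marks unlabeled points
y[0], y[20] = 0, 1                   # one seed label per cluster

model = LabelPropagation(kernel='knn', n_neighbors=10).fit(X, y)
confidences = entropy(model.label_distributions_.T)   # one entropy value per point

unlabeled = np.flatnonzero(y == -1)
top_n = 5
promoted = unlabeled[np.argsort(confidences[unlabeled])][:top_n]   # lowest entropy first
print(promoted, model.transduction_[promoted])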
Example #9
0
def logRowEntropyNormalization(theMatrix):
    print "...Normalizing Matrix Into LSA-style Log Entropy Values"
    
    numRows = len(theMatrix[:,0])
    numCols = len(theMatrix[0,:])
    normMatrix = scipy.zeros([numRows, numCols], float)
    rowEntropies = scipy.zeros([numRows],float)
    for i in range(numRows):
        currentVector = theMatrix[i,:]
        rowEntropies[i] = distributions.entropy(currentVector[np.nonzero(currentVector)])
        if rowEntropies[i] == 0:
            print "Warning: row %s had entropy of 0: Setting to 1."
            rowEntropies[i] = 1
    
    theMatrix = theMatrix + 1
    normMatrix = np.log10((theMatrix.transpose() / rowEntropies).transpose())

    return normMatrix
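A tiny usage sketch for the function above (the 3x3 count matrix is invented; assumes numpy as np and scipy are imported, and that `distributions` is `scipy.stats.distributions` as the snippet suggests):

counts = np.array([[10.0, 0.0, 2.0],
                   [3.0, 3.0, 3.0],
                   [0.0, 8.0, 1.0]])
normed = logRowEntropyNormalization(counts)
# Each entry becomes log10((count + 1) / entropy of the row's nonzero counts).
print(normed.shape)                  # (3, 3)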
Example #11
0
def calcEntropy(probMat):
    probMat += 0.0001 # all values need to be >0.0 to compute entropy
    e = np.zeros((probMat.shape[0]))
    for i in np.arange(probMat.shape[0]):
        e[i] = distributions.entropy(probMat[i,:])
    return e
def err_entropy(ErrM):
    X = sp.sum(ErrM, axis=1)
    return entropy(X)
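A short usage sketch for the two helpers above (the matrices are invented; assumes numpy as np, scipy as sp, `distributions` as `scipy.stats.distributions`, and `entropy` as `scipy.stats.entropy`):

probs = np.array([[0.9, 0.1],
                  [0.5, 0.5]])
print(calcEntropy(probs))            # the uniform second row has the larger entropy

errM = np.array([[1, 0, 1],
                 [0, 0, 1],
                 [1, 1, 1]])
print(err_entropy(errM))             # entropy of the per-row error counts [2, 1, 3]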
Example #13
0
 def _get_label_propagation_max_entropy_index(
         lp: BaseLabelPropagation) -> int:
     entropies = entropy(lp.label_distributions_.T)
     return np.argsort(entropies)[-1]
Example #14
0
def get_label_entropies(label_distribution):
    return entropy(label_distribution.T)

# The rest of this example is a separate histogramming script whose opening
# lines were cut off; the column min/max scan below is an assumed reconstruction
# (my_data, numModels, numBins, labels, b, and csvString are defined earlier in
# the original script).
maxValue, minValue = float('-inf'), float('inf')
for x in range(0, numModels):
    maxInterim = max(my_data[:, x])
    if maxInterim > maxValue:
        maxValue = maxInterim
    minInterim = min(my_data[:, x])
    if minInterim < minValue:
        minValue = minInterim

binWidth = (maxValue - minValue) / (numBins)
newBins = np.arange(minValue, maxValue, binWidth)

# TODO process array only once for speedup?
for x in range(0, numModels):
    # normalized (density) histogram of each model's column
    frequency = plt.hist(my_data[:, x], bins=newBins, histtype='step', density=True, label=labels[x])
    b[x, 0] = mean(my_data[:, x])
    b[x, 1] = var(my_data[:, x])
    b[x, 2] = skew(my_data[:, x])
    b[x, 3] = kurtosis(my_data[:, x])
    b[x, 4] = entropy(frequency[0])

plt.title(csvString + " Frequency")
plt.legend()
deg = u'\N{DEGREE SIGN}'

plt.xlabel("Airflow Rate (cfm)")
plt.ylabel("Frequency")

for i in range(0, 5):
    print(b[:, i])

plt.show()
Example #16
0
def entropy(v1):
    """
    Computes entropy, sum -p_i * log(p_i)
    """
    return DISTRIBS.entropy(v1.flatten())
Example #17
0
def evaluateClustering(centroids,
                       data,
                       assignments,
                       trueLabelMeans,
                       trueLabelHash,
                       histSize=101):
    '''
    Evaluates a clustering algorithm, when the true labels of the data have been given. Those
    labels are contained as mapped values in "trueLabelHash". 
    
    To evaluate the clustering algorithm's accuracy, we will follow a two-part approach. To do this, we first
    observe that it is possible to compute the distance of every centroid to the mean values of the
    true labels. Therefore, for every cluster it is possible to find the category mean to which it is closest in vector space:
    
    Approach #1: We will associate each centroid with its closest label and therefore compute the clustering
    quality in terms of misclassification error. In this case, the predicted labels are the clusters that 
    examples are assigned to.
    
    Approach #2: For every cluster, we build a histogram which plots the distribution of its points over 
    the ***true*** labels. Clusters whose points' majority true label coincide with the label whose mean 
    is closest to the centroid are more "accurate" than ones for which this condition does not hold.
    
    @param centroids: K x D ndarray, representing K centroids in D-space.
    @param data: N x D ndarray, representing the training data X.
    @param assignments: N-sized ndarray, mapping each example to its cluster. assignments[i] = k means that
            the ith example in "data" is mapped to the cluster represented by the kth centroid.
    @param trueLabelMeans: |labels| x D ndarray, holding the D-dimensional mean values of every class
    @param trueLabelHash: A hash which maps example indices to their true label.
    @param histSize: integer which represents the size of the histogram to pass to "getClusterHistogram".
            By default it's equal to 101, the number of labels in Caltech101.
    @raise LogicalError: For various cases which have to do with argument sanity checking.
    @raise DatasetError: If provided with no data.
    @return The number of "accurate" clusters, as defined above.
    '''

    if centroids is None or assignments is None or trueLabelMeans is None or trueLabelHash is None:
        raise LogicalError, "Method %s: \"None\" argument(s) provided." % (
            stack()[0][3])
    if data is None or data.shape[0] == 0 or data.shape[1] == 0:
        raise DatasetError, "Method %s: No training data provided." % (
            stack()[0][3])
    if histSize is None or histSize <= 0:
        raise LogicalError, "Method %s: histSize parameter should be a positive integer (provided: %s)." % (
            stack()[0][3], str(histSize))

    if len(trueLabelMeans) != 101:
        raise LogicalError, "Method %s: trueLabelMeans array should have 101 dimensions." % (
            stack()[0][3])

    # for each centroid, find the category mean it is closest to. Then associate this cluster with this
    # mean in a hash.

    # I have tried quite a bit to find an efficient solution to this, and have failed. Instead,
    # I will write an inefficient for loop - based implementation.

    # Careful: the trueLabelMeans 2D ndarray is zero-indexed, whereas the labels are not!

    closestLabel = dict()
    for i in range(len(centroids)):
        closestLabel[i] = np.array(
            [norm(centroids[i] - mean)
             for mean in trueLabelMeans]).argmin() + 1

    # Implement approach #1: Assuming that every assigned cluster is a predicted label, compute
    # the cluster accuracy in terms of misclassification error.

    misclassifiedPoints = 0
    for exIndex in range(data.shape[0]):
        if trueLabelHash[exIndex] != closestLabel[assignments[exIndex]]:
            misclassifiedPoints += 1

    errorRate = 100 * (misclassifiedPoints / float(data.shape[0]))

    # Implement approach #2: Compute true label count histograms and gauge which clusters are "good".
    # "Good" clusters are closest to the mean of the majority
    # vote label voted by their points, as reported by the respective histogram.

    goodCentroids = 0
    histogramEntropies = []
    for i in range(len(centroids)):
        # Get the indices of all the points in the cluster
        pointsInCluster = [
            j for j in range(len(assignments)) if assignments[j] == i
        ]
        if len(pointsInCluster) > 0:
            clusterHist, majVoteLabel = getClusterHistogram(
                pointsInCluster, trueLabelHash, histSize)
            histogramEntropies.append(
                entropy([val for val in clusterHist if val > 0]))
            if closestLabel[i] is not None and majVoteLabel == closestLabel[i]:
                goodCentroids += 1

    # Return all metrics to caller.

    return errorRate, goodCentroids, np.mean(histogramEntropies)
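As a side note, the closest-label loop in evaluateClustering can also be expressed with numpy broadcasting; an equivalent sketch (assuming `centroids` and `trueLabelMeans` as defined above, and numpy as np) is:

# dists[i, l] = ||centroids[i] - trueLabelMeans[l]||; labels stay 1-indexed.
dists = np.linalg.norm(centroids[:, None, :] - trueLabelMeans[None, :, :], axis=2)
closestLabel = {i: int(l) + 1 for i, l in enumerate(dists.argmin(axis=1))}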