def v_measures(m):
    """Return the V-measure elements (vm, vh, vc) of a contingency matrix m.

    A lot slower than the cython'd version; here so the code is
    self-sufficient for now.
    """
    np.seterr(all='ignore')  # avoid annoying warnings
    c_size, k_size = m.shape
    sum_m = float(m.sum())
    gold_counts = m.sum(1)
    h_c = entropy(gold_counts) / math.log(2)   # entropy of classes (gold)
    found_counts = m.sum(0)
    h_k = entropy(found_counts) / math.log(2)  # entropy of clusters (estimated)
    h_c_k = 0  # H(C|K)
    h_k_c = 0  # H(K|C)
    for i in range(c_size):
        for j in range(k_size):
            if m[i, j] > 0:
                p_ij = m[i, j] / sum_m
                h_c_k += p_ij * math.log(m[i, j] / found_counts[j], 2)
                h_k_c += p_ij * math.log(m[i, j] / gold_counts[i], 2)
    vh = 1 if (h_c == 0) else 1 - (-h_c_k / h_c)
    vc = 1 if (h_k == 0) else 1 - (-h_k_c / h_k)
    vm = (2 * vh * vc) / (vh + vc)
    return (vm, vh, vc)
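# Minimal usage sketch (not from the original module): builds a tiny contingency
# matrix and calls v_measures() above. It assumes `import numpy as np`, `import math`,
# and `from scipy.stats import entropy` are in scope, as the function requires; the
# helper name and values are illustrative only. Float counts avoid integer division
# inside the inner loop.
def _demo_v_measures():
    # rows = gold classes, columns = found clusters
    m_example = np.array([[5.0, 1.0],
                          [1.0, 5.0]])
    return v_measures(m_example)  # -> (vm, vh, vc)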
def info_loss(confidences, *args):
    """ Return the loss defined by the increase in entropy.

    Parameters
    ----------
    confidences : (N, K) or (K,) ndarray of float
    args : None
        To maintain compatibility with other loss functions, we accept
        additional arguments and ignore them.

    Returns
    -------
    losses : (N,) ndarray of float, or float
    """
    # guard against nan and -0
    if confidences.ndim == 1:
        scores = entropy(confidences) / np.log(confidences.shape[0])
        if np.isnan(scores):
            return 1
    else:
        scores = entropy(confidences.T) / np.log(confidences.shape[1])
        scores[np.isnan(scores)] = 1
    return np.maximum(0, scores)
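# Sanity-check sketch (not from the original module) for info_loss() above, assuming
# `import numpy as np` and `from scipy.stats import entropy` are in scope. The helper
# name and sample rows are illustrative only.
def _demo_info_loss_check():
    # A one-hot row has zero normalized entropy (loss ~0), a uniform row has maximal
    # normalized entropy (loss 1), and an all-zero row is mapped to 1 by the nan guard.
    conf = np.array([[1.0, 0.0, 0.0],
                     [1.0 / 3, 1.0 / 3, 1.0 / 3],
                     [0.0, 0.0, 0.0]])
    return info_loss(conf)  # -> approximately [0., 1., 1.]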
def script2(data=None, K=3, forest_size=6, numForests=5):
    '''
    Script to compute how correlated errors are between different forests (KDT, PF).
    A forest of KD-Trees and a Proximity Forest are constructed that have approximately
    equal predictive power (this happens on SIFT data with 3NN queries at about 6 trees).
    Errors are correlated between successive forests (of the same size) to determine
    whether proximity forests have more diversity than KDT forests.
    @param data: A tuple (g,p) where g is a list of gallery points and p is a list of
    probe points. If set to None, then the Sift10K data will be used.
    @param K: How many neighbors to return for each probe?
    @param forest_size: The forest size to use when testing error correlation between forests.
    @param numForests: How many forests of each type to construct.
    @return: Tuple (Err_KD, Err_PF) where Err_KD/PF are the matrices of errors, n x p,
    where n is the number of query points and p is the number of trees.
    '''
    if data is None:
        (g, p) = loadSift10K()
    else:
        (g, p) = data

    resT, _ = exact_knn(g, p, K=K)

    print "===================================="
    print "Computing errors using KD-Trees"
    print "===================================="
    errM_KD = KDTForest_ErrorCor(g, p, resT, K=K, forest_size=forest_size,
                                 numForests=numForests, binary_score=True)
    print ""
    print "===================================="
    print "Computing errors using Proximity Trees"
    print "===================================="
    errM_PF = ProxForest_ErrorCor(g, p, resT, K=K, forest_size=forest_size,
                                  numForests=numForests, binary_score=True)
    print ""
    print "===================================="
    print "Error correlations: KD-Trees"
    print "===================================="
    rc1 = avg_corr(errM_KD)
    print "===================================="
    print "Error correlations: Proximity Trees"
    print "===================================="
    rc2 = avg_corr(errM_PF)
    print ""
    print "===================================="
    print "Computing error entropy..."
    print "===================================="
    X_KD = sp.sum(errM_KD, axis=1)
    X_PF = sp.sum(errM_PF, axis=1)
    H_KD = entropy(X_KD[X_KD > 0])
    H_PF = entropy(X_PF[X_PF > 0])
    print "Entropy of KD_Tree Errors: %3.3f" % H_KD
    print "Entropy of Prox Tree Errors: %3.3f" % H_PF

    return (errM_KD, errM_PF)
def script1(data=None, K=1, numTrees=20):
    '''
    Script to compute how correlated errors are from different trees of
    (KD-Tree / Proximity) forests.
    @param data: A tuple (g,p) where g is a list of gallery points and p is a list of
    probe points. If set to None, then the Sift10K data will be used.
    @param K: How many neighbors to return for each probe?
    @param numTrees: How many trees to test?
    @return: Tuple (Err_KD, Err_PF) where Err_KD/PF are the matrices of errors, n x p,
    where n is the number of query points and p is the number of trees.
    '''
    if data is None:
        (g, p) = loadSift10K()
    else:
        (g, p) = data

    resT, _ = exact_knn(g, p, K=K)

    print "===================================="
    print "Computing errors using KD-Trees"
    print "===================================="
    errM_KD = KDTree_ErrorCor(g, p, resT, K=K, numTrees=numTrees, binary_score=True)
    print ""
    print "===================================="
    print "Computing errors using Proximity Trees"
    print "===================================="
    errM_PF = ProxTree_ErrorCor(g, p, resT, K=K, numTrees=numTrees, binary_score=True)
    print ""
    print "===================================="
    print "Error correlations: KD-Trees"
    print "===================================="
    rc1 = avg_corr(errM_KD)
    print "===================================="
    print "Error correlations: Proximity Trees"
    print "===================================="
    rc2 = avg_corr(errM_PF)
    print ""
    print "===================================="
    print "Computing error entropy..."
    print "===================================="
    X_KD = sp.sum(errM_KD, axis=1)
    X_PF = sp.sum(errM_PF, axis=1)
    H_KD = entropy(X_KD[X_KD > 0])
    H_PF = entropy(X_PF[X_PF > 0])
    print "Entropy of KD_Tree Errors: %3.3f" % H_KD
    print "Entropy of Prox Tree Errors: %3.3f" % H_PF

    return (errM_KD, errM_PF)
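# Hedged sketch (not from the original module) of the kind of summary avg_corr()
# presumably computes over an n x p binary error matrix in script1/script2 above:
# the mean pairwise correlation between the per-tree error columns. The helper name
# `_demo_avg_corr` and its exact definition are assumptions for illustration only.
def _demo_avg_corr(errM):
    import numpy as np
    C = np.corrcoef(errM, rowvar=False)  # p x p correlation between tree error columns
    iu = np.triu_indices_from(C, k=1)    # upper-triangle, off-diagonal pairs
    return np.nanmean(C[iu])             # average pairwise error correlation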
def _computeEntropy(self):
    '''
    Compute the entropy of the histogram of distances from the pivot element.
    A low entropy score means the distribution is concentrated, and thus the node
    may be a good candidate for splitting. High entropy (at the limit, a uniform
    distribution) may indicate that there is no good separation of the elements
    in this node.
    '''
    assert self.Pivot is not None
    assert self.Ds is not None
    assert len(self.Ds) >= self.Tau
    assert len(self.Ds) == len(self.items)

    # Create a list of distances not including the sample which was selected as the
    # pivot...which will have a distance of zero, within numerical errors.
    Dx = [D for D in self.Ds if D > 0.01]

    # Compute a histogram of the Dx list using 10 bins.
    HistInfo = scipy.histogram(Dx, bins=10)
    pk = scipy.array(HistInfo[0])
    epsilon = 0.000001
    H = entropy(pk + epsilon)  # avoids log(0) warnings
    #print "Histogram: ", HistInfo[0]
    #print "Entropy: %f"%H
    #print "Range: Min(>0)=%f, Max=%f, Mean=%f, Median=%f"%(min(Dx),max(self.Ds),scipy.mean(self.Ds),scipy.median(self.Ds))
    return H
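# Standalone sketch (not from the original class) of the same histogram-entropy
# criterion, assuming `import numpy as np` and `from scipy.stats import entropy`.
# It uses np.histogram in place of the older scipy.histogram alias; the helper
# name and parameters are illustrative only.
def _demo_distance_histogram_entropy(distances, n_bins=10, epsilon=1e-6):
    counts, _ = np.histogram([d for d in distances if d > 0.01], bins=n_bins)
    return entropy(counts + epsilon)  # epsilon avoids log(0) warnings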
def js_div_matrix(a):
    """Return the matrix of pairwise Jensen-Shannon divergences (in nats) between
    the rows of a, which are assumed to be probability distributions."""
    a = array(a)
    W = zeros((a.shape[0], a.shape[0]))
    # e[i] is the negative entropy of row i
    e = -entropy(a.transpose())
    for i in range(a.shape[0]):
        val_range = range(i + 1, a.shape[0])
        sumAB = tile(a[i, :], (a.shape[0] - i - 1, 1)) + a[val_range, :]
        result = 0.5 * (e[i] + e[val_range]
                        - sum(sumAB * nan_to_num(log(sumAB / 2)), 1))
        W[val_range, i] = result
        W[i, val_range] = result
    return W
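# Direct two-distribution check (not from the original module) for js_div_matrix()
# above, assuming `import numpy as np` and `from scipy.stats import entropy`.
# For normalized rows p and q, the entry W[i, j] should match
# H((p + q) / 2) - 0.5 * H(p) - 0.5 * H(q), computed in nats.
def _demo_js_divergence(p, q):
    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
    m = 0.5 * (p + q)
    return entropy(m) - 0.5 * entropy(p) - 0.5 * entropy(q)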
def train(X, y, y_train, labels, graph, output_file, top_n=100):
    indices = np.arange(len(y))
    total_promoted, epoch = 0, 1
    outputs = [['Epoch 0']]

    # Seed the output with the initially labeled nodes of each class.
    for _id, label in enumerate(labels):
        chunks = [label]
        current_output = []
        for i in indices[y_train == _id]:
            node = graph.entities.get_node(i)
            chunks.append(node)
            current_output.append(node)
            total_promoted += 1
        outputs.append([label] + current_output)
        print('\t'.join(chunks))
    print_state(epoch, labels, total_promoted, y_train)

    while total_promoted < 5522:  # 19875: # 5522:
        outputs.append(['Epoch %d' % epoch])
        model = label_propagation.LabelPropagation(kernel='knn', tol=0.01,
                                                   max_iter=2000, n_jobs=16)
        model.fit(X, y_train)
        predictions = model.transduction_
        # Lower entropy of a label distribution means a more confident prediction.
        confidences = entropy(model.label_distributions_.T)
        for _id, label in enumerate(labels):
            mask = np.logical_and(predictions == _id, y_train == -1)
            ii = indices[mask]
            cc = confidences[mask]
            # Promote the top_n most confident unlabeled predictions for this class.
            promoted = ii[np.argsort(cc)][:top_n]
            y_train[promoted] = _id
            chunks = [label]
            current_output = []
            for i in promoted:
                node = graph.entities.get_node(i)
                chunks.append(node)
                current_output.append(node)
            print('\t'.join(chunks))
            total_promoted += len(promoted)
            outputs.append([label] + current_output)
        print_state(epoch, labels, total_promoted, predictions)
        epoch += 1

    with open(output_file, 'w') as f:
        for line in pre_func(outputs):
            f.write(line + '\n')
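# Small sketch (not from the original module) of the confidence ranking used in the
# promotion step above, assuming `import numpy as np` and
# `from scipy.stats import entropy`. The helper name and distributions are
# illustrative only.
def _demo_confidence_ranking():
    # Lower entropy of a sample's label distribution means a more confident
    # prediction, so argsort puts the most confident samples first -- the same
    # ordering train() uses when promoting nodes.
    label_dist = np.array([[0.98, 0.01, 0.01],   # very confident
                           [0.50, 0.30, 0.20],   # less confident
                           [0.34, 0.33, 0.33]])  # nearly uniform
    return np.argsort(entropy(label_dist.T))     # -> array([0, 1, 2])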
def logRowEntropyNormalization(theMatrix):
    print "...Normalizing Matrix Into LSA-style Log Entropy Values"
    numRows = len(theMatrix[:, 0])
    numCols = len(theMatrix[0, :])
    normMatrix = scipy.zeros([numRows, numCols], float)
    rowEntropies = scipy.zeros([numRows], float)
    for i in range(numRows):
        currentVector = theMatrix[i, :]
        rowEntropies[i] = distributions.entropy(
            currentVector[np.nonzero(currentVector)])
        if rowEntropies[i] == 0:
            print "Warning: row %s had entropy of 0: Setting to 1." % i
            rowEntropies[i] = 1
    theMatrix = theMatrix + 1
    normMatrix = np.log10((theMatrix.transpose() / rowEntropies).transpose())
    return normMatrix
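# Minimal usage sketch (not from the original module) for logRowEntropyNormalization()
# above, assuming `import numpy as np`, `import scipy`, and
# `from scipy.stats import distributions` are in scope as the function requires.
# The helper name and counts are illustrative only.
def _demo_log_entropy_usage():
    # Small co-occurrence count matrix; the third row has a single nonzero count,
    # so its entropy is 0 and the warning branch fires.
    counts = np.array([[10.0, 0.0, 2.0],
                       [3.0, 3.0, 3.0],
                       [0.0, 1.0, 0.0]])
    return logRowEntropyNormalization(counts)  # log10((count + 1) / row entropy)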
def calcEntropy(probMat):
    probMat += 0.0001  # all values need to be > 0.0 to compute entropy
    e = np.zeros((probMat.shape[0]))
    for i in np.arange(probMat.shape[0]):
        e[i] = distributions.entropy(probMat[i, :])
    return e
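# Equivalent vectorized sketch (not from the original module): scipy.stats.entropy
# computes column-wise entropies for a 2-D input, so transposing gives the same
# per-row values as the loop in calcEntropy() above, without the in-place mutation
# of probMat. Assumes `from scipy.stats import entropy`; the helper name is
# illustrative only.
def _demo_calc_entropy_vectorized(probMat):
    return entropy((probMat + 0.0001).T)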
def err_entropy(ErrM):
    # Sum the errors per query point, then compute the entropy of the counts.
    X = sp.sum(ErrM, axis=1)
    return entropy(X)
def _get_label_propagation_max_entropy_index(
        lp: BaseLabelPropagation) -> int:
    # The sample whose label distribution has the highest entropy is the one the
    # model is least certain about.
    entropies = entropy(lp.label_distributions_.T)
    return np.argsort(entropies)[-1]
def get_label_entropies(label_distribution):
    return entropy(label_distribution.T)
if maxInterim > maxValue:
    maxValue = maxInterim
minInterim = min(my_data[:, x])
if minInterim < minValue:
    minValue = minInterim

binWidth = (maxValue - minValue) / numBins
newBins = np.arange(minValue, maxValue, binWidth)

# TODO process array only once for speedup?
for x in range(0, numModels):
    frequency = plt.hist(my_data[:, x], bins=newBins, histtype='step',
                         normed=True, label=labels[x])
    b[x, 0] = mean(my_data[:, x])
    b[x, 1] = var(my_data[:, x])
    b[x, 2] = skew(my_data[:, x])
    b[x, 3] = kurtosis(my_data[:, x])
    b[x, 4] = entropy(frequency[0])

plt.title(csvString + " Frequency")
plt.legend()
deg = u'\N{DEGREE SIGN}'
plt.xlabel("Airflow Rate (cfm)")
plt.ylabel("Frequency")

for i in range(0, 5):
    print(b[:, i])
plt.show()
def entropy(v1):
    """ Computes entropy, sum -p_i * log(p_i) """
    return DISTRIBS.entropy(v1.flatten())
def evaluateClustering(centroids, data, assignments, trueLabelMeans,
                       trueLabelHash, histSize=101):
    '''
    Evaluates a clustering algorithm when the true labels of the data have been given.
    Those labels are contained as mapped values in "trueLabelHash". To evaluate the
    clustering algorithm's accuracy, we follow two approaches. Both rest on the
    observation that it is possible to compute the distance of every centroid to the
    mean values of the true labels, and therefore, for every cluster, to find the
    category mean to which it is closest in vector space:

    Approach #1: We associate each centroid with its closest label and compute the
        clustering quality in terms of misclassification error. In this case, the
        predicted labels are the clusters that examples are assigned to.
    Approach #2: For every cluster, we build a histogram which plots the distribution
        of its points over the ***true*** labels. Clusters whose points' majority true
        label coincides with the label whose mean is closest to the centroid are more
        "accurate" than ones for which this condition does not hold.

    @param centroids: K x D ndarray, representing K centroids in D-space.
    @param data: N x D ndarray, representing the training data X.
    @param assignments: N-sized ndarray, mapping each example to its cluster.
        assignments[i] = k means that the ith example in "data" is mapped to the
        cluster represented by the kth centroid.
    @param trueLabelMeans: |labels| x D ndarray, holding the D-dimensional mean value
        of every class.
    @param trueLabelHash: A hash which maps example indices to their true label.
    @param histSize: integer which represents the size of the histogram to pass to
        "getClusterHistogram". By default it is 101, the number of labels in Caltech101.
    @raise LogicalError: For various cases which have to do with argument sanity checking.
    @raise DatasetError: If provided with no data.
    @return: The misclassification error rate (approach #1), the number of "accurate"
        clusters (approach #2), and the mean entropy of the cluster histograms.
    '''
    if centroids is None or assignments is None or trueLabelMeans is None or trueLabelHash is None:
        raise LogicalError, "Method %s: \"None\" argument(s) provided." % (stack()[0][3])
    if data is None or data.shape[0] == 0 or data.shape[1] == 0:
        raise DatasetError, "Method %s: No training data provided." % (stack()[0][3])
    if histSize is None or histSize <= 0:
        raise LogicalError, "Method %s: histSize parameter should be a positive integer (provided: %s)." % (stack()[0][3], str(histSize))
    if len(trueLabelMeans) != 101:
        raise LogicalError, "Method %s: trueLabelMeans array should have 101 dimensions." % (stack()[0][3])

    # For each centroid, find the category mean it is closest to, then associate this
    # cluster with that mean in a hash.
    # I have tried quite a bit to find an efficient solution to this, and have failed.
    # Instead, I will write an inefficient for-loop-based implementation.
    # Careful: the trueLabelMeans 2D ndarray is zero-indexed, whereas the labels are not!
    closestLabel = dict()
    for i in range(len(centroids)):
        closestLabel[i] = np.array(
            [norm(centroids[i] - mean) for mean in trueLabelMeans]).argmin() + 1

    # Approach #1: Assuming that every assigned cluster is a predicted label, compute
    # the clustering accuracy in terms of misclassification error.
    misclassifiedPoints = 0
    for exIndex in range(data.shape[0]):
        if trueLabelHash[exIndex] != closestLabel[assignments[exIndex]]:
            misclassifiedPoints += 1
    errorRate = 100 * (misclassifiedPoints / float(data.shape[0]))

    # Approach #2: Compute true label count histograms and gauge which clusters are
    # "good". "Good" clusters are closest to the mean of the majority-vote label of
    # their points, as reported by the respective histogram.
    goodCentroids = 0
    histogramEntropies = []
    for i in range(len(centroids)):
        # Get the indices of all the points in the cluster.
        pointsInCluster = [j for j in range(len(assignments)) if assignments[j] == i]
        if len(pointsInCluster) > 0:
            clusterHist, majVoteLabel = getClusterHistogram(
                pointsInCluster, trueLabelHash, histSize)
            histogramEntropies.append(
                entropy([val for val in clusterHist if val > 0]))
            if closestLabel[i] is not None and majVoteLabel == closestLabel[i]:
                goodCentroids += 1

    # Return all metrics to the caller.
    return errorRate, goodCentroids, np.mean(histogramEntropies)
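# Vectorized sketch (not from the original module) of the centroid-to-label assignment
# used in Approach #1 above: each centroid is mapped to the (1-indexed) true label
# whose class mean is nearest in Euclidean distance. Assumes `import numpy as np` and
# `from scipy.spatial.distance import cdist`; the helper name is illustrative only.
def _demo_closest_labels(centroids, trueLabelMeans):
    dists = cdist(centroids, trueLabelMeans)  # K x |labels| distance matrix
    return dists.argmin(axis=1) + 1           # +1 because labels are 1-indexed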