def test_adjusted_mutual_info_score():
    """Check MI, expected MI and adjusted MI against known reference values."""
    first = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    second = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    reference_mi = 0.41022

    # Mutual information computed straight from the labels.
    assert_almost_equal(mutual_info_score(first, second), reference_mi, 5)

    # Same value when a precomputed sparse contingency table is supplied.
    sparse_table = contingency_matrix(first, second, sparse=True)
    assert_almost_equal(
        mutual_info_score(first, second, contingency=sparse_table),
        reference_mi, 5)

    # ... and when the table is dense.
    dense_table = contingency_matrix(first, second)
    assert_almost_equal(
        mutual_info_score(first, second, contingency=dense_table),
        reference_mi, 5)

    # Expected mutual information of the dense table.
    emi = expected_mutual_information(dense_table, dense_table.sum())
    assert_almost_equal(emi, 0.15042, 5)

    # Adjusted mutual information, including a relabelled perfect match.
    assert_almost_equal(adjusted_mutual_info_score(first, second), 0.27502, 5)
    assert_equal(adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3]), 1.0)

    # Repeat both labelings 110 times to exercise a much larger input.
    big_first = np.tile(first, 110)
    big_second = np.tile(second, 110)
    big_ami = adjusted_mutual_info_score(big_first, big_second)
    # This is not accurate to more than 2 places
    assert_almost_equal(big_ami, 0.37, 2)
def test_contingency_matrix():
    """Compare contingency_matrix with a 2-D histogram of the same labels."""
    first = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    second = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])

    plain = contingency_matrix(first, second)
    edges = np.arange(1, 5)
    reference = np.histogram2d(first, second, bins=(edges, edges))[0]
    assert_array_almost_equal(plain, reference)

    # With eps set, every cell is expected to be offset by that amount.
    shifted = contingency_matrix(first, second, eps=0.1)
    assert_array_almost_equal(shifted, reference + 0.1)
def test_contingency_matrix_sparse():
    """Check the sparse contingency matrix against the dense one.

    Also checks that combining ``eps`` with ``sparse=True`` is rejected with
    a ``ValueError`` carrying the expected message.
    """
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    C = contingency_matrix(labels_a, labels_b)
    C_sparse = contingency_matrix(labels_a, labels_b, sparse=True).toarray()
    assert_array_almost_equal(C, C_sparse)
    # The helper is called for its assertion only; its result is not a
    # matrix, so it is deliberately not bound to ``C_sparse``.
    assert_raise_message(
        ValueError, "Cannot set 'eps' when sparse=True",
        contingency_matrix, labels_a, labels_b, eps=1e-10, sparse=True
    )
Exemple #4
0
def munkres_score(gt, pred):
    """Clustering accuracy under the best one-to-one cluster correspondence.

    :param gt: a list of lists, each containing ints
    :param pred: a list of lists, each containing ints
    :return: accuracy, i.e. the summed overlap of the matched
        (ground-truth, predicted) cluster pairs divided by the total count
    """

    # Combine all the sequences into one long sequence for both gt and pred
    gt_combined = np.concatenate(gt)
    pred_combined = np.concatenate(pred)

    # Make sure we're comparing the right shapes
    assert (gt_combined.shape == pred_combined.shape)

    # Build out the contingency matrix
    # This follows the methodology suggested by Zhou, De la Torre & Hodgkins, PAMI 2013.
    mat = contingency_matrix(gt_combined, pred_combined)

    # Make the cost matrix
    # Use the fact that no entry can exceed the total length of the sequence
    cost_mat = make_cost_matrix(mat, lambda x: gt_combined.shape[0] - x)

    # Apply the Munkres method (also called the Hungarian method) to find the optimal cluster correspondence
    m = Munkres()
    indexes = m.compute(cost_mat)

    # Pull out the cluster overlaps for the correspondences found.  The row
    # and column indices are passed as two separate index arrays: a single
    # list of tuples is treated by current NumPy as one fancy index on the
    # first axis and would select whole rows instead of individual cells.
    rows, cols = zip(*indexes)
    cluster_overlaps = mat[list(rows), list(cols)]

    # Now compute the accuracy
    accuracy = np.sum(cluster_overlaps) / float(np.sum(mat))

    return accuracy
def test_dbscan_optics_parity(eps, min_samples):
    """OPTICS labels extracted at ``eps`` must be within 5% of DBSCAN's."""
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, _ = make_blobs(n_samples=750, centers=blob_centers,
                      cluster_std=0.4, random_state=0)

    # OPTICS ordering first, then a DBSCAN-style extraction at the given eps.
    optics = OPTICS(min_samples=min_samples).fit(X)
    core_optics, labels_optics = optics.extract_dbscan(eps)

    # Reference DBSCAN run with the same parameters.
    dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(X)

    # Agreement is the smaller of the two "best match" totals, taken once
    # per column and once per row of the contingency table.
    table = contingency_matrix(dbscan.labels_, labels_optics)
    best_by_column = np.sum(np.max(table, axis=0))
    best_by_row = np.sum(np.max(table, axis=1))
    disagree = X.shape[0] - min(best_by_column, best_by_row)

    # Core samples have to match exactly.
    assert_array_equal(core_optics, dbscan.core_sample_indices_)

    # The mismatch is measured relative to the non-core points only.
    non_core_count = len(labels_optics) - len(core_optics)
    percent_mismatch = np.round((disagree - 1) / non_core_count, 2)

    assert percent_mismatch <= 0.05
Exemple #6
0
def clustering_evaluation(model, labels, data):
    """Summarise a fitted clustering model against reference labels.

    Returns a tuple ``(report, matrix)``: ``report`` is a three-line string
    with the adjusted Rand index, the homogeneity score and the silhouette
    score (``metric='l2'``) of ``model.labels_``; ``matrix`` is the
    contingency matrix of ``labels`` versus ``model.labels_``.
    """
    ari = metrics.adjusted_rand_score(labels, model.labels_)
    homogeneity = metrics.homogeneity_score(labels, model.labels_)
    silhouette = metrics.silhouette_score(data, model.labels_, metric='l2')

    report_lines = [
        " Adjusted Rand Index : " + str(ari),
        " Homogeneity Score : " + str(homogeneity),
        " Silhoutte Score : " + str(silhouette),
    ]
    return "\n".join(report_lines), contingency_matrix(labels, model.labels_)
Exemple #7
0
def calculate_clusteringAccuracy(labels_true, labels_pred):
    """Flatten both labelings and score their contingency matrix.

    The inputs are converted to 1-D arrays, the contingency matrix is built,
    and the result of ``get_IndicesClusterxClass`` on that matrix is returned.
    """
    flat_true = np.array(labels_true).reshape(-1)
    flat_pred = np.array(labels_pred).reshape(-1)
    return get_IndicesClusterxClass(contingency_matrix(flat_true, flat_pred))
Exemple #8
0
def print_5_measure(y_true, y_pred):
    """Return five clustering scores and the contingency matrix.

    The first element is the list ``[homogeneity, completeness, v_measure,
    adjusted_rand, adjusted_mutual_info]``; the second is the contingency
    matrix of ``y_true`` versus ``y_pred``.  Nothing is printed.
    """
    scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    scores = [scorer(y_true, y_pred) for scorer in scorers]
    return scores, contingency_matrix(y_true, y_pred)
Exemple #9
0
def RandIndex(labels_true, labels_pred):
    """Compute an (unadjusted) Rand-style index from the contingency table.

    Four counts are derived with ``comb2`` (presumably "n choose 2" -- confirm
    against its definition): one from the table cells, one each from the
    column and row totals with the cell count removed, and the remainder of
    ``comb(n_samples, 2)``.  The result is the cell count plus the remainder,
    divided by ``comb(n_samples, 2)``.
    """
    n_samples = len(labels_true)
    table = contingency_matrix(labels_true, labels_pred, sparse=True)

    from_cells = sum(comb2(cell) for cell in table.data)
    column_totals = np.ravel(table.sum(axis=0))
    row_totals = np.ravel(table.sum(axis=1))
    from_columns = sum(comb2(total) for total in column_totals) - from_cells
    from_rows = sum(comb2(total) for total in row_totals) - from_cells

    remainder = comb(n_samples, 2) - from_cells - from_columns - from_rows
    return (from_cells + remainder) / comb(n_samples, 2)
Exemple #10
0
def confusion_matrix(labels_true, labels_pred):
    """Pair-level confusion counts derived from the contingency table.

    All four counts are taken out of ``n_samples * (n_samples - 1)``, i.e.
    they count ordered pairs of distinct samples.

    Returns the tuple ``(tp, tn, fp, fn)``.
    """
    n_samples = len(labels_true)
    table = contingency_matrix(labels_true, labels_pred, sparse=True)

    cells = table.data
    column_totals = np.asarray(table.sum(axis=0)).ravel()
    row_totals = np.asarray(table.sum(axis=1)).ravel()

    tp = np.dot(cells, cells) - n_samples
    fp = np.sum(column_totals**2) - n_samples - tp
    fn = np.sum(row_totals**2) - n_samples - tp
    tn = n_samples * (n_samples - 1) - (tp + fn + fp)
    return tp, tn, fp, fn
Exemple #11
0
def purity(labels_true, labels_pred):
    """Purity of a clustering with respect to reference partitions.

    input: labels_true: an array of labels of the given partitions
           labels_pred: an array of labels of the clusters
    return: the sum, over clusters, of the largest contingency count in that
            cluster's column divided by the total number of samples
    """
    table = contingency_matrix(labels_true, labels_pred)
    n_total = table.sum()
    share_per_cluster = table.max(axis=0) / n_total
    return share_per_cluster.sum()
Exemple #12
0
def purity_weights(gt, pred):
    """Per-sample 0/1 weights: 1 where the sample's cluster is "pure".

    Every predicted cluster is assigned the ground-truth label it overlaps
    most (argmax over its contingency column).  A sample gets weight 1 when
    its own ground-truth label equals the label assigned to its cluster,
    otherwise 0.

    Labels may be arbitrary values.  Rows and columns of the contingency
    matrix follow the sorted unique labels, so label values are translated
    to matrix positions explicitly instead of being used as indices; for
    labels that already are 0..K-1 the result is unchanged.

    :param gt: 1-D array-like of ground-truth labels
    :param pred: 1-D array-like of cluster labels, same length as ``gt``
    :return: integer array of 0/1 weights, one per sample
    """
    gt = np.asarray(gt)
    pred = np.asarray(pred)

    # Build the contingency matrix
    cmat = contingency_matrix(gt, pred)

    # Sorted unique labels give the meaning of each row; the inverse index
    # gives the column each sample's cluster occupies.
    gt_classes = np.unique(gt)
    _, pred_columns = np.unique(pred, return_inverse=True)

    # Find assignments based on a purity criteria
    # Maps clusters (columns) to gt-label rows
    pure_assignments = np.argmax(cmat, axis=0)

    # A weight for each time-step (= 1 if cluster matches assigned gt-label otherwise = 0)
    return (gt == gt_classes[pure_assignments[pred_columns]]).astype(int)
def purity_score(y_true, y_pred):
    """Print the contingency matrix and return a matching-based purity.

    Rows and columns are paired one-to-one by ``linear_sum_assignment`` on
    the negated matrix; the return value is the sum of the paired cells
    divided by the sum of all cells.  The label "Purity-score is:" is
    printed without a trailing newline before returning.
    """
    table = contingency_matrix(y_true, y_pred)
    print("contingency_matrix")
    print(table)

    # The matrix is negated before being handed to the assignment solver.
    matched_rows, matched_cols = linear_sum_assignment(-table)

    print("Purity-score is:", end='')
    matched_total = table[matched_rows, matched_cols].sum()
    return matched_total / np.sum(table)
def ConditionalEntropy(clusters, partitions):
    """Base-10 entropy of the partition distribution inside each cluster.

    For every row of the contingency table (one row per distinct value of
    ``clusters``) the counts are normalised by the row total and
    ``-sum(p * log10(p))`` is computed.

    Empty cells are skipped, following the convention ``0 * log(0) = 0``;
    evaluating them would produce NaN.  Each entry is computed from its own
    row only, so it is the entropy of that cluster rather than a running
    total over the preceding rows.

    :param clusters: 1-D array-like of cluster labels
    :param partitions: 1-D array-like of partition labels, same length
    :return: list with one entropy value per cluster
    """
    contigencyTable = contingency_matrix(clusters, partitions)
    H = []
    for row in contigencyTable:
        ni = np.sum(row)
        probabilities = row[row > 0] / ni
        # "0.0 - ..." keeps a pure cluster at +0.0 rather than -0.0.
        H.append(0.0 - np.sum(probabilities * np.log10(probabilities)))
    return H
def Fmeasure(clusters, partitions):
    """Mean per-cluster F-measure against the best-matching partition.

    For each row ``i`` of the contingency table (one per distinct value of
    ``clusters``) the column ``j`` with the largest count is selected and
    ``2 * n_ij / (n_i + m_j)`` is computed, where ``n_i`` is the row total
    and ``m_j`` the column total.  The scores are averaged over the rows.

    The best column is located with ``argmax``: the maximum *count* is not
    a valid column index, and ``n_ij`` must be the single count of the
    selected cell, so that the result is a scalar.

    :param clusters: 1-D array-like of cluster labels
    :param partitions: 1-D array-like of partition labels, same length
    :return: scalar mean F-measure
    """
    contigencyTable = contingency_matrix(clusters, partitions)
    n_rows = contigencyTable.shape[0]

    best_cols = contigencyTable.argmax(axis=1)
    nij = contigencyTable[np.arange(n_rows), best_cols]
    ni = contigencyTable.sum(axis=1)
    mji = contigencyTable.sum(axis=0)[best_cols]

    return np.sum(2 * nij / (ni + mji)) / n_rows
Exemple #16
0
def get_cluster_data(X, y, name, km_k, gmm_k, rdir, pdir, perplexity=30):
    """Generates 2D dataset that contains cluster labels for K-Means and GMM,
    as well as the class labels for the given dataset.

    Both models are first fit on the original attributes to build their
    contingency matrices against ``y`` (handed to
    ``generate_contingency_matrix``), then refit on the t-SNE embedding to
    label the 2D points.  The combined table is saved as ``<name>_2D.csv``.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels.
        name (str): Dataset name.
        km_k (int): Number of clusters for K-Means.
        gmm_k (int): Number of components for GMM.
        rdir (str): Folder to save results CSV.
        pdir (str): Passed through to ``generate_contingency_matrix``
            (presumably the plot output folder -- confirm against that
            helper).
        perplexity (int): Perplexity parameter for t-SNE.

    """
    print('get_cluster_data: %s' % name)
    # generate 2D X dataset
    X2D = TSNE(n_iter=5000, perplexity=perplexity).fit_transform(X)

    # get cluster labels using best k
    km = KMeans(random_state=0).set_params(n_clusters=km_k)
    gmm = GMM(random_state=0).set_params(n_components=gmm_k)
    km_contingency_matrix = contingency_matrix(y, km.fit(X).labels_)
    gm_contingency_matrix = contingency_matrix(y, gmm.fit(X).predict(X))
    # Function-call form: valid on Python 3 and, with a single argument,
    # prints the same text on Python 2.
    print(km.cluster_centers_)
    generate_contingency_matrix(
        km_contingency_matrix,
        gm_contingency_matrix,
        name,
        pdir,
    )
    # refit on the embedding so the labels describe the 2D points
    km_cl = np.atleast_2d(km.fit(X2D).labels_).T
    gmm_cl = np.atleast_2d(gmm.fit(X2D).predict(X2D)).T
    y = np.atleast_2d(y).T

    # create concatenated dataset
    cols = ['x1', 'x2', 'km', 'gmm', 'class']
    df = pd.DataFrame(np.hstack((X2D, km_cl, gmm_cl, y)), columns=cols)

    # save as CSV
    filename = '{}_2D.csv'.format(name)
    save_dataset(df, filename, sep=',', subdir=rdir, header=True)
Exemple #17
0
def class_cluster_match(y_true, y_pred):
    r"""Translate prediction labels to maximize the accuracy.

    Translate the prediction labels of a clustering output to enable calc
    of external metrics (eg. accuracy, f1_score, ...). Translation is done by
    maximization of the confusion matrix :math:`C` main diagonal sum
    :math:`\sum_{i=0}^{K}C_{i, i}`, solved as a linear assignment problem on
    the contingency matrix.

    When the numbers of classes and clusters differ, the shorter side is
    padded with placeholder names (``'DEF_CLASS<i>'`` / ``'DEF_CLUSTER<i>'``);
    clusters left without a real class are therefore translated to a
    ``'DEF_CLASS<i>'`` placeholder.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        Ground truth (correct) target values.
    y_pred : array, shape = [n_samples]
        Estimated targets as returned by a clustering algorithm.

    Returns
    -------
    trans : list, length = n_samples
        ``y_pred`` with every cluster label replaced by the class label it
        was matched to.
    """

    classes = unique_labels(y_true).tolist()
    n_classes = len(classes)
    clusters = unique_labels(y_pred).tolist()
    n_clusters = len(clusters)

    if n_clusters > n_classes:
        classes += [
            'DEF_CLASS' + str(i) for i in range(n_clusters - n_classes)
        ]
    elif n_classes > n_clusters:
        clusters += [
            'DEF_CLUSTER' + str(i) for i in range(n_classes - n_clusters)
        ]

    # Negated so that the minimising solver maximises the matched counts.
    C = contingency_matrix(y_true, y_pred)
    true_idx, pred_idx = linear_sum_assignment(-C)

    # Matched labels first, then whatever was left unmatched on each side,
    # so that both lists line up position by position.
    matched_classes = [classes[idx] for idx in true_idx.tolist()]
    matched_classes += sorted(set(classes) - set(matched_classes))
    matched_clusters = [clusters[idx] for idx in pred_idx.tolist()]
    matched_clusters += sorted(set(clusters) - set(matched_clusters))

    # One dictionary lookup per sample instead of a linear list search.
    cluster_to_class = dict(zip(matched_clusters, matched_classes))

    return [cluster_to_class[y] for y in y_pred]
Exemple #18
0
def doClusters(num_clusters, reducer, X, opt_file, i):
    """Reduce ``X`` to ``i`` dimensions, cluster it with K-Means and log results.

    ``reducer`` selects the reduction: ``'pca'``, one of the kernel PCA
    variants (``'kpca,lin'``, ``'kpca,poly'``, ``'kpca,cos'``,
    ``'kpca,sig'``) or ``'none'``.  With ``'none'`` the function returns
    immediately unless ``i == 141``; in that case the unreduced data is
    clustered and reported as ``'tfidf'`` with 18872 dimensions.

    For ``'pca'`` with ``i == 141`` every component's per-word loadings are
    dumped to ``<folder_name>-out\\pca\\pca-<j>.txt``.

    Reduction and clustering times go to ``time_file``, a row prefix goes to
    ``table``, and the contingency matrix plus Davies-Bouldin score are
    passed to ``write2d``.  Relies on the module-level names ``folder_name``,
    ``words``, ``time_file``, ``table``, ``actuallabels`` and ``write2d``.
    """
    kernel_by_reducer = {
        'kpca,lin': 'linear',
        'kpca,poly': 'poly',
        'kpca,cos': 'cosine',
        'kpca,sig': 'sigmoid',
    }

    start = time.time()
    if reducer == 'pca':
        pca = PCA(n_components=i)
        X = pca.fit_transform(X)
        if i == 141:
            for j in range(i):
                # "with" closes each dump file; the handles were otherwise
                # left open for all ``i`` components.
                with io.open(folder_name + "-out" + "\\pca\\pca-" + str(j) +
                             ".txt",
                             'w',
                             encoding="utf-8") as dump:
                    dump.write("num_words " + str(len(words)) + "\n\n")
                    for val in range(len(pca.components_[j])):
                        dump.write(words[val] + " :" +
                                   str(pca.components_[j][val]) + "\n")
    elif reducer in kernel_by_reducer:
        kpca = KernelPCA(n_components=i, kernel=kernel_by_reducer[reducer])
        X = kpca.fit_transform(X)
    elif reducer == 'none' and i != 141:
        return
    rt = time.time() - start

    start = time.time()
    km = KMeans(n_clusters=num_clusters,
                init='k-means++',
                n_init=20,
                random_state=0)
    y = km.fit_predict(X)
    ct = time.time() - start

    if reducer == 'none':
        reducer = 'tfidf'
        i = 18872

    time_file.write("\n" + reducer + "--" + str(i) + "\n" + str(rt) + "\n" +
                    str(ct) + "\n" + str(rt + ct) + "\n")
    print("reducer: " + reducer + ": " + str(i) + " dims - done")
    confusion = contingency_matrix(actuallabels, y)

    # dimensions kept, as a percentage of 141
    dr = (i / 141) * 100
    db = round(metrics.davies_bouldin_score(X, y), 4)
    table.write(
        str(i) + ", " + str(round(100 - dr, 4)) + ", " + str(db) + ", ")
    write2d(opt_file, reducer + "--" + str(i), confusion, actuallabels, y, db)
Exemple #19
0
def fowlkes_mallows_score(gt_labels, pred_labels, sparse=True):
    """Pairwise precision, pairwise recall and their F-score.

    ``tk``, ``pk`` and ``qk`` are the sums of squared contingency cells,
    column totals and row totals, each reduced by ``n_samples``.  Precision
    is ``tk / pk`` and recall is ``tk / qk``; ``_compute_fscore`` combines
    the two.

    :param gt_labels: 1-D array of ground-truth labels
    :param pred_labels: 1-D array of predicted labels
    :param sparse: build the contingency matrix as a sparse matrix
    :return: tuple ``(avg_pre, avg_rec, fscore)``
    """
    n_samples, = gt_labels.shape

    c = contingency_matrix(gt_labels, pred_labels, sparse=sparse)
    # A sparse matrix exposes its counts as the flat ``.data`` vector.  On a
    # dense array ``.data`` is a 2-D buffer, so the dot product below would
    # turn into a matrix product; flatten the array instead.
    counts = c.data if sparse else np.asarray(c).ravel()
    tk = np.dot(counts, counts) - n_samples
    pk = np.sum(np.asarray(c.sum(axis=0)).ravel()**2) - n_samples
    qk = np.sum(np.asarray(c.sum(axis=1)).ravel()**2) - n_samples
    avg_pre = tk / pk
    avg_rec = tk / qk
    fscore = _compute_fscore(avg_pre, avg_rec)

    return avg_pre, avg_rec, fscore
Exemple #20
0
def k_means(k, dimensions):
    """Run a basic K-Means on ``dimensions`` and return the overall purity.

    Centres are initialised randomly around the per-column mean, scaled by
    the per-column standard deviation, and updated until they stop moving.
    The final clusters are compared with the module-level ``lclass`` labels;
    the purity of each cluster is printed and the overall purity (sum of the
    per-cluster maxima divided by ``len(label)``, another module-level name)
    is returned.

    :param k: number of clusters
    :param dimensions: 2-D array of shape (rows, cols) with the data points
    :return: overall purity
    """
    rows = dimensions.shape[0]
    cols = dimensions.shape[1]

    mn = np.mean(dimensions, axis=0)
    std = np.std(dimensions, axis=0)
    centers = np.random.randn(k, cols) * std + mn

    # to store old centers
    co = np.zeros(centers.shape)
    # to Store new centers
    cn = deepcopy(centers)

    clusters = np.zeros(rows)
    distances = np.zeros((rows, k))

    error = np.linalg.norm(cn - co)

    # When, after an update, the estimate of that center stays the same, exit loop
    while error != 0:
        # Measure the distance to every center
        for i in range(k):
            distances[:, i] = np.linalg.norm(dimensions - cn[i], axis=1)
        # Assign all training data to closest center
        clusters = np.argmin(distances, axis=1)

        co = deepcopy(cn)
        # Calculate mean for every cluster and update the center
        for i in range(k):
            members = dimensions[clusters == i]
            # A cluster that lost all its points keeps its previous centre.
            # The mean of an empty selection is NaN, and a NaN centre makes
            # "error != 0" true forever, so the loop would never terminate.
            if len(members):
                cn[i] = np.mean(members, axis=0)
        error = np.linalg.norm(cn - co)

    cmat = contingency_matrix(clusters, lclass)

    for i, item in enumerate(cmat):
        print("Purity of clusters :", i, " :", max(item) * 100 / sum(item))

    pure = 0
    for row in cmat:
        pure += max(row)
    purity0 = pure / len(label)

    return purity0
def calculate_accuracy(labels, pred_labels):
    """Accuracy after translating predicted labels via the contingency table.

    For every row of the contingency matrix the column with the largest
    count is recorded.  A predicted label ``x`` is translated to one plus the
    position of the first row whose recorded column equals ``x``, or to 0
    when no row recorded it.  The translated labels are scored against
    ``labels`` with ``accuracy_score``.
    """
    table = contingency_matrix(labels, pred_labels)
    best_column_per_row = np.argmax(table, axis=1).tolist()

    # First occurrence wins, mirroring a left-to-right search of the list.
    translation = {}
    for position, column in enumerate(best_column_per_row):
        translation.setdefault(column, position + 1)

    mapped_pred_labels = [translation.get(x, 0) for x in pred_labels]
    return accuracy_score(labels, mapped_pred_labels)
Exemple #22
0
def fms_compare(XX, YY, npoints, plot_title, plot_save):
    """Plot FM scores of two hierarchical clusterings for every cut size.

    Both inputs are clustered with ``hierarchy.linkage`` (method taken from
    the module-level ``clustering_method``).  For each number of clusters
    from 1 to ``npoints`` the two dendrograms are cut, the score returned by
    ``fms`` is stored, and a mean and a two-standard-deviation bound are
    derived from the contingency table's row and column totals.  The scores,
    the mean and the mean +/- bound curves are plotted and saved to
    ``path_fm_plot + plot_save + '.jpg'``.
    """
    # Clustering
    linkage_x = hierarchy.linkage(XX, method=clustering_method)
    linkage_y = hierarchy.linkage(YY, method=clustering_method)

    # Cut dendrogram to obtain labelling for each k value
    # Warning (from the original author): hierarchy.cut_tree is said to have
    # a known bug.
    n = npoints
    fms_dict = {}
    # The last cut has no moments computed below, so both default to 0.
    mean_dict = {n: 0}
    varbound_dict = {n: 0}
    for k in range(1, n + 1):
        labels_x = [lab for row in hierarchy.cut_tree(linkage_x, k) for lab in row]
        labels_y = [lab for row in hierarchy.cut_tree(linkage_y, k) for lab in row]

        # Compute FM scores
        fms_dict[k] = fms(labels_x, labels_y)

        # Compute moments for plotting and analysis
        c = contingency_matrix(labels_x, labels_y, sparse=True)
        col_sums = np.asarray(c.sum(axis=0)).ravel()
        row_sums = np.asarray(c.sum(axis=1)).ravel()
        tk = np.dot(c.data, c.data) - n  # computed but not used below
        pk = np.sum(col_sums ** 2) - n
        qk = np.sum(row_sums ** 2) - n
        pk2 = np.sum(col_sums ** 3) - 3 * (np.sum(col_sums ** 2)) + 2 * (np.sum(col_sums))
        qk2 = np.sum(row_sums ** 3) - 3 * (np.sum(row_sums ** 2)) + 2 * (np.sum(row_sums))
        if k >= n:
            continue

        mean_dict[k] = (np.sqrt(pk * qk)) / (n * (n - 1))
        variance = (
            (2 / (n * (n - 1)))
            + ((4 * pk2 * qk2) / (n * (n - 1) * (n - 2) * pk * qk))
            + (((pk - 2 - ((4 * pk2) / pk)) * (qk - 2 - ((4 * qk2) / qk)))
               / (n * (n - 1) * (n - 2) * (n - 3)))
            - ((pk * qk) / ((n ** 2) * ((n - 1) ** 2)))
        )
        varbound_dict[k] = 2 * (variance ** 0.5)

    # Plot Bk and variance bounds
    ks, scores = zip(*sorted(fms_dict.items()))
    upper = [mean_dict[k] + varbound_dict[k] for k in ks]
    lower = [mean_dict[k] - varbound_dict[k] for k in ks]
    means = [mean_dict[k] for k in ks]

    plt.scatter(ks, scores)
    plt.plot(ks, upper)
    plt.plot(ks, means)
    plt.plot(ks, lower)
    plt.title(plot_title)
    plt.xlabel('# clusters')
    plt.ylabel('B_k')
    plt.savefig(path_fm_plot + plot_save + '.jpg')
    plt.clf()
def eval_cluster(gt_labels, pred_labels, sparse=True):
    """Pairwise recall, precision and F-score of a clustering.

    ``tk``, ``pk`` and ``qk`` are the sums of squared contingency cells,
    column totals and row totals, each reduced by ``n_samples``.  Precision
    is ``tk / pk``, recall is ``tk / qk`` and the F-score is their harmonic
    mean.

    :param gt_labels: 1-D array of ground-truth labels
    :param pred_labels: 1-D array of predicted labels
    :param sparse: build the contingency matrix as a sparse matrix
    :return: tuple ``(rec, pre, fscore)`` -- note recall comes first
    """
    # Unpack the single dimension so n_samples is a scalar; the arithmetic
    # below then yields scalars directly instead of one-element arrays.
    n_samples, = gt_labels.shape
    c = contingency_matrix(gt_labels, pred_labels, sparse=sparse)
    # A sparse matrix exposes its counts as the flat ``.data`` vector.  On a
    # dense array ``.data`` is a 2-D buffer, so the dot product below would
    # turn into a matrix product; flatten the array instead.
    counts = c.data if sparse else np.asarray(c).ravel()
    tk = np.dot(counts, counts) - n_samples
    pk = np.sum(np.asarray(c.sum(axis=0)).ravel() ** 2) - n_samples
    qk = np.sum(np.asarray(c.sum(axis=1)).ravel() ** 2) - n_samples

    pre = tk / pk
    rec = tk / qk
    fscore = 2. * pre * rec / (pre + rec)
    return rec, pre, fscore
Exemple #24
0
def calculate_purity(labels_true, labels_pred):
    """Overall purity and per-cluster purity of a clustering.

    The purity of a cluster is the largest count in its contingency column
    divided by the column total.  The overall purity is the sum of the
    per-cluster purities weighted by each cluster's share of the samples.

    Uses only constructs available on both Python 2 and 3 and on current
    NumPy (no ``xrange``, no ``np.float`` alias).

    :param labels_true: array-like of reference labels (flattened)
    :param labels_pred: array-like of cluster labels (flattened)
    :return: tuple ``(purity, purityVector)`` with one vector entry per
        distinct predicted label
    """
    labels_true = np.array(labels_true)
    labels_true = labels_true.reshape(labels_true.size)
    labels_pred = np.array(labels_pred)
    labels_pred = labels_pred.reshape(labels_pred.size)

    matrix = as_float_array(contingency_matrix(labels_true, labels_pred))

    # One column per distinct predicted label.
    cluster_sizes = np.sum(matrix, axis=0)
    purityVector = np.max(matrix, axis=0) / cluster_sizes
    purity = np.sum(purityVector * cluster_sizes / np.size(labels_pred))
    return purity, purityVector
Exemple #25
0
def _compute_counts(y_true, y_pred):
    """Derive four pair-style counts from the contingency matrix.

    The largest cell of every row and of every column is treated as the
    "same class" part of that row/column and the rest of its total as the
    "different class" part.  From these the function builds

    * true positives: ``max * (max - 1)`` summed over rows,
    * false positives: ``2 * rest * max`` summed over rows,
    * false negatives: ``2 * rest * max`` summed over columns,
    * true negatives: ``total * (total - 1)`` minus the three above.

    Returns ``(true_positives, false_positives, false_negatives,
    true_negatives)``.
    """
    table = contingency_matrix(y_true, y_pred)

    row_peak = table.max(axis=1)
    col_peak = table.max(axis=0)
    row_rest = table.sum(axis=1) - row_peak
    col_rest = table.sum(axis=0) - col_peak
    n_total = table.sum()

    true_positives = (row_peak * (row_peak - 1)).sum()
    false_positives = (row_rest * row_peak * 2).sum()
    false_negatives = (col_rest * col_peak * 2).sum()
    all_pairs = n_total * (n_total - 1)
    true_negatives = (
        all_pairs - true_positives - false_positives - false_negatives)

    return true_positives, false_positives, false_negatives, true_negatives
Exemple #26
0
def IrisKNN(K):
    """Fit a KNN classifier and return the negated label-agnostic hit rate.

    ``K`` supplies the hyper-parameters: ``K[0]`` is ``n_neighbors``,
    ``K[1]`` is ``leaf_size`` and ``K[2]`` is ``p``; each is cast to ``int``.
    The classifier is trained on the module-level ``X_train`` / ``y_train``
    and evaluated on ``X_test`` / ``y_test``.

    The score is the sum, over the rows of the contingency matrix of
    predictions versus test labels, of the largest count in the row, divided
    by the number of predictions.  It measures agreement regardless of the
    names given to the groups.  The value is returned negated (presumably
    because the caller is a minimiser -- confirm against the caller).
    """
    # Set up and train the KNN classifier
    neigh = KNeighborsClassifier(n_neighbors=int(K[0]), leaf_size=int(K[1]),
                                 p=int(K[2]), weights="uniform")
    neigh.fit(X_train, y_train)

    # Predict the labels of the test portion
    y_pred = neigh.predict(X_test)

    # Contingency matrix: rows are predicted groups, columns are test labels
    contMatrix = contingency_matrix(y_pred, y_test)

    # Largest count of each row, summed.  Taking the row-wise maximum works
    # for any matrix shape; walking the columns with the row count raises
    # IndexError when the matrix has more rows than columns.
    totalHits = contMatrix.max(axis=1).sum()

    # Overall share of hits, ignoring the names given to the groups
    globalScore = totalHits / len(y_pred)

    return -globalScore
Exemple #27
0
def fit_labels(gt_labels, tested_labels):
    """Greedily rename tested labels to the ground-truth labels they overlap.

    Tested labels are processed from the most to the least frequent.  Each
    one is renamed to the ground-truth label with the largest count in its
    contingency column; that column and the chosen ground-truth row are then
    filled with -1 so that later labels prefer a different row.

    Columns are addressed by their position among the sorted unique tested
    labels, which is how the contingency matrix orders them, so labels do
    not have to be the integers 0..K-1.  For such labels the result is
    unchanged.

    :param gt_labels: 1-D array of ground-truth labels
    :param tested_labels: 1-D array of labels to rename
    :return: copy of ``tested_labels`` with the renamed values
    """
    gt_unique_classes = np.unique(gt_labels)
    tested_unique_classes, tested_classes_count = np.unique(tested_labels,
                                                            return_counts=True)
    # Column positions ordered by decreasing class frequency.
    columns_by_frequency = np.argsort(-tested_classes_count)
    con_mat = contingency_matrix(gt_labels, tested_labels)
    tested_labels_remap = np.copy(tested_labels)
    for column in columns_by_frequency:
        tested_class = tested_unique_classes[column]
        new_label_idx = np.argmax(con_mat[:, column])
        tested_labels_remap[
            tested_labels == tested_class] = gt_unique_classes[new_label_idx]
        # Retire both the handled column and the ground-truth row just used.
        con_mat[:, column] = -1
        con_mat[new_label_idx, :] = -1
    return tested_labels_remap
Exemple #28
0
def k_means_clustering(training_data,
                       target_labels,
                       title='Contingency Matrix',
                       n_clusters=20,
                       random_state=0,
                       max_iter=1000,
                       n_init=30):
    """Fit K-Means, plot the reordered contingency matrix, report five scores.

    The contingency matrix of ``target_labels`` versus the fitted labels is
    reordered with ``linear_sum_assignment(..., maximize=True)`` before being
    passed to ``plot_contingency_table_20``.  Homogeneity, completeness,
    V-measure, adjusted Rand index and adjusted mutual information are
    printed and returned.

    Returns ``(results, km)``: the dict of scores and the fitted estimator.
    """
    started = time.time()
    km = KMeans(n_clusters=n_clusters,
                random_state=random_state,
                max_iter=max_iter,
                n_init=n_init)
    km.fit(training_data)
    print("Finished clustering in %f seconds" % (time.time() - started))

    table = contingency_matrix(target_labels, km.labels_)
    # reorder to maximize along diagonal
    row_order, col_order = linear_sum_assignment(table, maximize=True)
    reordered = table[np.ix_(row_order, col_order)]

    print("Show Contingency Matrix:")
    plot_contingency_table_20(reordered, title=title)

    print("Report 5 Measures for K-Means Clustering")

    # (result key, printed caption, scoring function), in reporting order
    measures = (
        ("homogeneity", "Homogeneity Score", homogeneity_score),
        ("completeness", "Completeness Score", completeness_score),
        ("v_measure", "V-Measure Score", v_measure_score),
        ("adjusted_rand_index", "Adjusted Rand Index", adjusted_rand_score),
        ("adjusted_mutual_info", "Adjusted Mutual Information",
         adjusted_mutual_info_score),
    )

    results = {}
    for key, _, scorer in measures:
        results[key] = scorer(target_labels, km.labels_)

    for key, caption, _ in measures:
        print("%s: %f" % (caption, results[key]))

    return results, km
Exemple #29
0
def annotation(cellname_train, cellname_test, Y_pred_train, Y_pred_test):
    """Annotate clusters with cell types and score the annotation.

    Every training cell type is linked to the cluster holding most of its
    cells; a type whose best cluster holds less than half of its cells is
    replaced by ``"unassigned"``.  When several types point to the same
    cluster, the one with the highest share wins.  Test cells inherit the
    cell type of their predicted cluster; cells in clusters without an
    annotation are labelled ``"unassigned"``.

    Cell-type names are kept in object arrays while labels are assigned.  A
    fixed-width string array would silently truncate ``"unassigned"`` (or a
    long cell-type name) to the width of the longest existing entry.

    :param cellname_train: array of cell-type names of the training cells
    :param cellname_test: array of cell-type names of the test cells
    :param Y_pred_train: array of predicted clusters of the training cells
    :param Y_pred_test: array of predicted clusters of the test cells
    :return: ``(annotated_train_accuracy, annotated_test_accuracy,
        test_annotation_label)``; accuracies are rounded to 4 decimals and
        the test accuracy is relative to the test cells whose type also
        occurs in the training set
    """
    train_confusion_matrix = contingency_matrix(cellname_train, Y_pred_train)
    # Best cluster and its share of cells, one entry per training cell type.
    annotated_cluster = np.unique(Y_pred_train)[train_confusion_matrix.argmax(
        axis=1)]
    annotated_score = np.max(train_confusion_matrix, axis=1) / np.sum(
        train_confusion_matrix, axis=1)
    annotated_celltype = np.unique(cellname_train).astype(object)
    annotated_celltype[annotated_score < 0.5] = "unassigned"

    # Keep one cell type per cluster: the candidate with the highest share.
    final_annotated_cluster = []
    final_annotated_celltype = []
    for i in np.unique(annotated_cluster):
        candidate_celltype = annotated_celltype[annotated_cluster == i]
        candidate_score = annotated_score[annotated_cluster == i]
        final_annotated_cluster.append(i)
        final_annotated_celltype.append(
            candidate_celltype[np.argmax(candidate_score)])
    annotated_cluster = np.array(final_annotated_cluster)
    annotated_celltype = np.array(final_annotated_celltype, dtype=object)

    succeed_annotated_train = 0
    succeed_annotated_test = 0
    # Everything starts as "unassigned"; annotated clusters overwrite it.
    test_annotation_label = np.full(len(cellname_test), "unassigned",
                                    dtype=object)
    for i in range(len(annotated_cluster)):
        succeed_annotated_train += (
            cellname_train[Y_pred_train == annotated_cluster[i]] ==
            annotated_celltype[i]).sum()
        succeed_annotated_test += (
            cellname_test[Y_pred_test == annotated_cluster[i]] ==
            annotated_celltype[i]).sum()
        test_annotation_label[Y_pred_test ==
                              annotated_cluster[i]] = annotated_celltype[i]
    annotated_train_accuracy = np.around(
        succeed_annotated_train / len(cellname_train), 4)

    # Only test cells whose type was seen in training can be annotated.
    total_overlop_test = 0
    for celltype in np.unique(cellname_train):
        total_overlop_test += (cellname_test == celltype).sum()
    annotated_test_accuracy = np.around(
        succeed_annotated_test / total_overlop_test, 4)

    # Hand back a plain string array, now wide enough for every label.
    test_annotation_label = test_annotation_label.astype(str)
    return annotated_train_accuracy, annotated_test_accuracy, test_annotation_label
Exemple #30
0
def plot_contingency_matrix(ax,
                            labels_audio,
                            labels,
                            cmap=plt.cm.Blues,
                            normalize=True):
    """Draw the contingency matrix of two labelings on ``ax``.

    With ``normalize=True`` (the default) every row is divided by its sum
    and cells are annotated with two decimals.  With ``normalize=False`` the
    raw counts are drawn and annotated as integers, so the integer format is
    never applied to floats (which raises ``ValueError``).

    :param ax: matplotlib axes to draw on
    :param labels_audio: labels shown on the y axis (rows)
    :param labels: labels shown on the x axis (columns)
    :param cmap: colormap passed to ``imshow``
    :param normalize: row-normalise the matrix before plotting
    :return: ``ax``

    Note: calls ``np.set_printoptions(precision=2)``, which changes NumPy's
    global print settings.
    """

    np.set_printoptions(precision=2)
    # Compute contingency matrix
    matrix = contingency_matrix(labels_audio, labels)

    if normalize:
        cm = matrix / matrix.sum(axis=1, keepdims=True)
        title = 'Normalized contingency matrix'
    else:
        cm = matrix
        title = 'Contingency matrix'

    # Only use the labels that appear in the data
    labels_audio = np.unique(labels_audio)  # ylabels
    labels = np.unique(labels)  # xlabels

    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(
        xticks=np.arange(cm.shape[1]),
        yticks=np.arange(cm.shape[0]),
        # ... and label them with the respective list entries
        xticklabels=labels,
        yticklabels=labels_audio,
        title=title,
        ylabel='True label [Audio]',
        xlabel='Predicted label [Subjetive]')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), ha="right", rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells above half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j,
                    i,
                    format(cm[i, j], fmt),
                    ha="center",
                    va="center",
                    color="white" if cm[i, j] > thresh else "black")
    ax.figure.tight_layout()
    return ax
Exemple #31
0
def over_division_cluster_ratio(labels_true, labels_pred):
    """Share of true clusters that are spread over several predicted ones.

    A true cluster counts as over-divided when its row of the contingency
    matrix has more than one non-zero cell.  The number of such rows is
    divided by the number of distinct true labels.
    """
    from sklearn.metrics.cluster import contingency_matrix
    true_arr = np.array(labels_true)
    pred_arr = np.array(labels_pred)
    n_cluster = np.unique(true_arr).shape[0]

    table = contingency_matrix(true_arr, pred_arr, sparse=True)
    occupied_rows, _ = table.nonzero()

    # How many predicted clusters each true cluster overlaps.
    _, overlaps_per_row = np.unique(occupied_rows, return_counts=True)
    n_fail_cluster = sum(1 for overlaps in overlaps_per_row if overlaps > 1)

    return 1. * n_fail_cluster / n_cluster
Exemple #32
0
def rel_purity(y_true, y_pred):
    """Purity where each cluster's class is chosen by relative overlap.

    Every row of the contingency matrix (one per true class) is divided by
    its total, giving the share of each class that falls into each cluster.
    For every cluster (column) the class with the largest share is selected,
    and the raw counts of the selected cells are summed and divided by the
    total number of samples.

    :param y_true: 1-D array-like of true class labels
    :param y_pred: 1-D array-like of cluster labels
    :return: relative purity
    """
    cm = contingency_matrix(y_true, y_pred)

    # Relative contingency matrix: each row scaled by its class size.
    rm = cm / np.sum(cm, axis=1, keepdims=True)

    # Row of the best class for each column, then the raw counts there.
    max_indexes = np.argmax(rm, axis=0)
    matched = cm[max_indexes, np.arange(cm.shape[1])].sum()
    return matched / np.sum(cm)
def fowlkes_mallows_score(gt_labels, pred_labels, sparse=True):
    ''' The original function is from `sklearn.metrics.fowlkes_mallows_score`.
        We output the pairwise precision, pairwise recall and F-measure,
        instead of calculating the geometry mean of precision and recall.

        gt_labels and pred_labels are 1-D label arrays; `sparse` selects a
        sparse or dense contingency matrix.  Returns the tuple
        (avg_pre, avg_rec, fscore).
    '''
    n_samples, = gt_labels.shape

    c = contingency_matrix(gt_labels, pred_labels, sparse=sparse)
    # A sparse matrix exposes its counts as the flat ``.data`` vector.  On a
    # dense array ``.data`` is a 2-D buffer, so the dot product below would
    # turn into a matrix product; flatten the array instead.
    counts = c.data if sparse else np.asarray(c).ravel()
    tk = np.dot(counts, counts) - n_samples
    pk = np.sum(np.asarray(c.sum(axis=0)).ravel()**2) - n_samples
    qk = np.sum(np.asarray(c.sum(axis=1)).ravel()**2) - n_samples

    avg_pre = tk / pk
    avg_rec = tk / qk
    fscore = _compute_fscore(avg_pre, avg_rec)

    return avg_pre, avg_rec, fscore
Exemple #34
0
def GMM_fun(dimensions, n_components=5):
    """Cluster ``dimensions`` with a Gaussian mixture and report its purity.

    The predicted components are compared with the module-level ``lclass``
    labels.  The purity of every component is printed, followed by the
    overall purity (sum of the per-component maxima divided by
    ``len(label)``, another module-level name), which is also returned.

    :param dimensions: 2-D array of data points
    :param n_components: number of mixture components (default 5)
    :return: overall purity
    """
    model = GaussianMixture(n_components=n_components).fit(dimensions)
    gmmlabel = model.predict(dimensions)

    cmat = contingency_matrix(gmmlabel, lclass)

    for i, item in enumerate(cmat):
        print("Purity of clusters :", i, " :", max(item) * 100 / sum(item))

    pure1 = 0
    for row in cmat:
        pure1 += max(row)
    purity1 = pure1 / len(label)
    print('GMM Purity:', purity1)
    return purity1
def test_dbscan_optics_parity(eps, min_samples):
    """OPTICS with ``cluster_method='dbscan'`` must be within 5% of DBSCAN."""
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, _ = make_blobs(n_samples=750, centers=blob_centers,
                      cluster_std=0.4, random_state=0)
    n_samples = X.shape[0]

    # OPTICS configured to extract DBSCAN-style clusters at the given eps.
    optics = OPTICS(min_samples=min_samples, cluster_method='dbscan',
                    eps=eps).fit(X)

    # Reference DBSCAN run with the same parameters.
    dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(X)

    # Agreement is the smaller of the two "best match" totals, taken once
    # per column and once per row of the contingency table.
    table = contingency_matrix(dbscan.labels_, optics.labels_)
    best_by_column = np.sum(np.max(table, axis=0))
    best_by_row = np.sum(np.max(table, axis=1))
    disagree = n_samples - min(best_by_column, best_by_row)

    percent_mismatch = np.round((disagree - 1) / n_samples, 2)

    assert percent_mismatch <= 0.05
def test_adjusted_rand_score_sparse():
    """ARI from labels must match ARI from a precomputed sparse contingency."""
    first = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    second = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    sparse_table = contingency_matrix(first, second, sparse=True)

    from_labels = adjusted_rand_score(first, second)
    from_table = adjusted_rand_score(None, None, contingency=sparse_table)
    assert_almost_equal(from_labels, from_table)
def test_contingency_matrix_sparse():
    """The sparse contingency matrix, densified, must equal the dense one."""
    first = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    second = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])

    dense = contingency_matrix(first, second)
    densified = contingency_matrix(first, second, sparse=True).toarray()
    assert_array_almost_equal(dense, densified)