def test_adjusted_mutual_info_score():
    # Compute the Adjusted Mutual Information and test against known values
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    # Mutual information
    mi = mutual_info_score(labels_a, labels_b, log_base='e')
    assert_almost_equal(mi, 0.41022, 5)
    # with provided sparse contingency
    C = contingency_matrix(labels_a, labels_b, sparse=True)
    mi = mutual_info_score(labels_a, labels_b, contingency=C, log_base='e')
    assert_almost_equal(mi, 0.41022, 5)
    # with provided dense contingency
    C = contingency_matrix(labels_a, labels_b)
    mi = mutual_info_score(labels_a, labels_b, contingency=C, log_base='e')
    assert_almost_equal(mi, 0.41022, 5)
    # Expected mutual information
    n_samples = C.sum()
    emi = expected_mutual_information(C, n_samples, log_base='e')
    assert_almost_equal(emi, 0.15042, 5)
    # Adjusted mutual information
    ami = adjusted_mutual_info_score(labels_a, labels_b, log_base='e')
    assert_almost_equal(ami, 0.27502, 5)
    ami = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
    assert_equal(ami, 1.0)
    # Test with a very large array
    a110 = np.array([list(labels_a) * 110]).flatten()
    b110 = np.array([list(labels_b) * 110]).flatten()
    ami = adjusted_mutual_info_score(a110, b110, log_base='e')
    # This is not accurate to more than 2 places
    assert_almost_equal(ami, 0.37, 2)
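For reference, here is a minimal sketch (not the library implementation) of how AMI is assembled from the quantities the test above checks, assuming the 'arithmetic' average method that newer scikit-learn releases use by default; contingency_matrix and expected_mutual_information are the same helpers the test imports:

import numpy as np
from sklearn.metrics import mutual_info_score
from sklearn.metrics.cluster import contingency_matrix, expected_mutual_information

def ami_sketch(labels_a, labels_b):
    # MI of the labelings and its expected value under random labelings
    # with the same marginals
    C = contingency_matrix(labels_a, labels_b)
    mi = mutual_info_score(labels_a, labels_b)
    emi = expected_mutual_information(C, int(C.sum()))

    def entropy(labels):
        _, counts = np.unique(labels, return_counts=True)
        p = counts / counts.sum()
        return -np.sum(p * np.log(p))

    # 'arithmetic' normalizer; min, max and geometric mean are the other options
    normalizer = 0.5 * (entropy(labels_a) + entropy(labels_b))
    return (mi - emi) / (normalizer - emi)

With the labels_a/labels_b arrays above, this lands near the 0.27821 that the arithmetic-mean variant of the test asserts.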
Example #2
def test_adjusted_mutual_info_score():
    # Compute the Adjusted Mutual Information and test against known values
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    # Mutual information
    mi = mutual_info_score(labels_a, labels_b)
    assert_almost_equal(mi, 0.41022, 5)
    # with provided sparse contingency
    C = contingency_matrix(labels_a, labels_b, sparse=True)
    mi = mutual_info_score(labels_a, labels_b, contingency=C)
    assert_almost_equal(mi, 0.41022, 5)
    # with provided dense contingency
    C = contingency_matrix(labels_a, labels_b)
    mi = mutual_info_score(labels_a, labels_b, contingency=C)
    assert_almost_equal(mi, 0.41022, 5)
    # Expected mutual information
    n_samples = C.sum()
    emi = expected_mutual_information(C, n_samples)
    assert_almost_equal(emi, 0.15042, 5)
    # Adjusted mutual information
    ami = adjusted_mutual_info_score(labels_a, labels_b)
    assert_almost_equal(ami, 0.27821, 5)
    ami = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
    assert ami == pytest.approx(1.0)
    # Test with a very large array
    a110 = np.array([list(labels_a) * 110]).flatten()
    b110 = np.array([list(labels_b) * 110]).flatten()
    ami = adjusted_mutual_info_score(a110, b110)
    assert_almost_equal(ami, 0.38, 2)
Example #3
def test_adjusted_mutual_info_score():
    # Compute the Adjusted Mutual Information and test against known values
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    # Mutual information
    mi = mutual_info_score(labels_a, labels_b)
    assert_almost_equal(mi, 0.41022, 5)
    # with provided sparse contingency
    C = contingency_matrix(labels_a, labels_b, sparse=True)
    mi = mutual_info_score(labels_a, labels_b, contingency=C)
    assert_almost_equal(mi, 0.41022, 5)
    # with provided dense contingency
    C = contingency_matrix(labels_a, labels_b)
    mi = mutual_info_score(labels_a, labels_b, contingency=C)
    assert_almost_equal(mi, 0.41022, 5)
    # Expected mutual information
    n_samples = C.sum()
    emi = expected_mutual_information(C, n_samples)
    assert_almost_equal(emi, 0.15042, 5)
    # Adjusted mutual information
    ami = adjusted_mutual_info_score(labels_a, labels_b)
    assert_almost_equal(ami, 0.27502, 5)
    ami = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
    assert_equal(ami, 1.0)
    # Test with a very large array
    a110 = np.array([list(labels_a) * 110]).flatten()
    b110 = np.array([list(labels_b) * 110]).flatten()
    ami = adjusted_mutual_info_score(a110, b110)
    # This is not accurate to more than 2 places
    assert_almost_equal(ami, 0.37, 2)
Example #4
def Porownaj_algorytmy(data, klasy, labels, method, baza):
    """
    Computes the AM, AR and FM indices for all algorithms except the one
    written by me.
    """

    wektor = []
    test = [0] * len(method)
    i = 0
    # linkage algorithms
    for name in method:
        Z = linkage(data, name)
        test[i] = cluster.hierarchy.cut_tree(Z, klasy)
        test[i] = [y for x in test[i] for y in x]
        wektor.append([fowlkes_mallows_score(labels, test[i]), adjusted_mutual_info_score(labels, test[i]), adjusted_rand_score(labels, test[i]), baza])
        i += 1
    # the genieclust algorithm
    wynikMG = genieclust.genie.Genie(n_clusters=klasy).fit_predict(data)
    wektor.append([fowlkes_mallows_score(labels, wynikMG), adjusted_mutual_info_score(labels, wynikMG), adjusted_rand_score(labels, wynikMG), baza])

    # MeanShift
    wynikCL = MeanShift(bandwidth=klasy).fit(data).labels_
    wektor.append([fowlkes_mallows_score(labels, wynikCL), adjusted_mutual_info_score(labels, wynikCL), adjusted_rand_score(labels, wynikCL), baza])

    # AgglomerativeClustering
    wynikFA = AgglomerativeClustering(n_clusters=klasy).fit(data).labels_
    wektor.append([fowlkes_mallows_score(labels, wynikFA), adjusted_mutual_info_score(labels, wynikFA), adjusted_rand_score(labels, wynikFA), baza])

    # KMeans
    wynikKM = KMeans(n_clusters=klasy, random_state=123).fit(data).labels_
    wektor.append([fowlkes_mallows_score(labels, wynikKM), adjusted_mutual_info_score(labels, wynikKM), adjusted_rand_score(labels, wynikKM), baza])

    # row labels in the same order the rows were appended
    index = ["single", "complete", "average", "weighted", "centroid", "median", "ward", "genieclust", "MeanShift", "AgglomerativeClustering", "KMeans"]

    return pd.DataFrame(wektor, index=index, columns=["FM", "AM", "AR", "Dane"])
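A hypothetical invocation of the helper above (it assumes the snippet's own imports are in scope: scipy's linkage and cluster.hierarchy, sklearn.cluster, genieclust and pandas):

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
methods = ["single", "complete", "average", "weighted", "centroid", "median", "ward"]
# iris has 3 classes; "iris" just tags the rows in the "Dane" column
print(Porownaj_algorytmy(X, 3, y, methods, "iris"))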
Example #5
def test_exactly_zero_info_score():
    # Check numerical stability when information is exactly zero
    for i in np.logspace(1, 4, 4).astype(int):
        labels_a, labels_b = (np.ones(i, dtype=int), np.arange(i, dtype=int))
        assert normalized_mutual_info_score(labels_a, labels_b) == 0.0
        assert v_measure_score(labels_a, labels_b) == 0.0
        assert adjusted_mutual_info_score(labels_a, labels_b) == 0.0
        assert normalized_mutual_info_score(labels_a, labels_b) == 0.0
        for method in ["min", "geometric", "arithmetic", "max"]:
            assert adjusted_mutual_info_score(labels_a, labels_b,
                                              method) == 0.0
            assert normalized_mutual_info_score(labels_a, labels_b,
                                                method) == 0.0
Example #6
def test_exactly_zero_info_score():
    # Check numerical stability when information is exactly zero
    for i in np.logspace(1, 4, 4).astype(int):
        labels_a, labels_b = (np.ones(i, dtype=int),
                              np.arange(i, dtype=int))
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(v_measure_score(labels_a, labels_b), 0.0)
        assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        for method in ["min", "geometric", "arithmetic", "max"]:
            assert adjusted_mutual_info_score(labels_a, labels_b,
                                              method) == 0.0
            assert normalized_mutual_info_score(labels_a, labels_b,
                                                method) == 0.0
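A small demo (a sketch, not part of the test suite) of what this chance correction buys: NMI between two independent random labelings is noticeably positive, while AMI stays near zero.

import numpy as np
from sklearn.metrics import adjusted_mutual_info_score, normalized_mutual_info_score

rng = np.random.RandomState(0)
a = rng.randint(0, 10, size=200)
b = rng.randint(0, 10, size=200)
print(normalized_mutual_info_score(a, b))  # clearly above 0
print(adjusted_mutual_info_score(a, b))    # close to 0, can be slightly negative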
Example #7
def clustering(label_file, embedding_file, embedding_dim, clusters):
    print(
        'performing kmeans clustering -------------------------------------------'
    )

    embeddings = np.fromfile(embedding_file,
                             np.float32).reshape(-1, embedding_dim)
    #embeddings = read_embeddings_avg(embedding_file)
    kmeans = KMeans(n_clusters=clusters, random_state=0).fit(embeddings)
    node_labels = kmeans.labels_

    with open(label_file, 'r') as f:
        reader = csv.reader(f, delimiter=' ')
        label_list = list(reader)

    labels = []
    for item in label_list:
        labels.append(int(item[1]))

    nmi_score = normalized_mutual_info_score(node_labels, labels)
    adj_score = adjusted_mutual_info_score(node_labels, labels)

    print(nmi_score)
    print(adj_score)

    return nmi_score
Example #8
    def show_result(self, prediction, msg):
        new_line(50)
        print(msg)
        new_line(50)

        real = self.train_labels

        print "Confusion Matrix: "
        print str(confusion_matrix(real, prediction))

        homo_score = homogeneity_score(real, prediction)
        complete_score = completeness_score(real, prediction)
        v_score = v_measure_score(real, prediction)
        rand_score = adjusted_rand_score(real, prediction)
        mutual_info = adjusted_mutual_info_score(real, prediction)

        print("Homogeneity Score: %0.3f" % homo_score)
        print("Completeness Score: %0.3f" % complete_score)
        print("V-measure: %0.3f" % v_score)
        print("Adjusted Rand Score: %0.3f" % rand_score)
        print("Adjusted Mutual Info Score: %0.3f\n" % mutual_info)

        return {
            'Homogeneity': homo_score,
            'Completeness': complete_score,
            'V-measure': v_score,
            'RAND': rand_score,
            'Mutual': mutual_info
        }
Example #9
def cluster_acc(y_true, y_pred):
    """
    calculating the accuracy of the clustering.
    since the index of each cluster might be different in y_true and y_pred, this function finds the linear
    assignment which maximizes the accuracy. This means some of the clusters might remain without a matching label.
    :param y_true: ground truth labeling
    :param y_pred: calculated from the model
    :return: the accuracy percentage, ami, nmi and the matrix w of all the combinations of indexes of the original clusters
    and the calculated ones
    """
    assert y_pred.size == y_true.size
    y_true_unique = np.unique(y_true)
    true_cluster_idx = np.nonzero(y_true[:, None] == y_true_unique)[1]
    D = max(y_pred.max()+1, len(y_true_unique)) # number of clusters
    w = np.zeros((D, len(y_true_unique)), dtype=np.int64) # D is in size number of clusters*number of clusters
    for i in range(y_pred.size):
        w[y_pred[i], true_cluster_idx[i]] += 1
    ind = linear_assignment(w.max() - w)
    # calculating the corresponding gt label most fit for each y_pred. since there are usually a lot of clusters,
    # the ones which didn't correspond to a value in the gt will receive the value -1
    y_pred_new = -1 * np.ones(len(y_pred), int)
    for i in range(0, len(y_pred)):
        j = np.argwhere(ind[:, 0] == y_pred[i])
        if j.shape[0] > 0:
            y_pred_new[i] = (ind[j[0], 1])
    acc = sum([w[i, j] for i, j in ind])*1.0/y_pred.size
    ami = adjusted_mutual_info_score(y_true, y_pred)
    nmi = normalized_mutual_info_score(y_true, y_pred)
    return acc, ami, nmi, w, y_pred_new
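Hypothetical usage of cluster_acc (it assumes the snippet's own imports, including the deprecated sklearn linear_assignment helper it calls):

import numpy as np

y_true = np.array([0, 0, 1, 1, 2, 2])
y_pred = np.array([1, 1, 0, 0, 2, 2])  # same partition, cluster ids permuted
acc, ami, nmi, w, y_pred_new = cluster_acc(y_true, y_pred)
print(acc, ami, nmi)  # acc is 1.0; ami and nmi are 1.0 up to float error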
Example #10
def get_evaluation_metric_value(m_predicted_Y_file_path, m_actual_Y_file_path, m_evaluation_metric):
	print(m_predicted_Y_file_path, m_actual_Y_file_path)
	y_true, y_pred = get_pred_and_actual_y_arrays(m_predicted_Y_file_path, m_actual_Y_file_path)
	try:
		metric_value = 0
		if(m_evaluation_metric == EvaluationMetricType.ACCURACY):
			from sklearn.metrics import accuracy_score
			metric_value = accuracy_score(y_true, y_pred)
		elif(m_evaluation_metric == EvaluationMetricType.PRECISION):
			from sklearn.metrics import precision_score
			metric_value = precision_score(y_true, y_pred) #TODO
		elif(m_evaluation_metric == EvaluationMetricType.RECALL):
			from sklearn.metrics import recall_score
			metric_value = recall_score(y_true, y_pred) #TODO
		elif(m_evaluation_metric == EvaluationMetricType.F1_SCORE):
			from sklearn.metrics import f1_score
			metric_value = f1_score(y_true, y_pred) #TODO
		elif(m_evaluation_metric == EvaluationMetricType.ADJUSTED_MUTUTAL_INFO):
			from sklearn.metrics.cluster import adjusted_mutual_info_score
			metric_value = adjusted_mutual_info_score(y_true, y_pred)
		else:
			from sklearn.metrics import mean_squared_error
			metric_value = mean_squared_error(y_true, y_pred)
		return metric_value
	except Exception as e:
		raise Exception("SKLEARN_ERROR",e)
Example #11
def plot_MI_distance(data, ax, marker, dmax=100, method='NMI'):
    if len(data.shape) == 1:
        assert (len(data) % dmax == 0)
        data = data.reshape(len(data) // dmax, dmax)
    N, dmax = data.shape
    print(dmax)
    if method == 'NMI':
        NMIs = np.zeros(dmax)
        for d in range(1, dmax):
            NMIs[d] = normalized_mutual_info_score(data[:, 0], data[:, d])
        MIs = NMIs

    if method == 'AMI':
        AMIs = np.zeros(dmax)
        for d in range(1, dmax):
            AMIs[d] = adjusted_mutual_info_score(data[:, 0], data[:, d])
        MIs = AMIs

    if method == 'self_NMI':
        self_NMIs = np.zeros(dmax)
        for d in range(1, dmax):
            self_NMIs[d] = MI(data[:, 0], data[:, d])
        MIs = self_NMIs
    # return ax.plot(range(1,dmax), MIs[1:dmax],'o')
    return ax.plot(np.log10(range(1, dmax)), np.log10(MIs[1:dmax]), marker)[0]
Example #12
 def correlation(self, X, Y, heatmap=False):
     nb_classes = len(set(Y))
     print(nb_classes)
     km = KMeans(n_clusters=nb_classes, random_state=0).fit(X)
     label_kmeans = km.labels_
     purity = metric.compute_purity(label_kmeans, Y)
     nmi = normalized_mutual_info_score(Y, label_kmeans)
     ari = adjusted_rand_score(Y, label_kmeans)
     homogeneity = homogeneity_score(Y, label_kmeans)
     ami = adjusted_mutual_info_score(Y, label_kmeans)
     print('NMI = {}, ARI = {}, Purity = {},AMI = {}, Homogeneity = {}'.
           format(nmi, ari, purity, ami, homogeneity))
     if heatmap:
         x_ticks = [''] * len(Y)
         y_ticks = [''] * len(Y)
         idx = []
         for i in range(nb_classes):
             sub_idx = [j for j, item in enumerate(Y) if item == i]
             idx += [j for j, item in enumerate(Y) if item == i]
             x_ticks[len(idx) - 1] = str(i)
         assert len(idx) == len(Y)
         X = X[idx, :]
         Y = Y[idx]
         #similarity_mat = pairwise_distances(X,metric='cosine')
         similarity_mat = cosine_similarity(X)
         #sns.heatmap(similarity_mat,cmap='Blues')
         fig, ax = plt.subplots()
         #ax.set_yticks(range(len(y_ticks)))
         ax.set_yticklabels(y_ticks)
         ax.set_xticks(range(len(x_ticks)))
         ax.set_xticklabels(x_ticks)
         im = ax.imshow(similarity_mat, cmap='Blues')
         plt.colorbar(im)
         plt.savefig('heatmap_%s_dim%d.png' % (self.name, X.shape[1]),
                     dpi=600)
Example #13
    def elem_clustering(self,X,X_dec,y):
        mse = self.elem_calc_mse(X,X_dec)

        # re-run KMeans until the two cluster centers come out in ascending order
        key = False
        while not key:
            clst = KMeans(n_clusters=2).fit(mse[:, np.newaxis])
            if clst.cluster_centers_[0] < clst.cluster_centers_[1]:
                key = True
        
        ami = adjusted_mutual_info_score(y,clst.labels_)
        ari = adjusted_rand_score(y,clst.labels_)
        print(
            "adjusted_mutual_info_score : {0}\nadjusted_rand_score : {1}"
            .format(ami,ari)
            )
        if ami == 1.0 and ari == 1.0:
            thresh = sum(clst.cluster_centers_) / clst.n_clusters
            clf = KNeighborsClassifier(
                n_neighbors = 2
            ).fit(mse[:,np.newaxis],y)
            print(
                "Classification border was formed.\nCluster center = {0}\nClassification border threshold = {1}"
            .format(clst.cluster_centers_,thresh))
        else:
            print ("Cannot define classification border threshold!!")
            thresh = None
            clf = None

        return mse,thresh,clst,clf
Example #14
def calculate_NMI(cluster_assignments, true_classes):
    """ The function is to calculate NMI (the normalized mutual information) metric.

    Let C denote the set of clusters obtained from the ground truth and C' obtained
    from an algorithm. Their mutual information metric MI(C, C') is defined as follows:

    MI(C, C') = sum_{ci in C, cj' in C'} p(ci, cj') * log2 (p(ci, cj') /(p(ci)p(cj')))

    where p(ci) and p(cj') are the probabilities that a data sample arbitrarily selected
    from the data set belongs to the clusters ci and cj', respectively, and p(ci, cj')
    is the joint probability that the arbitrarily selected data sample belongs to the
    clusters ci as well as cj' at the same time.

    Then the NMI is calculated as:

           NMI(C, C') = MI(C, C') / max(H(C), H(C'))

    where H(C) and H(C') are the entropies of C and C', respectively. It is easy to
    check that NMI(C, C') ranges from 0 to 1. NMI = 1 if two sets of clusters are identical,
    and NMI = 0 if the two sets are independent.

    Args:
        cluster_assignments (numpy array): an array contains cluster ids indicating the clustering
                assignment of each data point with the same order in the data set

        true_classes (numpy array): an array contains class ids indicating the true labels of each
                data point with the same order in the data set

    Returns:
        A number between 0 and 1.

    Note: despite its name, this implementation delegates to
    adjusted_mutual_info_score, i.e. it returns the chance-adjusted variant
    rather than the NMI defined above.
    """

    return adjusted_mutual_info_score(cluster_assignments, true_classes)
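The docstring above describes plain NMI while the body delegates to AMI; as a point of comparison, a small numpy sketch of the documented formula (log base 2 and max-normalization, as stated) might look like this:

import numpy as np

def nmi_sketch(cluster_assignments, true_classes):
    # joint distribution p(ci, cj') from the contingency counts
    _, ci = np.unique(cluster_assignments, return_inverse=True)
    _, ti = np.unique(true_classes, return_inverse=True)
    joint = np.zeros((ci.max() + 1, ti.max() + 1))
    for i, j in zip(ci, ti):
        joint[i, j] += 1
    joint /= joint.sum()
    pi, pj = joint.sum(axis=1), joint.sum(axis=0)
    nz = joint > 0
    mi = np.sum(joint[nz] * np.log2((joint / np.outer(pi, pj))[nz]))
    entropy = lambda p: -np.sum(p[p > 0] * np.log2(p[p > 0]))
    return mi / max(entropy(pi), entropy(pj))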
Example #15
File: nmi.py Project: esteng/ULD
def ami(true_list, pred_list):
    """
    get the adjusted mutual information (correcting for agreement occurring by chance)
    between the predicted and the true alignment
    """
    true_list = [int(x) for x in true_list]
    pred_list = [int(x) for x in pred_list]
    return adjusted_mutual_info_score(true_list, pred_list)
Example #16
    def calculate_scores(self):
        x, c, labels = self.x, self.c, self.labels
        self.v_measure = v_measure_score(c, labels)
        self.complete = completeness_score(c, labels)
        self.adjusted_mutual = adjusted_mutual_info_score(c, labels)
        self.adjusted_rand = adjusted_rand_score(c, labels)
        self.silhouette = silhouette_score(x, c)
        self.purity, self.partial_purity = self.__purity__()
Example #17
File: eval.py Project: dbrg77/scDEC
def cluster_eval(labels_true, labels_infer):
    purity = metric.compute_purity(labels_infer, labels_true)
    nmi = normalized_mutual_info_score(labels_true, labels_infer)
    ari = adjusted_rand_score(labels_true, labels_infer)
    homogeneity = homogeneity_score(labels_true, labels_infer)
    ami = adjusted_mutual_info_score(labels_true, labels_infer)
    #print('NMI = {}, ARI = {}, Purity = {},AMI = {}, Homogeneity = {}'.format(nmi,ari,purity,ami,homogeneity))
    return nmi, ari, homogeneity  # note: ami is computed above but not returned
Example #18
def print_scores(labels, predicted, svd):
    print "Homogeneity: " + str(homogeneity_score(labels, predicted))
    print "completeness: " + str(completeness_score(labels, predicted))
    print "V-measure: " + str(v_measure_score(labels, predicted))
    print "RAND score: " + str(adjusted_rand_score(labels, predicted))
    print "Mutual Info: " + str(adjusted_mutual_info_score(labels, predicted))
    ret = []
    ret.append(homogeneity_score(labels, predicted))
    ret.append(completeness_score(labels, predicted))
    ret.append(v_measure_score(labels, predicted))
    ret.append(adjusted_rand_score(labels, predicted))
    ret.append(adjusted_mutual_info_score(labels, predicted))
    if svd:
        svd_all.append(ret)
    else:
        nmf_all.append(ret)
    return homogeneity_score(labels, predicted)
Example #20
def test_exactly_zero_info_score():
    """Check numerical stability when information is exactly zero"""
    for i in np.logspace(1, 4, 4).astype(int):
        labels_a, labels_b = np.ones(i, dtype=int), np.arange(i, dtype=int)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(v_measure_score(labels_a, labels_b), 0.0)
        assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
Example #21
def evaluateEmbeddingsKMeans(feature_data):
    X, y = feature_data['features'], feature_data['labels']
    num_classes = len(set(list(y.flatten())))
    kmeans_preds = KMeans(n_clusters=num_classes,
                          random_state=7123).fit_predict(X)
    NMI = normalized_mutual_info_score(kmeans_preds, y)
    AMI = adjusted_mutual_info_score(kmeans_preds, y)
    print("KMeans (w/ oracle k) AMI Score: {}".format(AMI))
    print("KMeans (w/ oracle k) NMI Score: {}".format(NMI))
Example #22
    def calc_performance_score(self, algo_type: str, predicted, y_train):
        homo_score = homogeneity_score(y_train, predicted)
        complete_score = completeness_score(y_train, predicted)
        adj_mutual_info_score = adjusted_mutual_info_score(
            y_train, predicted)
        print(algo_type + ' homo_score ' + "{:.2f}".format(homo_score))
        print(algo_type + ' complete_score ' + "{:.2f}".format(complete_score))
        print(algo_type + ' adj_mutual_info_score ' +
              "{:.2f}".format(adj_mutual_info_score))
Example #23
def get_landmarking(dataset_name, df):
    start = time.time()
    record = {'dataset': dataset_name.split('.')[0]}
    results = []
    n_samples = int(len(df)*0.1) if len(df) > 400 else min(df.shape[0], 40)
    data = df.sample(n=n_samples, replace=False)
    labels = get_dbscan(data)
    k = len(np.unique(labels))
    labels2 = get_Kmeans(data, k, 40)
    full_tree = DecisionTreeClassifier()
    full_tree.fit(data, labels)
    worst_attr = np.argmin(full_tree.feature_importances_)

    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3)
    best_stump = DecisionTreeClassifier(max_depth=1)
    random_stump = DecisionTreeClassifier(splitter="random", max_depth=1)
    worst_stump = DecisionTreeClassifier(max_depth=1)
    elite_knn = KNeighborsClassifier(n_neighbors=1)
    one_knn = KNeighborsClassifier(n_neighbors=1,
            algorithm="auto",
            weights="uniform",
            p=2,
            metric="minkowski")
    nb = GaussianNB()
    lda = LinearDiscriminantAnalysis()
    best_stump.fit(X_train, y_train)
    random_stump.fit(X_train, y_train)
    worst_stump.fit(X_train.iloc[:, worst_attr].values.reshape(-1, 1), y_train)
    elite_knn.fit(X_train, y_train)
    one_knn.fit(X_train, y_train)
    # lda.fit(X_train, y_train)
    nb.fit(X_train, y_train)

    record['LM1'] = np.log2(df.shape[0])
    record['LM2'] = np.log2(df.shape[1])
    record['LM3'] = accuracy_score(best_stump.predict(X_test), y_test)
    # record['LM4'] = f1_score(best_stump.predict(X_test), y_test, average='weighted')
    record['LM5'] = accuracy_score(random_stump.predict(X_test), y_test)
    # record['LM6'] = f1_score(random_stump.predict(X_test), y_test, average='weighted')
    # record['LM7'] = model.inertia_
    record['LM8'] = accuracy_score(elite_knn.predict(X_test), y_test)
    # record['LM9'] = f1_score(elite_knn.predict(X_test), y_test, average='weighted')
    # record['LM10'] = accuracy_score(lda.predict(X_test), y_test)
    # record['LM11'] = f1_score(lda.predict(X_test), y_test, average='weighted')
    record['LM12'] = accuracy_score(nb.predict(X_test), y_test)
    # record['LM13'] = f1_score(nb.predict(X_test), y_test, average='weighted')
    record['LM14'] = accuracy_score(one_knn.predict(X_test), y_test)
    # record['LM15'] = f1_score(one_knn.predict(X_test), y_test, average='weighted')
    record['LM16'] = accuracy_score(worst_stump.predict(X_test.iloc[:, worst_attr].values.reshape(-1, 1)), y_test)
    # record['LM17'] = f1_score(worst_stump.predict(X_test.iloc[:, worst_attr].values.reshape(-1, 1)), y_test, average='weighted')
    record['LM18'] = adjusted_rand_score(labels, labels2)
    record['LM19'] = adjusted_mutual_info_score(labels, labels2)
    record['LM20'] = completeness_score(labels, labels2)
    record['LM21'] = fowlkes_mallows_score(labels, labels2)

    end = time.time()
    return record, (df.shape[0], df.shape[1], end-start)
Example #24
def print_scores(labels, predicted):
    print "Contingency: "
    print str(confusion_matrix(labels, predicted))

    ret = []
    ret.append(homogeneity_score(labels, predicted))
    ret.append(completeness_score(labels, predicted))
    ret.append(v_measure_score(labels, predicted))
    ret.append(adjusted_rand_score(labels, predicted))
    ret.append(adjusted_mutual_info_score(labels, predicted))

    print "Homogeneity: " + str(homogeneity_score(labels, predicted))
    print "completeness: " + str(completeness_score(labels, predicted))
    print "V-measure: " + str(v_measure_score(labels, predicted))
    print "RAND score: " + str(adjusted_rand_score(labels, predicted))
    print "Mutual Info: " + str(adjusted_mutual_info_score(labels, predicted))

    return ret
Example #25
def validate( measure, classes, clustering ):
	if measure == "nmi":
		return normalized_mutual_info_score( classes, clustering )
	elif measure == "ami":
		return adjusted_mutual_info_score( classes, clustering )
	elif measure == "ari":
		return adjusted_rand_score( classes, clustering )
	log.error("Unknown validation measure: %s" % measure )
	return None
Example #26
def five_measure_scores(label_true, label_pred):
    print("Homogeneity_score = %f" % homogeneity_score(label_true, label_pred))
    print("Completeness_score = %f" %
          completeness_score(label_true, label_pred))
    print("Adjusted_rand_score = %f" %
          adjusted_rand_score(label_true, label_pred))
    print("V_measure_score = %f" % v_measure_score(label_true, label_pred))
    print("Adjusted_mutual_info_score = %f" %
          adjusted_mutual_info_score(label_true, label_pred))
Example #27
def adjusted_mutual_info(comm1, comm2):
    """
    comm1: community 1
    comm2: community 2

    Each argument carries a .membership list of community labels: index i is
    the integer community label of node i.
    """

    return adjusted_mutual_info_score(comm1.membership, comm2.membership)
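Hypothetical usage, assuming python-igraph style clustering objects that carry the .membership list the docstring describes:

import igraph as ig

g = ig.Graph.Famous("Zachary")
comm1 = g.community_multilevel()
comm2 = g.community_walktrap().as_clustering()
print(adjusted_mutual_info(comm1, comm2))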
Example #28
def test_exactly_zero_info_score():
    """Check numerical stability when information is exactly zero"""
    for i in np.logspace(1, 4, 4).astype(int):
        labels_a, labels_b = np.ones(i, dtype=int),\
            np.arange(i, dtype=int)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(v_measure_score(labels_a, labels_b), 0.0)
        assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
Example #29
def compute_nmi(preprocessed_brain, transformation_matrix):

    ref = nib.load(preprocessed_brain.file_paths["t2s"]).get_data()
    sum_of_mr = np.zeros(ref.shape, dtype=ref.dtype)
    for modality in preprocessed_brain.file_paths:
        sum_of_mr += nib.load(preprocessed_brain.file_paths[modality]).get_data()
    sum_of_mr[sum_of_mr != sum_of_mr] = 0  # NaN != NaN, so this zeroes out NaNs
    grayscale_histo = np.mean(np.array(Image.open(preprocessed_brain.histo_path)), axis=2)
    w_grayscale_histo = warp(grayscale_histo, transformation_matrix, output_shape=sum_of_mr.shape)
    return adjusted_mutual_info_score(np.ravel(w_grayscale_histo.astype(int)), np.ravel(sum_of_mr.astype(int)))
Example #30
def print_stats(x, y, quiet=True):
    ari = adjusted_rand_score(x, y)
    ami = adjusted_mutual_info_score(x, y)
    fms = fowlkes_mallows_score(x, y)

    if not quiet:
        print("ARI: {}".format(ari), file=sys.stderr)
        print("AMI: {}".format(ami), file=sys.stderr)
        print("FMS: {}".format(fms), file=sys.stderr)

    return ari, ami, fms
Example #31
    def get_mutual_info(self, outcome_dict):
        """Gets mutual information given hash strings and outcomes using sklearn function."""
        hashes, outcomes = list(zip(*list(outcome_dict.items())))

        outcomes = [
            element for outcomeval in outcomes for element in outcomeval
        ]

        return adjusted_mutual_info_score(
            hashes,
            outcomes)  # <<<< problem here, outcomes isn't of correct shape
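One way to resolve the shape problem flagged in the comment above (an assumption about the intended pairing, not the project's actual fix) is to repeat each hash once per element of its outcome list, so the two label sequences end up the same length:

    def get_mutual_info_fixed(self, outcome_dict):
        hashes, outcomes = [], []
        for hash_string, outcomeval in outcome_dict.items():
            # pair every outcome element with the hash it came from
            for element in outcomeval:
                hashes.append(hash_string)
                outcomes.append(element)
        return adjusted_mutual_info_score(hashes, outcomes)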
Example #32
 def correlation(self, X, Y, heatmap=False):
     nb_classes = len(set(Y))
     print(nb_classes)
     km = KMeans(n_clusters=nb_classes, random_state=0).fit(X)
     label_kmeans = km.labels_
     purity = metric.compute_purity(label_kmeans, Y)
     nmi = normalized_mutual_info_score(Y, label_kmeans)
     ari = adjusted_rand_score(Y, label_kmeans)
     homogeneity = homogeneity_score(Y, label_kmeans)
     ami = adjusted_mutual_info_score(Y, label_kmeans)
     print('NMI = {}, ARI = {}, Purity = {},AMI = {}, Homogeneity = {}'.
           format(nmi, ari, purity, ami, homogeneity))
Example #33
def compute_score(predict_labels, labels, verbose=0):
    """Compute clustering scores."""
    ari = adjusted_rand_score(predict_labels, labels)
    ami = adjusted_mutual_info_score(predict_labels, labels)
    nmi = normalized_mutual_info_score(predict_labels, labels)

    if verbose == 1:
        print('-'*30)
        print('ari\tami\tnmi')
        print('{:.4f}  {:.4f}  {:.4f}'.format(ari, ami, nmi))

    return [ari, ami, nmi]
Example #34
def Porownaj_algorytmy2(data, klasy, labels, baza):
    """
    Computes the AM, AR and FM indices for the algorithm written by me.
    """
    wektor = []

    # my algorithm
    wynikM = spectral_clustering(data, k=klasy, M=5)
    wektor.append([fowlkes_mallows_score(labels, wynikM), adjusted_mutual_info_score(labels, wynikM), adjusted_rand_score(labels, wynikM), baza])

    index = ["Moj"]

    return pd.DataFrame(wektor, index=index, columns=["FM", "AM", "AR", "Dane"])
Example #35
	def evaluate( self, partition, clustered_ids ):
		# no class info?
		if not self.has_class_info():
			return {}
		# get two clusterings that we can compare
		n = len(clustered_ids)
		classes_subset = np.zeros( n )
		for row in range(n):
			classes_subset[row] = self.class_map[clustered_ids[row]]		
		scores = {}
		scores["external-nmi"] = normalized_mutual_info_score( classes_subset, partition )
		scores["external-ami"] = adjusted_mutual_info_score( classes_subset, partition )
		scores["external-ari"] = adjusted_rand_score( classes_subset, partition )
		return scores
Example #36
def sklearn_measures(U, V):
    # http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
    import sklearn.metrics.cluster as sym
    U_labels = np.nonzero(U)[1]
    V_labels = np.nonzero(V)[1]
    print(U_labels, V_labels)
    # V2_labels = np.nonzero(V2)[1]
    print('entro(U)=', sym.entropy(U_labels), 'entro(V)=', sym.entropy(V_labels), 'entro(U,V)=', sym.mutual_info_score(U_labels, V_labels))
    res = [['ari', 'nmi', 'ami', 'vm'],
           [sym.adjusted_rand_score(U_labels, V_labels),
            sym.normalized_mutual_info_score(U_labels, V_labels),
            sym.adjusted_mutual_info_score(U_labels, V_labels),
            sym.v_measure_score(U_labels, V_labels)]]
    print(res)
    return res
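An example call (not from the original project) with one-hot membership matrices, rows = samples and columns = clusters, which is the layout the np.nonzero(U)[1] decoding assumes:

import numpy as np

U = np.eye(3)[[0, 0, 1, 2]]   # cluster ids 0, 0, 1, 2
V = np.eye(3)[[1, 1, 0, 2]]   # the same partition with permuted ids
sklearn_measures(U, V)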
Example #37
def crossvalidate(profiles, true_group_name, holdout_group_name=None, 
                  train=NNClassifier, distance='cosine'):
    profiles.assert_not_isnan()
    keys = profiles.keys()
    true_labels = profiles.regroup(true_group_name)
    profiles.data = np.array([d for k, d in zip(keys, profiles.data) if tuple(k) in true_labels])
    profiles._keys = [k for k in keys if tuple(k) in true_labels]
    keys = profiles.keys()
    labels = list(set(true_labels.values()))

    if holdout_group_name:
        holdouts = profiles.regroup(holdout_group_name)
    else:
        holdouts = dict((k, k) for k in keys)
    
    true_indices = []
    pred_indices = []
    for ho in set(holdouts.values()):
        test_set_mask = np.array([tuple(holdouts[k]) == ho for k in keys], 
                                 dtype=bool)
        training_features = profiles.data[~test_set_mask, :]
        training_labels = [labels.index(true_labels[tuple(k)]) 
                           for k, m in zip(keys, ~test_set_mask) if m]

        model = train(training_features, training_labels, distance=distance)
        for k, f, m in zip(keys, profiles.data, test_set_mask):
            if not m:
                continue
            true = true_labels[tuple(k)]
            predicted = labels[model.classify(f)]
            
            true_indices.append(labels.index(true))
            pred_indices.append(labels.index(predicted))
    
    true_indices = np.array(true_indices)
    pred_indices = np.array(pred_indices)

    nmi_score = normalized_mutual_info_score(true_indices, pred_indices)
    ami_score = adjusted_mutual_info_score(true_indices, pred_indices)
 
    return nmi_score, ami_score 
Example #38
clusterer_ap = cluster.AffinityPropagation()
clusterer_agg_ap = cluster.AffinityPropagation(affinity="precomputed")
cluster_ap = clusterer_ap.fit_predict(data)
cluster_agg_ap = clusterer_agg_ap.fit_predict(data_agg)
cluster_agg_ap2 = clusterer_agg_ap.fit_predict(data_agg2)
cluster_agg_ap4 = clusterer_agg_ap.fit_predict(data_agg4)
cluster_agg_ap4_w = clusterer_agg_ap.fit_predict(data_agg4_w)
cluster_agg_ap4_ws = clusterer_agg_ap.fit_predict(data_agg4_ws)
cluster_agg_ap4_just_season = clusterer_agg_ap.fit_predict(data_agg4_just_season)
cluster_agg_ap4_just_leaf = clusterer_agg_ap.fit_predict(data_agg4_just_leaf)
cluster_agg_ap4_just_seed = clusterer_agg_ap.fit_predict(data_agg4_just_seed)
cluster_agg_ap4_just_weather = clusterer_agg_ap.fit_predict(data_agg4_just_weather)


# note: this assignment shadows sklearn's mutual_info_score function
mutual_info_score = adjusted_mutual_info_score(labels, cluster_ap)
mutual_info_score_agg = adjusted_mutual_info_score(labels, cluster_agg_ap)

v_score = homogeneity_completeness_v_measure(labels,cluster_ap)
v_score_agg2 = homogeneity_completeness_v_measure(labels,cluster_agg_ap2)
v_score_agg4 = homogeneity_completeness_v_measure(labels,cluster_agg_ap4)
v_score_agg4_w = homogeneity_completeness_v_measure(labels,cluster_agg_ap4_w)
v_score_agg4_ws = homogeneity_completeness_v_measure(labels,cluster_agg_ap4_ws)
v_score_agg4_just_season = homogeneity_completeness_v_measure(labels,cluster_agg_ap4_just_season)
v_score_agg4_just_leaf = homogeneity_completeness_v_measure(labels,cluster_agg_ap4_just_leaf)
v_score_agg4_just_seed = homogeneity_completeness_v_measure(labels,cluster_agg_ap4_just_seed)
v_score_agg4_just_weather = homogeneity_completeness_v_measure(labels,cluster_agg_ap4_just_weather)


print(v_score)
print(v_score_agg2)
Example #39
def vollmers(info, threshold, reco_info=None, debug=False):
    """
    From Vollmers paper:
        Lineage Clustering. IGH sequences were clustered into IGH lineages according
        to similarity in their junctional region. Lineages were created according to the
        following steps. A lineage is formed and populated with one IGH sequence (seed). Then, all
        IGH sequences in the lineages (initially only the seed) are compared with all
        other IGH sequences of the same length using the same V and J segments. If
        their junctional regions (untemplated nucleotides and D segments) are at
        least 90% identical, the IGH sequence is added to the lineage. This process is
        repeated until the lineage does not grow.

    NOTE I'm interpreting this to mean
      - if *any* sequence already in the cluster is at least 90% identical to the prospective sequence, then it's added to the cluster
      - 'sequences the same length' means cdr3 the same length (entire sequence the same length only made sense for their primers)
      - since the 90% is on d + insertions, we also have to not merge if the d + insertions aren't the same length
    """

    id_clusters = {}  # map from cluster id to list of query names

    def get_d_plus_insertions(uid):
        return info[uid]['vd_insertion'] + info[uid]['d_qr_seq'] + info[uid]['dj_insertion']

    def get_cdr3_seq(uid):
        cpos = info[uid]['cyst_position']
        tpos = info[uid]['tryp_position']
        assert len(info[uid]['seqs']) == 1
        seq = info[uid]['seqs'][0]
        cdr3_seq = seq[cpos : tpos+3]
        if len(cdr3_seq) != info[uid]['cdr3_length']:
            raise Exception('ERROR bad cdr3 sequence %s %d' % (cdr3_seq, info[uid]['cdr3_length']))
        return cdr3_seq

    def from_same_lineage(cluster_id, uid):
        for clid in id_clusters[cluster_id]:  # loop over seqs already in the cluster (it only has to match one of 'em)
            is_match = True
            for key in ('cdr3_length', 'v_gene', 'j_gene'):  # same cdr3 length, v gene, and j gene
                if info[clid][key] != info[uid][key]:
                    is_match = False
                    break
            if not is_match:
                continue
            cl_seq = get_d_plus_insertions(clid)
            u_seq = get_d_plus_insertions(uid)
            if len(cl_seq) != len(u_seq):
                continue
            hamming_frac = utils.hamming_fraction(cl_seq, u_seq)
            if hamming_frac > 1. - threshold:
                continue

            return True  # if we get to here, it's a match

        return False

    def check_unclustered_seqs():
        """ loop through all unclustered sequences, adding them to the most recently created cluster """
        uids_to_remove = []
        for unique_id in unclustered_seqs:
            assert unique_id not in id_clusters[last_cluster_id]  # not sure why I had the below, but I swear it's impossible. Remove this assertion when it fails to get triggered for a while
            # if unique_id in id_clusters[last_cluster_id]:  # sequence is already in this cluster
            #     continue
            if from_same_lineage(last_cluster_id, unique_id):
                if debug:
                    print('     adding', unique_id)
                id_clusters[last_cluster_id].append(unique_id)
                uids_to_remove.append(unique_id)
        for uid in uids_to_remove:
            unclustered_seqs.remove(uid)

    def add_cluster(clid):
        if debug:
            print('  starting cluster %d' % clid)
        id_clusters[clid] = [unclustered_seqs[0],]
        unclustered_seqs.remove(unclustered_seqs[0])
        while True:
            last_size = len(id_clusters[clid])
            check_unclustered_seqs()
            if last_size == len(id_clusters[clid]):  # stop when cluster stops growing
                break
            if debug:
                print('    running again (%d --> %d)' % (last_size, len(id_clusters[clid])))

    # ----------------------------------------------------------------------------------------
    # the business
    unclustered_seqs = list(info.keys())  # need a mutable list, .remove() is used below
    last_cluster_id = 0
    while len(unclustered_seqs) > 0:
        add_cluster(last_cluster_id)
        last_cluster_id += 1

    adj_mi = -1
    if reco_info is not None:
        true_cluster_list, inferred_cluster_list = [], []
        for clid, uids in id_clusters.items():
            for uid in uids:
                true_cluster_list.append(reco_info[uid]['reco_id'])
                inferred_cluster_list.append(clid)
        adj_mi = adjusted_mutual_info_score(true_cluster_list, inferred_cluster_list)
        print('       threshold  %.2f:   %d clusters (%d true)   adj_mi: %.3f' % (threshold, len(set(inferred_cluster_list)), len(set(true_cluster_list)), adj_mi))

    partition = [uids for uids in id_clusters.values()]  # convert to list of lists (no clid info)
    return adj_mi, partition
Example #40
def compute_stability_fold(samples, train, test, method='ward',
                           max_k=None, stack=False,
                           stability=True, cv_likelihood=False,
                           corr_score=None,
                           ground_truth=None, n_neighbors=1,  **kwargs):
    """
    General function to compute the stability on a cross-validation fold.
    
    Parameters:
    -----------
        samples : list of arrays
            List of arrays containing the samples to cluster, each
            array has shape (n_samples, n_features) in PyMVPA terminology.
            We are clustering the features, i.e., the nodes.
        train : list or array
            Indices for the training set.
        test : list or array
            Indices for the test set.
        method : {'complete', 'gmm', 'kmeans', 'ward'}
            Clustering method to use. Default is 'ward'.
        max_k : int or None
            Maximum k to compute the stability testing, starting from 2. By
            default it will compute up to the maximum possible k, i.e.,
            the number of points.
        stack : bool
            Whether to stack or average the datasets. Default is False,
            meaning that the datasets are averaged by default.
        stability : bool
            Whether to compute the stability measure described in Lange et
            al., 2004. Default is True.
        cv_likelihood : bool
            Whether to compute the cross-validated likelihood for mixture
            model; only valid if 'gmm' method is used. Default is False.
        corr_score : {'pearson','spearman'} or None
            Whether to compute the specified type of correlation score. 
            Default is None.
        ground_truth : array or None
            Array containing the ground truth of the clustering of the data,
            useful to compare stability against ground truth for simulations.
        n_neighbors : int
            Number of neighbors to use to predict clustering solution on
            test set using K-nearest neighbors. Currently used only for
            methods `complete` and `ward`. Default is 1.
        kwargs : optional
            Keyword arguments being passed to the clustering method (only for
            'ward' and 'gmm').
    
    Returns:
    --------
        ks : array
            A (max_k-1,) array, where ks[i] is the `k` of the clustering
            solution for iteration `i`.
        ari : array
            A (max_k-1,) array, where ari[i] is the Adjusted Rand Index of the
            predicted clustering solution on the test set and the actual
            clustering solution of the test set for `k` of ks[i].
        ami : array
            A (max_k-1,) array, where ami[i] is the Adjusted Mutual
            Information of the predicted clustering solution on the test set
            and the actual clustering solution of the test set for
            `k` of ks[i].
        stab : array or None
            A (max_k-1,) array, where stab[i] is the stability measure
            described in Lange et al., 2004 for `k` of ks[i]. Note that this
            measure is the un-normalized one. It will be normalized later in
            the process.
        likelihood : array or None
            If method is 'gmm' and cv_likelihood is True, a
            (max_k-1,) array, where likelihood[i] is the cross-validated
            likelihood of the GMM clustering solution for `k` of ks[i].
            Otherwise returns None.
        ari_gt : array or None
            If ground_truth is not None, a (max_k-1,) array, where ari_gt[i]
            is the Adjusted Rand Index of the predicted clustering solution on
            the test set for `k` of ks[i] and the ground truth clusters of the
            data.
            Otherwise returns None.
        ami_gt : array or None
            If ground_truth is not None, a (max_k-1,) array, where ami_gt[i]
            is the Adjusted Mutual Information of the predicted clustering
            solution on the test set for `k` of ks[i] and the ground truth
            clusters of the data.
            Otherwise returns None.
        stab_gt : array or None
            If ground_truth is not None, a (max_k-1,) array, where stab_gt[i]
            is the stability measure of the predicted clustering
            solution on the test set for `k` of ks[i] and the ground truth
            clusters of the data.
            Otherwise returns None.
        corr : array or None
            Average correlation for each fold. TODO
        corr_gt : array or None
            Avg correlation against GT. TODO
    """
    if method not in AVAILABLE_METHODS:
        raise ValueError('Method {0} not implemented'.format(method))

    if cv_likelihood and method != 'gmm':
        raise ValueError(
            "Cross-validated likelihood is only available for 'gmm' method")

    # if max_k is None, set max_k to maximum value
    if not max_k:
        max_k = samples[0].shape[1]

    # preallocate arrays for results
    ks = np.zeros(max_k-1, dtype=int)
    ari = np.zeros(max_k-1)
    ami = np.zeros(max_k-1)
    if stability:
        stab = np.zeros(max_k-1)
    if cv_likelihood:
        likelihood = np.zeros(max_k-1)
    if corr_score is not None:
        corr = np.zeros(max_k-1)
    if ground_truth is not None:
        ari_gt = np.zeros(max_k-1)
        ami_gt = np.zeros(max_k-1)
        if stability:
            stab_gt = np.zeros(max_k-1)
        if corr_score is not None:
            corr_gt = np.zeros(max_k-1)

    # get training and test
    train_set = [samples[x] for x in train]
    test_set = [samples[x] for x in test]
    
    if stack:
        train_ds = np.vstack(train_set)
        test_ds = np.vstack(test_set)
    else:
        train_ds = np.mean(np.dstack(train_set), axis=2)
        test_ds = np.mean(np.dstack(test_set), axis=2)

    # compute clustering on training set
    if method == 'complete':
        train_ds_dist = pdist(train_ds.T, metric='correlation')
        test_ds_dist = pdist(test_ds.T, metric='correlation')
        # I'm computing the full tree and then cutting
        # afterwards to speed computation
        Y_train = complete(train_ds_dist)
        # same on testing set
        Y_test = complete(test_ds_dist)
    elif method == 'ward':
        (children_train, n_comp_train, 
         n_leaves_train, parents_train) = ward_tree(train_ds.T, **kwargs)
        # same on testing set
        (children_test, n_comp_test, 
         n_leaves_test, parents_test) = ward_tree(test_ds.T, **kwargs)
    elif method == 'gmm' or method == 'kmeans':
        pass  # we'll have to run it for each k
    else:
        raise ValueError("We shouldn't get here")

    for i_k, k in enumerate(range(2, max_k+1)):
        if method == 'complete':
            # cut the tree with right K for both train and test
            train_label = cut_tree_scipy(Y_train, k)
            test_label = cut_tree_scipy(Y_test, k)
            # train a classifier on this clustering
            knn = KNeighborsClassifier(#algorithm='brute',
            # metric='correlation',
                                       n_neighbors=n_neighbors)
            knn.fit(train_ds.T, train_label)
            # predict the clusters in the test set
            prediction_label = knn.predict(test_ds.T)
        elif method == 'ward':
            # cut the tree with right K for both train and test
            train_label = _hc_cut(k, children_train, n_leaves_train)
            test_label = _hc_cut(k, children_test, n_leaves_test)
            # train a classifier on this clustering
            knn = KNeighborsClassifier(n_neighbors=n_neighbors)
            knn.fit(train_ds.T, train_label)
            # predict the clusters in the test set
            prediction_label = knn.predict(test_ds.T)
        elif method == 'gmm':
            gmm = GMM(n_components=k, **kwargs)
            # fit on train and predict test
            gmm.fit(train_ds.T)
            prediction_label = gmm.predict(test_ds.T)
            if cv_likelihood:
                log_prob = np.sum(gmm.score(test_ds.T))
            # fit on test and get labels
            gmm.fit(test_ds.T)
            test_label = gmm.predict(test_ds.T)
        elif method == 'kmeans':
            kmeans = KMeans(n_clusters=k)
            # fit on train and predict test
            kmeans.fit(train_ds.T)
            prediction_label = kmeans.predict(test_ds.T)
            # fit on test and get labels
            kmeans.fit(test_ds.T)
            test_label = kmeans.predict(test_ds.T)
        else:
            raise ValueError("We shouldn't get here")
            
        # append results
        ks[i_k] = k
        ari[i_k] = adjusted_rand_score(prediction_label, test_label)
        ami[i_k] = adjusted_mutual_info_score(prediction_label, test_label)
        if stability:
            stab[i_k] = stability_score(prediction_label, test_label, k)
        if cv_likelihood:
            likelihood[i_k] = log_prob
        if corr_score is not None:
            corr[i_k] = correlation_score(prediction_label, test_label,
                                          test_ds, corr_score)
        if ground_truth is not None:
            ari_gt[i_k] = adjusted_rand_score(prediction_label, ground_truth)
            ami_gt[i_k] = adjusted_mutual_info_score(prediction_label,
                                                     ground_truth)
            if stability:
                stab_gt[i_k] = stability_score(prediction_label,
                                               ground_truth, k)
            if corr_score is not None:
                corr_gt[i_k] = correlation_score(prediction_label,
                                                 ground_truth,
                                                 test_ds, corr_score)

    results = [ks, ari, ami]
    if stability:
        results.append(stab)
    else:
        results.append(None)
    if cv_likelihood:
        results.append(likelihood)
    else:
        results.append(None)

    if ground_truth is not None:
        results += [ari_gt, ami_gt]
    else:
        results += [None, None]

    if stability and ground_truth is not None:
        results.append(stab_gt)
    else:
        results.append(None)

    if corr_score is not None:
        results.append(corr)
    else:
        results.append(None)

    if corr_score is not None and ground_truth is not None:
        results.append(corr_gt)
    else:
        results.append(None)

    return results
Example #41
    def __call__(self, test_label, predicted_label, **kwargs):
        return adjusted_mutual_info_score(test_label, predicted_label)