Example #1
0
def test():
    """Cluster simulated item correlations two ways and plot both results.

    Builds a correlation-of-correlations matrix from simulated answers,
    clusters it with ``kmeans2`` and with ``kmeans``, and draws both
    labelings side by side on the same MDS projection.  Each subplot title
    carries the adjusted Rand score against ``data.item_concept``.
    """
    data = SimData(400, 4, 15)
    # Pearson correlation between answer columns, then the correlation of
    # those correlation rows; NaNs are replaced by zero at both steps.
    item_cor = np.nan_to_num(np.corrcoef(data.answers, rowvar=0))
    cor = np.nan_to_num(np.corrcoef(item_cor))

    # hack: the number of components is hard-coded to 6
    labelings = [
        ('kmeans2 ', kmeans2(cor, 6, minit='points', iter=100)[1]),
        ('Kmeans ', kmeans(cor, 6, True)),
    ]

    xs, ys = mds(cor, euclid=True)
    for position, (prefix, labels) in enumerate(labelings, 1):
        plt.subplot(1, 2, position)
        plt.title(prefix + str(adjusted_rand_score(data.item_concept, labels)))
        plot_clustering(
            range(cor.shape[0]), xs, ys,
            labels=labels,
            shapes=data.item_concept,
        )

    plt.show()
def aggregate_stats(infiles, outfile):
    """
    Combine all the aggstats into a single file

    Compute summary statistics

    Each input pickle must hold a dict with a 'df' DataFrame (rows carry
    'assign', 'true_assign', 'heldout_link_truth' and
    'heldout_link_predprob') and a 'meta' dict whose 'neurons' table has a
    'role' column.  Files for which ``extract_metadata`` returns nothing are
    skipped.  The per-file frames, extended with the metadata and the score
    columns computed below, are concatenated and pickled to ``outfile``.

    infiles: iterable of paths to input pickles
    outfile: path of the combined output pickle
    """

    def canon(row):
        # canonical form of the learned assignment of one row
        return irm.util.canonicalize_assignment(row['assign'])

    scorers = (('ari', metrics.adjusted_rand_score),
               ('homogeneity', metrics.homogeneity_score),
               ('completeness', metrics.completeness_score))

    res = []
    for infile in infiles:
        # NOTE: unpickling runs arbitrary code -- only feed trusted files.
        # Pickles are binary data, so open in binary mode and close promptly.
        with open(infile, 'rb') as fid:
            d = pickle.load(fid)
        print("The file is %s" % (infile,))
        assigndf = d['df']
        neurons = d['meta']['neurons']

        m = extract_metadata(infile)
        if len(m) == 0:
            # skip the non-replicated ones
            continue

        for k, v in m.items():
            assigndf[k] = v

        # snapshot of the roles taken BEFORE missing roles are filled in below
        assigndf['true_assign_role'] = [np.array(neurons['role']) for _ in range(len(assigndf))]

        # agreement between the learned and the true assignment
        for name, score in scorers:
            assigndf[name] = assigndf.apply(
                lambda x, score=score: score(x['true_assign'], canon(x)), axis=1)

        # neurons without a role are all given the placeholder role 'I'
        # (original note: these are the multi-role ones)
        neurons.loc[neurons['role'].isnull(), 'role'] = 'I'

        # agreement between the learned assignment and the neuron roles
        for name, score in scorers:
            assigndf['role_' + name] = assigndf.apply(
                lambda x, score=score: score(neurons['role'], canon(x)), axis=1)

        assigndf['type_n_true'] = assigndf.apply(lambda x: len(np.unique(x['true_assign'])), axis=1)
        assigndf['type_n_learned'] = assigndf.apply(lambda x: len(np.unique(x['assign'])), axis=1)
        assigndf['auc'] = assigndf.apply(
            lambda x: metrics.roc_auc_score(x['heldout_link_truth'], x['heldout_link_predprob']), axis=1)

        res.append(assigndf)

    alldf = pandas.concat(res)
    with open(outfile, 'wb') as fid:
        pickle.dump(alldf, fid, -1)
Example #3
0
def acc_ari(X, lbls_true, lbls_pred, reject, strat_lbl_inds, use_strat=False):
    """Return the adjusted Rand index and a (long, short) description pair.

    With ``use_strat`` the score is computed only on the entries selected by
    ``strat_lbl_inds`` and the long description reports which percentage of
    all labels that subset represents (truncated to an integer).  ``X`` and
    ``reject`` are accepted but not used here.

    Returns ``(ari, desc)`` where ``desc`` is a 2-tuple of strings.
    """
    if use_strat:
        ari = metrics.adjusted_rand_score(lbls_true[strat_lbl_inds], lbls_pred[strat_lbl_inds])
        # builtin int/float: the np.int / np.float aliases no longer exist
        # in current NumPy releases and were plain aliases of the builtins
        perc = int(float(len(strat_lbl_inds)) / float(lbls_true.size) * 100.0)
        desc = ('ARI (strat={0})'.format(perc), 'ARI')
    else:
        ari = metrics.adjusted_rand_score(lbls_true, lbls_pred)
        desc = ('ARI', 'ARI')
    return ari, desc
Example #4
0
def test_affinities():
    """Spectral clustering must recover two well separated blobs exactly.

    Checked for the nearest-neighbors affinity and for the default affinity
    with ``gamma=2``.
    """
    X, y = make_blobs(n_samples=40, random_state=1, centers=[[1, 1], [-1, -1]], cluster_std=0.4)
    configurations = (
        {'affinity': "nearest_neighbors"},  # nearest neighbors affinity
        {'gamma': 2},
    )
    for extra in configurations:
        model = SpectralClustering(n_clusters=2, random_state=0, **extra)
        predicted = model.fit(X).labels_
        assert_equal(adjusted_rand_score(y, predicted), 1)
def kmeans(input_file, n_clusters, Output):
    """Run K-Means on a CSV file, report the scores and save a scatter plot.

    The file is read with every column except the last one; the first column
    is taken as the reference label ``y`` and the remaining ones as the
    features ``X``.  Scores are printed, written to
    ``Output + "kmeans_scores.txt"`` together with one line per sample, and a
    scatter plot of the first two features is saved to
    ``Output + "kmeans.png"``.

    input_file: path of the comma separated input file
    n_clusters: number of clusters to fit
    Output:     prefix (directory or path stem) for the two output files
    """
    lvltrace.lvltrace("LVLEntree dans kmeans unsupervised")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:, 1:]
    y = data[:, 0]
    sample_size = X.shape[0]
    k_means = cluster.KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    k_means.fit(X)
    labels = k_means.labels_
    k_means_cluster_centers = k_means.cluster_centers_

    # Compute every score once so the console and the file always agree.
    homogeneity = metrics.homogeneity_score(y, labels)
    completeness = metrics.completeness_score(y, labels)
    v_measure = metrics.v_measure_score(y, labels)
    adjusted_rand = metrics.adjusted_rand_score(y, labels)
    adjusted_mutual = metrics.adjusted_mutual_info_score(y, labels)
    silhouette = metrics.silhouette_score(X, labels, metric='euclidean', sample_size=sample_size)

    print("#########################################################################################################\n")
    print("K-MEANS\n")
    print('homogeneity_score: %f' % homogeneity)
    print('completeness_score: %f' % completeness)
    print('v_measure_score: %f' % v_measure)
    print('adjusted_rand_score: %f' % adjusted_rand)
    print('adjusted_mutual_info_score: %f' % adjusted_mutual)
    print('silhouette_score: %f' % silhouette)
    print('\n')
    print("#########################################################################################################\n")

    results = Output+"kmeans_scores.txt"
    # context manager: the report is closed even if a write fails
    with open(results, "w") as report:
        report.write("K-Means Scores\n")
        report.write("Homogeneity Score: %f\n" % homogeneity)
        report.write("Completeness Score: %f\n" % completeness)
        report.write("V-Measure: %f\n" % v_measure)
        report.write("The adjusted Rand index: %f\n" % adjusted_rand)
        report.write("Adjusted Mutual Information: %f\n" % adjusted_mutual)
        report.write("Silhouette Score: %f\n" % silhouette)
        report.write("\n")
        report.write("True Value, Cluster numbers, Iteration\n")
        for iteration, (truth, assigned) in enumerate(zip(y, labels), 1):
            report.write("%f, %f, %i\n" % (truth, assigned, iteration))

    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X[:, 0], X[:, 1], c=labels, marker='.')
    for cluster_center in k_means_cluster_centers:
        # blue cross on each centre (first two coordinates only)
        ax.plot(cluster_center[0], cluster_center[1], color='b',
                marker='x', markersize=6)
    fig.colorbar(im)
    plt.title("Number of clusters: %i" % n_clusters)
    save = Output + "kmeans.png"
    plt.savefig(save)
    lvltrace.lvltrace("LVLsortie dans kmeans unsupervised")
Example #6
0
def compare_results(block,kmA,kmL):
    """Print, per block id, how well each k-means labeling isolates that block.

    For every reported block id a 0/1 membership vector is compared against
    the adjacency-based labels ``kmA`` and the Laplacian-based labels ``kmL``
    using ``vn.num_diff_w_perms`` and the adjusted Rand score.
    """
    # NOTE(review): indicators are built for ids 0..5 but only ids 0..4 are
    # reported below -- confirm which range is intended.
    indicators = [[int(member == block_id) for member in block] for block_id in range(6)]
    labelings = (("Adjacency: num error=", kmA), ("Laplacian: num error=", kmL))
    for block_id in range(5):
        indicator = indicators[block_id]
        print("Block "+repr(block_id)+" results:")
        for prefix, km_labels in labelings:
            num_diff = vn.num_diff_w_perms(indicator, km_labels)
            ari = adjusted_rand_score(indicator, km_labels)
            print(prefix+repr(num_diff)+" ari="+repr(ari))
Example #7
0
def compute_cluster_metrics_raw(chains, cells):
    """Score the cell clustering of every chain against the known cell types.

    chains: iterable of dicts; each needs
            ``chain['state']['domains']['d1']['assignment']`` and
            ``chain['scores']`` (the last score is reported)
    cells:  DataFrame with 'type_id' and 'coarse' columns.  As before, its
            'cluster' column is overwritten in place for every chain, so on
            return it holds the assignment of the LAST chain.

    Returns a DataFrame with one row per chain: ARI / AMI / Jaccard against
    the fine and coarse types, 'n11', per-cluster variances, the number of
    clusters, the chain index, the final score and, under 'df', a snapshot
    of ``cells`` with that chain's 'cluster' column.
    """
    all_chains = []
    for chain_i, chain in enumerate(chains):

        sample_latent = chain['state']
        cell_assignment = np.array(sample_latent['domains']['d1']['assignment'])
        ca = irm.util.canonicalize_assignment(cell_assignment)

        cells['cluster'] = ca

        canon_true_fine = irm.util.canonicalize_assignment(cells['type_id'])
        canon_true_coarse = irm.util.canonicalize_assignment(cells['coarse'])

        ari = metrics.adjusted_rand_score(canon_true_fine, ca)
        ari_coarse = metrics.adjusted_rand_score(canon_true_coarse, ca)

        ami = metrics.adjusted_mutual_info_score(canon_true_fine, ca)
        ami_coarse = metrics.adjusted_mutual_info_score(canon_true_coarse, ca)

        jaccard = rand.compute_jaccard(canon_true_fine, ca)
        jaccard_coarse = rand.compute_jaccard(canon_true_coarse, ca)

        ss = rand.compute_similarity_stats(canon_true_fine, ca)

        # per-cluster variance of every column
        cluster_vars = cells.groupby('cluster').var()

        chain_info = {'ari' : ari,
                      'ari_coarse' : ari_coarse,
                      'ami' : ami,
                      'ami_coarse' : ami_coarse,
                      'jaccard' : jaccard,
                      'jaccard_coarse' : jaccard_coarse,
                      'n11' : ss['n11'],
                      'vars' : cluster_vars,
                      'cluster_n' : len(np.unique(cells['cluster'])),
                      'chain_i' : chain_i,
                      'score' : chain['scores'][-1],
                      # Copy: ``cells`` is mutated again on the next pass, so
                      # storing the frame itself would leave every row
                      # pointing at the last chain's clustering.
                      'df' : cells.copy(),
                      }
        all_chains.append(chain_info)
    df = pandas.DataFrame(all_chains)
    return df
def ARI_CrossCat(Xc, Xrv, XRc, XRrv):
    ''' Adjusted Rand Index (ARI) calculation for a CrossCat clustered table

    To calculate ARI based on the CrossCat partition, each cell in the
    table is considered as an instance to be assigned to a cluster. A cluster
    is defined by both the view index AND the category index. In other words,
    if, and only if, two cells, regardless of which columns and rows they belong
    to, are lumped into the same view and category, the two cells are considered
    to be in the same cluster.

    For a table of size Nrow x Ncol
    Xc: 1-D integer array of length Ncol with the view assignment of each
        column.
        Note: It is assumed that the view indices are consecutive integers
        starting from 0. Hence, the number of views is equal to highest
        view index plus 1.
    Xrv: array of category assignments. It is transposed on entry and then
        indexed as [table row, view], i.e. it is passed as (Nview x Nrow).
        NOTE(review): earlier documentation described it as (Nrow x Nview);
        confirm against the callers.
    XRc and XRrv have the same format as Xc and Xrv respectively.

    Returns (ARI, ARI_viewonly): the ARI between the two cell-level table
    clusterings defined by (XRc, XRrv) and (Xc, Xrv), and the ARI between
    the two column-to-view assignments alone.
    '''
    Xrv = Xrv.T
    XRrv = XRrv.T

    # Re-assign category indices so that they have different values in
    # different views: view v is shifted by v * (highest category index + 1).
    Xrv = Xrv + numpy.arange(0, Xrv.shape[1]) * (numpy.max(Xrv) + 1)
    # similarly for the reference partition
    XRrv = XRrv + numpy.arange(0, XRrv.shape[1]) * (numpy.max(XRrv) + 1)

    # Every table cell takes the (shifted) category of its row in the view
    # its column belongs to; the table is then flattened row by row into the
    # 1-D label vector the ARI function expects.
    CellClusterAssgn = Xrv[:, Xc].reshape(-1)
    # The reference table is built from the reference arrays' own sizes
    # (previously it was sized from the first partition's arrays).
    RefCellClusterAssgn = XRrv[:, XRc].reshape(-1)

    # Compare the two partitions using ARI
    ARI = metrics.adjusted_rand_score(RefCellClusterAssgn, CellClusterAssgn)
    ARI_viewonly = metrics.adjusted_rand_score(Xc, XRc)

    return ARI, ARI_viewonly
Example #9
0
  def results(self, algo, hasgnc = False, filename="_"):
    """Run ``algo`` on the graph with and without edge weights.

    Each run's clustering is written out via ``write_vertex_clustering``
    (suffix "_weighted" / "_unweighted" followed by ``filename``).  When
    ``hasgnc`` is true the experiment is repeated for 10 rounds, vertices are
    tagged with their cluster id, and the gain of the weighted run over the
    unweighted one is measured against the "comm" vertex attribute.

    algo:     community detection callable taking the graph and, optionally,
              a ``weights`` keyword
    hasgnc:   whether the "comm" vertex attribute is available for scoring
    filename: suffix appended to the output names

    Returns the list of per-round AMI gains when ``hasgnc`` is true,
    otherwise ``None``.
    """
    AMI_increase = []
    ARI_increase = []
    rounds = 10 if hasgnc else 1
    print("Runing  %s for %s rounds" % (algo.__name__, rounds))

    def cluster_and_tag(name_template, attribute, **algo_kwargs):
      # One run: cluster, write the result, optionally tag the vertices.
      vd = algo(self.g, **algo_kwargs)
      try:
        vc = vd.as_clustering()
      except AttributeError:
        vc = vd #in case a VertexCluster instance is returned
      self.write_vertex_clustering(vc, name_template % filename)
      if hasgnc:
        for cc in range(len(vc)):
          for cci in vc[cc]:
            self.g.vs[cci][attribute] = str(cc)

    for i in range(rounds):
      # negative edge weights are clamped to zero for the weighted run
      cluster_and_tag("_weighted%s", "fastgreedy_withweight",
                      weights = [max(w, 0) for w in self.g.es["weight"]])
      cluster_and_tag("_unweighted%s", "fastgreedy_withoutweight")

      if hasgnc:
        ami_weight = metrics.adjusted_mutual_info_score(self.g.vs["fastgreedy_withweight"], self.g.vs["comm"])
        ari_weight = metrics.adjusted_rand_score(self.g.vs["fastgreedy_withweight"], self.g.vs["comm"])
        ami_unweight = metrics.adjusted_mutual_info_score(self.g.vs["fastgreedy_withoutweight"], self.g.vs["comm"])
        ari_unweight = metrics.adjusted_rand_score(self.g.vs["fastgreedy_withoutweight"], self.g.vs["comm"])

        AMI_increase.append(ami_weight - ami_unweight)
        ARI_increase.append(ari_weight - ari_unweight)

    if hasgnc:
      print("Adjusted Mutual Information increases by %s" % (1.0 * sum(AMI_increase) / len(AMI_increase)))
      print("Adjusted Rand index increases by %s" % (1.0 * sum(ARI_increase) / len(ARI_increase)))
      print("-" * 20)
      return AMI_increase
    return None
Example #10
0
    def tracking(self, d_start=gb.D_START_TRACKING, d_end=gb.D_END_TRACKING, path=""):
        """Predict labels over [d_start, d_end) chunk by chunk and score them.

        For every chunk both ``predict_fsp`` and ``predict_ssp`` (the latter
        with ``update=True``) are called and their outputs accumulated.

        Returns, when ``gb.ARTIFICIAL`` is set, a pair: five
        ``(fsp, ssp)`` score pairs (ARI, AMI, homogeneity, completeness,
        V-measure) computed against the reference labels of the first signal
        reader, and the six accumulated lists.  Otherwise returns
        ``(0., 0.)``.  ``path`` is not used.
        """
        print("\n --------- tracking ...")

        times_fsp, axes_fsp, labels_fsp = [], [], []
        times_ssp, axes_ssp, labels_ssp = [], [], []

        # read chunk by chunk (each chunk is of 'step' milliseconds)
        step = datetime.timedelta(milliseconds=60 * 60 * 1000)
        cursor = d_start
        while cursor < d_end:
            # shrink the last chunk so that it stops exactly at d_end
            if cursor + step >= d_end:
                step = d_end - cursor

            chunk_times, chunk_axes, chunk_labels = self.predict_fsp(d_start=cursor, d_end=cursor + step)
            times_fsp.extend(chunk_times)
            axes_fsp.extend(chunk_axes)
            labels_fsp.extend(chunk_labels)

            chunk_times, chunk_axes, chunk_labels = self.predict_ssp(d_start=cursor, d_end=cursor + step, update=True)
            times_ssp.extend(chunk_times)
            axes_ssp.extend(chunk_axes)
            labels_ssp.extend(chunk_labels)

            cursor += step

        # ----------------------------
        if not gb.ARTIFICIAL:
            return 0., 0.

        _, _, true_labels = self.sigReaders[0].getSignal(start=d_start, end=d_end, dated=gb.DATED,
                                                         get_modes=True)

        ari = (adjusted_rand_score(true_labels, labels_fsp),
               adjusted_rand_score(true_labels, labels_ssp))
        ami = (adjusted_mutual_info_score(true_labels, labels_fsp),
               adjusted_mutual_info_score(true_labels, labels_ssp))
        hcv_first = homogeneity_completeness_v_measure(true_labels, labels_fsp)
        hcv_second = homogeneity_completeness_v_measure(true_labels, labels_ssp)
        # regroup (homogeneity, completeness, v-measure) into (fsp, ssp) pairs
        (ho_first, com_first, vm_first), (ho_second, com_second, vm_second) = hcv_first, hcv_second
        ho = (ho_first, ho_second)
        com = (com_first, com_second)
        vm = (vm_first, vm_second)

        print("---------------------------------------------------")
        print("adjusted_rand_score \t (ari_fps, ari_sps)", ari)
        print("adjusted_mutual_info \t (ami_fps, ami_sps)", ami)
        print("homogeneity \t (ho_fps, ho_sps)", ho)
        print("completeness \t (com_fps, com_sps)", com)
        print("v_measure \t (vm_fps, vm_sps)", vm)

        collected = (times_fsp, axes_fsp, labels_fsp, times_ssp, axes_ssp, labels_ssp)
        return (ari, ami, ho, com, vm), collected
Example #11
0
 def test_evaluate_meta_tree_result(self):
     """Check every score reported by ``evaluate_meta_tree_result``."""
     ari = metrics.adjusted_rand_score
     scores = evaluate_meta_tree_result(
         self.true_events, self.pred_events, self.all_entry_ids, methods=[ari]
     )
     # (score key, expected value), checked in this order
     expectations = (
         ("adjusted_rand_score",
          ari([2, 2, 1, 1, 1, 1], [2, 2, 0, 1, 0, 1])),
         ("adjusted_rand_score(all)",
          ari([2, 2, 0, 0, 1, 1, 1, 1, 0, 0], [2, 2, 0, 0, 0, 1, 0, 1, 1, 0])),
         ("precision", 0.8),
         ("recall", 2 / 3.0),
         ("f1", 8 / 11.0),
     )
     for key, expected in expectations:
         assert_almost_equal(expected, scores[key])
Example #12
0
def kmeans_analysis(G):
    """Cluster the two embeddings of ``G`` with k-means, plot and report.

    The two embeddings returned by ``get_embedding(G, 2)`` are each split
    into 2 clusters; both point clouds are drawn, coloured by the 'block'
    node attribute, with a dashed line through the midpoint of the two
    centroids, perpendicular to the segment joining them.  Results are then
    reported per block (``compare_results``) and for a 5-cluster k-means.
    """
    block = nx.get_node_attributes(G,'block').values()

    xA, xL = get_embedding(G,2)

    cA,kmA,_ = k_means(xA,2)
    cB,kmL,_ = k_means(xL,2)

    def draw_panel(position, points, centers):
        # scatter of one embedding plus the dashed separating line
        ax = plt.subplot(position)
        plt.scatter(points[:,0],points[:,1],c=block,marker='x')
        ax.set_aspect('equal','datalim')
        lim = plt.axis()
        gap = centers[0,:]-centers[1,:]
        direction = np.array([1, -gap[0]/gap[1]])
        midpoint = np.mean(centers,axis=0)
        ends = np.array([midpoint+direction,midpoint-direction])
        plt.plot(ends[:,0],ends[:,1],'k--',linewidth=1)
        # restore the limits the scatter had before the line was added
        plt.axis(lim)

    draw_panel(121, xA, cA)
    draw_panel(122, xL, cB)

    compare_results(block,kmA,kmL)

    _,kmA,_ = k_means(xA,5)
    _,kmL,_ = k_means(xL,5)

    print("ALL FIVE")
    for prefix, km_labels in (("Adjacency: num error=", kmA), ("Laplacian: num error=", kmL)):
        num_diff = vn.num_diff_w_perms(block, km_labels)
        ari = adjusted_rand_score(block,km_labels)
        print(prefix+repr(num_diff)+" ari="+repr(ari))
def test_spectral_clustering(eigen_solver, assign_labels):
    """Precomputed-affinity spectral clustering on a 7x7 two-group matrix.

    Runs on the dense matrix and on its CSR form, expects the 3 + 4 split to
    be recovered exactly, and checks that the fitted model survives a pickle
    round trip.
    """
    first_group = [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0]
    bridge = [0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0]
    second_group = [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0]
    S = np.array([first_group] * 3 + [bridge] + [second_group] * 3)
    expected = [1, 1, 1, 0, 0, 0, 0]

    for affinity_matrix in (S, sparse.csr_matrix(S)):
        model = SpectralClustering(random_state=0, n_clusters=2,
                                   affinity='precomputed',
                                   eigen_solver=eigen_solver,
                                   assign_labels=assign_labels)
        model.fit(affinity_matrix)
        # flip the 0/1 labels if needed so that the first sample is labelled 1
        labels = 1 - model.labels_ if model.labels_[0] == 0 else model.labels_

        assert adjusted_rand_score(labels, expected) == 1

        restored = pickle.loads(pickle.dumps(model))
        assert restored.n_clusters == model.n_clusters
        assert restored.eigen_solver == model.eigen_solver
        assert_array_equal(restored.labels_, model.labels_)
def check_clustering(name, Alg):
    """Generic sanity check for a clustering estimator class.

    name: estimator name, used to special-case some algorithms
    Alg:  estimator class, instantiated without arguments

    Fits on scaled blob data (as an array and as nested lists), requires an
    adjusted Rand score above 0.4 against the true blobs and, except for
    SpectralClustering, requires ``fit_predict`` to reproduce ``labels_``.
    """
    X, y = make_blobs(n_samples=50, random_state=1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    n_samples = X.shape[0]
    # catch deprecation and neighbors warnings
    with warnings.catch_warnings(record=True):
        alg = Alg()
    set_fast_parameters(alg)
    if hasattr(alg, "n_clusters"):
        alg.set_params(n_clusters=3)
    set_random_state(alg)
    if name == 'AffinityPropagation':
        alg.set_params(preference=-100)
        alg.set_params(max_iter=100)

    # fit
    alg.fit(X)
    # with lists
    alg.fit(X.tolist())

    assert_equal(alg.labels_.shape, (n_samples,))
    pred = alg.labels_
    assert_greater(adjusted_rand_score(pred, y), 0.4)
    # fit another time with ``fit_predict`` and compare results
    # (equality, not identity: ``is`` on a string literal only works when
    # the interpreter happens to intern both strings)
    if name == 'SpectralClustering':
        # there is no way to make Spectral clustering deterministic :(
        return
    set_random_state(alg)
    with warnings.catch_warnings(record=True):
        pred2 = alg.fit_predict(X)
    assert_array_equal(pred, pred2)
Example #15
0
def eval_clustering(labels_true, labels_guess):
    """
    Measure how similar a guessed cluster assignment is to the ground truth.

    :param labels_true: ground-truth labels
    :param labels_guess: guessed cluster labels
    :return: the adjusted Rand index of the two assignments
    """
    similarity = metrics.adjusted_rand_score(labels_true, labels_guess)
    return similarity
Example #16
0
def cluster(X,label):
    """Cluster ``X`` with AffinityPropagation and print a short report.

    X:     DataFrame holding the features plus the 'churn', 'appetency' and
           'upselling' columns and the ``label`` column
    label: name of the column used as the reference labeling

    The three target columns and ``label`` are dropped before fitting.  The
    estimator name, the number of clusters found (a -1 label is not counted)
    and the adjusted Rand index against ``X[label]`` are printed; nothing is
    returned.
    """
    labels_true = X[label]
    X = X.drop(['churn','appetency','upselling',label],axis='columns')
    name = 'AffinityPropagation'
    est = AffinityPropagation(preference=-50)
    labels = est.fit(X).labels_
    # computed once and reused for the report
    ari = metrics.adjusted_rand_score(labels_true, labels)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated estimator: %s' % name)
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Adjusted Rand Index: %0.3f"
          % ari)
def cluster_evaluation(D, y_true, n_clusters, eps=0.8, min_samples=10):
    """Cluster with Ward on ``1 - D`` and print the usual clustering scores.

    D:          distance matrix; also used as precomputed distances for the
                silhouette coefficient
    y_true:     reference labels
    n_clusters: number of clusters requested from Ward
    eps, min_samples: leftovers from an earlier DBSCAN variant; not used
    """
    labels_true = y_true

    # transform distance matrix into a similarity matrix
    S = 1 - D

    # Ward clustering is fitted on the similarity matrix
    labels = Ward(n_clusters=n_clusters).fit(S).labels_

    # number of clusters in labels, ignoring noise if present
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Number of clusters: %d' % n_clusters_)

    # each score is computed right before its line is printed
    reports = (
        ('Homogeneity: %0.3f', lambda: metrics.homogeneity_score(labels_true, labels)),
        ('Completeness: %0.3f', lambda: metrics.completeness_score(labels_true, labels)),
        ('V-meassure: %0.3f', lambda: metrics.v_measure_score(labels_true, labels)),
        ('Adjusted Rand Index: %0.3f', lambda: metrics.adjusted_rand_score(labels_true, labels)),
        ('Adjusted Mutual Information: %0.3f', lambda: metrics.adjusted_mutual_info_score(labels_true, labels)),
        ('Silhouette Coefficient: %0.3f', lambda: metrics.silhouette_score(D, labels, metric='precomputed')),
    )
    for template, compute in reports:
        print(template % compute())
Example #18
0
def compare(method1, method2, fig=False):
    """Compare the Ward clusterings obtained from two 2-D embeddings.

    Loads ``<species>_<method>_X_2d.npy`` for both methods (``species`` is a
    module-level name) and, for 2 to 5 clusters, prints homogeneity,
    completeness, NMI and ARI between the two labelings.  With ``fig`` set,
    the 3-cluster labels of each method are plotted on the other method's
    embedding.
    """
    X1 = np.load('{0}_{1}_X_2d.npy'.format(species, method1))
    X2 = np.load('{0}_{1}_X_2d.npy'.format(species, method2))

    print('n_cluster\tHomo\tCompl\tNMI\tARI')
    for i in range(2, 6):
        clust1 = Clustering(species, method1, X1, None, n_clusters=i)
        clust2 = Clustering(species, method2, X2, None, n_clusters=i)

        clust1.agglomerative(linkage='ward')
        clust2.agglomerative(linkage='ward')

        label1 = clust1.pred_labels('ward')
        label2 = clust2.pred_labels('ward')

        if i == 3 and fig:
            # (labels, embedding to draw on, labeling method, embedding method)
            crossed = ((label1, X2, method1, method2),
                       (label2, X1, method2, method1))
            for labels, embedding, labeled_by, drawn_on in crossed:
                names = np.unique(labels)
                figName = '{0}_{1}_on_{2}'.format(species, labeled_by, drawn_on)
                plot2d(embedding, labels, names, figName, figName)

        row = (i,
               metrics.homogeneity_score(label1, label2),
               metrics.completeness_score(label1, label2),
               metrics.normalized_mutual_info_score(label1, label2),
               metrics.adjusted_rand_score(label1, label2))
        print('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(*row))
Example #19
0
def cluster():
    """Grid-search DBSCAN over eps and min_samples and plot the ARI map.

    Reads the module-level data ``x`` and labels ``y``.  For every
    combination the adjusted Rand index is printed and
    ``(ari, number of clusters, number of noise points)`` is appended to the
    module-level list ``res`` (reset on every call).  The best score with
    its parameters is printed and the score grid is shown as an image.
    """
    eps_set = 0.5 * np.arange(1, 7)
    npt_set = np.arange(1, 6)
    scores = []
    global res
    res = []
    for eps in eps_set:
        for npt in npt_set:
            est = DBSCAN(eps=eps, min_samples=npt)
            est.fit(x)
            ari = metrics.adjusted_rand_score(y, est.labels_)
            scores.append(ari)
            # points labelled -1 are counted as noise
            n_noise = int(np.sum(est.labels_ == -1))
            res.append((ari, np.max(est.labels_) + 1 , n_noise))
            print(ari)
    max_score = np.max(scores)
    max_idx = scores.index(max_score)
    # scores are stored eps-major; divmod keeps both indices integral
    # whatever the division semantics of the interpreter
    eps_idx, npt_idx = divmod(max_idx, len(npt_set))
    max_eps = eps_set[eps_idx]
    max_npt = npt_set[npt_idx]
    print("%s %s %s" % (max_score, max_eps, max_npt))
    scores = np.array(scores).reshape(len(eps_set), len(npt_set))
    # 'spectral' was renamed 'nipy_spectral' and later removed from
    # matplotlib; prefer the new name, fall back to the old one
    cmap = getattr(pl.cm, 'nipy_spectral', None)
    if cmap is None:
        cmap = pl.cm.spectral
    pl.imshow(scores, interpolation='nearest', cmap=cmap)
    pl.colorbar()
    pl.xticks(np.arange(len(npt_set)), npt_set)
    pl.yticks(np.arange(len(eps_set)), eps_set)
    pl.ylabel('eps')
    pl.xlabel('min_samples')
    pl.show()
Example #20
0
def test_affinities():
    """Spectral clustering affinities: two exact recoveries, one error case.

    Two well separated blobs must be recovered exactly with the
    nearest-neighbors affinity and with the default affinity at ``gamma=2``;
    an unknown affinity name must make ``fit`` raise ``ValueError``.
    """
    X, y = make_blobs(n_samples=40, random_state=1, centers=[[1, 1], [-1, -1]],
                      cluster_std=0.4)
    working_setups = (
        {'affinity': 'nearest_neighbors'},  # nearest neighbors affinity
        {'gamma': 2},
    )
    for extra in working_setups:
        model = SpectralClustering(n_clusters=2, random_state=0, **extra)
        predicted = model.fit(X).labels_
        assert_equal(adjusted_rand_score(y, predicted), 1)

    # raise error on unknown affinity
    broken = SpectralClustering(n_clusters=2, affinity='<unknown>')
    assert_raises(ValueError, broken.fit, X)
Example #21
0
def compare_direct_undir():
    """Compare infomap communities of the 'ed_tag' and 'ed_tag_undir' graphs.

    Prints ARI then NMI between the memberships found on the two graphs,
    followed by ARI then NMI between two infomap runs on the
    'ed_tag_undir' graph.
    """
    from sklearn import metrics

    def load(path):
        # read a GraphML file and print its network statistics
        graph = gt.Graph.Read_GraphML(path)
        gt.net_stat(graph)
        return graph

    g = load('ed_tag.graphml')
    gu = load('ed_tag_undir.graphml')

    infomap_options = dict(edge_weights='weight', vertex_weights='weight')
    com = g.community_infomap(**infomap_options)
    comu1 = gu.community_infomap(**infomap_options)
    comu2 = gu.community_infomap(**infomap_options)

    reference = comu1.membership
    for membership in (com.membership, comu2.membership):
        print(metrics.adjusted_rand_score(membership, reference))
        print(metrics.normalized_mutual_info_score(membership, reference))
Example #22
0
def compute_metrics(answers, predictions):
    """Print clustering scores per key of ``answers`` and return two means.

    For every key, the reference labels ``answers[k][1]`` are reordered by
    the sort order of ``answers[k][0]`` and compared with
    ``predictions[k][1]`` taken as stored (presumably already in that order
    -- verify against the caller).  ARI is only computed when the reference
    has more than one distinct label; V-measure and F-score always are, and
    each key is weighted by its number of predictions.

    Returns ``(mean ARI, mean V-measure)``.
    """
    aris, vscores, fscores, weights = [], [], [], []
    for k in answers.keys():
        order = np.argsort(np.array(answers[k][0]))
        true = np.array(answers[k][1])[order]
        pred = np.array(predictions[k][1])
        weights.append(pred.shape[0])
        if len(np.unique(true)) > 1:
            aris.append(adjusted_rand_score(true, pred))
        vscores.append(v_measure_score(true, pred))
        fscores.append(compute_fscore(true, pred))

    aris, vscores = np.array(aris), np.array(vscores)
    fscores, weights = np.array(fscores), np.array(weights)

    # keys without an ARI are those whose reference has a single label
    print('number of one-sense words: %d' % (len(vscores) - len(aris)))
    mean_ari = np.mean(aris)
    print('mean ari: %f' % mean_ari)
    mean_vscore = np.mean(vscores)
    print('mean vscore: %f' % mean_vscore)
    shares = weights / float(np.sum(weights))
    print('weighted vscore: %f' % np.sum(vscores * shares))
    print('mean fscore: %f' % np.mean(fscores))
    print('weighted fscore: %f' % np.sum(fscores * shares))
    return mean_ari, mean_vscore
Example #23
0
def test():
    """Fit DBSCAN(eps=1, min_samples=1) on the module-level ``x`` and report.

    The fitted estimator is published as the module-level ``est``.  The
    labels are printed, followed by their adjusted Rand index against the
    module-level ``y``.
    """
    global est
    est = DBSCAN(eps=1, min_samples=1)
    est.fit(x)
    predicted = est.labels_
    print(predicted)
    print(metrics.adjusted_rand_score(y, predicted))
Example #24
0
def check_iris():
    """Run OPTICS on the first 100 iris samples and plot it next to the truth.

    Top row: reference clusters; bottom row: OPTICS clusters.  Left column:
    features 0 and 1 (sepal); right column: features 2 and 3 (petal).  The
    adjusted Rand index is printed once the figure has been shown.
    """
    iris = ds.load_iris()
    data = iris.data[:100]  # data
    y_iris = iris.target[:100]  # clusters

    pred_optics = OPTICS(eps=0.6, min_pts=5).fit_predict(data, xi=0.3)

    # (subplot index, feature columns, colours, x label, y label)
    panels = (
        (1, (0, 1), y_iris, 'Sepal length, reference clusters', 'Sepal width'),
        (2, (2, 3), y_iris, 'Petal length, reference clusters', 'Petal width'),
        (3, (0, 1), pred_optics, 'Sepal length, optics clusters', 'Sepal width'),
        (4, (2, 3), pred_optics, 'Petal length, optics clusters', 'Petal width'),
    )
    for index, (col_x, col_y), colours, x_label, y_label in panels:
        pl.subplot(2, 2, index)
        pl.scatter(data[:, col_x], data[:, col_y], c=colours, cmap=pl.cm.RdBu, lw=0, s=30)
        pl.xlabel(x_label)
        pl.ylabel(y_label)
    pl.show()
    print("Adjusted Rand index for iris is: %.2f" % smt.adjusted_rand_score(y_iris, pred_optics))
Example #25
0
 def get_ari(self):
     """Return the adjusted Rand index of the mclust result, cached.

     The score compares ``self.mclustRes['classification']`` (shifted from
     1-based to 0-based integers) with ``self.labels`` and is stored in
     ``self._ari`` on first use.

     When no result is available yet a message is printed and whatever is
     cached in ``self._ari`` (``None`` if nothing) is returned, instead of
     failing on the missing result.
     """
     if self.mclustRes is None:
         print("No results yet. Use run(x)")
         return self._ari
     if self._ari is None:
         mc_class = np.array(self.mclustRes['classification']).astype(int)-1
         self._ari = metrics.adjusted_rand_score(mc_class,self.labels)
     return self._ari
def rand_compare(truth_file, estimate_file, topics_only=False, randomize_pov=False):
    """Return the adjusted Rand index between two clustering files.

    Both files are read with ``read_clustering_file``; ``topics_only`` is
    forwarded for both, ``randomize_pov`` only for the estimate file.
    """
    truth_labels = read_clustering_file(truth_file, topics_only=topics_only)
    estimate_options = dict(topics_only=topics_only, randomize_pov=randomize_pov)
    estimate_labels = read_clustering_file(estimate_file, **estimate_options)
    score = metrics.adjusted_rand_score(truth_labels, estimate_labels)
    return score
Example #27
0
def run_clustering( clusterer, data, labels ):
    """
    Fit an already configured clustering algorithm on a dataset, time the
    fit, and log the agreement of the result with ground-truth labels.

        clusterer: the clustering algorithm; must offer fit() and labels_
        data:      array-like dataset input
        labels:    vector of ground-truth labels

    Nothing is returned; all results go to the logging module at INFO level.
    """

    # Time the operation
    started = time()
    clusterer.fit(data)
    runtime = time() - started

    # Every metric is computed before the first line is logged
    predicted = clusterer.labels_
    report = (
        ("  |-        Execution time: %fs", runtime),
        ("  |-           Homogeneity: %0.3f", metrics.homogeneity_score(labels, predicted)),
        ("  |-          Completeness: %0.3f", metrics.completeness_score(labels, predicted)),
        ("  |-             V-measure: %0.3f", metrics.v_measure_score(labels, predicted)),
        ("  |-   Adjusted Rand-Index: %.3f", metrics.adjusted_rand_score(labels, predicted)),
        ("  |-  Adjusted Mutual Info: %.3f", metrics.adjusted_mutual_info_score(labels, predicted)),
    )

    # Output to logs
    for template, value in report:
        logging.info(template % value)
def test_spectral_clustering_with_arpack_amg_solvers():
    """Check that the arpack and amg eigen solvers produce the same clustering.

    Based on the toy example from plot_segmentation_toy.py. When the amg
    solver is not available (``amg_loaded`` is falsy), requesting it must
    raise ValueError instead.
    """
    # A small 40x40 image holding two discs ("coins").
    rows, cols = np.indices((40, 40))

    def disc(center, radius):
        return (rows - center[0]) ** 2 + (cols - center[1]) ** 2 < radius ** 2

    coins = disc((14, 12), 8) | disc((20, 25), 7)
    coin_mask = coins.copy()
    image = coins.astype(float)

    graph = img_to_graph(image, mask=coin_mask)
    # Turn the graph's edge values into similarities.
    graph.data = np.exp(-graph.data / graph.data.std())

    common = dict(n_clusters=2, random_state=0)

    labels_arpack = spectral_clustering(graph, eigen_solver='arpack', **common)
    assert len(np.unique(labels_arpack)) == 2

    if not amg_loaded:
        assert_raises(
            ValueError, spectral_clustering,
            graph, n_clusters=2, eigen_solver='amg', random_state=0)
        return

    labels_amg = spectral_clustering(graph, eigen_solver='amg', **common)
    assert adjusted_rand_score(labels_arpack, labels_amg) == 1
Example #29
0
    def test_KMeans_scores(self):
        """Compare the frame's clustering metric accessors with the plain metric functions.

        K-Means is fitted twice on the scaled digits data with identical
        parameters: once directly, once through the ModelFrame accessor. Each
        score reported by ``df.metrics`` must then equal the score computed
        from the directly fitted estimator's labels.
        """
        digits = datasets.load_digits()
        frame = pdml.ModelFrame(digits)

        scaled = pp.scale(digits.data)
        frame.data = frame.data.pp.scale()
        self.assert_numpy_array_almost_equal(frame.data.values, scaled)

        kmeans_params = dict(init='k-means++', n_clusters=10,
                             n_init=10, random_state=self.random_state)
        plain = cluster.KMeans(**kmeans_params)
        wrapped = frame.cluster.KMeans(**kmeans_params)
        plain.fit(scaled)
        frame.fit_predict(wrapped)

        # homogeneity_score appears twice, so it is re-checked after the
        # other scores have been requested.
        exact_scores = ('homogeneity_score', 'completeness_score',
                        'v_measure_score', 'adjusted_rand_score',
                        'homogeneity_score')
        for score_name in exact_scores:
            expected = getattr(m, score_name)(digits.target, plain.labels_)
            self.assertEqual(getattr(frame.metrics, score_name)(), expected)

        # The silhouette score is sampled, so both sides share the same
        # sample size and random state and are compared approximately.
        silhouette_params = dict(metric='euclidean', sample_size=300,
                                 random_state=self.random_state)
        expected = m.silhouette_score(scaled, plain.labels_, **silhouette_params)
        result = frame.metrics.silhouette_score(**silhouette_params)
        self.assertAlmostEqual(result, expected)
Example #30
0
 def test_n_trials(self):
     """With n_trials=5 the clustering of self.make_blobs(150) must match the given labels exactly."""
     clusterer = PinchRatioClustering(2, similarity.KNN(9), n_trials=5)
     samples, truth = self.make_blobs(150)
     clusterer.fit(samples)
     # An adjusted Rand score of exactly 1.0 is required.
     self.assertEqual(metrics.adjusted_rand_score(clusterer.labels, truth), 1.0)
Example #31
0
 def test_adjusted_rand_score(self):
     """The frame's adjusted_rand_score accessor must equal the metric computed from self.target and self.pred."""
     observed = self.df.metrics.adjusted_rand_score()
     reference = metrics.adjusted_rand_score(self.target, self.pred)
     self.assertEqual(observed, reference)
Example #32
0
def performance():
    """Cross-validate the K-Means and GMM models and report their average scores.

    The CSV at ``DATASET`` is read and passed through ``preprocess``, then
    split into ``num_trials`` folds with ``KFold``. For every fold the two
    models returned by ``train(False, split_data)`` are scored on the test
    data that ``train`` hands back, using the adjusted Rand score and
    ``purity``. The per-model averages are printed and written as a CSV file
    to ``PERFORMANCE_DIR/PERFORMANCE_NAME``.
    """
    num_trials = 3
    randind_kmean, purity_kmean = [], []
    randind_gmm, purity_gmm = [], []

    data = pd.read_csv(DATASET)
    X, y = preprocess(data)
    kf = KFold(n_splits=num_trials)

    print("Running {} train and evaluate iterations".format(num_trials))
    for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
        print("Iteration {} out of {}".format(i, num_trials))
        split_data = [
            X.iloc[train_index], X.iloc[test_index],
            y.iloc[train_index], y.iloc[test_index],
        ]
        # Score on the test data returned by train(), not on the raw slice.
        kmean, gmm, X_test, y_test = train(False, split_data)

        kmean_pred = kmean.predict(X_test)
        gmm_pred = gmm.predict(X_test)

        randind_kmean.append(adjusted_rand_score(y_test, kmean_pred))
        randind_gmm.append(adjusted_rand_score(y_test, gmm_pred))

        purity_kmean.append(purity(y_test, kmean_pred))
        purity_gmm.append(purity(y_test, gmm_pred))

    randind_gmm_av = pd.DataFrame(randind_gmm).mean().iloc[0]
    purity_gmm_av = pd.DataFrame(purity_gmm).mean().iloc[0]

    randind_kmean_av = pd.DataFrame(randind_kmean).mean().iloc[0]
    purity_kmean_av = pd.DataFrame(purity_kmean).mean().iloc[0]

    print('({:d} trials)'.format(num_trials))
    print('_________________K-Means Model____________________')
    print("The average adjusted rand score for testing data: %.2f" %
          randind_kmean_av)
    print("The average purity for testing data: %.2f" % purity_kmean_av)

    print('_________________GMM Model____________________')
    print("The average adjusted rand score for testing data: %.2f" %
          randind_gmm_av)
    print("The average purity for testing data: %.2f" % purity_gmm_av)
    print('******************************************************* ')

    with open(os.path.join(PERFORMANCE_DIR, PERFORMANCE_NAME), 'w') as outfile:
        headers = ['Model', 'Adjusted Rand Score', 'Purity']
        # Build the table in one go: DataFrame.append no longer exists in
        # pandas 2.x, and the index is not written to the CSV anyway.
        results = pd.DataFrame(
            [
                ['K-Means', randind_kmean_av, purity_kmean_av],
                ['Guassian Mixture', randind_gmm_av, purity_gmm_av],
            ],
            columns=headers,
        )

        print("Writing results to {}".format(outfile.name))
        results.to_csv(path_or_buf=outfile, index=False)
Example #33
0
def news_cluster_advanced():
    """Cluster the divided news data with K-Means and print an evaluation report.

    The data from ``get_divided_data`` is vectorised by ``extract_traits``,
    clustered into ``true_k`` groups with MiniBatchKMeans or KMeans depending
    on ``opt.minibatch``, and scored against ``data['DividedContent']``.
    Unless ``opt.use_hashing`` is set, the share of documents and the top 80
    terms of every cluster are printed as well. Nothing is returned.
    """
    opt = Config()

    data = get_divided_data()
    stop_words = get_stopwords()

    X, vectorizer, svd = extract_traits(data, stop_words)

    labels = data['DividedContent']
    true_k = 31  # number of clusters

    if opt.minibatch:
        km = MiniBatchKMeans(n_clusters=true_k,
                             init='k-means++',
                             n_init=2,
                             init_size=1000,
                             batch_size=1500,
                             verbose=opt.verbose)
    else:
        # NOTE(review): recent scikit-learn releases no longer accept n_jobs
        # on KMeans — confirm against the version this project pins.
        km = KMeans(n_clusters=true_k,
                    init='k-means++',
                    max_iter=300,
                    n_init=5,
                    n_jobs=-1,
                    verbose=opt.verbose)

    print("对稀疏数据(Sparse Data) 采用 %s" % km)
    t0 = time.time()
    km.fit(X)
    print("完成所耗费时间:%0.3fs" % (time.time() - t0))
    print()

    print("Homogeneity值: %0.3f" %
          metrics.homogeneity_score(labels, km.labels_))
    print("Completeness值: %0.3f" %
          metrics.completeness_score(labels, km.labels_))
    print("V-measure值: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index值: %.3f" %
          metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient值: %0.3f" %
          metrics.silhouette_score(X, km.labels_, sample_size=1000))

    print()

    # Use the fitted clustering model to assign every document to a cluster.
    label_prediction = list(km.predict(X))

    if not opt.use_hashing:
        print("每个聚类的TOP关键词:")

        if opt.n_components:
            # Map the centroids back to the original term space before
            # ranking the terms.
            original_space_centroids = svd.inverse_transform(
                km.cluster_centers_)
            order_centroids = original_space_centroids.argsort()[:, ::-1]
        else:
            order_centroids = km.cluster_centers_.argsort()[:, ::-1]

        # NOTE(review): newer scikit-learn vectorizers expose
        # get_feature_names_out instead — confirm against the pinned version.
        terms = vectorizer.get_feature_names()
        total_docs = len(data['DividedContent'])
        for i in range(true_k):
            print("簇群 %d   " % (i + 1), end='')
            # The value is followed by a '%' sign, so scale the fraction to
            # an actual percentage first.
            share = 100.0 * label_prediction.count(i) / total_docs
            print("该簇群所含文档占比为", '%.4f%%' % share)
            print("簇群关键词:")
            for ind in order_centroids[i, :80]:
                print(' %s,' % terms[ind], end='')
            print(
                '\n------------------------------------------------------------------------------------------------'
            )
# Unique values found in yhat (presumably numpy's unique — confirm import).
clusters = unique(yhat)

#
# Display the 2D and 3D plots and the components
#
plot_pca_2d(x_pca,yhat)
plot_pca_3d(x_pca,yhat,elevacao,azimute)
plot_componentes(df_pca_componentes)


#
# Store the model's result in a dataframe/csv
#
df_algoritimos["HC"]=yhat

#
# ARI analysis
#
st.title("Similaridade entre os clusters")
st.header("(Adjusted Rand Score Index)")
lista=["KMeans","mean_s", "dbscan","GMM","HC"]
# Pairwise adjusted Rand score between the columns named in `lista`:
# ari_score[mod1][mod2] holds the score of column mod1 against column mod2.
ari_score={}
for mod1 in lista:
    reg={}
    for mod2 in lista:
        ari=adjusted_rand_score(df_algoritimos[mod1].values, df_algoritimos[mod2].values)
        reg[mod2]=ari
    ari_score[mod1]=reg
# Render the score matrix as an annotated heatmap.
df_ari_score=pd.DataFrame(ari_score)
sns.heatmap(df_ari_score,annot=True, cmap="Blues")
st.pyplot()
    plt.xticks([], [])
    plt.xlabel('')
    plt.yticks([], [])
    plt.ylabel('')
    #plt.show()
    plt.savefig('./clustering/' + name + '.png')


muestra_agrupacion(X, y, "No Clustering")

# Adjusted Rand score of K-Means against the labels `y`, one row per k in 2..7.
METRICAS = pd.DataFrame(columns=['Resultado'])
for i in range(2, 8):
    kmeans = KMeans(n_clusters=i, random_state=0).fit(X)
    #muestra_agrupacion(X, kmeans.labels_)
    print("kmean with k=" + str(i))
    # adjusted_rand_score takes just the two label vectors; the row name is
    # already carried by the .loc key, so no third argument is passed.
    METRICAS.loc['K=' + str(i)] = metrics.adjusted_rand_score(
        y, kmeans.labels_)

print(METRICAS)

# feature selection
RESULTADOS_SELEC = pd.DataFrame(columns=['Accuracy', "Tiempo"])
clasif = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
selector = SelectKBest(chi2, k=20)

# "No selector" run: the classifier is evaluated on X_train / X_test as-is.
RESULTADOS_SELEC["No selector"] = experimento_clas(clasif, X_train, y_train,
                                                   X_test, y_test)
# NOTE(review): the value returned by fit_transform is discarded and the
# split below is taken from the unmodified X, so the selected features do not
# appear to reach the *_sel variables — confirm intent.
selector = SelectKBest(chi2, k=20)
selector.fit_transform(X, y)
# NOTE(review): test_size=0.8 — with sklearn's train_test_split this puts 80%
# of the samples in the test part; confirm this is intended.
X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(
    X, y, test_size=0.8)
RESULTADOS_SELEC["chi2"] = experimento_clas(clasif, X_train_sel, y_train_sel,
Example #36
0
def GMM(data, labels, n_components):
    """Fit a full-covariance Gaussian mixture on data and score it against labels.

    The mixture uses ``n_components`` components and ``random_state=0``. The
    return value is the adjusted Rand score between ``labels`` and the
    mixture's predictions for the same ``data`` it was fitted on.
    """
    model = mixture.GaussianMixture(
        n_components=n_components,
        covariance_type='full',
        random_state=0,
    )
    fitted = model.fit(data)
    predicted = fitted.predict(data)
    return metrics.adjusted_rand_score(labels, predicted)
Example #37
0
def get_ar(y_true, y_pred):
    """Return metrics.adjusted_rand_score for the two label assignments."""
    score = metrics.adjusted_rand_score(y_true, y_pred)
    return score
Example #38
0
print("\t n_samples %d, \t n_features %d" % (n_samples, n_features))

print(82 * '_')
# Column header for the benchmark rows printed below.
print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

# Clustering on raw data with k-means++ initialisation. t0 is taken before the
# estimator is built, and the elapsed time is evaluated before any score is
# computed, so the reported time covers construction and fit only.
t0 = time.time()
kmeans = KMeans(init='k-means++', n_clusters=53, n_init=10)
kmeans.fit(data)
# One row: init name, elapsed time, inertia, then the six scores in the same
# order as the header.
print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' %
      ('k-means++', (time.time() - t0), kmeans.inertia_,
       metrics.homogeneity_score(labels, kmeans.labels_),
       metrics.completeness_score(labels, kmeans.labels_),
       metrics.v_measure_score(labels, kmeans.labels_),
       metrics.adjusted_rand_score(labels, kmeans.labels_),
       metrics.adjusted_mutual_info_score(
           labels, kmeans.labels_, average_method='arithmetic'),
       metrics.silhouette_score(
           data, kmeans.labels_, metric='euclidean', sample_size=sample_size)))

t0 = time.time()
kmeans = KMeans(init='random', n_clusters=53, n_init=10)
kmeans.fit(data)
print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' %
      ('random', (time.time() - t0), kmeans.inertia_,
       metrics.homogeneity_score(labels, kmeans.labels_),
       metrics.completeness_score(labels, kmeans.labels_),
       metrics.v_measure_score(labels, kmeans.labels_),
       metrics.adjusted_rand_score(labels, kmeans.labels_),
       metrics.adjusted_mutual_info_score(
Example #39
0
def test_DBSCAN(*data):
    """Run DBSCAN with default parameters and print its ARI and core-sample count.

    ``data`` must unpack into exactly two items: the samples to cluster and
    their true labels.
    """
    samples, truth = data
    model = cluster.DBSCAN()
    assigned = model.fit_predict(samples)
    print("ARI:%s" % adjusted_rand_score(truth, assigned))
    print("Core sample num:%d" % len(model.core_sample_indices_))
Example #40
0
# persim.heat of the original input against the comparison and the upscaled
# one (inputs are presumably persistence diagrams — confirm).
h = persim.heat(originalPers, comparePers)
h_u = persim.heat(originalPers, upscalePers)

print "Computing wasserstein..."
ws = persim.wasserstein(originalPers, comparePers)
ws_u = persim.wasserstein(originalPers, upscalePers)
end = time.time()
#gh = persim.gromov_hausdorff(originalPers, comparePers)
#gh_u = persim.gromov_hausdorff(originalPers, upscalePers)

# The k-means++ output file serves as the baseline for the ARI values below.
baselineCluster = np.genfromtxt(outDir + "kmeans++/reducedData.csv")
print "Calculating ARI with k-means as the baseline comparison..."
#measure (dis)similarity between clusterings
#sklearn.metrics.adjusted_rand_score(labels_true, labels_pred)
#placeholder for now, need to confirm with nick
# NOTE(review): kmeansARI compares the baseline with itself, and the four
# calls after it pass a file path string as the second argument rather than
# data loaded from that file — these look like the placeholders mentioned
# above and need real label arrays before the values mean anything.
kmeansARI = met.adjusted_rand_score(baselineCluster, baselineCluster)
agglomWardARI = met.adjusted_rand_score(
    baselineCluster, outDir + "agglomerativeWard/reducedData.csv")
agglomSingleARi = met.adjusted_rand_score(
    baselineCluster, outDir + "agglomerativeSingle/reducedData.csv")
hdbscanARI = met.adjusted_rand_score(baselineCluster,
                                     outDir + "hdbscan/reducedData.csv")
randomARI = met.adjusted_rand_score(baselineCluster,
                                    outDir + "random/reducedData.csv")
print "Calculating Silhouette score "
#silhouette score - shows how close each point in one cluster is to points in neighboring clusters - used to find optimum num clusters
#sklearn.metrics.silhouette_score(X, labels, metric='euclidean', sample_size=None, random_state=None, **kwds)
#kmeansARI = met.silhouette_score(
#agglomWardARI = met.silhouette_score(
#agglomSingleARi = met.silhouette_score(
#hdbscanARI = met.silhouette_score(
# Ten times the median of S; handed to fit() below as its second argument
# (presumably the preference value — confirm against the AffinityPropagation
# version in use).
p = 10 * np.median(S)

##############################################################################
# Compute Affinity Propagation
af = AffinityPropagation().fit(S, p)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

# One cluster per reported center index.
n_clusters_ = len(cluster_centers_indices)

# Compare the found labels with the ground truth in labels_true.
print 'Estimated number of clusters: %d' % n_clusters_
print "Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)
print "Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)
print "V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)
print "Adjusted Rand Index: %0.3f" % \
    metrics.adjusted_rand_score(labels_true, labels)

##############################################################################
# Plot result
import pylab as pl
from itertools import cycle

pl.close('all')
pl.figure(1)
pl.clf()

# One colour per cluster; zip() stops at n_clusters_, the colour cycle never ends.
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    # Boolean mask selecting the samples assigned to cluster k.
    class_members = labels == k
    cluster_center = X[cluster_centers_indices[k]]
    # Members of the cluster are drawn as dots in the cluster's colour.
    pl.plot(X[class_members, 0], X[class_members, 1], col + '.')
                     init='k-means++',
                     n_init=1,
                     init_size=1000,
                     batch_size=1000)

# Fit the estimator `km` on X and report how long the fit took.
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

# Agreement between the ground-truth `labels` and the fitted km.labels_.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f" %
      metrics.adjusted_rand_score(labels, km.labels_))
# The silhouette is computed on a sample of 1000 rather than on all of X.
print("Silhouette Coefficient: %0.3f" %
      metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()

# NOTE(review): only the heading is printed; the code that listed the terms
# is commented out below.
print("Top terms per cluster:")

#original_space_centroids = svd.inverse_transform(km.cluster_centers_)
#order_centroids = original_space_centroids.argsort()[:, ::-1]
#
#terms = vectorizer.get_feature_names()
#for i in range(true_k):
#    print("Cluster %d:" % i, end='')
#    for ind in order_centroids[i, :10]:
#        print(' %s' % terms[ind], end='')
Example #43
0
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr,
          sp_master, spark_rdd_compress, spark_driver_maxResultSize,
          sp_exe_memory, sp_core_max, zipout_dir, zipcode_dir, zip_file_name,
          mongo_tuples, labelnameflag, fromweb, src_filename, jobname,
          model_data_folder):
    """Fit a clustering model on a libsvm feature file and save model and plots.

    The samples are loaded through Spark, collected locally, and fitted with
    the model built by ``parse_para_and_get_model``. The fitted model is
    dumped with joblib, the clustering is compared with the true labels
    (adjusted mutual information and adjusted Rand score), and histogram and
    dot plots are written to ``local_out_dir``.

    Parameters:
        row_id_str: id used in the model/plot file names and in the SQL
            update.
        ds_id: when different from ``row_id_str``, the label dictionary is
            looked up under this id instead.
        hdfs_feat_dir, src_filename: joined to form the libsvm data file path.
        local_out_dir: output folder for plots; created if missing.
        ml_opts_jstr: passed to ``parse_para_and_get_model``.
        sp_master, spark_rdd_compress, spark_driver_maxResultSize,
        sp_exe_memory, sp_core_max, jobname: forwarded to
            ``ml_util.ml_get_spark_context``.
        zipout_dir, zipcode_dir, zip_file_name: forwarded to
            ``ml_build_zip_file``.
        mongo_tuples: passed to ``query_mongo.find_one_t``.
        labelnameflag: 1 reads the label names from Mongo; any other value
            generates them from the label values.
        fromweb: the string "1" triggers the ``atdml_document`` update.
        model_data_folder: folder the model pickle is written to.

    Returns:
        None when no model could be built or ``model.fit`` failed, otherwise 0.
    """

    # create zip files for Spark workers ================= ================
    zip_file_path = ml_build_zip_file(zipout_dir,
                                      zipcode_dir,
                                      zip_file_name,
                                      prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    #data_folder = hdfs_feat_dir + "/"
    #local_out_dir = local_out_dir + "/"
    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)

    # ML model filename ====
    model_fname = os.path.join(model_data_folder, row_id_str + '.pkl')
    print "INFO: model_data_folder=", model_data_folder
    # create out folders and clean up old model files ====
    ml_util.ml_prepare_output_dirs(row_id_str, local_out_dir,
                                   model_data_folder, model_fname)

    # init Spark context ====
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])

    # start here =================================================================== ===============
    t0 = time()

    ### load libsvm file: may or may not be PCA-ed ###
    libsvm_data_file = os.path.join(hdfs_feat_dir, src_filename)
    print "INFO: libsvm_data_file=", libsvm_data_file

    # feature count is a variable if PCA
    feature_count = 0

    # samples_rdd may be from PCAed data
    # load sample RDD from text file
    # format (LabeledPoint,hash) from str2LabeledPoint_hash()
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(
        sc, libsvm_data_file, feature_count, '')

    # collect all data to local for processing ===============
    all_data = samples_rdd.collect()
    total_sample_count = len(all_data)
    # 2-D array, may be PCAed
    features_list = [x.features.toArray() for x, _ in all_data]
    # label array
    labels_list_all = [x.label for x, _ in all_data]
    # hash array
    hash_list_all = [x for _, x in all_data]
    # convert to np array
    features_array_reduced = np.array(features_list)
    hash_list_all = np.array(hash_list_all)
    labels_list_all = np.array(labels_list_all)
    # NOTE(review): int8 only holds -128..127; label values outside that
    # range would not survive this cast — confirm the label range.
    true_label_array = np.array(labels_list_all, dtype=np.int8)

    print "INFO: total_sample_count=", total_sample_count
    print "INFO: features_array_reduced.shape=", features_array_reduced.shape
    print "INFO: labels_list_all.shape=", labels_list_all.shape
    print "INFO: true_label_array.shape=", true_label_array.shape

    t1 = time()
    print 'INFO: data generating time: %f' % (t1 - t0)

    ###############################################
    ########## build learning model ###############
    ###############################################

    ### parse parameters and generate the model ###
    (model, alg, n_clusters) = parse_para_and_get_model(ml_opts_jstr)
    if model is None:
        return

    labels_kmeans = None
    #### fit the model to training dataset ####
    # NOTE(review): the bare except below also catches KeyboardInterrupt and
    # SystemExit, and only the exception type is printed, not its message.
    try:
        model.fit(features_array_reduced)
        labels_kmeans = model.labels_  #'numpy.ndarray'

    except:
        print "ERROR: Error in model.fit(): ", "model=", model, ", sys.exc_info:", sys.exc_info(
        )[0]
        return

    #### save clf for future use ####
    #joblib.dump(model, model_data_folder + row_id_str+'.pkl')
    joblib.dump(model, model_fname)

    #print "**model:intercept***"
    #print clf.intercept_

    print "INFO: model type=", type(model), " model=", model

    ###################################################
    ### generate label names (family names) ###########
    ### connect to database to get the column list which contains all column number of the corresponding feature####
    ###################################################

    if labelnameflag == 1:
        key = "dic_name_label"
        jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
        jstr_proj = '{"value":1}'

        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

        doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']

        # Invert each stored entry so the stored value becomes the dictionary
        # key and the stored key (UTF-8 encoded) becomes the label name.
        label_dic = {}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        print "INFO: label_dic:", label_dic
    else:
        # No stored names: name every distinct label after its integer value.
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: generated label_dic:", label_dic

    # Label names ordered by their key; only printed below.
    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])
    print "INFO: labels_list=", labels_list

    #Adjusted Mutual Information between two clusterings
    amis = adjusted_mutual_info_score(labels_list_all, labels_kmeans)
    print "INFO: Adjusted_mutual_info_score=", amis
    #Similarity measure between two clusterings
    ars = adjusted_rand_score(labels_list_all, labels_kmeans)
    print "INFO: Adjusted_rand_score=", ars

    ###################################################
    #######plot histogram                       ####
    ###################################################
    # Subplot grid: ceil(sqrt(n_clusters)) columns and as many rows as needed
    # to hold one panel per cluster.
    plot_col_num = int(math.ceil(math.sqrt(n_clusters)))
    figsize = (4 * plot_col_num,
               3 * int(math.ceil(n_clusters * 1.0 / plot_col_num)))
    print "INFO: labels_list_all.shape=", labels_list_all.shape, "labels_kmeans.shape=", labels_kmeans.shape
    print "INFO: labels_list_all t=", type(
        labels_list_all), "labels_kmeans t=", type(labels_kmeans)
    print "INFO: n_clusters=", n_clusters, ",label_dic=", label_dic
    print "INFO: plot_col_num=", plot_col_num, ",figsize=", figsize, ",local_out_dir=", local_out_dir

    # kmeans histogram
    # NOTE(review): p_true, p_true_norm and p_cluster are not read again in
    # this function.
    _, p_true = ml_plot_kmeans_histogram_subfigures(labels_list_all,
                                                    labels_kmeans,
                                                    n_clusters,
                                                    names=label_dic,
                                                    plot_col_num=plot_col_num,
                                                    figsize=figsize,
                                                    folder=local_out_dir,
                                                    rid=row_id_str)
    # normalized kmeans histogram
    _, p_true_norm = ml_plot_kmeans_histogram_subfigures(
        labels_list_all,
        labels_kmeans,
        n_clusters,
        names=label_dic,
        plot_col_num=plot_col_num,
        figsize=figsize,
        normalize=True,
        folder=local_out_dir,
        rid=row_id_str)

    ####plot "reverse" histogram with labels ####
    #num_bars = len(np.unique(labels_list_all))
    # Bar count is derived from the largest label value, not from the number
    # of distinct labels.
    num_bars = max(labels_list_all) + 1
    figsize = (4 * plot_col_num,
               3 * int(math.ceil(num_bars * 1.0 / plot_col_num)))

    _, p_cluster = ml_plot_kmeans_histogram_subfigures(
        labels_kmeans,
        labels_list_all,
        num_bars,
        names=label_dic,
        plot_col_num=plot_col_num,
        figsize=figsize,
        reverse=True,
        folder=local_out_dir,
        rid=row_id_str)

    #### plot dot figures ####
    #mtx_label = model.labels_
    mtx_center = model.cluster_centers_
    # dot plot for Kmeans   ===========
    filename = os.path.join(local_out_dir, row_id_str + '_cluster.png')
    filename_3d = os.path.join(local_out_dir, row_id_str + '_cluster_3d.json')
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced,
                                       labels_kmeans,
                                       mtx_center,
                                       n_clusters,
                                       figsize=(10, 7),
                                       filename=filename,
                                       title='KMeans',
                                       filename_3d=filename_3d)

    # dot plot for True Labels  ===========
    filename = os.path.join(local_out_dir, row_id_str + '_cluster_tl.png')
    filename_3d = os.path.join(local_out_dir,
                               row_id_str + '_cluster_3d_tl.json')
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced,
                                       true_label_array,
                                       mtx_center,
                                       n_clusters,
                                       figsize=(10, 7),
                                       filename=filename,
                                       title='True Labels',
                                       filename_3d=filename_3d)

    dataset_info = {
        "training_fraction": 1,
        "class_count": n_clusters,
        "dataset_count": total_sample_count
    }
    # only update db for web request   ===========
    if fromweb == "1":
        #print "database update"
        # NOTE(review): the statement is assembled by string concatenation
        # (row_id_str and the JSON are not escaped or parameterised), and
        # the accuracy column is set to an empty string.
        str_sql="UPDATE atdml_document set accuracy = '" \
            +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \
            +"', total_feature_numb='"+str(feature_count) \
            +"', perf_measures='{}" \
            +"', dataset_info='"+json.dumps(dataset_info) \
            +"' where id="+row_id_str
        ret = exec_sqlite.exec_sql(str_sql)
        print "INFO: Data update done! ret=", str(ret)
    else:
        # NOTE(review): `accuracy` is never assigned in this function; unless
        # it exists at module level this line raises NameError.
        print "INFO: accuracy = '" + str(accuracy * 100) + "%"

    t1 = time()
    print 'INFO: running time: %f' % (t1 - t0)

    #print 'Finished!'
    return 0
Example #44
0
# Synthetic 2-D dataset: 500 samples drawn around 4 centers.
X, y = make_blobs(n_samples=500,
                  n_features=2,
                  centers=4,
                  cluster_std=1,
                  center_box=(-10.0, 10.0),
                  shuffle=True,
                  random_state=1)
plot_data(X, y)

# K-Means is asked for 5 clusters although the data was generated with 4 centers.
kmeans_model = cluster.KMeans(n_clusters=5, random_state=1)
kmeans_model.fit(X)
# The two bare attribute accesses below have no effect when run as a script
# (they only display a value in an interactive session).
kmeans_model.cluster_centers_
kmeans_model.labels_

#metrics when target labels are not known
silhouette_avg = metrics.silhouette_score(X,
                                          kmeans_model.labels_,
                                          metric='euclidean')
print(silhouette_avg)
silhouette_samples = metrics.silhouette_samples(X,
                                                kmeans_model.labels_,
                                                metric='euclidean')
print(silhouette_samples)
# NOTE(review): newer scikit-learn releases spell this function
# calinski_harabasz_score — confirm against the installed version.
ch_score = metrics.calinski_harabaz_score(X, kmeans_model.labels_)
print(ch_score)

#metrics when target labels are known
# NOTE(review): df is not used in the lines that follow.
df = pd.DataFrame({'GT': y, 'Pred': kmeans_model.labels_})
print(metrics.adjusted_rand_score(y, kmeans_model.labels_))
print(metrics.adjusted_mutual_info_score(y, kmeans_model.labels_))
Example #45
0
             label='mini-batch k-means')

    C_mbkm_wr = mini_batch_kmeans(X, C_init, b=b, t=t, replacement=True)
    plt.plot(C_mbkm_wr[:, 0],
             C_mbkm_wr[:, 1],
             'mo',
             markersize=10,
             label='mini-batch k-means w/o rep.')

    # from sklearn.cluster import MiniBatchKMeans
    # mbkm_skl = MiniBatchKMeans(n_clusters=k, max_iter=1, max_no_improvement=None, tol=0.0, batch_size=b, init=C_init, compute_labels=False)
    # mbkm_skl.fit(X)
    # C_mbkm_skl = mbkm_skl.cluster_centers_
    # plt.plot(C_mbkm_skl[:,0], C_mbkm_skl[:,1], 'co', markersize=10, label='mini-batch k-means SKL')

    plt.legend(numpoints=1, loc='lower right')

    labels_init = compute_labels(X, C_init)
    labels_kmeans = compute_labels(X, C_kmeans)
    labels_mbkm = compute_labels(X, C_mbkm)
    labels_mbkm_wr = compute_labels(X, C_mbkm_wr)
    print("Adjusted rand scores:")
    print("labels_kmeans, labels_init =",
          adjusted_rand_score(labels_kmeans, labels_init))
    print("labels_kmeans, labels_mbkm =",
          adjusted_rand_score(labels_kmeans, labels_mbkm))
    print("labels_kmeans, labels_mbkm_wr =",
          adjusted_rand_score(labels_kmeans, labels_mbkm_wr))

    plt.show()
Example #46
0
                                                 linkage='complete').fit(data)
            labels = clustering.labels_

            #Kmeans (Euclidean)
            # clustering = KMeans(n_clusters=k, random_state=0).fit(data)
            #labels = clustering.labels_

            #Fuzzy c-means
            #cntr, u, u0, dat, jm, p, fpc  = cmeans(np.transpose(data), c=k, m=2, error = 0.005, maxiter=100, init=None, seed=None)
            #labels = np.argmax(u, axis=0)

            #Kmedoids
            #labels, sse_all, j, closest_observations_prev = Kmedoids(timeseries, k, max_iter, window_size)

            MI = adjusted_mutual_info_score(classes, labels)
            RS = adjusted_rand_score(classes, labels)
            HS = metrics.homogeneity_score(classes, labels)
            CS = metrics.completeness_score(classes, labels)
            FMS = metrics.fowlkes_mallows_score(classes, labels)
            RI = rand_index_score(classes, labels)

            counter_data_set = counter_data_set + 1

            output_row['dataset'] = d
            output_row['adjusted Mutual information'] = MI
            output_row['adjusted Rand index'] = RS
            output_row['Homogeneity'] = HS
            output_row['Completeness'] = CS
            output_row['Fowlkes Mallows'] = FMS
            output_row['Rand index'] = RI
Example #47
0
def gen_nets_comm(ks,
                  data_path=None,
                  data_file_name='data.npy',
                  target_file_name='target.npy',
                  metric='euclidean',
                  community_method='community_multilevel',
                  n_jobs=-1):
    """Build one similarity network per ``k`` and score its communities.

    For every ``k`` the pairwise similarity matrix ``1 / (1 + distance)`` is
    sparsified so that each row keeps only its ``k + 1`` largest entries,
    turned into an undirected weighted graph, and handed to
    ``detect_community``. The detected labels are compared against the
    target labels with the adjusted Rand index and normalized mutual
    information.

    Files written below ``data_path``:

    * ``nets/net_<metric>_k_<k>.xnet`` -- the graph, via ``to_xnet``.
    * ``nets/net_<metric>_k_<k>_labels_comm.txt`` -- predicted labels.
    * ``results/<metric>.csv`` -- NMI / ARI / modularity, one row per ``k``.
    * ``results/<metric>.pdf`` -- line plot of the three scores against ``k``.

    Args:
        ks: Iterable of neighbourhood sizes. They are processed in ascending
            order so that result rows line up with the plot index.
        data_path: Directory holding the input arrays; also the output root.
        data_file_name: ``.npy`` file with the samples.
        target_file_name: ``.npy`` file with the ground-truth labels.
        metric: Distance metric name forwarded to ``pairwise_distances``.
        community_method: Method name forwarded to ``detect_community``.
        n_jobs: Parallelism forwarded to ``pairwise_distances``.

    Raises:
        ValueError: If ``data_path`` is empty or ``None``.
    """
    if not data_path:
        raise ValueError('data_path not specified.')
    path = Path(data_path)

    # Results are appended in iteration order and later indexed by k, so the
    # iteration itself must be ascending. Materialising also allows ``ks`` to
    # be a one-shot iterable (it is consumed again by ``max`` below).
    ks = sorted(ks)

    x = np.load(path / data_file_name)
    y = np.load(path / target_file_name)

    similarity_matrix = 1 / (
        1 + pairwise_distances(x, metric=metric, n_jobs=n_jobs))

    # The output directory does not depend on k; create it once.
    path_save = path / 'nets'
    path_save.mkdir(parents=True, exist_ok=True)

    result_modularity = []
    result_ari = []
    result_nmi = []
    connec_point = None
    for k in ks:
        M = similarity_matrix.copy()
        # Keep k + 1 entries per row (the extra one accounts for the
        # self-similarity on the diagonal, which becomes a dropped loop).
        # Clamp at 0: a negative count would make the slice below zero out
        # entries that should be kept when k + 1 exceeds the sample count.
        to_remove = max(M.shape[0] - (k + 1), 0)
        for vec in M:
            vec[vec.argsort()[:to_remove]] = 0

        g = Graph.Weighted_Adjacency(M.tolist(),
                                     mode=ADJ_UNDIRECTED,
                                     loops=False)
        g.vs['name'] = y

        # Remember the first (smallest) k at which the network is connected.
        if connec_point is None and g.is_connected():
            connec_point = k

        y_pred, modularity = detect_community(g, community_method)

        net_name = 'net_%s_k_%i.xnet' % (metric, k)
        labels_name = 'net_%s_k_%i_labels_comm.txt' % (metric, k)

        to_xnet(g, path_save / net_name, names=True)
        np.savetxt(path_save / labels_name, y_pred, fmt='%s')

        result_modularity.append(modularity)
        result_ari.append(metrics.adjusted_rand_score(y, y_pred))
        result_nmi.append(metrics.normalized_mutual_info_score(y, y_pred))

    path_results = path / 'results'
    path_results.mkdir(parents=True, exist_ok=True)

    df = pd.DataFrame({
        'NMI': result_nmi,
        'ARI': result_ari,
        'Modularity': result_modularity
    })
    # The CSV is written before re-indexing, so it keeps the positional
    # 0..n-1 index; only the plot uses k as its index.
    df.to_csv(path_results / ('%s.csv' % metric))
    df.index = ks
    plot = df.plot(xticks=[1] + list(range(0,
                                           max(ks) + 1, 5))[1:],
                   ylim=(0, 1),
                   use_index=True)
    plot.set_xlabel('k')

    # Only annotate the connectivity threshold when one was actually found;
    # otherwise ``connec_point + 0.01`` would fail on None.
    if connec_point is not None:
        plot.axvline(connec_point, color='k', linestyle='--')
        plot.text(connec_point + 0.01, 0.98, 'connected', rotation=90)

    fig = plot.get_figure()
    fig.savefig(path_results / ('%s.pdf' % metric))