Ejemplo n.º 1
0
 def test_confidence_interval(self):
     # Compares half_confidence_interval_size against precomputed reference
     # values for two flat samples; results must agree to 5 decimal places.

     # Eight observations at 95% confidence.
     small_sample = [8.0, 7.0, 5.0, 9.0, 9.5, 11.3, 5.2, 8.5]
     small_half_size = half_confidence_interval_size(small_sample, 0.95)
     self.assertAlmostEqual(1.6772263663789651, small_half_size, places=5)

     # Thirty-two observations at 99% confidence.
     large_sample = [
         8.0, 7.0, 5.0, 9.0, 9.5, 11.3, 5.2, 8.5,
         4.0, 7.4, 4.4, 9.0, 1.1, 0.0, 0.2, 9.5,
         1.0, 2.0, 3.0, 4.0, 5.5, 8.2, 4.2, 4.5,
         7.2, 7.0, 1.2, 5.3, 8.5, 1.3, 5.3, 9.5,
     ]
     large_half_size = half_confidence_interval_size(large_sample, 0.99)
     self.assertAlmostEqual(1.4173919794304153, large_half_size, places=5)
Ejemplo n.º 2
0
    def test_confidence_interval_axis(self):
        # Exercises the `axis` argument of half_confidence_interval_size on a
        # matrix made of two identical rows.
        row = [8.0, 7.0, 5.0, 9.0, 9.5, 11.3, 5.2, 8.5]
        matrix = [list(row), list(row)]

        # axis=1: one value per row; both rows hold the same observations.
        per_row = half_confidence_interval_size(matrix, .95, axis=1)
        assert_array_almost_equal([1.67722628, 1.67722628], per_row)

        # axis=0: one value per column; the two entries of every column are
        # equal, so each interval is expected to collapse to zero.
        per_column = half_confidence_interval_size(matrix, .95, axis=0)
        assert_array_almost_equal([0.] * 8, per_column)

        # No axis given: a single scalar is expected for the whole input.
        overall = half_confidence_interval_size(matrix, .95)
        self.assertAlmostEqual(1.06902922476, overall)
Ejemplo n.º 3
0
def kmeans_betacv(data, num_cluster, batch_kmeans=False, n_runs=10,
                  confidence=0.90):
    '''
    Computes the BetaCV for running K-means on the dataset. This method
    returns the BetaCV value and half of the size of the confidence interval
    for the same value (BetaCV is an average over the number of runs given).

    For each run, the value is the mean distance from every observation to
    its assigned center divided by the mean of the pairwise distance matrix
    between the cluster centers.

    Arguments
    ---------
    data: matrix
        A matrix of observations. If this is sparse, `batch_kmeans` must
        be True
    num_cluster: int
        number of clusters to run k-means for
    batch_kmeans: bool (defaults to False)
        if `sklearn.cluster.MiniBatchKMeans` should be used. This is faster
        and suitable for sparse datasets, but less accurate.
    n_runs: int (default = 10)
        Number of runs to compute the BetaCV
    confidence: double [0, 1) (default = 0.9)
        The confidence used to compute half the confidence interval size

    Returns
    -------
    The betacv and half of the confidence interval size
    '''
    if batch_kmeans:
        algorithm = MiniBatchKMeans(num_cluster)
    else:
        algorithm = KMeans(num_cluster)

    inter_array = np.zeros(n_runs)
    intra_array = np.zeros(n_runs)
    # `range` rather than `xrange`: the latter only exists on Python 2, while
    # `range` works on both major versions.
    for i in range(n_runs):
        # Run K-Means (the same estimator object is refit on every run)
        algorithm.fit(data)

        centers = algorithm.cluster_centers_
        labels = algorithm.labels_

        # Inter distance. KMeans in sklearn uses euclidean.
        # NOTE(review): assuming `pairwise` is sklearn.metrics.pairwise, this
        # is the full center-by-center matrix, so its zero diagonal is part
        # of the mean. Kept as is so previously reported values do not
        # change -- confirm whether that is intended.
        dist_centers = pairwise.euclidean_distances(centers)
        inter_array[i] = np.mean(dist_centers)

        # Intra distance: for every observation pick, from the
        # observation-to-center distances, the one to its assigned center.
        dist_all_centers = algorithm.transform(data)
        assigned_dists = dist_all_centers[np.arange(len(labels)), labels]
        intra_array[i] = np.mean(assigned_dists)

    betacv = intra_array / inter_array
    cinterval = half_confidence_interval_size(betacv, confidence)
    return np.mean(betacv), cinterval
Ejemplo n.º 4
0
def main(tcu_fpath):
    '''
    Clusters the items of a TCU CSV file and plots, for each number of
    clusters, the ratio between inter-cluster and intra-cluster distances.

    Only rows whose 'Situacao' column equals 'Aceito e Habilitado' are used.
    Each row becomes one text document made of its description, complementary
    description, supply unit and quantity. The documents are vectorized and
    clustered with `MiniBatchKMeans` for k = 2..15, 20 times per k. The mean
    ratio per k is shown with error bars for a 90% confidence interval.

    Arguments
    ---------
    tcu_fpath: str
        path of the CSV file, handed to `tcu_io.load_untreated_csv_to_numpy`
    '''
    data = tcu_io.load_untreated_csv_to_numpy(tcu_fpath)
    data = data[data['Situacao'] == 'Aceito e Habilitado']

    desc_column = data['Descricao']
    des_cmp_column = data['DescricaoComplementar']
    unidade_column = data['UnidadeFornecimento']
    qtd_column = [str(qtd) for qtd in data['Quantidade']]

    # Transforms descriptions to base strings, one document per row
    as_docs = [" ".join(as_text)
               for as_text in zip(desc_column, des_cmp_column,
                                  unidade_column, qtd_column)]

    # Vectorizes to TF-IDF
    vectorizer = Vectorizer()
    doc_sparse_matrix = vectorizer.fit_transform(as_docs)

    # Compute clusters. Each K has n_runs clusterings, so the result arrays
    # are allocated once per K instead of on every loop iteration.
    n_runs = 20
    k_vals = range(2, 16)
    inter = {k: np.zeros(n_runs) for k in k_vals}
    intra = {k: np.zeros(n_runs) for k in k_vals}
    # `range` rather than the Python 2 only `xrange`
    for i in range(n_runs):
        for k in k_vals:
            # Run K-Means
            mbkm = MiniBatchKMeans(k, init = 'random')
            mbkm.fit(doc_sparse_matrix)

            centers = mbkm.cluster_centers_
            labels = mbkm.labels_

            # Inter distance. We use min because the idea is to maximize
            # this. Min serves as a penalty for the worst case. Zero entries
            # are filtered out before taking the minimum.
            dist_centers = pairwise.euclidean_distances(centers)
            inter[k][i] = np.min(dist_centers[dist_centers > 0])

            # Intra distance: distance of every document to the center of
            # the cluster it was assigned to.
            dist_all_centers = mbkm.transform(doc_sparse_matrix)
            assigned_dists = dist_all_centers[np.arange(len(labels)), labels]
            intra[k][i] = np.mean(assigned_dists)

            # Prints num elements per cluster
            print('Run %d ; k = %d' %(i, k))
            counter = Counter(labels)
            for cluster, population in counter.items():
                print('\tK = %d; Pop = %d' %(cluster, population))
            print()

    # A sorted list keeps the points in increasing k and, unlike a Python 3
    # `dict.keys()` view, can be concatenated with another list below.
    x = sorted(inter)
    y = []
    c = []
    for k in x:
        div = inter[k] / intra[k]
        y.append(np.mean(div))
        c.append(half_confidence_interval_size(div, 0.90))

    # hack for the zero to appear
    x = [0] + x
    y = [0] + y
    c = [0] + c

    ax = plt.gca()
    ax.set_yscale('log')
    ax.set_xticks(range(0, 16))
    plt.ylabel('InterCluster/IntraCluster Ratio')
    plt.xlabel('Number of clusters')
    plt.errorbar(x, y, yerr=c, fmt='bo', markersize=8, elinewidth=2)
    plt.show()