Example #1
def main():
    if not os.path.exists('data'):
        raise UserWarning(
            'Could not find ./data/ directory. Have you created a '
            'directory for data and collected data yet?')
    classes = sys.argv[1:]

    train_paths = sorted(
        ['data/{}_train.json'.format(name) for name in classes])
    test_paths = sorted(['data/{}_test.json'.format(name) for name in classes])
    X_train, Y_train, ordering = create_dataset(train_paths)
    X_test, Y_test, _ = create_dataset(test_paths, ordering=ordering)

    Y_train_oh = np.eye(len(classes))[Y_train]
    kmeans = KMeans(n_clusters=len(classes)).fit(X_train)
    mapping = majority_vote(kmeans.predict(X_train), Y_train)
    params = kmeans.get_params()
    train_accuracy = evaluate(Y_train, predict(kmeans, mapping, X_train))
    validation_accuracy = evaluate(Y_test, predict(kmeans, mapping, X_test))

    print('Train accuracy ({}%), Validation accuracy ({}%)'.format(
        train_accuracy * 100, validation_accuracy * 100))
    np.save('kmeans_ordering.npy', np.array(ordering))

    joblib.dump(kmeans, 'kmeans_model.pkl')
    with open('kmeans_mapping.pkl', 'wb') as f:
        pickle.dump(mapping, f)
    sys.stdout.flush()
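The snippet above relies on helpers (create_dataset, majority_vote, predict, evaluate) that are not shown. A minimal, hypothetical sketch of the voting helpers, assuming Y_train holds integer class indices: majority_vote assigns each cluster the most frequent true label among its members, and predict routes raw cluster ids through that mapping.

import numpy as np

def majority_vote(cluster_ids, labels):
    # hypothetical helper: map each cluster id to the most common ground-truth label
    labels = np.asarray(labels)
    return {c: int(np.bincount(labels[cluster_ids == c]).argmax())
            for c in np.unique(cluster_ids)}

def predict(kmeans, mapping, X):
    # hypothetical helper: translate raw cluster ids into class labels
    return np.array([mapping[c] for c in kmeans.predict(X)])

def evaluate(y_true, y_pred):
    # hypothetical helper: plain accuracy
    return float(np.mean(np.asarray(y_true) == np.asarray(y_pred)))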
Example #2
class KMeansCluster(Intent):
    def __init__(self, n_clusters: int) -> None:
        super().__init__()
        self.kmeans = KMeans(n_clusters, random_state=0)

    def to_string(self) -> str:
        return 'Cluster:KMeans'

    def compute(self, df: pd.DataFrame) -> pd.DataFrame:
        nan_dropped = df.dropna()
        self.kmeans.fit(nan_dropped)

        labels = pd.DataFrame(data=self.kmeans.labels_,
                              index=nan_dropped.index).applymap(str)

        inc_nan = labels.reindex(index=df.index, fill_value='NaN')
        values = inc_nan.iloc[:, 0].unique()
        result = pd.concat(
            map(
                lambda v: pd.DataFrame(  # type: ignore
                    data=(inc_nan.iloc[:, 0] == v).astype('int').values,
                    columns=[self.to_string() + ":" + v],
                    index=df.index,
                    dtype=int),
                values),
            axis='columns')
        return result

    def info(self) -> Optional[Dict[str, Any]]:
        return {"params": self.kmeans.get_params()}
Example #3
def k_means(training_vectors,
            test_vectors_clean,
            test_vectors_anomalous,
            k=10):
    kmeans = KMeans(n_clusters=k,
                    init='k-means++',
                    n_init=100,
                    max_iter=300,
                    tol=1e-04,
                    random_state=0)

    print("Fitting with Parameters: ", kmeans.get_params())
    labels = kmeans.fit_predict(training_vectors)
    centroids = kmeans.cluster_centers_

    clusters = __sort_to_cluster(k, labels, training_vectors)
    clusters_radii = __get_radius_for_clusters(k, clusters, centroids)

    print("Training done! Switch to testing.")
    print("**************************")
    print("Start prediction...")

    result_clean = __predict_outliers(k, clusters_radii, centroids,
                                      test_vectors_clean)
    result_anomalous = __predict_outliers(k, clusters_radii, centroids,
                                          test_vectors_anomalous)

    print("Predicting successful!")
    print("**************************")

    return np.asarray(result_clean), np.asarray(result_anomalous), np.asarray(
        [])
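The double-underscore helpers used above are not part of the snippet. A hypothetical sketch of what they presumably do: group training vectors by cluster, take each cluster's largest centroid distance as its radius, and flag a test vector as an outlier when it falls outside the radius of its nearest centroid.

import numpy as np

def __sort_to_cluster(k, labels, vectors):
    # group training vectors by their assigned cluster id
    vectors = np.asarray(vectors)
    return [vectors[labels == i] for i in range(k)]

def __get_radius_for_clusters(k, clusters, centroids):
    # radius of a cluster = largest distance from a member to its centroid
    return [np.max(np.linalg.norm(clusters[i] - centroids[i], axis=1))
            if len(clusters[i]) else 0.0
            for i in range(k)]

def __predict_outliers(k, radii, centroids, vectors):
    # 1 = inside the nearest cluster's radius, 0 = outlier
    result = []
    for v in np.asarray(vectors):
        dists = np.linalg.norm(centroids - v, axis=1)
        nearest = int(np.argmin(dists))
        result.append(1 if dists[nearest] <= radii[nearest] else 0)
    return result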
Example #4
def train_model(data):
    model = KMeans(init='k-means++', n_clusters=4, n_init=10)
    model.fit(data)
    pred_y = model.predict(data)
    centers = model.cluster_centers_
    print(model.get_params())
    l, r = min(data[:, 0]), max(data[:, 0])
    b, t = min(data[:, 1]), max(data[:, 1])
    grid_x = np.meshgrid(np.linspace(l, r, 500),
                         np.linspace(b, t, 500))
    flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
    flat_y = model.predict(flat_x)
    grid_y = flat_y.reshape(grid_x[0].shape)
    cm_light = mp.colors.ListedColormap(['orangered', 'deepskyblue', 'limegreen', 'whitesmoke'])
    cm_dark = mp.colors.ListedColormap(['whitesmoke', 'limegreen', 'deepskyblue',  'orangered'])
    plt.figure()
    plt.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap=cm_light)
    plt.scatter(data[:, 0], data[:, 1], c=pred_y, cmap=cm_dark)
    plt.scatter(centers[:, 0], centers[:, 1], c='b', marker='+', s=300)
    plt.text(-1, 0, r'{}'.format(np.round(centers[0], 3)), fontsize=12, color='k')
    plt.text(-1, 5, r'{}'.format(np.round(centers[1], 3)), fontsize=12, color='k')
    plt.text(3, 0, r'{}'.format(np.round(centers[3], 3)), fontsize=12, color='k')
    plt.text(3, 5, r'{}'.format(np.round(centers[2], 3)), fontsize=12, color='k')
    plt.savefig(config.APP_IMAGES_TXT + 'k_means.png')
    plt.show()
Example #5
def iter_kmeans(df, n_clusters, num_iters=5):
    rng = range(1, num_iters + 1)
    vals = pd.Series(index=rng)
    for i in rng:
        k = KMeans(n_clusters=n_clusters, n_init=3)
        k.fit(df)
        print "Ref k: %s" % k.get_params()['n_clusters']
        vals[i] = k.inertia_
    return vals
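A short usage sketch (df is assumed to be a numeric feature DataFrame): the per-run inertia values returned above are typically plotted, e.g. when comparing runs for an elbow or gap-statistic style analysis.

inertia_per_run = iter_kmeans(df, n_clusters=5)
inertia_per_run.plot(marker='o', title='KMeans inertia per run')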
Example #7
    def perform_kmeans(self, no_clusters):
        kmeans_clusterer = KMeans(n_clusters=no_clusters)
        kmeans_clusterer.fit(self.data)
        self.kmeans_results = {
            "parameters": kmeans_clusterer.get_params(),
            "labels": kmeans_clusterer.labels_,
            "n_clusters": no_clusters,
            'clusters': label_cnt_dict(kmeans_clusterer.labels_),
            "cluster_centers": kmeans_clusterer.cluster_centers_,
            "inertia": kmeans_clusterer.inertia_
        }

        print_dict(self.kmeans_results)
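label_cnt_dict and print_dict are not defined in the snippet; a minimal sketch of what they presumably do (count samples per cluster label and pretty-print the results dictionary):

from collections import Counter

def label_cnt_dict(labels):
    # hypothetical helper: number of samples assigned to each cluster
    return dict(Counter(labels))

def print_dict(d):
    # hypothetical helper: one key/value pair per line
    for key, value in d.items():
        print("{}: {}".format(key, value))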
Example #8
 def applyAlgo(self):
     if (self.Listbox1.curselection()[0] == 0):
         n_clusters=int(simpledialog.askinteger("Info", "How many clusters?"))
         n_init=int(simpledialog.askinteger("Info", "How many times the k-means algorithm should run with different centroid seeds?"))
         max_iter=int(simpledialog.askinteger("Info", "What should be the maximum number of iterations of the k-means algorithm for a single run?"))
         model = KMeans(n_clusters=n_clusters, n_init=n_init, max_iter=max_iter, random_state=1).fit(self.input_col)
     elif (self.Listbox1.curselection()[0] == 1):
         min_samples=int(simpledialog.askinteger("Info", "How many samples (or total weight) in a neighborhood for a point to be considered as a core point?"))
         algorithm=simpledialog.askstring("Info", "Specify the algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors. It must be one of ‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’")
         model = DBSCAN(algorithm=algorithm, min_samples=min_samples).fit(self.input_col)
     
     labels = model.labels_
     self.data["PredictedCluster"] = pandas.DataFrame(labels, columns = ["PredictedCluster"])
     params = model.get_params()
     algoDetails=''
     for key, value in params.items():
         algoDetails = algoDetails+'\n'+key+' : '+str(value)
     self.Label5.configure(text=algoDetails)
     self.Label5.text = algoDetails
     
     algoResults=''
     if(self.groundTruth):
         adjusted_rand_score=str(metrics.adjusted_rand_score(self.labels_true, labels))
         algoResults = algoResults+'\n'+'adjusted_rand_score'+' : '+adjusted_rand_score
         
         adjusted_mutual_info_score=str(metrics.adjusted_mutual_info_score(self.labels_true, labels))
         algoResults = algoResults+'\n'+'adjusted_mutual_info_score'+' : '+adjusted_mutual_info_score
         mutual_info_score=str(metrics.mutual_info_score(self.labels_true, labels)) 
         algoResults = algoResults+'\n'+'mutual_info_score'+' : '+mutual_info_score
         
         homogeneity_score=str(metrics.homogeneity_score(self.labels_true, labels))
         algoResults = algoResults+'\n'+'homogeneity_score'+' : '+homogeneity_score
         
         completeness_score=str(metrics.completeness_score(self.labels_true, labels))
         algoResults = algoResults+'\n'+'completeness_score'+' : '+completeness_score
         
         v_measure_score=str(metrics.v_measure_score(self.labels_true, labels))
         algoResults = algoResults+'\n'+'v_measure_score'+' : '+v_measure_score
         
         fowlkes_mallows_score=str(metrics.fowlkes_mallows_score(self.labels_true, labels))
         algoResults = algoResults+'\n'+'fowlkes_mallows_score'+' : '+fowlkes_mallows_score
         
     silhouette_score=str(metrics.silhouette_score(self.input_col, labels, metric='euclidean'))
     algoResults = algoResults+'\n'+'silhouette_score'+' : '+silhouette_score
     
     calinski_harabaz_score=str(metrics.calinski_harabaz_score(self.input_col, labels))
     algoResults = algoResults+'\n'+'calinski_harabaz_score'+' : '+calinski_harabaz_score
     
     self.Label6.configure(text=algoResults)
     self.Label6.text = algoResults
Example #9
 def begin(self, inarray):
     inarray = self.make2d(inarray)
     inarray = normalize(inarray)
     inarray = self.make2d(inarray)
     for i in range(2, 12):
         kd = KMeans(n_clusters=i)
         temparray = inarray
         result = kd.fit_transform(temparray)
         params = kd.get_params()
         print('\n\n\n======================================\n\n\n')
         print("variance = {0}".format(result.var()))
         print(params)
         fIO.FileIO().saveWork((result, params, kd),
                               'kmeansfit_fulldata{}'.format(i), 2)
     input("press any key to exit...")
Example #10
    def perform_kmeans(self, no_clusters, params={'n_jobs': -1}):
        #start_time = time()
        kmeans_clusterer = KMeans(n_clusters=no_clusters, **params)
        kmeans_clusterer.fit(self.data)
        #print("-- %s seconds --"%(time()-start_time))

        self.kmeans_results = {
            "parameters": kmeans_clusterer.get_params(),
            "labels": kmeans_clusterer.labels_,
            "n_clusters": no_clusters,
            'clusters': label_cnt_dict(kmeans_clusterer.labels_),
            "cluster_centers": kmeans_clusterer.cluster_centers_,
            "inertia": kmeans_clusterer.inertia_
        }

        print_dict(self.kmeans_results)
Example #11
def performKmeansClustering(logger, model, n_init, num_clusters, word_vectors):
    # clustering
    logger.info("performing K means cluster predictions")
    logger.info("Creating: %s clusters.", num_clusters)
    start = time.time()
    kmeans_clustering = KMeans(n_clusters=num_clusters,
                               n_jobs=-1,
                               n_init=n_init,
                               precompute_distances=True)
    centroids = kmeans_clustering.fit_predict(word_vectors)
    word_centroid_map = dict(zip(model.wv.index2word, centroids))
    end = time.time()
    elapsed = end - start
    logger.info("Time taken for K Means clustering: %s seconds.", elapsed)
    logger.info("KMeans parameters: %s", kmeans_clustering.get_params())
    return word_centroid_map
Example #12
 def run_categorization(self, clusters_amount, vectors, true_labels):
     kmeans = KMeans(n_clusters=clusters_amount,
                     init='k-means++',
                     n_init=10,
                     max_iter=50,
                     tol=0.0001,
                     precompute_distances='auto',
                     verbose=0,
                     random_state=None,
                     copy_x=True,
                     n_jobs=1,
                     algorithm='auto')
     predicted_labels = kmeans.fit_predict(vectors, true_labels)
     centroids = kmeans.cluster_centers_
     inertia = kmeans.inertia_
     params = str(kmeans.get_params())
     return predicted_labels, true_labels, centroids, inertia, params
Example #13
 def clusterTickets(self, dataset):
     ''' Clustering using KMeans
     
         Parameters
         ----------
         dataset : pandas dataframe
      
         Returns
         -------
         y_pred : prediction labels for a given training set.
         y_true : Dependent feature representing original class / cluster of each row.
     '''
     vectorizer = Vectorizer()
     X, y_true = vectorizer.vectorize(dataset)
     kmeans = KMeans(n_clusters=5,
                     init='k-means++',
                     n_init=100,
                     max_iter=1000)
     y_predict = kmeans.fit_predict(X)
     print(kmeans.get_params())
     return y_predict, y_true
Example #14
import numpy as np
from util import get_x_y_data
from sklearn.cluster import KMeans

TEST_DATA_ROWS = 20

# class sklearn.cluster.KMeans
# (n_clusters=8, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=1)

x_data, y_data, zone_cnt, zone_int_dict = get_x_y_data()
# no duplicate value, so reverse this dictionary
int_zone_dict = dict(zip(zone_int_dict.values(), zone_int_dict.keys()))

kmeans = KMeans(n_clusters=zone_cnt)  # a, b, c, d, e -> 5 centers
kmeans.fit(x_data)
print kmeans.get_params()
# centers
print kmeans.cluster_centers_

# cluster label for every sample
print kmeans.labels_

# the smaller inertia is, the better the classifier works
print kmeans.inertia_

indices = np.random.permutation(len(x_data))
x_test = x_data[indices[-TEST_DATA_ROWS:]]
x_distance = kmeans.transform(x_test)
test_result = kmeans.predict(x_test)  # test

for type, dis in zip(test_result, x_distance):
Example #15
import gc
n = 100000
#xx,yy,zz=x[:n],y[:n],z[:n]
xx, yy, zz = x, y, z
#kmeans = KMeans(n_clusters=97,n_jobs=-1,max_iter=3000).fit(xx)
kmeans = KMeans(n_clusters=25, n_jobs=-1).fit(xx)
print(kmeans.score(xx, zz))  #-19.0829318168
#Why does score come out negative?? Is this being treated as regression? Why use it for classification?
#Answer: each value in x has the per-dimension mean of its assigned cluster subtracted, the differences are squared and summed up (and negated). The y in score(x, y) is not used at all.
fenlei1 = kmeans.predict(xx)
fenlei1 = pd.Series(fenlei1)
way4 = 'C:/Users/Administrator/Desktop/ali/data/3_tempt/fenlei1.csv'
fenlei1.to_csv(way4)
#fenlei1=pd.read_csv(way4)
way5 = 'C:/Users/Administrator/Desktop/ali/data/3_tempt/params1.csv'
pd.Series(kmeans.get_params()).to_csv(way5)

from sklearn.svm import SVC
for i in range(25):
    print(i)
    xuanze = fenlei1 == i
    svc = SVC(C=1, cache_size=1000, decision_function_shape='ovo')
    svc.fit(xx[xuanze], yy[xuanze])
    print('++++++++++++++++')
    print(svc.score(xx[xuanze], yy[xuanze]))
    way_tempt = 'C:/Users/Administrator/Desktop/ali/data/3_tempt/params2' + str(
        i) + '.csv'
    params_tempt = pd.Series(svc.get_params())
    params_tempt.to_csv(way_tempt)
    print('--------------------------')
    gc.collect()
import pandas as pd

a = pd.DataFrame(np.arange(9).reshape(3, 3))
#b=pd.Series(list('xyz'))
b = pd.Series(np.arange(3))

cv = CalibratedClassifierCV()
cv.fit(a, b)
print(cv.get_params())
print(cv.predict(a))
print(cv.score(a, b))

#---------
# Unsupervised learning
# Clustering can be used for n classes: the data is split into 'n_clusters' groups
# Even a classifier that only separates two classes can be turned into a multi-class one with sklearn.multioutput.MultiOutputClassifier
from sklearn.cluster import KMeans
import numpy as np

X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0], [7, 2], [7, 4],
              [7, 0]])
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
print(kmeans.labels_)  #[2 2 2 1 1 1 0 0 0]
print(kmeans.predict([[0, 0], [4, 4], [7, 2]]))  #[2 1 0]
print(kmeans.cluster_centers_)  #[[ 7.  2.],[ 4.  2.],[ 1.  2.]]
print(kmeans.get_params())
#{'n_jobs': 1, 'algorithm': 'auto', 'n_clusters': 3, 'max_iter': 300,
# 'init': 'k-means++', 'random_state': 0, 'n_init': 10, 'tol': 0.0001,
# 'precompute_distances': 'auto', 'copy_x': True, 'verbose': 0}
#--------------------------
class KMEANS(object):
    def __init__(self,
                 n_clusters=8,
                 init='k-means++',
                 n_init=10,
                 max_iter=300,
                 tol=1e-4,
                 precompute_distances='deprecated',
                 verbose=0,
                 random_state=None,
                 copy_x=True,
                 n_jobs='deprecated',
                 algorithm='auto'):
        """
        Parameters
        ----------
        n_clusters : TYPE, optional   簇数
            DESCRIPTION. The default is 8.
        init : TYPE, optional
        {"k-means++","random", ndarray}
            DESCRIPTION. The default is 'k-means++'.
        n_init : TYPE, optional
        kmeans 运行的次数
            DESCRIPTION. The default is 10.
        max_iter : TYPE, optional
        单次运行的k均值算法的最大迭代次数
            DESCRIPTION. The default is 300.

        tol : TYPE, optional
        该范数表示两个连续迭代的聚类中心的差异,用于声明收敛
            DESCRIPTION. The default is 1e-4.
        
        precompute_distances : TYPE, optional
        {"auto", True, False}
        预计算距离, "auto": 如果n_samples*n_clusters>1200w,则不预计算距离,使用双精度
            DESCRIPTION. The default is 'deprecated'.
        verbose : TYPE, optional
            DESCRIPTION. The default is 0.
        random_state : TYPE, optional
            DESCRIPTION. The default is None.
        copy_x : TYPE, optional
            DESCRIPTION. The default is True.
        n_jobs : TYPE, optional
            DESCRIPTION. The default is 'deprecated'.
        algorithm : TYPE, optional
        {"auto","full","elkan"}   
            DESCRIPTION. The default is 'auto'.

        Returns
        -------
        None.

        """
        self.kmeans = KMeans(n_clusters=n_clusters,
                             init=init,
                             n_init=n_init,
                             max_iter=max_iter,
                             tol=tol,
                             precompute_distances=precompute_distances,
                             verbose=verbose,
                             random_state=random_state,
                             copy_x=copy_x,
                             n_jobs=n_jobs,
                             algorithm=algorithm)

    def fit(self, x, y=None, sample_weight=None):
        self.kmeans.fit(X=x, y=y, sample_weight=sample_weight)

    def fit_transform(self, x, y=None, sample_weight=None):
        return self.kmeans.fit_transform(X=x, y=y, sample_weight=sample_weight)

    def transform(self, x):
        return self.kmeans.transform(X=x)

    def fit_predict(self, x, y=None, sample_weight=None):
        return self.kmeans.fit_predict(X=x, y=y, sample_weight=sample_weight)

    def get_params(self, deep=True):
        return self.kmeans.get_params(deep=deep)

    def predict(self, x, sample_weight=None):
        return self.kmeans.predict(X=x, sample_weight=sample_weight)

    def set_params(self, params):
        self.kmeans.set_params(**params)

    def score(self, x, y=None, sample_weight=None):
        return self.kmeans.score(X=x, y=y, sample_weight=sample_weight)

    def get_cluster_centers(self):
        return self.kmeans.cluster_centers_

    def get_labels(self):
        return self.kmeans.labels_

    def get_inertial(self):  # sum of squared distances of samples to their closest cluster center
        return self.kmeans.inertia_

    def get_n_iter(self):  # number of iterations run
        return self.kmeans.n_iter_
for idx in range(0,max_records):
	print "processing dog."+str(idx)+".jpg\n"
	img = cv2.imread('train/train/dog.'+str(idx)+'.jpg')
	gray= cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
	kp = sift.detect(gray,None)
	tkp, td = sift.compute(gray, kp)
	temp_points = []
	for k in tkp:
		tuples = (int(math.ceil(k.pt[0])),int(math.ceil(k.pt[1])))
		points.append(tuples)
		temp_points.append(tuples)
	dog_features[idx] = temp_points

kmeans = KMeans()
kmeans = kmeans.fit(points)
params = kmeans.get_params()
n_clusters = params["n_clusters"]

overall_feats = []
count = 1
for cats in cat_features:
	print "Record-->"+str(count)
	clusters = kmeans.predict(cat_features[cats])
	print clusters
	feats = []
	for i in range(0,n_clusters):
		feats.append(0)
	feats.append(0)
	for num in clusters:
		feats[num] = feats[num]+1
	overall_feats.append(feats)
Example #19
def nab_and_format_bispec(fn, clfdict, expected_struct, clffull, get_cats=False, need_PCA=False, full_clf=True):
    df = pd.read_csv(fn, skiprows=8, delim_whitespace=True, low_memory=False)
    # get rid of the bogus first columns
    cols = df.columns[2::]
    df.drop(df.columns[-2::], 1, inplace=True)
    df.columns = cols

    tdata = df

    # Need the stats for each bispec component
    print "\nre-normalizing data..."
    for x in tdata.columns[5::]:
        m = pickle.load(open("./data_stats/" + str(x) + "_m.p", "rb"))
        s = pickle.load(open("./data_stats/" + str(x) + "_s.p", "rb"))
        tdata[x] = tdata[x].map(norm(s, m))
    if need_PCA == True:
        # now we transform it using the previously trained PCA
        print "\nloading in PCA..."
        pca = pickle.load(open("pca.p", "rb"))
        trans_values = pca.transform(tdata[tdata.columns[5::]].values)
    else:
        trans_values = tdata[tdata.columns[5::]].values

    if get_cats == True:
        KM = KMeans(n_clusters=2)
        print "\n Separating into " + str(KM.get_params()["n_clusters"]) + " parts, and removing surface atoms."
        nonsurf = remove_surface_atoms(df)
        atom_cats = np.zeros(len(trans_values))
        atom_cats[nonsurf] = KM.fit_predict(trans_values[nonsurf])
        return df, atom_cats
        # make_output(fn,df,atom_cats)

    classdict = pickle.load(open("classdict.p", "rb"))

    print "\n making prediction..."
    if full_clf == False:
        predictions = {}
        values = {}
        for k in classdict.keys():
            # values[k]=clfdict[k].predict(trans_values)
            predictions[k] = clfdict[k].predict_proba(trans_values)

        out = np.zeros(len(trans_values))
        # need the maximum-likelihood defect (could be bulk if all are small)
        for x in range(len(trans_values)):
            poss_def = np.zeros(len(classdict))
            for k in classdict.keys():
                if k > 9:
                    p = 1
                else:
                    p = 0
                if predictions[k][x][p] >= 0.60 and classdict[k].find(expected_struct) != -1:
                    poss_def[k] = predictions[k][x][p]
            if sum(poss_def) > 0:
                out[x] = np.array(poss_def).argmax()
            else:
                out[x] = -1
    else:  # default full clf structure
        probs = clffull.predict_proba(trans_values)
        out = []
        for p in probs:
            # if the difference is less than 5%, and one option is a defect, take the defect!
            mp = np.array(p).argmax()
            """			
			if max(p) - second_largest(p) <= 0.05 and (np.array(p).argmax() == 9 or np.array(p).argmax() == 10): 
				mp = (np.array(p)==second_largest(p)).argmax()
			else:
				mp = np.array(p).argmax()
			"""
            out.append(mp)

        predictions = probs
    return df, out, trans_values, tdata, predictions, classdict
print "KMeans: F1 score on test: {}".format(f1_score(labels_test, pred_labels_test))
 
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script.

# set up a classifier and grid parameters
base_clf  = KMeans()
parameters = {'random_state' : [42], 
              'n_clusters'   : [4, 8, 16],
              'max_iter'     : [300, 1000, 10000],
              'init'         : ['k-means++', 'random'] }

# do the grid search and print results
print "KMeans Grid search ..."
from sklearn import grid_search
gs_clf = grid_search.GridSearchCV(base_clf, parameters, scoring='f1')
gs_clf.fit(features_train, labels_train)
clf =  gs_clf.best_estimator_
best_parameters = clf.get_params()
print "Best score: {:0.3f} with parameters:".format(gs_clf.best_score_)
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))  

test_classifier(clf, my_dataset, features_list)

### Dump your classifier, dataset, and features_list so 
### anyone can run/check your results.

dump_classifier_and_data(clf, my_dataset, features_list)

Example #21
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

data = pd.read_csv('kmeans.csv')
print("The data :", data)
X = data.iloc[:, :-1]
print("X: ", X)
y = data.iloc[:, -1]
print("Y :", y)
model = KMeans(n_clusters=2)
model.fit(X)
print("Bruh :", model.get_params().keys())
print("bruh2 :", model.labels_)

print(model.cluster_centers_)
predictions = model.predict(X)
print("The prediction are :", predictions)
print("The cluster centers :", model.cluster_centers_)
centers = model.cluster_centers_
plt.scatter(data.iloc[:, 0], data.iloc[:, 1])
plt.scatter(centers[:, 0], centers[:, 1])
plt.show()
Example #22
class ClusterCentroids(BaseUnderSampler):
    """Undersample by generating centroids based on clustering methods.

    Method that under samples the majority class by replacing a
    cluster of majority samples by the cluster centroid of a KMeans
    algorithm.  This algorithm keeps N majority samples by fitting the
    KMeans algorithm with N cluster to the majority class and using
    the coordinates of the N cluster centroids as the new majority
    samples.

    Read more in the :ref:`User Guide <cluster_centroids>`.

    Parameters
    ----------
    {sampling_strategy}

    {random_state}

    estimator : estimator object, default=None
        A scikit-learn compatible clustering method that exposes a `n_clusters`
        parameter and a `cluster_centers_` fitted attribute. By default, it will
        be a default :class:`~sklearn.cluster.KMeans` estimator.

    voting : {{"hard", "soft", "auto"}}, default='auto'
        Voting strategy to generate the new samples:

        - If ``'hard'``, the nearest-neighbors of the centroids found using the
          clustering algorithm will be used.
        - If ``'soft'``, the centroids found by the clustering algorithm will
          be used.
        - If ``'auto'``, if the input is sparse, it will default on ``'hard'``
          otherwise, ``'soft'`` will be used.

        .. versionadded:: 0.3.0

    Attributes
    ----------
    sampling_strategy_ : dict
        Dictionary containing the information to sample the dataset. The keys
        corresponds to the class labels from which to sample and the values
        are the number of samples to sample.

    estimator_ : estimator object
        The validated estimator created from the `estimator` parameter.

    voting_ : str
        The validated voting strategy.

    n_features_in_ : int
        Number of features in the input dataset.

        .. versionadded:: 0.9

    See Also
    --------
    EditedNearestNeighbours : Under-sampling by editing samples.

    CondensedNearestNeighbour: Under-sampling by condensing samples.

    Notes
    -----
    Supports multi-class resampling by sampling each class independently.

    Examples
    --------

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
ClusterCentroids # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> cc = ClusterCentroids(random_state=42)
    >>> X_res, y_res = cc.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    ... # doctest: +ELLIPSIS
    Resampled dataset shape Counter({{...}})
    """
    @_deprecate_positional_args
    def __init__(
        self,
        *,
        sampling_strategy="auto",
        random_state=None,
        estimator=None,
        voting="auto",
    ):
        super().__init__(sampling_strategy=sampling_strategy)
        self.random_state = random_state
        self.estimator = estimator
        self.voting = voting

    def _validate_estimator(self):
        """Private function to create the KMeans estimator"""
        if self.estimator is None:
            self.estimator_ = KMeans(random_state=self.random_state)
        else:
            self.estimator_ = clone(self.estimator)
            if "n_clusters" not in self.estimator_.get_params():
                raise ValueError(
                    "`estimator` should be a clustering estimator exposing a parameter"
                    " `n_clusters` and a fitted parameter `cluster_centers_`.")

    def _generate_sample(self, X, y, centroids, target_class):
        if self.voting_ == "hard":
            nearest_neighbors = NearestNeighbors(n_neighbors=1)
            nearest_neighbors.fit(X, y)
            indices = nearest_neighbors.kneighbors(centroids,
                                                   return_distance=False)
            X_new = _safe_indexing(X, np.squeeze(indices))
        else:
            if sparse.issparse(X):
                X_new = sparse.csr_matrix(centroids, dtype=X.dtype)
            else:
                X_new = centroids
        y_new = np.array([target_class] * centroids.shape[0], dtype=y.dtype)

        return X_new, y_new

    def _fit_resample(self, X, y):
        self._validate_estimator()

        if self.voting == "auto":
            if sparse.issparse(X):
                self.voting_ = "hard"
            else:
                self.voting_ = "soft"
        else:
            if self.voting in VOTING_KIND:
                self.voting_ = self.voting
            else:
                raise ValueError(f"'voting' needs to be one of {VOTING_KIND}. "
                                 f"Got {self.voting} instead.")

        X_resampled, y_resampled = [], []
        for target_class in np.unique(y):
            target_class_indices = np.flatnonzero(y == target_class)
            if target_class in self.sampling_strategy_.keys():
                n_samples = self.sampling_strategy_[target_class]
                self.estimator_.set_params(**{"n_clusters": n_samples})
                self.estimator_.fit(_safe_indexing(X, target_class_indices))
                if not hasattr(self.estimator_, "cluster_centers_"):
                    raise RuntimeError(
                        "`estimator` should be a clustering estimator exposing a "
                        "fitted parameter `cluster_centers_`.")
                X_new, y_new = self._generate_sample(
                    _safe_indexing(X, target_class_indices),
                    _safe_indexing(y, target_class_indices),
                    self.estimator_.cluster_centers_,
                    target_class,
                )
                X_resampled.append(X_new)
                y_resampled.append(y_new)
            else:
                X_resampled.append(_safe_indexing(X, target_class_indices))
                y_resampled.append(_safe_indexing(y, target_class_indices))

        if sparse.issparse(X):
            X_resampled = sparse.vstack(X_resampled)
        else:
            X_resampled = np.vstack(X_resampled)
        y_resampled = np.hstack(y_resampled)

        return X_resampled, np.array(y_resampled, dtype=y.dtype)

    def _more_tags(self):
        return {"sample_indices": False}
Example #23
class KMEANS(object):
    def __init__(self,
                 n_clusters=8,
                 init='k-means++',
                 n_init=10,
                 max_iter=300,
                 tol=1e-4,
                 precompute_distances='auto',
                 verbose=0,
                 random_state=None,
                 copy_x=True,
                 n_jobs=None,
                 algorithm='auto'):
        """
        :param n_clusters:  簇数,int 可选项
        :param init:  "k-means++", "randoms"  或一个ndarray ,默认kmeans++
        :param n_init: int 默认10
        :param max_iter:  最大迭代数 int  默认300
        :param tol:关于惯性宣布收敛的相对容忍度 1e-4,
        :param precompute_distances:  "auto" True, False,预计算距离,快速但耗内存。
        自动时,如果n_samples * n_clusters > 1200万,则不计算距离. True,计算, False 不计算
        :param verbose: 默认为0, int, 冗余模型
        :param random_state:  随机种子
        :param copy_x:
        :param n_jobs:  # 进程数  int
        :param algorithm: 默认为auto, {"auto", "full", "elkan"},
        """
        self.model = KMeans(n_clusters=n_clusters,
                            init=init,
                            n_init=n_init,
                            max_iter=max_iter,
                            tol=tol,
                            precompute_distances=precompute_distances,
                            verbose=verbose,
                            random_state=random_state,
                            copy_x=copy_x,
                            n_jobs=n_jobs,
                            algorithm=algorithm)

    def fit(self, x, y=None, sample_weight=None):  # compute the k-means clustering
        self.model.fit(X=x, y=y, sample_weight=sample_weight)

    def transform(self, x):  # transform x into the cluster-distance space
        return self.model.transform(X=x)

    def fit_transform(self, x, y=None, sample_weight=None):
        return self.model.fit_transform(X=x, y=y, sample_weight=sample_weight)

    def fit_predict(self,
                    x,
                    y=None,
                    sample_weight=None):  # == fit + predict: compute cluster centers and predict each sample's cluster
        return self.model.fit_predict(X=x, y=y, sample_weight=sample_weight)

    def get_params(self, deep=True):
        return self.model.get_params(deep=deep)

    def predict(self, x):  # predict which cluster each sample is closest to
        return self.model.predict(X=x)

    def score(self, x, y=None, sample_weight=None):  # opposite of the value of x on the K-means objective
        return self.model.score(X=x, y=y, sample_weight=sample_weight)

    def set_params(self, **params):
        self.model.set_params(**params)

    def get_attributes(self):  # only available after the model has been fitted
        cluster_centers = self.model.cluster_centers_  # cluster center coordinates, array [n_clusters, n_features]
        labels = self.model.labels_  # label of each point
        inertia = self.model.inertia_  # sum of squared distances of samples to their closest cluster center
        n_iter = self.model.n_iter_  # number of iterations run
        return cluster_centers, labels, inertia, n_iter
Example #24
def editor_input_clustering(filtered_editor_log, code_template, user_info, ankors):
    def _unicode(c):
        if u'\u4e00' <= c <= u'\u9fff':
            return False
        try:
            c.decode('ascii')
        except UnicodeDecodeError:
            return False
        except UnicodeEncodeError:
            return False
        return True
    editor_cmd_data = filtered_editor_log.map(lambda x: x.filter_editor_log(['insert', 'remove', 'paste', 'copy', 'save', 'open'])).map(lambda x: x.combine_editor_input())
    insert_data = editor_cmd_data.flatmap(lambda x: x.cmd_list).filter_by(lambda x: x['action']==u'insert').map(lambda x: x['lines'][0])
    template_filtered_data = editor_cmd_data.flatmap(lambda x: x.cmd_list).filter_by(lambda x: x['action']==u'paste').map(lambda x: x['text'])
    template_filtered_data = template_filtered_data.map(lambda x: code_template.strip_template(x))
    total_input = data_reader.SList(insert_data + template_filtered_data.flatmap(lambda x: x.split(u"\n")))
    total_input = total_input.filter_by(lambda x: len(filter(lambda y: not y in [u"\n", u"\t", u"\r", u" "] and _unicode(y), x))>5)
    print len(total_input)
    feature_set, ankor_set = _generate_feature_set(total_input,ankors.splitter)
    print len(feature_set)
    # pca = PCA(n_components=2)
    # pca.fit(feature_set)
    # plot_data = pca.transform(feature_set)

    # fig, ax = report_tools.prepare_plot()
    # ax.scatter([item[0] for item in plot_data], [item[1] for item in plot_data])
    # plt.title('Scatter plot on editor input')
    # plt.savefig('scatter_editor_input.png')

    # fig = plt.figure()
    # ax = fig.add_subplot(111, projection='3d')
    # ax.scatter([item[0] for item in plot_data], [item[1] for item in plot_data], [item[2] for item in plot_data])
    # plt.title('Scatter plot on editor input')
    # plt.savefig('3d_scatter_editor_input.png')

    # db = Birch().fit(feature_set)
    # labels = db.labels_
    model = KMeans(n_clusters=300)
    labels = model.fit_predict(feature_set)
    result  = zip(labels, total_input)
    size_list = []
    cluster_list = []
    print len(set(labels))
    for label in set(labels):
        tmp_result = filter(lambda x: x[0]==label, result)
        if len(tmp_result) > 100:
            size_list.append(len(tmp_result))
            cluster_list.append(label)
            with codecs.open("clustering_{}.txt".format(label), 'w', 'utf-8') as f_out:
                f_out.write(u"Size of cluster: {}\n".format(len(tmp_result)))
                for item in tmp_result:
                    f_out.write(u"{}\n".format(item[1]))
    fig, ax = report_tools.prepare_plot(figsize=(20, 5))
    ind = np.arange(len(size_list))
    width = 0.5
    ax.bar(ind, size_list, width)
    ax.set_xticks(ind+width)
    ax.set_xticklabels(['C{}'.format(i) for i in cluster_list], rotation='90')
    plt.title('Cluster size')
    plt.savefig('cluster_size.png')

    ankor_label = model.predict(ankor_set)
    with open('ankor_label.txt', 'w') as f_out:
        for item in zip(ankors.splitter, ankor_label):
            f_out.write("{}\n{}\n\n".format(item[0], item[1]))

    with open('model.json', 'w') as f_out:
        json.dump(model.get_params(), f_out)
Example #25
print(newarr)
testarr[5:10, :] = newarr
testarr

# Generate 2 groups of random 2d data
bgroup = 20 * np.random.rand(100, 2)
rgroup = -20 * np.random.rand(50, 2)
# Replace the second half of bgroup with rgroup
bgroup[50:100, :] = rgroup
plt.scatter(bgroup[0:50, 0], bgroup[0:50, 1], s = 10, c = 'b')
plt.scatter(bgroup[50:100, 0], bgroup[50:100, 1], s = 10, c = 'r')
plt.show()

Kmean = KMeans(n_clusters = 2)
Kmean.fit(bgroup)
Kmean.get_params()

print(Kmean.labels_)
labels = Kmean.labels_
centroids = Kmean.cluster_centers_

for i in range(100):
    if labels[i] == 1:
        plt.scatter(bgroup[i][0], bgroup[i][1], s = 10, c = 'c')
    else:
        plt.scatter(bgroup[i][0], bgroup[i][1], s = 10, c = 'm')

plt.scatter(centroids[0][0], centroids[0][1], s = 100, c = 'm', marker='s')
plt.scatter(centroids[1][0], centroids[1][1], s = 100, c = 'c', marker='s')
plt.show()
Example #26
# Making a tree structure by adding the MicroDataset objects as children of this group
cluster_grp.addChildren([
    ds_label_mat, ds_cluster_centroids, ds_cluster_inds, ds_cluster_vals,
    ds_labels_spec_inds, ds_labels_spec_vals
])

print('\nWill write the following tree:')
cluster_grp.showTree()

cluster_grp.attrs['num_clusters'] = num_clusters
cluster_grp.attrs['num_samples'] = h5_main.shape[0]
cluster_grp.attrs['cluster_algorithm'] = 'KMeans'

# Get the parameters of the KMeans object that was used and write them as attributes of the group
for parm in estimators.get_params().keys():
    cluster_grp.attrs[parm] = estimators.get_params()[parm]

print('\nWriting the following attributes to the group:')
for at_name in cluster_grp.attrs:
    print(at_name, ':', cluster_grp.attrs[at_name])

###############################################################################
# Write to H5 and access the written objects
# ==========================================
#
# Once the tree is prepared (previous cell), ioHDF5 will handle all the file writing.

h5_clust_refs = hdf.writeData(cluster_grp, print_log=True)

h5_labels = px.hdf_utils.getH5DsetRefs(['Labels'], h5_clust_refs)[0]
Example #27
##data.plot(x='Ddeq', y='V', kind='scatter',color='Black', title='Title')

##Clustering

######################################################################################
##K-Means algorithm (Preprocessing is essential for kmeans algorithm (mean=0 std=1))##
######################################################################################

normalize = StandardScaler().fit(data[['Ddeq', 'V', 'Aratio', 'ASPratio']])
normalized_data = normalize.transform(data[['Ddeq', 'V', 'Aratio',
                                            'ASPratio']])

km = KMeans(n_clusters=number_cluster, n_init=40, init='k-means++')
km.fit(normalized_data)
labels = data['KM_ID'] = km.predict(normalized_data)
km.get_params()

#data.plot(kind='scatter', x='Ddeq',y='V', c=labels, colormap='Set1')
#plt.axis([0, 5, 0, 3])

#Divide the cluster (different type of snow)

KM_cluster0 = data[data['KM_ID'] == 0]
KM_cluster1 = data[data['KM_ID'] == 1]
KM_cluster2 = data[data['KM_ID'] == 2]

#########################################################
##Gaussian Mixture (no necessity of preprocessing data)##
#########################################################

gmix = mixture.GaussianMixture(n_components=number_cluster)
Example #28
        info = {
            'name': 'Agglomerative Clustering',
            "rep": rep,
            "rep_params": rep_params,
            'params': params
        }

        methods.append((agg, info))
elif algo == "kmeans" or algo == "k-means":
    # k-means
    for n_clusters in n_clusterss:
        km = KMeans(n_clusters=n_clusters,
                    n_jobs=-1,
                    max_iter=1000,
                    n_init=100)
        params = km.get_params()
        info = {
            'name': 'K-Means',
            "rep": rep,
            "rep_params": rep_params,
            'params': params
        }
        methods.append((km, info))

elif algo == "online":
    # online
    oc = OnlineClustering(tau=tau)
    info = {
        'name': "Online",
        "rep": rep,
        "rep_params": rep_params,
Example #29
kmeans = KMeans(n_clusters = n_clusters, init = init, n_init = n_init, max_iter = max_iter, random_state = random_state)

# training the model
kmeans.fit(x_train)

# looking at the attributes
cluster_centers = kmeans.cluster_centers_ # the feature values for the centres of the clusters
labels = kmeans.labels_ # cluster number each training sample is assigned to
inertia = kmeans.inertia_ # sum of squared distances between data points and their cluster centre
n_iter = kmeans.n_iter_ # number of iteration runs

# looking at the methods
fit_predict = kmeans.fit_predict(x_train) # like labels_: the cluster number for each training sample
fit_transform = kmeans.fit_transform(x_train) # perform clustering and convert x_train to a cluster-distance space
get_params = kmeans.get_params() # returns the model parameters
prediction = kmeans.predict(x_test) # running the test set through the model
train_score = kmeans.score(x_train) # Opposite of the value of the training dataset on the K-means objective
test_score = kmeans.score(x_test) # Opposite of the value of the test dataset on the K-means objective
transform = kmeans.transform(x_train) # convert x_train to cluster-distance space

# I am interested to see how the clustering worked
# attached labels_ to train dataset
new_train_df = x_train_with_names
new_train_df['labels'] = labels

# now I want to look at each cluster
array_of_clusters = []
for i in np.unique(labels):
    array_of_clusters.append(new_train_df.loc[new_train_df['labels'] == i])
Example #30
#------------------------------------------------------------------------
# changed to a two-level kmeans

from sklearn.cluster import KMeans
import gc
xx, zz = x, z
kmeans = KMeans(n_clusters=25, n_jobs=-1).fit(xx)
print(kmeans.score(xx, zz))  #-19.0829318168
#Why does score come out negative?? Is this being treated as regression? Why use it for classification?
#Answer: each value in x has the per-dimension mean of its assigned cluster subtracted, the differences are squared and summed up (and negated). The y in score(x, y) is not used at all.
fenlei1 = kmeans.predict(xx)
fenlei1 = pd.Series(fenlei1)
way4 = 'C:/Users/Administrator/Desktop/ali/data/3_tempt/fenlei1.csv'
fenlei1.to_csv(way4, index=False)
way5 = 'C:/Users/Administrator/Desktop/ali/data/3_tempt/params1.csv'
pd.Series(kmeans.get_params()).to_csv(way5)

# second-level kmeans
for i in range(25):
    print(i)
    xuanze = ((fenlei1 == i))

    # some clusters have fewer than 40 samples (in fact only i == 21 has just three)
    if len(xx[xuanze]) <= 40:
        kmeans_tempt = KMeans(len(xx[xuanze]), n_jobs=-1).fit(xx[xuanze])
    else:  # the vast majority of clusters have more than 40 samples
        kmeans_tempt = KMeans(n_clusters=40, n_jobs=-1).fit(xx[xuanze])
    print(kmeans_tempt.score(xx[xuanze]))
    fenlei_tempt = pd.Series(kmeans_tempt.predict(xx[xuanze]))
    way_fenlei_tempt = 'C:/Users/Administrator/Desktop/ali/data/3_tempt/second_kmeans/fenlei2_' + str(
        i) + '.csv'
Example #31
class ClusterBag(BaseEstimator):
    def __init__(self, estimatorcluster=None, estimatormodel=None):
        '''
        Parameters: the estimator to use in the clustering step,
        the model to use in the supervised fit,
        and the parameters for both of them.
        '''

        if estimatorcluster is None:
            self.estimatorcluster = KMeans(n_clusters=3)
        else:
            self.estimatorcluster = estimatorcluster

        if estimatormodel is None:
            self.estimatormodel = LinearRegression()
        else:
            self.estimatormodel = estimatormodel

        self.estimators_ = {}

        self.is_fitted_cluster = 0
        self.is_fitted = 0

    def fit(self, X, y):
        '''
        fit the cluster model then the estimators models
        '''

        self.fit_estimator_cluster(X)
        self.fit_estimators(X, y, self.estimatormodel)

        self.is_fitted_cluster = 1

        return self

    def fit_estimator_cluster(self, X, y=None):
        '''
        Fit the cluster model
        '''

        ## Fit the cluster estimator :

        self.labels_ = self.estimatorcluster.fit_predict(X)
        self.estimators_ = {}

        self.is_fitted_cluster = 1

    def fit_estimator(self, X, y, label_cluster, estimatorprediction):
        '''
        fit an estimator on the dataset related to his cluster
        '''

        ## check if estimatorcluster is fitted

        if self.is_fitted_cluster == 0:
            raise Exception('You need to first fit the cluster estimator ')

        ## Create a clone of our model

        estimator = clone(estimatorprediction)

        ## Range our model cleanly

        ind = []
        ind = np.where(self.labels_ == label_cluster)[0]
        self.estimators_[label_cluster] = estimator

        ## Fit our model

        self.estimators_[label_cluster].fit(X[ind], y[ind])

    def fit_estimators(self, X, y, estimatorprediction):
        '''
        fit all models for all clusters 
        '''

        ## check if estimatorcluster is fitted

        if self.is_fitted_cluster == 0:
            raise Exception('You need to first fit the cluster estimator ')

        ## Fit each cluster with our model

        for label in np.unique(self.labels_):
            self.fit_estimator(X, y, label, estimatorprediction)

        self.is_fitted = 1

    def predict(self, X):
        '''
        make a prediction by first finding the cluster and the model related to X 
        and then applying predict on the correct model
        '''

        ## Check everything is fitted

        if self.is_fitted == 0:
            raise Exception('You need to first fit the cluster bag estimator')

        label = self.estimatorcluster.predict(X)

        prediction = []
        for i in range(0, len(label)):
            result = self.estimators_[label[i]].predict(X[i].reshape(1, -1))
            prediction.append(result)

        return np.array(prediction)

    def get_params(self, deep=True):
        out = {}

        out["estimatorcluster"] = self.estimatorcluster
        out["estimatormodel"] = self.estimatorcluster

        for model in self.estimators_:
            out["estimator" + str(model)] = self.estimators_[model]

        for key in self.estimatorcluster.get_params():
            out["estimatorcluster__" +
                str(key)] = self.estimatorcluster.get_params()[key]

        for key in self.estimatormodel.get_params():
            out["estimatormodel__" +
                str(key)] = self.estimatormodel.get_params()[key]

        for model in self.estimators_:
            for key in self.estimators_[model].get_params():
                out["estimator" + str(model) + "__" +
                    str(key)] = self.estimators_[model].get_params()[key]

        return out
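A minimal usage sketch of ClusterBag on synthetic data (assuming KMeans, LinearRegression and clone are imported as in the snippet): the inputs are clustered, one regressor is fitted per cluster, and predictions are routed through the matching regressor.

import numpy as np

X = np.random.rand(60, 3)
y = X.sum(axis=1)                      # simple synthetic target
bag = ClusterBag()                     # defaults: KMeans(n_clusters=3) + LinearRegression()
bag.fit(X, y)
print(bag.predict(X[:5]).ravel())      # per-sample predictions from the per-cluster models
print(sorted(bag.get_params())[:5])    # nested estimator parameters, sklearn-style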
Example #32
class CodeBook(BaseEstimator, ClusterMixin, TransformerMixin):
    """Code Book creation and manimpulation for Bag-of-(visual)Fetures.

    Parameters
    ----------

    n_words : int, optional, default: 36
        The number of clusters to form as well as the number of
        words (centroids) to generate.

    cluster_core : sklearn.cluster, default: KMeans
        Clustering technique used to quantisize the feature space to
        generate the code book.
        #TODO: its default should be described by _default_clustering()

    max_iter : int, default: 300
        Maximum number of iterations of the k-means algorithm for a
        single run.

    n_init : int, default: 10
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    init : {'k-means++', 'random' or an ndarray}
        Method for initialization, defaults to 'k-means++':

        'k-means++' : selects initial cluster centers for k-mean
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.


    precompute_distances : {'auto', True, False}
        Precompute distances (faster but takes more memory).

        'auto' : do not precompute distances if n_samples * n_words > 12
        million. This corresponds to about 100MB overhead per job using
        double precision.

        True : always precompute distances

        False : never precompute distances

    tol : float, default: 1e-4
        Relative tolerance with regards to inertia to declare convergence

    n_jobs : int
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    verbose : int, default 0
        Verbosity mode.

    copy_x : boolean, default True
        When pre-computing distances it is more numerically accurate to center
        the data first.  If copy_x is True, then the original data is not
        modified.  If False, the original data is modified, and put back before
        the function returns, but small numerical differences may be introduced
        by subtracting and then adding the data mean.

    Attributes
    ----------
    cook_book_ : array, [n_words, n_features]
        Coordinates of cluster centers

    labels_ :
        Labels of each point

    inertia_ : float
        Sum of distances of samples to their closest cluster center.

    Notes
    ------
    The k-means problem is solved using Lloyd's algorithm.

    The average complexity is given by O(k n T), where n is the number of
    samples and T is the number of iteration.

    The worst case complexity is given by O(n^(k+2/p)) with
    n = n_samples, p = n_features. (D. Arthur and S. Vassilvitskii,
    'How slow is the k-means method?' SoCG2006)

    In practice, the k-means algorithm is very fast (one of the fastest
    clustering algorithms available), but it falls in local minima. That's why
    it can be useful to restart it several times.

    See also
    --------

    dictionary_code
    """
    #TODO: test n_words default = 36
    #TODO: do these parameters make sense: max_iter, n_init
    #TODO: change the cluster_core from cluster_code=None to
    #      cluster_code=_default_cluster(), doing all the appropriate
    #      changes. Check that BaseEstimator asks for strict declaration
    #
    #      def _default_cluster(self, n_words=36,
    #                   init='k-means++', n_init=10, max_iter=300,
    #                   tol=1e-4, precompute_distances='auto',
    #                   verbose=0, random_state=None, copy_x=True, n_jobs=1):
    #          """Default space clustering strategy to determine the code book"""
    #          from sklearn.cluster import KMeans
    #          return KMeans(n_clusters=n_words, ...)
    #
    #       Then self.set_param can also be used to setup the parameters for the
    #       current classification methodology

    def __init__(self, n_words=36, cluster_core=None, init='k-means++',
                 n_init=10, max_iter=300, tol=1e-4, precompute_distances='auto',
                 verbose=0, random_state=None, copy_x=True, n_jobs=1):

        if hasattr(init, '__array__'):
            n_words = init.shape[0]
            init = np.asarray(init, dtype=np.float64)

        self.n_words = n_words
        self.cluster_core_name = cluster_core
        self.init = init
        self.max_iter = max_iter
        self.tol = tol
        self.precompute_distances = precompute_distances
        self.n_init = n_init
        self.verbose = verbose
        self.random_state = random_state
        self.copy_x = copy_x
        self.n_jobs = n_jobs

        if self.cluster_core_name == 'random-words':
            self.n_init = 1
            self.max_iter = 1
            print 'The number of iterations and tries has been fixed to 1.'

        if ( (self.cluster_core_name is     None      ) or 
             (self.cluster_core_name == 'random-words')    ):
            from sklearn.cluster import KMeans
            self.cluster_core = KMeans(n_clusters=self.n_words, init=self.init,
                                       max_iter=self.max_iter, tol=self.tol,
                                       precompute_distances=self.precompute_distances,
                                       n_init=self.n_init, verbose=self.verbose,
                                       random_state=self.random_state,
                                       copy_x=self.copy_x, n_jobs=self.n_jobs)
    def _check_fit_data(self, X):
        """Verify that the number of samples given is larger than n_words"""
        X = check_array(X, accept_sparse='csr', dtype=np.float64)
        if X.shape[0] < self.n_words:
            raise ValueError("n_samples=%d should be >= n_words=%d" % (
                X.shape[0], self.n_words))
        return X

    def _check_test_data(self, X):
        X = check_array(X, accept_sparse='csr')
        n_samples, n_features = X.shape
        expected_n_features = self.cook_book_.shape[1]
        if not n_features == expected_n_features:
            raise ValueError("Incorrect number of features. "
                             "Got %d features, expected %d" % (
                                 n_features, expected_n_features))
        if X.dtype.kind != 'f':
            warnings.warn("Got data type %s, converted to float "
                          "to avoid overflows" % X.dtype,
                          RuntimeWarning, stacklevel=2)
            X = X.astype(np.float)

        return X

    def fit(self, X, y=None):
        """Compute the clustering of the space.
        #TODO: right now only for K_means, however a dispatcher is
               needed so that other clustering strategies are called
               indistinctly

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
        """
        self.cluster_core = self.cluster_core.fit(X, y)
        return self

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample.

        Convenience method; equivalent to calling fit(X) followed by
        predict(X).
        """
        #return self.fit(X).labels_
        raise NotImplementedError

    def fit_transform(self, X, y=None):
        """Compute clustering and transform X to cluster-distance space.

        Equivalent to fit(X).transform(X), but more efficiently implemented.
        """
        # Currently, this just skips a copy of the data if it is not in
        # np.array or CSR format already.
        # XXX This skips _check_test_data, which may change the dtype;
        # we should refactor the input validation.
        # 
        # X = self._check_fit_data(X)
        # return self.fit(X)._transform(X)
        raise NotImplementedError

    def transform(self, X, y=None):
        """Transform X to a cluster-distance space.

        In the new space, each dimension is the distance to the cluster
        centers.  Note that even if X is sparse, the array returned by
        `transform` will typically be dense.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to transform.

        Returns
        -------
        X_new : array, shape [n_samples, k]
            X transformed in the new space.
        """
        # check_is_fitted(self, 'cook_book_')

        # X = self._check_test_data(X)
        # return self._transform(X)
        raise NotImplementedError

    def _transform(self, X):
        """guts of transform method; no input validation"""
        # return euclidean_distances(X, self.cook_book_)
        raise NotImplementedError


    def predict(self, X):
        """Predicts the index value of the closest word within the code book.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to predict.

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the closest word within the code book.
        """
        return self.cluster_core.predict(X)

    def get_dictionary(self):
        """Retrieves the words forming the code book

        Returns
        -------
        dictionary : array, shape [n_words, n_features]
            Code book elements (words of the dictionary) represented
            in the feature space
        """
        #TODO: check that the coodebook is fitted
        return self.cluster_core.cluster_centers_

    def get_BoF_descriptor(self, X):
        """Return the normalized bag-of-features histogram of X.

        E.g. with n_words=4 and predicted words [0, 0, 1, 3], the
        descriptor is [0.5, 0.25, 0.0, 0.25].
        """
        # norm = lambda x: x.astype(float)/np.linalg.norm(x)
        # return norm(np.bincount(self.predict(X)))
        hist, _ = np.histogram(self.predict(X),
                               bins=range(self.n_words + 1),
                               density=True)
        return hist

    def get_BoF_pramide_descriptor(self, X):
        """ Split the image (or volume) in a piramide manner and get
        a descriptor for each level (and part). Concatenate the output.
        TODO: build proper documentaiton

        """
        def split_data_by2(X):
            # TODO: rewrite this in a nice manner that uses len(X.shape)
            # TODO: this can rise ERROR if length of X is odd
            parts = [np.split(x, 2, axis=2) for x in [np.split(x, 2, axis=1) for x in
             np.slit(X, 2, axis=0) ]]
            return parts

        def get_occurrences(X):
            return np.histogram(X, bins=range(self.n_words+1))

        def build_piramide(X, level=2):
            if level is 0:
                return get_occurrences(X)
            else:
                return [get_occurrences(X)] + [build_piramide(Xpart, level-1)
                       for Xpart in split_data_by2(X)]

        return build_piramide(self.predict(X))

    def get_params(self, deep=True):
        return self.cluster_core.get_params()

    def set_params(self, **params):
        self.cluster_core.set_params(**params)
        return self
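
# Minimal usage sketch for the bag-of-features wrapper above. The class head
# and constructor signature fall outside this excerpt, so the class name
# `BagOfWords`, the module it is imported from and the `n_words` value are
# illustrative assumptions, not part of the original example.
import numpy as np

from bag_of_words import BagOfWords  # hypothetical module/class name

rng = np.random.RandomState(0)
descriptors = rng.rand(500, 64)   # e.g. 500 local features of dimension 64
query = rng.rand(120, 64)         # features extracted from a single image

bow = BagOfWords(n_words=32)      # builds a KMeans code book internally
bow.fit(descriptors)

print(bow.get_dictionary().shape)     # (32, 64): one "word" per cluster center
print(bow.get_BoF_descriptor(query))  # normalized histogram over the 32 words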
Beispiel #33
0
import pickle
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
import numpy as np

#load data from pickles
labels = pickle.load(open('labels.p', 'rb'))
# encoded_data = pickle.load(open('encoded_data.p','rb'))

kmeans = KMeans()
print(kmeans.get_params())
        db = KMeans(init='random',
                    n_init=50).fit(X)
        #db = DBSCAN(eps=e, min_samples=s).fit(X)
        #core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        #core_samples_mask[db.core_sample_indices_] = True
        labels = db.labels_

        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

        #print('Estimated number of clusters: %d' % n_clusters_)

        #print("Silhouette Coefficient: %0.3f"
        #      % metrics.silhouette_score(X, labels))
        performance.append(
            [db.get_params(),
             metrics.silhouette_score(X, labels)])
        print(db.cluster_centers_.shape)
    except Exception:
        pass

id = generate_id()
performDf = pd.DataFrame(np.array(performance),
                         columns=cfg.header_cluster_performance)
performDf.to_csv(cfg.performanceClustering.format(id),
                 index=False,
                 sep=cfg.sep)
#
# print("Eps: {}, min_samples: {}".format(e, s))
#
# db = DBSCAN(eps=e, min_samples=s).fit(X)
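
# A self-contained sketch of the parameter-sweep pattern used in the fragment
# above: its loop head is cut off here, so the sweep range, the synthetic data
# and the CSV filename below are illustrative assumptions.
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(300, 4)

performance = []
for n_clusters in range(2, 10):          # assumed sweep over cluster counts
    km = KMeans(n_clusters=n_clusters,
                init='random',
                n_init=50).fit(X)
    performance.append([km.get_params(),
                        metrics.silhouette_score(X, km.labels_)])

# Persist the sweep results, mirroring the DataFrame/CSV step above
perform_df = pd.DataFrame(performance, columns=['params', 'silhouette'])
perform_df.to_csv('cluster_performance.csv', index=False)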
log_kmeans_clustering_summary(km, X, n_clusters=17)

# tests
exp = neptune.get_experiment()

# check logs
assert list(exp.get_logs().keys()) == ['charts_sklearn'
                                       ], '{} - incorrect logs'.format(exp)

# check cluster labels
assert X.shape[0] == len(
    km.labels_), '{} incorrect number of cluster labels'.format(exp)

# check sklearn parameters
assert set(exp.get_properties().keys()) == set(
    km.get_params().keys()), '{} parameters do not match'.format(exp)

# check neptune parameters
assert set(exp.get_parameters().keys()) == set(
    parameters.keys()), '{} parameters do not match'.format(exp)
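
# Independent of Neptune, the get_params()/set_params() round trip that the
# assertions above rely on can be reproduced with scikit-learn alone. A small
# sketch (the data and the n_clusters value are synthetic, for illustration):
import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(200, 3)
km = KMeans(n_clusters=17, random_state=0).fit(X)

# get_params() returns every constructor argument as a dict ...
params = km.get_params()
assert params['n_clusters'] == 17

# ... which is exactly what set_params() consumes again.
km_copy = KMeans().set_params(**params)
assert km_copy.get_params() == params
assert len(km.labels_) == X.shape[0]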

## Step 5: Stop Neptune experiment after logging summary

neptune.stop()

## Explore Results

# Other logging options

## Before you start: create and fit gradient boosting classifier
Beispiel #36
0
class SameSizeKMeans(object):
    '''SameSize K-Means clustering algorithm

    Parameters
    ----------
    n_clusters : int, optional, default: 8
        The number of clusters to form as well as the number of
        centroids to generate.

    init_model: KMeans object, default: None
        The initial KMeans model to fit on. Leaving as None
        defaults to KMeans with default parameters except for
        passing the as-specified n_clusters.

    save_labels: bool, default: True
        Whether to save labels at each step of the fitting
        process.

    metric: str, default: 'l2'
        Specifies the distance metric to use after the initial
        KMeans clustering algorithm is run. Only 'l2' (Euclidean
        distance) is currently guaranteed to work.

    Attributes
    ----------
    cluster_centers_ : dict
        Final coordinates of the cluster centers, keyed by
        cluster label

    final_labels: array, [n_points,]
        Final labels of each point

    all_labels_: None or list
        Labels of each point at each step of the fitting
        process. None unless save_labels is set to True.

    '''

    LOOP_BUFFER = 1
    ORDER_DICT = {
        'largest_first': 'l',
        'smallest_first': 's',
        'l': 'l',
        's': 's',
        'min_v': 'min_v',
        'max_v': 'max_v'
    }

    def __init__(self,
                 n_clusters=8,
                 init_model=None,
                 save_labels=True,
                 metric='l2'):
        self.n_clusters = n_clusters
        if init_model is None:
            self.init_model = KMeans(n_clusters=n_clusters)
        else:
            self.init_model = init_model
        self.init_params = self.init_model.get_params()
        self.save_labels = save_labels
        if save_labels:
            self.all_labels_ = []
        else:
            self.all_labels_ = None
        self.metric = metric

    def fit(self, X, weights=None, weight_tol=0, order='s'):
        ''' Fit the SSKMeans model, populating final_labels

        Parameters
        ----------
        X: array, [n_points, 2]
            Coordinates of each point

        weights: array, shape (n_points,), optional
            Weight of each point. Defaults to a weight of 1 for
            every point when left as None.

        weight_tol : float, default: 0
            Fractional tolerance on each cluster's total weight

        order: str or list of strings, default: 's'
            The order in which to adjust clusters. Options are:
                - "smallest_first" or "s": Adjusts the smallest
                    clusters first
                - "largest_first" or "l": Adjusts the largest
                    clusters first
                - "min_v": Adjusts the cluster closest to the
                    optimal size first
                - "max_v": Adjusts the cluster farthest from
                    the optimal size first
            Alternatively, you may pass a list of any combination
            of these options. The list must be as long as n_clusters

        Returns
        -------
        None

        Populates the following model attributes:
            final_labels
            all_labels_ (if save_labels == True)
        '''

        self._save_fit_params(X, weights, weight_tol, order)

        # Get temporary labels from a naive KMeans clustering
        _temp_labels = self._fit_naive_KMeans(X)

        for step in range(self.n_clusters - self.LOOP_BUFFER):
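            # Only n_clusters - 1 clusters need to be adjusted explicitly:
            # once they are finalized, the remaining points make up the
            # last cluster, hence the LOOP_BUFFER of 1.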

            print('Starting step ', step + 1)

            # Determine the cluster to update on this step, along
            # with its associated info
            _coords, _weights, _label, _score_arr, _centroid = (
                self._get_cluster_info(order, step))

            # Adjust the determined cluster, and return its finalized
            # associated coordinates
            _coords = self._adjust_cluster(_coords, _weights, _label,
                                           _score_arr, order, step)

            # Set _coords as having finalized labels, and set their labels
            for coord in _coords:
                _mask = np.isclose(self.X, coord).all(1)
                self._labels_finalized[_mask] = True
                self.final_labels[_mask] = _label

            # Remove _label from the list of unfinalized clusters
            self._clusters_unfinalized = [
                label for label in self._clusters_unfinalized
                if label != _label
            ]

            # Update centroids
            self._update_centroids()

            if self.save_labels:
                _temp_labels = deepcopy(self.final_labels)
                self.all_labels_.append(deepcopy(_temp_labels))

    def _save_fit_params(self, X, weights, weight_tol, order):
        '''Saves the fit parameters during fit()

        Parameters
        ----------
        X : array, shape = [n_samples, 2]

        weights: array, shape = (n_samples,)

        weight_tol: float

        For descriptions, see fit()

        Returns
        -------
        None
        '''

        self.X = X
        self.weight_tol = weight_tol
        self.order = order

        if weights is None:
            self.weights = np.ones(X.shape[0])
        else:
            try:
                assert (weights.shape[0] == X.shape[0])
                self.weights = weights
            except AssertionError:
                raise AssertionError('X and weights are not the same length')

        self._ideal_cluster_weight = np.sum(self.weights) / self.n_clusters
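        # Example, for illustration: with 100 points of weight 1 and
        # n_clusters=4, each cluster is driven toward a total weight of
        # 100 / 4 = 25; a weight_tol of 0.05 then accepts clusters whose
        # total weight is within 25 * 0.05 = 1.25 of that target.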

        return None

    def _fit_naive_KMeans(self, X):
        '''Fits the initial KMeans model

        Parameters
        ----------
        X : array, shape = [n_samples, 2]
            See fit()

        Returns
        -------
        _temp_labels: array, shape = [n_samples,]
            The initial KMeans labels assigned to each point
        '''

        # Setup naive KMeans model
        _temp_model = KMeans(**self.init_params)

        # Get labels from KMeans model, and save them if desired
        _temp_labels = _temp_model.fit_predict(X)
        self._unique_labels = np.unique(_temp_labels)
        if self.save_labels:
            self.all_labels_.append(deepcopy(_temp_labels))

        # Create a dictionary of cluster centers by label
        self.cluster_centers_ = {
            label: np.mean(X[_temp_labels == label], axis=0)
            for label in self._unique_labels
        }

        # Save the current labels as the final labels. To be
        # updated as the program progresses.
        self.final_labels = deepcopy(_temp_labels)

        # Numpy array specifying whether the point is finalized
        # according to its assignment. Updated accordingly later.
        self._labels_finalized = np.array([False for coords in X])

        # List of unfinalized clusters
        self._clusters_unfinalized = [label for label in self._unique_labels]

        return _temp_labels

    def _get_cluster_info(self, order, step):
        '''Convenience function getting properties of the chosen cluster

        Parameters
        ----------
        order: str or list
            See fit()

        step: int
            The current iteration of the SSKMeans adjustment.

        Returns
        -------
        _coords : coordinates of the points in the chosen cluster

        _weights : weights of those points

        _label : label of the chosen cluster

        _score_arr : array of candidate points and their squared distances
            to the cluster centroid, or None when the order does not need it

        _centroid : coordinates of the chosen cluster's center
        '''

        _order = self._get_order(order, step)

        # Find the coordinates, weights, and label of the cluster
        # determined by order.
        _coords, _weights, _label = self._find_cluster(_order)
        _centroid = self.cluster_centers_[_label]

        if _order == 'l':
            _score_arr = None
        elif _order == 's':
            _score_arr = self._score_other_points(_coords, _centroid)
        elif _order == 'min_v':
            if np.sum(_weights) > self._ideal_cluster_weight:
                _score_arr = None
            else:
                _score_arr = self._score_other_points(_coords, _centroid)
        elif _order == 'max_v':
            if np.sum(_weights) > self._ideal_cluster_weight:
                _score_arr = None
            else:
                _score_arr = self._score_other_points(_coords, _centroid)

        return (_coords, _weights, _label, _score_arr, _centroid)

    def _get_order(self, order, step):
        '''Convenience function for setting the clustering order
        '''

        try:
            _order = self.ORDER_DICT[order]
        except KeyError:
            raise KeyError('order must be one of: {}'.format(
                self.ORDER_DICT.keys()))
        except TypeError:
            try:
                _order = self.ORDER_DICT[order[step]]
            except KeyError:
                raise KeyError('order must be one of: {}'.format(
                    self.ORDER_DICT.keys()))
            except TypeError:
                raise TypeError('Order must be a string or list of strings.')

        return _order

    def _find_cluster(self, order):
        '''Finds the unfinalized cluster according to self.order
        '''

        # Find the coordinates, weights, and current assigned
        # label of each point which doesn't have a finalized
        # cluster
        _X = self.X[np.logical_not(self._labels_finalized)]
        _temp_weights = self.weights[np.logical_not(self._labels_finalized)]
        _labels = self.final_labels[np.logical_not(self._labels_finalized)]

        # Get the weights of the clusters that aren't finalized
        # if order is 'l' or 's'
        if (order == 'l' or order == 's'):
            _label_weights = {
                label: np.sum(_temp_weights[_labels == label])
                for label in self._clusters_unfinalized
            }
        # Get the distance of weights of the clusters from the
        # ideal cluster weight for the clusters that aren't
        # finalized if order is 'min_v' or 'max_v'
        elif (order == 'min_v' or order == 'max_v'):
            _label_weights = {
                label: abs(
                    np.sum(_temp_weights[_labels == label]) -
                    self._ideal_cluster_weight)
                for label in self._clusters_unfinalized
            }

        if order == 'l':
            # Find the label of the largest unfinalized cluster
            _cluster = max(_label_weights, key=_label_weights.get)
        elif order == 's':
            # Find the label of the smallest unfinalized cluster
            _cluster = min(_label_weights, key=_label_weights.get)
        elif order == 'min_v':
            # Find the label of the cluster closest to the optimal size
            _cluster = min(_label_weights, key=_label_weights.get)
        elif order == 'max_v':
            # Find the label of the cluster farthest from the optimal size
            _cluster = max(_label_weights, key=_label_weights.get)

        _coords = _X[_labels == _cluster]
        _weights = _temp_weights[_labels == _cluster]

        return (_coords, _weights, _cluster)

    def _adjust_cluster(self, coords, weights, label, score_arr, order, step):
        '''Performs the adjustment of the given cluster
        '''

        _order = self._get_order(order, step)

        # Check if the cluster has the right total weight by comparing it
        # to the ideal cluster weight and the specified weight tolerance.
        # If it doesn't, find the point farthest from the center of mass,
        # and reassign it. Repeat until satisfied.
        while (abs(np.sum(weights) - self._ideal_cluster_weight) /
               self._ideal_cluster_weight > self.weight_tol):

            if _order == 'l':
                coords, weights = self._reassign_farthest(
                    coords, weights, label)
            elif _order == 's':
                coords, weights, score_arr = (self._reassign_closest(
                    coords, weights, label, score_arr))
            elif _order == 'min_v':
                if np.sum(weights) > self._ideal_cluster_weight:
                    coords, weights = self._reassign_farthest(
                        coords, weights, label)
                else:
                    coords, weights, score_arr = (self._reassign_closest(
                        coords, weights, label, score_arr))
            elif _order == 'max_v':
                if np.sum(weights) > self._ideal_cluster_weight:
                    coords, weights = self._reassign_farthest(
                        coords, weights, label)
                else:
                    coords, weights, score_arr = (self._reassign_closest(
                        coords, weights, label, score_arr))

        return (coords)

    def _score_centroids_one_point(self, coords, label):
        '''Gets distance between point and remaining clusters
        '''

        point_dict = {
            _label: self._calculate_distance(coords,
                                             self.cluster_centers_[_label])
            for _label in self._clusters_unfinalized if _label != label
        }

        return point_dict

    def _reassign_farthest(self, coords, weights, label):
        '''Sends point farthest from center to closest cluster
        '''

        # Calculate the cluster's current center of mass
        center_of_mass = (np.sum(weights[:, None] * coords, axis=0) /
                          np.sum(weights))[None, :]

        # Reassign this cluster's center to the center of mass
        self.cluster_centers_[label] = center_of_mass

        # Find the squared distances of the points to the center
        # of mass of the cluster, and find the point farthest
        _sqrd_dists = np.sum((coords - center_of_mass)**2, axis=1)
        _farthest_point_ind = np.argmax(_sqrd_dists)
        _farthest_point_dist = _sqrd_dists[_farthest_point_ind]
        _farthest_point = coords[_sqrd_dists == _farthest_point_dist]

        # Score the unfinalized centroids that are not from
        # the current cluster on the farthest point, then
        # find the best cluster for that point
        _centroid_scores = self._score_centroids_one_point(
            _farthest_point, label)
        _best_cluster = min(_centroid_scores, key=_centroid_scores.get)

        # Reassign the label in the final_label array
        _full_point_mask = np.isclose(self.X, _farthest_point).all(1)
        self.final_labels[_full_point_mask] = _best_cluster

        # Remove the farthest point from the current coordinate
        # and weight array
        _point_mask = (_sqrd_dists != _farthest_point_dist)
        _coords = coords[_point_mask]
        _weights = weights[_point_mask]

        return (_coords, _weights)

    def _score_other_points(self, coords, centroid):
        '''Finds the distance between points in coords and centroid
        '''

        # Make a mask for each point in coords that tells us
        # if it's in X
        _coords_masks = [np.isclose(self.X, coord).all(1) for coord in coords]

        # Sum all the masks, and negate the result to obtain
        # a mask for only those points not in coords, then
        # use it to obtain an array of only those points
        _other_points_mask = np.logical_not(
            np.sum(_coords_masks, axis=0).astype(bool))
        _other_points = self.X[np.logical_and(
            np.logical_not(self._labels_finalized), _other_points_mask)]

        # Make an array where the first column is the point
        # being considered, and the 2nd column is the squared
        # distance to the specified centroid
        score_arr = np.array([[point, np.sum((point - centroid)**2)]
                              for point in _other_points])

        return score_arr

    def _reassign_closest(self, coords, weights, label, score_arr):
        '''Reassigns closest point to current cluster (label)
        '''

        # Select the closest point
        _best_score_row = np.argmin(score_arr[:, 1])
        _best_point = score_arr[_best_score_row][0]

        # Remove the closest point from score_arr to pass it out
        # of the function
        _score_arr = np.delete(score_arr, (_best_score_row), axis=0)

        # Reassign the label in the final_label array, and get
        # the weight of the best point
        _best_point_mask = np.isclose(self.X, _best_point).all(1)
        self._labels_finalized[_best_point_mask] = True
        self.final_labels[_best_point_mask] = label
        _best_weight = self.weights[_best_point_mask]

        # Add the best point to the _coords and _weights arrays
        _coords = np.vstack((coords, _best_point))
        _weights = np.append(weights, _best_weight)

        return (_coords, _weights, _score_arr)

    def _update_centroids(self):
        '''Recalculates cluster centroids
        '''

        # Create a dictionary of cluster centers by label
        self.cluster_centers_ = {
            label: np.mean(self.X[self.final_labels == label], axis=0)
            for label in self._unique_labels
        }

        return None

    def _calculate_distance(self, coord_arr, point=None):
        '''Distance between points in coord_arr and point
        '''

        if coord_arr.ndim == 1:
            _axis = 0
        else:
            _axis = 1

        if point is None:
            _point = 0
        else:
            _point = point

        if self.metric == 'l2':
            _distances = np.sum((coord_arr - _point)**2, axis=_axis)
        elif self.metric == 'l1':
            _distances = np.sum(np.abs(coord_arr - _point), axis=_axis)
        elif self.metric == 'l_inf':
            _distances = np.max(np.abs(coord_arr - _point), axis=_axis)

        return _distances
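
# A minimal usage sketch for SameSizeKMeans, assuming the class and its
# dependencies (KMeans, numpy, deepcopy) are importable in the current
# session; the point count, cluster count and tolerance are illustrative.
import numpy as np

rng = np.random.RandomState(42)
X = rng.rand(200, 2)                   # 2-D points, as expected by fit()
weights = np.ones(X.shape[0])          # equal weight per point

model = SameSizeKMeans(n_clusters=4, save_labels=True)
model.fit(X, weights=weights, weight_tol=0.05, order='l')

# Every point ends up with a final label, and the per-cluster total
# weights are driven to within weight_tol of the ideal (200 / 4 = 50).
labels = model.final_labels
for label, center in model.cluster_centers_.items():
    print(label, center, np.sum(weights[labels == label]))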