Example #1
def cluster(train_latents, train_labels, test_latents, test_labels):
    num_classes = np.shape(train_labels)[-1]
    labels_hot = np.argmax(test_labels, axis=-1)
    train_latents = np.reshape(train_latents,
                               newshape=[train_latents.shape[0], -1])
    test_latents = np.reshape(test_latents,
                              newshape=[test_latents.shape[0], -1])
    kmeans = KMeans(init='random',
                    n_clusters=num_classes,
                    random_state=0,
                    max_iter=1000,
                    n_init=FLAGS.n_init,
                    n_jobs=FLAGS.n_jobs)
    kmeans.fit(train_latents)
    print(kmeans.cluster_centers_)
    print('Train/Test k-means objective = %.4f / %.4f' %
          (-kmeans.score(train_latents), -kmeans.score(test_latents)))
    print('Train/Test accuracy %.4f / %.3f' %
          (error(np.argmax(train_labels, axis=-1),
                 kmeans.predict(train_latents),
                 k=num_classes),
           error(np.argmax(test_labels, axis=-1),
                 kmeans.predict(test_latents),
                 k=num_classes)))
    return error(labels_hot, kmeans.predict(test_latents), k=num_classes)
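The helper error is not defined in this snippet; a minimal sketch of what it might look like, assuming a clustering-error metric that matches cluster IDs to class labels with the Hungarian method and returns the mislabeled fraction:

import numpy as np
from scipy.optimize import linear_sum_assignment

def error(y_true, y_pred, k):
    # cost[i, j] = number of points with true label i assigned to cluster j
    cost = np.zeros((k, k), dtype=np.int64)
    for t, p in zip(y_true, y_pred):
        cost[t, p] += 1
    # a maximum-weight matching of clusters to labels minimizes the error
    rows, cols = linear_sum_assignment(-cost)
    return 1.0 - cost[rows, cols].sum() / len(y_true)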
Example #2
    def k_means_analyze(self, rng=[2,12], init='k-means++', n_init=10,
                        max_iter=300, tol=0.0001, precompute_distances='deprecated',
                        verbose=0, random_state=None, copy_x=True, n_jobs='deprecated',
                        algorithm='auto'):
        km_scores = []
        km_silhouette = []
        db_score = []
        for i in range(rng[0], rng[1]):
            km = KMeans(n_clusters=i, init=init, n_init=n_init,
                        max_iter=max_iter, tol=tol, precompute_distances=precompute_distances,
                        verbose=verbose, random_state=random_state, copy_x=copy_x, n_jobs=n_jobs,
                        algorithm=algorithm).fit(self.__data)
            preds = km.predict(self.__data)

            print("Score for number of cluster(s) {}: {}".format(i,km.score(self.__data)))
            km_scores.append(-km.score(self.__data))

            silhouette = silhouette_score(self.__data,preds)
            km_silhouette.append(silhouette)
            print("Silhouette score for number of cluster(s) {}: {}".format(i,silhouette))

            db = davies_bouldin_score(self.__data,preds)
            db_score.append(db)
            print("Davies Bouldin score for number of cluster(s) {}: {}".format(i,db))

            print("-"*100)

        return km_scores, km_silhouette, db_score
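A usage sketch for the method above, with a hypothetical analyzer instance wrapping the feature matrix; it plots the three returned curves so a good k can be picked by eye:

import matplotlib.pyplot as plt

km_scores, km_silhouette, db_score = analyzer.k_means_analyze(rng=[2, 12])
ks = list(range(2, 12))
for curve, name in [(km_scores, 'inertia'), (km_silhouette, 'silhouette'), (db_score, 'Davies-Bouldin')]:
    plt.figure()
    plt.plot(ks, curve, marker='o')
    plt.xlabel('k')
    plt.title(name)
plt.show()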
Example #3
    def kmeans(self):  # Model 7
        print '\n### Running K-Means Algorithm\n'

        # Create K-Means classifier object model
        modelFile = '/tmp/model_7.joblib.pkl'
        if (os.path.isfile(modelFile)):
            model = self.loadmodel(7)
        else:
            model = KMeans(n_clusters=8, random_state=0)

        # Train the model using the training sets and check score
        model.fit(self.x_trn)
        print '\tTraining score\t\t', model.score(self.x_trn, self.y_trn)

        # Calculate algorithm scores
        predicted = model.predict(self.x_tst)
        avg_prec = average_precision_score(numpy.array(self.y_tst), predicted)
        recall = recall_score(self.y_tst, predicted, average='micro')

        # Print the algorithm scores
        print '\tTesting score\t\t', model.score(self.x_tst, predicted)
        print '\tAccuracy score\t\t', accuracy_score(self.y_tst, predicted)
        print '\tPrecision score:\t' + str(avg_prec)
        print '\tRecall score\t\t' + str(recall)

        self.savemodel(model, 7)  # Save the training model
Example #4
def score_k(data, krange):
    """Generates the score and distoration for elbow plots
    """
    elbow_scores = []
    for k in krange:
        print("Evaluatng {0} clusters".format(k))
        current_model = KMeans(n_clusters=k)
        current_model.fit(data)
        elbow_scores.append(current_model.score(data))  # score() is the negated inertia
    return elbow_scores
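A usage sketch for score_k on synthetic data; since KMeans.score returns the negated inertia, plotting the absolute values gives a conventional elbow curve:

from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
elbow = score_k(X, range(1, 10))
plt.plot(range(1, 10), [abs(s) for s in elbow], marker='o')
plt.xlabel('k')
plt.ylabel('|score| (inertia)')
plt.show()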
Example #5
    def plot_for_offset(Map):
        
        if False:
            #################  K Means  #################
            # Set number of clusters
            n_clusters = 7

            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            kmeans.fit(Map)
            idx = kmeans.fit_predict(Map)
            kmeans.score(Map)
        else:
            ########### Categories by word count per category ############
            idx = word_categories
        
        #################  Plot  #########################

        fig, ax = plt.subplots(figsize=(12,12))
        fig.tight_layout()
        
        # Set range
        ax.set_xlim(-15000,15000)
        ax.set_ylim(-15000,15000)
        
        # List of colors
        colors = ['green', 'orange', 'red', 'blue', 'yellow', 'brown', 'violet', 'grey']

        # Plot all words
        for word, idx_, vec, label in zip(words, idx, Map, word_categories_names):
            x = vec[0]
            y = vec[1]
            ms = scaler.transform(count_total[word])
            ax.plot(x, y, marker='o', ms=ms, c=colors[idx_], alpha=0.7, linestyle='none')
            plt.annotate(word, (x, y), ha='center', va='center', size=10)

        # Create legend
        l = []
        for i in range(len(likelihood_df.columns)):
            l.append(mpatches.Patch(color=colors[i], label=likelihood_df.columns[i]))
        ax.legend(handles=l)

        # Return the plot as an image array
        io_buf = io.BytesIO()
        fig.savefig(io_buf, format='raw')
        io_buf.seek(0)
        image = np.reshape(np.frombuffer(io_buf.getvalue(), dtype=np.uint8),
                        newshape=(int(fig.bbox.bounds[3]), int(fig.bbox.bounds[2]), -1))
        io_buf.close()
        plt.close(fig)  # close only after the figure has been rendered into the buffer

        return image
Example #6
def compute_print_scores(normal_users, queue, Ks):

    K_GMM_n, K_KMeans_n, K_GMM_s, K_KMeans_s = Ks

    print 'novelty score GMM'
    B = GMM(covariance_type='full', n_components = 1)
    B.fit(queue)
    x = [B.score([i]).mean() for i in queue]
    print get_score_last_item(x, K_GMM_n)

    print 'novelty score OneClassSVM'
    x = anom_one_class(queue, [queue[-1]])
    print x[-1]

    print 'novelty score LSA'
    anomalymodel = lsanomaly.LSAnomaly()
    X = np.array(queue)
    anomalymodel.fit(X)
    print anomalymodel.predict(np.array([queue[-1]]))

    print 'novelty score degree K_means'
    K = KMeans(n_clusters=1)
    K.fit(queue)
    x = [K.score([i]) for i in queue]
    print get_score_last_item(x, K_KMeans_n)

    normal_and_new = normal_users + [queue[-1]]

    print 'degree of belonging to known class GMM'
    B = GMM(covariance_type='full', n_components = 1)
    B.fit(normal_users)
    x = [B.score([i]).mean() for i in normal_and_new]
    print get_score_last_item(x, K_GMM_s)

    print 'degree of belonging to known class OneClassSVM'
    x = anom_one_class(normal_users, [queue[-1]])
    print x[-1]

    print 'degree of belonging to known class LSA'
    anomalymodel = lsanomaly.LSAnomaly()
    X = np.array(normal_users)
    anomalymodel.fit(X)
    print anomalymodel.predict(np.array([queue[-1]]))

    print 'degree of belonging to known class K_means'
    K = KMeans(n_clusters=1)
    K.fit(normal_users)
    x = [K.score([i]) for i in normal_and_new]
    print get_score_last_item(x, K_KMeans_s)
Example #7
    def __buildModel(k_, vectors):
        # Cluster again and group the results; KMeans does not support cosine distance
        kmeans = KMeans(n_clusters=k_, n_init=20, max_iter=500).fit(vectors)
        norm_factor = -vectors.shape[1]  # normalize by dictionary width
        groups = pd.DataFrame({
            'C':
            kmeans.labels_,
            'S': [kmeans.score([v]) / norm_factor for v in vectors]
        }).groupby('C')
        percents = groups.size() / len(vectors)  # fraction of all clustered vectors that fall in this cluster
        cfg_q = G.cfg.getfloat('Classifier', 'Quantile')
        quantiles = np.array([
            groups.get_group(i)['S'].quantile(cfg_q, interpolation='higher')
            for i in range(k_)
        ])
        boundaries = groups['S'].agg('max').values  # distance of the farthest point in each cluster

        quantiles2 = quantiles * 2
        boundaries[boundaries > quantiles2] = quantiles2[
            boundaries > quantiles2]  # clamp boundaries that are too far out
        boundaries[boundaries < 1e-100] = 1e-100  # lift zero boundaries
        quantiles = boundaries - quantiles
        quantiles[quantiles < 1e-100] = 1e-100  # avoid 0/0

        G.log.info(
            'Model(k=%d) built. inertia=%e, max proportion=%.2f%%, max quantile=%e, max border=%e',
            k_, kmeans.inertia_,
            max(percents) * 100, max(quantiles), max(boundaries))
        return kmeans, percents, boundaries, quantiles
Example #8
def kmeans(vector: np.ndarray, n: int):
    k = KMeans(n_clusters=n, init='k-means++', n_init=6)
    # fit once: fit_transform both fits the model and returns each
    # sample's distance to every cluster center
    cluster_coordinate = k.fit_transform(vector)
    score = k.score(vector)

    return cluster_coordinate, k.labels_, k.inertia_, score
Example #9
def solve(dataId, usingExist=True):

    dataId = str(dataId)
    dataPath = './data/' + dataId + '.txt'
    binPath = './out/' + dataId + '.bin'
    outputPath = "out/ans" + dataId + ".txt"

    if not os.path.exists(binPath) or not usingExist:
        word2vec.word2vec(dataPath, binPath, size=100, verbose=True)

    # load the trained model from binPath with word2vec
    model = word2vec.load(binPath)

    # open the output file
    output = codecs.open(outputPath, "w", "utf-8")

    ClustersNumber = 10
    WordNumber = len(model.vectors)

    # 使用Kmeans算法
    kmeans = KMeans(n_clusters=ClustersNumber,
                    random_state=0).fit(model.vectors)

    # cluster index of each word ID, in the range [0, ClustersNumber)
    label = kmeans.labels_
    # score of each word, i.e. the negative of its distance to the cluster center
    scores = []
    for i in xrange(WordNumber):
        scores.append(kmeans.score([model.vectors[i]]))

    # put word IDs belonging to the same cluster into the same list
    allCluster = []
    for i in xrange(ClustersNumber):
        allCluster.append([])
    for i in xrange(len(label)):
        allCluster[label[i]].append(i)

    # order two word IDs by comparing their entries in the scores array
    def comparator(a, b):

        vala = scores[a]
        valb = scores[b]

        if vala > valb: return 1
        elif vala == valb: return 0
        else: return -1

    # process each cluster separately
    for clusterId in xrange(len(allCluster)):
        output.write("-----------------------------------cluster " +
                     str(clusterId) + ":\n")

        # sort by score, from high to low
        allCluster[clusterId].sort(cmp=comparator, reverse=True)

        # take the top 30
        for x in allCluster[clusterId][:30]:
            # write the negative of the score, i.e. the distance
            output.write(model.vocab[x] + "  " + str(-scores[x]) + "\n")
    print '\n'
Example #10
def run():
    cluster_centers = load_prediction()
    test_data = load_test_data()
    k = KMeans(n_clusters=200)
    k.cluster_centers_ = cluster_centers
    score = k.score(test_data)
    print("Score: %f" % (score / len(test_data) * -1))
Example #11
def init_cluster(word_vectors):
    print(word_vectors.vectors)
    model = KMeans(n_clusters=3, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors)
    labels = model.labels_
    silhouette_score = metrics.silhouette_score(word_vectors.vectors, labels, metric='euclidean')
    print("Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):")
    print(model.score(word_vectors.vectors))
    print("Silhouette_score: ")
    print(silhouette_score)
    print(word_vectors.similar_by_vector(model.cluster_centers_[0], topn=50, restrict_vocab=None))
    print(word_vectors.similar_by_vector(model.cluster_centers_[1], topn=50, restrict_vocab=None))
    print(word_vectors.similar_by_vector(model.cluster_centers_[2], topn=50, restrict_vocab=None))
    y_kmeans = model.predict(word_vectors.vectors)
    plt.scatter(word_vectors.vectors[:, 0], word_vectors.vectors[:, 1], c=y_kmeans, s=50, cmap='viridis')
    centers = model.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
    plt.show()
    words = pd.DataFrame(word_vectors.vocab.keys())
    words.columns = ['words']
    words = words[words['words'].str.len() > 3].reset_index(drop=True)
    words['vectors'] = words['words'].apply(lambda x: word_vectors.wv[f'{x}'])
    words['cluster'] = words['vectors'].apply(lambda x: model.predict([np.array(x)]))
    words['cluster'] = words['cluster'].apply(lambda x: x[0])
    words['cluster_value'] = [1 if i == 0 else -1 if i == 1 else 0 for i in words['cluster']]
    words['closeness_score'] = words.apply(lambda x: 1 / (model.transform([x['vectors']]).min()), axis=1)
    words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']
    words.to_csv('metrics_results\\predictive_scores_{}.csv'.format(time_stamp), index=False)
    return words
Example #12
def runKMeans(cv_img, num_colors, init):

    imgdata = getimgdatapts(cv_img[-100:, :, :])  # FIX ME: arbitrary cut off
    kmc = KMeans(n_clusters=num_colors, max_iter=25, n_init=10, init=init)
    t1 = time.time()
    kmc.fit_predict(imgdata)
    t2 = time.time()
    print("fit time: %f" % (t2 - t1))
    trained_centers = kmc.cluster_centers_
    # print trained_centers
    labels = kmc.labels_
    labelcount = Counter()
    t1 = time.time()
    # IPython.embed()
    # for pixel in labels:
    # 	labelcount[pixel] += 1
    for i in np.arange(num_colors):
        labelcount[i] = np.sum(labels == i)

    t2 = time.time()
    print("counting labels time: %f" % (t2 - t1))
    # print labelcount
    # IPython.embed()
    score = kmc.score(imgdata)
    return trained_centers, labelcount, score
Example #13
def determineBestK(data, minRange=2, maxRange=-1, step=-1):
    if minRange < 0:
        minRange = 2
    if maxRange < 0:
        maxRange = len(data['names'])
    if maxRange < minRange:
        maxRange = len(data['names'])
        minRange = 2
    yVals = []
    if step < 0:
        step = max(1, int(np.floor((maxRange - minRange) / 5)))  # guard against a zero step
    for i in range(minRange, maxRange + 1, step):
        kmeans = KMeans(n_init=10,
                        max_iter=10000,
                        tol=1e-8,
                        verbose=0,
                        n_clusters=i,
                        algorithm="elkan").fit(data['data'])

        avgError = abs(kmeans.score(data['data'])) / len(data['data'])

        yVals.append(avgError)
    plt.title('Best K-mean')
    plt.plot(list(range(minRange, maxRange + 1, step)), yVals)
    plt.axis([minRange - .5, maxRange + .5, 0, yVals[0] + .5])
    plt.show()
Example #14
def train(weight, true_k=10):
    print('[INFO] traning with K=', true_k)

    clf = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1)
    clf.fit(weight)
    result = list(clf.predict(weight))
    return result, -clf.score(weight)
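A usage sketch, assuming weight is a TF-IDF matrix; the second return value is the inertia (the negated score):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['cats purr softly', 'dogs bark loudly', 'cats chase dogs']
weight = TfidfVectorizer().fit_transform(docs)
labels, inertia = train(weight, true_k=2)
print(labels, inertia)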
Example #15
def cluster(plot):
    with open('GAIA_DATA_small.csv', 'r') as csv_file:
        file_reader = csv.reader(csv_file,
                                 delimiter=',',
                                 quotechar='|',
                                 quoting=csv.QUOTE_MINIMAL,
                                 lineterminator='\n')
        X = []
        y = []
        for row in file_reader:
            temp_x = row[1]
            temp_y = row[2]
            if temp_x != '' and temp_y != '':
                X.append(temp_x)
                y.append(temp_y)
        X = np.array(X, dtype=float).reshape(-1, 1)
        y = np.array(y, dtype=float).reshape(-1, 1)
        data = np.hstack((X, y))
        train, test = train_test_split(X, shuffle=False)
        for i in range(10, 50, 5):
            kmeans = KMeans(n_clusters=i).fit(train)
            print(kmeans.score(test))
        if plot:
            # Plotting the data.  No obvious real clusters
            plt.scatter(X, y, alpha=0.1, marker='.')
            plt.show()
Example #16
def clusterize(x, n_clusters):
    X = x.reshape((len(x), 1))
    clusterer = KMeans(n_clusters=n_clusters)
    labels = clusterer.fit_predict(X)
    score = silhouette_score(X, labels)
    quality = clusterer.score(X)
    return Clustering(n_clusters, labels, quality, score)
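Clustering is not defined in this snippet; a minimal sketch, assuming it is a plain record type whose fields mirror the constructor call above:

from collections import namedtuple

Clustering = namedtuple('Clustering', ['n_clusters', 'labels', 'quality', 'score'])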
Example #17
def KMEANS(data, targets, dataset, n_classes): 
  print('APPLY KMEANS...')
  X_train, Y_train = data[0], targets[0]
  X_val, Y_val = data[1], targets[1]
  X_test, Y_test = data[2], targets[2]

  kmeans = KMeans(n_clusters=n_classes, random_state=0).fit(X_train)
  kmeans.labels_
  cluster_centers = kmeans.cluster_centers_
  print('Cluster Centers', cluster_centers.shape)

  prediction = kmeans.predict(X_test)
  print('Prediction', prediction.shape)
  scores = kmeans.score(X_test)

  acc = accuracy_score(y_true=Y_test, y_pred=prediction) 
  prec = sklearn.metrics.precision_score(y_true=Y_test, y_pred=prediction,  average='weighted' )
  rec = sklearn.metrics.recall_score(y_true=Y_test, y_pred=prediction,  average='weighted')
  class_report = sklearn.metrics.classification_report(y_true=Y_test, y_pred=prediction, output_dict=True)

  sens = class_report['1']['recall']
  spec = class_report['0']['recall']

  print('Test Accuracy, Precision, Recall', acc, prec, rec)
  print()

  return acc, prec, rec, sens, spec
Example #18
def plot_kmean_score_vs_k(x_pca, num_iterations=20):
    '''
    INPUT:
    x_pca - a dataframe pca data
    num_iterations - the number of iterations to fit KMeans clusters over
    
    OUTPUT:
    a chart showing KMeans score vs K
    '''
    # Over a number of different cluster counts...
    # run k-means clustering on the data and...
    # compute the average within-cluster distances.
    kmean_scores = []
    num_clusters = []
    for i in range(num_iterations):
        kmeans_i = KMeans(i + 1)
        model_i = kmeans_i.fit(x_pca)
        score_i = np.abs(kmeans_i.score(x_pca))
        kmean_scores.append(score_i)
        num_clusters.append(i + 1)

    # Investigate the change in within-cluster distance across number of clusters.
    # HINT: Use matplotlib's plot function to visualize this relationship.
    # now plot the scores
    plt.plot(num_clusters, kmean_scores, linestyle='--', marker='o', color='b')
    plt.xlabel('K')
    plt.ylabel('KMean Score')
    plt.title('KMean Score vs. K')
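A usage sketch on synthetic data reduced with PCA (the make_blobs data is an assumption for illustration):

import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA

X, _ = make_blobs(n_samples=500, centers=6, random_state=1)
x_pca = PCA(n_components=2).fit_transform(X)
plot_kmean_score_vs_k(x_pca, num_iterations=15)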
Example #19
    def classify(self):
        kmeans = KMeans(n_clusters=8, init='k-means++', n_init=10, max_iter=300, tol=0.0001,
                        precompute_distances='auto',
                        verbose=0, random_state=None, copy_x=True, n_jobs=1, algorithm='auto')
        kmeans.fit(self.finalList, self.classList)
        scoreKMeans = kmeans.score(self.testFinalList, self.testClassList)
        print("K-Means: ", scoreKMeans)
Example #20
def fit_plot_kmean_model(n_clusters, x):
    plt.xticks([])
    plt.yticks([])
    markers = ['o', '^', '*', 's']
    colors = ['r', 'b', 'y', 'k']

    clf = KMeans(n_clusters=n_clusters)
    clf.fit_predict(x)
    score = clf.score(x)  # the larger the absolute value, the worse the fit
    plt.title('k={},score={:.2f}'.format(n_clusters, score))

    labels = clf.labels_
    centers = clf.cluster_centers_
    for i in range(n_clusters):
        cluster = x[labels == i]
        plt.scatter(cluster[:, 0],
                    cluster[:, 1],
                    s=30,
                    c=colors[i],
                    marker=markers[i])
    plt.scatter(centers[:, 0],
                centers[:, 1],
                s=200,
                c='white',
                marker='o',
                alpha=0.9)
    for i, c in enumerate(centers):
        plt.scatter(c[0], c[1], s=50, marker='$%d$' % i, c=colors[i])
Example #21
def simpleKMeans(xlist, n_cluster=2, random_state=0, algorithm='auto'):
    """
    
    output:
        centroids: centroids acquired from clustering
        labels: clustering result of input array
        score: sum of distances from centroids to all the points in their cluster
        time: processing time of current k-means clustering
        algorithm:
            'full' for classical EM-style algorithm
            'elkan' is more efficient by using the triangle inequality, but
                not suitable for sparse data.
            'auto' will choose elkan for dense data and full for sparse data
        
    """
    if len(xlist) == 0:  #skip empty list
        return [], [], 1, 0

    starttime = time.time()
    kmeans = KMeans(n_clusters=n_cluster,
                    random_state=random_state,
                    algorithm=algorithm).fit(xlist)
    processtime = time.time() - starttime

    labels = kmeans.labels_
    centroids = kmeans.cluster_centers_
    score = -kmeans.score(xlist)

    return centroids, labels, score, processtime
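A usage sketch on random points:

import numpy as np

points = np.random.rand(50, 2)
centroids, labels, score, elapsed = simpleKMeans(points, n_cluster=3)
print('inertia %.3f in %.3fs' % (score, elapsed))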
Example #22
def explore(data):
  errList = []
  for i in range(2, int(len(data)**0.5)):
    km = KMeans(n_clusters=i)
    km.fit(data)
    err = abs(km.score(data))
    errList.append(err)
  return errList
Example #23
def diarization2(filename, n_speakers, winSize=WINSIZE):
    #mfcc
    print("---diarization2---")
    sr, signal = audioIo.read_audio_file(filename)
    mid_window = 1.28   # seconds
    mid_step = 0.08     # seconds
    short_window = 0.032  # seconds
    short_step = short_window * 0.5

    feature_all, _, _ = audioFeature.mid_feature_extraction(
        signal, sr, mid_window * sr, mid_step * sr, short_window * sr,
        short_step * sr)
    print("feature finally: ", feature_all.shape)

    #kmeans
    kmeans = KMeans(n_clusters=n_speakers, init='k-means++', random_state=0)
    kmeans.fit(feature_all.T)
    cls = kmeans.labels_
    segs, flags = audioMath.labels_to_segments(cls, mid_step)
    print("kmeans result:")
    for s in range(segs.shape[0]):
        print("{:.3f} {:.3f} {}".format(segs[s, 0], segs[s, 1], flags[s]))
    print("标签", len(cls), cls)
    #print("质心",kmeans.cluster_centers_)
    print("SSE", kmeans.inertia_)
    print("迭代次数", kmeans.n_iter_)
    print("分值", kmeans.score(feature_all.T))
Example #24
    def __buildClusterModel(self, k_, vectors):
        # Cluster again and group the results; KMeans does not support cosine distance
        kmeans = KMeans(n_clusters=k_, n_init=20, max_iter=500).fit(vectors)
        norm_factor = -vectors.shape[1]  # normalize by dictionary width so different models stay comparable
        groups = DataFrame({
            'C':
            kmeans.labels_,
            'S': [kmeans.score([v]) / norm_factor for v in vectors]
        }).groupby('C')
        alias = ['Type' + str(i) for i in range(k_)]  # cluster aliases, Type<i> by default, may be renamed manually
        proportions = groups.size() / len(vectors)  # fraction of all clustered vectors that fall in this cluster
        quantiles = np.array([
            groups.get_group(i)['S'].quantile(self.__Quantile,
                                              interpolation='higher')
            for i in range(k_)
        ])
        boundaries = groups['S'].agg('max').values - quantiles  # distance from the farthest point in each cluster to the quantile point
        for i in range(k_):
            if boundaries[i] > quantiles[i]:  # clamp boundaries that are too far out
                boundaries[i] = quantiles[i]
            elif boundaries[i] == 0:  # avoid 0/0
                boundaries[i] = 1e-100

        G.log.info(
            'Model(k=%d) built. inertia=%.3f, max proportion=%.2f%%, max quantile=%.3f, max border=%.3f',
            k_, kmeans.inertia_,
            max(proportions) * 100, max(quantiles), max(boundaries))
        return kmeans, alias, proportions, boundaries, quantiles
Example #25
    def clustering(self, data=None, n_clusters=None, apply_cluster_name=True):

        logger.info({"message": "Clustering phrases.",
                     "n_clusters": n_clusters, "apply_cluster_name": apply_cluster_name})

        if data is not None:
            self.data = data
        elif self.data_processed is not None:
            data = self.data_processed
        else:
            data = self.data

        if isinstance(n_clusters, int):
            self.n_clusters = n_clusters
        else:
            n_clusters = self.n_clusters

        X = apply_tfidf(data)

        # Initialize the clusterer with n_clusters value and a random generator for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=SEED)
        self.labels = clusterer.fit_predict(X)

        if apply_cluster_name:
            self.get_clusters_name(clean_texts=True)

        self.kmeans_score = clusterer.score(X)
        self.silhouette_score = silhouette_score(X, self.labels)

        return {"scores": {"kmeans_score": self.kmeans_score, "silhouette_score": self.silhouette_score}, "data": self.data, "labels": self.labels}
Example #26
def explore_k(svd_trans, k_range):
    '''
    Explores various values of k in KMeans

    Args:
        svd_trans: dense array with lsi transformed data
        k_range: the range of k-values to explore
    Returns:
        scores: list of intertia scores for each k value
    '''

    scores = []
    # spherical kmeans, so normalize
    normalizer = Normalizer()
    norm_data = normalizer.fit_transform(svd_trans)
    for k in k_range:
        km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1,
                    verbose=2)
        km.fit(norm_data)
        scores.append(-1*km.score(norm_data))
    plt.plot(k_range, scores)
    plt.xlabel('# of clusters')
    plt.ylabel('Inertia')
    sns.despine(offset=5, trim=True)
    return scores
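With the loop fixed to iterate k_range, a call might look like this (svd_trans being any dense LSI-transformed array):

k_range = np.arange(2, 15)
scores = explore_k(svd_trans, k_range)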
Example #27
def k_means_clustering(X, k=-1):
    """Perform k means algorithm."""

    if k == -1:
        begin = 5
        end = 1001
    else:
        begin = k
        end = k + 1

    for k in range(begin, end, 5):
        print(str(k) + " clusters started for k-means")
        kmeans = KMeans(n_clusters=k).fit(X)
        labels = kmeans.labels_

        clusters = []
        for i in range(k):
            row = []
            for j in range(len(labels)):
                if i == labels[j]:
                    row.append(j)
            clusters.append(row)

        results_file = open('results/k_means_' + str(k) + '.pickle', 'wb')
        pickle.dump(clusters, results_file)
        results_file.close()

        print(str(k) + " clusters completed. Score: " + str(kmeans.score(X)))
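The pickled result can be read back later, e.g. for k=5:

import pickle

with open('results/k_means_5.pickle', 'rb') as f:
    clusters = pickle.load(f)  # list of k lists of sample indices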
Example #28
    def run(self):
        df_pca = self.df_pca
        self.X_pca = df_pca.values[:, :self.settings.NUM_PRESSURE_LEVELS * 2]

        self.kmeans_objs = {}
        self.cluster_cluster_dists = {}
        self.df_labels = pd.DataFrame(index=self.df_pca.index)
        scores = []

        for n_clusters in self.settings.CLUSTERS:
            if n_clusters == self.settings.DETAILED_CLUSTER:
                # Only calculate all seeds for the detailed cluster.
                seeds = self.settings.RANDOM_SEEDS
            else:
                seeds = self.settings.RANDOM_SEEDS[:1]

            logger.info('Running for n_clusters = {}'.format(n_clusters))
            for seed in seeds:
                logger.debug('seed: {}'.format(seed))
                kmeans = KMeans(n_clusters=n_clusters, random_state=seed) \
                    .fit(self.X_pca[:, :self.n_pca_components])
                if seed == seeds[0]:
                    scores.append(kmeans.score(self.X_pca[:, :self.n_pca_components]))
                logger.debug(np.histogram(kmeans.labels_, bins=n_clusters - 1))

                cluster_cluster_dist = kmeans.transform(kmeans.cluster_centers_)
                ones = np.ones((n_clusters, n_clusters))
                cluster_cluster_dist = np.ma.masked_array(cluster_cluster_dist, np.tril(ones))
                self.cluster_cluster_dists[(n_clusters, seed)] = cluster_cluster_dist

                self.kmeans_objs[(n_clusters, seed)] = (self.n_pca_components, kmeans)
                label_key = 'nc-{}_seed-{}'.format(n_clusters, seed)
                self.df_labels[label_key] = kmeans.labels_
        self.scores = np.array(scores)
Example #29
def kmeans_binning(t, n_bins, n_trials=10):
    """
    Carries out kmeans binning
    
    Args:
        t (np.array): the template
        n_bins (int): the number of bins
        n_trials (int): the number of trials
    
    Returns:
        np.array: the binning vector
    """
    best_clustering = None
    best_score = None
    
    for _ in range(n_trials):
        kmeans = KMeans(n_clusters=n_bins, random_state=np.random.randint(100))
        kmeans.fit(t.reshape(-1, 1))
        score = kmeans.score(t.reshape(-1, 1))
        if best_score is None or score > best_score:
            best_score = score
            best_clustering = kmeans.labels_
    
    clusters = np.unique(best_clustering)
    for i in range(len(clusters)):
        for j in range(i+1, len(clusters)):
            if np.mean(t[best_clustering == clusters[i]]) < np.mean(t[best_clustering == clusters[j]]):
                tmp_clustering = best_clustering.copy()
                tmp_clustering[best_clustering == clusters[j]] = clusters[i]
                tmp_clustering[best_clustering == clusters[i]] = clusters[j]
                best_clustering = tmp_clustering
    
    return best_clustering
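A usage sketch on a random template:

import numpy as np

t = np.random.rand(200)
bins = kmeans_binning(t, n_bins=5)  # one bin label per element of t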
Example #30
def fit_plot_kmean_model(n_clusters, X):
    plt.xticks(())
    plt.yticks(())

    # fit with the k-means algorithm
    kmean = KMeans(n_clusters=n_clusters)
    kmean.fit_predict(X)

    labels = kmean.labels_
    centers = kmean.cluster_centers_
    markers = ['o', '^', '*', 's']
    colors = ['r', 'b', 'y', 'k']

    # compute the cost (score is the negated inertia)
    score = kmean.score(X)
    plt.title("k={}, score={}".format(n_clusters, int(score)))

    # plot the samples
    for c in range(n_clusters):
        cluster = X[labels == c]
        plt.scatter(cluster[:, 0], cluster[:, 1],
                    marker=markers[c], s=20, c=colors[c])
    # plot the cluster centers
    plt.scatter(centers[:, 0], centers[:, 1],
                marker='o', c="white", alpha=0.9, s=300)
    for i, c in enumerate(centers):
        plt.scatter(c[0], c[1], marker='$%d$' % i, s=50, c=colors[i])
Example #31
def main():
    # setting the hyper parameters
    import argparse
    parser = argparse.ArgumentParser(description='train',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--latent_d_file', default='results/vae/latent_d.csv')
    args = parser.parse_args()

    #latent_d = np.load(args.latent_d_file)
    latent_d = pd.read_csv(args.latent_d_file).values[:,1:]
    n_clusters = []
    scores = []


    for x in range(2,20,1):
      print(str(x)+" : "+str(int(1.4**x)), end = "\r")
      kmeans = KMeans(n_clusters=int(1.5**x), n_init=4)
      kmeans = kmeans.fit(latent_d[:int((latent_d.shape[0])*0.85)])
      n_clusters.append(int(1.5**x))
      scores.append(kmeans.score(latent_d[int((latent_d.shape[0])*0.85):]))


    plt.scatter(n_clusters, scores)
    plt.title("K-means score for K")
    plt.xlabel("K")
    plt.ylabel("sklearn K-means score")
    plt.show()
Example #32
    def _fit_KMeans(self, data, k):
        model = KMeans(n_clusters=k,
                       init="k-means++",
                       max_iter=100,
                       random_state=8).fit(data)
        centroids = model.cluster_centers_

        if (self.SHOW_KMEANS_PLOT):
            pca = PCA(n_components=2)
            transformed = pca.fit_transform(data)
            transformed_centroids = pca.transform(centroids)

            plt.figure(figsize=(8, 8))
            plt.scatter(transformed[:, 0],
                        transformed[:, 1],
                        marker='o',
                        c=model.labels_)
            plt.scatter(transformed_centroids[:, 0],
                        transformed_centroids[:, 1],
                        marker='x',
                        c='r')
            for i in range(len(data)):
                plt.annotate(i, (transformed[i, 0], transformed[i, 1]))
            for i in range(len(transformed_centroids)):
                plt.annotate(
                    i,
                    (transformed_centroids[i, 0], transformed_centroids[i, 1]))

            plt.grid(False)
            plt.title("PCA&K-Means: k={}, iteration={}, score={:5.2f}".format(
                k, k * 10, model.score(data)))
            plt.show()

        return model
Example #33
def Clustering():
    
    PCA_threshold = 0.8

    for team_id in team_dic.itervalues():
        BoF_Team = BoF[team_id]
        
        dim = np.shape(BoF_Team)[0]
        threshold_dim = 0
        for i in range(dim):
            pca = PCA(n_components = i)
            pca.fit(BoF_Team)
            X = pca.transform(BoF_Team)
            E = pca.explained_variance_ratio_
            if np.sum(E) > PCA_threshold:
                threshold_dim = len(E)
                print 'Team' + str(team_id) + ' dim:%d' % threshold_dim
                break
    
        pca = PCA(n_components = threshold_dim)
        pca.fit(BoF_Team)
        X = pca.transform(BoF_Team)
    
        min_score = 10000
        for i in range(100):
            model = KMeans(n_clusters=K, init='k-means++', max_iter=300, tol=0.0001).fit(X)
            if min_score > model.score(X):
                min_score = model.score(X)
                labels = model.labels_
        print min_score
    
        pca = PCA(n_components = 2)
        pca.fit(BoF_Team)
        X = pca.transform(BoF_Team)
        for k in range(K):
            labels_ind = np.where(labels == k)[0]
            plt.scatter(X[labels_ind,0], X[labels_ind,1], color=C[k])
        plt.legend(['C0','C1','C2','C3','C4'], loc=4)
    
        plt.title('Team' + str(team_id) + '1_PCA_kmeans')
        plt.savefig('Seq_Team' + str(team_id)+ '/Team' + str(team_id) + '_PCA_kmeans.png')
        plt.show()
        plt.close()
        np.savetxt('Seq_Team' + str(team_id) + '/labels_Team' + str(team_id) + '.csv', \
                   labels, delimiter=',')
Example #34
def findKForKMeans(X):
    graph = []
    for i in range(2, 200):
        km = KMeans(n_clusters=i)
        km.fit(X)
        y = km.score(X)
        graph.append(y)
        print i, y
    print graph
Example #35
def cluster(train_latents, train_labels, test_latents, test_labels):
    num_classes = np.shape(train_labels)[-1]
    labels_hot = np.argmax(test_labels, axis=-1)
    train_latents = np.reshape(train_latents,
                               newshape=[train_latents.shape[0], -1])
    test_latents = np.reshape(test_latents,
                              newshape=[test_latents.shape[0], -1])
    kmeans = KMeans(init='random', n_clusters=num_classes,
                    random_state=0, max_iter=1000, n_init=FLAGS.n_init,
                    n_jobs=FLAGS.n_jobs)
    kmeans.fit(train_latents)
    print(kmeans.cluster_centers_)
    print('Train/Test k-means objective = %.4f / %.4f' %
          (-kmeans.score(train_latents), -kmeans.score(test_latents)))
    print('Train/Test accuracy %.4f / %.3f' %
          (error(np.argmax(train_labels, axis=-1), kmeans.predict(train_latents), k=num_classes),
           error(np.argmax(test_labels, axis=-1), kmeans.predict(test_latents), k=num_classes)))
    return error(labels_hot, kmeans.predict(test_latents), k=num_classes)
Example #36
def predictKMeans(X, y):
	col_mean = np.nanmean(X,axis=0)
	inds = np.where(np.isnan(X))
	X[inds]=np.take(col_mean,inds[1])
	km = KMeans(n_clusters=2)

	X_train, X_test, y_train, y_test = chooseRandom(X, y)
	km.fit(X_train, y_train)
	return km.score(X_test, y_test)
Example #37
def experiment(nexperiments, nclusters):
    E_ins = []
    for _ in range(nexperiments):
        kmeans = KMeans(nclusters, max_iter=300, n_init=1, init='random')
        kmeans.fit(X_train)
        score = kmeans.score(X_train)
        E_in = -score / nsamples
        E_ins.append(E_in)
    return E_ins
Example #38
def showKMeans(X, N):
    scores = []
    for number in xrange(N / 6, N / 2):
        clustering = KMeans(n_clusters=number, max_iter=MAX_ITER, n_init=N_INIT, n_jobs=N_JOBS )
        clustering.fit_predict(X)
        scores.append(clustering.score(X))
    plt.plot(scores)
    plt.xlabel(XLABEL)
    plt.ylabel(YLABEL)
    plt.show()
Example #39
def kmean_score(X,nclust):
    '''
    calculate kmeans score
    :param X:numpy array, data set to cluster
    :param nclust: int, number of cluster
    :return: float
    '''
    km = KMeans(nclust)
    km.fit(X)
    rss = -km.score(X)
    return rss
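Computed over a range of k, this yields a standard elbow curve; a one-line usage sketch, assuming X is any 2-D feature array:

rss_curve = [kmean_score(X, k) for k in range(1, 11)]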
Example #40
def partition_gene_kmeans(geneToCases, patientToGenes, gene_list, num_components, num_bins, title=None, do_plot=True):

    # get gene index mapping
    giv = getgiv(geneToCases.keys(), gene_list)

    # convert patients into vectors
    patientToVector = getpatientToVector(patientToGenes, giv)

    vectors = patientToVector.values()

    print vectors[0]
    print "Length of vectors is ", len(vectors[0])

    km = KMeans(num_components)

    km.fit(vectors)

    clusterToPatient = {}

    for patient in patientToVector:
        cluster = km.predict([patientToVector[patient]])[0]  # wrap in a list: predict expects 2-D input
        if cluster not in clusterToPatient:
            clusterToPatient[cluster] = set()
        clusterToPatient[cluster].add(patient)

    # plot patients in each cluster


    if do_plot:
        bins = range(0, max([len(p_gene) for p_gene in patientToGenes.values()]), max([len(p_gene) for p_gene in patientToGenes.values()])/num_bins)
        plt.figure()
        for cluster in clusterToPatient:
            plt.hist([len(patientToGenes[p]) for p in clusterToPatient[cluster]], bins=bins, label=str(cluster), alpha = 1.0/num_components)
        plt.xlabel('# Somatic Mutations In Tumor', fontsize=20)
        plt.ylabel('Number of Samples', fontsize=20)
        plt.legend()
        plt.title("Kmeans size " + str(num_components), fontsize=20)
        plt.show()



    data = {}
    data['Score'] = km.score(vectors)
    data['Number'] = num_components
    data['% Explained'] = np.round([100 * len(clusterToPatient[cluster]) * 1.0 / len(patientToGenes) for cluster in clusterToPatient], 2)
    data['Vector size'] = len(vectors[0])
    # data['Covariates'] = np.round(g.covars_,2)
    # data["Total log probability"] = sum(g.score(obs))
    # data["AIC"] = g.aic(obs)
    # data["BIC"] = g.bic(obs)
    # data['Explained'] = [np.round([len([in_w for in_w in respon if in_w[i] == max(in_w)]) * 1.0 /len(respon) for i in range(num_components)], 2)]

    return data
Example #41
def cluster():
    f=open("./forum/flu.txt")
    flu =[]
    for line in f.readlines():
        flu.append(line)

    f.close()
    

    vectorizer = TfidfVectorizer(sublinear_tf= True,min_df=0,max_df=1.0,ngram_range=(1,1),smooth_idf=True,use_idf=1,strip_accents=None)
    x=vectorizer.fit_transform(flu)

    n_samples,n_features=x.shape

    print n_samples,n_features

    kmeans =KMeans(n_clusters=4, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances=True, verbose=0, random_state=None, copy_x=True, n_jobs=1)
    kmeans.fit(x)
    print kmeans.score(x)
    c=kmeans.predict(x)

    f1=open('./forum/1.txt','w')
    f2=open('./forum/2.txt','w')
    f3=open('./forum/3.txt','w')
    f4=open('./forum/4.txt','w')
    for i in range(0,len(c)):
        if c[i]== 0:
            f1.write('%s'%(flu[i]))
        elif c[i]==1:
            f2.write('%s'%(flu[i]))
        elif c[i]==2:
            f3.write('%s'%(flu[i]))
        else:
            f4.write('%s'%(flu[i]))
    f1.close()
    f2.close()
    f3.close()
    f4.close()
Example #42
def test_kmeans():
    X = loadmat('ex7/ex7data2.mat')['X']
    n_init = int(X.shape[0] ** 0.5)
    Js = []
    Ks = range(1, 11)
    for k in Ks:
        km = KMeans(n_clusters=k, n_init=n_init, n_jobs=-1)
        km.fit(X)
        Js.append(km.score(X))
#    plotJ(Ks,Js)
    bestK = best_accelation(Js)
    km = KMeans(n_clusters=bestK, n_init=n_init, n_jobs=-1)
    km.fit(X)
    plt.clf()
    plt.scatter(*np.split(X, 2, axis=1), c='g')
    plt.scatter(*np.split(km.cluster_centers_, 2, axis=1), c='b', marker='D')
    plt.show()
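best_accelation is not defined here; a plausible sketch, assuming it picks the k where the inertia curve bends most sharply (largest second difference):

import numpy as np

def best_accelation(Js):
    inertia = -np.asarray(Js)         # Js are negated inertias from score()
    accel = np.diff(inertia, 2)       # second difference = curve "acceleration"
    return int(np.argmax(accel)) + 2  # offset: two diffs, and Ks starts at 1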
Example #43
def _runKmeans(train_data):
	#print ('Running Kmeans Clustering...')
	num_clusters = 5
	model = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)
	
	max_score = 0
	iteration = 20
	best_classification = []
	for i in range(1,iteration): 
	#	print "Iteration number "+str(i)
		model.fit(train_data)
		score = model.score(train_data)
		
		if i == 1 or score > max_score:
			max_score = score
			best_classification =  model.predict(train_data)
	
	#print ("Done!")
	return best_classification.tolist()
Example #44
def survey_n_clusters(data, n_kernels, path_out_clustering, df_original):
	from sklearn.cluster import KMeans
#	from sklearn.metrics import silhouette_score
	path_out_clustering = path_out_clustering[:-4] + '_' + str(n_kernels) + 'clusters.csv'
	model = KMeans(n_clusters=n_kernels, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=1)
	label = model.fit_predict(data)

#	silhouette_avg = silhouette_score(data, label)
#	print("For n_clusters =", n_kernels, "The average silhouette_score is : ", silhouette_avg)
#	The above evaluation throw feedback "Killed: 9"
## Step 2.1 Output the labeled data, combining with the original data that is before one-hot-encoder
	print ("For n_clusters =", n_kernels, "The summation of all labels is : ", sum(label))
	print ("For n_clusters =", n_kernels, "The total number of non-zero labels is : ", sum(label > 0))
	print ("For n_clusters =", n_kernels, "The built-in score is : ", model.score(data))

	label = label.reshape(len(label),1)
	df_out = pd.DataFrame(label, columns = ['label'])
	df_out = pd.concat([df_original, df_out], axis = 1)
	df_out.to_csv(path_out_clustering, index = False)
Example #45
def runKMneas(listOfTrainComments, listOfTestComments, listOfUniqueTokens):
	xTrain = []
	yTrain = []
	for i in range(len(listOfTrainComments)):
		BOW = generateBOW(listOfTrainComments[i], listOfUniqueTokens)
		xTrain.append(BOW)
		yTrain.append(listOfTrainComments[i].getStatus())

	xTest = []
	yTest = []
	for i in range(len(listOfTestComments)):
		BOW = generateBOW(listOfTestComments[i], listOfUniqueTokens)
		xTest.append(BOW)
		yTest.append(listOfTestComments[i].getStatus())

	clf = KMeans(n_clusters=2, max_iter = 300)
	clf.fit(xTrain, yTrain)
	score = clf.score(xTest)
	prediction = clf.predict(xTest)
	print('K-means Clustering, Score - ' + str(score), '\n')
Example #46
def runKmeans(data_file):
	train_data = csv_io.read_data(data_file)
	print len(train_data)
	num_clusters = 10
	model = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)
	
	max_score = 0
	iteration = 2
	best_classification = []
	for i in range(1,iteration): 
		print "Iteration number "+str(i)
		model.fit(train_data)
		score = model.score(train_data)
		
		if i == 1 or score > max_score:
			max_score = score
			best_classification =  model.predict(train_data)
	
	print len(best_classification.tolist())
	return best_classification.tolist()
Example #47
def train_test(preprocessed_data):
	'''
	Trains a KMeans clusterer from sklearn on preprocessed_data,
	a list of lists where each inner list is one example (label first,
	features after). Note that there are no important parameters to tweak.
	'''
	all_labels = list(person[0] for person in preprocessed_data)
	all_features = list(person[1:] for person in preprocessed_data)

	n = 100

	km_classifier = KMeans(n_clusters=5, n_init=n, max_iter=300, tol=0.0001, precompute_distances=True, n_jobs=2)

	predictions = km_classifier.fit_predict(all_features).tolist()

	# disgusting Python to get rid of a runtime error due to the score method wanting float instead of int64 input
	neg_kmeans = km_classifier.score([[float(feature) for feature in person] for person in all_features])

	print '\nNegated KMeans function value:', neg_kmeans
	print 'Data distribution by label:', Counter(all_labels)
	print 'Data Distribution by cluster:', Counter(predictions), '\n'
Example #48
def runKMeans(cv_img, num_colors, init):
	imgdata = getimgdatapts(cv_img[-100:,:,:])
	kmc = KMeans(n_clusters=num_colors, max_iter=25, n_init=10, init=init)
	t1 = time.time();
	kmc.fit_predict(imgdata)
	t2 = time.time();
	print("fit time: %f"%(t2-t1))
	trained_centers = kmc.cluster_centers_
	# print trained_centers
	labels = kmc.labels_
	labelcount = Counter()
	t1 = time.time();
	# IPython.embed()
	# for pixel in labels:
	# 	labelcount[pixel] += 1
	for i in np.arange(num_colors):
		labelcount[i]=np.sum(labels==i)

	t2 = time.time();
	print("counting labels time: %f"%(t2-t1))
	# print labelcount
	# IPython.embed()
	score=kmc.score(imgdata)
	return trained_centers, labelcount,score
Example #49
#f = open('data.txt')
#X = np.array([[float(l.strip())] for l in f.readlines()])

X = np.concatenate((
        np.random.normal(0, 1, 10),
        np.random.normal(5, .5, 10),
        np.random.normal(10, 1, 10)
    ))
X = np.array([[x] for x in X])

ks = range(1, 10)
kmeans_scores = []
for k in ks:
    km = KMeans(n_clusters=k)
    km.fit(X)
    kmeans_scores.append(-km.score(X))

gmm_scores = []
bic_scores = []
for k in ks:
    gmm = GMM(n_components=k)
    gmm.fit(X)
    gmm_scores.append(-gmm.score(X).sum())
    bic_scores.append(gmm.bic(X))

_, kmeans_gaps = gap.kmeans_gap(X, nrefs=50, max_clusters=10)
_, gmm_gaps = gap.gmm_gap(X, nrefs=50, max_clusters=10)

fig = plt.figure(figsize=(3*6, 2*4))
ax = fig.add_subplot(231)
ax.plot(ks, kmeans_scores, 'b-o', lw=2, ms=8)
Example #50
ks  = [2,4,20]

for run,k in enumerate(ks):
    km  = KMeans(n_clusters=k)  #new KMeans object
    km.fit(data)                #fit it to the data
    rain    = cm.rainbow(np.linspace(0,1,k)) #assign each cluster index a matplotlib color
    clrs    = [rain[i] for i in km.labels_] #transform label indices into colors
    
    fig     = plt.figure(8+run)     #new figure
    ax      = fig.add_subplot(111)  
    ax.scatter(data[:,0],data[:,1],c=clrs)  #plot the data coloring by cluster
    plt.savefig("kmeans_"+str(k))



#show the change in clustering "score" as k increases
ks  = range(2,100,4)
scores  = []
for run,k in enumerate(ks):
    km  = KMeans(n_clusters=k)
    km.fit(data)
    scores.append(-km.score(data))
    
fig     = plt.figure(20)
ax      = fig.add_subplot(111)
ax.scatter(ks,scores)
ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("Sum of Distances to Centroids")
plt.savefig("kmeansScores")

Example #51
# -*- coding: utf-8 -*-
"""
Created on Tue Aug  4 18:05:01 2015

@author: rohankoodli
"""

#import numpy as np
import matplotlib.pyplot as plt
#from scipy import stats

import seaborn
seaborn.set()

from sklearn.datasets.samples_generator import make_blobs

X,y = make_blobs(n_samples=150,centers=4,random_state=0,cluster_std=1.0)

plt.scatter(X[:,0],X[:,1],s=70)
plt.plot()

from sklearn.cluster import KMeans

est = KMeans(5)
est.fit(X)
y_kmeans = est.predict(X)
plt.scatter(X[:,0],X[:,1],c=y_kmeans,s=90,cmap='rainbow')
plt.plot()
print est.score(X,y)
Example #52
def kmeans(k, X):
    km = KMeans(k)
    km.fit(X)
    obj_value = np.abs(km.score(X))
    return obj_value
Example #53
def Clustering(BoF_Team1, BoF_Team2):
    t0 = time()

    PCA_threshold = 0.8

    # --Team1--
    dim = np.shape(BoF_Team1)[0]
    threshold_dim = 0
    for i in range(dim):
        pca = PCA(n_components=i)
        pca.fit(BoF_Team1)
        X = pca.transform(BoF_Team1)
        E = pca.explained_variance_ratio_
        if np.sum(E) > PCA_threshold:
            threshold_dim = len(E)
            print "Team1 dim:%d" % threshold_dim
            break

    pca = PCA(n_components=threshold_dim)
    pca.fit(BoF_Team1)
    X = pca.transform(BoF_Team1)

    min_score = 10000
    for i in range(200):
        model = KMeans(n_clusters=K, init="k-means++", max_iter=1000, tol=0.0001).fit(X)
        if min_score > model.score(X):
            min_score = model.score(X)
            labels_Team1 = model.labels_

    pca = PCA(n_components=2)
    pca.fit(BoF_Team1)
    X = pca.transform(BoF_Team1)
    for k in range(K):
        labels_Team1_ind = np.where(labels_Team1 == k)[0]
        plt.scatter(X[labels_Team1_ind, 0], X[labels_Team1_ind, 1], color=C[k])

    plt.title("Team1_PCA_kmeans")
    # plt.legend()
    plt.savefig("Seq_Team1/Team1_PCA_kmeans.png")
    # plt.show()
    plt.close()
    np.savetxt("Seq_Team1/labels_Team1.csv", labels_Team1, delimiter=",")

    # --Team2--
    dim = np.shape(BoF_Team2)[0]
    threshold_dim = 0
    for i in range(dim):
        pca = PCA(n_components=i)
        pca.fit(BoF_Team2)
        X = pca.transform(BoF_Team2)
        E = pca.explained_variance_ratio_
        if np.sum(E) > PCA_threshold:
            threshold_dim = len(E)
            print "Team2 dim:%d" % threshold_dim
            break

    min_score = 10000
    for i in range(200):
        model = KMeans(n_clusters=K, init="k-means++", max_iter=1000, tol=0.0001).fit(X)
        if min_score > model.score(X):
            min_score = model.score(X)
            labels_Team2 = model.labels_

    pca = PCA(n_components=2)
    pca.fit(BoF_Team2)
    X = pca.transform(BoF_Team2)
    for k in range(K):
        labels_Team2_ind = np.where(labels_Team2 == k)[0]
        plt.scatter(X[labels_Team2_ind, 0], X[labels_Team2_ind, 1], color=C[k])

    plt.title("Team2_PCA_kmeans")
    plt.savefig("Seq_Team2/Team2_PCA_kmeans.png")
    # plt.show()
    plt.close()
    np.savetxt("Seq_Team2/labels_Team2.csv", labels_Team2, delimiter=",")

    print "time:%f" % (time() - t0)
    return labels_Team1, labels_Team2
Example #54
def compute_scores(normal_users, queue, Ks=[]):

    '''
        Calculates the novelty scores (noise and strangeness) for the 4 algorithms.
        Receives the list of normal users, the queue (all users) and the list of curiosity factors Ks.
        Updates the global variables GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s, K_s with the results.
    '''
    
    global GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s, K_s  # novelty scores per algorithm: *_n for noise, *_s for strangeness

    GMM_n = []
    one_n = []
    lsa_n = []
    K_n = []
    GMM_s = []
    one_s = []
    lsa_s = []
    K_s = []

    K_GMM_n, K_KMeans_n, K_GMM_s, K_KMeans_s = Ks #K_GMM_n, K_KMeans_n are the noise curiosity factors for each algorithm
                                                  #K_GMM_s, K_KMeans_s are the strangeness curiosity factors for each algorithm
                                                  #Ks is a list containing the 4 above mentioned parameters
    

    '''

    For One_class_SVM and LSA, predicting the new entry directly returns a label:
        LSA: 'anomaly' or 0 (normal)

        One_class_SVM: -1 (anomaly) or 1 (normal)

    GMM and K-means return a fitting score instead. The novelty score is obtained by computing the
    z-score of the new entry against the scores of all other entries, via the function
    get_score_last_item (a sketch of it follows this example).
        If the returned z-score is >= 1, the new entry is anomalous.

    '''

    '''
    Noise scores are computed with the queue as the base of knowledge, fitting all the entries but the last to the algorithm
    '''                                    
    B = GMM(covariance_type='full', n_components = 1)
    B.fit(queue[0:-1])
    x = [B.score([i]).mean() for i in queue]
    GMM_n.append(get_score_last_item(x, K_GMM_n))


    K = KMeans(n_clusters=1)
    K.fit(queue[0:-1])
    x = [K.score([i]) for i in queue]
    K_n.append(get_score_last_item(x, K_KMeans_n))

    oneClassSVM = OneClassSVM(nu=0.1)
    oneClassSVM.fit(queue[0:-1])
    x = oneClassSVM.predict(np.array([queue[-1]]))
    if x == -1:
        one_n.append(1)
    if x == 1:
        one_n.append(0)
    
    X = np.array(queue[0:-1])
    anomalymodel = lsanomaly.LSAnomaly()
    anomalymodel.fit(X)
    x = anomalymodel.predict(np.array([queue[-1]])) 
    if x == ['anomaly']:
        lsa_n.append(1)
    if x == [0]:
        lsa_n.append(0)

    '''
    Strangeness scores are computed with the normal users as the base of knowledge, fitting normal users to the algorithm
    ''' 

    normal_and_new = normal_users + [queue[-1]] #List to be passed to get_score_last_item to calculate the zscore of the last item, the new entry

    B = GMM(covariance_type='full', n_components = 1)
    B.fit(normal_users)
    x = [B.score([i]).mean() for i in normal_and_new]
    GMM_s.append(get_score_last_item(x, K_GMM_s))


    K = KMeans(n_clusters=1)
    K.fit(normal_users)
    x = [K.score([i]) for i in normal_and_new]
    K_s.append(get_score_last_item(x, K_KMeans_s))

    oneClassSVM = OneClassSVM(nu=0.1)
    oneClassSVM.fit(normal_users)
    x = oneClassSVM.predict(np.array([queue[-1]]))
    if x == -1:
        one_s.append(1)
    if x == 1:
        one_s.append(0)

    anomalymodel = lsanomaly.LSAnomaly()
    X = np.array(normal_users)
    anomalymodel.fit(X)
    x = anomalymodel.predict(np.array([queue[-1]])) 
    if x == ['anomaly']:
        lsa_s.append(1)
    if x == [0]:
        lsa_s.append(0)

    return GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s, K_s
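get_score_last_item is referenced throughout but never shown; a plausible sketch, assuming it z-scores the newest entry against the others and divides by the curiosity factor K so the caller can test against 1:

import numpy as np

def get_score_last_item(scores, K):
    base = np.asarray(scores[:-1], dtype=float)
    z = abs(scores[-1] - base.mean()) / (base.std() + 1e-12)
    return z / K  # a returned value >= 1 marks the new entry as anomalous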
Example #55
    y_train_non_default = y_train[zero_train]
    y_train_default = y_train[none_zero_train]

    print "pre-processing train data..."
    scalar = preprocessing.StandardScaler().fit(x_train_default)
    x_train = scalar.transform(x_train_default)

    # if mode == 'test':
    #     print 'pre-processing test data...'
    #     x_test = scalar.transform(x_test)

    for k in range(1, 50):
        classifier = KMeans(n_clusters=k, n_init=10, max_iter=300, tol=0.0001, precompute_distances=True)

        classifier.fit(x_train_default)
        score = classifier.score(x_train_default)
        print k, "cluster:", score

        predicts = classifier.predict(x_train_default)
        count = {}
        loss_value = [[] for i in range(k)]

        for i in xrange(len(predicts)):
            count[predicts[i]] = count.get(predicts[i], 0) + 1
            loss_value[predicts[i]].append(y_train_default[i])  # = loss_value.get(predicts[i], 0) + y_train_default[i]

        for i in range(len(loss_value)):
            mean = np.mean(loss_value[i])
            std = np.std(loss_value[i])
            print "cluster", i, ": count =", count[i], ", mean =", mean, ", std =", std
Example #56
points = np.array(data.loc[:, ['X', 'Y']])
dis = []
columns_name = []
for cat_data in data_grouped:
    tmp = cat_data[1].loc[:, ['X', 'Y']]
    if len(tmp) < 100:
        continue
    print "For %d: " % cat_data[0]
    columns_name.append("To%d" %cat_data[0])
    group = tmp.groupby(lambda x:random.randint(0,1))
    train_data = group.get_group(0)
    test_data = group.get_group(1)
    score = float('inf')
    K = 0
    for i in range(1, 8):
        oracle = KMeans(init='k-means++', n_clusters=i, n_init=10)
        oracle.fit(train_data)
        score_val = oracle.score(test_data)
        score_val = math.fabs(math.fabs(score_val) - oracle.inertia_) / oracle.inertia_
        if score_val < score:
            score = score_val
            K = i
    best = KMeans(init='k-means++', n_clusters=K, n_init=10)
    best.fit(tmp)
    label = best.predict(points)
    belong_point = best.cluster_centers_[label]
    dis.append(np.power(np.sum(np.power(belong_point - points,2), axis=1), 0.5))
pd.DataFrame(np.column_stack(dis), columns=columns_name).to_csv('vec.csv', index=False)

Example #57
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris

iris = load_iris()
x = iris.data

km1 = KMeans(init='k-means++', n_clusters=3)
km1.fit(x)
print km1.labels_
print km1.cluster_centers_
print km1.score(x)

km2 = KMeans(init='random', n_clusters=3)
km2.fit(x)
print km2.labels_
print km2.cluster_centers_
print km2.score(x)

Example #58
def new_makeClusterKMeans(valueListFilename):
	maxNoOfClusters = 20
	for noOfClusters in range(maxNoOfClusters):
		noOfClusters += 1
		valueListFile = open(valueListFilename, 'r')

		partailFitTrainSize = 100
		#noOfClusters = 3
		lineNo = 0

		kmeans = KMeans(n_clusters = noOfClusters, max_iter=100)#MiniBatchKMeans(n_clusters = noOfClusters, max_iter=100, batch_size=100)

		trainValueList = []
		for line in valueListFile:
			lineNo += 1
			#if (lineNo % partailFitTrainSize == 0):
			'''	trainValueList = np.array(trainValueList)
				kmeans.partial_fit(trainValueList)
				trainValueList = []
			'''
			valueList = json.loads(line)
			tempList = []
			for col in clusterAttrList:
				tempList.append(valueList[col])
			trainValueList.append(tempList)
		
		#if (lineNo % partailFitTrainSize != 0):
		trainValueList = np.array(trainValueList)
		#kmeans.partial_fit(trainValueList)
		kmeans.fit(trainValueList)

		valueListFile.close()

		centroids = kmeans.cluster_centers_
		labels = kmeans.labels_
		
		print('centroids = ', centroids)
		print('labels = ', len(labels))

		valueListFile = open(valueListFilename, 'r')
		
		ansList = []

		xAxis = 0
		yAxis = 5

		scoreSum = 0
		lineNo = 0
		for line in valueListFile:
			valueList = json.loads(line)
			'''
			tempList = []
			for col in clusterAttrList:
				tempList.append(valueList[col])
			'''
			#ansList.append(kmeans.predict([tempList])[0])
			#thisLabel = kmeans.predict([tempList])[0]
			plt.plot(valueList[xAxis], valueList[yAxis], colors[labels[lineNo]], markersize = 10)
			#scoreSum += kmeans.score([tempList])
			lineNo += 1
		
		scoreSum = kmeans.score(trainValueList)
		
		plt.xlabel(trainAttrList[xAxis])
		plt.ylabel(trainAttrList[yAxis])

		#plt.scatter(centroids[:, 0], centroids[:, 1], marker = 'x', s = 150, linewidths = 5, zorder = 10)

		plt.show()
		

		valueListFile.close()
		print('noOfClusters = ', noOfClusters, '      sum = ', scoreSum)