Example #1
def cluster(train_latents, train_labels, test_latents, test_labels):
    num_classes = np.shape(train_labels)[-1]
    labels_hot = np.argmax(test_labels, axis=-1)
    train_latents = np.reshape(train_latents,
                               newshape=[train_latents.shape[0], -1])
    test_latents = np.reshape(test_latents,
                              newshape=[test_latents.shape[0], -1])
    kmeans = KMeans(init='random',
                    n_clusters=num_classes,
                    random_state=0,
                    max_iter=1000,
                    n_init=FLAGS.n_init,
                    n_jobs=FLAGS.n_jobs)
    kmeans.fit(train_latents)
    print(kmeans.cluster_centers_)
    print('Train/Test k-means objective = %.4f / %.4f' %
          (-kmeans.score(train_latents), -kmeans.score(test_latents)))
    print('Train/Test accuracy %.4f / %.3f' %
          (error(np.argmax(train_labels, axis=-1),
                 kmeans.predict(train_latents),
                 k=num_classes),
           error(np.argmax(test_labels, axis=-1),
                 kmeans.predict(test_latents),
                 k=num_classes)))
    return error(labels_hot, kmeans.predict(test_latents), k=num_classes)
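The helper error is not defined in this snippet; a minimal sketch of what it might look like, assuming a clustering-error metric that matches cluster IDs to class labels with the Hungarian method and returns the mislabeled fraction:

import numpy as np
from scipy.optimize import linear_sum_assignment

def error(y_true, y_pred, k):
    # cost[i, j] = number of points with true label i assigned to cluster j
    cost = np.zeros((k, k), dtype=np.int64)
    for t, p in zip(y_true, y_pred):
        cost[t, p] += 1
    # a maximum-weight matching of clusters to labels minimizes the error
    rows, cols = linear_sum_assignment(-cost)
    return 1.0 - cost[rows, cols].sum() / len(y_true)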
Example #2
    def k_means_analyze(self, rng=[2,12], init='k-means++', n_init=10,
                        max_iter=300, tol=0.0001, precompute_distances='deprecated',
                        verbose=0, random_state=None, copy_x=True, n_jobs='deprecated',
                        algorithm='auto'):
        km_scores = []
        km_silhouette = []
        db_score = []
        for i in range(rng[0], rng[1]):
            km = KMeans(n_clusters=i, init=init, n_init=n_init,
                        max_iter=max_iter, tol=tol, precompute_distances=precompute_distances,
                        verbose=verbose, random_state=random_state, copy_x=copy_x, n_jobs=n_jobs,
                        algorithm=algorithm).fit(self.__data)
            preds = km.predict(self.__data)

            print("Score for number of cluster(s) {}: {}".format(i,km.score(self.__data)))
            km_scores.append(-km.score(self.__data))

            silhouette = silhouette_score(self.__data,preds)
            km_silhouette.append(silhouette)
            print("Silhouette score for number of cluster(s) {}: {}".format(i,silhouette))

            db = davies_bouldin_score(self.__data,preds)
            db_score.append(db)
            print("Davies Bouldin score for number of cluster(s) {}: {}".format(i,db))

            print("-"*100)

        return km_scores, km_silhouette, db_score
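A usage sketch for the method above, with a hypothetical analyzer instance wrapping the feature matrix; it plots the three returned curves so a good k can be picked by eye:

import matplotlib.pyplot as plt

km_scores, km_silhouette, db_score = analyzer.k_means_analyze(rng=[2, 12])
ks = list(range(2, 12))
for curve, name in [(km_scores, 'inertia'), (km_silhouette, 'silhouette'), (db_score, 'Davies-Bouldin')]:
    plt.figure()
    plt.plot(ks, curve, marker='o')
    plt.xlabel('k')
    plt.title(name)
plt.show()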
Example #3
    def kmeans(self):  # Model 7
        print '\n### Running K-Means Algorithm\n'

        # Create K-Means classifier object model
        modelFile = '/tmp/model_7.joblib.pkl'
        if (os.path.isfile(modelFile)):
            model = self.loadmodel(7)
        else:
            model = KMeans(n_clusters=8, random_state=0)

        # Train the model using the training sets and check score
        model.fit(self.x_trn)
        print '\tTraining score\t\t', model.score(self.x_trn, self.y_trn)

        # Calculate algorithm scores
        predicted = model.predict(self.x_tst)
        avg_prec = average_precision_score(numpy.array(self.y_tst), predicted)
        recall = recall_score(self.y_tst, predicted, average='micro')

        # Print the algorithm scores
        print '\tTesting score\t\t', model.score(self.x_tst, predicted)
        print '\tAccuracy score\t\t', accuracy_score(self.y_tst, predicted)
        print '\tPrecision score:\t' + str(avg_prec)
        print '\tRecall score\t\t' + str(recall)

        self.savemodel(model, 7)  # Save the training model
Example #4
def score_k(data, krange):
    """Generates the score and distoration for elbow plots
    """
    elbow_scores = []
    for k in krange:
        print("Evaluatng {0} clusters".format(k))
        current_model = KMeans(n_clusters=k)
        current_model.fit(data)
        elbow_scores.append(current_model.score(data))  # score() is the negated inertia
    return elbow_scores
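A usage sketch for score_k on synthetic data; since KMeans.score returns the negated inertia, plotting the absolute values gives a conventional elbow curve:

from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
elbow = score_k(X, range(1, 10))
plt.plot(range(1, 10), [abs(s) for s in elbow], marker='o')
plt.xlabel('k')
plt.ylabel('|score| (inertia)')
plt.show()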
Example #5
    def plot_for_offset(Map):
        
        if False:
            #################  K Means  #################
            # Set number of clusters
            n_clusters = 7

            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            kmeans.fit(Map)
            idx = kmeans.fit_predict(Map)
            kmeans.score(Map)
        else:
            ########### Categories by word count per category ############
            idx = word_categories
        
        #################  Plot  #########################

        fig, ax = plt.subplots(figsize=(12,12))
        fig.tight_layout()
        
        # Set range
        ax.set_xlim(-15000,15000)
        ax.set_ylim(-15000,15000)
        
        # List of colors
        colors = ['green', 'orange', 'red', 'blue', 'yellow', 'brown', 'violet', 'grey']

        # Plot all words
        for word, idx_, vec, label in zip(words, idx, Map, word_categories_names):
            x = vec[0]
            y = vec[1]
            ms = scaler.transform(count_total[word])
            ax.plot(x, y, marker='o', ms=ms, c=colors[idx_], alpha=0.7, linestyle='none')
            plt.annotate(word, (x, y), ha='center', va='center', size=10)

        # Create legend
        l = []
        for i in range(len(likelihood_df.columns)):
            l.append(mpatches.Patch(color=colors[i], label=likelihood_df.columns[i]))
        ax.legend(handles=l)

        # Return the plot as an image array
        io_buf = io.BytesIO()
        fig.savefig(io_buf, format='raw')
        io_buf.seek(0)
        image = np.reshape(np.frombuffer(io_buf.getvalue(), dtype=np.uint8),
                        newshape=(int(fig.bbox.bounds[3]), int(fig.bbox.bounds[2]), -1))
        io_buf.close()
        plt.close(fig)  # close only after the figure has been rendered into the buffer

        return image
Example #6
def compute_print_scores(normal_users, queue, Ks):

    K_GMM_n, K_KMeans_n, K_GMM_s, K_KMeans_s = Ks

    print 'novelty score GMM'
    B = GMM(covariance_type='full', n_components = 1)
    B.fit(queue)
    x = [B.score([i]).mean() for i in queue]
    print get_score_last_item(x, K_GMM_n)

    print 'novelty score OneClassSVM'
    x = anom_one_class(queue, [queue[-1]])
    print x[-1]

    print 'novelty score LSA'
    anomalymodel = lsanomaly.LSAnomaly()
    X = np.array(queue)
    anomalymodel.fit(X)
    print anomalymodel.predict(np.array([queue[-1]]))

    print 'novelty score degree K_means'
    K = KMeans(n_clusters=1)
    K.fit(queue)
    x = [K.score([i]) for i in queue]
    print get_score_last_item(x, K_KMeans_n)

    normal_and_new = normal_users + [queue[-1]]

    print 'degree of belonging to known class GMM'
    B = GMM(covariance_type='full', n_components = 1)
    B.fit(normal_users)
    x = [B.score([i]).mean() for i in normal_and_new]
    print get_score_last_item(x, K_GMM_s)

    print 'degree of belonging to known class OneClassSVM'
    x = anom_one_class(normal_users, [queue[-1]])
    print x[-1]

    print 'degree of belonging to known class LSA'
    anomalymodel = lsanomaly.LSAnomaly()
    X = np.array(normal_users)
    anomalymodel.fit(X)
    print anomalymodel.predict(np.array([queue[-1]]))

    print 'degree of belonging to known class K_means'
    K = KMeans(n_clusters=1)
    K.fit(normal_users)
    x = [K.score([i]) for i in normal_and_new]
    print get_score_last_item(x, K_KMeans_s)
Example #7
    def __buildModel(k_, vectors):
        # Cluster again and group the results; KMeans does not support cosine distance
        kmeans = KMeans(n_clusters=k_, n_init=20, max_iter=500).fit(vectors)
        norm_factor = -vectors.shape[1]  # normalize by dictionary width
        groups = pd.DataFrame({
            'C':
            kmeans.labels_,
            'S': [kmeans.score([v]) / norm_factor for v in vectors]
        }).groupby('C')
        percents = groups.size() / len(vectors)  # fraction of all clustered vectors that fall in this cluster
        cfg_q = G.cfg.getfloat('Classifier', 'Quantile')
        quantiles = np.array([
            groups.get_group(i)['S'].quantile(cfg_q, interpolation='higher')
            for i in range(k_)
        ])
        boundaries = groups['S'].agg('max').values  # distance of the farthest point in each cluster

        quantiles2 = quantiles * 2
        boundaries[boundaries > quantiles2] = quantiles2[
            boundaries > quantiles2]  # clamp boundaries that are too far out
        boundaries[boundaries < 1e-100] = 1e-100  # lift zero boundaries
        quantiles = boundaries - quantiles
        quantiles[quantiles < 1e-100] = 1e-100  # avoid 0/0

        G.log.info(
            'Model(k=%d) built. inertia=%e, max proportion=%.2f%%, max quantile=%e, max border=%e',
            k_, kmeans.inertia_,
            max(percents) * 100, max(quantiles), max(boundaries))
        return kmeans, percents, boundaries, quantiles
Example #8
def kmeans(vector: np.ndarray, n: int):
    k = KMeans(n_clusters=n, init='k-means++', n_init=6)
    # fit once: fit_transform both fits the model and returns each
    # sample's distance to every cluster center
    cluster_coordinate = k.fit_transform(vector)
    score = k.score(vector)

    return cluster_coordinate, k.labels_, k.inertia_, score
Example #9
def solve(dataId, usingExist=True):

    dataId = str(dataId)
    dataPath = './data/' + dataId + '.txt'
    binPath = './out/' + dataId + '.bin'
    outputPath = "out/ans" + dataId + ".txt"

    if not os.path.exists(binPath) or not usingExist:
        word2vec.word2vec(dataPath, binPath, size=100, verbose=True)

    # load the trained model from binPath with word2vec
    model = word2vec.load(binPath)

    # open the output file
    output = codecs.open(outputPath, "w", "utf-8")

    ClustersNumber = 10
    WordNumber = len(model.vectors)

    # 使用Kmeans算法
    kmeans = KMeans(n_clusters=ClustersNumber,
                    random_state=0).fit(model.vectors)

    # cluster index of each word ID, in the range [0, ClustersNumber)
    label = kmeans.labels_
    # score of each word, i.e. the negative of its distance to the cluster center
    scores = []
    for i in xrange(WordNumber):
        scores.append(kmeans.score([model.vectors[i]]))

    # put word IDs belonging to the same cluster into the same list
    allCluster = []
    for i in xrange(ClustersNumber):
        allCluster.append([])
    for i in xrange(len(label)):
        allCluster[label[i]].append(i)

    # order two word IDs by comparing their entries in the scores array
    def comparator(a, b):

        vala = scores[a]
        valb = scores[b]

        if vala > valb: return 1
        elif vala == valb: return 0
        else: return -1

    # process each cluster separately
    for clusterId in xrange(len(allCluster)):
        output.write("-----------------------------------cluster " +
                     str(clusterId) + ":\n")

        # sort by score, from high to low
        allCluster[clusterId].sort(cmp=comparator, reverse=True)

        # take the top 30
        for x in allCluster[clusterId][:30]:
            # write the negative of the score, i.e. the distance
            output.write(model.vocab[x] + "  " + str(-scores[x]) + "\n")
    print '\n'
Example #10
def run():
    cluster_centers = load_prediction()
    test_data = load_test_data()
    k = KMeans(n_clusters=200)
    k.cluster_centers_ = cluster_centers
    score = k.score(test_data)
    print("Score: %f" % (score / len(test_data) * -1))
Example #11
def init_cluster(word_vectors):
    print(word_vectors.vectors)
    model = KMeans(n_clusters=3, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors)
    labels = model.labels_
    silhouette_score = metrics.silhouette_score(word_vectors.vectors, labels, metric='euclidean')
    print("Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):")
    print(model.score(word_vectors.vectors))
    print("Silhouette_score: ")
    print(silhouette_score)
    print(word_vectors.similar_by_vector(model.cluster_centers_[0], topn=50, restrict_vocab=None))
    print(word_vectors.similar_by_vector(model.cluster_centers_[1], topn=50, restrict_vocab=None))
    print(word_vectors.similar_by_vector(model.cluster_centers_[2], topn=50, restrict_vocab=None))
    y_kmeans = model.predict(word_vectors.vectors)
    plt.scatter(word_vectors.vectors[:, 0], word_vectors.vectors[:, 1], c=y_kmeans, s=50, cmap='viridis')
    centers = model.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
    plt.show()
    words = pd.DataFrame(word_vectors.vocab.keys())
    words.columns = ['words']
    words = words[words['words'].str.len() > 3].reset_index(drop=True)
    words['vectors'] = words['words'].apply(lambda x: word_vectors.wv[f'{x}'])
    words['cluster'] = words['vectors'].apply(lambda x: model.predict([np.array(x)]))
    words['cluster'] = words['cluster'].apply(lambda x: x[0])
    words['cluster_value'] = [1 if i == 0 else -1 if i == 1 else 0 for i in words['cluster']]
    words['closeness_score'] = words.apply(lambda x: 1 / (model.transform([x['vectors']]).min()), axis=1)
    words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']
    words.to_csv('metrics_results\\predictive_scores_{}.csv'.format(time_stamp), index=False)
    return words
Example #12
def runKMeans(cv_img, num_colors, init):

    imgdata = getimgdatapts(cv_img[-100:, :, :])  # FIX ME: arbitrary cut off
    kmc = KMeans(n_clusters=num_colors, max_iter=25, n_init=10, init=init)
    t1 = time.time()
    kmc.fit_predict(imgdata)
    t2 = time.time()
    print("fit time: %f" % (t2 - t1))
    trained_centers = kmc.cluster_centers_
    # print trained_centers
    labels = kmc.labels_
    labelcount = Counter()
    t1 = time.time()
    # IPython.embed()
    # for pixel in labels:
    # 	labelcount[pixel] += 1
    for i in np.arange(num_colors):
        labelcount[i] = np.sum(labels == i)

    t2 = time.time()
    print("counting labels time: %f" % (t2 - t1))
    # print labelcount
    # IPython.embed()
    score = kmc.score(imgdata)
    return trained_centers, labelcount, score
Example #13
def determineBestK(data, minRange=2, maxRange=-1, step=-1):
    if minRange < 0:
        minRange = 2
    if maxRange < 0:
        maxRange = len(data['names'])
    if maxRange < minRange:
        maxRange = len(data['names'])
        minRange = 2
    yVals = []
    if step < 0:
        step = max(1, int(np.floor((maxRange - minRange) / 5)))  # guard against a zero step
    for i in range(minRange, maxRange + 1, step):
        kmeans = KMeans(n_init=10,
                        max_iter=10000,
                        tol=1e-8,
                        verbose=0,
                        n_clusters=i,
                        algorithm="elkan").fit(data['data'])

        avgError = abs(kmeans.score(data['data'])) / len(data['data'])

        yVals.append(avgError)
    plt.title('Best K-mean')
    plt.plot(list(range(minRange, maxRange + 1, step)), yVals)
    plt.axis([minRange - .5, maxRange + .5, 0, yVals[0] + .5])
    plt.show()
Example #14
def train(weight, true_k=10):
    print('[INFO] traning with K=', true_k)

    clf = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1)
    clf.fit(weight)
    result = list(clf.predict(weight))
    return result, -clf.score(weight)
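A usage sketch, assuming weight is a TF-IDF matrix; the second return value is the inertia (the negated score):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['cats purr softly', 'dogs bark loudly', 'cats chase dogs']
weight = TfidfVectorizer().fit_transform(docs)
labels, inertia = train(weight, true_k=2)
print(labels, inertia)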
Example #15
def cluster(plot):
    with open('GAIA_DATA_small.csv', 'r') as csv_file:
        file_reader = csv.reader(csv_file,
                                 delimiter=',',
                                 quotechar='|',
                                 quoting=csv.QUOTE_MINIMAL,
                                 lineterminator='\n')
        X = []
        y = []
        for row in file_reader:
            temp_x = row[1]
            temp_y = row[2]
            if temp_x != '' and temp_y != '':
                X.append(temp_x)
                y.append(temp_y)
        X = np.array(X, dtype=float).reshape(-1, 1)
        y = np.array(y, dtype=float).reshape(-1, 1)
        data = np.hstack((X, y))
        train, test = train_test_split(X, shuffle=False)
        for i in range(10, 50, 5):
            kmeans = KMeans(n_clusters=i).fit(train)
            print(kmeans.score(test))
        if plot:
            # Plotting the data.  No obvious real clusters
            plt.scatter(X, y, alpha=0.1, marker='.')
            plt.show()
Example #16
def clusterize(x, n_clusters):
    X = x.reshape((len(x), 1))
    clusterer = KMeans(n_clusters=n_clusters)
    labels = clusterer.fit_predict(X)
    score = silhouette_score(X, labels)
    quality = clusterer.score(X)
    return Clustering(n_clusters, labels, quality, score)
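Clustering is not defined in this snippet; a minimal sketch, assuming it is a plain record type whose fields mirror the constructor call above:

from collections import namedtuple

Clustering = namedtuple('Clustering', ['n_clusters', 'labels', 'quality', 'score'])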
Example #17
def KMEANS(data, targets, dataset, n_classes): 
  print('APPLY KMEANS...')
  X_train, Y_train = data[0], targets[0]
  X_val, Y_val = data[1], targets[1]
  X_test, Y_test = data[2], targets[2]

  kmeans = KMeans(n_clusters=n_classes, random_state=0).fit(X_train)
  kmeans.labels_
  cluster_centers = kmeans.cluster_centers_
  print('Cluster Centers', cluster_centers.shape)

  prediction = kmeans.predict(X_test)
  print('Prediction', prediction.shape)
  scores = kmeans.score(X_test)

  acc = accuracy_score(y_true=Y_test, y_pred=prediction) 
  prec = sklearn.metrics.precision_score(y_true=Y_test, y_pred=prediction,  average='weighted' )
  rec = sklearn.metrics.recall_score(y_true=Y_test, y_pred=prediction,  average='weighted')
  class_report = sklearn.metrics.classification_report(y_true=Y_test, y_pred=prediction, output_dict=True)

  sens = class_report['1']['recall']
  spec = class_report['0']['recall']

  print('Test Accuracy, Precision, Recall', acc, prec, rec)
  print()

  return acc, prec, rec, sens, spec
Example #18
def plot_kmean_score_vs_k(x_pca, num_iterations=20):
    '''
    INPUT:
    x_pca - a dataframe pca data
    num_iterations - the number of iterations to fit KMeans clusters over
    
    OUTPUT:
    a chart showing KMeans score vs K
    '''
    # Over a number of different cluster counts...
    # run k-means clustering on the data and...
    # compute the average within-cluster distances.
    kmean_scores = []
    num_clusters = []
    for i in range(num_iterations):
        kmeans_i = KMeans(i + 1)
        model_i = kmeans_i.fit(x_pca)
        score_i = np.abs(kmeans_i.score(x_pca))
        kmean_scores.append(score_i)
        num_clusters.append(i + 1)

    # Investigate the change in within-cluster distance across number of clusters.
    # HINT: Use matplotlib's plot function to visualize this relationship.
    # now plot the scores
    plt.plot(num_clusters, kmean_scores, linestyle='--', marker='o', color='b')
    plt.xlabel('K')
    plt.ylabel('KMean Score')
    plt.title('KMean Score vs. K')
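A usage sketch on synthetic data reduced with PCA (the make_blobs data is an assumption for illustration):

import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA

X, _ = make_blobs(n_samples=500, centers=6, random_state=1)
x_pca = PCA(n_components=2).fit_transform(X)
plot_kmean_score_vs_k(x_pca, num_iterations=15)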
Example #19
    def classify(self):
        kmeans = KMeans(n_clusters=8, init='k-means++', n_init=10, max_iter=300, tol=0.0001,
                        precompute_distances='auto',
                        verbose=0, random_state=None, copy_x=True, n_jobs=1, algorithm='auto')
        kmeans.fit(self.finalList, self.classList)
        scoreKMeans = kmeans.score(self.testFinalList, self.testClassList)
        print("K-Means: ", scoreKMeans)
Example #20
def fit_plot_kmean_model(n_clusters, x):
    plt.xticks([])
    plt.yticks([])
    markers = ['o', '^', '*', 's']
    colors = ['r', 'b', 'y', 'k']

    clf = KMeans(n_clusters=n_clusters)
    clf.fit_predict(x)
    score = clf.score(x)  # the larger the absolute value, the worse the fit
    plt.title('k={},score={:.2f}'.format(n_clusters, score))

    labels = clf.labels_
    centers = clf.cluster_centers_
    for i in range(n_clusters):
        cluster = x[labels == i]
        plt.scatter(cluster[:, 0],
                    cluster[:, 1],
                    s=30,
                    c=colors[i],
                    marker=markers[i])
    plt.scatter(centers[:, 0],
                centers[:, 1],
                s=200,
                c='white',
                marker='o',
                alpha=0.9)
    for i, c in enumerate(centers):
        plt.scatter(c[0], c[1], s=50, marker='$%d$' % i, c=colors[i])
Example #21
def simpleKMeans(xlist, n_cluster=2, random_state=0, algorithm='auto'):
    """
    
    output:
        centroids: centroids acquired from clustering
        labels: clustering result of input array
        score: sum of distances from centroids to all the points in their cluster
        time: processing time of current k-means clustering
        algorithm:
            'full' for classical EM-style algorithm
            'elkan' is more efficient by using the triangle inequality, but
                not suitable for sparse data.
            'auto' will choose elkan for dense data and full for sparse data
        
    """
    if len(xlist) == 0:  #skip empty list
        return [], [], 1, 0

    starttime = time.time()
    kmeans = KMeans(n_clusters=n_cluster,
                    random_state=random_state,
                    algorithm=algorithm).fit(xlist)
    processtime = time.time() - starttime

    labels = kmeans.labels_
    centroids = kmeans.cluster_centers_
    score = -kmeans.score(xlist)

    return centroids, labels, score, processtime
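A usage sketch on random points:

import numpy as np

points = np.random.rand(50, 2)
centroids, labels, score, elapsed = simpleKMeans(points, n_cluster=3)
print('inertia %.3f in %.3fs' % (score, elapsed))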
Example #22
def explore(data):
  errList = []
  for i in range(2, int(len(data)**0.5)):
    km = KMeans(n_clusters=i)
    km.fit(data)
    err = abs(km.score(data))
    errList.append(err)
  return errList
Example #23
def diarization2(filename, n_speakers, winSize=WINSIZE):
    #mfcc
    print("---diarization2---")
    sr, signal = audioIo.read_audio_file(filename)
    mid_window = 1.28   # seconds
    mid_step = 0.08     # seconds
    short_window = 0.032  # seconds
    short_step = short_window * 0.5

    feature_all, _, _ = audioFeature.mid_feature_extraction(
        signal, sr, mid_window * sr, mid_step * sr, short_window * sr,
        short_step * sr)
    print("feature finally: ", feature_all.shape)

    #kmeans
    kmeans = KMeans(n_clusters=n_speakers, init='k-means++', random_state=0)
    kmeans.fit(feature_all.T)
    cls = kmeans.labels_
    segs, flags = audioMath.labels_to_segments(cls, mid_step)
    print("kmeans result:")
    for s in range(segs.shape[0]):
        print("{:.3f} {:.3f} {}".format(segs[s, 0], segs[s, 1], flags[s]))
    print("标签", len(cls), cls)
    #print("质心",kmeans.cluster_centers_)
    print("SSE", kmeans.inertia_)
    print("迭代次数", kmeans.n_iter_)
    print("分值", kmeans.score(feature_all.T))
Example #24
    def __buildClusterModel(self, k_, vectors):
        # Cluster again and group the results; KMeans does not support cosine distance
        kmeans = KMeans(n_clusters=k_, n_init=20, max_iter=500).fit(vectors)
        norm_factor = -vectors.shape[1]  # normalize by dictionary width so different models stay comparable
        groups = DataFrame({
            'C':
            kmeans.labels_,
            'S': [kmeans.score([v]) / norm_factor for v in vectors]
        }).groupby('C')
        alias = ['Type' + str(i) for i in range(k_)]  # cluster aliases, Type<i> by default, may be renamed manually
        proportions = groups.size() / len(vectors)  # fraction of all clustered vectors that fall in this cluster
        quantiles = np.array([
            groups.get_group(i)['S'].quantile(self.__Quantile,
                                              interpolation='higher')
            for i in range(k_)
        ])
        boundaries = groups['S'].agg('max').values - quantiles  # distance from the farthest point in each cluster to the quantile point
        for i in range(k_):
            if boundaries[i] > quantiles[i]:  # clamp boundaries that are too far out
                boundaries[i] = quantiles[i]
            elif boundaries[i] == 0:  # avoid 0/0
                boundaries[i] = 1e-100

        G.log.info(
            'Model(k=%d) built. inertia=%.3f, max proportion=%.2f%%, max quantile=%.3f, max border=%.3f',
            k_, kmeans.inertia_,
            max(proportions) * 100, max(quantiles), max(boundaries))
        return kmeans, alias, proportions, boundaries, quantiles
Example #25
    def clustering(self, data=None, n_clusters=None, apply_cluster_name=True):

        logger.info({"message": "Clustering phrases.",
                     "n_clusters": n_clusters, "apply_cluster_name": apply_cluster_name})

        if data is not None:
            self.data = data
        elif self.data_processed is not None:
            data = self.data_processed
        else:
            data = self.data

        if isinstance(n_clusters, int):
            self.n_clusters = n_clusters
        else:
            n_clusters = self.n_clusters

        X = apply_tfidf(data)

        # Initialize the clusterer with n_clusters value and a random generator for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=SEED)
        self.labels = clusterer.fit_predict(X)

        if apply_cluster_name:
            self.get_clusters_name(clean_texts=True)

        self.kmeans_score = clusterer.score(X)
        self.silhouette_score = silhouette_score(X, self.labels)

        return {"scores": {"kmeans_score": self.kmeans_score, "silhouette_score": self.silhouette_score}, "data": self.data, "labels": self.labels}
Example #26
def explore_k(svd_trans, k_range):
    '''
    Explores various values of k in KMeans

    Args:
        svd_trans: dense array with lsi transformed data
        k_range: the range of k-values to explore
    Returns:
        scores: list of intertia scores for each k value
    '''

    scores = []
    # spherical kmeans, so normalize
    normalizer = Normalizer()
    norm_data = normalizer.fit_transform(svd_trans)
    for k in k_range:
        km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1,
                    verbose=2)
        km.fit(norm_data)
        scores.append(-1*km.score(norm_data))
    plt.plot(k_range, scores)
    plt.xlabel('# of clusters')
    plt.ylabel('Inertia')
    sns.despine(offset=5, trim=True)
    return scores
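With the loop fixed to iterate k_range, a call might look like this (svd_trans being any dense LSI-transformed array):

k_range = np.arange(2, 15)
scores = explore_k(svd_trans, k_range)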
Example #27
def k_means_clustering(X, k=-1):
    """Perform k means algorithm."""

    if k == -1:
        begin = 5
        end = 1001
    else:
        begin = k
        end = k + 1

    for k in range(begin, end, 5):
        print(str(k) + " clusters started for k-means")
        kmeans = KMeans(n_clusters=k).fit(X)
        labels = kmeans.labels_

        clusters = []
        for i in range(k):
            row = []
            for j in range(len(labels)):
                if i == labels[j]:
                    row.append(j)
            clusters.append(row)

        results_file = open('results/k_means_' + str(k) + '.pickle', 'wb')
        pickle.dump(clusters, results_file)
        results_file.close()

        print(str(k) + " clusters completed. Score: " + str(kmeans.score(X)))
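The pickled result can be read back later, e.g. for k=5:

import pickle

with open('results/k_means_5.pickle', 'rb') as f:
    clusters = pickle.load(f)  # list of k lists of sample indices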
Example #28
    def run(self):
        df_pca = self.df_pca
        self.X_pca = df_pca.values[:, :self.settings.NUM_PRESSURE_LEVELS * 2]

        self.kmeans_objs = {}
        self.cluster_cluster_dists = {}
        self.df_labels = pd.DataFrame(index=self.df_pca.index)
        scores = []

        for n_clusters in self.settings.CLUSTERS:
            if n_clusters == self.settings.DETAILED_CLUSTER:
                # Only calculate all seeds for the detailed cluster.
                seeds = self.settings.RANDOM_SEEDS
            else:
                seeds = self.settings.RANDOM_SEEDS[:1]

            logger.info('Running for n_clusters = {}'.format(n_clusters))
            for seed in seeds:
                logger.debug('seed: {}'.format(seed))
                kmeans = KMeans(n_clusters=n_clusters, random_state=seed) \
                    .fit(self.X_pca[:, :self.n_pca_components])
                if seed == seeds[0]:
                    scores.append(kmeans.score(self.X_pca[:, :self.n_pca_components]))
                logger.debug(np.histogram(kmeans.labels_, bins=n_clusters - 1))

                cluster_cluster_dist = kmeans.transform(kmeans.cluster_centers_)
                ones = np.ones((n_clusters, n_clusters))
                cluster_cluster_dist = np.ma.masked_array(cluster_cluster_dist, np.tril(ones))
                self.cluster_cluster_dists[(n_clusters, seed)] = cluster_cluster_dist

                self.kmeans_objs[(n_clusters, seed)] = (self.n_pca_components, kmeans)
                label_key = 'nc-{}_seed-{}'.format(n_clusters, seed)
                self.df_labels[label_key] = kmeans.labels_
        self.scores = np.array(scores)
Example #29
def kmeans_binning(t, n_bins, n_trials=10):
    """
    Carries out kmeans binning
    
    Args:
        t (np.array): the template
        n_bins (int): the number of bins
        n_trials (int): the number of trials
    
    Returns:
        np.array: the binning vector
    """
    best_clustering = None
    best_score = None
    
    for _ in range(n_trials):
        kmeans = KMeans(n_clusters=n_bins, random_state=np.random.randint(100))
        kmeans.fit(t.reshape(-1, 1))
        score = kmeans.score(t.reshape(-1, 1))
        if best_score is None or score > best_score:
            best_score = score
            best_clustering = kmeans.labels_
    
    clusters = np.unique(best_clustering)
    for i in range(len(clusters)):
        for j in range(i+1, len(clusters)):
            if np.mean(t[best_clustering == clusters[i]]) < np.mean(t[best_clustering == clusters[j]]):
                tmp_clustering = best_clustering.copy()
                tmp_clustering[best_clustering == clusters[j]] = clusters[i]
                tmp_clustering[best_clustering == clusters[i]] = clusters[j]
                best_clustering = tmp_clustering
    
    return best_clustering
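A usage sketch on a random template:

import numpy as np

t = np.random.rand(200)
bins = kmeans_binning(t, n_bins=5)  # one bin label per element of t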
Example #30
def fit_plot_kmean_model(n_clusters, X):
    plt.xticks(())
    plt.yticks(())

    # fit with the k-means algorithm
    kmean = KMeans(n_clusters=n_clusters)
    kmean.fit_predict(X)

    labels = kmean.labels_
    centers = kmean.cluster_centers_
    markers = ['o', '^', '*', 's']
    colors = ['r', 'b', 'y', 'k']

    # compute the cost (score is the negated inertia)
    score = kmean.score(X)
    plt.title("k={}, score={}".format(n_clusters, int(score)))

    # plot the samples
    for c in range(n_clusters):
        cluster = X[labels == c]
        plt.scatter(cluster[:, 0], cluster[:, 1],
                    marker=markers[c], s=20, c=colors[c])
    # plot the cluster centers
    plt.scatter(centers[:, 0], centers[:, 1],
                marker='o', c="white", alpha=0.9, s=300)
    for i, c in enumerate(centers):
        plt.scatter(c[0], c[1], marker='$%d$' % i, s=50, c=colors[i])
Example #31
def main():
    # setting the hyper parameters
    import argparse
    parser = argparse.ArgumentParser(description='train',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--latent_d_file', default='results/vae/latent_d.csv')
    args = parser.parse_args()

    #latent_d = np.load(args.latent_d_file)
    latent_d = pd.read_csv(args.latent_d_file).values[:,1:]
    n_clusters = []
    scores = []


    for x in range(2,20,1):
      print(str(x)+" : "+str(int(1.4**x)), end = "\r")
      kmeans = KMeans(n_clusters=int(1.5**x), n_init=4)
      kmeans = kmeans.fit(latent_d[:int((latent_d.shape[0])*0.85)])
      n_clusters.append(int(1.5**x))
      scores.append(kmeans.score(latent_d[int((latent_d.shape[0])*0.85):]))


    plt.scatter(n_clusters, scores)
    plt.title("K-means score for K")
    plt.xlabel("K")
    plt.ylabel("sklearn K-means score")
    plt.show()
Example #32
    def _fit_KMeans(self, data, k):
        model = KMeans(n_clusters=k,
                       init="k-means++",
                       max_iter=100,
                       random_state=8).fit(data)
        centroids = model.cluster_centers_

        if (self.SHOW_KMEANS_PLOT):
            pca = PCA(n_components=2)
            transformed = pca.fit_transform(data)
            transformed_centroids = pca.transform(centroids)

            plt.figure(figsize=(8, 8))
            plt.scatter(transformed[:, 0],
                        transformed[:, 1],
                        marker='o',
                        c=model.labels_)
            plt.scatter(transformed_centroids[:, 0],
                        transformed_centroids[:, 1],
                        marker='x',
                        c='r')
            for i in range(len(data)):
                plt.annotate(i, (transformed[i, 0], transformed[i, 1]))
            for i in range(len(transformed_centroids)):
                plt.annotate(
                    i,
                    (transformed_centroids[i, 0], transformed_centroids[i, 1]))

            plt.grid(False)
            plt.title("PCA&K-Means: k={}, iteration={}, score={:5.2f}".format(
                k, k * 10, model.score(data)))
            plt.show()

        return model
Example #33
def Clustering():
    
    PCA_threshold = 0.8

    for team_id in team_dic.itervalues():
        BoF_Team = BoF[team_id]
        
        dim = np.shape(BoF_Team)[0]
        threshold_dim = 0
        for i in range(dim):
            pca = PCA(n_components = i)
            pca.fit(BoF_Team)
            X = pca.transform(BoF_Team)
            E = pca.explained_variance_ratio_
            if np.sum(E) > PCA_threshold:
                threshold_dim = len(E)
                print 'Team' + str(team_id) + ' dim:%d' % threshold_dim
                break
    
        pca = PCA(n_components = threshold_dim)
        pca.fit(BoF_Team)
        X = pca.transform(BoF_Team)
    
        min_score = 10000
        for i in range(100):
            model = KMeans(n_clusters=K, init='k-means++', max_iter=300, tol=0.0001).fit(X)
            if min_score > model.score(X):
                min_score = model.score(X)
                labels = model.labels_
        print min_score
    
        pca = PCA(n_components = 2)
        pca.fit(BoF_Team)
        X = pca.transform(BoF_Team)
        for k in range(K):
            labels_ind = np.where(labels == k)[0]
            plt.scatter(X[labels_ind,0], X[labels_ind,1], color=C[k])
        plt.legend(['C0','C1','C2','C3','C4'], loc=4)
    
        plt.title('Team' + str(team_id) + '1_PCA_kmeans')
        plt.savefig('Seq_Team' + str(team_id)+ '/Team' + str(team_id) + '_PCA_kmeans.png')
        plt.show()
        plt.close()
        np.savetxt('Seq_Team' + str(team_id) + '/labels_Team' + str(team_id) + '.csv', \
                   labels, delimiter=',')
Example #34
def findKForKMeans(X):
    graph = []
    for i in range(2, 200):
        km = KMeans(n_clusters=i)
        km.fit(X)
        y = km.score(X)
        graph.append(y)
        print i, y
    print graph
Example #35
def cluster(train_latents, train_labels, test_latents, test_labels):
    num_classes = np.shape(train_labels)[-1]
    labels_hot = np.argmax(test_labels, axis=-1)
    train_latents = np.reshape(train_latents,
                               newshape=[train_latents.shape[0], -1])
    test_latents = np.reshape(test_latents,
                              newshape=[test_latents.shape[0], -1])
    kmeans = KMeans(init='random', n_clusters=num_classes,
                    random_state=0, max_iter=1000, n_init=FLAGS.n_init,
                    n_jobs=FLAGS.n_jobs)
    kmeans.fit(train_latents)
    print(kmeans.cluster_centers_)
    print('Train/Test k-means objective = %.4f / %.4f' %
          (-kmeans.score(train_latents), -kmeans.score(test_latents)))
    print('Train/Test accuracy %.4f / %.3f' %
          (error(np.argmax(train_labels, axis=-1), kmeans.predict(train_latents), k=num_classes),
           error(np.argmax(test_labels, axis=-1), kmeans.predict(test_latents), k=num_classes)))
    return error(labels_hot, kmeans.predict(test_latents), k=num_classes)
Example #36
def predictKMeans(X, y):
	col_mean = np.nanmean(X,axis=0)
	inds = np.where(np.isnan(X))
	X[inds]=np.take(col_mean,inds[1])
	km = KMeans(n_clusters=2)

	X_train, X_test, y_train, y_test = chooseRandom(X, y)
	km.fit(X_train, y_train)
	return km.score(X_test, y_test)
Example #37
def experiment(nexperiments, nclusters):
    E_ins = []
    for _ in range(nexperiments):
        kmeans = KMeans(nclusters, max_iter=300, n_init=1, init='random')
        kmeans.fit(X_train)
        score = kmeans.score(X_train)
        E_in = -score / nsamples
        E_ins.append(E_in)
    return E_ins
Example #38
def showKMeans(X, N):
    scores = []
    for number in xrange(N / 6, N / 2):
        clustering = KMeans(n_clusters=number, max_iter=MAX_ITER, n_init=N_INIT, n_jobs=N_JOBS )
        clustering.fit_predict(X)
        scores.append(clustering.score(X))
    plt.plot(scores)
    plt.xlabel(XLABEL)
    plt.ylabel(YLABEL)
    plt.show()
Example #39
def kmean_score(X,nclust):
    '''
    calculate kmeans score
    :param X:numpy array, data set to cluster
    :param nclust: int, number of cluster
    :return: float
    '''
    km = KMeans(nclust)
    km.fit(X)
    rss = -km.score(X)
    return rss
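Computed over a range of k, this yields a standard elbow curve; a one-line usage sketch, assuming X is any 2-D feature array:

rss_curve = [kmean_score(X, k) for k in range(1, 11)]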
Example #40
def partition_gene_kmeans(geneToCases, patientToGenes, gene_list, num_components, num_bins, title=None, do_plot=True):

    # get gene index mapping
    giv = getgiv(geneToCases.keys(), gene_list)

    # convert patients into vectors
    patientToVector = getpatientToVector(patientToGenes, giv)

    vectors = patientToVector.values()

    print vectors[0]
    print "Length of vectors is ", len(vectors[0])

    km = KMeans(num_components)

    km.fit(vectors)

    clusterToPatient = {}

    for patient in patientToVector:
        cluster = km.predict([patientToVector[patient]])[0]  # wrap in a list: predict expects 2-D input
        if cluster not in clusterToPatient:
            clusterToPatient[cluster] = set()
        clusterToPatient[cluster].add(patient)

    # plot patients in each cluster


    if do_plot:
        bins = range(0, max([len(p_gene) for p_gene in patientToGenes.values()]), max([len(p_gene) for p_gene in patientToGenes.values()])/num_bins)
        plt.figure()
        for cluster in clusterToPatient:
            plt.hist([len(patientToGenes[p]) for p in clusterToPatient[cluster]], bins=bins, label=str(cluster), alpha = 1.0/num_components)
        plt.xlabel('# Somatic Mutations In Tumor', fontsize=20)
        plt.ylabel('Number of Samples', fontsize=20)
        plt.legend()
        plt.title("Kmeans size " + str(num_components), fontsize=20)
        plt.show()



    data = {}
    data['Score'] = km.score(vectors)
    data['Number'] = num_components
    data['% Explained'] = np.round([100 * len(clusterToPatient[cluster]) * 1.0 / len(patientToGenes) for cluster in clusterToPatient], 2)
    data['Vector size'] = len(vectors[0])
    # data['Covariates'] = np.round(g.covars_,2)
    # data["Total log probability"] = sum(g.score(obs))
    # data["AIC"] = g.aic(obs)
    # data["BIC"] = g.bic(obs)
    # data['Explained'] = [np.round([len([in_w for in_w in respon if in_w[i] == max(in_w)]) * 1.0 /len(respon) for i in range(num_components)], 2)]

    return data
Example #41
def cluster():
    f=open("./forum/flu.txt")
    flu =[]
    for line in f.readlines():
        flu.append(line)

    f.close()
    

    vectorizer = TfidfVectorizer(sublinear_tf= True,min_df=0,max_df=1.0,ngram_range=(1,1),smooth_idf=True,use_idf=1,strip_accents=None)
    x=vectorizer.fit_transform(flu)

    n_samples,n_features=x.shape

    print n_samples,n_features

    kmeans =KMeans(n_clusters=4, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances=True, verbose=0, random_state=None, copy_x=True, n_jobs=1)
    kmeans.fit(x)
    print kmeans.score(x)
    c=kmeans.predict(x)

    f1=open('./forum/1.txt','w')
    f2=open('./forum/2.txt','w')
    f3=open('./forum/3.txt','w')
    f4=open('./forum/4.txt','w')
    for i in range(0,len(c)):
        if c[i]== 0:
            f1.write('%s'%(flu[i]))
        elif c[i]==1:
            f2.write('%s'%(flu[i]))
        elif c[i]==2:
            f3.write('%s'%(flu[i]))
        else:
            f4.write('%s'%(flu[i]))
    f1.close()
    f2.close()
    f3.close()
    f4.close()
Example #42
def test_kmeans():
    X = loadmat('ex7/ex7data2.mat')['X']
    n_init = int(X.shape[0] ** 0.5)
    Js = []
    Ks = range(1, 11)
    for k in Ks:
        km = KMeans(n_clusters=k, n_init=n_init, n_jobs=-1)
        km.fit(X)
        Js.append(km.score(X))
#    plotJ(Ks,Js)
    bestK = best_accelation(Js)
    km = KMeans(n_clusters=bestK, n_init=n_init, n_jobs=-1)
    km.fit(X)
    plt.clf()
    plt.scatter(*np.split(X, 2, axis=1), c='g')
    plt.scatter(*np.split(km.cluster_centers_, 2, axis=1), c='b', marker='D')
    plt.show()
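best_accelation is not defined here; a plausible sketch, assuming it picks the k where the inertia curve bends most sharply (largest second difference):

import numpy as np

def best_accelation(Js):
    inertia = -np.asarray(Js)         # Js are negated inertias from score()
    accel = np.diff(inertia, 2)       # second difference = curve "acceleration"
    return int(np.argmax(accel)) + 2  # offset: two diffs, and Ks starts at 1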
Example #43
def _runKmeans(train_data):
	#print ('Running Kmeans Clustering...')
	num_clusters = 5
	model = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)
	
	max_score = 0
	iteration = 20
	best_classification = []
	for i in range(1,iteration): 
	#	print "Iteration number "+str(i)
		model.fit(train_data)
		score = model.score(train_data)
		
		if i == 1 or score > max_score:
			max_score = score
			best_classification =  model.predict(train_data)
	
	#print ("Done!")
	return best_classification.tolist()
Example #44
def survey_n_clusters(data, n_kernels, path_out_clustering, df_original):
	from sklearn.cluster import KMeans
#	from sklearn.metrics import silhouette_score
	path_out_clustering = path_out_clustering[:-4] + '_' + str(n_kernels) + 'clusters.csv'
	model = KMeans(n_clusters=n_kernels, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=1)
	label = model.fit_predict(data)

#	silhouette_avg = silhouette_score(data, label)
#	print("For n_clusters =", n_kernels, "The average silhouette_score is : ", silhouette_avg)
#	The above evaluation throw feedback "Killed: 9"
## Step 2.1 Output the labeled data, combining with the original data that is before one-hot-encoder
	print ("For n_clusters =", n_kernels, "The summation of all labels is : ", sum(label))
	print ("For n_clusters =", n_kernels, "The total number of non-zero labels is : ", sum(label > 0))
	print ("For n_clusters =", n_kernels, "The built-in score is : ", model.score(data))

	label = label.reshape(len(label),1)
	df_out = pd.DataFrame(label, columns = ['label'])
	df_out = pd.concat([df_original, df_out], axis = 1)
	df_out.to_csv(path_out_clustering, index = False)
Example #45
def runKMneas(listOfTrainComments, listOfTestComments, listOfUniqueTokens):
	xTrain = []
	yTrain = []
	for i in range(len(listOfTrainComments)):
		BOW = generateBOW(listOfTrainComments[i], listOfUniqueTokens)
		xTrain.append(BOW)
		yTrain.append(listOfTrainComments[i].getStatus())

	xTest = []
	yTest = []
	for i in range(len(listOfTestComments)):
		BOW = generateBOW(listOfTestComments[i], listOfUniqueTokens)
		xTest.append(BOW)
		yTest.append(listOfTestComments[i].getStatus())

	clf = KMeans(n_clusters=2, max_iter = 300)
	clf.fit(xTrain, yTrain)
	score = clf.score(xTest)
	prediction = clf.predict(xTest)
	print('K-means Clustering, Score - ' + str(score), '\n')
Example #46
def runKmeans(data_file):
	train_data = csv_io.read_data(data_file)
	print len(train_data)
	num_clusters = 10
	model = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)
	
	max_score = 0
	iteration = 2
	best_classification = []
	for i in range(1,iteration): 
		print "Iteration number "+str(i)
		model.fit(train_data)
		score = model.score(train_data)
		
		if i == 1 or score > max_score:
			max_score = score
			best_classification =  model.predict(train_data)
	
	print len(best_classification.tolist())
	return best_classification.tolist()
Example #47
def train_test(preprocessed_data):
	'''
	Trains a KMeans clusterer from sklearn on preprocessed_data,
	a list of lists where each inner list is one example (label first,
	features after). Note that there are no important parameters to tweak.
	'''
	all_labels = list(person[0] for person in preprocessed_data)
	all_features = list(person[1:] for person in preprocessed_data)

	n = 100

	km_classifier = KMeans(n_clusters=5, n_init=n, max_iter=300, tol=0.0001, precompute_distances=True, n_jobs=2)

	predictions = km_classifier.fit_predict(all_features).tolist()

	# disgusting Python to get rid of a runtime error due to the score method wanting float instead of int64 input
	neg_kmeans = km_classifier.score([[float(feature) for feature in person] for person in all_features])

	print '\nNegated KMeans function value:', neg_kmeans
	print 'Data distribution by label:', Counter(all_labels)
	print 'Data Distribution by cluster:', Counter(predictions), '\n'
Example #48
def runKMeans(cv_img, num_colors, init):
	imgdata = getimgdatapts(cv_img[-100:,:,:])
	kmc = KMeans(n_clusters=num_colors, max_iter=25, n_init=10, init=init)
	t1 = time.time();
	kmc.fit_predict(imgdata)
	t2 = time.time();
	print("fit time: %f"%(t2-t1))
	trained_centers = kmc.cluster_centers_
	# print trained_centers
	labels = kmc.labels_
	labelcount = Counter()
	t1 = time.time();
	# IPython.embed()
	# for pixel in labels:
	# 	labelcount[pixel] += 1
	for i in np.arange(num_colors):
		labelcount[i]=np.sum(labels==i)

	t2 = time.time();
	print("counting labels time: %f"%(t2-t1))
	# print labelcount
	# IPython.embed()
	score=kmc.score(imgdata)
	return trained_centers, labelcount,score
Example #49
#f = open('data.txt')
#X = np.array([[float(l.strip())] for l in f.readlines()])

X = np.concatenate((
        np.random.normal(0, 1, 10),
        np.random.normal(5, .5, 10),
        np.random.normal(10, 1, 10)
    ))
X = np.array([[x] for x in X])

ks = range(1, 10)
kmeans_scores = []
for k in ks:
    km = KMeans(n_clusters=k)
    km.fit(X)
    kmeans_scores.append(-km.score(X))

gmm_scores = []
bic_scores = []
for k in ks:
    gmm = GMM(n_components=k)
    gmm.fit(X)
    gmm_scores.append(-gmm.score(X).sum())
    bic_scores.append(gmm.bic(X))

_, kmeans_gaps = gap.kmeans_gap(X, nrefs=50, max_clusters=10)
_, gmm_gaps = gap.gmm_gap(X, nrefs=50, max_clusters=10)

fig = plt.figure(figsize=(3*6, 2*4))
ax = fig.add_subplot(231)
ax.plot(ks, kmeans_scores, 'b-o', lw=2, ms=8)
Example #50
ks  = [2,4,20]

for run,k in enumerate(ks):
    km  = KMeans(n_clusters=k)  #new KMeans object
    km.fit(data)                #fit it to the data
    rain    = cm.rainbow(np.linspace(0,1,k)) #assign each cluster index a matplotlib color
    clrs    = [rain[i] for i in km.labels_] #transform label indices into colors
    
    fig     = plt.figure(8+run)     #new figure
    ax      = fig.add_subplot(111)  
    ax.scatter(data[:,0],data[:,1],c=clrs)  #plot the data coloring by cluster
    plt.savefig("kmeans_"+str(k))



#show the change in clustering "score" as k increases
ks  = range(2,100,4)
scores  = []
for run,k in enumerate(ks):
    km  = KMeans(n_clusters=k)
    km.fit(data)
    scores.append(-km.score(data))
    
fig     = plt.figure(20)
ax      = fig.add_subplot(111)
ax.scatter(ks,scores)
ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("Sum of Distances to Centroids")
plt.savefig("kmeansScores")

Example #51
# -*- coding: utf-8 -*-
"""
Created on Tue Aug  4 18:05:01 2015

@author: rohankoodli
"""

#import numpy as np
import matplotlib.pyplot as plt
#from scipy import stats

import seaborn
seaborn.set()

from sklearn.datasets.samples_generator import make_blobs

X,y = make_blobs(n_samples=150,centers=4,random_state=0,cluster_std=1.0)

plt.scatter(X[:,0],X[:,1],s=70)
plt.plot()

from sklearn.cluster import KMeans

est = KMeans(5)
est.fit(X)
y_kmeans = est.predict(X)
plt.scatter(X[:,0],X[:,1],c=y_kmeans,s=90,cmap='rainbow')
plt.plot()
print est.score(X,y)
Example #52
def kmeans(k, X):
    km = KMeans(k)
    km.fit(X)
    obj_value = np.abs(km.score(X))
    return obj_value
Example #53
def Clustering(BoF_Team1, BoF_Team2):
    t0 = time()

    PCA_threshold = 0.8

    # --Team1--
    dim = np.shape(BoF_Team1)[0]
    threshold_dim = 0
    for i in range(dim):
        pca = PCA(n_components=i)
        pca.fit(BoF_Team1)
        X = pca.transform(BoF_Team1)
        E = pca.explained_variance_ratio_
        if np.sum(E) > PCA_threshold:
            threshold_dim = len(E)
            print "Team1 dim:%d" % threshold_dim
            break

    pca = PCA(n_components=threshold_dim)
    pca.fit(BoF_Team1)
    X = pca.transform(BoF_Team1)

    min_score = 10000
    for i in range(200):
        model = KMeans(n_clusters=K, init="k-means++", max_iter=1000, tol=0.0001).fit(X)
        if min_score > model.score(X):
            min_score = model.score(X)
            labels_Team1 = model.labels_

    pca = PCA(n_components=2)
    pca.fit(BoF_Team1)
    X = pca.transform(BoF_Team1)
    for k in range(K):
        labels_Team1_ind = np.where(labels_Team1 == k)[0]
        plt.scatter(X[labels_Team1_ind, 0], X[labels_Team1_ind, 1], color=C[k])

    plt.title("Team1_PCA_kmeans")
    # plt.legend()
    plt.savefig("Seq_Team1/Team1_PCA_kmeans.png")
    # plt.show()
    plt.close()
    np.savetxt("Seq_Team1/labels_Team1.csv", labels_Team1, delimiter=",")

    # --Team2--
    dim = np.shape(BoF_Team2)[0]
    threshold_dim = 0
    for i in range(dim):
        pca = PCA(n_components=i)
        pca.fit(BoF_Team2)
        X = pca.transform(BoF_Team2)
        E = pca.explained_variance_ratio_
        if np.sum(E) > PCA_threshold:
            threshold_dim = len(E)
            print "Team2 dim:%d" % threshold_dim
            break

    min_score = 10000
    for i in range(200):
        model = KMeans(n_clusters=K, init="k-means++", max_iter=1000, tol=0.0001).fit(X)
        if min_score > model.score(X):
            min_score = model.score(X)
            labels_Team2 = model.labels_

    pca = PCA(n_components=2)
    pca.fit(BoF_Team2)
    X = pca.transform(BoF_Team2)
    for k in range(K):
        labels_Team2_ind = np.where(labels_Team2 == k)[0]
        plt.scatter(X[labels_Team2_ind, 0], X[labels_Team2_ind, 1], color=C[k])

    plt.title("Team2_PCA_kmeans")
    plt.savefig("Seq_Team2/Team2_PCA_kmeans.png")
    # plt.show()
    plt.close()
    np.savetxt("Seq_Team2/labels_Team2.csv", labels_Team2, delimiter=",")

    print "time:%f" % (time() - t0)
    return labels_Team1, labels_Team2
Example #54
def compute_scores(normal_users, queue, Ks=[]):

    '''
        Calculates the novelty scores (noise and strangeness) for the 4 algorithms.
        Receives the list of normal users, the queue (all users) and the list of curiosity factors Ks.
        Updates the global variables GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s, K_s with the results.
    '''
    
    global GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s, K_s  # novelty scores per algorithm: *_n for noise, *_s for strangeness

    GMM_n = []
    one_n = []
    lsa_n = []
    K_n = []
    GMM_s = []
    one_s = []
    lsa_s = []
    K_s = []

    K_GMM_n, K_KMeans_n, K_GMM_s, K_KMeans_s = Ks #K_GMM_n, K_KMeans_n are the noise curiosity factors for each algorithm
                                                  #K_GMM_s, K_KMeans_s are the strangeness curiosity factors for each algorithm
                                                  #Ks is a list containing the 4 above mentioned parameters
    

    '''

    For One_class_SVM and LSA, predicting the new entry directly returns a label:
        LSA: 'anomaly' or 0 (normal)

        One_class_SVM: -1 (anomaly) or 1 (normal)

    GMM and K-means return a fitting score instead. The novelty score is obtained by computing the
    z-score of the new entry against the scores of all other entries, via the function
    get_score_last_item (a sketch of it follows this example).
        If the returned z-score is >= 1, the new entry is anomalous.

    '''

    '''
    Noise scores are computed with the queue as the base of knowledge, fitting all the entries but the last to the algorithm
    '''                                    
    B = GMM(covariance_type='full', n_components = 1)
    B.fit(queue[0:-1])
    x = [B.score([i]).mean() for i in queue]
    GMM_n.append(get_score_last_item(x, K_GMM_n))


    K = KMeans(n_clusters=1)
    K.fit(queue[0:-1])
    x = [K.score([i]) for i in queue]
    K_n.append(get_score_last_item(x, K_KMeans_n))

    oneClassSVM = OneClassSVM(nu=0.1)
    oneClassSVM.fit(queue[0:-1])
    x = oneClassSVM.predict(np.array([queue[-1]]))
    if x == -1:
        one_n.append(1)
    if x == 1:
        one_n.append(0)
    
    X = np.array(queue[0:-1])
    anomalymodel = lsanomaly.LSAnomaly()
    anomalymodel.fit(X)
    x = anomalymodel.predict(np.array([queue[-1]])) 
    if x == ['anomaly']:
        lsa_n.append(1)
    if x == [0]:
        lsa_n.append(0)

    '''
    Strangeness scores are computed with the normal users as the base of knowledge, fitting normal users to the algorithm
    ''' 

    normal_and_new = normal_users + [queue[-1]] #List to be passed to get_score_last_item to calculate the zscore of the last item, the new entry

    B = GMM(covariance_type='full', n_components = 1)
    B.fit(normal_users)
    x = [B.score([i]).mean() for i in normal_and_new]
    GMM_s.append(get_score_last_item(x, K_GMM_s))


    K = KMeans(n_clusters=1)
    K.fit(normal_users)
    x = [K.score([i]) for i in normal_and_new]
    K_s.append(get_score_last_item(x, K_KMeans_s))

    oneClassSVM = OneClassSVM(nu=0.1)
    oneClassSVM.fit(normal_users)
    x = oneClassSVM.predict(np.array([queue[-1]]))
    if x == -1:
        one_s.append(1)
    if x == 1:
        one_s.append(0)

    anomalymodel = lsanomaly.LSAnomaly()
    X = np.array(normal_users)
    anomalymodel.fit(X)
    x = anomalymodel.predict(np.array([queue[-1]])) 
    if x == ['anomaly']:
        lsa_s.append(1)
    if x == [0]:
        lsa_s.append(0)

    return GMM_n, one_n, lsa_n, K_n, GMM_s, one_s, lsa_s, K_s
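get_score_last_item is referenced throughout but never shown; a plausible sketch, assuming it z-scores the newest entry against the others and divides by the curiosity factor K so the caller can test against 1:

import numpy as np

def get_score_last_item(scores, K):
    base = np.asarray(scores[:-1], dtype=float)
    z = abs(scores[-1] - base.mean()) / (base.std() + 1e-12)
    return z / K  # a returned value >= 1 marks the new entry as anomalous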
Example #55
    y_train_non_default = y_train[zero_train]
    y_train_default = y_train[none_zero_train]

    print "pre-processing train data..."
    scalar = preprocessing.StandardScaler().fit(x_train_default)
    x_train = scalar.transform(x_train_default)

    # if mode == 'test':
    #     print 'pre-processing test data...'
    #     x_test = scalar.transform(x_test)

    for k in range(1, 50):
        classifier = KMeans(n_clusters=k, n_init=10, max_iter=300, tol=0.0001, precompute_distances=True)

        classifier.fit(x_train_default)
        score = classifier.score(x_train_default)
        print k, "cluster:", score

        predicts = classifier.predict(x_train_default)
        count = {}
        loss_value = [[] for i in range(k)]

        for i in xrange(len(predicts)):
            count[predicts[i]] = count.get(predicts[i], 0) + 1
            loss_value[predicts[i]].append(y_train_default[i])  # = loss_value.get(predicts[i], 0) + y_train_default[i]

        for i in range(len(loss_value)):
            mean = np.mean(loss_value[i])
            std = np.std(loss_value[i])
            print "cluster", i, ": count =", count[i], ", mean =", mean, ", std =", std
Example #56
points = np.array(data.loc[:, ['X', 'Y']])
dis = []
columns_name = []
for cat_data in data_grouped:
    tmp = cat_data[1].loc[:, ['X', 'Y']]
    if len(tmp) < 100:
        continue
    print "For %d: " % cat_data[0]
    columns_name.append("To%d" %cat_data[0])
    group = tmp.groupby(lambda x:random.randint(0,1))
    train_data = group.get_group(0)
    test_data = group.get_group(1)
    score = float('inf')
    K = 0
    for i in range(1, 8):
        oracle = KMeans(init='k-means++', n_clusters=i, n_init=10)
        oracle.fit(train_data)
        score_val = oracle.score(test_data)
        score_val = math.fabs(math.fabs(score_val) - oracle.inertia_) / oracle.inertia_
        if score_val < score:
            score = score_val
            K = i
    best = KMeans(init='k-means++', n_clusters=K, n_init=10)
    best.fit(tmp)
    label = best.predict(points)
    belong_point = best.cluster_centers_[label]
    dis.append(np.power(np.sum(np.power(belong_point - points,2), axis=1), 0.5))
pd.DataFrame(np.column_stack(dis), columns=columns_name).to_csv('vec.csv', index=False)

Example #57
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris

iris = load_iris()
x = iris.data

km1 = KMeans(init='k-means++', n_clusters=3)
km1.fit(x)
print km1.labels_
print km1.cluster_centers_
print km1.score(x)

km2 = KMeans(init='random', n_clusters=3)
km2.fit(x)
print km2.labels_
print km2.cluster_centers_
print km2.score(x)

Example #58
def new_makeClusterKMeans(valueListFilename):
	maxNoOfClusters = 20
	for noOfClusters in range(maxNoOfClusters):
		noOfClusters += 1
		valueListFile = open(valueListFilename, 'r')

		partailFitTrainSize = 100
		#noOfClusters = 3
		lineNo = 0

		kmeans = KMeans(n_clusters = noOfClusters, max_iter=100)#MiniBatchKMeans(n_clusters = noOfClusters, max_iter=100, batch_size=100)

		trainValueList = []
		for line in valueListFile:
			lineNo += 1
			#if (lineNo % partailFitTrainSize == 0):
			'''	trainValueList = np.array(trainValueList)
				kmeans.partial_fit(trainValueList)
				trainValueList = []
			'''
			valueList = json.loads(line)
			tempList = []
			for col in clusterAttrList:
				tempList.append(valueList[col])
			trainValueList.append(tempList)
		
		#if (lineNo % partailFitTrainSize != 0):
		trainValueList = np.array(trainValueList)
		#kmeans.partial_fit(trainValueList)
		kmeans.fit(trainValueList)

		valueListFile.close()

		centroids = kmeans.cluster_centers_
		labels = kmeans.labels_
		
		print('centroids = ', centroids)
		print('labels = ', len(labels))

		valueListFile = open(valueListFilename, 'r')
		
		ansList = []

		xAxis = 0
		yAxis = 5

		scoreSum = 0
		lineNo = 0
		for line in valueListFile:
			valueList = json.loads(line)
			'''
			tempList = []
			for col in clusterAttrList:
				tempList.append(valueList[col])
			'''
			#ansList.append(kmeans.predict([tempList])[0])
			#thisLabel = kmeans.predict([tempList])[0]
			plt.plot(valueList[xAxis], valueList[yAxis], colors[labels[lineNo]], markersize = 10)
			#scoreSum += kmeans.score([tempList])
			lineNo += 1
		
		scoreSum = kmeans.score(trainValueList)
		
		plt.xlabel(trainAttrList[xAxis])
		plt.ylabel(trainAttrList[yAxis])

		#plt.scatter(centroids[:, 0], centroids[:, 1], marker = 'x', s = 150, linewidths = 5, zorder = 10)

		plt.show()
		

		valueListFile.close()
		print('noOfClusters = ', noOfClusters, '      sum = ', scoreSum)