コード例 #1
ファイル: clustering.py プロジェクト: sampwing/nlp_clustering
def main():
    """Cluster the stored questions and interactively suggest similar ones.

    Currently uses k-means to cluster the data; k still needs to be found
    automatically because the gathered data will be dynamic (maybe only
    recalculated at batch time).

    TODO: look into using EM instead of k-means.

    Reads one question per line from the file ``data``, clusters them with
    ``milk.kmeans`` and then loops on user input: every typed question is
    clustered together with the stored ones and up to five stored questions
    sharing its cluster are printed. Typing ``x`` or ``q`` quits; an empty
    line re-prompts.
    """
    filename = "data"
    # Use a context manager so the file handle is closed instead of leaked.
    with open(filename, "r") as fd:
        questions = [line.strip() for line in fd]

    data = Dataset()
    features = data.populate(questions)
    k = 2
    # using unigrams successfully classifies every question
    cluster_ids, centroids = milk.kmeans(features, k)
    print(cluster_ids)
    while True:
        raw = raw_input("ask a question?")
        if raw == "x" or raw == "q":
            break  # explicit quit command
        if raw == "":
            continue  # nothing typed, ask again
        # wont update the data used initially to find unigrams
        f = data.featurevector(raw)
        query = features + [f]
        cluster_ids, centroids = milk.kmeans(query, k + 1)
        print("***** did you mean ******")
        # The query is the last row, so its cluster id is the last id.
        query_cluster = cluster_ids[-1]
        results = [
            (cid, question)
            for cid, question in zip(cluster_ids[:-1], questions)
            if cid == query_cluster
        ]
        if not results:
            print("** no similar questions have been asked **")
        for _, result in results[:5]:
            print(result)
        print("*************************")
コード例 #2
ファイル: kmeans.py プロジェクト: rcurtin/benchmarks
    def RunKMeansMilk():
      # Time a k-means run on the dataset(s) named by self.dataset.
      #
      # Returns the elapsed time reported by totalTimer, or -1 when option
      # validation fails or the clustering step raises. Raises Exception when
      # unknown keys are left in `options`.
      #
      # NOTE(review): `self` and `options` are not parameters of this function;
      # presumably they are closed over from an enclosing method -- confirm.
      totalTimer = Timer()

      # Load input dataset.
      # If the dataset contains two files then the second file is the centroids
      # file.
      Log.Info("Loading dataset", self.verbose)
      if len(self.dataset) == 2:
        data = np.genfromtxt(self.dataset[0], delimiter=',')
        centroids = np.genfromtxt(self.dataset[1], delimiter=',')
        # NOTE(review): as written this line is still inside the two-file
        # branch and immediately overwrites `data`; an `else:` appears to be
        # missing from this excerpt.
        data = np.genfromtxt(self.dataset, delimiter=',')

      # Gather parameters.
      # Recognised options are popped so that anything left over is unknown.
      clusters = None
      if "clusters" in options:
        clusters = options.pop("clusters")
      maxIterations = None
      if "max_iterations" in options:
        maxIterations = options.pop("max_iterations")
      if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

      # Now do validation of options.
      # A cluster count is only mandatory when no centroids file was supplied.
      if not clusters and len(self.dataset) != 2:
        Log.Fatal("Required option: Number of clusters or cluster locations.")
        return -1
      # The `not clusters` case was already handled above, so this branch
      # effectively rejects int(clusters) < 1.
      elif (not clusters or int(clusters) < 1) and len(self.dataset) != 2:
        Log.Fatal("Invalid number of clusters requested! Must be greater than"
            + " or equal to 1.")
        return -1

      # Iteration cap: 1000 unless max_iterations was given.
      m = 1000 if not maxIterations else int(maxIterations)

        # NOTE(review): the lines below are indented one level deeper and are
        # followed by `except`, so a `try:` appears to be missing here; both
        # `kmeans(` calls are also cut off mid-argument in this excerpt.
        # Create the KMeans object and perform K-Means clustering.
        with totalTimer:
          if len(self.dataset) == 2:
            assignments = kmeans(data,
            assignments, centroids = kmeans(data,

      except Exception as e:
        Log.Fatal("Exception: " + str(e))
        return -1

      time = totalTimer.ElapsedTime()
      return time
コード例 #3
ファイル: precluster.py プロジェクト: arnaudsj/milk
 def execute(self):
     """Run k-means jobs pulled from ``self.inq`` until told to shut down.

     Each queue item is a pair ``(k, ri)``. ``k`` is passed to
     ``milk.kmeans`` as the number of clusters and ``k*1024+ri`` as its
     ``R`` argument. The item ``k == 'shutdown'`` ends the loop.
     """
     import milk
     while True:
         k,ri = self.inq.get()
         if k == 'shutdown':
             # Sentinel item: stop consuming jobs. The original `if` had no
             # body, which is a syntax error.
             return
         _,centroids = milk.kmeans(self.features, k=k, R=(k*1024+ri))
         # NOTE(review): `centroids` is not used in the visible code; the
         # statement that hands the result back to the caller appears to be
         # missing from this excerpt -- confirm against the full file.
コード例 #4
    def train(self, features, labels, **kwargs):
        """Learn a k-means codebook and train the default learner on it.

        The first element of every ``features`` entry is pooled (empty
        arrays are skipped), the pool is subsampled with stride
        ``self.sample`` and clustered with ``milk.kmeans`` (``R=123``).
        The number of clusters is ``self.k`` when set, otherwise
        ``len(features) // self.kfrac``. ``features`` is then projected onto
        the codebook via ``project.f`` and ``milk.defaultlearner()`` is
        trained on the projection.

        Returns ``codebook_model(codebook, model)``. ``kwargs`` is accepted
        but not used.
        """
        from milk.supervised.gridsearch import gridminimise
        from milk.supervised import svm

        non_empty = [descr for descr, _ in features if descr.size]
        pooled = np.concatenate(non_empty)
        pooled = pooled[::self.sample]

        learner = milk.defaultlearner()
        if self.k is not None:
            n_clusters = self.k
        else:
            n_clusters = len(features) // self.kfrac
        _, codebook = milk.kmeans(pooled, k=n_clusters, R=123)
        projected = project.f(features, codebook)
        model = learner.train(projected, labels)
        return codebook_model(codebook, model)
コード例 #5
def main():
    """Cluster the stored questions and interactively suggest similar ones.

    Reads one question per line from the file ``data``, clusters them with
    ``milk.kmeans`` and then loops on user input: every typed question is
    clustered together with the stored ones (with ``k + 1`` clusters) and up
    to five stored questions sharing its cluster are printed. Typing ``x``
    or ``q`` quits; an empty line re-prompts.
    """
    filename = 'data'
    # Use a context manager so the file handle is closed instead of leaked.
    with open(filename, 'r') as fd:
        questions = [line.strip() for line in fd]
    data = Clustering()
    features = data.populate(questions)
    k = 2
    cluster_ids, centroids = milk.kmeans(features, k)
    while True:
        raw = raw_input('Type in your question?')
        if raw == 'x' or raw == 'q':
            break
        if raw == '':
            continue
        f = data.featurevector(raw)
        query = features + [f]
        cluster_ids, centroids = milk.kmeans(query, k + 1)
        # The query is the last row, so its cluster id is the last id.
        query_cluster = cluster_ids[-1]
        results = [
            (cid, question)
            for cid, question in zip(cluster_ids[:-1], questions)
            if cid == query_cluster
        ]
        if not results:
            print('** no similar questions have been asked **')
        for _, result in results[:5]:
            print(result)
        print('*************************')
コード例 #6
ファイル: jugparallel.py プロジェクト: arnaudsj/milk
def kmeans_select_best(features, ks, repeats=1, method='AIC', R=None, **kwargs):
    '''
    assignments_centroids = kmeans_select_best(features, ks, repeats=1, method='AIC', R=None, **kwargs)

    Perform ``repeats`` calls to ``kmeans`` for each ``k`` in ``ks``, select
    the best one according to ``method.``

    Note that, unlike a raw ``kmeans`` call, this is *always deterministic*
    even if ``R=None`` (which is interpreted as being equivalent to setting it
    to a fixed value). Otherwise, the jug paradigm would be broken as different
    runs would give different results.

    Parameters
    ----------
    features : array-like
        2D array
    ks : sequence of integers
        These will be the values of ``k`` to try
    repeats : integer, optional
        How many times to attempt each k (default: 1).
    method : str, optional
        Which method to use. Must be one of 'AIC' (default) or 'BIC'.
    R : random number source, optional
        Even if you do not pass a value, the result will be deterministic.
        This is different from the typical behaviour of ``R``, but, when
        using jug, reproducibility is often a desired feature.
    kwargs : other options
        These are passed transparently to ``kmeans``

    Returns
    -------
    assignments_centroids : jug.Task
        jug.Task which is the result of the best (as measured by ``method``)
        kmeans clustering.
    '''
    from milk import kmeans
    from milk.utils import get_pyrandom
    kmeans = TaskGenerator(kmeans)
    if R is not None:
        start = get_pyrandom(R).randint(0,1024*1024)
    else:
        # Fixed base seed keeps R=None deterministic, as documented above.
        # Without this `else`, `start` was always overwritten with 7 when R
        # was given and was unbound when R was None.
        start = 7
    results = []
    for ki,k in enumerate(ks):
        for i in xrange(repeats):
            # Distinct R for every (k, repeat) pair.
            results.append(kmeans(features, k, R=(start+7*repeats*ki+i), **kwargs))
    return _select_best(features, results, method)[1]
コード例 #7
ファイル: jugparallel.py プロジェクト: neelvad/milk
def kmeans_select_best(features, ks, repeats=1, method='AIC', R=None, **kwargs):
    '''
    assignments_centroids = kmeans_select_best(features, ks, repeats=1, method='AIC', R=None, **kwargs)

    Perform ``repeats`` calls to ``kmeans`` for each ``k`` in ``ks``, select
    the best one according to ``method.``

    Note that, unlike a raw ``kmeans`` call, this is *always deterministic*
    even if ``R=None``.

    Parameters
    ----------
    features : array-like
        2D array
    ks : sequence of integers
        These will be the values of ``k`` to try
    repeats : integer, optional
        How many times to attempt each k (default: 1).
    method : str, optional
        Which method to use. Must be one of 'AIC' (default) or 'BIC'.
    R : random number source, optional
        If you do not pass a value, a fixed base seed is used, so the result
        will still be deterministic
    kwargs : other options
        These are passed transparently to ``kmeans``

    Returns
    -------
    assignments_centroids : jug.Task
        jug.Task which is the result of the best (as measured by ``method``)
        kmeans clustering.
    '''
    from milk import kmeans
    from milk.utils import get_pyrandom
    kmeans = TaskGenerator(kmeans)
    if R is not None:
        start = get_pyrandom(R).randint(0,1024*1024)
    else:
        # Fixed base seed keeps R=None deterministic, as documented above.
        # Without this `else`, `start` was always overwritten with 7 when R
        # was given and was unbound when R was None.
        start = 7
    results = []
    for ki,k in enumerate(ks):
        for i in xrange(repeats):
            # Distinct R for every (k, repeat) pair.
            results.append(kmeans(features, k, R=(start+7*repeats*ki+i), **kwargs))
    return _select_best(features, results, method)
コード例 #8
ファイル: kmeans.py プロジェクト: Alienfeel/pylearn2
    def train_all(self, dataset, mu=None):
        """
        Process kmeans algorithm on the input to localize clusters.

        Parameters
        ----------
        dataset : object
            Must provide ``get_design_matrix()`` returning a 2-D array ``X``
            with one row per example.
        mu : array-like, optional
            Initial centroids for the built-in implementation; must have
            length ``self.k``. When omitted, ``self.k`` rows of ``X`` are
            picked with ``numpy.random.randint``. Overwritten when ``milk``
            is available.

        Returns
        -------
        rval : bool or ndarray
            ``True`` on normal completion (``self.mu`` and ``self._params``
            are then set). If a NaN appears in the centroids the design
            matrix ``X`` is returned instead and ``self.mu`` is left
            untouched.

        Raises
        ------
        Exception
            If ``mu`` is given with a length other than ``self.k``.
        MemoryError
            Re-raised when the (n, k) distance matrix cannot be allocated.
        """

        #TODO-- why does this sometimes return X and sometimes return nothing?

        X = dataset.get_design_matrix()

        n, m = X.shape
        k = self.k

        if milk is not None:
            #use the milk implementation of k-means if it's available
            cluster_ids, mu = milk.kmeans(X,k)
        else:
            #our own implementation

            # taking random inputs as initial clusters if user does not provide
            # them.
            if mu is not None:
                if not len(mu) == k:
                    raise Exception('You gave %i clusters, but k=%i were expected'
                                    % (len(mu), k))
            else:
                indices = numpy.random.randint(X.shape[0], size=k)
                mu = X[indices]

            try:
                dists = numpy.zeros((n, k))
            except MemoryError:
                # Single formatted string; the original printed a tuple.
                print("dying trying to allocate dists matrix "
                      "for %d examples and %d means" % (n, k))
                # Nothing below can run without `dists`, so propagate rather
                # than fall through to a NameError.
                raise

            old_kills = {}

            iteration = 0
            mmd = prev_mmd = float('inf')
            while True:
                if self.verbose:
                    print('kmeans iter ' + str(iteration))

                if numpy.any(numpy.isnan(mu)):
                    print('nan found')
                    return X

                #computing distances
                for i in xrange(k):
                    dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1)

                if iteration > 0:
                    prev_mmd = mmd

                min_dists = dists.min(axis=1)

                #mean minimum distance:
                mmd = min_dists.mean()

                print('cost:  ' + str(mmd))

                # Stop once the iteration budget is used up or the cost has
                # stopped changing.
                if iteration > 0 and (iteration >= self.max_iter or
                                      abs(mmd - prev_mmd) < self.convergence_th):
                    break

                #finding minimum distances
                min_dist_inds = dists.argmin(axis=1)

                #computing means
                i = 0
                blacklist = []
                new_kills = {}
                while i < k:
                    b = min_dist_inds == i
                    if not numpy.any(b):
                        #initializes empty cluster to be the mean of the d data
                        #points farthest from their corresponding means
                        # NOTE(review): new_kills is only written when i is
                        # already in old_kills, so as written this branch is
                        # never taken -- confirm intended.
                        if i in old_kills:
                            d = old_kills[i] - 1
                            if d == 0:
                                d = 50
                            new_kills[i] = d
                        else:
                            d = 5
                        mu[i, :] = 0
                        for j in xrange(d):
                            idx = numpy.argmax(min_dists)
                            min_dists[idx] = 0
                            #chose point idx
                            mu[i, :] += X[idx, :]
                            # Remember the point so it stays excluded after
                            # min_dists is recomputed below; the list was
                            # never filled in the original.
                            blacklist.append(idx)
                        mu[i, :] /= float(d)
                        #cluster i was empty, reset it to d far out data points
                        #recomputing distances for this cluster
                        dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1)
                        min_dists = dists.min(axis=1)
                        for idx in blacklist:
                            min_dists[idx] = 0
                        min_dist_inds = dists.argmin(axis=1)
                        i += 1
                    else:
                        mu[i, :] = numpy.mean(X[b, :], axis=0)
                        if numpy.any(numpy.isnan(mu)):
                            print('nan found at ' + str(i))
                            return X
                        i += 1

                old_kills = new_kills

                iteration += 1

        self.mu = sharedX( mu )
        self._params = [ self.mu ]
        return True
コード例 #9
ファイル: word_cut.py プロジェクト: huxiaoqian/case
def word_net(weibo,weibo_dict,lable,flag,k_cluster):#word frequency / word network
    # Build word-frequency and adjacent-word ("word network") statistics for
    # the posts whose label is 0, derive a feature matrix and cluster it with
    # milk.kmeans.
    #
    # weibo      -- sequence of post ids; weibo_dict[id][1] is read as the text
    # weibo_dict -- mapping from post id to a record (index 1 is used here)
    # lable      -- sequence parallel to weibo; only entries equal to 0 are segmented
    # flag       -- not used in the visible code
    # k_cluster  -- passed to milk.kmeans as the number of clusters
    #
    # Returns (cluster_ids, word), where cluster_ids is whatever
    # milk.kmeans(features, k_cluster) returns.
    #
    # NOTE(review): this excerpt seems to have lost many statements (`else:`
    # branches and the calls that fill weibo_word, weibo_mid, keyword, weight,
    # word and feature). The notes below mark the visible gaps; confirm against
    # the full file before relying on this code.

    black = load_black_words()
    sw = load_scws()
    n = 0
    ts = time.time()

    f_dict = dict()#frequency dictionary
    total = 0#total number of words
    weibo_word = []
    weibo_text = dict()
    weibo_mid = []
    for i in range(0,len(weibo)):
        mid = weibo[i]
        text = weibo_dict[weibo[i]][1]
        if lable[i] == 0:
            words = sw.participle(text)
            row = []
            for word in words:
                if (word[1] in cx_dict) and (3 < len(word[0]) < 30 or word[0] in single_word_whitelist) and (word[0] not in black):#keep the nouns, verbs and adjectives from the segmentation result and drop single-character words
                    total = total + 1
                    # NOTE(review): both assignments run inside the has_key
                    # branch, so an existing count is reset to 1 and a new key
                    # is never created; an `else:` appears to be missing.
                    if f_dict.has_key(str(word[0])):
                        f_dict[str(word[0])] = f_dict[str(word[0])] + 1
                        f_dict[str(word[0])] = 1
            weibo_text[str(mid)] = str(text)
        n = n + 1
        # progress report every 10000 posts
        if n%10000 == 0:
            end = time.time()
            print '%s weibo takes %s s' %(n,(end-ts))
            ts = end

    #top_k = int(total*0.175) + 1#number of keywords
    keyword = TopkHeap(300)
    ts = time.time()
    print 'start to calculate information counting'
    n = 0
    for k,v in f_dict.iteritems():#compute the information content of each word
        # NOTE(review): the original comment says "count below 3" but the code
        # keeps v >= 2; `p` is computed and not used in the visible code.
        if v >= 2 and (float(v)/float(total)) <= 0.8:#drop words with count below 3 or frequency above 80%
            p = v#0 - math.log(v, 2)#compute information content
        n = n + 1
        if n%10000 == 0:
            end = time.time()
            print '%s weibo takes %s s' %(n,(end-ts))
            ts = end
    # NOTE(review): the original comment says "top 100" but the heap above is
    # created with 300.
    keyword_data = keyword.TopK()#take the top 100 high-frequency words as vertices
    ts = time.time()

    # `keyword` is rebound from the heap to a plain list here.
    keyword = []
    k_value = dict()
    for i in range(0,len(keyword_data)):
        # relative frequency of each top keyword
        k_value[str(keyword_data[i][1])] = float(keyword_data[i][0])/float(total)

    word_net = dict()#word-network dictionary
    for i in range(0,len(weibo_word)):
        row = weibo_word[i]
        for j in range(0,len(row)):
            if row[j] in keyword:
                # count the pair formed with the previous word, using whichever
                # key order ("a_b" or "b_a") is already present
                if j-1 >= 0 and row[j] != row[j-1]:
                    if word_net.has_key(str(row[j]+'_'+row[j-1])):
                        word_net[str(row[j]+'_'+row[j-1])] = word_net[str(row[j]+'_'+row[j-1])] + 1
                    elif word_net.has_key(str(row[j-1]+'_'+row[j])):
                        word_net[str(row[j-1]+'_'+row[j])] = word_net[str(row[j-1]+'_'+row[j])] + 1
                        # NOTE(review): this reset sits in the elif branch; an
                        # `else:` appears to be missing before it.
                        word_net[str(row[j-1]+'_'+row[j])] = 1
                # same for the pair formed with the next word
                if j+1 < len(row) and row[j] != row[j+1]:
                    if word_net.has_key(str(row[j]+'_'+row[j+1])):
                        word_net[str(row[j]+'_'+row[j+1])] = word_net[str(row[j]+'_'+row[j+1])] + 1
                    elif word_net.has_key(str(row[j+1]+'_'+row[j])):
                        word_net[str(row[j+1]+'_'+row[j])] = word_net[str(row[j+1]+'_'+row[j])] + 1
                        # NOTE(review): same missing `else:` as above.
                        word_net[str(row[j]+'_'+row[j+1])] = 1
    end = time.time()
    print 'net use %s s' % (end-ts)
    weight = TopkHeap(500)
    for k,v in word_net.iteritems():#compute weights
        k1,k2 = k.split('_')
        # words without a keyword frequency get weight 0
        if not k_value.has_key(k1):
            k_value[k1] = 0
        if not k_value.has_key(k2):
            k_value[k2] = 0
        # NOTE(review): both assignments are inside the `if`; an `else:` appears
        # to be missing, and `p` is never stored in `weight` in the visible code.
        if k_value[k1] > k_value[k2]:
            p = v*k_value[k1]
            p = v*k_value[k2]

    data = weight.TopK()
    word = []
    for i in range(0,len(data)):
        if data[i][1] not in word:
            # NOTE(review): this `if` has no body in the excerpt.
            if len(word) == 300:#take the top 300 word pairs

    feature = []
    for w in word:
        k1,k2 = w[1].split('_')
        c = []
        for i in range(0, len(weibo_word)):
            # occurrences of each word of the pair in the post text
            n1 = str(weibo_text[str(weibo_mid[i])]).count(str(k1))
            n2 = str(weibo_text[str(weibo_mid[i])]).count(str(k2))
            n = n1 + n2
    features = np.array(feature)
    cluster_ids = milk.kmeans(features, k_cluster)

    return cluster_ids, word
コード例 #10
ファイル: aplicoKMeans.py プロジェクト: martinpineyro/RecPat
@author: juanibraun
import milk
import sys
import csv

# Script: cluster the rows of a ';'-delimited CSV with milk.kmeans.
# The last two command-line arguments are the input path and the output path.
input = sys.argv[-2]
out_name = sys.argv[-1]
data = []
nombres = []

#read the input
with open(input,'rb') as g:
    reader = csv.reader(g,delimiter=';')
    # NOTE(review): the loop body is missing from this excerpt, so `data` and
    # `nombres` are never filled in the visible code.
    for row in reader:

#apply k-means and generate the output
k = 1000
cluster_ids, centroids = milk.kmeans(data, k)
with open(out_name,'wb') as f:
    writer = csv.writer(f,delimiter=';')
    # NOTE(review): nothing is written through `writer` in the visible code.

print cluster_ids
print centroids
コード例 #11
ファイル: surf_luispedro.py プロジェクト: BLKStone/mahotas
from __future__ import print_function
import numpy as np
import mahotas as mh
from mahotas.features import surf
from pylab import *

from os import path

# Load the demo image as 8-bit greyscale and detect SURF interest points.
f = mh.demos.load('luispedro', as_grey=True)
f = f.astype(np.uint8)
spoints = surf.surf(f, 4, 6, 2)
print("Nr points:", len(spoints))

# Colour the points by k-means cluster of their descriptors when milk is
# usable. The original had lost its try/except, which left an
# unexpectedly-indented block whose last two lines always overwrote the
# clustering result.
try:
    import milk
    # columns 5 onwards of each SURF row are used as the descriptor
    descrs = spoints[:,5:]
    k = 5
    values, _ = milk.kmeans(descrs, k)
    colors = np.array([(255-52*i,25+52*i,37**i % 101) for i in range(k)])
except Exception:
    # Best-effort fallback (milk missing or clustering failed): draw every
    # point in a single colour.
    values = np.zeros(100)
    colors = np.array([(255,0,0)])

# Draw the first 100 interest points on top of the image.
f2 = surf.show_surf(f, spoints[:100], values, colors)
コード例 #12
    def train_all(self, dataset, mu=None):
        """
        Process kmeans algorithm on the input to localize clusters.

        Parameters
        ----------
        dataset : object
            Must provide ``get_design_matrix()`` returning a 2-D array ``X``
            with one row per example.
        mu : array-like, optional
            Initial centroids for the built-in implementation; must have
            length ``self.k``. When omitted, ``self.k`` rows of ``X`` are
            picked with ``numpy.random.randint``. Overwritten when ``milk``
            is available.

        Returns
        -------
        rval : bool or ndarray
            ``True`` on normal completion (``self.mu`` and ``self._params``
            are then set). If a NaN appears in the centroids the design
            matrix ``X`` is returned instead and ``self.mu`` is left
            untouched.

        Raises
        ------
        Exception
            If ``mu`` is given with a length other than ``self.k``.
        TypicalMemoryError
            When the (n, k) distance matrix cannot be allocated.
        """

        #TODO-- why does this sometimes return X and sometimes return nothing?

        X = dataset.get_design_matrix()

        n, m = X.shape
        k = self.k

        if milk is not None:
            #use the milk implementation of k-means if it's available
            cluster_ids, mu = milk.kmeans(X, k)
        else:
            #our own implementation

            # taking random inputs as initial clusters if user does not provide
            # them.
            if mu is not None:
                if not len(mu) == k:
                    raise Exception(
                        'You gave %i clusters, but k=%i were expected' %
                        (len(mu), k))
            else:
                indices = numpy.random.randint(X.shape[0], size=k)
                mu = X[indices]

            try:
                dists = numpy.zeros((n, k))
            except MemoryError:
                raise TypicalMemoryError("dying trying to allocate dists "
                                         "matrix for {0} examples and {1} "
                                         "means".format(n, k))

            old_kills = {}

            iteration = 0
            mmd = prev_mmd = float('inf')
            while True:
                if self.verbose:
                    logger.info('kmeans iter {0}'.format(iteration))

                if numpy.any(numpy.isnan(mu)):
                    logger.info('nan found')
                    return X

                #computing distances
                for i in xrange(k):
                    dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1)

                if iteration > 0:
                    prev_mmd = mmd

                min_dists = dists.min(axis=1)

                #mean minimum distance:
                mmd = min_dists.mean()

                logger.info('cost: {0}'.format(mmd))

                # Stop once the iteration budget is used up or the cost has
                # stopped changing.
                if iteration > 0 and (iteration >= self.max_iter or
                                      abs(mmd - prev_mmd) < self.convergence_th):
                    break

                #finding minimum distances
                min_dist_inds = dists.argmin(axis=1)

                #computing means
                i = 0
                blacklist = []
                new_kills = {}
                while i < k:
                    b = min_dist_inds == i
                    if not numpy.any(b):
                        #initializes empty cluster to be the mean of the d data
                        #points farthest from their corresponding means
                        # NOTE(review): new_kills is only written when i is
                        # already in old_kills, so as written this branch is
                        # never taken -- confirm intended.
                        if i in old_kills:
                            d = old_kills[i] - 1
                            if d == 0:
                                d = 50
                            new_kills[i] = d
                        else:
                            d = 5
                        mu[i, :] = 0
                        for j in xrange(d):
                            idx = numpy.argmax(min_dists)
                            min_dists[idx] = 0
                            #chose point idx
                            mu[i, :] += X[idx, :]
                            # Remember the point so it stays excluded after
                            # min_dists is recomputed below; the list was
                            # never filled in the original.
                            blacklist.append(idx)
                        mu[i, :] /= float(d)
                        #cluster i was empty, reset it to d far out data points
                        #recomputing distances for this cluster
                        dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1)
                        min_dists = dists.min(axis=1)
                        for idx in blacklist:
                            min_dists[idx] = 0
                        min_dist_inds = dists.argmin(axis=1)
                        i += 1
                    else:
                        mu[i, :] = numpy.mean(X[b, :], axis=0)
                        if numpy.any(numpy.isnan(mu)):
                            logger.info('nan found at {0}'.format(i))
                            return X
                        i += 1

                old_kills = new_kills

                iteration += 1

        self.mu = sharedX(mu)
        self._params = [self.mu]
        return True
コード例 #13
ファイル: word_cut.py プロジェクト: NeilWang6/case
def word_net(weibo,weibo_dict,lable,flag,k_cluster):#word frequency / word network
    # Build word-frequency and adjacent-word ("word network") statistics for
    # the posts whose label is 0, derive a feature matrix and cluster it with
    # milk.kmeans.
    #
    # weibo      -- sequence of post ids; weibo_dict[id] is read as the text
    # weibo_dict -- mapping from post id to the post text
    # lable      -- sequence parallel to weibo; only entries equal to 0 are segmented
    # flag       -- not used in the visible code
    # k_cluster  -- passed to milk.kmeans as the number of clusters
    #
    # Returns (cluster_ids, word), where cluster_ids is whatever
    # milk.kmeans(features, k_cluster) returns.
    #
    # NOTE(review): this excerpt seems to have lost many statements (`else:`
    # branches and the calls that fill weibo_word, weibo_mid, keyword, weight,
    # word and feature). The notes below mark the visible gaps; confirm against
    # the full file before relying on this code.

    black = load_black_words()
    sw = load_scws()
    n = 0
    ts = time.time()

    f_dict = dict()#frequency dictionary
    total = 0#total number of words
    weibo_word = []
    weibo_text = dict()
    weibo_mid = []
    for i in range(0,len(weibo)):
        mid = weibo[i]
        text = weibo_dict[weibo[i]]
        if lable[i] == 0:
            words = sw.participle(text)
            row = []
            for word in words:
                if (word[1] in cx_dict) and (3 < len(word[0]) < 30 or word[0] in single_word_whitelist) and (word[0] not in black):#keep the nouns, verbs and adjectives from the segmentation result and drop single-character words
                    total = total + 1
                    # NOTE(review): both assignments run inside the has_key
                    # branch, so an existing count is reset to 1 and a new key
                    # is never created; an `else:` appears to be missing.
                    if f_dict.has_key(str(word[0])):
                        f_dict[str(word[0])] = f_dict[str(word[0])] + 1
                        f_dict[str(word[0])] = 1
            weibo_text[str(mid)] = str(text)
        n = n + 1
        # progress report every 10000 posts
        if n%10000 == 0:
            end = time.time()
            print '%s weibo takes %s s' %(n,(end-ts))
            ts = end

    #top_k = int(total*0.175) + 1#number of keywords
    keyword = TopkHeap(300)
    ts = time.time()
    print 'start to calculate information counting'
    n = 0
    for k,v in f_dict.iteritems():#compute the information content of each word
        # NOTE(review): the original comment says "count below 3" but the code
        # keeps v >= 2; `p` is computed and not used in the visible code.
        if v >= 2 and (float(v)/float(total)) <= 0.8:#drop words with count below 3 or frequency above 80%
            p = v#0 - math.log(v, 2)#compute information content
        n = n + 1
        if n%10000 == 0:
            end = time.time()
            print '%s weibo takes %s s' %(n,(end-ts))
            ts = end
    # NOTE(review): the original comment says "top 100" but the heap above is
    # created with 300.
    keyword_data = keyword.TopK()#take the top 100 high-frequency words as vertices
    ts = time.time()

    # `keyword` is rebound from the heap to a plain list here.
    keyword = []
    k_value = dict()
    for i in range(0,len(keyword_data)):
        # relative frequency of each top keyword
        k_value[str(keyword_data[i][1])] = float(keyword_data[i][0])/float(total)

    word_net = dict()#word-network dictionary
    for i in range(0,len(weibo_word)):
        row = weibo_word[i]
        for j in range(0,len(row)):
            if row[j] in keyword:
                # count the pair formed with the previous word, using whichever
                # key order ("a_b" or "b_a") is already present
                if j-1 >= 0 and row[j] != row[j-1]:
                    if word_net.has_key(str(row[j]+'_'+row[j-1])):
                        word_net[str(row[j]+'_'+row[j-1])] = word_net[str(row[j]+'_'+row[j-1])] + 1
                    elif word_net.has_key(str(row[j-1]+'_'+row[j])):
                        word_net[str(row[j-1]+'_'+row[j])] = word_net[str(row[j-1]+'_'+row[j])] + 1
                        # NOTE(review): this reset sits in the elif branch; an
                        # `else:` appears to be missing before it.
                        word_net[str(row[j-1]+'_'+row[j])] = 1
                # same for the pair formed with the next word
                if j+1 < len(row) and row[j] != row[j+1]:
                    if word_net.has_key(str(row[j]+'_'+row[j+1])):
                        word_net[str(row[j]+'_'+row[j+1])] = word_net[str(row[j]+'_'+row[j+1])] + 1
                    elif word_net.has_key(str(row[j+1]+'_'+row[j])):
                        word_net[str(row[j+1]+'_'+row[j])] = word_net[str(row[j+1]+'_'+row[j])] + 1
                        # NOTE(review): same missing `else:` as above.
                        word_net[str(row[j]+'_'+row[j+1])] = 1
    end = time.time()
    print 'net use %s s' % (end-ts)
    weight = TopkHeap(500)
    for k,v in word_net.iteritems():#compute weights
        k1,k2 = k.split('_')
        # words without a keyword frequency get weight 0
        if not k_value.has_key(k1):
            k_value[k1] = 0
        if not k_value.has_key(k2):
            k_value[k2] = 0
        # NOTE(review): both assignments are inside the `if`; an `else:` appears
        # to be missing, and `p` is never stored in `weight` in the visible code.
        if k_value[k1] > k_value[k2]:
            p = v*k_value[k1]
            p = v*k_value[k2]

    data = weight.TopK()
    word = []
    for i in range(0,len(data)):
        if data[i][1] not in word:
            # NOTE(review): this `if` has no body in the excerpt.
            if len(word) == 300:#take the top 300 word pairs

    feature = []
    for w in word:
        k1,k2 = w[1].split('_')
        c = []
        for i in range(0, len(weibo_word)):
            # occurrences of each word of the pair in the post text
            n1 = str(weibo_text[str(weibo_mid[i])]).count(str(k1))
            n2 = str(weibo_text[str(weibo_mid[i])]).count(str(k2))
            n = n1 + n2
    features = np.array(feature)
    cluster_ids = milk.kmeans(features, k_cluster)

    return cluster_ids, word
コード例 #14
from mahotas.features import surf
from pylab import *

from os import path

# NOTE(review): `mahotas` is used below but is not imported in the visible
# lines (`np` is presumably supplied by the pylab star import) -- the top of
# this file appears to be cut off; confirm.

# Locate the demo image next to this script. The original had lost its
# `try:` and the end of the path.join(...) call; the file name is taken from
# the fallback path below.
try:
    luispedro_image = path.join(path.dirname(path.abspath(__file__)), 'data',
                                'luispedro.jpg')
except NameError:
    # __file__ is not defined; fall back to a path relative to the current
    # working directory.
    luispedro_image = 'data/luispedro.jpg'

# Load the image as 8-bit greyscale and detect SURF interest points.
f = mahotas.imread(luispedro_image, as_grey=True)
f = f.astype(np.uint8)
spoints = surf.surf(f, 4, 6, 2)
print("Nr points:", len(spoints))

# Colour the points by k-means cluster of their descriptors when milk is
# usable. The original had lost its try/except, which left an
# unexpectedly-indented block whose last two lines always overwrote the
# clustering result.
try:
    import milk
    # columns 5 onwards of each SURF row are used as the descriptor
    descrs = spoints[:, 5:]
    k = 5
    values, _ = milk.kmeans(descrs, k)
    colors = np.array([(255 - 52 * i, 25 + 52 * i, 37**i % 101)
                       for i in range(k)])
except Exception:
    # Best-effort fallback (milk missing or clustering failed): draw every
    # point in a single colour.
    values = np.zeros(100)
    colors = np.array([(255, 0, 0)])

# Draw the first 100 interest points on top of the image.
f2 = surf.show_surf(f, spoints[:100], values, colors)
コード例 #15
ファイル: news_cut.py プロジェクト: huxiaoqian/case
def cut_word(flag,cluster):#classify by title
    # Read './comment/data<flag>.csv', group the rows by title, build word
    # statistics for the remaining rows and cluster them with milk.kmeans.
    #
    # flag    -- interpolated into the CSV file name
    # cluster -- passed to milk.kmeans as the number of clusters
    #
    # Returns (word, cluster_ids, big_data, big_lable), where cluster_ids is
    # whatever milk.kmeans(features, cluster) returns.
    #
    # NOTE(review): this excerpt seems to have lost many statements (`else:`
    # branches and the calls that fill weibo_word, word_count, new_weibo, row,
    # feature, word, big_data and big_lable). The notes below mark the visible
    # gaps; confirm against the full file before relying on this code.
    title = dict()
    title_count = dict()
    weibo_word = []
    black = load_black_words()
    sw = load_scws()
    word_count = []
    # NOTE(review): the file object is never closed explicitly.
    reader = csv.reader(file('./comment/data%s.csv' % flag, 'rb'))
    for mid,url,t,c,author,publish,site,board in reader:#group by title
        # NOTE(review): all four statements sit inside the has_key branch, so
        # an existing entry is replaced by an empty list and a new title is
        # never added; an `else:` (and presumably an append of the row id)
        # appears to be missing.
        if title_count.has_key(str(t)):
            item = title_count[str(t)]
            title_count[str(t)] = item
            item = []
            title_count[str(t)] = item
        # segment title and content together
        string = t + '_' + c
        words = sw.participle(string)
        for word in words:
            if (word[1] in cx_dict) and (3 < len(word[0]) < 30 or word[0] in single_word_whitelist) and (word[0] not in black):
                # NOTE(review): this `if` has no body in the excerpt.
                if word[0] not in weibo_word:
        title[str(mid)] = [str(t),str(c)]

    # titles that collected at least 20 entries each get a numbered label
    lable = dict()
    n = 0
    for k,v in title_count.iteritems():
        if len(v) >= 20:
            n = n + 1
            lable[str(n)] = v

    big_data = []#weibo posts to be classified
    big_lable = []#classification labels
    # remove the rows that belong to a labelled title from `title`
    for k,v in lable.iteritems():
        for i in v:
            del title[str(i)]
    # count, per vocabulary word, how many remaining rows contain it
    for k,v in title.iteritems():
        string = v[0] + '_' + v[1]
        for i in range(0,len(weibo_word)):
            if weibo_word[i] in string:
                # NOTE(review): word_count is created empty and never grown in
                # the visible code, so this index would be out of range.
                word_count[i] = word_count[i] + 1

    new_weibo = []
    for i in range(0,len(word_count)):
        # NOTE(review): this `if` has no body in the excerpt.
        if word_count[i] >= 5:

    notin = []
    data = dict()
    for k,v in title.iteritems():
        f = 0
        row = []
        string = v[0] + '_' + v[1]
        for i in new_weibo:
            if i in string:
                # NOTE(review): the count `n` is not stored in `row` in the
                # visible code.
                n = string.count(i)
                f = 1
        # keep only rows that contain at least one retained word
        if f == 1:
            data[k] = row

    feature = []
    word = []
    # NOTE(review): this loop has no body in the excerpt.
    for k,v in data.iteritems():
    features = np.array(feature)
    cluster_ids = milk.kmeans(features, cluster)

    return word, cluster_ids, big_data, big_lable#news, cluster labels, classified weibo posts, classification labels