Code example #1
    def test_knn_result():
        model = gensim.models.Word2Vec.load(
            '/Users/holly/Desktop/毕设/Data/(旧)PlatformsComments/p2p.word2vec.model'
        )
        X = model[model.wv.vocab]
        from nltk.cluster import KMeansClusterer
        import nltk
        NUM_CLUSTERS = 5
        kclusterer = KMeansClusterer(
            NUM_CLUSTERS,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

        words = list(model.wv.vocab)
        cluster_dict = {0: [], 1: [], 2: [], 3: [], 4: []}
        for i, word in enumerate(words):
            cluster_dict[assigned_clusters[i]].append(word)

        for j in range(5):
            with open(
                    os.path.join(
                        '/Users/holly/Desktop/毕设/Data/(旧)PlatformsComments/result',
                        str(j) + '.txt'), 'w') as f:
                for word in cluster_dict[j]:
                    f.write("%s\n" % word)
Code example #2
 def cluster(self, embedding, NUM_CLUSTERS):
     kclusterer = KMeansClusterer(
         NUM_CLUSTERS,
         distance=nltk.cluster.util.cosine_distance,
         repeats=25)
     assigned_clusters = kclusterer.cluster(embedding, assign_clusters=True)
     return assigned_clusters
Code example #3
def main():

    NUM_CLUSTERS = 3

    model = Word2Vec.load("test_word2vec_1.model")
    model_data = model[
        model.wv.vocab]  # convert the word2vec model into data for k-means clustering

    # Check that the clustering works well -------------------------------------------------------------------------
    kclusterer = KMeansClusterer(NUM_CLUSTERS,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(model_data, assign_clusters=True)
    print(assigned_clusters)

    words = list(model.wv.vocab)
    for i, word in enumerate(words):
        print(word + ":" + str(assigned_clusters[i]))
    #-----------------------------------------------------------------------------------------------------

    #model_data->get Vector Data

    # clustering -------------------------------------------------------------------------------------------
    kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS)
    kmeans.fit(model_data)  # clustering
    labels = kmeans.labels_  # label values for each data point
    centroids = kmeans.cluster_centers_  # coordinates of each cluster's centroid

    # Check the results -------------------------------------------------------------------------------------------
    print("Cluster id labels for inputted data")
    print(labels)
    print("Centroids data")
    print(centroids)
Code example #4
 def clusterize(self,noClusters,noNouranksToKeep,**kwargs):
     """
     """
     storage = getUtility(INounPhraseStorage)
     
     docids = storage.rankedNouns.keys()
     docnouns = []
     allNouns = set()
     vectors = []
     
     for key in docids:
         importantNouns = storage.getNounTerms(
             key,
             noNouranksToKeep)
         docnouns.append(importantNouns)
         allNouns = allNouns.union(importantNouns)
     
     for nouns in docnouns:
         vector = [(noun in nouns and 1 or 0) for noun in allNouns]
         vectors.append(numpy.array(vector))
     
     clusterer = KMeansClusterer(noClusters,pearson,**kwargs)
     clusters = clusterer.cluster(vectors,True)
     
     result = {}
     for i in range(noClusters):
         result[i] = []
     for index, docid in enumerate(docids):
         result[clusters[index]].append(docid)
     return result
Code example #5
    def clustering(self, modelpath, hashtag_cluster_path, num_clusters):

        #loading word2vec model
        model = Word2Vec.load(modelpath)
        X = model.wv.vectors

        #clustering
        num_clusters = int(num_clusters)
        kclusterer = KMeansClusterer(num_clusters, distance=nltk.cluster.util.cosine_distance, repeats=25)
        assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

        #distributing hashtags into their respective clusters
        words = list(model.wv.vocab)
        cluster_distribution = {}
        for i, word in enumerate(words):
            cluster_distribution.setdefault(str(assigned_clusters[i]), []).append(word)

        #save the cluster distribution
        with open(hashtag_cluster_path, "w") as write_file:
            json.dump(cluster_distribution, write_file)
        print('saved hashtag cluster.')
Code example #6
def word2vec_cluster(in_file, out_file):
    sentences = []
    with codecs.open(in_file, 'r', encoding='utf-8', errors='ignore') as in_file:
        corpus = in_file.readlines()
        for line in corpus:
            line = line.strip('\n')
            if not line:
                continue
            line = line.lower()
            line = line.split(" ")
            sentences.append(line)

    print("training model...")
    model = Word2Vec(sentences, min_count=2)
     
    print("get vector data...")
    X = model[model.wv.vocab]

    NUM_CLUSTERS=50
    kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, avoid_empty_clusters=True, repeats=30)

    print("assigning cluster..")
    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
     
    words = list(model.wv.vocab)

    with open(out_file, 'a') as out_file: 
        for i, word in enumerate(words):  
            out_file.write(word + ":" + str(assigned_clusters[i]) + '\n')
Code example #7
File: cluster.py Project: wsun/abstracts
def cluster(abstracts, mode, metric, debug=False, repeats=10):
    ''' 
    K-means clustering with evaluation metrics, using custom distance
    function and provided abstracts.
    '''

    labels = []
    vectors = []

    # create vectors and labels; k will be number of ground-truth labels
    construct(abstracts, vectors, mode)
    k = label(abstracts, labels)

    # cluster
    clusterer = KMeansClusterer(k, metric, repeats=repeats, 
                                normalise=True, avoid_empty_clusters=True)
    clusters = clusterer.cluster(vectors, assign_clusters=True, trace=debug) 
    means = clusterer.means()

    print()
    print("EVALUATION:")

    # compute evaluation metrics
    dist = sumdistance(vectors, clusters, means)
    pure = purity(clusters, labels, k)
    entr = entropy(clusters, labels, k)
    f, rand = f1(clusters, labels, k)

    print "Sum of distances: %f" % dist
    print "Purity: %f" % pure
    print "Entropy: %f" % entr
    print "Rand index: %f" % rand
    print "F1 measure: %f" % f
Code example #8
def get_clusters(txt):
    clusters = {}
    num_clusters = len(txt) // 4
    if num_clusters < 2:
        num_clusters = 2
    if num_clusters > 5:
        num_clusters = 5
    #txt = [''.join([l for l in txt])]
    #print txt
    responses = [line.strip() for line in txt]
    words = get_words(responses)

    cluster = KMeansClusterer(num_clusters,
                              euclidean_distance,
                              repeats=100,
                              avoid_empty_clusters=True)
    cluster.cluster(
        [vectorspaced(response, words) for response in responses if response])
    classified_examples = [
        cluster.classify(vectorspaced(response, words))
        for response in responses
    ]

    for cluster_id, title in sorted(zip(classified_examples, responses)):
        if cluster_id not in clusters:
            clusters[cluster_id] = [title]
        else:
            clusters[cluster_id].append(title)

    return clusters
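get_words and vectorspaced are not defined in this snippet; several examples on this page call helpers with these names, and Example #11 below defines similar ones inline. A minimal, hedged sketch of what they are assumed to look like:

# Hypothetical helpers, inferred from how the snippets call them;
# not taken verbatim from any of the listed projects.
import numpy
import nltk

stemmer = nltk.stem.snowball.EnglishStemmer()

def normalize_word(word):
    # lower-case and stem so near-duplicate words share one feature
    return stemmer.stem(word.lower())

def get_words(texts):
    # the shared vocabulary: every normalized word seen in any text
    words = set()
    for text in texts:
        for word in text.split():
            words.add(normalize_word(word))
    return list(words)

def vectorspaced(text, words):
    # one-hot membership vector of `text` over the shared vocabulary
    components = [normalize_word(word) for word in text.split()]
    return numpy.array([word in components for word in words], numpy.short)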
Code example #9
def demo():
    # example from figure 14.9, page 517, Manning and Schutze

    from nltk.cluster import KMeansClusterer, euclidean_distance

    vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    means = [[4, 3], [5, 5]]

    clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print('Clustered:', vectors)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()

    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds

    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print('Clustered:', vectors)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()

    # classify a new vector
    vector = numpy.array([3, 3])
    print('classify(%s):' % vector, clusterer.classify(vector))
    print()
Code example #10
    def clusterize(self, noClusters, noNounsToKeep, **kwargs):
        """
        """
        root = getUtility(ISiteRoot)
        catalog = getToolByName(root, 'portal_catalog')

        nounTermsIndex = catalog._catalog.getIndex('noun_terms')
        uidTermsIndex = catalog._catalog.getIndex('UID')
        nounTermsIndexIds = []
        allNouns = set()
        docnouns = []
        vectors = []

        for key in nounTermsIndex._unindex.keys():
            importantNouns = nounTermsIndex._unindex[key][:noNounsToKeep]
            if importantNouns:
                nounTermsIndexIds.append(key)
                docnouns.append(importantNouns)
                allNouns = allNouns.union(importantNouns)

        for nouns in docnouns:
            vector = [(noun in nouns and 1 or 0) for noun in allNouns]
            vectors.append(numpy.array(vector))

        clusterer = KMeansClusterer(noClusters, pearson, **kwargs)
        clusters = clusterer.cluster(vectors, True)
        result = {}
        for i in range(noClusters):
            result[i] = []
        for i in range(len(nounTermsIndexIds)):
            docid = nounTermsIndexIds[i]
            uid = uidTermsIndex._unindex[docid]
            result[clusters[i]] = result[clusters[i]] + [uid]

        return result
Code example #11
def grouper(filename):
    stemmer_func = nltk.stem.snowball.EnglishStemmer().stem

    @decorators.memoize
    def normalize_word(word):
        return stemmer_func(word.lower())

    def get_words(titles):
        words = set()
        for title in titles:
            for word in title.split():
                words.add(normalize_word(word))
        return list(words)

    @decorators.memoize
    def vectorspaced(title):
        title_components = [normalize_word(word) for word in title.split()]
        return numpy.array([
            word in title_components for word in words], numpy.short)
        
    with open(filename, encoding='utf-8') as title_file:

        job_titles = [line.strip() for line in title_file.readlines()]
        #name = Data(keyword = job_titles)
        #db.session.add(name)
        #db.session.commit()
        words = get_words(job_titles)
        if len(words) >= 1500:
            k = 75
        elif len(words) >= 500 and len(words) < 1000:
            k = 55
        elif len(words) >200 and len(words)<500:
            k =30
        else:
            k = 15
       

        cluster = KMeansClusterer(k,euclidean_distance,avoid_empty_clusters = True )
        cluster.cluster([vectorspaced(title) for title in job_titles if title])

        classified_examples = [
                cluster.classify(vectorspaced(title)) for title in job_titles
            ]
        global gen_file
        gen_file =str(uuid.uuid4())+".csv"
        f = open("/home/ubuntu/downloads/"+gen_file,'wb')
        try:
            w = csv.writer(f)
            w.writerow(('Search Terms','GroupID'))
            for cluster_id, title in sorted(zip(classified_examples, job_titles)):
                w.writerow((title.encode('utf-8'),cluster_id))
            #print "done"
        finally:
            f.close()
        f1 = open("/home/ubuntu/time/" + gen_file + ".txt", 'w')
        try:
            t = (time.time() - start_time)
            f1.write(str(t))
        finally:
            f1.close()
Code example #12
    def __kmeans(self, aspect, vectors, id_sentences, k=50):
        ''' Cluster sentences using the K-Means Algorithm '''
        k = min(k, len(vectors))
        vectors = [array(v) for v in vectors]
        means = vectors[:k]
        clusterer = KMeansClusterer(k,
                                    euclidean_distance,
                                    initial_means=means,
                                    avoid_empty_clusters=True)
        with utils.Capturing() as output:
            clusters = clusterer.cluster(vectors, True)

        for id_cluster in range(k):
            self.__clusters[aspect][id_cluster] = {
                'importance': 0,
                'sentences': [],
                'representative_words': [],
                'max_sentence': None
            }

        for index, id_cluster in enumerate(clusters):
            self.__clusters[aspect][id_cluster]['sentences'].append(
                id_sentences[index])

        for id_cluster in range(k):  # Delete empty clusters
            if len(self.__clusters[aspect][id_cluster]['sentences']) == 0:
                self.__clusters[aspect].pop(id_cluster)

        self.__search_representative_words(aspect)
Code example #13
def clustering(data, cluster, n_classes):
    print('\n------------------GMM\n')
    assigned_clusters = mixture.GaussianMixture(
        n_components=n_classes, covariance_type='tied').fit_predict(data)

    print('Mutual_info_score =',
          mutual_info_score(cluster - 1, assigned_clusters))
    print(
        'Adjusted_mutual_info_score =',
        adjusted_mutual_info_score(cluster - 1,
                                   assigned_clusters,
                                   average_method='min'))
    print('Adjusted_rand_score =',
          adjusted_rand_score(cluster - 1, assigned_clusters))

    print('\nK_MEANS')

    kclusterer = KMeansClusterer(num_means=n_classes,
                                 distance=nltk.cluster.util.cosine_distance)
    assigned_clusters = kclusterer.cluster(data, assign_clusters=True)

    print('Mutual_info_score =',
          mutual_info_score(cluster - 1, assigned_clusters))
    print(
        'Adjusted_mutual_info_score =',
        adjusted_mutual_info_score(cluster - 1,
                                   assigned_clusters,
                                   average_method='min'))
    print('Adjusted_rand_score =',
          adjusted_rand_score(cluster - 1, assigned_clusters))
Code example #14
File: cluster.py Project: hiroaki8388/dazai
 def __init__(self, k=10):
     """
     :param k: クラスタ数 int
     """
     cosine = nltk.cluster.util.cosine_distance
     self.model = KMeansClusterer(
         k, distance=cosine, avoid_empty_clusters=True)
Code example #15
def clusterize( data, repeats=50 ):

    clusterer = KMeansClusterer(5, iou_dist_function, repeats=repeats, avoid_empty_clusters=True)
    clusters = clusterer.cluster(data, True)
    #print(clusters)
    anchors = clusterer.means()

    return anchors
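iou_dist_function is not defined in this snippet. Clustering box sizes with a 1 - IoU distance is the usual way to derive detection anchors, so a plausible sketch might look like this (the body is an assumption, not this project's code):

def iou_dist_function(a, b):
    # a and b are (width, height) pairs; treat both boxes as anchored at
    # the same corner and use 1 - IoU (intersection over union) as distance
    intersection = min(a[0], b[0]) * min(a[1], b[1])
    union = a[0] * a[1] + b[0] * b[1] - intersection
    return 1.0 - intersection / union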
Code example #16
def KmeansClustering(trainX, numberOfClusters, numberOfRepeats):
    # init cluster with trainX
    # example taken from https://www.nltk.org/_modules/nltk/cluster/kmeans.html#demo
    clusterer = KMeansClusterer(numberOfClusters,
                                cosine_distance,
                                initial_means=None,
                                repeats=numberOfRepeats)
    assigned_clusters = clusterer.cluster(trainX, assign_clusters=True)
    return clusterer, assigned_clusters
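A quick, hedged usage sketch for the helper above (the toy vectors are invented for illustration):

import numpy as np

trainX = [np.array(v) for v in [[0.9, 0.1], [0.8, 0.2], [0.1, 0.9], [0.2, 0.8]]]
clusterer, assigned = KmeansClustering(trainX, numberOfClusters=2, numberOfRepeats=10)
print(assigned)  # e.g. [0, 0, 1, 1]; cluster ids may come out permuted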
Code example #17
def kmeans_test(model, documents):
    count = len(documents)
    vectors = []

    print("done")
    kclusterer = KMeansClusterer(20,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(model.docvecs, assign_clusters=True)
Code example #18
def cluster_kmean(train_file, test_file):
    """Load train and test data into data frames"""
    f_train = open(train_file, encoding="utf-8")
    train_data = json.load(f_train)
    df_train = pd.DataFrame(train_data, columns=['text'])
    f_train.close()

    f_test = open(test_file, encoding='utf-8')
    test_data = json.load(f_test)
    df_test = pd.DataFrame(test_data, columns=['text', 'labels'])
    f_test.close()

    labels = df_test.labels
    labels = list(set(sum(labels, [])))[:3]
    """"Initialize TF-IDF vectorizer"""
    tfidf_vect = TfidfVectorizer(stop_words="english", min_df=5)

    dtm = tfidf_vect.fit_transform(df_train['text'])

    num_clusters = 3
    """Initialize clutering"""
    clusterer = KMeansClusterer(num_clusters, cosine_distance, repeats=20)

    clusters = clusterer.cluster(dtm.toarray(), assign_clusters=True)

    centroids = np.array(clusterer.means())

    sorted_centroids = centroids.argsort()[:, ::-1]

    voc_lookup = tfidf_vect.get_feature_names()

    test_dtm = tfidf_vect.transform(df_test.text)

    predicted = [clusterer.classify(v) for v in test_dtm.toarray()]

    df_test['label_test'] = df_test['labels'].apply(lambda x: x[0])

    confusion_df = pd.DataFrame(list(
        zip(df_test["label_test"].values, predicted)),
                                columns=["actual_class", "cluster"])

    df_result = pd.crosstab(index=confusion_df.cluster,
                            columns=confusion_df.actual_class)

    print(df_result)

    df_clusterLabelsPredicted = list(
        df_result.apply(lambda x: x.idxmax(), axis=1))
    cluster_dict = dict(
        (i, j) for i, j in enumerate(df_clusterLabelsPredicted))

    predicted_target = [cluster_dict[i] for i in predicted]

    print(
        metrics.classification_report(df_test["label_test"], predicted_target))
    for i in cluster_dict:
        print("Cluster %d : Topic %s" % (i, cluster_dict[i]))
Code example #19
def cluster_kmean(train_file, test_file):

    data = pd.read_json(train_file, orient='columns')
    data.columns = ["text"]
    tfidf_vect = TfidfVectorizer(min_df=5, stop_words='english')
    dtm = tfidf_vect.fit_transform(data["text"])

    num_clusters = 3
    clusterer = KMeansClusterer(num_clusters, cosine_distance, repeats=5)
    clusters = clusterer.cluster(dtm.toarray(), assign_clusters=True)

    test = pd.read_json(test_file, orient='columns')
    test.columns = ["text", "label"]

    #to convert dataframe with multiple targets to the first target
    x = test["label"]
    truth = []
    for item in x:
        truth.append(item[0])
    test["label"] = truth

    test_dtm = tfidf_vect.transform(test["text"])
    predicted = [clusterer.classify(v) for v in test_dtm.toarray()]
    confusion_df = pd.DataFrame(list(zip(test["label"].values, predicted)),
                                columns=["label", "cluster"])
    crosstab = pd.crosstab(index=confusion_df.cluster,
                           columns=confusion_df.label)
    print("using cosine: ")
    print(crosstab)
    dfmax = crosstab.idxmax(axis=1)
    print(dfmax)
    cluster_dict = {0: dfmax[0], 1: dfmax[1], 2: dfmax[2]}
    predicted_target = [cluster_dict[i] for i in predicted]

    print(metrics.classification_report(test["label"], predicted_target))

    # Kmeans with 20 different centroid seeds
    num_clusters = 3
    km = KMeans(n_clusters=num_clusters, n_init=20).fit(dtm)
    clusters = km.labels_.tolist()
    predicted2 = km.predict(test_dtm)
    confusion_df2 = pd.DataFrame(list(zip(test["label"].values, predicted2)),
                                 columns=["label", "cluster"])

    crosstab2 = pd.crosstab(index=confusion_df2.cluster,
                            columns=confusion_df2.label)
    print("using Euclidean distance")
    print(crosstab2)
    dfmax = crosstab2.idxmax(axis=1)
    print(dfmax)
    cluster_dict = {0: dfmax[0], 1: dfmax[1], 2: dfmax[2]}

    predicted_target2 = [cluster_dict[i] for i in predicted2]
    print(metrics.classification_report(test["label"], predicted_target2))

    return None
Code example #20
    def get_clusters(self, vectors):
        vectors = [numpy.array(v) for v in vectors]
        init_means = [copy(vectors[i]) for i in range(self.num_clusters)]
        clusterer = KMeansClusterer(self.num_clusters,
                                    euclidean_distance,
                                    initial_means=init_means,
                                    avoid_empty_clusters=True)
        clusters = clusterer.cluster(vectors, True)

        return clusters
Code example #21
def clustering_on_wordvecs(word_vectors, num_clusters):
    # Initialize a k-means object and use it to extract centroids
    #kmeans_clustering = KMeans(n_clusters=num_clusters, init='k-means++');
    #idx = kmeans_clustering.fit_predict(word_vectors);
    kclusterer = KMeansClusterer(num_clusters,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(word_vectors, assign_clusters=True)

    return assigned_clusters
Code example #22
def get_word_clusters(tweets):
    ListTweets = get_all_text(tweets)
    ListTweets = list(ListTweets)
    #   Project tweet text onto a vector space 
    vs_tweets = list(TweetVectors(tweets))
    cluster = KMeansClusterer(10, euclidean_distance, avoid_empty_clusters = True)
    cluster.cluster(vs_tweets)
    classified_examples = [ cluster.classify(tweet) for tweet in vs_tweets ]
    for cluster_id, tweet in sorted(zip(classified_examples, ListTweets)):
        print(cluster_id, tweet)
Code example #23
 def kgen(self, num):
     num = int(num)
     clusterer = KMeansClusterer(num, distance=cosine_distance, repeats=20)
     vecs = self.model.wv[self.model.wv.vocab]
     assignments = clusterer.cluster(vecs, assign_clusters=True)
     self.vocab_to_cluster_map = dict(zip(self.model.wv.vocab, assignments))
     self.clusters = dict()
     for word, index in self.vocab_to_cluster_map.items():
         if index in self.clusters:
             self.clusters[index].append(word)
         else:
             self.clusters[index] = [word]
Code example #24
 def cluster(self, corpus):
     """
     Fits the K-Means model to the given data.
     """
     cosine = nltk.cluster.util.cosine_distance
     self.model = KMeansClusterer(self.k,
                                  distance=cosine,
                                  avoid_empty_clusters=True)
     self.model.cluster([
         self.vectorize(corpus.words(fileid))
         for fileid in corpus.fileids(categories=['news'])
     ])
Code example #25
def clustering_question(sents, sents_word2vec, NUM_CLUSTERS=15):
    kclusterer = KMeansClusterer(
        NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance,
        repeats=25, avoid_empty_clusters=True)

    assigned_clusters = kclusterer.cluster(sents_word2vec, assign_clusters=True)
    data = pd.DataFrame([], columns=['text', 'cluster', 'centroid'])
    data.loc[:, 'text'] = sents
    data.loc[:, 'cluster'] = pd.Series(assigned_clusters, index=data.index)
    data.loc[:, 'centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x])

    return data, assigned_clusters
Code example #26
def cosine_cluster(num_clusters, matrix):

    print("Running k-means using cosine distance...\n")

    matrix = np.asanyarray(matrix)
    
    k_means = KMeansClusterer(num_clusters, cosine_distance, avoid_empty_clusters=True)
    clusters = k_means.cluster(matrix, assign_clusters=True, trace=False)

    print("Successfully found %d clusters in %d dimensions \n" % (num_clusters, matrix.shape[1]))

    return clusters
Code example #27
File: midimetadata.py Project: b0rkestra/b0rkestra
def main():
    tracknames = get_tracknames()  
    #title_file = open("example_jobs.txt", 'r')

    #job_titles = [line.strip() for line in title_file.readlines()]
    words = get_words(tracknames)

    cluster = KMeansClusterer(20, euclidean_distance, avoid_empty_clusters=True)
    cluster.cluster([vectorspaced(trackname, words) for trackname in tracknames if trackname])
    classified_examples = [cluster.classify(vectorspaced(trackname, words)) for trackname in tracknames]


    for cluster_id, title in sorted(zip(classified_examples, tracknames)):
        print(cluster_id, title)
Code example #28
def get_kmeans_predicted_clusters(word_representions, Num_clusters):
    # from dictionary type to transposed dataframe
    Y = pd.DataFrame(data=word_representions).T
    X = Y.values
    # cluster the data using NLTK's KMeansClusterer
    kclusterer = KMeansClusterer(Num_clusters,
                                 distance=nltk.cluster.util.euclidean_distance,
                                 repeats=25,
                                 avoid_empty_clusters=False)
    predicted_clusters = kclusterer.cluster(
        X,
        assign_clusters=True,
    )
    return predicted_clusters
Code example #29
File: kmeansdemo.py Project: mquezada/cc6909
def demo():
    # example from figure 14.9, page 517, Manning and Schutze

    from nltk.cluster import KMeansClusterer, euclidean_distance

    vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    means = [[4, 3], [5, 5]]

    clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print('Clustered:', vectors)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()

    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds

    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print('Clustered:', vectors)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()

    # classify a new vector
    vector = numpy.array([3, 3])
    print('classify(%s):' % vector, clusterer.classify(vector))
    print()
Code example #30
File: cluster.py Project: hiroaki8388/dazai
class KMeansTopics(BaseEstimator, TransformerMixin):
    def __init__(self, k=10):
        """
        :param k: クラスタ数 int
        """
        cosine = nltk.cluster.util.cosine_distance
        self.model = KMeansClusterer(
            k, distance=cosine, avoid_empty_clusters=True)

    def fit(self, sents):
        return self

    def transform(self, sents):
        # return the cluster assignment for each input vector
        return self.model.cluster(sents, assign_clusters=True)
Code example #31
def nltk_kmeans(word_vectors, k):
    from nltk.cluster import KMeansClusterer
    import nltk

    #word_vectors.init_sims()
    norm_vectors = word_vectors.syn0

    kmeans = KMeansClusterer(k, nltk.cluster.util.cosine_distance, repeats=25)
    assigned_clusters = kmeans.cluster(norm_vectors, assign_clusters=True)

    clusters = defaultdict(list)
    for idx in range(0, len(word_vectors.index2word)):
        clusters[assigned_clusters[idx]].append(word_vectors.index2word[idx])

    return (clusters, kmeans)
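This helper targets the pre-4.0 gensim API (syn0 and index2word were removed in gensim 4.0). A hedged usage sketch, with a hypothetical model path:

from gensim.models import Word2Vec

model = Word2Vec.load('my_word2vec.model')  # hypothetical path
clusters, kmeans = nltk_kmeans(model.wv, k=10)
for cluster_id, cluster_words in clusters.items():
    print(cluster_id, cluster_words[:10])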
Code example #32
File: cluster.py Project: tdiggelm/nltk-playground
def demo_1():

    urls = [
        "www.ai-one.com",
        "http://en.wikipedia.org/wiki/Albert_Einstein",
        "http://en.wikipedia.org/wiki/USA",
        "http://en.wikipedia.org/wiki/Microsoft"
        ]

    keywords = [get_keywords(url) for url in urls]
    all_words = set(chain(*keywords))
    vectors = [vector_from_keywords(kw, all_words) for kw in keywords]

    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
Code example #33
class KMeansClusters(BaseEstimator, TransformerMixin):
    """
    Cluster text data using k-means. Makes use of nltk k-means clustering.
    Allows for alternative distance measures
    """
    def __init__(self, k=7):
        self.k = k
        self.distance = nltk.cluster.util.cosine_distance
        self.model = KMeansClusterer(self.k,
                                     self.distance,
                                     avoid_empty_clusters=True)

    def fit(self, documents, labels=None):
        return self

    def transform(self, documents):
        """
        Fits the K-means model to the given documents.

        Parameters
        ----------
        documents :
            a sequence of vectorized documents.

        Returns
        -------
            an array with the assigned cluster id for each document.
        """
        return np.array(self.model.cluster(documents, assign_clusters=True))
Code example #34
class PartionalNltk():

    def __init__(self):
        self.clf = KMeansClusterer(2, cosine_distance, repeats=30, avoid_empty_clusters=True)

    def cluster(self, data):
        clusters = self.clf.cluster(data.toarray(), True)
        return np.array(clusters)

    def f_score(self, cluster, f_score_dict):

        for cl in f_score_dict:
            docs = np.array(f_score_dict[cl]['docs'])
            nri = np.intersect1d(cluster, docs).shape[0]
            nr = docs.shape[0]
            ni = cluster.shape[0]
            #print nri, nr, ni

            try:
                recall = float(nri) / float(nr)
                precision = float(nri) / float(ni)
                f_score = (2 * precision * recall) / (precision + recall)
                f_score_dict[cl]['fscore'] = f_score if (f_score > f_score_dict[cl]['fscore'] or not f_score_dict[cl]['fscore']) else f_score_dict[cl]['fscore']
            except ZeroDivisionError:
                #print e
                pass

        return f_score_dict
Code example #35
class PartionalNltk():
    def __init__(self):
        self.clf = KMeansClusterer(2,
                                   cosine_distance,
                                   repeats=30,
                                   avoid_empty_clusters=True)

    def cluster(self, data):
        clusters = self.clf.cluster(data.toarray(), True)
        return np.array(clusters)

    def f_score(self, cluster, f_score_dict):

        for cl in f_score_dict:
            docs = np.array(f_score_dict[cl]['docs'])
            nri = np.intersect1d(cluster, docs).shape[0]
            nr = docs.shape[0]
            ni = cluster.shape[0]
            #print nri, nr, ni

            try:
                recall = float(nri) / float(nr)
                precision = float(nri) / float(ni)
                f_score = (2 * precision * recall) / (precision + recall)
                f_score_dict[cl]['fscore'] = f_score if (
                    f_score > f_score_dict[cl]['fscore']
                    or not f_score_dict[cl]['fscore']
                ) else f_score_dict[cl]['fscore']
            except ZeroDivisionError:
                #print e
                pass

        return f_score_dict
Code example #36
def clustering_question(data, NUM_CLUSTERS=15):

    sentences = data['text']

    X = np.array(data['emb'].tolist())

    kclusterer = KMeansClusterer(NUM_CLUSTERS,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25,
                                 avoid_empty_clusters=True)

    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

    data['cluster'] = pd.Series(assigned_clusters, index=data.index)
    data['centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x])

    return data, assigned_clusters
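A hedged usage sketch; the column names follow the function's expectations and the embeddings are random stand-ins:

import numpy as np
import pandas as pd

df = pd.DataFrame({'text': ['how do I reset my password',
                            'password reset not working',
                            'where is my invoice',
                            'invoice missing from account']})
df['emb'] = [np.random.rand(16) for _ in range(len(df))]
data, assigned = clustering_question(df, NUM_CLUSTERS=2)
print(data[['text', 'cluster']])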
Code example #37
def cluster(index, k):
    # example from figure 14.9, page 517, Manning and Schutze

    from nltk.cluster import KMeansClusterer, euclidean_distance

    #vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds

    clusterer = KMeansClusterer(k, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(index, True)
    print('Clustered:', index)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()
    return clusters
Code example #38
def train(X, y, train_ratio):
    from sklearn.cluster import KMeans
    from sklearn.linear_model import LogisticRegression
    from sklearn import svm
    from sklearn.metrics import precision_score, recall_score, f1_score
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    test_ratio = 1 - train_ratio
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=36)

    """
    Classification
    """
#     clf = LogisticRegression(C=1000.0, random_state=0).fit(X_train, y_train)
#     clf = svm.SVC(kernel='linear', C=1e30).fit(X_train, y_train)
#     y_pred = clf.predict(X_test)

#     print(y_test)
#     print(y_pred)
#     print("accuracy: %.2f" %accuracy_score(y_test, y_pred))
#     print("Precision : %.3f" % precision_score(y_test, y_pred))
#     print("Recall : %.3f" % recall_score(y_test, y_pred))
#     print("F1-micro : %.3f" % f1_score(y_test, y_pred, average='micro'))
#     print("F1-macro : %.3f" % f1_score(y_test, y_pred, average='macro'))
#     f1_micro = f1_score(y_test, y_pred, average='micro')
#     f1_macro = f1_score(y_test, y_pred, average='macro')

#     print("F1-macro")
#     print(f1_macro)
#     print("F1-micro")
#     print(f1_micro)

    """
    Clustering
    """

    from sklearn.metrics.cluster import normalized_mutual_info_score
    from nltk.cluster import KMeansClusterer
    import nltk

    NUM_CLUSTERS=8
    kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance,  repeats=100, normalise=True, avoid_empty_clusters=True)
    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
    nmi = normalized_mutual_info_score(assigned_clusters, y)
    return nmi
Code example #39
File: kmeans.py Project: yokeyong/atap
class KMeansTopics(object):

    def __init__(self, corpus, k=10):
        """
        corpus is a corpus object, e.g. an HTMLCorpusReader()
        or an HTMLPickledCorpusReader() object

        k is the number of clusters
        """
        self.k = k
        self.model = None
        self.vocab = list(
            set(normalize(corpus.words(categories=['news'])))
            )

    def vectorize(self, document):
        """
        Vectorizes a document consisting of a list of part of speech
        tagged tokens using the segmentation and tokenization methods.

        One-hot encode the set of documents
        """
        features = set(normalize(document))
        return np.array([
            token in features for token in self.vocab], np.short)

    def cluster(self, corpus):
        """
        Fits the K-Means model to the given data.
        """
        cosine = nltk.cluster.util.cosine_distance
        self.model = KMeansClusterer(
            self.k, distance=cosine, avoid_empty_clusters=True)
        self.model.cluster([
            self.vectorize(
                corpus.words(fileid)
            ) for fileid in corpus.fileids(categories=['news'])
        ])

    def classify(self, document):
        """
        Pass through to the internal model classify
        """
        return self.model.classify(self.vectorize(document))
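normalize is not shown in this example; judging from how it is used, it maps (possibly POS-tagged) tokens to a set of comparable word forms. A rough stand-in sketch, offered as an assumption rather than the project's implementation:

import string
from nltk.corpus import stopwords  # requires nltk.download('stopwords')

STOPWORDS = set(stopwords.words('english'))
PUNCT = set(string.punctuation)

def normalize(words):
    # keep lower-cased tokens that are neither stopwords nor punctuation;
    # items may be plain tokens or (token, tag) pairs
    for item in words:
        token = item[0] if isinstance(item, tuple) else item
        token = token.lower()
        if token not in STOPWORDS and token not in PUNCT:
            yield token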
Code example #40
File: kmeans.py Project: yokeyong/atap
 def cluster(self, corpus):
     """
     Fits the K-Means model to the given data.
     """
     cosine = nltk.cluster.util.cosine_distance
     self.model = KMeansClusterer(
         self.k, distance=cosine, avoid_empty_clusters=True)
     self.model.cluster([
         self.vectorize(
             corpus.words(fileid)
         ) for fileid in corpus.fileids(categories=['news'])
     ])
Code example #41
File: cluster.py Project: zhuxi0511/NLTK_test
def my_demo_main(file_list_name, tokenizer_num=0):
    from mmseg import seg_txt
    from nltk.cluster import KMeansClusterer, euclidean_distance
    from nltk.cluster import GAAClusterer
    tokenizer_list = [seg_txt,]
    file_list = open(file_list_name)
    tokenizer = tokenizer_list[tokenizer_num]
    texts = [[term for term in tokenizer(open('pos/' + str(file_name.strip())).read())] for file_name in file_list]

    data = TF_IDF(texts)

    vectors = []

    file_count = 1
    feature_set = set()
    for text in data.texts:
        vector = list()
        for term in set(text):
            vector.append((data.tf_idf(term, text), term))
        vector.sort(key=lambda x:x[0], reverse=True)
        for term in vector[:int(len(vector)*0.15) + 1]:
            feature_set.add(term[1])

    print(feature_set)
    print(len(feature_set))
    for text in data.texts:
        vector = list()
        for term in feature_set:
            if term in text:
                vector.append(data.tf_idf(term, text))
            else:
                vector.append(0)
        square_sum = math.sqrt(sum(x * x for x in vector))
        vector = [x / square_sum for x in vector]
        vectors += [numpy.array(vector)]
        print(file_count)
        file_count += 1

    means = find_max_density(vectors, euclidean_distance)
    print('means', len(means))

    f = open('result.txt', 'w')
    clusterer = KMeansClusterer(len(means), euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, True)
    print('km1', clusters)
    f.write('km1: ' + str(clusters) + '\n')

    clusterer = KMeansClusterer(len(vectors) // 10, euclidean_distance, repeats=10)

    clusters = clusterer.cluster(vectors, True, True)
    print('km2', clusters)
    f.write('km2: ' + str(clusters) + '\n')

    clusterer = GAAClusterer(len(vectors) // 10)
    clusters = clusterer.cluster(vectors, True)
    print('gaac', clusters)
    f.write('gaac: ' + str(clusters) + '\n')
    f.close()
Code example #42
    def cluster(self, k=5, repeats=1):
        '''
        Cluster documents into k clusters using the NLTK
        implementation of K-Means clustering. The frequency of each
        unique word across an article serves as its feature vector.
        '''
        article_freq_count = {} #frequency of each unique word in a given article
        for article in self.testing_articles:
            article_freq_count[article.id] = []
            for unique_word in self.keywords:
                #count frequency of word in article, add to frequency list
                article_freq_count[article.id].append(article.content.count(unique_word))

        #nltk k-means requires numpy array-like objects
        vectors = [array(article_freq_count[article]) for article in article_freq_count]
        clusterer = KMeansClusterer(k, cosine_distance, repeats=repeats)
        clusters = clusterer.cluster(vectors, True, trace=False)

        groups = [[] for _ in range(k)]

        #vector positions need to be converted back to article IDs,
        #because IDs are striped during vector construction.
        vector_ids = {} #maps positions in the vector to article IDs
        f =  article_freq_count.copy()
        for pos in range(len(vectors)):
            for id in list(f.keys()):
                #equivalent to 'if article_freq_count[id] == vectors[pos]',
                #but numpy equivalence checking is weird
                t = article_freq_count[id] == vectors[pos]
                if False not in t:
                    vector_ids[pos] = id
                    f.pop(id)
                    break

        for i in range(len(clusters)):
            groups[clusters[i]].append(vector_ids[i])

        return groups
Code example #43
######################################
# Cluster a BOW vector in 4 clusters #
#                                    #
# Requirements: clusterVectors       #
# Usage       : %loadpy cluster.py   #
######################################

import nltk
from nltk import cluster
from nltk.cluster import cosine_distance
from nltk.cluster import KMeansClusterer

numClusters = 4
print "KMeans Clustering with %d means and using cosine distance" %numClusters
clusterer = KMeansClusterer(numClusters, cosine_distance);
clusters = clusterer.cluster(clusterVectors, assign_clusters=True, trace=False);
means = clusterer.means();
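clusterVectors is a stated requirement of this script and is not built here. A minimal hedged sketch of what such a bag-of-words matrix could look like (the documents and vocabulary are invented):

import numpy

documents = [['the', 'cat', 'sat'], ['the', 'dog', 'ran'], ['a', 'cat', 'ran']]
vocab = sorted(set(word for doc in documents for word in doc))
clusterVectors = [numpy.array([doc.count(word) for word in vocab])
                  for doc in documents]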
Code example #44
File: clustering.py Project: samsheff/Filing-Cabinet
    if len(sys.argv) == 2:
        filename = sys.argv[1]
    else:
        print("No filename specified! Exiting.")
        exit()

    with open(filename, encoding="utf-8") as title_file:

        print("Reading Files")
        job_titles = [line.strip() for line in title_file.readlines()]

        print("Parsing Words")
        words = get_words(job_titles)

        print("Creating Cluster Instance")
        cluster = KMeansClusterer(10, euclidean_distance, repeats=5)

        # Alternative Clusterer - Less accurate for my use
        #cluster = GAAClusterer(20)

        print("Clustering")
        cluster.cluster([vectorspaced(title) for title in job_titles if title])

        # NOTE: This is inefficient, cluster.classify should really just be
        # called when you are classifying previously unseen examples!
        print("Classifying")
        classified_examples = [
                cluster.classify(vectorspaced(title)) for title in job_titles
            ]
        print("Saving results")
        for cluster_id, title in sorted(zip(classified_examples, job_titles)):
Code example #45
    0's are inserted otherwise.
    @param response The survey response to generate a vector for
    '''
    response_components = [normalize_word(word) for word in response.split()]    
    return numpy.array([
        word in response_components and not word in stopwords
        for word in words], numpy.short)
 
if __name__ == '__main__':
 
    num_clusters = DEFAULT_NUM_CLUSTERS
    if len(sys.argv) == 2:
        num_clusters = int(sys.argv[1])
 
    with open("reviews.txt") as survey_file:
 
        responses = [line.strip() for line in survey_file.readlines()]
 
        words = get_words(responses)
 
        cluster = KMeansClusterer(num_clusters, euclidean_distance,
                                  repeats=100, avoid_empty_clusters=True)
        cluster.cluster([vectorspaced(response) for response in responses if response])
 
        classified_examples = [
                cluster.classify(vectorspaced(response)) for response in responses
            ]
 
        for cluster_id, title in sorted(zip(classified_examples, responses)):
            print(cluster_id, title)
Code example #46
File: lolcorpus.py Project: credeiki/LoL
    return freq_lore.items()

    
num_clusters=20
vec_len=len(common_words)
vector_words=common_words[:vec_len]
word_freqs=[get_word_freq(text) for [text,a,b] in annotated]
tmp_vector=[]
for champ_freq in word_freqs:
    for word in vector_words:
        appendable=None
        for (aword, afreq) in champ_freq:
            if word == aword:
                appendable=afreq
        if appendable==None:
            tmp_vector.append(0)
        else:
            tmp_vector.append(appendable)
            
vector_list=[tmp_vector[i:i+vec_len] for i in range(0, len(tmp_vector), vec_len)]
word_array=numpy.array(vector_list)
clusterer = KMeansClusterer(num_clusters, euclidean_distance, repeats=10)
clusters = clusterer.cluster(word_array, True)
enum_clusters=list(enumerate(clusters))
enum_clusters.sort(key=lambda x: x[1])
clustered_champs = [(annotated[index][0], clus_num) for (index, clus_num) in enum_clusters]


print('clustered_champs',clustered_champs)

Code example #47
 def __init__(self):
     self.clf = KMeansClusterer(2, cosine_distance, repeats=30, avoid_empty_clusters=True)
Code example #48
def get_cluster(k=K):
    cluster = KMeansClusterer(k, euclidean_distance)
    cluster.cluster([vectorspaced(corpus.words(fileid)) for fileid in corpus.fileids()])
    return cluster
Code example #49
def kmeans_cluster(datamatrix, numofclusters=3):
    clusterer = KMeansClusterer(numofclusters, euclidean_distance)
    groups = clusterer.cluster(datamatrix, assign_clusters=True, trace=True)
    means = clusterer.means()
    return groups, means
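A hedged usage sketch with toy 2-D points (the data is invented for illustration):

import numpy

data = [numpy.array(p) for p in [[1, 2], [1, 4], [8, 8], [9, 11]]]
groups, means = kmeans_cluster(data, numofclusters=2)
print(groups)  # one cluster id per point, e.g. [0, 0, 1, 1]
print(means)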
Code example #50
File: NLTK.py Project: mkhuthir/learnPython
# ### k-Means Clustering
# [Clustering](http://www.nltk.org/api/nltk.cluster.html) groups similar items together.  
# The K-means clusterer starts with k arbitrarily chosen means (or centroids) then assigns each vector to the cluster with the closest mean. It then recalculates the means of each cluster as the centroid of its vector members. This process repeats until the cluster memberships stabilize. [NLTK docs on this example](https://www.nltk.org/_modules/nltk/cluster/kmeans.html)  
# This example clusters int vectors, which you can think of as points on a plane. But you could also use clustering to cluster similar documents by vocabulary/topic.

# In[80]:


import numpy as np
from nltk.cluster import KMeansClusterer, euclidean_distance

vectors = [np.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
means = [[4, 3], [5, 5]]

clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
clusters = clusterer.cluster(vectors, True, trace=True)

print('Clustered:', vectors)
print('As:', clusters)
print('Means:', clusterer.means())


# **k-Means Clustering, Example-2**  
# In this example we cluster an array of 6 points into 2 clusters.  
# The initial centroids are randomly chosen by the clusterer, and it does 10 iterations to regroup the clusters and recalculate centroids. 

# In[103]:


vectors = [np.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]