def cluster_texts(texts, clustersNumber, distance):
    # Convert the texts into a collection:
    # load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of", len(collection), "terms.")

    # To represent each text as a vector of representative terms, get the list of unique terms.
    unique_terms = list(set(collection))
    print("Unique terms found: ", len(unique_terms))

    ### And here we actually call the function and create our array of vectors.
    # TF measures how often each term occurs within a text.
    # For each unique term it counts occurrences in that document only, not across the collection.
    # Other measures, such as TF-IDF, are more precise because they also account for how often a term appears in the whole collection.
    vectors = [numpy.array(TF(f, unique_terms, collection)) for f in texts]
    print("Vectors created.")
    print(vectors)

    # initialize the clusterer
    clusterer = GAAClusterer(clustersNumber)
    clusters = clusterer.cluster(vectors, True)
    # The commented lines below do the same with another library, scikit-learn.
    #clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
    #                                  linkage="average", affinity=distanceFunction)
    #clusters = clusterer.fit_predict(vectors)

    return clusters
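The function above relies on a TF helper that this snippet does not show. A minimal sketch, assuming it simply builds one term-frequency value per unique term with nltk.TextCollection.tf:

def TF(document, unique_terms, collection):
    # One TF value per unique term: frequency of the term within this document only.
    return [collection.tf(term, document) for term in unique_terms]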
Example #2
def Clustering(orig, minclusters, maxclusters) :
    '''returns (distortion score, number of clusters, cluster assignment)'''

    # perform clustering
    clusterer = GAAClusterer()
    clusterer.cluster(orig)
    vrc = []

    # calculate distortions
    wb = len(orig)
    centroid = numpy.mean(orig, axis=0)
    for vector in orig : wb -= cosine_distance(vector, centroid)
    lowerbound = minclusters
    if lowerbound < 2 : lowerbound = 2
    for k in range(lowerbound, maxclusters + 1) :
        clusterer.update_clusters(k)
        gaac = []
        ww = len(orig)
        for vector in orig :
            maxcos = None
            for j in range(k) :
                clust = clusterer._centroids[j]
                cdist = cosine_distance(vector, clust)
                if not maxcos or cdist > maxcos[0] :
                    maxcos = (cdist, j)
            ww -= maxcos[0]
            gaac.append(maxcos[1])
        vrc.append(((wb/(k - 1)) / (ww/(len(orig) - k)), k, gaac))
    khat = (float("inf"), vrc[0][1], vrc[0][2])
    for k in range(1, len(vrc) - 1) :
        dist = (vrc[k+1][0] - vrc[k][0]) - (vrc[k][0] - vrc[k-1][0])
        if dist < khat[0] : khat = (dist, vrc[k][1], vrc[k][2])

    return khat
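A hedged usage sketch of Clustering() on toy 2-D data (values are illustrative only); the function itself additionally needs GAAClusterer and cosine_distance from nltk.cluster, as in the snippet:

import numpy

vectors = [numpy.array(v, numpy.float64) for v in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
# Search k = 2..4 and take the k at the elbow of the variance-ratio curve.
score, best_k, assignment = Clustering(vectors, 2, 4)
print("chosen k:", best_k)
print("assignment:", assignment)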
def extract_tweets_citedby_graph(df):
  global stemmer_func, words, stopwords

  stemmer_func = nltk.stem.snowball.SnowballStemmer("english").stem
  stopwords = set(nltk.corpus.stopwords.words('english'))

  words = get_words(df[2].values)
  # pp.pprint(words[:10])

  # K-Means clustering:
  # cluster = KMeansClusterer(7, euclidean_distance, avoid_empty_clusters=True)

  # GAAClusterer
  cluster = GAAClusterer(21)

  cluster.cluster([vectorspaced(title) for title in df[2].values if title], True)
  classified_examples = [cluster.classify(vectorspaced(title)) for title in df[2].values]

  # for cluster_id, title in sorted(zip(classified_examples, df[2].values)):
  #   # print("{}\t{}\t{}\n".format(cluster_id, df[0].loc[df[2] == title].values, df[1].loc[df[2] == title].values))
  #   print("{}\t{}\t{}".format(cluster_id, df[1].loc[df[2] == title].values, title))

  # Display clusters / write to disk
  with open('Results/clustered_relevant_users.tsv', 'w') as f:
    for cluster_id, title in sorted(zip(classified_examples, df[2].values)):
      if cluster_id > 6:
        # save: docid tab userids
        f.write('{}\t{}\n'.format(df[0].loc[df[2] == title].values, df[1].loc[df[2] == title].values))
  if os.path.exists('Results/clustered_relevant_users.tsv'):
    print('file saved: Results/clustered_relevant_users.tsv')

  return
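A hedged sketch of the get_words / vectorspaced helpers this snippet assumes; their behaviour is inferred from the similar job-title examples later on this page, so treat the details as assumptions:

import numpy

def get_words(titles):
    # Build the stemmed vocabulary over all titles, using the globals set above.
    found = set()
    for title in titles:
        if not title:
            continue
        for word in title.split():
            found.add(stemmer_func(word.lower()))
    return list(found)

def vectorspaced(title):
    # Binary bag-of-words vector over the global `words` vocabulary.
    title_components = [stemmer_func(word.lower()) for word in title.split()]
    return numpy.array(
        [word in title_components and word not in stopwords for word in words],
        numpy.short)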
Example #4
def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """

    from nltk.cluster import GAAClusterer

    # use a set of tokens with 2D indices
    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test the GAAC clusterer with 4 clusters
    clusterer = GAAClusterer(4)
    clusters = clusterer.cluster(vectors, True)

    print('Clusterer:', clusterer)
    print('Clustered:', vectors)
    print('As:', clusters)
    print()

    # show the dendrogram
    clusterer.dendrogram().show()

    # classify a new vector
    vector = numpy.array([3, 3])
    print('classify(%s):' % vector, end=' ')
    print(clusterer.classify(vector))
    print()
def get_word_clusters(tweets):
    all_words = set()
    for tweet in tweets:
        for word in get_words(tweet[HEADER_DICT["text"]]):
            all_words.add(word)
    all_words = tuple(all_words)

    cluster = GAAClusterer(5)
    cluster.cluster([vectorspaced(tweet[HEADER_DICT["text"]], all_words) for tweet in tweets])

    classified_examples = [cluster.classify(vectorspaced(tweet[HEADER_DICT["text"]], all_words)) for tweet in tweets]

    tweet_texts = [tweet[HEADER_DICT["text"]] for tweet in tweets]
    for cluster_id, text in sorted(zip(classified_examples, tweet_texts)):
        print(cluster_id, text)
def get_word_clusters():
    all_words = set()
    for tweet in tweets.find():
        for word in get_words(tweet['text']):
            all_words.add(word)
    all_words = tuple(all_words)

    cluster = GAAClusterer(5)
    cluster.cluster([vectorspaced(tweet['text'], all_words) for tweet in tweets.find()])

    classified_examples = [
        cluster.classify(vectorspaced(tweet['text'], all_words)) for tweet in tweets.find()
    ]

    tweet_texts = [tweet['text'] for tweet in tweets.find()]
    for cluster_id, text in sorted(zip(classified_examples, tweet_texts)):
        print(cluster_id, text)
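Both get_word_clusters variants above lean on get_words and a two-argument vectorspaced that are not shown. A minimal sketch under those assumptions (the original tokenizer and normalisation may differ):

import numpy

def get_words(text):
    # Naive whitespace tokenizer; the original may stem or strip punctuation.
    return [word.lower() for word in text.split()]

def vectorspaced(text, all_words):
    # Binary presence vector over the shared vocabulary tuple.
    text_words = set(get_words(text))
    return numpy.array([word in text_words for word in all_words], numpy.short)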
Example #7
def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """

    from nltk.cluster import GAAClusterer

    # use a set of tokens with 2D indices
    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test the GAAC clusterer with 4 clusters
    clusterer = GAAClusterer(4)
    clusters = clusterer.cluster(vectors, True)

    print('Clusterer:', clusterer)
    print('Clustered:', vectors)
    print('As:', clusters)
    print()

    # show the dendrogram
    clusterer.dendrogram().show()

    # classify a new vector
    vector = numpy.array([3, 3])
    print('classify(%s):' % vector, end=' ')
    print(clusterer.classify(vector))
    print()
def get_word_clusters():
    all_words = set()
    for tweet in tweets.find():
        for word in get_words(tweet['text']):
            all_words.add(word)
    all_words = tuple(all_words)

    cluster = GAAClusterer(5)
    cluster.cluster(
        [vectorspaced(tweet['text'], all_words) for tweet in tweets.find()])

    classified_examples = [
        cluster.classify(vectorspaced(tweet['text'], all_words))
        for tweet in tweets.find()
    ]

    tweet_texts = [tweet['text'] for tweet in tweets.find()]
    for cluster_id, text in sorted(zip(classified_examples, tweet_texts)):
        print(cluster_id, text)
def Gaaclusterer_experiment(samples, k_cluster):
    silhouette = []
    davies_bouldin = []

    for i in range(2, k_cluster):
        gaaclusterer = GAAClusterer(num_clusters=i)
        assigned_cluster = gaaclusterer.cluster(samples, True)
        silhouette.append(metrics.silhouette_score(X=samples, labels=np.array(assigned_cluster)))
        davies_bouldin.append(davies_bouldin_score(samples, assigned_cluster))

    plt.plot(np.arange(2, k_cluster), silhouette, c='r', label='silhouette')
    plt.plot(np.arange(2, k_cluster), davies_bouldin, c='g', label='davies_bouldin')
    plt.xlabel('number of clusters')
    plt.ylabel('Score')
    plt.title('GAACluster')
    plt.legend()
    plt.show()
    return assigned_cluster
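A hedged usage sketch with the imports the function presumes and a toy two-blob dataset; real input would be document vectors:

import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import davies_bouldin_score
from nltk.cluster import GAAClusterer

# Two loose groups of random 5-dimensional points, purely for illustration.
samples = np.vstack([np.random.rand(20, 5), np.random.rand(20, 5) + 2.0])
labels = Gaaclusterer_experiment(samples, k_cluster=6)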
Example #10
def cluster_texts(texts, clustersNumber, distance):
    #Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of", len(collection), "terms.")

    #get a list of unique terms
    unique_terms = list(set(collection))
    print("Unique terms found: ", len(unique_terms))

    ### And here we actually call the function and create our array of vectors.
    vectors = [numpy.array(TF(f, unique_terms, collection)) for f in texts]
    print("Vectors created.")

    # initialize the clusterer
    clusterer = GAAClusterer(clustersNumber)
    clusters = clusterer.cluster(vectors, True)
    #clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
    #                                  linkage="average", affinity=distanceFunction)
    #clusters = clusterer.fit_predict(vectors)

    return clusters
Example #11
def Clustering(orig, minclusters, maxclusters):
    '''returns (distortion score, number of clusters, cluster assignment)'''

    # perform clustering
    clusterer = GAAClusterer()
    clusterer.cluster(orig)
    vrc = []

    # calculate distortions
    wb = len(orig)
    centroid = numpy.mean(orig, axis=0)
    for vector in orig:
        wb -= cosine_distance(vector, centroid)
    lowerbound = minclusters
    if lowerbound < 2: lowerbound = 2
    for k in range(lowerbound, maxclusters + 1):
        clusterer.update_clusters(k)
        gaac = []
        ww = len(orig)
        for vector in orig:
            maxcos = None
            for j in range(k):
                clust = clusterer._centroids[j]
                cdist = cosine_distance(vector, clust)
                if not maxcos or cdist > maxcos[0]:
                    maxcos = (cdist, j)
            ww -= maxcos[0]
            gaac.append(maxcos[1])
        vrc.append(((wb / (k - 1)) / (ww / (len(orig) - k)), k, gaac))
    khat = (float("inf"), vrc[0][1], vrc[0][2])
    for k in range(1, len(vrc) - 1):
        dist = (vrc[k + 1][0] - vrc[k][0]) - (vrc[k][0] - vrc[k - 1][0])
        if dist < khat[0]: khat = (dist, vrc[k][1], vrc[k][2])

    return khat
def clusterIt(kwnb, clusternb, keywords):
    @decorators.memoize
    def normalize_word(word):
        return stemmer_func(word.lower())
     
    def get_words(titles):
        words = set()
        for title in titles:
            for word in title.split():
                words.add(normalize_word(word))
        return list(words)
     
    @decorators.memoize
    def vectorspaced(title):
        title_components = [normalize_word(word) for word in title.split()]
        return numpy.array([
            word in title_components and not word in stopwords
            for word in words], numpy.short)
    
    ret = list()          
    if len(keywords) > 0:                        
        job_titles = [x.keyword for x in keywords]
        job_titles = [x.strip() for x in job_titles]
        words = get_words(job_titles)    
        
        # cluster = KMeansClusterer(5, euclidean_distance)
        cluster = GAAClusterer(clusternb)
        cluster.cluster([vectorspaced(title) for title in job_titles if title])
        classified_examples = [cluster.classify(vectorspaced(title)) for title in job_titles]
        
        for cluster_id, title in sorted(zip(classified_examples, job_titles)):
            if title != '':
                for keyword in keywords:
                    if title == keyword.keyword:
                        keyword.assignCluster(cluster_id)
                        ret.append(keyword)
    return ret
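A hedged usage sketch. The Keyword class below is hypothetical (the original presumably has its own keyword model exposing .keyword and .assignCluster()); stemmer_func, stopwords and decorators follow the other examples on this page:

import numpy
from nltk import decorators
from nltk.cluster import GAAClusterer
from nltk.corpus import stopwords as stopwords_corpus
from nltk.stem.snowball import SnowballStemmer

stemmer_func = SnowballStemmer("english").stem
stopwords = set(stopwords_corpus.words('english'))

class Keyword:
    # Hypothetical stand-in for the original keyword model.
    def __init__(self, keyword):
        self.keyword = keyword
        self.cluster = None

    def assignCluster(self, cluster_id):
        self.cluster = cluster_id

keywords = [Keyword(k) for k in ["data engineer", "data scientist", "web developer", "frontend developer"]]
for kw in clusterIt(kwnb=len(keywords), clusternb=2, keywords=keywords):
    print(kw.cluster, kw.keyword)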
Example #13
#nltk kmeans
model = KMeansClusterer(cluster_number,
                        distance=cosine_distance,
                        repeats=epochs)
clusters = model.cluster(vectors, assign_clusters=True)

dump(model, '../data/advanced_nltk_kmeans.joblib')

# Attach the cluster assignments to the data and write them to disk
data['cluster'] = pd.DataFrame(clusters)
data[['text', 'cluster']].to_csv('../data/text_clustered_nltk_kmeans.csv',
                                 index=True,
                                 quoting=csv.QUOTE_ALL)

#nltk GAAClusterer
model = GAAClusterer(num_clusters=cluster_number)
model.cluster(vectors, assign_clusters=True)

clusters = [model.classify_vectorspace(vector.tolist()) for vector in vectors]

data['cluster'] = pd.DataFrame(clusters)
data[['text', 'cluster']].to_csv('../data/text_clustered_nltk_gaac.csv',
                                 index=True,
                                 quoting=csv.QUOTE_ALL)

#sklearn KMeans
model = KMeans(n_clusters=cluster_number, max_iter=epochs, n_jobs=8)
model.fit(vectors)
dump(model, '../data/advanced_sklearn_kmeans.joblib')

data['cluster'] = pd.DataFrame(model.labels_)
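A hedged sketch of the setup this script presumes: its imports plus the cluster_number / epochs settings; the values below are assumptions, not part of the original:

import csv
import pandas as pd
from joblib import dump
from nltk.cluster import KMeansClusterer, GAAClusterer
from nltk.cluster.util import cosine_distance
from sklearn.cluster import KMeans

cluster_number = 10   # assumed
epochs = 25           # assumed
# `data` (a DataFrame with a 'text' column) and `vectors` (one dense vector per row,
# e.g. TF-IDF rows converted with .toarray()) are built earlier in the original script.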
Example #14
def vectorspaced(title):
    title_components = [normalize_word(word) for word in title.split()]
    return numpy.array(
        [word in title_components and not word in stopwords for word in words],
        numpy.short)


if __name__ == '__main__':

    filename = 'example.txt'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    with open(filename) as title_file:

        job_titles = [line.strip() for line in title_file.readlines()]

        words = get_words(job_titles)

        # cluster = KMeansClusterer(5, euclidean_distance)
        cluster = GAAClusterer(5)
        cluster.cluster([vectorspaced(title) for title in job_titles if title])

        # NOTE: This is inefficient, cluster.classify should really just be
        # called when you are classifying previously unseen examples!
        classified_examples = [
            cluster.classify(vectorspaced(title)) for title in job_titles
        ]

        for cluster_id, title in sorted(zip(classified_examples, job_titles)):
            print(cluster_id, title)
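A hedged sketch of the remaining pieces this job-title example presumes (imports, stemmer, stopword set, normalize_word and get_words), mirroring the other snippets on this page:

import sys
import numpy
from nltk import decorators
from nltk.cluster import GAAClusterer
from nltk.corpus import stopwords as stopwords_corpus
from nltk.stem.snowball import SnowballStemmer

stemmer_func = SnowballStemmer("english").stem
stopwords = set(stopwords_corpus.words('english'))

@decorators.memoize
def normalize_word(word):
    return stemmer_func(word.lower())

def get_words(titles):
    # Stemmed vocabulary over all titles; becomes the global `words` used by vectorspaced.
    words = set()
    for title in titles:
        for word in title.split():
            words.add(normalize_word(word))
    return list(words)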
Example #15
def cluster3(index, k):
    from nltk.cluster import GAAClusterer
    clusterer = GAAClusterer(k)
    clusters = clusterer.cluster(index, True)
    return clusters
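A hedged usage sketch of cluster3 on toy 2-D vectors; `index` in the original is presumably a document-term matrix:

import numpy

index = [numpy.array(v, numpy.float64) for v in [[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]]]
print(cluster3(index, 2))   # prints one cluster id per vector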
Example #16
def vectorspaced(stemmer, title):
    title_components = [stemmer.stem(word.lower()) for word in title.split()]
    return numpy.array(
        [word in title_components and not word in stopwords for word in words],
        numpy.short)


if __name__ == '__main__':

    filename = 'CSV/pridected_true_text_alldata.csv'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    with open(filename) as title_file:

        job_titles = [line.strip() for line in title_file.readlines()]

        words = get_words(stemmer, job_titles)

        # cluster = KMeansClusterer(5, euclidean_distance)
        cluster = GAAClusterer(30)
        cluster.cluster(
            [vectorspaced(stemmer, title) for title in job_titles if title])

        # NOTE: This is inefficient, cluster.classify should really just be
        # called when you are classifying previously unseen examples!
        classified_examples = [
            cluster.classify(vectorspaced(stemmer, title))
            for title in job_titles
        ]

        for cluster_id, title in sorted(zip(classified_examples, job_titles)):
            print(cluster_id, title)
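A hedged sketch of the setup this variant presumes: the stemmer, the stopword set, the global `words` vocabulary, and a get_words that takes the stemmer explicitly; treat the details as assumptions:

import sys
import numpy
from nltk.cluster import GAAClusterer
from nltk.corpus import stopwords as stopwords_corpus
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
stopwords = set(stopwords_corpus.words('english'))

def get_words(stemmer, titles):
    # Stemmed vocabulary over all titles; assigned to the global `words`.
    words = set()
    for title in titles:
        for word in title.split():
            words.add(stemmer.stem(word.lower()))
    return list(words)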
Example #17
    return numpy.array(w_vec, numpy.float64)


if __name__ == '__main__':
    corpus_dir='/Users/dehao/github/Lydata/ckip/corpus_res/'
    #corpus_dir='D:/python_workspace/corpus_test/'
    words=get_words(corpus_dir)
    vec_space=[]
    indf_dict={}
    index=0
    # write the index-to-filename mapping
    fw1 = open('/Users/dehao/github/Lydata/ckip/index_file.txt', 'w')
    # write the clustering results
    fw2 = open('/Users/dehao/github/Lydata/ckip/cluser_res.txt', 'w')
    for f in os.listdir(corpus_dir):
        index += 1
        print('files handled so far:', index)
        indf_dict[index] = f
        with codecs.open(corpus_dir + f, 'r', encoding='utf-8') as title:
            vec_space.append(vectorspaced(title))
    
    # How should the number of clusters be chosen from the histogram?
    cluster = GAAClusterer(40)
    clustered = cluster.cluster(vec_space, True)
    # cluster.dendrogram().show()
    for k, v in indf_dict.items():
        fw1.write('%s\t%s\n' % (str(k), str(v)))
    fw1.close()
    
    fw2.write('%s\n' % ','.join([str(v) for v in clustered]))
    fw2.close()
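A hedged sketch of the helpers this example presumes. How the original builds w_vec is not shown; the version below simply counts, for each vocabulary word, its occurrences in one pre-segmented file, so treat it purely as an assumption:

import os
import codecs
import numpy

def get_words(corpus_dir):
    # Vocabulary over all (whitespace-segmented) files in the corpus directory.
    vocab = set()
    for fname in os.listdir(corpus_dir):
        with codecs.open(os.path.join(corpus_dir, fname), 'r', encoding='utf-8') as fh:
            for line in fh:
                vocab.update(line.split())
    return sorted(vocab)

def vectorspaced(title):
    # `title` is an open file handle, as in the main loop above.
    text_words = title.read().split()
    w_vec = [text_words.count(word) for word in words]
    return numpy.array(w_vec, numpy.float64)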
Example #18
def vectorspaced(stemmer, title):
    title_components = [stemmer.stem(word.lower()) for word in title.split()]
    return numpy.array([
        word in title_components and not word in stopwords
        for word in words], numpy.short)
 
if __name__ == '__main__':
 
    filename = 'CSV/pridected_true_text_alldata.csv'
    if len(sys.argv) == 2:
        filename = sys.argv[1]
 
    
    with open(filename) as title_file:
 
        job_titles = [line.strip() for line in title_file.readlines()]
 
        words = get_words(stemmer, job_titles)
 
        # cluster = KMeansClusterer(5, euclidean_distance)
        cluster = GAAClusterer(30)
        cluster.cluster([vectorspaced(stemmer, title) for title in job_titles if title])
 
        # NOTE: This is inefficient, cluster.classify should really just be
        # called when you are classifying previously unseen examples!
        classified_examples = [
                cluster.classify(vectorspaced(stemmer, title)) for title in job_titles
            ]
 
        for cluster_id, title in sorted(zip(classified_examples, job_titles)):
            print(cluster_id, title)
silhouette_score(tfidf,
                 array1,
                 metric='euclidean',
                 sample_size=None,
                 random_state=None)
#0.031277350000072916

## clustering only on the training data
km2 = KMeans(n_clusters=num_clusters, random_state=42)
km2.fit(X_all)
clusters2 = km2.labels_.tolist()

array2 = np.array(clusters2)
silhouette_score(X_all,
                 array2,
                 metric='euclidean',
                 sample_size=None,
                 random_state=None)
#0.037444797109297122

from nltk.cluster import GAAClusterer
clusterer = GAAClusterer(4)
clusters_agg = clusterer.cluster(X_all.toarray(), True)
array3 = np.array(clusters_agg)
# Evaluating the nltk agglomerative clustering
silhouette_score(X_all,
                 array3,
                 metric='cosine',
                 sample_size=None,
                 random_state=None)
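A hedged sketch of the setup the scoring snippet presumes: the imports plus a TF-IDF matrix X_all and a cluster count. The documents below are illustrative only, and `tfidf` / `array1` come from an earlier clustering run that is not shown:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score

docs = ["gaac clusters documents", "kmeans clusters documents",
        "cosine distance for text", "euclidean distance for points",
        "agglomerative clustering of tweets", "silhouette score comparison"]
X_all = TfidfVectorizer().fit_transform(docs)
num_clusters = 2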