Example #1
def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """

    import numpy

    from nltk.cluster import GAAClusterer

    # use a small set of 2-D vectors
    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test the GAAC clusterer with 4 clusters
    clusterer = GAAClusterer(4)
    clusters = clusterer.cluster(vectors, True)

    print("Clusterer:", clusterer)
    print("Clustered:", vectors)
    print("As:", clusters)
    print()

    # show the dendrogram
    clusterer.dendrogram().show()

    # classify a new vector
    vector = numpy.array([3, 3])
    print("classify(%s):" % vector, end=" ")
    print(clusterer.classify(vector))
    print()
Example #2
0
def Clustering(orig, minclusters, maxclusters):
    '''returns (distortion score, number of clusters, cluster assignment)'''

    import numpy
    from nltk.cluster import GAAClusterer, cosine_distance

    # perform clustering
    clusterer = GAAClusterer()
    clusterer.cluster(orig)
    vrc = []

    # calculate distortions
    wb = len(orig)
    centroid = numpy.mean(orig, axis=0)
    for vector in orig:
        wb -= cosine_distance(vector, centroid)
    lowerbound = minclusters
    if lowerbound < 2: lowerbound = 2
    for k in range(lowerbound, maxclusters + 1):
        clusterer.update_clusters(k)
        gaac = []
        ww = len(orig)
        for vector in orig:
            maxcos = None
            for j in range(k):
                clust = clusterer._centroids[j]
                cdist = cosine_distance(vector, clust)
                if not maxcos or cdist > maxcos[0]:
                    maxcos = (cdist, j)
            ww -= maxcos[0]
            gaac.append(maxcos[1])
        vrc.append(((wb / (k - 1)) / (ww / (len(orig) - k)), k, gaac))
    khat = (float("inf"), vrc[0][1], vrc[0][2])
    for k in range(1, len(vrc) - 1):
        dist = (vrc[k + 1][0] - vrc[k][0]) - (vrc[k][0] - vrc[k - 1][0])
        if dist < khat[0]: khat = (dist, vrc[k][1], vrc[k][2])

    return khat
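A minimal, hypothetical usage sketch for the function above (the toy vectors and parameter values below are assumptions for illustration, not part of the original example):

import numpy

# toy 2-D document vectors, just to exercise Clustering()
docs = [numpy.array(v, dtype=float) for v in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

score, k, assignment = Clustering(docs, minclusters=2, maxclusters=4)
print("chosen number of clusters:", k)
print("cluster assignment:", assignment)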
Example #3
def cluster_texts(texts, clustersNumber, distance):
    # Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of", len(collection), "terms.")

    # To represent the texts as vectors of representative terms, take the unique terms.
    # Get a list of unique terms
    unique_terms = list(set(collection))
    print("Unique terms found: ", len(unique_terms))

    ### And here we actually call the function and create our array of vectors.
    # TF measures term frequency within each text:
    # for each unique term, it counts how many times it appears in the document, not how often it appears in the collection.
    # Other measures, such as TF-IDF, are more precise because they also consider how often the term appears across the collection.
    vectors = [numpy.array(TF(f, unique_terms, collection)) for f in texts]
    print("Vectors created.")
    print(vectors)

    # initialize the clusterer
    clusterer = GAAClusterer(clustersNumber)
    clusters = clusterer.cluster(vectors, True)
    # The commented-out lines below do the same thing with another library, scikit-learn.
    #clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
    #                                  linkage="average", affinity=distanceFunction)
    #clusters = clusterer.fit_predict(vectors)

    return clusters
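The TF helper called above is not defined in this example. A minimal sketch of what it might look like, assuming it returns, for each unique term, NLTK's term frequency within the given text (the function name and signature come from the call above; the body is an assumption):

def TF(document, unique_terms, collection):
    # assumed implementation: NLTK term frequency of each unique term in this document
    word_tf = []
    for word in unique_terms:
        word_tf.append(collection.tf(word, document))
    return word_tf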
Example #4
def Gaaclusterer_experiment(samples, k_cluster):
    silhouette = []
    davies_bouldin = []

    for i in range(2, k_cluster):
        gaaclusterer = GAAClusterer(num_clusters=i)
        assigned_cluster = gaaclusterer.cluster(samples, True)
        silhouette.append(metrics.silhouette_score(X=samples, labels=np.array(assigned_cluster)))
        davies_bouldin.append(davies_bouldin_score(samples, assigned_cluster))

    plt.plot(np.arange(2, k_cluster), silhouette, c='r', label='silhouette')
    plt.plot(np.arange(2, k_cluster), davies_bouldin, c='g', label='davies_bouldin')
    plt.xlabel('number of clusters')
    plt.ylabel('Score')
    plt.title('GAACluster')
    plt.legend()
    plt.show()
    return assigned_cluster
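A hypothetical call to the experiment above, assuming dense 2-D sample data and the imports the function relies on (all concrete values below are assumptions for illustration):

import numpy as np
import matplotlib.pyplot as plt
from nltk.cluster import GAAClusterer
from sklearn import metrics
from sklearn.metrics import davies_bouldin_score

# toy data: 20 random points in 2-D
samples = np.random.RandomState(0).rand(20, 2)
labels = Gaaclusterer_experiment(samples, k_cluster=6)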
Example #5
def get_word_clusters():
    all_words = set()
    for tweet in tweets.find():
        for word in get_words(tweet['text']):
            all_words.add(word)
    all_words = tuple(all_words)

    cluster = GAAClusterer(5)
    cluster.cluster(
        [vectorspaced(tweet['text'], all_words) for tweet in tweets.find()])

    classified_examples = [
        cluster.classify(vectorspaced(tweet['text'], all_words))
        for tweet in tweets.find()
    ]

    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        print(cluster_id, title)
Example #6
def cluster_texts(texts, clustersNumber, distance):
    #Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of", len(collection), "terms.")

    #get a list of unique terms
    unique_terms = list(set(collection))
    print("Unique terms found: ", len(unique_terms))

    ### And here we actually call the function and create our array of vectors.
    vectors = [numpy.array(TF(f, unique_terms, collection)) for f in texts]
    print("Vectors created.")

    # initialize the clusterer
    clusterer = GAAClusterer(clustersNumber)
    clusters = clusterer.cluster(vectors, True)
    #clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
    #                                  linkage="average", affinity=distanceFunction)
    #clusters = clusterer.fit_predict(vectors)

    return clusters
Example #7
def vectorspaced(title):
    title_components = [normalize_word(word) for word in title.split()]
    return numpy.array(
        [word in title_components and not word in stopwords for word in words],
        numpy.short)


if __name__ == '__main__':

    filename = 'example.txt'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    with open(filename) as title_file:

        job_titles = [line.strip() for line in title_file.readlines()]

        words = get_words(job_titles)

        # cluster = KMeansClusterer(5, euclidean_distance)
        cluster = GAAClusterer(5)
        cluster.cluster([vectorspaced(title) for title in job_titles if title])

        # NOTE: This is inefficient, cluster.classify should really just be
        # called when you are classifying previously unseen examples!
        classified_examples = [
            cluster.classify(vectorspaced(title)) for title in job_titles
        ]

        for cluster_id, title in sorted(zip(classified_examples, job_titles)):
            print(cluster_id, title)
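This example calls helpers that are not shown: get_words, normalize_word, and the stopwords global. A minimal sketch of plausible definitions, assuming NLTK's English stopword list and simple lowercasing (only the names come from the calls above; the bodies are assumptions):

from nltk.corpus import stopwords as nltk_stopwords

stopwords = set(nltk_stopwords.words('english'))

def normalize_word(word):
    # assumed: lowercase and strip surrounding punctuation so titles compare consistently
    return word.lower().strip('.,:;!?"\'')

def get_words(titles):
    # assumed: the vocabulary is every normalized, non-stopword token across all titles
    vocabulary = set()
    for title in titles:
        for word in title.split():
            word = normalize_word(word)
            if word and word not in stopwords:
                vocabulary.add(word)
    return tuple(vocabulary)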
Example #8
def cluster3(index, k):
    from nltk.cluster import GAAClusterer
    clusterer = GAAClusterer(k)
    clusters = clusterer.cluster(index, True)
    return clusters
Example #9
silhouette_score(tfidf,
                 array1,
                 metric='euclidean',
                 sample_size=None,
                 random_state=None)
#0.031277350000072916

## clustering only on the training data
km2 = KMeans(n_clusters=num_clusters, random_state=42)
km2.fit(X_all)
clusters2 = km2.labels_.tolist()

array2 = np.array(clusters2)
silhouette_score(X_all,
                 array2,
                 metric='euclidean',
                 sample_size=None,
                 random_state=None)
#0.037444797109297122

from nltk.cluster import GAAClusterer
clusterer = GAAClusterer(4)
clusters_agg = clusterer.cluster(X_all.toarray(), True)
array3 = np.array(clusters_agg)
# Evaluating the NLTK agglomerative clustering
silhouette_score(X_all,
                 array3,
                 metric='cosine',
                 sample_size=None,
                 random_state=None)
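The tfidf / X_all matrices, array1, and num_clusters used in this example are created earlier in the original notebook and are not shown here. A minimal sketch of how they might be produced, assuming scikit-learn's TfidfVectorizer over a small hypothetical corpus (every name and value below is an assumption):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score

docs = ["the cat sat on the mat", "dogs chase cats", "stocks fell sharply", "markets rallied today"]

vectorizer = TfidfVectorizer(stop_words='english')
X_all = vectorizer.fit_transform(docs)   # sparse TF-IDF matrix
tfidf = X_all                            # same matrix under the other name used above

num_clusters = 2
km = KMeans(n_clusters=num_clusters, random_state=42).fit(X_all)
array1 = np.array(km.labels_.tolist())   # labels for the first silhouette_score call

Note that GAAClusterer works on dense vectors, which is why the example converts the sparse matrix with X_all.toarray() before clustering.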
Example #10
    return numpy.array(
        [word in title_components and not word in stopwords for word in words],
        numpy.short)


if __name__ == '__main__':

    filename = 'CSV/pridected_true_text_alldata.csv'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    with open(filename) as title_file:

        job_titles = [line.strip() for line in title_file.readlines()]

        words = get_words(stemmer, job_titles)

        # cluster = KMeansClusterer(5, euclidean_distance)
        cluster = GAAClusterer(30)
        cluster.cluster(
            [vectorspaced(stemmer, title) for title in job_titles if title])

        # NOTE: This is inefficient, cluster.classify should really just be
        # called when you are classifying previously unseen examples!
        classified_examples = [
            cluster.classify(vectorspaced(stemmer, title))
            for title in job_titles
        ]

        for cluster_id, title in sorted(zip(classified_examples, job_titles)):
            print(cluster_id, title)
Example #11
# NLTK KMeans
model = KMeansClusterer(cluster_number,
                        distance=cosine_distance,
                        repeats=epochs)
clusters = model.cluster(vectors, assign_clusters=True)

dump(model, '../data/advanced_nltk_kmeans.joblib')

# Just cluster
data['cluster'] = pd.DataFrame(clusters)
data[['text', 'cluster']].to_csv('../data/text_clustered_nltk_kmeans.csv',
                                 index=True,
                                 quoting=csv.QUOTE_ALL)

# NLTK GAAClusterer
model = GAAClusterer(num_clusters=cluster_number)
model.cluster(vectors, assign_clusters=True)

clusters = [model.classify_vectorspace(vector.tolist()) for vector in vectors]

data['cluster'] = pd.DataFrame(clusters)
data[['text', 'cluster']].to_csv('../data/text_clustered_nltk_gaac.csv',
                                 index=True,
                                 quoting=csv.QUOTE_ALL)

# scikit-learn KMeans
model = KMeans(n_clusters=cluster_number, max_iter=epochs, n_jobs=8)
model.fit(vectors)
dump(model, '../data/advanced_sklearn_kmeans.joblib')

data['cluster'] = pd.DataFrame(model.labels_)
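The objects this example relies on (data, vectors, cluster_number, epochs) and its imports are defined earlier in the original script. A minimal sketch of plausible definitions, assuming a pandas DataFrame with a 'text' column vectorized by TF-IDF (all concrete values are assumptions for illustration):

import csv
import pandas as pd
from joblib import dump
from nltk.cluster import KMeansClusterer, GAAClusterer, cosine_distance
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# hypothetical input frame with a 'text' column
data = pd.DataFrame({'text': ["first document", "second document", "a third one", "something different"]})

# dense TF-IDF vectors, since the NLTK clusterers work on dense arrays
vectors = TfidfVectorizer().fit_transform(data['text']).toarray()

cluster_number = 2
epochs = 10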