Exemple #1
0
if __name__ == "__main__":
    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)
    print("{} books:".format(len(documents)))
    print([book["book_id3"] for book in books])
    print()

    # Create term-document representation
    X = preprocessing_util.convert_to_term_document(documents, min_df=0.1, max_df=0.9)

    for feature_number in range(10,24,4):
        print("Features: {}".format(feature_number))

        # SVD
        Y = preprocessing_util.apply_svd(X, feature_number)

        # Cosine similarity matrix
        dist = 1 - cosine_similarity(Y)

        ###############################################################################
        # Do the actual clustering
        k = 4
        ac = AgglomerativeClustering(linkage="average", n_clusters=k, affinity="cosine")

        print("Clustering sparse data with {}".format(ac))
        t0 = time()
        ac.fit(dist)
        print("done in {}".format(time() - t0))
        print()
Exemple #2
0
from util import plot_util, preprocessing_util, benchmark, collection_reader

if __name__ == "__main__":
    # Read data
    books = collection_reader.read_books_from_mongo()
    documents = collection_reader.extract_corpus(books)
    print("{} books:".format(len(documents)))
    print([book["book_id3"] for book in books])
    print()

    # Create term-document representation
    X = preprocessing_util.convert_to_term_document(documents,
                                                    min_df=0.1,
                                                    max_df=0.9)

    # SVD
    X = preprocessing_util.apply_svd(X, min(X.shape))

    ###############################################################################
    # Do the actual clustering
    print("Clustering data")
    k = 4

    method = DBSCAN(eps=0.8, min_samples=1).fit(X)

    # Metrics
    benchmark.clustering_metrics(X, method.labels_)

    # Create a 3d scatter plot of the corpus
    plot_util.create_3d_plot_for_sparse_matrix(X, method.labels_)