def handle(self): """ Process clustering of corpus documents. """ index_db = IndexDB() self.connection = index_db.handler() documents = self.indexed_documents() total_docs = len(documents) # We generate one cluster for each 500 docs. num_clusters = round(total_docs / 500) # Load vectorize from dump or process documents vectorization try: vectorizer = joblib.load('vectorizer.pkl') except FileNotFoundError: matrix, vectorizer = self.documents_vectors() terms = vectorizer.get_feature_names() print("\nUsing %d features for clustering.\n" % (len(terms))) # Load cluster model from dump or process clustering. try: km = joblib.load('doc_cluster.pkl') except FileNotFoundError: km = KMeans(n_clusters=num_clusters, n_init=5, max_iter=100, precompute_distances=True, verbose=1) km.fit(matrix) # Save clusters and vectorizer. joblib.dump(km, 'doc_cluster.pkl') joblib.dump(vectorizer, 'vectorizer.pkl') clusters = km.labels_.tolist() centroids = km.cluster_centers_.argsort()[:, ::-1] frame = pandas.DataFrame(documents, index=[clusters], columns=['doc_id']) # Print report of clusters. for i in range(num_clusters): print(colored("\n\n====================================", 'yellow')) print(colored("Cluster %d:" % (i), 'yellow'), end='') for word_idx in centroids[i, 0:9]: word = terms[word_idx] print(colored(' %s' % (word), 'yellow'), end=',') print( colored("\n====================================\n\n", 'yellow')) print("Documents:") for doc_id in frame.ix[i]['doc_id'].values.tolist(): print(' - %s' % (self.document_field_value(doc_id, 'body'))) print("------------------------------------")
def handle(self): """ Process corpus documents indexation. """ download('stopwords') indexdb = IndexDB() self.connection = indexdb.handler() data_dir = '/Users/pablocc/harvard_data/' counter = 0 for filename in os.listdir(data_dir): if os.path.isdir(data_dir + filename) or filename[0] == '.': continue with open(data_dir + filename, 'rb') as fh: reader = MARCReader(fh) for record in reader: document = self.prepare_record(record) counter += 1 print("%s - processing document %s." % (counter, document['id'])) self.index_document(document)
from indexdb import IndexDB
from math import log10
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sys import exit
from termcolor import colored
import numpy
import os
import pandas
import sqlite3

# Print full arrays instead of truncating them.
numpy.set_printoptions(threshold=numpy.nan)

index_db = IndexDB()
connection = index_db.handler()


def indexed_document_words(doc_id):
    """ Get indexed document words.

    :param str doc_id: The document ID.
    :returns: A list of document words.
    """
    print("Tokens for document '%s'" % doc_id)
    # Get document words.
    db = connection.cursor()
    db.execute('''SELECT word FROM documents_words
                  WHERE id = ?''', (doc_id,))
    result = db.fetchall()
    # Flatten the single-column rows into a plain list of words.
    return [row[0] for row in result]
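# A sketch of the document_field_value() lookup the cluster report calls;
# the documents_fields table and its columns are assumptions mirroring the
# documents_words query above.
def document_field_value(doc_id, field):
    """ Get the stored value of a document field.

    :param str doc_id: The document ID.
    :param str field: The field name, e.g. 'body'.
    :returns: The field value, or None when not found.
    """
    db = connection.cursor()
    db.execute('''SELECT value FROM documents_fields
                  WHERE id = ? AND field = ?''', (doc_id, field))
    row = db.fetchone()
    return row[0] if row else None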