def handle(self):
    """
    Process clustering of corpus documents.

    Loads the cached vectorizer (``vectorizer.pkl``) and cluster model
    (``doc_cluster.pkl``) when both dumps exist. If either is missing, the
    documents are vectorized again, a new K-Means model is fitted and both
    objects are dumped. Finally a report is printed with, for each cluster,
    its first nine centroid terms and the body of every member document.
    """
    index_db = IndexDB()
    self.connection = index_db.handler()
    documents = self.indexed_documents()
    total_docs = len(documents)

    # We generate one cluster for each 500 docs. round() gives 0 for small
    # corpora, so clamp to at least one cluster.
    num_clusters = max(1, round(total_docs / 500))

    # Load vectorizer and cluster model from their dumps. They are only
    # usable as a pair: fitting a new model needs the document matrix, which
    # is only produced together with a fresh vectorizer.
    km = None
    try:
        vectorizer = joblib.load('vectorizer.pkl')
        km = joblib.load('doc_cluster.pkl')
    except FileNotFoundError:
        matrix, vectorizer = self.documents_vectors()
        km = None

    terms = vectorizer.get_feature_names()
    print("\nUsing %d features for clustering.\n" % (len(terms)))

    if km is None:
        km = KMeans(n_clusters=num_clusters, n_init=5, max_iter=100,
                    precompute_distances=True, verbose=1)
        km.fit(matrix)

        # Save clusters and vectorizer.
        joblib.dump(km, 'doc_cluster.pkl')
        joblib.dump(vectorizer, 'vectorizer.pkl')

    clusters = km.labels_.tolist()
    # Per cluster, feature indices ordered from highest to lowest weight.
    centroids = km.cluster_centers_.argsort()[:, ::-1]
    frame = pandas.DataFrame(documents, index=[clusters], columns=['doc_id'])

    # Print report of clusters. Iterate over the clusters the model really
    # has: a model loaded from disk may not match num_clusters.
    for i in range(len(centroids)):
        print(colored("\n\n====================================", 'yellow'))
        print(colored("Cluster %d:" % (i), 'yellow'), end='')

        for word_idx in centroids[i, 0:9]:
            word = terms[word_idx]
            print(colored(' %s' % (word), 'yellow'), end=',')

        print(
            colored("\n====================================\n\n", 'yellow'))
        print("Documents:")

        # NOTE(review): DataFrame.ix only exists in older pandas releases;
        # confirm the pinned pandas version before upgrading.
        for doc_id in frame.ix[i]['doc_id'].values.tolist():
            print(' - %s' % (self.document_field_value(doc_id, 'body')))
            print("------------------------------------")
def __init__(self, basedir, doctype, encoding='utf-8', indexstyle=None,
             threshold=100, verbose=False):
    """Store the settings and build the storage objects under basedir.

    The corpus is rooted at ``<basedir>/src``, the article database at
    ``<basedir>/articles`` and the index database at ``<basedir>/idx``.
    ``_loctoindex`` and ``_mode`` start out as None.
    """
    src_path = os.path.join(basedir, 'src')
    articles_path = os.path.join(basedir, 'articles')
    idx_path = os.path.join(basedir, 'idx')

    self.basedir, self.threshold, self.verbose = basedir, threshold, verbose

    corpus_factory = self.GzipTarDBCorpusWithLabel
    self._corpus = corpus_factory(src_path, doctype, encoding,
                                  indexstyle=indexstyle)
    self._artdb = FixedDB(articles_path)
    self._indexdb = IndexDB(idx_path, 'idx')

    self._loctoindex = self._mode = None
def __init__(self, basedir, doctype, encoding='utf-8', indexstyle=None,
             threshold=100, verbose=False):
    """Remember the configuration and create the three backing stores.

    Sub-paths of ``basedir`` are used for each store: 'src' for the
    corpus, 'articles' for the article database and 'idx' for the index
    database. ``_loctoindex`` and ``_mode`` are initialised to None.
    """

    def under_base(name):
        # Path of a store located directly below basedir.
        return os.path.join(basedir, name)

    self.basedir = basedir
    self.threshold = threshold
    self.verbose = verbose

    self._corpus = self.GzipTarDBCorpusWithLabel(
        under_base('src'),
        doctype,
        encoding,
        indexstyle=indexstyle,
    )
    self._artdb = FixedDB(under_base('articles'))
    self._indexdb = IndexDB(under_base('idx'), 'idx')

    self._loctoindex = None
    self._mode = None
def handle(self, data_dir='/Users/pablocc/harvard_data/'):
    """
    Process corpus documents indexation.

    Every entry directly inside ``data_dir`` that is neither a directory
    nor a dot-file is opened in binary mode and read with ``MARCReader``.
    Each record is passed through ``self.prepare_record`` and then
    ``self.index_document``; a running counter and the document id are
    printed for every record.

    :param str data_dir: Directory holding the MARC files. Defaults to the
        location that used to be hard-coded, so existing callers that pass
        no argument behave as before.
    """
    download('stopwords')
    indexdb = IndexDB()
    self.connection = indexdb.handler()

    counter = 0

    for filename in os.listdir(data_dir):
        # os.path.join works whether or not data_dir ends with a separator.
        path = os.path.join(data_dir, filename)

        if filename.startswith('.') or os.path.isdir(path):
            continue

        with open(path, 'rb') as fh:
            reader = MARCReader(fh)

            for record in reader:
                document = self.prepare_record(record)
                counter += 1
                print("%s - processing document %s."
                      % (counter, document['id']))
                self.index_document(document)
class TarCMS(object):
    """Content Management with tar files

    Sample usage:
      # Create a TarCMS object.
      cms = TarCMS(basedir, doctype)
      # Actually create the structure on disk.
      cms.create()
      # Open it.
      cms.open(mode='w')
      # Add a new document.
      aid = cms.create_article('this is my text.')
      # Modify the document.
      tid = cms.modify_article(aid, 'this is my revised text.')
      # Search all documents.
      for (tid,mtime,title,snippet) in cms.find_snapshots(queries):
        data = cms.get_data(tid)
      # Retrieve all revisions of an article:
      for tid in cms.list_snapshots(aid):
        data = cms.get_data(tid)
      # Close it.
      cms.close()
      # Check the validity of the metadata.
      cms.validate()
      # Recover the metadata.
      cms.recover()
    """

    class GzipTarDBCorpusWithLabel(GzipTarDBCorpus):
        """Corpus that labels each location with its stored entry name."""

        def loc_labels(self, loc):
            """Return ``[name]`` for loc, or ``[]`` when the name is empty.

            The first 8 characters of the entry name are skipped; that is
            where create_article/modify_article put the article id.
            """
            info = GzipTarDBCorpus.get_info(self, loc)
            name = info.name[8:]
            if name:
                return [name]
            return []

    class TarCMSError(Exception):
        """Base error raised by TarCMS."""

    class ArticleNotFound(TarCMSError):
        """Raised when an id has no valid record in the article database."""

    def __init__(self, basedir, doctype, encoding='utf-8', indexstyle=None,
                 threshold=100, verbose=False):
        """Store the settings and build the storage objects under basedir.

        threshold is the number of pending (not yet indexed) snapshots at
        which _add_corpus triggers a flush; a falsy value disables that.
        verbose is passed on to every Indexer created by this object.
        """
        self.basedir = basedir
        self.threshold = threshold
        self.verbose = verbose
        self._corpus = self.GzipTarDBCorpusWithLabel(
            os.path.join(basedir, 'src'), doctype, encoding,
            indexstyle=indexstyle)
        self._artdb = FixedDB(os.path.join(basedir, 'articles'))
        self._indexdb = IndexDB(os.path.join(basedir, 'idx'), 'idx')
        # Snapshot ids added since the last flush; a set while open.
        self._loctoindex = None
        # Current open mode, or None while closed.
        self._mode = None

    def __repr__(self):
        return '<TarCMS: basedir=%r>' % (self.basedir,)

    def __iter__(self):
        return self.list_articles()

    def create(self):
        """Create the corpus, article db and index db. Must be closed."""
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.create()
        self._artdb.create(9)
        self._indexdb.create()

    def open(self, mode='r'):
        """Open all three stores with the given mode. Must be closed."""
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.open(mode=mode)
        self._artdb.open(mode=mode)
        self._indexdb.open()
        self._loctoindex = set()
        self._mode = mode

    def close(self):
        """Flush pending work and close all three stores. Must be open."""
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        self.flush()
        self._corpus.close()
        self._artdb.close()
        self._indexdb.close()
        self._mode = None

    def _add_corpus(self, info, data):
        """Add data to the corpus, queue it for indexing, return its id.

        Flushes once the number of queued ids reaches self.threshold.
        """
        assert self._mode is not None
        tid = self._corpus.add_data(info, data)
        self._loctoindex.add(tid)
        if self.threshold and self.threshold <= len(self._loctoindex):
            self.flush()
        return tid

    def _add_file(self, info, path):
        """Add the binary contents of the file at path to the corpus."""
        assert self._mode is not None
        # open() instead of the Python 2-only file() builtin; the with
        # block also closes the handle when read() fails.
        with open(path, 'rb') as fp:
            data = fp.read()
        return self._add_corpus(info, data)

    def flush(self):
        """Flush corpus and article db, then index every queued id."""
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        self._corpus.flush()
        self._artdb.flush()
        indexer = Indexer(self._indexdb, self._corpus, verbose=self.verbose)
        for tid in self._loctoindex:
            indexer.index_loc(tid)
        indexer.finish()
        self._loctoindex.clear()

    def create_article(self, data, info=None):
        """Store data as a new article and return its article id.

        The id is the next article-db record number formatted as '%08x';
        it is prepended to info.name before the data is stored.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        if info is None:
            info = TarInfo()
        assert isinstance(info, TarInfo)
        aid = '%08x' % self._artdb.nextrecno()
        info.name = aid + info.name
        tid = self._add_corpus(info, data)
        assert aid == tid
        self._artdb.add_record(tid)
        return aid

    def modify_article(self, aid, data, info=None):
        """Store data as a new snapshot of article aid; return its id.

        The new article-db record holds the previously current snapshot
        id, and the article's own record is updated to the new one.
        When info is None the info of the current snapshot is reused.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        tid0 = self._artdb.get_record(int(aid, 16))
        if info is None:
            info = self.get_info(tid0)
        assert isinstance(info, TarInfo)
        info.name = aid + info.name
        loc = self._add_corpus(info, data)
        tid = '%08x' % self._artdb.add_record(tid0)
        assert loc == tid
        self._artdb.set_record(int(aid, 16), tid)
        return tid

    def list_snapshots(self, aid=None):
        """Get all revisions of an article.

        With aid None, yields every record of the article db. Otherwise
        follows the record chain starting at aid's record and yields each
        id until the chain reaches aid itself, which is yielded last.
        Raises ArticleNotFound when aid has no valid record.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        if aid is None:
            for tid in self._artdb:
                yield tid
        else:
            try:
                tid = self._artdb.get_record(int(aid, 16))
            except FixedDB.InvalidRecord:
                raise TarCMS.ArticleNotFound(aid)
            while aid != tid:
                yield tid
                tid = self._artdb.get_record(int(tid, 16))
            yield tid

    def list_articles(self):
        """Yield each record number (as '%08x') whose record equals it."""
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        for (aid, tid) in enumerate(self._artdb):
            aid = '%08x' % aid
            if aid == tid:
                yield aid

    def find_snapshots(self, preds, disjunctive=False):
        """Find snapshots that match to the predicates."""
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        sel = Selection(self._indexdb, preds, disjunctive=disjunctive)
        for x in sel:
            yield sel.get_snippet(x)

    def find_articles(self, preds, disjunctive=False):
        """Yield (aid, mtime, title, snippet) once per distinct aid.

        aid is the article-db record stored at each matching snapshot id.
        Raises ArticleNotFound when that record is invalid.
        """
        sel = self.find_snapshots(preds, disjunctive=disjunctive)
        aids = set()
        for (tid, mtime, title, snippet) in sel:
            try:
                aid = self._artdb.get_record(int(tid, 16))
            except FixedDB.InvalidRecord:
                raise TarCMS.ArticleNotFound(tid)
            if aid not in aids:
                aids.add(aid)
                yield (aid, mtime, title, snippet)

    def get_info(self, tid):
        """Get the information about the snapshot specified by tid.

        The leading 8-character article id is stripped from info.name.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        info = self._corpus.get_info(tid)
        info.name = info.name[8:]
        return info

    def get_data(self, tid):
        """Get a particular revision of article specified by tid."""
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        return self._corpus.get_data(tid)

    def get_latest(self, aid):
        """Return the first id yielded by self.list_snapshots(aid).

        Raises KeyError(aid) when nothing is yielded.
        """
        for tid in self.list_snapshots(aid):
            return tid
        raise KeyError(aid)

    def _get_tids(self):
        """Rebuild the expected article-db records from the corpus.

        Must be called while closed; opens the corpus read-only and
        closes it again even when reading fails.
        """
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.open(mode='r')
        try:
            tids = []
            for tid in self._corpus.get_all_locs():
                info = self._corpus.get_info(tid)
                aid = info.name[:8]
                if tid == aid:
                    # First snapshot of an article: it refers to itself.
                    tids.append(tid)
                else:
                    # Later snapshot: its record gets what the article's
                    # record held so far, and the article's record now
                    # refers to this snapshot (mirrors modify_article).
                    i = int(aid, 16)
                    tids.append(tids[i])
                    tids[i] = tid
        finally:
            self._corpus.close()
        return tids

    def validate_artdb(self):
        """Raise TarCMSError if the article db differs from the corpus."""
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._artdb.open(mode='r')
        try:
            for (entry, tid) in ezip(self._artdb, self._get_tids()):
                if entry != tid:
                    raise TarCMS.TarCMSError(
                        'artdb mismatch: %r != %r' % (entry, tid))
        finally:
            # Close the article db on the mismatch path as well.
            self._artdb.close()

    def recover_artdb(self):
        """Write the records computed by _get_tids to the article db."""
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._artdb.open(mode='w')
        try:
            for tid in self._get_tids():
                self._artdb.add_record(tid)
        finally:
            self._artdb.close()

    def validate(self):
        """Validate the corpus catalog and the article db. Must be closed."""
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.validate_catalog()
        self.validate_artdb()

    def recover(self):
        """Recover catalog and article db, then rebuild the whole index."""
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.recover_catalog()
        self.recover_artdb()
        self._indexdb.reset()
        # self.verbose: a bare `verbose` is not defined in this scope.
        indexer = Indexer(self._indexdb, self._corpus, verbose=self.verbose)
        for tid in self._corpus.get_all_locs():
            indexer.index_loc(tid)
        indexer.finish()
from cleo import Command
from indexdb import IndexDB
from math import log10
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sys import exit
from termcolor import colored
import numpy
import os
import pandas
import sqlite3
import sys

# Never summarize printed arrays. sys.maxsize is the value numpy documents
# for this; NaN is rejected by current numpy releases.
numpy.set_printoptions(threshold=sys.maxsize)

index_db = IndexDB()
connection = index_db.handler()


def indexed_document_words(doc_id):
    """
    Get indexed document words.

    :param str doc_id: The document ID.
    :returns: A list of document words.
    """
    print("Tokens for document '%s'" % (doc_id))

    # Get document words
    db = connection.cursor()
    db.execute('''SELECT word FROM documents_words WHERE id = ?''', (doc_id, ))

    # Each row holds the single selected column.
    # NOTE(review): assumes a DB-API cursor such as sqlite3's; confirm
    # against IndexDB.handler().
    return [row[0] for row in db.fetchall()]
class TarCMS(object):
    """Content Management with tar files

    Sample usage:
      # Create a TarCMS object.
      cms = TarCMS(basedir, doctype)
      # Actually create the structure on disk.
      cms.create()
      # Open it.
      cms.open(mode='w')
      # Add a new document.
      aid = cms.create_article('this is my text.')
      # Modify the document.
      tid = cms.modify_article(aid, 'this is my revised text.')
      # Search all documents.
      for (tid,mtime,title,snippet) in cms.find_snapshots(queries):
        data = cms.get_data(tid)
      # Retrieve all revisions of an article:
      for tid in cms.list_snapshots(aid):
        data = cms.get_data(tid)
      # Close it.
      cms.close()
      # Check the validity of the metadata.
      cms.validate()
      # Recover the metadata.
      cms.recover()
    """

    class GzipTarDBCorpusWithLabel(GzipTarDBCorpus):
        """Corpus that labels each location with its stored entry name."""

        def loc_labels(self, loc):
            """Return ``[name]`` for loc, or ``[]`` when the name is empty.

            The first 8 characters of the entry name are skipped; that is
            where create_article/modify_article put the article id.
            """
            info = GzipTarDBCorpus.get_info(self, loc)
            name = info.name[8:]
            if name:
                return [name]
            return []

    class TarCMSError(Exception):
        """Base error raised by TarCMS."""

    class ArticleNotFound(TarCMSError):
        """Raised when an id has no valid record in the article database."""

    def __init__(self, basedir, doctype, encoding='utf-8', indexstyle=None,
                 threshold=100, verbose=False):
        """Store the settings and build the storage objects under basedir.

        threshold is the number of pending (not yet indexed) snapshots at
        which _add_corpus triggers a flush; a falsy value disables that.
        verbose is passed on to every Indexer created by this object.
        """
        self.basedir = basedir
        self.threshold = threshold
        self.verbose = verbose
        self._corpus = self.GzipTarDBCorpusWithLabel(
            os.path.join(basedir, 'src'), doctype, encoding,
            indexstyle=indexstyle)
        self._artdb = FixedDB(os.path.join(basedir, 'articles'))
        self._indexdb = IndexDB(os.path.join(basedir, 'idx'), 'idx')
        # Snapshot ids added since the last flush; a set while open.
        self._loctoindex = None
        # Current open mode, or None while closed.
        self._mode = None

    def __repr__(self):
        return '<TarCMS: basedir=%r>' % (self.basedir,)

    def __iter__(self):
        return self.list_articles()

    def create(self):
        """Create the corpus, article db and index db. Must be closed."""
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.create()
        self._artdb.create(9)
        self._indexdb.create()

    def open(self, mode='r'):
        """Open all three stores with the given mode. Must be closed."""
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.open(mode=mode)
        self._artdb.open(mode=mode)
        self._indexdb.open()
        self._loctoindex = set()
        self._mode = mode

    def close(self):
        """Flush pending work and close all three stores. Must be open."""
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        self.flush()
        self._corpus.close()
        self._artdb.close()
        self._indexdb.close()
        self._mode = None

    def _add_corpus(self, info, data):
        """Add data to the corpus, queue it for indexing, return its id.

        Flushes once the number of queued ids reaches self.threshold.
        """
        assert self._mode is not None
        tid = self._corpus.add_data(info, data)
        self._loctoindex.add(tid)
        if self.threshold and self.threshold <= len(self._loctoindex):
            self.flush()
        return tid

    def _add_file(self, info, path):
        """Add the binary contents of the file at path to the corpus."""
        assert self._mode is not None
        # open() instead of the Python 2-only file() builtin; the with
        # block also closes the handle when read() fails.
        with open(path, 'rb') as fp:
            data = fp.read()
        return self._add_corpus(info, data)

    def flush(self):
        """Flush corpus and article db, then index every queued id."""
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        self._corpus.flush()
        self._artdb.flush()
        indexer = Indexer(self._indexdb, self._corpus, verbose=self.verbose)
        for tid in self._loctoindex:
            indexer.index_loc(tid)
        indexer.finish()
        self._loctoindex.clear()

    def create_article(self, data, info=None):
        """Store data as a new article and return its article id.

        The id is the next article-db record number formatted as '%08x';
        it is prepended to info.name before the data is stored.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        if info is None:
            info = TarInfo()
        assert isinstance(info, TarInfo)
        aid = '%08x' % self._artdb.nextrecno()
        info.name = aid + info.name
        tid = self._add_corpus(info, data)
        assert aid == tid
        self._artdb.add_record(tid)
        return aid

    def modify_article(self, aid, data, info=None):
        """Store data as a new snapshot of article aid; return its id.

        The new article-db record holds the previously current snapshot
        id, and the article's own record is updated to the new one.
        When info is None the info of the current snapshot is reused.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        tid0 = self._artdb.get_record(int(aid, 16))
        if info is None:
            info = self.get_info(tid0)
        assert isinstance(info, TarInfo)
        info.name = aid + info.name
        loc = self._add_corpus(info, data)
        tid = '%08x' % self._artdb.add_record(tid0)
        assert loc == tid
        self._artdb.set_record(int(aid, 16), tid)
        return tid

    def list_snapshots(self, aid=None):
        """Get all revisions of an article.

        With aid None, yields every record of the article db. Otherwise
        follows the record chain starting at aid's record and yields each
        id until the chain reaches aid itself, which is yielded last.
        Raises ArticleNotFound when aid has no valid record.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        if aid is None:
            for tid in self._artdb:
                yield tid
        else:
            try:
                tid = self._artdb.get_record(int(aid, 16))
            except FixedDB.InvalidRecord:
                raise TarCMS.ArticleNotFound(aid)
            while aid != tid:
                yield tid
                tid = self._artdb.get_record(int(tid, 16))
            yield tid

    def list_articles(self):
        """Yield each record number (as '%08x') whose record equals it."""
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        for (aid, tid) in enumerate(self._artdb):
            aid = '%08x' % aid
            if aid == tid:
                yield aid

    def find_snapshots(self, preds, disjunctive=False):
        """Find snapshots that match to the predicates."""
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        sel = Selection(self._indexdb, preds, disjunctive=disjunctive)
        for x in sel:
            yield sel.get_snippet(x)

    def find_articles(self, preds, disjunctive=False):
        """Yield (aid, mtime, title, snippet) once per distinct aid.

        aid is the article-db record stored at each matching snapshot id.
        Raises ArticleNotFound when that record is invalid.
        """
        sel = self.find_snapshots(preds, disjunctive=disjunctive)
        aids = set()
        for (tid, mtime, title, snippet) in sel:
            try:
                aid = self._artdb.get_record(int(tid, 16))
            except FixedDB.InvalidRecord:
                raise TarCMS.ArticleNotFound(tid)
            if aid not in aids:
                aids.add(aid)
                yield (aid, mtime, title, snippet)

    def get_info(self, tid):
        """Get the information about the snapshot specified by tid.

        The leading 8-character article id is stripped from info.name.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        info = self._corpus.get_info(tid)
        info.name = info.name[8:]
        return info

    def get_data(self, tid):
        """Get a particular revision of article specified by tid."""
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        return self._corpus.get_data(tid)

    def get_latest(self, aid):
        """Return the first id yielded by self.list_snapshots(aid).

        Raises KeyError(aid) when nothing is yielded.
        """
        for tid in self.list_snapshots(aid):
            return tid
        raise KeyError(aid)

    def _get_tids(self):
        """Rebuild the expected article-db records from the corpus.

        Must be called while closed; opens the corpus read-only and
        closes it again even when reading fails.
        """
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.open(mode='r')
        try:
            tids = []
            for tid in self._corpus.get_all_locs():
                info = self._corpus.get_info(tid)
                aid = info.name[:8]
                if tid == aid:
                    # First snapshot of an article: it refers to itself.
                    tids.append(tid)
                else:
                    # Later snapshot: its record gets what the article's
                    # record held so far, and the article's record now
                    # refers to this snapshot (mirrors modify_article).
                    i = int(aid, 16)
                    tids.append(tids[i])
                    tids[i] = tid
        finally:
            self._corpus.close()
        return tids

    def validate_artdb(self):
        """Raise TarCMSError if the article db differs from the corpus."""
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._artdb.open(mode='r')
        try:
            for (entry, tid) in ezip(self._artdb, self._get_tids()):
                if entry != tid:
                    raise TarCMS.TarCMSError(
                        'artdb mismatch: %r != %r' % (entry, tid))
        finally:
            # Close the article db on the mismatch path as well.
            self._artdb.close()

    def recover_artdb(self):
        """Write the records computed by _get_tids to the article db."""
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._artdb.open(mode='w')
        try:
            for tid in self._get_tids():
                self._artdb.add_record(tid)
        finally:
            self._artdb.close()

    def validate(self):
        """Validate the corpus catalog and the article db. Must be closed."""
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.validate_catalog()
        self.validate_artdb()

    def recover(self):
        """Recover catalog and article db, then rebuild the whole index."""
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.recover_catalog()
        self.recover_artdb()
        self._indexdb.reset()
        # self.verbose: a bare `verbose` is not defined in this scope.
        indexer = Indexer(self._indexdb, self._corpus, verbose=self.verbose)
        for tid in self._corpus.get_all_locs():
            indexer.index_loc(tid)
        indexer.finish()