def handle(self):
        """
        Process clustering of corpus documents.

        The vectorizer and the KMeans model are loaded from their joblib
        dumps ('vectorizer.pkl' and 'doc_cluster.pkl') when those files
        exist. A missing model is fitted on the documents matrix and both
        objects are dumped again. Finally a report is printed with, for each
        cluster, its highest weighted centroid terms and the body of every
        document assigned to it.

        """

        index_db = IndexDB()
        self.connection = index_db.handler()
        documents = self.indexed_documents()
        total_docs = len(documents)
        # We generate one cluster for each 500 docs. round() gives 0 for a
        # small corpus, which is not a usable cluster count, so keep at
        # least one cluster.
        num_clusters = max(1, round(total_docs / 500))

        # Load vectorize from dump or process documents vectorization.
        # The matrix only exists when vectorization ran during this call.
        matrix = None
        try:
            vectorizer = joblib.load('vectorizer.pkl')
        except FileNotFoundError:
            matrix, vectorizer = self.documents_vectors()

        terms = vectorizer.get_feature_names()
        print("\nUsing %d features for clustering.\n" % (len(terms)))

        # Load cluster model from dump or process clustering.
        try:
            km = joblib.load('doc_cluster.pkl')
        except FileNotFoundError:
            if matrix is None:
                # The vectorizer came from its dump but the model dump is
                # missing: vectorize again so there is a matrix to fit on,
                # and keep the terms in sync with the vectorizer that
                # produced that matrix.
                matrix, vectorizer = self.documents_vectors()
                terms = vectorizer.get_feature_names()

            km = KMeans(n_clusters=num_clusters,
                        n_init=5,
                        max_iter=100,
                        precompute_distances=True,
                        verbose=1)
            km.fit(matrix)

            # Save clusters and vectorizer.
            joblib.dump(km, 'doc_cluster.pkl')
            joblib.dump(vectorizer, 'vectorizer.pkl')

        clusters = km.labels_.tolist()
        # Per cluster, feature indexes sorted from highest to lowest weight.
        centroids = km.cluster_centers_.argsort()[:, ::-1]
        # Each row is labelled with the cluster assigned to that document.
        frame = pandas.DataFrame(documents,
                                 index=clusters,
                                 columns=['doc_id'])

        # Print report of clusters. Iterate over the clusters the model
        # really has: a model loaded from a dump may have been fitted with
        # a different cluster count than the one computed above.
        for i in range(len(centroids)):
            print(colored("\n\n====================================",
                          'yellow'))
            print(colored("Cluster %d:" % (i), 'yellow'), end='')
            for word_idx in centroids[i, 0:9]:
                word = terms[word_idx]
                print(colored(' %s' % (word), 'yellow'), end=',')
            print(
                colored("\n====================================\n\n",
                        'yellow'))

            print("Documents:")
            # A boolean mask always yields a Series, even when the cluster
            # holds a single document (and replaces the removed `.ix`).
            doc_ids = frame['doc_id'][frame.index == i].tolist()
            for doc_id in doc_ids:
                print(' - %s' % (self.document_field_value(doc_id, 'body')))
                print("------------------------------------")
Example #2
0
 def __init__(self,
              basedir,
              doctype,
              encoding='utf-8',
              indexstyle=None,
              threshold=100,
              verbose=False):
   """Set up the storage objects kept under basedir.

   Nothing is opened here: the corpus, the article database and the index
   database objects are only constructed, and the instance starts with no
   mode and no pending locations.
   """
   self.basedir = basedir
   self.threshold = threshold
   self.verbose = verbose

   # Each component gets its own entry below the base directory.
   corpus_path = os.path.join(basedir, 'src')
   self._corpus = self.GzipTarDBCorpusWithLabel(
     corpus_path, doctype, encoding, indexstyle=indexstyle)

   articles_path = os.path.join(basedir, 'articles')
   self._artdb = FixedDB(articles_path)

   index_path = os.path.join(basedir, 'idx')
   self._indexdb = IndexDB(index_path, 'idx')

   self._loctoindex = None
   self._mode = None
Example #3
0
 def __init__(self, basedir, doctype, encoding='utf-8', indexstyle=None,
              threshold=100, verbose=False):
     """Build the corpus, article DB and index DB objects under basedir.

     The objects are constructed but not opened; the mode and the set of
     locations waiting to be indexed both start as None.
     """
     def under_base(name):
         # Path of one component inside the base directory.
         return os.path.join(basedir, name)

     self.basedir = basedir
     self.threshold = threshold
     self.verbose = verbose

     corpus_class = self.GzipTarDBCorpusWithLabel
     self._corpus = corpus_class(under_base('src'), doctype, encoding,
                                 indexstyle=indexstyle)
     self._artdb = FixedDB(under_base('articles'))
     self._indexdb = IndexDB(under_base('idx'), 'idx')

     self._loctoindex = None
     self._mode = None
    def handle(self):
        """
        Index every MARC record found in the data directory.

        Sub-directories and entries whose name starts with a dot are
        skipped. Every other entry is read as a MARC file and each of its
        records is prepared and indexed, printing a running count.

        """

        # Presumably fetches the stopwords resource used while indexing
        # (TODO confirm where `download` comes from).
        download('stopwords')
        index_db = IndexDB()
        self.connection = index_db.handler()
        data_dir = '/Users/pablocc/harvard_data/'
        processed = 0

        for entry in os.listdir(data_dir):
            entry_path = data_dir + entry
            if os.path.isdir(entry_path) or entry.startswith('.'):
                continue

            with open(entry_path, 'rb') as marc_file:
                for record in MARCReader(marc_file):
                    document = self.prepare_record(record)
                    processed += 1
                    progress = "%s - processing document %s." % (
                        processed, document['id'])
                    print(progress)
                    self.index_document(document)
Example #5
0
class TarCMS(object):

  """Content Management with tar files

  Sample usage:
    # Create a TarCMS object.
    cms = TarCMS(basedir, doctype)
    # Actually create the structure on disk.
    cms.create()
    # Open it.
    cms.open(mode='w')
    # Add a new document.
    aid = cms.create_article('this is my text.')
    # Modify the document.
    tid = cms.modify_article(aid, 'this is my revised text.')
    # Search all documents.
    for (tid,mtime,title,snippet) in cms.find_snapshots(queries):
      data = cms.get_data(tid)
    # Retrieve all revisions of an article:
    for tid in cms.list_snapshots(aid):
      data = cms.get_data(tid)
    # Close it.
    cms.close()
    # Check the validity of the metadata.
    cms.validate()
    # Recover the metadata.
    cms.recover()
  """

  class GzipTarDBCorpusWithLabel(GzipTarDBCorpus):
    """Corpus whose location label is the entry name without its id prefix."""

    def loc_labels(self, loc):
      """Return [name] for loc, or [] when nothing follows the prefix."""
      info = GzipTarDBCorpus.get_info(self, loc)
      # The first 8 characters are the article id that create_article and
      # modify_article put in front of the entry name.
      name = info.name[8:]
      if name:
        return [name]
      return []

  class TarCMSError(Exception): pass
  class ArticleNotFound(TarCMSError): pass

  def __init__(self, basedir, doctype, encoding='utf-8', indexstyle=None, threshold=100, verbose=False):
    """Construct the corpus, article DB and index DB objects (unopened).

    basedir: directory holding the 'src', 'articles' and 'idx' entries.
    doctype, encoding, indexstyle: handed over to the corpus object.
    threshold: number of not-yet-indexed locations that triggers a flush;
      a false value disables the automatic flush.
    verbose: handed over to the Indexer.
    """
    self.basedir = basedir
    self.threshold = threshold
    self.verbose = verbose
    self._corpus = self.GzipTarDBCorpusWithLabel(
      os.path.join(basedir, 'src'), doctype, encoding, indexstyle=indexstyle)
    self._artdb = FixedDB(os.path.join(basedir, 'articles'))
    self._indexdb = IndexDB(os.path.join(basedir, 'idx'), 'idx')
    # Locations added since the last flush; a set while the CMS is open.
    self._loctoindex = None
    # None while closed, otherwise the mode given to open().
    self._mode = None

  def __repr__(self):
    return '<TarCMS: basedir=%r>' % (self.basedir,)

  def __iter__(self):
    """Iterate over the article ids (see list_articles)."""
    return self.list_articles()

  def _check_open(self):
    """Raise TarCMSError unless the CMS is open."""
    if not self._mode:
      raise TarCMS.TarCMSError('not open: %r' % self)

  def _check_closed(self):
    """Raise TarCMSError if the CMS is open."""
    if self._mode:
      raise TarCMS.TarCMSError('already open: %r' % self)

  def create(self):
    """Create the corpus, the article DB and the index DB. Must be closed."""
    self._check_closed()
    self._corpus.create()
    # NOTE(review): 9 is presumably the fixed record size (8 hex digits
    # plus a separator) -- confirm against FixedDB.
    self._artdb.create(9)
    self._indexdb.create()

  def open(self, mode='r'):
    """Open every component with the given mode. Must be closed."""
    self._check_closed()
    self._corpus.open(mode=mode)
    self._artdb.open(mode=mode)
    self._indexdb.open()
    self._loctoindex = set()
    self._mode = mode

  def close(self):
    """Flush pending work and close every component. Must be open."""
    self._check_open()
    self.flush()
    self._corpus.close()
    self._artdb.close()
    self._indexdb.close()
    self._mode = None

  def _add_corpus(self, info, data):
    """Store data in the corpus and return its location.

    The location is queued for indexing; once `threshold` locations are
    queued the CMS is flushed.
    """
    assert self._mode is not None
    tid = self._corpus.add_data(info, data)
    self._loctoindex.add(tid)
    if self.threshold and self.threshold <= len(self._loctoindex):
      self.flush()
    return tid

  def _add_file(self, info, path):
    """Store the binary content of the file at path (see _add_corpus)."""
    assert self._mode is not None
    # open() instead of the Python-2-only file(); the with block closes
    # the handle even when the read fails.
    with open(path, 'rb') as fp:
      data = fp.read()
    return self._add_corpus(info, data)

  def flush(self):
    """Flush corpus and article DB, then index every queued location."""
    self._check_open()
    self._corpus.flush()
    self._artdb.flush()
    indexer = Indexer(self._indexdb, self._corpus, verbose=self.verbose)
    for tid in self._loctoindex:
      indexer.index_loc(tid)
    indexer.finish()
    self._loctoindex.clear()

  def create_article(self, data, info=None):
    """Add a new article and return its id (8 hex digits).

    info defaults to an empty TarInfo; its name gets the article id as a
    prefix. The first snapshot of an article has the same id as the article.
    """
    self._check_open()
    if info is None:
      info = TarInfo()
    assert isinstance(info, TarInfo)
    aid = '%08x' % self._artdb.nextrecno()
    info.name = aid+info.name
    tid = self._add_corpus(info, data)
    assert aid == tid
    self._artdb.add_record(tid)
    return aid

  def modify_article(self, aid, data, info=None):
    """Add a new snapshot to article aid and return the snapshot id.

    When info is None the info of the current latest snapshot is reused.
    """
    self._check_open()
    # The record stored at the article id is its latest snapshot.
    tid0 = self._artdb.get_record(int(aid, 16))
    if info is None:
      info = self.get_info(tid0)
    assert isinstance(info, TarInfo)
    info.name = aid+info.name
    loc = self._add_corpus(info, data)
    # The new record points back to the previous snapshot ...
    tid = '%08x' % self._artdb.add_record(tid0)
    assert loc == tid
    # ... and the article record now points to the new one.
    self._artdb.set_record(int(aid, 16), tid)
    return tid

  def list_snapshots(self, aid=None):
    """Get all revisions of an article.

    Without aid, yields every record of the article DB. With aid, follows
    the chain of records starting at the article record and ends with aid
    itself. Raises ArticleNotFound when aid is not a valid record.
    """
    self._check_open()
    if aid is None:
      for tid in self._artdb:
        yield tid
    else:
      try:
        tid = self._artdb.get_record(int(aid, 16))
      except FixedDB.InvalidRecord:
        raise TarCMS.ArticleNotFound(aid)
      while aid != tid:
        yield tid
        tid = self._artdb.get_record(int(tid, 16))
      yield tid

  def list_articles(self):
    """Yield the ids of the records that start an article.

    A record starts an article when it stores its own position.
    NOTE(review): per modify_article, the record at an article id is
    overwritten with the latest snapshot id, so a modified article would no
    longer match here -- confirm this is intended.
    """
    self._check_open()
    for (aid,tid) in enumerate(self._artdb):
      aid = '%08x' % aid
      if aid == tid:
        yield aid

  def find_snapshots(self, preds, disjunctive=False):
    """Find snapshots that match to the predicates."""
    self._check_open()
    sel = Selection(self._indexdb, preds, disjunctive=disjunctive)
    for x in sel:
      yield sel.get_snippet(x)

  def find_articles(self, preds, disjunctive=False):
    """Yield (aid, mtime, title, snippet), once per id, for matching snapshots.

    Raises ArticleNotFound when a matching snapshot has no valid record.
    """
    sel = self.find_snapshots(preds, disjunctive=disjunctive)
    aids = set()
    for (tid, mtime, title, snippet) in sel:
      try:
        # NOTE(review): this reads the record stored at tid, which per
        # modify_article is the previous snapshot id rather than the
        # article id -- confirm this is the intended key.
        aid = self._artdb.get_record(int(tid, 16))
      except FixedDB.InvalidRecord:
        raise TarCMS.ArticleNotFound(tid)
      if aid not in aids:
        aids.add(aid)
        yield (aid, mtime, title, snippet)

  def get_info(self, tid):
    """Get the information about the snapshot specified by tid.

    The 8-character article id prefix is removed from the returned name.
    """
    self._check_open()
    info = self._corpus.get_info(tid)
    info.name = info.name[8:]
    return info

  def get_data(self, tid):
    """Get a particular revision of article specified by tid."""
    self._check_open()
    return self._corpus.get_data(tid)

  def get_latest(self, aid):
    """Return the first snapshot id yielded by list_snapshots(aid)."""
    for tid in self.list_snapshots(aid):
      return tid
    raise KeyError(aid)

  def _get_tids(self):
    """Rebuild the article DB records from the corpus entry names.

    Replays every corpus location in order, mirroring what create_article
    and modify_article write, and returns the resulting list of records.
    """
    self._check_closed()
    self._corpus.open(mode='r')
    try:
      tids = []
      for tid in self._corpus.get_all_locs():
        info = self._corpus.get_info(tid)
        aid = info.name[:8]
        if tid == aid:
          # First snapshot of an article: the record stores itself.
          tids.append(tid)
        else:
          # Later snapshot: it stores the previous latest snapshot and
          # the article record moves on to this one.
          i = int(aid, 16)
          tids.append(tids[i])
          tids[i] = tid
    finally:
      # Do not leave the corpus open when reading it fails.
      self._corpus.close()
    return tids

  def validate_artdb(self):
    """Raise TarCMSError when the article DB disagrees with the corpus."""
    self._check_closed()
    self._artdb.open(mode='r')
    try:
      for (entry,tid) in ezip(self._artdb, self._get_tids()):
        if entry != tid:
          raise TarCMS.TarCMSError(
            'article db mismatch: %r != %r' % (entry, tid))
    finally:
      # Close the DB on a mismatch as well, so the CMS can be recovered.
      self._artdb.close()

  def recover_artdb(self):
    """Rewrite the article DB with the records rebuilt from the corpus."""
    self._check_closed()
    self._artdb.open(mode='w')
    try:
      for tid in self._get_tids():
        self._artdb.add_record(tid)
    finally:
      self._artdb.close()

  def validate(self):
    """Validate the corpus catalog and the article DB. Must be closed."""
    self._check_closed()
    self._corpus.validate_catalog()
    self.validate_artdb()

  def recover(self):
    """Recover the catalog and the article DB, then rebuild the index."""
    self._check_closed()
    self._corpus.recover_catalog()
    self.recover_artdb()
    self._indexdb.reset()
    # self.verbose: the bare name `verbose` is not defined in this scope.
    indexer = Indexer(self._indexdb, self._corpus, verbose=self.verbose)
    for tid in self._corpus.get_all_locs():
      indexer.index_loc(tid)
    indexer.finish()
import os
import sqlite3
import sys
from math import log10
from sys import exit

import numpy
import pandas
from cleo import Command
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from termcolor import colored

from indexdb import IndexDB

# Always print arrays in full. numpy rejects a NaN threshold with a
# ValueError and documents sys.maxsize as the value that disables
# summarization.
numpy.set_printoptions(threshold=sys.maxsize)
# Module-wide handle on the index database, used by the functions below.
index_db = IndexDB()
connection = index_db.handler()


def indexed_document_words(doc_id):
    """ Get indexed document words.

    Prints a header line for the document and queries the
    ``documents_words`` table for the words stored under its ID, using the
    module-level ``connection``.

    :param str doc_id: The document ID.
    :returns: A list of document words.

    NOTE(review): the visible body ends right after the query is executed;
    no rows are fetched or returned in the lines shown here -- confirm
    against the complete function.

    """

    print("Tokens for document '%s'" % (doc_id))
    # Get document words
    db = connection.cursor()
    # The ID is bound as a query parameter instead of being formatted into
    # the SQL text.
    db.execute('''SELECT word FROM documents_words WHERE id = ?''', (doc_id, ))
Example #7
0
class TarCMS(object):
    """Content Management with tar files

  Sample usage:
    # Create a TarCMS object.
    cms = TarCMS(basedir, doctype)
    # Actually create the structure on disk.
    cms.create()
    # Open it.
    cms.open(mode='w')
    # Add a new document.
    aid = cms.create_article('this is my text.')
    # Modify the document.
    tid = cms.modify_article(aid, 'this is my revised text.')
    # Search all documents.
    for (tid,mtime,title,snippet) in cms.find_snapshots(queries):
      data = cms.get_data(tid)
    # Retrieve all revisions of an article:
    for tid in cms.list_snapshots(aid):
      data = cms.get_data(tid)
    # Close it.
    cms.close()
    # Check the validity of the metadata.
    cms.validate()
    # Recover the metadata.
    cms.recover()
  """
    class GzipTarDBCorpusWithLabel(GzipTarDBCorpus):
        """Corpus whose location label is the entry name minus its id prefix."""

        def loc_labels(self, loc):
            """Return [name] for loc, or [] when nothing follows the prefix."""
            info = GzipTarDBCorpus.get_info(self, loc)
            # The first 8 characters are the article id that create_article
            # and modify_article put in front of the entry name.
            name = info.name[8:]
            if name:
                return [name]
            return []

    class TarCMSError(Exception):
        pass

    class ArticleNotFound(TarCMSError):
        pass

    def __init__(self,
                 basedir,
                 doctype,
                 encoding='utf-8',
                 indexstyle=None,
                 threshold=100,
                 verbose=False):
        """Construct the corpus, article DB and index DB objects (unopened).

        basedir: directory holding the 'src', 'articles' and 'idx' entries.
        doctype, encoding, indexstyle: handed over to the corpus object.
        threshold: number of not-yet-indexed locations that triggers a
          flush; a false value disables the automatic flush.
        verbose: handed over to the Indexer.
        """
        self.basedir = basedir
        self.threshold = threshold
        self.verbose = verbose
        self._corpus = self.GzipTarDBCorpusWithLabel(
            os.path.join(basedir, 'src'),
            doctype,
            encoding,
            indexstyle=indexstyle)
        self._artdb = FixedDB(os.path.join(basedir, 'articles'))
        self._indexdb = IndexDB(os.path.join(basedir, 'idx'), 'idx')
        # Locations added since the last flush; a set while the CMS is open.
        self._loctoindex = None
        # None while closed, otherwise the mode given to open().
        self._mode = None

    def __repr__(self):
        return '<TarCMS: basedir=%r>' % (self.basedir, )

    def __iter__(self):
        """Iterate over the article ids (see list_articles)."""
        return self.list_articles()

    def _check_open(self):
        """Raise TarCMSError unless the CMS is open."""
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)

    def _check_closed(self):
        """Raise TarCMSError if the CMS is open."""
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)

    def create(self):
        """Create the corpus, the article DB and the index DB."""
        self._check_closed()
        self._corpus.create()
        # NOTE(review): 9 is presumably the fixed record size (8 hex digits
        # plus a separator) -- confirm against FixedDB.
        self._artdb.create(9)
        self._indexdb.create()

    def open(self, mode='r'):
        """Open every component with the given mode. Must be closed."""
        self._check_closed()
        self._corpus.open(mode=mode)
        self._artdb.open(mode=mode)
        self._indexdb.open()
        self._loctoindex = set()
        self._mode = mode

    def close(self):
        """Flush pending work and close every component. Must be open."""
        self._check_open()
        self.flush()
        self._corpus.close()
        self._artdb.close()
        self._indexdb.close()
        self._mode = None

    def _add_corpus(self, info, data):
        """Store data in the corpus and return its location.

        The location is queued for indexing; once `threshold` locations are
        queued the CMS is flushed.
        """
        assert self._mode is not None
        tid = self._corpus.add_data(info, data)
        self._loctoindex.add(tid)
        if self.threshold and self.threshold <= len(self._loctoindex):
            self.flush()
        return tid

    def _add_file(self, info, path):
        """Store the binary content of the file at path (see _add_corpus)."""
        assert self._mode is not None
        # open() instead of the Python-2-only file(); the with block closes
        # the handle even when the read fails.
        with open(path, 'rb') as fp:
            data = fp.read()
        return self._add_corpus(info, data)

    def flush(self):
        """Flush corpus and article DB, then index every queued location."""
        self._check_open()
        self._corpus.flush()
        self._artdb.flush()
        indexer = Indexer(self._indexdb, self._corpus, verbose=self.verbose)
        for tid in self._loctoindex:
            indexer.index_loc(tid)
        indexer.finish()
        self._loctoindex.clear()

    def create_article(self, data, info=None):
        """Add a new article and return its id (8 hex digits).

        info defaults to an empty TarInfo; its name gets the article id as
        a prefix. The first snapshot of an article has the same id as the
        article.
        """
        self._check_open()
        if info is None:
            info = TarInfo()
        assert isinstance(info, TarInfo)
        aid = '%08x' % self._artdb.nextrecno()
        info.name = aid + info.name
        tid = self._add_corpus(info, data)
        assert aid == tid
        self._artdb.add_record(tid)
        return aid

    def modify_article(self, aid, data, info=None):
        """Add a new snapshot to article aid and return the snapshot id.

        When info is None the info of the current latest snapshot is reused.
        """
        self._check_open()
        # The record stored at the article id is its latest snapshot.
        tid0 = self._artdb.get_record(int(aid, 16))
        if info is None:
            info = self.get_info(tid0)
        assert isinstance(info, TarInfo)
        info.name = aid + info.name
        loc = self._add_corpus(info, data)
        # The new record points back to the previous snapshot ...
        tid = '%08x' % self._artdb.add_record(tid0)
        assert loc == tid
        # ... and the article record now points to the new one.
        self._artdb.set_record(int(aid, 16), tid)
        return tid

    def list_snapshots(self, aid=None):
        """Get all revisions of an article.

        Without aid, yields every record of the article DB. With aid,
        follows the chain of records starting at the article record and
        ends with aid itself. Raises ArticleNotFound when aid is not a
        valid record.
        """
        self._check_open()
        if aid is None:
            for tid in self._artdb:
                yield tid
        else:
            try:
                tid = self._artdb.get_record(int(aid, 16))
            except FixedDB.InvalidRecord:
                raise TarCMS.ArticleNotFound(aid)
            while aid != tid:
                yield tid
                tid = self._artdb.get_record(int(tid, 16))
            yield tid

    def list_articles(self):
        """Yield the ids of the records that start an article.

        A record starts an article when it stores its own position.
        NOTE(review): per modify_article, the record at an article id is
        overwritten with the latest snapshot id, so a modified article would
        no longer match here -- confirm this is intended.
        """
        self._check_open()
        for (aid, tid) in enumerate(self._artdb):
            aid = '%08x' % aid
            if aid == tid:
                yield aid

    def find_snapshots(self, preds, disjunctive=False):
        """Find snapshots that match to the predicates."""
        self._check_open()
        sel = Selection(self._indexdb, preds, disjunctive=disjunctive)
        for x in sel:
            yield sel.get_snippet(x)

    def find_articles(self, preds, disjunctive=False):
        """Yield (aid, mtime, title, snippet), once per id, for matches.

        Raises ArticleNotFound when a matching snapshot has no valid record.
        """
        sel = self.find_snapshots(preds, disjunctive=disjunctive)
        aids = set()
        for (tid, mtime, title, snippet) in sel:
            try:
                # NOTE(review): this reads the record stored at tid, which
                # per modify_article is the previous snapshot id rather
                # than the article id -- confirm this is the intended key.
                aid = self._artdb.get_record(int(tid, 16))
            except FixedDB.InvalidRecord:
                raise TarCMS.ArticleNotFound(tid)
            if aid not in aids:
                aids.add(aid)
                yield (aid, mtime, title, snippet)

    def get_info(self, tid):
        """Get the information about the snapshot specified by tid.

        The 8-character article id prefix is removed from the returned name.
        """
        self._check_open()
        info = self._corpus.get_info(tid)
        info.name = info.name[8:]
        return info

    def get_data(self, tid):
        """Get a particular revision of article specified by tid."""
        self._check_open()
        return self._corpus.get_data(tid)

    def get_latest(self, aid):
        """Return the first snapshot id yielded by list_snapshots(aid)."""
        for tid in self.list_snapshots(aid):
            return tid
        raise KeyError(aid)

    def _get_tids(self):
        """Rebuild the article DB records from the corpus entry names.

        Replays every corpus location in order, mirroring what
        create_article and modify_article write, and returns the resulting
        list of records.
        """
        self._check_closed()
        self._corpus.open(mode='r')
        try:
            tids = []
            for tid in self._corpus.get_all_locs():
                info = self._corpus.get_info(tid)
                aid = info.name[:8]
                if tid == aid:
                    # First snapshot of an article: the record stores itself.
                    tids.append(tid)
                else:
                    # Later snapshot: it stores the previous latest snapshot
                    # and the article record moves on to this one.
                    i = int(aid, 16)
                    tids.append(tids[i])
                    tids[i] = tid
        finally:
            # Do not leave the corpus open when reading it fails.
            self._corpus.close()
        return tids

    def validate_artdb(self):
        """Raise TarCMSError when the article DB disagrees with the corpus."""
        self._check_closed()
        self._artdb.open(mode='r')
        try:
            for (entry, tid) in ezip(self._artdb, self._get_tids()):
                if entry != tid:
                    raise TarCMS.TarCMSError(
                        'article db mismatch: %r != %r' % (entry, tid))
        finally:
            # Close the DB on a mismatch as well, so the CMS can be
            # recovered.
            self._artdb.close()

    def recover_artdb(self):
        """Rewrite the article DB with the records rebuilt from the corpus."""
        self._check_closed()
        self._artdb.open(mode='w')
        try:
            for tid in self._get_tids():
                self._artdb.add_record(tid)
        finally:
            self._artdb.close()

    def validate(self):
        """Validate the corpus catalog and the article DB. Must be closed."""
        self._check_closed()
        self._corpus.validate_catalog()
        self.validate_artdb()

    def recover(self):
        """Recover the catalog and the article DB, then rebuild the index."""
        self._check_closed()
        self._corpus.recover_catalog()
        self.recover_artdb()
        self._indexdb.reset()
        # self.verbose: the bare name `verbose` is not defined in this scope.
        indexer = Indexer(self._indexdb, self._corpus, verbose=self.verbose)
        for tid in self._corpus.get_all_locs():
            indexer.index_loc(tid)
        indexer.finish()