class SoPaperIndexer(object):
    """Add papers to the index managed by ``XapianIndexer(DB_DIR)``.

    Author's note: don't instantiate me.  The class declares ``Singleton``
    as its (Python 2 style) metaclass; presumably that makes every call to
    ``SoPaperIndexer()`` return one shared instance -- confirm against the
    definition of ``Singleton``.
    """
    __metaclass__ = Singleton

    def __init__(self):
        """Create the underlying ``XapianIndexer`` from ``DB_DIR``."""
        self.indexer = XapianIndexer(DB_DIR)

    def _do_add_paper(self, doc):
        """Add ``doc`` to the indexer, logging (not raising) on failure.

        This is best-effort: any ``Exception`` raised by ``add_doc`` is
        logged and swallowed so that one bad document does not abort the
        caller.  The indexer is not flushed here.
        """
        try:
            self.indexer.add_doc(doc)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit can still stop a long rebuild.
            log_exc("Exception in add_paper")
            # ``.get`` so a doc without an id cannot raise from inside the
            # error handler itself.
            log_info("Error with this doc: {0}".format(doc.get('id')))

    def add_paper(self, doc):
        """Index a single paper, flush, and reopen the searcher.

        ``doc`` must provide truthy ``'text'``, ``'title'`` and ``'id'``
        entries; otherwise an ``AssertionError`` is raised.
        """
        # NOTE: asserts are stripped under ``python -O``; they are kept so
        # that callers continue to see AssertionError on bad input.
        assert doc.get('text'), "doc has no 'text'"
        assert doc.get('title'), "doc has no 'title'"
        assert doc.get('id'), "doc has no 'id'"
        self._do_add_paper(doc)
        self.indexer.flush()
        # Presumably needed so the searcher sees the document just flushed.
        SoPaperSearcher().searcher.reopen()

    def rebuild(self):
        """Clear the index and re-add every document of the 'paper' collection.

        For a document without stored text, the text is extracted from its
        ``'pdf'`` field with ``pdf2text`` and written back to the collection
        before indexing.  Documents that have no pdf, or whose extraction
        raises, are logged and skipped.  The indexer is flushed once at the
        end.
        """
        self.indexer.clear()

        db = get_mongo('paper')
        cursor = db.find({}, {'pdf': 1, 'title': 1, 'text': 1})
        for res in cursor:
            text = res.get('text')
            if not text:
                log_info("About to add text for paper {0}".format(res['_id']))
                # Look up the pdf separately so that a KeyError raised
                # inside pdf2text is not misreported as a missing pdf.
                try:
                    data = res['pdf']
                except KeyError:
                    log_err("No pdf in pid={0},title={1}".format(
                        res['_id'], res.get('title')))
                    continue
                try:
                    text = pdf2text(data)
                except Exception:
                    log_exc("Exception in pdf2text")
                    # Extraction failed: do not write the stale empty text
                    # back to the collection or index an empty document.
                    continue
                db.update({'_id': res['_id']}, {'$set': {'text': text}})
            doc = {'text': text, 'title': res['title'], 'id': res['_id']}
            self._do_add_paper(doc)
        self.indexer.flush()
class SoPaperIndexer(object):
    """Add papers to the index managed by ``XapianIndexer(DB_DIR)``.

    Author's note: don't instantiate me.  The class declares ``Singleton``
    as its (Python 2 style) metaclass; presumably that makes every call to
    ``SoPaperIndexer()`` return one shared instance -- confirm against the
    definition of ``Singleton``.
    """
    __metaclass__ = Singleton

    def __init__(self):
        """Create the underlying ``XapianIndexer`` from ``DB_DIR``."""
        self.indexer = XapianIndexer(DB_DIR)

    def _do_add_paper(self, doc):
        """Add ``doc`` to the indexer, logging (not raising) on failure.

        This is best-effort: any ``Exception`` raised by ``add_doc`` is
        logged and swallowed so that one bad document does not abort the
        caller.  The indexer is not flushed here.
        """
        try:
            self.indexer.add_doc(doc)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit can still stop a long rebuild.
            log_exc("Exception in add_paper")
            # ``.get`` so a doc without an id cannot raise from inside the
            # error handler itself.
            log_info("Error with this doc: {0}".format(doc.get('id')))

    def add_paper(self, doc):
        """Index a single paper, flush, and reopen the searcher.

        ``doc`` must provide truthy ``'text'``, ``'title'`` and ``'id'``
        entries; otherwise an ``AssertionError`` is raised.
        """
        # NOTE: asserts are stripped under ``python -O``; they are kept so
        # that callers continue to see AssertionError on bad input.
        assert doc.get('text'), "doc has no 'text'"
        assert doc.get('title'), "doc has no 'title'"
        assert doc.get('id'), "doc has no 'id'"
        self._do_add_paper(doc)
        self.indexer.flush()
        # Presumably needed so the searcher sees the document just flushed.
        SoPaperSearcher().searcher.reopen()

    def rebuild(self):
        """Clear the index and re-add every document of the 'paper' collection.

        For a document without stored text, the text is extracted from its
        ``'pdf'`` field with ``pdf2text`` and written back to the collection
        before indexing.  Documents that have no pdf, or whose extraction
        raises, are logged and skipped.  The indexer is flushed once at the
        end.
        """
        self.indexer.clear()

        db = get_mongo('paper')
        cursor = db.find({}, {'pdf': 1, 'title': 1, 'text': 1})
        for res in cursor:
            text = res.get('text')
            if not text:
                log_info("About to add text for paper {0}".format(res['_id']))
                # Look up the pdf separately so that a KeyError raised
                # inside pdf2text is not misreported as a missing pdf.
                try:
                    data = res['pdf']
                except KeyError:
                    log_err("No pdf in pid={0},title={1}".format(
                        res['_id'], res.get('title')))
                    continue
                try:
                    text = pdf2text(data)
                except Exception:
                    log_exc("Exception in pdf2text")
                    # Extraction failed: do not write the stale empty text
                    # back to the collection or index an empty document.
                    continue
                db.update({'_id': res['_id']}, {'$set': {'text': text}})
            doc = {'text': text, 'title': res['title'], 'id': res['_id']}
            self._do_add_paper(doc)
        self.indexer.flush()
def __init__(self):
    """Construct a ``XapianIndexer`` from ``DB_DIR`` and keep it on ``self.indexer``."""
    backend = XapianIndexer(DB_DIR)
    self.indexer = backend