Ejemplo n.º 1
0
class SoPaperIndexer(object):
    """Maintain the paper search index through a ``XapianIndexer``.

    Don't instantiate me: the class declares ``Singleton`` as its metaclass
    (defined elsewhere; presumably it hands back one shared instance).
    """
    __metaclass__ = Singleton

    def __init__(self):
        """Create the ``XapianIndexer`` for ``DB_DIR``."""
        self.indexer = XapianIndexer(DB_DIR)

    def _do_add_paper(self, doc):
        """Add ``doc`` to the index; failures are logged, never raised.

        :param doc: mapping handed to ``self.indexer.add_doc``; its ``'id'``
            entry (if any) is reported in the error log.
        """
        try:
            self.indexer.add_doc(doc)
        except Exception:
            # Deliberately best-effort, but ``Exception`` (not a bare
            # ``except``) so KeyboardInterrupt/SystemExit still propagate.
            log_exc("Exception in add_paper")
            # ``.get`` so a doc without 'id' cannot raise from the handler.
            log_info("Error with this doc: {0}".format(doc.get('id')))

    def add_paper(self, doc):
        """Index a single paper, flush, and reopen the searcher.

        :param doc: mapping with non-empty ``'text'``, ``'title'`` and
            ``'id'`` entries.
        :raises AssertionError: if any of those entries is missing or empty.
        """
        # Raised explicitly rather than via ``assert`` so the validation
        # still runs under ``python -O``; the exception type is unchanged.
        for field in ('text', 'title', 'id'):
            if not doc.get(field):
                raise AssertionError(
                    "doc is missing required field {0!r}".format(field))
        self._do_add_paper(doc)
        self.indexer.flush()
        SoPaperSearcher().searcher.reopen()

    def rebuild(self):
        """Clear the index and re-add every document of the 'paper' collection.

        Documents without stored text get it extracted from their ``'pdf'``
        field with ``pdf2text`` and written back to the collection. Documents
        with no pdf, or whose pdf cannot be converted, are logged and skipped.
        """
        self.indexer.clear()

        db = get_mongo('paper')
        itr = db.find({}, {'pdf': 1, 'title': 1, 'text': 1})
        for res in itr:
            text = res.get('text')
            if not text:
                log_info("About to add text for paper {0}".format(res['_id']))
                # Only the lookup is guarded by KeyError, so a KeyError raised
                # inside pdf2text is not misreported as a missing pdf.
                try:
                    data = res['pdf']
                except KeyError:
                    log_err("No pdf in pid={0},title={1}".format(
                        res['_id'], res.get('title')))
                    continue
                try:
                    text = pdf2text(data)
                except Exception:
                    log_exc("Exception in pdf2text")
                    # Skip: otherwise the empty text would be written back
                    # to the database and indexed.
                    continue

                db.update({'_id': res['_id']}, {'$set': {'text': text}})
            doc = {'text': text,
                   'title': res['title'],
                   'id': res['_id']
                  }
            self._do_add_paper(doc)
        self.indexer.flush()
        # NOTE(review): unlike add_paper, the searcher is not reopened here;
        # confirm whether callers do that after a rebuild.
Ejemplo n.º 2
0
class SoPaperIndexer(object):
    """Maintain the paper search index through a ``XapianIndexer``.

    Don't instantiate me: the class declares ``Singleton`` as its metaclass
    (defined elsewhere; presumably it hands back one shared instance).
    """
    __metaclass__ = Singleton

    def __init__(self):
        """Create the ``XapianIndexer`` for ``DB_DIR``."""
        self.indexer = XapianIndexer(DB_DIR)

    def _do_add_paper(self, doc):
        """Add ``doc`` to the index; failures are logged, never raised.

        :param doc: mapping handed to ``self.indexer.add_doc``; its ``'id'``
            entry (if any) is reported in the error log.
        """
        try:
            self.indexer.add_doc(doc)
        except Exception:
            # Deliberately best-effort, but ``Exception`` (not a bare
            # ``except``) so KeyboardInterrupt/SystemExit still propagate.
            log_exc("Exception in add_paper")
            # ``.get`` so a doc without 'id' cannot raise from the handler.
            log_info("Error with this doc: {0}".format(doc.get('id')))

    def add_paper(self, doc):
        """Index a single paper, flush, and reopen the searcher.

        :param doc: mapping with non-empty ``'text'``, ``'title'`` and
            ``'id'`` entries.
        :raises AssertionError: if any of those entries is missing or empty.
        """
        # Raised explicitly rather than via ``assert`` so the validation
        # still runs under ``python -O``; the exception type is unchanged.
        for field in ('text', 'title', 'id'):
            if not doc.get(field):
                raise AssertionError(
                    "doc is missing required field {0!r}".format(field))
        self._do_add_paper(doc)
        self.indexer.flush()
        SoPaperSearcher().searcher.reopen()

    def rebuild(self):
        """Clear the index and re-add every document of the 'paper' collection.

        Documents without stored text get it extracted from their ``'pdf'``
        field with ``pdf2text`` and written back to the collection. Documents
        with no pdf, or whose pdf cannot be converted, are logged and skipped.
        """
        self.indexer.clear()

        db = get_mongo('paper')
        itr = db.find({}, {'pdf': 1, 'title': 1, 'text': 1})
        for res in itr:
            text = res.get('text')
            if not text:
                log_info("About to add text for paper {0}".format(res['_id']))
                # Only the lookup is guarded by KeyError, so a KeyError raised
                # inside pdf2text is not misreported as a missing pdf.
                try:
                    data = res['pdf']
                except KeyError:
                    log_err("No pdf in pid={0},title={1}".format(
                        res['_id'], res.get('title')))
                    continue
                try:
                    text = pdf2text(data)
                except Exception:
                    log_exc("Exception in pdf2text")
                    # Skip: otherwise the empty text would be written back
                    # to the database and indexed.
                    continue

                db.update({'_id': res['_id']}, {'$set': {'text': text}})
            doc = {'text': text, 'title': res['title'], 'id': res['_id']}
            self._do_add_paper(doc)
        self.indexer.flush()
        # NOTE(review): unlike add_paper, the searcher is not reopened here;
        # confirm whether callers do that after a rebuild.
Ejemplo n.º 3
0
 def __init__(self):
     """Build a ``XapianIndexer`` for ``DB_DIR`` and keep it as ``self.indexer``."""
     backend = XapianIndexer(DB_DIR)
     self.indexer = backend
Ejemplo n.º 4
0
 def __init__(self):
     """Build a ``XapianIndexer`` for ``DB_DIR`` and keep it as ``self.indexer``."""
     backend = XapianIndexer(DB_DIR)
     self.indexer = backend