Exemple #1
0
    def _apply_cache(self, indexpath, cachepath, cache_id):
        """Apply the cached items of one cache to the index.

        Opens an ``IndexerConnection`` on ``indexpath``, attaches a
        ``XapianCacheManager`` built from ``cachepath`` and ``cache_id``,
        calls ``apply_cached_items()`` and closes the connection.

        :param indexpath: path of the index to update.
        :param cachepath: path handed to ``XapianCacheManager``.
        :param cache_id: value passed as the cache manager's ``id``.
        """
        idx = IndexerConnection(indexpath)
        try:
            cm = XapianCacheManager(cachepath, id=cache_id)
            idx.set_cache_manager(cm)
            idx.apply_cached_items()
        finally:
            # Close the connection even when applying the cache fails, so
            # a failing step does not leave the index open behind it.
            idx.close()
Exemple #2
0
    def _create_index(self, indexpath):
        """Create the five-document fixture index at ``indexpath``.

        Registers a single free-text action on the field ``"field"``
        (language ``"en"``) and stores documents ``"1"`` to ``"5"``.  Each
        document is described by ``(term, number)`` pairs for ``term_a``
        (1..5, ascending) and ``term_b`` (5..1, descending); how the number
        is used is decided by ``_create_processed_doc``.

        :param indexpath: path of the index to create.
        """
        documents = [
            ("1", [("term_a", 1), ("term_b", 5)]),
            ("2", [("term_a", 2), ("term_b", 4)]),
            ("3", [("term_a", 3), ("term_b", 3)]),
            ("4", [("term_a", 4), ("term_b", 2)]),
            ("5", [("term_a", 5), ("term_b", 1)]),
        ]

        iconn = IndexerConnection(indexpath)
        try:
            iconn.add_field_action("field", FieldActions.INDEX_FREETEXT, language="en")
            for docid, terms in documents:
                pdoc = self._create_processed_doc(iconn, docid, terms)
                iconn.replace(pdoc, xapid=docid)
            iconn.flush()
        finally:
            # Always release the connection, including when building or
            # storing one of the documents raises.
            iconn.close()
Exemple #3
0
    def __init__(self, dirname):
        """Open the index stored under ``dirname`` and register its fields.

        The absolute form of ``dirname`` is kept as ``self.dbPath`` and the
        ``IndexerConnection`` opened on it as ``self.dbconn``.  ``title``
        and ``text`` are indexed as English free text; every key of
        ``FIELD_NUM`` is registered as stored content.  ``self.lock`` is a
        fresh ``threading.Lock``.
        """
        db_path = os.path.abspath(dirname)
        self.dbPath = db_path
        self.dbconn = IndexerConnection(db_path)

        # Free-text fields and the keyword options each one is indexed with.
        freetext_fields = (
            ('title', dict(weight=5, language='en')),
            ('text', dict(language='en', spell=True, stop=STOPWORDS)),
        )
        for field_name, options in freetext_fields:
            self.dbconn.add_field_action(
                field_name, FieldActions.INDEX_FREETEXT, **options)

        # Disabled in the original code:
        #   add_field_action('citecnt', FieldActions.FACET, type='float')
        #   add_field_action('citecnt', FieldActions.WEIGHT)

        self.lock = threading.Lock()

        for stored_field in FIELD_NUM.keys():
            self.dbconn.add_field_action(stored_field,
                                         FieldActions.STORE_CONTENT)
Exemple #4
0
    def __init__(self, dirname):
        """Set up the indexer for the index directory ``dirname``.

        Stores the absolute path in ``self.dbPath``, opens an
        ``IndexerConnection`` on it (``self.dbconn``), creates
        ``self.lock`` and registers the field actions: English free text
        for ``title`` (weight 5) and ``text`` (spelling and stop words
        enabled), stored content for each key of ``FIELD_NUM``.
        """
        self.dbPath = os.path.abspath(dirname)
        self.lock = threading.Lock()

        self.dbconn = IndexerConnection(self.dbPath)
        register = self.dbconn.add_field_action

        register('title',
                 FieldActions.INDEX_FREETEXT,
                 weight=5,
                 language='en')
        register('text',
                 FieldActions.INDEX_FREETEXT,
                 language='en',
                 spell=True,
                 stop=STOPWORDS)
        # The 'citecnt' FACET (type='float') and WEIGHT actions were
        # commented out in the original and remain disabled.

        for key in FIELD_NUM.keys():
            register(key, FieldActions.STORE_CONTENT)
Exemple #5
0
def get_connection(path, indexer=False, callback=None):
    """Get a connection to the database.

    This is a generator that yields exactly one connection and closes it
    when the generator is finalised (presumably it is wrapped with
    ``contextlib.contextmanager``; the decorator is not visible here).

    A connection that is still registered is reused: the module-level
    ``_index_connection`` for indexers, or the entry of the current thread
    in ``_search_connections`` for searchers.  A reused connection is
    ``reopen()``-ed before it is yielded.

    :param path: database location handed to ``IndexerConnection`` or
                 ``SearchConnection``.
    :param indexer: when true, use the shared indexer connection;
                    otherwise use the current thread's search connection.
    :param callback: optional callable invoked with the connection before
                     it is yielded.

    Opening is attempted up to four times, sleeping half a second after
    each ``xapian.DatabaseOpeningError`` / ``xapian.DatabaseLockError``.
    If every attempt fails, ``None`` is passed to ``callback`` and yielded.
    """
    global _index_connection, _search_connections

    connection = None
    thread = None
    try:
        is_new = False
        attempts = 0
        while attempts <= 3:
            try:
                if indexer:
                    if _index_connection is None:
                        is_new = True
                        _index_connection = IndexerConnection(path)
                    connection = _index_connection
                else:
                    thread = get_current_thread()
                    if thread not in _search_connections:
                        is_new = True
                        connection = SearchConnection(path)
                        _search_connections[thread] = connection
                    else:
                        connection = _search_connections[thread]
            except (xapian.DatabaseOpeningError, xapian.DatabaseLockError):
                time.sleep(0.5)
                attempts += 1
            else:
                break

        if callback:
            callback(connection)

        # Only a connection opened by an earlier call needs refreshing.
        if not is_new:
            connection.reopen()
        yield connection
    finally:
        if connection is not None:
            connection.close()
            # Forget the connection that was just closed so a later call
            # opens a new one instead of reusing a closed object.  Only the
            # registry entry that actually held it is touched.
            if indexer:
                _index_connection = None
            elif (thread in _search_connections
                    and _search_connections[thread] is connection):
                del _search_connections[thread]
Exemple #6
0
    def test_multiple_cache(self):
        """Check two separate caches over one index through replace and delete.

        Two caches (ids ``"1"`` and ``"cache2"``) are created and applied to
        the same index.  Their results are checked after creation, after
        document 4 is replaced through the second cache manager, and after
        document 4 is removed from both caches and from the index.
        """
        with tempdir() as basepath:
            # Index and cache locations.
            indexpath = os.path.join(basepath, "test_index")
            cache_root = os.path.join(basepath, "cache")
            first_cache = os.path.join(cache_root, "1")
            second_cache = os.path.join(cache_root, "2")

            def expect_first(expected, **options):
                # Check the results served through cache "1".
                self._check_cache_results(
                    indexpath, first_cache, "1", expected, **options)

            def expect_second(expected, **options):
                # Check the results served through cache "cache2".
                self._check_cache_results(
                    indexpath, second_cache, "cache2", expected, **options)

            self._create_index(indexpath)
            os.makedirs(cache_root)

            # Create and apply both caches, then check each of them.
            self._create_and_apply_cache(indexpath, first_cache, "1")
            self._create_and_apply_cache(indexpath, second_cache, "cache2")

            expect_first([["5", "4", "3", "2", "1"], ["4", "2", "5", "3", "1"]])
            expect_second([["5", "4", "3", "2", "1"], ["3", "4", "1", "5", "2"]])

            # Document 4 is present in both caches.  It is replaced (with
            # the same terms) while only the second cache manager is
            # attached; both caches are then expected to return the same
            # orderings as before.
            writer = IndexerConnection(indexpath)
            cache = XapianCacheManager(second_cache, id="cache2")
            writer.set_cache_manager(cache)
            docid = "4"
            terms = [("term_a", 4), ("term_b", 2)]
            replacement = self._create_processed_doc(writer, docid, terms)
            writer.replace(replacement, xapid=int(docid))
            writer.flush()
            writer.close()
            cache.close()

            expect_first([["5", "4", "3", "2", "1"], ["4", "2", "5", "3", "1"]])
            expect_second([["5", "4", "3", "2", "1"], ["3", "4", "1", "5", "2"]])

            # There are two code paths when dealing with caches:
            #   1. the cache does not hold enough results, so the answer
            #      comes from a mixed query against the index;
            #   2. the cache holds enough results, so the answer comes from
            #      the cache manager.
            # The cache managers must therefore be updated as well.  With
            # several cache managers the removal has to be done explicitly
            # in each cache, and the delete call is then told to ignore the
            # cache instead of trying to update it.  A better approach for
            # this will be developed.
            writer = IndexerConnection(indexpath)
            cache = XapianCacheManager(first_cache, id="1")
            writer.set_cache_manager(cache)
            writer._remove_cached_items(xapid=4)
            cache = XapianCacheManager(second_cache, id="cache2")
            writer.set_cache_manager(cache)
            writer._remove_cached_items(xapid=4)
            cache.close()
            writer.delete(xapid=4, ignore_cache=True)
            writer.flush()
            writer.close()

            # Path 1: the cache does not hold enough results.
            expect_first([["5", "3", "2", "1"], ["2", "5", "3", "1"]])
            expect_second([["5", "3", "2", "1"], ["3", "1", "5", "2"]])

            # Path 2: the cache holds enough results.
            expect_first([["5"], ["2"]], num_results=1)
            expect_second([["5", "3"], ["3", "1"]], num_results=2)
Exemple #7
0
class XapianIndexer(object):
    """Wrapper around one ``IndexerConnection`` stored in a directory.

    Attributes set by ``__init__``:
      dbPath -- absolute path of the index directory
      dbconn -- the open ``IndexerConnection``
      lock   -- ``threading.Lock`` held by ``add_doc`` while it writes
    """

    def __init__(self, dirname):
        """Open the index under ``dirname`` and register its field actions."""
        self.dbPath = os.path.abspath(dirname)
        self.lock = threading.Lock()
        self._open_connection()

    def _open_connection(self):
        """Open ``self.dbconn`` on ``self.dbPath`` and register the fields."""
        self.dbconn = IndexerConnection(self.dbPath)

        self.dbconn.add_field_action('title',
                                     FieldActions.INDEX_FREETEXT,
                                     weight=5,
                                     language='en')
        self.dbconn.add_field_action('text',
                                     FieldActions.INDEX_FREETEXT,
                                     language='en',
                                     spell=True,
                                     stop=STOPWORDS)
        # Disabled in the original code:
        #   add_field_action('citecnt', FieldActions.FACET, type='float')
        #   add_field_action('citecnt', FieldActions.WEIGHT)

        for name in FIELD_NUM:
            self.dbconn.add_field_action(name, FieldActions.STORE_CONTENT)

    def add_doc(self, doc):
        """Add one document to the index.

        doc: a dict.  ``doc['text']`` becomes the ``text`` field and
        ``str(doc['id'])`` the document id.  Every other key is added as a
        field after ``ensure_unicode``; a list value contributes one field
        per item.

        An ``errors.IndexerError`` raised while adding is printed and not
        propagated.
        """
        content = doc['text']
        document = UnprocessedDocument()
        document.fields.append(Field('text', content))

        for key, value in doc.items():
            if key in ('text', 'id'):
                continue
            items = value if isinstance(value, list) else [value]
            for item in items:
                document.fields.append(Field(key, ensure_unicode(item)))
        document.id = str(doc['id'])

        # ``with`` releases the lock only if it was actually acquired.
        with self.lock:
            try:
                self.dbconn.add(document)
            except errors.IndexerError as e:
                print(str(e))

    def flush(self):
        """Flush the underlying connection."""
        self.dbconn.flush()

    def close(self):
        """Close the underlying connection."""
        self.dbconn.close()

    def clear(self):
        """Remove the index directory and open a new connection on it.

        ``self.lock`` is kept, so threads that synchronise on it keep
        sharing the same lock object after a clear.
        """
        self.close()
        shutil.rmtree(self.dbPath)
        self._open_connection()
 def _apply_cache(self, indexpath, cm):
     """Apply the cached items of ``cm`` to the index at ``indexpath``.

     Opens an ``IndexerConnection``, attaches the given cache manager,
     calls ``apply_cached_items()`` and closes the connection.  The cache
     manager itself is not closed here.
     """
     idx = IndexerConnection(indexpath)
     try:
         idx.set_cache_manager(cm)
         idx.apply_cached_items()
     finally:
         # Close the connection even when applying the cache fails.
         idx.close()
    def test_multiple_cache(self):
        """Check two caches handled by one ``XapianMultipleCachesManager``.

        Caches ``"1"`` and ``"cache2"`` live under one base cache path.
        Their results are checked after creation, after document 4 is
        replaced, and after document 4 is deleted, each time with a manager
        that has both caches added.
        """
        with tempdir() as basepath:
            indexpath = os.path.join(basepath, "test_index")
            base_cachepath = os.path.join(basepath, "cache")

            def expect(cache_id, expected, **options):
                # Check the results served through the cache ``cache_id``.
                self._check_cache_results(
                    indexpath, base_cachepath, cache_id, expected, **options)

            self._create_index(indexpath)
            os.makedirs(base_cachepath)

            # Create and apply cache "1", then cache "cache2", selecting
            # each one on the same manager before it is filled.
            caches = XapianMultipleCachesManager(base_cachepath)
            caches.add_cache("1")
            caches.select_cache("1")
            self._create_and_apply_cache(indexpath, caches)

            caches.add_cache("cache2")
            caches.select_cache("cache2")
            self._create_and_apply_cache(indexpath, caches)

            expect("1", [["5", "4", "3", "2", "1"], ["4", "2", "5", "3", "1"]])
            expect("cache2", [["5", "4", "3", "2", "1"], ["3", "4", "1", "5", "2"]])

            # Document 4 is present in both caches.  It is replaced (with
            # the same terms) through a manager that has both caches added;
            # both caches are then expected to return the same orderings as
            # before.
            writer = IndexerConnection(indexpath)
            caches = XapianMultipleCachesManager(base_cachepath)
            caches.add_cache("1")
            caches.add_cache("cache2")
            writer.set_cache_manager(caches)
            docid = "4"
            terms = [("term_a", 4), ("term_b", 2)]
            replacement = self._create_processed_doc(writer, docid, terms)
            writer.replace(replacement, xapid=int(docid))
            writer.flush()
            writer.close()
            caches.close()

            expect("1", [["5", "4", "3", "2", "1"], ["4", "2", "5", "3", "1"]])
            expect("cache2", [["5", "4", "3", "2", "1"], ["3", "4", "1", "5", "2"]])

            # There are two code paths when dealing with caches:
            #   1. the cache does not hold enough results, so the answer
            #      comes from a mixed query against the index;
            #   2. the cache holds enough results, so the answer comes from
            #      the cache manager.
            # So all the cache managers must be updated.
            caches = XapianMultipleCachesManager(base_cachepath)
            caches.add_cache("1")
            caches.add_cache("cache2")

            writer = IndexerConnection(indexpath)
            writer.set_cache_manager(caches)
            writer.delete(xapid=4)
            caches.close()
            writer.flush()
            writer.close()

            # Path 1: the cache does not hold enough results.
            expect("1", [["5", "3", "2", "1"], ["2", "5", "3", "1"]])
            expect("cache2", [["5", "3", "2", "1"], ["3", "1", "5", "2"]])

            # Path 2: the cache holds enough results.
            expect("1", [["5"], ["2"]], num_results=1)
            expect("cache2", [["5", "3"], ["3", "1"]], num_results=2)
Exemple #10
0
class XapianIndexer(object):
    """Indexer that owns one ``IndexerConnection`` for a directory.

    Attributes set by ``__init__``:
      dbPath -- absolute path of the index directory
      dbconn -- the open ``IndexerConnection``
      lock   -- ``threading.Lock`` held by ``add_doc`` while it writes
    """

    def __init__(self, dirname):
        """Open the index under ``dirname`` and register its field actions."""
        self.dbPath = os.path.abspath(dirname)
        self.lock = threading.Lock()
        self._open_connection()

    def _open_connection(self):
        """Open ``self.dbconn`` on ``self.dbPath`` and register the fields."""
        self.dbconn = IndexerConnection(self.dbPath)

        self.dbconn.add_field_action('title', FieldActions.INDEX_FREETEXT,
                                     weight=5, language='en')
        self.dbconn.add_field_action('text', FieldActions.INDEX_FREETEXT,
                                     language='en', spell=True, stop=STOPWORDS)
        # Disabled in the original code:
        #   add_field_action('citecnt', FieldActions.FACET, type='float')
        #   add_field_action('citecnt', FieldActions.WEIGHT)

        for name in FIELD_NUM:
            self.dbconn.add_field_action(name, FieldActions.STORE_CONTENT)

    def add_doc(self, doc):
        """Add one document to the index.

        doc: a dict.  ``doc['text']`` becomes the ``text`` field and
        ``str(doc['id'])`` the document id.  Every other key is added as a
        field after ``ensure_unicode``; a list value contributes one field
        per item.

        An ``errors.IndexerError`` raised while adding is printed and not
        propagated.
        """
        content = doc['text']
        document = UnprocessedDocument()
        document.fields.append(Field('text', content))

        for key, value in doc.items():
            if key in ('text', 'id'):
                continue
            items = value if isinstance(value, list) else [value]
            for item in items:
                document.fields.append(Field(key, ensure_unicode(item)))
        document.id = str(doc['id'])

        # ``with`` releases the lock only if it was actually acquired.
        with self.lock:
            try:
                self.dbconn.add(document)
            except errors.IndexerError as e:
                print(str(e))

    def flush(self):
        """Flush the underlying connection."""
        self.dbconn.flush()

    def close(self):
        """Close the underlying connection."""
        self.dbconn.close()

    def clear(self):
        """Remove the index directory and open a new connection on it.

        ``self.lock`` is kept, so threads that synchronise on it keep
        sharing the same lock object after a clear.
        """
        self.close()
        shutil.rmtree(self.dbPath)
        self._open_connection()