Example #1
    def search(self, phrase, keywords=None, sortAscending=True):
        if not phrase and not keywords:
            return []

        # XXX Colons in phrase will screw stuff up.  Can they be quoted or
        # escaped somehow?  Probably by using a different QueryParser.
        if keywords:
            fieldPhrase = u' '.join(u':'.join((k, v))
                                    for (k, v) in keywords.iteritems())
            if phrase:
                phrase = phrase + u' ' + fieldPhrase
            else:
                phrase = fieldPhrase
        phrase = phrase.translate({
            ord(u'@'): u' ',
            ord(u'-'): u' ',
            ord(u'.'): u' '
        })
        qp = PyLucene.QueryParser('text', self.analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parseQuery(phrase)

        sort = PyLucene.Sort(PyLucene.SortField('sortKey', not sortAscending))

        try:
            hits = self.searcher.search(query, sort)
        except PyLucene.JavaError, err:
            if 'no terms in field sortKey' in str(err):
                hits = []
            else:
                raise
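
A minimal call sketch for this method; the instance name and the field values are assumptions, not part of the snippet:

    hits = index.search(u'jane doe', keywords={u'host': u'example.com'},
                        sortAscending=False)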
Example #2
    def update(self, indexer, iterable):
        """Update an index from a queryset."""
        self._open_index()

        for o in ifilter(indexer.should_index, iterable):
            # Clear a potential old object out of the index
            self.remove(o)

            # Create a new document to index.
            doc = PyLucene.Document()

            # Index the model identifier so we can easily deal with only models of a certain type
            doc.add(
                PyLucene.Field(MODEL_FIELD, str(o._meta), STORE_YES,
                               UN_TOKENIZED))

            # Index the "identifier" (app_label.module_name.pk) for this object
            doc.add(
                PyLucene.Field(IDENTIFIER_FIELD, self.get_identifier(o),
                               STORE_YES, INDEX_NO))

            # Index the default content for the object
            # Don't actually store the complete contents; just index them.
            doc.add(
                PyLucene.Field(CONTENTS_FIELD, indexer.flatten(o), STORE_NO,
                               TOKENIZED))

            # Index each field that needs to be individually searchable.
            for (name, value) in indexer.get_field_values(o).items():
                doc.add(PyLucene.Field(name, value, STORE_NO, TOKENIZED))

            self._index.addDocument(doc)

        self._close_index()
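
A hedged usage sketch, assuming the surrounding class is a search backend and the indexer object provides the should_index, flatten and get_field_values hooks used above (backend, article_indexer and Article are hypothetical names):

    backend.update(article_indexer, Article.objects.all())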
Example #3
    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this function creates a field->value query

        :param field: The fieldname to be used
        :type field: str
        :param value: The wanted value of the field
        :type value: str
        :param analyzer: The analyzer to be used
                         Possible analyzers are:
                         - :attr:`CommonDatabase.ANALYZER_TOKENIZE`
                           the field value is split to be matched word-wise
                         - :attr:`CommonDatabase.ANALYZER_PARTIAL`
                           the field value must start with the query string
                         - :attr:`CommonDatabase.ANALYZER_EXACT`
                           keep special characters and the like
        :type analyzer: int
        :return: resulting query object
        :rtype: PyLucene.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer == self.ANALYZER_EXACT:
            analyzer_obj = PyLucene.KeywordAnalyzer()
        else:
            value = self._escape_term_value(value)
            analyzer_obj = PyLucene.StandardAnalyzer()
        qp = PyLucene.QueryParser(field, analyzer_obj)
        if (analyzer & self.ANALYZER_PARTIAL) > 0:
            # PyLucene uses explicit wildcards for partial matching
            value += "*"
        return qp.parse(value)
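
A short sketch of how the analyzer flags drive this helper, assuming db is an instance of the surrounding class; with ANALYZER_PARTIAL the parsed value carries a trailing wildcard, turning the query into a prefix match:

    # prefix match on the 'title' field ('db' and both values are assumptions)
    query = db._create_query_for_field('title', 'intro',
                                       analyzer=db.ANALYZER_PARTIAL)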
Example #4
    def indexDocs(self, root, writer):
        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                #if not filename.endswith('.txt'):
                #    continue
                print "adding", filename
                try:
                    path = os.path.join(root, filename)
                    # 'f' avoids shadowing the builtin 'file'
                    f = open(path)
                    contents = unicode(f.read(), 'iso-8859-1')
                    f.close()
                    doc = PyLucene.Document()
                    doc.add(
                        PyLucene.Field("name", filename,
                                       PyLucene.Field.Store.YES,
                                       PyLucene.Field.Index.UN_TOKENIZED))
                    doc.add(
                        PyLucene.Field("path", path, PyLucene.Field.Store.YES,
                                       PyLucene.Field.Index.UN_TOKENIZED))
                    if len(contents) > 0:
                        doc.add(
                            PyLucene.Field("contents", contents,
                                           PyLucene.Field.Store.YES,
                                           PyLucene.Field.Index.TOKENIZED))
                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e
Example #5
    def search(self,
               q,
               models=None,
               order_by=RELEVANCE,
               limit=None,
               offset=None):
        """Perform a search."""
        original_query = q
        q = query.convert(original_query, LuceneQueryConverter)
        if models:
            models_queries = []
            for m in models:
                if hasattr(m, "_meta"):
                    models_queries.append('%s:"%s"' % (MODEL_FIELD, m._meta))
                else:
                    models_queries.append('%s:"%s"' % (MODEL_FIELD, m))
            q += ' AND (%s)' % (' '.join(models_queries))

        searcher = PyLucene.IndexSearcher(settings.SEARCH_INDEX_PATH)
        analyzer = PorterStemmerAnalyzer()
        compiled_query = PyLucene.QueryParser(CONTENTS_FIELD,
                                              analyzer).parse(q)

        if order_by is RELEVANCE:
            sort = PyLucene.Sort.RELEVANCE
        else:
            descending = order_by.startswith('-')
            sort_field = PyLucene.SortField(order_by.lstrip('-'), descending)
            sort = PyLucene.Sort(sort_field)

        hits = searcher.search(compiled_query, sort)
        return self._get_search_results(original_query, hits, limit, offset)
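
A hedged call sketch; backend and Article are hypothetical names, and order_by uses the leading-minus convention the method strips off:

    results = backend.search('lucene', models=[Article],
                             order_by='-pub_date', limit=10)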
Example #6
    def tokenStream(self, fieldName, reader):
        result = PyLucene.StandardTokenizer(reader)
        result = PyLucene.StandardFilter(result)
        result = PyLucene.LowerCaseFilter(result)
        result = PyLucene.PorterStemFilter(result)
        result = PyLucene.StopFilter(result,
                                     PyLucene.StopAnalyzer.ENGLISH_STOP_WORDS)
        return result
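
This tokenStream chain is typically the body of a custom analyzer such as the PorterStemmerAnalyzer used in Example #5; a minimal sketch of the enclosing class, assuming the build exposes PyLucene.PythonAnalyzer as the Python-side extension point:

    class PorterStemmerAnalyzer(PyLucene.PythonAnalyzer):
        def tokenStream(self, fieldName, reader):
            # tokenize, normalize, lower-case, stem, then drop stop words
            result = PyLucene.StandardTokenizer(reader)
            result = PyLucene.StandardFilter(result)
            result = PyLucene.LowerCaseFilter(result)
            result = PyLucene.PorterStemFilter(result)
            return PyLucene.StopFilter(
                result, PyLucene.StopAnalyzer.ENGLISH_STOP_WORDS)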
Example #7
    def index_node(self, iba_node):
        self.delete_node(iba_node.nid)
        create = len(os.listdir('index')) == 0
        analyzer = PyLucene.StandardAnalyzer()
        writer = PyLucene.IndexWriter("index", analyzer, create)

        writer.addDocument(self._document_node(iba_node))
        writer.close()
        self.count = self.count + 1
Example #8
    def search_node_by_name2(self, name):
        if self.searcher is None:
            self.searcher = PyLucene.IndexSearcher("index")

        query = PyLucene.QueryParser(COLUMN_NAME,
                                     PyLucene.StandardAnalyzer()).parse(name)
        hits = self.searcher.search(query)
        result = self.hits_to_list(hits)

        return result
Example #9
    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """Initialize or open an indexing database.

        Any derived class must override __init__.

        :raise ValueError: The given location exists, but the database type
                           is incompatible (e.g. created by a different indexing engine)
        :raise OSError: the database failed to initialize

        :param basedir: The parent directory of the database
        :type basedir: str
        :param analyzer: Bitwise combination of possible analyzer flags
                         to be used as the default analyzer for this database.
                         Leave it empty to use the system default analyzer
                         (self.ANALYZER_DEFAULT). See self.ANALYZER_TOKENIZE,
                         self.ANALYZER_PARTIAL, ...
        :type analyzer: int
        :param create_allowed: create the database, if necessary; default: True
        :type create_allowed: bool
        """
        jvm = PyLucene.getVMEnv()
        jvm.attachCurrentThread()
        super(PyLuceneDatabase, self).__init__(basedir,
                                               analyzer=analyzer,
                                               create_allowed=create_allowed)
        self.pyl_analyzer = PyLucene.StandardAnalyzer()
        self.writer = None
        self.reader = None
        self.index_version = None
        try:
            # try to open an existing database
            tempreader = PyLucene.IndexReader.open(self.location)
            tempreader.close()
        except PyLucene.JavaError, err_msg:
            # Write an error out, in case this is a real problem instead of an absence of an index
            # TODO: turn the following two lines into debug output
            #errorstr = str(e).strip() + "\n" + self.errorhandler.traceback_str()
            #DEBUG_FOO("could not open index, so going to create: " + errorstr)
            # Create the index, so we can open cached readers on it
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            try:
                # create the parent directory if it does not exist
                parent_path = os.path.dirname(self.location)
                if not os.path.isdir(parent_path):
                    # recursively create all directories up to parent_path
                    os.makedirs(parent_path)
            except IOError, err_msg:
                raise OSError("Indexer: failed to create the parent " \
                        + "directory (%s) of the indexing database: %s" \
                        % (parent_path, err_msg))
Example #10
    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        :return: the new document object
        :rtype: PyLucene.Document
        """
        return PyLucene.Document()
Example #11
    def clear(self, models):
        """Clear all indexed documents for the given models."""
        self._open_index()
        for model in models:
            term = PyLucene.Term(MODEL_FIELD, str(model._meta))
            self._index.deleteDocuments(term)
        self._close_index()
Example #12
    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of xapian.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: PyLucene.Query
        """
        combined_query = PyLucene.BooleanQuery()
        for query in queries:
            combined_query.add(
                PyLucene.BooleanClause(query, require_all, False))
        return combined_query
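
A usage sketch pairing this combinator with the field-query helper from Example #13; db and the field values are assumptions:

    q1 = db._create_query_for_field('author', 'smith')
    q2 = db._create_query_for_field('year', '2007')
    both = db._create_query_combined([q1, q2], require_all=True)  # AND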
Example #13
    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this function creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: the analyzer to be used
            possible analyzers are:
              - L{CommonDatabase.ANALYZER_TOKENIZE}
                    the field value is split to be matched word-wise
              - L{CommonDatabase.ANALYZER_PARTIAL}
                    the field value must start with the query string
              - L{CommonDatabase.ANALYZER_EXACT}
                    keep special characters and the like
        @type analyzer: int
        @return: resulting query object
        @rtype: PyLucene.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer == self.ANALYZER_EXACT:
            analyzer_obj = self.ExactAnalyzer()
        else:
            value = _escape_term_value(value)
            analyzer_obj = PyLucene.StandardAnalyzer()
        if (analyzer & self.ANALYZER_PARTIAL) > 0:
            # PyLucene uses explicit wildcards for partial matching
            value += "*"
        return PyLucene.QueryParser.parse(value, field, analyzer_obj)
Example #14
    def _index_refresh(self):
        """re-read the indexer database"""
        try:
            if self.reader is None or self.searcher is None:
                self.reader = PyLucene.IndexReader.open(self.location)
                self.searcher = PyLucene.IndexSearcher(self.reader)
            elif self.index_version != self.reader.getCurrentVersion(
                    self.location):
                self.searcher.close()
                self.reader.close()
                self.reader = PyLucene.IndexReader.open(self.location)
                self.searcher = PyLucene.IndexSearcher(self.reader)
                self.index_version = self.reader.getCurrentVersion(
                    self.location)
        except PyLucene.JavaError, e:
            # TODO: add some debugging output?
            #self.errorhandler.logerror("Error attempting to read index - try reindexing: "+str(e))
            pass
Example #15
    def _create_query_for_string(self, text, require_all=True,
                                 analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        :param text: The query string
        :type text: str
        :param require_all: boolean operator
                            (True -> AND (default) / False -> OR)
        :type require_all: bool
        :param analyzer: The analyzer to be used
                         Possible analyzers are:
                         - :attr:`CommonDatabase.ANALYZER_TOKENIZE`
                           the field value is split to be matched word-wise
                         - :attr:`CommonDatabase.ANALYZER_PARTIAL`
                           the field value must start with the query string
                         - :attr:`CommonDatabase.ANALYZER_EXACT`
                           keep special characters and the like
        :type analyzer: int
        :return: resulting query object
        :rtype: PyLucene.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer == self.ANALYZER_EXACT:
            # exact matching - no substitution ...
            # for PyLucene: nothing special is necessary
            pass
        # don't care about special characters ...
        if analyzer == self.ANALYZER_EXACT:
            analyzer_obj = self.ExactAnalyzer()
        else:
            text = _escape_term_value(text)
            analyzer_obj = PyLucene.StandardAnalyzer()
        qp = PyLucene.QueryParser(UNNAMED_FIELD_NAME, analyzer_obj)
        if require_all:
            qp.setDefaultOperator(qp.Operator.AND)
        else:
            qp.setDefaultOperator(qp.Operator.OR)
        if (analyzer & self.ANALYZER_PARTIAL) > 0:
            # PyLucene uses explicit wildcards for partial matching
            text += "*"
        return qp.parse(text)
Example #16
    def get_writer(self, create=False):
        writer = None
        while writer is None:
            try:
                writer = lucene.IndexWriter(self.store, self.analyzer, create)
                writer.setMaxFieldLength(1048576)
            except Exception, e:
                print e
                time.sleep(.1)
        # hand the successfully opened writer back to the caller
        return writer
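
A hedged caller for get_writer; the loop above retries until the index lock is free, so the writer should be closed promptly (store and doc are assumptions):

    writer = store.get_writer(create=False)
    try:
        writer.addDocument(doc)
    finally:
        writer.close()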
Example #17
    def delete_existing_feed_docs(self, feed):
        """ deletes existing documents relating to the given feed """
        reader = lucene.IndexReader.open(self.feed_modifier.store)
        numDeleted = reader.deleteDocuments(lucene.Term('url', feed.xml_url))
        logging.info('deleted %d existing index documents' % numDeleted)
        reader.close()

        reader = lucene.IndexReader.open(self.entry_modifier.store)
        for entry in feed.get_entries():
            try:
                entry_id = '%s:%s' % (feed.xml_url, entry.get('id', None))
                numDeleted = reader.deleteDocuments(lucene.Term('id', entry_id))
                if numDeleted:
                    logging.info('deleted %d feed entry documents' %
                                 numDeleted)
            except Exception:
                # ignore entries whose index documents cannot be deleted
                pass
        reader.close()
Example #18
    def unindex_record(self, record):
        """
        Unindex documents matching this entry's uid.  *Should*
        only be one, but could be many, if somehow the same entry
        got indexed multiple times.
        """
        reader = self.context.get_search_index_reader()
        term = PyLucene.Term('uid', str(record.uid))
        reader.deleteDocuments(term)
        reader.close()
Example #19
    def search(self,
               query_string='',
               require_visible=True,
               allow_curated=True):

        hits = []
        searcher = None
        query_string = str(query_string)
        self.logger.info('Performing search: %s' % query_string)
        disassembled_query = disassemble_user_query(query_string)
        self.logger.debug('Disassembled query: %s' % str(disassembled_query))
        reassembled_query = '+(%s)' % reassemble_user_query(disassembled_query)
        self.logger.debug('Reassembled query: %s', reassembled_query)

        if not allow_curated:
            reassembled_query += \
                ' -record-status:%s' % canary.loader.QueuedRecord.STATUS_CURATED

        if require_visible:
            reassembled_query += ' +article-type:[%s TO %s]' % \
                (Study.ARTICLE_TYPES['traditional'],
                Study.ARTICLE_TYPES['curated'])
            reassembled_query += ' +record-status:%s' % \
                canary.loader.QueuedRecord.STATUS_CURATED

        try:
            searcher = PyLucene.IndexSearcher(
                PyLucene.FSDirectory.getDirectory(
                    self.context.config.search_index_dir, False))
            analyzer = PyLucene.StandardAnalyzer()
            query_parser = PyLucene.QueryParser('all', analyzer)
            query_parser.setOperator(PyLucene.QueryParser.DEFAULT_OPERATOR_AND)
            query = query_parser.parseQuery(reassembled_query)
            self.logger.info('Search query: %s', query)
            hits = searcher.search(query)
            return hits, searcher
        except Exception, e:
            self.logger.error('Search failed: %s', e)
            #self.logger.error(traceback.format_stack())
            if hits and searcher:
                return hits, searcher
            else:
                return [], None
Example #20
    def __del__(self):
        """remove lock and close writer after losing the last reference"""
        jvm = PyLucene.getVMEnv()
        jvm.attachCurrentThread()
        self._writer_close()
        if hasattr(self, "reader") and self.reader is not None:
            self.reader.close()
            self.reader = None
        if hasattr(self, "searcher") and self.searcher is not None:
            self.searcher.close()
            self.searcher = None
Example #21
    def search(self,
               query,
               fields=FEED_ENTRY_FIELDS,
               analyzer=None,
               store=None):
        if not query or len(query.strip()) == 0 or len(fields) == 0:
            return None
        analyzer = analyzer or self.analyzer
        if store is None:
            store = self.entry_modifier.store

        if len(fields) > 1:
            qp = lucene.MultiFieldQueryParser(fields, analyzer)
        else:
            qp = lucene.QueryParser(fields[0], analyzer)
        q = qp.parse(query)

        searcher = lucene.IndexSearcher(store)
        hits = searcher.search(q, lucene.Sort.RELEVANCE)
        return HitHolder(hits, searcher)
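
A call sketch, assuming engine is an instance of the surrounding class and that HitHolder keeps the hits together with the searcher that produced them:

    holder = engine.search('python indexing', fields=('title', 'summary'))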
Example #22
        def openReadIndex(self):
            luceneDir = self.store.newDirectory(self.indexDirectory)

            if not luceneDir.exists():
                self.openWriteIndex().close()

            fsdir = PyLucene.FSDirectory.getDirectory(luceneDir.path, False)
            try:
                searcher = PyLucene.IndexSearcher(fsdir)
            except PyLucene.JavaError, e:
                raise IndexCorrupt()
Example #23
    def begin_transaction(self):
        """PyLucene does not support transactions

        Thus this function just opens the database for write access.
        Call "cancel_transaction" or "commit_transaction" to close write
        access in order to remove the exclusive lock from the database
        directory.
        """
        jvm = PyLucene.getVMEnv()
        jvm.attachCurrentThread()
        self._writer_open()
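
The docstring implies a begin/commit/cancel protocol; a minimal sketch of that pattern, assuming commit_transaction and cancel_transaction close write access as described:

    db.begin_transaction()
    try:
        # ... add or delete documents here ...
        db.commit_transaction()
    except Exception:
        db.cancel_transaction()
        raise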
Example #26
    def search_node_by_attribute2(self, att_type, att_value):
        if self.searcher is None:
            self.searcher = PyLucene.IndexSearcher("index")

        analyzer = PyLucene.StandardAnalyzer()

        if att_type != "" and att_value == "":
            parser = PyLucene.QueryParser(COLUMN_ATTRIBUTE_TYPE_NID, analyzer)
            query = parser.parse(att_type)
        elif att_type == "" and att_value != "":
            parser = PyLucene.QueryParser(COLUMN_ATTRIBUTE_VALUE, analyzer)
            query = parser.parse(att_value)
        elif att_type != "" and att_value != "":
            parser = PyLucene.QueryParser(COLUMN_ATTRIBUTE_VALUE, analyzer)
            query = parser.parse(COLUMN_ATTRIBUTE_TYPE_NID + ":" + att_type +
                                 " AND " + att_value)

        hits = self.searcher.search(query)
        result = self.hits_to_list(hits)

        return result
Example #27
    def __init__(self, store_dir=STORE_DIR, destroy=False, analyzer=None):
        self.store_dir = store_dir
        self.analyzer = analyzer or lucene.StandardAnalyzer()

        self.feed_modifier = IndexModifier(
            store_dir=os.path.join(store_dir, 'feeds'),
            destroy=destroy, analyzer=analyzer)
        self.entry_modifier = IndexModifier(
            store_dir=os.path.join(store_dir, 'entries'),
            destroy=destroy, analyzer=analyzer)
Example #28
    def _create_query_for_string(self, text, require_all=True,
                                 analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: the analyzer to be used
            possible analyzers are:
             -  L{CommonDatabase.ANALYZER_TOKENIZE}
                    the field value is split to be matched word-wise
             -  L{CommonDatabase.ANALYZER_PARTIAL}
                    the field value must start with the query string
             -  L{CommonDatabase.ANALYZER_EXACT}
                    keep special characters and the like
        @type analyzer: int
        @return: resulting query object
        @rtype: PyLucene.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer == self.ANALYZER_EXACT:
            analyzer_obj = PyLucene.KeywordAnalyzer()
        else:
            text = _escape_term_value(text)
            analyzer_obj = PyLucene.StandardAnalyzer()
        qp = PyLucene.QueryParser(UNNAMED_FIELD_NAME, analyzer_obj)
        if (analyzer & self.ANALYZER_PARTIAL) > 0:
            # PyLucene uses explicit wildcards for partial matching
            text += "*"
        if require_all:
            qp.setDefaultOperator(qp.Operator.AND)
        else:
            qp.setDefaultOperator(qp.Operator.OR)
        return qp.parse(text)
Example #29
def create_index(self, arg):
    """ Post download setup callback for creating a lucene index """

    moreinfo("Creating lucene index")
    storeDir = "index"
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    store = PyLucene.FSDirectory.getDirectory(storeDir, True)

    self.lucene_writer = PyLucene.IndexWriter(store,
                                              PyLucene.StandardAnalyzer(),
                                              True)
    # Uncomment this line to enable a PorterStemmer analyzer
    # self.lucene_writer = PyLucene.IndexWriter(store, PorterStemmerAnalyzer(), True)
    self.lucene_writer.setMaxFieldLength(1048576)

    count = 0

    urllist = []

    for urlobj in self._urldict.values():

        filename = urlobj.get_full_filename()
        url = urlobj.get_full_url()

        # skip URLs that have already been indexed
        if url in urllist:
            continue
        urllist.append(url)

        if filename not in self._downloaddict['_savedfiles']:
            continue

        data = ''

        moreinfo('Adding index for URL', url)

        if os.path.isfile(filename):
            try:
                data = unicode(open(filename).read(), 'iso-8859-1')
            except UnicodeDecodeError, e:
                data = ''

        doc = PyLucene.Document()
        doc.add(
            PyLucene.Field("name", filename, PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.UN_TOKENIZED))
        doc.add(
            PyLucene.Field("path", url, PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.UN_TOKENIZED))
        if data:
            doc.add(
                PyLucene.Field("contents", data, PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))
        else:
            extrainfo("warning: no content in %s" % filename)

        self.lucene_writer.addDocument(doc)
        count += 1
Example #30
    def delete_node(self, nid):
        try:
            index_present = len(os.listdir('index')) > 0
            if index_present:
                reader = PyLucene.IndexReader.open("index")
                term = PyLucene.Term(COLUMN_NID, nid)
                if reader.termDocs(term) is not None:
                    reader.deleteDocuments(term)
                reader.close()
        except Exception:
            IBASGlobal.print_message(
                "Error while deleting document from Lucene with nid " +
                str(nid), 0)
Example #31
    def _writer_open(self):
        """open write access for the indexing database and acquire an
        exclusive lock
        """
        if not self._writer_is_open():
            self._delete_stale_lock()
            self.writer = PyLucene.IndexWriter(self.location,
                                               self.pyl_analyzer, False)
            # "setMaxFieldLength" is available since PyLucene v2
            # we must stay compatible with v1 for the derived class
            # (PyLuceneIndexer1) - thus we make this step optional
            if hasattr(self.writer, "setMaxFieldLength"):
                self.writer.setMaxFieldLength(MAX_FIELD_SIZE)
    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        :param document: the document to be changed
        :type document: xapian.Document | PyLucene.Document
        :param term: a single term to be added
        :type term: str
        :param tokenize: should the term be tokenized automatically
        :type tokenize: bool
        """
        # Field parameters: name, string, store, index, token
        document.add(PyLucene.Field(str(PyLuceneIndex.UNNAMED_FIELD_NAME),
                                    term, True, True, tokenize))
    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """Initialize or open an indexing database.

        Any derived class must override __init__.

        :raise ValueError: The given location exists, but the database type
                           is incompatible (e.g. created by a different indexing engine)
        :raise OSError: the database failed to initialize

        :param basedir: The parent directory of the database
        :type basedir: str
        :param analyzer: Bitwise combination of possible analyzer flags
                         to be used as the default analyzer for this database.
                         Leave it empty to use the system default analyzer
                         (self.ANALYZER_DEFAULT). See self.ANALYZER_TOKENIZE,
                         self.ANALYZER_PARTIAL, ...
        :type analyzer: int
        :param create_allowed: create the database, if necessary; default: True
        :type create_allowed: bool
        """
        jvm = PyLucene.getVMEnv()
        jvm.attachCurrentThread()
        super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer,
                                               create_allowed=create_allowed)
        self.pyl_analyzer = PyLucene.StandardAnalyzer()
        self.writer = None
        self.reader = None
        self.index_version = None
        try:
            # try to open an existing database
            tempreader = PyLucene.IndexReader.open(self.location)
            tempreader.close()
        except PyLucene.JavaError, err_msg:
            # Write an error out, in case this is a real problem instead of an absence of an index
            # TODO: turn the following two lines into debug output
            # errorstr = str(e).strip() + "\n" + self.errorhandler.traceback_str()
            # DEBUG_FOO("could not open index, so going to create: " + errorstr)
            # Create the index, so we can open cached readers on it
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            try:
                # create the parent directory if it does not exist
                parent_path = os.path.dirname(self.location)
                if not os.path.isdir(parent_path):
                    # recursively create all directories up to parent_path
                    os.makedirs(parent_path)
            except IOError, err_msg:
                raise OSError(
                    "Indexer: failed to create the parent "
                    + "directory (%s) of the indexing database: %s" % (parent_path, err_msg)
                )
    def make_query(self, *args, **kwargs):
        jvm = PyLucene.getVMEnv()
        jvm.attachCurrentThread()
        return super(PyLuceneDatabase, self).make_query(*args, **kwargs)
import re
import os
import time
import logging

# try to import the PyLucene package (with the two possible names);
# remember the type of the detected package: compiled with jcc (>=v2.3)
# or with gcj (<=v2.2)
try:
    import PyLucene
    _COMPILER = 'gcj'
except ImportError:
    # if this fails, then there is no pylucene installed
    import lucene
    PyLucene = lucene
    PyLucene.initVM(PyLucene.CLASSPATH)
    _COMPILER = 'jcc'

import CommonIndexer


UNNAMED_FIELD_NAME = "FieldWithoutAName"
MAX_FIELD_SIZE = 1048576


def is_available():
    return _get_pylucene_version() == 2


class PyLuceneDatabase(CommonIndexer.CommonDatabase):
    """manage and use a pylucene indexing database"""
    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """Initialize or open an indexing database.

        Any derived class must override __init__.

        :raise ValueError: The given location exists, but the database type
                           is incompatible (e.g. created by a different indexing engine)
        :raise OSError: the database failed to initialize

        :param basedir: The parent directory of the database
        :type basedir: str
        :param analyzer: Bitwise combination of possible analyzer flags
                         to be used as the default analyzer for this database.
                         Leave it empty to use the system default analyzer
                         (self.ANALYZER_DEFAULT). See self.ANALYZER_TOKENIZE,
                         self.ANALYZER_PARTIAL, ...
        :type analyzer: int
        :param create_allowed: create the database, if necessary; default: True
        :type create_allowed: bool
        """
        jvm = PyLucene.getVMEnv()
        jvm.attachCurrentThread()
        super(PyLuceneDatabase, self).__init__(
            basedir, analyzer=analyzer, create_allowed=create_allowed)
        self.pyl_analyzer = PyLucene.StandardAnalyzer()
        self.writer = None
        self.reader = None
        self.index_version = None
        try:
            # try to open an existing database
            tempreader = PyLucene.IndexReader.open(self.location)
            tempreader.close()
        except PyLucene.JavaError as err_msg:
            # Write an error out, in case this is a real problem instead of an absence of an index
            # TODO: turn the following two lines into debug output
            #errorstr = str(e).strip() + "\n" + self.errorhandler.traceback_str()
            #DEBUG_FOO("could not open index, so going to create: " + errorstr)
            # Create the index, so we can open cached readers on it
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            try:
                # create the parent directory if it does not exist
                parent_path = os.path.dirname(self.location)
                if not os.path.isdir(parent_path):
                    # recursively create all directories up to parent_path
                    os.makedirs(parent_path)
            except IOError as err_msg:
                raise OSError("Indexer: failed to create the parent "
                              "directory (%s) of the indexing database: %s" %
                              (parent_path, err_msg))
            try:
                tempwriter = PyLucene.IndexWriter(
                    self.location, self.pyl_analyzer, True)
                tempwriter.close()
            except PyLucene.JavaError as err_msg:
                raise OSError("Indexer: failed to open or create a Lucene"
                              " database (%s): %s" % (self.location, err_msg))
        # the indexer is initialized - now we prepare the searcher
        # windows file locking seems inconsistent, so we try 10 times
        numtries = 0
        #self.dir_lock.acquire(blocking=True)
        # read "self.reader", "self.indexVersion" and "self.searcher"
        try:
            while numtries < 10:
                try:
                    self.reader = PyLucene.IndexReader.open(self.location)
                    self.index_version = self.reader.getCurrentVersion(
                        self.location)
                    self.searcher = PyLucene.IndexSearcher(self.reader)
                    break
                except PyLucene.JavaError as e:
                    # store error message for possible later re-raise (below)
                    lock_error_msg = e
                    time.sleep(0.01)
                    numtries += 1
            else:
                # locking failed for 10 times
                raise OSError("Indexer: failed to lock index database"
                              " (%s)" % lock_error_msg)
        finally:
            pass
        #    self.dir_lock.release()
        # initialize the searcher and the reader
        self._index_refresh()
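
Putting the module together, a hedged end-to-end sketch; the directory is an assumption and the indexing calls stand in for the CommonIndexer API used by the methods above:

    if is_available():
        db = PyLuceneDatabase("/tmp/myindex", create_allowed=True)
        db.begin_transaction()
        # ... build and add documents via the CommonIndexer API ...
        db.commit_transaction()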