def search(self, phrase, keywords=None, sortAscending=True):
    if not phrase and not keywords:
        return []
    # XXX Colons in phrase will screw stuff up.  Can they be quoted or
    # escaped somehow?  Probably by using a different QueryParser.
    if keywords:
        fieldPhrase = u' '.join(u':'.join((k, v)) for (k, v) in keywords.iteritems())
        if phrase:
            phrase = phrase + u' ' + fieldPhrase
        else:
            phrase = fieldPhrase
    phrase = phrase.translate({ord(u'@'): u' ',
                               ord(u'-'): u' ',
                               ord(u'.'): u' '})
    qp = PyLucene.QueryParser('text', self.analyzer)
    qp.setDefaultOperator(qp.Operator.AND)
    query = qp.parseQuery(phrase)
    sort = PyLucene.Sort(PyLucene.SortField('sortKey', not sortAscending))
    try:
        hits = self.searcher.search(query, sort)
    except PyLucene.JavaError, err:
        if 'no terms in field sortKey' in str(err):
            hits = []
        else:
            raise

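# Plain-Python illustration (no PyLucene required) of how the keywords
# mapping above is folded into the query string before parsing; the field
# names and values are made up for this example.
keywords = {u'author': u'alice', u'subject': u'report'}
fieldPhrase = u' '.join(u':'.join((k, v)) for (k, v) in keywords.iteritems())
# fieldPhrase is now u'author:alice subject:report' (dict order may vary),
# which is appended to the free-text phrase and parsed with AND semantics.
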
def update(self, indexer, iterable):
    """Update an index from a queryset."""
    self._open_index()
    for o in ifilter(indexer.should_index, iterable):
        # Clear a potential old object out of the index
        self.remove(o)

        # Create a new document to index.
        doc = PyLucene.Document()

        # Index the model identifier so we can easily deal with only models of a certain type
        doc.add(PyLucene.Field(MODEL_FIELD, str(o._meta), STORE_YES, UN_TOKENIZED))

        # Index the "identifier" (app_label.module_name.pk) for this object
        doc.add(PyLucene.Field(IDENTIFIER_FIELD, self.get_identifier(o), STORE_YES, INDEX_NO))

        # Index the default content for the object.
        # Don't actually store the complete contents; just index them.
        doc.add(PyLucene.Field(CONTENTS_FIELD, indexer.flatten(o), STORE_NO, TOKENIZED))

        # Index each field that needs to be individually searchable.
        for (name, value) in indexer.get_field_values(o).items():
            doc.add(PyLucene.Field(name, value, STORE_NO, TOKENIZED))

        self._index.addDocument(doc)
    self._close_index()

def _create_query_for_field(self, field, value, analyzer=None):
    """generate a field query

    this function creates a field->value query

    :param field: The fieldname to be used
    :type field: str
    :param value: The wanted value of the field
    :type value: str
    :param analyzer: The analyzer to be used
                     Possible analyzers are:
                     - :attr:`CommonDatabase.ANALYZER_TOKENIZE`
                       the field value is split to be matched word-wise
                     - :attr:`CommonDatabase.ANALYZER_PARTIAL`
                       the field value must start with the query string
                     - :attr:`CommonDatabase.ANALYZER_EXACT`
                       keep special characters and the like
    :type analyzer: int
    :return: resulting query object
    :rtype: PyLucene.Query
    """
    if analyzer is None:
        analyzer = self.analyzer
    if analyzer == self.ANALYZER_EXACT:
        analyzer_obj = PyLucene.KeywordAnalyzer()
    else:
        value = self._escape_term_value(value)
        analyzer_obj = PyLucene.StandardAnalyzer()
    qp = PyLucene.QueryParser(field, analyzer_obj)
    if (analyzer & self.ANALYZER_PARTIAL) > 0:
        # PyLucene uses explicit wildcards for partial matching
        value += "*"
    return qp.parse(value)

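# Rough usage sketch (hypothetical instance `db` and field/value pairs): the
# analyzer flag decides how the value ends up in the generated query.
#
#   db._create_query_for_field("msgid", "hello", db.ANALYZER_EXACT)
#       -> msgid:hello       (KeywordAnalyzer, value left untouched)
#   db._create_query_for_field("msgid", "hel", db.ANALYZER_PARTIAL)
#       -> msgid:hel*        (escaped value plus trailing wildcard)
#   db._create_query_for_field("msgid", "hello world", db.ANALYZER_TOKENIZE)
#       -> msgid:hello msgid:world   (StandardAnalyzer, word-wise matching)
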
def indexDocs(self, root, writer):
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            #if not filename.endswith('.txt'):
            #    continue
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'iso-8859-1')
                file.close()
                doc = PyLucene.Document()
                doc.add(PyLucene.Field("name", filename,
                                       PyLucene.Field.Store.YES,
                                       PyLucene.Field.Index.UN_TOKENIZED))
                doc.add(PyLucene.Field("path", path,
                                       PyLucene.Field.Store.YES,
                                       PyLucene.Field.Index.UN_TOKENIZED))
                if len(contents) > 0:
                    doc.add(PyLucene.Field("contents", contents,
                                           PyLucene.Field.Store.YES,
                                           PyLucene.Field.Index.TOKENIZED))
                else:
                    print "warning: no content in %s" % filename
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e

def search(self, q, models=None, order_by=RELEVANCE, limit=None, offset=None):
    """Perform a search."""
    original_query = q
    q = query.convert(original_query, LuceneQueryConverter)

    if models:
        models_queries = []
        for m in models:
            if hasattr(m, "_meta"):
                models_queries.append('%s:"%s"' % (MODEL_FIELD, m._meta))
            else:
                models_queries.append('%s:"%s"' % (MODEL_FIELD, m))
        q += ' AND (%s)' % (' '.join(models_queries))

    searcher = PyLucene.IndexSearcher(settings.SEARCH_INDEX_PATH)
    analyzer = PorterStemmerAnalyzer()
    compiled_query = PyLucene.QueryParser(CONTENTS_FIELD, analyzer).parse(q)

    if order_by is RELEVANCE:
        sort = PyLucene.Sort.RELEVANCE
    else:
        reversed = order_by.startswith('-')
        sort_field = PyLucene.SortField(order_by.lstrip('-'), reversed)
        sort = PyLucene.Sort(sort_field)

    hits = searcher.search(compiled_query, sort)
    return self._get_search_results(original_query, hits, limit, offset)

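# Plain-Python illustration of the model restriction built above; MODEL_FIELD
# and the model labels are stand-in values for this example.
MODEL_FIELD = 'model'
models_queries = ['%s:"%s"' % (MODEL_FIELD, m) for m in ('blog.entry', 'blog.comment')]
q = 'django AND (%s)' % (' '.join(models_queries))
# q == 'django AND (model:"blog.entry" model:"blog.comment")'
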
def tokenStream(self, fieldName, reader):
    result = PyLucene.StandardTokenizer(reader)
    result = PyLucene.StandardFilter(result)
    result = PyLucene.LowerCaseFilter(result)
    result = PyLucene.PorterStemFilter(result)
    result = PyLucene.StopFilter(result, PyLucene.StopAnalyzer.ENGLISH_STOP_WORDS)
    return result

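# tokenStream() above is only useful as a method of a custom analyzer class
# (PorterStemmerAnalyzer is referenced by the search() and create_index()
# snippets in this collection).  A minimal sketch, assuming the jcc-based
# PythonAnalyzer extension point; older gcj builds accepted a plain object
# exposing a tokenStream() method instead.
class PorterStemmerAnalyzer(PyLucene.PythonAnalyzer):

    def tokenStream(self, fieldName, reader):
        # lower-case, stem and stop-filter the standard token stream
        result = PyLucene.StandardTokenizer(reader)
        result = PyLucene.StandardFilter(result)
        result = PyLucene.LowerCaseFilter(result)
        result = PyLucene.PorterStemFilter(result)
        result = PyLucene.StopFilter(result, PyLucene.StopAnalyzer.ENGLISH_STOP_WORDS)
        return result
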
def index_node(self, iba_node):
    self.delete_node(iba_node.nid)
    create = len(os.listdir('index')) == 0
    analyzer = PyLucene.StandardAnalyzer()
    writer = PyLucene.IndexWriter("index", analyzer, create)
    writer.addDocument(self._document_node(iba_node))
    writer.close()
    self.count = self.count + 1

def search_node_by_name2(self, name):
    if self.searcher is None:
        self.searcher = PyLucene.IndexSearcher("index")
    query = PyLucene.QueryParser(COLUMN_NAME, PyLucene.StandardAnalyzer()).parse(name)
    hits = self.searcher.search(query)
    result = self.hits_to_list(hits)
    return result

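# hits_to_list() is not part of this excerpt.  A minimal sketch, assuming the
# classic Lucene 2.x Hits API (hits.length() / hits.doc(i)) that the
# surrounding snippets target:
def hits_to_list(self, hits):
    """Collect the matched documents into a plain Python list."""
    result = []
    for i in range(hits.length()):
        result.append(hits.doc(i))
    return result
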
def __init__(self, basedir, analyzer=None, create_allowed=True):
    """Initialize or open an indexing database.

    Any derived class must override __init__.

    :raise ValueError: The given location exists, but the database type
                       is incompatible (e.g. created by a different indexing engine)
    :raise OSError: the database failed to initialize

    :param basedir: The parent directory of the database
    :type basedir: str
    :param analyzer: Bitwise combination of possible analyzer flags
                     to be used as the default analyzer for this database.
                     Leave it empty to use the system default analyzer
                     (self.ANALYZER_DEFAULT).
                     See self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
    :type analyzer: int
    :param create_allowed: create the database, if necessary; default: True
    :type create_allowed: bool
    """
    jvm = PyLucene.getVMEnv()
    jvm.attachCurrentThread()
    super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer,
                                           create_allowed=create_allowed)
    self.pyl_analyzer = PyLucene.StandardAnalyzer()
    self.writer = None
    self.reader = None
    self.index_version = None
    try:
        # try to open an existing database
        tempreader = PyLucene.IndexReader.open(self.location)
        tempreader.close()
    except PyLucene.JavaError, err_msg:
        # Write an error out, in case this is a real problem instead of
        # an absence of an index
        # TODO: turn the following two lines into debug output
        #errorstr = str(e).strip() + "\n" + self.errorhandler.traceback_str()
        #DEBUG_FOO("could not open index, so going to create: " + errorstr)
        # Create the index, so we can open cached readers on it
        if not create_allowed:
            raise OSError("Indexer: skipping database creation")
        try:
            # create the parent directory if it does not exist
            parent_path = os.path.dirname(self.location)
            if not os.path.isdir(parent_path):
                # recursively create all directories up to parent_path
                os.makedirs(parent_path)
        except IOError, err_msg:
            raise OSError("Indexer: failed to create the parent "
                          + "directory (%s) of the indexing database: %s"
                          % (parent_path, err_msg))

def _create_empty_document(self):
    """create an empty document to be filled and added to the index later

    :return: the new document object
    :rtype: PyLucene.Document
    """
    return PyLucene.Document()

def clear(self, models):
    """Clear all documents for the given models from the index."""
    self._open_index()
    for model in models:
        term = PyLucene.Term(MODEL_FIELD, str(model._meta))
        self._index.deleteDocuments(term)
    self._close_index()

def _create_query_combined(self, queries, require_all=True):
    """generate a combined query

    @param queries: list of the original queries
    @type queries: list of PyLucene.Query
    @param require_all: boolean operator
        (True -> AND (default) / False -> OR)
    @type require_all: bool
    @return: the resulting combined query object
    @rtype: PyLucene.Query
    """
    combined_query = PyLucene.BooleanQuery()
    for query in queries:
        combined_query.add(
                PyLucene.BooleanClause(query, require_all, False))
    return combined_query

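# Minimal sketch of what _create_query_combined() builds, using the same
# legacy BooleanClause(query, required, prohibited) constructor the method
# relies on (later Lucene releases replaced it with BooleanClause.Occur).
# Field names and terms are made up for this example.
name_query = PyLucene.TermQuery(PyLucene.Term("name", "readme"))
path_query = PyLucene.TermQuery(PyLucene.Term("path", "docs"))
combined = PyLucene.BooleanQuery()
combined.add(PyLucene.BooleanClause(name_query, True, False))  # required
combined.add(PyLucene.BooleanClause(path_query, True, False))  # required
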
def _create_query_for_field(self, field, value, analyzer=None):
    """generate a field query

    this function creates a field->value query

    @param field: the fieldname to be used
    @type field: str
    @param value: the wanted value of the field
    @type value: str
    @param analyzer: the analyzer to be used
        possible analyzers are:
            - L{CommonDatabase.ANALYZER_TOKENIZE}
              the field value is split to be matched word-wise
            - L{CommonDatabase.ANALYZER_PARTIAL}
              the field value must start with the query string
            - L{CommonDatabase.ANALYZER_EXACT}
              keep special characters and the like
    @type analyzer: int
    @return: resulting query object
    @rtype: PyLucene.Query
    """
    if analyzer is None:
        analyzer = self.analyzer
    if analyzer == self.ANALYZER_EXACT:
        analyzer_obj = self.ExactAnalyzer()
    else:
        value = _escape_term_value(value)
        analyzer_obj = PyLucene.StandardAnalyzer()
    if (analyzer & self.ANALYZER_PARTIAL) > 0:
        # PyLucene uses explicit wildcards for partial matching
        value += "*"
    return PyLucene.QueryParser.parse(value, field, analyzer_obj)

def _index_refresh(self):
    """re-read the indexer database"""
    try:
        if self.reader is None or self.searcher is None:
            self.reader = PyLucene.IndexReader.open(self.location)
            self.searcher = PyLucene.IndexSearcher(self.reader)
        elif self.index_version != self.reader.getCurrentVersion(self.location):
            self.searcher.close()
            self.reader.close()
            self.reader = PyLucene.IndexReader.open(self.location)
            self.searcher = PyLucene.IndexSearcher(self.reader)
            self.index_version = self.reader.getCurrentVersion(self.location)
    except PyLucene.JavaError, e:
        # TODO: add some debugging output?
        #self.errorhandler.logerror("Error attempting to read index - try reindexing: "+str(e))
        pass

def _create_query_for_string(self, text, require_all=True, analyzer=None):
    """generate a query for a plain term of a string query

    basically this function parses the string and returns the resulting query

    :param text: The query string
    :type text: str
    :param require_all: boolean operator
                        (True -> AND (default) / False -> OR)
    :type require_all: bool
    :param analyzer: The analyzer to be used
                     Possible analyzers are:
                     - :attr:`CommonDatabase.ANALYZER_TOKENIZE`
                       the field value is split to be matched word-wise
                     - :attr:`CommonDatabase.ANALYZER_PARTIAL`
                       the field value must start with the query string
                     - :attr:`CommonDatabase.ANALYZER_EXACT`
                       keep special characters and the like
    :type analyzer: int
    :return: resulting query object
    :rtype: PyLucene.Query
    """
    if analyzer is None:
        analyzer = self.analyzer
    if analyzer == self.ANALYZER_EXACT:
        # exact matching - no substitution ...
        # for PyLucene: nothing special is necessary
        pass
    # don't care about special characters ...
    if analyzer == self.ANALYZER_EXACT:
        analyzer_obj = self.ExactAnalyzer()
    else:
        text = _escape_term_value(text)
        analyzer_obj = PyLucene.StandardAnalyzer()
    qp = PyLucene.QueryParser(analyzer=analyzer_obj)
    if require_all:
        qp.setDefaultOperator(qp.Operator.AND)
    else:
        qp.setDefaultOperator(qp.Operator.OR)
    if (analyzer & self.ANALYZER_PARTIAL) > 0:
        # PyLucene uses explicit wildcards for partial matching
        text += "*"
    return qp.parse(text)

def get_writer(self, create=False):
    writer = None
    while writer is None:
        try:
            writer = lucene.IndexWriter(self.store, self.analyzer, create)
            writer.setMaxFieldLength(1048576)
        except Exception, e:
            print e
            time.sleep(.1)
    return writer

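# Usage sketch (hypothetical names): get_writer() retries until the index
# lock is free and returns the open writer, so callers are expected to close
# it themselves; `indexer` stands for an instance of the class above and
# `doc` for a prepared lucene.Document.
writer = indexer.get_writer()
try:
    writer.addDocument(doc)
finally:
    writer.close()
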
def delete_existing_feed_docs(self, feed):
    """ deletes existing documents relating to the given feed """
    reader = lucene.IndexReader.open(self.feed_modifier.store)
    numDeleted = reader.deleteDocuments(lucene.Term('url', feed.xml_url))
    logging.info('deleted %d existing index documents' % numDeleted)
    reader.close()

    reader = lucene.IndexReader.open(self.entry_modifier.store)
    for entry in feed.get_entries():
        try:
            id = '%s:%s' % (feed.xml_url, entry.get('id', None))
            numDeleted = reader.deleteDocuments(lucene.Term('id', id))
            if numDeleted:
                logging.info('deleted %d feed entry documents' % numDeleted)
        except:
            pass
    reader.close()

def unindex_record(self, record):
    """
    Unindex documents matching this entry's uid.  *Should* only be one,
    but could be many, if somehow the same entry got indexed multiple
    times.
    """
    reader = self.context.get_search_index_reader()
    term = PyLucene.Term('uid', str(record.uid))
    reader.deleteDocuments(term)
    reader.close()

def search(self, query_string='', require_visible=True, allow_curated=True):
    hits = []
    query_string = str(query_string)
    self.logger.info('Performing search: %s' % query_string)
    disassembled_query = disassemble_user_query(query_string)
    self.logger.debug('Disassembled query: %s' % str(disassembled_query))
    reassembled_query = '+(%s)' % reassemble_user_query(disassembled_query)
    self.logger.debug('Reassembled query: %s', reassembled_query)

    if not allow_curated:
        reassembled_query += \
            ' -record-status:%s' % canary.loader.QueuedRecord.STATUS_CURATED

    if require_visible:
        reassembled_query += ' +article-type:[%s TO %s]' % \
            (Study.ARTICLE_TYPES['traditional'], Study.ARTICLE_TYPES['curated'])
        reassembled_query += ' +record-status:%s' % \
            canary.loader.QueuedRecord.STATUS_CURATED

    try:
        searcher = PyLucene.IndexSearcher(
            PyLucene.FSDirectory.getDirectory(
                self.context.config.search_index_dir, False))
        analyzer = PyLucene.StandardAnalyzer()
        query_parser = PyLucene.QueryParser('all', analyzer)
        query_parser.setOperator(PyLucene.QueryParser.DEFAULT_OPERATOR_AND)
        query = query_parser.parseQuery(reassembled_query)
        self.logger.info('Search query: %s', query)
        hits = searcher.search(query)
        return hits, searcher
    except Exception, e:
        self.logger.error('Search failed: %s', e)
        #self.logger.error(traceback.format_stack())
        if hits and searcher:
            return hits, searcher
        else:
            return [], None

def __del__(self):
    """remove lock and close writer after losing the last reference"""
    jvm = PyLucene.getVMEnv()
    jvm.attachCurrentThread()
    self._writer_close()
    if hasattr(self, "reader") and self.reader is not None:
        self.reader.close()
        self.reader = None
    if hasattr(self, "searcher") and self.searcher is not None:
        self.searcher.close()
        self.searcher = None

def search(self, query, fields=FEED_ENTRY_FIELDS, analyzer=None, store=None):
    if not query or len(query.strip()) == 0 or len(fields) == 0:
        return None
    analyzer = analyzer or self.analyzer
    if store is None:
        store = self.entry_modifier.store
    if len(fields) > 1:
        qp = lucene.MultiFieldQueryParser(fields, analyzer)
    else:
        qp = lucene.QueryParser(fields[0], analyzer)
    q = qp.parse(query)
    searcher = lucene.IndexSearcher(store)
    hits = searcher.search(q, lucene.Sort.RELEVANCE)
    return HitHolder(hits, searcher)

def openReadIndex(self):
    luceneDir = self.store.newDirectory(self.indexDirectory)
    if not luceneDir.exists():
        self.openWriteIndex().close()
    fsdir = PyLucene.FSDirectory.getDirectory(luceneDir.path, False)
    try:
        searcher = PyLucene.IndexSearcher(fsdir)
    except PyLucene.JavaError, e:
        raise IndexCorrupt()

def begin_transaction(self):
    """PyLucene does not support transactions

    Thus this function just opens the database for write access.
    Call "cancel_transaction" or "commit_transaction" to close write
    access in order to remove the exclusive lock from the database
    directory.
    """
    jvm = PyLucene.getVMEnv()
    jvm.attachCurrentThread()
    self._writer_open()

def search_node_by_attribute2(self, att_type, att_value):
    if self.searcher is None:
        self.searcher = PyLucene.IndexSearcher("index")
    analyzer = PyLucene.StandardAnalyzer()
    if att_type != "" and att_value == "":
        parser = PyLucene.QueryParser(COLUMN_ATTRIBUTE_TYPE_NID, analyzer)
        query = parser.parse(att_type)
    elif att_type == "" and att_value != "":
        parser = PyLucene.QueryParser(COLUMN_ATTRIBUTE_VALUE, analyzer)
        query = parser.parse(att_value)
    elif att_type != "" and att_value != "":
        parser = PyLucene.QueryParser(COLUMN_ATTRIBUTE_VALUE, analyzer)
        query = parser.parse(COLUMN_ATTRIBUTE_TYPE_NID + ":" + att_type
                             + " AND " + att_value)
    hits = self.searcher.search(query)
    result = self.hits_to_list(hits)
    return result

def __init__(self, store_dir=STORE_DIR, destroy=False, analyzer=None):
    self.store_dir = store_dir
    self.analyzer = analyzer or lucene.StandardAnalyzer()
    self.feed_modifier = IndexModifier(store_dir=os.path.join(store_dir, 'feeds'),
                                       destroy=destroy, analyzer=analyzer)
    self.entry_modifier = IndexModifier(store_dir=os.path.join(store_dir, 'entries'),
                                        destroy=destroy, analyzer=analyzer)

def _create_query_for_string(self, text, require_all=True, analyzer=None):
    """generate a query for a plain term of a string query

    basically this function parses the string and returns the resulting query

    @param text: the query string
    @type text: str
    @param require_all: boolean operator
        (True -> AND (default) / False -> OR)
    @type require_all: bool
    @param analyzer: the analyzer to be used
        possible analyzers are:
            - L{CommonDatabase.ANALYZER_TOKENIZE}
              the field value is split to be matched word-wise
            - L{CommonDatabase.ANALYZER_PARTIAL}
              the field value must start with the query string
            - L{CommonDatabase.ANALYZER_EXACT}
              keep special characters and the like
    @type analyzer: int
    @return: resulting query object
    @rtype: PyLucene.Query
    """
    if analyzer is None:
        analyzer = self.analyzer
    if analyzer == self.ANALYZER_EXACT:
        analyzer_obj = PyLucene.KeywordAnalyzer()
    else:
        text = _escape_term_value(text)
        analyzer_obj = PyLucene.StandardAnalyzer()
    qp = PyLucene.QueryParser(UNNAMED_FIELD_NAME, analyzer_obj)
    if (analyzer & self.ANALYZER_PARTIAL) > 0:
        # PyLucene uses explicit wildcards for partial matching
        text += "*"
    if require_all:
        qp.setDefaultOperator(qp.Operator.AND)
    else:
        qp.setDefaultOperator(qp.Operator.OR)
    return qp.parse(text)

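# Minimal illustration of the bitwise analyzer flags used above; the numeric
# values here are hypothetical, not the real CommonDatabase constants.
ANALYZER_EXACT = 0
ANALYZER_TOKENIZE = 1 << 1
ANALYZER_PARTIAL = 1 << 2

flags = ANALYZER_TOKENIZE | ANALYZER_PARTIAL
assert (flags & ANALYZER_PARTIAL) > 0   # partial matching requested
assert flags != ANALYZER_EXACT          # not an exact-match query
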
def create_index(self, arg):
    """ Post download setup callback for creating a lucene index """

    moreinfo("Creating lucene index")
    storeDir = "index"
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    store = PyLucene.FSDirectory.getDirectory(storeDir, True)
    self.lucene_writer = PyLucene.IndexWriter(store, PyLucene.StandardAnalyzer(), True)
    # Uncomment this line to enable a PorterStemmer analyzer
    # self.lucene_writer = PyLucene.IndexWriter(store, PorterStemmerAnalyzer(), True)
    self.lucene_writer.setMaxFieldLength(1048576)

    count = 0
    urllist = []

    for urlobj in self._urldict.values():
        filename = urlobj.get_full_filename()
        url = urlobj.get_full_url()
        try:
            urllist.index(url)
            continue
        except ValueError:
            urllist.append(url)

        if not filename in self._downloaddict['_savedfiles']:
            continue

        data = ''
        moreinfo('Adding index for URL', url)
        if os.path.isfile(filename):
            try:
                data = unicode(open(filename).read(), 'iso-8859-1')
            except UnicodeDecodeError, e:
                data = ''

        doc = PyLucene.Document()
        doc.add(PyLucene.Field("name", filename,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
        doc.add(PyLucene.Field("path", url,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
        if data and len(data) > 0:
            doc.add(PyLucene.Field("contents", data,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.TOKENIZED))
        else:
            extrainfo("warning: no content in %s" % filename)

        self.lucene_writer.addDocument(doc)
        count += 1

def delete_node(self, nid):
    try:
        index_present = len(os.listdir('index')) > 0
        if index_present:
            reader = PyLucene.IndexReader.open("index")
            term = PyLucene.Term(COLUMN_NID, nid)
            if reader.termDocs(term) != None:
                reader.deleteDocuments(term)
            reader.close()
    except:
        IBASGlobal.print_message(
            "Error while deleting document from Lucene with nid " + str(nid), 0)

def _writer_open(self):
    """open write access for the indexing database and acquire an
    exclusive lock
    """
    if not self._writer_is_open():
        self._delete_stale_lock()
        self.writer = PyLucene.IndexWriter(self.location, self.pyl_analyzer, False)
        # "setMaxFieldLength" is available since PyLucene v2
        # we must stay compatible to v1 for the derived class
        # (PyLuceneIndexer1) - thus we make this step optional
        if hasattr(self.writer, "setMaxFieldLength"):
            self.writer.setMaxFieldLength(MAX_FIELD_SIZE)

def _add_plain_term(self, document, term, tokenize=True):
    """add a term to a document

    :param document: the document to be changed
    :type document: xapian.Document | PyLucene.Document
    :param term: a single term to be added
    :type term: str
    :param tokenize: should the term be tokenized automatically
    :type tokenize: bool
    """
    # Field parameters: name, string, store, index, token
    document.add(PyLucene.Field(str(PyLuceneIndex.UNNAMED_FIELD_NAME),
                                term, True, True, tokenize))

def make_query(self, *args, **kwargs):
    jvm = PyLucene.getVMEnv()
    jvm.attachCurrentThread()
    return super(PyLuceneDatabase, self).make_query(*args, **kwargs)

import re
import os
import time
import logging

# try to import the PyLucene package (with the two possible names)
# remember the type of the detected package (compiled with jcc (>=v2.3) or
# with gcj (<=v2.2))
try:
    import PyLucene
    _COMPILER = 'gcj'
except ImportError:
    # if this fails, then there is no pylucene installed
    import lucene
    PyLucene = lucene
    PyLucene.initVM(PyLucene.CLASSPATH)
    _COMPILER = 'jcc'

import CommonIndexer

UNNAMED_FIELD_NAME = "FieldWithoutAName"
MAX_FIELD_SIZE = 1048576


def is_available():
    return _get_pylucene_version() == 2


class PyLuceneDatabase(CommonIndexer.CommonDatabase):
    """manage and use a pylucene indexing database"""

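# is_available() relies on a version helper that is not part of this excerpt.
# A minimal sketch, assuming the imported package exposes its release string
# as PyLucene.VERSION (e.g. "2.3.1"):
def _get_pylucene_version():
    """Return the major version number of the detected PyLucene package."""
    try:
        return int(PyLucene.VERSION.split('.')[0])
    except (AttributeError, ValueError):
        return 0
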
def __init__(self, basedir, analyzer=None, create_allowed=True):
    """Initialize or open an indexing database.

    Any derived class must override __init__.

    :raise ValueError: The given location exists, but the database type
                       is incompatible (e.g. created by a different indexing engine)
    :raise OSError: the database failed to initialize

    :param basedir: The parent directory of the database
    :type basedir: str
    :param analyzer: Bitwise combination of possible analyzer flags
                     to be used as the default analyzer for this database.
                     Leave it empty to use the system default analyzer
                     (self.ANALYZER_DEFAULT).
                     See self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
    :type analyzer: int
    :param create_allowed: create the database, if necessary; default: True
    :type create_allowed: bool
    """
    jvm = PyLucene.getVMEnv()
    jvm.attachCurrentThread()
    super(PyLuceneDatabase, self).__init__(
        basedir, analyzer=analyzer, create_allowed=create_allowed)
    self.pyl_analyzer = PyLucene.StandardAnalyzer()
    self.writer = None
    self.reader = None
    self.index_version = None
    try:
        # try to open an existing database
        tempreader = PyLucene.IndexReader.open(self.location)
        tempreader.close()
    except PyLucene.JavaError as err_msg:
        # Write an error out, in case this is a real problem instead of
        # an absence of an index
        # TODO: turn the following two lines into debug output
        #errorstr = str(e).strip() + "\n" + self.errorhandler.traceback_str()
        #DEBUG_FOO("could not open index, so going to create: " + errorstr)
        # Create the index, so we can open cached readers on it
        if not create_allowed:
            raise OSError("Indexer: skipping database creation")
        try:
            # create the parent directory if it does not exist
            parent_path = os.path.dirname(self.location)
            if not os.path.isdir(parent_path):
                # recursively create all directories up to parent_path
                os.makedirs(parent_path)
        except IOError as err_msg:
            raise OSError("Indexer: failed to create the parent "
                          "directory (%s) of the indexing database: %s"
                          % (parent_path, err_msg))
        try:
            tempwriter = PyLucene.IndexWriter(
                self.location, self.pyl_analyzer, True)
            tempwriter.close()
        except PyLucene.JavaError as err_msg:
            raise OSError("Indexer: failed to open or create a Lucene"
                          " database (%s): %s" % (self.location, err_msg))
    # the indexer is initialized - now we prepare the searcher
    # windows file locking seems inconsistent, so we try 10 times
    numtries = 0
    #self.dir_lock.acquire(blocking=True)
    # read "self.reader", "self.indexVersion" and "self.searcher"
    try:
        while numtries < 10:
            try:
                self.reader = PyLucene.IndexReader.open(self.location)
                self.indexVersion = self.reader.getCurrentVersion(self.location)
                self.searcher = PyLucene.IndexSearcher(self.reader)
                break
            except PyLucene.JavaError as e:
                # store error message for possible later re-raise (below)
                lock_error_msg = e
                time.sleep(0.01)
                numtries += 1
        else:
            # locking failed for 10 times
            raise OSError("Indexer: failed to lock index database"
                          " (%s)" % lock_error_msg)
    finally:
        pass  # self.dir_lock.release()
    # initialize the searcher and the reader
    self._index_refresh()
