def index(fileName, limit):
    startTiming()
    # create a new index in a directory
    indexer = IndexWriter(g_indexPath, True)
    # supposed to speed up indexing by avoiding disk i/o; this is how many
    # documents to index in memory before flushing to disk
    indexer.mergeFactor = 1100
    count = 0
    failedCount = 0
    for (title, ns, txt) in iterGetArticle(fileName):
        if ns != 0:
            continue
        try:
            # iso-8859-1 is the default encoding used in Wikipedia
            txt = txt.decode("iso-8859-1")
            title = title.decode("iso-8859-1")
            indexOneArticle(indexer, title, txt)
            count += 1
            if count >= limit:
                break
            if count % 500 == 0:
                print "indexed %d articles" % count
        except:
            # for now just ignore possible decoding errors
            if g_fVerbose:
                sys.stdout.write('failed to index %s\n' % title)
            failedCount += 1
    # optimize() merges all the segments in the index directory into one.
    # It can be quite an expensive operation, but it can save space and
    # speed up searches.
    indexer.optimize()
    indexer.close()
    endTiming()
    print "indexed %d articles" % count
    print "failed to index %d articles" % failedCount
    dumpTiming()

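# indexOneArticle() is called above but not defined in this snippet. Below is
# a minimal sketch of what such a helper might look like, reusing the
# Document/Keyword/Text pattern from the other Lupy examples in this section.
# The field names ('title', 'text') and storage choices are assumptions, not
# the original code.
def indexOneArticle(indexer, title, txt):
    # 'document' is the same lupy document module used by the Indexer and
    # Index classes below (imports are not shown in these snippets)
    d = document.Document()
    # store the title so it is returned with search hits
    d.add(document.Keyword('title', title))
    # tokenize the article body, but do not store the full text in the index
    d.add(document.Text('text', txt, False))
    indexer.addDocument(d)
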
def commit_documents(self):
    # for lupy, put everything in a single index because lupy
    # doesn't support multiple index searching
    writer = IndexWriter(self._lupy_index_dir, False)
    while len(self._lupy_queue) > 0:
        qd = self._lupy_queue.pop(0)
        fields = qd[0]
        existing = qd[1]
        # Add a document with the given fields to the index. If it's a
        # document that already exists, delete it first, using the url
        # as the key.
        if existing:
            # commit anything that's already been added by closing the writer
            writer.close()
            _delete_document(self._get_field_value(fields, 'url'))
            # reopen writer
            writer = IndexWriter(self._lupy_index_dir, False)
        # create a lupy document consisting of the fields
        d = Document()
        for x in fields:
            # converting to unicode keeps lupy happy, but then when we get
            # the previews back we get an error when printing, so let's just
            # punt and remove all bad characters
            # value = self._convert_text_from_iso8559_to_unicode(str(x[1]))
            value = self._remove_bad_characters(x[1])
            f = Field(str(x[0]), value, x[2], x[3], x[4])
            d.add(f)
        try:
            writer.addDocument(d)
        except:
            print "Lupy: could not add document: %s" % (self._get_field_value(fields, 'text'))
    writer.close()

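# _delete_document() is called above but not shown. Here is a hedged sketch of
# how it might work, following the deleteAll() pattern from the Index class
# later in this section: open a reader, delete by term, close to commit. The
# explicit index_dir parameter is an assumption (the real helper presumably
# already knows the index directory), and Term / indexsearcher are assumed to
# be imported the same way the Index class uses them.
def _delete_document(url, index_dir):
    reader = indexsearcher.open(index_dir)
    # remove every document whose 'url' keyword field matches
    reader.deleteTerm(Term('url', url))
    # closing the reader commits the delete
    reader.close()
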
class Indexer:

    def __init__(self, path, create=False):
        """Create an indexer, writing an index to the directory B{path}.
        The boolean flag B{create} determines whether the index is
        created (overwriting an existing index) or updated."""
        self.indexer = IndexWriter(path, create)

    def addDoc(self, fname):
        """Add a document to the index."""
        # create document
        d = document.Document()
        # add a field containing the path to this file
        f = document.Keyword('filename', fname)
        d.add(f)
        # I happen to know that the title is separated from the story
        # by '\n\n\n', so I can easily get the title, which we store
        # in the title field
        fp = open(fname, 'rb')
        s = fp.read().decode("latin-1")
        title = s.split('\n\n\n')[0]
        f = document.Text('title', title)
        d.add(f)
        # Here I pass False as the 3rd arg to ensure that the actual
        # text of s is not stored in the index.
        f = document.Text('text', s, False)
        d.add(f)
        # Adding the text of an open file (fp) is more typical; this is
        # how you would usually add a file to an index:
        # f = field.Text('text', fp)
        # d.add(f)
        fp.close()
        # add doc to index
        print 'indexing', fname
        self.indexer.addDocument(d)

    def index(self, dir):
        """Index the files in B{dir} (subdirectories and links are skipped).

        Call optimize() before closing to merge all of the segments
        created by indexing. This is an optional step and can be
        expensive for large indexes.
        """
        for name in os.listdir(dir):
            f = os.path.join(dir, name)
            if os.path.isdir(f) or os.path.islink(f):
                continue
            self.addDoc(f)
        # Uncomment the following line to optimize the index.
        # Have a look in the index dir before you optimize: you will
        # probably see dozens of files from several segments.
        # optimize() merges all the segments into one. It can be quite
        # an expensive operation, but it can save space and speed up
        # searches.
        # self.indexer.optimize()
        self.indexer.close()

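# A quick usage sketch for the Indexer class above. The paths are made up;
# the input directory is assumed to contain plain-text files following the
# title-'\n\n\n'-body convention mentioned in addDoc().
indexer = Indexer('/tmp/story-index', create=True)
indexer.index('/tmp/stories')
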
class Index:

    def __init__(self, name, create=False, analyzer=None):
        """
        @param name: Name of the directory for this index.
        @param create: Whether to create this directory or not.
        @type create: boolean
        """
        self.name = name
        self.analyzer = analyzer or standardTokenizer
        # Create the index if we need to. From here on we assume
        # that the index exists.
        self.indexer = IndexWriter(self.name, create, analyzer)
        # Remember the default merge factor
        self.mergeFactor = self.indexer.mergeFactor
        # Clean up
        self.indexer.close()
        self.indexer = self.searcher = None

    def index(self, **kw):
        """Add a document to the index.

        **kw contains the name and values of each Field in the
        Document that we are creating.

        If the key in **kw starts with '_' the field will be created
        as a Keyword. If it starts with '__', it is created as a
        stored Text field (i.e. tokenized and stored); otherwise it
        will be created as a Text field. The leading '_' are removed
        before field creation.

        Text fields will have their value tokenized before indexing.
        The value is not stored in the index. This is the usual type
        of field that you need for plain text.

        Keyword fields will not have their value tokenized. The value
        is stored in the index and is returned with search hits on
        the Document. If you wanted to store the path to a document
        along with each document, you would use a Keyword field. The
        path would not be tokenized and its value would be returned
        in the query results, so you could easily open and display
        the file.
        """
        self._setupIndexer()
        # create document
        d = document.Document()
        # TODO - Please find another way of defining fields
        # than magic field names!!!
        for key, value in kw.items():
            if key[:2] == '__':
                # Tokenized and stored
                key = key[2:]
                f = document.Text(key, value, True)
            elif key[0] == '_':
                # Not tokenized and stored (keyword)
                key = key[1:]
                f = document.Keyword(key, value)
            else:
                # Tokenized and not stored
                f = document.Text(key, value, False)
            d.add(f)
        self.indexer.addDocument(d)

    def _setupIndexer(self):
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None
        if self.indexer is None:
            self.indexer = IndexWriter(self.name, False, self.analyzer)
            self.indexer.mergeFactor = self.mergeFactor

    def _setupSearcher(self):
        if self.indexer is not None:
            self.indexer.close()
            self.indexer = None
        if self.searcher is None:
            self.searcher = indexsearcher.IndexSearcher(self.name)

    def delete(self, **kw):
        """Delete the first document containing the specified term.
        See also L{deleteAll}."""
        # Not very efficient for bulk deletes; use deleteAll for those.
        self._setupSearcher()
        if len(kw) != 1:
            raise RuntimeError, 'one and only one field for the moment'
        field, value = kw.items()[0]
        t = Term(field, value)
        self.searcher.reader.deleteTerm(t)

    def deleteAll(self, **kw):
        "Remove all documents containing this field and value."
        self.close()
        reader = indexsearcher.open(self.name)
        if len(kw) != 1:
            raise RuntimeError, 'one and only one field for the moment'
        field, values = kw.items()[0]
        for value in values:
            t = Term(field, value)
            reader.deleteTerm(t)
        # commit the deletes
        reader.close()

    def close(self):
        # Indexers and Searchers are different, and we have to open the
        # right kind for the operation we are performing. The actual
        # creation is done in the index and find methods. Here we close
        # whatever is open.
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None
        if self.indexer is not None:
            self.indexer.close()
            self.indexer = None

    def flush(self):
        """Flush outstanding indexes to disk.

        This makes sure we are searching the latest stuff.
        """
        if self.indexer is not None:
            self.indexer.flushRamSegments()

    def optimize(self):
        """Merge all on-disk segments into a single segment.
        Saves space and can speed up queries."""
        self._setupIndexer()
        self.indexer.optimize()

    def parse(self, field, qString):
        if qString.startswith('"'):
            qString = qString.strip('"')
            # qWords = qString.strip('"').split()
            qWords = self._tokenize(qString)
            return self.phraseSearch(field, qWords)
        else:
            qWords = self._tokenize(qString)
            if len(qWords) == 1:
                return self.termSearch(field, qWords[0])
            else:
                return self.boolSearch(field, qWords)

    def _tokenize(self, qString):
        return list(self.analyzer(qString))

    def find(self, qStr):
        """Perform a search in any field in this index.

        If the search string is enclosed in double quotes, a phrase
        search will be run; otherwise, the search will be for
        documents containing all words specified."""
        self._setupSearcher()
        fields = self.searcher.fieldNames()
        if not fields:
            return []
        all = [self.parse(field, qStr) for field in fields]
        if len(all) == 1:
            # simple case
            return self.searcher.search(all[0])
        q = BooleanQuery()
        for query in all:
            # OR all of the field queries
            q.add(query, False, False)
        hits = self.searcher.search(q)
        return hits

    def findInField(self, **kw):
        """Search only in a single field."""
        # e.g. index.findInField(text='flute')
        if len(kw) != 1:
            raise RuntimeError, 'one and only one field for the moment'
        self._setupSearcher()
        field, query = kw.items()[0]
        q = self.parse(field, query)
        hits = self.searcher.search(q)
        return hits

    def termSearch(self, field, term):
        "Search for a single C{term} in a C{field}."
        t = Term(field, term)
        q = TermQuery(t)
        return q

    def phraseSearch(self, field, words):
        "Search for a phrase (given as a list of words) in C{field}."
        q = PhraseQuery()
        for word in words:
            t = Term(field, word)
            q.add(t)
        return q

    def boolSearch(self, field, ands=[], ors=[], nots=[]):
        """Build a simple boolean query.

        Each word in C{ands} is equivalent to +word.
        Each word in C{ors} is equivalent to word.
        Each word in C{nots} is equivalent to -word.

        E.g. C{boolSearch(['spam'], ['eggs'], ['parrot', 'cheese'])}
        is equivalent to C{+spam eggs -parrot -cheese} in Google/Lucene
        syntax.
        """
        q = BooleanQuery()
        for a in ands:
            t = Term(field, a)
            tq = TermQuery(t)
            q.add(tq, True, False)
        for a in ors:
            t = Term(field, a)
            tq = TermQuery(t)
            q.add(tq, False, False)
        for a in nots:
            t = Term(field, a)
            tq = TermQuery(t)
            q.add(tq, False, True)
        return q

    def printHits(self, hits):
        if len(hits) == 0:
            print 'Nothing found!'
        else:
            for i in range(len(hits)):
                print hits.doc(i), hits.score(i)

    def setMergeFactor(self, anInt):
        """Set how many documents will be processed before the indexes
        will be merged. Never less than 2."""
        if anInt >= 2:
            self.mergeFactor = anInt

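# A quick usage sketch for the Index class, showing the '_'/'__' field-name
# convention described in index(). The directory name and field values are
# illustrative only.
idx = Index('/tmp/demo-index', create=True)
idx.index(_filename='/docs/omelette.txt',              # Keyword: stored, not tokenized
          __title='Parrot and cheese omelette',        # Text: tokenized and stored
          text='Take one parrot and some cheese ...')  # Text: tokenized, not stored
idx.flush()
hits = idx.find('parrot')
idx.printHits(hits)
idx.close()
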
def _optimize_index(self):
    index = IndexWriter(self._lupy_index_dir, False)
    index.optimize()
    index.close()
