def search_spam(post, ix):
    """
    Search the spam index for posts similar to the given one.

    The post is temporarily added to the spam index so Whoosh's
    ``more_like_this`` can compare its content against known spam,
    and it is removed again once results are collected.

    Returns a list of normalized results similar to this post's content
    (empty list when the post cannot be found in the index).
    """
    # Temporarily add this post to the spam index.
    writer = AsyncWriter(ix)
    add_post_to_index(post=post, writer=writer, is_spam=post.is_spam)
    writer.commit()

    # Search for this post in the spam index by uid.
    fields = ['uid']
    results = search.preform_whoosh_search(ix=ix, query=post.uid, fields=fields)

    try:
        # Perform more_like_this on this post's content.
        # Guard against the post not being found (empty results) —
        # indexing is asynchronous, so the uid lookup can miss.
        if results:
            similar_content = results[0].more_like_this('content', top=5)
            similar_content = list(map(search.normalize_result, similar_content))
        else:
            similar_content = []
    finally:
        # Always remove this post from the spam index afterwards and
        # close the searcher, even if more_like_this raised.
        writer = AsyncWriter(ix)
        writer.delete_by_term('uid', text=post.uid)
        writer.commit()
        results.searcher.close()

    return similar_content
def remove_post(post, ix=None):
    """Delete a post's document from the index, matching on its uid."""
    # Fall back to the default index when none was supplied.
    ix = ix if ix else init_index()

    # Drop the document whose 'uid' term matches this post, then commit.
    index_writer = AsyncWriter(ix)
    index_writer.delete_by_term('uid', text=post.uid)
    index_writer.commit()

    logger.debug(f"Removing uid={post.uid} from index")
    return
def build_index(whoosh_index_dir, file_path, hgweb_config_dir, dburi, **kwargs):
    """
    Build two search indexes simultaneously
    One is for repositories and the other for tools.

    Returns a tuple with number of repos and tools that were indexed.
    """
    # Bootstrap the toolshed model/session without touching the schema.
    model = ts_mapping.init(file_path, dburi, engine_options={}, create_tables=False)
    sa_session = model.context.current
    repo_index, tool_index = _get_or_create_index(whoosh_index_dir)

    # One async writer per index; both are committed at the end.
    repo_index_writer = AsyncWriter(repo_index)
    tool_index_writer = AsyncWriter(tool_index)
    repos_indexed = 0
    tools_indexed = 0

    execution_timer = ExecutionTimer()
    with repo_index.searcher() as searcher:
        for repo in get_repos(sa_session, file_path, hgweb_config_dir, **kwargs):
            # Tools are indexed separately; pull them off the repo dict
            # before it is written as a repo document.
            tools_list = repo.pop('tools_list')
            repo_id = repo['id']
            indexed_document = searcher.document(id=repo_id)
            if indexed_document:
                if indexed_document['full_last_updated'] == repo.get('full_last_updated'):
                    # We're done, since we sorted repos by update time
                    # NOTE(review): this early break assumes get_repos yields
                    # repos newest-first — confirm that ordering is guaranteed.
                    break
                else:
                    # Got an update, delete the previous document
                    repo_index_writer.delete_by_term('id', repo_id)
            repo_index_writer.add_document(**repo)
            # Tools get their own index
            for tool in tools_list:
                tool_index_writer.add_document(**tool)
                tools_indexed += 1
            repos_indexed += 1

    # Commit both indexes only after the full pass completes.
    tool_index_writer.commit()
    repo_index_writer.commit()
    log.info("Indexed repos: %s, tools: %s", repos_indexed, tools_indexed)
    log.info("Toolbox index finished %s", execution_timer)
    return repos_indexed, tools_indexed
def update_index(self, document):
    """Reindex a single document.

    Any previously indexed entry sharing the document's id field is
    deleted first, then the new version is added and committed.

    Args:
        self (object): FullTextSearch Instance
        document (_dict): A dictionary with title, path and content
    """
    index = self.get_index()
    with index.searcher():
        doc_writer = AsyncWriter(index)
        # delete + add rather than update: avoids stale duplicates.
        doc_writer.delete_by_term(self.id, document[self.id])
        doc_writer.add_document(**document)
        doc_writer.commit(optimize=True)
def remove_document_from_index(self, doc_name):
    """Drop a document from the search index.

    Args:
        self (object): FullTextSearch Instance
        doc_name (str): name of the document to be removed
    """
    # Nothing to do without a name.
    if not doc_name:
        return

    index = self.get_index()
    with index.searcher():
        doc_writer = AsyncWriter(index)
        doc_writer.delete_by_term(self.id, doc_name)
        doc_writer.commit(optimize=True)
class SearchPipeline(object):
    """Scrapy item pipeline maintaining a per-domain Whoosh search index."""

    cleanup = False

    def open_spider(self, spider):
        """ When opening spider, open or create index. """
        index_dir = os.path.expanduser('~/.sitesearcher/index')
        if not os.path.exists(index_dir):
            os.makedirs(index_dir)
        # One named index per crawled domain.
        self.indexname = spider.allowed_domains[0]
        if index.exists_in(index_dir, indexname=self.indexname):
            self.index = index.open_dir(index_dir, indexname=self.indexname)
        else:
            self.index = index.create_in(
                index_dir,
                indexname=self.indexname,
                schema=schema,
            )
        self.writer = AsyncWriter(self.index)

    def process_item(self, item, spider):
        """ Add crawled item to index.

        Add items using ``update_document`` to delete any previously
        indexed versions and avoid duplicates.

        Returns the item unchanged so later pipelines receive it
        (Scrapy requires process_item to return the item or raise DropItem;
        the original returned None, handing None to downstream pipelines).
        """
        self.writer.update_document(
            url=item.get('url'),
            content=item.get('content'),
        )
        return item

    def close_spider(self, spider):
        """ Close index writer on closing of spider and clean up.

        On closing, delete any previously indexed items that have not been
        updated in this crawl, as these are obviously no longer reachable
        sites.
        """
        with self.index.searcher() as searcher:
            for page in searcher.all_stored_fields():
                if page['url'] not in spider.state['update_list']:
                    self.writer.delete_by_term('url', page['url'])
        self.writer.commit()
def delete(self, name, purge=True):
    """Delete a document by its name (the name is actually a hash).

    If purge is true, the file is also removed from the boxes.
    """
    # Remove the document from the index and commit the deletion.
    index_writer = AsyncWriter(self.index)
    index_writer.delete_by_term(u'hash', name)
    index_writer.commit()

    # Unless purging, nothing more to do.
    if not purge:
        return

    # Purge the file from every box that holds it and is writable.
    for box in self.boxes:
        if box.haskey(name) and not box.readonly:
            del box[name]
def updateindex(self):
    """Synchronize the Whoosh index with the database.

    Deletes index entries whose objects no longer exist, re-indexes
    entries whose ``updatefield`` content changed, and adds objects
    that are not yet indexed.
    """
    print('updateindex')
    storage = FileStorage(self.indexpath)
    ix = storage.open_index(indexname=self.indexname)
    index_id = set()     # ids currently present in the index
    to_index_id = set()  # ids whose content changed and must be re-added
    objlist = self.model.objects.all()
    with ix.searcher() as searcher:
        writer = AsyncWriter(ix)
        for indexfield in searcher.all_stored_fields():
            if len(indexfield) > 0:
                indexId = indexfield['id']
                print(indexId)
                index_id.add(indexId)
                # Not found in the database: the object was likely deleted,
                # so remove it from the index as well.
                if not self.model.objects.filter(id=indexId):
                    print(indexId)
                    writer.delete_by_term('id', str(indexId))
                else:
                    for key in indexfield:
                        # Compare against the configured updatefield to
                        # detect content changes.
                        if key == self.updatefield:
                            print(indexId)
                            objfromdb = self.model.objects.get(id=indexId)
                            contentofobj = getattr(objfromdb, self.updatefield)
                            if contentofobj != indexfield[key]:
                                # Stale entry: delete it and queue a re-add.
                                writer.delete_by_term('id', str(indexId))
                                to_index_id.add(indexId)
                                print('update id is %s, title is %s' % (indexId, objfromdb.title))
        # Add changed objects and objects never indexed before.
        for obj in objlist:
            if obj.id in to_index_id or obj.id not in index_id:
                self.__addonedoc(writer, obj.id)
                print('add id is %s, title is %s' % (obj.id, obj.title))
        writer.commit()
    storage.close()
writer = AsyncWriter(self.ix[LATEST_REVS]) else: writer = self.ix[LATEST_REVS].writer() with writer as writer: writer.update_document(**doc) def remove_revision(self, revid, async=True): """ Remove a single revision from indexes. """ if async: writer = AsyncWriter(self.ix[ALL_REVS]) else: writer = self.ix[ALL_REVS].writer() with writer as writer: writer.delete_by_term(REVID, revid) if async: writer = AsyncWriter(self.ix[LATEST_REVS]) else: writer = self.ix[LATEST_REVS].writer() with writer as writer: # find out itemid related to the revid we want to remove: with self.ix[LATEST_REVS].searcher() as searcher: docnum_remove = searcher.document_number(revid=revid) if docnum_remove is not None: itemid = searcher.stored_fields(docnum_remove)[ITEMID] if docnum_remove is not None: # we are removing a revid that is in latest revs index latest_backends_revids = self._find_latest_backends_revids(self.ix[ALL_REVS], Term(ITEMID, itemid)) if latest_backends_revids: # we have a latest revision, just update the document in the index:
class Index(object):
    """Whoosh-backed search index for FTP server file listings.

    Wraps index creation/opening, schema definition, incremental updates
    per server, and commit/close lifecycle.
    """

    def __init__(self, directory, persist):
        self.log = logging.getLogger("ftpvista.index")
        self._persist = persist
        # Create the index directory on first run, otherwise open it.
        # NOTE(review): an existing-but-empty directory would hit open_dir
        # and fail — confirm that case cannot occur.
        if not os.path.exists(directory):
            self.log.info("Creating the index in %s" % directory)
            os.mkdir(directory)
            self._idx = index.create_in(directory, schema=self.get_schema())
        else:
            self.log.info("Opening the index in %s" % directory)
            self._idx = index.open_dir(directory)
        self._searcher = self._idx.searcher()
        self._writer = None
        self.open_writer()
        self._last_optimization = None

    def open_writer(self):
        # (Re)create the writer; called again after IndexingError recovery.
        # self._writer = BufferedWriter(self._idx, 120, 4000)
        self._writer = AsyncWriter(self._idx)

    def get_schema(self):
        """Build the Whoosh schema, with an accent-folding stemming analyzer."""
        analyzer = StemmingAnalyzer("([a-zA-Z0-9])+")
        my_analyzer = analyzer | CharsetFilter(accent_map)
        return Schema(
            server_id=ID(stored=True),
            has_id=ID(),
            path=TEXT(analyzer=my_analyzer, stored=True),
            name=TEXT(analyzer=my_analyzer, stored=True),
            ext=TEXT(analyzer=my_analyzer, stored=True),
            size=ID(stored=True),
            mtime=ID(stored=True, sortable=True),
            audio_album=TEXT(analyzer=my_analyzer, stored=True),
            audio_artist=TEXT(analyzer=my_analyzer, stored=True),
            audio_title=TEXT(analyzer=my_analyzer, stored=True),
            audio_track=ID(stored=True),
            audio_year=ID(stored=True),
        )

    def delete_all_docs(self, server):
        """Delete every indexed document belonging to the given server."""
        self.open_writer()
        self._writer.delete_by_term("server_id", str(server.get_server_id()))
        self._writer.commit()
        self.log.info("All documents of server %s deleted" % server.get_ip_addr())

    def incremental_server_update(self, server_id, current_files):
        """Prepares to incrementaly update the documents for the given server.

        server_id -- Id of the server to update.
        current_files -- a list of (path, size, mtime) tuples for each files
                         currently on the server.

        Delete all the outdated files from the index and returns a list of
        files needing to be reindexed.
        """
        def delete_doc(writer, serverid, path):
            # Match on both server_id and exact path.
            writer.delete_by_query(Term("server_id", str(serverid)) & Term("path", path))

        # Build a {path => (size, mtime)} mapping for quick lookups
        to_index = {}
        for path, size, mtime in current_files:
            to_index[path] = (size, mtime)

        results = self._searcher.documents(server_id=str(server_id))
        if results:
            for fields in results:
                indexed_path = fields["path"]
                if indexed_path not in to_index:
                    # This file was deleted from the server since it was indexed
                    delete_doc(self._writer, server_id, indexed_path)
                    self.log.debug("%s has been removed" % indexed_path)
                else:
                    size, mtime = to_index[indexed_path]
                    try:
                        if mtime > datetime.strptime(fields["mtime"], "%Y-%m-%d %H:%M:%S"):
                            # This file has been modified since it was indexed
                            delete_doc(self._writer, server_id, indexed_path)
                        else:
                            # up to date, no need to reindex
                            del to_index[indexed_path]
                    except ValueError:
                        # Unparseable stored mtime: treat as outdated.
                        delete_doc(self._writer, server_id, indexed_path)

        # return the remaining files
        return [(path, xsize, xmtime) for (path, (xsize, xmtime)) in to_index.items()]

    def add_document(self, server_id, name, path, size, mtime, audio_album=None, audio_artist=None, audio_title=None, audio_year=None):
        """Add a document with the specified fields in the index.

        Changes need to be commited.
        """
        # passing the optional arguments is quite a mess
        # let's build a dict for that purpose
        _, ext = os.path.splitext(name)
        ext = ext.lstrip(".")
        kwargs = {
            "server_id": server_id,
            "name": name,
            "ext": ext,
            "path": path,
            "size": size,
            "mtime": mtime,
            "has_id": "a",
        }
        # Add the optional args
        if audio_album is not None:
            kwargs["audio_album"] = audio_album
        if audio_artist is not None:
            kwargs["audio_artist"] = audio_artist
        if audio_title is not None:
            kwargs["audio_title"] = audio_title
        if audio_year is not None:
            kwargs["audio_year"] = audio_year
        try:
            self._writer.add_document(**kwargs)
        except IndexingError:
            # Writer was closed/cancelled: reopen and retry once.
            self.open_writer()
            self._writer.add_document(**kwargs)

    def commit(self, optimize=False):
        """ Commit the changes in the index and optimize it """
        self.log.info(" -- Begin of Commit -- ")
        try:
            self._writer.commit(optimize=optimize)
        except IndexingError:
            # Writer was closed/cancelled: reopen and retry once.
            self.open_writer()
            self._writer.commit(optimize=optimize)
        self.log.info("Index commited")
        # Refresh the searcher so committed changes are visible.
        self._searcher = self._idx.searcher()
        self.log.info(" -- End of Commit -- ")

    def close(self):
        self.log.info(" -- Closing writer and index -- ")
        # self._writer.close()
        """ Close the index """
        self._idx.close()
def remove(self, item_id):
    """Delete the entry whose 'id' term equals item_id from the index."""
    from whoosh.writing import AsyncWriter

    index_writer = AsyncWriter(self.ix)
    index_writer.delete_by_term('id', item_id)
    index_writer.commit()
def index_update(index, items):
    """
    :param:index: index name
    :param:items: list of (operation, full class name, primary key, data) tuples.

    Applies the given operations to the Whoosh index: each item is first
    deleted (update == delete + add per the Whoosh manual) and, for
    "new"/"changed" operations, re-added from a freshly retrieved object.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted
    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually delete + add
            # operation
            object_key = f"{cls_name}:{pk}"
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add twice the same document in same transaction. The writer will
                # not delete previous records, ending in duplicate records for same
                # document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more infos in order to catch a weird bug
                    # that happens regularly on CI but is not reliably
                    # reproductible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except BaseException:
        # Abort pending index changes before propagating.
        writer.cancel()
        raise
    finally:
        # Close the session on every path; the original leaked it when an
        # exception aborted the loop.
        session.close()

    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when actual writer was already available: asyncwriter didn't need
        # to start a thread
        pass
def index_update(index, items):
    """
    :param:index: index name
    :param:items: list of (operation, full class name, primary key, data) tuples.

    Applies the given operations to the Whoosh index: each item is first
    deleted (update == delete + add per the Whoosh manual) and, for
    "new"/"changed" operations, re-added from a freshly retrieved object.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted
    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually delete + add
            # operation
            object_key = f"{cls_name}:{pk}"
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add twice the same document in same transaction. The writer will
                # not delete previous records, ending in duplicate records for same
                # document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more infos in order to catch a weird bug
                    # that happens regularly on CI but is not reliably
                    # reproductible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except Exception:
        # Abort pending index changes before propagating.
        writer.cancel()
        raise
    finally:
        # Close the session on every path; the original leaked it when an
        # exception aborted the loop.
        session.close()

    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when actual writer was already available: asyncwriter didn't need
        # to start a thread
        pass
def remove(self, item_id):
    """Remove the indexed document identified by item_id."""
    from whoosh.writing import AsyncWriter

    w = AsyncWriter(self.ix)
    w.delete_by_term('id', item_id)
    w.commit()
writer = AsyncWriter(self.ix[LATEST_REVS]) else: writer = self.ix[LATEST_REVS].writer() with writer as writer: writer.update_document(**doc) def remove_revision(self, revid, async=True): """ Remove a single revision from indexes. """ if async: writer = AsyncWriter(self.ix[ALL_REVS]) else: writer = self.ix[ALL_REVS].writer() with writer as writer: writer.delete_by_term(REVID, revid) if async: writer = AsyncWriter(self.ix[LATEST_REVS]) else: writer = self.ix[LATEST_REVS].writer() with writer as writer: # find out itemid related to the revid we want to remove: with self.ix[LATEST_REVS].searcher() as searcher: docnum_remove = searcher.document_number(revid=revid) if docnum_remove is not None: itemid = searcher.stored_fields(docnum_remove)[ITEMID] if docnum_remove is not None: # we are removing a revid that is in latest revs index try: latest_revids = self._find_latest_revids(self.ix[ALL_REVS], Term(ITEMID, itemid)) except AttributeError:
def unindex_post(post):
    """Delete a post from the search index, keyed by its stringified id."""
    index_writer = AsyncWriter(ix)
    index_writer.delete_by_term('id', str(post.id))
    index_writer.commit()