Code example #1
def search_spam(
    post,
    ix,
):
    """
    Search spam index for posts similar to this one.
    Returns a list of normalized results similar to this post's content.
    """
    writer = AsyncWriter(ix)
    add_post_to_index(post=post, writer=writer, is_spam=post.is_spam)
    writer.commit()

    # Search for this post in the spam index
    fields = ['uid']

    results = search.preform_whoosh_search(ix=ix,
                                           query=post.uid,
                                           fields=fields)

    # Perform more_like_this on this post's content
    similar_content = results[0].more_like_this('content', top=5)

    # Remove this post from the spam index after results are collected.
    writer = AsyncWriter(ix)
    writer.delete_by_term('uid', text=post.uid)
    writer.commit()

    # Get the results into a list and close the searcher object.
    similar_content = list(map(search.normalize_result, similar_content))

    results.searcher.close()

    return similar_content
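
The round trip above (add the post, search for it, call more_like_this, then delete it) can be tried on a toy index. A minimal sketch, assuming a made-up two-field schema; the field names and directory are illustrative, not the original project's:

import os
from whoosh import index
from whoosh.fields import Schema, ID, TEXT
from whoosh.qparser import QueryParser
from whoosh.writing import AsyncWriter

schema = Schema(uid=ID(stored=True, unique=True), content=TEXT(stored=True))
os.makedirs("indexdir", exist_ok=True)
ix = index.create_in("indexdir", schema)

writer = AsyncWriter(ix)
writer.add_document(uid=u"p1", content=u"buy cheap pills now")
writer.commit()

with ix.searcher() as searcher:
    results = searcher.search(QueryParser("uid", ix.schema).parse(u"p1"))
    if results:
        # Hit.more_like_this builds a query from the hit's stored content
        similar = results[0].more_like_this("content", top=5)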
Code example #2
def remove_post(post, ix=None):
    """
    Remove a post from the spam index.
    """

    ix = ix or init_index()

    # Remove this post from index
    writer = AsyncWriter(ix)
    writer.delete_by_term('uid', text=post.uid)
    writer.commit()
    logger.debug(f"Removing uid={post.uid} from index")
    return
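
init_index is not shown here; a plausible sketch (hypothetical, not the original project's code) opens the index directory if it already holds an index and otherwise creates one whose schema indexes uid as a unique ID field:

import os
from whoosh import index
from whoosh.fields import Schema, ID, TEXT

def init_index(index_dir="indexdir"):
    schema = Schema(uid=ID(stored=True, unique=True),
                    content=TEXT(stored=True))
    if not os.path.exists(index_dir):
        os.makedirs(index_dir)
    if index.exists_in(index_dir):
        return index.open_dir(index_dir)
    return index.create_in(index_dir, schema)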
Code example #3
def build_index(whoosh_index_dir, file_path, hgweb_config_dir, dburi,
                **kwargs):
    """
    Build two search indexes simultaneously
    One is for repositories and the other for tools.

    Returns a tuple with number of repos and tools that were indexed.
    """
    model = ts_mapping.init(file_path,
                            dburi,
                            engine_options={},
                            create_tables=False)
    sa_session = model.context.current
    repo_index, tool_index = _get_or_create_index(whoosh_index_dir)

    repo_index_writer = AsyncWriter(repo_index)
    tool_index_writer = AsyncWriter(tool_index)
    repos_indexed = 0
    tools_indexed = 0

    execution_timer = ExecutionTimer()
    with repo_index.searcher() as searcher:
        for repo in get_repos(sa_session, file_path, hgweb_config_dir,
                              **kwargs):
            tools_list = repo.pop('tools_list')
            repo_id = repo['id']
            indexed_document = searcher.document(id=repo_id)
            if indexed_document:
                if indexed_document['full_last_updated'] == repo.get(
                        'full_last_updated'):
                    # We're done, since we sorted repos by update time
                    break
                else:
                    # Got an update, delete the previous document
                    repo_index_writer.delete_by_term('id', repo_id)

            repo_index_writer.add_document(**repo)

            #  Tools get their own index
            for tool in tools_list:
                tool_index_writer.add_document(**tool)
                tools_indexed += 1

            repos_indexed += 1

    tool_index_writer.commit()
    repo_index_writer.commit()

    log.info("Indexed repos: %s, tools: %s", repos_indexed, tools_indexed)
    log.info("Toolbox index finished %s", execution_timer)
    return repos_indexed, tools_indexed
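
_get_or_create_index is assumed above. A hedged sketch of what it might look like: keep the repository and tool indexes side by side in one directory under distinct index names, creating each on first use (the schemas are illustrative placeholders):

import os
from whoosh import index
from whoosh.fields import Schema, ID, TEXT

repo_schema = Schema(id=ID(stored=True, unique=True),
                     full_last_updated=TEXT(stored=True))
tool_schema = Schema(id=ID(stored=True, unique=True),
                     name=TEXT(stored=True))

def _open_or_create(dirname, name, schema):
    if index.exists_in(dirname, indexname=name):
        return index.open_dir(dirname, indexname=name)
    return index.create_in(dirname, schema, indexname=name)

def _get_or_create_index(whoosh_index_dir):
    if not os.path.exists(whoosh_index_dir):
        os.makedirs(whoosh_index_dir)
    return (_open_or_create(whoosh_index_dir, "repos", repo_schema),
            _open_or_create(whoosh_index_dir, "tools", tool_schema))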
Code example #4
    def update_index(self, document):
        """Update search index for a document

        Args:
            self (object): FullTextSearch Instance
            document (_dict): A dictionary with title, path and content
        """
        ix = self.get_index()

        with ix.searcher():
            writer = AsyncWriter(ix)
            writer.delete_by_term(self.id, document[self.id])
            writer.add_document(**document)
            writer.commit(optimize=True)
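
The delete-then-add pair above is exactly what Whoosh's update_document does when the schema declares the id field unique, so the method could be collapsed to a single call. A sketch under that assumption (it requires self.id to be a unique field in the schema):

    def update_index(self, document):
        ix = self.get_index()
        writer = AsyncWriter(ix)
        # deletes any existing document whose unique field matches, then adds
        writer.update_document(**document)
        writer.commit(optimize=True)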
Code example #5
    def remove_document_from_index(self, doc_name):
        """Remove document from search index

        Args:
            self (object): FullTextSearch Instance
            doc_name (str): name of the document to be removed
        """
        if not doc_name:
            return

        ix = self.get_index()
        with ix.searcher():
            writer = AsyncWriter(ix)
            writer.delete_by_term(self.id, doc_name)
            writer.commit(optimize=True)
Code example #6
class SearchPipeline(object):
    cleanup = False

    def open_spider(self, spider):
        """ When opening spider, open or create index. """

        index_dir = os.path.expanduser('~/.sitesearcher/index')
        if not os.path.exists(index_dir):
            os.makedirs(index_dir)

        self.indexname = spider.allowed_domains[0]
        if index.exists_in(index_dir, indexname=self.indexname):
            self.index = index.open_dir(index_dir, indexname=self.indexname)
        else:
            self.index = index.create_in(
                index_dir,
                indexname=self.indexname,
                schema=schema,
            )
        self.writer = AsyncWriter(self.index)

    def process_item(self, item, spider):
        """ Add crawled item to index.

        Add items using ``update_document`` to delete any previously indexed
        versions and avoid duplicates.
        """

        self.writer.update_document(
            url=item.get('url'), content=item.get('content'))

    def close_spider(self, spider):
        """ Close index writer on closing of spider an clean up.

        On closing, delete any previously indexed items that have not been
        updated in this crawl, as these pages are no longer reachable.
        """

        with self.index.searcher() as searcher:
            for page in searcher.all_stored_fields():
                if page['url'] not in spider.state['update_list']:
                    self.writer.delete_by_term('url', page['url'])
        self.writer.commit()
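
The schema imported by SearchPipeline is not shown. Since update_document can only replace earlier versions when at least one field is declared unique, a plausible definition (an assumption, not the project's actual schema) is:

from whoosh.fields import Schema, ID, TEXT

schema = Schema(
    url=ID(stored=True, unique=True),  # unique key used by update_document
    content=TEXT(stored=True),
)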
Code example #7
    def delete(self, name, purge=True):
        """
        Delete a document by its name. name is actually a hash. If purge is true, file is also
        removed from the boxes.
        """
        # Grab a writer on the index
        writer = AsyncWriter(self.index)

        # Delete and commit from the index
        writer.delete_by_term(u'hash', name)
        writer.commit()

        # Delete the document from the boxes if we want to purge them
        if not purge:
            return

        # We need to remove the doc if the box is writable
        for box in self.boxes:
            if box.haskey(name) and not box.readonly:
                del box[name]
Code example #8
    def updateindex(self):
        print('updateindex')
        storage = FileStorage(self.indexpath)
        ix = storage.open_index(indexname=self.indexname)
        index_id = set()
        to_index_id = set()
        objlist = self.model.objects.all()
        with ix.searcher() as searcher:
            writer = AsyncWriter(ix)
            for indexfield in searcher.all_stored_fields():
                if len(indexfield) > 0:
                    indexId = indexfield['id']
                    print(indexId)
                    index_id.add(indexId)
                    # Not found in the database, so the entry was probably
                    # deleted; remove it from the index as well
                    if not self.model.objects.filter(id=indexId):
                        print(indexId)
                        writer.delete_by_term('id', str(indexId))
                    else:
                        for key in indexfield:
                            # Re-index entries whose updatefield has changed
                            if key == self.updatefield:
                                print(indexId)
                                objfromdb = self.model.objects.get(id=indexId)
                                contentofobj = getattr(objfromdb,
                                                       self.updatefield)
                                if contentofobj != indexfield[key]:
                                    writer.delete_by_term('id', str(indexId))
                                    to_index_id.add(indexId)
                                    print('update id is %s, title is %s' %
                                          (indexId, objfromdb.title))
            for obj in objlist:
                if obj.id in to_index_id or obj.id not in index_id:
                    self.__addonedoc(writer, obj.id)
                    print('add id is %s, title is %s' % (obj.id, obj.title))
            writer.commit()
        storage.close()
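
__addonedoc is assumed above. A hedged sketch of one plausible implementation: load the model object from the database and add its stored fields to the index (field names beyond id and the configured updatefield are guesses):

    def __addonedoc(self, writer, obj_id):
        obj = self.model.objects.get(id=obj_id)
        fields = {
            'id': str(obj.id),
            'title': obj.title,
            self.updatefield: getattr(obj, self.updatefield),
        }
        writer.add_document(**fields)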
Code example #9
File: indexing.py Project: denedios/moin-2.0
            writer = AsyncWriter(self.ix[LATEST_REVS])
        else:
            writer = self.ix[LATEST_REVS].writer()
        with writer as writer:
            writer.update_document(**doc)

    def remove_revision(self, revid, async_=True):  # 'async' is a reserved word in Python 3.7+
        """
        Remove a single revision from indexes.
        """
        if async_:
            writer = AsyncWriter(self.ix[ALL_REVS])
        else:
            writer = self.ix[ALL_REVS].writer()
        with writer as writer:
            writer.delete_by_term(REVID, revid)
        if async_:
            writer = AsyncWriter(self.ix[LATEST_REVS])
        else:
            writer = self.ix[LATEST_REVS].writer()
        with writer as writer:
            # find out itemid related to the revid we want to remove:
            with self.ix[LATEST_REVS].searcher() as searcher:
                docnum_remove = searcher.document_number(revid=revid)
                if docnum_remove is not None:
                    itemid = searcher.stored_fields(docnum_remove)[ITEMID]
            if docnum_remove is not None:
                # we are removing a revid that is in latest revs index
                latest_backends_revids = self._find_latest_backends_revids(self.ix[ALL_REVS], Term(ITEMID, itemid))
                if latest_backends_revids:
                    # we have a latest revision, just update the document in the index:
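
The `with writer as writer:` idiom works because AsyncWriter inherits IndexWriter's context-manager protocol: the writer commits on a normal exit and cancels if the block raises. Spelled out, the update above is roughly equivalent to (ix and doc as in the snippet):

writer = AsyncWriter(ix)
try:
    writer.update_document(**doc)
except BaseException:
    writer.cancel()
    raise
else:
    writer.commit()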
Code example #10
File: index.py Project: magne4000/ftpvista
class Index(object):
    def __init__(self, directory, persist):
        self.log = logging.getLogger("ftpvista.index")

        self._persist = persist
        if not os.path.exists(directory):
            self.log.info("Creating the index in %s" % directory)
            os.mkdir(directory)
            self._idx = index.create_in(directory, schema=self.get_schema())
        else:
            self.log.info("Opening the index in %s" % directory)
            self._idx = index.open_dir(directory)

        self._searcher = self._idx.searcher()
        self._writer = None
        self.open_writer()
        self._last_optimization = None

    def open_writer(self):
        # self._writer = BufferedWriter(self._idx, 120, 4000)
        self._writer = AsyncWriter(self._idx)

    def get_schema(self):
        analyzer = StemmingAnalyzer("([a-zA-Z0-9])+")
        my_analyzer = analyzer | CharsetFilter(accent_map)
        return Schema(
            server_id=ID(stored=True),
            has_id=ID(),
            path=TEXT(analyzer=my_analyzer, stored=True),
            name=TEXT(analyzer=my_analyzer, stored=True),
            ext=TEXT(analyzer=my_analyzer, stored=True),
            size=ID(stored=True),
            mtime=ID(stored=True, sortable=True),
            audio_album=TEXT(analyzer=my_analyzer, stored=True),
            audio_artist=TEXT(analyzer=my_analyzer, stored=True),
            audio_title=TEXT(analyzer=my_analyzer, stored=True),
            audio_track=ID(stored=True),
            audio_year=ID(stored=True),
        )

    def delete_all_docs(self, server):
        self.open_writer()
        self._writer.delete_by_term("server_id", str(server.get_server_id()))
        self._writer.commit()
        self.log.info("All documents of server %s deleted" % server.get_ip_addr())

    def incremental_server_update(self, server_id, current_files):
        """Prepares to incrementaly update the documents for the given server.

        server_id      -- Id of the server to update.
        current_files  -- a list of (path, size, mtime) tuples, one for each
                          file currently on the server.

        Deletes all the outdated files from the index and returns a list
        of files needing to be reindexed.
        """

        def delete_doc(writer, serverid, path):
            writer.delete_by_query(Term("server_id", str(serverid)) & Term("path", path))

        # Build a {path => (size, mtime)} mapping for quick lookups
        to_index = {}
        for path, size, mtime in current_files:
            to_index[path] = (size, mtime)

        results = self._searcher.documents(server_id=str(server_id))
        if results:
            for fields in results:
                indexed_path = fields["path"]

                if indexed_path not in to_index:
                    # This file was deleted from the server since it was indexed
                    delete_doc(self._writer, server_id, indexed_path)
                    self.log.debug("%s has been removed" % indexed_path)
                else:
                    size, mtime = to_index[indexed_path]
                    try:
                        if mtime > datetime.strptime(fields["mtime"], "%Y-%m-%d %H:%M:%S"):
                            # This file has been modified since it was indexed
                            delete_doc(self._writer, server_id, indexed_path)
                        else:
                            # up to date, no need to reindex
                            del to_index[indexed_path]
                    except ValueError:
                        delete_doc(self._writer, server_id, indexed_path)

        # return the remaining files
        return [(path, xsize, xmtime) for (path, (xsize, xmtime)) in to_index.items()]

    def add_document(
        self, server_id, name, path, size, mtime, audio_album=None, audio_artist=None, audio_title=None, audio_year=None
    ):
        """Add a document with the specified fields in the index.

        Changes need to be committed.

        """

        # passing the optional arguments is quite a mess
        # let's build a dict for that purpose

        _, ext = os.path.splitext(name)
        ext = ext.lstrip(".")

        kwargs = {
            "server_id": server_id,
            "name": name,
            "ext": ext,
            "path": path,
            "size": size,
            "mtime": mtime,
            "has_id": "a",
        }

        # Add the optional args
        if audio_album is not None:
            kwargs["audio_album"] = audio_album

        if audio_artist is not None:
            kwargs["audio_artist"] = audio_artist

        if audio_title is not None:
            kwargs["audio_title"] = audio_title

        if audio_year is not None:
            kwargs["audio_year"] = audio_year

        try:
            self._writer.add_document(**kwargs)
        except IndexingError:
            self.open_writer()
            self._writer.add_document(**kwargs)

    def commit(self, optimize=False):
        """ Commit the changes in the index and optimize it """
        self.log.info(" -- Begin of Commit -- ")
        try:
            self._writer.commit(optimize=optimize)
        except IndexingError:
            self.open_writer()
            self._writer.commit(optimize=optimize)
        self.log.info("Index commited")

        self._searcher = self._idx.searcher()
        self.log.info(" -- End of Commit -- ")

    def close(self):
        """ Close the index """
        self.log.info(" -- Closing writer and index -- ")
        # self._writer.close()
        self._idx.close()
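
A hedged usage sketch for the Index class above (the paths, server id and file list are placeholders): ask for the outdated files, reindex them, then commit. Note that mtime must round-trip through the "%Y-%m-%d %H:%M:%S" format that incremental_server_update parses with strptime:

import os
from datetime import datetime

idx = Index("/var/lib/ftpvista/index", persist=None)
files = [("/pub/a.txt", 12, datetime(2020, 1, 1))]
for path, size, mtime in idx.incremental_server_update(42, files):
    idx.add_document(
        server_id=str(42),
        name=os.path.basename(path),
        path=path,
        size=str(size),
        mtime=mtime.strftime("%Y-%m-%d %H:%M:%S"),
    )
idx.commit()
idx.close()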
Code example #11
File: z_whoosh.py Project: ybenitezf/nstock
    def remove(self, item_id):
        from whoosh.writing import AsyncWriter
        writer = AsyncWriter(self.ix)
        writer.delete_by_term('id', item_id)
        writer.commit()
Code example #12
def index_update(index, items):
    """
    :param index: index name
    :param items: list of (operation, full class name, primary key, data) tuples.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted

    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually delete + add
            # operation
            object_key = "{}:{}".format(cls_name, pk)
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add the same document twice in the same transaction:
                # the writer would not delete the previous record, leaving
                # duplicate records for the same document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more info in order to catch a weird bug
                    # that happens regularly on CI but is not reliably
                    # reproducible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except BaseException:
        writer.cancel()
        raise

    session.close()
    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when the actual writer was already available: AsyncWriter
        # didn't need to start a thread
        pass
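
The final writer.join() only matters when AsyncWriter could not take the index lock immediately and fell back to a background thread. A small sketch of that fallback (ix stands for any open Whoosh index; the key is illustrative):

from whoosh.writing import AsyncWriter

blocking = ix.writer()            # holds the index lock
aw = AsyncWriter(ix)              # lock unavailable, so operations are buffered
aw.delete_by_term("object_key", "Post:1")
aw.commit()                       # starts a thread that waits for the lock
blocking.commit()                 # releases the lock; the thread replays the
                                  # buffered delete and commits it
aw.join()                         # block until the async commit finishes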
Code example #13
File: service.py Project: abilian/abilian-core
def index_update(index, items):
    """
    :param index: index name
    :param items: list of (operation, full class name, primary key, data) tuples.
    """
    index_name = index
    index = service.app_state.indexes[index_name]
    adapted = service.adapted

    session = safe_session()
    updated = set()
    writer = AsyncWriter(index)
    try:
        for op, cls_name, pk, data in items:
            if pk is None:
                continue

            # always delete. Whoosh manual says that 'update' is actually delete + add
            # operation
            object_key = f"{cls_name}:{pk}"
            writer.delete_by_term("object_key", object_key)

            adapter = adapted.get(cls_name)
            if not adapter:
                # FIXME: log to sentry?
                continue

            if object_key in updated:
                # don't add the same document twice in the same transaction:
                # the writer would not delete the previous record, leaving
                # duplicate records for the same document.
                continue

            if op in ("new", "changed"):
                with session.begin(nested=True):
                    obj = adapter.retrieve(pk, _session=session, **data)

                if obj is None:
                    # deleted after task queued, but before task run
                    continue

                document = service.get_document(obj, adapter)
                try:
                    writer.add_document(**document)
                except ValueError:
                    # logger is here to give us more info in order to catch a weird bug
                    # that happens regularly on CI but is not reliably
                    # reproducible.
                    logger.error("writer.add_document(%r)", document, exc_info=True)
                    raise
                updated.add(object_key)
    except Exception:
        writer.cancel()
        raise

    session.close()
    writer.commit()
    try:
        # async thread: wait for its termination
        writer.join()
    except RuntimeError:
        # happens when the actual writer was already available: AsyncWriter
        # didn't need to start a thread
        pass
Code example #14
            writer = AsyncWriter(self.ix[LATEST_REVS])
        else:
            writer = self.ix[LATEST_REVS].writer()
        with writer as writer:
            writer.update_document(**doc)

    def remove_revision(self, revid, async_=True):  # 'async' is a reserved word in Python 3.7+
        """
        Remove a single revision from indexes.
        """
        if async_:
            writer = AsyncWriter(self.ix[ALL_REVS])
        else:
            writer = self.ix[ALL_REVS].writer()
        with writer as writer:
            writer.delete_by_term(REVID, revid)
        if async_:
            writer = AsyncWriter(self.ix[LATEST_REVS])
        else:
            writer = self.ix[LATEST_REVS].writer()
        with writer as writer:
            # find out itemid related to the revid we want to remove:
            with self.ix[LATEST_REVS].searcher() as searcher:
                docnum_remove = searcher.document_number(revid=revid)
                if docnum_remove is not None:
                    itemid = searcher.stored_fields(docnum_remove)[ITEMID]
            if docnum_remove is not None:
                # we are removing a revid that is in latest revs index
                try:
                    latest_revids = self._find_latest_revids(self.ix[ALL_REVS], Term(ITEMID, itemid))
                except AttributeError:
Code example #15
def unindex_post(post):
    """Delete a post from the search index"""
    writer = AsyncWriter(ix)
    writer.delete_by_term('id', str(post.id))
    writer.commit()
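
ix here is a module-level index handle created elsewhere; a plausible setup (an assumption, not shown in the original) would be:

from whoosh import index

ix = index.open_dir("search_index")  # directory name is illustrative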