def _get_mapping(self, es):
    request = getRequest()
    mapping = getattr(request, CACHE_ATTRIBUTE, None)
    if mapping is not None:
        return mapping
    try:
        mapping = es.indices.get_mapping(index=index_name())
    except TransportError as e:
        if e.status_code == 404:
            self._create_index(es)
            mapping = es.indices.get_mapping(index=index_name())
        else:
            raise
    setattr(request, CACHE_ATTRIBUTE, mapping)
    return mapping

def get_payload(self, obj):
    try:
        serializer = getMultiAdapter((obj, getRequest()), ISerializeToJson)
    except ComponentLookupError:
        logger.exception(
            'Abort ElasticSearch Indexing for {0}'.format(
                obj.absolute_url(),
            ),
        )
        query_blocker.unblock()
        return
    try:
        data = serializer()
    except ComponentLookupError:
        logger.exception(
            'Abort ElasticSearch Indexing for {0}'.format(
                obj.absolute_url(),
            ),
        )
        query_blocker.unblock()
        return
    self._reduce_data(data)
    if HAS_ARCHETYPES:
        self._fix_at_fields(obj, data)
    self._expand_rid(obj, data)
    self._expand_binary_data(obj, data)
    uid = api.content.get_uuid(obj)
    es_kwargs = dict(
        index=index_name(),
        doc_type='content',
        id=uid,
        pipeline=self._es_pipeline_name,
        body=data,
        request_timeout=es_config.request_timeout,
    )
    return es_kwargs

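# Hedged usage sketch: get_payload() returns either None (serialization was
# aborted and the query blocker already released) or a kwargs dict meant to
# be splatted into the low-level client. Callers should guard against None;
# 'es' and 'obj' are assumed to be available in the calling context.
es_kwargs = self.get_payload(obj)
if es_kwargs is not None:
    es.index(**es_kwargs)
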
def unindex(self, obj):
    index = index_name()
    if index is None:
        # portal no longer there
        return
    uid = api.content.get_uuid(obj)
    if es_config.use_celery:
        unindex_content.delay(
            index=index,
            doc_type='content',
            uid=uid,
            timeout=es_config.request_timeout,
        )
    else:
        es = get_ingest_client()
        if es is None:
            logger.warning('No ElasticSearch client available.')
            return
        try:
            es.delete(
                index=index,
                doc_type='content',
                id=uid,
                request_timeout=es_config.request_timeout,
            )
        except Exception:
            logger.exception('unindexing of {0} failed'.format(uid))

def index(self, obj, attributes=None):
    start = time.time()
    query_blocker.block()
    es = get_ingest_client()
    if es is None:
        logger.warning('No ElasticSearch client available.')
        query_blocker.unblock()  # do not leave queries blocked on early exit
        return
    self._check_for_ingest_pipeline(es)
    self._check_for_mapping(es)  # will also create the index
    try:
        serializer = getMultiAdapter((obj, getRequest()), ISerializeToJson)
    except ComponentLookupError:
        logger.exception(
            'Abort ElasticSearch Indexing for {0}'.format(
                obj.absolute_url(),
            ),
        )
        query_blocker.unblock()
        return
    try:
        data = serializer()
    except ComponentLookupError:
        logger.exception(
            'Abort ElasticSearch Indexing for {0}'.format(
                obj.absolute_url(),
            ),
        )
        query_blocker.unblock()
        return
    logger.info('TOOK after serializing: {0:2.3f}'.format(time.time() - start))
    self._reduce_data(data)
    self._expand_rid(obj, data)
    self._expand_binary_data(obj, data)
    self._auto_mapping(es, obj, data)
    uid = api.content.get_uuid(obj)
    es_kwargs = dict(
        index=index_name(),
        doc_type='content',
        id=uid,
        pipeline=self._es_pipeline_name,
        body=data,
    )
    parent = aq_parent(obj)
    portal = api.portal.get()
    if aq_base(portal) is aq_base(parent):
        self._check_and_add_portal_to_index(portal)
        # annotations = IAnnotations(portal)
        # es_kwargs['parent'] = annotations[ES_PORTAL_UUID_KEY]
    else:
        # es_kwargs['parent'] = api.content.get_uuid(parent)
        pass
    logger.info('TOOK after preprocessing: {0:2.3f}'.format(time.time() - start))
    try:
        es.index(**es_kwargs)
    except Exception:
        logger.exception(
            'indexing of {0} failed.\n{1}'.format(
                uid,
                pformat(es_kwargs, indent=2),
            ),
        )
    query_blocker.unblock()
    logger.info('TOOK overall: {0:2.3f}'.format(time.time() - start))

def _setup_mapping(self, es):
    es.indices.put_mapping(
        doc_type='content',
        index=index_name(),
        body=INITIAL_MAPPING,
    )
    request = getRequest()
    setattr(request, CACHE_ATTRIBUTE, None)

def _apply_index(self, request):
    """Apply the index to query parameters given in 'request'.

    The argument should be a mapping object.

    If the request does not contain the needed parameters, then
    None is returned.

    If the request contains a parameter with the name of the column
    and this parameter is either a Record or a class instance then
    it is assumed that the parameters of this index are passed as
    attribute (Note: this is the recommended way to pass parameters
    since Zope 2.4)

    Otherwise two objects are returned. The first object is a
    ResultSet containing the record numbers of the matching records.
    The second object is a tuple containing the names of all data
    fields used.
    """
    if query_blocker.blocked:
        return
    record = parseIndexRequest(request, self.id)
    if record.keys is None:
        return None
    template_params = {
        'keys': record.keys,
    }
    query_body = self._apply_template(template_params)
    logger.info(query_body)
    es_kwargs = dict(
        index=index_name(),
        body=query_body,
        size=BATCH_SIZE,
        scroll='1m',
        _source_include=['rid'],
    )
    es = get_query_client()
    result = es.search(**es_kwargs)

    def score(record):
        return int(10000 * float(record['_score']))

    # initial return value, other batches to be applied
    retval = IIBTree()
    for r in result['hits']['hits']:
        retval[r['_source']['rid']] = score(r)
    total = result['hits']['total']
    if total > BATCH_SIZE:
        sid = result['_scroll_id']
        counter = BATCH_SIZE
        while counter < total:
            result = es.scroll(scroll_id=sid, scroll='1m')
            for record in result['hits']['hits']:
                retval[record['_source']['rid']] = score(record)
            counter += BATCH_SIZE
    return retval, (self.id,)

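# A minimal standalone sketch of the scroll loop used above, assuming a plain
# elasticsearch-py (pre-8.x) client; the index name and query are illustrative
# values, not constants from this package.
from elasticsearch import Elasticsearch

def iter_all_hits(es, index, body, size=500):
    # the first search opens the scroll context; es.scroll() fetches the rest
    result = es.search(index=index, body=body, size=size, scroll='1m')
    while result['hits']['hits']:
        for hit in result['hits']['hits']:
            yield hit
        result = es.scroll(scroll_id=result['_scroll_id'], scroll='1m')

es = Elasticsearch()
for hit in iter_all_hits(es, 'plone', {'query': {'match_all': {}}}):
    print(hit['_id'])
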
def numObjects(self):
    """Return the number of indexed objects."""
    es = get_query_client()
    search = Search(using=es, index=index_name())
    try:
        return len(list(search.scan()))
    except Exception:
        logger.exception('ElasticSearch "count" query failed')
        return 'Problem getting all documents count from ElasticSearch!'

def numObjects(self):
    """Return the number of indexed objects."""
    es_kwargs = dict(
        index=index_name(),
        body={'query': {'match_all': {}}},
    )
    es = get_query_client()
    try:
        return es.count(**es_kwargs)['count']
    except Exception:
        logger.exception('ElasticSearch "count" query failed')
        return 'Problem getting all documents count from ElasticSearch!'

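# The count API above is much cheaper than the scan()-based variant, since it
# never pulls documents over the wire. A minimal sketch against a raw client,
# with an illustrative index name:
from elasticsearch import Elasticsearch

es = Elasticsearch()
print(es.count(index='plone', body={'query': {'match_all': {}}})['count'])
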
def _auto_mapping(self, es, obj, data):
    mappings = self._get_mapping(es)
    old_map = mappings[index_name()]['mappings']
    old_map = old_map.get('content', {}).get('properties', {})
    new_map = {}
    for key in data:
        if key in old_map:
            continue
        # figure out field type
        value = data[key]
        for pytype in MAPPING_TYPE_MAP:
            if isinstance(value, pytype):
                new_map[key] = MAPPING_TYPE_MAP[pytype]
                break
    for record in INGEST_PIPELINES['processors']:
        name = record['attachment']['target_field']
        if name not in old_map:
            new_map[name] = {
                'type': 'nested',
                'properties': {
                    'content': MAPPING_TYPE_MAP[basestring],
                    'content_length': MAPPING_TYPE_MAP[int],
                    'content_type': MAPPING_TYPE_MAP[basestring],
                    'language': MAPPING_TYPE_MAP[basestring],
                },
            }
    if not new_map:
        return
    new_map = {
        'content': {
            'properties': new_map,
        },
    }
    es.indices.put_mapping(
        doc_type='content',
        index=index_name(),
        body=new_map,
    )
    request = getRequest()
    setattr(request, CACHE_ATTRIBUTE, None)

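# _auto_mapping above assumes MAPPING_TYPE_MAP maps Python types to ES field
# definitions and that each pipeline processor carries an 'attachment' entry.
# A plausible shape for those constants (an assumption for illustration, not
# the module's actual values; basestring implies Python 2):
MAPPING_TYPE_MAP = {
    basestring: {'type': 'text'},
    int: {'type': 'long'},
    float: {'type': 'float'},
    bool: {'type': 'boolean'},
}
INGEST_PIPELINES = {
    'description': 'extract attachment information',
    'processors': [
        {
            'attachment': {
                'field': 'file',
                'target_field': 'file__extracted',
                'ignore_missing': True,
            },
        },
    ],
}
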
def unindex(self, obj):
    es = get_ingest_client()
    if es is None:
        logger.warning('No ElasticSearch client available.')
        return
    uid = api.content.get_uuid(obj)
    try:
        es.delete(
            index=index_name(),
            doc_type='content',
            id=uid,
        )
    except Exception:
        logger.exception('unindexing of {0} failed'.format(uid))

def index(self, obj, attributes=None):
    index = index_name()
    if index is None:
        # Not configured yet.
        return
    query_blocker.block()
    es = get_ingest_client()
    if es is None:
        logger.warning('No ElasticSearch client available.')
        query_blocker.unblock()
        return
    try:
        self._check_for_ingest_pipeline(es)
        self._check_for_mapping(es)  # will also create the index
    except TransportError:
        logger.exception(
            'ElasticSearch connection failed for {0}'.format(
                obj.absolute_url(),
            ),
        )
        query_blocker.unblock()
        return
    parent = aq_parent(obj)
    portal = api.portal.get()
    if aq_base(portal) is aq_base(parent):
        self._check_and_add_portal_to_index(portal)
    if es_config.use_celery:
        path = '/'.join([p for p in obj.getPhysicalPath() if p != ''])
        index_content.delay(path, obj.absolute_url())
    else:
        es_kwargs = self.get_payload(obj)
        if es_kwargs is None:
            # serialization failed; get_payload already unblocked
            return
        try:
            es.index(**es_kwargs)
        except Exception:
            uid = api.content.get_uuid(obj)
            logger.exception('indexing of {0} failed.'.format(uid))
            import Globals
            if Globals.DevelopmentMode:
                logger.debug(pformat(es_kwargs, indent=2))
    query_blocker.unblock()

def _check_and_add_portal_to_index(self, portal):
    # At first the portal is not in ES. Also, the portal has no UUID;
    # bad enough, so for ES we give it one. If the portal already has
    # our UUID we assume it is indexed as well.
    annotations = IAnnotations(portal)
    if ES_PORTAL_UUID_KEY in annotations:
        # looks like we're indexed already
        return
    annotations[ES_PORTAL_UUID_KEY] = uid = uuid.uuid4().hex
    serializer = getMultiAdapter((portal, getRequest()), ISerializeToJson)
    data = serializer()
    self._reduce_data(data)
    es_kwargs = dict(
        index=index_name(),
        doc_type='content',  # why do we still need it in ES6+?
        id=uid,
        body=data,
    )
    es = get_ingest_client()
    try:
        es.index(**es_kwargs)
    except Exception:
        logger.exception('indexing of {0} failed'.format(uid))

def _es_pipeline_name(self):
    return 'attachment_ingest_{0}'.format(index_name())

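# For illustration (an assumed example value, not asserted by the code): with
# index_name() returning 'plone', the ingest pipeline id becomes:
assert 'attachment_ingest_{0}'.format('plone') == 'attachment_ingest_plone'
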
def _apply_index(self, request):
    """Apply the index to query parameters given in 'request'.

    The argument should be a mapping object.

    If the request does not contain the needed parameters, then
    None is returned.

    If the request contains a parameter with the name of the column
    and this parameter is either a Record or a class instance then
    it is assumed that the parameters of this index are passed as
    attribute (Note: this is the recommended way to pass parameters
    since Zope 2.4)

    Otherwise two objects are returned. The first object is a
    ResultSet containing the record numbers of the matching records.
    The second object is a tuple containing the names of all data
    fields used.
    """
    config = get_configuration()
    timeout = getattr(config, 'request_timeout', 20)
    search_fields = getattr(config, 'search_fields', None)
    if not search_fields:
        search_fields = SEARCH_FIELDS
    search_fields = search_fields.split()
    logger.info(search_fields)
    if query_blocker.blocked:
        return
    record = parseIndexRequest(request, self.id)
    if record.keys is None:
        return None
    es = get_query_client()
    search = Search(using=es, index=index_name())
    search = search.params(request_timeout=timeout)
    search = search.sort('rid', '_id')
    search = search.source(include='rid')
    query_string = record.keys[0].decode('utf8')
    logger.info(query_string)
    if '*' in query_string:
        query_string = query_string.replace('*', ' ')
    query_string = query_string.strip()
    search = search.query('simple_query_string', query=query_string,
                          fields=search_fields)
    results_count = search.count()
    search = search.params(request_timeout=timeout, size=BATCH_SIZE,
                           track_scores=True)
    # setup highlighting
    for field in search_fields:
        name = field.split('^')[0]
        if name == 'title':
            # title shows up in results anyway
            continue
        search = search.highlight(name, fragment_size=FRAGMENT_SIZE)
    # initial return value, other batches to be applied
    retval = IIBTree()
    highlights = OOBTree()
    last_seen = None
    count = 0
    batch_count = results_count / BATCH_SIZE
    if results_count % BATCH_SIZE != 0:
        batch_count = batch_count + 1
    for i in xrange(batch_count):
        if last_seen is not None:
            search = search.update_from_dict({'search_after': last_seen})
        try:
            results = search.execute(ignore_cache=True)
        except TransportError:
            # No es client, return empty results
            logger.exception('ElasticSearch client not available.')
            return IIBTree(), (self.id, )
        for r in results:
            rid = getattr(r, 'rid', None)
            if rid is not None:
                retval[rid] = int(10000 * float(r.meta.score))
            # Index query returns only rids, so we need
            # to save highlights for later use
            highlight_list = []
            if getattr(r.meta, 'highlight', None) is not None:
                for key in dir(r.meta.highlight):
                    highlight_list.extend(r.meta.highlight[key])
            highlights[r.meta.id] = highlight_list
            last_seen = [rid, r.meta.id]
            count = count + 1
    # store highlights
    try:
        annotations = IAnnotations(self.REQUEST)
        annotations[HIGHLIGHT_KEY] = highlights
    except TypeError:
        # maybe we are in a test
        pass
    return retval, (self.id, )

def _apply_index(self, request):
    """Apply the index to query parameters given in 'request'.

    The argument should be a mapping object.

    If the request does not contain the needed parameters, then
    None is returned.

    If the request contains a parameter with the name of the column
    and this parameter is either a Record or a class instance then
    it is assumed that the parameters of this index are passed as
    attribute (Note: this is the recommended way to pass parameters
    since Zope 2.4)

    Otherwise two objects are returned. The first object is a
    ResultSet containing the record numbers of the matching records.
    The second object is a tuple containing the names of all data
    fields used.
    """
    config = get_configuration()
    timeout = getattr(config, 'request_timeout', 20)
    search_fields = getattr(config, 'search_fields', None)
    if not search_fields:
        search_fields = SEARCH_FIELDS
    search_fields = search_fields.split()
    if query_blocker.blocked:
        return
    record = parseIndexRequest(request, self.id)
    if record.keys is None:
        return None
    es = get_query_client()
    search = Search(using=es, index=index_name())
    search = search.params(
        request_timeout=timeout,
        size=BATCH_SIZE,
        preserve_order=True,
    )
    search = search.source(include='rid')
    query_string = record.keys[0]
    if query_string and query_string.startswith('*'):
        # plone.app.querystring contains op sends a leading *, remove it
        query_string = query_string[1:]
    search = search.query('simple_query_string', query=query_string,
                          fields=search_fields)
    # setup highlighting
    for field in search_fields:
        name = field.split('^')[0]
        if name == 'title':
            # title shows up in results anyway
            continue
        search = search.highlight(name, fragment_size=FRAGMENT_SIZE)
    try:
        result = search.scan()
    except TransportError:
        # No es client, return empty results
        logger.exception('ElasticSearch client not available.')
        return IIBTree(), (self.id, )
    # initial return value, other batches to be applied
    retval = IIBTree()
    highlights = OOBTree()
    for r in result:
        if getattr(r, 'rid', None) is None:
            # Something was indexed with no rid. Ignore for now; this is
            # only for highlights, so no big deal if we skip one.
            continue
        retval[r.rid] = int(10000 * float(r.meta.score))
        # Index query returns only rids, so we need
        # to save highlights for later use
        highlight_list = []
        if getattr(r.meta, 'highlight', None) is not None:
            for key in dir(r.meta.highlight):
                highlight_list.extend(r.meta.highlight[key])
        highlights[r.meta.id] = highlight_list
    # store highlights
    try:
        annotations = IAnnotations(self.REQUEST)
        annotations[HIGHLIGHT_KEY] = highlights
    except TypeError:
        # maybe we are in a test
        pass
    return retval, (self.id, )

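# Both _apply_index variants store scores in an IIBTree, which only accepts
# integer values, so the float ES relevance score is scaled. A minimal
# illustration of the scheme used above:
def zcatalog_score(es_score):
    # keep four decimal places of the ES score as an integer rank
    return int(10000 * float(es_score))

assert zcatalog_score(1.2345678) == 12345
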
class ElasticSearchIndexQueueProcessor(object):
    """A queue processor for ElasticSearch."""

    @property
    def _es_pipeline_name(self):
        return 'attachment_ingest_{0}'.format(index_name())

    def _create_index(self, es):
        es.indices.create(index=index_name())

    @ram.cache(lambda *args: index_name())
    def _check_for_ingest_pipeline(self, es):
        # do we have the ingest pipeline?
        try:
            es.ingest.get_pipeline(self._es_pipeline_name)
        except NotFoundError:
            es.ingest.put_pipeline(self._es_pipeline_name, INGEST_PIPELINES)

    def _check_for_mapping(self, es):
        if not self._get_mapping(es):
            raise ValueError('Can not fetch mapping.')

    def _get_mapping(self, es):
        request = getRequest()
        mapping = getattr(request, CACHE_ATTRIBUTE, None)
        if mapping is not None:
            return mapping
        try:
            mapping = es.indices.get_mapping(index=index_name())
        except TransportError as e:
            if e.status_code == 404:
                self._create_index(es)
                mapping = es.indices.get_mapping(index=index_name())
            else:
                raise
        setattr(request, CACHE_ATTRIBUTE, mapping)
        return mapping

    def _auto_mapping(self, es, obj, data):
        mappings = self._get_mapping(es)
        old_map = mappings[index_name()]['mappings']
        old_map = old_map.get('content', {}).get('properties', {})
        new_map = {}
        for key in data:
            if key in old_map:
                continue
            # figure out field type
            value = data[key]
            for pytype in MAPPING_TYPE_MAP:
                if isinstance(value, pytype):
                    new_map[key] = MAPPING_TYPE_MAP[pytype]
                    break
        for record in INGEST_PIPELINES['processors']:
            name = record['attachment']['target_field']
            if name not in old_map:
                new_map[name] = {
                    'type': 'nested',
                    'properties': {
                        'content': MAPPING_TYPE_MAP[basestring],
                        'content_length': MAPPING_TYPE_MAP[int],
                        'content_type': MAPPING_TYPE_MAP[basestring],
                        'language': MAPPING_TYPE_MAP[basestring],
                    },
                }
        if not new_map:
            return
        new_map = {
            'content': {
                'properties': new_map,
            },
        }
        es.indices.put_mapping(
            doc_type='content',
            index=index_name(),
            body=new_map,
        )
        request = getRequest()
        setattr(request, CACHE_ATTRIBUTE, None)

    def _check_and_add_portal_to_index(self, portal):
        # At first the portal is not in ES. Also, the portal has no UUID;
        # bad enough, so for ES we give it one. If the portal already has
        # our UUID we assume it is indexed as well.
        annotations = IAnnotations(portal)
        if ES_PORTAL_UUID_KEY in annotations:
            # looks like we're indexed already
            return
        annotations[ES_PORTAL_UUID_KEY] = uid = uuid.uuid4().hex
        serializer = getMultiAdapter((portal, getRequest()), ISerializeToJson)
        data = serializer()
        self._reduce_data(data)
        es_kwargs = dict(
            index=index_name(),
            doc_type='content',  # why do we still need it in ES6+?
            id=uid,
            body=data,
        )
        es = get_ingest_client()
        try:
            es.index(**es_kwargs)
        except Exception:
            logger.exception('indexing of {0} failed'.format(uid))

    def _reduce_data(self, data):
        for key in KEYS_TO_REMOVE:
            if key in data:
                del data[key]

    def _iterate_binary_fields(self, obj, data):
        for record in INGEST_PIPELINES['processors']:
            yield record['attachment']['field']

    def _expand_binary_data(self, obj, data):
        for fieldname in self._iterate_binary_fields(obj, data):
            if fieldname not in data:
                continue
            field = getattr(obj, fieldname, None)
            if field is None:
                continue
            data[fieldname + '_meta'] = data[fieldname]
            if IBlobby.providedBy(field):
                with field.open() as fh:
                    data[fieldname] = base64.b64encode(fh.read())
            elif IRichTextValue.providedBy(field):
                data[fieldname] = base64.b64encode(
                    data[fieldname + '_meta']['data'].encode('utf8'),
                )

    def _expand_rid(self, obj, data):
        cat = api.portal.get_tool('portal_catalog')
        path = '/'.join(obj.getPhysicalPath())
        data['rid'] = cat.getrid(path)

    def index(self, obj, attributes=None):
        start = time.time()
        query_blocker.block()
        es = get_ingest_client()
        if es is None:
            logger.warning('No ElasticSearch client available.')
            query_blocker.unblock()  # do not leave queries blocked on early exit
            return
        self._check_for_ingest_pipeline(es)
        self._check_for_mapping(es)  # will also create the index
        try:
            serializer = getMultiAdapter((obj, getRequest()), ISerializeToJson)
        except ComponentLookupError:
            logger.exception(
                'Abort ElasticSearch Indexing for {0}'.format(
                    obj.absolute_url(),
                ),
            )
            query_blocker.unblock()
            return
        try:
            data = serializer()
        except ComponentLookupError:
            logger.exception(
                'Abort ElasticSearch Indexing for {0}'.format(
                    obj.absolute_url(),
                ),
            )
            query_blocker.unblock()
            return
        logger.info('TOOK after serializing: {0:2.3f}'.format(time.time() - start))
        self._reduce_data(data)
        self._expand_rid(obj, data)
        self._expand_binary_data(obj, data)
        self._auto_mapping(es, obj, data)
        uid = api.content.get_uuid(obj)
        es_kwargs = dict(
            index=index_name(),
            doc_type='content',
            id=uid,
            pipeline=self._es_pipeline_name,
            body=data,
        )
        parent = aq_parent(obj)
        portal = api.portal.get()
        if aq_base(portal) is aq_base(parent):
            self._check_and_add_portal_to_index(portal)
            # annotations = IAnnotations(portal)
            # es_kwargs['parent'] = annotations[ES_PORTAL_UUID_KEY]
        else:
            # es_kwargs['parent'] = api.content.get_uuid(parent)
            pass
        logger.info('TOOK after preprocessing: {0:2.3f}'.format(time.time() - start))
        try:
            es.index(**es_kwargs)
        except Exception:
            logger.exception(
                'indexing of {0} failed.\n{1}'.format(
                    uid,
                    pformat(es_kwargs, indent=2),
                ),
            )
        query_blocker.unblock()
        logger.info('TOOK overall: {0:2.3f}'.format(time.time() - start))

    def reindex(self, obj, attributes=None, update_metadata=1):
        self.index(obj, attributes)

    def unindex(self, obj):
        es = get_ingest_client()
        if es is None:
            logger.warning('No ElasticSearch client available.')
            return
        uid = api.content.get_uuid(obj)
        try:
            es.delete(
                index=index_name(),
                doc_type='content',
                id=uid,
            )
        except Exception:
            logger.exception('unindexing of {0} failed'.format(uid))

    def begin(self):
        pass

    def commit(self, wait=None):
        pass

    def abort(self):
        pass

def _create_index(self, es):
    es.indices.create(index=index_name())

def search(self):
    es = get_query_client()
    s = Search(doc_type=self.doc_types, index=index_name(), using=es)
    s = s.params(size=BATCH_SIZE)
    return s.response_class(FacetedResponse)

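# Hedged usage sketch for search() above (elasticsearch-dsl): the query
# fields and weights are illustrative, self.doc_types is assumed to be set
# elsewhere, and the FacetedResponse wiring is taken from the snippet rather
# than a verified API of this package.
s = self.search()
s = s.query('simple_query_string', query='foo bar',
            fields=['title^2', 'description'])
response = s.execute()
for hit in response:
    print('{0} {1}'.format(hit.meta.id, hit.meta.score))
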
def _create_index(self, es):
    es.indices.create(index=index_name())
    self._setup_mapping(es)

class ElasticSearchIndexQueueProcessor(object):
    """A queue processor for ElasticSearch."""

    @property
    def _es_pipeline_name(self):
        return 'attachment_ingest_{0}'.format(index_name())

    def _create_index(self, es):
        es.indices.create(index=index_name())
        self._setup_mapping(es)

    @ram.cache(lambda *args: index_name())
    def _check_for_ingest_pipeline(self, es):
        # do we have the ingest pipeline?
        try:
            es.ingest.get_pipeline(self._es_pipeline_name)
        except NotFoundError:
            es.ingest.put_pipeline(self._es_pipeline_name, INGEST_PIPELINES)

    def _check_for_mapping(self, es):
        if not self._get_mapping(es):
            raise ValueError('Can not fetch mapping.')

    def _get_mapping(self, es):
        request = getRequest()
        mapping = getattr(request, CACHE_ATTRIBUTE, None)
        if mapping is not None:
            return mapping
        try:
            mapping = es.indices.get_mapping(index=index_name())
        except TransportError as e:
            if e.status_code == 404:
                self._create_index(es)
                mapping = es.indices.get_mapping(index=index_name())
            else:
                raise
        setattr(request, CACHE_ATTRIBUTE, mapping)
        return mapping

    def _setup_mapping(self, es):
        es.indices.put_mapping(
            doc_type='content',
            index=index_name(),
            body=INITIAL_MAPPING,
        )
        request = getRequest()
        setattr(request, CACHE_ATTRIBUTE, None)

    def _check_and_add_portal_to_index(self, portal):
        # At first the portal is not in ES. Also, the portal has no UUID;
        # bad enough, so for ES we give it one. If the portal already has
        # our UUID we assume it is indexed as well.
        annotations = IAnnotations(portal)
        if ES_PORTAL_UUID_KEY in annotations:
            # looks like we're indexed already
            return
        annotations[ES_PORTAL_UUID_KEY] = uid = uuid.uuid4().hex
        serializer = getMultiAdapter((portal, getRequest()), ISerializeToJson)
        data = serializer()
        self._reduce_data(data)
        es_kwargs = dict(
            index=index_name(),
            doc_type='content',  # why do we still need it in ES6+?
            id=uid,
            body=data,
        )
        es = get_ingest_client()
        try:
            es.index(**es_kwargs)
        except Exception:
            logger.exception('indexing of {0} failed'.format(uid))

    def _fix_at_fields(self, obj, data):
        # unlike dexterity, AT text fields use the same serializer as rich
        # text, which wrecks our index mapping
        if IBaseContent.providedBy(obj):
            for field in AT_SIMPLE_TEXT_FIELDS:
                if field in data and isinstance(data[field], dict):
                    data[field] = data[field]['data']

    def _reduce_data(self, data):
        for key in KEYS_TO_REMOVE:
            if key in data:
                del data[key]

    def _iterate_binary_fields(self, obj, data):
        for record in INGEST_PIPELINES['processors']:
            if 'attachment' not in record:
                continue
            yield record['attachment']['field']

    def _expand_binary_data(self, obj, data):
        max_size = es_config.max_blobsize
        for fieldname in self._iterate_binary_fields(obj, data):
            if fieldname not in data:
                data[fieldname] = None
                continue
            field = getattr(obj, fieldname, None)
            if field is None:
                data[fieldname] = None
                continue
            data[fieldname + '_meta'] = data[fieldname]
            if IBlobby.providedBy(field):
                with field.open() as fh:
                    data[fieldname] = base64.b64encode(fh.read())
            elif IRichTextValue.providedBy(field):
                data[fieldname] = base64.b64encode(
                    data[fieldname + '_meta']['data'].encode('utf8'),
                )
            if max_size and len(data[fieldname]) > max_size:
                data[fieldname] = None
                del data[fieldname + '_meta']
                logger.info(
                    'File too big for ElasticSearch Indexing: {0}'.format(
                        obj.absolute_url(),
                    ),
                )

    def _expand_rid(self, obj, data):
        cat = api.portal.get_tool('portal_catalog')
        path = '/'.join(obj.getPhysicalPath())
        data['rid'] = cat.getrid(path)

    def get_payload(self, obj):
        try:
            serializer = getMultiAdapter((obj, getRequest()), ISerializeToJson)
        except ComponentLookupError:
            logger.exception(
                'Abort ElasticSearch Indexing for {0}'.format(
                    obj.absolute_url(),
                ),
            )
            query_blocker.unblock()
            return
        try:
            data = serializer()
        except ComponentLookupError:
            logger.exception(
                'Abort ElasticSearch Indexing for {0}'.format(
                    obj.absolute_url(),
                ),
            )
            query_blocker.unblock()
            return
        self._reduce_data(data)
        if HAS_ARCHETYPES:
            self._fix_at_fields(obj, data)
        self._expand_rid(obj, data)
        self._expand_binary_data(obj, data)
        uid = api.content.get_uuid(obj)
        es_kwargs = dict(
            index=index_name(),
            doc_type='content',
            id=uid,
            pipeline=self._es_pipeline_name,
            body=data,
            request_timeout=es_config.request_timeout,
        )
        return es_kwargs

    def index(self, obj, attributes=None):
        query_blocker.block()
        es = get_ingest_client()
        if es is None:
            logger.warning('No ElasticSearch client available.')
            query_blocker.unblock()
            return
        try:
            self._check_for_ingest_pipeline(es)
            self._check_for_mapping(es)  # will also create the index
        except TransportError:
            logger.exception(
                'ElasticSearch connection failed for {0}'.format(
                    obj.absolute_url(),
                ),
            )
            query_blocker.unblock()
            return
        parent = aq_parent(obj)
        portal = api.portal.get()
        if aq_base(portal) is aq_base(parent):
            self._check_and_add_portal_to_index(portal)
        if es_config.use_celery:
            path = '/'.join([p for p in obj.getPhysicalPath() if p != ''])
            index_content.delay(path)
        else:
            es_kwargs = self.get_payload(obj)
            if es_kwargs is None:
                # serialization failed; get_payload already unblocked
                return
            try:
                es.index(**es_kwargs)
            except Exception:
                uid = api.content.get_uuid(obj)
                logger.exception('indexing of {0} failed.'.format(uid))
                import Globals
                if Globals.DevelopmentMode:
                    logger.debug(pformat(es_kwargs, indent=2))
        query_blocker.unblock()

    def reindex(self, obj, attributes=None, update_metadata=1):
        self.index(obj, attributes)

    def unindex(self, obj):
        index = index_name()
        if index is None:
            # portal no longer there
            return
        uid = api.content.get_uuid(obj)
        if es_config.use_celery:
            unindex_content.delay(
                index=index,
                doc_type='content',
                uid=uid,
                timeout=es_config.request_timeout,
            )
        else:
            es = get_ingest_client()
            if es is None:
                logger.warning('No ElasticSearch client available.')
                return
            try:
                es.delete(
                    index=index,
                    doc_type='content',
                    id=uid,
                    request_timeout=es_config.request_timeout,
                )
            except Exception:
                logger.exception('unindexing of {0} failed'.format(uid))

    def begin(self):
        pass

    def commit(self, wait=None):
        pass

    def abort(self):
        pass