Example #1
    def test_haystack_identifier_method(self):
        # The custom implementation returns the MD5 hash of the key value by
        # default:
        get_identifier = _lookup_identifier_method()
        self.assertEqual(get_identifier("a.b.c"), "553f764f7b436175c0387e22b4a19213")

        # … but it also supports a custom override mechanism which would
        # definitely fail with the default implementation:
        class custom_id_class(object):
            def get_custom_haystack_id(self):
                return "CUSTOM"

        self.assertEqual(get_identifier(custom_id_class()), "CUSTOM")
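A minimal sketch of the custom identifier method exercised above (hypothetical; `_lookup_identifier_method` and the real implementation live outside this snippet):

import hashlib

def custom_get_identifier(obj_or_string):
    # Honor the opt-in override hook used by custom_id_class above...
    if hasattr(obj_or_string, "get_custom_haystack_id"):
        return obj_or_string.get_custom_haystack_id()
    # ...otherwise return the MD5 hash of the key value, per the first assertion.
    return hashlib.md5(obj_or_string.encode("utf-8")).hexdigest()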
Example #2
    def test_get_identifier(self):
        # Various invalid identifiers.
        self.assertRaises(AttributeError, get_identifier, "core")
        self.assertRaises(AttributeError, get_identifier, "core.mockmodel")
        self.assertRaises(AttributeError, get_identifier, "core.mockmodel.foo")
        self.assertRaises(AttributeError, get_identifier, "core-app.mockmodel.1")

        # Valid string identifier.
        self.assertEqual(get_identifier("core.mockmodel.1"), "core.mockmodel.1")

        # Valid object.
        mock = MockModel.objects.get(pk=1)
        self.assertEqual(get_identifier(mock), "core.mockmodel.1")
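For orientation, a sketch consistent with these assertions (the real helper is haystack.utils.get_identifier, whose details may differ): string identifiers must match the `app_label.model_name.pk` shape, and model instances are converted into it.

import re

# Hypothetical re-implementation, grounded only in the assertions above.
IDENTIFIER_REGEX = re.compile(r"^[\w\d_]+\.[\w\d_]+\.\d+$")

def get_identifier_sketch(obj_or_string):
    if isinstance(obj_or_string, str):
        # Rejects "core", "core.mockmodel", "core.mockmodel.foo" and
        # "core-app.mockmodel.1"; accepts "core.mockmodel.1".
        if not IDENTIFIER_REGEX.match(obj_or_string):
            raise AttributeError("Unrecognized identifier: %s" % obj_or_string)
        return obj_or_string
    # Model instances become "app_label.model_name.pk".
    return "%s.%s.%s" % (
        obj_or_string._meta.app_label,
        obj_or_string._meta.model_name,
        obj_or_string.pk,
    )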
Example #3
 def remove(self, obj_or_string, commit=True):
     solr_id = get_identifier(obj_or_string)
     
     try:
         self.conn.delete(id=solr_id, commit=commit)
     except (IOError, SolrError) as e:
         self.log.error("Failed to remove document '%s' from Solr: %s", solr_id, e)
Example #4
    def update(self, index, iterable, commit=True):
        if self._do_setup():
            return

        prepped_docs = []

        for obj in iterable:
            try:
                prepped_data = index.full_prepare(obj)
                final_data = []

                for key, value in prepped_data.items():
                    field_type = self.schema[key]
                    final_data.append(field_type(name=key, value=value))

                document = search.Document(doc_id=prepped_data[ID], fields=final_data)
                prepped_docs.append(document)
            except search.Error as e:
                if not self.silently_fail:
                    raise
                self.log.error("{0} while preparing object for update".format(e.__class__.__name__),
                    exc_info=True, extra={
                        "data": {
                            "index": index,
                            "object": get_identifier(obj),
                        }
                    })

        self.index.put(prepped_docs)
Example #5
    def remove(self, obj_or_string, commit=True):
        doc_id = get_identifier(obj_or_string)

        if not self.setup_complete:
            try:
                self.setup()
            except self.TransportError as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
                return

        try:
            self.conn.delete(index=self.index_name, doc_type='modelresult', id=doc_id, ignore=404)

            if commit:
                self.conn.indices.refresh(index=self.index_name)
        except self.TransportError as e:
            if e.status_code == 404 and e.info.get('ok') and e.info.get('found') is False:
                self.log.warning("Tried removing nonexistent document '%s' from ElasticSearch: %s.", doc_id, e)
            elif not self.silently_fail:
                raise
            else:
                self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
Example #6
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                })
Example #7
    def update(self, index, iterable, commit=True):
        docs = []

        for obj in iterable:
            try:
                docs.append(index.full_prepare(obj))
            except UnicodeDecodeError:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(
                    u"UnicodeDecodeError while preparing object for update",
                    exc_info=True,
                    extra={"data": {"index": index, "object": get_identifier(obj)}},
                )

        if len(docs) > 0:
            try:
                self.conn.add(docs, commit=commit, boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s", e)
Example #8
    def remove(self, obj_or_string, commit=True):
        doc_id = get_identifier(obj_or_string)

        if not self.setup_complete:
            try:
                self.setup()
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                self.log.error(
                    "Failed to remove document '%s' from Elasticsearch: %s",
                    doc_id,
                    e,
                    exc_info=True,
                )
                return

        try:
            self.conn.delete(
                index=self.index_name, doc_type="modelresult", id=doc_id, ignore=404
            )

            if commit:
                self.conn.indices.refresh(index=self.index_name)
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to remove document '%s' from Elasticsearch: %s",
                doc_id,
                e,
                exc_info=True,
            )
Example #9
    def remove(self, obj, commit=True):

        if not self.setup_complete:
            self.setup()

        index = self._get_index_for(obj._meta.model)
        index.deleteObject(get_identifier(obj))
Example #10
    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None, **kwargs):
        from haystack import connections

        if not self.setup_complete:
            self.setup()

        # Handle deferred models.
        if get_proxied_model and hasattr(model_instance, '_deferred') and model_instance._deferred:
            model_klass = get_proxied_model(model_instance._meta)
        else:
            model_klass = type(model_instance)

        index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {}

        if start_offset is not None:
            params['search_from'] = start_offset

        if end_offset is not None:
            params['search_size'] = end_offset - start_offset

        doc_id = get_identifier(model_instance)

        try:
            raw_results = self.conn.morelikethis(self.index_name, 'modelresult', doc_id, [field_name], **params)
        except (requests.RequestException, pyelasticsearch.ElasticSearchError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to fetch More Like This from Elasticsearch for document '%s': %s", doc_id, e)
            raw_results = {}

        return self._process_results(raw_results, result_class=result_class)
Example #11
    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None, **kwargs):
        from haystack import connections

        if not self.setup_complete:
            self.setup()

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {}

        if start_offset is not None:
            params['search_from'] = start_offset

        if end_offset is not None:
            params['search_size'] = end_offset - start_offset

        doc_id = get_identifier(model_instance)

        try:
            raw_results = self.conn.more_like_this(self.index_name, 'modelresult', doc_id, [field_name], **params)
        except (requests.RequestException, pyelasticsearch.ElasticHttpError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to fetch More Like This from Elasticsearch for document '%s': %s", doc_id, e)
            raw_results = {}

        return self._process_results(raw_results, result_class=result_class)
Example #12
    def remove(self, obj_or_string, commit=True):
        doc_id = get_identifier(obj_or_string)

        # django-haystack defaults to namespaced ids like "layers.layer.83", but the
        # GeoNode SearchIndexes override the default ids with plain ResourceBase ids.
        if isinstance(obj_or_string, ResourceBase):
            doc_id = getattr(obj_or_string, 'id')

        if not self.setup_complete:
            try:
                self.setup()
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
                return

        try:
            self.conn.delete(index=self.index_name, doc_type='modelresult', id=doc_id, ignore=404)

            if commit:
                self.conn.indices.refresh(index=self.index_name)
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
Example #13
 def remove(self, obj_or_string, commit=True):
     if not self.setup_complete:
         self.setup()
     
     self.index = self.index.refresh()
     whoosh_id = get_identifier(obj_or_string)
     self.index.delete_by_query(q=self.parser.parse(u'id:"%s"' % whoosh_id))
Example #14
    def enqueue(self, action, instance):
        """
        Shoves a message about how to update the index into the queue.

        This is a standardized string, resembling something like::

            update:notes.note.23
            # ...or...
            delete:weblog.entry.8
        """
        message = "%s:%s" % (action, get_identifier(instance))
        try:
            return queue.write(message)
        except QueueException:
            import smtplib
            from email.mime.text import MIMEText
            
            me = "*****@*****.**"
            you = "*****@*****.**"
            msg = MIMEText("the following message could not be queued\n\n" + message)
            msg['Subject'] = 'Error - QueueException'
            msg['From'] = me
            msg['To'] = you

            s = smtplib.SMTP('localhost')
            s.sendmail(me, [you], msg.as_string())
            s.quit()
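A hypothetical consumer counterpart only needs to split the standardized message back apart; identifiers like "notes.note.23" contain no colons, so a single split is safe:

def process_message(message, backend):
    # "update:notes.note.23" -> ("update", "notes.note.23")
    action, identifier = message.split(":", 1)
    if action == "delete":
        backend.remove(identifier)
    elif action == "update":
        # Resolve the identifier back to an object and reindex it here.
        pass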
Example #15
 def prepare(self, obj):
     """
     Fetches and adds/alters data before indexing.
     """
     self.prepared_data = {
         'id': get_identifier(obj),
         'django_ct': "%s.%s" % (obj._meta.app_label, obj._meta.module_name),
         'django_id': force_unicode(obj.pk),
     }
     
     for field_name, field in self.fields.items():
         # Use the possibly overridden name, which will default to the
         # variable name of the field.
         self.prepared_data[field.index_fieldname] = field.prepare(obj)
     
     for field_name, field in self.fields.items():
         if hasattr(self, "prepare_%s" % field_name):
             value = getattr(self, "prepare_%s" % field_name)(obj)
             self.prepared_data[field.index_fieldname] = value
     
     # Remove any fields that lack a value and are `null=True`.
     for field_name, field in self.fields.items():
         if field.null is True:
             if self.prepared_data[field.index_fieldname] is None:
                  del self.prepared_data[field.index_fieldname]
     
     return self.prepared_data
Example #16
def enqueue_task(action, instance, **kwargs):
    """
    Common utility for enqueing a task for the given action and
    model instance.
    """
    identifier = get_identifier(instance)
    options = {}
    if settings.CELERY_HAYSTACK_QUEUE:
        options['queue'] = settings.CELERY_HAYSTACK_QUEUE
    if settings.CELERY_HAYSTACK_COUNTDOWN:
        options['countdown'] = settings.CELERY_HAYSTACK_COUNTDOWN

    task = get_update_task()

    def task_func():
        return task.apply_async((action, identifier), kwargs, **options)

    if hasattr(transaction, 'on_commit'):
        # Django 1.9 on_commit hook
        transaction.on_commit(
            task_func
        )
    elif hasattr(connection, 'on_commit'):
        # Django-transaction-hooks
        connection.on_commit(
            task_func
        )
    else:
        task_func()
Example #17
    def remove(self, obj_or_string, commit=True):
        """
        Removes a document/object from the backend. Can be either a model
        instance or the identifier (i.e. ``app_name.model_name.id``) in the
        event the object no longer exists.

        :param obj_or_string: The model instance or the identifier.
        :param commit: True to refresh the search index after the remove.
        """
        doc_id = get_identifier(obj_or_string)

        if not self.setup_complete:
            try:
                self.setup()
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e,
                               exc_info=True)
                return

        try:
            self.conn.delete(index=self.index_name, doc_type='modelresult', id=doc_id, ignore=404)

            if commit:
                self.conn.indices.refresh(index=self.index_name)
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e, exc_info=True)
Example #18
def enqueue_task(action, instance):
    """
    Common utility for enqueing a task for the given action and
    model instance.
    """
    identifier = get_identifier(instance)
    get_update_task().delay(action, identifier)
Example #19
    def more_like_this(
        self,
        model_instance,
        additional_query_string=None,
        start_offset=0,
        end_offset=None,
        limit_to_registered_models=None,
        result_class=None,
        **kwargs
    ):
        from haystack import connections

        # Handle deferred models.
        if get_proxied_model and hasattr(model_instance, "_deferred") and model_instance._deferred:
            model_klass = get_proxied_model(model_instance._meta)
        else:
            model_klass = type(model_instance)

        index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {"fl": "*,score"}

        if start_offset is not None:
            params["start"] = start_offset

        if end_offset is not None:
            params["rows"] = end_offset

        narrow_queries = set()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True)

        if limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            if narrow_queries is None:
                narrow_queries = set()

            registered_models = self.build_models_list()

            if len(registered_models) > 0:
                narrow_queries.add("%s:(%s)" % (DJANGO_CT, " OR ".join(registered_models)))

        if additional_query_string:
            narrow_queries.add(additional_query_string)

        if narrow_queries:
            params["fq"] = list(narrow_queries)

        query = "%s:%s" % (ID, get_identifier(model_instance))

        try:
            raw_results = self.conn.more_like_this(query, field_name, **params)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to fetch More Like This from Solr for document '%s': %s", query, e)
            raw_results = EmptyResults()

        return self._process_results(raw_results, result_class=result_class)
Example #20
    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None, **kwargs):
        from haystack import connections

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {
            'fl': '*,score',
        }

        if start_offset is not None:
            params['start'] = start_offset

        if end_offset is not None:
            params['rows'] = end_offset

        narrow_queries = set()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices)))

        if additional_query_string:
            narrow_queries.add(additional_query_string)

        if narrow_queries:
            params['fq'] = list(narrow_queries)

        query = "%s:%s" % (ID, get_identifier(model_instance))

        try:
            raw_results = self.conn.more_like_this(query, field_name, **params)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to fetch More Like This from Solr for document '%s': %s",
                           query, e, exc_info=True)
            raw_results = EmptyResults()

        return self._process_results(raw_results, result_class=result_class)
Example #21
def trigger_index_update(klass, instance_pk):
    try:
        from celery_haystack.utils import get_update_task
    except ImportError:
        return
    task = get_update_task()
    task.delay('update', get_identifier(klass(id=instance_pk)))
Example #22
def search_index_signal_handler(instance, signal, **kwargs):
    """
    Signal handler for when indexable objects are saved or deleted.

    The indexing will run after the transaction commits, which will generally
    mean that all of the related ManyToMany data will be saved and ready.

    """

    if search_update_options["disabled"]:
        return

    deleting = signal is post_delete

    if deleting:
        # When deleting, pass in an identifier string instead of the instance.
        # This is because Django will unset the instance's pk before the update
        # method runs, making it impossible to determine afterwards.
        item = get_identifier(instance)
    else:
        item = instance

    if search_update_options["async"]:
        queue_update(item, remove=deleting)
    else:
        update_object(item, remove=deleting)
Example #23
    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None, **kwargs):
        from haystack import connections

        # Handle deferred models.
        if get_proxied_model and hasattr(model_instance, '_deferred') and model_instance._deferred:
            model_klass = get_proxied_model(model_instance._meta)
        else:
            model_klass = type(model_instance)

        index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {
            'fl': '*,score',
        }

        if start_offset is not None:
            params['start'] = start_offset

        if end_offset is not None:
            params['rows'] = end_offset

        narrow_queries = set()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(['%s.%s' % (model._meta.app_label, model._meta.module_name) for model in models])
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices)))

        if additional_query_string:
            narrow_queries.add(additional_query_string)

        if narrow_queries:
            params['fq'] = list(narrow_queries)

        query = "%s:%s" % (ID, get_identifier(model_instance))

        try:
            raw_results = self.conn.more_like_this(query, field_name, **params)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to fetch More Like This from Solr for document '%s': %s", query, e)
            raw_results = EmptyResults()

        return self._process_results(raw_results, result_class=result_class)
Example #24
    def remove(self, obj_or_string, commit=True):
        solr_id = get_identifier(obj_or_string)

        try:
            kwargs = {"commit": commit, ID: solr_id}
            self.conn.delete(**kwargs)
        except (IOError, SolrError) as e:
            self.log.error("Failed to remove document '%s' from Solr: %s", solr_id, e)
Example #25
    def more_like_this(
        self,
        model_instance,
        additional_query_string=None,
        start_offset=0,
        end_offset=None,
        models=None,
        limit_to_registered_models=None,
        result_class=None,
        **kwargs
    ):
        from haystack import connections

        if not self.setup_complete:
            self.setup()

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = (
            connections[self.connection_alias]
            .get_unified_index()
            .get_index(model_klass)
        )
        field_name = index.get_content_field()
        params = {}

        if start_offset is not None:
            params["search_from"] = start_offset

        if end_offset is not None:
            params["search_size"] = end_offset - start_offset

        doc_id = get_identifier(model_instance)

        try:
            raw_results = self.conn.mlt(
                index=self.index_name,
                doc_type="modelresult",
                id=doc_id,
                mlt_fields=[field_name],
                **params
            )
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to fetch More Like This from Elasticsearch for document '%s': %s",
                doc_id,
                e,
                exc_info=True,
            )
            raw_results = {}

        return self._process_results(raw_results, result_class=result_class)
Example #26
def search_index_delete(instance, **kwargs):
    logger = search_index_delete.get_logger(**kwargs)
    try:
        search_index = (connections['default'].get_unified_index()
                                              .get_index(instance.__class__))
        search_index.remove_object(get_identifier(instance))
    except Exception as exc:
        logger.error(exc)
        search_index_delete.retry(exc=exc)
Example #27
 def remove(self, obj):
     """
     Remove indexes for `obj` from the database.
     
     We delete all instances of `Q<app_name>.<model_name>.<pk>` which
     should be unique to this object.
     """
     database = self._database(writable=True)
     database.delete_document(DOCUMENT_ID_TERM_PREFIX + get_identifier(obj))
Example #28
    def enqueue(self, action, instance):
        it = IndexingTask(
            action=action,
            identifier=get_identifier(instance)
        )
        if action == 'update':
            it.content_object = instance

        it.save()
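A hypothetical shape for the IndexingTask model assumed above, using a generic relation so content_object can point at any indexed instance:

from django.contrib.contenttypes.fields import GenericForeignKey
from django.contrib.contenttypes.models import ContentType
from django.db import models

class IndexingTask(models.Model):
    action = models.CharField(max_length=16)       # e.g. "update" or "delete"
    identifier = models.CharField(max_length=255)  # e.g. "notes.note.23"
    content_type = models.ForeignKey(ContentType, null=True, blank=True,
                                     on_delete=models.CASCADE)
    object_id = models.PositiveIntegerField(null=True, blank=True)
    content_object = GenericForeignKey("content_type", "object_id")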
Example #29
    def remove(self, obj_or_string, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)
        self.index.delete_by_query(q=self.parser.parse(u'id:"%s"' % whoosh_id))

        # For now, commit no matter what, as we run into locking issues otherwise.
        self.index.commit()
Example #30
    def remove(self, obj_or_string, commit=True):
        doc_id = get_identifier(obj_or_string)

        try:
            self.conn.delete(self.index_name, 'modelresult', doc_id)
        except pyes.connection.ElasticSearchException as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
Example #31
    def remove(self, obj_or_string, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)

        try:
            self.index.delete_by_query(q=self.parser.parse('%s:"%s"' %
                                                           (ID, whoosh_id)))
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to remove document '%s' from Whoosh: %s",
                whoosh_id,
                e,
                exc_info=True,
            )
Example #32
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            try:
                self.setup()
            except (requests.RequestException, pyelasticsearch.ElasticHttpError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Elasticsearch: %s", e)
                return

        prepped_docs = []

        for obj in iterable:
            try:
                prepped_data = index.full_prepare(obj)
                final_data = {}

                # Convert the data to make sure it's happy.
                for key, value in prepped_data.items():
                    final_data[key] = self._from_python(value)

                prepped_docs.append(final_data)
            except (requests.RequestException, pyelasticsearch.ElasticHttpError) as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error("%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                })

        self.conn.bulk_index(self.index_name, 'modelresult', prepped_docs, id_field=ID)

        if commit:
            self.conn.refresh(index=self.index_name)
Example #33
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            try:
                self.setup()
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Elasticsearch: %s", e, exc_info=True)
                return

        prepped_docs = []

        for obj in iterable:
            try:
                prepped_data = index.full_prepare(obj)
                final_data = {}

                # Convert the data to make sure it's happy.
                for key, value in prepped_data.items():
                    final_data[key] = self._from_python(value)
                final_data['_id'] = final_data[ID]

                prepped_docs.append(final_data)
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e.__class__.__name__, exc_info=True,
                               extra={"data": {"index": index,
                                               "object": get_identifier(obj)}})

        bulk(self.conn, prepped_docs, index=self.index_name, doc_type='modelresult')

        if commit:
            self.conn.indices.refresh(index=self.index_name)
Example #34
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            # Document boosts aren't supported in Whoosh 2.5.0+.
            if 'boost' in doc:
                del doc['boost']

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" %
                               e.__class__.__name__,
                               exc_info=True,
                               extra={
                                   "data": {
                                       "index": index,
                                       "object": get_identifier(obj)
                                   }
                               })

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()
Example #35
    def prepare(self, obj):
        """
        Fetches and adds/alters data before indexing.
        """
        self.prepared_data = {
            ID: get_identifier(obj),
            DJANGO_CT: "%s.%s" % (obj._meta.app_label, obj._meta.module_name),
            DJANGO_ID: force_unicode(obj.pk),
        }

        for field_name, field in self.fields.items():
            # Use the possibly overridden name, which will default to the
            # variable name of the field.
            self.prepared_data[field.index_fieldname] = field.prepare(obj)

        for field_name, field in self.fields.items():
            if hasattr(self, "prepare_%s" % field_name):
                value = getattr(self, "prepare_%s" % field_name)(obj)
                self.prepared_data[field.index_fieldname] = value

        return self.prepared_data
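For reference, the ID, DJANGO_CT and DJANGO_ID names used in these prepare() methods come from haystack.constants; their default values are:

ID = "id"
DJANGO_CT = "django_ct"
DJANGO_ID = "django_id"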
Example #36
    def prepare(self, obj, language):
        self.prepared_data = {
            ID: '%s.%s' % (get_identifier(obj), language.code),
            DJANGO_CT: "%s.%s" % (obj._meta.app_label, obj._meta.module_name),
            DJANGO_ID: force_unicode(obj.pk),
        }
        for field_name, field in self.fields.items():
            # Use the possibly overridden name, which will default to the
            # variable name of the field.
            self.prepared_data[field.index_fieldname] = field.prepare(obj)

            if field.use_template and field.document:
                try:
                    self.prepared_data[field.index_fieldname] = field._prepare_template(obj, language)
                except Exception:
                    # Fall back to the value prepared above if template rendering fails.
                    pass

            if hasattr(self, "prepare_%s" % field_name):
                value = getattr(self, "prepare_%s" % field_name)(obj, language)
                self.prepared_data[field.index_fieldname] = value

        return self.prepared_data
Example #37
    def remove(self, obj_or_string, commit=True):
        """
        Removes an object from the index.
        :param obj_or_string:
        :param commit:
        """
        if not self.setup_complete:
            try:
                self.setup()
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise
                doc_id = get_identifier(obj_or_string)
                self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
                return

        for language in self.languages:
            # self.log.debug('removing {0} from index {1}'.format(obj_or_string, language))
            self.index_name = self._index_name_for_language(language)
            with translation.override(language):
                super(ElasticsearchMultilingualSearchBackend, self).remove(obj_or_string,
                                                                           commit=commit)
Example #38
    def update(self, index, iterable, commit=True):
        docs = []

        for obj in iterable:
            try:
                docs.append(index.full_prepare(obj))
            except SkipDocument:
                self.log.debug("Indexing for object `%s` skipped", obj)
            except UnicodeDecodeError:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(
                    "UnicodeDecodeError while preparing object for update",
                    exc_info=True,
                    extra={
                        "data": {
                            "index": index,
                            "object": get_identifier(obj)
                        }
                    },
                )

        if len(docs) > 0:
            try:
                self.conn.add(docs,
                              commit=commit,
                              boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s",
                               e,
                               exc_info=True)
Example #39
    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None, **kwargs):
        from haystack import connections

        if not self.setup_complete:
            self.setup()

        # Keep proxy models with their own class
        if model_instance._meta.proxy:
            model_klass = type(model_instance)
        else:
            # Deferred models will have a different class ("RealClass_Deferred_fieldname")
            # which won't be in our registry:
            model_klass = model_instance._meta.concrete_model

        index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {}

        if start_offset is not None:
            params['search_from'] = start_offset

        if end_offset is not None:
            params['search_size'] = end_offset - start_offset

        doc_id = get_identifier(model_instance)

        try:
            raw_results = self.conn.mlt(index=self.index_name, doc_type='modelresult', id=doc_id, mlt_fields=[field_name], **params)
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to fetch More Like This from Elasticsearch for document '%s': %s", doc_id, e)
            raw_results = {}

        return self._process_results(raw_results, result_class=result_class)
Example #40
    def remove(self, obj_or_string, commit=True):
        doc_id = get_identifier(obj_or_string)

        if not self.setup_complete:
            try:
                self.setup()
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                self.log.error(
                    "Failed to remove document '%s' from Elasticsearch: %s",
                    doc_id,
                    e,
                    exc_info=True,
                )
                return

        try:
            self.conn.delete(
                index=self.index_name,
                id=doc_id,
                ignore=404,
                **self._get_doc_type_option(),
            )

            if commit:
                self.conn.indices.refresh(index=self.index_name)
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to remove document '%s' from Elasticsearch: %s",
                doc_id,
                e,
                exc_info=True,
            )
Example #41
    def remove(self, obj_or_string, commit=True):
        doc_id = get_identifier(obj_or_string)

        if not self.setup_complete:
            try:
                self.setup()
            except (requests.RequestException, pyelasticsearch.ElasticHttpError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
                return

        try:
            self.conn.delete(self.index_name, 'modelresult', doc_id)

            if commit:
                self.conn.refresh(index=self.index_name)
        except (requests.RequestException, pyelasticsearch.ElasticHttpError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
Example #42
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        write = AsyncWriter(self.index)

        for obj in iterable:
            try:
                doc = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug(u"Indexing for object '%s' skipped", obj)
            else:
                for key in doc:
                    doc[key] = self._from_python(doc[key])

                if 'boost' in doc:
                    del doc['boost']

                try:
                    write.update_document(**doc)
                except Exception as e:
                    if not self.silently_fail:
                        raise

                    self.log.error(u"%s while preparing object for update" %
                                   e.__class__.__name__,
                                   exc_info=True,
                                   extra={
                                       'data': {
                                           'index': index,
                                           'object': get_identifier(obj)
                                       }
                                   })

        if len(iterable) > 0:
            write.commit()
Example #43
def enqueue_task(action, instance, **kwargs):
    """
    Common utility for enqueing a task for the given action and
    model instance.
    """
    identifier = get_identifier(instance)
    options = {}
    if settings.CELERY_HAYSTACK_QUEUE:
        options['queue'] = settings.CELERY_HAYSTACK_QUEUE
    if settings.CELERY_HAYSTACK_COUNTDOWN:
        options['countdown'] = settings.CELERY_HAYSTACK_COUNTDOWN

    task = get_update_task()
    task_func = lambda: task.apply_async(
        (action, identifier), kwargs, **options)

    if hasattr(transaction, 'on_commit'):
        # Django 1.9 on_commit hook
        transaction.on_commit(task_func)
    elif hasattr(connection, 'on_commit'):
        # Django-transaction-hooks
        connection.on_commit(task_func)
    else:
        task_func()
Example #44
    def update(self, index, iterable, commit=None):
        docs = []

        if commit is None:
            commit = self.connection_options.get('COMMIT_UPDATES', True)

        for obj in iterable:
            try:
                docs.append(index.full_prepare(obj))
            except UnicodeDecodeError:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(
                    u"UnicodeDecodeError while preparing object for update",
                    exc_info=True,
                    extra={
                        "data": {
                            "index": index,
                            "object": get_identifier(obj)
                        }
                    })

        if len(docs) > 0:
            try:
                self.conn.add(docs,
                              commit=commit,
                              boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s", e)
Example #45
    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):
        from haystack import connections

        # Handle deferred models.
        if get_proxied_model and hasattr(
                model_instance, '_deferred') and model_instance._deferred:
            model_klass = get_proxied_model(model_instance._meta)
        else:
            model_klass = type(model_instance)

        index = connections[
            self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {
            'fl': '*,score',
        }

        if start_offset is not None:
            params['start'] = start_offset

        if end_offset is not None:
            params['rows'] = end_offset

        narrow_queries = set()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted([
                '%s.%s' % (model._meta.app_label, model._meta.module_name)
                for model in models
            ])
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add('%s:(%s)' %
                               (DJANGO_CT, ' OR '.join(model_choices)))

        if additional_query_string:
            narrow_queries.add(additional_query_string)

        if narrow_queries:
            params['fq'] = list(narrow_queries)

        query = "%s:%s" % (ID, get_identifier(model_instance))

        try:
            raw_results = self.conn.more_like_this(query, field_name, **params)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to fetch More Like This from Solr for document '%s': %s",
                query, e)
            raw_results = EmptyResults()

        return self._process_results(raw_results, result_class=result_class)
Example #46
 def remove(self, obj, commit=True):
     del self.docs[get_identifier(obj)]
Example #47
 def test_haystack_identifier_method(self):
     get_identifier = _lookup_identifier_method()
     self.assertEqual(get_identifier('a.b.c'), 'a.b.c')
Example #48
    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):

        from haystack import connections

        if not self.setup_complete:
            self.setup()

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = connections[
            self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {}

        if start_offset is not None:
            params['from_'] = start_offset

        if end_offset is not None:
            params['size'] = end_offset - start_offset

        doc_id = get_identifier(model_instance)

        try:
            # More like this Query
            # https://www.elastic.co/guide/en/elasticsearch/reference/2.2/query-dsl-mlt-query.html
            mlt_query = {
                'query': {
                    'more_like_this': {
                        'fields': [field_name],
                        'like': [{
                            "_id": doc_id
                        }]
                    }
                }
            }

            narrow_queries = []

            if additional_query_string and additional_query_string != '*:*':
                additional_filter = {
                    "query_string": {
                        "query": additional_query_string
                    }
                }
                narrow_queries.append(additional_filter)

            if limit_to_registered_models is None:
                limit_to_registered_models = getattr(
                    settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

            if models and len(models):
                model_choices = sorted(get_model_ct(model) for model in models)

            elif limit_to_registered_models:
                # Using narrow queries, limit the results to only models handled
                # with the current routers.
                model_choices = self.build_models_list()

            else:
                model_choices = []

            if len(model_choices) > 0:
                model_filter = {"terms": {DJANGO_CT: model_choices}}
                narrow_queries.append(model_filter)

            if len(narrow_queries) > 0:
                mlt_query = {
                    "query": {
                        "bool": {
                            'must': mlt_query['query'],
                            'filter': {
                                'bool': {
                                    'must': list(narrow_queries)
                                }
                            }
                        }
                    }
                }

            raw_results = self.conn.search(body=mlt_query,
                                           index=self.index_name,
                                           _source=True,
                                           **params)

        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to fetch More Like This from Elasticsearch for document '%s': %s",
                doc_id,
                e,
                exc_info=True)
            raw_results = {}

        return self._process_results(raw_results, result_class=result_class)
Example #49
    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):

        from haystack import connections

        if not self.setup_complete:
            self.setup()

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = connections[
            self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {}

        if start_offset is not None:
            params['from_'] = start_offset

        if end_offset is not None:
            params['size'] = end_offset - start_offset

        doc_id = get_identifier(model_instance)

        more_like_this_query = {
            'query': {
                'more_like_this': {
                    'fields': [field_name],
                    'like': [{
                        "_id": doc_id
                    }]
                }
            }
        }

        additional_filters = []

        if additional_query_string and additional_query_string != '*:*':
            additional_filter = {
                "query_string": {"query": additional_query_string}
            }

            additional_filters.append(additional_filter)

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models:
            content_types = [get_model_ct(model) for model in models]
        elif limit_to_registered_models:
            content_types = self.build_models_list()
        else:
            content_types = []

        if content_types:
            model_filter = {"terms": {DJANGO_CT: content_types}}
            additional_filters.append(model_filter)

        if additional_filters:
            more_like_this_query = {
                "query": {
                    "bool": {
                        'must': more_like_this_query['query'],
                        'filter': {
                            'bool': {
                                'must': additional_filters
                            }
                        }
                    }
                }
            }

        try:
            raw_results = self.conn.search(index=self.index_name,
                                           doc_type='modelresult',
                                           body=more_like_this_query,
                                           _source=True,
                                           **params)
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to fetch More Like This from Elasticsearch for document '%s': %s",
                doc_id,
                e,
                exc_info=True)
            raw_results = {}

        return self._process_results(raw_results, result_class=result_class)
Example #50
    def update(self, index, iterable):
        """
        Updates the `index` with any objects in `iterable` by adding/updating
        the database as needed.

        Required arguments:
            `index` -- The `SearchIndex` to process
            `iterable` -- An iterable of model instances to index

        For each object in `iterable`, a document is created containing all
        of the terms extracted from `index.full_prepare(obj)` with field prefixes,
        and 'as-is' as needed.  Also, if the field type is 'text' it will be
        stemmed and stored with the 'Z' prefix as well.

        eg. `content:Testing` ==> `testing, Ztest, ZXCONTENTtest, XCONTENTtest`

        Each document also contains an extra term in the format:

        `XCONTENTTYPE<app_name>.<model_name>`

        As well as a unique identifier in the format:

        `Q<app_name>.<model_name>.<pk>`

        eg.: foo.bar (pk=1) ==> `Qfoo.bar.1`, `XCONTENTTYPEfoo.bar`

        This is useful for querying for a specific document corresponding to
        a model instance.

        The document also contains a pickled version of the object itself and
        the document ID in the document data field.

        Finally, we also store field values to be used for sorting data.  We
        store these in the document value slots (position zero is reserved
        for the document ID).  All values are stored as unicode strings, with
        conversion of float, int and double values being done by Xapian itself
        through the use of the `xapian.sortable_serialise` method.
        """
        database = self._database(writable=True)
        try:
            for obj in iterable:
                document = xapian.Document()

                term_generator = xapian.TermGenerator()
                term_generator.set_database(database)
                term_generator.set_stemmer(xapian.Stem(self.language))
                if self.include_spelling is True:
                    term_generator.set_flags(
                        xapian.TermGenerator.FLAG_SPELLING)
                term_generator.set_document(document)

                document_id = DOCUMENT_ID_TERM_PREFIX + get_identifier(obj)
                data = index.full_prepare(obj)
                weights = index.get_field_weights()
                for field in self.schema:
                    if field['field_name'] in data.keys():
                        prefix = DOCUMENT_CUSTOM_TERM_PREFIX + field[
                            'field_name'].upper()
                        value = data[field['field_name']]
                        try:
                            weight = int(weights[field['field_name']])
                        except KeyError:
                            weight = 1
                        if field['type'] == 'text':
                            if field['multi_valued'] == 'false':
                                term = _marshal_term(value)
                                term_generator.index_text(term, weight)
                                term_generator.index_text(term, weight, prefix)
                                if len(term.split()) == 1:
                                    document.add_term(term, weight)
                                    document.add_term(prefix + term, weight)
                                document.add_value(field['column'],
                                                   _marshal_value(value))
                            else:
                                for term in value:
                                    term = _marshal_term(term)
                                    term_generator.index_text(term, weight)
                                    term_generator.index_text(
                                        term, weight, prefix)
                                    if len(term.split()) == 1:
                                        document.add_term(term, weight)
                                        document.add_term(
                                            prefix + term, weight)
                        else:
                            if field['multi_valued'] == 'false':
                                term = _marshal_term(value)
                                if len(term.split()) == 1:
                                    document.add_term(term, weight)
                                    document.add_term(prefix + term, weight)
                                    document.add_value(field['column'],
                                                       _marshal_value(value))
                            else:
                                for term in value:
                                    term = _marshal_term(term)
                                    if len(term.split()) == 1:
                                        document.add_term(term, weight)
                                        document.add_term(
                                            prefix + term, weight)

                document.set_data(
                    pickle.dumps((obj._meta.app_label, obj._meta.module_name,
                                  obj.pk, data), pickle.HIGHEST_PROTOCOL))
                document.add_term(document_id)
                document.add_term(DOCUMENT_CT_TERM_PREFIX + u'%s.%s' %
                                  (obj._meta.app_label, obj._meta.module_name))
                database.replace_document(document_id, document)

        except UnicodeDecodeError:
            sys.stderr.write('Chunk failed.\n')

        finally:
            database.close()
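The `Q`-prefixed identifier term described above makes it cheap to fetch the one document that corresponds to a model instance. A minimal sketch of such a lookup, assuming the same `Q` prefix (the index path is hypothetical):

import xapian

# Open the index read-only and query the unique identifier term for
# foo.bar pk=1; at most one match is expected.
database = xapian.Database('/path/to/xapian_index')
enquire = xapian.Enquire(database)
enquire.set_query(xapian.Query('Qfoo.bar.1'))
matches = enquire.get_mset(0, 1)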
Example #51
0
 def enqueue_delete(self, instance, **kwargs):
     if transaction.is_dirty():
         transaction.commit_unless_managed()
     tasks.delete_from_index.delay(get_identifier(instance))
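Committing any dirty transaction first ensures the asynchronous delete never races ahead of uncommitted database state. The task itself is not shown; a minimal sketch of what `tasks.delete_from_index` might look like (the Celery wiring and backend alias are assumptions):

from celery import shared_task
from haystack import connections

@shared_task
def delete_from_index(identifier):
    # `identifier` is the 'app_label.model_name.pk' string produced by
    # get_identifier(); remove its document from the default backend.
    connections['default'].get_backend().remove(identifier)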
Example #52
0
    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):
        if not self.setup_complete:
            self.setup()

        # Handle deferred models.
        if get_proxied_model and hasattr(
                model_instance, '_deferred') and model_instance._deferred:
            model_klass = get_proxied_model(model_instance._meta)
        else:
            model_klass = type(model_instance)

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted([
                '%s.%s' % (model._meta.app_label, model._meta.module_name)
                for model in models
            ])
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        if additional_query_string and additional_query_string != '*':
            narrow_queries.add(additional_query_string)

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_unicode(nq)))

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        # Prevent against Whoosh throwing an error. Requires an end_offset
        # greater than 0.
        if end_offset is not None and end_offset <= 0:
            end_offset = 1

        # Determine the page.
        page_num = 0

        if end_offset is None:
            end_offset = 1000000

        if start_offset is None:
            start_offset = 0

        page_length = end_offset - start_offset

        if page_length and page_length > 0:
            page_num = start_offset // page_length

        # Increment because Whoosh uses 1-based page numbers.
        page_num += 1

        self.index = self.index.refresh()
        raw_results = EmptyResults()

        if self.index.doc_count():
            query = "%s:%s" % (ID, get_identifier(model_instance))
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query)
            results = searcher.search(parsed_query)

            if len(results):
                raw_results = results[0].more_like_this(field_name,
                                                        top=end_offset)

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None and hasattr(raw_results, 'filter'):
                raw_results.filter(narrowed_results)

        try:
            raw_page = ResultsPage(raw_results, page_num, page_length)
        except ValueError:
            if not self.silently_fail:
                raise

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        results = self._process_results(raw_page, result_class=result_class)
        searcher.close()

        if hasattr(narrow_searcher, 'close'):
            narrow_searcher.close()

        return results
Example #53
0
class ElasticsearchSearchBackend(BaseSearchBackend):
    # Words reserved by Elasticsearch for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Elasticsearch for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\',
        '+',
        '-',
        '&&',
        '||',
        '!',
        '(',
        ')',
        '{',
        '}',
        '[',
        ']',
        '^',
        '"',
        '~',
        '*',
        '?',
        ':',
    )

    # Settings to add an n-gram & edge n-gram analyzer.
    DEFAULT_SETTINGS = {
        'settings': {
            "analysis": {
                "analyzer": {
                    "ngram_analyzer": {
                        "type": "custom",
                        "tokenizer": "lowercase",
                        "filter": ["haystack_ngram"]
                    },
                    "edgengram_analyzer": {
                        "type": "custom",
                        "tokenizer": "lowercase",
                        "filter": ["haystack_edgengram"]
                    }
                },
                "tokenizer": {
                    "haystack_ngram_tokenizer": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 15,
                    },
                    "haystack_edgengram_tokenizer": {
                        "type": "edgeNGram",
                        "min_gram": 2,
                        "max_gram": 15,
                        "side": "front"
                    }
                },
                "filter": {
                    "haystack_ngram": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 15
                    },
                    "haystack_edgengram": {
                        "type": "edgeNGram",
                        "min_gram": 2,
                        "max_gram": 15
                    }
                }
            }
        }
    }
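    # The analyzers above only take effect once field mappings reference
    # them. A hypothetical sketch of what ``build_schema`` might emit for
    # an EdgeNgramField and an NgramField (field names are assumptions):
    #
    #     "title":   {"type": "string", "analyzer": "edgengram_analyzer"},
    #     "content": {"type": "string", "analyzer": "ngram_analyzer"},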

    def __init__(self, connection_alias, **connection_options):
        super(ElasticsearchSearchBackend,
              self).__init__(connection_alias, **connection_options)

        if 'URL' not in connection_options:
            raise ImproperlyConfigured(
                "You must specify a 'URL' in your settings for connection '%s'."
                % connection_alias)

        if 'INDEX_NAME' not in connection_options:
            raise ImproperlyConfigured(
                "You must specify an 'INDEX_NAME' in your settings for connection '%s'."
                % connection_alias)

        self.conn = pyelasticsearch.ElasticSearch(connection_options['URL'],
                                                  timeout=self.timeout)
        self.index_name = connection_options['INDEX_NAME']
        self.log = logging.getLogger('haystack')
        self.setup_complete = False
        self.existing_mapping = {}

    def setup(self):
        """
        Defers loading until needed.
        """
        # Get the existing mapping & cache it. We'll compare it
        # during the ``update`` & if it doesn't match, we'll put the new
        # mapping.
        try:
            self.existing_mapping = self.conn.get_mapping(
                index=self.index_name)
        except Exception:
            if not self.silently_fail:
                raise

        unified_index = haystack.connections[
            self.connection_alias].get_unified_index()
        self.content_field_name, field_mapping = self.build_schema(
            unified_index.all_searchfields())
        current_mapping = {'modelresult': {'properties': field_mapping}}

        if current_mapping != self.existing_mapping:
            try:
                # Make sure the index is there first.
                self.conn.create_index(self.index_name, self.DEFAULT_SETTINGS)
                self.conn.put_mapping(self.index_name, 'modelresult',
                                      current_mapping)
                self.existing_mapping = current_mapping
            except Exception:
                if not self.silently_fail:
                    raise

        self.setup_complete = True

    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            try:
                self.setup()
            except (requests.RequestException,
                    pyelasticsearch.ElasticHttpError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Elasticsearch: %s",
                               e)
                return

        prepped_docs = []

        for obj in iterable:
            try:
                prepped_data = index.full_prepare(obj)
                final_data = {}

                # Convert the data to make sure it's happy.
                for key, value in prepped_data.items():
                    final_data[key] = self._from_python(value)

                prepped_docs.append(final_data)
            except (requests.RequestException,
                    pyelasticsearch.ElasticHttpError) as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" %
                               e.__name__,
                               exc_info=True,
                               extra={
                                   "data": {
                                       "index": index,
                                       "object": get_identifier(obj)
                                   }
                               })
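The `URL` and `INDEX_NAME` checks in `__init__` above correspond to a Django settings entry along these lines (alias and values are illustrative):

HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'haystack.backends.elasticsearch_backend.ElasticsearchSearchEngine',
        'URL': 'http://127.0.0.1:9200/',
        'INDEX_NAME': 'haystack',
    },
}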
Example #54
0
 def enqueue_delete(self, instance, **kwargs):
     get_queue(QUEUE_NAME).enqueue(index_delete_obj,
                                   get_identifier(instance))
Example #55
0
    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):
        from haystack import connections

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = connections[
            self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {
            'fl': '*,score',
        }

        if start_offset is not None:
            params['start'] = start_offset

        if end_offset is not None:
            params['rows'] = end_offset

        narrow_queries = set()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add('%s:(%s)' %
                               (DJANGO_CT, ' OR '.join(model_choices)))

        if additional_query_string:
            narrow_queries.add(additional_query_string)

        if narrow_queries:
            params['fq'] = list(narrow_queries)

        query = "%s:%s" % (ID, get_identifier(model_instance))

        try:
            raw_results = self.conn.more_like_this(query, field_name, **params)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to fetch More Like This from Solr for document '%s': %s",
                query,
                e,
                exc_info=True)
            raw_results = EmptyResults()

        return self._process_results(raw_results, result_class=result_class)
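Concretely, for a hypothetical `notes.Note` instance with pk=1 and a content field named `text`, the call above boils down to roughly this pysolr request (all values are illustrative):

import pysolr

conn = pysolr.Solr('http://localhost:8983/solr/haystack')
raw_results = conn.more_like_this(
    'id:notes.note.1',              # query matching the source document
    'text',                         # content field to compare against
    fl='*,score',
    start=0,
    fq=['django_ct:(notes.note)'],  # narrow to registered models
)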
Example #56
0
 def remove(self, obj, commit=True):
     global MOCK_INDEX_DATA
     if commit:
         del MOCK_INDEX_DATA[get_identifier(obj)]
Example #57
0
    def more_like_this(self,
                       model_instance,
                       additional_query=None,
                       start_offset=0,
                       end_offset=None,
                       limit_to_registered_models=True,
                       result_class=None,
                       **kwargs):
        """
        Given a model instance, returns a result set of similar documents.

        Required arguments:
            `model_instance` -- The model instance to use as a basis for
                                retrieving similar documents.

        Optional arguments:
            `additional_query` -- An additional query to narrow results
            `start_offset` -- The starting offset (default=0)
            `end_offset` -- The ending offset (default=None); if None, all documents are returned
            `limit_to_registered_models` -- Limit returned results to models registered in the current `SearchSite` (default = True)

        Returns:
            A dictionary with the following keys:
                `results` -- A list of `SearchResult`
                `hits` -- The total available results

        Opens a database connection, then builds a simple query from the
        unique identifier of `model_instance`.

        For each document retrieved (should always be one), adds an entry to
        an RSet (relevance set) with the document ID, then uses the RSet
        to query for an ESet (a set of terms that can be used to suggest
        expansions to the original query), omitting any document that was in
        the original query.

        Finally, processes the resulting matches and returns.
        """
        database = self._database()

        if result_class is None:
            result_class = SearchResult

        query = xapian.Query(DOCUMENT_ID_TERM_PREFIX +
                             get_identifier(model_instance))

        enquire = xapian.Enquire(database)
        enquire.set_query(query)

        rset = xapian.RSet()

        if not end_offset:
            end_offset = database.get_doccount()

        for match in self._get_enquire_mset(database, enquire, 0, end_offset):
            rset.add_document(match.docid)

        query = xapian.Query(xapian.Query.OP_ELITE_SET, [
            expand.term for expand in enquire.get_eset(
                match.document.termlist_count(), rset, XHExpandDecider())
        ], match.document.termlist_count())
        query = xapian.Query(
            xapian.Query.OP_AND_NOT,
            [query, DOCUMENT_ID_TERM_PREFIX + get_identifier(model_instance)])
        if limit_to_registered_models:
            registered_models = self.build_models_list()

            if len(registered_models) > 0:
                query = xapian.Query(
                    xapian.Query.OP_AND, query,
                    xapian.Query(xapian.Query.OP_OR, [
                        xapian.Query('%s%s' % (DOCUMENT_CT_TERM_PREFIX, model))
                        for model in registered_models
                    ]))
        if additional_query:
            query = xapian.Query(xapian.Query.OP_AND, query, additional_query)

        enquire.set_query(query)

        results = []
        matches = self._get_enquire_mset(database, enquire, start_offset,
                                         end_offset)

        for match in matches:
            app_label, module_name, pk, model_data = pickle.loads(
                self._get_document_data(database, match.document))
            results.append(
                result_class(app_label, module_name, pk, match.percent,
                             **model_data))

        return {
            'results': results,
            'hits': self._get_hit_count(database, enquire),
            'facets': {
                'fields': {},
                'dates': {},
                'queries': {},
            },
            'spelling_suggestion': None,
        }
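The `XHExpandDecider` passed to `get_eset` above decides which terms may be suggested for expansion. One plausible implementation (the exact filtering rule is an assumption) rejects internal content-type terms so only genuine content terms drive the expansion:

import xapian

class XHExpandDecider(xapian.ExpandDecider):
    def __call__(self, term):
        # Skip internal control terms such as content-type markers; only
        # terms drawn from actual document content should expand the query.
        if term.startswith(DOCUMENT_CT_TERM_PREFIX):
            return False
        return True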
Example #58
0
                prepped_docs.append(final_data)
            except (requests.RequestException,
                    pyelasticsearch.ElasticSearchError) as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" %
                               e.__name__,
                               exc_info=True,
                               extra={
                                   "data": {
                                       "index": index,
                                       "object": get_identifier(obj)
                                   }
                               })

        self.conn.bulk_index(self.index_name,
                             'modelresult',
                             prepped_docs,
                             id_field=ID)

        if commit:
            self.conn.refresh(indexes=[self.index_name])

    def _remove(self, obj_or_string, commit=True):
        doc_id = get_identifier(obj_or_string)

        if not self.setup_complete:
Example #59
0
    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):
        if not self.setup_complete:
            self.setup()

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        if additional_query_string and additional_query_string != '*':
            narrow_queries.add(additional_query_string)

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_text(nq)), limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        page_num, page_length = self.calculate_page(start_offset, end_offset)

        self.index = self.index.refresh()
        raw_results = EmptyResults()

        searcher = None
        if self.index.doc_count():
            query = "%s:%s" % (ID, get_identifier(model_instance))
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query)
            results = searcher.search(parsed_query)

            if len(results):
                raw_results = results[0].more_like_this(field_name,
                                                        top=end_offset)

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None and hasattr(raw_results, 'filter'):
                raw_results.filter(narrowed_results)

        try:
            raw_page = ResultsPage(raw_results, page_num, page_length)
        except ValueError:
            if not self.silently_fail:
                raise

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high. :(
        if raw_page.pagenum < page_num:
            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        results = self._process_results(raw_page, result_class=result_class)

        if searcher:
            searcher.close()

        if hasattr(narrow_searcher, 'close'):
            narrow_searcher.close()

        return results
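`calculate_page` replaces the inline paging arithmetic seen in Example #52; a sketch consistent with that arithmetic:

def calculate_page(self, start_offset=0, end_offset=None):
    # Whoosh requires an end_offset greater than zero.
    if end_offset is not None and end_offset <= 0:
        end_offset = 1

    if end_offset is None:
        end_offset = 1000000

    if start_offset is None:
        start_offset = 0

    page_num = 0
    page_length = end_offset - start_offset

    if page_length and page_length > 0:
        page_num = start_offset // page_length

    # Increment because Whoosh uses 1-based page numbers.
    page_num += 1
    return page_num, page_length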
Example #60
0
 def enqueue_delete(self, instance, **kwargs):
     search_index_delete.delay(instance._meta.app_label,
                               instance._meta.module_name,
                               get_identifier(instance))