def test_haystack_identifier_method(self):
    # The custom implementation returns the MD-5 hash of the key value by
    # default:
    get_identifier = _lookup_identifier_method()
    self.assertEqual(get_identifier("a.b.c"),
                     "553f764f7b436175c0387e22b4a19213")

    # … but it also supports a custom override mechanism which would
    # definitely fail with the default implementation:
    class custom_id_class(object):
        def get_custom_haystack_id(self):
            return "CUSTOM"

    self.assertEqual(get_identifier(custom_id_class()), "CUSTOM")
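For context, a minimal sketch of the kind of custom identifier callable this test exercises. The setting name and module path are assumptions about how `_lookup_identifier_method()` resolves the override (they are not taken from the test above); the hashing mirrors the default behaviour the first assertion checks.

# Hypothetical custom identifier callable; the HAYSTACK_IDENTIFIER_METHOD
# setting name and dotted path below are assumptions, not from the test above.
#
#   # settings.py
#   HAYSTACK_IDENTIFIER_METHOD = "myapp.search_utils.hashed_identifier"

import hashlib

def hashed_identifier(obj_or_string):
    # Honour the per-object override exercised by custom_id_class above.
    if hasattr(obj_or_string, "get_custom_haystack_id"):
        return obj_or_string.get_custom_haystack_id()

    # Otherwise hash the plain identifier string; a real implementation would
    # first build the default "app_label.model_name.pk" string for model instances.
    return hashlib.md5(obj_or_string.encode("utf-8")).hexdigest()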
def test_get_identifier(self):
    # Various invalid identifiers.
    self.assertRaises(AttributeError, get_identifier, "core")
    self.assertRaises(AttributeError, get_identifier, "core.mockmodel")
    self.assertRaises(AttributeError, get_identifier, "core.mockmodel.foo")
    self.assertRaises(AttributeError, get_identifier, "core-app.mockmodel.1")

    # Valid string identifier.
    self.assertEqual(get_identifier("core.mockmodel.1"), "core.mockmodel.1")

    # Valid object.
    mock = MockModel.objects.get(pk=1)
    self.assertEqual(get_identifier(mock), "core.mockmodel.1")
def remove(self, obj_or_string, commit=True):
    solr_id = get_identifier(obj_or_string)

    try:
        self.conn.delete(id=solr_id, commit=commit)
    except (IOError, SolrError), e:
        self.log.error("Failed to remove document '%s' from Solr: %s", solr_id, e)
def update(self, index, iterable, commit=True):
    if self._do_setup():
        return

    prepped_docs = []

    for obj in iterable:
        try:
            prepped_data = index.full_prepare(obj)
            final_data = []

            for key, value in prepped_data.items():
                field_type = self.schema[key]
                final_data.append(field_type(name=key, value=value))

            document = search.Document(doc_id=prepped_data[ID], fields=final_data)
            prepped_docs.append(document)
        except search.Error as e:
            if not self.silently_fail:
                raise

            self.log.error("{0} while preparing object for update".format(e.__class__.__name__),
                           exc_info=True, extra={
                               "data": {
                                   "index": index,
                                   "object": get_identifier(obj),
                               }
                           })

    self.index.put(prepped_docs)
def remove(self, obj_or_string, commit=True):
    doc_id = get_identifier(obj_or_string)

    if not self.setup_complete:
        try:
            self.setup()
        except self.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
            return

    try:
        self.conn.delete(index=self.index_name, doc_type='modelresult', id=doc_id, ignore=404)

        if commit:
            self.conn.indices.refresh(index=self.index_name)
    except self.TransportError as e:
        if e.status_code == 404 and e.info.get('ok') and e.info.get('found') == False:
            self.log.warning("Tried removing nonexistent document '%s' from ElasticSearch: %s.", doc_id, e)
        elif not self.silently_fail:
            raise
        else:
            self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
def update(self, index, iterable, commit=True):
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        doc = index.full_prepare(obj)

        # Really make sure it's unicode, because Whoosh won't have it any
        # other way.
        for key in doc:
            doc[key] = self._from_python(doc[key])

        try:
            writer.update_document(**doc)
        except Exception, e:
            if not self.silently_fail:
                raise

            # We'll log the object identifier but won't include the actual object
            # to avoid the possibility of that generating encoding errors while
            # processing the log message:
            self.log.error(u"%s while preparing object for update" % e, exc_info=True, extra={
                "data": {
                    "index": index,
                    "object": get_identifier(obj)
                }
            })
def update(self, index, iterable, commit=True):
    docs = []

    for obj in iterable:
        try:
            docs.append(index.full_prepare(obj))
        except UnicodeDecodeError:
            if not self.silently_fail:
                raise

            # We'll log the object identifier but won't include the actual object
            # to avoid the possibility of that generating encoding errors while
            # processing the log message:
            self.log.error(
                u"UnicodeDecodeError while preparing object for update",
                exc_info=True,
                extra={"data": {"index": index, "object": get_identifier(obj)}},
            )

    if len(docs) > 0:
        try:
            self.conn.add(docs, commit=commit, boost=index.get_field_weights())
        except (IOError, SolrError), e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to add documents to Solr: %s", e)
def remove(self, obj_or_string, commit=True):
    doc_id = get_identifier(obj_or_string)

    if not self.setup_complete:
        try:
            self.setup()
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to remove document '%s' from Elasticsearch: %s",
                doc_id,
                e,
                exc_info=True,
            )
            return

    try:
        self.conn.delete(
            index=self.index_name, doc_type="modelresult", id=doc_id, ignore=404
        )

        if commit:
            self.conn.indices.refresh(index=self.index_name)
    except elasticsearch.TransportError as e:
        if not self.silently_fail:
            raise

        self.log.error(
            "Failed to remove document '%s' from Elasticsearch: %s",
            doc_id,
            e,
            exc_info=True,
        )
def remove(self, obj, commit=True):
    if not self.setup_complete:
        self.setup()

    index = self._get_index_for(obj._meta.model)
    index.deleteObject(get_identifier(obj))
def more_like_this(self, model_instance, additional_query_string=None,
                   start_offset=0, end_offset=None, models=None,
                   limit_to_registered_models=None, result_class=None, **kwargs):
    from haystack import connections

    if not self.setup_complete:
        self.setup()

    # Handle deferred models.
    if get_proxied_model and hasattr(model_instance, '_deferred') and model_instance._deferred:
        model_klass = get_proxied_model(model_instance._meta)
    else:
        model_klass = type(model_instance)

    index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
    field_name = index.get_content_field()
    params = {}

    if start_offset is not None:
        params['search_from'] = start_offset

    if end_offset is not None:
        params['search_size'] = end_offset - start_offset

    doc_id = get_identifier(model_instance)

    try:
        raw_results = self.conn.morelikethis(self.index_name, 'modelresult', doc_id, [field_name], **params)
    except (requests.RequestException, pyelasticsearch.ElasticSearchError), e:
        if not self.silently_fail:
            raise

        self.log.error("Failed to fetch More Like This from Elasticsearch for document '%s': %s", doc_id, e)
        raw_results = {}
def more_like_this(self, model_instance, additional_query_string=None,
                   start_offset=0, end_offset=None, models=None,
                   limit_to_registered_models=None, result_class=None, **kwargs):
    from haystack import connections

    if not self.setup_complete:
        self.setup()

    # Deferred models will have a different class ("RealClass_Deferred_fieldname")
    # which won't be in our registry:
    model_klass = model_instance._meta.concrete_model

    index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
    field_name = index.get_content_field()
    params = {}

    if start_offset is not None:
        params['search_from'] = start_offset

    if end_offset is not None:
        params['search_size'] = end_offset - start_offset

    doc_id = get_identifier(model_instance)

    try:
        raw_results = self.conn.more_like_this(self.index_name, 'modelresult', doc_id, [field_name], **params)
    except (requests.RequestException, pyelasticsearch.ElasticHttpError), e:
        if not self.silently_fail:
            raise

        self.log.error("Failed to fetch More Like This from Elasticsearch for document '%s': %s", doc_id, e)
        raw_results = {}
def remove(self, obj_or_string, commit=True):
    doc_id = get_identifier(obj_or_string)

    # django-haystack defaults to using namespaced ids for objects like layers.layer.83,
    # but the GeoNode SearchIndexes override the default ids with ResourceBase ids.
    if isinstance(obj_or_string, ResourceBase):
        doc_id = getattr(obj_or_string, 'id')

    if not self.setup_complete:
        try:
            self.setup()
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
            return

    try:
        self.conn.delete(index=self.index_name, doc_type='modelresult', id=doc_id, ignore=404)

        if commit:
            self.conn.indices.refresh(index=self.index_name)
    except elasticsearch.TransportError as e:
        if not self.silently_fail:
            raise

        self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
def remove(self, obj_or_string, commit=True):
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    whoosh_id = get_identifier(obj_or_string)
    self.index.delete_by_query(q=self.parser.parse(u'id:"%s"' % whoosh_id))
def enqueue(self, action, instance):
    """
    Shoves a message about how to update the index into the queue.

    This is a standardized string, resembling something like::

        ``update:notes.note.23``
        # ...or...
        ``delete:weblog.entry.8``
    """
    message = "%s:%s" % (action, get_identifier(instance))

    try:
        return queue.write(message)
    except QueueException:
        import smtplib
        from email.mime.text import MIMEText

        msg = {}
        me = "*****@*****.**"
        you = "*****@*****.**"
        msg['Subject'] = 'Error - QueueException'
        msg['From'] = me
        msg['To'] = you

        s = smtplib.SMTP('localhost')
        s.sendmail(me, [you], MIMEText("the following message could not be queued\n\n" + message))
        s.quit()
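On the consumer side, the queued message is just the action and the identifier joined by a colon, as the docstring above describes. A small hedged sketch of splitting it back apart (the function name is illustrative, not part of the snippet):

def parse_queued_message(message):
    # "update:notes.note.23" -> ("update", "notes", "note", "23")
    action, identifier = message.split(":", 1)
    app_label, model_name, pk = identifier.rsplit(".", 2)
    return action, app_label, model_name, pk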
def prepare(self, obj):
    """
    Fetches and adds/alters data before indexing.
    """
    self.prepared_data = {
        'id': get_identifier(obj),
        'django_ct': "%s.%s" % (obj._meta.app_label, obj._meta.module_name),
        'django_id': force_unicode(obj.pk),
    }

    for field_name, field in self.fields.items():
        # Use the possibly overridden name, which will default to the
        # variable name of the field.
        self.prepared_data[field.index_fieldname] = field.prepare(obj)

    for field_name, field in self.fields.items():
        if hasattr(self, "prepare_%s" % field_name):
            value = getattr(self, "prepare_%s" % field_name)(obj)
            self.prepared_data[field.index_fieldname] = value

    # Remove any fields that lack a value and are `null=True`.
    for field_name, field in self.fields.items():
        if field.null is True:
            if self.prepared_data[field.index_fieldname] is None:
                del self.prepared_data[field.index_fieldname]

    return self.prepared_data
def enqueue_task(action, instance, **kwargs):
    """
    Common utility for enqueuing a task for the given action and model instance.
    """
    identifier = get_identifier(instance)
    options = {}

    if settings.CELERY_HAYSTACK_QUEUE:
        options['queue'] = settings.CELERY_HAYSTACK_QUEUE

    if settings.CELERY_HAYSTACK_COUNTDOWN:
        options['countdown'] = settings.CELERY_HAYSTACK_COUNTDOWN

    task = get_update_task()

    def task_func():
        return task.apply_async((action, identifier), kwargs, **options)

    if hasattr(transaction, 'on_commit'):
        # Django 1.9 on_commit hook
        transaction.on_commit(task_func)
    elif hasattr(connection, 'on_commit'):
        # django-transaction-hooks
        connection.on_commit(task_func)
    else:
        task_func()
def remove(self, obj_or_string, commit=True):
    """
    Removes a document/object from the backend. Can be either a model
    instance or the identifier (i.e. ``app_name.model_name.id``) in the
    event the object no longer exists.

    :param obj_or_string: The model instance or the identifier.
    :param commit: True to refresh the search index after the remove.
    """
    doc_id = get_identifier(obj_or_string)

    if not self.setup_complete:
        try:
            self.setup()
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Elasticsearch: %s",
                           doc_id, e, exc_info=True)
            return

    try:
        self.conn.delete(index=self.index_name, doc_type='modelresult', id=doc_id, ignore=404)

        if commit:
            self.conn.indices.refresh(index=self.index_name)
    except elasticsearch.TransportError as e:
        if not self.silently_fail:
            raise

        self.log.error("Failed to remove document '%s' from Elasticsearch: %s",
                       doc_id, e, exc_info=True)
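A brief usage sketch of the two call styles the docstring above allows; `backend`, `note` and the identifier value are illustrative, not taken from the snippet:

# Pass the model instance while it still exists ...
backend.remove(note)
# ... or the "app_name.model_name.id" identifier once the row is already gone.
backend.remove("notes.note.23", commit=True)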
def enqueue_task(action, instance):
    """
    Common utility for enqueuing a task for the given action and model instance.
    """
    identifier = get_identifier(instance)
    get_update_task().delay(action, identifier)
def more_like_this(
    self,
    model_instance,
    additional_query_string=None,
    start_offset=0,
    end_offset=None,
    limit_to_registered_models=None,
    result_class=None,
    **kwargs
):
    from haystack import connections

    # Handle deferred models.
    if get_proxied_model and hasattr(model_instance, "_deferred") and model_instance._deferred:
        model_klass = get_proxied_model(model_instance._meta)
    else:
        model_klass = type(model_instance)

    index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
    field_name = index.get_content_field()
    params = {"fl": "*,score"}

    if start_offset is not None:
        params["start"] = start_offset

    if end_offset is not None:
        params["rows"] = end_offset

    narrow_queries = set()

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True)

    if limit_to_registered_models:
        # Using narrow queries, limit the results to only models handled
        # with the current routers.
        if narrow_queries is None:
            narrow_queries = set()

        registered_models = self.build_models_list()

        if len(registered_models) > 0:
            narrow_queries.add("%s:(%s)" % (DJANGO_CT, " OR ".join(registered_models)))

    if additional_query_string:
        narrow_queries.add(additional_query_string)

    if narrow_queries:
        params["fq"] = list(narrow_queries)

    query = "%s:%s" % (ID, get_identifier(model_instance))

    try:
        raw_results = self.conn.more_like_this(query, field_name, **params)
    except (IOError, SolrError), e:
        if not self.silently_fail:
            raise

        self.log.error("Failed to fetch More Like This from Solr for document '%s': %s", query, e)
        raw_results = EmptyResults()
def more_like_this(self, model_instance, additional_query_string=None,
                   start_offset=0, end_offset=None, models=None,
                   limit_to_registered_models=None, result_class=None, **kwargs):
    from haystack import connections

    # Deferred models will have a different class ("RealClass_Deferred_fieldname")
    # which won't be in our registry:
    model_klass = model_instance._meta.concrete_model

    index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
    field_name = index.get_content_field()
    params = {
        'fl': '*,score',
    }

    if start_offset is not None:
        params['start'] = start_offset

    if end_offset is not None:
        params['rows'] = end_offset

    narrow_queries = set()

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

    if models and len(models):
        model_choices = sorted(get_model_ct(model) for model in models)
    elif limit_to_registered_models:
        # Using narrow queries, limit the results to only models handled
        # with the current routers.
        model_choices = self.build_models_list()
    else:
        model_choices = []

    if len(model_choices) > 0:
        if narrow_queries is None:
            narrow_queries = set()

        narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices)))

    if additional_query_string:
        narrow_queries.add(additional_query_string)

    if narrow_queries:
        params['fq'] = list(narrow_queries)

    query = "%s:%s" % (ID, get_identifier(model_instance))

    try:
        raw_results = self.conn.more_like_this(query, field_name, **params)
    except (IOError, SolrError) as e:
        if not self.silently_fail:
            raise

        self.log.error("Failed to fetch More Like This from Solr for document '%s': %s",
                       query, e, exc_info=True)
        raw_results = EmptyResults()

    return self._process_results(raw_results, result_class=result_class)
def trigger_index_update(klass, instance_pk):
    try:
        from celery_haystack.utils import get_update_task
    except ImportError:
        return

    task = get_update_task()
    task.delay('update', get_identifier(klass(id=instance_pk)))
def search_index_signal_handler(instance, signal, **kwargs):
    """
    Signal handler for when indexable objects are saved or deleted.

    The indexing will run after the transaction commits, which will generally
    mean that all of the related ManyToMany data will be saved and ready.
    """
    if search_update_options["disabled"]:
        return

    deleting = signal is post_delete

    if deleting:
        # When deleting, pass in an identifier string instead of the instance.
        # This is because Django will unset the instance's pk before the update
        # method runs, making it impossible to determine afterwards.
        item = get_identifier(instance)
    else:
        item = instance

    if search_update_options["async"]:
        queue_update(item, remove=deleting)
    else:
        update_object(item, remove=deleting)
def more_like_this(self, model_instance, additional_query_string=None,
                   start_offset=0, end_offset=None, models=None,
                   limit_to_registered_models=None, result_class=None, **kwargs):
    from haystack import connections

    # Handle deferred models.
    if get_proxied_model and hasattr(model_instance, '_deferred') and model_instance._deferred:
        model_klass = get_proxied_model(model_instance._meta)
    else:
        model_klass = type(model_instance)

    index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
    field_name = index.get_content_field()
    params = {
        'fl': '*,score',
    }

    if start_offset is not None:
        params['start'] = start_offset

    if end_offset is not None:
        params['rows'] = end_offset

    narrow_queries = set()

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

    if models and len(models):
        model_choices = sorted(['%s.%s' % (model._meta.app_label, model._meta.module_name) for model in models])
    elif limit_to_registered_models:
        # Using narrow queries, limit the results to only models handled
        # with the current routers.
        model_choices = self.build_models_list()
    else:
        model_choices = []

    if len(model_choices) > 0:
        if narrow_queries is None:
            narrow_queries = set()

        narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices)))

    if additional_query_string:
        narrow_queries.add(additional_query_string)

    if narrow_queries:
        params['fq'] = list(narrow_queries)

    query = "%s:%s" % (ID, get_identifier(model_instance))

    try:
        raw_results = self.conn.more_like_this(query, field_name, **params)
    except (IOError, SolrError), e:
        if not self.silently_fail:
            raise

        self.log.error("Failed to fetch More Like This from Solr for document '%s': %s", query, e)
        raw_results = EmptyResults()
def remove(self, obj_or_string, commit=True):
    solr_id = get_identifier(obj_or_string)

    try:
        kwargs = {"commit": commit, ID: solr_id}
        self.conn.delete(**kwargs)
    except (IOError, SolrError), e:
        self.log.error("Failed to remove document '%s' from Solr: %s", solr_id, e)
def more_like_this(
    self,
    model_instance,
    additional_query_string=None,
    start_offset=0,
    end_offset=None,
    models=None,
    limit_to_registered_models=None,
    result_class=None,
    **kwargs
):
    from haystack import connections

    if not self.setup_complete:
        self.setup()

    # Deferred models will have a different class ("RealClass_Deferred_fieldname")
    # which won't be in our registry:
    model_klass = model_instance._meta.concrete_model

    index = (
        connections[self.connection_alias]
        .get_unified_index()
        .get_index(model_klass)
    )
    field_name = index.get_content_field()
    params = {}

    if start_offset is not None:
        params["search_from"] = start_offset

    if end_offset is not None:
        params["search_size"] = end_offset - start_offset

    doc_id = get_identifier(model_instance)

    try:
        raw_results = self.conn.mlt(
            index=self.index_name,
            doc_type="modelresult",
            id=doc_id,
            mlt_fields=[field_name],
            **params
        )
    except elasticsearch.TransportError as e:
        if not self.silently_fail:
            raise

        self.log.error(
            "Failed to fetch More Like This from Elasticsearch for document '%s': %s",
            doc_id,
            e,
            exc_info=True,
        )
        raw_results = {}

    return self._process_results(raw_results, result_class=result_class)
def search_index_delete(instance, **kwargs):
    logger = search_index_delete.get_logger(**kwargs)
    try:
        search_index = (connections['default'].get_unified_index()
                        .get_index(instance.__class__))
        search_index.remove_object(get_identifier(instance))
    except Exception, exc:
        logger.error(exc)
        search_index_delete.retry(exc=exc)
def remove(self, obj):
    """
    Remove indexes for `obj` from the database.

    We delete all instances of `Q<app_name>.<model_name>.<pk>` which
    should be unique to this object.
    """
    database = self._database(writable=True)
    database.delete_document(DOCUMENT_ID_TERM_PREFIX + get_identifier(obj))
def enqueue(self, action, instance):
    it = IndexingTask(
        action=action,
        identifier=get_identifier(instance)
    )
    if action == 'update':
        it.content_object = instance
    it.save()
def remove(self, obj_or_string, commit=True):
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    whoosh_id = get_identifier(obj_or_string)
    self.index.delete_by_query(q=self.parser.parse(u'id:"%s"' % whoosh_id))

    # For now, commit no matter what, as we run into locking issues otherwise.
    self.index.commit()
def remove(self, obj_or_string, commit=True):
    doc_id = get_identifier(obj_or_string)

    try:
        self.conn.delete(self.index_name, 'modelresult', doc_id)
    except pyes.connection.ElasticSearchException, e:
        if not self.silently_fail:
            raise

        self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
def remove(self, obj_or_string, commit=True):
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    whoosh_id = get_identifier(obj_or_string)

    try:
        self.index.delete_by_query(q=self.parser.parse('%s:"%s"' % (ID, whoosh_id)))
    except Exception as e:
        if not self.silently_fail:
            raise

        self.log.error(
            "Failed to remove document '%s' from Whoosh: %s",
            whoosh_id,
            e,
            exc_info=True,
        )
def update(self, index, iterable, commit=True):
    if not self.setup_complete:
        try:
            self.setup()
        except (requests.RequestException, pyelasticsearch.ElasticHttpError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to add documents to Elasticsearch: %s", e)
            return

    prepped_docs = []

    for obj in iterable:
        try:
            prepped_data = index.full_prepare(obj)
            final_data = {}

            # Convert the data to make sure it's happy.
            for key, value in prepped_data.items():
                final_data[key] = self._from_python(value)

            prepped_docs.append(final_data)
        except (requests.RequestException, pyelasticsearch.ElasticHttpError) as e:
            if not self.silently_fail:
                raise

            # We'll log the object identifier but won't include the actual object
            # to avoid the possibility of that generating encoding errors while
            # processing the log message:
            self.log.error("%s while preparing object for update" % e.__class__.__name__,
                           exc_info=True, extra={
                               "data": {
                                   "index": index,
                                   "object": get_identifier(obj)
                               }
                           })

    self.conn.bulk_index(self.index_name, 'modelresult', prepped_docs, id_field=ID)

    if commit:
        self.conn.refresh(index=self.index_name)
def update(self, index, iterable, commit=True):
    if not self.setup_complete:
        try:
            self.setup()
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to add documents to Elasticsearch: %s", e, exc_info=True)
            return

    prepped_docs = []

    for obj in iterable:
        try:
            prepped_data = index.full_prepare(obj)
            final_data = {}

            # Convert the data to make sure it's happy.
            for key, value in prepped_data.items():
                final_data[key] = self._from_python(value)
            final_data['_id'] = final_data[ID]

            prepped_docs.append(final_data)
        except SkipDocument:
            self.log.debug(u"Indexing for object `%s` skipped", obj)
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            # We'll log the object identifier but won't include the actual object
            # to avoid the possibility of that generating encoding errors while
            # processing the log message:
            self.log.error(u"%s while preparing object for update" % e.__class__.__name__,
                           exc_info=True,
                           extra={"data": {"index": index, "object": get_identifier(obj)}})

    bulk(self.conn, prepped_docs, index=self.index_name, doc_type='modelresult')

    if commit:
        self.conn.indices.refresh(index=self.index_name)
def update(self, index, iterable, commit=True):
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        doc = index.full_prepare(obj)

        # Really make sure it's unicode, because Whoosh won't have it any
        # other way.
        for key in doc:
            doc[key] = self._from_python(doc[key])

        # Document boosts aren't supported in Whoosh 2.5.0+.
        if 'boost' in doc:
            del doc['boost']

        try:
            writer.update_document(**doc)
        except Exception as e:
            if not self.silently_fail:
                raise

            # We'll log the object identifier but won't include the actual object
            # to avoid the possibility of that generating encoding errors while
            # processing the log message:
            self.log.error(u"%s while preparing object for update" % e.__class__.__name__,
                           exc_info=True, extra={
                               "data": {
                                   "index": index,
                                   "object": get_identifier(obj)
                               }
                           })

    if len(iterable) > 0:
        # For now, commit no matter what, as we run into locking issues otherwise.
        writer.commit()
def prepare(self, obj):
    """
    Fetches and adds/alters data before indexing.
    """
    self.prepared_data = {
        ID: get_identifier(obj),
        DJANGO_CT: "%s.%s" % (obj._meta.app_label, obj._meta.module_name),
        DJANGO_ID: force_unicode(obj.pk),
    }

    for field_name, field in self.fields.items():
        # Use the possibly overridden name, which will default to the
        # variable name of the field.
        self.prepared_data[field.index_fieldname] = field.prepare(obj)

    for field_name, field in self.fields.items():
        if hasattr(self, "prepare_%s" % field_name):
            value = getattr(self, "prepare_%s" % field_name)(obj)
            self.prepared_data[field.index_fieldname] = value

    return self.prepared_data
def prepare(self, obj, language):
    self.prepared_data = {
        ID: '%s.%s' % (get_identifier(obj), language.code),
        DJANGO_CT: "%s.%s" % (obj._meta.app_label, obj._meta.module_name),
        DJANGO_ID: force_unicode(obj.pk),
    }

    for field_name, field in self.fields.items():
        # Use the possibly overridden name, which will default to the
        # variable name of the field.
        self.prepared_data[field.index_fieldname] = field.prepare(obj)

        if field.use_template and field.document:
            try:
                self.prepared_data[field.index_fieldname] = field._prepare_template(obj, language)
            except:
                pass

        if hasattr(self, "prepare_%s" % field_name):
            value = getattr(self, "prepare_%s" % field_name)(obj, language)
            self.prepared_data[field.index_fieldname] = value

    return self.prepared_data
def remove(self, obj_or_string, commit=True):
    """
    Removes an object from the index.

    :param obj_or_string:
    :param commit:
    """
    if not self.setup_complete:
        try:
            self.setup()
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            doc_id = get_identifier(obj_or_string)
            self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
            return

    for language in self.languages:
        # self.log.debug('removing {0} from index {1}'.format(obj_or_string, language))
        self.index_name = self._index_name_for_language(language)
        with translation.override(language):
            super(ElasticsearchMultilingualSearchBackend, self).remove(obj_or_string, commit=commit)
def update(self, index, iterable, commit=True):
    docs = []

    for obj in iterable:
        try:
            docs.append(index.full_prepare(obj))
        except SkipDocument:
            self.log.debug("Indexing for object `%s` skipped", obj)
        except UnicodeDecodeError:
            if not self.silently_fail:
                raise

            # We'll log the object identifier but won't include the actual object
            # to avoid the possibility of that generating encoding errors while
            # processing the log message:
            self.log.error(
                "UnicodeDecodeError while preparing object for update",
                exc_info=True,
                extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                },
            )

    if len(docs) > 0:
        try:
            self.conn.add(docs, commit=commit, boost=index.get_field_weights())
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to add documents to Solr: %s", e, exc_info=True)
def more_like_this(self, model_instance, additional_query_string=None,
                   start_offset=0, end_offset=None, models=None,
                   limit_to_registered_models=None, result_class=None, **kwargs):
    from haystack import connections

    if not self.setup_complete:
        self.setup()

    # Keep proxy models with their own class.
    if model_instance._meta.proxy:
        model_klass = type(model_instance)
    else:
        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

    index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
    field_name = index.get_content_field()
    params = {}

    if start_offset is not None:
        params['search_from'] = start_offset

    if end_offset is not None:
        params['search_size'] = end_offset - start_offset

    doc_id = get_identifier(model_instance)

    try:
        raw_results = self.conn.mlt(index=self.index_name, doc_type='modelresult',
                                    id=doc_id, mlt_fields=[field_name], **params)
    except elasticsearch.TransportError as e:
        if not self.silently_fail:
            raise

        self.log.error("Failed to fetch More Like This from Elasticsearch for document '%s': %s", doc_id, e)
        raw_results = {}

    return self._process_results(raw_results, result_class=result_class)
def remove(self, obj_or_string, commit=True):
    doc_id = get_identifier(obj_or_string)

    if not self.setup_complete:
        try:
            self.setup()
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to remove document '%s' from Elasticsearch: %s",
                doc_id,
                e,
                exc_info=True,
            )
            return

    try:
        self.conn.delete(
            index=self.index_name,
            id=doc_id,
            ignore=404,
            **self._get_doc_type_option(),
        )

        if commit:
            self.conn.indices.refresh(index=self.index_name)
    except elasticsearch.TransportError as e:
        if not self.silently_fail:
            raise

        self.log.error(
            "Failed to remove document '%s' from Elasticsearch: %s",
            doc_id,
            e,
            exc_info=True,
        )
def remove(self, obj_or_string, commit=True):
    doc_id = get_identifier(obj_or_string)

    if not self.setup_complete:
        try:
            self.setup()
        except (requests.RequestException, pyelasticsearch.ElasticHttpError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
            return

    try:
        self.conn.delete(self.index_name, 'modelresult', doc_id)

        if commit:
            self.conn.refresh(index=self.index_name)
    except (requests.RequestException, pyelasticsearch.ElasticHttpError) as e:
        if not self.silently_fail:
            raise

        self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
def update(self, index, iterable, commit=True):
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    write = AsyncWriter(self.index)

    for obj in iterable:
        try:
            doc = index.full_prepare(obj)
        except SkipDocument:
            self.log.debug(u"Indexing for object '%s' skipped", obj)
        else:
            for key in doc:
                doc[key] = self._from_python(doc[key])

            if 'boost' in doc:
                del doc['boost']

            try:
                write.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                self.log.error(u"%s while preparing object for update" % e.__class__.__name__,
                               exc_info=True, extra={
                                   'data': {
                                       'index': index,
                                       'object': get_identifier(obj)
                                   }
                               })

    if len(iterable) > 0:
        write.commit()
def enqueue_task(action, instance, **kwargs):
    """
    Common utility for enqueuing a task for the given action and model instance.
    """
    identifier = get_identifier(instance)
    options = {}

    if settings.CELERY_HAYSTACK_QUEUE:
        options['queue'] = settings.CELERY_HAYSTACK_QUEUE

    if settings.CELERY_HAYSTACK_COUNTDOWN:
        options['countdown'] = settings.CELERY_HAYSTACK_COUNTDOWN

    task = get_update_task()
    task_func = lambda: task.apply_async((action, identifier), kwargs, **options)

    if hasattr(transaction, 'on_commit'):
        # Django 1.9 on_commit hook
        transaction.on_commit(task_func)
    elif hasattr(connection, 'on_commit'):
        # django-transaction-hooks
        connection.on_commit(task_func)
    else:
        task_func()
def update(self, index, iterable, commit=None):
    docs = []

    if commit is None:
        commit = self.connection_options.get('COMMIT_UPDATES', True)

    for obj in iterable:
        try:
            docs.append(index.full_prepare(obj))
        except UnicodeDecodeError:
            if not self.silently_fail:
                raise

            # We'll log the object identifier but won't include the actual object
            # to avoid the possibility of that generating encoding errors while
            # processing the log message:
            self.log.error(
                u"UnicodeDecodeError while preparing object for update",
                exc_info=True,
                extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                })

    if len(docs) > 0:
        try:
            self.conn.add(docs, commit=commit, boost=index.get_field_weights())
        except (IOError, SolrError), e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to add documents to Solr: %s", e)
def more_like_this(self, model_instance, additional_query_string=None,
                   start_offset=0, end_offset=None, models=None,
                   limit_to_registered_models=None, result_class=None, **kwargs):
    from haystack import connections

    # Handle deferred models.
    if get_proxied_model and hasattr(model_instance, '_deferred') and model_instance._deferred:
        model_klass = get_proxied_model(model_instance._meta)
    else:
        model_klass = type(model_instance)

    index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
    field_name = index.get_content_field()
    params = {
        'fl': '*,score',
    }

    if start_offset is not None:
        params['start'] = start_offset

    if end_offset is not None:
        params['rows'] = end_offset

    narrow_queries = set()

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

    if models and len(models):
        model_choices = sorted([
            '%s.%s' % (model._meta.app_label, model._meta.module_name)
            for model in models
        ])
    elif limit_to_registered_models:
        # Using narrow queries, limit the results to only models handled
        # with the current routers.
        model_choices = self.build_models_list()
    else:
        model_choices = []

    if len(model_choices) > 0:
        if narrow_queries is None:
            narrow_queries = set()

        narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices)))

    if additional_query_string:
        narrow_queries.add(additional_query_string)

    if narrow_queries:
        params['fq'] = list(narrow_queries)

    query = "%s:%s" % (ID, get_identifier(model_instance))

    try:
        raw_results = self.conn.more_like_this(query, field_name, **params)
    except (IOError, SolrError), e:
        if not self.silently_fail:
            raise

        self.log.error("Failed to fetch More Like This from Solr for document '%s': %s", query, e)
        raw_results = EmptyResults()
def remove(self, obj, commit=True):
    del self.docs[get_identifier(obj)]
def test_haystack_identifier_method(self):
    get_identifier = _lookup_identifier_method()
    self.assertEqual(get_identifier('a.b.c'), 'a.b.c')
def more_like_this(self, model_instance, additional_query_string=None,
                   start_offset=0, end_offset=None, models=None,
                   limit_to_registered_models=None, result_class=None, **kwargs):
    from haystack import connections

    if not self.setup_complete:
        self.setup()

    # Deferred models will have a different class ("RealClass_Deferred_fieldname")
    # which won't be in our registry:
    model_klass = model_instance._meta.concrete_model

    index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
    field_name = index.get_content_field()
    params = {}

    if start_offset is not None:
        params['from_'] = start_offset

    if end_offset is not None:
        params['size'] = end_offset - start_offset

    doc_id = get_identifier(model_instance)

    try:
        # More like this Query
        # https://www.elastic.co/guide/en/elasticsearch/reference/2.2/query-dsl-mlt-query.html
        mlt_query = {
            'query': {
                'more_like_this': {
                    'fields': [field_name],
                    'like': [{"_id": doc_id}]
                }
            }
        }

        narrow_queries = []

        if additional_query_string and additional_query_string != '*:*':
            additional_filter = {
                "query_string": {
                    "query": additional_query_string
                }
            }
            narrow_queries.append(additional_filter)

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            model_filter = {"terms": {DJANGO_CT: model_choices}}
            narrow_queries.append(model_filter)

        if len(narrow_queries) > 0:
            mlt_query = {
                "query": {
                    "bool": {
                        'must': mlt_query['query'],
                        'filter': {
                            'bool': {
                                'must': list(narrow_queries)
                            }
                        }
                    }
                }
            }

        raw_results = self.conn.search(body=mlt_query,
                                       index=self.index_name,
                                       _source=True, **params)
    except elasticsearch.TransportError as e:
        if not self.silently_fail:
            raise

        self.log.error("Failed to fetch More Like This from Elasticsearch for document '%s': %s",
                       doc_id, e, exc_info=True)
        raw_results = {}

    return self._process_results(raw_results, result_class=result_class)
def more_like_this(self, model_instance, additional_query_string=None,
                   start_offset=0, end_offset=None, models=None,
                   limit_to_registered_models=None, result_class=None, **kwargs):
    from haystack import connections

    if not self.setup_complete:
        self.setup()

    # Deferred models will have a different class ("RealClass_Deferred_fieldname")
    # which won't be in our registry:
    model_klass = model_instance._meta.concrete_model

    index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
    field_name = index.get_content_field()
    params = {}

    if start_offset is not None:
        params['from_'] = start_offset

    if end_offset is not None:
        params['size'] = end_offset - start_offset

    doc_id = get_identifier(model_instance)

    more_like_this_query = {
        'query': {
            'more_like_this': {
                'fields': [field_name],
                'like': [{"_id": doc_id}]
            }
        }
    }

    additional_filters = []

    if additional_query_string and additional_query_string != '*:*':
        additional_filter = {
            "query_string": {"query": additional_query_string}
        }
        additional_filters.append(additional_filter)

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

    if models:
        content_types = [get_model_ct(model) for model in models]
    elif limit_to_registered_models:
        content_types = self.build_models_list()
    else:
        content_types = []

    if content_types:
        model_filter = {"terms": {DJANGO_CT: content_types}}
        additional_filters.append(model_filter)

    if additional_filters:
        more_like_this_query = {
            "query": {
                "bool": {
                    'must': more_like_this_query['query'],
                    'filter': {
                        'bool': {
                            'must': additional_filters
                        }
                    }
                }
            }
        }

    try:
        raw_results = self.conn.search(index=self.index_name,
                                       doc_type='modelresult',
                                       body=more_like_this_query,
                                       _source=True, **params)
    except elasticsearch.TransportError as e:
        if not self.silently_fail:
            raise

        self.log.error("Failed to fetch More Like This from Elasticsearch for document '%s': %s",
                       doc_id, e, exc_info=True)
        raw_results = {}

    return self._process_results(raw_results, result_class=result_class)
def update(self, index, iterable):
    """
    Updates the `index` with any objects in `iterable` by adding/updating
    the database as needed.

    Required arguments:
        `index` -- The `SearchIndex` to process
        `iterable` -- An iterable of model instances to index

    For each object in `iterable`, a document is created containing all
    of the terms extracted from `index.full_prepare(obj)` with field
    prefixes, and 'as-is' as needed. Also, if the field type is 'text' it
    will be stemmed and stored with the 'Z' prefix as well.

    eg. `content:Testing` ==> `testing, Ztest, ZXCONTENTtest, XCONTENTtest`

    Each document also contains an extra term in the format:

        `XCONTENTTYPE<app_name>.<model_name>`

    As well as a unique identifier in the format:

        `Q<app_name>.<model_name>.<pk>`

    eg.: foo.bar (pk=1) ==> `Qfoo.bar.1`, `XCONTENTTYPEfoo.bar`

    This is useful for querying for a specific document corresponding to
    a model instance.

    The document also contains a pickled version of the object itself and
    the document ID in the document data field.

    Finally, we also store field values to be used for sorting data. We
    store these in the document value slots (position zero is reserved
    for the document ID). All values are stored as unicode strings with
    conversion of float, int, double, values being done by Xapian itself
    through the use of the :method:xapian.sortable_serialise method.
    """
    database = self._database(writable=True)
    try:
        for obj in iterable:
            document = xapian.Document()

            term_generator = xapian.TermGenerator()
            term_generator.set_database(database)
            term_generator.set_stemmer(xapian.Stem(self.language))
            if self.include_spelling is True:
                term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)
            term_generator.set_document(document)

            document_id = DOCUMENT_ID_TERM_PREFIX + get_identifier(obj)
            data = index.full_prepare(obj)
            weights = index.get_field_weights()

            for field in self.schema:
                if field['field_name'] in data.keys():
                    prefix = DOCUMENT_CUSTOM_TERM_PREFIX + field['field_name'].upper()
                    value = data[field['field_name']]

                    try:
                        weight = int(weights[field['field_name']])
                    except KeyError:
                        weight = 1

                    if field['type'] == 'text':
                        if field['multi_valued'] == 'false':
                            term = _marshal_term(value)
                            term_generator.index_text(term, weight)
                            term_generator.index_text(term, weight, prefix)
                            if len(term.split()) == 1:
                                document.add_term(term, weight)
                                document.add_term(prefix + term, weight)
                            document.add_value(field['column'], _marshal_value(value))
                        else:
                            for term in value:
                                term = _marshal_term(term)
                                term_generator.index_text(term, weight)
                                term_generator.index_text(term, weight, prefix)
                                if len(term.split()) == 1:
                                    document.add_term(term, weight)
                                    document.add_term(prefix + term, weight)
                    else:
                        if field['multi_valued'] == 'false':
                            term = _marshal_term(value)
                            if len(term.split()) == 1:
                                document.add_term(term, weight)
                                document.add_term(prefix + term, weight)
                            document.add_value(field['column'], _marshal_value(value))
                        else:
                            for term in value:
                                term = _marshal_term(term)
                                if len(term.split()) == 1:
                                    document.add_term(term, weight)
                                    document.add_term(prefix + term, weight)

            document.set_data(pickle.dumps(
                (obj._meta.app_label, obj._meta.module_name, obj.pk, data),
                pickle.HIGHEST_PROTOCOL
            ))
            document.add_term(document_id)
            document.add_term(DOCUMENT_CT_TERM_PREFIX + u'%s.%s' %
                              (obj._meta.app_label, obj._meta.module_name))

            database.replace_document(document_id, document)

    except UnicodeDecodeError:
        sys.stderr.write('Chunk failed.\n')
        pass

    finally:
        database.close()
def enqueue_delete(self, instance, **kwargs):
    if transaction.is_dirty():
        transaction.commit_unless_managed()
    tasks.delete_from_index.delay(get_identifier(instance))
def more_like_this(self, model_instance, additional_query_string=None,
                   start_offset=0, end_offset=None, models=None,
                   limit_to_registered_models=None, result_class=None, **kwargs):
    if not self.setup_complete:
        self.setup()

    # Handle deferred models.
    if get_proxied_model and hasattr(model_instance, '_deferred') and model_instance._deferred:
        model_klass = get_proxied_model(model_instance._meta)
    else:
        model_klass = type(model_instance)

    field_name = self.content_field_name
    narrow_queries = set()
    narrowed_results = None
    self.index = self.index.refresh()

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

    if models and len(models):
        model_choices = sorted([
            '%s.%s' % (model._meta.app_label, model._meta.module_name)
            for model in models
        ])
    elif limit_to_registered_models:
        # Using narrow queries, limit the results to only models handled
        # with the current routers.
        model_choices = self.build_models_list()
    else:
        model_choices = []

    if len(model_choices) > 0:
        if narrow_queries is None:
            narrow_queries = set()

        narrow_queries.add(' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

    if additional_query_string and additional_query_string != '*':
        narrow_queries.add(additional_query_string)

    narrow_searcher = None

    if narrow_queries is not None:
        # Potentially expensive? I don't see another way to do it in Whoosh...
        narrow_searcher = self.index.searcher()

        for nq in narrow_queries:
            recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_unicode(nq)))

            if len(recent_narrowed_results) <= 0:
                return {
                    'results': [],
                    'hits': 0,
                }

            if narrowed_results:
                narrowed_results.filter(recent_narrowed_results)
            else:
                narrowed_results = recent_narrowed_results

    # Prevent against Whoosh throwing an error. Requires an end_offset
    # greater than 0.
    if not end_offset is None and end_offset <= 0:
        end_offset = 1

    # Determine the page.
    page_num = 0

    if end_offset is None:
        end_offset = 1000000

    if start_offset is None:
        start_offset = 0

    page_length = end_offset - start_offset

    if page_length and page_length > 0:
        page_num = start_offset / page_length

    # Increment because Whoosh uses 1-based page numbers.
    page_num += 1

    self.index = self.index.refresh()
    raw_results = EmptyResults()

    if self.index.doc_count():
        query = "%s:%s" % (ID, get_identifier(model_instance))
        searcher = self.index.searcher()
        parsed_query = self.parser.parse(query)
        results = searcher.search(parsed_query)

        if len(results):
            raw_results = results[0].more_like_this(field_name, top=end_offset)

    # Handle the case where the results have been narrowed.
    if narrowed_results is not None and hasattr(raw_results, 'filter'):
        raw_results.filter(narrowed_results)

    try:
        raw_page = ResultsPage(raw_results, page_num, page_length)
    except ValueError:
        if not self.silently_fail:
            raise

        return {
            'results': [],
            'hits': 0,
            'spelling_suggestion': None,
        }

    results = self._process_results(raw_page, result_class=result_class)
    searcher.close()

    if hasattr(narrow_searcher, 'close'):
        narrow_searcher.close()

    return results
class ElasticsearchSearchBackend(BaseSearchBackend):
    # Words reserved by Elasticsearch for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Elasticsearch for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')',
        '{', '}', '[', ']', '^', '"', '~', '*', '?', ':',
    )

    # Settings to add an n-gram & edge n-gram analyzer.
    DEFAULT_SETTINGS = {
        'settings': {
            "analysis": {
                "analyzer": {
                    "ngram_analyzer": {
                        "type": "custom",
                        "tokenizer": "lowercase",
                        "filter": ["haystack_ngram"]
                    },
                    "edgengram_analyzer": {
                        "type": "custom",
                        "tokenizer": "lowercase",
                        "filter": ["haystack_edgengram"]
                    }
                },
                "tokenizer": {
                    "haystack_ngram_tokenizer": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 15,
                    },
                    "haystack_edgengram_tokenizer": {
                        "type": "edgeNGram",
                        "min_gram": 2,
                        "max_gram": 15,
                        "side": "front"
                    }
                },
                "filter": {
                    "haystack_ngram": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 15
                    },
                    "haystack_edgengram": {
                        "type": "edgeNGram",
                        "min_gram": 2,
                        "max_gram": 15
                    }
                }
            }
        }
    }

    def __init__(self, connection_alias, **connection_options):
        super(ElasticsearchSearchBackend, self).__init__(connection_alias, **connection_options)

        if not 'URL' in connection_options:
            raise ImproperlyConfigured("You must specify a 'URL' in your settings for connection '%s'." % connection_alias)

        if not 'INDEX_NAME' in connection_options:
            raise ImproperlyConfigured("You must specify a 'INDEX_NAME' in your settings for connection '%s'." % connection_alias)

        self.conn = pyelasticsearch.ElasticSearch(connection_options['URL'], timeout=self.timeout)
        self.index_name = connection_options['INDEX_NAME']
        self.log = logging.getLogger('haystack')
        self.setup_complete = False
        self.existing_mapping = {}

    def setup(self):
        """
        Defers loading until needed.
        """
        # Get the existing mapping & cache it. We'll compare it
        # during the ``update`` & if it doesn't match, we'll put the new
        # mapping.
        try:
            self.existing_mapping = self.conn.get_mapping(index=self.index_name)
        except Exception:
            if not self.silently_fail:
                raise

        unified_index = haystack.connections[self.connection_alias].get_unified_index()
        self.content_field_name, field_mapping = self.build_schema(unified_index.all_searchfields())
        current_mapping = {
            'modelresult': {
                'properties': field_mapping
            }
        }

        if current_mapping != self.existing_mapping:
            try:
                # Make sure the index is there first.
                self.conn.create_index(self.index_name, self.DEFAULT_SETTINGS)
                self.conn.put_mapping(self.index_name, 'modelresult', current_mapping)
                self.existing_mapping = current_mapping
            except Exception:
                if not self.silently_fail:
                    raise

        self.setup_complete = True

    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            try:
                self.setup()
            except (requests.RequestException, pyelasticsearch.ElasticHttpError), e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Elasticsearch: %s", e)
                return

        prepped_docs = []

        for obj in iterable:
            try:
                prepped_data = index.full_prepare(obj)
                final_data = {}

                # Convert the data to make sure it's happy.
                for key, value in prepped_data.items():
                    final_data[key] = self._from_python(value)

                prepped_docs.append(final_data)
            except (requests.RequestException, pyelasticsearch.ElasticHttpError), e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e.__class__.__name__,
                               exc_info=True, extra={
                                   "data": {
                                       "index": index,
                                       "object": get_identifier(obj)
                                   }
                               })
def enqueue_delete(self, instance, **kwargs):
    get_queue(QUEUE_NAME).enqueue(index_delete_obj, get_identifier(instance))
def more_like_this(self, model_instance, additional_query_string=None,
                   start_offset=0, end_offset=None, models=None,
                   limit_to_registered_models=None, result_class=None, **kwargs):
    from haystack import connections

    # Deferred models will have a different class ("RealClass_Deferred_fieldname")
    # which won't be in our registry:
    model_klass = model_instance._meta.concrete_model

    index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
    field_name = index.get_content_field()
    params = {
        'fl': '*,score',
    }

    if start_offset is not None:
        params['start'] = start_offset

    if end_offset is not None:
        params['rows'] = end_offset

    narrow_queries = set()

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

    if models and len(models):
        model_choices = sorted(get_model_ct(model) for model in models)
    elif limit_to_registered_models:
        # Using narrow queries, limit the results to only models handled
        # with the current routers.
        model_choices = self.build_models_list()
    else:
        model_choices = []

    if len(model_choices) > 0:
        if narrow_queries is None:
            narrow_queries = set()

        narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices)))

    if additional_query_string:
        narrow_queries.add(additional_query_string)

    if narrow_queries:
        params['fq'] = list(narrow_queries)

    query = "%s:%s" % (ID, get_identifier(model_instance))

    try:
        raw_results = self.conn.more_like_this(query, field_name, **params)
    except (IOError, SolrError) as e:
        if not self.silently_fail:
            raise

        self.log.error("Failed to fetch More Like This from Solr for document '%s': %s",
                       query, e, exc_info=True)
        raw_results = EmptyResults()

    return self._process_results(raw_results, result_class=result_class)
def remove(self, obj, commit=True):
    global MOCK_INDEX_DATA
    if commit:
        del MOCK_INDEX_DATA[get_identifier(obj)]
def more_like_this(self, model_instance, additional_query=None,
                   start_offset=0, end_offset=None,
                   limit_to_registered_models=True, result_class=None, **kwargs):
    """
    Given a model instance, returns a result set of similar documents.

    Required arguments:
        `model_instance` -- The model instance to use as a basis for
                            retrieving similar documents.

    Optional arguments:
        `additional_query` -- An additional query to narrow results
        `start_offset` -- The starting offset (default=0)
        `end_offset` -- The ending offset (default=None), if None, then all documents
        `limit_to_registered_models` -- Limit returned results to models registered in
                                        the current `SearchSite` (default = True)

    Returns:
        A dictionary with the following keys:
            `results` -- A list of `SearchResult`
            `hits` -- The total available results

    Opens a database connection, then builds a simple query using the
    `model_instance` to build the unique identifier.

    For each document retrieved (should always be one), adds an entry into
    an RSet (relevance set) with the document id, then uses the RSet to
    query for an ESet (a set of terms that can be used to suggest
    expansions to the original query), omitting any document that was in
    the original query.

    Finally, processes the resulting matches and returns.
    """
    database = self._database()

    if result_class is None:
        result_class = SearchResult

    query = xapian.Query(DOCUMENT_ID_TERM_PREFIX + get_identifier(model_instance))

    enquire = xapian.Enquire(database)
    enquire.set_query(query)

    rset = xapian.RSet()

    if not end_offset:
        end_offset = database.get_doccount()

    for match in self._get_enquire_mset(database, enquire, 0, end_offset):
        rset.add_document(match.docid)

    query = xapian.Query(
        xapian.Query.OP_ELITE_SET,
        [expand.term for expand in enquire.get_eset(match.document.termlist_count(), rset, XHExpandDecider())],
        match.document.termlist_count()
    )
    query = xapian.Query(
        xapian.Query.OP_AND_NOT,
        [query, DOCUMENT_ID_TERM_PREFIX + get_identifier(model_instance)]
    )

    if limit_to_registered_models:
        registered_models = self.build_models_list()

        if len(registered_models) > 0:
            query = xapian.Query(
                xapian.Query.OP_AND, query,
                xapian.Query(
                    xapian.Query.OP_OR,
                    [xapian.Query('%s%s' % (DOCUMENT_CT_TERM_PREFIX, model)) for model in registered_models]
                )
            )

    if additional_query:
        query = xapian.Query(xapian.Query.OP_AND, query, additional_query)

    enquire.set_query(query)

    results = []
    matches = self._get_enquire_mset(database, enquire, start_offset, end_offset)

    for match in matches:
        app_label, module_name, pk, model_data = pickle.loads(self._get_document_data(database, match.document))
        results.append(result_class(app_label, module_name, pk, match.percent, **model_data))

    return {
        'results': results,
        'hits': self._get_hit_count(database, enquire),
        'facets': {
            'fields': {},
            'dates': {},
            'queries': {},
        },
        'spelling_suggestion': None,
    }
                prepped_docs.append(final_data)
            except (requests.RequestException, pyelasticsearch.ElasticSearchError), e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e.__class__.__name__,
                               exc_info=True, extra={
                                   "data": {
                                       "index": index,
                                       "object": get_identifier(obj)
                                   }
                               })

        self.conn.bulk_index(self.index_name, 'modelresult', prepped_docs, id_field=ID)

        if commit:
            self.conn.refresh(indexes=[self.index_name])

    def _remove(self, obj_or_string, commit=True):
        doc_id = get_identifier(obj_or_string)

        if not self.setup_complete:
def more_like_this(self, model_instance, additional_query_string=None,
                   start_offset=0, end_offset=None, models=None,
                   limit_to_registered_models=None, result_class=None, **kwargs):
    if not self.setup_complete:
        self.setup()

    field_name = self.content_field_name
    narrow_queries = set()
    narrowed_results = None
    self.index = self.index.refresh()

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

    if models and len(models):
        model_choices = sorted(get_model_ct(model) for model in models)
    elif limit_to_registered_models:
        # Using narrow queries, limit the results to only models handled
        # with the current routers.
        model_choices = self.build_models_list()
    else:
        model_choices = []

    if len(model_choices) > 0:
        if narrow_queries is None:
            narrow_queries = set()

        narrow_queries.add(' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

    if additional_query_string and additional_query_string != '*':
        narrow_queries.add(additional_query_string)

    narrow_searcher = None

    if narrow_queries is not None:
        # Potentially expensive? I don't see another way to do it in Whoosh...
        narrow_searcher = self.index.searcher()

        for nq in narrow_queries:
            recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)), limit=None)

            if len(recent_narrowed_results) <= 0:
                return {
                    'results': [],
                    'hits': 0,
                }

            if narrowed_results:
                narrowed_results.filter(recent_narrowed_results)
            else:
                narrowed_results = recent_narrowed_results

    page_num, page_length = self.calculate_page(start_offset, end_offset)

    self.index = self.index.refresh()
    raw_results = EmptyResults()

    searcher = None
    if self.index.doc_count():
        query = "%s:%s" % (ID, get_identifier(model_instance))
        searcher = self.index.searcher()
        parsed_query = self.parser.parse(query)
        results = searcher.search(parsed_query)

        if len(results):
            raw_results = results[0].more_like_this(field_name, top=end_offset)

    # Handle the case where the results have been narrowed.
    if narrowed_results is not None and hasattr(raw_results, 'filter'):
        raw_results.filter(narrowed_results)

    try:
        raw_page = ResultsPage(raw_results, page_num, page_length)
    except ValueError:
        if not self.silently_fail:
            raise

        return {
            'results': [],
            'hits': 0,
            'spelling_suggestion': None,
        }

    # Because as of Whoosh 2.5.1, it will return the wrong page of
    # results if you request something too high. :(
    if raw_page.pagenum < page_num:
        return {
            'results': [],
            'hits': 0,
            'spelling_suggestion': None,
        }

    results = self._process_results(raw_page, result_class=result_class)

    if searcher:
        searcher.close()

    if hasattr(narrow_searcher, 'close'):
        narrow_searcher.close()

    return results
def enqueue_delete(self, instance, **kwargs):
    search_index_delete.delay(instance._meta.app_label,
                              instance._meta.module_name,
                              get_identifier(instance))