def remove_indexed_files(model, project_slug, version_slug=None, build_id=None):
    """
    Remove files from `version_slug` of `project_slug` from the search index.

    :param model: Class of the model to be deleted.
    :param project_slug: Project slug.
    :param version_slug: Version slug. If it isn't given, all indexed files
        from `project` are deleted.
    :param build_id: Build id. If it isn't given, all indexed files from
        `version` are deleted.
    """
    log.bind(
        project_slug=project_slug,
        version_slug=version_slug,
    )

    if not DEDConfig.autosync_enabled():
        log.info('Autosync disabled, skipping removal from the search index.')
        return

    try:
        document = list(registry.get_documents(models=[model]))[0]
        log.info('Deleting old files from search index.')
        documents = (
            document().search()
            .filter('term', project=project_slug)
        )
        if version_slug:
            documents = documents.filter('term', version=version_slug)
        if build_id:
            documents = documents.exclude('term', build=build_id)
        documents.delete()
    except Exception:
        log.exception('Unable to delete a subset of files. Continuing.')
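A minimal usage sketch for the helper above; the `HTMLFile` model and the slugs are illustrative assumptions, not values from the original code.

# Delete the indexed files of one version, keeping those from build 42.
remove_indexed_files(
    model=HTMLFile,  # hypothetical model registered with a DED document
    project_slug='my-project',
    version_slug='latest',
    build_id=42,
)

# Omitting `version_slug` and `build_id` deletes everything indexed for the project.
remove_indexed_files(model=HTMLFile, project_slug='my-project')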
def handle_populate(self, models, options, connection):
    """
    Insert all given models in their respective indices.

    Args:
        models: An iterable with model classes.
        options: A dict with command line options.
        connection: The Elasticsearch connection.
    """
    for doc in registry.get_documents(models):
        qs = doc().get_queryset()
        if not options['current']:
            # We want to find and populate the latest index.
            # `keys()` returns a view, so build a sorted list before indexing into it.
            indices = sorted(connection.indices.get('{}.*'.format(doc._doc_type.index)).keys())
            if not indices:
                raise AttributeError("The index '{}' does not exist.".format(doc._doc_type.index))
            # Dirty hack to override the doctype meta, but it works.
            doc._doc_type.index = indices[-1]
        elif not connection.indices.exists(doc._doc_type.index):
            # We want to use the current index, which means we can use the
            # alias. If so, we do need to check the alias/index exists, or
            # we risk creating an index implicitly (which is a mess to
            # clean up).
            raise AttributeError("The index '{}' does not exist.".format(doc._doc_type.index))

        self.stdout.write("Indexing {} '{}' objects to '{}'".format(
            qs.count(), doc._doc_type.model.__name__, doc._doc_type.index)
        )
        doc().update(qs)
def search_bulk_index(model: Type[Model], qs: QuerySet, **kwargs):
    """
    Django ORM bulk functions such as `bulk_create`, `bulk_update` and
    `update` do not send signals for the modified objects and therefore do
    not automatically update the Elasticsearch index. This function
    bulk-reindexes the changed objects instead.
    """
    [current_doc] = registry.get_documents([model])
    return current_doc().update(qs, **kwargs)
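A hedged usage sketch for `search_bulk_index`: `bulk_create` bypasses `post_save` signals, so the new rows never reach Elasticsearch unless reindexed explicitly. `Article` is a hypothetical model with a single registered document.

from myapp.models import Article  # hypothetical import

created = Article.objects.bulk_create([
    Article(title='First'),
    Article(title='Second'),
])
# Reindex exactly the rows that bypassed the signal machinery.
search_bulk_index(Article, Article.objects.filter(pk__in=[a.pk for a in created]))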
def remove_indexed_files(model, version, build):
    """
    Remove files from the version from the search index.

    This excludes files from the current build.
    """
    if not DEDConfig.autosync_enabled():
        log.info(
            'Autosync disabled, skipping removal from the search index for: %s:%s',
            version.project.slug,
            version.slug,
        )
        return

    try:
        document = list(registry.get_documents(models=[model]))[0]
        log.info(
            'Deleting old files from search index for: %s:%s',
            version.project.slug,
            version.slug,
        )
        (
            document().search()
            .filter('term', project=version.project.slug)
            .filter('term', version=version.slug)
            .exclude('term', build=build)
            .delete()
        )
    except Exception:
        log.exception('Unable to delete a subset of files. Continuing.')
def remove_indexed_file(sender, instance_list, **kwargs):
    """Remove deleted files from the build process."""
    if not instance_list:
        return

    model = sender
    document = list(registry.get_documents(models=[model]))[0]
    version = kwargs.get('version')
    commit = kwargs.get('commit')

    index_kwargs = {
        'app_label': model._meta.app_label,
        'model_name': model.__name__,
        'document_class': str(document),
        'objects_id': [obj.id for obj in instance_list],
    }

    # Do not index if autosync is disabled globally
    if DEDConfig.autosync_enabled():
        delete_objects_in_es(**index_kwargs)

        if version and commit:
            # Sanity check by deleting all old files not in this commit
            log.info('Deleting old commits from search index')
            document().search().filter(
                'term',
                version=version.slug,
            ).filter(
                'term',
                project=version.project.slug,
            ).exclude(
                'term',
                commit=commit,
            ).delete()
def index_new_files(model, version, build):
    """Index new files from the version into the search index."""
    if not DEDConfig.autosync_enabled():
        log.info(
            'Autosync disabled, skipping indexing into the search index for: %s:%s',
            version.project.slug,
            version.slug,
        )
        return

    try:
        document = list(registry.get_documents(models=[model]))[0]
        doc_obj = document()
        queryset = (
            doc_obj.get_queryset()
            .filter(project=version.project, version=version, build=build)
        )
        log.info(
            'Indexing new objects into search index for: %s:%s',
            version.project.slug,
            version.slug,
        )
        doc_obj.update(queryset.iterator())
    except Exception:
        log.exception('Unable to index a subset of files. Continuing.')
def update_hierarchy_labels(apps, schema_editor):
    region = apps.get_model('regions', 'Region')
    placeholder = PlaceholderApi()
    all_regions = region.objects.all()
    regions_ids = list(all_regions.values_list('region_id', flat=True))
    reg_data = placeholder.find_by_id(regions_ids)
    placeholder.add_hierarchy_labels(reg_data)
    for reg in all_regions:
        try:
            reg.i18n.update({
                'hierarchy_label_pl': reg_data[str(reg.region_id)]['hierarchy_label_pl'],
                'hierarchy_label_en': reg_data[str(reg.region_id)]['hierarchy_label_en'],
            })
            reg.hierarchy_label = reg_data[str(reg.region_id)]['hierarchy_label_pl']
            reg.save()
        except KeyError:
            logger.debug(
                f"Couldn't update hierarchy label for region with id {reg.region_id}"
            )
    docs = registry.get_documents((region,))
    default_reg = region.objects.get(region_id=settings.DEFAULT_REGION_ID)
    default_reg.i18n.update({
        'hierarchy_label_pl': 'Polska',
        'hierarchy_label_en': 'Poland',
    })
    default_reg.hierarchy_label = 'Polska'
    default_reg.save()
    for doc in docs:
        doc().update(all_regions)
def save_regions_data(values):
    main_regions = []
    additional_regions = []
    if values:
        placeholder = PlaceholderApi()
        reg_data, all_regions_ids = placeholder.get_all_regions_details(values)
        existing_regions = Region.objects.filter(region_id__in=all_regions_ids)
        existing_regions_ids = list(existing_regions.values_list('region_id', flat=True))
        to_create_regions_ids = set(all_regions_ids).difference(set(existing_regions_ids))
        to_create_regions = {
            reg_id: reg_data[str(reg_id)]
            for reg_id in to_create_regions_ids
        }
        created_regions = Region.objects.create_new_regions(to_create_regions)
        if created_regions:
            docs = registry.get_documents((Region,))
            for doc in docs:
                doc().update(created_regions)
        all_regions = list(existing_regions) + created_regions
        for reg in all_regions:
            if str(reg.region_id) in values:
                main_regions.append(reg)
            else:
                additional_regions.append(reg)
    return main_regions, additional_regions
def _populate(self, models, options):
    parallel = options['parallel']
    for doc in registry.get_documents(models):
        self.stdout.write("Indexing {} '{}' objects {}".format(
            doc().get_queryset().count() if options['count'] else "all",
            doc.django.model.__name__,
            "(parallel)" if parallel else "",
        ))
        qs = doc().get_indexing_queryset()
        doc().update(qs, parallel=parallel)
def update_index():
    models = registry.get_models()
    for doc in registry.get_documents(models):
        qs = doc().get_queryset()
        log.info('indexing {} "{}" objects'.format(
            qs.count(), doc._doc_type.model.__name__))
        doc().update(qs)
def _run_reindex_tasks(self, models, queue):
    apply_async_kwargs = {'priority': 0}
    if queue:
        log.info('Adding indexing tasks to queue %s', queue)
        apply_async_kwargs['queue'] = queue
    else:
        log.info('Adding indexing tasks to default queue')

    index_time = timezone.now()
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')

    for doc in registry.get_documents(models):
        queryset = doc().get_queryset()
        # Get latest object from the queryset
        app_label = queryset.model._meta.app_label
        model_name = queryset.model.__name__

        index_name = doc._doc_type.index
        new_index_name = "{}_{}".format(index_name, timestamp)
        # Set the index temporarily for indexing;
        # it is only overridden while this command runs.
        doc._doc_type.index = new_index_name

        pre_index_task = create_new_es_index.si(
            app_label=app_label,
            model_name=model_name,
            index_name=index_name,
            new_index_name=new_index_name,
        )

        indexing_tasks = self._get_indexing_tasks(
            app_label=app_label,
            model_name=model_name,
            queryset=queryset,
            index_name=new_index_name,
            document_class=str(doc),
        )

        post_index_task = switch_es_index.si(
            app_label=app_label,
            model_name=model_name,
            index_name=index_name,
            new_index_name=new_index_name,
        )

        # Task to index the objects that have been inserted into the database
        # while the indexing tasks were running. We pass the creation time of
        # the latest object so it's possible to index the later items.
        missed_index_task = index_missing_objects.si(
            app_label=app_label,
            model_name=model_name,
            document_class=str(doc),
            index_generation_time=index_time,
        )

        # http://celery.readthedocs.io/en/latest/userguide/canvas.html#chords
        chord_tasks = chord(header=indexing_tasks, body=post_index_task)
        if queue:
            pre_index_task.set(queue=queue)
            chord_tasks.set(queue=queue)
            missed_index_task.set(queue=queue)
        # http://celery.readthedocs.io/en/latest/userguide/canvas.html#chain
        chain(pre_index_task, chord_tasks, missed_index_task).apply_async(**apply_async_kwargs)

        message = ("Successfully issued tasks for {}.{}, total {} items"
                   .format(app_label, model_name, queryset.count()))
        log.info(message)
def setUpTestData(cls):
    super(ElasticsearchApiTestCase, cls).setUpTestData()
    # Check if this model has Elasticsearch mappings and take one of them
    # to test with (unless a custom es_doc_type has been defined).
    if not cls.es_doc_type:
        # `get_documents` expects an iterable of model classes.
        doc_types = registry.get_documents([cls.model_cls])
        if doc_types:
            cls.es_doc_type = next(iter(doc_types))
def teardown(self):
    # Stop listening to all model saves.
    for doc in registry.get_documents():
        if getattr(doc, 'special_signals', False):
            continue
        model = doc._doc_type.model
        models.signals.post_save.disconnect(self.handle_save, sender=model)
        models.signals.post_delete.disconnect(self.handle_delete, sender=model)
        models.signals.m2m_changed.disconnect(self.handle_m2m_changed, sender=model)
        models.signals.pre_delete.disconnect(self.handle_pre_delete, sender=model)
def setup(self):
    for doc in registry.get_documents():
        if getattr(doc, 'special_signals', False):
            continue
        model = doc._doc_type.model
        models.signals.post_save.connect(self.handle_save, sender=model)
        models.signals.post_delete.connect(self.handle_delete, sender=model)
        models.signals.m2m_changed.connect(self.handle_m2m_changed, sender=model)
        models.signals.pre_delete.connect(self.handle_pre_delete, sender=model)
def setup(self):
    for doc in registry.get_documents():
        if getattr(doc, 'special_signals', False):
            continue
        model = doc.django.model
        models.signals.post_save.connect(self.handle_save, sender=model)
        models.signals.post_delete.connect(self.handle_delete, sender=model)
        models.signals.m2m_changed.connect(self.handle_m2m_changed, sender=model)
        models.signals.pre_delete.connect(self.handle_pre_delete, sender=model)
def teardown(self):
    # Stop listening to all model saves.
    for doc in registry.get_documents():
        if getattr(doc, 'special_signals', False):
            continue
        model = doc.django.model
        models.signals.post_save.disconnect(self.handle_save, sender=model)
        models.signals.post_delete.disconnect(self.handle_delete, sender=model)
        models.signals.m2m_changed.disconnect(self.handle_m2m_changed, sender=model)
        models.signals.pre_delete.disconnect(self.handle_pre_delete, sender=model)
def update_related_task(app_label, object_name, pk_set):
    model = apps.get_model(app_label, object_name)
    docs = registry.get_documents((model,))
    for doc in docs:
        qs = model.objects.filter(pk__in=pk_set)
        doc().update(qs.iterator())
    return {
        'app': model._meta.app_label,
        'model': model._meta.object_name,
        'instance_id': pk_set,
    }
def __init__(self, model=None, query=None, using=None, hints=None, search=None):
    super(ElasticQuerySet, self).__init__(model=model, query=query, using=using, hints=hints)
    if search is None:
        doc_types = registry.get_documents([model])
        search = Search(
            index=[doc_type._doc_type.index for doc_type in doc_types],
            doc_type=list(doc_types),
        ).source(include=[])[0:10000]
    self.search = search
    self._total = None
def _update(self, models, options):
    """
    Update indices with a sanity check.

    A new index is created and populated with data; the alias is only
    switched over from the previous index after a sanity check, to prevent
    missing data.
    """
    alias_mappings = []
    for document in registry.get_documents(models):
        # pylint: disable=protected-access
        index = document._index
        record_count = self.get_record_count(document)
        alias, new_index_name = self.prepare_backend_index(index)
        alias_mappings.append(
            AliasMapper(document, index, new_index_name, alias, record_count))

    # Set the alias (from settings) to the timestamped catalog.
    run_attempts = 0
    indexes_pending = {
        key: ''
        for key in [x.new_index_name for x in alias_mappings]
    }
    conn = get_connection()
    while indexes_pending and run_attempts < 1:
        # Only try once, as retries gave buggy results. See VAN-391
        run_attempts += 1
        self._populate(models, options)
        for doc, __, new_index_name, alias, record_count in alias_mappings:
            # Run a sanity check to ensure we aren't drastically changing the
            # index, which could be indicative of a bug.
            if new_index_name in indexes_pending and not options.get('disable_change_limit', False):
                record_count_is_sane, index_info_string = self.sanity_check_new_index(
                    run_attempts, doc, new_index_name, record_count)
                if record_count_is_sane:
                    ElasticsearchUtils.set_alias(conn, alias, new_index_name)
                    indexes_pending.pop(new_index_name, None)
                else:
                    indexes_pending[new_index_name] = index_info_string
            else:
                ElasticsearchUtils.set_alias(conn, alias, new_index_name)
                indexes_pending.pop(new_index_name, None)

    for index_alias_mapper in alias_mappings:
        index_alias_mapper.registered_index._name = index_alias_mapper.alias  # pylint: disable=protected-access

    if indexes_pending:
        raise CommandError(
            'Sanity check failed for the new index(es): {}'.format(indexes_pending))

    return True
def get_document_for_model(model):
    """Get the document for the given model.

    :param model: Model to get the document index for.
    :type model: Subclass of `django.db.models.Model`.
    :return: Document index for the given model.
    :rtype: Subclass of `django_elasticsearch_dsl.Document`.
    """
    documents = registry.get_documents()

    for document in documents:
        if model == document.Django.model:
            return document
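A usage sketch for `get_document_for_model`, with a hypothetical `Book` model: resolve the registered document class, then reindex a queryset through it.

from myapp.models import Book  # hypothetical import

document = get_document_for_model(Book)
if document is not None:
    # Reindex all books through the resolved document class.
    document().update(Book.objects.all())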
def null_field_in_related_task(app_label, object_name, instance_id):
    instance = _instance(app_label, object_name, instance_id)
    for rel in instance._meta.related_objects:
        field_name = rel.field.name
        model = rel.field.model
        rel_instances = model.objects.filter(**{rel.field.name: instance})
        doc = list(registry.get_documents(models=[model]))[0]
        for rel_inst in rel_instances:
            setattr(rel_inst, field_name, None)
            doc_instance = doc(related_instance_to_ignore=instance)
            doc_instance.update(rel_inst)
def _get_document(model, document_class):
    """
    Get the DED document class object from the model and the name of the
    document class.

    :param model: The model class to find the document for.
    :param document_class: The name of the document class.
    :return: DED DocType object
    """
    documents = registry.get_documents(models=[model])

    for document in documents:
        if str(document) == document_class:
            return document
def handle(self, *args, **options):
    """
    This command is based off of the 'populate' command of Django ES DSL:
    https://github.com/sabricot/django-elasticsearch-dsl/blob/f6b2e0694e4ed69826c824196ccec5863874c856/django_elasticsearch_dsl/management/commands/search_index.py#L86

    We have updated it so that it will do incremental updates rather than
    looping over the full queryset every time.
    """
    models = set(registry.get_models())

    for doc in registry.get_documents(models):
        start_time = timezone.now() - UPDATE_WINDOW
        qs = doc().get_queryset().filter(last_modified__gt=start_time).order_by("id")
        self.stdout.write("Indexing {} '{}' objects".format(qs.count(), qs.model.__name__))
        doc().update(qs)
def update_search_index(self):
    """
    Update the Document inside the Elasticsearch index after changing
    relevant parts of the product.
    """
    documents = elasticsearch_registry.get_documents([ProductModel])
    if settings.USE_I18N:
        for language, _ in settings.LANGUAGES:
            try:
                document = next(doc for doc in documents if doc._language == language)
            except StopIteration:
                document = next(doc for doc in documents if doc._language is None)
            document().update(self)
    else:
        document = next(doc for doc in documents)
        document().update(self)
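A usage sketch, assuming `product` is an existing `ProductModel` instance: call the method after changing data that is denormalized into the search document, since such changes may not trigger a signal-driven reindex on their own.

product = ProductModel.objects.first()  # hypothetical lookup
product.update_search_index()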
def index_indexed_file(sender, instance_list, **kwargs):
    """Handle indexing from the build process."""
    if not instance_list:
        return

    model = sender
    document = list(registry.get_documents(models=[model]))[0]
    index_kwargs = {
        'app_label': model._meta.app_label,
        'model_name': model.__name__,
        'document_class': str(document),
        'objects_id': [obj.id for obj in instance_list],
    }

    # Do not index if autosync is disabled globally
    if DEDConfig.autosync_enabled():
        index_objects_to_es(**index_kwargs)
def _populate(self, models, options):
    docs = {
        doc._doc_type.index: doc
        for doc in registry.get_documents()
        if doc._doc_type.model in models
    }
    for index in settings.ELASTICSEARCH_INDEX_NAMES.values():
        try:
            doc = docs[index]
            qs = doc().get_queryset()
            self.stdout.write("Indexing {} '{}' objects".format(
                qs.count(), doc._doc_type.model.__name__))
            start = datetime.now()
            doc().update(qs.iterator(), chunk_size=1000)
            finish = datetime.now()
            print('Time: {}'.format(finish - start))
        except KeyError:
            pass
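A sketch of the `ELASTICSEARCH_INDEX_NAMES` setting that `_populate` above iterates: it maps document module paths to index names, so indices are populated in the order the setting defines. The concrete keys and index names here are assumptions.

ELASTICSEARCH_INDEX_NAMES = {
    'search_indexes.documents.book': 'book',
    'search_indexes.documents.author': 'author',
}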
def _reindex_files_from(self, days_ago, queue=None):
    """Reindex HTML files from versions with recent builds."""
    chunk_size = settings.ES_TASK_CHUNK_SIZE
    since = datetime.now() - timedelta(days=days_ago)
    queryset = Version.objects.filter(builds__date__gte=since).distinct()

    app_label = HTMLFile._meta.app_label
    model_name = HTMLFile.__name__
    apply_async_kwargs = {
        'kwargs': {
            'app_label': app_label,
            'model_name': model_name,
        },
    }
    if queue:
        apply_async_kwargs['queue'] = queue

    for doc in registry.get_documents(models=[HTMLFile]):
        apply_async_kwargs['kwargs']['document_class'] = str(doc)
        for version in queryset.iterator():
            project = version.project
            files_qs = (
                HTMLFile.objects
                .filter(version=version)
                .values_list('pk', flat=True)
                .iterator()
            )
            current = 0
            while True:
                objects_id = list(itertools.islice(files_qs, chunk_size))
                if not objects_id:
                    break
                current += len(objects_id)
                log.info(
                    'Re-indexing files. version=%s:%s total=%s',
                    project.slug,
                    version.slug,
                    current,
                )
                apply_async_kwargs['kwargs']['objects_id'] = objects_id
                index_objects_to_es.apply_async(**apply_async_kwargs)

            log.info(
                "Tasks issued successfully. version=%s:%s items=%s",
                project.slug,
                version.slug,
                str(current),
            )
def _get_docs(self, models):
    # Reorder so the 'histories' and 'logentries' indices are handled last.
    _last_doc = None
    _logentries_doc = None
    _docs = []
    for doc in registry.get_documents(models):
        if doc.Index.name == 'histories':
            _last_doc = doc
        elif doc.Index.name == 'logentries':
            _logentries_doc = doc
        else:
            _docs.append(doc)
    if _last_doc:
        _docs.append(_last_doc)
    if _logentries_doc:
        _docs.append(_logentries_doc)
    return _docs
def _run_reindex_tasks(self, models, queue):
    apply_async_kwargs = {'queue': queue}
    log.info('Adding indexing tasks to queue.', queue=queue)

    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

    for doc in registry.get_documents(models):
        queryset = doc().get_queryset()
        app_label = queryset.model._meta.app_label
        model_name = queryset.model.__name__

        index_name = doc._index._name
        new_index_name = "{}_{}".format(index_name, timestamp)

        # Set up and create a temporary index for indexing.
        create_new_es_index(
            app_label=app_label,
            model_name=model_name,
            index_name=index_name,
            new_index_name=new_index_name,
        )
        doc._index._name = new_index_name
        log.info('Temporary index created.', index_name=new_index_name)

        indexing_tasks = self._get_indexing_tasks(
            app_label=app_label,
            model_name=model_name,
            queryset=queryset,
            index_name=new_index_name,
            document_class=str(doc),
        )
        for task in indexing_tasks:
            task.apply_async(**apply_async_kwargs)

        log.info(
            "Tasks issued successfully.",
            model_name=model_name,
            app_label=app_label,
            items=queryset.count(),
        )
    return timestamp
def _reindex_projects_from(self, days_ago, queue):
    """Reindex projects with recent changes."""
    since = datetime.now() - timedelta(days=days_ago)
    queryset = Project.objects.filter(modified_date__gte=since).distinct()
    app_label = Project._meta.app_label
    model_name = Project.__name__
    apply_async_kwargs = {'queue': queue}

    for doc in registry.get_documents(models=[Project]):
        indexing_tasks = self._get_indexing_tasks(
            app_label=app_label,
            model_name=model_name,
            queryset=queryset,
            index_name=doc._index._name,
            document_class=str(doc),
        )
        for task in indexing_tasks:
            task.apply_async(**apply_async_kwargs)
        log.info(
            "Tasks issued successfully. model=%s.%s items=%s",
            app_label,
            model_name,
            str(queryset.count()),
        )
def _change_index(self, models, timestamp):
    for doc in registry.get_documents(models):
        queryset = doc().get_queryset()
        app_label = queryset.model._meta.app_label
        model_name = queryset.model.__name__
        index_name = doc._index._name
        new_index_name = "{}_{}".format(index_name, timestamp)
        switch_es_index(
            app_label=app_label,
            model_name=model_name,
            index_name=index_name,
            new_index_name=new_index_name,
        )
        log.info(
            "Index name changed. model=%s.%s from=%s to=%s",
            app_label,
            model_name,
            new_index_name,
            index_name,
        )
def fix_resources_links_protocol(self):
    self.stdout.write('Reading resource data from https_protocol_report')
    latest_report = None
    try:
        latest_report = Report.objects.filter(
            file__contains='http_protocol_resources').latest('created')
    except Report.DoesNotExist:
        self.stdout.write(
            'No http_protocol_resources report,'
            ' you need to generate a report first with: manage.py create_https_protocol_report.'
        )
    if latest_report:
        file_path = latest_report.file
        self.stdout.write(f'Reading data from report: {file_path}')
        full_path = str(settings.ROOT_DIR) + file_path
        with open(full_path) as csvfile:
            report_data = csv.reader(csvfile, delimiter=',')
            next(report_data, None)  # Skip the header row.
            resources_ids = [
                row[0] for row in report_data if 'Wymagana poprawa' in row[2]
            ]
        self.stdout.write(
            f'Found {len(resources_ids)} resources to update link protocol.'
        )
        edited_resources = []
        resources = Resource.objects.filter(pk__in=resources_ids, link__contains='http://')
        edited_ids = []
        for res in resources:
            old_link = res.link
            res.link = old_link.replace('http://', 'https://')
            edited_resources.append(res)
            edited_ids.append(res.pk)
        self.stdout.write('Attempting to update resources in db and ES.')
        Resource.objects.bulk_update(edited_resources, ['link'])
        docs = registry.get_documents((Resource,))
        for doc in docs:
            self.stdout.write(f'Updating document {doc} in ES')
            doc().update(Resource.objects.filter(pk__in=edited_ids))
        # Report the collected ids: re-evaluating the original queryset after
        # the update would match zero rows, since the links no longer contain
        # 'http://'.
        self.stdout.write(f'Updated {len(edited_ids)} resources.')
def get_model_object_by_instance(self, instance):
    """Provide the Model object for an Elasticsearch response instance."""
    document = None
    _object = None
    index_or_alias_name = ElasticsearchUtils.get_alias_by_index_name(instance.meta.index)
    for doc in registry.get_documents():
        if index_or_alias_name == doc._index._name:  # pylint: disable=protected-access
            document = doc
            break
    hit = self._build_hit(instance)
    es_pk = hit['_source'].get('pk')
    if document and es_pk:
        try:
            _object = document(hit).get_queryset().get(pk=es_pk)
        except ObjectDoesNotExist:
            log.error(
                "Object could not be found in database for SearchResult '%r'.",
                self,
            )
    return _object