def handle(self, *args, **options):
    # Find works that have no WorkIdentifier and queue each one for reindexing.
    qs = AbstractCreativeWork.objects.annotate(
        has_identifiers=Exists(
            WorkIdentifier.objects.filter(creative_work=OuterRef('pk'))
        )
    ).exclude(has_identifiers=True)

    indexer = SearchIndexer(celery_app)
    for id in qs.values_list('id', flat=True).iterator():
        indexer.index('creativework', id)
def reindex_works(works, index=None, urgent=True):
    # Accept either a single work or a list of works.
    if not isinstance(works, list):
        works = [works]

    work_ids = [work.id for work in works]
    if work_ids:
        print('Indexing {} works'.format(len(work_ids)))
        indexer = SearchIndexer()
        indexer.index('creativework', *work_ids, index=index, urgent=urgent)
    else:
        print('No works to index')
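# A hedged usage sketch of reindex_works; the queryset filter and slice below
# are illustrative assumptions, not taken from the codebase.
def example_reindex_recent_works():
    works = list(AbstractCreativeWork.objects.filter(is_deleted=False)[:100])
    reindex_works(works, urgent=False)  # a batch, routed off the urgent path
    reindex_works(works[0])             # a single work is wrapped into a list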
def test_related_works(self):
    def part_of(child_work, parent_work):
        factories.AbstractWorkRelationFactory(
            type='share.ispartof',
            subject=child_work,
            related=parent_work
        )

    def retracts(retraction, work):
        factories.AbstractWorkRelationFactory(
            type='share.retracts',
            subject=retraction,
            related=work
        )

    child = factories.AbstractCreativeWorkFactory()
    lost_sibling = factories.AbstractCreativeWorkFactory(is_deleted=True)
    parent = factories.AbstractCreativeWorkFactory()
    gparent = factories.AbstractCreativeWorkFactory()
    ggparent = factories.AbstractCreativeWorkFactory()
    gggparent = factories.AbstractCreativeWorkFactory()
    retraction = factories.AbstractCreativeWorkFactory()

    part_of(child, parent)
    part_of(lost_sibling, parent)
    part_of(parent, gparent)
    part_of(gparent, ggparent)
    part_of(ggparent, gggparent)
    retracts(retraction, child)

    cases = [
        ({child}, {child}),
        ({lost_sibling}, {lost_sibling}),
        ({parent}, {parent, child}),
        ({gparent}, {gparent, parent, child}),
        ({ggparent}, {ggparent, gparent, parent, child}),
        ({gggparent}, {gggparent, ggparent, gparent, parent}),
        ({retraction}, {retraction, child}),
        ({retraction, ggparent}, {retraction, ggparent, gparent, parent, child}),
    ]

    for input_works, expected in cases:
        input_ids = {w.id for w in input_works}
        expected_ids = {w.id for w in expected}
        actual_ids = SearchIndexer(None).pks_to_reindex(models.AbstractCreativeWork, input_ids)
        assert expected_ids == actual_ids
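# The cases above pin down the contract of pks_to_reindex: a work's pks are
# expanded to include its component works (via 'share.ispartof', followed a
# bounded number of levels down) and any works it retracts, while deleted
# works are kept only when passed in directly, never pulled in by the
# expansion. A hedged sketch of using it outside the test (the celery_app
# argument and the filter are assumptions):
def example_expand_and_index(celery_app):
    indexer = SearchIndexer(celery_app)
    pks = set(models.AbstractCreativeWork.objects.filter(is_deleted=False).values_list('id', flat=True))
    to_index = indexer.pks_to_reindex(models.AbstractCreativeWork, pks)
    indexer.index('creativework', *to_index)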
def run(self, chunk_size=500):
    from bots.elasticsearch import tasks  # TODO fix me

    if self.es_setup:
        self.setup()
    else:
        logger.debug('Skipping ES setup')

    logger.info('Loading up indexed models')
    for model_name in self.INDEX_MODELS:
        if self.es_models and model_name.lower() not in self.es_models:
            continue

        model = apps.get_model('share', model_name)

        if self.es_filter:
            # An explicit filter wins; otherwise fall back to anything
            # modified since the last successful run.
            logger.info('Looking for %ss that match %s', model, self.es_filter)
            qs = model.objects.filter(**self.es_filter).values_list('id', flat=True)
        else:
            most_recent_result = pendulum.parse(self.get_most_recently_modified())
            logger.info('Looking for %ss that have been modified after %s', model, most_recent_result)
            q = Q(date_modified__gt=most_recent_result)
            if hasattr(model, 'subjects') and hasattr(model, 'subject_relations'):
                q = q | Q(subjects__date_modified__gt=most_recent_result) | Q(subject_relations__date_modified__gt=most_recent_result)
            qs = model.objects.filter(q).values_list('id', flat=True)

        count = qs.count()
        if count < 1:
            logger.info('Found 0 qualifying %ss', model)
            continue

        logger.info('Found %s %s that must be updated in ES', count, model)

        for batch in chunk(qs.iterator(), chunk_size):
            if not batch:
                continue
            if not self.to_daemon:
                tasks.index_model.apply_async((model.__name__, batch,), {'es_url': self.es_url, 'es_index': self.es_index})
            else:
                try:
                    SearchIndexer(celery.current_app).index(
                        model.__name__,
                        *batch,
                        index=self.es_index if self.es_index != settings.ELASTICSEARCH_INDEX else None,
                    )
                except ValueError:
                    logger.warning('Not sending model type %r to the SearchIndexer', model)

    logger.info('Starting task to index sources')
    tasks.index_sources.apply_async((), {'es_url': self.es_url, 'es_index': self.es_index})
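# `chunk` above is assumed to batch an iterator into lists of at most
# `chunk_size` items. A minimal sketch of such a helper (an assumption, not
# necessarily the project's actual implementation):
import itertools

def chunk(iterable, size):
    # Yield successive lists of up to `size` items until the iterable is exhausted.
    iterator = iter(iterable)
    while True:
        batch = list(itertools.islice(iterator, size))
        if not batch:
            return
        yield batch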
def disambiguate(self, normalized_id):
    normalized = NormalizedData.objects.select_related('source__source').get(pk=normalized_id)

    if self.request.id:
        self.update_state(meta={'source': normalized.source.source.long_title})

    # Load all relevant ContentTypes in a single query
    ContentType.objects.get_for_models(*apps.get_models('share'), for_concrete_models=False)

    updated = None

    try:
        with transaction.atomic():
            cg = ChangeGraph(normalized.data['@graph'], namespace=normalized.source.username)
            cg.process()
            cs = ChangeSet.objects.from_graph(cg, normalized.id)
            if cs and (normalized.source.is_robot or normalized.source.is_trusted or Source.objects.filter(user=normalized.source).exists()):
                # TODO: verify change set is not overwriting user created object
                updated = cs.accept()
    except Exception as e:
        raise self.retry(
            exc=e,
            countdown=(random.random() + 1) * min(settings.CELERY_RETRY_BACKOFF_BASE ** self.request.retries, 60 * 15)
        )

    if not updated:
        return

    # Only index creativeworks on the fly, for the moment.
    updated_works = set(x.id for x in updated if isinstance(x, AbstractCreativeWork))
    existing_works = set(n.instance.id for n in cg.nodes if isinstance(n.instance, AbstractCreativeWork))
    ids = list(updated_works | existing_works)

    try:
        SearchIndexer(self.app).index('creativework', *ids)
    except Exception:
        logger.exception('Could not add results from %r to elasticqueue', normalized)
        raise
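# The retry countdown above is exponential backoff with jitter: the
# exponential term is capped at 15 minutes, then scaled by 1x-2x via
# (random() + 1). A worked sketch (base 2 is an assumption; the real value
# comes from settings.CELERY_RETRY_BACKOFF_BASE):
import random

def example_retry_countdown(retries, base=2):
    # retries=0 -> 1-2s, retries=4 -> 16-32s; the capped term is 900s,
    # so the wait never exceeds 1800s.
    return (random.random() + 1) * min(base ** retries, 60 * 15)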
def _update_index(self, work_ids, urgent):
    indexer = SearchIndexer(self.task.app) if self.task else SearchIndexer()
    indexer.index('creativework', *work_ids, urgent=urgent)
def test_noops(self, model, pks):
    # For models with no related-work expansion, pks_to_reindex is the identity.
    result = SearchIndexer(None).pks_to_reindex(model, pks)
    assert result == pks
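# test_noops reads as the body of a pytest-parametrized test; a hedged sketch
# of the decoration it implies (the model choices and pk sets here are
# assumptions, not taken from the test suite):
import pytest

@pytest.mark.parametrize('model, pks', [
    (models.Agent, {1, 2, 3}),
    (models.Tag, {4, 5}),
])
def test_noops_example(model, pks):
    assert SearchIndexer(None).pks_to_reindex(model, pks) == pks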