Example #1
0
    def handle(self, *args, **options):
        """Queue every creative work that has no WorkIdentifier for reindexing.

        Works lacking identifiers are excluded from normal indexing paths;
        re-sending them to the SearchIndexer keeps the search index in sync.
        """
        # Annotate each work with whether any WorkIdentifier points at it,
        # then keep only the works that have none.
        qs = AbstractCreativeWork.objects.annotate(has_identifiers=Exists(
            WorkIdentifier.objects.filter(
                creative_work=OuterRef('pk')))).exclude(has_identifiers=True)

        indexer = SearchIndexer(celery_app)

        # `iterator()` streams rows instead of caching the whole queryset.
        # NOTE: renamed loop variable from `id` to avoid shadowing the builtin.
        for work_id in qs.values_list('id', flat=True).iterator():
            indexer.index('creativework', work_id)
Example #2
0
def reindex_works(works, index=None, urgent=True):
    """Send one work or a list of works to the search indexer.

    A bare work is wrapped in a list so callers may pass either form.
    Prints a short progress message either way.
    """
    if not isinstance(works, list):
        works = [works]
    work_ids = [w.id for w in works]

    # Guard clause: nothing to do for an empty input.
    if not work_ids:
        print('No works to index')
        return

    print('Indexing {} works'.format(len(work_ids)))
    SearchIndexer().index('creativework', *work_ids, index=index, urgent=urgent)
    def handle(self, *args, **options):
        """Queue every creative work that has no WorkIdentifier for reindexing.

        Uses an EXISTS subquery to find works with zero identifiers and
        streams their pks to the SearchIndexer one at a time.
        """
        qs = AbstractCreativeWork.objects.annotate(
            has_identifiers=Exists(
                WorkIdentifier.objects.filter(creative_work=OuterRef('pk'))
            )
        ).exclude(has_identifiers=True)

        indexer = SearchIndexer(celery_app)

        # `iterator()` avoids materializing the full queryset in memory.
        # NOTE: renamed loop variable from `id` to avoid shadowing the builtin.
        for pk in qs.values_list('id', flat=True).iterator():
            indexer.index('creativework', pk)
Example #4
0
    def test_related_works(self):
        """Exercise SearchIndexer.pks_to_reindex over a small relation graph.

        Builds a chain of 'ispartof' relations plus one retraction, then
        checks that each input set of work pks expands to the expected set
        (per the `cases` table below).
        """

        def part_of(child_work, parent_work):
            # Declare child_work to be a part of parent_work.
            factories.AbstractWorkRelationFactory(
                type='share.ispartof',
                subject=child_work,
                related=parent_work
            )

        def retracts(retraction, work):
            # Declare that `retraction` retracts `work`.
            factories.AbstractWorkRelationFactory(
                type='share.retracts',
                subject=retraction,
                related=work
            )

        child = factories.AbstractCreativeWorkFactory()
        lost_sibling = factories.AbstractCreativeWorkFactory(is_deleted=True)
        parent = factories.AbstractCreativeWorkFactory()
        gparent = factories.AbstractCreativeWorkFactory()
        ggparent = factories.AbstractCreativeWorkFactory()
        gggparent = factories.AbstractCreativeWorkFactory()

        retraction = factories.AbstractCreativeWorkFactory()

        part_of(child, parent)
        part_of(lost_sibling, parent)
        part_of(parent, gparent)
        part_of(gparent, ggparent)
        part_of(ggparent, gggparent)
        retracts(retraction, child)

        # (input works, expected reindex set) pairs.
        cases = [
            ({child}, {child}),
            ({lost_sibling}, {lost_sibling}),
            ({parent}, {parent, child}),
            ({gparent}, {gparent, parent, child}),
            ({ggparent}, {ggparent, gparent, parent, child}),
            ({gggparent}, {gggparent, ggparent, gparent, parent}),
            ({retraction}, {retraction, child}),
            ({retraction, ggparent}, {retraction, ggparent, gparent, parent, child}),
        ]

        # NOTE: loop variable renamed from `input` to avoid shadowing the builtin.
        for input_works, expected in cases:
            input_ids = {w.id for w in input_works}
            expected_ids = {w.id for w in expected}
            actual_ids = SearchIndexer(None).pks_to_reindex(models.AbstractCreativeWork, input_ids)
            assert expected_ids == actual_ids
Example #5
0
    def run(self, chunk_size=500):
        """Queue modified (or filter-matched) records for elasticsearch indexing.

        For each configured model: find the pks needing reindexing, then send
        them either directly to the indexing task or to the indexer daemon in
        batches of `chunk_size`. Finally kicks off source indexing.

        Args:
            chunk_size: Number of pks per indexing batch.
        """
        from bots.elasticsearch import tasks  # TODO fix me

        if self.es_setup:
            self.setup()
        else:
            logger.debug('Skipping ES setup')

        logger.info('Loading up indexed models')
        for model_name in self.INDEX_MODELS:
            # Respect an optional whitelist of model names.
            if self.es_models and model_name.lower() not in self.es_models:
                continue

            model = apps.get_model('share', model_name)

            if self.es_filter:
                # Explicit filter overrides the modified-since heuristic.
                logger.info('Looking for %ss that match %s', model, self.es_filter)
                qs = model.objects.filter(**self.es_filter).values_list('id', flat=True)
            else:
                most_recent_result = pendulum.parse(self.get_most_recently_modified())
                logger.info('Looking for %ss that have been modified after %s', model, most_recent_result)
                q = Q(date_modified__gt=most_recent_result)
                # Subject changes should also trigger a reindex of the record.
                if hasattr(model, 'subjects') and hasattr(model, 'subject_relations'):
                    q = q | Q(subjects__date_modified__gt=most_recent_result) | Q(subject_relations__date_modified__gt=most_recent_result)
                qs = model.objects.filter(q).values_list('id', flat=True)

            count = qs.count()

            # Guard clause (was `if/else` with a redundant else after continue).
            if count < 1:
                logger.info('Found 0 qualifying %ss', model)
                continue
            logger.info('Found %s %s that must be updated in ES', count, model)

            for batch in chunk(qs.iterator(), chunk_size):
                if not batch:
                    continue
                if self.to_daemon:
                    try:
                        SearchIndexer(celery.current_app).index(model.__name__, *batch, index=self.es_index if self.es_index != settings.ELASTICSEARCH_INDEX else None)
                    except ValueError:
                        logger.warning('Not sending model type %r to the SearchIndexer', model)
                else:
                    tasks.index_model.apply_async((model.__name__, batch,), {'es_url': self.es_url, 'es_index': self.es_index})

        logger.info('Starting task to index sources')
        tasks.index_sources.apply_async((), {'es_url': self.es_url, 'es_index': self.es_index})
Example #6
0
def disambiguate(self, normalized_id):
    """Process a NormalizedData's change graph and index the affected works.

    Builds a ChangeGraph from the normalized data, accepts the resulting
    ChangeSet inside a transaction (for robot/trusted/source users), retries
    with jittered exponential backoff on failure, then queues all created or
    touched AbstractCreativeWorks for search indexing.
    """
    normalized = NormalizedData.objects.select_related('source__source').get(
        pk=normalized_id)

    if self.request.id:
        self.update_state(meta={'source': normalized.source.source.long_title})

    # Load all relevant ContentTypes in a single query
    ContentType.objects.get_for_models(*apps.get_models('share'),
                                       for_concrete_models=False)

    updated = None

    try:
        with transaction.atomic():
            cg = ChangeGraph(normalized.data['@graph'],
                             namespace=normalized.source.username)
            cg.process()
            cs = ChangeSet.objects.from_graph(cg, normalized.id)
            if cs and (normalized.source.is_robot
                       or normalized.source.is_trusted or
                       Source.objects.filter(user=normalized.source).exists()):
                # TODO: verify change set is not overwriting user created object
                updated = cs.accept()
    except Exception as e:
        # Jittered exponential backoff, capped at 15 minutes.
        raise self.retry(
            exc=e,
            countdown=(random.random() + 1) *
            min(settings.CELERY_RETRY_BACKOFF_BASE**self.request.retries,
                60 * 15))

    if not updated:
        return
    # Only index creativeworks on the fly, for the moment.
    # (set comprehensions replace the previous generator-in-set() calls)
    updated_works = {x.id for x in updated
                     if isinstance(x, AbstractCreativeWork)}
    existing_works = {n.instance.id for n in cg.nodes
                      if isinstance(n.instance, AbstractCreativeWork)}
    ids = list(updated_works | existing_works)

    try:
        SearchIndexer(self.app).index('creativework', *ids)
    except Exception as e:
        logger.exception('Could not add results from %r to elasticqueue',
                         normalized)
        raise
Example #7
0
 def _update_index(self, work_ids, urgent):
     """Send the given creativework pks to the search indexer.

     Binds the indexer to the running task's celery app when one exists.
     """
     if self.task:
         indexer = SearchIndexer(self.task.app)
     else:
         indexer = SearchIndexer()
     indexer.index('creativework', *work_ids, urgent=urgent)
Example #8
0
 def _update_index(self, work_ids, urgent):
     """Queue the given creativework pks for (re)indexing.

     Uses the task's celery app for the indexer if a task is attached.
     """
     indexer = SearchIndexer() if not self.task else SearchIndexer(self.task.app)
     indexer.index('creativework', *work_ids, urgent=urgent)
Example #9
0
 def test_noops(self, model, pks):
     """pks_to_reindex should return the given pks unchanged for these models."""
     reindexed = SearchIndexer(None).pks_to_reindex(model, pks)
     assert pks == reindexed