Example #1
0
    def sync(self, limit):
        """
        Sync one batch of queued annotations from Postgres to Elasticsearch.

        Called periodically by a Celery task (see h-periodic).

        A batch of jobs is read from the queue (`limit` is handed to
        `_get_jobs_from_queue()`), and each job is handled according to the
        first of these cases that applies:

        * The job has a truthy "force" kwarg: the annotation is re-indexed
          without being compared, and the job is removed from the queue.

        * No annotation was found in the DB for the job's annotation ID: the
          job is removed from the queue.

        * The annotation is missing from Elastic, or `_equal()` reports that
          the Elastic and DB copies differ: the annotation is re-indexed and
          the job is left on the queue so that it gets re-checked (and
          removed) the next time this method runs.

        * Otherwise the annotation is up to date and the job is removed from
          the queue.

        :return: a dict mapping each result key that was recorded to a count:
            the number of distinct annotation IDs for the "synced" keys and
            the number of distinct job IDs for the "completed" keys. An empty
            dict is returned when there were no jobs on the queue.
        """
        jobs = self._get_jobs_from_queue(limit)
        if not jobs:
            return {}

        # Forced jobs are never compared, so only the IDs of the non-forced
        # jobs need to be fetched from the DB and from Elasticsearch.
        ids_to_compare = set()
        for job in jobs:
            if job.kwargs.get("force", False):
                continue
            ids_to_compare.add(
                URLSafeUUID.hex_to_url_safe(job.kwargs["annotation_id"]))

        db_annotations, es_annotations = {}, {}
        if ids_to_compare:
            db_annotations = self._get_annotations_from_db(ids_to_compare)
            es_annotations = self._get_annotations_from_es(ids_to_compare)

        result = Queue.Result

        # Sets rather than plain counters: the same annotation or job is only
        # ever counted once under a given key.
        tallies = defaultdict(set)

        def tally_synced(template, tag, annotation_id):
            # Record a (re-)indexed annotation under its specific key, the
            # per-tag total and the overall total.
            tallies[template.format(tag=tag)].add(annotation_id)
            tallies[result.SYNCED_TAG_TOTAL.format(tag=tag)].add(annotation_id)
            tallies[result.SYNCED_TOTAL].add(annotation_id)

        def tally_completed(template, tag, job_id):
            # Record a finished job under its specific key, the per-tag total
            # and the overall total.
            tallies[template.format(tag=tag)].add(job_id)
            tallies[result.COMPLETED_TAG_TOTAL.format(tag=tag)].add(job_id)
            tallies[result.COMPLETED_TOTAL].add(job_id)

        # Jobs that are done with and can be deleted from the queue.
        finished_jobs = []

        # IDs of the annotations that need to be written to Elasticsearch.
        ids_to_index = set()

        for job in jobs:
            annotation_id = URLSafeUUID.hex_to_url_safe(
                job.kwargs["annotation_id"])
            in_db = db_annotations.get(annotation_id)
            in_es = es_annotations.get(annotation_id)

            # Classify the job: which "synced" key (if any) and which
            # "completed" key (if any) it should be recorded under.
            if job.kwargs.get("force", False):
                synced_as = result.SYNCED_FORCED
                completed_as = result.COMPLETED_FORCED
            elif not in_db:
                synced_as = None
                completed_as = result.COMPLETED_DELETED
            elif not in_es:
                synced_as = result.SYNCED_MISSING
                completed_as = None
            elif not self._equal(in_es, in_db):
                synced_as = result.SYNCED_DIFFERENT
                completed_as = None
            else:
                synced_as = None
                completed_as = result.COMPLETED_UP_TO_DATE

            if synced_as is not None:
                ids_to_index.add(annotation_id)
                tally_synced(synced_as, job.tag, annotation_id)

            if completed_as is not None:
                finished_jobs.append(job)
                tally_completed(completed_as, job.tag, job.id)

        for job in finished_jobs:
            self._db.delete(job)

        if ids_to_index:
            self._batch_indexer.index(list(ids_to_index))

        summary = {}
        for key, members in tallies.items():
            summary[key] = len(members)
        return summary
Example #2
0
 def database_id(self, annotation):
     """Return `annotation.id` converted to the format used within the database.

     The URL-safe ID is turned into its hex form, parsed as a UUID and
     rendered with `str()`.
     """
     hex_id = URLSafeUUID.url_safe_to_hex(annotation.id)
     as_uuid = uuid.UUID(hex_id)
     return str(as_uuid)
Example #3
0
 def url_safe_id(self, job):
     """Return the given job's annotation ID in its URL-safe form.

     The hex ID is read from the job's "annotation_id" kwarg.
     """
     hex_id = job.kwargs["annotation_id"]
     return URLSafeUUID.hex_to_url_safe(hex_id)
Example #4
0
    def sync(self, limit):
        """
        Sync one batch of queued annotations from Postgres to Elasticsearch.

        Called periodically by a Celery task (see h-periodic).

        A batch of jobs is read from the queue (`limit` is handed to
        `_get_jobs_from_queue()`), and each job is handled according to the
        first of these cases that applies:

        * The job has a truthy "force" kwarg: the annotation is re-indexed
          without being compared, and the job is removed from the queue.

        * No annotation was found in the DB for the job's annotation ID: the
          job is removed from the queue.

        * The annotation is missing from Elastic, or its "updated" value in
          Elastic differs from `updated` in the DB: the annotation is
          re-indexed and the job is left on the queue so that it gets
          re-checked (and removed) the next time this method runs.

        * Otherwise the annotation is up to date and the job is removed from
          the queue.

        Nothing is returned. When there was at least one job, the number of
        jobs that fell into each case is logged at INFO level.
        """
        jobs = self._get_jobs_from_queue(limit)
        if not jobs:
            return

        # Forced jobs are never compared, so only the IDs of the non-forced
        # jobs need to be fetched from the DB and from Elasticsearch.
        ids_to_compare = set()
        for job in jobs:
            if job.kwargs.get("force", False):
                continue
            ids_to_compare.add(
                URLSafeUUID.hex_to_url_safe(job.kwargs["annotation_id"]))

        db_annotations, es_annotations = {}, {}
        if ids_to_compare:
            db_annotations = self._get_annotations_from_db(ids_to_compare)
            es_annotations = self._get_annotations_from_es(ids_to_compare)

        # Jobs that are done with and can be deleted from the queue.
        finished_jobs = []

        # IDs of the annotations that need to be written to Elasticsearch.
        ids_to_index = set()

        tallies = Counter()

        for job in jobs:
            annotation_id = URLSafeUUID.hex_to_url_safe(
                job.kwargs["annotation_id"])
            in_db = db_annotations.get(annotation_id)
            in_es = es_annotations.get(annotation_id)

            # Classify the job first, then apply the effects in one place.
            if job.kwargs.get("force", False):
                outcome = Queue.Result.FORCED
                reindex, finished = True, True
            elif not in_db:
                outcome = Queue.Result.DELETED_FROM_DB
                reindex, finished = False, True
            elif not in_es:
                outcome = Queue.Result.MISSING
                reindex, finished = True, False
            elif in_es["updated"] != in_db.updated:
                outcome = Queue.Result.OUT_OF_DATE
                reindex, finished = True, False
            else:
                outcome = Queue.Result.UP_TO_DATE
                reindex, finished = False, True

            if reindex:
                ids_to_index.add(annotation_id)
            if finished:
                finished_jobs.append(job)
            tallies[outcome] += 1

        for job in finished_jobs:
            self._db.delete(job)

        if ids_to_index:
            self._batch_indexer.index(list(ids_to_index))

        LOG.info(dict(tallies))