Example 1
    def bulk_update(self, works, retry_on_batch_failure=True):
        """Upload a batch of works to the search index at once."""

        import time

        from model import Work

        time1 = time.time()
        docs = Work.to_search_documents(works)

        # Tag each document with the index and document type it belongs to.
        for doc in docs:
            doc["_index"] = self.works_index
            doc["_type"] = self.work_document_type
        time2 = time.time()

        success_count, errors = self.bulk(
            docs,
            raise_on_error=False,
            raise_on_exception=False,
        )

        # If the entire update failed, try it one more time before giving up on the batch.
        if retry_on_batch_failure and len(errors) == len(docs):
            self.log.info("Elasticsearch bulk update timed out, trying again.")
            return self.bulk_update(works, retry_on_batch_failure=False)

        time3 = time.time()
        self.log.info("Created %i search documents in %.2f seconds" %
                      (len(docs), time2 - time1))
        self.log.info("Uploaded %i search documents in  %.2f seconds" %
                      (len(docs), time3 - time2))

        doc_ids = [d['_id'] for d in docs]

        # We weren't able to create search documents for these works, maybe
        # because they don't have presentation editions yet.
        missing_works = [work for work in works if work.id not in doc_ids]

        # Bulk errors may report the failed document's ID under 'data' or
        # under 'index'; accept either shape.
        error_ids = [
            error.get('data', {}).get('_id', None)
            or error.get('index', {}).get('_id', None)
            for error in errors
        ]

        successes = [
            work for work in works
            if work.id in doc_ids and work.id not in error_ids
        ]

        failures = []
        for missing in missing_works:
            if not missing.presentation_ready:
                failures.append(
                    (missing, "Work not indexed because not presentation-ready."))
            else:
                failures.append((missing, "Work not indexed"))

        for error in errors:
            error_id = error.get('data', {}).get('_id', None) or error.get(
                'index', {}).get('_id', None)

            work = None
            works_with_error = [work for work in works if work.id == error_id]
            if works_with_error:
                work = works_with_error[0]

            error_message = error.get('error', None)
            if not error_message:
                error_message = error.get('index', {}).get('error', None)

            failures.append((work, error_message))

        self.log.info(
            "Successfully indexed %i documents, failed to index %i." %
            (success_count, len(failures)))

        return successes, failures
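
A minimal usage sketch for the method above. `search_index` (an instance of the class defining bulk_update) and `works` (a list of Work objects) are hypothetical stand-ins, not names from the original code.

    import logging

    # Hypothetical client and batch; see the note above.
    successes, failures = search_index.bulk_update(works)

    for work, message in failures:
        # `work` can be None when a bulk error could not be matched to a Work.
        work_id = work.id if work is not None else "<unknown>"
        logging.warning("Work %s was not indexed: %s", work_id, message)

    logging.info("%d works indexed, %d failed.", len(successes), len(failures))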
Example 2
    def bulk_update(self, works, retry_on_batch_failure=True):
        """Upload a batch of works to the search index at once."""

        import time

        from model import Work

        time1 = time.time()
        needs_add = []
        successes = []
        for work in works:
            if work.presentation_ready:
                needs_add.append(work)
            else:
                # Works are removed one at a time, which shouldn't
                # pose a performance problem because works almost never
                # stop being presentation ready.
                self.remove_work(work)
                successes.append(work)

        # Add any works that need adding.
        docs = Work.to_search_documents(needs_add)

        # Tag each document with the index and document type it belongs to.
        for doc in docs:
            doc["_index"] = self.works_index
            doc["_type"] = self.work_document_type
        time2 = time.time()

        success_count, errors = self.bulk(
            docs,
            raise_on_error=False,
            raise_on_exception=False,
        )

        # If the entire update failed, try it one more time before
        # giving up on the batch.
        #
        # Removed works were already removed, so no need to try them again.
        if len(errors) == len(docs):
            if retry_on_batch_failure:
                self.log.info("Elasticsearch bulk update timed out, trying again.")
                return self.bulk_update(needs_add, retry_on_batch_failure=False)
            else:
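                # With docs cleared, every work in the batch falls through
                # to missing_works below and is reported as a failure.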
                docs = []

        time3 = time.time()
        self.log.info("Created %i search documents in %.2f seconds" % (len(docs), time2 - time1))
        self.log.info("Uploaded %i search documents in  %.2f seconds" % (len(docs), time3 - time2))
        
        doc_ids = [d['_id'] for d in docs]
        
        def get_error_id(error):
            return (error.get('data', {}).get('_id', None)
                    or error.get('index', {}).get('_id', None))
        error_ids = [get_error_id(error) for error in errors]

        # We weren't able to create search documents for these works, maybe
        # because they don't have presentation editions yet. Removed works
        # were already counted as successes above.
        missing_works = [
            work for work in works
            if work.id not in doc_ids and work.id not in error_ids
            and work not in successes
        ]
            
        successes.extend(
            [work for work in works 
             if work.id in doc_ids and work.id not in error_ids]
        )

        failures = []
        for missing in missing_works:
            failures.append((missing, "Work not indexed"))

        for error in errors:
            error_id = get_error_id(error)
            work = None
            works_with_error = [work for work in works if work.id == error_id]
            if works_with_error:
                work = works_with_error[0]

            error_message = error.get('error', None)
            if not error_message:
                error_message = error.get('index', {}).get('error', None)

            failures.append((work, error_message))

        self.log.info("Successfully indexed %i documents, failed to index %i." % (success_count, len(failures)))

        return successes, failures
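
Both examples tolerate two shapes of bulk error payload: the document ID may appear under 'data' or under 'index'. A self-contained sketch of that handling, using illustrative payloads rather than responses from a real cluster:

    def get_error_id(error):
        return (error.get('data', {}).get('_id', None)
                or error.get('index', {}).get('_id', None))

    # Illustrative shapes only; real bulk error items carry more fields.
    sample_errors = [
        {'data': {'_id': 'work-42'}},
        {'index': {'_id': 'work-43', 'error': 'mapper_parsing_exception'}},
    ]

    assert [get_error_id(e) for e in sample_errors] == ['work-42', 'work-43']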