def bulk_update(self, works, retry_on_batch_failure=True):
    """Upload a batch of works to the search index at once.

    :param works: A list of Works to index.
    :param retry_on_batch_failure: If the entire batch fails, retry it
        once before giving up.
    :return: A 2-tuple (successes, failures). `successes` is a list of
        Works that were indexed; `failures` is a list of
        (work, error message) 2-tuples.
    """
    from model import Work
    time1 = time.time()
    docs = Work.to_search_documents(works)
    for doc in docs:
        doc["_index"] = self.works_index
        doc["_type"] = self.work_document_type
    time2 = time.time()

    success_count, errors = self.bulk(
        docs,
        raise_on_error=False,
        raise_on_exception=False,
    )

    # If the entire update failed, try it one more time before giving
    # up on the batch. The `docs` guard prevents a pointless retry when
    # the batch was empty (0 == 0 would otherwise trigger it).
    if retry_on_batch_failure and docs and len(errors) == len(docs):
        self.log.info("Elasticsearch bulk update timed out, trying again.")
        return self.bulk_update(works, retry_on_batch_failure=False)

    time3 = time.time()
    self.log.info("Created %i search documents in %.2f seconds" % (
        len(docs), time2 - time1))
    self.log.info("Uploaded %i search documents in %.2f seconds" % (
        len(docs), time3 - time2))

    def get_error_id(error):
        # Depending on how the bulk helper reported the failure, the
        # document ID may be under 'data' or under 'index'.
        return (error.get('data', {}).get('_id', None)
                or error.get('index', {}).get('_id', None))

    # Sets give O(1) membership tests in the comprehensions below.
    doc_ids = set(d['_id'] for d in docs)
    error_ids = set(get_error_id(error) for error in errors)

    # We weren't able to create search documents for these works, maybe
    # because they don't have presentation editions yet.
    missing_works = [work for work in works if work.id not in doc_ids]
    successes = [
        work for work in works
        if work.id in doc_ids and work.id not in error_ids
    ]

    failures = []
    for missing in missing_works:
        # BUG FIX: the original appended `work` -- a variable leaked
        # from an earlier comprehension -- instead of `missing`, so
        # every failure was attributed to the wrong Work.
        if not missing.presentation_ready:
            failures.append(
                (missing, "Work not indexed because not presentation-ready."))
        else:
            failures.append((missing, "Work not indexed"))

    for error in errors:
        error_id = get_error_id(error)
        work = None
        works_with_error = [w for w in works if w.id == error_id]
        if works_with_error:
            work = works_with_error[0]
        error_message = error.get('error', None)
        if not error_message:
            error_message = error.get('index', {}).get('error', None)
        failures.append((work, error_message))

    self.log.info(
        "Successfully indexed %i documents, failed to index %i." % (
            success_count, len(failures)))
    return successes, failures
def bulk_update(self, works, retry_on_batch_failure=True):
    """Upload a batch of works to the search index at once.

    Works that are no longer presentation-ready are removed from the
    index (one at a time) rather than added.

    :param works: A list of Works to index or remove.
    :param retry_on_batch_failure: If the entire add-batch fails,
        retry it once before giving up.
    :return: A 2-tuple (successes, failures). `successes` is a list of
        Works that were indexed or removed; `failures` is a list of
        (work, error message) 2-tuples.
    """
    time1 = time.time()
    needs_add = []
    successes = []
    for work in works:
        if work.presentation_ready:
            needs_add.append(work)
        else:
            # Works are removed one at a time, which shouldn't
            # pose a performance problem because works almost never
            # stop being presentation ready.
            self.remove_work(work)
            successes.append(work)

    # Add any works that need adding.
    docs = Work.to_search_documents(needs_add)
    for doc in docs:
        doc["_index"] = self.works_index
        doc["_type"] = self.work_document_type
    time2 = time.time()

    success_count, errors = self.bulk(
        docs,
        raise_on_error=False,
        raise_on_exception=False,
    )

    # If the entire update failed, try it one more time before
    # giving up on the batch.
    #
    # Removed works were already removed, so no need to try them again.
    # The `docs` guard prevents a pointless retry when the batch
    # contained only removals (0 == 0 would otherwise trigger it).
    if docs and len(errors) == len(docs):
        if retry_on_batch_failure:
            self.log.info(
                "Elasticsearch bulk update timed out, trying again.")
            # BUG FIX: the original returned the retry's result
            # directly, dropping any removed works already recorded
            # in `successes`.
            retry_successes, retry_failures = self.bulk_update(
                needs_add, retry_on_batch_failure=False)
            return successes + retry_successes, retry_failures
        else:
            docs = []

    time3 = time.time()
    self.log.info("Created %i search documents in %.2f seconds" % (
        len(docs), time2 - time1))
    self.log.info("Uploaded %i search documents in %.2f seconds" % (
        len(docs), time3 - time2))

    def get_error_id(error):
        # Depending on how the bulk helper reported the failure, the
        # document ID may be under 'data' or under 'index'.
        return (error.get('data', {}).get('_id', None)
                or error.get('index', {}).get('_id', None))

    # Sets give O(1) membership tests in the comprehensions below.
    doc_ids = set(d['_id'] for d in docs)
    error_ids = set(get_error_id(error) for error in errors)

    # We weren't able to create search documents for these works, maybe
    # because they don't have presentation editions yet.
    missing_works = [
        work for work in works
        if work.id not in doc_ids and work.id not in error_ids
        and work not in successes
    ]
    successes.extend(
        work for work in works
        if work.id in doc_ids and work.id not in error_ids
    )

    failures = []
    for missing in missing_works:
        # BUG FIX: the original appended `work` -- leftover from the
        # loop at the top of this method -- instead of `missing`, so
        # every missing work was reported as the last work in `works`.
        failures.append((missing, "Work not indexed"))

    for error in errors:
        error_id = get_error_id(error)
        work = None
        works_with_error = [w for w in works if w.id == error_id]
        if works_with_error:
            work = works_with_error[0]
        error_message = error.get('error', None)
        if not error_message:
            error_message = error.get('index', {}).get('error', None)
        failures.append((work, error_message))

    self.log.info(
        "Successfully indexed %i documents, failed to index %i." % (
            success_count, len(failures)))
    return successes, failures