def process_changes_chunk(self, changes_chunk):
    with self._datadog_timing('bulk_extract'):
        # Fetching also sets change.document on each change, which the
        # transform step below relies on.
        bad_changes, docs = bulk_fetch_changes_docs(changes_chunk)

    with self._datadog_timing('bulk_transform'):
        changes_to_process = {
            change.id: change
            for change in changes_chunk
            if change.document and not self.doc_filter_fn(change.document)
        }
        retry_changes = list(bad_changes)

        error_collector = ErrorCollector()
        es_actions = build_bulk_payload(
            self.index_info, list(changes_to_process.values()),
            self.doc_transform_fn, error_collector
        )
        error_changes = error_collector.errors

    try:
        with self._datadog_timing('bulk_load'):
            _, errors = self.es_interface.bulk_ops(
                es_actions, raise_on_error=False, raise_on_exception=False)
    except Exception as e:
        # Supply the index alias so the '%s' placeholder is actually filled in.
        pillow_logging.exception("[%s] ES bulk load error", self.index_info.alias)
        # The whole bulk request failed; report every change with the exception.
        error_changes.extend(
            (change, e) for change in changes_to_process.values()
        )
    else:
        # Partial failures: map each per-document error back to its change.
        for change_id, error_msg in get_errors_with_ids(errors):
            error_changes.append(
                (changes_to_process[change_id], BulkDocException(error_msg)))
    return retry_changes, error_changes
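# For context, a minimal sketch of what get_errors_with_ids might look like
# (hypothetical helper, not necessarily the project's implementation). It
# assumes the standard elasticsearch-py bulk response, where each error item
# is a one-key dict keyed by the op type, e.g.
# {'index': {'_id': ..., 'error': {...}, 'status': ...}}.
def get_errors_with_ids(errors):
    for error in errors:
        op_type, details = next(iter(error.items()))
        yield details['_id'], details.get('error')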
def test_get_docs(self):
    missing_case_ids = [uuid.uuid4().hex, uuid.uuid4().hex]
    changes = self._changes_from_ids(self.case_ids + missing_case_ids)

    bad_changes, result_docs = bulk_fetch_changes_docs(changes, 'domain')
    self.assertEqual(
        set(self.case_ids),
        {doc['_id'] for doc in result_docs}
    )
    self.assertEqual(
        set(missing_case_ids),
        {change.id for change in bad_changes}
    )
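# The test above leans on a _changes_from_ids helper. One plausible shape,
# sketched here as an assumption (the Change signature and self.doc_store
# attribute are hypothetical, not confirmed by the source):
def _changes_from_ids(self, case_ids):
    return [
        Change(id=case_id, sequence_id=None, document_store=self.doc_store)
        for case_id in case_ids
    ]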
def _process_chunk_for_domain(self, domain, changes_chunk):
    adapters = list(self.table_adapters_by_domain[domain])
    changes_by_id = {change.id: change for change in changes_chunk}
    to_delete_by_adapter = defaultdict(list)
    rows_to_save_by_adapter = defaultdict(list)
    async_configs_by_doc_id = defaultdict(list)
    to_update = {change for change in changes_chunk if not change.deleted}

    with self._metrics_timer('extract'):
        retry_changes, docs = bulk_fetch_changes_docs(to_update, domain)
        # Normalize to a set: .update() is called on retry_changes below.
        retry_changes = set(retry_changes)

    change_exceptions = []
    with self._metrics_timer('single_batch_transform'):
        for doc in docs:
            change = changes_by_id[doc['_id']]
            doc_subtype = change.metadata.document_subtype
            eval_context = EvaluationContext(doc)
            with self._metrics_timer('single_doc_transform'):
                for adapter in adapters:
                    with self._per_config_metrics_timer('transform', adapter.config._id):
                        if adapter.config.filter(doc, eval_context):
                            if adapter.run_asynchronous:
                                async_configs_by_doc_id[doc['_id']].append(adapter.config._id)
                            else:
                                try:
                                    rows_to_save_by_adapter[adapter].extend(
                                        adapter.get_all_values(doc, eval_context))
                                except Exception as e:
                                    change_exceptions.append((change, e))
                                eval_context.reset_iteration()
                        elif (doc_subtype is None
                                or doc_subtype in adapter.config.get_case_type_or_xmlns_filter()):
                            # Delete if the subtype is unknown or if the subtype
                            # matches our filters, but the full filter no longer applies
                            to_delete_by_adapter[adapter].append(doc)

    with self._metrics_timer('single_batch_delete'):
        # bulk delete by adapter
        to_delete = [{'_id': c.id} for c in changes_chunk if c.deleted]
        for adapter in adapters:
            delete_docs = to_delete_by_adapter[adapter] + to_delete
            if not delete_docs:
                continue
            with self._per_config_metrics_timer('delete', adapter.config._id):
                try:
                    adapter.bulk_delete(delete_docs)
                except Exception:
                    delete_ids = {doc['_id'] for doc in delete_docs}
                    retry_changes.update(
                        c for c in changes_chunk if c.id in delete_ids)

    with self._metrics_timer('single_batch_load'):
        # bulk update by adapter
        for adapter, rows in rows_to_save_by_adapter.items():
            with self._per_config_metrics_timer('load', adapter.config._id):
                try:
                    adapter.save_rows(rows)
                except Exception:
                    retry_changes.update(to_update)

    if async_configs_by_doc_id:
        with self._metrics_timer('async_config_load'):
            doc_type_by_id = {
                _id: changes_by_id[_id].metadata.document_type
                for _id in async_configs_by_doc_id
            }
            AsyncIndicator.bulk_update_records(
                async_configs_by_doc_id, domain, doc_type_by_id)

    return retry_changes, change_exceptions
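# The _metrics_timer / _per_config_metrics_timer context managers above time
# each phase of the chunk. A minimal self-contained sketch of the pattern,
# offered as an assumption (the real timers report to a metrics backend
# rather than printing):
import time
from contextlib import contextmanager

@contextmanager
def metrics_timer(step, config_id=None):
    start = time.monotonic()
    try:
        yield
    finally:
        elapsed = time.monotonic() - start
        tag = ' config=%s' % config_id if config_id else ''
        print('timing %s%s: %.3fs' % (step, tag, elapsed))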