def test_update_record(self):
    domain = 'test-update-record'
    doc_type = 'form'
    initial_data = {
        'd1': ['c1', 'c2'],
        'd2': ['c1'],
        'd3': ['c2']
    }
    AsyncIndicator.objects.bulk_create([
        AsyncIndicator(
            doc_id=doc_id,
            doc_type=doc_type,
            domain=domain,
            indicator_config_ids=sorted(config_ids))
        for doc_id, config_ids in six.iteritems(initial_data)
    ])

    updated_data = {
        'd2': ['c2'],
        'd3': ['c3'],
        'd4': ['c2', 'c1'],
        'd5': ['c4']
    }

    # 3 queries: one select of the existing records, one bulk update, one bulk create
    with self.assertNumQueries(3):
        doc_type_by_ids = {i: doc_type for i in ['d1', 'd2', 'd3', 'd4', 'd5']}
        AsyncIndicator.bulk_update_records(updated_data, domain, doc_type_by_ids)

    self.assertEqual(
        self._get_indicator_data(),
        {
            'd1': ['c1', 'c2'],
            'd2': ['c1', 'c2'],
            'd3': ['c2', 'c3'],
            'd4': ['c1', 'c2'],
            'd5': ['c4']
        }
    )
def process_change(self, pillow_instance, change):
    self.bootstrap_if_needed()
    if change.deleted:
        # we don't currently support hard-deletions at all.
        # we may want to change this at some later date, but it seems ok for now.
        # see https://github.com/dimagi/commcare-hq/pull/6944 for rationale
        return

    domain = change.metadata.domain
    if not domain:
        # if there is no domain we won't save to any UCR table
        return

    async_tables = []
    doc = change.get_document()
    ensure_document_exists(change)
    ensure_matched_revisions(change)

    if doc is None:
        return

    eval_context = EvaluationContext(doc)
    for table in self.table_adapters_by_domain[domain]:
        if table.config.filter(doc):
            if table.run_asynchronous:
                async_tables.append(table.config._id)
            else:
                self._save_doc_to_table(table, doc, eval_context)
                eval_context.reset_iteration()
        elif table.config.deleted_filter(doc):
            table.delete(doc)

    if async_tables:
        AsyncIndicator.update_indicators(change, async_tables)
def handle(self, domain, case_type, data_source_ids, **options):
    configs = []
    for data_source_id in data_source_ids:
        config, _ = get_datasource_config(data_source_id, domain)
        assert config.asynchronous
        assert config.referenced_doc_type == CASE_DOC_TYPE
        configs.append(config)

    fake_change_doc = {'doc_type': CASE_DOC_TYPE, 'domain': domain}

    doc_store = get_document_store(domain, CASE_DOC_TYPE)
    case_accessor = doc_store.case_accessors
    case_ids = case_accessor.get_case_ids_in_domain(type=case_type)
    num_case_ids = len(case_ids)
    print("inserting %d docs" % num_case_ids)

    for config in configs:
        adapter = get_indicator_adapter(config, can_handle_laboratory=True)
        adapter.build_table()
        # normally called after rebuilding finishes
        adapter.after_table_build()

    config_ids = [config._id for config in configs]
    for i, case_id in enumerate(case_ids):
        change = FakeChange(case_id, fake_change_doc)
        AsyncIndicator.update_indicators(change, config_ids)
        if i % 1000 == 0:
            print("inserted %d / %d docs" % (i, num_case_ids))
def test_update_record(self):
    domain = 'test-update-record'
    doc_type = 'form'
    initial_data = {'d1': ['c1', 'c2'], 'd2': ['c1'], 'd3': ['c2']}
    AsyncIndicator.objects.bulk_create([
        AsyncIndicator(doc_id=doc_id, doc_type=doc_type, domain=domain,
                       indicator_config_ids=sorted(config_ids))
        for doc_id, config_ids in initial_data.items()
    ])

    updated_data = {
        'd2': ['c2'],
        'd3': ['c3'],
        'd4': ['c2', 'c1'],
        'd5': ['c4']
    }

    # 3 queries: one select of the existing records, one bulk update, one bulk create
    with self.assertNumQueries(3):
        doc_type_by_ids = {
            i: doc_type for i in ['d1', 'd2', 'd3', 'd4', 'd5']
        }
        AsyncIndicator.bulk_update_records(updated_data, domain, doc_type_by_ids)

    self.assertEqual(
        self._get_indicator_data(),
        {
            'd1': ['c1', 'c2'],
            'd2': ['c1', 'c2'],
            'd3': ['c2', 'c3'],
            'd4': ['c1', 'c2'],
            'd5': ['c4']
        })
def handle(self, domain, case_type, data_source_ids, **options):
    configs = []
    for data_source_id in data_source_ids:
        config, _ = get_datasource_config(data_source_id, domain)
        assert config.asynchronous
        assert config.referenced_doc_type == CASE_DOC_TYPE
        configs.append(config)

    fake_change_doc = {'doc_type': CASE_DOC_TYPE, 'domain': domain}

    for config in configs:
        adapter = get_indicator_adapter(config, can_handle_laboratory=True)
        adapter.build_table()
        # normally called after rebuilding finishes
        adapter.after_table_build()

    self.domain = domain
    self.case_type = case_type

    config_ids = [config._id for config in configs]
    for case_id in self._get_case_ids_to_process():
        change = FakeChange(case_id, fake_change_doc)
        AsyncIndicator.update_from_kafka_change(change, config_ids)

    for config in configs:
        if not config.is_static:
            config.meta.build.rebuilt_asynchronously = True
            config.save()
def _save_ids(self, ids):
    if self.bulk:
        AsyncIndicator.bulk_creation(ids, self.referenced_type, self.domain, self.config_ids)
    else:
        for id_ in ids:
            change = FakeChange(id_, self.fake_change_doc)
            AsyncIndicator.update_from_kafka_change(change, self.config_ids)
def _process_chunk_for_domain(self, domain, changes_chunk):
    adapters = list(self.table_adapters_by_domain[domain])
    changes_by_id = {change.id: change for change in changes_chunk}
    to_delete_by_adapter = defaultdict(list)
    rows_to_save_by_adapter = defaultdict(list)
    async_configs_by_doc_id = defaultdict(list)
    to_update = {change for change in changes_chunk if not change.deleted}
    retry_changes, docs = self.get_docs_for_changes(to_update, domain)
    change_exceptions = []

    for doc in docs:
        eval_context = EvaluationContext(doc)
        for adapter in adapters:
            if adapter.config.filter(doc):
                if adapter.run_asynchronous:
                    async_configs_by_doc_id[doc['_id']].append(adapter.config._id)
                else:
                    try:
                        rows_to_save_by_adapter[adapter].extend(adapter.get_all_values(doc, eval_context))
                    except Exception as e:
                        change_exceptions.append((changes_by_id[doc["_id"]], e))
                    eval_context.reset_iteration()
            elif adapter.config.deleted_filter(doc) or adapter.doc_exists(doc):
                to_delete_by_adapter[adapter].append(doc['_id'])

    # bulk delete by adapter
    to_delete = [c.id for c in changes_chunk if c.deleted]
    for adapter in adapters:
        delete_ids = to_delete_by_adapter[adapter] + to_delete
        try:
            adapter.bulk_delete(delete_ids)
        except Exception as ex:
            notify_exception(
                None,
                "Error in deleting changes chunk {ids}: {ex}".format(
                    ids=delete_ids, ex=ex))
            retry_changes.update([c for c in changes_chunk if c.id in delete_ids])

    # bulk update by adapter
    for adapter, rows in six.iteritems(rows_to_save_by_adapter):
        try:
            adapter.save_rows(rows)
        except Exception as ex:
            notify_exception(
                None,
                "Error in saving changes chunk {ids}: {ex}".format(
                    ids=[c.id for c in to_update], ex=repr(ex)))
            retry_changes.update(to_update)

    if async_configs_by_doc_id:
        doc_type_by_id = {
            _id: changes_by_id[_id].metadata.document_type
            for _id in async_configs_by_doc_id.keys()
        }
        AsyncIndicator.bulk_update_records(async_configs_by_doc_id, domain, doc_type_by_id)

    return retry_changes, change_exceptions
def _build_indicators(config, document_store, relevant_ids):
    adapter = get_indicator_adapter(config, raise_errors=True, load_source='build_indicators')

    for doc in document_store.iter_documents(relevant_ids):
        if config.asynchronous:
            AsyncIndicator.update_record(
                doc.get('_id'), config.referenced_doc_type, config.domain, [config._id]
            )
        else:
            # save is a noop if the filter doesn't match
            adapter.best_effort_save(doc)
def handle(self, *args, **options):
    fake_change_doc = {'doc_type': CASE_DOC_TYPE, 'domain': DOMAIN}

    for data_source_id in DATA_SOURCES:
        print("processing data source %s" % data_source_id)
        data_source, is_static = get_datasource_config(data_source_id, DOMAIN)
        assert is_static
        adapter = get_indicator_adapter(data_source)
        table = adapter.get_table()
        for case_id in self._get_case_ids_to_process(adapter, table, data_source_id):
            change = FakeChange(case_id, fake_change_doc)
            AsyncIndicator.update_from_kafka_change(change, [data_source_id])
def _setup_docs_and_indicators(self):
    self.docs = [
        {
            "_id": str(i),
            "domain": self.domain.name,
            "doc_type": "CommCareCase",
            "name": 'doc_name_' + str(i),
            "color": 'doc_color_' + str(i)
        }
        for i in range(10)
    ]
    self.doc_ids = [str(i) for i in range(10)]
    AsyncIndicator.bulk_creation(
        [doc["_id"] for doc in self.docs],
        "CommCareCase",
        self.domain,
        []
    )
def process_change(self, change):
    self.bootstrap_if_needed()

    domain = change.metadata.domain
    if not domain or domain not in self.table_adapters_by_domain:
        # if no domain we won't save to any UCR table
        return

    if change.deleted:
        adapters = list(self.table_adapters_by_domain[domain])
        for table in adapters:
            table.delete({'_id': change.metadata.document_id})

    async_tables = []
    doc = change.get_document()
    ensure_document_exists(change)
    ensure_matched_revisions(change, doc)

    if doc is None:
        return

    with TimingContext() as timer:
        eval_context = EvaluationContext(doc)
        # make copy to avoid modifying list during iteration
        adapters = list(self.table_adapters_by_domain[domain])
        doc_subtype = change.metadata.document_subtype
        for table in adapters:
            if table.config.filter(doc, eval_context):
                if table.run_asynchronous:
                    async_tables.append(table.config._id)
                else:
                    self._save_doc_to_table(domain, table, doc, eval_context)
                    eval_context.reset_iteration()
            elif (doc_subtype is None
                    or doc_subtype in table.config.get_case_type_or_xmlns_filter()):
                table.delete(doc)

        if async_tables:
            AsyncIndicator.update_from_kafka_change(change, async_tables)

    self.domain_timing_context.update(**{
        domain: timer.duration
    })
def process_change(self, change):
    self.bootstrap_if_needed()

    domain = change.metadata.domain
    if not domain or domain not in self.table_adapters_by_domain:
        # if no domain we won't save to any UCR table
        return

    if change.deleted:
        adapters = list(self.table_adapters_by_domain[domain])
        for table in adapters:
            table.delete({'_id': change.metadata.document_id})

    async_tables = []
    doc = change.get_document()
    ensure_document_exists(change)
    ensure_matched_revisions(change, doc)

    if doc is None:
        return

    with TimingContext() as timer:
        eval_context = EvaluationContext(doc)
        # make copy to avoid modifying list during iteration
        adapters = list(self.table_adapters_by_domain[domain])
        for table in adapters:
            if table.config.filter(doc):
                if table.run_asynchronous:
                    async_tables.append(table.config._id)
                else:
                    self._save_doc_to_table(domain, table, doc, eval_context)
                    eval_context.reset_iteration()
            elif table.config.deleted_filter(doc) or table.doc_exists(doc):
                table.delete(doc)

        if async_tables:
            AsyncIndicator.update_from_kafka_change(change, async_tables)

    self.domain_timing_context.update(**{
        domain: timer.duration
    })
def create_async_indicator(doc_id):
    assert isinstance(doc_id, six.text_type)
    return AsyncIndicator(doc_id=doc_id, doc_type="CommCareCase",
                          domain=DOMAIN, indicator_config_ids=[DATA_SOURCE_NAME])
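# Hedged usage sketch (not part of the original module): the helper above returns
# unsaved AsyncIndicator instances, so they can be persisted in a single query with
# Django's bulk_create, the same pattern used in test_update_record above.
# The doc IDs below are illustrative only; DOMAIN and DATA_SOURCE_NAME are assumed
# to be module-level constants, as the helper implies.
indicators = [create_async_indicator(doc_id) for doc_id in [u'case-1', u'case-2']]
AsyncIndicator.objects.bulk_create(indicators)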
def _process_chunk_for_domain(self, domain, changes_chunk):
    adapters = list(self.table_adapters_by_domain[domain])
    changes_by_id = {change.id: change for change in changes_chunk}
    to_delete_by_adapter = defaultdict(list)
    rows_to_save_by_adapter = defaultdict(list)
    async_configs_by_doc_id = defaultdict(list)
    to_update = {change for change in changes_chunk if not change.deleted}

    with self._datadog_timing('extract'):
        retry_changes, docs = self.get_docs_for_changes(to_update, domain)

    change_exceptions = []
    with self._datadog_timing('single_batch_transform'):
        for doc in docs:
            change = changes_by_id[doc['_id']]
            doc_subtype = change.metadata.document_subtype
            eval_context = EvaluationContext(doc)
            with self._datadog_timing('single_doc_transform'):
                for adapter in adapters:
                    with self._datadog_timing('transform', adapter.config._id):
                        if adapter.config.filter(doc, eval_context):
                            if adapter.run_asynchronous:
                                async_configs_by_doc_id[doc['_id']].append(adapter.config._id)
                            else:
                                try:
                                    rows_to_save_by_adapter[adapter].extend(
                                        adapter.get_all_values(doc, eval_context))
                                except Exception as e:
                                    change_exceptions.append((change, e))
                                eval_context.reset_iteration()
                        elif (doc_subtype is None
                                or doc_subtype in adapter.config.get_case_type_or_xmlns_filter()):
                            # Delete if the subtype is unknown or
                            # if the subtype matches our filters, but the full filter no longer applies
                            to_delete_by_adapter[adapter].append(doc)

    with self._datadog_timing('single_batch_delete'):
        # bulk delete by adapter
        to_delete = [{'_id': c.id} for c in changes_chunk if c.deleted]
        for adapter in adapters:
            delete_docs = to_delete_by_adapter[adapter] + to_delete
            if not delete_docs:
                continue
            with self._datadog_timing('delete', adapter.config._id):
                try:
                    adapter.bulk_delete(delete_docs)
                except Exception:
                    delete_ids = [doc['_id'] for doc in delete_docs]
                    retry_changes.update([c for c in changes_chunk if c.id in delete_ids])

    with self._datadog_timing('single_batch_load'):
        # bulk update by adapter
        for adapter, rows in rows_to_save_by_adapter.items():
            with self._datadog_timing('load', adapter.config._id):
                try:
                    adapter.save_rows(rows)
                except Exception:
                    retry_changes.update(to_update)

    if async_configs_by_doc_id:
        with self._datadog_timing('async_config_load'):
            doc_type_by_id = {
                _id: changes_by_id[_id].metadata.document_type
                for _id in async_configs_by_doc_id.keys()
            }
            AsyncIndicator.bulk_update_records(async_configs_by_doc_id, domain, doc_type_by_id)

    return retry_changes, change_exceptions