def test_hash_in_python(self):
    """Check that Python hashing assigns documents to the same shards as SQL hashing.

    The same ``N`` document ids are hashed with both
    ``ShardAccessor.hash_doc_ids_sql`` and ``ShardAccessor.hash_doc_ids_python``.
    Each hash is masked with ``N_shards - 1`` and the two resulting
    ``doc_id -> masked hash`` mappings must be equal.
    """
    N = 2048
    doc_ids = [str(i) for i in range(N)]

    sql_hashes = ShardAccessor.hash_doc_ids_sql(doc_ids)
    csiphash_hashes = ShardAccessor.hash_doc_ids_python(doc_ids)

    self.assertEqual(len(csiphash_hashes), N)
    # NOTE(review): ``long`` is not a builtin on Python 3; presumably the
    # module supplies a compatibility alias -- confirm.
    self.assertTrue(
        all(isinstance(hash_, (int, long)) for hash_ in csiphash_hashes.values())
    )

    N_shards = 1024
    part_mask = N_shards - 1

    sql_shards = {
        doc_id: hash_ & part_mask for doc_id, hash_ in sql_hashes.items()
    }
    # Must be derived from the Python hashes. Building this mapping from
    # ``sql_hashes`` would compare the SQL result with itself and the
    # assertion below could never fail.
    python_shards = {
        doc_id: hash_ & part_mask for doc_id, hash_ in csiphash_hashes.items()
    }

    self.assertEqual(python_shards, sql_shards)
def handle(self, domain, **options):
    """Print the shard id and database of every form and case in ``domain``.

    One tab-separated row is printed per document (forms first, then cases).
    The per-shard and per-database tallies are then handed to ``_print``.

    :param domain: domain whose form ids and case ids are listed
    :param options: remaining keyword options; not used here
    """
    form_shard_tally = Counter()
    form_db_tally = Counter()
    case_shard_tally = Counter()
    case_db_tally = Counter()

    column_header = 'id\t\t\t\t\tshard\tdatabase'
    row_template = '{}\t{}\t{}'

    print('======================== forms ========================')
    print(column_header)
    form_ids = FormAccessors(domain=domain).get_all_form_ids_in_domain()
    for form_id in sorted(form_ids):
        location = ShardAccessor.get_shard_id_and_database_for_doc(form_id)
        shard_id, dbname = location
        form_shard_tally[shard_id] += 1
        form_db_tally[dbname] += 1
        print(row_template.format(form_id, shard_id, dbname))

    print('\n======================== cases ========================')
    print(column_header)
    case_ids = CaseAccessors(domain=domain).get_case_ids_in_domain()
    for case_id in sorted(case_ids):
        location = ShardAccessor.get_shard_id_and_database_for_doc(case_id)
        shard_id, dbname = location
        case_shard_tally[shard_id] += 1
        case_db_tally[dbname] += 1
        print(row_template.format(case_id, shard_id, dbname))

    _print(form_shard_tally, 'forms by shard')
    _print(form_db_tally, 'forms by db')
    _print(case_shard_tally, 'cases by shard')
    _print(case_db_tally, 'cases by db')
def test_get_docs_by_database(self):
    """Check that ``get_docs_by_database`` agrees with ``get_database_for_docs``.

    Correctness of the hashing itself is left to
    ``test_python_hashing_gives_correct_db``; this test only verifies that
    the two lookups are consistent with each other.
    """
    form_ids = [str(uuid4()) for _ in range(100)]

    db_by_doc_id = ShardAccessor.get_database_for_docs(form_ids)
    doc_ids_by_db = ShardAccessor.get_docs_by_database(form_ids)

    for db_alias, grouped_ids in doc_ids_by_db.items():
        for grouped_id in grouped_ids:
            # Every id listed under a database must map back to that database.
            self.assertEqual(db_alias, db_by_doc_id[grouped_id])
def get_db_alias_for_partitioned_doc(partition_value):
    """Return the database alias to use for ``partition_value``.

    :param partition_value: value passed to
        ``ShardAccessor.get_database_for_doc`` to select the database
    :return: ``'default'`` when ``settings.USE_PARTITIONED_DATABASE`` is
        falsy, otherwise whatever ``ShardAccessor.get_database_for_doc``
        returns
    """
    if not settings.USE_PARTITIONED_DATABASE:
        return 'default'

    # Imported at call time, as before; presumably to avoid an import
    # cycle -- confirm before moving it to module level.
    from corehq.form_processor.backends.sql.dbaccessors import ShardAccessor
    return ShardAccessor.get_database_for_doc(partition_value)
def test_hash_doc_ids(self):
    """Check that SQL hashing yields one ``int`` hash per document id."""
    N = 1001
    doc_ids = [str(i) for i in range(N)]

    hashes = ShardAccessor.hash_doc_ids_sql(doc_ids)

    # ``assertEqual`` replaces the deprecated alias ``assertEquals``, which
    # no longer exists on ``unittest.TestCase`` from Python 3.12 onwards.
    self.assertEqual(len(hashes), N)
    self.assertTrue(all(isinstance(hash_, int) for hash_ in hashes.values()))
def test_python_hashing_gives_correct_db(self):
    """Rudimentary check that Python sharding matches SQL sharding.

    Creates forms, asks ``ShardAccessor`` which database each form id maps
    to, then fetches every form from that database. There is no explicit
    assertion; the test relies on the lookup failing when the form is not
    stored in the predicted database.
    """
    num_forms = 100
    form_ids = []
    for _ in range(num_forms):
        form_ids.append(create_form_for_test(DOMAIN).form_id)

    db_by_form_id = ShardAccessor.get_database_for_docs(form_ids)
    for form_id, db_alias in db_by_form_id.items():
        XFormInstanceSQL.objects.using(db_alias).get(form_id=form_id)
def test_hash_in_python(self):
    """Verify that Python and SQL hashing place each document on the same shard.

    Hashes ``N`` document ids through ``ShardAccessor.hash_doc_ids_sql`` and
    ``ShardAccessor.hash_doc_ids_python``, masks every hash with
    ``N_shards - 1`` and asserts that the two ``doc_id -> masked hash``
    dictionaries are identical.
    """
    N = 2048
    doc_ids = [str(i) for i in range(N)]

    sql_hashes = ShardAccessor.hash_doc_ids_sql(doc_ids)
    csiphash_hashes = ShardAccessor.hash_doc_ids_python(doc_ids)

    self.assertEqual(len(csiphash_hashes), N)
    # NOTE(review): ``long`` only exists as a builtin on Python 2; on
    # Python 3 the module presumably defines an alias -- confirm.
    hashes_are_integers = all(
        isinstance(hash_, (int, long)) for hash_ in csiphash_hashes.values()
    )
    self.assertTrue(hashes_are_integers)

    N_shards = 1024
    part_mask = N_shards - 1

    sql_shards = {doc_id: hash_ & part_mask for doc_id, hash_ in sql_hashes.items()}
    # Use the Python hashes here. The previous code iterated ``sql_hashes``
    # a second time, which made the comparison below trivially true.
    python_shards = {
        doc_id: hash_ & part_mask for doc_id, hash_ in csiphash_hashes.items()
    }

    self.assertEqual(python_shards, sql_shards)
def test_settings(self):
    """
    Guard test for the partition layout the other tests in this class
    rely on: exactly two shard databases, ``self.db1`` holding shards
    ``[0, 1]`` and ``self.db2`` holding shards ``[2, 3]``, with
    ``self.p1_uuid`` and ``self.p2_uuid`` resolving to ``self.db1`` and
    ``self.db2`` respectively.
    """
    shard_config = settings.PARTITION_DATABASE_CONFIG['shards']

    self.assertEqual(len(shard_config), 2)
    self.assertIn(self.db1, shard_config)
    self.assertIn(self.db2, shard_config)
    self.assertEqual(shard_config[self.db1], [0, 1])
    self.assertEqual(shard_config[self.db2], [2, 3])

    processing_dbs = set(partition_config.get_form_processing_dbs())
    self.assertEqual(processing_dbs, {self.db1, self.db2})

    self.assertEqual(ShardAccessor.get_database_for_doc(self.p1_uuid), self.db1)
    self.assertEqual(ShardAccessor.get_database_for_doc(self.p2_uuid), self.db2)
def test_uuids_used(self):
    """Check that each fixture uuid resolves to the database it is meant for.

    The three ``p1_uuid*`` values must map to ``self.db1`` and the three
    ``p2_uuid*`` values to ``self.db2``.
    """
    expected_db_by_uuid = (
        (self.p1_uuid1, self.db1),
        (self.p1_uuid2, self.db1),
        (self.p1_uuid3, self.db1),
        (self.p2_uuid1, self.db2),
        (self.p2_uuid2, self.db2),
        (self.p2_uuid3, self.db2),
    )
    for doc_uuid, expected_db in expected_db_by_uuid:
        self.assertEqual(ShardAccessor.get_database_for_doc(doc_uuid), expected_db)
def handle(self, domain, **options):
    """List where every form and case of ``domain`` is stored.

    Prints a forms section followed by a cases section, each with one
    ``id / shard / database`` row per document in sorted id order, and then
    calls ``_print`` with the counts grouped by shard and by database.

    :param domain: domain whose documents are inspected
    :param options: additional keyword options; ignored by this method
    """
    shard_counts_for_forms = Counter()
    db_counts_for_forms = Counter()
    shard_counts_for_cases = Counter()
    db_counts_for_cases = Counter()

    print('======================== forms ========================')
    print('id\t\t\t\t\tshard\tdatabase')
    all_form_ids = FormAccessors(domain=domain).get_all_form_ids_in_domain()
    for doc_id in sorted(all_form_ids):
        shard, db = ShardAccessor.get_shard_id_and_database_for_doc(doc_id)
        shard_counts_for_forms[shard] += 1
        db_counts_for_forms[db] += 1
        print('{}\t{}\t{}'.format(doc_id, shard, db))

    print('\n======================== cases ========================')
    print('id\t\t\t\t\tshard\tdatabase')
    all_case_ids = CaseAccessors(domain=domain).get_case_ids_in_domain()
    for doc_id in sorted(all_case_ids):
        shard, db = ShardAccessor.get_shard_id_and_database_for_doc(doc_id)
        shard_counts_for_cases[shard] += 1
        db_counts_for_cases[db] += 1
        print('{}\t{}\t{}'.format(doc_id, shard, db))

    summaries = (
        (shard_counts_for_forms, 'forms by shard'),
        (db_counts_for_forms, 'forms by db'),
        (shard_counts_for_cases, 'cases by shard'),
        (db_counts_for_cases, 'cases by db'),
    )
    for tally, title in summaries:
        _print(tally, title)
def _group_objects_by_db(objects):
    """
    Group deserialized objects by the database they should be written to.

    :param objects: Deserialized object dictionaries; each must have a
        ``'model'`` key that ``apps.get_model`` can resolve
    :return: List of tuples of (db_alias, [object,...])
    """
    grouped = defaultdict(list)
    for obj in objects:
        model_label = obj['model']
        target_db = router.db_for_write(apps.get_model(model_label))

        routed_to_proxy = (
            settings.USE_PARTITIONED_DATABASE
            and target_db == partition_config.get_proxy_db()
        )
        if routed_to_proxy:
            # The router only named the proxy db; pick the concrete database
            # from the object's doc id instead.
            target_db = ShardAccessor.get_database_for_doc(
                _get_doc_id(model_label, obj)
            )

        grouped[target_db].append(obj)
    return list(grouped.items())
def delete_object_from_partitioned_database(obj, partition_value):
    """
    Delete a partitioned model object from the database that holds it.

    :param obj: A Django model object
    :param partition_value: The value that is used to partition the model;
        this value will be used to select the database
    """
    target_db = (
        ShardAccessor.get_database_for_doc(partition_value)
        if settings.USE_PARTITIONED_DATABASE
        else 'default'
    )
    obj.delete(using=target_db)
def _group_objects_by_db(objects):
    """
    Bucket deserialized objects by the alias of the database to write them to.

    :param objects: Deserialized object dictionaries; the ``'model'`` entry
        of each is resolved with ``apps.get_model``
    :return: List of tuples of (db_alias, [object,...])
    """
    buckets = defaultdict(list)
    for obj in objects:
        model_label = obj['model']
        write_db = router.db_for_write(apps.get_model(model_label))

        uses_proxy = (
            settings.USE_PARTITIONED_DATABASE
            and write_db == partition_config.proxy_db
        )
        if uses_proxy:
            # Replace the proxy alias with the database chosen for this
            # object's doc id.
            doc_id = _get_doc_id(model_label, obj)
            write_db = ShardAccessor.get_database_for_doc(doc_id)

        buckets[write_db].append(obj)
    return list(buckets.items())
def test_get_database_for_docs(self):
    """Check that sharding 1000 doc ids spreads them roughly evenly.

    The number of documents assigned to each database may differ from an
    even split by at most 5% of ``N`` (a deliberately loose check).
    Databases reported by ``partition_config.get_form_processing_dbs()``
    that received no documents at all are counted as zero, so a completely
    unused database is reported as an outlier.
    """
    N = 1000
    doc_ids = [str(i) for i in range(N)]
    doc_db_map = ShardAccessor.get_database_for_docs(doc_ids)

    doc_count_per_db = defaultdict(int)
    for db_alias in doc_db_map.values():
        doc_count_per_db[db_alias] += 1

    form_processing_dbs = list(partition_config.get_form_processing_dbs())
    num_dbs = len(form_processing_dbs)
    even_split = N // num_dbs
    tolerance = N * 0.05  # 5% tolerance

    # Check every configured database, not only those that received docs:
    # a database with zero documents never shows up in ``doc_count_per_db``
    # and would otherwise escape the check entirely.
    dbs_to_check = form_processing_dbs + [
        db_alias for db_alias in doc_count_per_db
        if db_alias not in form_processing_dbs
    ]
    diffs = [
        abs(even_split - doc_count_per_db.get(db_alias, 0))
        for db_alias in dbs_to_check
    ]
    outliers = [diff for diff in diffs if diff > tolerance]

    message = 'partitioning not within tollerance: tolerance={}, diffs={}'.format(tolerance, diffs)
    self.assertEqual(len(outliers), 0, message)
def test_get_database_for_docs(self):
    """Check that 1000 doc ids are distributed across databases within tolerance.

    Each database's document count must be within 5% of ``N`` of an even
    split (a loose sanity check rather than an exact one). Databases listed
    by ``partition_config.get_form_processing_dbs()`` that were assigned no
    documents are included with a count of zero.
    """
    N = 1000
    doc_ids = [str(i) for i in range(N)]
    doc_db_map = ShardAccessor.get_database_for_docs(doc_ids)

    doc_count_per_db = defaultdict(int)
    for db_alias in doc_db_map.values():
        doc_count_per_db[db_alias] += 1

    configured_dbs = list(partition_config.get_form_processing_dbs())
    num_dbs = len(configured_dbs)
    # Floor division gives the integer share directly, without going through
    # a float as ``int(N / num_dbs)`` does.
    even_split = N // num_dbs
    tolerance = N * 0.05  # 5% tolerance

    # A database that received nothing is absent from ``doc_count_per_db``;
    # walk the configured databases as well so that it is still measured.
    extra_dbs = [
        db_alias for db_alias in doc_count_per_db
        if db_alias not in configured_dbs
    ]
    diffs = [
        abs(even_split - doc_count_per_db.get(db_alias, 0))
        for db_alias in configured_dbs + extra_dbs
    ]
    outliers = [diff for diff in diffs if diff > tolerance]

    message = 'partitioning not within tollerance: tolerance={}, diffs={}'.format(tolerance, diffs)
    self.assertEqual(len(outliers), 0, message)
def get_object_from_partitioned_database(model_class, partition_value, partitioned_field_name):
    """
    Fetch a partitioned model object from the database that holds it.

    :param model_class: A Django model class
    :param partition_value: The value that is used to partition the model;
        this value will be used to select the database
    :param partitioned_field_name: The model field on which the object is
        partitioned; the object whose partitioned_field_name attribute
        equals partition_value is returned
    :return: The model object
    """
    if settings.USE_PARTITIONED_DATABASE:
        source_db = ShardAccessor.get_database_for_doc(partition_value)
    else:
        source_db = 'default'

    # The field name is only known at call time, so build the lookup
    # dynamically.
    lookup = {partitioned_field_name: partition_value}
    queryset = model_class.objects.using(source_db)
    return queryset.get(**lookup)
def _publish_cases_for_sql(domain, case_records):
    """Send a ``topics.CASE_SQL`` change for every record in ``case_records``.

    Records that already carry a ``doc_subtype`` are published as-is. For the
    rest, the case type is looked up in chunks of 10000 ids from the database
    that ``ShardAccessor.get_docs_by_database`` assigns each id to.

    :param domain: domain passed through to ``_change_meta_for_sql_case``
    :param case_records: iterable of records exposing ``doc_id`` and
        ``doc_subtype``; it is consumed exactly once
    :raises AssertionError: if a database returns a different number of rows
        than the number of ids requested from it
    """
    # Partition in a single pass. Two separate ``filter`` calls over the same
    # argument would leave the second one empty whenever ``case_records`` is
    # a one-shot iterable (e.g. a generator).
    records_with_types = []
    records_with_no_types = []
    for record in case_records:
        if record.doc_subtype:
            records_with_types.append(record)
        else:
            records_with_no_types.append(record)

    # if we already had a type just publish as-is
    for record in records_with_types:
        producer.send_change(
            topics.CASE_SQL,
            _change_meta_for_sql_case(domain, record.doc_id, record.doc_subtype)
        )

    # else lookup the type from the database
    for record_chunk in chunked(records_with_no_types, 10000):
        # databases will contain a mapping of shard database ids to case_ids in that DB
        id_chunk = [r.doc_id for r in record_chunk]
        databases = ShardAccessor.get_docs_by_database(id_chunk)
        for db, doc_ids in databases.items():
            results = CommCareCaseSQL.objects.using(db).filter(
                case_id__in=doc_ids,
            ).values_list('case_id', 'type')
            # make sure we found the same number of IDs
            assert len(results) == len(doc_ids)
            for case_id, case_type in results:
                producer.send_change(
                    topics.CASE_SQL,
                    _change_meta_for_sql_case(domain, case_id, case_type)
                )
def test_hash_uuid(self):
    """Check that a fixed UUID hashes to the same known value in Python and SQL."""
    doc_uuid = UUID('403724ef9fe141f2908363918c62c2ff')
    expected_hash = 1415444857

    python_hash = ShardAccessor.hash_doc_id_python(doc_uuid)
    self.assertEqual(python_hash, expected_hash)

    sql_hash = ShardAccessor.hash_doc_uuid_sql_for_testing(doc_uuid)
    self.assertEqual(sql_hash, expected_hash)
def test_hash_doc_ids(self):
    """Check that ``hash_doc_ids_sql`` returns an ``int`` hash for each of ``N`` ids."""
    N = 1001
    doc_ids = [str(i) for i in range(N)]

    hashes = ShardAccessor.hash_doc_ids_sql(doc_ids)

    # ``assertEquals`` is a deprecated alias that Python 3.12 removed from
    # ``unittest.TestCase``; ``assertEqual`` is the supported spelling.
    self.assertEqual(len(hashes), N)
    self.assertTrue(all(isinstance(hash_, int) for hash_ in hashes.values()))
def get_db_alias_for_partitioned_doc(partition_value):
    """Return the alias of the database selected by ``partition_value``.

    :param partition_value: value handed to
        ``ShardAccessor.get_database_for_doc`` to pick the database
    :return: the alias chosen by ``ShardAccessor`` when
        ``settings.USE_PARTITIONED_DATABASE`` is truthy, else ``'default'``
    """
    if not settings.USE_PARTITIONED_DATABASE:
        return 'default'
    return ShardAccessor.get_database_for_doc(partition_value)