def test_add_then_flush(self):
    """add() in one call and flush() in another should bulk-create the data."""
    mock_model = MagicMock()
    record = {'a': 1}
    ops = BatchOperations(mock_model)
    ops.add(record)
    # The add is pending until we flush.
    self.assertEqual(1, ops.num_pending_adds)
    ops.flush()
    # Flushing empties the pending list and bulk-creates the instances.
    self.assertEqual(0, ops.num_pending_adds)
    mock_model.objects.bulk_create.assert_called_with([mock_model(**record)])
def test_add_one_record(self):
    """Each add() call should grow the pending-adds count by one.

    We deliberately don't inspect what's stored — that's an implementation
    detail — only that the count of pending adds is correct.
    """
    ops = BatchOperations(None)
    record = {'civil_registry_id': 1}
    for _ in range(2):
        ops.add(record)
    # Two adds means two pending entries.
    self.assertEqual(2, ops.num_pending_adds)
def test_delete_then_flush(self):
    """flush() after delete(pk) should call model.objects.filter(pk__in=[pk]).delete()."""
    pk = 1
    model = MagicMock()
    # The utility ought to call model.objects.filter(pk__in=[pk]).delete().
    # Attach side_effect to model.objects.filter — NOT to the model itself —
    # so our callback runs when the code under test calls filter(), letting
    # us check it passed the expected args, pk__in=[pk].
    # The callback returns another mock, which the caller should try to
    # call delete() on, so we can check that too.
    # (The original rebound `model = MagicMock(side_effect=our_callback)`,
    # which only fires when model() itself is called, so filter_return.delete
    # was never invoked and the final assertion could not pass.)
    filter_return = MagicMock()

    def our_callback(*args, **kwargs):
        assert not args
        assert kwargs == {'pk__in': [pk]}
        return filter_return

    model.objects.filter = MagicMock(side_effect=our_callback)
    batch = BatchOperations(model)
    batch.delete(pk)
    batch.flush()
    filter_return.delete.assert_called()
def mirror_database(
        from_model, to_model, from_db_name='default', to_db_name='default'):
    """
    Given two tables with the same schema, possibly in different
    databases, update the second efficiently to contain the same data
    as the first. Records present in the target but absent from the
    source are NOT deleted here; their primary keys are collected in
    ``stats.missing_pks`` so the caller can decide what to do with them.

    Returns a MirrorStats object with some statistics about what happened.

    :param from_model, to_model: The Django models to copy from and to.
    :param from_db_name: Name of the DATABASES key of the database we're
       copying from. Default: 'default'.
    :param to_db_name: Name of the DATABASES key of the database we're
       copying to. Default: 'default'.
    """
    # NOW comes the fun part. We'll go through our two tables in parallel
    # in primary key order, so we can spot records in the target that no
    # longer exist in the source, and add or update records that have been
    # added or updated.
    from_queryset = from_model.objects.using(from_db_name).order_by('pk').iterator()
    to_queryset = to_model.objects.using(to_db_name).order_by('pk').iterator()

    def next_from():
        """Return next record in the table we're copying from, or None"""
        # Use the builtin next() with a default: the original called the
        # Python-2-only .next() method, which raises AttributeError on
        # Python 3 iterators.
        return next(from_queryset, None)

    def next_to():
        """Return next record in the table we're copying to, or None"""
        return next(to_queryset, None)

    stats = MirrorStats()

    # We'll do our adds and deletes in bulk.
    # Too bad there's no bulk_update, but updates should be the
    # least frequent operation.
    batch = BatchOperations(to_model)

    from_record = next_from()
    to_record = next_to()
    while from_record and to_record:
        if from_record.pk == to_record.pk:
            # Same record - if the data has changed, update our mirror
            # Note that if a record was flagged 'missing' in a previous
            # import and turns up again in a later dump, this update
            # will turn off the missing field for us.
            if model_to_dict(from_record) != model_to_dict(to_record):
                stats.modified_record_count += 1
                to_model.objects.filter(pk=from_record.pk)\
                    .update(**model_to_dict(from_record))
            else:
                stats.unchanged_count += 1
            # We've dealt with both of these records, move on in both tables
            from_record = next_from()
            to_record = next_to()
        elif from_record.pk < to_record.pk:
            # The "TO" table is missing a record that the FROM table has, so add it
            stats.new_record_count += 1
            batch.add(model_to_dict(from_record))
            # We've dealt with this from_record, on to the next
            from_record = next_from()
        else:  # from_record.pk > to_record.pk:
            # The "TO" table has a record that's not in the from table;
            # note its pk, but leave any deletion to the caller.
            stats.not_there_anymore_count += 1
            stats.missing_pks.append(to_record.pk)
            # We've dealt with this to_record, on to the next
            to_record = next_to()

    # At this point, we might have left over records from one table
    # or the other (though not both)
    while from_record:
        # Records that aren't in the "TO" table and need to be added
        stats.new_record_count += 1
        batch.add(model_to_dict(from_record))
        from_record = next_from()
    while to_record:
        # Records in the "TO" table that are missing from the source
        stats.not_there_anymore_count += 1
        stats.missing_pks.append(to_record.pk)
        to_record = next_to()

    # Finish out the batches if needed
    batch.flush()

    # Return statistics
    return stats
def import_citizen_dump(input_filename,
                        max_change_percent=DEFAULT_MAX_CHANGE_PERCENT,
                        encoding='UTF-8'):
    """
    Import a citizen dump file into the Citizen table, atomically.

    Loads the dump into the TempCitizen staging table, mirrors it into
    Citizen, and flags (rather than deletes) records that disappeared
    from the dump. Aborts (raising TooManyChanges) if the import would
    change more than max_change_percent percent of the existing records.

    :param input_filename: Path of the dump file to read.
    :param max_change_percent: Maximum percentage of existing records the
        import may change before we abort. Default: DEFAULT_MAX_CHANGE_PERCENT.
    :param encoding: Text encoding of the dump file. Default: 'UTF-8'.
    :raises TooManyChanges: if the change percentage exceeds the limit
        (rolls back the whole transaction).
    :returns: the MirrorStats from mirror_database, with records_read added.
    """
    with transaction.atomic():
        # Clear out TempCitizen table. (We clear it at the end too, but this makes
        # extra sure that we start with it empty.)
        delete_all('default', [TempCitizen])
        num_records_at_start = Citizen.objects.count()

        #
        # 1. Fill our temp table with the data from the latest dump
        #
        logger.info("Loading data from dump")
        logger.info("Reading %s" % input_filename)
        batch = BatchOperations(TempCitizen)
        records_read = 0
        # Use a context manager so the file is closed deterministically
        # (the original leaked the open handle).
        with codecs.open(input_filename, encoding=encoding) as input_file:
            for record in get_records(input_file):
                records_read += 1
                batch.add(record)
        batch.flush()

        #
        # 2. Sync data from temp table to our real table
        #
        logger.info("Updating our own database")
        stats = mirror_database(from_model=TempCitizen, to_model=Citizen)

        # See what % of the records we're changing
        if num_records_at_start > 0:
            num_changes = (stats.modified_record_count + stats.new_record_count
                           + stats.not_there_anymore_count)
            # Force float division: under Python 2's integer division this
            # expression silently rounded to 0, defeating the safety check.
            percent_changed = 100.0 * num_changes / num_records_at_start
            if percent_changed > max_change_percent:
                raise TooManyChanges(
                    "Too many changes, aborting Citizen data import. Max change is %f%% but "
                    "the import would have changed %f%% records (%d/%d). Use "
                    "--max-change-percent=NN to override this limit if necessary."
                    % (max_change_percent, percent_changed, num_changes,
                       num_records_at_start))

        # Add our data
        stats.records_read = records_read

        # Make a note of when we did it
        timestamp = now()
        CitizenMetadata.objects.update_or_create(defaults=dict(dump_time=timestamp))

        # Flag any records that turned up missing
        if stats.missing_pks:
            Citizen.objects.filter(pk__in=stats.missing_pks,
                                   missing=None).update(missing=timestamp)

        # And we're done!
        # Clear out our temp table (no point in taking up disk space)
        delete_all('default', [TempCitizen])
        return stats
def mirror_database(from_model, to_model, from_db_name='default',
                    to_db_name='default'):
    """
    Given two tables with the same schema, possibly in different
    databases, update the second efficiently to contain the same data
    as the first. Records present in the target but absent from the
    source are NOT deleted here; their primary keys are collected in
    ``stats.missing_pks`` so the caller can decide what to do with them.

    Returns a MirrorStats object with some statistics about what happened.

    :param from_model, to_model: The Django models to copy from and to.
    :param from_db_name: Name of the DATABASES key of the database we're
       copying from. Default: 'default'.
    :param to_db_name: Name of the DATABASES key of the database we're
       copying to. Default: 'default'.
    """
    # NOW comes the fun part. We'll go through our two tables in parallel
    # in primary key order, so we can spot records in the target that no
    # longer exist in the source, and add or update records that have been
    # added or updated.
    from_queryset = from_model.objects.using(from_db_name).order_by(
        'pk').iterator()
    to_queryset = to_model.objects.using(to_db_name).order_by('pk').iterator()

    def next_from():
        """Return next record in the table we're copying from, or None"""
        # Use the builtin next() with a default: the original called the
        # Python-2-only .next() method, which raises AttributeError on
        # Python 3 iterators.
        return next(from_queryset, None)

    def next_to():
        """Return next record in the table we're copying to, or None"""
        return next(to_queryset, None)

    stats = MirrorStats()

    # We'll do our adds and deletes in bulk.
    # Too bad there's no bulk_update, but updates should be the
    # least frequent operation.
    batch = BatchOperations(to_model)

    from_record = next_from()
    to_record = next_to()
    while from_record and to_record:
        if from_record.pk == to_record.pk:
            # Same record - if the data has changed, update our mirror
            # Note that if a record was flagged 'missing' in a previous
            # import and turns up again in a later dump, this update
            # will turn off the missing field for us.
            if model_to_dict(from_record) != model_to_dict(to_record):
                stats.modified_record_count += 1
                to_model.objects.filter(pk=from_record.pk)\
                    .update(**model_to_dict(from_record))
            else:
                stats.unchanged_count += 1
            # We've dealt with both of these records, move on in both tables
            from_record = next_from()
            to_record = next_to()
        elif from_record.pk < to_record.pk:
            # The "TO" table is missing a record that the FROM table has, so add it
            stats.new_record_count += 1
            batch.add(model_to_dict(from_record))
            # We've dealt with this from_record, on to the next
            from_record = next_from()
        else:  # from_record.pk > to_record.pk:
            # The "TO" table has a record that's not in the from table;
            # note its pk, but leave any deletion to the caller.
            stats.not_there_anymore_count += 1
            stats.missing_pks.append(to_record.pk)
            # We've dealt with this to_record, on to the next
            to_record = next_to()

    # At this point, we might have left over records from one table
    # or the other (though not both)
    while from_record:
        # Records that aren't in the "TO" table and need to be added
        stats.new_record_count += 1
        batch.add(model_to_dict(from_record))
        from_record = next_from()
    while to_record:
        # Records in the "TO" table that are missing from the source
        stats.not_there_anymore_count += 1
        stats.missing_pks.append(to_record.pk)
        to_record = next_to()

    # Finish out the batches if needed
    batch.flush()

    # Return statistics
    return stats
def test_delete_one_record(self):
    """A single delete() call should leave exactly one pending delete."""
    ops = BatchOperations(None)
    ops.delete(1)
    self.assertEqual(1, ops.num_pending_deletes)