Example #1
0
 def test_add_then_flush(self):
     # Adding a record and flushing may happen in separate calls:
     # add() only queues, flush() performs the bulk_create.
     mock_model = MagicMock()
     record = {'a': 1}
     ops = BatchOperations(mock_model)
     ops.add(record)
     # One add queued, nothing written yet.
     self.assertEqual(1, ops.num_pending_adds)
     ops.flush()
     # Queue drained, and the model instance was bulk-created.
     self.assertEqual(0, ops.num_pending_adds)
     mock_model.objects.bulk_create.assert_called_with([mock_model(**record)])
Example #2
0
 def test_add_one_record(self):
     # Each add() call should queue one more pending add.
     # (What's stored internally is an implementation detail;
     # we only check the pending count.)
     ops = BatchOperations(None)
     payload = {'civil_registry_id': 1}
     for _ in range(2):
         ops.add(payload)
     # Two adds queued, none flushed.
     self.assertEqual(2, ops.num_pending_adds)
Example #3
0
    def test_delete_then_flush(self):
        """Flushing a pending delete should call
        model.objects.filter(pk__in=[pk]).delete()."""
        pk = 1
        model = MagicMock()

        # Use side_effect to get the mock to call our code when the
        # utility calls objects.filter, so that we can check that it
        # passed the expected args, pk__in=[pk].
        # Then return another mock, which the caller should try to
        # call delete() on, so we can check that too.
        filter_return = MagicMock()

        def our_callback(*args, **kwargs):
            assert not args
            assert kwargs == {'pk__in': [pk]}
            return filter_return

        # BUG FIX: the side effect must be attached to
        # model.objects.filter, not to the model itself.  The original
        # code did `model = MagicMock(side_effect=our_callback)`, which
        # (a) discarded the model created above and (b) only triggered
        # the callback when the *model* was called -- but the code under
        # test calls model.objects.filter(...), so the callback never
        # ran and filter_return.delete was never reached.
        model.objects.filter.side_effect = our_callback
        batch = BatchOperations(model)
        batch.delete(pk)
        batch.flush()
        filter_return.delete.assert_called()
Example #4
0
def mirror_database(
        from_model,
        to_model,
        from_db_name='default',
        to_db_name='default'):
    """
    Given two tables with the same schema, possibly in different databases,
    update the second efficiently to contain the same data as the first.

    Records present in the target table but missing from the source are
    NOT deleted here; their primary keys are collected in
    ``stats.missing_pks`` so the caller can decide how to handle them.

    Returns a MirrorStats object with some statistics about what happened.

    :param from_model: The Django model to copy from.
    :param to_model: The Django model to copy to.
    :param from_db_name: Name of the DATABASES key of the database we're
    copying from. Default: 'default'.
    :param to_db_name: Name of the DATABASES key of the database we're
    copying to. Default: 'default'.
    """

    # Walk both tables in parallel in primary key order, so that a single
    # pass can classify every record as unchanged, modified, added, or
    # no-longer-present.
    from_queryset = from_model.objects.using(from_db_name).order_by('pk').iterator()
    to_queryset = to_model.objects.using(to_db_name).order_by('pk').iterator()

    def next_from():
        """Return next record in the table we're copying from, or None"""
        try:
            # BUG FIX: use the next() builtin; the .next() iterator method
            # is Python 2 only and raises AttributeError on Python 3.
            return next(from_queryset)
        except StopIteration:
            return None

    def next_to():
        """Return next record in the table we're copying to, or None"""
        try:
            # BUG FIX: next() builtin instead of Python 2's .next().
            return next(to_queryset)
        except StopIteration:
            return None

    stats = MirrorStats()

    # We'll do our adds and deletes in bulk.
    # Too bad there's no bulk_update, but updates should be the
    # least frequent operation.
    batch = BatchOperations(to_model)

    from_record = next_from()
    to_record = next_to()
    while from_record and to_record:
        if from_record.pk == to_record.pk:
            # Same record - if the data has changed, update our mirror
            # Note that if a record was flagged 'missing' in a previous
            # import and turns up again in a later dump, this update
            # will turn off the missing field for us.
            if model_to_dict(from_record) != model_to_dict(to_record):
                stats.modified_record_count += 1
                to_model.objects.filter(pk=from_record.pk)\
                    .update(**model_to_dict(from_record))
            else:
                stats.unchanged_count += 1
            # We've dealt with both of these records, move on in both tables
            from_record = next_from()
            to_record = next_to()
        elif from_record.pk < to_record.pk:
            # The "TO" table is missing a record that the FROM table has, so add it
            stats.new_record_count += 1
            batch.add(model_to_dict(from_record))
            # We've dealt with this from_record, on to the next
            from_record = next_from()
        else:  # from_record.pk > to_record.pk:
            # The "TO" table has a record that's not in the FROM table;
            # record its pk so the caller can decide whether to remove it.
            stats.not_there_anymore_count += 1
            stats.missing_pks.append(to_record.pk)
            # We've dealt with this to_record, on to the next
            to_record = next_to()

    # At this point, we might have left over records from one table
    # or the other (though not both)
    while from_record:
        # Records that aren't in the "TO" table and need to be added
        stats.new_record_count += 1
        batch.add(model_to_dict(from_record))
        from_record = next_from()
    while to_record:
        # Records in the "TO" table that are gone from the "FROM" table
        stats.not_there_anymore_count += 1
        stats.missing_pks.append(to_record.pk)
        to_record = next_to()

    # Finish out the batches if needed
    batch.flush()

    # Return statistics
    return stats
Example #5
0
def import_citizen_dump(input_filename,
                        max_change_percent=DEFAULT_MAX_CHANGE_PERCENT,
                        encoding='UTF-8'):
    """
    Load a citizen dump file into the Citizen table, atomically.

    The dump is first loaded into TempCitizen, then mirrored into
    Citizen via mirror_database().  If the import would change more
    than max_change_percent percent of the existing records, the whole
    transaction is aborted with TooManyChanges.

    :param input_filename: Path to the dump file to import.
    :param max_change_percent: Abort if more than this percentage of
        existing records would change. Default: DEFAULT_MAX_CHANGE_PERCENT.
    :param encoding: Text encoding of the dump file. Default: 'UTF-8'.
    :returns: the MirrorStats from mirror_database(), with records_read
        added.
    :raises TooManyChanges: if the change threshold would be exceeded.
    """

    with transaction.atomic():

        # Clear out TempCitizen table. (We clear it at the end too, but this makes
        # extra sure that we start with it empty.)
        delete_all('default', [TempCitizen])

        num_records_at_start = Citizen.objects.count()

        #
        # 1. Fill our temp table with the data from the latest dump
        #
        logger.info("Loading data from dump")
        logger.info("Reading %s" % input_filename)
        batch = BatchOperations(TempCitizen)
        records_read = 0
        # BUG FIX: use a context manager so the dump file is always
        # closed; the original left the file handle open (leaked).
        with codecs.open(input_filename, encoding=encoding) as input_file:
            for record in get_records(input_file):
                records_read += 1
                batch.add(record)
        batch.flush()

        #
        # 2. Sync data from temp table to our real table
        #
        logger.info("Updating our own database")
        stats = mirror_database(from_model=TempCitizen,
                                to_model=Citizen)

        # See what % of the records we're changing
        if num_records_at_start > 0:
            num_changes = (stats.modified_record_count + stats.new_record_count
                           + stats.not_there_anymore_count)
            # BUG FIX: force float division.  Under Python 2 semantics,
            # num_changes / num_records_at_start was integer division and
            # usually truncated to 0, silently disabling this safety check.
            percent_changed = 100.0 * num_changes / num_records_at_start
            if percent_changed > max_change_percent:
                raise TooManyChanges(
                    "Too many changes, aborting Citizen data import. Max change is %f%% but "
                    "the import would have changed %f%% records (%d/%d).  Use "
                    "--max-change-percent=NN to override this limit if necessary."
                    % (max_change_percent, percent_changed, num_changes, num_records_at_start))

        # Add our data
        stats.records_read = records_read

        # Make a note of when we did it
        timestamp = now()
        CitizenMetadata.objects.update_or_create(defaults=dict(dump_time=timestamp))

        # Flag any records that turned up missing (only records not
        # already flagged, so we keep the earliest missing timestamp)
        if stats.missing_pks:
            Citizen.objects.filter(pk__in=stats.missing_pks, missing=None).update(missing=timestamp)

        # And we're done!

        # Clear out our temp table (no point in taking up disk space)
        delete_all('default', [TempCitizen])

    return stats
Example #6
0
def mirror_database(from_model,
                    to_model,
                    from_db_name='default',
                    to_db_name='default'):
    """
    Given two tables with the same schema, possibly in different databases,
    update the second efficiently to contain the same data as the first.

    Records present in the target table but missing from the source are
    NOT deleted here; their primary keys are collected in
    ``stats.missing_pks`` so the caller can decide how to handle them.

    Returns a MirrorStats object with some statistics about what happened.

    :param from_model: The Django model to copy from.
    :param to_model: The Django model to copy to.
    :param from_db_name: Name of the DATABASES key of the database we're
    copying from. Default: 'default'.
    :param to_db_name: Name of the DATABASES key of the database we're
    copying to. Default: 'default'.
    """

    # Walk both tables in parallel in primary key order, so that a single
    # pass can classify every record as unchanged, modified, added, or
    # no-longer-present.
    from_queryset = from_model.objects.using(from_db_name).order_by(
        'pk').iterator()
    to_queryset = to_model.objects.using(to_db_name).order_by('pk').iterator()

    def next_from():
        """Return next record in the table we're copying from, or None"""
        try:
            # BUG FIX: use the next() builtin; the .next() iterator method
            # is Python 2 only and raises AttributeError on Python 3.
            return next(from_queryset)
        except StopIteration:
            return None

    def next_to():
        """Return next record in the table we're copying to, or None"""
        try:
            # BUG FIX: next() builtin instead of Python 2's .next().
            return next(to_queryset)
        except StopIteration:
            return None

    stats = MirrorStats()

    # We'll do our adds and deletes in bulk.
    # Too bad there's no bulk_update, but updates should be the
    # least frequent operation.
    batch = BatchOperations(to_model)

    from_record = next_from()
    to_record = next_to()
    while from_record and to_record:
        if from_record.pk == to_record.pk:
            # Same record - if the data has changed, update our mirror
            # Note that if a record was flagged 'missing' in a previous
            # import and turns up again in a later dump, this update
            # will turn off the missing field for us.
            if model_to_dict(from_record) != model_to_dict(to_record):
                stats.modified_record_count += 1
                to_model.objects.filter(pk=from_record.pk)\
                    .update(**model_to_dict(from_record))
            else:
                stats.unchanged_count += 1
            # We've dealt with both of these records, move on in both tables
            from_record = next_from()
            to_record = next_to()
        elif from_record.pk < to_record.pk:
            # The "TO" table is missing a record that the FROM table has, so add it
            stats.new_record_count += 1
            batch.add(model_to_dict(from_record))
            # We've dealt with this from_record, on to the next
            from_record = next_from()
        else:  # from_record.pk > to_record.pk:
            # The "TO" table has a record that's not in the FROM table;
            # record its pk so the caller can decide whether to remove it.
            stats.not_there_anymore_count += 1
            stats.missing_pks.append(to_record.pk)
            # We've dealt with this to_record, on to the next
            to_record = next_to()

    # At this point, we might have left over records from one table
    # or the other (though not both)
    while from_record:
        # Records that aren't in the "TO" table and need to be added
        stats.new_record_count += 1
        batch.add(model_to_dict(from_record))
        from_record = next_from()
    while to_record:
        # Records in the "TO" table that are gone from the "FROM" table
        stats.not_there_anymore_count += 1
        stats.missing_pks.append(to_record.pk)
        to_record = next_to()

    # Finish out the batches if needed
    batch.flush()

    # Return statistics
    return stats
Example #7
0
 def test_delete_one_record(self):
     # A delete() call should only queue the pk, not execute anything.
     ops = BatchOperations(None)
     ops.delete(1)
     # Exactly one delete pending.
     self.assertEqual(1, ops.num_pending_deletes)