コード例 #1
0
def test_bulk_insert(valid_build):
    """Equal builds collapse into one row; re-inserting known builds is a no-op."""
    first, second = valid_build(), valid_build()
    # Equal by value, yet two separate objects.
    assert first == second
    assert first is not second
    inserted, skipped = Build.bulk_insert([first, second])
    assert skipped == 0
    # Only one insert is reported, since the two payloads compare equal.
    assert inserted == 1
    assert Build.objects.all().count() == 1

    # Make the three payloads distinct by varying the download size.
    second["download"]["size"] += 1
    third = valid_build()
    third["download"]["size"] += 2
    inserted, skipped = Build.bulk_insert([first, second, third])
    assert skipped == 0
    assert inserted == 2
    # Rows written by a single bulk call are still expected to carry
    # differing created_at values.
    timestamps = [build.created_at for build in Build.objects.all()]
    assert timestamps[0] != timestamps[1]
    assert timestamps[1] != timestamps[2]
    assert Build.objects.all().count() == 3

    # Another pass with the very same payloads inserts nothing new.
    inserted, skipped = Build.bulk_insert([first, second, third])
    assert skipped == 0
    assert inserted == 0
コード例 #2
0
def test_bulk_insert_invalid(valid_build):
    """One invalid build makes the whole bulk insert raise and store nothing."""
    good = valid_build()
    bad = valid_build()
    # Without 'target' the build fails validation (see the asserted message).
    bad.pop("target")
    with pytest.raises(ValidationError) as excinfo:
        Build.bulk_insert([good, bad])
    message = str(excinfo.value)
    assert "'target' is a required property" in message
    # The valid build listed ahead of the bad one must not be saved either.
    assert not Build.objects.exists()
コード例 #3
0
def test_bulk_insert_invalid_skip_invalid(valid_build):
    """With skip_invalid=True the invalid build is counted as skipped."""
    good = valid_build()
    bad = valid_build()
    bad.pop("target")

    outcome = Build.bulk_insert([good, bad], skip_invalid=True)
    inserted, skipped = outcome
    assert inserted == 1
    assert skipped == 1
    # Only the valid build ends up stored.
    assert Build.objects.count() == 1
コード例 #4
0
    def handle(self, *args, **options):
        """Bulk-insert the batches produced by ``self.iterator(options)``.

        Every batch is passed to ``Build.bulk_insert`` (tagged with the
        ``kinto-migration`` metadata) and per-page progress is printed,
        followed by overall totals and the elapsed time in minutes.

        Options read: ``skip_validation`` and ``skip_invalid``.

        Raises:
            ImproperlyConfigured: if ``settings.DATABASES`` has no truthy
                "kinto" entry.
        """
        if not settings.DATABASES.get("kinto"):
            raise ImproperlyConfigured(
                "See configuration documentation about setting up "
                "second the 'kinto' connection.")

        skip_validation = options["skip_validation"]
        skip_invalid = options["skip_invalid"]
        done = 0
        skipped = 0
        inserted_total = 0
        started = time.time()
        for page, (batch, total_records) in enumerate(self.iterator(options), 1):
            # Each batch item is indexable; its first element is the build.
            builds = []
            for row in batch:
                payload = row[0]
                # When skipping invalid records, drop payloads that lack a
                # "build" key before they reach bulk_insert.
                if skip_invalid and "build" not in payload:
                    continue
                builds.append(payload)
            count = len(builds)
            print(f"Page {page} ({count} records)")

            t0 = time.time()
            inserted, batch_skipped = Build.bulk_insert(
                builds,
                skip_validation=skip_validation,
                skip_invalid=skip_invalid,
                metadata={"kinto-migration": True},
            )
            elapsed = time.time() - t0

            done += count
            skipped += batch_skipped
            inserted_total += inserted
            percent = 100 * done / total_records
            print(
                f"Inserted {inserted:,} new out of {count:,} in "
                f"{elapsed:.2f} seconds. {done:,} of {total_records:,} "
                f"({percent:.1f}%)"
            )
            if batch_skipped:
                print(f"Skipped {batch_skipped} invalid records.")
        finished = time.time()

        print(f"In total, skipped {skipped} invalid records.")
        print(f"In total, processed {done} valid records.")
        print(f"In total, inserted {inserted_total} valid records.")

        minutes = (finished - started) / 60
        print(f"The whole migration took {minutes:.1f} minutes.")
コード例 #5
0
ファイル: kinto-migration.py プロジェクト: smarnach/buildhub2
    def handle(self, *args, **options):
        """Page through Kinto release records over HTTP and bulk-insert them.

        When ``options["continue"]`` is truthy the start URL is read from
        ``self.next_url_log_file``. Otherwise the server at
        ``options["kinto-url"]`` is pinged first and the records URL is built
        from it. Every page yielded by ``self.iterator(session, url)`` is
        stripped of its ``id`` and ``last_modified`` keys and handed to
        ``Build.bulk_insert``; progress is logged and counted in ``metrics``.
        """
        if not options["continue"]:
            # Ping the server first to make sure it really is a Kinto instance.
            kinto_url = options["kinto-url"]
            response = requests.get(kinto_url)
            response.raise_for_status()
            assert response.json()["project_name"] == "kinto", response.json()

            # Drop a single trailing slash so the joined URL has no "//".
            base = kinto_url[:-1] if kinto_url.endswith("/") else kinto_url
            url = (
                f"{base}/buckets/build-hub/collections/releases/records"
                "?_limit=10000")
        else:
            with open(self.next_url_log_file) as f:
                url = f.read().strip()
            logger.info(f"Continuing with URL {url}")

        session = requests.Session()
        skip_validation = options["skip_validation"]
        done = 0
        for page, (batch, total_records) in enumerate(
            self.iterator(session, url), 1
        ):
            logger.info(f"Page {page} ({len(batch)} records)")
            # Remove the two keys from each record (in place) before
            # handing the records to bulk_insert.
            builds = []
            for record in batch:
                for key in ("id", "last_modified"):
                    record.pop(key)
                builds.append(record)

            t0 = time.time()
            inserted, _ = Build.bulk_insert(
                builds,
                skip_validation=skip_validation,
                metadata={"kinto-migration": True},
            )
            elapsed = time.time() - t0

            metrics.incr("kinto_migrated", value=len(builds))
            metrics.incr("kinto_inserted", value=inserted)
            done += len(batch)
            percent = 100 * done / total_records
            logger.info(
                f"Inserted {inserted:,} new out of {len(builds):,} in "
                f"{elapsed:.2f} seconds. {done:,} of {total_records:,} "
                f"({percent:.1f}%)"
            )
コード例 #6
0
ファイル: kinto-migration.py プロジェクト: peterbe/buildhub2
    def handle(self, *args, **options):
        """Page through Kinto release records over HTTP and bulk-insert them.

        The server at ``options['kinto-url']`` is pinged first, then every
        page yielded by ``self.iterator(session, url)`` is stripped of its
        ``id`` and ``last_modified`` keys and handed to ``Build.bulk_insert``.
        Progress is logged after each page.
        """
        # Ping the server first to make sure it really is a Kinto instance.
        kinto_url = options['kinto-url']
        response = requests.get(kinto_url)
        response.raise_for_status()
        assert response.json()['project_name'] == 'kinto', response.json()

        # Drop a single trailing slash so the joined URL has no "//".
        base = kinto_url[:-1] if kinto_url.endswith('/') else kinto_url
        url = (f"{base}/buckets/build-hub/collections/releases/records"
               "?_limit=10000")

        session = requests.Session()
        skip_validation = options['skip_validation']
        done = 0
        for page, (batch, total_records) in enumerate(
            self.iterator(session, url), 1
        ):
            logger.info(f"Page {page} ({len(batch)} records)")
            # Remove the two keys from each record (in place) before
            # handing the records to bulk_insert.
            builds = []
            for record in batch:
                for key in ('id', 'last_modified'):
                    record.pop(key)
                builds.append(record)

            t0 = time.time()
            # NOTE(review): the return value is formatted below as a single
            # number -- confirm bulk_insert returns a plain count here.
            inserted = Build.bulk_insert(
                builds,
                skip_validation=skip_validation,
                metadata={'kinto-migration': True},
            )
            elapsed = time.time() - t0

            done += len(batch)
            percent = 100 * done / total_records
            logger.info(
                f"Inserted {inserted:,} new out of {len(builds):,} in "
                f"{elapsed:.2f} seconds. {done:,} of {total_records:,} "
                f"({percent:.1f}%)"
            )
コード例 #7
0
    def handle(self, *args, **options):
        """Bulk-insert the batches produced by ``self.iterator(options)``.

        Prints the current ``Build`` row count and the number of rows in the
        "kinto" connection's ``records`` table that match
        ``options["parent_id"]`` / ``options["collection_id"]``. Each batch is
        then passed to ``Build.bulk_insert`` (tagged with the
        ``kinto-migration`` metadata) with per-page progress, followed by
        overall totals and the elapsed time in minutes.

        Options read: ``parent_id``, ``collection_id``, ``skip_validation``
        and ``skip_invalid``.

        Raises:
            ImproperlyConfigured: if ``settings.DATABASES`` has no truthy
                "kinto" entry.
        """
        if not settings.DATABASES.get("kinto"):
            raise ImproperlyConfigured(
                "See configuration documentation about setting up "
                "second the 'kinto' connection."
            )

        current_count = Build.objects.all().count()
        print(f"There are currently {current_count:,} in our existing database.")

        with connections["kinto"].cursor() as cursor:
            cursor.execute(
                """
                SELECT COUNT(*)
                FROM records
                WHERE
                    parent_id = %s AND collection_id = %s
            """,
                [options["parent_id"], options["collection_id"]],
            )
            (total_records,) = cursor.fetchone()
            print(f"There are currently {total_records:,} in the Kinto database.")

        skip_validation = options["skip_validation"]
        skip_invalid = options["skip_invalid"]
        done = 0
        skipped = 0
        inserted_total = 0
        total_t0 = time.time()
        for page, batch in enumerate(self.iterator(options), 1):
            builds = []
            for row in batch:
                # Each batch item is indexable; its first element is the build.
                build = row[0]
                # When skipping invalid records, drop payloads that lack a
                # "build" key before they reach bulk_insert.
                if skip_invalid and "build" not in build:
                    continue
                # The one common thing in the old Kinto database is that each
                # build has a key 'schema' which is just a timestamp (integer).
                # Remove it so it cannot cause non-critical validation errors.
                # pop() with a default removes the key whenever it is present,
                # including when its value is falsy (e.g. 0), which a
                # truthiness check on .get("schema") would leave behind.
                build.pop("schema", None)
                builds.append(build)
            count = len(builds)
            print(f"Page {page} ({count:,} records)")

            t0 = time.time()
            inserted, batch_skipped = Build.bulk_insert(
                builds,
                skip_validation=skip_validation,
                skip_invalid=skip_invalid,
                metadata={"kinto-migration": True},
            )
            t1 = time.time()

            done += count
            skipped += batch_skipped
            inserted_total += inserted
            percent = 100 * done / total_records
            print(
                f"Inserted {inserted:,} new out of {count:,} in "
                f"{t1 - t0:.2f} seconds. {done:,} of {total_records:,} "
                f"({percent:.1f}%)"
            )
            if batch_skipped:
                print(f"Skipped {batch_skipped:,} invalid records.")
        total_t1 = time.time()

        print(f"In total, skipped {skipped:,} invalid records.")
        print(f"In total, processed {done:,} valid records.")
        print(f"In total, inserted {inserted_total:,} valid records.")

        minutes = (total_t1 - total_t0) / 60
        print(f"The whole migration took {minutes:.1f} minutes.")