def test_bulk_insert(valid_build):
    """Bulk insert collapses equal builds and re-inserting is a no-op."""
    first = valid_build()
    second = valid_build()
    # Two fixture calls must give equal, but distinct, objects.
    assert first == second
    assert first is not second

    inserted, skipped = Build.bulk_insert([first, second])
    assert skipped == 0
    # Equal payloads only produce one row.
    assert inserted == 1
    assert Build.objects.all().count() == 1

    # Make the second build differ from the first, and add a third one
    # that differs from both.
    second["download"]["size"] += 1
    third = valid_build()
    third["download"]["size"] += 2
    inserted, skipped = Build.bulk_insert([first, second, third])
    assert skipped == 0
    assert inserted == 2

    # The rows were written by a single call, yet their created_at
    # values are expected to differ.
    timestamps = [build.created_at for build in Build.objects.all()]
    assert timestamps[0] != timestamps[1]
    assert timestamps[1] != timestamps[2]
    assert Build.objects.all().count() == 3

    # Everything already exists, so nothing new is inserted.
    inserted, skipped = Build.bulk_insert([first, second, third])
    assert skipped == 0
    assert inserted == 0
def test_bulk_insert_invalid(valid_build):
    """One invalid build makes the whole bulk insert raise and insert nothing."""
    good = valid_build()
    broken = valid_build()
    del broken["target"]

    with pytest.raises(ValidationError) as excinfo:
        Build.bulk_insert([good, broken])

    message = str(excinfo.value)
    assert "'target' is a required property" in message
    # The valid build that came first must not have been stored either.
    assert not Build.objects.exists()
def test_bulk_insert_invalid_skip_invalid(valid_build):
    """With skip_invalid=True the invalid build is counted as skipped."""
    good = valid_build()
    broken = valid_build()
    del broken["target"]

    result = Build.bulk_insert([good, broken], skip_invalid=True)
    inserted, skipped = result

    assert inserted == 1
    assert skipped == 1
    # Only the valid build ends up in the database.
    assert Build.objects.count() == 1
def handle(self, *args, **options):
    """Bulk-insert every batch yielded by ``self.iterator(options)``.

    Reads ``options["skip_validation"]`` and ``options["skip_invalid"]``
    and forwards them to ``Build.bulk_insert``. Progress and final totals
    are written with ``print``.

    Raises:
        ImproperlyConfigured: if ``settings.DATABASES`` has no truthy
            ``"kinto"`` entry.
    """
    if not settings.DATABASES.get("kinto"):
        raise ImproperlyConfigured(
            "See configuration documentation about setting up "
            "second the 'kinto' connection.")

    processed = 0
    skip_validation = options["skip_validation"]
    skip_invalid = options["skip_invalid"]
    skipped = 0
    inserted_total = 0
    started = time.time()

    pages = enumerate(self.iterator(options), start=1)
    for page_number, (batch, total_records) in pages:
        # Element 0 of each batch item is the build payload. When
        # skipping invalid ones, payloads without a "build" key are
        # dropped before they ever reach bulk_insert.
        builds = []
        for row in batch:
            payload = row[0]
            if skip_invalid and "build" not in payload:
                continue
            builds.append(payload)
        count = len(builds)
        print(f"Page {page_number} ({count} records)")

        batch_started = time.time()
        inserted, batch_skipped = Build.bulk_insert(
            builds,
            skip_validation=skip_validation,
            skip_invalid=skip_invalid,
            metadata={"kinto-migration": True},
        )
        batch_seconds = time.time() - batch_started

        processed += count
        skipped += batch_skipped
        inserted_total += inserted

        percent = 100 * processed / total_records
        progress = (
            "Inserted {} new out of {} in "
            "{:.2f} seconds. {} of {} ({:.1f}%)"
        ).format(
            f"{inserted:,}",
            f"{count:,}",
            batch_seconds,
            f"{processed:,}",
            f"{total_records:,}",
            percent,
        )
        print(progress)
        if batch_skipped:
            print(f"Skipped {batch_skipped} invalid records.")

    elapsed_minutes = (time.time() - started) / 60
    print(f"In total, skipped {skipped} invalid records.")
    print(f"In total, processed {processed} valid records.")
    print(f"In total, inserted {inserted_total} valid records.")
    print("The whole migration took {:.1f} minutes.".format(elapsed_minutes))
def handle(self, *args, **options):
    """Bulk-insert every page of records yielded by ``self.iterator``.

    When ``options["continue"]`` is truthy the starting URL is read from
    ``self.next_url_log_file``. Otherwise ``options["kinto-url"]`` is
    requested once as a check and the records URL is derived from it.
    ``options["skip_validation"]`` is forwarded to ``Build.bulk_insert``.
    """
    if options["continue"]:
        with open(self.next_url_log_file) as f:
            url = f.read().strip()
        logger.info(f"Continuing with URL {url}")
    else:
        kinto_url = options["kinto-url"]
        # Request the root URL first and check what answers there.
        response = requests.get(kinto_url)
        response.raise_for_status()
        server_info = response.json()
        assert server_info["project_name"] == "kinto", server_info
        # Drop a single trailing slash so the path joins cleanly.
        base_url = kinto_url[:-1] if kinto_url.endswith("/") else kinto_url
        url = (
            f"{base_url}/buckets/build-hub/collections/releases/records"
            "?_limit=10000")

    session = requests.Session()
    processed = 0
    skip_validation = options["skip_validation"]

    pages = enumerate(self.iterator(session, url), start=1)
    for page_number, (batch, total_records) in pages:
        batch_size = len(batch)
        logger.info(f"Page {page_number} ({batch_size} records)")

        # Remove the "id" and "last_modified" keys from every record
        # (in place) before handing them to bulk_insert.
        builds = []
        for record in batch:
            for key in ("id", "last_modified"):
                record.pop(key)
            builds.append(record)
        build_count = len(builds)

        started = time.time()
        inserted, _ = Build.bulk_insert(
            builds,
            skip_validation=skip_validation,
            metadata={"kinto-migration": True},
        )
        seconds = time.time() - started

        metrics.incr("kinto_migrated", value=build_count)
        metrics.incr("kinto_inserted", value=inserted)
        processed += batch_size

        percent = 100 * processed / total_records
        logger.info("Inserted {} new out of {} in "
                    "{:.2f} seconds. {} of {} ({:.1f}%)".format(
                        f"{inserted:,}",
                        f"{build_count:,}",
                        seconds,
                        f"{processed:,}",
                        f"{total_records:,}",
                        percent,
                    ))
def handle(self, *args, **options):
    """Bulk-insert every page of records yielded by ``self.iterator``.

    ``options['kinto-url']`` is requested once as a check and the records
    URL is derived from it. ``options['skip_validation']`` is forwarded
    to ``Build.bulk_insert``. Progress is logged per page.
    """
    # Ping it first
    kinto_url = options['kinto-url']
    r = requests.get(kinto_url)
    r.raise_for_status()
    assert r.json()['project_name'] == 'kinto', r.json()
    # Drop a single trailing slash so the path joins cleanly.
    if kinto_url.endswith('/'):
        kinto_url = kinto_url[:-1]
    url = (f"{kinto_url}/buckets/build-hub/collections/releases/records"
           "?_limit=10000")
    pages = 0
    session = requests.Session()
    done = 0
    skip_validation = options['skip_validation']
    for batch, total_records in self.iterator(session, url):
        logger.info(f"Page {pages + 1} ({len(batch)} records)")
        # Remove the 'id' and 'last_modified' keys from every record
        # (in place) before bulk inserting them.
        builds = []
        for record in batch:
            record.pop('id')
            record.pop('last_modified')
            builds.append(record)
        t0 = time.time()
        # bulk_insert returns an (inserted, skipped) pair. Binding the
        # whole pair to `inserted` would make the format(inserted, ',')
        # call below raise TypeError, so unpack it here.
        inserted, _ = Build.bulk_insert(
            builds,
            skip_validation=skip_validation,
            metadata={'kinto-migration': True},
        )
        t1 = time.time()
        done += len(batch)
        logger.info("Inserted {} new out of {} in "
                    "{:.2f} seconds. {} of {} ({:.1f}%)".format(
                        format(inserted, ','),
                        format(len(builds), ','),
                        t1 - t0,
                        format(done, ','),
                        format(total_records, ','),
                        100 * done / total_records,
                    ))
        pages += 1
def handle(self, *args, **options):
    """Bulk-insert every batch yielded by ``self.iterator(options)``.

    Before iterating, prints the current ``Build`` row count and the
    number of rows in the ``records`` table (on the ``"kinto"``
    connection) matching ``options["parent_id"]`` and
    ``options["collection_id"]``. ``options["skip_validation"]`` and
    ``options["skip_invalid"]`` are forwarded to ``Build.bulk_insert``.
    Progress and final totals are written with ``print``.

    Raises:
        ImproperlyConfigured: if ``settings.DATABASES`` has no truthy
            ``"kinto"`` entry.
    """
    if not settings.DATABASES.get("kinto"):
        raise ImproperlyConfigured(
            "See configuration documentation about setting up "
            "second the 'kinto' connection."
        )

    current_count = Build.objects.all().count()
    print(f"There are currently {current_count:,} in our existing database.")

    with connections["kinto"].cursor() as cursor:
        cursor.execute(
            """
            SELECT COUNT(*)
            FROM records
            WHERE
                parent_id = %s AND collection_id = %s
            """,
            [options["parent_id"], options["collection_id"]],
        )
        (total_records,) = cursor.fetchone()
    print(f"There are currently {total_records:,} in the Kinto database.")

    pages = 0
    done = 0
    skip_validation = options["skip_validation"]
    skip_invalid = options["skip_invalid"]
    skipped = 0
    inserted_total = 0
    total_t0 = time.time()
    for batch in self.iterator(options):
        builds = []
        for row in batch:
            # Element 0 of each batch item is the build payload.
            build = row[0]
            if skip_invalid and "build" not in build:
                continue
            # The one common thing in the old Kinto database is that each
            # build has a key 'schema' which is just a timestamp (integer).
            # Always drop it, whatever its value, so it cannot cause
            # validation errors that are not actually critical.
            build.pop("schema", None)
            builds.append(build)
        count = len(builds)
        print(f"Page {pages + 1} ({count:,} records)")
        t0 = time.time()
        inserted, batch_skipped = Build.bulk_insert(
            builds,
            skip_validation=skip_validation,
            skip_invalid=skip_invalid,
            metadata={"kinto-migration": True},
        )
        t1 = time.time()
        done += count
        skipped += batch_skipped
        inserted_total += inserted
        # Guard the percentage so a zero up-front count cannot abort the
        # migration with ZeroDivisionError after rows were inserted.
        percent = 100 * done / total_records if total_records else 0.0
        print(
            "Inserted {} new out of {} in "
            "{:.2f} seconds. {} of {} ({:.1f}%)".format(
                format(inserted, ","),
                format(count, ","),
                t1 - t0,
                format(done, ","),
                format(total_records, ","),
                percent,
            )
        )
        if batch_skipped:
            print(f"Skipped {batch_skipped:,} invalid records.")
        pages += 1
    total_t1 = time.time()

    print(f"In total, skipped {skipped:,} invalid records.")
    print(f"In total, processed {done:,} valid records.")
    print(f"In total, inserted {inserted_total:,} valid records.")
    print(
        "The whole migration took {:.1f} minutes.".format(
            (total_t1 - total_t0) / 60
        )
    )