def download_and_insert(obj, maybe=False):
    key = obj["Key"]
    with io.BytesIO() as f:
        # 'bucket_name' and 's3_client' are hoisted from the closure.
        s3_client.download_fileobj(bucket_name, key, f)
        # After it has been populated by download_fileobj() we need to
        # rewind it so we can send it to json.load().
        f.seek(0)
        # Before exiting this context (and freeing up the binary data),
        # we turn it into a Python dict.
        build = json.load(f)

    inserted = Build.insert(
        build=build, s3_object_key=obj["Key"], s3_object_etag=obj["ETag"]
    )
    if inserted:
        logger.info(f"New Build inserted from backfill ({key})")
        metrics.incr("backfill_inserted")
    else:
        logger.info(f"Key downloaded but not inserted again ({key})")
        metrics.incr("backfill_not_inserted")

    if maybe and not inserted:
        # If this happens, it means a build with exactly this build_hash
        # already exists but the ETag isn't matching.
        # Update the s3_object_* attributes.
        found = Build.objects.filter(
            s3_object_key=key, build_hash=Build.get_build_hash(build)
        )
        found.update(s3_object_etag=obj["ETag"])

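# Hedged sketch (not the original module's code): download_and_insert() above
# reads 'bucket_name' and 's3_client' from an enclosing scope, which implies it
# is nested inside a backfill routine that pages through the S3 bucket with
# list_objects_v2(), matching the mocked listings in the backfill tests below.
# The function name, its arguments, and the choice to always pass maybe=True
# are assumptions, not the project's actual implementation. Module-level
# imports (boto3, os, io, json) are assumed, as in the snippets above.
def backfill_sketch(bucket_name, region_name=None):
    s3_client = boto3.client("s3", region_name)

    # download_and_insert() from above would be defined here (nested) so it
    # can close over 'bucket_name' and 's3_client'.

    continuation_token = None
    while True:
        kwargs = {"Bucket": bucket_name}
        if continuation_token:
            kwargs["ContinuationToken"] = continuation_token
        response = s3_client.list_objects_v2(**kwargs)
        for obj in response.get("Contents", []):
            if os.path.basename(obj["Key"]).endswith("buildhub.json"):
                download_and_insert(obj, maybe=True)
        continuation_token = response.get("NextContinuationToken")
        if not continuation_token:
            break
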
def test_bulk_insert(valid_build):
    one = valid_build()
    two = valid_build()
    assert one == two
    assert one is not two
    insert_count, skipped = Build.bulk_insert([one, two])
    assert skipped == 0
    # Because they're *equal*
    assert insert_count == 1
    assert Build.objects.all().count() == 1

    two["download"]["size"] += 1
    three = valid_build()
    three["download"]["size"] += 2
    insert_count, skipped = Build.bulk_insert([one, two, three])
    assert skipped == 0
    assert insert_count == 2
    # Even though they're "inserted at the same time", their created_at
    # should be different.
    created_ats = [x.created_at for x in Build.objects.all()]
    assert created_ats[0] != created_ats[1]
    assert created_ats[1] != created_ats[2]

    assert Build.objects.all().count() == 3
    insert_count, skipped = Build.bulk_insert([one, two, three])
    assert skipped == 0
    assert insert_count == 0

def test_bulk_insert_invalid(valid_build):
    one = valid_build()
    two = valid_build()
    two.pop("target")
    with pytest.raises(ValidationError) as exception:
        Build.bulk_insert([one, two])
    assert "'target' is a required property" in str(exception.value)
    # Even if the first one was valid, it won't be inserted.
    assert not Build.objects.exists()

def test_happy_path(valid_build, client, elasticsearch):
    build = valid_build()
    Build.insert(build)
    elasticsearch.flush()

    url = reverse("api:search")
    response = client.get(url)
    assert response.status_code == 200
    result = response.json()
    assert result["hits"]["total"] == 1
    hit, = result["hits"]["hits"]
    assert hit["_source"]["target"]["version"] == build["target"]["version"]

def test_happy_path(valid_build, client, elasticsearch):
    build = valid_build()
    Build.insert(build)
    elasticsearch.flush()

    url = reverse('api:search')
    response = client.get(url)
    assert response.status_code == 200
    result = response.json()
    assert result['hits']['total'] == 1
    hit, = result['hits']['hits']
    assert hit['_source']['target']['version'] == build['target']['version']

def test_insert(settings, valid_build):
    build = valid_build()
    inserted = Build.insert(build)
    assert inserted.build_hash
    assert inserted.build == build
    assert inserted.created_at
    assert inserted.build_hash in repr(inserted)

    # It's idempotent.
    second_time = Build.insert(build)
    assert not second_time
    assert Build.objects.all().count() == 1

def test_happy_path_records(valid_build, client, elasticsearch):
    url = reverse("api:records")
    response = client.get(url)
    assert response.status_code == 200
    result = response.json()
    assert result["builds"]["total"] == 0

    build = valid_build()
    Build.insert(build)

    response = client.get(url)
    assert response.status_code == 200
    result = response.json()
    assert result["builds"]["total"] == 1

def test_insert_invalid(settings, valid_build):
    build = valid_build()
    # We can't completely mess with the schema to the point where it
    # breaks Elasticsearch writes.
    build["source"]["junk"] = True
    with pytest.raises(ValidationError) as exception:
        Build.insert(build)
    err_msg = "Additional properties are not allowed ('junk' was unexpected)"
    assert err_msg in str(exception.value)

    # The 'skip_validation' option is kinda dumb but it exists for when you're
    # super certain that the stuff you're inserting really is valid.
    Build.insert(build, skip_validation=True)

def test_happy_path(valid_build, client, elasticsearch):
    build = valid_build()
    Build.insert(build)
    elasticsearch.flush()

    url = reverse("api:search")
    response = client.get(url)
    assert response.status_code == 200
    result = response.json()
    assert result["hits"]["total"] == 1
    (hit,) = result["hits"]["hits"]
    assert hit["_source"]["target"]["version"] == build["target"]["version"]

    # No CSP header for the API requests since they're always JSON.
    assert not response.has_header("Content-Security-Policy")

def test_rebuild_bigquery_command(
    bigquery_client, bigquery_testing_table, valid_build, settings
):
    """Test that the fixture is created and insertion is successful.

    Note that streaming data into a recreated table does not work in testing
    due to caching (see salting in the bigquery fixture in conftest.py).
    """
    client = bigquery_client
    table = bigquery_testing_table

    # We insert data into the database that predates BigQuery functionality.
    settings.BQ_ENABLED = False
    n_documents = 10
    build = valid_build()
    for i in range(n_documents):
        build["build"]["number"] = i
        inserted = Build.insert(build)
        assert inserted

    settings.BQ_ENABLED = True
    settings.BQ_DATASET_ID = table.dataset_id
    settings.BQ_TABLE_ID = table.table_id
    settings.BQ_REBUILD_MAX_ERROR_COUNT = 0
    # done in 4 chunks
    settings.BQ_REBUILD_CHUNK_SIZE = 3

    call_command("rebuild-bigquery", yes=True)

    table_id = f"{table.dataset_id}.{table.table_id}"
    query = f"SELECT COUNT(*) as n_rows FROM {table_id}"
    print(query)
    job = client.query(query)
    result = list(job.result())[0]
    assert result.n_rows == n_documents

def test_model_serialization(valid_build):
    """Example document:

    ```
    {
        "build_hash": "v1:465552ab2ea1b5039a086987b70c598c",
        "metadata": {
            "version": "Testing"
        },
        "build": { ... },
        "created_at": "2020-01-10T22:46:32.274Z",
        "s3_object_key": "",
        "s3_object_etag": ""
    }
    ```
    """
    build = valid_build()
    inserted = Build.insert(build)
    doc = inserted.to_dict()
    assert set(doc.keys()) == {
        "build_hash",
        "build",
        "metadata",
        "created_at",
        "s3_object_key",
        "s3_object_etag",
    }

def test_insert_writes_to_elasticsearch(settings, elasticsearch, valid_build):
    build = valid_build()
    inserted = Build.insert(build)
    assert inserted

    # Because Elasticsearch is async, the content written won't be there
    # until we wait or flush.
    elasticsearch.flush()

    search = BuildDoc.search()
    response = search.execute()
    assert response.hits.total == 1
    (build_doc,) = response
    assert build_doc.id == inserted.id
    as_dict = build_doc.to_dict()
    as_dict.pop("id")
    # Can't easily compare these because elasticsearch_dsl will convert
    # dates to datetime.datetime objects.
    # But if we convert dates from the Elasticsearch query to a string
    # we can compare.
    as_dict["build"]["date"] = as_dict["build"]["date"].isoformat()[:19]
    as_dict["download"]["date"] = as_dict["download"]["date"].isoformat()[:19]
    build = inserted.build
    build["build"]["date"] = build["build"]["date"][:19]
    build["download"]["date"] = build["download"]["date"][:19]
    assert as_dict == build

def test_ingest_idempotently(
    mocked_boto3, settings, valid_build, itertools_count, mocker
):
    mocked_message = mocker.MagicMock()
    message = {
        "Message": json.dumps(
            {
                "Records": [
                    {
                        "s3": {
                            "object": {
                                "key": "some/path/to/buildhub.json",
                                "eTag": "e4eb6609382efd6b3bc9deec616ad5c0",
                            },
                            "bucket": {"name": "buildhubses"},
                        }
                    }
                ]
            }
        )
    }
    mocked_message.body = json.dumps(message)
    mocked_queue = mocker.MagicMock()
    mocked_queue.receive_messages().__iter__.return_value = [mocked_message]
    mocked_boto3.resource().get_queue_by_name.return_value = mocked_queue
    mocked_s3_client = mocker.MagicMock()
    mocked_boto3.client.return_value = mocked_s3_client

    build = valid_build()
    Build.insert(build)

    def mocked_download_fileobj(bucket_name, key_name, f):
        # Sanity checks that the mocking is right.
        assert bucket_name == "buildhubses"
        assert key_name == "some/path/to/buildhub.json"
        f.write(json.dumps(build).encode("utf-8"))

    mocked_s3_client.download_fileobj.side_effect = mocked_download_fileobj

    start(settings.SQS_QUEUE_URL)

    mocked_boto3.resource().get_queue_by_name.assert_called_with(
        QueueName="buildhub-s3-events"
    )
    # It should have created no new Builds.
    assert Build.objects.all().count() == 1

def test_insert_skips_writes_to_bigquery_when_disabled(
    mocked_bigquery, valid_build, settings
):
    settings.BQ_ENABLED = False
    build = valid_build()
    inserted = Build.insert(build)
    assert inserted
    mocked_bigquery.assert_not_called()

def process_buildhub_json_key(config, s3):
    logger.debug(f"S3 buildhub.json key {s3!r}")
    key_name = s3["object"]["key"]
    assert os.path.basename(key_name).endswith("buildhub.json"), key_name
    bucket_name = s3["bucket"]["name"]

    # We need an S3 connection client to be able to download this one.
    if bucket_name not in config:
        logger.debug("Creating a new BOTO3 S3 CLIENT")
        if settings.UNSIGNED_SQS_S3_CLIENT:
            config[bucket_name] = boto3.client(
                "s3",
                config["region_name"],
                config=Config(signature_version=UNSIGNED),
            )
        else:
            config[bucket_name] = boto3.client("s3", config["region_name"])

    with io.BytesIO() as f:
        try:
            config[bucket_name].download_fileobj(bucket_name, key_name, f)
        except ClientError as exception:
            if exception.response["Error"]["Code"] == "404":
                logger.warning(
                    f"Tried to download {key_name} (in {bucket_name}) "
                    "but not found."
                )
                return
            raise
        # After it has been populated by download_fileobj() we need to
        # rewind it so we can send it to json.load().
        f.seek(0)
        # Before exiting this context (and freeing up the binary data),
        # we turn it into a Python dict.
        build = json.load(f)

    # XXX Needs to deal with how to avoid corrupt buildhub.json S3 keys
    # never leaving the system.
    try:
        inserted = Build.insert(
            build=build,
            s3_object_key=s3["object"]["key"],
            s3_object_etag=s3["object"]["eTag"],
        )
    except ValidationError as exc:
        # We're only doing a try:except ValidationError: here so we get a
        # chance to log a useful message about the S3 object and the
        # validation error message.
        logger.warning(
            "Failed to insert build because the build was not valid. "
            f"S3 key {key_name!r} (bucket {bucket_name!r}). "
            f"Validation error message: {exc.message}"
        )
        raise

    if inserted:
        metrics.incr("sqs_inserted")
        logger.info(
            f"Inserted {key_name} as a valid Build ({inserted.build_hash})"
        )
    else:
        metrics.incr("sqs_not_inserted")
        logger.info(f"Did not insert {key_name} because we already had it")

def test_ingest_idempotently(
    mocked_boto3, settings, valid_build, itertools_count, mocker,
):
    mocked_message = mocker.MagicMock()
    mocked_message.body = json.dumps({
        'Records': [
            {
                's3': {
                    'object': {
                        'key': 'some/path/to/buildhub.json',
                        'eTag': 'e4eb6609382efd6b3bc9deec616ad5c0',
                    },
                    'bucket': {
                        'name': 'buildhubses',
                    },
                }
            },
        ]
    })
    mocked_queue = mocker.MagicMock()
    mocked_queue.receive_messages().__iter__.return_value = [mocked_message]
    mocked_boto3.resource().get_queue_by_name.return_value = mocked_queue
    mocked_s3_client = mocker.MagicMock()
    mocked_boto3.client.return_value = mocked_s3_client

    build = valid_build()
    Build.insert(build)

    def mocked_download_fileobj(bucket_name, key_name, f):
        # Sanity checks that the mocking is right.
        assert bucket_name == 'buildhubses'
        assert key_name == 'some/path/to/buildhub.json'
        f.write(json.dumps(build).encode('utf-8'))

    mocked_s3_client.download_fileobj.side_effect = mocked_download_fileobj

    start(settings.SQS_QUEUE_URL)

    mocked_boto3.resource().get_queue_by_name.assert_called_with(
        QueueName='buildhub-s3-events')
    # It should have created no new Builds.
    assert Build.objects.all().count() == 1

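# Hedged sketch (not the project's actual start() implementation): the two
# ingest tests above exercise a consumer loop that polls an SQS queue and
# hands each S3 record to process_buildhub_json_key() defined earlier. The
# sketch below uses only real boto3 calls (get_queue_by_name(),
# receive_messages(), message.delete()); deriving the queue name from the
# queue URL, the wait time, and the endless loop shape are assumptions.
def consume_queue_sketch(queue_url, region_name=None):
    sqs = boto3.resource("sqs", region_name=region_name)
    # Assumption: the queue name is the last path segment of the queue URL.
    queue = sqs.get_queue_by_name(QueueName=queue_url.rsplit("/", 1)[-1])
    config = {"region_name": region_name}
    while True:
        for message in queue.receive_messages(WaitTimeSeconds=10):
            body = json.loads(message.body)
            # SNS-wrapped notifications nest the S3 event inside a "Message"
            # string; plain S3 notifications carry "Records" at the top level.
            if "Message" in body:
                body = json.loads(body["Message"])
            for record in body.get("Records", []):
                s3 = record.get("s3")
                if s3 and os.path.basename(s3["object"]["key"]).endswith(
                    "buildhub.json"
                ):
                    process_buildhub_json_key(config, s3)
            message.delete()
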
def test_bulk_insert_invalid_skip_invalid(valid_build):
    one = valid_build()
    two = valid_build()
    two.pop("target")
    inserted, skipped = Build.bulk_insert([one, two], skip_invalid=True)
    assert inserted == 1
    assert skipped == 1
    # The first one would be inserted.
    assert Build.objects.count() == 1

def test_search_aggregations(valid_build, json_poster, elasticsearch):
    build = valid_build()
    build["target"]["version"] = "60.0.1"
    Build.insert(build)
    build = valid_build()
    build["target"]["version"] = "60.0.2"
    Build.insert(build)
    build = valid_build()
    build["target"]["version"] = "60.1"
    Build.insert(build)
    elasticsearch.flush()

    search = {
        "aggs": {
            "versions": {
                "filter": {"match_all": {}},
                "aggs": {
                    "target.version": {
                        "terms": {
                            "field": "target.version",
                            "size": 1000,
                            "order": {"_term": "desc"},
                            "include": "6.*",
                        }
                    },
                    "target.version_count": {
                        "cardinality": {"field": "target.version"}
                    },
                },
            }
        },
        "size": 0,
    }
    url = reverse("api:search")
    response = json_poster(url, search)
    assert response.status_code == 200
    result = response.json()
    assert result["hits"]["total"] == 3
    assert not result["hits"]["hits"]  # because only aggregations
    agg_key = "versions"
    buckets = result["aggregations"][agg_key]["target.version"]["buckets"]
    assert buckets == [
        {"key": "60.1", "doc_count": 1},
        {"key": "60.0.2", "doc_count": 1},
        {"key": "60.0.1", "doc_count": 1},
    ]

    # This time filter more.
    search["aggs"][agg_key]["aggs"]["target.version"]["terms"]["include"] = "60.0.*"
    response = json_poster(url, search)
    assert response.status_code == 200
    result = response.json()
    buckets = result["aggregations"][agg_key]["target.version"]["buckets"]
    assert buckets == [
        {"key": "60.0.2", "doc_count": 1},
        {"key": "60.0.1", "doc_count": 1},
    ]

def handle(self, *args, **options):
    # verbose = options["verbosity"] > 1
    if not settings.DATABASES.get("kinto"):
        raise ImproperlyConfigured(
            "See configuration documentation about setting up "
            "the second 'kinto' connection."
        )
    pages = 0
    done = 0
    skip_validation = options["skip_validation"]
    skip_invalid = options["skip_invalid"]
    skipped = 0
    inserted_total = 0
    total_t0 = time.time()
    for batch, total_records in self.iterator(options):
        builds = [x[0] for x in batch if not skip_invalid or "build" in x[0]]
        count = len(builds)
        print(f"Page {pages + 1} ({count} records)")
        t0 = time.time()
        inserted, batch_skipped = Build.bulk_insert(
            builds,
            skip_validation=skip_validation,
            skip_invalid=skip_invalid,
            metadata={"kinto-migration": True},
        )
        t1 = time.time()
        done += count
        skipped += batch_skipped
        inserted_total += inserted
        print(
            "Inserted {} new out of {} in "
            "{:.2f} seconds. {} of {} ({:.1f}%)".format(
                format(inserted, ","),
                format(count, ","),
                t1 - t0,
                format(done, ","),
                format(total_records, ","),
                100 * done / total_records,
            )
        )
        if batch_skipped:
            print(f"Skipped {batch_skipped} invalid records.")
        pages += 1
    total_t1 = time.time()
    print(f"In total, skipped {skipped} invalid records.")
    print(f"In total, processed {done} valid records.")
    print(f"In total, inserted {inserted_total} valid records.")
    print(
        "The whole migration took {:.1f} minutes.".format(
            (total_t1 - total_t0) / 60
        )
    )

def handle(self, *args, **options):
    if options["continue"]:
        with open(self.next_url_log_file) as f:
            url = f.read().strip()
            logger.info(f"Continuing with URL {url}")
    else:
        # Ping it first.
        kinto_url = options["kinto-url"]
        r = requests.get(kinto_url)
        r.raise_for_status()
        assert r.json()["project_name"] == "kinto", r.json()
        if kinto_url.endswith("/"):
            kinto_url = kinto_url[:-1]
        url = (
            f"{kinto_url}/buckets/build-hub/collections/releases/records"
            "?_limit=10000"
        )

    pages = 0
    session = requests.Session()
    done = 0
    skip_validation = options["skip_validation"]
    for batch, total_records in self.iterator(session, url):
        logger.info(f"Page {pages + 1} ({len(batch)} records)")
        # Now let's bulk insert these.
        builds = []
        for record in batch:
            record.pop("id")
            record.pop("last_modified")
            builds.append(record)
        # Skip validation most of the time.
        t0 = time.time()
        inserted, _ = Build.bulk_insert(
            builds,
            skip_validation=skip_validation,
            metadata={"kinto-migration": True},
        )
        t1 = time.time()
        metrics.incr("kinto_migrated", value=len(builds))
        metrics.incr("kinto_inserted", value=inserted)
        done += len(batch)
        logger.info(
            "Inserted {} new out of {} in "
            "{:.2f} seconds. {} of {} ({:.1f}%)".format(
                format(inserted, ","),
                format(len(builds), ","),
                t1 - t0,
                format(done, ","),
                format(total_records, ","),
                100 * done / total_records,
            )
        )
        pages += 1

def process_buildhub_json_key(config, s3):
    logger.debug(f"S3 buildhub.json key {s3!r}")
    key_name = s3['object']['key']
    assert os.path.basename(key_name) == 'buildhub.json', key_name
    bucket_name = s3['bucket']['name']

    # We need an S3 connection client to be able to download this one.
    if bucket_name not in config:
        logger.debug('Creating a new BOTO3 S3 CLIENT')
        config[bucket_name] = boto3.client('s3', config['region_name'])

    with io.BytesIO() as f:
        try:
            config[bucket_name].download_fileobj(bucket_name, key_name, f)
        except ClientError as exception:
            if exception.response['Error']['Code'] == '404':
                logger.warning(
                    f"Tried to download {key_name} (in {bucket_name}) "
                    "but not found."
                )
                return
            raise
        # After it has been populated by download_fileobj() we need to
        # rewind it so we can send it to json.load().
        f.seek(0)
        # Before exiting this context (and freeing up the binary data),
        # we turn it into a Python dict.
        build = json.load(f)

    # XXX Needs to deal with how to avoid corrupt buildhub.json S3 keys
    # never leaving the system.
    try:
        inserted = Build.insert(
            build=build,
            s3_object_key=s3['object']['key'],
            s3_object_etag=s3['object']['eTag'],
        )
    except ValidationError as exc:
        # We're only doing a try:except ValidationError: here so we get a
        # chance to log a useful message about the S3 object and the
        # validation error message.
        logger.warning(
            "Failed to insert build because the build was not valid. "
            f"S3 key {key_name!r} (bucket {bucket_name!r}). "
            f"Validation error message: {exc.message}"
        )
        raise

    if inserted:
        logger.info(
            f"Inserted {key_name} as a valid Build ({inserted.build_hash})"
        )
    else:
        logger.info(f"Did not insert {key_name} because we already had it")

def test_serialized_instance_inserts_into_bigquery(
    bigquery_client, bigquery_testing_table, valid_build, settings
):
    """Test that the fixture is created and insertion is successful."""
    # This test does not rely on auto-insertion.
    settings.BQ_ENABLED = False
    client = bigquery_client
    table = bigquery_testing_table

    doc = Build.insert(valid_build()).to_dict()
    errors = client.insert_rows(table, [doc])
    assert errors == []

    table_id = f"{table.dataset_id}.{table.table_id}"
    job = client.query(f"SELECT COUNT(*) as n_rows FROM {table_id}")
    result = list(job.result())[0]
    assert result.n_rows == 1

def test_insert_writes_to_bigquery_when_enabled(
    mocked_bigquery, valid_build, settings, mocker
):
    mocked_client = mocker.MagicMock()
    mocked_bigquery.Client.return_value = mocked_client
    settings.BQ_ENABLED = True
    build = valid_build()
    inserted = Build.insert(build)
    assert inserted
    mocked_client.insert_rows.assert_called_once()
    args = mocked_client.insert_rows.call_args
    # insert_rows takes a (table, documents) positional pair.
    documents = args[0][1]
    assert len(documents) == 1
    assert documents[0]["build_hash"] == inserted.build_hash

def test_insert_writes_to_bigquery(
    bigquery_client, bigquery_testing_table, valid_build, settings
):
    """Test that the fixture is created and insertion is successful."""
    client = bigquery_client
    table = bigquery_testing_table

    # Mock settings to ensure the callback sends data to the right place.
    settings.BQ_DATASET_ID = table.dataset_id
    settings.BQ_TABLE_ID = table.table_id

    build = valid_build()
    inserted = Build.insert(build)
    assert inserted

    table_id = f"{table.dataset_id}.{table.table_id}"
    job = client.query(f"SELECT COUNT(*) as n_rows FROM {table_id}")
    result = list(job.result())[0]
    assert result.n_rows == 1

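# Hedged sketch (not the project's actual code): the surrounding BigQuery
# tests suggest that, when BQ_ENABLED is set, an insert hook streams
# Build.to_dict() documents into the table named by BQ_DATASET_ID/BQ_TABLE_ID
# via google-cloud-bigquery's insert_rows(). A minimal sketch under that
# assumption; the helper name and the error handling are hypothetical, but the
# client methods are real.
from google.cloud import bigquery


def submit_to_bigquery_sketch(documents, dataset_id, table_id):
    client = bigquery.Client()
    # get_table() fetches the table schema so insert_rows() can serialize
    # plain dicts into rows.
    table = client.get_table(f"{client.project}.{dataset_id}.{table_id}")
    errors = client.insert_rows(table, documents)
    if errors:
        # Streaming inserts report per-row errors instead of raising.
        raise RuntimeError(f"BigQuery insert errors: {errors}")
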
def handle(self, *args, **options):
    # Ping it first.
    kinto_url = options['kinto-url']
    r = requests.get(kinto_url)
    r.raise_for_status()
    assert r.json()['project_name'] == 'kinto', r.json()
    if kinto_url.endswith('/'):
        kinto_url = kinto_url[:-1]
    url = (
        f"{kinto_url}/buckets/build-hub/collections/releases/records"
        "?_limit=10000"
    )

    pages = 0
    session = requests.Session()
    done = 0
    skip_validation = options['skip_validation']
    for batch, total_records in self.iterator(session, url):
        logger.info(f"Page {pages + 1} ({len(batch)} records)")
        # Now let's bulk insert these.
        builds = []
        for record in batch:
            record.pop('id')
            record.pop('last_modified')
            builds.append(record)
        # Skip validation most of the time.
        t0 = time.time()
        inserted = Build.bulk_insert(
            builds,
            skip_validation=skip_validation,
            metadata={'kinto-migration': True},
        )
        t1 = time.time()
        done += len(batch)
        logger.info(
            "Inserted {} new out of {} in "
            "{:.2f} seconds. {} of {} ({:.1f}%)".format(
                format(inserted, ','),
                format(len(builds), ','),
                t1 - t0,
                format(done, ','),
                format(total_records, ','),
                100 * done / total_records,
            )
        )
        pages += 1

def handle(self, *args, **options):
    # verbose = options["verbosity"] > 1
    if not settings.DATABASES.get("kinto"):
        raise ImproperlyConfigured(
            "See configuration documentation about setting up "
            "the second 'kinto' connection."
        )
    current_count = Build.objects.all().count()
    print(f"There are currently {current_count:,} records in our existing database.")
    with connections["kinto"].cursor() as cursor:
        cursor.execute(
            """
            SELECT COUNT(*) FROM records
            WHERE parent_id = %s AND collection_id = %s
            """,
            [options["parent_id"], options["collection_id"]],
        )
        (total_records,) = cursor.fetchone()
    print(f"There are currently {total_records:,} records in the Kinto database.")

    pages = 0
    done = 0
    skip_validation = options["skip_validation"]
    skip_invalid = options["skip_invalid"]
    skipped = 0
    inserted_total = 0
    total_t0 = time.time()
    for batch in self.iterator(options):
        # builds = [x[0] for x in batch if not skip_invalid or "build" in x[0]]
        builds = []
        for build in batch:
            if not skip_invalid or "build" in build[0]:
                if build[0].get("schema"):
                    # The one common thing in the old Kinto database is that
                    # each build has a key 'schema' which is just a timestamp
                    # (integer). Just pop it out so as to not get validation
                    # errors that are not actually critical.
                    build[0].pop("schema")
                builds.append(build[0])
        count = len(builds)
        print(f"Page {pages + 1} ({count:,} records)")
        t0 = time.time()
        inserted, batch_skipped = Build.bulk_insert(
            builds,
            skip_validation=skip_validation,
            skip_invalid=skip_invalid,
            metadata={"kinto-migration": True},
        )
        t1 = time.time()
        done += count
        skipped += batch_skipped
        inserted_total += inserted
        print(
            "Inserted {} new out of {} in "
            "{:.2f} seconds. {} of {} ({:.1f}%)".format(
                format(inserted, ","),
                format(count, ","),
                t1 - t0,
                format(done, ","),
                format(total_records, ","),
                100 * done / total_records,
            )
        )
        if batch_skipped:
            print(f"Skipped {batch_skipped:,} invalid records.")
        pages += 1
    total_t1 = time.time()
    print(f"In total, skipped {skipped:,} invalid records.")
    print(f"In total, processed {done:,} valid records.")
    print(f"In total, inserted {inserted_total:,} valid records.")
    print(
        "The whole migration took {:.1f} minutes.".format(
            (total_t1 - total_t0) / 60
        )
    )

def test_search_aggregations(valid_build, json_poster, elasticsearch):
    build = valid_build()
    build['target']['version'] = '60.0.1'
    Build.insert(build)
    build = valid_build()
    build['target']['version'] = '60.0.2'
    Build.insert(build)
    build = valid_build()
    build['target']['version'] = '60.1'
    Build.insert(build)
    elasticsearch.flush()

    search = {
        "aggs": {
            "versions": {
                "filter": {"match_all": {}},
                "aggs": {
                    "target.version": {
                        "terms": {
                            "field": "target.version",
                            "size": 1000,
                            "order": {"_term": "desc"},
                            "include": "6.*",
                        }
                    },
                    "target.version_count": {
                        "cardinality": {"field": "target.version"}
                    },
                },
            }
        },
        "size": 0,
    }
    url = reverse('api:search')
    response = json_poster(url, search)
    assert response.status_code == 200
    result = response.json()
    assert result['hits']['total'] == 3
    assert not result['hits']['hits']  # because only aggregations
    agg_key = 'versions'
    buckets = result['aggregations'][agg_key]['target.version']['buckets']
    assert buckets == [
        {'key': '60.1', 'doc_count': 1},
        {'key': '60.0.2', 'doc_count': 1},
        {'key': '60.0.1', 'doc_count': 1},
    ]

    # This time filter more.
    search['aggs'][agg_key]['aggs']['target.version']['terms'][
        'include'] = r'60\.0.*'
    response = json_poster(url, search)
    assert response.status_code == 200
    result = response.json()
    buckets = result['aggregations'][agg_key]['target.version']['buckets']
    assert buckets == [
        {'key': '60.0.2', 'doc_count': 1},
        {'key': '60.0.1', 'doc_count': 1},
    ]

def test_backfill_happy_path(
    mocked_boto3, settings, valid_build, itertools_count, mocker,
):
    # Create a ready build that is *exactly* like our mocked S3 thing is.
    build = valid_build()
    build['download']['mimetype'] = 'one/buildhub.json'
    Build.insert(
        build=build,
        s3_object_key='one/buildhub.json',
        s3_object_etag='abc123',
    )
    # Create one build that has the same build_hash as the second mocked
    # key but make the s3_object_etag mismatch.
    build = valid_build()
    build['download']['mimetype'] = 'two/buildhub.json'
    Build.insert(
        build=build,
        s3_object_key='two/buildhub.json',
        s3_object_etag='somethingdifferent',
    )

    mocked_s3_client = mocker.MagicMock()
    mocked_boto3.client.return_value = mocked_s3_client

    def mocked_download_fileobj(bucket_name, key_name, f):
        assert bucket_name == 'buildhubses'
        build = valid_build()
        # Just need to mess with the build a little bit so that it's
        # still valid to the schema but makes a different build_hash.
        if key_name == 'two/buildhub.json':
            build['download']['mimetype'] = key_name
        elif key_name == 'three/buildhub.json':
            build['download']['mimetype'] = key_name
        else:
            raise NotImplementedError(key_name)
        f.write(json.dumps(build).encode('utf-8'))

    mocked_s3_client.download_fileobj.side_effect = mocked_download_fileobj

    def mocked_list_objects(**kwargs):
        if kwargs.get('ContinuationToken'):
            # you're on page 2
            return {
                'Contents': [
                    {
                        'Key': 'three/buildhub.json',
                        'ETag': 'ghi345',
                    },
                ]
            }
        else:
            return {
                'Contents': [
                    {
                        'Key': 'one/buildhub.json',
                        'ETag': 'abc123',
                    },
                    {
                        'Key': 'two/buildhub.json',
                        'ETag': 'def234',
                    },
                ],
                'NextContinuationToken': 'nextpageplease',
            }

    mocked_s3_client.list_objects_v2.side_effect = mocked_list_objects

    backfill(settings.S3_BUCKET_URL)

    # We had 2 before, this should have created 1 new and edited 1.
    assert Build.objects.all().count() == 3
    # The second one should have had its etag updated.
    assert not Build.objects.filter(
        s3_object_key='two/buildhub.json',
        s3_object_etag='somethingdifferent',
    )
    assert Build.objects.get(
        s3_object_key='two/buildhub.json',
        s3_object_etag='def234',
    )

def test_backfill_happy_path(
    mocked_boto3, settings, valid_build, itertools_count, mocker
):
    # Create a ready build that is *exactly* like our mocked S3 thing is.
    build = valid_build()
    build["download"]["mimetype"] = "one/buildhub.json"
    Build.insert(
        build=build, s3_object_key="one/buildhub.json", s3_object_etag="abc123"
    )
    # Create one build that has the same build_hash as the second mocked
    # key but make the s3_object_etag mismatch.
    build = valid_build()
    build["download"]["mimetype"] = "two/buildhub.json"
    Build.insert(
        build=build,
        s3_object_key="two/buildhub.json",
        s3_object_etag="somethingdifferent",
    )

    mocked_s3_client = mocker.MagicMock()
    mocked_boto3.client.return_value = mocked_s3_client

    def mocked_download_fileobj(bucket_name, key_name, f):
        assert bucket_name == "buildhubses"
        build = valid_build()
        # Just need to mess with the build a little bit so that it's
        # still valid to the schema but makes a different build_hash.
        if key_name == "two/buildhub.json":
            build["download"]["mimetype"] = key_name
        elif key_name == "three/buildhub.json":
            build["download"]["mimetype"] = key_name
        elif key_name == "three/Firefox-99-buildhub.json":
            build["download"]["mimetype"] = key_name
        else:
            raise NotImplementedError(key_name)
        f.write(json.dumps(build).encode("utf-8"))

    mocked_s3_client.download_fileobj.side_effect = mocked_download_fileobj

    def mocked_list_objects(**kwargs):
        if kwargs.get("ContinuationToken"):
            # you're on page 2
            return {
                "Contents": [{"Key": "three/buildhub.json", "ETag": "ghi345"}]
            }
        else:
            return {
                "Contents": [
                    {"Key": "one/buildhub.json", "ETag": "abc123"},
                    {"Key": "two/buildhub.json", "ETag": "def234"},
                    {"Key": "three/Firefox-99-buildhub.json", "ETag": "xyz987"},
                ],
                "NextContinuationToken": "nextpageplease",
            }

    mocked_s3_client.list_objects_v2.side_effect = mocked_list_objects

    backfill(settings.S3_BUCKET_URL)

    # We had 2 before, this should have created 2 new and edited 1.
    assert Build.objects.all().count() == 4
    # The second one should have had its etag updated.
    assert not Build.objects.filter(
        s3_object_key="two/buildhub.json", s3_object_etag="somethingdifferent"
    )
    assert Build.objects.get(
        s3_object_key="two/buildhub.json", s3_object_etag="def234"
    )
    assert Build.objects.get(
        s3_object_key="three/Firefox-99-buildhub.json", s3_object_etag="xyz987"
    )

def process_buildhub_json_key(config, s3):
    logger.debug(f"S3 buildhub.json key {s3!r}")
    key_name = s3["object"]["key"]
    assert os.path.basename(key_name).endswith("buildhub.json"), key_name
    bucket_name = s3["bucket"]["name"]

    # We need an S3 connection client to be able to download this one.
    if bucket_name not in config:
        logger.debug("Creating a new BOTO3 S3 CLIENT")
        connection_config = None
        if settings.UNSIGNED_S3_CLIENT:
            connection_config = Config(signature_version=UNSIGNED)
        config[bucket_name] = boto3.client(
            "s3", config["region_name"], config=connection_config
        )

    with io.BytesIO() as f:
        try:
            config[bucket_name].download_fileobj(bucket_name, key_name, f)
        except ClientError as exception:
            if exception.response["Error"]["Code"] == "404":
                logger.warning(
                    f"Tried to download {key_name} (in {bucket_name}) "
                    "but not found."
                )
                return
            raise
        # After it has been populated by download_fileobj() we need to
        # rewind it so we can send it to json.load().
        f.seek(0)
        # Before exiting this context (and freeing up the binary data),
        # we turn it into a Python dict.
        build = json.load(f)

    # XXX Needs to deal with how to avoid corrupt buildhub.json S3 keys
    # never leaving the system.
    inserted = []
    try:
        ret = Build.insert(
            build=build,
            s3_object_key=s3["object"]["key"],
            s3_object_etag=s3["object"]["eTag"],
        )
        inserted.append(ret)

        # This is a hack to fix https://bugzilla.mozilla.org/show_bug.cgi?id=1470948
        # In some future world we might be able to architect buildhub in such a
        # way that this sort of transformation isn't buried down deep in the code.
        if (
            build["source"]["product"] == "firefox"
            and build["target"]["channel"] == "release"
        ):
            beta_build = deepcopy(build)
            beta_build["target"]["channel"] = "beta"
            ret = Build.insert(
                build=beta_build,
                s3_object_key=s3["object"]["key"],
                s3_object_etag=s3["object"]["eTag"],
            )
            inserted.append(ret)
    except ValidationError as exc:
        # We're only doing a try:except ValidationError: here so we get a
        # chance to log a useful message about the S3 object and the
        # validation error message.
        logger.warning(
            "Failed to insert build because the build was not valid. "
            f"S3 key {key_name!r} (bucket {bucket_name!r}). "
            f"Validation error message: {exc.message}"
        )
        raise

    # Build.insert() above can return None (for Builds that already exist).
    # If anything was _actually_ inserted, log it.
    if any(inserted):
        for i in inserted:
            if not i:
                # Skip the ones that already existed; only count and log
                # actual inserts.
                continue
            metrics.incr("sqs_inserted")
            logger.info(
                f"Inserted {key_name} as a valid Build ({i.build_hash})"
            )
    else:
        metrics.incr("sqs_not_inserted")
        logger.info(f"Did not insert {key_name} because we already had it")