Example #1
def download_and_insert(obj, maybe=False):
    key = obj["Key"]
    with io.BytesIO() as f:
        # 'bucket_name' and 's3_client' are hoisted from the closure
        s3_client.download_fileobj(bucket_name, key, f)
        # After it has been populated by download_fileobj() we need to
        # rewind it so we can send it to json.load().
        f.seek(0)
        # Before exiting this context (and freeing up the binary data),
        # we turn it into a Python dict.
        build = json.load(f)
    inserted = Build.insert(
        build=build, s3_object_key=obj["Key"], s3_object_etag=obj["ETag"]
    )
    if inserted:
        logger.info(f"New Build inserted from backfill ({key})")
        metrics.incr("backfill_inserted")
    else:
        logger.info(f"Key downloaded but not inserted again ({key})")
        metrics.incr("backfill_not_inserted")
    if maybe and not inserted:
        # If this happens, it means a build with exactly this build_hash
        # already exists but its ETag doesn't match.
        # Update the s3_object_* attributes.
        found = Build.objects.filter(
            s3_object_key=key, build_hash=Build.get_build_hash(build)
        )
        found.update(s3_object_etag=obj["ETag"])
Example #2
def test_bulk_insert(valid_build):
    one = valid_build()
    two = valid_build()
    assert one == two
    assert one is not two
    insert_count, skipped = Build.bulk_insert([one, two])
    assert skipped == 0
    # Because they're *equal*
    assert insert_count == 1
    assert Build.objects.all().count() == 1

    two["download"]["size"] += 1
    three = valid_build()
    three["download"]["size"] += 2
    insert_count, skipped = Build.bulk_insert([one, two, three])
    assert skipped == 0
    assert insert_count == 2
    # Even though they're "inserted at the same time", their created_at
    # should be different.
    created_ats = [x.created_at for x in Build.objects.all()]
    assert created_ats[0] != created_ats[1]
    assert created_ats[1] != created_ats[2]
    assert Build.objects.all().count() == 3

    insert_count, skipped = Build.bulk_insert([one, two, three])
    assert skipped == 0
    assert insert_count == 0
Example #3
def test_bulk_insert_invalid(valid_build):
    one = valid_build()
    two = valid_build()
    two.pop("target")
    with pytest.raises(ValidationError) as exception:
        Build.bulk_insert([one, two])
    assert "'target' is a required property" in str(exception.value)
    # Even if the first one was valid, it won't be inserted.
    assert not Build.objects.exists()
Example #4
def test_happy_path(valid_build, client, elasticsearch):
    build = valid_build()
    Build.insert(build)
    elasticsearch.flush()

    url = reverse("api:search")
    response = client.get(url)
    assert response.status_code == 200
    result = response.json()
    assert result["hits"]["total"] == 1
    hit, = result["hits"]["hits"]
    assert hit["_source"]["target"]["version"] == build["target"]["version"]
Example #5
def test_happy_path(valid_build, client, elasticsearch):
    build = valid_build()
    Build.insert(build)
    elasticsearch.flush()

    url = reverse('api:search')
    response = client.get(url)
    assert response.status_code == 200
    result = response.json()
    assert result['hits']['total'] == 1
    hit, = result['hits']['hits']
    assert hit['_source']['target']['version'] == build['target']['version']
Example #6
def test_insert(settings, valid_build):
    build = valid_build()
    inserted = Build.insert(build)
    assert inserted.build_hash
    assert inserted.build == build
    assert inserted.created_at
    assert inserted.build_hash in repr(inserted)

    # It's idempotent.
    second_time = Build.insert(build)
    assert not second_time
    assert Build.objects.all().count() == 1
Example #7
def test_happy_path_records(valid_build, client, elasticsearch):
    url = reverse("api:records")
    response = client.get(url)
    assert response.status_code == 200
    result = response.json()
    assert result["builds"]["total"] == 0

    build = valid_build()
    Build.insert(build)
    response = client.get(url)
    assert response.status_code == 200
    result = response.json()
    assert result["builds"]["total"] == 1
def test_insert_invalid(settings, valid_build):
    build = valid_build()
    # We can't completely mess with the schema to the point where it
    # breaks Elasticsearch writes.
    build["source"]["junk"] = True
    with pytest.raises(ValidationError) as exception:
        Build.insert(build)
    err_msg = "Additional properties are not allowed ('junk' was unexpected)"
    assert err_msg in str(exception.value)

    # The 'skip_validation' option is kinda blunt, but it exists for when you're
    # super certain that the stuff you're inserting really is valid.
    Build.insert(build, skip_validation=True)
Example #9
def test_happy_path(valid_build, client, elasticsearch):
    build = valid_build()
    Build.insert(build)
    elasticsearch.flush()

    url = reverse("api:search")
    response = client.get(url)
    assert response.status_code == 200
    result = response.json()
    assert result["hits"]["total"] == 1
    (hit, ) = result["hits"]["hits"]
    assert hit["_source"]["target"]["version"] == build["target"]["version"]

    # No CSP header for the API requests since they're always JSON.
    assert not response.has_header("Content-Security-Policy")
Example #10
def test_rebuild_bigquery_command(bigquery_client, bigquery_testing_table,
                                  valid_build, settings):
    """Test that the fixture is created and insertion is successful.

    Note that streaming data into a recreated table does not work in testing due
    to caching (see salting in the bigquery fixture in conftest.py).
    """
    client = bigquery_client
    table = bigquery_testing_table

    # We insert data into the database as if it predates the BigQuery functionality
    settings.BQ_ENABLED = False
    n_documents = 10
    build = valid_build()
    for i in range(n_documents):
        build["build"]["number"] = i
        inserted = Build.insert(build)
        assert inserted

    settings.BQ_ENABLED = True
    settings.BQ_DATASET_ID = table.dataset_id
    settings.BQ_TABLE_ID = table.table_id
    settings.BQ_REBUILD_MAX_ERROR_COUNT = 0
    # 10 documents with a chunk size of 3 means the rebuild runs in 4 chunks
    settings.BQ_REBUILD_CHUNK_SIZE = 3

    call_command("rebuild-bigquery", yes=True)

    table_id = f"{table.dataset_id}.{table.table_id}"
    query = f"SELECT COUNT(*) as n_rows FROM {table_id}"
    print(query)
    job = client.query(query)
    result = list(job.result())[0]
    assert result.n_rows == n_documents
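The docstring above refers to salting in the bigquery fixture; below is a minimal sketch of what such a salted table fixture could look like. The fixture name, project and dataset names, and the one-column schema are placeholders, not the project's actual conftest.py.

import uuid

import pytest
from google.cloud import bigquery


@pytest.fixture
def bigquery_testing_table_sketch(bigquery_client):
    # Salt the table name so every test run streams into a brand-new table.
    # Streaming inserts into a table that was just deleted and recreated
    # under the same name can be dropped because of server-side caching.
    table_id = f"test-project.test_dataset.builds_{uuid.uuid4().hex[:8]}"
    schema = [bigquery.SchemaField("build_hash", "STRING")]  # placeholder
    table = bigquery_client.create_table(bigquery.Table(table_id, schema=schema))
    yield table
    bigquery_client.delete_table(table, not_found_ok=True)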
Example #11
def test_model_serialization(valid_build):
    """Example document:
    ```
    {
        "build_hash": "v1:465552ab2ea1b5039a086987b70c598c",
        "metadata": {
            "version": "Testing"
        },
        "build": {
            ...
        },
        "created_at": "2020-01-10T22:46:32.274Z",
        "s3_object_key": "",
        "s3_object_etag": ""
    }
    ```
    """
    build = valid_build()
    inserted = Build.insert(build)
    doc = inserted.to_dict()
    assert set(doc.keys()) == {
        "build_hash",
        "build",
        "metadata",
        "created_at",
        "s3_object_key",
        "s3_object_etag",
    }
Example #12
def test_insert_writes_to_elasticsearch(settings, elasticsearch, valid_build):
    build = valid_build()
    inserted = Build.insert(build)
    assert inserted

    # Because Elasticsearch is async, the content written won't be there
    # until we wait or flush.
    elasticsearch.flush()
    search = BuildDoc.search()
    response = search.execute()
    assert response.hits.total == 1
    (build_doc, ) = response
    assert build_doc.id == inserted.id
    as_dict = build_doc.to_dict()
    as_dict.pop("id")
    # Can't easily compare these because elasticsearch_dsl will convert
    # dates to datetime.datetime objects.
    # But if we convert dates from the Elasticsearch query to a string
    # we can compare.
    as_dict["build"]["date"] = as_dict["build"]["date"].isoformat()[:19]
    as_dict["download"]["date"] = as_dict["download"]["date"].isoformat()[:19]
    build = inserted.build
    build["build"]["date"] = build["build"]["date"][:19]
    build["download"]["date"] = build["download"]["date"][:19]
    assert as_dict == build
Example #13
def test_ingest_idempotently(
    mocked_boto3, settings, valid_build, itertools_count, mocker
):
    mocked_message = mocker.MagicMock()
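    # The S3 event notification is nested, as a JSON string, under the
    # "Message" key of the SQS message body.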
    message = {
        "Message": json.dumps(
            {
                "Records": [
                    {
                        "s3": {
                            "object": {
                                "key": "some/path/to/buildhub.json",
                                "eTag": "e4eb6609382efd6b3bc9deec616ad5c0",
                            },
                            "bucket": {"name": "buildhubses"},
                        }
                    }
                ]
            }
        )
    }

    mocked_message.body = json.dumps(message)
    mocked_queue = mocker.MagicMock()
    mocked_queue.receive_messages().__iter__.return_value = [mocked_message]
    mocked_boto3.resource().get_queue_by_name.return_value = mocked_queue

    mocked_s3_client = mocker.MagicMock()
    mocked_boto3.client.return_value = mocked_s3_client

    build = valid_build()
    Build.insert(build)

    def mocked_download_fileobj(bucket_name, key_name, f):
        # Sanity checks that the mocking is right
        assert bucket_name == "buildhubses"
        assert key_name == "some/path/to/buildhub.json"
        f.write(json.dumps(build).encode("utf-8"))

    mocked_s3_client.download_fileobj.side_effect = mocked_download_fileobj
    start(settings.SQS_QUEUE_URL)
    mocked_boto3.resource().get_queue_by_name.assert_called_with(
        QueueName="buildhub-s3-events"
    )
    # It should have created no new Builds
    assert Build.objects.all().count() == 1
Example #14
def test_insert_skips_writes_to_bigquery_when_disabled(mocked_bigquery,
                                                       valid_build, settings):
    settings.BQ_ENABLED = False
    build = valid_build()
    inserted = Build.insert(build)
    assert inserted

    mocked_bigquery.assert_not_called()
Example #15
def process_buildhub_json_key(config, s3):
    logger.debug(f"S3 buildhub.json key {s3!r}")
    key_name = s3["object"]["key"]
    assert os.path.basename(key_name).endswith("buildhub.json"), key_name
    bucket_name = s3["bucket"]["name"]
    # We need an S3 client to be able to download this object.
    if bucket_name not in config:
        logger.debug("Creating a new BOTO3 S3 CLIENT")
        if settings.UNSIGNED_SQS_S3_CLIENT:
            config[bucket_name] = boto3.client(
                "s3",
                config["region_name"],
                config=Config(signature_version=UNSIGNED))
        else:
            config[bucket_name] = boto3.client("s3", config["region_name"])

    with io.BytesIO() as f:
        try:
            config[bucket_name].download_fileobj(bucket_name, key_name, f)
        except ClientError as exception:
            if exception.response["Error"]["Code"] == "404":
                logger.warning(
                    f"Tried to download {key_name} (in {bucket_name}) "
                    "but not found.")
                return
            raise

        # After it has been populated by download_fileobj() we need to
        # rewind it so we can send it to json.load().
        f.seek(0)
        # Before exiting this context (and freeing up the binary data),
        # we turn it into a Python dict.
        build = json.load(f)

    # XXX Needs a strategy for corrupt buildhub.json S3 keys, which would
    # otherwise never leave the system.
    try:
        inserted = Build.insert(
            build=build,
            s3_object_key=s3["object"]["key"],
            s3_object_etag=s3["object"]["eTag"],
        )
    except ValidationError as exc:
        # We're only doing a try:except ValidationError: here so we get a
        # chance to log a useful message about the S3 object and the
        # validation error message.
        logger.warning(
            "Failed to insert build because the build was not valid. "
            f"S3 key {key_name!r} (bucket {bucket_name!r}). "
            f"Validation error message: {exc.message}")
        raise
    if inserted:
        metrics.incr("sqs_inserted")
        logger.info(
            f"Inserted {key_name} as a valid Build ({inserted.build_hash})")
    else:
        metrics.incr("sqs_not_inserted")
        logger.info(f"Did not insert {key_name} because we already had it")
Example #16
def test_ingest_idempotently(
    mocked_boto3,
    settings,
    valid_build,
    itertools_count,
    mocker,
):
    mocked_message = mocker.MagicMock()
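    # Unlike the variant above, the SQS message body here is the bare S3
    # event payload (a top-level 'Records' list) with no envelope around it.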
    mocked_message.body = json.dumps({
        'Records': [
            {
                's3': {
                    'object': {
                        'key': 'some/path/to/buildhub.json',
                        'eTag': 'e4eb6609382efd6b3bc9deec616ad5c0',
                    },
                    'bucket': {
                        'name': 'buildhubses',
                    }
                }
            },
        ]
    })
    mocked_queue = mocker.MagicMock()
    mocked_queue.receive_messages().__iter__.return_value = [mocked_message]
    mocked_boto3.resource().get_queue_by_name.return_value = mocked_queue

    mocked_s3_client = mocker.MagicMock()
    mocked_boto3.client.return_value = mocked_s3_client

    build = valid_build()
    Build.insert(build)

    def mocked_download_fileobj(bucket_name, key_name, f):
        # Sanity checks that the mocking is right
        assert bucket_name == 'buildhubses'
        assert key_name == 'some/path/to/buildhub.json'
        f.write(json.dumps(build).encode('utf-8'))

    mocked_s3_client.download_fileobj.side_effect = mocked_download_fileobj
    start(settings.SQS_QUEUE_URL)
    mocked_boto3.resource().get_queue_by_name.assert_called_with(
        QueueName='buildhub-s3-events')
    # It should have created no new Builds
    assert Build.objects.all().count() == 1
Example #17
def test_bulk_insert_invalid_skip_invalid(valid_build):
    one = valid_build()
    two = valid_build()
    two.pop("target")

    inserted, skipped = Build.bulk_insert([one, two], skip_invalid=True)
    assert inserted == 1
    assert skipped == 1
    # The first one would be inserted.
    assert Build.objects.count() == 1
Example #18
def test_search_aggregations(valid_build, json_poster, elasticsearch):
    build = valid_build()
    build["target"]["version"] = "60.0.1"
    Build.insert(build)
    build = valid_build()
    build["target"]["version"] = "60.0.2"
    Build.insert(build)
    build = valid_build()
    build["target"]["version"] = "60.1"
    Build.insert(build)

    elasticsearch.flush()

    search = {
        "aggs": {
            "versions": {
                "filter": {"match_all": {}},
                "aggs": {
                    "target.version": {
                        "terms": {
                            "field": "target.version",
                            "size": 1000,
                            "order": {"_term": "desc"},
                            "include": "6.*",
                        }
                    },
                    "target.version_count": {
                        "cardinality": {"field": "target.version"}
                    },
                },
            }
        },
        "size": 0,
    }

    url = reverse("api:search")
    response = json_poster(url, search)
    assert response.status_code == 200
    result = response.json()
    assert result["hits"]["total"] == 3
    assert not result["hits"]["hits"]  # because only aggregations
    agg_key = "versions"
    buckets = result["aggregations"][agg_key]["target.version"]["buckets"]
    assert buckets == [
        {"key": "60.1", "doc_count": 1},
        {"key": "60.0.2", "doc_count": 1},
        {"key": "60.0.1", "doc_count": 1},
    ]

    # This time filter more
    search["aggs"][agg_key]["aggs"]["target.version"]["terms"]["include"] = "60.0.*"
    response = json_poster(url, search)
    assert response.status_code == 200
    result = response.json()

    buckets = result["aggregations"][agg_key]["target.version"]["buckets"]
    assert buckets == [
        {"key": "60.0.2", "doc_count": 1},
        {"key": "60.0.1", "doc_count": 1},
    ]
Example #19
    def handle(self, *args, **options):
        # verbose = options["verbosity"] > 1
        if not settings.DATABASES.get("kinto"):
            raise ImproperlyConfigured(
                "See configuration documentation about setting up "
                "second the 'kinto' connection.")

        pages = 0
        done = 0
        skip_validation = options["skip_validation"]
        skip_invalid = options["skip_invalid"]
        skipped = 0
        inserted_total = 0
        total_t0 = time.time()
        for batch, total_records in self.iterator(options):
            builds = [
                x[0] for x in batch if not skip_invalid or "build" in x[0]
            ]
            count = len(builds)
            print(f"Page {pages + 1} ({count} records)")
            t0 = time.time()
            inserted, batch_skipped = Build.bulk_insert(
                builds,
                skip_validation=skip_validation,
                skip_invalid=skip_invalid,
                metadata={"kinto-migration": True},
            )
            t1 = time.time()
            done += count
            skipped += batch_skipped
            inserted_total += inserted
            print("Inserted {} new out of {} in "
                  "{:.2f} seconds. {} of {} ({:.1f}%)".format(
                      format(inserted, ","),
                      format(count, ","),
                      t1 - t0,
                      format(done, ","),
                      format(total_records, ","),
                      100 * done / total_records,
                  ))
            if batch_skipped:
                print(f"Skipped {batch_skipped} invalid records.")

            pages += 1
        total_t1 = time.time()

        print(f"In total, skipped {skipped} invalid records.")
        print(f"In total, processed {done} valid records.")
        print(f"In total, inserted {inserted_total} valid records.")

        print("The whole migration took {:.1f} minutes.".format(
            (total_t1 - total_t0) / 60))
Example #20
    def handle(self, *args, **options):
        if options["continue"]:
            with open(self.next_url_log_file) as f:
                url = f.read().strip()
            logger.info(f"Continuing with URL {url}")
        else:
            # Ping it first
            kinto_url = options["kinto-url"]
            r = requests.get(kinto_url)
            r.raise_for_status()
            assert r.json()["project_name"] == "kinto", r.json()

            if kinto_url.endswith("/"):
                kinto_url = kinto_url[:-1]
            url = (
                f"{kinto_url}/buckets/build-hub/collections/releases/records"
                "?_limit=10000")
        pages = 0
        session = requests.Session()
        done = 0
        skip_validation = options["skip_validation"]
        for batch, total_records in self.iterator(session, url):
            logger.info(f"Page {pages + 1} ({len(batch)} records)")
            # Now let's bulk insert these
            builds = []
            for record in batch:
                record.pop("id")
                record.pop("last_modified")
                builds.append(record)
            # Skip validation most of the time
            t0 = time.time()
            inserted, _ = Build.bulk_insert(
                builds,
                skip_validation=skip_validation,
                metadata={"kinto-migration": True},
            )
            t1 = time.time()
            metrics.incr("kinto_migrated", value=len(builds))
            metrics.incr("kinto_inserted", value=inserted)
            done += len(batch)
            logger.info("Inserted {} new out of {} in "
                        "{:.2f} seconds. {} of {} ({:.1f}%)".format(
                            format(inserted, ","),
                            format(len(builds), ","),
                            t1 - t0,
                            format(done, ","),
                            format(total_records, ","),
                            100 * done / total_records,
                        ))

            pages += 1
Example #21
def process_buildhub_json_key(config, s3):
    logger.debug(f"S3 buildhub.json key {s3!r}")
    key_name = s3['object']['key']
    assert os.path.basename(key_name) == 'buildhub.json', key_name
    bucket_name = s3['bucket']['name']
    # We need an S3 client to be able to download this object.
    if bucket_name not in config:
        logger.debug('Creating a new BOTO3 S3 CLIENT')
        config[bucket_name] = boto3.client('s3', config['region_name'])

    with io.BytesIO() as f:
        try:
            config[bucket_name].download_fileobj(bucket_name, key_name, f)
        except ClientError as exception:
            if exception.response['Error']['Code'] == '404':
                logger.warning(
                    f"Tried to download {key_name} (in {bucket_name}) "
                    "but not found.")
                return
            raise

        # After it has been populated by download_fileobj() we need to
        # rewind it so we can send it to json.load().
        f.seek(0)
        # Before exiting this context (and freeing up the binary data),
        # we turn it into a Python dict.
        build = json.load(f)

    # XXX Needs a strategy for corrupt buildhub.json S3 keys, which would
    # otherwise never leave the system.
    try:
        inserted = Build.insert(
            build=build,
            s3_object_key=s3['object']['key'],
            s3_object_etag=s3['object']['eTag'],
        )
    except ValidationError as exc:
        # We're only doing a try:except ValidationError: here so we get a
        # chance to log a useful message about the S3 object and the
        # validation error message.
        logger.warning(
            "Failed to insert build because the build was not valid. "
            f"S3 key {key_name!r} (bucket {bucket_name!r}). "
            f"Validation error message: {exc.message}")
        raise
    if inserted:
        logger.info(
            f"Inserted {key_name} as a valid Build ({inserted.build_hash})")
    else:
        logger.info(f"Did not insert {key_name} because we already had it")
Example #22
def test_serialized_instance_inserts_into_bigquery(bigquery_client,
                                                   bigquery_testing_table,
                                                   valid_build, settings):
    """Test that the fixture is created and insertion is successful."""
    # This test does not rely on auto-insertion
    settings.BQ_ENABLED = False
    client = bigquery_client
    table = bigquery_testing_table
    doc = Build.insert(valid_build()).to_dict()
    errors = client.insert_rows(table, [doc])
    assert errors == []

    table_id = f"{table.dataset_id}.{table.table_id}"
    job = client.query(f"SELECT COUNT(*) as n_rows FROM {table_id}")
    result = list(job.result())[0]
    assert result.n_rows == 1
Example #23
def test_insert_writes_to_bigquery_when_enabled(mocked_bigquery, valid_build,
                                                settings, mocker):
    mocked_client = mocker.MagicMock()
    mocked_bigquery.Client.return_value = mocked_client

    settings.BQ_ENABLED = True
    build = valid_build()
    inserted = Build.insert(build)
    assert inserted

    mocked_client.insert_rows.assert_called_once()
    args = mocked_client.insert_rows.call_args
    # takes a (table, document) tuple
    documents = args[0][1]
    assert len(documents) == 1
    assert documents[0]["build_hash"] == inserted.build_hash
Example #24
def test_insert_writes_to_bigquery(bigquery_client, bigquery_testing_table,
                                   valid_build, settings):
    """Test that the fixture is created and insertion is successful."""
    client = bigquery_client
    table = bigquery_testing_table

    # mock settings to ensure callback sends data to the right place
    settings.BQ_DATASET_ID = table.dataset_id
    settings.BQ_TABLE_ID = table.table_id

    build = valid_build()
    inserted = Build.insert(build)
    assert inserted

    table_id = f"{table.dataset_id}.{table.table_id}"
    job = client.query(f"SELECT COUNT(*) as n_rows FROM {table_id}")
    result = list(job.result())[0]
    assert result.n_rows == 1
Example #25
    def handle(self, *args, **options):
        # Ping it first
        kinto_url = options['kinto-url']
        r = requests.get(kinto_url)
        r.raise_for_status()
        assert r.json()['project_name'] == 'kinto', r.json()

        if kinto_url.endswith('/'):
            kinto_url = kinto_url[:-1]
        url = (f"{kinto_url}/buckets/build-hub/collections/releases/records"
               "?_limit=10000")
        pages = 0
        session = requests.Session()
        done = 0
        skip_validation = options['skip_validation']
        for batch, total_records in self.iterator(session, url):
            logger.info(f"Page {pages + 1} ({len(batch)} records)")
            # Now let's bulk insert these
            builds = []
            for record in batch:
                record.pop('id')
                record.pop('last_modified')
                builds.append(record)
            # Skip validation most of the time
            t0 = time.time()
            inserted = Build.bulk_insert(
                builds,
                skip_validation=skip_validation,
                metadata={'kinto-migration': True},
            )
            t1 = time.time()
            done += len(batch)
            logger.info("Inserted {} new out of {} in "
                        "{:.2f} seconds. {} of {} ({:.1f}%)".format(
                            format(inserted, ','),
                            format(len(builds), ','),
                            t1 - t0,
                            format(done, ','),
                            format(total_records, ','),
                            100 * done / total_records,
                        ))

            pages += 1
Example #26
    def handle(self, *args, **options):
        # verbose = options["verbosity"] > 1
        if not settings.DATABASES.get("kinto"):
            raise ImproperlyConfigured(
                "See configuration documentation about setting up "
                "second the 'kinto' connection."
            )

        current_count = Build.objects.all().count()
        print(f"There are currently {current_count:,} in our existing database.")

        with connections["kinto"].cursor() as cursor:
            cursor.execute(
                """
                SELECT COUNT(*)
                FROM records
                WHERE
                    parent_id = %s AND collection_id = %s
            """,
                [options["parent_id"], options["collection_id"]],
            )
            (total_records,) = cursor.fetchone()
            print(f"There are currently {total_records:,} in the Kinto database.")

        pages = 0
        done = 0
        skip_validation = options["skip_validation"]
        skip_invalid = options["skip_invalid"]
        skipped = 0
        inserted_total = 0
        total_t0 = time.time()
        for batch in self.iterator(options):
            # builds = [x[0] for x in batch if not skip_invalid or "build" in x[0]]
            builds = []
            for build in batch:
                if not skip_invalid or "build" in build[0]:
                    if build[0].get("schema"):
                        # One common thing in the old Kinto database is that each
                        # build has a 'schema' key which is just an integer timestamp.
                        # Pop it out so we don't get validation errors that are
                        # not actually critical.
                        build[0].pop("schema")
                    builds.append(build[0])
            count = len(builds)
            print(f"Page {pages + 1} ({count:,} records)")
            t0 = time.time()
            inserted, batch_skipped = Build.bulk_insert(
                builds,
                skip_validation=skip_validation,
                skip_invalid=skip_invalid,
                metadata={"kinto-migration": True},
            )
            t1 = time.time()
            done += count
            skipped += batch_skipped
            inserted_total += inserted
            print(
                "Inserted {} new out of {} in "
                "{:.2f} seconds. {} of {} ({:.1f}%)".format(
                    format(inserted, ","),
                    format(count, ","),
                    t1 - t0,
                    format(done, ","),
                    format(total_records, ","),
                    100 * done / total_records,
                )
            )
            if batch_skipped:
                print(f"Skipped {batch_skipped:,} invalid records.")

            pages += 1
        total_t1 = time.time()

        print(f"In total, skipped {skipped:,} invalid records.")
        print(f"In total, processed {done:,} valid records.")
        print(f"In total, inserted {inserted_total:,} valid records.")

        print(
            "The whole migration took {:.1f} minutes.".format(
                (total_t1 - total_t0) / 60
            )
        )
Example #27
def test_search_aggregations(valid_build, json_poster, elasticsearch):
    build = valid_build()
    build['target']['version'] = '60.0.1'
    Build.insert(build)
    build = valid_build()
    build['target']['version'] = '60.0.2'
    Build.insert(build)
    build = valid_build()
    build['target']['version'] = '60.1'
    Build.insert(build)

    elasticsearch.flush()

    search = {
        "aggs": {
            "versions": {
                "filter": {
                    "match_all": {}
                },
                "aggs": {
                    "target.version": {
                        "terms": {
                            "field": "target.version",
                            "size": 1000,
                            "order": {
                                "_term": "desc"
                            },
                            "include": "6.*"
                        }
                    },
                    "target.version_count": {
                        "cardinality": {
                            "field": "target.version"
                        }
                    }
                }
            }
        },
        "size": 0
    }

    url = reverse('api:search')
    response = json_poster(url, search)
    assert response.status_code == 200
    result = response.json()
    assert result['hits']['total'] == 3
    assert not result['hits']['hits']  # because only aggregations
    agg_key = 'versions'
    buckets = result['aggregations'][agg_key]['target.version']['buckets']
    assert buckets == [{
        'key': '60.1',
        'doc_count': 1
    }, {
        'key': '60.0.2',
        'doc_count': 1
    }, {
        'key': '60.0.1',
        'doc_count': 1
    }]

    # This time filter more
    search['aggs'][agg_key]['aggs']['target.version']['terms'][
        'include'] = r'60\.0.*'
    response = json_poster(url, search)
    assert response.status_code == 200
    result = response.json()

    buckets = result['aggregations'][agg_key]['target.version']['buckets']
    assert buckets == [{
        'key': '60.0.2',
        'doc_count': 1
    }, {
        'key': '60.0.1',
        'doc_count': 1
    }]
Example #28
def test_backfill_happy_path(
    mocked_boto3,
    settings,
    valid_build,
    itertools_count,
    mocker,
):

    # Create an existing build that matches *exactly* what our mocked S3 listing returns.
    build = valid_build()
    build['download']['mimetype'] = 'one/buildhub.json'
    Build.insert(
        build=build,
        s3_object_key='one/buildhub.json',
        s3_object_etag='abc123',
    )

    # Create one build that has the same build_hash as the second mocked
    # key but make the s3_object_etag mismatch.
    build = valid_build()
    build['download']['mimetype'] = 'two/buildhub.json'
    Build.insert(
        build=build,
        s3_object_key='two/buildhub.json',
        s3_object_etag='somethingdifferent',
    )

    mocked_s3_client = mocker.MagicMock()
    mocked_boto3.client.return_value = mocked_s3_client

    def mocked_download_fileobj(bucket_name, key_name, f):
        assert bucket_name == 'buildhubses'
        build = valid_build()
        # Just need to mess with the build a little bit so that it's
        # still valid against the schema but produces a different build_hash.
        if key_name == 'two/buildhub.json':
            build['download']['mimetype'] = key_name
        elif key_name == 'three/buildhub.json':
            build['download']['mimetype'] = key_name
        else:
            raise NotImplementedError(key_name)
        f.write(json.dumps(build).encode('utf-8'))

    mocked_s3_client.download_fileobj.side_effect = mocked_download_fileobj

    def mocked_list_objects(**kwargs):
        if kwargs.get('ContinuationToken'):  # you're on page 2
            return {
                'Contents': [
                    {
                        'Key': 'three/buildhub.json',
                        'ETag': 'ghi345',
                    },
                ]
            }
        else:
            return {
                'Contents': [
                    {
                        'Key': 'one/buildhub.json',
                        'ETag': 'abc123',
                    },
                    {
                        'Key': 'two/buildhub.json',
                        'ETag': 'def234',
                    },
                ],
                'NextContinuationToken': 'nextpageplease',
            }

    mocked_s3_client.list_objects_v2.side_effect = mocked_list_objects
    backfill(settings.S3_BUCKET_URL)

    # We had 2 before, this should have created 1 new and edited 1
    assert Build.objects.all().count() == 3
    # The second one should have had its etag updated
    assert not Build.objects.filter(
        s3_object_key='two/buildhub.json',
        s3_object_etag='somethingdifferent',
    )
    assert Build.objects.get(
        s3_object_key='two/buildhub.json',
        s3_object_etag='def234',
    )
Example #29
def test_backfill_happy_path(mocked_boto3, settings, valid_build,
                             itertools_count, mocker):

    # Create an existing build that matches *exactly* what our mocked S3 listing returns.
    build = valid_build()
    build["download"]["mimetype"] = "one/buildhub.json"
    Build.insert(build=build,
                 s3_object_key="one/buildhub.json",
                 s3_object_etag="abc123")

    # Create one build that has the same build_hash as the second mocked
    # key but make the s3_object_etag mismatch.
    build = valid_build()
    build["download"]["mimetype"] = "two/buildhub.json"
    Build.insert(
        build=build,
        s3_object_key="two/buildhub.json",
        s3_object_etag="somethingdifferent",
    )

    mocked_s3_client = mocker.MagicMock()
    mocked_boto3.client.return_value = mocked_s3_client

    def mocked_download_fileobj(bucket_name, key_name, f):
        assert bucket_name == "buildhubses"
        build = valid_build()
        # Just need to mess with the build a little bit so that it's
        # still valid against the schema but produces a different build_hash.
        if key_name == "two/buildhub.json":
            build["download"]["mimetype"] = key_name
        elif key_name == "three/buildhub.json":
            build["download"]["mimetype"] = key_name
        elif key_name == "three/Firefox-99-buildhub.json":
            build["download"]["mimetype"] = key_name
        else:
            raise NotImplementedError(key_name)
        f.write(json.dumps(build).encode("utf-8"))

    mocked_s3_client.download_fileobj.side_effect = mocked_download_fileobj

    def mocked_list_objects(**kwargs):
        if kwargs.get("ContinuationToken"):  # you're on page 2
            return {
                "Contents": [{
                    "Key": "three/buildhub.json",
                    "ETag": "ghi345"
                }]
            }
        else:
            return {
                "Contents": [
                    {
                        "Key": "one/buildhub.json",
                        "ETag": "abc123"
                    },
                    {
                        "Key": "two/buildhub.json",
                        "ETag": "def234"
                    },
                    {
                        "Key": "three/Firefox-99-buildhub.json",
                        "ETag": "xyz987"
                    },
                ],
                "NextContinuationToken":
                "nextpageplease",
            }

    mocked_s3_client.list_objects_v2.side_effect = mocked_list_objects
    backfill(settings.S3_BUCKET_URL)

    # We had 2 before, this should have created 2 new and edited 1
    assert Build.objects.all().count() == 4
    # The second one should have had its etag updated
    assert not Build.objects.filter(s3_object_key="two/buildhub.json",
                                    s3_object_etag="somethingdifferent")
    assert Build.objects.get(s3_object_key="two/buildhub.json",
                             s3_object_etag="def234")
    assert Build.objects.get(s3_object_key="three/Firefox-99-buildhub.json",
                             s3_object_etag="xyz987")
Example #30
def process_buildhub_json_key(config, s3):
    logger.debug(f"S3 buildhub.json key {s3!r}")
    key_name = s3["object"]["key"]
    assert os.path.basename(key_name).endswith("buildhub.json"), key_name
    bucket_name = s3["bucket"]["name"]
    # We need an S3 client to be able to download this object.
    if bucket_name not in config:
        logger.debug("Creating a new BOTO3 S3 CLIENT")
        connection_config = None
        if settings.UNSIGNED_S3_CLIENT:
            connection_config = Config(signature_version=UNSIGNED)
        config[bucket_name] = boto3.client("s3",
                                           config["region_name"],
                                           config=connection_config)

    with io.BytesIO() as f:
        try:
            config[bucket_name].download_fileobj(bucket_name, key_name, f)
        except ClientError as exception:
            if exception.response["Error"]["Code"] == "404":
                logger.warning(
                    f"Tried to download {key_name} (in {bucket_name}) "
                    "but not found.")
                return
            raise

        # After it has been populated by download_fileobj() we need to
        # rewind it so we can send it to json.load().
        f.seek(0)
        # Before exiting this context (and freeing up the binary data),
        # we turn it into a Python dict.
        build = json.load(f)

    # XXX Needs a strategy for corrupt buildhub.json S3 keys, which would
    # otherwise never leave the system.
    inserted = []
    try:
        ret = Build.insert(
            build=build,
            s3_object_key=s3["object"]["key"],
            s3_object_etag=s3["object"]["eTag"],
        )
        inserted.append(ret)
        # This is a hack to fix https://bugzilla.mozilla.org/show_bug.cgi?id=1470948
        # In some future world we might be able to architect buildhub in such a way
        # that this sort of transformation isn't buried deep down in the code.
        if (build["source"]["product"] == "firefox"
                and build["target"]["channel"] == "release"):
            beta_build = deepcopy(build)
            beta_build["target"]["channel"] = "beta"
            ret = Build.insert(
                build=beta_build,
                s3_object_key=s3["object"]["key"],
                s3_object_etag=s3["object"]["eTag"],
            )
            inserted.append(ret)

    except ValidationError as exc:
        # We're only doing a try:except ValidationError: here so we get a
        # chance to log a useful message about the S3 object and the
        # validation error message.
        logger.warning(
            "Failed to insert build because the build was not valid. "
            f"S3 key {key_name!r} (bucket {bucket_name!r}). "
            f"Validation error message: {exc.message}")
        raise
    # Build.insert() above can return None (for Builds that already exist).
    # If anything was _actually_ inserted, log it.
    if any(inserted):
        for i in inserted:
            metrics.incr("sqs_inserted")
            logger.info(
                f"Inserted {key_name} as a valid Build ({i.build_hash})")
    else:
        metrics.incr("sqs_not_inserted")
        logger.info(f"Did not insert {key_name} because we already had it")