async def process_batch(session, batch, skip_incomplete):
    # Parallel fetch of metadata for each item of the batch.
    logger.info('Fetch metadata for {} releases...'.format(len(batch)))
    futures = [fetch_metadata(session, record) for record in batch]
    metadatas = await asyncio.gather(*futures)
    results = [merge_metadata(record, metadata)
               for record, metadata in zip(batch, metadatas)]

    for result in results:
        try:
            check_record(result)
        except ValueError as e:
            # Keep only results where metadata was found.
            if skip_incomplete:
                logger.warning(e)
                continue
        yield {'data': result}
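# --- Usage sketch (not part of the original module) --------------------------
# A minimal, hedged example of driving process_batch(): open an aiohttp
# session and collect the yielded {'data': ...} payloads. It assumes `batch`
# is a list of record dicts compatible with fetch_metadata()/merge_metadata()
# above; the helper name `collect_batch` is hypothetical.
import asyncio

import aiohttp


async def collect_batch(batch, skip_incomplete=True):
    async with aiohttp.ClientSession() as session:
        return [payload
                async for payload in process_batch(session, batch,
                                                   skip_incomplete)]

# Example call (e.g. from a one-off script):
#     results = asyncio.get_event_loop().run_until_complete(
#         collect_batch(records))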
async def main(loop, event):
    """
    Trigger when S3 event kicks in.
    http://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html
    """
    server_url = config('SERVER_URL', default='http://localhost:8888/v1')
    bucket = config('BUCKET', default='build-hub')
    collection = config('COLLECTION', default='releases')
    kinto_auth = tuple(config('AUTH', 'user:pass').split(':'))

    kinto_client = kinto_http.Client(server_url=server_url, auth=kinto_auth,
                                     retry=NB_RETRY_REQUEST)

    records = []
    for record in event['Records']:
        if record.get('EventSource') == 'aws:sns':
            records.extend(json.loads(record['Sns']['Message'])['Records'])
        else:
            records.append(record)

    async with aiohttp.ClientSession(loop=loop) as session:
        for event_record in records:
            metrics.incr('s3_event_event')
            records_to_create = []

            # Use event time as archive publication.
            event_time = ciso8601.parse_datetime(event_record['eventTime'])
            event_time = event_time.strftime(utils.DATETIME_FORMAT)

            key = event_record['s3']['object']['key']
            filesize = event_record['s3']['object']['size']
            url = utils.key_to_archive_url(key)

            logger.debug("Event file {}".format(url))

            try:
                product = key.split('/')[1]  # /pub/thunderbird/nightly/...
            except IndexError:
                continue  # e.g. https://archive.mozilla.org/favicon.ico

            if product not in utils.ALL_PRODUCTS:
                logger.info('Skip product {}'.format(product))
                continue

            # Release / Nightly / RC archive.
            if utils.is_build_url(product, url):
                logger.info('Processing {} archive: {}'.format(product, key))

                record = utils.record_from_url(url)
                # Use S3 event infos for the archive.
                record['download']['size'] = filesize
                record['download']['date'] = event_time

                # Fetch release metadata.
                await scan_candidates(session, product)
                logger.debug("Fetch record metadata")
                metadata = await fetch_metadata(session, record)
                # If the JSON metadata is not available yet, the archive will
                # be handled when the JSON is delivered.
                if metadata is None:
                    logger.info(f"JSON metadata not available {record['id']}")
                    continue

                # Merge obtained metadata.
                record = utils.merge_metadata(record, metadata)
                records_to_create.append(record)

            # RC metadata
            elif utils.is_rc_build_metadata(product, url):
                logger.info(f'Processing {product} RC metadata: {key}')

                # pub/firefox/candidates/55.0b12-candidates/build1/mac/en-US/
                #   firefox-55.0b12.json
                logger.debug("Fetch new metadata")
                # It has been known to happen that right after an S3 event
                # there is a slight delay before the metadata JSON file is
                # available. If that's the case we want to retry in a couple
                # of seconds to see if it's available on the next backoff
                # attempt.
                metadata = await fetch_json(session, url,
                                            retry_on_notfound=True)
                metadata['buildnumber'] = int(
                    re.search(r'/build(\d+)/', url).group(1))

                # We just received the metadata file. Look up whether the
                # associated archives are here too.
                archives = []
                if 'multi' in url:
                    # For multi we just check that the associated archive
                    # is here already.
                    parent_folder = re.sub('multi/.+$', 'multi/', url)
                    _, files = await fetch_listing(session, parent_folder,
                                                   retry_on_notfound=True)
                    for f in files:
                        rc_url = parent_folder + f['name']
                        if utils.is_build_url(product, rc_url):
                            archives.append(
                                (rc_url, f['size'], f['last_modified']))
                else:
                    # For en-US it's different: the metadata applies to every
                    # localized archive.
                    # Check if they are here by listing the parent folder
                    # (including the en-US archive).
                    l10n_parent_url = re.sub('en-US/.+$', '', url)
                    l10n_folders, _ = await fetch_listing(
                        session,
                        l10n_parent_url,
                        retry_on_notfound=True,
                    )
                    for locale in l10n_folders:
                        _, files = await fetch_listing(
                            session,
                            l10n_parent_url + locale,
                            retry_on_notfound=True,
                        )
                        for f in files:
                            rc_url = l10n_parent_url + locale + f['name']
                            if utils.is_build_url(product, rc_url):
                                archives.append((
                                    rc_url,
                                    f['size'],
                                    f['last_modified'],
                                ))

                for rc_url, size, last_modified in archives:
                    record = utils.record_from_url(rc_url)
                    record['download']['size'] = size
                    record['download']['date'] = last_modified
                    record = utils.merge_metadata(record, metadata)
                    records_to_create.append(record)
                # Theoretically the release should never be there yet :)
                # And repacks like EME-free/sha1 don't seem to be
                # published in RC.

            # Nightly metadata
            # pub/firefox/nightly/2017/08/2017-08-08-11-40-32-mozilla-central/
            #   firefox-57.0a1.en-US.linux-i686.json
            #   -l10n/...
            elif utils.is_nightly_build_metadata(product, url):
                logger.info(f'Processing {product} nightly metadata: {key}')

                logger.debug("Fetch new nightly metadata")
                # See comment above about the exceptional need of
                # setting retry_on_notfound here.
                metadata = await fetch_json(session, url,
                                            retry_on_notfound=True)
                platform = metadata['moz_pkg_platform']

                # Check if the english version is here.
                parent_url = re.sub('/[^/]+$', '/', url)
                logger.debug("Fetch parent listing {}".format(parent_url))
                _, files = await fetch_listing(session, parent_url)
                for f in files:
                    if ('.' + platform + '.') not in f['name']:
                        # Metadata files are per platform.
                        continue
                    en_nightly_url = parent_url + f['name']
                    if utils.is_build_url(product, en_nightly_url):
                        record = utils.record_from_url(en_nightly_url)
                        record['download']['size'] = f['size']
                        record['download']['date'] = f['last_modified']
                        record = utils.merge_metadata(record, metadata)
                        records_to_create.append(record)
                        break  # Only one file for english.

                # Check also localized versions.
                l10n_folder_url = re.sub('-mozilla-central([^/]*)/([^/]+)$',
                                         '-mozilla-central\\1-l10n/',
                                         url)
                logger.debug("Fetch l10n listing {}".format(l10n_folder_url))
                try:
                    _, files = await fetch_listing(
                        session,
                        l10n_folder_url,
                        retry_on_notfound=True,
                    )
                except ValueError:
                    files = []  # No -l10n/ folder published yet.
                for f in files:
                    if (('.' + platform + '.') not in f['name'] and
                            product != 'mobile'):
                        # Metadata files are per platform
                        # (mobile platforms are contained by folder).
                        continue
                    nightly_url = l10n_folder_url + f['name']
                    if utils.is_build_url(product, nightly_url):
                        record = utils.record_from_url(nightly_url)
                        record['download']['size'] = f['size']
                        record['download']['date'] = f['last_modified']
                        record = utils.merge_metadata(record, metadata)
                        records_to_create.append(record)

            else:
                logger.info('Ignored {}'.format(key))

            logger.debug(f"{len(records_to_create)} records to create.")
            with metrics.timer('s3_event_records_to_create'):
                for record in records_to_create:
                    # Check that field values look OK.
                    utils.check_record(record)
                    # Push result to Kinto.
                    kinto_client.create_record(data=record,
                                               bucket=bucket,
                                               collection=collection,
                                               if_not_exists=True)
                    logger.info('Created {}'.format(record['id']))
                    metrics.incr('s3_event_record_created')
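# --- Entry point sketch (assumption, not shown in this excerpt) --------------
# main(loop, event) is a coroutine, so an AWS Lambda handler would have to run
# it on an event loop. The handler name and wiring below are a hedged guess at
# how the function might be invoked; they are not taken from this excerpt.
import asyncio


def lambda_handler(event, context):
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop, event))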
def test_merge_metadata(record, metadata, expected):
    result = merge_metadata(record, metadata)
    assert result == expected
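# --- Hypothetical parametrization sketch -------------------------------------
# The fixtures feeding test_merge_metadata() are not shown in this excerpt.
# One plausible wiring is pytest.mark.parametrize; the single case below
# assumes merge_metadata() returns the record unchanged when metadata is None,
# and its values are illustrative only, not taken from the project's test data.
import pytest


@pytest.mark.parametrize('record,metadata,expected', [
    ({'id': 'abc', 'download': {}}, None, {'id': 'abc', 'download': {}}),
])
def test_merge_metadata_without_metadata(record, metadata, expected):
    assert merge_metadata(record, metadata) == expected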
async def main(loop, event):
    """
    Trigger when S3 event kicks in.
    http://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html
    """
    server_url = os.getenv('SERVER_URL', 'http://localhost:8888/v1')
    bucket = os.getenv('BUCKET', 'build-hub')
    collection = os.getenv('COLLECTION', 'releases')
    kinto_auth = tuple(os.getenv('AUTH', 'user:pass').split(':'))

    kinto_client = kinto_http.Client(server_url=server_url, auth=kinto_auth,
                                     retry=NB_RETRY_REQUEST)

    records = []
    for record in event['Records']:
        if record.get('EventSource') == 'aws:sns':
            records.extend(json.loads(record['Sns']['Message'])['Records'])
        else:
            records.append(record)

    async with aiohttp.ClientSession(loop=loop) as session:
        for event_record in records:
            records_to_create = []

            # Use event time as archive publication.
            event_time = datetime.datetime.strptime(
                event_record['eventTime'], '%Y-%m-%dT%H:%M:%S.%fZ')
            event_time = event_time.strftime(utils.DATETIME_FORMAT)

            key = event_record['s3']['object']['key']
            filesize = event_record['s3']['object']['size']
            url = utils.ARCHIVE_URL + key

            logger.debug("Event file {}".format(url))

            try:
                product = key.split('/')[1]  # /pub/thunderbird/nightly/...
            except IndexError:
                continue  # e.g. https://archive.mozilla.org/favicon.ico

            if product not in utils.ALL_PRODUCTS:
                logger.info('Skip product {}'.format(product))
                continue

            # Release / Nightly / RC archive.
            if utils.is_build_url(product, url):
                logger.info('Processing {} archive: {}'.format(product, key))

                record = utils.record_from_url(url)
                # Use S3 event infos for the archive.
                record['download']['size'] = filesize
                record['download']['date'] = event_time

                # Fetch release metadata.
                await scan_candidates(session, product)
                logger.debug("Fetch record metadata")
                metadata = await fetch_metadata(session, record)
                # If the JSON metadata is not available yet, the archive will
                # be handled when the JSON is delivered.
                if metadata is None:
                    logger.info('JSON metadata not available {}'.format(
                        record['id']))
                    continue

                # Merge obtained metadata.
                record = utils.merge_metadata(record, metadata)
                records_to_create.append(record)

            # RC metadata
            elif utils.is_rc_build_metadata(product, url):
                logger.info('Processing {} RC metadata: {}'.format(
                    product, key))

                # pub/firefox/candidates/55.0b12-candidates/build1/mac/en-US/
                #   firefox-55.0b12.json
                logger.debug("Fetch new metadata")
                metadata = await fetch_json(session, url)
                metadata['buildnumber'] = int(
                    re.search(r'/build(\d+)/', url).group(1))

                # Check if the localized languages are here
                # (including the en-US archive).
                l10n_parent_url = re.sub('en-US/.+$', '', url)
                l10n_folders, _ = await fetch_listing(session,
                                                      l10n_parent_url)
                for locale in l10n_folders:
                    _, files = await fetch_listing(session,
                                                   l10n_parent_url + locale)
                    for f in files:
                        rc_url = l10n_parent_url + locale + f['name']
                        if utils.is_build_url(product, rc_url):
                            record = utils.record_from_url(rc_url)
                            record['download']['size'] = f['size']
                            record['download']['date'] = f['last_modified']
                            record = utils.merge_metadata(record, metadata)
                            records_to_create.append(record)
                # Theoretically the release should never be there yet :)
                # And repacks like EME-free/sha1 don't seem to be
                # published in RC.

            # Nightly metadata
            # pub/firefox/nightly/2017/08/2017-08-08-11-40-32-mozilla-central/
            #   firefox-57.0a1.en-US.linux-i686.json
            #   -l10n/...
            elif utils.is_nightly_build_metadata(product, url):
                logger.info('Processing {} nightly metadata: {}'.format(
                    product, key))

                logger.debug("Fetch new nightly metadata")
                metadata = await fetch_json(session, url)
                platform = metadata['moz_pkg_platform']

                # Check if the english version is here.
                parent_url = re.sub('/[^/]+$', '/', url)
                logger.debug("Fetch parent listing {}".format(parent_url))
                _, files = await fetch_listing(session, parent_url)
                for f in files:
                    if ('.' + platform + '.') not in f['name']:
                        # Metadata files are per platform.
                        continue
                    en_nightly_url = parent_url + f['name']
                    if utils.is_build_url(product, en_nightly_url):
                        record = utils.record_from_url(en_nightly_url)
                        record['download']['size'] = f['size']
                        record['download']['date'] = f['last_modified']
                        record = utils.merge_metadata(record, metadata)
                        records_to_create.append(record)
                        break  # Only one file for english.

                # Check also localized versions.
                l10n_folder_url = re.sub('-mozilla-central([^/]*)/([^/]+)$',
                                         '-mozilla-central\\1-l10n/',
                                         url)
                logger.debug("Fetch l10n listing {}".format(l10n_folder_url))
                try:
                    _, files = await fetch_listing(session, l10n_folder_url)
                except ValueError:
                    files = []  # No -l10n/ folder published yet.
                for f in files:
                    if (('.' + platform + '.') not in f['name'] and
                            product != 'mobile'):
                        # Metadata files are per platform
                        # (mobile platforms are contained by folder).
                        continue
                    nightly_url = l10n_folder_url + f['name']
                    if utils.is_build_url(product, nightly_url):
                        record = utils.record_from_url(nightly_url)
                        record['download']['size'] = f['size']
                        record['download']['date'] = f['last_modified']
                        record = utils.merge_metadata(record, metadata)
                        records_to_create.append(record)

            else:
                logger.info('Ignored {}'.format(key))

            logger.debug("{} records to create.".format(
                len(records_to_create)))
            for record in records_to_create:
                # Check that field values look OK.
                utils.check_record(record)
                # Push result to Kinto.
                kinto_client.create_record(data=record,
                                           bucket=bucket,
                                           collection=collection,
                                           if_not_exists=True)
                logger.info('Created {}'.format(record['id']))