def deferred_sync(self, delta):
    """
    Synchronize the repository with the remote repository without downloading artifacts.

    Content units for each addition are created and associated right away; the
    actual artifact bytes are left to be fetched on demand later.

    Args:
        delta (namedtuple): Set of unit keys for units to be added to the repository. Set of
            unit keys for units that should be removed from the repository. Only the
            additions are used in this method.
    """
    msg = _("Adding file content to the repository without downloading artifacts.")
    pb = ProgressBar(message=msg, total=len(delta.additions))
    with pb:
        for artifact in self.next_remote_artifact(delta.additions):
            # content_dict maps the remote artifact's URL to its pending content unit.
            unit = self.content_dict.pop(artifact.url)
            self._create_and_associate_content(unit, {artifact: None})
            pb.increment()
def fetch_roles(remote):
    """
    Fetch the roles in a remote repository.

    The first page is fetched synchronously to learn the total page count; the
    remaining pages are downloaded concurrently with at most 20 requests in
    flight at any time.

    Args:
        remote (AnsibleRemote): A remote.

    Returns:
        list: a list of dicts that represent roles
    """
    def role_page_url(remote, page=1):
        # Rebuild the remote's URL with the requested ``page`` query parameter.
        parsed = urlparse(remote.url)
        new_query = parse_qs(parsed.query)
        new_query['page'] = page
        return parsed.scheme + '://' + parsed.netloc + parsed.path + '?' + urlencode(
            new_query, doseq=True)

    def parse_metadata(path):
        # Read one Galaxy API page; return the total page count and the roles it lists.
        # fix: close the file handle instead of leaking it via json.load(open(path)).
        with open(path) as fp:
            metadata = json.load(fp)
        return metadata['num_pages'], parse_roles(metadata)

    downloader = remote.get_downloader(role_page_url(remote))
    downloader.fetch()

    page_count, roles = parse_metadata(downloader.path)
    progress_bar = ProgressBar(message='Parsing Pages from Galaxy Roles API',
                               total=page_count, done=1, state='running')
    progress_bar.save()

    def downloader_coroutines():
        # Lazily yield one download coroutine per remaining page (2..page_count).
        for page in range(2, page_count + 1):
            page_downloader = remote.get_downloader(role_page_url(remote, page))
            yield page_downloader.run()

    loop = asyncio.get_event_loop()
    downloaders = downloader_coroutines()
    not_done = set()
    # Prime the window with up to 20 concurrent downloads.
    with suppress(StopIteration):
        for i in range(20):
            not_done.add(next(downloaders))

    while True:
        if not_done == set():
            break
        done, not_done = loop.run_until_complete(
            asyncio.wait(not_done, return_when=FIRST_COMPLETED))
        for item in done:
            download_result = item.result()
            new_page_count, new_roles = parse_metadata(download_result.path)
            roles.extend(new_roles)
            progress_bar.increment()
            # Refill the window: start one new download for each completed one.
            with suppress(StopIteration):
                not_done.add(next(downloaders))

    progress_bar.state = 'completed'
    progress_bar.save()
    return roles
def sync(remote_pk, repository_pk):
    """
    Sync Collections with ``remote_pk``, and save a new RepositoryVersion for ``repository_pk``.

    Args:
        remote_pk (str): The remote PK.
        repository_pk (str): The repository PK.

    Raises:
        ValueError: If the remote does not specify a URL to sync or a ``whitelist`` of
            Collections to sync.
    """
    remote = CollectionRemote.objects.get(pk=remote_pk)
    repository = Repository.objects.get(pk=repository_pk)

    if not remote.url:
        raise ValueError(
            _("A CollectionRemote must have a 'url' specified to synchronize.")
        )

    if not remote.whitelist:
        raise ValueError(
            _("A CollectionRemote must have a 'whitelist' specified to synchronize.")
        )

    # NOTE(review): split(' ') produces empty strings for runs of spaces — confirm the
    # whitelist is always single-space separated.
    repository_spec_strings = remote.whitelist.split(' ')

    def nowhere(*args, **kwargs):
        # Swallow mazer's display callbacks so it produces no console output.
        pass

    collections_pks = []
    download_pb = ProgressBar(message='Downloading Collections',
                              total=len(repository_spec_strings))
    import_pb = ProgressBar(message='Importing Collections',
                            total=len(repository_spec_strings))

    with RepositoryVersion.create(repository) as new_version:
        with tempfile.TemporaryDirectory() as temp_ansible_path:
            with download_pb:
                # workaround: mazer logs errors without this dir https://pulp.plan.io/issues/4999
                os.mkdir(os.path.join(temp_ansible_path, 'ansible_collections'))
                galaxy_context = GalaxyContext(
                    collections_path=temp_ansible_path,
                    server={
                        'url': remote.url,
                        'ignore_certs': False,
                    },
                )
                # mazer downloads every requested collection into temp_ansible_path.
                install_repository_specs_loop(
                    display_callback=nowhere,
                    galaxy_context=galaxy_context,
                    repository_spec_strings=repository_spec_strings,
                )
                # mazer gives no per-collection progress, so mark everything done at once.
                download_pb.done = len(repository_spec_strings)

            with import_pb:
                # Each downloaded collection is identified by its MANIFEST.json.
                content_walk_generator = os.walk(temp_ansible_path)
                for dirpath, dirnames, filenames in content_walk_generator:
                    if 'MANIFEST.json' in filenames:
                        manifest_path = os.path.join(dirpath, 'MANIFEST.json')
                        with open(manifest_path) as manifest_file:
                            manifest_data = json.load(manifest_file)
                        info = manifest_data['collection_info']
                        filename = '{namespace}-{name}-{version}'.format(
                            namespace=info['namespace'],
                            name=info['name'],
                            version=info['version'],
                        )
                        # Re-pack the collection directory into a tarball for the Artifact.
                        tarfile_path = os.path.join(temp_ansible_path, filename + '.tar.gz')
                        with tarfile.open(name=tarfile_path, mode='w|gz') as newtar:
                            newtar.add(dirpath, arcname=filename)
                        with transaction.atomic():
                            collection, created = Collection.objects.get_or_create(
                                namespace=info['namespace'],
                                name=info['name'],
                                version=info['version'])
                            if created:
                                # newtar is closed here; .name (the tarball path) is still valid.
                                artifact = Artifact.init_and_validate(newtar.name)
                                artifact.save()
                                ContentArtifact.objects.create(
                                    artifact=artifact,
                                    content=collection,
                                    relative_path=collection.relative_path,
                                )
                            # Collect new and pre-existing collections alike so the new
                            # repository version contains everything in the whitelist.
                            collections_pks.append(collection)
                        import_pb.increment()

        collections = Collection.objects.filter(pk__in=collections_pks)
        new_version.add_content(collections)
async def run(self):
    """
    DockerFirstStage.

    Download the upstream tag list, then each tag's manifest, and emit
    Tag/ManifestList/Manifest/Blob declarative content into the pipeline.
    """
    future_manifests = []
    tag_list = []
    to_download = []
    man_dcs = {}
    total_blobs = []

    with ProgressBar(message='Downloading tag list', total=1) as pb:
        repo_name = self.remote.namespaced_upstream_name
        relative_url = '/v2/{name}/tags/list'.format(name=repo_name)
        tag_list_url = urljoin(self.remote.url, relative_url)
        list_downloader = self.remote.get_downloader(url=tag_list_url)
        await list_downloader.run(extra_data={'repo_name': repo_name})
        with open(list_downloader.path) as tags_raw:
            tags_dict = json.loads(tags_raw.read())
            tag_list = tags_dict['tags']

        # check for the presence of the pagination link header
        link = list_downloader.response_headers.get('Link')
        await self.handle_pagination(link, repo_name, tag_list)
        # Restrict to whitelisted tags (comma-separated) when a whitelist is set.
        whitelist_tags = self.remote.whitelist_tags
        if whitelist_tags:
            tag_list = list(set(tag_list) & set(whitelist_tags.split(',')))
        pb.increment()

    msg = 'Creating Download requests for v2 Tags'
    with ProgressBar(message=msg, total=len(tag_list)) as pb:
        for tag_name in tag_list:
            relative_url = '/v2/{name}/manifests/{tag}'.format(
                name=self.remote.namespaced_upstream_name,
                tag=tag_name,
            )
            url = urljoin(self.remote.url, relative_url)
            downloader = self.remote.get_downloader(url=url)
            # Coroutines only — awaited concurrently below via as_completed.
            to_download.append(downloader.run(extra_data={'headers': V2_ACCEPT_HEADERS}))
            pb.increment()

    pb_parsed_tags = ProgressBar(message='Processing v2 Tags', state='running')
    pb_parsed_ml_tags = ProgressBar(message='Parsing Manifest List Tags', state='running')
    pb_parsed_m_tags = ProgressBar(message='Parsing Manifests Tags', state='running')
    # NOTE(review): module-global so it is visible elsewhere — presumably incremented by
    # handle_blobs(); confirm before refactoring.
    global pb_parsed_blobs
    pb_parsed_blobs = ProgressBar(message='Parsing Blobs', state='running')
    pb_parsed_man = ProgressBar(message='Parsing Manifests', state='running')

    for download_tag in asyncio.as_completed(to_download):
        tag = await download_tag
        with open(tag.path) as content_file:
            raw = content_file.read()
        content_data = json.loads(raw)
        mediatype = content_data.get('mediaType')
        tag.artifact_attributes['file'] = tag.path
        saved_artifact = Artifact(**tag.artifact_attributes)
        try:
            saved_artifact.save()
        except IntegrityError:
            # Artifact already exists; 'file' is not a lookup field, so drop it
            # and fetch the stored row instead.
            del tag.artifact_attributes['file']
            saved_artifact = Artifact.objects.get(**tag.artifact_attributes)
        tag_dc = self.create_tag(mediatype, saved_artifact, tag.url)

        if type(tag_dc.content) is ManifestListTag:
            list_dc = self.create_tagged_manifest_list(tag_dc, content_data)
            await self.put(list_dc)
            pb_parsed_ml_tags.increment()
            tag_dc.extra_data['list_relation'] = list_dc
            # Queue every child manifest of the manifest list; their content is
            # resolved later via the futures collected here.
            for manifest_data in content_data.get('manifests'):
                man_dc = self.create_manifest(list_dc, manifest_data)
                future_manifests.append(man_dc.get_or_create_future())
                man_dcs[man_dc.content.digest] = man_dc
                await self.put(man_dc)
                pb_parsed_man.increment()
        elif type(tag_dc.content) is ManifestTag:
            man_dc = self.create_tagged_manifest(tag_dc, content_data)
            await self.put(man_dc)
            pb_parsed_m_tags.increment()
            tag_dc.extra_data['man_relation'] = man_dc
            self.handle_blobs(man_dc, content_data, total_blobs)
        await self.put(tag_dc)
        pb_parsed_tags.increment()

    # Finalize the progress bars: totals equal whatever was actually processed.
    pb_parsed_tags.state = 'completed'
    pb_parsed_tags.total = pb_parsed_tags.done
    pb_parsed_tags.save()
    pb_parsed_ml_tags.state = 'completed'
    pb_parsed_ml_tags.total = pb_parsed_ml_tags.done
    pb_parsed_ml_tags.save()
    pb_parsed_m_tags.state = 'completed'
    pb_parsed_m_tags.total = pb_parsed_m_tags.done
    pb_parsed_m_tags.save()
    pb_parsed_man.state = 'completed'
    pb_parsed_man.total = pb_parsed_man.done
    pb_parsed_man.save()

    # Once child manifests are persisted, read each one back and queue its blobs.
    for manifest_future in asyncio.as_completed(future_manifests):
        man = await manifest_future
        with man._artifacts.get().file.open() as content_file:
            raw = content_file.read()
        content_data = json.loads(raw)
        man_dc = man_dcs[man.digest]
        self.handle_blobs(man_dc, content_data, total_blobs)
    for blob in total_blobs:
        await self.put(blob)

    pb_parsed_blobs.state = 'completed'
    pb_parsed_blobs.total = pb_parsed_blobs.done
    pb_parsed_blobs.save()
async def run(self):
    """
    Build `DeclarativeContent` from the repodata.

    Downloads repomd.xml, then the package repodata (primary/filelists/other)
    and updateinfo concurrently, and emits Package and UpdateRecord content.
    """
    packages_pb = ProgressBar(message='Parsed Packages')
    erratum_pb = ProgressBar(message='Parsed Erratum')
    packages_pb.save()
    erratum_pb.save()

    with ProgressBar(message='Downloading Metadata Files') as metadata_pb:
        downloader = self.remote.get_downloader(
            url=urljoin(self.remote.url, 'repodata/repomd.xml'))
        # TODO: decide how to distinguish between a mirror list and a normal repo
        result = await downloader.run()
        metadata_pb.increment()

        repomd_path = result.path
        repomd = cr.Repomd(repomd_path)
        package_repodata_urls = {}
        downloaders = []
        for record in repomd.records:
            if record.type in PACKAGE_REPODATA:
                # Collect URLs first; the downloaders are created later in a
                # fixed order (see below).
                package_repodata_urls[record.type] = urljoin(
                    self.remote.url, record.location_href)
            elif record.type in UPDATE_REPODATA:
                updateinfo_url = urljoin(self.remote.url, record.location_href)
                downloader = self.remote.get_downloader(url=updateinfo_url)
                # Single-element group so the gather below yields one result.
                downloaders.append([downloader.run()])
            else:
                log.info(
                    _('Unknown repodata type: {t}. Skipped.').format(t=record.type))
                # TODO: skip databases, save unknown types to publish them as-is

        # to preserve order, downloaders are created after all repodata urls are identified
        package_repodata_downloaders = []
        for repodata_type in PACKAGE_REPODATA:
            downloader = self.remote.get_downloader(
                url=package_repodata_urls[repodata_type])
            package_repodata_downloaders.append(downloader.run())
        downloaders.append(package_repodata_downloaders)

        # asyncio.gather is used to preserve the order of results for package repodata
        pending = [
            asyncio.gather(*downloaders_group)
            for downloaders_group in downloaders
        ]
        while pending:
            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED)
            for downloader in done:
                results = downloader.result()
                if results[0].url == package_repodata_urls['primary']:
                    # Package repodata group: order matches PACKAGE_REPODATA
                    # (primary, filelists, other).
                    primary_xml_path = results[0].path
                    filelists_xml_path = results[1].path
                    other_xml_path = results[2].path
                    metadata_pb.done += 3
                    metadata_pb.save()

                    packages = await RpmFirstStage.parse_repodata(
                        primary_xml_path, filelists_xml_path, other_xml_path)
                    packages_pb.total = len(packages)
                    packages_pb.state = 'running'
                    packages_pb.save()

                    for pkg in packages.values():
                        package = Package(**Package.createrepo_to_dict(pkg))
                        artifact = Artifact(size=package.size_package)
                        # Set the artifact's checksum field that matches the
                        # repodata's checksum type (e.g. sha256=pkgId).
                        checksum_type = getattr(
                            CHECKSUM_TYPES, package.checksum_type.upper())
                        setattr(artifact, checksum_type, package.pkgId)
                        url = urljoin(self.remote.url, package.location_href)
                        filename = os.path.basename(package.location_href)
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=url,
                            relative_path=filename,
                            remote=self.remote,
                            deferred_download=self.deferred_download)
                        dc = DeclarativeContent(content=package, d_artifacts=[da])
                        packages_pb.increment()
                        await self.put(dc)
                elif results[0].url == updateinfo_url:
                    updateinfo_xml_path = results[0].path
                    metadata_pb.increment()

                    updates = await RpmFirstStage.parse_updateinfo(
                        updateinfo_xml_path)
                    erratum_pb.total = len(updates)
                    erratum_pb.state = 'running'
                    erratum_pb.save()

                    for update in updates:
                        update_record = UpdateRecord(
                            **UpdateRecord.createrepo_to_dict(update))
                        update_record.digest = RpmFirstStage.hash_update_record(
                            update)
                        # Related rows can't be saved until the UpdateRecord has
                        # a pk, so stash them for a later pipeline stage.
                        future_relations = {
                            'collections': defaultdict(list),
                            'references': []
                        }
                        for collection in update.collections:
                            coll_dict = UpdateCollection.createrepo_to_dict(
                                collection)
                            coll = UpdateCollection(**coll_dict)
                            for package in collection.packages:
                                pkg_dict = UpdateCollectionPackage.createrepo_to_dict(
                                    package)
                                pkg = UpdateCollectionPackage(**pkg_dict)
                                future_relations['collections'][coll].append(pkg)
                        for reference in update.references:
                            reference_dict = UpdateReference.createrepo_to_dict(
                                reference)
                            ref = UpdateReference(**reference_dict)
                            future_relations['references'].append(ref)

                        erratum_pb.increment()
                        dc = DeclarativeContent(content=update_record)
                        dc.extra_data = future_relations
                        await self.put(dc)

    packages_pb.state = 'completed'
    erratum_pb.state = 'completed'
    packages_pb.save()
    erratum_pb.save()
async def run(self):
    """
    DockerFirstStage.

    Download the upstream tag list, then each tag's manifest, and emit
    Tag/ManifestList/Manifest/Blob declarative content into the pipeline.
    """
    future_manifests = []
    tag_list = []
    to_download = []
    man_dcs = {}
    total_blobs = []

    with ProgressBar(message='Downloading tag list', total=1) as pb:
        repo_name = self.remote.namespaced_upstream_name
        relative_url = '/v2/{name}/tags/list'.format(name=repo_name)
        tag_list_url = urljoin(self.remote.url, relative_url)
        list_downloader = self.remote.get_downloader(url=tag_list_url)
        await list_downloader.run(extra_data={'repo_name': repo_name})
        with open(list_downloader.path) as tags_raw:
            tags_dict = json.loads(tags_raw.read())
            tag_list = tags_dict['tags']

        # check for the presence of the pagination link header
        link = list_downloader.response_headers.get('Link')
        await self.handle_pagination(link, repo_name, tag_list)
        # Restrict to whitelisted tags (comma-separated) when a whitelist is set.
        whitelist_tags = self.remote.whitelist_tags
        if whitelist_tags:
            tag_list = list(set(tag_list) & set(whitelist_tags.split(',')))
        pb.increment()

    for tag_name in tag_list:
        relative_url = '/v2/{name}/manifests/{tag}'.format(
            name=self.remote.namespaced_upstream_name,
            tag=tag_name,
        )
        url = urljoin(self.remote.url, relative_url)
        downloader = self.remote.get_downloader(url=url)
        # Coroutines only — awaited concurrently below via as_completed.
        to_download.append(
            downloader.run(extra_data={'headers': V2_ACCEPT_HEADERS}))

    pb_parsed_tags = ProgressBar(message='Processing Tags', state='running')

    for download_tag in asyncio.as_completed(to_download):
        tag = await download_tag
        # Binary read: raw_data is also passed on for digest/schema handling.
        with open(tag.path, 'rb') as content_file:
            raw_data = content_file.read()
        content_data = json.loads(raw_data)
        media_type = content_data.get('mediaType')
        tag.artifact_attributes['file'] = tag.path
        saved_artifact = Artifact(**tag.artifact_attributes)
        try:
            saved_artifact.save()
        except IntegrityError:
            # Artifact already exists; 'file' is not a lookup field, so drop it
            # and fetch the stored row instead.
            del tag.artifact_attributes['file']
            saved_artifact = Artifact.objects.get(
                **tag.artifact_attributes)
        tag_dc = self.create_tag(saved_artifact, tag.url)

        if media_type == MEDIA_TYPE.MANIFEST_LIST:
            list_dc = self.create_tagged_manifest_list(tag_dc, content_data)
            await self.put(list_dc)
            tag_dc.extra_data['man_relation'] = list_dc
            # Queue every child manifest of the manifest list; their content is
            # resolved later via the futures collected here.
            for manifest_data in content_data.get('manifests'):
                man_dc = self.create_manifest(list_dc, manifest_data)
                future_manifests.append(man_dc.get_or_create_future())
                man_dcs[man_dc.content.digest] = man_dc
                await self.put(man_dc)
        else:
            man_dc = self.create_tagged_manifest(tag_dc, content_data, raw_data)
            await self.put(man_dc)
            tag_dc.extra_data['man_relation'] = man_dc
            self.handle_blobs(man_dc, content_data, total_blobs)
        await self.put(tag_dc)
        pb_parsed_tags.increment()

    pb_parsed_tags.state = 'completed'
    pb_parsed_tags.total = pb_parsed_tags.done
    pb_parsed_tags.save()

    # Once child manifests are persisted, read each one back and queue its blobs.
    for manifest_future in asyncio.as_completed(future_manifests):
        man = await manifest_future
        with man._artifacts.get().file.open() as content_file:
            raw = content_file.read()
        content_data = json.loads(raw)
        man_dc = man_dcs[man.digest]
        self.handle_blobs(man_dc, content_data, total_blobs)
    for blob in total_blobs:
        await self.put(blob)