def reindex_packages_from_store(
    dao: Dao,
    config: Config,
    channel_name: str,
    user_id: bytes,
):
    """Reindex packages from files in the package store"""

    db = dao.db
    pkgstore = config.get_package_store()
    all_files = pkgstore.list_files(channel_name)
    pkg_files = [f for f in all_files if f.endswith(".tar.bz2")]

    channel = dao.get_channel(channel_name)

    if channel:
        for package in channel.packages:
            db.delete(package)
        db.commit()
    else:
        data = rest_models.Channel(
            name=channel_name,
            description="re-indexed from files",
            private=True,
        )
        channel = dao.create_channel(data, user_id, authorization.OWNER)

    for fname in pkg_files:
        fid = pkgstore.serve_path(channel_name, fname)
        handle_file(channel_name, fname, fid, dao, user_id)

    update_indexes(dao, pkgstore, channel_name)
def get_channel_or_fail(
    channel_name: str, dao: Dao = Depends(get_dao)
) -> db_models.Channel:
    channel = dao.get_channel(channel_name)

    if not channel:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Channel {channel_name} not found",
        )

    return channel
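# Usage sketch (assumed, not part of the original source): wiring
# get_channel_or_fail into a FastAPI route so a missing channel is reported as
# the 404 above instead of failing later with an AttributeError. The router
# name and route path are illustrative assumptions.
from fastapi import APIRouter

api_router = APIRouter()


@api_router.get("/api/channels/{channel_name}", response_model=rest_models.Channel)
def get_channel(channel: db_models.Channel = Depends(get_channel_or_fail)):
    return channel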
def synchronize_packages(
    channel_name: str,
    dao: Dao,
    pkgstore: PackageStore,
    auth: authorization.Rules,
    session: requests.Session,
    includelist: Optional[List[str]] = None,
    excludelist: Optional[List[str]] = None,
    use_repodata: bool = False,
):
    logger.debug(f"executing synchronize_packages task in a process {os.getpid()}")

    new_channel = dao.get_channel(channel_name)

    if not new_channel:
        logger.error(f"channel {channel_name} not found")
        return

    host = new_channel.mirror_channel_url
    remote_repo = RemoteRepository(new_channel.mirror_channel_url, session)
    user_id = auth.assert_user()

    try:
        channel_data = remote_repo.open("channeldata.json").json()
        if use_repodata:
            create_packages_from_channeldata(channel_name, user_id, channel_data, dao)
        subdirs = channel_data.get("subdirs", [])
    except (RemoteFileNotFound, json.JSONDecodeError):
        subdirs = None
    except RemoteServerError:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail=f"Remote channel {host} unavailable",
        )

    # if no channel data use known architectures
    if subdirs is None:
        subdirs = KNOWN_SUBDIRS

    for arch in subdirs:
        initial_sync_mirror(
            new_channel.name,
            remote_repo,
            arch,
            dao,
            pkgstore,
            auth,
            includelist,
            excludelist,
            use_repodata=use_repodata,
        )
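# Illustrative fallback used when channeldata.json cannot be fetched or parsed.
# The real KNOWN_SUBDIRS constant is defined elsewhere in the package; the
# tuple below is an assumption listing common conda platforms.
KNOWN_SUBDIRS = (
    "noarch",
    "linux-64",
    "linux-aarch64",
    "linux-ppc64le",
    "osx-64",
    "osx-arm64",
    "win-64",
)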
def post_channel(
    new_channel: rest_models.Channel,
    dao: Dao = Depends(get_dao),
    auth: authorization.Rules = Depends(get_rules),
):
    user_id = auth.assert_user()

    channel = dao.get_channel(new_channel.name)

    if channel:
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail=f"Channel {new_channel.name} exists",
        )

    dao.create_channel(new_channel, user_id, authorization.OWNER)
def post_channel(
    new_channel: rest_models.Channel,
    background_tasks: BackgroundTasks,
    dao: Dao = Depends(get_dao),
    auth: authorization.Rules = Depends(get_rules),
    task: Task = Depends(get_tasks_worker),
    remote_session: requests.Session = Depends(get_remote_session),
):
    user_id = auth.assert_user()

    channel = dao.get_channel(new_channel.name)

    if channel:
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail=f"Channel {new_channel.name} exists",
        )

    if not new_channel.mirror_channel_url:
        auth.assert_create_channel()

    is_mirror = new_channel.mirror_channel_url and new_channel.mirror_mode == "mirror"
    is_proxy = new_channel.mirror_channel_url and new_channel.mirror_mode == "proxy"

    if is_mirror:
        auth.assert_create_mirror_channel()

    if is_proxy:
        auth.assert_create_proxy_channel()

    if new_channel.metadata.actions is None:
        if is_mirror:
            actions = [ChannelActionEnum.synchronize]
        else:
            actions = []
    else:
        actions = new_channel.metadata.actions

    channel = dao.create_channel(new_channel, user_id, authorization.OWNER)

    for action in actions:
        task.execute_channel_action(action, channel)
def synchronize_packages(
    channel_name: str,
    dao: Dao,
    pkgstore: PackageStore,
    auth: authorization.Rules,
    session: requests.Session,
):
    logger.debug(f"executing synchronize_packages task in a process {os.getpid()}")

    new_channel = dao.get_channel(channel_name)
    host = new_channel.mirror_channel_url
    remote_repo = RemoteRepository(new_channel.mirror_channel_url, session)

    try:
        channel_data = remote_repo.open("channeldata.json").json()
        subdirs = channel_data.get("subdirs", [])
    except (RemoteFileNotFound, json.JSONDecodeError):
        subdirs = None
    except RemoteServerError:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail=f"Remote channel {host} unavailable",
        )

    # if no channel data use known architectures
    if subdirs is None:
        subdirs = KNOWN_SUBDIRS

    for arch in subdirs:
        initial_sync_mirror(
            new_channel.name,
            remote_repo,
            arch,
            dao,
            pkgstore,
            auth,
        )
def __call__(
    self,
    channel_name: str,
    dao: Dao = Depends(get_dao),
    auth: authorization.Rules = Depends(get_rules),
) -> db_models.Channel:
    channel = dao.get_channel(channel_name.lower())

    if not channel:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Channel {channel_name} not found",
        )

    auth.assert_channel_read(channel)

    mirror_url = channel.mirror_channel_url
    is_proxy = mirror_url and channel.mirror_mode == "proxy"
    is_mirror = mirror_url and channel.mirror_mode == "mirror"
    is_local = not mirror_url

    if is_proxy and not self.allow_proxy:
        raise HTTPException(
            status_code=status.HTTP_405_METHOD_NOT_ALLOWED,
            detail="This method is not implemented for proxy channels",
        )
    if is_mirror and not self.allow_mirror:
        raise HTTPException(
            status_code=status.HTTP_405_METHOD_NOT_ALLOWED,
            detail="This method is not implemented for mirror channels",
        )
    if is_local and not self.allow_local:
        raise HTTPException(
            status_code=status.HTTP_405_METHOD_NOT_ALLOWED,
            detail="This method is not implemented for local channels",
        )

    return channel
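# Hedged sketch of the class that carries the __call__ dependency above. The
# class name ChannelChecker, the flag defaults, and the instance names are
# assumptions; only the allow_proxy/allow_mirror/allow_local attributes are
# implied by the original code.
class ChannelChecker:
    def __init__(
        self,
        allow_proxy: bool = False,
        allow_mirror: bool = True,
        allow_local: bool = True,
    ):
        self.allow_proxy = allow_proxy
        self.allow_mirror = allow_mirror
        self.allow_local = allow_local

    # __call__ is the method defined above


# Module-level instances can then be passed to Depends() so each route opts in
# to the channel kinds it supports.
get_channel_allow_proxy = ChannelChecker(allow_proxy=True)
get_channel_mirror_only = ChannelChecker(allow_local=False)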
def post_channel(
    request: Request,
    new_channel: rest_models.Channel,
    background_tasks: BackgroundTasks,
    mirror_api_key: Optional[str] = None,
    register_mirror: bool = False,
    dao: Dao = Depends(get_dao),
    auth: authorization.Rules = Depends(get_rules),
    task: Task = Depends(get_tasks_worker),
    config=Depends(get_config),
    session: requests.Session = Depends(get_remote_session),
):
    user_id = auth.assert_user()

    existing_channel = dao.get_channel(new_channel.name)

    if existing_channel:
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail=f"Channel {new_channel.name} exists",
        )

    if not new_channel.mirror_channel_url:
        auth.assert_create_channel()

    is_mirror = new_channel.mirror_channel_url and new_channel.mirror_mode == "mirror"
    is_proxy = new_channel.mirror_channel_url and new_channel.mirror_mode == "proxy"

    if is_mirror:
        auth.assert_create_mirror_channel()

    if is_proxy:
        auth.assert_create_proxy_channel()

    if new_channel.actions is None:
        if is_mirror:
            actions = [ChannelActionEnum.synchronize_repodata]
        else:
            actions = []
    else:
        actions = new_channel.actions

    includelist = new_channel.metadata.includelist
    excludelist = new_channel.metadata.excludelist

    if includelist is not None and excludelist is not None:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail="Cannot use both `includelist` and `excludelist` together.",
        )

    user_attrs = new_channel.dict(exclude_unset=True)

    if "size_limit" in user_attrs:
        auth.assert_set_channel_size_limit()
        size_limit = new_channel.size_limit
    else:
        if config.configured_section("quotas"):
            size_limit = config.quotas_channel_quota
        else:
            size_limit = None

    channel = dao.create_channel(new_channel, user_id, authorization.OWNER, size_limit)
    pkgstore.create_channel(new_channel.name)
    indexing.update_indexes(dao, pkgstore, new_channel.name)

    # register mirror
    if is_mirror and register_mirror:
        mirror_url = str(new_channel.mirror_channel_url)
        mirror_url = mirror_url.replace("get", "api/channels")
        headers = {"x-api-key": mirror_api_key} if mirror_api_key else {}
        api_endpoint = str(request.url.replace(query=None)) + '/' + new_channel.name
        response = session.post(
            mirror_url + '/mirrors',
            json={
                "url": api_endpoint.replace("api/channels", "get"),
                "api_endpoint": api_endpoint,
                "metrics_endpoint": api_endpoint.replace("api", "metrics"),
            },
            headers=headers,
        )
        if response.status_code != 201:
            logger.warning(f"could not register mirror due to error {response.text}")

    for action in actions:
        task.execute_channel_action(
            action,
            channel,
        )
def initial_sync_mirror(
    channel_name: str,
    remote_repository: RemoteRepository,
    arch: str,
    dao: Dao,
    pkgstore: PackageStore,
    auth: authorization.Rules,
    includelist: Optional[List[str]] = None,
    excludelist: Optional[List[str]] = None,
    skip_errors: bool = True,
    use_repodata: bool = False,
):
    force = True  # needed for updating packages

    for repodata_fn in ["repodata_from_packages.json", "repodata.json"]:
        try:
            repo_file = remote_repository.open(os.path.join(arch, repodata_fn))
            repodata = json.load(repo_file.file)
            break
        except RemoteServerError:
            logger.error(f"cannot get {repodata_fn} for channel {arch}/{channel_name}.")
            if repodata_fn == "repodata.json":
                logger.error(f"Giving up for {channel_name}/{arch}.")
                return
            else:
                logger.error("Trying next filename.")
                continue
        except json.JSONDecodeError:
            logger.error(
                f"{repodata_fn} badly formatted for arch {arch} "
                f"in channel {channel_name}"
            )
            if repodata_fn == "repodata.json":
                return

    channel = dao.get_channel(channel_name)

    if not channel:
        logger.error(f"channel {channel_name} not found")
        return

    from quetz.main import handle_package_files

    packages = repodata.get("packages", {})

    version_methods = [
        _check_checksum(dao, channel_name, arch, "sha256"),
        _check_checksum(dao, channel_name, arch, "md5"),
    ]

    config = Config()
    max_batch_length = config.mirroring_batch_length
    max_batch_size = config.mirroring_batch_size

    # version_methods are context managers (for example, to update the db
    # after all packages have been checked), so we need to enter the context
    # for each
    any_updated = False
    with contextlib.ExitStack() as version_stack:

        version_checks = [
            version_stack.enter_context(method) for method in version_methods
        ]

        update_batch = []
        update_size = 0

        def handle_batch(update_batch):
            logger.debug(f"Handling batch: {update_batch}")
            if not update_batch:
                return False

            remote_packages = []
            remote_packages_with_metadata = []

            with ThreadPoolExecutor(
                max_workers=config.mirroring_num_parallel_downloads
            ) as executor:
                for f in executor.map(
                    download_file,
                    (remote_repository,) * len(update_batch),
                    update_batch,
                ):
                    if f is not None:
                        remote_packages.append(f[0])
                        remote_packages_with_metadata.append(f)

            try:
                if use_repodata:
                    handle_repodata_package(
                        channel,
                        remote_packages_with_metadata,
                        dao,
                        auth,
                        force,
                        pkgstore,
                        config,
                    )
                else:
                    handle_package_files(
                        channel,
                        remote_packages,
                        dao,
                        auth,
                        force,
                        is_mirror_op=True,
                    )
                return True
            except Exception as exc:
                logger.error(
                    f"could not process package {update_batch} from channel "
                    f"{channel_name} due to error {exc} of "
                    f"type {exc.__class__.__name__}"
                )
                if not skip_errors:
                    raise exc

            return False

        for package_name, metadata in packages.items():
            if check_package_membership(package_name, includelist, excludelist):
                path = os.path.join(arch, package_name)

                # try to find out whether it's a new package version
                is_uptodate = None
                for _check in version_checks:
                    is_uptodate = _check(package_name, metadata)
                    if is_uptodate is not None:
                        break

                # if package is up-to-date skip uploading file
                if is_uptodate:
                    continue
                else:
                    logger.debug(f"updating package {package_name} from {arch}")

                update_batch.append((path, package_name, metadata))
                update_size += metadata.get('size', 100_000)

            if len(update_batch) >= max_batch_length or update_size >= max_batch_size:
                logger.debug(f"Executing batch with {update_size}")
                any_updated |= handle_batch(update_batch)
                update_batch.clear()
                update_size = 0

        # handle final batch
        any_updated |= handle_batch(update_batch)

    if any_updated:
        indexing.update_indexes(dao, pkgstore, channel_name, subdirs=[arch])
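# Hypothetical sketch of the check_package_membership helper used above (the
# real implementation may differ): keep a package when it matches the
# includelist, if one is given, and is not matched by the excludelist.
def check_package_membership(
    package_filename: str,
    includelist: Optional[List[str]] = None,
    excludelist: Optional[List[str]] = None,
) -> bool:
    # "numpy-1.21.0-py39_0.tar.bz2" -> "numpy"
    package_name = package_filename.rsplit("-", 2)[0]
    if includelist is not None:
        return any(package_name.startswith(name) for name in includelist)
    if excludelist is not None:
        return not any(package_name.startswith(name) for name in excludelist)
    return True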
def synchronize_metrics_from_mirrors(
    channel_name: str,
    dao: Dao,
    session: requests.Session,
    now: datetime = datetime.utcnow(),
):
    logger = logging.getLogger("quetz")
    channel = dao.get_channel(channel_name)
    if not channel:
        return
    for m in channel.mirrors:
        if not m.metrics_endpoint:
            logger.warning(
                f"metrics endpoint not configured for mirror {m.url}. "
                "Skipping metrics synchronisation"
            )
            continue
        query_str = ["period=H"]

        start_time: Optional[datetime]
        if m.last_synchronised:
            start_time = m.last_synchronised.replace(minute=0, second=0, microsecond=0)
            query_str.append(f"start={start_time.isoformat()}")
        else:
            start_time = None

        # exclude incomplete intervals (the current hour)
        end_time = now.replace(minute=0, second=0, microsecond=0)

        if start_time == end_time:
            logger.debug(f"metrics data for mirror {m.url} are up-to-date")
            continue

        query_str.append(f"end={end_time.isoformat()}")

        metrics_url = m.metrics_endpoint + "?" + "&".join(query_str)
        response = session.get(metrics_url)

        if response.status_code != 200:
            logger.error(
                f"mirror server {metrics_url} returned bad response with code "
                f"{response.status_code} and message {response.text}"
            )
            continue

        response_data = response.json()
        try:
            packages = response_data["packages"]
        except KeyError:
            logger.error(
                f"malformed response received from {metrics_url}: "
                "missing 'packages' key"
            )
            continue

        for platform_filename, data in packages.items():
            platform, filename = platform_filename.split('/')
            for s in data["series"]:
                timestamp = datetime.fromisoformat(s["timestamp"])
                count = s["count"]
                dao.incr_download_count(
                    channel_name, filename, platform, timestamp, count
                )
        logger.debug(f"synchronized metrics from {metrics_url}")
        m.last_synchronised = end_time
        dao.db.commit()
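# Illustrative payload (inferred from the parsing above, values are made up):
# the mirror's metrics endpoint is expected to return one entry per
# "<platform>/<filename>" key, each carrying an hourly download series.
#
# {
#     "packages": {
#         "linux-64/numpy-1.21.0-py39_0.tar.bz2": {
#             "series": [
#                 {"timestamp": "2024-01-01T10:00:00", "count": 3},
#                 {"timestamp": "2024-01-01T11:00:00", "count": 1}
#             ]
#         }
#     }
# }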
def reindex_packages_from_store(
    dao: Dao,
    config: Config,
    channel_name: str,
    user_id,
    sync: bool = True,
):
    """Reindex packages from files in the package store"""

    logger.debug(f"Re-indexing channel {channel_name}")

    channel = dao.get_channel(channel_name)
    pkg_db = []
    if channel:
        if not sync:
            for package in channel.packages:
                dao.db.delete(package)
            dao.db.commit()
        else:
            dao.cleanup_channel_db(channel_name)
            for package in channel.packages:
                for pv in package.package_versions:  # type: ignore
                    pkg_db.append(f"{pv.platform}/{pv.filename}")
            dao.db.commit()
    else:
        data = rest_models.Channel(
            name=channel_name,
            description="re-indexed from store",
            private=True,
        )
        channel = dao.create_channel(data, user_id, authorization.OWNER)

    logger.debug(f"Reading package list for channel {channel_name}")
    user_id = uuid_to_bytes(user_id)
    pkgstore = config.get_package_store()
    all_files = pkgstore.list_files(channel_name)
    pkg_files = [f for f in all_files if f.endswith(".tar.bz2")]
    nthreads = config.general_package_unpack_threads

    logger.debug(f"Found {len(pkg_db)} packages for channel {channel_name} in database")
    logger.debug(
        f"Found {len(pkg_files)} packages for channel {channel_name} in pkgstore"
    )
    pkg_files = list(set(pkg_files) - set(pkg_db))
    logger.debug(
        f"Importing {len(pkg_files)} packages for channel {channel_name}"
        " from pkgstore"
    )

    for pkg_group in chunks(pkg_files, nthreads * 8):
        tic = time.perf_counter()
        with ThreadPoolExecutor(max_workers=nthreads) as executor:
            results = []
            for fname in pkg_group:
                results.append(
                    executor.submit(handle_condainfo, pkgstore, channel_name, fname)
                )
            for future in as_completed(results):
                condainfo = future.result()
                if condainfo:
                    handle_file(channel_name, condainfo, dao, user_id)

        toc = time.perf_counter()
        logger.debug(
            f"Imported files {pkg_group[0]} to {pkg_group[-1]} "
            f"for channel {channel_name} in {toc - tic:0.4f} seconds "
            f"using {nthreads} threads"
        )

        try:
            update_indexes(dao, pkgstore, channel_name)
            dao.db.commit()
        except IntegrityError:
            dao.rollback()
            logger.error(f"Update index {channel_name} failed")

    dao.cleanup_channel_db(channel_name)
    dao.db.commit()
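# Hypothetical sketch of the chunks helper used above (the real helper may
# differ): yield consecutive groups of at most `size` items, so each group can
# be unpacked by one ThreadPoolExecutor pass.
from typing import Iterator, Sequence, TypeVar

T = TypeVar("T")


def chunks(items: Sequence[T], size: int) -> Iterator[Sequence[T]]:
    for start in range(0, len(items), size):
        yield items[start:start + size]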
def initial_sync_mirror(
    channel_name: str,
    remote_repository: RemoteRepository,
    arch: str,
    dao: Dao,
    pkgstore: PackageStore,
    auth: authorization.Rules,
    skip_errors: bool = True,
):
    force = True  # needed for updating packages

    try:
        repo_file = remote_repository.open(os.path.join(arch, "repodata.json"))
        repodata = json.load(repo_file.file)
    except RemoteServerError:
        logger.error(f"cannot get repodata.json for channel {channel_name}")
        return
    except json.JSONDecodeError:
        logger.error(
            f"repodata.json badly formatted for arch {arch} in channel {channel_name}"
        )
        return

    channel = dao.get_channel(channel_name)

    from quetz.main import handle_package_files

    packages = repodata.get("packages", {})

    version_methods = [
        _check_timestamp(channel, dao),
        _check_checksum(pkgstore, channel_name, arch, "sha256"),
        _check_checksum(pkgstore, channel_name, arch, "md5"),
    ]

    # version_methods are context managers (for example, to update the db
    # after all packages have been checked), so we need to enter the context
    # for each
    any_updated = False
    with contextlib.ExitStack() as version_stack:

        version_checks = [
            version_stack.enter_context(method) for method in version_methods
        ]

        for package_name, metadata in packages.items():
            path = os.path.join(arch, package_name)

            # try to find out whether it's a new package version
            is_uptodate = None
            for _check in version_checks:
                is_uptodate = _check(package_name, metadata)
                if is_uptodate is not None:
                    break

            # if package is up-to-date skip uploading file
            if is_uptodate:
                logger.debug(
                    f"package {package_name} from {arch} up-to-date. Not updating"
                )
                continue
            else:
                logger.debug(f"updating package {package_name} from {arch}")

            try:
                remote_package = remote_repository.open(path)
            except RemoteServerError:
                logger.error(f"remote server error when getting a file {path}")
                continue

            files = [remote_package]
            try:
                handle_package_files(
                    channel_name,
                    files,
                    dao,
                    auth,
                    force,
                )
                any_updated = True
            except Exception as exc:
                logger.error(
                    f"could not process package {package_name} from channel "
                    f"{channel_name} due to error {exc}"
                )
                if not skip_errors:
                    raise exc

    if any_updated:
        indexing.update_indexes(dao, pkgstore, channel_name, subdirs=[arch])