def download(cfg, url, local_path, size_in_bytes):
    """
    Fetches ``url`` into ``local_path`` unless a file is already present there.

    :param cfg: The config object. Only the setting ("system", "offline.mode") is read.
    :param url: The URL to download from.
    :param local_path: The path of the file that should exist afterwards.
    :param size_in_bytes: The expected size of the download. It is used for the progress message and passed on to
                          the downloader.
    :raises exceptions.SystemSetupError: If the file is still missing after the download attempt (or in offline mode).
    """
    offline = cfg.opts("system", "offline.mode")
    if os.path.isfile(local_path):
        logger.info("[%s] already exists locally. Skipping download." % local_path)
        return

    if not offline:
        logger.info("Downloading from [%s] to [%s]." % (url, local_path))
        try:
            target_dir = os.path.dirname(local_path)
            io.ensure_dir(target_dir)
            megabytes = round(convert.bytes_to_mb(size_in_bytes))
            # flush so the message is visible before the transfer starts
            print("Downloading data from %s (%s MB) ... " % (url, megabytes), end='', flush=True)
            net.download(url, local_path, size_in_bytes)
            print("Done")
        except urllib.error.URLError:
            # only logged here; the check below decides whether this is fatal
            logger.exception("Could not download [%s] to [%s]." % (url, local_path))

    # regardless of the path taken above, the file has to be there now
    if os.path.isfile(local_path):
        return
    if offline:
        raise exceptions.SystemSetupError(
            "Cannot find %s. Please disable offline mode and retry again." % local_path)
    raise exceptions.SystemSetupError(
        "Could not download from %s to %s. Please verify that data are available at %s and "
        "check your internet connection." % (url, local_path, url))
def download(cfg, url, local_path, size_in_bytes):
    """
    Downloads ``url`` to ``local_path`` with a progress indicator and verifies the size of the result.

    The download is skipped if the file already exists and either no expected size is given or the size matches.

    NOTE(review): ``track`` is not a parameter of this function; it is presumably provided by an enclosing
    scope - confirm.

    :param cfg: The config object. Only the setting ("system", "offline.mode") is read.
    :param url: The URL to download from.
    :param local_path: The path of the file that should exist afterwards.
    :param size_in_bytes: The expected file size in bytes or ``None`` if it is unknown.
    :return: ``False`` if the download has been skipped, ``True`` otherwise.
    :raises exceptions.SystemSetupError: If the file does not exist after the download attempt.
    :raises exceptions.DataError: If the size of the file differs from the expected size.
    """
    offline = cfg.opts("system", "offline.mode")
    if os.path.isfile(local_path):
        # an existing file only counts if its size is also what we expect
        size_matches = size_in_bytes is None or os.path.getsize(local_path) == size_in_bytes
        if size_matches:
            logger.info("[%s] already exists locally. Skipping download." % local_path)
            return False

    if not offline:
        try:
            io.ensure_dir(os.path.dirname(local_path))
            if size_in_bytes:
                megabytes = round(convert.bytes_to_mb(size_in_bytes))
                start_message = "Downloading data from [%s] (%s MB) to [%s]." % (url, megabytes, local_path)
            else:
                start_message = "Downloading data from [%s] to [%s]." % (url, local_path)
            logger.info(start_message)

            # these files are typically very large, hence the finer-grained progress
            indicator = net.Progress("[INFO] Downloading data for track %s" % track.name, accuracy=1)
            net.download(url, local_path, size_in_bytes, progress_indicator=indicator)
            indicator.finish()
            logger.info("Downloaded data from [%s] to [%s]." % (url, local_path))
        except urllib.error.URLError:
            # only logged here; the checks below decide whether this is fatal
            logger.exception("Could not download [%s] to [%s]." % (url, local_path))

    # the file has to be there now
    if not os.path.isfile(local_path):
        if offline:
            raise exceptions.SystemSetupError(
                "Cannot find %s. Please disable offline mode and retry again." % local_path)
        raise exceptions.SystemSetupError(
            "Cannot download from %s to %s. Please verify that data are available at %s and "
            "check your internet connection." % (url, local_path, url))

    actual_size = os.path.getsize(local_path)
    size_known = size_in_bytes is not None
    if size_known and actual_size != size_in_bytes:
        raise exceptions.DataError(
            "[%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected." % (local_path, actual_size, size_in_bytes))
    return True
def download(cfg, url, local_path, size_in_bytes):
    """
    Downloads ``url`` to ``local_path``, reports on the console and verifies the size of the result.

    The download is skipped if the file already exists and either no expected size is given or the size matches.

    :param cfg: The config object. Only the setting ("system", "offline.mode") is read.
    :param url: The URL to download from.
    :param local_path: The path of the file that should exist afterwards.
    :param size_in_bytes: The expected file size in bytes or ``None`` if it is unknown.
    :return: ``False`` if the download has been skipped, ``True`` otherwise.
    :raises exceptions.SystemSetupError: If the file does not exist after the download attempt.
    :raises exceptions.DataError: If the size of the file differs from the expected size.
    """
    offline = cfg.opts("system", "offline.mode")
    if os.path.isfile(local_path):
        # an existing file only counts if its size is also what we expect
        size_matches = size_in_bytes is None or os.path.getsize(local_path) == size_in_bytes
        if size_matches:
            logger.info("[%s] already exists locally. Skipping download." % local_path)
            return False

    if not offline:
        try:
            io.ensure_dir(os.path.dirname(local_path))
            if size_in_bytes:
                megabytes = round(convert.bytes_to_mb(size_in_bytes))
                announcement = "Downloading data from [%s] (%s MB) to [%s] ... " % (url, megabytes, local_path)
            else:
                announcement = "Downloading data from [%s] to [%s] ... " % (url, local_path)
            # no line break and an explicit flush: the message is completed with "[OK]" below
            console.info(announcement, end='', flush=True, logger=logger)
            net.download(url, local_path, size_in_bytes)
            console.println("[OK]")
        except urllib.error.URLError:
            # only logged here; the checks below decide whether this is fatal
            logger.exception("Could not download [%s] to [%s]." % (url, local_path))

    # the file has to be there now
    if not os.path.isfile(local_path):
        if offline:
            raise exceptions.SystemSetupError(
                "Cannot find %s. Please disable offline mode and retry again." % local_path)
        raise exceptions.SystemSetupError(
            "Cannot download from %s to %s. Please verify that data are available at %s and "
            "check your internet connection." % (url, local_path, url))

    actual_size = os.path.getsize(local_path)
    size_known = size_in_bytes is not None
    if size_known and actual_size != size_in_bytes:
        raise exceptions.DataError(
            "[%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected." % (local_path, actual_size, size_in_bytes))
    return True
def filters_from_included_tasks(included_tasks):
    """
    Translates task specifications into filter objects.

    A specification is either a plain task name (``"my-task"``) or an operation type prefixed with ``type:``
    (``"type:search"``).

    :param included_tasks: An iterable of task specification strings. May be ``None`` or empty.
    :return: A list with one filter per specification; an empty list if there are no specifications.
    :raises exceptions.SystemSetupError: If a specification has an unknown prefix or more than one colon.
    """
    if not included_tasks:
        return []

    filters = []
    for t in included_tasks:
        parts = t.split(":")
        if len(parts) == 1:
            filters.append(track.TaskNameFilter(parts[0]))
            continue
        if len(parts) != 2:
            raise exceptions.SystemSetupError("Invalid format for included tasks: [%s]" % t)

        prefix, value = parts
        if prefix != "type":
            raise exceptions.SystemSetupError("Invalid format for included tasks: [%s]. Expected [type] but got [%s]." % (t, prefix))
        filters.append(track.TaskOpTypeFilter(value))
    return filters
def load_track(cfg):
    """
    Loads the track that is named in the config.

    :param cfg: The config object. The track name is read from ("benchmarks", "track").
    :return: The loaded track. If ("benchmarks", "test.mode") is set, the track is post-processed for test mode first.
    :raises exceptions.SystemSetupError: If a ``FileNotFoundError`` occurs while loading the track.
    """
    track_name = cfg.opts("benchmarks", "track")
    try:
        repo = TrackRepository(cfg)
        reader = TrackFileReader(cfg)
        distribution_version = cfg.opts("source", "distribution.version", mandatory=False)
        data_root = cfg.opts("benchmarks", "local.dataset.cache")

        track_file = repo.track_file(distribution_version, track_name)
        track_dir = repo.track_dir(track_name)
        # data for a track are kept in a subdirectory named after the lower-cased track name
        track_data_dir = "%s/%s" % (data_root, track_name.lower())
        full_track = reader.read(track_name, track_file, track_dir, track_data_dir)

        if cfg.opts("benchmarks", "test.mode"):
            return post_process_for_test_mode(full_track)
        return full_track
    except FileNotFoundError:
        logger.exception("Cannot load track [%s]" % track_name)
        raise exceptions.SystemSetupError(
            "Cannot load track %s. List the available tracks with %s list tracks." % (track_name, PROGRAM_NAME))
def load_track(cfg):
    """
    Loads the track from the configured track repository, applies the task filters and detects plugins.

    :param cfg: The config object. The settings ("track", "include.tasks") and ("track", "test.mode.enabled") are
                read here.
    :return: The loaded track. If test mode is enabled, the track is post-processed for test mode first.
    :raises exceptions.SystemSetupError: If a ``FileNotFoundError`` occurs while loading the track.
    """
    # stays None in the error message if resolving the repository already fails
    track_name = None
    try:
        repo = track_repo(cfg)
        track_name = repo.track_name
        track_dir = repo.track_dir(track_name)
        reader = TrackFileReader(cfg)
        included_tasks = cfg.opts("track", "include.tasks")

        track_file = repo.track_file(track_name)
        loaded = reader.read(track_name, track_file, track_dir)
        task_filters = filters_from_included_tasks(included_tasks)
        loaded = filter_included_tasks(loaded, task_filters)

        loaded.has_plugins = TrackPluginReader(track_dir).can_load()

        if cfg.opts("track", "test.mode.enabled"):
            return post_process_for_test_mode(loaded)
        return loaded
    except FileNotFoundError:
        logger.exception("Cannot load track [%s]" % track_name)
        raise exceptions.SystemSetupError(
            "Cannot load track %s. List the available tracks with %s list tracks." % (track_name, PROGRAM_NAME))
def _update(self, distribution_version):
    """
    Switches the local track repository to the branch that best matches the provided distribution version.

    For a remote repository that is not used in offline mode, the best matching remote branch is rebased. If there
    is no such branch (or the repository is local only or offline), the best matching local branch is checked out
    instead.

    :param distribution_version: The distribution version for which a matching branch is looked up.
    :raises exceptions.SystemSetupError: If neither a remote nor a local branch matches the distribution version.
    :raises exceptions.DataError: If a git operation fails with a ``SupplyError``.
    """
    try:
        if self.remote and not self.offline:
            branch = versions.best_match(git.branches(self.tracks_dir, remote=self.remote), distribution_version)
            if branch:
                logger.info("Rebasing on '%s' in '%s' for distribution version '%s'." %
                            (branch, self.tracks_dir, distribution_version))
                git.rebase(self.tracks_dir, branch=branch)
                return
            else:
                msg = "Could not find track data remotely for distribution version %s. " \
                      "Trying to find track data locally." % distribution_version
                # Logger.warn() is a deprecated alias and is not available on recent Python versions
                logger.warning(msg)
        # fall back to the branches that are available locally
        branch = versions.best_match(git.branches(self.tracks_dir, remote=False), distribution_version)
        if branch:
            logger.info("Checking out '%s' in '%s' for distribution version '%s'." %
                        (branch, self.tracks_dir, distribution_version))
            git.checkout(self.tracks_dir, branch=branch)
        else:
            raise exceptions.SystemSetupError("Cannot find track data for distribution version %s" % distribution_version)
    except exceptions.SupplyError as e:
        # keep the original failure attached as the explicit cause
        raise exceptions.DataError("Cannot update track data in '%s': %s" % (self.tracks_dir, e)) from e
def _update(self, distribution_version):
    """
    Switches the local track repository to the branch that best matches the provided distribution version.

    For a remote repository that is not used in offline mode, the best matching remote branch is checked out and
    rebased; a failing rebase only produces a warning. If there is no such branch (or the repository is local only
    or offline), the best matching local branch is checked out instead.

    :param distribution_version: The distribution version for which a matching branch is looked up.
    :raises exceptions.SystemSetupError: If neither a remote nor a local branch matches the distribution version.
    :raises exceptions.DataError: If a git operation (other than the rebase) fails with a ``SupplyError``.
    """
    repo_dir = self.tracks_dir
    try:
        if self.remote and not self.offline:
            remote_branch = versions.best_match(git.branches(repo_dir, remote=self.remote), distribution_version)
            if not remote_branch:
                msg = "Could not find track data remotely for distribution version [%s]. " \
                      "Trying to find track data locally." % distribution_version
                logger.warning(msg)
            else:
                # Allow uncommitted changes iff we do not have to change the branch
                logger.info("Checking out [%s] in [%s] for distribution version [%s]." %
                            (remote_branch, repo_dir, distribution_version))
                git.checkout(repo_dir, branch=remote_branch)
                logger.info("Rebasing on [%s] in [%s] for distribution version [%s]." %
                            (remote_branch, repo_dir, distribution_version))
                try:
                    git.rebase(repo_dir, branch=remote_branch)
                except exceptions.SupplyError:
                    # a failed rebase is tolerated: warn and continue with the checked out state
                    logger.exception("Cannot rebase due to local changes in [%s]" % repo_dir)
                    console.warn("Local changes in [%s] prevent track update from remote. Please commit your changes." % repo_dir)
                return

        # fall back to the branches that are available locally
        local_branch = versions.best_match(git.branches(repo_dir, remote=False), distribution_version)
        if not local_branch:
            raise exceptions.SystemSetupError("Cannot find track data for distribution version %s" % distribution_version)
        logger.info("Checking out [%s] in [%s] for distribution version [%s]." %
                    (local_branch, repo_dir, distribution_version))
        git.checkout(repo_dir, branch=local_branch)
    except exceptions.SupplyError:
        # re-raise as a data error but keep the traceback of the original failure
        raise exceptions.DataError("Cannot update track data in [%s]." % repo_dir).with_traceback(sys.exc_info()[2])
def __init__(self, cfg, fetch=True):
    """
    Sets up the track repository and ensures that its directory is a git working copy.

    :param cfg: The config object.
    :param fetch: Whether a remote repository should be cloned or fetched (unless in offline mode).
                  Defaults to ``True``.
    :raises exceptions.SystemSetupError: If nothing is cloned or fetched and the directory is not a git repository.
    """
    self.cfg = cfg
    self.name = cfg.opts("track", "repository.name")
    self.offline = cfg.opts("system", "offline.mode")
    # Without a URL this is treated as a local only repo (which still has to be a git repo)
    self.url = cfg.opts("tracks", "%s.url" % self.name, mandatory=False)
    self.remote = not (self.url is None or self.url.strip() == "")

    root = cfg.opts("node", "root.dir")
    track_repositories = cfg.opts("benchmarks", "track.repository.dir")
    self.tracks_dir = "%s/%s/%s" % (root, track_repositories, self.name)

    is_git_repo = git.is_working_copy(self.tracks_dir)
    if self.remote and not self.offline and fetch:
        # a normal git repo with a remote
        if is_git_repo:
            try:
                git.fetch(src=self.tracks_dir)
            except exceptions.SupplyError:
                # best effort: an outdated local state is still usable
                console.warn("Could not update tracks. Continuing with your locally available state.", logger=logger)
        else:
            git.clone(src=self.tracks_dir, remote=self.url)
    elif not is_git_repo:
        raise exceptions.SystemSetupError(
            "[{src}] must be a git repository.\n\nPlease run:\ngit -C {src} init".format(src=self.tracks_dir))
def load(self, track_plugin_path):
    """
    Loads the track plugin in the provided directory and lets it register itself with this object.

    :param track_plugin_path: The directory that contains the track plugin.
    :raises exceptions.SystemSetupError: If loading or registering the plugin fails for any reason.
    """
    plugin_name = io.basename(track_plugin_path)
    logger.info("Loading track plugin [%s] from [%s]" % (plugin_name, track_plugin_path))

    # collect every directory below the plugin path as a module directory, except those starting with "_"
    module_dirs = []
    for current_dir, subdirs, _ in os.walk(track_plugin_path):
        module_dirs.append(current_dir)
        kept = []
        for subdir in subdirs:
            if subdir.startswith("_"):
                logger.debug("Removing [%s] from load path." % subdir)
            else:
                kept.append(subdir)
        # prune in place so os.walk() does not descend into the ignored directories
        subdirs[:] = kept

    # load path is only the root of the package hierarchy
    plugin_root_path = os.path.abspath(os.path.join(track_plugin_path, os.pardir))
    logger.debug("Adding [%s] to Python load path." % plugin_root_path)
    # has to come first on the system path, otherwise the import machinery tries to load application-internal modules
    sys.path.insert(0, plugin_root_path)

    try:
        plugin_module = self._load_plugin(plugin_name, module_dirs, track_plugin_path)
        # every module needs to have a register() method
        plugin_module.register(self)
    except BaseException:
        msg = "Could not load track plugin [%s]" % plugin_name
        logger.exception(msg)
        raise exceptions.SystemSetupError(msg)
def download_corpus(root_url, target_path, size_in_bytes, track_name, offline, test_mode):
    """
    Downloads a corpus file from ``root_url`` to ``target_path`` and verifies the size of the result.

    The name of the remote file is the base name of ``target_path``.

    :param root_url: The base URL below which the corpus file is available.
    :param target_path: The local path to download to.
    :param size_in_bytes: The expected file size in bytes or ``None`` if it is unknown.
    :param track_name: The name of the track. It is only used in messages.
    :param offline: If ``True``, no download is attempted and an error is raised instead.
    :param test_mode: If ``True``, an HTTP status 404 is reported as missing test mode support of the track.
    :raises exceptions.DataError: If no root URL is provided, the download fails or the downloaded file does not have
                                  the expected size.
    :raises exceptions.SystemSetupError: In offline mode or if the file does not exist after the download.
    """
    file_name = os.path.basename(target_path)

    if not root_url:
        raise exceptions.DataError("%s is missing and it cannot be downloaded because no source URL is provided in the track."
                                   % target_path)
    if offline:
        raise exceptions.SystemSetupError("Cannot find %s. Please disable offline mode and retry again." % target_path)

    # build the URL from the validated parameter (was the undefined name "source_root_url")
    data_url = "%s/%s" % (root_url, file_name)
    try:
        io.ensure_dir(os.path.dirname(target_path))
        if size_in_bytes:
            size_in_mb = round(convert.bytes_to_mb(size_in_bytes))
            logger.info("Downloading data from [%s] (%s MB) to [%s]." % (data_url, size_in_mb, target_path))
        else:
            logger.info("Downloading data from [%s] to [%s]." % (data_url, target_path))

        # we want to have a bit more accurate download progress as these files are typically very large
        progress = net.Progress("[INFO] Downloading data for track %s" % track_name, accuracy=1)
        net.download(data_url, target_path, size_in_bytes, progress_indicator=progress)
        progress.finish()
        logger.info("Downloaded data from [%s] to [%s]." % (data_url, target_path))
    except urllib.error.HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be handled first
        if e.code == 404 and test_mode:
            raise exceptions.DataError("Track [%s] does not support test mode. Please ask the track author to add it or "
                                       "disable test mode and retry." % track_name)
        else:
            msg = "Could not download [%s] to [%s]" % (data_url, target_path)
            if e.reason:
                msg += " (HTTP status: %s, reason: %s)" % (str(e.code), e.reason)
            else:
                msg += " (HTTP status: %s)" % str(e.code)
            raise exceptions.DataError(msg)
    except urllib.error.URLError:
        logger.exception("Could not download [%s] to [%s]." % (data_url, target_path))
        raise exceptions.DataError("Could not download [%s] to [%s]." % (data_url, target_path))

    if not os.path.isfile(target_path):
        raise exceptions.SystemSetupError(
            "Cannot download from %s to %s. Please verify that data are available at %s and "
            "check your internet connection." % (data_url, target_path, data_url))

    actual_size = os.path.getsize(target_path)
    if size_in_bytes is not None and actual_size != size_in_bytes:
        raise exceptions.DataError("[%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected." %
                                   (target_path, actual_size, size_in_bytes))
def load(self):
    """
    Loads the root module of the track plugin via ``self.loader`` and lets it register itself with this object.

    :raises exceptions.SystemSetupError: If calling ``register()`` on the loaded module fails for any reason.
    """
    plugin_module = self.loader.load()
    try:
        # every module needs to have a register() method
        plugin_module.register(self)
    except BaseException:
        error_message = "Could not register track plugin at [%s]" % self.loader.root_path
        logger.exception(error_message)
        raise exceptions.SystemSetupError(error_message)
def __init__(self, track_path):
    """
    Derives the track name, track directory and track file from a path to a track.

    :param track_path: Either a directory that contains a file "track.json" or the path to a JSON file.
    :raises exceptions.SystemSetupError: If the path does not exist, is a directory without "track.json", is a file
                                         without the extension ".json" or is neither a file nor a directory.
    """
    if not os.path.exists(track_path):
        raise exceptions.SystemSetupError("Track path %s does not exist" % track_path)

    if os.path.isdir(track_path):
        # a directory: the track is named after it and the specification has a fixed file name
        self.track_name = io.basename(track_path)
        self._track_dir = track_path
        self._track_file = os.path.join(track_path, "track.json")
        if not os.path.exists(self._track_file):
            raise exceptions.SystemSetupError("Could not find track.json in %s" % track_path)
        return

    if not os.path.isfile(track_path):
        raise exceptions.SystemSetupError("%s is neither a file nor a directory" % track_path)

    if not io.has_extension(track_path, ".json"):
        raise exceptions.SystemSetupError("%s has to be a JSON file" % track_path)

    # a JSON file: the track is named after the file name without its extension
    self._track_dir = io.dirname(track_path)
    self._track_file = track_path
    file_name = io.basename(track_path)
    self.track_name = io.splitext(file_name)[0]
def __init__(self, cfg):
    """
    Sets up the track repository and ensures that its directory is a git working copy.

    A repository with a URL is cloned if there is no working copy yet and fetched otherwise.

    :param cfg: The config object.
    :raises exceptions.SystemSetupError: If no URL is configured and the directory is not a git repository.
    """
    self.cfg = cfg
    self.name = cfg.opts("system", "track.repository")
    # Without a URL this is treated as a local only repo (which still has to be a git repo)
    self.url = cfg.opts("tracks", "%s.url" % self.name, mandatory=False)
    self.remote = not (self.url is None or self.url.strip() == "")

    root = cfg.opts("system", "root.dir")
    track_repositories = cfg.opts("benchmarks", "track.repository.dir")
    self.tracks_dir = "%s/%s/%s" % (root, track_repositories, self.name)

    is_git_repo = git.is_working_copy(self.tracks_dir)
    if self.remote:
        # a normal git repo with a remote
        if is_git_repo:
            git.fetch(src=self.tracks_dir, remote=self.url)
        else:
            git.clone(src=self.tracks_dir, remote=self.url)
    elif not is_git_repo:
        raise exceptions.SystemSetupError(
            "'{src}' must be a git repository.\n\nPlease run:\ngit -C {src} init".format(src=self.tracks_dir))
def read(self, track_name, track_spec_file, mapping_dir):
    """
    Reads a track file, verifies it against the JSON schema and if valid, creates a track.

    :param track_name: The name of the track.
    :param track_spec_file: The complete path to the track specification file.
    :param mapping_dir: The directory where the mapping files for this track are stored locally.
    :return: A corresponding track instance if the track file is valid.
    :raises exceptions.SystemSetupError: If the track specification file cannot be found.
    :raises TrackSyntaxError: If the file cannot be rendered or parsed or if it violates the JSON schema.
    :raises exceptions.InvalidSyntax: If the version identifier of the track is not numeric.
    :raises exceptions.RallyError: If the track requires a newer track version than the supported one.
    """
    logger.info("Reading track specification file [%s]." % track_spec_file)
    try:
        rendered = render_template_from_file(track_spec_file, self.track_params)
        logger.info("Final rendered track for '%s': %s" % (track_spec_file, rendered))
        track_spec = json.loads(rendered)
    except jinja2.exceptions.TemplateNotFound:
        # has to be handled before the more general TemplateError below
        logger.exception("Could not load [%s]." % track_spec_file)
        raise exceptions.SystemSetupError("Track %s does not exist" % track_name)
    except (json.JSONDecodeError, jinja2.exceptions.TemplateError) as e:
        logger.exception("Could not load [%s]." % track_spec_file)
        raise TrackSyntaxError("Could not load '%s'" % track_spec_file, e)

    # check the track version before even attempting to validate the JSON format to avoid bogus errors.
    raw_version = track_spec.get("version", TrackFileReader.MAXIMUM_SUPPORTED_TRACK_VERSION)
    try:
        track_version = int(raw_version)
    except (ValueError, TypeError):
        # int() raises TypeError (not ValueError) for values like null, lists or objects
        raise exceptions.InvalidSyntax("version identifier for track %s must be numeric but was [%s]" %
                                       (track_name, str(raw_version)))
    if TrackFileReader.MAXIMUM_SUPPORTED_TRACK_VERSION < track_version:
        raise exceptions.RallyError("Track %s requires a newer version of Rally. Please upgrade Rally (supported track version: %d, "
                                    "required track version: %d)" %
                                    (track_name, TrackFileReader.MAXIMUM_SUPPORTED_TRACK_VERSION, track_version))

    try:
        jsonschema.validate(track_spec, self.track_schema)
    except jsonschema.exceptions.ValidationError as ve:
        raise TrackSyntaxError(
            "Track '%s' is invalid.\n\nError details: %s\nInstance: %s\nPath: %s\nSchema path: %s"
            % (track_name, ve.message, json.dumps(ve.instance, indent=4, sort_keys=True), ve.absolute_path, ve.absolute_schema_path))
    return self.read_track(track_name, track_spec, mapping_dir)