def save_all(self, outdir, force=False, fmt=None):
    """Write the results of every extractor beneath an output root.

    Results land in <outdir>/<manager name>/<manager uri>. When fmt is not
    given, each extractor's own default (its ``extractor`` attribute) selects
    the export routine: "json-single", "zip", or otherwise plain json.

    Arguments:
     - outdir (str) : root directory for the exports
     - force (bool) : passed through to the extractor save function
     - fmt (str)    : export format, must be one of manager.export_formats
    """
    if not (self.manager and self._extractors):
        logger.exit("You must add a manager and do an extract() before save.")

    if fmt and fmt not in self.manager.export_formats:
        logger.exit(
            "Export format %s is not recognized. Choose %s."
            % (fmt, ", ".join(self.manager.export_formats))
        )

    package_dir = os.path.join(outdir, self.manager.name, self.manager.uri)
    logger.info("Results will be written to %s" % package_dir)

    for extractor in self._extractors.values():

        # An explicit format wins, otherwise the metric's own default
        chosen = fmt or extractor.extractor

        # Pick the save routine first, then call it once
        if chosen == "json-single":
            save = extractor.save_json_single
        elif chosen == "zip":
            save = extractor.save_zip
        else:
            save = extractor.save_json
        save(package_dir, force=force)
def load_metric(
    self,
    metric,
    filename=None,
    local_repository=None,
    repository="vsoch/caliper-metrics",
    subfolder="",
    branch="main",
    extension="json",
):
    """Load a metric from a file, a local caliper repository, or a GitHub
    repository that has them extracted.

    The source is chosen in this order: local_repository, then filename,
    then the remote repository (optionally with a custom repository,
    subfolder, branch and extension). Smaller metrics are typically provided
    via json, and larger ones via zip.
    """
    # A manager is required
    if not self.manager:
        logger.exit("A manager is required to load a metric for.")

    # Decide on the loader and its arguments, then make a single call
    if local_repository:
        loader = self._load_metric_local
        loader_args = (local_repository, metric)
    elif filename:
        loader = self._load_metric_file
        loader_args = (filename, metric)
    else:
        loader = self._load_metric_repo
        loader_args = (metric, repository, subfolder, branch, extension)
    return loader(*loader_args)
def run_command(self, cmd):
    """Run a command through the shared run_command helper, exiting on error.

    The command (a list of strings) is logged at debug level. A non-zero
    return code is reported through logger.exit along with the captured
    lines; otherwise the captured lines are returned.
    """
    command_line = " ".join(cmd)
    logger.debug(command_line)

    response = run_command(cmd, quiet=self.quiet)
    lines = response["lines"]
    if response["return_code"] != 0:
        logger.exit("Error with %s, %s" % (command_line, lines))
    return lines
def wget_and_extract(url, download_to, download_type="targz", chunk_size=1024, flatten=True):
    """Download an archive of a known type (targz, wheel, gzip or zip) and
    extract it.

    If flatten is true, we expect a top level folder whose contents should
    be moved up into the destination directory. The downloaded archive and
    the originally extracted folder are removed, and the destination
    directory is returned.
    """
    # Select the download/extract helper for the archive type
    if download_type == "targz":
        extract = wget_and_extract_targz
    elif download_type in ("wheel", "gzip", "zip"):
        extract = wget_and_extract_zip
    else:
        logger.exit("%s is not a known archive type." % download_type)
    dest, root, dest_dir = extract(url, download_to, chunk_size)

    # The archive itself is no longer needed
    if os.path.exists(dest):
        os.remove(dest)

    # Move contents into top level folder
    should_flatten = flatten and os.path.exists(root)
    if should_flatten:
        move_files(root, dest_dir)

    # Remove the originally extracted folder
    if os.path.exists(root):
        shutil.rmtree(root)

    return dest_dir
def get_analyzer(self):
    """Return the analyzer class instance that matches the config's
    packagemanager field.

    The match is a case-insensitive search for "pypi", which is the only
    package manager supported at this time. Anything else is reported
    through logger.exit.
    """
    manager_name = self.config["packagemanager"]
    is_pypi = re.search("pypi", manager_name, re.IGNORECASE)
    if is_pypi:
        return CaliperPypiAnalyzer(self.config_file)

    logger.exit("%s is not a supported package manager at this time." % manager_name)
def extract_metric(self, name, versions=None):
    """Run the extraction for one named metric and keep its extractor.

    A repository is prepared first (for the given versions) if one is not
    defined yet. The extractor is stored in self._extractors under the last
    dotted component of the metric's registered path.
    """
    versions = versions or []

    if name not in self.metrics:
        logger.exit("Metric %s is not known." % name)

    # Without a git repository there is nothing to extract from
    if not self.git:
        self.prepare_repository(versions)

    # Only the final component of the dotted path is used as the key
    _, extractor_key = self._metrics[name].rsplit(".", 1)

    extractor = self.get_metric(name)
    extractor.extract()
    self._extractors[extractor_key] = extractor
def prepare_repository(self, versions=None):
    """Create a temporary git repository with a tagged commit per version.

    Since most source code archives won't include the git history, we
    create a root directory with a new git installation, and then create
    tagged commits that correspond to each version. We can then use this
    git repository to derive metrics of change.

    Arguments:
     - versions (list) : optional versions, handed to self.filter_versions

    Sets self.tmpdir and self.git, and returns self.git.
    """
    versions = versions or []

    if not self.manager:
        logger.exit("A manager is required to prepare a repository.")

    # Create temporary git directory, named after the manager uri
    self.tmpdir = tempfile.mkdtemp(prefix="%s-" % self.manager.uri.replace("/", "-"))
    self.git = GitManager(self.tmpdir, quiet=self.quiet)

    # Initialize empty repository
    self.git.init()

    # If we have versions, filter down
    self.filter_versions(versions)

    # Read the specs after filtering, and count them once for the progress log
    specs = self.manager.specs
    total = len(specs)

    # For each version, download and create git commit and tag
    for i, spec in enumerate(specs):
        version = spec["version"]
        source = spec["source"]
        logger.info("Downloading and tagging %s, %s of %s" % (version, i + 1, total))
        download_to = os.path.join(self.tmpdir, os.path.basename(source["filename"]))

        # Extraction type is based on source type
        wget_and_extract(
            url=source["filename"],
            download_type=source["type"],
            download_to=download_to,
        )

        # git add all content in folder, commit and tag with version
        self.git.add()
        self.git.status()
        self.git.commit(version)
        self.git.tag(version)

    logger.info("Repository for %s is created at %s" % (self.manager, self.tmpdir))
    return self.git
def main(args, extra):
    """Extract the requested metrics for each package and save the results.

    Arguments:
     - args  : parsed arguments; reads metric, outdir, packages, versions,
               force, fmt and no_cleanup
     - extra : extra arguments, not used here

    Each entry of args.packages is expected as <manager uri>:<name>,
    for example pypi:sif.
    """
    # Ensure that all metrics are valid
    client = MetricsExtractor(quiet=True)
    metrics = args.metric.split(",")

    # If asking for all, we will do all regardless of other specifications
    if "all" in metrics:
        metrics = ["all"]

    for metric in metrics:
        if metric != "all" and metric not in client.metrics:
            logger.exit("%s is not a known metric." % metric)

    # prepare top level output directory
    outdir = args.outdir or os.getcwd()

    # Honor the args.versions (the same for every package, so parse it once)
    versions = args.versions.split(",") if args.versions else None

    # Now parse the package names and do the extraction!
    for package in args.packages:

        # Split on the first colon only, so names containing ":" survive
        uri, _, name = package.partition(":")
        if not uri or not name:
            logger.exit("%s is not a valid package, expected <manager>:<name>." % package)

        try:
            manager = get_named_manager(uri, name)
        except NotImplementedError:
            # Report the uri: that is the part that was not recognized
            logger.exit("%s is not a valid package manager uri." % uri)

        # Create a client to interact with
        client = MetricsExtractor(manager, quiet=True)

        # Do the extraction
        for metric in metrics:
            if metric == "all":
                client.extract_all(versions=versions)
            else:
                client.extract_metric(metric, versions=versions)

        # Save results to files
        client.save_all(outdir, force=args.force, fmt=args.fmt)

        # Cleanup, unless disabled
        if not args.no_cleanup:
            client.cleanup(force=True)
def generate_graph(template, data, outdir, force):
    """Render an html template with data and write it to <outdir>/index.html.

    Known data attributes are:

     - datasets: a list of dataset, each having color, name, and values
     - title: the title for the html page

    The template and data can be matched for each metric. An existing
    index.html is only overwritten when force is set.
    """
    filename = os.path.join(outdir, "index.html")

    already_there = os.path.exists(filename)
    if already_there and not force:
        logger.exit("%s exists, use --force to overwrite." % filename)

    # Read the template file and render it in one go
    source = "".join(read_file(template))
    result = Template(source).render(**data)

    if not os.path.exists(outdir):
        mkdir_p(outdir)

    write_file(filename, result)
    logger.info("Output written to %s" % filename)
def main(args, extra):
    """Run a caliper analysis for the config file given with --config.

    Reads config, no_progress, nprocs, force and cleanup from args. The
    analysis is always started with parallel=False.
    """
    # The config file must exist
    config = args.config
    if not (config and os.path.exists(config)):
        logger.exit("You must provide an existing caliper.yaml config with --config.")

    analyzer = CaliperAnalyzer(args.config).get_analyzer()

    # serial argument removed for analyze, doesn't run well building containers
    options = {
        "show_progress": not args.no_progress,
        "nproc": args.nprocs,
        "force": args.force,
        "parallel": False,
        "cleanup": args.cleanup,
    }
    analyzer.run_analysis(**options)
def main(args, extra):
    """Plot the results file of one metric into an output directory.

    Arguments:
     - args  : parsed arguments; reads input, metric, outdir, force, title
     - extra : extra arguments, not used here

    If --metric is not given, the metric name is taken from the part of the
    input file's basename before the first "-".
    """
    # Ensure that all metrics are valid
    client = MetricsExtractor(quiet=True)

    # An input is required!
    if not args.input:
        logger.exit("An input results file is required.")

    # If the metric is not provided on the command line, needs to be in filename
    metric_name = args.metric or os.path.basename(args.input).split("-")[0]
    if not metric_name:
        logger.exit("You must provide a --metric, not derivable from filename.")

    if metric_name not in client.metrics:
        logger.exit("%s is not a known metric." % metric_name)

    # prepare top level output directory (defaults to the working directory)
    outdir = args.outdir or os.getcwd()

    metric = client.get_metric(metric_name)
    metric.plot_results(args.input, outdir, force=args.force, title=args.title)
def _load_config(self, config_file):
    """Load a caliper.yaml file and ensure that its analysis section is valid.

    Sets config_file, config_dir, config, outdir (<config_dir>/.caliper),
    data_dir, dockerfile, dependency, args, python_versions and
    test_versions on the instance, and creates outdir and data_dir if they
    do not exist yet.
    """
    if not os.path.exists(config_file):
        logger.exit("%s does not exist." % config_file)

    self.config_file = config_file
    self.config_dir = os.path.abspath(os.path.dirname(self.config_file))
    self.config = read_yaml(config_file).get("analysis", {})
    self.outdir = os.path.join(self.config_dir, ".caliper")
    self.data_dir = os.path.join(self.outdir, "data")

    # Required fields must be present and non-empty
    for key in ("packagemanager", "dependency"):
        if not self.config.get(key):
            logger.exit(
                "%s is a required field in the caliper.yaml config under the analysis key."
                % key
            )

    # The Dockerfile is looked up relative to the config file
    dockerfile_name = self.config.get("dockerfile", "Dockerfile")
    self.dockerfile = os.path.join(self.config_dir, dockerfile_name)
    if not os.path.exists(self.dockerfile):
        logger.exit("The Dockerfile does not exist.")

    # Set the dependency name and any additional args
    self.dependency = self.config.get("dependency")
    self.args = self.config.get("args", {})

    # Filter to specific python and library versions
    self.python_versions = self.config.get("python_versions", [])
    self.test_versions = self.config.get("versions", [])

    # Output directories, parent first
    for dirname in (self.outdir, self.data_dir):
        if not os.path.exists(dirname):
            os.makedirs(dirname)
def analysis_task(**kwargs):
    """A shared analysis task for the serial or parallel workers.

    The rendered Dockerfile is written to a temporary file and built into
    a container, the installed packages are recorded, and each test script
    is run in the container. The result is written as json to outfile.

    Required keyword arguments:
     - name (str)       : task name, used for the Dockerfile and image tag
     - outdir (str)     : working directory for the docker build
     - dependency (str) : dependency name, used for the image name
     - outfile (str)    : json file to write the result to
     - dockerfile (str) : content written to the temporary Dockerfile
     - exists (bool)    : if false, only a failed build result is written

    Optional keyword arguments:
     - tests (str)      : newline separated test scripts
     - force (bool)     : run even if outfile already exists
     - cleanup (bool)   : run a docker system prune when done
    """
    # Ensure all arguments are provided
    required = ["name", "outdir", "dependency", "outfile", "dockerfile", "exists"]
    for key in required:
        if kwargs.get(key) is None:
            logger.exit("%s is missing or undefined for analysis task." % key)

    dockerfile = kwargs.get("dockerfile")
    outfile = kwargs.get("outfile")
    cleanup = kwargs.get("cleanup", False)
    dependency = kwargs.get("dependency")
    force = kwargs.get("force", False)
    exists = kwargs.get("exists")
    name = kwargs.get("name")
    outdir = kwargs.get("outdir")
    result = {"inputs": kwargs}
    tests = kwargs.get("tests")
    tests = [] if not tests else tests.split("\n")
    worker_id = multiprocessing.current_process().name

    # If the output file already exists, only continue when force is set
    if os.path.exists(outfile) and not force:
        return

    # If it doesn't exist, we wouldn't be able to build it, cut out early
    if not exists:
        result["build_retval"] = 1
        write_json(result, outfile)
        return

    # Build temporary Dockerfile
    dockerfile_name = "Dockerfile.caliper.%s" % name
    dockerfile_fullpath = os.path.join(tempfile.gettempdir(), dockerfile_name)

    # Write and build temporary Dockerfile, and build the container
    write_file(dockerfile_fullpath, dockerfile)
    container_name = "%s-container:%s" % (dependency, name)
    sys.stdout.write(
        "[%s] 0 of %s - building container %s\n" % (worker_id, len(tests), container_name)
    )
    runner = CommandRunner()
    try:
        runner.run_command(
            [
                "docker",
                "build",
                "-f",
                dockerfile_fullpath,
                "-t",
                container_name,
                ".",
            ],
            cwd=outdir,
        )
    finally:
        # Clean up Dockerfile, also when the build call itself raises
        if os.path.exists(dockerfile_fullpath):
            os.remove(dockerfile_fullpath)

    # Keep a result for each script
    result["tests"] = {"build": {"retval": runner.retval}}
    if runner.retval != 0:
        result["tests"]["build"]["error"] = runner.error
        write_json(result, outfile)
        return

    # Get packages installed for each container. --rm so that this run does
    # not leave a stopped container behind (the test runs below do the same)
    runner.run_command(["docker", "run", "--rm", container_name, "pip", "freeze"])
    result["requirements.txt"] = runner.output

    # Run each test
    test_results = {}
    for i, script in enumerate(tests):
        start = time.time()
        sys.stdout.write("[%s] %s of %s - %s" % (worker_id, i + 1, len(tests), script))
        runner.run_command(["docker", "run", "--rm", container_name, "python", script])
        end = time.time()
        test_results[script] = {
            "error": runner.error,
            "output": runner.output,
            "retval": runner.retval,
            "seconds": round(end - start, 2),
        }
        sys.stdout.write(" total time: %s seconds \n" % test_results[script]["seconds"])
        sys.stdout.flush()

    # Update results with all tests
    result["tests"].update(test_results)

    # Save the result to file, clean up
    write_json(result, outfile)
    runner.run_command(["docker", "rmi", container_name, "--force"])
    runner.run_command(["docker", "images", "-f", "dangling=true", "-q"])

    # Snapshot the listing first: the same runner is reused for each removal
    # NOTE(review): assumes runner.output is a list of output lines - confirm
    dangling_layers = list(runner.output)
    for layer in dangling_layers:
        runner.run_command(["docker", "rmi", layer.strip("\n"), "--force"])

    if cleanup:
        runner.run_command(["docker", "system", "prune", "--all", "--force"])
def run(self):
    """Run every task in self.tasks through a multiprocessing pool.

    self.tasks maps a key to a (func, params) tuple; the tasks should be
    added with add_task. Each task is submitted asynchronously and the
    results are then collected.

    Returns a dict that maps each task key to the result of its task, or
    None if there are no tasks to run.
    """
    # if we don't have tasks, don't run
    if not self.tasks:
        return

    # Keep track of some progress for the user
    total = len(self.tasks)

    # results will also have the same key to look up
    finished = {}
    results = []

    # Defined up front so the handlers below never hit an unbound name
    pool = None

    try:
        pool = multiprocessing.Pool(self.workers, init_worker)

        self.start()
        progress = 1
        logger.info("Preparing %s tasks..." % total)
        for key, task in self.tasks.items():
            func, params = task
            if self.show_progress:
                prefix = "[%s/%s]" % (progress, total)
                logger.show_progress(progress, total, length=35, prefix=prefix)
            result = pool.apply_async(multi_wrapper, multi_package(func, [params]))

            # Store the key with the result
            results.append((key, result))
            progress += 1

        progress = 1
        logger.info("Waiting for results...")
        while results:
            key, result = results.pop()
            if self.show_progress:
                prefix = "[%s/%s]" % (progress, total)
                logger.show_progress(progress, total, length=35, prefix=prefix)
            result.wait()
            progress += 1
            finished[key] = result.get()

        self.end()
        pool.close()
        pool.join()

    except (KeyboardInterrupt, SystemExit):
        logger.error("Keyboard interrupt detected, terminating workers!")
        if pool is not None:
            pool.terminate()
        sys.exit(1)

    except Exception as exc:
        # Stop the workers and report the cause instead of discarding it
        if pool is not None:
            pool.terminate()
        logger.exit("Error running task: %s" % exc)

    return finished
def run_analysis(
    self,
    release_filter=None,
    nproc=None,
    parallel=False,
    show_progress=True,
    func=None,
    force=False,
    cleanup=False,
):
    """Once the config is loaded, run the analysis.

    One task is prepared for each combination of dependency version and
    Python version (both optionally restricted by the config), each with a
    rendered Dockerfile, and the tasks are then run in serial or parallel.

    Arguments:
     - release_filter (str) : regular expression to select releases,
                              defaults to linux wheels and source archives
     - nproc (int)          : number of processes, for a parallel run
     - parallel (bool)      : run through self._run_parallel
     - show_progress (bool) : show progress, for a parallel run
     - func (callable)      : task function, defaults to analysis_task
     - force (bool)         : passed on to each task
     - cleanup (bool)       : passed on to each task

    Returns the result of self._run_parallel or self._run_serial.
    """
    # The release filter is a regular expression we use to find the correct
    # platform / architecture. We select linux wheels and source
    release_filter = release_filter or "(.*manylinux.*x86_64.*|[.]tar[.]gz)"
    func = func or analysis_task

    # prepare a command runner, check that docker is installed
    runner = CommandRunner()
    runner.run_command(["which", "docker"])
    if runner.retval != 0:
        logger.exit("Docker must be installed to build containers.")

    # Prepare arguments for runner, whether it's serial or parallel
    manager = PypiManager(self.dependency)
    all_releases = manager.filter_releases(release_filter)
    python_versions = manager.get_python_versions()

    # Compile the user's Python version filter once, only if there is one
    python_version_regex = None
    if self.python_versions:
        python_version_regex = re.compile(
            "(%s)" % "|".join(self.python_versions), re.IGNORECASE
        )

    # Read in the template, populate with each deps version
    template = Template(read_file(self.dockerfile, readlines=False))

    # The tests are the same for every task; a config without tests is fine
    tests = "\n".join(self.config.get("tests") or [])

    # Prepare arguments to build and test a container for each
    tasks = {}

    # Loop over versions of the library, and Python versions
    for version, releases in all_releases.items():

        # Check if the user has defined a set of versions
        if self.test_versions and version not in self.test_versions:
            continue

        # Create a lookup based on Python version
        lookup = {x["python_version"]: x for x in releases}

        for python_version in python_versions:

            # If the user has requested a subset of Python versions
            if python_version_regex and not python_version_regex.search(python_version):
                continue

            name = "%s-%s-%s-python-%s" % (
                self.name,
                self.dependency,
                version,
                python_version,
            )
            outfile = os.path.join(self.data_dir, "%s.json" % name)
            spec = lookup.get(python_version, {})

            # If the Python version is not in the lookup we cannot do a build
            exists = python_version in lookup

            # The first digit is the major version and the rest the minor,
            # so cp38 is 3.8 and cp310 is 3.10 (not 3.1.0)
            # NOTE(review): assumes tags look like cp38 / cp310 - confirm
            digits = python_version.lstrip("cp")
            if digits.isdigit() and len(digits) > 1:
                base_version = "%s.%s" % (digits[0], digits[1:])
            else:
                # Anything else keeps the previous character-wise join
                base_version = ".".join(digits)
            container_base = "python:%s" % base_version

            # It's easier to pass the rendered template than all arguments for it
            result = template.render(
                base=container_base,
                filename=spec.get("url", ""),
                basename=spec.get("filename", ""),
                **self.args
            )
            params = {
                "dependency": self.dependency,
                "outfile": outfile,
                "dockerfile": result,
                "force": force,
                "exists": exists,
                "name": name,
                "tests": tests,
                "cleanup": cleanup,
                "outdir": self.config_dir,
            }
            tasks[name] = (func, params)

    if parallel:
        return self._run_parallel(tasks, nproc, show_progress)
    return self._run_serial(tasks)
__author__ = "Vanessa Sochat" __copyright__ = "Copyright 2020-2021, Vanessa Sochat" __license__ = "MPL 2.0" from caliper.utils.file import mkdir_p, read_file, write_file from caliper.logger import logger import os here = os.path.abspath(os.path.dirname(__file__)) try: from jinja2 import Template except ImportError: logger.exit("You must install jinja2 to use graphs.") def generate_graph(template, data, outdir, force): """given an html template, data to populate it, and an output directory, generate a plot. Known data attributes are: - datasets: a list of dataset, each having color, name, and values - title: the title for the html page Of course the template and data can be matched for each metric. """ filename = os.path.join(outdir, "index.html") if os.path.exists(filename) and not force: logger.exit("%s exists, use --force to overwrite." % filename) template = Template("".join(read_file(template))) result = template.render(**data) if not os.path.exists(outdir):