def get_package_metadata(self, name=None):
    """Given a package name, retrieve its metadata from the GitHub releases API."""
    name = name or self.package_name
    if not name:
        raise ValueError("A package name is required.")

    # At some point we might need to add pagination
    url = "%s/repos/%s/releases?per_page=100" % (self.baseurl, name)
    self.metadata = do_request(url, headers=self._get_headers())

    # Parse metadata into a simplified version of the spack package schema
    for release in self.metadata:
        self._specs.append(
            {
                "name": name,
                "version": release["tag_name"],
                "source": {
                    "filename": release["tarball_url"],
                    "type": "targz",
                },
                "hash": None,
            }
        )

    # Specs must be sorted by version, or downstream tagging won't work
    self._specs = self.sort_specs(self._specs, by="version")
    logger.info("Found %s versions for %s" % (len(self._specs), name))
    return self._specs
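# A minimal usage sketch (hypothetical: the manager class name and its
# constructor are assumptions, since the class itself is not shown here):
#
#   manager = GitHubManager("spack/spack")
#   specs = manager.get_package_metadata()
#   # specs is a list of simplified spack-style entries, sorted by version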
def save_all(self, outdir, force=False, fmt=None):
    """Save data as json or zip exports using an outdir root.

    If fmt is None, we use the extractor default (typically json-single,
    except for metrics that warrant larger / more extraction).
    """
    if not self.manager or not self._extractors:
        logger.exit("You must add a manager and do an extract() before save.")

    if fmt and fmt not in self.manager.export_formats:
        logger.exit(
            "Export format %s is not recognized. Choose %s."
            % (fmt, ", ".join(self.manager.export_formats))
        )

    package_dir = os.path.join(outdir, self.manager.name, self.manager.uri)
    logger.info("Results will be written to %s" % package_dir)

    for _, extractor in self._extractors.items():

        # Each metric can define a default export format
        fmt_ = fmt or extractor.extractor

        # Do the save based on the selected format
        if fmt_ == "json-single":
            extractor.save_json_single(package_dir, force=force)
        elif fmt_ == "zip":
            extractor.save_zip(package_dir, force=force)
        else:
            extractor.save_json(package_dir, force=force)
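# Example call (hypothetical client setup; extract() is assumed to populate
# self._extractors before saving, per the error message above):
#
#   client.extract()
#   client.save_all("results", fmt="zip")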
def _load_metric_repo(self, metric, repository, subfolder, branch, extension):
    """Helper function to load a metric from a repository."""
    # If we have a subfolder, normalize it to have a single trailing slash
    if subfolder:
        subfolder = "%s/" % subfolder.strip("/")
    else:
        # Avoid rendering None into the URL below
        subfolder = ""

    manager = self.manager.replace(":", "/")

    # Load the index for the metric, which must exist for all output types
    url = "https://raw.githubusercontent.com/%s/%s/%s%s/%s/index.json" % (
        repository,
        branch,
        subfolder,
        manager,
        metric,
    )
    logger.info("Downloading %s" % url)
    response = requests.get(url)
    if response.status_code == 200:
        index = response.json()
        data = index.get("data", {})

        # Parse the metric repository, meaning reading the index.json
        return self._read_metric_repo(url, index, data, metric, extension)
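# For example (hypothetical values), repository "vsoch/caliper-metrics",
# branch "main", no subfolder, manager "pypi:sregistry", and metric
# "changedlines" would resolve to:
#
#   https://raw.githubusercontent.com/vsoch/caliper-metrics/main/pypi/sregistry/changedlines/index.json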
def get_package_metadata(self, name=None, arch=None, python_version=None):
    """Given a package name, retrieve its metadata from pypi.

    Given an arch regex and python version, we look for a particular
    architecture. Otherwise the architecture returned can be fairly random.
    """
    # Without a name, fall back to the package the manager was created with
    name = name or self.package_name

    # Parse metadata into a simplified version of the spack package schema
    for version, releases in self.releases.items():

        # Find an appropriate linux/unix flavor release to extract
        release = self.find_release(releases, arch, python_version)

        # Some releases can be empty, skip them
        if not releases or not release:
            continue

        # The release type drives the extraction logic
        release_type = "wheel" if release["url"].endswith("whl") else "targz"

        self._specs.append(
            {
                "name": name,
                "version": version,
                "source": {
                    "filename": release["url"],
                    "type": release_type,
                },
                "hash": release["digests"]["sha256"],
            }
        )

    # Pypi is already sorted by version (at least it appears to be)
    logger.info("Found %s versions for %s" % (len(self._specs), name))
    return self._specs
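# A usage sketch (hypothetical manager construction; the arch pattern and
# python version strings depend on pypi's wheel filename conventions):
#
#   manager = PypiManager("sregistry")
#   specs = manager.get_package_metadata(arch="manylinux", python_version="cp38")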
def prepare_repository(self, versions=None):
    """Since most source code archives won't include the git history, we
    create a root directory with a new git installation, and then create
    tagged commits that correspond to each version. We can then use this
    git repository to derive metrics of change.
    """
    versions = versions or []
    if not self.manager:
        logger.exit("A manager is required to prepare a repository.")

    # Create a temporary git directory
    self.tmpdir = tempfile.mkdtemp(prefix="%s-" % self.manager.uri.replace("/", "-"))
    self.git = GitManager(self.tmpdir, quiet=self.quiet)

    # Initialize an empty repository
    self.git.init()

    # If we have versions, filter down to them
    self.filter_versions(versions)

    # For each version, download and create a git commit and tag
    for i, spec in enumerate(self.manager.specs):

        logger.info(
            "Downloading and tagging %s, %s of %s"
            % (spec["version"], i + 1, len(self.manager.specs))
        )
        download_to = os.path.join(
            self.tmpdir, os.path.basename(spec["source"]["filename"])
        )

        # The extraction type is based on the source type
        wget_and_extract(
            url=spec["source"]["filename"],
            download_type=spec["source"]["type"],
            download_to=download_to,
        )

        # git add all content in the folder, then commit and tag with the version
        self.git.add()
        self.git.status()
        self.git.commit(spec["version"])
        self.git.tag(spec["version"])

    logger.info("Repository for %s is created at %s" % (self.manager, self.tmpdir))
    return self.git
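# A sketch of typical use (the surrounding extractor object and the version
# strings are assumptions based on this method's self references):
#
#   git = extractor.prepare_repository(versions=["0.1.0", "0.2.0"])
#   # The temporary repository now has one tagged commit per version,
#   # ready for diff-based metrics between tags.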
def generate_graph(template, data, outdir, force):
    """Given an html template, data to populate it, and an output directory,
    generate a plot. Known data attributes are:

     - datasets: a list of datasets, each having color, name, and values
     - title: the title for the html page

    The template and data can be matched for each metric.
    """
    filename = os.path.join(outdir, "index.html")
    if os.path.exists(filename) and not force:
        logger.exit("%s exists, use --force to overwrite." % filename)

    # Render the data into the template
    template = Template("".join(read_file(template)))
    result = template.render(**data)

    # Ensure the output directory exists before writing
    if not os.path.exists(outdir):
        mkdir_p(outdir)

    write_file(filename, result)
    logger.info("Output written to %s" % filename)
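# Example call (hypothetical template path and values; the data keys follow
# the docstring above):
#
#   data = {
#       "title": "Changed lines for sregistry",
#       "datasets": [
#           {"name": "insertions", "color": "#0000ff", "values": [10, 42, 7]},
#       ],
#   }
#   generate_graph("template.html", data, "docs", force=False)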
def _run_serial(self, tasks, show_progress=True):
    """Run tasks in serial. The workers save result files, so we don't
    care about the results (it would take more memory to try and return
    the same content).
    """
    progress = 1
    total = len(tasks)
    results = {}
    for key, task in tasks.items():
        func, params = task
        prefix = "[%s/%s]" % (progress, total)
        if show_progress:
            logger.info("%s: %s" % (prefix, key))
        else:
            logger.info("Processing task %s" % key)

        # Run the task and keep its result under the same key
        results[key] = func(**params)
        progress += 1
    return results
def run(self):
    """Run will send a list of tasks, each a (function, parameters) tuple,
    through a multiprocessing pool. The tasks should be added with add_task.
    """
    # Keep track of some progress for the user
    total = len(self.tasks)

    # If we don't have tasks, don't run
    if not self.tasks:
        return

    # Finished results are keyed by the same task keys
    finished = dict()
    results = []

    try:
        pool = multiprocessing.Pool(self.workers, init_worker)
        self.start()

        progress = 1
        logger.info("Preparing %s tasks..." % total)
        for key, task in self.tasks.items():
            func, params = task
            if self.show_progress:
                prefix = "[%s/%s]" % (progress, total)
                logger.show_progress(progress, total, length=35, prefix=prefix)
            result = pool.apply_async(multi_wrapper, multi_package(func, [params]))

            # Store the key with the result
            results.append((key, result))
            progress += 1

        progress = 1
        logger.info("Waiting for results...")
        while len(results) > 0:
            pair = results.pop()
            key, result = pair
            if self.show_progress:
                prefix = "[%s/%s]" % (progress, total)
                logger.show_progress(progress, total, length=35, prefix=prefix)
            result.wait()
            progress += 1
            finished[key] = result.get()

        self.end()
        pool.close()
        pool.join()

    except (KeyboardInterrupt, SystemExit):
        logger.error("Keyboard interrupt detected, terminating workers!")
        pool.terminate()
        sys.exit(1)
    except Exception:
        logger.exit("Error running task.")

    return finished
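# A minimal standalone sketch of the apply_async pattern above (the worker
# function here is hypothetical; init_worker, multi_wrapper, and
# multi_package are project helpers not shown in this file):
#
#   import multiprocessing
#
#   def square(x):
#       return x * x
#
#   if __name__ == "__main__":
#       with multiprocessing.Pool(2) as pool:
#           handles = [(i, pool.apply_async(square, (i,))) for i in range(4)]
#           finished = {key: handle.get() for key, handle in handles}
#       print(finished)  # {0: 0, 1: 1, 2: 4, 3: 9}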