def parse_tests(dirname, outdir):
    """Assemble all test results into one large data structure. This will be
    too large to load into the browser at once, but should be okay for Flask.
    """
    results = {}

    # Read in input files, organize by python version, tensorflow version
    for filename in iter_files(dirname):

        # Create object to parse versions
        dep = DependencyVersion(filename)

        # Derive the name and versions from the filename (also in inputs:name)
        result = read_json(filename)
        basename = os.path.splitext(os.path.basename(filename))[0]
        if "tests" not in result:
            result["tests"] = {"build": {"retval": result["build_retval"]}}
        results[basename] = {
            "tests": result["tests"],
            "python": dep.pyversion,
            "tensorflow": dep.tfversion,
        }

    outfile = os.path.join(outdir, "tests.json")
    write_json(results, outfile)
    return outfile
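# A hedged sketch of the structure parse_tests writes to tests.json, assuming a
# hypothetical input filename "pypi-tensorflow-2.4.0-python-cp38.json" that
# DependencyVersion parses into tensorflow and python versions. The values are
# illustrative only, not output from a real run.
example_tests_json = {
    "pypi-tensorflow-2.4.0-python-cp38": {
        "tests": {"build": {"retval": 0}},
        "python": "cp38",
        "tensorflow": "2.4.0",
    }
}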
def test_pypi_analyze(tmp_path):
    """test the pypi analyzer"""
    print("Testing Pypi Analyzer")
    from caliper.analysis import CaliperAnalyzer
    from caliper.utils.file import read_json

    config_file = os.path.join(here, "data", "analyze", "caliper.yaml")
    client = CaliperAnalyzer(config_file)
    analyzer = client.get_analyzer()
    analyzer.run_analysis(cleanup=True)
    outdir = os.path.join(here, "data", "analyze", ".caliper")
    assert os.path.exists(outdir)
    outfile = os.path.join(outdir, "data", "pypi-sif-0.0.11-python-cp27.json")
    assert os.path.exists(outfile)

    # Check fields in output file
    result = read_json(outfile)
    for key in ["inputs", "tests", "requirements.txt"]:
        assert key in result and result[key] is not None
    for key in [
        "dependency",
        "outfile",
        "dockerfile",
        "force",
        "exists",
        "name",
        "tests",
        "cleanup",
        "outdir",
    ]:
        assert key in result["inputs"] and result["inputs"][key] is not None
def read_metric_local(self, index_file, metric):
    """Parse a local repository, returning data from a preferred type"""
    index = read_json(index_file)
    metric_dir = os.path.dirname(index_file)
    data = index.get("data", {})

    # Prefer a single json file if it exists alongside the index
    if "json-single" in data:
        metric_file = os.path.join(metric_dir, data["json-single"].get("url", ""))
        if os.path.isfile(metric_file):
            return read_json(metric_file)

    if "zip" in data:
        metric_file = os.path.join(metric_dir, data["zip"].get("url", ""))
        return json.loads(read_zip(metric_file, "%s-results.json" % metric))

    elif "json" in data:
        results = {}
        for filename in data["json"].get("urls", []):
            metric_file = os.path.join(metric_dir, filename)
            results.update(read_json(metric_file))
        return results
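# A hedged sketch of the index.json layout read_metric_local expects: keys under
# "data" name the storage format, and url/urls values are paths relative to the
# index file. The filenames below are illustrative, assuming a metric named
# "changedlines".
example_index = {
    "data": {
        # a single json file holding all results
        "json-single": {"url": "changedlines-results.json"},
        # or a zipped archive containing <metric>-results.json
        "zip": {"url": "changedlines-results.json.zip"},
        # or results split across several json files
        "json": {"urls": ["changedlines-0.0.1.json", "changedlines-0.0.2.json"]},
    }
}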
def update_index(self, extractor_dir, content):
    """If an index already exists, load it and update it with the data type
    (represented as the key of a dictionary in content). If an index does not
    exist, write a new one. Filepaths should be relative.
    """
    index_file = os.path.join(extractor_dir, "index.json")
    if not os.path.exists(index_file):
        index = {"data": {}}
        write_json(index, index_file)

    # Read in the index and update it
    index = read_json(index_file)
    index["data"].update(content)
    write_json(index, index_file)
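# A hedged usage sketch: after an extractor saves a zipped result, the index is
# created (if missing) and updated with the new data type. The directory and
# url below are illustrative assumptions, not paths from the repository.
#
#   self.update_index(
#       extractor_dir=".caliper/changedlines",
#       content={"zip": {"url": "changedlines-results.json.zip"}},
#   )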
def _load_metric_file(self, filename, metric):
    """Helper function to load a metric from a filename. If it's zipped,
    extract the results file from the archive first.
    """
    name = "%s-results.json" % metric
    if filename.endswith("zip"):
        return json.loads(read_zip(filename, name))
    return read_json(filename)
def parse_requirements(dirname, outdir):
    """Given an output directory with json files, parse the requirements.txt
    into a data structure and save it to the output folder.
    """
    # Keep a lookup of requirement names to compare across versions
    rxments = set()

    # Read in input files, organize by python version, tensorflow version
    for filename in iter_files(dirname):

        # Derive the name and versions from the filename (also in inputs:name)
        result = read_json(filename)
        if "requirements.txt" not in result:
            continue
        for x in result["requirements.txt"]:
            rxments.add(re.split("(=|@)", x)[0].strip())

    # Now create a flattened dict frame with versions, and a lookup
    requirements = {}
    for filename in iter_files(dirname):
        versions = dict.fromkeys(rxments, None)
        dep = DependencyVersion(filename)
        basename = os.path.splitext(os.path.basename(filename))[0]
        result = read_json(filename)

        # **Important** this is specific to tensorflow
        for x in result.get("requirements.txt", []):
            if "file://" in x:
                versions["tensorflow"] = dep.tfversion
            else:
                version = re.split("(==|@|<=|>=)", x)[-1].strip()
                library = re.split("(==|@|<=|>=)", x)[0].strip()
                versions[library] = version
        requirements[basename] = versions

    # Finally, create a data frame (of lists)
    df = [["name", "x", "y", "value", "tensorflow", "python"]]
    for ycoord, requirement in enumerate(rxments):
        for xcoord, filename in enumerate(iter_files(dirname)):

            # Create object to parse versions
            dep = DependencyVersion(filename)
            basename = os.path.splitext(os.path.basename(filename))[0]
            df.append(
                [
                    basename,
                    xcoord,
                    ycoord,
                    requirements[basename][requirement],
                    dep.tfversion,
                    dep.pyversion,
                ]
            )

    # Save versions and matrix to file
    outfile = os.path.join(outdir, "requirements.txt.json")
    write_json(requirements, outfile)
    outfile = os.path.join(outdir, "requirements.txt.tsv")
    write_rows(df, outfile, sep="\t")
    return outfile
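# A hedged sketch of one data row in requirements.txt.tsv, following the header
# ["name", "x", "y", "value", "tensorflow", "python"]: x and y are the file and
# requirement coordinates, and value is the pinned version of that requirement
# (None if absent). The concrete values are illustrative assumptions.
example_row = ["pypi-tensorflow-2.4.0-python-cp38", 0, 3, "1.19.5", "2.4.0", "cp38"]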
def main():
    """main entrypoint for caliper analysis"""
    parser = get_parser()

    # If an error occurs while parsing the arguments, the interpreter will exit with value 2
    args, extra = parser.parse_known_args()

    filename = os.path.abspath(args.filename) if args.filename else None
    if not filename or (filename and not os.path.exists(filename)):
        sys.exit("A --filename with similarity scores is required.")

    # Prepare output directory
    if not args.outdir or not os.path.exists(args.outdir):
        sys.exit("The output directory %s does not exist" % args.outdir)

    sims = read_json(filename)

    # First derive list of labels for rows and columns. The pair key uses ".."
    # as a separator in case "-" is part of the version string.
    labels = set()
    for key in sims:
        label1, label2 = key.split("..")
        if re.search("(rc|a|b)", label1) or re.search("(rc|a|b)", label2):
            continue
        labels.add(label1)
        labels.add(label2)

    # Versions need to be sorted by version, not string.
    # For now we remove the release candidates.
    labels = list(labels)
    try:
        labels.sort(key=StrictVersion)
    except ValueError:
        labels.sort()

    # Next create a data frame for each similarity metric
    dfs = {
        x: pandas.DataFrame(index=labels, columns=labels)
        for x in sims[list(sims.keys())[0]].keys()
    }
    for pair, values in sims.items():
        label1, label2 = pair.split("..")
        if re.search("(rc|a|b)", label1) or re.search("(rc|a|b)", label2):
            continue
        for key, value in values.items():
            dfs[key].loc[label1, label2] = value
            dfs[key].loc[label2, label1] = value

    # Create output directory
    outdir = os.path.join(args.outdir, "plots")
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # Finally, prepare plots!
    for name, df in dfs.items():
        fig, ax = plt.subplots(figsize=(args.dim, args.dim))
        cax = ax.matshow(df.to_numpy(dtype=float), interpolation="nearest")
        ax.grid(True)
        plt.title("%s Version Similarity: %s" % (args.package.upper(), name))
        plt.xticks(range(len(labels)), labels, rotation=90)
        plt.yticks(range(len(labels)), labels)
        fig.colorbar(
            cax,
            ticks=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.90, 0.95, 1],
        )
        # plt.show()
        for extension in ["png", "svg"]:
            if args.name:
                outfile = os.path.join(
                    outdir,
                    "pypi-%s-%s-%s-plot.%s"
                    % (args.package, name, args.name, extension),
                )
            else:
                outfile = os.path.join(
                    outdir, "pypi-%s-%s-plot.%s" % (args.package, name, extension)
                )
            print("Saving %s" % outfile)
            plt.savefig(outfile, dpi=300)
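# A hypothetical invocation of this entrypoint. Only --filename is confirmed by
# the error message above; the remaining flags are assumed to mirror the
# attribute names used on args (outdir, package, dim, name), and the script
# name is illustrative.
#
#   python plot_requirements_sims.py \
#       --filename pypi-tensorflow-requirements-sims.json \
#       --outdir ./analysis --package tensorflow --dim 20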
def parse_tests(dirname, outdir, package):
    """Assemble all test results into one large data structure. This will be
    too large to load into the browser at once, but should be okay for Flask.
    """
    results = {}
    datadir = os.path.join(dirname, "data")

    # We need to keep a list of tests so the data structure is consistent
    tests = set()

    # We also want the versions sorted
    versions = set()
    groups = {}
    for filename in iter_files(datadir, package):
        basename = os.path.basename(filename)
        dep = DependencyVersion(filename, package)

        # Don't include release candidates, or a/b, etc.
        if re.search("(rc|a|b)", basename):
            continue
        if dep.tfversion not in groups:
            groups[dep.tfversion] = []
        groups[dep.tfversion].append(filename)
        versions.add(dep.tfversion)

    # Sort the versions so results are assembled in order
    versions = list(versions)
    versions.sort(key=StrictVersion)

    # Loop through sorted versions to collect the full set of test names
    for version in versions:
        for filename in groups[version]:
            basename = os.path.basename(filename)

            # Create object to parse versions
            dep = DependencyVersion(filename, package)
            results[dep.pyversion] = []
            result = read_json(filename)
            for test in result.get("tests", {}):
                tests.add(test)

    # Read in input files, organize by python version, tensorflow version
    for version in versions:
        for filename in groups[version]:
            basename = os.path.basename(filename)

            # Create object to parse versions
            dep = DependencyVersion(filename, package)

            # Make sure the test has at least one result
            result = read_json(filename)
            if "tests" not in result:
                result["tests"] = {"build": {"retval": result["build_retval"]}}

            # Make sure we have all tests, ordered the same, -1 indicates not run
            result_tests = result.get("tests", {})
            test_list = []
            for test in tests:
                if test in result_tests:
                    entry = result_tests[test]
                else:
                    entry = {"retval": -1}

                # y axis will be tensorflow version, x axis will be test name
                entry["x_name"] = test
                entry["y_tensorflow"] = dep.tfversion
                test_list.append(entry)
            results[dep.pyversion] += test_list

    # Write to output file so we can generate a d3 visualization
    outfile = os.path.join(dirname, "compiled", "test-results-by-python.json")
    write_json(results, outfile)
    return outfile
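# A hedged sketch of test-results-by-python.json: results are keyed by python
# version, and each entry carries the test name on x and the tensorflow version
# on y so a heatmap can be drawn directly. Test names and values below are
# illustrative assumptions, not real results.
example_results = {
    "cp38": [
        {"retval": 0, "x_name": "test-import", "y_tensorflow": "2.4.0"},
        {"retval": -1, "x_name": "test-mnist", "y_tensorflow": "2.4.0"},
    ]
}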
def extract_requirements(datadir, outdir, package):
    """Create a lookup for requirements including (and not including) versions
    to generate similarity matrices. An alternative is to extract all
    requirements (to see change between versions) for a package and have this
    say something about the parent package, but this seems more complicated.
    """
    # Keep a lookup of requirements.txt to compare across versions
    requirements = {}

    # Read in input files, organize by python version, tensorflow version
    for filename in iter_files(datadir, package):

        # Skip release candidates and a/b for now
        if re.search("(rc|b|a)", os.path.basename(filename)):
            continue

        # Only include those we have requirements for (meaning a successful install)
        result = read_json(filename)
        if "requirements.txt" in result:
            requirements[filename] = [
                x.strip().lower() for x in result["requirements.txt"]
            ]

    # Level 1 similarity: module names and version strings
    # Level 2 similarity: module names only
    sims = {}

    # Compare the requirements that exist, pairwise
    for filename1, modules1 in requirements.items():
        for filename2, modules2 in requirements.items():
            uid1 = re.sub("[.]json$", "", os.path.basename(filename1))
            uid2 = re.sub("[.]json$", "", os.path.basename(filename2))

            # Don't calculate it twice
            scores = {}
            key = "..".join(sorted([uid1, uid2]))
            if key in sims:
                continue

            # Diagonal is perfectly similar
            if uid1 == uid2:
                scores = {"module_sim": 1, "module_version_sim": 1}
                sims[key] = scores
                continue

            # Level 1: module and version similarity
            set1 = set(modules1)
            set2 = set(modules2)
            scores["module_version_sim"] = information_coefficient(
                len(set1), len(set2), len(set1.intersection(set2))
            )

            # Level 2: don't include versions, ignore casing
            funcs1 = {re.split("(==|@)", x)[0].strip().lower() for x in set1}
            funcs2 = {re.split("(==|@)", x)[0].strip().lower() for x in set2}
            scores["module_sim"] = information_coefficient(
                len(funcs1), len(funcs2), len(funcs1.intersection(funcs2))
            )
            sims[key] = scores

    outfile = os.path.join(outdir, "pypi-%s-requirements-sims.json" % package)
    write_json(sims, outfile)
    return outfile
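# A hedged sketch of the similarity structure written by extract_requirements.
# Pair keys join the two version identifiers with ".." (so "-" can safely appear
# inside a version string), and the scores are the two levels computed above.
# The identifiers and score values here are illustrative, not measured.
example_sims = {
    "pypi-tensorflow-2.3.0-python-cp38..pypi-tensorflow-2.4.0-python-cp38": {
        "module_version_sim": 0.64,
        "module_sim": 0.91,
    }
}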