Example #1
def parse_tests(dirname, outdir):
    """Assemble all tests results into one large data structure. This will
    be too large to load into the browser at once, but should be okay for Flask.
    """
    results = {}

    # Read in input files, organize by python version, tensorflow version
    for filename in iter_files(dirname):

        # Create object to parse versions
        dep = DependencyVersion(filename)

        # Derive the name and versions from the filename (also in inputs:name)
        result = read_json(filename)
        basename = os.path.splitext(os.path.basename(filename))[0]
        if "tests" not in result:
            result["tests"] = {"build": {"retval": result["build_retval"]}}
        results[basename] = {
            "tests": result["tests"],
            "python": dep.pyversion,
            "tensorflow": dep.tfversion,
        }

    outfile = os.path.join(outdir, "tests.json")
    write_json(results, outfile)
    return outfile
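The helpers iter_files, DependencyVersion, read_json, and write_json come from caliper's utilities and are not shown in this example. For orientation only, a minimal stand-in for iter_files consistent with how it is used above might look like this (hypothetical sketch, not the library's implementation):

import os

def iter_files(dirname):
    # Hypothetical stand-in: yield paths to .json result files under dirname
    for name in sorted(os.listdir(dirname)):
        if name.endswith(".json"):
            yield os.path.join(dirname, name)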
Example #2
def test_pypi_analyze(tmp_path):
    """test pypi analyzer"""
    print("Testing Pypi Analyer")
    from caliper.analysis import CaliperAnalyzer
    from caliper.utils.file import read_json

    config_file = os.path.join(here, "data", "analyze", "caliper.yaml")
    client = CaliperAnalyzer(config_file)
    analyzer = client.get_analyzer()
    analyzer.run_analysis(cleanup=True)

    outdir = os.path.join(here, "data", "analyze", ".caliper")
    assert os.path.exists(outdir)
    outfile = os.path.join(outdir, "data", "pypi-sif-0.0.11-python-cp27.json")
    assert os.path.exists(outfile)

    # Check fields in output file
    result = read_json(outfile)
    for key in ["inputs", "tests", "requirements.txt"]:
        assert key in result and result[key] is not None
    for key in [
            "dependency",
            "outfile",
            "dockerfile",
            "force",
            "exists",
            "name",
            "tests",
            "cleanup",
            "outdir",
    ]:
        assert key in result["inputs"] and result["inputs"][key] is not None
Example #3
    def read_metric_local(self, index_file, metric):
        """Parse a local repository, returning data from a preferred type"""
        index = read_json(index_file)
        metric_dir = os.path.dirname(index_file)
        data = index.get("data", {})

        if "json-single" in data:
            metric_file = os.path.join(metric_dir,
                                       data["json-single"].get("url", ""))
            # isfile already implies existence, so one check suffices
            if os.path.isfile(metric_file):
                return read_json(metric_file)

        if "zip" in data:
            metric_file = os.path.join(metric_dir, data["zip"].get("url", ""))
            return json.loads(read_zip(metric_file,
                                       "%s-results.json" % metric))

        elif "json" in data:
            results = {}
            for filename in data["json"].get("urls", []):
                metric_file = os.path.join(metric_dir, filename)
                results.update(read_json(metric_file))
            return results
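For orientation, the index layouts this reader handles (inferred directly from the three branches above) look roughly like:

# Illustrative index layouts, one per branch in read_metric_local:
single = {"data": {"json-single": {"url": "results.json"}}}
zipped = {"data": {"zip": {"url": "results.zip"}}}
multi = {"data": {"json": {"urls": ["part-1.json", "part-2.json"]}}}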
Example #4
    def update_index(self, extractor_dir, content):
        """If an index already exists, load it and update it with the data type
        (represented as the key of a dictionary in content). If an index does not
        exist, write a new one. Filepaths should be relative.
        """
        index_file = os.path.join(extractor_dir, "index.json")
        if not os.path.exists(index_file):
            index = {"data": {}}
            write_json(index, index_file)

        # Read in the index and update it
        index = read_json(index_file)
        index["data"].update(content)
        write_json(index, index_file)
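A usage sketch (the directory name and content payload here are hypothetical):

# Hypothetical: called from inside an extractor, registering a new data file
# self.update_index("output/functiondb", {"json-single": {"url": "results.json"}})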
Example #5
    def _load_metric_file(self, filename, metric):
        """Helper function to load a metric from a filename. If it's zipped,
        extract the result file from the archive first."""
        name = "%s-results.json" % metric
        if filename.endswith("zip"):
            return json.loads(read_zip(filename, name))
        return read_json(filename)
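read_zip is another caliper utility not shown above; a minimal stand-in consistent with how it is called (returning the text of one archive member so json.loads can parse it) might be:

import zipfile

def read_zip(filename, member):
    # Hypothetical stand-in: return the decoded text of one zip member
    with zipfile.ZipFile(filename, "r") as archive:
        return archive.read(member).decode("utf-8")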
Example #6
def parse_requirements(dirname, outdir):
    """Given an output directory with json files, parse the requirements.txt
    into a data structure and save into a compiler folder
    """
    # Keep a lookup of requirement names to compare across
    rxments = set()

    # Read in input files, organize by python version, tensorflow version
    for filename in iter_files(dirname):

        # Derive the name and versions from the filename (also in inputs:name)
        result = read_json(filename)
        if "requirements.txt" not in result:
            continue

        # Collect bare library names, stripping any version specifier
        for x in result["requirements.txt"]:
            rxments.add(re.split("(==|@|<=|>=)", x)[0].strip())

    # Now create a flattened dict frame with versions, and a lookup
    requirements = {}
    for filename in iter_files(dirname):

        versions = dict.fromkeys(rxments, None)
        dep = DependencyVersion(filename)
        basename = os.path.splitext(os.path.basename(filename))[0]

        # Re-read this file's result (the loop above does not keep it)
        result = read_json(filename)

        # **Important** this is specific to tensorflow
        for x in result.get("requirements.txt", []):
            if "file://" in x:
                versions["tensorflow"] = dep.tfversion
            else:
                version = re.split("(==|@|<=|>=)", x)[-1].strip()
                library = re.split("(==|@|<=|>=)", x)[0].strip()
                versions[library] = version

        requirements[basename] = versions

    # Finally, create a data frame (of lists)
    df = [["name", "x", "y", "value", "tensorflow", "python"]]
    for ycoord, requirement in enumerate(rxments):
        for xcoord, filename in enumerate(iter_files(dirname)):

            # Create object to parse versions
            dep = DependencyVersion(filename)
            basename = os.path.splitext(os.path.basename(filename))[0]
            df.append([
                basename,
                xcoord,
                ycoord,
                requirements[basename][requirement],
                dep.tfversion,
                dep.pyversion,
            ])

    # Save versions and matrix to file
    outfile = os.path.join(outdir, "requirements.txt.json")
    write_json(requirements, outfile)
    outfile = os.path.join(outdir, "requirements.txt.tsv")
    write_rows(df, outfile, sep="\t")
    return outfile
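write_rows is a caliper file utility; a stand-in consistent with the call above (hypothetical, not the library's implementation) could be:

def write_rows(rows, outfile, sep="\t"):
    # Hypothetical stand-in: write a list of lists as delimited lines
    with open(outfile, "w") as fh:
        for row in rows:
            fh.write(sep.join("" if x is None else str(x) for x in row) + "\n")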
Example #7
def main():
    """main entrypoint for caliper analysis"""
    parser = get_parser()

    # If an error occurs while parsing the arguments, the interpreter will exit with value 2
    args, extra = parser.parse_known_args()

    filename = os.path.abspath(args.filename) if args.filename else None
    if not filename or not os.path.exists(filename):
        sys.exit("A --filename with similarity scores is required.")

    # Prepare output directory
    if not args.outdir or not os.path.exists(args.outdir):
        sys.exit("The output directory %s does not exist" % args.outdir)

    sims = read_json(filename)

    # First derive list of labels for rows and columns
    labels = set()
    for key in sims:
        # Important: ".." is the separator, since "-" can appear in a version
        label1, label2 = key.split("..")
        if re.search("(rc|a|b)", label1) or re.search("(rc|a|b)", label2):
            continue
        labels.add(label1)
        labels.add(label2)

    # Versions need to be sorted by version, not as strings
    # For now we have removed the release candidates
    labels = list(labels)
    try:
        labels.sort(key=StrictVersion)
    except ValueError:
        labels.sort()

    # Next create a data frame for each score type
    dfs = {
        x: pandas.DataFrame(index=labels, columns=labels)
        for x in next(iter(sims.values()))
    }
    for pair, values in sims.items():
        label1, label2 = pair.split("..")
        if re.search("(rc|a|b)", label1) or re.search("(rc|a|b)", label2):
            continue
        for key, value in values.items():
            dfs[key].loc[label1, label2] = value
            dfs[key].loc[label2, label1] = value

    # Create output directory
    outdir = os.path.join(args.outdir, "plots")
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # Finally, prepare plots!
    for name, df in dfs.items():
        fig, ax = plt.subplots(figsize=(args.dim, args.dim))
        cax = ax.matshow(df.to_numpy(dtype=float), interpolation="nearest")
        ax.grid(True)
        plt.title("%s Version Similarity: %s" % (args.package.upper(), name))
        plt.xticks(range(len(labels)), labels, rotation=90)
        plt.yticks(range(len(labels)), labels)
        fig.colorbar(
            cax,
            ticks=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
                   0.75, 0.8, 0.85, 0.90, 0.95, 1],
        )
        # plt.show()
        for extension in ["png", "svg"]:
            if args.name:
                outfile = os.path.join(
                    outdir,
                    "pypi-%s-%s-%s-plot.%s" %
                    (args.package, name, args.name, extension),
                )
            else:
                outfile = os.path.join(
                    outdir,
                    "pypi-%s-%s-plot.%s" % (args.package, name, extension))
            print("Saving %s" % outfile)
            plt.savefig(outfile, dpi=300)
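get_parser is defined elsewhere in caliper; judging from the attributes main() reads above, a hypothetical sketch of it would be:

import argparse

def get_parser():
    # Hypothetical sketch matching the arguments used in main() above;
    # the real caliper parser may differ in names, defaults, and help text
    parser = argparse.ArgumentParser(description="caliper analysis")
    parser.add_argument("--filename", help="json file with similarity scores")
    parser.add_argument("--outdir", help="existing output directory")
    parser.add_argument("--package", help="package name for plot titles")
    parser.add_argument("--name", help="optional suffix for plot filenames")
    parser.add_argument("--dim", type=int, default=20, help="figure dimension")
    return parser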
Example #8
def parse_tests(dirname, outdir, package):
    """Assemble all test results into one large data structure. This will
    be too large to load into the browser at once, but should be okay for Flask.
    """
    results = {}
    datadir = os.path.join(dirname, "data")

    # We need to keep a list of tests so the data structure is consistent
    tests = set()

    # We also want the versions sorted
    versions = set()
    groups = {}
    for filename in iter_files(datadir, package):
        basename = os.path.basename(filename)
        dep = DependencyVersion(filename, package)

        # Don't include release candidates, or a/b, etc.
        if re.search("(rc|a|b)", basename):
            continue

        if dep.tfversion not in groups:
            groups[dep.tfversion] = []
        groups[dep.tfversion].append(filename)
        versions.add(dep.tfversion)

    # Sort the versions, we will add them to groups
    versions = list(versions)
    versions.sort(key=StrictVersion)

    # Loop through sorted versions
    for version in versions:
        for filename in groups[version]:
            basename = os.path.basename(filename)

            # Create object to parse versions
            dep = DependencyVersion(filename, package)
            results[dep.pyversion] = []
            result = read_json(filename)
            for test in result.get("tests", []):
                tests.add(test)

    # Read in input files, organize by python version, tensorflow version
    for version in versions:
        for filename in groups[version]:
            basename = os.path.basename(filename)

            # Create object to parse versions
            dep = DependencyVersion(filename, package)

            # Make sure the test has at least one result
            result = read_json(filename)
            if "tests" not in result:
                result["tests"] = {"build": {"retval": result["build_retval"]}}

            # Make sure we have all tests, ordered the same, -1 indicates not run
            result_tests = result.get("tests", {})
            test_list = []
            for test in tests:
                if test in result_tests:
                    entry = result_tests[test]
                else:
                    entry = {"retval": -1}

                # y axis will be tensorflow version, x axis will be test name
                entry["x_name"] = test
                entry["y_tensorflow"] = dep.tfversion
                test_list.append(entry)

            results[dep.pyversion] += test_list

    # Write to output file so we can generate a d3
    outfile = os.path.join(dirname, "compiled", "test-results-by-python.json")
    write_json(results, outfile)
    return outfile
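For orientation, each entry appended to results[dep.pyversion] has roughly this shape (the values here are illustrative, not taken from a real run):

# Illustrative shape of one entry; retval of -1 means the test was not run
entry = {
    "retval": 0,
    "x_name": "import",       # test name, plotted on the x axis
    "y_tensorflow": "2.1.0",  # tensorflow version, plotted on the y axis
}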
Example #9
def extract_requirements(datadir, outdir, package):
    """Create a lookup for requirements including (and not including) versions
    to generate similarity matrices. An alternative is to extract all
    requirements (to see change between version) for a package and have this
    say something about the parent package, but this seems more complicated.
    """
    # Keep a lookup of requirements.txt to compare across
    requirements = {}

    # Read in input files, organize by python version, tensorflow version
    for filename in iter_files(datadir, package):

        # Skip release candidates and a/b for now
        if re.search("(rc|b|a)", os.path.basename(filename)):
            continue

        # Only include those we have requirements for (meaning success install)
        result = read_json(filename)
        if "requirements.txt" in result:
            requirements[filename] = [
                x.strip().lower() for x in result["requirements.txt"]
            ]

    # Level 1 similarity: overall modules
    # Level 2 similarity: modules and version string
    sims = {}

    # First just compare functions that exist
    for filename1, modules1 in requirements.items():
        for filename2, modules2 in requirements.items():

            # splitext, not rstrip: rstrip(".json") strips characters, not a suffix
            uid1 = os.path.splitext(os.path.basename(filename1))[0]
            uid2 = os.path.splitext(os.path.basename(filename2))[0]

            # Don't calculate it twice
            scores = {}
            key = "..".join(sorted([uid1, uid2]))
            if key in sims:
                continue

            # Diagonal is perfectly similar
            if uid1 == uid2:
                scores = {"module_sim": 1, "module_version_sim": 1}
                sims[key] = scores
                continue

            # Level 2: modules and version strings
            set1 = set(modules1)
            set2 = set(modules2)
            scores["module_version_sim"] = information_coefficient(
                len(set1), len(set2), len(set1.intersection(set2)))

            # Level 1: module names only, ignoring versions and casing
            funcs1 = {re.split("(==|@)", x)[0].strip().lower() for x in set1}
            funcs2 = {re.split("(==|@)", x)[0].strip().lower() for x in set2}
            scores["module_sim"] = information_coefficient(
                len(funcs1), len(funcs2), len(funcs1.intersection(funcs2)))
            sims[key] = scores

    outfile = os.path.join(outdir, "pypi-%s-requirements-sims.json" % package)
    write_json(sims, outfile)
    return outfile
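information_coefficient is imported from caliper's metrics utilities and not shown here; a stand-in consistent with the call signature above is a Dice-style overlap score (hypothetical, the real metric may differ in detail):

def information_coefficient(total1, total2, intersect):
    # Hypothetical stand-in: Dice-style similarity in [0, 1];
    # 1 when both sets are identical, 0 when they share nothing
    if total1 + total2 == 0:
        return 0.0
    return 2.0 * intersect / (total1 + total2)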