Example 1
def serialize_repo_task(
    task_data: Dict[str, Any],
    task_names_to_process: AbstractSet[str],
) -> Optional[Dict[str, Any]]:
    # only handle tasks we were asked to process (e.g. node list_metadata output to parse and flatten deps)
    task_name = get_in(task_data, ["name"], None)
    if task_name not in task_names_to_process:
        return None

    task_command = get_in(task_data, ["command"], None)

    task_result = extract_fields(
        task_data,
        [
            "command",
            "exit_code",
            "name",
            "working_dir",
        ],
    )

    updates = parse_command(task_name, task_command, task_data)
    if updates:
        if task_name == "list_metadata":
            log.info(
                f"wrote {task_result['name']} w/"
                f" {updates['dependencies_count']} deps and {updates.get('problems_count', 0)} problems"
                # f" {updates['graph_stats']}"
            )
        elif task_name == "audit":
            log.info(
                f"wrote {task_result['name']} w/ {updates['vulnerabilities_count']} vulns"
            )
        task_result.update(updates)
    return task_result
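These examples lean on a few small helpers — get_in, extract_fields, and (in Example 15) extract_nested_fields — whose definitions the excerpt omits. Minimal sketches consistent with how they are called here, as assumptions rather than the project's actual implementations:

from typing import Any, Dict, Iterable, List

def get_in(value: Any, path: List[str], default: Any = None) -> Any:
    # assumed: walk a path of keys into nested dicts, returning
    # default when a key is missing along the way
    for key in path:
        if not isinstance(value, dict):
            raise TypeError(f"cannot index {type(value)} with key {key!r}")
        if key not in value:
            return default
        value = value[key]
    return value

def extract_fields(data: Dict, fields: Iterable[str]) -> Dict[str, Any]:
    # assumed: pick the listed top-level fields, defaulting missing ones to None
    return {field: data.get(field, None) for field in fields}

def extract_nested_fields(data: Dict, paths: Dict[str, List[str]]) -> Dict[str, Any]:
    # assumed: map each output name to the value at its nested JSON path
    return {name: get_in(data, path) for name, path in paths.items()}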
Example 2
def flatten_deps(
    node_list_output: Dict[str, Union[Dict, str]]
) -> Generator[NPMPackage, None, None]:
    """walks npm list JSON output in DFS order, yielding NPMPackage
    objs with parent-to-child refs by package ID
    """
    pkgs: List[NPMPackage] = []
    paths: List[JSONPath] = []
    for path in visit_deps(node_list_output):
        pkg: NPMPackage
        if path:
            assert isinstance(path[-1], str)
            pkg = _get_pkg(get_in(node_list_output, path), path[-1])
        else:
            pkg = _get_pkg(get_in(node_list_output, path))

        for prev_pkg, prev_pkg_path in itertools.zip_longest(
                reversed(pkgs), reversed(paths)):
            # match direct deps as one level deeper with a matching prefix
            # e.g. from ["dependencies", "yargs"]
            # match ["dependencies", "yargs", "dependencies", "yarg-parser"]
            # but do not match:
            # [] (the root)
            # ["dependencies", "ps"] (a sibling dep)
            # or ["dependencies", "yargs", "dependencies", \
            #        "yarg-parser", "dependencies", "yarg-parser-dep"] (an indirect child)
            if (len(prev_pkg_path) - 2 == len(path)
                    and path == prev_pkg_path[:len(path)]):
                bisect.insort(pkg.dependencies, prev_pkg.package_id)

        yield pkg
        pkgs.append(pkg)
        paths.append(path)
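The prefix check above relies on npm list structure: a direct child's path is its parent's path plus ["dependencies", <name>], so it is exactly two elements longer and starts with the parent path. A standalone illustration of the same test:

def is_direct_child(child_path: list, parent_path: list) -> bool:
    # a direct child is exactly two path elements deeper and
    # shares the parent's path as a prefix
    return (len(child_path) - 2 == len(parent_path)
            and child_path[:len(parent_path)] == parent_path)

parent = ["dependencies", "yargs"]
assert is_direct_child(
    ["dependencies", "yargs", "dependencies", "yarg-parser"], parent)
assert not is_direct_child([], parent)  # the root
assert not is_direct_child(["dependencies", "ps"], parent)  # a sibling dep
assert not is_direct_child(  # an indirect child
    ["dependencies", "yargs", "dependencies", "yarg-parser",
     "dependencies", "yarg-parser-dep"], parent)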
Example 3
def cargo_metadata_to_rust_crates(
    cargo_meta_out: Dict,
) -> Dict[str, RustCrate]:
    assert (
        get_in(cargo_meta_out, ["metadata", "version"]) == 1
    ), "cargo metadata format was not version 1"
    # build hashmap by pkg_id so we can lookup additional package info from
    # resolved crate as packages[crate.id]
    crates: Dict[str, RustCrate] = {}
    for n in get_in(cargo_meta_out, ["metadata", "nodes"]):
        crate = RustCrate(**extract_fields(n, {"id", "features", "deps"}))
        assert crate.id not in crates
        crates[crate.id] = crate
    return crates
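RustCrate itself is not shown in the excerpt; judging from the extract_fields call, a plausible shape (an assumption) is a dataclass over the id, features, and deps entries of a cargo metadata resolve node:

from dataclasses import dataclass, field
from typing import Any, Dict, List

@dataclass
class RustCrate:
    # hypothetical: mirrors a node in `cargo metadata`'s resolve graph
    id: str
    features: List[str] = field(default_factory=list)
    deps: List[Dict[str, Any]] = field(default_factory=list)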
Example 4
def _visit_child_deps(node_list_output: Dict[str, Union[Dict, str]],
                      path: JSONPath) -> Generator[JSONPath, None, None]:
    output = get_in(node_list_output, path)
    if output:
        for child_dep_key in output:
            for nested_child_path in _visit_child_deps(
                    node_list_output,
                    list(path) + [child_dep_key, "dependencies"]):
                yield nested_child_path
            yield list(path) + [child_dep_key]
        yield path
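To see the traversal order, here is a small walk over hand-written npm list output (assuming the get_in sketch after Example 1); nested children come out before their parents, and a trailing path for each dependencies dict itself is yielded last:

example = {
    "dependencies": {
        "yargs": {"dependencies": {"yarg-parser": {}}},
        "ps": {},
    }
}
for p in _visit_child_deps(example, ["dependencies"]):
    print(p)
# ['dependencies', 'yargs', 'dependencies', 'yarg-parser']
# ['dependencies', 'yargs', 'dependencies']
# ['dependencies', 'yargs']
# ['dependencies', 'ps']
# ['dependencies']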
Example 5
def parse_command(task_name: str, task_command: str,
                  task_data: Dict) -> Optional[Dict]:
    package_manager_name = get_in(task_data,
                                  ["envvar_args", "PACKAGE_MANAGER"])
    if package_manager_name == "npm":
        return parse_npm_task(task_name, task_data)
    elif package_manager_name == "yarn":
        return parse_yarn_task(task_name, task_data)
    elif package_manager_name == "cargo":
        return parse_cargo_task(task_name, task_data)
    log.warning(f"unrecognized command {task_command}")
    return None
Example 6
def parse_npm_list(parsed_stdout: Dict) -> Dict:
    deps = list(flatten_deps(parsed_stdout))
    updates = {"problems": get_in(parsed_stdout, ["problems"], [])}
    updates["dependencies"] = [asdict(dep) for dep in deps]
    updates["dependencies_count"] = len(deps)
    updates["problems_count"] = len(updates["problems"])

    updates["root"] = asdict(deps[-1]) if len(deps) else None
    updates["direct_dependencies_count"] = (len(deps[-1].dependencies)
                                            if len(deps) else None)
    updates["graph_stats"] = (get_graph_stats(
        npm_packages_to_networkx_digraph(deps)) if deps else dict())
    return updates
Example 7
def parse_cargo_task(task_name: str, task_result: Dict) -> Optional[Dict]:
    parsed_stdout = parse_stdout_as_json(get_in(task_result, ["stdout"], None))
    if parsed_stdout is None:
        log.warning("got non-JSON stdout for cargo task")
        return None

    if task_name == "list_metadata":
        return parse_cargo_list_metadata(parsed_stdout)
    elif task_name == "audit":
        return parse_cargo_audit(parsed_stdout)
    elif task_name == "install":
        return None
    else:
        raise NotImplementedError()
Example 8
def visit_deps(
    node_list_output: Dict[str, Union[Dict, str]]
) -> Generator[JSONPath, None, None]:
    """generator of JSON paths to valid dep nodes in npm list JSON
    output, in DFS order

    Child dep keys are visited in no particular order.
    """
    for path in _visit_child_deps(node_list_output, ["dependencies"]):
        if is_valid_node_list_output_node(get_in(node_list_output, path)):
            yield path

    if is_valid_node_list_output_top_level(node_list_output):
        yield []
Example 9
def parse_npm_task(task_name: str, task_result: Dict) -> Optional[Dict]:
    # TODO: reuse cached results for each set of dep files w/ hashes and task name
    parsed_stdout = parse_stdout_as_json(get_in(task_result, ["stdout"], None))
    if parsed_stdout is None:
        log.warning("got non-JSON stdout for npm")
        return None

    if task_name == "list_metadata":
        return parse_npm_list(parsed_stdout)
    elif task_name == "audit":
        return parse_npm_audit(parsed_stdout)
    elif task_name == "install":
        return None
    else:
        raise NotImplementedError()
Example 10
def cargo_metadata_to_rust_crate_and_packages(
    cargo_meta_out: Dict,
) -> Tuple[Dict[str, RustCrate], Dict[str, RustPackage]]:
    log.debug(
        "running crate-graph on {0[cargo_tomlfile_path]} in {0[org]}/{0[repo]} at {0[commit]}".format(
            cargo_meta_out
        )
    )
    crates = cargo_metadata_to_rust_crates(cargo_meta_out)

    packages: Dict[str, RustPackage] = {}
    for p in get_in(cargo_meta_out, ["metadata", "packages"]):
        pkg = RustPackage(**p)
        assert pkg.id not in packages
        packages[pkg.id] = pkg

    return (crates, packages)
Example 11
def serialize_npm_registry_constraints(
        version_data: Dict[str, Any]) -> List[Dict[str, str]]:
    constraints = []
    for prefix, constraint_field in (
        ("", "dependencies"),
        ("optional", "optionalDependencies"),
        ("dev", "devDependencies"),
        ("bundle", "bundleDependencies"),
        ("peer", "peerDependencies"),
    ):
        field_data = get_in(version_data, [constraint_field])
        if field_data is None:
            continue

        for name, version_range in field_data.items():
            constraints.append(
                dict(name=name,
                     version_range=version_range,
                     type_prefix=prefix))
    return constraints
Example 12
def serialize_npm_registry_constraints(
        version_data: Dict[str, Any]) -> List[Dict[str, str]]:
    constraints = []
    for prefix, constraint_field in (
        ("", "dependencies"),
        ("optional", "optionalDependencies"),
        ("dev", "devDependencies"),
        ("peer", "peerDependencies"),
    ):
        field_data = get_in(version_data, [constraint_field])
        if field_data is None:
            continue
        if not isinstance(field_data, dict):
            log.warning(
                f"got unexpected dependencies data type for {prefix} "
                f"{constraint_field}: {type(field_data)}"
            )
            continue

        for name, version_range in field_data.items():
            constraints.append(
                dict(name=name,
                     version_range=version_range,
                     type_prefix=prefix))
    return constraints
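A quick illustration of the flattening with hypothetical registry version data (using the helper sketches after Example 1):

version_data = {
    "dependencies": {"lodash": "^4.17.0"},
    "devDependencies": {"jest": "~26.0.0"},
}
print(serialize_npm_registry_constraints(version_data))
# [{'name': 'lodash', 'version_range': '^4.17.0', 'type_prefix': ''},
#  {'name': 'jest', 'version_range': '~26.0.0', 'type_prefix': 'dev'}]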
Example 13
def test_get_in_errors(value, path, default, expected_error):
    with pytest.raises(expected_error):
        m.get_in(value, path, default)
Example 14
def test_get_in(value, path, default, expected):
    assert m.get_in(value, path, default) == expected
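Examples 13 and 14 read as pytest parametrized test bodies whose @pytest.mark.parametrize decorators the excerpt drops (m is the module under test). A plausible reconstruction of the second one, with illustrative cases that are not from the source:

import pytest

@pytest.mark.parametrize(
    "value, path, default, expected",
    [
        ({"a": {"b": 1}}, ["a", "b"], None, 1),           # nested hit
        ({"a": {}}, ["a", "b"], "fallback", "fallback"),  # miss -> default
        ({"a": 1}, [], None, {"a": 1}),                   # empty path -> whole value
    ],
)
def test_get_in(value, path, default, expected):
    assert m.get_in(value, path, default) == expected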
Example 15
def serialize_npm_registry_entries(
    npm_registry_entries: Iterable[Dict[str,
                                        Any]]) -> Iterable[NPMRegistryEntry]:
    for entry in npm_registry_entries:
        # save version specific data
        for version, version_data in entry["versions"].items():
            fields = extract_nested_fields(
                version_data,
                {
                    "package_name": ["name"],
                    "package_version": ["version"],
                    "shasum": ["dist", "shasum"],
                    "tarball": ["dist", "tarball"],
                    "git_head": ["gitHead"],
                    "repository_type": ["repository", "type"],
                    "repository_url": ["repository", "url"],
                    "description": ["description"],
                    "url": ["url"],
                    "license_type": ["license"],
                    "keywords": ["keywords"],
                    "has_shrinkwrap": ["_hasShrinkwrap"],
                    "bugs_url": ["bugs", "url"],
                    "bugs_email": ["bugs", "email"],
                    "author_name": ["author", "name"],
                    "author_email": ["author", "email"],
                    "author_url": ["author", "url"],
                    "maintainers": ["maintainers"],
                    "contributors": ["contributors"],
                    "publisher_name": ["_npmUser", "name"],
                    "publisher_email": ["_npmUser", "email"],
                    "publisher_node_version": ["_nodeVersion"],
                    "publisher_npm_version": ["_npmVersion"],
                    "scripts": ["scripts"],
                },
            )
            fields["constraints"] = serialize_npm_registry_constraints(
                version_data)
            log.debug(
                f"serialized npm registry constraints for {fields['package_name']}@{fields['package_version']} : {fields['constraints']}"
            )

            # license can be a string e.g. 'MIT'
            # or a dict e.g. {'type': 'MIT', 'url': 'https://github.com/jonschlinkert/micromatch/blob/master/LICENSE'}
            fields["license_url"] = None
            if isinstance(fields["license_type"], dict):
                fields["license_url"] = fields["license_type"].get("url", None)
                fields["license_type"] = fields["license_type"].get(
                    "type", None)

            # looking at you [email protected].{3,4} with:
            # [{"name": "StrongLoop", "url": "http://strongloop.com/license/"}, "MIT"],
            if not ((isinstance(fields["license_type"], str)
                     or fields["license_type"] is None) and
                    (isinstance(fields["license_url"], str)
                     or fields["license_url"] is None)):
                log.warning(
                    f"skipping weird license format {fields['license_type']}")
                fields["license_url"] = None
                fields["license_type"] = None

            # published_at comes from .time[<version>] e.g. '2014-05-23T21:21:04.170Z'
            # (not from the version info object); the "time" field maps versions
            # to publish timestamps, along with created and modified timestamps
            fields["published_at"] = get_in(entry, ["time", version])
            fields["package_modified_at"] = get_in(entry, ["time", "modified"])

            fields["source_url"] = (
                f"https://registry.npmjs.org/{fields['package_name']}"
            )
            yield NPMRegistryEntry(**fields)
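The license handling above tolerates three formats seen in registry data: a plain SPDX string, a {'type': ..., 'url': ...} dict, and odd list-valued entries that get dropped. The same logic condensed into a standalone sketch (not the project's API):

from typing import Any, Optional, Tuple

def normalize_license(license_value: Any) -> Tuple[Optional[str], Optional[str]]:
    # returns (license_type, license_url); unrecognized formats become (None, None)
    license_type, license_url = license_value, None
    if isinstance(license_type, dict):
        license_url = license_type.get("url", None)
        license_type = license_type.get("type", None)
    if not (isinstance(license_type, (str, type(None)))
            and isinstance(license_url, (str, type(None)))):
        return (None, None)
    return (license_type, license_url)

assert normalize_license("MIT") == ("MIT", None)
assert normalize_license({"type": "MIT", "url": "http://x"}) == ("MIT", "http://x")
assert normalize_license([{"name": "StrongLoop"}, "MIT"]) == (None, None)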