import bisect
import itertools
from typing import Dict, Generator, List, Union

# NPMPackage, JSONPath, get_in, _get_pkg, and visit_deps are defined
# elsewhere in the surrounding module.


def flatten_deps(
    node_list_output: Dict[str, Union[Dict, str]]
) -> Generator[NPMPackage, None, None]:
    """Yield NPMPackage objects from npm list JSON output in DFS order,
    with parent-to-child references by package ID.
    """
    pkgs: List[NPMPackage] = []
    paths: List[JSONPath] = []
    for path in visit_deps(node_list_output):
        pkg: NPMPackage
        if path:
            assert isinstance(path[-1], str)
            pkg = _get_pkg(get_in(node_list_output, path), path[-1])
        else:
            pkg = _get_pkg(get_in(node_list_output, path))

        for prev_pkg, prev_pkg_path in itertools.zip_longest(
                reversed(pkgs), reversed(paths)):
            # match direct deps as one level deeper with a matching prefix
            # e.g. from ["dependencies", "yargs"]
            # match ["dependencies", "yargs", "dependencies", "yarg-parser"]
            # but do not match:
            # [] (the root)
            # ["dependencies", "ps"] (a sibling dep)
            # or ["dependencies", "yargs", "dependencies", \
            #        "yarg-parser", "dependencies", "yarg-parser-dep"] (an indirect child)
            if (len(prev_pkg_path) - 2 == len(path)
                    and path == prev_pkg_path[:len(path)]):
                bisect.insort(pkg.dependencies, prev_pkg.package_id)

        yield pkg
        pkgs.append(pkg)
        paths.append(path)
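For intuition, here is a minimal, self-contained sketch of the path machinery flatten_deps leans on. The get_in stand-in and the sample npm list output are illustrative, not the project's real helpers; the prefix test at the end is the same direct-child check used in the loop above.

from typing import Any, List, Sequence

def get_in_sketch(d: Any, path: Sequence[str], default: Any = None) -> Any:
    # hypothetical stand-in for the project's get_in helper: walk a nested
    # dict along a sequence of keys, returning default if any key is missing
    for key in path:
        if not isinstance(d, dict) or key not in d:
            return default
        d = d[key]
    return d

npm_list = {
    "name": "root",
    "dependencies": {
        "yargs": {
            "version": "1.0.0",
            "dependencies": {"yarg-parser": {"version": "2.0.0"}},
        },
        "ps": {"version": "0.1.0"},
    },
}

parent: List[str] = ["dependencies", "yargs"]
child: List[str] = ["dependencies", "yargs", "dependencies", "yarg-parser"]
# a direct child path is exactly two keys deeper and shares the parent's prefix
assert len(child) - 2 == len(parent) and child[: len(parent)] == parent
print(get_in_sketch(npm_list, child))  # {'version': '2.0.0'}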
Example #2
async def run_pipeline(
    source: Generator[Dict[str, Any], None, None], args: argparse.Namespace
) -> AsyncGenerator[Dict, None]:
    log.info(f"{pipeline.name} pipeline started")

    for line in source:
        result = extract_fields(
            line,
            [
                "branch",
                "commit",
                "tag",
                "org",
                "repo",
                "repo_url",
                "ref",
                "dependency_files",
            ],
        )
        result["tasks"] = []

        for task_data in get_in(line, ["task_results"], []):
            # filter for node list_metadata output to parse and flatten deps
            task_name = get_in(task_data, ["name"], None)
            if task_name not in args.repo_task:
                continue

            task_command = get_in(task_data, ["command"], None)

            task_result = extract_fields(
                task_data,
                [
                    "command",
                    "container_name",
                    "exit_code",
                    "name",
                    "relative_path",
                    "working_dir",
                ],
            )

            updates = parse_command(task_name, task_command, task_data, line)
            if updates:
                if task_name == "list_metadata":
                    log.info(
                        f"wrote {task_result['name']} {result['org']}/{result['repo']} {task_result['relative_path']}"
                        f" {result['ref']['value']} w/"
                        f" {updates['dependencies_count']} deps and {updates.get('problems_count', 0)} problems"
                        # f" {updates['graph_stats']}"
                    )
                elif task_name == "audit":
                    log.info(
                        f"wrote {task_result['name']} {result['org']}/{result['repo']} {task_result['relative_path']}"
                        f" {result['ref']['value']} w/"
                        f" {updates['vulnerabilities_count']} vulns"
                    )
                task_result.update(updates)
            result["tasks"].append(task_result)
        yield result
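Since run_pipeline is an async generator, callers drain it with async for. A minimal consumption sketch, with a stubbed pipeline standing in because the real one needs task-result input:

import asyncio
from typing import AsyncGenerator, Dict

async def stub_pipeline() -> AsyncGenerator[Dict, None]:
    # hypothetical stand-in for run_pipeline(source, args)
    for i in range(3):
        yield {"org": "example", "repo": f"repo-{i}", "tasks": []}

async def main() -> None:
    async for result in stub_pipeline():
        print(result["repo"], len(result["tasks"]))

asyncio.run(main())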
Example #3
def cargo_metadata_to_rust_crates(cargo_meta_out: Dict) -> Dict[str, RustCrate]:
    assert (
        get_in(cargo_meta_out, ["metadata", "version"]) == 1
    ), "cargo metadata format was not version 1"
    # build hashmap by pkg_id so we can lookup additional package info from
    # resolved crate as packages[crate.id]
    crates: Dict[str, RustCrate] = {}
    for n in get_in(cargo_meta_out, ["metadata", "nodes"]):
        crate = RustCrate(**extract_fields(n, {"id", "features", "deps"}))
        assert crate.id not in crates
        crates[crate.id] = crate
    return crates
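The by-id indexing pattern above is easy to exercise standalone. A sketch with a hypothetical dataclass in place of RustCrate (which is defined elsewhere in the project):

from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class CrateSketch:  # hypothetical stand-in for RustCrate
    id: str
    features: List[str] = field(default_factory=list)
    deps: List[dict] = field(default_factory=list)

nodes = [
    {"id": "serde 1.0.104", "features": ["derive"], "deps": []},
    {"id": "rand 0.7.3", "features": [], "deps": [{"name": "serde"}]},
]
crates: Dict[str, CrateSketch] = {}
for n in nodes:
    crate = CrateSketch(**n)
    assert crate.id not in crates  # a duplicate ID would silently clobber an entry
    crates[crate.id] = crate
print(crates["rand 0.7.3"].deps)  # [{'name': 'serde'}]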
Example #4
def _visit_child_deps(node_list_output: Dict[str, Union[Dict, str]],
                      path: JSONPath) -> Generator[JSONPath, None, None]:
    output = get_in(node_list_output, path)
    if output:
        for child_dep_key, _child_dep in output.items():
            # recurse into the child's own "dependencies" subtree first (DFS)
            yield from _visit_child_deps(
                node_list_output,
                list(path) + [child_dep_key, "dependencies"])
            yield list(path) + [child_dep_key]
        yield path
Example #5
def parse_cargo_task(task_name: str, task_result: Dict) -> Optional[Dict]:
    parsed_stdout = parse_stdout_as_json(get_in(task_result, ["stdout"], None))
    if parsed_stdout is None:
        log.warn("got non-JSON stdout for cargo task")
        return None

    if task_name == "list_metadata":
        return parse_cargo_list_metadata(parsed_stdout)
    elif task_name == "audit":
        return parse_cargo_audit(parsed_stdout)
    elif task_name == "install":
        return None
    else:
        raise NotImplementedError()
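parse_cargo_task (and parse_npm_task below) lean on parse_stdout_as_json; presumably it is a lenient json.loads wrapper along these lines (an assumption, since the helper is not shown here):

import json
from typing import Any, Optional

def parse_stdout_as_json_sketch(stdout: Optional[str]) -> Optional[Any]:
    # hypothetical equivalent of parse_stdout_as_json: tolerate missing or
    # non-JSON stdout by returning None instead of raising
    if stdout is None:
        return None
    try:
        return json.loads(stdout)
    except json.JSONDecodeError:
        return None

print(parse_stdout_as_json_sketch('{"ok": true}'))  # {'ok': True}
print(parse_stdout_as_json_sketch("not json"))  # None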
Example #6
def visit_deps(
    node_list_output: Dict[str, Union[Dict, str]]
) -> Generator[JSONPath, None, None]:
    """Yield JSON paths to valid dep nodes from npm list JSON output in DFS order.

    Child dep keys are unordered.
    """
    for path in _visit_child_deps(node_list_output, ["dependencies"]):
        if is_valid_node_list_output_node(get_in(node_list_output, path)):
            yield path

    if is_valid_node_list_output_top_level(node_list_output):
        yield []
Example #7
def parse_npm_task(task_name: str, task_result: Dict) -> Optional[Dict]:
    # TODO: reuse cached results for each set of dep files w/ hashes and task name
    parsed_stdout = parse_stdout_as_json(get_in(task_result, ["stdout"], None))
    if parsed_stdout is None:
        log.warn("got non-JSON stdout for npm")
        return None

    if task_name == "list_metadata":
        return parse_npm_list(parsed_stdout)
    elif task_name == "audit":
        return parse_npm_audit(parsed_stdout)
    elif task_name == "install":
        return None
    else:
        raise NotImplementedError()
Example #8
def parse_npm_list(parsed_stdout: Dict) -> Dict:
    deps = list(flatten_deps(parsed_stdout))
    updates = {"problems": get_in(parsed_stdout, ["problems"], [])}
    updates["dependencies"] = [asdict(dep) for dep in deps]
    updates["dependencies_count"] = len(deps)
    updates["problems_count"] = len(updates["problems"])

    # flatten_deps yields the root package last (its path [] is visited
    # after all child paths), so deps[-1] is the root package
    updates["root"] = asdict(deps[-1]) if deps else None
    updates["direct_dependencies_count"] = (
        len(deps[-1].dependencies) if deps else None
    )
    updates["graph_stats"] = (
        get_graph_stats(npm_packages_to_networkx_digraph(deps)) if deps else dict()
    )
    return updates
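npm_packages_to_networkx_digraph and get_graph_stats are not shown here; the names suggest a networkx DiGraph with one node per package ID and one edge per parent-to-child dependency, roughly like this sketch (the stats chosen are illustrative):

import networkx as nx

g = nx.DiGraph()
g.add_edge("root@1.0.0", "yargs@1.0.0")  # parent -> direct dependency
g.add_edge("yargs@1.0.0", "yarg-parser@2.0.0")
stats = {"nodes": g.number_of_nodes(), "edges": g.number_of_edges()}
print(stats)  # {'nodes': 3, 'edges': 2}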
Example #9
def cargo_metadata_to_rust_crate_and_packages(
    cargo_meta_out: Dict,
) -> Tuple[Dict[str, RustCrate], Dict[str, RustPackage]]:
    log.debug(
        "running crate-graph on {0[cargo_tomlfile_path]} in {0[org]}/{0[repo]}"
        " at {0[commit]}".format(cargo_meta_out))
    crates = cargo_metadata_to_rust_crates(cargo_meta_out)

    packages: Dict[str, RustPackage] = {}
    for p in get_in(cargo_meta_out, ["metadata", "packages"]):
        pkg = RustPackage(**p)
        assert pkg.id not in packages
        packages[pkg.id] = pkg

    return (crates, packages)
Example #10
def insert_npm_registry_data(
        session: sqlalchemy.orm.Session,
        source: Generator[Dict[str, Any], None, None]) -> None:
    for line in source:
        # save version specific data
        for version, version_data in line["versions"].items():
            fields = extract_nested_fields(
                version_data,
                {
                    "package_name": ["name"],
                    "package_version": ["version"],
                    "shasum": ["dist", "shasum"],
                    "tarball": ["dist", "tarball"],
                    "git_head": ["gitHead"],
                    "repository_type": ["repository", "type"],
                    "repository_url": ["repository", "url"],
                    "description": ["description"],
                    "url": ["url"],
                    "license_type": ["license"],
                    "keywords": ["keywords"],
                    "has_shrinkwrap": ["_hasShrinkwrap"],
                    "bugs_url": ["bugs", "url"],
                    "bugs_email": ["bugs", "email"],
                    "author_name": ["author", "name"],
                    "author_email": ["author", "email"],
                    "author_url": ["author", "url"],
                    "maintainers": ["maintainers"],
                    "contributors": ["contributors"],
                    "publisher_name": ["_npmUser", "name"],
                    "publisher_email": ["_npmUser", "email"],
                    "publisher_node_version": ["_nodeVersion"],
                    "publisher_npm_version": ["_npmVersion"],
                },
            )
            # license can be a string e.g. 'MIT'
            # or a dict e.g. {'type': 'MIT', 'url': 'https://github.com/jonschlinkert/micromatch/blob/master/LICENSE'}
            fields["license_url"] = None
            if isinstance(fields["license_type"], dict):
                fields["license_url"] = fields["license_type"].get("url", None)
                fields["license_type"] = fields["license_type"].get("type", None)

            # looking at you [email protected].{3,4} with:
            # [{"name": "StrongLoop", "url": "http://strongloop.com/license/"}, "MIT"],
            if not ((isinstance(fields["license_type"], str)
                     or fields["license_type"] is None) and
                    (isinstance(fields["license_url"], str)
                     or fields["license_url"] is None)):
                log.warning(
                    f"skipping weird license format {fields['license_type']}")
                fields["license_url"] = None
                fields["license_type"] = None

            # published_at comes from .time[<version>] e.g. '2014-05-23T21:21:04.170Z'
            # (not from the version info object); "time" maps versions to their
            # publication times, along with created and modified timestamps
            fields["published_at"] = get_in(line, ["time", version])
            fields["package_modified_at"] = get_in(line, ["time", "modified"])

            fields["source_url"] = (
                f"https://registry.npmjs.org/{fields['package_name']}"
            )

            if (session.query(NPMRegistryEntry.id).filter_by(
                    package_name=fields["package_name"],
                    package_version=fields["package_version"],
                    shasum=fields["shasum"],
                    tarball=fields["tarball"],
            ).one_or_none()):
                log.debug(
                    f"skipping inserting npm registry entry for {fields['package_name']}@{fields['package_version']}"
                    f" from {fields['tarball']} with sha {fields['shasum']}")
            else:
                session.add(NPMRegistryEntry(**fields))
                session.commit()
                log.info(
                    f"added npm registry entry for {fields['package_name']}@{fields['package_version']}"
                    f" from {fields['tarball']} with sha {fields['shasum']}")
Example #11
def test_get_in_errors(value, path, default, expected_error):
    with pytest.raises(expected_error):
        m.get_in(value, path, default)


def test_get_in(value, path, default, expected):
    assert m.get_in(value, path, default) == expected