Example #1
0
    def __init__(
        self,
        should_extract=False,
        exclude_folders=None,
        checkers=None,
        logger=None,
        error_mode=ErrorMode.TruncTrace,
        score=0,
    ):
        """Set up a scanner: logger, checkers, directory walker and CVE database handle.

        Args:
            should_extract: stored on the instance as ``should_extract``.
            exclude_folders: folder patterns handed to ``DirWalk``; ``None``
                (the default) means no user-supplied exclusions.
            checkers: mapping of checkers to use; when falsy the checkers are
                loaded through ``load_checkers()``.
            logger: logger to use; defaults to a child of the module ``LOGGER``
                named after this class.
            error_mode: stored on the instance as ``error_mode``.
            score: stored on the instance as ``score``.
        """
        # A None sentinel replaces the former mutable ``[]`` default so that no
        # list object is shared between calls; copying also accepts any iterable.
        exclude_folders = [] if exclude_folders is None else list(exclude_folders)

        self.logger = logger or LOGGER.getChild(self.__class__.__name__)
        # Update egg if installed in development mode
        if IS_DEVELOP():
            self.logger.debug("Updating egg_info")
            update_egg()

        # Load checkers if not given
        self.checkers = checkers or self.load_checkers()
        self.score = score
        self.total_scanned_files = 0
        self.exclude_folders = exclude_folders + [".git"]

        # NOTE(review): the walker pattern is built from the caller's list, not
        # from self.exclude_folders, so ".git" is not part of it - confirm
        # whether that is intended before changing it.
        self.walker = DirWalk(
            folder_exclude_pattern=";".join(
                exclude if exclude.endswith("*") else exclude + "*"
                for exclude in exclude_folders
            )
        ).walk
        self.should_extract = should_extract
        self.file_stack = []
        self.error_mode = error_mode
        self.cve_db = CVEDB()
Example #2
0
    def run_python_package_checkers(self, filename, lines):
        """
        This generator runs only for python packages.
        There are no actual checkers.
        The ProductInfo is computed without the help of any checkers from PKG-INFO or METADATA.

        Args:
            filename: name of the file being scanned (used in log messages only).
            lines: text in which the ``Name:`` and ``Version:`` headers are searched.

        Yields:
            ``(ProductInfo, file_path)`` when both headers are present and the
            CVE database knows at least one vendor for the product.
        """
        name_match = search(compile(r"^Name: (.+)$", MULTILINE), lines)
        version_match = search(compile(r"^Version: (.+)$", MULTILINE), lines)

        # There are packages with a METADATA file in them containing different data from what the tool expects.
        # Checking the matches explicitly (instead of catching AttributeError around
        # the whole body) keeps unrelated AttributeErrors from being silently hidden.
        if name_match is None or version_match is None:
            self.logger.debug(f"{filename} is an invalid METADATA/PKG-INFO")
        else:
            product = name_match.group(1)
            version = version_match.group(1)

            # Reuse the scanner's database handle when it has one rather than
            # constructing a new CVEDB for every scanned package.
            cve_db = getattr(self, "cve_db", None)
            if cve_db is None:
                cve_db = CVEDB()
            vendor_package_pair = cve_db.get_vendor_product_pairs(product)

            if vendor_package_pair:
                vendor = vendor_package_pair[0]["vendor"]
                file_path = "".join(self.file_stack)

                self.logger.info(f"{file_path} is {product} {version}")

                yield ProductInfo(vendor, product, version), file_path

        self.logger.debug(f"Done scanning file: {filename}")
Example #3
0
    def __init__(
        self, filename: str, sbom_type: str = "spdx", logger: Optional[Logger] = None
    ):
        """Remember the SBOM file and type, and open a CVE database handle.

        An ``sbom_type`` that is not listed in ``self.SBOMtype`` is recorded
        as ``"unknown"``.
        """
        if not logger:
            logger = LOGGER.getChild(self.__class__.__name__)

        self.filename = filename
        self.sbom_data = defaultdict(dict)
        self.type = sbom_type if sbom_type in self.SBOMtype else "unknown"
        self.logger = logger

        # Database handle used for vendor lookups (version check disabled)
        self.cvedb = CVEDB(version_check=False)
Example #4
0
    def find_vendor_product(self):
        """Find vendor-product pairs for ``self.product_name`` in the database.

        When nothing matches and the product name contains digits, the digits
        are stripped from ``self.product_name`` and the lookup is retried.

        Returns:
            The fetched rows (``[('vendor', 'product'), ...]``) when at least
            one pair exists, ``[]`` when no match is found for a non-empty
            product name, and ``None`` when the product name is empty.
        """

        LOGGER.debug(
            f"checking for product_name='{self.product_name}' and version_name='{self.version_number}' in the database"
        )

        # finding out all distinct (vendor, product) pairs with the help of product_name
        query = """
            SELECT distinct vendor, product FROM cve_range
            WHERE product=(:product);
        """

        # Close the database as soon as the rows are fetched: previously every
        # branch returned before db_close was reached, and the recursive retry
        # below re-opened the database while it was still open.
        CVEDB.db_open(self)
        try:
            cursor = self.connection.cursor()
            cursor.execute(query, {"product": self.product_name})
            data = cursor.fetchall()
        finally:
            CVEDB.db_close(self)

        # checking if (vendor, product) was found in the database
        if data:
            # warning the user to select the vendor-product pairs manually if multiple pairs are found
            if len(data) != 1:
                LOGGER.warning(
                    textwrap.dedent(f"""
                            ===============================================================
                            Multiple ("vendor", "product") pairs found for "{self.product_name}"
                            Please manually select the appropriate pair.
                            ===============================================================
                        """))
            return data  # [('vendor', 'product')]

        if not self.product_name:
            # Nothing to retry with; same result as the original fall-through.
            return None

        # removing numeric characters from the product_name
        if any(char.isdigit() for char in self.product_name):
            LOGGER.debug(
                f"removing digits from product_name={self.product_name}"
            )
            self.product_name = "".join(
                filter(lambda x: not x.isdigit(), self.product_name))
            return self.find_vendor_product()

        # warn and ask for product_name
        LOGGER.warning(
            textwrap.dedent(f"""
                                =================================================================
                                No match was found for "{self.product_name}" in database.
                                Please check your file or try specifying the "product_name" also.
                                =================================================================
                            """))
        return []
Example #5
0
 async def test_nvd_incremental_update(self):
     """Fetch NVD entries changed in the last four days and check they all land in the database."""
     api = NVD_API(incremental_update=True)
     four_days_ago = datetime.now() - timedelta(days=4)
     await api.get_nvd_params(time_of_last_update=four_days_ago)
     await api.get()

     # Feed the fetched entries into a database that lives in the test output dir
     database = CVEDB(cachedir=self.outdir, nvd_type="api")
     database.all_cve_entries = api.all_cve_entries
     database.init_database()
     database.populate_db()
     database.check_cve_entries()

     assert database.cve_count == api.total_results
Example #6
0
 def setup_class(cls):
     """Create the shared CVE database, scanner and temporary directories for the tests."""
     cls.cvedb = CVEDB()
     # The database is refreshed only when UPDATE_DB=1 is set in the environment
     refresh_requested = os.getenv("UPDATE_DB") == "1"
     if not refresh_requested:
         print("Skip NVD database updates.")
     else:
         cls.cvedb.get_cvelist_if_stale()

     # Scanner shared by the tests of this class
     cls.scanner = VersionScanner(should_extract=True)

     # One temp dir for mapping tests, one for tests that require downloads
     cls.mapping_test_dir = tempfile.mkdtemp(prefix="mapping-test-")
     cls.package_test_dir = tempfile.mkdtemp(prefix="package_test-")
Example #7
0
 def setUpClass(cls):
     """Build the faked test binaries, then create the shared database, scanner and temp dir."""
     # Pick the platform-specific clean target of the makefile, if there is one
     clean_target = None
     if platform in ("linux", "linux2"):
         clean_target = "clean-linux"
     elif platform == "win32":
         clean_target = "clean-windows"
     if clean_target is not None:
         subprocess.call(["make", clean_target], cwd=BINARIES_PATH)
     subprocess.call(["make", "all"], cwd=BINARIES_PATH)

     # NVD database; refreshed only when UPDATE_DB=1 is set in the environment
     cls.cvedb = CVEDB()
     refresh_requested = os.getenv("UPDATE_DB") == "1"
     if not refresh_requested:
         print("Skip NVD database updates.")
     else:
         cls.cvedb.get_cvelist_if_stale()

     # Scanner bound to the database above
     cls.scanner = Scanner(cls.cvedb)
     # temp dir for tests that require downloads
     cls.tempdir = tempfile.mkdtemp(prefix="cve-bin-tool-")
Example #8
0
 def setup_class(cls):
     """Give the test class a CVEDB whose cache lives in a fresh temporary directory."""
     cache_dir = tempfile.mkdtemp(prefix="cvedb-")
     cls.cvedb = CVEDB(cachedir=cache_dir)
Example #9
0
def main(argv=None):
    """Scan a binary file for certain open source libraries that may have CVEs

    Command-line entry point. Settings are resolved in three layers:
    command-line arguments, then the optional config file, then the
    built-in ``defaults`` dictionary.

    Args:
        argv: full argument vector including the program name; ``sys.argv``
            is used when it is falsy.

    Returns:
        ``cve_scanner.products_with_cve``, which is used as the exit code.
    """
    argv = argv or sys.argv

    # Reset logger level to info
    LOGGER.setLevel(logging.INFO)

    # ---- Command line definition -------------------------------------------
    parser = argparse.ArgumentParser(
        prog="cve-bin-tool",
        description=textwrap.dedent("""
            The CVE Binary Tool scans for a number of common, vulnerable open source
            components (openssl, libpng, libxml2, expat and a few others) to let you know
            if a given directory or binary file includes common libraries with known
            vulnerabilities.
            """),
        epilog=textwrap.fill(
            f'Available checkers: {", ".join(VersionScanner.available_checkers())}'
        ) + "\n\nPlease disclose issues responsibly!",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    input_group = parser.add_argument_group("Input")
    input_group.add_argument("directory",
                             help="directory to scan",
                             nargs="?",
                             default=None)
    input_group.add_argument(
        "-e",
        "--exclude",
        action=StringToListAction,
        help="Comma separated Exclude directory path",
        default=None,
    )

    input_group.add_argument(
        "-i",
        "--input-file",
        action="store",
        default="",
        help="provide input filename",
    )
    input_group.add_argument("-C",
                             "--config",
                             action="store",
                             default="",
                             help="provide config file")

    output_group = parser.add_argument_group("Output")
    output_group.add_argument("-q",
                              "--quiet",
                              action="store_true",
                              help="suppress output")
    output_group.add_argument(
        "-l",
        "--log",
        help="log level (default: info)",
        dest="log_level",
        action="store",
        choices=["debug", "info", "warning", "error", "critical"],
    )
    output_group.add_argument(
        "-o",
        "--output-file",
        action="store",
        help="provide output filename (default: output to stdout)",
    )
    output_group.add_argument(
        "--html-theme",
        action="store",
        help="provide custom theme directory for HTML Report",
    )
    output_group.add_argument(
        "-f",
        "--format",
        action="store",
        choices=["csv", "json", "console", "html", "pdf"],
        help="update output format (default: console)",
    )
    output_group.add_argument(
        "-c",
        "--cvss",
        action="store",
        help=
        "minimum CVSS score (as integer in range 0 to 10) to report (default: 0)",
    )
    output_group.add_argument(
        "-S",
        "--severity",
        action="store",
        choices=["low", "medium", "high", "critical"],
        help="minimum CVE severity to report (default: low)",
    )
    parser.add_argument("-V", "--version", action="version", version=VERSION)
    parser.add_argument(
        "-u",
        "--update",
        action="store",
        choices=["now", "daily", "never", "latest"],
        help="update schedule for NVD database (default: daily)",
    )
    parser.add_argument(
        "-x",
        "--extract",
        action="store_true",
        help="autoextract compressed files",
    )
    parser.add_argument(
        "--disable-version-check",
        action="store_true",
        help="skips checking for a new version",
    )

    checker_group = parser.add_argument_group("Checkers")
    checker_group.add_argument(
        "-s",
        "--skips",
        dest="skips",
        action=StringToListAction,
        type=str,
        help="comma-separated list of checkers to disable",
    )
    checker_group.add_argument(
        "-r",
        "--runs",
        dest="runs",
        action=StringToListAction,
        type=str,
        help="comma-separated list of checkers to enable",
    )
    # Lowest-priority layer of the settings lookup (see the ChainMap below).
    # NOTE: falsy command-line values are dropped further down, so omitting
    # -x resolves "extract" to the config file value or to True from here.
    defaults = {
        "directory": "",
        "exclude": [],
        "input_file": "",
        "log_level": "info",
        "format": "console",
        "cvss": 0,
        "severity": "low",
        "update": "daily",
        "extract": True,
        "disable_version_check": False,
        "skips": "",
        "runs": "",
        "quiet": False,
        "output_file": "",
        "html_theme": "",
    }

    # ---- Argument parsing and layering -------------------------------------
    with ErrorHandler(mode=ErrorMode.NoTrace):
        raw_args = parser.parse_args(argv[1:])
        # Keep only truthy values so that unset options fall through to the
        # config file and the defaults in the ChainMap.
        args = {key: value for key, value in vars(raw_args).items() if value}

    configs = {}
    if args.get("config"):
        conf = ConfigParser(args["config"])
        configs = conf.parse_config()

    # Lookup order: command line, then config file, then defaults
    args = ChainMap(args, configs, defaults)

    # logging and error related settings
    if args["log_level"]:
        LOGGER.setLevel(args["log_level"].upper())

    # --quiet is applied after the log level, so it wins over it
    if args["quiet"]:
        LOGGER.setLevel(logging.CRITICAL)

    # Trace verbosity follows the logger level: levels 1-10 get full traces,
    # levels of 50 and above get none, everything else a truncated trace.
    if 0 < LOGGER.level <= 10:
        error_mode = ErrorMode.FullTrace
    elif LOGGER.level >= 50:
        error_mode = ErrorMode.NoTrace
    else:
        error_mode = ErrorMode.TruncTrace

    if platform.system() != "Linux":
        warning_nolinux = """
                          **********************************************
                          Warning: this utility was developed for Linux.
                          You may need to install additional utilities
                          to use it on other operating systems.
                          **********************************************
                          """
        LOGGER.warning(warning_nolinux)

    # Database update related settings
    # Connect to the database
    cvedb_orig = CVEDB(version_check=not args["disable_version_check"],
                       error_mode=error_mode)

    # if OLD_CACHE_DIR (from cvedb.py) exists, print warning
    if os.path.exists(OLD_CACHE_DIR):
        LOGGER.warning(
            f"Obsolete cache dir {OLD_CACHE_DIR} is no longer needed and can be removed."
        )

    # Clear data if -u now is set
    if args["update"] == "now":
        cvedb_orig.clear_cached_data()

    if args["update"] == "latest":
        cvedb_orig.refresh_cache_and_update_db()

    # update db if needed
    if args["update"] != "never":
        cvedb_orig.get_cvelist_if_stale()
    else:
        LOGGER.warning("Not verifying CVE DB cache")
        # With updates disabled, a cache without any NVD years is an error
        if not cvedb_orig.nvd_years():
            with ErrorHandler(mode=error_mode, logger=LOGGER):
                raise EmptyCache(cvedb_orig.cachedir)

    # CVE Database validation
    if not cvedb_orig.check_cve_entries():
        with ErrorHandler(mode=error_mode, logger=LOGGER):
            raise CVEDataMissing("No data in CVE Database")

    # Input validation
    if not args["directory"] and not args["input_file"]:
        parser.print_usage()
        with ErrorHandler(logger=LOGGER, mode=ErrorMode.NoTrace):
            raise InsufficientArgs(
                "Please specify a directory to scan or an input file required")

    if args["directory"] and not os.path.exists(args["directory"]):
        parser.print_usage()
        with ErrorHandler(logger=LOGGER, mode=ErrorMode.NoTrace):
            raise FileNotFoundError("Directory/File doesn't exist")

    # Checkers related settings
    skips = args["skips"]
    if args["runs"]:
        # --runs replaces --skips: skip every registered checker that is not
        # named in the runs list
        runs = args["runs"]
        skips = list(
            map(
                lambda checker: checker.name,
                filter(
                    lambda checker: checker.name not in runs,
                    pkg_resources.iter_entry_points("cve_bin_tool.checker"),
                ),
            ))

    # CVEScanner related settings: minimum score to report
    score = 0
    if args["severity"]:
        # Set minimum CVSS score based on severity
        cvss_score = {"low": 0, "medium": 4, "high": 7, "critical": 9}
        score = cvss_score[args["severity"]]
    # A positive --cvss value overrides the score derived from the severity
    if int(args["cvss"]) > 0:
        score = int(args["cvss"])

    with CVEScanner(score=score) as cve_scanner:
        triage_data: TriageData
        total_files: int = 0
        parsed_data: Dict[ProductInfo, TriageData] = {}

        if args["input_file"]:
            input_engine = InputEngine(args["input_file"],
                                       logger=LOGGER,
                                       error_mode=error_mode)
            parsed_data = input_engine.parse_input()
            # Without a directory, the input file alone decides which
            # products are looked up
            if not args["directory"]:
                for product_info, triage_data in parsed_data.items():
                    LOGGER.warning(f"{product_info}, {triage_data}")
                    cve_scanner.get_cves(product_info, triage_data)
        if args["directory"]:
            version_scanner = VersionScanner(
                should_extract=args["extract"],
                exclude_folders=args["exclude"],
                error_mode=error_mode,
            )
            version_scanner.remove_skiplist(skips)
            version_scanner.print_checkers()
            for scan_info in version_scanner.recursive_scan(args["directory"]):
                if scan_info:
                    product_info, path = scan_info
                    LOGGER.debug(f"{product_info}: {path}")
                    # Use the triage data from the input file when this
                    # product appears in it, otherwise an empty default
                    triage_data = parsed_data.get(product_info,
                                                  {"default": {}})
                    # Ignore paths from triage_data if we are scanning directory
                    triage_data["paths"] = {path}
                    cve_scanner.get_cves(product_info, triage_data)
            total_files = version_scanner.total_scanned_files

        LOGGER.info("")
        LOGGER.info("Overall CVE summary: ")
        if args["input_file"]:
            LOGGER.info(
                f"There are {cve_scanner.products_with_cve} products with known CVEs detected"
            )
        else:
            LOGGER.info(
                f"There are {cve_scanner.products_with_cve} files with known CVEs detected"
            )
        # An output engine is built when CVEs were found, and always for the
        # html and pdf formats
        if cve_scanner.products_with_cve > 0 or (args["format"] == "html"
                                                 or args["format"] == "pdf"):
            affected_string = ", ".join(
                map(
                    lambda product_version: "".join(str(product_version)),
                    cve_scanner.affected(),
                ))
            LOGGER.info(f"Known CVEs in {affected_string}:")

            # Creates a Object for OutputEngine
            output = OutputEngine(
                all_cve_data=cve_scanner.all_cve_data,
                scanned_dir=args["directory"],
                filename=args["output_file"],
                themes_dir=args["html_theme"],
                products_with_cve=cve_scanner.products_with_cve,
                products_without_cve=cve_scanner.products_without_cve,
                total_files=total_files,
            )

            if not args["quiet"]:
                output.output_file(args["format"])

        # Use the number of products with known cves as error code
        # as requested by folk planning to automate use of this script.
        # If no cves found, then the program exits cleanly.
        return cve_scanner.products_with_cve
Example #10
0
class SBOMManager:
    """Read an SBOM file and turn its components into product records.

    Each component with a non-empty version and a vendor known to the CVE
    database becomes a ``ProductInfo`` key in ``sbom_data``.
    """

    # SBOM formats that scan_file knows how to parse
    SBOMtype = ["spdx", "cyclonedx", "swid"]

    sbom_data: DefaultDict[ProductInfo, TriageData]

    def __init__(
        self, filename: str, sbom_type: str = "spdx", logger: Optional[Logger] = None
    ):
        """Store the SBOM location and type, and open a CVE database handle.

        Args:
            filename: path of the SBOM file to parse.
            sbom_type: one of ``SBOMtype``; anything else is recorded as
                ``"unknown"``.
            logger: logger to use; defaults to a child of the module
                ``LOGGER`` named after this class.
        """
        self.filename = filename
        self.sbom_data = defaultdict(dict)
        self.type = "unknown"
        if sbom_type in self.SBOMtype:
            self.type = sbom_type
        self.logger = logger or LOGGER.getChild(self.__class__.__name__)

        # Connect to the database
        self.cvedb = CVEDB(version_check=False)

    def scan_file(self) -> Dict[ProductInfo, TriageData]:
        """Parse the SBOM and return the triage data keyed by product.

        Parser failures (``KeyError``, ``FileNotFoundError``,
        ``ET.ParseError``) are logged at debug level and treated as an SBOM
        without modules. An unknown SBOM type also yields no modules.
        """
        # Log through the instance logger so a logger passed to __init__ is
        # honoured (the module-level LOGGER was used here before).
        self.logger.info(
            f"Processing SBOM {self.filename} of type {self.type.upper()}"
        )
        try:
            if self.type == "spdx":
                spdx = SPDXParser()
                modules = spdx.parse(self.filename)
            elif self.type == "cyclonedx":
                cyclone = CycloneParser()
                modules = cyclone.parse(self.filename)
            elif self.type == "swid":
                swid = SWIDParser()
                modules = swid.parse(self.filename)
            else:
                modules = []
        except (KeyError, FileNotFoundError, ET.ParseError) as e:
            self.logger.debug(e, exc_info=True)
            modules = []

        self.logger.debug(
            f"The number of modules identified in SBOM - {len(modules)}\n{modules}"
        )

        # Turn each (product, version) module into a (vendor, product, version) record
        for m in modules:
            product, version = m[0], m[1]
            if version == "":
                continue
            # Modules whose product has no known vendor are skipped
            vendor = self.get_vendor(product)
            if vendor is None:
                continue
            product_info = ProductInfo(vendor, product, version)
            self.sbom_data[product_info]["default"] = {
                "remarks": Remarks.NewFound,
                "comments": "",
                "severity": "",
            }
            # Same value the former set(map(strip, "".split(","))) expression
            # produced: a set holding one empty path.
            self.sbom_data[product_info]["paths"] = {""}

        self.logger.debug(f"SBOM Data {self.sbom_data}")
        return self.sbom_data

    def get_vendor(self, product: str) -> Optional[str]:
        """Return the vendor of the first database match for ``product``, or ``None``."""
        vendor_package_pair = self.cvedb.get_vendor_product_pairs(product)
        if vendor_package_pair:
            return vendor_package_pair[0]["vendor"]
        return None
Example #11
0
    def parse_list(self):
        """Match the packages named in the input file against the installed ones.

        A file whose name ends with ``requirements.txt`` is compared with the
        output of ``pip list``; any other file is treated as a distribution
        package list and compared with the output of the system package
        manager (dpkg-query, rpm or pacman).

        Returns:
            ``self.parsed_data_with_vendor`` after vendors were added, or
            ``{}`` when the distribution is not supported.
        """
        input_file = self.input_file
        self.check_file()

        if not input_file.endswith("requirements.txt"):
            distro_id = distro.id()
            if distro_id not in SUPPORTED_DISTROS:
                LOGGER.warning(
                    f"Package list support only available on {','.join(SUPPORTED_DISTROS)}!"
                )
                return {}

            LOGGER.info(f"Scanning {distro_id.capitalize()} package list.")

            # Stays empty if the distro is supported but has no query branch
            # below (previously an unbound local in that case).
            installed_packages = []

            if distro_id in DEB_DISTROS:
                query_result = run(
                    [
                        "dpkg-query",
                        "--show",
                        '--showformat={"name": "${binary:Package}", "version": "${Version}"}, ',
                    ],
                    stdout=PIPE,
                )
                # Every record ends with ", "; drop the final separator and
                # wrap the records in brackets to obtain a JSON list.
                installed_packages = json.loads(
                    f"[{query_result.stdout.decode('utf-8')[0:-2]}]")
            elif distro_id in RPM_DISTROS:
                query_result = run(
                    [
                        "rpm",
                        "--query",
                        "--all",
                        "--queryformat",
                        '{"name": "%{NAME}", "version": "%{VERSION}"\\}, ',
                    ],
                    stdout=PIPE,
                )
                installed_packages = json.loads(
                    f"[{query_result.stdout.decode('utf-8')[0:-2]}]")
            elif distro_id in PACMAN_DISTROS:
                query_result = run(
                    ["pacman", "--query", "--explicit"],
                    stdout=PIPE,
                )
                # Each output line is "<name> <version>"
                dict_keys = ["name", "version"]
                installed_packages = [
                    dict(zip(dict_keys, output_line.split(" ")))
                    for output_line in query_result.stdout.decode(
                        "utf-8").splitlines()
                ]

            # One package name per line; a set gives constant-time membership tests
            with open(input_file) as req:
                system_packages = {line.strip() for line in req}

            self.package_names_without_vendor.extend(
                installed_package for installed_package in installed_packages
                if installed_package["name"] in system_packages)

        else:
            LOGGER.info("Scanning python package list.")

            installed_packages_json = run(
                ["pip", "list", "--format", "json"],
                stdout=PIPE,
            )
            installed_packages = json.loads(
                installed_packages_json.stdout.decode("utf-8"))

            # Keep only the project name of every requirement line: cut at the
            # first version operator, extras bracket, marker separator or
            # whitespace. Names are lower-cased because the installed names
            # are lower-cased before the comparison as well.
            with open(input_file) as txtfile:
                txt_package_names = {
                    re.split(r"[><=!~\[;\s]", line.strip(), maxsplit=1)[0].lower()
                    for line in txtfile
                }

            self.package_names_without_vendor.extend(
                installed_package for installed_package in installed_packages
                if installed_package["name"].lower() in txt_package_names)

        cve_db = CVEDB()
        vendor_package_pairs = cve_db.get_vendor_product_pairs(
            self.package_names_without_vendor)

        self.add_vendor(vendor_package_pairs)
        self.parse_data()
        return self.parsed_data_with_vendor
Example #12
0
class VersionScanner:
    """ "Scans files for CVEs using CVE checkers"""

    CHECKER_ENTRYPOINT = "cve_bin_tool.checker"

    def __init__(
        self,
        should_extract=False,
        exclude_folders=None,
        checkers=None,
        logger=None,
        error_mode=ErrorMode.TruncTrace,
        score=0,
    ):
        """Set up a scanner: logger, checkers, directory walker and CVE database handle.

        Args:
            should_extract: stored on the instance as ``should_extract``.
            exclude_folders: folder patterns handed to ``DirWalk``; ``None``
                (the default) means no user-supplied exclusions.
            checkers: mapping of checkers to use; when falsy the checkers are
                loaded through ``load_checkers()``.
            logger: logger to use; defaults to a child of the module ``LOGGER``
                named after this class.
            error_mode: stored on the instance as ``error_mode``.
            score: stored on the instance as ``score``.
        """
        # A None sentinel replaces the former mutable ``[]`` default so that no
        # list object is shared between calls; copying also accepts any iterable.
        exclude_folders = [] if exclude_folders is None else list(exclude_folders)

        self.logger = logger or LOGGER.getChild(self.__class__.__name__)
        # Update egg if installed in development mode
        if IS_DEVELOP():
            self.logger.debug("Updating egg_info")
            update_egg()

        # Load checkers if not given
        self.checkers = checkers or self.load_checkers()
        self.score = score
        self.total_scanned_files = 0
        self.exclude_folders = exclude_folders + [".git"]

        # NOTE(review): the walker pattern is built from the caller's list, not
        # from self.exclude_folders, so ".git" is not part of it - confirm
        # whether that is intended before changing it.
        self.walker = DirWalk(
            folder_exclude_pattern=";".join(
                exclude if exclude.endswith("*") else exclude + "*"
                for exclude in exclude_folders
            )
        ).walk
        self.should_extract = should_extract
        self.file_stack = []
        self.error_mode = error_mode
        self.cve_db = CVEDB()

    @classmethod
    def load_checkers(cls):
        """Load every checker registered under ``CHECKER_ENTRYPOINT``.

        Returns a dict that maps each entry point name to its loaded object.
        """
        registered = importlib_metadata.entry_points()[cls.CHECKER_ENTRYPOINT]
        return {entry.name: entry.load() for entry in registered}

    @classmethod
    def available_checkers(cls):
        """Return the names of all entry points registered under ``CHECKER_ENTRYPOINT``."""
        return [
            entry.name
            for entry in importlib_metadata.entry_points()[cls.CHECKER_ENTRYPOINT]
        ]

    def remove_skiplist(self, skips):
        """Remove the checkers named in ``skips`` from ``self.checkers``.

        ``skips`` is iterated item by item; names that are not loaded
        checkers are reported as errors and otherwise ignored.
        """
        for name in skips:
            if name not in self.checkers:
                self.logger.error(
                    f"Checker {name} is not a valid checker name")
                continue
            del self.checkers[name]
            self.logger.debug(f"Skipping checker: {name}")

    def print_checkers(self):
        """Log the names of the loaded checkers at info level."""
        checker_names = ", ".join(self.checkers.keys())
        self.logger.info(f"Checkers: {checker_names}")

    def number_of_checkers(self):
        """Return how many checkers are currently loaded."""
        checker_count = len(self.checkers)
        return checker_count

    def is_executable(self, filename):
        """Decide whether ``filename`` is a kind of file the scanner handles.

        Returns a ``(accepted, output)`` tuple. ``output`` is the decoded
        output of the system ``file`` command when that command is used, and
        ``None`` when the python fallback is used or the file is rejected.
        """
        if not inpath("file"):
            # no system "file" command: use python implementation of file
            if not is_binary(filename):
                return False, None
            return True, None

        # use system file if available (for performance reasons)
        raw_output = subprocess.check_output(["file", filename])
        description = raw_output.decode(sys.stdout.encoding)

        if "cannot open" in description:
            self.logger.warning(
                f"Unopenable file {filename} cannot be scanned")
            return False, None

        # The file is accepted when the description mentions any of these
        accepted_markers = (
            "LSB ",
            "LSB shared",
            "LSB executable",
            "PE32 executable",
            "PE32+ executable",
            "Mach-O",
            "PKG-INFO: ",
            "METADATA: ",
            "pom.xml",
        )
        if not any(marker in description for marker in accepted_markers):
            return False, None

        return True, description

    def parse_strings(self, filename):
        """Return the strings found in ``filename``.

        The system ``strings`` command is preferred when it is on the path;
        otherwise the python ``Strings`` implementation is used.
        """
        if not inpath("strings"):
            # Otherwise, use python implementation
            return Strings(filename).parse()

        # use "strings" on system if available (for performance)
        raw_output = subprocess.check_output(["strings", filename])
        return raw_output.decode("utf-8")

    def scan_file(self, filename):
        """Generator that scans one file and yields whatever the checkers find.

        Symlinks, non-files and files rejected by ``is_executable`` produce
        nothing. Every call counts towards ``total_scanned_files``, including
        calls for files that end up being skipped.
        """
        self.logger.debug(f"Scanning file: {filename}")
        self.total_scanned_files += 1

        # Do not try to scan symlinks
        if os.path.islink(filename):
            return

        # Ensure filename is a file
        if not os.path.isfile(filename):
            self.logger.debug(f"Invalid file {filename} cannot be scanned")
            return

        # Skip anything that is not a recognised file type
        accepted, file_output = self.is_executable(filename)
        if not accepted:
            return

        # parse binary file's strings
        lines = self.parse_strings(filename)

        if file_output:
            # Check for Java package
            if "pom.xml" in file_output:
                java_lines = "\n".join(lines.splitlines())
                yield from self.run_java_checker(filename, java_lines)

            #  If python package then strip the lines to avoid detecting other product strings
            if any(marker in file_output
                   for marker in ("PKG-INFO: ", "METADATA: ")):
                py_lines = "\n".join(lines.splitlines()[:3])
                yield from self.run_python_package_checkers(filename, py_lines)

        yield from self.run_checkers(filename, lines)

    def find_java_vendor(self, product, version):
        """Look up the vendor of a Java product.

        Returns ``(ProductInfo, file_path)`` for the first database match, or
        ``(None, None)`` when the product is unknown.
        """
        pairs = self.cve_db.get_vendor_product_pairs(product)

        # If no match, try alternative product name.
        # Apache product names are stored as A_B in NVD database but often called A-B
        # Some packages have -parent appended to product which is not in NVD database
        if pairs == [] and "-" in product:
            self.logger.debug(f"Try alternative product {product}")
            product = product.replace("-parent", "").replace("-", "_")
            pairs = self.cve_db.get_vendor_product_pairs(product)

        if pairs == []:
            return None, None

        vendor = pairs[0]["vendor"]
        file_path = "".join(self.file_stack)
        self.logger.debug(f"{file_path} {product} {version} by {vendor}")
        return ProductInfo(vendor, product, version), file_path

    def run_java_checker(self, filename, lines):
        """Process maven pom.xml file and extract product and dependency details

        Args:
            filename: path of the pom.xml file; it is parsed as XML.
            lines: not used by this method (the file is read from ``filename``).

        Yields:
            ``(product_info, file_path)`` for the described product and for
            every dependency with a version starting with a digit, whenever
            ``find_java_vendor`` finds a vendor.

        Missing or empty ``artifactId``/``version`` elements cause the
        affected product or dependency to be skipped instead of raising.
        """
        tree = ET.parse(filename)
        # Find root element
        root = tree.getroot()
        # Extract schema (the "{namespace}" prefix of the root tag, if any)
        schema = root.tag[:root.tag.find("}") + 1]
        file_path = "".join(self.file_stack)

        def child_text(element, tag):
            """Text of the direct child ``tag``, or None if anything is missing."""
            if element is None:
                return None
            node = element.find(schema + tag)
            return None if node is None else node.text

        parent = root.find(schema + "parent")
        version = None
        product = None
        # Parent tag is optional.
        if parent is None:
            product = child_text(root, "artifactId")
            version = child_text(root, "version")
        if version is None:
            version = child_text(parent, "version")
        # Check valid version identifier (i.e. starts with a digit)
        if version is not None and not version[:1].isdigit():
            self.logger.debug(f"Invalid {version} detected in {filename}")
            version = None
        if product is None:
            product = child_text(parent, "artifactId")
        if product is not None and version is not None:
            # Separate names keep file_path intact for the log messages below
            # even when no vendor is found.
            product_info, product_path = self.find_java_vendor(product, version)
            if product_path is not None:
                yield product_info, product_path

        # Scan for any dependencies referenced in file
        dependencies = root.find(schema + "dependencies")
        if dependencies is not None:
            for dependency in dependencies.findall(schema + "dependency"):
                dep_product = child_text(dependency, "artifactId")
                dep_version_node = dependency.find(schema + "version")
                if (dependency.find(schema + "artifactId") is None
                        or dep_version_node is None):
                    continue
                dep_version = dep_version_node.text
                self.logger.debug(f"{file_path} {dep_product} {dep_version}")
                # Only a version identifier starting with a digit is valid
                if dep_product and dep_version and dep_version[0].isdigit():
                    product_info, product_path = self.find_java_vendor(
                        dep_product, dep_version)
                    if product_path is not None:
                        yield product_info, product_path

        self.logger.debug(f"Done scanning file: {filename}")

    def run_python_package_checkers(self, filename, lines):
        """
        This generator runs only for python packages.
        There are no actual checkers.
        The ProductInfo is computed without the help of any checkers from PKG-INFO or METADATA.

        ``lines`` is the text of the metadata file; the product and version
        are read from its ``Name:`` and ``Version:`` lines. A
        ``(ProductInfo, file_path)`` tuple is yielded when the CVE database
        knows a vendor for the product. ``filename`` is used in log messages.
        """
        name_match = search(r"^Name: (.+)$", lines, MULTILINE)
        version_match = search(r"^Version: (.+)$", lines, MULTILINE)

        if name_match is None or version_match is None:
            # There are packages with a METADATA file in them containing
            # different data from what the tool expects
            self.logger.debug(f"{filename} is an invalid METADATA/PKG-INFO")
        else:
            # strip() drops the "\r" that "." captures on CRLF line endings.
            product = name_match.group(1).strip()
            version = version_match.group(1).strip()

            # Use the scanner's own database handle rather than building a
            # new CVEDB for every metadata file.
            vendor_package_pair = self.cve_db.get_vendor_product_pairs(product)

            if vendor_package_pair:
                vendor = vendor_package_pair[0]["vendor"]
                file_path = "".join(self.file_stack)

                self.logger.info(f"{file_path} is {product} {version}")

                yield ProductInfo(vendor, product, version), file_path

        self.logger.debug(f"Done scanning file: {filename}")

    def run_checkers(self, filename, lines):
        """Run every checker in ``self.checkers`` and yield the products found.

        Each checker class is instantiated and its ``get_version(lines,
        filename)`` is called. For every result that has an
        ``"is_or_contains"`` entry and a version other than ``"UNKNOWN"``,
        one ``(ProductInfo, file_path)`` tuple is yielded per
        ``(vendor, product)`` pair in the checker's ``VENDOR_PRODUCT``.
        A result without a ``"version"`` entry is logged as an error and
        skipped.
        """
        for checker_name, checker_class in self.checkers.items():
            checker = checker_class()
            result = checker.get_version(lines, filename)
            # get_version may return a single hit or a collection of hits;
            # wrap a single hit so both shapes go through the same loop.
            results = [result] if "is_or_contains" in result else result

            for hit in results:
                if "is_or_contains" not in hit:
                    continue
                if "version" not in hit:
                    self.logger.error(f"No version info for {checker_name}")
                    continue

                version = hit["version"]
                file_path = "".join(self.file_stack)
                if version == "UNKNOWN":
                    self.logger.debug(
                        f"{checker_name} was detected with version UNKNOWN in file {file_path}"
                    )
                    continue

                self.logger.debug(
                    f'{file_path} {hit["is_or_contains"]} {checker_name} {version}'
                )
                for vendor, product in checker.VENDOR_PRODUCT:
                    yield ProductInfo(vendor, product, version), file_path

        self.logger.debug(f"Done scanning file: {filename}")

    @staticmethod
    def clean_file_path(filepath):
        """Returns a cleaner filepath by removing temp path from filepath"""

        # we'll recieve a filepath similar to
        # /temp/anything/extractable_filename.extracted/folders/inside/file
        # We'll return /folders/inside/file to be scanned

        # start_point is the point from we want to start trimming
        # len("extracted") = 9
        start_point = filepath.find("extracted") + 9
        return filepath[start_point:]

    def scan_and_or_extract_file(self, ectx, filepath):
        """Runs extraction if possible and desired otherwise scans.

        Generator: first yields everything ``self.scan_file(filepath)``
        yields. If ``ectx.can_extract(filepath)`` is true and
        ``self.should_extract`` is set, the file is extracted and every
        file found in the extracted tree is processed recursively; if
        extraction is not enabled a warning is logged instead.

        While an extracted file is being processed, a " contains <path>"
        entry for it is on ``self.file_stack``.
        """
        # Scan the file
        yield from self.scan_file(filepath)
        # Attempt to extract the file and scan the contents
        if not ectx.can_extract(filepath):
            return
        if not self.should_extract:
            self.logger.warning(
                f"{filepath} is an archive. Pass -x option to auto-extract"
            )
            return
        for filename in self.walker([ectx.extract(filepath)]):
            clean_path = self.clean_file_path(filename)
            self.file_stack.append(f" contains {clean_path}")
            try:
                yield from self.scan_and_or_extract_file(ectx, filename)
            finally:
                # Pop even when the nested scan raises or the consumer stops
                # iterating early, so file_stack never keeps a stale entry.
                self.file_stack.pop()

    def recursive_scan(self, scan_path):
        """Scan ``scan_path`` and yield whatever the per-file scan yields.

        If ``scan_path`` is a directory, every file produced by
        ``self.walker`` for it is passed to ``scan_and_or_extract_file``;
        if it is a regular file, only that file is. Any other path yields
        nothing. The path of the file being processed is on
        ``self.file_stack`` for the duration of its scan.
        """
        with Extractor(logger=self.logger, error_mode=self.error_mode) as ectx:
            if os.path.isdir(scan_path):
                for filepath in self.walker([scan_path]):
                    self.file_stack.append(filepath)
                    try:
                        yield from self.scan_and_or_extract_file(ectx, filepath)
                    finally:
                        # Keep file_stack balanced if the scan raises or the
                        # consumer stops iterating early.
                        self.file_stack.pop()
            elif os.path.isfile(scan_path):
                self.file_stack.append(scan_path)
                try:
                    yield from self.scan_and_or_extract_file(ectx, scan_path)
                finally:
                    self.file_stack.pop()