Example #1
    def init_wato(self):
        if not self._missing_but_required_wato_files():
            logger.info(
                "WATO is already initialized -> Skipping initialization")
            return

        logger.debug("Initializing WATO...")

        web = CMKWebSession(self)
        web.login()

        # Call WATO once for creating the default WATO configuration
        logger.debug(
            "Requesting wato.py (which creates the WATO factory settings)...")
        response = web.get("wato.py?mode=sites").text
        #logger.debug("Debug: %r" % response)
        assert "site=%s" % web.site.id in response

        logger.debug("Waiting for WATO files to be created...")
        wait_time = 20.0
        while self._missing_but_required_wato_files() and wait_time >= 0:
            time.sleep(0.5)
            wait_time -= 0.5

        missing_files = self._missing_but_required_wato_files()
        assert not missing_files, \
            "Failed to initialize WATO data structures " \
            "(Still missing: %s)" % missing_files

        web.enforce_non_localized_gui()

        self._add_wato_test_config(web)
Example #2
class Worker(threading.Thread):
    def __init__(self, num, crawler):
        super(Worker, self).__init__()
        self.name = "worker-%d" % num
        self.crawler = crawler
        self.daemon = True
        self.terminate = False
        self.idle = True

        self.client = CMKWebSession(self.crawler.site)
        self.client.login()
        self.client.enforce_non_localized_gui()

    def run(self):
        while not self.terminate:
            try:
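                # Drain the work queue; get(block=False) raises queue.Empty once
                # it runs dry, dropping us into the outer handler below where the
                # worker goes idle until new URLs arrive.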
                while not self.terminate:
                    url = self.crawler.todo.get(block=False)
                    self.idle = False
                    try:
                        self.visit_url(url)
                    except Exception as e:
                        self.error(
                            url, "Failed to visit: %s\n%s" %
                            (e, traceback.format_exc()))
                    self.crawler.todo.task_done()
            except queue.Empty:
                self.idle = True
                time.sleep(0.5)

    def stop(self):
        self.terminate = True

    def visit_url(self, url):
        if url.url in self.crawler.visited:
            logger.info("Already visited: %s", url.url)
            return
        self.crawler.visited.append(url.url)

        #print("%s - Visiting #%d (todo %d): %s" %
        #    (self.name, len(self.crawler.visited), self.crawler.todo.qsize(), url.url))

        started = time.time()
        try:
            #print "FETCH", url.url_without_host()
            response = self.client.get(url.url_without_host())
        except AssertionError as e:
            if "This view can only be used in mobile mode" in "%s" % e:
                logger.info("Skipping mobile mode view checking")
                return
            raise
        duration = time.time() - started

        self.update_stats(url, duration, len(response.content))

        content_type = response.headers.get('content-type')
        #print self.name, content_type, len(response.text)

        if content_type.startswith("text/html"):
            self.check_response(url, response)
        elif content_type.startswith("text/plain"):
            pass  # no specific test
        elif content_type.startswith("text/csv"):
            pass  # no specific test
        elif content_type in ["image/png", "image/gif"]:
            pass  # no specific test
        elif content_type in ["application/pdf"]:
            pass  # no specific test
        elif content_type in [
                "application/x-rpm",
                "application/x-deb",
                "application/x-debian-package",
                "application/x-gzip",
                "application/x-msdos-program",
                "application/x-msi",
                "application/x-tgz",
                "application/x-redhat-package-manager",
                "application/x-pkg",
                "application/x-tar",
                "application/json",
                "text/x-chdr",
                "text/x-c++src",
                "text/x-sh",
        ]:
            pass  # no specific test
        else:
            self.error(url, "Unknown content type: %s" % (content_type))
            return

    def update_stats(self, url, duration, content_size):
        stats = self.crawler.stats.setdefault(
            url.neutral_url(), {
                "first_duration": duration,
                "first_content_size": content_size,
            })

        # Keep a recency-weighted running mean: each new sample counts as much
        # as all previous samples combined (this is not an arithmetic mean).
        avg_duration = (duration + stats.get("avg_duration", duration)) / 2.0
        avg_content_size = (content_size +
                            stats.get("avg_content_size", content_size)) / 2.0

        stats.update({
            "orig_url": url.orig_url,
            "referer_url": url.referer_url,
            "num_visited": stats.get("num_visited", 0) + 1,
            "last_duration": duration,
            "last_content_size": content_size,
            "avg_duration": avg_duration,
            "avg_content_size": avg_content_size,
        })

    def error(self, url, s):
        s = "[%s - found on %s] %s" % (url.url, url.referer_url, s)
        self.crawler.error(s)

    def check_response(self, url, response):
        soup = BeautifulSoup(response.text, "lxml")

        # The referenced resources (images, stylesheets, javascript files) are
        # checked by the generic web client handler. This method only needs to
        # handle the crawling itself.
        self.check_content(url, response, soup)
        self.check_links(url, soup)
        self.check_frames(url, soup)
        self.check_iframes(url, soup)

    def check_content(self, url, response, soup):
        ignore_texts = [
            "This view can only be used in mobile mode.",
            # Some single context views are accessed without their context information, which
            # results in a helpful error message since 1.7. These are not failures that this test
            # should report.
            "Missing context information",
            # Same for availability views that cannot be accessed any more
            # from views with missing context
            "miss some required context information",
            # Same for dashlets that are related to a specific context
            "There are no metrics meeting your context filters",
            # Some of these errors are only shown when trying to submit, and
            # some appear because the GUI crawl sites have no license
            # information configured -> ignore them
            "license usage report",
        ]

        for element in soup.select("div.error"):
            inner_html = "%s" % element

            skip = False
            for ignore_text in ignore_texts:
                if ignore_text in inner_html:
                    skip = True
                    break

            if not skip:
                self.error(url, "Found error: %s" % (element))

    def check_frames(self, url, soup):
        self.check_referenced(url, soup, "frame", "src")

    def check_iframes(self, url, soup):
        self.check_referenced(url, soup, "iframe", "src")

    def check_links(self, url, soup):
        self.check_referenced(url, soup, "a", "href")

    def check_referenced(self, referer_url, soup, tag, attr):
        elements = soup.find_all(tag)

        for element in elements:
            orig_url = element.get(attr)
            if orig_url is None:
                continue  # Skip elements that don't have the attribute in question

            url = self.normalize_url(self.crawler.site.internal_url, orig_url)

            if url is None:
                continue

            try:
                self.verify_is_valid_url(url)
            except InvalidUrl:
                #print self.name, "skip invalid", url, e
                self.crawler.skipped.add(url)
                continue

            # Ensure that this url has not been crawled yet
            crawl_it = False
            with self.crawler.handled_lock:
                if url not in self.crawler.handled:
                    crawl_it = True
                    self.crawler.handled.add(url)

            if crawl_it:
                #open("/tmp/todo", "a").write("%s (%s)\n" % (url, referer_url.url))
                self.crawler.todo.put(
                    Url(url, orig_url=orig_url, referer_url=referer_url.url))

    def verify_is_valid_url(self, url):
        parsed = urlsplit(url)

        if parsed.scheme != "http":
            raise InvalidUrl("invalid scheme: %r" % (parsed, ))

        # skip external urls
        if url.startswith("http://") and not url.startswith(
                self.crawler.site.internal_url):
            raise InvalidUrl("Skipping external URL: %s" % url)

        # skip non check_mk urls
        if not parsed.path.startswith("/%s/check_mk" % self.crawler.site.id) \
           or "../pnp4nagios/" in parsed.path \
           or "../nagvis/" in parsed.path \
           or "check_mk/plugin-api" in parsed.path \
           or "../nagios/" in parsed.path:
            raise InvalidUrl("Skipping non Check_MK URL: %s %s" %
                             (url, parsed))

        # skip current url with link to index
        if "index.py?start_url=" in url:
            raise InvalidUrl("Skipping link to index with current URL: %s" %
                             url)

        if "logout.py" in url:
            raise InvalidUrl("Skipping logout URL: %s" % url)

        if "_transid=" in url:
            raise InvalidUrl("Skipping action URL: %s" % url)

        if "selection=" in url:
            raise InvalidUrl("Skipping selection URL: %s" % url)

        # TODO: Remove this exclude when ModeCheckManPage works without an
        # automation call. Currently we have to use such a call to enrich the
        # man page with some additional info from config.check_info, see
        # AutomationGetCheckManPage.
        if "mode=check_manpage" in url and "wato.py" in url:
            raise InvalidUrl("Skipping man page URL: %s" % url)

        # Don't follow filled in filter form views
        if "view.py" in url and "filled_in=filter" in url:
            raise InvalidUrl("Skipping filled in filter URL: %s" % url)

        # Don't follow the view editor
        if "edit_view.py" in url:
            raise InvalidUrl("Skipping view editor URL: %s" % url)

        # Skip agent download files
        if parsed.path.startswith("/%s/check_mk/agents/" %
                                  self.crawler.site.id):
            raise InvalidUrl("Skipping agent download file: %s" % url)

    def normalize_url(self, base_url, url):
        url = urljoin(base_url, url.rstrip("#"))
        parsed = list(urlsplit(url))
        parsed[3] = urlencode(
            sorted(parse_qsl(parsed[3], keep_blank_values=True)))
        return urlunsplit(parsed)
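
# A minimal standalone sketch (an assumption, not part of the original module)
# of the same normalization normalize_url() performs above: resolve the URL
# against the site's base URL and sort the query parameters so that
# semantically equal URLs compare equal. The base URL below is made up.
from urllib.parse import urljoin, urlsplit, urlunsplit, urlencode, parse_qsl


def normalize_url_sketch(base_url, url):
    url = urljoin(base_url, url.rstrip("#"))
    parsed = list(urlsplit(url))
    # Index 3 of the urlsplit result is the query string
    parsed[3] = urlencode(sorted(parse_qsl(parsed[3], keep_blank_values=True)))
    return urlunsplit(parsed)


# Both parameter orders normalize to the same URL, so the crawler's
# visited/handled bookkeeping treats them as one page.
assert normalize_url_sketch("http://localhost/mysite/check_mk/", "view.py?b=2&a=1") == \
       normalize_url_sketch("http://localhost/mysite/check_mk/", "view.py?a=1&b=2")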
Example #3
def web(site):
    web = CMKWebSession(site)
    web.login()
    web.enforce_non_localized_gui()
    return web
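
# Hypothetical usage of the helper above in a test. The site fixture, the
# "wato.py?mode=sites" page and the response attributes follow Example #1;
# this test itself is an assumption, not part of the original suite.
def test_sites_page_mentions_own_site(site):
    client = web(site)
    response = client.get("wato.py?mode=sites")
    assert "site=%s" % site.id in response.text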