    def scan(self, response):
        """Scan a response, returning any matches."""
        # logging.info('Stats: {0}'.format(self.crawler.stats.get_stats()))

        content_type = response.headers.get('content-type')
        if content_type:
            mime_type = parse_content_type(content_type)
        else:
            mime_type, encoding = mimetypes.guess_type(response.url)
            if not mime_type:
                try:
                    mime_type = self.magic.from_buffer(response.body)
                except MagicException as me:
                    logging.error(me)

        # data, mime_type = self.check_encoding(mime_type, response)
        data = response.body

        # Save the URL item to the database
        if (Processor.mimetype_to_processor_type(mime_type) == 'ocr'
            and not self.scanner.scan_object.do_ocr):
            # Ignore this URL
            return

        url_object = Url(url=response.request.url, mime_type=mime_type,
                         scan=self.scanner.scan_object)
        url_object.save()

        self.scanner.scan(data, url_object)
    def create_ressources(self, filename):
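        """Copy the test HTML file into tmp/ and build a conversion item."""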
        shutil.copy2(self.test_dir + 'html/' + filename,
                     self.test_dir + 'tmp/')
        url = Url(scan=Scan(), url=self.test_dir + 'tmp/' + filename)
        item = ConversionQueueItem(url=url,
                                   file=self.test_dir + 'tmp/' + filename,
                                   type=html.HTMLProcessor,
                                   status=ConversionQueueItem.NEW)

        return item
    def broken_url_save(self, status_code, status_message, url):
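        """Save a broken URL with its status code and message."""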
        logging.info("Handle Error: %s %s" % (status_message, url))
        status_message = regex.sub(r"\[.+\] ", "", status_message)
        status_message = capitalize_first(status_message)
        # Add broken URL
        broken_url = Url(url=url,
                         scan=self.scanner.scan_object,
                         status_code=status_code,
                         status_message=status_message)
        secure_save(broken_url)
        return broken_url
    def create_ressources(self, filename):
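        """Copy the test PDF into tmp/ and convert it with PDFProcessor."""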
        shutil.copy2(self.test_dir + 'pdf/' + filename, self.test_dir + 'tmp/')
        url = Url(scan=Scan(), url=self.test_dir + 'tmp/' + filename)
        item = ConversionQueueItem(url=url,
                                   file=self.test_dir + 'tmp/' + filename,
                                   type=pdf.PDFProcessor,
                                   status=ConversionQueueItem.NEW)

        with tempfile.TemporaryDirectory(dir=self.test_dir + 'tmp/') as temp_dir:
            result = pdf.PDFProcessor.convert(self, item, temp_dir)

        return result
    def external_link_check(self, external_urls):
        """Perform external link checking."""
        logging.info("Link checking %d external URLs..." % len(external_urls))

        for url in external_urls:
            url_parse = urlparse(url)
            if url_parse.scheme not in ("http", "https"):
                # We don't want to allow external URL checking of other
                # schemes (file:// for example)
                continue

            logging.info("Checking external URL %s" % url)

            result = linkchecker.check_url(url)
            if result is not None:
                broken_url = Url(url=url,
                                 scan=self.scan_object.webscan,
                                 status_code=result["status_code"],
                                 status_message=result["status_message"])
                broken_url.save()
                self.scanner_spider.associate_url_referrers(broken_url)
    def external_link_check(self, external_urls):
        """Perform external link checking."""
        from os2webscanner.models.url_model import Url
        logging.info("Link checking %d external URLs..." % len(external_urls))

        for url in external_urls:
            url_parse = urlparse(url)
            if url_parse.scheme not in ("http", "https"):
                # We don't want to allow external URL checking of other
                # schemes (file:// for example)
                continue

            logging.info("Checking external URL %s" % url)

            result = linkchecker.check_url(url)
            if result is not None:
                broken_url = Url(url=url, scan=self.scanner.scan_object.webscan,
                                 status_code=result["status_code"],
                                 status_message=result["status_message"])
                broken_url.save()
                self.scanner_crawler.spider.associate_url_referrers(broken_url)
    def create_ressources(self, filename):
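        """Copy the test HTML file into tmp/ and build a conversion item.

        Returns None if the source file does not exist.
        """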
        try:
            shutil.copy2(self.test_dir + 'html/' + filename, self.test_dir + 'tmp/')
        except FileNotFoundError:
            print('File not found error: {}'.format(self.test_dir + 'html/' + filename))
            return None

        url = Url(scan=Scan(), url=self.test_dir + 'tmp/' + filename)
        item = ConversionQueueItem(url=url,
                                   file=self.test_dir + 'tmp/' + filename,
                                   type=html.HTMLProcessor,
                                   status=ConversionQueueItem.NEW)

        return item
    def create_ressources(self, filename):
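        """Copy the test file into tmp/ and run a LibreOffice conversion."""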
        shutil.copy2(self.test_dir + 'libreoffice/' + filename,
                     self.test_dir + 'tmp/')
        url = Url(scan=Scan(), url=self.test_dir + 'tmp/' + filename)
        item = ConversionQueueItem(url=url,
                                   file=self.test_dir + 'tmp/' + filename,
                                   type=libreoffice.LibreOfficeProcessor,
                                   status=ConversionQueueItem.NEW)

        with tempfile.TemporaryDirectory(dir=self.test_dir +
                                         'tmp/') as temp_dir:
            libreoffice_processor = libreoffice.LibreOfficeProcessor()
            libreoffice_processor.set_home_dir(self.test_dir +
                                               'libreoffice/home_dir/')
            result = libreoffice_processor.convert(item, temp_dir)

        return result
    def create_ressources(self, filename):
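        """Copy the test archive into tmp/ and run a zip conversion.

        Returns None if the source file does not exist.
        """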
        try:
            shutil.copy2(self.test_dir + 'zip/' + filename, self.test_dir + 'tmp/')
        except FileNotFoundError:
            print('File not found error: {}'.format(self.test_dir + 'zip/' + filename))
            return None

        url = Url(scan=Scan(), url=self.test_dir + 'tmp/' + filename)
        item = ConversionQueueItem(pk=0,
                                   url=url,
                                   file=self.test_dir + 'tmp/' + filename,
                                   type=zip.ZipProcessor,
                                   status=ConversionQueueItem.NEW)

        with tempfile.TemporaryDirectory(dir=self.test_dir + 'tmp/') as temp_dir:
            zip_processor = zip.ZipProcessor()
            result = zip_processor.convert(item, temp_dir)

        return result
    def create_ressources(self, filename):
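        """Copy the test file into tmp/ and run a LibreOffice conversion.

        Returns None if the source file does not exist.
        """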
        try:
            shutil.copy2(self.test_dir + 'libreoffice/' + filename, self.test_dir + 'tmp/')
        except FileNotFoundError:
            print('File not found error: {}'.format(self.test_dir + 'libreoffice/' + filename))
            return None

        url = Url(scan=Scan(), url=self.test_dir + 'tmp/' + filename)
        item = ConversionQueueItem(url=url,
                                   file=self.test_dir + 'tmp/' + filename,
                                   type=libreoffice.LibreOfficeProcessor,
                                   status=ConversionQueueItem.NEW)

        with tempfile.TemporaryDirectory(dir=self.test_dir + 'tmp/') as temp_dir:
            libreoffice_processor = libreoffice.LibreOfficeProcessor()
            libreoffice_processor.set_home_dir(self.test_dir + 'libreoffice/home_dir/')
            result = libreoffice_processor.convert(item, temp_dir)

        return result
    def handle_error(self, failure):
        """Handle an error due to a non-success status code or other reason.

        If link checking is enabled, saves the broken URL and referrers.
        """
        # If the scanner is a filescan
        url = getattr(failure.value, "filename", "Not Filled")
        status_message = "Not filled"
        status_code = -1
        if hasattr(self.scanner.scan_object, 'filescan'):
            # If the file is a directory, loop through the files within it
            if isinstance(failure.value, IOError) \
                    and failure.value.errno == errno.EISDIR:
                logging.debug('File that is failing: {0}'.format(failure.value.filename))

                return self.append_file_request('file://' + failure.value.filename)
            # If the file has not changed since the last scan, an IgnoreRequest is returned.
            elif isinstance(failure.value, IgnoreRequest):
                return
            elif isinstance(failure.value, IOError):
                status_message = str(failure.value.errno)
        # Else if the scanner is a webscan
        elif hasattr(self.scanner.scan_object, 'webscan'):
            # If link checking is disabled, or the failure is an IgnoreRequest
            # that is not an HttpError (i.e. a last-modified check), skip it.
            if (not self.scanner.scan_object.webscan.do_link_check or
                    (isinstance(failure.value, IgnoreRequest) and not isinstance(
                        failure.value, HttpError))):
                return
            if hasattr(failure.value, "response"):
                response = failure.value.response
                url = response.request.url
                status_code = response.status
                status_message = response_status_message(status_code)

                if "redirect_urls" in response.request.meta:
                    # Set URL to the original URL, not the URL after redirection
                    url = response.request.meta["redirect_urls"][0]

                referer_header = response.request.headers.get("referer", None)
            else:
                url = failure.request.url
                status_code = -1
                status_message = "%s" % failure.value
                referer_header = None

        logging.info("Handle Error: %s %s" % (status_message, url))

        status_message = regex.sub(r"\[.+\] ", "", status_message)
        status_message = capitalize_first(status_message)

        # Add broken URL
        broken_url = Url(url=url, scan=self.scanner.scan_object,
                         status_code=status_code,
                         status_message=status_message)

        secure_save(broken_url)

        if hasattr(self.scanner.scan_object, 'webscan'):
            self.broken_url_objects[url] = broken_url

            # Associate referer using the referer header
            if referer_header is not None:
                self.associate_url_referrer(referer_header, broken_url)

            self.associate_url_referrers(broken_url)
    def url_save(self, mime_type, url):
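        """Save a URL and its MIME type for the current scan."""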
        url_object = Url(url=url, mime_type=mime_type,
                         scan=self.scanner.scan_object)
        url_object.save()
        return url_object