def scan(self, response):
    """Scan a response, saving its URL and handing the body to the scanner.

    The MIME type is taken from the Content-Type header when present;
    otherwise it is guessed from the URL's file extension and, failing
    that, sniffed from the response body with libmagic.  URLs whose
    processor type is 'ocr' are skipped when the scan has OCR disabled.

    :param response: the downloaded response to scan.
    """
    content_type = response.headers.get('content-type')
    if content_type:
        mime_type = parse_content_type(content_type)
    else:
        # Fall back to guessing from the URL's file extension; the
        # returned encoding is not needed here.
        mime_type, _ = mimetypes.guess_type(response.url)
        if not mime_type:
            try:
                # Last resort: sniff the type from the body bytes.
                mime_type = self.magic.from_buffer(response.body)
            except MagicException as me:
                logging.error(me)

    data = response.body

    if (Processor.mimetype_to_processor_type(mime_type) == 'ocr'
            and not self.scanner.scan_object.do_ocr):
        # OCR processing is disabled for this scan; ignore this URL.
        return

    # Save the URL item to the database, then pass the data on.
    url_object = Url(url=response.request.url, mime_type=mime_type,
                     scan=self.scanner.scan_object)
    url_object.save()
    self.scanner.scan(data, url_object)
def create_ressources(self, filename):
    """Stage an HTML fixture in tmp/ and wrap it in a NEW queue item.

    Returns ``None`` when the fixture file does not exist, mirroring
    the error handling of the other ``create_ressources`` helpers.

    :param filename: name of the fixture file under ``html/``.
    :return: a ``ConversionQueueItem`` in status NEW, or ``None``.
    """
    try:
        shutil.copy2(self.test_dir + 'html/' + filename,
                     self.test_dir + 'tmp/')
    except FileNotFoundError:
        print('File not found error: {}'.format(
            self.test_dir + 'html/' + filename))
        return None
    url = Url(scan=Scan(), url=self.test_dir + 'tmp/' + filename)
    item = ConversionQueueItem(url=url,
                               file=self.test_dir + 'tmp/' + filename,
                               type=html.HTMLProcessor,
                               status=ConversionQueueItem.NEW)
    return item
def broken_url_save(self, status_code, status_message, url):
    """Persist a broken URL with its status code and cleaned-up message.

    :param status_code: HTTP status code (or -1 when unavailable).
    :param status_message: raw error message; a leading "[...] " tag
        (e.g. "[Errno 2] ") is stripped and the message capitalized.
    :param url: the URL that failed.
    :return: the saved ``Url`` object.
    """
    # Lazy %-args: formatting only happens if INFO is enabled.
    logging.info("Handle Error: %s %s", status_message, url)
    # Raw string: the pattern contains backslash escapes for regex use.
    status_message = regex.sub(r"\[.+\] ", "", status_message)
    status_message = capitalize_first(status_message)
    # Add broken URL
    broken_url = Url(url=url,
                     scan=self.scanner.scan_object,
                     status_code=status_code,
                     status_message=status_message)
    secure_save(broken_url)
    return broken_url
def create_ressources(self, filename):
    """Stage a PDF fixture in tmp/ and run ``PDFProcessor.convert`` on it.

    Returns ``None`` when the fixture file does not exist, mirroring
    the error handling of the other ``create_ressources`` helpers.

    :param filename: name of the fixture file under ``pdf/``.
    :return: the result of the conversion, or ``None``.
    """
    try:
        shutil.copy2(self.test_dir + 'pdf/' + filename,
                     self.test_dir + 'tmp/')
    except FileNotFoundError:
        print('File not found error: {}'.format(
            self.test_dir + 'pdf/' + filename))
        return None
    url = Url(scan=Scan(), url=self.test_dir + 'tmp/' + filename)
    item = ConversionQueueItem(url=url,
                               file=self.test_dir + 'tmp/' + filename,
                               type=pdf.PDFProcessor,
                               status=ConversionQueueItem.NEW)
    with tempfile.TemporaryDirectory(dir=self.test_dir + 'tmp/') as temp_dir:
        # NOTE(review): passes the test case itself as the processor
        # instance; presumably convert() does not touch processor
        # state -- confirm before changing.
        result = pdf.PDFProcessor.convert(self, item, temp_dir)
    return result
def external_link_check(self, external_urls):
    """Perform external link checking.

    Checks each http/https URL with the link checker and saves any
    broken ones, associating them with their referring pages.

    :param external_urls: iterable of absolute external URLs.
    """
    # Lazy %-args: logging formats only when the level is enabled.
    logging.info("Link checking %d external URLs...", len(external_urls))
    for url in external_urls:
        url_parse = urlparse(url)
        if url_parse.scheme not in ("http", "https"):
            # We don't want to allow external URL checking of other
            # schemes (file:// for example)
            continue
        logging.info("Checking external URL %s", url)
        result = linkchecker.check_url(url)
        if result is not None:
            broken_url = Url(url=url,
                             scan=self.scan_object.webscan,
                             status_code=result["status_code"],
                             status_message=result["status_message"])
            broken_url.save()
            self.scanner_spider.associate_url_referrers(broken_url)
def external_link_check(self, external_urls):
    """Perform external link checking.

    Checks each http/https URL with the link checker and saves any
    broken ones, associating them with their referring pages.

    :param external_urls: iterable of absolute external URLs.
    """
    # Local import, presumably to avoid a circular import at module
    # load time -- confirm against the module's import graph.
    from os2webscanner.models.url_model import Url
    # Lazy %-args: logging formats only when the level is enabled.
    logging.info("Link checking %d external URLs...", len(external_urls))
    for url in external_urls:
        url_parse = urlparse(url)
        if url_parse.scheme not in ("http", "https"):
            # We don't want to allow external URL checking of other
            # schemes (file:// for example)
            continue
        logging.info("Checking external URL %s", url)
        result = linkchecker.check_url(url)
        if result is not None:
            broken_url = Url(url=url,
                             scan=self.scanner.scan_object.webscan,
                             status_code=result["status_code"],
                             status_message=result["status_message"])
            broken_url.save()
            self.scanner_crawler.spider.associate_url_referrers(broken_url)
def create_ressources(self, filename):
    """Stage an HTML fixture in tmp/ and build a NEW conversion item.

    Returns ``None`` when the fixture file is missing.
    """
    source = self.test_dir + 'html/' + filename
    staged = self.test_dir + 'tmp/' + filename
    try:
        shutil.copy2(source, self.test_dir + 'tmp/')
    except FileNotFoundError:
        print('File not found error: {}'.format(source))
        return None
    return ConversionQueueItem(
        url=Url(scan=Scan(), url=staged),
        file=staged,
        type=html.HTMLProcessor,
        status=ConversionQueueItem.NEW,
    )
def create_ressources(self, filename):
    """Stage a LibreOffice fixture in tmp/ and convert it.

    Returns ``None`` when the fixture file does not exist, mirroring
    the error handling of the other ``create_ressources`` helpers.

    :param filename: name of the fixture file under ``libreoffice/``.
    :return: the result of the conversion, or ``None``.
    """
    try:
        shutil.copy2(self.test_dir + 'libreoffice/' + filename,
                     self.test_dir + 'tmp/')
    except FileNotFoundError:
        print('File not found error: {}'.format(
            self.test_dir + 'libreoffice/' + filename))
        return None
    url = Url(scan=Scan(), url=self.test_dir + 'tmp/' + filename)
    item = ConversionQueueItem(url=url,
                               file=self.test_dir + 'tmp/' + filename,
                               type=libreoffice.LibreOfficeProcessor,
                               status=ConversionQueueItem.NEW)
    with tempfile.TemporaryDirectory(dir=self.test_dir + 'tmp/') as temp_dir:
        libreoffice_processor = libreoffice.LibreOfficeProcessor()
        # The processor needs a dedicated LibreOffice user profile dir.
        libreoffice_processor.set_home_dir(
            self.test_dir + 'libreoffice/home_dir/')
        result = libreoffice_processor.convert(item, temp_dir)
    return result
def create_ressources(self, filename):
    """Stage a zip fixture in tmp/ and run ``ZipProcessor.convert`` on it.

    Returns ``None`` when the fixture file is missing.
    """
    source = self.test_dir + 'zip/' + filename
    scratch = self.test_dir + 'tmp/'
    try:
        shutil.copy2(source, scratch)
    except FileNotFoundError:
        print('File not found error: {}'.format(source))
        return None
    staged = scratch + filename
    queue_item = ConversionQueueItem(
        pk=0,
        url=Url(scan=Scan(), url=staged),
        file=staged,
        type=zip.ZipProcessor,
        status=ConversionQueueItem.NEW,
    )
    with tempfile.TemporaryDirectory(dir=scratch) as temp_dir:
        return zip.ZipProcessor().convert(queue_item, temp_dir)
def create_ressources(self, filename):
    """Stage a LibreOffice fixture in tmp/ and convert it.

    Returns ``None`` when the fixture file is missing.
    """
    source = self.test_dir + 'libreoffice/' + filename
    scratch = self.test_dir + 'tmp/'
    try:
        shutil.copy2(source, scratch)
    except FileNotFoundError:
        print('File not found error: {}'.format(source))
        return None
    staged = scratch + filename
    queue_item = ConversionQueueItem(
        url=Url(scan=Scan(), url=staged),
        file=staged,
        type=libreoffice.LibreOfficeProcessor,
        status=ConversionQueueItem.NEW,
    )
    with tempfile.TemporaryDirectory(dir=scratch) as temp_dir:
        processor = libreoffice.LibreOfficeProcessor()
        processor.set_home_dir(self.test_dir + 'libreoffice/home_dir/')
        return processor.convert(queue_item, temp_dir)
def handle_error(self, failure):
    """Handle an error due to a non-success status code or other reason.

    If link checking is enabled, saves the broken URL and referrers.

    :param failure: the twisted Failure describing what went wrong.
    """
    # Defaults used when the failure carries no response/filename.
    url = getattr(failure.value, "filename", "Not Filled")
    status_message = "Not filled"
    status_code = -1
    # If scanner is type filescan
    if hasattr(self.scanner.scan_object, 'filescan'):
        # If file is a directory loop through files within
        if isinstance(failure.value, IOError) \
                and failure.value.errno == errno.EISDIR:
            logging.debug('File that is failing: {0}'.format(
                failure.value.filename))
            return self.append_file_request(
                'file://' + failure.value.filename)
        # If the file has not changed since the last scan, an
        # IgnoreRequest is returned.
        elif isinstance(failure.value, IgnoreRequest):
            return
        elif isinstance(failure.value, IOError):
            status_message = str(failure.value.errno)
    # Else if scanner is type webscan
    elif hasattr(self.scanner.scan_object, 'webscan'):
        # If we should not do link check, or the failure is an
        # IgnoreRequest that is not an HttpError, we know it is a
        # last-modified check.
        if (not self.scanner.scan_object.webscan.do_link_check
                or (isinstance(failure.value, IgnoreRequest)
                    and not isinstance(failure.value, HttpError))):
            return
    if hasattr(failure.value, "response"):
        response = failure.value.response
        url = response.request.url
        status_code = response.status
        status_message = response_status_message(status_code)
        if "redirect_urls" in response.request.meta:
            # Set URL to the original URL, not the URL after redirection
            url = response.request.meta["redirect_urls"][0]
        referer_header = response.request.headers.get("referer", None)
    else:
        url = failure.request.url
        status_code = -1
        status_message = "%s" % failure.value
        referer_header = None

    # Lazy %-args: formatting only happens if INFO is enabled.
    logging.info("Handle Error: %s %s", status_message, url)
    # Raw string: strip a leading "[...] " tag (e.g. "[Errno 2] ").
    status_message = regex.sub(r"\[.+\] ", "", status_message)
    status_message = capitalize_first(status_message)
    # Add broken URL
    broken_url = Url(url=url,
                     scan=self.scanner.scan_object,
                     status_code=status_code,
                     status_message=status_message)
    secure_save(broken_url)
    if hasattr(self.scanner.scan_object, 'webscan'):
        self.broken_url_objects[url] = broken_url
        # Associate referrer using the referer header
        if referer_header is not None:
            self.associate_url_referrer(referer_header, broken_url)
        self.associate_url_referrers(broken_url)
def url_save(self, mime_type, url):
    """Create, persist, and return a Url row for the current scan."""
    saved = Url(url=url,
                mime_type=mime_type,
                scan=self.scanner.scan_object)
    saved.save()
    return saved