def run(self, execution, page, validator, response):
    """Record an HTTP error Result for *page* when the response failed.

    Only responses with status >= 400 produce a Result; anything else
    is ignored. Stores execution/page/validator on ``self`` either way.
    """
    self.execution = execution
    self.page = page
    self.validator = validator
    if response.status < 400:
        return
    report = {
        "page": response.url,
        "errors": [{"error": response.status}],
    }
    Result(description=report,
           execution=self.execution,
           page=self.page,
           validator=self.validator).save()
def run(self, execution, page, validator):
    """Validate ``self.response`` with HTML Tidy and persist the findings.

    Assumes ``self.response`` was set by the caller before this runs —
    TODO confirm against the crawler that drives this validator.
    """
    self.execution = execution
    self.page = page
    self.validator = validator
    errors_json = {"page": self.response.url, "errors": []}
    # tidy_document returns (cleaned_document, report); the report is a
    # newline-separated string of warnings/errors.
    document, errors = tidy_document(self.response.body,
                                     options={'numeric-entities': 1})
    for error in errors.split('\n'):
        # Bug fix: splitting an empty report yields [''], which used to
        # record a bogus empty error entry. Skip blank lines.
        if error:
            errors_json['errors'].append({"error": error})
    result = Result(description=errors_json, execution=self.execution,
                    page=self.page, validator=self.validator)
    result.save()
def handleError(self, reason):
    """Append a failure *reason* to this page's Result, creating it if absent.

    Runs inside a manually-committed transaction; the commit happens only
    after the Result is saved.
    """
    with transaction.commit_manually():
        try:
            result = Result.objects.get(execution=self.execution,
                                        page=self.page,
                                        validator=self.validator)
            errors_json = result.description
        # Bug fix: was a bare ``except:``, which silently swallowed any
        # error (including programming errors). Narrowed to the specific
        # "no such row" exception, consistent with the sibling handlers.
        except ObjectDoesNotExist:
            result = Result(execution=self.execution, page=self.page,
                            validator=self.validator)
            errors_json = {"page": self.response.url, "errors": []}
        errors_json['errors'].append({"error": reason.getErrorMessage()})
        result.description = errors_json
        result.save()
        transaction.commit()
def process_exception(self, request, exception, spider):
    """Record a broken-link failure raised while fetching *request*.

    The error is attached to the referring page's Result (created on
    first failure). Always returns None so other middlewares still run.
    """
    if 'BrokenLinksValidator' not in spider.validators:
        return None
    validator = spider.validators['BrokenLinksValidator']
    # The broken link is reported against the page that linked to it.
    page = spider.pages.get(request.headers['Referer'])
    try:
        result = Result.objects.get(execution=spider.execution,
                                    page=page,
                                    validator=validator)
    except ObjectDoesNotExist:
        result = Result(description={"page": request.url, "errors": []},
                        execution=spider.execution,
                        page=page,
                        validator=validator)
    result.description['errors'].append({"error": str(exception)})
    result.save()
    return None
def process_response(self, request, response, spider):
    """Run the W3C (HTML Tidy) validator over *response* and store a Result.

    Only pages whose domain is in the spider's allowed domains are
    validated. Returns the response unchanged so the middleware chain
    continues.
    """
    domain = urlparse(response.url).netloc
    if 'W3CValidator' in spider.validators and domain in spider.allow_domains:
        validator = spider.validators['W3CValidator']
        page = spider.pages.get(response.url)
        errors_json = {"page": response.url, "errors": []}
        document, errors = tidy_document(response.body,
                                         options={'numeric-entities': 1})
        for error in errors.split('\n'):
            # Bug fix: an empty tidy report splits to [''] and used to
            # record one bogus empty error. Skip blank lines.
            if error:
                errors_json['errors'].append({"error": error})
        result = Result(description=errors_json, execution=spider.execution,
                        page=page, validator=validator)
        result.save()
    return response
def process_response(self, request, response, spider):
    """Flag pages whose download took longer than the configured limit.

    The threshold comes from the DownloadTimeValidator parameters; the
    request's ``__start_time`` meta key (set upstream) marks when the
    download began. Returns the response unchanged.
    """
    domain = urlparse(response.url).netloc
    if ('DownloadTimeValidator' not in spider.validators
            or domain not in spider.allow_domains):
        return response
    validator = spider.validators['DownloadTimeValidator']
    page = spider.pages.get(response.url)
    start_time = request.meta['__start_time']
    end_time = time()
    limit_time = float(
        spider.parameters["DownloadTimeValidator"]["fields"]["time"])
    if end_time - start_time > limit_time:
        log.msg("R1 " + request.url + " " + str(end_time) + " - " +
                str(request.meta['__start_time']) + " = " +
                str(end_time - request.meta['__start_time']),
                level=log.INFO)
        errors_json = {
            "page": response.url,
            "errors": [
                {"error": "Download Time exceded (> " + str(limit_time) + ")"}
            ],
        }
        Result(description=errors_json, execution=spider.execution,
               page=page, validator=validator).save()
    return response
def process_response(self, request, response, spider):
    """Spell-check HTML responses and persist any misspellings found.

    Only text/html responses from allowed domains are checked. Returns
    the response unchanged so the middleware chain continues.
    """
    domain = urlparse(response.url).netloc
    if ('SpellingValidator' in spider.validators
            and domain in spider.allow_domains
            and 'text/html' in response.headers['Content-Type']):
        validator = spider.validators['SpellingValidator']
        page = spider.pages.get(response.url)
        # Strip scripts/markup so only visible text reaches the checker.
        cleaner = Cleaner(scripts=True, embedded=True, meta=True,
                          page_structure=True, links=True, style=True,
                          processing_instructions=True, annoying_tags=True,
                          remove_tags=['a', 'ul', 'li', 'table', 'tr', 'td',
                                       'div', 'span', 'img', 'p', 'h1', 'h2',
                                       'h3', 'strong', 'body', 'br'])
        text = cleaner.clean_html(response.body)
        lang = spider.parameters["SpellingValidator"]["fields"]["language"]
        # NOTE(review): a *set* literal is passed, so "en_GB" is always
        # checked alongside the configured language — confirm intended.
        misspellings = self.check_errors({lang, "en_GB"}, text)
        report = {"page": response.url, "errors": []}
        for word in misspellings:
            report['errors'].append({"error": word})
        result = Result(description=report, execution=spider.execution,
                        page=page, validator=validator)
        result.save()
    return response
def dataReceived(self, body):
    """Parse a downloaded image and record size violations for its page.

    Compares the file size and pixel dimensions against the
    ImagesValidator limits, appending one error entry per breach. The
    Result is only saved when at least one error was recorded.
    """
    parser = ImageFile.Parser()
    parser.feed(body)
    # NOTE(review): parser.image is None when the fed data is not enough
    # to identify an image, which would raise AttributeError here —
    # confirm callers only feed complete image bodies.
    (self.width, self.height) = parser.image.size
    try:
        result = Result.objects.get(execution=self.spider.execution,
                                    page=self.page,
                                    validator=self.validator)
    except ObjectDoesNotExist:
        result = Result(description={"page": self.page.url, "errors": []},
                        execution=self.spider.execution,
                        page=self.page,
                        validator=self.validator)
    fields = self.spider.parameters["ImagesValidator"]["fields"]
    max_width = int(fields["width"])
    max_height = int(fields["height"])
    max_size = int(fields["size"])
    # File-size limit is configured in KiB.
    if self.filesize > max_size * 1024:
        result.description['errors'].append(
            {"url": self.url, "error": "Tamaño del fichero demasiado grande."})
    if self.width > max_width or self.height > max_height:
        result.description['errors'].append(
            {"url": self.url, "error": "Imagen demasiado grande."})
    if len(result.description["errors"]) > 0:
        result.save()