def parse(self, response: TextResponse):
    """Check a crawled page for problems and keep following its links.

    Two checks are run on the page:

    * the blank-page check (``self.blank_html_extractor.is_blank``), and
    * the wrong-word check (``self.error_correction_extractor.find_error``).

    For every check that fires, a notification mail is rendered and sent
    through ``self.mailer`` and a ``{'type', 'reason', 'url'}`` item is
    yielded.  Afterwards one ``Request`` is yielded per link extracted from
    the page and, when the page has a dataproxy link, the value returned by
    ``gen_dataproxy_links()`` is yielded as well.

    Args:
        response: The downloaded page.

    Yields:
        Problem-report dicts, follow-up ``Request`` objects and the value
        returned by ``DataProxyXmlLinkExtractor.gen_dataproxy_links()``.
    """
    page_url = response.url

    if self.blank_html_extractor.is_blank(response):
        blank_result = {
            'type': 'gov',
            'reason': '网页内容为空',
            'url': page_url,
        }
        # NOTE(review): the keyword is spelled ``tablehead`` here while other
        # callers in this file pass ``table_head`` -- confirm against the
        # signature of render_error_correction_result_mail.
        body = render_error_correction_result_mail(
            title='(PyScraper发送)错误网站',
            url=page_url,
            tablehead=['错误原因'],
            table_data=blank_result['reason'],
        )
        self.mailer.send(to=["*****@*****.**"],
                         subject='(PyScraper发送)网站纠错情况',
                         body=body,
                         mimetype='text/html')
        yield blank_result

    error_correction_result = self.error_correction_extractor.find_error(
        response)
    if error_correction_result:
        print("error_correction_result", error_correction_result)
        body = render_error_correction_result_mail(
            title='(PyScraper发送)错误网站',
            url=page_url,
            tablehead=['正确词', '错误词'],
            table_data=error_correction_result,
        )
        self.mailer.send(to=["*****@*****.**"],
                         subject='(PyScraper发送)网站纠错情况',
                         body=body,
                         mimetype='text/html')
        # The item must describe the wrong words that were found.  It used to
        # carry the "page inaccessible, status N" text that belongs to the
        # HTTP-error handler, which mislabelled every page with typos.
        # Assumes each entry is a mapping with 'correct' and 'error' keys, as
        # the other parse implementations in this file index it -- TODO
        # confirm.
        message = "\n".join(
            "正确词:{} 错误词: {}".format(error['correct'], error['error'])
            for error in error_correction_result
        )
        yield {
            'type': 'gov',
            'reason': '网页中有错误词:\n' + message,
            'url': page_url,
        }

    # Follow every link found on the page with this same callback.
    links: List[Link] = list(self.htmk_link_extractor.extract_links(response))
    for link in links:
        yield Request(link.url, callback=self.parse, errback=self.errorback)

    # Fetch the links of the dataproxy interface.
    # NOTE(review): whatever gen_dataproxy_links() returns is yielded as one
    # object -- confirm it is a single request/item and not a collection.
    data_proxy_extractor = DataProxyXmlLinkExtractor()
    if data_proxy_extractor.has_dataproxy_link(response):
        yield data_proxy_extractor.gen_dataproxy_links()
def parse(self, response: TextResponse):
    """Mail any wrong words found on the page, then follow its links.

    The page is passed to ``self.error_correction_extractor.find_error``.
    When that returns a truthy result, one table row per entry (correct
    word, wrong word, page URL) is rendered into a mail that is sent through
    ``self.mailer``.  No item is yielded for the wrong words themselves.

    Args:
        response: The downloaded page.

    Yields:
        One ``Request`` per extracted link and, when the page has a
        dataproxy link, the value returned by ``gen_dataproxy_links()``.
    """
    found_errors = self.error_correction_extractor.find_error(response)
    if found_errors:
        print("error_correction_result", found_errors)

        page_url = response.url
        rows = []
        for entry in found_errors:
            rows.append({
                'correct': entry['correct'],
                'error': entry['error'],
                'url': page_url,
            })

        # The renderer is called positionally: title, table head, table data.
        mail_body = render_error_correction_result_mail(
            '(PyScraper发送)错误网站',
            ['正确词', '错误词', '网站地址'],
            rows,
        )
        self.mailer.send(
            to=["*****@*****.**"],
            subject='(PyScraper发送)网站纠错情况',
            body=mail_body,
            mimetype='text/html',
        )

    # Collect all links first, then schedule each with this same callback.
    page_links: List[Link] = list(
        self.htmk_link_extractor.extract_links(response))
    for page_link in page_links:
        yield Request(page_link.url,
                      callback=self.parse,
                      errback=self.errorback)

    # Fetch the links of the dataproxy interface.
    dataproxy = DataProxyXmlLinkExtractor()
    if dataproxy.has_dataproxy_link(response):
        yield dataproxy.gen_dataproxy_links()
def errorback(self, failure):
    """Report a failed request.

    When the failure wraps an ``HttpError``, one
    ``{'type', 'reason', 'url', 'first_url'}`` item describing the HTTP
    status is yielded.  The ``url`` falls back to ``response.url`` when the
    request meta carries no ``"url"`` entry.  Every failure, HTTP or not, is
    then printed.

    Args:
        failure: The failure handed to the errback; ``failure.value`` is the
            underlying exception.

    Yields:
        One problem-report dict for an ``HttpError`` failure, nothing
        otherwise.
    """
    if isinstance(failure.value, HttpError):
        response = failure.value.response
        meta = response.meta
        request_url = meta.get("url")
        first_url = meta.get("first_url")
        yield {
            'type': 'gov',
            'reason': '网页无法访问状态{}'.format(response.status),
            'url': request_url or response.url,
            'first_url': first_url,
        }
        # NOTE: the notification mail for HTTP errors is disabled, so no mail
        # body is rendered here; rendering one only to discard it was wasted
        # work.

    # Printed at function level so that non-HTTP failures are reported too.
    print('response is error in response.url:', failure)
def parse(self, response: TextResponse):
    """Check a crawled page for problems and keep following its links.

    The URL reported for the page is the ``"url"`` entry of the request meta
    when it is set, otherwise ``response.url``.  For a blank page and for a
    page containing wrong words, a ``{'type', 'reason', 'url'}`` item is
    yielded first and a notification mail is rendered and sent afterwards.

    Args:
        response: The downloaded page.

    Yields:
        Problem-report dicts, one ``Request`` per extracted link (carrying
        the link URL in its meta under ``"url"``) and, when the page has a
        dataproxy link, the value returned by ``gen_dataproxy_links()``.
    """
    reported_url = response.meta.get("url") or response.url

    def send_report(table_head, table_data):
        # Render the result table and mail it to the fixed recipient.
        mail_body = render_error_correction_result_mail(
            title='(PyScraper发送)错误网站',
            url=reported_url,
            table_head=table_head,
            table_data=table_data,
        )
        self.mailer.send(
            to=["*****@*****.**"],
            subject='(PyScraper发送)网站纠错情况',
            body=mail_body,
            mimetype='text/html',
        )

    if self.blank_html_extractor.is_blank(response):
        blank_item = {
            'type': 'gov',
            'reason': '网页内容为空',
            'url': reported_url,
        }
        yield blank_item
        send_report(['错误原因'], blank_item['reason'])

    wrong_words = self.error_correction_extractor.find_error(response)
    if wrong_words:
        print("error_correction_result", wrong_words)
        line_template = "正确词:{} 错误词: {}"
        lines = []
        for word in wrong_words:
            lines.append(line_template.format(word['correct'], word['error']))
        yield {
            'type': 'gov',
            'reason': '网页中有错误词:\n' + "\n".join(lines),
            'url': reported_url,
        }
        send_report(['正确词', '错误词'], wrong_words)

    # Collect all links first, then schedule each one; the link URL travels
    # in the request meta so later callbacks can report it.
    page_links: List[Link] = list(
        self.htmk_link_extractor.extract_links(response))
    for page_link in page_links:
        target = page_link.url
        yield Request(target,
                      callback=self.parse,
                      errback=self.errorback,
                      meta={"url": target})

    # Fetch the links of the dataproxy interface.
    dataproxy = DataProxyXmlLinkExtractor()
    if dataproxy.has_dataproxy_link(response):
        yield dataproxy.gen_dataproxy_links()
def rendered_html_table_email():
    """Render the error-correction mail from fixed sample data.

    The sample consists of a title, a URL, a two-column table head and three
    identical rows, each pairing a correct word with a wrong one.

    Returns:
        Whatever ``render_error_correction_result_mail`` returns for the
        sample data.
    """
    # Three separate but equal row dicts.
    sample_rows = [
        {'correct': '中华人民共和国', 'error': '中黑人民共和国'}
        for _ in range(3)
    ]
    return render_error_correction_result_mail(
        title='(PyScraper发送)错误网站',
        url='http://www.w3school.com.cn/tags/tag_table.asp',
        table_head=['正确词', '错误词'],
        table_data=sample_rows,
    )