def parse(self, response: TextResponse):
        """Check a crawled page for problems, report them, and follow its links.

        Two checks run on every response:

        * ``self.blank_html_extractor.is_blank(response)`` -- a truthy result
          is reported as a blank page.
        * ``self.error_correction_extractor.find_error(response)`` -- a truthy
          result is reported as a list of misspelled words.

        Each finding is rendered with ``render_error_correction_result_mail``,
        mailed through ``self.mailer`` and yielded as an item dict with the
        keys ``type``, ``reason`` and ``url``. After the checks, one
        ``Request`` per link returned by ``self.htmk_link_extractor`` is
        yielded (handled by ``parse`` again, failures go to ``errorback``).
        Finally, when ``DataProxyXmlLinkExtractor`` reports a dataproxy link,
        the value of its ``gen_dataproxy_links()`` is yielded.

        Args:
            response: The downloaded page to inspect.

        Yields:
            Item dicts describing problems, follow-up ``Request`` objects, and
            the result of ``gen_dataproxy_links()`` when applicable.
        """
        if self.blank_html_extractor.is_blank(response):
            blank_result = {
                'type': 'gov',
                'reason': '网页内容为空',
                'url': response.url
            }
            # NOTE(review): other versions of this callback spell this key
            # 'table_head'; confirm which keyword the render helper expects.
            render_dict = {
                'title': '(PyScraper发送)错误网站',
                'url': response.url,
                'tablehead': ['错误原因'],
                'table_data': blank_result['reason']
            }
            body = render_error_correction_result_mail(**render_dict)
            self.mailer.send(to=["*****@*****.**"],
                             subject='(PyScraper发送)网站纠错情况',
                             body=body,
                             mimetype='text/html')
            yield blank_result

        error_correction_result = self.error_correction_extractor.find_error(
            response)
        if error_correction_result:
            print("error_correction_result", error_correction_result)
            render_dict = {
                'title': '(PyScraper发送)错误网站',
                'url': response.url,
                'tablehead': ['正确词', '错误词'],
                'table_data': error_correction_result
            }
            body = render_error_correction_result_mail(**render_dict)
            self.mailer.send(to=["*****@*****.**"],
                             subject='(PyScraper发送)网站纠错情况',
                             body=body,
                             mimetype='text/html')

            # The page was fetched successfully here, so the item must describe
            # the misspelled words rather than an HTTP access failure.
            # Assumes every finding is a mapping with 'correct' and 'error'
            # keys, as the table header suggests -- TODO confirm against
            # find_error.
            message = "\n".join(
                "正确词:{} 错误词: {}".format(found['correct'], found['error'])
                for found in error_correction_result)
            yield {
                'type': 'gov',
                'reason': '网页中有错误词:\n' + message,
                'url': response.url
            }

        # Follow every extracted link with this same callback.
        for link in self.htmk_link_extractor.extract_links(response):
            yield Request(link.url,
                          callback=self.parse,
                          errback=self.errorback)

        # Fetch the links of the dataproxy interface.
        data_proxy_extractor = DataProxyXmlLinkExtractor()
        if data_proxy_extractor.has_dataproxy_link(response):
            yield data_proxy_extractor.gen_dataproxy_links()
Exemple #2
0
 def parse(self, response: TextResponse):
     """Mail any misspelling findings for the page, then follow its links.

     ``self.error_correction_extractor.find_error(response)`` is consulted
     first; a truthy result is printed, turned into one table row per
     finding (correct word, wrong word, page URL), rendered with
     ``render_error_correction_result_mail`` and sent through
     ``self.mailer``. Afterwards one ``Request`` per extracted link is
     yielded, and finally the value of ``gen_dataproxy_links()`` when a
     dataproxy link is detected.

     Args:
         response: The downloaded page to inspect.

     Yields:
         Follow-up ``Request`` objects and, when applicable, the result of
         ``gen_dataproxy_links()``.
     """
     findings = self.error_correction_extractor.find_error(response)
     if findings:
         print("error_correction_result", findings)

         # One table row per finding, each tagged with the page address.
         rows = []
         for finding in findings:
             rows.append({
                 'correct': finding['correct'],
                 'error': finding['error'],
                 'url': response.url,
             })

         # Positional order expected here: title, table head, table data.
         mail_body = render_error_correction_result_mail(
             '(PyScraper发送)错误网站',
             ['正确词', '错误词', '网站地址'],
             rows)
         self.mailer.send(
             to=["*****@*****.**"],
             subject='(PyScraper发送)网站纠错情况',
             body=mail_body,
             mimetype='text/html',
         )

     # Materialise the links first, then schedule each one for crawling.
     extracted = list(self.htmk_link_extractor.extract_links(response))
     for target in extracted:
         yield Request(target.url,
                       callback=self.parse,
                       errback=self.errorback)

     # Fetch the links of the dataproxy interface.
     proxy_links = DataProxyXmlLinkExtractor()
     if proxy_links.has_dataproxy_link(response):
         yield proxy_links.gen_dataproxy_links()
Exemple #3
0
    def errorback(self, failure):
        """Handle a failed request: report HTTP errors and print the failure.

        When ``failure.value`` is an ``HttpError``, an item dict describing
        the unreachable page is yielded (keys ``type``, ``reason``, ``url``
        and ``first_url``) and the matching mail body is rendered. Every
        failure, HTTP error or not, is printed at the end.

        Args:
            failure: The failure object handed to the errback; its ``value``
                attribute holds the raised exception.

        Yields:
            One item dict for an ``HttpError`` failure, nothing otherwise.
        """
        error = failure.value
        if isinstance(error, HttpError):
            response = error.response
            meta = response.meta
            request_url = meta.get("url")
            first_url = meta.get("first_url")
            reason = '网页无法访问状态{}'.format(response.status)

            # Prefer the URL recorded in the request meta over the final one.
            yield {
                'type': 'gov',
                'reason': reason,
                'url': request_url or response.url,
                'first_url': first_url,
            }

            # The mail body is still rendered, but delivery through the mailer
            # is switched off in this callback, so the result is not used.
            render_error_correction_result_mail(
                title='(PyScraper发送)错误网站',
                url=first_url or response.url,
                table_head=['错误原因'],
                table_data=reason,
            )

        print('response is error in response.url:', failure)
Exemple #4
0
    def parse(self, response: TextResponse):
        """Inspect a crawled page, report problems, and schedule its links.

        A blank page (per ``self.blank_html_extractor.is_blank``) and any
        misspelling findings (per ``self.error_correction_extractor
        .find_error``) are each yielded as an item dict with the keys
        ``type``, ``reason`` and ``url`` and then mailed through
        ``self.mailer``. The reported URL is the ``"url"`` entry of
        ``response.meta`` when it is truthy, otherwise ``response.url``.
        Afterwards one ``Request`` per extracted link is yielded, carrying
        the link URL in its meta, followed by the value of
        ``gen_dataproxy_links()`` when a dataproxy link is detected.

        Args:
            response: The downloaded page to inspect.

        Yields:
            Item dicts describing problems, follow-up ``Request`` objects, and
            the result of ``gen_dataproxy_links()`` when applicable.
        """
        page_url = response.meta.get("url") or response.url

        def send_report(table_head, table_data):
            # Render the report table and mail it to the fixed recipient.
            mail_body = render_error_correction_result_mail(
                title='(PyScraper发送)错误网站',
                url=page_url,
                table_head=table_head,
                table_data=table_data,
            )
            self.mailer.send(
                to=["*****@*****.**"],
                subject='(PyScraper发送)网站纠错情况',
                body=mail_body,
                mimetype='text/html',
            )

        if self.blank_html_extractor.is_blank(response):
            blank_reason = '网页内容为空'
            yield {'type': 'gov', 'reason': blank_reason, 'url': page_url}
            send_report(['错误原因'], blank_reason)

        findings = self.error_correction_extractor.find_error(response)
        if findings:
            print("error_correction_result", findings)

            # One line per finding: the correct word next to the wrong one.
            summary_lines = []
            for finding in findings:
                summary_lines.append(
                    "正确词:{} 错误词: {}".format(finding['correct'],
                                              finding['error']))
            yield {
                'type': 'gov',
                'reason': '网页中有错误词:\n' + "\n".join(summary_lines),
                'url': page_url,
            }
            send_report(['正确词', '错误词'], findings)

        # Materialise the links first, then schedule each one for crawling.
        extracted = list(self.htmk_link_extractor.extract_links(response))
        for target in extracted:
            yield Request(target.url,
                          callback=self.parse,
                          errback=self.errorback,
                          meta={"url": target.url})

        # Fetch the links of the dataproxy interface.
        proxy_links = DataProxyXmlLinkExtractor()
        if proxy_links.has_dataproxy_link(response):
            yield proxy_links.gen_dataproxy_links()
Exemple #5
0
def rendered_html_table_email():
    """Render a sample error-correction mail from fixed demo data.

    The table holds three identical rows, each pairing a correct word with
    its misspelled form, and the result of
    ``render_error_correction_result_mail`` is returned unchanged.
    """
    # Three separate dicts, so every row is its own object.
    sample_rows = [
        {'correct': '中华人民共和国', 'error': '中黑人民共和国'}
        for _ in range(3)
    ]
    return render_error_correction_result_mail(
        title='(PyScraper发送)错误网站',
        url='http://www.w3school.com.cn/tags/tag_table.asp',
        table_head=['正确词', '错误词'],
        table_data=sample_rows,
    )