# Example 1
class wzskjjSpider(Spider):
    """Recursively crawl wzkj.wenzhou.gov.cn, run the wording
    error-correction rules against every page, and e-mail a report for
    each page with hits.  Also discovers "dataproxy" XML interface links.
    """
    name = 'wzskjj'
    allowed_domains = ['wzkj.wenzhou.gov.cn']
    start_urls = ['http://wzkj.wenzhou.gov.cn/']
    # (regex-of-correct-phrase, wrong-phrase) pairs consumed by
    # ErrorCorrectionExtractor.  Raw strings keep the \s / \S regex
    # escapes intact -- non-raw '\s' only works because Python leaves
    # unknown escapes alone, and raises a DeprecationWarning on modern
    # interpreters.
    rules = [
        (r'温[\s\S]{1}市', '温州市'),
        (r'瑞[\s\S]{1}科技', '瑞)科技'),
    ]
    htmk_link_extractor = HtmlLinkExtractor()
    error_correction_extractor = ErrorCorrectionExtractor(
        rules, domain=allowed_domains[0])
    # SMTP credentials are masked in this snippet; fill in real values.
    mailer = MailSender(smtphost='smtp.qq.com',
                        mailfrom='*****@*****.**',
                        smtpport=465,
                        smtpssl=True,
                        smtpuser='******',
                        smtppass='******')
    custom_settings = {
        # 'CONCURRENT_REQUESTS_PER_DOMAIN' : 4,
        'LOG_LEVEL': 'INFO'
        # 'DOWNLOAD_DELAY': 0.3,
    }

    def parse(self, response: TextResponse):
        """Check one page for wording errors, mail a report when any are
        found, then follow every extracted link with this same callback.
        """
        result = self.error_correction_extractor.find_error(response)
        if result:
            # Use the spider's logger rather than bare print so output
            # respects LOG_LEVEL and lands in the Scrapy log.
            self.logger.info("error_correction_result %s", result)
            table_data = [{
                'correct': r['correct'],
                'error': r['error'],
                'url': response.url
            } for r in result]
            render_dict = {
                'title': '(PyScraper发送)错误网站',
                'table_head': ['正确词', '错误词', '网站地址'],
                'table_data': table_data
            }
            body = render_error_correction_result_mail(
                render_dict['title'], render_dict['table_head'],
                render_dict['table_data'])
            self.mailer.send(to=["*****@*****.**"],
                             subject='(PyScraper发送)网站纠错情况',
                             body=body,
                             mimetype='text/html')
        # extract_links() already yields Link objects; no need to copy
        # them into an intermediate list first.
        for link in self.htmk_link_extractor.extract_links(response):
            yield Request(link.url,
                          callback=self.parse,
                          errback=self.errorback)
        # Collect links to the "dataproxy" XML interface when the page
        # references one.  (The original used a stray bare-string
        # statement here; a comment expresses the intent without a no-op.)
        data_proxy_extractor = DataProxyXmlLinkExtractor()
        if data_proxy_extractor.has_dataproxy_link(response):
            # NOTE(review): this yields the return value as a single item;
            # if gen_dataproxy_links() returns a collection of requests,
            # `yield from` may be intended -- confirm against the extractor.
            yield data_proxy_extractor.gen_dataproxy_links()

    def errorback(self, failure):
        """Log failed requests (typo 'resonse' fixed)."""
        self.logger.error('response is error in response.url: %s', failure)
# Example 2
class xmcsSpider(Spider):
    """Recursively crawl wzkj.wenzhou.gov.cn, flag blank pages and
    wording errors, e-mail a report for each finding, and yield an item
    per finding.  Also discovers "dataproxy" XML interface links.
    """
    name = 'xmcs'
    allowed_domains = ['wzkj.wenzhou.gov.cn']
    start_urls = ['http://wzkj.wenzhou.gov.cn/']
    # No error-correction rules are configured for this spider.
    rules = []
    htmk_link_extractor = HtmlLinkExtractor()
    error_correction_extractor = ErrorCorrectionExtractor(rules, domain='wzkj.wenzhou.gov.cn')
    blank_html_extractor = BlankHtmlExtractor()
    # SMTP credentials are masked in this snippet; fill in real values.
    mailer = MailSender(smtphost='smtp.qq.com', mailfrom='*****@*****.**', smtpport=465,
                        smtpssl=True, smtpuser='******', smtppass='******')
    custom_settings = {
        # 'CONCURRENT_REQUESTS_PER_DOMAIN' : 4,
        'LOG_LEVEL': 'INFO'
        # 'DOWNLOAD_DELAY': 0.3,
    }

    def parse(self, response: TextResponse):
        """Inspect one page: report blank content and wording errors
        (item + e-mail each), then follow every extracted link with this
        same callback, carrying the requested URL in meta.
        """
        # The originally-requested URL (pre-redirect); falls back to the
        # final response.url below when the meta key is absent.
        request_url = response.meta.get("url")
        if self.blank_html_extractor.is_blank(response):
            blank_result = {'type': 'gov', 'reason': '网页内容为空', 'url': request_url or response.url}
            yield blank_result

            render_dict = {
                'title': '(PyScraper发送)错误网站',
                'url': request_url or response.url,
                'table_head': ['错误原因'],
                # NOTE(review): a plain string here, while the branch below
                # passes a list -- confirm what the renderer expects.
                'table_data': blank_result['reason']}
            body = render_error_correction_result_mail(**render_dict)
            self.mailer.send(to=["*****@*****.**"], subject='(PyScraper发送)网站纠错情况', body=body, mimetype='text/html')

        error_correction_result = self.error_correction_extractor.find_error(response)
        if error_correction_result:
            # Use the spider's logger rather than bare print so output
            # respects LOG_LEVEL and lands in the Scrapy log.
            self.logger.info("error_correction_result %s", error_correction_result)

            message = "\n".join(["正确词:{} 错误词: {}".format(error['correct'], error['error']) for error in error_correction_result])
            yield {'type': 'gov', 'reason': '网页中有错误词:\n' + message, 'url': request_url or response.url}

            render_dict = {
                'title': '(PyScraper发送)错误网站',
                'url': request_url or response.url,
                'table_head': ['正确词', '错误词'],
                'table_data': error_correction_result}
            body = render_error_correction_result_mail(**render_dict)
            self.mailer.send(to=["*****@*****.**"], subject='(PyScraper发送)网站纠错情况', body=body, mimetype='text/html')

        # extract_links() already yields Link objects; no need to copy
        # them into an intermediate list first.
        for link in self.htmk_link_extractor.extract_links(response):
            yield Request(link.url, callback=self.parse, errback=self.errorback, meta={"url": link.url})
        # Collect links to the "dataproxy" XML interface when the page
        # references one.  (The original used a stray bare-string
        # statement here; a comment expresses the intent without a no-op.)
        data_proxy_extractor = DataProxyXmlLinkExtractor()
        if data_proxy_extractor.has_dataproxy_link(response):
            # NOTE(review): this yields the return value as a single item;
            # if gen_dataproxy_links() returns a collection of requests,
            # `yield from` may be intended -- confirm against the extractor.
            yield data_proxy_extractor.gen_dataproxy_links()

    def errorback(self, failure):
        """Report unreachable pages: yield an item and e-mail a report
        for HTTP errors; log every failure.  (This is a generator, so the
        body runs when Scrapy consumes the errback's output.)
        """
        if isinstance(failure.value, HttpError):
            response = failure.value.response
            request_url = response.meta.get("url")
            result = {'type': 'gov', 'reason': '网页无法访问状态{}'.format(response.status), 'url': request_url or response.url}
            yield result

            render_dict = {
                'title': '(PyScraper发送)错误网站',
                'url': response.url,
                'table_head': ['错误原因'],
                # NOTE(review): plain string vs. the list passed in parse()
                # -- confirm what the renderer expects.
                'table_data': result['reason']}
            body = render_error_correction_result_mail(**render_dict)
            self.mailer.send(to=["*****@*****.**"], subject='(PyScraper发送)网站纠错情况', body=body, mimetype='text/html')

        self.logger.error('response is error in response.url: %s', failure)