Esempio n. 1
0
    def _extract(self, href, referer):
        """Fetch one article page and extract its fields with XPath.

        Args:
            href: URL of the article page; also returned as ``news_url``.
            referer: Passed through to ``rget`` as its ``referer`` argument.

        Returns:
            A dict with the keys ``category``, ``site``, ``tag``,
            ``news_url``, ``title``, ``content``, ``author`` and
            ``publish_time``; or ``None`` when the request yields a falsy
            response, the document cannot be parsed, no title is found, or
            the content is empty or rejected by ``filter_``.
        """
        resp = rget(href, referer=referer)
        if not resp:
            return None
        html = etree.HTML(resp.content)
        # Test against None explicitly: truth-testing an lxml element checks
        # whether it has children (and emits a FutureWarning), which is not
        # the same question as "did parsing produce a document".
        if html is None:
            return None

        # NOTE: a fallback title location ('//h1[@class="nw"]/text()') was
        # previously present here but disabled.
        title_nodes = html.xpath('//td[@class="article_title1a"]/h1/text()')
        if not title_nodes:
            return None
        title = title_nodes[0]

        tag_nodes = html.xpath(
            '//div[@id="pagecenter"]/table/tr[2]/td[1]/table/tr/td/a[3]/text()'
        )
        tag = tag_nodes[0] if tag_nodes else '-1'

        # Only the part of the first text node before the first non-breaking
        # space is kept as the publish time.
        time_nodes = html.xpath('//td[@align="center"]//text()')
        publish_time = time_nodes[0].split('\xa0')[0] if time_nodes else '-1'

        body_text = ''.join(
            html.xpath('//td[@class="article_title2a"]//text()'))
        if len(body_text) > 100:
            # For longer texts, append the '&&&' marker after every Chinese
            # full stop (equivalent to split('。') + '。&&&'.join(...)).
            body_text = body_text.replace('。', '。&&&')
        content = trim(body_text)

        if filter_(content) or not content:
            return None
        logger.debug(
            '\033[96m title:{}; href:{}; tag:{}; content:{}\033[0m'.format(
                title, href, tag, len(content)))
        return {
            'category': '手表',
            'site': self.site,
            'tag': tag,
            'news_url': href,
            'title': title,
            'content': content,
            'author': '',
            'publish_time': publish_time,
        }
Esempio n. 2
0
    def _extract(self, href, referer):
        """Fetch one article page and extract its fields with XPath.

        Args:
            href: URL of the article page; also returned as ``news_url``.
            referer: Passed through to ``rget`` as its ``referer`` argument.

        Returns:
            A dict with the keys ``category``, ``site``, ``tag``,
            ``news_url``, ``title``, ``content``, ``author`` and
            ``publish_time``; or ``None`` when the request yields a falsy
            response, the document cannot be parsed, no title is found, or
            the content is empty or rejected by ``filter_``.
        """
        resp = rget(href, referer=referer)
        if not resp:
            return None
        html = etree.HTML(resp.content)
        # Test against None explicitly: truth-testing an lxml element checks
        # whether it has children (and emits a FutureWarning), which is not
        # the same question as "did parsing produce a document".
        if html is None:
            return None

        title_nodes = html.xpath('//*[@class="title"]/h1/text()')
        if not title_nodes:
            return None
        title = title_nodes[0]

        tag_nodes = html.xpath('//*[@class="breadcrumb left"]/p/a[2]/text()')
        tag = tag_nodes[0] if tag_nodes else '-1'

        time_nodes = html.xpath('//*[@class="article-attr"]/span[1]/text()')
        publish_time = time_nodes[0] if time_nodes else ''

        # The author span is expected to look like "<label>:<name>"; keep the
        # segment after the first colon. If the text contains no ASCII colon,
        # fall back to '' instead of raising IndexError.
        author = ''
        author_nodes = html.xpath('//*[@class="article-attr"]/span[4]/text()')
        if author_nodes:
            pieces = author_nodes[0].split(':')
            if len(pieces) > 1:
                author = pieces[1]

        body_text = ''.join(html.xpath('//*[@class="article"]//p/text()'))
        if len(body_text) > 100:
            # For longer texts, append the '&&&' marker after every Chinese
            # full stop (equivalent to split('。') + '。&&&'.join(...)).
            body_text = body_text.replace('。', '。&&&')
        content = trim(body_text)

        if filter_(content) or not content:
            return None
        logger.debug('\033[96m title:{}; href:{}; tag:{}; content:{}\033[0m'
                             .format(title, href, tag, len(content)))
        return {
            'category': '手表',
            'site': self.site,
            'tag': tag,
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }
Esempio n. 3
0
    def _extract(self, href, referer):
        """Fetch one article page and extract its fields with XPath.

        Args:
            href: URL of the article page; also returned as ``news_url``.
            referer: Passed through to ``rget`` as its ``referer`` argument.

        Returns:
            A dict with the keys ``category``, ``site``, ``tag``,
            ``news_url``, ``title``, ``content``, ``author`` and
            ``publish_time``; or ``None`` when the request yields a falsy
            response, the document cannot be parsed, or the content is empty
            or rejected by ``filter_``.
        """
        resp = rget(href, referer=referer)
        if not resp:
            return None
        html = etree.HTML(resp.content)
        # The previous guard `len(html) is None` could never be true and
        # raised TypeError when parsing produced None; test for None directly.
        if html is None:
            return None

        tag = ''.join(html.xpath('//div[@class="h"]/a[last()]/text()'))
        title = ''.join(html.xpath('//font[@class="f5"]/text()'))

        # The meta line is matched as "发布时间:<YYYY-MM-DD>来源:<word>";
        # group 1 is the publish date and group 2 the source, used as author.
        # Raw string so that \d and \w reach the regex engine unchanged.
        meta_text = trim(''.join(html.xpath('//font[@class="f3"]/text()')))
        matches = re.findall(
            r'发布时间:(\d{4}-\d{2}-\d{2})来源:(\w+)', meta_text)
        if matches:
            publish_time, author = matches[0]
        else:
            publish_time = author = ''

        # Prefer the text of <p> elements whose first text node is longer
        # than one character.
        content = ''.join(
            html.xpath(
                '//div[@class="mcontent"]//p[string-length(text()) >1]/text()')
        )
        if content:
            # Append the '&&&' marker after every Chinese full stop
            # (equivalent to split('。') + '。&&&'.join(...)).
            content = trim(content.replace('。', '。&&&'))
        else:
            # Fallback: all text under the content div, used as-is (it is not
            # passed through trim and gets no '&&&' markers).
            content = ''.join(html.xpath('//div[@class="mcontent"]//text()'))

        if filter_(content) or not content:
            return None
        logger.debug(
            '\033[96m title:{}; href:{}; tag:{}; content:{} \033[0m'.format(
                title, href, tag, len(content)))
        return {
            'category': '包包',
            'site': self.site,
            'tag': tag,
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }
Esempio n. 4
0
    def _extract(self, href, referer):
        """Fetch one article page and extract its fields with XPath.

        Args:
            href: URL of the article page; also returned as ``news_url``.
            referer: Passed through to ``rget`` as its ``referer`` argument.

        Returns:
            A dict with the keys ``category``, ``site``, ``tag``,
            ``news_url``, ``title``, ``content``, ``author`` and
            ``publish_time``; or ``None`` when the request yields a falsy
            response, the document cannot be parsed, or the content is empty
            or rejected by ``filter_``.
        """
        resp = rget(href, referer=referer)
        if not resp:
            return None
        html = etree.HTML(resp.content)
        # The previous guard `len(html) is None` could never be true and
        # raised TypeError when parsing produced None; test for None directly.
        if html is None:
            return None

        title = ''.join(html.xpath('//font[@class="f5 f6"]/text()'))
        tag = ''.join(html.xpath('//div[@id="ur_here"]/a[2]/text()'))

        # The meta line is split on '/': the first part is used as the author
        # and the second as the publish time.
        meta_parts = ''.join(
            html.xpath('//font[@class="f3"]/text()')).split('/')
        # Fall back to '' instead of raising IndexError when the meta line
        # contains no '/'.
        publish_time = meta_parts[1].strip() if len(meta_parts) > 1 else ''
        author = meta_parts[0].strip() if meta_parts[0] else '-1'

        # Prefer <span> text; fall back to <p> text when no span qualifies.
        content = ''.join(
            html.xpath(
                '//div[@class="box_1"]/div//span[string-length(text()) >1]/text()'
            ))
        if not content:
            content = ''.join(
                html.xpath(
                    '//div[@class="box_1"]/div//p[string-length(text()) >1]/text()'
                ))
        # Append the '&&&' marker after every Chinese full stop
        # (equivalent to split('。') + '。&&&'.join(...)).
        content = trim(content.replace('。', '。&&&'))

        # Reject the page when the content is filtered OR empty. The earlier
        # `and` only rejected content that was both empty and filtered, so
        # filtered or empty articles were still returned.
        if filter_(content) or not content:
            return None
        logger.debug(
            '\033[96m title:{}; href:{}; tag:{}; content:{}; \033[0m'.format(
                title, href, tag, len(content)))
        return {
            'category': '包包',
            'site': self.site,
            'tag': tag,
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }