Esempio n. 1
0
    def _extract(self, href, referer, retries=3):
        """Fetch one article page and extract its news fields.

        Args:
            href: URL of the article page; also stored as ``news_url``.
            referer: Value forwarded to ``rget`` as its ``referer`` argument.
            retries: How many additional fetch attempts to make when
                ``rget`` returns a falsy response.

        Returns:
            A dict with the keys ``category``, ``site``, ``tag``,
            ``news_url``, ``title``, ``content``, ``author`` and
            ``publish_time``; or ``None`` when the page cannot be fetched
            or parsed, or when the title or content is empty.
        """
        resp = rget(href, referer=referer)
        # Retry a bounded number of times instead of recursing without limit;
        # the result of the retry is actually used for the rest of the parse.
        while not resp and retries > 0:
            retries -= 1
            resp = rget(href, referer=referer)
        if not resp:
            return None

        html = etree.HTML(resp.content)
        # Compare against None explicitly: the truth value of an lxml element
        # depends on its number of children, not on whether parsing succeeded.
        if html is None:
            return None

        title = ''.join(html.xpath('//*[@id="activity-name"]/text()'))
        if not title:
            return None
        title = trim(title)

        # The publish date is read from the raw page text, not the DOM.
        dates = re.findall(r'publish_time = "(\d{4}-\d{2}-\d{2})"?', resp.text)
        publish_time = dates[0] if dates else ''
        author = trim(''.join(html.xpath('//*[@id="js_name"]/text()')))

        content = ''.join(html.xpath('//*[@id="js_content"]//text()'))
        if not content:
            return None
        # Insert the '&&&' marker after every Chinese full stop.
        content = trim(content.replace('。', '。&&&'))

        logger.debug('\033[96m title:{}; href:{}; content:{} \033[0m'.format(
            title, href, len(content)))

        return {
            'category': 'news',
            'site': self.url,
            'tag': -1,
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }
Esempio n. 2
0
    def _extract(self, href, referer):
        """Fetch one article page and extract its fields (category '包包').

        Args:
            href: URL of the article page; also stored as ``news_url``.
            referer: Value forwarded to ``rget`` as its ``referer`` argument.

        Returns:
            A dict with the keys ``category``, ``site``, ``tag``,
            ``news_url``, ``title``, ``content``, ``author`` and
            ``publish_time``; or ``None`` when the page cannot be fetched
            or parsed, or when no content text is found.
        """
        resp = rget(href, referer=referer)
        if not resp:
            return None
        html = etree.HTML(resp.content)
        # ``len(html) is None`` can never be true and raises TypeError when
        # parsing yields None, so test for None directly.
        if html is None:
            return None

        info = html.xpath('//div[@class="info"]/text()')
        # The info block may be missing; fall back to placeholders instead of
        # raising IndexError.
        tag = info[-1] if info else '-1'
        # The date is the part of the first text node before the first
        # non-breaking space.
        publish_time = info[0].split('\xa0')[0] if info else ''
        author = ''.join(html.xpath('//div[@class="info"]/a/text()'))

        title = ''.join(html.xpath('//div[@class="article_con"]/h1/text()'))

        content = ''.join(html.xpath('//div[@class="art_con"]//text()'))
        if content:
            # Insert the '&&&' marker after every Chinese full stop.
            content = trim(content.replace('。', '。&&&'))
        else:
            # Alternative layout: take the raw text of the "mcontent" block.
            content = ''.join(html.xpath('//div[@class="mcontent"]//text()'))
        if not content:
            return None

        logger.debug(
            '\033[96m title:{}; href:{}; tag:{}; content:{} \033[0m'.format(
                title, href, tag, len(content)))
        return {
            'category': '包包',
            'site': self.site,
            'tag': tag,
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }
Esempio n. 3
0
    def _extract(self, href, referer):
        """Fetch one article page and extract its fields (category '手表').

        Args:
            href: URL of the article page; also stored as ``news_url``.
            referer: Value forwarded to ``rget`` as its ``referer`` argument.

        Returns:
            A dict with the keys ``category``, ``site``, ``tag``,
            ``news_url``, ``title``, ``content``, ``author`` (always ``''``)
            and ``publish_time``; or ``None`` when the page cannot be fetched
            or parsed, the title is missing, the content is empty, or
            ``filter_`` reports a truthy result for the content.
        """
        resp = rget(href, referer=referer)
        if not resp:
            return None
        html = etree.HTML(resp.content)
        # Compare against None explicitly: the truth value of an lxml element
        # depends on its number of children, not on whether parsing succeeded.
        if html is None:
            return None

        # NOTE: '//h1[@class="nw"]/text()' was previously considered as a
        # fallback title selector but is not used.
        titles = html.xpath('//td[@class="article_title1a"]/h1/text()')
        if not titles:
            return None
        title = titles[0]

        tags = html.xpath(
            '//div[@id="pagecenter"]/table/tr[2]/td[1]/table/tr/td/a[3]/text()'
        )
        tag = tags[0] if tags else '-1'

        times = html.xpath('//td[@align="center"]//text()')
        # The date is the part of the first text node before the first
        # non-breaking space.
        publish_time = times[0].split('\xa0')[0] if times else '-1'

        text = ''.join(html.xpath('//td[@class="article_title2a"]//text()'))
        if len(text) <= 100:
            # Short texts are kept without sentence markers.
            content = trim(text)
        else:
            # Insert the '&&&' marker after every Chinese full stop.
            content = trim(text.replace('。', '。&&&'))

        # Check emptiness first so ``filter_`` only ever sees real content.
        if not content or filter_(content):
            return None

        logger.debug(
            '\033[96m title:{}; href:{}; tag:{}; content:{}\033[0m'.format(
                title, href, tag, len(content)))
        return {
            'category': '手表',
            'site': self.site,
            'tag': tag,
            'news_url': href,
            'title': title,
            'content': content,
            'author': '',
            'publish_time': publish_time,
        }
Esempio n. 4
0
    def _extract(self, href, referer):
        """Fetch one article page and extract its fields (category '手表').

        Args:
            href: URL of the article page; also stored as ``news_url``.
            referer: Value forwarded to ``rget`` as its ``referer`` argument.

        Returns:
            A dict with the keys ``category``, ``site``, ``tag``,
            ``news_url``, ``title``, ``content``, ``author`` and
            ``publish_time``; or ``None`` when the page cannot be fetched
            or parsed, the title is missing, the content is empty, or
            ``filter_`` reports a truthy result for the content.
        """
        resp = rget(href, referer=referer)
        if not resp:
            return None
        html = etree.HTML(resp.content)
        # Compare against None explicitly: the truth value of an lxml element
        # depends on its number of children, not on whether parsing succeeded.
        if html is None:
            return None

        titles = html.xpath('//*[@class="title"]/h1/text()')
        if not titles:
            return None
        title = titles[0]

        tags = html.xpath('//*[@class="breadcrumb left"]/p/a[2]/text()')
        tag = tags[0] if tags else '-1'

        times = html.xpath('//*[@class="article-attr"]/span[1]/text()')
        publish_time = times[0] if times else ''

        authors = html.xpath('//*[@class="article-attr"]/span[4]/text()')
        # The author is the segment after the first ':' of the span text.
        # A span without a colon yields '' instead of raising IndexError.
        author = ''
        if authors:
            parts = authors[0].split(':')
            if len(parts) > 1:
                author = parts[1]

        text = ''.join(html.xpath('//*[@class="article"]//p/text()'))
        if len(text) <= 100:
            # Short texts are kept without sentence markers.
            content = trim(text)
        else:
            # Insert the '&&&' marker after every Chinese full stop.
            content = trim(text.replace('。', '。&&&'))

        # Check emptiness first so ``filter_`` only ever sees real content.
        if not content or filter_(content):
            return None

        logger.debug('\033[96m title:{}; href:{}; tag:{}; content:{}\033[0m'
                     .format(title, href, tag, len(content)))
        return {
            'category': '手表',
            'site': self.site,
            'tag': tag,
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }
Esempio n. 5
0
    def _extract(self, href, referer):
        """Fetch one article page and extract its fields (category '包包').

        Args:
            href: URL of the article page; also stored as ``news_url``.
            referer: Value forwarded to ``rget`` as its ``referer`` argument.

        Returns:
            A dict with the keys ``category``, ``site``, ``tag``,
            ``news_url``, ``title``, ``content``, ``author`` and
            ``publish_time``; or ``None`` when the page cannot be fetched
            or parsed, the content is empty, or ``filter_`` reports a truthy
            result for the content.
        """
        resp = rget(href, referer=referer)
        if not resp:
            return None
        html = etree.HTML(resp.content)
        # ``len(html) is None`` can never be true and raises TypeError when
        # parsing yields None, so test for None directly.
        if html is None:
            return None

        tag = ''.join(html.xpath('//div[@class="h"]/a[last()]/text()'))
        title = ''.join(html.xpath('//font[@class="f5"]/text()'))

        # The byline holds both the publish date and the source name.
        byline = trim(''.join(html.xpath('//font[@class="f3"]/text()')))
        # Raw string: keeps the same pattern without invalid-escape warnings.
        matches = re.findall(r'发布时间:(\d{4}-\d{2}-\d{2})来源:(\w+)', byline)
        if matches:
            publish_time, author = matches[0]
        else:
            publish_time = author = ''

        content = ''.join(
            html.xpath(
                '//div[@class="mcontent"]//p[string-length(text()) >1]/text()')
        )
        if content:
            # Insert the '&&&' marker after every Chinese full stop.
            content = trim(content.replace('。', '。&&&'))
        else:
            # No usable <p> text: take the raw text of the whole block.
            content = ''.join(html.xpath('//div[@class="mcontent"]//text()'))

        # Check emptiness first so ``filter_`` only ever sees real content.
        if not content or filter_(content):
            return None

        logger.debug(
            '\033[96m title:{}; href:{}; tag:{}; content:{} \033[0m'.format(
                title, href, tag, len(content)))
        return {
            'category': '包包',
            'site': self.site,
            'tag': tag,
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }
Esempio n. 6
0
    def _extract(self, href, referer):
        """Fetch one article page and extract its news fields.

        Apart from the title, every field is pulled out of the raw response
        text with regular expressions rather than from the parsed DOM.

        Args:
            href: URL of the article page; also stored as ``news_url``.
            referer: Value forwarded to ``rget`` as its ``referer`` argument.

        Returns:
            A dict with the keys ``category``, ``site``, ``tag``,
            ``news_url``, ``title``, ``content``, ``author`` and
            ``publish_time``; or ``None`` when the page cannot be fetched
            or parsed, or when the title or content is empty.
        """
        resp = rget(href, referer=referer)
        if not resp:
            return None
        html = etree.HTML(resp.content)
        # Compare against None explicitly: the truth value of an lxml element
        # depends on its number of children, not on whether parsing succeeded.
        if html is None:
            return None

        title = ''.join(html.xpath('//title/text()'))
        if not title:
            return None
        title = trim(title)

        raw = resp.text
        times = re.findall(
            r"time: '(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'?", raw)
        publish_time = times[0] if times else ''
        author = trim(''.join(re.findall(r"name: '(\w+)'?", raw)))
        tag = ','.join(re.findall(r'{"name":"(\w+)"}\]?', raw))

        content = ''.join(re.findall(r"content: '(.+)'?", raw))
        if not content:
            return None
        content = trim(content)
        # NOTE(review): this is a character class, not a list of entities. It
        # deletes every '&', ';', '/', '#', '_', 'D', all lowercase ASCII
        # letters and every character in the range '.'..':' (which includes
        # all digits). Presumably meant to drop escaped HTML markup - confirm
        # before relying on digits or Latin text surviving in ``content``.
        # Raw string: same pattern, without invalid-escape warnings.
        content = re.sub(r'[&lt&gt&quot;pa-z\/#3D\.-:_]', '', content)
        # Insert the '&&&' marker after every Chinese full stop.
        content = content.replace('。', '。&&&')

        logger.debug('\033[96m title:{}; href:{}; content:{} \033[0m'.format(
            title, href, len(content)))

        return {
            'category': 'news',
            'site': self.url,
            'tag': tag,
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }
Esempio n. 7
0
    def _extract(self, href, referer, retries=3):
        """Fetch one article page and extract its news fields.

        Args:
            href: URL of the article page; also stored as ``news_url``.
            referer: Value forwarded to ``rget`` as its ``referer`` argument.
            retries: How many additional fetch attempts to make when
                ``rget`` returns a falsy response.

        Returns:
            A dict with the keys ``category``, ``site``, ``tag``,
            ``news_url``, ``title``, ``content``, ``author`` and
            ``publish_time``; or ``None`` when the page cannot be fetched
            or parsed, or when the title or content is empty.
        """
        resp = rget(href, referer=referer)
        # Retry a bounded number of times instead of recursing without limit;
        # the result of the retry is actually used for the rest of the parse.
        while not resp and retries > 0:
            retries -= 1
            resp = rget(href, referer=referer)
        if not resp:
            return None

        html = etree.HTML(resp.content)
        # Compare against None explicitly: the truth value of an lxml element
        # depends on its number of children, not on whether parsing succeeded.
        if html is None:
            return None

        title = ''.join(html.xpath('//h2[@class="title-info"]/text()'))
        if not title:
            return None
        title = trim(title)

        publish_time = trim(''.join(
            html.xpath('//footer[@class="time"]/text()')))
        author = trim(''.join(html.xpath('//header[@class="name"]/text()')))

        # The body is split between a visible and an initially hidden block;
        # paragraphs from both are concatenated in that order.
        paragraphs = (
            html.xpath('//div[@class="display-content"]//p/text()')
            + html.xpath('//div[@class="hidden-content hide"]//p/text()')
        )
        content = ''.join(paragraphs)
        if not content:
            return None
        # Insert the '&&&' marker after every Chinese full stop.
        content = trim(content.replace('。', '。&&&'))

        logger.debug('\033[96m title:{}; href:{}; content:{} \033[0m'.format(
            title, href, len(content)))

        return {
            'category': 'news',
            'site': self.url,
            'tag': '-1',
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }
Esempio n. 8
0
    def _extract(self, href, referer):
        """Fetch one article page and extract its fields (category '包包').

        Args:
            href: URL of the article page; also stored as ``news_url``.
            referer: Value forwarded to ``rget`` as its ``referer`` argument.

        Returns:
            A dict with the keys ``category``, ``site``, ``tag``,
            ``news_url``, ``title``, ``content``, ``author`` and
            ``publish_time``; or ``None`` when the page cannot be fetched
            or parsed, the content is empty, or ``filter_`` reports a truthy
            result for the content.
        """
        resp = rget(href, referer=referer)
        if not resp:
            return None
        html = etree.HTML(resp.content)
        # ``len(html) is None`` can never be true and raises TypeError when
        # parsing yields None, so test for None directly.
        if html is None:
            return None

        title = ''.join(html.xpath('//font[@class="f5 f6"]/text()'))
        tag = ''.join(html.xpath('//div[@id="ur_here"]/a[2]/text()'))

        # The byline is split on '/': first part author, second part date.
        byline = ''.join(html.xpath('//font[@class="f3"]/text()')).split('/')
        # A byline without '/' has no second part; use '' instead of raising
        # IndexError.
        publish_time = byline[1].strip() if len(byline) > 1 else ''
        author = byline[0].strip() if byline[0] else '-1'

        content = ''.join(
            html.xpath(
                '//div[@class="box_1"]/div//span[string-length(text()) >1]/text()'
            ))
        if not content:
            # No usable <span> text: fall back to <p> elements.
            content = ''.join(
                html.xpath(
                    '//div[@class="box_1"]/div//p[string-length(text()) >1]/text()'
                ))
        # Insert the '&&&' marker after every Chinese full stop.
        content = trim(content.replace('。', '。&&&'))

        # Reject the page when the content is empty OR filtered out. The
        # previous ``and`` let filtered (non-empty) content through.
        if not content or filter_(content):
            return None

        logger.debug(
            '\033[96m title:{}; href:{}; tag:{}; content:{}; \033[0m'.format(
                title, href, tag, len(content)))
        return {
            'category': '包包',
            'site': self.site,
            'tag': tag,
            'news_url': href,
            'title': title,
            'content': content,
            'author': author,
            'publish_time': publish_time,
        }