コード例 #1
0
 def parse_item(response):
     """Parse a Tibetan-language article from tibet.people.com.cn into a NewsItem.

     Yields the item only when body text was extracted and the item survives
     the judge_time_news_people date filter.
     """
     sel = Selector(response)
     url = response.request.url
     # Escaped dots match the literal domain only; the original pattern's
     # bare '.' would have matched any character in their place.
     if re.match(r'.*?tibet\.people\.com\.cn/', url):
         content = response.xpath(
             '//html/body/div[2]/div[4]/div[1]/div[2]/div[2]/text()').extract()
         if content:
             item = NewsItem(
                 domainname='http://tibet.people.com.cn',
                 chinesename='people',
                 url=sel.root.base,
                 title=sel.css('.gq_content > h1:nth-child(2)::text').extract_first(),
                 subtitle=sel.css('.sub::text').extract_first(),
                 language='tibet',
                 encodingtype='utf-8',
                 corpustype='网络',
                 # Tibetan date particles are rewritten to 年/月/日 so the
                 # timestamp has the same shape as the Chinese spiders'.
                 timeofpublish=sel.re(r'\d{4}.*?\d{2}.*?\d{2}.*?\d{2}:\d{2}')[0].replace('ལོའི་ཟླ་ ', '年').replace('ཚེས་', '月').replace('ཉིན།  ', '日'),
                 content=''.join(content),
             )
             item = judge_time_news_people(item)
             if item:
                 yield item
コード例 #2
0
 def parse_item(response):
     """Build a NewsItem from a people.com.cn article page and yield it
     when the publication-date filter accepts it.
     """
     sel = Selector(response)
     page_url = response.request.url
     matched = re.match(r'.*?people.com.cn.*?/\d+/\d+/.*?', page_url)
     if matched and 'BIG' not in page_url:
         paragraphs = response.xpath(
             '//*[@id="rwb_zw"]/p/text() | //*[@id="rwb_zw"]/p/strong/text()'
         ).extract()
         if not paragraphs:
             return
         news = NewsItem(
             domainname='http://people.com.cn',
             chinesename='人民网',
             url=sel.root.base,
             title=sel.css('div.text_title > h1::text').extract_first(),
             subtitle=sel.css('.sub::text').extract_first(),
             language='中文',
             encodingtype='utf-8',
             corpustype='网络',
             timeofpublish=sel.re(r'\d{4}年\d{2}月\d{2}日\d{2}:\d{2}')[0],
             content=''.join(paragraphs),
             source=sel.css('div.box01 > div.fl > a::text').extract_first(),
             author=sel.css('p.author::text').extract_first())
         news = judge_time_news(news)
         if news:
             yield news
コード例 #3
0
ファイル: SinaNewsSpider.py プロジェクト: Hansen06/NewsSpider
    def parse_item(response):
        """Parse a sina.com.cn article page into a NewsItem.

        The editor byline is stripped from the body text before the item
        is assembled; the item is yielded only if it passes the
        judge_time_news date filter.
        """
        sel = Selector(response)
        url = response.request.url
        if re.match(r'.*?sina.com.*?/\d{4}-\d{2}-\d{2}/.*?', url):
            content = response.xpath(
                '//*[@id="artibody"]//p//text()').extract()
            # Remove the editor credit, but only when it actually appears in
            # the extracted body -- list.remove() raises ValueError otherwise.
            editor = response.xpath(
                '//*[@class="article-editor"]/text()').extract_first()
            if editor and editor in content:
                content.remove(editor)
            publish_time = sel.re(r'\d{4}年\d{2}月\d{2}日.{0,1}\d{2}:\d{2}')[0]
            # Collapse "2016年01月02日 12:34" to a space-free timestamp.
            publish_time = publish_time.replace(' ', '')

            if content:
                item = NewsItem(
                    domainname='http://sina.com.cn',
                    chinesename='新浪网',
                    url=sel.root.base,
                    title=sel.css('#artibodyTitle::text, #main_title::text'
                                  ).extract_first(),
                    subtitle=sel.css('.sub::text').extract_first(),
                    language='中文',
                    encodingtype='utf-8',
                    corpustype='网络',
                    timeofpublish=publish_time,
                    content=''.join(content),
                    source=sel.xpath(
                        '//*[@data-sudaclick="media_name"]/text() | //*[@data-sudaclick="media_name"]/a/text()'
                    ).extract_first(),
                    author=None)
                item = judge_time_news(item)
                if item:
                    yield item
コード例 #4
0
ファイル: SohuNewsSpider.py プロジェクト: Hansen06/NewsSpider
    def parse_item(response):
        """Build a NewsItem from a sohu.com article page and yield it when
        the publication-date filter accepts it.
        """
        sel = Selector(response)
        url = response.request.url
        if not re.match(r'.*?sohu.com.*?/\d{4}\d{2}\d{2}/.*?', url):
            return
        body = response.xpath(
            '//*[@itemprop="articleBody"]//p//text()').extract()
        # Some articles keep paragraphs outside <p>; widen the XPath when
        # the first pass finds almost nothing.
        if len(body) < 3:
            body = response.xpath(
                '//*[@itemprop="articleBody"]//p//text() | //*[@id="contentText"]//div/text()'
            ).extract()

        publish_time = sel.re(
            r'\d{4}-\d{2}-\d{2} {0,1}\d{2}:\d{2}:\d{2}')[0]
        if not body:
            return
        news = NewsItem(
            domainname='http://sohu.com',
            chinesename='搜狐网',
            url=sel.root.base,
            title=sel.xpath(
                '//*[@itemprop="headline"]/text()').extract_first(),
            subtitle=sel.css('.sub::text').extract_first(),
            language='中文',
            encodingtype='utf-8',
            corpustype='网络',
            timeofpublish=publish_time,
            content=''.join(body),
            source=sel.xpath(
                '//*[@id="media_span"]/span/text()').extract_first(),
            author=sel.xpath(
                '//*[@id="author_baidu"]/text()').extract_first())
        news = judge_time_news(news)
        if news:
            yield news
コード例 #5
0
    def parse_item(response):
        """Parse a Tibetan-language article from tb.chinatibetnews.com and
        yield it as a NewsItem.
        """
        sel = Selector(response)
        url = response.request.url
        # Escaped dots restrict the match to the literal domain.
        if re.match(r'.*?tb\.chinatibetnews\.com/', url):
            content = response.xpath(
                '/html/body/div[4]/div[1]/div[2]/ul[1]/li[2]/div[2]/div[1]//p//text()'
            ).extract()
            editor = response.xpath(
                '//*[@class="-articleeditor"]/text()').extract_first()
            title = response.xpath(
                '/html/body/div[4]/div[1]/div[2]/ul[1]/li[1]/p[2]//text()'
            ).extract()
            # Drop the editor credit only when it was actually captured in
            # the body text; list.remove() raises ValueError otherwise.
            if editor and editor in content:
                content.remove(editor)
            publish_time = sel.re(r'\d{4}-\d{2}-\d{2}')[0]
            publish_time = publish_time.replace(' ', '')

            if content:
                # NOTE(review): domainname points at tibet.cpc.people.com.cn
                # while the URL filter targets tb.chinatibetnews.com -- looks
                # like a copy/paste leftover; confirm which domain is meant.
                item = NewsItem(domainname='http://tibet.cpc.people.com.cn/',
                                chinesename='tibet3',
                                url=sel.root.base,
                                title=''.join(title),
                                subtitle=sel.css('.sub::text').extract_first(),
                                language='藏文',
                                encodingtype='utf-8',
                                corpustype='网络',
                                timeofpublish=publish_time,
                                content=''.join(content),
                                author=None)
                yield item
コード例 #6
0
    def parse_item(response):
        """Parse a Tibetan-language article from xizang.news.cn into a
        NewsItem, yielded only when the date filter accepts it.
        """
        sel = Selector(response)
        url = response.request.url
        if re.match(r'.*?/\d{4}-\d{2}/\d{2}/.*?', url):
            content = response.xpath(
                '/html/body/div[6]/div/div/div[3]//p//text()').extract()
            # Strip the editor credit when present in the body; a bare
            # list.remove() raises ValueError if it is not there.
            editor = response.xpath(
                '//*[@class="-articleeditor"]/text()').extract_first()
            if editor and editor in content:
                content.remove(editor)
            publish_time = sel.re(r'\d{4}-\d{2}-\d{2}')[0]
            publish_time = publish_time.replace(' ', '')

            if content:
                item = NewsItem(
                    domainname='http://xizang.news.cn/',
                    chinesename='tibetxinhua',
                    url=sel.root.base,
                    title=sel.css('#ArticleTit::text').extract_first(),
                    subtitle=sel.css('.sub::text').extract_first(),
                    language='藏文',
                    encodingtype='utf-8',
                    corpustype='网络',
                    timeofpublish=publish_time,
                    content=''.join(content),
                    source=sel.css(
                        '#Articlely > div.laiyuan > a::text').extract_first(),
                    author=sel.css(
                        '#contentK > div.xinxi > span:nth-child(3)::text'
                    ).extract_first())
                item = judge_time_news(item)
                if item:
                    yield item
コード例 #7
0
 def parse_item(response):
     """Parse a CCTV (uyghur.cntv.com) article page into a NewsItem."""
     sel = Selector(response)
     url = response.request.url
     if re.match(r'.*?/\d{4}/\d{2}/\d{2}/.*?', url):
         content = response.xpath(
             '//*[@id="page_body"]/div[2]/div/div[1]/div[1]/div/div[2]/p[2]/text() | //*[@id="page_body"]/div[2]/div/div[1]/div[1]/div/div[2]/p[2]/span/text()'
         ).extract()
         if content:
             # Pull the publish date out of the byline paragraph.  Guarding
             # both steps avoids a TypeError when the paragraph is missing
             # and an AttributeError when no date is found inside it.
             byline = sel.css('.title_hd > p:nth-child(2)::text').extract_first() or ''
             date_match = re.search(r'\d{2}-\d{2}-\d{4}', byline)
             item = NewsItem(
                 domainname='http://uyghur.cntv.com',
                 chinesename='CCTV',
                 url=sel.root.base,
                 title=sel.css(
                     '.title_hd > h3:nth-child(1)::text').extract_first(),
                 subtitle=sel.css('.sub::text').extract_first(),
                 language='中文',
                 encodingtype='utf-8',
                 corpustype='网络',
                 timeofpublish=date_match.group(0) if date_match else None,
                 content=''.join(content),
             )
             yield item
コード例 #8
0
    def parse_item(response):
        """Parse a Uyghur-language people.com.cn article into a NewsItem."""
        sel = Selector(response)
        url = response.request.url
        if re.match(r'.*?people.com.cn.*?/\d+/.*?', url) and 'BIG' not in url:
            content = response.xpath(
                '//*[@id="p_content"]/span/text() | //*[@class="clearfix"]/p/text()'
            ).extract()
            if content:
                item = NewsItem(
                    domainname='http://uyghur.people.com.cn/',
                    chinesename='维语人民网',
                    url=sel.root.base,
                    title=sel.css('div.ej_right > h1::text').extract_first(),
                    subtitle=sel.css('div.ej_right > h5::text').extract_first(),
                    language='维文',
                    encodingtype='utf-8',
                    corpustype='网络',
                    timeofpublish=sel.css('div.ej_right #p_publishtime::text').extract_first(),
                    content=''.join(content),
                    source=sel.css('div.ej_right #p_origin > a:nth-child(1)::text').extract_first(),
                    # NOTE(review): author reuses the publish-time selector --
                    # looks like a copy/paste slip; confirm the intended field.
                    author=sel.css('div.ej_right #p_publishtime::text').extract_first(),
                )
                yield item
コード例 #9
0
ファイル: NewsSpider.py プロジェクト: Hansen06/NewsSpider
    def parse_item(response):
        """Parse a news.163.com article page into a NewsItem, yielded only
        when the publication-date filter accepts it.
        """
        sel = Selector(response)
        if re.match(r'.*?news.163.com.*?/\d+/\d+/.*?', response.request.url):
            content = response.xpath('//*[@id="endText"]//p//text()').extract()
            need_removes = response.xpath('//*[@id="endText"]//p//style/text()').extract()
            # Drop the "original title" line only when it actually landed in
            # the body list; list.remove() raises ValueError otherwise.
            otitle = response.xpath('//p[@class="otitle"]/text()').extract_first()
            if otitle and otitle in content:
                content.remove(otitle)

            # Embedded <style> text marks video pages whose body extraction
            # picks up markup noise; blank those fragments out.
            if need_removes:
                for i, fragment in enumerate(content):
                    if fragment.startswith('\n\t') or fragment.count('=') > 6:
                        content[i] = ''
            title = sel.css('#epContentLeft > h1::text').extract_first()
            if not title:
                title = sel.css('head > title::text').extract_first()
                if title:
                    # Trim the "_site" suffix only when '_' is present:
                    # find() returns -1 otherwise, and slicing with -1 would
                    # silently cut off the last character of the title.
                    sep = title.find('_')
                    if sep != -1:
                        title = title[:sep]
            if content:
                item = NewsItem(
                    domainname='http://news.163.com',
                    chinesename='网易新闻',
                    url=sel.root.base,
                    title=title,
                    language='中文',
                    encodingtype='utf-8',
                    corpustype='网络',
                    timeofpublish=sel.re(
                        r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')[0],
                    content=''.join(content),
                    source=sel.css('#ne_article_source::text').extract_first(),
                    author=sel.css('div.author_txt > span.name::text').extract_first()
                )
                item = judge_time_news(item)
                if item:
                    yield item
コード例 #10
0
    def parse_item(response):
        """Parse a chinanews.com article page into a NewsItem, yielded only
        when the publication-date filter accepts it.
        """
        sel = Selector(response)
        url = response.request.url
        if re.match(r'.*?chinanews.com.*?/\d{4}/\d{2}-\d{2}/.*?', url):
            content = response.xpath(
                '//*[@class="left_zw"]//p//text()').extract()

            publish_time = sel.re(r'\d{4}年\d{2}月\d{2}日 {0,1}\d{2}:\d{2}')[0]
            # Extract the source name from the byline, shaped roughly like
            # "...来源:XXX 参与互动...".
            # NOTE(review): only the ASCII colon is handled here; confirm the
            # page never uses the full-width ':' separator.
            source = ''.join(sel.xpath('//*[@class="left-t"]//text()').extract())
            if ':' in source:
                start = source.rfind(':') + 1
                end = source.find(" 参与")
                # Slice to the end when the " 参与" marker is absent;
                # find() returning -1 would otherwise drop the last char.
                source = source[start:end] if end != -1 else source[start:]
            else:
                source = None

            publish_time = publish_time.replace(' ', '')
            if content:
                item = NewsItem(
                    domainname='http://chinanews.com',
                    chinesename='中新网',
                    url=sel.root.base,
                    title=sel.xpath(
                        '//*[@id="cont_1_1_2"]/h1/text()').extract_first(),
                    subtitle=sel.css('.sub::text').extract_first(),
                    language='中文',
                    encodingtype='utf-8',
                    corpustype='网络',
                    timeofpublish=publish_time,
                    content=''.join(content),
                    source=source,
                    author=sel.xpath(
                        '//*[@id="author"]/text()').extract_first())
                item = judge_time_news(item)
                if item:
                    yield item
コード例 #11
0
 def parse_item(response):
     """Yield a NewsItem built from an xinhuanet.com article page, subject
     to the publication-date filter.
     """
     sel = Selector(response)
     page_url = response.request.url
     if not re.match(r'.*?xinhuanet.com.*?/\d{4}-\d{2}/\d{2}/.*?', page_url):
         return
     body_text = response.xpath('//*[@id="p-detail"]//p//text()').extract()
     if not body_text:
         return
     news = NewsItem(
         domainname='http://xinhuanet.com',
         chinesename='新华网',
         url=sel.root.base,
         title=sel.css('div > div.h-title::text').extract_first(),
         subtitle=sel.css('.sub::text').extract_first(),
         language='中文',
         encodingtype='utf-8',
         corpustype='网络',
         timeofpublish=sel.re(
             r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')[0],
         content=''.join(body_text),
         source=sel.css('#source::text').extract_first(),
         author=None)
     news = judge_time_news(news)
     if news:
         yield news