def parse_item(response):
    """Build a ``NewsItem`` from a people.com.cn article response.

    The response is processed only when its request URL matches
    ``people.com.cn`` followed by a ``/<digits>/`` path segment and does not
    contain ``BIG``, and only when the body XPath returns at least one text
    node. The item's title, publish time, source and author are printed
    before the item is yielded.

    Args:
        response: Page response exposing ``request.url`` and ``xpath``.

    Yields:
        NewsItem: At most one item per response.
    """
    sel = Selector(response)
    page_url = response.request.url

    # Guard clause: skip anything that is not an article URL.
    if not re.match(r'.*?people.com.cn.*?/\d+/.*?', page_url) or 'BIG' in page_url:
        return

    paragraphs = response.xpath(
        '//*[@id="p_content"]/span/text() | //*[@class="clearfix"]/p/text()'
    ).extract()
    if not paragraphs:
        return

    # Field values are collected in a dict first so each extraction sits on
    # its own line; the dict preserves the original evaluation order.
    fields = {
        'domainname': 'http://uyghur.people.com.cn/',
        'chinesename': '维语人民网',
        'url': sel.root.base,
        'title': sel.css('div.ej_right > h1::text').extract_first(),
        'subtitle': sel.css('div.ej_right > h5::text').extract_first(),
        'language': '维文',
        'encodingtype': 'utf-8',
        'corpustype': '网络',
        'timeofpublish': sel.css(
            'div.ej_right #p_publishtime::text').extract_first(),
        'content': ''.join(paragraphs),
        'source': sel.css(
            'div.ej_right #p_origin > a:nth-child(1)::text').extract_first(),
        # NOTE(review): author is read from the same node as timeofpublish;
        # this looks like a copy-paste slip - confirm the intended selector.
        'author': sel.css(
            'div.ej_right #p_publishtime::text').extract_first(),
    }
    item = NewsItem(**fields)

    # Debug output of the key extracted fields.
    for key in ("title", "timeofpublish", "source", "author"):
        print(item.get(key, None))

    yield item
 def parse_item(response):
     """Build a ``NewsItem`` from a tibet.people.com.cn article response.

     The response is processed only when its request URL matches
     ``tibet.people.com.cn/`` and the body XPath returns at least one text
     node. The publish time is the first timestamp-like match found in the
     page, with three Tibetan date markers replaced by ``年``/``月``/``日``.
     The item is passed through ``judge_time_news_people`` and yielded only
     when that call returns a truthy value.

     Pages without a recognisable timestamp are skipped; previously they
     raised ``IndexError`` from ``sel.re(...)[0]``.

     Args:
         response: Page response exposing ``request.url`` and ``xpath``.

     Yields:
         NewsItem: At most one item per response.
     """
     sel = Selector(response)
     print(sel)
     url = response.request.url
     if not re.match(r'.*?tibet.people.com.cn/.*?', url):
         return

     print('---------------------')
     print(url)
     # Alternative body selectors tried earlier:
     #   //*[@id="rwb_zw"]/p/text() | //*[@id="rwb_zw"]/p/strong/text()
     #   //*[@id="content"]/p[2]/span/span/text()
     content = response.xpath('//html/body/div[2]/div[4]/div[1]/div[2]/div[2]/text()').extract()
     print(content)
     if not content:
         return

     # sel.re() returns a (possibly empty) list of matches, so check it
     # before indexing instead of letting an IndexError escape.
     timestamps = sel.re(r'\d{4}.*?\d{2}.*?\d{2}.*?\d{2}:\d{2}')
     if not timestamps:
         return
     publish_time = timestamps[0].replace('ལོའི་ཟླ་ ', '年').replace('ཚེས་', '月').replace('ཉིན།  ', '日')

     item = NewsItem(
         domainname='http://tibet.people.com.cn',
         chinesename='people',
         url=sel.root.base,
         title=sel.css('.gq_content > h1:nth-child(2)::text').extract_first(),
         subtitle=sel.css('.sub::text').extract_first(),
         language='tibet',
         encodingtype='utf-8',
         corpustype='网络',
         timeofpublish=publish_time,
         content=''.join(content),
     )
     # Debug output; source and author are never set here, so item.get()
     # falls back to None for them.
     print(item.get("title", None))
     print(item.get("timeofpublish", None))
     print(item.get("source", None))
     print(item.get("author", None))
     item = judge_time_news_people(item)
     if item:
         yield item
    def parse_item(response):
        """Build a ``NewsItem`` from a tb.chinatibetnews.com article response.

        The response is processed only when its request URL matches
        ``tb.chinatibetnews.com/``. The body is the text under the article
        ``<p>`` elements, with the editor credit removed when it appears
        among those text nodes. The publish time is the first
        ``YYYY-MM-DD`` match found in the page.

        Nothing is yielded when the page has no date or no body text.
        Previously a missing date raised ``IndexError`` and an editor credit
        outside the body raised ``ValueError``.

        Args:
            response: Page response exposing ``request.url`` and ``xpath``.

        Yields:
            NewsItem: At most one item per response.
        """
        sel = Selector(response)
        url = response.request.url
        if not re.match(r'.*?tb.chinatibetnews.com/.*?', url):
            return

        print('---------------------')
        print(url)

        content = response.xpath(
            '/html/body/div[4]/div[1]/div[2]/ul[1]/li[2]/div[2]/div[1]//p//text()'
        ).extract()
        print(content)
        editor = response.xpath(
            '//*[@class="-articleeditor"]/text()').extract_first()
        title = response.xpath(
            '/html/body/div[4]/div[1]/div[2]/ul[1]/li[1]/p[2]//text()'
        ).extract()

        # Remove the editor credit. It comes from a different XPath than the
        # body, so it may not be one of the body text nodes; list.remove()
        # would raise ValueError in that case.
        if editor and editor in content:
            content.remove(editor)

        # sel.re() returns a list; guard against pages without any date.
        dates = sel.re(r'\d{4}-\d{2}-\d{2}')
        if not dates:
            return
        # The pattern cannot match whitespace, so no space-stripping is needed.
        publish_time = dates[0]
        print(publish_time)

        if not content:
            return

        # NOTE(review): domainname points at tibet.cpc.people.com.cn while the
        # URL filter is for tb.chinatibetnews.com - confirm this is intended.
        item = NewsItem(domainname='http://tibet.cpc.people.com.cn/',
                        chinesename='tibet3',
                        url=sel.root.base,
                        title=''.join(title),
                        subtitle=sel.css('.sub::text').extract_first(),
                        language='藏文',
                        encodingtype='utf-8',
                        corpustype='网络',
                        timeofpublish=publish_time,
                        content=''.join(content),
                        author=None)
        # Debug output of the key extracted fields.
        print(item.get("title", None))
        print(item.get("timeofpublish", None))
        print(item.get("source", None))
        print(item.get("author", None))
        yield item
    def parse_item(response):
        """Build a ``NewsItem`` from a dated article response (xizang.news.cn).

        The response is processed only when its request URL contains a
        ``/YYYY-MM/DD/`` path segment. The body is the text under the article
        ``<p>`` elements, with the editor credit removed when it appears
        among those text nodes. The publish time is the first
        ``YYYY-MM-DD`` match found in the page. The item is passed through
        ``judge_time_news`` and yielded only when that call returns a truthy
        value.

        Nothing is yielded when the page has no date or no body text.
        Previously a missing date raised ``IndexError`` and an editor credit
        outside the body raised ``ValueError``.

        Args:
            response: Page response exposing ``request.url`` and ``xpath``.

        Yields:
            NewsItem: At most one item per response.
        """
        sel = Selector(response)
        url = response.request.url
        if not re.match(r'.*?/\d{4}-\d{2}/\d{2}/.*?', url):
            return

        print('---------------------')
        print(url)
        content = response.xpath(
            '/html/body/div[6]/div/div/div[3]//p//text()').extract()
        print(content)
        editor = response.xpath(
            '//*[@class="-articleeditor"]/text()').extract_first()

        # Remove the editor credit. It comes from a different XPath than the
        # body, so it may not be one of the body text nodes; list.remove()
        # would raise ValueError in that case.
        if editor and editor in content:
            content.remove(editor)

        # sel.re() returns a list; guard against pages without any date.
        dates = sel.re(r'\d{4}-\d{2}-\d{2}')
        if not dates:
            return
        # The pattern cannot match whitespace, so no space-stripping is needed.
        publish_time = dates[0]
        print(publish_time)

        if not content:
            return

        item = NewsItem(
            domainname='http://xizang.news.cn/',
            chinesename='tibetxinhua',
            url=sel.root.base,
            title=sel.css('#ArticleTit::text').extract_first(),
            subtitle=sel.css('.sub::text').extract_first(),
            language='藏文',
            encodingtype='utf-8',
            corpustype='网络',
            timeofpublish=publish_time,
            content=''.join(content),
            source=sel.css(
                '#Articlely > div.laiyuan > a::text').extract_first(),
            author=sel.css(
                '#contentK > div.xinxi > span:nth-child(3)::text'
            ).extract_first())
        # Debug output of the key extracted fields.
        print(item.get("title", None))
        print(item.get("timeofpublish", None))
        print(item.get("source", None))
        print(item.get("author", None))
        item = judge_time_news(item)
        if item:
            yield item
# Example #5
# 0
 def parse_item(response):
     """Build a ``NewsItem`` from a dated article response (uyghur.cntv.com).

     The response is processed only when its request URL contains a
     ``/YYYY/MM/DD/`` path segment and the body XPath returns at least one
     text node. The publish time is the first ``DD-DD-DDDD`` digit group
     found in the second ``<p>`` of the ``.title_hd`` header.

     Pages where that header node is missing or carries no date are skipped.
     Previously they raised ``TypeError`` (``re.search`` on ``None``) or
     ``AttributeError`` (``.group`` on ``None``).

     Args:
         response: Page response exposing ``request.url`` and ``xpath``.

     Yields:
         NewsItem: At most one item per response.
     """
     sel = Selector(response)
     print(sel)
     url = response.request.url
     if not re.match(r'.*?/\d{4}/\d{2}/\d{2}/.*?', url):
         return

     print('---------------------')
     print(url)
     # Alternative body selectors tried earlier:
     #   //*[@id="rwb_zw"]/p/text() | //*[@id="rwb_zw"]/p/strong/text()
     #   //*[@id="content"]/p[2]/span/span/text()
     content = response.xpath(
         '//*[@id="page_body"]/div[2]/div/div[1]/div[1]/div/div[2]/p[2]/text() | //*[@id="page_body"]/div[2]/div/div[1]/div[1]/div/div[2]/p[2]/span/text()'
     ).extract()
     print(content)
     if not content:
         return

     # extract_first() returns None when the node is absent and re.search()
     # returns None when the pattern is absent; handle both before .group().
     date_text = sel.css('.title_hd > p:nth-child(2)::text').extract_first()
     date_match = re.search(r'\d{2}-\d{2}-\d{4}', date_text or '')
     if date_match is None:
         return

     # NOTE(review): language is '中文' although the domain is
     # uyghur.cntv.com - confirm this is intended.
     item = NewsItem(
         domainname='http://uyghur.cntv.com',
         chinesename='CCTV',
         url=sel.root.base,
         title=sel.css(
             '.title_hd > h3:nth-child(1)::text').extract_first(),
         subtitle=sel.css('.sub::text').extract_first(),
         language='中文',
         encodingtype='utf-8',
         corpustype='网络',
         timeofpublish=date_match.group(0),
         content=''.join(content),
     )
     # Debug output; source and author are never set here, so item.get()
     # falls back to None for them.
     print(item.get("title", None))
     print(item.get("timeofpublish", None))
     print(item.get("source", None))
     print(item.get("author", None))
     yield item