def parse_item(response):
    """Extract a NewsItem from a tibet.people.com.cn article page.

    Yields at most one NewsItem. Pages whose URL does not match the site
    pattern, pages with no extractable body text, and pages where no
    publish timestamp can be found are skipped.

    Fixes: leftover debug print() calls removed; sel.re(...)[0] is now
    guarded so a page without a recognisable timestamp is skipped instead
    of raising IndexError.
    """
    sel = Selector(response)
    url = response.request.url
    if not re.match(r'.*?tibet.people.com.cn/.*?', url):
        return
    content = response.xpath(
        '//html/body/div[2]/div[4]/div[1]/div[2]/div[2]/text()').extract()
    if not content:
        return
    # The timestamp carries Tibetan date particles; normalise them to the
    # Chinese 年/月/日 markers expected downstream.
    times = sel.re(r'\d{4}.*?\d{2}.*?\d{2}.*?\d{2}:\d{2}')
    if not times:
        return
    timeofpublish = (times[0]
                     .replace('ལོའི་ཟླ་ ', '年')
                     .replace('ཚེས་', '月')
                     .replace('ཉིན། ', '日'))
    item = NewsItem(
        domainname='http://tibet.people.com.cn',
        chinesename='people',
        url=sel.root.base,
        title=sel.css('.gq_content > h1:nth-child(2)::text').extract_first(),
        subtitle=sel.css('.sub::text').extract_first(),
        language='tibet',
        encodingtype='utf-8',
        corpustype='网络',
        timeofpublish=timeofpublish,
        content=''.join(content),
    )
    # Date-filter the item; judge_time_news_people may return None.
    item = judge_time_news_people(item)
    if item:
        yield item
def parse_item(response):
    """Extract a NewsItem from a people.com.cn article page.

    Non-article URLs and the BIG5 variant are ignored, as are pages
    with no body paragraphs. Yields at most one NewsItem.
    """
    sel = Selector(response)
    url = response.request.url
    is_article = re.match(r'.*?people.com.cn.*?/\d+/\d+/.*?', url) and 'BIG' not in url
    if not is_article:
        return
    paragraphs = response.xpath(
        '//*[@id="rwb_zw"]/p/text() | //*[@id="rwb_zw"]/p/strong/text()'
    ).extract()
    if not paragraphs:
        return
    news = NewsItem(
        domainname='http://people.com.cn',
        chinesename='人民网',
        url=sel.root.base,
        title=sel.css('div.text_title > h1::text').extract_first(),
        subtitle=sel.css('.sub::text').extract_first(),
        language='中文',
        encodingtype='utf-8',
        corpustype='网络',
        timeofpublish=sel.re(r'\d{4}年\d{2}月\d{2}日\d{2}:\d{2}')[0],
        content=''.join(paragraphs),
        source=sel.css('div.box01 > div.fl > a::text').extract_first(),
        author=sel.css('p.author::text').extract_first(),
    )
    # Date-filter; judge_time_news may return None.
    news = judge_time_news(news)
    if news:
        yield news
def parse_item(response):
    """Extract a NewsItem from a Sina news article page.

    Yields at most one NewsItem; non-matching URLs and empty bodies
    are skipped.
    """
    sel = Selector(response)
    url = response.request.url
    if not re.match(r'.*?sina.com.*?/\d{4}-\d{2}-\d{2}/.*?', url):
        return
    paragraphs = response.xpath('//*[@id="artibody"]//p//text()').extract()
    # Drop the editor credit line from the body text.
    editor = response.xpath('//*[@class="article-editor"]/text()').extract_first()
    if editor:
        paragraphs.remove(editor)
    # Strip any embedded spaces from the timestamp.
    publish_time = sel.re(r'\d{4}年\d{2}月\d{2}日.{0,1}\d{2}:\d{2}')[0].replace(' ', '')
    if not paragraphs:
        return
    news = NewsItem(
        domainname='http://sina.com.cn',
        chinesename='新浪网',
        url=sel.root.base,
        title=sel.css('#artibodyTitle::text, #main_title::text').extract_first(),
        subtitle=sel.css('.sub::text').extract_first(),
        language='中文',
        encodingtype='utf-8',
        corpustype='网络',
        timeofpublish=publish_time,
        content=''.join(paragraphs),
        source=sel.xpath(
            '//*[@data-sudaclick="media_name"]/text() | //*[@data-sudaclick="media_name"]/a/text()'
        ).extract_first(),
        author=None,
    )
    # Date-filter; judge_time_news may return None.
    news = judge_time_news(news)
    if news:
        yield news
def parse_item(response):
    """Extract a NewsItem from a Sohu article page.

    Yields at most one NewsItem; non-matching URLs and empty bodies
    are skipped.
    """
    sel = Selector(response)
    url = response.request.url
    if not re.match(r'.*?sohu.com.*?/\d{4}\d{2}\d{2}/.*?', url):
        return
    body = response.xpath('//*[@itemprop="articleBody"]//p//text()').extract()
    # Some articles keep paragraphs outside <p> tags; retry with a wider query.
    if len(body) < 3:
        body = response.xpath(
            '//*[@itemprop="articleBody"]//p//text() | //*[@id="contentText"]//div/text()'
        ).extract()
    publish_time = sel.re(r'\d{4}-\d{2}-\d{2} {0,1}\d{2}:\d{2}:\d{2}')[0]
    if not body:
        return
    news = NewsItem(
        domainname='http://sohu.com',
        chinesename='搜狐网',
        url=sel.root.base,
        title=sel.xpath('//*[@itemprop="headline"]/text()').extract_first(),
        subtitle=sel.css('.sub::text').extract_first(),
        language='中文',
        encodingtype='utf-8',
        corpustype='网络',
        timeofpublish=publish_time,
        content=''.join(body),
        source=sel.xpath('//*[@id="media_span"]/span/text()').extract_first(),
        author=sel.xpath('//*[@id="author_baidu"]/text()').extract_first(),
    )
    # Date-filter; judge_time_news may return None.
    news = judge_time_news(news)
    if news:
        yield news
def parse_item(response):
    """Extract a NewsItem from a tb.chinatibetnews.com article page.

    Yields at most one NewsItem.

    Fixes: leftover debug print() calls removed; content.remove(editor)
    is guarded so a credit line not present in the body list no longer
    raises ValueError; sel.re(...)[0] is guarded against pages with no
    recognisable date.
    """
    sel = Selector(response)
    url = response.request.url
    if not re.match(r'.*?tb.chinatibetnews.com/.*?', url):
        return
    content = response.xpath(
        '/html/body/div[4]/div[1]/div[2]/ul[1]/li[2]/div[2]/div[1]//p//text()'
    ).extract()
    # Strip the editor credit from the body if it is actually there.
    editor = response.xpath('//*[@class="-articleeditor"]/text()').extract_first()
    if editor and editor in content:
        content.remove(editor)
    title = response.xpath(
        '/html/body/div[4]/div[1]/div[2]/ul[1]/li[1]/p[2]//text()'
    ).extract()
    times = sel.re(r'\d{4}-\d{2}-\d{2}')
    if not times:
        return
    publish_time = times[0].replace(' ', '')
    if not content:
        return
    item = NewsItem(
        domainname='http://tibet.cpc.people.com.cn/',
        chinesename='tibet3',
        url=sel.root.base,
        title=''.join(title),
        subtitle=sel.css('.sub::text').extract_first(),
        language='藏文',
        encodingtype='utf-8',
        corpustype='网络',
        timeofpublish=publish_time,
        content=''.join(content),
        author=None,
    )
    # NOTE(review): the judge_time_news date filter is disabled here,
    # unlike the sibling parsers — confirm this is intentional.
    # item = judge_time_news(item)
    # if item:
    yield item
def parse_item(response):
    """Extract a NewsItem from a xizang.news.cn (Tibetan Xinhua) article.

    Yields at most one NewsItem.

    Fixes: leftover debug print() calls removed; content.remove(editor)
    is guarded so a credit line not present in the body list no longer
    raises ValueError; sel.re(...)[0] is guarded against pages with no
    recognisable date.
    """
    sel = Selector(response)
    url = response.request.url
    if not re.match(r'.*?/\d{4}-\d{2}/\d{2}/.*?', url):
        return
    content = response.xpath(
        '/html/body/div[6]/div/div/div[3]//p//text()').extract()
    # Strip the editor credit from the body if it is actually there.
    editor = response.xpath('//*[@class="-articleeditor"]/text()').extract_first()
    if editor and editor in content:
        content.remove(editor)
    times = sel.re(r'\d{4}-\d{2}-\d{2}')
    if not times:
        return
    publish_time = times[0].replace(' ', '')
    if not content:
        return
    item = NewsItem(
        domainname='http://xizang.news.cn/',
        chinesename='tibetxinhua',
        url=sel.root.base,
        title=sel.css('#ArticleTit::text').extract_first(),
        subtitle=sel.css('.sub::text').extract_first(),
        language='藏文',
        encodingtype='utf-8',
        corpustype='网络',
        timeofpublish=publish_time,
        content=''.join(content),
        source=sel.css('#Articlely > div.laiyuan > a::text').extract_first(),
        author=sel.css('#contentK > div.xinxi > span:nth-child(3)::text').extract_first(),
    )
    # Date-filter; judge_time_news may return None.
    item = judge_time_news(item)
    if item:
        yield item
def parse_item(response):
    """Extract a NewsItem from a uyghur.cntv.com article page.

    Yields at most one NewsItem.

    Fixes: leftover debug print() calls removed; the publish-date lookup
    is guarded so a missing header line (extract_first() -> None) or a
    failed re.search no longer raises TypeError/AttributeError.
    """
    sel = Selector(response)
    url = response.request.url
    if not re.match(r'.*?/\d{4}/\d{2}/\d{2}/.*?', url):
        return
    content = response.xpath(
        '//*[@id="page_body"]/div[2]/div/div[1]/div[1]/div/div[2]/p[2]/text() | //*[@id="page_body"]/div[2]/div/div[1]/div[1]/div/div[2]/p[2]/span/text()'
    ).extract()
    if not content:
        return
    # The publish date sits in the second <p> of the title header,
    # formatted DD-MM-YYYY.
    date_text = sel.css('.title_hd > p:nth-child(2)::text').extract_first()
    date_match = re.search(r'\d{2}-\d{2}-\d{4}', date_text) if date_text else None
    if date_match is None:
        return
    item = NewsItem(
        domainname='http://uyghur.cntv.com',
        chinesename='CCTV',
        url=sel.root.base,
        title=sel.css('.title_hd > h3:nth-child(1)::text').extract_first(),
        subtitle=sel.css('.sub::text').extract_first(),
        language='中文',
        encodingtype='utf-8',
        corpustype='网络',
        timeofpublish=date_match.group(0),
        content=''.join(content),
    )
    # NOTE(review): the judge_time_news date filter is disabled here,
    # unlike the sibling parsers — confirm this is intentional.
    # item = judge_time_news(item)
    # if item:
    yield item
def parse_item(response):
    """Extract a NewsItem from a uyghur.people.com.cn article page.

    Yields at most one NewsItem.

    Fix: leftover debug print() calls removed.
    """
    sel = Selector(response)
    url = response.request.url
    if not (re.match(r'.*?people.com.cn.*?/\d+/.*?', url) and 'BIG' not in url):
        return
    content = response.xpath(
        '//*[@id="p_content"]/span/text() | //*[@class="clearfix"]/p/text()'
    ).extract()
    if not content:
        return
    item = NewsItem(
        domainname='http://uyghur.people.com.cn/',
        chinesename='维语人民网',
        url=sel.root.base,
        title=sel.css('div.ej_right > h1::text').extract_first(),
        subtitle=sel.css('div.ej_right > h5::text').extract_first(),
        language='维文',
        encodingtype='utf-8',
        corpustype='网络',
        timeofpublish=sel.css('div.ej_right #p_publishtime::text').extract_first(),
        content=''.join(content),
        source=sel.css('div.ej_right #p_origin > a:nth-child(1)::text').extract_first(),
        # NOTE(review): author reuses the publish-time selector — this
        # looks like a copy/paste slip; confirm the intended selector.
        author=sel.css('div.ej_right #p_publishtime::text').extract_first(),
    )
    # NOTE(review): the judge_time_news date filter is disabled here,
    # unlike the sibling parsers — confirm this is intentional.
    # item = judge_time_news(item)
    # if item:
    yield item
def parse_item(response):
    """Extract a NewsItem from a news.163.com article page.

    Yields at most one NewsItem.

    Fixes: the <title> fallback used title[:title.find('_')], which chops
    the last character when no '_' is present (find() returns -1) — now
    uses partition(); content.remove(otitle) is guarded against
    ValueError; the unused news_urls local is dropped.
    """
    sel = Selector(response)
    url = response.request.url
    if not re.match(r'.*?news.163.com.*?/\d+/\d+/.*?', response.request.url):
        return
    content = response.xpath('//*[@id="endText"]//p//text()').extract()
    need_removes = response.xpath('//*[@id="endText"]//p//style/text()').extract()
    # Strip the "original title" line from the body if it is there.
    otitle = response.xpath('//p[@class="otitle"]/text()').extract_first()
    if otitle and otitle in content:
        content.remove(otitle)
    if need_removes:
        # Blank out garbled fragments that video pages embed in the body.
        for i, fragment in enumerate(content):
            if fragment.startswith('\n\t') or fragment.count('=') > 6:
                content[i] = ''
    title = sel.css('#epContentLeft > h1::text').extract_first()
    if not title:
        # Fall back to <title>; keep only the part before the first '_'
        # (the remainder is the site-name suffix).
        title = sel.css('head > title::text').extract_first()
        if title:
            title = title.partition('_')[0]
    if not content:
        return
    item = NewsItem(
        domainname='http://news.163.com',
        chinesename='网易新闻',
        url=sel.root.base,
        title=title,
        language='中文',
        encodingtype='utf-8',
        corpustype='网络',
        timeofpublish=sel.re(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')[0],
        content=''.join(content),
        source=sel.css('#ne_article_source::text').extract_first(),
        author=sel.css('div.author_txt > span.name::text').extract_first(),
    )
    # Date-filter; judge_time_news may return None.
    item = judge_time_news(item)
    if item:
        yield item
def parse_item(response):
    """Extract a NewsItem from a chinanews.com article page.

    Yields at most one NewsItem.

    Fix: when the " 参与" marker is absent, source.find() returned -1 and
    silently chopped the last character of the source name; the slice end
    is now clamped to the end of the string.
    """
    sel = Selector(response)
    url = response.request.url
    if not re.match(r'.*?chinanews.com.*?/\d{4}/\d{2}-\d{2}/.*?', url):
        return
    content = response.xpath('//*[@class="left_zw"]//p//text()').extract()
    publish_time = sel.re(r'\d{4}年\d{2}月\d{2}日 {0,1}\d{2}:\d{2}')[0].replace(' ', '')
    # The source name sits after the last '：' of the "left-t" header line,
    # before the " 参与" interaction counter (when present).
    source = ''.join(sel.xpath('//*[@class="left-t"]//text()').extract())
    if '：' in source:
        end = source.find(" 参与")
        source = source[source.rfind('：') + 1:end if end != -1 else len(source)]
    else:
        source = None
    if not content:
        return
    item = NewsItem(
        domainname='http://chinanews.com',
        chinesename='中新网',
        url=sel.root.base,
        title=sel.xpath('//*[@id="cont_1_1_2"]/h1/text()').extract_first(),
        subtitle=sel.css('.sub::text').extract_first(),
        language='中文',
        encodingtype='utf-8',
        corpustype='网络',
        timeofpublish=publish_time,
        content=''.join(content),
        source=source,
        author=sel.xpath('//*[@id="author"]/text()').extract_first(),
    )
    # Date-filter; judge_time_news may return None.
    item = judge_time_news(item)
    if item:
        yield item
def parse_item(response):
    """Extract a NewsItem from a xinhuanet.com article page.

    Yields at most one NewsItem; non-matching URLs and empty bodies
    are skipped.
    """
    sel = Selector(response)
    url = response.request.url
    if not re.match(r'.*?xinhuanet.com.*?/\d{4}-\d{2}/\d{2}/.*?', url):
        return
    paragraphs = response.xpath('//*[@id="p-detail"]//p//text()').extract()
    if not paragraphs:
        return
    news = NewsItem(
        domainname='http://xinhuanet.com',
        chinesename='新华网',
        url=sel.root.base,
        title=sel.css('div > div.h-title::text').extract_first(),
        subtitle=sel.css('.sub::text').extract_first(),
        language='中文',
        encodingtype='utf-8',
        corpustype='网络',
        timeofpublish=sel.re(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')[0],
        content=''.join(paragraphs),
        source=sel.css('#source::text').extract_first(),
        author=None,
    )
    # Date-filter; judge_time_news may return None.
    news = judge_time_news(news)
    if news:
        yield news