def parse_roll_item(self, response): print '=====parse_roll_item:=====response:', response sel_a = response.xpath( '//div[@class="main"]/div[@class="mainCon"]/div[@class="list c"]') sel_b = sel_a.xpath('.//ul') sel_b = sel_a.xpath('.//li') id = 0 for site in sel_b: id += 1 title = site.xpath('a/text()').extract() link = site.xpath('a/@href').extract() time_release = site.xpath('a/@href ').extract() type_news = site.xpath('span[@class="t-tit"]/text()').extract() #response_news=site.xpath('p[@class]/text() ').extract() item = QqproItem(title=title, link=link, time_release=time_release, type_news=type_news) print 'id : ', id print 'response : ', response print 'response_news:', item['response_news'][0].encode('utf-8') if not len(item['title']) == 0: print 'title:', item['title'][0].encode('utf-8') else: print 'title:', 'null' yield QqproItem(title=title, link=link, time_release=time_release, type_news=type_news)
def parse_roll_item(self, response): print '=====parse_roll_item:=====response:', response #sel=Selector(response) #sel_a=response.xpath('//div[@class="main"]/div[@class="mainBody"]/div[@class="mainCon"]/div[@class="list c"]') sel_a = response.xpath('//*[@id=\'artContainer\']') #sel_a=response.xpath('/html/body/div[@id="iBody"]/div[@class="wrap c"]/div[@class="main"]/div[@class="mainBody"]/div[@class="mainCon"]/div[@id="artContainer"]/ul/li') #sel_m=sel_a.xpath('.//div[@id="artContainer"]/ul/li') #sel_a=sel sel_b = response.xpath('//ul') sel_c = sel_b.xpath('//li') print 'sel_a : ', sel_a print 'sel_b : ', sel_b print 'sel_c : ', sel_c #print 'sel_m : ',sel_m id = 0 for site in sel_c: id += 1 title = site.xpath('a/text()').extract() link = site.xpath('a/@href').extract() time_release = site.xpath('span[2]/text() ').extract() response_news = site.xpath('a/text() ').extract() #type_news=site.xpath('span[@class="t-tit"]/text()' ).extract() type_news = site.xpath('span[2]/text()').extract() item = QqproItem(title=title, link=link, time_release=time_release, type_news=type_news, response_news=response_news) print 'id : ', id print 'response : ', response if not len(item['response_news']) == 0: print 'response_news:', item['response_news'][0].encode( 'utf-8') else: print 'response_news', 'null' if not len(item['title']) == 0: print 'title:', item['title'][0].encode('utf-8') else: print 'title:', 'null' yield QqproItem(title=title, link=link, time_release=time_release, type_news=type_news, response_news=response_news)
def parse_base_item(self, response): ### (1)新闻中心_要闻 print '=====parse_base_item:=====response:', response base_url = get_base_url(response) sel_a = response.xpath('//div[@id="news"]') sel_b = sel_a.xpath('.//div[@class="Q-tpWrap"]') id = 0 ##爬取主页新闻标题列表 for site in response.xpath('//em[@class]'): id += 1 title = site.xpath('span/span/a/text() ').extract() link = site.xpath('span/span/a/@href').extract() response_news = site.xpath('../p[@class]/text() ').extract() ##show contents item=QqproItem(title=title,link=link,\ response_news=response_news,\ manufacturer='qq_center_yaowen') ##正文抓取 print 'main body loop:' print 'response = ', response link = site.xpath('span/span/a/@href').extract() url_n = ''.join(link) url_new = urljoin(base_url, url_n) yield scrapy.Request(url_new, callback=self.parse_body_center_yaowen, meta={'item': item})
def parse(self, response): print '=====parse_pro_item:=====response:', response #a: ######## /ul/li/div/div ######## #b: ######## /ul/li ######## base_url = get_base_url(response) sel_b = response.xpath('//ul') sel_c = sel_b.xpath('.//li') sel_d = sel_c.xpath('.//div') sel_e = sel_d.xpath('.//div') id = 0 for site in sel_e: id += 1 #a: #############################楚################粤######################## title = site.xpath('h3/a/text() |a/text()').extract() print 'title: ', title if len(title) != 0: link = site.xpath('h3/a/@href |a/@href').extract() #time_release=site.xpath(' ./../div[@class="pubTime"]/text()').extract() response_news = site.xpath( 'p/text() | ./../p/text() ').extract() #type_news=[''] ##item: hash title_a = ''.join(title[0]).encode('utf-8') ###严格的格式 sha1obj = hashlib.sha1() sha1obj.update(title_a) hash = sha1obj.hexdigest() hash = [hash] ##item各项采用列表类型 print 'spider: hash: ', hash #b: #############################楚############################粤#######################湘、豫######## #title=site.xpath('div/div[2]/h3/a/text() |div[2]/div[1]/a/text() |a/text()').extract() #link=site.xpath('div/div[2]/h3/a/@href |div[2]/div[1]/a/@href |a/@href').extract() #time_release=site.xpath('div/div[2]/h3/a/@href |div[2]/div[1]/a/@href |a/@href').extract() #response_news=site.xpath('div/p/text() |div[2]/p/text() |../p/text()').extract() #type_news=site.xpath('div/div[2]/h3/a/text() |div[2]/div[1]/a/text() |a/text()').extract() url_m = ''.join(link) url_new = urljoin(base_url, url_m) item=QqproItem(title=title,\ link=url_new,\ response_news=response_news,\ manufacturer='province') #yield item yield scrapy.Request(url_new, callback=self.parse_body, meta={'item': item}) else: print 'parse: 标题为空。不操作.\n\n'
def parse_china_item(self, response): print '=====parse_china_item:=====response:', response sel_a = response.xpath('.//div[@id="news"]') sel_b = sel_a.xpath('.//div[@class="Q-tpWrap"]') id = 0 for site in sel_b: id += 1 title = site.xpath('em[@class]/a/text() ').extract() link = site.xpath('em[@class]/a/@href').extract() #link="http://news.qq.com/"+str(link) #print "=============link: ",link time_release = site.xpath('em[@class]/a/@href ').extract() response_news = site.xpath('p[@class]/text() ').extract() type_news = site.xpath('em[@class]/a/text() ').extract() ##item: hash title_a = ''.join(title) sha1obj = hashlib.sha1() sha1obj.update(title_a) hash = sha1obj.hexdigest() print 'spider: hash: ', hash item = QqproItem(title=title, link=link, time_release=time_release, response_news=response_news, type_news=type_news, hash=hash, manufacturer='qq_center_china') print 'id : ', id print 'response : ', response if not len(item['response_news']) == 0: print 'response_news:', item['response_news'][0].encode( 'utf-8') else: print 'response_news:', 'response_news' if not len(item['title']) == 0: print 'title:', item['title'][0].encode('utf-8') else: print 'title:', 'null' yield item
def parse(self, response):
    """Headline ("s_yaowen") list page: follow each anchor to parse_body.

    Fixes vs original:
      - the target URL was recovered with the fragile `str(link)[3:-2]`
        slice of the list's repr, which corrupts any href containing
        non-ASCII characters (repr escapes them) and breaks when the
        extraction is empty; it now joins the extracted list directly,
        as the province parser does.
      - unused locals (`id` counter shadowing a builtin, dead
        `time_release = ['']`) and commented-out hash code removed.
    """
    base_url = get_base_url(response)
    for site in response.xpath('//em[@class]'):
        title = site.xpath('span/span/a/text() ').extract()
        if len(title) != 0:
            link = site.xpath('span/span/a/@href').extract()
            # Summary <p> is a sibling of the <em>'s parent.
            response_news = site.xpath('./../p[@class]/text() ').extract()
            url_new = urljoin(base_url, ''.join(link))
            item = QqproItem(title=title,
                             link=url_new,
                             response_news=response_news,
                             manufacturer='s_yaowen')
            yield scrapy.Request(url_new, callback=self.parse_body,
                                 meta={'item': item})
def parse(self,response): print '=====parse_surface_item:=====response:',response sel_a=response.xpath('//div[contains(@id,"newsContent")]') sel_b=sel_a.xpath('.//ul') sel_c=sel_b.xpath('.//li') id=0 base_url = get_base_url(response) for site in sel_c: id+=1 ##item: ... title=site.xpath('a/text()' ).extract() if len(title)!=0: link=site.xpath('a/@href' ).extract() response_news=[''] item=QqproItem(title=title,link=link,response_news=response_news,\ manufacturer='surface') url_m= (str(link))[3:-2] #url_m= ''.join(link) url_new=urljoin(base_url,url_m ) yield scrapy.Request(url_new, callback=self.parse_body,meta={'item':item})