def parse(self, response): global cnt hxs = HtmlXPathSelector(response) sites = hxs.x('//table/tbody') items = [] # 标记是哪个用户 user = hxs.x('//head/title/text()').extract()[0][:-7].encode('utf-8') # 标记用户总共有多少条收听记录 sum = hxs.x('//span').extract()[-3].encode('utf-8').split('共')[1].split('条')[0] currentPage = hxs.x('//span').extract()[-3].encode('utf-8').split('第')[1].split('页')[0] #for site in sites: if int(currentPage) <= int(sum) / 50: for i in range(1, 50): item = XiamiItem() item['user'] = user item['song'] = sites.x('tr[' + str(i) + ']/td[2]/a').extract()[0].split('\"')[3].encode('utf-8') print '_______________' + item['song'] item['artist'] = sites.x('tr[' + str(i) + ']/td[2]/a/text()').extract()[1].encode('utf-8') print '+++++++++++++++' + item['artist'] items.append(item) #yield item return items #yield items """
def parse(self, response):
    """Collect follow-up requests: one per post on this page, plus the
    "next page" pagination link when present.

    :param response: listing-page HTTP response.
    :returns: list of Request objects.
    """
    selector = HtmlXPathSelector(response)
    # Each <h1><a> href is a post; hand those to parse_post.
    requests = [
        self.make_requests_from_url(href).replace(callback=self.parse_post)
        for href in selector.x('//h1/a/@href').extract()
    ]
    # Among the untitled pagination anchors, the one labelled with a
    # right guillemet (u'\xbb') points at the next page of the listing.
    for anchor in selector.x('//div[@class="wp-pagenavi"]/a[not(@title)]'):
        if anchor.x('text()').extract()[0] == u'\xbb':
            next_href = anchor.x('@href').extract()[0]
            requests.append(self.make_requests_from_url(next_href))
    return requests
def parse(self, response): self.log("OK,%s"%response.url) hxs = HtmlXPathSelector(response) #将文章的链接继续进行处理 divs = hxs.x('//div[@class="publicLeftCon mt10"]') for div in divs: url = div.x('h5/a/@href').extract()[0] yield self.make_requests_from_url(url).replace(callback=self.parse_content) #将下一页的链接继续进行处理 try: next_url = hxs.x('//div[@id="project_left"]/div[@class="publicMiddleLine"]/span/a[b="下一页"]/@href').extract()[0] except Exception: return next_url = 'http://article.yeeyan.org'+next_url # if self.count==10: # return # self.count+=1 yield self.make_requests_from_url(next_url).replace(callback=self.parse)
def parse(self, response): self.log("OK,%s" % response.url) hxs = HtmlXPathSelector(response) # 将文章的链接继续进行处理 divs = hxs.x('//div[@class="publicLeftCon mt10"]') for div in divs: url = div.x('h5/a/@href').extract()[0] yield self.make_requests_from_url(url).replace( callback=self.parse_content) # 将下一页的链接继续进行处理 try: next_url = \ hxs.x('//div[@id="project_left"]/div[@class="publicMiddleLine"]/span/a[b="下一页"]/@href').extract()[0] except Exception: return next_url = 'http://article.yeeyan.org' + next_url # if self.count==10: # return # self.count+=1 yield self.make_requests_from_url(next_url).replace( callback=self.parse)
def parse_jx(self, item, response):
    """Populate *item* from a "jx"-layout article page and return it.

    :param item: item object to fill in (mutated in place).
    :param response: article-page HTTP response.
    :returns: the same item, populated.
    """
    sel = HtmlXPathSelector(response)
    item['url'] = response.url
    raw_title = sel.x('//title/text()').extract()[0]
    item['title'] = raw_title.split('|')[1].strip()
    author_block = sel.x('//div[@class="jxar_author"]')
    item['author'] = author_block.x('.//a/text()').extract()[0]
    item['release_time'] = sel.x('//p[@class="jxa_info"]/span[1]/text()').extract()[0]
    # The intro paragraph is optional; store None when it is missing.
    try:
        item['excerpt'] = sel.x('//p[@class="jxa_intro"]/text()').extract()[0]
    except Exception:
        item['excerpt'] = None
    breadcrumb = sel.x('//p[@class="jxa_map"]/text()').extract()[1]
    item['category'] = breadcrumb.split()[1]
    item['content_html'] = sel.x('//div[@class="jxa_content"]').extract()[0]
    return item
def parse_content(self, response):
    """Build a YeeyanItem from an article page.

    Pages carrying the jx logo use a different layout and are delegated
    to parse_jx; all other pages are extracted inline here.

    :param response: article-page HTTP response.
    :returns: a populated YeeyanItem.
    """
    sel = HtmlXPathSelector(response)
    item = YeeyanItem()
    if sel.x('//a[@class="jx_logo"]/text()'):
        return self.parse_jx(item, response)
    item['url'] = response.url
    item['title'] = sel.x('//title/text()').extract()[0].split('|')[1].strip()
    info = sel.x('//div[@class="user_info"]')
    item['author'] = info.x('.//h2/a/text()').extract()[0]
    # The excerpt paragraph may be absent; default to an empty string.
    excerpts = sel.x('//p[@class="excerpt"]/text()').extract()
    item['excerpt'] = excerpts[0] if excerpts else ''
    item['content_html'] = sel.x('//div[@id="conBox"]').extract()[0]
    # Strip one leading char and a 7-char trailing suffix around the date.
    item['release_time'] = info.x('.//p/text()').extract()[0].strip()[1:-7]
    item['category'] = sel.x('//div[@class="crumb"]/a/text()').extract()[1]
    return item
def parse_jx(self, item, response):
    """Fill *item* from a "jx"-layout article page and return it.

    :param item: item object to populate (mutated in place).
    :param response: article-page HTTP response.
    :returns: the same item, populated.
    """
    hxs = HtmlXPathSelector(response)
    item['url'] = response.url
    # Title text is pipe-delimited; the article title is the middle field.
    item['title'] = hxs.x('//title/text()').extract()[0].split(
        '|')[1].strip()
    div = hxs.x('//div[@class="jxar_author"]')
    item['author'] = div.x('.//a/text()').extract()[0]
    item['release_time'] = hxs.x(
        '//p[@class="jxa_info"]/span[1]/text()').extract()[0]
    # The intro paragraph is optional; store None when it is missing.
    try:
        item['excerpt'] = hxs.x(
            '//p[@class="jxa_intro"]/text()').extract()[0]
    except Exception:
        item['excerpt'] = None
    # Category is the second whitespace-separated token of the second
    # breadcrumb text node.
    item['category'] = hxs.x(
        '//p[@class="jxa_map"]/text()').extract()[1].split()[1]
    item['content_html'] = hxs.x(
        '//div[@class="jxa_content"]').extract()[0]
    return item
def parse_content(self, response):
    """Build a YeeyanItem from an article page.

    Pages showing the jx logo use a different layout and are delegated
    to parse_jx; other pages are extracted inline.

    :param response: article-page HTTP response.
    :returns: a populated YeeyanItem.
    """
    hxs = HtmlXPathSelector(response)
    item = YeeyanItem()
    if hxs.x('//a[@class="jx_logo"]/text()'):
        item = self.parse_jx(item, response)
    else:
        item['url'] = response.url
        # Title text is pipe-delimited; the article title is the middle field.
        item['title'] = hxs.x('//title/text()').extract()[0].split(
            '|')[1].strip()
        div = hxs.x('//div[@class="user_info"]')
        item['author'] = div.x('.//h2/a/text()').extract()[0]
        # The excerpt paragraph may be absent; default to an empty string.
        item['excerpt'] = hxs.x('//p[@class="excerpt"]/text()').extract()
        if item['excerpt']:
            item['excerpt'] = item['excerpt'][0]
        else:
            item['excerpt'] = ''
        item['content_html'] = hxs.x('//div[@id="conBox"]').extract()[0]
        # Strip one leading char and a 7-char trailing suffix around the date.
        item['release_time'] = div.x(
            './/p/text()').extract()[0].strip()[1:-7]
        item['category'] = hxs.x(
            '//div[@class="crumb"]/a/text()').extract()[1]
    return item