def parse_item(self, response):
    """Scrape one book detail page into local fields.

    NOTE(review): this block appears truncated — nothing is yielded and
    several locals (name, star, ratingPeople, bookImg) are unused in the
    visible code; confirm against the full file.
    """
    item = DoubanbookItem()
    selector = Selector(response)
    # Book title from the page header.
    name = selector.xpath(
        '//div[@id="wrapper"]/h1/span/text()').extract()[0]
    # Average rating (note the trailing space inside the class attribute).
    star = selector.xpath(
        '//strong[@class="ll rating_num "]/text()').extract()[0].strip()
    # Number of raters.
    ratingPeople = selector.xpath(
        '//a[@class="rating_people"]/span/text()').extract()[0]
    # Cover image link.
    bookImg = selector.xpath('//a[@class="nbg"]/@href').extract()[0]
    # Raw HTML of the info box as a UTF-8 byte string (Python 2 style:
    # regexes below run over bytes, each hit is decoded back to unicode).
    book_info = selector.xpath('//div[@id="info"]').extract()[0].encode(
        'utf8')
    # The first bare <a class=""> inside the info box is the author link.
    author = re.findall(r'<a class="" href=".*?">(.*?)</a>',
                        book_info)[0].decode('utf8')
    # "出版社:" label marks the publisher.
    press = re.search(r'出版社:</span>(.*?)<br',
                      book_info).group(1).strip().decode('utf8')
    # "原作名:" (original title) is optional — default to empty on failure.
    try:
        original = re.search(r'原作名:</span>(.*?)<br',
                             book_info).group(1).strip().decode('utf8')
    except Exception, e:  # Python 2 syntax; e is unused
        original = ''
def parse_item(self, response):
    """Parse a doulist page: build one item per entry that has a detail
    link, then follow that link to parse_detail with the item in meta."""
    sel = Selector(response)
    divs = sel.xpath(
        '//div[@class="doulist-item"]/div[@class="mod"]/div[1]')
    for div in divs:
        link = div.xpath('./div[4]/a/@href')
        # Only entries with a detail link are emitted.
        if len(link) > 0:
            item = DoubanbookItem()
            # NOTE(review): link[0] is a Selector, not a string — it is
            # stored in item['link'] and passed as the Request url below.
            # Verify this works with the scrapy version in use;
            # link.extract_first() may be what was intended.
            t_link = link[0]
            bookname = div.xpath('./div[4]/a/text()').extract()
            author = div.xpath('./div[6]/text()[1]').extract()
            score = div.xpath('./div[5]/span[2]/text()').extract()
            scoreCount = div.xpath('./div[5]/span[3]/text()').extract()
            publishCompany = div.xpath('./div[6]/text()[2]').extract()
            publishTime = div.xpath('./div[6]/text()[3]').extract()
            # Encode every extracted string to UTF-8 bytes (Python 2 style).
            item['bookName'] = [b.encode("utf-8") for b in bookname]
            item['author'] = [a.encode("utf-8") for a in author]
            item['score'] = [s.encode("utf-8") for s in score]
            item['scoreCount'] = [sc.encode("utf-8") for sc in scoreCount]
            item['publishCompany'] = [
                p.encode('utf-8') for p in publishCompany
            ]
            item['publishTime'] = [
                pt.encode('utf-8') for pt in publishTime
            ]
            item['link'] = t_link
            yield scrapy.Request(t_link, callback=self.parse_detail,
                                 meta={"item": item})
def parse(self, response):
    """Parse a doulist page into DoubanbookItems and follow pagination.

    Yields one item per book entry, then a Request for the next page
    when a "next" link exists.
    """
    for book_selector in response.xpath(
            '//div[@class="bd doulist-subject"]'):
        # Fresh item per book. The original created one instance before
        # the loop and mutated it, so every yielded item was the same
        # shared object.
        book = DoubanbookItem()
        book['book_src'] = book_selector.xpath(
            'div[@class="post"]/a/img/@src').extract_first()
        book['book_title'] = book_selector.xpath(
            'div[@class="title"]/a/text()').extract_first().strip(' \n')
        book['book_href'] = book_selector.xpath(
            'div[@class="title"]/a/@href').extract_first()
        book['book_rating_nums'] = book_selector.xpath(
            'div[@class="rating"]/span[2]/text()').extract_first()
        book['book_rating_counting'] = book_selector.xpath(
            'div[@class="rating"]/span[3]/text()').extract_first().strip(
                '()')
        # string(.) flattens the abstract div to plain text for matching.
        book_abstract = book_selector.xpath(
            'div[@class="abstract"]').xpath('string(.)').extract_first()
        book['book_author'] = self.__match("作者:(.*?)\n", book_abstract)
        book['book_publisher'] = self.__match("出版社:(.*?)\n", book_abstract)
        book['book_publish_date'] = self.__match("出版年:(.*?)\n",
                                                 book_abstract)
        yield book
    next_page = response.xpath('//div[@class="paginator"]').xpath(
        'span[@class="next"]/a/@href').extract_first()
    if next_page is not None:
        yield scrapy.Request(url=next_page, callback=self.parse)
def parse(self, response):
    """Parse a doulist page: yield one item per book, then follow the
    next-page link if present."""
    selector = scrapy.Selector(response)
    books = selector.xpath('//div[@class="bd doulist-subject"]')
    for x in books:
        # Fresh item per book — the original reused a single instance,
        # handing the pipeline the same mutated object every iteration.
        item = DoubanbookItem()
        title = x.xpath('div[@class="title"]/a/text()').extract()[0]
        rate = x.xpath(
            'div[@class="rating"]/span[@class="rating_nums"]/text()'
        ).extract()[0]
        # The author line has no dedicated tag, so regex the raw HTML.
        author = re.search('<div class="abstract">(.*?)<br', x.extract(),
                           re.S).group(1)
        title = title.replace(' ', '').replace('\n', '')
        author = author.replace(' ', '').replace('\n', '')
        item["title"] = title
        item["rate"] = rate
        item["author"] = author
        yield item
    nextpage = selector.xpath(
        '//span[@class="next"]/link/@href').extract()
    if nextpage:
        # Renamed from `next` to avoid shadowing the builtin.
        next_url = nextpage[0]
        print(next_url)
        yield scrapy.http.Request(next_url, callback=self.parse)
def parse(self, response):
    """Parse a doulist page: yield one numbered item per book, then
    follow the next-page link.

    self.count is a running item counter maintained across pages.
    """
    books = response.xpath('//div[@class="bd doulist-subject"]')
    for each_book in books:
        # Fresh item per book — the original shared one mutated instance
        # across all yields.
        item = DoubanbookItem()
        title = each_book.xpath(
            'div[@class="title"]/a/text()').extract_first("null").replace(
                "\n", "").replace(" ", "")
        author = each_book.xpath(
            'div[@class="abstract"]/text()').extract_first("null").replace(
                "\n", "").replace(" ", "").replace('\"', "")
        rate = each_book.xpath(
            'div[@class="rating"]/span[@class="rating_nums"]/text()'
        ).extract_first("null")
        # Strip the "作者:"-style prefix. Guard against a non-matching
        # abstract (e.g. the "null" fallback) instead of crashing on
        # None.group().
        m = re.match(r'.+:(.*)', author)
        if m:
            author = m.group(1).replace('\"', "")
        item['num'] = self.count
        item['title'] = title
        item['author'] = author
        item['rate'] = rate
        self.count += 1
        yield item
    next_page = response.xpath(
        '//span[@class="next"]/a/@href').extract_first("")
    if next_page:
        print(next_page)
        yield scrapy.http.Request(next_page, callback=self.parse)
def parse(self, response): item = DoubanbookItem() selector = scrapy.Selector(response) books = selector.xpath('//div[@class="bd doulist-subject"]') for each in books: # print each.extract() t = each.xpath('div[@class="title"]/a/text()').extract() title = t[0].replace(' ', '').replace('\n', '') r = each.xpath( 'div[@class="rating"]/span[@class="rating_nums"]/text()' ).extract() rate = r[0] if len(r) > 0 else "" author = re.search('<div class="abstract">(.*?)<br', each.extract(), re.S).group(1) author = author.replace(' ', '').replace('\n', '') print 'Title:' + title print 'Rate:' + rate print author print '' item['title'] = title item['rate'] = rate item['author'] = author yield item nextPage = selector.xpath( '//span[@class="next"]/link/@href').extract() if nextPage: next = nextPage[0] # print next yield scrapy.http.Request(next, callback=self.parse)
def parse(self, response):
    """Parse a doulist page: yield one item per book, then follow the
    next-page link."""
    selector = scrapy.Selector(response)
    books = selector.xpath('//div[@class="bd doulist-subject"]')
    for each in books:
        # Fresh item per book — the original reused one instance, so
        # every yielded item was the same mutated object.
        item = DoubanbookItem()
        # .extract() materialises the selector results as strings.
        title = each.xpath('div[@class="title"]/a/text()').extract()[0]
        rate = each.xpath(
            'div[@class="rating"]/span[@class="rating_nums"]/text()'
        ).extract()[0]
        # The author line has no dedicated tag; regex the raw HTML.
        author = re.search('<div class="abstract">(.*?)<br',
                           each.extract(), re.S).group(1)
        title = title.replace(' ', '').replace('\n', '')
        author = author.replace(' ', '').replace('\n', '')
        item['title'] = title
        item['rate'] = rate
        item['author'] = author
        yield item
    nextPage = selector.xpath(
        '//span[@class="next"]/link/@href').extract()
    if nextPage:
        # Renamed from `next` to avoid shadowing the builtin.
        next_url = nextPage[0]
        print(next_url)
        yield scrapy.http.Request(next_url, callback=self.parse)
def parse(self, response): #print response.body item = DoubanbookItem() selector = scrapy.Selector(response) albums = selector.xpath('//div[@class="bd doulist-subject"]') for each in albums: title = each.xpath('div[@class="title"]/a/text()').extract()[0] title = title.replace(' ', '').replace('\n', '') rate = each.xpath( 'div[@class="rating"]/span[@class="rating_nums"]/text()' ).extract()[0] artist = re.search('<div class="abstract">(.*?)<br', each.extract(), re.S).group(1) artist = artist.replace(' ', '').replace('\n', '') item['title'] = title item['rate'] = rate item['artist'] = artist #print 'title' + title #print 'rate' + rate #print artist print '' yield item nextPage = selector.xpath( '//span[@class="next"]/link/@href').extract() if nextPage: next = nextPage[0] print next yield scrapy.http.Request(next, callback=self.parse)
def parse_item(self, response):
    """Extract title, trailing id, body text and URL from a detail page.

    Yields a single DoubanbookItem.
    """
    item = DoubanbookItem()
    # Title
    item['title'] = response.xpath(
        '//div[contains(@class, "pagecenter p3")]//strong/text()').extract(
        )[0]
    # Trailing id: last space-separated token after its ":" separator.
    item['author'] = item['title'].split(' ')[-1].split(":")[-1]
    # Body text: try the with-image layout first, then fall back to the
    # plain-text layout. (The original duplicated the identical join in
    # both branches of an if/else.)
    content = response.xpath('//div[@class="contentext"]/text()').extract()
    if not content:
        content = response.xpath(
            '//div[@class="c1 text14_2"]/text()').extract()
    item['info'] = "".join(content).strip()
    # Source link
    item['url'] = response.url
    yield item
def parse(self, response):
    """Yield one DoubanbookItem per "info" block, then follow pagination."""
    for info_sel in response.xpath('//div[@class="info"]'):
        entry = DoubanbookItem()
        entry['title'] = info_sel.xpath(
            './/div[@class="title"]/a/text()').extract_first()
        entry['author'] = info_sel.xpath(
            './/span[1]/span[2]/a/text()').extract_first()
        entry['category'] = info_sel.xpath(
            './/span[@itemprop="genre"]/text()').extract_first()
        entry['rate'] = info_sel.xpath(
            './/span[@class="rating-average"]/text()').extract_first()
        entry['count'] = info_sel.xpath(
            './/a[@class="ratings-link"]/span/text()').extract_first()
        entry['brief'] = info_sel.xpath(
            './/div[@class="article-desc-brief"]/text()').extract_first()
        yield entry
    # Relative next-page link, resolved against the current URL.
    rel_next = response.xpath(
        '//li[@class="next"]/a/@href').extract_first()
    if rel_next:
        yield scrapy.Request(response.urljoin(rel_next),
                             callback=self.parse)
def parse(self, response):
    """Extract star rating, vote count and short text from every comment
    on the page, yielding one DoubanbookItem each."""
    print(response.url)
    comment_nodes = Selector(
        response=response).xpath('//li[@class="comment-item"]')
    for comment in comment_nodes:
        rating = comment.xpath(
            './div[@class="comment"]/h3/span[@class="comment-info"]/span[1]/@title'
        ).extract_first().strip()
        votes = comment.xpath(
            './div[@class="comment"]/h3/span[@class="comment-vote"]/span[@class="vote-count"]/text()'
        ).extract_first().strip()
        text = comment.xpath(
            './div[@class="comment"]/p[@class="comment-content"]/span[@class="short"]/text()'
        ).extract_first().strip()
        # DoubanbookItem is defined in items.py
        item = DoubanbookItem()
        item['star'] = rating
        item['vote'] = votes
        item['short'] = text
        yield item
def parse(self, response): # print response.body item = DoubanbookItem() selector = scrapy.Selector(response) books = selector.xpath('//div[@class="bd doulist-subject"]') for each in books: title = each.xpath('div[@class="title"]/a/text()').extract()[0] rate = each.xpath( 'div[@class="rating"]/span[@class="rating_nums"]/text()' ).extract()[0] author = re.search('<div class="abstract">(.*?)<br', each.extract(), re.S).group(1) title = title.replace(' ', '').replace('\n', '') author = author.replace(' ', '').replace('\n', '') item['title'] = title item['rate'] = rate item['author'] = author print '标题:', title print '评分:', rate print author print '' # 让scrapy自动去处理item yield item # 爬后面几页 nextPage = selector.xpath( '//span[@class="next"]/link/@href').extract() if nextPage: next = nextPage[0] print next yield scrapy.http.Request(next, callback=self.parse)
def parse_2(self, response):
    """Yield a (rate_num, name) item for books rated >= 7.

    The original enumerated the selector list but never used the index,
    and crashed with AttributeError when the rating node was missing.
    """
    for book in response.css('#wrapper'):
        item = DoubanbookItem()
        item['rate_num'] = book.xpath(
            './/*[@id="interest_sectl"]/div/div[2]/strong/text()'
        ).extract_first()
        # Skip pages without a rating instead of calling .strip() on None.
        if item['rate_num'] is None:
            continue
        if round(float(item['rate_num'].strip())) >= 7:
            item['name'] = book.xpath('.//h1/span/text()').extract_first()
            yield item
def parse_item(self, response):
    """Scrape name/url/author from a book detail page.

    NOTE(review): this block looks truncated — type_out is set but never
    used in the visible code; confirm against the full file.
    """
    type_out = True
    sel = Selector(response)
    item = DoubanbookItem()
    item['name'] = sel.xpath('//*[@id="wrapper"]/h1/span/text()').extract()[0]
    item['url'] = response.url
    # Two page layouts: the author is either a direct <a> child of #info,
    # or nested under the first <span> — fall back on IndexError etc.
    try:
        item['author'] = sel.xpath('//*[@id="info"]/a[1]/text()').extract()[0]
    except Exception, e:  # Python 2 syntax; e is unused
        type_out = False
        item['author'] = sel.xpath('//*[@id="info"]/span[1]/a/text()').extract()[0]
def parse(self, response):
    """Yield (href, title) items for every book link on a subject list
    page, then crawl the next page when one exists."""
    anchors = response.xpath('//*[@id="subject_list"]/ul/li/div[2]/h2/a')
    for anchor in anchors:
        entry = DoubanbookItem()
        entry['href'] = anchor.xpath('@href').extract_first()
        entry['title'] = anchor.xpath('@title').extract_first()
        yield entry
    # Relative "next" link, resolved against the current page URL.
    next_href = response.xpath('//*[@class="next"]/a/@href').extract_first()
    if next_href is not None:
        yield scrapy.Request(response.urljoin(next_href))
def parse_data(self, response):
    """Extract book fields from a detail page and yield one item."""
    item = DoubanbookItem()
    item['book_name'] = response.xpath(
        '//*[@id="wrapper"]/h1/span/text()').extract()[0]
    item['auth_name'] = response.xpath(
        '//*[@id="info"]/span[1]/a/text()').extract()[0]
    # The originating listing URL is carried through the request meta.
    item['book_url'] = response.meta['url']
    item['pic_url'] = response.xpath(
        '//*[@id="mainpic"]/a/img/@src').extract()[0]
    item['rate'] = response.xpath(
        '//*[@id="interest_sectl"]/div/div[@class="rating_self clearfix"]/strong/text()'
    ).extract()[0]
    item['rate_num'] = response.xpath(
        '//span[@property="v:votes"]/text()').extract()[0]
    # Python 2 print statement. NOTE(review): encode('gbk') will raise
    # UnicodeEncodeError for titles containing characters outside GBK.
    print "----------- Current book:%s-----------" % item[
        'book_name'].encode('gbk')
    yield item
def parse(self, response):
    """Collect (title, link) for each book entry and follow each link to
    parse2 with the partially-filled item in meta."""
    soup = BeautifulSoup(response.text, 'html.parser')
    # Iterate the tags directly instead of indexing via range(len(...)).
    for title_div in soup.find_all('div', attrs={'class': 'pl2'}):
        # Item class defined in items.py
        item = DoubanbookItem()
        anchor = title_div.find('a')
        link = anchor.get('href')
        item['title'] = anchor.get('title')
        item['link'] = link
        yield scrapy.Request(url=link,
                             meta={'item': item},
                             callback=self.parse2)
def parse(self, response):
    """Build a full-review URL for every review id on the page and
    follow each one to parse2 with the item in meta."""
    tree = etree.HTML(response.text)
    review_ids = tree.xpath(
        '//*[@id="content"]/div/div[1]/div[1]/div/@data-cid')
    for review_id in review_ids:
        # Item class defined in items.py
        item = DoubanbookItem()
        full_review_url = 'https://book.douban.com/j/review/' + review_id + '/full'
        item['url'] = full_review_url
        yield scrapy.Request(url=full_review_url,
                             meta={'item': item},
                             callback=self.parse2)
def parse(self, response): item = DoubanbookItem() book_name = self.get_name(response) author_name = self.get_author(response) # book_url = self.get_bookurl(response) pic_url = self.get_picurl(response) # rate = self.get_rate(response) # rate_num = self.get_ratenum(response) length = len(book_name) for i in range(length): print "----------- Current num:%d -----------" % i item['book_name'] = book_name[i] item['auth_name'] = author_name[i].replace('\n','').replace(' ', '') # item['book_url'] = book_url[i] item['pic_url'] = pic_url[i] # item['rate'] = rate[i] # item['rate_num'] = rate_num[i].replace('\n','').replace(' ', '') yield item
def parse(self, response):
    """Scrape book metadata (name, writer, date, pages, ISBN, price,
    tags) from a detail page, then queue every catalogue link for the
    same parse."""
    item = DoubanbookItem()
    selector = scrapy.Selector(response)
    # Title, with spaces and newlines squeezed out (same cleanup applied
    # to every field below).
    item['name'] = "".join(
        selector.xpath(
            "//*[@id=\"wrapper\"]/h1/span/text()").extract()).replace(
                ' ', '').replace('\n', '')
    # Author layout 1: links inside the first <span> of #info.
    writer_case1 = "/".join(
        selector.xpath(
            "//*[@id=\"info\"]/span[1]/a/text()").extract()).replace(
                ' ', '').replace('\n', '')
    # Author layout 2: a bare first <a> in #info.
    # NOTE(review): the adjacent literals "/" "/" concatenate at parse
    # time, so this is "//".join(...) — possibly a typo for "/".join(...)
    # or for "/" + "/".join(...); confirm the intended separator.
    writer_case2 = "/" "/".join(
        selector.xpath("//*[@id=\"info\"]/a[1]/text()").extract()).replace(
            ' ', '').replace('\n', '')
    # Each labelled field ("出版年" publish year, "页数" pages, ISBN,
    # "定价" price) is the first text node following its label span.
    item['date'] = "".join(
        selector.xpath(
            "//*[@id=\"info\"]/span[contains(./text(),'出版年')]/following::text()[1]"
        ).extract()).replace(' ', '').replace('\n', '')
    item['pagenum'] = "".join(
        selector.xpath(
            "//*[@id=\"info\"]/span[contains(./text(),'页数')]/following::text()[1]"
        ).extract()).replace(' ', '').replace('\n', '')
    item['ISBN'] = "".join(
        selector.xpath(
            "//*[@id=\"info\"]/span[contains(./text(),'ISBN')]/following::text()[1]"
        ).extract()).replace(' ', '').replace('\n', '')
    item['price'] = "".join(
        selector.xpath(
            "//*[@id=\"info\"]/span[contains(./text(),'定价')]/following::text()[1]"
        ).extract()).replace(' ', '').replace('\n', '')
    item['tags'] = ";".join(
        selector.xpath("//*[@id=\"db-tags-section\"]/div/span/a/text()").
        extract()).replace(' ', '').replace('\n', '')
    # Prefer layout 1 whenever it matched anything.
    if (writer_case1 == ''):
        item['writer'] = writer_case2
    else:
        item['writer'] = writer_case1
    yield item
    # Follow every link in the content list with this same callback.
    url_list = response.xpath(
        "//div[@class='content clearfix']/dl/dd/a/@href").extract()
    for url in url_list:
        yield scrapy.Request(url=url, callback=self.parse)
def parse2(self, response):
    """Parse a comments page: store new (star, vote, short, time) rows
    and follow pagination until a previously-seen comment is hit."""
    print('response.url: ', response.url)
    commonitems = Selector(
        response=response).xpath('//li[@class="comment-item"]')
    for ci in commonitems:
        short = ci.xpath(
            './div[@class="comment"]/p[@class="comment-content"]/span[@class="short"]/text()'
        ).extract_first().strip()
        shorttime = ci.xpath(
            './div[@class="comment"]//span[@class="comment-info"]/span[2]/text()'
        ).extract_first().strip()
        # Dedup check against the DB: if this comment was already stored,
        # stop processing the whole page (return exits the generator).
        # NOTE(review): string-formatted SQL — breaks/injects if the
        # comment text contains quotes; prefer parameterized queries.
        sql = 'select count(*) from hlmshorts_new t where t.S_SHORTSTIME = "%s" and t.S_SHORTS = "%s"' % (
            shorttime, short)
        df = db.readtable(sql)
        cnt = df.iat[0, 0]
        if cnt > 0:
            return
        star = ci.xpath(
            './div[@class="comment"]/h3/span[@class="comment-info"]/span[1]/@title'
        ).extract_first().strip()
        vote = ci.xpath(
            './div[@class="comment"]/h3/span[@class="comment-vote"]/span[@class="vote-count"]/text()'
        ).extract_first().strip()
        # DoubanbookItem is defined in items.py
        item = DoubanbookItem()
        item['star'] = star
        item['vote'] = vote
        item['short'] = short
        item['shorttime'] = shorttime
        yield item
    # Fetch the next page; the last <li> of the paginator holds the link.
    nextpage1 = Selector(response=response).xpath(
        '//div[@class="paginator-wrapper"]/ul[@class="comment-paginator"]/li[last()]/a/@href'
    )
    if nextpage1:
        nextpage = nextpage1.extract_first().strip()
        print('nextpage: ', nextpage)
        url = f'{HongloumengSpider.start_urls[0]}{nextpage}'
        yield scrapy.Request(url=url, callback=self.parse2)
        # Crude throttle between page requests.
        time.sleep(5)
def parse(self, response):
    """Parse a doulist page: yield one (title, rate, author) item per
    book entry. No pagination in this variant."""
    selector = scrapy.Selector(response)
    books = selector.xpath('//div[@class="bd doulist-subject"]')
    for each in books:
        # Fresh item per book — the original mutated a single shared
        # instance across every yield.
        item = DoubanbookItem()
        title = each.xpath('div[@class="title"]/a/text()').extract()[0]
        rate = each.xpath(
            'div[@class="rating"]/span[@class="rating_nums"]/text()'
        ).extract()[0]
        # Author line has no dedicated tag; regex the raw abstract HTML.
        author = re.search('<div class="abstract">(.*?)<br',
                           each.extract(), re.S).group(1)
        title = title.replace(' ', '').replace('\n', '')
        author = author.replace(' ', '').replace('\n', '')
        item['title'] = title
        item['rate'] = rate
        item['author'] = author
        yield item
def parse_item(self, response):
    """Extract title/author/info from an article page.

    Keys are set only when the corresponding node was found; the item is
    always yielded with at least its URL. (Removed a stray dead `pass`
    statement that sat at the top of the original function.)
    """
    print("-------3-------")
    item = DoubanbookItem()
    title = response.xpath('//div[@class="article-profile-bd"]/h1/text()').extract()
    if len(title) != 0:
        item['title'] = title[0]
    author = response.xpath(' //div[@class="article-meta"]/p[@class="author"]//a/text()').extract()
    if len(author) != 0:
        item['author'] = author[0]
    info = response.xpath('//div[@class]/div[@class="info"]/p/text()').extract()
    if len(info) != 0:
        item['info'] = info[0]
    item['url'] = response.url
    yield item
def parse_item(self, response):
    """Build one DoubanbookItem per entry found on the listing page.

    The helper methods each take (response, index) and extract one field
    for the index-th entry.
    """
    total = self.get_item_count(response)
    for index in range(total):
        entry = DoubanbookItem()
        entry['book_name'] = self.get_book_name(response, index)
        entry['info'] = self.get_info(response, index)      # publication info
        entry['intro'] = self.get_intro(response, index)    # synopsis
        entry['grade'] = self.get_grade(response, index)    # rating
        entry['evaluate_number'] = self.get_evaluate_number(response, index)
        entry['source_url'] = self.get_source_url(response, index)
        entry['source_name'] = 'douban'                     # fixed source tag
        yield entry