def parse_url(self, response):
    """Parse one category listing page.

    Yields a detail-page request per product id found in the list, and a
    request for the following listing page when a "下一页" (next page)
    link exists. Only ``atype`` is read from the incoming meta item.
    """
    html = Selector(response)
    # Renamed from ``id`` to avoid shadowing the builtin.
    product_ids = html.xpath(
        "//ul[@class= 'bigimg cloth_shoplist']/li/@id").extract()
    next_link = html.xpath(
        "//*[@id='12810']/div[3]/div[2]/div/ul/li/a[@title='下一页']/@href"
    ).extract_first()
    start_url = 'http://product.dangdang.com/'
    next_start_url = 'http://category.dangdang.com'
    atype = response.meta['item']['atype']
    for aid in product_ids:
        # One fresh item per product so in-flight requests don't share
        # mutable state. (The original also rebuilt the item, but first
        # performed a dead ``int(aid)`` assignment that was discarded.)
        item = DangdangItem(aid=aid, atype=atype)
        yield scrapy.Request(url=start_url + aid + '.html',
                             callback=self.parse_item,
                             headers=self.header,
                             meta={'item': item})
    # BUG FIX: extract_first() returns None (not '') when the link is
    # absent; the old ``!= ''`` check then concatenated str + None and
    # raised TypeError on the last page. Truthiness covers both cases.
    if next_link:
        yield scrapy.Request(url=next_start_url + next_link,
                             callback=self.parse_url,
                             headers=self.header,
                             meta={'item': DangdangItem(atype=atype)})
def parse(self, response):
    """Entry point: request the first listing page of each clothing
    sub-category (atype 0..2), carrying the category in request meta."""
    for atype in range(0, 3):
        # Single construction; the original built a DangdangItem(), set
        # item['atype'], then discarded it by building a second item with
        # the same field.
        item = DangdangItem(atype=atype)
        cid = "-cid400252" + str(atype) + ".html"
        base_url = "http://category.dangdang.com/pg1" + cid
        yield scrapy.Request(url=base_url, callback=self.parse_url,
                             headers=self.header, meta={'item': item})
def parse(self, response):
    """Parse one python-search results page and follow pages up to 5."""
    lilist = response.css('ul.bigimg li')
    for li in lilist:
        item = DangdangItem()
        title = li.css('a::attr(title)').extract_first()
        # BUG FIX: extract_first() may return None; len(None) raised.
        if title and len(title) > 32:
            item['title'] = title[:32]  # clip long titles to 32 chars
        else:
            item['title'] = title
        item['author'] = li.css('p.search_book_author span a::text').extract_first()
        item['price'] = li.css('p.price span::text').extract_first()
        comment = li.css('p.search_star_line a::text').extract_first()
        # Drop the trailing "条评论" (3 chars); tolerate a missing node,
        # which previously raised TypeError on the [:-3] slice.
        item['comment_num'] = comment[:-3] if comment else comment
        item['detail'] = li.css('p.detail::text').extract_first()
        # Prefer the lazy-load attribute, fall back to plain src.
        if li.css('a img::attr(data-original)').extract_first():
            item['picurl'] = li.css('a img::attr(data-original)').extract_first()
        else:
            item['picurl'] = li.css('a img::attr(src)').extract_first()
        # Swap thumbnail marker for the large-image marker, if a url exists.
        if item['picurl']:
            item['picurl'] = item['picurl'].replace('_b_', '_w_')
        yield item
        # NOTE(review): a blocking sleep inside a Scrapy callback stalls
        # the reactor; DOWNLOAD_DELAY is the idiomatic替代. Kept as-is to
        # preserve the original pacing behaviour.
        time.sleep(3)
    self.p += 1
    if self.p < 6:
        next_url = 'http://search.dangdang.com/?key=python&page_index=' + str(self.p)
        url = response.urljoin(next_url)
        yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
    """Walk dangdang's category tree and request every leaf category page.

    Builds one DangdangItem per (first, second, third) title combination;
    each qualifying third-level link is deep-copied into its own request
    meta so concurrent callbacks don't share the mutable item.
    """
    # Root container of the whole category flyout menu.
    all_data_x = response.xpath('//div[@class="con flq_body"]')[0]
    all_first_title = all_data_x.xpath('./div[contains(@class,"level_one")]')
    for first_title_e in all_first_title:
        f = first_title_e.xpath('./dl/dt/text()').extract_first()
        if f is not None:
            item = DangdangItem()
            # Strip CR/LF/TAB and spaces from the raw title text.
            f = re.sub(r'\r|\n|\t| ','',f)
            if not f:
                # Some sections keep the title text inside an <a> instead.
                f = first_title_e.xpath('./dl/dt/a/text()').extract()
                f = ''.join(f)
                f = re.sub(r'\r|\n|\t| ','',f)
            item['first_title'] = f
            all_s_t_title = first_title_e.xpath('.//div[@class="col eject_left"]/dl')
            # Only descend when the flyout actually has second-level groups.
            if all_s_t_title.extract_first() is not None:
                for s_t in all_s_t_title:
                    second_title = s_t.xpath('./dt/text()').extract_first()
                    second_title = re.sub(r'\r|\n|\t| ','',second_title)
                    if second_title:
                        item['second_title'] = second_title
                    else:
                        # Fallback: title lives in an <a>; normalise spaces.
                        item['second_title'] = s_t.xpath('./dt/a/text()').extract_first()
                        item['second_title'] = item['second_title'].replace(' ','')
                    all_third_title = s_t.xpath('./dd/a')
                    for third_e in all_third_title:
                        item['third_title'] = third_e.xpath('./@title').extract_first()
                        item['label_url'] = third_e.xpath('./@href').extract_first()
                        # Only follow real search/category links (skips
                        # javascript: and promo hrefs).
                        if ('search.dangdang.com' in item['label_url']) or ('category.dangdang.com' in item['label_url']):
                            # Deep copy: the same item object is mutated on
                            # every loop turn, so a snapshot must travel
                            # with each request.
                            item_copy = copy.deepcopy(item)
                            yield scrapy.Request(item_copy['label_url'],callback=self.parse_book_detail,meta={'item':item_copy})
def parse_item(self, response):
    """Fill the item forwarded via request meta with product-page fields.

    Joins multi-node text results with ',' and strips spaces / CRLF noise.
    """
    item_html = Selector(response)
    # Reuse the partially-filled item from the listing callback. (The
    # original created a throwaway DangdangItem() that this line
    # immediately overwrote — dead code, removed.)
    item = response.meta['item']
    item['shop_name'] = item_html.xpath(
        "//*[@id='service-more']/div[2]/p[1]/span/span[2]/a/text()"
    ).extract_first()
    item['item_name'] = ','.join(
        item_html.xpath(
            "//*[@id='product_info']/div[1]/h1/text()").extract()).replace(
                ' ', '').replace('\r\n', '')
    # Join then strip commas again so a multi-node price collapses cleanly.
    item['item_price'] = ','.join(
        item_html.xpath("//*[@id='dd-price']/text()").extract()).replace(
            ' ', '').replace(',', '')
    item['item_from'] = ','.join(
        item_html.xpath(
            "//*[@id='shop-geo-name']/text()").extract()).replace(' 至', '')
    image_url = item_html.xpath(
        "//*[@id='main-img-slider']/li/a/@data-imghref").extract()
    item['image_url'] = image_url
    item['image_list'] = ','.join(image_url)
    yield item
def parse(self, response):
    '''
    Recursively parse one search-results page (books), then follow the
    next page up to page 10.
    '''
    print('*' * 64)
    dlist = response.selector.xpath(".//ul[@class='bigimg']/li")
    for dd in dlist:
        item = DangdangItem()
        item['name'] = dd.xpath("./a/@title").extract_first()
        # Price node html looks like "<span ...>¥xx.yy</span>".
        price_html = dd.xpath(".//span[@class='search_now_price']").extract_first()
        # BUG FIX: the original passed a possibly-None value to re.findall
        # (TypeError) and indexed matches[0] unconditionally (IndexError
        # when the pattern found nothing). Regex is now a raw string.
        matches = re.findall(r".*?([0-9]*\.[0-9]*)", price_html or "")
        item['price'] = matches[0] if matches and matches[0] else None
        item['pic'] = dd.xpath(".//img/@data-original|.//img/@src").extract_first()
        item['author'] = dd.xpath(".//a[@name='itemlist-author']/@title").extract_first()
        item['publisher'] = dd.xpath(".//a[@name='P_cbs']/text()").extract_first()
        item['comments'] = dd.xpath(".//a[@class='search_comment_num']/text()").extract_first()
        # Publication date matched straight off the <li> html.
        item['pubdate'] = dd.re_first("(([0-9]{4})-([0-9]{2})-([0-9]{2}))")
        item['description'] = dd.xpath(".//p[@class='detail']/text()").extract_first()
        yield item
    self.p += 1
    # Crawl up to 10 pages in total.
    if self.p <= 10:
        next_url = "http://search.dangdang.com/?key=python&act=input&page_index=" + str(self.p)
        url = response.urljoin(next_url)
        yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
    """Emit one item holding the title/link/comment lists of this page,
    then schedule listing pages 2 through 100 of the same category."""
    book = DangdangItem()
    book['title'] = response.xpath("//a[@class ='pic']/@title").extract()
    book['link'] = response.xpath("//a[@class ='pic']/@href").extract()
    book['comment'] = response.xpath(
        "//a[@class ='search_comment_num']/text()").extract()
    yield book
    page = 2
    while page < 101:
        next_page = "http://category.dangdang.com/pg" + str(page) + "-cp01.54.06.00.00.00.html"
        yield Request(next_page, callback=self.parse)
        page += 1
def parse(self, response):
    """Harvest picture-anchor titles/links plus review counts, then queue
    pages 2-80 of category cid4008154."""
    record = DangdangItem()
    for field, query in (
            ("title", "//a[@name='itemlist-picture']/@title"),
            ("link", "//a[@name='itemlist-picture']/@href"),
            ("comment", "//a[@name='itemlist-review']/text()")):
        record[field] = response.xpath(query).extract()
    yield record
    for page in range(2, 81):
        next_page = "http://category.dangdang.com/pg" + str(page) + "-cid4008154.html"
        yield Request(next_page, callback=self.parse)
def parse(self, response):
    """Gather title/link/comment lists from this page, then schedule
    category pages 2 through 9."""
    entry = DangdangItem()  # item type declared in items.py
    entry["title"] = response.xpath("//a[@class='pic']/@title").extract()
    entry["link"] = response.xpath("//a[@class='pic']/@href").extract()
    entry["comment"] = response.xpath(
        "//a[@name='itemlist-review']/text()").extract()
    yield entry
    page = 2
    while page < 10:
        next_page = "http://category.dangdang.com/pg" + str(page) + "-cp01.54.06.00.00.00.html"
        yield Request(next_page, callback=self.parse)
        page += 1
def handle_items(self, response):
    """Collect the title/link/comment columns of one ranking page into a
    single item."""
    record = DangdangItem()
    for field, query in (
            ("title", "//a[@name='sort-big-pic']/@title"),
            ("link", "//a[@name='sort-big-pic']/@href"),
            ("comment", "//a[@name='sort-evaluate']/text()")):
        record[field] = response.xpath(query).extract()
    yield record
def parse(self, response):
    """Scrape the sort-big-pic listing columns, then walk pages 2-80."""
    data = DangdangItem()
    for key, query in (
            ("title", "//a[@name='sort-big-pic']/@title"),
            ("link", "//a[@name='sort-big-pic']/@href"),
            ("comment", "//a[@name='sort-evaluate']/text()")):
        data[key] = response.xpath(query).extract()
    yield data
    page = 2
    while page < 81:
        next_page = 'http://category.dangdang.com/pg' + str(page) + '-cid4008154.html'
        yield Request(next_page, callback=self.parse)
        page += 1
def parse_name(self, response):
    """Pull title, comment count, current url and price from a product
    detail page into one item."""
    fields = {
        'title': response.xpath('//div[@class="name_info"]/h1/@title').extract(),
        'num': response.xpath('//a[@id="comm_num_down"]/text()').extract(),
        'link': response.url,
        'price': response.xpath('//p[@id="dd-price"]/text()').extract(),
    }
    record = DangdangItem()
    for key, value in fields.items():
        record[key] = value
    yield record
def parse(self, response):
    """Extract title/link/comment lists from the listing page.

    BUG FIX: the original referenced ``.extract`` without calling it, so
    every field (and the prints) held a bound-method object instead of the
    scraped data.
    """
    item = DangdangItem()
    item['title'] = response.xpath("//a[@class='pic']/@title").extract()
    item['link'] = response.xpath("//a[@class='pic']/@href").extract()
    item['comment'] = response.xpath(
        "//a[@class='search_comment_num']/text()").extract()
    # Debug output of the scraped lists.
    print(item['title'])
    print(item['link'])
    print(item['comment'])
    yield item
def get_info(self, response):
    """Collect the name/price/link/review-count columns of this page."""
    info = DangdangItem()
    for field, query in (
            ('name', '//a[@class="pic"]/@title'),
            ('price', '//p[@class="price"]/span/text()'),
            ("link", "//a[@class='pic']/@href"),
            ("comnum", "//a[@name='itemlist-review']/text()")):
        info[field] = response.xpath(query).extract()
    yield info
def parse_item(self, response):
    """Yield one item per product <li> in the cloth shop listing.

    ROBUSTNESS: uses extract_first() instead of ``.extract()[0]``; the
    original raised IndexError whenever a row was missing one of the
    fields. Missing values now come through as None.
    """
    for node in response.xpath('//ul[@class="bigimg cloth_shoplist"]/li'):
        item = DangdangItem()
        item['name'] = node.xpath('./p[@class="name"]/a/@title').extract_first()
        price = node.xpath('./p[@class="price"]/span/text()').extract_first()
        # Drop the leading currency symbol when a price is present.
        item['price'] = price[1:] if price else price
        item['link'] = node.xpath('./p[@class="link"]/a/@href').extract_first()
        comment = node.xpath('./p[@class="star"]/a/text()').extract_first()
        # Strip the "条评论" ("reviews") suffix from the comment count.
        item['comment'] = comment.replace('条评论', '') if comment else comment
        yield item
def parse(self, response):
    """Scrape the treadmill search listing, then schedule pages 2-19."""
    row = DangdangItem()
    for field, query in (
            ("title", "//a[@name='itemlist-picture']/@title"),
            ("link", "//a[@name='itemlist-picture']/@href"),
            ("price", "//span[@class='price_n']/text()"),
            ("comment", "//a[@name='itemlist-review']/text()")):
        row[field] = response.xpath(query).extract()
    yield row
    for page in range(2, 20):
        # %CC%A4%B2%BD%BB%FA is the GBK url-encoded search keyword.
        next_page = 'http://search.dangdang.com/?key=%CC%A4%B2%BD%BB%FA&page_index=' + str(page)
        yield Request(next_page, callback=self.parse)
def parse(self, response):
    """Parse the bestseller list (the #component_59 <li> nodes) with lxml.

    Each yielded item holds the raw xpath result lists for title and
    description.
    """
    document = etree.HTML(response.text)
    for node in document.xpath('//*[@id="component_59"]/li'):
        entry = DangdangItem()
        entry['title'] = node.xpath('./a/@title')
        # Description kept separate from the title (an earlier spider
        # version conflated the two).
        entry['miaoshu'] = node.xpath('./p[@class = "detail"]/text()')
        print(entry['title'])
        yield entry
def parse_name(self, response):
    """Scrape title, comment count, price, publisher and cover image from
    a product-detail page."""
    info = DangdangItem()
    for field, query in (
            ('title', '//*[@id="product_info"]/div[1]/h1/@title'),
            ('num', '//*[@id="comm_num_down"]/text()'),
            ('price', '//*[@id="dd-price"]/text()'),
            ('cbs', '//*[@id="product_info"]/div[2]/span[2]/a/text()'),
            ('pic', '//*[@id="largePic"]/@src')):
        info[field] = response.xpath(query).extract()
    # The page url itself doubles as the item link.
    info['link'] = response.url
    yield info
def parse_subpage(self, response, category):
    """Extract name/author/price/comments for each book on one category
    page; ``category`` is copied into every yielded item.

    BUG FIX: XPath positions are 1-based, so ``li[0]`` never matches —
    the original ``range(0, length + 1)`` produced one leading item with
    all-empty fields. The loop now runs 1..length inclusive.
    """
    # Number of books on the page (one cover <img> per book).
    length = len(response.xpath('//*[@id="component_0__0__8395"]/li/a/img').extract())
    for i in range(1, length + 1):
        item = DangdangItem()
        item['name'] = response.xpath('//*[@id="component_0__0__8395"]/li[{}]/p[2]/a/text()'.format(i)).extract()
        item['author'] = response.xpath('//*[@id="component_0__0__8395"]/li[{}]/p[5]/text()'.format(i)).extract()
        item['price'] = response.xpath('//*[@id="component_0__0__8395"]/li[{}]/p[1]/span[1]/text()'.format(i)).extract()
        item['comments'] = response.xpath('//*[@id="component_0__0__8395"]/li[{}]/p[4]/a/text()'.format(i)).extract()
        item['category'] = category
        yield item
def parse(self, response):
    """Scrape computer-search listing fields, then queue page indexes
    1 through 99 of the same search."""
    entry = DangdangItem()
    entry['title'] = response.xpath(
        '//a[@name="itemlist-picture"]/@title').extract()
    entry['link'] = response.xpath(
        '//a[@name="itemlist-picture"]/@href').extract()
    entry['comment'] = response.xpath(
        '//a[@class="search_comment_num"]/text()').extract()
    yield entry
    page = 1
    while page < 100:
        # %BC%C6%CB%E3%BB%FA is the GBK url-encoded search keyword.
        next_page = 'http://search.dangdang.com/?key=%BC%C6%CB%E3%BB%FA&act=input&page_index=' + str(page)
        yield Request(next_page, callback=self.parse)
        page += 1
def parse(self, response):
    """Scrape a single book detail page, stamping the crawl time and a
    sequential per-spider id onto the item."""
    book = DangdangItem()
    book["name"] = response.xpath("//h1/@title").extract_first()
    book["author"] = response.xpath(
        "//span[@id='author']/a[1]/text()").extract_first()
    book["price"] = response.xpath(
        "//p[@id='dd-price']/text()[2]").extract_first()
    # The 5th describe-list entry carries the ISBN.
    book["ISBN"] = response.xpath(
        "//div[@id='detail_describe']/ul/li[5]/text()").extract_first()
    book["crawl_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    self.number += 1
    book["id"] = self.number
    yield book
def parse(self, response):
    """Scrape name/price/link/comment lists from this page, then follow
    category pages 2 through 4."""
    goods = DangdangItem()
    goods['name'] = response.xpath('//*[@class="pic"]/@title').extract()
    goods['price'] = response.xpath('//*[@class="price_n"]/text()').extract()
    goods['link'] = response.xpath('//*[@class="pic"]/@href').extract()
    # dd_name="单品评论" marks the per-product comment anchor.
    goods['comment'] = response.xpath('//*[@dd_name="单品评论"]/text()').extract()
    yield goods
    for page in (2, 3, 4):
        next_page = 'http://category.dangdang.com/pg{}-cid4005627.html'.format(page)
        yield Request(next_page, callback=self.parse)
def parse(self, response):
    """Collect itemlist-title fields from this page, then enqueue
    category pages 2 through 100."""
    result = DangdangItem()
    for key, query in (
            ('title', '//a[@name="itemlist-title"]/@title'),
            ('link', '//a[@name="itemlist-title"]/@href'),
            ('comment', '//a[@class="search_comment_num"]/text()')):
        result[key] = response.xpath(query).extract()
    yield result
    page = 2
    while page <= 100:
        next_page = 'http://category.dangdang.com/pg' + str(page) + '-cp01.54.06.00.00.00.html'
        yield Request(next_page, callback=self.parse)
        page += 1
def parse(self, response):
    """Yield one item per product <li> under #bd on a search page.

    Field-by-field xpath queries are table-driven; each query is relative
    to the product node. Next-page following is intentionally disabled
    (it was commented out in the original), so only this page is scraped.
    """
    product_nodes = response.xpath("//div[@id='bd']//li[@id]")
    print(len(product_nodes))
    # (field name, per-<li> relative xpath)
    queries = (
        ('book_name', 'p[@name]/a/@title'),
        ('price', "p[@class='price']/span[@class='search_now_price']/text()"),
        ('author', "p[@class='search_book_author']/span[1]/a[1]/@title"),
        ('pubdate', "p[@class='search_book_author']/span[2]/text()"),
        ('press', "p[@class='search_book_author']/span[3]/a[1]/@title"),
        # Star rating is encoded in an inline style width.
        ('star', "p[@class='search_star_line']/span/span/@style"),
        # The product href doubles as the primary key.
        ('_id', 'a/@href'),
        ('comment', "p[@class='search_star_line']/a/text()"),
        ('comment_url', "p[@class='search_star_line']/a/@href"),
    )
    for node in product_nodes:
        record = DangdangItem()
        for field, query in queries:
            record[field] = node.xpath(query).extract_first()
        yield record
def parse_page(self, response):
    """Build one DangdangItem ("book") per result <li> on a search page.

    Mandatory fields (price/name/link/star_level) raise out of the method
    if missing; optional fields are wrapped in try/except and degraded to
    a log line, with the partial book still yielded via ``finally``.
    """
    for item in response.xpath(
            '//*[@id="search_nature_rg"]/ul[@class="bigimg"]/li'):  # every book on the page
        book = DangdangItem()
        # try:
        # Price text like "¥59.00" -> float (lstrip removes the yen sign).
        book['price'] = float(
            item.xpath('./p[@class="price"]/span[1]/text()').pop().extract(
            ).lstrip('¥'))
        # Category label forwarded through request meta by the caller.
        book['type_tag'] = response.meta['type']
        book['name'] = item.xpath(
            './p[@class="name"]/a/text()').pop().extract().strip()
        # Pseudo-unique tag: crawl timestamp + book name.
        book['book_tag'] = str(time.time()) + book.get('name', None)
        # book['image_url'] = item.xpath('./a/img/@src').pop().extract()
        book['link'] = item.xpath('./p[1]/a/@href').pop().extract()
        # Star rating comes from an inline style like "width: 90%;" -> 90.
        book['star_level'] = \
            int(item.xpath('./p[@class="search_star_line"]/span/span/@style').pop().extract().split(' ')[-1].rstrip(
                '%;'))
        try:
            # Optional fields: any of these may be absent on a listing row.
            book['time'] = item.xpath(
                './p[@class="search_book_author"]/span[2]/text()').pop(
                ).extract().split('/')[-1]
            book['author_tag'] = ','.join(
                item.xpath(
                    './p[@class="search_book_author"]/span[1]/a/text()').
                extract()).strip()
            book['publish_company'] = item.xpath(
                './p[@class="search_book_author"]/span[3]/a/text()').pop(
                ).extract().strip()
            book['brief'] = item.xpath(
                './p[2]/text()').pop().extract().strip()
        except:
            # NOTE(review): bare except hides real errors (even
            # KeyboardInterrupt); it logs and moves on by design here.
            scrapy.Spider.log(
                self, "Error:{} , url {}:".format(book['name'], response.url))
        finally:
            # Yield whatever was collected, complete or partial.
            yield book
def parse(self, response):
    """Scrape title/link/comment/price lists, then queue pages 1-100.

    BUG FIX: ``xrange`` is Python 2-only and raises NameError on
    Python 3 (which the surrounding code targets); replaced with
    ``range`` — identical iteration behaviour.
    """
    item = DangdangItem()
    item["title"] = response.xpath("//a[@class='pic']/@title").extract()
    item["link"] = response.xpath("//a[@class='pic']/@href").extract()
    item["comment"] = response.xpath(
        "//a[@class='search_comment_num']/text()").extract()
    item["price"] = response.xpath(
        "//p[@class='price']/span[@class='search_now_price']/text()"
    ).extract()
    yield item
    for i in range(1, 101):
        url = "http://category.dangdang.com/pg" + str(
            i) + "-cp01.54.06.00.00.00.html"
        yield Request(url, callback=self.parse)
def parse(self, response):
    """Scrape itemlist-title fields, then crawl pages 2-10 of category
    cid4008154."""
    record = DangdangItem()
    for field, query in (
            ("title", "//a[@name='itemlist-title']/@title"),
            ("link", "//a[@name='itemlist-title']/@href"),
            ("comment", "//a[@name='itemlist-review']/text()")):
        record[field] = response.xpath(query).extract()
    yield record
    page = 2
    while page <= 10:  # pages 2..10
        next_page = 'http://category.dangdang.com/pg' + str(page) + '-cid4008154.html'
        yield Request(next_page, callback=self.parse)
        page += 1
def parse(self, response):
    """Yield one item per shop-list <li>, then follow pages 2-4."""
    for node in response.xpath('//ul[@class="bigimg cloth_shoplist"]/li'):
        product = DangdangItem()
        product['name'] = node.xpath('./p[@class="name"]/a/@title').extract()[0]
        # Slice off the leading currency symbol.
        product['price'] = node.xpath(
            './p[@class="price"]/span/text()').extract()[0][1:]
        product['link'] = node.xpath('./p[@class="link"]/a/@href').extract()[0]
        # Strip the "条评论" suffix from the review count.
        product['comment'] = node.xpath(
            './p[@class="star"]/a/text()').extract()[0].replace('条评论', '')
        yield product
    for page in (2, 3, 4):
        next_page = 'http://category.dangdang.com/pg{}-cid4005627.html'.format(page)
        yield Request(next_page, callback=self.parse)
def parse(self, response):
    """Request every product-detail link found on a search results page.

    IDIOM: iterates the extracted hrefs directly instead of the original
    ``for index in range(len(titles))`` index arithmetic, and merges the
    build-then-iterate two-pass loop into one pass (same request order).
    """
    links = response.xpath(
        '//body//div[@id="search_nature_rg"]//a/@href').extract()
    for link in links:
        item = DangdangItem()
        item['links'] = link
        # dont_filter: the same product can legitimately appear twice.
        yield FormRequest(dont_filter=True, url=item['links'],
                          meta={'item': item}, callback=self.parse2)
def parse_item(self, response):
    """Parse one bestseller-board page; yields one item per ranked book.

    BUG FIX: the price XPath was absolute ('//div...'), which matched the
    FIRST price on the whole page for every row; it is now anchored to
    the current <li> with './/'. Also builds a fresh item per row — the
    original mutated a single shared item across yields, so queued items
    could change before the pipeline handled them.
    """
    # Category breadcrumb (last location segment), shared by every book.
    kind = response.xpath(
        "//div[@class='layout_location']/span[last()]/text()"
    ).extract_first()
    for i in response.css('.bang_list.clearfix.bang_list_mode>li'):
        item = DangdangItem()
        item['kind'] = kind
        item['name'] = i.css('.name>a').xpath('./@title').extract_first()
        item['link'] = i.css('.name>a').xpath('./@href').extract_first()
        item['comment'] = i.css('.star>a::text').extract_first()
        item['satisfaction'] = i.css('.tuijian::text').extract_first()
        item['price'] = i.xpath(
            ".//div[@class='price']/p[1]/span[@class='price_n']/text()"
        ).extract_first()
        yield item