def parse_item(self, response):
    ratings = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
    book = response.css('.product_page .product_main')
    stock = book.css('.instock').get()
    breadcrumb = response.css('.breadcrumb li')
    name = book.css('h1::text').get()
    price = float(
        re.search(r'\d+\.\d+', book.css('.price_color::text').get())[0])
    # css() returns a SelectorList, which is never None; test whether it
    # actually matched instead.
    available = bool(book.css('.instock .icon-ok'))
    quantity = int(re.search(r'\d+', stock)[0])
    rating = ratings[book.css('.star-rating::attr(class)').get().replace(
        'star-rating ', '')]
    category = breadcrumb[2].css('a::text').get()
    upc = response.css('.product_page table tr')[0].css('td::text').get()
    return BooksItem(name=name, price=price, quantity=quantity,
                     available=available, rating=rating, category=category,
                     url=response.url,
                     scrape_date=datetime.today().isoformat(), upc=upc)
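# BooksItem is not defined in this snippet; a minimal sketch whose fields are
# inferred from the constructor call above (an assumption, not the project's
# actual item module):
import scrapy

class BooksItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    quantity = scrapy.Field()
    available = scrapy.Field()
    rating = scrapy.Field()
    category = scrapy.Field()
    url = scrapy.Field()
    scrape_date = scrapy.Field()
    upc = scrapy.Field()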
def parse(self, response):
    # 'u1' was a typo for 'ul'.
    book_list = response.xpath(
        '//ul[@class="subject-list"]/li[@class="subject-item"]/div[@class="info"]')
    for book in book_list:
        try:
            # A fresh item per book; sharing one instance across yields lets
            # later iterations overwrite earlier results.
            item = BooksItem()
            # Child paths must be relative ('./h2'), not absolute ('/h2'),
            # and extract() returns a list, so use extract_first().
            title = book.xpath('./h2/a/text()').extract_first()
            pub = book.xpath('./div[@class="pub"]/text()'
                             ).extract_first().strip().split('/')
            price = pub.pop()
            date = pub.pop()
            publish = pub.pop()
            author = '/'.join(pub)
            grade = book.xpath(
                './div[@class="star clearfix"]/span[@class="rating_nums"]/text()'
            ).extract_first(default='').strip()
            count = book.xpath(
                './div[@class="star clearfix"]/span[@class="pl"]/text()'
            ).extract_first(default='').strip()
            desc = book.xpath('./p/text()').extract_first(default='').strip()
            link = book.xpath(
                './div[@class="ft"]/div[@class="ebook-link"]/a/@href'
            ).extract_first()
            item['book_name'] = title
            item['book_auther'] = author  # field spelled 'auther' in the item class
            item['book_grade'] = grade
            item['book_count'] = count
            item['book_publish'] = publish
            item['book_date'] = date
            item['book_price'] = price
            item['book_desc'] = desc
            item['book_link'] = link
            yield item
        except Exception:
            continue
    # extract()[0] raises IndexError on the last page; extract_first()
    # returns None there, which the guard below handles.
    nextpage = response.xpath(
        '//div[@class="paginator"]/span[@class="next"]/a/@href').extract_first()
    if nextpage is not None:
        yield scrapy.Request(response.urljoin(nextpage.strip()))
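# The Douban "pub" line packs author, publisher, date and price into one
# slash-separated string. A worked sketch of the pop() sequence above, using
# an illustrative value (not taken from a real response):
pub = '[美] Kathy Sierra / 中国电力出版社 / 2007-2 / 79.00元'.strip().split('/')
price = pub.pop()       # ' 79.00元'
date = pub.pop()        # ' 2007-2 '
publish = pub.pop()     # ' 中国电力出版社 '
author = '/'.join(pub)  # '[美] Kathy Sierra '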
def parse(self, response):
    sel = Selector(response)
    book_list = sel.css('#subject_list > ul > li')
    for book in book_list:
        item = BooksItem()
        try:
            item['book_name'] = book.xpath(
                'div[@class="info"]/h2/a/text()').extract()[0].strip()
            item['book_star'] = book.xpath(
                'div[@class="info"]/div[2]/span[@class="rating_nums"]/text()'
            ).extract()[0].strip()
            item['book_pl'] = book.xpath(
                'div[@class="info"]/div[2]/span[@class="pl"]/text()'
            ).extract()[0].strip()
            pub = book.xpath('div[@class="info"]/div[@class="pub"]/text()'
                             ).extract()[0].strip().split('/')
            item['book_price'] = pub.pop()
            item['book_date'] = pub.pop()
            item['book_publish'] = pub.pop()
            item['book_author'] = '/'.join(pub)
            yield item
        except Exception as e:
            print(e)
    # extract_first() avoids an IndexError on the last page, where the
    # "next" link is absent.
    nextPage = sel.xpath(
        '//div[@id="subject_list"]/div[@class="paginator"]/span[@class="next"]/a/@href'
    ).extract_first()
    if nextPage:
        next_url = 'https://book.douban.com' + nextPage.strip()
        yield Request(next_url, callback=self.parse)
def parse_review_page(self, response):
    Title = response.xpath(
        '//div[@id="metacol"]/h1/text()').extract_first().strip()
    Author = response.xpath(
        '//div[@class="authorName__container"]/a/span/text()'
    ).extract_first()
    Score = response.xpath(
        '//div[@id="bookMeta"]/span/text()').extract_first().strip()
    Pages = response.xpath(
        '//span[@itemprop="numberOfPages"]/text()').extract_first().strip()
    # Raw strings keep the '\d' escapes from triggering DeprecationWarnings.
    Pages = int(re.findall(r'\d+', Pages)[0])
    Genre = response.xpath(
        '//a[@class="actionLinkLite bookPageGenreLink"]/text()').extract()
    Year = str(response.xpath('//div[@class="row"]//text()').extract())
    Year = int(re.findall(r'((200|201)\d+)', Year)[0][0])
    item = BooksItem()
    item['Title'] = Title
    item['Author'] = Author
    item['Score'] = Score
    item['Pages'] = Pages
    item['Genre'] = Genre
    item['Year'] = Year
    yield item
def parse(self, response):
    divs = response.xpath("//div[@father='1']")[2:12]
    for div in divs:
        item = BooksItem()
        item["l_one"] = [
            x.strip() for x in div.xpath("./dl/dt//text()").extract()
            if len(x.strip()) > 0
        ][0]
        dls = div.xpath(".//dl[@class='inner_dl']")
        for dl in dls:
            item["l_two"] = [
                x.strip() for x in dl.xpath("./dt/a/text()").extract()
            ]
            if not item["l_two"]:
                item["l_two"] = dl.xpath(
                    "./dt[position()=1]//text()").extract_first().strip()
            else:
                item["l_two"] = item["l_two"][0]
            a_list = dl.xpath("./dd/a")
            for a in a_list:
                item["l_three"] = a.xpath("./text()").extract_first()
                item['category_url'] = a.xpath("./@href").extract_first()
                # deepcopy() gives each request its own snapshot of the item,
                # so later loop iterations cannot mutate it in flight.
                yield scrapy.Request(item['category_url'],
                                     callback=self.parse_detail,
                                     meta=deepcopy(item))
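# Scrapy stores its own keys (e.g. 'depth') in response.meta as well, so the
# common pattern is to nest the item under a key of its own. A minimal sketch
# of that variant, including the receiving callback (parse_detail is not shown
# in the original, so its shape here is an assumption):
def parse(self, response):
    ...
    yield scrapy.Request(item['category_url'], callback=self.parse_detail,
                         meta={'item': deepcopy(item)})

def parse_detail(self, response):
    item = response.meta['item']
    # ... fill in detail-page fields, then:
    yield item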
def parse(self, response):
    sel = Selector(response)
    book_list = sel.css('#subject_list > ul > li')
    for book in book_list:
        item = BooksItem()
        try:
            # strip() removes the given leading/trailing characters
            # (whitespace by default).
            item['book_name'] = book.xpath(
                'div[@class="info"]/h2/a/text()').extract()[0].strip()
            item['book_star'] = book.xpath(
                "div[@class='info']/div[2]/span[@class='rating_nums']/text()"
            ).extract()[0].strip()
            item['book_pl'] = book.xpath(
                "div[@class='info']/div[2]/span[@class='pl']/text()"
            ).extract()[0].strip()
            pub = book.xpath('div[@class="info"]/div[@class="pub"]/text()'
                             ).extract()[0].strip().split('/')
            item['book_price'] = pub.pop()
            item['book_date'] = pub.pop()
            item['book_publish'] = pub.pop()
            item['book_author'] = '/'.join(pub)
            yield item
        except Exception:
            pass
    nextPage = sel.xpath(
        '//div[@id="subject_list"]/div[@class="paginator"]/span[@class="next"]/a/@href'
    ).extract_first()
    if nextPage:
        next_url = 'https://book.douban.com' + nextPage.strip()
        yield scrapy.http.Request(next_url, callback=self.parse)
def detail_parse(self, response):
    item = BooksItem()
    content = []
    selector = Selector(response)
    reads = selector.xpath('//div[@id="tab1"]/div/ol/li').extract()
    for read in reads:
        read = read.replace("<li>", "").replace("</li>", "").replace("\r", " ")
        content.append(read)
    str_convert = ''.join(content)
    price = selector.xpath(
        '//*[@id="book_cart_box"]/div[2]/div/div[1]/text()').extract()
    item['title'] = response.meta["title"]
    item['price'] = price
    item['content'] = str_convert
    item['bookurl'] = response.meta["bookurl"]
    yield item
    nextLink = selector.xpath(
        '//div[@class="plain_page"]/div/span/span/a/@href').extract()
    # Page 10 is the last page; it has no next-page link.
    if nextLink:
        nextLink = nextLink[0]
        print(nextLink)
        yield Request(self.url + nextLink, callback=self.parse)
def parse(self, response):
    for book in response.xpath('//article[@class="product_pod"]'):
        # One item per book; reusing a single instance across yields would
        # let later books overwrite earlier ones.
        items = BooksItem()
        items['title'] = book.xpath('./h3/a/@title').extract_first()  # book title
        items['price'] = book.xpath(
            './div/p[@class="price_color"]/text()').extract_first()  # book price
        review = book.xpath('./p[1]/@class').extract_first()  # star-rating class
        items['review'] = review.split(' ')[-1]
        self.Q.put(f"{items['title']}\n{items['price']}\n{items['review']}\n")
        yield items
def parse_book_detail(self, response):
    if not response.css(".h1").xpath('./a/text()').get():
        # The page came back incomplete; re-request it (dont_filter bypasses
        # the dupefilter) and stop here instead of yielding a half-empty item.
        yield scrapy.Request(url=response.url, dont_filter=True)
        return
    item = BooksItem()
    item['title'] = response.xpath('//h1/text()').get()
    item['description'] = response.css('.product_page').xpath(
        './p/text()').get()
    item['price'] = response.css('.price_color::text').get()
    item['UPC'] = response.css('tr td::text').extract_first()
    item['rating'] = response.css('p.star-rating').xpath(
        './@class').get().split(' ')[1]
    yield item
def parse(self, response):
    lies = response.css('ol.row > li')
    for li in lies:
        # A fresh item per row; the original reused one instance for every yield.
        item = BooksItem()
        # extract_first() yields a single string rather than a one-element list.
        item['title'] = li.xpath('article/div/a/img/@alt').extract_first()
        item['price'] = li.css('div.product_price p::text').extract_first()
        item['star'] = li.xpath('article/p/@class').extract_first()
        yield item
    next_url = response.xpath('//li[@class="next"]/a/@href').extract_first()
    if next_url:
        next_url = response.urljoin(next_url)
        yield scrapy.Request(next_url, callback=self.parse)
def getinfo(self, response):
    item = BooksItem()
    bookname = response.xpath('//h1/text()').extract()[0]
    bookprice = response.xpath('//div/p/text()').extract()[0]
    # Strip the currency symbol. The original pattern r"[\d+\.\d+]" was a
    # character class (any single digit, '+' or '.'); a real search for the
    # numeric part is what was meant.
    bookprice = re.search(r"\d+\.\d+", bookprice).group()
    bookdescription = response.xpath('//article/p/text()').extract()[0]
    bookid = response.xpath('//tr[1]/td/text()').extract()[0]
    item['name'] = bookname
    item['price'] = bookprice
    item['des'] = bookdescription
    item['id'] = bookid
    yield item
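# Why the original pattern misbehaved: inside [...] the characters form a set,
# not a sequence, so r"[\d+\.\d+]" matches any single digit, '+' or '.'.
# A quick illustrative check (the price string is made up):
import re
print(re.findall(r"[\d+\.\d+]", "£51.77+tax"))       # ['5', '1', '.', '7', '7', '+']
print(re.search(r"\d+\.\d+", "£51.77+tax").group())  # '51.77'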
def get_content(self, response):
    item = BooksItem()
    name = response.xpath("//span[@id='breadnav']/a[2]/text()").get()
    item["name"] = name
    zhangjie = response.xpath("//h1[@id='title']/text()").get()
    item["zhangjie"] = zhangjie
    # Concatenate all paragraph texts of the chapter body.
    contents = response.xpath("//div[@class='vcon']/p/text()").extract()
    item["content"] = "".join(contents)
    yield item
def parse_item(self, response):
    item = BooksItem()
    if response.status == 200:
        item['url'] = response.url
        item['name'] = response.xpath(
            "//div[@id='wrapper']/h1/span/text()").extract_first()
        _info = response.xpath("//div[@id='info']//text()").extract()
        info = [s.strip() for s in _info if s.strip() != '']
        # Default every field, then fill in whatever labels the info block
        # actually contains (the layout varies between book pages).
        item['author'] = ''
        item['publishing'] = ''
        item['publish_time'] = ''
        item['page'] = ''
        item['price'] = ''
        item['ISBN'] = ''
        item['score'] = 0.0
        item['evaluation_num'] = 0
        if '作者' in info:
            item['author'] = info[info.index('作者') + 2]
        if '作者:' in info:
            item['author'] = info[info.index('作者:') + 1]
        if '出版社:' in info:
            item['publishing'] = info[info.index('出版社:') + 1]
        if '出版年:' in info:
            item['publish_time'] = info[info.index('出版年:') + 1]
        if '页数:' in info:
            item['page'] = info[info.index('页数:') + 1]
        if '定价:' in info:
            item['price'] = info[info.index('定价:') + 1]
        if 'ISBN:' in info:
            item['ISBN'] = info[info.index('ISBN:') + 1]
        flag = response.xpath(
            "//div[@class='rating_sum']/span/a/text()").extract()
        if flag and flag[0] == '人评价':
            item['score'] = response.xpath(
                "//div[@class='rating_self clearfix']/strong/text()"
            )[0].extract().strip()
            item['evaluation_num'] = response.xpath(
                "//a[@class='rating_people']/span/text()")[0].extract()
        yield item
    else:
        print('*********************** something wrong ***********************')
def category_parse(self, response):
    # Extract the data of each book on the page.
    for book in response.xpath("//section/div/ol[@class='row']/li"):
        loader = ItemLoader(item=BooksItem(), selector=book)
        loader.add_xpath('book',
                         ".//article[@class='product_pod']/h3/a/@title")
        loader.add_xpath(
            'price',
            ".//article[@class='product_pod']/div[@class='product_price']"
            "/p[@class='price_color']/text()")
        loader.add_xpath('image_url',
                         ".//div[@class='image_container']//img//@src")
        loader.add_xpath('book_url',
                         ".//div[@class='image_container']//a//@href")
        yield loader.load_item()
    # Follow the next page if it exists.
    next_page = response.xpath(
        "//section/div/div//ul[@class='pager']/li[@class='next']/a/@href"
    ).extract_first()
    if next_page is not None:
        next_page_link = response.urljoin(next_page)
        yield scrapy.Request(url=next_page_link, callback=self.category_parse)
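# ItemLoader returns every add_xpath() result as a list unless the item's
# fields declare output processors. A sketch of a BooksItem that would make
# load_item() emit plain strings (field names come from the loader calls
# above; the processor choices are assumptions, not the project's code):
import scrapy
from itemloaders.processors import MapCompose, TakeFirst  # scrapy.loader.processors on older Scrapy

class BooksItem(scrapy.Item):
    book = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(input_processor=MapCompose(str.strip),
                         output_processor=TakeFirst())
    image_url = scrapy.Field(output_processor=TakeFirst())
    book_url = scrapy.Field(output_processor=TakeFirst())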
def book_parse(self, response: HtmlResponse):
    # A book has one title, so extract_first() beats a one-element list;
    # author stays a list because there may be several authors.
    name = response.xpath('//h1/text()').extract_first()
    link = response.url
    author = response.xpath(
        '//div[@class="authors"][1]/a/text()').extract()
    old_price = response.xpath(
        '//span[@class="buying-priceold-val-number"]/text()').extract_first()
    new_price = response.xpath(
        "//span[@class='buying-pricenew-val-number']/text()").extract_first()
    rating = response.xpath("//div[@id='rate']/text()").extract_first()
    yield BooksItem(name=name, link=link, author=author,
                    old_price=old_price, new_price=new_price, rating=rating)
def parse_book(self, response):
    nome = response.css('.product_main').css('h1::text').get()
    preco = response.css('.product_main').css(
        '.price_color::text').get().replace('£', '')
    # css() always returns a SelectorList, so `is not None` was always true
    # (and the False branch left the variable undefined); test whether the
    # selector actually matched instead.
    disponivel = bool(response.css('.product_main').css(
        '.availability.instock'))
    regex = re.compile(r'\d+')
    quantidade = regex.search(
        response.css('table tr:nth-child(6) td').get()).group()
    avaliacao = response.css('.star-rating').xpath("@class").get().replace(
        'star-rating ', '')
    if avaliacao == 'One':
        avaliacao = 1
    elif avaliacao == 'Two':
        avaliacao = 2
    elif avaliacao == 'Three':
        avaliacao = 3
    elif avaliacao == 'Four':
        avaliacao = 4
    else:
        avaliacao = 5
    categoria = response.css('.breadcrumb li:nth-child(3)').css(
        'a::text').get()
    UPC = response.css('table tr:nth-child(1) td::text').get()
    url = response.request.url
    yield BooksItem(nome=nome, preco=float(preco), disponivel=disponivel,
                    quantidade=int(quantidade), avaliacao=float(avaliacao),
                    categoria=categoria, UPC=UPC, url=url,
                    data=datetime.now().isoformat())
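# The word-to-number ladder above can collapse into the lookup table the first
# snippet in this collection uses; a behavior-preserving sketch (the default
# of 5 mirrors the original else-branch, helper name is hypothetical):
RATINGS = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}

def rating_to_number(css_class):
    """Map e.g. 'star-rating Three' to 3; unknown values default to 5."""
    return RATINGS.get(css_class.replace('star-rating ', ''), 5)

# usage inside parse_book:
# avaliacao = rating_to_number(response.css('.star-rating').xpath('@class').get())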
def book_parse(self, response: HtmlResponse):
    name = response.xpath(
        '//h1[@class="item-detail__title"]/text()').extract()
    link = response.url
    author = response.xpath(
        "//a[@class='item-tab__chars-link js-data-link']/text()").extract()
    old_price = response.xpath(
        "//div[@class='item-actions__price-old']/text()").extract_first()
    new_price = response.xpath(
        "//div[@class='item-actions__prices']//b/text()").extract_first()
    rating = response.xpath(
        "//span[@class='rating__rate-value']/text()").extract_first()
    yield BooksItem(name=name, link=link, author=author,
                    old_price=old_price, new_price=new_price, rating=rating)
def parse(self, response):
    for info in response.css('.product_pod'):
        item = BooksItem()
        item['name'] = info.css('h3 > a::attr(title)').extract_first()
        item['price'] = info.css(
            '.product_price .price_color::text').extract_first()
        yield item
        bookstr = item['name'] + '\t' + item['price'] + '\n'
        self.f.write(bookstr)
    # An equivalent alternative to the LinkExtractor below:
    # next_url = response.css('.pager .next > a::attr(href)').extract_first()
    le = LinkExtractor(restrict_css='ul.pager li.next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        yield scrapy.Request(next_url, callback=self.parse)
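# LinkExtractor plus a hand-built Request is exactly what CrawlSpider rules
# automate; a minimal sketch of the same pagination as a rule (class name,
# spider name and the dict item are assumptions, not the original spider):
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class BooksCrawlSpider(CrawlSpider):
    name = 'books_crawl'
    start_urls = ['http://books.toscrape.com/']
    rules = (
        Rule(LinkExtractor(restrict_css='ul.pager li.next'),
             callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        for info in response.css('.product_pod'):
            yield {'name': info.css('h3 > a::attr(title)').get()}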
def parse(self, response):
    books = Selector(response).xpath('//article[@class="product_pod"]')
    for book in books:
        item = BooksItem()
        item['title'] = book.xpath('div/a/img/@alt').extract()[0]
        item['price'] = book.xpath(
            'div/p[@class="price_color"]/text()').extract()[0]
        instock_status = "".join(
            book.xpath(
                'div/p[@class="instock availability"]/text()').extract())
        # strip() already removes newlines along with other whitespace.
        item['in_stock'] = instock_status.strip()
        rating = book.xpath(
            'p[contains(@class, "star-rating")]/@class').extract()[0]
        item['rating'] = rating.replace("star-rating ", "")
        item['url'] = book.xpath(
            'div[@class="image_container"]/a/@href').extract()[0]
        yield item
def parse(self, response):
    data = OrderedDict(BooksItem())
    for book in response.css('ol.row'):
        for b in book.css('article.product_pod'):
            data['title'] = b.css('a::attr(title)').getall()
            # The remaining fields, without importing items:
            # data['price'] = b.css('div.product_price p.price_color::text').getall()
            # data['stock'] = b.css('div.product_price p.instock.availability::text').getall()[1].strip()
            # data['rating'] = conv[b.css('p::attr(class)').getall()[0].split()[-1]]
            yield data
    # get() returns None on the last page, where getall()[0] would raise
    # an IndexError.
    nextPage = response.css('li.next a::attr(href)').get()
    if nextPage and self.COUNT < self.LIMIT:
        self.COUNT += 1
        yield scrapy.Request(url=response.urljoin(nextPage),
                             callback=self.parse)
def parse(self, response):
    getItems = response.css('.wrap .item')
    for getItem in getItems:
        item = BooksItem()
        item['title'] = getItem.css('h4 a::text').get()
        item['author'] = getItem.css('.info a::text').getall()[0]
        item['price'] = getItem.css('.price_box .set2 strong::text').getall()[1]
        yield item
    next_page = response.css('.wrap .nxt::attr(href)').get()
    print(next_page)
    # The instructor taught `yield response.follow(next_page, self.parse)`,
    # but it did not work when I tried it. Could that be because we are
    # crawling a different site? The course example crawled PTT.
    # Note: Request() takes `url=`, not `start_urls=`, and a local `page`
    # counter would reset to 0 on every parse() call, so the limit must live
    # on the spider (assumes `page = 0` is initialized as a class attribute).
    if next_page is not None and self.page < 3:
        self.page += 1
        yield Request(url=response.urljoin(next_page), callback=self.parse)
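# On the question in the comments above: response.follow() resolves relative
# hrefs against the current page itself, so it usually works where a
# hand-built Request fails on a relative URL. A minimal sketch of the tail of
# parse() using it (still assuming a `page` attribute on the spider):
def parse(self, response):
    ...
    next_page = response.css('.wrap .nxt::attr(href)').get()
    if next_page is not None and self.page < 3:
        self.page += 1
        # follow() accepts a relative URL and builds the absolute Request
        yield response.follow(next_page, callback=self.parse)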
def parse(self, response):
    selector = Selector(response)
    books = selector.xpath('//a[@class="tip"]')
    for book in books:
        # A fresh item per book; `id` renamed so it no longer shadows the builtin.
        item = BooksItem()
        title = book.xpath('./text()').extract_first()
        item['title'] = title
        book_id = book.xpath('./@href').extract()[0]
        whichbook = "http://www.oreilly.com.cn/{}".format(str(book_id))
        item['content'] = whichbook
        yield item
        # yield scrapy.Request(url=whichbook, callback=self.detail_parse,
        #                      meta={"title": title})
    nextLink = selector.xpath(
        '//div[@class="plain_page"]/div/span/span/a/@href').extract()
    # Page 10 is the last page; it has no next-page link.
    if nextLink:
        nextLink = nextLink[0]
        print(nextLink)
        yield Request(self.url + nextLink, callback=self.parse)
def parse(self, response):
    urls = "https://book.douban.com/tag/"
    isFirst = re.findall(r'start=(\d*)&', response.url)
    if not isFirst:
        # First visit: queue every related tag page we have not seen yet
        # (tag.txt acts as a crude persistent dupefilter).
        related = response.xpath(
            '//div[@class="tags-list"]/a/@href').extract()
        for rel in related:
            next_tag = 'https://book.douban.com' + parse.quote(rel)
            with open("tag.txt") as f:
                url_list = f.read()
            if next_tag not in url_list:
                with open("tag.txt", "a") as f:
                    f.write(next_tag + '----')
                yield Request(next_tag)
    # Check whether this is a tag listing page or a book detail page.
    result = (response.url.split('ag')[0]) in urls
    item = BooksItem()
    if result:
        # Listing page: queue each unseen book detail link.
        count = 0
        for href in response.xpath(
            '//li[@class="subject-item"]/div[@class="info"]//h2//a/@href'
        ).extract():
            count += 1
            with open("url.txt") as f:
                url_list = f.read()
            if href not in url_list:
                with open("url.txt", "a") as f:
                    f.write(href + '----')
                print(href)
                yield Request(href)
        print(response.url)
        if count == 0:
            print('-' * 63)
        # Queue the next listing page.
        next_page = response.xpath(
            '//span[@class="next"]/link/@href').extract_first()
        if next_page:
            yield Request('https://book.douban.com' + next_page)
    else:
        # Detail page: scrape the book's metadata field by field; each lookup
        # falls back to a sentinel when the element is missing.
        item['url'] = response.url
        try:
            item['title'] = response.xpath(
                '//*[@id="wrapper"]/h1/span/text()').extract()[0]
        except Exception:
            print("Title not found!")
            item['title'] = 0
        try:
            item['score'] = response.xpath(
                '//div[@id="interest_sectl"]//strong/text()'
            ).extract()[0].replace(' ', '')
        except Exception:
            print("Score not found")
            item['score'] = 0
        try:
            item['number'] = response.xpath(
                '//div[@class="rating_sum"]//a//text()'
            ).extract()[0].replace('\n', '')
        except Exception:
            print("Rating count not found")
            item['number'] = 0
        # Basic info block.
        try:
            base_info = response.xpath('//div[@id="info"]').extract()[0].replace(
                '\n', '').replace(' ', '')
            item['info'] = base_info
        except Exception:
            item['info'] = None
        try:
            author = re.findall(r"作者(.*?)</a>", base_info, re.S)[0]
            item['author'] = re.compile(r'">(.*)').findall(author)[0]
        except Exception:
            print("Author not found!")
            item['author'] = 0
        try:
            part = re.findall(r"译者</span>(.*?)</a>", base_info, re.S)[0]
            item['trans'] = re.compile(r'">(.*)').findall(part)[0]
        except Exception:
            print("Translator lookup failed!")
            item['trans'] = 0
        try:
            press = re.findall(r'出版社(.*?)<br>', base_info)[0]
            item['press'] = re.compile(r'</span>(.*)').findall(press)[0]
        except Exception:
            print("Publisher not found!")
            item['press'] = 0
        try:
            item['price'] = re.findall(r'定价:</span>(\d*.\d*)', base_info)[0]
        except Exception:
            item['price'] = 0
        try:
            item['pages'] = re.findall(r'页数:</span>(\d*)<br>', base_info)[0]
        except Exception:
            item['pages'] = None
        try:
            ret = re.findall(r'出版年(.*?)<br>', base_info)[0]
            item['publish_date'] = re.findall(r'\d*-\d+', ret)[0]
        except Exception:
            print("Publication date not found!")
            item['publish_date'] = 0
        try:
            item['ISBN'] = re.findall(r'ISBN:</span>(\d*)', base_info)[0]
        except Exception:
            item['ISBN'] = None
        reads = response.text
        try:
            item['reading'] = re.findall(r'>(\d*)人在读', reads)[0]
        except Exception:
            item['reading'] = 0
        try:
            item['read_want'] = re.findall(r'>(\d*)人想读', reads)[0]
        except Exception:
            item['read_want'] = 0
        try:
            item['read'] = re.findall(r'>(\d*)人读过', reads)[0]
        except Exception:
            item['read'] = 0
        try:
            item['label'] = response.xpath(
                '//div[@id="db-tags-section"]/div[@class="indent"]/span/a/text()'
            ).extract()
        except Exception:
            print("No labels!")
            item['label'] = 0
        try:
            item['image'] = response.xpath(
                '//*[@id="mainpic"]/a/img/@src').extract()[0]
        except Exception:
            item['image'] = None
        try:
            item['short'] = response.xpath(
                '//*[@id="comments"]//p[@class="comment-content"]/span/text()'
            ).extract()
        except Exception:
            item['short'] = 0
        try:
            ret = response.xpath(
                '//div[@id="buyinfo-ebook"]//li//text()').extract()
            # findall() needs a string, not a list, so join the fragments first.
            item['price_d'] = re.findall(r'(\d*.\d*)', ''.join(ret))[0]
        except Exception:
            item['price_d'] = 0
        try:
            ret = response.xpath(
                '//div[@class="mod-hd"]/h2/span/a/text()').extract()[0]
            item['short_number'] = re.findall(r'(\d+)', ret)[0]
        except Exception:
            item['short_number'] = 0
        try:
            ret = response.xpath(
                '//section[@class="reviews mod book-content"]//h2/span/a/text()'
            ).extract()[0]
            item['book_number'] = re.findall(r'(\d+)', ret)[0]
        except Exception:
            item['book_number'] = 0
        try:
            ret = response.xpath(
                '//div[@class="ugc-mod reading-notes"]//span/a/span/text()'
            ).extract()[0]
            item['note_number'] = ret
        except Exception:
            item['note_number'] = 0
        try:
            distr = response.xpath('//div[@class="rating_wrap clearbox"]'
                                   ).extract()[0].replace('\n', '').replace(' ', '')
            part = re.compile(r"(\d*.\d*)%")
            item['one'] = part.findall(distr)[4]
            item['two'] = part.findall(distr)[3]
            item['three'] = part.findall(distr)[2]
            item['four'] = part.findall(distr)[1]
            item['five'] = part.findall(distr)[0]
        except Exception:
            print('Rating distribution not found')
            item['one'] = 0
            item['two'] = 0
            item['three'] = 0
            item['four'] = 0
            item['five'] = 0
        # Insert the scraped record into the MongoDB collection (`all` is the
        # module-level collection object).
        postitem = dict(item)
        all.insert(postitem)
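# The detail branch above repeats the same try/extract()[0]/except ladder
# dozens of times; a small helper (hypothetical, not part of the original
# project) would collapse each lookup to one line:
def first(selector_list, default=0):
    """Return the first extracted value, or `default` when nothing matched."""
    values = selector_list.extract()
    return values[0] if values else default

# usage sketch:
# item['title'] = first(response.xpath('//*[@id="wrapper"]/h1/span/text()'))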
def detail_parse(self, response):
    item = BooksItem()
    # Price API endpoint for reference:
    # http://p.3.cn/prices/mgets?skuIds=J_11252778,J_&type=1
    # The description block is rendered by JavaScript, so it is fetched with a
    # headless browser rather than read from the raw response.
    bookurl = response.meta["bookurl"]
    driver = webdriver.PhantomJS()
    driver.get(bookurl)
    content = driver.find_element_by_xpath(
        '//*[@id="detail-tag-id-6"]/div[2]/div').text
    driver.close()
    try:
        item['title'] = response.meta["title"]
        item['price'] = response.meta["price"]
        item['content'] = content
        item['bookurl'] = response.meta["bookurl"]
        yield item
    except Exception:
        pass
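# PhantomJS is unmaintained and recent Selenium releases drop
# webdriver.PhantomJS entirely; a minimal sketch of the same fetch with
# headless Chrome (Selenium 4 style; the XPath is carried over unchanged,
# and `bookurl` is the URL taken from response.meta above):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
driver.get(bookurl)  # bookurl as obtained in detail_parse
content = driver.find_element(
    By.XPATH, '//*[@id="detail-tag-id-6"]/div[2]/div').text
driver.quit()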