def parse(self, response):
    item = JdspiderItem()
    selector = Selector(response)
    Books = selector.xpath('/html/body/div[8]/div[2]/div[3]/div/ul/li')
    for each in Books:
        num = each.xpath('div[@class="p-num"]/text()').extract()
        bookName = each.xpath('div[@class="p-detail"]/a/text()').extract()
        author = each.xpath('div[@class="p-detail"]/dl[1]/dd/a[1]/text()').extract()
        press = each.xpath('div[@class="p-detail"]/dl[2]/dd/a/text()').extract()
        temphref = each.xpath('div[@class="p-detail"]/a/@href').extract()
        temphref = str(temphref)
        BookID = str(re.search(r'com/(.*?)\.html', temphref).group(1))
        # The price is loaded dynamically, so query JD's price API for this SKU.
        json_url = 'http://p.3.cn/prices/mgets?skuIds=J_' + BookID
        r = requests.get(json_url).text
        data = json.loads(r)[0]
        price = data['m']
        PreferentialPrice = data['p']
        item['number'] = num
        item['bookName'] = bookName
        item['author'] = author
        item['press'] = press
        item['BookID'] = BookID
        item['price'] = price
        item['PreferentialPrice'] = PreferentialPrice
        yield item
    # Follow the "next page" link, if present.
    nextLink = selector.xpath('/html/body/div[8]/div[2]/div[4]/div/div/span/a[7]/@href').extract()
    if nextLink:
        nextLink = nextLink[0]
        print(nextLink)
        yield Request(nextLink, callback=self.parse)
def parse_detail(self, response):
    item = JdspiderItem()
    item['collect_date'] = time.strftime("%Y-%m-%d %H:%M:%S")
    item['url'] = response.request.url
    item['cat'] = response.meta['cat']
    item['skuId'] = response.meta['skuId']
    item['title'] = response.xpath("//div[@class='item ellipsis']/text()").extract_first()
    print(item['title'])
    lis = response.xpath("//div[@class='p-parameter']/ul/li")
    item['publish'], item['ISBN'], item['edition'], item['brand'], item['series_name'], item['publish_date'] = "", "", "", "", "", ""
    for li in lis:
        desc = re.sub('\r|\n|\t|\s', '', li.xpath("string(.)").extract_first())
        item['publish'] = desc.split(':')[1] if desc.count('出版社') else item['publish']
        item['ISBN'] = desc.split(':')[1] if desc.count('ISBN') else item['ISBN']
        item['edition'] = desc.split(':')[1] if desc.count('版次') else item['edition']
        item['brand'] = desc.split(':')[1] if desc.count('品牌') else item['brand']
        item['series_name'] = desc.split(':')[1] if desc.count('丛书名') else item['series_name']
        item['publish_date'] = desc.split(':')[1] if desc.count('出版时间') else item['publish_date']
    yield scrapy.Request(
        "https://c0.3.cn/stock?skuId={}&cat={}&area=1_72_4137_0".format(item['skuId'], item['cat']),
        callback=self.get_other_info,
        meta={'item': item},
        dont_filter=False
    )
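# get_other_info is referenced above but not shown here. A minimal sketch, assuming the
# stock endpoint returns JSON and that the fields of interest live under a "stock" key;
# the exact response shape and the 'stock_state' item field are assumptions, not part of
# the original project.
def get_other_info(self, response):
    item = response.meta['item']
    try:
        stock = json.loads(response.text).get('stock', {})  # assumed response shape
        item['stock_state'] = stock.get('StockStateName', '')  # hypothetical field on JdspiderItem
    except (ValueError, AttributeError):
        item['stock_state'] = ''
    yield item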
def parse(self, response):
    global count
    global tot_item
    global trg_item
    tot_item = 0
    trg_item = 0
    item = JdspiderItem()
    selector = Selector(response)
    Pages = selector.xpath('/html/body/li')
    for each in Pages:
        product_id = each.xpath('@data-sku').extract()
        name = each.xpath('div/div[3]/a/em/text()').extract()
        price = each.xpath('div/div[2]/strong/i/text()').extract()
        item['name'] = name
        item['product_id'] = product_id
        item['price'] = price
        tot_item += 1
        if '$$query$$' in name:
            trg_item += 1
        yield item
    count += 1
    print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
    print(count)
    print('http://search.jd.com/s_new.php?keyword=$$query$$&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=3&page=' + str(count) + '&s=26&scrolling=y&pos=30')
    print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
    # Request the next "scrolling" page while fewer than half the items matched the query,
    # up to a limit of 10 pages.
    if (tot_item * 0.5 > trg_item) and (count <= 10):
        yield Request(
            'http://search.jd.com/s_new.php?keyword=$$query$$&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=3&page=' + str(count) + '&s=26&scrolling=y&pos=30',
            callback=self.parse)
def parse(self, response):
    pre_item = copy.copy(response.meta)
    html = json.loads(response.text)
    for price in eval(html['datePrice']):
        item = JdspiderItem()
        item['product_id'] = pre_item['product_id']
        item['date'] = price[0]
        item['price'] = price[1]
        yield item
def parse(self, response):
    meta = copy.copy(response.meta)
    page = requests.get("http://tool.manmanbuy.com/history.aspx", headers=headers, params=meta['data'])
    content = json.loads(page.text)
    datePrices = eval('[' + content['datePrice'] + ']')
    for datePrice in datePrices:
        item = JdspiderItem()
        item['date'] = self._time_process(datePrice[0])
        item['product_id'] = meta['product_id']
        item['price'] = datePrice[1]
        item['campaign'] = datePrice[2]
        yield item
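# _time_process is used above but not defined here. A minimal sketch, assuming
# datePrice[0] is a millisecond Unix timestamp; if the manmanbuy feed actually returns
# a date string, this helper would need adjusting. Requires `import datetime` at module level.
def _time_process(self, stamp):
    return datetime.datetime.fromtimestamp(stamp / 1000).strftime('%Y-%m-%d')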
def detail_parse(self, response):
    # Item object
    item = JdspiderItem()
    try:
        # Extracted field: ID
        item['ID'] = response.xpath('//div[@class="p-parameter"]/ul[3]/li[2]/@title').extract()[0]
        # Extracted field: name
        item['name'] = response.xpath('//div[@class="p-parameter"]/ul[3]/li[1]/@title').extract()[0]
        # Extracted field: brand
        item['brand'] = response.xpath('//ul[@id="parameter-brand"]/li/@title').extract()[0]
        # Extracted field: resolution
        item['resolution'] = response.xpath('//div[@class="p-parameter"]//li[@class="fore0"]//div[@class="detail"]/p/@title').extract()[0]
        # Extracted field: weight
        item['weight'] = response.xpath('//div[@class="p-parameter"]//ul[3]/li[4]/@title').extract()[0]
        # Extracted field: image_url
        item['image_url'] = 'https:' + str(response.xpath('//img[@id="spec-img"]/@data-origin').extract()[0])
        # Extracted field: store
        item['store'] = response.xpath('//div[@class="p-parameter"]//ul[3]/li[3]/@title').extract()[0]
        # Emit the item
        yield item
        # The cumulative comment count is not present in the page source, so the
        # corresponding XHR request has to be located in the network panel to find its URL.
        try:
            # Detail URL for the cumulative comment count.
            comment_count_url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + str(item['ID'])
            # Issue the request
            yield scrapy.Request(
                comment_count_url,
                callback=self.comment_count_parse,
                meta={'item': item}
            )
        except:
            print("Failed to extract the cumulative comment count!")
        # Likewise, the price is a dynamic field and has to be fetched from its own endpoint.
        try:
            # Full price URL.
            price_url = 'https://pm.3.cn/prices/pcpmgets?callback=jQuery&skuids={}&origin=2'.format(str(item['ID']))
            # Issue the request
            yield scrapy.Request(
                price_url,
                callback=self.price_parse,
                meta={'item': item}
            )
        except:
            print("Failed to extract the price!")
    except:
        print("ERROR")
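# comment_count_parse and price_parse are referenced above but not included here.
# Minimal sketches follow: the comment-summary JSON shape mirrors what the other spiders
# in this file parse ('CommentsCount'[0]['CommentCount']), while the JSONP unwrapping and
# the 'CommentCount' / 'price' item fields are assumptions about JdspiderItem.
def comment_count_parse(self, response):
    item = response.meta['item']
    data = json.loads(response.text)['CommentsCount'][0]
    item['CommentCount'] = data['CommentCount']  # assumed item field
    yield item

def price_parse(self, response):
    item = response.meta['item']
    # The endpoint was requested with callback=jQuery, so strip the JSONP padding first.
    body = re.search(r'\((.*)\)', response.text).group(1)  # assumed JSONP wrapper
    item['price'] = json.loads(body)[0].get('p', '')  # assumed response shape and item field
    yield item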
def parse(self, response):
    # print(response.text)
    html = json.loads(response.text)
    pre_item = copy.copy(response.meta)
    category_id = pre_item['category_id']
    shop_id = pre_item['shop_id']
    for data in html['data']:
        item = JdspiderItem()
        item['product_id'] = data['itemid']
        item['category_id'] = category_id
        item['product_name'] = data['t']
        item['shop_id'] = shop_id
        item['price'] = data['jp']
        item['sales_volume'] = data['w']
        item['date'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        yield item
def parse(self, response):
    # Follow every link on the page; the scheduler's dedupe filter prevents revisits.
    for link in LinkExtractor(allow=()).extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse)
    product_id = self.get_product_id(response.url)
    if product_id:
        loader = JdItemLoader(item=JdspiderItem(), response=response)
        loader.add_xpath('name', '//div[@id="crumb-wrap"]//div[@class="item ellipsis"]/text()')  # normal page
        loader.add_xpath('name', '//div[@class="breadcrumb"]/span[2]/a[2]/text()')  # eg: https://item.jd.com/2386353.html
        loader.add_xpath('title', '//div[@class="w"]/div[@class="product-intro clearfix"]//div[@class="sku-name"]/text()')  # normal page
        loader.add_xpath('title', '//div[@id="itemInfo"]/div[@id="name"]/h1/text()')  # eg: https://item.jd.com/2386353.html
        loader.add_value('product_id', product_id)
        loader.add_xpath('merchant', '//div[@class="J-hove-wrap EDropdown fr"]/div[@class="item"]/div[@class="name"]/a/text()')
        loader.add_xpath('merchant', '//div[@class="seller-infor"]/a/text()')  # eg: https://item.jd.com/2386353.html
        loader.add_xpath('merchant_grade', '//div[@class="J-hove-wrap EDropdown fr"]/div[@class="item"]/div[@class="name"]/em/text()')  # jd self
        loader.add_xpath('merchant_grade', '//em[@class="evaluate-grade"]/span/a/text()')  # third-party merchant score
        loader.add_xpath('merchant_grade', '//div[@class="seller-infor"]/em/text()')  # eg: https://item.jd.com/2386353.html
        loader.add_xpath('merchant_grade', '//div[@class="score-sum"]/span/text()')  # eg: https://item.jd.com/10605700987.html
        loader.add_value('utc_timestamp', int(datetime.utcnow().timestamp()))
        item = loader.load_item()
        # The price comes from a separate JSON endpoint; carry the item along in meta.
        request = scrapy.Request('https://p.3.cn/prices/mgets?skuIds=J_' + str(product_id), callback=self.parse_price)
        request.meta['item'] = item
        yield request
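# get_product_id and parse_price are referenced above but not defined here. Minimal
# sketches, assuming product pages look like https://item.jd.com/<sku>.html and that the
# price endpoint returns the same [{"m": ..., "p": ...}] list parsed elsewhere in this
# file; the 'price' item field is an assumption about JdspiderItem.
def get_product_id(self, url):
    match = re.search(r'item\.jd\.com/(\d+)\.html', url)
    return match.group(1) if match else None

def parse_price(self, response):
    item = response.meta['item']
    data = json.loads(response.text)[0]
    item['price'] = data.get('p', '')  # assumed item field
    yield item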
def parse(self, response):
    item = JdspiderItem()
    selector = Selector(response)
    Products = selector.xpath('//*[@id="plist"]/ul/li')
    for each in Products:
        temphref = each.xpath('div/div[@class="p-img"]/a/@href').extract()
        temphref = str(temphref)
        ProductID = str(re.search('com/(.*?)\.html', temphref).group(1))
        product_typ_url = "https://item.jd.com/" + ProductID + ".html"
        print("====product_typ_url:", product_typ_url)
        # product_typ = Selector(response).xpath('//html/body/div[9]/div[2]/div[1]/div[2]/div[1]/div[1]/ul[2]/li[11]/text()').extract()
        product_typ = Selector(response).xpath('//*[@class="parameter2 p-parameter-list"]/ul[2]/li[11]/text()').extract()
        print(product_typ)
        item['product_typ'] = product_typ
        yield item
def parse(self, response): project_list = response.xpath('//div[@class="p-name"]') item = JdspiderItem() for project in project_list: title_test = project.xpath( 'normalize-space(./a/em/text())').extract_first() title_test_url = 'https:' + project.xpath( 'normalize-space(./a/@href)').extract_first() item['title'] = title_test item['title_url'] = title_test_url yield scrapy.Request(item['title_url'], callback=self.product_list_parse, meta={'item': deepcopy(item)}, dont_filter=True) callback_url = response.xpath( '//a[@class="pn-next"]/@href').extract_first() if callback_url: yield scrapy.Request('https:/' + callback_url, callback=self.parse)
def parse(self, response):
    item = JdspiderItem()
    selector = Selector(response)
    # Books = selector.xpath('/html/body/div[8]/div[2]/div[3]/div/ul/li')
    # Every li carries a link that jumps to the phone's detail page.
    PhonesLink = selector.xpath('//*[@id="J_goodsList"]/ul/li/div/div[4]/a/@href')
    for each in PhonesLink:
        # num = each.xpath('div[@class="p-num"]/text()').extract()
        # bookName = each.xpath('div[@class="p-detail"]/a/text()').extract()
        # author = each.xpath('div[@class="p-detail"]/dl[1]/dd/a[1]/text()').extract()
        # press = each.xpath('div[@class="p-detail"]/dl[2]/dd/a/text()').extract()
        # `each` is already the @href attribute selector, so read it directly.
        temphref = str(each.extract())
        # BookID = str(re.search('com/(.*?)\.html', temphref).group(1))
        phoneID = str(re.search(r'com/(.*?)\.html', temphref).group(1))
        json_url = 'http://p.3.cn/prices/mgets?skuIds=J_' + phoneID
        r = requests.get(json_url).text
        data = json.loads(r)[0]
        price = data['m']
        # PreferentialPrice = data['p']
        # name, phoneRAM, phoneColor, phoneBattery, frontcamera and backcamera are expected
        # to come from the detail page; their extraction is not shown in this snippet.
        item['phoneName'] = name
        item['phoneID'] = phoneID
        item['phoneRAM'] = phoneRAM
        item['phoneColor'] = phoneColor
        item['phoneBattery'] = phoneBattery
        item['price'] = price
        item['frontcamera'] = frontcamera
        item['backcamera'] = backcamera
        yield item
    nextLink = selector.xpath('/html/body/div[8]/div[2]/div[4]/div/div/span/a[7]/@href').extract()
    if nextLink:
        nextLink = nextLink[0]
        print(nextLink)
        yield Request(nextLink, callback=self.parse)
def parse(self, response):
    # Brand = re.match(r'(^.*C_)(.*)', response.url).group(2)
    # result = unquote(Brand, 'utf-8')
    project_list = response.xpath('//div[@class="p-name"]')
    for project in project_list:
        item = JdspiderItem()
        item['Brand'] = '华为(HUAWEI)'
        title_test = project.xpath('normalize-space(./a/em/text())').extract_first()
        title_test_url = 'https:' + project.xpath('normalize-space(./a/@href)').extract_first()
        item['title'] = title_test
        item['title_url'] = title_test_url
        yield scrapy.Request(item['title_url'],
                             callback=self.product_list_parse,
                             meta={'item': deepcopy(item)},
                             dont_filter=True)
    callback_url = response.xpath('//a[@class="pn-next"]/@href').extract_first()
    if callback_url:
        print('Next page')
        yield scrapy.Request('https:/' + callback_url, callback=self.parse)
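# product_list_parse is the detail-page callback used by the two spiders above but is not
# included here. A minimal sketch, assuming it only fills in the price from the detail
# page; the selector and the 'price' field are assumptions about JdspiderItem.
def product_list_parse(self, response):
    item = response.meta['item']
    item['price'] = response.xpath('normalize-space(//span[@class="price"]/text())').get()  # assumed selector and field
    yield item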
def parse(self, response):
    item = JdspiderItem()
    selector = Selector(response)
    pName = selector.xpath(
        '/html/body/div[@id="bodyContainer"]/div[@id="bindingRoot"]/div[@class="grid_12 l-clearfix l-row"]/div[@id="job-listing-wrapper"]/div[@class="jobmail-signed-in jobs-exist first-page premium-jobs-exist no-tier1-jobs"]/section[@id="jobsListing"]/div[@class="jobs-list jobs-list-primary"]/article[@class="experimental-fade experimental-fade-completed"][1]/dl/dd[1]/p[@class="job-description"]/text()'
    ).extract()
    item['Name'] = pName
    yield item
    # Cates = selector.xpath('/html/body/div[@id="bodyContainer"]/div[@id="bindingRoot"]/div[@class="grid_12 l-clearfix l-row"]/div[@id="job-listing-wrapper"]/div[@class="jobmail-signed-in jobs-exist first-page premium-jobs-exist no-tier1-jobs"]/section[@id="jobsListing"]/div[@class="jobs-list jobs-list-premium"][1]/article')
    # for each in Cates:
    #     bPrice = each.xpath('dl/dd[1]/ul[@class="bullet-points"]/li[1]/text()').extract()
    #     pName = each.xpath('dl/dd[1]/h2/a[@class="job-title"]/text()').extract()
    #     temphref = each.xpath('div[@class="p-detail"]/a/@href').extract()
    #     temphref = str(temphref)
    #     BookID = str(re.search('com/(.*?)\.html', temphref).group(1))
    #     json_url = 'http://p.3.cn/prices/mgets?skuIds=J_' + BookID
    #     r = requests.get(json_url).text
    #     data = json.loads(r)[0]
    #     price = data['m']
    #     PreferentialPrice = data['p']
    #     item['Price'] = bPrice
    #     item['Name'] = pName
    #     item['author'] = author
    #     item['press'] = press
    #     item['BookID'] = BookID
    #     item['price'] = price
    #     item['PreferentialPrice'] = PreferentialPrice
    #     yield item
    nextLink = selector.xpath('/html/body/div[7]/div[5]/form[1]/div/a[2]/@href').extract()
    # alternative: /html/body/div[@id="J_searchWrap"]/div[@id="J_container"]/div[@id="J_main"]/div[@class="m-list"]/div[@class="ml-wrap"]/div[@class="page clearfix"]/div[@id="J_bottomPage"]/span[@class="p-num"]/a[@class="pn-next"]/@href
    # alternative: /html/body/div[8]/div[1]/div[2]/div[1]/div/div[3]/div/span/a[2]/@href
    if nextLink:
        nextLink = nextLink[0]
        print(nextLink)
        yield Request(nextLink, callback=self.parse)
def parse_detail(self, response):
    item_loader = ArticleItemLoader(item=JdspiderItem(), response=response)
    phone_title = response.css(".sku-name::text").extract()[0].strip()
    # If the title contains Chinese characters, take it from .sku-name; otherwise fall back
    # to the alternative page layout.
    match_re = re.match(u".*[\u4e00-\u9fa5]+", phone_title)
    if match_re:
        item_loader.add_css("title", ".sku-name::text")
    else:
        title = response.xpath("/html/body/div[8]/div/div[2]/div[1]/text()").extract()[1].strip()
        item_loader.add_value("title", title)
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("front_image_url", "#spec-n1 img::attr(src)")
    shop_name = response.css(".name a::text")
    if shop_name:
        item_loader.add_css("shop_name", ".name a::text")
    else:
        shop_name = "null"
        item_loader.add_value("shop_name", shop_name)
    price_item = response.xpath("/html/body/div[8]/div/div[2]/div[3]/div/div[1]/div[2]/span[1]/span[2]/text()")
    if price_item:
        price_item = price_item.extract()[0]
        item_loader.add_value("price", price_item)
    else:
        item_price = response.css('.dd .p-price .price::text').extract()[0]
        item_loader.add_value("price", item_price)
    item_loader.add_css("brand", ".p-parameter a::text")
    item_loader.add_xpath("good_name", "//*[@id='crumb-wrap']/div/div[1]/div[9]/text()")
    item_loader.add_xpath("comment_nums", "//*[@id='comment-count']/a/text()")
    item_loader.add_value("crawl_time", datetime.datetime.now())
    phone_item = item_loader.load_item()
    yield phone_item
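# get_md5 is imported from a utils module in the original project and is not shown here.
# A minimal sketch of what such a helper typically looks like; the exact implementation
# is an assumption. Requires `import hashlib` at module level.
def get_md5(url):
    if isinstance(url, str):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()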
def parse(self, response):
    pre_item = copy.copy(response.meta)
    # product_id_lst = response.xpath("//ul[@class='gl-warp clearfix']/li/@data-sku").extract()
    division_id = pre_item['division_id']
    # product_name_lst = response.xpath("//div[@class='p-name p-name-type-2']/a/@title").extract()
    shop_id_lst = response.xpath("//div[@class='p-shop']//a[@class='curr-shop hd-shopname']/@href").extract()
    shop_name_lst = response.xpath("//div[@class='p-shop']//a[@class='curr-shop hd-shopname']/text()").extract()
    # goods_set = list(zip(product_id_lst, product_name_lst, shop_id_lst, shop_name_lst))
    goods_set = list(zip(shop_id_lst, shop_name_lst))
    for good in goods_set:
        item = JdspiderItem()
        # item['product_id'] = good[0]
        item['division_id'] = int(division_id)
        # item['product_name'] = good[1]
        item['shop_id'] = self._shops_id_process(good[0])
        item['shop_name'] = good[1]
        yield item
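# _shops_id_process is used above but not shown. A minimal sketch, assuming the shop href
# looks like //mall.jd.com/index-<shop_id>.html; if the link format differs, the regex
# would need adjusting.
def _shops_id_process(self, href):
    match = re.search(r'index-(\d+)\.html', href)
    return int(match.group(1)) if match else None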
def parse(self, response):
    item = JdspiderItem()
    selector = Selector(response)
    # gl_items = selector.xpath('//li/div[@class="gl-i-wrap j-sku-item"]')
    gl_items = selector.xpath('//li[@class="gl-item"]')
    # print('Got the page content')
    for each in gl_items:
        print('Parsing the extracted data')
        # Get the product name
        name = each.xpath('div/div[@class="p-name"]/a/em/text()').extract()[0].strip()
        # print(name)
        # Get the product link
        name_link = 'http:' + str(each.xpath('div/div[@class="p-name"]/a/@href').extract()[0])
        # print(name_link)
        temphref = each.xpath('div/div[@class="p-name"]/a/@href').extract()
        temphref = str(temphref)
        skuId = str(re.search('com/(.*?)\.html', temphref).group(1))
        # print(skuId)
        # Get the price information
        price_url = 'https://p.3.cn/prices/mgets?&skuIds=J_' + skuId
        print(price_url)
        price_text = requests.get(price_url).text
        data = json.loads(price_text)[0]
        o_price = data['m']
        c_price = data['p']
        print(o_price, c_price)
        # Get the comment information
        commit_url = 'https://club.jd.com/comment/productCommentSummaries.action?&referenceIds=' + skuId
        print(commit_url)
        try:
            commit_text = requests.get(commit_url).text
            comment_count = json.loads(commit_text)['CommentsCount'][0]['CommentCountStr']
            print(comment_count)
        except Exception as ex:
            print('request commit_url failed')
            print(ex)
        # Get the shop name
        shopId = each.xpath('div/@venderid').extract()[0]
        shop_url = 'https://rms.shop.jd.com/json/pop/shopInfo.action?ids=' + str(shopId)
        print(shop_url)
        try:
            shop_text = requests.get(shop_url).text
            data = json.loads(shop_text)
            shop_name = data[0]['name']
            print(shop_name)
        except Exception as ex:
            print('get shop id failed')
            print(ex)
        item['name'] = name
        item['ori_price'] = o_price
        item['cur_price'] = c_price
        item['commit'] = comment_count
        item['shop'] = shop_name
        item['ItemID'] = skuId
        item['shop_href'] = name_link
        yield item
        time.sleep(0.2)
    # nextLink = selector.xpath('/html/body/div[8]/div[2]/div[4]/div/div/span/a[7]/@href').extract()
    print('Fetching the next page link')
    nextLink = selector.xpath('//div[@class="page clearfix"]/div/span/a[@class="pn-next"]/@href').extract()
    print(nextLink)
    if nextLink:
        nextLink = 'https://list.jd.com' + nextLink[0]
        print(nextLink)
        yield Request(nextLink, callback=self.parse)
def parse(self, response):
    item = JdspiderItem()
    selector = Selector(response)
    Products = selector.xpath('//*[@id="plist"]/ul/li')
    for each in Products:
        p_Name = each.xpath('div/div[@class="p-name"]/a/em/text()').extract()[0]
        temphref = each.xpath('div/div[@class="p-img"]/a/@href').extract()
        temphref = str(temphref)
        ProductID = str(re.search(r'com/(.*?)\.html', temphref).group(1))
        # ProductID = '1959718783'

        ## Fetch the price
        json_url_p = 'http://p.3.cn/prices/mgets?skuIds=J_' + ProductID
        try:
            data = requests.get(json_url_p, timeout=1000).json()[0]
            price = data['m']
            PreferentialPrice = data['p']
        except requests.exceptions.ConnectionError:
            # requests.exceptions.ReadTimeout
            print('Timeout ConnectionError1:json_url_p')
            time.sleep(600)
            try:
                data = requests.get(json_url_p, timeout=1000).json()[0]
                price = data['m']
                PreferentialPrice = data['p']
            except requests.exceptions.ConnectionError:
                print('Timeout ConnectionError2:json_url_p')
                time.sleep(3600)
                data = requests.get(json_url_p, timeout=1000).json()[0]
                price = data['m']
                PreferentialPrice = data['p']
            except requests.exceptions.ReadTimeout:
                print('Timeout,ReadTimeout:', json_url_p)
        except requests.exceptions.ReadTimeout:
            print('Timeout,ReadTimeout:', json_url_p)

        ## Fetch the total comment counts
        json_url_connent = 'https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds=' + ProductID
        try:
            data = requests.get(json_url_connent, timeout=1000).json()
            data = data['CommentsCount'][0]
            CommentCount = data['CommentCount']
            GoodRateShow = data['GoodRateShow']
            GoodCount = data['GoodCount']
            GeneralCount = data['GeneralCount']
            PoorCount = data['PoorCount']
        except requests.exceptions.ConnectionError:
            print('Timeout ConnectionError1:json_url_connent')
            time.sleep(600)
            try:
                data = requests.get(json_url_connent, timeout=1000).json()
                data = data['CommentsCount'][0]
                CommentCount = data['CommentCount']
                GoodRateShow = data['GoodRateShow']
                GoodCount = data['GoodCount']
                GeneralCount = data['GeneralCount']
                PoorCount = data['PoorCount']
            except requests.exceptions.ConnectionError:
                print('Timeout ConnectionError2:json_url_connent')
                time.sleep(3600)
                data = requests.get(json_url_connent, timeout=1000).json()
                data = data['CommentsCount'][0]
                CommentCount = data['CommentCount']
                GoodRateShow = data['GoodRateShow']
                GoodCount = data['GoodCount']
                GeneralCount = data['GeneralCount']
                PoorCount = data['PoorCount']
            except requests.exceptions.ReadTimeout:
                print('Timeout,ReadTimeout:', json_url_connent)
        except requests.exceptions.ReadTimeout:
            print('Timeout,ReadTimeout:', json_url_connent)

        ## Fetch the product review keywords
        json_url_keyword = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv79456&score=0&sortType=5&pageSize=10&isShadowSku=0&page=0&productId=' + ProductID
        # r = requests.get(json_url_keyword, timeout=100)
        # time.sleep(2)
        # html = r.content.decode('gb2312', 'ignore')
        # keywords = re.findall(r',"name":"(.*?)",', html)
        # keyword = ' '.join(keywords)
        try:
            r = requests.get(json_url_keyword, timeout=1000)
            html = r.content.decode('gb2312', 'ignore')
            keywords = re.findall(r',"name":"(.*?)",', html)
            keyword = ' '.join(keywords)
        except requests.exceptions.ConnectionError:  # this is important
            print('Timeout ConnectionError1:json_url_keyword')
            time.sleep(600)
            try:
                r = requests.get(json_url_keyword, timeout=1000)
                html = r.content.decode('gb2312', 'ignore')
                keywords = re.findall(r',"name":"(.*?)",', html)
                keyword = ' '.join(keywords)
            except requests.exceptions.ConnectionError:  # this is important
                print('Timeout ConnectionError2:json_url_keyword')
                time.sleep(3600)
                r = requests.get(json_url_keyword, timeout=1000)
                html = r.content.decode('gb2312', 'ignore')
                keywords = re.findall(r',"name":"(.*?)",', html)
                keyword = ' '.join(keywords)
            except requests.exceptions.ReadTimeout:
                print('Timeout,ReadTimeout:', json_url_keyword)
        except requests.exceptions.ReadTimeout:
            print('Timeout,ReadTimeout:', json_url_keyword)

        # ## Fetch the product parameters (air cooler)
        # product_typ_url = "https://item.jd.com/" + ProductID + ".html"
        # r = requests.get(product_typ_url, timeout=100)
        # time.sleep(2)
        # soup = BeautifulSoup(r.text, 'lxml')
        # ips1 = soup.find_all('ul', class_="parameter2 p-parameter-list")
        # ips2 = soup.find_all('div', class_="detail-elevator-floor")
        # ips = [ips1, ips2]
        # try:
        #     for i in ips:
        #         type = re.findall(r'<li title=".*?">类别:(.*?)<', str(ips))[0]
        #         break
        # except IndexError:
        #     type = "没有对应数据"
        # print(type)

        ## Fetch the product parameters (juicer)
        product_typ_url = "https://item.jd.com/" + ProductID + ".html"
        try:
            r = requests.get(product_typ_url, timeout=1000)
            soup = BeautifulSoup(r.text, 'lxml')
            try:
                shop_name = re.findall(r'<a clstag=".*?" href=".*?" target="_blank" title="(.*?)">', str(soup))[0]
            except IndexError:
                shop_name = "none"
            try:
                brand = soup.find_all('ul', id="parameter-brand")
                brand = re.findall(r'<li title="(.*?)"', str(brand))[0]
            except IndexError:
                brand = "None"
            ips1 = soup.find_all('ul', class_="parameter2 p-parameter-list")
            ips2 = soup.find_all('div', class_="detail-elevator-floor")
            ips = [ips1, ips2]
            for i in ips:
                type = re.findall(r'<li title=".*?">.*?:(.*?)<', str(ips))
                try:
                    X_type = re.findall(r'<li title=".*?">.*?吸头:(.*?)<', str(ips))[0]
                except IndexError:
                    X_type = "none"
                # try:
                #     F_type = re.findall(r'<li title=".*?">类别:(.*?)<', str(ips))[0]
                # except IndexError:
                #     F_type = "none"
                # try:
                #     Y_type = re.findall(r'<li title=".*?">类型:(.*?)<', str(ips))[0]
                # except IndexError:
                #     Y_type = "none"
        except requests.exceptions.ConnectionError:  # this is important
            print('Timeout ConnectionError1:product_typ_url')
            time.sleep(600)
            try:
                r = requests.get(product_typ_url, timeout=1000)
                soup = BeautifulSoup(r.text, 'lxml')
                try:
                    shop_name = re.findall(r'<a clstag=".*?" href=".*?" target="_blank" title="(.*?)">', str(soup))[0]
                except IndexError:
                    shop_name = "none"
                try:
                    brand = soup.find_all('ul', id="parameter-brand")
                    brand = re.findall(r'<li title="(.*?)"', str(brand))[0]
                except IndexError:
                    brand = "None"
                ips1 = soup.find_all('ul', class_="parameter2 p-parameter-list")
                ips2 = soup.find_all('div', class_="detail-elevator-floor")
                ips = [ips1, ips2]
                for i in ips:
                    type = re.findall(r'<li title=".*?">.*?:(.*?)<', str(ips))
                    try:
                        X_type = re.findall(r'<li title=".*?">.*?吸头:(.*?)<', str(ips))[0]
                    except IndexError:
                        X_type = "none"
                    # try:
                    #     F_type = re.findall(r'<li title=".*?">类别:(.*?)<', str(ips))[0]
                    # except IndexError:
                    #     F_type = "none"
                    # try:
                    #     Y_type = re.findall(r'<li title=".*?">类型:(.*?)<', str(ips))[0]
                    # except IndexError:
                    #     Y_type = "none"
            except requests.exceptions.ConnectionError:  # this is important
                print('Timeout ConnectionError2:product_typ_url')
                time.sleep(3600)
                r = requests.get(product_typ_url, timeout=1000)
                soup = BeautifulSoup(r.text, 'lxml')
                try:
                    shop_name = re.findall(r'<a clstag=".*?" href=".*?" target="_blank" title="(.*?)">', str(soup))[0]
                except IndexError:
                    shop_name = "none"
                try:
                    brand = soup.find_all('ul', id="parameter-brand")
                    brand = re.findall(r'<li title="(.*?)"', str(brand))[0]
                except IndexError:
                    brand = "None"
                ips1 = soup.find_all('ul', class_="parameter2 p-parameter-list")
                ips2 = soup.find_all('div', class_="detail-elevator-floor")
                ips = [ips1, ips2]
                for i in ips:
                    type = re.findall(r'<li title=".*?">.*?:(.*?)<', str(ips))
                    try:
                        X_type = re.findall(r'<li title=".*?">.*?吸头:(.*?)<', str(ips))[0]
                    except IndexError:
                        X_type = "none"
                    # try:
                    #     F_type = re.findall(r'<li title=".*?">类别:(.*?)<', str(ips))[0]
                    # except IndexError:
                    #     F_type = "none"
                    # try:
                    #     Y_type = re.findall(r'<li title=".*?">类型:(.*?)<', str(ips))[0]
                    # except IndexError:
                    #     Y_type = "none"
            except requests.exceptions.ReadTimeout:
                print('Timeout,ReadTimeout:', product_typ_url)
        except requests.exceptions.ReadTimeout:
            print('Timeout,ReadTimeout:', product_typ_url)

        # ## item
        item['p_Name'] = p_Name
        item['shop_name'] = shop_name
        item['ProductID'] = ProductID
        item['price'] = price
        item['PreferentialPrice'] = PreferentialPrice
        item['CommentCount'] = CommentCount
        item['GoodRateShow'] = GoodRateShow
        item['GoodCount'] = GoodCount
        item['GeneralCount'] = GeneralCount
        item['PoorCount'] = PoorCount
        item['keyword'] = keyword
        item['brand'] = brand
        item['type'] = type
        item['X_type'] = X_type
        # item['F_type'] = F_type
        # item['Y_type'] = Y_type
        yield item
        # donetime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # print("Sleep time start......")
        # time.sleep(5)
        # print("donetime is:", donetime)

    nextLink = selector.xpath('//*[@id="J_bottomPage"]/span[1]/a[10]/@href').extract()
    if nextLink:
        nextLink = nextLink[0]
        yield Request('https://list.jd.com/' + nextLink, callback=self.parse)
def parse(self, response):
    item = JdspiderItem()
    selector = Selector(response)
    Products = selector.xpath('//*[@id="plist"]/ul/li')
    for each in Products:
        # p_Name = each.xpath('div/div[@class="p-name"]/a/em/text()').extract()
        p_Name = each.xpath('div/div[@class="p-name p-name-type-2"]/a/em/text()').extract()
        shop_name = each.xpath('div/div[@class="p-shop"]/@data-shop_name').extract()
        temphref = each.xpath('div/div[@class="p-img"]/a/@href').extract()
        temphref = str(temphref)
        ProductID = str(re.search(r'com/(.*?)\.html', temphref).group(1))
        # ProductID = '1069555'

        ## Fetch the price
        json_url_p = 'http://p.3.cn/prices/mgets?skuIds=J_' + ProductID
        try:
            r = requests.get(json_url_p).text
            time.sleep(1)
            data = json.loads(r)[0]
            price = data['m']
            PreferentialPrice = data['p']
        except requests.exceptions.ConnectionError:  # this is important
            print('Timeout')
            time.sleep(600)
            r = requests.get(json_url_p).text
            time.sleep(1)
            data = json.loads(r)[0]
            price = data['m']
            PreferentialPrice = data['p']

        ## Fetch the total comment counts
        json_url_connent = 'https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds=' + ProductID
        try:
            r = requests.get(json_url_connent).text
            time.sleep(1)
            data = json.loads(r)
            data = data['CommentsCount'][0]
            CommentCount = data['CommentCount']
            GoodRateShow = data['GoodRateShow']
            GoodCount = data['GoodCount']
            GeneralCount = data['GeneralCount']
            PoorCount = data['PoorCount']
        except requests.exceptions.ConnectionError:  # this is important
            print('Timeout')
            time.sleep(600)
            r = requests.get(json_url_connent).text
            time.sleep(1)
            data = json.loads(r)
            data = data['CommentsCount'][0]
            CommentCount = data['CommentCount']
            GoodRateShow = data['GoodRateShow']
            GoodCount = data['GoodCount']
            GeneralCount = data['GeneralCount']
            PoorCount = data['PoorCount']

        ## Fetch the product review keywords
        json_url_keyword = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv79456&score=0&sortType=5&pageSize=10&isShadowSku=0&page=0&productId=' + ProductID
        try:
            r = requests.get(json_url_keyword)
            html = r.content.decode('gb2312', 'ignore')
            keywords = re.findall(r',"name":"(.*?)",', html)
            keyword = ' '.join(keywords)
        except requests.exceptions.ConnectionError:  # this is important
            print('Timeout')
            time.sleep(600)
            r = requests.get(json_url_keyword)
            html = r.content.decode('gb2312', 'ignore')
            keywords = re.findall(r',"name":"(.*?)",', html)
            keyword = ' '.join(keywords)

        ## Fetch the product parameters
        product_typ_url = "https://item.jd.com/" + ProductID + ".html"
        try:
            r = requests.get(product_typ_url)
            time.sleep(1)
            soup = BeautifulSoup(r.text, 'lxml')
            ips1 = soup.find_all('ul', class_="parameter2 p-parameter-list")
            ips2 = soup.find_all('div', class_="detail-elevator-floor")
            ips = [ips1, ips2]
            try:
                for i in ips:
                    type = re.findall(r'<li title=".*?">类别:(.*?)<', str(ips))[0]
                    # control_mode = re.findall(r'<li title=".*?">控制方式:(.*?)<', str(ips))[0]
                    FBnumber = re.findall(r'<li title=".*?">扇叶片数:(.*?)<', str(ips))[0]
                    break
            except IndexError:
                type = "没有对应数据"
            print(type)
        except requests.exceptions.ConnectionError:  # this is important
            print('Timeout')
            time.sleep(600)
            r = requests.get(product_typ_url)
            time.sleep(1)
            soup = BeautifulSoup(r.text, 'lxml')
            ips1 = soup.find_all('ul', class_="parameter2 p-parameter-list")
            ips2 = soup.find_all('div', class_="detail-elevator-floor")
            ips = [ips1, ips2]
            try:
                for i in ips:
                    type = re.findall(r'<li title=".*?">类别:(.*?)<', str(ips))[0]
                    # control_mode = re.findall(r'<li title=".*?">控制方式:(.*?)<', str(ips))[0]
                    FBnumber = re.findall(r'<li title=".*?">扇叶片数:(.*?)<', str(ips))[0]
                    break
            except IndexError:
                type = "没有对应数据"
            print(type)

        # ## item
        item['p_Name'] = p_Name
        item['shop_name'] = shop_name
        item['ProductID'] = ProductID
        item['price'] = price
        item['PreferentialPrice'] = PreferentialPrice
        item['CommentCount'] = CommentCount
        item['GoodRateShow'] = GoodRateShow
        item['GoodCount'] = GoodCount
        item['GeneralCount'] = GeneralCount
        item['PoorCount'] = PoorCount
        item['keyword'] = keyword
        item['type'] = type
        # item['control_mode'] = control_mode
        item['FBnumber'] = FBnumber
        yield item
        donetime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        print("Sleep time start......")
        time.sleep(300)
        print("donetime is:", donetime)

    nextLink = selector.xpath('//*[@id="J_bottomPage"]/span[1]/a[10]/@href').extract()
    if nextLink:
        nextLink = nextLink[0]
        yield Request('https://list.jd.com/' + nextLink, callback=self.parse)