def detail(self, url):
    """Route a product URL to the parser matching its Sasa storefront.

    The domain reported by ``tool.get_domain`` is stored on ``self.domain``.
    ``web1.sasa.com`` domains are handled by ``detail_by_hk`` and
    ``www.sasa.com`` domains by ``detail_by_www``. Any other domain falls
    through and the method returns None. Exceptions propagate unchanged.

    :param url: product page address.
    :return: the chosen parser's result, or None when no storefront matches.
    """
    self.domain = tool.get_domain(url)

    # web1 is checked first, exactly as before.
    if 'web1.sasa.com' in self.domain:
        return self.detail_by_hk(url)
    if 'www.sasa.com' in self.domain:
        return self.detail_by_www(url)
    return None
def detail(self, url):
    """Scrape one product page and wrap the result with ``tool.return_data``.

    The page is requested through ``self.session`` with certificate
    verification disabled and parsed with PyQuery. An off-shelf payload is
    returned (``successful=False``) for a 404 response, when
    ``#productDetailsWrapper`` is missing, or when the ``#productPrice`` text
    contains 'SOLD OUT'. Any other non-200 status returns an error payload.
    Otherwise the collected product dict is returned with
    ``successful=True``.

    :param url: product page address.
    :return: the value produced by ``tool.return_data``.
    :raises: exceptions from the request or the parsing helpers propagate.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    page = PyQuery(resp.text or 'nothing')

    def _reject(sold_out):
        # Log the page title, then wrap an off-shelf or error payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=page('title').text(), url=url)))
        if sold_out:
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=page.outerHtml())
        else:
            payload = tool.get_error(code=status_code,
                                     message=self.cfg.GET_ERR.get(
                                         'SCERR', 'ERROR'),
                                     backUrl=resp.url,
                                     html=page.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Off shelf.
    if status_code == 404:
        return _reject(True)
    # Any other HTTP error.
    if status_code != 200:
        return _reject(False)

    # Preparation.
    main = page('#productDetailsWrapper')
    tool.get_domain(url)  # return value is not used by this method
    script_text = page('script').text()

    # Off shelf: price box says sold out, or the product wrapper is absent.
    if 'SOLD OUT' in main('#productPrice').text() or not main:
        return _reject(True)

    product = dict()

    # Brand is embedded in an inline script.
    product['brand'] = re.search(r'FL.setup.brand = "(.*?)"', script_text,
                                 re.DOTALL).groups()[0]
    product['name'] = main('h1#title').text()

    # Currency is fixed for this site.
    currency_code = 'USD'
    product['currency'] = currency_code
    product['currencySymbol'] = tool.get_unit(currency_code)

    sale_price, list_price = self.get_all_price(main)
    product['price'] = sale_price
    product['listPrice'] = list_price

    # Colors map to themselves as their own ids.
    palette = self.get_colors(main)
    product['color'] = palette
    product['colorId'] = dict((cid, cid) for cid in palette.keys())

    # Image set: a flat list, or a mapping of color id to image list.
    gallery = self.get_imgs(main, page)
    if isinstance(gallery, list):
        product['img'] = gallery[0]
    else:
        product['img'] = dict(
            (cid, shots[0]) for cid, shots in gallery.items())
    product['imgs'] = gallery

    product['productId'] = main('h1.title').attr('data-productitemid')
    product['sizes'] = self.get_sizes(main)
    product['keys'] = palette.keys()
    product['descr'] = main('div#productDescription').text()

    product['status_code'] = status_code
    product['status'] = self.cfg.STATUS_SALE
    product['backUrl'] = resp.url

    self.logger.info(json.dumps(
        dict(time=time.time(),
             productId=product['productId'],
             name=product['name'],
             currency=product['currency'],
             price=product['price'],
             listPrice=product['listPrice'],
             url=url)))

    return tool.return_data(successful=True, data=product)
def detail(self, url):
    """Scrape one product page and wrap the result with ``tool.return_data``.

    The page is requested through ``self.session`` with certificate
    verification disabled and parsed with PyQuery. A 404 response or a
    missing ``#goodsInfo`` element returns an off-shelf payload. Any other
    non-200 status returns an error payload. Both use ``successful=False``.
    Otherwise the collected product dict is returned with
    ``successful=True``.

    :param url: product page address.
    :return: the value produced by ``tool.return_data``.
    :raises: exceptions from the request or the parsing helpers propagate.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    page = PyQuery(resp.text or 'nothing')

    def _reject(sold_out):
        # Log the page title, then wrap an off-shelf or error payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=page('title').text(), url=url)))
        if sold_out:
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=page.outerHtml())
        else:
            payload = tool.get_error(code=status_code,
                                     message=self.cfg.GET_ERR.get(
                                         'SCERR', 'ERROR'),
                                     backUrl=resp.url,
                                     html=page.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Off shelf.
    if status_code == 404:
        return _reject(True)
    # Any other HTTP error.
    if status_code != 200:
        return _reject(False)

    # Preparation.
    main = page('#goodsInfo')
    tool.get_domain(url)  # return value is not used by this method

    # Off shelf: the product container is absent.
    if not main:
        return _reject(True)

    product = dict()

    # Product id comes from the hidden order form.
    product_id = page('#goodsForm input#bskGodGodNo').attr('value')
    product['productId'] = product_id
    product['productSku'] = product_id
    product['productCode'] = main('.prd-code').text()

    brand_name = page('#goodsForm input#brndNm').attr('value')
    product['brand'] = brand_name

    # Name is "<brand> <goods name>".
    product['name'] = u'{0} {1}'.format(
        brand_name, page('#goodsForm input#godNm').attr('value'))

    # Currency and prices.
    currency_code, sale_price, list_price = self.get_currency_prices(
        page, main)
    product['currency'] = currency_code
    product['currencySymbol'] = tool.get_unit(currency_code)
    product['price'] = sale_price
    product['listPrice'] = list_price

    # Short description, then the detail text built on top of it.
    meta_description = page('meta[name="description"]').attr('content')
    product['descr'] = meta_description
    product['detail'] = meta_description + main('.desc-area').text()

    # The color lookup is still invoked, but the stored color values are the
    # configured defaults.
    self.get_color(main)
    product['color'] = self.cfg.DEFAULT_ONE_COLOR
    product['colorId'] = self.cfg.DEFAULT_COLOR_SKU

    # Image set.
    gallery = []
    for node in page('#prdImgWrap .prdImg ul>li>img').items():
        gallery.append(node.attr('src'))
    product['img'] = page(
        'meta[property="og:image"][name="og_image"]').attr('content')
    product['imgs'] = gallery

    product['sizes'] = self.get_sizes(main)

    product['status_code'] = status_code
    product['status'] = self.cfg.STATUS_SALE
    product['backUrl'] = resp.url

    # Peer address of the response, when it is available.
    peer = resp.raw._original_response.peer
    if peer:
        product['ip_port'] = ':'.join(str(part) for part in peer)

    self.logger.info(json.dumps(
        dict(time=time.time(),
             productId=product['productId'],
             name=product['name'],
             currency=product['currency'],
             price=product['price'],
             listPrice=product['listPrice'],
             url=url)))

    return tool.return_data(successful=True, data=product)
def detail(self, url):
    """Scrape one product page and wrap the result with ``tool.return_data``.

    The page is requested through ``self.session`` with certificate
    verification disabled and parsed with PyQuery. A 404 response returns an
    off-shelf payload and any other non-200 status returns an error payload,
    both with ``successful=False``. Otherwise the collected product dict is
    returned with ``successful=True``.

    :param url: product page address.
    :return: the value produced by ``tool.return_data``.
    :raises ValueError: when the price helper reports a symbol other
        than '$'.
    :raises: other exceptions from the request or helpers propagate.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    page = PyQuery(resp.text or 'nothing')

    def _reject(sold_out):
        # Log the page title, then wrap an off-shelf or error payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=page('title').text(), url=url)))
        if sold_out:
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=page.outerHtml())
        else:
            payload = tool.get_error(code=status_code,
                                     message=self.cfg.GET_ERR.get(
                                         'SCERR', 'ERROR'),
                                     backUrl=resp.url,
                                     html=page.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Off shelf.
    if status_code == 404:
        return _reject(True)
    # Any other HTTP error.
    if status_code != 200:
        return _reject(False)

    # Preparation.
    main = page('#theater')
    tool.get_domain(url)  # return value is not used by this method
    pdata = self.get_pdata(page('script:gt(20)'))

    product = dict()
    product['brand'] = main('.brand').text()
    product['name'] = main('h1:first').text()

    symbol, sale_price, list_price = self.get_price_info(pdata)
    if symbol != '$':
        raise ValueError('currencySymbol is not USD')

    # Only dollar pages get this far, so the currency is fixed.
    currency_code = 'USD'
    product['currency'] = currency_code
    product['currencySymbol'] = tool.get_unit(currency_code)

    product['price'] = sale_price
    product['listPrice'] = list_price

    # Colors map to themselves as their own ids.
    palette = self.get_color(pdata)
    product['color'] = palette
    product['colorId'] = dict((cid, cid) for cid in palette.keys())

    # Image set.
    cover, gallery = self.get_imgs(pdata)
    product['img'] = cover
    product['imgs'] = gallery

    product['productId'] = page('input[name="productId"]').attr('value')

    size_table = self.get_sizes(pdata)
    product['sizes'] = size_table

    product['descr'] = main('.description').text()

    # Keys present in both the cover-image mapping and the size mapping.
    product['keys'] = set(cover.keys()).intersection(size_table.keys())

    product['status_code'] = status_code
    product['status'] = self.cfg.STATUS_SALE
    product['backUrl'] = resp.url

    # Peer address of the response, when it is available.
    peer = resp.raw._original_response.peer
    if peer:
        product['ip_port'] = ':'.join(str(part) for part in peer)

    self.logger.info(json.dumps(
        dict(time=time.time(),
             productId=product['productId'],
             name=product['name'],
             currency=product['currency'],
             price=product['price'],
             listPrice=product['listPrice'],
             url=url)))

    return tool.return_data(successful=True, data=product)
def detail(self, url):
    """Scrape one product page and wrap the result with ``tool.return_data``.

    The page is requested through ``self.session`` with certificate
    verification disabled and parsed with PyQuery. Most fields are read from
    ``span[itemprop=...]`` elements. A 404 response returns an off-shelf
    payload and any other non-200 status returns an error payload, both with
    ``successful=False``. Otherwise the collected product dict is returned
    with ``successful=True``.

    :param url: product page address.
    :return: the value produced by ``tool.return_data``.
    :raises: exceptions from the request or the parsing helpers propagate.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    page = PyQuery(resp.text or 'nothing')

    def _reject(sold_out):
        # Log the page title, then wrap an off-shelf or error payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=page('title').text(), url=url)))
        if sold_out:
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=page.outerHtml())
        else:
            payload = tool.get_error(code=status_code,
                                     message=self.cfg.GET_ERR.get(
                                         'SCERR', 'ERROR'),
                                     backUrl=resp.url,
                                     html=page.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Off shelf.
    if status_code == 404:
        return _reject(True)
    # Any other HTTP error.
    if status_code != 200:
        return _reject(False)

    # Preparation.
    main = page('div.primary-content')
    tool.get_domain(url)  # return value is not used by this method

    product = dict()

    # One id serves as product id, SKU and product code.
    product_id = page('span[itemprop="productID"]').attr('content')
    for field in ('productId', 'productSku', 'productCode'):
        product[field] = product_id

    product['brand'] = page('span[itemprop="brand"]').attr('content')
    product['name'] = page('span[itemprop="name"]').attr('content')

    currency_code = page('span[itemprop="priceCurrency"]').attr('content')
    product['currency'] = currency_code
    product['currencySymbol'] = tool.get_unit(currency_code)

    # Only the list price is taken from the helper. The sale price stored is
    # the itemprop value.
    _, list_price = self.get_all_price(main)
    product['price'] = page('span[itemprop="price"]').attr('content')
    product['listPrice'] = list_price

    # Top-level category and subcategory.
    product['category'] = main('a[data-bigpopup="sizeChart"]').attr(
        'data-category')
    product['subcategory'] = main('a[data-bigpopup="sizeChart"]').attr(
        'data-sub-category')

    product['descr'] = page('span[itemprop="description"]').attr('content')
    product['detail'] = main('#collapseOne').text()
    # Returns and exchanges.
    product['returns'] = main('#collapseFive').text()

    product['color'] = page('span[itemprop="color"]').attr('content')
    product['colorId'] = self.cfg.DEFAULT_COLOR_SKU

    # Image set.
    gallery = []
    for node in main('.product-image-carousel img.primary-image').items():
        gallery.append(node.attr('src'))
    product['img'] = page('span[itemprop="image"]').attr('content')
    product['imgs'] = gallery

    product['sizes'] = self.get_sizes(main)

    product['status_code'] = status_code
    product['status'] = self.cfg.STATUS_SALE
    product['backUrl'] = resp.url

    # Peer address of the response, when it is available.
    peer = resp.raw._original_response.peer
    if peer:
        product['ip_port'] = ':'.join(str(part) for part in peer)

    self.logger.info(json.dumps(
        dict(time=time.time(),
             productId=product['productId'],
             name=product['name'],
             currency=product['currency'],
             price=product['price'],
             listPrice=product['listPrice'],
             url=url)))

    return tool.return_data(successful=True, data=product)
def detail(self, url):
    """Scrape one product page and wrap the result with ``tool.return_data``.

    The page is requested through ``self.session`` with certificate
    verification disabled and parsed with PyQuery. An off-shelf payload is
    returned (``successful=False``) for a 404 response or when the parsed
    product data has a falsy ``hasOrderableVariants``. Any other non-200
    status returns an error payload. Otherwise the collected product dict is
    returned with ``successful=True``.

    The list price is the first run of digits and dots in the
    ``.pricenotebucket`` text. When that element has no text, or the text
    holds no number, the list price falls back to the sale price.

    :param url: product page address.
    :return: the value produced by ``tool.return_data``.
    :raises AttributeError: when no inline script declares the currency code.
    :raises: other exceptions from the request or helpers propagate.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def _reject(sold_out):
        # Log the page title, then wrap an off-shelf or error payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))
        if sold_out:
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=pqhtml.outerHtml())
        else:
            payload = tool.get_error(code=status_code,
                                     message=self.cfg.GET_ERR.get(
                                         'SCERR', 'ERROR'),
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Off shelf.
    if status_code == 404:
        return _reject(True)
    # Any other HTTP error.
    if status_code != 200:
        return _reject(False)

    # Preparation.
    area = pqhtml('#container')
    domain = tool.get_domain(url)
    pdata = self.get_pdata(pqhtml, domain)

    # Off shelf: no variant can be ordered.
    if not pdata['hasOrderableVariants']:
        return _reject(True)

    detail = dict()

    # Brand and name are data attributes on the product meta element.
    detail['brand'] = area('.product-meta').attr('data-brand')
    detail['name'] = area('.product-meta').attr('data-productname')

    # Currency code is declared in an inline script.
    currency = re.search(r's\["currencyCode"\]="(\w{3})";',
                         pqhtml('script').text()).groups()[0]
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Sale price and sizes come from the parsed product data.
    price, sizes = self.get_info(pdata)
    detail['price'] = price

    # The previous pattern r'\d[\d\.]' had no capture group, so
    # ``.groups()[0]`` raised IndexError whenever the note had text, and it
    # raised AttributeError when nothing matched. It also matched only two
    # characters. Match the whole number and fall back to the sale price
    # when the note contains none.
    ptxt = area('.pricenotebucket').text()
    found = re.search(r'\d[\d\.]*', ptxt) if ptxt else None
    detail['listPrice'] = found.group(0) if found else price

    # Colors map to themselves as their own ids.
    status, color, imgs = self.get_color(pdata)
    detail['color'] = color
    detail['colorId'] = dict((key, key) for key in color.keys())
    detail['keys'] = color.keys()

    # Image set: a flat list, or a mapping of color id to image list.
    if isinstance(imgs, list):
        detail['img'] = imgs[0]
    else:
        detail['img'] = dict(
            (cId, imgArr[0]) for cId, imgArr in imgs.items())
    detail['imgs'] = imgs

    detail['productId'] = area('.product-meta').attr('data-pid')
    detail['sizes'] = sizes
    detail['descr'] = area(
        'section.product-details .longdescription').text()
    detail['detail'] = area('section.product-details').text()

    detail['status_code'] = status_code
    # Sale status is whatever the color helper reported.
    detail['status'] = status
    detail['backUrl'] = resp.url

    self.logger.info(json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url)))

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Scrape one product page and wrap the result with ``tool.return_data``.

    The URL's domain is stored on ``self.domain``. The page is then requested
    through ``self.session`` with certificate verification disabled and
    parsed with PyQuery. An off-shelf payload is returned
    (``successful=False``) for a 404 response, when
    ``#content>#productContainer`` is missing, or when it contains
    ``.productButtons #disabledAddtobasket``. Any other non-200 status
    returns an error payload. Otherwise the collected product dict is
    returned with ``successful=True``.

    :param url: product page address; also stored as ``backUrl``.
    :return: the value produced by ``tool.return_data``.
    :raises: exceptions from the request or the parsing helpers propagate.
    """
    self.domain = tool.get_domain(url)

    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    page = PyQuery(resp.text or 'nothing')

    def _reject(sold_out):
        # Log the page title, then wrap an off-shelf or error payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=page('title').text(), url=url)))
        if sold_out:
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=page.outerHtml())
        else:
            payload = tool.get_error(code=status_code,
                                     message=self.cfg.GET_ERR.get(
                                         'SCERR', 'ERROR'),
                                     backUrl=resp.url,
                                     html=page.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Off shelf.
    if status_code == 404:
        return _reject(True)
    # Any other HTTP error.
    if status_code != 200:
        return _reject(False)

    # Preparation.
    main = page('#content>#productContainer')
    self.get_pdata(page)  # return value is not used by this method

    # Off shelf: container missing, or the add-to-basket button is disabled.
    if not main or main('.productButtons #disabledAddtobasket'):
        return _reject(True)

    product = dict()
    product['brand'] = 'COS'
    product['name'] = main('.productInfo h1:first').text()

    currency_code = page('meta[property="og:price:currency"]').attr(
        'content')
    product['currency'] = currency_code
    product['currencySymbol'] = tool.get_unit(currency_code)

    sale_price, list_price = self.get_all_price(page, main)
    product['price'] = sale_price
    product['listPrice'] = list_price

    # Colors map to themselves as their own ids.
    palette = self.get_color(main)
    product['color'] = palette
    product['colorId'] = dict((cid, cid) for cid in palette.keys())

    # Image set: a flat list, or a mapping of color id to image list.
    gallery = self.get_imgs(main)
    if isinstance(gallery, list):
        product['img'] = gallery[0]
    else:
        product['img'] = dict(
            (cid, shots[0]) for cid, shots in gallery.items())
    product['imgs'] = gallery

    product['keys'] = palette.keys()

    product['productId'] = main('input[data-product-identifier!=""]').attr(
        'data-product-identifier')

    product['sizes'] = self.get_sizes(main)

    # Description and returns both read the same element, as before.
    product['descr'] = main('.productInfo>.infowrap>dl>dd:first').text()
    product['returns'] = main('.productInfo>.infowrap>dl>dd:first').text()

    product['status_code'] = status_code
    product['status'] = self.cfg.STATUS_SALE
    # The requested URL is stored here, not the response URL.
    product['backUrl'] = url

    self.logger.info(json.dumps(
        dict(time=time.time(),
             productId=product['productId'],
             name=product['name'],
             currency=product['currency'],
             price=product['price'],
             listPrice=product['listPrice'],
             url=url)))

    return tool.return_data(successful=True, data=product)
def detail(self, url):
    """Scrape one product page and wrap the result with ``tool.return_data``.

    The page is requested through ``self.session`` with certificate
    verification disabled and parsed with PyQuery. A 404 response or a
    missing ``#pdp-page`` element returns an off-shelf payload. Any other
    non-200 status returns an error payload. Both use ``successful=False``.
    Otherwise the collected product dict is returned with
    ``successful=True``.

    The price stored at the end is the one returned by ``get_sizes``; it
    replaces the price first obtained from ``get_all_price``.

    :param url: product page address.
    :return: the value produced by ``tool.return_data``.
    :raises: exceptions from the request or the parsing helpers propagate.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    page = PyQuery(resp.text or 'nothing')

    def _reject(sold_out):
        # Log the page title, then wrap an off-shelf or error payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=page('title').text(), url=url)))
        if sold_out:
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=page.outerHtml())
        else:
            payload = tool.get_error(code=status_code,
                                     message=self.cfg.GET_ERR.get(
                                         'SCERR', 'ERROR'),
                                     backUrl=resp.url,
                                     html=page.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Off shelf.
    if status_code == 404:
        return _reject(True)
    # Any other HTTP error.
    if status_code != 200:
        return _reject(False)

    # Preparation.
    main = page('#pdp-page')
    tool.get_domain(url)  # return value is not used by this method

    # Off shelf: the product container is absent.
    if not main:
        return _reject(True)

    product = dict()

    # Product id is the part of the style number before the first '_'.
    style_number = page('img[data-stylenumber!=""]').attr('data-stylenumber')
    product['productId'] = style_number.split('_')[0]

    product['brand'] = 'Lululemon'
    product['name'] = main('h1.OneLinkNoTx').text()

    currency_code = page('input#currencyCode').attr('value').strip()
    product['currency'] = currency_code
    product['currencySymbol'] = tool.get_unit(currency_code)

    sale_price, list_price = self.get_all_price(main)
    product['price'] = sale_price
    product['listPrice'] = list_price

    product['descr'] = self.get_descr(page, main)
    product['detail'] = main('#fabric').text()
    # Returns text is left empty.
    product['returns'] = ''

    color_driver, color_count = self.get_pdata(page)

    # Colors map to themselves as their own ids.
    cover, gallery, palette = self.get_color(main, color_count)
    product['color'] = palette
    product['colorId'] = dict((cid, cid) for cid in palette)

    product['img'] = cover
    product['imgs'] = gallery

    # The size helper also reports a price, which replaces the earlier one.
    size_table, sale_price = self.get_sizes(color_driver)
    product['sizes'] = size_table
    product['price'] = sale_price

    # Keys are only recorded when the colors are a mapping.
    if isinstance(palette, dict):
        product['keys'] = list(palette)

    product['status_code'] = status_code
    product['status'] = self.cfg.STATUS_SALE
    product['backUrl'] = resp.url

    # Peer address of the response, when it is available.
    peer = resp.raw._original_response.peer
    if peer:
        product['ip_port'] = ':'.join(str(part) for part in peer)

    self.logger.info(json.dumps(
        dict(time=time.time(),
             productId=product['productId'],
             name=product['name'],
             currency=product['currency'],
             price=product['price'],
             listPrice=product['listPrice'],
             url=url)))

    return tool.return_data(successful=True, data=product)
def detail(self, url):
    """Scrape one product page and wrap the result with ``tool.return_data``.

    The page is requested through ``self.session`` with certificate
    verification disabled and parsed with PyQuery. ``style`` elements are
    removed from the parsed document before extraction. A 404 response
    returns an off-shelf payload and any other non-200 status returns an
    error payload, both with ``successful=False``. Otherwise the collected
    product dict is returned with ``successful=True``.

    :param url: product page address; also stored as ``backUrl``.
    :return: the value produced by ``tool.return_data``.
    :raises: exceptions from the request or the parsing helpers propagate.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    page = PyQuery(resp.text or 'nothing')

    def _reject(sold_out):
        # Log the page title, then wrap an off-shelf or error payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=page('title').text(), url=url)))
        if sold_out:
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=page.outerHtml())
        else:
            payload = tool.get_error(code=status_code,
                                     message=self.cfg.GET_ERR.get(
                                         'SCERR', 'ERROR'),
                                     backUrl=resp.url,
                                     html=page.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Off shelf.
    if status_code == 404:
        return _reject(True)
    # Any other HTTP error.
    if status_code != 200:
        return _reject(False)

    # Preparation.
    page.remove('style')
    main = page('#overall_content')
    tool.get_domain(url)  # return value is not used by this method
    pdata = self.get_pdata(page)

    product = dict()

    # Brand is fixed and also prefixes the product name.
    brand_name = 'TouchOfModern'
    product['brand'] = brand_name
    product['name'] = brand_name + ' ' + pdata['name']

    currency_code = page('meta[property="og:price:currency"]').attr(
        'content')
    product['currency'] = currency_code
    product['currencySymbol'] = tool.get_unit(currency_code)

    product['price'] = pdata['price']

    # Single default color, identified by the product id.
    product['color'] = self.cfg.DEFAULT_ONE_COLOR
    product_id = pdata['id']
    product['colorId'] = product_id

    # Image set.
    product['img'] = main('.big_image_wrapper a').attr('href')
    gallery = []
    for link in main('div[class="product-image-container"] a').items():
        gallery.append(link.attr('href'))
    product['imgs'] = gallery

    product['productId'] = product_id

    # The size helper also reports the list price. Use the sale price when
    # it reports none.
    list_price, size_table = self.get_sizes(main)
    product['sizes'] = size_table
    product['listPrice'] = list_price or pdata['price']

    # Video, when the page has a video container.
    if len(main('.product-video-container')):
        product['video'] = self.get_video(main)

    # Description and detail both read the same section.
    section_text = main('.product-details-section').text()
    product['descr'] = section_text
    product['detail'] = section_text
    # Returns and exchanges.
    product['returns'] = main('.shipping-details-listt').text()

    product['status_code'] = status_code
    product['status'] = self.cfg.STATUS_SALE
    # The requested URL is stored here, not the response URL.
    product['backUrl'] = url

    self.logger.info(json.dumps(
        dict(time=time.time(),
             productId=product['productId'],
             name=product['name'],
             currency=product['currency'],
             price=product['price'],
             listPrice=product['listPrice'],
             url=url)))

    return tool.return_data(successful=True, data=product)
def detail(self, url):
    """Scrape one product page and wrap the result with ``tool.return_data``.

    The page is requested through ``self.session`` with certificate
    verification disabled and parsed with PyQuery. The URL's domain is stored
    on ``self.domain``. An off-shelf payload is returned
    (``successful=False``) for a 404 response or when the secondary product
    data has an ``errorMessage`` containing 'Product may be unavailable'.
    Any other non-200 status returns an error payload. Otherwise the
    collected product dict is returned with ``successful=True``.

    :param url: product page address.
    :return: the value produced by ``tool.return_data``.
    :raises: exceptions from the request or the parsing helpers propagate.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    page = PyQuery(resp.text or 'nothing')

    def _reject(sold_out):
        # Log the page title, then wrap an off-shelf or error payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=page('title').text(), url=url)))
        if sold_out:
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=page.outerHtml())
        else:
            payload = tool.get_error(code=status_code,
                                     message=self.cfg.GET_ERR.get(
                                         'SCERR', 'ERROR'),
                                     backUrl=resp.url,
                                     html=page.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Off shelf.
    if status_code == 404:
        return _reject(True)
    # Any other HTTP error.
    if status_code != 200:
        return _reject(False)

    # Preparation.
    page('.product-detail-information')  # selection is not used afterwards
    self.domain = tool.get_domain(url)
    pdata = self.get_pdata(page)
    product_id = pdata['id']
    sub_data = self.get_subData(product_id)

    # Off shelf: the secondary lookup reports the product as unavailable.
    if 'Product may be unavailable' in sub_data.get('errorMessage', ''):
        return _reject(True)

    product = dict()

    # One id serves as product id, SKU and product code.
    for field in ('productId', 'productSku', 'productCode'):
        product[field] = product_id

    product['brand'] = sub_data['productThumbnail']['brand']

    # Name, falling back to the text of the short-description markup.
    product['name'] = (pdata.get('name', '')
                       or PyQuery(pdata['shortDescription']).text())

    currency_code = page('meta[itemprop="priceCurrency"]').attr('content')
    product['currency'] = currency_code
    product['currencySymbol'] = tool.get_unit(currency_code)

    sale_price, list_price = self.get_all_price(pdata)
    product['price'] = sale_price or list_price
    product['listPrice'] = list_price

    # The first value is immediately replaced by the second, as before.
    product['descr'] = page('#memberProductDetails').text()
    product['descr'] = page('.product-details-content').text()

    # Image set: a flat list, or a mapping of color to image list.
    gallery = self.get_imgs(sub_data)
    if isinstance(gallery, list):
        product['img'] = gallery[0]
    else:
        product['img'] = dict(
            (cid, shots[0]) for cid, shots in gallery.items())
    product['imgs'] = gallery

    size_table = self.get_sizes_by_subdata(sub_data['availabilityMap'])
    product['sizes'] = size_table

    # Some colors appear in the sizes but not in the prices (2016-11-27), so
    # the price colors are used as keys when prices are per color.
    if isinstance(sale_price, dict):
        key_source = sale_price.keys()
    else:
        key_source = size_table.keys()
    product['keys'] = set(key_source)

    # Some colors have no image; give them an arbitrary existing one
    # (2016-11-27).
    if isinstance(sale_price, dict):
        for color_name in sale_price.keys():
            if color_name not in product['imgs']:
                product['imgs'][color_name] = gallery.values()[0]
                product['img'][color_name] = gallery.values()[0][0]

    # Colors map to themselves; color and colorId share one dict.
    palette = dict((name, name) for name in size_table.keys())
    product['color'] = palette
    product['colorId'] = palette

    product['status_code'] = status_code
    product['status'] = self.cfg.STATUS_SALE
    product['backUrl'] = resp.url

    # Peer address of the response, when it is available.
    peer = resp.raw._original_response.peer
    if peer:
        product['ip_port'] = ':'.join(str(part) for part in peer)

    self.logger.info(json.dumps(
        dict(time=time.time(),
             productId=product['productId'],
             name=product['name'],
             currency=product['currency'],
             price=product['price'],
             listPrice=product['listPrice'],
             url=url)))

    return tool.return_data(successful=True, data=product)
def detail(self, url):
    """Scrape one product page and wrap the result with ``tool.return_data``.

    The page is requested through ``self.session`` with certificate
    verification disabled and parsed with PyQuery. ``is_ok_status_code``
    decides whether the response is usable; when it is not, the data it
    returned is handed back as is. An off-shelf payload is returned
    (``successful=False``) when the product data is empty and no
    ``#divSelectOpt input`` element exists. Otherwise the collected product
    dict is returned with ``successful=True``.

    :param url: product page address.
    :return: the status helper's data, or the value produced by
        ``tool.return_data``.
    :raises: exceptions from the request or the parsing helpers propagate.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    page = PyQuery(resp.text or 'nothing')

    # Status handling is delegated to the shared helper.
    usable, early_result = self.is_ok_status_code(
        status_code, page, url, resp)
    if not usable:
        return early_result

    # Preparation.
    main = page('#contentArea')
    tool.get_domain(url)  # return value is not used by this method
    product_id = main('form input[name="prodCode"]').attr('value')
    pdata = self.get_pdata(product_id)

    # Off shelf: no product data and no option inputs on the page.
    if not (pdata or main('#divSelectOpt input')):
        self.logger.info(json.dumps(
            dict(time=time.time(), title=page('title').text(), url=url)))
        payload = tool.get_off_shelf(code=status_code,
                                     message=self.cfg.SOLD_OUT,
                                     backUrl=resp.url,
                                     html=page.outerHtml())
        return tool.return_data(successful=False, data=payload)

    product = dict()

    # One id serves as product id, SKU and product code.
    for field in ('productId', 'productSku', 'productCode'):
        product[field] = product_id

    product['brand'] = main('.detailInfo .infoTable .titleWrap a:first').text()
    product['name'] = main('.detailInfo .infoTable .titleWrap').text()

    # The price helper also reports the currency.
    currency_code, sale_price, list_price = self.get_all_price(main)
    product['price'] = sale_price
    product['listPrice'] = list_price
    product['currency'] = currency_code
    product['currencySymbol'] = tool.get_unit(currency_code)

    # Description, with a fixed placeholder when the page has none.
    product['descr'] = main('.infoTable .optTable_1').text() or u'没有获取到描述'

    # Single default color.
    product['color'] = self.cfg.DEFAULT_ONE_COLOR
    product['colorId'] = self.cfg.DEFAULT_COLOR_SKU

    # Image paths on the page are prefixed with the site origin.
    site_root = 'http://www.sheisback.com'
    gallery = []
    for node in main('.detailArea img').items():
        gallery.append(site_root + node.attr('src'))
    product['img'] = site_root + main('.detailImg img:first').attr(
        'data-zoom-image')
    product['imgs'] = gallery

    product['sizes'] = self.get_sizes(pdata)

    product['status_code'] = status_code
    product['status'] = self.cfg.STATUS_SALE
    product['backUrl'] = resp.url

    # Peer address of the response, when it is available.
    peer = resp.raw._original_response.peer
    if peer:
        product['ip_port'] = ':'.join(str(part) for part in peer)

    self.logger.info(json.dumps(
        dict(time=time.time(),
             productId=product['productId'],
             name=product['name'],
             currency=product['currency'],
             price=product['price'],
             listPrice=product['listPrice'],
             url=url)))

    return tool.return_data(successful=True, data=product)
def detail(self, url):
    """Scrape one product page and wrap the result with ``tool.return_data``.

    The page is requested through ``self.session`` with certificate
    verification disabled and parsed with PyQuery. Product data is read from
    the ``dataLayer`` object literal found in the inline scripts. An
    off-shelf payload is returned (``successful=False``) for a 404 response,
    when the availability meta tag is not 'InStock', or when the
    ``.product-simple`` text contains 'Sold Out'. Any other non-200 status
    returns an error payload. Otherwise the collected product dict is
    returned with ``successful=True``.

    :param url: product page address; also stored as ``backUrl``.
    :return: the value produced by ``tool.return_data``.
    :raises AssertionError: when ``dataLayer`` does not describe exactly one
        product.
    :raises: other exceptions from the request or helpers propagate.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    page = PyQuery(resp.text or 'nothing')
    site_domain = tool.get_domain(url)

    def _reject(sold_out):
        # Log the page title, then wrap an off-shelf or error payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=page('title').text(), url=url)))
        if sold_out:
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=page.outerHtml())
        else:
            payload = tool.get_error(code=status_code,
                                     message=self.cfg.GET_ERR.get(
                                         'SCERR', 'ERROR'),
                                     backUrl=resp.url,
                                     html=page.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Off shelf.
    if status_code == 404:
        return _reject(True)
    # Any other HTTP error.
    if status_code != 200:
        return _reject(False)

    # Preparation.
    script_text = page('script').text()
    main = page('.product-area')

    # The dataLayer literal uses single quotes. Swap them for double quotes
    # so it parses as JSON.
    layer_source = re.search(r'dataLayer = \[(\{.*?\})\];', script_text,
                             re.DOTALL).groups()[0]
    data_layer = json.loads(layer_source.replace('\'', '"'))

    # The site is expected to show one product in one color. Several colors
    # with several sizes and color ids are handled, but several colors with
    # separate images are not (see get_imgs).
    assert len(data_layer['productDetails']
               ) == 1, 'coggles too many products , fix this bug'

    layer_product_id, pdata = self.get_pdata(site_domain, data_layer)

    in_stock = main('meta[itemprop="availability"]').attr(
        'content') == 'InStock'

    # Off shelf: not in stock, or the page says sold out.
    if not in_stock or 'Sold Out' in main('.product-simple').text():
        return _reject(True)

    product = dict()

    # Images.
    gallery = self.get_imgs(pdata, page)
    product['img'] = gallery[0]
    product['imgs'] = gallery

    product['name'] = main('.product-title-wrap').text()

    # Brand is embedded in an inline script.
    product['brand'] = re.search(r'productBrand: "(.*?)",', script_text,
                                 re.DOTALL).groups()[0]

    sale_price, list_price = self.get_all_price(main, pdata)
    product['price'] = sale_price
    product['listPrice'] = list_price

    # Currency.
    currency_code = data_layer['pageAttributes'][0]['currency']
    product['currency'] = currency_code
    product['currencySymbol'] = tool.get_unit(currency_code)

    # The stored product id comes from the form input. Sizes below use the
    # id returned by get_pdata.
    product['productId'] = page('input[name="prodId"]').attr('value')

    # Colors map to themselves as their own ids.
    palette = self.get_color(pdata)
    product['color'] = palette
    product['colorId'] = dict((cid, cid) for cid in palette.keys())
    product['keys'] = palette.keys()

    product['sizes'] = self.get_sizes(layer_product_id, pdata)

    # Description: two blocks joined, with single quotes stripped.
    lead_text = main('div[itemprop="product-description"]').text().replace(
        '\'', '')
    body_text = main('div[itemprop="description"]').text().replace('\'', '')
    product['descr'] = lead_text + body_text

    # Note: only recorded when more than one promotional message exists.
    if len(main('.promotionalmessage')) > 1:
        product['note'] = main('.promotionalmessage').text()

    product['detail'] = main('.js-prodInfo-details').text()

    # Returns and delivery information.
    product['returns'] = main('div.product-delivery-returns').text().replace(
        '\'', '')

    product['status_code'] = status_code
    product['status'] = self.cfg.STATUS_SALE
    # The requested URL is stored here, not the response URL.
    product['backUrl'] = url

    self.logger.info(json.dumps(
        dict(time=time.time(),
             productId=product['productId'],
             name=product['name'],
             currency=product['currency'],
             price=product['price'],
             listPrice=product['listPrice'],
             url=url)))

    return tool.return_data(successful=True, data=product)
def detail(self, url):
    """Scrape one product page and return the packaged result.

    The page at ``url`` is fetched through ``self.session``.  A 404 response,
    or a page containing ``#itemOptions #addToBasketDisabled``, yields an
    off-shelf payload; any other non-200 status yields an error payload.  Both
    are returned via ``tool.return_data(successful=False, ...)``.  Otherwise
    the product fields are collected into a dict and returned via
    ``tool.return_data(successful=True, ...)``.

    Exceptions raised while fetching or parsing propagate unchanged.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def log_page():
        # Log the rejected page's title and URL before building the payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))

    def sold_out():
        log_page()
        payload = tool.get_off_shelf(code=status_code,
                                     message=self.cfg.SOLD_OUT,
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # A 404 is reported as an off-shelf product.
    if status_code == 404:
        return sold_out()

    # Every other non-200 status is reported as an error.
    if status_code != 200:
        log_page()
        payload = tool.get_error(code=status_code,
                                 message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                 backUrl=resp.url,
                                 html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # An '#addToBasketDisabled' element inside '#itemOptions' means off-shelf.
    if len(pqhtml('#itemOptions #addToBasketDisabled')) > 0:
        return sold_out()

    # Preparation: product container and the raw data blob used below.
    area = pqhtml('#details') or pqhtml('#productPage')
    domain = tool.get_domain(url)  # not used below; call retained unchanged
    pdata = self.get_data(pqhtml)

    detail = dict()

    # Brand
    detail['brand'] = re.search(r'brand: "(.*?)",', pdata).groups()[0]

    # Name
    detail['name'] = area('h1[itemprop="name"]').text()

    # Currency
    currency = self.get_currency(pdata)
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Prices
    price, list_price = self.get_all_price(pdata, area)
    detail['price'] = price
    detail['listPrice'] = list_price

    # Image set (gallery selector updated 2016-09-16 13:51:08)
    gallery = area('#itemGallery') or area('#galleryBasic')
    pictures = [node.attr('src') for node in gallery('img').items()]
    if not pictures:
        # Fallback source, updated 2017-03-3.
        pictures = [
            node.attr('data-zoom-image')
            for node in area('#product-view .main-image img').items()
        ]
    detail['img'] = pictures[0]
    detail['imgs'] = pictures

    # Product ID
    product_id = area('.wishlistAdd').attr('data-sku')
    if not product_id:
        product_id = area('#productPage').attr('data-sku')
    detail['productId'] = product_id

    # Color: a single default color keyed by the product ID.
    detail['color'] = self.cfg.DEFAULT_ONE_COLOR
    detail['colorId'] = product_id

    # Sizes
    detail['sizes'] = self.get_sizes(area)

    # Description
    detail['descr'] = area('#itemInfo ul>li:first').text()

    # Returns / exchanges
    detail['returns'] = area('#itemInfo ul>li:last').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Sale status
    detail['status'] = self.cfg.STATUS_SALE

    # Final URL of the response
    detail['backUrl'] = resp.url

    summary = dict(time=time.time(),
                   productId=detail['productId'],
                   name=detail['name'],
                   currency=detail['currency'],
                   price=detail['price'],
                   listPrice=detail['listPrice'],
                   url=url)
    self.logger.info(json.dumps(summary))

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Scrape one product page and return the packaged result.

    The page at ``url`` is fetched through ``self.session``.  A 404 response,
    or a page whose ``.topOff`` text contains ``SOLD OUT``, yields an
    off-shelf payload; any other non-200 status yields an error payload.
    Most product fields are read from the page's ``application/ld+json``
    script; the rest come from the ``.detalheProdutos`` container.

    Raises:
        ValueError: when more than one color link is present under
            ``#listaCores``.

    Other exceptions raised while fetching or parsing propagate unchanged.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def log_page():
        # Log the rejected page's title and URL before building the payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))

    def sold_out():
        log_page()
        payload = tool.get_off_shelf(code=status_code,
                                     message=self.cfg.SOLD_OUT,
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # A 404 is reported as an off-shelf product.
    if status_code == 404:
        return sold_out()

    # Every other non-200 status is reported as an error.
    if status_code != 200:
        log_page()
        payload = tool.get_error(code=status_code,
                                 message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                 backUrl=resp.url,
                                 html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Preparation: product container, site domain and structured data.
    area = pqhtml('.detalheProdutos')
    domain = tool.get_domain(url)
    ld_json = pqhtml("script[type='application/ld+json']").text()
    pdata = json.loads(ld_json)

    # Off-shelf marker in the page itself.
    if 'SOLD OUT' in area('.topOff').text():
        return sold_out()

    detail = dict()

    # Brand
    detail['brand'] = pdata['brand']['name']

    # Name
    detail['name'] = pdata['name']

    # Currency
    currency = pdata['offers']['priceCurrency']
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Prices
    list_price = self.get_listPrice(area, currency)
    detail['price'] = pdata['offers']['price']
    detail['listPrice'] = list_price

    # Only a single color is supported; more than one is treated as a bug.
    if len(area('#listaCores a')) > 1:
        raise ValueError(
            'color number is great than 1 , fix this bug : %s' % url)

    # Color
    first_color = '#listaCores a:first'
    detail['color'] = area(first_color).text()
    detail['colorId'] = area(first_color).attr('data-id')

    # Image set: each href has its first character dropped and the domain
    # prepended.
    gallery = []
    for link in pqhtml('.lightgalleryG .item a').items():
        gallery.append(domain + link.attr('href')[1:])
    detail['img'] = pdata['image']
    detail['imgs'] = gallery

    # Product ID
    product_id = area('a#btAddCarrinho').attr('data-id')
    detail['productId'] = product_id

    # Sizes: one entry per size link, or a single default size when none.
    size_rows = [
        dict(name=link.text(),
             inventory=self.cfg.DEFAULT_STOCK_NUMBER,
             id=link.attr('data-id'),
             sku=link.attr('data-ref'),
             price=link.attr('data-preco').split()[0])
        for link in area('div#listaTamanhos a').items()
    ]
    if not size_rows:
        size_rows = [
            dict(name=self.cfg.DEFAULT_ONE_SIZE,
                 inventory=self.cfg.DEFAULT_STOCK_NUMBER,
                 id=product_id)
        ]
    detail['sizes'] = size_rows

    # Description
    detail['descr'] = pdata['description']

    # Details
    detail['detail'] = area('.descMarca').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Sale status
    detail['status'] = self.cfg.STATUS_SALE

    # Final URL of the response
    detail['backUrl'] = resp.url

    summary = dict(time=time.time(),
                   productId=detail['productId'],
                   name=detail['name'],
                   currency=detail['currency'],
                   price=detail['price'],
                   listPrice=detail['listPrice'],
                   url=url)
    self.logger.info(json.dumps(summary))

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Scrape one product page and return the packaged result.

    The page at ``url`` is fetched through ``self.session``.  A 404 response
    yields an off-shelf payload and any other non-200 status an error
    payload, both returned via ``tool.return_data(successful=False, ...)``.
    Otherwise the product fields are read from the ``#productview #main``
    container and returned via ``tool.return_data(successful=True, ...)``.

    Exceptions raised while fetching or parsing propagate unchanged.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def reject(build_payload):
        # Log the rejected page, then wrap the payload produced by the caller.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))
        return tool.return_data(successful=False, data=build_payload())

    # A 404 is reported as an off-shelf product.
    if status_code == 404:
        return reject(lambda: tool.get_off_shelf(
            code=status_code,
            message=self.cfg.SOLD_OUT,
            backUrl=resp.url,
            html=pqhtml.outerHtml()))

    # Every other non-200 status is reported as an error.
    if status_code != 200:
        return reject(lambda: tool.get_error(
            code=status_code,
            message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
            backUrl=resp.url,
            html=pqhtml.outerHtml()))

    # Preparation
    area = pqhtml('#productview #main')
    domain = tool.get_domain(url)  # not used below; call retained unchanged

    detail = dict()

    # Brand
    detail['brand'] = self.get_brand(pqhtml)

    # Name
    detail['name'] = area('#name').text()

    # Currency
    currency = pqhtml('div[id="doc"]').attr('currency')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Prices and sizes come from one helper call.
    price, list_price, size_rows = self.get_info(area)
    detail['price'] = price
    detail['listPrice'] = list_price

    # Product ID
    product_id = area('input#productid').attr('value')
    detail['productId'] = product_id

    # Color: a single default color keyed by the product ID.
    detail['color'] = self.cfg.DEFAULT_ONE_COLOR
    detail['colorId'] = product_id

    # Image set
    pictures = []
    for thumb in area('#thumbs-anim img').items():
        pictures.append(thumb.attr('data-hires'))
    detail['img'] = pictures[0]
    detail['imgs'] = pictures

    # Sizes
    detail['sizes'] = size_rows

    # Description and details are both read from '.product-details'.
    detail['descr'] = area('.product-details').text()
    detail['detail'] = area('.product-details').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Sale status
    detail['status'] = self.cfg.STATUS_SALE

    # Final URL of the response
    detail['backUrl'] = resp.url

    summary = dict(time=time.time(),
                   productId=detail['productId'],
                   name=detail['name'],
                   currency=detail['currency'],
                   price=detail['price'],
                   listPrice=detail['listPrice'],
                   url=url)
    self.logger.info(json.dumps(summary))

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Scrape one product page and return the packaged result.

    The page at ``url`` is fetched through ``self.session``.  A 404 response
    yields an off-shelf payload and any other non-200 status an error
    payload.  When the page carries a ``<meta http-equiv="refresh">`` tag the
    method sleeps for half of the announced delay, sets the
    ``INSTART_SESSION_ID`` cookie and fetches the page a second time.  A page
    without ``form[name="productPage"]``, or with a ``.cannotorder`` element
    inside it, is reported as off-shelf.  Otherwise the product fields are
    collected and returned via ``tool.return_data(successful=True, ...)``.

    Any exception is logged together with the last parsed document and then
    re-raised unchanged.
    """
    # Placeholder so the except-handler can log even if the request fails.
    pqhtml = ''
    try:
        resp = self.session.get(url, verify=False)
        status_code = resp.status_code
        pqhtml = PyQuery(resp.text or 'nothing')

        def log_page():
            # Log the rejected page's title and URL.
            self.logger.info(json.dumps(
                dict(time=time.time(), title=pqhtml('title').text(), url=url)))

        def sold_out():
            log_page()
            payload = tool.get_off_shelf(code=status_code,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=resp.url,
                                         html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        # A 404 is reported as an off-shelf product.
        if status_code == 404:
            return sold_out()

        # Every other non-200 status is reported as an error.
        if status_code != 200:
            log_page()
            payload = tool.get_error(
                code=status_code,
                message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                backUrl=resp.url,
                html=pqhtml.outerHtml())
            return tool.return_data(successful=False, data=payload)

        # Preparation
        area = pqhtml('form[name="productPage"]')

        refresh_meta = pqhtml('meta[http-equiv="refresh"]')
        if len(refresh_meta):
            # The content attribute starts with the delay in seconds.
            delay = refresh_meta.attr('content').strip().split(';')[0]
            # Floor division keeps the integer result the Python 2 '/' gave.
            sleep_seconds = int(delay) // 2
            time.sleep(sleep_seconds)

            self.session.cookies.set(
                'INSTART_SESSION_ID',
                str(int((time.time() - sleep_seconds) * 1000)))

            resp = self.session.get(url, verify=False)
            # Same empty-body fallback as the first parse.
            # NOTE(review): status_code still holds the first response's
            # status; confirm whether it should be refreshed here.
            pqhtml = PyQuery(resp.text or 'nothing')
            area = pqhtml('form[name="productPage"]')

        # Missing product form or a '.cannotorder' marker means off-shelf.
        if not area or len(area('.cannotorder')):
            return sold_out()

        product_id = area('input[name$="productId"][value!=""]').attr('value')
        pdata = self.get_pdata(area, product_id)

        detail = dict()

        # Product ID
        detail['productId'] = product_id

        # Brand
        detail['brand'] = area('input.cmDesignerName').attr('value')

        # Name
        detail['name'] = area('h1.product-name:first').text()

        # Currency
        currency = pqhtml('meta[itemprop="priceCurrency"]').attr('content')
        detail['currency'] = currency
        detail['currencySymbol'] = tool.get_unit(currency)

        # Prices
        price, list_price = self.get_all_price(area)
        detail['price'] = price
        detail['listPrice'] = list_price

        # Description
        detail['descr'] = area('div[itemprop="description"]').text()

        # Details
        detail['detail'] = area('.product-details-info').text()

        # Image set
        img, imgs = self.get_imgs(area)
        detail['img'] = img
        detail['imgs'] = imgs

        # Sizes; a dict result also supplies the keys and color maps.
        sizes = self.get_sizes(pdata)
        detail['sizes'] = sizes
        if isinstance(sizes, dict):
            detail['keys'] = sizes.keys()
            detail['color'] = {key: key for key in sizes}
            detail['colorId'] = {key: key for key in sizes}

        # HTTP status code
        detail['status_code'] = status_code

        # Sale status
        detail['status'] = self.cfg.STATUS_SALE

        # Final URL of the response
        detail['backUrl'] = resp.url

        # Peer address of the response, when one is recorded.
        peer = resp.raw._original_response.peer
        if peer:
            detail['ip_port'] = ':'.join(map(str, peer))

        summary = dict(time=time.time(),
                       productId=detail['productId'],
                       name=detail['name'],
                       currency=detail['currency'],
                       price=detail['price'],
                       listPrice=detail['listPrice'],
                       url=url)
        self.logger.info(json.dumps(summary))

        return tool.return_data(successful=True, data=detail)

    except Exception:
        # Lazy %-formatting: the message is built by the logging machinery,
        # not in this handler, before the original exception is re-raised.
        self.logger.exception('html:%s', pqhtml)
        raise
def detail(self, url):
    """Scrape one product page and return the packaged result.

    The page at ``url`` is fetched through ``self.session``.  A 404 response,
    or a page whose ``div[itemprop="availability"]`` text is not exactly
    ``Available``, yields an off-shelf payload; any other non-200 status
    yields an error payload.  Otherwise the product fields are read from the
    ``.page-content`` container and returned via
    ``tool.return_data(successful=True, ...)``.

    Exceptions raised while fetching or parsing propagate unchanged.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def log_page():
        # Log the rejected page's title and URL before building the payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))

    def sold_out():
        log_page()
        payload = tool.get_off_shelf(code=status_code,
                                     message=self.cfg.SOLD_OUT,
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # A 404 is reported as an off-shelf product.
    if status_code == 404:
        return sold_out()

    # Every other non-200 status is reported as an error.
    if status_code != 200:
        log_page()
        payload = tool.get_error(code=status_code,
                                 message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                 backUrl=resp.url,
                                 html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Preparation
    area = pqhtml('.page-content')
    domain = tool.get_domain(url)

    # Anything other than 'Available' is treated as off-shelf.
    availability = area('div[itemprop="availability"]').text().strip()
    if availability != 'Available':
        return sold_out()

    detail = dict()

    # Brand (fixed for this site)
    detail['brand'] = 'Kit and Ace'

    # Name
    detail['name'] = area('h1[itemprop="name"]').text()

    # Currency
    currency = pqhtml('meta[itemprop="priceCurrency"]').attr('content')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Prices
    price, list_price = self.get_all_price(area)
    detail['price'] = price
    detail['listPrice'] = list_price

    # Colors; each color key maps to itself as its ID.
    color = self.get_color(area)
    detail['color'] = color
    detail['colorId'] = {key: key for key in color.keys()}

    # Image set: a list gives one cover image, a mapping gives one per key.
    imgs = self.get_imgs(area, domain)
    if isinstance(imgs, list):
        detail['img'] = imgs[0]
    else:
        detail['img'] = {cid: group[0] for cid, group in imgs.items()}
    detail['imgs'] = imgs

    # Keys
    detail['keys'] = color.keys()

    # Product ID
    detail['productId'] = area('.js-pdp-product-code').attr('data-product-id')

    # Sizes
    detail['sizes'] = self.get_sizes(area)

    # Description
    detail['descr'] = area('.pdp-desc__description').text()

    # Fabric / construction
    detail['fabric'] = area('.pdp-info-components').text()

    # Details
    detail['detail'] = area('.productDetailsPageSection1').text()

    # Returns / exchanges
    detail['returns'] = area('.productInfo>.infowrap>dl>dd:first').text()

    # Model information
    detail['model'] = self.get_model(area, color.keys())

    # HTTP status code
    detail['status_code'] = status_code

    # Sale status
    detail['status'] = self.cfg.STATUS_SALE

    # Requested URL
    detail['backUrl'] = url

    summary = dict(time=time.time(),
                   productId=detail['productId'],
                   name=detail['name'],
                   currency=detail['currency'],
                   price=detail['price'],
                   listPrice=detail['listPrice'],
                   url=url)
    self.logger.info(json.dumps(summary))

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Scrape a product page plus its AJAX detail feed and return the result.

    The page at ``url`` is fetched first.  A 404 response yields an off-shelf
    payload and any other non-200 status an error payload.  Only for a 200
    page is the numeric ID taken from the URL and
    ``/ajaxprodDetail.aspx?ProdId=<id>`` requested; its JSON body supplies
    the products.  An empty ``Prods`` list is reported as off-shelf.

    Each per-product field of the result (name, sizes, price, images, ...) is
    a dict keyed by the product's ``ProdID``; ``detail['keys']`` lists those
    IDs in feed order.

    Failure payloads describe the product page (status, URL and HTML), not
    the AJAX request.

    Exceptions raised while fetching or parsing propagate unchanged.
    """
    page_resp = self.session.get(url, verify=False)
    status_code = page_resp.status_code
    pqhtml = PyQuery(page_resp.text or 'nothing')

    def log_page():
        # Log the rejected page's title and URL before building the payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))

    def sold_out():
        log_page()
        payload = tool.get_off_shelf(code=status_code,
                                     message=self.cfg.SOLD_OUT,
                                     backUrl=page_resp.url,
                                     html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Check the page status before any further work, so an error page does
    # not trigger the ID extraction or the second request.
    if status_code == 404:
        return sold_out()

    if status_code != 200:
        log_page()
        payload = tool.get_error(code=status_code,
                                 message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                 backUrl=page_resp.url,
                                 html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Preparation: build and request the AJAX detail URL.
    domain = tool.get_domain(url)
    page_id = re.search(r'.*\/(\d+)\/.*', url, re.DOTALL).groups()[0]
    link = domain + ('/ajaxprodDetail.aspx?ProdId=%s' % page_id)
    ajax_resp = self.session.get(link, verify=False)
    pdata = json.loads(ajax_resp.text)

    # No products in the feed means off-shelf.
    if not pdata['Prods']:
        return sold_out()

    amount_re = re.compile(r'(\d[\d\.]*)', re.DOTALL)

    def amount(markup):
        # First numeric run of the markup's text, thousands separators removed.
        text = PyQuery(markup).text().replace(',', '')
        return amount_re.search(text).groups()[0]

    def best_src(image):
        # Prefer the 700px source, then the 350px one, then the plain one.
        return image['img700Src'] or image['img350Src'] or image['imgSrc']

    detail = dict()

    # Brand
    brand_info = pdata['Brand']
    brand = brand_info.get('BrandLangName', None) or brand_info['DisplayBrandName']
    detail['brand'] = brand

    # Currency: three-letter code taken from the changeCurrency onclick handler.
    currency = re.search(
        r"\(\'(\w{3})\'\)",
        pqhtml('a[onclick^="changeCurrency"]').attr('onclick'),
        re.DOTALL).groups()[0]
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Per-product containers, each keyed by ProdID.
    detail['keys'] = []
    for field in ('name', 'sizes', 'price', 'img', 'imgs', 'descr',
                  'listPrice', 'color', 'colorId', 'productId'):
        detail[field] = dict()

    for product in pdata['Prods']:
        pid = product['ProdID']
        images = product['ProductImages']

        detail['keys'].append(pid)
        detail['productId'][pid] = pid
        detail['color'][pid] = self.cfg.DEFAULT_ONE_COLOR
        detail['colorId'][pid] = pid
        detail['name'][pid] = brand + ' ' + product['ProdLangName']
        detail['sizes'][pid] = [
            dict(name=product['OptionValue'],
                 inventory=self.cfg.DEFAULT_STOCK_NUMBER,
                 sku=product['OptionValue'])
        ]
        detail['price'][pid] = amount(product['ShopPrice'])
        # Fall back to the shop price when there is no previous price.
        detail['listPrice'][pid] = amount(
            product['WasPrice'] or product['ShopPrice'])
        detail['img'][pid] = best_src(images[0])
        detail['imgs'][pid] = [best_src(image) for image in images]
        detail['descr'][pid] = ' '.join(
            [part.get('text') for part in product['Description']])

    # HTTP status code
    detail['status_code'] = status_code

    # Sale status
    detail['status'] = self.cfg.STATUS_SALE

    # Requested URL
    detail['backUrl'] = url

    summary = dict(time=time.time(),
                   productId=detail['productId'],
                   name=detail['name'],
                   currency=detail['currency'],
                   price=detail['price'],
                   listPrice=detail['listPrice'],
                   url=url)
    self.logger.info(json.dumps(summary))

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Scrape one product page and return the packaged result.

    The page at ``url`` is fetched through ``self.session``.  A 404 response,
    or a page whose ``#stock-status`` text contains the out-of-stock marker,
    yields an off-shelf payload; any other non-200 status yields an error
    payload.  Otherwise the product fields are read from the
    ``.product-detail-container`` element and returned via
    ``tool.return_data(successful=True, ...)``.

    Exceptions raised while fetching or parsing propagate unchanged.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def log_page():
        # Log the rejected page's title and URL before building the payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))

    def sold_out():
        log_page()
        payload = tool.get_off_shelf(code=status_code,
                                     message=self.cfg.SOLD_OUT,
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # A 404 is reported as an off-shelf product.
    if status_code == 404:
        return sold_out()

    # Every other non-200 status is reported as an error.
    if status_code != 200:
        log_page()
        payload = tool.get_error(code=status_code,
                                 message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                 backUrl=resp.url,
                                 html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Preparation
    area = pqhtml('.product-detail-container')
    domain = tool.get_domain(url)  # not used below; call retained unchanged

    # Out-of-stock marker in the stock status text.
    if u'缺货' in area('#stock-status').text():
        return sold_out()

    detail = dict()

    # Brand
    brand = area('#brand:first span').text()
    if not brand:
        brand = area('#brand a').text()
    detail['brand'] = brand

    # Name
    detail['name'] = area('#name').text()

    # Currency
    currency = area('#price-currency').attr('content')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Prices
    price, list_price = self.get_all_price(area)
    detail['price'] = price
    detail['listPrice'] = list_price

    # Color: a single default color with the default color SKU.
    detail['color'] = self.cfg.DEFAULT_ONE_COLOR
    detail['colorId'] = self.cfg.DEFAULT_COLOR_SKU

    # Image set: thumbnails first, then zoom images, then the summary link.
    pictures = [
        thumb.attr('data-large-img')
        for thumb in area('.image-container .thumbnail-container img').items()
    ]
    if not pictures:
        pictures = [
            node.attr('src')
            for node in area('#iherb-product-zoom img').items()
        ]
    if not pictures:
        pictures = [
            area('#product-image .product-summary-image a').attr('href')
        ]
    detail['img'] = pictures[0]
    detail['imgs'] = pictures

    # Product ID
    product_id = area('input[name="pid"]').attr('value')
    detail['productId'] = product_id

    # Sizes: one default size; inventory is the last quantity option when
    # the stock text reports stock, otherwise 0.
    stock_txt = area('#stock-status').text()
    if 'In Stock' in stock_txt or u'有库存' in stock_txt:
        inventory = area('#ddlQty option:last').attr('value')
    else:
        inventory = 0
    detail['sizes'] = [
        dict(name=self.cfg.DEFAULT_ONE_SIZE,
             inventory=inventory,
             id=product_id,
             sku=product_id)
    ]

    # Description
    detail['descr'] = area('#product-specs-list li').text()

    # Details
    detail['detail'] = pqhtml('div[itemprop="description"]').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Sale status
    detail['status'] = self.cfg.STATUS_SALE

    # Requested URL
    detail['backUrl'] = url

    summary = dict(time=time.time(),
                   productId=detail['productId'],
                   name=detail['name'],
                   currency=detail['currency'],
                   price=detail['price'],
                   listPrice=detail['listPrice'],
                   url=url)
    self.logger.info(json.dumps(summary))

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Scrape one product page and return the packaged result.

    Stores the URL's domain on ``self.domain``, fetches ``url`` and passes the
    response through ``self.end_verify``.  A 404 status or a head title
    containing ``404 not found`` yields an off-shelf payload, as does
    ``Sold out`` in the buy box.  Any other non-200 status, or a
    ``div.notonline`` element, yields an error payload.  Otherwise the product
    fields are collected and returned via
    ``tool.return_data(successful=True, ...)``.

    Exceptions raised while fetching or parsing propagate unchanged.
    """
    # Bind the domain of this request to the instance.
    self.domain = tool.get_domain(url)

    resp = self.session.get(url, verify=False)

    # Site-specific verification step.
    resp = self.end_verify(resp, url)

    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def log_page():
        # Log the rejected page's title and URL before building the payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))

    def sold_out():
        log_page()
        payload = tool.get_off_shelf(code=status_code,
                                     message=self.cfg.SOLD_OUT,
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    def failed(err_key):
        log_page()
        payload = tool.get_error(code=status_code,
                                 message=self.cfg.GET_ERR.get(err_key, 'ERROR'),
                                 backUrl=resp.url,
                                 html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Not found: by status code or by the page's head title.
    if status_code == 404:
        return sold_out()
    if '404 not found' in pqhtml('head title').text().lower():
        return sold_out()

    # Every other non-200 status is reported as an error.
    if status_code != 200:
        return failed('SCERR')

    Jtxt = pqhtml('script').text()
    area = pqhtml('div.product-essential')

    # Off-shelf marker in the buy box.
    if 'Sold out' in area('div.product-buy-box').text():
        return sold_out()

    # Sold offline only.
    if len(area('div.notonline')) > 0:
        return failed('NTONLINE')

    # Product configuration extracted from the script text.
    pcfg = self.get_pcfg(Jtxt)

    detail = dict()

    # Currency
    currency = pqhtml('meta[property="product:price:currency"]').attr('content')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Prices
    price, list_price = self.get_all_price(pcfg)
    detail['price'] = price
    detail['listPrice'] = list_price

    # Brand
    brand = pqhtml('meta[name="WT.z_pbrand"]').attr('content')
    if not brand:
        brand = area('.product-description span h1').text()
    detail['brand'] = brand

    # Name
    name = area('h1[itemprop="name"]').text()
    if not name:
        name = area('.product-description h1').text()
    detail['name'] = name

    # Images
    pictures = self.get_imgs(area)
    detail['imgs'] = pictures
    detail['img'] = pictures[0]

    # Product ID
    product_id = pcfg['productId']
    detail['productId'] = product_id

    # Color: page text when present, otherwise the default color.
    color = area('div.product-description h3:first').text()
    if not color:
        color = self.cfg.DEFAULT_ONE_COLOR
    detail['color'] = color
    detail['colorId'] = product_id

    # Returns / exchanges
    detail['returns'] = area('#prod-info-tab4').text()

    # Sizes
    detail['sizes'] = self.get_sizes(pcfg)

    # Description: product text, delivery tab and fit description joined.
    descr = area('div.product-description-text').text()
    descr = descr + area('#prod-info-tab2').text()
    descr = descr + area('#fit-description').text()
    detail['descr'] = descr

    # Delivery
    detail['delivery'] = area('#prod-info-tab2').text()

    # Size / fit notes
    detail['sizeFit'] = area('#fit-description').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Sale status
    detail['status'] = self.cfg.STATUS_SALE

    # Final URL of the response
    detail['backUrl'] = resp.url

    summary = dict(time=time.time(),
                   productId=detail['productId'],
                   name=detail['name'],
                   currency=detail['currency'],
                   price=detail['price'],
                   listPrice=detail['listPrice'],
                   url=url)
    self.logger.info(json.dumps(summary))

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Scrape one product page and return the packaged result.

    The page at ``url`` is fetched and passed through ``self.resp_verify``;
    when the body contains ``window.location.reload(true);`` the page is
    fetched once more.  If the body is still empty the whole sequence is
    retried, up to three attempts in total.  After the last attempt the
    method continues with whatever response it has.

    A 404 response, or ``Out of stock`` in ``.product-availability``, yields
    an off-shelf payload; any other non-200 status yields an error payload.
    Otherwise the product fields are collected and returned via
    ``tool.return_data(successful=True, ...)``.

    Exceptions raised while fetching or parsing propagate unchanged.
    """
    # The site sometimes answers with an empty body, so retry a bounded
    # number of times.  The previous recursive retry passed an unsupported
    # ``verify`` keyword to this method and therefore always raised TypeError.
    max_attempts = 3
    resp = None
    for _attempt in range(max_attempts):
        resp = self.session.get(url, verify=False)  # first load

        # Anti-crawler verification of the response.
        resp = self.resp_verify(resp)

        if 'window.location.reload(true);' in resp.text:
            resp = self.session.get(url, verify=False)  # second load

        if resp.text:
            break

    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def log_page():
        # Log the rejected page's title and URL before building the payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))

    def sold_out():
        log_page()
        payload = tool.get_off_shelf(code=status_code,
                                     message=self.cfg.SOLD_OUT,
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # A 404 is reported as an off-shelf product.
    if status_code == 404:
        return sold_out()

    # Every other non-200 status is reported as an error.
    if status_code != 200:
        log_page()
        payload = tool.get_error(code=status_code,
                                 message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                 backUrl=resp.url,
                                 html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Off-shelf marker in the availability text.
    if 'Out of stock' in pqhtml('.product-availability').text():
        return sold_out()

    # Preparation
    area = pqhtml('div#product-view')
    pdata = self.get_pdata(area)

    detail = dict()

    # Brand: the part of the heading before the first '-'.
    heading = area('.panel-a h1:first').text()
    detail['brand'] = heading.split('-')[0].strip()

    # Name
    detail['name'] = heading

    # Currency
    currency = pqhtml('meta[itemprop="priceCurrency"]').attr('content')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Product ID
    product_id = pdata['productId']
    detail['productId'] = product_id

    # Prices, with thousands separators removed.
    detail['price'] = pdata['basePrice'].replace(',', '')
    detail['listPrice'] = pdata['oldPrice'].replace(',', '')

    # Color: the add-to-cart button's variant, else the default color.
    detail['color'] = area('button#product-addtocart-button').attr(
        'data-variant') or self.cfg.DEFAULT_ONE_COLOR
    detail['colorId'] = product_id

    # Image set
    pictures = [
        node.attr('data-src')
        for node in area('div#mobile-carousel-images a>img').items()
    ]
    detail['img'] = pictures[0]
    detail['imgs'] = pictures

    # Sizes
    detail['sizes'] = self.get_sizes(area, pdata)

    # Description: toggle text plus the first description section of the
    # container's parent.
    detail['descr'] = area('div.tog-desc').text() + area.parent()(
        '.description-section:first').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Sale status
    detail['status'] = self.cfg.STATUS_SALE

    # Final URL of the response
    detail['backUrl'] = resp.url

    summary = dict(time=time.time(),
                   productId=detail['productId'],
                   name=detail['name'],
                   currency=detail['currency'],
                   price=detail['price'],
                   listPrice=detail['listPrice'],
                   url=url)
    self.logger.info(json.dumps(summary))

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Scrape one product page and return the packaged result.

    The page at ``url`` is fetched through ``self.session``.  A 404 response,
    or a page whose ``p.availability`` text lacks ``In stock``, yields an
    off-shelf payload; any other non-200 status yields an error payload.
    Otherwise the product ID is stored on ``self.productId``, the remaining
    fields are obtained from ``self.get_info`` and the ``.product-area``
    container, and the result is returned via
    ``tool.return_data(successful=True, ...)``.

    Exceptions raised while fetching or parsing propagate unchanged.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    pqhtml = PyQuery(resp.text or 'nothing')

    def log_page():
        # Log the rejected page's title and URL before building the payload.
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))

    def sold_out():
        log_page()
        payload = tool.get_off_shelf(code=status_code,
                                     message=self.cfg.SOLD_OUT,
                                     backUrl=resp.url,
                                     html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # A 404 is reported as an off-shelf product.
    if status_code == 404:
        return sold_out()

    # Every other non-200 status is reported as an error.
    if status_code != 200:
        log_page()
        payload = tool.get_error(code=status_code,
                                 message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                                 backUrl=resp.url,
                                 html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Preparation
    area = pqhtml('.product-area')
    domain = tool.get_domain(url)  # not used below; call retained unchanged

    # Anything without 'In stock' in the availability text is off-shelf.
    if 'In stock' not in area('p.availability').text():
        return sold_out()

    detail = dict()

    # Brand
    detail['brand'] = area('meta[itemprop="brand"]').attr('content')

    # Name
    detail['name'] = area('h1[itemprop="name"]').text()

    # Currency
    currency = area('meta[itemprop="priceCurrency"]').attr('content')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Product ID, also stored on the instance before get_info is called.
    product_id = area('input[name="prodId"]').attr('value')
    detail['productId'] = product_id
    self.productId = product_id

    # Remaining fields come from one helper call.
    color, price, list_price, img, imgs, sizes = self.get_info(area)

    # Keys exist only when colors come back as a mapping.
    if isinstance(color, dict):
        detail['keys'] = color.keys()

    # Prices
    detail['price'] = price
    detail['listPrice'] = list_price

    # Color: a string color uses the product ID; otherwise each key maps to
    # itself.
    detail['color'] = color
    if isinstance(color, basestring):
        detail['colorId'] = product_id
    else:
        detail['colorId'] = {key: key for key in color.keys()}

    # Image set
    detail['img'] = img
    detail['imgs'] = imgs

    # Sizes
    detail['sizes'] = sizes

    # Description
    detail['descr'] = area('.js-prodInfo-description').text()

    # Details
    detail['detail'] = area('.js-prodInfo-details').text()

    # Returns and delivery are both read from '.js-prodInfo-delivery'.
    detail['returns'] = area('.js-prodInfo-delivery').text()
    detail['delivery'] = area('.js-prodInfo-delivery').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Sale status
    detail['status'] = self.cfg.STATUS_SALE

    # Final URL of the response
    detail['backUrl'] = resp.url

    summary = dict(time=time.time(),
                   productId=detail['productId'],
                   name=detail['name'],
                   currency=detail['currency'],
                   price=detail['price'],
                   listPrice=detail['listPrice'],
                   url=url)
    self.logger.info(json.dumps(summary))

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Scrape one product page into a detail record.

    The page is requested through ``self.session``; status screening is
    delegated to ``self.is_ok_status_code``. Fields are read from the
    ``.product-detail-information`` element and the surrounding page.

    Args:
        url: Address of the product page.

    Returns:
        The second item from ``self.is_ok_status_code`` when screening
        fails, otherwise the value of
        ``tool.return_data(successful=True, data=...)``.
    """
    response = self.session.get(url, verify=False)
    http_status = response.status_code
    page = PyQuery(response.text or 'nothing')

    # Off-shelf / error screening: a falsy first item means the second item
    # is handed straight back to the caller.
    passed, rejection = self.is_ok_status_code(http_status, page, url,
                                               response)
    if not passed:
        return rejection

    # Preparation
    info = page('.product-detail-information')
    # The return values of these two calls are not used in this method.
    tool.get_domain(url)
    self.get_pdata(info)

    # Product ID (the same text fills ID, SKU and code); brand is None.
    sku_text = info('.product-detail-selection-sku').text()
    record = dict(productId=sku_text,
                  productSku=sku_text,
                  productCode=sku_text,
                  brand=None)

    # Name
    record['name'] = info('.J_title_name').text()

    # Currency: first whitespace-separated token of the selector's text.
    currency_code = page('a#select_currency').text().split()[0]
    record['currency'] = currency_code
    record['currencySymbol'] = tool.get_unit(currency_code)

    # Price
    record['price'], record['listPrice'] = self.get_all_price(info)

    # Description and details are read from the same tab.
    record['descr'] = info('#product-description-tab').text()
    record['detail'] = info('#product-description-tab').text()

    # Returns
    record['returns'] = info('.product-directions').text()

    # Colour: configured defaults.
    record['color'] = self.cfg.DEFAULT_ONE_COLOR
    record['colorId'] = self.cfg.DEFAULT_COLOR_SKU

    # Images
    gallery = []
    for anchor in page('.product-detail-preview .toolbar>li>a').items():
        gallery.append(anchor.attr('href'))
    record['img'] = gallery[0]
    record['imgs'] = gallery

    # Sizes
    record['sizes'] = self.get_sizes(info)

    # HTTP status code, sale status, back URL
    record['status_code'] = http_status
    record['status'] = self.cfg.STATUS_SALE
    record['backUrl'] = response.url

    # Returned IP and port, joined with ':' when a peer is present.
    peer = response.raw._original_response.peer
    if peer:
        record['ip_port'] = ':'.join(str(part) for part in peer)

    summary = dict(time=time.time(),
                   productId=record['productId'],
                   name=record['name'],
                   currency=record['currency'],
                   price=record['price'],
                   listPrice=record['listPrice'],
                   url=url)
    self.logger.info(json.dumps(summary))

    return tool.return_data(successful=True, data=record)
def detail(self, url):
    """Fetch a product page and build its detail record.

    Requests ``url``, passes the response through ``self.resVerify``, and
    rejects non-200 responses and pages whose ``pdata['availability']``
    markup has no ``.instock`` element. Remaining fields come from
    ``self.get_pdata`` and the parsed page.

    Args:
        url: Address of the product page.

    Returns:
        The value of ``tool.return_data``: called with ``successful=False``
        and an off-shelf/error payload for rejected pages, or with
        ``successful=True`` and the detail dict.

    Exceptions raised while fetching or parsing propagate unchanged.
    """
    resp = self.session.get(url, verify=False)

    # These two assignments prepare for the verification step below;
    # do not move them.
    self.domain = tool.get_domain(url)
    self.url = url

    # Verify the response; the value returned replaces the original.
    resp = self.resVerify(resp)

    status_code = resp.status_code
    # Fall back to a placeholder string when the response body is empty.
    pqhtml = PyQuery(resp.text or 'nothing')

    def reject(sold_out):
        """Log the page, then wrap an off-shelf or error payload."""
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))
        if sold_out:
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
        else:
            data = tool.get_error(
                code=status_code,
                message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                backUrl=resp.url,
                html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # 404 is reported as off-shelf, any other non-200 status as an error.
    if status_code != 200:
        return reject(sold_out=(status_code == 404))

    # Preparation
    pdata = self.get_pdata(pqhtml)

    # Off-shelf: the availability markup must contain an '.instock' node.
    if not len(PyQuery(pdata['availability'])('.instock')):
        return reject(sold_out=True)

    detail = dict()

    # Brand
    brand = pdata['brand']
    detail['brand'] = brand

    # Name: brand followed by the title.
    detail['name'] = brand + ' ' + pdata['title']

    # Currency unit
    currency = pqhtml('meta[itemprop="priceCurrency"]').attr('content')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Price
    price, listPrice = self.get_all_price(pdata)
    detail['price'] = price
    detail['listPrice'] = listPrice

    # Description
    detail['descr'] = pqhtml('div.product-description').text()

    # Colour: configured default name, SKU as the ID.
    detail['color'] = self.cfg.DEFAULT_ONE_COLOR
    detail['colorId'] = pdata['sku']

    # Images
    detail['img'] = pdata['image']
    detail['imgs'] = pdata['images']

    # Sizes: one entry built from configured defaults.
    detail['sizes'] = [
        dict(name=self.cfg.DEFAULT_ONE_SIZE,
             inventory=self.cfg.DEFAULT_STOCK_NUMBER,
             sku=pdata['sku'])
    ]

    # Product ID
    detail['productId'] = pdata['sku']

    # Returns
    detail['returns'] = pqhtml('dd#tab-container-guarantee').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Back URL: the requested URL, not resp.url.
    detail['backUrl'] = url

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Fetch a product page and build its detail record.

    Requests ``url``, rejects non-200 responses, then fills the record from
    the ``#container`` element and from the data ``self.get_pdata`` derives
    from the page's script text.

    Args:
        url: Address of the product page.

    Returns:
        The value of ``tool.return_data``: called with ``successful=False``
        and an off-shelf/error payload for non-200 responses, or with
        ``successful=True`` and the detail dict.

    Exceptions raised while fetching or parsing propagate unchanged.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    # Fall back to a placeholder string when the response body is empty.
    pqhtml = PyQuery(resp.text or 'nothing')

    # 404 is reported as off-shelf, any other non-200 status as an error.
    if status_code != 200:
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))
        if status_code == 404:
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
        else:
            data = tool.get_error(
                code=status_code,
                message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                backUrl=resp.url,
                html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Preparation
    Jtxt = pqhtml('script').text()
    area = pqhtml('#container')
    pdata = self.get_pdata(Jtxt)
    domain = tool.get_domain(url)
    product = pdata['product']

    detail = dict()

    # Images: each href is prefixed with the domain.
    imgsTmp = [
        domain + a.attr('href')
        for a in area('form#addToCart ul.alt_imgs:first>li>a').items()
    ]
    detail['img'] = imgsTmp[0]
    detail['imgs'] = imgsTmp

    # Name
    detail['name'] = product['name']

    # Brand
    detail['brand'] = area('form#addToCart a#sameBrandProduct').text()

    # Price
    detail['price'] = product['unit_sale_price']
    detail['listPrice'] = product['unit_price']

    # Currency and its symbol
    currency = product['currency']
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Product ID
    productId = product['id']
    detail['productId'] = productId

    # Colour: configured default name, product ID as the colour ID.
    detail['color'] = self.cfg.DEFAULT_ONE_COLOR
    detail['colorId'] = productId

    # Sizes: one entry carrying the product's stock value.
    detail['sizes'] = [
        dict(name=self.cfg.DEFAULT_ONE_SIZE,
             inventory=product['stock'],
             sku=productId)
    ]

    # Description: short description plus the first info tab.
    detail['descr'] = area('.prod_desc').text() + ' ' + area(
        'div#info_tabs>div.wrap>div#tab1_info').text()

    # Details
    detail['detail'] = area('#tab1_info').text()

    # Brand description
    detail['brandDescr'] = area('#tab2_info').text()

    # Warranty
    detail['note'] = area('#tab5_info').text()

    # Delivery
    detail['delivery'] = area('#shippingData').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Back URL: the requested URL, not resp.url.
    detail['backUrl'] = url

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Fetch a product page and build its detail record.

    Requests ``url``, rejects non-200 responses and pages whose ``font``
    elements contain 'SOLD OUT', then fills the record from the
    ``#ProductDetails`` element and from ``self.get_pdata``.

    Args:
        url: Address of the product page.

    Returns:
        The value of ``tool.return_data``: called with ``successful=False``
        and an off-shelf/error payload for rejected pages, or with
        ``successful=True`` and the detail dict.

    Exceptions raised while fetching or parsing propagate unchanged.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    # Fall back to a placeholder string when the response body is empty.
    pqhtml = PyQuery(resp.text or 'nothing')

    def reject(sold_out):
        """Log the page, then wrap an off-shelf or error payload."""
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))
        if sold_out:
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
        else:
            data = tool.get_error(
                code=status_code,
                message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                backUrl=resp.url,
                html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # 404 is reported as off-shelf, any other non-200 status as an error.
    if status_code != 200:
        return reject(sold_out=(status_code == 404))

    # Preparation (selector added 2016-12-15)
    area = pqhtml('form#productForm #ProductDetailPage #ProductDetails')
    pdata = self.get_pdata(pqhtml)

    # Off-shelf
    if 'SOLD OUT' in pqhtml('font').text():
        return reject(sold_out=True)

    detail = dict()

    # Brand
    detail['brand'] = pqhtml('input[id="Brand"]').attr('value')

    # Name
    detail['name'] = area('#DetailsHeading').text()

    # Price and currency come back from one call.
    currency, price, listPrice = self.get_all_price(area)
    detail['price'] = price
    detail['listPrice'] = listPrice
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Product ID: the ChildID value, which is also used as the colour ID
    # (the MasterID input holds the parent product ID).
    productId = area('input[name="ChildID"]').attr('value')
    detail['productId'] = productId

    # Images
    imgs = self.get_imgs(productId, pdata)
    detail['img'] = imgs[0]
    detail['imgs'] = imgs

    # Colour
    # NOTE(review): the result of get_color() is not used; the record
    # always carries the configured default colour. The call is kept in
    # case it has side effects. Confirm and either use it or drop it.
    self.get_color(productId, pdata)
    detail['color'] = self.cfg.DEFAULT_ONE_COLOR
    detail['colorId'] = productId

    # Sizes
    detail['sizes'] = self.get_sizes(productId, pdata)

    # Description
    detail['descr'] = area('#Description').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Back URL
    detail['backUrl'] = resp.url

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Fetch a product page and build its detail record.

    Requests ``url``, rejects non-200 responses, then fills the record from
    the ``.product-single-section-main`` element and from the data
    ``self.get_pdata`` derives from the page's ``head``.

    Args:
        url: Address of the product page.

    Returns:
        The value of ``tool.return_data``: called with ``successful=False``
        and an off-shelf/error payload for non-200 responses, or with
        ``successful=True`` and the detail dict.

    Exceptions raised while fetching or parsing propagate unchanged.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    # Fall back to a placeholder string when the response body is empty.
    pqhtml = PyQuery(resp.text or 'nothing')

    # 404 is reported as off-shelf, any other non-200 status as an error.
    if status_code != 200:
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))
        if status_code == 404:
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
        else:
            data = tool.get_error(
                code=status_code,
                message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                backUrl=resp.url,
                html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # Preparation
    area = pqhtml('.product-single-section-main')
    pdata = self.get_pdata(pqhtml('head'))

    detail = dict()

    # Brand
    detail['brand'] = pdata['product']['vendor']

    # Name
    detail['name'] = area('h1[itemprop="name"]').text()

    # Currency
    currency = pqhtml('meta[itemprop="priceCurrency"]').attr('content')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Price
    price, listPrice = self.get_all_price(area)
    detail['price'] = price
    detail['listPrice'] = listPrice

    # Colour: configured default name, product ID as the colour ID.
    detail['color'] = self.cfg.DEFAULT_ONE_COLOR
    detail['colorId'] = pdata['product']['id']

    # Images: each src is prefixed with 'http:'.
    imgs = [
        'http:' + img.attr('src')
        for img in area('.super-slider-main img').items()
    ]
    detail['img'] = imgs[0]
    detail['imgs'] = imgs

    # Product ID
    detail['productId'] = pdata['product']['id']

    # Sizes
    detail['sizes'] = self.get_sizes(pdata, area)

    # Description
    detail['descr'] = area('.product-single-details-dropdown').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Back URL
    detail['backUrl'] = resp.url

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Fetch a product page and build its detail record.

    Requests ``url``, rejects non-200 responses and pages that are not
    marked 'InStock' or whose availability text contains 'sold out', then
    fills the record from the ``.product-area`` element and the object
    returned by ``self.get_siteObj``.

    Args:
        url: Address of the product page.

    Returns:
        The value of ``tool.return_data``: called with ``successful=False``
        and an off-shelf/error payload for rejected pages, or with
        ``successful=True`` and the detail dict.

    Exceptions raised while fetching or parsing propagate unchanged.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    # Fall back to a placeholder string when the response body is empty.
    pqhtml = PyQuery(resp.text or 'nothing')

    def reject(sold_out):
        """Log the page, then wrap an off-shelf or error payload."""
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))
        if sold_out:
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
        else:
            data = tool.get_error(
                code=status_code,
                message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                backUrl=resp.url,
                html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # 404 is reported as off-shelf, any other non-200 status as an error.
    if status_code != 200:
        return reject(sold_out=(status_code == 404))

    # Preparation
    area = pqhtml('.body-wrap .primary-wrap .product-area')
    siteObj = self.get_siteObj(pqhtml)

    # Off-shelf: the availability meta must be exactly 'InStock' and the
    # visible availability text must not mention 'sold out'.
    availability = area('meta[itemprop="availability"]').attr('content')
    if ('InStock' != availability
            or 'sold out' in area('.availability').text().lower()):
        return reject(sold_out=True)

    detail = dict()

    # Product ID: hidden 'buy' input first, site object as the fallback.
    productId = area(
        'input[class="buy"][name="buy"][type="hidden"]').attr(
            'value') or self.get_product_id(siteObj)
    detail['productId'] = productId

    # Brand
    detail['brand'] = self.get_brand(siteObj)

    # Name
    detail['name'] = area('.product-title').text()

    # Currency
    currency = pqhtml('meta[itemprop="priceCurrency"]').attr('content')
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Price
    price, listPrice = self.get_all_price(area)
    detail['price'] = price
    detail['listPrice'] = listPrice

    # Colour, image and size information
    if area('.variation-dropdowns'):
        img, imgs, color, sizes = self.get_color_img_size(area, productId)
        detail['keys'] = color.keys()
    else:
        # No variation dropdowns: a single default colour and size.
        img = area('.main-product-image a').attr('href')
        imgs = [
            li_a.attr('href').strip().replace('/300/300/', '/600/600/')
            for li_a in area('ul.product-thumbnails li a').items()
        ]
        color = self.cfg.DEFAULT_ONE_COLOR
        sizes = [
            dict(name=self.cfg.DEFAULT_ONE_SIZE,
                 id=productId,
                 sku=productId,
                 inventory=self.cfg.DEFAULT_STOCK_NUMBER)
        ]

    # Colour: a dict maps each key to itself, anything else maps to the
    # product ID.
    detail['color'] = color
    if isinstance(color, dict):
        detail['colorId'] = {cid: cid for cid in color.keys()}
    else:
        detail['colorId'] = productId

    # Images
    detail['img'] = img
    detail['imgs'] = imgs

    # Sizes
    detail['sizes'] = sizes

    # Description: the description block followed by the extra details.
    detail['descr'] = area('div[itemprop="description"]').text() + area(
        'div.product-more-details').text()

    # Details
    detail['detail'] = area('div.product-more-details').text()

    # Returns
    detail['returns'] = area('.product-delivery-returns').text()

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Back URL
    detail['backUrl'] = resp.url

    # Returned IP and port, joined with ':' when a peer is present.
    peer = resp.raw._original_response.peer
    if peer:
        detail['ip_port'] = ':'.join(map(str, peer))

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Fetch a product page and build its detail record.

    Requests ``url``, rejects non-200 responses and pages with no
    ``.product-detail`` element or an 'out of stock' marker, then fills the
    record mostly from ``self.get_pdata(...)['product']``.

    Args:
        url: Address of the product page.

    Returns:
        The value of ``tool.return_data``: called with ``successful=False``
        and an off-shelf/error payload for rejected pages, or with
        ``successful=True`` and the detail dict.

    Exceptions raised while fetching or parsing propagate unchanged.
    """
    resp = self.session.get(url, verify=False)
    status_code = resp.status_code
    # Fall back to a placeholder string when the response body is empty.
    pqhtml = PyQuery(resp.text or 'nothing')

    def reject(sold_out):
        """Log the page, then wrap an off-shelf or error payload."""
        self.logger.info(json.dumps(
            dict(time=time.time(), title=pqhtml('title').text(), url=url)))
        if sold_out:
            data = tool.get_off_shelf(code=status_code,
                                      message=self.cfg.SOLD_OUT,
                                      backUrl=resp.url,
                                      html=pqhtml.outerHtml())
        else:
            data = tool.get_error(
                code=status_code,
                message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                backUrl=resp.url,
                html=pqhtml.outerHtml())
        return tool.return_data(successful=False, data=data)

    # 404 is reported as off-shelf, any other non-200 status as an error.
    if status_code != 200:
        return reject(sold_out=(status_code == 404))

    # Preparation
    area = pqhtml('.product-detail')
    detail_tab = pqhtml('#product-detail-tabs')
    img_tab = pqhtml('div.images')
    pdata = self.get_pdata(pqhtml)

    # Off-shelf: no product area, or an explicit out-of-stock marker.
    if not area or 'out of stock' in area('.out-of-stock').text():
        return reject(sold_out=True)

    video_prefix = 'http://image1.superdry.com/static/images/products/'
    product = pdata['product']

    detail = dict()

    detail['stock'] = product['stock']  # Total stock for this product.
    detail['video'] = video_prefix + product['video']
    detail['gender'] = product['gender']
    detail['season'] = product['season']
    detail['category'] = product['category']
    detail['productSku'] = product['sku_code']
    detail['size_guide'] = product['size_guide']
    detail['subcategory'] = product['subcategory']
    detail['productCode'] = product['sku_code']

    # Product ID
    detail['productId'] = product['id']

    # Brand
    detail['brand'] = 'SUPERDRY'

    # Name
    detail['name'] = product['name']

    # Currency
    currency = product['currency']
    detail['currency'] = currency
    detail['currencySymbol'] = tool.get_unit(currency)

    # Price
    detail['price'] = product['unit_sale_price']
    detail['listPrice'] = product['unit_price']

    # Description
    detail['descr'] = product['description']

    # Details
    detail['detail'] = detail_tab.text()

    # Returns
    # NOTE(review): 'tab-page:last' is an element selector, not a class
    # selector; confirm this matches the intended tab.
    detail['returns'] = detail_tab('tab-page:last').text()

    # Colour: the same value serves as name and ID.
    detail['color'] = product['color']
    detail['colorId'] = product['color']

    # Images: thumbnail URLs rewritten to their zoom variants.
    detail['img'] = img_tab('.scroller li img:first').attr('src').replace(
        '/productthumbs/', '/zoom/')
    detail['imgs'] = [
        ele.attr('src').replace('/productthumbs/', '/zoom/')
        for ele in img_tab('.scroller li img').items()
    ]

    # Sizes
    detail['sizes'] = self.get_sizes(area)

    # HTTP status code
    detail['status_code'] = status_code

    # Status
    detail['status'] = self.cfg.STATUS_SALE

    # Back URL
    detail['backUrl'] = resp.url

    # Returned IP and port, joined with ':' when a peer is present.
    peer = resp.raw._original_response.peer
    if peer:
        detail['ip_port'] = ':'.join(map(str, peer))

    log_info = json.dumps(
        dict(time=time.time(),
             productId=detail['productId'],
             name=detail['name'],
             currency=detail['currency'],
             price=detail['price'],
             listPrice=detail['listPrice'],
             url=url))
    self.logger.info(log_info)

    return tool.return_data(successful=True, data=detail)
def detail(self, url):
    """Scrape one product page into a detail record.

    The page is requested through ``self.session``. Non-200 responses are
    logged and returned as failures; otherwise the record is assembled by
    the instance's ``get*ByHtml``/``get_color`` helpers and direct page
    queries.

    Args:
        url: Address of the product page.

    Returns:
        The value of ``tool.return_data``: called with ``successful=False``
        and an off-shelf/error payload for non-200 responses, or with
        ``successful=True`` and the detail dict.
    """
    response = self.session.get(url, verify=False)
    http_status = response.status_code
    page = PyQuery(response.text or 'nothing')

    # 404 is reported as off-shelf, any other non-200 status as an error.
    if http_status != 200:
        self.logger.info(json.dumps(
            dict(time=time.time(), title=page('title').text(), url=url)))
        if http_status == 404:
            payload = tool.get_off_shelf(code=http_status,
                                         message=self.cfg.SOLD_OUT,
                                         backUrl=response.url,
                                         html=page.outerHtml())
        else:
            payload = tool.get_error(
                code=http_status,
                message=self.cfg.GET_ERR.get('SCERR', 'ERROR'),
                backUrl=response.url,
                html=page.outerHtml())
        return tool.return_data(successful=False, data=payload)

    # Preparation
    self.domain = tool.get_domain(url)

    record = dict()

    # Brand: stripped helper result, 'MYGEEK' when that is empty.
    brand_name = self.getBrandByHtml(page).strip()
    record['brand'] = brand_name or 'MYGEEK'

    # Name
    record['name'] = page('span.title strong').text().strip()

    # Currency
    record['currency'] = 'CNY'
    record['currencySymbol'] = tool.get_unit('CNY')

    # Price
    record['price'], record['listPrice'] = self.getPriceByHtml(page)

    # Colour: a dict maps each key to itself and also exposes its keys;
    # anything else gets the configured default colour SKU.
    palette = self.get_color(page)
    record['color'] = palette
    if isinstance(palette, dict):
        record['colorId'] = {key: key for key in palette.keys()}
        record['keys'] = palette.keys()
    else:
        record['colorId'] = self.cfg.DEFAULT_COLOR_SKU

    # Images
    gallery = self.getImgsByHtml(page)
    record['img'] = gallery[0]
    record['imgs'] = gallery

    # Product ID: digits after 'id=' in the first form's action.
    form_action = page('div.pid5 form:first').attr('action')
    record['productId'] = re.search(r'id=(\d*)', form_action).groups()[0]

    # Sizes
    record['sizes'] = self.getSizesByHtml(page)

    # Description: two sections with titles and scripts removed, joined.
    first_part = page('#pid1_2').remove('.title').remove('script').text()
    second_part = page('.pid2').remove('.title').remove('script').text()
    record['descr'] = first_part + second_part

    # HTTP status code, sale status, back URL
    record['status_code'] = http_status
    record['status'] = self.cfg.STATUS_SALE
    record['backUrl'] = response.url

    summary = dict(time=time.time(),
                   productId=record['productId'],
                   name=record['name'],
                   currency=record['currency'],
                   price=record['price'],
                   listPrice=record['listPrice'],
                   url=url)
    self.logger.info(json.dumps(summary))

    return tool.return_data(successful=True, data=record)