def getProductList(self, pageUrl):
    """Scrape one product-list page.

    Returns a dict with 'urls' (absolute product URLs found on the page)
    and 'next' (absolute URL of the next page, or None on the last page).
    """
    product_list_page = getHtmlAsSoup(pageUrl)
    # Try to locate the "next page" link ('下一页' = "next page").
    nextPage = product_list_page.find('a', class_='page_curl_btn', text='下一页')
    # BUGFIX: hasattr() on a bs4 Tag always succeeds (attribute access
    # falls back to a child-tag search, returning None instead of
    # raising), so the original check could index a missing 'href'.
    # Tag.has_attr() is the correct test for an HTML attribute.
    if nextPage is not None and nextPage.has_attr('href'):
        nextPage = 'http://item.grainger.cn/' + nextPage['href']
    else:
        nextPage = None
    # Collect the per-product tiles and build absolute product URLs.
    productTags = product_list_page.find_all('div', class_='product_grid_image img')
    urls = ['http://item.grainger.cn' + div.a['href'] for div in productTags]
    return {'urls': urls, 'next': nextPage}
def getCate4(self, name3, url3):
    """Fetch the level-4 sub-categories under a level-3 category page.

    Returns a list of {'name', 'url'} dicts. When the page exposes no
    level-4 links, falls back to a single entry for the level-3
    category itself (name3/url3).
    """
    soupCate4 = getHtmlAsSoup(url3)
    # Level-4 hrefs look like <tail of the level-3 URL> plus two digits.
    # BUGFIX: the URL fragment is escaped so regex metacharacters in it
    # (e.g. '.') match literally, and the digit pattern is a raw string
    # so '\d' is not an invalid escape sequence.
    cateLi4 = soupCate4.find_all('a', href=re.compile(re.escape(url3[-11:]) + r"\d\d$"))
    cate4 = []
    if cateLi4:
        for ct4 in cateLi4:
            cate4.append({'name': ct4.string, 'url': self.fullUrl(ct4['href'])})
    else:
        # No deeper level: treat the level-3 category itself as a leaf.
        cate4.append({'name': name3, 'url': url3})
    return cate4
def getProductList(self, pageUrl):
    """Scrape one product-list page.

    Returns a dict with 'urls' (relative product URLs on the page) and
    'next' (absolute URL of the next page, or None on the last page).
    """
    product_list_page = getHtmlAsSoup(pageUrl)
    # Look for the "next page" link ('下一页') inside the pagination bar.
    nextPage = product_list_page.find('div', class_='m-pagination').find(
        'a', text='下一页')
    if nextPage is not None and len(nextPage) != 0:
        nextPage = self.fullUrl(nextPage['href'])
    else:
        nextPage = None
    # Product links have the shape /product/<digits>.
    # BUGFIX: raw string so '\d' is not an invalid escape sequence.
    productTags = product_list_page.find_all(
        'a', class_='pic', href=re.compile(r'^/product/\d+$'))
    urls = [a['href'] for a in productTags]
    return {'urls': urls, 'next': nextPage}
def getProductOne(self, url):
    """Scrape one SPU (multi-SKU product) detail page into a Product record.

    Raises via raiseIf() when the URL does not look like an SPU URL
    (SPU paths contain a 'g' segment after the host).
    """
    raiseIf(url.replace('item.grainger.cn', '').find('g') <= 0, '传入的URL不属于SPU')
    soup = getHtmlAsSoup(url)
    # The product id is the second-to-last path segment of the URL.
    productId = url.split('/')[-2:-1][0]
    productName = soup.find('div', id='product-intro').find('h1').string
    productCode = productId
    # Breadcrumb: skip the leading "home" link, join names with '>'.
    categoryPathTag = soup.find('div', class_='node_path').find_all('a')[1:]
    categoryPath = ''
    for cpt in categoryPathTag:
        categoryPath += cpt.string.strip() + '>'
    categoryPath += productName
    # Direct parent category = second-to-last element of the path.
    categoryName = categoryPath.split('>')[-2:-1][0]
    productInrtoTag = soup.find('div', id='product-intro').find('ul', id='summary').find_all('li')
    price = ''
    unit_name = ''
    markedPrice = ''
    buyNo = ''
    brandName = ''
    brandUrl = ''
    brandImg = ''
    model = '分SKU'  # SPU pages carry per-SKU models; see getSkuOne
    # Summary rows: dispatch on the row's <div> label text.
    for mainInfo in productInrtoTag:
        label = mainInfo.find('div').string
        if label == '价 格:':  # price row
            price = mainInfo.find('strong', class_='p-price').string[1:]
        if label == '品 牌:':  # brand row
            brandLink = mainInfo.find('a')
            brandName = brandLink.string
            # BUGFIX: hasattr() on a bs4 Tag always succeeds (attribute
            # access falls back to a child-tag search); has_attr() is
            # the correct test for an HTML attribute.
            if brandLink.has_attr('href'):
                brandUrl = "http://item.grainger.cn/" + brandLink['href'][1:]
    productDetailTag = soup.find(id='content_product')
    descriptionTag = productDetailTag.find('div', class_='property')
    description = str(descriptionTag).replace('<br/>', '')
    # Specification list: each <div> holds one "name:value" pair.
    specTag = soup.find('ul', class_='specifications')
    specInfo = []
    if specTag is not None:
        for div in specTag.find_all('div'):
            specPair = div.string.split(':')
            if len(specPair) == 2:
                dbSpec = ProductSpec(
                    {'product_code': productCode, 'product_id': productId,
                     'spec_name': str(specPair[0]).strip(),
                     'spec_value': str(specPair[1]).strip()})
                specInfo.append(dbSpec)
    # Main (gallery) images.
    mainImage = []
    mainImageTags = soup.find('ul', class_='lh imageThumb')
    if mainImageTags is not None:
        for mtag in mainImageTags.find_all('a'):
            # The large-image URL sits in the 5th token of the rel
            # attribute; the slices strip its surrounding punctuation.
            mainUrl = mtag['rel'][4][1:][:-2]
            dbImage = ProductImage({'product_code': productCode, 'product_id': productId,
                                    'image_url': mainUrl, 'type': '2'})
            mainImage.append(dbImage)
    # Detail (description) images, lazy-loaded via data-original.
    imageInfo = []
    detailImageTags = productDetailTag.find('div', class_='group-picture')
    if detailImageTags is not None:
        for imageTag in detailImageTags.find_all('img'):
            imageUrl = imageTag['data-original']
            dbImage = ProductImage({'product_code': productCode, 'product_id': productId,
                                    'image_url': imageUrl, 'type': '1'})
            imageInfo.append(dbImage)
    # SKU table: one <tr> per SKU variant.
    skuInfos = []
    skuTags = soup.find(id='pd_table').tbody
    if skuTags is not None:
        for skuTr in skuTags.find_all('tr'):
            skuTag = skuTr.find('a', target='_blank')
            # hrefs are protocol-relative ('//...'); rebuild with http://
            skuUrl = 'http://' + skuTag['href'][2:]
            skuModel = skuTag.string
            stopSaleTag = skuTr.find('span', class_='iconOutOfOrder')
            stopSale = ''
            if stopSaleTag is not None:
                stopSale = '停止销售'
            alternativeProductUrl = ''
            alternativeTag = skuTr.find('td', class_='alternative')
            if alternativeTag is not None and alternativeTag.a is not None:
                alternativeProductUrl = alternativeTag.a['href'][2:]
            dbProductSku = ProductSku({'product_code': productCode, 'product_id': productId,
                                       'product_model': skuModel, 'model_url': skuUrl,
                                       'remark': stopSale,
                                       'can_replace': alternativeProductUrl,
                                       'info_saved': '0'})
            skuInfos.append(dbProductSku)
    # NOTE(review): unlike getSkuOne, no 'mark' key is set here — confirm
    # whether SPU records should also carry self.mark.
    dbProduct = Product({'category_path': categoryPath, 'product_id': productId,
                         'product_code': productCode, 'product_url': url,
                         'product_name': productName, 'price': price, 'model': model,
                         'description': description, 'buy_code': buyNo,
                         'brand_name': brandName, 'brand_img': brandImg,
                         'brand_url': brandUrl, 'unit_name': unit_name,
                         'market_price': markedPrice, 'image_saved': '0',
                         'product_type': 'SPU', 'category_name': categoryName,
                         'main_img': mainImage, 'detail_img': imageInfo,
                         'specs': specInfo, 'skus': skuInfos, 'comments': []})
    return dbProduct
def getSkuOne(self, skuUrl):
    """Scrape one SKU detail page into a Product record (product_type 'SKU').

    Raises via raiseIf() when the URL does not look like a SKU URL
    (SKU paths contain a 'u' segment after the host).
    """
    raiseIf(skuUrl.replace('item.grainger.cn', '').find('u') <= 0, '传入的URL不属于SKU')
    soup = getHtmlAsSoup(skuUrl)
    # Breadcrumb: skip the leading "home" link, join names with '>',
    # then drop the trailing separator.
    categoryPathTag = soup.find('div', class_='node_path').find_all('a')
    categoryPath = ''
    for cpt in categoryPathTag[1:]:
        categoryPath += cpt.string.strip() + '>'
    categoryPath = categoryPath[:-1]
    # Product id = second-to-last path segment of the URL.
    productId = skuUrl.split('/')[-2:-1][0]
    # Direct parent category = second-to-last element of the path.
    categoryName = categoryPath.split('>')[-2:-1][0]
    productName = soup.find('div', id='product-intro').find('h1').string
    productCode = productId
    productInrtoTag = soup.find('div', id='product-intro').find('div', class_='line').find_all('dl')
    price = ''
    unit_name = ''
    markedPrice = ''
    buyNo = ''
    brandName = ''
    brandUrl = ''
    brandImg = ''
    model = ''
    # Each <dl> is a labelled row; dispatch on its <dt> label text.
    # The labels are mutually exclusive, so the original's repeated
    # find('dt') lookups are hoisted and the ifs chained with elif.
    for mainInfo in productInrtoTag:
        label = mainInfo.find('dt').string
        if label == '价 格':  # price (amount + unit)
            priceTag = mainInfo.find('span', class_='p-price')
            price = priceTag.contents[0][1:]
            unit_name = priceTag.contents[1].string[1:]
        elif label == '面 价':  # marked (list) price
            markedPrice = mainInfo.find('dd', class_='p-price-del').string[1:]
        elif label == '订 货 号':  # order number
            buyNo = mainInfo.find('span').string
        elif label == '品 牌':  # brand
            brandLink = mainInfo.find('a')
            brandName = brandLink.string
            # NOTE(review): for a protocol-relative '//...' href this
            # yields 'http:/...' (single slash) — confirm against the
            # live markup; getProductOne uses 'http://' + href[2:].
            brandUrl = "http:" + brandLink['href'][1:]
        elif label == '制造商型号':  # manufacturer model
            model = mainInfo.find('dd').string
    productDetailTag = soup.find(id='content_product')
    descriptionTag = productDetailTag.find('div', class_='property')
    description = str(descriptionTag).replace('<br/>', '')
    # Specification list: each <div> holds one "name:value" pair.
    specTag = soup.find('ul', class_='specifications')
    specInfo = []
    if specTag is not None:
        for div in specTag.find_all('div'):
            specPair = div.string.split(':')
            if len(specPair) == 2:
                dbSpec = ProductSpec(
                    {'product_code': productCode, 'product_id': productId,
                     'spec_name': str(specPair[0]).strip(),
                     'spec_value': str(specPair[1]).strip()})
                specInfo.append(dbSpec)
    # Main image (single, taken from the zoom anchor).
    mainImage = []
    mainImageTag = soup.find('div', id='spec-n1')
    if mainImageTag is not None:
        mainImageTag = mainImageTag.find('a')
        dbImage = ProductImage({'product_code': productCode, 'product_id': productId,
                                'image_url': mainImageTag['href'], 'type': '2'})
        mainImage.append(dbImage)
    # Detail images, lazy-loaded via data-original.
    imageInfo = []
    detailImageTags = productDetailTag.find('div', class_='group-picture')
    if detailImageTags is not None:
        for imageTag in detailImageTags.find_all('img'):
            url = imageTag['data-original']
            dbImage = ProductImage(
                {'product_code': productCode, 'product_id': productId,
                 'image_url': url, 'type': '1'})
            imageInfo.append(dbImage)
    dbSkuProduct = Product({'mark': self.mark, 'category_path': categoryPath,
                            'product_id': productId, 'product_code': productCode,
                            'product_url': skuUrl, 'product_name': productName,
                            'price': price, 'model': model, 'description': description,
                            'buy_code': buyNo, 'brand_name': brandName,
                            'brand_img': brandImg, 'brand_url': brandUrl,
                            'unit_name': unit_name, 'market_price': markedPrice,
                            'image_saved': '0', 'product_type': 'SKU',
                            'category_name': categoryName, 'main_img': mainImage,
                            'detail_img': imageInfo, 'specs': specInfo,
                            'skus': [], 'comments': []})
    return dbSkuProduct
def getMainPage(self):
    """Fetch the crawler's entry URL and cache its parsed soup on self.mainPage."""
    soup = getHtmlAsSoup(self.url)
    self.mainPage = soup
def processOneProduct(self, productUrl):
    """Scrape one product detail page into a plain-dict product record."""
    detailSoup = getHtmlAsSoup(productUrl)
    # Breadcrumb container: concatenate the text of its children.
    categoryPathTag = detailSoup.find('div', class_='g-wrapper brand-menu-text')
    categoryPath = ''
    for cpt in categoryPathTag:
        categoryPath += cpt.string.strip()
    # Product id = everything after 'product/' in the URL.
    productId = productUrl[productUrl.find('product/') + 8:]
    # Direct parent category = second-to-last element of the '>'-joined
    # path. NOTE(review): relies on the breadcrumb text itself containing
    # '>' separators — confirm against the live markup.
    categoryName = categoryPath.split('>')[-2:-1][0]
    # Name / code.
    productName = detailSoup.find(
        'h1', class_='detail-goods-right-head ft22').string.strip()
    productCode = detailSoup.find('font', class_='J_goodNo').string.strip()
    # Price (strip the leading currency symbol).
    price = detailSoup.find('label', class_='ft24 a weight J_salePrice').text[1:]
    # Order number.
    buyNo = detailSoup.find('label', class_='J_buyNo d').string.strip()
    # Model.
    model = detailSoup.find('label', class_='J_model d').string.strip()
    # Brand info (hoisted: the original looked the <a>/<img> up repeatedly).
    brandInfo = detailSoup.find('div', class_='detail-goods-brand')
    brandLink = brandInfo.find('a')
    brandImgTag = brandLink.find('img')
    brandName = brandImgTag['title']
    brandImg = brandImgTag['src']
    brandUrl = brandLink['href']
    # Specification table: each row carries up to two key/value pairs in
    # columns (0,1) and (2,3); blank keys are skipped.
    specTag = detailSoup.find(
        'table', class_='detail-attrs-right-attrs fl J_attrs').find_all('tr')
    specInfo = []
    for row in specTag:
        cells = row.find_all('td')
        leftKey = cells[0].text.strip()
        if leftKey:
            specInfo.append({'key': leftKey, 'value': cells[1].text.strip()})
        rightKey = cells[2].text.strip()
        if rightKey:
            specInfo.append({'key': rightKey, 'value': cells[3].text.strip()})
    # Detail-description images.
    imgTags = detailSoup.find(
        'div', class_='detail-attrs-right-body J_body').find_all('img')
    imgUrls = [imgtag['src'] for imgtag in imgTags]
    # Thumbnail images.
    smallImgTag = detailSoup.find(
        'div', class_='detail-goods-left-do-box').find_all('img')
    smallImgUrls = [smImgTg['src'] for smImgTg in smallImgTag]
    productInfo = {
        'id': productId,
        'categoryPath': categoryPath,
        'code': productCode,
        'url': productUrl,
        'name': productName,
        'price': price,
        'model': model,
        'buyCode': buyNo,
        'brandname': brandName,
        'brandimg': brandImg,
        'brandurl': brandUrl,
        'spec': specInfo,
        'detail': imgUrls,
        'small_img': smallImgUrls,
        'category_name': categoryName
    }
    return productInfo
def loadMainPage(self):
    """Fetch the crawler's entry URL and cache its parsed soup on self.mainPage."""
    soup = tool.getHtmlAsSoup(self.url)
    self.mainPage = soup