def getProductList(self, pageUrl):
    """Scrape one product-list page.

    Returns a dict with 'urls' (absolute product URLs found on the page)
    and 'next' (absolute URL of the next page, or None on the last page).
    """
    product_list_page = getHtmlAsSoup(pageUrl)
    # Try to locate the "next page" link ('下一页' = "next page").
    nextPage = product_list_page.find('a', class_='page_curl_btn', text='下一页')
    # BUGFIX: hasattr() on a bs4 Tag always succeeds (attribute access
    # falls back to a child-tag search, returning None instead of
    # raising), so the original check could index a missing 'href'.
    # Tag.has_attr() is the correct test for an HTML attribute.
    if nextPage is not None and nextPage.has_attr('href'):
        nextPage = 'http://item.grainger.cn/' + nextPage['href']
    else:
        nextPage = None
    # Collect the per-product tiles and build absolute product URLs.
    productTags = product_list_page.find_all('div', class_='product_grid_image img')
    urls = ['http://item.grainger.cn' + div.a['href'] for div in productTags]
    return {'urls': urls, 'next': nextPage}
def getCate4(self, name3, url3):
    """Fetch the level-4 sub-categories under a level-3 category page.

    Returns a list of {'name', 'url'} dicts. When the page exposes no
    level-4 links, falls back to a single entry for the level-3
    category itself (name3/url3).
    """
    soupCate4 = getHtmlAsSoup(url3)
    # Level-4 hrefs look like <tail of the level-3 URL> plus two digits.
    # BUGFIX: the URL fragment is escaped so regex metacharacters in it
    # (e.g. '.') match literally, and the digit pattern is a raw string
    # so '\d' is not an invalid escape sequence.
    cateLi4 = soupCate4.find_all('a', href=re.compile(re.escape(url3[-11:]) + r"\d\d$"))
    cate4 = []
    if cateLi4:
        for ct4 in cateLi4:
            cate4.append({'name': ct4.string, 'url': self.fullUrl(ct4['href'])})
    else:
        # No deeper level: treat the level-3 category itself as a leaf.
        cate4.append({'name': name3, 'url': url3})
    return cate4
def getProductList(self, pageUrl):
    """Scrape one product-list page.

    Returns a dict with 'urls' (relative product URLs on the page) and
    'next' (absolute URL of the next page, or None on the last page).
    """
    product_list_page = getHtmlAsSoup(pageUrl)
    # Look for the "next page" link ('下一页') inside the pagination bar.
    nextPage = product_list_page.find('div', class_='m-pagination').find(
        'a', text='下一页')
    if nextPage is not None and len(nextPage) != 0:
        nextPage = self.fullUrl(nextPage['href'])
    else:
        nextPage = None
    # Product links have the shape /product/<digits>.
    # BUGFIX: raw string so '\d' is not an invalid escape sequence.
    productTags = product_list_page.find_all(
        'a', class_='pic', href=re.compile(r'^/product/\d+$'))
    urls = [a['href'] for a in productTags]
    return {'urls': urls, 'next': nextPage}
def getProductOne(self, url):
    """Scrape one SPU (multi-SKU product) detail page into a Product record.

    Raises via raiseIf() when the URL does not look like an SPU URL
    (SPU paths contain a 'g' segment after the host).
    """
    raiseIf(url.replace('item.grainger.cn', '').find('g') <= 0, '传入的URL不属于SPU')
    soup = getHtmlAsSoup(url)
    # The product id is the second-to-last path segment of the URL.
    productId = url.split('/')[-2:-1][0]
    productName = soup.find('div', id='product-intro').find('h1').string
    productCode = productId
    # Breadcrumb: skip the leading "home" link, join names with '>'.
    categoryPathTag = soup.find('div', class_='node_path').find_all('a')[1:]
    categoryPath = ''
    for cpt in categoryPathTag:
        categoryPath += cpt.string.strip() + '>'
    categoryPath += productName
    # Direct parent category = second-to-last element of the path.
    categoryName = categoryPath.split('>')[-2:-1][0]
    productInrtoTag = soup.find('div', id='product-intro').find('ul', id='summary').find_all('li')
    price = ''
    unit_name = ''
    markedPrice = ''
    buyNo = ''
    brandName = ''
    brandUrl = ''
    brandImg = ''
    model = '分SKU'  # SPU pages carry per-SKU models; see getSkuOne
    # Summary rows: dispatch on the row's <div> label text.
    for mainInfo in productInrtoTag:
        label = mainInfo.find('div').string
        if label == '价 格:':  # price row
            price = mainInfo.find('strong', class_='p-price').string[1:]
        if label == '品 牌:':  # brand row
            brandLink = mainInfo.find('a')
            brandName = brandLink.string
            # BUGFIX: hasattr() on a bs4 Tag always succeeds (attribute
            # access falls back to a child-tag search); has_attr() is
            # the correct test for an HTML attribute.
            if brandLink.has_attr('href'):
                brandUrl = "http://item.grainger.cn/" + brandLink['href'][1:]
    productDetailTag = soup.find(id='content_product')
    descriptionTag = productDetailTag.find('div', class_='property')
    description = str(descriptionTag).replace('<br/>', '')
    # Specification list: each <div> holds one "name:value" pair.
    specTag = soup.find('ul', class_='specifications')
    specInfo = []
    if specTag is not None:
        for div in specTag.find_all('div'):
            specPair = div.string.split(':')
            if len(specPair) == 2:
                dbSpec = ProductSpec(
                    {'product_code': productCode, 'product_id': productId,
                     'spec_name': str(specPair[0]).strip(),
                     'spec_value': str(specPair[1]).strip()})
                specInfo.append(dbSpec)
    # Main (gallery) images.
    mainImage = []
    mainImageTags = soup.find('ul', class_='lh imageThumb')
    if mainImageTags is not None:
        for mtag in mainImageTags.find_all('a'):
            # The large-image URL sits in the 5th token of the rel
            # attribute; the slices strip its surrounding punctuation.
            mainUrl = mtag['rel'][4][1:][:-2]
            dbImage = ProductImage({'product_code': productCode, 'product_id': productId,
                                    'image_url': mainUrl, 'type': '2'})
            mainImage.append(dbImage)
    # Detail (description) images, lazy-loaded via data-original.
    imageInfo = []
    detailImageTags = productDetailTag.find('div', class_='group-picture')
    if detailImageTags is not None:
        for imageTag in detailImageTags.find_all('img'):
            imageUrl = imageTag['data-original']
            dbImage = ProductImage({'product_code': productCode, 'product_id': productId,
                                    'image_url': imageUrl, 'type': '1'})
            imageInfo.append(dbImage)
    # SKU table: one <tr> per SKU variant.
    skuInfos = []
    skuTags = soup.find(id='pd_table').tbody
    if skuTags is not None:
        for skuTr in skuTags.find_all('tr'):
            skuTag = skuTr.find('a', target='_blank')
            # hrefs are protocol-relative ('//...'); rebuild with http://
            skuUrl = 'http://' + skuTag['href'][2:]
            skuModel = skuTag.string
            stopSaleTag = skuTr.find('span', class_='iconOutOfOrder')
            stopSale = ''
            if stopSaleTag is not None:
                stopSale = '停止销售'
            alternativeProductUrl = ''
            alternativeTag = skuTr.find('td', class_='alternative')
            if alternativeTag is not None and alternativeTag.a is not None:
                alternativeProductUrl = alternativeTag.a['href'][2:]
            dbProductSku = ProductSku({'product_code': productCode, 'product_id': productId,
                                       'product_model': skuModel, 'model_url': skuUrl,
                                       'remark': stopSale,
                                       'can_replace': alternativeProductUrl,
                                       'info_saved': '0'})
            skuInfos.append(dbProductSku)
    # NOTE(review): unlike getSkuOne, no 'mark' key is set here — confirm
    # whether SPU records should also carry self.mark.
    dbProduct = Product({'category_path': categoryPath, 'product_id': productId,
                         'product_code': productCode, 'product_url': url,
                         'product_name': productName, 'price': price, 'model': model,
                         'description': description, 'buy_code': buyNo,
                         'brand_name': brandName, 'brand_img': brandImg,
                         'brand_url': brandUrl, 'unit_name': unit_name,
                         'market_price': markedPrice, 'image_saved': '0',
                         'product_type': 'SPU', 'category_name': categoryName,
                         'main_img': mainImage, 'detail_img': imageInfo,
                         'specs': specInfo, 'skus': skuInfos, 'comments': []})
    return dbProduct
def getSkuOne(self, skuUrl):
    """Scrape one SKU detail page into a Product record (product_type 'SKU').

    Raises via raiseIf() when the URL does not look like a SKU URL
    (SKU paths contain a 'u' segment after the host).
    """
    raiseIf(skuUrl.replace('item.grainger.cn', '').find('u') <= 0, '传入的URL不属于SKU')
    soup = getHtmlAsSoup(skuUrl)
    # Breadcrumb: skip the leading "home" link, join names with '>',
    # then drop the trailing separator.
    categoryPathTag = soup.find('div', class_='node_path').find_all('a')
    categoryPath = ''
    for cpt in categoryPathTag[1:]:
        categoryPath += cpt.string.strip() + '>'
    categoryPath = categoryPath[:-1]
    # Product id = second-to-last path segment of the URL.
    productId = skuUrl.split('/')[-2:-1][0]
    # Direct parent category = second-to-last element of the path.
    categoryName = categoryPath.split('>')[-2:-1][0]
    productName = soup.find('div', id='product-intro').find('h1').string
    productCode = productId
    productInrtoTag = soup.find('div', id='product-intro').find('div', class_='line').find_all('dl')
    price = ''
    unit_name = ''
    markedPrice = ''
    buyNo = ''
    brandName = ''
    brandUrl = ''
    brandImg = ''
    model = ''
    # Each <dl> is a labelled row; dispatch on its <dt> label text.
    # The labels are mutually exclusive, so the original's repeated
    # find('dt') lookups are hoisted and the ifs chained with elif.
    for mainInfo in productInrtoTag:
        label = mainInfo.find('dt').string
        if label == '价 格':  # price (amount + unit)
            priceTag = mainInfo.find('span', class_='p-price')
            price = priceTag.contents[0][1:]
            unit_name = priceTag.contents[1].string[1:]
        elif label == '面 价':  # marked (list) price
            markedPrice = mainInfo.find('dd', class_='p-price-del').string[1:]
        elif label == '订 货 号':  # order number
            buyNo = mainInfo.find('span').string
        elif label == '品 牌':  # brand
            brandLink = mainInfo.find('a')
            brandName = brandLink.string
            # NOTE(review): for a protocol-relative '//...' href this
            # yields 'http:/...' (single slash) — confirm against the
            # live markup; getProductOne uses 'http://' + href[2:].
            brandUrl = "http:" + brandLink['href'][1:]
        elif label == '制造商型号':  # manufacturer model
            model = mainInfo.find('dd').string
    productDetailTag = soup.find(id='content_product')
    descriptionTag = productDetailTag.find('div', class_='property')
    description = str(descriptionTag).replace('<br/>', '')
    # Specification list: each <div> holds one "name:value" pair.
    specTag = soup.find('ul', class_='specifications')
    specInfo = []
    if specTag is not None:
        for div in specTag.find_all('div'):
            specPair = div.string.split(':')
            if len(specPair) == 2:
                dbSpec = ProductSpec(
                    {'product_code': productCode, 'product_id': productId,
                     'spec_name': str(specPair[0]).strip(),
                     'spec_value': str(specPair[1]).strip()})
                specInfo.append(dbSpec)
    # Main image (single, taken from the zoom anchor).
    mainImage = []
    mainImageTag = soup.find('div', id='spec-n1')
    if mainImageTag is not None:
        mainImageTag = mainImageTag.find('a')
        dbImage = ProductImage({'product_code': productCode, 'product_id': productId,
                                'image_url': mainImageTag['href'], 'type': '2'})
        mainImage.append(dbImage)
    # Detail images, lazy-loaded via data-original.
    imageInfo = []
    detailImageTags = productDetailTag.find('div', class_='group-picture')
    if detailImageTags is not None:
        for imageTag in detailImageTags.find_all('img'):
            url = imageTag['data-original']
            dbImage = ProductImage(
                {'product_code': productCode, 'product_id': productId,
                 'image_url': url, 'type': '1'})
            imageInfo.append(dbImage)
    dbSkuProduct = Product({'mark': self.mark, 'category_path': categoryPath,
                            'product_id': productId, 'product_code': productCode,
                            'product_url': skuUrl, 'product_name': productName,
                            'price': price, 'model': model, 'description': description,
                            'buy_code': buyNo, 'brand_name': brandName,
                            'brand_img': brandImg, 'brand_url': brandUrl,
                            'unit_name': unit_name, 'market_price': markedPrice,
                            'image_saved': '0', 'product_type': 'SKU',
                            'category_name': categoryName, 'main_img': mainImage,
                            'detail_img': imageInfo, 'specs': specInfo,
                            'skus': [], 'comments': []})
    return dbSkuProduct
def getMainPage(self):
    """Fetch the crawler's entry URL and cache its parsed soup on self.mainPage."""
    soup = getHtmlAsSoup(self.url)
    self.mainPage = soup
def processOneProduct(self, productUrl):
    """Scrape one product detail page into a plain-dict product record."""
    detailSoup = getHtmlAsSoup(productUrl)
    # Breadcrumb container: concatenate the text of its children.
    categoryPathTag = detailSoup.find('div', class_='g-wrapper brand-menu-text')
    categoryPath = ''
    for cpt in categoryPathTag:
        categoryPath += cpt.string.strip()
    # Product id = everything after 'product/' in the URL.
    productId = productUrl[productUrl.find('product/') + 8:]
    # Direct parent category = second-to-last element of the '>'-joined
    # path. NOTE(review): relies on the breadcrumb text itself containing
    # '>' separators — confirm against the live markup.
    categoryName = categoryPath.split('>')[-2:-1][0]
    # Name / code.
    productName = detailSoup.find(
        'h1', class_='detail-goods-right-head ft22').string.strip()
    productCode = detailSoup.find('font', class_='J_goodNo').string.strip()
    # Price (strip the leading currency symbol).
    price = detailSoup.find('label', class_='ft24 a weight J_salePrice').text[1:]
    # Order number.
    buyNo = detailSoup.find('label', class_='J_buyNo d').string.strip()
    # Model.
    model = detailSoup.find('label', class_='J_model d').string.strip()
    # Brand info (hoisted: the original looked the <a>/<img> up repeatedly).
    brandInfo = detailSoup.find('div', class_='detail-goods-brand')
    brandLink = brandInfo.find('a')
    brandImgTag = brandLink.find('img')
    brandName = brandImgTag['title']
    brandImg = brandImgTag['src']
    brandUrl = brandLink['href']
    # Specification table: each row carries up to two key/value pairs in
    # columns (0,1) and (2,3); blank keys are skipped.
    specTag = detailSoup.find(
        'table', class_='detail-attrs-right-attrs fl J_attrs').find_all('tr')
    specInfo = []
    for row in specTag:
        cells = row.find_all('td')
        leftKey = cells[0].text.strip()
        if leftKey:
            specInfo.append({'key': leftKey, 'value': cells[1].text.strip()})
        rightKey = cells[2].text.strip()
        if rightKey:
            specInfo.append({'key': rightKey, 'value': cells[3].text.strip()})
    # Detail-description images.
    imgTags = detailSoup.find(
        'div', class_='detail-attrs-right-body J_body').find_all('img')
    imgUrls = [imgtag['src'] for imgtag in imgTags]
    # Thumbnail images.
    smallImgTag = detailSoup.find(
        'div', class_='detail-goods-left-do-box').find_all('img')
    smallImgUrls = [smImgTg['src'] for smImgTg in smallImgTag]
    productInfo = {
        'id': productId,
        'categoryPath': categoryPath,
        'code': productCode,
        'url': productUrl,
        'name': productName,
        'price': price,
        'model': model,
        'buyCode': buyNo,
        'brandname': brandName,
        'brandimg': brandImg,
        'brandurl': brandUrl,
        'spec': specInfo,
        'detail': imgUrls,
        'small_img': smallImgUrls,
        'category_name': categoryName
    }
    return productInfo
def loadMainPage(self):
    """Fetch the crawler's entry URL and cache its parsed soup on self.mainPage."""
    soup = tool.getHtmlAsSoup(self.url)
    self.mainPage = soup