def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape one product page from airecenter.cl (VirtueMart).

    Returns a one-element list with the Product, or [] when the page
    exposes no price block (treated as not purchasable).
    """
    session = session_with_proxy(extra_args)
    soup = BeautifulSoup(session.get(url).text, 'html.parser')
    name = soup.find("h1", "title").text
    # No price container => nothing to offer
    if soup.find("div", "product-price") is None:
        return []
    # SKU is embedded as digits inside the price div's id attribute
    sku = re.search(r'(\d+)', soup.find("div", "product-price")['id']
                    ).groups()[0]
    stock = -1  # quantity not published; -1 means "available, unknown"
    price = soup.find('div', 'product-price')
    price = price.find('div', 'PricesalesPrice').span.text
    price = Decimal(remove_words(price))
    description_a = html_to_markdown(str(soup.find('div', 's_desc').text))
    description_b = html_to_markdown(str(soup.find('div', 'desc')))
    description = description_a + '\n\n' + description_b
    # The gallery only links a resized thumbnail; strip the "_WxH"
    # suffix from its filename to build the full-size image URL.
    resized_picture_urls = soup.find('ul', 'pagination2').img['src']
    resized_picture_name = resized_picture_urls.split('/')[-1]
    picture_size = re.search(r'(_\d+x\d+)', resized_picture_name).groups()[0]
    picture_name = resized_picture_name.replace(picture_size, '')
    picture_urls = ['http://www.airecenter.cl/images/stories/'
                    'virtuemart/product/' + picture_name]
    p = Product(
        name,
        cls.__name__,
        category,
        url,
        url,
        sku,
        stock,
        price,
        price,
        'CLP',
        sku=sku,
        description=description,
        picture_urls=picture_urls
    )
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a Magento product page protected by Incapsula.

    Retries the GET up to 5 times to get past the bot check; a 404
    means the product no longer exists.
    """
    print(url)
    session = session_with_proxy(extra_args)
    for i in range(5):
        response = session.get(url)
        if response.status_code == 404:
            return []
        if response.status_code == 200:
            break
    else:
        # Called if no "break" was executed
        raise Exception('Could not bypass Incapsulata')
    soup = BeautifulSoup(response.text, 'html.parser')
    name = soup.find('span', {'itemprop': 'name'}).text.strip()
    sku = soup.find('div', {'itemprop': 'sku'}).text.strip()
    price = Decimal(
        soup.find('meta', {'itemprop': 'price'})['content'].strip())
    stock = -1  # availability not published; assume in stock
    # Gallery data lives in a Magento JS initializer; the non-greedy
    # regex stops before the closing brace, so it is re-appended.
    pictures_data = re.search(r'"mage/gallery/gallery": ([\s\S]*?)\}\n',
                              response.text).groups()[0]
    pictures_json = json.loads(pictures_data + '}')
    picture_urls = [tag['full'] for tag in pictures_json['data']]
    description = '{}\n\n{}'.format(
        html_to_markdown(
            str(soup.find('div', 'additional-attributes-wrapper'))),
        html_to_markdown(str(soup.find('div', 'description'))))
    p = Product(name, cls.__name__, category, url, url, sku, stock,
                price, price, cls.currency_iso, sku=sku,
                picture_urls=picture_urls, description=description)
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Build the single ARS-priced Product offered at the given page URL."""
    session = session_with_proxy(extra_args)
    document = BeautifulSoup(session.get(url).text, 'html.parser')

    product_name = document.find('div', 'product-name').text.strip()
    product_sku = document.find('input', {'name': 'product'})['value'].strip()

    # Price uses "." as thousands separator and "," as decimal separator.
    raw_price = document.find('span', 'price').text
    normalized = raw_price.replace('.', '').replace('$', '').replace(',', '.')
    amount = Decimal(normalized)

    markdown_description = html_to_markdown(
        str(document.find('div', 'product-collateral')))

    image_urls = []
    for image_tag in document.findAll('img', {'id': 'image'}):
        image_urls.append(image_tag['src'])

    return [Product(
        product_name,
        cls.__name__,
        category,
        url,
        url,
        product_sku,
        -1,
        amount,
        amount,
        'ARS',
        sku=product_sku,
        description=markdown_description,
        picture_urls=image_urls,
    )]
def products_for_url(cls, url, category=None, extra_args=None): print(url) # &_from=0&_to=49 session = session_with_proxy(extra_args) page = 0 page_size = 50 products = [] while True: target_url = '{}&_from={}&_to={}'.format( url, page*page_size, (page + 1) * page_size - 1 ) data = session.get(target_url) json_data = json.loads(data.text) if not json_data: if page == 0: raise Exception('Empty category: ' + target_url) break for product in json_data: name = product['productName'] sku = product['productReference'] product_url = product['link'] stock = product['items'][0]['sellers'][0][ 'commertialOffer']['AvailableQuantity'] price = Decimal(product['items'][0]['sellers'][0] ['commertialOffer']['Price']) pictures = product['items'][0]['images'] picture_urls = [] for picture in pictures: picture_urls.append(picture['imageUrl']) description = html_to_markdown(product['description']) p = Product( name, cls.__name__, category, product_url, url, sku, stock, price, price, 'CLP', sku=sku, part_number=product['productReference'], description=description, picture_urls=picture_urls ) products.append(p) page += 1 return products
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a Shopify-style product page (MXN store).

    Fix: ``stock`` was only assigned when a stock-container entry
    contained 'pzas.', raising NameError otherwise. It now defaults
    to 0 (out of stock) when no parsable quantity is found.
    """
    print(url)
    session = session_with_proxy(extra_args)
    page_source = session.get(url).text
    soup = BeautifulSoup(page_source, 'html.parser')
    name = soup.find('h1', {'itemprop': 'name'})
    if not name:
        name = soup.find('p', {'itemprop': 'name'})
    name = name.text
    sku = soup.find('span', 'variant-sku').text
    # The quantity lives beside an <h6> header containing "EXISTENCIA"
    potential_stock_containers = soup.findAll('h6')
    stock_container = None
    for container in potential_stock_containers:
        if 'EXISTENCIA' in container.text:
            stock_container = container.parent.find('div').contents
    stock = 0  # default when no "N pzas." entry can be parsed
    if stock_container:
        for item in stock_container:
            if 'pzas.' in item:
                stock = int(item.replace('pzas.', ''))
                break
    price = soup.find('span', 'gf_product-price money').text
    price = Decimal(price.replace('$', '').replace(',', ''))
    images = soup.findAll('meta', {'property': 'og:image:secure_url'})
    picture_urls = [i["content"] for i in images]
    description = html_to_markdown(
        str(soup.find('div', 'product-description')))
    # Refurbished units are flagged in the product name itself
    if 'reacondicionado' in name.lower():
        condition = 'https://schema.org/RefurbishedCondition'
    else:
        condition = 'https://schema.org/NewCondition'
    p = Product(name, cls.__name__, category, url, url, sku, stock, price,
                price, 'MXN', sku=sku, picture_urls=picture_urls,
                description=description, condition=condition)
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Return the page's Product; the listed net price is grossed up by 19% VAT."""
    session = session_with_proxy(extra_args)
    markup = BeautifulSoup(session.get(url).text, 'html.parser')

    title = markup.find('h1', 'entry-title').text.strip()
    product_id = markup.find('input', {'name': 'product_id'})['value'].strip()

    body = html_to_markdown(str(markup.find('div', 'product_description')))
    gallery = [anchor['href'] for anchor in markup.findAll('a', 'thickbox')]

    # Listed price excludes VAT; apply 19% and round to whole pesos.
    net_price = Decimal(remove_words(markup.find('span', 'currentprice').text))
    final_price = (net_price * Decimal('1.19')).quantize(0)

    return [Product(title, cls.__name__, category, url, url, product_id,
                    -1, final_price, final_price, 'CLP', sku=product_id,
                    description=body, picture_urls=gallery)]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a product whose data is embedded as a JS array literal.

    The page defines ``value_product = [...]``; the first element holds
    name, SKU, stock and the two web prices.
    """
    print(url)
    session = session_with_proxy(extra_args)
    response = session.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Non-greedy match drops the closing bracket, so restore it
    data = re.search(r'value_product = ([\s\S]+?)\];',
                     response.text).groups()[0] + ']'
    data = json.loads(data)[0]
    name = data['descripcion'].strip()
    sku = data['idproducto'].strip()
    stock = round(float(data['stock']))
    offer_price = Decimal(data['precioweb1'])
    normal_price = Decimal(data['precioweb2'])
    description = None
    if data['long_descrip']:
        description = html_to_markdown(data['long_descrip'])
    picture_urls = [x['href'] for x in soup.findAll('a', 'fancybox')]
    p = Product(name, cls.__name__, category, url, url, sku, stock,
                normal_price, offer_price, 'CLP', sku=sku,
                description=description, picture_urls=picture_urls)
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a product from catalogo.movistar.cl.

    Stock is fetched via a separate AJAX POST keyed by SKU; the
    python-requests user agent appears to be deliberate — TODO confirm
    the site requires it.
    """
    session = session_with_proxy(extra_args)
    session.headers['user-agent'] = 'python-requests/2.21.0'
    response = session.get(url)
    if response.status_code == 404:
        return []
    page_source = response.text
    soup = BeautifulSoup(page_source, 'html.parser')
    # Pages without a body or a product title are placeholders
    if not soup.find('body') or \
            not soup.find('h1', {'id': 'nombre-producto'}):
        return []
    name = soup.find('h1', {'id': 'nombre-producto'}).text.strip()
    sku = soup.find('div', {'itemprop': 'sku'}).text.strip()
    # Separate session for the stock AJAX endpoint with XHR headers
    ajax_session = session_with_proxy(extra_args)
    ajax_session.headers['user-agent'] = 'python-requests/2.21.0'
    ajax_session.headers['x-requested-with'] = 'XMLHttpRequest'
    ajax_session.headers['content-type'] = \
        'application/x-www-form-urlencoded'
    stock_data = json.loads(
        ajax_session.post(
            'https://catalogo.movistar.cl/fullprice/stockproducto/validar/',
            'sku=' + sku).text)
    stock = stock_data['respuesta']['cantidad']
    price_container = soup.find('span', 'special-price').find('p')
    price = Decimal(remove_words(price_container.text))
    description = html_to_markdown(
        str(soup.find('div', 'detailed-desktop')))
    # Refurbished ("seminuevo") units are flagged in the description
    if 'seminuevo' in description:
        condition = 'https://schema.org/RefurbishedCondition'
    else:
        condition = 'https://schema.org/NewCondition'
    picture_urls = [soup.find('meta', {'property': 'og:image'})['content']]
    return [
        Product(name, cls.__name__, category, url, url, sku, stock, price,
                price, 'CLP', condition=condition, sku=sku,
                description=description, picture_urls=picture_urls)
    ]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a WooCommerce product page (CLP store).

    The wishlist button's data attribute carries the product id; the
    optional <span class="sku"> holds the manufacturer part number.
    """
    print(url)
    session = session_with_proxy(extra_args)
    page_source = session.get(url).text
    soup = BeautifulSoup(page_source, 'html.parser')
    name = soup.find('h1', 'product_title').text
    sku = soup.find('div', 'wd-wishlist-btn').find('a')['data-product-id']
    stock_container = soup.find('p', 'stock')
    if stock_container:
        # Text is either "Agotado" or "<N> disponibles"
        stock_text = stock_container.text.split(' ')[0]
        if stock_text == 'Agotado':
            stock = 0
        else:
            stock = int(stock_text)
    else:
        stock = -1
    part_number_container = soup.find('span', 'sku')
    if part_number_container:
        part_number = part_number_container.text.strip()
    else:
        part_number = None
    price_container = soup.find('p', 'price')
    # <ins> holds the discounted price when a sale is active
    if price_container.find('ins'):
        price = Decimal(
            price_container.find('ins').text.replace('$', '').replace('.', ''))
    else:
        price = Decimal(
            price_container.text.replace('$', '').replace('.', ''))
    picture_containers = soup.findAll('div', 'product-image-wrap')
    picture_urls = [p.find('a')['href'] for p in picture_containers]
    description = html_to_markdown(
        str(soup.find('div', {'id': 'tab-description'})))
    p = Product(name, cls.__name__, category, url, url, sku, stock, price,
                price, 'CLP', sku=sku, picture_urls=picture_urls,
                description=description, part_number=part_number)
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Parse one ARS-priced product out of the given page."""
    session = session_with_proxy(extra_args)
    page = BeautifulSoup(session.get(url).text, 'html.parser')

    title = page.find('meta', {'name': 'Title'})['content'].strip()
    product_code = page.find('input', {'name': 'id'})['value'].strip()
    amount = Decimal(page.find('input', {'id': 'product_price'})['value'])

    specs_section = page.find('div', {'id': 'especificaciones-container'})
    summary = html_to_markdown(str(specs_section))

    gallery = []
    for image in page.find('div', 'owl-carousel').findAll('img'):
        gallery.append(image['data-zoom-image'])

    return [Product(
        title,
        cls.__name__,
        category,
        url,
        url,
        product_code,
        -1,
        amount,
        amount,
        'ARS',
        sku=product_code,
        description=summary,
        picture_urls=gallery,
    )]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a Magento product page priced in GTQ (quetzales)."""
    session = session_with_proxy(extra_args)
    data = session.get(url).text
    soup = BeautifulSoup(data, 'html.parser')
    sku_container = soup.find('h6', 'sku')
    # Pages without an SKU header are not product pages
    if not sku_container:
        return []
    sku = sku_container.text.strip()
    # SKU is appended to the name to disambiguate variants
    name = "{} ({})".format(
        soup.find('div', 'product-name').find('h1').text.strip(), sku)
    if soup.find('p', 'availability').find('span').text.strip() \
            == 'En existencia':
        stock = -1
    else:
        stock = 0
    price = Decimal(
        soup.find('div', 'price-box').find('span', 'price').text.replace(
            'Q', '').replace(',', ''))
    picture_urls = [soup.find('p', 'product-image').find('a')['href']]
    description = html_to_markdown(
        str(soup.find('div', {'id': 'product_tabs_description_contents'})))
    description += '\n\n'
    description += html_to_markdown(
        str(soup.find('div', {'id': 'product_tabs_additional_contents'})))
    p = Product(name, cls.__name__, category, url, url, sku, stock, price,
                price, 'GTQ', sku=sku, picture_urls=picture_urls,
                description=description)
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a VTEX product page (COP), emitting one Product per SKU.

    Fix: the 5% 'lojamultilaser' seller discount was applied in-place
    to ``price``, so a second matching SKU received a compounded
    (0.95 * 0.95) discount. The discount is now computed per SKU from
    the unmodified base price.
    """
    print(url)
    session = session_with_proxy(extra_args)
    page_source = session.get(url).text
    # Pricing/stock info comes from the vtex.events payload...
    pricing_data = re.search(r'vtex.events.addData\(([\S\s]+?)\);',
                             page_source).groups()[0]
    pricing_data = json.loads(pricing_data)
    # ...and the SKU list from the skuJson_0 variable
    skus_data = re.search(r'var skuJson_0 = ([\S\s]+?);CATALOG',
                          page_source).groups()[0]
    skus_data = json.loads(skus_data)
    name = '{} {}'.format(pricing_data['productBrandName'],
                          pricing_data['productName'])
    price = Decimal(pricing_data['productPriceTo'])
    soup = BeautifulSoup(page_source, 'html.parser')
    picture_urls = [
        tag['rel'][0] for tag in soup.findAll('a', {'id': 'botaoZoom'})
    ]
    description = html_to_markdown(
        str(soup.find('section', 'product-specs')))
    products = []
    # Normalize the EAN: pad UPC-12 to EAN-13 and validate the checksum
    if 'productEans' in pricing_data:
        ean = pricing_data['productEans'][0]
        if len(ean) == 12:
            ean = '0' + ean
        if not check_ean13(ean):
            ean = None
    else:
        ean = None
    for sku_data in skus_data['skus']:
        sku = str(sku_data['sku'])
        stock = pricing_data['skuStocks'][sku]
        # Per-SKU discount computed from the base price (no mutation)
        if sku_data['sellerId'] == 'lojamultilaser':
            sku_price = (price * Decimal('0.95')).quantize(Decimal('0.01'))
        else:
            sku_price = price
        p = Product(name, cls.__name__, category, url, url, sku, stock,
                    sku_price, sku_price, 'COP', sku=sku, ean=ean,
                    description=description, picture_urls=picture_urls)
        products.append(p)
    return products
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a BRL store whose product data sits in a JS ``dataLayer``.

    Gallery pictures are shared across every product entry on the page.
    """
    session = session_with_proxy(extra_args)
    page_source = session.get(url).text
    soup = BeautifulSoup(page_source, 'html.parser')
    picture_urls = []
    for tag in soup.findAll('li', 'owl-item'):
        # Prefer the zoom image, fall back to the normal-size one
        picture_path = tag.find('a')['data-zoom'].replace(' ', '%20').strip()
        if not picture_path:
            picture_path = tag.find('a')['data-normal'].replace(
                ' ', '%20').strip()
        if not picture_path:
            continue
        picture_url = 'https:' + picture_path
        picture_urls.append(picture_url)
    if not picture_urls:
        picture_urls = None
    # dataLayer is JS, not strict JSON, hence demjson
    pricing_data = demjson.decode(
        re.search(r'dataLayer = ([\S\s]+?);dataLayer',
                  page_source).groups()[0])[0]
    products = []
    for product_entry in pricing_data['product']:
        name = product_entry['productName']
        sku = str(product_entry['productSku'])
        price = Decimal(product_entry['productDiscount'])
        if product_entry['productAvailable']:
            stock = -1
        else:
            stock = 0
        description = html_to_markdown(
            html.unescape(product_entry['productDescription']))
        p = Product(name, cls.__name__, category, url, url, sku, stock,
                    price, price, 'BRL', sku=sku, description=description,
                    picture_urls=picture_urls)
        products.append(p)
    return products
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape an MXN product page (parsed with html5lib for lenient HTML).

    The part number, when present, is found in a table row whose <th>
    text is "mpn".
    """
    print(url)
    session = session_with_proxy(extra_args)
    response = session.get(url)
    if response.status_code in [404]:
        return []
    page_source = response.text
    soup = BeautifulSoup(page_source, 'html5lib')
    name = soup.find('h1', 'name').text
    sku = soup.find('div', {'itemprop': 'sku'}).text
    availability = soup.find('div', 'availability')
    if availability:
        stock = int(soup.find('div', 'availability').find('strong').text)
    else:
        stock = 0
    price = Decimal(
        soup.find('span', 'price').text.replace('$', '').replace(',', ''))
    # Multi-image carousel when available, single image otherwise
    if soup.find('div', {'id': 'owl-carousel-gallery'}):
        picture_urls = [
            i['src'] for i in soup.find('div', {
                'id': 'owl-carousel-gallery'
            }).findAll('img', 'img-fluid')
        ]
    else:
        picture_urls = [soup.find('img', 'img-fluid')['src']]
    description = html_to_markdown(str(soup.find('div', 'description')))
    ths = soup.findAll('th')
    part_number = None
    for th in ths:
        if th.text == "mpn":
            part_number = th.parent.find('td').text
    p = Product(name, cls.__name__, category, url, url, sku, stock, price,
                price, 'MXN', sku=sku, picture_urls=picture_urls,
                description=description, part_number=part_number)
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a product page from ttchile.cl.

    Layout is positional: the "textOtrosPrecios" divs hold, in order,
    the normal price, a stock-level icon, and the SKU.
    """
    session = session_with_proxy(extra_args)
    session.headers['User-Agent'] = \
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
        '(KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'
    soup = BeautifulSoup(session.get(url, timeout=30).text, 'html.parser')
    containers = soup.findAll('div', 'textOtrosPrecios')
    normal_price = Decimal(remove_words(containers[0].text))
    # Stock level is conveyed by which icon image is shown
    stock_image = containers[1].find('img')['src']
    if stock_image in [
        'images/imagenes/ico_normal.jpg',
        'images/imagenes/ico_bajo.jpg'
    ]:
        stock = -1
    else:
        stock = 0
    sku = containers[2].text.strip()
    name = soup.find('div', 'textTituloProducto').text.strip()
    offer_price = Decimal(
        remove_words(soup.find('div', 'textPrecioContado').text))
    description = html_to_markdown(str(soup.find('div', 'p7TPcontent')))
    # Main picture is the third <img> of the second "table20" table
    main_picture = soup.findAll(
        'table', {'id': 'table20'})[1].findAll('img')[2]['src']
    picture_paths = [main_picture]
    picture_paths.extend(
        [tag['src'] for tag in soup.findAll('img', 'Imagen')])
    picture_urls = []
    for path in picture_paths:
        # Rebuild each image URL through the image-framing PHP script
        picture_id = path.split('=')[-1]
        picture_url = 'http://www.ttchile.cl/images/imgproductos/' \
                      'imgImagenMarco.php?imagen=' + picture_id
        picture_urls.append(picture_url.replace(' ', '%20'))
    p = Product(name, cls.__name__, category, url, url, sku, stock,
                normal_price, offer_price, 'CLP', sku=sku,
                description=description, picture_urls=picture_urls)
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a PYG-priced WooCommerce page; only LG products are kept.

    Non-LG products get stock forced to 0, presumably to exclude them
    from availability — TODO confirm this is the intended filter.
    """
    print(url)
    session = session_with_proxy(extra_args)
    soup = BeautifulSoup(session.get(url).text, 'html.parser')
    name = soup.find('h1', 'product_title').text.strip()
    sku = soup.find('meta',
                    {'property': 'product:retailer_item_id'})['content']
    if not soup.find('input', {'id': 'the-cantidad-selector'}):
        return []
    stock = soup.find('input', {'id': 'the-cantidad-selector'})['max']
    if stock:
        stock = int(stock)
    else:
        stock = -1
    if 'LG' not in name.upper().split(' '):
        stock = 0
    # Prices use "Gs." prefix and "." as thousands separator
    normal_price = Decimal(
        soup.find('p', 'price').find('span', 'amount').text.replace(
            'Gs.', '').replace('.', '').strip())
    offer_price = Decimal(
        soup.find('p', 'price').find('span', {
            'id': 'elpreciocentralPorta'
        }).text.split('Gs.')[-1].replace('.', '').replace('!', '').strip())
    # Never report an offer price above the normal price
    if normal_price < offer_price:
        offer_price = normal_price
    description = html_to_markdown(
        str(soup.find('div', {'itemprop': 'description'})))
    pictures = soup.findAll('div', 'thumbnails-single owl-carousel')
    picture_urls = []
    for picture in pictures:
        picture_url = picture.find('a')['href']
        picture_urls.append(picture_url)
    return [
        Product(name, cls.__name__, category, url, url, sku, stock,
                normal_price, offer_price, 'PYG', sku=sku,
                description=description, picture_urls=picture_urls)
    ]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a WooCommerce product page priced in MXN.

    A redirect (response URL differing from the request URL) is
    treated as "product gone".
    """
    print(url)
    session = session_with_proxy(extra_args)
    response = session.get(url)
    if response.url != url:
        return []
    page_source = response.text
    soup = BeautifulSoup(page_source, 'html.parser')
    name = soup.find('h1', 'product_title').text
    sku_container = soup.find('span', 'sku')
    if not sku_container:
        return []
    sku = sku_container.text
    if soup.find('p', 'out-of-stock'):
        stock = 0
    else:
        stock = -1
    # <ins> holds the discounted price when a sale is active
    price_container = soup.find('p', 'price').find('ins')
    if price_container:
        price = price_container.find('span', 'amount').text
    else:
        price = soup.find('p', 'price').find('span', 'amount').text
    price = Decimal(price.replace('$', '').replace(',', ''))
    images = soup.find(
        'figure', 'woocommerce-product-gallery__wrapper').findAll('img')
    picture_urls = [i['src'] for i in images]
    description = html_to_markdown(
        str(soup.find('div', {'id': 'tab-description'})))
    p = Product(
        name,
        cls.__name__,
        category,
        url,
        url,
        sku,
        stock,
        price,
        price,
        'MXN',
        sku=sku,
        picture_urls=picture_urls,
        description=description,
    )
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a Shopify product, emitting one Product per color variant.

    Variant IDs come from the ``var meta`` JS blob; each variant page
    is then fetched separately via the ``?variant=`` query parameter.
    """
    print(url)
    session = session_with_proxy(extra_args)
    response_text = session.get(url).text
    variants_raw_data = re.search(r'var meta = ([\S\s]+?);\n',
                                  response_text).groups()[0]
    variants_data = json.loads(variants_raw_data)['product']['variants']
    products = []
    for variant in variants_data:
        variant_id = variant['id']
        sku = variant['sku']
        color = variant['public_title']
        variant_url = '{}?variant={}'.format(url, variant_id)
        variant_url_source = session.get(variant_url).text
        soup = BeautifulSoup(variant_url_source, 'html.parser')
        name = soup.find('h1', 'product_name').text + " ({})".format(color)
        stock = 0
        if soup.find('link', {'itemprop': 'availability'})['href'] == \
                'http://schema.org/InStock':
            stock = -1
        price_text = soup.find('span', 'current_price').text.strip()\
            .replace('$', '').replace('.', '')
        # A dash means no price published for this variant; skip it
        if price_text == '-':
            continue
        price = Decimal(price_text)
        image_containers = soup.findAll('div', 'image__container')
        picture_urls = [
            'http:' + i.find('img')['data-src'] for i in image_containers
        ]
        description = html_to_markdown(
            str(soup.find('div', {'data-et-handle': 'tabs-descripcion'})))
        p = Product(name, cls.__name__, category, variant_url, url, sku,
                    stock, price, price, 'CLP', sku=sku,
                    picture_urls=picture_urls, description=description)
        products.append(p)
    return products
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a BRL product page with a separate "boleto" cash price.

    When out of stock the boleto price block is absent, so the normal
    price is reused as the offer price.
    """
    session = session_with_proxy(extra_args)
    soup = BeautifulSoup(session.get(url).text, 'html.parser')
    name = soup.find('h1').text.strip()
    if soup.find('link', {'itemprop': 'availability'})['href'] == \
            'http://schema.org/InStock':
        stock = -1
    else:
        stock = 0
    sku = soup.find('div', 'product-name').find('span').text.strip()
    panels = [
        soup.find('div', {'id': 'description'}),
        soup.find('div', {'id': 'additional'})
    ]
    description = '\n\n'.join([html_to_markdown(str(panel))
                               for panel in panels])
    # Brazilian format: "." thousands separator, "," decimal separator
    normal_price = soup.find('p', {'itemprop': 'price'}).text
    normal_price = Decimal(normal_price.replace('R$', '').replace(
        '.', '').replace(',', '.'))
    if stock == 0:
        offer_price = normal_price
    else:
        offer_price = soup.find('span', 't_boleto_price').text
        offer_price = Decimal(offer_price.split('$')[1].replace(
            '.', '').replace(',', '.'))
    pictures_container = soup.find('ul', 'bxslider')
    if pictures_container:
        picture_urls = [link['href']
                        for link in pictures_container.findAll('a')]
    else:
        picture_urls = [soup.find('a', 'cloud-zoom-gallery')['href']]
    p = Product(
        name,
        cls.__name__,
        category,
        url,
        url,
        sku,
        stock,
        normal_price,
        offer_price,
        'BRL',
        sku=sku,
        description=description,
        picture_urls=picture_urls
    )
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a Magento page whose listed price is the cash ("efectivo") one.

    Fix: the 3.4% card surcharge was computed with ``Decimal(1.034)``,
    which constructs an inexact Decimal from a binary float. It now
    uses the exact string literal and rounds to whole pesos (CLP has
    no fractional unit).
    """
    print(url)
    session = session_with_proxy(extra_args)
    response = session.get(url)
    if response.status_code == 404:
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    name = soup.find('span', {'itemprop': 'name'}).text
    sku = soup.find('div', {'itemprop': 'sku'}).text
    stock = 0
    stock_container = soup.find('div', 'product-stock')
    if stock_container:
        stock = int(stock_container.text.strip().split(' ')[1])
    offer_price = Decimal(
        soup.find('span', 'efectivo').find('span', 'price').text.replace(
            '$', '').replace('.', ''))
    # Exact decimal multiplier instead of Decimal(float)
    normal_price = (offer_price * Decimal('1.034')).quantize(0)
    # Gallery images live in a Magento x-magento-init script block
    image_scripts = soup.findAll('script', {'type': 'text/x-magento-init'})
    picture_urls = []
    for script in image_scripts:
        if 'mage/gallery/gallery' in script.text:
            image_data = json.loads(
                script.text)['[data-gallery-role=gallery-placeholder]'][
                'mage/gallery/gallery']['data']
            for data in image_data:
                picture_urls.append(data['img'])
    description = html_to_markdown(str(soup.find('div', 'description')))
    # SKU field is capped at 50 characters downstream
    if len(sku) > 50:
        sku = sku[0:50]
    p = Product(
        name,
        cls.__name__,
        category,
        url,
        url,
        sku,
        stock,
        normal_price,
        offer_price,
        'CLP',
        sku=sku,
        part_number=sku,
        picture_urls=picture_urls,
        description=description,
    )
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape an MXN product page from a "detailsInfo" layout store.

    Fix: the page was downloaded twice (the first response was used
    only for the 404 check, then discarded). The single response is
    now reused for parsing.
    """
    print(url)
    session = session_with_proxy(extra_args)
    response = session.get(url)
    if response.status_code == 404:
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    name = soup.find('h1', 'detailsInfo_right_title').text
    sku = soup.find('div', 'detailsInfo_right_artnum')\
        .text.replace('SKU:', '').strip()
    if not soup.find('span', 'stockFlag'):
        stock = 0
    else:
        stock = int(soup.find('span', 'stockFlag').find('span').text)
    # No price tag means the product cannot be offered
    if not soup.find('span', 'priceText'):
        return []
    price = Decimal(
        soup.find('span',
                  'priceText').text.replace('$', '').replace(',', ''))
    # Multi-image slider when present; otherwise no pictures
    if soup.find('div', 'detailsInfo_left_picture_morepictures')\
            .find('div', 'emslider2_items'):
        picture_urls = []
        images = soup.find('div', 'detailsInfo_left_picture_morepictures')\
            .find('div', 'emslider2_items').findAll('li')
        for image in images:
            picture_urls.append(image.find('a')['data-src'])
    else:
        picture_urls = None
    description = html_to_markdown(
        str(soup.find('div', 'cpattributes-box')))
    p = Product(name, cls.__name__, category, url, url, sku, stock, price,
                price, 'MXN', sku=sku, picture_urls=picture_urls,
                description=description, part_number=sku)
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Handle the two GTD URLs: the prepaid-plan page and the phone catalog.

    Anything else is a programming error and raises.
    """
    products = []
    if url == cls.prepago_url:
        # Plan Prepago
        p = Product(
            'GTD Prepago',
            cls.__name__,
            category,
            url,
            url,
            'Claro Prepago',
            -1,
            Decimal(0),
            Decimal(0),
            'CLP',
        )
        products.append(p)
    elif url == cls.equipos_url:
        session = session_with_proxy(extra_args)
        body = session.get(url).text
        # The catalog is a JS assignment; strip the trailing semicolon
        json_body = re.search(r'var catalog = (.+)', body).groups()[0][:-1]
        json_body = json.loads(json_body)
        for json_product in json_body['products']:
            if not json_product['published']:
                continue
            name = json_product['name']
            sku = json_product['id']
            price = Decimal(remove_words(json_product['leasing_price']))
            description = html_to_markdown(json_product['description'])
            picture_urls = [
                'https://nuevo.gtdmanquehue.com' + im['options']['url']
                for im in json_product['images']
            ]
            product = Product(name, cls.__name__, 'Cell', url, url, sku,
                              -1, price, price, 'CLP', sku=sku,
                              cell_plan_name='GTD Prepago',
                              description=description,
                              picture_urls=picture_urls)
            products.append(product)
    else:
        raise Exception('Invalid URL: ' + url)
    return products
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a BRL page whose data sits in a JS ``digitalData`` object.

    Redirects are treated as "product gone". Several environment keys
    in the blob are stripped before demjson parsing because their
    values are not valid literals.
    """
    session = session_with_proxy(extra_args)
    response = session.get(url)
    if response.url != url:
        return []
    page_source = response.text
    pricing_data = re.search(r'digitalData = ([\S\s]+?); </script',
                             page_source).groups()[0]
    for kw in ['domain', 'fullName', 'protocol', 'pathname', 'referrer']:
        for_replace = "'{}': .+".format(kw)
        pricing_data = re.sub(for_replace, '', pricing_data)
    pricing_data = demjson.decode(pricing_data)['page']['product']
    name = pricing_data['title']
    sku = pricing_data['idSku']
    if pricing_data['stockAvailability']:
        stock = -1
    else:
        stock = 0
    # Without a cash price the product has no published pricing
    if 'cashPrice' in pricing_data:
        normal_price = Decimal(pricing_data['salePrice'])
        offer_price = Decimal(pricing_data['cashPrice'])
    else:
        normal_price = Decimal(0)
        offer_price = Decimal(0)
    soup = BeautifulSoup(page_source, 'html.parser')
    description = html_to_markdown(str(soup.find('div', 'description')))
    picture_urls = [tag['data-src'] for tag in
                    soup.findAll('img', 'carousel-product__item-img')]
    p = Product(
        name,
        cls.__name__,
        category,
        url,
        url,
        sku,
        stock,
        normal_price,
        offer_price,
        'BRL',
        sku=sku,
        description=description,
        picture_urls=picture_urls
    )
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a USD-priced page whose details sit in a key/value table.

    Stock is summed across every "Cantidad" row; a '+' suffix means
    "plenty" and is reported as -1 (available, unknown quantity).
    """
    print(url)
    session = session_with_proxy(extra_args)
    response = session.get(url)
    if response.status_code == 404:
        return []
    soup = BeautifulSoup(response.text, 'html5lib')
    name = soup.find('h1', 'name').text.strip()
    info_table = soup.find('div', 'listing')
    rows = info_table.findAll('tr')
    sku = rows[0].find('td', 'td_right').text.strip()
    stock = 0
    # Last row is the price, hence len(rows) - 1
    for i in range(0, len(rows) - 1):
        left_text = rows[i].find('td', 'td_left').text
        if 'Cantidad' not in left_text:
            continue
        right_text = rows[i].find('td', 'td_right').text
        if '+' in right_text:
            stock = -1
            break
        if 'Agotado' not in right_text:
            stock += int(right_text)
    price = Decimal(rows[-1].find('td',
                                  'td_right').text.split('$')[-1].replace(
        ',', ''))
    description = html_to_markdown(str(soup.find('div', 'description')))
    image_containers = soup.findAll('li', 'wrapper_pic_div')
    picture_urls = []
    for image in image_containers:
        picture_url = image.find('a')['href'].replace(' ', '%20')
        picture_urls.append(picture_url)
    p = Product(name, cls.__name__, category, url, url, sku, stock, price,
                price, 'USD', sku=sku, description=description,
                picture_urls=picture_urls)
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a product page from eglo.cl.

    When a crossed-out old price is shown, the effective price is 90%
    of the old price rounded to whole pesos.
    """
    print(url)
    session = session_with_proxy(extra_args)
    soup = BeautifulSoup(session.get(url).text, 'html.parser')
    name = soup.find('div', 'product-info__description').text.strip()
    # SKU is the text after the colon in the title span
    sku = soup.find(
        'div', 'product-info__title').find('span').text.split(':')[1].strip()
    stock_container = soup.find('input', {'id': 'producto_cantidad'})
    if stock_container:
        stock = int(stock_container['max'])
    else:
        stock = 0
    price_container = soup.find('span', 'price-box__new')
    old_price_container = price_container.find('s')
    if old_price_container:
        old_price = Decimal(remove_words(old_price_container.text))
        price = (old_price * Decimal('0.9')).quantize(0)
    else:
        price = Decimal(remove_words(price_container.text))
    description = html_to_markdown(str(soup.find('div', 'tab-content')),
                                   'http://www.eglo.cl')
    picture_containers = soup.findAll('a', 'swiper-slide')
    if picture_containers:
        picture_urls = []
        for container in picture_containers:
            picture_url = container.find('img')['src']
            picture_urls.append(picture_url)
    else:
        picture_urls = [
            soup.find('div', 'product-main-'
                             'image__item').img['src']
        ]
    p = Product(name, cls.__name__, category, url, url, sku, stock, price,
                price, 'CLP', sku=sku, description=description,
                picture_urls=picture_urls)
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a BRL store page annotated with schema.org itemprops.

    The SKU keeps only the part after the first dot of the raw value.
    """
    session = session_with_proxy(extra_args)
    session.headers['User-Agent'] = \
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
        '(KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'
    soup = BeautifulSoup(session.get(url).text, 'html.parser')
    name = soup.find('h1', {'itemprop': 'name'})
    if not name:
        return []
    name = name.text.strip()
    stock = -1
    # "avise-me" button replaces the buy button when out of stock
    if soup.find('strong', 'text-not-product-avisme'):
        stock = 0
    price = soup.find('meta', {'itemprop': 'lowPrice'})
    if not price:
        price = soup.find('meta', {'itemprop': 'price'})
    normal_price = Decimal(price['content'])
    offer_price = normal_price
    sku = soup.find('span', {'itemprop': 'sku'}).text.split('.', 1)[1]
    description = html_to_markdown(str(
        soup.find('div',
                  'yCmsContentSlot productDetailsPageShortDescription')))
    picture_tags = soup.find('div', 'gallery-image').findAll('img')
    picture_urls = [tag['data-zoom-image'] for tag in picture_tags
                    if tag.has_attr('data-zoom-image')]
    if not picture_urls:
        picture_urls = None
    p = Product(
        name,
        cls.__name__,
        category,
        url,
        url,
        sku,
        stock,
        normal_price,
        offer_price,
        'BRL',
        sku=sku,
        description=description,
        picture_urls=picture_urls
    )
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a CLP WooCommerce page that always shows ins/del prices.

    <ins> holds the offer price, <del> the normal (pre-discount) one.
    An empty price block means the product is unpublished.
    """
    print(url)
    session = session_with_proxy(extra_args)
    session.headers['User-Agent'] = \
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
        '(KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'
    soup = BeautifulSoup(session.get(url).text, 'html.parser')
    name = soup.find('h2', 'product_title').text.strip()
    sku = soup.find('span', 'sku').text.strip()
    stock_text = soup.find('span', 'stock').text.strip()
    stock = 0
    if stock_text != 'Agotado':
        stock = int(stock_text.split(' ')[0])
    price_container = soup.find('p', 'price')
    if not price_container.text.strip():
        return []
    offer_price = Decimal(
        remove_words(price_container.find('ins').find('span').text))
    normal_price = Decimal(
        remove_words(price_container.find('del').find('span').text))
    picture_containers = soup.findAll('div', 'img-thumbnail')
    picture_urls = []
    for picture in picture_containers:
        # Some thumbnails lack the 'content' attribute; skip those
        try:
            picture_url = picture.find('img')['content']
            picture_urls.append(picture_url)
        except KeyError:
            continue
    description = html_to_markdown(
        str(soup.find('div', {'id': 'tab-description'})))
    p = Product(
        name,
        cls.__name__,
        category,
        url,
        url,
        sku,
        stock,
        normal_price,
        offer_price,
        'CLP',
        sku=sku,
        description=description,
        picture_urls=picture_urls
    )
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a product from grupocva.com.

    Price and currency ride in the incoming URL's query string; the
    product details come from a POST to the fcDetArticulo endpoint,
    and stock from a separate per-product existence page.
    """
    query_string = urllib.parse.urlparse(url).query
    params = urllib.parse.parse_qs(query_string)
    price = Decimal(params['price'][0])
    currency = params['currency'][0]
    id = params['fProdId'][0]
    product_url = 'https://www.grupocva.com/me_bpm/' \
                  'detalle_articulo/me_articulo.php?fProdId=' + id
    session = session_with_proxy(extra_args)
    session.headers['Content-Type'] = 'application/x-www-form-urlencoded'
    request_payload = 'accion=getArticulo&id=' + id
    response = cls._retrieve_page(
        session,
        'https://www.grupocva.com/me_bpm/detalle_articulo/'
        'fcDetArticulo.php',
        request_payload, extra_args)
    json_data = json.loads(response.text)
    # Name is truncated to fit the downstream 255-char field
    name = json_data['descripcion'][:255]
    sku = json_data['clave']
    key = json_data['idProd']
    part_number = json_data['fabricante']
    description = html_to_markdown(json_data['desT'])
    picture_urls = [
        'https://www.grupocva.com/me_bpm/'
        'detalle_articulo/imagen_art.php?fProd=' + key
    ]
    # NOTE(review): this request bypasses the proxied session and uses
    # requests.get with class-level cookies — presumably intentional;
    # confirm before changing.
    stock_url = 'https://www.grupocva.com/me_bpm/' \
                'existencia/exs_general.php?fPID=' + key
    stock_soup = BeautifulSoup(
        requests.get(stock_url,
                     cookies=cls.SESSION_COOKIES,
                     timeout=30).text, 'html.parser')
    stock = int(
        stock_soup.find('strong', text='Total General').next.next.next.text)
    p = Product(name, cls.__name__, category, product_url, product_url,
                key, stock, price, price, currency, sku=sku,
                part_number=part_number, description=description,
                picture_urls=picture_urls)
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a CLP OpenCart page with Webpay / Transferencia prices.

    Both prices are extracted from the text of the last <h2> on the
    page; the presence of the add-to-cart button implies stock.
    """
    session = session_with_proxy(extra_args)
    session.headers['Accept-Encoding'] = 'deflate'
    response = session.get(url)
    if response.status_code == 500:
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    name = soup.find('title').text.strip()
    sku = soup.find('input', {'name': 'product_id'})['value']
    description = html_to_markdown(
        str(soup.find('div', {'id': 'tab-description'})))
    pictures_container = soup.find('ul', 'thumbnails')
    if pictures_container:
        picture_urls = [tag['href'] for tag in pictures_container.findAll(
            'a', 'thumbnail') if tag['href']]
    else:
        picture_urls = None
    if soup.find('button', {'id': 'button-cart'}):
        stock = -1
    else:
        stock = 0
    # Thousands separators are stripped before the regex matches
    price_text = soup.findAll('h2')[-1].text.replace('.', '')
    normal_price = re.search(r'Webpay: \$(\d+)', price_text)
    normal_price = Decimal(normal_price.groups()[0])
    offer_price = re.search(r'Transferencia: \$(\d+)', price_text)
    offer_price = Decimal(offer_price.groups()[0])
    p = Product(
        name,
        cls.__name__,
        category,
        url,
        url,
        sku,
        stock,
        normal_price,
        offer_price,
        'CLP',
        sku=sku,
        part_number=sku,
        description=description,
        picture_urls=picture_urls
    )
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a VTEX page (USD) whose data is in the skuJson_0 variable.

    Fix: the price was built with ``Decimal(bestPrice / 100)``, which
    performs float division before constructing the Decimal and yields
    an inexact value. The cents are now divided in Decimal arithmetic.
    """
    print(url)
    session = session_with_proxy(extra_args)
    response = session.get(url)
    if response.status_code != 200:
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    scripts = soup.findAll('script')
    product_data = [s for s in scripts if 'var skuJson' in s.text]
    if product_data:
        product_data = product_data[0].text
    else:
        raise Exception('No Data')
    product_json = json.loads(
        re.search(r'var skuJson_0 = ([\S\s]+?);',
                  product_data).groups()[0])
    name = product_json['name']
    sku = str(product_json['skus'][0]['sku'])
    stock = 0
    if product_json['available']:
        stock = -1
    # bestPrice is expressed in cents; divide exactly and add 12% tax
    tax = Decimal('1.12')
    price = Decimal(product_json['skus'][0]['bestPrice']) / 100 * tax
    picture_urls = [
        a['zoom'] for a in soup.findAll('a', {'id': 'botaoZoom'})
    ]
    description = html_to_markdown(
        str(soup.find('div', 'product-description')))
    p = Product(
        name,
        cls.__name__,
        category,
        url,
        url,
        sku,
        stock,
        price,
        price,
        'USD',
        sku=sku,
        picture_urls=picture_urls,
        description=description,
    )
    return [p]