def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a single product page and return it as a list of Product.

    Returns an empty list when the page has no ``_sku`` span or when the
    product is flagged with the ``product-view-price-a-pedido`` marker.
    Otherwise returns a one-element list with the scraped product,
    including its prices, description, pictures, review summary and
    Flixmedia data when present.
    """
    site_root = 'https://www.spdigital.cl'

    session = session_with_proxy(extra_args)
    response = cls._retrieve_page(session, url)
    soup = BeautifulSoup(response.text, 'html.parser')

    part_number_tag = soup.find('span', {'id': '_sku'})
    if not part_number_tag or \
            soup.find('span', 'product-view-price-a-pedido'):
        return []

    part_number = part_number_tag.text.strip()

    # Remove \x9d character for this case
    # https://www.spdigital.cl/products/view/55577
    name = soup.find('h1').text.strip().replace('\x9d', '')
    # The SKU is the last non-empty segment of the URL path.
    sku = [segment for segment in url.split('/') if segment][-1]

    stock_box = soup.find('div', 'product-view-stock')
    if soup.find('a', 'stock-amount-cero') or not stock_box:
        stock = 0
    else:
        stock_text = stock_box.find('span').text
        if 'preventa' in stock_text.lower():
            stock = -1
        else:
            # Any text before the number is treated as an "overflow"
            # marker, in which case the exact amount is not used.
            overflow, amount = re.match(
                r'(.*?)(\d+) UNIDADES', stock_text).groups()
            stock = -1 if overflow else int(amount)

    price_tags = soup.findAll('span', 'product-view-cash-price-value')
    offer_price = Decimal(remove_words(price_tags[0].text))
    normal_price = Decimal(remove_words(price_tags[1].text))
    # The offer price is never allowed to exceed the normal price.
    offer_price = min(offer_price, normal_price)

    description_tabs = (
        soup.find('div', 'product-description-tab'),
        soup.find('div', {'data-tab': 'specifications'}),
    )
    description = ''.join(
        html_to_markdown(str(tab), 'https://www.spdigital.cl') + '\n\n'
        for tab in description_tabs if tab)

    picture_urls = []
    for anchor in soup.findAll('a', {'rel': 'lightbox'}):
        src = anchor.find('img')['src'].replace(' ', '%20')
        # Relative paths are completed with the site root.
        picture_urls.append(src if 'http' in src else site_root + src)

    reviews_url = 'https://d1le22hyhj2ui8.cloudfront.net/onpage/' \
                  'spdigital.cl/reviews.js?url_key={}'.format(sku)
    review_data = json.loads(session.get(reviews_url).text)

    review_count = None
    review_avg_score = None
    if 'user_review_count' in review_data:
        review_count = review_data['user_review_count']
        if review_count:
            # The reported score is halved before being stored.
            review_avg_score = review_data['score'] / 2

    flixmedia_id = None
    video_urls = None
    flixmedia_tag = soup.find(
        'script', {'src': '//media.flixfacts.com/js/loader.js'})
    if flixmedia_tag:
        try:
            flixmedia_id = flixmedia_tag['data-flix-mpn']
            video_urls = flixmedia_video_urls(flixmedia_id)
        except KeyError:
            pass

    product = Product(
        name, cls.__name__, category, url, url, sku, stock, normal_price,
        offer_price, 'CLP', sku=sku, part_number=part_number,
        description=description, picture_urls=picture_urls,
        review_count=review_count, review_avg_score=review_avg_score,
        flixmedia_id=flixmedia_id, video_urls=video_urls)
    return [product]
def _products_for_url(cls, url, category=None, extra_args=None, retries=9):
    """Scrape a product page driven by its ``__PRELOADED_STATE__`` JSON.

    The page is re-fetched (up to ``retries`` more times) when the embedded
    state cannot be found. Returns an empty list for error pages, when the
    state is still missing after all retries, or when the product has
    neither an ``offerPrice`` nor a ``listPrice``. Otherwise returns a
    one-element list with the scraped Product.
    """
    refurbished = 'https://schema.org/RefurbishedCondition'

    session = get_cf_session(extra_args)
    page_source = session.get(url).text
    soup = BeautifulSoup(page_source, 'html.parser')

    if soup.find('div', 'error-page'):
        return []

    state_match = re.search(
        r'window.__PRELOADED_STATE__ = (.+);', page_source)
    if not state_match:
        if not retries:
            return []
        return cls._products_for_url(
            url, category, extra_args, retries=retries - 1)

    product_json = json.loads(state_match.group(1))
    specs_json = product_json['product']['product']

    sku = specs_json['partNumber']
    # Non-ASCII characters are dropped from the name.
    name = specs_json['name'].encode('ascii', 'ignore').decode('ascii')
    short_description = specs_json.get('shortDescription', '')

    # If it's a cell sold by Ripley directly (not Mercado Ripley) add the
    # "Prepago" information in its description
    if category in ['Cell', 'Unknown'] and 'MPM' not in sku:
        name += ' ({})'.format(short_description)

    stock = 0 if (specs_json['isOutOfStock'] or
                  specs_json['isUnavailable']) else -1

    prices = specs_json['prices']
    # 'offerPrice' takes precedence over 'listPrice' as the normal price.
    for price_key in ('offerPrice', 'listPrice'):
        if price_key in prices:
            normal_price = Decimal(prices[price_key])
            break
    else:
        return []

    offer_price = Decimal(prices.get('cardPrice', normal_price))
    offer_price = min(offer_price, normal_price)

    description_parts = []
    refurbished_notice = soup.find('div', 'emblemaReaccondicionados19')
    if refurbished_notice:
        description_parts.append(html_to_markdown(str(refurbished_notice)))
    if 'longDescription' in specs_json:
        description_parts.append(
            html_to_markdown(specs_json['longDescription']))

    # Attributes are rendered as a markdown table.
    description_parts.append('\n\nAtributo | Valor\n-- | --\n')
    for attribute in specs_json['attributes']:
        if 'name' in attribute and 'value' in attribute:
            description_parts.append('{} | {}\n'.format(
                attribute['name'], attribute['value']))
    description_parts.append('\n\n')
    description = ''.join(description_parts)

    condition = 'https://schema.org/NewCondition'
    if any('reacondicionado' in text.lower()
           for text in (description, name, short_description)):
        condition = refurbished
    if soup.find('img', {'src': '//home.ripley.cl/promo-badges/'
                                'reacondicionado.png'}):
        condition = refurbished

    picture_urls = []
    for image_path in specs_json['images']:
        if 'file://' in image_path:
            continue
        if not image_path.startswith('http'):
            image_path = 'https:' + image_path
        picture_urls.append(image_path)
    picture_urls = picture_urls or None

    flixmedia_id = None
    video_urls = []
    loader_urls = (
        '//media.flixfacts.com/js/loader.js',
        'https://media.flixfacts.com/js/loader.js',
    )
    for loader_url in loader_urls:
        loader_tag = soup.find('script', {'src': loader_url})
        if loader_tag and loader_tag.has_attr('data-flix-mpn'):
            flixmedia_id = loader_tag['data-flix-mpn']
            video_urls = flixmedia_video_urls(flixmedia_id)
            break

    power_review = specs_json['powerReview']
    review_count = int(power_review['fullReviews'])
    review_avg_score = float(power_review['averageRatingDecimal']) \
        if review_count else None

    has_virtual_assistant = any(keyword in url for keyword in keywords)

    marketplace = specs_json['marketplace']
    if 'shopName' in marketplace:
        seller = marketplace['shopName']
    elif specs_json['isMarketplaceProduct']:
        seller = 'Mercado R'
    else:
        seller = None

    product = Product(
        name, cls.__name__, category, url, url, sku, stock, normal_price,
        offer_price, 'CLP', sku=sku, description=description,
        picture_urls=picture_urls, condition=condition,
        flixmedia_id=flixmedia_id, review_count=review_count,
        review_avg_score=review_avg_score, video_urls=video_urls,
        has_virtual_assistant=has_virtual_assistant, seller=seller)
    return [product]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a single product page and return it as a list of Product.

    Returns an empty list when the request ends on a different URL than
    the one requested. Raises KeyError when the page shows a condition
    label that is not in the known label table.
    """
    new = 'https://schema.org/NewCondition'
    refurbished = 'https://schema.org/RefurbishedCondition'
    used = 'https://schema.org/UsedCondition'

    session = session_with_proxy(extra_args)
    response = session.get(url)

    if response.url != url:
        return []

    soup = BeautifulSoup(response.text, 'html5lib')

    name = soup.find('h1', {'itemprop': 'name'}).text.strip()
    # The part number doubles as the SKU and store key.
    part_number = soup.find('span', 'sku').text.strip()
    sku = part_number

    condition_label = soup.find(
        'span', {'itemprop': 'itemCondition'}).text.strip()
    conditions_by_label = {
        'NUEVO': new,
        'REEMBALADO': refurbished,
        'REACONDICIONADO': refurbished,
        'SEMI-NUEVO': refurbished,
        'USADO': used,
        'DE SHOW ROOM': refurbished,
    }
    condition = conditions_by_label[condition_label]

    if soup.find('div', 'sinstock'):
        # Out-of-stock pages only expose a single price in a meta tag.
        stock = 0
        price_meta = soup.find('meta', {'property': 'product:price:amount'})
        normal_price = Decimal(remove_words(price_meta['content']))
        offer_price = normal_price
    else:
        stock = int(soup.find('p', {'itemprop': 'offerCount'}).text)
        low_price_tag = soup.find('h2', {'itemprop': 'lowPrice'})
        offer_price = Decimal(remove_words(low_price_tag.string))
        high_price_tag = soup.find('h3', {'itemprop': 'highPrice'})
        normal_price = Decimal(remove_words(high_price_tag.string))

    description = html_to_markdown(str(soup.find('div', 'info')))

    picture_urls = []
    for image_tag in soup.findAll('img', {'itemprop': 'image'}):
        picture_urls.append(
            'https://www.winpy.cl' + urllib.parse.quote(image_tag['src']))

    flixmedia_id = None
    video_urls = None
    flixmedia_tag = soup.find(
        'script', {'src': '//media.flixfacts.com/js/loader.js'})
    if flixmedia_tag:
        try:
            flixmedia_id = flixmedia_tag['data-flix-mpn']
            video_urls = flixmedia_video_urls(flixmedia_id)
        except KeyError:
            pass

    product = Product(
        name, cls.__name__, category, url, url, sku, stock, normal_price,
        offer_price, 'CLP', sku=sku, part_number=part_number,
        condition=condition, description=description,
        picture_urls=picture_urls, flixmedia_id=flixmedia_id,
        video_urls=video_urls)
    return [product]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a single product page and return it as a list of Product.

    Returns an empty list when the page is an error page, has no product
    name, has fewer than two ``detailprecioBig`` price containers, or
    when the normal price container holds no usable text.
    """
    site_root = 'https://www.abcdin.cl'

    session = session_with_proxy(extra_args)
    page_content = session.get(url).text
    soup = BeautifulSoup(page_content, 'html.parser')

    if soup.find('div', {'id': 'errorPage'}):
        return []

    try:
        name = soup.find('span', {'itemprop': 'name'}).text.strip()
    except AttributeError:
        return []

    # Every occurrence of the raw name in the markup is replaced by its
    # URL-quoted form before the page is parsed again.
    page_content = page_content.replace(name, urllib.parse.quote(name))
    soup = BeautifulSoup(page_content, 'html.parser')

    prices_containers = soup.findAll('div', 'detailprecioBig')
    if len(prices_containers) < 2:
        return []

    stock = -1 if soup.findAll('div', {'id': 'productPageAdd2Cart'}) else 0

    normal_price_text = remove_words(prices_containers[1].text)
    if not normal_price_text.strip():
        return []
    normal_price = Decimal(normal_price_text)

    # A third price container, when present, holds the offer price.
    if len(prices_containers) >= 3:
        offer_price = Decimal(remove_words(prices_containers[2].text))
    else:
        offer_price = normal_price
    offer_price = min(offer_price, normal_price)

    sku = soup.find('meta', {'name': 'pageIdentifier'})['content']

    description_tag = soup.find(
        'p', attrs={'id': re.compile(r'product_longdescription_.*')})
    description = html_to_markdown(
        str(description_tag), baseurl='https://www.abcdin.cl')

    pictures_data = json.loads(soup.find('div', 'jsonProduct').text)
    pictures_dict = pictures_data[0]['Attributes']

    if 'ItemAngleFullImage' in pictures_dict:
        def angle_index(pair):
            # Keys look like 'image_<n>'; order pictures by <n>.
            return int(pair[0].replace('image_', ''))

        picture_urls = []
        angle_items = pictures_dict['ItemAngleFullImage'].items()
        for _key, path in sorted(angle_items, key=angle_index):
            picture_urls.append(site_root + path.replace(' ', ''))
    else:
        main_image = soup.find('img', {'id': 'productMainImage'})
        picture_urls = [site_root + main_image['src']]

    flixmedia_id = None
    video_urls = None
    flixmedia_tag = soup.find(
        'script', {'src': '//media.flixfacts.com/js/loader.js'})
    if flixmedia_tag:
        try:
            flixmedia_id = flixmedia_tag['data-flix-mpn']
            video_urls = flixmedia_video_urls(flixmedia_id)
        except KeyError:
            pass

    condition = 'https://schema.org/RefurbishedCondition' \
        if 'reacondicionado' in name.lower() \
        else 'https://schema.org/NewCondition'

    has_virtual_assistant = \
        'cdn.livechatinc.com/tracking.js' in page_content

    return [Product(
        name, cls.__name__, category, url, url, sku, stock, normal_price,
        offer_price, 'CLP', sku=sku, description=description,
        picture_urls=picture_urls, video_urls=video_urls,
        flixmedia_id=flixmedia_id, condition=condition,
        has_virtual_assistant=has_virtual_assistant)]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a product page and return one Product per SKU it lists.

    The per-SKU data comes from the ``skuJson_0`` JavaScript variable
    embedded in the page. The description, pictures, card offer price and
    Flixmedia data are read once from the page and shared by every SKU.

    Returns an empty list when the page shows the ``title-not-found``
    marker; otherwise a list with one Product per entry in ``skus``.
    """
    session = session_with_proxy(extra_args)
    page_source = session.get(url).text
    soup = BeautifulSoup(page_source, 'html.parser')

    if soup.find('p', 'title-not-found'):
        return []

    description_text = re.search(
        r'<div class="row detalles-producto">[\S\s]*'
        r'<div class="row recomendados-productos">', page_source)
    if description_text:
        description = html_to_markdown(description_text.group())
    else:
        description = ''

    sku = soup.find('div', 'skuReference').text.strip()

    # Pictures
    picture_urls = []
    for link in soup.findAll('a', {'id': 'botaoZoom'}):
        # Fall back to the first 'rel' value when 'zoom' is empty.
        picture_url = link['zoom']
        if not picture_url:
            picture_url = link['rel'][0]
        picture_urls.append(picture_url)

    # Offer price
    offer_price = None
    corona_price_container = soup.find('td', 'Oferta')
    if corona_price_container:
        offer_price_text = corona_price_container.string.split(
            '$')[-1].split('Con')[0]
        try:
            offer_price = Decimal(remove_words(offer_price_text))
        except InvalidOperation:
            # Unparseable offer text: carry on without a card price.
            pass

    flixmedia_id = None
    video_urls = None
    flixmedia_tag = soup.find(
        'script', {'src': '//media.flixfacts.com/js/loader.js'})
    # A loader tag without the MPN attribute is ignored instead of
    # raising KeyError.
    if flixmedia_tag and flixmedia_tag.has_attr('data-flix-mpn'):
        mpn = flixmedia_tag['data-flix-mpn'].strip()
        video_urls = flixmedia_video_urls(mpn)
        if video_urls is not None:
            flixmedia_id = mpn

    # SKUS pricing
    skus_data = json.loads(
        re.search(r'var skuJson_0 = ([\S\s]+?);', page_source).group(1))

    products = []
    for sku_data in skus_data['skus']:
        name = sku_data['skuname']
        key = str(sku_data['sku'])

        stock = sku_data['availablequantity']
        # 99999 is mapped to -1 rather than kept as a literal quantity.
        if stock == 99999:
            stock = -1

        # 'bestPrice' is divided by 100 to obtain the price. The division
        # is done in Decimal so no binary floating-point rounding error is
        # baked into the result.
        normal_price = Decimal(sku_data['bestPrice']) / 100

        if offer_price and offer_price < normal_price:
            sku_offer_price = offer_price
        else:
            sku_offer_price = normal_price

        products.append(
            Product(name, cls.__name__, category, url, url, key, stock,
                    normal_price, sku_offer_price, 'CLP', sku=sku,
                    description=description, picture_urls=picture_urls,
                    video_urls=video_urls, flixmedia_id=flixmedia_id))

    return products
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a product page, expanding its variations when it has any.

    Returns an empty list when the response is not OK. When the page has
    links inside a ``swatch-wrapper`` block, each one is fetched as JSON
    and one Product is produced per selectable "Compañía" value and
    selected "Color" value. Otherwise a one-element list with the base
    product is returned.
    """
    def parse_price(price_row):
        # Prices are shown like '$1.234.567'; strip symbol and separators.
        raw = price_row.find('span', 'price-value').text.strip()
        return Decimal(raw.replace('$', '').replace('.', ''))

    session = session_with_proxy(extra_args)
    response = session.get(url)

    if not response.ok:
        return []

    page_source = response.text
    soup = BeautifulSoup(page_source, 'html.parser')

    name = soup.find('div', 'product-name').text.strip()
    sku = soup.find('span', 'sku-code-value').text.strip()

    prices = soup.find('div', 'prices')
    la_polar_card = prices.find('p', 'js-tlp-price')
    highlighted_price = parse_price(prices.find('p', 'la-polar'))

    if la_polar_card:
        # With a card price, the highlighted value is the offer and the
        # 'internet' row holds the normal price.
        offer_price = highlighted_price
        normal_price = parse_price(prices.find('p', 'internet'))
    else:
        offer_price = normal_price = highlighted_price

    stock = -1

    description = html_to_markdown(
        str(soup.find('div', 'description-wrapper')))

    picture_urls = []
    for image_box in soup.findAll('div', 'primary-image'):
        picture_urls.append(
            image_box.find('img')['src'].replace(' ', '%20'))

    condition = 'https://schema.org/RefurbishedCondition' \
        if 'reacondicionado' in name.lower() \
        else 'https://schema.org/NewCondition'

    flixmedia_id = None
    video_urls = None

    if 'LG' in name and \
            '//media.flixfacts.com/js/loader.js' in page_source:
        details_tab = soup.find('div', 'details-tab')
        for label in details_tab.findAll('div', 'attr-label'):
            if label.text.strip() != 'Modelo:':
                continue
            # The model shown in the details tab is used as Flixmedia id.
            model = label.parent.find('div', 'attr-value').text.strip()
            video_urls = flixmedia_video_urls(model)
            if video_urls is not None:
                flixmedia_id = model
            break

    variation_container = soup.find('div', 'swatch-wrapper')
    variations = variation_container.findAll('a') \
        if variation_container else []

    if not variations:
        return [Product(
            name, cls.__name__, category, url, url, sku, stock,
            normal_price, offer_price, 'CLP', sku=sku,
            description=description, picture_urls=picture_urls,
            condition=condition, flixmedia_id=flixmedia_id,
            video_urls=video_urls)]

    products = []
    for variation in variations:
        variation_data = json.loads(session.get(variation['href']).text)

        for attribute in variation_data["product"]["variationAttributes"]:
            if attribute["displayName"] != "Compañía":
                continue

            for company in attribute["values"]:
                if not company["selectable"]:
                    continue

                sv_data = json.loads(session.get(company["url"]).text)

                for sva in sv_data["product"]["variationAttributes"]:
                    if sva["displayName"] != "Color":
                        continue

                    for color in sva["values"]:
                        if not color["selected"]:
                            continue

                        v_name = "{} {} ({})".format(
                            name, company["displayValue"],
                            color["displayValue"])
                        v_sku = "{}-{}".format(
                            sku, sv_data["product"]["selectedVariantID"])
                        large_images = \
                            sv_data["product"]["images"]["large"]
                        v_picture_urls = [
                            image["url"] for image in large_images]

                        products.append(Product(
                            v_name, cls.__name__, category, url, url,
                            v_sku, stock, normal_price, offer_price,
                            'CLP', sku=v_sku, description=description,
                            picture_urls=v_picture_urls,
                            condition=condition))

    return products
def _get_product(cls, url, category, extra_args):
    """Scrape a single product page and return it as a list of Product.

    Returns an empty list when the response status is 404 or 410, when
    the page has no product name heading, or when the default price reads
    'N/A'. Review count and average are computed from a second request to
    the Bazaarvoice batch endpoint for the product id.
    """
    print(url)
    session = session_with_proxy(extra_args)
    response = session.get(url)

    if response.status_code in [410, 404]:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    name = soup.find('h1', {'itemprop': 'name'})

    if not name:
        return []

    name = name.text.strip()
    sku = soup.find('div', 'pdp-main')['data-pid'].strip()
    offer_price_container = soup.find('div', 'cencosud-price-2')

    # The product is out of stock when the page says so explicitly or
    # shows this specific promotional image.
    if soup.find('div', 'out-of-stock') or \
            soup.find('img', {'src': '/on/demandware.static/-/Sites/es_CL/'
                                     'dwdbab8891/marketing/home/promotext/'
                                     'promotext-plp-event3-SF.png'}):
        stock = 0
    else:
        stock = -1

    if offer_price_container:
        offer_price = Decimal(
            remove_words(offer_price_container.contents[0]))
        # The normal price is the text between the first '$' and the
        # following line break of the 'price-internet' block.
        normal_price = Decimal(
            remove_words(
                soup.find(
                    'div',
                    'price-internet').text.split('$')[1].split('\n')[0]))
    else:
        price_text = soup.find('div', 'default-price').contents[0].strip()
        print(price_text)
        if price_text == 'N/A':
            return []
        normal_price = Decimal(remove_words(price_text))
        offer_price = normal_price

    picture_urls = []
    for tag in soup.findAll('img', 'pdpMod_Mobile_GalleryImage'):
        picture_url = tag['src'].split('?')[0]
        # Gallery entries that point to videos are not pictures.
        if '.webm' in picture_url:
            continue
        picture_urls.append(
            picture_url.replace(' ', '%20').replace('href=', ''))

    video_urls = []
    for iframe in soup.findAll('iframe'):
        # Not every iframe carries a 'src' attribute; treat a missing
        # one as "no match" instead of raising KeyError.
        match = re.match('https://www.youtube.com/embed/(.+)',
                         iframe.get('src') or '')
        if match:
            video_urls.append('https://www.youtube.com/watch?v={}'.format(
                match.groups()[0]))

    flixmedia_id = None
    flixmedia_tag = soup.find(
        'script', {'src': '//media.flixfacts.com/js/loader.js'})
    # A loader tag without the MPN attribute is ignored instead of
    # raising KeyError.
    if flixmedia_tag and flixmedia_tag.has_attr('data-flix-mpn'):
        mpn = flixmedia_tag['data-flix-mpn'].strip()
        flix_videos = flixmedia_video_urls(mpn)
        if flix_videos is not None:
            video_urls.extend(flix_videos)
            flixmedia_id = mpn

    description = html_to_markdown(
        str(soup.find('div', {'id': 'collapseDetails'})))

    reviews_endpoint = 'https://api.bazaarvoice.com/data/batch.json?pass' \
                       'key=caKNy0lDYfGnjpRhD27b7ZtxiSbxdwBcuuIEwXCyc9Zr' \
                       'M&apiversion=5.5&resource.q0=reviews&filter.q0=p' \
                       'roductid%3Aeq%3A{}&limit.q0=100'.format(sku)
    review_data = json.loads(session.get(reviews_endpoint).text)
    reviews = review_data['BatchedResults']['q0']['Results']
    review_count = len(reviews)

    if review_count:
        review_avg_score = \
            sum(review['Rating'] for review in reviews) / review_count
    else:
        review_avg_score = None

    seller_container = soup.find('b', 'sellerMkp')
    seller = seller_container.text.strip() if seller_container else None

    p = Product(name, cls.__name__, category, url, url, sku, stock,
                normal_price, offer_price, 'CLP', sku=sku,
                description=description, picture_urls=picture_urls,
                video_urls=video_urls, flixmedia_id=flixmedia_id,
                review_count=review_count,
                review_avg_score=review_avg_score, seller=seller)
    return [p]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a single product page and return it as a list of Product.

    Returns an empty list when the response status is 404 or 410, when
    the page is an error/404 page, or when no price at all can be found.
    Raises Exception when the embedded availability value is neither the
    schema.org InStock nor OutOfStock URL.
    """
    print(url)
    session = session_with_proxy(extra_args)
    response = session.get(url, timeout=60)

    if response.status_code in [404, 410]:
        return []

    page_source = response.text
    soup = BeautifulSoup(page_source, 'html.parser')

    if soup.find('section', 'error-page'):
        return []

    if soup.find('img', {'src': '/public/statics/images/404.svg'}):
        return []

    name = soup.find('h1', 'product-name').text
    sku = soup.find('span', 'product-id').text

    availability_match = re.search(r'"availability":"(.+)"', page_source)
    availability_text = availability_match.groups()[0]

    if availability_text == 'http://schema.org/OutOfStock':
        stock = 0
    elif availability_text == 'http://schema.org/InStock':
        stock = -1
    else:
        raise Exception(
            'Invalid availability text: {}'.format(availability_text))

    prices = soup.find('div', 'prices')

    offer_price_container = prices.find('span', 'hites-price')
    offer_price = None
    if offer_price_container:
        offer_price = Decimal(offer_price_container.text.strip()
                              .replace('$', '').replace('.', ''))

    # Prefer the 'sales' price and fall back to the 'list' price.
    normal_price_container = prices.find('span', 'sales')
    if not normal_price_container:
        normal_price_container = prices.find('span', 'list')

    if normal_price_container:
        normal_price = Decimal(
            normal_price_container.find('span', 'value')['content'])
    elif offer_price_container:
        # Only the card price is published: use it as the normal price
        # too, instead of dereferencing the missing container.
        normal_price = offer_price
    else:
        return []

    if not offer_price:
        offer_price = normal_price

    has_virtual_assistant = \
        'cdn.livechatinc.com/tracking.js' in page_source

    flixmedia_container = soup.find(
        'script', {'src': '//media.flixfacts.com/js/loader.js'})
    flixmedia_id = None
    video_urls = None

    # A loader tag without the MPN attribute is ignored instead of
    # raising KeyError.
    if flixmedia_container and \
            flixmedia_container.has_attr('data-flix-mpn'):
        mpn = flixmedia_container['data-flix-mpn']
        video_urls = flixmedia_video_urls(mpn)
        if video_urls is not None:
            flixmedia_id = mpn

    if 'reacondicionado' in name.lower():
        condition = 'https://schema.org/RefurbishedCondition'
    else:
        condition = 'https://schema.org/NewCondition'

    images = soup.find('div', 'primary-images')\
        .findAll('div', 'carousel-item')
    picture_urls = [i.find('img')['src'] for i in images]

    p = Product(
        name,
        cls.__name__,
        category,
        url,
        url,
        sku,
        stock,
        normal_price,
        offer_price,
        'CLP',
        sku=sku,
        condition=condition,
        picture_urls=picture_urls,
        video_urls=video_urls,
        has_virtual_assistant=has_virtual_assistant,
        flixmedia_id=flixmedia_id
    )

    return [p]