Beispiel #1
0
    def discover_urls_for_keyword(cls, keyword, threshold, extra_args=None):
        """Collect product URLs from the Ripley search results of a keyword.

        Result pages are requested one after another (redirects are not
        followed). Every ``a.catalog-product-item`` link found inside the
        ``div.catalog-container`` element becomes an absolute product URL.

        :param keyword: search term, interpolated as-is into the search URL.
        :param threshold: number of collected URLs at which the method
            returns immediately.
        :param extra_args: passed through to ``get_cf_session``.
        :return: list of absolute product URLs in the order they were found.
        :raises Exception: when a result page does not answer with HTTP 200,
            or when 40 pages were processed without reaching a page that
            lacks the results container ('Page overflow').
        """
        site_root = 'https://simple.ripley.cl'
        search_template = site_root + '/search/{}?page={}'
        http = get_cf_session(extra_args)
        found_urls = []

        # At most 40 pages are visited. The ``else`` clause of the loop only
        # runs when none of those pages triggered the ``break`` below.
        for page_number in range(1, 41):
            reply = http.get(search_template.format(keyword, page_number),
                             allow_redirects=False)

            if reply.status_code != 200:
                raise Exception('Invalid search: ' + keyword)

            results = BeautifulSoup(reply.text, 'html.parser').find(
                'div', 'catalog-container')

            # A page without the results container ends the pagination
            if not results:
                break

            for anchor in results.findAll('a', 'catalog-product-item'):
                found_urls.append(site_root + anchor['href'])
                if len(found_urls) == threshold:
                    return found_urls
        else:
            raise Exception('Page overflow')

        return found_urls
Beispiel #2
0
    def discover_entries_for_category(cls, category, extra_args=None):
        """Discover the product URLs of a category and where they are listed.

        Every section in ``category_paths`` whose category list contains
        ``category`` is paginated (redirects are not followed). Each product
        link that passes ``cls.filter_url`` is recorded together with the
        section it was found in and its position inside that section.

        :param category: category name matched against the second item of
            each ``category_paths`` entry.
        :param extra_args: passed through to ``get_cf_session``.
        :return: ``defaultdict`` mapping each product URL to a list of dicts
            with the keys 'category_weight', 'section_name' and 'value'
            (the 1-based position of the link within the section; links
            rejected by ``cls.filter_url`` still consume a position).
        :raises Exception: when more than 100 pages would be requested for a
            section, when the first page of a section does not answer with
            HTTP 200 or has no ``div.catalog-container``, or when a
            container holds no product links.
        """
        url_base = 'https://simple.ripley.cl/{}?page={}'

        # Each entry: [url path, matching categories, section name, weight].
        # Commented-out entries are sections that are currently not crawled.
        category_paths = [
            [
                'tecno/computacion/notebooks', ['Notebook'],
                'Tecno > Computación > Notebooks', 1
            ],
            # ['tecno/computacion/2-en-1convertibles', ['Notebook'],
            #  'Tecno > Computación > 2 en 1/Convertibles', 1],
            [
                'tecno/computacion/notebooks-gamer', ['Notebook'],
                'Tecno > Computación > Notebooks gamer', 1
            ],
            [
                'tecno/computacion/tablets-y-e-readers', ['Tablet'],
                'Tecno > Computación > Tablets y E-readers', 1
            ],
            [
                'tecno/impresoras-y-tintas', ['Printer'],
                'Tecno > Computación > Impresoras y Tintas', 1
            ],
            # NOTE(review): other category tables for this store spell the
            # second category 'ExternalStorageDrive' - confirm which name
            # is the valid one before relying on this entry.
            [
                'tecno/computacion/almacenamiento',
                ['UsbFlashDrive', 'ExternalStorageDevice'],
                'Tecno > Computación > Almacenamiento', 0.5
            ],
            [
                'tecno/computacion/pc-all-in-one', ['AllInOne'],
                'Tecno > Computación > PC/All in one', 1
            ],
            [
                'tecno/computacion/proyectores-y-monitores',
                ['Monitor', 'Projector'],
                'Tecno > Computación > Proyectores y monitores', 0.5
            ],
            # ['computacion/computadores/notebooks-gamers', 'Notebook'],
            # ['mercado-ripley/gamer', 'Mouse'],
            ['tecno/television', ['Television'], 'Tecno > Televisión', 1],
            [
                'tecno/television/smart-tv', ['Television'],
                'Tecno > Televisión > Smart TV', 1
            ],
            # ['tecno/television/4k-uhd-nanocell', ['Television'],
            #  'Tecno > Televisión > 4K - UHD - NANOCELL', 1],
            # ['tecno/television/premium-oled-qled-8k', ['Television'],
            #  'Tecno > Televisión > PREMIUM - OLED - QLED - 8K', 1],
            # ['tecno/television/hd-full-hd', ['Television'],
            #  'Tecno > Televisión > HD - FULL HD', 1],
            [
                'electro/refrigeracion', ['Refrigerator'],
                'Electro > Refrigeración', 1
            ],
            [
                'electro/refrigeracion/side-by-side', ['Refrigerator'],
                'Electro > Refrigeración > Side by Side', 1
            ],
            [
                'electro/refrigeracion/refrigeradores', ['Refrigerator'],
                'Electro > Refrigeración > Refrigeradores', 1
            ],
            [
                'electro/refrigeracion/freezers-y-congeladores',
                ['Refrigerator'],
                'Electro > Refrigeración > Freezers y congeladores', 1
            ],
            [
                'electro/refrigeracion/frigobar', ['Refrigerator'],
                'Electro > Refrigeración > Frigobar', 1
            ],
            [
                'electro/refrigeracion/door-in-door', ['Refrigerator'],
                'Electro > Refrigeración > Door in Door', 1
            ],
            [
                'electro/cocina/cocinas', ['Stove'],
                'Electro > Cocina > Cocinas', 1
            ],
            [
                'electro/cocina/microondas', ['Oven'],
                'Electro > Cocina > Microondas', 1
            ],
            [
                'electro/cocina/hornos-y-microondas', ['Oven'],
                'Electro > Cocina > Hornos y Microondas', 1
            ],
            # ['electro/cocina/hornos', ['Oven'],
            #  'Electro > Cocina > Hornos', 1],
            [
                'electro/cocina/lavavajillas', ['DishWasher'],
                'Electro > Cocina > Lavavajillas', 1
            ],
            [
                'electro/aseo/aspiradoras-y-enceradoras', ['VacuumCleaner'],
                'Electro > Aseo > Aspiradoras y enceradoras', 1
            ],
            [
                'electro/lavanderia', ['WashingMachine'],
                'Electro > Lavandería', 1
            ],
            [
                'electro/lavanderia/lavadoras', ['WashingMachine'],
                'Electro > Lavandería > Lavadoras', 1
            ],
            [
                'electro/lavanderia/secadoras', ['WashingMachine'],
                'Electro > Lavandería > Secadoras', 1
            ],
            [
                'electro/lavanderia/lavadora-secadora', ['WashingMachine'],
                'Electro > Lavandería > Lavadora-secadora', 1
            ],
            [
                'electro/lavanderia/doble-carga', ['WashingMachine'],
                'Electro > Lavandería > Doble carga', 1
            ],

            # ['tecno/telefonia', ['Cell'],
            #  'Tecno > Telefonía', 1],
            [
                'tecno/telefonia/android', ['Cell'],
                'Tecno > Telefonía > Android', 1
            ],
            [
                'tecno/telefonia/iphone', ['Cell'],
                'Tecno > Telefonía > iPhone', 1
            ],
            [
                'tecno/telefonia/basicos', ['Cell'],
                'Tecno > Telefonía > Básicos', 1
            ],
            [
                'tecno/fotografia-y-video/camaras-reflex', ['Camera'],
                'Tecno > Fotografía y Video > Camaras reflex', 1
            ],
            [
                'tecno/fotografia-y-video/semi-profesionales', ['Camera'],
                'Tecno > Fotografía y Video > Semi profesionales', 1
            ],
            # ['entretenimiento/fotografia/camaras-compactas', 'Camera'],
            [
                'tecno/audio-y-musica/equipos-de-musica', ['StereoSystem'],
                'Tecno > Audio y Música > Equipos de música', 1
            ],
            [
                'tecno/audio-y-musica/parlantes-portables', ['StereoSystem'],
                'Tecno > Audio y Música > Parlantes Portables', 1
            ],
            [
                'tecno/audio-y-musica/soundbar-y-home-theater',
                ['StereoSystem'],
                'Tecno > Audio y Música > Soundbar y Home theater', 1
            ],
            # ['tecno/audio-y-musica/hi-fi', 'StereoSystem'],
            # ['tecno/audio-y-musica/parlantes-y-subwoofer', 'StereoSystem'],
            # ['tecno/audio-y-musica/microcomponentes', 'StereoSystem'],
            # ['tecno/audio-y-musica/home-cinema', 'StereoSystem'],
            [
                'tecno/television/bluray-dvd-y-tv-portatil',
                ['OpticalDiskPlayer'],
                'Tecno > Televisión > Bluray -DVD y TV Portátil', 1
            ],

            # ['telefonia/accesorios-telefonia/4kmemorias', 'MemoryCard'],
            [
                'tecno/mundo-gamer/consolas', ['VideoGameConsole'],
                'Tecno > Mundo Gamer > Consolas', 1
            ],
            [
                'electro/climatizacion/aire-acondicionado', ['AirConditioner'],
                'Electro > Climatización > Ventiladores y aire acondicionado',
                1
            ],
            [
                'electro/climatizacion/purificadores-y-humificadores',
                ['AirConditioner'],
                'Electro > Climatización > Purificadores y humidificadores', 1
            ],
            [
                'electro/climatizacion/estufas-y-calefactores',
                ['SpaceHeater'],
                'Electro > Climatización > Estufas y calefactores', 1
            ],
            [
                'tecno/corner-smartwatch', ['Wearable'],
                'Tecno > Telefonía > Smartwatches y Wearables', 1
            ],
            [
                'tecno/especial-audifonos', ['Headphones'],
                'Tecno > Audio y Música > Audífonos', 1
            ],
            # ['telefonia/smartwatches-and-wearables/smartwatch', 'Wearable'],
        ]

        session = get_cf_session(extra_args)
        # ``list`` as factory (instead of a lambda) keeps the result picklable
        product_entries = defaultdict(list)

        for entry in category_paths:
            category_path, local_categories, section_name, category_weight = \
                entry

            if category not in local_categories:
                continue

            page = 1
            # Position of the next link within the whole section; it keeps
            # counting across pages.
            current_position = 1

            while True:
                if page > 100:
                    raise Exception('Page overflow')

                category_url = url_base.format(category_path, page)
                print(category_url)
                response = session.get(category_url, allow_redirects=False)

                # Only the first page is required to answer with HTTP 200;
                # later pages fall through to the container check below.
                if response.status_code != 200 and page == 1:
                    raise Exception('Invalid section: ' + category_url)

                soup = BeautifulSoup(response.text, 'html.parser')
                product_link_container = soup.find('div', 'catalog-container')

                if not product_link_container:
                    if page == 1:
                        raise Exception('Empty category path: {} - {}'.format(
                            category, category_path))
                    # A later page without container ends the pagination
                    break

                product_link_containers = product_link_container.findAll(
                    'a', 'catalog-product-item')

                # Fall back to the alternative markup for product links
                if not product_link_containers:
                    product_link_containers = product_link_container.findAll(
                        'a', 'ProductItem__Name')

                if not product_link_containers:
                    raise Exception('Category error: ' + category_path)

                for link_tag in product_link_containers:
                    product_url = 'https://simple.ripley.cl' + link_tag['href']
                    if cls.filter_url(product_url):
                        product_entries[product_url].append({
                            'category_weight': category_weight,
                            'section_name': section_name,
                            'value': current_position
                        })
                    # Filtered-out links still occupy a position
                    current_position += 1

                page += 1

        return product_entries
Beispiel #3
0
    def _products_for_url(cls, url, category=None, extra_args=None, retries=9):
        """Scrape one Ripley product page and return it as a product list.

        The page is downloaded with a session from ``get_cf_session`` and the
        product data is read from the JSON assigned to
        ``window.__PRELOADED_STATE__`` in the page source. When that JSON
        cannot be located the method calls itself again with ``retries``
        decreased by one, until ``retries`` is falsy.

        :param url: product page URL; it is also passed twice to the
            ``Product`` constructor.
        :param category: category stored in the product. For 'Cell' and
            'Unknown' products whose SKU does not contain 'MPM' the short
            description is appended to the name.
        :param extra_args: passed through to ``get_cf_session``.
        :param retries: number of further download attempts still allowed.
        :return: a list holding a single ``Product``, or an empty list when
            the page is an error page, the embedded JSON was never found, or
            the product has neither an 'offerPrice' nor a 'listPrice'.
        """
        http = get_cf_session(extra_args)
        html = http.get(url).text
        soup = BeautifulSoup(html, 'html.parser')

        if soup.find('div', 'error-page'):
            return []

        state_match = re.search(r'window.__PRELOADED_STATE__ = (.+);', html)

        if not state_match:
            if not retries:
                return []
            # No embedded state in this response: download the page again
            return cls._products_for_url(
                url, category, extra_args, retries=retries - 1)

        specs = json.loads(state_match.group(1))['product']['product']

        sku = specs['partNumber']
        # Drop every non-ASCII character from the name
        name = specs['name'].encode('ascii', 'ignore').decode('ascii')
        short_description = specs.get('shortDescription', '')

        # Cells sold by Ripley itself (not Mercado Ripley) get the short
        # description, which carries the "Prepago" information, appended to
        # their name
        if category in ('Cell', 'Unknown') and 'MPM' not in sku:
            name += ' ({})'.format(short_description)

        sold_out = specs['isOutOfStock'] or specs['isUnavailable']
        stock = 0 if sold_out else -1

        prices = specs['prices']

        if 'offerPrice' in prices:
            normal_price = Decimal(prices['offerPrice'])
        elif 'listPrice' in prices:
            normal_price = Decimal(prices['listPrice'])
        else:
            return []

        # The card price never exceeds the normal price
        offer_price = min(Decimal(prices.get('cardPrice', normal_price)),
                          normal_price)

        description = ''

        refurbished_notice = soup.find('div', 'emblemaReaccondicionados19')

        if refurbished_notice:
            description += html_to_markdown(str(refurbished_notice))

        if 'longDescription' in specs:
            description += html_to_markdown(specs['longDescription'])

        # Attributes are rendered as a two-column markdown table
        description += '\n\nAtributo | Valor\n-- | --\n'
        description += ''.join(
            '{} | {}\n'.format(attribute['name'], attribute['value'])
            for attribute in specs['attributes']
            if 'name' in attribute and 'value' in attribute)
        description += '\n\n'

        refurbished = 'https://schema.org/RefurbishedCondition'
        condition = 'https://schema.org/NewCondition'

        # Refurbished when any of the texts mentions it ...
        if any('reacondicionado' in text.lower()
               for text in (description, name, short_description)):
            condition = refurbished

        # ... or when the page shows the refurbished promo badge
        badge_src = '//home.ripley.cl/promo-badges/reacondicionado.png'
        if soup.find('img', {'src': badge_src}):
            condition = refurbished

        # Skip local file references and complete scheme-less image URLs
        picture_urls = [
            path if path.startswith('http') else 'https:' + path
            for path in specs['images']
            if 'file://' not in path
        ] or None

        flixmedia_id = None
        video_urls = []

        loader_sources = (
            '//media.flixfacts.com/js/loader.js',
            'https://media.flixfacts.com/js/loader.js',
        )

        # The first loader script carrying an MPN attribute wins
        for loader_src in loader_sources:
            loader_tag = soup.find('script', {'src': loader_src})
            if loader_tag and loader_tag.has_attr('data-flix-mpn'):
                flixmedia_id = loader_tag['data-flix-mpn']
                video_urls = flixmedia_video_urls(flixmedia_id)
                break

        reviews = specs['powerReview']
        review_count = int(reviews['fullReviews'])
        review_avg_score = float(reviews['averageRatingDecimal']) \
            if review_count else None

        # ``keywords`` is expected to be defined at module level
        has_virtual_assistant = any(keyword in url for keyword in keywords)

        marketplace = specs['marketplace']

        if 'shopName' in marketplace:
            seller = marketplace['shopName']
        elif specs['isMarketplaceProduct']:
            seller = 'Mercado R'
        else:
            seller = None

        product = Product(
            name,
            cls.__name__,
            category,
            url,
            url,
            sku,
            stock,
            normal_price,
            offer_price,
            'CLP',
            sku=sku,
            description=description,
            picture_urls=picture_urls,
            condition=condition,
            flixmedia_id=flixmedia_id,
            review_count=review_count,
            review_avg_score=review_avg_score,
            video_urls=video_urls,
            has_virtual_assistant=has_virtual_assistant,
            seller=seller)

        return [product]
Beispiel #4
0
    def banners(cls, extra_args=None):
        """Collect the banners shown on the configured Ripley sections.

        Home and category-page sections are delegated to
        ``cls.get_owl_banners``. For mosaic sections the image inside the
        page's ``section.catalog-top-banner`` element is reported as a
        single banner at position 1.

        :param extra_args: run through ``cls._extra_args_with_preflight``
            and then handed to the session/banner helpers.
        :return: list of banner dicts.
        :raises Exception: when a mosaic page has no top banner section, or
            when an entry has an unknown subsection type.
        """
        extra_args = cls._extra_args_with_preflight(extra_args)
        base_url = 'https://simple.ripley.cl/{}'

        # Each entry: [section, subsection name, subsection type, url suffix]
        sections_data = [
            [bs.HOME, 'Home', bs.SUBSECTION_TYPE_HOME, ''],
            [bs.ELECTRO_RIPLEY, 'Electro Ripley',
             bs.SUBSECTION_TYPE_CATEGORY_PAGE, 'electro/'],
            [bs.TECNO_RIPLEY, 'Tecno Ripley',
             bs.SUBSECTION_TYPE_CATEGORY_PAGE, 'tecno/'],
            [bs.REFRIGERATION, 'Refrigeración',
             bs.SUBSECTION_TYPE_MOSAIC, 'electro/refrigeracion/'],
            [bs.REFRIGERATION, 'Side by Side',
             bs.SUBSECTION_TYPE_MOSAIC, 'electro/refrigeracion/side-by-side/'],
            [bs.REFRIGERATION, 'Refrigeradores', bs.SUBSECTION_TYPE_MOSAIC,
             'electro/refrigeracion/refrigeradores/'],
            [bs.REFRIGERATION, 'Freezers y congeladores',
             bs.SUBSECTION_TYPE_MOSAIC,
             'electro/refrigeracion/freezers-y-congeladores/'],
            [bs.REFRIGERATION, 'Door In Door',
             bs.SUBSECTION_TYPE_MOSAIC,
             'electro/refrigeracion/door-in-door/'],
            [bs.REFRIGERATION, 'Frigobar',
             bs.SUBSECTION_TYPE_MOSAIC,
             'electro/refrigeracion/frigobar/'],
            [bs.REFRIGERATION, 'Refrigeracion Comercial e Industrial',
             bs.SUBSECTION_TYPE_MOSAIC,
             'electro/refrigeracion/refrigeracion-comercial-e-industrial/'],
            [bs.WASHING_MACHINES, 'Lavandería',
             bs.SUBSECTION_TYPE_MOSAIC, 'electro/lavanderia'],
            [bs.WASHING_MACHINES, 'Lavadoras',
             bs.SUBSECTION_TYPE_MOSAIC, 'electro/lavanderia/lavadoras'],
            [bs.WASHING_MACHINES, 'Lavadora-secadora',
             bs.SUBSECTION_TYPE_MOSAIC,
             'electro/lavanderia/lavadora-secadora'],
            [bs.WASHING_MACHINES, 'Secadoras',
             bs.SUBSECTION_TYPE_MOSAIC,
             'electro/lavanderia/secadoras'],
            [bs.WASHING_MACHINES, 'Doble Carga',
             bs.SUBSECTION_TYPE_MOSAIC, 'electro/lavanderia/doble-carga'],
            [bs.TELEVISIONS, 'Televisión',
             bs.SUBSECTION_TYPE_MOSAIC, 'tecno/television'],
            [bs.TELEVISIONS, 'Smart TV',
             bs.SUBSECTION_TYPE_MOSAIC, 'tecno/television/smart-tv'],
            [bs.TELEVISIONS, 'Ultra HD 4K',
             bs.SUBSECTION_TYPE_MOSAIC, 'tecno/television/ultra-hd-4k'],
            [bs.TELEVISIONS, 'Premium y 8K',
             bs.SUBSECTION_TYPE_MOSAIC,
             'tecno/television/premium-y-8k'],
            [bs.TELEVISIONS, 'HD y Full HD',
             bs.SUBSECTION_TYPE_MOSAIC, 'tecno/television/hd-y-full-hd'],
            [bs.AUDIO, 'Audio y Música',
             bs.SUBSECTION_TYPE_MOSAIC, 'tecno/audio-y-musica'],
            [bs.AUDIO, 'Parlantes Portables',
             bs.SUBSECTION_TYPE_MOSAIC,
             'tecno/audio-y-musica/parlantes-portables'],
            [bs.AUDIO, 'Soundbar y Home theater',
             bs.SUBSECTION_TYPE_MOSAIC,
             'tecno/audio-y-musica/soundbar-y-home-theater'],
            [bs.AUDIO, 'Receiver y Amplificadores',
             bs.SUBSECTION_TYPE_MOSAIC,
             'tecno/audio-y-musica/receiver-y-amplificadores'],
            [bs.AUDIO, 'Equipos de música',
             bs.SUBSECTION_TYPE_MOSAIC,
             'tecno/audio-y-musica/equipos-de-musica'],
            [bs.AUDIO, 'Accesorios',
             bs.SUBSECTION_TYPE_MOSAIC,
             'tecno/audio-y-musica/accesorios-audio'],
            [bs.CELLS, 'Telefonía',
             bs.SUBSECTION_TYPE_MOSAIC, 'tecno/telefonia'],
            [bs.CELLS, 'Android',
             bs.SUBSECTION_TYPE_MOSAIC, 'tecno/telefonia/android'],
            [bs.CELLS, 'iPhone',
             bs.SUBSECTION_TYPE_MOSAIC, 'tecno/telefonia/iphone']
        ]

        # Subsection types whose banners come from ``get_owl_banners``
        owl_types = (bs.SUBSECTION_TYPE_HOME,
                     bs.SUBSECTION_TYPE_CATEGORY_PAGE)

        banners = []

        for section, subsection, subsection_type, url_suffix in sections_data:
            url = base_url.format(url_suffix)
            print(url)

            if subsection_type in owl_types:
                owl_banners = cls.get_owl_banners(
                    url, section, subsection, subsection_type, extra_args)
                banners = banners + owl_banners
                continue

            if subsection_type != bs.SUBSECTION_TYPE_MOSAIC:
                raise Exception('Invalid subsection type')

            # Mosaic page: the banner is the image of the top banner section
            session = get_cf_session(extra_args)
            soup = BeautifulSoup(session.get(url).text, 'html.parser')
            top_banner = soup.find('section', 'catalog-top-banner')

            if not top_banner:
                raise Exception('No banners for: ' + url)

            image_tag = top_banner.find('img')

            # A banner section without an image yields no banner
            if not image_tag:
                continue

            link_tag = top_banner.find('a')
            destination_urls = [link_tag['href']] if link_tag else []

            # The image source doubles as the banner key
            picture_src = image_tag.get('src') or image_tag.get('data-src')

            banners.append({
                'url': url,
                'picture_url': picture_src,
                'destination_urls': destination_urls,
                'key': picture_src,
                'position': 1,
                'section': section,
                'subsection': subsection,
                'type': subsection_type
            })

        return banners
Beispiel #5
0
    def products_for_url(cls, url, category=None, extra_args=None):
        """Scrape every product of ``category`` from its listing pages.

        Each section in ``category_paths`` whose category list contains
        ``category`` is paginated (redirects are not followed). Products are
        built from the page's ``application/ld+json`` item list paired with
        the matching listing elements. LG and Samsung products whose SKU
        does not contain 'MPM' are assembled from their full product page
        unless ``extra_args['fast_mode']`` is truthy.

        :param url: not read by this implementation.
        :param category: category name matched against the second item of
            each ``category_paths`` entry.
        :param extra_args: optional dict handed to ``get_cf_session``. The
            'fast_mode' key is consumed here and is not forwarded to
            ``cls._assemble_full_product``; the caller's dict is left
            untouched.
        :return: list of products, one per SKU, each with
            ``positions[section_name]`` set to its position in every
            section it was found in.
        :raises Exception: when more than 100 pages would be requested for a
            section, or when the first page of a section does not answer
            with HTTP 200 or lacks the JSON-LD script / product container.
        :raises AssertionError: when the number of listing elements differs
            from the number of JSON-LD items.
        """
        # Each entry: [url path, matching categories, section name, weight]
        category_paths = [
            ['tecno/computacion/notebooks', ['Notebook'],
             'Tecno > Computación > Notebooks', 1],
            ['tecno/computacion/notebooks-gamer', ['Notebook'],
             'Tecno > Computación > Notebooks gamer', 1],
            ['tecno/computacion/tablets-y-e-readers', ['Tablet'],
             'Tecno > Computación > Tablets y E-readers', 1],
            ['tecno/impresoras-y-tintas', ['Printer'],
             'Tecno > Computación > Impresoras y Tintas', 1],
            ['tecno/computacion/almacenamiento',
             ['UsbFlashDrive', 'ExternalStorageDrive'],
             'Tecno > Computación > Almacenamiento', 0.5],
            ['tecno/computacion/pc-all-in-one', ['AllInOne'],
             'Tecno > Computación > PC/All in one', 1],
            ['tecno/computacion/proyectores-y-monitores',
             ['Monitor', 'Projector'],
             'Tecno > Computación > Proyectores y monitores', 0.5],
            ['tecno/television', ['Television'],
             'Tecno > Televisión', 1],
            ['tecno/television/smart-tv', ['Television'],
             'Tecno > Televisión > Smart TV', 1],
            ['electro/refrigeracion', ['Refrigerator'],
             'Electro > Refrigeración', 1],
            ['electro/refrigeracion/side-by-side', ['Refrigerator'],
             'Electro > Refrigeración > Side by Side', 1],
            ['electro/refrigeracion/refrigeradores', ['Refrigerator'],
             'Electro > Refrigeración > Refrigeradores', 1],
            ['electro/refrigeracion/freezers-y-congeladores', ['Refrigerator'],
             'Electro > Refrigeración > Freezers y congeladores', 1],
            ['electro/refrigeracion/frigobar', ['Refrigerator'],
             'Electro > Refrigeración > Frigobar', 1],
            ['electro/refrigeracion/door-in-door', ['Refrigerator'],
             'Electro > Refrigeración > Door in Door', 1],
            ['electro/cocina/cocinas', ['Stove'],
             'Electro > Cocina > Cocinas', 1],
            ['electro/electrodomesticos/hornos-y-microondas', ['Oven'],
             'Electro > Electrodomésticos > Hornos y Microondas', 1],
            ['electro/cocina/lavavajillas', ['DishWasher'],
             'Electro > Cocina > Lavavajillas', 1],
            ['electro/aseo/aspiradoras-y-enceradoras', ['VacuumCleaner'],
             'Electro > Aseo > Aspiradoras y enceradoras', 1],
            ['electro/lavanderia', ['WashingMachine'],
             'Electro > Lavandería', 1],
            ['electro/lavanderia/lavadoras', ['WashingMachine'],
             'Electro > Lavandería > Lavadoras', 1],
            ['electro/lavanderia/secadoras', ['WashingMachine'],
             'Electro > Lavandería > Secadoras', 1],
            ['electro/lavanderia/lavadora-secadora', ['WashingMachine'],
             'Electro > Lavandería > Lavadora-secadora', 1],
            # ['electro/lavanderia/doble-carga', ['WashingMachine'],
            #  'Electro > Lavandería > Doble carga', 1],
            ['tecno/telefonia/iphone', ['Cell'],
             'Tecno > Telefonía > iPhone', 1],
            ['tecno/telefonia/samsung', ['Cell'],
             'Tecno > Telefonía > Samsung', 1],
            ['tecno/telefonia/huawei', ['Cell'],
             'Tecno > Telefonía > Huawei', 1],
            ['tecno/telefonia/xiaomi', ['Cell'],
             'Tecno > Telefonía > Xiaomi', 1],
            ['tecno/telefonia/motorola', ['Cell'],
             'Tecno > Telefonía > Motorola', 1],
            ['tecno/telefonia/basicos', ['Cell'],
             'Tecno > Telefonía > Básicos', 1],
            ['tecno/fotografia-y-video/camaras-reflex', ['Camera'],
             'Tecno > Fotografía y Video > Camaras reflex', 1],
            ['tecno/fotografia-y-video/semi-profesionales', ['Camera'],
             'Tecno > Fotografía y Video > Semi profesionales', 1],
            ['tecno/audio-y-musica/equipos-de-musica', ['StereoSystem'],
             'Tecno > Audio y Música > Equipos de música', 1],
            ['tecno/audio-y-musica/parlantes-portables', ['StereoSystem'],
             'Tecno > Audio y Música > Parlantes Portables', 1],
            ['tecno/audio-y-musica/soundbar-y-home-theater', ['StereoSystem'],
             'Tecno > Audio y Música > Soundbar y Home theater', 1],
            ['tecno/television/bluray-dvd-y-tv-portatil',
             ['OpticalDiskPlayer'],
             'Tecno > Televisión > Bluray -DVD y TV Portátil', 1],
            ['tecno/playstation/consolas', ['VideoGameConsole'],
             'Tecno > PlayStation > Consolas', 1],
            ['tecno/nintendo/consolas', ['VideoGameConsole'],
             'Tecno > Nintendo > Consolas', 1],
            ['electro/climatizacion/aire-acondicionado',
             ['AirConditioner'],
             'Electro > Climatización > Ventiladores y aire acondicionado', 1],
            ['electro/climatizacion/purificadores-y-humificadores',
             ['AirConditioner'],
             'Electro > Climatización > Purificadores y humidificadores', 1],
            ['electro/climatizacion/estufas-y-calefactores',
             ['SpaceHeater'],
             'Electro > Climatización > Estufas y calefactores', 1],
            ['tecno/corner-smartwatch/garmin', ['Wearable'],
             'Tecno > Telefonía > Smartwatches y Wearables > Garmin', 1],
            ['tecno/corner-smartwatch/polar', ['Wearable'],
             'Tecno > Telefonía > Smartwatches y Wearables > Polar', 1],
            ['tecno/corner-smartwatch/apple-watch', ['Wearable'],
             'Tecno > Telefonía > Smartwatches y Wearables > Apple Watch', 1],
            ['tecno/corner-smartwatch/samsung', ['Wearable'],
             'Tecno > Telefonía > Smartwatches y Wearables > Samsung', 1],
            ['tecno/corner-smartwatch/huawei', ['Wearable'],
             'Tecno > Telefonía > Smartwatches y Wearables > Huawei', 1],
            ['tecno/especial-audifonos', ['Headphones'],
             'Tecno > Audio y Música > Audífonos', 1],
        ]

        if extra_args is None:
            extra_args = {}

        session = get_cf_session(extra_args)

        # From here on work with a shallow copy: 'fast_mode' must not reach
        # the helpers below, but removing it from the caller's own dict
        # would silently disable the flag on the caller's next call.
        extra_args = dict(extra_args)
        fast_mode = extra_args.pop('fast_mode', False)

        url_base = 'https://simple.ripley.cl/{}?page={}'
        # Products keyed by SKU so a product listed in several sections is
        # only built once and collects one position per section.
        product_dict = {}

        for e in category_paths:
            category_path, local_categories, section_name, category_weight = e

            if category not in local_categories:
                continue

            page = 1
            # Position within the whole section; keeps counting across pages
            position = 1

            while True:
                if page > 100:
                    raise Exception('Page overflow')

                category_url = url_base.format(category_path, page)
                print(category_url)
                response = session.get(category_url, allow_redirects=False)

                if response.status_code != 200 and page == 1:
                    raise Exception('Invalid section: ' + category_url)

                soup = BeautifulSoup(response.text, 'html.parser')
                products_data = soup.find('script',
                                          {'type': 'application/ld+json'})

                products_soup = soup.find('div', 'catalog-container')

                if not products_data or not products_soup:
                    if page == 1:
                        raise Exception('Empty path: {}'.format(category_url))
                    # A later page without data ends the pagination
                    break

                products_elements = products_soup.findAll(
                    'div', 'ProductItem__Row')

                # Fall back to the alternative listing markup
                if not products_elements:
                    products_elements = products_soup.findAll(
                        'a', 'catalog-product-item')

                products_json = json.loads(products_data.text)[
                    'itemListElement']

                # Raised explicitly instead of using ``assert`` so that the
                # check is not stripped when Python runs with ``-O``; the
                # exception type is unchanged.
                if len(products_elements) != len(products_json):
                    raise AssertionError(
                        'Listing mismatch in {}: {} elements vs {} JSON '
                        'items'.format(category_url, len(products_elements),
                                       len(products_json)))

                for product_json in products_json:
                    # JSON-LD positions are 1-based indexes into the listing
                    product_element = products_elements[
                        int(product_json['position']) - 1]
                    product_data = product_json['item']

                    brand = product_data.get('brand', '').upper()

                    # If the product is LG or Samsung and is sold directly by
                    # Ripley (not marketplace) obtain the full data
                    if brand in ['LG', 'SAMSUNG'] and 'MPM' not in \
                            product_data['sku'] and not fast_mode:

                        # Reuse the product if an earlier section built it
                        product = product_dict.get(product_data['sku'], None)
                        if not product:
                            entry_url = cls._get_entry_url(product_element)
                            product = cls._assemble_full_product(
                                entry_url, category, extra_args)
                    else:
                        product = cls._assemble_product(
                            product_data, product_element, category)

                    if product:
                        if product.sku in product_dict:
                            product_to_update = product_dict[product.sku]
                        else:
                            product_dict[product.sku] = product
                            product_to_update = product

                        product_to_update.positions[section_name] = position

                    # Entries that produced no product still use a position
                    position += 1

                page += 1

        return list(product_dict.values())
Beispiel #6
0
    def banners(cls, extra_args=None):
        """Scrape the banners displayed on a fixed list of Ripley pages.

        Every entry of ``sections_data`` is fetched from
        ``https://simple.ripley.cl/`` and handled according to its
        subsection type:

        * home pages are delegated to ``cls.get_owl_banners``;
        * category pages are delegated to ``cls.get_owl_banners`` when the
          page contains a ``div.owl-carousel``; otherwise one banner is
          built per ``a.item`` tile;
        * mosaic pages contribute at most one banner, taken from
          ``section.catalog-top-banner``.

        :param extra_args: optional settings, first passed through
            ``RipleyChileBaseCf._extra_args_with_preflight``. A truthy
            ``'debug'`` entry selects ``session_with_proxy`` instead of
            ``get_cf_session`` for the HTTP session.
        :return: list of banners. The entries built here are dicts with the
            keys ``url``, ``picture_url``, ``destination_urls``, ``key``,
            ``position``, ``section``, ``subsection`` and ``type``; entries
            coming from ``get_owl_banners`` are presumably shaped the same
            way (not visible from this method).
        :raises Exception: if an entry carries an unknown subsection type.
        """
        from .ripley_chile_base_cf import RipleyChileBaseCf

        extra_args = RipleyChileBaseCf._extra_args_with_preflight(extra_args)

        base_url = 'https://simple.ripley.cl/{}'

        # (section, subsection name, subsection type, path under base_url)
        sections_data = [
            [bs.HOME, 'Home', bs.SUBSECTION_TYPE_HOME, ''],
            [bs.ELECTRO_RIPLEY, 'Electro Ripley',
             bs.SUBSECTION_TYPE_CATEGORY_PAGE, 'electro/'],
            [bs.TECNO_RIPLEY, 'Tecno Ripley',
             bs.SUBSECTION_TYPE_CATEGORY_PAGE, 'tecno/'],
            [bs.REFRIGERATION, 'Refrigeración',
             bs.SUBSECTION_TYPE_MOSAIC, 'electro/refrigeracion/'],
            [bs.REFRIGERATION, 'Refrigeradores', bs.SUBSECTION_TYPE_MOSAIC,
             'electro/refrigeracion/refrigeradores/'],
            [bs.WASHING_MACHINES, 'Lavandería',
             bs.SUBSECTION_TYPE_MOSAIC, 'electro/lavanderia'],
            [bs.WASHING_MACHINES, 'Lavadoras',
             bs.SUBSECTION_TYPE_MOSAIC, 'electro/lavanderia/lavadoras'],
            [bs.WASHING_MACHINES, 'Lavadora-secadora',
             bs.SUBSECTION_TYPE_MOSAIC,
             'electro/lavanderia/lavadora-secadora'],
            [bs.WASHING_MACHINES, 'Doble Carga',
             bs.SUBSECTION_TYPE_MOSAIC, 'electro/lavanderia/doble-carga'],
            [bs.TELEVISIONS, 'Televisión',
             bs.SUBSECTION_TYPE_MOSAIC, 'tecno/television'],
            [bs.TELEVISIONS, 'Smart TV',
             bs.SUBSECTION_TYPE_MOSAIC, 'tecno/television/smart-tv'],
            [bs.TELEVISIONS, '4K – UHD - NanoCell',
             bs.SUBSECTION_TYPE_MOSAIC, 'tecno/television/4k-uhd-nanocell'],
            [bs.TELEVISIONS, 'Premium - OLED - QLED - 8K',
             bs.SUBSECTION_TYPE_MOSAIC,
             'tecno/television/premium-oled-qled-8k'],
            [bs.TELEVISIONS, 'HD - Full HD',
             bs.SUBSECTION_TYPE_MOSAIC, 'tecno/television/hd-full-hd'],
            [bs.AUDIO, 'Audio y Música',
             bs.SUBSECTION_TYPE_MOSAIC, 'tecno/audio-y-musica'],
            # Currently disabled subsections:
            # [AUDIO, 'Parlantes y Subwoofer', SUBSECTION_TYPE_MOSAIC,
            #  'tecno/audio-y-musica/parlantes-y-subwoofer'],
            # [AUDIO, 'Microcomponentes',
            #  SUBSECTION_TYPE_MOSAIC,
            #  'tecno/audio-y-musica/microcomponentes'],
            [bs.AUDIO, 'Soundbar y Home theater',
             bs.SUBSECTION_TYPE_MOSAIC,
             'tecno/audio-y-musica/soundbard-y-home-theater'],
            [bs.AUDIO, 'Parlantes Portables',
             bs.SUBSECTION_TYPE_MOSAIC,
             'tecno/audio-y-musica/parlantes-portables'],
            [bs.CELLS, 'Telefonía',
             bs.SUBSECTION_TYPE_MOSAIC, 'tecno/telefonia'],
            [bs.CELLS, 'Android',
             bs.SUBSECTION_TYPE_MOSAIC, 'tecno/telefonia/android'],
            [bs.CELLS, 'iPhone',
             bs.SUBSECTION_TYPE_MOSAIC, 'tecno/telefonia/iphone']
        ]

        debug = extra_args.get('debug', False)
        if debug:
            session = session_with_proxy(extra_args)
        else:
            session = get_cf_session(extra_args)
        banners = []

        for section, subsection, subsection_type, url_suffix in sections_data:
            url = base_url.format(url_suffix)
            response = session.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')

            if subsection_type == bs.SUBSECTION_TYPE_HOME:
                # extend() appends in place instead of rebuilding the whole
                # accumulated list for every carousel page.
                banners.extend(cls.get_owl_banners(
                    url, section, subsection, subsection_type, extra_args))

            elif subsection_type == bs.SUBSECTION_TYPE_CATEGORY_PAGE:
                if soup.find('div', 'owl-carousel'):
                    banners.extend(cls.get_owl_banners(
                        url, section, subsection, subsection_type, extra_args))
                else:
                    images = soup.findAll('a', 'item')

                    if not images:
                        print('No banners')

                    for index, image in enumerate(images):
                        # The picture is a CSS background on the inner span,
                        # so its URL is parsed out of the inline style.
                        picture = image.find('span', 'bg-item')
                        style = picture.get('style', '') if picture else ''
                        match = re.search(r'url\((.*?)\)', style)

                        if not match:
                            # Tile without a usable background image: skip
                            # it (as the mosaic branch does for a missing
                            # <img>) rather than aborting the whole scrape.
                            continue

                        picture_url = match.group(1)

                        banners.append({
                            'url': url,
                            'picture_url': picture_url,
                            'destination_urls': [image['href']],
                            'key': picture_url,
                            # Position reflects the tile's place on the
                            # page, even when earlier tiles were skipped.
                            'position': index + 1,
                            'section': section,
                            'subsection': subsection,
                            'type': subsection_type
                        })
            elif subsection_type == bs.SUBSECTION_TYPE_MOSAIC:
                picture_container = soup.find('section', 'catalog-top-banner')

                if not picture_container:
                    print('No banners')
                    continue

                picture = picture_container.find('img')

                if not picture:
                    continue

                # Fall back to 'data-src' when 'src' is missing or empty.
                picture_src = picture.get('src') or picture.get('data-src')

                # Reuse the container already located above instead of
                # searching the document a second time.
                destination = picture_container.find('a')
                destination_urls = []

                if destination:
                    destination_urls = [destination['href']]

                banners.append({
                    'url': url,
                    'picture_url': picture_src,
                    'destination_urls': destination_urls,
                    'key': picture_src,
                    'position': 1,
                    'section': section,
                    'subsection': subsection,
                    'type': subsection_type
                })
            else:
                raise Exception('Invalid subsection type')

        return banners