Example #1
    def products_for_url(cls, url, category=None, extra_args=None):
        session = session_with_proxy(extra_args)
        session.headers['user-agent'] = 'python-requests/2.21.0'
        response = session.get(url)

        if response.status_code == 404:
            return []

        page_source = response.text
        soup = BeautifulSoup(page_source, 'html.parser')

        if not soup.find('body') or \
                not soup.find('h1', {'id': 'nombre-producto'}):
            return []

        name = soup.find('h1', {'id': 'nombre-producto'}).text.strip()
        sku = soup.find('div', {'itemprop': 'sku'}).text.strip()

        ajax_session = session_with_proxy(extra_args)
        ajax_session.headers['user-agent'] = 'python-requests/2.21.0'
        ajax_session.headers['x-requested-with'] = 'XMLHttpRequest'
        ajax_session.headers['content-type'] = \
            'application/x-www-form-urlencoded'

        stock_data = json.loads(
            ajax_session.post(
                'https://catalogo.movistar.cl/fullprice/stockproducto/validar/',
                'sku=' + sku).text)

        stock = stock_data['respuesta']['cantidad']

        price_container = soup.find('span', 'special-price').find('p')
        price = Decimal(remove_words(price_container.text))

        description = html_to_markdown(
            str(soup.find('div', 'detailed-desktop')))

        if 'seminuevo' in description:
            condition = 'https://schema.org/RefurbishedCondition'
        else:
            condition = 'https://schema.org/NewCondition'

        picture_urls = [soup.find('meta', {'property': 'og:image'})['content']]

        return [
            Product(name,
                    cls.__name__,
                    category,
                    url,
                    url,
                    sku,
                    stock,
                    price,
                    price,
                    'CLP',
                    condition=condition,
                    sku=sku,
                    description=description,
                    picture_urls=picture_urls)
        ]
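
All of these examples lean on a few storescraper helpers (session_with_proxy, remove_words, html_to_markdown, Product) whose definitions are not shown. A minimal sketch of the first two, assuming a plain requests-backed session and simple string stripping (hypothetical implementations; the real helpers may differ):

    import requests

    def session_with_proxy(extra_args):
        # Hypothetical sketch: build a requests session and wire in a
        # proxy if extra_args carries one.
        session = requests.Session()
        if extra_args and 'proxy' in extra_args:
            session.proxies = {'http': extra_args['proxy'],
                               'https': extra_args['proxy']}
        return session

    def remove_words(value, words=('$', '.', '\xa0')):
        # Hypothetical sketch: drop currency symbols and thousands
        # separators so the remaining digits parse with Decimal().
        for word in words:
            value = value.replace(word, '')
        return value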
Example #2
    def discover_urls_for_category(cls, category, extra_args=None):
        session = session_with_proxy(extra_args)
        session.headers['Content-Type'] = 'application/x-www-form-urlencoded'

        category_paths = [
            ('Id_Subrubro=338', 'ExternalStorageDrive'),
            # Hard drives -> External
            ('Id_Subrubro=336', 'StorageDrive'),
            # Hard drives -> Internal
            ('Id_Subrubro=495', 'SolidStateDrive'),    # Hard drives -> SSD
            ('Id_Subrubro=611', 'SolidStateDrive'),    # Memory -> SSD
            ('Id_Subrubro=396', 'MemoryCard'),    # Memory -> Micro SD Card
            # ('Id_Subrubro=298', 'MemoryCard'),    # Memory -> SD Card
            ('Id_Rubro=2', 'UsbFlashDrive'),    # Pen Drives
        ]

        product_urls = []

        for category_path, local_category in category_paths:
            if local_category != category:
                continue

            page = 0

            while True:
                category_url = \
                    'http://www.stylus.com.ar/productos.php?{}&pag={}' \
                    ''.format(category_path, page)
                print(category_url)

                response = cls._retrieve_page(
                    session,
                    category_url,
                    extra_args)
                soup = BeautifulSoup(response.text, 'html.parser')

                products_container = soup.find('div', 'prod-lista')

                if not products_container:
                    if page == 0:
                        raise Exception('Empty category: {}'.format(
                            category_url))
                    break

                for product_cell in products_container.findAll('li'):
                    product_url = 'http://www.stylus.com.ar/' + \
                                  product_cell.find('a')['href'].replace(
                                      '&Menu=', '')
                    product_urls.append(product_url)

                page += 1

        return product_urls
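
Example #2 delegates fetching to cls._retrieve_page, whose definition is not shown. A plausible stand-in, assuming it is a GET with basic retry handling (hypothetical; the real method may also rotate proxies via extra_args):

    import requests

    def _retrieve_page(session, url, extra_args, retries=3):
        # Hypothetical sketch: retry transient network errors,
        # re-raising the last one if every attempt fails.
        for attempt in range(retries):
            try:
                return session.get(url, timeout=30)
            except requests.exceptions.RequestException:
                if attempt == retries - 1:
                    raise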
Example #3
    def products_for_url(cls, url, category=None, extra_args=None):
        session = session_with_proxy(extra_args)
        soup = BeautifulSoup(session.get(url).text, 'html.parser')

        name = soup.find('meta', {'name': 'Title'})['content'].strip()
        sku = soup.find('input', {'name': 'id'})['value'].strip()

        price_string = soup.find('input', {'id': 'product_price'})['value']
        price = Decimal(price_string)

        description = html_to_markdown(
            str(soup.find('div', {'id': 'especificaciones-container'})))

        picture_urls = [tag['data-zoom-image'] for tag in
                        soup.find('div', 'owl-carousel').findAll('img')]

        p = Product(
            name,
            cls.__name__,
            category,
            url,
            url,
            sku,
            -1,
            price,
            price,
            'ARS',
            sku=sku,
            description=description,
            picture_urls=picture_urls
        )

        return [p]
Example #4
    def products_for_url(cls, url, category=None, extra_args=None):
        print(url)
        session = session_with_proxy(extra_args)
        soup = BeautifulSoup(session.get(url).text, 'html.parser')

        name = soup.find('h1', 'product_title').text.strip()
        sku = soup.find('span', 'sku').text.strip()
        stock = -1

        if 'LG' not in name.upper().split(' '):
            stock = 0

        price = Decimal(
            soup.find('p', 'price').find('span', 'amount').text.replace(
                '₲.', '').replace('.', ''))

        picture_urls = [soup.find('meta', {'name': 'og:image'})['content']]

        return [
            Product(name,
                    cls.__name__,
                    category,
                    url,
                    url,
                    sku,
                    stock,
                    price,
                    price,
                    'PYG',
                    sku=sku,
                    picture_urls=picture_urls)
        ]
Example #5
    def products_for_url(cls, url, category=None, extra_args=None):
        session = session_with_proxy(extra_args)
        response = session.get(url, timeout=20)

        if response.url != url or response.status_code == 404:
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        model_id = soup.find('div', 'pdp-conts-area')['id']
        model_data = cls._retrieve_api_model(model_id)
        sibling_groups = model_data['siblings']
        sibling_ids = [model_id]

        for sibling_group in sibling_groups:
            if sibling_group['siblingType'] in ['COLOR', 'SIZE', 'CAPACITY']:
                for sibling in sibling_group['siblingModels']:
                    if sibling['modelId'] not in sibling_ids:
                        sibling_ids.append(sibling['modelId'])
            else:
                raise Exception('Unknown sibling type for: ' + url)

        products = []

        for sibling_id in sibling_ids:
            sibling = cls._retrieve_single_product(sibling_id, category)
            if sibling:
                products.append(sibling)

        return products
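
Example #5 expands one LG product page into all of its color/size/capacity variants by walking the siblings structure returned by cls._retrieve_api_model. A toy payload with hypothetical model IDs, showing the shape the loop expects and the de-duplication it performs:

    model_data = {
        'siblings': [
            {'siblingType': 'COLOR', 'siblingModels': [
                {'modelId': 'MD001'}, {'modelId': 'MD002'}]},
            {'siblingType': 'CAPACITY', 'siblingModels': [
                {'modelId': 'MD001'}, {'modelId': 'MD003'}]},
        ],
    }

    sibling_ids = ['MD001']
    for sibling_group in model_data['siblings']:
        for sibling in sibling_group['siblingModels']:
            if sibling['modelId'] not in sibling_ids:
                sibling_ids.append(sibling['modelId'])
    print(sibling_ids)  # ['MD001', 'MD002', 'MD003']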
Example #6
    def discover_urls_for_category(cls, category, extra_args=None):
        category_paths = [
            ('cpt.notebook', 'Notebook'),  # Laptops
            ('cpt.ultrabook', 'Notebook'),  # Ultrabooks
            ('cpt.allone', 'AllInOne'),  # All-in-ones
        ]

        product_urls = []
        session = session_with_proxy(extra_args)

        for category_path, local_category in category_paths:
            if local_category != category:
                continue

            category_url = 'http://store.intcomex.com/es-XCL/Products/' \
                           'ByCategory/{}?rpp=1000'.format(category_path)

            print(category_url)

            soup = cls._retrieve_page(session, category_url, extra_args)

            product_containers = soup.findAll('div', 'productArea')

            if not product_containers:
                raise Exception('Empty category: ' + category_url)

            for container in product_containers:
                product_url = 'http://store.intcomex.com' + \
                              container.find('a')['href']
                product_urls.append(product_url)

        return product_urls
Example #7
    def discover_urls_for_category(cls, category, extra_args=None):
        category_paths = [
            ['televisores-y-teatro-en-casa/televisores', 'Television'],
            ['celulares-y-tablets/smartphones-xperia', 'Cell'],
            ['camaras/cyber-shot', 'Camera'],
            ['audio/sistemas-de-audio', 'StereoSystem'],
            [
                'televisores-y-teatro-en-casa/reproductores-de-blu-ray-disc'
                '-y-dvd', 'OpticalDiskPlayer'
            ],
            ['televisores-y-teatro-en-casa/teatro-en-casa', 'StereoSystem'],
            ['audio/audifonos', 'Headphones'],
        ]

        product_urls = []
        session = session_with_proxy(extra_args)

        for category_path, local_category in category_paths:
            if local_category != category:
                continue
            category_url = 'https://store.sony.cl/{}?PS=48'.format(
                category_path)

            soup = BeautifulSoup(session.get(category_url).text, 'html.parser')

            containers = soup.findAll('div', 'prod')

            if not containers:
                logging.warning('Empty category: ' + category_url)

            for product_container in containers:
                product_url = product_container.find('a')['href']
                product_urls.append(product_url)

        return product_urls
Example #8
    def discover_urls_for_category(cls, category, extra_args=None):
        category_paths = [
            # ['empresa/workstation/notebook.html', 'Notebook'],
            ['empresa/notebook-comercial.html', 'Notebook'],
            ['pc-y-portatiles/portatiles.html', 'Notebook'],
            ['empresa/plotters.html', 'Printer'],
            ['impresion-e-imagen/impresoras-de-tinta.html', 'Printer'],
            # ['impresion-e-imagen/impresoras-laser.html', 'Printer'],
            ['impresion-e-imagen/multifuncionales.html', 'Printer'],
            ['impresion-e-imagen/multifuncionales-laser.html', 'Printer'],
            # ['pc-y-portatiles/escritorio.html', 'AllInOne'],
            # ['audio/teclados-y-mouse.html', 'Mouse'],
            # ['audio/parlantes.html', 'StereoSystem'],
            ['monitores/monitores.html', 'Monitor'],
        ]

        session = session_with_proxy(extra_args)
        session.headers['X-Requested-With'] = 'XMLHttpRequest'

        product_urls = []
        for category_path, local_category in category_paths:
            if local_category != category:
                continue

            subcategory_product_urls = []
            page = 1

            while True:
                category_url = 'https://www.scglobal.cl/index.php/{}?p={}' \
                               ''.format(category_path, page)
                print(category_url)

                if page >= 10:
                    raise Exception('Page overflow: ' + category_url)

                json_data = json.loads(
                    session.get(category_url, verify=False).text)
                soup = BeautifulSoup(json_data['listing'], 'html.parser')
                product_cells = soup.findAll('li', 'item')

                if not product_cells and page == 1:
                    raise Exception('Empty category: ' + category_url)

                done = False

                for cell in product_cells:
                    product_url = cell.find('a')['href']
                    if product_url in subcategory_product_urls:
                        done = True
                        break
                    subcategory_product_urls.append(product_url)

                if done:
                    break

                page += 1

            product_urls.extend(subcategory_product_urls)

        return product_urls
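
Example #8 stops paging once a page yields only URLs it has already collected, which guards against catalogs that keep serving the last page for any out-of-range page number. The same guard in isolation (generic sketch; fetch_page is a hypothetical callable returning the product URLs on one page):

    def paginate_until_repeat(fetch_page, max_pages=10):
        seen = []
        for page in range(1, max_pages + 1):
            urls = fetch_page(page)
            if not urls and page == 1:
                raise Exception('Empty category')
            new_urls = [url for url in urls if url not in seen]
            if not new_urls:
                # The site started repeating itself, so we are done
                return seen
            seen.extend(new_urls)
        raise Exception('Page overflow')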
Example #9
    def discover_urls_for_category(cls, category, extra_args=None):
        category_paths = [
            # LED bulbs
            ['hogar/ampolletas', 'Lamp'],
            # LED projectors
            ['hogar/proyectores', 'LightProjector'],
            # LED tubes
            ['hogar/equipos-y-tubos', 'LightTube'],
        ]

        discovery_urls = []
        session = session_with_proxy(extra_args)

        for category_path, local_category in category_paths:
            if local_category != category:
                continue

            category_url = 'http://ledlightchile.cl/categoria-producto/{}' \
                           ''.format(category_path)

            soup = BeautifulSoup(session.get(category_url).text, 'html.parser')

            product_containers = soup.findAll('div', 'wf-cell')

            for container in product_containers:
                subcategory_url = container.find('a')['href']
                discovery_urls.append(subcategory_url)

        return discovery_urls
Example #10
    def discover_urls_for_category(cls, category, extra_args=None):
        category_paths = [
            ['hardware/hard-disk/hd-sata-iii', 'StorageDrive'],
            ['hardware/hard-disk/ssd', 'SolidStateDrive'],
        ]

        product_urls = []
        session = session_with_proxy(extra_args)

        for category_path, local_category in category_paths:
            if local_category != category:
                continue

            category_url = 'https://www.terabyteshop.com.br/{}' \
                           ''.format(category_path)

            soup = BeautifulSoup(session.get(category_url).text, 'html.parser')

            containers = soup.findAll('div', 'pbox')

            if not containers:
                raise Exception('Empty category: ' + category_url)

            for container in containers:
                product_url = 'https://www.terabyteshop.com.br' + \
                              container.find('a')['href']
                product_urls.append(product_url)

        return product_urls
Example #11
    def discover_urls_for_category(cls, category, extra_args=None):
        category_filters = [
            ('tv', 'Television'),
            ('celulares', 'Cell'),
            ('equipos-de-sonido', 'StereoSystem'),
            ('barras-de-sonido', 'StereoSystem'),
            ('bocinas', 'StereoSystem'),
            ('aires-acondicionados/Inverter', 'AirConditioner'),
            ('aires-acondicionados/Básico', 'AirConditioner'),
            ('estufas', 'Stove'),
            ('lavadoras', 'WashingMachine'),
            ('secadoras', 'WashingMachine'),
            # ('centro-de-lavado', 'WashingMachine'),
            ('refrigeradoras', 'Refrigerator'),
            # ('congeladores', 'Refrigerator'),
            # ('microondas', 'Oven'),
            # ('hornos', 'Oven'),
            ('monitores', 'Monitor'),
        ]

        session = session_with_proxy(extra_args)
        product_urls = []

        for category_path, local_category in category_filters:
            if local_category != category:
                continue
            page = 1
            done = False

            while not done:
                if page >= 10:
                    raise Exception('Page overflow')

                url = 'https://www.multimax.net/collections/{}?page={}'\
                    .format(category_path, page)

                print(url)

                response = session.get(url)
                soup = BeautifulSoup(response.text, 'html5lib')

                container = soup.find('div', 'collection-products')
                items = container.findAll('article', 'item')

                if not items:
                    if page == 1:
                        raise Exception('No products for category {}'
                                        .format(category))
                    break

                for item in items:
                    if 'LG' not in item.find('div', 'vendor').text.upper():
                        continue
                    product_url = 'https://www.multimax.net{}'\
                        .format(item.find('a')['href'])
                    product_urls.append(product_url)

                page += 1

        return product_urls
Example #12
    def discover_urls_for_category(cls, category, extra_args=None):
        category_paths = [
            ['Televisores', 'Television'],
            ['Aires-Split', 'AirConditioner'],
            ['Lavadoras', 'WashingMachine'],
            # ['Equipos-de-sonido', 'StereoSystem'],
            ['Refrigeradoras', 'Refrigerator'],
            ['Cocinas', 'Oven']
        ]

        session = session_with_proxy(extra_args)
        base_url = 'https://www.almaceneslaganga.com/' \
                   'pedidos-en-linea/efectivo/{}/LG'
        product_urls = []

        for url_extension, local_category in category_paths:
            if category != local_category:
                continue

            url = base_url.format(url_extension)
            soup = BeautifulSoup(session.get(url).text, 'html.parser')
            products = soup.findAll('div', 'esquema_producto')

            if not products:
                raise Exception('Empty path: ' + url)

            for product in products:
                product_slug = product.find(
                    'button', 'btn-detalles')['producto']
                product_url = 'https://www.almaceneslaganga.com/' \
                              'pedidos-en-linea/efectivo/{}'\
                    .format(product_slug)
                product_urls.append(product_url)

        return product_urls
Example #13
    def discover_urls_for_category(cls, category, extra_args=None):
        url_extensions = [
            # LED bulbs
            ['ampolletas-led', 'Lamp'],
            # LED projectors
            ['proyectores-led', 'LightProjector'],
            # LED tubes
            ['tubos-led', 'LightTube'],
        ]

        product_urls = []

        session = session_with_proxy(extra_args)

        for category_path, local_category in url_extensions:
            if local_category != category:
                continue

            category_url = 'http://www.belight.cl/productos/categoria/{}' \
                           ''.format(category_path)

            soup = BeautifulSoup(session.get(category_url).text, 'html.parser')

            product_containers = soup.findAll('div', 'producto')

            for container in product_containers:
                product_url = 'http://www.belight.cl' + \
                              container.find('a')['href']
                product_urls.append(product_url)

        return product_urls
Example #14
    def discover_urls_for_category(cls, category, extra_args=None):
        category_paths = [
            ['3074457345616709688', 'Notebook'],
        ]

        session = session_with_proxy(extra_args)
        product_urls = []

        for category_path, local_category in category_paths:
            if local_category != category:
                continue

            url = 'http://www.efe.com.pe/webapp/wcs/stores/servlet/' \
                  'ProductListingView?resultsPerPage=1000&storeId=10152&' \
                  'categoryId=' + category_path
            soup = BeautifulSoup(session.get(url).text, 'html.parser')

            product_containers = soup.findAll('div', 'product')

            if not product_containers:
                raise Exception('Empty category: ' + url)

            for container in product_containers:
                product_url = container.find('a')['href']
                product_urls.append(product_url)
        return product_urls
Example #15
    def discover_urls_for_category(cls, category, extra_args=None):
        base_url = 'https://www.costco.com.mx'

        category_paths = [
            ['negocios-y-papeleria/accesorios-de-escritorio/'
             'unidades-de-almacenamiento', 'MemoryCard'],
            ['electronica-y-computo/computacion/discos-duros-y-memorias',
             'ExternalStorageDrive'],
        ]

        session = session_with_proxy(extra_args)
        product_urls = []

        for category_path, local_category in category_paths:
            if local_category != category:
                continue

            url_webpage = '{}/view/c/{}'.format(base_url, category_path)

            page_source = session.get(url_webpage).text
            page_source = re.sub(r'(<!--\[if.[\s\S]*<!\[endif\]-->)', '',
                                 page_source)

            soup = BeautifulSoup(page_source, 'html.parser')

            link_containers = soup.findAll('div', 'productList_item')

            for link_container in link_containers:
                product_url = base_url + link_container.find('a')['href']
                product_urls.append(product_url)

        return product_urls
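
The re.sub call in Example #15 strips Internet Explorer conditional comments (<!--[if ...]> ... <![endif]-->), which this page emits in a form that trips up html.parser. A self-contained illustration of the same substitution:

    import re

    page_source = ('<!--[if lt IE 9]><p>legacy markup</p><![endif]-->'
                   '<div class="productList_item">kept</div>')
    cleaned = re.sub(r'(<!--\[if.[\s\S]*<!\[endif\]-->)', '', page_source)
    print(cleaned)  # <div class="productList_item">kept</div>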
Example #16
    def discover_urls_for_category(cls, category, extra_args=None):
        url_base = 'http://www.hiraoka.com.pe/'

        category_paths = [
            # ['029', 'Notebook'],      # Notebooks
            # ['031', 'Notebook'],      # Convertibles
            ['123', 'UsbFlashDrive'],
        ]

        session = session_with_proxy(extra_args)
        product_urls = []

        for category_path, local_category in category_paths:
            if local_category != category:
                continue

            category_url = '{}productlist.php?ss={}'.format(
                url_base, category_path)
            soup = BeautifulSoup(session.get(category_url).text, 'html.parser')

            product_containers = soup.findAll('div', 'proditem')

            for container in product_containers:
                product_url = url_base + container.find('a')['href']
                product_url = product_url.split('&n')[0]

                product_urls.append(product_url)
        return product_urls
Example #17
    def discover_urls_for_category(cls, category, extra_args=None):
        category_paths = [
            # LED bulbs
            ['iluminacion/lamparas-led', 'Lamp'],
            # LED projectors
            ['iluminacion/proyectores-led', 'LightProjector'],
        ]

        product_urls = []
        session = session_with_proxy(extra_args)

        for category_path, local_category in category_paths:
            if local_category != category:
                continue

            category_url = 'https://www.gobantes.cl/{}?limit=200'.format(
                category_path)

            soup = BeautifulSoup(
                session.get(category_url, verify=False).text, 'html.parser')

            product_containers = soup.findAll('div', 'image')

            if not product_containers:
                raise Exception('Empty category: ' + category_url)

            for container in product_containers:
                product_url = container.find('a')['href'].replace(
                    '&limit=200', '').replace('https://gobantes.cl/',
                                              'https://www.gobantes.cl/')
                product_urls.append(product_url)

        return product_urls
Example #18
    def discover_entries_for_category(cls, category, extra_args=None):
        session = session_with_proxy(extra_args)
        session.headers['User-Agent'] = 'curl'
        discovered_entries = defaultdict(list)

        category_paths = [
            ['mac', ['Notebook'], 'Mac', 1],
            ['ipad', ['Tablet'], 'iPad', 1],
            ['iphone', ['Cell'], 'iPhone', 1],
        ]

        for e in category_paths:
            category_path, local_categories, section_name, category_weight = e

            if category not in local_categories:
                continue

            category_url = 'https://www.maconline.com/t/{}'\
                .format(category_path)
            print(category_url)

            soup = BeautifulSoup(session.get(category_url).text, 'html.parser')

            subcategories = soup.find('ul', 'list-unstyled').findAll('li')

            for idx, subcategory in enumerate(subcategories):
                subcategory_url = 'https://www.maconline.com{}'.format(
                    subcategory.find('a')['href'].split('?')[0])
                discovered_entries[subcategory_url].append({
                    'category_weight': category_weight,
                    'section_name': section_name,
                    'value': idx + 1
                })

        return discovered_entries
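
Example #18 is the odd one out: instead of a flat URL list it returns a dict that maps each discovered URL to positioning entries, so the caller can record where the product appeared within each section. A toy illustration of the returned shape (hypothetical URL):

    from collections import defaultdict

    discovered_entries = defaultdict(list)
    discovered_entries['https://www.maconline.com/t/mac'].append({
        'category_weight': 1,   # weight of the whole section
        'section_name': 'Mac',  # human-readable section label
        'value': 1,             # 1-based position within the section
    })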
Example #19
    def discover_urls_for_category(cls, category, extra_args=None):
        product_urls = []

        if category == 'Cell':
            session = session_with_proxy(extra_args)
            offset = 0

            while True:
                category_url = 'https://tienda.clarochile.cl/webapp/wcs/' \
                               'stores/servlet/CategoryDisplay?categoryId=' \
                               '10008&pageSize=18&storeId=10151&beginIndex=' \
                               '{}'.format(offset)
                print(category_url)
                soup = BeautifulSoup(
                    session.get(category_url, verify=False).text,
                    'html.parser')

                containers = soup.find('div',
                                       'product_listing_container').findAll(
                                           'div', 'product')

                if not containers:
                    if offset == 0:
                        raise Exception('Empty list')

                    break

                for container in containers:
                    product_url = container.find('a')['href']
                    product_urls.append(product_url)

                offset += 18

        return product_urls
Example #20
    def products_for_url(cls, url, category=None, extra_args=None):
        session = session_with_proxy(extra_args)
        soup = BeautifulSoup(session.get(url).text, 'html.parser')

        name = soup.find('h1', 'entry-title').text.strip()
        sku = soup.find('input', {'name': 'product_id'})['value'].strip()
        description = html_to_markdown(
            str(soup.find('div', 'product_description')))
        picture_urls = [tag['href'] for tag in soup.findAll('a', 'thickbox')]
        price = Decimal(remove_words(soup.find('span', 'currentprice').text))

        price *= Decimal('1.19')
        price = price.quantize(0)

        p = Product(name,
                    cls.__name__,
                    category,
                    url,
                    url,
                    sku,
                    -1,
                    price,
                    price,
                    'CLP',
                    sku=sku,
                    description=description,
                    picture_urls=picture_urls)

        return [p]
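
Example #20 is the only snippet that adjusts the scraped price, multiplying by 1.19, presumably because the site lists prices net of Chilean VAT (19%). The arithmetic in isolation:

    from decimal import Decimal

    net_price = Decimal('84034')
    gross_price = (net_price * Decimal('1.19')).quantize(Decimal('1'))
    print(gross_price)  # Decimal('100000'): 84034 * 1.19 = 100000.46, rounded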
Example #21
    def discover_urls_for_category(cls, category, extra_args=None):
        category_paths = [
            # LED bulbs
            ['Ampolletas+LED', 'Lamp'],
        ]

        session = session_with_proxy(extra_args)
        product_urls = []

        for category_path, local_category in category_paths:
            if local_category != category:
                continue

            category_url = 'http://www.eglo.cl/productos?' \
                           'subgrupo_desc_buscar%5B%5D={}'.format(
                               category_path)

            soup = BeautifulSoup(session.get(category_url).text, 'html.parser')

            product_containers = soup.findAll('div', 'product-preview-wrapper')

            if not product_containers:
                raise Exception('Empty category: ' + category_url)

            for container in product_containers:
                product_url = 'http://www.eglo.cl' + \
                              container.find('a')['href']
                product_urls.append(product_url)

        return product_urls
Example #22
    def discover_urls_for_keyword(cls, keyword, threshold, extra_args=None):
        session = session_with_proxy(extra_args)
        product_urls = []

        page = 1

        while True:
            if page >= 40:
                raise Exception('Page overflow: ' + keyword)

            url = 'https://www.corona.cl/buscapagina?ft={}&PS=15&' \
                  'sl=4e4d7aaa-6b5b-4390-8d3a-e6ce5e306488&cc=3&sm=0' \
                  '&PageNumber={}'.format(keyword, page)

            print(url)

            soup = BeautifulSoup(session.get(url).text, 'html.parser')
            product_blocks = soup.findAll('div', 'product')

            if not product_blocks:
                break

            for block in product_blocks:
                if block.find('div', 'outOfStock'):
                    continue
                product_url = block.find('a')['href']
                product_urls.append(product_url)

                if len(product_urls) == threshold:
                    return product_urls

            page += 1

        return product_urls
Example #23
    def discover_urls_for_category(cls, category, extra_args=None):
        category_paths = cls._category_paths()
        discovered_urls = []
        session = session_with_proxy(extra_args)
        session.headers['content-type'] = 'application/x-www-form-urlencoded'

        endpoint_url = 'https://www.lg.com/{}/mkt/ajax/category/' \
                       'retrieveCategoryProductList'.format(cls.region_code)

        for category_id, local_category, is_active in category_paths:
            if local_category != category:
                continue

            if is_active:
                status = 'ACTIVE'
            else:
                status = 'DISCONTINUED'

            payload = 'categoryId={}&modelStatusCode={}&bizType=B2C&viewAll' \
                      '=Y'.format(category_id, status)
            json_response = json.loads(
                session.post(endpoint_url, payload).text)
            product_entries = json_response['data'][0]['productList']

            if not product_entries:
                raise Exception('Empty category: {} - {}'.format(
                    category_id, is_active))

            for product_entry in product_entries:
                if product_entry['whereToBuyFlag'] == 'Y':
                    product_url = cls.base_url + product_entry['modelUrlPath']
                    discovered_urls.append(product_url)

        return discovered_urls
Example #24
    def products_for_url(cls, url, category=None, extra_args=None):
        session = session_with_proxy(extra_args)
        soup = BeautifulSoup(session.get(url).text, 'html.parser')

        name = soup.find('div', 'product-name').text.strip()
        sku = soup.find('input', {'name': 'product'})['value'].strip()

        price_string = soup.find('span', 'price').text

        price = Decimal(price_string.replace(
            '.', '').replace('$', '').replace(',', '.'))

        description = html_to_markdown(
            str(soup.find('div', 'product-collateral')))

        picture_urls = [tag['src'] for tag in
                        soup.findAll('img', {'id': 'image'})]

        p = Product(
            name,
            cls.__name__,
            category,
            url,
            url,
            sku,
            -1,
            price,
            price,
            'ARS',
            sku=sku,
            description=description,
            picture_urls=picture_urls
        )

        return [p]
Example #25
    def discover_urls_for_category(cls, category, extra_args=None):
        category_codes = [
            ['13147', 'UsbFlashDrive'],
            ['13341', 'ExternalStorageDrive'],
        ]

        product_urls = []
        session = session_with_proxy(extra_args)
        session.headers['Content-Type'] = 'application/x-www-form-urlencoded'

        for category_code, local_category in category_codes:
            if local_category != category:
                continue

            category_url = 'http://www.coppel.com/ProductListingView?' \
                           'storeId=12761&categoryId=' + category_code

            response = session.post(category_url, data='pageSize=1000')
            soup = BeautifulSoup(response.text, 'html.parser')

            containers = soup.findAll('div', 'product')

            if not containers:
                raise Exception('Empty category: ' + category_code)

            for container in containers:
                product_url = container.find('a')['href']
                product_urls.append(product_url)

        return product_urls
Example #26
    def discover_urls_for_category(cls, category, extra_args=None):
        category_paths = [
            ['procesador-2', 'Processor'],
            ['placa-madre', 'Motherboard'],
            ['tarjeta-de-video', 'VideoCard'],
            ['disco-de-estado-solido', 'SolidStateDrive'],
            ['fuente-de-poder', 'PowerSupply'],
            ['mouse-y-teclados-2', 'Mouse'],
        ]

        product_urls = []
        session = session_with_proxy(extra_args)
        session.headers['User-Agent'] = \
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
            '(KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'

        for category_path, local_category in category_paths:
            if local_category != category:
                continue

            url = 'https://aopc.cl/categoria/{}/?post_type=product'\
                .format(category_path)
            response = session.get(url)

            soup = BeautifulSoup(response.text, 'html.parser')
            products = soup.findAll('li', 'product-col')

            if not products:
                raise Exception('Empty path: {}'.format(url))

            for product in products:
                product_url = product.find('a')['href']
                product_urls.append(product_url)

        return product_urls
Example #27
    def discover_urls_for_category(cls, category, extra_args=None):
        category_paths = [
            ['linea-blanca/cocinas.html', 'Stove'],
            ['linea-blanca/refrigeracion.html', 'Refrigerator'],
            ['linea-blanca/lavadoras-y-secadoras.html', 'WashingMachine'],
            ['climatizacion/aires-acondicionados.html', 'AirConditioner'],
            ['audio/minicomponentes.html', 'StereoSystem'],
            ['televisores/hd.html', 'Television'],
            ['televisores/4k.html', 'Television'],
        ]

        session = session_with_proxy(extra_args)
        base_url = 'https://www.artefacta.com/productos/{}?at_marca=LG'
        product_urls = []

        for url_extension, local_category in category_paths:
            if category != local_category:
                continue

            url = base_url.format(url_extension)
            soup = BeautifulSoup(session.get(url).text, 'html.parser')
            products = soup.findAll('a', 'product-item-link')

            if not products:
                raise Exception('Empty path: ' + url)

            for product in products:
                try:
                    product_url = product['href']
                    product_urls.append(product_url)
                except KeyError:
                    continue

        return product_urls
Example #28
    def discover_urls_for_category(cls, category, extra_args=None):
        session = session_with_proxy(extra_args)
        offset = 1
        product_urls = []

        if category != 'StorageDrive':
            return []

        while True:
            category_url = 'https://listado.mercadolibre.com.ar/_Desde_{}{}' \
                           ''.format(offset, cls.store_id)
            print(category_url)

            soup = BeautifulSoup(session.get(category_url).text, 'html.parser')
            product_containers = soup.findAll('li', 'results-item')

            if not product_containers:
                if offset == 1:
                    raise Exception('Empty store: {}'.format(category_url))
                break

            for container in product_containers:
                product_urls.append(container.find('a')['href'])

            offset += 48

        return product_urls
Example #29
    def discover_urls_for_keyword(cls, keyword, threshold, extra_args=None):
        session = session_with_proxy(extra_args)
        session.headers['user-agent'] = 'curl/7.64.1'

        base_url = "https://www.falabella.com/falabella-cl/search?" \
                   "Ntt={}&page={}"

        discovered_urls = []
        page = 1
        while True:
            if page > 60:
                raise Exception('Page overflow ' + keyword)

            search_url = base_url.format(keyword, page)
            res = session.get(search_url, timeout=None)

            if res.status_code == 500:
                break

            soup = BeautifulSoup(res.text, 'html.parser')

            script = soup.find('script', {'id': '__NEXT_DATA__'})
            json_data = json.loads(script.text)

            for product_data in json_data['props']['pageProps']['results']:
                product_url = product_data['url']
                discovered_urls.append(product_url)

                if len(discovered_urls) == threshold:
                    return discovered_urls

            page += 1

        return discovered_urls
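
Example #29 never touches product markup: Falabella is a Next.js site, so each server-rendered page embeds its full search state as JSON in a script tag with id __NEXT_DATA__. A self-contained version of that extraction, using toy HTML that mirrors the props.pageProps.results path read above:

    import json
    from bs4 import BeautifulSoup

    html = '''<script id="__NEXT_DATA__" type="application/json">
    {"props": {"pageProps": {"results": [{"url": "https://example.com/p/1"}]}}}
    </script>'''

    soup = BeautifulSoup(html, 'html.parser')
    data = json.loads(soup.find('script', {'id': '__NEXT_DATA__'}).text)
    for result in data['props']['pageProps']['results']:
        print(result['url'])  # https://example.com/p/1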
Example #30
    def discover_urls_for_category(cls, category, extra_args=None):
        category_paths = [
            ['4232-refrigeradoras', 'Refrigerator'],
            ['72-microondas', 'Oven'],
            ['93-lavadoras', 'WashingMachine'],
            ['94-secadoras', 'WashingMachine'],
            ['95-lavadoras-y-secadoras-todo-en-1', 'WashingMachine'],
            ['309-televisores', 'Television'],
            ['2849-parlantes', 'StereoSystem'],
            ['4248-micro-y-mini-componentes', 'StereoSystem'],
            ['4249-barras-de-sonido-y-teatros-en-casa', 'StereoSystem'],
            ['4251-celulares-y-tablets', 'Cell']
        ]

        session = session_with_proxy(extra_args)
        base_url = 'https://www.sukasa.com/{}?q=Marca-LG'
        product_urls = []

        for url_extension, local_category in category_paths:
            if category != local_category:
                continue

            url = base_url.format(url_extension)
            soup = BeautifulSoup(session.get(url).text, 'html.parser')
            products = soup.findAll('div', 'product-container')

            if not products:
                raise Exception('Empty path: ' + url)

            for product in products:
                product_url = product.find('a')['href']
                product_urls.append(product_url)

        return product_urls