def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a single product page, querying stock through the
    store's form-encoded AJAX endpoint."""
    session = session_with_proxy(extra_args)
    session.headers['user-agent'] = 'python-requests/2.21.0'
    response = session.get(url)
    if response.status_code == 404:
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    name_tag = soup.find('h1', {'id': 'nombre-producto'})
    # Pages without a body or a product title are treated as unavailable
    if not soup.find('body') or not name_tag:
        return []
    name = name_tag.text.strip()
    sku = soup.find('div', {'itemprop': 'sku'}).text.strip()
    # Stock must be fetched with XHR-style headers on a separate session
    ajax_session = session_with_proxy(extra_args)
    ajax_session.headers['user-agent'] = 'python-requests/2.21.0'
    ajax_session.headers['x-requested-with'] = 'XMLHttpRequest'
    ajax_session.headers['content-type'] = \
        'application/x-www-form-urlencoded'
    stock_response = ajax_session.post(
        'https://catalogo.movistar.cl/fullprice/stockproducto/validar/',
        'sku=' + sku)
    stock = json.loads(stock_response.text)['respuesta']['cantidad']
    price_tag = soup.find('span', 'special-price').find('p')
    price = Decimal(remove_words(price_tag.text))
    description = html_to_markdown(
        str(soup.find('div', 'detailed-desktop')))
    if 'seminuevo' in description:
        condition = 'https://schema.org/RefurbishedCondition'
    else:
        condition = 'https://schema.org/NewCondition'
    picture_urls = [
        soup.find('meta', {'property': 'og:image'})['content']]
    return [
        Product(name, cls.__name__, category, url, url, sku, stock,
                price, price, 'CLP', condition=condition, sku=sku,
                description=description, picture_urls=picture_urls)
    ]
def discover_urls_for_category(cls, category, extra_args=None):
    """Return product URLs for the requested category by paginating
    each matching listing until an empty page is found.

    Fix: the original built the same proxied session (with identical
    headers) twice in a row; the duplicate construction is removed.
    """
    category_paths = [
        # Discos rígidos -> Externos
        ('Id_Subrubro=338', 'ExternalStorageDrive'),
        # Discos rígidos -> Internos
        ('Id_Subrubro=336', 'StorageDrive'),
        # Discos rígidos -> SSD
        ('Id_Subrubro=495', 'SolidStateDrive'),
        # Memorias -> SSD
        ('Id_Subrubro=611', 'SolidStateDrive'),
        # Memorias -> Micro SD Card
        ('Id_Subrubro=396', 'MemoryCard'),
        # ('Id_Subrubro=298', 'MemoryCard'),  # Memorias -> SD Card
        # Pen Drives
        ('Id_Rubro=2', 'UsbFlashDrive'),
    ]
    product_urls = []
    session = session_with_proxy(extra_args)
    session.headers['Content-Type'] = 'application/x-www-form-urlencoded'
    for category_path, local_category in category_paths:
        if local_category != category:
            continue
        page = 0
        while True:
            category_url = \
                'http://www.stylus.com.ar/productos.php?{}&pag={}' \
                ''.format(category_path, page)
            print(category_url)
            response = cls._retrieve_page(
                session, category_url, extra_args)
            soup = BeautifulSoup(response.text, 'html.parser')
            products_container = soup.find('div', 'prod-lista')
            if not products_container:
                # First page empty means the category path is broken
                if page == 0:
                    raise Exception('Empty category: {}'.format(
                        category_url))
                break
            for product_cell in products_container.findAll('li'):
                product_url = 'http://www.stylus.com.ar/' + \
                    product_cell.find('a')['href'].replace('&Menu=', '')
                product_urls.append(product_url)
            page += 1
    return product_urls
def products_for_url(cls, url, category=None, extra_args=None):
    """Build a single Product from its detail page markup."""
    session = session_with_proxy(extra_args)
    document = BeautifulSoup(session.get(url).text, 'html.parser')
    name = document.find('meta', {'name': 'Title'})['content'].strip()
    sku = document.find('input', {'name': 'id'})['value'].strip()
    price = Decimal(
        document.find('input', {'id': 'product_price'})['value'])
    description = html_to_markdown(
        str(document.find('div', {'id': 'especificaciones-container'})))
    carousel = document.find('div', 'owl-carousel')
    picture_urls = [img['data-zoom-image']
                    for img in carousel.findAll('img')]
    # Stock is not published; -1 means "available, quantity unknown"
    return [Product(
        name, cls.__name__, category, url, url, sku, -1, price, price,
        'ARS', sku=sku, description=description,
        picture_urls=picture_urls)]
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a product page; non-LG products are reported as
    out of stock."""
    print(url)
    session = session_with_proxy(extra_args)
    document = BeautifulSoup(session.get(url).text, 'html.parser')
    name = document.find('h1', 'product_title').text.strip()
    sku = document.find('span', 'sku').text.strip()
    # Only LG-branded products are considered purchasable
    stock = -1 if 'LG' in name.upper().split(' ') else 0
    amount_tag = document.find('p', 'price').find('span', 'amount')
    price = Decimal(
        amount_tag.text.replace('₲.', '').replace('.', ''))
    # NOTE(review): og:image is normally emitted as a "property"
    # attribute; this lookup assumes the site uses "name" — confirm
    # against the live markup.
    picture_urls = [
        document.find('meta', {'name': 'og:image'})['content']]
    return [
        Product(name, cls.__name__, category, url, url, sku, stock,
                price, price, 'PYG', sku=sku,
                picture_urls=picture_urls)
    ]
def products_for_url(cls, url, category=None, extra_args=None):
    """Expand a product page into all of its sibling variants
    (color / size / capacity) and return one Product per variant.

    Fix: corrected the typo "Unkown" in the exception message.
    """
    session = session_with_proxy(extra_args)
    response = session.get(url, timeout=20)
    # Redirected or missing pages are treated as unavailable
    if response.url != url or response.status_code == 404:
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    model_id = soup.find('div', 'pdp-conts-area')['id']
    model_data = cls._retrieve_api_model(model_id)
    sibling_ids = [model_id]
    for sibling_group in model_data['siblings']:
        if sibling_group['siblingType'] not in ('COLOR', 'SIZE',
                                                'CAPACITY'):
            raise Exception('Unknown sibling type for: ' + url)
        for sibling in sibling_group['siblingModels']:
            # Deduplicate while preserving discovery order
            if sibling['modelId'] not in sibling_ids:
                sibling_ids.append(sibling['modelId'])
    products = []
    for sibling_id in sibling_ids:
        sibling = cls._retrieve_single_product(sibling_id, category)
        if sibling:
            products.append(sibling)
    return products
def discover_urls_for_category(cls, category, extra_args=None):
    """List product URLs for the given category on the Intcomex
    store."""
    category_paths = [
        ('cpt.notebook', 'Notebook'),   # Portatiles
        ('cpt.ultrabook', 'Notebook'),  # Ultrabooks
        ('cpt.allone', 'AllInOne'),     # all-en-uno
    ]
    session = session_with_proxy(extra_args)
    product_urls = []
    for category_path, local_category in category_paths:
        if local_category != category:
            continue
        category_url = 'http://store.intcomex.com/es-XCL/Products/' \
                       'ByCategory/{}?rpp=1000'.format(category_path)
        print(category_url)
        soup = cls._retrieve_page(session, category_url, extra_args)
        containers = soup.findAll('div', 'productArea')
        if not containers:
            raise Exception('Empty category: ' + category_url)
        for container in containers:
            product_urls.append(
                'http://store.intcomex.com' + container.find('a')['href'])
    return product_urls
def discover_urls_for_category(cls, category, extra_args=None):
    """Collect product URLs from the Sony store category listings."""
    category_paths = [
        ('televisores-y-teatro-en-casa/televisores', 'Television'),
        ('celulares-y-tablets/smartphones-xperia', 'Cell'),
        ('camaras/cyber-shot', 'Camera'),
        ('audio/sistemas-de-audio', 'StereoSystem'),
        ('televisores-y-teatro-en-casa/reproductores-de-blu-ray-disc'
         '-y-dvd', 'OpticalDiskPlayer'),
        ('televisores-y-teatro-en-casa/teatro-en-casa', 'StereoSystem'),
        ('audio/audifonos', 'Headphones'),
    ]
    session = session_with_proxy(extra_args)
    product_urls = []
    for category_path, local_category in category_paths:
        if local_category != category:
            continue
        category_url = 'https://store.sony.cl/{}?PS=48'.format(
            category_path)
        soup = BeautifulSoup(session.get(category_url).text,
                             'html.parser')
        cells = soup.findAll('div', 'prod')
        # Empty listings are logged but do not abort the scan
        if not cells:
            logging.warning('Empty category: ' + category_url)
        product_urls.extend(cell.find('a')['href'] for cell in cells)
    return product_urls
def discover_urls_for_category(cls, category, extra_args=None):
    """Paginate the AJAX listing endpoint until product URLs start
    repeating (the site keeps serving the last page forever)."""
    category_paths = [
        # ['empresa/workstation/notebook.html', 'Notebook'],
        ('empresa/notebook-comercial.html', 'Notebook'),
        ('pc-y-portatiles/portatiles.html', 'Notebook'),
        ('empresa/plotters.html', 'Printer'),
        ('impresion-e-imagen/impresoras-de-tinta.html', 'Printer'),
        # ['impresion-e-imagen/impresoras-laser.html', 'Printer'],
        ('impresion-e-imagen/multifuncionales.html', 'Printer'),
        ('impresion-e-imagen/multifuncionales-laser.html', 'Printer'),
        # ['pc-y-portatiles/escritorio.html', 'AllInOne'],
        # ['audio/teclados-y-mouse.html', 'Mouse'],
        # ['audio/parlantes.html', 'StereoSystem'],
        ('monitores/monitores.html', 'Monitor'),
    ]
    session = session_with_proxy(extra_args)
    session.headers['X-Requested-With'] = 'XMLHttpRequest'
    product_urls = []
    for category_path, local_category in category_paths:
        if local_category != category:
            continue
        seen_urls = []
        page = 1
        while True:
            category_url = 'https://www.scglobal.cl/index.php/{}?p={}' \
                           ''.format(category_path, page)
            print(category_url)
            # Defensive cap against runaway pagination
            if page >= 10:
                raise Exception('Page overflow: ' + category_url)
            payload = json.loads(
                session.get(category_url, verify=False).text)
            soup = BeautifulSoup(payload['listing'], 'html.parser')
            cells = soup.findAll('li', 'item')
            if not cells and page == 1:
                raise Exception('Empty category: ' + category_url)
            repeated = False
            for cell in cells:
                candidate = cell.find('a')['href']
                if candidate in seen_urls:
                    repeated = True
                    break
                seen_urls.append(candidate)
            if repeated:
                break
            page += 1
        product_urls.extend(seen_urls)
    return product_urls
def discover_urls_for_category(cls, category, extra_args=None):
    """Return the URLs listed under the matching category page."""
    category_paths = [
        ('hogar/ampolletas', 'Lamp'),             # Ampolletas LED
        ('hogar/proyectores', 'LightProjector'),  # Proyectores LED
        ('hogar/equipos-y-tubos', 'LightTube'),   # Tubos LED
    ]
    session = session_with_proxy(extra_args)
    discovery_urls = []
    for category_path, local_category in category_paths:
        if local_category != category:
            continue
        category_url = 'http://ledlightchile.cl/categoria-producto/{}' \
                       ''.format(category_path)
        soup = BeautifulSoup(session.get(category_url).text,
                             'html.parser')
        for cell in soup.findAll('div', 'wf-cell'):
            discovery_urls.append(cell.find('a')['href'])
    return discovery_urls
def discover_urls_for_category(cls, category, extra_args=None):
    """Scrape product URLs for the store's storage categories."""
    category_paths = [
        ('hardware/hard-disk/hd-sata-iii', 'StorageDrive'),
        ('hardware/hard-disk/ssd', 'SolidStateDrive'),
    ]
    session = session_with_proxy(extra_args)
    product_urls = []
    for category_path, local_category in category_paths:
        if local_category != category:
            continue
        category_url = 'https://www.terabyteshop.com.br/{}' \
                       ''.format(category_path)
        soup = BeautifulSoup(session.get(category_url).text,
                             'html.parser')
        boxes = soup.findAll('div', 'pbox')
        if not boxes:
            raise Exception('Empty category: ' + category_url)
        product_urls.extend(
            'https://www.terabyteshop.com.br' + box.find('a')['href']
            for box in boxes)
    return product_urls
def discover_urls_for_category(cls, category, extra_args=None):
    """Paginate each LG-filtered collection and gather product URLs."""
    category_filters = [
        ('tv', 'Television'),
        ('celulares', 'Cell'),
        ('equipos-de-sonido', 'StereoSystem'),
        ('barras-de-sonido', 'StereoSystem'),
        ('bocinas', 'StereoSystem'),
        ('aires-acondicionados/Inverter', 'AirConditioner'),
        ('aires-acondicionados/Básico', 'AirConditioner'),
        ('estufas', 'Stove'),
        ('lavadoras', 'WashingMachine'),
        ('secadoras', 'WashingMachine'),
        # ('centro-de-lavado', 'WashingMachine'),
        ('refrigeradoras', 'Refrigerator'),
        # ('congeladores', 'Refrigerator'),
        # ('microondas', 'Oven'),
        # ('hornos', 'Oven'),
        ('monitores', 'Monitor'),
    ]
    session = session_with_proxy(extra_args)
    product_urls = []
    for category_path, local_category in category_filters:
        if local_category != category:
            continue
        page = 1
        # The original's "done" flag never changed, so a plain loop
        # with explicit breaks is behaviorally equivalent
        while True:
            if page >= 10:
                raise Exception('Page overflow')
            url = 'https://www.multimax.net/collections/{}?page={}' \
                .format(category_path, page)
            print(url)
            soup = BeautifulSoup(session.get(url).text, 'html5lib')
            listing = soup.find('div', 'collection-products')
            articles = listing.findAll('article', 'item')
            if not articles:
                if page == 1:
                    raise Exception('No products for category {}'
                                    .format(category))
                break
            for article in articles:
                # Keep only LG-branded items
                if 'LG' not in article.find('div', 'vendor').text.upper():
                    continue
                product_urls.append('https://www.multimax.net{}'.format(
                    article.find('a')['href']))
            page += 1
    return product_urls
def discover_urls_for_category(cls, category, extra_args=None):
    """Derive product URLs from the LG listings of each category."""
    category_paths = [
        ('Televisores', 'Television'),
        ('Aires-Split', 'AirConditioner'),
        ('Lavadoras', 'WashingMachine'),
        # ['Equipos-de-sonido', 'StereoSystem'],
        ('Refrigeradoras', 'Refrigerator'),
        ('Cocinas', 'Oven'),
    ]
    session = session_with_proxy(extra_args)
    base_url = 'https://www.almaceneslaganga.com/' \
               'pedidos-en-linea/efectivo/{}/LG'
    product_urls = []
    for url_extension, local_category in category_paths:
        if category != local_category:
            continue
        url = base_url.format(url_extension)
        soup = BeautifulSoup(session.get(url).text, 'html.parser')
        entries = soup.findAll('div', 'esquema_producto')
        if not entries:
            raise Exception('Empty path: ' + url)
        for entry in entries:
            # The product slug lives on the details button attribute
            slug = entry.find('button', 'btn-detalles')['producto']
            product_urls.append(
                'https://www.almaceneslaganga.com/'
                'pedidos-en-linea/efectivo/{}'.format(slug))
    return product_urls
def discover_urls_for_category(cls, category, extra_args=None):
    """Collect product URLs from the Belight category pages."""
    url_extensions = [
        ('ampolletas-led', 'Lamp'),             # Ampolletas LED
        ('proyectores-led', 'LightProjector'),  # Proyectores LED
        ('tubos-led', 'LightTube'),             # Tubos LED
    ]
    session = session_with_proxy(extra_args)
    product_urls = []
    for category_path, local_category in url_extensions:
        if local_category != category:
            continue
        category_url = 'http://www.belight.cl/productos/categoria/{}' \
                       ''.format(category_path)
        soup = BeautifulSoup(session.get(category_url).text,
                             'html.parser')
        for box in soup.findAll('div', 'producto'):
            product_urls.append(
                'http://www.belight.cl' + box.find('a')['href'])
    return product_urls
def discover_urls_for_category(cls, category, extra_args=None):
    """Fetch every product URL of the category in a single request
    (the endpoint accepts a very large resultsPerPage)."""
    category_paths = [
        ('3074457345616709688', 'Notebook'),
    ]
    session = session_with_proxy(extra_args)
    product_urls = []
    for category_id, local_category in category_paths:
        if local_category != category:
            continue
        url = 'http://www.efe.com.pe/webapp/wcs/stores/servlet/' \
              'ProductListingView?resultsPerPage=1000&storeId=10152&' \
              'categoryId=' + category_id
        soup = BeautifulSoup(session.get(url).text, 'html.parser')
        cells = soup.findAll('div', 'product')
        if not cells:
            raise Exception('Empty category: ' + url)
        product_urls.extend(cell.find('a')['href'] for cell in cells)
    return product_urls
def discover_urls_for_category(cls, category, extra_args=None):
    """Scrape product URLs after stripping the IE conditional
    comments that confuse the HTML parser."""
    base_url = 'https://www.costco.com.mx'
    category_paths = [
        ('negocios-y-papeleria/accesorios-de-escritorio/'
         'unidades-de-almacenamiento', 'MemoryCard'),
        ('electronica-y-computo/computacion/discos-duros-y-memorias',
         'ExternalStorageDrive'),
    ]
    session = session_with_proxy(extra_args)
    product_urls = []
    for category_path, local_category in category_paths:
        if local_category != category:
            continue
        url_webpage = '{}/view/c/{}'.format(base_url, category_path)
        # Remove <!--[if ...]> ... <![endif]--> blocks before parsing
        markup = re.sub(r'(<!--\[if.[\s|\S]*<!\[endif\]-->)', '',
                        session.get(url_webpage).text)
        soup = BeautifulSoup(markup, 'html.parser')
        for item in soup.findAll('div', 'productList_item'):
            product_urls.append(base_url + item.find('a')['href'])
    return product_urls
def discover_urls_for_category(cls, category, extra_args=None):
    """Collect product URLs, trimming trailing query arguments."""
    url_base = 'http://www.hiraoka.com.pe/'
    category_paths = [
        # ['029', 'Notebook'],  # Notebooks
        # ['031', 'Notebook'],  # Convertibles
        ('123', 'UsbFlashDrive'),
    ]
    session = session_with_proxy(extra_args)
    product_urls = []
    for category_path, local_category in category_paths:
        if local_category != category:
            continue
        category_url = '{}productlist.php?ss={}'.format(
            url_base, category_path)
        soup = BeautifulSoup(session.get(category_url).text,
                             'html.parser')
        for item in soup.findAll('div', 'proditem'):
            full_url = url_base + item.find('a')['href']
            # Drop everything from the "&n" argument onwards
            product_urls.append(full_url.split('&n')[0])
    return product_urls
def discover_urls_for_category(cls, category, extra_args=None):
    """Collect normalized product URLs for the lighting categories."""
    category_paths = [
        ('iluminacion/lamparas-led', 'Lamp'),               # Ampolletas
        ('iluminacion/proyectores-led', 'LightProjector'),  # Proyectores
    ]
    session = session_with_proxy(extra_args)
    product_urls = []
    for category_path, local_category in category_paths:
        if local_category != category:
            continue
        category_url = 'https://www.gobantes.cl/{}?limit=200'.format(
            category_path)
        # TLS verification is disabled as in the original request
        soup = BeautifulSoup(
            session.get(category_url, verify=False).text, 'html.parser')
        images = soup.findAll('div', 'image')
        if not images:
            raise Exception('Empty category: ' + category_url)
        for image in images:
            href = image.find('a')['href']
            # Strip the pagination argument and canonicalize the host
            product_urls.append(
                href.replace('&limit=200', '').replace(
                    'https://gobantes.cl/', 'https://www.gobantes.cl/'))
    return product_urls
def discover_entries_for_category(cls, category, extra_args=None):
    """Map subcategory URLs to section / positioning metadata for the
    requested category."""
    session = session_with_proxy(extra_args)
    session.headers['User-Agent'] = 'curl'
    category_paths = [
        ('mac', ['Notebook'], 'Mac', 1),
        ('ipad', ['Tablet'], 'iPad', 1),
        ('iphone', ['Cell'], 'iPhone', 1),
    ]
    discovered_entries = defaultdict(lambda: [])
    for category_path, local_categories, section_name, category_weight \
            in category_paths:
        if category not in local_categories:
            continue
        category_url = 'https://www.maconline.com/t/{}'.format(
            category_path)
        print(category_url)
        soup = BeautifulSoup(session.get(category_url).text,
                             'html.parser')
        subcategories = soup.find('ul', 'list-unstyled').findAll('li')
        for idx, subcategory in enumerate(subcategories):
            # Strip query string so the URL is a stable key
            subcategory_url = 'https://www.maconline.com{}'.format(
                subcategory.find('a')['href'].split('?')[0])
            discovered_entries[subcategory_url].append({
                'category_weight': category_weight,
                'section_name': section_name,
                'value': idx + 1,
            })
    return discovered_entries
def discover_urls_for_category(cls, category, extra_args=None):
    """Walk the paginated cell listing, 18 products per request."""
    product_urls = []
    if category == 'Cell':
        session = session_with_proxy(extra_args)
        offset = 0
        while True:
            category_url = 'https://tienda.clarochile.cl/webapp/wcs/' \
                           'stores/servlet/CategoryDisplay?categoryId=' \
                           '10008&pageSize=18&storeId=10151&beginIndex=' \
                           '{}'.format(offset)
            print(category_url)
            soup = BeautifulSoup(
                session.get(category_url, verify=False).text,
                'html.parser')
            listing = soup.find('div', 'product_listing_container')
            cells = listing.findAll('div', 'product')
            if not cells:
                # Nothing at all on the first page means a broken URL
                if offset == 0:
                    raise Exception('Empty list')
                break
            product_urls.extend(
                cell.find('a')['href'] for cell in cells)
            offset += 18
    return product_urls
def products_for_url(cls, url, category=None, extra_args=None):
    """Build a Product from the page, adding Chilean VAT to the
    listed net price."""
    session = session_with_proxy(extra_args)
    document = BeautifulSoup(session.get(url).text, 'html.parser')
    name = document.find('h1', 'entry-title').text.strip()
    sku = document.find('input', {'name': 'product_id'})['value'].strip()
    description = html_to_markdown(
        str(document.find('div', 'product_description')))
    picture_urls = [anchor['href']
                    for anchor in document.findAll('a', 'thickbox')]
    net_price = Decimal(
        remove_words(document.find('span', 'currentprice').text))
    # Listed prices are net: add 19% VAT and round to an integer
    price = (net_price * Decimal('1.19')).quantize(0)
    return [Product(name, cls.__name__, category, url, url, sku, -1,
                    price, price, 'CLP', sku=sku,
                    description=description,
                    picture_urls=picture_urls)]
def discover_urls_for_category(cls, category, extra_args=None):
    """Collect product URLs from the Eglo search listing."""
    category_paths = [
        ('Ampolletas+LED', 'Lamp'),  # Ampolletas LED
    ]
    session = session_with_proxy(extra_args)
    product_urls = []
    for category_path, local_category in category_paths:
        if local_category != category:
            continue
        category_url = 'http://www.eglo.cl/productos?' \
                       'subgrupo_desc_buscar%5B%5D={}'.format(
                           category_path)
        soup = BeautifulSoup(session.get(category_url).text,
                             'html.parser')
        previews = soup.findAll('div', 'product-preview-wrapper')
        if not previews:
            raise Exception('Empty category: ' + category_url)
        product_urls.extend(
            'http://www.eglo.cl' + preview.find('a')['href']
            for preview in previews)
    return product_urls
def discover_urls_for_keyword(cls, keyword, threshold, extra_args=None):
    """Search by keyword, returning at most `threshold` in-stock
    product URLs."""
    session = session_with_proxy(extra_args)
    product_urls = []
    page = 1
    while True:
        # Defensive cap against runaway pagination
        if page >= 40:
            raise Exception('Page overflow: ' + keyword)
        url = 'https://www.corona.cl/buscapagina?ft={}&PS=15&' \
              'sl=4e4d7aaa-6b5b-4390-8d3a-e6ce5e306488&cc=3&sm=0' \
              '&PageNumber={}'.format(keyword, page)
        print(url)
        soup = BeautifulSoup(session.get(url).text, 'html.parser')
        blocks = soup.findAll('div', 'product')
        if not blocks:
            break
        for block in blocks:
            # Skip products flagged as unavailable
            if block.find('div', 'outOfStock'):
                continue
            product_urls.append(block.find('a')['href'])
            if len(product_urls) == threshold:
                return product_urls
        page += 1
    return product_urls
def discover_urls_for_category(cls, category, extra_args=None):
    """Query LG's category API (active and discontinued statuses) and
    keep the products that carry a where-to-buy flag."""
    discovered_urls = []
    session = session_with_proxy(extra_args)
    session.headers['content-type'] = 'application/x-www-form-urlencoded'
    endpoint_url = 'https://www.lg.com/{}/mkt/ajax/category/' \
                   'retrieveCategoryProductList'.format(cls.region_code)
    for category_id, local_category, is_active in cls._category_paths():
        if local_category != category:
            continue
        status = 'ACTIVE' if is_active else 'DISCONTINUED'
        payload = 'categoryId={}&modelStatusCode={}&bizType=B2C&viewAll' \
                  '=Y'.format(category_id, status)
        json_response = json.loads(
            session.post(endpoint_url, payload).text)
        product_entries = json_response['data'][0]['productList']
        if not product_entries:
            raise Exception('Empty category: {} - {}'.format(
                category_id, is_active))
        for product_entry in product_entries:
            if product_entry['whereToBuyFlag'] == 'Y':
                discovered_urls.append(
                    cls.base_url + product_entry['modelUrlPath'])
    return discovered_urls
def products_for_url(cls, url, category=None, extra_args=None):
    """Build a Product from a Magento-style detail page."""
    session = session_with_proxy(extra_args)
    document = BeautifulSoup(session.get(url).text, 'html.parser')
    name = document.find('div', 'product-name').text.strip()
    sku = document.find('input', {'name': 'product'})['value'].strip()
    # Prices use "." as thousands separator and "," for decimals
    raw_price = document.find('span', 'price').text
    price = Decimal(
        raw_price.replace('.', '').replace('$', '').replace(',', '.'))
    description = html_to_markdown(
        str(document.find('div', 'product-collateral')))
    picture_urls = [img['src']
                    for img in document.findAll('img', {'id': 'image'})]
    return [Product(
        name, cls.__name__, category, url, url, sku, -1, price, price,
        'ARS', sku=sku, description=description,
        picture_urls=picture_urls)]
def discover_urls_for_category(cls, category, extra_args=None):
    """POST the listing endpoint to fetch all products of a category
    in one oversized page."""
    category_codes = [
        ('13147', 'UsbFlashDrive'),
        ('13341', 'ExternalStorageDrive'),
    ]
    session = session_with_proxy(extra_args)
    session.headers['Content-Type'] = 'application/x-www-form-urlencoded'
    product_urls = []
    for category_code, local_category in category_codes:
        if local_category != category:
            continue
        category_url = 'http://www.coppel.com/ProductListingView?' \
                       'storeId=12761&categoryId=' + category_code
        response = session.post(category_url, data='pageSize=1000')
        soup = BeautifulSoup(response.text, 'html.parser')
        cells = soup.findAll('div', 'product')
        if not cells:
            raise Exception('Empty category: ' + category_code)
        product_urls.extend(cell.find('a')['href'] for cell in cells)
    return product_urls
def discover_urls_for_category(cls, category, extra_args=None):
    """Collect product URLs for each mapped category path."""
    category_paths = [
        ('procesador-2', 'Processor'),
        ('placa-madre', 'Motherboard'),
        ('tarjeta-de-video', 'VideoCard'),
        ('disco-de-estado-solido', 'SolidStateDrive'),
        ('fuente-de-poder', 'PowerSupply'),
        ('mouse-y-teclados-2', 'Mouse'),
    ]
    session = session_with_proxy(extra_args)
    # A desktop browser user agent is sent (presumably to avoid bot
    # filtering — unconfirmed)
    session.headers['User-Agent'] = \
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
        '(KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'
    product_urls = []
    for category_path, local_category in category_paths:
        if local_category != category:
            continue
        url = 'https://aopc.cl/categoria/{}/?post_type=product'.format(
            category_path)
        soup = BeautifulSoup(session.get(url).text, 'html.parser')
        items = soup.findAll('li', 'product-col')
        if not items:
            raise Exception('Empty path: {}'.format(url))
        product_urls.extend(item.find('a')['href'] for item in items)
    return product_urls
def discover_urls_for_category(cls, category, extra_args=None):
    """Collect product URLs from the LG-filtered Artefacta listings.

    Fix: replaced the try/except KeyError around the ['href'] lookup
    with an explicit `.get` check — same behavior (anchors without an
    href are skipped), clearer intent.
    """
    category_paths = [
        ('linea-blanca/cocinas.html', 'Stove'),
        ('linea-blanca/refrigeracion.html', 'Refrigerator'),
        ('linea-blanca/lavadoras-y-secadoras.html', 'WashingMachine'),
        ('climatizacion/aires-acondicionados.html', 'AirConditioner'),
        ('audio/minicomponentes.html', 'StereoSystem'),
        ('televisores/hd.html', 'Television'),
        ('televisores/4k.html', 'Television'),
    ]
    session = session_with_proxy(extra_args)
    base_url = 'https://www.artefacta.com/productos/{}?at_marca=LG'
    product_urls = []
    for url_extension, local_category in category_paths:
        if category != local_category:
            continue
        url = base_url.format(url_extension)
        soup = BeautifulSoup(session.get(url).text, 'html.parser')
        products = soup.findAll('a', 'product-item-link')
        if not products:
            raise Exception('Empty path: ' + url)
        for product in products:
            # Skip anchors without an href attribute
            product_url = product.get('href')
            if product_url is None:
                continue
            product_urls.append(product_url)
    return product_urls
def discover_urls_for_category(cls, category, extra_args=None):
    """Walk the MercadoLibre store listing 48 results at a time."""
    session = session_with_proxy(extra_args)
    product_urls = []
    if category != 'StorageDrive':
        return []
    offset = 1
    while True:
        category_url = 'https://listado.mercadolibre.com.ar/_Desde_{}{}' \
                       ''.format(offset, cls.store_id)
        print(category_url)
        soup = BeautifulSoup(session.get(category_url).text,
                             'html.parser')
        cells = soup.findAll('li', 'results-item')
        if not cells:
            # Empty very first page means the store link is broken
            if offset == 1:
                raise Exception('Empty store: {}'.format(category_url))
            break
        product_urls.extend(cell.find('a')['href'] for cell in cells)
        offset += 48
    return product_urls
def discover_urls_for_keyword(cls, keyword, threshold, extra_args=None):
    """Search Falabella by keyword via the Next.js bootstrap JSON,
    returning at most `threshold` URLs."""
    session = session_with_proxy(extra_args)
    session.headers['user-agent'] = 'curl/7.64.1'
    base_url = "https://www.falabella.com/falabella-cl/search?" \
               "Ntt={}&page={}"
    discovered_urls = []
    page = 1
    while True:
        # Defensive cap against runaway pagination
        if page > 60:
            raise Exception('Page overflow ' + keyword)
        res = session.get(base_url.format(keyword, page), timeout=None)
        # A 500 response is treated as the end of the results
        if res.status_code == 500:
            break
        soup = BeautifulSoup(res.text, 'html.parser')
        # Product data is embedded in the __NEXT_DATA__ payload
        script = soup.find('script', {'id': '__NEXT_DATA__'})
        payload = json.loads(script.text)
        for entry in payload['props']['pageProps']['results']:
            discovered_urls.append(entry['url'])
            if len(discovered_urls) == threshold:
                return discovered_urls
        page += 1
    return discovered_urls
def discover_urls_for_category(cls, category, extra_args=None):
    """Collect product URLs from the LG-filtered category listings."""
    category_paths = [
        ('4232-refrigeradoras', 'Refrigerator'),
        ('72-microondas', 'Oven'),
        ('93-lavadoras', 'WashingMachine'),
        ('94-secadoras', 'WashingMachine'),
        ('95-lavadoras-y-secadoras-todo-en-1', 'WashingMachine'),
        ('309-televisores', 'Television'),
        ('2849-parlantes', 'StereoSystem'),
        ('4248-micro-y-mini-componentes', 'StereoSystem'),
        ('4249-barras-de-sonido-y-teatros-en-casa', 'StereoSystem'),
        ('4251-celulares-y-tablets', 'Cell'),
    ]
    session = session_with_proxy(extra_args)
    base_url = 'https://www.sukasa.com/{}?q=Marca-LG'
    product_urls = []
    for url_extension, local_category in category_paths:
        if category != local_category:
            continue
        url = base_url.format(url_extension)
        soup = BeautifulSoup(session.get(url).text, 'html.parser')
        containers = soup.findAll('div', 'product-container')
        if not containers:
            raise Exception('Empty path: ' + url)
        product_urls.extend(
            container.find('a')['href'] for container in containers)
    return product_urls