# Category Scraping for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(20): urlp = url.format(page=p + 1) fpath = fpath_namer(shop_id, 'ctg', ctg, p) print(fpath, p, urlp) if not op.exists(fpath): driver.get(urlp) # driver.driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.7);") # print('sleeping') # sleep(10) # driver.waitclick('//*[contains(@class, "bb-modal-close-button")]', timeout=1, silent=False) driver.save_page(fpath, scroll_to_bottom=True) tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath('//div[@id="venta-main"]/div'): produrl = li.xpath('.//a/@href')[0] produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) products[produrl] = { 'pdct_name_on_eretailer': ' '.join(''.join(li.xpath('.//h2/a/text()')[:1]).split()), 'raw_price': ' '.join(''.join( li.xpath( './/div[@class="wineblock-leftprice"]//*[@class="uc-price"]//text()'
'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/rum-44#langId=44&storeId=10151&catalogId=10123&categoryId=340889&parent_category_rn=340854&top_category=340854&pageSize=108&orderBy=FAVOURITES_ONLY%7CSEQUENCING%7CTOP_SELLERS&searchTerm=&beginIndex=0', 'tequila': 'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/tequila-44#langId=44&storeId=10151&catalogId=10123&categoryId=340889&parent_category_rn=340854&top_category=340854&pageSize=108&orderBy=FAVOURITES_ONLY%7CSEQUENCING%7CTOP_SELLERS&searchTerm=&beginIndex=0', 'liquor': 'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/liqueurs---speciality-spirits#langId=44&storeId=10151&catalogId=10123&categoryId=340889&parent_category_rn=340854&top_category=340854&pageSize=108&orderBy=FAVOURITES_ONLY%7CSEQUENCING%7CTOP_SELLERS&searchTerm=&beginIndex=0', } # Categories scraping for ctg, url in urls_ctgs_dict.items(): print(ctg, url) categories[ctg] = [] fpath = fpath_namer(shop_id, 'ctg', ctg, 0) if not op.exists(fpath): driver.get(url) sleep(1) driver.save_page(fpath, scroll_to_bottom=True) tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath('//ul[@class="productLister gridView"]/li'): produrl = li.xpath('.//h3/a/@href')[0] produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl categories[ctg].append(produrl) products[produrl] = { 'pdct_name_on_eretailer': " ".join("".join( li.xpath( './/div[@class="productNameAndPromotions"]//h3//text()')). split()), 'raw_price': " ".join("".join(
try: driver.waitclick('//div[@class="mod_product_list__more"]/a', timeout=5, silent=True) except: pass # Wait to load page sleep(2) # Calculate new scroll height and compare with last scroll height new_height = driver.driver.execute_script( "return document.body.scrollHeight") if new_height == last_height: break last_height = new_height driver.save_page(fpath) tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser) for li in tree.xpath('//li[@class="mod_product_list__item"]'): produrl = li.xpath('.//h3/a/@href')[0] produrl = parse_qs( urlsplit(produrl).query)['url'][0] if 'url' in parse_qs( urlsplit(produrl).query) else produrl produrl = clean_url(produrl, root_url) products[produrl] = { 'pdct_name_on_eretailer': ' '.join(''.join(li.xpath('.//h3/a//text()')).split()), 'ctg_denom_txt': ' '.join(''.join(li.xpath('.//h3/a//text()')).split()), 'raw_price':