Example #1
            " ".join("".join(
                li.xpath('.//p[@class="pricePerUnit"]/text()')[0]).split()),
        }
        print(products[produrl])
        products[produrl]['price'] = getprice(products[produrl]['raw_price'])
        print(products[produrl])
print([(c, len(categories[c])) for c in categories])

for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0
    search_input_box_xpath = u'//*[@id="search"]'
    fpath = fpath_namer(shop_id, 'search', kw, 0)

    if not op.exists(fpath):
        if not driver.check_exists_by_xpath(search_input_box_xpath):
            # Getting back to root if search input box is not found
            driver.get(
                'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/'
            )
        driver.text_input(kw, search_input_box_xpath, enter=True)
        sleep(2)
        driver.save_page(fpath, scroll_to_bottom=True)

    # Storing and extracting infos
    tree = etree.parse(open(fpath, 'rb'), parser=parser)
    for li in tree.xpath('//ul[@class="productLister gridView"]/li'):
        produrl = li.xpath('.//h3/a/@href')[0]
        produrl = parse_qs(
            urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                urlsplit(produrl).query) else produrl
        products[produrl] = {
            'pdct_name_on_eretailer':
                li.xpath('.//div[@class="productName"]//text()')[0].strip(),
            'raw_price':
                "".join(li.xpath('.//div[@class="productCurrentPrice"]//text()')).replace('Now', ''),
        }
        # print(products[produrl])
        products[produrl]['price'] = getprice(products[produrl]['raw_price'])
        # print(products[produrl])
    # Going to next page if need be
    next_page_click = '//a[@class="resultsNext"]'
    if not op.exists(fpath_namer(shop_id, 'search', kw, 1)):
        if not driver.check_exists_by_xpath(next_page_click):
            break
        else:
            driver.waitclick(next_page_click)
    print(kw, len(products))
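
These snippets rely on a custom wrapper around Selenium's webdriver: check_exists_by_xpath, text_input, waitclick and save_page are not standard WebDriver methods. A minimal sketch of what such a wrapper could look like follows; the class name, constructor and exact behaviour are assumptions, not the original project's code.

from time import sleep
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


class CustomDriver:
    """Hypothetical thin wrapper providing the helpers used in the examples."""

    def __init__(self):
        self.driver = webdriver.Chrome()

    def get(self, url):
        self.driver.get(url)

    def check_exists_by_xpath(self, xpath):
        # True if at least one element matches the XPath on the current page.
        try:
            self.driver.find_element(By.XPATH, xpath)
            return True
        except NoSuchElementException:
            return False

    def text_input(self, text, xpath, enter=False):
        # Type `text` into the element located by `xpath`, optionally pressing Enter.
        el = self.driver.find_element(By.XPATH, xpath)
        el.clear()
        el.send_keys(text)
        if enter:
            el.send_keys(Keys.ENTER)

    def waitclick(self, xpath, timeout=10):
        # Retry the click for up to `timeout` seconds, e.g. while an overlay fades out.
        for _ in range(timeout):
            try:
                self.driver.find_element(By.XPATH, xpath).click()
                return
            except Exception:
                sleep(1)

    def save_page(self, fpath, scroll_to_bottom=False):
        # Dump the rendered HTML to disk so later runs can parse it offline.
        if scroll_to_bottom:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            sleep(1)
        with open(fpath, 'w', encoding='utf-8') as f:
            f.write(self.driver.page_source)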

# Difficult case: you have to click a button to get to the next page, and the query is sent via the search bar
for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0
    search_input_box_xpath = u'//*[@id="SimpleSearchForm_SearchTerm"]'
    fpath = fpath_namer(shop_id, 'search', kw, 0)
    if not op.exists(fpath):
        if not driver.check_exists_by_xpath(search_input_box_xpath):
            # Getting back to root if search input box is not found
            driver.get(root_url)
        driver.text_input(kw, search_input_box_xpath, enter=True)
        sleep(2)
        driver.save_page(fpath, scroll_to_bottom=True)

    # Storing and extracting infos
    tree = etree.parse(open(fpath, 'rb'), parser=parser)
    for li in tree.xpath('//div[@class="col-main-content"]//ul/li'):
        produrl = li.xpath('.//h2[@class="product-name"]/a/@href')[0]
        produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(urlsplit(produrl).query) else produrl
        products[produrl] = {
            'pdct_name_on_eretailer': "".join(li.xpath('.//h2[@class="product-name"]//text()')),
            'raw_price': ''.join(w for t in li.xpath('.//span[@class="price"]/text()') for w in t.split()).strip(),
        }
        print(products[produrl], produrl)
        products[produrl]['price'] = getprice(products[produrl]['raw_price'])
        print(products[produrl])
        searches[kw].append(produrl)
    # Going to next page if need be
    next_page_click = '//a[@class="next i-next"]'
    if not op.exists(fpath_namer(shop_id, 'search', kw, 1)):
        if not driver.check_exists_by_xpath(next_page_click):
            break
        else:
            driver.waitclick(next_page_click)
    print(kw, len(searches[kw]))
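
Every example converts the scraped raw_price string to a number with getprice(). That helper is not shown on this page; below is one plausible sketch, where the regex and the handling of comma decimals are assumptions and may differ from the original implementation.

import re


def getprice(raw_price):
    # Pull the first numeric token out of strings such as 'Now £18.00' or '12,50 €'.
    if not raw_price:
        return None
    m = re.search(r'\d+(?:[.,]\d{1,2})?', raw_price.replace('\xa0', ' '))
    if not m:
        return None
    return float(m.group(0).replace(',', '.'))


# Example: getprice('Now £18.00') -> 18.0, getprice('12,50 €') -> 12.5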


# KW searches Scraping - with selenium - one page per search
search_url = "http://twinliquors.com/shop/catalogsearch/result/?q={kw}"
for kw in keywords:
    searches[kw] = []
    # Storing and extracting infos
    fpath = fpath_namer(shop_id, 'search', kw, 0)
    url = search_url.format(kw=kw, page=0)
    if not op.exists(fpath):
        driver.get(url)
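
All of the examples cache every fetched page on disk and only open the browser when the file is missing, which is what fpath_namer(shop_id, kind, name, page) is used for. A minimal sketch of such a path helper is given below; the cache directory, file extension and naming scheme are assumptions, not the original project's code.

import os
import os.path as op


def fpath_namer(shop_id, kind, name, page, cache_dir='/tmp/scraping_cache'):
    # Build a deterministic cache path, e.g. /tmp/scraping_cache/sainsburys/search_gin_0.html
    folder = op.join(cache_dir, shop_id)
    os.makedirs(folder, exist_ok=True)
    safe_name = str(name).replace('/', '_').replace(' ', '_')
    return op.join(folder, '{}_{}_{}.html'.format(kind, safe_name, page))
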
Example #4
    'still_wines': 'https://www.hawesko.de/weisswein',
    'cognac': 'https://www.hawesko.de/spirituosen',
    'red_wine': 'https://www.hawesko.de/rotwein',
    'white_wine': 'https://www.hawesko.de/weisswein',
}

# Category Scraping - with selenium - one page per category
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    fpath = fpath_namer(shop_id, 'ctg', ctg, 0)
    if not op.exists(fpath):
        driver.get(url)
        for k in range(20):
            sleep(1.5)
            if driver.check_exists_by_xpath(
                    '//div[@class="article list loader"]//*[@class="button loading loaderbutton"]'
            ):
                driver.waitclick(
                    '//div[@class="article list loader"]//*[@class="button loading loaderbutton"]'
                )
                sleep(1)
            else:
                break
        driver.save_page(fpath)
    tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
    for li in tree.xpath('//div[@data-module="article"]'):
        if not li.xpath('.//div/a/@href'):
            break
        produrl = li.xpath('.//div/a/@href')[0]

        produrl = parse_qs(