Example no. 1
# Category Scraping
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    fpath = fpath_namer(shop_id, 'ctg', ctg, 0)
    if not op.exists(fpath):
        driver.get(url)
        # Get scroll height
        last_height = driver.driver.execute_script(
            "return document.body.scrollHeight")
        while True:
            # Scroll down to bottom
            driver.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            try:
                driver.waitclick('//div[@class="mod_product_list__more"]/a',
                                 timeout=5,
                                 silent=True)
            except Exception:  # "load more" link absent; keep scrolling
                pass
            # Wait to load page
            sleep(2)

            # Calculate new scroll height and compare with last scroll height
            new_height = driver.driver.execute_script(
                "return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        driver.save_page(fpath)

    tree = etree.parse(open(fpath, 'rb'), parser=parser)
            qs = parse_qs(urlsplit(produrl).query)
            produrl = qs['url'][0] if 'url' in qs else produrl
            products[produrl] = {
                'pdct_name_on_eretailer': "".join(li.xpath('.//h2[@class="product-name"]//text()')),
                'raw_price': ''.join(w for t in li.xpath('.//span[@class="price"]/text()') for w in t.split()).strip(),
            }
            print(products[produrl], produrl)
            products[produrl]['price'] = getprice(products[produrl]['raw_price'])
            print(products[produrl])
            categories[ctg].append(produrl)
        # Going to next page if need be
        next_page_click = '//a[@class="next i-next"]'
        if not op.exists(fpath_namer(shop_id, 'ctg', ctg, p+1)):
            if not driver.check_exists_by_xpath(next_page_click):
                break
            else:
                driver.waitclick(next_page_click)
    print(ctg, url, p, len(categories[ctg]))
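
# Helper sketch: the scroll loop above (grow the page until
# document.body.scrollHeight stops changing) recurs in several examples below.
# A minimal reusable version, assuming the same wrapper object that exposes
# the raw Selenium driver as `driver.driver`:
from time import sleep

def scroll_until_stable(driver, pause=2):
    last_height = driver.driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(pause)  # give lazy-loaded content time to render
        new_height = driver.driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:  # height stable: nothing more to load
            break
        last_height = new_height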


# KW searches Scraping - with selenium - one page per search
search_url = "http://twinliquors.com/shop/catalogsearch/result/?q={kw}"
for kw in keywords:
    searches[kw] = []
    # Storing and extracting infos
    fpath = fpath_namer(shop_id, 'search', kw, 0)
    url = search_url.format(kw=kw)
    if not op.exists(fpath):
        driver.get(url)
        sleep(2)
        driver.save_page(fpath, scroll_to_bottom=True)
    tree = etree.parse(open(fpath, 'rb'), parser=parser)
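
# fpath_namer is not shown in these snippets; it evidently maps
# (shop_id, kind, name, page) to a cache-file path so that each page is only
# fetched once and re-parsed from disk afterwards. A hypothetical minimal
# version (the cache_dir default is an assumption):
import os.path as op

def fpath_namer(shop_id, kind, name, page, cache_dir='cache'):
    return op.join(cache_dir, '{}_{}_{}_{}.html'.format(shop_id, kind, name, page))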
Example no. 3
    'white_wine': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/wine/white_wine',
    'gin': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/spirits_and_liqueurs/gin',
    'tequila': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/spirits_and_liqueurs/tequila',
    'rum': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/spirits_and_liqueurs/rum',
    'liquor': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/spirits_and_liqueurs/liqueurs_and_aperitifs',
    'brandy': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/spirits_and_liqueurs/brandy',
}

# Category Scraping - with selenium - one page per category
for ctg, url in urls_ctgs_dict.items():
    count = 1
    categories[ctg] = []
    fpath = fpath_namer(shop_id, 'ctg', ctg, 0)
    if not op.exists(fpath):
        driver.get(url)
        driver.waitclick('//*[@class="closeNoticeSomethingDifferentPopup"]', timeout=4)
        last_height = driver.driver.execute_script("return document.body.scrollHeight")
        while True:
            sleep(1)
            driver.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # click the "load more" button; retry once with a shorter timeout
            driver.waitclick('//*[@data-actiontype="load"]', timeout=3)
            driver.waitclick('//*[@data-actiontype="load"]', timeout=0.5)
            new_height = driver.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        driver.save_page(fpath, scroll_to_bottom=True)
    tree = etree.parse(open(fpath, 'rb'), parser=parser)
    for li in tree.xpath('//article[@data-test="product-pod"]'):
        produrl = clean_url(li.xpath('.//a[h2]/@href')[0], root_url)
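
# clean_url is assumed to absolutise relative hrefs against the shop's root
# URL; a hypothetical sketch built on urljoin:
from urllib.parse import urljoin

def clean_url(url, root_url):
    return urljoin(root_url, url.strip())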
Example no. 4
}


# Category Scraping - with selenium - multiple pages per category (click on next page)
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)):
        driver.get(url)
    for p in range(100):
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        if not op.exists(fpath):
            sleep(2)
            driver.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            sleep(2)
            driver.waitclick('//button[@data-dismiss="modal"]', timeout=7)  # close any modal popup
            driver.save_page(fpath, scroll_to_bottom=False)
        tree = etree.parse(open(fpath, 'rb'), parser=parser)
        for li in tree.xpath('//article[@data-entity-type="product"]'):
            produrl = li.xpath('.//a[@data-ec-linklabel="Product Text"]/@href')[0]
            qs = parse_qs(urlsplit(produrl).query)
            produrl = qs['url'][0] if 'url' in qs else produrl
            products[produrl] = {
                'pdct_name_on_eretailer': " ".join("".join(li.xpath('.//h1//text()')).split()),
                # the volume is presumably embedded in the product name here, so the same string is stored
                'volume': " ".join("".join(li.xpath('.//h1//text()')).split()),
                'raw_price': ''.join(w for t in li.xpath('.//*[starts-with(@class, "price") and not(contains(@class, "crossed"))]//text()') for w in t.split()).strip().split('*')[0],
                'raw_promo_price': ''.join(w for t in li.xpath('.//*[starts-with(@class, "price") and contains(@class, "crossed")]//text()') for w in t.split()).strip(),
            }
            print(products[produrl], produrl)
            # the raw price apparently lacks a decimal separator (e.g. "€1234" for €12,34):
            # re-insert a comma before the last two digits before parsing
            tmp_price_str = products[produrl]['raw_price'].split("€")[1]
            print(tmp_price_str, "€" + tmp_price_str[:-2] + ',' + tmp_price_str[-2:])
            products[produrl]['price'] = getprice("€" + tmp_price_str[:-2] + ',' + tmp_price_str[-2:])
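
# getprice is assumed to normalise raw price strings such as "€12,34" or
# "$12.34" into a float; a hypothetical sketch (thousands separators and
# strings holding several amounts are not handled):
import re

def getprice(raw_price):
    m = re.search(r'\d+(?:[.,]\d{1,2})?', raw_price.replace('\xa0', ''))
    return float(m.group(0).replace(',', '.')) if m else None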
Example no. 5
        # if not r.from_cache:
        #     sleep(2)
print([(c, len(categories[c])) for c in categories])


# KW searches Scraping - with selenium - with search string - multiple page per search
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
search_url = "http://www.selfridges.com/GB/en/cat/foodhall/wines-spirits/"
for kw in keywords:
    print("Searching", kw)
    searches[kw] = []
    fpath = fpath_namer(shop_id, 'search', kw, 0)
    if not op.exists(fpath):
        driver.get(search_url)
        driver.waitclick('//*[@title="Search Selfridges..."]')
        # driver.waitclick('//*[@class="searchSubmit"]')
        actions = ActionChains(driver.driver)
        actions.send_keys(kw)
        actions.send_keys(Keys.ENTER)
        actions.perform()
        driver.save_page(fpath, scroll_to_bottom=True)

    tree = etree.parse(open(fpath, 'rb'), parser=parser)
    for li in tree.xpath('//div[@class="productsInner"]/div[not(@class="productContainerOrphan")]'):
        produrl = str(li.xpath('.//a/@href')[0])
        qs = parse_qs(urlsplit(produrl).query)
        produrl = qs['url'][0] if 'url' in qs else produrl
        produrl = clean_url(produrl, root_url)
        products[produrl] = {
            'pdct_name_on_eretailer': ' '.join(' '.join(li.xpath('.//div[@class="productContainerDesc"]/a//text()')).split()).strip(),
Example no. 6
    'white_wine': 'https://www.hawesko.de/weisswein',
}

# Category Scraping - with selenium - one page per category
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    fpath = fpath_namer(shop_id, 'ctg', ctg, 0)
    if not op.exists(fpath):
        driver.get(url)
        for _ in range(20):  # cap the number of "load more" clicks
            sleep(1.5)
            if driver.check_exists_by_xpath(
                    '//div[@class="article list loader"]//*[@class="button loading loaderbutton"]'
            ):
                driver.waitclick(
                    '//div[@class="article list loader"]//*[@class="button loading loaderbutton"]'
                )
                sleep(1)
            else:
                break
        driver.save_page(fpath)
    tree = etree.parse(open(fpath, 'rb'), parser=parser)
    for li in tree.xpath('//div[@data-module="article"]'):
        if not li.xpath('.//div/a/@href'):
            break
        produrl = li.xpath('.//div/a/@href')[0]

        qs = parse_qs(urlsplit(produrl).query)
        produrl = qs['url'][0] if 'url' in qs else produrl
        produrl = clean_url(produrl, root_url)
Example no. 7
                'raw_promo_price': ''.join(w for t in li.xpath('.//*[@class="thumbnail-priceOld"]//text()')[:3] for w in t.split()).strip(),
            }
            print(products[produrl], produrl)
            if products[produrl]['raw_price'].count('$') >= 2:
                # two '$' amounts were captured together: fall back to the promo-tagged price node
                products[produrl]['raw_price'] = ''.join(w for t in li.xpath('.//*[@class="thumbnail-price promo"]//text()')[:3] for w in t.split()).strip()
            products[produrl]['price'] = getprice(products[produrl]['raw_price'])
            products[produrl]['promo_price'] = getprice(products[produrl]['raw_promo_price'])
            print(products[produrl])
            categories[ctg].append(produrl)

        # Checking if it was the last page
        if len(set(categories[ctg])) == number_of_pdcts_in_ctg:
            break
        else:
            number_of_pdcts_in_ctg = len(set(categories[ctg]))
            driver.waitclick('//*[@class="glyphicon glyphicon-menu-right"]')

print([(c, len(categories[c])) for c in categories])
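
# The last-page check above recurs in several examples: keep clicking "next"
# until the deduplicated product-URL count stops growing. A minimal sketch of
# that loop shape (collect_urls and next_page_xpath are assumptions):
def paginate_until_stable(driver, collect_urls, next_page_xpath, max_pages=100):
    seen = set()
    for _ in range(max_pages):
        before = len(seen)
        seen.update(collect_urls())  # parse product URLs off the current page
        if len(seen) == before:      # no new products: that was the last page
            break
        driver.waitclick(next_page_xpath)
    return seen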


# KW searches Scraping - with selenium - one page per search
kw_search_url = "https://www.macave.leclerc/catalogsearch/result/?q={kw}"
for kw in keywords:
    print('Requesting', kw)
    searches[kw] = []
    fpath = fpath_namer(shop_id, 'search', kw, 0)
    if not op.exists(fpath):
        driver.get(kw_search_url.format(kw=kw))
        driver.waitclick('//div[@class="limiter"]/span[last()]')
        sleep(2)
        driver.save_page(fpath, scroll_to_bottom=True)
Example no. 8
                'raw_price':
                " ".join("".join(
                    li.xpath(
                        './/span[@class="price"]/span[last()]//text()|.//span[@class="wasprice"]/span[2]//text()'
                    )[:1]).split()),
            }
            print(products[produrl])
            products[produrl]['price'] = getprice(
                products[produrl]['raw_price'])
            print(products[produrl])
        if len(set(categories[ctg])) == number_of_pdcts_in_ctg:
            break
        else:
            number_of_pdcts_in_ctg = len(set(categories[ctg]))
            if scraping_launched:
                driver.waitclick(
                    '//a[contains(@class, "forward-listing btn")]')
                sleep(1)
print([(c, len(categories[c])) for c in categories])

# Easy case, where you scroll down to get the whole page
search_url = "https://groceries.asda.com/search/{kw}"
for kw in keywords:
    print(kw)
    searches[kw] = []
    # Storing and extracting infos
    fpath = fpath_namer(shop_id, 'search', kw, 0)
    url = search_url.format(kw=kw)
    if not op.exists(fpath):
        driver.get(url)
        sleep(3)
        driver.save_page(fpath, scroll_to_bottom=True)
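
# save_page(fpath, scroll_to_bottom=True) belongs to the assumed driver
# wrapper: presumably it scrolls once to trigger lazy loading, then writes
# driver.page_source to the cache file. A hypothetical sketch of the method:
def save_page(self, fpath, scroll_to_bottom=False):
    if scroll_to_bottom:
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(1)  # let lazy-loaded content render before snapshotting
    with open(fpath, 'w', encoding='utf-8') as f:
        f.write(self.driver.page_source)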
Example no. 9
            produrl = tr.xpath('.//*[contains(@class, "prodstitle")]/@href')[0]
            qs = parse_qs(urlsplit(produrl).query)
            produrl = qs['url'][0] if 'url' in qs else produrl
            produrl = clean_url(produrl, root_url)
            products[produrl] = {
                'volume': tr.xpath('.//*[contains(@class, "prodstitle")]/@title')[0],
                'pdct_name_on_eretailer': tr.xpath('.//*[contains(@class, "prodstitle")]/@title')[0],
                'raw_price': ''.join(tr.xpath('.//span[contains(@class, "prodsprice")]/text()')).strip(),
                'raw_promo_price': ''.join(tr.xpath('.//span[contains(@class, "prodsprice")]/s/text()')).strip(),
            }
            print(products[produrl], produrl)
            products[produrl]['price'] = getprice(products[produrl]['raw_price'])
            products[produrl]['promo_price'] = getprice(products[produrl]['raw_promo_price'])
            print(products[produrl])

            categories[ctg].append(produrl)
        if req_sent and not driver.waitclick('//div[contains(@class, "c data2")]/table[last()]//a[contains(text(),"{page}")]'.format(page=page+1)):
            break
    print(ctg, len(categories[ctg]))


######################################
# # KW searches scraping #############
######################################

# KW searches Scraping - with requests - page number in url - multiple pages per search
kw_search_url = 'https://www.b-21.com/searchprods.asp?searchstring={kw}&pagenumber={page}&val=0'
for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0
    req_sent = False
Example no. 10
# Category Scraping - with selenium - multiple pages per category (click on next page)
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    fpath = fpath_namer(shop_id, 'ctg', ctg, 0)
    if not op.exists(fpath):
        driver.get(url)
        click_trials = 0
        while True:
            driver.scroll_to_bottom()
            sleep(2)
            if driver.wait_for_xpath(
                    '//a[@class="btn load-products loading-button externalLink"]'
            ):
                driver.waitclick(
                    '//a[@class="btn load-products loading-button externalLink"]'
                )
                click_trials += 1
                if click_trials > 1:  # safety cap: give up after two "load more" clicks
                    break
            else:
                break
        driver.save_page(fpath)

    tree = etree.parse(open(fpath, 'rb'), parser=parser)
    # for li in tree.xpath('//li[contains(@id,"WC_CatalogSearchResultDisplay")]'):
    for li in tree.xpath('//div[@class="item isUpdated"]'):
        if not li.xpath('.//figure/a/@href'):
            continue
        # produrl = li.xpath('./a/@href')[0]
        produrl = li.xpath('.//figure/a/@href')[0]
Example no. 11
                'raw_price': ' '.join(''.join(li.xpath('.//span[@class="RegularPrice"]/text()')).split()),
                'raw_promo_price': ' '.join(''.join(li.xpath('.//span[@class="cart-price-strike"]//text()')).split()),
            }
            print(products[produrl], produrl)
            products[produrl]['price'] = getprice(products[produrl]['raw_price'])
            products[produrl]['promo_price'] = getprice(products[produrl]['raw_promo_price'])
            print(products[produrl])

            categories[ctg].append(produrl)

        # Checking if it was the last page
        if len(set(categories[ctg])) == number_of_pdcts_in_ctg:
            break
        else:
            number_of_pdcts_in_ctg = len(set(categories[ctg]))
            driver.waitclick('//b[contains(text(),">>")]')

# KW searches Scraping - with selenium - multiple pages per search
search_url = "https://buckhead.towerwinespirits.com/main.asp?request=SEARCH&search={kw}"
for kw in keywords:
    searches[kw] = []
    number_of_pdcts_in_kw_search = 0

    # Storing and extracting infos
    urlp = search_url.format(kw=kw)

    fpath = fpath_namer(shop_id, 'search', kw, 0)
    if not op.exists(fpath):
        driver.get(urlp)
        sleep(1)
        driver.save_page(fpath, scroll_to_bottom=True)
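
# waitclick is the wrapper method used throughout: wait until an XPath is
# clickable, then click it, raising (or staying silent) on timeout, which
# matches the try/except and silent=True uses above. A hypothetical sketch
# with raw Selenium:
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def waitclick(self, xpath, timeout=10, silent=False):
    try:
        elt = WebDriverWait(self.driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, xpath)))
        elt.click()
        return True
    except Exception:
        if not silent:
            raise
        return False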
Example no. 12
    if not op.exists(fpath):
        if not minibardelivery_was_initialised:
            init_minibardelivery(driver)
            minibardelivery_was_initialised = True
        driver.get(url)
        # Get scroll height
        last_height = driver.driver.execute_script(
            "return document.body.scrollHeight")

        while True:
            # Scroll down to bottom
            driver.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            try:
                driver.waitclick(
                    '//*[contains(@class, "product-list-load-more")]',
                    timeout=6)
            except Exception:  # "load more" button absent; keep scrolling
                pass
            # Wait to load page
            sleep(2)

            # Calculate new scroll height and compare with last scroll height
            new_height = driver.driver.execute_script(
                "return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        driver.save_page(fpath)
    # Parsing
    "whisky": "https://www.lagrandeepicerie.com/fr/cave/spiritueux/whiskies/",
    "cognac": "https://www.lagrandeepicerie.com/fr/cave/spiritueux/cognac/",
    "vodka": "https://www.lagrandeepicerie.com/fr/cave/spiritueux/vodka/",
    "rum": "https://www.lagrandeepicerie.com/fr/cave/spiritueux/rhums/",
    "gin": "https://www.lagrandeepicerie.com/fr/cave/spiritueux/gin/",
}

# Category Scraping
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []

    fpath = fpath_namer(shop_id, 'ctg', ctg, 0)
    if not op.exists(fpath):
        driver.get(url)
        sleep(2)
        driver.waitclick('//*[@id="show-more-product"]', timeout=5)
        # driver.click_to_bottom('//*[@id="show-more-product"]')
        driver.save_page(fpath, scroll_to_bottom=True)
    tree = etree.parse(open(fpath, 'rb'), parser=parser)
    print(ctg, fpath)
    for li in tree.xpath('//*[@id="search-result-items"]/div'):
        produrl = li.xpath('.//a[@class="thumb-link"]/@href')[0]
    qs = parse_qs(urlsplit(produrl).query)
    produrl = qs['url'][0] if 'url' in qs else produrl
        produrl = clean_url(produrl, root_url)
        products[produrl] = {
            'pdct_name_on_eretailer':
            ' '.join(''.join(li.xpath(
                './/div[@class="product-name"]//text()')).split()).strip(),
            'raw_price':