from lxml import etree

parser = etree.HTMLParser()
from custom_browser import CustomDriver
import random
# Init variables and assets
SITE_DOMAIN = "www.leparisien.fr"
HOME_URL = 'https://www.leparisien.fr'

driver = CustomDriver(headless=False, firefox=True, download_images=True)

count = 0
driver.get(HOME_URL)

# Endless random walk over the site: on every page, collect the links that
# stay on SITE_DOMAIN, jump to one of them at random (links containing
# '.php' are preferred when any exist), and go back to the home page when
# the current page offers no on-site link at all.
while True:
    print('Looping', count)
    # NOTE(review): if driver.driver is a Selenium WebDriver, be aware that
    # find_elements_by_xpath was removed in Selenium 4.3 - confirm the
    # Selenium version this project pins.
    anchors = driver.driver.find_elements_by_xpath("//a[@href]")
    hrefs = [anchor.get_attribute('href') for anchor in anchors]

    # Skip empty/None hrefs so the substring tests below cannot raise.
    on_site = [href for href in hrefs if href and SITE_DOMAIN in href]
    php_links = [href for href in on_site if '.php' in href]

    # Same priority as before: '.php' links first, any on-site link second.
    candidates = php_links or on_site
    if not candidates:
        driver.get(HOME_URL)
        continue

    url = random.choice(candidates)
    count += 1
    print(count, url)
    driver.get(url)
                  'rum': 'https://www.argonautliquor.com/search/categories/Rum/result_size/96/page/{page}',
                  'liquor': 'https://www.argonautliquor.com/search/categories/Liqueur/result_size/96/page/{page}',
                  'brandy': 'https://www.argonautliquor.com/search/categories/Brandy/result_size/96/page/{page}',
                  'mezcal': 'https://www.argonautliquor.com/search/categories/Mezcal/result_size/96/page/{page}',
                  }

# Category Scraping
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    for p in range(100):
        urlp = url.format(page=p + 1)

        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        if not op.exists(fpath):
            driver.get(urlp)
            sleep(2)
            driver.save_page(fpath)
        tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)

        # r = requests.get(urlp)
        # tree = etree.parse(BytesIO(r.content), parser=parser)

        for li in tree.xpath('//div[@id="product-list"]//div[@class="grid-item"]'):
            produrl = li.xpath('.//a[@class="product-link"]/@href')[0]
            produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                urlsplit(produrl).query) else produrl
            produrl = clean_url(produrl, root_url)
            products[produrl] = {
                'pdct_name_on_eretailer': ' '.join(
                    ''.join(li.xpath('.//div[@class="product-name"]//text()')).split()).strip(),
    'red_wine': 'https://www.seijoishii.com/c/1283?&row_limit=50&page={page}',
    'bourbon': 'https://www.seijoishii.com/c/277?&row_limit=50&page={page}',
    'brandy': 'https://www.seijoishii.com/c/239?&row_limit=50&page={page}',
    # 'rum': '',
}

# Category scraping with selenium: walk the numbered pages of every category
# until a page brings no product that was not already collected.
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0

    for p in range(100):
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)

        # Pages already saved on disk are parsed as-is; only missing ones
        # are fetched (the URL placeholder is 1-based, hence p + 1).
        if not op.exists(fpath):
            driver.get(url.format(page=p + 1))
            sleep(2)
            driver.save_page(fpath, scroll_to_bottom=True)
        categories, products = ctg_parsing(fpath, ctg, categories, products)

        # Stop as soon as the number of distinct products stops growing.
        distinct_count = len(set(categories[ctg]))
        if distinct_count == number_of_pdcts_in_ctg:
            break
        number_of_pdcts_in_ctg = distinct_count
    print(ctg, url, p, len(categories[ctg]))

######################################
# # KW searches scraping #############
######################################

# KW searches Scraping - with requests - one page per search
    'https://iyec.omni7.jp/basic/42450?sort=recommend&displayCnt=80&startIndex={page}',
    # 'bourbon': '',#na
    # 'brandy': '',#na
    # 'rum': '',#na
}

# Category scraping with selenium: several pages per category, where the
# URL placeholder receives an offset that grows by 80 per page.
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0

    for p in range(100):
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)

        # Download the page only when no saved copy exists yet.
        if not op.exists(fpath):
            driver.get(url.format(page=p * 80))
            sleep(2)
            driver.save_page(fpath, scroll_to_bottom=True)
        categories, products = ctg_parsing(fpath, ctg, categories, products)

        # Compare the distinct-product count before and after this page;
        # an unchanged count means the listing is exhausted.
        number_of_pdcts_before = number_of_pdcts_in_ctg
        number_of_pdcts_in_ctg = len(set(categories[ctg]))
        if number_of_pdcts_in_ctg == number_of_pdcts_before:
            break
    print(ctg, url, p, len(categories[ctg]))

######################################
# # KW searches scraping #############
######################################

# KW searches Scraping - with requests - one page per search
        return price.named['pound'] * 100
    else:
        return price.named['pound'] * 100 + price.named['pence']


# Category Scraping
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    for page in range(10):

        fpath = fpath_namer(shop_id, 'ctg', ctg, page)
        if not op.exists(fpath):
            print(url.format(page=page + 1))
            driver.respawn()
            driver.get(url.format(page=page + 1))
            driver.save_page(fpath, scroll_to_bottom=True)

        tree = etree.parse(open(fpath, 'rb'), parser=parser)
        for li in tree.xpath('//li[@class="product-grid_item"]'):
            produrl = li.xpath(
                './/a[contains(@class, "product-card_link")]/@href')[0]
            produrl = parse_qs(
                urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                    urlsplit(produrl).query) else produrl
            produrl = clean_url(produrl, root_url)
            categories[ctg].append(produrl)
            products[produrl] = {
                'pdct_name_on_eretailer':
                ' '.join([
                    li.xpath('.//span[@class="product-card_brand"]//text()')
Exemple #6
0
    'rum':
    'https://www.auchandrive.fr/catalog/boissons-3686969/bieres-alcools-3686338/rhums-R3702929',
    'liquor':
    'https://www.auchandrive.fr/catalog/boissons-3686969/bieres-alcools-3686338/aperitifs-anises-R3702917',
}

# Category scraping: a single saved page (index 0) per category.
for ctg, url in urls_ctgs_dict.items():
    print('Beginning,', ctg, url)
    categories[ctg] = []
    fpath = fpath_namer(shop_id, 'ctg', ctg, 0)
    page_is_cached = op.exists(fpath)
    if not page_is_cached:
        # The one-off site initialisation is deferred until a page really
        # has to be downloaded, and is performed at most once.
        if not auchan_drive_was_initialised:
            init_auchan_drive(driver)
            auchan_drive_was_initialised = True
        driver.get(url)
        driver.smooth_scroll()
        driver.save_page(fpath, scroll_to_bottom=True)
    # Parse the saved page, whether freshly downloaded or already on disk.
    categories, products = ctg_parsing(fpath, ctg, categories, products)
    print(ctg, url, len(categories[ctg]))

######################################
# # KW searches scraping #############
######################################

# KW searches Scraping - with requests - one page per search
kw_search_url = "https://www.auchandrive.fr/recherche/{kw}"  # TODO : modify URL
for kw in keywords:
    searches[kw] = []
    fpath = fpath_namer(shop_id, 'search', kw, 0)
    if not op.exists(fpath):
Exemple #7
0

saucey_was_initialised = False

# Categories scraping
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    for p in range(10):
        print(ctg, p)
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        if not op.exists(fpath):
            if not saucey_was_initialised:
                init_saucey(driver)
                saucey_was_initialised = True
            driver.get(url.format(page=p * 60))
            driver.wait_for_xpath('//*[@itemtype="http://schema.org/Product"]',
                                  timeout=10)
            driver.smooth_scroll(sleep_time=0.3)
            driver.save_page(fpath, scroll_to_bottom=True)
        # Parsing
        tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
        for li in tree.xpath('//*[@itemtype="http://schema.org/Product"]'):
            produrl = "".join(li.xpath('.//a[@itemprop="url"]/@href'))
            produrl = parse_qs(
                urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                    urlsplit(produrl).query) else produrl
            produrl = clean_url(produrl, root_url)
            products[produrl] = {
                'pdct_name_on_eretailer':
                " ".join(''.join(
Exemple #8
0
# Several categories point at the very same listing page; the shared URLs
# are named once so the duplication is explicit.
_SPIRITS_URL = 'https://www.mondovino.ch/selections/spiritueux/Cfr'
_WHITE_WINE_URL = 'https://www.mondovino.ch/catalogue/typedevin/Vin+blanc'

# Category name -> listing URL to scrape for that category.
urls_ctgs_dict = {
    'champagne': 'https://www.mondovino.ch/catalogue/typedevin/Champagne',
    'vodka': _SPIRITS_URL,
    'cognac': _SPIRITS_URL,
    'whisky': _SPIRITS_URL,
    'still_wines': _WHITE_WINE_URL,
    'white_wine': _WHITE_WINE_URL,
    'red_wine': 'https://www.mondovino.ch/catalogue/typedevin/Vin+rouge',
}

# Category Scraping
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    fpath = fpath_namer(shop_id, 'ctg', ctg, 0)
    if not op.exists(fpath):
        driver.get(url)
        # Get scroll height
        last_height = driver.driver.execute_script(
            "return document.body.scrollHeight")
        while True:
            # Scroll down to bottom
            driver.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            try:
                driver.waitclick('//div[@class="mod_product_list__more"]/a',
                                 timeout=5,
                                 silent=True)
            except:
                pass
            # Wait to load page
            sleep(2)
Exemple #9
0
    # 'bourbon': '',#na
    'brandy':
    'http://www.kakuyasu.co.jp/ec/disp/CSfDispListPage_001.jsp?dispNo=001014&q=&j=&min=&max=&ys=&yl=&yoryotanni=&allSearch=&type=01&sort=01&page={page}',
    # 'rum': '',#na
}

# Category scraping with selenium: page through each category (1-based page
# number in the URL) until a page adds no new distinct product.
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0

    for p in range(100):
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)

        # Reuse the saved page when there is one; otherwise fetch and save.
        if not op.exists(fpath):
            driver.get(url.format(page=p + 1))
            sleep(2)
            driver.save_page(fpath, scroll_to_bottom=True)
        categories, products = ctg_parsing(fpath, ctg, categories, products)

        unique_total = len(set(categories[ctg]))
        if unique_total == number_of_pdcts_in_ctg:
            # Nothing new on this page: the category is complete.
            print("Finishing with :", unique_total, "products")
            break
        number_of_pdcts_in_ctg = unique_total
    print(ctg, url, p, len(categories[ctg]))

######################################
# # KW searches scraping #############
######################################
Exemple #10
0
    'white_wine':'https://www.abcfws.com/category/WINE/WHITE/pc/2/16.uts?currentIndex={start}&pageSize=48',
    'red_wine':'https://www.abcfws.com/category/WINE/RED/pc/2/3.uts?currentIndex={start}&pageSize=48',
    'gin':'https://www.abcfws.com/category/SPIRITS/GIN/pc/46/50.uts?currentIndex={start}&pageSize=48',
    'tequila':'https://www.abcfws.com/category/SPIRITS/TEQUILA/pc/46/59.uts?currentIndex={start}&pageSize=48',
    'rum':'https://www.abcfws.com/category/SPIRITS/RUM/pc/46/51.uts?currentIndex={start}&pageSize=48',
    'scotch':'https://www.abcfws.com/thumbnail/SPIRITS/WHISKEY/SCOTCH/pc/46/c/67/74.uts?currentIndex={start}&pageSize=48',
    'bourbon':'https://www.abcfws.com/thumbnail/SPIRITS/WHISKEY/BOURBON/pc/46/c/67/69.uts?currentIndex={start}&pageSize=48',
}

for ctg, url in categories_urls.items():
    categories[ctg] = []
    for p, start in enumerate(range(0, 1000, 48)):
        # r = requests.get(url.format(start=start))
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        if not op.exists(fpath):
            driver.get(url.format(start = start))
            sleep(2)
            driver.save_page(fpath, scroll_to_bottom=True)
        # tree = etree.parse(BytesIO(r.content), parser=parser)
        tree = etree.parse(open(fpath, 'rb'), parser=parser)
        articles = tree.xpath('//section[contains(@class, "productsList")]/div[@class="product"]')
        aurls = [a.xpath('.//div[@class="name"]/a/@href')[0] for a in articles]
        if not articles:
            break
        categories[ctg] += aurls
        for a in articles:
            data = {
                'url': a.xpath('.//div[@class="name"]/a/@href')[0],
                'pdct_name_on_eretailer': a.xpath('.//div[@class="name"]/a/text()')[0].strip(),
                'volume': a.xpath('.//div[@class="volume"]//text()')[0].strip(),
                'price': getprice(''.join(a.xpath('.//div[@class="price pl0"]/span/text()')).strip()),
Exemple #11
0
    'still_wines': 'https://www.b-21.com/searchprods.asp?searchstring=wine&pagenumber={page}&val=0',
    'red_wine': 'https://www.b-21.com/searchprods.asp?searchstring=red+wine&pagenumber={page}&val=0',
    'white_wine': 'https://www.b-21.com/searchprods.asp?searchstring=white+wine&pagenumber={page}&val=0',
    'tequila': 'https://www.b-21.com/searchprods.asp?searchstring=tequila&pagenumber={page}&val=0',
    'gin': 'https://www.b-21.com/searchprods.asp?searchstring=gin&pagenumber={page}&val=0',
    'rum': 'https://www.b-21.com/searchprods.asp?searchstring=rum&pagenumber={page}&val=0',
    'brandy': 'https://www.b-21.com/searchprods.asp?searchstring=brandy&pagenumber={page}&val=0',
}

for ctg, caturl in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    req_sent = False
    if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 1)):
        req_sent = True
        driver.get('https://www.b-21.com/')
        driver.text_input(ctg, '//input[@id="code"]', enter=True)
    for page in range(1, 100):
        url = caturl.format(page=page)
        fpath = fpath_namer(shop_id, 'ctg', ctg, page)
        if not op.exists(fpath) and req_sent:
            driver.smooth_scroll()
            driver.save_page(fpath, scroll_to_bottom=True)
        elif not op.exists(fpath) and not req_sent:
            break
        tree = etree.parse(open(fpath, 'rb'), parser=parser)
        for tr in tree.xpath('//div[contains(@class, "c data2")]/table[3]/tbody/tr'):
            if not tr.xpath('.//*[contains(@class, "prodstitle")]/@href'):
                continue
            produrl = tr.xpath('.//*[contains(@class, "prodstitle")]/@href')[0]
            produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(urlsplit(produrl).query) else produrl
Exemple #12
0
site_was_initialised = False

# Categories scraping
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    for p in range(100):
        print(ctg, p)
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        print(fpath)
        if not op.exists(fpath):
            # if not site_was_initialised:
            #     init_site(driver)
            #     site_was_initialised = True
            driver.get(url.format(page=p + 1))
            driver.save_page(fpath, scroll_to_bottom=True)

        # Parsing
        tree = etree.parse(open(fpath, 'rb'), parser=parser)
        for li in tree.xpath('//*[@class="product-list"]/div'):
            produrl = "".join(li.xpath('.//a[@class="rebl15"]/@href'))
            produrl = parse_qs(
                urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                    urlsplit(produrl).query) else produrl

            produrl = clean_url(produrl, root_url)
            products[produrl] = {
                'pdct_name_on_eretailer':
                li.xpath('.//a[@class="rebl15"]/text()')[0].strip(),
                'volume':
Exemple #13
0
    'bourbon':
    'https://www.liquorland.com.au/Spirits?facets=spiritproducttype%3dBourbon?show=200&page={page}',
    'liquor':
    'https://www.liquorland.com.au/Spirits?facets=spiritproducttype%3dImported+Liqueurs?show=200&page={page}',
    'tequila':
    'https://www.liquorland.com.au/Spirits?facets=spiritproducttype%3dTequila?show=200&page={page}',
}

# Category Scraping - with selenium - one page per category
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    for p in range(20):
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        if not op.exists(fpath):
            driver.get(url.format(page=p))
            sleep(2)
            driver.save_page(fpath, scroll_to_bottom=True)
        tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
        for li in tree.xpath('//ul[@class="productList"]/li'):
            produrl = li.xpath('.//div/h2/a/@href')[0]
            produrl = parse_qs(
                urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                    urlsplit(produrl).query) else produrl
            produrl = clean_url(produrl, root_url)
            products[produrl] = {
                'pdct_name_on_eretailer':
                "".join(li.xpath('.//div/h2//text()')).strip(),
                'raw_price':
                ''.join(
                    w for t in li.xpath('.//div[@class="valueLarge"]//text()')
Exemple #14
0
    # 'tequila': '',#no tequila
    # 'liquor': '',#no liquor
    # 'white_wine': 'https://www.aeondewine.com/shop/c/c060102/?l:inkid=aw69_avGM7kHb',
    # 'red_wine': 'https://www.aeondewine.com/shop/c/c060101/?linkid=aw69_Xl3132nk',
    # 'bourbon': '',#no bourbon
    # 'brandy': '',#no brandy
    # 'rum': '',#no rum
}

# Category Scraping - with selenium - multiple pages per category (click on next page)
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    print("Beginning ", ctg, url)
    if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)):
        driver.get(url)

    # If files exist, don't scrap
    perform_scrapping = not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0))
    for p in range(100):
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        if not op.exists(fpath) and perform_scrapping:
            sleep(2)
            driver.save_page(fpath, scroll_to_bottom=True)
        categories, products = ctg_parsing(fpath, ctg, categories, products)
        print(fpath, ctg, p, len(categories[ctg]))

        # Break or change pages
        if number_of_pdcts_in_ctg == len(categories[ctg]):
            print("Finished, because no more new products")
            break
    'https://www.goodygoody.com/Products/Products?searchTerm=&category=1AGN&type=0&orderBy=name&minprice=&maxprice=',
    'tequila':
    'https://www.goodygoody.com/Products/Products?searchTerm=&category=1ATQ&type=0&orderBy=name&minprice=&maxprice=',
    'rum':
    'https://www.goodygoody.com/Products/Products?searchTerm=&category=1ARM&type=0&orderBy=name&minprice=&maxprice=',
    'brandy':
    'https://www.goodygoody.com/Products/Products?searchTerm=&category=1ABR&type=0&orderBy=name&minprice=&maxprice=',
    'bourbon':
    'https://www.goodygoody.com/Products/Products?searchTerm=&category=1ABN&type=0&orderBy=name&minprice=&maxprice=',
}

# Categories scraping
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)):
        driver.get(url)

    for p in range(100):
        # Scraping
        urlp = url.format(page=p + 1)
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        if not op.exists(fpath):
            if not goodygoody_was_initialised:
                init_goodygoody(driver)
                goodygoody_was_initialised = True
            sleep(2)
            driver.save_page(fpath)
        # Parsing
        tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
        for li in tree.xpath(
                '//div[@class="row productRow"]//div[@class="row"]'):
    'red_wine': 'http://www.waitrosecellar.com/all-wines/wine-type/red-wine',
    'white_wine':
    'http://www.waitrosecellar.com/all-wines/wine-type/white-wine',
    'gin': 'http://www.waitrosecellar.com/gin',
    'rum': 'http://www.waitrosecellar.com/rum',
    'tequila': 'http://www.waitrosecellar.com/tequila',
    'liquor': 'http://www.waitrosecellar.com/liqueurs',
}

# Difficult case, where you should click a button to get on next page and send the request via the search bar
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)):
        # Getting back to root if search input box is not found
        driver.get(url)
    for p in range(100):
        # Storing and extracting infos
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        if not op.exists(fpath):
            driver.save_page(fpath)
            sleep(2)
        tree = etree.parse(open(fpath, 'rb'), parser=parser)
        for li in tree.xpath('//div[@class="productCard"]'):
            produrl = li.xpath('.//div[@class="productName"]/a/@href')[0]
            produrl = parse_qs(
                urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                    urlsplit(produrl).query) else produrl
            categories[ctg].append(produrl)
            products[produrl] = {
                'pdct_name_on_eretailer':
Exemple #17
0
    'http://shop.bevmo.com/search?format=varietal&lbc=bevmo&method=and&p=Q&ts=custom&uid=644456520&view=list&w=rum&af=varietal%3aliqueur&srt={page}',
}

# Categories scraping
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    for p in range(100):
        print(ctg, p)
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        if not op.exists(fpath):
            if not bevmo_was_initialised:
                init_bevmo(driver)
                bevmo_was_initialised = True
            print(url.format(page=32 * p))
            driver.get(url.format(page=32 * p))
            sleep(1)
            driver.save_page(fpath, scroll_to_bottom=True)

        # Parsing
        tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
        for li in tree.xpath(
                '//ul[contains(@class, "products")]//li[@class="item"]'):
            produrl = "".join(
                li.xpath('.//h2[contains(@class, "product-name")]/a/@href'))
            produrl = parse_qs(
                urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                    urlsplit(produrl).query) else produrl
            produrl = clean_url(produrl, root_url)
            products[produrl] = {
                'pdct_name_on_eretailer':
    if not pricestr:
        return ''
    price = parse('${pound:d}.{pence:d}', pricestr)
    if not price:
        price = parse('${th:d},{pound:d}.{pence:d}', pricestr)
        return price.named['th'] * 100000 + price.named['pound'] * 100 + price.named['pence']
    return price.named['pound'] * 100 + price.named['pence']


# Category Scraping - with selenium - multiple pages per category (click on next page)
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)):
        # Getting to ctg url
        driver.get(url)
    for p in range(100):
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        if not op.exists(fpath):
            driver.save_page(fpath)
            sleep(2)
        tree = etree.parse(open(fpath, 'rb'), parser=parser)
        for li in tree.xpath('//div[@class="col-main-content"]//ul/li'):
            produrl = li.xpath('.//h2[@class="product-name"]/a/@href')[0]
            produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(urlsplit(produrl).query) else produrl
            products[produrl] = {
                'pdct_name_on_eretailer': "".join(li.xpath('.//h2[@class="product-name"]//text()')),
                'raw_price': ''.join(w for t in li.xpath('.//span[@class="price"]/text()') for w in t.split()).strip(),
            }
            print(products[produrl], produrl)
            products[produrl]['price'] = getprice(products[produrl]['raw_price'])
Exemple #19
0
    'red_wine': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/wine/red_wine',
    'white_wine': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/wine/white_wine',
    'gin': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/spirits_and_liqueurs/gin',
    'tequila': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/spirits_and_liqueurs/tequila',
    'rum': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/spirits_and_liqueurs/rum',
    'liquor': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/spirits_and_liqueurs/liqueurs_and_aperitifs',
    'brandy': 'https://www.waitrose.com/ecom/shop/browse/groceries/beer_wine_and_spirits/spirits_and_liqueurs/brandy',
}

# Category Scraping - with selenium - one page per category
for ctg, url in urls_ctgs_dict.items():
    count = 1
    categories[ctg] = []
    fpath = fpath_namer(shop_id, 'ctg', ctg, 0)
    if not op.exists(fpath):
        driver.get(url)
        driver.waitclick('//*[@class="closeNoticeSomethingDifferentPopup"]', timeout=4)
        last_height = driver.driver.execute_script("return document.body.scrollHeight")
        while True:
            sleep(1)
            driver.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            driver.waitclick('//*[@data-actiontype="load"]', timeout=3)
            driver.waitclick('//*[@data-actiontype="load"]', timeout=0.5)
            new_height = driver.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        driver.save_page(fpath, scroll_to_bottom=True)
    tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
    for li in tree.xpath('//article[@data-test="product-pod"]'):
    "https://www.freshdirect.com/browse.jsp?pageType=browse&id=vin_spirits_liqueurs&pageSize=100&all=true&activePage=1&sortBy=Sort_PopularityUp&orderAsc=true&activeTab=product",
}

# Categories scraping
for ctg, url in urls_ctgs_dict.items():

    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    for p in range(100):
        print(ctg, p)
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        if not op.exists(fpath):
            # if not freshdirect_was_initialised:
            #     init_freshdirect(driver)
            #     freshdirect_was_initialised = True
            driver.get(url.format(page=p + 1))
            sleep(1)
            driver.save_page(fpath, scroll_to_bottom=True)

        # Parsing
        tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
        for li in tree.xpath(
                '//ul[contains(@class, "products transactional")]/li'):
            produrl = li.xpath(
                './/a[@class="portrait-item-image-link"]/@href')[0]
            produrl = parse_qs(
                urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                    urlsplit(produrl).query) else produrl
            produrl = clean_url(produrl, root_url)
            products[produrl] = {
                'pdct_name_on_eretailer':
Exemple #21
0
    'scotch':
    'https://www.abcfws.com/thumbnail/SPIRITS/WHISKEY/SCOTCH/pc/46/c/67/74.uts?currentIndex={start}&pageSize=48',
    'bourbon':
    'https://www.abcfws.com/thumbnail/SPIRITS/WHISKEY/BOURBON/pc/46/c/67/69.uts?currentIndex={start}&pageSize=48',
}

# Category scraping with selenium: the listing is paged by a start offset
# that advances in steps of 48; p is the matching page counter used for
# the name of the saved file.
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0

    for p, start in enumerate(range(0, 1000, 48)):
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)

        # Fetch and save the page only if it is not on disk yet.
        if not op.exists(fpath):
            driver.get(url.format(start=start))
            sleep(2)
            driver.save_page(fpath, scroll_to_bottom=True)
        categories, products = ctg_parsing(fpath, ctg, categories, products)

        # Keep paging while the distinct-product count is still growing.
        distinct_count = len(set(categories[ctg]))
        if distinct_count != number_of_pdcts_in_ctg:
            number_of_pdcts_in_ctg = distinct_count
            continue
        break
    print(ctg, url, p, len(categories[ctg]))

######################################
# # KW searches scraping #############
######################################

# KW searches Scraping - with requests - one page per search
    "brandy": 'https://www.bodeboca.com/destilados-licores/brandy?page={page}',
    "red_wine": 'https://www.bodeboca.com/vino/tinto?page={page}',
    "white_wine": 'https://www.bodeboca.com/vino/blanco?page={page}',
}

# Category Scraping
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    for p in range(20):
        urlp = url.format(page=p + 1)

        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        print(fpath, p, urlp)
        if not op.exists(fpath):
            driver.get(urlp)
            # driver.driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.7);")
            # print('sleeping')
            # sleep(10)
            # driver.waitclick('//*[contains(@class, "bb-modal-close-button")]', timeout=1, silent=False)
            driver.save_page(fpath, scroll_to_bottom=True)
        tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)

        for li in tree.xpath('//div[@id="venta-main"]/div'):
            produrl = li.xpath('.//a/@href')[0]
            produrl = parse_qs(
                urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                    urlsplit(produrl).query) else produrl
            produrl = clean_url(produrl, root_url)
            products[produrl] = {
                'pdct_name_on_eretailer':
Exemple #23
0
    "brandy":
    "https://www.nicks.com.au/store/spirits-liqueurs/other-brandy-eau-de-vie?limit=60&mode=grid&p={page}",
    "liquor":
    "https://www.nicks.com.au/store/spirits-liqueurs/liqueurs?limit=60&mode=grid&p={page}",
}

# Category Scraping
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0
    for p in range(100):
        urlp = url.format(page=p + 1)

        fpath = fpath_namer(shop_id, 'ctg', ctg, p)
        if not op.exists(fpath):
            driver.get(urlp)
            sleep(2)
            driver.save_page(fpath)
        tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)

        for li in tree.xpath('//div[@class="product item"]'):
            produrl = li.xpath(
                './/div[@class="productblock-title"]/a/@href')[0]
            produrl = parse_qs(
                urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                    urlsplit(produrl).query) else produrl
            produrl = clean_url(produrl, root_url)
            products[produrl] = {
                'pdct_name_on_eretailer':
                ' '.join(''.join(
                    li.xpath('.//div[@class="productblock-title"]/a//text()')).
Exemple #24
0
    'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/gin-340887-44#langId=44&storeId=10151&catalogId=10123&categoryId=340889&parent_category_rn=340854&top_category=340854&pageSize=108&orderBy=FAVOURITES_ONLY%7CSEQUENCING%7CTOP_SELLERS&searchTerm=&beginIndex=0',
    'rum':
    'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/rum-44#langId=44&storeId=10151&catalogId=10123&categoryId=340889&parent_category_rn=340854&top_category=340854&pageSize=108&orderBy=FAVOURITES_ONLY%7CSEQUENCING%7CTOP_SELLERS&searchTerm=&beginIndex=0',
    'tequila':
    'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/tequila-44#langId=44&storeId=10151&catalogId=10123&categoryId=340889&parent_category_rn=340854&top_category=340854&pageSize=108&orderBy=FAVOURITES_ONLY%7CSEQUENCING%7CTOP_SELLERS&searchTerm=&beginIndex=0',
    'liquor':
    'https://www.sainsburys.co.uk/shop/gb/groceries/beer-wine-and-spirits-/liqueurs---speciality-spirits#langId=44&storeId=10151&catalogId=10123&categoryId=340889&parent_category_rn=340854&top_category=340854&pageSize=108&orderBy=FAVOURITES_ONLY%7CSEQUENCING%7CTOP_SELLERS&searchTerm=&beginIndex=0',
}

# Categories scraping
for ctg, url in urls_ctgs_dict.items():
    print(ctg, url)
    categories[ctg] = []
    fpath = fpath_namer(shop_id, 'ctg', ctg, 0)
    if not op.exists(fpath):
        driver.get(url)
        sleep(1)
        driver.save_page(fpath, scroll_to_bottom=True)
    tree = etree.parse(BytesIO(open(fpath, 'rb').read()), parser=parser)
    for li in tree.xpath('//ul[@class="productLister gridView"]/li'):
        produrl = li.xpath('.//h3/a/@href')[0]
        produrl = parse_qs(
            urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(
                urlsplit(produrl).query) else produrl
        categories[ctg].append(produrl)
        products[produrl] = {
            'pdct_name_on_eretailer':
            " ".join("".join(
                li.xpath(
                    './/div[@class="productNameAndPromotions"]//h3//text()')).
                     split()),
Exemple #25
0
    # 'brandy': '',#na
    # 'rum': '',#na
}


# Category scraping with selenium: page through a category (1-based page
# number in the URL) until a page adds no new distinct product.
for ctg, url in urls_ctgs_dict.items():
    categories[ctg] = []
    number_of_pdcts_in_ctg = 0

    for p in range(100):
        fpath = fpath_namer(shop_id, 'ctg', ctg, p)

        # Only pages missing from disk are requested from the site.
        if not op.exists(fpath):
            page_url = url.format(page=p+1)
            print(page_url)
            driver.get(page_url, True)
            sleep(2)
            driver.save_page(fpath, scroll_to_bottom=True)
        categories, products = ctg_parsing(fpath, ctg, categories, products)

        distinct_count = len(set(categories[ctg]))
        if distinct_count == number_of_pdcts_in_ctg:
            break
        number_of_pdcts_in_ctg = distinct_count
    print(ctg, url, p, len(categories[ctg]))
    # NOTE(review): this unconditional break ends the outer loop after the
    # first category - it looks like a debugging leftover; confirm intent.
    break


######################################
# # KW searches scraping #############
######################################