# 'gin': '',#no gin # 'tequila': '',#no tequila # 'liquor': '',#no liquor # 'white_wine': 'https://www.aeondewine.com/shop/c/c060102/?l:inkid=aw69_avGM7kHb', # 'red_wine': 'https://www.aeondewine.com/shop/c/c060101/?linkid=aw69_Xl3132nk', # 'bourbon': '',#no bourbon # 'brandy': '',#no brandy # 'rum': '',#no rum } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 print("Beginning ", ctg, url) if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)): driver.get(url) # If files exist, don't scrap perform_scrapping = not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)) for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath) and perform_scrapping: sleep(2) driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) print(fpath, ctg, p, len(categories[ctg])) # Break or change pages if number_of_pdcts_in_ctg == len(categories[ctg]): print("Finished, because no more new products")
urls_ctgs_dict = { 'champagne': 'https://www.vicampo.de/weine/subart/Champagner', 'sparkling': 'https://www.vicampo.de/weine/subart/Sekt', 'still_wines': 'https://www.vicampo.de/weine/art/Wei%C3%9Fwein', 'white_wine': 'https://www.vicampo.de/weine/art/Wei%C3%9Fwein', 'red_wine': 'https://www.vicampo.de/weine/subart/Rotwein', } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)): driver.get(url) for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): sleep(2) driver.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") sleep(2) driver.waitclick('//button[@data-dismiss="modal"]', timeout=7) driver.save_page(fpath, scroll_to_bottom=False) tree = etree.parse(open(fpath, 'rb'), parser=parser) for li in tree.xpath('//article[@data-entity-type="product"]'): produrl = li.xpath('.//a[@data-ec-linklabel="Product Text"]/@href')[0] produrl = parse_qs(urlsplit(produrl).query)['url'][0] if 'url' in parse_qs(urlsplit(produrl).query) else produrl products[produrl] = { 'pdct_name_on_eretailer': " ".join("".join(li.xpath('.//h1//text()')).split()),
from create_csvs import create_csvs from custom_browser import CustomDriver from ers import COLLECTION_DATE, file_hash, img_path_namer from ers import all_keywords_fr as keywords, TEST_PAGES_FOLDER_PATH from ers import clean_xpathd_text from ers import fpath_namer, mh_brands, clean_url, headers from matcher import BrandMatcher from validators import validate_raw_files parser = etree.HTMLParser() # Init variables and assets shop_id = 'gourmondo' root_url = 'https://www.gourmondo.de' requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache')) country = 'DE' searches, categories, products = {}, {}, {} driver = CustomDriver(headless=True, download_images=True) def getprice(pricestr): pricestr = re.sub("[^0-9.€,]", "", pricestr) if pricestr.endswith('*'): pricestr = pricestr[:-1] if not pricestr: return price = parse('{pound:d}€', pricestr) if price: return price.named['pound'] * 100 price = parse('{pound:d},{pence:d}€', pricestr)
from validators import validate_raw_files from create_csvs import create_csvs from ers import all_keywords_usa as keywords, fpath_namer, mh_brands, clean_url, headers from matcher import BrandMatcher from ers import COLLECTION_DATE, file_hash, img_path_namer import shutil from custom_browser import CustomDriver from parse import parse import re # Init variables and assets shop_id = "pogos_wine_spirits" root_url = "https://www.pogoswine.com/" requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache')) country = "USA" searches, categories, products = {}, {}, {} # If necessary driver = CustomDriver(headless=True) def getprice(pricestr): if pricestr == '': return pricestr if pricestr.count('$') >= 2: pricestr = "$" + pricestr.split('$')[1] pricestr = re.sub("[^0-9.$]", "", pricestr) price = parse('${pound:d}.{pence:d}', pricestr)
import requests import requests_cache, imghdr from ers import all_keywords_es as keywords, fpath_namer, mh_brands, clean_url, headers from matcher import BrandMatcher from ers import COLLECTION_DATE, file_hash, img_path_namer import shutil from custom_browser import CustomDriver from parse import parse from validators import validate_raw_files from create_csvs import create_csvs # Init variables and assets shop_id = "bodeboca" root_url = "https://bodeboca.com" requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache')) country = "ES" searches, categories, products = {}, {}, {} # If necessary driver = CustomDriver(headless=False, download_images=True, firefox=True) def getprice(pricestr): if pricestr == '': return pricestr pricestr = re.sub("[^0-9,€]", "", pricestr) pricestr = pricestr.split('€')[0] + '€' price = parse('{pound:d},{pence:d}€', pricestr) if price is None: price = parse('{pence:d}p', pricestr)
parser = etree.HTMLParser()

from urllib.parse import quote_plus
import requests
import requests_cache, imghdr
from validators import validate_raw_files
from create_csvs import create_csvs
from ers import all_keywords_usa as keywords, fpath_namer, mh_brands, clean_url, headers
from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer
import shutil
import re

# Init variables and assets
shop_id = 'astor_wines'
root_url = 'http://www.astorwines.com'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = 'USA'
searches, categories, products = {}, {}, {}
from parse import parse


def getprice(pricestr):
    """Convert a USD price string (e.g. '$12.34') to integer cents.

    Returns None for empty/blank input. Raises AttributeError if neither
    pattern matches (unchanged behaviour of the fallback branch).
    """
    if not pricestr:
        return
    # Keep only digits, dots and dollar signs (commas are stripped here,
    # so '$1,234.56' normalises to '$1234.56' and matches the first pattern).
    pricestr = re.sub("[^0-9.$]", "", pricestr)
    price = parse('${pound:d}.{pence:d}', pricestr)
    if not price:
        # Defensive thousands form: '$t,ppp.cc' -> t*100000 + ppp*100 + cc cents.
        price = parse('${th:d},{pound:d}.{pence:d}', pricestr)
        return price.named['th'] * 100000 + price.named[
            'pound'] * 100 + price.named['pence']
    # BUG FIX: the original fell through and returned None when the simple
    # '$x.yy' pattern matched; return its value in cents (cf. the 'tower'
    # variant of this helper, which has this return).
    return price.named['pound'] * 100 + price.named['pence']
import requests_cache, imghdr from validators import validate_raw_files from create_csvs import create_csvs from ers import all_keywords_de as keywords, fpath_namer, mh_brands, clean_url, headers from matcher import BrandMatcher from ers import COLLECTION_DATE, file_hash, img_path_namer, TEST_PAGES_FOLDER_PATH import shutil from custom_browser import CustomDriver from ers import clean_xpathd_text # Init variables and assets shop_id = 'real' root_url = 'https://www.real.de' session = requests_cache.CachedSession(fpath_namer(shop_id, 'requests_cache')) country = 'DE' searches, categories, products = {}, {}, {} driver = CustomDriver(headless=True) brm = BrandMatcher() from parse import parse def getprice(pricestr): if not pricestr: return price = parse('{pound:d}', pricestr) if price: return price.named['pound'] * 100 price = parse('{pound:d},{pence:d}', pricestr)
from validators import validate_raw_files, check_products_detection
from create_csvs import create_csvs
from ers import all_keywords_usa as keywords, fpath_namer, mh_brands, clean_url, shop_inventory_lw_csv
from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer, TEST_PAGES_FOLDER_PATH
from custom_browser import CustomDriver
from parse import parse
from ers import clean_xpathd_text

# Init variables and assets
shop_id = 'astor_wines'
root_url = 'http://www.astorwines.com'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = 'USA'
searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True)
brm = BrandMatcher()


def getprice(pricestr):
    """Convert a USD price string (e.g. '$12.34') to integer cents.

    Returns None for empty/blank input.
    """
    if not pricestr:
        return
    # Keep only digits, dots and dollar signs; commas are dropped, so the
    # thousands fallback below is mostly defensive.
    pricestr = re.sub("[^0-9.$]", "", pricestr)
    price = parse('${pound:d}.{pence:d}', pricestr)
    if not price:
        price = parse('${th:d},{pound:d}.{pence:d}', pricestr)
        return price.named['th'] * 100000 + price.named['pound'] * 100 + price.named['pence']
    # BUG FIX: original returned None when the simple '$x.yy' pattern
    # matched; return its value in cents.
    return price.named['pound'] * 100 + price.named['pence']
from create_csvs import create_csvs
from custom_browser import CustomDriver
from ers import COLLECTION_DATE, file_hash, img_path_namer
from ers import all_keywords_fr as keywords, TEST_PAGES_FOLDER_PATH
from ers import clean_xpathd_text
from ers import fpath_namer, mh_brands, clean_url, headers
from matcher import BrandMatcher
from validators import validate_raw_files

parser = etree.HTMLParser()

# Shop-level assets and collection state
shop_id = 'auchan_drive'
root_url = 'https://www.auchandrive.fr'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = 'FR'
searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True, download_images=True)


def getprice(pricestr):
    """Parse a French price string such as '12,34€' into integer cents.

    Returns None for empty input or when the string does not match
    the 'euros,cents€' pattern.
    """
    if not pricestr:
        return None
    cleaned = re.sub("[^0-9,€]", "", pricestr)
    parsed = parse('{euro:d},{cent:d}€', cleaned)
    if parsed is None:
        return None
    return parsed.named['euro'] * 100 + parsed.named['cent']

###################
import requests_cache, imghdr
from validators import validate_raw_files
from create_csvs import create_csvs
from ers import all_keywords_aus as keywords, fpath_namer, mh_brands, clean_url, headers
from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer
import shutil
from parse import parse
from custom_browser import CustomDriver

# Init variables and assets
shop_id = 'goodygoody'
root_url = 'https://www.goodygoody.com/'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = 'AUS'
searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True, download_images=True)


def getprice(pricestr):
    """Convert a price string like '$12.34' (or '34p') to integer cents.

    Returns '' unchanged for empty input, and None when neither pattern
    matches.
    """
    if pricestr == '':
        return pricestr
    pricestr = pricestr.replace(',', '').strip()
    price = parse('${dol:d}.{pence:d}', pricestr)
    if price is None:
        price = parse('{pence:d}p', pricestr)
        # BUG FIX: original dereferenced price.named unconditionally and
        # raised AttributeError when both patterns failed.
        if price is None:
            return None
        return price.named['pence']
    else:
        return price.named['dol'] * 100 + price.named['pence']
# 'gin': '',#no gin # 'tequila': '',#no tequila # 'liquor': '',#no liquor # 'white_wine': 'https://www.aeondewine.com/shop/c/c060102/?l:inkid=aw69_avGM7kHb', # 'red_wine': 'https://www.aeondewine.com/shop/c/c060101/?linkid=aw69_Xl3132nk', # 'bourbon': '',#no bourbon # 'brandy': '',#no brandy # 'rum': '',#no rum } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 print("Beginning ", ctg, url) if not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)): driver.get(url) # If files exist, don't scrap perform_scrapping = not op.exists(fpath_namer(shop_id, 'ctg', ctg, 0)) for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath) and perform_scrapping: sleep(2) driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) print(fpath, ctg, p, len(categories[ctg])) # Break or change pages if number_of_pdcts_in_ctg == len(categories[ctg]): print("Finished, because no more new products")
'http://www.kakuyasu.co.jp/ec/disp/CSfDispListPage_001.jsp?dispNo=001005024&q=&j=&min=&max=&ys=&yl=&yoryotanni=&allSearch=&type=01&sort=01&page={page}', 'red_wine': 'http://www.kakuyasu.co.jp/ec/disp/CSfDispListPage_001.jsp?dispNo=001005001&q=&j=&min=&max=&ys=&yl=&yoryotanni=&allSearch=&type=01&sort=01&page={page}', # 'bourbon': '',#na 'brandy': 'http://www.kakuyasu.co.jp/ec/disp/CSfDispListPage_001.jsp?dispNo=001014&q=&j=&min=&max=&ys=&yl=&yoryotanni=&allSearch=&type=01&sort=01&page={page}', # 'rum': '',#na } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.get(url.format(page=p + 1)) sleep(2) driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) if len(set(categories[ctg])) == number_of_pdcts_in_ctg: print("Finishing with :", len(set(categories[ctg])), "products") break else: number_of_pdcts_in_ctg = len(set(categories[ctg])) print(ctg, url, p, len(categories[ctg])) ######################################
import requests_cache, imghdr
from validators import validate_raw_files
from create_csvs import create_csvs
from custom_browser import CustomDriver
from ers import all_keywords_usa as keywords, mh_brands, clean_url, headers
from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer, fpath_namer
import shutil

driver = CustomDriver(headless=False, download_images=True)

# Init variables and assets
shop_id = 'tower'
root_url = 'http://buckhead.towerwinespirits.com'
# POST responses must be cached too: the shop's listing pages are POST-driven.
session = requests_cache.core.CachedSession(
    fpath_namer(shop_id, 'requests_cache'), allowable_methods=('GET', 'POST'))
country = 'USA'
searches, categories, products = {}, {}, {}
from parse import parse


def getprice(pricestr):
    """Convert a USD price string ('$12.34' or '$1,234.56', possibly
    prefixed with 'Reg.') to integer cents.

    Returns None for empty input or when no pattern matches.
    """
    if not pricestr:
        return
    # Drop the 'Reg.' label and non-breaking spaces before parsing.
    pricestr = pricestr.replace("Reg.", "").replace("\xa0", "")
    price = parse('${pound:d}.{pence:d}', pricestr)
    if not price:
        price = parse('${th:d},{pound:d}.{pence:d}', pricestr)
        # BUG FIX: guard against both patterns failing, which previously
        # raised AttributeError on price.named.
        if not price:
            return None
        return price.named['th'] * 100000 + price.named['pound'] * 100 + price.named['pence']
    return price.named['pound'] * 100 + price.named['pence']
import requests_cache, imghdr from validators import validate_raw_files from create_csvs import create_csvs from ers import all_keywords_usa as keywords, fpath_namer, mh_brands, clean_url, headers from matcher import BrandMatcher from ers import COLLECTION_DATE, file_hash, img_path_namer import shutil from custom_browser import CustomDriver from parse import parse # Init variables and assets shop_id = 'twin_liquors' root_url = 'http://www.twinliquors.com' requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache')) country = 'USA' searches, categories, products = {}, {}, {} driver = CustomDriver(headless=True, download_images=False) urls_ctgs_dict = { 'champagne': 'http://twinliquors.com/shop/catalogsearch/result/?q=champagne', 'sparkling': 'http://twinliquors.com/shop/catalogsearch/result/?q=Sparkling+wine', 'still_wines': 'http://twinliquors.com/shop/wine.html', 'whisky': 'http://twinliquors.com/shop/catalogsearch/result/?q=whisky', 'cognac': 'http://twinliquors.com/shop/catalogsearch/result/?q=cognac', 'vodka': 'http://twinliquors.com/shop/catalogsearch/result/?q=vodka', 'red_wine': 'http://twinliquors.com/shop/catalogsearch/result/?q=red+wine', 'white_wine': 'http://twinliquors.com/shop/catalogsearch/result/?q=red+wine', 'tequila': 'http://twinliquors.com/shop/catalogsearch/result/?q=tequila', 'gin': 'http://twinliquors.com/shop/catalogsearch/result/?q=gin',
'https://lohaco.jp/g3/71-5107-5110002/?resultCount=100&page={page}', 'red_wine': 'https://lohaco.jp/g3/71-5107-5107002/?resultCount=100&page={page}', # 'bourbon': '',#na 'brandy': 'https://lohaco.jp/g3/71-5111-5110009/?resultCount=100&page={page}', 'rum': 'https://lohaco.jp/g3/71-5111-5110012/?resultCount=100&page={page}', } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.get(url.format(page=p + 1)) sleep(2) driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) if len(set(categories[ctg])) == number_of_pdcts_in_ctg: break else: number_of_pdcts_in_ctg = len(set(categories[ctg])) print(ctg, url, p, len(categories[ctg])) ###################################### # # KW searches scrapping ############
if 'pdct_img_main_url' in pdt and pdt[ 'pdct_img_main_url'] and brm.find_brand( pdt['pdct_name_on_eretailer'])['brand'] in mh_brands: print(pdt['pdct_name_on_eretailer'] + "." + pdt['pdct_img_main_url'].split('.')[-1]) response = requests.get(pdt['pdct_img_main_url'], stream=True, verify=False, headers=headers) # response.raw.decode_content = True tmp_file_path = '/tmp/' + shop_id + 'mhers_tmp_{}.imgtype'.format( abs(hash(pdt['pdct_img_main_url']))) img_path = img_path_namer(shop_id, pdt['pdct_name_on_eretailer']) with open(tmp_file_path, 'wb') as out_file: shutil.copyfileobj(response.raw, out_file) if imghdr.what(tmp_file_path) is not None: img_path = img_path.split('.')[0] + '.' + imghdr.what( '/tmp/' + shop_id + 'mhers_tmp_{}.imgtype'.format( abs(hash(pdt['pdct_img_main_url'])))) shutil.copyfile( '/tmp/' + shop_id + 'mhers_tmp_{}.imgtype'.format( abs(hash(pdt['pdct_img_main_url']))), img_path) products[url].update({ 'img_path': img_path, 'img_hash': file_hash(img_path) }) create_csvs(products, categories, searches, shop_id, fpath_namer(shop_id, 'raw_csv'), COLLECTION_DATE) validate_raw_files(fpath_namer(shop_id, 'raw_csv'))
from validators import validate_raw_files
from create_csvs import create_csvs
import requests_cache, imghdr
from ers import all_keywords_de as keywords, mh_brands, clean_url
from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer, fpath_namer
import requests
import shutil
from helpers.random_user_agent import randomua
from parse import parse

# Init variables and assets
shop_id = 'rewe'
root_url = 'https://shop.rewe.de'
session = requests_cache.CachedSession(fpath_namer(shop_id, 'requests_cache'))
# Randomised UA to avoid trivial bot blocking.
session.headers = {'User-Agent': randomua()}
country = 'DE'
searches, categories, products = {}, {}, {}


def getprice(pricestr):
    """Convert a German price string ('12,34€' or '12€') to integer cents.

    Empty/None input is returned unchanged; unparseable input yields None.
    """
    # BUG FIX: guard falsy input before .replace — the original crashed
    # with AttributeError when passed None ('' is still returned as-is).
    if not pricestr:
        return pricestr
    pricestr = pricestr.replace(' ', '')
    if pricestr == '':
        return pricestr
    price = parse('{dol:d},{pence:d}€', pricestr)
    if price is None:
        price = parse('{dol:d}€', pricestr)
        # BUG FIX: both patterns can fail; previously raised AttributeError.
        if price is None:
            return None
        return price.named['dol'] * 100
    else:
        return price.named['dol'] * 100 + price.named['pence']
from create_csvs import create_csvs
from custom_browser import CustomDriver
from ers import COLLECTION_DATE, file_hash, img_path_namer, TEST_PAGES_FOLDER_PATH, shop_inventory_lw_csv
from ers import all_keywords_fr as keywords
from ers import clean_xpathd_text
from ers import fpath_namer, mh_brands, clean_url
from matcher import BrandMatcher
from validators import validate_raw_files, check_products_detection

parser = etree.HTMLParser()

# Shop-level assets and collection state
shop_id = 'auchan_drive'
root_url = 'https://www.auchandrive.fr'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = 'FR'
searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True, download_images=True)
brm = BrandMatcher()


def getprice(pricestr):
    """Parse a French price string such as '12,34€' into integer cents.

    Returns None for empty input or when the 'euros,cents€' pattern
    does not match.
    """
    if not pricestr:
        return None
    cleaned = re.sub("[^0-9,€]", "", pricestr)
    parsed = parse('{euro:d},{cent:d}€', cleaned)
    if parsed is None:
        return None
    return parsed.named['euro'] * 100 + parsed.named['cent']
import requests_cache, imghdr from validators import validate_raw_files from create_csvs import create_csvs from ers import all_keywords_usa as keywords, fpath_namer, mh_brands, clean_url, headers from matcher import BrandMatcher from ers import COLLECTION_DATE, file_hash, img_path_namer import shutil from custom_browser import CustomDriver from parse import parse # Init variables and assets shop_id = "wallys" root_url = "http://www.wallywine.com/" requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache')) country = "USA" searches, categories, products = {}, {}, {} # If necessary driver = CustomDriver(headless=True) def getprice(pricestr): if pricestr == '': return pricestr pricestr = re.sub("[^0-9.$]", "", pricestr) price = parse('${pound:d}.{pence:d}', pricestr) if price is None: price = parse('{pence:d}p', pricestr)
'liquor': 'https://www.seijoishii.com/c/242?&row_limit=50&page={page}', 'white_wine': 'https://www.seijoishii.com/c/1284?&row_limit=50&page={page}', 'red_wine': 'https://www.seijoishii.com/c/1283?&row_limit=50&page={page}', 'bourbon': 'https://www.seijoishii.com/c/277?&row_limit=50&page={page}', 'brandy': 'https://www.seijoishii.com/c/239?&row_limit=50&page={page}', # 'rum': '', } # Category Scraping - with selenium - multiple pages per category (click on next page) for ctg, url in urls_ctgs_dict.items(): categories[ctg] = [] number_of_pdcts_in_ctg = 0 for p in range(100): fpath = fpath_namer(shop_id, 'ctg', ctg, p) if not op.exists(fpath): driver.get(url.format(page=p + 1)) sleep(2) driver.save_page(fpath, scroll_to_bottom=True) categories, products = ctg_parsing(fpath, ctg, categories, products) if len(set(categories[ctg])) == number_of_pdcts_in_ctg: break else: number_of_pdcts_in_ctg = len(set(categories[ctg])) print(ctg, url, p, len(categories[ctg])) ###################################### # # KW searches scrapping ############
from create_csvs import create_csvs from ers import all_keywords_aus as keywords, mh_brands, headers from matcher import BrandMatcher from ers import COLLECTION_DATE, file_hash, img_path_namer, fpath_namer import shutil from helpers.random_user_agent import randomua import requests from custom_browser import CustomDriver from time import sleep # Init variables and assets shop_id = 'my_bottle_shop' root_url = 'https://www.mybottleshop.com.au' session = requests_cache.CachedSession(fpath_namer(shop_id, 'requests_cache')) session.headers = {'User-Agent': randomua()} driver = CustomDriver(headless=False, download_images=True) with session.cache_disabled(): session.get('https://www.mybottleshop.com.au/directory/currency/switch/currency/AUD/uenc/') # print(session.cookies) country = 'AUS' searches, categories, products = {}, {}, {} from parse import parse def getprice(pricestr): if not pricestr: return price = parse('{pound:d}', pricestr) if price:
import requests_cache, imghdr
from parse import parse
from validators import validate_raw_files
from create_csvs import create_csvs
from ers import all_keywords_uk as keywords
from ers import fpath_namer, mh_brands, clean_url, headers
from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer
import shutil
from custom_browser import CustomDriver

# Init variables and assets
shop_id = 'sainsbury'
root_url = 'https://www.sainsburys.co.uk'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = 'UK'
searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True, download_images=False)


def getprice(pricestr):
    """Convert a GBP price string ('£12.34' or '34p') to integer pence.

    Returns '' unchanged for empty input, and None when neither pattern
    matches.
    """
    if pricestr == '':
        return pricestr
    pricestr = pricestr.replace(',', '').strip()
    price = parse('£{pound:d}.{pence:d}', pricestr)
    if price is None:
        price = parse('{pence:d}p', pricestr)
        # BUG FIX: original dereferenced price.named unconditionally and
        # raised AttributeError when both patterns failed.
        if price is None:
            return None
        return price.named['pence']
    else:
        return price.named['pound'] * 100 + price.named['pence']
import requests_cache
from validators import validate_raw_files, check_products_detection
from create_csvs import create_csvs
from ers import all_keywords_usa as keywords, fpath_namer, mh_brands, clean_url, shop_inventory_lw_csv
from matcher import BrandMatcher
from ers import COLLECTION_DATE, file_hash, img_path_namer, TEST_PAGES_FOLDER_PATH
from custom_browser import CustomDriver
from parse import parse
from ers import clean_xpathd_text

# Init variables and assets
shop_id = 'goodygoody'
root_url = 'https://www.goodygoody.com/'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
# NOTE(review): country is 'AUS' but keywords are all_keywords_usa and the
# shop is US — looks copy-pasted; confirm which market this collector serves.
country = 'AUS'
searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True)
brm = BrandMatcher()


def getprice(pricestr):
    """Convert a price string like '$12.34' (or '34p') to integer cents.

    Returns '' unchanged for empty input, and None when neither pattern
    matches.
    """
    if pricestr == '':
        return pricestr
    pricestr = pricestr.replace(',', '').strip()
    price = parse('${dol:d}.{pence:d}', pricestr)
    if price is None:
        price = parse('{pence:d}p', pricestr)
        # BUG FIX: guard the fallback pattern to avoid AttributeError on
        # totally unparseable input.
        if price is None:
            return None
        return price.named['pence']
    # BUG FIX: original returned None when the '$x.yy' pattern matched;
    # return its value in cents (cf. the other goodygoody variant of this
    # helper, which has this branch).
    return price.named['dol'] * 100 + price.named['pence']