class PowerSpider(scrapy.Spider):
    """Scrape product prices from power.dk.

    Drives a headless Selenium Chrome browser to type each product model
    number into the site's search box, follows the first matching result
    and extracts the price from the product page.
    """

    name = 'power_prices'
    allowed_domains = ['www.power.dk']
    start_urls = ['https://www.power.dk']

    # Headless Chrome used to operate the JavaScript search box.
    options = webdriver.ChromeOptions()
    WINDOW_SIZE = "1920,1080"
    options.add_argument("--headless")
    options.add_argument("--window-size=%s" % WINDOW_SIZE)
    # NOTE(review): executable_path/chrome_options are deprecated in
    # Selenium 4 — confirm the pinned Selenium version.
    driver = webdriver.Chrome(executable_path=r"chromedriver.exe",
                              chrome_options=options)

    products = lf.load_names()

    def start_requests(self):
        """Issue one request per start URL, routed to the search-page parser."""
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse_search_page)

    def parse_search_page(self, response):
        """Search every product via Selenium and yield a request per hit."""
        self.driver.get(response.url)
        for product in self.products:
            search_form = self.driver.find_element_by_id('search-input')
            search_form.send_keys(product)
            time.sleep(1.0)  # give the result pane time to render
            search_button = self.driver.find_element_by_id('search-button')
            search_button.click()
            time.sleep(1.0)
            try:
                link_container = self.driver.find_element_by_xpath(
                    '//div[@class="product product-element angi product-4col"]'
                )
                link_ext = str(link_container.get_attribute("data-gtmurl"))
                link_all = self.start_urls[0] + link_ext
                yield scrapy.Request(url=link_all,
                                     callback=self.parse_product_page)
            except NoSuchElementException:
                # No search result for this product; try the next one.
                continue
        self.driver.close()

    def parse_product_page(self, response):
        """Yield a Product item for the model number found in the URL.

        Fixes: the URL is lower-cased before matching (model numbers are
        upper-case), and an unmatched URL no longer raises NameError on
        the previously-unbound ``product_match``.
        """
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        req_url = response.request.url.lower()
        product_match = None
        for product in self.products:
            if product.lower() in req_url:
                product_match = product
        if product_match is None:
            self.logger.warning("No known product in URL %s",
                                response.request.url)
            return
        p = ProductLoader(item=Product(), response=response)
        # NOTE(review): trailing space in "prices-container " mirrors the
        # site's actual class attribute — confirm before "fixing".
        p.add_xpath('price', '//div[@class="prices-container "]//span/text()')
        p.add_value('date', date)
        p.add_value('retailer', "Power")
        p.add_value('product', product_match)
        yield p.load_item()
class WhiteawaySpider(scrapy.Spider):
    """Scrape prices and raw page HTML from whiteaway.com via its search URL."""

    name = 'whiteaway_prices'
    allowed_domains = ['whiteaway.com']
    products = lf.load_names()
    url_base = 'https://www.whiteaway.com'
    url_before = 'https://www.whiteaway.com/search_result/?keywords='
    url_end = '#/'

    def start_requests(self):
        """Issue one search request per product model number."""
        for product in self.products:
            url = self.url_before + product + self.url_end
            yield scrapy.Request(url=url, callback=self.parse_search_page)

    def parse_search_page(self, response):
        """Follow the first search hit, or report the product as not found."""
        search_container = response.xpath('//a[@class="srp__product-link"]')
        if len(search_container) > 0:
            url_add = search_container.xpath('@href').extract_first()
            url = self.url_base + url_add
            yield scrapy.Request(url=url, callback=self.parse_product_page)
        else:
            # Fix: product_match was unbound (NameError) when no model
            # number occurred in the URL; also match case-insensitively.
            product_match = None
            req_url = response.request.url.lower()
            for product in self.products:
                if product.lower() in req_url:
                    product_match = product
            print(' ')
            print('------------------------------------')
            print(product_match, " not found")
            print('------------------------------------')
            print(' ')

    def parse_product_page(self, response):
        """Yield a Product item plus a Website item archiving the raw HTML."""
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        product_match = 'unknown'  # fix: was misspelled 'unkown'
        req_url = response.request.url.lower()
        for product in self.products:
            if product.lower() in req_url:
                product_match = product
        p = ProductLoader(item=Product(), response=response)
        p.add_value('product', product_match)
        p.add_xpath('price', '//div[@class="vip__price-box-price"]/text()')
        p.add_value('date', date)
        p.add_value('retailer', "Whiteaway")
        yield p.load_item()
        w = WebsiteLoader(item=Website(), response=response)
        w.add_value('html', response.body)
        w.add_value('date', date)
        w.add_value('product', product_match)
        w.add_value('retailer', "Whiteaway")
        yield w.load_item()
class WhiteawayLinksSpider(scrapy.Spider):
    """Collect whiteaway.com product-page links by driving the site search
    box with Selenium and yielding one Link item per first hit."""

    name = 'whiteaway_links'
    allowed_domains = ['whiteaway.com']
    start_urls = ['https://www.whiteaway.com/']

    products = lf.load_names()
    product_url = [product.lower() for product in products]

    def __init__(self):
        super().__init__()  # fix: Spider.__init__ was never invoked
        options = webdriver.ChromeOptions()
        options.add_argument("--start-maximized")
        # NOTE(review): executable_path/chrome_options are deprecated in
        # Selenium 4 — confirm the pinned Selenium version.
        self.driver = webdriver.Chrome(executable_path=r"chromedriver.exe",
                                       chrome_options=options)
        self.products = lf.load_names()

    def parse(self, response):
        """Type each model number into the search box and record the link
        of the first result; print a notice when nothing is found."""
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        self.driver.get(response.url)
        for product in self.products:
            search_form = self.driver.find_element_by_class_name(
                "search__input")
            search_form.send_keys(product)
            time.sleep(1)  # let the result pane render
            search_button = self.driver.find_element_by_class_name(
                "search__submit")
            search_button.click()
            time.sleep(1)
            try:
                link_container = self.driver.find_element_by_xpath(
                    '//div[@class="srp-product-box__image-section list"]//a')
                link = str(link_container.get_attribute("href"))
                # Fix: Selenium's get_attribute("href") returns the resolved
                # absolute URL, so unconditionally prepending the base
                # produced a doubled/malformed URL.
                if link.startswith('http'):
                    product_link = link
                else:
                    product_link = 'https://www.whiteaway.com' + link
                l = LinkLoader(item=Link())
                l.add_value('link', product_link)
                l.add_value('product', product)
                l.add_value('date', date)
                l.add_value('retailer', 'Whiteaway')
                yield l.load_item()
            except NoSuchElementException:
                print(" ")
                print("-----------------------------")
                print("Element does not exist")
                print("-----------------------------")
                print(" ")
        self.driver.close()
class WuptiSpider(CrawlSpider):
    """Crawl wupti.com fridge/freezer categories and scrape product prices.

    Fix: the class previously extended ``scrapy.Spider``, on which the
    ``rules`` attribute is ignored — link-extraction rules only work on
    ``CrawlSpider`` — so ``parse_item`` was never called.
    """

    name = 'wupti_prices'
    allowed_domains = ['wupti.com', 'wupti.dk']
    start_urls = ['https://www.wupti.com/produkter/hvidevarer/koel-og-frys/']

    products = lf.load_names()
    product_url = [product.lower() for product in products]

    rules = (
        # Product pages: URL contains one of the lower-cased model numbers.
        Rule(LinkExtractor(allow=(product_url)), callback='parse_item'),
        # Category pages: keep crawling anything with "koele" in the URL.
        Rule(LinkExtractor(allow=r"koele")),
    )

    def parse_item(self, response):
        """Yield a Product item (preferring the discount price when the
        discount container is present) plus a Website item with raw HTML."""
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        product_match = None  # fix: was unbound when no model matched
        req_url = response.request.url.lower()
        for product in self.products:
            if product.lower() in req_url:
                product_match = product
        if product_match is None:
            return
        p = ProductLoader(item=Product(), response=response)
        p.add_value('product', product_match)
        normal_div = '//div[@class="seller seller-first"]//div[@class="productPrice nobefore"]//strong/text()'
        discount_div = '//div[@class="seller seller-first"]//div[@class="productPrice"]//strong/text()'
        normal_container = response.xpath(normal_div)
        discount_container = response.xpath(discount_div)
        if len(discount_container) > 0:
            price_div = discount_div
        else:
            price_div = normal_div
        print('============================')
        print(normal_container)
        print(discount_container)
        print('============================')
        p.add_xpath('price', price_div)
        p.add_value('date', date)
        p.add_value('retailer', "Wupti")
        yield p.load_item()
        w = WebsiteLoader(item=Website(), response=response)
        w.add_value('html', response.body)
        w.add_value('date', date)
        w.add_value('product', product_match)
        w.add_value('retailer', "Wupti")
        yield w.load_item()
class Punkt1LinksSpider(scrapy.Spider):
    """Collect punkt1.dk product-page links via the site's search box
    using a headless Selenium Chrome driver."""

    name = 'punkt1_links'
    # Fix: allowed_domains said 'punkt1.com' while every URL is punkt1.dk.
    allowed_domains = ['punkt1.dk']
    start_urls = ['https://www.punkt1.dk/']

    # Headless Chrome used to operate the JavaScript search box.
    options = webdriver.ChromeOptions()
    WINDOW_SIZE = "1920,1080"
    options.add_argument("--headless")
    options.add_argument("--window-size=%s" % WINDOW_SIZE)
    # NOTE(review): executable_path/chrome_options are deprecated in
    # Selenium 4 — confirm the pinned Selenium version.
    driver = webdriver.Chrome(executable_path=r"chromedriver.exe",
                              chrome_options=options)

    products = lf.load_names()

    def parse(self, response):
        """Search each model number and yield a Link item for the first hit."""
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        self.driver.get(response.url)
        for product in self.products:
            search_form = self.driver.find_element_by_id('search-input')
            search_form.send_keys(product)
            time.sleep(1.0)  # let the result pane render
            search_button = self.driver.find_element_by_id('search-button')
            search_button.click()
            time.sleep(1.0)
            try:
                link_container = self.driver.find_element_by_xpath(
                    '//div[@class="product product-element angi product-4col"]'
                )
                link = str(link_container.get_attribute("data-gtmurl"))
                product_link = 'https://www.punkt1.dk' + link
                l = LinkLoader(item=Link())
                l.add_value('link', product_link)
                l.add_value('product', product)
                l.add_value('date', date)
                l.add_value('retailer', 'Punkt1')
                yield l.load_item()
            except NoSuchElementException:
                print(" ")
                print("-----------------------------")
                print("Element does not exist")
                print("-----------------------------")
                print(" ")
        self.driver.close()
class SkousenSpider(CrawlSpider):
    """Crawl skousen.dk fridge/freezer categories and scrape product prices."""

    name = 'skousen_prices'
    allowed_domains = ['skousen.dk']
    start_urls = ['https://www.skousen.dk/hvidevarer/koele-fryseskab/']

    products = lf.load_names()
    product_url = [product.lower() for product in products]

    rules = (
        # Product pages: URL contains one of the lower-cased model numbers.
        Rule(LinkExtractor(allow=(product_url)), callback='parse_item'),
        # Category pages: keep crawling anything with "koele" in the URL.
        Rule(LinkExtractor(allow=r"koele")),
    )

    def parse_item(self, response):
        """Yield a Product item (preferring the campaign/discount price when
        present) plus a Website item archiving the raw HTML."""
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        product_match = None  # fix: was unbound when no model matched
        req_url = response.request.url.lower()
        for product in self.products:
            if product.lower() in req_url:
                product_match = product
        if product_match is None:
            return
        p = ProductLoader(item=Product(), response=response)
        p.add_value('product', product_match)
        normal_div = '//div[@class="vip__price-box-price"]/text()'
        discount_div = '//div[@class="campaign-banner__info__right"]//span[@class="top"]/text()'
        normal_container = response.xpath(normal_div)
        discount_container = response.xpath(discount_div)
        if len(discount_container) > 0:
            price_div = discount_div
        else:
            price_div = normal_div
        print('============================')
        print(normal_container)
        print(discount_container)
        print('============================')
        p.add_xpath('price', price_div)
        p.add_value('date', date)
        p.add_value('retailer', "Skousen")
        yield p.load_item()
        w = WebsiteLoader(item=Website(), response=response)
        w.add_value('html', response.body)
        w.add_value('date', date)
        w.add_value('product', product_match)
        w.add_value('retailer', "Skousen")
        yield w.load_item()
class Punkt1Spider(scrapy.Spider):
    """Scrape a price from a single hard-coded punkt1.dk product page."""

    name = 'punkt1_prices_old'
    # Fix: allowed_domains said 'punkt1.com' while every URL is on
    # punkt1.dk, so the offsite middleware would filter follow-up requests.
    allowed_domains = ['punkt1.dk']
    start_urls = [
        'https://www.punkt1.dk/hvidevarer/koel-frys-og-koelefryseskabe/koelefryseskabe/bosch-kge36bw40/p-552643/'
    ]

    products = lf.load_names()
    product_url = [product.lower() for product in products]

    # NOTE(review): rules have no effect on scrapy.Spider — only
    # CrawlSpider processes them — kept here for reference only.
    rules = (
        Rule(LinkExtractor(allow=(product_url)), callback='parse_item'),
        Rule(LinkExtractor(allow=r"koele")),
    )

    def parse(self, response):
        """Yield a Product item for the model number found in the URL."""
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        product_match = None  # fix: was unbound when no model matched
        req_url = response.request.url.lower()
        for product in self.products:
            if product.lower() in req_url:
                product_match = product
        if product_match is None:
            return
        p = ProductLoader(item=Product(), response=response)
        p.add_value('product', product_match)
        # NOTE(review): normal and discount XPaths are identical here —
        # confirm whether a distinct discount selector was intended.
        normal_div = '//span[@class="product-price-tag"]/text()'
        discount_div = '//span[@class="product-price-tag"]/text()'
        normal_container = response.xpath(normal_div)
        discount_container = response.xpath(discount_div)
        if len(discount_container) > 0:
            price_div = discount_div
        else:
            price_div = normal_div
        print('============================')
        print(normal_container)
        print(discount_container)
        print('============================')
        p.add_xpath('price', price_div)
        p.add_value('date', date)
        p.add_value('retailer', "Punkt1")
        yield p.load_item()
class ElgigantenSpider(scrapy.Spider):
    """Scrape prices and raw HTML from elgiganten.dk via its search endpoint."""

    name = 'elgiganten_prices'
    # Fix: allowed_domains must hold bare domain names; the original entry
    # 'https://www.elgiganten.dk' is a URL, which Scrapy warns about and
    # cannot match against request hosts.
    allowed_domains = ['elgiganten.dk']
    products = lf.load_names()
    url_base = 'https://www.elgiganten.dk'
    url_before = 'https://www.elgiganten.dk/search?SearchTerm='
    url_end = '&search=&searchResultTab='

    def start_requests(self):
        """Issue one search request per product model number."""
        for product in self.products:
            url = self.url_before + product + self.url_end
            yield scrapy.Request(url=url, callback=self.parse_product_page)

    def parse_product_page(self, response):
        """Yield a Product item plus a Website item archiving the raw HTML.

        Fixes: the URL is lower-cased before matching (the search URL
        embeds the model number in its original, typically upper, case),
        and an unmatched URL no longer raises NameError on the
        previously-unbound ``product_match``.
        """
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        product_match = None
        req_url = response.request.url.lower()
        for product in self.products:
            if product.lower() in req_url:
                product_match = product
        if product_match is None:
            return
        p = ProductLoader(item=Product(), response=response)
        p.add_value('product', product_match)
        p.add_xpath(
            'price',
            '//div[@class="product-price-container"]//span/text()')
        p.add_value('date', date)
        p.add_value('retailer', "Elgiganten")
        yield p.load_item()
        w = WebsiteLoader(item=Website(), response=response)
        w.add_value('html', response.body)
        w.add_value('date', date)
        w.add_value('product', product_match)
        w.add_value('retailer', "Elgiganten")
        yield w.load_item()
def __init__(self):
    """Load the product list and the Bilka search-URL fragments."""
    # Product model numbers to search for (loaded from the project's
    # name file — presumably one model number per entry; verify in lf).
    self.products = lf.load_names()
    self.url_base = 'https://hvidevarer.bilka.dk'
    self.url_before = 'https://hvidevarer.bilka.dk/search?searchText='
    self.url_end = ''
def __init__(self):
    """Start a maximised (non-headless) Chrome driver and load the
    product name list."""
    chrome_opts = webdriver.ChromeOptions()
    chrome_opts.add_argument("--start-maximized")
    # NOTE(review): executable_path/chrome_options are deprecated in
    # Selenium 4 — confirm the pinned Selenium version before upgrading.
    self.driver = webdriver.Chrome(executable_path=r"chromedriver.exe",
                                   chrome_options=chrome_opts)
    self.products = lf.load_names()
def __init__(self):
    """Load product names and the Elgiganten category base URL."""
    self.products = lf.load_names()
    self.base_url = ('https://www.elgiganten.dk/product/'
                     'hvidevarer/kolefryseskab/')
class SkousenSpider(scrapy.Spider):
    """Scrape prices and raw page HTML from skousen.dk via its search URL."""

    name = 'skousen_prices'
    allowed_domains = ['skousen.dk']

    products = lf.load_names()  # fix: this was executed twice at class level
    product_url = [product.lower() for product in products]

    url_base = 'https://www.skousen.dk'
    url_before = 'https://www.skousen.dk/search_result/?keywords='
    url_end = '#/'

    def start_requests(self):
        """Issue one search request per product model number."""
        for product in self.products:
            url = self.url_before + product + self.url_end
            yield scrapy.Request(url=url, callback=self.parse_search_page)

    def parse_search_page(self, response):
        """Follow the first search hit, or report the product as not found."""
        search_container = response.xpath('//a[@class="srp__product-link"]')
        if len(search_container) > 0:
            url_add = search_container.xpath('@href').extract_first()
            url = self.url_base + url_add
            yield scrapy.Request(url=url, callback=self.parse_product_page)
        else:
            # Fix: product_match was unbound (NameError) when no model
            # number occurred in the URL; also match case-insensitively.
            product_match = None
            req_url = response.request.url.lower()
            for product in self.products:
                if product.lower() in req_url:
                    product_match = product
            print(' ')
            print('------------------------------------')
            print(product_match, " not found")
            print('------------------------------------')
            print(' ')

    def parse_product_page(self, response):
        """Yield a Product item (preferring the campaign/discount price
        when present) plus a Website item archiving the raw HTML."""
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        product_match = None  # fix: was unbound when no model matched
        req_url = response.request.url.lower()
        for product in self.products:
            if product.lower() in req_url:
                product_match = product
        if product_match is None:
            return
        p = ProductLoader(item=Product(), response=response)
        p.add_value('product', product_match)
        normal_div = '//div[@class="vip__price-box-price"]/text()'
        discount_div = '//div[@class="campaign-banner__info__right"]//span[@class="top"]/text()'
        normal_container = response.xpath(normal_div)
        discount_container = response.xpath(discount_div)
        if len(discount_container) > 0:
            price_div = discount_div
        else:
            price_div = normal_div
        print('============================')
        print(normal_container)
        print(discount_container)
        print('============================')
        p.add_xpath('price', price_div)
        p.add_value('date', date)
        p.add_value('retailer', "Skousen")
        yield p.load_item()
        w = WebsiteLoader(item=Website(), response=response)
        w.add_value('html', response.body)
        w.add_value('date', date)
        w.add_value('product', product_match)
        w.add_value('retailer', "Skousen")
        yield w.load_item()
class WuptiSpider(CrawlSpider):
    """Scrape prices and raw page HTML from wupti.com via its search URL.

    NOTE(review): this spider overrides start_requests and defines no
    rules, so the CrawlSpider base adds nothing over scrapy.Spider —
    kept unchanged to preserve the class's external interface.
    """

    name = 'wupti_prices'
    allowed_domains = ['wupti.com', 'wupti.dk']
    products = lf.load_names()
    url_base = 'https://www.wupti.com'
    url_before = 'https://www.wupti.com/search?searchText='
    url_end = ''

    def start_requests(self):
        """Issue one search request per product model number."""
        for product in self.products:
            url = self.url_before + product + self.url_end
            yield scrapy.Request(url=url, callback=self.parse_search_page)

    def parse_search_page(self, response):
        """Follow the first search hit, or report the product as not found."""
        search_container = response.xpath('//a[@class="productPhotoLink"]')
        if len(search_container) > 0:
            url_add = search_container.xpath('@href').extract_first()
            url = self.url_base + url_add
            yield scrapy.Request(url=url, callback=self.parse_product_page)
        else:
            # Fix: product_match was unbound (NameError) when no model
            # number occurred in the URL; also match case-insensitively.
            product_match = None
            req_url = response.request.url.lower()
            for product in self.products:
                if product.lower() in req_url:
                    product_match = product
            print(' ')
            print('------------------------------------')
            print(product_match, " not found")
            print('------------------------------------')
            print(' ')

    def parse_product_page(self, response):
        """Yield a Product item (preferring the discount price when the
        discount container is present) plus a Website item with raw HTML."""
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        product_match = None  # fix: was unbound when no model matched
        req_url = response.request.url.lower()
        for product in self.products:
            if product.lower() in req_url:
                product_match = product
        if product_match is None:
            return
        p = ProductLoader(item=Product(), response=response)
        p.add_value('product', product_match)
        normal_div = '//div[@class="seller seller-first"]//div[@class="productPrice nobefore"]//strong/text()'
        discount_div = '//div[@class="seller seller-first"]//div[@class="productPrice"]//strong/text()'
        normal_container = response.xpath(normal_div)
        discount_container = response.xpath(discount_div)
        if len(discount_container) > 0:
            price_div = discount_div
        else:
            price_div = normal_div
        print('============================')
        print(normal_container)
        print(discount_container)
        print('============================')
        p.add_xpath('price', price_div)
        p.add_value('date', date)
        p.add_value('retailer', "Wupti")
        yield p.load_item()
        w = WebsiteLoader(item=Website(), response=response)
        w.add_value('html', response.body)
        w.add_value('date', date)
        w.add_value('product', product_match)
        w.add_value('retailer', "Wupti")
        yield w.load_item()
def __init__(self):
    """Load the product name list and launch a Firefox webdriver."""
    self.products = lf.load_names()
    self.driver = webdriver.Firefox()