Example #1
class PowerSpider(scrapy.Spider):
    name = 'power_prices'
    allowed_domains = ['www.power.dk']
    start_urls = ['https://www.power.dk']

    options = webdriver.ChromeOptions()
    WINDOW_SIZE = "1920,1080"
    options.add_argument("--headless")
    options.add_argument("--window-size=%s" % WINDOW_SIZE)
    # options.add_argument("--start-maximized")
    driver = webdriver.Chrome(executable_path=r"chromedriver.exe",
                              chrome_options=options)
    products = lf.load_names()

    # products = ['KGE36BW40', 'KG49EBI40']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse_search_page)

    def parse_search_page(self, response):
        self.driver.get(response.url)
        for product in self.products:
            search_form = self.driver.find_element_by_id('search-input')
            search_form.send_keys(product)
            time.sleep(1.0)
            search_button = self.driver.find_element_by_id('search-button')
            search_button.click()
            time.sleep(1.0)
            try:
                link_container = self.driver.find_element_by_xpath(
                    '//div[@class="product product-element angi product-4col"]'
                )
                link_ext = str(link_container.get_attribute("data-gtmurl"))
                link_all = self.start_urls[0] + link_ext
                yield scrapy.Request(url=link_all,
                                     callback=self.parse_product_page)
            except NoSuchElementException:
                continue
        self.driver.close()

    def parse_product_page(self, response):
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        for product in self.products:
            if product.lower() in response.request.url:
                product_match = product
                p = ProductLoader(item=Product(), response=response)
                p.add_xpath('price',
                            '//div[@class="prices-container "]//span/text()')
                p.add_value('date', date)
                p.add_value('retailer', "Power")
                p.add_value('product', product_match)
                yield p.load_item()
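
These excerpts omit their import blocks. A plausible shared preamble, assuming the item classes and loaders live in a project-local items module and the product-name helper is imported as lf (both module paths are guesses from usage):

import datetime
import time

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

# Assumed project-local modules; their real paths are not shown in the excerpts.
from ..items import Product, Website, Link, ProductLoader, WebsiteLoader, LinkLoader
from .. import load_files as lf
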
Example #2
class WhiteawaySpider(scrapy.Spider):
    name = 'whiteaway_prices'
    allowed_domains = ['whiteaway.com']
    products = lf.load_names()
    # self.products = ['KGE36BI40']
    url_base = 'https://www.whiteaway.com'
    url_before = 'https://www.whiteaway.com/search_result/?keywords='
    url_end = '#/'

    def start_requests(self):
        for product in self.products:
            url = self.url_before + product + self.url_end
            yield scrapy.Request(url=url, callback=self.parse_search_page)

    def parse_search_page(self, response):
        search_container = response.xpath('//a[@class="srp__product-link"]')
        if len(search_container) > 0:
            url_add = search_container.xpath('@href').extract_first()
            url = self.url_base + url_add
            yield scrapy.Request(url=url, callback=self.parse_product_page)
        else:
            product_match = 'unknown'
            for product in self.products:
                if product.lower() in response.request.url:
                    product_match = product
            self.logger.info('%s not found', product_match)

    def parse_product_page(self, response):
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        product_match = 'unknown'
        for product in self.products:
            if product.lower() in response.request.url:
                product_match = product

                p = ProductLoader(item=Product(), response=response)
                p.add_value('product', product_match)

                p.add_xpath('price',
                            '//div[@class="vip__price-box-price"]/text()')
                p.add_value('date', date)
                p.add_value('retailer', "Whiteaway")
                yield p.load_item()

                w = WebsiteLoader(item=Website(), response=response)
                w.add_value('html', response.body)
                w.add_value('date', date)
                w.add_value('product', product_match)
                w.add_value('retailer', "Whiteaway")
                yield w.load_item()
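
Product, Website, and their loaders are defined elsewhere in the project. A minimal sketch of what they might look like, inferred from the fields the spiders populate (the field sets and processors are assumptions; Link is used by the link spiders further down):

from scrapy import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

class Product(Item):
    product = Field()
    price = Field()
    date = Field()
    retailer = Field()

class Website(Item):
    html = Field()
    product = Field()
    date = Field()
    retailer = Field()

class Link(Item):
    link = Field()
    product = Field()
    date = Field()
    retailer = Field()

class ProductLoader(ItemLoader):
    default_item_class = Product
    default_output_processor = TakeFirst()  # keep the first extracted value

class WebsiteLoader(ItemLoader):
    default_item_class = Website
    default_output_processor = TakeFirst()

class LinkLoader(ItemLoader):
    default_item_class = Link
    default_output_processor = TakeFirst()
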
Example #3
class WhiteawayLinksSpider(scrapy.Spider):

    name = 'whiteaway_links'
    allowed_domains = ['whiteaway.com']
    start_urls = ['https://www.whiteaway.com/']

    products = lf.load_names()
    product_url = [product.lower() for product in products]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # self.driver = webdriver.Firefox()
        options = webdriver.ChromeOptions()
        options.add_argument("--start-maximized")
        self.driver = webdriver.Chrome(executable_path=r"chromedriver.exe",
                                       chrome_options=options)
        self.products = lf.load_names()
        # self.products = ['KGE36BW40', 'KG49EBI40']

    def parse(self, response):
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        self.driver.get(response.url)

        for product in self.products:
            # search_form_class = "search__input js-search-field js-sniper-click js-click-track"
            search_form_class = "search__input"
            search_form = self.driver.find_element_by_class_name(search_form_class)
            search_form.send_keys(product)
            time.sleep(1)
            search_button_class = "search__submit"
            search_button = self.driver.find_element_by_class_name(search_button_class)
            search_button.click()
            time.sleep(1)
            try:
                link_container = self.driver.find_element_by_xpath(
                    '//div[@class="srp-product-box__image-section list"]//a')
                # get_attribute("href") already returns an absolute URL,
                # so no base prefix is needed.
                product_link = link_container.get_attribute("href")
                l = LinkLoader(item=Link())
                l.add_value('link', product_link)
                l.add_value('product', product)
                l.add_value('date', date)
                l.add_value('retailer', 'Whiteaway')
                yield l.load_item()
            except NoSuchElementException:
                print(" ")
                print("-----------------------------")
                print("Element does not exist")
                print("-----------------------------")
                print(" ")
        self.driver.close()
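
Calling self.driver.close() at the end of a parse method only works because each of these spiders handles a single response with Selenium; a cleaner pattern is Scrapy's closed() hook, which runs exactly once when the spider finishes (a sketch):

    def closed(self, reason):
        # Invoked by Scrapy on spider shutdown, whatever the reason.
        self.driver.quit()  # quit() also ends the chromedriver process
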
Example #4
class WuptiSpider(CrawlSpider):
    name = 'wupti_prices'
    allowed_domains = ['wupti.com', 'wupti.dk']
    start_urls = ['https://www.wupti.com/produkter/hvidevarer/koel-og-frys/']
    # start_urls = [
    #     'https://www.wupti.com/produkter/hvidevarer/koel-og-frys/koele/fryseskabe/siemens-kg39ebi40-taenk-bedst'
    # ]

    products = lf.load_names()
    product_url = [product.lower() for product in products]

    rules = (
        Rule(LinkExtractor(allow=product_url), callback='parse_item'),
        Rule(LinkExtractor(allow=r"koele")),
    )

    def parse_item(self, response):
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        product_match = 'unknown'
        for product in self.products:
            if product.lower() in response.request.url:
                product_match = product
        p = ProductLoader(item=Product(), response=response)
        p.add_value('product', product_match)

        normal_div = '//div[@class="seller seller-first"]//div[@class="productPrice nobefore"]//strong/text()'
        discount_div = '//div[@class="seller seller-first"]//div[@class="productPrice"]//strong/text()'
        normal_container = response.xpath(normal_div)
        discount_container = response.xpath(discount_div)

        if len(discount_container) > 0:
            price_div = discount_div
        else:
            price_div = normal_div

        self.logger.debug('normal price container: %r', normal_container)
        self.logger.debug('discount price container: %r', discount_container)

        p.add_xpath('price', price_div)
        p.add_value('date', date)
        p.add_value('retailer', "Wupti")
        yield p.load_item()

        w = WebsiteLoader(item=Website(), response=response)
        w.add_value('html', response.body)
        w.add_value('date', date)
        w.add_value('product', product_match)
        w.add_value('retailer', "Wupti")
        yield w.load_item()
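
The rules pair works as follows: LinkExtractor's allow argument accepts a list of regular expressions, so the lower-cased model numbers route any matching product URL to parse_item, while the second rule keeps the crawl walking through category pages whose URL contains "koele". The mechanism only works on a CrawlSpider subclass, with these imports:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
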
Example #5
class Punkt1LinksSpider(scrapy.Spider):

    name = 'punkt1_links'
    allowed_domains = ['punkt1.dk']
    start_urls = ['https://www.punkt1.dk/']

    # driver = webdriver.Firefox()
    options = webdriver.ChromeOptions()
    WINDOW_SIZE = "1920,1080"
    options.add_argument("--headless")
    options.add_argument("--window-size=%s" % WINDOW_SIZE)
    # options.add_argument("--start-maximized")
    driver = webdriver.Chrome(executable_path=r"chromedriver.exe",
                              chrome_options=options)
    products = lf.load_names()

    # products = ['KGE36BW40', 'KG49EBI40']

    def parse(self, response):
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        self.driver.get(response.url)

        for product in self.products:
            search_form = self.driver.find_element_by_id('search-input')
            search_form.send_keys(product)
            time.sleep(1.0)
            search_button = self.driver.find_element_by_id('search-button')
            search_button.click()
            time.sleep(1.0)
            try:
                link_container = self.driver.find_element_by_xpath(
                    '//div[@class="product product-element angi product-4col"]'
                )
                link = str(link_container.get_attribute("data-gtmurl"))
                product_link = 'https://www.punkt1.dk' + link
                l = LinkLoader(item=Link())
                l.add_value('link', product_link)
                l.add_value('product', product)
                l.add_value('date', date)
                l.add_value('retailer', 'Punkt1')
                yield l.load_item()
            except NoSuchElementException:
                print(" ")
                print("-----------------------------")
                print("Element does not exist")
                print("-----------------------------")
                print(" ")
        self.driver.close()
Example #6
class SkousenSpider(CrawlSpider):
    name = 'skousen_prices'
    allowed_domains = ['skousen.dk']
    start_urls = ['https://www.skousen.dk/hvidevarer/koele-fryseskab/']
    products = lf.load_names()
    product_url = [product.lower() for product in products]

    rules = (
        Rule(LinkExtractor(allow=(product_url)), callback='parse_item'),
        Rule(LinkExtractor(allow=r"koele")),
    )

    def parse_item(self, response):
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        product_match = 'unknown'
        for product in self.products:
            if product.lower() in response.request.url:
                product_match = product
        p = ProductLoader(item=Product(), response=response)
        p.add_value('product', product_match)

        normal_div = '//div[@class="vip__price-box-price"]/text()'
        discount_div = '//div[@class="campaign-banner__info__right"]//span[@class="top"]/text()'
        normal_container = response.xpath(normal_div)
        discount_container = response.xpath(discount_div)

        if len(discount_container) > 0:
            price_div = discount_div
        else:
            price_div = normal_div

        self.logger.debug('normal price container: %r', normal_container)
        self.logger.debug('discount price container: %r', discount_container)

        p.add_xpath('price', price_div)
        p.add_value('date', date)
        p.add_value('retailer', "Skousen")
        yield p.load_item()

        w = WebsiteLoader(item=Website(), response=response)
        w.add_value('html', response.body)
        w.add_value('date', date)
        w.add_value('product', product_match)
        w.add_value('retailer', "Skousen")
        yield w.load_item()
Example #7
class Punkt1Spider(scrapy.Spider):

    name = 'punkt1_prices_old'
    allowed_domains = ['punkt1.dk']
    # start_urls = ['https://www.wupti.com/produkter/hvidevarer/koel-og-frys/']
    start_urls = [
        'https://www.punkt1.dk/hvidevarer/koel-frys-og-koelefryseskabe/koelefryseskabe/bosch-kge36bw40/p-552643/'
    ]

    products = lf.load_names()

    def parse(self, response):
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        product_match = 'unknown'
        for product in self.products:
            if product.lower() in response.request.url:
                product_match = product
        p = ProductLoader(item=Product(), response=response)
        p.add_value('product', product_match)

        # punkt1.dk shows a single price tag whether or not the item is
        # discounted, so no discount/normal fallback is needed here.
        price_div = '//span[@class="product-price-tag"]/text()'
        self.logger.debug('price container: %r', response.xpath(price_div))

        p.add_xpath('price', price_div)
        p.add_value('date', date)
        p.add_value('retailer', "Punkt1")
        yield p.load_item()
Example #8
class ElgigantenSpider(scrapy.Spider):
    name = 'elgiganten_prices'
    allowed_domains = ['elgiganten.dk']

    products = lf.load_names()
    url_base = 'https://www.elgiganten.dk'
    url_before = 'https://www.elgiganten.dk/search?SearchTerm='
    url_end = '&search=&searchResultTab='

    def start_requests(self):
        for product in self.products:
            url = self.url_before + product + self.url_end
            yield scrapy.Request(url=url, callback=self.parse_product_page)

    def parse_product_page(self, response):
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        for product in self.products:
            if product.lower() in response.request.url:
                product_match = product

                p = ProductLoader(item=Product(), response=response)
                p.add_value('product', product_match)
                p.add_xpath(
                    'price',
                    '//div[@class="product-price-container"]//span/text()')
                p.add_value('date', date)
                p.add_value('retailer', "Elgiganten")
                yield p.load_item()

                w = WebsiteLoader(item=Website(), response=response)
                w.add_value('html', response.body)
                w.add_value('date', date)
                w.add_value('product', product_match)
                w.add_value('retailer', "Elgiganten")
                yield w.load_item()
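
Building search URLs by plain concatenation assumes the model numbers never contain characters with special meaning in a query string. A slightly safer start_requests, using only the standard library (a sketch):

from urllib.parse import quote_plus

def start_requests(self):
    for product in self.products:
        # quote_plus escapes spaces and reserved characters in the search term
        url = self.url_before + quote_plus(product) + self.url_end
        yield scrapy.Request(url=url, callback=self.parse_product_page)
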
Example #9
def __init__(self):
    self.products = lf.load_names()
    # self.products = ['KG49EBI40']
    self.url_base = 'https://hvidevarer.bilka.dk'
    self.url_before = 'https://hvidevarer.bilka.dk/search?searchText='
    self.url_end = ''
Example #10
def __init__(self):
    # self.driver = webdriver.Firefox()
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    self.driver = webdriver.Chrome(executable_path=r"chromedriver.exe",
                                   chrome_options=options)
    self.products = lf.load_names()
Example #11
def __init__(self):
    self.products = lf.load_names()
    self.base_url = 'https://www.elgiganten.dk/product/hvidevarer/kolefryseskab/'
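
lf.load_names() appears in every example but is not shown on this page. Judging by its use, it returns a list of model-number strings; a minimal sketch, assuming the names sit one per line in a text file (the file name is a guess):

def load_names(path='product_names.txt'):
    """Return the product model numbers to track, one per input line."""
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]
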
Example #12
class SkousenSpider(scrapy.Spider):
    name = 'skousen_prices'
    allowed_domains = ['skousen.dk']
    products = lf.load_names()
    # products = ['KGE36BI40']
    url_base = 'https://www.skousen.dk'
    url_before = 'https://www.skousen.dk/search_result/?keywords='
    url_end = '#/'

    def start_requests(self):
        for product in self.products:
            url = self.url_before + product + self.url_end
            yield scrapy.Request(url=url, callback=self.parse_search_page)

    def parse_search_page(self, response):
        search_container = response.xpath('//a[@class="srp__product-link"]')
        if len(search_container) > 0:
            url_add = search_container.xpath('@href').extract_first()
            url = self.url_base + url_add
            yield scrapy.Request(url=url, callback=self.parse_product_page)
        else:
            product_match = 'unknown'
            for product in self.products:
                if product.lower() in response.request.url:
                    product_match = product
            self.logger.info('%s not found', product_match)

    def parse_product_page(self, response):
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        for product in self.products:
            if product.lower() in response.request.url:
                product_match = product
                p = ProductLoader(item=Product(), response=response)
                p.add_value('product', product_match)

                normal_div = '//div[@class="vip__price-box-price"]/text()'
                discount_div = '//div[@class="campaign-banner__info__right"]//span[@class="top"]/text()'
                normal_container = response.xpath(normal_div)
                discount_container = response.xpath(discount_div)

                if len(discount_container) > 0:
                    price_div = discount_div
                else:
                    price_div = normal_div

                self.logger.debug('normal price container: %r', normal_container)
                self.logger.debug('discount price container: %r', discount_container)

                p.add_xpath('price', price_div)
                p.add_value('date', date)
                p.add_value('retailer', "Skousen")
                yield p.load_item()

                w = WebsiteLoader(item=Website(), response=response)
                w.add_value('html', response.body)
                w.add_value('date', date)
                w.add_value('product', product_match)
                w.add_value('retailer', "Skousen")
                yield w.load_item()
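
The discount-versus-normal price fallback above is repeated verbatim in the Skousen and Wupti spiders; it could be factored into one helper (a sketch, not part of the original code):

def pick_price_xpath(response, discount_xpath, normal_xpath):
    # Prefer the campaign price when the discount container matches anything.
    if response.xpath(discount_xpath):
        return discount_xpath
    return normal_xpath

The call site then becomes p.add_xpath('price', pick_price_xpath(response, discount_div, normal_div)).
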
Example #13
class WuptiSpider(scrapy.Spider):
    name = 'wupti_prices'
    allowed_domains = ['wupti.com', 'wupti.dk']
    products = lf.load_names()
    # products = ['KGE36BI40']
    url_base = 'https://www.wupti.com'
    url_before = 'https://www.wupti.com/search?searchText='
    url_end = ''

    def start_requests(self):
        for product in self.products:
            url = self.url_before + product + self.url_end
            yield scrapy.Request(url=url, callback=self.parse_search_page)

    def parse_search_page(self, response):
        search_container = response.xpath('//a[@class="productPhotoLink"]')
        if len(search_container) > 0:
            url_add = search_container.xpath('@href').extract_first()
            url = self.url_base + url_add
            yield scrapy.Request(url=url, callback=self.parse_product_page)
        else:
            product_match = 'unknown'
            for product in self.products:
                if product.lower() in response.request.url:
                    product_match = product
            self.logger.info('%s not found', product_match)

    def parse_product_page(self, response):
        now = datetime.datetime.now()
        date = now.strftime("%Y-%m-%d")
        for product in self.products:
            if product.lower() in response.request.url:
                product_match = product
                p = ProductLoader(item=Product(), response=response)
                p.add_value('product', product_match)

                normal_div = '//div[@class="seller seller-first"]//div[@class="productPrice nobefore"]//strong/text()'
                discount_div = '//div[@class="seller seller-first"]//div[@class="productPrice"]//strong/text()'
                normal_container = response.xpath(normal_div)
                discount_container = response.xpath(discount_div)

                if len(discount_container) > 0:
                    price_div = discount_div
                else:
                    price_div = normal_div

                self.logger.debug('normal price container: %r', normal_container)
                self.logger.debug('discount price container: %r', discount_container)

                p.add_xpath('price', price_div)
                p.add_value('date', date)
                p.add_value('retailer', "Wupti")
                yield p.load_item()

                w = WebsiteLoader(item=Website(), response=response)
                w.add_value('html', response.body)
                w.add_value('date', date)
                w.add_value('product', product_match)
                w.add_value('retailer', "Wupti")
                yield w.load_item()
Example #14
def __init__(self):
    self.driver = webdriver.Firefox()
    self.products = lf.load_names()
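
All Selenium calls in these examples use the Selenium 3 API (find_element_by_*, executable_path, chrome_options), which was removed in Selenium 4. Run against a current Selenium, the equivalent setup looks roughly like this:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service("chromedriver.exe"), options=options)
search_form = driver.find_element(By.ID, "search-input")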