def parse(self, response):
    """Yield the best-priced competitor offer from a JSON search response.

    If the first result carries an MPN, the search is restricted to items
    sharing that MPN (case-insensitive) and the cheapest one passing
    valid_price() is yielded.  Otherwise the first item in the full result
    list with a valid price is yielded.  Offers hosted on FILTER_DOMAINS
    are skipped in both modes.
    """
    data = json.loads(response.body)
    index = 0
    # MPN of the very first product, if any.
    mpns = data['items'][0]['product'].get('mpns', [''])[0]
    if mpns:
        # Keep only items whose first MPN matches, case-insensitively,
        # and track the cheapest valid offer among them.
        lowest = None
        data_mpns = {
            'items': [
                entry for entry in data['items']
                if entry['product'].get('mpns', [''])[0].lower() == mpns.lower()
            ]
        }
        while True:
            res = self._get_item(data_mpns, index, response)
            if not res:
                break
            pr, item = res[0], res[1]
            # Offers hosted on blacklisted domains are skipped outright.
            if any(self._check_domain(domain, pr['url'])
                   for domain in FILTER_DOMAINS):
                index += 1
                continue
            if valid_price(response.meta['price'], pr['price']) and \
                    (lowest is None or lowest['price'] > pr['price']):
                lowest = pr
            index += 1
        if lowest:
            yield lowest
    else:
        # No MPN available: fall back to the first result whose price
        # lies inside the accepted range.
        first_valid = None
        while True:
            res = self._get_item(data, index, response)
            if not res:
                break
            pr, item = res[0], res[1]
            if any(self._check_domain(domain, pr['url'])
                   for domain in FILTER_DOMAINS):
                index += 1
                continue
            if valid_price(response.meta['price'], pr['price']):
                first_valid = pr
                break
            index += 1
        if first_valid:
            yield first_valid
def parse(self, response):
    """Yield the cheapest valid-priced Amazon search result, if any."""
    hxs = HtmlXPathSelector(response)
    results = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    best = None
    for result in results:
        loader = ProductLoader(item=Product(), selector=result)
        loader.add_xpath(
            'name',
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/text()')
        loader.add_xpath(
            'url',
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/@href')
        loader.add_xpath('price', './/*[@class="newPrice"]//span/text()')
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])
        price = loader.get_output_value('price')
        # Keep the cheapest result whose price is inside the allowed range.
        if price and (best is None or best.get_output_value('price') > price):
            if valid_price(response.meta['price'], price):
                best = loader
    if best:
        yield best.load_item()
def parse(self, response):
    """Yield the cheapest result matching every search term, otherwise
    the cheapest valid-priced result overall."""
    hxs = HtmlXPathSelector(response)
    candidates = []
    for box in hxs.select(u'//div[@class="resultBox"]'):
        loader = ProductLoader(item=Product(), selector=box)
        rel_url = box.select(u'./h2/a/@href')[0].extract()
        loader.add_value('url', urljoin_rfc(get_base_url(response), rel_url))
        loader.add_value('name',
                         box.select(u'./h2/a/text()')[0].extract().strip())
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])
        # Continental decimal comma -> dot before loading the price.
        raw_price = box.select(
            u'./ul/li[@class="price"]/h3[@class="mainPrice"]/text()'
        )[0].extract().replace(',', '.')
        loader.add_value('price', raw_price)
        if valid_price(response.meta['price'],
                       loader.get_output_value('price')):
            candidates.append(loader)
    candidates.sort(key=lambda entry: entry.get_output_value('price'))
    # Prefer the cheapest result whose name contains every search term.
    terms = response.meta['model'].lower().split(' ')
    for candidate in candidates:
        name = candidate.get_output_value('name')
        if all(term in name.lower() for term in terms):
            yield candidate.load_item()
            return
    if candidates:
        yield candidates[0].load_item()
def parse(self, response):
    """Parse a single search result; re-issue the request up to three
    times when the expected result markup is missing."""
    hxs = HtmlXPathSelector(response)
    product = hxs.select('//td[@r="1"]')
    if not product:
        product = hxs.select('//table[@r="1"]')
    if not product:
        attempts = response.meta.get('_retries', 0)
        if attempts >= 3:
            # Give up silently after three failed retries.
            return
        yield Request(response.url,
                      meta={'sku': response.meta['sku'],
                            '_retries': attempts + 1},
                      dont_filter=True)
        return
    loader = ProductLoader(item=Product(), selector=product)
    loader.add_xpath('name', './/div[@class="ittl"]//a[@class="vip"]/text()')
    loader.add_xpath('url', './/div[@class="ittl"]//a[@class="vip"]/@href')
    # Several price layouts exist; probe them all, the loader keeps matches.
    loader.add_xpath('price',
                     './/div[@class="prices"]//span[@class="amt"]/text()')
    loader.add_xpath('price',
                     './/div[@class="prices"]//span[@class="g-b amt"]/text()')
    loader.add_xpath('price', './/td[@class="prc"]//div[@class="g-b"]/text()')
    loader.add_xpath('price', './/*[@itemprop="price"]/text()')
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('identifier', response.meta['sku'])
    # Skip the "apparelsave" reseller and out-of-range prices.
    if 'apparelsave' not in loader.get_output_value('name').lower() \
            and valid_price(response.meta['price'],
                            loader.get_output_value('price')):
        yield loader.load_item()
def parse(self, response):
    """Parse Amazon results with BeautifulSoup and yield the cheapest
    product whose price passes valid_price()."""
    hxs = HtmlXPathSelector(response)
    results = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    cheapest = None
    for result in results:
        loader = ProductLoader(item=Product(), selector=result)
        # The result markup is easier to traverse with BeautifulSoup here.
        soup = BeautifulSoup(result.extract())
        header = soup.find('h3', attrs={'class': 'newaps'})
        loader.add_value('name', header.findAll('span')[0].string)
        loader.add_value('url', header.findAll('a')[0]['href'])
        loader.add_value(
            'price',
            soup.find('ul', attrs={'class': 'rsltL'}).findAll('span')[0].string)
        current_price = loader.get_output_value('price')
        if not current_price:
            continue
        if cheapest is not None and \
                cheapest.get_output_value('price') <= current_price:
            continue
        if valid_price(response.meta['price'], current_price):
            cheapest = loader
    if cheapest:
        yield cheapest.load_item()
def parse(self, response):
    """Yield the lowest-priced search result that passes the price filter."""
    hxs = HtmlXPathSelector(response)
    listing = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    winner = None
    for entry in listing:
        loader = ProductLoader(item=Product(), selector=entry)
        loader.add_xpath(
            'name',
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/text()'
        )
        loader.add_xpath(
            'url',
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/@href'
        )
        loader.add_xpath('price', './/*[@class="newPrice"]//span/text()')
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])
        entry_price = loader.get_output_value('price')
        if not entry_price:
            continue
        is_cheaper = winner is None or \
            winner.get_output_value('price') > entry_price
        if is_cheaper and valid_price(response.meta['price'], entry_price):
            winner = loader
    if winner:
        yield winner.load_item()
def parse(self, response):
    """Yield the best competitor offer from a JSON search response.

    When the first product carries an MPN, the cheapest valid-priced
    offer sharing that exact MPN is yielded; otherwise the first offer
    with a valid price is yielded.  Offers from FILTER_DOMAINS are
    ignored in both modes.

    Bug fix: ``first_valid`` is now initialised before the fallback
    loop -- previously a search where no offer passed valid_price()
    raised NameError at ``if first_valid:``.
    """
    data = json.loads(response.body)
    i = 0
    # MPN of the first product, if present.
    mpns = data['items'][0]['product'].get('mpns', [''])[0]
    if mpns:
        # Cheapest valid offer among items with the same (exact) MPN.
        lowest = None
        data_mpns = {'items': [item for item in data['items']
                               if item['product'].get('mpns', [''])[0] == mpns]}
        while True:
            res = self._get_item(data_mpns, i, response)
            if not res:
                break
            pr = res[0]
            item = res[1]
            invalid_domain = any(self._check_domain(domain, pr['url'])
                                 for domain in FILTER_DOMAINS)
            if not invalid_domain:
                if valid_price(response.meta['price'], pr['price']) and \
                        (lowest is None or lowest['price'] > pr['price']):
                    lowest = pr
            i += 1
        if lowest:
            yield lowest
    else:
        # Fallback: first offer whose price is inside the accepted range.
        first_valid = None  # was unbound when the loop found nothing
        while True:
            res = self._get_item(data, i, response)
            if not res:
                break
            pr = res[0]
            item = res[1]
            invalid_domain = any(self._check_domain(domain, pr['url'])
                                 for domain in FILTER_DOMAINS)
            if not invalid_domain:
                if valid_price(response.meta['price'], pr['price']):
                    first_valid = pr
                    break
            i += 1
        if first_valid:
            yield first_valid
def parse(self, response):
    """Collect all valid-priced search results and yield the cheapest one.

    Results come from both the "atf" and "btf" result containers; a
    series of fallback XPaths copes with the many price layouts.

    Improvements over the previous version: the stray Python-2 debug
    ``print price`` now goes through ``self.log``, the duplicated
    ``rsltL … bld lrg red`` price XPath was removed, the fallback chain
    is driven by one tuple instead of nine nested ``if not price``
    blocks, and the unused ``pr``/``next_prods`` locals are gone.
    """
    hxs = HtmlXPathSelector(response)
    products = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    products += hxs.select(
        '//div[@id="btfResults"]//div[starts-with(@id, "result_")]')
    # Price layouts vary between grid/list views; the first match wins.
    price_xpaths = (
        './/*[@class="newPrice"]//span[contains(@class,"price")]/text()',
        './/div[@class="usedNewPrice"]//span[@class="price"]/text()',
        './/div[@class="usedPrice"]//span//text()',
        './/ul[@class="rsltGridList"]/li[1]/a/span[@class="bld lrg red"]//text()',
        './/ul[@class="rsltGridList grey"]/li[1]/a/span[@class="price bld"]//text()',
        './/ul[@class="rsltGridList grey"]/li[1]/a/span[@class="bld lrg red"]//text()',
        './/ul[@class="rsltL"]/li[1]/a/span[@class="bld lrg red"]//text()',
        './/ul[@class="rsltL"]/li[1]/a/span[@class="price bld"]//text()',
    )
    search_results = []
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        name = product.select(
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/text()'
        ).extract()
        if not name:
            name = product.select(
                'h3[@class="newaps"]/a/span/text()').extract()
        loader.add_value('name', name)
        url = product.select(
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/@href'
        ).extract()
        if not url:
            url = product.select('h3[@class="newaps"]/a/@href').extract()
        loader.add_value('url', url)
        price = None
        for xpath in price_xpaths:
            price = product.select(xpath).extract()
            if price:
                break
        if not price:
            self.log("No price found")
            continue
        self.log("Price found: %s" % price)
        # Continental decimal comma -> dot before loading.
        loader.add_value('price', price[0].replace(',', '.'))
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])
        if valid_price(response.meta['price'],
                       loader.get_output_value('price')):
            search_results.append(loader)
    # Cheapest valid result wins.
    search_results.sort(key=lambda x: x.get_output_value('price'))
    if search_results:
        yield search_results[0].load_item()
def parse(self, response):
    """Yield the lowest valid-priced product from the results page,
    logging any slot where name, price or url extraction failed."""
    hxs = HtmlXPathSelector(response)
    products = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    best = None
    for position, product in enumerate(products, start=1):
        loader = ProductLoader(item=Product(), selector=product)
        name = product.select('.//h3[@class="newaps"]/a/span/text()').extract()
        if not name:
            # Only the first slot is expected to always carry a name.
            if position == 1:
                self.log("ERROR name not found")
            continue
        loader.add_value('name', name[0])
        price = product.select('.//ul[@class="rsltL"]//span[1]/text()').extract()
        if not price:
            price = product.select(
                './/ul[contains(@class,"rsltGridList grey")]//span[1]/text()'
            ).extract()
        if not price:
            self.log("ERROR price not found2")
            continue
        loader.add_value('price', price[0])
        url = product.select('.//h3[@class="newaps"]/a/@href').extract()
        if not url:
            self.log("ERROR url not found")
        else:
            loader.add_value('url', url[0])
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])
        out_price = loader.get_output_value('price')
        if out_price and \
                (best is None or best.get_output_value('price') > out_price) and \
                valid_price(response.meta['price'], out_price):
            best = loader
    if best:
        yield best.load_item()
def parse(self, response):
    """Yield the first valid-priced offer, skipping apparelsave/shoemetro."""
    data = json.loads(response.body)
    idx = 0
    while True:
        res = self._get_item(data, idx, response)
        if not res:
            return
        offer, item = res[0], res[1]
        seller_name = item["product"]["author"]["name"].lower()
        # These two resellers are always skipped.
        skip = "apparelsave" in seller_name or "shoemetro.com" in offer["url"]
        if not skip and valid_price(response.meta["price"], offer["price"]):
            yield offer
            return
        idx += 1
def parse(self, response):
    """Yield the cheapest valid-priced offer not hosted on a filtered domain."""
    data = json.loads(response.body)
    index = 0
    cheapest = None
    while True:
        res = self._get_item(data, index, response)
        if not res:
            break
        offer = res[0]
        item = res[1]
        blacklisted = any(self._check_domain(domain, offer['url'])
                          for domain in FILTER_DOMAINS)
        if not blacklisted:
            if valid_price(response.meta['price'], offer['price']) and \
                    (cheapest is None or cheapest['price'] > offer['price']):
                cheapest = offer
        index += 1
    if cheapest:
        yield cheapest
def parse(self, response):
    """Walk the JSON results in order and yield the first offer that has
    a valid price and is not sold by a blacklisted reseller."""
    data = json.loads(response.body)
    position = 0
    while True:
        entry = self._get_item(data, position, response)
        if not entry:
            return
        offer = entry[0]
        item = entry[1]
        author = item['product']['author']['name'].lower()
        blacklisted = ('apparelsave' in author
                       or 'shoemetro.com' in offer['url'])
        if not blacklisted and valid_price(response.meta['price'],
                                           offer['price']):
            yield offer
            return
        position += 1
def parse(self, response):
    """Yield the cheapest valid-priced result, parsed with BeautifulSoup."""
    hxs = HtmlXPathSelector(response)
    products = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    best = None
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        soup = BeautifulSoup(product.extract())
        title = soup.find('h3', attrs={'class': 'newaps'})
        loader.add_value('name', title.findAll('span')[0].string)
        loader.add_value('url', title.findAll('a')[0]['href'])
        loader.add_value(
            'price',
            soup.find('ul', attrs={'class': 'rsltL'}).findAll('span')[0].string)
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])
        price = loader.get_output_value('price')
        # Keep only the cheapest result inside the accepted price range.
        if price and (best is None or best.get_output_value('price') > price) \
                and valid_price(response.meta['price'], price):
            best = loader
    if best:
        yield best.load_item()
def parse(self, response):
    """Parse one product listing; if the expected markup is absent the
    request is re-issued, giving up after the third retry."""
    hxs = HtmlXPathSelector(response)
    container = hxs.select('//td[@r="1"]') or hxs.select('//table[@r="1"]')
    if not container:
        attempts = response.meta.get('_retries', 0)
        if attempts < 3:
            yield Request(response.url,
                          meta={'sku': response.meta['sku'],
                                '_retries': attempts + 1},
                          dont_filter=True)
        # After three failed attempts the product is silently dropped.
        return
    loader = ProductLoader(item=Product(), selector=container)
    loader.add_xpath('name', './/div[@class="ittl"]//a[@class="vip"]/text()')
    loader.add_xpath('url', './/div[@class="ittl"]//a[@class="vip"]/@href')
    # Probe each known price markup; the loader collects whichever matches.
    for price_xpath in (
            './/div[@class="prices"]//span[@class="amt"]/text()',
            './/div[@class="prices"]//span[@class="g-b amt"]/text()',
            './/td[@class="prc"]//div[@class="g-b"]/text()',
            './/*[@itemprop="price"]/text()'):
        loader.add_xpath('price', price_xpath)
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('identifier', response.meta['sku'])
    name_ok = 'apparelsave' not in loader.get_output_value('name').lower()
    if name_ok and valid_price(response.meta['price'],
                               loader.get_output_value('price')):
        yield loader.load_item()
def parse(self, response):
    """Scan every offer in the JSON payload and yield the single
    cheapest one with an acceptable price, ignoring filtered domains."""
    data = json.loads(response.body)
    best_offer = None
    idx = 0
    while True:
        found = self._get_item(data, idx, response)
        if not found:
            break
        offer, item = found[0], found[1]
        idx += 1
        # Offers hosted on any FILTER_DOMAINS entry are discarded.
        if any(self._check_domain(d, offer['url']) for d in FILTER_DOMAINS):
            continue
        if not valid_price(response.meta['price'], offer['price']):
            continue
        if best_offer is None or best_offer['price'] > offer['price']:
            best_offer = offer
    if best_offer:
        yield best_offer
def parse(self, response):
    """Log the result count and yield the cheapest acceptable product."""
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//div[@id="atfResults"]'
                          '//div[starts-with(@id, "result_")]')
    log.msg(">>>>>>> FOUND %s ITEMS >>>" % len(products))
    best = None
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('name', './/h3/a/span/text()')
        # Drop products that fail the name filter.
        if not accept_product(loader.get_output_value('name')):
            continue
        loader.add_xpath('url', './/h3/a/@href')
        loader.add_xpath('price', './/*[@class="newp"]//span/text()')
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])
        price = loader.get_output_value('price')
        if not price:
            continue
        if (best is None or best.get_output_value('price') > price) and \
                valid_price(response.meta['price'], price):
            best = loader
    if best:
        yield best.load_item()
def parse(self, response):
    """BeautifulSoup-based result parser; yields the cheapest result
    whose price passes valid_price()."""
    hxs = HtmlXPathSelector(response)
    rows = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    lowest = None
    for row in rows:
        loader = ProductLoader(item=Product(), selector=row)
        soup = BeautifulSoup(row.extract())
        heading = soup.find("h3", attrs={"class": "newaps"})
        price_list = soup.find("ul", attrs={"class": "rsltL"})
        loader.add_value("name", heading.findAll("span")[0].string)
        loader.add_value("url", heading.findAll("a")[0]["href"])
        loader.add_value("price", price_list.findAll("span")[0].string)
        row_price = loader.get_output_value("price")
        cheaper = lowest is None or \
            lowest.get_output_value("price") > row_price
        if row_price and cheaper and \
                valid_price(response.meta["price"], row_price):
            lowest = loader
    if lowest:
        yield lowest.load_item()
def parse_product(self, response):
    """Validate a product page against the CSV row and yield it, or fall
    through to the next candidate result / next page / next search URL.

    A page is accepted when its SKU matches the CSV SKU (either
    direction of ``match_skus``) AND at least one CSV name word appears
    in the page title; it is yielded only if its price also passes
    valid_price().
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('name', u'//span[@id="btAsinTitle"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('image_url', u'//tr[@id="prodImageContainer"]//img/@src')
    if not loader.get_output_value(u'image_url'):
        # XPath missed: fall back to BeautifulSoup for the product image.
        soup = BeautifulSoup(response.body)
        image_tag = soup.find(
            lambda tag: tag.name == u'img'
            and tag.findParent(u'tr', id=u'prodImageContainer'))
        if image_tag:
            loader.add_value('image_url', image_tag.get(u'src'))
    loader.add_xpath(
        'brand',
        u'//span[@class="tsLabel" and contains(text(),"Brand")]/following-sibling::span/text()')
    # The price appears under several different markups; try each in turn.
    loader.add_xpath('price', u'//b[@class="priceLarge"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="priceLarge"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="price"]/text()')
    sku = hxs.select(
        u'//li/b[contains(text(),"Item model number")]/../text()').extract()
    if sku:
        sku = sku[0].strip()
    else:
        log.msg('No sku.')
    csv_sku = response.meta['sku'].strip()
    log.msg('SKU: [%s == %s]' % (sku.lower() if sku else u'None', csv_sku))
    csv_name = response.meta['name'].lower().split(u' ')
    site_name = loader.get_output_value('name').lower().split(u' ')
    log.msg(u'NAME: [%s == %s]' % (csv_name, site_name))
    # A single shared word between CSV name and page title counts as a match.
    name_match = any(map(lambda elem: elem in site_name, csv_name))
    sku_match = sku and (self.match_skus(sku, csv_sku)
                         or self.match_skus(csv_sku, sku))
    if sku_match and name_match:
        if valid_price(response.meta['price'],
                       loader.get_output_value('price')):
            loader.add_value('sku', response.meta['sku'])
            loader.add_value('identifier', response.meta['sku'].lower())
            yield loader.load_item()
    else:
        meta = response.meta
        next_result = meta['next_results']
        if next_result:
            # Try the next search result.
            next_result = next_result[0]
            meta['next_results'] = meta['next_results'][1:]
            yield Request(next_result, callback=self.parse_product,
                          meta=response.meta)
        elif meta.get('next_page'):
            yield Request(meta['next_page'], meta=response.meta)
        elif meta.get('search_urls'):
            meta = response.meta
            search_url = meta['search_urls'][0]
            meta['search_urls'] = meta['search_urls'][1:]
            yield Request(search_url % {'q': meta['sku']}, meta=meta)
class GoogleSpider(BaseSpider): name = 'ldmountaincentre-google-shopping.com' allowed_domains = ['google.com'] start_urls = ['http://www.google.com'] errors = [] F_LAST_RESULTS = 'gshopping_last_results.csv' SHOPPING_URL = 'http://www.google.co.uk/shopping?hl=en' def __init__(self, *args, **kwargs): super(GoogleSpider, self).__init__(*args, **kwargs) dispatcher.connect(self.spider_closed, signals.spider_closed) self._browsers = [] browser_profiles = ({ 'proxy': '77.75.105.70:22955', 'proxy-type': 'http', 'proxy-auth': 'pp-dampssno:poekensi' }, { 'proxy': '80.83.124.85:48008', 'proxy-type': 'http', 'proxy-auth': 'pp-nobfizze:hathapic' }, { 'proxy': '194.242.113.229:30230', 'proxy-type': 'http', 'proxy-auth': 'pp-dawnyrou:dupradin' }, { 'proxy': '118.127.29.47:10858', 'proxy-type': 'http', 'proxy-auth': 'pp-eyakarpe:rmsaingr' }) for profile in browser_profiles: if profile['proxy']: proxy = {} proxy['host'] = profile['proxy'] proxy['type'] = profile['proxy-type'] if profile['proxy-auth']: proxy['auth'] = profile['proxy-auth'] else: proxy = None browser = PhantomJS.create_browser(proxy=proxy) user_agent = browser.desired_capabilities[ u'phantomjs.page.settings.userAgent'] self._browsers.append({ 'webdriver': PhantomJS.create_browser(proxy=proxy), 'useragent': user_agent, 'proxy': profile['proxy'] }) self._today_result_ids = {} file_last_results = os.path.join(HERE, self.F_LAST_RESULTS) if os.path.exists(file_last_results): today = time.gmtime().tm_yday last_day = time.gmtime(os.path.getctime(file_last_results)).tm_yday if last_day == today: shutil.copy(file_last_results, '%s.bak' % file_last_results) with open(file_last_results) as f_today: reader = csv.DictReader(f_today) for row in reader: self._today_result_ids[row['identifier']] = row def spider_closed(self, spider): for browser in self._browsers: browser['webdriver'].quit() shutil.copy('data/%s_products.csv' % spider.crawl_id, os.path.join(HERE, self.F_LAST_RESULTS)) def parse(self, response): f = 
open(os.path.join(HERE, 'product_skus.csv')) reader = csv.DictReader(f) url = self.SHOPPING_URL # GET Google Shopping website for browser in self._browsers: self.log('\n' '>>> PROXY: %s\n' '>>> UA: %s\n' '>>> GET: %s\n' % (browser['proxy'], browser['useragent'], url)) browser['webdriver'].get(url) self.log('>>> BROWSER => OK') browsers_free = len(self._browsers) row = next(reader, None) # Search items while row is not None: # If exists today's results then it loads them if row['identifier'] in self._today_result_ids: yield self.load_item_( self._today_result_ids[row['identifier']], adurl=False) row = next(reader, None) # Next row continue if browsers_free: browsers_free -= 1 if row['sku']: search = row['sku'] self.log('>>> Search by SKU: ' + search) else: search = row['name'] self.log('>>> Search by NAME: ' + search) meta = { 'sku': row['sku'], 'price': row['price'], 'identifier': row['identifier'] } self._browsers[browsers_free]['search'] = search self._browsers[browsers_free]['meta'] = meta row = next(reader, None) # Next row if browsers_free: if row: continue else: browsers_free = len(self._browsers) for browser in self._browsers: browser['webdriver'].delete_all_cookies() time.sleep(random.choice(range(5, 25))) for browser in self._browsers: if not browser['search']: continue try: self.log( '\n' '>>> BROWSER: Clear current search and send new...\n' '>>> PROXY: %s\n' '>>> UA: %s\n' '>>> SEARCH: %s\n' % (browser['proxy'], browser['useragent'], browser['search'])) try: browser['search_input'] = browser[ 'webdriver'].find_element_by_id('gbqfq') except: browser['search_input'] = browser[ 'webdriver'].find_element_by_name('q') try: browser['search_button'] = browser[ 'webdriver'].find_element_by_id('gbqfb') except: browser['search_button'] = browser[ 'webdriver'].find_element_by_xpath( '//button[@value="Search"]') browser['search_input'].clear() browser['search_input'].send_keys(browser['search']) except Exception, e: if browser['search']: self.log('\n>>> ERROR: 
Failed to search %s\n' % browser['search']) browser['search'] = None # This should be a change in the website style, to save the screenshot and source and not continue browser['webdriver'].save_screenshot( os.path.join(HERE, 'browser_error.png')) with open(os.path.join(HERE, 'browser_error.html'), 'w') as f: f.write( browser['webdriver'].page_source.encode('utf-8')) raise e time.sleep(random.choice(range(5, 10))) for browser in self._browsers: if not browser['search']: continue try: self.log('\n' '>>> BROWSER: Click search button...\n' '>>> PROXY: %s\n' '>>> UA: %s\n' '>>> SEARCH: %s\n' % (browser['proxy'], browser['useragent'], browser['search'])) browser['search_button'].click() self.log('>>> BROWSER => OK') except Exception, e: self.log(e) if browser['search']: self.log('\n>>> ERROR: Failed to search %s\n' % browser['search']) browser['search'] = None time.sleep(random.choice(range(5, 10))) browsers_get_more = [] for i, browser in enumerate(self._browsers): if not browser['search']: continue browser['item'] = None try: products = browser['webdriver'].find_elements_by_xpath( '//div[@id="search"]//li[contains(@class, "g")]') link = None item_url = '' item_found = False for product in products: link = product.find_element_by_xpath( './/h3[contains(@class, "r")]/a') item_url = link.get_attribute('href') if 'ldmountaincentre' not in item_url: item_found = True break # First valid if not item_found: continue if not link: self.log('Not link') continue name = link.text try: price = product.find_element_by_xpath( './/div[@class="psliprice"]//b').text except: try: price = product.find_element_by_xpath( './/div[contains(@class, "psrpcont")]/span[@class="psrp"]' ).text except: try: price = product.find_element_by_xpath( './/div[@class="psliprice"]').text except Exception, e: self.errors.append( 'WARNING: No price searching %s' % browser['search']) # Go to shopping again browser['webdriver'].get(self.SHOPPING_URL) time.sleep(random.choice(range(5, 10))) raise e try: 
more_stores = re.findall( r'from \d+\+ stores', product.find_element_by_xpath( './/div[contains(@class, "psrpcont")]').text) except: try: more_stores = re.findall(r'from \d+\+ stores', product.text) except: more_stores = None item = {'name': name, 'url': item_url} if more_stores: browser['item'] = item browsers_get_more.append(i) self.log('\n' '>>> PROXY: %s\n' '>>> UA: %s\n' '>>> ITEM FOUND: %s\n' '>>> MORE STORES: %s\n' % (browser['proxy'], browser['useragent'], item['name'], item['url'])) else: item['price'] = extract_price(price) if valid_price(browser['meta']['price'], item['price']): self.log('\n' '>>> PROXY: %s\n' '>>> UA: %s\n' '>>> ITEM FOUND: %s\n' '>>> ITEM PRICE: %s\n' % (browser['proxy'], browser['useragent'], item['name'], item['price'])) yield self.load_item_(item, browser) except Exception, e: self.log('>>>> ERROR IN %s' % browser['webdriver'].current_url) self.log('>>>> %s' % e) if browser['search']: self.log('\n>>> ERROR: Failed to search %s\n' % browser['search']) browser['search'] = None
'a').get_attribute('href') except Exception, e: if browser['search']: self.log('\n>>> ERROR: Failed to search %s\n' % browser['search']) browser['search'] = None # This should be a change in the website style, to save the screenshot and source and not continue browser['webdriver'].save_screenshot( os.path.join(HERE, 'browser_error.png')) with open(os.path.join(HERE, 'browser_error.html'), 'w') as f: f.write(browser['webdriver'].page_source.encode( 'utf-8')) raise e if valid_price(browser['meta']['price'], price): item = browser['item'] item['price'] = price item['url'] = item_url yield self.load_item_(item, browser) # Set search to None for browser in self._browsers: browser['search'] = None def load_item_(self, item, browser=None, adurl=True): if browser: response = HtmlResponse(url=browser['webdriver'].current_url, body=browser['webdriver'].page_source, encoding='utf-8') else:
def parse(self, response):
    """Collect valid-priced results (listed price has 20% VAT stripped:
    ``listed / 1.2``), then request the cheapest product page for
    further parsing; otherwise fall back to a queued description request."""
    hxs = HtmlXPathSelector(response)
    products = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    # Many different price layouts exist; probed in this exact order.
    price_xpaths = (
        './/*[@class="newPrice"]//span[contains(@class,"price")]/text()',
        './/div[@class="usedNewPrice"]//span[@class="price"]/text()',
        './/ul[@class="rsltGridList"]/li[1]/a/span[@class="bld lrg red"]//text()',
        './/ul[@class="rsltGridList"]/li[1]/a/span[@class="price bld"]//text()',
        './/ul[@class="rsltL"]/li[1]/a/span[@class="bld lrg red"]//text()',
        './/ul[@class="rsltL"]/li[1]/a/span[@class="price bld"]//text()',
        './/ul[@class="rsltGridList grey"]/li[1]/a/span[@class="price bld"]//text()',
        './/ul[@class="rsltGridList grey"]/li[1]/a/span[@class="bld lrg red"]//text()',
        './/*[@class="newPrice"]//span/text()',
        './/span[@class="bld lrg red"]//text()',
    )
    search_results = []
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        name = product.select(
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/text()'
        ).extract()
        if not name:
            name = product.select(
                'h3[@class="newaps"]/a/span/text()').extract()
        loader.add_value('name', name)
        url = product.select(
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/@href'
        ).extract()
        if not url:
            url = product.select('h3[@class="newaps"]/a/@href').extract()
        loader.add_value('url', url)
        price = None
        for xpath in price_xpaths:
            price = product.select(xpath).extract()
            if price:
                break
        if price:
            # Price ranges like "£10 - £20" use the lower bound.
            raw = price[0].split('-')[0] if '-' in price[0] else price[0]
            raw = re.sub(u'[^\d\.,]', u'', raw)
            # Strip thousands separators, then remove 20% VAT.
            net = Decimal(raw.replace(',', '')) / Decimal(1.2)
            net = round(net, 2)
            loader.add_value('price', str(net))
            loader.add_value('sku', response.meta['sku'])
            loader.add_value('identifier', response.meta['sku'])
            if net and valid_price(response.meta['price'],
                                   loader.get_output_value('price')):
                search_results.append(loader)
    if search_results:
        search_results.sort(key=lambda elem: elem.get_output_value('price'))
        cheapest = search_results[0]
        remaining = search_results[1:]
        meta = response.meta
        meta['cur_prod'] = cheapest
        meta['next_prods'] = remaining
        yield Request(cheapest.get_output_value('url'),
                      callback=self.parse_product,
                      meta=meta, dont_filter=True)
    elif response.meta.get('desc_req'):
        yield response.meta.get('desc_req')