Example #1
 def test_url_query_parameter_2(self):
     """
     This problem was seen several times in the feeds. Sometimes affiliate URLs contain
     a nested, encoded affiliate URL with the direct URL as a parameter. For example:
     aff_url1 = 'http://www.tkqlhce.com/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EChildren%26%2339%3Bs+garden+furniture%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357023%2526langId%253D-1'
     the typical code to extract needed URL from it is:
     aff_url2 = url_query_parameter(aff_url1, 'url')
     after this aff_url2 is:
     'http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Children's garden furniture&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357023%26langId%3D-1'
     the direct URL extraction is:
     url = url_query_parameter(aff_url2, 'referredURL')
     but this will not work, because aff_url2 contains ' (an apostrophe,
     HTML-encoded as &#39; in the feed) and the URL extraction will fail;
     the current workaround, done in the spider, is simply to replace ' with %27
     """
     return  # FIXME: this test should pass but currently doesn't
     # correct case
     aff_url1 = "http://www.anrdoezrs.net/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EGarden+table+and+chair+sets%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357199%2526langId%253D-1"
     aff_url2 = url_query_parameter(aff_url1, 'url')
     self.assertEqual(aff_url2, "http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Garden table and chair sets&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357199%26langId%3D-1")
     prod_url = url_query_parameter(aff_url2, 'referredURL')
     self.assertEqual(prod_url, "http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay?storeId=10001&catalogId=1500001501&productId=1500357199&langId=-1")
     # weird case
     aff_url1 = "http://www.tkqlhce.com/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EChildren%26%2339%3Bs+garden+furniture%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357023%2526langId%253D-1"
     aff_url2 = url_query_parameter(aff_url1, 'url')
     self.assertEqual(aff_url2, "http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Children's garden furniture&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357023%26langId%3D-1")
     prod_url = url_query_parameter(aff_url2, 'referredURL')
     # fails, prod_url is None now
     self.assertEqual(prod_url, "http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay?storeId=10001&catalogId=1500001501&productId=1500357023&langId=-1")
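The docstring above describes a spider-side workaround rather than a fix in url_query_parameter itself. A minimal sketch of that idea, assuming the culprit really is the bare apostrophe (decoded from &#39; in the feed); the helper name extract_referred_url is hypothetical, only url_query_parameter comes from w3lib:

 from w3lib.url import url_query_parameter

 def extract_referred_url(aff_url2, default=None):
     # Re-encode the bare apostrophe as %27 so the nested query string
     # parses cleanly, then pull out the direct product URL.
     safe_url = aff_url2.replace("'", "%27")
     return url_query_parameter(safe_url, 'referredURL', default)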
Example #2
 def parse(self, response):
     if self.meta_df is None and hasattr(self, 'prev_crawl_id'):
         meta_filename = os.path.join(
             DATA_DIR, 'meta/%s_meta.json-lines' % self.prev_crawl_id)
         if os.path.exists(meta_filename):
             with open(meta_filename) as f:
                 self.meta_df = pd.DataFrame(
                     columns=['identifier', 'promo_start', 'promo_end'],
                     dtype=str)  # the pd.np alias and np.str were removed; builtin str works here
                 for i, line in enumerate(f):
                     p = json.loads(line.strip())
                     self.meta_df.loc[i] = {
                         'identifier': p['identifier'],
                         'promo_start': p['metadata'].get('promo_start'),
                         'promo_end': p['metadata'].get('promo_end')
                     }
                 self.meta_df.set_index('identifier',
                                        drop=False,
                                        inplace=True)
     elif not hasattr(self, 'prev_crawl_id'):
         self.log('prev_crawl_id attr not found')
     for url in response.xpath(
             '//*[@id="header"]/nav/div/ul/li/a/@href').extract():
         u_id = url_query_parameter(url, 'id')
         u_cat = url_query_parameter(url, 'cat')
         if u_id and u_cat:
             yield scrapy.Request(
                 'http://www.phonehouse.pt/api.php/getProducts/' + u_id +
                 '/' + u_cat + '/0',
                 callback=self.parse_products,
                 meta={
                     'u_id': u_id,
                     'u_cat': u_cat,
                     'offset': 0
                 })
Example #3
 def test_url_query_parameter(self):
     self.assertEqual(url_query_parameter("product.html?id=200&foo=bar", "id"),
                      '200')
     self.assertEqual(url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault"),
                      'mydefault')
     self.assertEqual(url_query_parameter("product.html?id=", "id"),
                      None)
     self.assertEqual(url_query_parameter("product.html?id=", "id", keep_blank_values=1),
                      '')
Example #4
    def parse_results(self, response):
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        products = hxs.select('//div[contains(@class, "item") and contains(@class, "product")]')
        cookie_jar = response.meta['cookiejar']
        current_page = url_query_parameter(response.url, 'page')
        pages_found = bool(hxs.select('//div[@class="item-count"]/strong/text()').extract())
        if current_page and pages_found:
            current_page = int(current_page)
            per_page = int(url_query_parameter(response.url, 'per_page'))
            total_items = int(hxs.select('//div[@class="item-count"]/strong/text()').extract()[-1])
            next_page = current_page + 1
            total_pages = total_items // per_page  # integer division; the remainder is handled below
            if (total_items % per_page) > 0:
                total_pages += 1
            if next_page <= total_pages:
                next_url = add_or_replace_parameter(response.url, 'page', str(next_page))
                yield Request(next_url, meta={'cookiejar': cookie_jar}, callback=self.parse_results)

        for product in products:
            product_url = product.select('.//div[@class="title"]//a/@href').extract()
            if not product_url:
                continue
            product_url = urljoin_rfc(base_url, product_url[0])
            if product_url in self.viewed_urls:
                continue

            self.viewed_urls.append(product_url)

            self._browser.get(product_url)
            response = HtmlResponse(url=self._browser.driver.current_url, body=self._browser.driver.page_source, encoding='utf-8')
            for item in self.parse_product(response):
                if item['identifier'] not in self.new_ids:
                    self.new_ids.append(item['identifier'])
                    yield item

            options = product.select('.//*[@class="color-selector-items"]/a/@href').extract()
            for option_url in options:
                option_url = urljoin_rfc(base_url, option_url)
                if option_url in self.viewed_urls:
                    continue
                self.viewed_urls.append(option_url)
                self._browser.get(option_url)
                response = HtmlResponse(url=self._browser.driver.current_url, body=self._browser.driver.page_source, encoding='utf-8')
                for item in self.parse_product(response):
                    if item['identifier'] not in self.new_ids:
                        self.new_ids.append(item['identifier'])
                        yield item

                time.sleep(5)
Example #5
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     meta = response.meta
     name = hxs.select('//*[@id="sku-title"]/h1/text()').extract()
     if not name:
         return
     name = name[0]
     image_url = hxs.select(
         '//*[@id="postcard-thumbnail"]//img[@itemprop="image"]/@src'
     ).extract()
     identifier = url_query_parameter(response.url, 'id')
     price = hxs.select(
         '//*[@id="priceblock-wrapper-wrapper"]//div[@class="item-price"]/text()'
     ).extract()[0]
     loader = ProductLoader(item=Product(), response=response)
     loader.add_value('identifier', identifier)
     loader.add_value('name', name)
     loader.add_value('category', meta['product']['category'])
     loader.add_value('brand', meta['product']['brand'])
     loader.add_value('sku', meta['product']['sku'])
     loader.add_value('url', response.url)
     loader.add_value('price', price)
     if image_url:
         loader.add_value('image_url', image_url[0])
     yield loader.load_item()
Example #6
    def parse_products_list(self, response):
        products = response.xpath('//div[contains(@class, "card--product")]')
        for product in products:
            presc = ' '.join(product.xpath('.//div[@class="links_widget"]/p/a/span/text()').extract())
            if 'I Have a Private Prescription' in presc or 'I Need a Private Prescription' in presc or 'I Have an NHS Prescription' in presc:
                continue
            loader = ProductLoader(item=Product(), selector=product)
            name = product.xpath('.//h2/a/text()').extract()[0]
            loader.add_value('name', name)
            url = product.xpath('.//h2/a/@href').extract()[0]
            loader.add_value('url', url)
            identifier = product.xpath('.//div/button/@data-product-id').extract()[0]
            loader.add_value('identifier', identifier)
            loader.add_value('sku', identifier)
            price = product.xpath('.//span[@class="special-price"]/span[@class="price"]/text()').extract()
            if not price:
                price = product.xpath('.//span[@class="regular-price"]/span[@class="price"]/text()').extract()
            price = extract_price(price[0])
            loader.add_value('price', price)
            category = response.xpath('//nav[@class="breadcrumb"]//li/span/text()').extract()
            category = category[-1] if category else ''
            loader.add_value('category', category)
            if price < 40:
                loader.add_value('shipping_cost', 3.19)
            image_url = product.xpath('.//img[contains(@id, "product-collection-image")]/@src').extract()
            image_url = response.urljoin(image_url[0]) if image_url else ''
            loader.add_value('image_url', image_url)
            yield loader.load_item()

        url_list = products.xpath('.//h2/a/@href').extract()
        if products and url_list != response.meta.get('url_list', []):
            current_page = url_query_parameter(response.url, 'p', '1')
            next_url = add_or_replace_parameter(response.url, 'infinitescroll', '1')
            next_url = add_or_replace_parameter(next_url, 'p', str(int(current_page)+1))
            yield Request(next_url, callback=self.parse_products_list, meta={'url_list': url_list})
Example #7
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        products = hxs.select('//div[@class="bloc_article_float"]')
        for product in products:
            meta = {}
            meta['category'] = product.select('table/tr/td//div[@class="marque trunc"]/@title').extract()[0]
            meta['name'] = product.select('table/tr/td//div[@class="nom trunc"]/div/span/a/text()').extract()[0].strip()
            meta['sku'] = meta['name'].split('-')[0]
            meta['brand'] = "LEGO"
            meta['price'] = product.select('table/tr/td//div[@class="prix"]/text()').extract()[0].strip().replace(',', '.')
            url = product.select('table/tr/td//div[@class="nom trunc"]/div/span/a/@href').extract()[0].strip()
            image = product.select('table/tr/td[contains(@class, "photo")]//img/@src').extract()[0].replace('MED', 'ZOO')

            l = ProductLoader(item=Product(), response=response)
            l.add_value('identifier', url_query_parameter(url, 'id_article'))
            l.add_value('name', meta['category'] + ' ' + meta['name'])
            l.add_value('category', meta['category'])
            l.add_value('brand', meta['brand'])
            l.add_value('sku', meta['sku'])
            l.add_value('url', url)
            l.add_value('price', meta['price'])
            l.add_value('image_url', image)

            yield l.load_item()

            # yield Request(url, callback=self.parse_product, meta=meta)
        next = hxs.select('//a[text()="Page suivante "]/@href').extract()
        if next:
            yield Request(next[0])
Example #8
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        data = demjson.decode(response.body)

        product = None
        for product in data['itemList']:
            product_loader = ProductLoader(item=Product(), selector=hxs)
            image_url = '//d39rqydp4iuyht.cloudfront.net/store/product/image/{}.gif'.format(product['id'])
            product_identifier = product['id']
            product_name = product['name']
            product_loader.add_value('identifier', product_identifier)
            product_loader.add_value('name', product_name)
            product_loader.add_value('image_url', image_url)
            price = product['minPrice']
            sku = ''
            for match in re.finditer(r"([\d,\.]+)", product_name):
                if len(match.group()) > len(sku):
                    sku = match.group()
            product_loader.add_value('sku', sku)
            product_loader.add_value('price', price)
            url = '/store/ck/item/' + str(product['id'])
            product_loader.add_value('url', urljoin_rfc(base_url, url))
            yield product_loader.load_item()

        if product and product['dataPosition'] < data['numItems']:
            page = int(url_query_parameter(response.url, 'page')) + 1
            url = add_or_replace_parameter(response.url, 'page', str(page))
            yield Request(url)
Example #9
    def parse_products(self, response):
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        try:
            products = hxs.select(
                '//div[@class="product_name"]/a/@href').extract()
            for product in products:
                yield Request(urljoin_rfc(get_base_url(response), product),
                              callback=self.parse_product,
                              meta=response.meta)

            if len(products) >= 500:
                index = int(url_query_parameter(response.url, 'beginIndex', 0))
                url = add_or_replace_parameter(response.url, 'beginIndex',
                                               str(index + 500))
                yield Request(url,
                              callback=self.parse_products,
                              meta=response.meta)

        except Exception:
            log.msg('PAGE ERROR >>>')
            log.msg(str(response.body))
            retry = response.meta.get('retry', 0) + 1
            if retry <= 7:
                log.msg('Retry: ' + response.url)
                time.sleep(5)
                yield Request(response.url,
                              dont_filter=True,
                              callback=self.parse_products,
                              meta={'retry': retry})
Example #10
    def load_item_(self, item, browser=None, use_adurl=True):
        if browser:
            response = HtmlResponse(url=browser['webdriver'].current_url,
                                    body=browser['webdriver'].page_source,
                                    encoding='utf-8')
        else:
            response = HtmlResponse(url='http://www.google.co.uk/shopping',
                                    body='<html></html>',
                                    encoding='utf-8')
        l = ProductLoader(item=Product(), response=response)
        l.add_value('name', self._try_encoding(item['name']))

        # Item URL
        url = self._try_encoding(item['url'])
        adurl = url_query_parameter(url, 'adurl')
        if adurl and use_adurl:
            item_url = adurl
        else:
            item_url = url

        l.add_value('url', item_url)
        l.add_value('price', item['price'])
        l.add_value('shipping_cost', item.get('shipping_cost', 0))
        l.add_value('dealer', item.get('dealer', ''))
        l.add_value(
            'identifier',
            browser['meta']['identifier'] if browser else item['identifier'])
        l.add_value('sku', browser['meta']['sku'] if browser else item['sku'])

        return l.load_item()
Example #11
    def parse(self, response):
        categories = response.xpath(
            '//div[contains(@class, "div-category")]//a/@href').extract()
        categories += response.xpath(
            '//ul[contains(@class, "category-list")]//a/@href').extract()
        for category in categories:
            yield Request(category)

        brands = response.xpath(
            '//dl[@id="narrow-by-list"]//a[contains(@href, "?manufacturer=")]/@href'
        ).extract()
        if response.meta.get('extract_brands', True):
            for brand in brands:
                manufacturer_id = url_query_parameter(brand, 'manufacturer',
                                                      None)
                if manufacturer_id:
                    manufacturer_id = manufacturer_id.split(',')[0]
                    if brand.endswith(manufacturer_id):
                        yield Request(brand, meta={'extract_brands': False})

        products = response.xpath(
            '//h3[contains(@class, "product-name")]/a/@href').extract()
        for product in products:
            yield Request(product, callback=self.parse_product)

        next = response.xpath('//a[contains(text(), "Next")]/@href').extract()
        if next:
            yield Request(next[0])
Example #12
 def parse_brand(self, response):
     brand = url_query_parameter(response.url, 'Brand', '')
     urls = response.xpath(
         '//section[@id="productList"]//a/@href').extract()
     for url in urls:
         yield Request(urljoin(get_base_url(response), url),
                       meta={'brand': brand},
                       callback=self.parse_product)
Example #13
    def parse_product(self, response):
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        price_decimal = None
        try:
            price_decimal = min(
                map(
                    lambda p: Decimal(p),
                    hxs.select(
                        '//section[@class="product-offers-group"]//tr/@data-offer-price'
                    ).extract()))
        except Exception:
            price = hxs.select(
                '//*[@itemprop="price"]/text()|//*[@itemprop="lowprice"]/text()'
            ).extract()
            price_decimal = extract_price_eu(price[0]) if price else None
        unavailable = 'Aktualnie brak ofert tego produktu. Zobacz inne produkty z kategorii' in response.body
        if (not price_decimal) and (not unavailable):
            blocked_url = url_query_parameter(response.url, 'returnUrl')
            if blocked_url:
                blocked_url = urljoin_rfc(base_url, blocked_url)
                self.log('ERROR: Blocked URL => %s' % blocked_url)
            else:
                self.log('ERROR: No product found in => %s' % response.url)
            retry_no = int(response.meta.get('retry_no', 0))
            if retry_no < 10:
                retry_no += 1
                self.log('DEBUG: Retrying page - Retry No: %s' % retry_no)
                yield Request(blocked_url or response.url,
                              meta={
                                  'category': response.meta['category'],
                                  'cookiejar': response.meta['cookiejar'],
                                  'retry_no': retry_no
                              },
                              dont_filter=True,
                              callback=self.parse_product)
            return

        if price_decimal:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
            loader.add_xpath('identifier', '//input[@name="ProductID"]/@value')
            loader.add_xpath('sku', '//input[@name="ProductID"]/@value')
            loader.add_value('url', response.url)
            loader.add_value('price', price_decimal)
            loader.add_value('category', response.meta['category'].split(','))
            image_url = hxs.select('//img[@itemprop="image"]/@src').extract()
            if image_url:
                loader.add_value(
                    'image_url',
                    urljoin_rfc(base_url, image_url[0]).split('?')[0])

            item = loader.load_item()

            if item['identifier'] not in self.new_ids:
                self.new_ids.append(item['identifier'])
                yield item
Example #14
    def parse_product(self, response):
        name = response.xpath("//h2/span[@itemprop='name']/text()").extract()
        if not name:
            name = response.xpath("//table//tr/td//h2/text()").extract()
        name = name[0]
        price = response.xpath("//span[@itemprop='price']/text()").re(r'[\d\.]+')
        if not price:
            price = response.xpath("//span[@class='pr-price']/strong/text()").re(r'[\d\.]+')
        price = price[0]
        stock = response.xpath("//*[@itemprop='availability']/@href").extract()
        if stock:
            if 'InStock' in stock[0]:
                stock = None
            else:
                stock = 0
        else:
            stock = None

        cats = response.xpath("//div[@class='grid_10']/h1/a/text()").extract()
        brand = cats[-1]
        image_url = response.xpath("//img[@alt='{}']/@src".format(name)).extract()
        m = re.search(r"details(.*)\.html", response.url)
        if m:
            identifier = m.group(1)
        else:
            entryid = url_query_parameter(response.url, 'entryid')
            priceid = url_query_parameter(response.url, 'priceid')
            if not entryid or not priceid:
                raise KeyError("entryid and priceid not found in url: {}".format(response.url))
            identifier = entryid + priceid
        sku = identifier

        loader = ProductLoaderWithNameStrip(Product(), response=response)

        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('stock', stock)
        loader.add_value('url', response.url)
        loader.add_value('brand', brand)
        loader.add_value('sku', sku)
        loader.add_value('identifier', identifier)
        loader.add_value('image_url', image_url)
        loader.add_value('category', cats)

        yield loader.load_item()
Example #15
    def load_item_(self, item, browser, use_adurl=True):
        response = HtmlResponse(url=browser['webdriver'].current_url,
                                body=browser['webdriver'].page_source,
                                encoding='utf-8')

        l = ProductLoader(item=Product(), response=response)
        l.add_value('name', self._try_encoding(item['name']))
        l.add_value('brand', self._try_encoding(item.get('brand', '')))

        # Item URL
        url = self._try_encoding(item['url'])
        adurl = url_query_parameter(url, 'adurl')
        if adurl and use_adurl:
            item_url = adurl
        else:
            item_url = url

        dest_url = url_query_parameter(item_url, 'ds_dest_url') or url_query_parameter(item_url, 'url')
        if dest_url:
            item_url = dest_url

        if ('%s/url' % self.GOOGLE_DOMAIN) in item_url:
            url_q = url_query_parameter(item_url, 'q')
            if not url_q:
                url_q = url_query_parameter(item_url, 'url')
            if url_q:
                item_url = url_q

        l.add_value('url', item_url)
        l.add_value('price', item['price'])
        l.add_value('shipping_cost', item.get('shipping_cost', 0))
        l.add_value('dealer', item.get('dealer', ''))
        l.add_value('identifier', item['identifier'])
        l.add_value('sku', item.get('sku'))
        if 'meta' in browser:
            for k, v in browser['meta'].items():
                l.add_value(k, v)

        res = l.load_item()
        if 'metadata' in item:
            res['metadata'] = item['metadata']

        return res
Example #16
    def parse(self, response):
        result = json.loads(response.body)
        page = url_query_parameter(response.url, 'p')
        hxs = HtmlXPathSelector(text=result['html'])
        product_urls = hxs.select('//li/a/@href').extract()
        self.log('{} products found'.format(len(product_urls)))
        for url in product_urls:
            yield Request(url, callback=self.parse_product)

        if result['is_there_a_next_page']:
            yield Request(self.search_url.format(int(page) + 1))
Example #17
    def parse(self, response):
        base_url = get_base_url(response)

        products = json.loads(response.body)['products']
        for product in products:
            yield Request(urljoin(base_url, product['link']), callback=self.parse_product)

        if products:
            page = int(url_query_parameter(response.url, 'p', '0'))
            page += 1
            yield Request(add_or_replace_parameter(response.url, 'p', str(page)))
Example #18
 def parse_product_list(self, response):
     i = 0
     for match in re.finditer(r'(?si)<h5>.+?href=\\"(.*?)\\"', response.body):
         i += 1
         url = match.group(1)
         yield Request(url.replace('\\', ''), callback=self.parse_product)
     if i == 100:
         page = int(url_query_parameter(response.url, 'p', '1'))
         page += 1
         url = add_or_replace_parameter(response.url, 'p', str(page))
         yield Request(url, callback=self.parse_product_list)
Example #19
    def parse_search(self, response):

        brand = url_query_parameter(response.url, 'Brand', '')
        urls = response.xpath(
            '//section[@id="productList"]//a/@href').extract()
        for url in urls:
            yield Request(urljoin(get_base_url(response), url),
                          meta={'brand': brand},
                          callback=self.parse_product)

        yield Request('http://www.ezyvision.co.nz/ajax/search',
                      callback=self.parse_ajax_search)
Example #20
    def parse_product(self, response):
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        category = hxs.select('//div[@id="crumblinks"]//a/text()').extract()
        category = category[-1] if category else ''
        image_url = hxs.select('//img[@id="product-big"]/@src').extract()
        image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''

        product_brand = ''
        brand_url = hxs.select(
            '//div[@class="description"]//img[@alt="Brand Image"]/parent::a/@href'
        ).extract()
        if brand_url:
            brand_url = urljoin_rfc(base_url, brand_url[0])
            product_brand = url_query_parameter(brand_url, 'search')

        name = hxs.select("//h1[@class='coarse']/text()")[0].extract().strip()
        options = hxs.select('//div[@class="generated"]/table/tr')[1:]
        select = hxs.select(
            '//form[@id="cart_form"]//select[@class="prodoptions"]').extract()
        if options:
            # options
            for option in options:
                name2 = option.select('./td[position()=4]/text()')
                name2 = name2[0].extract().strip() if name2 else ''
                price = option.select('.//td/text()').extract()[-2].strip()
                loader = ProductLoader(item=Product(), selector=option)
                loader.add_xpath('identifier', './td[position()=2]/text()')
                loader.add_xpath('sku', './td[position()=3]/text()')
                loader.add_value('url', response.url)
                loader.add_value(
                    'name', name + ' %s %s' %
                    (loader.get_output_value('identifier'), name2))
                loader.add_value('price', price)
                loader.add_value('category', category)
                loader.add_value('image_url', image_url)
                loader.add_value('brand', product_brand)
                yield loader.load_item()
        else:
            price = "".join(
                hxs.select(".//span[@class='bigprice']/text()").re(
                    r'([0-9\,\. ]+)')).strip()
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', response.url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_value('identifier', response.url)
            loader.add_value('image_url', image_url)
            loader.add_value('category', category)
            loader.add_xpath('sku', './td[position()=2]/text()')
            loader.add_value('brand', product_brand)
            yield loader.load_item()
Example #21
    def parse_categories(self, response):
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        product_urls = hxs.select('//p[contains(@class, "product-name")]//a/@href').extract()
        for url in product_urls:
            yield Request(urljoin_rfc(base_url, url), callback=self.parse_product)

        if product_urls:
            next_page = str(int(url_query_parameter(response.url, 'p', 0)) + 1)
            next_url = add_or_replace_parameter(response.url, 'p', next_page)
            yield Request(next_url, callback=self.parse_categories)
Example #22
    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        products = hxs.select('//div[contains(@data-plugins,"ProductGrid")]//div[contains(@class, "product")]/a/@href').extract()
        for url in products:
            yield Request(urljoin_rfc(get_base_url(response), url), callback=self.parse_product)

        if products:
            current_page = url_query_parameter(response.url, '_iv_page')
            current_page = int(current_page) if current_page else 1
            next_url = add_or_replace_parameter(response.url, '_iv_page', str(current_page + 1))
            yield Request(next_url)
Example #23
    def parse_product_list(self, response):
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        '''
        brand_cats_urls = hxs.select('//div[@class="left_menu"]/div[@class="category_container"]//a/@href').extract()
        for url in brand_cats_urls:
            yield Request(urljoin_rfc(base_url, url),
                          meta=response.meta.copy(),
                          callback=self.parse_product_list)
        '''

        if not url_query_parameter(response.url, 'f'):
            filter_brand_url = hxs.select(
                u'//p[@class="arrow_head" and span[@class="leftsubcat_categories" '
                u'and contains(text(), "Brand")]]/following-sibling::ul[contains(@class, "brand_list")]'
                u'//span[@id="refine_label" and contains(text(), "%s")]/parent::a/@href'
                % response.meta['brand']).extract()
            if filter_brand_url:
                url = filter_brand_url[0]
                yield Request(urljoin_rfc(base_url, url),
                              meta=response.meta.copy(),
                              callback=self.parse_product_list)
                return

        all_products_link = hxs.select(
            '//div[@class="left_nav brand_cat"]//a[p[@class="upto_cat"]]/@href'
        ).extract()
        if all_products_link:
            url = all_products_link[0]
            yield Request(urljoin_rfc(base_url, url),
                          meta=response.meta.copy(),
                          callback=self.parse_product_list)

        products = hxs.select(
            '//div[@id="grid-view"]/div[@class="grid_view_row"]'
            '/div[contains(@class, "products_details_container")]'
            '/div[contains(@class, "products_details")]'
            '//li[contains(@class, "description")]/a/@href').extract()
        for url in products:
            url = urljoin_rfc(base_url, url)
            yield Request(url,
                          meta=response.meta.copy(),
                          callback=self.parse_product)

        pages = hxs.select(
            '//div[@class="pagination"][1]/a[not(@class="active")]/@href'
        ).extract()
        for next_page in pages:
            url = urljoin_rfc(base_url, next_page)
            yield Request(url,
                          meta=response.meta.copy(),
                          callback=self.parse_product_list)
Example #24
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        products = hxs.select('//div[@class="list-item-info"]//h4/a/@href').extract()
        for product in products:
            yield Request(urljoin_rfc(base_url, product), callback=self.parse_product)

        next = hxs.select('//li[@class="pager-next"]/a/@href').extract()
        if next:
            current_page = int(url_query_parameter(response.url, 'cp', '0')) + 1
            next_page = add_or_replace_parameter(response.url, 'cp', str(current_page))
            yield Request(urljoin_rfc(base_url, next_page))
Example #25
 def parse_item(self, response):
     dom = url_query_parameter(response.url, "dom")
     if dom:
         hxs = HtmlXPathSelector(response)
         base = hxs.select('/html/body/table/tr[2]/td/table[3]/tr/td/table')
         l = MarnetLoader(item=MarnetItem(), selector=base)
         l.add_value('domain', dom)
         l.add_xpath('dosie', './/tr/td/div/b/i/text()', re=':(\\d+)')
         l.add_xpath('datum', './/tr[3]/td[2]/text()')
         l.add_xpath('ime', './/tr[4]/td[2]/text()')
         l.add_xpath('administrative', './/tr[11]/td[2]/text()')
         l.add_xpath('techical', './/tr[15]/td[2]/text()')
         l.add_xpath('dns', './/tr[@align="center"]/td/text()')
         return l.load_item()
Example #26
    def parse(self, response):
        base_url = get_base_url(response)

        data = json.loads(response.body)
        if data:
            products = data['Products']
            for product in products:
                yield Request(urljoin_rfc(base_url, product['ProductUrl']),
                              callback=self.parse_product)

            if products:
                page = url_query_parameter(response.url, 'page', '1')
                next_page = add_or_replace_parameter(response.url, 'page',
                                                     str(int(page) + 1))
                yield Request(next_page)
Example #27
    def parse_product_list(self, response):
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        products = hxs.select(
            '//*[@id="wrapper_page_content"]//ul[@class="product"]')
        for product in products:
            url = product.select('./li[1]/a/@href').extract()
            if url:
                url = url[0]
                discount = product.select(
                    './li[contains(@class,"product_promo")]/img/@alt').re(
                        r'(\d+)')
                if discount:
                    url = add_or_replace_parameter(url, 'qbDiscount',
                                                   discount[0])
                yield Request(urljoin_rfc(base_url, url),
                              callback=self.parse_product)
        if not response.meta.get('all_pages_done', False):
            urls = hxs.select('//div[@class="pages"]//a/@href').extract()
            if urls:
                url = "http://www.bhs.co.uk/webapp/wcs/stores/servlet/CatalogNavigationSearchResultCmd"

                catId = response.xpath(
                    '//form[@id="form_mercado_filters"]/input[@name="categoryId"]/@value'
                ).extract()[0]
                parent_categoryId = response.xpath(
                    '//form[@id="form_mercado_filters"]/input[@name="parent_categoryId"]/@value'
                ).extract()[0]
                n_field = response.xpath(
                    '//select[@id="sel_sort_field"]/option[@selected="selected"]/@value'
                ).extract()[0]
                n_field = url_query_parameter(n_field, 'N').replace(' ', '+')

                dimSelected = "?N=" + n_field + "&Ndr=100000&Nr=OR%28emailBackInStock%3AY%2CNOT%28product.inventory%3A0%29%29&siteId=%2F13077&sort_field=Relevance&No=0&Nrpp=9999&catId=" + catId + "&parent_categoryId=" + parent_categoryId

                formdata = {}
                formdata['langId'] = '-1'
                formdata['storeId'] = response.xpath(
                    '//input[@name="storeId"]/@value').extract()[0]
                formdata['isHash'] = 'false'
                formdata['dimSelected'] = dimSelected
                formdata['catalogId'] = response.xpath(
                    '//input[@name="catalogId"]/@value').extract()[0]
                yield FormRequest(url,
                                  dont_filter=True,
                                  formdata=formdata,
                                  callback=self.parse_product_list,
                                  meta={'all_pages_done': True})
Example #28
    def parse(self, response):
        formdata = {
            'currency': 'AUD',
            'delivery_destination': '13',
            'update_currency_destination': 'Update'
        }

        base_url = "http://www.uksoccershop.com"

        categories = response.xpath(
            '//li[contains(a/span/text(), "Football Shirts")]//a/@href'
        ).extract()
        categories += response.xpath(
            '//div[h4/span/a/text()="Euro 2016 National Teams"]//div[contains(@class, "newitem")]/a/@href'
        ).extract()
        for category in categories:
            yield Request(response.urljoin(category))

        products = response.xpath(
            '//div[contains(@class, "productList")]//div[@class="productListLink"]/a/@href'
        ).extract()
        for product in products:
            yield FormRequest(urljoin_rfc(base_url, product),
                              formdata=formdata,  # assumption: the local formdata dict built above was intended here
                              method='POST',
                              callback=self.parse_product)

        if products:
            next_url = "http://www.uksoccershop.com/index.html?cPath=%s&page=%s&ppp=48"
            cat_id = re.findall(r'current_category_id = (\d+)', response.body)
            if not cat_id:
                cat_id = response.xpath(
                    '//input[@name="cPath"]/@value').extract()
                cat_id = cat_id[0].split('_') if cat_id else None
            if cat_id:
                cat_id = cat_id[-1]
                current_page = url_query_parameter(response.url, 'page', '1')
                next_page = int(current_page) + 1
                yield Request(next_url % (cat_id, next_page))
            else:
                request_to_urls = re.findall(r"var request_to = '(.*)'\+ 48",
                                             response.body)
                all_products = [x for x in request_to_urls if 'main_page' in x]
                if all_products:
                    yield Request(response.urljoin(all_products[0]) + '9999')
Example #29
 def parse_search(self, response):
     json_data = re.search(r"ispSearchResult\((.*)\);", response.body)
     brand = response.meta.get('brand', '')
     if json_data:
         items = json.loads(json_data.group(1))['items']
         for item in items:
             if brand.upper() in item.get('l', '').upper().strip():
                 yield Request(item['u'],
                               callback=self.parse_product,
                               meta=response.meta)
         if items:
             current_page = int(url_query_parameter(response.url, 'p', 0))
             next_url = add_or_replace_parameter(response.url, 'p',
                                                 str(current_page + 1))
             yield Request(next_url,
                           callback=self.parse_search,
                           meta=response.meta)
Example #30
    def parse_products(self, response):
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        last_page = int(
            hxs.select('//div[@class="lastpagenumber"]/@id').extract()[0][1:])
        cur_page = int(url_query_parameter(response.url, 'p'))
        if cur_page <= last_page:
            cur_page += 1
            next_page_url = add_or_replace_parameter(response.url, 'p',
                                                     str(cur_page))
            yield Request(next_page_url, callback=self.parse_products)

        for url in hxs.select(
                '//div[@class="products-set"]/ul/li/h4/a/@href').extract():
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_product)
Example #31
	def parse(self, response):
		lx = lxml.html.fromstring(response.body_as_unicode())
		episodes = episode_sel(lx)
		for episode in episodes:
			url = link_sel(episode)[0]
			url = urljoin_rfc(self.start_urls[0], url.attrib['href'])
			yield Request(url=url, callback=self.parse_video_page)

		# Simulate pagination
		if episodes:
			current = url_query_parameter(response.url, 'page')
			if not current:
				current = '2' # XHR request starts at page 2
			url = "http://blip.tv/pr/show_get_full_episode_list?"
			url += "users_id=348873&lite=1&esi=1&page=%s"
			url = url % str(int(current) + 1)
			
			yield Request(url=url, callback=self.parse)
Example #32
 def parse_minutes(self, response):
     filename = url_query_parameter(response.url, "hfile")
     daesu = url_query_parameter(response.url, "daesu")
     date = self.parse_date(response)
     save_file(minute_filepath_fmt.format(DATA_DIR=DATA_DIR, **locals()), response.body)