Ejemplo n.º 1
0
 def yield_product(self, product):
     """Attach Sigma Sport metadata and, for cheap products, a shipping cost.

     A fresh ``SigmaSportMeta`` holding the value returned by
     ``extract_exc_vat_price(product)`` is stored under
     ``product['metadata']``.  When ``product['price']`` is below 15 the
     shipping cost is set to 1.99.

     Returns the same ``product`` object, mutated in place.
     """
     sigma_meta = SigmaSportMeta()
     sigma_meta['price_exc_vat'] = extract_exc_vat_price(product)
     product['metadata'] = sigma_meta

     below_threshold = product['price'] < 15
     if below_threshold:
         product['shipping_cost'] = 1.99

     return product
 def preprocess_product(self, item):
     """Attach Sigma Sport metadata and, for cheap items, a shipping cost.

     The ex-VAT price returned by ``extract_exc_vat_price(item)`` is stored
     in a new ``SigmaSportMeta`` under ``item['metadata']``.  The item's
     price is converted to ``Decimal`` for the comparison; when it is below
     9 the shipping cost is set to 2.

     Returns the same ``item`` object, mutated in place.
     """
     sigma_meta = SigmaSportMeta()
     sigma_meta['price_exc_vat'] = extract_exc_vat_price(item)
     item['metadata'] = sigma_meta

     price = Decimal(item['price'])
     if price < 9:
         item['shipping_cost'] = 2

     return item
Ejemplo n.º 3
0
 def parse_secondary(self, response):
     """Re-yield the parent spider's results, adding metadata to products.

     Every object produced by the parent class's ``parse_secondary`` is
     passed through.  Objects that are ``Product`` instances first receive
     a ``SigmaSportMeta`` carrying their ex-VAT price; anything else is
     yielded untouched.
     """
     parent_results = super(EvansCyclesComSpider, self).parse_secondary(response)
     for result in parent_results:
         if not isinstance(result, Product):
             yield result
             continue

         sigma_meta = SigmaSportMeta()
         sigma_meta['price_exc_vat'] = extract_exc_vat_price(result)
         result['metadata'] = sigma_meta
         yield result
Ejemplo n.º 4
0
 def preprocess_product(self, item):
     """Normalise the price, set shipping and attach Sigma Sport metadata.

     A falsy price is replaced by the string ``'0.00'``.  Otherwise, when
     ``extract_price`` of the price is below 9, the shipping cost is set to
     1.99.  The ex-VAT price is computed only after that normalisation and
     stored in a new ``SigmaSportMeta`` under ``item['metadata']``.

     Returns the same ``item`` object, mutated in place.
     """
     price = item['price']
     if not price:
         item['price'] = '0.00'
     elif extract_price(price) < 9:
         item['shipping_cost'] = 1.99

     # Built after the price fix-up so the helper sees the final price.
     sigma_meta = SigmaSportMeta()
     sigma_meta['price_exc_vat'] = extract_exc_vat_price(item)
     item['metadata'] = sigma_meta
     return item
Ejemplo n.º 5
0
    def parse_item(self, response):
        '''
        Yield one product per SKU declared in the page's inline JavaScript.

        The page embeds every variant as a call of this form:

                skuArray.push({
                    productexternalid: 72833,
                    colour: 'Light Grey/Grey',
                    size: '49',
                    skuNopId: 91684,
                    skuId: 227272,
                    price: '£90.00',
                    priceAsDecimal: 90.0000,
                    stockquantity: 0,
                    preorder: true,
                    outofstock: true,
                    issubscribed: false,
                    availableDate: 'Due in 02/07/2015'
                    });

        Each ``key: value`` line between ``skuArray.push({`` and ``});`` is
        parsed into a dict.  The dict supplies identifier, sku, price, stock
        and the colour/size part of the name; image, brand, category and the
        base name come from the HTML.
        '''
        # Local import: only this callback needs the literal parser.
        import ast

        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        products_data = []
        current_product = None  # dict being filled while inside a push({...})
        for line in response.body.split('\n'):
            if 'skuArray.push({' in line:
                current_product = {}
                continue
            if current_product is None:
                continue
            if '});' in line:
                products_data.append(current_product)
                current_product = None
                continue

            # Split on the first colon only, so values that themselves
            # contain ':' are kept whole.
            key, separator, raw_value = line.partition(':')
            if not separator:
                # Blank or otherwise colon-less line inside the block.
                continue
            literal = raw_value.strip().replace('false', 'False').replace('true', 'True')
            # SECURITY: this text comes from a remote page.  It used to go
            # through eval(); literal_eval only accepts Python literals and
            # cannot execute code.
            value = ast.literal_eval(literal)
            if isinstance(value, tuple):
                # A trailing comma ("72833,") parses as a one-element tuple.
                value = value[0]
            current_product[key.strip()] = value

        main_name = hxs.select('//h1[@itemprop="name"]/text()').extract()[0].strip()
        categories = hxs.select('//div[@id="breadcrumb"]//span[@itemprop="title"]/text()').extract()[1:]

        for p in products_data:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_xpath('image_url', '//img[@itemprop="image"]/@src', lambda a: urljoin_rfc(base_url, a[0]) if a else '')
            loader.add_value('identifier', p['skuId'])
            loader.add_value('sku', p['productexternalid'])
            loader.add_value('price', p['priceAsDecimal'])
            loader.add_value('stock', p['stockquantity'])
            loader.add_value('category', categories)
            loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
            loader.add_value('url', response.url)
            loader.add_value('name', main_name + ' - ' + p['colour'] + ' - ' + p['size'])

            product = loader.load_item()
            metadata = SigmaSportMeta()
            metadata['price_exc_vat'] = extract_exc_vat_price(product)
            product['metadata'] = metadata
            yield product
Ejemplo n.º 6
0
 def parse_product_data(self, response):
     """Turn a JSON product-details response into at most one product.

     The body is decoded as UTF-8 (falling back to the default codec) with
     undecodable bytes replaced, then parsed as JSON.  If parsing fails the
     same URL is requested again, with the attempt counter kept in
     ``response.meta['retry']``; once the counter reaches 10 an error is
     logged and nothing is yielded.

     On success a product is built from ``productItemDetails`` plus the
     category, url and brand carried in ``response.meta``.  It is yielded
     only if it has a truthy price, its identifier has not been seen before
     and, when ``self.simple_run`` is set, the identifier is listed in
     ``self.matched_identifiers``.
     """
     raw_body = response.body
     try:
         content = unicode(raw_body, 'utf-8', errors='replace')
     except (LookupError, TypeError):
         content = unicode(raw_body, errors='replace')

     try:
         data = json.loads(content)
     except ValueError:
         meta = response.meta
         retry = meta.get('retry', 1) + 1
         if retry < 10:
             meta['retry'] = retry
             self.log('WARNING - Retry #{} {}'.format(retry, response.meta.get('url')))
             yield Request(response.url,
                           meta=meta,
                           callback=self.parse_product_data,
                           dont_filter=True)
         else:
             # Give up.  Nothing is yielded here: the previous ``yield []``
             # handed Scrapy a list, which is neither a Request nor an item.
             self.log('ERROR - Maximum retry count reached! {} {}'.format(response.meta.get('url'), response.body))
         return

     item = data.get('productItemDetails')
     if not item:
         return

     product_loader = ProductLoader(item=Product(), response=response)
     product_loader.add_value('name', item.get('name'))
     price = extract_price(item.get('nowPriceRaw'))
     if not price:
         return

     # Shipping is 2.99 below a price of 30, otherwise left blank.
     shipping = 2.99 if price < 30 else ''
     product_loader.add_value('price', price)
     product_loader.add_value('shipping_cost', shipping)
     product_loader.add_value('category', response.meta.get('category'))
     product_loader.add_value('url', response.meta.get('url'))
     # product_loader.add_value('image_url', image_url)
     product_loader.add_value('brand', response.meta.get('brand'))
     product_loader.add_value('sku', item.get('itemCode'))
     product_loader.add_value('identifier', item.get('itemCode'))
     product = product_loader.load_item()

     metadata = SigmaSportMeta()
     metadata['price_exc_vat'] = extract_exc_vat_price(product)
     product['metadata'] = metadata

     if product['identifier'] in self._identifiers_viewed:
         return
     if self.simple_run and (product['identifier'] not in self.matched_identifiers):
         return
     self._identifiers_viewed.add(product['identifier'])
     yield product
Ejemplo n.º 7
0
 def parse_options(self, response):
     """Complete a partially filled product from a variant JSON response.

     The product passed in ``response.meta['item']`` is extended with the
     name, item code (used as both identifier and sku) and price found in
     the response's ``productItemDetails``.  Stock is set to 0 when the
     variant is not in stock, and shipping is 2.99 for prices below 30.

     The product is yielded once per identifier; when ``self.simple_run``
     is set, identifiers missing from ``self.matched_identifiers`` are
     skipped.
     """
     details = json.loads(response.body_as_unicode())['productItemDetails']

     loader = ProductLoader(item=Product(response.meta['item']), response=response)
     loader.add_value('name', details['name'])
     for field in ('identifier', 'sku'):
         loader.add_value(field, details['itemCode'])
     loader.add_value('price', details['nowPriceRaw'])
     if not details['inStock']:
         loader.add_value('stock', 0)
     product = loader.load_item()

     if product['price'] < 30:
         product['shipping_cost'] = 2.99

     sigma_meta = SigmaSportMeta()
     sigma_meta['price_exc_vat'] = extract_exc_vat_price(product)
     product['metadata'] = sigma_meta

     identifier = product['identifier']
     if identifier in self._identifiers_viewed:
         return
     if self.simple_run and (identifier not in self.matched_identifiers):
         return
     self._identifiers_viewed.add(identifier)
     yield product
Ejemplo n.º 8
0
    def preprocess_product(self, item):
        """Attach Sigma Sport metadata to ``item`` and return it.

        The ex-VAT price returned by ``extract_exc_vat_price(item)`` is
        stored in a new ``SigmaSportMeta`` under ``item['metadata']``.
        """
        # Indentation uses spaces only; the earlier tab/space mix is a
        # TabError under Python 3.
        metadata = SigmaSportMeta()
        metadata['price_exc_vat'] = extract_exc_vat_price(item)
        item['metadata'] = metadata
        return item
Ejemplo n.º 9
0
    def parse_product(self, response):
        """Parse a product page, fanning out to one request per variant.

        When the page embeds a parsable ``multiVariantArray`` and shows a
        variant selection widget, one request per variant is issued to the
        GetProductItemDetails endpoint, carrying a partially filled product
        (url, category, brand, image) in ``meta['item']`` for
        ``parse_options`` to complete.

        Otherwise a single product is built from the page itself.  It is
        skipped when no identifier or price can be found, and yielded only
        once per identifier.
        """
        try:
            category = response.xpath('//nav[@id="breadcrumb"]//ul/li[@class="penultimateStep"]/a/text()').extract()[0].strip()
        except IndexError:
            category = ''
        image_url = response.xpath('//meta[@property="og:image"]/@content').extract()
        if image_url:
            image_url = image_url[0].replace('merchzone', 'main')
        brand = response.xpath('//div[@class="hproduct"]/span[@class="brand"]/text()').extract()

        options = re.findall('multiVariantArray:(.*),', response.body)
        try:
            variants = json.loads(options[0].strip())
        except (IndexError, ValueError):
            # No variant array on the page, or it is not valid JSON:
            # treat the page as a single product.
            options = ''
        if options and response.xpath('//div[@class="productOptions"]//div[contains(@id, "itemVariantSelectionWidget")]'):
            parameters = {
                'action': 'getProductItemDetails',
                'langId': '-1',
                'storeId': '10001'
                }
            msg = {
                'productId': response.xpath('//input[@name="productId"]/@value').extract()[0].encode(),
                'catalogId': response.xpath('//input[@name="catalogId"]/@value').extract()[0].encode(),
                'categoryId': response.xpath('//input[@name="categoryId"]/@value').extract()[0].encode()
            }
            option_url = 'http://www.halfords.com/webapp/wcs/stores/servlet/GetProductItemDetails'
            for variant in variants:
                msg['catEntryId'] = variant['itemId']
                # NOTE(review): ``msg`` is a dict used as a query parameter
                # value; presumably add_or_replace_parameter stringifies it
                # - confirm the endpoint expects that format.
                parameters['msg'] = msg
                url = option_url
                for par in parameters:
                    url = add_or_replace_parameter(url, par, parameters[par])
                product_loader = ProductLoader(item=Product(), response=response)
                product_loader.add_value('url', response.url)
                product_loader.add_value('category', category)
                product_loader.add_value('brand', brand)
                product_loader.add_value('image_url', image_url)
                product = product_loader.load_item()
                yield Request(url, meta={'item':Product(product)}, callback=self.parse_options)
            return

        identifier = response.xpath('//input[@name="productId"]/@value').extract()
        if not identifier:
            self.log('No identifier found for %s' %response.url)
            return
        identifier = identifier.pop()
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('sku', identifier)
        product_loader.add_xpath('name', '//h1[@class="productDisplayTitle"]/text()')
        price = response.xpath('//div[@class="productDisplayPricing"]'
                           '//div[@class="pricewrapper"]/div[@class="total"]'
                           '/span[@class="totalPrice"]/text()').extract()

        if not price:
            # Fall back to the price embedded in the page's script.
            price = re.findall(r"price:\s?'&pound;(.+?)'", response.body)
            if not price:
                self.log('WARNING: No price can be found, ignoring product %s' %response.url)
                return
        price = extract_price(price[0])
        if price:
            # Shipping is 2.99 below a price of 30, otherwise left blank.
            shipping = 2.99 if price < 30 else ''
            product_loader.add_value('price', price)
            product_loader.add_value('shipping_cost', shipping)
            product_loader.add_value('url', response.url)
            product_loader.add_value('category', category)
            product_loader.add_value('image_url', image_url)
            product_loader.add_value('brand', brand)
            if response.xpath('//div[@id="productBuyable"][@class="hidden"]'):
                product_loader.add_value('stock', 0)
            product = product_loader.load_item()
            metadata = SigmaSportMeta()
            metadata['price_exc_vat'] = extract_exc_vat_price(product)
            product['metadata'] = metadata
            if product['identifier'] not in self._identifiers_viewed:
                #if self.simple_run and (product['identifier'] not in self.matched_identifiers):
                    #return
                self._identifiers_viewed.add(product['identifier'])
                yield product
Ejemplo n.º 10
0
    def parse(self, response):
        """Build products from the client's SFTP feed files.

        Two files are downloaded over SFTP: the latest
        ``SigmaFirst1000Products`` spreadsheet, whose third column lists the
        product ids to keep, and the latest ``feedspark`` TSV feed.  Feed
        rows whose id (with ``-GB`` removed, upper-cased and stripped)
        appears in the spreadsheet become products.  For each id, a product
        is kept only if its price differs from those already collected for
        that id.  All kept products are yielded at the end.
        """
        transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
        password = "******"
        username = "******"
        try:
            transport.connect(username = username, password = password)
            sftp = paramiko.SFTPClient.from_transport(transport)
            files = sftp.listdir_attr()

            last = get_last_file("SigmaFirst1000Products", "xlsx", files)
            xlsx_path = HERE+'/SigmaFirst1000Products.xlsx'
            sftp.get(last.filename, xlsx_path)

            last = get_last_file("feedspark", "tsv", files)
            feed_path = HERE+'/feedspark.tsv'
            sftp.get(last.filename, feed_path)
        finally:
            # Both files are local from here on; release the connection
            # (closing the transport also closes the SFTP channel).
            transport.close()

        wb = xlrd.open_workbook(xlsx_path)
        sh = wb.sheet_by_name('Sheet1')

        # Maps each wanted product id to the products collected for it.
        product_ids = {}
        for rownum in xrange(1, sh.nrows):  # the first row is skipped
            row = sh.row_slice(rownum)
            product_id = row[2].value
            if row[2].ctype == 2:
                # Cell type 2: the value is converted through int() so the
                # id has no fractional part.
                product_id = str(int(row[2].value))
            product_ids[product_id.replace('-GB', '')] = []

        with open(feed_path) as f:
            reader = csv.DictReader(f, delimiter='\t')
            for row in reader:
                product_id = row['id'].replace('-GB', '').upper().strip()
                if product_id not in product_ids:
                    continue

                loader = ProductLoader(response=response, item=Product())
                loader.add_value('sku', row['code'].replace('-gb', '').replace('-GB', ''))
                categories = row['mapped_category'].split('>')
                for category in categories:
                    loader.add_value('category', category.strip().encode('utf-8'))
                loader.add_value('brand', row['brand'].encode('utf-8'))
                name = [row['title']]
                if row['colour']:
                    name.append(row['colour'])
                if row['size']:
                    name.append(row['size'])
                try:
                    loader.add_value('name', " ".join(name).encode('utf-8'))
                except UnicodeError:
                    # Encoding failed; fall back to decoding the joined
                    # name as UTF-8 instead.
                    loader.add_value('name', " ".join(name).decode('utf-8'))
                loader.add_value('price', row['price'])
                loader.add_value('image_url', row['image_link'])
                loader.add_value('url', row['link'])
                loader.add_value('identifier', row['id'])
                if row['availability'].lower() == 'out of stock':
                    loader.add_value('stock', 0)

                if loader.get_output_value('price')<10:
                    loader.add_value('shipping_cost', 1.99)

                product = loader.load_item()
                metadata = SigmaSportMeta()
                metadata['mpn'] = row['mpn']
                metadata['item_group_number'] = row['item_group_id']
                metadata['cost_price'] = row.get('cost_price', '0.00').replace(' GBP', '')
                metadata['price_exc_vat'] = extract_exc_vat_price(product)
                metadata['sku_gb'] = str(product['sku']) + '-GB' if product.get('sku', None) else ''
                product['metadata'] = metadata

                # Keep the product only if its price differs from every
                # product already collected for this id.  (The earlier loop
                # compared the new product's price with itself, so any
                # second variant was always discarded.)
                collected_products = product_ids[product_id]
                prices = [collected['price'] for collected in collected_products]
                if product['price'] not in prices:
                    collected_products.append(product)

        # Emit everything collected, grouped by product id.
        for products in product_ids.values():
            for product in products:
                yield product
Ejemplo n.º 11
0
    def parse_product(self, response):
        """Yield one product per option on the page, or one for the page.

        Nothing is yielded when the product container has no
        ``data-product-id``.  When the page lists options, each option
        becomes a product identified as ``<product id>-<option id>``, with
        its own name, price and stock flag.  Without options a single
        product identified as ``<product id>-0`` is built from the page.
        """
        hxs = HtmlXPathSelector(response)

        product_id = response.xpath(
            '//div[contains(@class, "productContainer")]/@data-product-id'
        ).extract_first()
        if not product_id:
            return

        image_url = hxs.select(
            '//div[@id="thumbnails"]/div/a/img/@src').extract()
        # The category is the second-to-last breadcrumb text; fall back to
        # an empty category instead of failing on a short breadcrumb.
        crumbs = response.css('.breadcrumb span::text').extract()
        category = crumbs[-2] if len(crumbs) >= 2 else ''
        main_name = ''.join(
            hxs.select('//h1[@class="product-title"]/text()').extract())
        brand = hxs.select('//li[@class="brand"]/a/span/text()').re(
            r'About (\w+)')
        sku = ''.join(
            hxs.select('//div[@class="stockCode"]/text()').extract()).strip()

        options = response.xpath(
            '//div[contains(@id, "productOption")]/ul[@role="menu"]/li')
        for option in options:
            product_loader = ProductLoader(item=Product(), selector=option)
            option_id = option.select('@data-id').extract()[0]
            product_loader.add_value('brand', brand)
            product_loader.add_value('category', category)

            name = ''.join(
                option.select('a/span[@class="title"]/text()').extract())
            if name != main_name:
                # Prefix the option title with its group heading (if any)
                # and the main product name.
                group_name = option.xpath(
                    'preceding-sibling::div[1]/strong/text()').extract_first()
                if group_name:
                    name = group_name + ' ' + name
                name = ' '.join((main_name, name))

            product_loader.add_value('name', name)
            product_loader.add_value('url', response.url)
            product_loader.add_value('identifier', product_id + '-' + option_id)
            product_loader.add_value('sku', sku)

            price = option.select('./@data-merlin-price').extract()
            if price:
                product_loader.add_value(
                    'price', '{0:.2f}'.format(float(price[0])))

            # Stock is set to 0 unless the option is flagged "inStock".
            stock = option.select('./@data-stock').extract()
            if not stock or stock[0] != 'inStock':
                product_loader.add_value('stock', 0)

            product_loader.add_value('image_url', image_url)
            product = product_loader.load_item()
            metadata = SigmaSportMeta()
            metadata['price_exc_vat'] = extract_exc_vat_price(product)
            product['metadata'] = metadata
            yield product

        if not options:
            stock = response.css('.productContainer .inStock')
            price = hxs.select('//meta[@itemprop="price"]/@content').extract()

            product_loader = ProductLoader(item=Product(), selector=hxs)
            product_loader.add_value('identifier', product_id + '-0')
            product_loader.add_value('category', category)
            product_loader.add_value('name', main_name)
            product_loader.add_value('url', response.url)
            product_loader.add_value('sku', sku)
            product_loader.add_value('price', price)
            product_loader.add_value('brand', brand)
            product_loader.add_value('image_url', image_url)
            if not stock:
                product_loader.add_value('stock', 0)

            product = product_loader.load_item()
            metadata = SigmaSportMeta()
            metadata['price_exc_vat'] = extract_exc_vat_price(product)
            product['metadata'] = metadata
            yield product
Ejemplo n.º 12
0
    def parse_product(self, response):
        """Yield one product per combination of the page's option groups.

        Option groups are read from radio-button containers and from
        ``<select>`` containers (blank select values are ignored).  Every
        combination of one choice per group becomes a product whose name
        and identifier are the base name/identifier extended with each
        choice.  A page without option groups yields exactly one product
        with the base name and identifier.

        Nothing is yielded when the add-to-cart form has no product id.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        image_url = hxs.select('//a[@itemprop="image"]/@href').extract()
        try:
            product_identifier = hxs.select(
                '//*[@id="productDetailsAddToCartForm"]/input[@name="product_id"]/@value'
            ).extract()[0].strip()
        except IndexError:
            # No product id on the page: nothing to scrape.
            return
        product_name = hxs.select(
            '//*[@id="ProductDetails"]/h1/text()').extract()[0].strip()
        category = hxs.select(
            '//ul[@class="breadcrumbs"]//a/text()').extract()[1:]
        brand = hxs.select(
            '//*[@id="ProductDetails"]/div[@itemprop="brand"]//span/text()'
        ).extract()
        brand = brand[0].strip() if brand else ''
        product_price = hxs.select(
            '//span[@class="ProductPrice VariationProductPrice"]/text()'
        ).extract()[0]
        product_price = extract_price(product_price)

        def collect_groups(container_xpath, value_xpath, title_xpath,
                           skip_blank):
            """Return one list of {'identifier', 'name'} dicts per container."""
            groups = []
            for container in hxs.select(container_xpath):
                values = container.select(value_xpath).extract()
                titles = container.select(title_xpath).extract()
                choices = [{'identifier': value, 'name': title}
                           for value, title in zip(values, titles)
                           if value or not skip_blank]
                if choices:
                    groups.append(choices)
            return groups

        options = collect_groups('//div[@class="productOptionViewRadio"]',
                                 './/li/label/input/@value',
                                 './/li/label/span/text()',
                                 skip_blank=False)
        options += collect_groups('//div[@class="productOptionViewSelect"]',
                                  './select/option/@value',
                                  './select/option/text()',
                                  skip_blank=True)

        # With no option groups, itertools.product() yields a single empty
        # combination, which produces the plain base product.
        for combination in itertools.product(*options):
            name = ' '.join(
                [product_name] + [choice['name'] for choice in combination])
            identifier = '_'.join(
                [product_identifier] +
                [choice['identifier'] for choice in combination])

            product_loader = ProductLoader(item=Product(), selector=hxs)
            product_loader.add_value('identifier', identifier)
            product_loader.add_value('name', name)
            if image_url:
                product_loader.add_value(
                    'image_url', urljoin_rfc(base_url, image_url[0]))
            product_loader.add_value('price', product_price)
            product_loader.add_value('url', response.url)
            product_loader.add_value('brand', brand)
            product_loader.add_value('category', category)
            product = product_loader.load_item()
            metadata = SigmaSportMeta()
            metadata['price_exc_vat'] = extract_exc_vat_price(product)
            product['metadata'] = metadata
            yield product
Ejemplo n.º 13
0
    def parse_product(self, response):
        """Parse a product page, requesting variation data when present.

        If the page carries no product id, the same URL is requested again
        (up to 10 retries, with ``dont_merge_cookies`` set) and nothing else
        is produced.  Otherwise a base product is built from the page.  When
        the first variation form offers options, one ``variations.json``
        request per non-empty option is issued with the base product in
        ``meta`` for ``parse_product_option``; without options the base
        product is yielded directly with its metadata.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        image_url = hxs.select(
            '//div[@class="product-image main-product-image"]//img[@class="product-img"]/@src'
        ).extract()
        try:
            product_identifier = hxs.select(
                '//*[@id="productId"]/@value').extract()[0].strip()
        except IndexError:
            self.log('Error! No product ID on the page! {}'.format(
                response.url))
            retry = response.meta.get('retry', 0)
            if retry < 10:
                meta = response.meta.copy()
                meta['retry'] = retry + 1
                meta['dont_merge_cookies'] = True
                yield Request(response.url,
                              meta=meta,
                              callback=self.parse_product,
                              dont_filter=True)
            return

        product_name = hxs.select(
            '//div[@class="product-title-wrap"]/h1/text()').extract()[0].strip(
            )
        # The category is derived from the first path segment of the URL.
        category = response.url.split('/')[3].replace('-', ' ').title()
        brand = response.xpath(
            '//th[contains(text(),"Brand:")]/../td//text()[normalize-space(.)!=""]'
        ).extract()
        brand = brand[0].strip() if brand else ''
        product_price = response.css('span.price::text').extract_first()
        product_price = extract_price(product_price)

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('identifier', product_identifier)
        product_loader.add_value('sku', product_identifier)
        product_loader.add_value('name', product_name)
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
        product_loader.add_value('price', product_price)
        product_loader.add_value('url', response.url)
        product_loader.add_value('brand', brand)
        product_loader.add_value('category', category)
        # Shipping is 1.99 below a price of 10, otherwise 0.
        if product_price < 10:
            product_loader.add_value('shipping_cost', 1.99)
        else:
            product_loader.add_value('shipping_cost', 0)
        product = product_loader.load_item()

        variations = hxs.select(
            '//div[@class="variation-dropdowns fl"]/form//input[@name="variation"]/@value'
        ).extract()

        product_options = hxs.select(
            '//div[@class="variation-dropdowns fl"]/form[1]//select/option/@value'
        ).extract()
        if not product_options:
            metadata = SigmaSportMeta()
            metadata['price_exc_vat'] = extract_exc_vat_price(product)
            product['metadata'] = metadata
            yield product
            return

        # Everything before the option id is the same for every request.
        url_prefix = (
            'http://www.probikekit.co.uk/variations.json?productId='
            + product_identifier + '&selected=1&variation1=' +
            variations[0] + '&option1=')
        for option_id in product_options:
            if not option_id:
                continue
            yield Request(
                url_prefix + option_id + '&switchcurrency=GBP',
                meta={
                    'product': product,
                    'cur_variation': 1
                },
                callback=self.parse_product_option)
Ejemplo n.º 14
0
    def parse_product_option(self, response):
        """Walk the variation levels of a product, yielding final variants.

        ``response.meta`` carries the base ``product`` and ``cur_variation``,
        the number of variation levels already fixed by the request URL.

        * If the JSON has no variations, the request is retried (up to 10
          times) and processing stops.
        * If every level is fixed, a copy of the base product is yielded
          with the variant's name suffix, identifier, price, shipping cost,
          image and metadata.
        * Otherwise one request per option of the next level is issued,
          keeping the already selected options in the URL.
        """
        product_data = json.loads(response.body)

        if 'variations' not in product_data or not product_data['variations']:
            self.log('Error! No options on the page! {}'.format(response.url))
            retry = response.meta.get('retry', 0)
            if retry < 10:
                meta = response.meta.copy()
                meta['retry'] = retry + 1
                yield Request(response.url,
                              meta=meta,
                              callback=self.parse_product_option,
                              dont_filter=True)
            # Without variations there is nothing to process; falling
            # through would fail on the missing or empty list below.
            return

        product = response.meta['product']
        cur_variation = response.meta['cur_variation']
        variations = product_data['variations']

        if cur_variation == len(variations):
            # The name suffix joins the first option name of every level,
            # minus the placeholder labels.
            name = ''
            for variation in variations:
                name += ' ' + variation['options'][0]['name']
            name = name.replace('One Colour', '').replace('One Option', '')
            name = ' '.join(name.split())

            new_item = copy.deepcopy(product)
            new_item['name'] += ' ' + name
            new_item['identifier'] = str(product_data['selected-product-id'])
            new_item['price'] = extract_price(
                product_data['price'].split(';')[1])
            if new_item['price'] < 10:
                new_item['shipping_cost'] = 1.99
            else:
                new_item['shipping_cost'] = 0
            if product_data['images']:
                # NOTE(review): always takes the third image and will raise
                # IndexError with fewer than three - confirm the feed
                # guarantees that many.
                new_item['image_url'] = urljoin_rfc(
                    'http://s1.thcdn.com/', product_data['images'][2]['name'])

            metadata = SigmaSportMeta()
            metadata['price_exc_vat'] = extract_exc_vat_price(new_item)
            new_item['metadata'] = metadata
            yield new_item
            return

        next_url = 'http://www.probikekit.co.uk/variations.json?productId='
        next_url += str(product['identifier']) + '&selected=' + str(
            cur_variation + 1) + '&switchcurrency=GBP'
        # Re-send the levels that are already fixed.
        for level, variation in enumerate(variations[0:cur_variation], 1):
            next_url += '&variation' + str(level) + '=' + str(variation['id'])
            next_url += '&option' + str(level) + '=' + str(
                variation['options'][0]['id'])

        # Fan out over every option of the next level.
        next_level = cur_variation + 1
        next_variation = variations[cur_variation]
        for option in next_variation['options']:
            url = next_url + '&variation' + str(next_level) + '=' + str(
                next_variation['id'])
            url += '&option' + str(next_level) + '=' + str(option['id'])
            yield Request(url,
                          meta={
                              'product': product,
                              'cur_variation': next_level
                          },
                          callback=self.parse_product_option)