Example #1
0
 def match(self, meta, search_item, found_item):
     """Decide whether ``found_item`` is an acceptable match for ``search_item``.

     Candidates whose brand contains "rachael ray" or whose name contains
     "e-cloth" are rejected.  A missing brand, or a brand that is not in
     ``self.brands``, is only logged and does not reject the candidate.
     When the candidate has a ``model`` key, the model and the searched SKU
     are normalised with ``remove_punctuation_and_spaces`` and lowercased,
     and must reach a ``fuzzy_match_ratio`` of at least 90.  Candidates
     without a ``model`` key are accepted.

     :param meta: not used by this method.
     :param search_item: mapping providing the ``sku`` that was searched for.
     :param found_item: mapping describing the candidate (``brand``,
         ``name``, ``url`` and optionally ``model``).
     :return: ``True`` when the candidate is accepted, ``False`` otherwise.
     """
     # The log messages interpolate brand/name/url.  Default them so that a
     # candidate lacking one of these keys cannot raise KeyError merely
     # because it is being logged.
     log_fields = {'brand': None, 'name': None, 'url': None}
     log_fields.update(found_item)

     brand = (found_item.get('brand') or '').lower()
     name = (found_item.get('name') or '').lower()

     if 'rachael ray' in brand:
         self.log(
             u'[[NAVICO_AMER_AMAZON]] found Rachael Ray item: {brand} {name} ({url})'
             .format(**log_fields))
         return False
     if 'e-cloth' in name:
         self.log(
             u'[[NAVICO_AMER_AMAZON]] found e-cloth item: {brand} {name} ({url})'
             .format(**log_fields))
         return False

     # Brand problems are reported but deliberately do not reject the item.
     if not brand:
         self.log(
             u"[[NAVICO_AMER_AMAZON]] found item with no brand: {name} ({url})"
             .format(**log_fields))
     elif brand not in self.brands:
         self.log(
             u"[[NAVICO_AMER_AMAZON]] found item with incorrect brand: {brand} {name} ({url})"
             .format(**log_fields))

     if 'model' not in found_item:
         self.log(
             u"[[NAVICO_AMER_AMAZON]] No model for product {}: {}".format(
                 log_fields['name'], log_fields['url']))
         return True

     self.log(u"[[NAVICO_AMER_AMAZON]] Found model for product {}: {}".
              format(log_fields['name'], found_item['model']))
     search_sku = remove_punctuation_and_spaces(
         search_item['sku']).lower()
     found_sku = remove_punctuation_and_spaces(
         found_item['model']).lower()
     if fuzzy_match_ratio(search_sku, found_sku) >= 90:
         # Unicode literal, like every other message here, so non-ASCII
         # model/sku values format safely on Python 2 as well.
         self.log(u"[[NAVICO_AMER_AMAZON]] Model {} match sku {}".format(
             found_item['model'], search_item['sku']))
         return True
     self.log(
         u"[[NAVICO_AMER_AMAZON]] Model {} do not match sku {}".format(
             found_item['model'], search_item['sku']))
     return False
Example #2
0
    def _item_product(self, product, response):
        """Build a product item from one entry of a product listing.

        :param product: selector for the listing entry being parsed.
        :param response: response of the page the entry came from; used for
            the breadcrumb category and to resolve the image URL.
        :return: the item produced by ``ProductLoader.load_item()``.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=product)

        # The category is the last breadcrumb title; a page without
        # breadcrumbs simply yields no category.
        try:
            category = hxs.select(
                '//div[@id="chemin_os"]//a/span[@itemprop="title"]/text()'
            ).extract()[-1].strip()
        except IndexError:
            category = None

        url = ''
        links = product.select('div[@class="ligne_titre"]/a/@href').extract()
        if links:
            url = "http://www.hmdiffusion.com/" + links[0].strip()
        loader.add_value('url', url)

        name = product.select('div[@class="ligne_titre"]/a/span/strong/text()')
        name = name[0].extract().strip()
        name = name.replace('- OFFRE SPECIALE !', '').strip()
        loader.add_value('name', name)

        # The price block carries one of several class attributes; try them
        # in the original order of preference and keep the first hit.
        price_xpaths = (
            'div[@class="lignebeige"]/div[@class="bloc_prix bloc_prix deuxprix"]/b[@class="prix"]/text()',
            'div[@class="lignebeige"]/div[@class="bloc_prix "]/b[@class="prix"]/text()',
            'div[@class="lignebeige"]/div[@class="bloc_prix deuxprix"]/b[@class="prix"]/text()',
        )
        for price_xpath in price_xpaths:
            prices = product.select(price_xpath).extract()
            if prices:
                loader.add_value('price', self._encode_price(prices[0]))
                break

        # The SKU is read from the second "reference" span of the entry.
        sku = product.select(
            './/div[@class="lignebeige"]//span[@class="reference"]/text()'
        )[1].extract().strip()
        if category:
            loader.add_value('category', category)
        # The identifier is derived from the normalised product name.
        identifier = remove_punctuation_and_spaces(name).lower()
        loader.add_value('identifier', identifier)
        loader.add_value('sku', sku)

        image = product.select(
            './/preceding-sibling::div[@class="colonne_0"]//img/@src'
        ).extract()
        if image:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), image[0]))

        loader.add_value('stock', 1)
        return loader.load_item()
def _get_product_key(product):
    """Return the ``(identifier, normalised name)`` key of a product.

    The name is passed through ``remove_punctuation_and_spaces`` and
    lowercased, so names differing only in punctuation, spacing or case
    produce the same key:

    >>> a = {'name': 'Burgess Excel Tasty Nuggets for Adult Rabbits-4kg', 'identifier': '1928'}
    >>> b = {'name': 'Burgess Excel Tasty Nuggets for Adult Rabbits 4kg', 'identifier': '1928'}
    >>> res1 = _get_product_key(a)
    >>> res2 = _get_product_key(b)
    >>> res1 == res2
    True
    >>> res1
    ('1928', 'burgessexceltastynuggetsforadultrabbits4kg')
    """
    return (
        product['identifier'],
        remove_punctuation_and_spaces(product['name']).lower(),
    )
Example #4
0
 def _single_product(self, product, response):
     """Build the item for a page that shows a single product.

     Such pages use a different structure from the listing pages, so the
     fields are read from different places.

     :param product: selector for the product block of the page.
     :param response: response of the product page; supplies the item URL,
         the breadcrumb category and the base URL for the image.
     :return: the item produced by ``ProductLoader.load_item()``.
     """
     hxs = HtmlXPathSelector(response)
     # The category is the second-to-last breadcrumb title; when there are
     # fewer than two breadcrumbs the item gets no category.
     try:
         category = hxs.select(
             '//div[@id="chemin_os"]//a/span[@itemprop="title"]/text()'
         ).extract()[-2].strip()
     except IndexError:
         category = None

     loader = ProductLoader(item=Product(), selector=product)
     loader.add_value('url', response.url)

     name = product.select('div[@id="fichetitre"]/text()')
     name = name[0].extract().strip()
     name = name.replace('- OFFRE SPECIALE !', '').strip()
     loader.add_value('name', name)
     loader.add_xpath('sku',
                      './/span[@class="reference"]/text()',
                      re=r'R\xe9f. (.*)')
     # The identifier is derived from the normalised product name.
     identifier = remove_punctuation_and_spaces(name).lower()
     loader.add_value('identifier', identifier)

     image = product.select('.//div[@id="lien_zoom0"]//img/@src').extract()
     if image:
         loader.add_value('image_url',
                          urljoin_rfc(get_base_url(response), image[0]))
     if category:
         loader.add_value('category', category)

     prices = product.select(
         'form/div//div/div/div/div/b[@class="prix"]/text()').extract()
     if prices:
         loader.add_value('price', self._encode_price(prices[0]))

     # Stock is always reported as 1; availability is not read from the page.
     loader.add_value('stock', 1)
     return loader.load_item()
Example #5
0
    def parse_product(self, response):
        """Parse a product page, yielding items and follow-up requests.

        Nothing is yielded when the page contains a ``nbPagesPerPage``
        select or a level-1 category title.  Otherwise the method yields:

        * one item for the main product, when a name can be found;
        * for each entry of the "offre" and "accessoire" blocks, either a
          ``Request`` back to this callback (when the entry links to its own
          page) or an item built from the entry itself.

        :param response: the response of the page to parse.
        """
        hxs = HtmlXPathSelector(response)
        pages = hxs.select('//select[@name="nbPagesPerPage"]')
        cat_text = hxs.select('//h2[@class="titre_image titre_image_niv1"]')
        if pages or cat_text:
            return

        # The category is the last breadcrumb title, if there is one.
        try:
            category = hxs.select(
                '//div[@id="chemin_os"]//a/span[@itemprop="title"]/text()'
            ).extract()[-1]
        except IndexError:
            category = None

        main_ref = hxs.select(
            '//div[@id="ficheProduitPied"]//span[@class="reference"]/text()'
        ).re(r'R\xe9f. (.*)')
        name = response.xpath(
            '//div[@id="ficheProduitPied"]/div[@id="fichetitre"]/text()'
        ).extract()
        if not name or not name[0].strip():
            # Fall back to the microdata name when the title block is
            # missing or blank.
            name = response.xpath(
                '//span[@itemprop="name"]/text()').extract()
        price = ''.join(
            response.xpath(
                '//div[@id="ficheProduitPied"]//*[@class="prix"]/text()'
            ).re(r'\S+'))
        if name:
            identifier = remove_punctuation_and_spaces(name[0]).lower()
            image_url = response.xpath(
                '//div[@id="ficheProduitPied"]//img/@src').extract()
            image_url = urljoin_rfc(get_base_url(response),
                                    image_url[0]) if image_url else ''

            loader = ProductLoader(
                item=Product(),
                selector=response.xpath('//div[@id="ficheProduitPied"]'))
            loader.add_value('identifier', identifier)
            loader.add_value('name', name)
            if category:
                loader.add_value('category', category)
            loader.add_xpath('sku',
                             '//div[@id="ligne_achat"]//text()',
                             re=':(.+)')
            loader.add_value('stock', 1)
            loader.add_value('url', response.url)
            loader.add_value('price', price)
            loader.add_value('image_url', image_url)
            yield loader.load_item()

        products = hxs.select(
            '//div[@id="bloc_offre"]/div/div[@class="bloc_cadre_pied"]/form[@class="mini_fiche_ligne"]'
        )
        products += hxs.select(
            '//div[@id="bloc_accessoire"]/div/div[@class="bloc_cadre_pied"]/form[@class="mini_fiche_ligne"]'
        )
        for p in products:
            p_url = p.select(
                './/div[@class="ligne_titre"]/a/@href').extract()
            if p_url:
                # Entries with their own page are parsed from that page.
                yield Request(urljoin_rfc(get_base_url(response),
                                          p_url[0]),
                              callback=self.parse_product)
                continue

            name = p.select(
                './/div[@class="colonne_1"]/div[@class="ligne_titre"]/span[@class="titre_descriptif"]/strong/text()'
            )
            if not name:
                name = p.select(
                    './/div[@class="colonne_1"]/div[@class="ligne_titre"]/a/span[@class="titre_descriptif"]/strong/text()'
                )
            name = name[0].extract().strip()
            name = name.replace('- OFFRE SPECIALE !', '').strip()
            url = response.url
            price = "".join(
                p.select(
                    './/div[@class="lignebeige"]/div[@class="wrapperPrix"]/div/div/div/b/text()'
                ).re(r'([0-9\,\. ]+)')).strip()
            # The identifier is derived from the normalised product name.
            identifier = remove_punctuation_and_spaces(name).lower()
            image_url = p.select('.//div/img/@src').extract()
            if image_url:
                image_url = urljoin_rfc(get_base_url(response),
                                        image_url[0])

            # The SKU is only taken from the page-level purchase line when
            # this entry carries the same reference as the main product.
            sku = ''
            p_ref = p.select('.//span[@class="reference"]//text()').re(
                r'(\d+)')
            if main_ref and p_ref and p_ref[0] == main_ref[0]:
                sku_cells = p.select(
                    '//div[@id="ligne_achat"]/table/tr/td/text()')
                sku_texts = sku_cells.extract()
                if sku_texts:
                    try:
                        sku = sku_texts[0].strip().split(': ')[1]
                    except IndexError:
                        # No ": " separator: fall back to the third
                        # whitespace-separated token, leaving the SKU empty
                        # when there are too few tokens instead of failing.
                        tokens = sku_cells.re(r'\S+')
                        if len(tokens) > 2:
                            sku = tokens[2]

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier', identifier)
            loader.add_value('name', name)
            if category:
                loader.add_value('category', category)
            loader.add_value('sku', sku)
            loader.add_value('stock', 1)
            loader.add_value('url', url)
            loader.add_value('price', price)
            loader.add_value('image_url', image_url)
            yield loader.load_item()
Example #6
0
def make_product(loader,
                 product_info,
                 operator,
                 channel,
                 plan_name,
                 per_month,
                 period,
                 one_time_charge,
                 strip_operator=False,
                 ignore_rec_charge_diff=False):
    """Populate ``loader`` with a device/plan product and attach its metadata.

    :param loader: product loader that receives the product fields.
    :param product_info: mapping with ``device_name``, ``url`` and
        ``image_url``; may also provide ``identifier``, ``network_gen`` and
        ``in_stock``.
    :param operator: operator name, stored in the metadata and used to fix
        the plan name.
    :param channel: channel value stored in the metadata.
    :param plan_name: raw plan name; it is passed through ``_fix_planname``
        and any "Orange" in it is replaced with "Salt".
    :param per_month: recurring charge stored in the metadata.
    :param period: string containing the contract period; its first run of
        digits is used.
    :param one_time_charge: upfront price; a falsy value is stored as 0.
    :param strip_operator: forwarded to ``_fix_device_name``.
    :param ignore_rec_charge_diff: not used by this function; kept so
        existing callers keep working.
    :return: the loaded product with its ``metadata`` entry set.
    :raises ValueError: if ``period`` contains no digits.
    """
    plan_name = _fix_planname(plan_name, operator)
    # Replace the "Orange" operator name with "Salt" in the plan name.
    plan_name = plan_name.replace('Orange', 'Salt')
    brand = _pick_brand(product_info)
    # Clean up the device name now that the brand is known.
    device_name = _fix_device_name(product_info['device_name'], brand,
                                   strip_operator)

    period_digits = re.search(r"\d+", period)
    if period_digits is None:
        raise ValueError(
            "period {!r} does not contain a number".format(period))
    period = period_digits.group(0)
    one_time_charge = one_time_charge or 0

    # The device name doubles as the device identifier when none is given.
    device_identifier = product_info.get('identifier', device_name)
    identifier_plan = _get_plan_identifier(plan_name)
    identifier = device_identifier + '_' + identifier_plan + '_' + period

    # Normalise the network generation to '4G', '3G' or '' (unknown).
    network_gen = product_info.get('network_gen', '')
    if network_gen:
        network_gen = '4G' if '4g' in network_gen.lower() else '3G'
    else:
        network_gen = ''

    loader.add_value('name', device_name)
    loader.add_value('url', product_info['url'])
    loader.add_value('brand', brand)
    loader.add_value('image_url', product_info['image_url'])
    loader.add_value('price', one_time_charge)
    loader.add_value('identifier', identifier)

    meta_loader = OrangeNewMetaLoader(item=OrangeNewMeta())
    meta_loader.add_value('device_name',
                          remove_punctuation_and_spaces(device_name).lower())
    meta_loader.add_value('device_identifier', device_identifier)
    meta_loader.add_value('plan_name', plan_name)
    meta_loader.add_value('period', period)
    meta_loader.add_value('one_time_charge', one_time_charge)
    meta_loader.add_value('per_month', per_month)
    meta_loader.add_value('operator', operator)
    meta_loader.add_value('channel', channel)
    meta_loader.add_value('network_gen', network_gen)

    if 'in_stock' in product_info:
        in_stock = product_info['in_stock']
        # bool is checked first because it is a subclass of int: False is
        # recorded as stock 0, while True adds no stock value at all.
        if isinstance(in_stock, bool):
            if not in_stock:
                loader.add_value('stock', 0)
        elif isinstance(in_stock, int):
            loader.add_value('stock', in_stock)

    product = loader.load_item()
    product['metadata'] = meta_loader.load_item()

    return product