Esempio n. 1
0
    def parse_product(self, response):
        sel = Selector(response)
        metadata = {
            'brand_id': self.spider_data['brand_id'],
            'url': response.url,
            'tags_mapping': {},
            'color': [],
        }

        #单品model
        model = None
        mt = re.search(r'.+/product/(\d+).*', response.url)
        if mt:
            model = mt.group(1)
        model = self.reformat(model)
        if model:
            metadata['model'] = model
        else:
            return

        #单品region
        region = None
        mt = re.search('.+com/(\w+)/.+', response.url)
        if mt:
            region = mt.group(1)
        #替换gb为uk
        if region == 'gb':
            region = 'uk'
        region = self.reformat(region)
        if region:
            metadata['region'] = region
        else:
            return

        #左上类型标签
        type_nodes = sel.xpath('//ul[@class="breadcrumbs"]//li')
        category_index = 0
        for node in type_nodes:
            type_node = node.xpath('./a')
            if not type_node:
                continue

            type_text = type_node.xpath('./text()').extract()[0]
            type_text = self.reformat(type_text)
            type_name = type_text.lower()
            if type_text and type_name:
                category_type = str.format('category-{0}', category_index)
                metadata['tags_mapping'][category_type] = [{
                    'name': type_name,
                    'title': type_text
                }]
                category_index += 1

                gender = common.guess_gender(type_name)
                if gender:
                    metadata['gender'] = [gender]

        #价格标签
        price_node = sel.xpath('//span[@id="text-price"]//span')
        if price_node:
            price = price_node.xpath('./text()').extract()[0]
            price = self.reformat(price)
            if price:
                metadata['price'] = price

        #单品名称
        name_node = sel.xpath('//h1')
        if name_node:
            name = name_node.xpath('./text()').extract()[0]
            name = self.reformat(name)
            if name:
                metadata['name'] = name

        #详情标签
        description_node = sel.xpath('//div[@class="description"]')
        if description_node:
            description_text_node = description_node.xpath('.//p[1]')
            if description_text_node:
                description = description_text_node.xpath(
                    './text()').extract()[0]
                description = self.reformat(description)
                if description:
                    metadata['description'] = description

            detailText_node = description_node.xpath('.//*[preceding::h2[2]]')
            if detailText_node:
                detail = ''.join(detailText_node.xpath('./text()').extract())
                detail = self.reformat(detail)
                if detail:
                    metadata['details'] = detail

        #颜色标签,获取各种颜色的图片
        color_nodes = sel.xpath('//*[@id="options-articles"]//li')
        for node in color_nodes:
            color_node = node.xpath('.//span')
            if color_node:
                tmp = color_node.xpath('./text()').extract()
                if not tmp:
                    continue
                color_text = self.reformat(tmp[0])
                if color_text:
                    metadata['color'] += [color_text]

            color_image_node = node.xpath('.//a')
            if color_image_node:
                color_image_href = color_image_node.xpath(
                    './@href').extract()[0]
                color_image_href = re.sub(ur'\?.+', color_image_href,
                                          response.url)

                m = copy.deepcopy(metadata)

                yield Request(url=color_image_href,
                              callback=self.parse_images,
                              errback=self.onerr,
                              meta={'userdata': m},
                              dont_filter=True)

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata

        yield item
Esempio n. 2
0
    def parse_product(self, response):

        metadata = response.meta['userdata']
        sel = Selector(response)

        other_nodes = sel.xpath(
            '//div[@class="product-detail-container"]//ul[@class="swatch-set clearfix"]/li/a[@href][@title]')
        for node in other_nodes:
            m = copy.deepcopy(metadata)

            try:
                href = node.xpath('./@href').extract()[0]
                href = self.process_href(href, response.url)
            except(TypeError, IndexError):
                continue

            yield Request(url=href,
                          callback=self.parse_product,
                          errback=self.onerr,
                          meta={'userdata': m})

        metadata['url'] = response.url

        model = self.fetch_model(response)
        if model:
            metadata['model'] = model
        else:
            return

        name = self.fetch_name(response)
        if name:
            metadata['name'] = name

        ret = self.fetch_price(response)
        if 'price' in ret:
            metadata['price'] = ret['price']
        if 'price_discount' in ret:
            metadata['price_discount'] = ret['price_discount']

        description = self.fetch_description(response)
        if description:
            metadata['description'] = description

        colors = self.fetch_color(response)
        if colors:
            metadata['color'] = colors

        image_urls = []
        image_nodes = sel.xpath(
            '//div[@class="product-detail-container"]//ul[@class="variant-thumbnail-set"]/li/a/img[@src]')
        for node in image_nodes:
            try:
                url = node.xpath('./@src').extract()[0]
                url = self.process_href(url, response.url)
                if url:
                    image_url = re.sub(ur'/\d+/\d+/', u'/2000/2000/', url)
                    if image_url:
                        image_urls += [image_url]
            except(TypeError, IndexError):
                pass

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        if image_urls:
            item['image_urls'] = image_urls
        item['metadata'] = metadata

        yield item
Esempio n. 3
0
    def parse_product(self, response):

        metadata = response.meta['userdata']
        sel = Selector(response)

        metadata['url'] = response.url

        other_nodes = sel.xpath(
            '//div[@class="attributePanel"]//div[@class="palette"]/a[@href]')
        for node in other_nodes:
            m = copy.deepcopy(metadata)

            try:
                other_href = node.xpath('./@href').extract()[0]
                other_href = re.sub(ur'\s', '', other_href)
                other_href = self.process_href(other_href, response.url)
            except (TypeError, IndexError):
                continue

            yield Request(url=other_href,
                          callback=self.parse_product,
                          errback=self.onerr,
                          meta={'userdata': m})

        model = self.fetch_model(response)
        if model:
            metadata['model'] = model
        else:
            return

        name = self.fetch_name(response)
        if name:
            metadata['name'] = name

        ret = self.fetch_price(response)
        if 'price' in ret:
            metadata['price'] = ret['price']
        if 'price_discount' in ret:
            metadata['price_discount'] = ret['price_discount']

        description = self.fetch_description(response)
        if description:
            metadata['description'] = description

        colors = self.fetch_color(response)
        if colors:
            metadata['color'] = colors

        image_urls = []
        image_nodes = sel.xpath(
            '//div[@id="mainPictureBlock"]//div[@id="productSheetSlideshow"]//li[not(@id)]/img[@data-src]'
        )
        for node in image_nodes:
            try:
                image_src = node.xpath('./@data-src').extract()[0]
                # mt = re.search(ur'/([^/]+)\.\w+$', image_src)
                # if mt:
                #     image_name = mt.group(1)
                #
                #     image_src = str.format("{0}{1}/jcr:content/renditions/{2}_550x550.jpg",
                #                            self.spider_data['image_host'], image_src, image_name)
                #     if image_src:
                #         image_urls += [image_src]
                image_src = str.format("{0}{1}",
                                       self.spider_data['image_host'],
                                       image_src)
                if image_src:
                    image_urls += [image_src]
            except (TypeError, IndexError):
                continue

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        if image_urls:
            item['image_urls'] = image_urls
        item['metadata'] = metadata

        yield item
Esempio n. 4
0
    def parse_other_procut(self, response):
        """Parse a product page whose metadata arrives in ``response.meta``.

        Fills model, price, name, description, details and colours through
        the ``fetch_*`` helpers, derives image URLs from the thumbnail
        ``src`` values and the ``"NN_x"`` suffixes found in the page body,
        and yields one ``ProductItem``.  Nothing is yielded when
        ``fetch_model`` returns nothing.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        metadata['url'] = response.url

        model = self.fetch_model(response)
        if not model:
            return
        metadata['model'] = model

        prices = self.fetch_price(response)
        for key in ('price', 'price_discount'):
            if key in prices:
                metadata[key] = prices[key]

        name = self.fetch_name(response)
        if name:
            metadata['name'] = name

        description = self.fetch_description(response)
        if description:
            metadata['description'] = description

        detail = self.fetch_details(response)
        if detail:
            metadata['details'] = detail

        colors = self.fetch_color(response)
        if colors:
            metadata['color'] = colors

        # The body lists more image suffixes than are needed.
        suffixes = re.findall(r'"(\d{2}_[a-z])"', response.body)
        # Drop the useless ones: keep only suffixes that carry the highest
        # two-digit prefix ('0' is the floor when none were found).
        top_prefix = max(['0'] + [suffix[:2] for suffix in suffixes])
        wanted = re.compile(str.format('{0}_[a-z]', top_prefix))
        suffixes = [suffix for suffix in suffixes if wanted.search(suffix)]

        # Use the image addresses present in the page to obtain the image
        # server address; they already embed the product id and colour id,
        # so only the suffix has to be substituted.
        image_urls = None
        try:
            thumbs = sel.xpath(
                '//aside[@class="itemSidebar"]//div[@class="colors"]/div[@class="colorSizeContent colorSlider"]/div[@class="colorMask"]//img[@src]')
            if thumbs:
                image_urls = [
                    re.sub('\d{2}_[a-z]', suffix, src)
                    for suffix in suffixes
                    for src in thumbs.xpath('./@src').extract()
                ]
        except (TypeError, IndexError):
            pass

        product = ProductItem()
        product['url'] = metadata['url']
        product['model'] = metadata['model']
        if image_urls:
            product['image_urls'] = image_urls
        product['metadata'] = metadata

        yield product
Esempio n. 5
0
    def parse_product(self, response):
        """Parse a single-product page.

        The metadata dict from ``response.meta['userdata']`` is completed with
        the page URL, model, name, price, description and details (all via
        the ``fetch_*`` helpers), and the HD image URLs are read from the
        ``data-hdimage`` attributes of the scroll list.  Yields one
        ``ProductItem``; nothing is yielded when ``fetch_model`` returns
        nothing.
        """
        userdata = response.meta['userdata']
        page = Selector(response)

        userdata['url'] = response.url

        model = self.fetch_model(response)
        if not model:
            return
        userdata['model'] = model

        name = self.fetch_name(response)
        if name:
            userdata['name'] = name

        prices = self.fetch_price(response)
        for key in ('price', 'price_discount'):
            if key in prices:
                userdata[key] = prices[key]

        description = self.fetch_description(response)
        if description:
            userdata['description'] = description

        detail = self.fetch_details(response)
        if detail:
            userdata['details'] = detail

        # HD images: reformat the attribute value, then resolve it against
        # the page URL; entries that come out empty at either step are
        # dropped.
        image_urls = []
        for entry in page.xpath('//div[@id="scroll"]/ul/li[@data-hdimage]'):
            try:
                hd_url = self.reformat(
                    entry.xpath('./@data-hdimage').extract()[0])
                if not hd_url:
                    continue
                hd_url = self.process_href(hd_url, response.url)
            except (TypeError, IndexError):
                continue
            if hd_url:
                image_urls.append(hd_url)

        product = ProductItem()
        product['url'] = userdata['url']
        product['model'] = userdata['model']
        if image_urls:
            product['image_urls'] = image_urls
        product['metadata'] = userdata

        yield product
Esempio n. 6
0
    def parse_product(self, response):
        """Parse one product page, its swatch variants and its image set.

        ``response.meta['userdata']`` supplies the metadata dict.  Every
        swatch entry (the link target is stored in the ``name`` attribute) is
        re-queued through this callback with a deep copy of the metadata
        taken before this page's fields are written.  The page's own fields
        are then filled through the ``fetch_*`` helpers.

        When the page exposes a ``pdpImgUrl`` input, an image-set request
        (callback ``self.parse_image_request``) is yielded with a deep copy of
        the completed metadata, and one direct image URL is attached to the
        item.  Finally one ``ProductItem`` is yielded.  Nothing is yielded for
        the page itself when ``fetch_model`` returns nothing.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        other_nodes = sel.xpath(
            '//div[@id="pdpATCDivsubProductDiv"]//ul[@id="swatchesselect"]/li/a[@name]'
        )
        for node in other_nodes:
            m = copy.deepcopy(metadata)

            try:
                href = node.xpath('./@name').extract()[0]
                href = self.process_href(href, response.url)
            except (TypeError, IndexError):
                continue

            yield Request(url=href,
                          callback=self.parse_product,
                          errback=self.onerr,
                          meta={'userdata': m})

        metadata['url'] = response.url

        model = self.fetch_model(response)
        if model:
            metadata['model'] = model
        else:
            return

        name = self.fetch_name(response)
        if name:
            metadata['name'] = name

        ret = self.fetch_price(response)
        if 'price' in ret:
            metadata['price'] = ret['price']
        # Fixed: the key was previously misspelled ('price_disount'), so the
        # discounted price was never copied into the metadata.
        if 'price_discount' in ret:
            metadata['price_discount'] = ret['price_discount']

        description = self.fetch_description(response)
        if description:
            metadata['description'] = description

        details = self.fetch_details(response)
        if details:
            metadata['details'] = details

        colors = self.fetch_color(response)
        if colors:
            metadata['color'] = colors

        image_urls = []
        image_node = sel.xpath('//input[@id="pdpImgUrl"][@value]')
        if image_node:
            try:
                image_request_value = image_node.xpath('./@value').extract()[0]
                if image_request_value:
                    # Ask the image server for the whole image set of this
                    # product; the response is handled separately.
                    m = copy.deepcopy(metadata)
                    image_request_ref = str.format(
                        'http://s7d5.scene7.com/is/image/ToryBurchLLC/{0}_S?req=imageset',
                        image_request_value)

                    yield Request(url=image_request_ref,
                                  callback=self.parse_image_request,
                                  errback=self.onerr,
                                  meta={'userdata': m})

                    # The main image is attached to this item directly.
                    image_urls += [
                        str.format(
                            'http://s7d5.scene7.com/is/image/ToryBurchLLC/{0}?scl=2',
                            image_request_value)
                    ]
            except (TypeError, IndexError):
                pass

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        if image_urls:
            item['image_urls'] = image_urls
        item['metadata'] = metadata

        yield item
Esempio n. 7
0
    def parse_sku1(self, response):
        """Parse a SKU page whose URL ends in ``?sku=<digits>``.

        The region is found by matching the first URL path segment after
        ``chanel.com/`` against the values of ``self.spider_data['base_url']``;
        the model is the SKU number from the query string.  If the URL does
        not have the expected shape, or no region matches, nothing is
        yielded.

        Category tags (with a guessed gender), the product name, description,
        details and image URLs are read from the page, and one
        ``ProductItem`` is yielded.
        """
        self.log(str.format('PARSE_SKU1: {0}', response.url), level=log.DEBUG)

        mt = re.search(r'chanel\.com/([^/]+)/', response.url)
        if not mt:
            # Unexpected URL shape; without this guard ``mt.group`` raised
            # AttributeError.
            return
        region = None
        for a, b in self.spider_data['base_url'].items():
            if b == mt.group(1):
                region = a
                break
        if not region:
            return

        mt = re.search(r'\?sku=(\d+)$', response.url)
        if not mt:
            return
        model = mt.group(1)

        metadata = {
            'region': region,
            'brand_id': self.spider_data['brand_id'],
            'model': model,
            'url': response.url,
            'tags_mapping': {},
            'category': set([])
        }

        sel = Selector(response)

        # Category tags; duplicates (case-insensitive) are skipped and the
        # numbering starts at category-1.
        cat_idx = 0
        cat_list = []
        for node in sel.xpath(
                '//div[contains(@class,"trackingSettings")]/span[@class]'):
            cat = unicodify(node._root.text)
            if not cat:
                continue
            if cat.lower() in cat_list:
                continue

            cat_idx += 1
            cat_list.append(cat.lower())
            metadata['tags_mapping'][str.format('category-{0}', cat_idx)] = [{
                'name':
                cat.lower(),
                'title':
                cat
            }]
            gender = cm.guess_gender(cat)
            if gender:
                if 'gender' not in metadata:
                    metadata['gender'] = set([])
                metadata['gender'].add(gender)

        # Product name: family text and name text, each followed by the text
        # of their descendant elements, joined with " - ".
        temp = sel.xpath('//div[@class="productName"]')
        name_list = []
        if len(temp) > 0:
            product_name = temp[0]
            temp = product_name.xpath(
                './h1[@class="family"]/span[@class="familyText"]')
            if len(temp) > 0:
                name = unicodify(temp[0]._root.text)
                if name:
                    name_list.append(name)
                name = u', '.join([
                    unicodify(val.text)
                    for val in temp[0]._root.iterdescendants()
                    if val.text and val.text.strip()
                ])
                if name:
                    name_list.append(name.strip())
            temp = product_name.xpath('./h2[@class="name"]')
            if len(temp) > 0:
                name = unicodify(temp[0]._root.text)
                if name:
                    name_list.append(name)
                name = u', '.join([
                    unicodify(val.text)
                    for val in temp[0]._root.iterdescendants()
                    if val.text and val.text.strip()
                ])
                if name:
                    name_list.append(name.strip())
        name = u' - '.join(name_list)
        metadata['name'] = name if name else None

        # Description and details: each tab header whose text is listed in
        # the spider's header tables points (via ``rel``) at the id of the
        # div holding the content.
        temp = sel.xpath('//div[@class="tabHolderFullWidth tabHolder"]')
        if len(temp) > 0:
            content_node = temp[0]
            content_map = {}
            for node in content_node.xpath('./div[@class="tabs"]//a[@rel]'):
                temp = unicodify(node._root.text)
                if temp and temp in self.spider_data['description_hdr']:
                    content_map['description'] = node._root.attrib['rel']
                if temp and temp in self.spider_data['details_hdr']:
                    content_map['details'] = node._root.attrib['rel']

            for term in ('description', 'details'):
                if term in content_map:
                    temp = content_node.xpath(
                        str.format('./div[@id="{0}"]', content_map[term]))
                    if len(temp) > 0:
                        content_list = []
                        content = unicodify(temp[0]._root.text)
                        if content:
                            content_list.append(content)
                        content_list.extend([
                            unicodify(val.text)
                            for val in temp[0]._root.iterdescendants()
                            if val.text and val.text.strip()
                        ])
                        metadata[term] = u', '.join(content_list)

        # Images: normalised, de-duplicated src values of the main pictures.
        image_urls = list(
            set(
                cm.norm_url(node._root.attrib['src'],
                            self.spider_data['base_url'])
                for node in sel.xpath(
                    '//div[@class="major productImg"]/img[@src]') if
                node._root.attrib['src'] and node._root.attrib['src'].strip()))

        # Sets are turned into lists before the metadata leaves the spider.
        if 'color' in metadata:
            metadata['color'] = list(metadata['color'])
        if 'gender' in metadata:
            metadata['gender'] = list(metadata['gender'])

        if 'model' in metadata:
            item = ProductItem()
            item['image_urls'] = image_urls
            item['url'] = metadata['url']
            item['model'] = metadata['model']
            item['metadata'] = metadata
            yield item
Esempio n. 8
0
    def parse_details(self, response):
        """Parse a product detail page and yield one ``ProductItem``.

        The metadata dict from ``response.meta['userdata']`` receives the page
        URL, then name and details, then the model.  When ``fetch_model``
        returns nothing the callback stops there, with name and details
        already written.  Description, price and colours follow.

        Image URLs are assembled from the ``og:image`` meta tag (prefix and
        file extension) combined with the entries of the ``ALTERNATE`` array
        found after ``jsinit_item`` in the page body.
        """
        userdata = response.meta['userdata']
        userdata['url'] = response.url
        page = Selector(response)

        name = self.fetch_name(response)
        if name:
            userdata['name'] = name

        detail = self.fetch_details(response)
        if detail:
            userdata['details'] = detail

        model = self.fetch_model(response)
        if not model:
            return
        userdata['model'] = model

        description = self.fetch_description(response)
        if description:
            userdata['description'] = description

        prices = self.fetch_price(response)
        for key in ('price', 'price_discount'):
            if key in prices:
                userdata[key] = prices[key]

        colors = self.fetch_color(response)
        if colors:
            userdata['color'] = colors

        # Images: split the og:image URL into the part before
        # "_<digits>_<char>" and the trailing ".<extension>".
        prefix = None
        extension = None
        og_images = page.xpath(
            '//meta[@property="og:image" and @content]/@content').extract()
        if og_images:
            mt = re.search(r'(.+)_\d+_\w(\..+)$', og_images[0])
            if mt:
                prefix, extension = mt.group(1), mt.group(2)

        # Locate the ALTERNATE array that follows "jsinit_item" in the body.
        alternates = None
        init_pos = response.body.find('jsinit_item')
        if init_pos != -1:
            init_chunk = response.body[init_pos:]
            alt_pos = init_chunk.find('ALTERNATE')
            if alt_pos != -1:
                try:
                    alternates = json.loads(
                        cm.extract_closure(init_chunk[alt_pos:], r'\[', r'\]')[0])
                except ValueError:
                    pass

        image_urls = []
        if prefix and extension and alternates:
            for entry in alternates:
                mt = re.search(r'(\d+)_\w', entry)
                if not mt:
                    continue
                # Every size index from the listed one up to 14 is tried.
                for size in xrange(int(mt.group(1)), 15):
                    variant = re.sub(r'\d+_(\w)',
                                     str.format(r'{0}_\1', size), entry)
                    image_urls.append(
                        str.format('{0}_{1}{2}', prefix, variant, extension))

        product = ProductItem()
        product['url'] = userdata['url']
        product['model'] = userdata['model']
        product['image_urls'] = image_urls
        product['metadata'] = userdata
        yield product
Esempio n. 9
0
    def parse_details_us(self, response):
        """Parse a product detail page and follow its other colour versions.

        The page body may contain a ``var productURLs`` object mapping colour
        keys to URLs.  For every key that has a matching ``<option>``: if it
        is not the selected option, a request for that URL is yielded with a
        deep copy of the metadata (and that colour's name); if it is the
        selected one, its name becomes this product's colour.

        The remaining fields come from the ``fetch_*`` helpers.  When
        ``fetch_model`` returns nothing, no item is yielded; otherwise one
        ``ProductItem`` with the page's image URLs is yielded.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)
        metadata['url'] = response.url

        # Look for the other colour versions.
        idx = response.body.find('var productURLs')
        # ``find`` returns -1 when the marker is absent; slicing from -1 would
        # feed the last byte of the body to the parser, so skip instead.
        if idx != -1:
            try:
                data = json.loads(
                    cm.extract_closure(response.body[idx:], r'\{', r'\}')[0]
                    .replace("'", '"'))
                for color_key in data:
                    tmp = sel.xpath(
                        str.format('//select/option[@value="{0}"]', color_key))
                    if not tmp:
                        continue
                    color_node = tmp[0]
                    # Is this the currently selected colour?
                    if not color_node.xpath('@selected'):
                        m = copy.deepcopy(metadata)
                        tmp = color_node.xpath('text()').extract()
                        if tmp:
                            m['color'] = [self.reformat(tmp[0])]
                        yield Request(
                            url=self.process_href(data[color_key],
                                                  response.url),
                            callback=self.spider_data['callbacks'][
                                metadata['region']][2],
                            errback=self.onerr,
                            meta={'userdata': m})
                    else:
                        tmp = color_node.xpath('text()').extract()
                        if tmp:
                            metadata['color'] = [self.reformat(tmp[0])]
            except ValueError:
                # Includes JSON decoding failures: the colour lookup is
                # best-effort.
                pass

        name = self.fetch_name(response)
        if name:
            metadata['name'] = name

        model = self.fetch_model(response)
        if model:
            metadata['model'] = model
        else:
            return

        description = self.fetch_description(response)
        if description:
            metadata['description'] = description

        ret = self.fetch_price(response)
        if 'price' in ret:
            metadata['price'] = ret['price']
        if 'price_discount' in ret:
            metadata['price_discount'] = ret['price_discount']

        detail = self.fetch_details(response)
        if detail:
            metadata['details'] = detail

        # Images: prefer data-url; otherwise insert "_zoom" before the file
        # extension of src (the XPath guarantees one of the two exists).
        image_urls = []
        for img_node in sel.xpath('//div[contains(@class,"slider_selector") or @id="frg_thumb_list"]/ul'
                                  '/li[contains(@id,"productAngle")]//img[@src or @data-url]'):
            tmp = img_node.xpath('@data-url').extract()
            if tmp:
                image_urls.append(self.process_href(tmp[0], response.url))
            else:
                tmp = img_node.xpath('@src').extract()[0]
                a, b = os.path.splitext(tmp)
                image_urls.append(self.process_href(str.format('{0}_zoom{1}', a, b), response.url))

        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata
        yield item
Esempio n. 10
0
    def parse_product(self, response):

        metadata = response.meta['userdata']
        sel = Selector(response)

        other_product_nodes = sel.xpath(
            '//div[@id="content"]//div[@id="product-content"]//div[@class="product-variations"]/ul/li/div/ul[@class="swatches Color"]/li/a[@href]'
        )
        for node in other_product_nodes:
            m = copy.deepcopy(metadata)

            try:
                href = node.xpath('./@href').extract()[0]
                href = self.process_href(href, response.url)
            except (TypeError, IndexError):
                continue

            yield Request(url=href,
                          callback=self.parse_product,
                          errback=self.onerr,
                          meta={'userdata': m})

        metadata['url'] = response.url

        model = self.fetch_model(response)
        if model:
            metadata['model'] = model
        else:
            return

        name = self.fetch_name(response)
        if name:
            metadata['name'] = name

        colors = self.fetch_color(response)
        if colors:
            metadata['color'] = colors

        ret = self.fetch_price(response)
        if 'price' in ret:
            metadata['price'] = ret['price']
        if 'price_discount' in ret:
            metadata['price_discount'] = ret['price_discount']

        description = self.fetch_description(response)
        if description:
            metadata['description'] = description

        detail = self.fetch_details(response)
        if detail:
            metadata['details'] = detail

        image_urls = []
        origin_image_node = sel.xpath(
            '//div[@id="content"]//div[@id="pdp-pinterest-container"]/img[@src]'
        )
        if origin_image_node:
            try:
                origin_image_url = origin_image_node.xpath(
                    './@src').extract()[0]
                origin_image_url = self.process_href(origin_image_url,
                                                     response.url)
                origin_image_url = re.sub(
                    ur'\?.*$', ur'_A1?$Demandware%20Large%20Rectangle$',
                    origin_image_url)
                if origin_image_url:
                    image_urls += [origin_image_url]
                    image_urls += [
                        re.sub(ur'_A\d\?', str.format(r'_A{0}?', val),
                               origin_image_url) for val in xrange(2, 5)
                    ]
            except (TypeError, IndexError):
                pass

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        if image_urls:
            item['image_urls'] = image_urls
        item['metadata'] = metadata

        yield item
Esempio n. 11
0
    def parse_product_us(self, response):
        """Parse a product page, following colour-variant links first.

        Yields one Request (handled by this same method) per colour swatch
        link, each with a deep copy of the incoming metadata, then one
        ProductItem for the current page. Nothing more is yielded when
        ``fetch_model`` returns a falsy value.

        @param response: product page; ``response.meta['userdata']`` carries
                         the metadata dict collected so far.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        swatch_links = sel.xpath(
            '//div[contains(@class, "product-detail")]//ul[@class="swatches Color"]/li/a[@href]'
        )
        for swatch in swatch_links:
            variant_meta = copy.deepcopy(metadata)
            try:
                variant_url = self.process_href(
                    swatch.xpath('./@href').extract()[0], response.url)
            except (TypeError, IndexError):
                continue
            yield Request(url=variant_url,
                          callback=self.parse_product_us,
                          errback=self.onerr,
                          meta={'userdata': variant_meta})

        metadata['url'] = response.url

        model = self.fetch_model(response)
        if not model:
            return
        metadata['model'] = model

        prices = self.fetch_price(response)
        for price_key in ('price', 'price_discount'):
            if price_key in prices:
                metadata[price_key] = prices[price_key]

        product_name = self.fetch_name(response)
        if product_name:
            metadata['name'] = product_name

        color_list = self.fetch_color(response)
        if color_list:
            metadata['color'] = color_list

        desc = self.fetch_description(response)
        if desc:
            metadata['description'] = desc

        # Thumbnail links, with the query string stripped.
        image_urls = []
        thumb_links = sel.xpath(
            '//div[@id="primary"]//div[@class="product-thumbnails"]/ul/li/a[@href]'
        )
        for thumb in thumb_links:
            try:
                thumb_url = self.process_href(
                    thumb.xpath('./@href').extract()[0], response.url)
                thumb_url = re.sub(r'\?.*', '', thumb_url)
                if thumb_url:
                    image_urls.append(thumb_url)
            except (TypeError, IndexError):
                continue

        # No thumbnails: fall back to the primary image link, unless its URL
        # contains "noimage".
        if not image_urls:
            primary_link = sel.xpath('//div[@id="primary-image"]/a[@href]')
            try:
                primary_url = self.process_href(
                    primary_link.xpath('./@href').extract()[0], response.url)
                primary_url = re.sub(r'\?.*', '', primary_url)
                if not re.search(r'noimage', primary_url) and primary_url:
                    image_urls.append(primary_url)
            except (TypeError, IndexError):
                pass

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        if image_urls:
            item['image_urls'] = image_urls
        item['metadata'] = metadata

        yield item
Esempio n. 12
0
    def parse_product_ca(self, response):
        """Parse a product page, following the product-thumbnail links first.

        Yields one Request (handled by this same method) per link found under
        ``ul#product_thumbnails``, each with a deep copy of the incoming
        metadata, then one ProductItem for the current page. Nothing more is
        yielded when ``fetch_model`` returns a falsy value.

        @param response: product page; ``response.meta['userdata']`` carries
                         the metadata dict collected so far.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        for link in sel.xpath('//ul[@id="product_thumbnails"]/li/a[@href]'):
            linked_meta = copy.deepcopy(metadata)
            try:
                linked_url = self.process_href(
                    link.xpath('./@href').extract()[0], response.url)
            except (TypeError, IndexError):
                continue
            yield Request(url=linked_url,
                          callback=self.parse_product_ca,
                          errback=self.onerr,
                          meta={'userdata': linked_meta})

        metadata['url'] = response.url

        model = self.fetch_model(response)
        if not model:
            return
        metadata['model'] = model

        product_name = self.fetch_name(response)
        if product_name:
            metadata['name'] = product_name

        prices = self.fetch_price(response)
        for price_key in ('price', 'price_discount'):
            if price_key in prices:
                metadata[price_key] = prices[price_key]

        color_list = self.fetch_color(response)
        if color_list:
            metadata['color'] = color_list

        desc = self.fetch_description(response)
        if desc:
            metadata['description'] = desc

        # Image URLs come from the href of every anchor that wraps an <img>.
        image_urls = None
        image_links = sel.xpath(
            '//div[@class="product-main-info"]//div[@class="float-left"]/div/a[child::img[@src]][@href]'
        )
        if image_links:
            try:
                hrefs = image_links.xpath('./@href').extract()
                image_urls = [
                    self.process_href(href, response.url) for href in hrefs
                ]
            except (TypeError, IndexError):
                pass

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        if image_urls:
            item['image_urls'] = image_urls
        item['metadata'] = metadata

        yield item
Esempio n. 13
0
    def parse_product(self, response):
        """Parse a product page and yield one ProductItem.

        Model and name are read from the ``#info1`` block, prices from
        JavaScript variables embedded in the page body, colours from the
        ``title`` attributes in ``.colors_detail`` and image URLs from the
        ``rel`` attributes of the ``superzoom_*`` containers. Nothing is
        yielded when no model can be extracted.

        @param response: product page; ``response.meta['userdata']`` carries
                         the metadata dict collected so far.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        metadata['url'] = response.url

        model = None
        model_node = sel.xpath(
            '//div[@id="info1"]/div[@class="padding15"]/table//tr[2]/td[not(child::*)][2]'
        )
        if model_node:
            try:
                model = self.reformat(model_node.xpath('./text()').extract()[0])
            except (TypeError, IndexError):
                pass

        if not model:
            return
        metadata['model'] = model

        name_node = sel.xpath('//div[@id="info1"]//h1')
        if name_node:
            try:
                name = self.reformat(name_node.xpath('./text()').extract()[0])
                if name:
                    metadata['name'] = name
            except (TypeError, IndexError):
                pass

        def js_price(field):
            """Return the cleaned value of the JS entry ``field: "..."``, or None."""
            # Non-greedy on purpose: a greedy ``.*`` would run on to the last
            # double quote on the line and swallow any following entries.
            mt = re.search(field + r': "(.*?)"', response.body)
            if not mt:
                return None
            try:
                value = self.reformat(mt.group(1))
                # Consume the optional ';' of the entity as well, so that
                # "&nbsp;" does not leave a stray semicolon in the price.
                return re.sub(u'&nbsp;?', u' ', value)
            except TypeError:
                # reformat() returned a non-string (e.g. None).
                return None

        # The price is filled in by JavaScript after the page loads, so it is
        # read from the script source rather than from the DOM.
        default_price = js_price('defaultPrice')
        # defaultComparePrice is the original price; when it is absent the
        # product is not discounted.
        old_price = js_price('defaultComparePrice')

        if old_price:
            # Discounted: the compare price is the list price.
            metadata['price'] = old_price
            if default_price:
                metadata['price_discount'] = default_price
        elif default_price:
            # Not discounted.
            metadata['price'] = default_price

        # Colour tags
        colors = None
        color_nodes = sel.xpath(
            '//div[@id="tallasdiv"]/div[@class="colors_detail"]/div[@title]')
        if color_nodes:
            try:
                colors = [
                    self.reformat(val)
                    for val in color_nodes.xpath('./@title').extract()
                ]
            except (TypeError, IndexError):
                pass
        if colors:
            metadata['color'] = colors

        # Addresses of all the zoomed images, as found in the page source.
        image_urls = None
        image_nodes = sel.xpath('//div[contains(@id, "superzoom_")]/div[@rel]')
        if image_nodes:
            try:
                image_urls = [
                    self.process_href(val, response.url)
                    for val in image_nodes.xpath('./@rel').extract()
                ]
            except (TypeError, IndexError):
                pass

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        if image_urls:
            item['image_urls'] = image_urls
        item['metadata'] = metadata

        yield item
Esempio n. 14
0
    def parse_product(self, response):
        """Parse a product page and yield one ProductItem.

        Other colour pages are deliberately not followed: the image scan
        below walks every ``xlarge:`` array in the page body, which already
        finds the images of all colours. Nothing is yielded when
        ``fetch_model`` returns a falsy value.

        @param response: product page; ``response.meta['userdata']`` carries
                         the metadata dict collected so far.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        metadata['url'] = response.url

        model = self.fetch_model(response)
        if model:
            metadata['model'] = model
        else:
            return

        ret = self.fetch_price(response)
        if 'price' in ret:
            metadata['price'] = ret['price']
        if 'price_discount' in ret:
            metadata['price_discount'] = ret['price_discount']

        name = self.fetch_name(response)
        if name:
            metadata['name'] = name

        colors = self.fetch_color(response)
        if colors:
            metadata['color'] = colors

        description = self.fetch_description(response)
        if description:
            metadata['description'] = description

        detail = self.fetch_details(response)
        if detail:
            metadata['details'] = detail

        image_urls = []
        try:
            body = response.body
            offset = 0  # absolute position in body where the next search starts
            while True:
                mt = re.search(r'xlarge:', body[offset:])
                if not mt:
                    break
                # mt.start() is relative to the slice that was searched, so it
                # has to be rebased before it is used to index body.
                block_start = offset + mt.start()
                result = common.extract_closure(body[block_start:],
                                                r'\[', r'\]')
                content = result[0]
                # NOTE(review): result[2] is assumed to be the end offset of
                # the closure relative to the string passed in, with 0 meaning
                # "nothing found" - confirm against common.extract_closure.
                consumed = result[2]
                if 0 == consumed:
                    break
                # Resume right after this closure so the scan always advances.
                offset = block_start + consumed
                for url in re.findall('"url":.*\'(.+)\?.*\'', content):
                    image_urls.append(self.process_href(url, response.url))
        except (TypeError, IndexError):
            pass

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        if image_urls:
            item['image_urls'] = image_urls
        item['metadata'] = metadata

        yield item
Esempio n. 15
0
    def parse_sku2(self, response):
        """Parse a product page whose URL ends in ``/sku/<digits>`` and yield a ProductItem.

        The region is the key of ``self.spider_data['base_url']`` whose value
        equals the first path segment after ``chanel.com/``. Nothing is
        yielded when the URL has no such segment, when the segment maps to no
        region, or when the URL does not end in ``/sku/<digits>``.

        @param response: the downloaded product page.
        """
        self.log(str.format('PARSE_SKU2: {0}', response.url), level=log.DEBUG)
        mt = re.search(r'chanel\.com/([^/]+)/', response.url)
        if not mt:
            # Without this guard mt.group(1) raises AttributeError on URLs
            # that do not contain "chanel.com/<segment>/".
            return
        site_code = mt.group(1)
        region = None
        for a, b in self.spider_data['base_url'].items():
            if b == site_code:
                region = a
                break
        if not region:
            return

        mt = re.search(r'/sku/(\d+)$', response.url)
        if not mt:
            return
        model = mt.group(1)

        metadata = {
            'region': region,
            'brand_id': self.spider_data['brand_id'],
            'model': model,
            'url': response.url,
            'tags_mapping': {}
        }

        sel = Selector(response)

        # Category tags: one "category-N" entry per distinct (case-insensitive)
        # span text, numbered from 1 in document order.
        cat_idx = 0
        cat_list = []
        for node in sel.xpath(
                '//div[contains(@class,"trackingSettings")]/span[@class]'):
            cat = unicodify(node._root.text)
            if not cat:
                continue
            if cat.lower() in cat_list:
                continue

            cat_idx += 1
            cat_list.append(cat.lower())
            cat_name = str.format('category-{0}', cat_idx)
            metadata['tags_mapping'][cat_name] = [{
                'name': cat.lower(),
                'title': cat
            }]
            gender = cm.guess_gender(cat)
            if gender:
                if 'gender' not in metadata:
                    metadata['gender'] = set()
                metadata['gender'].add(gender)

        # Name ("<product_name> - <product_subtitle>") and price.
        temp = sel.xpath('//div[contains(@class, "product_detail_container")]')
        name_list = []
        if len(temp) > 0:
            product_name = temp[0]
            temp = product_name.xpath('./h1[@class="product_name"]')
            if len(temp) > 0:
                name = unicodify(temp[0]._root.text)
                if name:
                    name_list.append(name)
            temp = product_name.xpath('./h2[@class="product_subtitle"]')
            if len(temp) > 0:
                name = unicodify(temp[0]._root.text)
                if name:
                    name_list.append(name)

            temp = product_name.xpath('.//h3[@class="product_price"]')
            if len(temp) > 0:
                metadata['price'] = unicodify(temp[0]._root.text)
        name = u' - '.join(name_list)
        metadata['name'] = name if name else None

        # Description and details: each accordion heading whose text is listed
        # in spider_data points (via its "#id" href) at the div holding the
        # corresponding content.
        temp = sel.xpath('//div[@class="description_container"]')
        if len(temp) > 0:
            content_node = temp[0]
            content_map = {}
            for node in content_node.xpath(
                    './/div[@class="accordion-heading"]/a[@href]'):
                heading = unicodify(node._root.text)
                if heading and heading in self.spider_data['description_hdr']:
                    content_map['description'] = re.sub(
                        r'^#', '', node._root.attrib['href'])
                if heading and heading in self.spider_data['details_hdr']:
                    content_map['details'] = re.sub(r'^#', '',
                                                    node._root.attrib['href'])

            for term in ('description', 'details'):
                if term in content_map:
                    temp = content_node.xpath(
                        str.format('.//div[@id="{0}"]', content_map[term]))
                    if len(temp) > 0:
                        content_list = []
                        content = unicodify(temp[0]._root.text)
                        if content:
                            content_list.append(content)
                        content_list.extend([
                            unicodify(val.text)
                            for val in temp[0]._root.iterdescendants()
                            if val.text and val.text.strip()
                        ])
                        metadata[term] = u', '.join(content_list)

        # Images (de-duplicated through a set, so the order is not preserved).
        image_urls = list(
            set(
                cm.norm_url(node._root.attrib['src'],
                            self.spider_data['base_url'])
                for node in sel.xpath(
                    '//section[@class="product_image_container"]/img[@src and @class="product_image"]'
                ) if node._root.attrib['src']
                and node._root.attrib['src'].strip()))

        # Genders were collected in a set to de-duplicate; store them as a list.
        if 'gender' in metadata:
            metadata['gender'] = list(metadata['gender'])

        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata
        yield item
Esempio n. 16
0
    def parse_product(self, response):
        """Parse a product page and yield one ProductItem.

        The model is taken from the id of ``li.product`` (the part after the
        last '-'), falling back to the second-to-last path segment of the URL
        in upper case. Nothing is yielded when neither source gives a model.

        @param response: product page; ``response.meta['userdata']`` carries
                         the metadata dict collected so far.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        # TODO: the dkny spider switches country through a cookie, so the URL
        # stored here says nothing about the region; try switching to the
        # target region through the URL instead.
        metadata['url'] = response.url

        # Some products do not carry the model number in the URL, e.g.
        # http://www.dkny.com/bags/shop-by-shape/view-all/resort13bags145/dknypure-large-hobo?p=2&s=12
        # and some do not carry it in the li node either, e.g.
        # http://www.dkny.com/sale/womens-sale/dresses/n43731afa/dknypure-dress-with-sleek-jersey-yoke-and-sleeves
        model = None
        product_li = sel.xpath('//li[@class="product"][@id]')
        if product_li:
            try:
                id_match = re.search(r'-(\w+)$',
                                     product_li.xpath('./@id').extract()[0])
                if id_match:
                    model = id_match.group(1)
            except (TypeError, IndexError):
                pass
        if not model:
            try:
                url_match = re.search(r'.+/(\w+)/.+$', response.url)
                if url_match:
                    model = url_match.group(1)
                    if model:
                        model = model.upper()
            except (TypeError, IndexError):
                pass
        if not model:
            return
        metadata['model'] = model

        desc_block = sel.xpath(
            '//div[contains(@class, "view-product_detail")]//div[@class="product-description"]'
        )
        if desc_block:
            try:
                fragments = desc_block.xpath('.//text()').extract()
                joined = '\r'.join(self.reformat(text) for text in fragments)
                joined = self.reformat(joined)
                if joined:
                    metadata['description'] = joined
            except (TypeError, IndexError):
                pass

        # Colour names come from the alt text of the swatch images.
        color_list = None
        swatch_imgs = sel.xpath(
            '//div[@class="product-info-container"]//form/ul/li/ul/li/a/img[@alt]'
        )
        if swatch_imgs:
            try:
                color_list = [
                    self.reformat(alt).lower()
                    for alt in swatch_imgs.xpath('./@alt').extract()
                ]
            except (TypeError, IndexError):
                pass
        if color_list:
            metadata['color'] = color_list

        image_urls = []
        viewer_imgs = sel.xpath(
            '//div[contains(@class, "view-product_detail")]//div[@class="partial-product_viewer"]/ul/li/a/img[@src]'
        )
        for img in viewer_imgs:
            try:
                src = self.process_href(img.xpath('./@src').extract()[0],
                                        response.url)
                # Replacing the size segment (e.g. /60/80/) with /0/0/ gives
                # the full-size image.
                image_urls.append(re.sub(r'/(\d+/\d+)/', '/0/0/', src))
            except (TypeError, IndexError):
                continue

        # TODO: images of the other colours are not collected yet. The site
        # appears to use two kinds of request for them: one taking link_id,
        # model and value_id, the other taking model and value_id only, e.g.
        # http://www.dkny.com/product/detailpartial?id=<model>&variantId=<value_id>
        # where value_id would come from the id of the swatch <li> elements.

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        if image_urls:
            item['image_urls'] = image_urls
        item['metadata'] = metadata

        yield item
Esempio n. 17
0
    def parse_details(self, response):
        """
        Parse a single product listed under a collection.

        Yields one Request (handled by this same method) for every colour
        link whose href is not contained in the current URL, then one
        ProductItem for the page itself. Nothing is yielded when the page
        has no reference number.

        @param response: product page; ``response.meta['userdata']`` carries
                         the metadata dict collected so far.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        reference_texts = sel.xpath(
            '//div[@id="product-detail"]/div[@class="inner-detail"]//*[@class="reference-number"]/'
            'text()').extract()
        if not reference_texts or not reference_texts[0]:
            return
        metadata['model'] = reference_texts[0]
        metadata['url'] = unicodify(response.url)

        # Only look the name up when the caller did not already supply one.
        if not metadata.get('name'):
            name_texts = sel.xpath(
                '//div[@id="product-detail"]/div[@class="inner-detail"]//*[@class="format"]'
                '/text()').extract()
            if name_texts:
                metadata['name'] = self.reformat(unicodify(name_texts[0]))

        # Colours: every other colour has a page of its own.
        variant_hrefs = sel.xpath(
            '//div[@id="product-detail"]/div[@class="inner-detail"]//ul[@class="color-list"]'
            '/li/a[@href]/@href').extract()
        for href in variant_hrefs:
            if href not in response.url:
                yield Request(url=self.process_href(href, response.url),
                              callback=self.parse_details,
                              errback=self.onerr,
                              meta={'userdata': copy.deepcopy(metadata)})

        try:
            overview = sel.xpath(
                '//div[@id="tabs-product-detail-overview"]'
                '/div[@class="product-detail-tab-content"]'
                '/p[@class="slide-paragraph"]/text()').extract()[0]
            metadata['description'] = self.reformat(unicodify(overview))
        except IndexError:
            pass

        # One "a: b" line per specification row, rows joined with '\r'.
        spec_rows = sel.xpath(
            '//div[@id="tabs-product-detail-specification"]/'
            'div[@class="product-detail-tab-content"]//li/span[@class="tooltip" or '
            '@class="title"]/..')
        spec_lines = [
            ': '.join(row.xpath('*/text()').extract()) for row in spec_rows
        ]
        details = self.reformat(unicodify('\r'.join(spec_lines)))
        if details:
            metadata['details'] = details

        # Gallery images: the positioned ones first, then the direct children.
        gallery_part = '//div[@id="product-gallery"]/div[@class="product-gallery-part"]'
        image_urls = []
        for img_path in ('/div[contains(@class,"positioned-product")]/img[@src]/@src',
                         '/img[@src]/@src'):
            image_urls.extend([
                self.process_href(src, response.url)
                for src in sel.xpath(gallery_part + img_path).extract()
            ])

        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata
        yield item
Esempio n. 18
0
    def parse_details(self, response):
        """Parse a product page and return a ProductItem, a Request, or None.

        Returns None when the region cannot be determined from the URL, when
        the request carries no Referer header, or when the embedded
        ``productJSONObject`` is missing, unparsable or has no ``style`` key.
        Returns a Request for the referer page (handled by ``parse_cat``)
        when that page's categories are not yet in ``self.url_cat_dict``.

        @param response: the downloaded product page.
        """
        # Determine which region the page belongs to.
        region = None
        for tmp in self.spider_data['domains']:
            if self.spider_data['domains'][tmp] in response.url:
                region = tmp
                break
        if not region:
            return

        metadata = {'region': region, 'brand_id': self.spider_data['brand_id'], 'tags_mapping': {}, 'url': response.url}

        # Category information is keyed by the referer. Use .get(): indexing
        # the headers raises KeyError when the header is absent.
        referer = response.request.headers.get('Referer')
        if not referer:
            # No referer means no way to look the categories up.
            return
        if referer not in self.url_cat_dict:
            return Request(url=referer, callback=self.parse_cat,
                           meta={'stash': response, 'coach-referer': referer, 'callback': self.parse_details},
                           errback=self.onerr, dont_filter=True)
        for tag in self.url_cat_dict[referer]:
            metadata['tags_mapping'][tag['type']] = [{'name': tag['name'], 'title': tag['title']}]

        # The product data lives in "var productJSONObject = {...}".
        mt = re.search(r'var\s+productJSONObject\s*=', response.body)
        if not mt:
            return
        try:
            data = json.loads(cm.extract_closure(response.body[mt.end():], "{", "}")[0].replace(r'\"',
                                                                                                '"').replace(r"\'", "'"))
        except(TypeError, IndexError, ValueError):
            return
        if 'style' not in data:
            return
        metadata['model'] = data['style']
        if 'productName' in data:
            metadata['name'] = self.reformat(data['productName'])

        try:
            metadata['color'] = [self.reformat(swatch['color']).lower() for swatch in data['swatchGroup']['swatches']
                                 if 'color' in swatch]
        except KeyError:
            pass

        # Price: taken from the first swatch that has a listPrice.
        try:
            for swatch in data['swatchGroup']['swatches']:
                if 'listPrice' in swatch:
                    metadata['price'] = self.reformat(swatch['listPrice'])
                    if 'unitPrice' in swatch:
                        metadata['price_discount'] = self.reformat(swatch['unitPrice'])
                    break
        except KeyError:
            pass

        # Image links. Names are gathered in a separate list so that a failure
        # part-way through can never leave bare image names in image_urls.
        image_urls = []
        try:
            image_host = 'http://s7d2.scene7.com/is/image/Coach/{0}{1}'
            style_for_images = data['styleForImages']
            image_names = []
            for swatch in data['swatchGroup']['swatches']:
                for subimg in ('aImages', 'nImages', 'mImages'):
                    # A swatch may lack one of the image groups; skip that
                    # group rather than discarding every image of the product.
                    for entry in swatch.get(subimg) or []:
                        image_name = entry.get('imageName')
                        if image_name and image_name not in image_names:
                            image_names.append(image_name)
            image_urls = [str.format(image_host, style_for_images, val) for val in image_names]
        except KeyError:
            pass

        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata
        return item
Esempio n. 19
0
    def parse_products(self, response):
        """Parse a product page and return a ProductItem, or None without a model.

        The ``post_token`` and ``page_id`` keys are removed from the incoming
        metadata before the item is built. The page title, when present, is
        stored as the ``collection`` tag.

        @param response: product page; ``response.meta['userdata']`` carries
                         the metadata dict collected so far.
        """
        metadata = response.meta['userdata']
        # These two keys are dropped from the metadata before it is stored.
        for stale_key in ('post_token', 'page_id'):
            metadata.pop(stale_key, None)
        sel = Selector(response)

        title_nodes = sel.xpath(
            '//div[@class="product-header"]//span[@class="page-product-title"]'
        )
        if title_nodes:
            collection = unicodify(title_nodes[0]._root.text)
            if collection:
                collection_tag = {'name': collection.lower(), 'title': collection}
                metadata['tags_mapping']['collection'] = [collection_tag]

        model = self.fetch_model(response)
        if not model:
            return
        metadata['model'] = model

        # Keep a name supplied by the caller; only fetch one when it is missing.
        if not metadata.get('name'):
            fetched_name = self.fetch_name(response)
            if fetched_name:
                metadata['name'] = fetched_name

        desc = self.fetch_description(response)
        if desc:
            metadata['description'] = desc

        details = self.fetch_details(response)
        if details:
            metadata['details'] = details

        prices = self.fetch_price(response)
        for price_key in ('price', 'price_discount'):
            if price_key in prices:
                metadata[price_key] = prices[price_key]

        zoom_links = sel.xpath(
            '//div[@class="column-images"]//a[@href and contains(@class,"zoom-trigger-link")]'
        )
        image_urls = []
        for link in zoom_links:
            image_urls.append(
                self.process_href(link._root.attrib['href'], response.url))

        metadata['url'] = response.url
        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata

        return item
Esempio n. 20
0
    def parse_product(self, response):
        """Parse a product page, following colour-variant links first.

        Yields one Request (handled by this same method) per colour link,
        each with a deep copy of the incoming metadata, then one ProductItem
        for the current page. Nothing more is yielded when ``fetch_model``
        returns a falsy value.

        @param response: product page; ``response.meta['userdata']`` carries
                         the metadata dict collected so far.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        color_links = sel.xpath(
            '//div[@class="product-container"]//div[@class="prod-options"]/div[@class="colors"]/ul/li/a[@href]')
        for link in color_links:
            variant_meta = copy.deepcopy(metadata)
            try:
                variant_url = self.process_href(link.xpath('./@href').extract()[0], response.url)
            except(TypeError, IndexError):
                continue
            yield Request(url=variant_url,
                          callback=self.parse_product,
                          errback=self.onerr,
                          meta={'userdata': variant_meta})

        metadata['url'] = response.url

        model = self.fetch_model(response)
        if not model:
            return
        metadata['model'] = model

        product_name = self.fetch_name(response)
        if product_name:
            metadata['name'] = product_name

        prices = self.fetch_price(response)
        for price_key in ('price', 'price_discount'):
            if price_key in prices:
                metadata[price_key] = prices[price_key]

        color_list = self.fetch_color(response)
        if color_list:
            metadata['color'] = color_list

        desc = self.fetch_description(response)
        if desc:
            metadata['description'] = desc

        details = self.fetch_details(response)
        if details:
            metadata['details'] = details

        # The single image comes from <link rel="image_src">, with a trailing
        # "_<w>x<h>$" size marker removed from its URL.
        image_urls = None
        image_link = sel.xpath('//link[@rel="image_src"][@href]')
        if image_link:
            try:
                raw_href = image_link.xpath('./@href').extract()[0]
                image_urls = [re.sub(r'_\d+x\d+\$', '', raw_href)]
            except(TypeError, IndexError):
                pass

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        if image_urls:
            item['image_urls'] = image_urls
        item['metadata'] = metadata

        yield item
Esempio n. 21
0
    def parse_product(self, response):
        """Parse one product page: follow swatch links, then emit the item.

        Every link matched in the swatch slider produces a ``Request`` back
        into this callback with a deep copy of the incoming
        ``response.meta['userdata']``.  The metadata is then completed from
        the ``fetch_*`` helpers and one ``ProductItem`` is yielded.  When
        ``fetch_model`` returns a falsy value no item is yielded for this
        page.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        swatch_links = sel.xpath('//div[@id="product-content-detail"]//div[@class="swatch-Slider"]/ul/li/a[@href]')
        for link in swatch_links:
            # Each variant request gets an independent copy of the metadata.
            variant_meta = copy.deepcopy(metadata)
            try:
                variant_url = self.process_href(
                    link.xpath('./@href').extract()[0], response.url)
            except (TypeError, IndexError):
                # Unusable link: skip this variant and carry on with the rest.
                continue

            yield Request(url=variant_url,
                          callback=self.parse_product,
                          errback=self.onerr,
                          meta={'userdata': variant_meta})

        metadata['url'] = response.url

        model = self.fetch_model(response)
        if not model:
            # Without a model there is nothing to emit for this page.
            return
        metadata['model'] = model

        # fetch_price returns a mapping; copy over whichever keys it provides.
        prices = self.fetch_price(response)
        for price_key in ('price', 'price_discount'):
            if price_key in prices:
                metadata[price_key] = prices[price_key]

        # Optional fields: stored only when the helper returns something truthy.
        optional_fields = (
            ('name', self.fetch_name),
            ('description', self.fetch_description),
            ('details', self.fetch_details),
            ('color', self.fetch_color),
        )
        for field, fetch in optional_fields:
            value = fetch(response)
            if value:
                metadata[field] = value

        # Collect slide image sources with their query string removed.  The
        # try spans the whole loop, so an error stops collection but keeps
        # the URLs gathered up to that point.
        image_urls = []
        try:
            for img in sel.xpath('//div[@id="pdpMain"]//ul[@class="product-slides-list"]/li/a/img[@src]'):
                src = re.sub(r'\?.*', '', img.xpath('./@src').extract()[0])
                if src:
                    image_urls.append(src)
        except (TypeError, IndexError):
            pass

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        if image_urls:
            item['image_urls'] = image_urls
        item['metadata'] = metadata

        yield item
    def parse_product(self, response):
        """Parse one product page: follow colour variants, then emit the item.

        Every anchor found in the second ``<dd>`` of the ``media-tabs`` list
        produces a ``Request`` back into this callback with a deep copy of
        the incoming ``response.meta['userdata']``.  The metadata is then
        completed from the ``fetch_*`` helpers and one ``ProductItem`` is
        yielded.  When ``fetch_model`` returns a falsy value no item is
        yielded for this page.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        # Visit the product pages of the other colours; the site gives each
        # colour its own product number.
        other_nodes = sel.xpath('//dl[@id="media-tabs"]/dd[2]//a')
        for other_node in other_nodes:
            m = copy.deepcopy(metadata)

            # The XPath above does not require @href, so an anchor without
            # one would make extract()[0] raise IndexError.  Skip such nodes
            # instead of aborting the whole callback.
            try:
                href = other_node.xpath('./@href').extract()[0]
                href = self.process_href(href, response.url)
            except (TypeError, IndexError):
                continue

            yield Request(url=href,
                          callback=self.parse_product,
                          errback=self.onerr,
                          meta={'userdata': m})

        metadata['url'] = response.url

        model = self.fetch_model(response)
        if model:
            metadata['model'] = model
        else:
            # Without a model there is nothing to emit for this page.
            return

        name = self.fetch_name(response)
        if name:
            metadata['name'] = name

            # Gender is guessed from the product name, with 'lady' passed as
            # an extra female keyword.
            gender = common.guess_gender(name,
                                         extra={
                                             'male': [],
                                             'female': ['lady']
                                         })
            if gender:
                metadata['gender'] = [gender]

        colors = self.fetch_color(response)
        if colors:
            metadata['color'] = colors

        description = self.fetch_description(response)
        if description:
            metadata['description'] = description

        detail = self.fetch_details(response)
        if detail:
            metadata['details'] = detail

        # Image URLs come from the "more-views" thumbnails; image_urls stays
        # None (and is left off the item) when none are present.
        image_urls = None
        image_nodes = sel.xpath(
            '//dl[@id="media-tabs"]/dd/div[@class="more-views"]/ul/li/a[@href]'
        )
        if image_nodes:
            image_urls = [
                self.process_href(val, response.url)
                for val in image_nodes.xpath('./@href').extract()
            ]

        # fetch_price returns a mapping; copy over whichever keys it provides.
        ret = self.fetch_price(response)
        if 'price' in ret:
            metadata['price'] = ret['price']
        if 'price_discount' in ret:
            metadata['price_discount'] = ret['price_discount']

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        if image_urls:
            item['image_urls'] = image_urls
        item['metadata'] = metadata

        yield item
Esempio n. 23
0
    def parse_details(self, response):
        """Parse a product detail page: follow other colours, then emit the item.

        Logs the URL at DEBUG level, then yields one ``Request`` back into
        this callback for each non-selected colour swatch, carrying a deep
        copy of ``response.meta['userdata']`` with ``color`` and ``url``
        overridden for that variant.  The metadata of the current page is
        then completed from the ``fetch_*`` helpers.  A ``ProductItem`` is
        yielded only when ``name``, ``details`` and ``description`` are all
        present in the metadata; otherwise an ``INVALID ITEM`` error is
        logged.  When ``fetch_model`` returns a falsy value the callback
        stops without yielding an item or logging that error.
        """
        self.log(u'PARSE_DETAILS: URL={0}'.format(response.url).encode('utf-8'),
                 level=log.DEBUG)
        metadata = response.meta['userdata']

        hxs = Selector(response)
        # Visit the other colour versions of this product.
        swatches = hxs.xpath(
            "//div[contains(@class,'colors')]/ul[contains(@class,'color-set')]"
            "/li[contains(@class,'color') and not(contains(@class,'color-selected'))]"
            "/a[@title and @data-color-link]")
        for swatch in swatches:
            variant_meta = copy.deepcopy(metadata)
            # The swatch title becomes the variant's (lower-cased) colour.
            title = unicodify(swatch.xpath('@title').extract()[0])
            variant_meta['color'] = [self.reformat(title).lower()]
            variant_url = self.process_href(
                swatch.xpath('@data-color-link').extract()[0], response.url)
            variant_meta['url'] = variant_url
            yield Request(url=variant_url,
                          callback=self.parse_details,
                          errback=self.onerr,
                          meta={'userdata': variant_meta})

        colors = self.fetch_color(response)
        if colors:
            metadata['color'] = colors

        model = self.fetch_model(response)
        if not model:
            # Without a model there is nothing to emit for this page.
            return
        metadata['model'] = model

        # fetch_price returns a mapping; copy over whichever keys it provides.
        prices = self.fetch_price(response)
        for price_key in ('price', 'price_discount'):
            if price_key in prices:
                metadata[price_key] = prices[price_key]

        # Optional fields: stored only when the helper returns something truthy.
        optional_fields = (
            ('description', self.fetch_description),
            ('details', self.fetch_details),
            ('name', self.fetch_name),
        )
        for field, fetch in optional_fields:
            value = fetch(response)
            if value:
                metadata[field] = value

        required = ('name', 'details', 'description')
        if not all(key in metadata for key in required):
            self.log(
                u'INVALID ITEM: {0}'.format(metadata['url']).encode('utf-8'),
                log.ERROR)
        else:
            sources = hxs.xpath(
                "//div[@class='product_detail_container']/div[@class='product_viewer']"
                "//ul[@class='product-media-set']/li[@class='product-image']/img[@src]/@src"
            ).extract()
            item = ProductItem()
            item['image_urls'] = [
                self.process_href(src, response.url) for src in sources
            ]
            item['url'] = metadata['url']
            item['model'] = metadata['model']
            item['metadata'] = metadata
            yield item