コード例 #1
0
ファイル: ysl_spider.py プロジェクト: haizi-zh/ofashion
    def parse(self, response):
        """Parse the site's top navigation menu.

        For each top-level menu entry that has a text link and a submenu
        panel, tag a metadata copy with category-0 (plus a guessed gender);
        then for each submenu link tag category-1 and yield a Request to
        the product-list callback carrying the copied metadata.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        # Top-level <li> items that contain both a non-empty link and a submenu panel.
        nav_nodes = sel.xpath(
            '//div[@id="menu"]/ul/li[child::a[@href][text()]][child::div[@class="submenuMask"]]'
        )
        for node in nav_nodes:
            try:
                tag_text = node.xpath('./a[@href][text()]/text()').extract()[0]
                tag_text = self.reformat(tag_text)
                tag_name = tag_text.lower()
            except (TypeError, IndexError):
                continue

            if tag_text and tag_name:
                # Deep copy so sibling categories never share tag state.
                m = copy.deepcopy(metadata)

                m['tags_mapping']['category-0'] = [{
                    'name': tag_name,
                    'title': tag_text
                }]

                gender = common.guess_gender(tag_name)
                if gender:
                    m['gender'] = [gender]

                sub_nodes = node.xpath(
                    './div[@class="submenuMask"]/ul/li/a[@href][text()]')
                for sub_node in sub_nodes:
                    try:
                        tag_text = sub_node.xpath('./text()').extract()[0]
                        tag_text = self.reformat(tag_text)
                        tag_name = tag_text.lower()
                    except (TypeError, IndexError):
                        continue

                    if tag_text and tag_name:
                        mc = copy.deepcopy(m)

                        mc['tags_mapping']['category-1'] = [{
                            'name': tag_name,
                            'title': tag_text
                        }]

                        gender = common.guess_gender(tag_name)
                        if gender:
                            mc['gender'] = [gender]

                        try:
                            href = sub_node.xpath('./@href').extract()[0]
                            href = self.process_href(href, response.url)
                        except (TypeError, IndexError):
                            continue

                        yield Request(url=href,
                                      callback=self.parse_product_list,
                                      errback=self.onerr,
                                      meta={'userdata': mc})
コード例 #2
0
ファイル: valentino_spider.py プロジェクト: haizi-zh/ofashion
    def parse_gender(self, response):
        """Handle the gender switcher, then fall through to category parsing.

        For every *other* gender link ("notSelGender") a Request is yielded
        into parse_cat1 with a metadata copy tagged with that gender; the
        currently selected gender ("selGender") is written into the shared
        metadata in place before parse_cat1 runs on this response.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        node_list = sel.xpath('//div[contains(@class,"switchGender")]')
        if node_list:
            for node in node_list[0].xpath(
                    './ul/li/a[@href and @class="notSelGender"]'):
                try:
                    tmp = self.reformat(
                        node.xpath('text()').extract()[0]).lower()
                except (TypeError, IndexError):
                    continue
                m = copy.deepcopy(metadata)
                gender = cm.guess_gender(tmp)
                if gender:
                    m['gender'] = [gender]
                yield Request(url=self.process_href(
                    node.xpath('@href').extract()[0], response.url),
                              callback=self.parse_cat1,
                              errback=self.onerr,
                              meta={'userdata': m})
            try:
                # Tag the in-place metadata with the currently selected gender.
                tmp = self.reformat(node_list[0].xpath(
                    './ul/li/span[@class="selGender"]/text()').extract()
                                    [0]).lower()
                gender = cm.guess_gender(tmp)
                if gender:
                    metadata['gender'] = [gender]
            except (TypeError, IndexError):
                pass

        # Continue parsing this page's categories with the (possibly updated) metadata.
        for val in self.parse_cat1(response):
            yield val
コード例 #3
0
    def parse(self, response):
        """Walk the global nav: tag category-0 from each top entry and
        category-1 from its child links, then request every child link's
        product list with the copied metadata."""
        meta_src = response.meta['userdata']
        sel = Selector(response)

        for top in sel.xpath('//div[contains(@class, "global-nav")]/ul/li'):
            try:
                top_title = self.reformat(
                    ' '.join(top.xpath('./a//text()').extract()))
                top_name = top_title.lower()
            except (TypeError, IndexError):
                continue
            if not (top_title and top_name):
                continue

            m0 = copy.deepcopy(meta_src)
            m0['tags_mapping']['category-0'] = [{
                'name': top_name,
                'title': top_title,
            }]
            g0 = common.guess_gender(top_name)
            if g0:
                m0['gender'] = [g0]

            # The xpath deliberately excludes the last nav entry's links:
            # that section holds no products.
            for sub in top.xpath('./div/div/ul/li[child::a[text()][@href]]'):
                try:
                    sub_title = self.reformat(
                        sub.xpath('./a/text()').extract()[0])
                    sub_name = sub_title.lower()
                except (TypeError, IndexError):
                    continue
                if not (sub_title and sub_name):
                    continue

                m1 = copy.deepcopy(m0)
                m1['tags_mapping']['category-1'] = [{
                    'name': sub_name,
                    'title': sub_title,
                }]
                g1 = common.guess_gender(sub_name)
                if g1:
                    m1['gender'] = [g1]

                try:
                    link = self.process_href(
                        sub.xpath('./a/@href').extract()[0], response.url)
                except (TypeError, IndexError):
                    continue

                yield Request(url=link,
                              callback=self.parse_product_list,
                              errback=self.onerr,
                              meta={'userdata': m1})
コード例 #4
0
    def parse(self, response):
        """Parse the pre-footer navigation: category-0 comes from each
        column header (<h4>), category-1 from the links below it; every
        second-level link yields a Request to the filter page."""
        metadata = response.meta['userdata']
        sel = Selector(response)

        # Footer nav columns that have a non-empty <h4> header.
        nav_nodes = sel.xpath('//div[@id="pre-footer"]/ul[@class="nav"]/li[child::h4[text()]]')
        for node in nav_nodes:
            try:
                tag_text = ''.join(
                    self.reformat(val)
                    for val in node.xpath('./h4//text()').extract()
                )
                tag_text = self.reformat(tag_text)
                tag_name = tag_text.lower()
            except(TypeError, IndexError):
                continue

            if tag_text and tag_name:
                m = copy.deepcopy(metadata)

                m['tags_mapping']['category-0'] = [
                    {'name': tag_name, 'title': tag_text, },
                ]

                gender = common.guess_gender(tag_name)
                if gender:
                    m['gender'] = [gender]

                sub_nodes = node.xpath('./ul/li[child::a[@href][text()]]')
                for sub_node in sub_nodes:
                    try:
                        tag_text = sub_node.xpath('./a/text()').extract()[0]
                        tag_text = self.reformat(tag_text)
                        tag_name = tag_text.lower()
                    except(TypeError, IndexError):
                        continue

                    if tag_text and tag_name:
                        mc = copy.deepcopy(m)

                        mc['tags_mapping']['category-1'] = [
                            {'name': tag_name, 'title': tag_text, },
                        ]

                        gender = common.guess_gender(tag_name)
                        if gender:
                            mc['gender'] = [gender]

                        try:
                            href = sub_node.xpath('./a/@href').extract()[0]
                            href = self.process_href(href, response.url)
                        except(TypeError, IndexError):
                            continue

                        yield Request(url=href,
                                      callback=self.parse_filter,
                                      errback=self.onerr,
                                      meta={'userdata': mc})
コード例 #5
0
ファイル: lacoste_spider.py プロジェクト: haizi-zh/ofashion
    def parse(self, response):
        """Parse the main navigation: one list-page Request per level-1 link.

        Level-0 categories come from the main nav items; each child link
        under ``nav_category_list`` yields a Request to ``parse_list``
        carrying a deep-copied metadata dict tagged with category/gender.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        for node1 in sel.xpath('//ul[@class="mainNavi"]//li[contains(@class,"mainNavi_item")]'):
            try:
                tmp = node1.xpath('.//a[@href and contains(@class,"mainNavi_link")]/span/text()').extract()
                cat_title = self.reformat(tmp[0])
                cat_name = cat_title.lower()
            except (IndexError, TypeError):
                continue
            m1 = copy.deepcopy(metadata)
            m1['tags_mapping']['category-0'] = [{'title': cat_title, 'name': cat_name}]
            gender = cm.guess_gender(cat_name)
            if gender:
                m1['gender'] = [gender]

            for node2 in node1.xpath('.//ul[@class="nav_category_list"]/li/a[@href]'):
                try:
                    # BUGFIX: the href extraction was previously outside the
                    # try and could raise IndexError out of the whole
                    # callback; skip malformed nodes instead.
                    url = self.process_href(node2.xpath('@href').extract()[0], response.url)
                    tmp = node2.xpath('./span/text()').extract()
                    cat_title = self.reformat(tmp[0])
                    cat_name = cat_title.lower()
                except (IndexError, TypeError):
                    continue
                m2 = copy.deepcopy(m1)
                m2['tags_mapping']['category-1'] = [{'title': cat_title, 'name': cat_name}]
                yield Request(url=url, callback=self.parse_list, errback=self.onerr, meta={'userdata': m2})
コード例 #6
0
ファイル: swarovski_spider.py プロジェクト: haizi-zh/ofashion
    def parse(self, response):
        """Emit one category Request per anchor carrying a data-cat label."""
        metadata = response.meta['userdata']
        sel = Selector(response)

        for anchor in sel.xpath('//ul/li/a[@href and @data-cat]'):
            try:
                title = self.reformat(anchor.xpath('@data-cat').extract()[0])
                name = title.lower()
                target = self.process_href(
                    anchor.xpath('@href').extract()[0], response.url)
            except (IndexError, TypeError):
                continue

            meta_copy = copy.deepcopy(metadata)
            meta_copy['tags_mapping']['category-0'] = [
                {'title': title, 'name': name},
            ]
            guessed = cm.guess_gender(name)
            if guessed:
                meta_copy['gender'] = [guessed]

            yield Request(url=target,
                          meta={'userdata': meta_copy},
                          callback=self.parse_cat,
                          errback=self.onerr)
コード例 #7
0
 def parse(self, response):
     """Parse the category link list, then enqueue region-specific extras.

     Regular categories come from the "linksList" block; for region 'jp'
     a fixed set of shop URLs is added, otherwise the region-templated
     valentines-day URLs are used.
     """
     metadata = response.meta['userdata']
     # Handle the regular category section.
     link_extractor = SgmlLinkExtractor(restrict_xpaths=('//div[@class="linksList"]//a'))
     links = link_extractor.extract_links(response)
     for link in links:
         m = copy.deepcopy(metadata)
         url = link.url
         cat_title = link.text
         cat_name = cat_title.lower()
         m['tags_mapping']['category-0'] = [{'title': cat_title, 'name': cat_name}]
         gender = cm.guess_gender(cat_name)
         if gender:
             m['gender'] = [gender]
         yield Request(url=url, callback=self.parse_cat, errback=self.onerr, meta={'userdata': m})
     # Handle the region-specific extra section.
     region = metadata['region']
     if region == 'jp':
         extra_urls = [
             'http://www.paulsmith.co.jp/shop/gifts/products',
             'http://www.paulsmith.co.jp/shop/reserve/products',
             'http://www.paulsmith.co.jp/shop/sales/products',
             'http://www.paulsmith.co.jp/shop/paulsmithcollection/products'
         ]
         for url in extra_urls:
             m = copy.deepcopy(metadata)
             yield Request(url=url, callback=self.parse_cat, errback=self.onerr, meta={'userdata': m})
     else:
         extra_urls = [
             'http://www.paulsmith.co.uk/%s-en/shop/valentines-day-gifts/valentines-day-gifts-for-her' % region,
             'http://www.paulsmith.co.uk/%s-en/shop/valentines-day-gifts/valentines-day-gifts-for-him' % region,
         ]
         for url in extra_urls:
             m = copy.deepcopy(metadata)
             yield Request(url=url, callback=self.parse_cat, errback=self.onerr, meta={'userdata': m})
コード例 #8
0
    def parse_cat(self, response):
        """Tag category-0 from the main nav (skipping its first entry) and
        request each linked product list."""
        metadata = response.meta['userdata']
        sel = Selector(response)

        for link in sel.xpath(
                '//ul[@id="main-nav"]/li[position()>1]/a[@href][text()]'):
            try:
                title = self.reformat(link.xpath('./text()').extract()[0])
                name = title.lower()
            except (TypeError, IndexError):
                continue
            if not title or not name:
                continue

            m = copy.deepcopy(metadata)
            m['tags_mapping']['category-0'] = [{'name': name, 'title': title}]

            guessed = common.guess_gender(name)
            if guessed:
                m['gender'] = [guessed]

            try:
                target = self.process_href(
                    link.xpath('./@href').extract()[0], response.url)
            except (TypeError, IndexError):
                continue

            yield Request(url=target,
                          callback=self.parse_product_list,
                          errback=self.onerr,
                          meta={'userdata': m})
コード例 #9
0
    def parse_procut_list(self, response):
        """
        Parse a product-list page and yield one detail Request per product.

        NOTE(review): the "procut" typo in the name is kept on purpose —
        sibling callbacks invoke the method by this exact name.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        product_nodes = sel.xpath('//div[@class="category-view"]/div/a')
        for node in product_nodes:
            m = copy.deepcopy(metadata)

            name_node = node.xpath('.//h3[text()]')
            if name_node:
                # BUGFIX: guard the extraction (was an unguarded
                # extract()[0]); a node without extractable text must not
                # abort the whole listing.
                try:
                    name = self.reformat(
                        name_node.xpath('./text()').extract()[0])
                except (TypeError, IndexError):
                    pass
                else:
                    m['name'] = name

                    gender = common.guess_gender(name,
                                                 extra={
                                                     'male': [],
                                                     'female': ['lady']
                                                 })
                    if gender:
                        m['gender'] = [gender]

            # BUGFIX: skip anchors without a usable href instead of raising
            # IndexError (consistent with the sibling parse methods).
            try:
                href = node.xpath('./@href').extract()[0]
                href = self.process_href(href, response.url)
            except (TypeError, IndexError):
                continue

            yield Request(url=href,
                          callback=self.parse_product,
                          errback=self.onerr,
                          meta={'userdata': m},
                          dont_filter=True)
コード例 #10
0
    def parse(self, response):
        """Tag category-0 from each global-nav anchor's title and follow
        its link into parse_1."""
        metadata = response.meta['userdata']
        sel = Selector(response)

        anchors = sel.xpath(
            '//ul[@id="global-nav" or @id="rl-globalnav"]'
            '/li/a[@title and @href]')
        for anchor in anchors:
            try:
                title = self.reformat(anchor.xpath('@title').extract()[0])
                name = title.lower()
            except (IndexError, TypeError):
                continue

            m = copy.deepcopy(metadata)
            m['tags_mapping']['category-0'] = [{'name': name, 'title': title}]
            guessed = cm.guess_gender(name)
            if guessed:
                m['gender'] = [guessed]

            target = self.process_href(
                anchor.xpath('@href').extract()[0], response.url)
            yield Request(url=target,
                          meta={'userdata': m},
                          callback=self.parse_1,
                          errback=self.onerr,
                          dont_filter=True)
コード例 #11
0
 def parse(self, response):
     """Parse Burberry nav links when the URL matches a locale subdomain.

     Only runs when the URL looks like ``xx.burberry.com``; each open
     level-1 nav link is tagged as category-1 and followed into
     parse_category_1.
     """
     metadata = response.meta['userdata']
     m = re.search(r'([a-zA-Z]{2})\.burberry\.com', response.url)
     if m:
         hxs = Selector(response)
         for item in hxs.xpath(
                 "//div[@id='shared_sidebar']//div[@id='nav']//ul[@class='l-1-set']//li[@class='l-1-link "
                 "l-1-link-open']//li/a[@href and @title]"):
             href = item.xpath('@href').extract()[0]
             # Category slug: the href with all slashes stripped.
             # TODO What is cat?
             cat = self.reformat(re.sub(r'/', '', href)).lower()
             title = self.reformat(item.xpath('@title').extract()[0])
             # NOTE(review): this rebinding shadows the regex match above;
             # harmless since the match is no longer used, but confusing.
             m = copy.deepcopy(metadata)
             m['tags_mapping']['category-1'] = [{
                 'name': cat,
                 'title': title
             }]
             gender = cm.guess_gender(cat)
             if gender:
                 m['gender'] = [gender]
             yield Request(url=self.process_href(href, response.url),
                           meta={'userdata': m},
                           dont_filter=True,
                           callback=self.parse_category_1,
                           errback=self.onerr)
コード例 #12
0
    def parse(self, response):
        """Follow the linked entries in the sidebar's first column,
        tagging category-0 and a guessed gender on a metadata copy."""
        metadata = response.meta['userdata']
        sel = Selector(response)

        # Linked entries in the first section of the left sidebar.
        for link in sel.xpath('//div[@id="sidebar"]/ul/ul/li/a[@href]'):
            m = copy.deepcopy(metadata)

            try:
                title = self.reformat(link.xpath('./text()').extract()[0])
                name = title.lower()
            except(TypeError, IndexError):
                continue
            if not (title and name):
                continue

            m['tags_mapping']['category-0'] = [
                {'name': name, 'title': title, },
            ]

            guessed = common.guess_gender(name)
            if guessed:
                m['gender'] = [guessed]

            try:
                target = self.process_href(
                    link.xpath('./@href').extract()[0], response.url)
            except(TypeError, IndexError):
                continue

            yield Request(url=target,
                          callback=self.parse_filter1,
                          errback=self.onerr,
                          meta={'userdata': m})
コード例 #13
0
ファイル: sisley_spider.py プロジェクト: haizi-zh/ofashion
    def parse(self, response):
        """Parse the header menu: category-0 from each top entry and
        category-1 from each submenu column title. At the second level a
        guessed gender is merged into any existing gender set, not
        overwritten."""
        metadata = response.meta['userdata']
        sel = Selector(response)

        for node1 in sel.xpath('//li/a[contains(@class,"header_menu")]'):
            try:
                tmp = node1.xpath('./*/text()').extract()
                cat_title = self.reformat(tmp[0])
                cat_name = cat_title.lower()
            except (IndexError, TypeError):
                continue
            m1 = copy.deepcopy(metadata)
            m1['tags_mapping']['category-0'] = [{
                'title': cat_title,
                'name': cat_name
            }]
            gender = cm.guess_gender(cat_name)
            if gender:
                m1['gender'] = [gender]

            for node2 in node1.xpath(
                    '../div[contains(@class,"submenu")]/ul/li[contains(@class,"title_column")]/a[@href]'
            ):
                try:
                    tmp = node2.xpath('./*/text()').extract()
                    cat_title = self.reformat(tmp[0])
                    cat_name = cat_title.lower()
                except (IndexError, TypeError):
                    continue
                m2 = copy.deepcopy(m1)
                m2['tags_mapping']['category-1'] = [{
                    'title': cat_title,
                    'name': cat_name
                }]
                gender = cm.guess_gender(cat_name)
                if gender:
                    # Merge with any level-0 gender instead of replacing it.
                    if 'gender' in m2 and m2['gender']:
                        tmp = set(m2['gender'])
                        tmp.add(gender)
                        m2['gender'] = list(tmp)
                    else:
                        m2['gender'] = [gender]
                yield Request(url=self.process_href(
                    node2.xpath('@href').extract()[0], response.url),
                              callback=self.parse_grid,
                              errback=self.onerr,
                              meta={'userdata': m2})
コード例 #14
0
    def parse_collection(self, response):
        """Parse a collection overview page.

        Yields one Request per collection button (tagged category-1), then
        either follows the gender-teaser "view all" link or, if absent,
        parses the watches-finder product list on this page.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        collection_nodes = sel.xpath(
            '//table[@id="top-watches-list"]//tr[@class="top-list-buttons"]/td/a'
        )
        for node in collection_nodes:
            m = copy.deepcopy(metadata)

            try:
                tag_text = node.xpath('./@title').extract()[0]
                tag_text = self.reformat(tag_text)
                tag_name = tag_text.lower()
            except (TypeError, IndexError):
                continue

            if tag_text and tag_name:
                m['tags_mapping']['category-1'] = [
                    {
                        'name': tag_name,
                        'title': tag_text,
                    },
                ]

                gender = common.guess_gender(tag_name)
                if gender:
                    m['gender'] = [gender]

            try:
                href = node.xpath('./@href').extract()[0]
                href = self.process_href(href, response.url)
            except (TypeError, IndexError):
                continue

            yield Request(url=href,
                          callback=self.parse_product_list_collection,
                          errback=self.onerr,
                          meta={'userdata': m})

        # This may be either a men's/women's collection page or a
        # watch-finder page.
        view_all_node = sel.xpath('//div[@id="l-gender-teaser"]//a[@href]')
        if view_all_node:
            try:
                href = view_all_node.xpath('./@href').extract()[0]
                href = self.process_href(href, response.url)

                yield Request(url=href,
                              callback=self.parse_product_list_collection,
                              errback=self.onerr,
                              meta={'userdata': metadata})
            except (TypeError, IndexError):
                pass
        else:
            for val in self.parse_product_list_watchesfinder(response):
                yield val
コード例 #15
0
    def parse_details(self, response):
        """Build a ProductItem from a product detail page.

        Returns None implicitly when no model number can be fetched.
        Gender is re-guessed from the category-0 tag name; metadata is
        mutated in place and embedded in the returned item.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        model = self.fetch_model(response)
        if model:
            metadata['model'] = model
        else:
            # Without a model number the page cannot be identified as a product.
            return

        ret = self.fetch_price(response)
        if 'price' in ret:
            metadata['price'] = ret['price']
        if 'price_discount' in ret:
            metadata['price_discount'] = ret['price_discount']

        name = self.fetch_name(response)
        if name:
            metadata['name'] = name

        colors = self.fetch_color(response)
        if colors:
            metadata['color'] = colors

        description = self.fetch_description(response)
        if description:
            metadata['description'] = description

        image_urls = []
        for image_node in sel.xpath(
                '//article[@class="product"]/figure[@class="slider"]/img[@data-zoom-url]'
        ):
            tmp = image_node.xpath('./@data-zoom-url').extract()
            if tmp:
                # Placeholder zoom image: fall back to @src if it looks
                # like a real image file.
                if tmp[0] == '/static_assets/images/products/placeholders/standard.jpg':
                    tmp = image_node.xpath('./@src').extract()
                    if tmp and re.search(
                            r'\.(jpg|png|jpeg)', tmp[0], flags=re.IGNORECASE):
                        image_urls.append(
                            self.process_href(tmp[0], response.url))
                else:
                    image_urls.append(self.process_href(tmp[0], response.url))

        gender = cm.guess_gender(
            metadata['tags_mapping']['category-0'][0]['name'])
        if gender:
            metadata['gender'] = [gender]

        metadata['url'] = response.url
        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata

        return item
コード例 #16
0
ファイル: ferragamo_spider.py プロジェクト: haizi-zh/ofashion
    def parse_base(self, response, xpath_dict, sel=None, metadata=None, cat_level=0, is_leaf=None):
        """
        Recursively walk a category tree described by per-level xpaths.

        @param is_leaf: callable deciding whether the current node is a leaf
        @param response:
        @param xpath_dict: shape: {'cat_level_0': [xpath, xpath_extra], 'cat_level_1': xpath, ... 'cat_level_extra': xpath}
        @param sel: selector for the current node (defaults to the whole page)
        @param metadata: metadata accumulated so far (defaults to response meta)
        @param cat_level: current depth in the category tree
        """
        if not metadata:
            metadata = response.meta['userdata']
        if not sel:
            sel = Selector(response)
        if not is_leaf:
            is_leaf = lambda x: False
        if cat_level == 0:
            xpath, xpath_extra = xpath_dict['cat_level_0']
            node_list = sel.xpath(xpath)
            if xpath_extra:
                node_list.extend(sel.xpath(xpath_extra))
        else:
            # Deeper levels fall back to 'cat_level_extra' when no
            # explicit per-level xpath is given.
            cat_key = str.format('cat_level_{0}', cat_level)
            if cat_key in xpath_dict:
                xpath = xpath_dict[cat_key]
            else:
                xpath = xpath_dict['cat_level_extra']
            node_list = sel.xpath(xpath)

        if node_list and not is_leaf(sel):
            # Depth-first recursion: descend into each child branch.
            for node in node_list:
                try:
                    tag_title = self.reformat(node.xpath('text()').extract()[0])
                    tag_name = tag_title.lower()
                except (TypeError, IndexError):
                    continue
                m1 = copy.deepcopy(metadata)
                if cat_level == 0:
                    gender = cm.guess_gender(tag_name)
                    if gender:
                        m1['gender'] = [gender]
                m1['tags_mapping'][str.format('category-{0}', cat_level)] = [{'name': tag_name, 'title': tag_title}]

                for val in self.parse_base(response, xpath_dict, node, m1, cat_level + 1, is_leaf=is_leaf):
                    yield val
        else:
            # Reached a leaf node: follow its href with the region-specific callback.
            tmp = sel.xpath('@href').extract()
            if tmp:
                yield Request(url=self.process_href(tmp[0], response.url),
                              callback=self.spider_data['callbacks'][metadata['region']][1], errback=self.onerr,
                              meta={'userdata': metadata})
コード例 #17
0
    def parse(self, response, metadata=None, current_node=None, level=0):
        """Recursively walk the sidebar menu.

        Entries recognized as genders add a gender tag without consuming a
        category level; other entries add category-{level}. When a branch
        has no further children, either parse the product grid on this
        page or follow the current node's href into parse_list.
        """
        if not metadata:
            metadata = response.meta['userdata']
        sel = Selector(response)
        if current_node:
            node_list = current_node.xpath('../ul/li/a[@href]')
        else:
            node_list = sel.xpath(
                '//*[@id="sidebarMenu"]/ul/li[contains(@class,"selected")]/a[@href]'
            )

        if node_list:
            for node1 in node_list:
                try:
                    tag_text = self.reformat(
                        node1.xpath('text()').extract()[0])
                    tag_name = tag_text.lower()
                except (IndexError, TypeError):
                    continue
                m1 = copy.deepcopy(metadata)
                gender = cm.guess_gender(tag_text)
                if gender:
                    # Gender entries do not advance the category level.
                    m1['gender'] = [gender]
                    new_level = level
                else:
                    m1['tags_mapping'][str.format('category-{0}', level)] = [{
                        'name':
                        tag_name,
                        'title':
                        tag_text
                    }]
                    new_level = level + 1
                for val in self.parse(response, m1, node1, new_level):
                    yield val

        else:
            prod_list = sel.xpath('//*[@id="elementsContainer"]')
            if prod_list:
                # Reached a product listing page.
                for val in self.parse_list(response, metadata):
                    yield val
            else:
                # Keep going: follow the current node's link.
                try:
                    url = self.process_href(
                        current_node.xpath('@href').extract()[0], response.url)
                    yield Request(url=url,
                                  callback=self.parse_list,
                                  errback=self.onerr,
                                  meta={'userdata': metadata})
                except (IndexError, TypeError):
                    pass
コード例 #18
0
    def parse_sub_nav(self, response):
        """
        Parse second-level categories (some categories, e.g. men, have them).

        Tags category-1 on a metadata copy, guesses gender from the tag,
        follows each link into parse_third_nav, then delegates the current
        page to parse_procut_list.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        sub_nav_nodes = sel.xpath(
            '//div[@id="main"]/div/div[contains(@class, "navigation")]/ul/li/ul/li/ul/li/a[text()]'
        )
        for sub_node in sub_nav_nodes:
            m = copy.deepcopy(metadata)

            # BUGFIX: guard the extraction like the sibling parse methods —
            # a malformed node must be skipped, not crash the callback.
            try:
                tag_text = self.reformat(
                    sub_node.xpath('./text()').extract()[0])
                tag_name = tag_text.lower()
            except (TypeError, IndexError):
                continue

            if tag_text and tag_name:
                m['tags_mapping']['category-1'] = [
                    {
                        'name': tag_name,
                        'title': tag_text,
                    },
                ]

                gender = common.guess_gender(tag_name,
                                             extra={
                                                 'male': [],
                                                 'female': ['lady']
                                             })
                if gender:
                    m['gender'] = [gender]

            # BUGFIX: the xpath has no @href predicate, so an anchor may
            # lack the attribute; previously this raised IndexError.
            try:
                href = sub_node.xpath('./@href').extract()[0]
                href = self.process_href(href, response.url)
            except (TypeError, IndexError):
                continue

            yield Request(url=href,
                          callback=self.parse_third_nav,
                          errback=self.onerr,
                          meta={'userdata': m})

        for val in self.parse_procut_list(response):
            yield val
コード例 #19
0
    def parse_productList(self, response):
        '''
        Parse the product list: one detail Request per product entry.
        '''
        metadata = response.meta['userdata']
        sel = Selector(response)

        product_list_nodes = sel.xpath('//div[@class="models-list"]//li')
        for node in product_list_nodes:
            # NOTE(review): the try spans the whole loop body, so an
            # extraction failure anywhere simply skips that product.
            try:
                m = copy.deepcopy(metadata)

                model_node = node.xpath('.//h5')
                if model_node:
                    model = model_node.xpath('.//div').extract()[0]
                    model = self.reformat(model)
                    if model:
                        m['model'] = model
                    else:
                        continue
                else:
                    # A product without a model number is not usable.
                    continue

                name_node = node.xpath('.//h5')
                if name_node:
                    nameText = name_node.xpath('./text()').extract()[0]
                    nameText = self.reformat(nameText)
                    if nameText:
                        m['name'] = nameText

                if m['name']:
                    gender = common.guess_gender(m['name'])
                    if gender:
                        m['gender'] = [gender]

                href = node.xpath('.//a/@href').extract()[0]
                href = self.process_href(href, response.url)

                yield Request(url=href,
                              callback=self.parse_product,
                              errback=self.onerr,
                              meta={'userdata': m})
            except (TypeError, IndexError):
                continue
コード例 #20
0
ファイル: mcqueen_spider.py プロジェクト: haizi-zh/ofashion
    def parse(self, response):
        """Parse the main menu: category-0 from first-level entries,
        category-1 from second-level links."""
        sel = Selector(response)
        metadata = response.meta['userdata']

        for node1 in sel.xpath(
                '//nav[@id="mainMenu"]/ul[contains(@class, "menuHeader") and '
                'contains(@class, "firstLevel")]/li'):
            tag_text = None
            # Prefer the data-main-menu attribute; fall back to link text.
            if 'data-main-menu' in node1._root.attrib:
                tag_text = self.reformat(
                    unicodify(node1._root.attrib['data-main-menu']))
            else:
                tmp = node1.xpath('./a[@href]')
                if tmp:
                    tag_text = self.reformat(unicodify(tmp[0]._root.text))
            if not tag_text:
                continue

            m1 = copy.deepcopy(metadata)
            m1['tags_mapping']['category-0'] = [{
                'name': tag_text.lower(),
                'title': tag_text
            }]
            gender = cm.guess_gender(tag_text.lower())
            if gender:
                m1['gender'] = [gender]

            for node2 in node1.xpath(
                    './ul[contains(@class,"secondLevel")]/li/a[@href]'):
                tag_text = self.reformat(unicodify(node2._root.text))
                if not tag_text:
                    continue

                # NOTE(review): copies the ORIGINAL metadata, not m1 — the
                # level-0 tag and gender set above are dropped for these
                # requests; confirm this is intentional.
                m2 = copy.deepcopy(metadata)
                m2['tags_mapping']['category-1'] = [{
                    'name': tag_text.lower(),
                    'title': tag_text
                }]
                m2['category'] = [tag_text.lower()]
                yield Request(url=self.process_href(node2._root.attrib['href'],
                                                    response.url),
                              callback=self.parse_cat1,
                              errback=self.onerr,
                              meta={'userdata': m2})
コード例 #21
0
    def parse_left_nav_collection(self, response):
        """Parse the collection list in the left navigation (entries shown
        next to the current category, apparently its children) and schedule
        one product-list request per entry.
        """

        metadata = response.meta['userdata']
        sel = Selector(response)

        sub_nodes = sel.xpath(
            '//div[@id="col_colizq"]/div[@id="col_list"]/ul/li[child::a[text()]]'
        )
        for sub_node in sub_nodes:
            m = copy.deepcopy(metadata)

            try:
                tag_text = sub_node.xpath('./a/text()').extract()[0]
                tag_text = self.reformat(tag_text)
                tag_name = tag_text.lower()
            except (TypeError, IndexError):
                continue

            if tag_text and tag_name:
                m['tags_mapping']['category-1'] = [
                    {
                        'name': tag_name,
                        'title': tag_text,
                    },
                ]

                # u'少女' ("girl") is treated as a female marker here.
                gender = common.guess_gender(tag_name, {
                    'male': [],
                    'female': [u'少女']
                })
                if gender:
                    m['gender'] = [gender]

            # Guard the href extraction: an entry without @href would
            # otherwise raise IndexError and abort the whole generator
            # (the other parse methods in this file all guard this step).
            try:
                href = sub_node.xpath('./a/@href').extract()[0]
                href = self.process_href(href, response.url)
            except (TypeError, IndexError):
                continue

            yield Request(url=href,
                          callback=self.parse_product_list,
                          errback=self.onerr,
                          meta={'userdata': m})
コード例 #22
0
    def parse_cat2_us(self, response):
        """Walk the US-site category grid and schedule one product-list
        request per category tile; the current page is also scanned for
        products directly.
        """
        meta_src = response.meta['userdata']
        hxs = Selector(response)

        # Each tile is an <a> wrapping an <img> whose @title names the category.
        tiles = hxs.xpath(
            '//div[@id="content"]/div[@id="categories"]/div[contains(@class, "category")]/a[@href][child::img[@title]]'
        )
        for tile in tiles:
            try:
                title = self.reformat(tile.xpath('./img/@title').extract()[0])
                name = title.lower()
            except (TypeError, IndexError):
                continue

            if not (title and name):
                continue

            data = copy.deepcopy(meta_src)
            data['tags_mapping']['category-2'] = [
                {
                    'name': name,
                    'title': title,
                },
            ]

            gender = cm.guess_gender(name)
            if gender:
                data['gender'] = [gender]

            try:
                link = self.process_href_for_us(
                    self.process_href(tile.xpath('./@href').extract()[0],
                                      response.url))
            except (TypeError, IndexError):
                continue

            yield Request(url=link,
                          callback=self.parse_product_list_us,
                          errback=self.onerr,
                          meta={'userdata': data})

        # This page may itself already list products.
        for val in self.parse_product_list_us(response):
            yield val
コード例 #23
0
    def parse_filter2(self, response):
        """Some categories expose a second-level filter in the sidebar,
        e.g. http://usa.agnesb.com/en/shopping_online/tous-produits/accessories/women-1
        Schedule one product-list request per sub-filter entry.
        """

        metadata = response.meta['userdata']
        sel = Selector(response)

        for link_node in sel.xpath(
                '//div[@id="sidebar"]/ul/ul/ul/ul/li/a[@href]'):
            try:
                title = self.reformat(link_node.xpath('./text()').extract()[0])
                name = title.lower()
            except (TypeError, IndexError):
                continue

            if not (title and name):
                continue

            m = copy.deepcopy(metadata)
            m['tags_mapping']['category-2'] = [
                {'name': name, 'title': title, },
            ]

            gender = common.guess_gender(name)
            if gender:
                m['gender'] = [gender]

            try:
                url = self.process_href(
                    link_node.xpath('./@href').extract()[0], response.url)
            except (TypeError, IndexError):
                continue

            yield Request(url=url,
                          callback=self.parse_product_list,
                          errback=self.onerr,
                          meta={'userdata': m})

        # The page itself may also list products.
        for val in self.parse_product_list(response):
            yield val
コード例 #24
0
    def parse_left_filter(self, response):
        """Parse the left-hand filter navigation and schedule one
        product-list request per third-level entry.
        """

        metadata = response.meta['userdata']
        sel = Selector(response)

        # Some categories expand to a third level, e.g. China / sale / women.
        nav_nodes = sel.xpath('//nav[@id="navMenu"]//ul//ul//ul//li//a[@href]')
        if not nav_nodes:  # fallback layout used by the US site
            nav_nodes = sel.xpath(
                '//div[@class="left-navigation"]//ul/li/ul/li/a[@href]')
        for node in nav_nodes:
            try:
                tag_text = node.xpath('./text()').extract()[0]
                tag_text = self.reformat(tag_text)
                tag_name = tag_text.lower()
            except (TypeError, IndexError):
                continue

            if tag_text and tag_name:
                m = copy.deepcopy(metadata)

                m['tags_mapping']['category-2'] = [
                    {
                        'name': tag_name,
                        'title': tag_text
                    },
                ]

                gender = common.guess_gender(tag_name)
                if gender:
                    # Fixed: gender is stored as a list everywhere else in
                    # this codebase; the original assigned the bare string.
                    m['gender'] = [gender]

                # Guard the href extraction: an empty extract() result would
                # otherwise raise IndexError and abort the whole generator.
                try:
                    href = node.xpath('./@href').extract()[0]
                    href = self.process_href(href, response.url)
                except (TypeError, IndexError):
                    continue

                yield Request(url=href,
                              callback=self.parse_product_list,
                              errback=self.onerr,
                              meta={'userdata': m})

        for val in self.parse_product_list(response):
            yield val
コード例 #25
0
ファイル: dunhill_spider.py プロジェクト: haizi-zh/ofashion
 def parse_cat(self, response):
     """Follow every link in the inner navigation, tagging each request
     with the link text as the category-0 tag.
     """
     metadata = response.meta['userdata']
     extractor = SgmlLinkExtractor(
         restrict_xpaths=('//div[@class="inner-nav-content"]//a'))
     for link in extractor.extract_links(response):
         title = link.text
         m = copy.deepcopy(metadata)
         m['tags_mapping']['category-0'] = [{
             'title': title,
             'name': title.lower()
         }]
         gender = cm.guess_gender(title.lower())
         if gender:
             m['gender'] = [gender]
         yield Request(url=link.url,
                       callback=self.parse_type,
                       errback=self.onerr,
                       meta={'userdata': m})
コード例 #26
0
    def parse(self, response):
        """Top navigation: each main-menu <li> becomes a category-0 tag and
        a left-nav page request.
        """

        metadata = response.meta['userdata']
        sel = Selector(response)

        nav_nodes = sel.xpath('//div[@id="main_menu_menu"]/ul/li')
        for node in nav_nodes:
            m = copy.deepcopy(metadata)

            try:
                # Menu titles may be split across several text nodes.
                tag_text = ''.join(
                    self.reformat(val)
                    for val in node.xpath('.//text()').extract())
                tag_text = self.reformat(tag_text)
                tag_name = tag_text.lower()
            except (TypeError, IndexError):
                continue

            if tag_text and tag_name:
                m['tags_mapping']['category-0'] = [
                    {
                        'name': tag_name,
                        'title': tag_text,
                    },
                ]

                # u'少女' ("girl") is treated as a female marker here.
                gender = common.guess_gender(tag_name, {
                    'male': [],
                    'female': [u'少女']
                })
                if gender:
                    m['gender'] = [gender]

            # Guard the href extraction: a <li> without a linked <a> would
            # otherwise raise IndexError and abort the whole generator.
            try:
                href = node.xpath('./a[@href]/@href').extract()[0]
                href = self.process_href(href, response.url)
            except (TypeError, IndexError):
                continue

            yield Request(url=href,
                          callback=self.parse_left_nav,
                          errback=self.onerr,
                          meta={'userdata': m})
コード例 #27
0
ファイル: maxmara_spider.py プロジェクト: haizi-zh/ofashion
    def parse_collection(self, response):
        """Parse a collection overview: each row is one collection
        (category-2) whose thumbnails link to product detail pages.
        """

        metadata = response.meta['userdata']
        sel = Selector(response)

        for row in sel.xpath('//div[@id="main"]/ul//div[@class="row"]'):
            try:
                title = self.reformat(row.xpath('.//h2/text()').extract()[0])
                name = title.lower()
            except (TypeError, IndexError):
                continue

            if not (title and name):
                continue

            m = copy.deepcopy(metadata)
            m['tags_mapping']['category-2'] = [
                {'name': name, 'title': title, },
            ]

            gender = common.guess_gender(name)
            if gender:
                m['gender'] = [gender]

            thumbs = row.xpath('.//div[@class="thumbnail"][child::a[@href]]')
            for thumb in thumbs:
                mc = copy.deepcopy(m)

                try:
                    url = self.process_href(
                        thumb.xpath('./a[@href]/@href').extract()[0],
                        response.url)
                except (TypeError, IndexError):
                    continue

                # dont_filter: the same product can appear in several rows.
                yield Request(url=url,
                              callback=self.parse_product,
                              errback=self.onerr,
                              meta={'userdata': mc},
                              dont_filter=True)
コード例 #28
0
    def parse(self, response):
        """Top-level menu: the trailing CSS class word of each level0 <li>
        becomes the category-0 tag name; the anchor's descendant texts,
        comma-joined, form the tag title.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        for node in sel.xpath(
                '//div[contains(@class,"main-menu")]//li[contains(@class,"level0")]'
        ):
            # Tag name = last word of the <li>'s class attribute.
            mt = re.search(r'\b(\w+)\s*$', node._root.attrib['class'])
            if not mt:
                continue
            tag_name = unicodify(mt.group(1)).lower()

            anchors = node.xpath('./a[@href]')
            if not anchors:
                continue
            anchor = anchors[0]._root
            href = anchor.attrib['href']
            tag_text = u', '.join([
                cm.html2plain(unicodify(d.text))
                for d in anchor.iterdescendants()
                if d.text and d.text.strip()
            ])

            m = copy.deepcopy(metadata)
            m['tags_mapping']['category-0'] = [{
                'name': tag_name,
                'title': tag_text
            }]
            gender = cm.guess_gender(tag_name)
            if gender:
                m['gender'] = [gender]

            # Skip entries whose href is empty or whitespace-only.
            if href and href.strip():
                yield Request(url=href,
                              meta={'userdata': m},
                              callback=self.parse_category_0)
コード例 #29
0
    def parse(self, response):
        """Watches navigation: each navigation element becomes a category-0
        tag and a collection-page request.
        """

        metadata = response.meta['userdata']
        sel = Selector(response)

        nav_nodes = sel.xpath(
            '//div[@class="l-watches-navigation"]/div/div[@class="navigation-element"]'
        )
        for node in nav_nodes:
            m = copy.deepcopy(metadata)

            try:
                # The <h3> title may span several text nodes.
                tag_text = ''.join(
                    self.reformat(val)
                    for val in node.xpath('./h3//text()').extract())
                tag_text = self.reformat(tag_text)
                tag_name = tag_text.lower()
            except (TypeError, IndexError):
                continue

            if tag_text and tag_name:
                m['tags_mapping']['category-0'] = [
                    {
                        'name': tag_name,
                        'title': tag_text,
                    },
                ]

                gender = common.guess_gender(tag_name)
                if gender:
                    m['gender'] = [gender]

            # Guard the href extraction: a navigation element without a link
            # would otherwise raise IndexError and abort the whole generator.
            try:
                href = node.xpath('.//a[@href]/@href').extract()[0]
                href = self.process_href(href, response.url)
            except (TypeError, IndexError):
                continue

            yield Request(url=href,
                          callback=self.parse_collection,
                          errback=self.onerr,
                          meta={'userdata': m})
コード例 #30
0
 def parse(self, response):
     """Tag every link inside the main section with the page heading as
     category-0 and schedule one detail-page request per link.
     """
     sel = Selector(response)
     # The heading may be split over several text nodes; one join suffices
     # (the original ''.join(''.join(...)) double-joined redundantly).
     cat_title = ''.join(
         sel.xpath('//div[@id="wrapperOuter"]/nav/h2//text()').extract())
     cat_name = cat_title.lower()
     link_extractor = SgmlLinkExtractor(
         restrict_xpaths=('//section[@id="main"]'))
     links = link_extractor.extract_links(response)
     metadata = response.meta['userdata']
     # The category tag and gender are identical for every link on the
     # page, so guess the gender once instead of per link.
     gender = cm.guess_gender(cat_name)
     for link in links:
         m = copy.deepcopy(metadata)
         m['tags_mapping']['category-0'] = [{
             'title': cat_title,
             'name': cat_name
         }]
         if gender:
             m['gender'] = [gender]
         yield Request(url=link.url,
                       callback=self.parse_details,
                       errback=self.onerr,
                       meta={'userdata': m})