Example #1
0
    def bagPage(self, url):
        page = self.crawler.getData(url, self.home_url)
        if not page or page == '': return

        serie_title = '包袋'
        p = re.compile(r'<li data-position="\d+\s*" class="product isAvailable".+?data-category="(.+?)".+?>\s*<div class="prodContent"><div class="imagesContainer".+?>.+?<img.+?data-original="(.+?)".+?>.+?</div>\s*<div class="\s*productDescription\s*">\s*<a href="(.+?)".+?><h2.+?>(.+?)</h2>\s*</a>\s*<div class="price">.+?<span class="currency">(.+?)</span>.*?<span class="priceValue">(.+?)</span>.+?</li>', flags=re.S)
        for item in p.finditer(page):
            tab_name, i_img, i_url, i_name, s_unit, s_price = item.group(1).strip(),item.group(2),item.group(3),item.group(4).strip(),item.group(5),item.group(6)
            i_unit = ""
            if s_unit.find("¥") != -1:
                i_unit = "CNY"
            i_price = re.sub(r'<.+?>','',s_price).strip()
            print tab_name, i_img, self.home_url+i_url, i_name, i_unit, i_price
            
            if i_url and i_url != '':
                self.link_list.append((serie_title,tab_name,url,i_name,self.home_url+i_url,i_img,i_price,i_unit))
            else:
                i = BagItem(self.brand_type)
                i.initItem(serie_title, tab_name, i_name, i_price, i_unit, '', i_url, i_img)
                self.items.append(i.outItem())
        page_num = 2
        ajax_url = "http://www.dolcegabbana.com.cn/yeti/api/DOLCEEGABBANA_CN/searchIndented.json?page=2&sortRule=PriorityDescending&format=full&authorlocalized=&macro=1147&micro=&color=&look=&size=&gender=D&season=P%2CE&department=&brand=&heel=&heeltype=&wedge=&washtype=&washcode=&colortype=&fabric=&waist=&family=&structure=&environment=&author=&textSearch=&minPrice=&maxPrice=&occasion=&salesline=&prints=&stone=&material=&agerange=&productsPerPage=20&gallery=&macroMarchio=&modelnames=&GroupBy=&style=&site=DOLCEEGABBANA&baseurl=http://www.dolcegabbana.com.cn/searchresult.asp"
        a_url = re.sub('page=\d+&', 'page=%d&'%page_num, ajax_url)
        a_page = self.crawler.getData(a_url, url)
        result = self.ajax_item(a_page, url)
        while result: 
            page_num += 1
            a_url = re.sub('page=\d+&', 'page=%d&'%page_num, ajax_url)
            a_page = self.crawler.getData(a_url, url)
            result = self.ajax_item(a_page, url)
Example #2
0
    def bagPage(self, url):
        page = self.crawler.getData(url, self.home_url)
        if not page or page == '': return
       
        tab_list = []
        m = re.search(r'<ul class="tabsList collections">(.+?)</ul>', page, flags=re.S)
        if m:
            tabs_list_info = m.group(1)

            p = re.compile(r'<li class=".+?">\s+<a href="(.+?)" data-magento_call_page="(.+?)".+?>(.+?)</a>\s+</li>', flags=re.S)
            for tab in p.finditer(tabs_list_info):
                tab_list.append((tab.group(3).strip(),self.home_url+tab.group(2),url+tab.group(1)))

        for tab in tab_list:
            tab_name,tab_data_url,tab_url = tab
            print '# tab:',tab_name,tab_data_url,tab_url
            tab_page = self.crawler.getData(tab_data_url, url)

            p = re.compile(r'<li class="li-product.+?>\s+<a href="(.*?)" class="linkProduct">.+?<img src="(.+?)".+?/>.+?<span class="description".+?>.+?<span class="title">(.+?)</span>.+?</span>\s+</a>\s+</li>', flags=re.S)
            for item in p.finditer(tab_page):
                i_url, i_img, i_name = self.home_url+item.group(1), self.home_url+item.group(2), item.group(3)
                print i_url, i_img, i_name
                if i_url and i_url != '':
                    self.link_list.append((tab_name,tab_url,i_name,i_url,i_img))
                else:
                    i = BagItem(self.brand_type)
                    i.initItem(tab_name, '', i_name, '', '', '', i_url, self.home_url+i_img)
                    self.items.append(i.outItem())
Example #3
0
    def itemPage(self, val):
        serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val
        if i_url == "":
            return
        page = self.crawler.getData(i_url, refers)
        if not page or page == "":
            return

        m = re.search(r'<div class="product-title">(.+?)</div>', page, flags=re.S)
        if m:
            i_name = " ".join(m.group(1).strip().split())

        m = re.search(r'<div class="product-prices">(.+?)</div>', page, flags=re.S)
        if m:
            s_price = m.group(1).strip()
            if s_price.find("¥") != -1 or s_price.find("¥") != -1:
                i_unit = "CNY"
            i_price = s_price.replace("¥", "").replace("¥", "").strip()

        m = re.search(
            r'<div class="item-list">\s*<ul.+?>\s*<li class="first">.+?<img.+?src="(.+?)".*?/>', page, flags=re.S
        )
        if m:
            i_img = m.group(1)

        i_size = ""
        i_number = ""
        m = re.search(r'<div class="product-code">(.+?)型号代码(.+?)</div>', page, flags=re.S)
        if m:
            i_size, i_number = m.group(1).strip(), m.group(2).strip()

        i = BagItem(self.brand_type)
        i.initItem(serie_title, "", i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
        print "# itemPage:", i.outItem()
Example #4
0
    def itemPage(self, val):
        serie_title, refers, i_name, i_url = val
        
        page = self.crawler.getData(i_url, refers)
        if not page or page == '': return
        
        i_title, i_img, i_size, i_price, i_unit = '', '', '', '', ''
        
        m = re.search(r'<h2 class="" itemprop="name">(.+?)<br />(.+?)</h2>', page, flags=re.S)
        if m:
            i_title = m.group(1).strip()

        m = re.search(r'<li class="firstThumbnails">\s+<a href="#" class="active".+?>\s+<img src="(.+?)" alt="" />\s+</a>\s+</li>', page, flags=re.S)
        if m: i_img  = self.home_url + m.group(1)

        m = re.search(r'<div class="modText">\s+<h4.+?>说明</h4>\s+<p>(.+?)</p>\s+</div>', page, flags=re.S)
        if m: 
            i_desc = m.group(1)
            m = re.search(r'尺寸:(.+?)<br />', i_desc, flags=re.S)
            if m:
                i_size = m.group(1).strip()
            else:
                m = re.search(r'尺寸:(.+?)$', i_desc, flags=re.S)
                if m: i_size = m.group(1).strip()

        i_number = ''
        m = re.search(r'<div class="columns-wrapper">.+?<div class="column">.*?<div class="reference">\s*<p>(.+?)</p>\s*</div>', page, flags=re.S)
        if m:
            s_number = m.group(1)
            i_number = s_number.split('-')[1].strip()
                
        i = BagItem()
        i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
        self.items.append(i.outItem)    
        print '# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img
Example #5
0
    def itemPage(self, val):
        """Fetch one product page and build a BagItem, pricing each reference.

        ``val`` is a 5-tuple ``(serie_title, i_title, refers, i_name, i_url)``.
        Name and image come from the product page; the listing name is kept
        when the page offers neither a description nor a ``<title>``.  Every
        size and reference block found is joined with ``-``.  For each
        reference the pricing endpoint (``self.price_url``) is queried and
        the returned amounts are joined with ``-`` as well.

        All accumulation happens in local variables, so nothing leaks from
        one call into the next.  The method returns early, without building
        an item, if the product page or any pricing response is empty.
        """
        serie_title, i_title, refers, i_name, i_url = val

        page = self.crawler.getData(i_url, refers)
        if not page:
            return

        i_img, i_size, i_price, i_unit, i_number = '', '', '', '', ''

        m = re.search(r'<div class="product-details details".*?>.+?<p class="description info.*?">(.+?)</p>', page, flags=re.S)
        if m:
            i_name = re.sub(r'<.+?>', '', m.group(1)).strip()
        else:
            m = re.search(r'<title>(.+?)</title>', page, flags=re.S)
            if m:
                i_name = m.group(1).split('-')[0].strip()

        m = re.search(r'<div class="productimage.*?"><img src="(.+?)".*?/>', page, flags=re.S)
        if m:
            i_img = self.home_url + m.group(1)

        sizes = [size.group(1) for size in re.finditer(r'<p class="size info">(.+?)</p>', page, flags=re.S)]
        i_size = '-'.join(sizes)

        for number in re.finditer(r'<div class="ref info">\s*<p>(.+?)</p>', page, flags=re.S):
            item_number = number.group(1)
            i_number = i_number + '-' + item_number if i_number else item_number

            # The pricing key is the reference without its last
            # space-separated field and with the remaining spaces removed.
            ref_price = ''.join(item_number.split(' ')[:-1])

            p_url = self.price_url % ref_price
            data = self.crawler.getData(p_url, i_url)
            if not data:
                return

            # Extract the JSON payload from the JSONP callback wrapper.
            r = re.search(r'localJsonpPricingCallback\(\[(.+?)\]\)', data, flags=re.S)
            if not r:
                continue

            price, unit = '', ''
            try:
                js_data = json.loads(r.group(1))
                price, unit = js_data["price"]["amount"], js_data["price"]["currency-symbol"]
            except (ValueError, KeyError, TypeError):
                # Payload is not a single well-formed object: fall back to
                # picking the two fields out of the raw text.
                m = re.search(r'"amount":"(.+?)"', data, flags=re.S)
                if m:
                    price = m.group(1)
                m = re.search(r'"currency-symbol":"(.+?)"', data, flags=re.S)
                if m:
                    unit = m.group(1)

            if i_price != '':
                if price:
                    i_price += '-' + price
            else:
                if price:
                    i_price = price
                if unit:
                    i_unit = unit

        i = BagItem(self.brand_type)
        i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
Example #6
0
    def itemPage(self, val):
        serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val
        if i_url == '': return
        page = self.crawler.getData(i_url, refers)
        if not page or page == '': return

        m = re.search(r'<h2 class="productName">(.+?)</h2>', page, flags=re.S)
        if m:
            i_name = m.group(1).strip()

        m = re.search(
            r'<div id="zoomImageWrapper">\s*<img.+?src="(.+?)".*?/>\s*</div>',
            page,
            flags=re.S)
        if m:
            i_img = m.group(1)
        else:
            m = re.search(
                r'<div id="thumbsWrapper">.+?<div class="thumbElement".+?>\s*<img.+?src="(.+?)".*?/>\s*</div>',
                page,
                flags=re.S)
            if m:
                i_img = m.group(1)

        m = re.search(
            r'<span class="currency">(.+?)</span>.*?<span class="priceValue">(.+?)</span>',
            page,
            flags=re.S)
        if m:
            currency, i_price = m.group(1), re.sub(r'<.*>', '', m.group(2))
            if currency.find("¥") != -1:
                i_unit = "CNY"
            else:
                i_unit = currency

        m = re.search(r'<div class="attributes">(.+?)</div>', page, flags=re.S)
        if m:
            size_str = re.sub(r'<.*?>', '', m.group(1))
            #i_size = "".join(size_str.split())
            i_size = re.sub(r'\s*', '', size_str)
            print "".join(i_size.split())

        i_number = ''
        m = re.search(
            r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>',
            page,
            flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem()
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url,
                   i_img, i_number)
        self.items.append(i.outItem)
Example #7
0
    def itemPage(self, val):
        serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val
        if i_url == '': return
        page = self.crawler.getData(i_url, refers)
        if not page or page == '': return

        m = re.search(r'<h1 class="product-name">(.+?)</h1>', page, flags=re.S)
        if m:
            i_name = ' '.join(m.group(1).strip().split())

        m = re.search(
            r'<div class="product-prices">.+?<dd class="saleprice">(.+?)</dd>.+?</div>',
            page,
            flags=re.S)
        if m:
            s_price = m.group(1).strip()
            if s_price.find("¥") != -1 or s_price.find("¥") != -1:
                i_unit = "CNY"
            i_price = s_price.replace('¥', '').replace('¥', '').strip()

        m = re.search(
            r'<div class="image">.+?<div class="container".*?>\s*<table.+?>\s*<tr>\s*<td>\s*<a.+?>\s*<img.+?class="thumb".+?big="(.+?)".*?/>\s*</a>\s*</td>',
            page,
            flags=re.S)
        if m:
            i_img = m.group(1)

        i_size = ''
        m = re.search(
            r'<div class="tabpage inc".+?>.+?<span.*?>(尺寸大小.+?)</span>',
            page,
            flags=re.S)
        if m:
            s_size = m.group(1)
            i_size = s_size.split(':')[1]
        if i_size == '':
            m = re.search(r'<span.+?>尺寸大小:</span>(.+?)</span>',
                          page,
                          flags=re.S)
            if m:
                i_size = re.sub(r'<.+?>', '', m.group(1))

        i_number = ''
        m = re.search(
            r'<div class="base">\s*<div class="sku-brand">.+?<dl class="hidden"><dt>商品货号: </dt><dd>(.+?)</dd></dl>\s*</div>',
            page,
            flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url,
                   i_img, i_number)
        print '# itemPage:', i.outItem()
Example #8
0
    def itemPage(self, val):
        item_title, refers, i_name, i_url, i_img, i_price, i_unit = val
        if i_url == '': return
        page = self.crawler.getData(i_url, refers)
        if not page or page == '': return

        m = re.search(r'<div id="itemTechSheet">\s*<h1>(.+?)</h1>',
                      page,
                      flags=re.S)
        if m:
            i_name = m.group(1).strip()

        m = re.search(
            r'<div id="itemTechSheet">.+?<div class="price">(.+?)</div>',
            page,
            flags=re.S)
        if m:
            s_price = m.group(1).strip()
            if s_price.find("¥") != -1:
                i_unit = "CNY"
            i_price = re.sub(r'<.+?>', '', s_price).replace('¥', '').strip()

        m = re.search(
            r'<div id="itemImagesBox".*?>\s*<img.+?class="mainImage" src="(.+?)">',
            page,
            flags=re.S)
        if m:
            i_img = m.group(1)

        i_size = ''
        m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺寸.+?)</li>',
                      page,
                      flags=re.S)
        if m:
            i_size = m.group(1)
        else:
            m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺码.+?)</li>',
                          page,
                          flags=re.S)
            if m:
                i_size = m.group(1)

        i_number = ''
        m = re.search(
            r'<div id="itemTechSheet">.+?<p class="prodCode">(.+?)</p>',
            page,
            flags=re.S)
        if m:
            i_number = m.group(1).split(':')[1].strip()

        i = BagItem(self.brand_type)
        i.initItem('', item_title, i_name, i_price, i_unit, i_size, i_url,
                   i_img, i_number)
        print '# itemPage:', i.outItem()
Example #9
0
    def bagPage(self, url):
        page = self.crawler.getData(url, self.home_url)
        if not page or page == '': return

        p = re.compile(r'<div id="slot_\d+".+?<a.+?href="(.+?)".+?>\s*<img.+?src="(.+?)".+?/>\s*<div class="iteminfo">\s*<div class="headgroup">\s*<div class="extra">\s*<h1 class="modelname">(.+?)</h1>', flags=re.S)
        for item in p.finditer(page):
            i_url, i_img, i_name = item.group(1),item.group(2),item.group(3)
            print i_url, i_img, i_name
            if i_url and i_url != '':
                self.link_list.append(('',url,i_name,i_url,i_img))
            else:
                i = BagItem(self.brand_type)
                i.initItem('',url,i_name,i_url,i_img)
                self.items.append(i.outItem())

        p = re.compile(r'<div class="slot lazySlot".+?data-slot="(.+?)".+?>',flags=re.S)
        for item in p.finditer(page):
            data_info = item.group(1)
            data_info_str = data_info.replace('&quot;','"')
            i_url, i_img, i_name = '', '', ''
            m = re.search(r'"Link":"(.+?)"', data_info_str, flags=re.S)
            if m:
                i_url = m.group(1)

            m = re.search(r'"ModelName":"(.+?)",', data_info_str, flags=re.S)
            if m:
                i_name = m.group(1)

            print i_url, i_img, i_name
            if i_url and i_url != '':
                self.link_list.append(('',url,i_name,i_url,i_img))
            else:
                i = BagItem(self.brand_type)
                i.initItem('',url,i_name,i_url,i_img)
                self.items.append(i.outItem())
Example #10
0
 def ajax_item(self, page, refers):
     if not page or page == '': return False
     try:
         result = json.loads(page)
         if result.has_key("ApiResult"):
             r_ApiResult = result["ApiResult"]
             if r_ApiResult.has_key("Items"):
                 for item in r_ApiResult["Items"]:
                     tab_name, i_img, i_url, i_name, i_price = "", "", "", "", ""
                     if item.has_key("MicroCategory"):
                         tab_name = item["MicroCategory"].strip()
                     if item.has_key("DefaultCode10"):
                         item_code10 = item["DefaultCode10"]
                         if item.has_key("ImageTypes"):
                             if "12_f" in item["ImageTypes"]:
                                 i_img = "http://cdn.yoox.biz/55/%s_%s.jpg" % (
                                     item_code10, "12_f")
                             else:
                                 i_img = "http://cdn.yoox.biz/55/%s_%s.jpg" % (
                                     item_code10, max(item["ImageTypes"]))
                     if item.has_key("SingleSelectLink"):
                         i_url = self.home_url + item[
                             "SingleSelectLink"].strip()
                     if item.has_key("TitleAttribute"):
                         i_name = item["TitleAttribute"].strip()
                     if item.has_key("FullPrice"):
                         i_price = '{0:,}'.format(int(item["FullPrice"]))
                     i_unit = "CNY"
                     print tab_name, i_name, i_url, i_img, i_price, i_unit
                     if i_url and i_url != '':
                         self.link_list.append(
                             (tab_name, refers, i_name, i_url, i_img,
                              i_price, i_unit))
                     else:
                         i = BagItem(self.brand_type)
                         i.initItem('', tab_name, i_name, i_price, i_unit,
                                    '', i_url, i_img)
                         self.items.append(i.outItem())
         if result.has_key("Page"):
             r_Page = result["Page"]
             if r_Page.has_key("CurrentSearchPage") and r_Page.has_key(
                     "TotalPages"):
                 if int(r_Page["CurrentSearchPage"]) < int(
                         r_Page["TotalPages"]):
                     return True
         return False
     except Exception as e:
         print e
         return False
Example #11
0
    def itemPage(self, val):
        serie_title, refers, i_name, i_url = val

        page = self.crawler.getData(i_url, refers)
        if not page or page == '': return

        i_title, i_img, i_size, i_price, i_unit = '', '', '', '', ''

        m = re.search(r'<h2 class="" itemprop="name">(.+?)<br />(.+?)</h2>',
                      page,
                      flags=re.S)
        if m:
            i_title = m.group(1).strip()

        m = re.search(
            r'<li class="firstThumbnails">\s+<a href="#" class="active".+?>\s+<img src="(.+?)" alt="" />\s+</a>\s+</li>',
            page,
            flags=re.S)
        if m: i_img = self.home_url + m.group(1)

        m = re.search(
            r'<div class="modText">\s+<h4.+?>说明</h4>\s+<p>(.+?)</p>\s+</div>',
            page,
            flags=re.S)
        if m:
            i_desc = m.group(1)
            m = re.search(r'尺寸:(.+?)<br />', i_desc, flags=re.S)
            if m:
                i_size = m.group(1).strip()
            else:
                m = re.search(r'尺寸:(.+?)$', i_desc, flags=re.S)
                if m: i_size = m.group(1).strip()

        i_number = ''
        m = re.search(
            r'<div class="columns-wrapper">.+?<div class="column">.*?<div class="reference">\s*<p>(.+?)</p>\s*</div>',
            page,
            flags=re.S)
        if m:
            s_number = m.group(1)
            i_number = s_number.split('-')[1].strip()

        i = BagItem()
        i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size,
                   i_url, i_img, i_number)
        self.items.append(i.outItem)
        print '# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img
Example #12
0
 def run_items(self, items_info, tab_name, tab_url):
     p = re.compile(r'<li class="productlist-item ">\s*<div class="product-image".+?>\s*<a.+?><img src="(.+?)".+?/>\s*</a>\s*</div>.+?<div class="product-title">\s*<a href="(.+?)".+?>(.+?)</a>\s*</div>.+?<p>\s*<span class="product-price">(.+?)</span>\s*</p>\s*</li>', flags=re.S)
     for item in p.finditer(items_info):
         i_img, i_url, i_name, s_price = item.group(1),item.group(2),item.group(3),item.group(4)
         i_price, i_unit = '', ''
         if s_price.find("¥") != -1 or s_price.find("¥") != -1:
             i_unit = "CNY"
         i_price = s_price.replace('¥','').replace('¥','').strip()
         
         if i_url and i_url != '':
             print self.home_url+i_url, i_img, i_name, i_price, i_unit
             self.link_list.append((tab_name,tab_url,i_name,self.home_url+i_url,i_img,i_price,i_unit))
         else:
             print self.home_url+i_url, i_img, i_name, i_price, i_unit
             i = BagItem(self.brand_type)
             i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, i_img)
             self.items.append(i.outItem())
Example #13
0
    def bagPage(self, url):
        page = self.crawler.getData(url, self.home_url)
        if not page or page == '': return

        tab_list_info = ''
        m = re.search(r'<li class="mega-menu-item">\s*<div.+?data-megaMenu="women".*?>\s*<span class="onlyML">手袋</span>.+?</div>\s*<div class="mega-menu-content">.+?<ul class="level3">(.+?)</ul>', page, flags=re.S)
        if m:
            tab_list_info = m.group(1).strip()

        tab_list = []
        p = re.compile(r'<li class="mm-block">.+?<a.+?href="(.+?)">\s*<p class="mm-push-p">(.+?)</p>\s*</a>.+?</li>', flags=re.S)
        for tab_info in p.finditer(tab_list_info):
            tab_list.append((tab_info.group(2).strip(),self.home_url+tab_info.group(1)))
            print tab_info.group(2).strip(),self.home_url+tab_info.group(1)

        i = 0
        for tab in tab_list:
            refers = url
            tab_name, tab_url = tab
            print '# tab:',tab_name, tab_url
            tab_page = self.crawler.getData(tab_url, refers)
            m = re.search(r'<button.+?class="pl-next".+?data-next="(.+?)">',tab_page,flags=re.S)
            while m:
                refers = tab_url
                tab_url = re.sub(r'/to-\d+', '', tab_url)  + "/to-%s"%m.group(1)
                tab_page = self.crawler.getData(tab_url, refers)
                m = re.search(r'<button.+?class="pl-next".+?data-next="(.+?)">',tab_page,flags=re.S)

            p = re.compile(r'<a id="sku_.*?" href="(.+?)".+?>.+?<div class="description">\s*<div class="productName toMinimize">(.+?)</div>\s*<div class="productPrice.+?data-htmlContent="(.+?)">\s*</div>\s*</div>', flags=re.S)
            for item in p.finditer(tab_page):
                i_url, i_name, s_price = item.group(1),item.group(2),item.group(3)
                print self.home_url+i_url, i_name, s_price
                i_unit = ""
                if s_price.find("¥") != -1:
                    i_unit = "CNY"
                i_price = s_price.replace('¥','').strip()
                
                if i_url and i_url != '':
                    if Common.isBag(i_name):
                        self.link_list.append((tab_name,tab_url,i_name,self.home_url+i_url,i_price,i_unit))
                else:
                    if Common.isBag(i_name):
                        i = BagItem(self.brand_type)
                        i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, '')
                        self.items.append(i.outItem())
Example #14
0
    def itemPage(self, val):
        serie_title, refers, i_name, i_url, i_price, i_unit = val
        if i_url == '': return
        page = self.crawler.getData(i_url, refers)
        if not page or page == '': return
        
        m = re.search(r'<div class="productName title" id="productName">\s*<h1 itemprop="name">(.+?)</h1>\s*</div>', page, flags=re.S)
        if m:
            i_name = m.group(1).strip()

        m = re.search(r'<table class="priceButton">\s*<tr>\s*<td class="priceValue price-sheet">(.+?)</td>', page, flags=re.S)
        if m:
            i_price = m.group(1).strip()
            if i_price.find("¥") != -1:
                i_unit = "CNY"
    
        m = re.search(r'<noscript>\s*<img src="(.+?)".+?itemprop="image".*?/>\s*</noscript', page, flags=re.S)
        if m:
            i_img = m.group(1)

        i_size = ''
        m = re.search(r'<div class="textClientInfo exp_content".*?>\s*<div class="innerContent functional-text">(.+?)</div>', page, flags=re.S)
        if m:
            s_content = m.group(1).replace('&nbsp;','').strip()
            if s_content.find('宽)') != -1:
                s_size = s_content.split('宽)')[0]
                self.item_size = re.sub('<.+?>','',s_size) + "宽)"
            elif s_content.find('高)') != -1:
                s_size = s_content.split('高)')[0]
                self.item_size = re.sub('<.+?>','',s_size) + "高)"
            else:
                s_size = ''.join(s_content.split())

        i_number
        m = re.search(r'<h2 class="sku reading-and-link-text">(.+?)</h2>', page, flags=re.S)
        if m:
            i_number = m.group(1).strip()
        else:
            m = re.search(r'<meta itemprop="identifier" content="sku:(.+?)"/>', page, flags=re.S)
            if m:
                i_number = m.group(1).strip()

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
        print '# itemPage:',i.outItem()
Example #15
0
    def run_items(self, items_info, tab_name, tab_url):
        p = re.compile(
            r'<div class="large.+?columns.+?">\s*<a href="(.+?)">\s*<img.*?src="(.+?)".*?/><span class="prodcaption">(.+?)</br>(.+?)</span>\s*</a>',
            flags=re.S,
        )
        for item in p.finditer(items_info):
            i_url, i_img, i_name, s_price = item.group(1), item.group(2), item.group(3), item.group(4)
            i_price, i_unit = "", ""
            if s_price.find("¥") != -1 or s_price.find("¥") != -1:
                i_unit = "CNY"
            i_price = s_price.replace("¥", "").replace("¥", "").strip()

            if i_url and i_url != "":
                print self.home_url + i_url, i_img, i_name, i_price, i_unit
                self.link_list.append((tab_name, tab_url, i_name, self.home_url + i_url, i_img, i_price, i_unit))
            else:
                print self.home_url + i_url, i_img, i_name, i_price, i_unit
                i = BagItem(self.brand_type)
                i.initItem(tab_name, "", i_name, i_price, i_unit, "", i_url, i_img)
                self.items.append(i.outItem())
Example #16
0
    def itemPage(self, val):
        serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val
        if i_url == '': return
        page = self.crawler.getData(i_url, refers)
        if not page or page == '': return
        
        m = re.search(r'<h2 class="productName">(.+?)</h2>', page, flags=re.S)
        if m:
            i_name = m.group(1).strip()

        m = re.search(r'<div id="zoomImageWrapper">\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S)
        if m:
            i_img = m.group(1)
        else:
            m = re.search(r'<div id="thumbsWrapper">.+?<div class="thumbElement".+?>\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S)
            if m:
                i_img = m.group(1)

        m = re.search(r'<span class="currency">(.+?)</span>.*?<span class="priceValue">(.+?)</span>', page, flags=re.S)
        if m:
            currency, i_price = m.group(1), re.sub(r'<.*>','',m.group(2))
            if currency.find("¥") != -1:
                i_unit = "CNY"
            else:
                i_unit = currency

        m = re.search(r'<div class="attributes">(.+?)</div>', page, flags=re.S)
        if m:
            size_str = re.sub(r'<.*?>','',m.group(1))
            #i_size = "".join(size_str.split())
            i_size = re.sub(r'\s*','',size_str)
            print "".join(i_size.split())

        i_number = ''
        m = re.search(r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem()
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
        self.items.append(i.outItem)    
Example #17
0
 def ajax_item(self, page, refers):
     if not page or page == '': return False
     try:
         result = json.loads(page)
         if result.has_key("ApiResult"):
             r_ApiResult = result["ApiResult"]
             if r_ApiResult.has_key("Items"):
                 for item in r_ApiResult["Items"]:
                     tab_name, i_img, i_url, i_name, i_price = "", "", "", "", ""
                     if item.has_key("MicroCategory"):
                         tab_name = item["MicroCategory"].strip()
                     if item.has_key("DefaultCode10"):
                         item_code10 = item["DefaultCode10"]
                         if item.has_key("ImageTypes"):
                             if "12_f" in item["ImageTypes"]:
                                 i_img = "http://cdn.yoox.biz/55/%s_%s.jpg"%(item_code10,"12_f")
                             else:
                                 i_img = "http://cdn.yoox.biz/55/%s_%s.jpg"%(item_code10,max(item["ImageTypes"]))
                     if item.has_key("SingleSelectLink"):
                         i_url = self.home_url + item["SingleSelectLink"].strip()
                     if item.has_key("TitleAttribute"):
                         i_name = item["TitleAttribute"].strip()
                     if item.has_key("FullPrice"):
                         i_price = '{0:,}'.format(int(item["FullPrice"]))
                     i_unit = "CNY"
                     print tab_name,i_name,i_url,i_img,i_price,i_unit
                     if i_url and i_url != '':
                         self.link_list.append((tab_name,refers,i_name,i_url,i_img,i_price,i_unit))
                     else:
                         i = BagItem(self.brand_type)
                         i.initItem('', tab_name, i_name, i_price, i_unit, '', i_url, i_img)
                         self.items.append(i.outItem())
         if result.has_key("Page"):
             r_Page = result["Page"]
             if r_Page.has_key("CurrentSearchPage") and r_Page.has_key("TotalPages"):
                 if int(r_Page["CurrentSearchPage"]) < int(r_Page["TotalPages"]):
                     return True
         return False
     except Exception as e:
         print e
         return False
Example #18
0
    def itemPage(self, val):
        """Fetch one product page and build a BagItem from its details.

        ``val`` is a 5-tuple ``(serie_title, refers, i_name, i_url, i_img)``.
        Name and image found on the page override the values from ``val``.
        The size is assembled as ``label:value;`` pairs for height, depth,
        strap length and width, in that order, for each attribute present.
        The model number defaults to an empty string.  Price and unit are
        passed to the item as empty strings.
        """
        serie_title, refers, i_name, i_url, i_img = val
        if i_url == '':
            return
        page = self.crawler.getData(i_url, refers)
        if not page:
            return

        m = re.search(r'<h1 class="producTitle".+?>\s*<div class="modelName".+?>\s*<span class="modelName">(.+?)</span>', page, flags=re.S)
        if m:
            i_name = m.group(1).strip()

        # Prefer the main image and fall back to the background section.
        m = re.search(r'<div class="mainImage".+?>\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S)
        if not m:
            m = re.search(r'<section id="bgItem">\s*<img.+?src="(.+?)".*?/>\s*</section>', page, flags=re.S)
        if m:
            i_img = m.group(1)

        # One pattern per dimension; each captures the label and the value.
        size_patterns = (
            r'<div class="localizedAttributes">.*?<div class="height">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>',
            r'<div class="localizedAttributes">.*?<div class="depth">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>',
            r'<div class="localizedAttributes">.*?<div class="length_of_strap">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>',
            r'<div class="localizedAttributes">.*?<div class="width">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>\s*</div>',
        )
        i_size = ''
        for size_pattern in size_patterns:
            m = re.search(size_pattern, page, flags=re.S)
            if m:
                i_size += m.group(1) + ":" + m.group(2) + ";"

        # Initialise before use so a page without a model number yields an
        # empty string instead of an UnboundLocalError.
        i_number = ''
        m = re.search(r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, '', '', i_size, i_url, i_img, i_number)
        print '# itemPage:',i.outItem()
Example #19
0
    def itemPage(self, val):
        serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val
        if i_url == '': return
        page = self.crawler.getData(i_url, refers)
        if not page or page == '': return

        m = re.search(r'<div class="product-title">(.+?)</div>',
                      page,
                      flags=re.S)
        if m:
            i_name = ' '.join(m.group(1).strip().split())

        m = re.search(r'<div class="product-prices">(.+?)</div>',
                      page,
                      flags=re.S)
        if m:
            s_price = m.group(1).strip()
            if s_price.find("¥") != -1 or s_price.find("¥") != -1:
                i_unit = "CNY"
            i_price = s_price.replace('¥', '').replace('¥', '').strip()

        m = re.search(
            r'<div class="item-list">\s*<ul.+?>\s*<li class="first">.+?<img.+?src="(.+?)".*?/>',
            page,
            flags=re.S)
        if m:
            i_img = m.group(1)

        i_size = ''
        i_number = ''
        m = re.search(r'<div class="product-code">(.+?)型号代码(.+?)</div>',
                      page,
                      flags=re.S)
        if m:
            i_size, i_number = m.group(1).strip(), m.group(2).strip()

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url,
                   i_img, i_number)
        print '# itemPage:', i.outItem()
Example #20
0
    def crawl(self):
        while True:
            _data = None
            try:
                try:
                    # 取队列消息
                    _data = self.get_q()
                except Empty as e:
                    # 队列为空,退出
                    #print '# queue is empty', e
                    break

                _val = _data[1]
                item = BagItem(self.home_url, self.brand_type)
                item.antPage(_val)
                self.push_back(self.items, item.outItem())

                sql = item.outTuple()
                self.mysqlAccess.insert_item(sql)

                # 延时
                time.sleep(0.1)
                # 通知queue, task结束
                self.queue.task_done()

            except Exception as e:
                print 'Unknown exception crawl item :', e
                Common.traceback_log()
                self.crawlRetry(_data)
                # 通知queue, task结束
                self.queue.task_done()
                time.sleep(5)
Example #21
0
    def bagPage(self, url):
        page = self.crawler.getData(url, self.home_url)
        if not page or page == '': return

        tab_list = []
        m = re.search(r'<ul class="tabsList collections">(.+?)</ul>',
                      page,
                      flags=re.S)
        if m:
            tabs_list_info = m.group(1)

            p = re.compile(
                r'<li class=".+?">\s+<a href="(.+?)" data-magento_call_page="(.+?)".+?>(.+?)</a>\s+</li>',
                flags=re.S)
            for tab in p.finditer(tabs_list_info):
                tab_list.append(
                    (tab.group(3).strip(), self.home_url + tab.group(2),
                     url + tab.group(1)))

        for tab in tab_list:
            tab_name, tab_data_url, tab_url = tab
            print '# tab:', tab_name, tab_data_url, tab_url
            tab_page = self.crawler.getData(tab_data_url, url)

            p = re.compile(
                r'<li class="li-product.+?>\s+<a href="(.*?)" class="linkProduct">.+?<img src="(.+?)".+?/>.+?<span class="description".+?>.+?<span class="title">(.+?)</span>.+?</span>\s+</a>\s+</li>',
                flags=re.S)
            for item in p.finditer(tab_page):
                i_url, i_img, i_name = self.home_url + item.group(
                    1), self.home_url + item.group(2), item.group(3)
                print i_url, i_img, i_name
                if i_url and i_url != '':
                    self.link_list.append(
                        (tab_name, tab_url, i_name, i_url, i_img))
                else:
                    i = BagItem(self.brand_type)
                    i.initItem(tab_name, '', i_name, '', '', '', i_url,
                               self.home_url + i_img)
                    self.items.append(i.outItem())
Example #22
0
    def itemPage(self, val):
        serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val
        if i_url == '': return
        page = self.crawler.getData(i_url, refers)
        if not page or page == '': return

        m = re.search(r'<h1 class="product-name">(.+?)</h1>', page, flags=re.S)
        if m:
            i_name = ' '.join(m.group(1).strip().split())

        m = re.search(r'<div class="product-prices">.+?<dd class="saleprice">(.+?)</dd>.+?</div>', page, flags=re.S)
        if m:
            s_price = m.group(1).strip()
            if s_price.find("¥") != -1 or s_price.find("¥") != -1:
                i_unit = "CNY"
            i_price = s_price.replace('¥','').replace('¥','').strip()
    
        m = re.search(r'<div class="image">.+?<div class="container".*?>\s*<table.+?>\s*<tr>\s*<td>\s*<a.+?>\s*<img.+?class="thumb".+?big="(.+?)".*?/>\s*</a>\s*</td>', page, flags=re.S)
        if m:
            i_img = m.group(1)

        i_size = ''
        m = re.search(r'<div class="tabpage inc".+?>.+?<span.*?>(尺寸大小.+?)</span>', page, flags=re.S)
        if m:
            s_size = m.group(1)
            i_size = s_size.split(':')[1]
        if i_size == '':
            m = re.search(r'<span.+?>尺寸大小:</span>(.+?)</span>', page, flags=re.S)
            if m:
                i_size = re.sub(r'<.+?>','',m.group(1))

        i_number = ''
        m = re.search(r'<div class="base">\s*<div class="sku-brand">.+?<dl class="hidden"><dt>商品货号: </dt><dd>(.+?)</dd></dl>\s*</div>', page, flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
        print '# itemPage:',i.outItem()
Example #23
0
    def itemPage(self, val):
        item_title, refers, i_name, i_url, i_img, i_price, i_unit = val
        if i_url == '': return
        page = self.crawler.getData(i_url, refers)
        if not page or page == '': return
        
        m = re.search(r'<div id="itemTechSheet">\s*<h1>(.+?)</h1>', page, flags=re.S)
        if m:
            i_name = m.group(1).strip()

        m = re.search(r'<div id="itemTechSheet">.+?<div class="price">(.+?)</div>', page, flags=re.S)
        if m:
            s_price = m.group(1).strip()
            if s_price.find("¥") != -1:
                i_unit = "CNY"
            i_price = re.sub(r'<.+?>','',s_price).replace('¥','').strip()
            
        m = re.search(r'<div id="itemImagesBox".*?>\s*<img.+?class="mainImage" src="(.+?)">', page, flags=re.S)
        if m:
            i_img = m.group(1)

        i_size = ''
        m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺寸.+?)</li>', page, flags=re.S)
        if m:
            i_size = m.group(1)
        else:
            m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺码.+?)</li>', page, flags=re.S)
            if m:
                i_size = m.group(1)

        i_number = ''
        m = re.search(r'<div id="itemTechSheet">.+?<p class="prodCode">(.+?)</p>', page, flags=re.S)
        if m:
            i_number = m.group(1).split(':')[1].strip()

        i = BagItem(self.brand_type)
        i.initItem('', item_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
        print '# itemPage:',i.outItem()
Example #24
0
    def bagPage(self, url):
        page = self.crawler.getData(url, self.home_url)
        if not page or page == '': return

        p = re.compile(
            r'<div id="slot_\d+".+?<a.+?href="(.+?)".+?>\s*<img.+?src="(.+?)".+?/>\s*<div class="iteminfo">\s*<div class="headgroup">\s*<div class="extra">\s*<h1 class="modelname">(.+?)</h1>',
            flags=re.S)
        for item in p.finditer(page):
            i_url, i_img, i_name = item.group(1), item.group(2), item.group(3)
            print i_url, i_img, i_name
            if i_url and i_url != '':
                self.link_list.append(('', url, i_name, i_url, i_img))
            else:
                i = BagItem(self.brand_type)
                i.initItem('', url, i_name, i_url, i_img)
                self.items.append(i.outItem())

        p = re.compile(r'<div class="slot lazySlot".+?data-slot="(.+?)".+?>',
                       flags=re.S)
        for item in p.finditer(page):
            data_info = item.group(1)
            data_info_str = data_info.replace('&quot;', '"')
            i_url, i_img, i_name = '', '', ''
            m = re.search(r'"Link":"(.+?)"', data_info_str, flags=re.S)
            if m:
                i_url = m.group(1)

            m = re.search(r'"ModelName":"(.+?)",', data_info_str, flags=re.S)
            if m:
                i_name = m.group(1)

            print i_url, i_img, i_name
            if i_url and i_url != '':
                self.link_list.append(('', url, i_name, i_url, i_img))
            else:
                i = BagItem(self.brand_type)
                i.initItem('', url, i_name, i_url, i_img)
                self.items.append(i.outItem())
Example #25
0
    def run_items(self, items_info, tab_name, tab_url):
        p = re.compile(
            r'<div class="large.+?columns.+?">\s*<a href="(.+?)">\s*<img.*?src="(.+?)".*?/><span class="prodcaption">(.+?)</br>(.+?)</span>\s*</a>',
            flags=re.S)
        for item in p.finditer(items_info):
            i_url, i_img, i_name, s_price = item.group(1), item.group(
                2), item.group(3), item.group(4)
            i_price, i_unit = '', ''
            if s_price.find("¥") != -1 or s_price.find("¥") != -1:
                i_unit = "CNY"
            i_price = s_price.replace('¥', '').replace('¥', '').strip()

            if i_url and i_url != '':
                print self.home_url + i_url, i_img, i_name, i_price, i_unit
                self.link_list.append(
                    (tab_name, tab_url, i_name, self.home_url + i_url, i_img,
                     i_price, i_unit))
            else:
                print self.home_url + i_url, i_img, i_name, i_price, i_unit
                i = BagItem(self.brand_type)
                i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url,
                           i_img)
                self.items.append(i.outItem())
Example #26
0
    def run_items(self, items_info, tab_name, tab_url):
        p = re.compile(
            r'<li class="productlist-item ">\s*<div class="product-image".+?>\s*<a.+?><img src="(.+?)".+?/>\s*</a>\s*</div>.+?<div class="product-title">\s*<a href="(.+?)".+?>(.+?)</a>\s*</div>.+?<p>\s*<span class="product-price">(.+?)</span>\s*</p>\s*</li>',
            flags=re.S)
        for item in p.finditer(items_info):
            i_img, i_url, i_name, s_price = item.group(1), item.group(
                2), item.group(3), item.group(4)
            i_price, i_unit = '', ''
            if s_price.find("¥") != -1 or s_price.find("¥") != -1:
                i_unit = "CNY"
            i_price = s_price.replace('¥', '').replace('¥', '').strip()

            if i_url and i_url != '':
                print self.home_url + i_url, i_img, i_name, i_price, i_unit
                self.link_list.append(
                    (tab_name, tab_url, i_name, self.home_url + i_url, i_img,
                     i_price, i_unit))
            else:
                print self.home_url + i_url, i_img, i_name, i_price, i_unit
                i = BagItem(self.brand_type)
                i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url,
                           i_img)
                self.items.append(i.outItem())
Example #27
0
 def run_items(self, items_info, tab_name, tab_url):
     p = re.compile(r'<.+?>\s*<a.+?>\s*<img.+?src="(.+?)".*?>\s*</a><a href="(.+?)">.+?<div class="infoDescription">(.+?)</div>\s*(<div class="infoPrice">.+?</div>).+?</a>\s*<.+?>', flags=re.S)
     for item in p.finditer(items_info):
         i_img, i_url, s_name, price_info = item.group(1),item.group(2),item.group(3),item.group(4)
         i_name = re.sub(r'<.+?>','',s_name)
         i_price, i_unit = '', ''
         m = re.search(r'<div.+?class="newprice">(.+?)</div>', price_info, flags=re.S)
         if m:
             s_price = re.sub(r'<.+?>','',m.group(1))
             if s_price.find("¥") != -1:
                 i_unit = "CNY"
             i_price = s_price.replace('¥','').strip()
         
         if i_url and i_url != '':
             if Common.isBag(i_name) or Common.isBag(unquote(i_url)):
                 print self.home_url+i_url, i_img, i_name, i_price, i_unit
                 self.link_list.append((tab_name,tab_url,i_name,self.home_url+i_url,i_img,i_price,i_unit))
         else:
             if Common.isBag(i_name) or Common.isBag(unquote(i_url)):
                 print self.home_url+i_url, i_img, i_name, i_price, i_unit
                 i = BagItem(self.brand_type)
                 i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, i_img)
                 self.items.append(i.outItem())
Example #28
0
    def bagPage(self, url):
        page = self.crawler.getData(url, self.home_url)
        if not page or page == '': return

        serie_title = '包袋'
        p = re.compile(
            r'<li data-position="\d+\s*" class="product isAvailable".+?data-category="(.+?)".+?>\s*<div class="prodContent"><div class="imagesContainer".+?>.+?<img.+?data-original="(.+?)".+?>.+?</div>\s*<div class="\s*productDescription\s*">\s*<a href="(.+?)".+?><h2.+?>(.+?)</h2>\s*</a>\s*<div class="price">.+?<span class="currency">(.+?)</span>.*?<span class="priceValue">(.+?)</span>.+?</li>',
            flags=re.S)
        for item in p.finditer(page):
            tab_name, i_img, i_url, i_name, s_unit, s_price = item.group(
                1).strip(), item.group(2), item.group(3), item.group(
                    4).strip(), item.group(5), item.group(6)
            i_unit = ""
            if s_unit.find("¥") != -1:
                i_unit = "CNY"
            i_price = re.sub(r'<.+?>', '', s_price).strip()
            print tab_name, i_img, self.home_url + i_url, i_name, i_unit, i_price

            if i_url and i_url != '':
                self.link_list.append(
                    (serie_title, tab_name, url, i_name, self.home_url + i_url,
                     i_img, i_price, i_unit))
            else:
                i = BagItem(self.brand_type)
                i.initItem(serie_title, tab_name, i_name, i_price, i_unit, '',
                           i_url, i_img)
                self.items.append(i.outItem())
        page_num = 2
        ajax_url = "http://www.dolcegabbana.com.cn/yeti/api/DOLCEEGABBANA_CN/searchIndented.json?page=2&sortRule=PriorityDescending&format=full&authorlocalized=&macro=1147&micro=&color=&look=&size=&gender=D&season=P%2CE&department=&brand=&heel=&heeltype=&wedge=&washtype=&washcode=&colortype=&fabric=&waist=&family=&structure=&environment=&author=&textSearch=&minPrice=&maxPrice=&occasion=&salesline=&prints=&stone=&material=&agerange=&productsPerPage=20&gallery=&macroMarchio=&modelnames=&GroupBy=&style=&site=DOLCEEGABBANA&baseurl=http://www.dolcegabbana.com.cn/searchresult.asp"
        a_url = re.sub('page=\d+&', 'page=%d&' % page_num, ajax_url)
        a_page = self.crawler.getData(a_url, url)
        result = self.ajax_item(a_page, url)
        while result:
            page_num += 1
            a_url = re.sub('page=\d+&', 'page=%d&' % page_num, ajax_url)
            a_page = self.crawler.getData(a_url, url)
            result = self.ajax_item(a_page, url)
Example #29
0
    def itemPage(self, val):
        serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val
        if i_url == '': return
        page = self.crawler.getData(i_url, refers)
        if not page or page == '': return

        m = re.search(r'<div id="itemInfo">.+?<h1><span class="customItemDescription" itemprop="name">(.+?)</span></h1>', page, flags=re.S)
        if m:
            i_name = m.group(1).strip()

        m = re.search(r'<div id="itemPrice".+?><div.*?class="newprice">(.+?)</div>', page, flags=re.S)
        if m:
            s_price = m.group(1).strip()
            s_price = re.sub(r'<.+?>','',s_price)
            if s_price.find("¥") != -1:
                i_unit = "CNY"
                i_price = s_price.replace('¥','').strip()
            else:
                i_price = s_price
    
        m = re.search(r'<div id="mainImageContainer"><img.+?src="(.+?)".*?/></div>', page, flags=re.S)
        if m:
            i_img = m.group(1)

        i_size = ''
        m = re.search(r'<div class="itemDimensions">.+?<span class="dimensions">(.+?)</span></div>', page, flags=re.S)
        if m:
            i_size = m.group(1)

        i_number
        m = re.search(r'<div class="styleIdDescription">货号.+?<span.*?>(.+?)</span></div>', page, flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
        print '# itemPage:',i.outItem()
Example #30
0
    def bagPage(self):
        tab_list = [
                    ("giorgio armani","http://www.armani.cn/cn/giorgioarmani/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B"),
                    ("emporio armani","http://www.armani.cn/cn/emporioarmani/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B"),
                    ("armani jeans","http://www.armani.cn/cn/armanijeans/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B")]
        for tab in tab_list:
            tab_name,tab_data_url = tab
            print '# tab:',tab_name,tab_data_url
            tab_page = self.crawler.getData(tab_data_url, self.home_url)

            p = re.compile(r'<div class="item hproduct".+?>.+?<a href="(.+?)".+?class="url">\s*<div class="hproductPhotoCont">\s*<img.+?(src|data-original)="(.+?)".*?/>\s*</div>\s*</a>\s*<div class="itemDesc">\s*<a.+?>\s*<h3.+?>(.+?)</h3>\s*</a>.+?<div class="itemPrice">.+?<span class="prezzoProdottoSaldo".*?>(.+?)</span>\s*</div>.+?</div>', flags=re.S)
            for item in p.finditer(tab_page):
                i_url, i_img, i_name, s_price = self.home_url+item.group(1), item.group(2), item.group(4).strip(), item.group(5)
                print i_url, i_img, i_name, s_price
                i_unit = ""
                if s_price.find("¥") != -1:
                    i_unit = "CNY"
                i_price = s_price.replace('¥','').strip()
                if i_url and i_url != '':
                    self.link_list.append((tab_name,tab_data_url,i_name,i_url,i_img,i_price,i_unit))
                else:
                    i = BagItem(self.brand_type)
                    i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, i_img)
                    self.items.append(i.outItem())
Example #31
0
    def bagPage(self):
        tab_list = [
            ("giorgio armani",
             "http://www.armani.cn/cn/giorgioarmani/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B"
             ),
            ("emporio armani",
             "http://www.armani.cn/cn/emporioarmani/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B"
             ),
            ("armani jeans",
             "http://www.armani.cn/cn/armanijeans/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B"
             )
        ]
        for tab in tab_list:
            tab_name, tab_data_url = tab
            print '# tab:', tab_name, tab_data_url
            tab_page = self.crawler.getData(tab_data_url, self.home_url)

            p = re.compile(
                r'<div class="item hproduct".+?>.+?<a href="(.+?)".+?class="url">\s*<div class="hproductPhotoCont">\s*<img.+?(src|data-original)="(.+?)".*?/>\s*</div>\s*</a>\s*<div class="itemDesc">\s*<a.+?>\s*<h3.+?>(.+?)</h3>\s*</a>.+?<div class="itemPrice">.+?<span class="prezzoProdottoSaldo".*?>(.+?)</span>\s*</div>.+?</div>',
                flags=re.S)
            for item in p.finditer(tab_page):
                i_url, i_img, i_name, s_price = self.home_url + item.group(
                    1), item.group(2), item.group(4).strip(), item.group(5)
                print i_url, i_img, i_name, s_price
                i_unit = ""
                if s_price.find("¥") != -1:
                    i_unit = "CNY"
                i_price = s_price.replace('¥', '').strip()
                if i_url and i_url != '':
                    self.link_list.append((tab_name, tab_data_url, i_name,
                                           i_url, i_img, i_price, i_unit))
                else:
                    i = BagItem(self.brand_type)
                    i.initItem(tab_name, '', i_name, i_price, i_unit, '',
                               i_url, i_img)
                    self.items.append(i.outItem())
Example #32
0
    def itemPage(self, val):
        serie_title, refers, i_name, i_url, i_img = val
        if i_url == '': return
        page = self.crawler.getData(i_url, refers)
        if not page or page == '': return

        m = re.search(
            r'<h1 class="producTitle".+?>\s*<div class="modelName".+?>\s*<span class="modelName">(.+?)</span>',
            page,
            flags=re.S)
        if m:
            i_name = m.group(1).strip()

        m = re.search(
            r'<div class="mainImage".+?>\s*<img.+?src="(.+?)".*?/>\s*</div>',
            page,
            flags=re.S)
        if m:
            i_img = m.group(1)
        else:
            m = re.search(
                r'<section id="bgItem">\s*<img.+?src="(.+?)".*?/>\s*</section>',
                page,
                flags=re.S)
            if m:
                i_img = m.group(1)

        i_size = ''
        m = re.search(
            r'<div class="localizedAttributes">.*?<div class="height">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>',
            page,
            flags=re.S)
        if m:
            i_size += m.group(1) + ":" + m.group(2) + ";"
        m = re.search(
            r'<div class="localizedAttributes">.*?<div class="depth">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>',
            page,
            flags=re.S)
        if m:
            i_size += m.group(1) + ":" + m.group(2) + ";"
        m = re.search(
            r'<div class="localizedAttributes">.*?<div class="length_of_strap">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>',
            page,
            flags=re.S)
        if m:
            i_size += m.group(1) + ":" + m.group(2) + ";"
        m = re.search(
            r'<div class="localizedAttributes">.*?<div class="width">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>\s*</div>',
            page,
            flags=re.S)
        if m:
            i_size += m.group(1) + ":" + m.group(2) + ";"

        i_number
        m = re.search(
            r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>',
            page,
            flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, '', '', i_size, i_url, i_img,
                   i_number)
        print '# itemPage:', i.outItem()
Example #33
0
    def itemPage(self, val):
        """Scrape one product detail page, including its remote price lookup.

        val is the 5-tuple (serie_title, i_title, refers, i_name, i_url).
        refers is handed to the crawler as the second argument of getData.
        Name, image, sizes and reference numbers are read from the page; for
        each reference number a price is requested from self.price_url.
        """
        serie_title, i_title, refers, i_name, i_url = val

        page = self.crawler.getData(i_url, refers)
        if not page or page == '': return

        # Note: this resets i_name, discarding the value unpacked from val.
        i_name, i_img, ref_price, i_size, i_price, i_unit, i_number = '', '', '', '', '', '', ''

        # Name: description paragraph first, otherwise the part of <title>
        # before the first '-'.
        m = re.search(
            r'<div class="product-details details".*?>.+?<p class="description info.*?">(.+?)</p>',
            page,
            flags=re.S)
        if m: i_name = re.sub(r'<.+?>', '', m.group(1)).strip()
        else:
            m = re.search(r'<title>(.+?)</title>', page, flags=re.S)
            if m: i_name = m.group(1).split('-')[0].strip()

        m = re.search(r'<div class="productimage.*?"><img src="(.+?)".*?/>',
                      page,
                      flags=re.S)
        if m: i_img = self.home_url + m.group(1)

        p = re.compile(r'<p class="size info">(.+?)</p>', flags=re.S)
        for size in p.finditer(page):
            # NOTE(review): the test reads self.item_size, which this method
            # never assigns, while the value is accumulated in the local
            # i_size -- confirm which of the two was intended.
            if self.item_size != '': i_size += '-' + size.group(1)
            else: i_size = size.group(1)

        #m = re.search(r'<div class="ref info">\s*<p>(.+?)</p>', page, flags=re.S)
        #if m:
        p = re.compile(r'<div class="ref info">\s*<p>(.+?)</p>', flags=re.S)
        for number in p.finditer(page):
            item_number = number.group(1)
            # NOTE(review): reference numbers are joined onto
            # self.item_number, but the local i_number passed to initItem
            # below is never updated and stays '' -- confirm intent.
            if self.item_number != '': self.item_number += '-' + item_number
            else: self.item_number = item_number

            # The price lookup key is the reference with its last
            # space-separated token dropped and the rest joined together.
            refs = item_number.split(' ')[:-1]
            ref_price = ''.join(refs)

            p_url = self.price_url % ref_price
            data = self.crawler.getData(p_url, i_url)
            # A failed price request returns before any item is built.
            if not data or data == '': return

            # Extract the JSON payload from the JSONP callback wrapper.
            r = re.search(r'localJsonpPricingCallback\(\[(.+?)\]\)',
                          data,
                          flags=re.S)
            if r:
                price, unit = '', ''
                try:
                    js_data = json.loads(r.group(1))
                    price, unit = js_data["price"]["amount"], js_data["price"][
                        "currency-symbol"]
                except Exception as e:
                    # JSON parsing failed: pull the two fields out of the
                    # raw response text with regexes instead.
                    m = re.search(r'"amount":"(.+?)"', data, flags=re.S)
                    if m: price = m.group(1)
                    m = re.search(r'"currency-symbol":"(.+?)"',
                                  data,
                                  flags=re.S)
                    if m: unit = m.group(1)
                # NOTE(review): the test reads self.item_price, which this
                # method never assigns, while the value is accumulated in
                # the local i_price -- confirm which was intended.
                if self.item_price != '':
                    if price: i_price += '-' + price
                else:
                    if price: i_price = price
                    if unit: i_unit = unit

        i = BagItem(self.brand_type)
        i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size,
                   i_url, i_img, i_number)