Example #1
    def bagPage(self, url):
        # Fetch the handbag landing page; bail out if the crawler returns nothing.
        page = self.crawler.getData(url, self.home_url)
        if not page: return

        # Pull the women's "手袋" (handbags) block out of the mega menu.
        tab_list_info = ''
        m = re.search(r'<li class="mega-menu-item">\s*<div.+?data-megaMenu="women".*?>\s*<span class="onlyML">手袋</span>.+?</div>\s*<div class="mega-menu-content">.+?<ul class="level3">(.+?)</ul>', page, flags=re.S)
        if m:
            tab_list_info = m.group(1).strip()

        # Collect (tab name, absolute URL) pairs for every sub-category.
        tab_list = []
        p = re.compile(r'<li class="mm-block">.+?<a.+?href="(.+?)">\s*<p class="mm-push-p">(.+?)</p>\s*</a>.+?</li>', flags=re.S)
        for tab_info in p.finditer(tab_list_info):
            tab_list.append((tab_info.group(2).strip(), self.home_url + tab_info.group(1)))
            print tab_info.group(2).strip(), self.home_url + tab_info.group(1)

        for tab in tab_list:
            refers = url
            tab_name, tab_url = tab
            print '# tab:', tab_name, tab_url
            tab_page = self.crawler.getData(tab_url, refers)
            # Follow the "pl-next" button until the last listing page is reached;
            # each step swaps the "/to-N" suffix for the next offset.
            m = re.search(r'<button.+?class="pl-next".+?data-next="(.+?)">', tab_page, flags=re.S)
            while m:
                refers = tab_url
                tab_url = re.sub(r'/to-\d+', '', tab_url) + "/to-%s" % m.group(1)
                tab_page = self.crawler.getData(tab_url, refers)
                m = re.search(r'<button.+?class="pl-next".+?data-next="(.+?)">', tab_page, flags=re.S)

            # Scrape every product tile on the (last) listing page.
            p = re.compile(r'<a id="sku_.*?" href="(.+?)".+?>.+?<div class="description">\s*<div class="productName toMinimize">(.+?)</div>\s*<div class="productPrice.+?data-htmlContent="(.+?)">\s*</div>\s*</div>', flags=re.S)
            for item in p.finditer(tab_page):
                i_url, i_name, s_price = item.group(1), item.group(2), item.group(3)
                print self.home_url + i_url, i_name, s_price
                # A leading "¥" is the only currency hint on the page.
                i_unit = ""
                if s_price.find("¥") != -1:
                    i_unit = "CNY"
                i_price = s_price.replace('¥', '').strip()

                if i_url:
                    # Product has its own detail page: queue the link for a later pass.
                    if Common.isBag(i_name):
                        self.link_list.append((tab_name, tab_url, i_name, self.home_url + i_url, i_price, i_unit))
                else:
                    # No detail link: emit the item straight from the listing data.
                    if Common.isBag(i_name):
                        bag_item = BagItem(self.brand_type)
                        bag_item.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, '')
                        self.items.append(bag_item.outItem())
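
The pagination in this example is driven entirely by the "pl-next" button: the listing is re-fetched with an incremented "/to-N" suffix until the button disappears. Below is a minimal standalone sketch of that loop, assuming a hypothetical fetch(url, referer) helper in place of self.crawler.getData:

import re

def follow_next_pages(fetch, first_url):
    # Walk a "/to-N" paginated listing until no "pl-next" button remains,
    # returning the URL and HTML of the last page. `fetch(url, referer)`
    # is an assumed helper that returns the page HTML as a string.
    referer, page_url = first_url, first_url
    page = fetch(page_url, referer)
    m = re.search(r'<button.+?class="pl-next".+?data-next="(.+?)">', page, flags=re.S)
    while m:
        referer = page_url
        # Drop any previous "/to-N" suffix before appending the next offset.
        page_url = re.sub(r'/to-\d+', '', page_url) + "/to-%s" % m.group(1)
        page = fetch(page_url, referer)
        m = re.search(r'<button.+?class="pl-next".+?data-next="(.+?)">', page, flags=re.S)
    return page_url, page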
Example #2
    def run_items(self, items_info, tab_name, tab_url):
        # Scrape every product tile: image, detail link, description and price block.
        p = re.compile(r'<.+?>\s*<a.+?>\s*<img.+?src="(.+?)".*?>\s*</a><a href="(.+?)">.+?<div class="infoDescription">(.+?)</div>\s*(<div class="infoPrice">.+?</div>).+?</a>\s*<.+?>', flags=re.S)
        for item in p.finditer(items_info):
            i_img, i_url, s_name, price_info = item.group(1), item.group(2), item.group(3), item.group(4)
            # Strip markup from the description to get a plain-text name.
            i_name = re.sub(r'<.+?>', '', s_name)
            # The discounted price sits in the "newprice" div; "¥" marks CNY.
            i_price, i_unit = '', ''
            m = re.search(r'<div.+?class="newprice">(.+?)</div>', price_info, flags=re.S)
            if m:
                s_price = re.sub(r'<.+?>', '', m.group(1))
                if s_price.find("¥") != -1:
                    i_unit = "CNY"
                i_price = s_price.replace('¥', '').strip()

            if i_url:
                # Product has a detail page: queue the link for a later pass.
                if Common.isBag(i_name) or Common.isBag(unquote(i_url)):
                    print self.home_url + i_url, i_img, i_name, i_price, i_unit
                    self.link_list.append((tab_name, tab_url, i_name, self.home_url + i_url, i_img, i_price, i_unit))
            else:
                # No detail link: emit the item straight from the listing data.
                if Common.isBag(i_name) or Common.isBag(unquote(i_url)):
                    print self.home_url + i_url, i_img, i_name, i_price, i_unit
                    bag_item = BagItem(self.brand_type)
                    bag_item.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, i_img)
                    self.items.append(bag_item.outItem())
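
The price handling above reduces to two steps: strip tags from the "newprice" div and treat a leading "¥" as CNY. A small sketch of that step in isolation (the function name and sample markup are made up for illustration):

# -*- coding: utf-8 -*-
import re

def parse_new_price(price_info):
    # Extract (price, currency) from an "infoPrice" block. The currency is
    # only ever guessed from a "¥" sign, exactly as in the example above.
    price, unit = '', ''
    m = re.search(r'<div.+?class="newprice">(.+?)</div>', price_info, flags=re.S)
    if m:
        s_price = re.sub(r'<.+?>', '', m.group(1))
        if u'¥' in s_price:
            unit = 'CNY'
        price = s_price.replace(u'¥', '').strip()
    return price, unit

print(parse_new_price(u'<div class="infoPrice"><div class="newprice">¥ 12,300</div></div>'))
# prints ('12,300', 'CNY') on Python 3; Python 2 shows the same values with a u'' prefix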
Example #3
    def bagPage(self, url):
        # Fetch the accessories landing page; bail out if the crawler returns nothing.
        page = self.crawler.getData(url, self.home_url)
        if not page: return

        # Collect (tab name, URL) pairs from the "女装配饰" (women's accessories) menu.
        tab_list = []
        m = re.search(r'<p>女装配饰</p>\s+<ul class="menu-level-4">(.+?)</ul>', page, flags=re.S)
        if m:
            tabs_list_info = m.group(1)

            p = re.compile(r'<li class="submenu-item">\s+<a.+?href="(.+?)">\s+<span lang="fr" class="lang-fr">(.+?)</span>(.+?)</a>\s+</li>', flags=re.S)
            for tab in p.finditer(tabs_list_info):
                tab_list.append((tab.group(2) + tab.group(3).strip(), tab.group(1)))

        for tab in tab_list:
            tab_name, tab_url = tab
            print '# tab:', tab_name, tab_url
            tab_page = self.crawler.getData(tab_url, url)

            # The product list is loaded lazily; request the first layer again as an ajax fragment.
            m = re.search(r'<div id="layer-1".+?>\s+<a href="(.+?)" class="layer-links">.+?</a>', tab_page, flags=re.S)
            if m:
                ajax_url = self.home_url + m.group(1) + "?ajax=true&fragment=true"
                ajax_data = self.crawler.getData(ajax_url, tab_url)

                if ajax_data:
                    #data = json.loads(ajax_data)
                    #if data and data.has_key("html"):
                    #    print data["html"].decode("unicode-escape")
                    # The response is a small JSON envelope; unescape it and pull
                    # the "html" field out by hand instead of parsing it as JSON.
                    r_data = ajax_data.decode("unicode-escape")
                    if r_data:
                        m = re.search(r'"html":"(.+?)"}', r_data, flags=re.S)
                        if m:
                            data_html = m.group(1).replace("\\/", "/")
                            # Each "lookbook-item" holds the name, reference number,
                            # detail link and lazy-loaded image of one product.
                            p = re.compile(r'<li class="lookbook-item line" data-idlook="\d+">\s+<div class="disp-n">.+?<div class="look-info article">\s+<p>(.+?)</p>.+?<p class="look-ref">(.+?)</p>.+?</div>.+?</div>\s+<a href="(.+?)".+?>.+?<img .+?data-src="(.+?)".*?/>\s+</li>', flags=re.S)
                            for item in p.finditer(data_html):
                                i_url, i_img, s_number, i_name = self.home_url + item.group(3), item.group(4), item.group(2), re.sub(r'<.+?>', '', item.group(1)).strip()
                                # The SKU reference is nested one more span level down.
                                i_number = ''
                                m = re.search(r'<span class="look-ref-sku">\s*<span.+?>(.+?)</span>\s*</span>', s_number, flags=re.S)
                                if m:
                                    i_number = m.group(1)
                                print i_url, i_img, i_name, i_number
                                if Common.isBag(i_name):
                                    self.link_list.append((tab_name, tab_url, i_name, i_url, i_img, i_number))
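
The ajax response here is a small JSON envelope whose "html" field carries the escaped product markup; the code above unescapes it by hand with unicode-escape and a regex, while the commented-out lines hint at going through json.loads instead. A rough sketch of that alternative, with an invented sample payload:

import json

def extract_items_html(ajax_data):
    # Parse the JSON envelope and return the embedded "html" fragment,
    # falling back to '' when the payload is missing or malformed.
    # json.loads also takes care of the \uXXXX and \/ escapes that the
    # example above handles manually.
    try:
        data = json.loads(ajax_data)
    except ValueError:
        return ''
    return data.get('html', '')

sample = '{"html":"<li class=\\"lookbook-item line\\" data-idlook=\\"1\\">...<\\/li>"}'
print(extract_items_html(sample))
# -> <li class="lookbook-item line" data-idlook="1">...</li>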