Example #1
0
    def TMItem(self):
        if self.item_url != '':
            page = self.crawler.getData(self.item_url, self.refers)
            if not page or page == '':
                raise Common.InvalidPageException(
                    "# TMItem: not find item page,itemid:%s,item_url:%s" %
                    (str(self.item_id), self.item_url))

            m = re.search(r'sellerId:"(\d+)",', page, flags=re.S)
            if m:
                self.seller_id = m.group(1)
            m = re.search(r'shopId:"(\d+)",', page, flags=re.S)
            if m:
                self.shop_id = m.group(1)
            m = re.search(
                r'<div class="slogo">\s*<a class="slogo-shopname" href="(.+?)".+?><strong>(.+?)</strong></a>',
                page,
                flags=re.S)
            if m:
                self.shop_url, self.shop_name = Common.fix_url(
                    m.group(1)), m.group(2).strip()

            m = re.search(r'TShop\.Setup\((.+?)\);', page, flags=re.S)
            if m:
                TShop_s = m.group(1).strip()
                m = re.search(r'"brand":"(.+?)",', TShop_s, flags=re.S)
                if m:
                    self.brand_name = Common.htmlDecode(m.group(1).strip())
                m = re.search(r'"brandId":"(\d+)",', TShop_s, flags=re.S)
                if m:
                    self.brand_id = m.group(1)
                m = re.search(r'"categoryId":"(\d+)",', TShop_s, flags=re.S)
                if m:
                    self.category_id = m.group(1)
                m = re.search(r'"sellerNickName":"(.+?)",',
                              TShop_s,
                              flags=re.S)
                if m:
                    self.seller_name = Common.urlDecode(m.group(1).strip())

                m = re.search(r'"initApi":"(.+?)",', TShop_s, flags=re.S)
                if m:
                    ts = "&callback=setMdskip&timestamp=%s" % str(
                        int(time.time() * 1000))
                    initapi_url = Common.fix_url(m.group(1).strip(
                    )) + ts + "&ref=%s" % Common.urlCode(self.refers)
                    init_page = self.crawler.getData(initapi_url,
                                                     self.item_url)
                    if not init_page and init_page == '':
                        print '# init page is null..'
                    else:
                        m = re.search(r'"sellCountDO":{"sellCount":(\d+),',
                                      init_page,
                                      flags=re.S)
                        if m:
                            self.item_sellCount = m.group(1)
Example #2
0
    def getPage(self, url, shop_home_url):
        position = 1
        i = 1
        max_page = 0

        asyn_url = ''
        i_url = url
        refers = shop_home_url
        result_s = self.get_asyn_data(i_url, refers, shop_home_url)
        m = re.search(r'<b class="ui-page-s-len">\d+/(\d+)</b>',
                      result_s,
                      flags=re.S)
        if m:
            max_page = int(m.group(1))
        print '# page num:', max_page
        while i <= max_page:
            m = re.search(
                r'<div class="J_TItems">(.+?)<div class="pagination">',
                result_s,
                flags=re.S)
            if m:
                items_s = m.group(1)
                p = re.compile(
                    r'<dl class=".+?".+?data-id="(.+?)">.+?<dd class="detail">\s*<a class="item-name".+?href="(.+?)".+?>(.+?)</a>\s*<div class="attribute">\s*<div class="cprice-area">\s*<span class="symbol">(.+?)</span>\s*<span\s*class="c-price">(.+?)</span>\s*</div>.+?</dl>'
                )
                j = 1
                for item in p.finditer(items_s):
                    item_id, url_s, item_name, price_symbol, price = item.group(
                        1), item.group(2), Common.htmlDecode(
                            item.group(3).strip()), item.group(
                                4).strip(), item.group(5).strip()
                    if url_s.find('http') == -1:
                        item_url = 'http:' + url_s
                    else:
                        item_url = url_s
                    print '### item ###'
                    print '# item val:', item_id, item_name, price, item_url
                    item = Item()
                    item.parserTM((item_id, item_name, price, item_url, i_url,
                                   self.begin_time))
                    print '# item info:', item.outItemSql()
                    self.mysqlAccess.insert_parser_item_info(item.outItemSql())
                    time.sleep(2)

            refers = i_url
            if i_url.find('pageNo=') == -1:
                i_url = re.sub(r'&tsearch=y',
                               '&pageNo=%d&tsearch=y#anchor' % i, refers)
            else:
                i_url = re.sub(r'&pageNo=\d+&', '&pageNo=%d&' % i, refers)

            i += 1
            time.sleep(2)
            result_s = self.get_asyn_data(i_url, refers, shop_home_url)
Example #3
0
 def get_asyn_data(self, i_url, refers, shop_home_url):
     """Return the decoded JSONP payload of a shop page's async search.

     Fetches i_url, extracts the hidden J_ShopAsynSearchURL input, requests
     that URL with a cache-busting _ksTS/jsonp135 query string, and returns
     the payload with escaped quotes unescaped ('' on any failure).
     """
     result_s = ''
     page = self.crawler.getData(i_url, refers)
     # Guard: getData may return None/'' -- re.search(None) would raise.
     if not page:
         return result_s
     m = re.search(r'<input id="J_ShopAsynSearchURL".+?value="(.+?)"\s*/>',
                   page,
                   flags=re.S)
     if m:
         ts = '?_ksTS=%s&callback=jsonp135&' % (str(int(
             time.time() * 1000)) + '_' + str(random.randint(100, 999)))
         a_url = shop_home_url + Common.htmlDecode(m.group(1))
         # Raw-string pattern + count=1: replace only the query-string '?'
         # (the original non-raw '\?' replaced every '?' in the URL).
         asyn_url = re.sub(r'\?', ts, a_url, count=1)
         result = self.crawler.getData(asyn_url, i_url)
         if result:
             m = re.search(r'jsonp135\("(.+?)"\)', result, flags=re.S)
             if m:
                 result_s = re.sub(r'\\"', '"', m.group(1))
     return result_s
Example #4
0
 def get_asyn_data(self, i_url, refers, shop_home_url):
     """Return the decoded JSONP payload of a shop page's async search.

     Fetches i_url, extracts the hidden J_ShopAsynSearchURL input, requests
     that URL with a cache-busting _ksTS/jsonp135 query string, and returns
     the payload with escaped quotes unescaped ('' on any failure).
     """
     result_s = ''
     page = self.crawler.getData(i_url, refers)
     # Guard: getData may return None/'' -- re.search(None) would raise.
     if not page:
         return result_s
     m = re.search(r'<input id="J_ShopAsynSearchURL".+?value="(.+?)"\s*/>',
                   page,
                   flags=re.S)
     if m:
         ts = '?_ksTS=%s&callback=jsonp135&' % (str(int(
             time.time() * 1000)) + '_' + str(random.randint(100, 999)))
         a_url = shop_home_url + Common.htmlDecode(m.group(1))
         # Raw-string pattern + count=1: replace only the query-string '?'
         # (the original non-raw '\?' replaced every '?' in the URL).
         asyn_url = re.sub(r'\?', ts, a_url, count=1)
         result = self.crawler.getData(asyn_url, i_url)
         if result:
             m = re.search(r'jsonp135\("(.+?)"\)', result, flags=re.S)
             if m:
                 result_s = re.sub(r'\\"', '"', m.group(1))
     return result_s
Example #5
0
File: Item.py Project: xzhoutxd/tb
    def TMItem(self):
        if self.item_url != '':
            page = self.crawler.getData(self.item_url, self.refers)
            if not page or page == '':
                raise Common.InvalidPageException("# TMItem: not find item page,itemid:%s,item_url:%s"%(str(self.item_id), self.item_url))

            m = re.search(r'sellerId:"(\d+)",', page, flags=re.S)
            if m:
                self.seller_id = m.group(1)
            m = re.search(r'shopId:"(\d+)",', page, flags=re.S)
            if m:
                self.shop_id = m.group(1)
            m = re.search(r'<div class="slogo">\s*<a class="slogo-shopname" href="(.+?)".+?><strong>(.+?)</strong></a>', page, flags=re.S)
            if m:
                self.shop_url, self.shop_name = Common.fix_url(m.group(1)), m.group(2).strip()

            m = re.search(r'TShop\.Setup\((.+?)\);', page, flags=re.S)
            if m:
                TShop_s = m.group(1).strip()
                m = re.search(r'"brand":"(.+?)",', TShop_s, flags=re.S)
                if m:
                    self.brand_name = Common.htmlDecode(m.group(1).strip())
                m = re.search(r'"brandId":"(\d+)",', TShop_s, flags=re.S)
                if m:
                    self.brand_id = m.group(1)
                m = re.search(r'"categoryId":"(\d+)",', TShop_s, flags=re.S)
                if m:
                    self.category_id = m.group(1)
                m = re.search(r'"sellerNickName":"(.+?)",', TShop_s, flags=re.S)
                if m:
                    self.seller_name = Common.urlDecode(m.group(1).strip())

                m = re.search(r'"initApi":"(.+?)",', TShop_s, flags=re.S)
                if m:
                    ts = "&callback=setMdskip&timestamp=%s" % str(int(time.time()*1000))
                    initapi_url = Common.fix_url(m.group(1).strip()) + ts + "&ref=%s" % Common.urlCode(self.refers)
                    init_page = self.crawler.getData(initapi_url, self.item_url)
                    if not init_page and init_page == '':
                        print '# init page is null..'
                    else:
                        m = re.search(r'"sellCountDO":{"sellCount":(\d+),', init_page, flags=re.S)
                        if m:
                            self.item_sellCount = m.group(1)
Example #6
0
    def getPage(self, url, shop_home_url):
        position = 1
        i = 1
        max_page = 0
       
        asyn_url = ''
        i_url = url
        refers = shop_home_url
        result_s = self.get_asyn_data(i_url, refers, shop_home_url)
        m = re.search(r'<b class="ui-page-s-len">\d+/(\d+)</b>', result_s, flags=re.S) 
        if m:
            max_page = int(m.group(1))
        print '# page num:', max_page
        while i <= max_page:
            m = re.search(r'<div class="J_TItems">(.+?)<div class="pagination">', result_s, flags=re.S)
            if m:
                items_s = m.group(1)
                p = re.compile(r'<dl class=".+?".+?data-id="(.+?)">.+?<dd class="detail">\s*<a class="item-name".+?href="(.+?)".+?>(.+?)</a>\s*<div class="attribute">\s*<div class="cprice-area">\s*<span class="symbol">(.+?)</span>\s*<span\s*class="c-price">(.+?)</span>\s*</div>.+?</dl>')
                j = 1
                for item in p.finditer(items_s):
                    item_id, url_s, item_name, price_symbol, price = item.group(1), item.group(2), Common.htmlDecode(item.group(3).strip()), item.group(4).strip(), item.group(5).strip()
                    if url_s.find('http') == -1:
                        item_url = 'http:' + url_s
                    else:
                        item_url = url_s
                    print '### item ###'
                    print '# item val:', item_id, item_name, price, item_url
                    item = Item()
                    item.parserTM((item_id, item_name, price, item_url, i_url, self.begin_time))
                    print '# item info:',item.outItemSql()
                    self.mysqlAccess.insert_parser_item_info(item.outItemSql())
                    time.sleep(2)
            
            refers = i_url
            if i_url.find('pageNo=') == -1:
                i_url = re.sub(r'&tsearch=y','&pageNo=%d&tsearch=y#anchor' % i, refers)
            else:
                i_url = re.sub(r'&pageNo=\d+&','&pageNo=%d&' % i, refers)

            i += 1
            time.sleep(2)
            result_s = self.get_asyn_data(i_url, refers, shop_home_url)