def TMItem(self): if self.item_url != '': page = self.crawler.getData(self.item_url, self.refers) if not page or page == '': raise Common.InvalidPageException( "# TMItem: not find item page,itemid:%s,item_url:%s" % (str(self.item_id), self.item_url)) m = re.search(r'sellerId:"(\d+)",', page, flags=re.S) if m: self.seller_id = m.group(1) m = re.search(r'shopId:"(\d+)",', page, flags=re.S) if m: self.shop_id = m.group(1) m = re.search( r'<div class="slogo">\s*<a class="slogo-shopname" href="(.+?)".+?><strong>(.+?)</strong></a>', page, flags=re.S) if m: self.shop_url, self.shop_name = Common.fix_url( m.group(1)), m.group(2).strip() m = re.search(r'TShop\.Setup\((.+?)\);', page, flags=re.S) if m: TShop_s = m.group(1).strip() m = re.search(r'"brand":"(.+?)",', TShop_s, flags=re.S) if m: self.brand_name = Common.htmlDecode(m.group(1).strip()) m = re.search(r'"brandId":"(\d+)",', TShop_s, flags=re.S) if m: self.brand_id = m.group(1) m = re.search(r'"categoryId":"(\d+)",', TShop_s, flags=re.S) if m: self.category_id = m.group(1) m = re.search(r'"sellerNickName":"(.+?)",', TShop_s, flags=re.S) if m: self.seller_name = Common.urlDecode(m.group(1).strip()) m = re.search(r'"initApi":"(.+?)",', TShop_s, flags=re.S) if m: ts = "&callback=setMdskip×tamp=%s" % str( int(time.time() * 1000)) initapi_url = Common.fix_url(m.group(1).strip( )) + ts + "&ref=%s" % Common.urlCode(self.refers) init_page = self.crawler.getData(initapi_url, self.item_url) if not init_page and init_page == '': print '# init page is null..' else: m = re.search(r'"sellCountDO":{"sellCount":(\d+),', init_page, flags=re.S) if m: self.item_sellCount = m.group(1)
def getPage(self, url, shop_home_url): position = 1 i = 1 max_page = 0 asyn_url = '' i_url = url refers = shop_home_url result_s = self.get_asyn_data(i_url, refers, shop_home_url) m = re.search(r'<b class="ui-page-s-len">\d+/(\d+)</b>', result_s, flags=re.S) if m: max_page = int(m.group(1)) print '# page num:', max_page while i <= max_page: m = re.search( r'<div class="J_TItems">(.+?)<div class="pagination">', result_s, flags=re.S) if m: items_s = m.group(1) p = re.compile( r'<dl class=".+?".+?data-id="(.+?)">.+?<dd class="detail">\s*<a class="item-name".+?href="(.+?)".+?>(.+?)</a>\s*<div class="attribute">\s*<div class="cprice-area">\s*<span class="symbol">(.+?)</span>\s*<span\s*class="c-price">(.+?)</span>\s*</div>.+?</dl>' ) j = 1 for item in p.finditer(items_s): item_id, url_s, item_name, price_symbol, price = item.group( 1), item.group(2), Common.htmlDecode( item.group(3).strip()), item.group( 4).strip(), item.group(5).strip() if url_s.find('http') == -1: item_url = 'http:' + url_s else: item_url = url_s print '### item ###' print '# item val:', item_id, item_name, price, item_url item = Item() item.parserTM((item_id, item_name, price, item_url, i_url, self.begin_time)) print '# item info:', item.outItemSql() self.mysqlAccess.insert_parser_item_info(item.outItemSql()) time.sleep(2) refers = i_url if i_url.find('pageNo=') == -1: i_url = re.sub(r'&tsearch=y', '&pageNo=%d&tsearch=y#anchor' % i, refers) else: i_url = re.sub(r'&pageNo=\d+&', '&pageNo=%d&' % i, refers) i += 1 time.sleep(2) result_s = self.get_asyn_data(i_url, refers, shop_home_url)
def get_asyn_data(self, i_url, refers, shop_home_url):
    """Fetch the shop page at i_url, extract the async-search URL embedded
    in the J_ShopAsynSearchURL input, fetch it as JSONP and return the
    unescaped payload. Returns '' when any step yields nothing.
    """
    result_s = ''
    page = self.crawler.getData(i_url, refers)
    # ROBUSTNESS FIX: getData may return None/'' on a failed fetch;
    # re.search on None raises TypeError.
    if not page:
        return result_s
    m = re.search(r'<input id="J_ShopAsynSearchURL".+?value="(.+?)"\s*/>',
                  page, flags=re.S)
    if m:
        # Cache-busting timestamp + fixed jsonp callback name, e.g.
        # "?_ksTS=<millis>_<rand>&callback=jsonp135&".
        ts = '?_ksTS=%s&callback=jsonp135&' % (
            str(int(time.time() * 1000)) + '_' + str(random.randint(100, 999)))
        a_url = shop_home_url + Common.htmlDecode(m.group(1))
        # FIX: replace only the FIRST '?' (the query separator); the old
        # unbounded re.sub corrupted URLs containing further '?' characters.
        asyn_url = re.sub(r'\?', ts, a_url, count=1)
        result = self.crawler.getData(asyn_url, i_url)
        if result:
            m = re.search(r'jsonp135\("(.+?)"\)', result, flags=re.S)
            if m:
                result_s = re.sub(r'\\"', '"', m.group(1))
    return result_s
def get_asyn_data(self, i_url, refers, shop_home_url):
    """Return the unescaped JSONP body behind the shop's async-search URL
    (the J_ShopAsynSearchURL input on the page at i_url), or '' when the
    markers are not found.
    """
    payload = ''
    html = self.crawler.getData(i_url, refers)
    hit = re.search(r'<input id="J_ShopAsynSearchURL".+?value="(.+?)"\s*/>',
                    html, flags=re.S)
    if hit:
        # Millisecond timestamp plus a random suffix as a cache-buster,
        # with the fixed jsonp callback name the response wraps itself in.
        stamp = str(int(time.time() * 1000)) + '_' + str(random.randint(100, 999))
        ts = '?_ksTS=%s&callback=jsonp135&' % stamp
        raw_url = shop_home_url + Common.htmlDecode(hit.group(1))
        asyn_url = re.sub(r'\?', ts, raw_url)
        body = self.crawler.getData(asyn_url, i_url)
        hit = re.search(r'jsonp135\("(.+?)"\)', body, flags=re.S)
        if hit:
            payload = re.sub(r'\\"', '"', hit.group(1))
    return payload
def TMItem(self): if self.item_url != '': page = self.crawler.getData(self.item_url, self.refers) if not page or page == '': raise Common.InvalidPageException("# TMItem: not find item page,itemid:%s,item_url:%s"%(str(self.item_id), self.item_url)) m = re.search(r'sellerId:"(\d+)",', page, flags=re.S) if m: self.seller_id = m.group(1) m = re.search(r'shopId:"(\d+)",', page, flags=re.S) if m: self.shop_id = m.group(1) m = re.search(r'<div class="slogo">\s*<a class="slogo-shopname" href="(.+?)".+?><strong>(.+?)</strong></a>', page, flags=re.S) if m: self.shop_url, self.shop_name = Common.fix_url(m.group(1)), m.group(2).strip() m = re.search(r'TShop\.Setup\((.+?)\);', page, flags=re.S) if m: TShop_s = m.group(1).strip() m = re.search(r'"brand":"(.+?)",', TShop_s, flags=re.S) if m: self.brand_name = Common.htmlDecode(m.group(1).strip()) m = re.search(r'"brandId":"(\d+)",', TShop_s, flags=re.S) if m: self.brand_id = m.group(1) m = re.search(r'"categoryId":"(\d+)",', TShop_s, flags=re.S) if m: self.category_id = m.group(1) m = re.search(r'"sellerNickName":"(.+?)",', TShop_s, flags=re.S) if m: self.seller_name = Common.urlDecode(m.group(1).strip()) m = re.search(r'"initApi":"(.+?)",', TShop_s, flags=re.S) if m: ts = "&callback=setMdskip×tamp=%s" % str(int(time.time()*1000)) initapi_url = Common.fix_url(m.group(1).strip()) + ts + "&ref=%s" % Common.urlCode(self.refers) init_page = self.crawler.getData(initapi_url, self.item_url) if not init_page and init_page == '': print '# init page is null..' else: m = re.search(r'"sellCountDO":{"sellCount":(\d+),', init_page, flags=re.S) if m: self.item_sellCount = m.group(1)
def getPage(self, url, shop_home_url): position = 1 i = 1 max_page = 0 asyn_url = '' i_url = url refers = shop_home_url result_s = self.get_asyn_data(i_url, refers, shop_home_url) m = re.search(r'<b class="ui-page-s-len">\d+/(\d+)</b>', result_s, flags=re.S) if m: max_page = int(m.group(1)) print '# page num:', max_page while i <= max_page: m = re.search(r'<div class="J_TItems">(.+?)<div class="pagination">', result_s, flags=re.S) if m: items_s = m.group(1) p = re.compile(r'<dl class=".+?".+?data-id="(.+?)">.+?<dd class="detail">\s*<a class="item-name".+?href="(.+?)".+?>(.+?)</a>\s*<div class="attribute">\s*<div class="cprice-area">\s*<span class="symbol">(.+?)</span>\s*<span\s*class="c-price">(.+?)</span>\s*</div>.+?</dl>') j = 1 for item in p.finditer(items_s): item_id, url_s, item_name, price_symbol, price = item.group(1), item.group(2), Common.htmlDecode(item.group(3).strip()), item.group(4).strip(), item.group(5).strip() if url_s.find('http') == -1: item_url = 'http:' + url_s else: item_url = url_s print '### item ###' print '# item val:', item_id, item_name, price, item_url item = Item() item.parserTM((item_id, item_name, price, item_url, i_url, self.begin_time)) print '# item info:',item.outItemSql() self.mysqlAccess.insert_parser_item_info(item.outItemSql()) time.sleep(2) refers = i_url if i_url.find('pageNo=') == -1: i_url = re.sub(r'&tsearch=y','&pageNo=%d&tsearch=y#anchor' % i, refers) else: i_url = re.sub(r'&pageNo=\d+&','&pageNo=%d&' % i, refers) i += 1 time.sleep(2) result_s = self.get_asyn_data(i_url, refers, shop_home_url)