def outItemSql(self): return (Common.time_s(self.crawling_time), self.item_id, self.item_name, self.item_price, self.item_sellCount, self.item_url, self.seller_id, self.seller_name, self.shop_id, self.shop_name, self.shop_url, self.brand_id, self.brand_name, self.category_id, self.crawling_beginDate, self.crawling_beginHour)
def getPage(self, url): position = 1 i = 1 i_url = url refers = self.home_url max_page = 10 size_page = 48 while i <= max_page: page = self.crawler.getData(i_url, refers) refers = i_url i_url = url + '&bcoffset=1&s=%s' % str(i*size_page) i += 1 if not page or page == '': print 'not find data url:',i_url time.sleep(4) continue m = re.search(r'<script>\s+g_page_config = ({.+?});.+?</script>', page, flags=re.S) if m: page_config = m.group(1) page_config_s = re.sub(r'\n+','',page_config) data = json.loads(page_config_s) if data.has_key("mods"): if data["mods"].has_key("itemlist"): itemlist = data["mods"]["itemlist"] if itemlist.has_key("data"): itemlist_data = itemlist["data"] if itemlist_data.has_key("auctions"): for item in itemlist_data["auctions"]: item_id = position m = re.search(r'id=(\d+)', item["detail_url"], flags=re.S) if m: item_id = m.group(1) item_sales = item["view_sales"] m = re.search(r'(\d+)', item["view_sales"], flags=re.S) if m: item_sales = m.group(1) print Common.time_s(Common.now()), position, item_id, item["raw_title"], item["view_price"], item_sales, item["user_id"], item["nick"], "http:" + item["detail_url"], "http:" + item["shopLink"] self.mysqlAccess.insert_item((Common.time_s(Common.now()), str(item_id), str(position), str(item["raw_title"]), str(item["view_price"]), str(item_sales), "http:" + item["detail_url"], item["user_id"], str(item["nick"]), "http:" + item["shopLink"])) position += 1 time.sleep(4)
def outTuple(self): return (Common.time_s(self.crawling_time), self.brand_type, self.serie_title, self.item_title, self.item_name, self.item_price, self.item_unit, self.item_size, self.item_url, self.item_img, self.item_number, self.crawling_beginDate, self.crawling_beginHour)
def outItemSql(self): return (Common.time_s(self.crawling_time),self.item_id,self.item_name,self.item_price,self.item_sellCount,self.item_url,self.seller_id,self.seller_name,self.shop_id,self.shop_name,self.shop_url,self.brand_id,self.brand_name,self.category_id,self.crawling_beginDate,self.crawling_beginHour)