def initItem(self):
    # Item crawl settings
    self.crawling_time = Common.now()
    self.crawling_begintime = ''  # start time of this crawl
    self.crawling_beginDate = ''  # date of this crawl
    self.crawling_beginHour = ''  # hour of this crawl
    # Item attributes
    self.item_id = ''             # item ID
    self.item_name = ''           # item name
    self.item_price = ''          # item price
    self.item_url = ''            # item URL
    self.item_spuId = ''          # SPU ID
    self.item_sellCount = 0       # monthly sales count
    self.brand_name = ''
    self.brand_id = ''
    self.category_id = ''
    # Item pages
    self.item_page = None         # item front page
    # item html urls
    self.item_urls = []           # list of item page URLs
    # item html pages
    self.item_pages = {}          # map of fetched item pages
    # Deal records
    self.deal_url = ''
    self.deal_stopCrawl = False
    self.deal_deadLine = 0.0      # latest deal-record time from the previous crawl
    self.deal_deadLine2 = 0.0     # earliest deal-record time from this crawl
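# NOTE (illustrative sketch): deal_deadLine / deal_deadLine2 suggest an
# incremental-crawl cutoff for deal records. The helper below is a minimal
# sketch of that idea, assuming records arrive newest-first as
# (deal_time, deal_info) pairs; the function name and record format are
# hypothetical, only the three deal_* attributes come from initItem above.
def crawlDealsSketch(records, deal_deadLine):
    deal_stopCrawl = False
    deal_deadLine2 = 0.0  # earliest deal time seen in this crawl
    for deal_time, deal_info in records:
        if deal_time <= deal_deadLine:
            # reached records already covered by the previous crawl
            deal_stopCrawl = True
            break
        deal_deadLine2 = deal_time  # records are newest-first
    return deal_stopCrawl, deal_deadLine2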
def __init__(self, home_url, brand_type):
    # Crawl settings
    self.crawler = MyCrawler()
    self.crawling_time = Common.now()  # current crawl time
    self.crawling_beginDate = time.strftime("%Y-%m-%d", time.localtime(self.crawling_time))  # date of this crawl
    self.crawling_beginHour = time.strftime("%H", time.localtime(self.crawling_time))        # hour of this crawl
    # Brand site home URL
    self.home_url = home_url
    # Brand type
    self.brand_type = brand_type
    self.serie_title = ''
    self.item_title = ''
    self.item_name = ''
    self.item_price = ''
    self.item_unit = ''
    self.item_size = ''
    self.item_url = ''
    self.item_img = ''
    self.item_number = ''
def getPage(self, url):
    position = 1
    i = 1
    i_url = url
    refers = self.home_url
    max_page = 10   # number of result pages to crawl
    size_page = 48  # items per result page
    while i <= max_page:
        page = self.crawler.getData(i_url, refers)
        refers = i_url
        i_url = url + '&bcoffset=1&s=%s' % str(i * size_page)  # offset for the next page
        i += 1
        if not page:
            print 'no data found for url:', refers
            time.sleep(4)
            continue
        # The search results are embedded as a g_page_config JSON blob in a <script> tag.
        m = re.search(r'<script>\s+g_page_config = ({.+?});.+?</script>', page, flags=re.S)
        if m:
            page_config_s = re.sub(r'\n+', '', m.group(1))
            data = json.loads(page_config_s)
            auctions = data.get("mods", {}).get("itemlist", {}).get("data", {}).get("auctions", [])
            for item in auctions:
                # Fall back to the list position if no numeric ID is found in the URL.
                item_id = position
                m = re.search(r'id=(\d+)', item["detail_url"])
                if m:
                    item_id = m.group(1)
                # "view_sales" is a display string; keep only its number.
                item_sales = item["view_sales"]
                m = re.search(r'(\d+)', item["view_sales"])
                if m:
                    item_sales = m.group(1)
                print Common.time_s(Common.now()), position, item_id, item["raw_title"], item["view_price"], item_sales, item["user_id"], item["nick"], "http:" + item["detail_url"], "http:" + item["shopLink"]
                self.mysqlAccess.insert_item((Common.time_s(Common.now()), str(item_id), str(position), str(item["raw_title"]), str(item["view_price"]), str(item_sales), "http:" + item["detail_url"], item["user_id"], str(item["nick"]), "http:" + item["shopLink"]))
                position += 1
        time.sleep(4)
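# NOTE (illustrative sketch): getPage relies on Taobao search pages embedding
# their result set as a g_page_config JSON blob inside a <script> tag. The
# standalone example below exercises just that extraction step; the sample
# HTML and item values are fabricated, while the regex and the
# mods/itemlist/data/auctions path are the ones getPage uses.
import re
import json

sample_page = '''<script>
g_page_config = {"mods": {"itemlist": {"data": {"auctions": [
  {"raw_title": "demo item", "view_price": "9.90", "view_sales": "123 deals",
   "detail_url": "//item.taobao.com/item.htm?id=42", "user_id": "1",
   "nick": "demo-shop", "shopLink": "//shop.example.com"}
]}}}}; g_page_other = {};</script>'''

m = re.search(r'<script>\s+g_page_config = ({.+?});.+?</script>', sample_page, flags=re.S)
if m:
    data = json.loads(re.sub(r'\n+', '', m.group(1)))
    for item in data["mods"]["itemlist"]["data"]["auctions"]:
        item_id = re.search(r'id=(\d+)', item["detail_url"]).group(1)
        item_sales = re.search(r'(\d+)', item["view_sales"]).group(1)
        print item_id, item["raw_title"], item["view_price"], item_sales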
def __init__(self):
    # Crawl settings
    #self.crawler = MyCrawler()
    self.crawler = RetryCrawler()
    # db
    self.mysqlAccess = MysqlAccess()  # mysql access
    # Site home URL
    self.home_url = 'http://www.taobao.com'
    self.refers = None
    # Crawled item lists
    self.link_list = []
    self.items = []
    self.begin_time = Common.now()
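# NOTE (illustrative sketch): RetryCrawler appears to be a drop-in replacement
# for MyCrawler that retries failed fetches (note the commented-out swap above).
# This is a minimal sketch of such a wrapper, assuming the getData(url, refers)
# signature used in getPage; the retry limit, backoff, headers, and use of
# urllib2 are assumptions, not the repo's actual implementation.
import time
import urllib2

class RetryCrawlerSketch:
    def __init__(self, max_retries=3, backoff=4):
        self.max_retries = max_retries  # assumed retry limit
        self.backoff = backoff          # assumed wait between attempts (seconds)

    def getData(self, url, refers=None):
        headers = {'User-Agent': 'Mozilla/5.0'}
        if refers:
            headers['Referer'] = refers
        for attempt in range(self.max_retries):
            try:
                req = urllib2.Request(url, headers=headers)
                return urllib2.urlopen(req, timeout=30).read()
            except Exception, e:
                print 'fetch failed (%d/%d): %s %s' % (attempt + 1, self.max_retries, url, e)
                time.sleep(self.backoff)
        return ''  # callers like getPage treat an empty page as a failed fetch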