class Ticket():
    '''A class of XC ticket'''
    def __init__(self):
        # page-crawl settings
        self.crawler = XCCrawler()
        self.crawling_time = Common.now()   # current crawl time
        self.crawling_begintime = ''        # start time of this crawl run
        self.crawling_beginDate = ''        # start date of this crawl run
        self.crawling_beginHour = ''        # start hour of this crawl run
        # item the ticket belongs to
        self.item_id = ''                   # item id
        self.item_name = ''                 # item name
        self.item_type = ''                 # item type
        # ticket type
        self.ticket_type = ''
        # ticket info
        self.ticket_id = ''                 # ticket id
        self.ticket_name = ''               # ticket name
        self.ticket_price = ''              # ticket price
        self.ticket_adprice = ''            # ticket promotion price
        self.ticket_unit_name = ''          # ticket unit (bundle / single / package, etc.)
        self.ticket_tag = ''                # ticket highlights
        # raw page data
        self.ticket_pages = {}
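# Illustrative sketch (not part of the original source): how Item.itemTicket()
# drives this class. The tuple layout mirrors the `val` built there; the row
# html in `t_data` below is a hypothetical placeholder.
def _demo_ticket_parse():
    t_data = '<td>...</td>'  # hypothetical <tr> body captured from the ticket table
    # (item_id, item_name, channel_type, previous ticket_type, ticket row id, row html, begin time)
    val = ('12345', 'Sample spot', 1, '', 1, t_data, Common.now())
    t = Ticket()
    t.antPage(val)      # parses t_data and fills the ticket_* fields
    return t.outSql()   # row tuple ready for a mysql insert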
def __init__(self):
    # xc spot type
    self.worker_type = Config.XC_Spot
    # DB
    self.xc_type = Config.XC_TYPE           # queue type
    self.mysqlAccess = MysqlAccess()        # mysql access
    self.redisQueue = RedisQueue()          # redis queue
    self.mongofsAccess = MongofsAccess()    # mongodb fs access
    # crawl settings
    self.crawler = XCCrawler()
    # message
    self.message = Message()
    # crawl time settings
    self.crawling_time = Common.now()       # current crawl time
    self.begin_time = Common.now()
    self.begin_date = Common.today_s()
    self.begin_hour = Common.nowhour_s()
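# A minimal sketch (assumption, not from the source) of what the Common time
# helpers used above are expected to return, inferred from the "%Y-%m-%d" and
# "%H" strftime formats that Channel.init and Item.spotConfig apply elsewhere:
import time

def _today_s(now=None):
    # e.g. '2015-08-01' -- same format as crawling_beginDate
    return time.strftime("%Y-%m-%d", time.localtime(now or time.time()))

def _nowhour_s(now=None):
    # e.g. '13' -- same format as crawling_beginHour
    return time.strftime("%H", time.localtime(now or time.time()))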
class Channel():
    '''A class of XC channel'''
    def __init__(self):
        # crawl settings
        self.crawler = XCCrawler()
        self.retrycrawler = RetryCrawler()
        self.crawling_time = Common.now()   # current crawl time
        self.crawling_time_s = Common.time_s(self.crawling_time)
        self.crawling_begintime = ''        # start time of this crawl run
        self.crawling_beginDate = ''        # start date of this crawl run
        self.crawling_beginHour = ''        # start hour of this crawl run
        # channel info
        self.platform = '携程-pc'           # platform the channel lives on (Ctrip PC site)
        self.channel_id = ''                # channel id
        self.channel_url = ''               # channel url
        self.channel_name = ''              # channel name
        self.channel_type = ''              # channel type
        # channel location
        self.province_id = 0                # province/state id
        self.province_name = ''             # province/state name
        # raw page data
        self.channel_page = ''              # channel page html
        self.channel_pages = {}             # responses fetched while on the channel page
        # channel items
        self.channel_items = []
        # channel list
        self.channel_list = []

    # channel page init
    def init(self, channel_id, channel_url, channel_type, begin_time):
        self.channel_id = channel_id
        self.channel_url = channel_url
        self.channel_type = channel_type
        self.crawling_begintime = begin_time
        self.crawling_beginDate = time.strftime("%Y-%m-%d", time.localtime(self.crawling_begintime))
        self.crawling_beginHour = time.strftime("%H", time.localtime(self.crawling_begintime))

    def config(self):
        self.channelPage()
        if int(self.channel_type) == 1:
            self.spot()
        #elif self.channel_type == 2:
        else:
            Common.log('# not find this channel type...')

    def spot(self):
        if self.channel_page:
            m = re.search(r'<div class="cate_select">(.+?)</div>', self.channel_page, flags=re.S)
            if m:
                cate_select = m.group(1)
                c_list = []
                p = re.compile(r'<a.+?class="select">(.+?)</a>', flags=re.S)
                for c in p.finditer(cate_select):
                    c_list.append(re.sub(r'<.+?>', '', c.group(1)).strip())
                self.channel_name = '-'.join(c_list)
            i_p = 1
            i_page = 1
            m_page = 1
            page_main = ''
            m = re.search(r'<div id="base_bd">.+?<div class="bg_miancolor">.+?<div class="vacation_bd">(.+?)<div class="vacation_bd bottom_seo">', self.channel_page, flags=re.S)
            if m:
                page_main = m.group(1)
            else:
                page_main = self.channel_page
            Common.log(i_page)
            i_p = self.get_items(page_main, i_p)
            # total page count, e.g. "1/12" in the pager widget
            m = re.search(r'<span class="c_page2_numtop">(.+?)</span>', self.channel_page, flags=re.S)
            if m:
                m_page_info = m.group(1)
                m = re.search(r'\d+/(\d+)', m_page_info, flags=re.S)
                if m:
                    m_page = int(m.group(1))
            page_url = self.channel_url[0:-1] + 'P%s/'
            while i_page < m_page:
                i_page += 1
                p_url = page_url % str(i_page)
                Common.log(i_page)
                page = self.retrycrawler.getData(p_url, self.channel_url)
                i_p = self.get_items(page, i_p)

    def get_items(self, page_main, i_p):
        if page_main:
            p = re.compile(r'<div class="searchresult_product04">\s+<div class="search_ticket_caption basefix">\s+<a href="(.+?)".+?>\s+<img src="(.+?)".+?/>.+?<div class="search_ticket_title">\s+<h2>\s+<a.+?>(.+?)</a>.+?</h2>\s+<div class="adress">(.+?)</div>\s+<div class="exercise">(.*?)</div>', flags=re.S)
            for info in p.finditer(page_main):
                if int(self.channel_type) == 1:
                    i_url = Config.xc_piao_home + info.group(1)
                else:
                    i_url = Config.xc_home + info.group(1)
                i_img, i_name, i_area, i_desc = info.group(2), info.group(3).strip(), info.group(4).strip(), info.group(5).strip()
                i_book = 1
                i_id = 0
                if i_url != '':
                    m = re.search(r't(\d+)\.html', i_url)
                    if m:
                        i_id = m.group(1)
                val = (self.channel_id, self.channel_name, self.channel_url, self.channel_type,
                       (i_book, i_id, i_url, i_img, i_name, i_desc, i_area, i_p, self.crawling_begintime))
                self.channel_items.append(val)
                i_p += 1
        return i_p

    def channelList(self):
        self.channelPage()
        if self.channel_page:
            m = re.search(r'<ul class="search_cate">\s+<li class="cate_content.+?">\s+<span class="b">.+?<span class="area_box">(.+?)</span>', self.channel_page, flags=re.S)
            if m:
                area_infos = m.group(1)
                p = re.compile(r'<a href="(.+?)".+?>(.+?)</a>', flags=re.S)
                for area in p.finditer(area_infos):
                    channel_url, c_name = Config.xc_piao_home + area.group(1), area.group(2)
                    channel_id = 0
                    channel_name = ''
                    if channel_url:
                        m = re.search(r'D(\d+)', channel_url)
                        if m:
                            channel_id = m.group(1)
                    if c_name:
                        m = re.search(r'(.+?)\(', c_name, flags=re.S)
                        if m:
                            channel_name = m.group(1).strip()
                        else:
                            channel_name = c_name.strip()
                    if int(channel_id) != 0 and channel_url:
                        self.channel_list.append((channel_id, channel_name, channel_url, str(self.channel_type), str(self.province_id), self.province_name))

    def channelPage(self):
        if self.channel_url:
            refers = Config.xc_home
            if int(self.channel_type) == 1:
                refers = Config.xc_piao_home
            data = self.crawler.getData(self.channel_url, refers)
            if not data:
                raise Common.InvalidPageException("# channelPage:not find channel page,channel_id:%s,channel_url:%s" % (str(self.channel_id), self.channel_url))
            self.channel_page = data
            self.channel_pages['channel-home'] = (self.channel_url, data)

    def antPage(self, val):
        channel_id, channel_url, channel_type, begin_time = val
        self.init(channel_id, channel_url, channel_type, begin_time)
        self.config()

    def antChannelList(self, val):
        self.channel_url, self.channel_type, self.province_id, self.province_name = val
        self.channelList()
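# Illustrative driver (assumption, not from the original source): how a worker
# might feed antPage(). The id and url below are hypothetical placeholders; the
# url must end with '/' because spot() builds pager urls as channel_url[0:-1] + 'P%s/'.
def _demo_crawl_channel():
    c = Channel()
    # val layout matches antPage's unpack: (channel_id, channel_url, channel_type, begin_time)
    c.antPage(('100001', 'http://example.invalid/dest/s0-p100001/', 1, Common.now()))
    return c.channel_items  # one tuple per item found across all list pages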
class RetryCrawler():
    '''A class of retry crawl data'''
    def __init__(self):
        # crawl settings
        self.crawler = XCCrawler()
        # dial client
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._tag = 'ikuai'
        # wait time
        self.w_time = 1

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' % (_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    def getData(self, url, refers='', max_retry=20):
        page = ''
        retry = 1
        while True:
            try:
                page = self.crawler.getData(url, refers)
                break
            except Common.InvalidPageException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# Invalid page exception: %s' % e)
                time.sleep(self.w_time * retry)
            except Common.DenypageException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# Deny page exception: %s' % e)
                # re-dial the router to get a new ip
                try:
                    self.dialRouter(4, 'chn')
                except Exception as e:
                    Common.log('# DialClient Exception err: %s' % e)
                time.sleep(random.uniform(10, 30))
            except Common.SystemBusyException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# System busy exception: %s' % e)
                time.sleep(self.w_time * retry)
            except Exception as e:
                Common.log('# exception err in retry crawler: %s' % e)
                if str(e).find('Read timed out') != -1:
                    if retry >= max_retry:
                        break
                    retry += 1
                    time.sleep(random.uniform(1, 3))
                elif str(e).find('Name or service not known') != -1 or str(e).find('Temporary failure in name resolution') != -1:
                    if retry >= max_retry:
                        break
                    retry += 1
                    # re-dial the router
                    try:
                        self.dialRouter(4, 'chn')
                    except Exception as e:
                        Common.log('# DialClient Exception err: %s' % e)
                    time.sleep(random.uniform(10, 30))
                else:
                    break
        return page
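# Usage sketch (assumption): getData() retries with linearly growing sleeps
# (w_time * retry) on invalid/busy pages and re-dials the router on deny pages
# or DNS failures, so a caller only sees the final page, or '' after max_retry.
def _demo_retry_fetch(url, refer):
    rc = RetryCrawler()
    page = rc.getData(url, refer, max_retry=5)  # '' means all retries failed
    if not page:
        Common.log('# giving up on %s' % url)
    return page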
class Item():
    '''A class of xc Item'''
    def __init__(self):
        # item page crawl settings
        self.crawler = XCCrawler()
        self.crawling_time = Common.now()   # current crawl time
        self.crawling_begintime = ''        # start time of this crawl run
        self.crawling_beginDate = ''        # start date of this crawl run
        self.crawling_beginHour = ''        # start hour of this crawl run
        # channel this single-type item belongs to
        self.channel_id = ''
        self.channel_name = ''
        self.channel_url = ''
        self.channel_type = ''
        self.item_position = 0
        # item info
        self.item_id = ''               # item id
        self.item_url = ''              # item url
        self.item_pic_url = ''          # item display image url
        self.item_name = ''             # item name
        self.item_desc = ''             # item description
        self.item_book_status = 1       # on sale or not; 0: off sale, 1: on sale
        self.item_level = ''            # grade/level
        self.item_area = ''             # address
        self.item_service = ''          # services
        self.item_comment = ''          # comment count
        self.item_comment_grade = ''    # comment score
        # item pricing
        self.item_oriprice = ''         # original price
        self.item_disprice = ''         # discounted price
        self.item_discount = ''         # discount
        # tickets
        self.item_tickets = []
        # raw page data
        self.item_pageData = ''         # data blob the item was listed in
        self.item_page = ''             # item page html
        self.item_pages = {}            # responses fetched while on the item page

    # item page info
    def spotConfig(self, _val):
        self.item_book_status, self.item_id, self.item_url, self.item_pic_url, self.item_name, self.item_desc, self.item_area, self.item_position, self.crawling_begintime = _val
        # start date of this crawl run
        self.crawling_beginDate = time.strftime("%Y-%m-%d", time.localtime(self.crawling_begintime))
        # start hour of this crawl run
        self.crawling_beginHour = time.strftime("%H", time.localtime(self.crawling_begintime))
        if self.item_book_status == 1:
            # item page
            self.itemPage()
            page = self.item_page
            self.item_pages['item-home'] = (self.item_url, self.item_page)
            m = re.search(r'<div class="media-price".+?>.+?<div class="price-box">.+?<span>(\d+)</span>', page, flags=re.S)
            if m:
                self.item_disprice = m.group(1)
            m = re.search(r'<li class="promise".+?>(.+?)</li>', page, flags=re.S)
            if m:
                s_service = m.group(1)
                promise_list = []
                p = re.compile(r'<div id="J-MediaLabel" class="media-label-wrapper">\s+<span class="media-label">(.+?)</span>', flags=re.S)
                for promise in p.finditer(s_service):
                    promise_list.append(promise.group(1))
                self.item_service = ';'.join(promise_list)
            m = re.search(r'<div class="grade" id="J-grade" data-value="(.+?)".+?>', page, flags=re.S)
            if m:
                self.item_comment_grade = m.group(1)
            m = re.search(r'<div class="grade".+?>.+?<a href=".+?" class="mark goToAnchor" data-target="J-Yhdp">(.+?)</a></div>', page, flags=re.S)
            if m:
                s_comment = m.group(1)
                m = re.search(r'(\d+)', s_comment)
                if m:
                    self.item_comment = m.group(1)
            m = re.search(r'<span class="media-grade" style="">(.+?)</span>', page, flags=re.S)
            if m:
                self.item_level = re.sub(r'<.+?>', '', m.group(1)).strip()
            self.itemTicket()

    def itemTicket(self):
        if self.item_page:
            m = re.search(r'<div id="J-Ticket" class="tab-content">\s+<table class="ticket-table">.+?<tbody>(.+?)</tbody>\s+</table>', self.item_page, flags=re.S)
            if m:
                infos = m.group(1)
                t_type = ''
                t_i = 1
                p = re.compile(r'<tr class="ticket-info.+?" data-id="(.*?)".+?>(.+?)</tr>', flags=re.S)
                for info in p.finditer(infos):
                    t_id, t_data = info.group(1), info.group(2)
                    if not t_id or t_id == '':
                        t_id = t_i
                    val = (self.item_id, self.item_name, self.channel_type, t_type, t_id, t_data, self.crawling_begintime)
                    t = Ticket()
                    t.antPage(val)
                    self.item_tickets.append(t.outSql())
                    t_i += 1
                    t_type = t.ticket_type

    # item detail page html
    def itemPage(self):
        if self.item_url != '':
            refer_url = self.channel_url
            page = self.crawler.getData(self.item_url, refer_url)
            # a 302 in the request history usually means we were redirected away from the item page
            if type(self.crawler.history) is list and len(self.crawler.history) != 0 and re.search(r'302', str(self.crawler.history[0])):
                if not self.itempage_judge(page):
                    Common.log('#crawler history:')
                    Common.log(self.crawler.history)
                    raise Common.NoPageException("# itemPage: not find item page, redirecting to other page,id:%s,item_url:%s" % (str(self.item_id), self.item_url))
            if not page or page == '':
                Common.log('#crawler history:')
                Common.log(self.crawler.history)
                raise Common.InvalidPageException("# itemPage: find item page empty,id:%s,item_url:%s" % (str(self.item_id), self.item_url))
            self.item_page = page
        else:
            raise Common.NoPageException("# itemPage: not find item page, url is null,id:%s,item_url:%s" % (str(self.item_id), self.item_url))

    # run
    def antPage(self, val):
        self.channel_id, self.channel_name, self.channel_url, self.channel_type, i_val = val
        if int(self.channel_type) == 1:
            self.spotConfig(i_val)

    def outTuple(self):
        return (self.channel_id, self.channel_name, self.channel_url, self.channel_type, self.item_position,
                self.item_book_status, self.item_id, self.item_url, self.item_pic_url, self.item_name,
                self.item_desc, self.item_level, self.item_area, self.item_service, self.item_comment,
                self.item_comment_grade, self.item_oriprice, self.item_disprice, self.item_discount,
                self.crawling_beginDate, self.crawling_beginHour)

    def outSql(self):
        return (Common.time_s(float(self.crawling_time)), str(self.item_id), self.item_name, self.item_desc,
                self.item_url, self.item_pic_url, str(self.item_book_status), self.item_level, self.item_area,
                self.item_service, str(self.item_comment), str(self.item_comment_grade), str(self.item_oriprice),
                str(self.item_disprice), str(self.item_discount), str(self.channel_id), str(self.item_position),
                self.crawling_beginDate, self.crawling_beginHour)