Beispiel #1
0
    def __init__(self):
        """Create the crawler helpers and reset all channel state to empty defaults."""
        # --- crawl settings ---
        self.crawler = XCCrawler()
        self.retrycrawler = RetryCrawler()
        # Time of the current crawl, plus the value Common.time_s() derives from it
        self.crawling_time = Common.now()
        self.crawling_time_s = Common.time_s(self.crawling_time)
        # Start time, start date and start hour of this crawl run (initially empty)
        self.crawling_begintime = self.crawling_beginDate = self.crawling_beginHour = ''

        # --- channel information ---
        # Platform the channel belongs to
        self.platform = '携程-pc'
        # Channel id, link, name and type
        self.channel_id = self.channel_url = self.channel_name = self.channel_type = ''

        # --- geographic location of the channel ---
        # Province / state id
        self.province_id = 0
        # Province / state name
        self.province_name = ''

        # --- raw data ---
        # HTML content of the channel page
        self.channel_page = ''
        # Data requested from within the channel page
        self.channel_pages = {}

        # Items collected for this channel
        self.channel_items = []

        # Channels collected from a channel list
        self.channel_list = []
Beispiel #2
0
    def __init__(self):
        """Create the crawler and dial client and record the local routing settings."""
        # Crawl settings
        self.crawler = XCCrawler()

        # Dial client
        self.dial_client = DialClient()

        # Local ip as reported by Common.local_ip(), and the router tag
        self._ip = Common.local_ip()
        self._tag = 'ikuai'

        # Wait time
        self.w_time = 1
Beispiel #3
0
    def __init__(self):
        """Create the crawler and reset all item state to empty defaults."""
        # --- item page crawl settings ---
        self.crawler = XCCrawler()
        # Time of the current crawl
        self.crawling_time = Common.now()
        # Start time, start date and start hour of this crawl run (initially empty)
        self.crawling_begintime = self.crawling_beginDate = self.crawling_beginHour = ''

        # --- channel the item belongs to ---
        self.channel_id = self.channel_name = self.channel_url = self.channel_type = ''
        self.item_position = 0

        # --- item information ---
        # Item id
        self.item_id = ''
        # Item link
        self.item_url = ''
        # Link of the item's display picture
        self.item_pic_url = ''
        # Item name
        self.item_name = ''
        # Item description
        self.item_desc = ''
        # Whether the item is on sale: 0 = not for sale, 1 = on sale
        self.item_book_status = 1
        # Level
        self.item_level = ''
        # Address
        self.item_area = ''
        # Service
        self.item_service = ''
        # Number of comments
        self.item_comment = ''
        # Rating
        self.item_comment_grade = ''

        # --- item pricing ---
        # Original price
        self.item_oriprice = ''
        # Discounted price
        self.item_disprice = ''
        # Discount
        self.item_discount = ''

        # Tickets
        self.item_tickets = []

        # --- raw data ---
        # Content of the data entry the item belongs to
        self.item_pageData = ''
        # HTML content of the item page
        self.item_page = ''
        # Data requested from within the item page
        self.item_pages = {}
Beispiel #4
0
class Channel():
    '''A class of XC channel.

    Fetches a channel page, then either parses the items listed on it
    (config/spot/get_items, results in self.channel_items) or parses the
    area links on it into sub-channels (channelList, results in
    self.channel_list).
    '''
    def __init__(self):
        # Crawl settings
        self.crawler            = XCCrawler()
        self.retrycrawler       = RetryCrawler()
        self.crawling_time      = Common.now() # time of the current crawl
        self.crawling_time_s    = Common.time_s(self.crawling_time)
        self.crawling_begintime = '' # start time of this crawl run
        self.crawling_beginDate = '' # start date of this crawl run
        self.crawling_beginHour = '' # start hour of this crawl run

        # Channel information
        self.platform           = '携程-pc' # platform the channel belongs to
        self.channel_id         = '' # channel id
        self.channel_url        = '' # channel link
        self.channel_name       = '' # channel name
        self.channel_type       = '' # channel type

        # Geographic location of the channel
        self.province_id        = 0  # province / state id
        self.province_name      = '' # province / state name

        # Raw data
        self.channel_page       = '' # html content of the channel page
        self.channel_pages      = {} # data requested from within the channel page

        # channel items
        self.channel_items      = []

        # channel list
        self.channel_list       = []

    def _is_spot_channel(self):
        '''Return True when self.channel_type is 1, given as an int or a numeric string.

        channelList() emits the type as a string while other code compares it
        with the int 1, so the value is normalised here. Values that cannot be
        converted to an int are treated as "not a spot channel".
        '''
        try:
            return int(self.channel_type) == 1
        except (TypeError, ValueError):
            return False

    # Channel page initialisation
    def init(self, channel_id, channel_url, channel_type, begin_time):
        '''Store the channel identity and derive the crawl start date/hour.

        begin_time is passed to time.localtime(), i.e. it is used as seconds
        since the epoch.
        '''
        self.channel_id = channel_id
        self.channel_url = channel_url
        self.channel_type = channel_type
        self.crawling_begintime = begin_time
        begin = time.localtime(self.crawling_begintime)
        self.crawling_beginDate = time.strftime("%Y-%m-%d", begin)
        self.crawling_beginHour = time.strftime("%H", begin)

    def config(self):
        '''Fetch the channel page and parse it according to the channel type.

        Only type 1 is handled; any other type is logged and ignored.
        '''
        self.channelPage()
        if self._is_spot_channel():
            self.spot()
        else:
            Common.log('# not find this channel type...')

    def spot(self):
        '''Parse the channel name and the items of every result page.

        The first page is taken from self.channel_page; following pages are
        fetched through the retry crawler. Does nothing when no channel page
        has been loaded.
        '''
        if not self.channel_page:
            return

        # Channel name: the selected category links joined with '-'
        m = re.search(r'<div class="cate_select">(.+?)</div>', self.channel_page, flags=re.S)
        if m:
            p = re.compile(r'<a.+?class="select">(.+?)</a>', flags=re.S)
            c_list = [re.sub(r'<.+?>', '', c.group(1)).strip() for c in p.finditer(m.group(1))]
            self.channel_name = '-'.join(c_list)

        i_p = 1      # running position of the next item
        i_page = 1   # current page number
        m_page = 1   # total number of pages

        # Restrict parsing to the main result area when it can be located
        m = re.search(r'<div id="base_bd">.+?<div class="bg_miancolor">.+?<div class="vacation_bd">(.+?)<div class="vacation_bd bottom_seo">', self.channel_page, flags=re.S)
        page_main = m.group(1) if m else self.channel_page

        Common.log(i_page)
        i_p = self.get_items(page_main, i_p)

        # Pager text has the form "<current>/<total>"
        m = re.search(r'<span class="c_page2_numtop">(.+?)</span>', self.channel_page, flags=re.S)
        if m:
            m = re.search(r'\d+/(\d+)', m.group(1), flags=re.S)
            if m:
                m_page = int(m.group(1))

        # Build page urls by concatenation rather than %-formatting, so a '%'
        # inside the channel url (url-encoding) cannot break the formatting.
        page_prefix = self.channel_url[0:-1] + 'P'
        while i_page < m_page:
            i_page += 1
            p_url = page_prefix + str(i_page) + '/'
            Common.log(i_page)
            page = self.retrycrawler.getData(p_url, self.channel_url)
            i_p = self.get_items(page, i_p)

    def get_items(self, page_main, i_p):
        '''Append one entry per item found in page_main to self.channel_items.

        i_p is the position assigned to the first item found; the position
        following the last item is returned (i_p unchanged when page_main is
        empty or contains no item).
        '''
        if not page_main:
            return i_p

        p = re.compile(r'<div class="searchresult_product04">\s+<div class="search_ticket_caption basefix">\s+<a href="(.+?)".+?>\s+<img src="(.+?)".+?/>.+?<div class="search_ticket_title">\s+<h2>\s+<a.+?>(.+?)</a>.+?</h2>\s+<div class="adress">(.+?)</div>\s+<div class="exercise">(.*?)</div>', flags=re.S)
        for info in p.finditer(page_main):
            if self._is_spot_channel():
                i_url = Config.xc_piao_home + info.group(1)
            else:
                i_url = Config.xc_home + info.group(1)
            i_img = info.group(2)
            i_name = info.group(3).strip()
            i_area = info.group(4).strip()
            i_desc = info.group(5).strip()
            i_book = 1
            i_id = 0
            if i_url:
                # Item id is the number in ".../t<id>.html"
                m = re.search(r't(\d+)\.html', i_url)
                if m:
                    i_id = m.group(1)
                item = (i_book, i_id, i_url, i_img, i_name, i_desc, i_area, i_p, self.crawling_begintime)
                self.channel_items.append((self.channel_id, self.channel_name, self.channel_url, self.channel_type, item))
            i_p += 1
        return i_p

    def channelList(self):
        '''Fetch the channel page and collect its area links into self.channel_list.

        Each entry is (channel_id, channel_name, channel_url, channel_type,
        province_id, province_name) with type and province id as strings.
        Links without a "D<digits>" id in their url are skipped.
        '''
        self.channelPage()
        if not self.channel_page:
            return

        m = re.search(r'<ul class="search_cate">\s+<li class="cate_content.+?">\s+<span class="b">.+?<span class="area_box">(.+?)</span>', self.channel_page, flags=re.S)
        if not m:
            return

        p = re.compile(r'<a href="(.+?)".+?>(.+?)</a>', flags=re.S)
        for area in p.finditer(m.group(1)):
            channel_url = Config.xc_piao_home + area.group(1)
            c_name = area.group(2)

            channel_id = 0
            id_match = re.search(r'D(\d+)', channel_url)
            if id_match:
                channel_id = id_match.group(1)

            # The link text looks like "name(count)"; keep only the name part
            name_match = re.search(r'(.+?)\(', c_name, flags=re.S)
            if name_match:
                channel_name = name_match.group(1).strip()
            else:
                channel_name = c_name.strip()

            if int(channel_id) != 0 and channel_url:
                self.channel_list.append((channel_id, channel_name, channel_url, str(self.channel_type), str(self.province_id), self.province_name))

    def channelPage(self):
        '''Download the channel page into self.channel_page / self.channel_pages.

        Does nothing when no channel url is set. The referer is the ticket
        home for type-1 channels and the main home otherwise.

        Raises Common.InvalidPageException when the crawler returns no data.
        '''
        if not self.channel_url:
            return

        refers = Config.xc_piao_home if self._is_spot_channel() else Config.xc_home
        data = self.crawler.getData(self.channel_url, refers)
        if not data:
            raise Common.InvalidPageException("# channelPage:not find channel page,channel_id:%s,channel_url:%s"%(str(self.channel_id), self.channel_url))
        self.channel_page = data
        self.channel_pages['channel-home'] = (self.channel_url, data)

    def antPage(self, val):
        '''Run the item crawl for val = (channel_id, channel_url, channel_type, begin_time).'''
        channel_id, channel_url, channel_type, begin_time = val
        self.init(channel_id, channel_url, channel_type, begin_time)
        self.config()

    def antChannelList(self, val):
        '''Run the channel-list crawl for val = (channel_url, channel_type, province_id, province_name).'''
        self.channel_url, self.channel_type, self.province_id, self.province_name = val
        self.channelList()
Beispiel #5
0
class RetryCrawler():
    '''A class of retry crawl data.

    Wraps XCCrawler.getData with retries: known page exceptions, read
    timeouts and DNS failures are retried (redialling the router for denied
    pages and DNS failures); any other exception stops the attempt.
    '''

    # Substrings searched for in the message of an otherwise unclassified exception
    _TIMEOUT_MARK = 'Read timed out'
    _DNS_MARKS = ('Name or service not known', 'Temporary failure in name resolution')

    def __init__(self):
        # Crawl settings
        self.crawler     = XCCrawler()
        # dial client
        self.dial_client = DialClient()
        # local ip
        self._ip         = Common.local_ip()
        # router tag
        self._tag        = 'ikuai'

        # wait time
        self.w_time      = 1

    # To dial router
    def dialRouter(self, _type, _obj):
        '''Send a "<_type>_<_obj>" dial request for this host; failures are only logged.'''
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    @classmethod
    def _is_timeout(cls, message):
        '''Return True when message reports a read timeout.'''
        return cls._TIMEOUT_MARK in message

    @classmethod
    def _is_dns_failure(cls, message):
        '''Return True when message reports a name-resolution failure.'''
        return any(mark in message for mark in cls._DNS_MARKS)

    def _redial(self):
        '''Ask the router to redial, then pause 10-30 seconds before the next attempt.'''
        try:
            self.dialRouter(4, 'chn')
        except Exception as e:
            Common.log('# DailClient Exception err: %s' % e)
        time.sleep(random.uniform(10,30))

    def getData(self, url, refers='', max_retry=20):
        '''Fetch url through the wrapped crawler, retrying on recoverable errors.

        url       -- address to fetch
        refers    -- referer handed to the crawler
        max_retry -- maximum number of attempts

        Returns the page returned by the crawler, or '' when every attempt
        failed or an unrecoverable exception occurred. Exceptions raised by
        the crawler are logged, never propagated.
        '''
        page = ''
        retry = 1
        while True:
            try:
                page = self.crawler.getData(url, refers)
                break
            except Common.InvalidPageException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# Invalid page exception: %s' % e)
                # back off a little longer after every failed attempt
                time.sleep(self.w_time*retry)
            except Common.DenypageException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# Deny page exception: %s' % e)
                # redial the router
                self._redial()
            except Common.SystemBusyException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# System busy exception: %s' % e)
                time.sleep(self.w_time*retry)
            except Exception as e:
                Common.log('# exception err in retry crawler: %s' % e)
                message = str(e)
                if self._is_timeout(message):
                    if retry >= max_retry:
                        break
                    retry += 1
                    time.sleep(random.uniform(1,3))
                elif self._is_dns_failure(message):
                    # Substring membership is used here because str.find()
                    # returns -1 (truthy) when the text is absent, which made
                    # the former check match almost every exception.
                    if retry >= max_retry:
                        break
                    retry += 1
                    # redial the router
                    self._redial()
                else:
                    # unknown failure: give up instead of retrying
                    break

        return page
Beispiel #6
0
class Item():
    '''A class of xc Item.

    Holds the data of one item of a channel, fetches the item page and
    extracts price, services, rating, comment count, level and tickets
    from it.
    '''
    def __init__(self):
        # Item page crawl settings
        self.crawler            = XCCrawler()
        self.crawling_time      = Common.now() # time of the current crawl
        self.crawling_begintime = '' # start time of this crawl run
        self.crawling_beginDate = '' # start date of this crawl run
        self.crawling_beginHour = '' # start hour of this crawl run

        # Channel the item belongs to
        self.channel_id         = ''
        self.channel_name       = ''
        self.channel_url        = ''
        self.channel_type       = ''
        self.item_position      = 0

        # Item information
        self.item_id            = '' # item id
        self.item_url           = '' # item link
        self.item_pic_url       = '' # link of the item's display picture
        self.item_name          = '' # item name
        self.item_desc          = '' # item description
        self.item_book_status   = 1  # whether the item is on sale: 0 = no, 1 = yes
        self.item_level         = '' # level
        self.item_area          = '' # address
        self.item_service       = '' # service
        self.item_comment       = '' # number of comments
        self.item_comment_grade = '' # rating

        # Item pricing
        self.item_oriprice      = '' # original price
        self.item_disprice      = '' # discounted price
        self.item_discount      = '' # discount

        # Tickets
        self.item_tickets       = []

        # Raw data
        self.item_pageData      = '' # content of the data entry the item belongs to
        self.item_page          = '' # html content of the item page
        self.item_pages         = {} # data requested from within the item page

    def _is_spot_channel(self):
        '''Return True when self.channel_type is 1, given as an int or a numeric string.

        Values that cannot be converted to an int are treated as "not a spot
        channel".
        '''
        try:
            return int(self.channel_type) == 1
        except (TypeError, ValueError):
            return False

    # Item page information
    def spotConfig(self, _val):
        '''Load the item described by _val and, if it is on sale, parse its page.

        _val is (book_status, item_id, item_url, pic_url, name, desc, area,
        position, begin_time). begin_time is passed to time.localtime().
        Exceptions raised by itemPage() propagate to the caller.
        '''
        self.item_book_status, self.item_id, self.item_url, self.item_pic_url, self.item_name, self.item_desc, self.item_area, self.item_position, self.crawling_begintime = _val
        begin = time.localtime(self.crawling_begintime)
        # start date of this crawl run
        self.crawling_beginDate = time.strftime("%Y-%m-%d", begin)
        # start hour of this crawl run
        self.crawling_beginHour = time.strftime("%H", begin)
        if self.item_book_status != 1:
            return

        # Item page
        self.itemPage()
        page = self.item_page
        self.item_pages['item-home'] = (self.item_url, self.item_page)

        # Discounted price
        m = re.search(r'<div class="media-price".+?>.+?<div class="price-box">.+?<span>(\d+)</span>', page, flags=re.S)
        if m:
            self.item_disprice = m.group(1)

        # Services, joined with ';'
        m = re.search(r'<li class="promise".+?>(.+?)</li>', page, flags=re.S)
        if m:
            p = re.compile(r'<div id="J-MediaLabel" class="media-label-wrapper">\s+<span class="media-label">(.+?)</span>', flags=re.S)
            self.item_service = ';'.join([promise.group(1) for promise in p.finditer(m.group(1))])

        # Rating
        m = re.search(r'<div class="grade" id="J-grade" data-value="(.+?)".+?>', page, flags=re.S)
        if m:
            self.item_comment_grade = m.group(1)

        # Number of comments: first number in the comment anchor text
        m = re.search(r'<div class="grade".+?>.+?<a href=".+?" class="mark goToAnchor" data-target="J-Yhdp">(.+?)</a></div>', page, flags=re.S)
        if m:
            m = re.search(r'(\d+)', m.group(1))
            if m:
                self.item_comment = m.group(1)

        # Level, with any markup removed
        m = re.search(r'<span class="media-grade" style="">(.+?)</span>', page, flags=re.S)
        if m:
            self.item_level = re.sub(r'<.+?>', '', m.group(1)).strip()

        self.itemTicket()

    def itemTicket(self):
        '''Parse every ticket row of the item page into self.item_tickets.

        Does nothing when no item page is loaded or it has no ticket table.
        '''
        if not self.item_page:
            return

        m = re.search(r'<div id="J-Ticket" class="tab-content">\s+<table class="ticket-table">.+?<tbody>(.+?)</tbody>\s+</table>', self.item_page, flags=re.S)
        if not m:
            return

        t_type = ''
        t_i = 1
        p = re.compile(r'<tr class="ticket-info.+?" data-id="(.*?)".+?>(.+?)</tr>', flags=re.S)
        for info in p.finditer(m.group(1)):
            t_id, t_data = info.group(1), info.group(2)
            if not t_id:
                # rows without a data-id get their 1-based row number instead
                t_id = t_i
            val = (self.item_id, self.item_name, self.channel_type, t_type, t_id, t_data, self.crawling_begintime)
            t = Ticket()
            t.antPage(val)
            self.item_tickets.append(t.outSql())
            t_i += 1
            # the type found for this ticket is handed to the next row
            t_type = t.ticket_type

    # Item detail page html
    def itemPage(self):
        '''Download the item page into self.item_page.

        Raises Common.NoPageException when no item url is set, or when the
        request was redirected (302) and the result is not judged to be an
        item page. Raises Common.InvalidPageException when the page is empty.
        '''
        if self.item_url == '':
            raise Common.NoPageException("# itemPage: not find item page, url is null,id:%s,item_url:%s"%(str(self.item_id), self.item_url))

        refer_url = self.channel_url
        page = self.crawler.getData(self.item_url, refer_url)

        history = self.crawler.history
        if isinstance(history, list) and len(history) != 0 and re.search(r'302', str(history[0])):
            # NOTE(review): itempage_judge is not defined in the visible part of
            # this class - confirm it is provided elsewhere (e.g. by a subclass),
            # otherwise this branch raises AttributeError.
            if not self.itempage_judge(page):
                Common.log('#crawler history:')
                Common.log(self.crawler.history)
                raise Common.NoPageException("# itemPage: not find item page, redirecting to other page,id:%s,item_url:%s"%(str(self.item_id), self.item_url))

        if not page:
            Common.log('#crawler history:')
            Common.log(self.crawler.history)
            raise Common.InvalidPageException("# itemPage: find item page empty,id:%s,item_url:%s"%(str(self.item_id), self.item_url))
        self.item_page = page

    # Run
    def antPage(self, val):
        '''Process val = (channel_id, channel_name, channel_url, channel_type, item_val).

        The item is only parsed for channel type 1, given as an int or a
        numeric string.
        '''
        self.channel_id, self.channel_name, self.channel_url, self.channel_type, i_val = val
        if self._is_spot_channel():
            self.spotConfig(i_val)

    def outTuple(self):
        '''Return the channel and item fields as a tuple of raw values.'''
        return (
            self.channel_id, self.channel_name, self.channel_url, self.channel_type,
            self.item_position, self.item_book_status, self.item_id, self.item_url,
            self.item_pic_url, self.item_name, self.item_desc, self.item_level,
            self.item_area, self.item_service, self.item_comment, self.item_comment_grade,
            self.item_oriprice, self.item_disprice, self.item_discount,
            self.crawling_beginDate, self.crawling_beginHour,
        )

    def outSql(self):
        '''Return the item fields as a tuple with the non-text values converted by str().'''
        return (
            Common.time_s(float(self.crawling_time)), str(self.item_id), self.item_name,
            self.item_desc, self.item_url, self.item_pic_url, str(self.item_book_status),
            self.item_level, self.item_area, self.item_service, str(self.item_comment),
            str(self.item_comment_grade), str(self.item_oriprice), str(self.item_disprice),
            str(self.item_discount), str(self.channel_id), str(self.item_position),
            self.crawling_beginDate, self.crawling_beginHour,
        )